@langfuse/client 4.0.0 → 4.1.0-alpha.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.cts CHANGED
@@ -1,17 +1,334 @@
1
1
  import * as _langfuse_core from '@langfuse/core';
2
- import { LangfuseAPIClient, Dataset, DatasetItem, DatasetRunItem, ParsedMediaReference, CreatePromptRequest, ChatMessage, ChatMessageWithPlaceholders, PlaceholderMessage, BasePrompt, Prompt, ScoreBody } from '@langfuse/core';
2
+ import { DatasetItem, ScoreBody, Dataset, DatasetRunItem, LangfuseAPIClient, ParsedMediaReference, CreatePromptRequest, ChatMessage, ChatMessageWithPlaceholders, PlaceholderMessage, BasePrompt, Prompt } from '@langfuse/core';
3
3
  import { Span } from '@opentelemetry/api';
4
4
 
5
+ type ExperimentItem = {
6
+ /**
7
+ * The input data to pass to the task function.
8
+ *
9
+ * Can be any type - string, object, array, etc. This data will be passed
10
+ * to your task function as the `input` parameter. Structure it according
11
+ * to your task's requirements.
12
+ */
13
+ input?: any;
14
+ /**
15
+ * The expected output for evaluation purposes.
16
+ *
17
+ * Optional ground truth or reference output for this input.
18
+ * Used by evaluators to assess task performance. If not provided,
19
+ * only evaluators that don't require expected output can be used.
20
+ */
21
+ expectedOutput?: any;
22
+ };
23
+ type ExperimentTaskParams = Pick<ExperimentItem, "input">;
24
+ type ExperimentTask = (params: ExperimentTaskParams) => Promise<any>;
25
+ type Evaluation = Pick<ScoreBody, "name" | "value" | "comment" | "metadata" | "dataType">;
26
+ type EvaluatorParams = {
27
+ /**
28
+ * The original input data passed to the task.
29
+ *
30
+ * This is the same input that was provided to the task function.
31
+ * Use this for context-aware evaluations or input-output relationship analysis.
32
+ */
33
+ input: any;
34
+ /**
35
+ * The output produced by the task.
36
+ *
37
+ * This is the actual result returned by your task function.
38
+ * This is the primary value to evaluate against expectations.
39
+ */
40
+ output: any;
41
+ /**
42
+ * The expected output for comparison (optional).
43
+ *
44
+ * This is the ground truth or expected result for the given input.
45
+ * May not be available for all evaluation scenarios.
46
+ */
47
+ expectedOutput?: any;
48
+ };
49
+ type Evaluator = (params: EvaluatorParams) => Promise<Evaluation[] | Evaluation>;
50
+ type RunEvaluatorParams = {
51
+ /**
52
+ * Results from all processed experiment items.
53
+ *
54
+ * Each item contains the input, output, evaluations, and metadata from
55
+ * processing a single data item. Use this for aggregate analysis,
56
+ * statistical calculations, and cross-item comparisons.
57
+ */
58
+ itemResults: ExperimentItemResult[];
59
+ };
60
+ type RunEvaluator = (params: RunEvaluatorParams) => Promise<Evaluation[] | Evaluation>;
61
+ type ExperimentParams = {
62
+ /**
63
+ * Human-readable name for the experiment.
64
+ *
65
+ * This name will appear in Langfuse UI and experiment results.
66
+ * Choose a descriptive name that identifies the experiment's purpose.
67
+ */
68
+ name: string;
69
+ /**
70
+ * Optional description explaining the experiment's purpose.
71
+ *
72
+ * Provide context about what you're testing, methodology, or goals.
73
+ * This helps with experiment tracking and result interpretation.
74
+ */
75
+ description?: string;
76
+ /**
77
+ * Optional metadata to attach to the experiment run.
78
+ *
79
+ * Store additional context like model versions, hyperparameters,
80
+ * or any other relevant information for analysis and comparison.
81
+ */
82
+ metadata?: Record<string, any>;
83
+ /**
84
+ * Array of data items to process.
85
+ *
86
+ * Can be either custom ExperimentItem[] or DatasetItem[] from Langfuse.
87
+ * Each item should contain input data and optionally expected output.
88
+ */
89
+ data: ExperimentItem[] | DatasetItem[];
90
+ /**
91
+ * The task function to execute on each data item.
92
+ *
93
+ * This function receives input data and produces output that will be evaluated.
94
+ * It should encapsulate the model or system being tested.
95
+ */
96
+ task: ExperimentTask;
97
+ /**
98
+ * Optional array of evaluator functions to assess each item's output.
99
+ *
100
+ * Each evaluator receives input, output, and expected output (if available)
101
+ * and returns evaluation results. Multiple evaluators enable comprehensive assessment.
102
+ */
103
+ evaluators?: Evaluator[];
104
+ /**
105
+ * Optional array of run-level evaluators to assess the entire experiment.
106
+ *
107
+ * These evaluators receive all item results and can perform aggregate analysis
108
+ * like calculating averages, detecting patterns, or statistical analysis.
109
+ */
110
+ runEvaluators?: RunEvaluator[];
111
+ /**
112
+ * Maximum number of concurrent task executions (default: Infinity).
113
+ *
114
+ * Controls parallelism to manage resource usage and API rate limits.
115
+ * Set lower values for expensive operations or rate-limited services.
116
+ */
117
+ maxConcurrency?: number;
118
+ };
119
+ type ExperimentItemResult = Pick<ExperimentItem, "input" | "expectedOutput"> & {
120
+ /**
121
+ * The actual output produced by the task.
122
+ *
123
+ * This is the result returned by your task function for this specific input.
124
+ * It will be passed to evaluators for assessment against expected outputs.
125
+ */
126
+ output: any;
127
+ /**
128
+ * Results from all evaluators that ran on this item.
129
+ *
130
+ * Contains evaluation scores, comments, and metadata from each evaluator
131
+ * that successfully processed this item. Failed evaluators are excluded.
132
+ */
133
+ evaluations: Evaluation[];
134
+ /**
135
+ * Langfuse trace ID for this item's execution (for debugging and analysis).
136
+ *
137
+ * Use this ID to view detailed execution traces in the Langfuse UI,
138
+ * including timing, inputs, outputs, and any nested observations.
139
+ */
140
+ traceId?: string;
141
+ /**
142
+ * Dataset run ID if this item was part of a Langfuse dataset.
143
+ *
144
+ * Present only when running experiments on Langfuse datasets.
145
+ * Links this item result to a specific dataset run for tracking and comparison.
146
+ */
147
+ datasetRunId?: string;
148
+ };
149
+ /**
150
+ * Complete result of an experiment execution.
151
+ *
152
+ * Contains all results from processing the experiment data,
153
+ * including individual item results, run-level evaluations,
154
+ * and utilities for result visualization.
155
+ *
156
+ * @example Using experiment results
157
+ * ```typescript
158
+ * const result = await langfuse.experiment.run(config);
159
+ *
160
+ * // Access individual results
161
+ * console.log(`Processed ${result.itemResults.length} items`);
162
+ *
163
+ * // Check run-level evaluations
164
+ * const avgScore = result.runEvaluations.find(e => e.name === 'average_score');
165
+ * console.log(`Average score: ${avgScore?.value}`);
166
+ *
167
+ * // Print formatted results
168
+ * console.log(await result.prettyPrint());
169
+ *
170
+ * // Print summary only (for large datasets)
171
+ * console.log(await result.prettyPrint({ includeItemResults: false }));
172
+ *
173
+ * // Link to dataset run (if available)
174
+ * if (result.datasetRunId) {
175
+ * console.log(`View in Langfuse: dataset run ${result.datasetRunId}`);
176
+ * }
177
+ * ```
178
+ *
179
+ * @public
180
+ */
181
+ type ExperimentResult = {
182
+ /**
183
+ * ID of the dataset run in Langfuse (only for experiments on Langfuse datasets).
184
+ *
185
+ * Present only when running experiments on Langfuse datasets.
186
+ * Use this ID to access the dataset run via the Langfuse API or UI
187
+ * for detailed analysis and comparison with other runs.
188
+ */
189
+ datasetRunId?: string;
190
+ /**
191
+ * Results from processing each individual data item.
192
+ *
193
+ * Contains the complete results for every item in your experiment data,
194
+ * including inputs, outputs, evaluations, and trace information.
195
+ * Use this for detailed analysis of individual item performance.
196
+ */
197
+ itemResults: ExperimentItemResult[];
198
+ /**
199
+ * Results from run-level evaluators that assessed the entire experiment.
200
+ *
201
+ * Contains aggregate evaluations that analyze the complete experiment,
202
+ * such as average scores, statistical measures, or overall quality assessments.
203
+ */
204
+ runEvaluations: Evaluation[];
205
+ /**
206
+ * Function to format and display experiment results in a human-readable format.
207
+ *
208
+ * Generates a comprehensive, nicely formatted summary including individual results,
209
+ * aggregate statistics, evaluation scores, and links to traces and dataset runs.
210
+ *
211
+ * @param options - Formatting options
212
+ * @param options.includeItemResults - Whether to include individual item details (default: true)
213
+ * @returns Promise resolving to formatted string representation
214
+ */
215
+ prettyPrint: (options?: {
216
+ includeItemResults?: boolean;
217
+ }) => Promise<string>;
218
+ };
219
+
220
+ /**
221
+ * Function type for running experiments on Langfuse datasets.
222
+ *
223
+ * This function type is attached to fetched datasets to enable convenient
224
+ * experiment execution directly on dataset objects.
225
+ *
226
+ * @param params - Experiment parameters excluding data (since data comes from the dataset)
227
+ * @returns Promise resolving to experiment results
228
+ *
229
+ * @example
230
+ * ```typescript
231
+ * const dataset = await langfuse.dataset.get("my-dataset");
232
+ * const result = await dataset.runExperiment({
233
+ * name: "Model Evaluation",
234
+ * task: myTask,
235
+ * evaluators: [myEvaluator]
236
+ * });
237
+ * ```
238
+ *
239
+ * @public
240
+ * @since 4.1.0
241
+ */
242
+ type RunExperimentOnDataset = (params: Omit<ExperimentParams, "data" | "dataSource">) => Promise<ExperimentResult>;
243
+ /**
244
+ * Enhanced dataset object with additional methods for linking and experiments.
245
+ *
246
+ * This type extends the base Dataset with functionality for:
247
+ * - Linking dataset items to traces/observations
248
+ * - Running experiments directly on the dataset
249
+ *
250
+ * @example Working with a fetched dataset
251
+ * ```typescript
252
+ * const dataset = await langfuse.dataset.get("my-evaluation-dataset");
253
+ *
254
+ * // Access dataset metadata
255
+ * console.log(dataset.name, dataset.description);
256
+ *
257
+ * // Work with individual items
258
+ * for (const item of dataset.items) {
259
+ * console.log(item.input, item.expectedOutput);
260
+ *
261
+ * // Link item to a trace
262
+ * await item.link({ otelSpan: myObservation.otelSpan }, "experiment-run-1");
263
+ * }
264
+ *
265
+ * // Run experiments on the entire dataset
266
+ * const result = await dataset.runExperiment({
267
+ * name: "Model Comparison",
268
+ * task: myTask,
269
+ * evaluators: [accuracyEvaluator]
270
+ * });
271
+ * ```
272
+ *
273
+ * @public
274
+ * @since 4.1.0
275
+ */
276
+ type FetchedDataset = Dataset & {
277
+ /** Dataset items with additional linking functionality */
278
+ items: (DatasetItem & {
279
+ link: LinkDatasetItemFunction;
280
+ })[];
281
+ /** Function to run experiments directly on this dataset */
282
+ runExperiment: RunExperimentOnDataset;
283
+ };
5
284
  /**
6
285
  * Function type for linking dataset items to OpenTelemetry spans.
7
- * This allows dataset items to be associated with specific traces for experiment tracking.
8
286
  *
9
- * @param obj - Object containing the OpenTelemetry span
10
- * @param runName - Name of the dataset run
11
- * @param runArgs - Optional arguments for the dataset run
287
+ * This function creates a connection between a dataset item and a trace/observation,
288
+ * enabling tracking of which dataset items were used in which experiments or runs.
289
+ * This is essential for creating dataset runs and tracking experiment lineage.
290
+ *
291
+ * @param obj - Object containing the OpenTelemetry span to link to
292
+ * @param obj.otelSpan - The OpenTelemetry span from a Langfuse observation
293
+ * @param runName - Name of the experiment run for grouping related items
294
+ * @param runArgs - Optional configuration for the dataset run
295
+ * @param runArgs.description - Description of the experiment run
296
+ * @param runArgs.metadata - Additional metadata to attach to the run
12
297
  * @returns Promise that resolves to the created dataset run item
13
298
  *
299
+ * @example Basic linking
300
+ * ```typescript
301
+ * const dataset = await langfuse.dataset.get("my-dataset");
302
+ * const span = startObservation("my-task", { input: "test" });
303
+ * span.update({ output: "result" });
304
+ * span.end();
305
+ *
306
+ * // Link the dataset item to this execution
307
+ * await dataset.items[0].link(
308
+ * { otelSpan: span.otelSpan },
309
+ * "experiment-run-1"
310
+ * );
311
+ * ```
312
+ *
313
+ * @example Linking with metadata
314
+ * ```typescript
315
+ * await dataset.items[0].link(
316
+ * { otelSpan: span.otelSpan },
317
+ * "model-comparison-v2",
318
+ * {
319
+ * description: "Comparing GPT-4 vs Claude performance",
320
+ * metadata: {
321
+ * modelVersion: "gpt-4-1106-preview",
322
+ * temperature: 0.7,
323
+ * timestamp: new Date().toISOString()
324
+ * }
325
+ * }
326
+ * );
327
+ * ```
328
+ *
329
+ * @see {@link https://langfuse.com/docs/datasets} Langfuse datasets documentation
14
330
  * @public
331
+ * @since 4.0.0
15
332
  */
16
333
  type LinkDatasetItemFunction = (obj: {
17
334
  otelSpan: Span;
@@ -30,7 +347,7 @@ type LinkDatasetItemFunction = (obj: {
30
347
  * @public
31
348
  */
32
349
  declare class DatasetManager {
33
- private apiClient;
350
+ private langfuseClient;
34
351
  /**
35
352
  * Creates a new DatasetManager instance.
36
353
  *
@@ -38,44 +355,87 @@ declare class DatasetManager {
38
355
  * @internal
39
356
  */
40
357
  constructor(params: {
41
- apiClient: LangfuseAPIClient;
358
+ langfuseClient: LangfuseClient;
42
359
  });
43
360
  /**
44
- * Retrieves a dataset by name along with all its items.
361
+ * Retrieves a dataset by name with all its items and experiment functionality.
45
362
  *
46
- * This method automatically handles pagination to fetch all dataset items
47
- * and enhances each item with a `link` function for easy experiment tracking.
363
+ * This method fetches a dataset and all its associated items, with support
364
+ * for automatic pagination to handle large datasets efficiently. The returned
365
+ * dataset object includes enhanced functionality for linking items to traces
366
+ * and running experiments directly on the dataset.
48
367
  *
49
368
  * @param name - The name of the dataset to retrieve
50
- * @param options - Optional configuration for fetching
369
+ * @param options - Optional configuration for data fetching
51
370
  * @param options.fetchItemsPageSize - Number of items to fetch per page (default: 50)
371
+ * @returns Promise resolving to enhanced dataset with items, linking, and experiment capabilities
372
+ *
373
+ * @example Basic dataset retrieval
374
+ * ```typescript
375
+ * const dataset = await langfuse.dataset.get("my-evaluation-dataset");
376
+ * console.log(`Dataset ${dataset.name} has ${dataset.items.length} items`);
52
377
  *
53
- * @returns Promise that resolves to the dataset with enhanced items
378
+ * // Access dataset properties
379
+ * console.log(dataset.description);
380
+ * console.log(dataset.metadata);
381
+ * ```
54
382
  *
55
- * @example
383
+ * @example Working with dataset items
56
384
  * ```typescript
57
- * const dataset = await langfuse.dataset.get("my-dataset");
385
+ * const dataset = await langfuse.dataset.get("qa-dataset");
58
386
  *
59
387
  * for (const item of dataset.items) {
60
- * // Use the item data for your experiment
61
- * const result = await processItem(item.input);
62
- *
63
- * // Link the result to the dataset item
64
- * await item.link(
65
- * { otelSpan: currentSpan },
66
- * "experiment-run-1",
67
- * { description: "Testing new model" }
68
- * );
388
+ * console.log("Question:", item.input);
389
+ * console.log("Expected Answer:", item.expectedOutput);
390
+ *
391
+ * // Each item has a link function for connecting to traces
392
+ * // await item.link({ otelSpan: span.otelSpan }, "experiment-name");
69
393
  * }
70
394
  * ```
395
+ *
396
+ * @example Running experiments on datasets
397
+ * ```typescript
398
+ * const dataset = await langfuse.dataset.get("benchmark-dataset");
399
+ *
400
+ * const result = await dataset.runExperiment({
401
+ * name: "GPT-4 Benchmark",
402
+ * description: "Evaluating GPT-4 on our benchmark tasks",
403
+ * task: async ({ input }) => {
404
+ * const response = await openai.chat.completions.create({
405
+ * model: "gpt-4",
406
+ * messages: [{ role: "user", content: input }]
407
+ * });
408
+ * return response.choices[0].message.content;
409
+ * },
410
+ * evaluators: [
411
+ * async ({ output, expectedOutput }) => ({
412
+ * name: "exact_match",
413
+ * value: output === expectedOutput ? 1 : 0
414
+ * })
415
+ * ]
416
+ * });
417
+ *
418
+ * console.log(await result.prettyPrint());
419
+ * ```
420
+ *
421
+ * @example Handling large datasets
422
+ * ```typescript
423
+ * // For very large datasets, increase the page size to reduce the number of requests
424
+ * const largeDataset = await langfuse.dataset.get(
425
+ * "large-dataset",
426
+ * { fetchItemsPageSize: 100 }
427
+ * );
428
+ * ```
429
+ *
430
+ * @throws {Error} If the dataset does not exist or cannot be accessed
431
+ * @see {@link FetchedDataset} for the complete return type specification
432
+ * @see {@link RunExperimentOnDataset} for experiment execution details
433
+ * @public
434
+ * @since 4.0.0
71
435
  */
72
436
  get(name: string, options?: {
73
437
  fetchItemsPageSize: number;
74
- }): Promise<Dataset & {
75
- items: (DatasetItem & {
76
- link: LinkDatasetItemFunction;
77
- })[];
78
- }>;
438
+ }): Promise<FetchedDataset>;
79
439
  /**
80
440
  * Creates a link function for a specific dataset item.
81
441
  *
@@ -86,6 +446,250 @@ declare class DatasetManager {
86
446
  private createDatasetItemLinkFunction;
87
447
  }
88
448
 
449
+ /**
450
+ * Manages the execution and evaluation of experiments on datasets.
451
+ *
452
+ * The ExperimentManager provides a comprehensive framework for running experiments
453
+ * that test models or tasks against datasets, with support for automatic
454
+ * evaluation and scoring.
455
+ *
456
+ * @example Basic experiment usage
457
+ * ```typescript
458
+ * const langfuse = new LangfuseClient();
459
+ *
460
+ * const result = await langfuse.experiment.run({
461
+ * name: "Capital Cities Test",
462
+ * description: "Testing model knowledge of world capitals",
463
+ * data: [
464
+ * { input: "France", expectedOutput: "Paris" },
465
+ * { input: "Germany", expectedOutput: "Berlin" }
466
+ * ],
467
+ * task: async ({ input }) => {
468
+ * const response = await openai.chat.completions.create({
469
+ * model: "gpt-4",
470
+ * messages: [{ role: "user", content: `What is the capital of ${input}?` }]
471
+ * });
472
+ * return response.choices[0].message.content;
473
+ * },
474
+ * evaluators: [
475
+ * async ({ input, output, expectedOutput }) => ({
476
+ * name: "exact_match",
477
+ * value: output === expectedOutput ? 1 : 0
478
+ * })
479
+ * ]
480
+ * });
481
+ *
482
+ * console.log(await result.prettyPrint());
483
+ * ```
484
+ *
485
+ * @example Using with Langfuse datasets
486
+ * ```typescript
487
+ * const dataset = await langfuse.dataset.get("my-dataset");
488
+ *
489
+ * const result = await dataset.runExperiment({
490
+ * name: "Model Comparison",
491
+ * task: myTask,
492
+ * evaluators: [myEvaluator],
493
+ * runEvaluators: [averageScoreEvaluator]
494
+ * });
495
+ * ```
496
+ *
497
+ * @public
498
+ */
499
+ declare class ExperimentManager {
500
+ private langfuseClient;
501
+ /**
502
+ * Creates a new ExperimentManager instance.
503
+ *
504
+ * @param params - Configuration object
505
+ * @param params.langfuseClient - The Langfuse client instance for API communication
506
+ * @internal
507
+ */
508
+ constructor(params: {
509
+ langfuseClient: LangfuseClient;
510
+ });
511
+ /**
512
+ * Gets the global logger instance for experiment-related logging.
513
+ *
514
+ * @returns The global logger instance
515
+ * @internal
516
+ */
517
+ get logger(): _langfuse_core.Logger;
518
+ /**
519
+ * Executes an experiment by running a task on each data item and evaluating the results.
520
+ *
521
+ * This method orchestrates the complete experiment lifecycle:
522
+ * 1. Executes the task function on each data item with proper tracing
523
+ * 2. Runs item-level evaluators on each task output
524
+ * 3. Executes run-level evaluators on the complete result set
525
+ * 4. Links results to dataset runs (for Langfuse datasets)
526
+ * 5. Stores all scores and traces in Langfuse
527
+ *
528
+ * @param config - The experiment configuration
529
+ * @param config.name - Human-readable name for the experiment
530
+ * @param config.description - Optional description of the experiment's purpose
531
+ * @param config.metadata - Optional metadata to attach to the experiment run
532
+ * @param config.data - Array of data items to process (ExperimentItem[] or DatasetItem[])
533
+ * @param config.task - Function that processes each data item and returns output
534
+ * @param config.evaluators - Optional array of functions to evaluate each item's output
535
+ * @param config.runEvaluators - Optional array of functions to evaluate the entire run
536
+ * @param config.maxConcurrency - Maximum number of concurrent task executions (default: Infinity)
537
+ *
538
+ * @returns Promise that resolves to experiment results including:
539
+ * - itemResults: Results for each processed data item
540
+ * - runEvaluations: Results from run-level evaluators
541
+ * - datasetRunId: ID of the dataset run (if using Langfuse datasets)
542
+ * - prettyPrint: Function to format and display results
543
+ *
544
+ * @throws {Error} When task execution fails and cannot be handled gracefully
545
+ * @throws {Error} When required evaluators fail critically
546
+ *
547
+ * @example Simple experiment
548
+ * ```typescript
549
+ * const result = await langfuse.experiment.run({
550
+ * name: "Translation Quality Test",
551
+ * data: [
552
+ * { input: "Hello world", expectedOutput: "Hola mundo" },
553
+ * { input: "Good morning", expectedOutput: "Buenos días" }
554
+ * ],
555
+ * task: async ({ input }) => translateText(input, 'es'),
556
+ * evaluators: [
557
+ * async ({ output, expectedOutput }) => ({
558
+ * name: "bleu_score",
559
+ * value: calculateBleuScore(output, expectedOutput)
560
+ * })
561
+ * ]
562
+ * });
563
+ * ```
564
+ *
565
+ * @example Experiment with concurrency control
566
+ * ```typescript
567
+ * const result = await langfuse.experiment.run({
568
+ * name: "Large Scale Evaluation",
569
+ * data: largeBatchOfItems,
570
+ * task: expensiveModelCall,
571
+ * maxConcurrency: 5, // Process max 5 items simultaneously
572
+ * evaluators: [myEvaluator],
573
+ * runEvaluators: [
574
+ * async ({ itemResults }) => ({
575
+ * name: "average_score",
576
+ * value: itemResults.reduce((acc, r) => acc + r.evaluations[0].value, 0) / itemResults.length
577
+ * })
578
+ * ]
579
+ * });
580
+ * ```
581
+ *
582
+ * @see {@link ExperimentParams} for detailed parameter documentation
583
+ * @see {@link ExperimentResult} for detailed return value documentation
584
+ * @see {@link Evaluator} for evaluator function specifications
585
+ * @see {@link RunEvaluator} for run evaluator function specifications
586
+ *
587
+ * @public
588
+ */
589
+ run(config: ExperimentParams): Promise<ExperimentResult>;
590
+ /**
591
+ * Executes the task and evaluators for a single data item.
592
+ *
593
+ * This method handles the complete processing pipeline for one data item:
594
+ * 1. Executes the task within a traced observation span
595
+ * 2. Links the result to a dataset run (if applicable)
596
+ * 3. Runs all item-level evaluators on the output
597
+ * 4. Stores evaluation scores in Langfuse
598
+ * 5. Handles errors gracefully by continuing with remaining evaluators
599
+ *
600
+ * @param params - Parameters for item execution
601
+ * @param params.experimentName - Name of the parent experiment
602
+ * @param params.experimentDescription - Description of the parent experiment
603
+ * @param params.experimentMetadata - Metadata for the parent experiment
604
+ * @param params.item - The data item to process
605
+ * @param params.task - The task function to execute
606
+ * @param params.evaluators - Optional evaluators to run on the output
607
+ *
608
+ * @returns Promise resolving to the item result with output, evaluations, and trace info
609
+ *
610
+ * @throws {Error} When task execution fails (propagated from task function)
611
+ *
612
+ * @internal
613
+ */
614
+ private runItem;
615
+ /**
616
+ * Formats experiment results into a human-readable string representation.
617
+ *
618
+ * Creates a comprehensive, nicely formatted summary of the experiment including:
619
+ * - Individual item results with inputs, outputs, expected values, and scores
620
+ * - Dataset item and trace links (when available)
621
+ * - Experiment overview with aggregate statistics
622
+ * - Average scores across all evaluations
623
+ * - Run-level evaluation results
624
+ * - Links to dataset runs in the Langfuse UI
625
+ *
626
+ * @param params - Formatting parameters
627
+ * @param params.datasetRunUrl - Optional URL to the dataset run in Langfuse UI
628
+ * @param params.itemResults - Results from processing each data item
629
+ * @param params.originalData - The original input data items
630
+ * @param params.runEvaluations - Results from run-level evaluators
631
+ * @param params.name - Name of the experiment
632
+ * @param params.description - Optional description of the experiment
633
+ * @param params.includeItemResults - Whether to include individual item details (default: true)
634
+ *
635
+ * @returns Promise resolving to formatted string representation
636
+ *
637
+ * @example Output format
638
+ * ```
639
+ * 1. Item 1:
640
+ * Input: What is the capital of France?
641
+ * Expected: Paris
642
+ * Actual: Paris
643
+ * Scores:
644
+ * • exact_match: 1.000
645
+ * • similarity: 0.95
646
+ * 💭 Very close match with expected output
647
+ *
648
+ * Dataset Item:
649
+ * https://cloud.langfuse.com/project/123/datasets/456/items/789
650
+ *
651
+ * Trace:
652
+ * https://cloud.langfuse.com/project/123/traces/abc123
653
+ *
654
+ * ──────────────────────────────────────────────────
655
+ * 📊 Translation Quality Test - Testing model accuracy
656
+ * 2 items
657
+ * Evaluations:
658
+ * • exact_match
659
+ * • similarity
660
+ *
661
+ * Average Scores:
662
+ * • exact_match: 0.850
663
+ * • similarity: 0.923
664
+ *
665
+ * Run Evaluations:
666
+ * • overall_quality: 0.887
667
+ * 💭 Good performance with room for improvement
668
+ *
669
+ * 🔗 Dataset Run:
670
+ * https://cloud.langfuse.com/project/123/datasets/456/runs/def456
671
+ * ```
672
+ *
673
+ * @internal
674
+ */
675
+ private prettyPrintResults;
676
+ /**
677
+ * Formats a value for display in pretty-printed output.
678
+ *
679
+ * Handles different value types appropriately:
680
+ * - Strings: Truncates long strings to 50 characters with "..."
681
+ * - Objects/Arrays: Converts to JSON string representation
682
+ * - Primitives: Uses toString() representation
683
+ *
684
+ * @param value - The value to format
685
+ * @returns Formatted string representation suitable for display
686
+ *
687
+ * @internal
688
+ */
689
+ private formatValue;
690
+ private isOtelRegistered;
691
+ }
692
+
89
693
  /**
90
694
  * Parameters for resolving media references in objects.
91
695
  *
@@ -793,6 +1397,61 @@ declare class LangfuseClient {
793
1397
  * Manager for media upload and reference resolution.
794
1398
  */
795
1399
  media: MediaManager;
1400
+ /**
1401
+ * Manager for running experiments on datasets and data items.
1402
+ *
1403
+ * The experiment manager provides comprehensive functionality for:
1404
+ * - Running tasks on datasets or custom data arrays
1405
+ * - Evaluating outputs with custom or pre-built evaluators
1406
+ * - Tracking experiment runs with automatic tracing
1407
+ * - Generating formatted result summaries
1408
+ * - Integrating with AutoEvals library evaluators
1409
+ *
1410
+ * @example Basic experiment execution
1411
+ * ```typescript
1412
+ * const langfuse = new LangfuseClient();
1413
+ *
1414
+ * const result = await langfuse.experiment.run({
1415
+ * name: "Model Evaluation",
1416
+ * description: "Testing model performance on Q&A tasks",
1417
+ * data: [
1418
+ * { input: "What is 2+2?", expectedOutput: "4" },
1419
+ * { input: "What is the capital of France?", expectedOutput: "Paris" }
1420
+ * ],
1421
+ * task: async ({ input }) => {
1422
+ * // Your model/task implementation
1423
+ * const response = await myModel.generate(input);
1424
+ * return response;
1425
+ * },
1426
+ * evaluators: [
1427
+ * async ({ output, expectedOutput }) => ({
1428
+ * name: "exact_match",
1429
+ * value: output.trim().toLowerCase() === expectedOutput.toLowerCase() ? 1 : 0
1430
+ * })
1431
+ * ]
1432
+ * });
1433
+ *
1434
+ * console.log(await result.prettyPrint());
1435
+ * ```
1436
+ *
1437
+ * @example Using with datasets
1438
+ * ```typescript
1439
+ * const dataset = await langfuse.dataset.get("my-test-dataset");
1440
+ * const result = await dataset.runExperiment({
1441
+ * name: "Production Readiness Test",
1442
+ * task: myTask,
1443
+ * evaluators: [accuracyEvaluator, latencyEvaluator],
1444
+ * runEvaluators: [overallQualityEvaluator]
1445
+ * });
1446
+ * ```
1447
+ *
1448
+ * @see {@link ExperimentManager} for detailed API documentation
1449
+ * @see {@link ExperimentParams} for configuration options
1450
+ * @see {@link ExperimentResult} for result structure
1451
+ * @public
1452
+ * @since 4.0.0
1453
+ */
1454
+ experiment: ExperimentManager;
796
1455
  private baseUrl;
797
1456
  private projectId;
798
1457
  /**
@@ -926,4 +1585,72 @@ declare class LangfuseClient {
926
1585
  getTraceUrl(traceId: string): Promise<string>;
927
1586
  }
928
1587
 
929
- export { type ChatMessageOrPlaceholder, ChatMessageType, ChatPromptClient, type CreateChatPromptBodyWithPlaceholders, DatasetManager, type LangchainMessagesPlaceholder, LangfuseClient, type LangfuseClientParams, type LangfuseMediaResolveMediaReferencesParams, type LinkDatasetItemFunction, MediaManager, PromptManager, ScoreManager, TextPromptClient };
1588
+ /**
1589
+ * Converts an AutoEvals evaluator to a Langfuse-compatible evaluator function.
1590
+ *
1591
+ * This adapter function bridges the gap between AutoEvals library evaluators
1592
+ * and Langfuse experiment evaluators, handling parameter mapping and result
1593
+ * formatting automatically.
1594
+ *
1595
+ * AutoEvals evaluators expect `input`, `output`, and `expected` parameters,
1596
+ * while Langfuse evaluators use `input`, `output`, and `expectedOutput`.
1597
+ * This function handles the parameter name mapping.
1598
+ *
1599
+ * @template E - Type of the AutoEvals evaluator function
1600
+ * @param autoevalEvaluator - The AutoEvals evaluator function to convert
1601
+ * @param params - Optional additional parameters to pass to the AutoEvals evaluator
1602
+ * @returns A Langfuse-compatible evaluator function
1603
+ *
1604
+ * @example Basic usage with AutoEvals
1605
+ * ```typescript
1606
+ * import { Factuality, Levenshtein } from 'autoevals';
1607
+ * import { autoevalsToLangfuseEvaluator } from '@langfuse/client';
1608
+ *
1609
+ * const factualityEvaluator = autoevalsToLangfuseEvaluator(Factuality);
1610
+ * const levenshteinEvaluator = autoevalsToLangfuseEvaluator(Levenshtein);
1611
+ *
1612
+ * await langfuse.experiment.run({
1613
+ * name: "AutoEvals Integration Test",
1614
+ * data: myDataset,
1615
+ * task: myTask,
1616
+ * evaluators: [factualityEvaluator, levenshteinEvaluator]
1617
+ * });
1618
+ * ```
1619
+ *
1620
+ * @example Using with additional parameters
1621
+ * ```typescript
1622
+ * import { Factuality } from 'autoevals';
1623
+ *
1624
+ * const factualityEvaluator = autoevalsToLangfuseEvaluator(
1625
+ * Factuality,
1626
+ * { model: 'gpt-4o' } // Additional params for AutoEvals
1627
+ * );
1628
+ *
1629
+ * await langfuse.experiment.run({
1630
+ * name: "Factuality Test",
1631
+ * data: myDataset,
1632
+ * task: myTask,
1633
+ * evaluators: [factualityEvaluator]
1634
+ * });
1635
+ * ```
1636
+ *
1637
+ * @see {@link https://github.com/braintrustdata/autoevals} AutoEvals library documentation
1638
+ * @see {@link Evaluator} for Langfuse evaluator specifications
1639
+ *
1640
+ * @public
1641
+ * @since 4.1.0
1642
+ */
1643
+ declare function autoevalsToLangfuseEvaluator<E extends CallableFunction>(autoevalEvaluator: E, params?: Params<E>): Evaluator;
1644
+ /**
1645
+ * Utility type to extract parameter types from AutoEvals evaluator functions.
1646
+ *
1647
+ * This type helper extracts the parameter type from an AutoEvals evaluator
1648
+ * and omits the standard parameters (input, output, expected) that are
1649
+ * handled by the adapter, leaving only the additional configuration parameters.
1650
+ *
1651
+ * @template E - The AutoEvals evaluator function type
1652
+ * @internal
1653
+ */
1654
+ type Params<E> = Parameters<E extends (...args: any[]) => any ? E : never>[0] extends infer P ? Omit<P, "input" | "output" | "expected"> : never;
1655
+
1656
+ export { type ChatMessageOrPlaceholder, ChatMessageType, ChatPromptClient, type CreateChatPromptBodyWithPlaceholders, DatasetManager, type Evaluation, type Evaluator, type EvaluatorParams, type ExperimentItem, type ExperimentItemResult, ExperimentManager, type ExperimentParams, type ExperimentResult, type ExperimentTask, type ExperimentTaskParams, type FetchedDataset, type LangchainMessagesPlaceholder, LangfuseClient, type LangfuseClientParams, type LangfuseMediaResolveMediaReferencesParams, type LinkDatasetItemFunction, MediaManager, PromptManager, type RunEvaluator, type RunEvaluatorParams, type RunExperimentOnDataset, ScoreManager, TextPromptClient, autoevalsToLangfuseEvaluator };