@langfuse/client 4.0.0 → 4.1.0-alpha.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +626 -53
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +756 -29
- package/dist/index.d.ts +756 -29
- package/dist/index.mjs +610 -39
- package/dist/index.mjs.map +1 -1
- package/package.json +3 -2
package/dist/index.d.ts
CHANGED
|
@@ -1,17 +1,334 @@
|
|
|
1
1
|
import * as _langfuse_core from '@langfuse/core';
|
|
2
|
-
import {
|
|
2
|
+
import { DatasetItem, ScoreBody, Dataset, DatasetRunItem, LangfuseAPIClient, ParsedMediaReference, CreatePromptRequest, ChatMessage, ChatMessageWithPlaceholders, PlaceholderMessage, BasePrompt, Prompt } from '@langfuse/core';
|
|
3
3
|
import { Span } from '@opentelemetry/api';
|
|
4
4
|
|
|
5
|
+
type ExperimentItem = {
|
|
6
|
+
/**
|
|
7
|
+
* The input data to pass to the task function.
|
|
8
|
+
*
|
|
9
|
+
* Can be any type - string, object, array, etc. This data will be passed
|
|
10
|
+
* to your task function as the `input` parameter. Structure it according
|
|
11
|
+
* to your task's requirements.
|
|
12
|
+
*/
|
|
13
|
+
input?: any;
|
|
14
|
+
/**
|
|
15
|
+
* The expected output for evaluation purposes.
|
|
16
|
+
*
|
|
17
|
+
* Optional ground truth or reference output for this input.
|
|
18
|
+
* Used by evaluators to assess task performance. If not provided,
|
|
19
|
+
* only evaluators that don't require expected output can be used.
|
|
20
|
+
*/
|
|
21
|
+
expectedOutput?: any;
|
|
22
|
+
};
|
|
23
|
+
type ExperimentTaskParams = Pick<ExperimentItem, "input">;
|
|
24
|
+
type ExperimentTask = (params: ExperimentTaskParams) => Promise<any>;
|
|
25
|
+
type Evaluation = Pick<ScoreBody, "name" | "value" | "comment" | "metadata" | "dataType">;
|
|
26
|
+
type EvaluatorParams = {
|
|
27
|
+
/**
|
|
28
|
+
* The original input data passed to the task.
|
|
29
|
+
*
|
|
30
|
+
* This is the same input that was provided to the task function.
|
|
31
|
+
* Use this for context-aware evaluations or input-output relationship analysis.
|
|
32
|
+
*/
|
|
33
|
+
input: any;
|
|
34
|
+
/**
|
|
35
|
+
* The output produced by the task.
|
|
36
|
+
*
|
|
37
|
+
* This is the actual result returned by your task function.
|
|
38
|
+
* This is the primary value to evaluate against expectations.
|
|
39
|
+
*/
|
|
40
|
+
output: any;
|
|
41
|
+
/**
|
|
42
|
+
* The expected output for comparison (optional).
|
|
43
|
+
*
|
|
44
|
+
* This is the ground truth or expected result for the given input.
|
|
45
|
+
* May not be available for all evaluation scenarios.
|
|
46
|
+
*/
|
|
47
|
+
expectedOutput?: any;
|
|
48
|
+
};
|
|
49
|
+
type Evaluator = (params: EvaluatorParams) => Promise<Evaluation[] | Evaluation>;
|
|
50
|
+
type RunEvaluatorParams = {
|
|
51
|
+
/**
|
|
52
|
+
* Results from all processed experiment items.
|
|
53
|
+
*
|
|
54
|
+
* Each item contains the input, output, evaluations, and metadata from
|
|
55
|
+
* processing a single data item. Use this for aggregate analysis,
|
|
56
|
+
* statistical calculations, and cross-item comparisons.
|
|
57
|
+
*/
|
|
58
|
+
itemResults: ExperimentItemResult[];
|
|
59
|
+
};
|
|
60
|
+
type RunEvaluator = (params: RunEvaluatorParams) => Promise<Evaluation[] | Evaluation>;
|
|
61
|
+
type ExperimentParams = {
|
|
62
|
+
/**
|
|
63
|
+
* Human-readable name for the experiment.
|
|
64
|
+
*
|
|
65
|
+
* This name will appear in Langfuse UI and experiment results.
|
|
66
|
+
* Choose a descriptive name that identifies the experiment's purpose.
|
|
67
|
+
*/
|
|
68
|
+
name: string;
|
|
69
|
+
/**
|
|
70
|
+
* Optional description explaining the experiment's purpose.
|
|
71
|
+
*
|
|
72
|
+
* Provide context about what you're testing, methodology, or goals.
|
|
73
|
+
* This helps with experiment tracking and result interpretation.
|
|
74
|
+
*/
|
|
75
|
+
description?: string;
|
|
76
|
+
/**
|
|
77
|
+
* Optional metadata to attach to the experiment run.
|
|
78
|
+
*
|
|
79
|
+
* Store additional context like model versions, hyperparameters,
|
|
80
|
+
* or any other relevant information for analysis and comparison.
|
|
81
|
+
*/
|
|
82
|
+
metadata?: Record<string, any>;
|
|
83
|
+
/**
|
|
84
|
+
* Array of data items to process.
|
|
85
|
+
*
|
|
86
|
+
* Can be either custom ExperimentItem[] or DatasetItem[] from Langfuse.
|
|
87
|
+
* Each item should contain input data and optionally expected output.
|
|
88
|
+
*/
|
|
89
|
+
data: ExperimentItem[] | DatasetItem[];
|
|
90
|
+
/**
|
|
91
|
+
* The task function to execute on each data item.
|
|
92
|
+
*
|
|
93
|
+
* This function receives input data and produces output that will be evaluated.
|
|
94
|
+
* It should encapsulate the model or system being tested.
|
|
95
|
+
*/
|
|
96
|
+
task: ExperimentTask;
|
|
97
|
+
/**
|
|
98
|
+
* Optional array of evaluator functions to assess each item's output.
|
|
99
|
+
*
|
|
100
|
+
* Each evaluator receives input, output, and expected output (if available)
|
|
101
|
+
* and returns evaluation results. Multiple evaluators enable comprehensive assessment.
|
|
102
|
+
*/
|
|
103
|
+
evaluators?: Evaluator[];
|
|
104
|
+
/**
|
|
105
|
+
* Optional array of run-level evaluators to assess the entire experiment.
|
|
106
|
+
*
|
|
107
|
+
* These evaluators receive all item results and can perform aggregate analysis
|
|
108
|
+
* like calculating averages, detecting patterns, or statistical analysis.
|
|
109
|
+
*/
|
|
110
|
+
runEvaluators?: RunEvaluator[];
|
|
111
|
+
/**
|
|
112
|
+
* Maximum number of concurrent task executions (default: Infinity).
|
|
113
|
+
*
|
|
114
|
+
* Controls parallelism to manage resource usage and API rate limits.
|
|
115
|
+
* Set lower values for expensive operations or rate-limited services.
|
|
116
|
+
*/
|
|
117
|
+
maxConcurrency?: number;
|
|
118
|
+
};
|
|
119
|
+
type ExperimentItemResult = Pick<ExperimentItem, "input" | "expectedOutput"> & {
|
|
120
|
+
/**
|
|
121
|
+
* The actual output produced by the task.
|
|
122
|
+
*
|
|
123
|
+
* This is the result returned by your task function for this specific input.
|
|
124
|
+
* It will be passed to evaluators for assessment against expected outputs.
|
|
125
|
+
*/
|
|
126
|
+
output: any;
|
|
127
|
+
/**
|
|
128
|
+
* Results from all evaluators that ran on this item.
|
|
129
|
+
*
|
|
130
|
+
* Contains evaluation scores, comments, and metadata from each evaluator
|
|
131
|
+
* that successfully processed this item. Failed evaluators are excluded.
|
|
132
|
+
*/
|
|
133
|
+
evaluations: Evaluation[];
|
|
134
|
+
/**
|
|
135
|
+
* Langfuse trace ID for this item's execution (for debugging and analysis).
|
|
136
|
+
*
|
|
137
|
+
* Use this ID to view detailed execution traces in the Langfuse UI,
|
|
138
|
+
* including timing, inputs, outputs, and any nested observations.
|
|
139
|
+
*/
|
|
140
|
+
traceId?: string;
|
|
141
|
+
/**
|
|
142
|
+
* Dataset run ID if this item was part of a Langfuse dataset.
|
|
143
|
+
*
|
|
144
|
+
* Present only when running experiments on Langfuse datasets.
|
|
145
|
+
* Links this item result to a specific dataset run for tracking and comparison.
|
|
146
|
+
*/
|
|
147
|
+
datasetRunId?: string;
|
|
148
|
+
};
|
|
149
|
+
/**
|
|
150
|
+
* Complete result of an experiment execution.
|
|
151
|
+
*
|
|
152
|
+
* Contains all results from processing the experiment data,
|
|
153
|
+
* including individual item results, run-level evaluations,
|
|
154
|
+
* and utilities for result visualization.
|
|
155
|
+
*
|
|
156
|
+
* @example Using experiment results
|
|
157
|
+
* ```typescript
|
|
158
|
+
* const result = await langfuse.experiment.run(config);
|
|
159
|
+
*
|
|
160
|
+
* // Access individual results
|
|
161
|
+
* console.log(`Processed ${result.itemResults.length} items`);
|
|
162
|
+
*
|
|
163
|
+
* // Check run-level evaluations
|
|
164
|
+
* const avgScore = result.runEvaluations.find(e => e.name === 'average_score');
|
|
165
|
+
* console.log(`Average score: ${avgScore?.value}`);
|
|
166
|
+
*
|
|
167
|
+
* // Print formatted results
|
|
168
|
+
* console.log(await result.prettyPrint());
|
|
169
|
+
*
|
|
170
|
+
* // Print summary only (for large datasets)
|
|
171
|
+
* console.log(await result.prettyPrint({ includeItemResults: false }));
|
|
172
|
+
*
|
|
173
|
+
* // Link to dataset run (if available)
|
|
174
|
+
* if (result.datasetRunId) {
|
|
175
|
+
* console.log(`View in Langfuse: dataset run ${result.datasetRunId}`);
|
|
176
|
+
* }
|
|
177
|
+
* ```
|
|
178
|
+
*
|
|
179
|
+
* @public
|
|
180
|
+
*/
|
|
181
|
+
type ExperimentResult = {
|
|
182
|
+
/**
|
|
183
|
+
* ID of the dataset run in Langfuse (only for experiments on Langfuse datasets).
|
|
184
|
+
*
|
|
185
|
+
* Present only when running experiments on Langfuse datasets.
|
|
186
|
+
* Use this ID to access the dataset run via the Langfuse API or UI
|
|
187
|
+
* for detailed analysis and comparison with other runs.
|
|
188
|
+
*/
|
|
189
|
+
datasetRunId?: string;
|
|
190
|
+
/**
|
|
191
|
+
* Results from processing each individual data item.
|
|
192
|
+
*
|
|
193
|
+
* Contains the complete results for every item in your experiment data,
|
|
194
|
+
* including inputs, outputs, evaluations, and trace information.
|
|
195
|
+
* Use this for detailed analysis of individual item performance.
|
|
196
|
+
*/
|
|
197
|
+
itemResults: ExperimentItemResult[];
|
|
198
|
+
/**
|
|
199
|
+
* Results from run-level evaluators that assessed the entire experiment.
|
|
200
|
+
*
|
|
201
|
+
* Contains aggregate evaluations that analyze the complete experiment,
|
|
202
|
+
* such as average scores, statistical measures, or overall quality assessments.
|
|
203
|
+
*/
|
|
204
|
+
runEvaluations: Evaluation[];
|
|
205
|
+
/**
|
|
206
|
+
* Function to format and display experiment results in a human-readable format.
|
|
207
|
+
*
|
|
208
|
+
* Generates a comprehensive, nicely formatted summary including individual results,
|
|
209
|
+
* aggregate statistics, evaluation scores, and links to traces and dataset runs.
|
|
210
|
+
*
|
|
211
|
+
* @param options - Formatting options
|
|
212
|
+
* @param options.includeItemResults - Whether to include individual item details (default: true)
|
|
213
|
+
* @returns Promise resolving to formatted string representation
|
|
214
|
+
*/
|
|
215
|
+
prettyPrint: (options?: {
|
|
216
|
+
includeItemResults?: boolean;
|
|
217
|
+
}) => Promise<string>;
|
|
218
|
+
};
|
|
219
|
+
|
|
220
|
+
/**
|
|
221
|
+
* Function type for running experiments on Langfuse datasets.
|
|
222
|
+
*
|
|
223
|
+
* This function type is attached to fetched datasets to enable convenient
|
|
224
|
+
* experiment execution directly on dataset objects.
|
|
225
|
+
*
|
|
226
|
+
* @param params - Experiment parameters excluding data (since data comes from the dataset)
|
|
227
|
+
* @returns Promise resolving to experiment results
|
|
228
|
+
*
|
|
229
|
+
* @example
|
|
230
|
+
* ```typescript
|
|
231
|
+
* const dataset = await langfuse.dataset.get("my-dataset");
|
|
232
|
+
* const result = await dataset.runExperiment({
|
|
233
|
+
* name: "Model Evaluation",
|
|
234
|
+
* task: myTask,
|
|
235
|
+
* evaluators: [myEvaluator]
|
|
236
|
+
* });
|
|
237
|
+
* ```
|
|
238
|
+
*
|
|
239
|
+
* @public
|
|
240
|
+
* @since 4.0.0
|
|
241
|
+
*/
|
|
242
|
+
type RunExperimentOnDataset = (params: Omit<ExperimentParams, "data" | "dataSource">) => Promise<ExperimentResult>;
|
|
243
|
+
/**
|
|
244
|
+
* Enhanced dataset object with additional methods for linking and experiments.
|
|
245
|
+
*
|
|
246
|
+
* This type extends the base Dataset with functionality for:
|
|
247
|
+
* - Linking dataset items to traces/observations
|
|
248
|
+
* - Running experiments directly on the dataset
|
|
249
|
+
*
|
|
250
|
+
* @example Working with a fetched dataset
|
|
251
|
+
* ```typescript
|
|
252
|
+
* const dataset = await langfuse.dataset.get("my-evaluation-dataset");
|
|
253
|
+
*
|
|
254
|
+
* // Access dataset metadata
|
|
255
|
+
* console.log(dataset.name, dataset.description);
|
|
256
|
+
*
|
|
257
|
+
* // Work with individual items
|
|
258
|
+
* for (const item of dataset.items) {
|
|
259
|
+
* console.log(item.input, item.expectedOutput);
|
|
260
|
+
*
|
|
261
|
+
* // Link item to a trace
|
|
262
|
+
* await item.link(myObservation, "experiment-run-1");
|
|
263
|
+
* }
|
|
264
|
+
*
|
|
265
|
+
* // Run experiments on the entire dataset
|
|
266
|
+
* const result = await dataset.runExperiment({
|
|
267
|
+
* name: "Model Comparison",
|
|
268
|
+
* task: myTask,
|
|
269
|
+
* evaluators: [accuracyEvaluator]
|
|
270
|
+
* });
|
|
271
|
+
* ```
|
|
272
|
+
*
|
|
273
|
+
* @public
|
|
274
|
+
* @since 4.0.0
|
|
275
|
+
*/
|
|
276
|
+
type FetchedDataset = Dataset & {
|
|
277
|
+
/** Dataset items with additional linking functionality */
|
|
278
|
+
items: (DatasetItem & {
|
|
279
|
+
link: LinkDatasetItemFunction;
|
|
280
|
+
})[];
|
|
281
|
+
/** Function to run experiments directly on this dataset */
|
|
282
|
+
runExperiment: RunExperimentOnDataset;
|
|
283
|
+
};
|
|
5
284
|
/**
|
|
6
285
|
* Function type for linking dataset items to OpenTelemetry spans.
|
|
7
|
-
* This allows dataset items to be associated with specific traces for experiment tracking.
|
|
8
286
|
*
|
|
9
|
-
*
|
|
10
|
-
*
|
|
11
|
-
*
|
|
287
|
+
* This function creates a connection between a dataset item and a trace/observation,
|
|
288
|
+
* enabling tracking of which dataset items were used in which experiments or runs.
|
|
289
|
+
* This is essential for creating dataset runs and tracking experiment lineage.
|
|
290
|
+
*
|
|
291
|
+
* @param obj - Object containing the OpenTelemetry span to link to
|
|
292
|
+
* @param obj.otelSpan - The OpenTelemetry span from a Langfuse observation
|
|
293
|
+
* @param runName - Name of the experiment run for grouping related items
|
|
294
|
+
* @param runArgs - Optional configuration for the dataset run
|
|
295
|
+
* @param runArgs.description - Description of the experiment run
|
|
296
|
+
* @param runArgs.metadata - Additional metadata to attach to the run
|
|
12
297
|
* @returns Promise that resolves to the created dataset run item
|
|
13
298
|
*
|
|
299
|
+
* @example Basic linking
|
|
300
|
+
* ```typescript
|
|
301
|
+
* const dataset = await langfuse.dataset.get("my-dataset");
|
|
302
|
+
* const span = startObservation("my-task", { input: "test" });
|
|
303
|
+
* span.update({ output: "result" });
|
|
304
|
+
* span.end();
|
|
305
|
+
*
|
|
306
|
+
* // Link the dataset item to this execution
|
|
307
|
+
* await dataset.items[0].link(
|
|
308
|
+
* { otelSpan: span.otelSpan },
|
|
309
|
+
* "experiment-run-1"
|
|
310
|
+
* );
|
|
311
|
+
* ```
|
|
312
|
+
*
|
|
313
|
+
* @example Linking with metadata
|
|
314
|
+
* ```typescript
|
|
315
|
+
* await dataset.items[0].link(
|
|
316
|
+
* { otelSpan: span.otelSpan },
|
|
317
|
+
* "model-comparison-v2",
|
|
318
|
+
* {
|
|
319
|
+
* description: "Comparing GPT-4 vs Claude performance",
|
|
320
|
+
* metadata: {
|
|
321
|
+
* modelVersion: "gpt-4-1106-preview",
|
|
322
|
+
* temperature: 0.7,
|
|
323
|
+
* timestamp: new Date().toISOString()
|
|
324
|
+
* }
|
|
325
|
+
* }
|
|
326
|
+
* );
|
|
327
|
+
* ```
|
|
328
|
+
*
|
|
329
|
+
* @see {@link https://langfuse.com/docs/datasets} Langfuse datasets documentation
|
|
14
330
|
* @public
|
|
331
|
+
* @since 4.0.0
|
|
15
332
|
*/
|
|
16
333
|
type LinkDatasetItemFunction = (obj: {
|
|
17
334
|
otelSpan: Span;
|
|
@@ -30,7 +347,7 @@ type LinkDatasetItemFunction = (obj: {
|
|
|
30
347
|
* @public
|
|
31
348
|
*/
|
|
32
349
|
declare class DatasetManager {
|
|
33
|
-
private
|
|
350
|
+
private langfuseClient;
|
|
34
351
|
/**
|
|
35
352
|
* Creates a new DatasetManager instance.
|
|
36
353
|
*
|
|
@@ -38,44 +355,87 @@ declare class DatasetManager {
|
|
|
38
355
|
* @internal
|
|
39
356
|
*/
|
|
40
357
|
constructor(params: {
|
|
41
|
-
|
|
358
|
+
langfuseClient: LangfuseClient;
|
|
42
359
|
});
|
|
43
360
|
/**
|
|
44
|
-
* Retrieves a dataset by name
|
|
361
|
+
* Retrieves a dataset by name with all its items and experiment functionality.
|
|
45
362
|
*
|
|
46
|
-
* This method
|
|
47
|
-
*
|
|
363
|
+
* This method fetches a dataset and all its associated items, with support
|
|
364
|
+
* for automatic pagination to handle large datasets efficiently. The returned
|
|
365
|
+
* dataset object includes enhanced functionality for linking items to traces
|
|
366
|
+
* and running experiments directly on the dataset.
|
|
48
367
|
*
|
|
49
368
|
* @param name - The name of the dataset to retrieve
|
|
50
|
-
* @param options - Optional configuration for fetching
|
|
369
|
+
* @param options - Optional configuration for data fetching
|
|
51
370
|
* @param options.fetchItemsPageSize - Number of items to fetch per page (default: 50)
|
|
371
|
+
* @returns Promise resolving to enhanced dataset with items, linking, and experiment capabilities
|
|
372
|
+
*
|
|
373
|
+
* @example Basic dataset retrieval
|
|
374
|
+
* ```typescript
|
|
375
|
+
* const dataset = await langfuse.dataset.get("my-evaluation-dataset");
|
|
376
|
+
* console.log(`Dataset ${dataset.name} has ${dataset.items.length} items`);
|
|
52
377
|
*
|
|
53
|
-
*
|
|
378
|
+
* // Access dataset properties
|
|
379
|
+
* console.log(dataset.description);
|
|
380
|
+
* console.log(dataset.metadata);
|
|
381
|
+
* ```
|
|
54
382
|
*
|
|
55
|
-
* @example
|
|
383
|
+
* @example Working with dataset items
|
|
56
384
|
* ```typescript
|
|
57
|
-
* const dataset = await langfuse.dataset.get("
|
|
385
|
+
* const dataset = await langfuse.dataset.get("qa-dataset");
|
|
58
386
|
*
|
|
59
387
|
* for (const item of dataset.items) {
|
|
60
|
-
*
|
|
61
|
-
*
|
|
62
|
-
*
|
|
63
|
-
* //
|
|
64
|
-
* await item.link(
|
|
65
|
-
* { otelSpan: currentSpan },
|
|
66
|
-
* "experiment-run-1",
|
|
67
|
-
* { description: "Testing new model" }
|
|
68
|
-
* );
|
|
388
|
+
* console.log("Question:", item.input);
|
|
389
|
+
* console.log("Expected Answer:", item.expectedOutput);
|
|
390
|
+
*
|
|
391
|
+
* // Each item has a link function for connecting to traces
|
|
392
|
+
* // await item.link(span, "experiment-name");
|
|
69
393
|
* }
|
|
70
394
|
* ```
|
|
395
|
+
*
|
|
396
|
+
* @example Running experiments on datasets
|
|
397
|
+
* ```typescript
|
|
398
|
+
* const dataset = await langfuse.dataset.get("benchmark-dataset");
|
|
399
|
+
*
|
|
400
|
+
* const result = await dataset.runExperiment({
|
|
401
|
+
* name: "GPT-4 Benchmark",
|
|
402
|
+
* description: "Evaluating GPT-4 on our benchmark tasks",
|
|
403
|
+
* task: async ({ input }) => {
|
|
404
|
+
* const response = await openai.chat.completions.create({
|
|
405
|
+
* model: "gpt-4",
|
|
406
|
+
* messages: [{ role: "user", content: input }]
|
|
407
|
+
* });
|
|
408
|
+
* return response.choices[0].message.content;
|
|
409
|
+
* },
|
|
410
|
+
* evaluators: [
|
|
411
|
+
* async ({ output, expectedOutput }) => ({
|
|
412
|
+
* name: "exact_match",
|
|
413
|
+
* value: output === expectedOutput ? 1 : 0
|
|
414
|
+
* })
|
|
415
|
+
* ]
|
|
416
|
+
* });
|
|
417
|
+
*
|
|
418
|
+
* console.log(await result.prettyPrint());
|
|
419
|
+
* ```
|
|
420
|
+
*
|
|
421
|
+
* @example Handling large datasets
|
|
422
|
+
* ```typescript
|
|
423
|
+
* // For very large datasets, use smaller page sizes
|
|
424
|
+
* const largeDataset = await langfuse.dataset.get(
|
|
425
|
+
* "large-dataset",
|
|
426
|
+
* { fetchItemsPageSize: 100 }
|
|
427
|
+
* );
|
|
428
|
+
* ```
|
|
429
|
+
*
|
|
430
|
+
* @throws {Error} If the dataset does not exist or cannot be accessed
|
|
431
|
+
* @see {@link FetchedDataset} for the complete return type specification
|
|
432
|
+
* @see {@link RunExperimentOnDataset} for experiment execution details
|
|
433
|
+
* @public
|
|
434
|
+
* @since 4.0.0
|
|
71
435
|
*/
|
|
72
436
|
get(name: string, options?: {
|
|
73
437
|
fetchItemsPageSize: number;
|
|
74
|
-
}): Promise<
|
|
75
|
-
items: (DatasetItem & {
|
|
76
|
-
link: LinkDatasetItemFunction;
|
|
77
|
-
})[];
|
|
78
|
-
}>;
|
|
438
|
+
}): Promise<FetchedDataset>;
|
|
79
439
|
/**
|
|
80
440
|
* Creates a link function for a specific dataset item.
|
|
81
441
|
*
|
|
@@ -86,6 +446,250 @@ declare class DatasetManager {
|
|
|
86
446
|
private createDatasetItemLinkFunction;
|
|
87
447
|
}
|
|
88
448
|
|
|
449
|
+
/**
|
|
450
|
+
* Manages the execution and evaluation of experiments on datasets.
|
|
451
|
+
*
|
|
452
|
+
* The ExperimentManager provides a comprehensive framework for running experiments
|
|
453
|
+
* that test models or tasks against datasets, with support for automatic evaluation,
|
|
454
|
+
* scoring.
|
|
455
|
+
*
|
|
456
|
+
* @example Basic experiment usage
|
|
457
|
+
* ```typescript
|
|
458
|
+
* const langfuse = new LangfuseClient();
|
|
459
|
+
*
|
|
460
|
+
* const result = await langfuse.experiment.run({
|
|
461
|
+
* name: "Capital Cities Test",
|
|
462
|
+
* description: "Testing model knowledge of world capitals",
|
|
463
|
+
* data: [
|
|
464
|
+
* { input: "France", expectedOutput: "Paris" },
|
|
465
|
+
* { input: "Germany", expectedOutput: "Berlin" }
|
|
466
|
+
* ],
|
|
467
|
+
* task: async ({ input }) => {
|
|
468
|
+
* const response = await openai.chat.completions.create({
|
|
469
|
+
* model: "gpt-4",
|
|
470
|
+
* messages: [{ role: "user", content: `What is the capital of ${input}?` }]
|
|
471
|
+
* });
|
|
472
|
+
* return response.choices[0].message.content;
|
|
473
|
+
* },
|
|
474
|
+
* evaluators: [
|
|
475
|
+
* async ({ input, output, expectedOutput }) => ({
|
|
476
|
+
* name: "exact_match",
|
|
477
|
+
* value: output === expectedOutput ? 1 : 0
|
|
478
|
+
* })
|
|
479
|
+
* ]
|
|
480
|
+
* });
|
|
481
|
+
*
|
|
482
|
+
* console.log(await result.prettyPrint());
|
|
483
|
+
* ```
|
|
484
|
+
*
|
|
485
|
+
* @example Using with Langfuse datasets
|
|
486
|
+
* ```typescript
|
|
487
|
+
* const dataset = await langfuse.dataset.get("my-dataset");
|
|
488
|
+
*
|
|
489
|
+
* const result = await dataset.runExperiment({
|
|
490
|
+
* name: "Model Comparison",
|
|
491
|
+
* task: myTask,
|
|
492
|
+
* evaluators: [myEvaluator],
|
|
493
|
+
* runEvaluators: [averageScoreEvaluator]
|
|
494
|
+
* });
|
|
495
|
+
* ```
|
|
496
|
+
*
|
|
497
|
+
* @public
|
|
498
|
+
*/
|
|
499
|
+
declare class ExperimentManager {
|
|
500
|
+
private langfuseClient;
|
|
501
|
+
/**
|
|
502
|
+
* Creates a new ExperimentManager instance.
|
|
503
|
+
*
|
|
504
|
+
* @param params - Configuration object
|
|
505
|
+
* @param params.langfuseClient - The Langfuse client instance for API communication
|
|
506
|
+
* @internal
|
|
507
|
+
*/
|
|
508
|
+
constructor(params: {
|
|
509
|
+
langfuseClient: LangfuseClient;
|
|
510
|
+
});
|
|
511
|
+
/**
|
|
512
|
+
* Gets the global logger instance for experiment-related logging.
|
|
513
|
+
*
|
|
514
|
+
* @returns The global logger instance
|
|
515
|
+
* @internal
|
|
516
|
+
*/
|
|
517
|
+
get logger(): _langfuse_core.Logger;
|
|
518
|
+
/**
|
|
519
|
+
* Executes an experiment by running a task on each data item and evaluating the results.
|
|
520
|
+
*
|
|
521
|
+
* This method orchestrates the complete experiment lifecycle:
|
|
522
|
+
* 1. Executes the task function on each data item with proper tracing
|
|
523
|
+
* 2. Runs item-level evaluators on each task output
|
|
524
|
+
* 3. Executes run-level evaluators on the complete result set
|
|
525
|
+
* 4. Links results to dataset runs (for Langfuse datasets)
|
|
526
|
+
* 5. Stores all scores and traces in Langfuse
|
|
527
|
+
*
|
|
528
|
+
* @param config - The experiment configuration
|
|
529
|
+
* @param config.name - Human-readable name for the experiment
|
|
530
|
+
* @param config.description - Optional description of the experiment's purpose
|
|
531
|
+
* @param config.metadata - Optional metadata to attach to the experiment run
|
|
532
|
+
* @param config.data - Array of data items to process (ExperimentItem[] or DatasetItem[])
|
|
533
|
+
* @param config.task - Function that processes each data item and returns output
|
|
534
|
+
* @param config.evaluators - Optional array of functions to evaluate each item's output
|
|
535
|
+
* @param config.runEvaluators - Optional array of functions to evaluate the entire run
|
|
536
|
+
* @param config.maxConcurrency - Maximum number of concurrent task executions (default: Infinity)
|
|
537
|
+
*
|
|
538
|
+
* @returns Promise that resolves to experiment results including:
|
|
539
|
+
* - itemResults: Results for each processed data item
|
|
540
|
+
* - runEvaluations: Results from run-level evaluators
|
|
541
|
+
* - datasetRunId: ID of the dataset run (if using Langfuse datasets)
|
|
542
|
+
* - prettyPrint: Function to format and display results
|
|
543
|
+
*
|
|
544
|
+
* @throws {Error} When task execution fails and cannot be handled gracefully
|
|
545
|
+
* @throws {Error} When required evaluators fail critically
|
|
546
|
+
*
|
|
547
|
+
* @example Simple experiment
|
|
548
|
+
* ```typescript
|
|
549
|
+
* const result = await langfuse.experiment.run({
|
|
550
|
+
* name: "Translation Quality Test",
|
|
551
|
+
* data: [
|
|
552
|
+
* { input: "Hello world", expectedOutput: "Hola mundo" },
|
|
553
|
+
* { input: "Good morning", expectedOutput: "Buenos días" }
|
|
554
|
+
* ],
|
|
555
|
+
* task: async ({ input }) => translateText(input, 'es'),
|
|
556
|
+
* evaluators: [
|
|
557
|
+
* async ({ output, expectedOutput }) => ({
|
|
558
|
+
* name: "bleu_score",
|
|
559
|
+
* value: calculateBleuScore(output, expectedOutput)
|
|
560
|
+
* })
|
|
561
|
+
* ]
|
|
562
|
+
* });
|
|
563
|
+
* ```
|
|
564
|
+
*
|
|
565
|
+
* @example Experiment with concurrency control
|
|
566
|
+
* ```typescript
|
|
567
|
+
* const result = await langfuse.experiment.run({
|
|
568
|
+
* name: "Large Scale Evaluation",
|
|
569
|
+
* data: largeBatchOfItems,
|
|
570
|
+
* task: expensiveModelCall,
|
|
571
|
+
* maxConcurrency: 5, // Process max 5 items simultaneously
|
|
572
|
+
* evaluators: [myEvaluator],
|
|
573
|
+
* runEvaluators: [
|
|
574
|
+
* async ({ itemResults }) => ({
|
|
575
|
+
* name: "average_score",
|
|
576
|
+
* value: itemResults.reduce((acc, r) => acc + r.evaluations[0].value, 0) / itemResults.length
|
|
577
|
+
* })
|
|
578
|
+
* ]
|
|
579
|
+
* });
|
|
580
|
+
* ```
|
|
581
|
+
*
|
|
582
|
+
* @see {@link ExperimentParams} for detailed parameter documentation
|
|
583
|
+
* @see {@link ExperimentResult} for detailed return value documentation
|
|
584
|
+
* @see {@link Evaluator} for evaluator function specifications
|
|
585
|
+
* @see {@link RunEvaluator} for run evaluator function specifications
|
|
586
|
+
*
|
|
587
|
+
* @public
|
|
588
|
+
*/
|
|
589
|
+
run(config: ExperimentParams): Promise<ExperimentResult>;
|
|
590
|
+
/**
|
|
591
|
+
* Executes the task and evaluators for a single data item.
|
|
592
|
+
*
|
|
593
|
+
* This method handles the complete processing pipeline for one data item:
|
|
594
|
+
* 1. Executes the task within a traced observation span
|
|
595
|
+
* 2. Links the result to a dataset run (if applicable)
|
|
596
|
+
* 3. Runs all item-level evaluators on the output
|
|
597
|
+
* 4. Stores evaluation scores in Langfuse
|
|
598
|
+
* 5. Handles errors gracefully by continuing with remaining evaluators
|
|
599
|
+
*
|
|
600
|
+
* @param params - Parameters for item execution
|
|
601
|
+
* @param params.experimentName - Name of the parent experiment
|
|
602
|
+
* @param params.experimentDescription - Description of the parent experiment
|
|
603
|
+
* @param params.experimentMetadata - Metadata for the parent experiment
|
|
604
|
+
* @param params.item - The data item to process
|
|
605
|
+
* @param params.task - The task function to execute
|
|
606
|
+
* @param params.evaluators - Optional evaluators to run on the output
|
|
607
|
+
*
|
|
608
|
+
* @returns Promise resolving to the item result with output, evaluations, and trace info
|
|
609
|
+
*
|
|
610
|
+
* @throws {Error} When task execution fails (propagated from task function)
|
|
611
|
+
*
|
|
612
|
+
* @internal
|
|
613
|
+
*/
|
|
614
|
+
private runItem;
|
|
615
|
+
/**
|
|
616
|
+
* Formats experiment results into a human-readable string representation.
|
|
617
|
+
*
|
|
618
|
+
* Creates a comprehensive, nicely formatted summary of the experiment including:
|
|
619
|
+
* - Individual item results with inputs, outputs, expected values, and scores
|
|
620
|
+
* - Dataset item and trace links (when available)
|
|
621
|
+
* - Experiment overview with aggregate statistics
|
|
622
|
+
* - Average scores across all evaluations
|
|
623
|
+
* - Run-level evaluation results
|
|
624
|
+
* - Links to dataset runs in the Langfuse UI
|
|
625
|
+
*
|
|
626
|
+
* @param params - Formatting parameters
|
|
627
|
+
* @param params.datasetRunUrl - Optional URL to the dataset run in Langfuse UI
|
|
628
|
+
* @param params.itemResults - Results from processing each data item
|
|
629
|
+
* @param params.originalData - The original input data items
|
|
630
|
+
* @param params.runEvaluations - Results from run-level evaluators
|
|
631
|
+
* @param params.name - Name of the experiment
|
|
632
|
+
* @param params.description - Optional description of the experiment
|
|
633
|
+
* @param params.includeItemResults - Whether to include individual item details (default: true)
|
|
634
|
+
*
|
|
635
|
+
* @returns Promise resolving to formatted string representation
|
|
636
|
+
*
|
|
637
|
+
* @example Output format
|
|
638
|
+
* ```
|
|
639
|
+
* 1. Item 1:
|
|
640
|
+
* Input: What is the capital of France?
|
|
641
|
+
* Expected: Paris
|
|
642
|
+
* Actual: Paris
|
|
643
|
+
* Scores:
|
|
644
|
+
* • exact_match: 1.000
|
|
645
|
+
* • similarity: 0.95
|
|
646
|
+
* 💭 Very close match with expected output
|
|
647
|
+
*
|
|
648
|
+
* Dataset Item:
|
|
649
|
+
* https://cloud.langfuse.com/project/123/datasets/456/items/789
|
|
650
|
+
*
|
|
651
|
+
* Trace:
|
|
652
|
+
* https://cloud.langfuse.com/project/123/traces/abc123
|
|
653
|
+
*
|
|
654
|
+
* ──────────────────────────────────────────────────
|
|
655
|
+
* 📊 Translation Quality Test - Testing model accuracy
|
|
656
|
+
* 2 items
|
|
657
|
+
* Evaluations:
|
|
658
|
+
* • exact_match
|
|
659
|
+
* • similarity
|
|
660
|
+
*
|
|
661
|
+
* Average Scores:
|
|
662
|
+
* • exact_match: 0.850
|
|
663
|
+
* • similarity: 0.923
|
|
664
|
+
*
|
|
665
|
+
* Run Evaluations:
|
|
666
|
+
* • overall_quality: 0.887
|
|
667
|
+
* 💭 Good performance with room for improvement
|
|
668
|
+
*
|
|
669
|
+
* 🔗 Dataset Run:
|
|
670
|
+
* https://cloud.langfuse.com/project/123/datasets/456/runs/def456
|
|
671
|
+
* ```
|
|
672
|
+
*
|
|
673
|
+
* @internal
|
|
674
|
+
*/
|
|
675
|
+
private prettyPrintResults;
|
|
676
|
+
/**
|
|
677
|
+
* Formats a value for display in pretty-printed output.
|
|
678
|
+
*
|
|
679
|
+
* Handles different value types appropriately:
|
|
680
|
+
* - Strings: Truncates long strings to 50 characters with "..."
|
|
681
|
+
* - Objects/Arrays: Converts to JSON string representation
|
|
682
|
+
* - Primitives: Uses toString() representation
|
|
683
|
+
*
|
|
684
|
+
* @param value - The value to format
|
|
685
|
+
* @returns Formatted string representation suitable for display
|
|
686
|
+
*
|
|
687
|
+
* @internal
|
|
688
|
+
*/
|
|
689
|
+
private formatValue;
|
|
690
|
+
private isOtelRegistered;
|
|
691
|
+
}
|
|
692
|
+
|
|
89
693
|
/**
|
|
90
694
|
* Parameters for resolving media references in objects.
|
|
91
695
|
*
|
|
@@ -793,6 +1397,61 @@ declare class LangfuseClient {
|
|
|
793
1397
|
* Manager for media upload and reference resolution.
|
|
794
1398
|
*/
|
|
795
1399
|
media: MediaManager;
|
|
1400
|
+
/**
|
|
1401
|
+
* Manager for running experiments on datasets and data items.
|
|
1402
|
+
*
|
|
1403
|
+
* The experiment manager provides comprehensive functionality for:
|
|
1404
|
+
* - Running tasks on datasets or custom data arrays
|
|
1405
|
+
* - Evaluating outputs with custom or pre-built evaluators
|
|
1406
|
+
* - Tracking experiment runs with automatic tracing
|
|
1407
|
+
* - Generating formatted result summaries
|
|
1408
|
+
* - Integrating with AutoEvals library evaluators
|
|
1409
|
+
*
|
|
1410
|
+
* @example Basic experiment execution
|
|
1411
|
+
* ```typescript
|
|
1412
|
+
* const langfuse = new LangfuseClient();
|
|
1413
|
+
*
|
|
1414
|
+
* const result = await langfuse.experiment.run({
|
|
1415
|
+
* name: "Model Evaluation",
|
|
1416
|
+
* description: "Testing model performance on Q&A tasks",
|
|
1417
|
+
* data: [
|
|
1418
|
+
* { input: "What is 2+2?", expectedOutput: "4" },
|
|
1419
|
+
* { input: "What is the capital of France?", expectedOutput: "Paris" }
|
|
1420
|
+
* ],
|
|
1421
|
+
* task: async ({ input }) => {
|
|
1422
|
+
* // Your model/task implementation
|
|
1423
|
+
* const response = await myModel.generate(input);
|
|
1424
|
+
* return response;
|
|
1425
|
+
* },
|
|
1426
|
+
* evaluators: [
|
|
1427
|
+
* async ({ output, expectedOutput }) => ({
|
|
1428
|
+
* name: "exact_match",
|
|
1429
|
+
* value: output.trim().toLowerCase() === expectedOutput.toLowerCase() ? 1 : 0
|
|
1430
|
+
* })
|
|
1431
|
+
* ]
|
|
1432
|
+
* });
|
|
1433
|
+
*
|
|
1434
|
+
* console.log(await result.prettyPrint());
|
|
1435
|
+
* ```
|
|
1436
|
+
*
|
|
1437
|
+
* @example Using with datasets
|
|
1438
|
+
* ```typescript
|
|
1439
|
+
* const dataset = await langfuse.dataset.get("my-test-dataset");
|
|
1440
|
+
* const result = await dataset.runExperiment({
|
|
1441
|
+
* name: "Production Readiness Test",
|
|
1442
|
+
* task: myTask,
|
|
1443
|
+
* evaluators: [accuracyEvaluator, latencyEvaluator],
|
|
1444
|
+
* runEvaluators: [overallQualityEvaluator]
|
|
1445
|
+
* });
|
|
1446
|
+
* ```
|
|
1447
|
+
*
|
|
1448
|
+
* @see {@link ExperimentManager} for detailed API documentation
|
|
1449
|
+
* @see {@link ExperimentParams} for configuration options
|
|
1450
|
+
* @see {@link ExperimentResult} for result structure
|
|
1451
|
+
* @public
|
|
1452
|
+
* @since 4.0.0
|
|
1453
|
+
*/
|
|
1454
|
+
experiment: ExperimentManager;
|
|
796
1455
|
// Base URL of the Langfuse host — presumably used when building UI links
// (e.g. getTraceUrl); TODO(review): confirm against the implementation.
private baseUrl;
|
|
797
1456
|
// Langfuse project identifier — presumably resolved lazily and embedded in
// project-scoped URLs; TODO(review): confirm against the implementation.
private projectId;
|
|
798
1457
|
/**
|
|
@@ -926,4 +1585,72 @@ declare class LangfuseClient {
|
|
|
926
1585
|
/**
 * Builds the Langfuse UI URL for a given trace.
 *
 * @param traceId - ID of the trace to link to.
 * @returns Promise resolving to the trace's URL string.
 */
getTraceUrl(traceId: string): Promise<string>;
|
|
927
1586
|
}
|
|
928
1587
|
|
|
929
|
-
|
|
1588
|
+
/**
|
|
1589
|
+
* Converts an AutoEvals evaluator to a Langfuse-compatible evaluator function.
|
|
1590
|
+
*
|
|
1591
|
+
* This adapter function bridges the gap between AutoEvals library evaluators
|
|
1592
|
+
* and Langfuse experiment evaluators, handling parameter mapping and result
|
|
1593
|
+
* formatting automatically.
|
|
1594
|
+
*
|
|
1595
|
+
* AutoEvals evaluators expect `input`, `output`, and `expected` parameters,
|
|
1596
|
+
* while Langfuse evaluators use `input`, `output`, and `expectedOutput`.
|
|
1597
|
+
* This function handles the parameter name mapping.
|
|
1598
|
+
*
|
|
1599
|
+
* @template E - Type of the AutoEvals evaluator function
|
|
1600
|
+
* @param autoevalEvaluator - The AutoEvals evaluator function to convert
|
|
1601
|
+
* @param params - Optional additional parameters to pass to the AutoEvals evaluator
|
|
1602
|
+
* @returns A Langfuse-compatible evaluator function
|
|
1603
|
+
*
|
|
1604
|
+
* @example Basic usage with AutoEvals
|
|
1605
|
+
* ```typescript
|
|
1606
|
+
* import { Factuality, Levenshtein } from 'autoevals';
|
|
1607
|
+
* import { autoevalsToLangfuseEvaluator } from '@langfuse/client';
|
|
1608
|
+
*
|
|
1609
|
+
* const factualityEvaluator = autoevalsToLangfuseEvaluator(Factuality);
|
|
1610
|
+
* const levenshteinEvaluator = autoevalsToLangfuseEvaluator(Levenshtein);
|
|
1611
|
+
*
|
|
1612
|
+
* await langfuse.experiment.run({
|
|
1613
|
+
* name: "AutoEvals Integration Test",
|
|
1614
|
+
* data: myDataset,
|
|
1615
|
+
* task: myTask,
|
|
1616
|
+
* evaluators: [factualityEvaluator, levenshteinEvaluator]
|
|
1617
|
+
* });
|
|
1618
|
+
* ```
|
|
1619
|
+
*
|
|
1620
|
+
* @example Using with additional parameters
|
|
1621
|
+
* ```typescript
|
|
1622
|
+
* import { Factuality } from 'autoevals';
|
|
1623
|
+
*
|
|
1624
|
+
* const factualityEvaluator = autoevalsToLangfuseEvaluator(
|
|
1625
|
+
* Factuality,
|
|
1626
|
+
* { model: 'gpt-4o' } // Additional params for AutoEvals
|
|
1627
|
+
* );
|
|
1628
|
+
*
|
|
1629
|
+
* await langfuse.experiment.run({
|
|
1630
|
+
* name: "Factuality Test",
|
|
1631
|
+
* data: myDataset,
|
|
1632
|
+
* task: myTask,
|
|
1633
|
+
* evaluators: [factualityEvaluator]
|
|
1634
|
+
* });
|
|
1635
|
+
* ```
|
|
1636
|
+
*
|
|
1637
|
+
* @see {@link https://github.com/braintrustdata/autoevals} AutoEvals library documentation
|
|
1638
|
+
* @see {@link Evaluator} for Langfuse evaluator specifications
|
|
1639
|
+
*
|
|
1640
|
+
* @public
|
|
1641
|
+
* @since 4.0.0
|
|
1642
|
+
*/
|
|
1643
|
+
declare function autoevalsToLangfuseEvaluator<E extends CallableFunction>(autoevalEvaluator: E, params?: Params<E>): Evaluator;
|
|
1644
|
+
/**
|
|
1645
|
+
* Utility type to extract parameter types from AutoEvals evaluator functions.
|
|
1646
|
+
*
|
|
1647
|
+
* This type helper extracts the parameter type from an AutoEvals evaluator
|
|
1648
|
+
* and omits the standard parameters (input, output, expected) that are
|
|
1649
|
+
* handled by the adapter, leaving only the additional configuration parameters.
|
|
1650
|
+
*
|
|
1651
|
+
* @template E - The AutoEvals evaluator function type
|
|
1652
|
+
* @internal
|
|
1653
|
+
*/
|
|
1654
|
+
type Params<E> = Parameters<E extends (...args: any[]) => any ? E : never>[0] extends infer P ? Omit<P, "input" | "output" | "expected"> : never;
|
|
1655
|
+
|
|
1656
|
+
// Public API surface: client/manager classes plus the experiment and
// evaluation types/helpers exported by this package entry point.
export { type ChatMessageOrPlaceholder, ChatMessageType, ChatPromptClient, type CreateChatPromptBodyWithPlaceholders, DatasetManager, type Evaluation, type Evaluator, type EvaluatorParams, type ExperimentItem, type ExperimentItemResult, ExperimentManager, type ExperimentParams, type ExperimentResult, type ExperimentTask, type ExperimentTaskParams, type FetchedDataset, type LangchainMessagesPlaceholder, LangfuseClient, type LangfuseClientParams, type LangfuseMediaResolveMediaReferencesParams, type LinkDatasetItemFunction, MediaManager, PromptManager, type RunEvaluator, type RunEvaluatorParams, type RunExperimentOnDataset, ScoreManager, TextPromptClient, autoevalsToLangfuseEvaluator };
|