@langfuse/client 4.0.0 → 4.1.0-alpha.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +626 -53
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +756 -29
- package/dist/index.d.ts +756 -29
- package/dist/index.mjs +610 -39
- package/dist/index.mjs.map +1 -1
- package/package.json +3 -2
package/dist/index.cjs
CHANGED
|
@@ -33,16 +33,18 @@ __export(index_exports, {
|
|
|
33
33
|
ChatMessageType: () => ChatMessageType,
|
|
34
34
|
ChatPromptClient: () => ChatPromptClient,
|
|
35
35
|
DatasetManager: () => DatasetManager,
|
|
36
|
+
ExperimentManager: () => ExperimentManager,
|
|
36
37
|
LangfuseClient: () => LangfuseClient,
|
|
37
38
|
MediaManager: () => MediaManager,
|
|
38
39
|
PromptManager: () => PromptManager,
|
|
39
40
|
ScoreManager: () => ScoreManager,
|
|
40
|
-
TextPromptClient: () => TextPromptClient
|
|
41
|
+
TextPromptClient: () => TextPromptClient,
|
|
42
|
+
autoevalsToLangfuseEvaluator: () => autoevalsToLangfuseEvaluator
|
|
41
43
|
});
|
|
42
44
|
module.exports = __toCommonJS(index_exports);
|
|
43
45
|
|
|
44
46
|
// src/LangfuseClient.ts
|
|
45
|
-
var
|
|
47
|
+
var import_core6 = require("@langfuse/core");
|
|
46
48
|
|
|
47
49
|
// src/dataset/index.ts
|
|
48
50
|
var DatasetManager = class {
|
|
@@ -53,44 +55,91 @@ var DatasetManager = class {
|
|
|
53
55
|
* @internal
|
|
54
56
|
*/
|
|
55
57
|
constructor(params) {
|
|
56
|
-
this.
|
|
58
|
+
this.langfuseClient = params.langfuseClient;
|
|
57
59
|
}
|
|
58
60
|
/**
|
|
59
|
-
* Retrieves a dataset by name
|
|
61
|
+
* Retrieves a dataset by name with all its items and experiment functionality.
|
|
60
62
|
*
|
|
61
|
-
* This method
|
|
62
|
-
*
|
|
63
|
+
* This method fetches a dataset and all its associated items, with support
|
|
64
|
+
* for automatic pagination to handle large datasets efficiently. The returned
|
|
65
|
+
* dataset object includes enhanced functionality for linking items to traces
|
|
66
|
+
* and running experiments directly on the dataset.
|
|
63
67
|
*
|
|
64
68
|
* @param name - The name of the dataset to retrieve
|
|
65
|
-
* @param options - Optional configuration for fetching
|
|
69
|
+
* @param options - Optional configuration for data fetching
|
|
66
70
|
* @param options.fetchItemsPageSize - Number of items to fetch per page (default: 50)
|
|
71
|
+
* @returns Promise resolving to enhanced dataset with items, linking, and experiment capabilities
|
|
67
72
|
*
|
|
68
|
-
* @
|
|
73
|
+
* @example Basic dataset retrieval
|
|
74
|
+
* ```typescript
|
|
75
|
+
* const dataset = await langfuse.dataset.get("my-evaluation-dataset");
|
|
76
|
+
* console.log(`Dataset ${dataset.name} has ${dataset.items.length} items`);
|
|
69
77
|
*
|
|
70
|
-
*
|
|
78
|
+
* // Access dataset properties
|
|
79
|
+
* console.log(dataset.description);
|
|
80
|
+
* console.log(dataset.metadata);
|
|
81
|
+
* ```
|
|
82
|
+
*
|
|
83
|
+
* @example Working with dataset items
|
|
71
84
|
* ```typescript
|
|
72
|
-
* const dataset = await langfuse.dataset.get("
|
|
85
|
+
* const dataset = await langfuse.dataset.get("qa-dataset");
|
|
73
86
|
*
|
|
74
87
|
* for (const item of dataset.items) {
|
|
75
|
-
*
|
|
76
|
-
*
|
|
77
|
-
*
|
|
78
|
-
* //
|
|
79
|
-
* await item.link(
|
|
80
|
-
* { otelSpan: currentSpan },
|
|
81
|
-
* "experiment-run-1",
|
|
82
|
-
* { description: "Testing new model" }
|
|
83
|
-
* );
|
|
88
|
+
* console.log("Question:", item.input);
|
|
89
|
+
* console.log("Expected Answer:", item.expectedOutput);
|
|
90
|
+
*
|
|
91
|
+
* // Each item has a link function for connecting to traces
|
|
92
|
+
* // await item.link(span, "experiment-name");
|
|
84
93
|
* }
|
|
85
94
|
* ```
|
|
95
|
+
*
|
|
96
|
+
* @example Running experiments on datasets
|
|
97
|
+
* ```typescript
|
|
98
|
+
* const dataset = await langfuse.dataset.get("benchmark-dataset");
|
|
99
|
+
*
|
|
100
|
+
* const result = await dataset.runExperiment({
|
|
101
|
+
* name: "GPT-4 Benchmark",
|
|
102
|
+
* description: "Evaluating GPT-4 on our benchmark tasks",
|
|
103
|
+
* task: async ({ input }) => {
|
|
104
|
+
* const response = await openai.chat.completions.create({
|
|
105
|
+
* model: "gpt-4",
|
|
106
|
+
* messages: [{ role: "user", content: input }]
|
|
107
|
+
* });
|
|
108
|
+
* return response.choices[0].message.content;
|
|
109
|
+
* },
|
|
110
|
+
* evaluators: [
|
|
111
|
+
* async ({ output, expectedOutput }) => ({
|
|
112
|
+
* name: "exact_match",
|
|
113
|
+
* value: output === expectedOutput ? 1 : 0
|
|
114
|
+
* })
|
|
115
|
+
* ]
|
|
116
|
+
* });
|
|
117
|
+
*
|
|
118
|
+
* console.log(await result.prettyPrint());
|
|
119
|
+
* ```
|
|
120
|
+
*
|
|
121
|
+
* @example Handling large datasets
|
|
122
|
+
* ```typescript
|
|
123
|
+
* // For very large datasets, use smaller page sizes
|
|
124
|
+
* const largeDataset = await langfuse.dataset.get(
|
|
125
|
+
* "large-dataset",
|
|
126
|
+
* { fetchItemsPageSize: 100 }
|
|
127
|
+
* );
|
|
128
|
+
* ```
|
|
129
|
+
*
|
|
130
|
+
* @throws {Error} If the dataset does not exist or cannot be accessed
|
|
131
|
+
* @see {@link FetchedDataset} for the complete return type specification
|
|
132
|
+
* @see {@link RunExperimentOnDataset} for experiment execution details
|
|
133
|
+
* @public
|
|
134
|
+
* @since 4.0.0
|
|
86
135
|
*/
|
|
87
136
|
async get(name, options) {
|
|
88
137
|
var _a;
|
|
89
|
-
const dataset = await this.
|
|
138
|
+
const dataset = await this.langfuseClient.api.datasets.get(name);
|
|
90
139
|
const items = [];
|
|
91
140
|
let page = 1;
|
|
92
141
|
while (true) {
|
|
93
|
-
const itemsResponse = await this.
|
|
142
|
+
const itemsResponse = await this.langfuseClient.api.datasetItems.list({
|
|
94
143
|
datasetName: name,
|
|
95
144
|
limit: (_a = options == null ? void 0 : options.fetchItemsPageSize) != null ? _a : 50,
|
|
96
145
|
page
|
|
@@ -101,12 +150,20 @@ var DatasetManager = class {
|
|
|
101
150
|
}
|
|
102
151
|
page++;
|
|
103
152
|
}
|
|
153
|
+
const itemsWithLinkMethod = items.map((item) => ({
|
|
154
|
+
...item,
|
|
155
|
+
link: this.createDatasetItemLinkFunction(item)
|
|
156
|
+
}));
|
|
157
|
+
const runExperiment = (params) => {
|
|
158
|
+
return this.langfuseClient.experiment.run({
|
|
159
|
+
data: items,
|
|
160
|
+
...params
|
|
161
|
+
});
|
|
162
|
+
};
|
|
104
163
|
const returnDataset = {
|
|
105
164
|
...dataset,
|
|
106
|
-
items:
|
|
107
|
-
|
|
108
|
-
link: this.createDatasetItemLinkFunction(item)
|
|
109
|
-
}))
|
|
165
|
+
items: itemsWithLinkMethod,
|
|
166
|
+
runExperiment
|
|
110
167
|
};
|
|
111
168
|
return returnDataset;
|
|
112
169
|
}
|
|
@@ -119,7 +176,7 @@ var DatasetManager = class {
|
|
|
119
176
|
*/
|
|
120
177
|
createDatasetItemLinkFunction(item) {
|
|
121
178
|
const linkFunction = async (obj, runName, runArgs) => {
|
|
122
|
-
return await this.
|
|
179
|
+
return await this.langfuseClient.api.datasetRunItems.create({
|
|
123
180
|
runName,
|
|
124
181
|
datasetItemId: item.id,
|
|
125
182
|
traceId: obj.otelSpan.spanContext().traceId,
|
|
@@ -131,8 +188,498 @@ var DatasetManager = class {
|
|
|
131
188
|
}
|
|
132
189
|
};
|
|
133
190
|
|
|
134
|
-
// src/
|
|
191
|
+
// src/experiment/ExperimentManager.ts
|
|
135
192
|
var import_core = require("@langfuse/core");
|
|
193
|
+
var import_tracing = require("@langfuse/tracing");
|
|
194
|
+
var import_api = require("@opentelemetry/api");
|
|
195
|
+
var ExperimentManager = class {
|
|
196
|
+
/**
|
|
197
|
+
* Creates a new ExperimentManager instance.
|
|
198
|
+
*
|
|
199
|
+
* @param params - Configuration object
|
|
200
|
+
* @param params.langfuseClient - The Langfuse client instance for API communication
|
|
201
|
+
* @internal
|
|
202
|
+
*/
|
|
203
|
+
constructor(params) {
|
|
204
|
+
this.langfuseClient = params.langfuseClient;
|
|
205
|
+
}
|
|
206
|
+
/**
|
|
207
|
+
* Gets the global logger instance for experiment-related logging.
|
|
208
|
+
*
|
|
209
|
+
* @returns The global logger instance
|
|
210
|
+
* @internal
|
|
211
|
+
*/
|
|
212
|
+
get logger() {
|
|
213
|
+
return (0, import_core.getGlobalLogger)();
|
|
214
|
+
}
|
|
215
|
+
/**
|
|
216
|
+
* Executes an experiment by running a task on each data item and evaluating the results.
|
|
217
|
+
*
|
|
218
|
+
* This method orchestrates the complete experiment lifecycle:
|
|
219
|
+
* 1. Executes the task function on each data item with proper tracing
|
|
220
|
+
* 2. Runs item-level evaluators on each task output
|
|
221
|
+
* 3. Executes run-level evaluators on the complete result set
|
|
222
|
+
* 4. Links results to dataset runs (for Langfuse datasets)
|
|
223
|
+
* 5. Stores all scores and traces in Langfuse
|
|
224
|
+
*
|
|
225
|
+
* @param config - The experiment configuration
|
|
226
|
+
* @param config.name - Human-readable name for the experiment
|
|
227
|
+
* @param config.description - Optional description of the experiment's purpose
|
|
228
|
+
* @param config.metadata - Optional metadata to attach to the experiment run
|
|
229
|
+
* @param config.data - Array of data items to process (ExperimentItem[] or DatasetItem[])
|
|
230
|
+
* @param config.task - Function that processes each data item and returns output
|
|
231
|
+
* @param config.evaluators - Optional array of functions to evaluate each item's output
|
|
232
|
+
* @param config.runEvaluators - Optional array of functions to evaluate the entire run
|
|
233
|
+
* @param config.maxConcurrency - Maximum number of concurrent task executions (default: Infinity)
|
|
234
|
+
*
|
|
235
|
+
* @returns Promise that resolves to experiment results including:
|
|
236
|
+
* - itemResults: Results for each processed data item
|
|
237
|
+
* - runEvaluations: Results from run-level evaluators
|
|
238
|
+
* - datasetRunId: ID of the dataset run (if using Langfuse datasets)
|
|
239
|
+
* - prettyPrint: Function to format and display results
|
|
240
|
+
*
|
|
241
|
+
* @throws {Error} When task execution fails and cannot be handled gracefully
|
|
242
|
+
* @throws {Error} When required evaluators fail critically
|
|
243
|
+
*
|
|
244
|
+
* @example Simple experiment
|
|
245
|
+
* ```typescript
|
|
246
|
+
* const result = await langfuse.experiment.run({
|
|
247
|
+
* name: "Translation Quality Test",
|
|
248
|
+
* data: [
|
|
249
|
+
* { input: "Hello world", expectedOutput: "Hola mundo" },
|
|
250
|
+
* { input: "Good morning", expectedOutput: "Buenos días" }
|
|
251
|
+
* ],
|
|
252
|
+
* task: async ({ input }) => translateText(input, 'es'),
|
|
253
|
+
* evaluators: [
|
|
254
|
+
* async ({ output, expectedOutput }) => ({
|
|
255
|
+
* name: "bleu_score",
|
|
256
|
+
* value: calculateBleuScore(output, expectedOutput)
|
|
257
|
+
* })
|
|
258
|
+
* ]
|
|
259
|
+
* });
|
|
260
|
+
* ```
|
|
261
|
+
*
|
|
262
|
+
* @example Experiment with concurrency control
|
|
263
|
+
* ```typescript
|
|
264
|
+
* const result = await langfuse.experiment.run({
|
|
265
|
+
* name: "Large Scale Evaluation",
|
|
266
|
+
* data: largeBatchOfItems,
|
|
267
|
+
* task: expensiveModelCall,
|
|
268
|
+
* maxConcurrency: 5, // Process max 5 items simultaneously
|
|
269
|
+
* evaluators: [myEvaluator],
|
|
270
|
+
* runEvaluators: [
|
|
271
|
+
* async ({ itemResults }) => ({
|
|
272
|
+
* name: "average_score",
|
|
273
|
+
* value: itemResults.reduce((acc, r) => acc + r.evaluations[0].value, 0) / itemResults.length
|
|
274
|
+
* })
|
|
275
|
+
* ]
|
|
276
|
+
* });
|
|
277
|
+
* ```
|
|
278
|
+
*
|
|
279
|
+
* @see {@link ExperimentParams} for detailed parameter documentation
|
|
280
|
+
* @see {@link ExperimentResult} for detailed return value documentation
|
|
281
|
+
* @see {@link Evaluator} for evaluator function specifications
|
|
282
|
+
* @see {@link RunEvaluator} for run evaluator function specifications
|
|
283
|
+
*
|
|
284
|
+
* @public
|
|
285
|
+
*/
|
|
286
|
+
async run(config) {
|
|
287
|
+
const {
|
|
288
|
+
data,
|
|
289
|
+
evaluators,
|
|
290
|
+
task,
|
|
291
|
+
name,
|
|
292
|
+
description,
|
|
293
|
+
metadata,
|
|
294
|
+
maxConcurrency: batchSize = Infinity,
|
|
295
|
+
runEvaluators
|
|
296
|
+
} = config;
|
|
297
|
+
if (!this.isOtelRegistered()) {
|
|
298
|
+
this.logger.warn(
|
|
299
|
+
"OpenTelemetry has not been set up. Traces will not be sent to Langfuse.See our docs on how to set up OpenTelemetry: https://langfuse.com/docs/observability/sdk/typescript/setup#tracing-setup"
|
|
300
|
+
);
|
|
301
|
+
}
|
|
302
|
+
const itemResults = [];
|
|
303
|
+
for (let i = 0; i < data.length; i += batchSize) {
|
|
304
|
+
const batch = data.slice(i, i + batchSize);
|
|
305
|
+
const promises = batch.map(
|
|
306
|
+
async (item) => {
|
|
307
|
+
return this.runItem({
|
|
308
|
+
item,
|
|
309
|
+
evaluators,
|
|
310
|
+
task,
|
|
311
|
+
experimentName: name,
|
|
312
|
+
experimentDescription: description,
|
|
313
|
+
experimentMetadata: metadata
|
|
314
|
+
});
|
|
315
|
+
}
|
|
316
|
+
);
|
|
317
|
+
const results = await Promise.all(promises);
|
|
318
|
+
itemResults.push(...results);
|
|
319
|
+
}
|
|
320
|
+
const datasetRunId = itemResults.length > 0 ? itemResults[0].datasetRunId : void 0;
|
|
321
|
+
let datasetRunUrl = void 0;
|
|
322
|
+
if (datasetRunId && data.length > 0 && "datasetId" in data[0]) {
|
|
323
|
+
const datasetId = data[0].datasetId;
|
|
324
|
+
const projectUrl = (await this.langfuseClient.getTraceUrl("mock")).split(
|
|
325
|
+
"/traces"
|
|
326
|
+
)[0];
|
|
327
|
+
datasetRunUrl = `${projectUrl}/datasets/${datasetId}/runs/${datasetRunId}`;
|
|
328
|
+
}
|
|
329
|
+
let runEvaluations = [];
|
|
330
|
+
if (runEvaluators && (runEvaluators == null ? void 0 : runEvaluators.length) > 0) {
|
|
331
|
+
const promises = runEvaluators.map(async (runEvaluator) => {
|
|
332
|
+
return runEvaluator({ itemResults }).then((result) => {
|
|
333
|
+
return Array.isArray(result) ? result : [result];
|
|
334
|
+
}).catch((err) => {
|
|
335
|
+
this.logger.error("Run evaluator failed with error ", err);
|
|
336
|
+
throw err;
|
|
337
|
+
});
|
|
338
|
+
});
|
|
339
|
+
runEvaluations = (await Promise.allSettled(promises)).reduce(
|
|
340
|
+
(acc, settledPromise) => {
|
|
341
|
+
if (settledPromise.status === "fulfilled") {
|
|
342
|
+
acc.push(...settledPromise.value);
|
|
343
|
+
}
|
|
344
|
+
return acc;
|
|
345
|
+
},
|
|
346
|
+
[]
|
|
347
|
+
);
|
|
348
|
+
if (datasetRunId) {
|
|
349
|
+
runEvaluations.forEach(
|
|
350
|
+
(runEval) => this.langfuseClient.score.create({ datasetRunId, ...runEval })
|
|
351
|
+
);
|
|
352
|
+
}
|
|
353
|
+
}
|
|
354
|
+
await this.langfuseClient.score.flush();
|
|
355
|
+
return {
|
|
356
|
+
itemResults,
|
|
357
|
+
datasetRunId,
|
|
358
|
+
runEvaluations,
|
|
359
|
+
prettyPrint: async (options) => {
|
|
360
|
+
var _a;
|
|
361
|
+
return await this.prettyPrintResults({
|
|
362
|
+
datasetRunUrl,
|
|
363
|
+
itemResults,
|
|
364
|
+
originalData: data,
|
|
365
|
+
runEvaluations,
|
|
366
|
+
name: config.name,
|
|
367
|
+
description: config.description,
|
|
368
|
+
includeItemResults: (_a = options == null ? void 0 : options.includeItemResults) != null ? _a : true
|
|
369
|
+
});
|
|
370
|
+
}
|
|
371
|
+
};
|
|
372
|
+
}
|
|
373
|
+
/**
|
|
374
|
+
* Executes the task and evaluators for a single data item.
|
|
375
|
+
*
|
|
376
|
+
* This method handles the complete processing pipeline for one data item:
|
|
377
|
+
* 1. Executes the task within a traced observation span
|
|
378
|
+
* 2. Links the result to a dataset run (if applicable)
|
|
379
|
+
* 3. Runs all item-level evaluators on the output
|
|
380
|
+
* 4. Stores evaluation scores in Langfuse
|
|
381
|
+
* 5. Handles errors gracefully by continuing with remaining evaluators
|
|
382
|
+
*
|
|
383
|
+
* @param params - Parameters for item execution
|
|
384
|
+
* @param params.experimentName - Name of the parent experiment
|
|
385
|
+
* @param params.experimentDescription - Description of the parent experiment
|
|
386
|
+
* @param params.experimentMetadata - Metadata for the parent experiment
|
|
387
|
+
* @param params.item - The data item to process
|
|
388
|
+
* @param params.task - The task function to execute
|
|
389
|
+
* @param params.evaluators - Optional evaluators to run on the output
|
|
390
|
+
*
|
|
391
|
+
* @returns Promise resolving to the item result with output, evaluations, and trace info
|
|
392
|
+
*
|
|
393
|
+
* @throws {Error} When task execution fails (propagated from task function)
|
|
394
|
+
*
|
|
395
|
+
* @internal
|
|
396
|
+
*/
|
|
397
|
+
async runItem(params) {
|
|
398
|
+
const { item, evaluators = [], task } = params;
|
|
399
|
+
const { output, traceId } = await (0, import_tracing.startActiveObservation)(
|
|
400
|
+
"experiment-item-run",
|
|
401
|
+
async (span) => {
|
|
402
|
+
const output2 = await task(item);
|
|
403
|
+
span.update({
|
|
404
|
+
input: item.input,
|
|
405
|
+
output: output2
|
|
406
|
+
});
|
|
407
|
+
return { output: output2, traceId: span.traceId };
|
|
408
|
+
}
|
|
409
|
+
);
|
|
410
|
+
let datasetRunId = void 0;
|
|
411
|
+
if ("id" in item) {
|
|
412
|
+
await this.langfuseClient.api.datasetRunItems.create({
|
|
413
|
+
runName: params.experimentName,
|
|
414
|
+
runDescription: params.experimentDescription,
|
|
415
|
+
metadata: params.experimentMetadata,
|
|
416
|
+
datasetItemId: item.id,
|
|
417
|
+
traceId
|
|
418
|
+
}).then((result) => {
|
|
419
|
+
datasetRunId = result.datasetRunId;
|
|
420
|
+
}).catch(
|
|
421
|
+
(err) => this.logger.error("Linking dataset run item failed", err)
|
|
422
|
+
);
|
|
423
|
+
}
|
|
424
|
+
const evalPromises = evaluators.map(
|
|
425
|
+
async (evaluator) => {
|
|
426
|
+
const params2 = {
|
|
427
|
+
input: item.input,
|
|
428
|
+
expectedOutput: item.expectedOutput,
|
|
429
|
+
output
|
|
430
|
+
};
|
|
431
|
+
return evaluator(params2).then((result) => {
|
|
432
|
+
return Array.isArray(result) ? result : [result];
|
|
433
|
+
}).catch((err) => {
|
|
434
|
+
this.logger.error(
|
|
435
|
+
`Evaluator '${evaluator.name}' failed for params
|
|
436
|
+
|
|
437
|
+
${JSON.stringify(params2)}
|
|
438
|
+
|
|
439
|
+
with error: ${err}`
|
|
440
|
+
);
|
|
441
|
+
throw err;
|
|
442
|
+
});
|
|
443
|
+
}
|
|
444
|
+
);
|
|
445
|
+
const evals = (await Promise.allSettled(evalPromises)).reduce(
|
|
446
|
+
(acc, promiseResult) => {
|
|
447
|
+
if (promiseResult.status === "fulfilled") {
|
|
448
|
+
acc.push(...promiseResult.value.flat());
|
|
449
|
+
}
|
|
450
|
+
return acc;
|
|
451
|
+
},
|
|
452
|
+
[]
|
|
453
|
+
);
|
|
454
|
+
for (const ev of evals) {
|
|
455
|
+
this.langfuseClient.score.create({
|
|
456
|
+
traceId,
|
|
457
|
+
name: ev.name,
|
|
458
|
+
comment: ev.comment,
|
|
459
|
+
value: ev.value,
|
|
460
|
+
metadata: ev.metadata,
|
|
461
|
+
dataType: ev.dataType
|
|
462
|
+
});
|
|
463
|
+
}
|
|
464
|
+
return {
|
|
465
|
+
output,
|
|
466
|
+
evaluations: evals,
|
|
467
|
+
traceId,
|
|
468
|
+
datasetRunId
|
|
469
|
+
};
|
|
470
|
+
}
|
|
471
|
+
/**
|
|
472
|
+
* Formats experiment results into a human-readable string representation.
|
|
473
|
+
*
|
|
474
|
+
* Creates a comprehensive, nicely formatted summary of the experiment including:
|
|
475
|
+
* - Individual item results with inputs, outputs, expected values, and scores
|
|
476
|
+
* - Dataset item and trace links (when available)
|
|
477
|
+
* - Experiment overview with aggregate statistics
|
|
478
|
+
* - Average scores across all evaluations
|
|
479
|
+
* - Run-level evaluation results
|
|
480
|
+
* - Links to dataset runs in the Langfuse UI
|
|
481
|
+
*
|
|
482
|
+
* @param params - Formatting parameters
|
|
483
|
+
* @param params.datasetRunUrl - Optional URL to the dataset run in Langfuse UI
|
|
484
|
+
* @param params.itemResults - Results from processing each data item
|
|
485
|
+
* @param params.originalData - The original input data items
|
|
486
|
+
* @param params.runEvaluations - Results from run-level evaluators
|
|
487
|
+
* @param params.name - Name of the experiment
|
|
488
|
+
* @param params.description - Optional description of the experiment
|
|
489
|
+
* @param params.includeItemResults - Whether to include individual item details (default: true)
|
|
490
|
+
*
|
|
491
|
+
* @returns Promise resolving to formatted string representation
|
|
492
|
+
*
|
|
493
|
+
* @example Output format
|
|
494
|
+
* ```
|
|
495
|
+
* 1. Item 1:
|
|
496
|
+
* Input: What is the capital of France?
|
|
497
|
+
* Expected: Paris
|
|
498
|
+
* Actual: Paris
|
|
499
|
+
* Scores:
|
|
500
|
+
* • exact_match: 1.000
|
|
501
|
+
* • similarity: 0.95
|
|
502
|
+
* 💭 Very close match with expected output
|
|
503
|
+
*
|
|
504
|
+
* Dataset Item:
|
|
505
|
+
* https://cloud.langfuse.com/project/123/datasets/456/items/789
|
|
506
|
+
*
|
|
507
|
+
* Trace:
|
|
508
|
+
* https://cloud.langfuse.com/project/123/traces/abc123
|
|
509
|
+
*
|
|
510
|
+
* ──────────────────────────────────────────────────
|
|
511
|
+
* 📊 Translation Quality Test - Testing model accuracy
|
|
512
|
+
* 2 items
|
|
513
|
+
* Evaluations:
|
|
514
|
+
* • exact_match
|
|
515
|
+
* • similarity
|
|
516
|
+
*
|
|
517
|
+
* Average Scores:
|
|
518
|
+
* • exact_match: 0.850
|
|
519
|
+
* • similarity: 0.923
|
|
520
|
+
*
|
|
521
|
+
* Run Evaluations:
|
|
522
|
+
* • overall_quality: 0.887
|
|
523
|
+
* 💭 Good performance with room for improvement
|
|
524
|
+
*
|
|
525
|
+
* 🔗 Dataset Run:
|
|
526
|
+
* https://cloud.langfuse.com/project/123/datasets/456/runs/def456
|
|
527
|
+
* ```
|
|
528
|
+
*
|
|
529
|
+
* @internal
|
|
530
|
+
*/
|
|
531
|
+
async prettyPrintResults(params) {
|
|
532
|
+
var _a, _b;
|
|
533
|
+
const {
|
|
534
|
+
itemResults,
|
|
535
|
+
originalData,
|
|
536
|
+
runEvaluations,
|
|
537
|
+
name,
|
|
538
|
+
description,
|
|
539
|
+
includeItemResults = true
|
|
540
|
+
} = params;
|
|
541
|
+
if (itemResults.length === 0) {
|
|
542
|
+
return "No experiment results to display.";
|
|
543
|
+
}
|
|
544
|
+
let output = "";
|
|
545
|
+
if (includeItemResults) {
|
|
546
|
+
for (let index = 0; index < itemResults.length; index++) {
|
|
547
|
+
const result = itemResults[index];
|
|
548
|
+
const originalItem = originalData[index];
|
|
549
|
+
output += `
|
|
550
|
+
${index + 1}. Item ${index + 1}:
|
|
551
|
+
`;
|
|
552
|
+
if ((originalItem == null ? void 0 : originalItem.input) !== void 0) {
|
|
553
|
+
output += ` Input: ${this.formatValue(originalItem.input)}
|
|
554
|
+
`;
|
|
555
|
+
}
|
|
556
|
+
const expectedOutput = (_b = (_a = originalItem == null ? void 0 : originalItem.expectedOutput) != null ? _a : result.expectedOutput) != null ? _b : null;
|
|
557
|
+
output += ` Expected: ${expectedOutput !== null ? this.formatValue(expectedOutput) : "null"}
|
|
558
|
+
`;
|
|
559
|
+
output += ` Actual: ${this.formatValue(result.output)}
|
|
560
|
+
`;
|
|
561
|
+
if (result.evaluations.length > 0) {
|
|
562
|
+
output += ` Scores:
|
|
563
|
+
`;
|
|
564
|
+
result.evaluations.forEach((evaluation) => {
|
|
565
|
+
const score = typeof evaluation.value === "number" ? evaluation.value.toFixed(3) : evaluation.value;
|
|
566
|
+
output += ` \u2022 ${evaluation.name}: ${score}`;
|
|
567
|
+
if (evaluation.comment) {
|
|
568
|
+
output += `
|
|
569
|
+
\u{1F4AD} ${evaluation.comment}`;
|
|
570
|
+
}
|
|
571
|
+
output += "\n";
|
|
572
|
+
});
|
|
573
|
+
}
|
|
574
|
+
if (originalItem && "id" in originalItem && "datasetId" in originalItem) {
|
|
575
|
+
const projectUrl = (await this.langfuseClient.getTraceUrl("mock")).split("/traces")[0];
|
|
576
|
+
const datasetItemUrl = `${projectUrl}/datasets/${originalItem.datasetId}/items/${originalItem.id}`;
|
|
577
|
+
output += `
|
|
578
|
+
Dataset Item:
|
|
579
|
+
${datasetItemUrl}
|
|
580
|
+
`;
|
|
581
|
+
}
|
|
582
|
+
if (result.traceId) {
|
|
583
|
+
const traceUrl = await this.langfuseClient.getTraceUrl(
|
|
584
|
+
result.traceId
|
|
585
|
+
);
|
|
586
|
+
output += `
|
|
587
|
+
Trace:
|
|
588
|
+
${traceUrl}
|
|
589
|
+
`;
|
|
590
|
+
}
|
|
591
|
+
}
|
|
592
|
+
} else {
|
|
593
|
+
output += `Individual Results: Hidden (${itemResults.length} items)
|
|
594
|
+
`;
|
|
595
|
+
output += "\u{1F4A1} Call prettyPrint({ includeItemResults: true }) to view them\n";
|
|
596
|
+
}
|
|
597
|
+
const totalItems = itemResults.length;
|
|
598
|
+
const evaluationNames = new Set(
|
|
599
|
+
itemResults.flatMap((r) => r.evaluations.map((e) => e.name))
|
|
600
|
+
);
|
|
601
|
+
output += `
|
|
602
|
+
${"\u2500".repeat(50)}
|
|
603
|
+
`;
|
|
604
|
+
output += `\u{1F4CA} ${name}`;
|
|
605
|
+
if (description) {
|
|
606
|
+
output += ` - ${description}`;
|
|
607
|
+
}
|
|
608
|
+
output += `
|
|
609
|
+
${totalItems} items`;
|
|
610
|
+
if (evaluationNames.size > 0) {
|
|
611
|
+
output += `
|
|
612
|
+
Evaluations:`;
|
|
613
|
+
Array.from(evaluationNames).forEach((evalName) => {
|
|
614
|
+
output += `
|
|
615
|
+
\u2022 ${evalName}`;
|
|
616
|
+
});
|
|
617
|
+
output += "\n";
|
|
618
|
+
}
|
|
619
|
+
if (evaluationNames.size > 0) {
|
|
620
|
+
output += `
|
|
621
|
+
Average Scores:`;
|
|
622
|
+
for (const evalName of evaluationNames) {
|
|
623
|
+
const scores = itemResults.flatMap((r) => r.evaluations).filter((e) => e.name === evalName && typeof e.value === "number").map((e) => e.value);
|
|
624
|
+
if (scores.length > 0) {
|
|
625
|
+
const avg = scores.reduce((a, b) => a + b, 0) / scores.length;
|
|
626
|
+
output += `
|
|
627
|
+
\u2022 ${evalName}: ${avg.toFixed(3)}`;
|
|
628
|
+
}
|
|
629
|
+
}
|
|
630
|
+
output += "\n";
|
|
631
|
+
}
|
|
632
|
+
if (runEvaluations.length > 0) {
|
|
633
|
+
output += `
|
|
634
|
+
Run Evaluations:`;
|
|
635
|
+
runEvaluations.forEach((runEval) => {
|
|
636
|
+
const score = typeof runEval.value === "number" ? runEval.value.toFixed(3) : runEval.value;
|
|
637
|
+
output += `
|
|
638
|
+
\u2022 ${runEval.name}: ${score}`;
|
|
639
|
+
if (runEval.comment) {
|
|
640
|
+
output += `
|
|
641
|
+
\u{1F4AD} ${runEval.comment}`;
|
|
642
|
+
}
|
|
643
|
+
});
|
|
644
|
+
output += "\n";
|
|
645
|
+
}
|
|
646
|
+
if (params.datasetRunUrl) {
|
|
647
|
+
output += `
|
|
648
|
+
\u{1F517} Dataset Run:
|
|
649
|
+
${params.datasetRunUrl}`;
|
|
650
|
+
}
|
|
651
|
+
return output;
|
|
652
|
+
}
|
|
653
|
+
/**
|
|
654
|
+
* Formats a value for display in pretty-printed output.
|
|
655
|
+
*
|
|
656
|
+
* Handles different value types appropriately:
|
|
657
|
+
* - Strings: Truncates long strings to 50 characters with "..."
|
|
658
|
+
* - Objects/Arrays: Converts to JSON string representation
|
|
659
|
+
* - Primitives: Uses toString() representation
|
|
660
|
+
*
|
|
661
|
+
* @param value - The value to format
|
|
662
|
+
* @returns Formatted string representation suitable for display
|
|
663
|
+
*
|
|
664
|
+
* @internal
|
|
665
|
+
*/
|
|
666
|
+
formatValue(value) {
|
|
667
|
+
if (typeof value === "string") {
|
|
668
|
+
return value.length > 50 ? `${value.substring(0, 47)}...` : value;
|
|
669
|
+
}
|
|
670
|
+
return JSON.stringify(value);
|
|
671
|
+
}
|
|
672
|
+
isOtelRegistered() {
|
|
673
|
+
let tracerProvider = import_api.trace.getTracerProvider();
|
|
674
|
+
if (tracerProvider instanceof import_api.ProxyTracerProvider) {
|
|
675
|
+
tracerProvider = tracerProvider.getDelegate();
|
|
676
|
+
}
|
|
677
|
+
return tracerProvider.constructor.name !== "NoopTracerProvider";
|
|
678
|
+
}
|
|
679
|
+
};
|
|
680
|
+
|
|
681
|
+
// src/media/index.ts
|
|
682
|
+
var import_core2 = require("@langfuse/core");
|
|
136
683
|
var MediaManager = class _MediaManager {
|
|
137
684
|
/**
|
|
138
685
|
* Creates a new MediaManager instance.
|
|
@@ -210,14 +757,14 @@ var MediaManager = class _MediaManager {
|
|
|
210
757
|
const uint8Content = new Uint8Array(
|
|
211
758
|
await mediaContent.arrayBuffer()
|
|
212
759
|
);
|
|
213
|
-
const base64MediaContent = (0,
|
|
760
|
+
const base64MediaContent = (0, import_core2.bytesToBase64)(uint8Content);
|
|
214
761
|
const base64DataUri = `data:${mediaData.contentType};base64,${base64MediaContent}`;
|
|
215
762
|
referenceStringToMediaContentMap.set(
|
|
216
763
|
referenceString,
|
|
217
764
|
base64DataUri
|
|
218
765
|
);
|
|
219
766
|
} catch (error) {
|
|
220
|
-
(0,
|
|
767
|
+
(0, import_core2.getGlobalLogger)().warn(
|
|
221
768
|
"Error fetching media content for reference string",
|
|
222
769
|
referenceString,
|
|
223
770
|
error
|
|
@@ -293,10 +840,10 @@ var MediaManager = class _MediaManager {
|
|
|
293
840
|
};
|
|
294
841
|
|
|
295
842
|
// src/prompt/promptManager.ts
|
|
296
|
-
var
|
|
843
|
+
var import_core4 = require("@langfuse/core");
|
|
297
844
|
|
|
298
845
|
// src/prompt/promptCache.ts
|
|
299
|
-
var
|
|
846
|
+
var import_core3 = require("@langfuse/core");
|
|
300
847
|
var DEFAULT_PROMPT_CACHE_TTL_SECONDS = 60;
|
|
301
848
|
var LangfusePromptCacheItem = class {
|
|
302
849
|
constructor(value, ttlSeconds) {
|
|
@@ -348,7 +895,7 @@ var LangfusePromptCache = class {
|
|
|
348
895
|
return this._refreshingKeys.has(key);
|
|
349
896
|
}
|
|
350
897
|
invalidate(promptName) {
|
|
351
|
-
(0,
|
|
898
|
+
(0, import_core3.getGlobalLogger)().debug(
|
|
352
899
|
"Invalidating cache keys for",
|
|
353
900
|
promptName,
|
|
354
901
|
this._cache.keys()
|
|
@@ -692,7 +1239,7 @@ var PromptManager = class {
|
|
|
692
1239
|
this.cache = new LangfusePromptCache();
|
|
693
1240
|
}
|
|
694
1241
|
get logger() {
|
|
695
|
-
return (0,
|
|
1242
|
+
return (0, import_core4.getGlobalLogger)();
|
|
696
1243
|
}
|
|
697
1244
|
/**
|
|
698
1245
|
* Creates a new prompt in Langfuse.
|
|
@@ -919,8 +1466,8 @@ var PromptManager = class {
|
|
|
919
1466
|
};
|
|
920
1467
|
|
|
921
1468
|
// src/score/index.ts
|
|
922
|
-
var
|
|
923
|
-
var
|
|
1469
|
+
var import_core5 = require("@langfuse/core");
|
|
1470
|
+
var import_api2 = require("@opentelemetry/api");
|
|
924
1471
|
var MAX_QUEUE_SIZE = 1e5;
|
|
925
1472
|
var MAX_BATCH_SIZE = 100;
|
|
926
1473
|
var ScoreManager = class {
|
|
@@ -935,13 +1482,13 @@ var ScoreManager = class {
|
|
|
935
1482
|
this.flushPromise = null;
|
|
936
1483
|
this.flushTimer = null;
|
|
937
1484
|
this.apiClient = params.apiClient;
|
|
938
|
-
const envFlushAtCount = (0,
|
|
939
|
-
const envFlushIntervalSeconds = (0,
|
|
1485
|
+
const envFlushAtCount = (0, import_core5.getEnv)("LANGFUSE_FLUSH_AT");
|
|
1486
|
+
const envFlushIntervalSeconds = (0, import_core5.getEnv)("LANGFUSE_FLUSH_INTERVAL");
|
|
940
1487
|
this.flushAtCount = envFlushAtCount ? Number(envFlushAtCount) : 10;
|
|
941
1488
|
this.flushIntervalSeconds = envFlushIntervalSeconds ? Number(envFlushIntervalSeconds) : 1;
|
|
942
1489
|
}
|
|
943
1490
|
get logger() {
|
|
944
|
-
return (0,
|
|
1491
|
+
return (0, import_core5.getGlobalLogger)();
|
|
945
1492
|
}
|
|
946
1493
|
/**
|
|
947
1494
|
* Creates a new score event and adds it to the processing queue.
|
|
@@ -966,11 +1513,11 @@ var ScoreManager = class {
|
|
|
966
1513
|
var _a, _b;
|
|
967
1514
|
const scoreData = {
|
|
968
1515
|
...data,
|
|
969
|
-
id: (_a = data.id) != null ? _a : (0,
|
|
970
|
-
environment: (_b = data.environment) != null ? _b : (0,
|
|
1516
|
+
id: (_a = data.id) != null ? _a : (0, import_core5.generateUUID)(),
|
|
1517
|
+
environment: (_b = data.environment) != null ? _b : (0, import_core5.getEnv)("LANGFUSE_TRACING_ENVIRONMENT")
|
|
971
1518
|
};
|
|
972
1519
|
const scoreIngestionEvent = {
|
|
973
|
-
id: (0,
|
|
1520
|
+
id: (0, import_core5.generateUUID)(),
|
|
974
1521
|
type: "score-create",
|
|
975
1522
|
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
976
1523
|
body: scoreData
|
|
@@ -982,10 +1529,14 @@ var ScoreManager = class {
|
|
|
982
1529
|
return;
|
|
983
1530
|
}
|
|
984
1531
|
this.eventQueue.push(scoreIngestionEvent);
|
|
1532
|
+
this.logger.debug(
|
|
1533
|
+
"Added score event to queue:\n",
|
|
1534
|
+
JSON.stringify(scoreIngestionEvent, null, 2)
|
|
1535
|
+
);
|
|
985
1536
|
if (this.eventQueue.length >= this.flushAtCount) {
|
|
986
1537
|
this.flushPromise = this.flush();
|
|
987
1538
|
} else if (!this.flushTimer) {
|
|
988
|
-
this.flushTimer = (0,
|
|
1539
|
+
this.flushTimer = (0, import_core5.safeSetTimeout)(() => {
|
|
989
1540
|
this.flushPromise = this.flush();
|
|
990
1541
|
}, this.flushIntervalSeconds * 1e3);
|
|
991
1542
|
}
|
|
@@ -1068,7 +1619,7 @@ var ScoreManager = class {
|
|
|
1068
1619
|
* ```
|
|
1069
1620
|
*/
|
|
1070
1621
|
activeObservation(data) {
|
|
1071
|
-
const currentOtelSpan =
|
|
1622
|
+
const currentOtelSpan = import_api2.trace.getActiveSpan();
|
|
1072
1623
|
if (!currentOtelSpan) {
|
|
1073
1624
|
this.logger.warn("No active span in context to score.");
|
|
1074
1625
|
return;
|
|
@@ -1104,7 +1655,7 @@ var ScoreManager = class {
|
|
|
1104
1655
|
* ```
|
|
1105
1656
|
*/
|
|
1106
1657
|
activeTrace(data) {
|
|
1107
|
-
const currentOtelSpan =
|
|
1658
|
+
const currentOtelSpan = import_api2.trace.getActiveSpan();
|
|
1108
1659
|
if (!currentOtelSpan) {
|
|
1109
1660
|
this.logger.warn("No active span in context to score trace.");
|
|
1110
1661
|
return;
|
|
@@ -1204,10 +1755,10 @@ var LangfuseClient = class {
|
|
|
1204
1755
|
constructor(params) {
|
|
1205
1756
|
this.projectId = null;
|
|
1206
1757
|
var _a, _b, _c, _d, _e, _f, _g;
|
|
1207
|
-
const logger = (0,
|
|
1208
|
-
const publicKey = (_a = params == null ? void 0 : params.publicKey) != null ? _a : (0,
|
|
1209
|
-
const secretKey = (_b = params == null ? void 0 : params.secretKey) != null ? _b : (0,
|
|
1210
|
-
this.baseUrl = (_e = (_d = (_c = params == null ? void 0 : params.baseUrl) != null ? _c : (0,
|
|
1758
|
+
const logger = (0, import_core6.getGlobalLogger)();
|
|
1759
|
+
const publicKey = (_a = params == null ? void 0 : params.publicKey) != null ? _a : (0, import_core6.getEnv)("LANGFUSE_PUBLIC_KEY");
|
|
1760
|
+
const secretKey = (_b = params == null ? void 0 : params.secretKey) != null ? _b : (0, import_core6.getEnv)("LANGFUSE_SECRET_KEY");
|
|
1761
|
+
this.baseUrl = (_e = (_d = (_c = params == null ? void 0 : params.baseUrl) != null ? _c : (0, import_core6.getEnv)("LANGFUSE_BASE_URL")) != null ? _d : (0, import_core6.getEnv)("LANGFUSE_BASEURL")) != null ? _e : (
|
|
1211
1762
|
// legacy v2
|
|
1212
1763
|
"https://cloud.langfuse.com"
|
|
1213
1764
|
);
|
|
@@ -1221,13 +1772,13 @@ var LangfuseClient = class {
|
|
|
1221
1772
|
"No secret key provided in constructor or as LANGFUSE_SECRET_KEY env var. Client operations will fail."
|
|
1222
1773
|
);
|
|
1223
1774
|
}
|
|
1224
|
-
const timeoutSeconds = (_g = params == null ? void 0 : params.timeout) != null ? _g : Number((_f = (0,
|
|
1225
|
-
this.api = new
|
|
1775
|
+
const timeoutSeconds = (_g = params == null ? void 0 : params.timeout) != null ? _g : Number((_f = (0, import_core6.getEnv)("LANGFUSE_TIMEOUT")) != null ? _f : 5);
|
|
1776
|
+
this.api = new import_core6.LangfuseAPIClient({
|
|
1226
1777
|
baseUrl: this.baseUrl,
|
|
1227
1778
|
username: publicKey,
|
|
1228
1779
|
password: secretKey,
|
|
1229
1780
|
xLangfusePublicKey: publicKey,
|
|
1230
|
-
xLangfuseSdkVersion:
|
|
1781
|
+
xLangfuseSdkVersion: import_core6.LANGFUSE_SDK_VERSION,
|
|
1231
1782
|
xLangfuseSdkName: "javascript",
|
|
1232
1783
|
environment: "",
|
|
1233
1784
|
// noop as baseUrl is set
|
|
@@ -1239,9 +1790,10 @@ var LangfuseClient = class {
|
|
|
1239
1790
|
timeoutSeconds
|
|
1240
1791
|
});
|
|
1241
1792
|
this.prompt = new PromptManager({ apiClient: this.api });
|
|
1242
|
-
this.dataset = new DatasetManager({
|
|
1793
|
+
this.dataset = new DatasetManager({ langfuseClient: this });
|
|
1243
1794
|
this.score = new ScoreManager({ apiClient: this.api });
|
|
1244
1795
|
this.media = new MediaManager({ apiClient: this.api });
|
|
1796
|
+
this.experiment = new ExperimentManager({ langfuseClient: this });
|
|
1245
1797
|
this.getPrompt = this.prompt.get.bind(this.prompt);
|
|
1246
1798
|
this.createPrompt = this.prompt.create.bind(this.prompt);
|
|
1247
1799
|
this.updatePrompt = this.prompt.update.bind(this.prompt);
|
|
@@ -1316,15 +1868,36 @@ var LangfuseClient = class {
|
|
|
1316
1868
|
return traceUrl;
|
|
1317
1869
|
}
|
|
1318
1870
|
};
|
|
1871
|
+
|
|
1872
|
+
// src/experiment/adapters.ts
|
|
1873
|
+
function autoevalsToLangfuseEvaluator(autoevalEvaluator, params) {
|
|
1874
|
+
const langfuseEvaluator = async (langfuseEvaluatorParams) => {
|
|
1875
|
+
var _a;
|
|
1876
|
+
const score = await autoevalEvaluator({
|
|
1877
|
+
...params != null ? params : {},
|
|
1878
|
+
input: langfuseEvaluatorParams.input,
|
|
1879
|
+
output: langfuseEvaluatorParams.output,
|
|
1880
|
+
expected: langfuseEvaluatorParams.expectedOutput
|
|
1881
|
+
});
|
|
1882
|
+
return {
|
|
1883
|
+
name: score.name,
|
|
1884
|
+
value: (_a = score.score) != null ? _a : 0,
|
|
1885
|
+
metadata: score.metadata
|
|
1886
|
+
};
|
|
1887
|
+
};
|
|
1888
|
+
return langfuseEvaluator;
|
|
1889
|
+
}
|
|
1319
1890
|
// Annotate the CommonJS export names for ESM import in node:
|
|
1320
1891
|
0 && (module.exports = {
|
|
1321
1892
|
ChatMessageType,
|
|
1322
1893
|
ChatPromptClient,
|
|
1323
1894
|
DatasetManager,
|
|
1895
|
+
ExperimentManager,
|
|
1324
1896
|
LangfuseClient,
|
|
1325
1897
|
MediaManager,
|
|
1326
1898
|
PromptManager,
|
|
1327
1899
|
ScoreManager,
|
|
1328
|
-
TextPromptClient
|
|
1900
|
+
TextPromptClient,
|
|
1901
|
+
autoevalsToLangfuseEvaluator
|
|
1329
1902
|
});
|
|
1330
1903
|
//# sourceMappingURL=index.cjs.map
|