@langfuse/client 4.0.0 → 4.1.0-alpha.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +626 -53
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +756 -29
- package/dist/index.d.ts +756 -29
- package/dist/index.mjs +610 -39
- package/dist/index.mjs.map +1 -1
- package/package.json +3 -2
package/dist/index.mjs
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
import {
|
|
3
3
|
LangfuseAPIClient as LangfuseAPIClient4,
|
|
4
4
|
LANGFUSE_SDK_VERSION,
|
|
5
|
-
getGlobalLogger as
|
|
5
|
+
getGlobalLogger as getGlobalLogger6,
|
|
6
6
|
getEnv as getEnv2
|
|
7
7
|
} from "@langfuse/core";
|
|
8
8
|
|
|
@@ -15,44 +15,91 @@ var DatasetManager = class {
|
|
|
15
15
|
* @internal
|
|
16
16
|
*/
|
|
17
17
|
constructor(params) {
|
|
18
|
-
this.
|
|
18
|
+
this.langfuseClient = params.langfuseClient;
|
|
19
19
|
}
|
|
20
20
|
/**
|
|
21
|
-
* Retrieves a dataset by name
|
|
21
|
+
* Retrieves a dataset by name with all its items and experiment functionality.
|
|
22
22
|
*
|
|
23
|
-
* This method
|
|
24
|
-
*
|
|
23
|
+
* This method fetches a dataset and all its associated items, with support
|
|
24
|
+
* for automatic pagination to handle large datasets efficiently. The returned
|
|
25
|
+
* dataset object includes enhanced functionality for linking items to traces
|
|
26
|
+
* and running experiments directly on the dataset.
|
|
25
27
|
*
|
|
26
28
|
* @param name - The name of the dataset to retrieve
|
|
27
|
-
* @param options - Optional configuration for fetching
|
|
29
|
+
* @param options - Optional configuration for data fetching
|
|
28
30
|
* @param options.fetchItemsPageSize - Number of items to fetch per page (default: 50)
|
|
31
|
+
* @returns Promise resolving to enhanced dataset with items, linking, and experiment capabilities
|
|
29
32
|
*
|
|
30
|
-
* @
|
|
33
|
+
* @example Basic dataset retrieval
|
|
34
|
+
* ```typescript
|
|
35
|
+
* const dataset = await langfuse.dataset.get("my-evaluation-dataset");
|
|
36
|
+
* console.log(`Dataset ${dataset.name} has ${dataset.items.length} items`);
|
|
31
37
|
*
|
|
32
|
-
*
|
|
38
|
+
* // Access dataset properties
|
|
39
|
+
* console.log(dataset.description);
|
|
40
|
+
* console.log(dataset.metadata);
|
|
41
|
+
* ```
|
|
42
|
+
*
|
|
43
|
+
* @example Working with dataset items
|
|
33
44
|
* ```typescript
|
|
34
|
-
* const dataset = await langfuse.dataset.get("
|
|
45
|
+
* const dataset = await langfuse.dataset.get("qa-dataset");
|
|
35
46
|
*
|
|
36
47
|
* for (const item of dataset.items) {
|
|
37
|
-
*
|
|
38
|
-
*
|
|
39
|
-
*
|
|
40
|
-
* //
|
|
41
|
-
* await item.link(
|
|
42
|
-
* { otelSpan: currentSpan },
|
|
43
|
-
* "experiment-run-1",
|
|
44
|
-
* { description: "Testing new model" }
|
|
45
|
-
* );
|
|
48
|
+
* console.log("Question:", item.input);
|
|
49
|
+
* console.log("Expected Answer:", item.expectedOutput);
|
|
50
|
+
*
|
|
51
|
+
* // Each item has a link function for connecting to traces
|
|
52
|
+
* // await item.link(span, "experiment-name");
|
|
46
53
|
* }
|
|
47
54
|
* ```
|
|
55
|
+
*
|
|
56
|
+
* @example Running experiments on datasets
|
|
57
|
+
* ```typescript
|
|
58
|
+
* const dataset = await langfuse.dataset.get("benchmark-dataset");
|
|
59
|
+
*
|
|
60
|
+
* const result = await dataset.runExperiment({
|
|
61
|
+
* name: "GPT-4 Benchmark",
|
|
62
|
+
* description: "Evaluating GPT-4 on our benchmark tasks",
|
|
63
|
+
* task: async ({ input }) => {
|
|
64
|
+
* const response = await openai.chat.completions.create({
|
|
65
|
+
* model: "gpt-4",
|
|
66
|
+
* messages: [{ role: "user", content: input }]
|
|
67
|
+
* });
|
|
68
|
+
* return response.choices[0].message.content;
|
|
69
|
+
* },
|
|
70
|
+
* evaluators: [
|
|
71
|
+
* async ({ output, expectedOutput }) => ({
|
|
72
|
+
* name: "exact_match",
|
|
73
|
+
* value: output === expectedOutput ? 1 : 0
|
|
74
|
+
* })
|
|
75
|
+
* ]
|
|
76
|
+
* });
|
|
77
|
+
*
|
|
78
|
+
* console.log(await result.prettyPrint());
|
|
79
|
+
* ```
|
|
80
|
+
*
|
|
81
|
+
* @example Handling large datasets
|
|
82
|
+
* ```typescript
|
|
83
|
+
* // For very large datasets, use smaller page sizes
|
|
84
|
+
* const largeDataset = await langfuse.dataset.get(
|
|
85
|
+
* "large-dataset",
|
|
86
|
+
* { fetchItemsPageSize: 100 }
|
|
87
|
+
* );
|
|
88
|
+
* ```
|
|
89
|
+
*
|
|
90
|
+
* @throws {Error} If the dataset does not exist or cannot be accessed
|
|
91
|
+
* @see {@link FetchedDataset} for the complete return type specification
|
|
92
|
+
* @see {@link RunExperimentOnDataset} for experiment execution details
|
|
93
|
+
* @public
|
|
94
|
+
* @since 4.0.0
|
|
48
95
|
*/
|
|
49
96
|
async get(name, options) {
|
|
50
97
|
var _a;
|
|
51
|
-
const dataset = await this.
|
|
98
|
+
const dataset = await this.langfuseClient.api.datasets.get(name);
|
|
52
99
|
const items = [];
|
|
53
100
|
let page = 1;
|
|
54
101
|
while (true) {
|
|
55
|
-
const itemsResponse = await this.
|
|
102
|
+
const itemsResponse = await this.langfuseClient.api.datasetItems.list({
|
|
56
103
|
datasetName: name,
|
|
57
104
|
limit: (_a = options == null ? void 0 : options.fetchItemsPageSize) != null ? _a : 50,
|
|
58
105
|
page
|
|
@@ -63,12 +110,20 @@ var DatasetManager = class {
|
|
|
63
110
|
}
|
|
64
111
|
page++;
|
|
65
112
|
}
|
|
113
|
+
const itemsWithLinkMethod = items.map((item) => ({
|
|
114
|
+
...item,
|
|
115
|
+
link: this.createDatasetItemLinkFunction(item)
|
|
116
|
+
}));
|
|
117
|
+
const runExperiment = (params) => {
|
|
118
|
+
return this.langfuseClient.experiment.run({
|
|
119
|
+
data: items,
|
|
120
|
+
...params
|
|
121
|
+
});
|
|
122
|
+
};
|
|
66
123
|
const returnDataset = {
|
|
67
124
|
...dataset,
|
|
68
|
-
items:
|
|
69
|
-
|
|
70
|
-
link: this.createDatasetItemLinkFunction(item)
|
|
71
|
-
}))
|
|
125
|
+
items: itemsWithLinkMethod,
|
|
126
|
+
runExperiment
|
|
72
127
|
};
|
|
73
128
|
return returnDataset;
|
|
74
129
|
}
|
|
@@ -81,7 +136,7 @@ var DatasetManager = class {
|
|
|
81
136
|
*/
|
|
82
137
|
createDatasetItemLinkFunction(item) {
|
|
83
138
|
const linkFunction = async (obj, runName, runArgs) => {
|
|
84
|
-
return await this.
|
|
139
|
+
return await this.langfuseClient.api.datasetRunItems.create({
|
|
85
140
|
runName,
|
|
86
141
|
datasetItemId: item.id,
|
|
87
142
|
traceId: obj.otelSpan.spanContext().traceId,
|
|
@@ -93,9 +148,499 @@ var DatasetManager = class {
|
|
|
93
148
|
}
|
|
94
149
|
};
|
|
95
150
|
|
|
151
|
+
// src/experiment/ExperimentManager.ts
|
|
152
|
+
import { getGlobalLogger } from "@langfuse/core";
|
|
153
|
+
import { startActiveObservation } from "@langfuse/tracing";
|
|
154
|
+
import { ProxyTracerProvider, trace } from "@opentelemetry/api";
|
|
155
|
+
var ExperimentManager = class {
|
|
156
|
+
/**
|
|
157
|
+
* Creates a new ExperimentManager instance.
|
|
158
|
+
*
|
|
159
|
+
* @param params - Configuration object
|
|
160
|
+
* @param params.langfuseClient - The Langfuse client instance for API communication
|
|
161
|
+
* @internal
|
|
162
|
+
*/
|
|
163
|
+
constructor(params) {
|
|
164
|
+
this.langfuseClient = params.langfuseClient;
|
|
165
|
+
}
|
|
166
|
+
/**
|
|
167
|
+
* Gets the global logger instance for experiment-related logging.
|
|
168
|
+
*
|
|
169
|
+
* @returns The global logger instance
|
|
170
|
+
* @internal
|
|
171
|
+
*/
|
|
172
|
+
get logger() {
|
|
173
|
+
return getGlobalLogger();
|
|
174
|
+
}
|
|
175
|
+
/**
|
|
176
|
+
* Executes an experiment by running a task on each data item and evaluating the results.
|
|
177
|
+
*
|
|
178
|
+
* This method orchestrates the complete experiment lifecycle:
|
|
179
|
+
* 1. Executes the task function on each data item with proper tracing
|
|
180
|
+
* 2. Runs item-level evaluators on each task output
|
|
181
|
+
* 3. Executes run-level evaluators on the complete result set
|
|
182
|
+
* 4. Links results to dataset runs (for Langfuse datasets)
|
|
183
|
+
* 5. Stores all scores and traces in Langfuse
|
|
184
|
+
*
|
|
185
|
+
* @param config - The experiment configuration
|
|
186
|
+
* @param config.name - Human-readable name for the experiment
|
|
187
|
+
* @param config.description - Optional description of the experiment's purpose
|
|
188
|
+
* @param config.metadata - Optional metadata to attach to the experiment run
|
|
189
|
+
* @param config.data - Array of data items to process (ExperimentItem[] or DatasetItem[])
|
|
190
|
+
* @param config.task - Function that processes each data item and returns output
|
|
191
|
+
* @param config.evaluators - Optional array of functions to evaluate each item's output
|
|
192
|
+
* @param config.runEvaluators - Optional array of functions to evaluate the entire run
|
|
193
|
+
* @param config.maxConcurrency - Maximum number of concurrent task executions (default: Infinity)
|
|
194
|
+
*
|
|
195
|
+
* @returns Promise that resolves to experiment results including:
|
|
196
|
+
* - itemResults: Results for each processed data item
|
|
197
|
+
* - runEvaluations: Results from run-level evaluators
|
|
198
|
+
* - datasetRunId: ID of the dataset run (if using Langfuse datasets)
|
|
199
|
+
* - prettyPrint: Function to format and display results
|
|
200
|
+
*
|
|
201
|
+
* @throws {Error} When task execution fails and cannot be handled gracefully
|
|
202
|
+
* @throws {Error} When required evaluators fail critically
|
|
203
|
+
*
|
|
204
|
+
* @example Simple experiment
|
|
205
|
+
* ```typescript
|
|
206
|
+
* const result = await langfuse.experiment.run({
|
|
207
|
+
* name: "Translation Quality Test",
|
|
208
|
+
* data: [
|
|
209
|
+
* { input: "Hello world", expectedOutput: "Hola mundo" },
|
|
210
|
+
* { input: "Good morning", expectedOutput: "Buenos días" }
|
|
211
|
+
* ],
|
|
212
|
+
* task: async ({ input }) => translateText(input, 'es'),
|
|
213
|
+
* evaluators: [
|
|
214
|
+
* async ({ output, expectedOutput }) => ({
|
|
215
|
+
* name: "bleu_score",
|
|
216
|
+
* value: calculateBleuScore(output, expectedOutput)
|
|
217
|
+
* })
|
|
218
|
+
* ]
|
|
219
|
+
* });
|
|
220
|
+
* ```
|
|
221
|
+
*
|
|
222
|
+
* @example Experiment with concurrency control
|
|
223
|
+
* ```typescript
|
|
224
|
+
* const result = await langfuse.experiment.run({
|
|
225
|
+
* name: "Large Scale Evaluation",
|
|
226
|
+
* data: largeBatchOfItems,
|
|
227
|
+
* task: expensiveModelCall,
|
|
228
|
+
* maxConcurrency: 5, // Process max 5 items simultaneously
|
|
229
|
+
* evaluators: [myEvaluator],
|
|
230
|
+
* runEvaluators: [
|
|
231
|
+
* async ({ itemResults }) => ({
|
|
232
|
+
* name: "average_score",
|
|
233
|
+
* value: itemResults.reduce((acc, r) => acc + r.evaluations[0].value, 0) / itemResults.length
|
|
234
|
+
* })
|
|
235
|
+
* ]
|
|
236
|
+
* });
|
|
237
|
+
* ```
|
|
238
|
+
*
|
|
239
|
+
* @see {@link ExperimentParams} for detailed parameter documentation
|
|
240
|
+
* @see {@link ExperimentResult} for detailed return value documentation
|
|
241
|
+
* @see {@link Evaluator} for evaluator function specifications
|
|
242
|
+
* @see {@link RunEvaluator} for run evaluator function specifications
|
|
243
|
+
*
|
|
244
|
+
* @public
|
|
245
|
+
*/
|
|
246
|
+
async run(config) {
|
|
247
|
+
const {
|
|
248
|
+
data,
|
|
249
|
+
evaluators,
|
|
250
|
+
task,
|
|
251
|
+
name,
|
|
252
|
+
description,
|
|
253
|
+
metadata,
|
|
254
|
+
maxConcurrency: batchSize = Infinity,
|
|
255
|
+
runEvaluators
|
|
256
|
+
} = config;
|
|
257
|
+
if (!this.isOtelRegistered()) {
|
|
258
|
+
this.logger.warn(
|
|
259
|
+
"OpenTelemetry has not been set up. Traces will not be sent to Langfuse.See our docs on how to set up OpenTelemetry: https://langfuse.com/docs/observability/sdk/typescript/setup#tracing-setup"
|
|
260
|
+
);
|
|
261
|
+
}
|
|
262
|
+
const itemResults = [];
|
|
263
|
+
for (let i = 0; i < data.length; i += batchSize) {
|
|
264
|
+
const batch = data.slice(i, i + batchSize);
|
|
265
|
+
const promises = batch.map(
|
|
266
|
+
async (item) => {
|
|
267
|
+
return this.runItem({
|
|
268
|
+
item,
|
|
269
|
+
evaluators,
|
|
270
|
+
task,
|
|
271
|
+
experimentName: name,
|
|
272
|
+
experimentDescription: description,
|
|
273
|
+
experimentMetadata: metadata
|
|
274
|
+
});
|
|
275
|
+
}
|
|
276
|
+
);
|
|
277
|
+
const results = await Promise.all(promises);
|
|
278
|
+
itemResults.push(...results);
|
|
279
|
+
}
|
|
280
|
+
const datasetRunId = itemResults.length > 0 ? itemResults[0].datasetRunId : void 0;
|
|
281
|
+
let datasetRunUrl = void 0;
|
|
282
|
+
if (datasetRunId && data.length > 0 && "datasetId" in data[0]) {
|
|
283
|
+
const datasetId = data[0].datasetId;
|
|
284
|
+
const projectUrl = (await this.langfuseClient.getTraceUrl("mock")).split(
|
|
285
|
+
"/traces"
|
|
286
|
+
)[0];
|
|
287
|
+
datasetRunUrl = `${projectUrl}/datasets/${datasetId}/runs/${datasetRunId}`;
|
|
288
|
+
}
|
|
289
|
+
let runEvaluations = [];
|
|
290
|
+
if (runEvaluators && (runEvaluators == null ? void 0 : runEvaluators.length) > 0) {
|
|
291
|
+
const promises = runEvaluators.map(async (runEvaluator) => {
|
|
292
|
+
return runEvaluator({ itemResults }).then((result) => {
|
|
293
|
+
return Array.isArray(result) ? result : [result];
|
|
294
|
+
}).catch((err) => {
|
|
295
|
+
this.logger.error("Run evaluator failed with error ", err);
|
|
296
|
+
throw err;
|
|
297
|
+
});
|
|
298
|
+
});
|
|
299
|
+
runEvaluations = (await Promise.allSettled(promises)).reduce(
|
|
300
|
+
(acc, settledPromise) => {
|
|
301
|
+
if (settledPromise.status === "fulfilled") {
|
|
302
|
+
acc.push(...settledPromise.value);
|
|
303
|
+
}
|
|
304
|
+
return acc;
|
|
305
|
+
},
|
|
306
|
+
[]
|
|
307
|
+
);
|
|
308
|
+
if (datasetRunId) {
|
|
309
|
+
runEvaluations.forEach(
|
|
310
|
+
(runEval) => this.langfuseClient.score.create({ datasetRunId, ...runEval })
|
|
311
|
+
);
|
|
312
|
+
}
|
|
313
|
+
}
|
|
314
|
+
await this.langfuseClient.score.flush();
|
|
315
|
+
return {
|
|
316
|
+
itemResults,
|
|
317
|
+
datasetRunId,
|
|
318
|
+
runEvaluations,
|
|
319
|
+
prettyPrint: async (options) => {
|
|
320
|
+
var _a;
|
|
321
|
+
return await this.prettyPrintResults({
|
|
322
|
+
datasetRunUrl,
|
|
323
|
+
itemResults,
|
|
324
|
+
originalData: data,
|
|
325
|
+
runEvaluations,
|
|
326
|
+
name: config.name,
|
|
327
|
+
description: config.description,
|
|
328
|
+
includeItemResults: (_a = options == null ? void 0 : options.includeItemResults) != null ? _a : true
|
|
329
|
+
});
|
|
330
|
+
}
|
|
331
|
+
};
|
|
332
|
+
}
|
|
333
|
+
/**
|
|
334
|
+
* Executes the task and evaluators for a single data item.
|
|
335
|
+
*
|
|
336
|
+
* This method handles the complete processing pipeline for one data item:
|
|
337
|
+
* 1. Executes the task within a traced observation span
|
|
338
|
+
* 2. Links the result to a dataset run (if applicable)
|
|
339
|
+
* 3. Runs all item-level evaluators on the output
|
|
340
|
+
* 4. Stores evaluation scores in Langfuse
|
|
341
|
+
* 5. Handles errors gracefully by continuing with remaining evaluators
|
|
342
|
+
*
|
|
343
|
+
* @param params - Parameters for item execution
|
|
344
|
+
* @param params.experimentName - Name of the parent experiment
|
|
345
|
+
* @param params.experimentDescription - Description of the parent experiment
|
|
346
|
+
* @param params.experimentMetadata - Metadata for the parent experiment
|
|
347
|
+
* @param params.item - The data item to process
|
|
348
|
+
* @param params.task - The task function to execute
|
|
349
|
+
* @param params.evaluators - Optional evaluators to run on the output
|
|
350
|
+
*
|
|
351
|
+
* @returns Promise resolving to the item result with output, evaluations, and trace info
|
|
352
|
+
*
|
|
353
|
+
* @throws {Error} When task execution fails (propagated from task function)
|
|
354
|
+
*
|
|
355
|
+
* @internal
|
|
356
|
+
*/
|
|
357
|
+
async runItem(params) {
|
|
358
|
+
const { item, evaluators = [], task } = params;
|
|
359
|
+
const { output, traceId } = await startActiveObservation(
|
|
360
|
+
"experiment-item-run",
|
|
361
|
+
async (span) => {
|
|
362
|
+
const output2 = await task(item);
|
|
363
|
+
span.update({
|
|
364
|
+
input: item.input,
|
|
365
|
+
output: output2
|
|
366
|
+
});
|
|
367
|
+
return { output: output2, traceId: span.traceId };
|
|
368
|
+
}
|
|
369
|
+
);
|
|
370
|
+
let datasetRunId = void 0;
|
|
371
|
+
if ("id" in item) {
|
|
372
|
+
await this.langfuseClient.api.datasetRunItems.create({
|
|
373
|
+
runName: params.experimentName,
|
|
374
|
+
runDescription: params.experimentDescription,
|
|
375
|
+
metadata: params.experimentMetadata,
|
|
376
|
+
datasetItemId: item.id,
|
|
377
|
+
traceId
|
|
378
|
+
}).then((result) => {
|
|
379
|
+
datasetRunId = result.datasetRunId;
|
|
380
|
+
}).catch(
|
|
381
|
+
(err) => this.logger.error("Linking dataset run item failed", err)
|
|
382
|
+
);
|
|
383
|
+
}
|
|
384
|
+
const evalPromises = evaluators.map(
|
|
385
|
+
async (evaluator) => {
|
|
386
|
+
const params2 = {
|
|
387
|
+
input: item.input,
|
|
388
|
+
expectedOutput: item.expectedOutput,
|
|
389
|
+
output
|
|
390
|
+
};
|
|
391
|
+
return evaluator(params2).then((result) => {
|
|
392
|
+
return Array.isArray(result) ? result : [result];
|
|
393
|
+
}).catch((err) => {
|
|
394
|
+
this.logger.error(
|
|
395
|
+
`Evaluator '${evaluator.name}' failed for params
|
|
396
|
+
|
|
397
|
+
${JSON.stringify(params2)}
|
|
398
|
+
|
|
399
|
+
with error: ${err}`
|
|
400
|
+
);
|
|
401
|
+
throw err;
|
|
402
|
+
});
|
|
403
|
+
}
|
|
404
|
+
);
|
|
405
|
+
const evals = (await Promise.allSettled(evalPromises)).reduce(
|
|
406
|
+
(acc, promiseResult) => {
|
|
407
|
+
if (promiseResult.status === "fulfilled") {
|
|
408
|
+
acc.push(...promiseResult.value.flat());
|
|
409
|
+
}
|
|
410
|
+
return acc;
|
|
411
|
+
},
|
|
412
|
+
[]
|
|
413
|
+
);
|
|
414
|
+
for (const ev of evals) {
|
|
415
|
+
this.langfuseClient.score.create({
|
|
416
|
+
traceId,
|
|
417
|
+
name: ev.name,
|
|
418
|
+
comment: ev.comment,
|
|
419
|
+
value: ev.value,
|
|
420
|
+
metadata: ev.metadata,
|
|
421
|
+
dataType: ev.dataType
|
|
422
|
+
});
|
|
423
|
+
}
|
|
424
|
+
return {
|
|
425
|
+
output,
|
|
426
|
+
evaluations: evals,
|
|
427
|
+
traceId,
|
|
428
|
+
datasetRunId
|
|
429
|
+
};
|
|
430
|
+
}
|
|
431
|
+
/**
|
|
432
|
+
* Formats experiment results into a human-readable string representation.
|
|
433
|
+
*
|
|
434
|
+
* Creates a comprehensive, nicely formatted summary of the experiment including:
|
|
435
|
+
* - Individual item results with inputs, outputs, expected values, and scores
|
|
436
|
+
* - Dataset item and trace links (when available)
|
|
437
|
+
* - Experiment overview with aggregate statistics
|
|
438
|
+
* - Average scores across all evaluations
|
|
439
|
+
* - Run-level evaluation results
|
|
440
|
+
* - Links to dataset runs in the Langfuse UI
|
|
441
|
+
*
|
|
442
|
+
* @param params - Formatting parameters
|
|
443
|
+
* @param params.datasetRunUrl - Optional URL to the dataset run in Langfuse UI
|
|
444
|
+
* @param params.itemResults - Results from processing each data item
|
|
445
|
+
* @param params.originalData - The original input data items
|
|
446
|
+
* @param params.runEvaluations - Results from run-level evaluators
|
|
447
|
+
* @param params.name - Name of the experiment
|
|
448
|
+
* @param params.description - Optional description of the experiment
|
|
449
|
+
* @param params.includeItemResults - Whether to include individual item details (default: true)
|
|
450
|
+
*
|
|
451
|
+
* @returns Promise resolving to formatted string representation
|
|
452
|
+
*
|
|
453
|
+
* @example Output format
|
|
454
|
+
* ```
|
|
455
|
+
* 1. Item 1:
|
|
456
|
+
* Input: What is the capital of France?
|
|
457
|
+
* Expected: Paris
|
|
458
|
+
* Actual: Paris
|
|
459
|
+
* Scores:
|
|
460
|
+
* • exact_match: 1.000
|
|
461
|
+
* • similarity: 0.95
|
|
462
|
+
* 💭 Very close match with expected output
|
|
463
|
+
*
|
|
464
|
+
* Dataset Item:
|
|
465
|
+
* https://cloud.langfuse.com/project/123/datasets/456/items/789
|
|
466
|
+
*
|
|
467
|
+
* Trace:
|
|
468
|
+
* https://cloud.langfuse.com/project/123/traces/abc123
|
|
469
|
+
*
|
|
470
|
+
* ──────────────────────────────────────────────────
|
|
471
|
+
* 📊 Translation Quality Test - Testing model accuracy
|
|
472
|
+
* 2 items
|
|
473
|
+
* Evaluations:
|
|
474
|
+
* • exact_match
|
|
475
|
+
* • similarity
|
|
476
|
+
*
|
|
477
|
+
* Average Scores:
|
|
478
|
+
* • exact_match: 0.850
|
|
479
|
+
* • similarity: 0.923
|
|
480
|
+
*
|
|
481
|
+
* Run Evaluations:
|
|
482
|
+
* • overall_quality: 0.887
|
|
483
|
+
* 💭 Good performance with room for improvement
|
|
484
|
+
*
|
|
485
|
+
* 🔗 Dataset Run:
|
|
486
|
+
* https://cloud.langfuse.com/project/123/datasets/456/runs/def456
|
|
487
|
+
* ```
|
|
488
|
+
*
|
|
489
|
+
* @internal
|
|
490
|
+
*/
|
|
491
|
+
async prettyPrintResults(params) {
|
|
492
|
+
var _a, _b;
|
|
493
|
+
const {
|
|
494
|
+
itemResults,
|
|
495
|
+
originalData,
|
|
496
|
+
runEvaluations,
|
|
497
|
+
name,
|
|
498
|
+
description,
|
|
499
|
+
includeItemResults = true
|
|
500
|
+
} = params;
|
|
501
|
+
if (itemResults.length === 0) {
|
|
502
|
+
return "No experiment results to display.";
|
|
503
|
+
}
|
|
504
|
+
let output = "";
|
|
505
|
+
if (includeItemResults) {
|
|
506
|
+
for (let index = 0; index < itemResults.length; index++) {
|
|
507
|
+
const result = itemResults[index];
|
|
508
|
+
const originalItem = originalData[index];
|
|
509
|
+
output += `
|
|
510
|
+
${index + 1}. Item ${index + 1}:
|
|
511
|
+
`;
|
|
512
|
+
if ((originalItem == null ? void 0 : originalItem.input) !== void 0) {
|
|
513
|
+
output += ` Input: ${this.formatValue(originalItem.input)}
|
|
514
|
+
`;
|
|
515
|
+
}
|
|
516
|
+
const expectedOutput = (_b = (_a = originalItem == null ? void 0 : originalItem.expectedOutput) != null ? _a : result.expectedOutput) != null ? _b : null;
|
|
517
|
+
output += ` Expected: ${expectedOutput !== null ? this.formatValue(expectedOutput) : "null"}
|
|
518
|
+
`;
|
|
519
|
+
output += ` Actual: ${this.formatValue(result.output)}
|
|
520
|
+
`;
|
|
521
|
+
if (result.evaluations.length > 0) {
|
|
522
|
+
output += ` Scores:
|
|
523
|
+
`;
|
|
524
|
+
result.evaluations.forEach((evaluation) => {
|
|
525
|
+
const score = typeof evaluation.value === "number" ? evaluation.value.toFixed(3) : evaluation.value;
|
|
526
|
+
output += ` \u2022 ${evaluation.name}: ${score}`;
|
|
527
|
+
if (evaluation.comment) {
|
|
528
|
+
output += `
|
|
529
|
+
\u{1F4AD} ${evaluation.comment}`;
|
|
530
|
+
}
|
|
531
|
+
output += "\n";
|
|
532
|
+
});
|
|
533
|
+
}
|
|
534
|
+
if (originalItem && "id" in originalItem && "datasetId" in originalItem) {
|
|
535
|
+
const projectUrl = (await this.langfuseClient.getTraceUrl("mock")).split("/traces")[0];
|
|
536
|
+
const datasetItemUrl = `${projectUrl}/datasets/${originalItem.datasetId}/items/${originalItem.id}`;
|
|
537
|
+
output += `
|
|
538
|
+
Dataset Item:
|
|
539
|
+
${datasetItemUrl}
|
|
540
|
+
`;
|
|
541
|
+
}
|
|
542
|
+
if (result.traceId) {
|
|
543
|
+
const traceUrl = await this.langfuseClient.getTraceUrl(
|
|
544
|
+
result.traceId
|
|
545
|
+
);
|
|
546
|
+
output += `
|
|
547
|
+
Trace:
|
|
548
|
+
${traceUrl}
|
|
549
|
+
`;
|
|
550
|
+
}
|
|
551
|
+
}
|
|
552
|
+
} else {
|
|
553
|
+
output += `Individual Results: Hidden (${itemResults.length} items)
|
|
554
|
+
`;
|
|
555
|
+
output += "\u{1F4A1} Call prettyPrint({ includeItemResults: true }) to view them\n";
|
|
556
|
+
}
|
|
557
|
+
const totalItems = itemResults.length;
|
|
558
|
+
const evaluationNames = new Set(
|
|
559
|
+
itemResults.flatMap((r) => r.evaluations.map((e) => e.name))
|
|
560
|
+
);
|
|
561
|
+
output += `
|
|
562
|
+
${"\u2500".repeat(50)}
|
|
563
|
+
`;
|
|
564
|
+
output += `\u{1F4CA} ${name}`;
|
|
565
|
+
if (description) {
|
|
566
|
+
output += ` - ${description}`;
|
|
567
|
+
}
|
|
568
|
+
output += `
|
|
569
|
+
${totalItems} items`;
|
|
570
|
+
if (evaluationNames.size > 0) {
|
|
571
|
+
output += `
|
|
572
|
+
Evaluations:`;
|
|
573
|
+
Array.from(evaluationNames).forEach((evalName) => {
|
|
574
|
+
output += `
|
|
575
|
+
\u2022 ${evalName}`;
|
|
576
|
+
});
|
|
577
|
+
output += "\n";
|
|
578
|
+
}
|
|
579
|
+
if (evaluationNames.size > 0) {
|
|
580
|
+
output += `
|
|
581
|
+
Average Scores:`;
|
|
582
|
+
for (const evalName of evaluationNames) {
|
|
583
|
+
const scores = itemResults.flatMap((r) => r.evaluations).filter((e) => e.name === evalName && typeof e.value === "number").map((e) => e.value);
|
|
584
|
+
if (scores.length > 0) {
|
|
585
|
+
const avg = scores.reduce((a, b) => a + b, 0) / scores.length;
|
|
586
|
+
output += `
|
|
587
|
+
\u2022 ${evalName}: ${avg.toFixed(3)}`;
|
|
588
|
+
}
|
|
589
|
+
}
|
|
590
|
+
output += "\n";
|
|
591
|
+
}
|
|
592
|
+
if (runEvaluations.length > 0) {
|
|
593
|
+
output += `
|
|
594
|
+
Run Evaluations:`;
|
|
595
|
+
runEvaluations.forEach((runEval) => {
|
|
596
|
+
const score = typeof runEval.value === "number" ? runEval.value.toFixed(3) : runEval.value;
|
|
597
|
+
output += `
|
|
598
|
+
\u2022 ${runEval.name}: ${score}`;
|
|
599
|
+
if (runEval.comment) {
|
|
600
|
+
output += `
|
|
601
|
+
\u{1F4AD} ${runEval.comment}`;
|
|
602
|
+
}
|
|
603
|
+
});
|
|
604
|
+
output += "\n";
|
|
605
|
+
}
|
|
606
|
+
if (params.datasetRunUrl) {
|
|
607
|
+
output += `
|
|
608
|
+
\u{1F517} Dataset Run:
|
|
609
|
+
${params.datasetRunUrl}`;
|
|
610
|
+
}
|
|
611
|
+
return output;
|
|
612
|
+
}
|
|
613
|
+
/**
|
|
614
|
+
* Formats a value for display in pretty-printed output.
|
|
615
|
+
*
|
|
616
|
+
* Handles different value types appropriately:
|
|
617
|
+
* - Strings: Truncates long strings to 50 characters with "..."
|
|
618
|
+
* - Objects/Arrays: Converts to JSON string representation
|
|
619
|
+
* - Primitives: Uses toString() representation
|
|
620
|
+
*
|
|
621
|
+
* @param value - The value to format
|
|
622
|
+
* @returns Formatted string representation suitable for display
|
|
623
|
+
*
|
|
624
|
+
* @internal
|
|
625
|
+
*/
|
|
626
|
+
formatValue(value) {
|
|
627
|
+
if (typeof value === "string") {
|
|
628
|
+
return value.length > 50 ? `${value.substring(0, 47)}...` : value;
|
|
629
|
+
}
|
|
630
|
+
return JSON.stringify(value);
|
|
631
|
+
}
|
|
632
|
+
isOtelRegistered() {
|
|
633
|
+
let tracerProvider = trace.getTracerProvider();
|
|
634
|
+
if (tracerProvider instanceof ProxyTracerProvider) {
|
|
635
|
+
tracerProvider = tracerProvider.getDelegate();
|
|
636
|
+
}
|
|
637
|
+
return tracerProvider.constructor.name !== "NoopTracerProvider";
|
|
638
|
+
}
|
|
639
|
+
};
|
|
640
|
+
|
|
96
641
|
// src/media/index.ts
|
|
97
642
|
import {
|
|
98
|
-
getGlobalLogger,
|
|
643
|
+
getGlobalLogger as getGlobalLogger2,
|
|
99
644
|
bytesToBase64
|
|
100
645
|
} from "@langfuse/core";
|
|
101
646
|
var MediaManager = class _MediaManager {
|
|
@@ -182,7 +727,7 @@ var MediaManager = class _MediaManager {
|
|
|
182
727
|
base64DataUri
|
|
183
728
|
);
|
|
184
729
|
} catch (error) {
|
|
185
|
-
|
|
730
|
+
getGlobalLogger2().warn(
|
|
186
731
|
"Error fetching media content for reference string",
|
|
187
732
|
referenceString,
|
|
188
733
|
error
|
|
@@ -259,11 +804,11 @@ var MediaManager = class _MediaManager {
|
|
|
259
804
|
|
|
260
805
|
// src/prompt/promptManager.ts
|
|
261
806
|
import {
|
|
262
|
-
getGlobalLogger as
|
|
807
|
+
getGlobalLogger as getGlobalLogger4
|
|
263
808
|
} from "@langfuse/core";
|
|
264
809
|
|
|
265
810
|
// src/prompt/promptCache.ts
|
|
266
|
-
import { getGlobalLogger as
|
|
811
|
+
import { getGlobalLogger as getGlobalLogger3 } from "@langfuse/core";
|
|
267
812
|
var DEFAULT_PROMPT_CACHE_TTL_SECONDS = 60;
|
|
268
813
|
var LangfusePromptCacheItem = class {
|
|
269
814
|
constructor(value, ttlSeconds) {
|
|
@@ -315,7 +860,7 @@ var LangfusePromptCache = class {
|
|
|
315
860
|
return this._refreshingKeys.has(key);
|
|
316
861
|
}
|
|
317
862
|
invalidate(promptName) {
|
|
318
|
-
|
|
863
|
+
getGlobalLogger3().debug(
|
|
319
864
|
"Invalidating cache keys for",
|
|
320
865
|
promptName,
|
|
321
866
|
this._cache.keys()
|
|
@@ -659,7 +1204,7 @@ var PromptManager = class {
|
|
|
659
1204
|
this.cache = new LangfusePromptCache();
|
|
660
1205
|
}
|
|
661
1206
|
get logger() {
|
|
662
|
-
return
|
|
1207
|
+
return getGlobalLogger4();
|
|
663
1208
|
}
|
|
664
1209
|
/**
|
|
665
1210
|
* Creates a new prompt in Langfuse.
|
|
@@ -889,10 +1434,10 @@ var PromptManager = class {
|
|
|
889
1434
|
import {
|
|
890
1435
|
getEnv,
|
|
891
1436
|
generateUUID,
|
|
892
|
-
getGlobalLogger as
|
|
1437
|
+
getGlobalLogger as getGlobalLogger5,
|
|
893
1438
|
safeSetTimeout
|
|
894
1439
|
} from "@langfuse/core";
|
|
895
|
-
import { trace } from "@opentelemetry/api";
|
|
1440
|
+
import { trace as trace2 } from "@opentelemetry/api";
|
|
896
1441
|
var MAX_QUEUE_SIZE = 1e5;
|
|
897
1442
|
var MAX_BATCH_SIZE = 100;
|
|
898
1443
|
var ScoreManager = class {
|
|
@@ -913,7 +1458,7 @@ var ScoreManager = class {
|
|
|
913
1458
|
this.flushIntervalSeconds = envFlushIntervalSeconds ? Number(envFlushIntervalSeconds) : 1;
|
|
914
1459
|
}
|
|
915
1460
|
get logger() {
|
|
916
|
-
return
|
|
1461
|
+
return getGlobalLogger5();
|
|
917
1462
|
}
|
|
918
1463
|
/**
|
|
919
1464
|
* Creates a new score event and adds it to the processing queue.
|
|
@@ -954,6 +1499,10 @@ var ScoreManager = class {
|
|
|
954
1499
|
return;
|
|
955
1500
|
}
|
|
956
1501
|
this.eventQueue.push(scoreIngestionEvent);
|
|
1502
|
+
this.logger.debug(
|
|
1503
|
+
"Added score event to queue:\n",
|
|
1504
|
+
JSON.stringify(scoreIngestionEvent, null, 2)
|
|
1505
|
+
);
|
|
957
1506
|
if (this.eventQueue.length >= this.flushAtCount) {
|
|
958
1507
|
this.flushPromise = this.flush();
|
|
959
1508
|
} else if (!this.flushTimer) {
|
|
@@ -1040,7 +1589,7 @@ var ScoreManager = class {
|
|
|
1040
1589
|
* ```
|
|
1041
1590
|
*/
|
|
1042
1591
|
activeObservation(data) {
|
|
1043
|
-
const currentOtelSpan =
|
|
1592
|
+
const currentOtelSpan = trace2.getActiveSpan();
|
|
1044
1593
|
if (!currentOtelSpan) {
|
|
1045
1594
|
this.logger.warn("No active span in context to score.");
|
|
1046
1595
|
return;
|
|
@@ -1076,7 +1625,7 @@ var ScoreManager = class {
|
|
|
1076
1625
|
* ```
|
|
1077
1626
|
*/
|
|
1078
1627
|
activeTrace(data) {
|
|
1079
|
-
const currentOtelSpan =
|
|
1628
|
+
const currentOtelSpan = trace2.getActiveSpan();
|
|
1080
1629
|
if (!currentOtelSpan) {
|
|
1081
1630
|
this.logger.warn("No active span in context to score trace.");
|
|
1082
1631
|
return;
|
|
@@ -1176,7 +1725,7 @@ var LangfuseClient = class {
|
|
|
1176
1725
|
constructor(params) {
|
|
1177
1726
|
this.projectId = null;
|
|
1178
1727
|
var _a, _b, _c, _d, _e, _f, _g;
|
|
1179
|
-
const logger =
|
|
1728
|
+
const logger = getGlobalLogger6();
|
|
1180
1729
|
const publicKey = (_a = params == null ? void 0 : params.publicKey) != null ? _a : getEnv2("LANGFUSE_PUBLIC_KEY");
|
|
1181
1730
|
const secretKey = (_b = params == null ? void 0 : params.secretKey) != null ? _b : getEnv2("LANGFUSE_SECRET_KEY");
|
|
1182
1731
|
this.baseUrl = (_e = (_d = (_c = params == null ? void 0 : params.baseUrl) != null ? _c : getEnv2("LANGFUSE_BASE_URL")) != null ? _d : getEnv2("LANGFUSE_BASEURL")) != null ? _e : (
|
|
@@ -1211,9 +1760,10 @@ var LangfuseClient = class {
|
|
|
1211
1760
|
timeoutSeconds
|
|
1212
1761
|
});
|
|
1213
1762
|
this.prompt = new PromptManager({ apiClient: this.api });
|
|
1214
|
-
this.dataset = new DatasetManager({
|
|
1763
|
+
this.dataset = new DatasetManager({ langfuseClient: this });
|
|
1215
1764
|
this.score = new ScoreManager({ apiClient: this.api });
|
|
1216
1765
|
this.media = new MediaManager({ apiClient: this.api });
|
|
1766
|
+
this.experiment = new ExperimentManager({ langfuseClient: this });
|
|
1217
1767
|
this.getPrompt = this.prompt.get.bind(this.prompt);
|
|
1218
1768
|
this.createPrompt = this.prompt.create.bind(this.prompt);
|
|
1219
1769
|
this.updatePrompt = this.prompt.update.bind(this.prompt);
|
|
@@ -1288,14 +1838,35 @@ var LangfuseClient = class {
|
|
|
1288
1838
|
return traceUrl;
|
|
1289
1839
|
}
|
|
1290
1840
|
};
|
|
1841
|
+
|
|
1842
|
+
// src/experiment/adapters.ts
|
|
1843
|
+
function autoevalsToLangfuseEvaluator(autoevalEvaluator, params) {
|
|
1844
|
+
const langfuseEvaluator = async (langfuseEvaluatorParams) => {
|
|
1845
|
+
var _a;
|
|
1846
|
+
const score = await autoevalEvaluator({
|
|
1847
|
+
...params != null ? params : {},
|
|
1848
|
+
input: langfuseEvaluatorParams.input,
|
|
1849
|
+
output: langfuseEvaluatorParams.output,
|
|
1850
|
+
expected: langfuseEvaluatorParams.expectedOutput
|
|
1851
|
+
});
|
|
1852
|
+
return {
|
|
1853
|
+
name: score.name,
|
|
1854
|
+
value: (_a = score.score) != null ? _a : 0,
|
|
1855
|
+
metadata: score.metadata
|
|
1856
|
+
};
|
|
1857
|
+
};
|
|
1858
|
+
return langfuseEvaluator;
|
|
1859
|
+
}
|
|
1291
1860
|
export {
|
|
1292
1861
|
ChatMessageType,
|
|
1293
1862
|
ChatPromptClient,
|
|
1294
1863
|
DatasetManager,
|
|
1864
|
+
ExperimentManager,
|
|
1295
1865
|
LangfuseClient,
|
|
1296
1866
|
MediaManager,
|
|
1297
1867
|
PromptManager,
|
|
1298
1868
|
ScoreManager,
|
|
1299
|
-
TextPromptClient
|
|
1869
|
+
TextPromptClient,
|
|
1870
|
+
autoevalsToLangfuseEvaluator
|
|
1300
1871
|
};
|
|
1301
1872
|
//# sourceMappingURL=index.mjs.map
|