@langfuse/client 4.1.0-alpha.1 → 4.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +67 -16
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +48 -14
- package/dist/index.d.ts +48 -14
- package/dist/index.mjs +66 -15
- package/dist/index.mjs.map +1 -1
- package/package.json +3 -3
package/dist/index.d.cts
CHANGED
|
@@ -120,6 +120,14 @@ type ExperimentParams<Input = any, ExpectedOutput = any, Metadata extends Record
|
|
|
120
120
|
* Choose a descriptive name that identifies the experiment's purpose.
|
|
121
121
|
*/
|
|
122
122
|
name: string;
|
|
123
|
+
/**
|
|
124
|
+
* Optional exact name for the experiment run.
|
|
125
|
+
*
|
|
126
|
+
* If provided, this will be used as the exact dataset run name if the data
|
|
127
|
+
* contains Langfuse dataset items. If not provided, this will default to
|
|
128
|
+
* the experiment name appended with an ISO timestamp.
|
|
129
|
+
*/
|
|
130
|
+
runName?: string;
|
|
123
131
|
/**
|
|
124
132
|
* Optional description explaining the experiment's purpose.
|
|
125
133
|
*
|
|
@@ -227,10 +235,10 @@ type ExperimentItemResult<Input = any, ExpectedOutput = any, Metadata extends Re
|
|
|
227
235
|
* console.log(`Average score: ${avgScore?.value}`);
|
|
228
236
|
*
|
|
229
237
|
* // Print formatted results
|
|
230
|
-
* console.log(await result.
|
|
238
|
+
* console.log(await result.format());
|
|
231
239
|
*
|
|
232
240
|
* // Print summary with individual item results
|
|
233
|
-
* console.log(await result.
|
|
241
|
+
* console.log(await result.format({ includeItemResults: true }));
|
|
234
242
|
*
|
|
235
243
|
* // Link to dataset run (if available)
|
|
236
244
|
* if (result.datasetRunUrl) {
|
|
@@ -241,6 +249,13 @@ type ExperimentItemResult<Input = any, ExpectedOutput = any, Metadata extends Re
|
|
|
241
249
|
* @public
|
|
242
250
|
*/
|
|
243
251
|
type ExperimentResult<Input = any, ExpectedOutput = any, Metadata extends Record<string, any> = Record<string, any>> = {
|
|
252
|
+
/**
|
|
253
|
+
* The experiment run name.
|
|
254
|
+
*
|
|
255
|
+
* This is equal to the dataset run name if experiment was on Langfuse dataset.
|
|
256
|
+
* Either the provided runName parameter or generated name (experiment name + timestamp).
|
|
257
|
+
*/
|
|
258
|
+
runName: string;
|
|
244
259
|
/**
|
|
245
260
|
* ID of the dataset run in Langfuse (only for experiments on Langfuse datasets).
|
|
246
261
|
*
|
|
@@ -273,7 +288,7 @@ type ExperimentResult<Input = any, ExpectedOutput = any, Metadata extends Record
|
|
|
273
288
|
*/
|
|
274
289
|
runEvaluations: Evaluation[];
|
|
275
290
|
/**
|
|
276
|
-
* Function to format
|
|
291
|
+
* Function to format experiment results in a human-readable format.
|
|
277
292
|
*
|
|
278
293
|
* Generates a comprehensive, nicely formatted summary including individual results,
|
|
279
294
|
* aggregate statistics, evaluation scores, and links to traces and dataset runs.
|
|
@@ -282,7 +297,7 @@ type ExperimentResult<Input = any, ExpectedOutput = any, Metadata extends Record
|
|
|
282
297
|
* @param options.includeItemResults - Whether to include individual item details (default: false)
|
|
283
298
|
* @returns Promise resolving to formatted string representation
|
|
284
299
|
*/
|
|
285
|
-
|
|
300
|
+
format: (options?: {
|
|
286
301
|
includeItemResults?: boolean;
|
|
287
302
|
}) => Promise<string>;
|
|
288
303
|
};
|
|
@@ -301,6 +316,7 @@ type ExperimentResult<Input = any, ExpectedOutput = any, Metadata extends Record
|
|
|
301
316
|
* const dataset = await langfuse.dataset.get("my-dataset");
|
|
302
317
|
* const result = await dataset.runExperiment({
|
|
303
318
|
* name: "Model Evaluation",
|
|
319
|
+
* runName: "Model Evaluation Run 1", // optional
|
|
304
320
|
* task: myTask,
|
|
305
321
|
* evaluators: [myEvaluator]
|
|
306
322
|
* });
|
|
@@ -469,6 +485,7 @@ declare class DatasetManager {
|
|
|
469
485
|
*
|
|
470
486
|
* const result = await dataset.runExperiment({
|
|
471
487
|
* name: "GPT-4 Benchmark",
|
|
488
|
+
* runName: "GPT-4 Benchmark v1.2", // optional exact run name
|
|
472
489
|
* description: "Evaluating GPT-4 on our benchmark tasks",
|
|
473
490
|
* task: async ({ input }) => {
|
|
474
491
|
* const response = await openai.chat.completions.create({
|
|
@@ -485,7 +502,7 @@ declare class DatasetManager {
|
|
|
485
502
|
* ]
|
|
486
503
|
* });
|
|
487
504
|
*
|
|
488
|
-
* console.log(await result.
|
|
505
|
+
* console.log(await result.format());
|
|
489
506
|
* ```
|
|
490
507
|
*
|
|
491
508
|
* @example Handling large datasets
|
|
@@ -549,7 +566,7 @@ declare class DatasetManager {
|
|
|
549
566
|
* ]
|
|
550
567
|
* });
|
|
551
568
|
*
|
|
552
|
-
* console.log(await result.
|
|
569
|
+
* console.log(await result.format());
|
|
553
570
|
* ```
|
|
554
571
|
*
|
|
555
572
|
* @example Using with Langfuse datasets
|
|
@@ -597,6 +614,7 @@ declare class ExperimentManager {
|
|
|
597
614
|
*
|
|
598
615
|
* @param config - The experiment configuration
|
|
599
616
|
* @param config.name - Human-readable name for the experiment
|
|
617
|
+
* @param config.runName - Optional exact name for the experiment run (defaults to name + timestamp)
|
|
600
618
|
* @param config.description - Optional description of the experiment's purpose
|
|
601
619
|
* @param config.metadata - Optional metadata to attach to the experiment run
|
|
602
620
|
* @param config.data - Array of data items to process (ExperimentItem[] or DatasetItem[])
|
|
@@ -606,10 +624,11 @@ declare class ExperimentManager {
|
|
|
606
624
|
* @param config.maxConcurrency - Maximum number of concurrent task executions (default: Infinity)
|
|
607
625
|
*
|
|
608
626
|
* @returns Promise that resolves to experiment results including:
|
|
627
|
+
* - runName: The experiment run name (either provided or generated)
|
|
609
628
|
* - itemResults: Results for each processed data item
|
|
610
629
|
* - runEvaluations: Results from run-level evaluators
|
|
611
630
|
* - datasetRunId: ID of the dataset run (if using Langfuse datasets)
|
|
612
|
-
* -
|
|
631
|
+
* - format: Function to format results for display
|
|
613
632
|
*
|
|
614
633
|
* @throws {Error} When task execution fails and cannot be handled gracefully
|
|
615
634
|
* @throws {Error} When required evaluators fail critically
|
|
@@ -669,6 +688,7 @@ declare class ExperimentManager {
|
|
|
669
688
|
*
|
|
670
689
|
* @param params - Parameters for item execution
|
|
671
690
|
* @param params.experimentName - Name of the parent experiment
|
|
691
|
+
* @param params.experimentRunName - Run name for the parent experiment
|
|
672
692
|
* @param params.experimentDescription - Description of the parent experiment
|
|
673
693
|
* @param params.experimentMetadata - Metadata for the parent experiment
|
|
674
694
|
* @param params.item - The data item to process
|
|
@@ -758,6 +778,20 @@ declare class ExperimentManager {
|
|
|
758
778
|
*/
|
|
759
779
|
private formatValue;
|
|
760
780
|
private isOtelRegistered;
|
|
781
|
+
/**
|
|
782
|
+
* Creates an experiment run name based on provided parameters.
|
|
783
|
+
*
|
|
784
|
+
* If runName is provided, returns it directly. Otherwise, generates
|
|
785
|
+
* a name by combining the experiment name with an ISO timestamp.
|
|
786
|
+
*
|
|
787
|
+
* @param params - Parameters for run name creation
|
|
788
|
+
* @param params.name - The experiment name
|
|
789
|
+
* @param params.runName - Optional provided run name
|
|
790
|
+
* @returns The final run name to use
|
|
791
|
+
*
|
|
792
|
+
* @internal
|
|
793
|
+
*/
|
|
794
|
+
private createExperimentRunName;
|
|
761
795
|
}
|
|
762
796
|
|
|
763
797
|
/**
|
|
@@ -1501,7 +1535,7 @@ declare class LangfuseClient {
|
|
|
1501
1535
|
* ]
|
|
1502
1536
|
* });
|
|
1503
1537
|
*
|
|
1504
|
-
* console.log(await result.
|
|
1538
|
+
* console.log(await result.format());
|
|
1505
1539
|
* ```
|
|
1506
1540
|
*
|
|
1507
1541
|
* @example Using with datasets
|
|
@@ -1674,10 +1708,10 @@ declare class LangfuseClient {
|
|
|
1674
1708
|
* @example Basic usage with AutoEvals
|
|
1675
1709
|
* ```typescript
|
|
1676
1710
|
* import { Factuality, Levenshtein } from 'autoevals';
|
|
1677
|
-
* import { autoevalsToLangfuseEvaluator } from '@langfuse/client';
|
|
1711
|
+
* import { createEvaluatorFromAutoevals } from '@langfuse/client';
|
|
1678
1712
|
*
|
|
1679
|
-
* const factualityEvaluator = autoevalsToLangfuseEvaluator(Factuality);
|
|
1680
|
-
* const levenshteinEvaluator = autoevalsToLangfuseEvaluator(Levenshtein);
|
|
1713
|
+
* const factualityEvaluator = createEvaluatorFromAutoevals(Factuality);
|
|
1714
|
+
* const levenshteinEvaluator = createEvaluatorFromAutoevals(Levenshtein);
|
|
1681
1715
|
*
|
|
1682
1716
|
* await langfuse.experiment.run({
|
|
1683
1717
|
* name: "AutoEvals Integration Test",
|
|
@@ -1691,7 +1725,7 @@ declare class LangfuseClient {
|
|
|
1691
1725
|
* ```typescript
|
|
1692
1726
|
* import { Factuality } from 'autoevals';
|
|
1693
1727
|
*
|
|
1694
|
-
* const factualityEvaluator = autoevalsToLangfuseEvaluator(
|
|
1728
|
+
* const factualityEvaluator = createEvaluatorFromAutoevals(
|
|
1695
1729
|
* Factuality,
|
|
1696
1730
|
* { model: 'gpt-4o' } // Additional params for AutoEvals
|
|
1697
1731
|
* );
|
|
@@ -1710,7 +1744,7 @@ declare class LangfuseClient {
|
|
|
1710
1744
|
* @public
|
|
1711
1745
|
* @since 4.0.0
|
|
1712
1746
|
*/
|
|
1713
|
-
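declare function autoevalsToLangfuseEvaluator<E extends CallableFunction>(autoevalEvaluator: E, params?: Params<E>): Evaluator;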
|
|
1747
|
+
declare function createEvaluatorFromAutoevals<E extends CallableFunction>(autoevalEvaluator: E, params?: Params<E>): Evaluator;
|
|
1714
1748
|
/**
|
|
1715
1749
|
* Utility type to extract parameter types from AutoEvals evaluator functions.
|
|
1716
1750
|
*
|
|
@@ -1723,4 +1757,4 @@ declare function autoevalsToLangfuseEvaluator<E extends CallableFunction>(autoev
|
|
|
1723
1757
|
*/
|
|
1724
1758
|
type Params<E> = Parameters<E extends (...args: any[]) => any ? E : never>[0] extends infer P ? Omit<P, "input" | "output" | "expected"> : never;
|
|
1725
1759
|
|
|
1726
|
-
export { type ChatMessageOrPlaceholder, ChatMessageType, ChatPromptClient, type CreateChatPromptBodyWithPlaceholders, DatasetManager, type Evaluation, type Evaluator, type EvaluatorParams, type ExperimentItem, type ExperimentItemResult, ExperimentManager, type ExperimentParams, type ExperimentResult, type ExperimentTask, type ExperimentTaskParams, type FetchedDataset, type LangchainMessagesPlaceholder, LangfuseClient, type LangfuseClientParams, type LangfuseMediaResolveMediaReferencesParams, type LinkDatasetItemFunction, MediaManager, PromptManager, type RunEvaluator, type RunEvaluatorParams, type RunExperimentOnDataset, ScoreManager, TextPromptClient, autoevalsToLangfuseEvaluator };
|
|
1760
|
+
export { type ChatMessageOrPlaceholder, ChatMessageType, ChatPromptClient, type CreateChatPromptBodyWithPlaceholders, DatasetManager, type Evaluation, type Evaluator, type EvaluatorParams, type ExperimentItem, type ExperimentItemResult, ExperimentManager, type ExperimentParams, type ExperimentResult, type ExperimentTask, type ExperimentTaskParams, type FetchedDataset, type LangchainMessagesPlaceholder, LangfuseClient, type LangfuseClientParams, type LangfuseMediaResolveMediaReferencesParams, type LinkDatasetItemFunction, MediaManager, PromptManager, type RunEvaluator, type RunEvaluatorParams, type RunExperimentOnDataset, ScoreManager, TextPromptClient, createEvaluatorFromAutoevals };
|
package/dist/index.d.ts
CHANGED
|
@@ -120,6 +120,14 @@ type ExperimentParams<Input = any, ExpectedOutput = any, Metadata extends Record
|
|
|
120
120
|
* Choose a descriptive name that identifies the experiment's purpose.
|
|
121
121
|
*/
|
|
122
122
|
name: string;
|
|
123
|
+
/**
|
|
124
|
+
* Optional exact name for the experiment run.
|
|
125
|
+
*
|
|
126
|
+
* If provided, this will be used as the exact dataset run name if the data
|
|
127
|
+
* contains Langfuse dataset items. If not provided, this will default to
|
|
128
|
+
* the experiment name appended with an ISO timestamp.
|
|
129
|
+
*/
|
|
130
|
+
runName?: string;
|
|
123
131
|
/**
|
|
124
132
|
* Optional description explaining the experiment's purpose.
|
|
125
133
|
*
|
|
@@ -227,10 +235,10 @@ type ExperimentItemResult<Input = any, ExpectedOutput = any, Metadata extends Re
|
|
|
227
235
|
* console.log(`Average score: ${avgScore?.value}`);
|
|
228
236
|
*
|
|
229
237
|
* // Print formatted results
|
|
230
|
-
* console.log(await result.
|
|
238
|
+
* console.log(await result.format());
|
|
231
239
|
*
|
|
232
240
|
* // Print summary with individual item results
|
|
233
|
-
* console.log(await result.
|
|
241
|
+
* console.log(await result.format({ includeItemResults: true }));
|
|
234
242
|
*
|
|
235
243
|
* // Link to dataset run (if available)
|
|
236
244
|
* if (result.datasetRunUrl) {
|
|
@@ -241,6 +249,13 @@ type ExperimentItemResult<Input = any, ExpectedOutput = any, Metadata extends Re
|
|
|
241
249
|
* @public
|
|
242
250
|
*/
|
|
243
251
|
type ExperimentResult<Input = any, ExpectedOutput = any, Metadata extends Record<string, any> = Record<string, any>> = {
|
|
252
|
+
/**
|
|
253
|
+
* The experiment run name.
|
|
254
|
+
*
|
|
255
|
+
* This is equal to the dataset run name if experiment was on Langfuse dataset.
|
|
256
|
+
* Either the provided runName parameter or generated name (experiment name + timestamp).
|
|
257
|
+
*/
|
|
258
|
+
runName: string;
|
|
244
259
|
/**
|
|
245
260
|
* ID of the dataset run in Langfuse (only for experiments on Langfuse datasets).
|
|
246
261
|
*
|
|
@@ -273,7 +288,7 @@ type ExperimentResult<Input = any, ExpectedOutput = any, Metadata extends Record
|
|
|
273
288
|
*/
|
|
274
289
|
runEvaluations: Evaluation[];
|
|
275
290
|
/**
|
|
276
|
-
* Function to format
|
|
291
|
+
* Function to format experiment results in a human-readable format.
|
|
277
292
|
*
|
|
278
293
|
* Generates a comprehensive, nicely formatted summary including individual results,
|
|
279
294
|
* aggregate statistics, evaluation scores, and links to traces and dataset runs.
|
|
@@ -282,7 +297,7 @@ type ExperimentResult<Input = any, ExpectedOutput = any, Metadata extends Record
|
|
|
282
297
|
* @param options.includeItemResults - Whether to include individual item details (default: false)
|
|
283
298
|
* @returns Promise resolving to formatted string representation
|
|
284
299
|
*/
|
|
285
|
-
|
|
300
|
+
format: (options?: {
|
|
286
301
|
includeItemResults?: boolean;
|
|
287
302
|
}) => Promise<string>;
|
|
288
303
|
};
|
|
@@ -301,6 +316,7 @@ type ExperimentResult<Input = any, ExpectedOutput = any, Metadata extends Record
|
|
|
301
316
|
* const dataset = await langfuse.dataset.get("my-dataset");
|
|
302
317
|
* const result = await dataset.runExperiment({
|
|
303
318
|
* name: "Model Evaluation",
|
|
319
|
+
* runName: "Model Evaluation Run 1", // optional
|
|
304
320
|
* task: myTask,
|
|
305
321
|
* evaluators: [myEvaluator]
|
|
306
322
|
* });
|
|
@@ -469,6 +485,7 @@ declare class DatasetManager {
|
|
|
469
485
|
*
|
|
470
486
|
* const result = await dataset.runExperiment({
|
|
471
487
|
* name: "GPT-4 Benchmark",
|
|
488
|
+
* runName: "GPT-4 Benchmark v1.2", // optional exact run name
|
|
472
489
|
* description: "Evaluating GPT-4 on our benchmark tasks",
|
|
473
490
|
* task: async ({ input }) => {
|
|
474
491
|
* const response = await openai.chat.completions.create({
|
|
@@ -485,7 +502,7 @@ declare class DatasetManager {
|
|
|
485
502
|
* ]
|
|
486
503
|
* });
|
|
487
504
|
*
|
|
488
|
-
* console.log(await result.
|
|
505
|
+
* console.log(await result.format());
|
|
489
506
|
* ```
|
|
490
507
|
*
|
|
491
508
|
* @example Handling large datasets
|
|
@@ -549,7 +566,7 @@ declare class DatasetManager {
|
|
|
549
566
|
* ]
|
|
550
567
|
* });
|
|
551
568
|
*
|
|
552
|
-
* console.log(await result.
|
|
569
|
+
* console.log(await result.format());
|
|
553
570
|
* ```
|
|
554
571
|
*
|
|
555
572
|
* @example Using with Langfuse datasets
|
|
@@ -597,6 +614,7 @@ declare class ExperimentManager {
|
|
|
597
614
|
*
|
|
598
615
|
* @param config - The experiment configuration
|
|
599
616
|
* @param config.name - Human-readable name for the experiment
|
|
617
|
+
* @param config.runName - Optional exact name for the experiment run (defaults to name + timestamp)
|
|
600
618
|
* @param config.description - Optional description of the experiment's purpose
|
|
601
619
|
* @param config.metadata - Optional metadata to attach to the experiment run
|
|
602
620
|
* @param config.data - Array of data items to process (ExperimentItem[] or DatasetItem[])
|
|
@@ -606,10 +624,11 @@ declare class ExperimentManager {
|
|
|
606
624
|
* @param config.maxConcurrency - Maximum number of concurrent task executions (default: Infinity)
|
|
607
625
|
*
|
|
608
626
|
* @returns Promise that resolves to experiment results including:
|
|
627
|
+
* - runName: The experiment run name (either provided or generated)
|
|
609
628
|
* - itemResults: Results for each processed data item
|
|
610
629
|
* - runEvaluations: Results from run-level evaluators
|
|
611
630
|
* - datasetRunId: ID of the dataset run (if using Langfuse datasets)
|
|
612
|
-
* -
|
|
631
|
+
* - format: Function to format results for display
|
|
613
632
|
*
|
|
614
633
|
* @throws {Error} When task execution fails and cannot be handled gracefully
|
|
615
634
|
* @throws {Error} When required evaluators fail critically
|
|
@@ -669,6 +688,7 @@ declare class ExperimentManager {
|
|
|
669
688
|
*
|
|
670
689
|
* @param params - Parameters for item execution
|
|
671
690
|
* @param params.experimentName - Name of the parent experiment
|
|
691
|
+
* @param params.experimentRunName - Run name for the parent experiment
|
|
672
692
|
* @param params.experimentDescription - Description of the parent experiment
|
|
673
693
|
* @param params.experimentMetadata - Metadata for the parent experiment
|
|
674
694
|
* @param params.item - The data item to process
|
|
@@ -758,6 +778,20 @@ declare class ExperimentManager {
|
|
|
758
778
|
*/
|
|
759
779
|
private formatValue;
|
|
760
780
|
private isOtelRegistered;
|
|
781
|
+
/**
|
|
782
|
+
* Creates an experiment run name based on provided parameters.
|
|
783
|
+
*
|
|
784
|
+
* If runName is provided, returns it directly. Otherwise, generates
|
|
785
|
+
* a name by combining the experiment name with an ISO timestamp.
|
|
786
|
+
*
|
|
787
|
+
* @param params - Parameters for run name creation
|
|
788
|
+
* @param params.name - The experiment name
|
|
789
|
+
* @param params.runName - Optional provided run name
|
|
790
|
+
* @returns The final run name to use
|
|
791
|
+
*
|
|
792
|
+
* @internal
|
|
793
|
+
*/
|
|
794
|
+
private createExperimentRunName;
|
|
761
795
|
}
|
|
762
796
|
|
|
763
797
|
/**
|
|
@@ -1501,7 +1535,7 @@ declare class LangfuseClient {
|
|
|
1501
1535
|
* ]
|
|
1502
1536
|
* });
|
|
1503
1537
|
*
|
|
1504
|
-
* console.log(await result.
|
|
1538
|
+
* console.log(await result.format());
|
|
1505
1539
|
* ```
|
|
1506
1540
|
*
|
|
1507
1541
|
* @example Using with datasets
|
|
@@ -1674,10 +1708,10 @@ declare class LangfuseClient {
|
|
|
1674
1708
|
* @example Basic usage with AutoEvals
|
|
1675
1709
|
* ```typescript
|
|
1676
1710
|
* import { Factuality, Levenshtein } from 'autoevals';
|
|
1677
|
-
* import { autoevalsToLangfuseEvaluator } from '@langfuse/client';
|
|
1711
|
+
* import { createEvaluatorFromAutoevals } from '@langfuse/client';
|
|
1678
1712
|
*
|
|
1679
|
-
* const factualityEvaluator = autoevalsToLangfuseEvaluator(Factuality);
|
|
1680
|
-
* const levenshteinEvaluator = autoevalsToLangfuseEvaluator(Levenshtein);
|
|
1713
|
+
* const factualityEvaluator = createEvaluatorFromAutoevals(Factuality);
|
|
1714
|
+
* const levenshteinEvaluator = createEvaluatorFromAutoevals(Levenshtein);
|
|
1681
1715
|
*
|
|
1682
1716
|
* await langfuse.experiment.run({
|
|
1683
1717
|
* name: "AutoEvals Integration Test",
|
|
@@ -1691,7 +1725,7 @@ declare class LangfuseClient {
|
|
|
1691
1725
|
* ```typescript
|
|
1692
1726
|
* import { Factuality } from 'autoevals';
|
|
1693
1727
|
*
|
|
1694
|
-
* const factualityEvaluator = autoevalsToLangfuseEvaluator(
|
|
1728
|
+
* const factualityEvaluator = createEvaluatorFromAutoevals(
|
|
1695
1729
|
* Factuality,
|
|
1696
1730
|
* { model: 'gpt-4o' } // Additional params for AutoEvals
|
|
1697
1731
|
* );
|
|
@@ -1710,7 +1744,7 @@ declare class LangfuseClient {
|
|
|
1710
1744
|
* @public
|
|
1711
1745
|
* @since 4.0.0
|
|
1712
1746
|
*/
|
|
1713
|
-
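declare function autoevalsToLangfuseEvaluator<E extends CallableFunction>(autoevalEvaluator: E, params?: Params<E>): Evaluator;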
|
|
1747
|
+
declare function createEvaluatorFromAutoevals<E extends CallableFunction>(autoevalEvaluator: E, params?: Params<E>): Evaluator;
|
|
1714
1748
|
/**
|
|
1715
1749
|
* Utility type to extract parameter types from AutoEvals evaluator functions.
|
|
1716
1750
|
*
|
|
@@ -1723,4 +1757,4 @@ declare function autoevalsToLangfuseEvaluator<E extends CallableFunction>(autoev
|
|
|
1723
1757
|
*/
|
|
1724
1758
|
type Params<E> = Parameters<E extends (...args: any[]) => any ? E : never>[0] extends infer P ? Omit<P, "input" | "output" | "expected"> : never;
|
|
1725
1759
|
|
|
1726
|
-
export { type ChatMessageOrPlaceholder, ChatMessageType, ChatPromptClient, type CreateChatPromptBodyWithPlaceholders, DatasetManager, type Evaluation, type Evaluator, type EvaluatorParams, type ExperimentItem, type ExperimentItemResult, ExperimentManager, type ExperimentParams, type ExperimentResult, type ExperimentTask, type ExperimentTaskParams, type FetchedDataset, type LangchainMessagesPlaceholder, LangfuseClient, type LangfuseClientParams, type LangfuseMediaResolveMediaReferencesParams, type LinkDatasetItemFunction, MediaManager, PromptManager, type RunEvaluator, type RunEvaluatorParams, type RunExperimentOnDataset, ScoreManager, TextPromptClient, autoevalsToLangfuseEvaluator };
|
|
1760
|
+
export { type ChatMessageOrPlaceholder, ChatMessageType, ChatPromptClient, type CreateChatPromptBodyWithPlaceholders, DatasetManager, type Evaluation, type Evaluator, type EvaluatorParams, type ExperimentItem, type ExperimentItemResult, ExperimentManager, type ExperimentParams, type ExperimentResult, type ExperimentTask, type ExperimentTaskParams, type FetchedDataset, type LangchainMessagesPlaceholder, LangfuseClient, type LangfuseClientParams, type LangfuseMediaResolveMediaReferencesParams, type LinkDatasetItemFunction, MediaManager, PromptManager, type RunEvaluator, type RunEvaluatorParams, type RunExperimentOnDataset, ScoreManager, TextPromptClient, createEvaluatorFromAutoevals };
|
package/dist/index.mjs
CHANGED
|
@@ -59,6 +59,7 @@ var DatasetManager = class {
|
|
|
59
59
|
*
|
|
60
60
|
* const result = await dataset.runExperiment({
|
|
61
61
|
* name: "GPT-4 Benchmark",
|
|
62
|
+
* runName: "GPT-4 Benchmark v1.2", // optional exact run name
|
|
62
63
|
* description: "Evaluating GPT-4 on our benchmark tasks",
|
|
63
64
|
* task: async ({ input }) => {
|
|
64
65
|
* const response = await openai.chat.completions.create({
|
|
@@ -75,7 +76,7 @@ var DatasetManager = class {
|
|
|
75
76
|
* ]
|
|
76
77
|
* });
|
|
77
78
|
*
|
|
78
|
-
* console.log(await result.
|
|
79
|
+
* console.log(await result.format());
|
|
79
80
|
* ```
|
|
80
81
|
*
|
|
81
82
|
* @example Handling large datasets
|
|
@@ -184,6 +185,7 @@ var ExperimentManager = class {
|
|
|
184
185
|
*
|
|
185
186
|
* @param config - The experiment configuration
|
|
186
187
|
* @param config.name - Human-readable name for the experiment
|
|
188
|
+
* @param config.runName - Optional exact name for the experiment run (defaults to name + timestamp)
|
|
187
189
|
* @param config.description - Optional description of the experiment's purpose
|
|
188
190
|
* @param config.metadata - Optional metadata to attach to the experiment run
|
|
189
191
|
* @param config.data - Array of data items to process (ExperimentItem[] or DatasetItem[])
|
|
@@ -193,10 +195,11 @@ var ExperimentManager = class {
|
|
|
193
195
|
* @param config.maxConcurrency - Maximum number of concurrent task executions (default: Infinity)
|
|
194
196
|
*
|
|
195
197
|
* @returns Promise that resolves to experiment results including:
|
|
198
|
+
* - runName: The experiment run name (either provided or generated)
|
|
196
199
|
* - itemResults: Results for each processed data item
|
|
197
200
|
* - runEvaluations: Results from run-level evaluators
|
|
198
201
|
* - datasetRunId: ID of the dataset run (if using Langfuse datasets)
|
|
199
|
-
* -
|
|
202
|
+
* - format: Function to format results for display
|
|
200
203
|
*
|
|
201
204
|
* @throws {Error} When task execution fails and cannot be handled gracefully
|
|
202
205
|
* @throws {Error} When required evaluators fail critically
|
|
@@ -249,11 +252,16 @@ var ExperimentManager = class {
|
|
|
249
252
|
evaluators,
|
|
250
253
|
task,
|
|
251
254
|
name,
|
|
255
|
+
runName: providedRunName,
|
|
252
256
|
description,
|
|
253
257
|
metadata,
|
|
254
258
|
maxConcurrency: batchSize = Infinity,
|
|
255
259
|
runEvaluators
|
|
256
260
|
} = config;
|
|
261
|
+
const runName = this.createExperimentRunName({
|
|
262
|
+
name,
|
|
263
|
+
runName: providedRunName
|
|
264
|
+
});
|
|
257
265
|
if (!this.isOtelRegistered()) {
|
|
258
266
|
this.logger.warn(
|
|
259
267
|
"OpenTelemetry has not been set up. Traces will not be sent to Langfuse.See our docs on how to set up OpenTelemetry: https://langfuse.com/docs/observability/sdk/typescript/setup#tracing-setup"
|
|
@@ -268,11 +276,26 @@ var ExperimentManager = class {
|
|
|
268
276
|
evaluators,
|
|
269
277
|
task,
|
|
270
278
|
experimentName: name,
|
|
279
|
+
experimentRunName: runName,
|
|
271
280
|
experimentDescription: description,
|
|
272
281
|
experimentMetadata: metadata
|
|
273
282
|
});
|
|
274
283
|
});
|
|
275
|
-
const
|
|
284
|
+
const settledResults = await Promise.allSettled(promises);
|
|
285
|
+
const results = settledResults.reduce(
|
|
286
|
+
(acc, settledResult) => {
|
|
287
|
+
if (settledResult.status === "fulfilled") {
|
|
288
|
+
acc.push(settledResult.value);
|
|
289
|
+
} else {
|
|
290
|
+
const errorMessage = settledResult.reason instanceof Error ? settledResult.reason.message : String(settledResult.reason);
|
|
291
|
+
this.logger.error(
|
|
292
|
+
`Task failed with error: ${errorMessage}. Skipping item.`
|
|
293
|
+
);
|
|
294
|
+
}
|
|
295
|
+
return acc;
|
|
296
|
+
},
|
|
297
|
+
[]
|
|
298
|
+
);
|
|
276
299
|
itemResults.push(...results);
|
|
277
300
|
}
|
|
278
301
|
const datasetRunId = itemResults.length > 0 ? itemResults[0].datasetRunId : void 0;
|
|
@@ -311,11 +334,12 @@ var ExperimentManager = class {
|
|
|
311
334
|
}
|
|
312
335
|
await this.langfuseClient.score.flush();
|
|
313
336
|
return {
|
|
337
|
+
runName,
|
|
314
338
|
itemResults,
|
|
315
339
|
datasetRunId,
|
|
316
340
|
datasetRunUrl,
|
|
317
341
|
runEvaluations,
|
|
318
|
-
|
|
342
|
+
format: async (options) => {
|
|
319
343
|
var _a;
|
|
320
344
|
return await this.prettyPrintResults({
|
|
321
345
|
datasetRunUrl,
|
|
@@ -323,6 +347,7 @@ var ExperimentManager = class {
|
|
|
323
347
|
originalData: data,
|
|
324
348
|
runEvaluations,
|
|
325
349
|
name: config.name,
|
|
350
|
+
runName,
|
|
326
351
|
description: config.description,
|
|
327
352
|
includeItemResults: (_a = options == null ? void 0 : options.includeItemResults) != null ? _a : false
|
|
328
353
|
});
|
|
@@ -341,6 +366,7 @@ var ExperimentManager = class {
|
|
|
341
366
|
*
|
|
342
367
|
* @param params - Parameters for item execution
|
|
343
368
|
* @param params.experimentName - Name of the parent experiment
|
|
369
|
+
* @param params.experimentRunName - Run name for the parent experiment
|
|
344
370
|
* @param params.experimentDescription - Description of the parent experiment
|
|
345
371
|
* @param params.experimentMetadata - Metadata for the parent experiment
|
|
346
372
|
* @param params.item - The data item to process
|
|
@@ -355,7 +381,7 @@ var ExperimentManager = class {
|
|
|
355
381
|
*/
|
|
356
382
|
async runItem(params) {
|
|
357
383
|
const { item, evaluators = [], task, experimentMetadata = {} } = params;
|
|
358
|
-
const { output, traceId } = await startActiveObservation(
|
|
384
|
+
const { output, traceId, observationId } = await startActiveObservation(
|
|
359
385
|
"experiment-item-run",
|
|
360
386
|
async (span) => {
|
|
361
387
|
var _a;
|
|
@@ -364,26 +390,28 @@ var ExperimentManager = class {
|
|
|
364
390
|
input: item.input,
|
|
365
391
|
output: output2,
|
|
366
392
|
metadata: {
|
|
367
|
-
|
|
393
|
+
experiment_name: params.experimentName,
|
|
394
|
+
experiment_run_name: params.experimentRunName,
|
|
368
395
|
...experimentMetadata,
|
|
369
396
|
...(_a = item.metadata) != null ? _a : {},
|
|
370
397
|
..."id" in item && "datasetId" in item ? {
|
|
371
|
-
|
|
372
|
-
|
|
398
|
+
dataset_id: item["datasetId"],
|
|
399
|
+
dataset_item_id: item["id"]
|
|
373
400
|
} : {}
|
|
374
401
|
}
|
|
375
402
|
});
|
|
376
|
-
return { output: output2, traceId: span.traceId };
|
|
403
|
+
return { output: output2, traceId: span.traceId, observationId: span.id };
|
|
377
404
|
}
|
|
378
405
|
);
|
|
379
406
|
let datasetRunId = void 0;
|
|
380
407
|
if ("id" in item) {
|
|
381
408
|
await this.langfuseClient.api.datasetRunItems.create({
|
|
382
|
-
runName: params.
|
|
409
|
+
runName: params.experimentRunName,
|
|
383
410
|
runDescription: params.experimentDescription,
|
|
384
411
|
metadata: params.experimentMetadata,
|
|
385
412
|
datasetItemId: item.id,
|
|
386
|
-
traceId
|
|
413
|
+
traceId,
|
|
414
|
+
observationId
|
|
387
415
|
}).then((result) => {
|
|
388
416
|
datasetRunId = result.datasetRunId;
|
|
389
417
|
}).catch(
|
|
@@ -505,6 +533,7 @@ ${JSON.stringify(params2)}
|
|
|
505
533
|
originalData,
|
|
506
534
|
runEvaluations,
|
|
507
535
|
name,
|
|
536
|
+
runName,
|
|
508
537
|
description,
|
|
509
538
|
includeItemResults = false
|
|
510
539
|
} = params;
|
|
@@ -562,7 +591,7 @@ ${index + 1}. Item ${index + 1}:
|
|
|
562
591
|
} else {
|
|
563
592
|
output += `Individual Results: Hidden (${itemResults.length} items)
|
|
564
593
|
`;
|
|
565
|
-
output += "\u{1F4A1} Call
|
|
594
|
+
output += "\u{1F4A1} Call format({ includeItemResults: true }) to view them\n";
|
|
566
595
|
}
|
|
567
596
|
const totalItems = itemResults.length;
|
|
568
597
|
const evaluationNames = new Set(
|
|
@@ -571,7 +600,9 @@ ${index + 1}. Item ${index + 1}:
|
|
|
571
600
|
output += `
|
|
572
601
|
${"\u2500".repeat(50)}
|
|
573
602
|
`;
|
|
574
|
-
output += `\u{
|
|
603
|
+
output += `\u{1F9EA} Experiment: ${name}`;
|
|
604
|
+
output += `
|
|
605
|
+
\u{1F4CB} Run name: ${runName}`;
|
|
575
606
|
if (description) {
|
|
576
607
|
output += ` - ${description}`;
|
|
577
608
|
}
|
|
@@ -646,6 +677,26 @@ Run Evaluations:`;
|
|
|
646
677
|
}
|
|
647
678
|
return tracerProvider.constructor.name !== "NoopTracerProvider";
|
|
648
679
|
}
|
|
680
|
+
/**
|
|
681
|
+
* Creates an experiment run name based on provided parameters.
|
|
682
|
+
*
|
|
683
|
+
* If runName is provided, returns it directly. Otherwise, generates
|
|
684
|
+
* a name by combining the experiment name with an ISO timestamp.
|
|
685
|
+
*
|
|
686
|
+
* @param params - Parameters for run name creation
|
|
687
|
+
* @param params.name - The experiment name
|
|
688
|
+
* @param params.runName - Optional provided run name
|
|
689
|
+
* @returns The final run name to use
|
|
690
|
+
*
|
|
691
|
+
* @internal
|
|
692
|
+
*/
|
|
693
|
+
createExperimentRunName(params) {
|
|
694
|
+
if (params.runName) {
|
|
695
|
+
return params.runName;
|
|
696
|
+
}
|
|
697
|
+
const isoTimestamp = (/* @__PURE__ */ new Date()).toISOString();
|
|
698
|
+
return `${params.name} - ${isoTimestamp}`;
|
|
699
|
+
}
|
|
649
700
|
};
|
|
650
701
|
|
|
651
702
|
// src/media/index.ts
|
|
@@ -1850,7 +1901,7 @@ var LangfuseClient = class {
|
|
|
1850
1901
|
};
|
|
1851
1902
|
|
|
1852
1903
|
// src/experiment/adapters.ts
|
|
1853
|
-
function autoevalsToLangfuseEvaluator(autoevalEvaluator, params) {
|
|
1904
|
+
function createEvaluatorFromAutoevals(autoevalEvaluator, params) {
|
|
1854
1905
|
const langfuseEvaluator = async (langfuseEvaluatorParams) => {
|
|
1855
1906
|
var _a;
|
|
1856
1907
|
const score = await autoevalEvaluator({
|
|
@@ -1877,6 +1928,6 @@ export {
|
|
|
1877
1928
|
PromptManager,
|
|
1878
1929
|
ScoreManager,
|
|
1879
1930
|
TextPromptClient,
|
|
1880
|
-
|
|
1931
|
+
createEvaluatorFromAutoevals
|
|
1881
1932
|
};
|
|
1882
1933
|
//# sourceMappingURL=index.mjs.map
|