@fallom/trace 0.2.15 → 0.2.16
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/chunk-3HBKT4HK.mjs +827 -0
- package/dist/chunk-XBZ3ESNV.mjs +824 -0
- package/dist/core-4L56QWI7.mjs +21 -0
- package/dist/core-JLHYFVYS.mjs +21 -0
- package/dist/index.d.mts +140 -3
- package/dist/index.d.ts +140 -3
- package/dist/index.js +169 -2
- package/dist/index.mjs +4 -2
- package/package.json +1 -1
- package/dist/chunk-KFD5AQ7V.mjs +0 -308
- package/dist/models-SEFDGZU2.mjs +0 -8

package/dist/core-4L56QWI7.mjs
ADDED
@@ -0,0 +1,21 @@
+import {
+  DEFAULT_JUDGE_MODEL,
+  _apiKey,
+  _baseUrl,
+  _initialized,
+  compareModels,
+  evaluate,
+  init,
+  uploadResultsPublic
+} from "./chunk-3HBKT4HK.mjs";
+import "./chunk-7P6ASYW6.mjs";
+export {
+  DEFAULT_JUDGE_MODEL,
+  _apiKey,
+  _baseUrl,
+  _initialized,
+  compareModels,
+  evaluate,
+  init,
+  uploadResultsPublic
+};
package/dist/core-JLHYFVYS.mjs
ADDED
@@ -0,0 +1,21 @@
+import {
+  DEFAULT_JUDGE_MODEL,
+  _apiKey,
+  _baseUrl,
+  _initialized,
+  compareModels,
+  evaluate,
+  init,
+  uploadResultsPublic
+} from "./chunk-XBZ3ESNV.mjs";
+import "./chunk-7P6ASYW6.mjs";
+export {
+  DEFAULT_JUDGE_MODEL,
+  _apiKey,
+  _baseUrl,
+  _initialized,
+  compareModels,
+  evaluate,
+  init,
+  uploadResultsPublic
+};
package/dist/index.d.mts
CHANGED
@@ -458,6 +458,37 @@ interface EvalResult {
     tokensOut?: number;
     cost?: number;
 }
+/**
+ * A golden record from a dataset - contains input and optionally expected output.
+ * This represents a "golden" test case that you can use to generate actual outputs
+ * from your LLM pipeline and then evaluate.
+ */
+interface Golden {
+    input: string;
+    expectedOutput?: string;
+    systemMessage?: string;
+    /** Retrieved documents for RAG evaluation */
+    context?: string[];
+    metadata?: Record<string, unknown>;
+}
+/**
+ * A test case for evaluation - contains input and actual output from your LLM.
+ *
+ */
+interface LLMTestCase {
+    /** The user input/query */
+    input: string;
+    /** The output generated by your LLM pipeline */
+    actualOutput: string;
+    /** (Optional) The expected/golden output for comparison */
+    expectedOutput?: string;
+    /** (Optional) System prompt used */
+    systemMessage?: string;
+    /** (Optional) Retrieved documents for RAG faithfulness evaluation */
+    context?: string[];
+    /** (Optional) Additional metadata */
+    metadata?: Record<string, unknown>;
+}
 /** Response format from model calls */
 interface ModelResponse {
     content: string;
@@ -487,13 +518,16 @@ interface InitOptions$1 {
 }
 /** Options for evaluate() */
 interface EvaluateOptions {
-    dataset
+    /** Dataset to evaluate (list of DatasetItem or Fallom dataset key) */
+    dataset?: DatasetInput;
     /** List of metrics to run (built-in or custom). Default: all built-in metrics */
     metrics?: MetricInput[];
     judgeModel?: string;
     name?: string;
     description?: string;
     verbose?: boolean;
+    /** Alternative to dataset - provide test cases from EvaluationDataset */
+    testCases?: LLMTestCase[];
     _skipUpload?: boolean;
 }
 /** Options for compareModels() */
@@ -532,9 +566,10 @@ declare const DEFAULT_JUDGE_MODEL = "openai/gpt-4o-mini";
  */
 declare function init$1(options?: InitOptions$1): void;
 /**
- * Evaluate
+ * Evaluate outputs against specified metrics using G-Eval.
  *
  * Results are automatically uploaded to Fallom dashboard.
+ *
  */
 declare function evaluate(options: EvaluateOptions): Promise<EvalResult[]>;
 /**
@@ -630,12 +665,110 @@ declare function datasetFromFallom(datasetKey: string, version?: number, config?
     _baseUrl?: string;
     _initialized?: boolean;
 }): Promise<DatasetItem[]>;
+/**
+ * A dataset for evaluation that supports pulling from Fallom and adding test cases.
+ *
+ * This provides a workflow where you:
+ * 1. Pull a dataset (goldens) from Fallom
+ * 2. Run your own LLM pipeline on each golden to generate outputs
+ * 3. Add the results as test cases
+ * 4. Evaluate the test cases
+ *
+ */
+declare class EvaluationDataset {
+    private _goldens;
+    private _testCases;
+    private _datasetKey;
+    private _datasetName;
+    private _version;
+    /** List of golden records (inputs with optional expected outputs). */
+    get goldens(): Golden[];
+    /** List of test cases (inputs with actual outputs from your LLM). */
+    get testCases(): LLMTestCase[];
+    /** The Fallom dataset key if pulled from Fallom. */
+    get datasetKey(): string | null;
+    /**
+     * Pull a dataset from Fallom.
+     *
+     * @param alias - The dataset key/alias in Fallom
+     * @param version - Specific version to pull (default: latest)
+     * @returns Self for chaining
+     */
+    pull(alias: string, version?: number): Promise<EvaluationDataset>;
+    /**
+     * Add a golden record manually.
+     * @param golden - A Golden object
+     * @returns Self for chaining
+     */
+    addGolden(golden: Golden): EvaluationDataset;
+    /**
+     * Add multiple golden records.
+     * @param goldens - Array of Golden objects
+     * @returns Self for chaining
+     */
+    addGoldens(goldens: Golden[]): EvaluationDataset;
+    /**
+     * Add a test case with actual LLM output.
+     * @param testCase - An LLMTestCase object
+     * @returns Self for chaining
+     */
+    addTestCase(testCase: LLMTestCase): EvaluationDataset;
+    /**
+     * Add multiple test cases.
+     * @param testCases - Array of LLMTestCase objects
+     * @returns Self for chaining
+     */
+    addTestCases(testCases: LLMTestCase[]): EvaluationDataset;
+    /**
+     * Automatically generate test cases by running all goldens through your LLM app.
+     *
+     * @param llmApp - A callable that takes messages and returns response
+     * @param options - Configuration options
+     * @returns Self for chaining
+     */
+    generateTestCases(llmApp: (messages: Message[]) => Promise<ModelResponse>, options?: {
+        includeContext?: boolean;
+    }): Promise<EvaluationDataset>;
+    /** Clear all test cases (useful for re-running with different LLM). */
+    clearTestCases(): EvaluationDataset;
+    /** Return the number of goldens. */
+    get length(): number;
+}
 
 /**
  * Fallom Evals - Run LLM evaluations locally using G-Eval with LLM as a Judge.
  *
  * Evaluate production outputs or compare different models on your dataset.
  * Results are uploaded to Fallom dashboard for visualization.
+ *
+ * @example
+ * import fallom from "@fallom/trace";
+ *
+ * // Initialize
+ * fallom.evals.init({ apiKey: "flm_xxx" });
+ *
+ * // Method 1: Direct dataset evaluation
+ * const results = await fallom.evals.evaluate({
+ *   dataset: [...],
+ *   metrics: ["answer_relevancy", "faithfulness"],
+ * });
+ *
+ * // Method 2: Use EvaluationDataset with your own LLM pipeline
+ * const dataset = new fallom.evals.EvaluationDataset();
+ * await dataset.pull("my-dataset-key");
+ *
+ * for (const golden of dataset.goldens) {
+ *   const actualOutput = await myLLMApp(golden.input);
+ *   dataset.addTestCase({
+ *     input: golden.input,
+ *     actualOutput,
+ *   });
+ * }
+ *
+ * const results = await fallom.evals.evaluate({
+ *   testCases: dataset.testCases,
+ *   metrics: ["answer_relevancy", "faithfulness"],
+ * });
  */
 
 declare const evals_AVAILABLE_METRICS: typeof AVAILABLE_METRICS;
@@ -646,6 +779,10 @@ type evals_DatasetInput = DatasetInput;
 type evals_DatasetItem = DatasetItem;
 type evals_EvalResult = EvalResult;
 type evals_EvaluateOptions = EvaluateOptions;
+type evals_EvaluationDataset = EvaluationDataset;
+declare const evals_EvaluationDataset: typeof EvaluationDataset;
+type evals_Golden = Golden;
+type evals_LLMTestCase = LLMTestCase;
 declare const evals_METRIC_PROMPTS: typeof METRIC_PROMPTS;
 type evals_Message = Message;
 type evals_MetricInput = MetricInput;
@@ -664,7 +801,7 @@ declare const evals_evaluate: typeof evaluate;
 declare const evals_getMetricName: typeof getMetricName;
 declare const evals_isCustomMetric: typeof isCustomMetric;
 declare namespace evals {
-  export { evals_AVAILABLE_METRICS as AVAILABLE_METRICS, type evals_CompareModelsOptions as CompareModelsOptions, type evals_CustomMetric as CustomMetric, evals_DEFAULT_JUDGE_MODEL as DEFAULT_JUDGE_MODEL, type evals_DatasetInput as DatasetInput, type evals_DatasetItem as DatasetItem, type evals_EvalResult as EvalResult, type evals_EvaluateOptions as EvaluateOptions, type InitOptions$1 as InitOptions, evals_METRIC_PROMPTS as METRIC_PROMPTS, type evals_Message as Message, type evals_MetricInput as MetricInput, type evals_MetricName as MetricName, type evals_Model as Model, type evals_ModelCallable as ModelCallable, type evals_ModelResponse as ModelResponse, evals_compareModels as compareModels, evals_createCustomModel as createCustomModel, evals_createModelFromCallable as createModelFromCallable, evals_createOpenAIModel as createOpenAIModel, evals_customMetric as customMetric, evals_datasetFromFallom as datasetFromFallom, evals_datasetFromTraces as datasetFromTraces, evals_evaluate as evaluate, evals_getMetricName as getMetricName, init$1 as init, evals_isCustomMetric as isCustomMetric, uploadResultsPublic as uploadResults };
+  export { evals_AVAILABLE_METRICS as AVAILABLE_METRICS, type evals_CompareModelsOptions as CompareModelsOptions, type evals_CustomMetric as CustomMetric, evals_DEFAULT_JUDGE_MODEL as DEFAULT_JUDGE_MODEL, type evals_DatasetInput as DatasetInput, type evals_DatasetItem as DatasetItem, type evals_EvalResult as EvalResult, type evals_EvaluateOptions as EvaluateOptions, evals_EvaluationDataset as EvaluationDataset, type evals_Golden as Golden, type InitOptions$1 as InitOptions, type evals_LLMTestCase as LLMTestCase, evals_METRIC_PROMPTS as METRIC_PROMPTS, type evals_Message as Message, type evals_MetricInput as MetricInput, type evals_MetricName as MetricName, type evals_Model as Model, type evals_ModelCallable as ModelCallable, type evals_ModelResponse as ModelResponse, evals_compareModels as compareModels, evals_createCustomModel as createCustomModel, evals_createModelFromCallable as createModelFromCallable, evals_createOpenAIModel as createOpenAIModel, evals_customMetric as customMetric, evals_datasetFromFallom as datasetFromFallom, evals_datasetFromTraces as datasetFromTraces, evals_evaluate as evaluate, evals_getMetricName as getMetricName, init$1 as init, evals_isCustomMetric as isCustomMetric, uploadResultsPublic as uploadResults };
 }
 
 /**
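The declarations above introduce the Golden and LLMTestCase types plus the EvaluationDataset class. As a hedged usage sketch (not shipped in the package; myLLMApp and the sample dataset key are placeholders), the declared generateTestCases() can drive a caller-supplied pipeline over pulled goldens and feed the resulting test cases straight into evaluate():

import fallom from "@fallom/trace";

// Hypothetical pipeline wrapper; it must resolve to an object with a `content`
// string, matching the declared (messages: Message[]) => Promise<ModelResponse> shape.
async function myLLMApp(messages: { role: string; content: string }[]) {
  const lastUser = messages[messages.length - 1];
  return { content: `stub answer for: ${lastUser.content}` };
}

async function main() {
  fallom.evals.init({ apiKey: "flm_xxx" });

  const dataset = new fallom.evals.EvaluationDataset();
  await dataset.pull("my-dataset-key");      // goldens pulled from Fallom
  await dataset.generateTestCases(myLLMApp); // one test case per golden

  const results = await fallom.evals.evaluate({
    testCases: dataset.testCases,
    metrics: ["answer_relevancy", "faithfulness"],
  });
  console.log(`Scored ${results.length} results`);
}

main();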
package/dist/index.d.ts
CHANGED
@@ -458,6 +458,37 @@ interface EvalResult {
     tokensOut?: number;
     cost?: number;
 }
+/**
+ * A golden record from a dataset - contains input and optionally expected output.
+ * This represents a "golden" test case that you can use to generate actual outputs
+ * from your LLM pipeline and then evaluate.
+ */
+interface Golden {
+    input: string;
+    expectedOutput?: string;
+    systemMessage?: string;
+    /** Retrieved documents for RAG evaluation */
+    context?: string[];
+    metadata?: Record<string, unknown>;
+}
+/**
+ * A test case for evaluation - contains input and actual output from your LLM.
+ *
+ */
+interface LLMTestCase {
+    /** The user input/query */
+    input: string;
+    /** The output generated by your LLM pipeline */
+    actualOutput: string;
+    /** (Optional) The expected/golden output for comparison */
+    expectedOutput?: string;
+    /** (Optional) System prompt used */
+    systemMessage?: string;
+    /** (Optional) Retrieved documents for RAG faithfulness evaluation */
+    context?: string[];
+    /** (Optional) Additional metadata */
+    metadata?: Record<string, unknown>;
+}
 /** Response format from model calls */
 interface ModelResponse {
     content: string;
@@ -487,13 +518,16 @@ interface InitOptions$1 {
 }
 /** Options for evaluate() */
 interface EvaluateOptions {
-    dataset
+    /** Dataset to evaluate (list of DatasetItem or Fallom dataset key) */
+    dataset?: DatasetInput;
     /** List of metrics to run (built-in or custom). Default: all built-in metrics */
     metrics?: MetricInput[];
     judgeModel?: string;
     name?: string;
     description?: string;
     verbose?: boolean;
+    /** Alternative to dataset - provide test cases from EvaluationDataset */
+    testCases?: LLMTestCase[];
     _skipUpload?: boolean;
 }
 /** Options for compareModels() */
@@ -532,9 +566,10 @@ declare const DEFAULT_JUDGE_MODEL = "openai/gpt-4o-mini";
  */
 declare function init$1(options?: InitOptions$1): void;
 /**
- * Evaluate
+ * Evaluate outputs against specified metrics using G-Eval.
  *
  * Results are automatically uploaded to Fallom dashboard.
+ *
  */
 declare function evaluate(options: EvaluateOptions): Promise<EvalResult[]>;
 /**
@@ -630,12 +665,110 @@ declare function datasetFromFallom(datasetKey: string, version?: number, config?
     _baseUrl?: string;
     _initialized?: boolean;
 }): Promise<DatasetItem[]>;
+/**
+ * A dataset for evaluation that supports pulling from Fallom and adding test cases.
+ *
+ * This provides a workflow where you:
+ * 1. Pull a dataset (goldens) from Fallom
+ * 2. Run your own LLM pipeline on each golden to generate outputs
+ * 3. Add the results as test cases
+ * 4. Evaluate the test cases
+ *
+ */
+declare class EvaluationDataset {
+    private _goldens;
+    private _testCases;
+    private _datasetKey;
+    private _datasetName;
+    private _version;
+    /** List of golden records (inputs with optional expected outputs). */
+    get goldens(): Golden[];
+    /** List of test cases (inputs with actual outputs from your LLM). */
+    get testCases(): LLMTestCase[];
+    /** The Fallom dataset key if pulled from Fallom. */
+    get datasetKey(): string | null;
+    /**
+     * Pull a dataset from Fallom.
+     *
+     * @param alias - The dataset key/alias in Fallom
+     * @param version - Specific version to pull (default: latest)
+     * @returns Self for chaining
+     */
+    pull(alias: string, version?: number): Promise<EvaluationDataset>;
+    /**
+     * Add a golden record manually.
+     * @param golden - A Golden object
+     * @returns Self for chaining
+     */
+    addGolden(golden: Golden): EvaluationDataset;
+    /**
+     * Add multiple golden records.
+     * @param goldens - Array of Golden objects
+     * @returns Self for chaining
+     */
+    addGoldens(goldens: Golden[]): EvaluationDataset;
+    /**
+     * Add a test case with actual LLM output.
+     * @param testCase - An LLMTestCase object
+     * @returns Self for chaining
+     */
+    addTestCase(testCase: LLMTestCase): EvaluationDataset;
+    /**
+     * Add multiple test cases.
+     * @param testCases - Array of LLMTestCase objects
+     * @returns Self for chaining
+     */
+    addTestCases(testCases: LLMTestCase[]): EvaluationDataset;
+    /**
+     * Automatically generate test cases by running all goldens through your LLM app.
+     *
+     * @param llmApp - A callable that takes messages and returns response
+     * @param options - Configuration options
+     * @returns Self for chaining
+     */
+    generateTestCases(llmApp: (messages: Message[]) => Promise<ModelResponse>, options?: {
+        includeContext?: boolean;
+    }): Promise<EvaluationDataset>;
+    /** Clear all test cases (useful for re-running with different LLM). */
+    clearTestCases(): EvaluationDataset;
+    /** Return the number of goldens. */
+    get length(): number;
+}
 
 /**
  * Fallom Evals - Run LLM evaluations locally using G-Eval with LLM as a Judge.
  *
  * Evaluate production outputs or compare different models on your dataset.
  * Results are uploaded to Fallom dashboard for visualization.
+ *
+ * @example
+ * import fallom from "@fallom/trace";
+ *
+ * // Initialize
+ * fallom.evals.init({ apiKey: "flm_xxx" });
+ *
+ * // Method 1: Direct dataset evaluation
+ * const results = await fallom.evals.evaluate({
+ *   dataset: [...],
+ *   metrics: ["answer_relevancy", "faithfulness"],
+ * });
+ *
+ * // Method 2: Use EvaluationDataset with your own LLM pipeline
+ * const dataset = new fallom.evals.EvaluationDataset();
+ * await dataset.pull("my-dataset-key");
+ *
+ * for (const golden of dataset.goldens) {
+ *   const actualOutput = await myLLMApp(golden.input);
+ *   dataset.addTestCase({
+ *     input: golden.input,
+ *     actualOutput,
+ *   });
+ * }
+ *
+ * const results = await fallom.evals.evaluate({
+ *   testCases: dataset.testCases,
+ *   metrics: ["answer_relevancy", "faithfulness"],
+ * });
  */
 
 declare const evals_AVAILABLE_METRICS: typeof AVAILABLE_METRICS;
@@ -646,6 +779,10 @@ type evals_DatasetInput = DatasetInput;
 type evals_DatasetItem = DatasetItem;
 type evals_EvalResult = EvalResult;
 type evals_EvaluateOptions = EvaluateOptions;
+type evals_EvaluationDataset = EvaluationDataset;
+declare const evals_EvaluationDataset: typeof EvaluationDataset;
+type evals_Golden = Golden;
+type evals_LLMTestCase = LLMTestCase;
 declare const evals_METRIC_PROMPTS: typeof METRIC_PROMPTS;
 type evals_Message = Message;
 type evals_MetricInput = MetricInput;
@@ -664,7 +801,7 @@ declare const evals_evaluate: typeof evaluate;
 declare const evals_getMetricName: typeof getMetricName;
 declare const evals_isCustomMetric: typeof isCustomMetric;
 declare namespace evals {
-  export { evals_AVAILABLE_METRICS as AVAILABLE_METRICS, type evals_CompareModelsOptions as CompareModelsOptions, type evals_CustomMetric as CustomMetric, evals_DEFAULT_JUDGE_MODEL as DEFAULT_JUDGE_MODEL, type evals_DatasetInput as DatasetInput, type evals_DatasetItem as DatasetItem, type evals_EvalResult as EvalResult, type evals_EvaluateOptions as EvaluateOptions, type InitOptions$1 as InitOptions, evals_METRIC_PROMPTS as METRIC_PROMPTS, type evals_Message as Message, type evals_MetricInput as MetricInput, type evals_MetricName as MetricName, type evals_Model as Model, type evals_ModelCallable as ModelCallable, type evals_ModelResponse as ModelResponse, evals_compareModels as compareModels, evals_createCustomModel as createCustomModel, evals_createModelFromCallable as createModelFromCallable, evals_createOpenAIModel as createOpenAIModel, evals_customMetric as customMetric, evals_datasetFromFallom as datasetFromFallom, evals_datasetFromTraces as datasetFromTraces, evals_evaluate as evaluate, evals_getMetricName as getMetricName, init$1 as init, evals_isCustomMetric as isCustomMetric, uploadResultsPublic as uploadResults };
+  export { evals_AVAILABLE_METRICS as AVAILABLE_METRICS, type evals_CompareModelsOptions as CompareModelsOptions, type evals_CustomMetric as CustomMetric, evals_DEFAULT_JUDGE_MODEL as DEFAULT_JUDGE_MODEL, type evals_DatasetInput as DatasetInput, type evals_DatasetItem as DatasetItem, type evals_EvalResult as EvalResult, type evals_EvaluateOptions as EvaluateOptions, evals_EvaluationDataset as EvaluationDataset, type evals_Golden as Golden, type InitOptions$1 as InitOptions, type evals_LLMTestCase as LLMTestCase, evals_METRIC_PROMPTS as METRIC_PROMPTS, type evals_Message as Message, type evals_MetricInput as MetricInput, type evals_MetricName as MetricName, type evals_Model as Model, type evals_ModelCallable as ModelCallable, type evals_ModelResponse as ModelResponse, evals_compareModels as compareModels, evals_createCustomModel as createCustomModel, evals_createModelFromCallable as createModelFromCallable, evals_createOpenAIModel as createOpenAIModel, evals_customMetric as customMetric, evals_datasetFromFallom as datasetFromFallom, evals_datasetFromTraces as datasetFromTraces, evals_evaluate as evaluate, evals_getMetricName as getMetricName, init$1 as init, evals_isCustomMetric as isCustomMetric, uploadResultsPublic as uploadResults };
 }
 
 /**
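index.d.ts mirrors the same additions for the CommonJS type entry point. A second hedged sketch, assuming only the declarations above (the sample policy text is made up): goldens can also be assembled locally with addGoldens() instead of pull(), and the optional context field carries retrieved documents for the RAG-oriented faithfulness metric.

import fallom from "@fallom/trace";

async function main() {
  fallom.evals.init({ apiKey: "flm_xxx" });

  const dataset = new fallom.evals.EvaluationDataset();

  // Local goldens, no Fallom pull; `context` holds the retrieved documents.
  dataset.addGoldens([
    {
      input: "What is the refund window?",
      expectedOutput: "30 days from delivery.",
      context: ["Policy: refunds are accepted within 30 days of delivery."],
    },
  ]);

  // Stand-in for a real LLM call over each golden.
  for (const golden of dataset.goldens) {
    dataset.addTestCase({
      input: golden.input,
      actualOutput: "Refunds are accepted within 30 days of delivery.",
      expectedOutput: golden.expectedOutput,
      context: golden.context,
    });
  }

  const results = await fallom.evals.evaluate({
    testCases: dataset.testCases,
    metrics: ["faithfulness"],
  });
  console.log(results);
}

main();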
package/dist/index.js
CHANGED
@@ -590,9 +590,159 @@ async function datasetFromFallom(datasetKey, version, config) {
   );
   return items;
 }
+var EvaluationDataset;
 var init_helpers = __esm({
   "src/evals/helpers.ts"() {
     "use strict";
+    EvaluationDataset = class {
+      constructor() {
+        this._goldens = [];
+        this._testCases = [];
+        this._datasetKey = null;
+        this._datasetName = null;
+        this._version = null;
+      }
+      /** List of golden records (inputs with optional expected outputs). */
+      get goldens() {
+        return this._goldens;
+      }
+      /** List of test cases (inputs with actual outputs from your LLM). */
+      get testCases() {
+        return this._testCases;
+      }
+      /** The Fallom dataset key if pulled from Fallom. */
+      get datasetKey() {
+        return this._datasetKey;
+      }
+      /**
+       * Pull a dataset from Fallom.
+       *
+       * @param alias - The dataset key/alias in Fallom
+       * @param version - Specific version to pull (default: latest)
+       * @returns Self for chaining
+       */
+      async pull(alias, version) {
+        const { _apiKey: _apiKey2, _baseUrl: _baseUrl2, _initialized: _initialized2 } = await Promise.resolve().then(() => (init_core(), core_exports));
+        if (!_initialized2) {
+          throw new Error("Fallom evals not initialized. Call evals.init() first.");
+        }
+        const params = new URLSearchParams({ include_entries: "true" });
+        if (version !== void 0) {
+          params.set("version", String(version));
+        }
+        const url = `${_baseUrl2}/api/datasets/${encodeURIComponent(alias)}?${params}`;
+        const response = await fetch(url, {
+          headers: {
+            Authorization: `Bearer ${_apiKey2}`,
+            "Content-Type": "application/json"
+          }
+        });
+        if (response.status === 404) {
+          throw new Error(`Dataset '${alias}' not found`);
+        } else if (response.status === 403) {
+          throw new Error(`Access denied to dataset '${alias}'`);
+        }
+        if (!response.ok) {
+          throw new Error(`Failed to fetch dataset: ${response.statusText}`);
+        }
+        const data = await response.json();
+        this._datasetKey = alias;
+        this._datasetName = data.dataset?.name || alias;
+        this._version = data.version?.version || null;
+        this._goldens = [];
+        for (const entry of data.entries || []) {
+          this._goldens.push({
+            input: entry.input || "",
+            expectedOutput: entry.output,
+            systemMessage: entry.systemMessage,
+            metadata: entry.metadata
+          });
+        }
+        console.log(
+          `\u2713 Pulled dataset '${this._datasetName}' (version ${this._version}) with ${this._goldens.length} goldens`
+        );
+        return this;
+      }
+      /**
+       * Add a golden record manually.
+       * @param golden - A Golden object
+       * @returns Self for chaining
+       */
+      addGolden(golden) {
+        this._goldens.push(golden);
+        return this;
+      }
+      /**
+       * Add multiple golden records.
+       * @param goldens - Array of Golden objects
+       * @returns Self for chaining
+       */
+      addGoldens(goldens) {
+        this._goldens.push(...goldens);
+        return this;
+      }
+      /**
+       * Add a test case with actual LLM output.
+       * @param testCase - An LLMTestCase object
+       * @returns Self for chaining
+       */
+      addTestCase(testCase) {
+        this._testCases.push(testCase);
+        return this;
+      }
+      /**
+       * Add multiple test cases.
+       * @param testCases - Array of LLMTestCase objects
+       * @returns Self for chaining
+       */
+      addTestCases(testCases) {
+        this._testCases.push(...testCases);
+        return this;
+      }
+      /**
+       * Automatically generate test cases by running all goldens through your LLM app.
+       *
+       * @param llmApp - A callable that takes messages and returns response
+       * @param options - Configuration options
+       * @returns Self for chaining
+       */
+      async generateTestCases(llmApp, options = {}) {
+        const { includeContext = false } = options;
+        console.log(`Generating test cases for ${this._goldens.length} goldens...`);
+        for (let i = 0; i < this._goldens.length; i++) {
+          const golden = this._goldens[i];
+          const messages = [];
+          if (golden.systemMessage) {
+            messages.push({ role: "system", content: golden.systemMessage });
+          }
+          messages.push({ role: "user", content: golden.input });
+          const response = await llmApp(messages);
+          const testCase = {
+            input: golden.input,
+            actualOutput: response.content,
+            expectedOutput: golden.expectedOutput,
+            systemMessage: golden.systemMessage,
+            context: includeContext ? response.context : golden.context,
+            metadata: golden.metadata
+          };
+          this._testCases.push(testCase);
+          console.log(
+            ` [${i + 1}/${this._goldens.length}] Generated output for: ${golden.input.slice(0, 50)}...`
+          );
+        }
+        console.log(`\u2713 Generated ${this._testCases.length} test cases`);
+        return this;
+      }
+      /** Clear all test cases (useful for re-running with different LLM). */
+      clearTestCases() {
+        this._testCases = [];
+        return this;
+      }
+      /** Return the number of goldens. */
+      get length() {
+        return this._goldens.length;
+      }
+    };
   }
 });
 
@@ -707,9 +857,22 @@ async function evaluate(options) {
     name,
     description,
     verbose = true,
+    testCases,
     _skipUpload = false
   } = options;
-
+  let dataset;
+  if (testCases !== void 0 && testCases.length > 0) {
+    dataset = testCases.map((tc) => ({
+      input: tc.input,
+      output: tc.actualOutput,
+      systemMessage: tc.systemMessage,
+      metadata: tc.metadata
+    }));
+  } else if (datasetInput !== void 0) {
+    dataset = await resolveDataset(datasetInput);
+  } else {
+    throw new Error("Either 'dataset' or 'testCases' must be provided");
+  }
   for (const m of metrics) {
     if (typeof m === "string" && !AVAILABLE_METRICS.includes(m)) {
       throw new Error(
@@ -775,6 +938,9 @@ async function compareModels(options) {
     description,
     verbose = true
   } = options;
+  if (!datasetInput) {
+    throw new Error("'dataset' is required for compareModels()");
+  }
   const dataset = await resolveDataset(datasetInput);
   const results = {};
   if (includeProduction) {
@@ -1035,7 +1201,7 @@ var import_exporter_trace_otlp_http = require("@opentelemetry/exporter-trace-otlp-http");
 // node_modules/@opentelemetry/resources/build/esm/Resource.js
 var import_api = require("@opentelemetry/api");
 
-// node_modules/@opentelemetry/
+// node_modules/@opentelemetry/semantic-conventions/build/esm/resource/SemanticResourceAttributes.js
 var SemanticResourceAttributes = {
   /**
    * Name of the cloud provider.
@@ -3543,6 +3709,7 @@ var evals_exports = {};
 __export(evals_exports, {
   AVAILABLE_METRICS: () => AVAILABLE_METRICS,
   DEFAULT_JUDGE_MODEL: () => DEFAULT_JUDGE_MODEL,
+  EvaluationDataset: () => EvaluationDataset,
   METRIC_PROMPTS: () => METRIC_PROMPTS,
   compareModels: () => compareModels,
   createCustomModel: () => createCustomModel,
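At runtime, the change above maps each LLMTestCase onto the existing dataset item shape (actualOutput becomes output) and throws if neither dataset nor testCases is supplied, while compareModels() now rejects a missing dataset explicitly. A minimal hedged sketch of the two evaluate() call shapes, with placeholder inputs and the dataset-key form taken from the doc comment ("list of DatasetItem or Fallom dataset key"):

import fallom from "@fallom/trace";

async function main() {
  fallom.evals.init({ apiKey: "flm_xxx" });

  // Shape 1: pre-generated outputs passed as test cases
  // (mapped internally to dataset items, actualOutput -> output).
  await fallom.evals.evaluate({
    testCases: [
      { input: "Summarize our SLA.", actualOutput: "99.9% uptime with 24h support response." },
    ],
    metrics: ["answer_relevancy"],
  });

  // Shape 2: a Fallom dataset key, resolved through resolveDataset() internally.
  await fallom.evals.evaluate({
    dataset: "my-dataset-key",
    metrics: ["answer_relevancy"],
  });

  // Passing neither option now rejects with
  // "Either 'dataset' or 'testCases' must be provided".
}

main();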
package/dist/index.mjs
CHANGED
@@ -5,6 +5,7 @@ import {
 import {
   AVAILABLE_METRICS,
   DEFAULT_JUDGE_MODEL,
+  EvaluationDataset,
   METRIC_PROMPTS,
   compareModels,
   createCustomModel,
@@ -18,7 +19,7 @@ import {
   init as init2,
   isCustomMetric,
   uploadResultsPublic
-} from "./chunk-
+} from "./chunk-3HBKT4HK.mjs";
 import {
   __export
 } from "./chunk-7P6ASYW6.mjs";
@@ -40,7 +41,7 @@ import { OTLPTraceExporter } from "@opentelemetry/exporter-trace-otlp-http";
 // node_modules/@opentelemetry/resources/build/esm/Resource.js
 import { diag } from "@opentelemetry/api";
 
-// node_modules/@opentelemetry/
+// node_modules/@opentelemetry/semantic-conventions/build/esm/resource/SemanticResourceAttributes.js
 var SemanticResourceAttributes = {
   /**
    * Name of the cloud provider.
@@ -2545,6 +2546,7 @@ var evals_exports = {};
 __export(evals_exports, {
   AVAILABLE_METRICS: () => AVAILABLE_METRICS,
   DEFAULT_JUDGE_MODEL: () => DEFAULT_JUDGE_MODEL,
+  EvaluationDataset: () => EvaluationDataset,
   METRIC_PROMPTS: () => METRIC_PROMPTS,
   compareModels: () => compareModels,
   createCustomModel: () => createCustomModel,
package/package.json
CHANGED