@fallom/trace 0.2.14 → 0.2.16
- package/dist/chunk-3HBKT4HK.mjs +827 -0
- package/dist/chunk-XBZ3ESNV.mjs +824 -0
- package/dist/core-4L56QWI7.mjs +21 -0
- package/dist/core-JLHYFVYS.mjs +21 -0
- package/dist/index.d.mts +140 -3
- package/dist/index.d.ts +140 -3
- package/dist/index.js +180 -14
- package/dist/index.mjs +15 -14
- package/package.json +1 -1
- package/dist/chunk-KFD5AQ7V.mjs +0 -308
- package/dist/models-SEFDGZU2.mjs +0 -8

package/dist/core-4L56QWI7.mjs
ADDED
@@ -0,0 +1,21 @@
+import {
+  DEFAULT_JUDGE_MODEL,
+  _apiKey,
+  _baseUrl,
+  _initialized,
+  compareModels,
+  evaluate,
+  init,
+  uploadResultsPublic
+} from "./chunk-3HBKT4HK.mjs";
+import "./chunk-7P6ASYW6.mjs";
+export {
+  DEFAULT_JUDGE_MODEL,
+  _apiKey,
+  _baseUrl,
+  _initialized,
+  compareModels,
+  evaluate,
+  init,
+  uploadResultsPublic
+};

package/dist/core-JLHYFVYS.mjs
ADDED
@@ -0,0 +1,21 @@
+import {
+  DEFAULT_JUDGE_MODEL,
+  _apiKey,
+  _baseUrl,
+  _initialized,
+  compareModels,
+  evaluate,
+  init,
+  uploadResultsPublic
+} from "./chunk-XBZ3ESNV.mjs";
+import "./chunk-7P6ASYW6.mjs";
+export {
+  DEFAULT_JUDGE_MODEL,
+  _apiKey,
+  _baseUrl,
+  _initialized,
+  compareModels,
+  evaluate,
+  init,
+  uploadResultsPublic
+};
package/dist/index.d.mts
CHANGED
@@ -458,6 +458,37 @@ interface EvalResult {
     tokensOut?: number;
     cost?: number;
 }
+/**
+ * A golden record from a dataset - contains input and optionally expected output.
+ * This represents a "golden" test case that you can use to generate actual outputs
+ * from your LLM pipeline and then evaluate.
+ */
+interface Golden {
+    input: string;
+    expectedOutput?: string;
+    systemMessage?: string;
+    /** Retrieved documents for RAG evaluation */
+    context?: string[];
+    metadata?: Record<string, unknown>;
+}
+/**
+ * A test case for evaluation - contains input and actual output from your LLM.
+ *
+ */
+interface LLMTestCase {
+    /** The user input/query */
+    input: string;
+    /** The output generated by your LLM pipeline */
+    actualOutput: string;
+    /** (Optional) The expected/golden output for comparison */
+    expectedOutput?: string;
+    /** (Optional) System prompt used */
+    systemMessage?: string;
+    /** (Optional) Retrieved documents for RAG faithfulness evaluation */
+    context?: string[];
+    /** (Optional) Additional metadata */
+    metadata?: Record<string, unknown>;
+}
 /** Response format from model calls */
 interface ModelResponse {
     content: string;
@@ -487,13 +518,16 @@ interface InitOptions$1 {
 }
 /** Options for evaluate() */
 interface EvaluateOptions {
-    dataset
+    /** Dataset to evaluate (list of DatasetItem or Fallom dataset key) */
+    dataset?: DatasetInput;
     /** List of metrics to run (built-in or custom). Default: all built-in metrics */
     metrics?: MetricInput[];
     judgeModel?: string;
     name?: string;
     description?: string;
     verbose?: boolean;
+    /** Alternative to dataset - provide test cases from EvaluationDataset */
+    testCases?: LLMTestCase[];
     _skipUpload?: boolean;
 }
 /** Options for compareModels() */
@@ -532,9 +566,10 @@ declare const DEFAULT_JUDGE_MODEL = "openai/gpt-4o-mini";
  */
 declare function init$1(options?: InitOptions$1): void;
 /**
- * Evaluate
+ * Evaluate outputs against specified metrics using G-Eval.
  *
  * Results are automatically uploaded to Fallom dashboard.
+ *
  */
 declare function evaluate(options: EvaluateOptions): Promise<EvalResult[]>;
 /**
@@ -630,12 +665,110 @@ declare function datasetFromFallom(datasetKey: string, version?: number, config?
     _baseUrl?: string;
     _initialized?: boolean;
 }): Promise<DatasetItem[]>;
+/**
+ * A dataset for evaluation that supports pulling from Fallom and adding test cases.
+ *
+ * This provides a workflow where you:
+ * 1. Pull a dataset (goldens) from Fallom
+ * 2. Run your own LLM pipeline on each golden to generate outputs
+ * 3. Add the results as test cases
+ * 4. Evaluate the test cases
+ *
+ */
+declare class EvaluationDataset {
+    private _goldens;
+    private _testCases;
+    private _datasetKey;
+    private _datasetName;
+    private _version;
+    /** List of golden records (inputs with optional expected outputs). */
+    get goldens(): Golden[];
+    /** List of test cases (inputs with actual outputs from your LLM). */
+    get testCases(): LLMTestCase[];
+    /** The Fallom dataset key if pulled from Fallom. */
+    get datasetKey(): string | null;
+    /**
+     * Pull a dataset from Fallom.
+     *
+     * @param alias - The dataset key/alias in Fallom
+     * @param version - Specific version to pull (default: latest)
+     * @returns Self for chaining
+     */
+    pull(alias: string, version?: number): Promise<EvaluationDataset>;
+    /**
+     * Add a golden record manually.
+     * @param golden - A Golden object
+     * @returns Self for chaining
+     */
+    addGolden(golden: Golden): EvaluationDataset;
+    /**
+     * Add multiple golden records.
+     * @param goldens - Array of Golden objects
+     * @returns Self for chaining
+     */
+    addGoldens(goldens: Golden[]): EvaluationDataset;
+    /**
+     * Add a test case with actual LLM output.
+     * @param testCase - An LLMTestCase object
+     * @returns Self for chaining
+     */
+    addTestCase(testCase: LLMTestCase): EvaluationDataset;
+    /**
+     * Add multiple test cases.
+     * @param testCases - Array of LLMTestCase objects
+     * @returns Self for chaining
+     */
+    addTestCases(testCases: LLMTestCase[]): EvaluationDataset;
+    /**
+     * Automatically generate test cases by running all goldens through your LLM app.
+     *
+     * @param llmApp - A callable that takes messages and returns response
+     * @param options - Configuration options
+     * @returns Self for chaining
+     */
+    generateTestCases(llmApp: (messages: Message[]) => Promise<ModelResponse>, options?: {
+        includeContext?: boolean;
+    }): Promise<EvaluationDataset>;
+    /** Clear all test cases (useful for re-running with different LLM). */
+    clearTestCases(): EvaluationDataset;
+    /** Return the number of goldens. */
+    get length(): number;
+}
 
 /**
  * Fallom Evals - Run LLM evaluations locally using G-Eval with LLM as a Judge.
  *
  * Evaluate production outputs or compare different models on your dataset.
  * Results are uploaded to Fallom dashboard for visualization.
+ *
+ * @example
+ * import fallom from "@fallom/trace";
+ *
+ * // Initialize
+ * fallom.evals.init({ apiKey: "flm_xxx" });
+ *
+ * // Method 1: Direct dataset evaluation
+ * const results = await fallom.evals.evaluate({
+ *   dataset: [...],
+ *   metrics: ["answer_relevancy", "faithfulness"],
+ * });
+ *
+ * // Method 2: Use EvaluationDataset with your own LLM pipeline
+ * const dataset = new fallom.evals.EvaluationDataset();
+ * await dataset.pull("my-dataset-key");
+ *
+ * for (const golden of dataset.goldens) {
+ *   const actualOutput = await myLLMApp(golden.input);
+ *   dataset.addTestCase({
+ *     input: golden.input,
+ *     actualOutput,
+ *   });
+ * }
+ *
+ * const results = await fallom.evals.evaluate({
+ *   testCases: dataset.testCases,
+ *   metrics: ["answer_relevancy", "faithfulness"],
+ * });
  */
 
 declare const evals_AVAILABLE_METRICS: typeof AVAILABLE_METRICS;
@@ -646,6 +779,10 @@ type evals_DatasetInput = DatasetInput;
 type evals_DatasetItem = DatasetItem;
 type evals_EvalResult = EvalResult;
 type evals_EvaluateOptions = EvaluateOptions;
+type evals_EvaluationDataset = EvaluationDataset;
+declare const evals_EvaluationDataset: typeof EvaluationDataset;
+type evals_Golden = Golden;
+type evals_LLMTestCase = LLMTestCase;
 declare const evals_METRIC_PROMPTS: typeof METRIC_PROMPTS;
 type evals_Message = Message;
 type evals_MetricInput = MetricInput;
@@ -664,7 +801,7 @@ declare const evals_evaluate: typeof evaluate;
 declare const evals_getMetricName: typeof getMetricName;
 declare const evals_isCustomMetric: typeof isCustomMetric;
 declare namespace evals {
-    export { evals_AVAILABLE_METRICS as AVAILABLE_METRICS, type evals_CompareModelsOptions as CompareModelsOptions, type evals_CustomMetric as CustomMetric, evals_DEFAULT_JUDGE_MODEL as DEFAULT_JUDGE_MODEL, type evals_DatasetInput as DatasetInput, type evals_DatasetItem as DatasetItem, type evals_EvalResult as EvalResult, type evals_EvaluateOptions as EvaluateOptions, type InitOptions$1 as InitOptions, evals_METRIC_PROMPTS as METRIC_PROMPTS, type evals_Message as Message, type evals_MetricInput as MetricInput, type evals_MetricName as MetricName, type evals_Model as Model, type evals_ModelCallable as ModelCallable, type evals_ModelResponse as ModelResponse, evals_compareModels as compareModels, evals_createCustomModel as createCustomModel, evals_createModelFromCallable as createModelFromCallable, evals_createOpenAIModel as createOpenAIModel, evals_customMetric as customMetric, evals_datasetFromFallom as datasetFromFallom, evals_datasetFromTraces as datasetFromTraces, evals_evaluate as evaluate, evals_getMetricName as getMetricName, init$1 as init, evals_isCustomMetric as isCustomMetric, uploadResultsPublic as uploadResults };
+    export { evals_AVAILABLE_METRICS as AVAILABLE_METRICS, type evals_CompareModelsOptions as CompareModelsOptions, type evals_CustomMetric as CustomMetric, evals_DEFAULT_JUDGE_MODEL as DEFAULT_JUDGE_MODEL, type evals_DatasetInput as DatasetInput, type evals_DatasetItem as DatasetItem, type evals_EvalResult as EvalResult, type evals_EvaluateOptions as EvaluateOptions, evals_EvaluationDataset as EvaluationDataset, type evals_Golden as Golden, type InitOptions$1 as InitOptions, type evals_LLMTestCase as LLMTestCase, evals_METRIC_PROMPTS as METRIC_PROMPTS, type evals_Message as Message, type evals_MetricInput as MetricInput, type evals_MetricName as MetricName, type evals_Model as Model, type evals_ModelCallable as ModelCallable, type evals_ModelResponse as ModelResponse, evals_compareModels as compareModels, evals_createCustomModel as createCustomModel, evals_createModelFromCallable as createModelFromCallable, evals_createOpenAIModel as createOpenAIModel, evals_customMetric as customMetric, evals_datasetFromFallom as datasetFromFallom, evals_datasetFromTraces as datasetFromTraces, evals_evaluate as evaluate, evals_getMetricName as getMetricName, init$1 as init, evals_isCustomMetric as isCustomMetric, uploadResultsPublic as uploadResults };
 }
 
 /**
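
Taken together, the new index.d.mts declarations describe a pull → generate → evaluate workflow: pull goldens from a Fallom dataset, run your own pipeline to produce actual outputs, and score the resulting test cases. The TypeScript sketch below is a minimal illustration against these declarations only; "flm_xxx" and "my-dataset-key" are placeholders, and myLLMApp is a hypothetical stand-in for your own LLM pipeline.

import fallom from "@fallom/trace";

// Hypothetical stand-in for your own LLM pipeline (not part of the SDK).
declare function myLLMApp(input: string): Promise<string>;

async function runEval() {
  // Initialize evals with a Fallom API key ("flm_xxx" is a placeholder).
  fallom.evals.init({ apiKey: "flm_xxx" });

  // Pull goldens from a Fallom dataset ("my-dataset-key" is a placeholder alias).
  const dataset = new fallom.evals.EvaluationDataset();
  await dataset.pull("my-dataset-key");

  // Run each golden through the pipeline and record the actual output.
  for (const golden of dataset.goldens) {
    const actualOutput = await myLLMApp(golden.input);
    dataset.addTestCase({
      input: golden.input,
      actualOutput,
      expectedOutput: golden.expectedOutput,
      context: golden.context,
    });
  }

  // Score the collected test cases; results are uploaded to the Fallom dashboard.
  const results = await fallom.evals.evaluate({
    testCases: dataset.testCases,
    metrics: ["answer_relevancy", "faithfulness"],
    name: "nightly-regression", // optional run name
  });

  return results;
}

Since addTestCase returns the dataset ("self for chaining"), the loop body could also be written as one chained call, and clearTestCases() lets you reuse the same goldens with a different pipeline.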
package/dist/index.d.ts
CHANGED
The index.d.ts hunks are identical to the index.d.mts hunks shown above; the ESM and CommonJS type declaration files carry the same declarations.