@fallom/trace 0.2.15 → 0.2.17

@@ -0,0 +1,21 @@
+import {
+  DEFAULT_JUDGE_MODEL,
+  _apiKey,
+  _baseUrl,
+  _initialized,
+  compareModels,
+  evaluate,
+  init,
+  uploadResultsPublic
+} from "./chunk-3HBKT4HK.mjs";
+import "./chunk-7P6ASYW6.mjs";
+export {
+  DEFAULT_JUDGE_MODEL,
+  _apiKey,
+  _baseUrl,
+  _initialized,
+  compareModels,
+  evaluate,
+  init,
+  uploadResultsPublic
+};
package/dist/index.d.mts CHANGED
@@ -458,6 +458,37 @@ interface EvalResult {
     tokensOut?: number;
     cost?: number;
 }
+/**
+ * A golden record from a dataset - contains input and optionally expected output.
+ * This represents a "golden" test case that you can use to generate actual outputs
+ * from your LLM pipeline and then evaluate.
+ */
+interface Golden {
+    input: string;
+    expectedOutput?: string;
+    systemMessage?: string;
+    /** Retrieved documents for RAG evaluation */
+    context?: string[];
+    metadata?: Record<string, unknown>;
+}
+/**
+ * A test case for evaluation - contains input and actual output from your LLM.
+ *
+ */
+interface LLMTestCase {
+    /** The user input/query */
+    input: string;
+    /** The output generated by your LLM pipeline */
+    actualOutput: string;
+    /** (Optional) The expected/golden output for comparison */
+    expectedOutput?: string;
+    /** (Optional) System prompt used */
+    systemMessage?: string;
+    /** (Optional) Retrieved documents for RAG faithfulness evaluation */
+    context?: string[];
+    /** (Optional) Additional metadata */
+    metadata?: Record<string, unknown>;
+}
 /** Response format from model calls */
 interface ModelResponse {
     content: string;
@@ -487,13 +518,16 @@ interface InitOptions$1 {
 }
 /** Options for evaluate() */
 interface EvaluateOptions {
-    dataset: DatasetInput;
+    /** Dataset to evaluate (list of DatasetItem or Fallom dataset key) */
+    dataset?: DatasetInput;
     /** List of metrics to run (built-in or custom). Default: all built-in metrics */
     metrics?: MetricInput[];
     judgeModel?: string;
     name?: string;
     description?: string;
     verbose?: boolean;
+    /** Alternative to dataset - provide test cases from EvaluationDataset */
+    testCases?: LLMTestCase[];
     _skipUpload?: boolean;
 }
 /** Options for compareModels() */
@@ -532,9 +566,10 @@ declare const DEFAULT_JUDGE_MODEL = "openai/gpt-4o-mini";
  */
 declare function init$1(options?: InitOptions$1): void;
 /**
- * Evaluate production outputs against specified metrics using G-Eval.
+ * Evaluate outputs against specified metrics using G-Eval.
  *
  * Results are automatically uploaded to Fallom dashboard.
+ *
  */
 declare function evaluate(options: EvaluateOptions): Promise<EvalResult[]>;
 /**
@@ -630,12 +665,110 @@ declare function datasetFromFallom(datasetKey: string, version?: number, config?
     _baseUrl?: string;
     _initialized?: boolean;
 }): Promise<DatasetItem[]>;
+/**
+ * A dataset for evaluation that supports pulling from Fallom and adding test cases.
+ *
+ * This provides a workflow where you:
+ * 1. Pull a dataset (goldens) from Fallom
+ * 2. Run your own LLM pipeline on each golden to generate outputs
+ * 3. Add the results as test cases
+ * 4. Evaluate the test cases
+ *
+ */
+declare class EvaluationDataset {
+    private _goldens;
+    private _testCases;
+    private _datasetKey;
+    private _datasetName;
+    private _version;
+    /** List of golden records (inputs with optional expected outputs). */
+    get goldens(): Golden[];
+    /** List of test cases (inputs with actual outputs from your LLM). */
+    get testCases(): LLMTestCase[];
+    /** The Fallom dataset key if pulled from Fallom. */
+    get datasetKey(): string | null;
+    /**
+     * Pull a dataset from Fallom.
+     *
+     * @param alias - The dataset key/alias in Fallom
+     * @param version - Specific version to pull (default: latest)
+     * @returns Self for chaining
+     */
+    pull(alias: string, version?: number): Promise<EvaluationDataset>;
+    /**
+     * Add a golden record manually.
+     * @param golden - A Golden object
+     * @returns Self for chaining
+     */
+    addGolden(golden: Golden): EvaluationDataset;
+    /**
+     * Add multiple golden records.
+     * @param goldens - Array of Golden objects
+     * @returns Self for chaining
+     */
+    addGoldens(goldens: Golden[]): EvaluationDataset;
+    /**
+     * Add a test case with actual LLM output.
+     * @param testCase - An LLMTestCase object
+     * @returns Self for chaining
+     */
+    addTestCase(testCase: LLMTestCase): EvaluationDataset;
+    /**
+     * Add multiple test cases.
+     * @param testCases - Array of LLMTestCase objects
+     * @returns Self for chaining
+     */
+    addTestCases(testCases: LLMTestCase[]): EvaluationDataset;
+    /**
+     * Automatically generate test cases by running all goldens through your LLM app.
+     *
+     * @param llmApp - A callable that takes messages and returns response
+     * @param options - Configuration options
+     * @returns Self for chaining
+     */
+    generateTestCases(llmApp: (messages: Message[]) => Promise<ModelResponse>, options?: {
+        includeContext?: boolean;
+    }): Promise<EvaluationDataset>;
+    /** Clear all test cases (useful for re-running with different LLM). */
+    clearTestCases(): EvaluationDataset;
+    /** Return the number of goldens. */
+    get length(): number;
+}
 
 /**
  * Fallom Evals - Run LLM evaluations locally using G-Eval with LLM as a Judge.
  *
  * Evaluate production outputs or compare different models on your dataset.
  * Results are uploaded to Fallom dashboard for visualization.
+ *
+ * @example
+ * import fallom from "@fallom/trace";
+ *
+ * // Initialize
+ * fallom.evals.init({ apiKey: "flm_xxx" });
+ *
+ * // Method 1: Direct dataset evaluation
+ * const results = await fallom.evals.evaluate({
+ *   dataset: [...],
+ *   metrics: ["answer_relevancy", "faithfulness"],
+ * });
+ *
+ * // Method 2: Use EvaluationDataset with your own LLM pipeline
+ * const dataset = new fallom.evals.EvaluationDataset();
+ * await dataset.pull("my-dataset-key");
+ *
+ * for (const golden of dataset.goldens) {
+ *   const actualOutput = await myLLMApp(golden.input);
+ *   dataset.addTestCase({
+ *     input: golden.input,
+ *     actualOutput,
+ *   });
+ * }
+ *
+ * const results = await fallom.evals.evaluate({
+ *   testCases: dataset.testCases,
+ *   metrics: ["answer_relevancy", "faithfulness"],
+ * });
  */
 
 declare const evals_AVAILABLE_METRICS: typeof AVAILABLE_METRICS;
@@ -646,6 +779,10 @@ type evals_DatasetInput = DatasetInput;
 type evals_DatasetItem = DatasetItem;
 type evals_EvalResult = EvalResult;
 type evals_EvaluateOptions = EvaluateOptions;
+type evals_EvaluationDataset = EvaluationDataset;
+declare const evals_EvaluationDataset: typeof EvaluationDataset;
+type evals_Golden = Golden;
+type evals_LLMTestCase = LLMTestCase;
 declare const evals_METRIC_PROMPTS: typeof METRIC_PROMPTS;
 type evals_Message = Message;
 type evals_MetricInput = MetricInput;
@@ -664,7 +801,7 @@ declare const evals_evaluate: typeof evaluate;
 declare const evals_getMetricName: typeof getMetricName;
 declare const evals_isCustomMetric: typeof isCustomMetric;
 declare namespace evals {
-    export { evals_AVAILABLE_METRICS as AVAILABLE_METRICS, type evals_CompareModelsOptions as CompareModelsOptions, type evals_CustomMetric as CustomMetric, evals_DEFAULT_JUDGE_MODEL as DEFAULT_JUDGE_MODEL, type evals_DatasetInput as DatasetInput, type evals_DatasetItem as DatasetItem, type evals_EvalResult as EvalResult, type evals_EvaluateOptions as EvaluateOptions, type InitOptions$1 as InitOptions, evals_METRIC_PROMPTS as METRIC_PROMPTS, type evals_Message as Message, type evals_MetricInput as MetricInput, type evals_MetricName as MetricName, type evals_Model as Model, type evals_ModelCallable as ModelCallable, type evals_ModelResponse as ModelResponse, evals_compareModels as compareModels, evals_createCustomModel as createCustomModel, evals_createModelFromCallable as createModelFromCallable, evals_createOpenAIModel as createOpenAIModel, evals_customMetric as customMetric, evals_datasetFromFallom as datasetFromFallom, evals_datasetFromTraces as datasetFromTraces, evals_evaluate as evaluate, evals_getMetricName as getMetricName, init$1 as init, evals_isCustomMetric as isCustomMetric, uploadResultsPublic as uploadResults };
+    export { evals_AVAILABLE_METRICS as AVAILABLE_METRICS, type evals_CompareModelsOptions as CompareModelsOptions, type evals_CustomMetric as CustomMetric, evals_DEFAULT_JUDGE_MODEL as DEFAULT_JUDGE_MODEL, type evals_DatasetInput as DatasetInput, type evals_DatasetItem as DatasetItem, type evals_EvalResult as EvalResult, type evals_EvaluateOptions as EvaluateOptions, evals_EvaluationDataset as EvaluationDataset, type evals_Golden as Golden, type InitOptions$1 as InitOptions, type evals_LLMTestCase as LLMTestCase, evals_METRIC_PROMPTS as METRIC_PROMPTS, type evals_Message as Message, type evals_MetricInput as MetricInput, type evals_MetricName as MetricName, type evals_Model as Model, type evals_ModelCallable as ModelCallable, type evals_ModelResponse as ModelResponse, evals_compareModels as compareModels, evals_createCustomModel as createCustomModel, evals_createModelFromCallable as createModelFromCallable, evals_createOpenAIModel as createOpenAIModel, evals_customMetric as customMetric, evals_datasetFromFallom as datasetFromFallom, evals_datasetFromTraces as datasetFromTraces, evals_evaluate as evaluate, evals_getMetricName as getMetricName, init$1 as init, evals_isCustomMetric as isCustomMetric, uploadResultsPublic as uploadResults };
 }
 
 /**
package/dist/index.d.ts CHANGED
@@ -458,6 +458,37 @@ interface EvalResult {
     tokensOut?: number;
     cost?: number;
 }
+/**
+ * A golden record from a dataset - contains input and optionally expected output.
+ * This represents a "golden" test case that you can use to generate actual outputs
+ * from your LLM pipeline and then evaluate.
+ */
+interface Golden {
+    input: string;
+    expectedOutput?: string;
+    systemMessage?: string;
+    /** Retrieved documents for RAG evaluation */
+    context?: string[];
+    metadata?: Record<string, unknown>;
+}
+/**
+ * A test case for evaluation - contains input and actual output from your LLM.
+ *
+ */
+interface LLMTestCase {
+    /** The user input/query */
+    input: string;
+    /** The output generated by your LLM pipeline */
+    actualOutput: string;
+    /** (Optional) The expected/golden output for comparison */
+    expectedOutput?: string;
+    /** (Optional) System prompt used */
+    systemMessage?: string;
+    /** (Optional) Retrieved documents for RAG faithfulness evaluation */
+    context?: string[];
+    /** (Optional) Additional metadata */
+    metadata?: Record<string, unknown>;
+}
 /** Response format from model calls */
 interface ModelResponse {
     content: string;
@@ -487,13 +518,16 @@ interface InitOptions$1 {
 }
 /** Options for evaluate() */
 interface EvaluateOptions {
-    dataset: DatasetInput;
+    /** Dataset to evaluate (list of DatasetItem or Fallom dataset key) */
+    dataset?: DatasetInput;
     /** List of metrics to run (built-in or custom). Default: all built-in metrics */
     metrics?: MetricInput[];
     judgeModel?: string;
     name?: string;
     description?: string;
     verbose?: boolean;
+    /** Alternative to dataset - provide test cases from EvaluationDataset */
+    testCases?: LLMTestCase[];
     _skipUpload?: boolean;
 }
 /** Options for compareModels() */
@@ -532,9 +566,10 @@ declare const DEFAULT_JUDGE_MODEL = "openai/gpt-4o-mini";
  */
 declare function init$1(options?: InitOptions$1): void;
 /**
- * Evaluate production outputs against specified metrics using G-Eval.
+ * Evaluate outputs against specified metrics using G-Eval.
  *
  * Results are automatically uploaded to Fallom dashboard.
+ *
  */
 declare function evaluate(options: EvaluateOptions): Promise<EvalResult[]>;
 /**
@@ -630,12 +665,110 @@ declare function datasetFromFallom(datasetKey: string, version?: number, config?
     _baseUrl?: string;
     _initialized?: boolean;
 }): Promise<DatasetItem[]>;
+/**
+ * A dataset for evaluation that supports pulling from Fallom and adding test cases.
+ *
+ * This provides a workflow where you:
+ * 1. Pull a dataset (goldens) from Fallom
+ * 2. Run your own LLM pipeline on each golden to generate outputs
+ * 3. Add the results as test cases
+ * 4. Evaluate the test cases
+ *
+ */
+declare class EvaluationDataset {
+    private _goldens;
+    private _testCases;
+    private _datasetKey;
+    private _datasetName;
+    private _version;
+    /** List of golden records (inputs with optional expected outputs). */
+    get goldens(): Golden[];
+    /** List of test cases (inputs with actual outputs from your LLM). */
+    get testCases(): LLMTestCase[];
+    /** The Fallom dataset key if pulled from Fallom. */
+    get datasetKey(): string | null;
+    /**
+     * Pull a dataset from Fallom.
+     *
+     * @param alias - The dataset key/alias in Fallom
+     * @param version - Specific version to pull (default: latest)
+     * @returns Self for chaining
+     */
+    pull(alias: string, version?: number): Promise<EvaluationDataset>;
+    /**
+     * Add a golden record manually.
+     * @param golden - A Golden object
+     * @returns Self for chaining
+     */
+    addGolden(golden: Golden): EvaluationDataset;
+    /**
+     * Add multiple golden records.
+     * @param goldens - Array of Golden objects
+     * @returns Self for chaining
+     */
+    addGoldens(goldens: Golden[]): EvaluationDataset;
+    /**
+     * Add a test case with actual LLM output.
+     * @param testCase - An LLMTestCase object
+     * @returns Self for chaining
+     */
+    addTestCase(testCase: LLMTestCase): EvaluationDataset;
+    /**
+     * Add multiple test cases.
+     * @param testCases - Array of LLMTestCase objects
+     * @returns Self for chaining
+     */
+    addTestCases(testCases: LLMTestCase[]): EvaluationDataset;
+    /**
+     * Automatically generate test cases by running all goldens through your LLM app.
+     *
+     * @param llmApp - A callable that takes messages and returns response
+     * @param options - Configuration options
+     * @returns Self for chaining
+     */
+    generateTestCases(llmApp: (messages: Message[]) => Promise<ModelResponse>, options?: {
+        includeContext?: boolean;
+    }): Promise<EvaluationDataset>;
+    /** Clear all test cases (useful for re-running with different LLM). */
+    clearTestCases(): EvaluationDataset;
+    /** Return the number of goldens. */
+    get length(): number;
+}
 
 /**
  * Fallom Evals - Run LLM evaluations locally using G-Eval with LLM as a Judge.
  *
  * Evaluate production outputs or compare different models on your dataset.
  * Results are uploaded to Fallom dashboard for visualization.
+ *
+ * @example
+ * import fallom from "@fallom/trace";
+ *
+ * // Initialize
+ * fallom.evals.init({ apiKey: "flm_xxx" });
+ *
+ * // Method 1: Direct dataset evaluation
+ * const results = await fallom.evals.evaluate({
+ *   dataset: [...],
+ *   metrics: ["answer_relevancy", "faithfulness"],
+ * });
+ *
+ * // Method 2: Use EvaluationDataset with your own LLM pipeline
+ * const dataset = new fallom.evals.EvaluationDataset();
+ * await dataset.pull("my-dataset-key");
+ *
+ * for (const golden of dataset.goldens) {
+ *   const actualOutput = await myLLMApp(golden.input);
+ *   dataset.addTestCase({
+ *     input: golden.input,
+ *     actualOutput,
+ *   });
+ * }
+ *
+ * const results = await fallom.evals.evaluate({
+ *   testCases: dataset.testCases,
+ *   metrics: ["answer_relevancy", "faithfulness"],
+ * });
  */
 
 declare const evals_AVAILABLE_METRICS: typeof AVAILABLE_METRICS;
@@ -646,6 +779,10 @@ type evals_DatasetInput = DatasetInput;
 type evals_DatasetItem = DatasetItem;
 type evals_EvalResult = EvalResult;
 type evals_EvaluateOptions = EvaluateOptions;
+type evals_EvaluationDataset = EvaluationDataset;
+declare const evals_EvaluationDataset: typeof EvaluationDataset;
+type evals_Golden = Golden;
+type evals_LLMTestCase = LLMTestCase;
 declare const evals_METRIC_PROMPTS: typeof METRIC_PROMPTS;
 type evals_Message = Message;
 type evals_MetricInput = MetricInput;
@@ -664,7 +801,7 @@ declare const evals_evaluate: typeof evaluate;
 declare const evals_getMetricName: typeof getMetricName;
 declare const evals_isCustomMetric: typeof isCustomMetric;
 declare namespace evals {
-    export { evals_AVAILABLE_METRICS as AVAILABLE_METRICS, type evals_CompareModelsOptions as CompareModelsOptions, type evals_CustomMetric as CustomMetric, evals_DEFAULT_JUDGE_MODEL as DEFAULT_JUDGE_MODEL, type evals_DatasetInput as DatasetInput, type evals_DatasetItem as DatasetItem, type evals_EvalResult as EvalResult, type evals_EvaluateOptions as EvaluateOptions, type InitOptions$1 as InitOptions, evals_METRIC_PROMPTS as METRIC_PROMPTS, type evals_Message as Message, type evals_MetricInput as MetricInput, type evals_MetricName as MetricName, type evals_Model as Model, type evals_ModelCallable as ModelCallable, type evals_ModelResponse as ModelResponse, evals_compareModels as compareModels, evals_createCustomModel as createCustomModel, evals_createModelFromCallable as createModelFromCallable, evals_createOpenAIModel as createOpenAIModel, evals_customMetric as customMetric, evals_datasetFromFallom as datasetFromFallom, evals_datasetFromTraces as datasetFromTraces, evals_evaluate as evaluate, evals_getMetricName as getMetricName, init$1 as init, evals_isCustomMetric as isCustomMetric, uploadResultsPublic as uploadResults };
+    export { evals_AVAILABLE_METRICS as AVAILABLE_METRICS, type evals_CompareModelsOptions as CompareModelsOptions, type evals_CustomMetric as CustomMetric, evals_DEFAULT_JUDGE_MODEL as DEFAULT_JUDGE_MODEL, type evals_DatasetInput as DatasetInput, type evals_DatasetItem as DatasetItem, type evals_EvalResult as EvalResult, type evals_EvaluateOptions as EvaluateOptions, evals_EvaluationDataset as EvaluationDataset, type evals_Golden as Golden, type InitOptions$1 as InitOptions, type evals_LLMTestCase as LLMTestCase, evals_METRIC_PROMPTS as METRIC_PROMPTS, type evals_Message as Message, type evals_MetricInput as MetricInput, type evals_MetricName as MetricName, type evals_Model as Model, type evals_ModelCallable as ModelCallable, type evals_ModelResponse as ModelResponse, evals_compareModels as compareModels, evals_createCustomModel as createCustomModel, evals_createModelFromCallable as createModelFromCallable, evals_createOpenAIModel as createOpenAIModel, evals_customMetric as customMetric, evals_datasetFromFallom as datasetFromFallom, evals_datasetFromTraces as datasetFromTraces, evals_evaluate as evaluate, evals_getMetricName as getMetricName, init$1 as init, evals_isCustomMetric as isCustomMetric, uploadResultsPublic as uploadResults };
 }
 
 /**
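
For orientation, the workflow these new declarations enable looks roughly like the sketch below (TypeScript). It is inferred only from the types shown in this diff: the dataset key "support-bot-goldens" and the callMyModel helper are hypothetical placeholders, and returning a bare { content } assumes the remaining ModelResponse fields are optional.

import fallom from "@fallom/trace";

// Hypothetical stand-in for your own LLM pipeline; replace with a real model call.
async function callMyModel(messages: unknown[]): Promise<string> {
  return `stub answer for ${JSON.stringify(messages)}`;
}

async function main() {
  // Initialize evals (apiKey shape taken from the @example in the diff above).
  fallom.evals.init({ apiKey: "flm_xxx" });

  // Pull goldens from Fallom, then auto-generate test cases by running each
  // golden through the callable (see generateTestCases in the declarations).
  const dataset = new fallom.evals.EvaluationDataset();
  await dataset.pull("support-bot-goldens"); // hypothetical dataset key
  await dataset.generateTestCases(async (messages) => ({
    content: await callMyModel(messages), // assumes other ModelResponse fields are optional
  }));

  // Evaluate the generated test cases via the new optional testCases field on EvaluateOptions.
  const results = await fallom.evals.evaluate({
    testCases: dataset.testCases,
    metrics: ["answer_relevancy", "faithfulness"],
  });
  console.log(`${results.length} results`);
}

main();

Compared with the manual loop in the package's own @example, generateTestCases keeps the golden-to-test-case bookkeeping inside the dataset object, and clearTestCases() lets you re-run the same goldens against a different model.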