@fallom/trace 0.2.15 → 0.2.16

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
+import {
+  DEFAULT_JUDGE_MODEL,
+  _apiKey,
+  _baseUrl,
+  _initialized,
+  compareModels,
+  evaluate,
+  init,
+  uploadResultsPublic
+} from "./chunk-3HBKT4HK.mjs";
+import "./chunk-7P6ASYW6.mjs";
+export {
+  DEFAULT_JUDGE_MODEL,
+  _apiKey,
+  _baseUrl,
+  _initialized,
+  compareModels,
+  evaluate,
+  init,
+  uploadResultsPublic
+};
@@ -0,0 +1,21 @@
+import {
+  DEFAULT_JUDGE_MODEL,
+  _apiKey,
+  _baseUrl,
+  _initialized,
+  compareModels,
+  evaluate,
+  init,
+  uploadResultsPublic
+} from "./chunk-XBZ3ESNV.mjs";
+import "./chunk-7P6ASYW6.mjs";
+export {
+  DEFAULT_JUDGE_MODEL,
+  _apiKey,
+  _baseUrl,
+  _initialized,
+  compareModels,
+  evaluate,
+  init,
+  uploadResultsPublic
+};
package/dist/index.d.mts CHANGED
@@ -458,6 +458,37 @@ interface EvalResult {
     tokensOut?: number;
     cost?: number;
 }
+/**
+ * A golden record from a dataset - contains input and optionally expected output.
+ * This represents a "golden" test case that you can use to generate actual outputs
+ * from your LLM pipeline and then evaluate.
+ */
+interface Golden {
+    input: string;
+    expectedOutput?: string;
+    systemMessage?: string;
+    /** Retrieved documents for RAG evaluation */
+    context?: string[];
+    metadata?: Record<string, unknown>;
+}
+/**
+ * A test case for evaluation - contains input and actual output from your LLM.
+ *
+ */
+interface LLMTestCase {
+    /** The user input/query */
+    input: string;
+    /** The output generated by your LLM pipeline */
+    actualOutput: string;
+    /** (Optional) The expected/golden output for comparison */
+    expectedOutput?: string;
+    /** (Optional) System prompt used */
+    systemMessage?: string;
+    /** (Optional) Retrieved documents for RAG faithfulness evaluation */
+    context?: string[];
+    /** (Optional) Additional metadata */
+    metadata?: Record<string, unknown>;
+}
 /** Response format from model calls */
 interface ModelResponse {
     content: string;
@@ -487,13 +518,16 @@ interface InitOptions$1 {
 }
 /** Options for evaluate() */
 interface EvaluateOptions {
-    dataset: DatasetInput;
+    /** Dataset to evaluate (list of DatasetItem or Fallom dataset key) */
+    dataset?: DatasetInput;
     /** List of metrics to run (built-in or custom). Default: all built-in metrics */
     metrics?: MetricInput[];
     judgeModel?: string;
     name?: string;
     description?: string;
     verbose?: boolean;
+    /** Alternative to dataset - provide test cases from EvaluationDataset */
+    testCases?: LLMTestCase[];
     _skipUpload?: boolean;
 }
 /** Options for compareModels() */
@@ -532,9 +566,10 @@ declare const DEFAULT_JUDGE_MODEL = "openai/gpt-4o-mini";
  */
 declare function init$1(options?: InitOptions$1): void;
 /**
- * Evaluate production outputs against specified metrics using G-Eval.
+ * Evaluate outputs against specified metrics using G-Eval.
  *
  * Results are automatically uploaded to Fallom dashboard.
+ *
  */
 declare function evaluate(options: EvaluateOptions): Promise<EvalResult[]>;
 /**
@@ -630,12 +665,110 @@ declare function datasetFromFallom(datasetKey: string, version?: number, config?
     _baseUrl?: string;
     _initialized?: boolean;
 }): Promise<DatasetItem[]>;
+/**
+ * A dataset for evaluation that supports pulling from Fallom and adding test cases.
+ *
+ * This provides a workflow where you:
+ * 1. Pull a dataset (goldens) from Fallom
+ * 2. Run your own LLM pipeline on each golden to generate outputs
+ * 3. Add the results as test cases
+ * 4. Evaluate the test cases
+ *
+ */
+declare class EvaluationDataset {
+    private _goldens;
+    private _testCases;
+    private _datasetKey;
+    private _datasetName;
+    private _version;
+    /** List of golden records (inputs with optional expected outputs). */
+    get goldens(): Golden[];
+    /** List of test cases (inputs with actual outputs from your LLM). */
+    get testCases(): LLMTestCase[];
+    /** The Fallom dataset key if pulled from Fallom. */
+    get datasetKey(): string | null;
+    /**
+     * Pull a dataset from Fallom.
+     *
+     * @param alias - The dataset key/alias in Fallom
+     * @param version - Specific version to pull (default: latest)
+     * @returns Self for chaining
+     */
+    pull(alias: string, version?: number): Promise<EvaluationDataset>;
+    /**
+     * Add a golden record manually.
+     * @param golden - A Golden object
+     * @returns Self for chaining
+     */
+    addGolden(golden: Golden): EvaluationDataset;
+    /**
+     * Add multiple golden records.
+     * @param goldens - Array of Golden objects
+     * @returns Self for chaining
+     */
+    addGoldens(goldens: Golden[]): EvaluationDataset;
+    /**
+     * Add a test case with actual LLM output.
+     * @param testCase - An LLMTestCase object
+     * @returns Self for chaining
+     */
+    addTestCase(testCase: LLMTestCase): EvaluationDataset;
+    /**
+     * Add multiple test cases.
+     * @param testCases - Array of LLMTestCase objects
+     * @returns Self for chaining
+     */
+    addTestCases(testCases: LLMTestCase[]): EvaluationDataset;
+    /**
+     * Automatically generate test cases by running all goldens through your LLM app.
+     *
+     * @param llmApp - A callable that takes messages and returns response
+     * @param options - Configuration options
+     * @returns Self for chaining
+     */
+    generateTestCases(llmApp: (messages: Message[]) => Promise<ModelResponse>, options?: {
+        includeContext?: boolean;
+    }): Promise<EvaluationDataset>;
+    /** Clear all test cases (useful for re-running with different LLM). */
+    clearTestCases(): EvaluationDataset;
+    /** Return the number of goldens. */
+    get length(): number;
+}
 
 /**
  * Fallom Evals - Run LLM evaluations locally using G-Eval with LLM as a Judge.
  *
  * Evaluate production outputs or compare different models on your dataset.
  * Results are uploaded to Fallom dashboard for visualization.
+ *
+ * @example
+ * import fallom from "@fallom/trace";
+ *
+ * // Initialize
+ * fallom.evals.init({ apiKey: "flm_xxx" });
+ *
+ * // Method 1: Direct dataset evaluation
+ * const results = await fallom.evals.evaluate({
+ *   dataset: [...],
+ *   metrics: ["answer_relevancy", "faithfulness"],
+ * });
+ *
+ * // Method 2: Use EvaluationDataset with your own LLM pipeline
+ * const dataset = new fallom.evals.EvaluationDataset();
+ * await dataset.pull("my-dataset-key");
+ *
+ * for (const golden of dataset.goldens) {
+ *   const actualOutput = await myLLMApp(golden.input);
+ *   dataset.addTestCase({
+ *     input: golden.input,
+ *     actualOutput,
+ *   });
+ * }
+ *
+ * const results = await fallom.evals.evaluate({
+ *   testCases: dataset.testCases,
+ *   metrics: ["answer_relevancy", "faithfulness"],
+ * });
  */
 
 declare const evals_AVAILABLE_METRICS: typeof AVAILABLE_METRICS;
@@ -646,6 +779,10 @@ type evals_DatasetInput = DatasetInput;
 type evals_DatasetItem = DatasetItem;
 type evals_EvalResult = EvalResult;
 type evals_EvaluateOptions = EvaluateOptions;
+type evals_EvaluationDataset = EvaluationDataset;
+declare const evals_EvaluationDataset: typeof EvaluationDataset;
+type evals_Golden = Golden;
+type evals_LLMTestCase = LLMTestCase;
 declare const evals_METRIC_PROMPTS: typeof METRIC_PROMPTS;
 type evals_Message = Message;
 type evals_MetricInput = MetricInput;
@@ -664,7 +801,7 @@ declare const evals_evaluate: typeof evaluate;
 declare const evals_getMetricName: typeof getMetricName;
 declare const evals_isCustomMetric: typeof isCustomMetric;
 declare namespace evals {
-    export { evals_AVAILABLE_METRICS as AVAILABLE_METRICS, type evals_CompareModelsOptions as CompareModelsOptions, type evals_CustomMetric as CustomMetric, evals_DEFAULT_JUDGE_MODEL as DEFAULT_JUDGE_MODEL, type evals_DatasetInput as DatasetInput, type evals_DatasetItem as DatasetItem, type evals_EvalResult as EvalResult, type evals_EvaluateOptions as EvaluateOptions, type InitOptions$1 as InitOptions, evals_METRIC_PROMPTS as METRIC_PROMPTS, type evals_Message as Message, type evals_MetricInput as MetricInput, type evals_MetricName as MetricName, type evals_Model as Model, type evals_ModelCallable as ModelCallable, type evals_ModelResponse as ModelResponse, evals_compareModels as compareModels, evals_createCustomModel as createCustomModel, evals_createModelFromCallable as createModelFromCallable, evals_createOpenAIModel as createOpenAIModel, evals_customMetric as customMetric, evals_datasetFromFallom as datasetFromFallom, evals_datasetFromTraces as datasetFromTraces, evals_evaluate as evaluate, evals_getMetricName as getMetricName, init$1 as init, evals_isCustomMetric as isCustomMetric, uploadResultsPublic as uploadResults };
+    export { evals_AVAILABLE_METRICS as AVAILABLE_METRICS, type evals_CompareModelsOptions as CompareModelsOptions, type evals_CustomMetric as CustomMetric, evals_DEFAULT_JUDGE_MODEL as DEFAULT_JUDGE_MODEL, type evals_DatasetInput as DatasetInput, type evals_DatasetItem as DatasetItem, type evals_EvalResult as EvalResult, type evals_EvaluateOptions as EvaluateOptions, evals_EvaluationDataset as EvaluationDataset, type evals_Golden as Golden, type InitOptions$1 as InitOptions, type evals_LLMTestCase as LLMTestCase, evals_METRIC_PROMPTS as METRIC_PROMPTS, type evals_Message as Message, type evals_MetricInput as MetricInput, type evals_MetricName as MetricName, type evals_Model as Model, type evals_ModelCallable as ModelCallable, type evals_ModelResponse as ModelResponse, evals_compareModels as compareModels, evals_createCustomModel as createCustomModel, evals_createModelFromCallable as createModelFromCallable, evals_createOpenAIModel as createOpenAIModel, evals_customMetric as customMetric, evals_datasetFromFallom as datasetFromFallom, evals_datasetFromTraces as datasetFromTraces, evals_evaluate as evaluate, evals_getMetricName as getMetricName, init$1 as init, evals_isCustomMetric as isCustomMetric, uploadResultsPublic as uploadResults };
 }
 
 /**
package/dist/index.d.ts CHANGED
@@ -458,6 +458,37 @@ interface EvalResult {
     tokensOut?: number;
     cost?: number;
 }
+/**
+ * A golden record from a dataset - contains input and optionally expected output.
+ * This represents a "golden" test case that you can use to generate actual outputs
+ * from your LLM pipeline and then evaluate.
+ */
+interface Golden {
+    input: string;
+    expectedOutput?: string;
+    systemMessage?: string;
+    /** Retrieved documents for RAG evaluation */
+    context?: string[];
+    metadata?: Record<string, unknown>;
+}
+/**
+ * A test case for evaluation - contains input and actual output from your LLM.
+ *
+ */
+interface LLMTestCase {
+    /** The user input/query */
+    input: string;
+    /** The output generated by your LLM pipeline */
+    actualOutput: string;
+    /** (Optional) The expected/golden output for comparison */
+    expectedOutput?: string;
+    /** (Optional) System prompt used */
+    systemMessage?: string;
+    /** (Optional) Retrieved documents for RAG faithfulness evaluation */
+    context?: string[];
+    /** (Optional) Additional metadata */
+    metadata?: Record<string, unknown>;
+}
 /** Response format from model calls */
 interface ModelResponse {
     content: string;
@@ -487,13 +518,16 @@ interface InitOptions$1 {
 }
 /** Options for evaluate() */
 interface EvaluateOptions {
-    dataset: DatasetInput;
+    /** Dataset to evaluate (list of DatasetItem or Fallom dataset key) */
+    dataset?: DatasetInput;
     /** List of metrics to run (built-in or custom). Default: all built-in metrics */
     metrics?: MetricInput[];
     judgeModel?: string;
     name?: string;
     description?: string;
     verbose?: boolean;
+    /** Alternative to dataset - provide test cases from EvaluationDataset */
+    testCases?: LLMTestCase[];
     _skipUpload?: boolean;
 }
 /** Options for compareModels() */
@@ -532,9 +566,10 @@ declare const DEFAULT_JUDGE_MODEL = "openai/gpt-4o-mini";
  */
 declare function init$1(options?: InitOptions$1): void;
 /**
- * Evaluate production outputs against specified metrics using G-Eval.
+ * Evaluate outputs against specified metrics using G-Eval.
  *
  * Results are automatically uploaded to Fallom dashboard.
+ *
  */
 declare function evaluate(options: EvaluateOptions): Promise<EvalResult[]>;
 /**
@@ -630,12 +665,110 @@ declare function datasetFromFallom(datasetKey: string, version?: number, config?
     _baseUrl?: string;
     _initialized?: boolean;
 }): Promise<DatasetItem[]>;
+/**
+ * A dataset for evaluation that supports pulling from Fallom and adding test cases.
+ *
+ * This provides a workflow where you:
+ * 1. Pull a dataset (goldens) from Fallom
+ * 2. Run your own LLM pipeline on each golden to generate outputs
+ * 3. Add the results as test cases
+ * 4. Evaluate the test cases
+ *
+ */
+declare class EvaluationDataset {
+    private _goldens;
+    private _testCases;
+    private _datasetKey;
+    private _datasetName;
+    private _version;
+    /** List of golden records (inputs with optional expected outputs). */
+    get goldens(): Golden[];
+    /** List of test cases (inputs with actual outputs from your LLM). */
+    get testCases(): LLMTestCase[];
+    /** The Fallom dataset key if pulled from Fallom. */
+    get datasetKey(): string | null;
+    /**
+     * Pull a dataset from Fallom.
+     *
+     * @param alias - The dataset key/alias in Fallom
+     * @param version - Specific version to pull (default: latest)
+     * @returns Self for chaining
+     */
+    pull(alias: string, version?: number): Promise<EvaluationDataset>;
+    /**
+     * Add a golden record manually.
+     * @param golden - A Golden object
+     * @returns Self for chaining
+     */
+    addGolden(golden: Golden): EvaluationDataset;
+    /**
+     * Add multiple golden records.
+     * @param goldens - Array of Golden objects
+     * @returns Self for chaining
+     */
+    addGoldens(goldens: Golden[]): EvaluationDataset;
+    /**
+     * Add a test case with actual LLM output.
+     * @param testCase - An LLMTestCase object
+     * @returns Self for chaining
+     */
+    addTestCase(testCase: LLMTestCase): EvaluationDataset;
+    /**
+     * Add multiple test cases.
+     * @param testCases - Array of LLMTestCase objects
+     * @returns Self for chaining
+     */
+    addTestCases(testCases: LLMTestCase[]): EvaluationDataset;
+    /**
+     * Automatically generate test cases by running all goldens through your LLM app.
+     *
+     * @param llmApp - A callable that takes messages and returns response
+     * @param options - Configuration options
+     * @returns Self for chaining
+     */
+    generateTestCases(llmApp: (messages: Message[]) => Promise<ModelResponse>, options?: {
+        includeContext?: boolean;
+    }): Promise<EvaluationDataset>;
+    /** Clear all test cases (useful for re-running with different LLM). */
+    clearTestCases(): EvaluationDataset;
+    /** Return the number of goldens. */
+    get length(): number;
+}
 
 /**
  * Fallom Evals - Run LLM evaluations locally using G-Eval with LLM as a Judge.
  *
  * Evaluate production outputs or compare different models on your dataset.
  * Results are uploaded to Fallom dashboard for visualization.
+ *
+ * @example
+ * import fallom from "@fallom/trace";
+ *
+ * // Initialize
+ * fallom.evals.init({ apiKey: "flm_xxx" });
+ *
+ * // Method 1: Direct dataset evaluation
+ * const results = await fallom.evals.evaluate({
+ *   dataset: [...],
+ *   metrics: ["answer_relevancy", "faithfulness"],
+ * });
+ *
+ * // Method 2: Use EvaluationDataset with your own LLM pipeline
+ * const dataset = new fallom.evals.EvaluationDataset();
+ * await dataset.pull("my-dataset-key");
+ *
+ * for (const golden of dataset.goldens) {
+ *   const actualOutput = await myLLMApp(golden.input);
+ *   dataset.addTestCase({
+ *     input: golden.input,
+ *     actualOutput,
+ *   });
+ * }
+ *
+ * const results = await fallom.evals.evaluate({
+ *   testCases: dataset.testCases,
+ *   metrics: ["answer_relevancy", "faithfulness"],
+ * });
  */
 
 declare const evals_AVAILABLE_METRICS: typeof AVAILABLE_METRICS;
@@ -646,6 +779,10 @@ type evals_DatasetInput = DatasetInput;
 type evals_DatasetItem = DatasetItem;
 type evals_EvalResult = EvalResult;
 type evals_EvaluateOptions = EvaluateOptions;
+type evals_EvaluationDataset = EvaluationDataset;
+declare const evals_EvaluationDataset: typeof EvaluationDataset;
+type evals_Golden = Golden;
+type evals_LLMTestCase = LLMTestCase;
 declare const evals_METRIC_PROMPTS: typeof METRIC_PROMPTS;
 type evals_Message = Message;
 type evals_MetricInput = MetricInput;
@@ -664,7 +801,7 @@ declare const evals_evaluate: typeof evaluate;
 declare const evals_getMetricName: typeof getMetricName;
 declare const evals_isCustomMetric: typeof isCustomMetric;
 declare namespace evals {
-    export { evals_AVAILABLE_METRICS as AVAILABLE_METRICS, type evals_CompareModelsOptions as CompareModelsOptions, type evals_CustomMetric as CustomMetric, evals_DEFAULT_JUDGE_MODEL as DEFAULT_JUDGE_MODEL, type evals_DatasetInput as DatasetInput, type evals_DatasetItem as DatasetItem, type evals_EvalResult as EvalResult, type evals_EvaluateOptions as EvaluateOptions, type InitOptions$1 as InitOptions, evals_METRIC_PROMPTS as METRIC_PROMPTS, type evals_Message as Message, type evals_MetricInput as MetricInput, type evals_MetricName as MetricName, type evals_Model as Model, type evals_ModelCallable as ModelCallable, type evals_ModelResponse as ModelResponse, evals_compareModels as compareModels, evals_createCustomModel as createCustomModel, evals_createModelFromCallable as createModelFromCallable, evals_createOpenAIModel as createOpenAIModel, evals_customMetric as customMetric, evals_datasetFromFallom as datasetFromFallom, evals_datasetFromTraces as datasetFromTraces, evals_evaluate as evaluate, evals_getMetricName as getMetricName, init$1 as init, evals_isCustomMetric as isCustomMetric, uploadResultsPublic as uploadResults };
+    export { evals_AVAILABLE_METRICS as AVAILABLE_METRICS, type evals_CompareModelsOptions as CompareModelsOptions, type evals_CustomMetric as CustomMetric, evals_DEFAULT_JUDGE_MODEL as DEFAULT_JUDGE_MODEL, type evals_DatasetInput as DatasetInput, type evals_DatasetItem as DatasetItem, type evals_EvalResult as EvalResult, type evals_EvaluateOptions as EvaluateOptions, evals_EvaluationDataset as EvaluationDataset, type evals_Golden as Golden, type InitOptions$1 as InitOptions, type evals_LLMTestCase as LLMTestCase, evals_METRIC_PROMPTS as METRIC_PROMPTS, type evals_Message as Message, type evals_MetricInput as MetricInput, type evals_MetricName as MetricName, type evals_Model as Model, type evals_ModelCallable as ModelCallable, type evals_ModelResponse as ModelResponse, evals_compareModels as compareModels, evals_createCustomModel as createCustomModel, evals_createModelFromCallable as createModelFromCallable, evals_createOpenAIModel as createOpenAIModel, evals_customMetric as customMetric, evals_datasetFromFallom as datasetFromFallom, evals_datasetFromTraces as datasetFromTraces, evals_evaluate as evaluate, evals_getMetricName as getMetricName, init$1 as init, evals_isCustomMetric as isCustomMetric, uploadResultsPublic as uploadResults };
 }
 
 /**
package/dist/index.js CHANGED
@@ -590,9 +590,159 @@ async function datasetFromFallom(datasetKey, version, config) {
   );
   return items;
 }
+var EvaluationDataset;
 var init_helpers = __esm({
   "src/evals/helpers.ts"() {
     "use strict";
+    EvaluationDataset = class {
+      constructor() {
+        this._goldens = [];
+        this._testCases = [];
+        this._datasetKey = null;
+        this._datasetName = null;
+        this._version = null;
+      }
+      /** List of golden records (inputs with optional expected outputs). */
+      get goldens() {
+        return this._goldens;
+      }
+      /** List of test cases (inputs with actual outputs from your LLM). */
+      get testCases() {
+        return this._testCases;
+      }
+      /** The Fallom dataset key if pulled from Fallom. */
+      get datasetKey() {
+        return this._datasetKey;
+      }
+      /**
+       * Pull a dataset from Fallom.
+       *
+       * @param alias - The dataset key/alias in Fallom
+       * @param version - Specific version to pull (default: latest)
+       * @returns Self for chaining
+       */
+      async pull(alias, version) {
+        const { _apiKey: _apiKey2, _baseUrl: _baseUrl2, _initialized: _initialized2 } = await Promise.resolve().then(() => (init_core(), core_exports));
+        if (!_initialized2) {
+          throw new Error("Fallom evals not initialized. Call evals.init() first.");
+        }
+        const params = new URLSearchParams({ include_entries: "true" });
+        if (version !== void 0) {
+          params.set("version", String(version));
+        }
+        const url = `${_baseUrl2}/api/datasets/${encodeURIComponent(alias)}?${params}`;
+        const response = await fetch(url, {
+          headers: {
+            Authorization: `Bearer ${_apiKey2}`,
+            "Content-Type": "application/json"
+          }
+        });
+        if (response.status === 404) {
+          throw new Error(`Dataset '${alias}' not found`);
+        } else if (response.status === 403) {
+          throw new Error(`Access denied to dataset '${alias}'`);
+        }
+        if (!response.ok) {
+          throw new Error(`Failed to fetch dataset: ${response.statusText}`);
+        }
+        const data = await response.json();
+        this._datasetKey = alias;
+        this._datasetName = data.dataset?.name || alias;
+        this._version = data.version?.version || null;
+        this._goldens = [];
+        for (const entry of data.entries || []) {
+          this._goldens.push({
+            input: entry.input || "",
+            expectedOutput: entry.output,
+            systemMessage: entry.systemMessage,
+            metadata: entry.metadata
+          });
+        }
+        console.log(
+          `\u2713 Pulled dataset '${this._datasetName}' (version ${this._version}) with ${this._goldens.length} goldens`
+        );
+        return this;
+      }
+      /**
+       * Add a golden record manually.
+       * @param golden - A Golden object
+       * @returns Self for chaining
+       */
+      addGolden(golden) {
+        this._goldens.push(golden);
+        return this;
+      }
+      /**
+       * Add multiple golden records.
+       * @param goldens - Array of Golden objects
+       * @returns Self for chaining
+       */
+      addGoldens(goldens) {
+        this._goldens.push(...goldens);
+        return this;
+      }
+      /**
+       * Add a test case with actual LLM output.
+       * @param testCase - An LLMTestCase object
+       * @returns Self for chaining
+       */
+      addTestCase(testCase) {
+        this._testCases.push(testCase);
+        return this;
+      }
+      /**
+       * Add multiple test cases.
+       * @param testCases - Array of LLMTestCase objects
+       * @returns Self for chaining
+       */
+      addTestCases(testCases) {
+        this._testCases.push(...testCases);
+        return this;
+      }
+      /**
+       * Automatically generate test cases by running all goldens through your LLM app.
+       *
+       * @param llmApp - A callable that takes messages and returns response
+       * @param options - Configuration options
+       * @returns Self for chaining
+       */
+      async generateTestCases(llmApp, options = {}) {
+        const { includeContext = false } = options;
+        console.log(`Generating test cases for ${this._goldens.length} goldens...`);
+        for (let i = 0; i < this._goldens.length; i++) {
+          const golden = this._goldens[i];
+          const messages = [];
+          if (golden.systemMessage) {
+            messages.push({ role: "system", content: golden.systemMessage });
+          }
+          messages.push({ role: "user", content: golden.input });
+          const response = await llmApp(messages);
+          const testCase = {
+            input: golden.input,
+            actualOutput: response.content,
+            expectedOutput: golden.expectedOutput,
+            systemMessage: golden.systemMessage,
+            context: includeContext ? response.context : golden.context,
+            metadata: golden.metadata
+          };
+          this._testCases.push(testCase);
+          console.log(
+            ` [${i + 1}/${this._goldens.length}] Generated output for: ${golden.input.slice(0, 50)}...`
+          );
+        }
+        console.log(`\u2713 Generated ${this._testCases.length} test cases`);
+        return this;
+      }
+      /** Clear all test cases (useful for re-running with different LLM). */
+      clearTestCases() {
+        this._testCases = [];
+        return this;
+      }
+      /** Return the number of goldens. */
+      get length() {
+        return this._goldens.length;
+      }
+    };
   }
 });
 
@@ -707,9 +857,22 @@ async function evaluate(options) {
     name,
     description,
     verbose = true,
+    testCases,
     _skipUpload = false
   } = options;
-  const dataset = await resolveDataset(datasetInput);
+  let dataset;
+  if (testCases !== void 0 && testCases.length > 0) {
+    dataset = testCases.map((tc) => ({
+      input: tc.input,
+      output: tc.actualOutput,
+      systemMessage: tc.systemMessage,
+      metadata: tc.metadata
+    }));
+  } else if (datasetInput !== void 0) {
+    dataset = await resolveDataset(datasetInput);
+  } else {
+    throw new Error("Either 'dataset' or 'testCases' must be provided");
+  }
   for (const m of metrics) {
     if (typeof m === "string" && !AVAILABLE_METRICS.includes(m)) {
       throw new Error(
@@ -775,6 +938,9 @@ async function compareModels(options) {
     description,
     verbose = true
   } = options;
+  if (!datasetInput) {
+    throw new Error("'dataset' is required for compareModels()");
+  }
   const dataset = await resolveDataset(datasetInput);
   const results = {};
   if (includeProduction) {
@@ -1035,7 +1201,7 @@ var import_exporter_trace_otlp_http = require("@opentelemetry/exporter-trace-otl
 // node_modules/@opentelemetry/resources/build/esm/Resource.js
 var import_api = require("@opentelemetry/api");
 
-// node_modules/@opentelemetry/resources/node_modules/@opentelemetry/semantic-conventions/build/esm/resource/SemanticResourceAttributes.js
+// node_modules/@opentelemetry/semantic-conventions/build/esm/resource/SemanticResourceAttributes.js
 var SemanticResourceAttributes = {
   /**
    * Name of the cloud provider.
@@ -3543,6 +3709,7 @@ var evals_exports = {};
 __export(evals_exports, {
   AVAILABLE_METRICS: () => AVAILABLE_METRICS,
   DEFAULT_JUDGE_MODEL: () => DEFAULT_JUDGE_MODEL,
+  EvaluationDataset: () => EvaluationDataset,
   METRIC_PROMPTS: () => METRIC_PROMPTS,
   compareModels: () => compareModels,
   createCustomModel: () => createCustomModel,
package/dist/index.mjs CHANGED
@@ -5,6 +5,7 @@ import {
 import {
   AVAILABLE_METRICS,
   DEFAULT_JUDGE_MODEL,
+  EvaluationDataset,
   METRIC_PROMPTS,
   compareModels,
   createCustomModel,
@@ -18,7 +19,7 @@ import {
   init as init2,
   isCustomMetric,
   uploadResultsPublic
-} from "./chunk-2NGJF2JZ.mjs";
+} from "./chunk-3HBKT4HK.mjs";
 import {
   __export
 } from "./chunk-7P6ASYW6.mjs";
@@ -40,7 +41,7 @@ import { OTLPTraceExporter } from "@opentelemetry/exporter-trace-otlp-http";
 // node_modules/@opentelemetry/resources/build/esm/Resource.js
 import { diag } from "@opentelemetry/api";
 
-// node_modules/@opentelemetry/resources/node_modules/@opentelemetry/semantic-conventions/build/esm/resource/SemanticResourceAttributes.js
+// node_modules/@opentelemetry/semantic-conventions/build/esm/resource/SemanticResourceAttributes.js
 var SemanticResourceAttributes = {
   /**
    * Name of the cloud provider.
@@ -2545,6 +2546,7 @@ var evals_exports = {};
 __export(evals_exports, {
   AVAILABLE_METRICS: () => AVAILABLE_METRICS,
   DEFAULT_JUDGE_MODEL: () => DEFAULT_JUDGE_MODEL,
+  EvaluationDataset: () => EvaluationDataset,
   METRIC_PROMPTS: () => METRIC_PROMPTS,
   compareModels: () => compareModels,
   createCustomModel: () => createCustomModel,
package/package.json CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "@fallom/trace",
-  "version": "0.2.15",
+  "version": "0.2.16",
   "description": "Model A/B testing and tracing for LLM applications. Zero latency, production-ready.",
   "main": "./dist/index.js",
   "module": "./dist/index.mjs",