@fallom/trace 0.2.15 → 0.2.16
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/chunk-3HBKT4HK.mjs +827 -0
- package/dist/chunk-XBZ3ESNV.mjs +824 -0
- package/dist/core-4L56QWI7.mjs +21 -0
- package/dist/core-JLHYFVYS.mjs +21 -0
- package/dist/index.d.mts +140 -3
- package/dist/index.d.ts +140 -3
- package/dist/index.js +169 -2
- package/dist/index.mjs +4 -2
- package/package.json +1 -1
- package/dist/chunk-KFD5AQ7V.mjs +0 -308
- package/dist/models-SEFDGZU2.mjs +0 -8

package/dist/core-4L56QWI7.mjs
ADDED
@@ -0,0 +1,21 @@
+import {
+  DEFAULT_JUDGE_MODEL,
+  _apiKey,
+  _baseUrl,
+  _initialized,
+  compareModels,
+  evaluate,
+  init,
+  uploadResultsPublic
+} from "./chunk-3HBKT4HK.mjs";
+import "./chunk-7P6ASYW6.mjs";
+export {
+  DEFAULT_JUDGE_MODEL,
+  _apiKey,
+  _baseUrl,
+  _initialized,
+  compareModels,
+  evaluate,
+  init,
+  uploadResultsPublic
+};
package/dist/core-JLHYFVYS.mjs
ADDED
@@ -0,0 +1,21 @@
+import {
+  DEFAULT_JUDGE_MODEL,
+  _apiKey,
+  _baseUrl,
+  _initialized,
+  compareModels,
+  evaluate,
+  init,
+  uploadResultsPublic
+} from "./chunk-XBZ3ESNV.mjs";
+import "./chunk-7P6ASYW6.mjs";
+export {
+  DEFAULT_JUDGE_MODEL,
+  _apiKey,
+  _baseUrl,
+  _initialized,
+  compareModels,
+  evaluate,
+  init,
+  uploadResultsPublic
+};
package/dist/index.d.mts
CHANGED
@@ -458,6 +458,37 @@ interface EvalResult {
     tokensOut?: number;
     cost?: number;
 }
+/**
+ * A golden record from a dataset - contains input and optionally expected output.
+ * This represents a "golden" test case that you can use to generate actual outputs
+ * from your LLM pipeline and then evaluate.
+ */
+interface Golden {
+    input: string;
+    expectedOutput?: string;
+    systemMessage?: string;
+    /** Retrieved documents for RAG evaluation */
+    context?: string[];
+    metadata?: Record<string, unknown>;
+}
+/**
+ * A test case for evaluation - contains input and actual output from your LLM.
+ *
+ */
+interface LLMTestCase {
+    /** The user input/query */
+    input: string;
+    /** The output generated by your LLM pipeline */
+    actualOutput: string;
+    /** (Optional) The expected/golden output for comparison */
+    expectedOutput?: string;
+    /** (Optional) System prompt used */
+    systemMessage?: string;
+    /** (Optional) Retrieved documents for RAG faithfulness evaluation */
+    context?: string[];
+    /** (Optional) Additional metadata */
+    metadata?: Record<string, unknown>;
+}
 /** Response format from model calls */
 interface ModelResponse {
     content: string;
@@ -487,13 +518,16 @@ interface InitOptions$1 {
 }
 /** Options for evaluate() */
 interface EvaluateOptions {
-    dataset
+    /** Dataset to evaluate (list of DatasetItem or Fallom dataset key) */
+    dataset?: DatasetInput;
     /** List of metrics to run (built-in or custom). Default: all built-in metrics */
     metrics?: MetricInput[];
     judgeModel?: string;
     name?: string;
     description?: string;
     verbose?: boolean;
+    /** Alternative to dataset - provide test cases from EvaluationDataset */
+    testCases?: LLMTestCase[];
     _skipUpload?: boolean;
 }
 /** Options for compareModels() */
@@ -532,9 +566,10 @@ declare const DEFAULT_JUDGE_MODEL = "openai/gpt-4o-mini";
  */
 declare function init$1(options?: InitOptions$1): void;
 /**
- * Evaluate
+ * Evaluate outputs against specified metrics using G-Eval.
  *
  * Results are automatically uploaded to Fallom dashboard.
+ *
  */
 declare function evaluate(options: EvaluateOptions): Promise<EvalResult[]>;
 /**
@@ -630,12 +665,110 @@ declare function datasetFromFallom(datasetKey: string, version?: number, config?
     _baseUrl?: string;
     _initialized?: boolean;
 }): Promise<DatasetItem[]>;
+/**
+ * A dataset for evaluation that supports pulling from Fallom and adding test cases.
+ *
+ * This provides a workflow where you:
+ * 1. Pull a dataset (goldens) from Fallom
+ * 2. Run your own LLM pipeline on each golden to generate outputs
+ * 3. Add the results as test cases
+ * 4. Evaluate the test cases
+ *
+ */
+declare class EvaluationDataset {
+    private _goldens;
+    private _testCases;
+    private _datasetKey;
+    private _datasetName;
+    private _version;
+    /** List of golden records (inputs with optional expected outputs). */
+    get goldens(): Golden[];
+    /** List of test cases (inputs with actual outputs from your LLM). */
+    get testCases(): LLMTestCase[];
+    /** The Fallom dataset key if pulled from Fallom. */
+    get datasetKey(): string | null;
+    /**
+     * Pull a dataset from Fallom.
+     *
+     * @param alias - The dataset key/alias in Fallom
+     * @param version - Specific version to pull (default: latest)
+     * @returns Self for chaining
+     */
+    pull(alias: string, version?: number): Promise<EvaluationDataset>;
+    /**
+     * Add a golden record manually.
+     * @param golden - A Golden object
+     * @returns Self for chaining
+     */
+    addGolden(golden: Golden): EvaluationDataset;
+    /**
+     * Add multiple golden records.
+     * @param goldens - Array of Golden objects
+     * @returns Self for chaining
+     */
+    addGoldens(goldens: Golden[]): EvaluationDataset;
+    /**
+     * Add a test case with actual LLM output.
+     * @param testCase - An LLMTestCase object
+     * @returns Self for chaining
+     */
+    addTestCase(testCase: LLMTestCase): EvaluationDataset;
+    /**
+     * Add multiple test cases.
+     * @param testCases - Array of LLMTestCase objects
+     * @returns Self for chaining
+     */
+    addTestCases(testCases: LLMTestCase[]): EvaluationDataset;
+    /**
+     * Automatically generate test cases by running all goldens through your LLM app.
+     *
+     * @param llmApp - A callable that takes messages and returns response
+     * @param options - Configuration options
+     * @returns Self for chaining
+     */
+    generateTestCases(llmApp: (messages: Message[]) => Promise<ModelResponse>, options?: {
+        includeContext?: boolean;
+    }): Promise<EvaluationDataset>;
+    /** Clear all test cases (useful for re-running with different LLM). */
+    clearTestCases(): EvaluationDataset;
+    /** Return the number of goldens. */
+    get length(): number;
+}
 
 /**
  * Fallom Evals - Run LLM evaluations locally using G-Eval with LLM as a Judge.
  *
  * Evaluate production outputs or compare different models on your dataset.
  * Results are uploaded to Fallom dashboard for visualization.
+ *
+ * @example
+ * import fallom from "@fallom/trace";
+ *
+ * // Initialize
+ * fallom.evals.init({ apiKey: "flm_xxx" });
+ *
+ * // Method 1: Direct dataset evaluation
+ * const results = await fallom.evals.evaluate({
+ *   dataset: [...],
+ *   metrics: ["answer_relevancy", "faithfulness"],
+ * });
+ *
+ * // Method 2: Use EvaluationDataset with your own LLM pipeline
+ * const dataset = new fallom.evals.EvaluationDataset();
+ * await dataset.pull("my-dataset-key");
+ *
+ * for (const golden of dataset.goldens) {
+ *   const actualOutput = await myLLMApp(golden.input);
+ *   dataset.addTestCase({
+ *     input: golden.input,
+ *     actualOutput,
+ *   });
+ * }
+ *
+ * const results = await fallom.evals.evaluate({
+ *   testCases: dataset.testCases,
+ *   metrics: ["answer_relevancy", "faithfulness"],
+ * });
  */
 
 declare const evals_AVAILABLE_METRICS: typeof AVAILABLE_METRICS;
@@ -646,6 +779,10 @@ type evals_DatasetInput = DatasetInput;
 type evals_DatasetItem = DatasetItem;
 type evals_EvalResult = EvalResult;
 type evals_EvaluateOptions = EvaluateOptions;
+type evals_EvaluationDataset = EvaluationDataset;
+declare const evals_EvaluationDataset: typeof EvaluationDataset;
+type evals_Golden = Golden;
+type evals_LLMTestCase = LLMTestCase;
 declare const evals_METRIC_PROMPTS: typeof METRIC_PROMPTS;
 type evals_Message = Message;
 type evals_MetricInput = MetricInput;
@@ -664,7 +801,7 @@ declare const evals_evaluate: typeof evaluate;
 declare const evals_getMetricName: typeof getMetricName;
 declare const evals_isCustomMetric: typeof isCustomMetric;
 declare namespace evals {
-  export { evals_AVAILABLE_METRICS as AVAILABLE_METRICS, type evals_CompareModelsOptions as CompareModelsOptions, type evals_CustomMetric as CustomMetric, evals_DEFAULT_JUDGE_MODEL as DEFAULT_JUDGE_MODEL, type evals_DatasetInput as DatasetInput, type evals_DatasetItem as DatasetItem, type evals_EvalResult as EvalResult, type evals_EvaluateOptions as EvaluateOptions, type InitOptions$1 as InitOptions, evals_METRIC_PROMPTS as METRIC_PROMPTS, type evals_Message as Message, type evals_MetricInput as MetricInput, type evals_MetricName as MetricName, type evals_Model as Model, type evals_ModelCallable as ModelCallable, type evals_ModelResponse as ModelResponse, evals_compareModels as compareModels, evals_createCustomModel as createCustomModel, evals_createModelFromCallable as createModelFromCallable, evals_createOpenAIModel as createOpenAIModel, evals_customMetric as customMetric, evals_datasetFromFallom as datasetFromFallom, evals_datasetFromTraces as datasetFromTraces, evals_evaluate as evaluate, evals_getMetricName as getMetricName, init$1 as init, evals_isCustomMetric as isCustomMetric, uploadResultsPublic as uploadResults };
+  export { evals_AVAILABLE_METRICS as AVAILABLE_METRICS, type evals_CompareModelsOptions as CompareModelsOptions, type evals_CustomMetric as CustomMetric, evals_DEFAULT_JUDGE_MODEL as DEFAULT_JUDGE_MODEL, type evals_DatasetInput as DatasetInput, type evals_DatasetItem as DatasetItem, type evals_EvalResult as EvalResult, type evals_EvaluateOptions as EvaluateOptions, evals_EvaluationDataset as EvaluationDataset, type evals_Golden as Golden, type InitOptions$1 as InitOptions, type evals_LLMTestCase as LLMTestCase, evals_METRIC_PROMPTS as METRIC_PROMPTS, type evals_Message as Message, type evals_MetricInput as MetricInput, type evals_MetricName as MetricName, type evals_Model as Model, type evals_ModelCallable as ModelCallable, type evals_ModelResponse as ModelResponse, evals_compareModels as compareModels, evals_createCustomModel as createCustomModel, evals_createModelFromCallable as createModelFromCallable, evals_createOpenAIModel as createOpenAIModel, evals_customMetric as customMetric, evals_datasetFromFallom as datasetFromFallom, evals_datasetFromTraces as datasetFromTraces, evals_evaluate as evaluate, evals_getMetricName as getMetricName, init$1 as init, evals_isCustomMetric as isCustomMetric, uploadResultsPublic as uploadResults };
 }
 
 /**
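The declarations above introduce the Golden and LLMTestCase types plus the EvaluationDataset class. As a hedged usage sketch (not shipped in the package; myLLMApp and the sample dataset key are placeholders), the declared generateTestCases() can drive a caller-supplied pipeline over pulled goldens and feed the resulting test cases straight into evaluate():

import fallom from "@fallom/trace";

// Hypothetical pipeline wrapper; it must resolve to an object with a `content`
// string, matching the declared (messages: Message[]) => Promise<ModelResponse> shape.
async function myLLMApp(messages: { role: string; content: string }[]) {
  const lastUser = messages[messages.length - 1];
  return { content: `stub answer for: ${lastUser.content}` };
}

async function main() {
  fallom.evals.init({ apiKey: "flm_xxx" });

  const dataset = new fallom.evals.EvaluationDataset();
  await dataset.pull("my-dataset-key");      // goldens pulled from Fallom
  await dataset.generateTestCases(myLLMApp); // one test case per golden

  const results = await fallom.evals.evaluate({
    testCases: dataset.testCases,
    metrics: ["answer_relevancy", "faithfulness"],
  });
  console.log(`Scored ${results.length} results`);
}

main();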
package/dist/index.d.ts
CHANGED
@@ -458,6 +458,37 @@ interface EvalResult {
     tokensOut?: number;
     cost?: number;
 }
+/**
+ * A golden record from a dataset - contains input and optionally expected output.
+ * This represents a "golden" test case that you can use to generate actual outputs
+ * from your LLM pipeline and then evaluate.
+ */
+interface Golden {
+    input: string;
+    expectedOutput?: string;
+    systemMessage?: string;
+    /** Retrieved documents for RAG evaluation */
+    context?: string[];
+    metadata?: Record<string, unknown>;
+}
+/**
+ * A test case for evaluation - contains input and actual output from your LLM.
+ *
+ */
+interface LLMTestCase {
+    /** The user input/query */
+    input: string;
+    /** The output generated by your LLM pipeline */
+    actualOutput: string;
+    /** (Optional) The expected/golden output for comparison */
+    expectedOutput?: string;
+    /** (Optional) System prompt used */
+    systemMessage?: string;
+    /** (Optional) Retrieved documents for RAG faithfulness evaluation */
+    context?: string[];
+    /** (Optional) Additional metadata */
+    metadata?: Record<string, unknown>;
+}
 /** Response format from model calls */
 interface ModelResponse {
     content: string;
@@ -487,13 +518,16 @@ interface InitOptions$1 {
 }
 /** Options for evaluate() */
 interface EvaluateOptions {
-    dataset
+    /** Dataset to evaluate (list of DatasetItem or Fallom dataset key) */
+    dataset?: DatasetInput;
     /** List of metrics to run (built-in or custom). Default: all built-in metrics */
     metrics?: MetricInput[];
     judgeModel?: string;
     name?: string;
     description?: string;
     verbose?: boolean;
+    /** Alternative to dataset - provide test cases from EvaluationDataset */
+    testCases?: LLMTestCase[];
     _skipUpload?: boolean;
 }
 /** Options for compareModels() */
@@ -532,9 +566,10 @@ declare const DEFAULT_JUDGE_MODEL = "openai/gpt-4o-mini";
  */
 declare function init$1(options?: InitOptions$1): void;
 /**
- * Evaluate
+ * Evaluate outputs against specified metrics using G-Eval.
  *
  * Results are automatically uploaded to Fallom dashboard.
+ *
  */
 declare function evaluate(options: EvaluateOptions): Promise<EvalResult[]>;
 /**
@@ -630,12 +665,110 @@ declare function datasetFromFallom(datasetKey: string, version?: number, config?
     _baseUrl?: string;
     _initialized?: boolean;
 }): Promise<DatasetItem[]>;
+/**
+ * A dataset for evaluation that supports pulling from Fallom and adding test cases.
+ *
+ * This provides a workflow where you:
+ * 1. Pull a dataset (goldens) from Fallom
+ * 2. Run your own LLM pipeline on each golden to generate outputs
+ * 3. Add the results as test cases
+ * 4. Evaluate the test cases
+ *
+ */
+declare class EvaluationDataset {
+    private _goldens;
+    private _testCases;
+    private _datasetKey;
+    private _datasetName;
+    private _version;
+    /** List of golden records (inputs with optional expected outputs). */
+    get goldens(): Golden[];
+    /** List of test cases (inputs with actual outputs from your LLM). */
+    get testCases(): LLMTestCase[];
+    /** The Fallom dataset key if pulled from Fallom. */
+    get datasetKey(): string | null;
+    /**
+     * Pull a dataset from Fallom.
+     *
+     * @param alias - The dataset key/alias in Fallom
+     * @param version - Specific version to pull (default: latest)
+     * @returns Self for chaining
+     */
+    pull(alias: string, version?: number): Promise<EvaluationDataset>;
+    /**
+     * Add a golden record manually.
+     * @param golden - A Golden object
+     * @returns Self for chaining
+     */
+    addGolden(golden: Golden): EvaluationDataset;
+    /**
+     * Add multiple golden records.
+     * @param goldens - Array of Golden objects
+     * @returns Self for chaining
+     */
+    addGoldens(goldens: Golden[]): EvaluationDataset;
+    /**
+     * Add a test case with actual LLM output.
+     * @param testCase - An LLMTestCase object
+     * @returns Self for chaining
+     */
+    addTestCase(testCase: LLMTestCase): EvaluationDataset;
+    /**
+     * Add multiple test cases.
+     * @param testCases - Array of LLMTestCase objects
+     * @returns Self for chaining
+     */
+    addTestCases(testCases: LLMTestCase[]): EvaluationDataset;
+    /**
+     * Automatically generate test cases by running all goldens through your LLM app.
+     *
+     * @param llmApp - A callable that takes messages and returns response
+     * @param options - Configuration options
+     * @returns Self for chaining
+     */
+    generateTestCases(llmApp: (messages: Message[]) => Promise<ModelResponse>, options?: {
+        includeContext?: boolean;
+    }): Promise<EvaluationDataset>;
+    /** Clear all test cases (useful for re-running with different LLM). */
+    clearTestCases(): EvaluationDataset;
+    /** Return the number of goldens. */
+    get length(): number;
+}
 
 /**
  * Fallom Evals - Run LLM evaluations locally using G-Eval with LLM as a Judge.
  *
  * Evaluate production outputs or compare different models on your dataset.
  * Results are uploaded to Fallom dashboard for visualization.
+ *
+ * @example
+ * import fallom from "@fallom/trace";
+ *
+ * // Initialize
+ * fallom.evals.init({ apiKey: "flm_xxx" });
+ *
+ * // Method 1: Direct dataset evaluation
+ * const results = await fallom.evals.evaluate({
+ *   dataset: [...],
+ *   metrics: ["answer_relevancy", "faithfulness"],
+ * });
+ *
+ * // Method 2: Use EvaluationDataset with your own LLM pipeline
+ * const dataset = new fallom.evals.EvaluationDataset();
+ * await dataset.pull("my-dataset-key");
+ *
+ * for (const golden of dataset.goldens) {
+ *   const actualOutput = await myLLMApp(golden.input);
+ *   dataset.addTestCase({
+ *     input: golden.input,
+ *     actualOutput,
+ *   });
+ * }
+ *
+ * const results = await fallom.evals.evaluate({
+ *   testCases: dataset.testCases,
+ *   metrics: ["answer_relevancy", "faithfulness"],
+ * });
  */
 
 declare const evals_AVAILABLE_METRICS: typeof AVAILABLE_METRICS;
@@ -646,6 +779,10 @@ type evals_DatasetInput = DatasetInput;
 type evals_DatasetItem = DatasetItem;
 type evals_EvalResult = EvalResult;
 type evals_EvaluateOptions = EvaluateOptions;
+type evals_EvaluationDataset = EvaluationDataset;
+declare const evals_EvaluationDataset: typeof EvaluationDataset;
+type evals_Golden = Golden;
+type evals_LLMTestCase = LLMTestCase;
 declare const evals_METRIC_PROMPTS: typeof METRIC_PROMPTS;
 type evals_Message = Message;
 type evals_MetricInput = MetricInput;
@@ -664,7 +801,7 @@ declare const evals_evaluate: typeof evaluate;
 declare const evals_getMetricName: typeof getMetricName;
 declare const evals_isCustomMetric: typeof isCustomMetric;
 declare namespace evals {
-  export { evals_AVAILABLE_METRICS as AVAILABLE_METRICS, type evals_CompareModelsOptions as CompareModelsOptions, type evals_CustomMetric as CustomMetric, evals_DEFAULT_JUDGE_MODEL as DEFAULT_JUDGE_MODEL, type evals_DatasetInput as DatasetInput, type evals_DatasetItem as DatasetItem, type evals_EvalResult as EvalResult, type evals_EvaluateOptions as EvaluateOptions, type InitOptions$1 as InitOptions, evals_METRIC_PROMPTS as METRIC_PROMPTS, type evals_Message as Message, type evals_MetricInput as MetricInput, type evals_MetricName as MetricName, type evals_Model as Model, type evals_ModelCallable as ModelCallable, type evals_ModelResponse as ModelResponse, evals_compareModels as compareModels, evals_createCustomModel as createCustomModel, evals_createModelFromCallable as createModelFromCallable, evals_createOpenAIModel as createOpenAIModel, evals_customMetric as customMetric, evals_datasetFromFallom as datasetFromFallom, evals_datasetFromTraces as datasetFromTraces, evals_evaluate as evaluate, evals_getMetricName as getMetricName, init$1 as init, evals_isCustomMetric as isCustomMetric, uploadResultsPublic as uploadResults };
+  export { evals_AVAILABLE_METRICS as AVAILABLE_METRICS, type evals_CompareModelsOptions as CompareModelsOptions, type evals_CustomMetric as CustomMetric, evals_DEFAULT_JUDGE_MODEL as DEFAULT_JUDGE_MODEL, type evals_DatasetInput as DatasetInput, type evals_DatasetItem as DatasetItem, type evals_EvalResult as EvalResult, type evals_EvaluateOptions as EvaluateOptions, evals_EvaluationDataset as EvaluationDataset, type evals_Golden as Golden, type InitOptions$1 as InitOptions, type evals_LLMTestCase as LLMTestCase, evals_METRIC_PROMPTS as METRIC_PROMPTS, type evals_Message as Message, type evals_MetricInput as MetricInput, type evals_MetricName as MetricName, type evals_Model as Model, type evals_ModelCallable as ModelCallable, type evals_ModelResponse as ModelResponse, evals_compareModels as compareModels, evals_createCustomModel as createCustomModel, evals_createModelFromCallable as createModelFromCallable, evals_createOpenAIModel as createOpenAIModel, evals_customMetric as customMetric, evals_datasetFromFallom as datasetFromFallom, evals_datasetFromTraces as datasetFromTraces, evals_evaluate as evaluate, evals_getMetricName as getMetricName, init$1 as init, evals_isCustomMetric as isCustomMetric, uploadResultsPublic as uploadResults };
 }
 
 /**
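index.d.ts mirrors the same additions for the CommonJS type entry point. A second hedged sketch, assuming only the declarations above (the sample policy text is made up): goldens can also be assembled locally with addGoldens() instead of pull(), and the optional context field carries retrieved documents for the RAG-oriented faithfulness metric.

import fallom from "@fallom/trace";

async function main() {
  fallom.evals.init({ apiKey: "flm_xxx" });

  const dataset = new fallom.evals.EvaluationDataset();

  // Local goldens, no Fallom pull; `context` holds the retrieved documents.
  dataset.addGoldens([
    {
      input: "What is the refund window?",
      expectedOutput: "30 days from delivery.",
      context: ["Policy: refunds are accepted within 30 days of delivery."],
    },
  ]);

  // Stand-in for a real LLM call over each golden.
  for (const golden of dataset.goldens) {
    dataset.addTestCase({
      input: golden.input,
      actualOutput: "Refunds are accepted within 30 days of delivery.",
      expectedOutput: golden.expectedOutput,
      context: golden.context,
    });
  }

  const results = await fallom.evals.evaluate({
    testCases: dataset.testCases,
    metrics: ["faithfulness"],
  });
  console.log(results);
}

main();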
package/dist/index.js
CHANGED
@@ -590,9 +590,159 @@ async function datasetFromFallom(datasetKey, version, config) {
   );
   return items;
 }
+var EvaluationDataset;
 var init_helpers = __esm({
   "src/evals/helpers.ts"() {
     "use strict";
+    EvaluationDataset = class {
+      constructor() {
+        this._goldens = [];
+        this._testCases = [];
+        this._datasetKey = null;
+        this._datasetName = null;
+        this._version = null;
+      }
+      /** List of golden records (inputs with optional expected outputs). */
+      get goldens() {
+        return this._goldens;
+      }
+      /** List of test cases (inputs with actual outputs from your LLM). */
+      get testCases() {
+        return this._testCases;
+      }
+      /** The Fallom dataset key if pulled from Fallom. */
+      get datasetKey() {
+        return this._datasetKey;
+      }
+      /**
+       * Pull a dataset from Fallom.
+       *
+       * @param alias - The dataset key/alias in Fallom
+       * @param version - Specific version to pull (default: latest)
+       * @returns Self for chaining
+       */
+      async pull(alias, version) {
+        const { _apiKey: _apiKey2, _baseUrl: _baseUrl2, _initialized: _initialized2 } = await Promise.resolve().then(() => (init_core(), core_exports));
+        if (!_initialized2) {
+          throw new Error("Fallom evals not initialized. Call evals.init() first.");
+        }
+        const params = new URLSearchParams({ include_entries: "true" });
+        if (version !== void 0) {
+          params.set("version", String(version));
+        }
+        const url = `${_baseUrl2}/api/datasets/${encodeURIComponent(alias)}?${params}`;
+        const response = await fetch(url, {
+          headers: {
+            Authorization: `Bearer ${_apiKey2}`,
+            "Content-Type": "application/json"
+          }
+        });
+        if (response.status === 404) {
+          throw new Error(`Dataset '${alias}' not found`);
+        } else if (response.status === 403) {
+          throw new Error(`Access denied to dataset '${alias}'`);
+        }
+        if (!response.ok) {
+          throw new Error(`Failed to fetch dataset: ${response.statusText}`);
+        }
+        const data = await response.json();
+        this._datasetKey = alias;
+        this._datasetName = data.dataset?.name || alias;
+        this._version = data.version?.version || null;
+        this._goldens = [];
+        for (const entry of data.entries || []) {
+          this._goldens.push({
+            input: entry.input || "",
+            expectedOutput: entry.output,
+            systemMessage: entry.systemMessage,
+            metadata: entry.metadata
+          });
+        }
+        console.log(
+          `\u2713 Pulled dataset '${this._datasetName}' (version ${this._version}) with ${this._goldens.length} goldens`
+        );
+        return this;
+      }
+      /**
+       * Add a golden record manually.
+       * @param golden - A Golden object
+       * @returns Self for chaining
+       */
+      addGolden(golden) {
+        this._goldens.push(golden);
+        return this;
+      }
+      /**
+       * Add multiple golden records.
+       * @param goldens - Array of Golden objects
+       * @returns Self for chaining
+       */
+      addGoldens(goldens) {
+        this._goldens.push(...goldens);
+        return this;
+      }
+      /**
+       * Add a test case with actual LLM output.
+       * @param testCase - An LLMTestCase object
+       * @returns Self for chaining
+       */
+      addTestCase(testCase) {
+        this._testCases.push(testCase);
+        return this;
+      }
+      /**
+       * Add multiple test cases.
+       * @param testCases - Array of LLMTestCase objects
+       * @returns Self for chaining
+       */
+      addTestCases(testCases) {
+        this._testCases.push(...testCases);
+        return this;
+      }
+      /**
+       * Automatically generate test cases by running all goldens through your LLM app.
+       *
+       * @param llmApp - A callable that takes messages and returns response
+       * @param options - Configuration options
+       * @returns Self for chaining
+       */
+      async generateTestCases(llmApp, options = {}) {
+        const { includeContext = false } = options;
+        console.log(`Generating test cases for ${this._goldens.length} goldens...`);
+        for (let i = 0; i < this._goldens.length; i++) {
+          const golden = this._goldens[i];
+          const messages = [];
+          if (golden.systemMessage) {
+            messages.push({ role: "system", content: golden.systemMessage });
+          }
+          messages.push({ role: "user", content: golden.input });
+          const response = await llmApp(messages);
+          const testCase = {
+            input: golden.input,
+            actualOutput: response.content,
+            expectedOutput: golden.expectedOutput,
+            systemMessage: golden.systemMessage,
+            context: includeContext ? response.context : golden.context,
+            metadata: golden.metadata
+          };
+          this._testCases.push(testCase);
+          console.log(
+            ` [${i + 1}/${this._goldens.length}] Generated output for: ${golden.input.slice(0, 50)}...`
+          );
+        }
+        console.log(`\u2713 Generated ${this._testCases.length} test cases`);
+        return this;
+      }
+      /** Clear all test cases (useful for re-running with different LLM). */
+      clearTestCases() {
+        this._testCases = [];
+        return this;
+      }
+      /** Return the number of goldens. */
+      get length() {
+        return this._goldens.length;
+      }
+    };
   }
 });
 
@@ -707,9 +857,22 @@ async function evaluate(options) {
     name,
     description,
     verbose = true,
+    testCases,
     _skipUpload = false
   } = options;
-
+  let dataset;
+  if (testCases !== void 0 && testCases.length > 0) {
+    dataset = testCases.map((tc) => ({
+      input: tc.input,
+      output: tc.actualOutput,
+      systemMessage: tc.systemMessage,
+      metadata: tc.metadata
+    }));
+  } else if (datasetInput !== void 0) {
+    dataset = await resolveDataset(datasetInput);
+  } else {
+    throw new Error("Either 'dataset' or 'testCases' must be provided");
+  }
   for (const m of metrics) {
     if (typeof m === "string" && !AVAILABLE_METRICS.includes(m)) {
       throw new Error(
@@ -775,6 +938,9 @@ async function compareModels(options) {
     description,
     verbose = true
   } = options;
+  if (!datasetInput) {
+    throw new Error("'dataset' is required for compareModels()");
+  }
   const dataset = await resolveDataset(datasetInput);
   const results = {};
   if (includeProduction) {
@@ -1035,7 +1201,7 @@ var import_exporter_trace_otlp_http = require("@opentelemetry/exporter-trace-otlp-http");
 // node_modules/@opentelemetry/resources/build/esm/Resource.js
 var import_api = require("@opentelemetry/api");
 
-// node_modules/@opentelemetry/
+// node_modules/@opentelemetry/semantic-conventions/build/esm/resource/SemanticResourceAttributes.js
 var SemanticResourceAttributes = {
   /**
    * Name of the cloud provider.
@@ -3543,6 +3709,7 @@ var evals_exports = {};
 __export(evals_exports, {
   AVAILABLE_METRICS: () => AVAILABLE_METRICS,
   DEFAULT_JUDGE_MODEL: () => DEFAULT_JUDGE_MODEL,
+  EvaluationDataset: () => EvaluationDataset,
   METRIC_PROMPTS: () => METRIC_PROMPTS,
   compareModels: () => compareModels,
   createCustomModel: () => createCustomModel,
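At runtime, the change above maps each LLMTestCase onto the existing dataset item shape (actualOutput becomes output) and throws if neither dataset nor testCases is supplied, while compareModels() now rejects a missing dataset explicitly. A minimal hedged sketch of the two evaluate() call shapes, with placeholder inputs and the dataset-key form taken from the doc comment ("list of DatasetItem or Fallom dataset key"):

import fallom from "@fallom/trace";

async function main() {
  fallom.evals.init({ apiKey: "flm_xxx" });

  // Shape 1: pre-generated outputs passed as test cases
  // (mapped internally to dataset items, actualOutput -> output).
  await fallom.evals.evaluate({
    testCases: [
      { input: "Summarize our SLA.", actualOutput: "99.9% uptime with 24h support response." },
    ],
    metrics: ["answer_relevancy"],
  });

  // Shape 2: a Fallom dataset key, resolved through resolveDataset() internally.
  await fallom.evals.evaluate({
    dataset: "my-dataset-key",
    metrics: ["answer_relevancy"],
  });

  // Passing neither option now rejects with
  // "Either 'dataset' or 'testCases' must be provided".
}

main();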
package/dist/index.mjs
CHANGED
@@ -5,6 +5,7 @@ import {
 import {
   AVAILABLE_METRICS,
   DEFAULT_JUDGE_MODEL,
+  EvaluationDataset,
   METRIC_PROMPTS,
   compareModels,
   createCustomModel,
@@ -18,7 +19,7 @@ import {
   init as init2,
   isCustomMetric,
   uploadResultsPublic
-} from "./chunk-
+} from "./chunk-3HBKT4HK.mjs";
 import {
   __export
 } from "./chunk-7P6ASYW6.mjs";
@@ -40,7 +41,7 @@ import { OTLPTraceExporter } from "@opentelemetry/exporter-trace-otlp-http";
 // node_modules/@opentelemetry/resources/build/esm/Resource.js
 import { diag } from "@opentelemetry/api";
 
-// node_modules/@opentelemetry/
+// node_modules/@opentelemetry/semantic-conventions/build/esm/resource/SemanticResourceAttributes.js
 var SemanticResourceAttributes = {
   /**
    * Name of the cloud provider.
@@ -2545,6 +2546,7 @@ var evals_exports = {};
 __export(evals_exports, {
   AVAILABLE_METRICS: () => AVAILABLE_METRICS,
   DEFAULT_JUDGE_MODEL: () => DEFAULT_JUDGE_MODEL,
+  EvaluationDataset: () => EvaluationDataset,
   METRIC_PROMPTS: () => METRIC_PROMPTS,
   compareModels: () => compareModels,
   createCustomModel: () => createCustomModel,
package/package.json
CHANGED