@fallom/trace 0.2.10 → 0.2.13
This diff shows the changes between publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
- package/dist/chunk-2NGJF2JZ.mjs +661 -0
- package/dist/chunk-7P6ASYW6.mjs +9 -0
- package/dist/chunk-CCZLSKZ7.mjs +305 -0
- package/dist/core-46Z4Q54J.mjs +21 -0
- package/dist/index.d.mts +103 -33
- package/dist/index.d.ts +103 -33
- package/dist/index.js +1815 -1385
- package/dist/index.mjs +387 -610
- package/dist/models-NKYYGMSR.mjs +9 -0
- package/package.json +1 -1
package/dist/index.d.ts
CHANGED
@@ -394,22 +394,35 @@ declare namespace prompts {
 }
 
 /**
- *
- *
- * Evaluate production outputs or compare different models on your dataset.
- * Results are uploaded to Fallom dashboard for visualization.
- *
+ * Type definitions for Fallom Evals.
  */
+/** Built-in metric names */
 type MetricName = "answer_relevancy" | "hallucination" | "toxicity" | "faithfulness" | "completeness";
+/** List of all available built-in metrics */
 declare const AVAILABLE_METRICS: MetricName[];
+/**
+ * Define a custom evaluation metric using G-Eval.
+ */
+interface CustomMetric {
+    /** Unique identifier for the metric (e.g., "brand_alignment") */
+    name: string;
+    /** Description of what the metric evaluates */
+    criteria: string;
+    /** List of evaluation steps for the LLM judge to follow */
+    steps: string[];
+}
+/** Metric can be a built-in name or a custom metric */
+type MetricInput = MetricName | CustomMetric;
 /** Dataset can be a list of items OR a string (dataset key to fetch from Fallom) */
 type DatasetInput = DatasetItem[] | string;
+/** A single item in an evaluation dataset */
 interface DatasetItem {
     input: string;
     output: string;
     systemMessage?: string;
     metadata?: Record<string, unknown>;
 }
+/** Evaluation result for a single item */
 interface EvalResult {
     input: string;
     output: string;
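
The hunk above introduces CustomMetric, MetricInput, and the DatasetInput/DatasetItem shapes. A minimal sketch of how they compose, written against these declarations only (the dataset contents and the custom metric body are invented, and the import assumes the evals namespace export that appears at the end of this diff):

    import { evals } from "@fallom/trace";

    // A dataset is either inline items or the key of a dataset stored in Fallom.
    const inline: evals.DatasetItem[] = [
      { input: "Where is my order?", output: "It shipped yesterday.", systemMessage: "You are a support agent." },
    ];
    const byKey: evals.DatasetInput = "customer-support-qa";

    // A metric is either a built-in name or a CustomMetric object literal.
    const metrics: evals.MetricInput[] = [
      "faithfulness",
      { name: "brand_alignment", criteria: "Output matches the brand voice.", steps: ["Identify the tone.", "Score alignment."] },
    ];
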
@@ -449,19 +462,23 @@ interface Model {
     name: string;
     callFn?: ModelCallable;
 }
+/** Options for init() */
 interface InitOptions$1 {
     apiKey?: string;
     baseUrl?: string;
 }
+/** Options for evaluate() */
 interface EvaluateOptions {
     dataset: DatasetInput;
-    metrics
+    /** List of metrics to run (built-in or custom). Default: all built-in metrics */
+    metrics?: MetricInput[];
     judgeModel?: string;
     name?: string;
     description?: string;
     verbose?: boolean;
     _skipUpload?: boolean;
 }
+/** Options for compareModels() */
 interface CompareModelsOptions extends EvaluateOptions {
     /**
      * List of models to test. Each can be:
@@ -472,31 +489,72 @@ interface CompareModelsOptions extends EvaluateOptions {
     includeProduction?: boolean;
     modelKwargs?: Record<string, unknown>;
 }
+/** Type guard to check if a metric is a CustomMetric */
+declare function isCustomMetric(metric: MetricInput): metric is CustomMetric;
+/** Get the name of a metric (works for both built-in and custom) */
+declare function getMetricName(metric: MetricInput): string;
+
+/**
+ * G-Eval prompts for each metric.
+ */
+
+/** G-Eval prompts for each built-in metric */
+declare const METRIC_PROMPTS: Record<MetricName, {
+    criteria: string;
+    steps: string[];
+}>;
+
+/**
+ * Core evaluation functions.
+ */
+
+declare const DEFAULT_JUDGE_MODEL = "openai/gpt-4o-mini";
 /**
  * Initialize Fallom evals.
  */
 declare function init$1(options?: InitOptions$1): void;
 /**
  * Evaluate production outputs against specified metrics using G-Eval.
+ *
  * Results are automatically uploaded to Fallom dashboard.
  */
 declare function evaluate(options: EvaluateOptions): Promise<EvalResult[]>;
 /**
- *
+ * Compare multiple models on the same dataset.
+ *
+ * Results are automatically uploaded to Fallom dashboard.
+ */
+declare function compareModels(options: CompareModelsOptions): Promise<Record<string, EvalResult[]>>;
+/**
+ * Public function to upload results manually.
+ */
+declare function uploadResultsPublic(results: EvalResult[] | Record<string, EvalResult[]>, options: {
+    name: string;
+    description?: string;
+    judgeModel?: string;
+}): Promise<string>;
+
+/**
+ * Helper functions for creating models and datasets.
+ */
+
+/**
+ * Create a Model using OpenAI directly (for fine-tuned models or Azure OpenAI).
  *
  * @param modelId - The OpenAI model ID (e.g., "gpt-4o" or "ft:gpt-4o-2024-08-06:org::id")
  * @param options - Configuration options
- * @returns
+ * @returns Model instance that can be used in compareModels()
  */
 declare function createOpenAIModel(modelId: string, options?: {
     name?: string;
     apiKey?: string;
-
+    baseUrl?: string;
     temperature?: number;
     maxTokens?: number;
 }): Model;
 /**
  * Create a Model for any OpenAI-compatible API endpoint.
+ *
  * Works with self-hosted models (vLLM, Ollama, LMStudio, etc.), custom endpoints,
  * or any service that follows the OpenAI chat completions API format.
  *
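
The hunk above adds compareModels, uploadResultsPublic (re-exported later as uploadResults), the DEFAULT_JUDGE_MODEL constant, and a metrics option that now accepts MetricInput[]. A usage sketch against these declarations, inside an async context (the dataset item and run name are invented; the import assumes the evals namespace export at the end of this diff):

    import { evals } from "@fallom/trace";

    evals.init(); // apiKey and baseUrl are both optional per InitOptions

    const results = await evals.evaluate({
      dataset: [{ input: "What does Fallom trace?", output: "LLM calls." }],
      metrics: ["answer_relevancy", "hallucination"],
      judgeModel: evals.DEFAULT_JUDGE_MODEL, // "openai/gpt-4o-mini"
      _skipUpload: true, // defer the automatic dashboard upload...
    });

    // ...then push the results manually via the new public uploader.
    const uploaded: string = await evals.uploadResults(results, {
      name: "nightly-eval",
      description: "manual upload example",
    });
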
@@ -510,12 +568,13 @@ declare function createCustomModel(name: string, options: {
     headers?: Record<string, string>;
     modelField?: string;
     modelValue?: string;
-
-    maxTokens?: number;
+    extraParams?: Record<string, unknown>;
 }): Model;
 /**
  * Create a Model from any callable function.
- *
+ *
+ * This is the most flexible option - you provide a function that takes
+ * messages and returns a response.
  *
  * @param name - Display name for the model
  * @param callFn - Function that takes messages and returns a response
@@ -523,17 +582,19 @@ declare function createCustomModel(name: string, options: {
  */
 declare function createModelFromCallable(name: string, callFn: ModelCallable): Model;
 /**
- *
-
-
-
- *
- *
- * so this is only needed for custom scenarios.
+ * Create a custom evaluation metric using G-Eval.
+ *
+ * @param name - Unique identifier for the metric (e.g., "brand_alignment")
+ * @param criteria - Description of what the metric evaluates
+ * @param steps - List of evaluation steps for the LLM judge to follow
+ * @returns A CustomMetric instance
  */
-declare function
+declare function customMetric(name: string, criteria: string, steps: string[]): CustomMetric;
 /**
  * Create a dataset from Fallom trace data.
+ *
+ * @param traces - List of trace objects with attributes
+ * @returns List of DatasetItem ready for evaluation
  */
 declare function datasetFromTraces(traces: Array<{
     attributes?: Record<string, unknown>;
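
The customMetric helper declared above has a fully visible signature and pairs with the isCustomMetric/getMetricName helpers from earlier in this diff. A sketch (the metric name, criteria, and steps are invented; the import assumes the evals namespace export):

    import { evals } from "@fallom/trace";

    const brandAlignment = evals.customMetric(
      "brand_alignment",                            // unique identifier
      "Output matches the documented brand voice.", // criteria for the LLM judge
      ["Identify the tone of the output.", "Compare it to the brand voice.", "Penalize off-brand phrasing."],
    );

    evals.isCustomMetric(brandAlignment); // true - narrows the type to CustomMetric
    evals.getMetricName(brandAlignment);  // "brand_alignment"
    evals.getMetricName("toxicity");      // built-in names pass through unchanged
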
@@ -542,27 +603,34 @@ declare function datasetFromTraces(traces: Array<{
  * Fetch a dataset stored in Fallom by its key.
  *
  * @param datasetKey - The unique key of the dataset (e.g., "customer-support-qa")
- * @param version - Specific version number to fetch. If undefined, fetches
+ * @param version - Specific version number to fetch. If undefined, fetches latest.
+ * @param config - Internal config (api key, base url, initialized flag)
  * @returns List of DatasetItem ready for evaluation
  */
-declare function datasetFromFallom(datasetKey: string, version?: number
-
-
-
-
-
-
-
-
-
+declare function datasetFromFallom(datasetKey: string, version?: number, config?: {
+    _apiKey?: string | null;
+    _baseUrl?: string;
+    _initialized?: boolean;
+}): Promise<DatasetItem[]>;
+
+/**
+ * Fallom Evals - Run LLM evaluations locally using G-Eval with LLM as a Judge.
+ *
+ * Evaluate production outputs or compare different models on your dataset.
+ * Results are uploaded to Fallom dashboard for visualization.
+ */
 
 declare const evals_AVAILABLE_METRICS: typeof AVAILABLE_METRICS;
 type evals_CompareModelsOptions = CompareModelsOptions;
+type evals_CustomMetric = CustomMetric;
+declare const evals_DEFAULT_JUDGE_MODEL: typeof DEFAULT_JUDGE_MODEL;
 type evals_DatasetInput = DatasetInput;
 type evals_DatasetItem = DatasetItem;
 type evals_EvalResult = EvalResult;
 type evals_EvaluateOptions = EvaluateOptions;
+declare const evals_METRIC_PROMPTS: typeof METRIC_PROMPTS;
 type evals_Message = Message;
+type evals_MetricInput = MetricInput;
 type evals_MetricName = MetricName;
 type evals_Model = Model;
 type evals_ModelCallable = ModelCallable;
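
The reworked datasetFromFallom above feeds straight into evaluate. A sketch in an async context, reusing the "customer-support-qa" key from the JSDoc example (the new config parameter is documented as internal, so it is omitted; the import assumes the evals namespace export):

    import { evals } from "@fallom/trace";

    evals.init();

    // Omit the version argument to fetch the latest, or pin one explicitly.
    const dataset = await evals.datasetFromFallom("customer-support-qa", 3);
    const results = await evals.evaluate({ dataset, metrics: ["completeness"] });
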
@@ -571,12 +639,14 @@ declare const evals_compareModels: typeof compareModels;
 declare const evals_createCustomModel: typeof createCustomModel;
 declare const evals_createModelFromCallable: typeof createModelFromCallable;
 declare const evals_createOpenAIModel: typeof createOpenAIModel;
+declare const evals_customMetric: typeof customMetric;
 declare const evals_datasetFromFallom: typeof datasetFromFallom;
 declare const evals_datasetFromTraces: typeof datasetFromTraces;
 declare const evals_evaluate: typeof evaluate;
-declare const
+declare const evals_getMetricName: typeof getMetricName;
+declare const evals_isCustomMetric: typeof isCustomMetric;
 declare namespace evals {
-    export { evals_AVAILABLE_METRICS as AVAILABLE_METRICS, type evals_CompareModelsOptions as CompareModelsOptions, type evals_DatasetInput as DatasetInput, type evals_DatasetItem as DatasetItem, type evals_EvalResult as EvalResult, type evals_EvaluateOptions as EvaluateOptions, type InitOptions$1 as InitOptions, type evals_Message as Message, type evals_MetricName as MetricName, type evals_Model as Model, type evals_ModelCallable as ModelCallable, type evals_ModelResponse as ModelResponse, evals_compareModels as compareModels, evals_createCustomModel as createCustomModel, evals_createModelFromCallable as createModelFromCallable, evals_createOpenAIModel as createOpenAIModel, evals_datasetFromFallom as datasetFromFallom, evals_datasetFromTraces as datasetFromTraces,
+    export { evals_AVAILABLE_METRICS as AVAILABLE_METRICS, type evals_CompareModelsOptions as CompareModelsOptions, type evals_CustomMetric as CustomMetric, evals_DEFAULT_JUDGE_MODEL as DEFAULT_JUDGE_MODEL, type evals_DatasetInput as DatasetInput, type evals_DatasetItem as DatasetItem, type evals_EvalResult as EvalResult, type evals_EvaluateOptions as EvaluateOptions, type InitOptions$1 as InitOptions, evals_METRIC_PROMPTS as METRIC_PROMPTS, type evals_Message as Message, type evals_MetricInput as MetricInput, type evals_MetricName as MetricName, type evals_Model as Model, type evals_ModelCallable as ModelCallable, type evals_ModelResponse as ModelResponse, evals_compareModels as compareModels, evals_createCustomModel as createCustomModel, evals_createModelFromCallable as createModelFromCallable, evals_createOpenAIModel as createOpenAIModel, evals_customMetric as customMetric, evals_datasetFromFallom as datasetFromFallom, evals_datasetFromTraces as datasetFromTraces, evals_evaluate as evaluate, evals_getMetricName as getMetricName, init$1 as init, evals_isCustomMetric as isCustomMetric, uploadResultsPublic as uploadResults };
 }
 
 /**