npm - @fallom/trace - Versions diffs - 0.2.10 → 0.2.13 - Mend

@fallom/trace 0.2.10 → 0.2.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10) hide show

package/dist/index.mjs CHANGED Viewed

@@ -1,14 +1,33 @@
 import {
-  __export,
   init,
   models_exports
-} from "./chunk-KFD5AQ7V.mjs";
+} from "./chunk-CCZLSKZ7.mjs";
+import {
+  AVAILABLE_METRICS,
+  DEFAULT_JUDGE_MODEL,
+  METRIC_PROMPTS,
+  compareModels,
+  createCustomModel,
+  createModelFromCallable,
+  createOpenAIModel,
+  customMetric,
+  datasetFromFallom,
+  datasetFromTraces,
+  evaluate,
+  getMetricName,
+  init as init2,
+  isCustomMetric,
+  uploadResultsPublic
+} from "./chunk-2NGJF2JZ.mjs";
+import {
+  __export
+} from "./chunk-7P6ASYW6.mjs";
 // src/trace.ts
 var trace_exports = {};
 __export(trace_exports, {
   FallomSession: () => FallomSession,
-  init: () => init2,
+  init: () => init3,
   session: () => session,
   shutdown: () => shutdown
 });
@@ -714,7 +733,7 @@ async function tryAddInstrumentation(instrumentations, pkg, className) {
     log(`   \u274C ${pkg} not installed`);
   }
 }
-async function init2(options = {}) {
+async function init3(options = {}) {
   if (initialized) return;
   debugMode = options.debug ?? false;
   log("\u{1F680} Initializing Fallom tracing...");
@@ -803,7 +822,7 @@ __export(prompts_exports, {
   get: () => get,
   getAB: () => getAB,
   getPromptContext: () => getPromptContext,
-  init: () => init3
+  init: () => init4
 });
 import { createHash } from "crypto";
 var apiKey2 = null;
@@ -820,7 +839,7 @@ function log2(msg) {
     console.log(`[Fallom Prompts] ${msg}`);
   }
 }
-function init3(options = {}) {
+function init4(options = {}) {
   apiKey2 = options.apiKey || process.env.FALLOM_API_KEY || null;
   baseUrl2 = options.baseUrl || process.env.FALLOM_PROMPTS_URL || process.env.FALLOM_BASE_URL || "https://prompts.fallom.com";
   initialized2 = true;
@@ -840,7 +859,7 @@ function init3(options = {}) {
 function ensureInit() {
   if (!initialized2) {
     try {
-      init3();
+      init4();
     } catch {
     }
   }
@@ -1083,6 +1102,22 @@ function wrapOpenAI(client, sessionCtx) {
       if (response?.usage) {
         attributes["fallom.raw.usage"] = JSON.stringify(response.usage);
       }
+      const waterfallTimings = {
+        requestStart: 0,
+        requestEnd: endTime - startTime,
+        responseEnd: endTime - startTime,
+        totalDurationMs: endTime - startTime,
+        // OpenAI tool calls (if present)
+        toolCalls: response?.choices?.[0]?.message?.tool_calls?.map(
+          (tc, idx) => ({
+            id: tc.id,
+            name: tc.function?.name,
+            callTime: 0
+            // All tool calls happen at once in non-streaming
+          })
+        )
+      };
+      attributes["fallom.raw.timings"] = JSON.stringify(waterfallTimings);
       const promptCtx = getPromptContext();
       sendTrace({
         config_key: ctx.configKey,
@@ -1168,7 +1203,7 @@ function wrapAnthropic(client, sessionCtx) {
         });
         const contentBlocks = response?.content || [];
         const textBlocks = contentBlocks.filter((b) => b.type === "text");
-        const toolUseBlocks = contentBlocks.filter(
+        const toolUseBlocks2 = contentBlocks.filter(
           (b) => b.type === "tool_use"
         );
         attributes["fallom.raw.response"] = JSON.stringify({
@@ -1177,7 +1212,7 @@ function wrapAnthropic(client, sessionCtx) {
           responseId: response?.id,
           model: response?.model,
           // Tool calls - Anthropic uses tool_use content blocks
-          toolCalls: toolUseBlocks.map((b) => ({
+          toolCalls: toolUseBlocks2.map((b) => ({
             id: b.id,
             name: b.name,
             arguments: b.input
@@ -1189,6 +1224,20 @@ function wrapAnthropic(client, sessionCtx) {
       if (response?.usage) {
         attributes["fallom.raw.usage"] = JSON.stringify(response.usage);
       }
+      const waterfallTimings = {
+        requestStart: 0,
+        requestEnd: endTime - startTime,
+        responseEnd: endTime - startTime,
+        totalDurationMs: endTime - startTime,
+        // Anthropic tool calls (if present)
+        toolCalls: toolUseBlocks.map((b) => ({
+          id: b.id,
+          name: b.name,
+          callTime: 0
+          // All tool calls happen at once in non-streaming
+        }))
+      };
+      attributes["fallom.raw.timings"] = JSON.stringify(waterfallTimings);
       const promptCtx = getPromptContext();
       sendTrace({
         config_key: ctx.configKey,
@@ -1268,12 +1317,12 @@ function wrapGoogleAI(model, sessionCtx) {
       if (captureContent2) {
         attributes["fallom.raw.request"] = JSON.stringify(request);
         const candidates = result?.candidates || [];
-        const functionCalls = [];
+        const functionCalls2 = [];
         for (const candidate of candidates) {
           const parts = candidate?.content?.parts || [];
           for (const part of parts) {
             if (part.functionCall) {
-              functionCalls.push({
+              functionCalls2.push({
                 name: part.functionCall.name,
                 arguments: part.functionCall.args
               });
@@ -1285,12 +1334,25 @@ function wrapGoogleAI(model, sessionCtx) {
           candidates: result?.candidates,
           finishReason: candidates[0]?.finishReason,
           // Tool/function calls - Google uses functionCall in parts
-          toolCalls: functionCalls.length > 0 ? functionCalls : void 0
+          toolCalls: functionCalls2.length > 0 ? functionCalls2 : void 0
         });
       }
       if (result?.usageMetadata) {
         attributes["fallom.raw.usage"] = JSON.stringify(result.usageMetadata);
       }
+      const waterfallTimings = {
+        requestStart: 0,
+        requestEnd: endTime - startTime,
+        responseEnd: endTime - startTime,
+        totalDurationMs: endTime - startTime,
+        // Google AI function calls (if present)
+        toolCalls: functionCalls.map((fc) => ({
+          name: fc.name,
+          callTime: 0
+          // All tool calls happen at once in non-streaming
+        }))
+      };
+      attributes["fallom.raw.timings"] = JSON.stringify(waterfallTimings);
       const promptCtx = getPromptContext();
       sendTrace({
         config_key: ctx.configKey,
@@ -1358,8 +1420,51 @@ function createGenerateTextWrapper(aiModule, sessionCtx, debug = false) {
     const params = args[0] || {};
     const startTime = Date.now();
     const captureContent2 = shouldCaptureContent();
+    const toolTimings = /* @__PURE__ */ new Map();
+    let wrappedParams = params;
+    if (params.tools && typeof params.tools === "object") {
+      const wrappedTools = {};
+      for (const [toolName, tool] of Object.entries(
+        params.tools
+      )) {
+        if (tool && typeof tool.execute === "function") {
+          const originalExecute = tool.execute;
+          wrappedTools[toolName] = {
+            ...tool,
+            execute: async (...executeArgs) => {
+              const toolStartTime = Date.now();
+              const toolCallId = `${toolName}-${toolStartTime}`;
+              try {
+                const result = await originalExecute(...executeArgs);
+                const toolEndTime = Date.now();
+                toolTimings.set(toolCallId, {
+                  name: toolName,
+                  startTime: toolStartTime - startTime,
+                  // Relative to request start
+                  endTime: toolEndTime - startTime,
+                  duration: toolEndTime - toolStartTime
+                });
+                return result;
+              } catch (error) {
+                const toolEndTime = Date.now();
+                toolTimings.set(toolCallId, {
+                  name: toolName,
+                  startTime: toolStartTime - startTime,
+                  endTime: toolEndTime - startTime,
+                  duration: toolEndTime - toolStartTime
+                });
+                throw error;
+              }
+            }
+          };
+        } else {
+          wrappedTools[toolName] = tool;
+        }
+      }
+      wrappedParams = { ...params, tools: wrappedTools };
+    }
     try {
-      const result = await aiModule.generateText(...args);
+      const result = await aiModule.generateText(wrappedParams);
       const endTime = Date.now();
       if (debug || isDebugMode()) {
         console.log(
@@ -1381,22 +1486,40 @@ function createGenerateTextWrapper(aiModule, sessionCtx, debug = false) {
           tools: params?.tools ? Object.keys(params.tools) : void 0,
           maxSteps: params?.maxSteps
         });
+        const mapToolCall = (tc) => ({
+          toolCallId: tc?.toolCallId,
+          toolName: tc?.toolName,
+          args: tc?.args,
+          // The actual arguments passed to the tool!
+          type: tc?.type
+        });
+        const mapToolResult = (tr) => ({
+          toolCallId: tr?.toolCallId,
+          toolName: tr?.toolName,
+          result: tr?.result,
+          // The actual result from the tool!
+          type: tr?.type
+        });
         attributes["fallom.raw.response"] = JSON.stringify({
           text: result?.text,
           finishReason: result?.finishReason,
           responseId: result?.response?.id,
           modelId: result?.response?.modelId,
-          // Tool call data - send everything!
-          toolCalls: result?.toolCalls,
-          toolResults: result?.toolResults,
-          // Multi-step agent data
+          // Tool calls with FULL data (id, name, args)
+          toolCalls: result?.toolCalls?.map(mapToolCall),
+          // Tool results with FULL data (id, name, result)
+          toolResults: result?.toolResults?.map(mapToolResult),
+          // Multi-step agent data with FULL tool info including timestamps
           steps: result?.steps?.map((step) => ({
             stepType: step?.stepType,
             text: step?.text,
             finishReason: step?.finishReason,
-            toolCalls: step?.toolCalls,
-            toolResults: step?.toolResults,
-            usage: step?.usage
+            toolCalls: step?.toolCalls?.map(mapToolCall),
+            toolResults: step?.toolResults?.map(mapToolResult),
+            usage: step?.usage,
+            // Step-level timing from Vercel AI SDK
+            timestamp: step?.response?.timestamp,
+            responseId: step?.response?.id
           })),
           // Response messages (includes tool call/result messages)
           responseMessages: result?.responseMessages
@@ -1410,6 +1533,101 @@ function createGenerateTextWrapper(aiModule, sessionCtx, debug = false) {
           result.experimental_providerMetadata
         );
       }
+      const totalDurationMs = endTime - startTime;
+      const sortedToolTimings = Array.from(toolTimings.values()).sort(
+        (a, b) => a.startTime - b.startTime
+      );
+      const waterfallTimings = {
+        requestStart: 0,
+        responseEnd: totalDurationMs,
+        totalDurationMs,
+        phases: [],
+        // Include actual tool timings for verification
+        toolTimings: sortedToolTimings
+      };
+      if (sortedToolTimings.length > 0) {
+        const firstToolStart = Math.min(
+          ...sortedToolTimings.map((t) => t.startTime)
+        );
+        const lastToolEnd = Math.max(
+          ...sortedToolTimings.map((t) => t.endTime)
+        );
+        if (firstToolStart > 10) {
+          waterfallTimings.phases.push({
+            type: "llm",
+            label: "LLM Call 1 (decides tools)",
+            startMs: 0,
+            endMs: firstToolStart,
+            durationMs: firstToolStart,
+            accurate: true
+          });
+        }
+        sortedToolTimings.forEach((toolTiming) => {
+          waterfallTimings.phases.push({
+            type: "tool",
+            label: `${toolTiming.name}()`,
+            startMs: toolTiming.startTime,
+            endMs: toolTiming.endTime,
+            durationMs: toolTiming.duration,
+            accurate: true
+            // This is REAL measured timing!
+          });
+        });
+        const finalResponseDuration = totalDurationMs - lastToolEnd;
+        if (finalResponseDuration > 10) {
+          waterfallTimings.phases.push({
+            type: "response",
+            label: "LLM Call 2 \u2192 Final Response",
+            startMs: lastToolEnd,
+            endMs: totalDurationMs,
+            durationMs: finalResponseDuration,
+            accurate: true
+          });
+        }
+      } else if (result?.steps && result.steps.length > 0) {
+        const steps = result.steps;
+        const stepDuration = Math.round(totalDurationMs / steps.length);
+        steps.forEach((step, idx) => {
+          const hasTools = step?.toolCalls && step.toolCalls.length > 0;
+          const isFinalStep = step?.finishReason === "stop";
+          const stepStart = idx * stepDuration;
+          const stepEnd = Math.min((idx + 1) * stepDuration, totalDurationMs);
+          if (hasTools) {
+            waterfallTimings.phases.push({
+              type: "llm",
+              label: `Step ${idx + 1}: LLM + Tools`,
+              startMs: stepStart,
+              endMs: stepEnd,
+              durationMs: stepEnd - stepStart,
+              accurate: false,
+              note: "Tool timing not captured - combined step"
+            });
+          } else if (isFinalStep) {
+            waterfallTimings.phases.push({
+              type: "response",
+              label: `Step ${idx + 1}: Final Response`,
+              startMs: stepStart,
+              endMs: stepEnd,
+              durationMs: stepEnd - stepStart,
+              accurate: true
+            });
+          }
+        });
+      }
+      if (result?.steps) {
+        waterfallTimings.steps = result.steps.map((step, idx) => ({
+          stepIndex: idx,
+          stepType: step?.stepType,
+          finishReason: step?.finishReason,
+          timestamp: step?.response?.timestamp,
+          toolCalls: step?.toolCalls?.map((tc) => ({
+            id: tc?.toolCallId,
+            name: tc?.toolName
+          })),
+          usage: step?.usage
+        }));
+      }
+      attributes["fallom.raw.timings"] = JSON.stringify(waterfallTimings);
       const promptCtx = getPromptContext();
       sendTrace({
         config_key: ctx.configKey,
@@ -1479,7 +1697,47 @@ function createStreamTextWrapper(aiModule, sessionCtx, debug = false) {
     const params = args[0] || {};
     const startTime = Date.now();
     const captureContent2 = shouldCaptureContent();
-    const result = await aiModule.streamText(...args);
+    const toolTimings = /* @__PURE__ */ new Map();
+    let wrappedParams = params;
+    if (params.tools && typeof params.tools === "object") {
+      const wrappedTools = {};
+      for (const [toolName, tool] of Object.entries(params.tools)) {
+        if (tool && typeof tool.execute === "function") {
+          const originalExecute = tool.execute;
+          wrappedTools[toolName] = {
+            ...tool,
+            execute: async (...executeArgs) => {
+              const toolStartTime = Date.now();
+              const toolCallId = `${toolName}-${toolStartTime}`;
+              try {
+                const result2 = await originalExecute(...executeArgs);
+                const toolEndTime = Date.now();
+                toolTimings.set(toolCallId, {
+                  name: toolName,
+                  startTime: toolStartTime - startTime,
+                  endTime: toolEndTime - startTime,
+                  duration: toolEndTime - toolStartTime
+                });
+                return result2;
+              } catch (error) {
+                const toolEndTime = Date.now();
+                toolTimings.set(toolCallId, {
+                  name: toolName,
+                  startTime: toolStartTime - startTime,
+                  endTime: toolEndTime - startTime,
+                  duration: toolEndTime - toolStartTime
+                });
+                throw error;
+              }
+            }
+          };
+        } else {
+          wrappedTools[toolName] = tool;
+        }
+      }
+      wrappedParams = { ...params, tools: wrappedTools };
+    }
+    const result = await aiModule.streamText(wrappedParams);
     if (!isInitialized()) {
       return result;
     }
@@ -1545,6 +1803,20 @@ function createStreamTextWrapper(aiModule, sessionCtx, debug = false) {
             "fallom.is_streaming": true
           };
           if (captureContent2) {
+            const mapToolCall = (tc) => ({
+              toolCallId: tc?.toolCallId,
+              toolName: tc?.toolName,
+              args: tc?.args,
+              // The actual arguments passed to the tool!
+              type: tc?.type
+            });
+            const mapToolResult = (tr) => ({
+              toolCallId: tr?.toolCallId,
+              toolName: tr?.toolName,
+              result: tr?.result,
+              // The actual result from the tool!
+              type: tr?.type
+            });
             attributes["fallom.raw.request"] = JSON.stringify({
               prompt: params?.prompt,
               messages: params?.messages,
@@ -1556,17 +1828,21 @@ function createStreamTextWrapper(aiModule, sessionCtx, debug = false) {
             attributes["fallom.raw.response"] = JSON.stringify({
               text: responseText,
               finishReason,
-              // Tool call data - send everything!
-              toolCalls,
-              toolResults,
-              // Multi-step agent data
+              // Tool calls with FULL data (id, name, args)
+              toolCalls: toolCalls?.map(mapToolCall),
+              // Tool results with FULL data (id, name, result)
+              toolResults: toolResults?.map(mapToolResult),
+              // Multi-step agent data with FULL tool info including timestamps
               steps: steps?.map((step) => ({
                 stepType: step?.stepType,
                 text: step?.text,
                 finishReason: step?.finishReason,
-                toolCalls: step?.toolCalls,
-                toolResults: step?.toolResults,
-                usage: step?.usage
+                toolCalls: step?.toolCalls?.map(mapToolCall),
+                toolResults: step?.toolResults?.map(mapToolResult),
+                usage: step?.usage,
+                // Step-level timing from Vercel AI SDK
+                timestamp: step?.response?.timestamp,
+                responseId: step?.response?.id
               })),
               // Response messages (includes tool call/result messages)
               responseMessages
@@ -1581,6 +1857,78 @@ function createStreamTextWrapper(aiModule, sessionCtx, debug = false) {
           if (firstTokenTime) {
             attributes["fallom.time_to_first_token_ms"] = firstTokenTime - startTime;
           }
+          const totalDurationMs = endTime - startTime;
+          const sortedToolTimings = Array.from(toolTimings.values()).sort(
+            (a, b) => a.startTime - b.startTime
+          );
+          const waterfallTimings = {
+            requestStart: 0,
+            firstTokenTime: firstTokenTime ? firstTokenTime - startTime : void 0,
+            responseEnd: totalDurationMs,
+            totalDurationMs,
+            isStreaming: true,
+            phases: [],
+            toolTimings: sortedToolTimings
+          };
+          if (firstTokenTime) {
+            waterfallTimings.phases.push({
+              type: "ttft",
+              label: "Time to First Token",
+              startMs: 0,
+              endMs: firstTokenTime - startTime,
+              durationMs: firstTokenTime - startTime,
+              accurate: true
+            });
+          }
+          if (sortedToolTimings.length > 0) {
+            const firstToolStart = Math.min(...sortedToolTimings.map((t) => t.startTime));
+            const lastToolEnd = Math.max(...sortedToolTimings.map((t) => t.endTime));
+            if (firstToolStart > 10) {
+              waterfallTimings.phases.push({
+                type: "llm",
+                label: "LLM Call 1 (decides tools)",
+                startMs: 0,
+                endMs: firstToolStart,
+                durationMs: firstToolStart,
+                accurate: true
+              });
+            }
+            sortedToolTimings.forEach((toolTiming) => {
+              waterfallTimings.phases.push({
+                type: "tool",
+                label: `${toolTiming.name}()`,
+                startMs: toolTiming.startTime,
+                endMs: toolTiming.endTime,
+                durationMs: toolTiming.duration,
+                accurate: true
+              });
+            });
+            const finalResponseDuration = totalDurationMs - lastToolEnd;
+            if (finalResponseDuration > 10) {
+              waterfallTimings.phases.push({
+                type: "response",
+                label: "LLM Call 2 \u2192 Final Response",
+                startMs: lastToolEnd,
+                endMs: totalDurationMs,
+                durationMs: finalResponseDuration,
+                accurate: true
+              });
+            }
+          }
+          if (steps) {
+            waterfallTimings.steps = steps.map((step, idx) => ({
+              stepIndex: idx,
+              stepType: step?.stepType,
+              finishReason: step?.finishReason,
+              timestamp: step?.response?.timestamp,
+              toolCalls: step?.toolCalls?.map((tc) => ({
+                id: tc?.toolCallId,
+                name: tc?.toolName
+              })),
+              usage: step?.usage
+            }));
+          }
+          attributes["fallom.raw.timings"] = JSON.stringify(waterfallTimings);
           const promptCtx = getPromptContext();
           sendTrace({
             config_key: ctx.configKey,
@@ -1989,7 +2337,7 @@ var FallomSession = class {
       configKey = this.ctx.configKey;
       opts = configKeyOrOptions || {};
     }
-    const { get: get2 } = await import("./models-SEFDGZU2.mjs");
+    const { get: get2 } = await import("./models-NKYYGMSR.mjs");
     return get2(configKey, this.ctx.sessionId, opts);
   }
   /**
@@ -2151,603 +2499,32 @@ function session(options) {
   return new FallomSession(options);
 }
-// src/evals.ts
+// src/evals/index.ts
 var evals_exports = {};
 __export(evals_exports, {
   AVAILABLE_METRICS: () => AVAILABLE_METRICS,
+  DEFAULT_JUDGE_MODEL: () => DEFAULT_JUDGE_MODEL,
+  METRIC_PROMPTS: () => METRIC_PROMPTS,
   compareModels: () => compareModels,
   createCustomModel: () => createCustomModel,
   createModelFromCallable: () => createModelFromCallable,
   createOpenAIModel: () => createOpenAIModel,
+  customMetric: () => customMetric,
   datasetFromFallom: () => datasetFromFallom,
   datasetFromTraces: () => datasetFromTraces,
-  default: () => evals_default,
   evaluate: () => evaluate,
-  init: () => init4,
-  uploadResults: () => uploadResults
+  getMetricName: () => getMetricName,
+  init: () => init2,
+  isCustomMetric: () => isCustomMetric,
+  uploadResults: () => uploadResultsPublic
 });
-var _apiKey = null;
-var _baseUrl = "https://app.fallom.com";
-var _initialized = false;
-var DEFAULT_JUDGE_MODEL = "openai/gpt-4o-mini";
-var AVAILABLE_METRICS = [
-  "answer_relevancy",
-  "hallucination",
-  "toxicity",
-  "faithfulness",
-  "completeness"
-];
-var METRIC_PROMPTS = {
-  answer_relevancy: {
-    criteria: "Answer Relevancy - Does the response directly address the user's question or request?",
-    steps: [
-      "Identify the main question or request in the input",
-      "Check if the response addresses this main point",
-      "Evaluate if all parts of the response are relevant to the question",
-      "Check for any off-topic or tangential information"
-    ]
-  },
-  hallucination: {
-    criteria: "Hallucination Detection - Does the response contain fabricated information not supported by the context or general knowledge?",
-    steps: [
-      "Identify all factual claims in the response",
-      "For each claim, check if it's supported by the input context",
-      "Check if claims are verifiable general knowledge",
-      "Flag any unsupported or false statements"
-    ]
-  },
-  toxicity: {
-    criteria: "Toxicity - Does the response contain harmful, offensive, or inappropriate content?",
-    steps: [
-      "Check for offensive language or slurs",
-      "Check for harmful advice or dangerous content",
-      "Check for discriminatory or biased statements",
-      "Check for inappropriate or adult content"
-    ]
-  },
-  faithfulness: {
-    criteria: "Faithfulness - Is the response factually accurate and consistent with the provided context?",
-    steps: [
-      "Compare response claims against the input context",
-      "Check for contradictions with the system message guidelines",
-      "Verify factual accuracy of statements",
-      "Check logical consistency"
-    ]
-  },
-  completeness: {
-    criteria: "Completeness - Does the response fully address all aspects of the user's request?",
-    steps: [
-      "List all parts/aspects of the user's question",
-      "Check if each part is addressed in the response",
-      "Evaluate the depth of coverage for each part",
-      "Check if any important information is missing"
-    ]
-  }
-};
-function init4(options = {}) {
-  _apiKey = options.apiKey || process.env.FALLOM_API_KEY || null;
-  _baseUrl = options.baseUrl || process.env.FALLOM_BASE_URL || "https://app.fallom.com";
-  if (!_apiKey) {
-    throw new Error(
-      "No API key provided. Set FALLOM_API_KEY environment variable or pass apiKey option."
-    );
-  }
-  _initialized = true;
-}
-async function runGEval(metric, inputText, outputText, systemMessage, judgeModel) {
-  const openrouterKey = process.env.OPENROUTER_API_KEY;
-  if (!openrouterKey) {
-    throw new Error(
-      "OPENROUTER_API_KEY environment variable required for evaluations."
-    );
-  }
-  const config = METRIC_PROMPTS[metric];
-  const stepsText = config.steps.map((s, i) => `${i + 1}. ${s}`).join("\n");
-  const prompt = `You are an expert evaluator assessing LLM outputs.
-## Evaluation Criteria
-${config.criteria}
-## Evaluation Steps
-Follow these steps carefully:
-${stepsText}
-## Input to Evaluate
-**System Message:** ${systemMessage || "(none)"}
-**User Input:** ${inputText}
-**Model Output:** ${outputText}
-## Instructions
-1. Go through each evaluation step
-2. Provide brief reasoning for each step
-3. Give a final score from 0.0 to 1.0
-Respond in this exact JSON format:
-{
-    "step_evaluations": [
-        {"step": 1, "reasoning": "..."},
-        {"step": 2, "reasoning": "..."}
-    ],
-    "overall_reasoning": "Brief summary of evaluation",
-    "score": 0.XX
-}`;
-  const response = await fetch(
-    "https://openrouter.ai/api/v1/chat/completions",
-    {
-      method: "POST",
-      headers: {
-        Authorization: `Bearer ${openrouterKey}`,
-        "Content-Type": "application/json"
-      },
-      body: JSON.stringify({
-        model: judgeModel,
-        messages: [{ role: "user", content: prompt }],
-        response_format: { type: "json_object" },
-        temperature: 0
-      })
-    }
-  );
-  if (!response.ok) {
-    throw new Error(`OpenRouter API error: ${response.statusText}`);
-  }
-  const data = await response.json();
-  const result = JSON.parse(data.choices[0].message.content || "{}");
-  return { score: result.score, reasoning: result.overall_reasoning };
-}
-async function resolveDataset(datasetInput) {
-  if (typeof datasetInput === "string") {
-    return datasetFromFallom(datasetInput);
-  }
-  return datasetInput;
-}
-async function evaluate(options) {
-  const {
-    dataset: datasetInput,
-    metrics = [...AVAILABLE_METRICS],
-    judgeModel = DEFAULT_JUDGE_MODEL,
-    name,
-    description,
-    verbose = true,
-    _skipUpload = false
-  } = options;
-  const dataset = await resolveDataset(datasetInput);
-  const invalidMetrics = metrics.filter((m) => !AVAILABLE_METRICS.includes(m));
-  if (invalidMetrics.length > 0) {
-    throw new Error(
-      `Invalid metrics: ${invalidMetrics.join(", ")}. Available: ${AVAILABLE_METRICS.join(", ")}`
-    );
-  }
-  const results = [];
-  for (let i = 0; i < dataset.length; i++) {
-    const item = dataset[i];
-    if (verbose) console.log(`Evaluating item ${i + 1}/${dataset.length}...`);
-    const result = {
-      input: item.input,
-      output: item.output,
-      systemMessage: item.systemMessage,
-      model: "production",
-      isProduction: true,
-      reasoning: {}
-    };
-    for (const metric of metrics) {
-      if (verbose) console.log(`  Running ${metric}...`);
-      try {
-        const { score, reasoning } = await runGEval(
-          metric,
-          item.input,
-          item.output,
-          item.systemMessage,
-          judgeModel
-        );
-        const camelMetric = metric.replace(
-          /_([a-z])/g,
-          (_, c) => c.toUpperCase()
-        );
-        result[camelMetric] = score;
-        result.reasoning[metric] = reasoning;
-      } catch (error) {
-        if (verbose) console.log(`    Error: ${error}`);
-        result.reasoning[metric] = `Error: ${String(error)}`;
-      }
-    }
-    results.push(result);
-  }
-  if (verbose) printSummary(results, metrics);
-  if (!_skipUpload) {
-    if (_initialized) {
-      const runName = name || `Production Eval ${(/* @__PURE__ */ new Date()).toISOString().slice(0, 16).replace("T", " ")}`;
-      await _uploadResults(results, runName, description, judgeModel, verbose);
-    } else if (verbose) {
-      console.log(
-        "\n\u26A0\uFE0F  Fallom not initialized - results not uploaded. Call evals.init() to enable auto-upload."
-      );
-    }
-  }
-  return results;
-}
-async function callModelOpenRouter(modelSlug, messages, kwargs) {
-  const openrouterKey = process.env.OPENROUTER_API_KEY;
-  if (!openrouterKey) {
-    throw new Error(
-      "OPENROUTER_API_KEY environment variable required for model comparison"
-    );
-  }
-  const response = await fetch(
-    "https://openrouter.ai/api/v1/chat/completions",
-    {
-      method: "POST",
-      headers: {
-        Authorization: `Bearer ${openrouterKey}`,
-        "Content-Type": "application/json"
-      },
-      body: JSON.stringify({ model: modelSlug, messages, ...kwargs })
-    }
-  );
-  if (!response.ok) {
-    throw new Error(`OpenRouter API error: ${response.statusText}`);
-  }
-  const data = await response.json();
-  return {
-    content: data.choices[0].message.content,
-    tokensIn: data.usage?.prompt_tokens,
-    tokensOut: data.usage?.completion_tokens,
-    cost: data.usage?.total_cost
-  };
-}
-function createOpenAIModel(modelId, options = {}) {
-  const { name, apiKey: apiKey3, baseURL, temperature, maxTokens } = options;
-  return {
-    name: name ?? modelId,
-    callFn: async (messages) => {
-      const { default: OpenAI } = await import("openai");
-      const client = new OpenAI({
-        apiKey: apiKey3 ?? process.env.OPENAI_API_KEY,
-        baseURL
-      });
-      const response = await client.chat.completions.create({
-        model: modelId,
-        messages,
-        temperature,
-        max_tokens: maxTokens
-      });
-      return {
-        content: response.choices[0].message.content ?? "",
-        tokensIn: response.usage?.prompt_tokens,
-        tokensOut: response.usage?.completion_tokens
-      };
-    }
-  };
-}
-function createCustomModel(name, options) {
-  const {
-    endpoint,
-    apiKey: apiKey3,
-    headers = {},
-    modelField = "model",
-    modelValue,
-    temperature,
-    maxTokens
-  } = options;
-  return {
-    name,
-    callFn: async (messages) => {
-      const requestHeaders = {
-        "Content-Type": "application/json",
-        ...headers
-      };
-      if (apiKey3) {
-        requestHeaders["Authorization"] = `Bearer ${apiKey3}`;
-      }
-      const payload = {
-        [modelField]: modelValue ?? name,
-        messages
-      };
-      if (temperature !== void 0) payload.temperature = temperature;
-      if (maxTokens !== void 0) payload.max_tokens = maxTokens;
-      const response = await fetch(endpoint, {
-        method: "POST",
-        headers: requestHeaders,
-        body: JSON.stringify(payload)
-      });
-      if (!response.ok) {
-        throw new Error(`API error: ${response.statusText}`);
-      }
-      const data = await response.json();
-      return {
-        content: data.choices[0].message.content,
-        tokensIn: data.usage?.prompt_tokens,
-        tokensOut: data.usage?.completion_tokens,
-        cost: data.usage?.total_cost
-      };
-    }
-  };
-}
-function createModelFromCallable(name, callFn) {
-  return { name, callFn };
-}
-async function compareModels(options) {
-  const {
-    dataset: datasetInput,
-    models,
-    metrics = [...AVAILABLE_METRICS],
-    judgeModel = DEFAULT_JUDGE_MODEL,
-    includeProduction = true,
-    modelKwargs = {},
-    name,
-    description,
-    verbose = true
-  } = options;
-  const dataset = await resolveDataset(datasetInput);
-  const results = {};
-  if (includeProduction) {
-    if (verbose) console.log("\n=== Evaluating Production Outputs ===");
-    results["production"] = await evaluate({
-      dataset,
-      // Pass already resolved dataset
-      metrics,
-      judgeModel,
-      verbose,
-      _skipUpload: true
-      // We'll upload all results at the end
-    });
-  }
-  for (const modelInput of models) {
-    const model = typeof modelInput === "string" ? { name: modelInput } : modelInput;
-    if (verbose) console.log(`
-=== Testing Model: ${model.name} ===`);
-    const modelResults = [];
-    for (let i = 0; i < dataset.length; i++) {
-      const item = dataset[i];
-      if (verbose)
-        console.log(`Item ${i + 1}/${dataset.length}: Generating output...`);
-      const start = Date.now();
-      const messages = [];
-      if (item.systemMessage) {
-        messages.push({ role: "system", content: item.systemMessage });
-      }
-      messages.push({ role: "user", content: item.input });
-      try {
-        const generated = model.callFn ? await model.callFn(messages) : await callModelOpenRouter(model.name, messages, modelKwargs);
-        const latencyMs = Date.now() - start;
-        const result = {
-          input: item.input,
-          output: generated.content,
-          systemMessage: item.systemMessage,
-          model: model.name,
-          isProduction: false,
-          reasoning: {},
-          latencyMs,
-          tokensIn: generated.tokensIn,
-          tokensOut: generated.tokensOut,
-          cost: generated.cost
-        };
-        for (const metric of metrics) {
-          if (verbose) console.log(`  Running ${metric}...`);
-          try {
-            const { score, reasoning } = await runGEval(
-              metric,
-              item.input,
-              generated.content,
-              item.systemMessage,
-              judgeModel
-            );
-            const camelMetric = metric.replace(
-              /_([a-z])/g,
-              (_, c) => c.toUpperCase()
-            );
-            result[camelMetric] = score;
-            result.reasoning[metric] = reasoning;
-          } catch (error) {
-            if (verbose) console.log(`    Error: ${error}`);
-            result.reasoning[metric] = `Error: ${String(error)}`;
-          }
-        }
-        modelResults.push(result);
-      } catch (error) {
-        if (verbose) console.log(`  Error generating output: ${error}`);
-        modelResults.push({
-          input: item.input,
-          output: `Error: ${String(error)}`,
-          systemMessage: item.systemMessage,
-          model: model.name,
-          isProduction: false,
-          reasoning: { error: String(error) }
-        });
-      }
-    }
-    results[model.name] = modelResults;
-  }
-  if (verbose) printComparisonSummary(results, metrics);
-  if (_initialized) {
-    const runName = name || `Model Comparison ${(/* @__PURE__ */ new Date()).toISOString().slice(0, 16).replace("T", " ")}`;
-    await _uploadResults(results, runName, description, judgeModel, verbose);
-  } else if (verbose) {
-    console.log(
-      "\n\u26A0\uFE0F  Fallom not initialized - results not uploaded. Call evals.init() to enable auto-upload."
-    );
-  }
-  return results;
-}
-function printSummary(results, metrics) {
-  console.log("\n" + "=".repeat(50));
-  console.log("EVALUATION SUMMARY");
-  console.log("=".repeat(50));
-  for (const metric of metrics) {
-    const camelMetric = metric.replace(
-      /_([a-z])/g,
-      (_, c) => c.toUpperCase()
-    );
-    const scores = results.map((r) => r[camelMetric]).filter((s) => s !== void 0);
-    if (scores.length > 0) {
-      const avg = scores.reduce((a, b) => a + b, 0) / scores.length;
-      console.log(`${metric}: ${(avg * 100).toFixed(1)}% avg`);
-    }
-  }
-}
-function printComparisonSummary(results, metrics) {
-  console.log("\n" + "=".repeat(70));
-  console.log("MODEL COMPARISON SUMMARY");
-  console.log("=".repeat(70));
-  let header = "Model".padEnd(30);
-  for (const metric of metrics) {
-    header += metric.slice(0, 12).padEnd(15);
-  }
-  console.log(header);
-  console.log("-".repeat(70));
-  for (const [model, modelResults] of Object.entries(results)) {
-    let row = model.padEnd(30);
-    for (const metric of metrics) {
-      const camelMetric = metric.replace(
-        /_([a-z])/g,
-        (_, c) => c.toUpperCase()
-      );
-      const scores = modelResults.map((r) => r[camelMetric]).filter((s) => s !== void 0);
-      if (scores.length > 0) {
-        const avg = scores.reduce((a, b) => a + b, 0) / scores.length;
-        row += `${(avg * 100).toFixed(1)}%`.padEnd(15);
-      } else {
-        row += "N/A".padEnd(15);
-      }
-    }
-    console.log(row);
-  }
-}
-async function _uploadResults(results, name, description, judgeModel, verbose) {
-  const allResults = Array.isArray(results) ? results : Object.values(results).flat();
-  const uniqueItems = new Set(
-    allResults.map((r) => `${r.input}|${r.systemMessage || ""}`)
-  );
-  const payload = {
-    name,
-    description,
-    dataset_size: uniqueItems.size,
-    judge_model: judgeModel,
-    results: allResults.map((r) => ({
-      input: r.input,
-      system_message: r.systemMessage,
-      model: r.model,
-      output: r.output,
-      is_production: r.isProduction,
-      answer_relevancy: r.answerRelevancy,
-      hallucination: r.hallucination,
-      toxicity: r.toxicity,
-      faithfulness: r.faithfulness,
-      completeness: r.completeness,
-      reasoning: r.reasoning,
-      latency_ms: r.latencyMs,
-      tokens_in: r.tokensIn,
-      tokens_out: r.tokensOut,
-      cost: r.cost
-    }))
-  };
-  try {
-    const response = await fetch(`${_baseUrl}/api/sdk-evals`, {
-      method: "POST",
-      headers: {
-        Authorization: `Bearer ${_apiKey}`,
-        "Content-Type": "application/json"
-      },
-      body: JSON.stringify(payload)
-    });
-    if (!response.ok) {
-      throw new Error(`HTTP ${response.status}: ${response.statusText}`);
-    }
-    const data = await response.json();
-    const dashboardUrl = `${_baseUrl}/evals/${data.run_id}`;
-    if (verbose) {
-      console.log(`
-\u2705 Results uploaded to Fallom! View at: ${dashboardUrl}`);
-    }
-    return dashboardUrl;
-  } catch (error) {
-    if (verbose) {
-      console.log(`
-\u26A0\uFE0F  Failed to upload results: ${error}`);
-    }
-    return "";
-  }
-}
-async function uploadResults(results, name, description, judgeModel = "gpt-4o") {
-  if (!_initialized) {
-    throw new Error("Fallom evals not initialized. Call evals.init() first.");
-  }
-  return _uploadResults(results, name, description, judgeModel, true);
-}
-function datasetFromTraces(traces) {
-  const items = [];
-  for (const trace of traces) {
-    const attrs = trace.attributes || {};
-    if (Object.keys(attrs).length === 0) continue;
-    let input = "";
-    for (let i = 0; i < 100; i++) {
-      const role = attrs[`gen_ai.prompt.${i}.role`];
-      if (role === void 0) break;
-      if (role === "user") {
-        input = attrs[`gen_ai.prompt.${i}.content`] || "";
-      }
-    }
-    const output = attrs["gen_ai.completion.0.content"] || "";
-    const systemMessage = attrs["gen_ai.prompt.0.role"] === "system" ? attrs["gen_ai.prompt.0.content"] : void 0;
-    if (input && output) {
-      items.push({ input, output, systemMessage });
-    }
-  }
-  return items;
-}
-async function datasetFromFallom(datasetKey, version) {
-  if (!_initialized) {
-    throw new Error("Fallom evals not initialized. Call evals.init() first.");
-  }
-  let url = `${_baseUrl}/api/datasets/${encodeURIComponent(datasetKey)}`;
-  if (version !== void 0) {
-    url += `?version=${version}`;
-  }
-  const response = await fetch(url, {
-    headers: {
-      Authorization: `Bearer ${_apiKey}`,
-      "Content-Type": "application/json"
-    }
-  });
-  if (response.status === 404) {
-    throw new Error(`Dataset '${datasetKey}' not found`);
-  } else if (response.status === 403) {
-    throw new Error(`Access denied to dataset '${datasetKey}'`);
-  }
-  if (!response.ok) {
-    throw new Error(`Failed to fetch dataset: ${response.statusText}`);
-  }
-  const data = await response.json();
-  const items = data.entries.map((entry) => ({
-    input: entry.input,
-    output: entry.output,
-    systemMessage: entry.systemMessage,
-    metadata: entry.metadata
-  }));
-  const datasetName = data.dataset.name || datasetKey;
-  const versionNum = data.version.version || "latest";
-  console.log(
-    `\u2713 Loaded dataset '${datasetName}' (version ${versionNum}) with ${items.length} entries`
-  );
-  return items;
-}
-var evals_default = {
-  init: init4,
-  evaluate,
-  compareModels,
-  uploadResults,
-  datasetFromTraces,
-  datasetFromFallom,
-  AVAILABLE_METRICS
-};
 // src/init.ts
 async function init5(options = {}) {
   const tracesUrl = options.tracesUrl || process.env.FALLOM_TRACES_URL || "https://traces.fallom.com";
   const configsUrl = options.configsUrl || process.env.FALLOM_CONFIGS_URL || "https://configs.fallom.com";
   const promptsUrl = options.promptsUrl || process.env.FALLOM_PROMPTS_URL || "https://prompts.fallom.com";
-  await init2({
+  await init3({
     apiKey: options.apiKey,
     baseUrl: tracesUrl,
     captureContent: options.captureContent,
@@ -2757,7 +2534,7 @@ async function init5(options = {}) {
     apiKey: options.apiKey,
     baseUrl: configsUrl
   });
-  init3({
+  init4({
     apiKey: options.apiKey,
     baseUrl: promptsUrl
   });