@fallom/trace 0.2.22 → 0.2.24

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -120,7 +120,18 @@ Respond in JSON format:
   "score": 0.85
 }`;
 }
-async function runGEval(metric, inputText, outputText, systemMessage, judgeModel, openrouterKey) {
+async function runGEval(options) {
+  const {
+    metric,
+    inputText,
+    outputText,
+    systemMessage,
+    judgeModel,
+    openrouterKey,
+    fallomApiKey,
+    traceSessionId,
+    traceCustomerId
+  } = options;
   const apiKey = openrouterKey || process.env.OPENROUTER_API_KEY;
   if (!apiKey) {
     throw new Error(
@@ -131,6 +142,7 @@ async function runGEval(metric, inputText, outputText, systemMessage, judgeModel
   if (!config) {
     throw new Error(`Unknown metric: ${metric}`);
   }
+  const metricName = typeof metric === "object" ? metric.name : metric;
   const prompt = buildGEvalPrompt(
     config.criteria,
     config.steps,
@@ -138,6 +150,7 @@ async function runGEval(metric, inputText, outputText, systemMessage, judgeModel
     inputText,
     outputText
   );
+  const startTime = Date.now();
   const response = await fetch(
     "https://openrouter.ai/api/v1/chat/completions",
     {
@@ -158,17 +171,94 @@
     throw new Error(`G-Eval API error: ${response.statusText}`);
   }
   const data = await response.json();
+  const endTime = Date.now();
   try {
     const result = JSON.parse(data.choices[0].message.content);
-    return {
-      score: Math.max(0, Math.min(1, result.score)),
-      // Clamp to 0-1
-      reasoning: result.overall_reasoning || ""
-    };
+    const score = Math.max(0, Math.min(1, result.score));
+    const reasoning = result.overall_reasoning || "";
+    if (fallomApiKey) {
+      sendGEvalTrace({
+        fallomApiKey,
+        metricName,
+        judgeModel,
+        prompt,
+        response: data.choices[0].message.content,
+        score,
+        reasoning,
+        startTime,
+        endTime,
+        usage: data.usage,
+        sessionId: traceSessionId,
+        customerId: traceCustomerId
+      }).catch(() => {
+      });
+    }
+    return { score, reasoning };
   } catch {
     throw new Error("Failed to parse G-Eval response");
   }
 }
+async function sendGEvalTrace(options) {
+  const {
+    fallomApiKey,
+    metricName,
+    judgeModel,
+    prompt,
+    response,
+    score,
+    reasoning,
+    startTime,
+    endTime,
+    usage,
+    sessionId,
+    customerId
+  } = options;
+  const traceUrl = process.env.FALLOM_TRACES_URL || "https://traces.fallom.com";
+  const traceData = {
+    config_key: "eval-worker",
+    session_id: sessionId || `geval-${Date.now()}`,
+    customer_id: customerId,
+    trace_id: generateHexId(32),
+    span_id: generateHexId(16),
+    name: `geval.${metricName}`,
+    kind: "llm",
+    model: judgeModel,
+    start_time: new Date(startTime).toISOString(),
+    end_time: new Date(endTime).toISOString(),
+    duration_ms: endTime - startTime,
+    status: "OK",
+    metadata: {
+      metric: metricName,
+      score
+    },
+    tags: ["eval-worker", "geval", metricName],
+    attributes: {
+      "fallom.sdk_version": "2",
+      "fallom.method": "runGEval",
+      "geval.metric": metricName,
+      "geval.score": score,
+      "geval.reasoning": reasoning,
+      "gen_ai.prompt.0.role": "user",
+      "gen_ai.prompt.0.content": prompt,
+      "gen_ai.completion.0.content": response,
+      "gen_ai.usage.prompt_tokens": usage?.prompt_tokens,
+      "gen_ai.usage.completion_tokens": usage?.completion_tokens
+    }
+  };
+  await fetch(`${traceUrl}/v1/traces`, {
+    method: "POST",
+    headers: {
+      Authorization: `Bearer ${fallomApiKey}`,
+      "Content-Type": "application/json"
+    },
+    body: JSON.stringify(traceData)
+  });
+}
+function generateHexId(length) {
+  const bytes = new Uint8Array(length / 2);
+  crypto.getRandomValues(bytes);
+  return Array.from(bytes).map((b) => b.toString(16).padStart(2, "0")).join("");
+}
 function calculateAggregateScores(results) {
   const aggregates = {};
   for (const result of results) {
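The headline change in this hunk: the low-level `runGEval` now takes a single options object instead of six positional arguments, with three new optional fields (`fallomApiKey`, `traceSessionId`, `traceCustomerId`) that wire evals into tracing. A minimal migration sketch, assuming the `fallom.evals` namespace access shown in the package's own JSDoc examples:

```typescript
import fallom from "@fallom/trace";

// 0.2.22 (old): runGEval("faithfulness", input, output, undefined, "openai/gpt-4o-mini")
// 0.2.24 (new): a single options object
const { score, reasoning } = await fallom.evals.runGEval({
  metric: "faithfulness",
  inputText: "What is the capital of France?",
  outputText: "Paris is the capital of France.",
  judgeModel: "openai/gpt-4o-mini",
});
```

Tracing is fire-and-forget by design: `sendGEvalTrace(...).catch(() => {})` swallows any trace-upload error, so tracing can never fail or delay the evaluation itself.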
@@ -333,7 +423,7 @@ function datasetFromTraces(traces) {
   return items;
 }
 async function datasetFromFallom(datasetKey, version, config) {
-  const { _apiKey: _apiKey2, _baseUrl: _baseUrl2, _initialized: _initialized2 } = await import("./core-DUG2SP2V.mjs").then(
+  const { _apiKey: _apiKey2, _baseUrl: _baseUrl2, _initialized: _initialized2 } = await import("./core-Q3IHBEHB.mjs").then(
     (m) => ({
       _apiKey: config?._apiKey ?? m._apiKey,
       _baseUrl: config?._baseUrl ?? m._baseUrl,
@@ -406,7 +496,7 @@ var EvaluationDataset = class {
    * @returns Self for chaining
    */
   async pull(alias, version) {
-    const { _apiKey: _apiKey2, _baseUrl: _baseUrl2, _initialized: _initialized2 } = await import("./core-DUG2SP2V.mjs");
+    const { _apiKey: _apiKey2, _baseUrl: _baseUrl2, _initialized: _initialized2 } = await import("./core-Q3IHBEHB.mjs");
     if (!_initialized2) {
       throw new Error("Fallom evals not initialized. Call evals.init() first.");
     }
@@ -545,7 +635,13 @@ function init(options = {}) {
 }
 async function runGEval2(metric, inputText, outputText, systemMessage, judgeModel) {
   const metricArg = isCustomMetric(metric) ? { name: metric.name, criteria: metric.criteria, steps: metric.steps } : metric;
-  return runGEval(metricArg, inputText, outputText, systemMessage, judgeModel);
+  return runGEval({
+    metric: metricArg,
+    inputText,
+    outputText,
+    systemMessage,
+    judgeModel
+  });
 }
 async function resolveDataset(datasetInput) {
   if (typeof datasetInput === "string") {
@@ -617,7 +713,9 @@ async function evaluate(options) {
   for (const m of metrics) {
     if (typeof m === "string" && !AVAILABLE_METRICS.includes(m)) {
       throw new Error(
-        `Invalid metric: ${m}. Available: ${AVAILABLE_METRICS.join(", ")}. Or use CustomMetric for custom metrics.`
+        `Invalid metric: ${m}. Available: ${AVAILABLE_METRICS.join(
+          ", "
+        )}. Or use CustomMetric for custom metrics.`
       );
     }
   }
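The `CustomMetric` escape hatch named in this error message uses the `{ name, criteria, steps }` shape seen in the `isCustomMetric` branch above. A sketch, assuming a plain object literal is accepted wherever `CustomMetric` is (the package also exports a `customMetric` helper):

```typescript
import fallom from "@fallom/trace";

const politeness = {
  name: "politeness",
  criteria: "Is the response polite and professional in tone?",
  steps: [
    "Read the response and assess its overall tone",
    "Check for dismissive or rude language",
  ],
};

await fallom.evals.evaluate({
  dataset: [{ input: "Hi", output: "Hello! How can I help you today?" }],
  // String metrics are validated against AVAILABLE_METRICS; objects pass through
  metrics: ["answer_relevancy", politeness],
});
```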
@@ -7,7 +7,7 @@ import {
   evaluate,
   init,
   uploadResultsPublic
-} from "./chunk-2NGJF2JZ.mjs";
+} from "./chunk-3VWF2OJX.mjs";
 import "./chunk-7P6ASYW6.mjs";
 export {
   DEFAULT_JUDGE_MODEL,
package/dist/index.d.mts CHANGED
@@ -568,22 +568,40 @@ interface GEvalScore {
   score: number;
   reasoning: string;
 }
+/**
+ * Options for runGEval function.
+ */
+interface RunGEvalOptions {
+    /** Built-in metric name or custom metric config */
+    metric: string | {
+        name: string;
+        criteria: string;
+        steps: string[];
+    };
+    /** The user's input/query */
+    inputText: string;
+    /** The LLM's response to evaluate */
+    outputText: string;
+    /** Optional system message for context */
+    systemMessage?: string;
+    /** The model to use as judge (OpenRouter format, e.g., "openai/gpt-4o-mini") */
+    judgeModel: string;
+    /** OpenRouter API key (defaults to OPENROUTER_API_KEY env var) */
+    openrouterKey?: string;
+    /** Optional Fallom API key to enable tracing of the judge LLM call */
+    fallomApiKey?: string;
+    /** Optional session ID for tracing (e.g., eval run ID) */
+    traceSessionId?: string;
+    /** Optional customer ID for tracing (e.g., organization ID) */
+    traceCustomerId?: string;
+}
 /**
  * Run G-Eval for a single metric using OpenRouter.
  * This is the low-level function used by both the SDK and backend workers.
  *
- * @param metric - Built-in metric name or custom metric config
- * @param inputText - The user's input/query
- * @param outputText - The LLM's response
- * @param systemMessage - Optional system message
- * @param judgeModel - The model to use as judge (OpenRouter format)
- * @param openrouterKey - OpenRouter API key (defaults to env var)
+ * If `fallomApiKey` is provided, the judge LLM call will be traced to Fallom.
  */
-declare function runGEval(metric: string | {
-    name: string;
-    criteria: string;
-    steps: string[];
-}, inputText: string, outputText: string, systemMessage: string | undefined, judgeModel: string, openrouterKey?: string): Promise<GEvalScore>;
+declare function runGEval(options: RunGEvalOptions): Promise<GEvalScore>;
 /**
  * Calculate aggregate scores from a list of results.
  */
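The three trailing optional fields are what enable tracing of the judge call. A hedged sketch of a fully traced invocation (key values and IDs below are placeholders):

```typescript
import fallom from "@fallom/trace";

const result = await fallom.evals.runGEval({
  metric: "answer_relevancy",
  inputText: "Summarize our refund policy.",
  outputText: "Refunds are available within 30 days of purchase.",
  judgeModel: "openai/gpt-4o-mini",
  openrouterKey: process.env.OPENROUTER_API_KEY,
  // New in 0.2.24: a Fallom API key turns on tracing of the judge LLM call
  fallomApiKey: process.env.FALLOM_API_KEY,
  traceSessionId: "eval-run-42", // e.g. an eval run ID
  traceCustomerId: "org_123",    // e.g. an organization ID
});
```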
@@ -614,12 +632,22 @@ declare function detectRegression(currentScores: Record<string, {
 };
 
 /**
- * Core evaluation functions.
+ * Core evaluation functions for Fallom Evals.
+ *
+ * Provides the main API for running LLM evaluations using G-Eval methodology.
  */
 
+/** Default judge model (via OpenRouter) */
 declare const DEFAULT_JUDGE_MODEL = "openai/gpt-4o-mini";
 /**
  * Initialize Fallom evals.
+ *
+ * @example
+ * ```typescript
+ * import fallom from "@fallom/trace";
+ *
+ * fallom.evals.init({ apiKey: "your-api-key" });
+ * ```
  */
 declare function init$1(options?: InitOptions$1): void;
 /**
@@ -627,6 +655,13 @@ declare function init$1(options?: InitOptions$1): void;
  *
  * Results are automatically uploaded to Fallom dashboard.
  *
+ * @example
+ * ```typescript
+ * const results = await fallom.evals.evaluate({
+ *   dataset: [{ input: "What is 2+2?", output: "4" }],
+ *   metrics: ["answer_relevancy", "faithfulness"],
+ * });
+ * ```
  */
 declare function evaluate(options: EvaluateOptions): Promise<EvalResult[]>;
 /**
@@ -848,6 +883,7 @@ type evals_MetricName = MetricName;
 type evals_Model = Model;
 type evals_ModelCallable = ModelCallable;
 type evals_ModelResponse = ModelResponse;
+type evals_RunGEvalOptions = RunGEvalOptions;
 declare const evals_buildGEvalPrompt: typeof buildGEvalPrompt;
 declare const evals_calculateAggregateScores: typeof calculateAggregateScores;
 declare const evals_compareModels: typeof compareModels;
@@ -863,7 +899,7 @@ declare const evals_getMetricName: typeof getMetricName;
 declare const evals_isCustomMetric: typeof isCustomMetric;
 declare const evals_runGEval: typeof runGEval;
 declare namespace evals {
-  export { evals_AVAILABLE_METRICS as AVAILABLE_METRICS, type evals_CompareModelsOptions as CompareModelsOptions, type evals_CustomMetric as CustomMetric, evals_DEFAULT_JUDGE_MODEL as DEFAULT_JUDGE_MODEL, type evals_DatasetInput as DatasetInput, type evals_DatasetItem as DatasetItem, type evals_EvalResult as EvalResult, type evals_EvaluateOptions as EvaluateOptions, evals_EvaluationDataset as EvaluationDataset, type evals_GEvalScore as GEvalScore, type evals_Golden as Golden, type InitOptions$1 as InitOptions, type evals_LLMTestCase as LLMTestCase, evals_METRIC_PROMPTS as METRIC_PROMPTS, type evals_Message as Message, type evals_MetricInput as MetricInput, type evals_MetricName as MetricName, type evals_Model as Model, type evals_ModelCallable as ModelCallable, type evals_ModelResponse as ModelResponse, evals_buildGEvalPrompt as buildGEvalPrompt, evals_calculateAggregateScores as calculateAggregateScores, evals_compareModels as compareModels, evals_createCustomModel as createCustomModel, evals_createModelFromCallable as createModelFromCallable, evals_createOpenAIModel as createOpenAIModel, evals_customMetric as customMetric, evals_datasetFromFallom as datasetFromFallom, evals_datasetFromTraces as datasetFromTraces, evals_detectRegression as detectRegression, evals_evaluate as evaluate, evals_getMetricName as getMetricName, init$1 as init, evals_isCustomMetric as isCustomMetric, evals_runGEval as runGEval, uploadResultsPublic as uploadResults };
+  export { evals_AVAILABLE_METRICS as AVAILABLE_METRICS, type evals_CompareModelsOptions as CompareModelsOptions, type evals_CustomMetric as CustomMetric, evals_DEFAULT_JUDGE_MODEL as DEFAULT_JUDGE_MODEL, type evals_DatasetInput as DatasetInput, type evals_DatasetItem as DatasetItem, type evals_EvalResult as EvalResult, type evals_EvaluateOptions as EvaluateOptions, evals_EvaluationDataset as EvaluationDataset, type evals_GEvalScore as GEvalScore, type evals_Golden as Golden, type InitOptions$1 as InitOptions, type evals_LLMTestCase as LLMTestCase, evals_METRIC_PROMPTS as METRIC_PROMPTS, type evals_Message as Message, type evals_MetricInput as MetricInput, type evals_MetricName as MetricName, type evals_Model as Model, type evals_ModelCallable as ModelCallable, type evals_ModelResponse as ModelResponse, type evals_RunGEvalOptions as RunGEvalOptions, evals_buildGEvalPrompt as buildGEvalPrompt, evals_calculateAggregateScores as calculateAggregateScores, evals_compareModels as compareModels, evals_createCustomModel as createCustomModel, evals_createModelFromCallable as createModelFromCallable, evals_createOpenAIModel as createOpenAIModel, evals_customMetric as customMetric, evals_datasetFromFallom as datasetFromFallom, evals_datasetFromTraces as datasetFromTraces, evals_detectRegression as detectRegression, evals_evaluate as evaluate, evals_getMetricName as getMetricName, init$1 as init, evals_isCustomMetric as isCustomMetric, evals_runGEval as runGEval, uploadResultsPublic as uploadResults };
 }
 
 /**
package/dist/index.d.ts CHANGED
@@ -568,22 +568,40 @@ interface GEvalScore {
   score: number;
   reasoning: string;
 }
+/**
+ * Options for runGEval function.
+ */
+interface RunGEvalOptions {
+    /** Built-in metric name or custom metric config */
+    metric: string | {
+        name: string;
+        criteria: string;
+        steps: string[];
+    };
+    /** The user's input/query */
+    inputText: string;
+    /** The LLM's response to evaluate */
+    outputText: string;
+    /** Optional system message for context */
+    systemMessage?: string;
+    /** The model to use as judge (OpenRouter format, e.g., "openai/gpt-4o-mini") */
+    judgeModel: string;
+    /** OpenRouter API key (defaults to OPENROUTER_API_KEY env var) */
+    openrouterKey?: string;
+    /** Optional Fallom API key to enable tracing of the judge LLM call */
+    fallomApiKey?: string;
+    /** Optional session ID for tracing (e.g., eval run ID) */
+    traceSessionId?: string;
+    /** Optional customer ID for tracing (e.g., organization ID) */
+    traceCustomerId?: string;
+}
 /**
  * Run G-Eval for a single metric using OpenRouter.
  * This is the low-level function used by both the SDK and backend workers.
  *
- * @param metric - Built-in metric name or custom metric config
- * @param inputText - The user's input/query
- * @param outputText - The LLM's response
- * @param systemMessage - Optional system message
- * @param judgeModel - The model to use as judge (OpenRouter format)
- * @param openrouterKey - OpenRouter API key (defaults to env var)
+ * If `fallomApiKey` is provided, the judge LLM call will be traced to Fallom.
  */
-declare function runGEval(metric: string | {
-    name: string;
-    criteria: string;
-    steps: string[];
-}, inputText: string, outputText: string, systemMessage: string | undefined, judgeModel: string, openrouterKey?: string): Promise<GEvalScore>;
+declare function runGEval(options: RunGEvalOptions): Promise<GEvalScore>;
 /**
  * Calculate aggregate scores from a list of results.
  */
@@ -614,12 +632,22 @@ declare function detectRegression(currentScores: Record<string, {
 };
 
 /**
- * Core evaluation functions.
+ * Core evaluation functions for Fallom Evals.
+ *
+ * Provides the main API for running LLM evaluations using G-Eval methodology.
  */
 
+/** Default judge model (via OpenRouter) */
 declare const DEFAULT_JUDGE_MODEL = "openai/gpt-4o-mini";
 /**
  * Initialize Fallom evals.
+ *
+ * @example
+ * ```typescript
+ * import fallom from "@fallom/trace";
+ *
+ * fallom.evals.init({ apiKey: "your-api-key" });
+ * ```
  */
 declare function init$1(options?: InitOptions$1): void;
 /**
@@ -627,6 +655,13 @@ declare function init$1(options?: InitOptions$1): void;
  *
  * Results are automatically uploaded to Fallom dashboard.
  *
+ * @example
+ * ```typescript
+ * const results = await fallom.evals.evaluate({
+ *   dataset: [{ input: "What is 2+2?", output: "4" }],
+ *   metrics: ["answer_relevancy", "faithfulness"],
+ * });
+ * ```
  */
 declare function evaluate(options: EvaluateOptions): Promise<EvalResult[]>;
 /**
@@ -848,6 +883,7 @@ type evals_MetricName = MetricName;
 type evals_Model = Model;
 type evals_ModelCallable = ModelCallable;
 type evals_ModelResponse = ModelResponse;
+type evals_RunGEvalOptions = RunGEvalOptions;
 declare const evals_buildGEvalPrompt: typeof buildGEvalPrompt;
 declare const evals_calculateAggregateScores: typeof calculateAggregateScores;
 declare const evals_compareModels: typeof compareModels;
@@ -863,7 +899,7 @@ declare const evals_getMetricName: typeof getMetricName;
 declare const evals_isCustomMetric: typeof isCustomMetric;
 declare const evals_runGEval: typeof runGEval;
 declare namespace evals {
-  export { evals_AVAILABLE_METRICS as AVAILABLE_METRICS, type evals_CompareModelsOptions as CompareModelsOptions, type evals_CustomMetric as CustomMetric, evals_DEFAULT_JUDGE_MODEL as DEFAULT_JUDGE_MODEL, type evals_DatasetInput as DatasetInput, type evals_DatasetItem as DatasetItem, type evals_EvalResult as EvalResult, type evals_EvaluateOptions as EvaluateOptions, evals_EvaluationDataset as EvaluationDataset, type evals_GEvalScore as GEvalScore, type evals_Golden as Golden, type InitOptions$1 as InitOptions, type evals_LLMTestCase as LLMTestCase, evals_METRIC_PROMPTS as METRIC_PROMPTS, type evals_Message as Message, type evals_MetricInput as MetricInput, type evals_MetricName as MetricName, type evals_Model as Model, type evals_ModelCallable as ModelCallable, type evals_ModelResponse as ModelResponse, evals_buildGEvalPrompt as buildGEvalPrompt, evals_calculateAggregateScores as calculateAggregateScores, evals_compareModels as compareModels, evals_createCustomModel as createCustomModel, evals_createModelFromCallable as createModelFromCallable, evals_createOpenAIModel as createOpenAIModel, evals_customMetric as customMetric, evals_datasetFromFallom as datasetFromFallom, evals_datasetFromTraces as datasetFromTraces, evals_detectRegression as detectRegression, evals_evaluate as evaluate, evals_getMetricName as getMetricName, init$1 as init, evals_isCustomMetric as isCustomMetric, evals_runGEval as runGEval, uploadResultsPublic as uploadResults };
+  export { evals_AVAILABLE_METRICS as AVAILABLE_METRICS, type evals_CompareModelsOptions as CompareModelsOptions, type evals_CustomMetric as CustomMetric, evals_DEFAULT_JUDGE_MODEL as DEFAULT_JUDGE_MODEL, type evals_DatasetInput as DatasetInput, type evals_DatasetItem as DatasetItem, type evals_EvalResult as EvalResult, type evals_EvaluateOptions as EvaluateOptions, evals_EvaluationDataset as EvaluationDataset, type evals_GEvalScore as GEvalScore, type evals_Golden as Golden, type InitOptions$1 as InitOptions, type evals_LLMTestCase as LLMTestCase, evals_METRIC_PROMPTS as METRIC_PROMPTS, type evals_Message as Message, type evals_MetricInput as MetricInput, type evals_MetricName as MetricName, type evals_Model as Model, type evals_ModelCallable as ModelCallable, type evals_ModelResponse as ModelResponse, type evals_RunGEvalOptions as RunGEvalOptions, evals_buildGEvalPrompt as buildGEvalPrompt, evals_calculateAggregateScores as calculateAggregateScores, evals_compareModels as compareModels, evals_createCustomModel as createCustomModel, evals_createModelFromCallable as createModelFromCallable, evals_createOpenAIModel as createOpenAIModel, evals_customMetric as customMetric, evals_datasetFromFallom as datasetFromFallom, evals_datasetFromTraces as datasetFromTraces, evals_detectRegression as detectRegression, evals_evaluate as evaluate, evals_getMetricName as getMetricName, init$1 as init, evals_isCustomMetric as isCustomMetric, evals_runGEval as runGEval, uploadResultsPublic as uploadResults };
 }
 
 /**
package/dist/index.js CHANGED
@@ -378,7 +378,18 @@ Respond in JSON format:
   "score": 0.85
 }`;
 }
-async function runGEval(metric, inputText, outputText, systemMessage, judgeModel, openrouterKey) {
+async function runGEval(options) {
+  const {
+    metric,
+    inputText,
+    outputText,
+    systemMessage,
+    judgeModel,
+    openrouterKey,
+    fallomApiKey,
+    traceSessionId,
+    traceCustomerId
+  } = options;
   const apiKey4 = openrouterKey || process.env.OPENROUTER_API_KEY;
   if (!apiKey4) {
     throw new Error(
@@ -389,6 +400,7 @@ async function runGEval(metric, inputText, outputText, systemMessage, judgeModel
   if (!config) {
     throw new Error(`Unknown metric: ${metric}`);
   }
+  const metricName = typeof metric === "object" ? metric.name : metric;
   const prompt = buildGEvalPrompt(
     config.criteria,
     config.steps,
@@ -396,6 +408,7 @@ async function runGEval(metric, inputText, outputText, systemMessage, judgeModel
     inputText,
     outputText
   );
+  const startTime = Date.now();
   const response = await fetch(
     "https://openrouter.ai/api/v1/chat/completions",
     {
@@ -416,17 +429,94 @@ async function runGEval(metric, inputText, outputText, systemMessage, judgeModel
     throw new Error(`G-Eval API error: ${response.statusText}`);
   }
   const data = await response.json();
+  const endTime = Date.now();
   try {
     const result = JSON.parse(data.choices[0].message.content);
-    return {
-      score: Math.max(0, Math.min(1, result.score)),
-      // Clamp to 0-1
-      reasoning: result.overall_reasoning || ""
-    };
+    const score = Math.max(0, Math.min(1, result.score));
+    const reasoning = result.overall_reasoning || "";
+    if (fallomApiKey) {
+      sendGEvalTrace({
+        fallomApiKey,
+        metricName,
+        judgeModel,
+        prompt,
+        response: data.choices[0].message.content,
+        score,
+        reasoning,
+        startTime,
+        endTime,
+        usage: data.usage,
+        sessionId: traceSessionId,
+        customerId: traceCustomerId
+      }).catch(() => {
+      });
+    }
+    return { score, reasoning };
   } catch {
     throw new Error("Failed to parse G-Eval response");
   }
 }
+async function sendGEvalTrace(options) {
+  const {
+    fallomApiKey,
+    metricName,
+    judgeModel,
+    prompt,
+    response,
+    score,
+    reasoning,
+    startTime,
+    endTime,
+    usage,
+    sessionId,
+    customerId
+  } = options;
+  const traceUrl = process.env.FALLOM_TRACES_URL || "https://traces.fallom.com";
+  const traceData = {
+    config_key: "eval-worker",
+    session_id: sessionId || `geval-${Date.now()}`,
+    customer_id: customerId,
+    trace_id: generateHexId2(32),
+    span_id: generateHexId2(16),
+    name: `geval.${metricName}`,
+    kind: "llm",
+    model: judgeModel,
+    start_time: new Date(startTime).toISOString(),
+    end_time: new Date(endTime).toISOString(),
+    duration_ms: endTime - startTime,
+    status: "OK",
+    metadata: {
+      metric: metricName,
+      score
+    },
+    tags: ["eval-worker", "geval", metricName],
+    attributes: {
+      "fallom.sdk_version": "2",
+      "fallom.method": "runGEval",
+      "geval.metric": metricName,
+      "geval.score": score,
+      "geval.reasoning": reasoning,
+      "gen_ai.prompt.0.role": "user",
+      "gen_ai.prompt.0.content": prompt,
+      "gen_ai.completion.0.content": response,
+      "gen_ai.usage.prompt_tokens": usage?.prompt_tokens,
+      "gen_ai.usage.completion_tokens": usage?.completion_tokens
+    }
+  };
+  await fetch(`${traceUrl}/v1/traces`, {
+    method: "POST",
+    headers: {
+      Authorization: `Bearer ${fallomApiKey}`,
+      "Content-Type": "application/json"
+    },
+    body: JSON.stringify(traceData)
+  });
+}
+function generateHexId2(length) {
+  const bytes = new Uint8Array(length / 2);
+  crypto.getRandomValues(bytes);
+  return Array.from(bytes).map((b) => b.toString(16).padStart(2, "0")).join("");
+}
 function calculateAggregateScores(results) {
   const aggregates = {};
   for (const result of results) {
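One detail worth flagging in the ID helper: `generateHexId2(32)` produces a 32-character hex trace ID (16 random bytes) and `generateHexId2(16)` a 16-character span ID (8 bytes), matching W3C Trace Context sizes. A standalone sketch of the same logic:

```typescript
// n hex characters from n/2 cryptographically random bytes.
// crypto.getRandomValues is the Web Crypto API, a global in Node 18+ and browsers.
function generateHexId(length: number): string {
  const bytes = new Uint8Array(length / 2);
  crypto.getRandomValues(bytes);
  return Array.from(bytes)
    .map((b) => b.toString(16).padStart(2, "0"))
    .join("");
}

const traceId = generateHexId(32); // e.g. "4bf92f3577b34da6a3ce929d0e0e4736"
const spanId = generateHexId(16);  // e.g. "00f067aa0ba902b7"
```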
@@ -894,7 +984,13 @@ function init4(options = {}) {
 }
 async function runGEval2(metric, inputText, outputText, systemMessage, judgeModel) {
   const metricArg = isCustomMetric(metric) ? { name: metric.name, criteria: metric.criteria, steps: metric.steps } : metric;
-  return runGEval(metricArg, inputText, outputText, systemMessage, judgeModel);
+  return runGEval({
+    metric: metricArg,
+    inputText,
+    outputText,
+    systemMessage,
+    judgeModel
+  });
 }
 async function resolveDataset(datasetInput) {
   if (typeof datasetInput === "string") {
@@ -966,7 +1062,9 @@ async function evaluate(options) {
   for (const m of metrics) {
     if (typeof m === "string" && !AVAILABLE_METRICS.includes(m)) {
       throw new Error(
-        `Invalid metric: ${m}. Available: ${AVAILABLE_METRICS.join(", ")}. Or use CustomMetric for custom metrics.`
+        `Invalid metric: ${m}. Available: ${AVAILABLE_METRICS.join(
+          ", "
+        )}. Or use CustomMetric for custom metrics.`
       );
     }
   }
@@ -1297,7 +1395,7 @@ var import_exporter_trace_otlp_http = require("@opentelemetry/exporter-trace-otl
 // node_modules/@opentelemetry/resources/build/esm/Resource.js
 var import_api = require("@opentelemetry/api");
 
-// node_modules/@opentelemetry/semantic-conventions/build/esm/resource/SemanticResourceAttributes.js
+// node_modules/@opentelemetry/resources/node_modules/@opentelemetry/semantic-conventions/build/esm/resource/SemanticResourceAttributes.js
 var SemanticResourceAttributes = {
   /**
    * Name of the cloud provider.
package/dist/index.mjs CHANGED
@@ -23,7 +23,7 @@ import {
   isCustomMetric,
   runGEval,
   uploadResultsPublic
-} from "./chunk-GZ6TE7G4.mjs";
+} from "./chunk-3VWF2OJX.mjs";
 import {
   __export
 } from "./chunk-7P6ASYW6.mjs";
@@ -45,7 +45,7 @@ import { OTLPTraceExporter } from "@opentelemetry/exporter-trace-otlp-http";
 // node_modules/@opentelemetry/resources/build/esm/Resource.js
 import { diag } from "@opentelemetry/api";
 
-// node_modules/@opentelemetry/semantic-conventions/build/esm/resource/SemanticResourceAttributes.js
+// node_modules/@opentelemetry/resources/node_modules/@opentelemetry/semantic-conventions/build/esm/resource/SemanticResourceAttributes.js
 var SemanticResourceAttributes = {
   /**
    * Name of the cloud provider.
package/package.json CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "@fallom/trace",
-  "version": "0.2.22",
+  "version": "0.2.24",
   "description": "Model A/B testing and tracing for LLM applications. Zero latency, production-ready.",
   "main": "./dist/index.js",
   "module": "./dist/index.mjs",