@fallom/trace 0.2.21 → 0.2.23
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/chunk-KFD5AQ7V.mjs +308 -0
- package/dist/{chunk-GZ6TE7G4.mjs → chunk-NNVWIZN5.mjs} +101 -10
- package/dist/{core-DUG2SP2V.mjs → core-3MHBKYBC.mjs} +1 -1
- package/dist/index.d.mts +46 -14
- package/dist/index.d.ts +46 -14
- package/dist/index.js +108 -9
- package/dist/index.mjs +6 -2
- package/dist/models-SEFDGZU2.mjs +8 -0
- package/package.json +1 -1
- package/dist/chunk-XBZ3ESNV.mjs +0 -824
- package/dist/core-JLHYFVYS.mjs +0 -21
package/dist/index.d.ts
CHANGED
@@ -568,22 +568,36 @@ interface GEvalScore {
     score: number;
     reasoning: string;
 }
+/**
+ * Options for runGEval function.
+ */
+interface RunGEvalOptions {
+    /** Built-in metric name or custom metric config */
+    metric: string | {
+        name: string;
+        criteria: string;
+        steps: string[];
+    };
+    /** The user's input/query */
+    inputText: string;
+    /** The LLM's response to evaluate */
+    outputText: string;
+    /** Optional system message for context */
+    systemMessage?: string;
+    /** The model to use as judge (OpenRouter format, e.g., "openai/gpt-4o-mini") */
+    judgeModel: string;
+    /** OpenRouter API key (defaults to OPENROUTER_API_KEY env var) */
+    openrouterKey?: string;
+    /** Optional Fallom API key to enable tracing of the judge LLM call */
+    fallomApiKey?: string;
+}
 /**
  * Run G-Eval for a single metric using OpenRouter.
  * This is the low-level function used by both the SDK and backend workers.
  *
- *
- * @param inputText - The user's input/query
- * @param outputText - The LLM's response
- * @param systemMessage - Optional system message
- * @param judgeModel - The model to use as judge (OpenRouter format)
- * @param openrouterKey - OpenRouter API key (defaults to env var)
+ * If `fallomApiKey` is provided, the judge LLM call will be traced to Fallom.
  */
-declare function runGEval(metric: string | {
-    name: string;
-    criteria: string;
-    steps: string[];
-}, inputText: string, outputText: string, systemMessage: string | undefined, judgeModel: string, openrouterKey?: string): Promise<GEvalScore>;
+declare function runGEval(options: RunGEvalOptions): Promise<GEvalScore>;
 /**
  * Calculate aggregate scores from a list of results.
  */
@@ -614,12 +628,22 @@ declare function detectRegression(currentScores: Record<string, {
 };
 
 /**
- * Core evaluation functions.
+ * Core evaluation functions for Fallom Evals.
+ *
+ * Provides the main API for running LLM evaluations using G-Eval methodology.
  */
 
+/** Default judge model (via OpenRouter) */
 declare const DEFAULT_JUDGE_MODEL = "openai/gpt-4o-mini";
 /**
  * Initialize Fallom evals.
+ *
+ * @example
+ * ```typescript
+ * import fallom from "@fallom/trace";
+ *
+ * fallom.evals.init({ apiKey: "your-api-key" });
+ * ```
  */
 declare function init$1(options?: InitOptions$1): void;
 /**
@@ -627,6 +651,13 @@ declare function init$1(options?: InitOptions$1): void;
  *
  * Results are automatically uploaded to Fallom dashboard.
  *
+ * @example
+ * ```typescript
+ * const results = await fallom.evals.evaluate({
+ *   dataset: [{ input: "What is 2+2?", output: "4" }],
+ *   metrics: ["answer_relevancy", "faithfulness"],
+ * });
+ * ```
  */
 declare function evaluate(options: EvaluateOptions): Promise<EvalResult[]>;
 /**
@@ -848,6 +879,7 @@ type evals_MetricName = MetricName;
 type evals_Model = Model;
 type evals_ModelCallable = ModelCallable;
 type evals_ModelResponse = ModelResponse;
+type evals_RunGEvalOptions = RunGEvalOptions;
 declare const evals_buildGEvalPrompt: typeof buildGEvalPrompt;
 declare const evals_calculateAggregateScores: typeof calculateAggregateScores;
 declare const evals_compareModels: typeof compareModels;
@@ -863,7 +895,7 @@ declare const evals_getMetricName: typeof getMetricName;
 declare const evals_isCustomMetric: typeof isCustomMetric;
 declare const evals_runGEval: typeof runGEval;
 declare namespace evals {
-  export { evals_AVAILABLE_METRICS as AVAILABLE_METRICS, type evals_CompareModelsOptions as CompareModelsOptions, type evals_CustomMetric as CustomMetric, evals_DEFAULT_JUDGE_MODEL as DEFAULT_JUDGE_MODEL, type evals_DatasetInput as DatasetInput, type evals_DatasetItem as DatasetItem, type evals_EvalResult as EvalResult, type evals_EvaluateOptions as EvaluateOptions, evals_EvaluationDataset as EvaluationDataset, type evals_GEvalScore as GEvalScore, type evals_Golden as Golden, type InitOptions$1 as InitOptions, type evals_LLMTestCase as LLMTestCase, evals_METRIC_PROMPTS as METRIC_PROMPTS, type evals_Message as Message, type evals_MetricInput as MetricInput, type evals_MetricName as MetricName, type evals_Model as Model, type evals_ModelCallable as ModelCallable, type evals_ModelResponse as ModelResponse, evals_buildGEvalPrompt as buildGEvalPrompt, evals_calculateAggregateScores as calculateAggregateScores, evals_compareModels as compareModels, evals_createCustomModel as createCustomModel, evals_createModelFromCallable as createModelFromCallable, evals_createOpenAIModel as createOpenAIModel, evals_customMetric as customMetric, evals_datasetFromFallom as datasetFromFallom, evals_datasetFromTraces as datasetFromTraces, evals_detectRegression as detectRegression, evals_evaluate as evaluate, evals_getMetricName as getMetricName, init$1 as init, evals_isCustomMetric as isCustomMetric, evals_runGEval as runGEval, uploadResultsPublic as uploadResults };
+  export { evals_AVAILABLE_METRICS as AVAILABLE_METRICS, type evals_CompareModelsOptions as CompareModelsOptions, type evals_CustomMetric as CustomMetric, evals_DEFAULT_JUDGE_MODEL as DEFAULT_JUDGE_MODEL, type evals_DatasetInput as DatasetInput, type evals_DatasetItem as DatasetItem, type evals_EvalResult as EvalResult, type evals_EvaluateOptions as EvaluateOptions, evals_EvaluationDataset as EvaluationDataset, type evals_GEvalScore as GEvalScore, type evals_Golden as Golden, type InitOptions$1 as InitOptions, type evals_LLMTestCase as LLMTestCase, evals_METRIC_PROMPTS as METRIC_PROMPTS, type evals_Message as Message, type evals_MetricInput as MetricInput, type evals_MetricName as MetricName, type evals_Model as Model, type evals_ModelCallable as ModelCallable, type evals_ModelResponse as ModelResponse, type evals_RunGEvalOptions as RunGEvalOptions, evals_buildGEvalPrompt as buildGEvalPrompt, evals_calculateAggregateScores as calculateAggregateScores, evals_compareModels as compareModels, evals_createCustomModel as createCustomModel, evals_createModelFromCallable as createModelFromCallable, evals_createOpenAIModel as createOpenAIModel, evals_customMetric as customMetric, evals_datasetFromFallom as datasetFromFallom, evals_datasetFromTraces as datasetFromTraces, evals_detectRegression as detectRegression, evals_evaluate as evaluate, evals_getMetricName as getMetricName, init$1 as init, evals_isCustomMetric as isCustomMetric, evals_runGEval as runGEval, uploadResultsPublic as uploadResults };
 }
 
 /**
@@ -1072,4 +1104,4 @@ declare const _default: {
     session: typeof session;
 };
 
-export { type CompareModelsOptions, type DatasetItem, type EvalResult, type EvaluateOptions, FallomExporter, type FallomExporterOptions, FallomSession, type InitOptions, type MetricName, type PromptResult, type SessionContext, type SessionOptions, clearMastraPrompt, _default as default, evals, init, models, prompts, session, setMastraPrompt, setMastraPromptAB, trace };
+export { type CompareModelsOptions, type DatasetItem, type EvalResult, type EvaluateOptions, FallomExporter, type FallomExporterOptions, FallomSession, type GEvalScore, type InitOptions, type MetricName, type PromptResult, type SessionContext, type SessionOptions, buildGEvalPrompt, calculateAggregateScores, clearMastraPrompt, _default as default, detectRegression, evals, init, models, prompts, runGEval, session, setMastraPrompt, setMastraPromptAB, trace };
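The signature change above is the core of this release: `runGEval` now takes a single `RunGEvalOptions` object instead of six positional arguments. A minimal migration sketch based on these declarations (the metric name and judge model are illustrative values):

```typescript
import { runGEval } from "@fallom/trace";

// 0.2.21 (old, positional):
// await runGEval("faithfulness", inputText, outputText, undefined, "openai/gpt-4o-mini");

// 0.2.23 (new, options object):
const result = await runGEval({
  metric: "faithfulness", // or { name, criteria, steps } for a custom metric
  inputText: "What is 2+2?",
  outputText: "4",
  judgeModel: "openai/gpt-4o-mini",
  // openrouterKey falls back to process.env.OPENROUTER_API_KEY
});

console.log(result.score, result.reasoning); // GEvalScore
```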
package/dist/index.js
CHANGED
@@ -378,7 +378,16 @@ Respond in JSON format:
   "score": 0.85
 }`;
 }
-async function runGEval(metric, inputText, outputText, systemMessage, judgeModel, openrouterKey) {
+async function runGEval(options) {
+  const {
+    metric,
+    inputText,
+    outputText,
+    systemMessage,
+    judgeModel,
+    openrouterKey,
+    fallomApiKey
+  } = options;
   const apiKey4 = openrouterKey || process.env.OPENROUTER_API_KEY;
   if (!apiKey4) {
     throw new Error(
@@ -389,6 +398,7 @@ async function runGEval(metric, inputText, outputText, systemMessage, judgeModel
   if (!config) {
     throw new Error(`Unknown metric: ${metric}`);
   }
+  const metricName = typeof metric === "object" ? metric.name : metric;
   const prompt = buildGEvalPrompt(
     config.criteria,
     config.steps,
@@ -396,6 +406,7 @@ async function runGEval(metric, inputText, outputText, systemMessage, judgeModel
     inputText,
     outputText
   );
+  const startTime = Date.now();
   const response = await fetch(
     "https://openrouter.ai/api/v1/chat/completions",
     {
@@ -416,17 +427,89 @@ async function runGEval(metric, inputText, outputText, systemMessage, judgeModel
     throw new Error(`G-Eval API error: ${response.statusText}`);
   }
   const data = await response.json();
+  const endTime = Date.now();
   try {
     const result = JSON.parse(data.choices[0].message.content);
-
-
-
-
-
+    const score = Math.max(0, Math.min(1, result.score));
+    const reasoning = result.overall_reasoning || "";
+    if (fallomApiKey) {
+      sendGEvalTrace({
+        fallomApiKey,
+        metricName,
+        judgeModel,
+        prompt,
+        response: data.choices[0].message.content,
+        score,
+        reasoning,
+        startTime,
+        endTime,
+        usage: data.usage
+      }).catch(() => {
+      });
+    }
+    return { score, reasoning };
   } catch {
     throw new Error("Failed to parse G-Eval response");
   }
 }
+async function sendGEvalTrace(options) {
+  const {
+    fallomApiKey,
+    metricName,
+    judgeModel,
+    prompt,
+    response,
+    score,
+    reasoning,
+    startTime,
+    endTime,
+    usage
+  } = options;
+  const traceUrl = process.env.FALLOM_TRACES_URL || "https://traces.fallom.com";
+  const traceData = {
+    config_key: "eval-worker",
+    session_id: `geval-${Date.now()}`,
+    trace_id: generateHexId2(32),
+    span_id: generateHexId2(16),
+    name: `geval.${metricName}`,
+    kind: "llm",
+    model: judgeModel,
+    start_time: new Date(startTime).toISOString(),
+    end_time: new Date(endTime).toISOString(),
+    duration_ms: endTime - startTime,
+    status: "OK",
+    metadata: {
+      metric: metricName,
+      score
+    },
+    tags: ["eval-worker", "geval", metricName],
+    attributes: {
+      "fallom.sdk_version": "2",
+      "fallom.method": "runGEval",
+      "geval.metric": metricName,
+      "geval.score": score,
+      "geval.reasoning": reasoning,
+      "gen_ai.prompt.0.role": "user",
+      "gen_ai.prompt.0.content": prompt,
+      "gen_ai.completion.0.content": response,
+      "gen_ai.usage.prompt_tokens": usage?.prompt_tokens,
+      "gen_ai.usage.completion_tokens": usage?.completion_tokens
+    }
+  };
+  await fetch(`${traceUrl}/v1/traces`, {
+    method: "POST",
+    headers: {
+      Authorization: `Bearer ${fallomApiKey}`,
+      "Content-Type": "application/json"
+    },
+    body: JSON.stringify(traceData)
+  });
+}
+function generateHexId2(length) {
+  const bytes = new Uint8Array(length / 2);
+  crypto.getRandomValues(bytes);
+  return Array.from(bytes).map((b) => b.toString(16).padStart(2, "0")).join("");
+}
 function calculateAggregateScores(results) {
   const aggregates = {};
   for (const result of results) {
@@ -894,7 +977,13 @@ function init4(options = {}) {
 }
 async function runGEval2(metric, inputText, outputText, systemMessage, judgeModel) {
   const metricArg = isCustomMetric(metric) ? { name: metric.name, criteria: metric.criteria, steps: metric.steps } : metric;
-  return runGEval(metricArg, inputText, outputText, systemMessage, judgeModel);
+  return runGEval({
+    metric: metricArg,
+    inputText,
+    outputText,
+    systemMessage,
+    judgeModel
+  });
 }
 async function resolveDataset(datasetInput) {
   if (typeof datasetInput === "string") {
@@ -966,7 +1055,9 @@ async function evaluate(options) {
   for (const m of metrics) {
     if (typeof m === "string" && !AVAILABLE_METRICS.includes(m)) {
       throw new Error(
-        `Invalid metric: ${m}. Available: ${AVAILABLE_METRICS.join(", ")}. Or use CustomMetric for custom metrics.`
+        `Invalid metric: ${m}. Available: ${AVAILABLE_METRICS.join(
+          ", "
+        )}. Or use CustomMetric for custom metrics.`
       );
     }
   }
@@ -1263,12 +1354,16 @@ var index_exports = {};
 __export(index_exports, {
   FallomExporter: () => FallomExporter,
   FallomSession: () => FallomSession,
+  buildGEvalPrompt: () => buildGEvalPrompt,
+  calculateAggregateScores: () => calculateAggregateScores,
   clearMastraPrompt: () => clearMastraPrompt,
   default: () => index_default,
+  detectRegression: () => detectRegression,
   evals: () => evals_exports,
   init: () => init5,
   models: () => models_exports,
   prompts: () => prompts_exports,
+  runGEval: () => runGEval,
   session: () => session,
   setMastraPrompt: () => setMastraPrompt,
   setMastraPromptAB: () => setMastraPromptAB,
@@ -1293,7 +1388,7 @@ var import_exporter_trace_otlp_http = require("@opentelemetry/exporter-trace-otl
 // node_modules/@opentelemetry/resources/build/esm/Resource.js
 var import_api = require("@opentelemetry/api");
 
-// node_modules/@opentelemetry/semantic-conventions/build/esm/resource/SemanticResourceAttributes.js
+// node_modules/@opentelemetry/resources/node_modules/@opentelemetry/semantic-conventions/build/esm/resource/SemanticResourceAttributes.js
 var SemanticResourceAttributes = {
   /**
    * Name of the cloud provider.
@@ -4289,11 +4384,15 @@ var index_default = {
 0 && (module.exports = {
   FallomExporter,
   FallomSession,
+  buildGEvalPrompt,
+  calculateAggregateScores,
   clearMastraPrompt,
+  detectRegression,
   evals,
   init,
   models,
   prompts,
+  runGEval,
   session,
   setMastraPrompt,
   setMastraPromptAB,
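The new `fallomApiKey` path is fire-and-forget: `sendGEvalTrace` POSTs a span for the judge call to `FALLOM_TRACES_URL` (default `https://traces.fallom.com`), and the trailing `.catch(() => {})` ensures a failed upload never fails the evaluation itself. A hedged sketch of opting in (the env-var name holding the key is an assumption):

```typescript
import { runGEval } from "@fallom/trace";

// Passing fallomApiKey traces the judge LLM call to Fallom;
// omitting it keeps runGEval side-effect free, as before.
const result = await runGEval({
  metric: "answer_relevancy",
  inputText: "What is 2+2?",
  outputText: "4",
  judgeModel: "openai/gpt-4o-mini",
  fallomApiKey: process.env.FALLOM_API_KEY, // assumed env-var name
});
```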
package/dist/index.mjs
CHANGED
@@ -23,7 +23,7 @@ import {
   isCustomMetric,
   runGEval,
   uploadResultsPublic
-} from "./chunk-GZ6TE7G4.mjs";
+} from "./chunk-NNVWIZN5.mjs";
 import {
   __export
 } from "./chunk-7P6ASYW6.mjs";
@@ -45,7 +45,7 @@ import { OTLPTraceExporter } from "@opentelemetry/exporter-trace-otlp-http";
 // node_modules/@opentelemetry/resources/build/esm/Resource.js
 import { diag } from "@opentelemetry/api";
 
-// node_modules/@opentelemetry/semantic-conventions/build/esm/resource/SemanticResourceAttributes.js
+// node_modules/@opentelemetry/resources/node_modules/@opentelemetry/semantic-conventions/build/esm/resource/SemanticResourceAttributes.js
 var SemanticResourceAttributes = {
   /**
    * Name of the cloud provider.
@@ -3031,12 +3031,16 @@ var index_default = {
 export {
   FallomExporter,
   FallomSession,
+  buildGEvalPrompt,
+  calculateAggregateScores,
   clearMastraPrompt,
   index_default as default,
+  detectRegression,
   evals_exports as evals,
   init5 as init,
   models_exports as models,
   prompts_exports as prompts,
+  runGEval,
   session,
   setMastraPrompt,
   setMastraPromptAB,
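Both bundles now re-export the low-level eval helpers at the package root, so they no longer have to be reached through the `evals` namespace. A sketch mirroring the export lists above:

```typescript
// Previously only available as fallom.evals.*; now also top-level exports.
import {
  buildGEvalPrompt,
  calculateAggregateScores,
  detectRegression,
  runGEval,
} from "@fallom/trace";
```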
package/package.json
CHANGED