@fallom/trace 0.2.17 → 0.2.21
This diff shows the content of publicly released package versions as they appear in their respective public registries and is provided for informational purposes only.
- package/dist/chunk-GZ6TE7G4.mjs +923 -0
- package/dist/chunk-XBZ3ESNV.mjs +824 -0
- package/dist/core-DUG2SP2V.mjs +21 -0
- package/dist/core-JLHYFVYS.mjs +21 -0
- package/dist/index.d.mts +64 -2
- package/dist/index.d.ts +64 -2
- package/dist/index.js +305 -114
- package/dist/index.mjs +137 -34
- package/package.json +1 -1
- package/dist/chunk-KFD5AQ7V.mjs +0 -308
- package/dist/models-SEFDGZU2.mjs +0 -8
@@ -0,0 +1,21 @@
+import {
+  DEFAULT_JUDGE_MODEL,
+  _apiKey,
+  _baseUrl,
+  _initialized,
+  compareModels,
+  evaluate,
+  init,
+  uploadResultsPublic
+} from "./chunk-GZ6TE7G4.mjs";
+import "./chunk-7P6ASYW6.mjs";
+export {
+  DEFAULT_JUDGE_MODEL,
+  _apiKey,
+  _baseUrl,
+  _initialized,
+  compareModels,
+  evaluate,
+  init,
+  uploadResultsPublic
+};
@@ -0,0 +1,21 @@
+import {
+  DEFAULT_JUDGE_MODEL,
+  _apiKey,
+  _baseUrl,
+  _initialized,
+  compareModels,
+  evaluate,
+  init,
+  uploadResultsPublic
+} from "./chunk-XBZ3ESNV.mjs";
+import "./chunk-7P6ASYW6.mjs";
+export {
+  DEFAULT_JUDGE_MODEL,
+  _apiKey,
+  _baseUrl,
+  _initialized,
+  compareModels,
+  evaluate,
+  init,
+  uploadResultsPublic
+};
package/dist/index.d.mts
CHANGED
@@ -415,7 +415,7 @@ declare namespace prompts {
  * Type definitions for Fallom Evals.
  */
 /** Built-in metric names */
-type MetricName = "answer_relevancy" | "hallucination" | "toxicity" | "faithfulness" | "completeness";
+type MetricName = "answer_relevancy" | "hallucination" | "toxicity" | "faithfulness" | "completeness" | "coherence" | "bias";
 /** List of all available built-in metrics */
 declare const AVAILABLE_METRICS: MetricName[];
 /**
@@ -452,6 +452,8 @@ interface EvalResult {
     toxicity?: number;
     faithfulness?: number;
     completeness?: number;
+    coherence?: number;
+    bias?: number;
     reasoning: Record<string, string>;
     latencyMs?: number;
     tokensIn?: number;
@@ -555,6 +557,61 @@ declare const METRIC_PROMPTS: Record<MetricName, {
     criteria: string;
     steps: string[];
 }>;
+/**
+ * Build the G-Eval prompt for the LLM judge.
+ */
+declare function buildGEvalPrompt(criteria: string, steps: string[], systemMessage: string | undefined, inputText: string, outputText: string): string;
+/**
+ * Result of running G-Eval on a single metric.
+ */
+interface GEvalScore {
+    score: number;
+    reasoning: string;
+}
+/**
+ * Run G-Eval for a single metric using OpenRouter.
+ * This is the low-level function used by both the SDK and backend workers.
+ *
+ * @param metric - Built-in metric name or custom metric config
+ * @param inputText - The user's input/query
+ * @param outputText - The LLM's response
+ * @param systemMessage - Optional system message
+ * @param judgeModel - The model to use as judge (OpenRouter format)
+ * @param openrouterKey - OpenRouter API key (defaults to env var)
+ */
+declare function runGEval(metric: string | {
+    name: string;
+    criteria: string;
+    steps: string[];
+}, inputText: string, outputText: string, systemMessage: string | undefined, judgeModel: string, openrouterKey?: string): Promise<GEvalScore>;
+/**
+ * Calculate aggregate scores from a list of results.
+ */
+declare function calculateAggregateScores(results: Array<{
+    scores: Record<string, {
+        score: number;
+    }>;
+}>): Record<string, {
+    avg: number;
+    min: number;
+    max: number;
+    count: number;
+}>;
+/**
+ * Detect regression by comparing current scores to previous scores.
+ */
+declare function detectRegression(currentScores: Record<string, {
+    avg: number;
+}>, previousScores: Record<string, {
+    avg: number;
+}>, threshold?: number): {
+    detected: boolean;
+    details: Record<string, {
+        current: number;
+        previous: number;
+        delta: number;
+    }>;
+};
 
 /**
  * Core evaluation functions.
@@ -781,6 +838,7 @@ type evals_EvalResult = EvalResult;
 type evals_EvaluateOptions = EvaluateOptions;
 type evals_EvaluationDataset = EvaluationDataset;
 declare const evals_EvaluationDataset: typeof EvaluationDataset;
+type evals_GEvalScore = GEvalScore;
 type evals_Golden = Golden;
 type evals_LLMTestCase = LLMTestCase;
 declare const evals_METRIC_PROMPTS: typeof METRIC_PROMPTS;
@@ -790,6 +848,8 @@ type evals_MetricName = MetricName;
 type evals_Model = Model;
 type evals_ModelCallable = ModelCallable;
 type evals_ModelResponse = ModelResponse;
+declare const evals_buildGEvalPrompt: typeof buildGEvalPrompt;
+declare const evals_calculateAggregateScores: typeof calculateAggregateScores;
 declare const evals_compareModels: typeof compareModels;
 declare const evals_createCustomModel: typeof createCustomModel;
 declare const evals_createModelFromCallable: typeof createModelFromCallable;
@@ -797,11 +857,13 @@ declare const evals_createOpenAIModel: typeof createOpenAIModel;
 declare const evals_customMetric: typeof customMetric;
 declare const evals_datasetFromFallom: typeof datasetFromFallom;
 declare const evals_datasetFromTraces: typeof datasetFromTraces;
+declare const evals_detectRegression: typeof detectRegression;
 declare const evals_evaluate: typeof evaluate;
 declare const evals_getMetricName: typeof getMetricName;
 declare const evals_isCustomMetric: typeof isCustomMetric;
+declare const evals_runGEval: typeof runGEval;
 declare namespace evals {
-  export { evals_AVAILABLE_METRICS as AVAILABLE_METRICS, type evals_CompareModelsOptions as CompareModelsOptions, type evals_CustomMetric as CustomMetric, evals_DEFAULT_JUDGE_MODEL as DEFAULT_JUDGE_MODEL, type evals_DatasetInput as DatasetInput, type evals_DatasetItem as DatasetItem, type evals_EvalResult as EvalResult, type evals_EvaluateOptions as EvaluateOptions, evals_EvaluationDataset as EvaluationDataset, type evals_Golden as Golden, type InitOptions$1 as InitOptions, type evals_LLMTestCase as LLMTestCase, evals_METRIC_PROMPTS as METRIC_PROMPTS, type evals_Message as Message, type evals_MetricInput as MetricInput, type evals_MetricName as MetricName, type evals_Model as Model, type evals_ModelCallable as ModelCallable, type evals_ModelResponse as ModelResponse, evals_compareModels as compareModels, evals_createCustomModel as createCustomModel, evals_createModelFromCallable as createModelFromCallable, evals_createOpenAIModel as createOpenAIModel, evals_customMetric as customMetric, evals_datasetFromFallom as datasetFromFallom, evals_datasetFromTraces as datasetFromTraces, evals_evaluate as evaluate, evals_getMetricName as getMetricName, init$1 as init, evals_isCustomMetric as isCustomMetric, uploadResultsPublic as uploadResults };
+  export { evals_AVAILABLE_METRICS as AVAILABLE_METRICS, type evals_CompareModelsOptions as CompareModelsOptions, type evals_CustomMetric as CustomMetric, evals_DEFAULT_JUDGE_MODEL as DEFAULT_JUDGE_MODEL, type evals_DatasetInput as DatasetInput, type evals_DatasetItem as DatasetItem, type evals_EvalResult as EvalResult, type evals_EvaluateOptions as EvaluateOptions, evals_EvaluationDataset as EvaluationDataset, type evals_GEvalScore as GEvalScore, type evals_Golden as Golden, type InitOptions$1 as InitOptions, type evals_LLMTestCase as LLMTestCase, evals_METRIC_PROMPTS as METRIC_PROMPTS, type evals_Message as Message, type evals_MetricInput as MetricInput, type evals_MetricName as MetricName, type evals_Model as Model, type evals_ModelCallable as ModelCallable, type evals_ModelResponse as ModelResponse, evals_buildGEvalPrompt as buildGEvalPrompt, evals_calculateAggregateScores as calculateAggregateScores, evals_compareModels as compareModels, evals_createCustomModel as createCustomModel, evals_createModelFromCallable as createModelFromCallable, evals_createOpenAIModel as createOpenAIModel, evals_customMetric as customMetric, evals_datasetFromFallom as datasetFromFallom, evals_datasetFromTraces as datasetFromTraces, evals_detectRegression as detectRegression, evals_evaluate as evaluate, evals_getMetricName as getMetricName, init$1 as init, evals_isCustomMetric as isCustomMetric, evals_runGEval as runGEval, uploadResultsPublic as uploadResults };
 }
 
 /**
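Taken together, the declarations above expose a small G-Eval surface (buildGEvalPrompt, runGEval, calculateAggregateScores, detectRegression, the GEvalScore type, and the new coherence and bias metrics). A minimal usage sketch based only on these declared signatures, assuming the evals namespace is exported from the package root, that OPENROUTER_API_KEY is set in the environment, and that the strings and score values are placeholders:

import { evals } from "@fallom/trace";

async function demo() {
  // Judge one input/output pair on a built-in metric added in this release.
  const coherence = await evals.runGEval(
    "coherence",                           // metric (built-in name or custom config)
    "Summarize the meeting notes.",        // inputText (placeholder)
    "The team agreed to ship on Friday.",  // outputText (placeholder)
    undefined,                             // systemMessage
    evals.DEFAULT_JUDGE_MODEL              // judgeModel, OpenRouter format
  );
  console.log(coherence.score, coherence.reasoning); // GEvalScore

  // Aggregate per-metric scores, then compare against a previous run.
  const current = evals.calculateAggregateScores([
    { scores: { coherence: { score: coherence.score }, bias: { score: 0.95 } } }, // placeholder values
  ]);
  const previous = { coherence: { avg: 0.95 }, bias: { avg: 0.9 } }; // e.g. loaded from an earlier run
  const regression = evals.detectRegression(current, previous);     // optional threshold not passed
  if (regression.detected) console.warn(regression.details);
}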
package/dist/index.d.ts
CHANGED
@@ -415,7 +415,7 @@ declare namespace prompts {
  * Type definitions for Fallom Evals.
  */
 /** Built-in metric names */
-type MetricName = "answer_relevancy" | "hallucination" | "toxicity" | "faithfulness" | "completeness";
+type MetricName = "answer_relevancy" | "hallucination" | "toxicity" | "faithfulness" | "completeness" | "coherence" | "bias";
 /** List of all available built-in metrics */
 declare const AVAILABLE_METRICS: MetricName[];
 /**
@@ -452,6 +452,8 @@ interface EvalResult {
     toxicity?: number;
     faithfulness?: number;
     completeness?: number;
+    coherence?: number;
+    bias?: number;
     reasoning: Record<string, string>;
     latencyMs?: number;
     tokensIn?: number;
@@ -555,6 +557,61 @@ declare const METRIC_PROMPTS: Record<MetricName, {
     criteria: string;
     steps: string[];
 }>;
+/**
+ * Build the G-Eval prompt for the LLM judge.
+ */
+declare function buildGEvalPrompt(criteria: string, steps: string[], systemMessage: string | undefined, inputText: string, outputText: string): string;
+/**
+ * Result of running G-Eval on a single metric.
+ */
+interface GEvalScore {
+    score: number;
+    reasoning: string;
+}
+/**
+ * Run G-Eval for a single metric using OpenRouter.
+ * This is the low-level function used by both the SDK and backend workers.
+ *
+ * @param metric - Built-in metric name or custom metric config
+ * @param inputText - The user's input/query
+ * @param outputText - The LLM's response
+ * @param systemMessage - Optional system message
+ * @param judgeModel - The model to use as judge (OpenRouter format)
+ * @param openrouterKey - OpenRouter API key (defaults to env var)
+ */
+declare function runGEval(metric: string | {
+    name: string;
+    criteria: string;
+    steps: string[];
+}, inputText: string, outputText: string, systemMessage: string | undefined, judgeModel: string, openrouterKey?: string): Promise<GEvalScore>;
+/**
+ * Calculate aggregate scores from a list of results.
+ */
+declare function calculateAggregateScores(results: Array<{
+    scores: Record<string, {
+        score: number;
+    }>;
+}>): Record<string, {
+    avg: number;
+    min: number;
+    max: number;
+    count: number;
+}>;
+/**
+ * Detect regression by comparing current scores to previous scores.
+ */
+declare function detectRegression(currentScores: Record<string, {
+    avg: number;
+}>, previousScores: Record<string, {
+    avg: number;
+}>, threshold?: number): {
+    detected: boolean;
+    details: Record<string, {
+        current: number;
+        previous: number;
+        delta: number;
+    }>;
+};
 
 /**
  * Core evaluation functions.
@@ -781,6 +838,7 @@ type evals_EvalResult = EvalResult;
 type evals_EvaluateOptions = EvaluateOptions;
 type evals_EvaluationDataset = EvaluationDataset;
 declare const evals_EvaluationDataset: typeof EvaluationDataset;
+type evals_GEvalScore = GEvalScore;
 type evals_Golden = Golden;
 type evals_LLMTestCase = LLMTestCase;
 declare const evals_METRIC_PROMPTS: typeof METRIC_PROMPTS;
@@ -790,6 +848,8 @@ type evals_MetricName = MetricName;
 type evals_Model = Model;
 type evals_ModelCallable = ModelCallable;
 type evals_ModelResponse = ModelResponse;
+declare const evals_buildGEvalPrompt: typeof buildGEvalPrompt;
+declare const evals_calculateAggregateScores: typeof calculateAggregateScores;
 declare const evals_compareModels: typeof compareModels;
 declare const evals_createCustomModel: typeof createCustomModel;
 declare const evals_createModelFromCallable: typeof createModelFromCallable;
@@ -797,11 +857,13 @@ declare const evals_createOpenAIModel: typeof createOpenAIModel;
 declare const evals_customMetric: typeof customMetric;
 declare const evals_datasetFromFallom: typeof datasetFromFallom;
 declare const evals_datasetFromTraces: typeof datasetFromTraces;
+declare const evals_detectRegression: typeof detectRegression;
 declare const evals_evaluate: typeof evaluate;
 declare const evals_getMetricName: typeof getMetricName;
 declare const evals_isCustomMetric: typeof isCustomMetric;
+declare const evals_runGEval: typeof runGEval;
 declare namespace evals {
-  export { evals_AVAILABLE_METRICS as AVAILABLE_METRICS, type evals_CompareModelsOptions as CompareModelsOptions, type evals_CustomMetric as CustomMetric, evals_DEFAULT_JUDGE_MODEL as DEFAULT_JUDGE_MODEL, type evals_DatasetInput as DatasetInput, type evals_DatasetItem as DatasetItem, type evals_EvalResult as EvalResult, type evals_EvaluateOptions as EvaluateOptions, evals_EvaluationDataset as EvaluationDataset, type evals_Golden as Golden, type InitOptions$1 as InitOptions, type evals_LLMTestCase as LLMTestCase, evals_METRIC_PROMPTS as METRIC_PROMPTS, type evals_Message as Message, type evals_MetricInput as MetricInput, type evals_MetricName as MetricName, type evals_Model as Model, type evals_ModelCallable as ModelCallable, type evals_ModelResponse as ModelResponse, evals_compareModels as compareModels, evals_createCustomModel as createCustomModel, evals_createModelFromCallable as createModelFromCallable, evals_createOpenAIModel as createOpenAIModel, evals_customMetric as customMetric, evals_datasetFromFallom as datasetFromFallom, evals_datasetFromTraces as datasetFromTraces, evals_evaluate as evaluate, evals_getMetricName as getMetricName, init$1 as init, evals_isCustomMetric as isCustomMetric, uploadResultsPublic as uploadResults };
+  export { evals_AVAILABLE_METRICS as AVAILABLE_METRICS, type evals_CompareModelsOptions as CompareModelsOptions, type evals_CustomMetric as CustomMetric, evals_DEFAULT_JUDGE_MODEL as DEFAULT_JUDGE_MODEL, type evals_DatasetInput as DatasetInput, type evals_DatasetItem as DatasetItem, type evals_EvalResult as EvalResult, type evals_EvaluateOptions as EvaluateOptions, evals_EvaluationDataset as EvaluationDataset, type evals_GEvalScore as GEvalScore, type evals_Golden as Golden, type InitOptions$1 as InitOptions, type evals_LLMTestCase as LLMTestCase, evals_METRIC_PROMPTS as METRIC_PROMPTS, type evals_Message as Message, type evals_MetricInput as MetricInput, type evals_MetricName as MetricName, type evals_Model as Model, type evals_ModelCallable as ModelCallable, type evals_ModelResponse as ModelResponse, evals_buildGEvalPrompt as buildGEvalPrompt, evals_calculateAggregateScores as calculateAggregateScores, evals_compareModels as compareModels, evals_createCustomModel as createCustomModel, evals_createModelFromCallable as createModelFromCallable, evals_createOpenAIModel as createOpenAIModel, evals_customMetric as customMetric, evals_datasetFromFallom as datasetFromFallom, evals_datasetFromTraces as datasetFromTraces, evals_detectRegression as detectRegression, evals_evaluate as evaluate, evals_getMetricName as getMetricName, init$1 as init, evals_isCustomMetric as isCustomMetric, evals_runGEval as runGEval, uploadResultsPublic as uploadResults };
 }
 
 /**