judgeval 0.2.4 → 0.2.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cjs/common/logger.js +28 -24
- package/dist/cjs/common/logger.js.map +1 -1
- package/dist/cjs/common/tracer.js +80 -130
- package/dist/cjs/common/tracer.js.map +1 -1
- package/dist/cjs/constants.js +2 -1
- package/dist/cjs/constants.js.map +1 -1
- package/dist/cjs/data/datasets/eval-dataset-client.js +45 -0
- package/dist/cjs/data/datasets/eval-dataset-client.js.map +1 -1
- package/dist/cjs/e2etests/eval-operations.test.js +3 -3
- package/dist/cjs/exporters/otel-exporter.js +352 -0
- package/dist/cjs/exporters/otel-exporter.js.map +1 -0
- package/dist/cjs/judges/index.js +217 -0
- package/dist/cjs/judges/index.js.map +1 -0
- package/dist/cjs/run-evaluation.js +13 -13
- package/dist/cjs/run-evaluation.js.map +1 -1
- package/dist/cjs/scorers/metrics/answer-correctness/answer-correctness.js +610 -0
- package/dist/cjs/scorers/metrics/answer-correctness/answer-correctness.js.map +1 -0
- package/dist/cjs/scorers/metrics/answer-correctness/index.js +19 -0
- package/dist/cjs/scorers/metrics/answer-correctness/index.js.map +1 -0
- package/dist/cjs/scorers/metrics/answer-correctness/prompts.js +175 -0
- package/dist/cjs/scorers/metrics/answer-correctness/prompts.js.map +1 -0
- package/dist/cjs/scorers/metrics/answer-relevancy/answer-relevancy.js +525 -0
- package/dist/cjs/scorers/metrics/answer-relevancy/answer-relevancy.js.map +1 -0
- package/dist/cjs/scorers/metrics/answer-relevancy/index.js +19 -0
- package/dist/cjs/scorers/metrics/answer-relevancy/index.js.map +1 -0
- package/dist/cjs/scorers/metrics/answer-relevancy/prompts.js +179 -0
- package/dist/cjs/scorers/metrics/answer-relevancy/prompts.js.map +1 -0
- package/dist/cjs/scorers/metrics/faithfulness/faithfulness.js +524 -0
- package/dist/cjs/scorers/metrics/faithfulness/faithfulness.js.map +1 -0
- package/dist/cjs/scorers/metrics/faithfulness/index.js +19 -0
- package/dist/cjs/scorers/metrics/faithfulness/index.js.map +1 -0
- package/dist/cjs/scorers/metrics/faithfulness/prompts.js +232 -0
- package/dist/cjs/scorers/metrics/faithfulness/prompts.js.map +1 -0
- package/dist/cjs/scorers/metrics/hallucination/hallucination.js +390 -0
- package/dist/cjs/scorers/metrics/hallucination/hallucination.js.map +1 -0
- package/dist/cjs/scorers/metrics/hallucination/index.js +11 -0
- package/dist/cjs/scorers/metrics/hallucination/index.js.map +1 -0
- package/dist/cjs/scorers/metrics/hallucination/prompts.js +106 -0
- package/dist/cjs/scorers/metrics/hallucination/prompts.js.map +1 -0
- package/dist/cjs/scorers/metrics/instruction-adherence/index.js +19 -0
- package/dist/cjs/scorers/metrics/instruction-adherence/index.js.map +1 -0
- package/dist/cjs/scorers/metrics/instruction-adherence/instruction-adherence.js +382 -0
- package/dist/cjs/scorers/metrics/instruction-adherence/instruction-adherence.js.map +1 -0
- package/dist/cjs/scorers/metrics/instruction-adherence/prompts.js +124 -0
- package/dist/cjs/scorers/metrics/instruction-adherence/prompts.js.map +1 -0
- package/dist/esm/common/logger.js +16 -11
- package/dist/esm/common/logger.js.map +1 -1
- package/dist/esm/common/tracer.js +78 -128
- package/dist/esm/common/tracer.js.map +1 -1
- package/dist/esm/constants.js +1 -0
- package/dist/esm/constants.js.map +1 -1
- package/dist/esm/data/datasets/eval-dataset-client.js +46 -1
- package/dist/esm/data/datasets/eval-dataset-client.js.map +1 -1
- package/dist/esm/e2etests/eval-operations.test.js +3 -3
- package/dist/esm/exporters/otel-exporter.js +348 -0
- package/dist/esm/exporters/otel-exporter.js.map +1 -0
- package/dist/esm/judges/index.js +185 -0
- package/dist/esm/judges/index.js.map +1 -0
- package/dist/esm/scorers/metrics/answer-correctness/answer-correctness.js +601 -0
- package/dist/esm/scorers/metrics/answer-correctness/answer-correctness.js.map +1 -0
- package/dist/esm/scorers/metrics/answer-correctness/index.js +3 -0
- package/dist/esm/scorers/metrics/answer-correctness/index.js.map +1 -0
- package/dist/esm/scorers/metrics/answer-correctness/prompts.js +171 -0
- package/dist/esm/scorers/metrics/answer-correctness/prompts.js.map +1 -0
- package/dist/esm/scorers/metrics/answer-relevancy/answer-relevancy.js +521 -0
- package/dist/esm/scorers/metrics/answer-relevancy/answer-relevancy.js.map +1 -0
- package/dist/esm/scorers/metrics/answer-relevancy/index.js +3 -0
- package/dist/esm/scorers/metrics/answer-relevancy/index.js.map +1 -0
- package/dist/esm/scorers/metrics/answer-relevancy/prompts.js +175 -0
- package/dist/esm/scorers/metrics/answer-relevancy/prompts.js.map +1 -0
- package/dist/esm/scorers/metrics/faithfulness/faithfulness.js +520 -0
- package/dist/esm/scorers/metrics/faithfulness/faithfulness.js.map +1 -0
- package/dist/esm/scorers/metrics/faithfulness/index.js +3 -0
- package/dist/esm/scorers/metrics/faithfulness/index.js.map +1 -0
- package/dist/esm/scorers/metrics/faithfulness/prompts.js +228 -0
- package/dist/esm/scorers/metrics/faithfulness/prompts.js.map +1 -0
- package/dist/esm/scorers/metrics/hallucination/hallucination.js +386 -0
- package/dist/esm/scorers/metrics/hallucination/hallucination.js.map +1 -0
- package/dist/esm/scorers/metrics/hallucination/index.js +3 -0
- package/dist/esm/scorers/metrics/hallucination/index.js.map +1 -0
- package/dist/esm/scorers/metrics/hallucination/prompts.js +102 -0
- package/dist/esm/scorers/metrics/hallucination/prompts.js.map +1 -0
- package/dist/esm/scorers/metrics/instruction-adherence/index.js +3 -0
- package/dist/esm/scorers/metrics/instruction-adherence/index.js.map +1 -0
- package/dist/esm/scorers/metrics/instruction-adherence/instruction-adherence.js +378 -0
- package/dist/esm/scorers/metrics/instruction-adherence/instruction-adherence.js.map +1 -0
- package/dist/esm/scorers/metrics/instruction-adherence/prompts.js +120 -0
- package/dist/esm/scorers/metrics/instruction-adherence/prompts.js.map +1 -0
- package/dist/types/common/logger.d.ts +1 -1
- package/dist/types/constants.d.ts +1 -0
- package/dist/types/data/datasets/eval-dataset-client.d.ts +5 -0
- package/dist/types/exporters/otel-exporter.d.ts +16 -0
- package/dist/types/judges/index.d.ts +50 -0
- package/dist/types/scorers/metrics/answer-correctness/answer-correctness.d.ts +99 -0
- package/dist/types/scorers/metrics/answer-correctness/index.d.ts +2 -0
- package/dist/types/scorers/metrics/answer-correctness/prompts.d.ts +71 -0
- package/dist/types/scorers/metrics/answer-relevancy/answer-relevancy.d.ts +78 -0
- package/dist/types/scorers/metrics/answer-relevancy/index.d.ts +2 -0
- package/dist/types/scorers/metrics/answer-relevancy/prompts.d.ts +71 -0
- package/dist/types/scorers/metrics/faithfulness/faithfulness.d.ts +77 -0
- package/dist/types/scorers/metrics/faithfulness/index.d.ts +2 -0
- package/dist/types/scorers/metrics/faithfulness/prompts.d.ts +94 -0
- package/dist/types/scorers/metrics/hallucination/hallucination.d.ts +67 -0
- package/dist/types/scorers/metrics/hallucination/index.d.ts +3 -0
- package/dist/types/scorers/metrics/hallucination/prompts.d.ts +63 -0
- package/dist/types/scorers/metrics/instruction-adherence/index.d.ts +2 -0
- package/dist/types/scorers/metrics/instruction-adherence/instruction-adherence.d.ts +67 -0
- package/dist/types/scorers/metrics/instruction-adherence/prompts.d.ts +78 -0
- package/package.json +32 -14
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
import { ExportResult } from '@opentelemetry/core';
|
|
2
|
+
import { ReadableSpan, SpanExporter } from "@opentelemetry/sdk-trace-base";
|
|
3
|
+
interface OtelExporterConfig {
|
|
4
|
+
apiKey: string;
|
|
5
|
+
organizationId: string;
|
|
6
|
+
serviceName?: string;
|
|
7
|
+
}
|
|
8
|
+
export declare class JudgevalExporter implements SpanExporter {
|
|
9
|
+
private serviceName;
|
|
10
|
+
private apiKey;
|
|
11
|
+
private organizationId;
|
|
12
|
+
constructor(config: OtelExporterConfig);
|
|
13
|
+
export(spans: ReadableSpan[], resultCallback: (result: ExportResult) => void): Promise<void>;
|
|
14
|
+
shutdown(): Promise<void>;
|
|
15
|
+
}
|
|
16
|
+
export {};
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Interface for judge models that can generate text
|
|
3
|
+
*/
|
|
4
|
+
export interface Judge {
|
|
5
|
+
/**
|
|
6
|
+
* Generate text synchronously
|
|
7
|
+
*/
|
|
8
|
+
generate(prompt: string): string;
|
|
9
|
+
/**
|
|
10
|
+
* Generate text asynchronously
|
|
11
|
+
*/
|
|
12
|
+
aGenerate(prompt: string): Promise<string>;
|
|
13
|
+
/**
|
|
14
|
+
* Get the name of the model
|
|
15
|
+
*/
|
|
16
|
+
getModelName(): string;
|
|
17
|
+
}
|
|
18
|
+
/**
|
|
19
|
+
* Default judge implementation using OpenAI API
|
|
20
|
+
*/
|
|
21
|
+
export declare class DefaultJudge implements Judge {
|
|
22
|
+
private modelName;
|
|
23
|
+
private apiKey?;
|
|
24
|
+
private user?;
|
|
25
|
+
constructor(modelName?: string, apiKey?: string, user?: string);
|
|
26
|
+
generate(prompt: string): string;
|
|
27
|
+
aGenerate(prompt: string): Promise<string>;
|
|
28
|
+
getModelName(): string;
|
|
29
|
+
}
|
|
30
|
+
/**
|
|
31
|
+
* Together AI judge implementation
|
|
32
|
+
*/
|
|
33
|
+
export declare class TogetherJudge implements Judge {
|
|
34
|
+
private modelName;
|
|
35
|
+
private apiKey?;
|
|
36
|
+
constructor(modelName?: string, apiKey?: string);
|
|
37
|
+
generate(prompt: string): string;
|
|
38
|
+
aGenerate(prompt: string): Promise<string>;
|
|
39
|
+
getModelName(): string;
|
|
40
|
+
}
|
|
41
|
+
/**
|
|
42
|
+
* Create a judge instance
|
|
43
|
+
* @param model Model name or Judge instance
|
|
44
|
+
* @param user Optional user identifier
|
|
45
|
+
* @returns Judge instance and whether it's a native model
|
|
46
|
+
*/
|
|
47
|
+
export declare function createJudge(model?: string | Judge, user?: string): {
|
|
48
|
+
judge: Judge;
|
|
49
|
+
usingNativeModel: boolean;
|
|
50
|
+
};
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
import { Example } from '../../../data/example.js';
|
|
2
|
+
import { ScorerData } from '../../../data/result.js';
|
|
3
|
+
import { JudgevalScorer } from '../../base-scorer.js';
|
|
4
|
+
export interface Judge {
|
|
5
|
+
generate(prompt: string): string;
|
|
6
|
+
aGenerate(prompt: string): Promise<string>;
|
|
7
|
+
getModelName(): string;
|
|
8
|
+
}
|
|
9
|
+
export declare class DefaultJudge implements Judge {
|
|
10
|
+
private modelName;
|
|
11
|
+
private apiKey?;
|
|
12
|
+
private user?;
|
|
13
|
+
constructor(modelName?: string, apiKey?: string, user?: string);
|
|
14
|
+
generate(prompt: string): string;
|
|
15
|
+
aGenerate(prompt: string): Promise<string>;
|
|
16
|
+
getModelName(): string;
|
|
17
|
+
}
|
|
18
|
+
export declare function createJudge(model?: string | Judge, user?: string): {
|
|
19
|
+
judge: Judge;
|
|
20
|
+
usingNativeModel: boolean;
|
|
21
|
+
};
|
|
22
|
+
/**
|
|
23
|
+
* AnswerCorrectnessScorer evaluates how well an actual output matches an expected output
|
|
24
|
+
* by breaking down the expected output into statements and checking if each statement
|
|
25
|
+
* is correctly represented in the actual output.
|
|
26
|
+
*/
|
|
27
|
+
export declare class AnswerCorrectnessScorer extends JudgevalScorer {
|
|
28
|
+
private model;
|
|
29
|
+
private usingNativeModel;
|
|
30
|
+
private statements?;
|
|
31
|
+
private verdicts?;
|
|
32
|
+
evaluation_cost?: number;
|
|
33
|
+
reason?: string;
|
|
34
|
+
/**
|
|
35
|
+
* Constructor for AnswerCorrectnessScorer
|
|
36
|
+
* @param threshold Minimum score to consider the evaluation successful (default: 0.5)
|
|
37
|
+
* @param model LLM to use for evaluation (string or Judge instance)
|
|
38
|
+
* @param include_reason Whether to generate a reason for the score
|
|
39
|
+
* @param async_mode Whether to use asynchronous evaluation
|
|
40
|
+
* @param strict_mode If true, sets threshold to 1.0 (requiring perfect match)
|
|
41
|
+
* @param verbose_mode Enables detailed logging
|
|
42
|
+
* @param user Optional user identifier for the LLM
|
|
43
|
+
* @param additional_metadata Additional metadata to include in the result
|
|
44
|
+
*/
|
|
45
|
+
constructor(threshold?: number, model?: string | Judge, include_reason?: boolean, async_mode?: boolean, strict_mode?: boolean, verbose_mode?: boolean, user?: string, additional_metadata?: Record<string, any>);
|
|
46
|
+
/**
|
|
47
|
+
* Get statements from expected output asynchronously
|
|
48
|
+
*/
|
|
49
|
+
private _aGetStatements;
|
|
50
|
+
/**
|
|
51
|
+
* Get statements from expected output synchronously
|
|
52
|
+
*/
|
|
53
|
+
private _getStatements;
|
|
54
|
+
/**
|
|
55
|
+
* Get verdicts for statements against actual output asynchronously
|
|
56
|
+
*/
|
|
57
|
+
private _aGetVerdicts;
|
|
58
|
+
/**
|
|
59
|
+
* Get verdicts for statements against actual output synchronously
|
|
60
|
+
*/
|
|
61
|
+
private _getVerdicts;
|
|
62
|
+
/**
|
|
63
|
+
* Get reason for the score asynchronously
|
|
64
|
+
*/
|
|
65
|
+
private _aGetReason;
|
|
66
|
+
/**
|
|
67
|
+
* Get reason for the score synchronously
|
|
68
|
+
*/
|
|
69
|
+
private _getReason;
|
|
70
|
+
/**
|
|
71
|
+
* Compute score based on verdicts
|
|
72
|
+
*/
|
|
73
|
+
private _computeScore;
|
|
74
|
+
/**
|
|
75
|
+
* Calculate token costs for model usage
|
|
76
|
+
*/
|
|
77
|
+
private _calculateTokenCosts;
|
|
78
|
+
/**
|
|
79
|
+
* Check if example has required parameters
|
|
80
|
+
*/
|
|
81
|
+
private _checkExampleParams;
|
|
82
|
+
/**
|
|
83
|
+
* Create verbose logs for debugging
|
|
84
|
+
*/
|
|
85
|
+
private _createVerboseLogs;
|
|
86
|
+
/**
|
|
87
|
+
* Score an example synchronously - this is for compatibility with the Python SDK
|
|
88
|
+
*/
|
|
89
|
+
syncScoreExample(example: Example): ScorerData;
|
|
90
|
+
/**
|
|
91
|
+
* Score an example - this is the main method that should be called
|
|
92
|
+
* It will use async or sync methods based on the async_mode setting
|
|
93
|
+
*/
|
|
94
|
+
scoreExample(example: Example): Promise<ScorerData>;
|
|
95
|
+
/**
|
|
96
|
+
* Get the name of the scorer
|
|
97
|
+
*/
|
|
98
|
+
get name(): string;
|
|
99
|
+
}
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Utility prompts for AnswerCorrectnessScorer
|
|
3
|
+
*/
|
|
4
|
+
import { z } from 'zod';
|
|
5
|
+
export declare const StatementsSchema: z.ZodObject<{
|
|
6
|
+
statements: z.ZodArray<z.ZodString, "many">;
|
|
7
|
+
}, "strip", z.ZodTypeAny, {
|
|
8
|
+
statements: string[];
|
|
9
|
+
}, {
|
|
10
|
+
statements: string[];
|
|
11
|
+
}>;
|
|
12
|
+
export type Statements = z.infer<typeof StatementsSchema>;
|
|
13
|
+
export declare const ACVerdictSchema: z.ZodObject<{
|
|
14
|
+
verdict: z.ZodString;
|
|
15
|
+
reason: z.ZodString;
|
|
16
|
+
}, "strip", z.ZodTypeAny, {
|
|
17
|
+
reason: string;
|
|
18
|
+
verdict: string;
|
|
19
|
+
}, {
|
|
20
|
+
reason: string;
|
|
21
|
+
verdict: string;
|
|
22
|
+
}>;
|
|
23
|
+
export type ACVerdict = z.infer<typeof ACVerdictSchema>;
|
|
24
|
+
export declare const VerdictsSchema: z.ZodObject<{
|
|
25
|
+
verdicts: z.ZodArray<z.ZodObject<{
|
|
26
|
+
verdict: z.ZodString;
|
|
27
|
+
reason: z.ZodString;
|
|
28
|
+
}, "strip", z.ZodTypeAny, {
|
|
29
|
+
reason: string;
|
|
30
|
+
verdict: string;
|
|
31
|
+
}, {
|
|
32
|
+
reason: string;
|
|
33
|
+
verdict: string;
|
|
34
|
+
}>, "many">;
|
|
35
|
+
}, "strip", z.ZodTypeAny, {
|
|
36
|
+
verdicts: {
|
|
37
|
+
reason: string;
|
|
38
|
+
verdict: string;
|
|
39
|
+
}[];
|
|
40
|
+
}, {
|
|
41
|
+
verdicts: {
|
|
42
|
+
reason: string;
|
|
43
|
+
verdict: string;
|
|
44
|
+
}[];
|
|
45
|
+
}>;
|
|
46
|
+
export type Verdicts = z.infer<typeof VerdictsSchema>;
|
|
47
|
+
export declare const ReasonSchema: z.ZodObject<{
|
|
48
|
+
reason: z.ZodString;
|
|
49
|
+
}, "strip", z.ZodTypeAny, {
|
|
50
|
+
reason: string;
|
|
51
|
+
}, {
|
|
52
|
+
reason: string;
|
|
53
|
+
}>;
|
|
54
|
+
export type Reason = z.infer<typeof ReasonSchema>;
|
|
55
|
+
/**
|
|
56
|
+
* Template prompts for the AnswerCorrectnessScorer
|
|
57
|
+
*/
|
|
58
|
+
export declare class AnswerCorrectnessTemplate {
|
|
59
|
+
/**
|
|
60
|
+
* Generate a prompt to extract statements from the expected output
|
|
61
|
+
*/
|
|
62
|
+
static deduceStatements(expectedOutput: string): string;
|
|
63
|
+
/**
|
|
64
|
+
* Generate a prompt to evaluate statements against the actual output
|
|
65
|
+
*/
|
|
66
|
+
static generateVerdicts(statements: string[], actualOutput: string): string;
|
|
67
|
+
/**
|
|
68
|
+
* Generate a prompt to explain the score based on incorrect statements
|
|
69
|
+
*/
|
|
70
|
+
static generateReason(incorrectStatements: [string, string][], score: string): string;
|
|
71
|
+
}
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
import { Example } from '../../../data/example.js';
|
|
2
|
+
import { ScorerData } from '../../../data/result.js';
|
|
3
|
+
import { JudgevalScorer } from '../../base-scorer.js';
|
|
4
|
+
import { Judge } from '../answer-correctness/answer-correctness.js';
|
|
5
|
+
/**
|
|
6
|
+
* AnswerRelevancyScorer evaluates how relevant the actual output is to the input
|
|
7
|
+
* by breaking down the actual output into statements and checking if each statement
|
|
8
|
+
* is relevant to the input.
|
|
9
|
+
*/
|
|
10
|
+
export declare class AnswerRelevancyScorer extends JudgevalScorer {
|
|
11
|
+
private model;
|
|
12
|
+
private usingNativeModel;
|
|
13
|
+
private statements?;
|
|
14
|
+
private verdicts?;
|
|
15
|
+
evaluation_cost?: number;
|
|
16
|
+
reason?: string;
|
|
17
|
+
/**
|
|
18
|
+
* Constructor for AnswerRelevancyScorer
|
|
19
|
+
* @param threshold Minimum score to consider the evaluation successful (default: 0.5)
|
|
20
|
+
* @param model LLM to use for evaluation (string or Judge instance)
|
|
21
|
+
* @param include_reason Whether to generate a reason for the score
|
|
22
|
+
* @param async_mode Whether to use asynchronous evaluation
|
|
23
|
+
* @param strict_mode If true, sets threshold to 1.0 (requiring perfect match)
|
|
24
|
+
* @param verbose_mode Enables detailed logging
|
|
25
|
+
* @param user Optional user identifier for the LLM
|
|
26
|
+
* @param additional_metadata Additional metadata to include in the result
|
|
27
|
+
*/
|
|
28
|
+
constructor(threshold?: number, model?: string | Judge, include_reason?: boolean, async_mode?: boolean, strict_mode?: boolean, verbose_mode?: boolean, user?: string, additional_metadata?: Record<string, any>);
|
|
29
|
+
/**
|
|
30
|
+
* Get statements from actual output asynchronously
|
|
31
|
+
*/
|
|
32
|
+
private _aGetStatements;
|
|
33
|
+
/**
|
|
34
|
+
* Get statements from actual output synchronously
|
|
35
|
+
*/
|
|
36
|
+
private _getStatements;
|
|
37
|
+
/**
|
|
38
|
+
* Get verdicts for statements against input asynchronously
|
|
39
|
+
*/
|
|
40
|
+
private _aGetVerdicts;
|
|
41
|
+
/**
|
|
42
|
+
* Get verdicts for statements against input synchronously
|
|
43
|
+
*/
|
|
44
|
+
private _getVerdicts;
|
|
45
|
+
/**
|
|
46
|
+
* Get reason for the score asynchronously
|
|
47
|
+
*/
|
|
48
|
+
private _aGetReason;
|
|
49
|
+
/**
|
|
50
|
+
* Get reason for the score synchronously
|
|
51
|
+
*/
|
|
52
|
+
private _getReason;
|
|
53
|
+
/**
|
|
54
|
+
* Compute score based on verdicts
|
|
55
|
+
*/
|
|
56
|
+
private _computeScore;
|
|
57
|
+
/**
|
|
58
|
+
* Check if example has required parameters
|
|
59
|
+
*/
|
|
60
|
+
private _checkExampleParams;
|
|
61
|
+
/**
|
|
62
|
+
* Create verbose logs for debugging
|
|
63
|
+
*/
|
|
64
|
+
private _createVerboseLogs;
|
|
65
|
+
/**
|
|
66
|
+
* Score an example synchronously - this is for compatibility with the Python SDK
|
|
67
|
+
*/
|
|
68
|
+
syncScoreExample(example: Example): ScorerData;
|
|
69
|
+
/**
|
|
70
|
+
* Score an example - this is the main method that should be called
|
|
71
|
+
* It will use async or sync methods based on the async_mode setting
|
|
72
|
+
*/
|
|
73
|
+
scoreExample(example: Example): Promise<ScorerData>;
|
|
74
|
+
/**
|
|
75
|
+
* Get the name of the scorer
|
|
76
|
+
*/
|
|
77
|
+
get name(): string;
|
|
78
|
+
}
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Utility prompts for AnswerRelevancyScorer
|
|
3
|
+
*/
|
|
4
|
+
import { z } from 'zod';
|
|
5
|
+
export declare const StatementsSchema: z.ZodObject<{
|
|
6
|
+
statements: z.ZodArray<z.ZodString, "many">;
|
|
7
|
+
}, "strip", z.ZodTypeAny, {
|
|
8
|
+
statements: string[];
|
|
9
|
+
}, {
|
|
10
|
+
statements: string[];
|
|
11
|
+
}>;
|
|
12
|
+
export type Statements = z.infer<typeof StatementsSchema>;
|
|
13
|
+
export declare const ARVerdictSchema: z.ZodObject<{
|
|
14
|
+
verdict: z.ZodString;
|
|
15
|
+
reason: z.ZodString;
|
|
16
|
+
}, "strip", z.ZodTypeAny, {
|
|
17
|
+
reason: string;
|
|
18
|
+
verdict: string;
|
|
19
|
+
}, {
|
|
20
|
+
reason: string;
|
|
21
|
+
verdict: string;
|
|
22
|
+
}>;
|
|
23
|
+
export type ARVerdict = z.infer<typeof ARVerdictSchema>;
|
|
24
|
+
export declare const VerdictsSchema: z.ZodObject<{
|
|
25
|
+
verdicts: z.ZodArray<z.ZodObject<{
|
|
26
|
+
verdict: z.ZodString;
|
|
27
|
+
reason: z.ZodString;
|
|
28
|
+
}, "strip", z.ZodTypeAny, {
|
|
29
|
+
reason: string;
|
|
30
|
+
verdict: string;
|
|
31
|
+
}, {
|
|
32
|
+
reason: string;
|
|
33
|
+
verdict: string;
|
|
34
|
+
}>, "many">;
|
|
35
|
+
}, "strip", z.ZodTypeAny, {
|
|
36
|
+
verdicts: {
|
|
37
|
+
reason: string;
|
|
38
|
+
verdict: string;
|
|
39
|
+
}[];
|
|
40
|
+
}, {
|
|
41
|
+
verdicts: {
|
|
42
|
+
reason: string;
|
|
43
|
+
verdict: string;
|
|
44
|
+
}[];
|
|
45
|
+
}>;
|
|
46
|
+
export type Verdicts = z.infer<typeof VerdictsSchema>;
|
|
47
|
+
export declare const ReasonSchema: z.ZodObject<{
|
|
48
|
+
reason: z.ZodString;
|
|
49
|
+
}, "strip", z.ZodTypeAny, {
|
|
50
|
+
reason: string;
|
|
51
|
+
}, {
|
|
52
|
+
reason: string;
|
|
53
|
+
}>;
|
|
54
|
+
export type Reason = z.infer<typeof ReasonSchema>;
|
|
55
|
+
/**
|
|
56
|
+
* Template prompts for the AnswerRelevancyScorer
|
|
57
|
+
*/
|
|
58
|
+
export declare class AnswerRelevancyTemplate {
|
|
59
|
+
/**
|
|
60
|
+
* Generate a prompt to extract statements from the actual output
|
|
61
|
+
*/
|
|
62
|
+
static deduceStatements(actualOutput: string): string;
|
|
63
|
+
/**
|
|
64
|
+
* Generate a prompt to evaluate statements against the input
|
|
65
|
+
*/
|
|
66
|
+
static generateVerdicts(input: string, statements: string[]): string;
|
|
67
|
+
/**
|
|
68
|
+
* Generate a prompt to explain the score based on irrelevant statements
|
|
69
|
+
*/
|
|
70
|
+
static generateReason(irrelevantStatements: [string, string][], input: string, score: string): string;
|
|
71
|
+
}
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
import { Example } from '../../../data/example.js';
|
|
2
|
+
import { ScorerData } from '../../../data/result.js';
|
|
3
|
+
import { JudgevalScorer } from '../../base-scorer.js';
|
|
4
|
+
import { Judge } from '../../../judges/index.js';
|
|
5
|
+
/**
|
|
6
|
+
* FaithfulnessScorer evaluates how well the actual output is supported by the retrieval context
|
|
7
|
+
* by extracting claims from the output and checking if each claim is supported by the context.
|
|
8
|
+
*/
|
|
9
|
+
export declare class FaithfulnessScorer extends JudgevalScorer {
|
|
10
|
+
private model;
|
|
11
|
+
private usingNativeModel;
|
|
12
|
+
private claims?;
|
|
13
|
+
private claimsWithQuotes?;
|
|
14
|
+
private verdicts?;
|
|
15
|
+
evaluation_cost?: number;
|
|
16
|
+
reason?: string;
|
|
17
|
+
/**
|
|
18
|
+
* Constructor for FaithfulnessScorer
|
|
19
|
+
* @param threshold Minimum score to consider the evaluation successful (default: 0.5)
|
|
20
|
+
* @param model LLM to use for evaluation (string or Judge instance)
|
|
21
|
+
* @param include_reason Whether to generate a reason for the score
|
|
22
|
+
* @param async_mode Whether to use asynchronous evaluation
|
|
23
|
+
* @param strict_mode If true, sets threshold to 1.0 (requiring perfect match)
|
|
24
|
+
* @param verbose_mode Enables detailed logging
|
|
25
|
+
* @param user Optional user identifier for the LLM
|
|
26
|
+
* @param additional_metadata Additional metadata to include in the result
|
|
27
|
+
*/
|
|
28
|
+
constructor(threshold?: number, model?: string | Judge, include_reason?: boolean, async_mode?: boolean, strict_mode?: boolean, verbose_mode?: boolean, user?: string, additional_metadata?: Record<string, any>);
|
|
29
|
+
/**
|
|
30
|
+
* Generate claims from actual output asynchronously
|
|
31
|
+
*/
|
|
32
|
+
private _aGenerateClaims;
|
|
33
|
+
/**
|
|
34
|
+
* Generate claims from actual output synchronously
|
|
35
|
+
*/
|
|
36
|
+
private _generateClaims;
|
|
37
|
+
/**
|
|
38
|
+
* Generate verdicts for claims against retrieval context asynchronously
|
|
39
|
+
*/
|
|
40
|
+
private _aGenerateVerdicts;
|
|
41
|
+
/**
|
|
42
|
+
* Generate verdicts for claims against retrieval context synchronously
|
|
43
|
+
*/
|
|
44
|
+
private _generateVerdicts;
|
|
45
|
+
/**
|
|
46
|
+
* Generate reason for the score asynchronously
|
|
47
|
+
*/
|
|
48
|
+
private _aGenerateReason;
|
|
49
|
+
/**
|
|
50
|
+
* Generate reason for the score synchronously
|
|
51
|
+
*/
|
|
52
|
+
private _generateReason;
|
|
53
|
+
/**
|
|
54
|
+
* Compute score based on verdicts
|
|
55
|
+
*/
|
|
56
|
+
private _computeScore;
|
|
57
|
+
/**
|
|
58
|
+
* Check if example has required parameters
|
|
59
|
+
*/
|
|
60
|
+
private _checkExampleParams;
|
|
61
|
+
/**
|
|
62
|
+
* Create verbose logs for debugging
|
|
63
|
+
*/
|
|
64
|
+
private _createVerboseLogs;
|
|
65
|
+
/**
|
|
66
|
+
* Score an example synchronously
|
|
67
|
+
*/
|
|
68
|
+
syncScoreExample(example: Example, allClaims?: boolean): ScorerData;
|
|
69
|
+
/**
|
|
70
|
+
* Score an example asynchronously
|
|
71
|
+
*/
|
|
72
|
+
scoreExample(example: Example, allClaims?: boolean): Promise<ScorerData>;
|
|
73
|
+
/**
|
|
74
|
+
* Get the name of the scorer
|
|
75
|
+
*/
|
|
76
|
+
get name(): string;
|
|
77
|
+
}
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Utility prompts for FaithfulnessScorer
|
|
3
|
+
*/
|
|
4
|
+
import { z } from 'zod';
|
|
5
|
+
export declare const FaithfulnessVerdictSchema: z.ZodObject<{
|
|
6
|
+
verdict: z.ZodString;
|
|
7
|
+
reason: z.ZodOptional<z.ZodString>;
|
|
8
|
+
}, "strip", z.ZodTypeAny, {
|
|
9
|
+
verdict: string;
|
|
10
|
+
reason?: string | undefined;
|
|
11
|
+
}, {
|
|
12
|
+
verdict: string;
|
|
13
|
+
reason?: string | undefined;
|
|
14
|
+
}>;
|
|
15
|
+
export type FaithfulnessVerdict = z.infer<typeof FaithfulnessVerdictSchema>;
|
|
16
|
+
export declare const VerdictsSchema: z.ZodObject<{
|
|
17
|
+
verdicts: z.ZodArray<z.ZodObject<{
|
|
18
|
+
verdict: z.ZodString;
|
|
19
|
+
reason: z.ZodOptional<z.ZodString>;
|
|
20
|
+
}, "strip", z.ZodTypeAny, {
|
|
21
|
+
verdict: string;
|
|
22
|
+
reason?: string | undefined;
|
|
23
|
+
}, {
|
|
24
|
+
verdict: string;
|
|
25
|
+
reason?: string | undefined;
|
|
26
|
+
}>, "many">;
|
|
27
|
+
}, "strip", z.ZodTypeAny, {
|
|
28
|
+
verdicts: {
|
|
29
|
+
verdict: string;
|
|
30
|
+
reason?: string | undefined;
|
|
31
|
+
}[];
|
|
32
|
+
}, {
|
|
33
|
+
verdicts: {
|
|
34
|
+
verdict: string;
|
|
35
|
+
reason?: string | undefined;
|
|
36
|
+
}[];
|
|
37
|
+
}>;
|
|
38
|
+
export type Verdicts = z.infer<typeof VerdictsSchema>;
|
|
39
|
+
export declare const TruthsSchema: z.ZodObject<{
|
|
40
|
+
truths: z.ZodArray<z.ZodString, "many">;
|
|
41
|
+
}, "strip", z.ZodTypeAny, {
|
|
42
|
+
truths: string[];
|
|
43
|
+
}, {
|
|
44
|
+
truths: string[];
|
|
45
|
+
}>;
|
|
46
|
+
export type Truths = z.infer<typeof TruthsSchema>;
|
|
47
|
+
export declare const ClaimsSchema: z.ZodObject<{
|
|
48
|
+
claims: z.ZodArray<z.ZodObject<{
|
|
49
|
+
claim: z.ZodString;
|
|
50
|
+
quote: z.ZodString;
|
|
51
|
+
}, "strip", z.ZodTypeAny, {
|
|
52
|
+
claim: string;
|
|
53
|
+
quote: string;
|
|
54
|
+
}, {
|
|
55
|
+
claim: string;
|
|
56
|
+
quote: string;
|
|
57
|
+
}>, "many">;
|
|
58
|
+
}, "strip", z.ZodTypeAny, {
|
|
59
|
+
claims: {
|
|
60
|
+
claim: string;
|
|
61
|
+
quote: string;
|
|
62
|
+
}[];
|
|
63
|
+
}, {
|
|
64
|
+
claims: {
|
|
65
|
+
claim: string;
|
|
66
|
+
quote: string;
|
|
67
|
+
}[];
|
|
68
|
+
}>;
|
|
69
|
+
export type Claims = z.infer<typeof ClaimsSchema>;
|
|
70
|
+
export declare const ReasonSchema: z.ZodObject<{
|
|
71
|
+
reason: z.ZodString;
|
|
72
|
+
}, "strip", z.ZodTypeAny, {
|
|
73
|
+
reason: string;
|
|
74
|
+
}, {
|
|
75
|
+
reason: string;
|
|
76
|
+
}>;
|
|
77
|
+
export type Reason = z.infer<typeof ReasonSchema>;
|
|
78
|
+
/**
|
|
79
|
+
* Template prompts for the FaithfulnessScorer
|
|
80
|
+
*/
|
|
81
|
+
export declare class FaithfulnessTemplate {
|
|
82
|
+
/**
|
|
83
|
+
* Generate a prompt to extract claims from the actual output
|
|
84
|
+
*/
|
|
85
|
+
static findClaims(text: string, allClaims?: boolean): string;
|
|
86
|
+
/**
|
|
87
|
+
* Generate a prompt to evaluate claims against the retrieval context
|
|
88
|
+
*/
|
|
89
|
+
static generateVerdicts(claims: string[], retrievalContext: string): string;
|
|
90
|
+
/**
|
|
91
|
+
* Generate a prompt to explain the score based on verdicts
|
|
92
|
+
*/
|
|
93
|
+
static generateReason(verdicts: FaithfulnessVerdict[], score: string): string;
|
|
94
|
+
}
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
import { Example } from '../../../data/example.js';
|
|
2
|
+
import { ScorerData } from '../../../data/result.js';
|
|
3
|
+
import { JudgevalScorer } from '../../base-scorer.js';
|
|
4
|
+
import { Judge } from '../../../judges/index.js';
|
|
5
|
+
/**
|
|
6
|
+
* HallucinationScorer evaluates whether an LLM's output contains hallucinations
|
|
7
|
+
* by comparing it against provided context.
|
|
8
|
+
*
|
|
9
|
+
* The score is the fraction of context segments that contradict the output.
|
|
10
|
+
* Lower scores are better (0 = no hallucinations, 1 = all contexts contradict the output).
|
|
11
|
+
*/
|
|
12
|
+
export declare class HallucinationScorer extends JudgevalScorer {
|
|
13
|
+
private model;
|
|
14
|
+
private using_native_model;
|
|
15
|
+
private _verdicts;
|
|
16
|
+
/**
|
|
17
|
+
* Create a new HallucinationScorer
|
|
18
|
+
*
|
|
19
|
+
* @param threshold - Success threshold (default: 0.5)
|
|
20
|
+
* @param model - Model to use for evaluation (default: DefaultJudge)
|
|
21
|
+
* @param include_reason - Whether to include a reason for the score (default: true)
|
|
22
|
+
* @param async_mode - Whether to use async mode (default: false)
|
|
23
|
+
* @param strict_mode - Whether to use strict mode (default: false)
|
|
24
|
+
* @param verbose_mode - Whether to include verbose logs (default: false)
|
|
25
|
+
*/
|
|
26
|
+
constructor(threshold?: number, model?: string | Judge | undefined, include_reason?: boolean, async_mode?: boolean, strict_mode?: boolean, verbose_mode?: boolean);
|
|
27
|
+
/**
|
|
28
|
+
* Generate verdicts for each context
|
|
29
|
+
*/
|
|
30
|
+
private _aGenerateVerdicts;
|
|
31
|
+
/**
|
|
32
|
+
* Generate verdicts for each context (synchronous)
|
|
33
|
+
*/
|
|
34
|
+
private _generateVerdicts;
|
|
35
|
+
/**
|
|
36
|
+
* Generate a reason for the score
|
|
37
|
+
*/
|
|
38
|
+
private _aGenerateReason;
|
|
39
|
+
/**
|
|
40
|
+
* Generate a reason for the score (synchronous)
|
|
41
|
+
*/
|
|
42
|
+
private _generateReason;
|
|
43
|
+
/**
|
|
44
|
+
* Calculate the hallucination score
|
|
45
|
+
*/
|
|
46
|
+
private _computeScore;
|
|
47
|
+
/**
|
|
48
|
+
* Create verbose logs for debugging
|
|
49
|
+
*/
|
|
50
|
+
private _createVerboseLogs;
|
|
51
|
+
/**
|
|
52
|
+
* Check if example has required parameters
|
|
53
|
+
*/
|
|
54
|
+
private _checkExampleParams;
|
|
55
|
+
/**
|
|
56
|
+
* Score an example synchronously
|
|
57
|
+
*/
|
|
58
|
+
syncScoreExample(example: Example): ScorerData;
|
|
59
|
+
/**
|
|
60
|
+
* Score an example asynchronously
|
|
61
|
+
*/
|
|
62
|
+
scoreExample(example: Example): Promise<ScorerData>;
|
|
63
|
+
/**
|
|
64
|
+
* Get the name of the scorer
|
|
65
|
+
*/
|
|
66
|
+
get name(): string;
|
|
67
|
+
}
|