judgeval 0.2.4 → 0.2.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (109):
  1. package/dist/cjs/common/logger.js +28 -24
  2. package/dist/cjs/common/logger.js.map +1 -1
  3. package/dist/cjs/common/tracer.js +80 -130
  4. package/dist/cjs/common/tracer.js.map +1 -1
  5. package/dist/cjs/constants.js +2 -1
  6. package/dist/cjs/constants.js.map +1 -1
  7. package/dist/cjs/data/datasets/eval-dataset-client.js +45 -0
  8. package/dist/cjs/data/datasets/eval-dataset-client.js.map +1 -1
  9. package/dist/cjs/e2etests/eval-operations.test.js +3 -3
  10. package/dist/cjs/exporters/otel-exporter.js +352 -0
  11. package/dist/cjs/exporters/otel-exporter.js.map +1 -0
  12. package/dist/cjs/judges/index.js +217 -0
  13. package/dist/cjs/judges/index.js.map +1 -0
  14. package/dist/cjs/run-evaluation.js +13 -13
  15. package/dist/cjs/run-evaluation.js.map +1 -1
  16. package/dist/cjs/scorers/metrics/answer-correctness/answer-correctness.js +610 -0
  17. package/dist/cjs/scorers/metrics/answer-correctness/answer-correctness.js.map +1 -0
  18. package/dist/cjs/scorers/metrics/answer-correctness/index.js +19 -0
  19. package/dist/cjs/scorers/metrics/answer-correctness/index.js.map +1 -0
  20. package/dist/cjs/scorers/metrics/answer-correctness/prompts.js +175 -0
  21. package/dist/cjs/scorers/metrics/answer-correctness/prompts.js.map +1 -0
  22. package/dist/cjs/scorers/metrics/answer-relevancy/answer-relevancy.js +525 -0
  23. package/dist/cjs/scorers/metrics/answer-relevancy/answer-relevancy.js.map +1 -0
  24. package/dist/cjs/scorers/metrics/answer-relevancy/index.js +19 -0
  25. package/dist/cjs/scorers/metrics/answer-relevancy/index.js.map +1 -0
  26. package/dist/cjs/scorers/metrics/answer-relevancy/prompts.js +179 -0
  27. package/dist/cjs/scorers/metrics/answer-relevancy/prompts.js.map +1 -0
  28. package/dist/cjs/scorers/metrics/faithfulness/faithfulness.js +524 -0
  29. package/dist/cjs/scorers/metrics/faithfulness/faithfulness.js.map +1 -0
  30. package/dist/cjs/scorers/metrics/faithfulness/index.js +19 -0
  31. package/dist/cjs/scorers/metrics/faithfulness/index.js.map +1 -0
  32. package/dist/cjs/scorers/metrics/faithfulness/prompts.js +232 -0
  33. package/dist/cjs/scorers/metrics/faithfulness/prompts.js.map +1 -0
  34. package/dist/cjs/scorers/metrics/hallucination/hallucination.js +390 -0
  35. package/dist/cjs/scorers/metrics/hallucination/hallucination.js.map +1 -0
  36. package/dist/cjs/scorers/metrics/hallucination/index.js +11 -0
  37. package/dist/cjs/scorers/metrics/hallucination/index.js.map +1 -0
  38. package/dist/cjs/scorers/metrics/hallucination/prompts.js +106 -0
  39. package/dist/cjs/scorers/metrics/hallucination/prompts.js.map +1 -0
  40. package/dist/cjs/scorers/metrics/instruction-adherence/index.js +19 -0
  41. package/dist/cjs/scorers/metrics/instruction-adherence/index.js.map +1 -0
  42. package/dist/cjs/scorers/metrics/instruction-adherence/instruction-adherence.js +382 -0
  43. package/dist/cjs/scorers/metrics/instruction-adherence/instruction-adherence.js.map +1 -0
  44. package/dist/cjs/scorers/metrics/instruction-adherence/prompts.js +124 -0
  45. package/dist/cjs/scorers/metrics/instruction-adherence/prompts.js.map +1 -0
  46. package/dist/esm/common/logger.js +16 -11
  47. package/dist/esm/common/logger.js.map +1 -1
  48. package/dist/esm/common/tracer.js +78 -128
  49. package/dist/esm/common/tracer.js.map +1 -1
  50. package/dist/esm/constants.js +1 -0
  51. package/dist/esm/constants.js.map +1 -1
  52. package/dist/esm/data/datasets/eval-dataset-client.js +46 -1
  53. package/dist/esm/data/datasets/eval-dataset-client.js.map +1 -1
  54. package/dist/esm/e2etests/eval-operations.test.js +3 -3
  55. package/dist/esm/exporters/otel-exporter.js +348 -0
  56. package/dist/esm/exporters/otel-exporter.js.map +1 -0
  57. package/dist/esm/judges/index.js +185 -0
  58. package/dist/esm/judges/index.js.map +1 -0
  59. package/dist/esm/scorers/metrics/answer-correctness/answer-correctness.js +601 -0
  60. package/dist/esm/scorers/metrics/answer-correctness/answer-correctness.js.map +1 -0
  61. package/dist/esm/scorers/metrics/answer-correctness/index.js +3 -0
  62. package/dist/esm/scorers/metrics/answer-correctness/index.js.map +1 -0
  63. package/dist/esm/scorers/metrics/answer-correctness/prompts.js +171 -0
  64. package/dist/esm/scorers/metrics/answer-correctness/prompts.js.map +1 -0
  65. package/dist/esm/scorers/metrics/answer-relevancy/answer-relevancy.js +521 -0
  66. package/dist/esm/scorers/metrics/answer-relevancy/answer-relevancy.js.map +1 -0
  67. package/dist/esm/scorers/metrics/answer-relevancy/index.js +3 -0
  68. package/dist/esm/scorers/metrics/answer-relevancy/index.js.map +1 -0
  69. package/dist/esm/scorers/metrics/answer-relevancy/prompts.js +175 -0
  70. package/dist/esm/scorers/metrics/answer-relevancy/prompts.js.map +1 -0
  71. package/dist/esm/scorers/metrics/faithfulness/faithfulness.js +520 -0
  72. package/dist/esm/scorers/metrics/faithfulness/faithfulness.js.map +1 -0
  73. package/dist/esm/scorers/metrics/faithfulness/index.js +3 -0
  74. package/dist/esm/scorers/metrics/faithfulness/index.js.map +1 -0
  75. package/dist/esm/scorers/metrics/faithfulness/prompts.js +228 -0
  76. package/dist/esm/scorers/metrics/faithfulness/prompts.js.map +1 -0
  77. package/dist/esm/scorers/metrics/hallucination/hallucination.js +386 -0
  78. package/dist/esm/scorers/metrics/hallucination/hallucination.js.map +1 -0
  79. package/dist/esm/scorers/metrics/hallucination/index.js +3 -0
  80. package/dist/esm/scorers/metrics/hallucination/index.js.map +1 -0
  81. package/dist/esm/scorers/metrics/hallucination/prompts.js +102 -0
  82. package/dist/esm/scorers/metrics/hallucination/prompts.js.map +1 -0
  83. package/dist/esm/scorers/metrics/instruction-adherence/index.js +3 -0
  84. package/dist/esm/scorers/metrics/instruction-adherence/index.js.map +1 -0
  85. package/dist/esm/scorers/metrics/instruction-adherence/instruction-adherence.js +378 -0
  86. package/dist/esm/scorers/metrics/instruction-adherence/instruction-adherence.js.map +1 -0
  87. package/dist/esm/scorers/metrics/instruction-adherence/prompts.js +120 -0
  88. package/dist/esm/scorers/metrics/instruction-adherence/prompts.js.map +1 -0
  89. package/dist/types/common/logger.d.ts +1 -1
  90. package/dist/types/constants.d.ts +1 -0
  91. package/dist/types/data/datasets/eval-dataset-client.d.ts +5 -0
  92. package/dist/types/exporters/otel-exporter.d.ts +16 -0
  93. package/dist/types/judges/index.d.ts +50 -0
  94. package/dist/types/scorers/metrics/answer-correctness/answer-correctness.d.ts +99 -0
  95. package/dist/types/scorers/metrics/answer-correctness/index.d.ts +2 -0
  96. package/dist/types/scorers/metrics/answer-correctness/prompts.d.ts +71 -0
  97. package/dist/types/scorers/metrics/answer-relevancy/answer-relevancy.d.ts +78 -0
  98. package/dist/types/scorers/metrics/answer-relevancy/index.d.ts +2 -0
  99. package/dist/types/scorers/metrics/answer-relevancy/prompts.d.ts +71 -0
  100. package/dist/types/scorers/metrics/faithfulness/faithfulness.d.ts +77 -0
  101. package/dist/types/scorers/metrics/faithfulness/index.d.ts +2 -0
  102. package/dist/types/scorers/metrics/faithfulness/prompts.d.ts +94 -0
  103. package/dist/types/scorers/metrics/hallucination/hallucination.d.ts +67 -0
  104. package/dist/types/scorers/metrics/hallucination/index.d.ts +3 -0
  105. package/dist/types/scorers/metrics/hallucination/prompts.d.ts +63 -0
  106. package/dist/types/scorers/metrics/instruction-adherence/index.d.ts +2 -0
  107. package/dist/types/scorers/metrics/instruction-adherence/instruction-adherence.d.ts +67 -0
  108. package/dist/types/scorers/metrics/instruction-adherence/prompts.d.ts +78 -0
  109. package/package.json +32 -14
@@ -0,0 +1,16 @@
1
+ import { ExportResult } from '@opentelemetry/core';
2
+ import { ReadableSpan, SpanExporter } from "@opentelemetry/sdk-trace-base";
3
+ interface OtelExporterConfig {
4
+ apiKey: string;
5
+ organizationId: string;
6
+ serviceName?: string;
7
+ }
8
+ export declare class JudgevalExporter implements SpanExporter {
9
+ private serviceName;
10
+ private apiKey;
11
+ private organizationId;
12
+ constructor(config: OtelExporterConfig);
13
+ export(spans: ReadableSpan[], resultCallback: (result: ExportResult) => void): Promise<void>;
14
+ shutdown(): Promise<void>;
15
+ }
16
+ export {};
@@ -0,0 +1,50 @@
1
+ /**
2
+ * Interface for judge models that can generate text
3
+ */
4
+ export interface Judge {
5
+ /**
6
+ * Generate text synchronously
7
+ */
8
+ generate(prompt: string): string;
9
+ /**
10
+ * Generate text asynchronously
11
+ */
12
+ aGenerate(prompt: string): Promise<string>;
13
+ /**
14
+ * Get the name of the model
15
+ */
16
+ getModelName(): string;
17
+ }
18
+ /**
19
+ * Default judge implementation using OpenAI API
20
+ */
21
+ export declare class DefaultJudge implements Judge {
22
+ private modelName;
23
+ private apiKey?;
24
+ private user?;
25
+ constructor(modelName?: string, apiKey?: string, user?: string);
26
+ generate(prompt: string): string;
27
+ aGenerate(prompt: string): Promise<string>;
28
+ getModelName(): string;
29
+ }
30
+ /**
31
+ * Together AI judge implementation
32
+ */
33
+ export declare class TogetherJudge implements Judge {
34
+ private modelName;
35
+ private apiKey?;
36
+ constructor(modelName?: string, apiKey?: string);
37
+ generate(prompt: string): string;
38
+ aGenerate(prompt: string): Promise<string>;
39
+ getModelName(): string;
40
+ }
41
+ /**
42
+ * Create a judge instance
43
+ * @param model Model name or Judge instance
44
+ * @param user Optional user identifier
45
+ * @returns Judge instance and whether it's a native model
46
+ */
47
+ export declare function createJudge(model?: string | Judge, user?: string): {
48
+ judge: Judge;
49
+ usingNativeModel: boolean;
50
+ };
@@ -0,0 +1,99 @@
1
+ import { Example } from '../../../data/example.js';
2
+ import { ScorerData } from '../../../data/result.js';
3
+ import { JudgevalScorer } from '../../base-scorer.js';
4
+ export interface Judge {
5
+ generate(prompt: string): string;
6
+ aGenerate(prompt: string): Promise<string>;
7
+ getModelName(): string;
8
+ }
9
+ export declare class DefaultJudge implements Judge {
10
+ private modelName;
11
+ private apiKey?;
12
+ private user?;
13
+ constructor(modelName?: string, apiKey?: string, user?: string);
14
+ generate(prompt: string): string;
15
+ aGenerate(prompt: string): Promise<string>;
16
+ getModelName(): string;
17
+ }
18
+ export declare function createJudge(model?: string | Judge, user?: string): {
19
+ judge: Judge;
20
+ usingNativeModel: boolean;
21
+ };
22
+ /**
23
+ * AnswerCorrectnessScorer evaluates how well an actual output matches an expected output
24
+ * by breaking down the expected output into statements and checking if each statement
25
+ * is correctly represented in the actual output.
26
+ */
27
+ export declare class AnswerCorrectnessScorer extends JudgevalScorer {
28
+ private model;
29
+ private usingNativeModel;
30
+ private statements?;
31
+ private verdicts?;
32
+ evaluation_cost?: number;
33
+ reason?: string;
34
+ /**
35
+ * Constructor for AnswerCorrectnessScorer
36
+ * @param threshold Minimum score to consider the evaluation successful (default: 0.5)
37
+ * @param model LLM to use for evaluation (string or Judge instance)
38
+ * @param include_reason Whether to generate a reason for the score
39
+ * @param async_mode Whether to use asynchronous evaluation
40
+ * @param strict_mode If true, sets threshold to 1.0 (requiring perfect match)
41
+ * @param verbose_mode Enables detailed logging
42
+ * @param user Optional user identifier for the LLM
43
+ * @param additional_metadata Additional metadata to include in the result
44
+ */
45
+ constructor(threshold?: number, model?: string | Judge, include_reason?: boolean, async_mode?: boolean, strict_mode?: boolean, verbose_mode?: boolean, user?: string, additional_metadata?: Record<string, any>);
46
+ /**
47
+ * Get statements from expected output asynchronously
48
+ */
49
+ private _aGetStatements;
50
+ /**
51
+ * Get statements from expected output synchronously
52
+ */
53
+ private _getStatements;
54
+ /**
55
+ * Get verdicts for statements against actual output asynchronously
56
+ */
57
+ private _aGetVerdicts;
58
+ /**
59
+ * Get verdicts for statements against actual output synchronously
60
+ */
61
+ private _getVerdicts;
62
+ /**
63
+ * Get reason for the score asynchronously
64
+ */
65
+ private _aGetReason;
66
+ /**
67
+ * Get reason for the score synchronously
68
+ */
69
+ private _getReason;
70
+ /**
71
+ * Compute score based on verdicts
72
+ */
73
+ private _computeScore;
74
+ /**
75
+ * Calculate token costs for model usage
76
+ */
77
+ private _calculateTokenCosts;
78
+ /**
79
+ * Check if example has required parameters
80
+ */
81
+ private _checkExampleParams;
82
+ /**
83
+ * Create verbose logs for debugging
84
+ */
85
+ private _createVerboseLogs;
86
+ /**
87
+ * Score an example synchronously - this is for compatibility with the Python SDK
88
+ */
89
+ syncScoreExample(example: Example): ScorerData;
90
+ /**
91
+ * Score an example - this is the main method that should be called
92
+ * It will use async or sync methods based on the async_mode setting
93
+ */
94
+ scoreExample(example: Example): Promise<ScorerData>;
95
+ /**
96
+ * Get the name of the scorer
97
+ */
98
+ get name(): string;
99
+ }
@@ -0,0 +1,2 @@
1
+ export * from './answer-correctness.js';
2
+ export * from './prompts.js';
@@ -0,0 +1,71 @@
1
+ /**
2
+ * Utility prompts for AnswerCorrectnessScorer
3
+ */
4
+ import { z } from 'zod';
5
+ export declare const StatementsSchema: z.ZodObject<{
6
+ statements: z.ZodArray<z.ZodString, "many">;
7
+ }, "strip", z.ZodTypeAny, {
8
+ statements: string[];
9
+ }, {
10
+ statements: string[];
11
+ }>;
12
+ export type Statements = z.infer<typeof StatementsSchema>;
13
+ export declare const ACVerdictSchema: z.ZodObject<{
14
+ verdict: z.ZodString;
15
+ reason: z.ZodString;
16
+ }, "strip", z.ZodTypeAny, {
17
+ reason: string;
18
+ verdict: string;
19
+ }, {
20
+ reason: string;
21
+ verdict: string;
22
+ }>;
23
+ export type ACVerdict = z.infer<typeof ACVerdictSchema>;
24
+ export declare const VerdictsSchema: z.ZodObject<{
25
+ verdicts: z.ZodArray<z.ZodObject<{
26
+ verdict: z.ZodString;
27
+ reason: z.ZodString;
28
+ }, "strip", z.ZodTypeAny, {
29
+ reason: string;
30
+ verdict: string;
31
+ }, {
32
+ reason: string;
33
+ verdict: string;
34
+ }>, "many">;
35
+ }, "strip", z.ZodTypeAny, {
36
+ verdicts: {
37
+ reason: string;
38
+ verdict: string;
39
+ }[];
40
+ }, {
41
+ verdicts: {
42
+ reason: string;
43
+ verdict: string;
44
+ }[];
45
+ }>;
46
+ export type Verdicts = z.infer<typeof VerdictsSchema>;
47
+ export declare const ReasonSchema: z.ZodObject<{
48
+ reason: z.ZodString;
49
+ }, "strip", z.ZodTypeAny, {
50
+ reason: string;
51
+ }, {
52
+ reason: string;
53
+ }>;
54
+ export type Reason = z.infer<typeof ReasonSchema>;
55
+ /**
56
+ * Template prompts for the AnswerCorrectnessScorer
57
+ */
58
+ export declare class AnswerCorrectnessTemplate {
59
+ /**
60
+ * Generate a prompt to extract statements from the expected output
61
+ */
62
+ static deduceStatements(expectedOutput: string): string;
63
+ /**
64
+ * Generate a prompt to evaluate statements against the actual output
65
+ */
66
+ static generateVerdicts(statements: string[], actualOutput: string): string;
67
+ /**
68
+ * Generate a prompt to explain the score based on incorrect statements
69
+ */
70
+ static generateReason(incorrectStatements: [string, string][], score: string): string;
71
+ }
@@ -0,0 +1,78 @@
1
+ import { Example } from '../../../data/example.js';
2
+ import { ScorerData } from '../../../data/result.js';
3
+ import { JudgevalScorer } from '../../base-scorer.js';
4
+ import { Judge } from '../answer-correctness/answer-correctness.js';
5
+ /**
6
+ * AnswerRelevancyScorer evaluates how relevant the actual output is to the input
7
+ * by breaking down the actual output into statements and checking if each statement
8
+ * is relevant to the input.
9
+ */
10
+ export declare class AnswerRelevancyScorer extends JudgevalScorer {
11
+ private model;
12
+ private usingNativeModel;
13
+ private statements?;
14
+ private verdicts?;
15
+ evaluation_cost?: number;
16
+ reason?: string;
17
+ /**
18
+ * Constructor for AnswerRelevancyScorer
19
+ * @param threshold Minimum score to consider the evaluation successful (default: 0.5)
20
+ * @param model LLM to use for evaluation (string or Judge instance)
21
+ * @param include_reason Whether to generate a reason for the score
22
+ * @param async_mode Whether to use asynchronous evaluation
23
+ * @param strict_mode If true, sets threshold to 1.0 (requiring perfect match)
24
+ * @param verbose_mode Enables detailed logging
25
+ * @param user Optional user identifier for the LLM
26
+ * @param additional_metadata Additional metadata to include in the result
27
+ */
28
+ constructor(threshold?: number, model?: string | Judge, include_reason?: boolean, async_mode?: boolean, strict_mode?: boolean, verbose_mode?: boolean, user?: string, additional_metadata?: Record<string, any>);
29
+ /**
30
+ * Get statements from actual output asynchronously
31
+ */
32
+ private _aGetStatements;
33
+ /**
34
+ * Get statements from actual output synchronously
35
+ */
36
+ private _getStatements;
37
+ /**
38
+ * Get verdicts for statements against input asynchronously
39
+ */
40
+ private _aGetVerdicts;
41
+ /**
42
+ * Get verdicts for statements against input synchronously
43
+ */
44
+ private _getVerdicts;
45
+ /**
46
+ * Get reason for the score asynchronously
47
+ */
48
+ private _aGetReason;
49
+ /**
50
+ * Get reason for the score synchronously
51
+ */
52
+ private _getReason;
53
+ /**
54
+ * Compute score based on verdicts
55
+ */
56
+ private _computeScore;
57
+ /**
58
+ * Check if example has required parameters
59
+ */
60
+ private _checkExampleParams;
61
+ /**
62
+ * Create verbose logs for debugging
63
+ */
64
+ private _createVerboseLogs;
65
+ /**
66
+ * Score an example synchronously - this is for compatibility with the Python SDK
67
+ */
68
+ syncScoreExample(example: Example): ScorerData;
69
+ /**
70
+ * Score an example - this is the main method that should be called
71
+ * It will use async or sync methods based on the async_mode setting
72
+ */
73
+ scoreExample(example: Example): Promise<ScorerData>;
74
+ /**
75
+ * Get the name of the scorer
76
+ */
77
+ get name(): string;
78
+ }
@@ -0,0 +1,2 @@
1
+ export * from './answer-relevancy.js';
2
+ export * from './prompts.js';
@@ -0,0 +1,71 @@
1
+ /**
2
+ * Utility prompts for AnswerRelevancyScorer
3
+ */
4
+ import { z } from 'zod';
5
+ export declare const StatementsSchema: z.ZodObject<{
6
+ statements: z.ZodArray<z.ZodString, "many">;
7
+ }, "strip", z.ZodTypeAny, {
8
+ statements: string[];
9
+ }, {
10
+ statements: string[];
11
+ }>;
12
+ export type Statements = z.infer<typeof StatementsSchema>;
13
+ export declare const ARVerdictSchema: z.ZodObject<{
14
+ verdict: z.ZodString;
15
+ reason: z.ZodString;
16
+ }, "strip", z.ZodTypeAny, {
17
+ reason: string;
18
+ verdict: string;
19
+ }, {
20
+ reason: string;
21
+ verdict: string;
22
+ }>;
23
+ export type ARVerdict = z.infer<typeof ARVerdictSchema>;
24
+ export declare const VerdictsSchema: z.ZodObject<{
25
+ verdicts: z.ZodArray<z.ZodObject<{
26
+ verdict: z.ZodString;
27
+ reason: z.ZodString;
28
+ }, "strip", z.ZodTypeAny, {
29
+ reason: string;
30
+ verdict: string;
31
+ }, {
32
+ reason: string;
33
+ verdict: string;
34
+ }>, "many">;
35
+ }, "strip", z.ZodTypeAny, {
36
+ verdicts: {
37
+ reason: string;
38
+ verdict: string;
39
+ }[];
40
+ }, {
41
+ verdicts: {
42
+ reason: string;
43
+ verdict: string;
44
+ }[];
45
+ }>;
46
+ export type Verdicts = z.infer<typeof VerdictsSchema>;
47
+ export declare const ReasonSchema: z.ZodObject<{
48
+ reason: z.ZodString;
49
+ }, "strip", z.ZodTypeAny, {
50
+ reason: string;
51
+ }, {
52
+ reason: string;
53
+ }>;
54
+ export type Reason = z.infer<typeof ReasonSchema>;
55
+ /**
56
+ * Template prompts for the AnswerRelevancyScorer
57
+ */
58
+ export declare class AnswerRelevancyTemplate {
59
+ /**
60
+ * Generate a prompt to extract statements from the actual output
61
+ */
62
+ static deduceStatements(actualOutput: string): string;
63
+ /**
64
+ * Generate a prompt to evaluate statements against the input
65
+ */
66
+ static generateVerdicts(input: string, statements: string[]): string;
67
+ /**
68
+ * Generate a prompt to explain the score based on irrelevant statements
69
+ */
70
+ static generateReason(irrelevantStatements: [string, string][], input: string, score: string): string;
71
+ }
@@ -0,0 +1,77 @@
1
+ import { Example } from '../../../data/example.js';
2
+ import { ScorerData } from '../../../data/result.js';
3
+ import { JudgevalScorer } from '../../base-scorer.js';
4
+ import { Judge } from '../../../judges/index.js';
5
+ /**
6
+ * FaithfulnessScorer evaluates how well the actual output is supported by the retrieval context
7
+ * by extracting claims from the output and checking if each claim is supported by the context.
8
+ */
9
+ export declare class FaithfulnessScorer extends JudgevalScorer {
10
+ private model;
11
+ private usingNativeModel;
12
+ private claims?;
13
+ private claimsWithQuotes?;
14
+ private verdicts?;
15
+ evaluation_cost?: number;
16
+ reason?: string;
17
+ /**
18
+ * Constructor for FaithfulnessScorer
19
+ * @param threshold Minimum score to consider the evaluation successful (default: 0.5)
20
+ * @param model LLM to use for evaluation (string or Judge instance)
21
+ * @param include_reason Whether to generate a reason for the score
22
+ * @param async_mode Whether to use asynchronous evaluation
23
+ * @param strict_mode If true, sets threshold to 1.0 (requiring perfect match)
24
+ * @param verbose_mode Enables detailed logging
25
+ * @param user Optional user identifier for the LLM
26
+ * @param additional_metadata Additional metadata to include in the result
27
+ */
28
+ constructor(threshold?: number, model?: string | Judge, include_reason?: boolean, async_mode?: boolean, strict_mode?: boolean, verbose_mode?: boolean, user?: string, additional_metadata?: Record<string, any>);
29
+ /**
30
+ * Generate claims from actual output asynchronously
31
+ */
32
+ private _aGenerateClaims;
33
+ /**
34
+ * Generate claims from actual output synchronously
35
+ */
36
+ private _generateClaims;
37
+ /**
38
+ * Generate verdicts for claims against retrieval context asynchronously
39
+ */
40
+ private _aGenerateVerdicts;
41
+ /**
42
+ * Generate verdicts for claims against retrieval context synchronously
43
+ */
44
+ private _generateVerdicts;
45
+ /**
46
+ * Generate reason for the score asynchronously
47
+ */
48
+ private _aGenerateReason;
49
+ /**
50
+ * Generate reason for the score synchronously
51
+ */
52
+ private _generateReason;
53
+ /**
54
+ * Compute score based on verdicts
55
+ */
56
+ private _computeScore;
57
+ /**
58
+ * Check if example has required parameters
59
+ */
60
+ private _checkExampleParams;
61
+ /**
62
+ * Create verbose logs for debugging
63
+ */
64
+ private _createVerboseLogs;
65
+ /**
66
+ * Score an example synchronously
67
+ */
68
+ syncScoreExample(example: Example, allClaims?: boolean): ScorerData;
69
+ /**
70
+ * Score an example asynchronously
71
+ */
72
+ scoreExample(example: Example, allClaims?: boolean): Promise<ScorerData>;
73
+ /**
74
+ * Get the name of the scorer
75
+ */
76
+ get name(): string;
77
+ }
@@ -0,0 +1,2 @@
1
+ export * from './faithfulness.js';
2
+ export * from './prompts.js';
@@ -0,0 +1,94 @@
1
+ /**
2
+ * Utility prompts for FaithfulnessScorer
3
+ */
4
+ import { z } from 'zod';
5
+ export declare const FaithfulnessVerdictSchema: z.ZodObject<{
6
+ verdict: z.ZodString;
7
+ reason: z.ZodOptional<z.ZodString>;
8
+ }, "strip", z.ZodTypeAny, {
9
+ verdict: string;
10
+ reason?: string | undefined;
11
+ }, {
12
+ verdict: string;
13
+ reason?: string | undefined;
14
+ }>;
15
+ export type FaithfulnessVerdict = z.infer<typeof FaithfulnessVerdictSchema>;
16
+ export declare const VerdictsSchema: z.ZodObject<{
17
+ verdicts: z.ZodArray<z.ZodObject<{
18
+ verdict: z.ZodString;
19
+ reason: z.ZodOptional<z.ZodString>;
20
+ }, "strip", z.ZodTypeAny, {
21
+ verdict: string;
22
+ reason?: string | undefined;
23
+ }, {
24
+ verdict: string;
25
+ reason?: string | undefined;
26
+ }>, "many">;
27
+ }, "strip", z.ZodTypeAny, {
28
+ verdicts: {
29
+ verdict: string;
30
+ reason?: string | undefined;
31
+ }[];
32
+ }, {
33
+ verdicts: {
34
+ verdict: string;
35
+ reason?: string | undefined;
36
+ }[];
37
+ }>;
38
+ export type Verdicts = z.infer<typeof VerdictsSchema>;
39
+ export declare const TruthsSchema: z.ZodObject<{
40
+ truths: z.ZodArray<z.ZodString, "many">;
41
+ }, "strip", z.ZodTypeAny, {
42
+ truths: string[];
43
+ }, {
44
+ truths: string[];
45
+ }>;
46
+ export type Truths = z.infer<typeof TruthsSchema>;
47
+ export declare const ClaimsSchema: z.ZodObject<{
48
+ claims: z.ZodArray<z.ZodObject<{
49
+ claim: z.ZodString;
50
+ quote: z.ZodString;
51
+ }, "strip", z.ZodTypeAny, {
52
+ claim: string;
53
+ quote: string;
54
+ }, {
55
+ claim: string;
56
+ quote: string;
57
+ }>, "many">;
58
+ }, "strip", z.ZodTypeAny, {
59
+ claims: {
60
+ claim: string;
61
+ quote: string;
62
+ }[];
63
+ }, {
64
+ claims: {
65
+ claim: string;
66
+ quote: string;
67
+ }[];
68
+ }>;
69
+ export type Claims = z.infer<typeof ClaimsSchema>;
70
+ export declare const ReasonSchema: z.ZodObject<{
71
+ reason: z.ZodString;
72
+ }, "strip", z.ZodTypeAny, {
73
+ reason: string;
74
+ }, {
75
+ reason: string;
76
+ }>;
77
+ export type Reason = z.infer<typeof ReasonSchema>;
78
+ /**
79
+ * Template prompts for the FaithfulnessScorer
80
+ */
81
+ export declare class FaithfulnessTemplate {
82
+ /**
83
+ * Generate a prompt to extract claims from the actual output
84
+ */
85
+ static findClaims(text: string, allClaims?: boolean): string;
86
+ /**
87
+ * Generate a prompt to evaluate claims against the retrieval context
88
+ */
89
+ static generateVerdicts(claims: string[], retrievalContext: string): string;
90
+ /**
91
+ * Generate a prompt to explain the score based on verdicts
92
+ */
93
+ static generateReason(verdicts: FaithfulnessVerdict[], score: string): string;
94
+ }
@@ -0,0 +1,67 @@
1
+ import { Example } from '../../../data/example.js';
2
+ import { ScorerData } from '../../../data/result.js';
3
+ import { JudgevalScorer } from '../../base-scorer.js';
4
+ import { Judge } from '../../../judges/index.js';
5
+ /**
6
+ * HallucinationScorer evaluates whether an LLM's output contains hallucinations
7
+ * by comparing it against provided context.
8
+ *
9
+ * The score is the fraction of context segments that contradict the output.
10
+ * Lower scores are better (0 = no hallucinations, 1 = all contexts contradict the output).
11
+ */
12
+ export declare class HallucinationScorer extends JudgevalScorer {
13
+ private model;
14
+ private using_native_model;
15
+ private _verdicts;
16
+ /**
17
+ * Create a new HallucinationScorer
18
+ *
19
+ * @param threshold - Success threshold (default: 0.5)
20
+ * @param model - Model to use for evaluation (default: DefaultJudge)
21
+ * @param include_reason - Whether to include a reason for the score (default: true)
22
+ * @param async_mode - Whether to use async mode (default: false)
23
+ * @param strict_mode - Whether to use strict mode (default: false)
24
+ * @param verbose_mode - Whether to include verbose logs (default: false)
25
+ */
26
+ constructor(threshold?: number, model?: string | Judge | undefined, include_reason?: boolean, async_mode?: boolean, strict_mode?: boolean, verbose_mode?: boolean);
27
+ /**
28
+ * Generate verdicts for each context
29
+ */
30
+ private _aGenerateVerdicts;
31
+ /**
32
+ * Generate verdicts for each context (synchronous)
33
+ */
34
+ private _generateVerdicts;
35
+ /**
36
+ * Generate a reason for the score
37
+ */
38
+ private _aGenerateReason;
39
+ /**
40
+ * Generate a reason for the score (synchronous)
41
+ */
42
+ private _generateReason;
43
+ /**
44
+ * Calculate the hallucination score
45
+ */
46
+ private _computeScore;
47
+ /**
48
+ * Create verbose logs for debugging
49
+ */
50
+ private _createVerboseLogs;
51
+ /**
52
+ * Check if example has required parameters
53
+ */
54
+ private _checkExampleParams;
55
+ /**
56
+ * Score an example synchronously
57
+ */
58
+ syncScoreExample(example: Example): ScorerData;
59
+ /**
60
+ * Score an example asynchronously
61
+ */
62
+ scoreExample(example: Example): Promise<ScorerData>;
63
+ /**
64
+ * Get the name of the scorer
65
+ */
66
+ get name(): string;
67
+ }
@@ -0,0 +1,3 @@
1
+ export { HallucinationScorer } from './hallucination.js';
2
+ export { HallucinationTemplate, HallucinationVerdictSchema, VerdictsSchema, ReasonSchema } from './prompts.js';
3
+ export type { HallucinationVerdict } from './prompts.js';