judgeval 0.0.16__py3-none-any.whl → 0.0.18__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/__init__.py +1 -3
- judgeval/clients.py +0 -6
- judgeval/common/logger.py +0 -1
- judgeval/common/tracer.py +270 -62
- judgeval/common/utils.py +9 -5
- judgeval/constants.py +7 -2
- judgeval/data/__init__.py +2 -0
- judgeval/data/api_example.py +2 -2
- judgeval/data/datasets/__init__.py +1 -2
- judgeval/data/datasets/dataset.py +4 -5
- judgeval/data/datasets/eval_dataset_client.py +11 -7
- judgeval/data/datasets/utils.py +1 -2
- judgeval/data/example.py +72 -17
- judgeval/data/scorer_data.py +1 -1
- judgeval/evaluation_run.py +2 -2
- judgeval/judges/__init__.py +0 -1
- judgeval/judges/base_judge.py +1 -1
- judgeval/judges/mixture_of_judges.py +7 -2
- judgeval/judgment_client.py +16 -8
- judgeval/rules.py +2 -4
- judgeval/run_evaluation.py +8 -8
- judgeval/scorers/__init__.py +6 -0
- judgeval/scorers/api_scorer.py +12 -6
- judgeval/scorers/base_scorer.py +12 -6
- judgeval/scorers/judgeval_scorer.py +7 -3
- judgeval/scorers/judgeval_scorers/__init__.py +24 -3
- judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +6 -0
- judgeval/scorers/judgeval_scorers/api_scorers/comparison.py +35 -0
- judgeval/scorers/judgeval_scorers/api_scorers/groundedness.py +19 -0
- judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +19 -0
- judgeval/scorers/judgeval_scorers/local_implementations/__init__.py +4 -1
- judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/answer_correctness_scorer.py +0 -1
- judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/prompts.py +2 -2
- judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/answer_relevancy_scorer.py +7 -6
- judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/prompts.py +2 -2
- judgeval/scorers/judgeval_scorers/local_implementations/comparison/__init__.py +0 -0
- judgeval/scorers/judgeval_scorers/local_implementations/comparison/comparison_scorer.py +161 -0
- judgeval/scorers/judgeval_scorers/local_implementations/comparison/prompts.py +222 -0
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/prompts.py +2 -2
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/prompts.py +2 -2
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/prompts.py +2 -2
- judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/faithfulness_scorer.py +1 -8
- judgeval/scorers/judgeval_scorers/local_implementations/hallucination/hallucination_scorer.py +7 -6
- judgeval/scorers/judgeval_scorers/local_implementations/hallucination/prompts.py +2 -2
- judgeval/scorers/judgeval_scorers/local_implementations/instruction_adherence/instruction_adherence.py +232 -0
- judgeval/scorers/judgeval_scorers/local_implementations/instruction_adherence/prompt.py +102 -0
- judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/json_correctness_scorer.py +7 -7
- judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarization_scorer.py +7 -6
- judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/tool_correctness_scorer.py +1 -2
- judgeval/scorers/prompt_scorer.py +7 -5
- judgeval/scorers/utils.py +1 -1
- {judgeval-0.0.16.dist-info → judgeval-0.0.18.dist-info}/METADATA +1 -1
- {judgeval-0.0.16.dist-info → judgeval-0.0.18.dist-info}/RECORD +56 -48
- /judgeval/data/{datasets/ground_truth.py → ground_truth.py} +0 -0
- {judgeval-0.0.16.dist-info → judgeval-0.0.18.dist-info}/WHEEL +0 -0
- {judgeval-0.0.16.dist-info → judgeval-0.0.18.dist-info}/licenses/LICENSE.md +0 -0
judgeval/data/example.py
CHANGED
@@ -2,17 +2,12 @@
 Classes for representing examples in a dataset.
 """
 
-
-from typing import TypeVar, Optional, Any, Dict, List
+from typing import Optional, Any, Dict, List
 from uuid import uuid4
-from pydantic import BaseModel, Field
+from pydantic import BaseModel, Field
 from enum import Enum
 from datetime import datetime
-import time
-
 
-Input = TypeVar('Input')
-Output = TypeVar('Output')
 
 class ExampleParams(Enum):
     INPUT = "input"
@@ -23,11 +18,12 @@ class ExampleParams(Enum):
     TOOLS_CALLED = "tools_called"
     EXPECTED_TOOLS = "expected_tools"
     REASONING = "reasoning"
+    ADDITIONAL_METADATA = "additional_metadata"
 
 
 class Example(BaseModel):
-    input:
-    actual_output:
+    input: str
+    actual_output: str
     expected_output: Optional[str] = None
     context: Optional[List[str]] = None
     retrieval_context: Optional[List[str]] = None
@@ -39,22 +35,81 @@ class Example(BaseModel):
     example_index: Optional[int] = None
     timestamp: Optional[str] = None
     trace_id: Optional[str] = None
-
-    @field_validator('input', 'actual_output', mode='before')
-    def convert_to_str(cls, value):
-        try:
-            return str(value)
-        except Exception:
-            return repr(value)
 
     def __init__(self, **data):
+        # Check that required fields are provided
+        if 'input' not in data:
+            raise ValueError("Example must be initialized with 'input' field.")
+        if 'actual_output' not in data:
+            raise ValueError("Example must be initialized with 'actual_output' field.")
+
         if 'example_id' not in data:
             data['example_id'] = str(uuid4())
         # Set timestamp if not provided
         if 'timestamp' not in data:
             data['timestamp'] = datetime.now().strftime("%Y%m%d_%H%M%S")
         super().__init__(**data)
-
+
+    @field_validator('input', mode='before')
+    @classmethod
+    def validate_input(cls, v):
+        if not v or not isinstance(v, str):
+            raise ValueError(f"Input must be a non-empty string but got '{v}' of type {type(v)}")
+        return v
+
+    @field_validator('actual_output', mode='before')
+    @classmethod
+    def validate_actual_output(cls, v):
+        if not isinstance(v, str):
+            raise ValueError(f"Actual output must be a string but got '{v}' of type {type(v)}")
+        return v
+
+    @field_validator('expected_output', mode='before')
+    @classmethod
+    def validate_expected_output(cls, v):
+        if v is not None and not isinstance(v, str):
+            raise ValueError(f"Expected output must be a string or None but got {v} of type {type(v)}")
+        return v
+
+    @field_validator('context', 'retrieval_context', 'tools_called', 'expected_tools', mode='before')
+    @classmethod
+    def validate_string_lists(cls, v, info):
+        field_name = info.field_name
+        if v is not None:
+            if not isinstance(v, list):
+                raise ValueError(f"{field_name} must be a list of strings or None but got {v} of type {type(v)}")
+            for i, item in enumerate(v):
+                if not isinstance(item, str):
+                    raise ValueError(f"All items in {field_name} must be strings but item at index {i} is {item} of type {type(item)}")
+        return v
+
+    @field_validator('additional_metadata', mode='before')
+    @classmethod
+    def validate_additional_metadata(cls, v):
+        if v is not None and not isinstance(v, dict):
+            raise ValueError(f"Additional metadata must be a dictionary or None but got {v} of type {type(v)}")
+        return v
+
+    @field_validator('example_index', mode='before')
+    @classmethod
+    def validate_example_index(cls, v):
+        if v is not None and not isinstance(v, int):
+            raise ValueError(f"Example index must be an integer or None but got {v} of type {type(v)}")
+        return v
+
+    @field_validator('timestamp', mode='before')
+    @classmethod
+    def validate_timestamp(cls, v):
+        if v is not None and not isinstance(v, str):
+            raise ValueError(f"Timestamp must be a string or None but got {v} of type {type(v)}")
+        return v
+
+    @field_validator('trace_id', mode='before')
+    @classmethod
+    def validate_trace_id(cls, v):
+        if v is not None and not isinstance(v, str):
+            raise ValueError(f"Trace ID must be a string or None but got {v} of type {type(v)}")
+        return v
 
     def to_dict(self):
         return {
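In practice, the net effect of this rewrite is that 0.0.18 rejects malformed fields instead of silently coercing them with str() as 0.0.16 did. A minimal sketch of the new behavior, assuming only what the validators above show:

    from judgeval.data import Example

    # Passes: both required fields are present and are non-empty strings.
    ok = Example(input="What is the capital of France?", actual_output="Paris")

    # Raises ValueError("Example must be initialized with 'input' field.")
    Example(actual_output="Paris")

    # Raises ValueError from validate_input: input must be a non-empty string,
    # whereas 0.0.16 would have converted 42 to "42".
    Example(input=42, actual_output="Paris")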
judgeval/data/scorer_data.py
CHANGED
judgeval/evaluation_run.py
CHANGED
@@ -111,7 +111,7 @@ class EvaluationRun(BaseModel):
         # Check if model is string or list of strings
         if isinstance(v, str):
             if v not in ACCEPTABLE_MODELS:
-                raise ValueError(f"Model name {v} not recognized.")
+                raise ValueError(f"Model name {v} not recognized. Please select a valid model name.)")
             return v
 
         if isinstance(v, list):
@@ -119,7 +119,7 @@ class EvaluationRun(BaseModel):
                 raise ValueError("When providing a list of models, all elements must be strings")
             for m in v:
                 if m not in ACCEPTABLE_MODELS:
-                    raise ValueError(f"Model name {m} not recognized.")
+                    raise ValueError(f"Model name {m} not recognized. Please select a valid model name.")
             return v
         raise ValueError(f"Model must be one of: string, list of strings, or JudgevalJudge instance. Received type {type(v)}.")
 
judgeval/judges/__init__.py
CHANGED
judgeval/judges/base_judge.py
CHANGED
judgeval/judges/mixture_of_judges.py
CHANGED
@@ -5,9 +5,14 @@ Enables client to use multiple models to generate responses and then aggregate t
 """
 from judgeval import *
 import pydantic
-from typing import List, Union, Mapping
+from typing import List, Union, Mapping
 from judgeval.judges import JudgevalJudge
-from judgeval.common.utils import
+from judgeval.common.utils import (
+    get_completion_multiple_models,
+    get_chat_completion,
+    aget_completion_multiple_models,
+    aget_chat_completion
+)
 from judgeval.common.logger import debug, error
 
 def build_dynamic_mixture_prompt(
judgeval/judgment_client.py
CHANGED
@@ -6,17 +6,17 @@ from typing import Optional, List, Dict, Any, Union
 import requests
 
 from judgeval.constants import ROOT_API
-from judgeval.data.datasets import EvalDataset, EvalDatasetClient
+from judgeval.data.datasets import EvalDataset, EvalDatasetClient
 from judgeval.data import (
     ScoringResult,
-    Example
+    Example,
+    GroundTruthExample
 )
 from judgeval.scorers import (
     APIJudgmentScorer,
     JudgevalScorer,
     ClassifierScorer,
     ScorerWrapper,
-    score,
 )
 from judgeval.evaluation_run import EvaluationRun
 from judgeval.run_evaluation import (
@@ -24,7 +24,11 @@ from judgeval.run_evaluation import (
     assert_test
 )
 from judgeval.judges import JudgevalJudge
-from judgeval.constants import
+from judgeval.constants import (
+    JUDGMENT_EVAL_FETCH_API_URL,
+    JUDGMENT_EVAL_DELETE_API_URL,
+    JUDGMENT_EVAL_DELETE_PROJECT_API_URL
+)
 from judgeval.common.exceptions import JudgmentAPIError
 from pydantic import BaseModel
 from judgeval.rules import Rule
@@ -306,7 +310,8 @@ class JudgmentClient:
                 "Authorization": f"Bearer {self.judgment_api_key}",
                 "X-Organization-Id": self.organization_id
             },
-            json=eval_run_request_body.model_dump()
+            json=eval_run_request_body.model_dump(),
+            verify=True)
         if eval_run.status_code != requests.codes.ok:
             raise ValueError(f"Error fetching eval results: {eval_run.json()}")
 
@@ -378,7 +383,8 @@ class JudgmentClient:
                 "Content-Type": "application/json",
                 "Authorization": f"Bearer {self.judgment_api_key}",
             },
-            json={}  # Empty body now
+            json={},  # Empty body now
+            verify=True
         )
         if response.status_code == 200:
             return True, response.json()
@@ -409,7 +415,8 @@ class JudgmentClient:
                 "Content-Type": "application/json",
                 "Authorization": f"Bearer {self.judgment_api_key}",
                 "X-Organization-Id": self.organization_id
-            }
+            },
+            verify=True
         )
 
         if response.status_code == 500:
@@ -452,7 +459,8 @@ class JudgmentClient:
                 "Content-Type": "application/json",
                 "Authorization": f"Bearer {self.judgment_api_key}",
                 "X-Organization-Id": self.organization_id
-            }
+            },
+            verify=True
         )
 
         if response.status_code == 500:
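A note on the recurring verify=True additions in this file (and in run_evaluation.py below): in the requests library this flag controls TLS certificate verification and already defaults to True, so behavior should be unchanged; the edit simply makes the intent explicit. A minimal sketch using only the standard requests API (the URL is a placeholder, not from the package):

    import requests

    # verify=True enables TLS certificate verification (the requests default).
    response = requests.post(
        "https://api.example.com/evaluate",  # placeholder URL
        headers={"Content-Type": "application/json"},
        json={},
        verify=True,
    )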
judgeval/rules.py
CHANGED
@@ -5,14 +5,12 @@ Rules system for Judgeval that enables alerts based on metric thresholds.
 from typing import Dict, List, Optional, Union, Any, Set, Tuple
 from pydantic import BaseModel, Field, field_validator, ConfigDict
 from enum import Enum
-from datetime import datetime
 import asyncio
 from concurrent.futures import ThreadPoolExecutor
 import time
-import uuid
+import uuid
 
-from judgeval.scorers import APIJudgmentScorer, JudgevalScorer
-from judgeval.scorers.judgeval_scorers import ScorerWrapper  # Import from the correct module
+from judgeval.scorers import APIJudgmentScorer, JudgevalScorer, ScorerWrapper
 
 class AlertStatus(str, Enum):
     """Status of an alert evaluation."""
judgeval/run_evaluation.py
CHANGED
@@ -5,7 +5,6 @@ from datetime import datetime
 from rich import print as rprint
 
 from judgeval.data import (
-    Example,
     ScorerData,
     ScoringResult
 )
@@ -25,13 +24,11 @@ from judgeval.constants import (
 from judgeval.common.exceptions import JudgmentAPIError
 from judgeval.evaluation_run import EvaluationRun
 from judgeval.common.logger import (
-    enable_logging,
     debug,
     info,
     error,
     example_logging_context
 )
-from judgeval.rules import RulesEngine, Rule, AlertResult, AlertStatus
 
 
 def execute_api_eval(evaluation_run: EvaluationRun) -> List[Dict]:
@@ -55,7 +52,8 @@ def execute_api_eval(evaluation_run: EvaluationRun) -> List[Dict]:
                 "Authorization": f"Bearer {evaluation_run.judgment_api_key}",
                 "X-Organization-Id": evaluation_run.organization_id
             },
-            json=payload
+            json=payload,
+            verify=True)
         response_data = response.json()
     except Exception as e:
         error(f"Error: {e}")
@@ -168,12 +166,13 @@ def check_eval_run_name_exists(eval_name: str, project_name: str, judgment_api_k
             "eval_name": eval_name,
             "project_name": project_name,
             "judgment_api_key": judgment_api_key,
-        }
+        },
+        verify=True
    )
 
    if response.status_code == 409:
-        error(f"
-        raise ValueError(f"
+        error(f"Eval run name '{eval_name}' already exists for this project. Please choose a different name or set the `override` flag to true.")
+        raise ValueError(f"Eval run name '{eval_name}' already exists for this project. Please choose a different name or set the `override` flag to true.")
 
    if not response.ok:
        response_data = response.json()
@@ -210,7 +209,8 @@ def log_evaluation_results(merged_results: List[ScoringResult], evaluation_run:
            "results": [result.to_dict() for result in merged_results],
            "project_name": evaluation_run.project_name,
            "eval_name": evaluation_run.eval_name,
-        }
+        },
+        verify=True
    )
 
    if not res.ok:
judgeval/scorers/__init__.py
CHANGED
@@ -14,6 +14,9 @@ from judgeval.scorers.judgeval_scorers import (
     ScorerWrapper,
     AnswerCorrectnessScorer,
     Text2SQLScorer,
+    ComparisonScorer,
+    InstructionAdherenceScorer,
+    GroundednessScorer,
 )
 
 __all__ = [
@@ -33,4 +36,7 @@ __all__ = [
     "ScorerWrapper",
     "AnswerCorrectnessScorer",
     "Text2SQLScorer",
+    "ComparisonScorer",
+    "InstructionAdherenceScorer",
+    "GroundednessScorer",
 ]
judgeval/scorers/api_scorer.py
CHANGED
@@ -7,7 +7,7 @@ Scores `Example`s using ready-made Judgment evaluators.
 from pydantic import BaseModel, field_validator
 from judgeval.common.logger import debug, info, warning, error
 
-from judgeval.constants import APIScorer
+from judgeval.constants import APIScorer, UNBOUNDED_SCORERS
 
 
 class APIJudgmentScorer(BaseModel):
@@ -18,17 +18,23 @@ class APIJudgmentScorer(BaseModel):
         score_type (APIScorer): The Judgment metric to use for scoring `Example`s
         threshold (float): A value between 0 and 1 that determines the scoring threshold
     """
-    threshold: float
     score_type: APIScorer
+    threshold: float
 
     @field_validator('threshold')
-    def validate_threshold(cls, v):
+    def validate_threshold(cls, v, info):
         """
         Validates that the threshold is between 0 and 1 inclusive.
         """
-
-
-
+        score_type = info.data.get('score_type')
+        if score_type in UNBOUNDED_SCORERS:
+            if v < 0:
+                error(f"Threshold for {score_type} must be greater than 0, got: {v}")
+                raise ValueError(f"Threshold for {score_type} must be greater than 0, got: {v}")
+        else:
+            if not 0 <= v <= 1:
+                error(f"Threshold for {score_type} must be between 0 and 1, got: {v}")
+                raise ValueError(f"Threshold for {score_type} must be between 0 and 1, got: {v}")
         return v
 
     @field_validator('score_type')
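Worth noting: swapping the declaration order so score_type precedes threshold is what makes info.data.get('score_type') work, because a pydantic v2 field_validator only sees previously validated fields in info.data. A stand-alone sketch of the pattern, using illustrative names rather than the package's constants:

    from pydantic import BaseModel, ValidationInfo, field_validator

    UNBOUNDED = {"comparison"}  # stand-in for judgeval's UNBOUNDED_SCORERS

    class Scorer(BaseModel):
        score_type: str   # declared first so it is available in info.data
        threshold: float

        @field_validator("threshold")
        def check_threshold(cls, v: float, info: ValidationInfo) -> float:
            if info.data.get("score_type") in UNBOUNDED:
                if v < 0:
                    raise ValueError("threshold must be non-negative")
            elif not 0 <= v <= 1:
                raise ValueError("threshold must be between 0 and 1")
            return v

    Scorer(score_type="comparison", threshold=3.0)    # accepted: unbounded scorer
    Scorer(score_type="faithfulness", threshold=0.7)  # accepted: bounded scorer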
judgeval/scorers/base_scorer.py
CHANGED
@@ -7,7 +7,7 @@ Scores `Example`s using ready-made Judgment evaluators.
 from pydantic import BaseModel, field_validator
 from judgeval.common.logger import debug, info, warning, error
 
-from judgeval.constants import APIScorer
+from judgeval.constants import APIScorer, UNBOUNDED_SCORERS
 
 
 class APIJudgmentScorer(BaseModel):
@@ -18,17 +18,23 @@ class APIJudgmentScorer(BaseModel):
         score_type (APIScorer): The Judgment metric to use for scoring `Example`s
         threshold (float): A value between 0 and 1 that determines the scoring threshold
     """
-    threshold: float
     score_type: APIScorer
+    threshold: float
 
     @field_validator('threshold')
-    def validate_threshold(cls, v):
+    def validate_threshold(cls, v, info):
         """
         Validates that the threshold is between 0 and 1 inclusive.
         """
-
-
-
+        score_type = info.data.get('score_type')
+        if score_type in UNBOUNDED_SCORERS:
+            if v < 0:
+                error(f"Threshold for {score_type} must be greater than 0, got: {v}")
+                raise ValueError(f"Threshold for {score_type} must be greater than 0, got: {v}")
+        else:
+            if not 0 <= v <= 1:
+                error(f"Threshold for {score_type} must be between 0 and 1, got: {v}")
+                raise ValueError(f"Threshold for {score_type} must be between 0 and 1, got: {v}")
         return v
 
     @field_validator('score_type')
judgeval/scorers/judgeval_scorer.py
CHANGED
@@ -11,7 +11,7 @@ from abc import abstractmethod
 from judgeval.common.logger import debug, info, warning, error
 from judgeval.judges import JudgevalJudge
 from judgeval.judges.utils import create_judge
-
+from judgeval.constants import UNBOUNDED_SCORERS
 
 class JudgevalScorer:
     """
@@ -58,8 +58,12 @@
         additional_metadata: Optional[Dict] = None
     ):
         debug(f"Initializing JudgevalScorer with score_type={score_type}, threshold={threshold}")
-        if
-
+        if score_type in UNBOUNDED_SCORERS:
+            if threshold < 0:
+                raise ValueError(f"Threshold for {score_type} must be greater than 0, got: {threshold}")
+        else:
+            if not 0 <= threshold <= 1:
+                raise ValueError(f"Threshold for {score_type} must be between 0 and 1, got: {threshold}")
         if strict_mode:
             warning("Strict mode enabled - scoring will be more rigorous")
         info(f"JudgevalScorer initialized with evaluation_model: {evaluation_model}")
judgeval/scorers/judgeval_scorers/__init__.py
CHANGED
@@ -1,5 +1,4 @@
 from typing import Type, Optional, Any
-from functools import wraps
 
 # Import implementations
 from judgeval.scorers.judgeval_scorers.api_scorers import (
@@ -12,7 +11,10 @@ from judgeval.scorers.judgeval_scorers.api_scorers import (
     ContextualPrecisionScorer as APIContextualPrecisionScorer,
     ContextualRecallScorer as APIContextualRecallScorer,
     AnswerRelevancyScorer as APIAnswerRelevancyScorer,
-    AnswerCorrectnessScorer as APIAnswerCorrectnessScorer,
+    AnswerCorrectnessScorer as APIAnswerCorrectnessScorer,
+    ComparisonScorer as APIComparisonScorer,
+    InstructionAdherenceScorer as APIInstructionAdherenceScorer,
+    GroundednessScorer as APIGroundednessScorer,
 )
 
 from judgeval.scorers.judgeval_scorers.local_implementations import (
@@ -25,7 +27,9 @@ from judgeval.scorers.judgeval_scorers.local_implementations import (
     ToolCorrectnessScorer as LocalToolCorrectnessScorer,
     HallucinationScorer as LocalHallucinationScorer,
     SummarizationScorer as LocalSummarizationScorer,
-    AnswerCorrectnessScorer as LocalAnswerCorrectnessScorer
+    AnswerCorrectnessScorer as LocalAnswerCorrectnessScorer,
+    ComparisonScorer as LocalComparisonScorer,
+    InstructionAdherenceScorer as LocalInstructionAdherenceScorer,
 )
 
 from judgeval.scorers.judgeval_scorers.classifiers import Text2SQLScorer
@@ -134,6 +138,21 @@ ContextualRecallScorer = ScorerWrapper(
     local_implementation=LocalContextualRecallScorer
 )
 
+InstructionAdherenceScorer = ScorerWrapper(
+    api_implementation=APIInstructionAdherenceScorer,
+    local_implementation=LocalInstructionAdherenceScorer
+)
+
+def ComparisonScorer(threshold: float, criteria: str, description: str):
+    return ScorerWrapper(
+        api_implementation=APIComparisonScorer,
+        local_implementation=LocalComparisonScorer
+    )(threshold=threshold, criteria=criteria, description=description)
+
+GroundednessScorer = ScorerWrapper(
+    api_implementation=APIGroundednessScorer,
+)
+
 __all__ = [
     "ToolCorrectnessScorer",
     "JSONCorrectnessScorer",
@@ -145,4 +164,6 @@ __all__ = [
     "ContextualRecallScorer",
     "AnswerRelevancyScorer",
     "Text2SQLScorer",
+    "ComparisonScorer",
+    "GroundednessScorer",
 ]
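Judging purely from the wrapper code above, the comparison scorer is now built through a small factory rather than assigned as a bare ScorerWrapper, so its extra arguments travel with it. A hypothetical call (argument values are illustrative, not taken from the package's docs):

    from judgeval.scorers import ComparisonScorer

    # threshold, criteria and description are forwarded to the wrapped implementation.
    tone_scorer = ComparisonScorer(
        threshold=1,
        criteria="tone",
        description="The response should match the formal tone of the expected output.",
    )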
judgeval/scorers/judgeval_scorers/api_scorers/__init__.py
CHANGED
@@ -8,6 +8,9 @@ from judgeval.scorers.judgeval_scorers.api_scorers.contextual_precision import C
 from judgeval.scorers.judgeval_scorers.api_scorers.contextual_recall import ContextualRecallScorer
 from judgeval.scorers.judgeval_scorers.api_scorers.answer_relevancy import AnswerRelevancyScorer
 from judgeval.scorers.judgeval_scorers.api_scorers.answer_correctness import AnswerCorrectnessScorer
+from judgeval.scorers.judgeval_scorers.api_scorers.comparison import ComparisonScorer
+from judgeval.scorers.judgeval_scorers.api_scorers.instruction_adherence import InstructionAdherenceScorer
+from judgeval.scorers.judgeval_scorers.api_scorers.groundedness import GroundednessScorer
 
 __all__ = [
     "ToolCorrectnessScorer",
@@ -20,4 +23,7 @@ __all__ = [
     "ContextualRecallScorer",
     "AnswerRelevancyScorer",
     "AnswerCorrectnessScorer",
+    "ComparisonScorer",
+    "InstructionAdherenceScorer",
+    "GroundednessScorer",
 ]
judgeval/scorers/judgeval_scorers/api_scorers/comparison.py
ADDED
@@ -0,0 +1,35 @@
+"""
+`judgeval` comparison scorer
+
+TODO add link to docs page for this scorer
+
+"""
+
+# Internal imports
+from judgeval.scorers.api_scorer import APIJudgmentScorer
+from judgeval.constants import APIScorer
+from typing import Optional, Dict
+
+class ComparisonScorer(APIJudgmentScorer):
+    kwargs: Optional[Dict] = None
+
+    def __init__(self, threshold: float, criteria: str, description: str):
+        super().__init__(threshold=threshold, score_type=APIScorer.COMPARISON)
+        self.kwargs = {"criteria": criteria, "description": description}
+
+    @property
+    def __name__(self):
+        return f"Comparison-{self.kwargs['criteria']}"
+
+    def to_dict(self) -> dict:
+        """
+        Converts the scorer configuration to a dictionary format.
+
+        Returns:
+            dict: A dictionary containing the scorer's configuration
+        """
+        return {
+            "score_type": self.score_type,
+            "threshold": self.threshold,
+            "kwargs": self.kwargs
+        }
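For the API-side class just added, the serialized payload follows directly from to_dict(); a minimal sketch (the concrete APIScorer.COMPARISON value is not shown in this diff, so it is left symbolic):

    from judgeval.scorers.judgeval_scorers.api_scorers.comparison import ComparisonScorer

    scorer = ComparisonScorer(
        threshold=1,
        criteria="conciseness",
        description="Prefer the shorter of two otherwise equivalent answers.",
    )
    print(scorer.__name__)   # Comparison-conciseness
    print(scorer.to_dict())
    # {'score_type': APIScorer.COMPARISON, 'threshold': 1,
    #  'kwargs': {'criteria': 'conciseness', 'description': 'Prefer the shorter ...'}}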
judgeval/scorers/judgeval_scorers/api_scorers/groundedness.py
ADDED
@@ -0,0 +1,19 @@
+"""
+`judgeval` Groundedness scorer
+
+TODO add link to docs page for this scorer
+
+"""
+
+# Internal imports
+from judgeval.scorers.api_scorer import APIJudgmentScorer
+from judgeval.constants import APIScorer
+
+
+class GroundednessScorer(APIJudgmentScorer):
+    def __init__(self, threshold: float):
+        super().__init__(threshold=threshold, score_type=APIScorer.GROUNDEDNESS)
+
+    @property
+    def __name__(self):
+        return "Groundedness"
judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py
ADDED
@@ -0,0 +1,19 @@
+"""
+`judgeval` instruction adherence scorer
+
+TODO add link to docs page for this scorer
+
+"""
+
+# Internal imports
+from judgeval.scorers.api_scorer import APIJudgmentScorer
+from judgeval.constants import APIScorer
+
+
+class InstructionAdherenceScorer(APIJudgmentScorer):
+    def __init__(self, threshold: float):
+        super().__init__(threshold=threshold, score_type=APIScorer.INSTRUCTION_ADHERENCE)
+
+    @property
+    def __name__(self):
+        return "Instruction Adherence"
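Both new API scorers above take only a threshold; a minimal sketch of constructing them through the api_scorers package shown earlier:

    from judgeval.scorers.judgeval_scorers.api_scorers import (
        GroundednessScorer,
        InstructionAdherenceScorer,
    )

    grounded = GroundednessScorer(threshold=0.8)
    adherence = InstructionAdherenceScorer(threshold=0.8)
    print(grounded.__name__, adherence.__name__)  # Groundedness Instruction Adherence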
judgeval/scorers/judgeval_scorers/local_implementations/__init__.py
CHANGED
@@ -8,11 +8,13 @@ from judgeval.scorers.judgeval_scorers.local_implementations.tool_correctness.to
 from judgeval.scorers.judgeval_scorers.local_implementations.hallucination.hallucination_scorer import HallucinationScorer
 from judgeval.scorers.judgeval_scorers.local_implementations.summarization.summarization_scorer import SummarizationScorer
 from judgeval.scorers.judgeval_scorers.local_implementations.answer_correctness.answer_correctness_scorer import AnswerCorrectnessScorer
-
+from judgeval.scorers.judgeval_scorers.local_implementations.comparison.comparison_scorer import ComparisonScorer
+from judgeval.scorers.judgeval_scorers.local_implementations.instruction_adherence.instruction_adherence import InstructionAdherenceScorer
 
 __all__ = [
     "AnswerCorrectnessScorer",
     "AnswerRelevancyScorer",
+    "ComparisonScorer",
     "ContextualPrecisionScorer",
     "ContextualRecallScorer",
     "ContextualRelevancyScorer",
@@ -21,4 +23,5 @@ __all__ = [
     "ToolCorrectnessScorer",
     "HallucinationScorer",
     "SummarizationScorer",
+    "InstructionAdherenceScorer",
 ]
|