judgeval 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/__init__.py +83 -0
- judgeval/clients.py +19 -0
- judgeval/common/__init__.py +8 -0
- judgeval/common/exceptions.py +28 -0
- judgeval/common/logger.py +189 -0
- judgeval/common/tracer.py +587 -0
- judgeval/common/utils.py +763 -0
- judgeval/constants.py +55 -0
- judgeval/data/__init__.py +14 -0
- judgeval/data/api_example.py +111 -0
- judgeval/data/datasets/__init__.py +4 -0
- judgeval/data/datasets/dataset.py +407 -0
- judgeval/data/datasets/ground_truth.py +54 -0
- judgeval/data/datasets/utils.py +74 -0
- judgeval/data/example.py +76 -0
- judgeval/data/result.py +83 -0
- judgeval/data/scorer_data.py +86 -0
- judgeval/evaluation_run.py +130 -0
- judgeval/judges/__init__.py +7 -0
- judgeval/judges/base_judge.py +44 -0
- judgeval/judges/litellm_judge.py +49 -0
- judgeval/judges/mixture_of_judges.py +248 -0
- judgeval/judges/together_judge.py +55 -0
- judgeval/judges/utils.py +45 -0
- judgeval/judgment_client.py +244 -0
- judgeval/run_evaluation.py +355 -0
- judgeval/scorers/__init__.py +30 -0
- judgeval/scorers/base_scorer.py +51 -0
- judgeval/scorers/custom_scorer.py +134 -0
- judgeval/scorers/judgeval_scorers/__init__.py +21 -0
- judgeval/scorers/judgeval_scorers/answer_relevancy.py +19 -0
- judgeval/scorers/judgeval_scorers/contextual_precision.py +19 -0
- judgeval/scorers/judgeval_scorers/contextual_recall.py +19 -0
- judgeval/scorers/judgeval_scorers/contextual_relevancy.py +22 -0
- judgeval/scorers/judgeval_scorers/faithfulness.py +19 -0
- judgeval/scorers/judgeval_scorers/hallucination.py +19 -0
- judgeval/scorers/judgeval_scorers/json_correctness.py +32 -0
- judgeval/scorers/judgeval_scorers/summarization.py +20 -0
- judgeval/scorers/judgeval_scorers/tool_correctness.py +19 -0
- judgeval/scorers/prompt_scorer.py +439 -0
- judgeval/scorers/score.py +427 -0
- judgeval/scorers/utils.py +175 -0
- judgeval-0.0.1.dist-info/METADATA +40 -0
- judgeval-0.0.1.dist-info/RECORD +46 -0
- judgeval-0.0.1.dist-info/WHEEL +4 -0
- judgeval-0.0.1.dist-info/licenses/LICENSE.md +202 -0
judgeval/run_evaluation.py
@@ -0,0 +1,355 @@
+import asyncio
+import requests
+from typing import List, Dict
+from datetime import datetime
+from rich import print as rprint
+
+from judgeval.data import (
+    Example,
+    ScorerData,
+    ScoringResult
+)
+from judgeval.scorers import (
+    CustomScorer,
+    JudgmentScorer,
+    ClassifierScorer
+)
+from judgeval.scorers.score import a_execute_scoring
+
+from judgeval.constants import (
+    ROOT_API,
+    JUDGMENT_EVAL_API_URL,
+    JUDGMENT_EVAL_LOG_API_URL,
+)
+from judgeval.common.exceptions import JudgmentAPIError
+from judgeval.evaluation_run import EvaluationRun
+from judgeval.common.logger import (
+    enable_logging,
+    debug,
+    info,
+    error,
+    example_logging_context
+)
+
+
+def execute_api_eval(evaluation_run: EvaluationRun) -> List[Dict]:
+    """
+    Executes an evaluation of a list of `Example`s using one or more `JudgmentScorer`s via the Judgment API.
+
+    Args:
+        evaluation_run (EvaluationRun): The evaluation run object containing the examples, scorers, and metadata
+
+    Returns:
+        List[Dict]: The results of the evaluation. Each result is a dictionary containing the fields of a `ScoringResult`
+                    object.
+    """
+
+    try:
+        # submit API request to execute evals
+        payload = evaluation_run.model_dump(warnings=False)
+        response = requests.post(JUDGMENT_EVAL_API_URL, json=payload)
+        response_data = response.json()
+    except Exception as e:
+        error(f"Error: {e}")
+        details = response.json().get("detail", "No details provided")
+        raise JudgmentAPIError("An error occurred while executing the Judgment API request: " + details)
+    # Check if the response status code is not 2XX
+    # Add check for the duplicate eval run name
+    if not response.ok:
+        error_message = response_data.get('detail', 'An unknown error occurred.')
+        error(f"Error: {error_message=}")
+        raise JudgmentAPIError(error_message)
+    return response_data
+
+
+def merge_results(api_results: List[ScoringResult], local_results: List[ScoringResult]) -> List[ScoringResult]:
+    """
+    When executing scorers that come from both the Judgment API and custom scorers, we're left with
+    results for each type of scorer. This function merges the results from the API and local evaluations,
+    grouped by example. In particular, we merge the `scorers_data` field of each `ScoringResult` object.
+
+    Args:
+        api_results (List[ScoringResult]): The `ScoringResult`s from the API evaluation
+        local_results (List[ScoringResult]): The `ScoringResult`s from the local evaluation
+
+    Returns:
+        List[ScoringResult]: The merged `ScoringResult`s (updated `scorers_data` field)
+    """
+    # No merge required
+    if not local_results and api_results:
+        return api_results
+    if not api_results and local_results:
+        return local_results
+
+    if len(api_results) != len(local_results):
+        # Results should be of same length because each ScoringResult is a 1-1 mapping to an Example
+        raise ValueError(f"The number of API and local results do not match: {len(api_results)} vs {len(local_results)}")
+
+    # Each ScoringResult in api and local have all the same fields besides `scorers_data`
+    for api_result, local_result in zip(api_results, local_results):
+        if api_result.input != local_result.input:
+            raise ValueError("The API and local results are not aligned.")
+        if api_result.actual_output != local_result.actual_output:
+            raise ValueError("The API and local results are not aligned.")
+        if api_result.expected_output != local_result.expected_output:
+            raise ValueError("The API and local results are not aligned.")
+        if api_result.context != local_result.context:
+            raise ValueError("The API and local results are not aligned.")
+        if api_result.retrieval_context != local_result.retrieval_context:
+            raise ValueError("The API and local results are not aligned.")
+
+        # Merge ScorerData from the API and local scorers together
+        api_scorer_data = api_result.scorers_data
+        local_scorer_data = local_result.scorers_data
+        if api_scorer_data is None and local_scorer_data is not None:
+            api_result.scorers_data = local_scorer_data
+
+        if api_scorer_data is not None and local_scorer_data is not None:
+            api_result.scorers_data = api_scorer_data + local_scorer_data
+
+    return api_results
+
+
+def check_missing_scorer_data(results: List[ScoringResult]) -> List[ScoringResult]:
+    """
+    Checks if any `ScoringResult` objects are missing `scorers_data`.
+
+    If any are missing, logs an error and returns the results.
+    """
+    for i, result in enumerate(results):
+        if not result.scorers_data:
+            error(
+                f"Scorer data is missing for example {i}. "
+                "This is usually caused when the example does not contain "
+                "the fields required by the scorer. "
+                "Check that your example contains the fields required by the scorers. "
+                "TODO add docs link here for reference."
+            )
+    return results
+
+def check_eval_run_name_exists(eval_name: str, project_name: str, judgment_api_key: str) -> None:
+    """
+    Checks if an evaluation run name already exists for a given project.
+
+    Args:
+        eval_name (str): Name of the evaluation run
+        project_name (str): Name of the project
+        judgment_api_key (str): API key for authentication
+
+    Raises:
+        ValueError: If the evaluation run name already exists
+        JudgmentAPIError: If there's an API error during the check
+    """
+    try:
+        response = requests.post(
+            f"{ROOT_API}/eval-run-name-exists/",
+            json={
+                "eval_name": eval_name,
+                "project_name": project_name,
+                "judgment_api_key": judgment_api_key,
+            }
+        )
+
+        if response.status_code == 409:
+            error(f"Evaluation run name '{eval_name}' already exists for this project")
+            raise ValueError(f"Evaluation run name '{eval_name}' already exists for this project")
+
+        if not response.ok:
+            response_data = response.json()
+            error_message = response_data.get('detail', 'An unknown error occurred.')
+            error(f"Error checking eval run name: {error_message}")
+            raise JudgmentAPIError(error_message)
+
+    except requests.exceptions.RequestException as e:
+        error(f"Failed to check if eval run name exists: {str(e)}")
+        raise JudgmentAPIError(f"Failed to check if eval run name exists: {str(e)}")
+
+def log_evaluation_results(merged_results: List[ScoringResult], evaluation_run: EvaluationRun) -> None:
+    """
+    Logs evaluation results to the Judgment API database.
+
+    Args:
+        merged_results (List[ScoringResult]): The results to log
+        evaluation_run (EvaluationRun): The evaluation run containing project info and API key
+
+    Raises:
+        JudgmentAPIError: If there's an API error during logging
+        ValueError: If there's a validation error with the results
+    """
+    try:
+        res = requests.post(
+            JUDGMENT_EVAL_LOG_API_URL,
+            json={
+                "results": [result.to_dict() for result in merged_results],
+                "judgment_api_key": evaluation_run.judgment_api_key,
+                "project_name": evaluation_run.project_name,
+                "eval_name": evaluation_run.eval_name,
+            }
+        )
+
+        if not res.ok:
+            response_data = res.json()
+            error_message = response_data.get('detail', 'An unknown error occurred.')
+            error(f"Error {res.status_code}: {error_message}")
+            raise JudgmentAPIError(error_message)
+
+        if "ui_results_url" in res.json():
+            rprint(f"\n🔍 You can view your evaluation results here: [rgb(106,0,255)]{res.json()['ui_results_url']}[/]\n")
+
+    except requests.exceptions.RequestException as e:
+        error(f"Request failed while saving evaluation results to DB: {str(e)}")
+        raise JudgmentAPIError(f"Request failed while saving evaluation results to DB: {str(e)}")
+    except Exception as e:
+        error(f"Failed to save evaluation results to DB: {str(e)}")
+        raise ValueError(f"Failed to save evaluation results to DB: {str(e)}")
+
+def run_eval(evaluation_run: EvaluationRun, override: bool = False) -> List[ScoringResult]:
+    """
+    Executes an evaluation of `Example`s using one or more `Scorer`s
+
+    Args:
+        evaluation_run (EvaluationRun): Stores example and evaluation together for running
+
+        Args:
+            project_name (str): The name of the project the evaluation results belong to
+            eval_name (str): The name of the evaluation run
+            examples (List[Example]): The examples to evaluate
+            scorers (List[Union[JudgmentScorer, CustomScorer]]): A list of scorers to use for evaluation
+            model (str): The model used as a judge when using LLM as a Judge
+            aggregator (Optional[str]): The aggregator to use for evaluation if using Mixture of Judges
+            metadata (Optional[Dict[str, Any]]): Additional metadata to include for this evaluation run, e.g. comments, dataset name, purpose, etc.
+            judgment_api_key (Optional[str]): The API key for running evaluations on the Judgment API
+            log_results (bool): Whether to log the results to the Judgment API
+
+
+    Returns:
+        List[ScoringResult]: The results of the evaluation. Each result is a dictionary containing the fields of a `ScoringResult` object.
+    """
+
+    # Call endpoint to check to see if eval run name exists (if we DON'T want to override and DO want to log results)
+    if not override and evaluation_run.log_results:
+        check_eval_run_name_exists(
+            evaluation_run.eval_name,
+            evaluation_run.project_name,
+            evaluation_run.judgment_api_key
+        )
+
+    # Set example IDs if not already set
+    debug("Initializing examples with IDs and timestamps")
+    for idx, example in enumerate(evaluation_run.examples):
+        if example.example_id is None:
+            example.example_id = idx
+            debug(f"Set example ID {idx} for input: {example.input[:50]}...")
+        example.timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+        with example_logging_context(example.timestamp, example.example_id):
+            debug(f"Initialized example {example.example_id}")
+            debug(f"Input: {example.input}")
+            debug(f"Actual output: {example.actual_output}")
+            if example.expected_output:
+                debug(f"Expected output: {example.expected_output}")
+            if example.context:
+                debug(f"Context: {example.context}")
+            if example.retrieval_context:
+                debug(f"Retrieval context: {example.retrieval_context}")
+
+    debug(f"Starting evaluation run with {len(evaluation_run.examples)} examples")
+
+    # Group JudgmentScorers and CustomScorers, then evaluate them in parallel
+    debug("Grouping scorers by type")
+    judgment_scorers: List[JudgmentScorer] = []
+    custom_scorers: List[CustomScorer] = []
+    for scorer in evaluation_run.scorers:
+        if isinstance(scorer, (JudgmentScorer, ClassifierScorer)):
+            judgment_scorers.append(scorer)
+            debug(f"Added judgment scorer: {type(scorer).__name__}")
+        else:
+            custom_scorers.append(scorer)
+            debug(f"Added custom scorer: {type(scorer).__name__}")
+
+    debug(f"Found {len(judgment_scorers)} judgment scorers and {len(custom_scorers)} custom scorers")
+
+    api_results: List[ScoringResult] = []
+    local_results: List[ScoringResult] = []
+
+    # Execute evaluation using Judgment API
+    if judgment_scorers:
+        info("Starting API evaluation")
+        debug(f"Creating API evaluation run with {len(judgment_scorers)} scorers")
+        try: # execute an EvaluationRun with just JudgmentScorers
+            api_evaluation_run: EvaluationRun = EvaluationRun(
+                eval_name=evaluation_run.eval_name,
+                project_name=evaluation_run.project_name,
+                examples=evaluation_run.examples,
+                scorers=judgment_scorers,
+                model=evaluation_run.model,
+                aggregator=evaluation_run.aggregator,
+                metadata=evaluation_run.metadata,
+                judgment_api_key=evaluation_run.judgment_api_key,
+                log_results=evaluation_run.log_results
+            )
+            debug("Sending request to Judgment API")
+            response_data: List[Dict] = execute_api_eval(api_evaluation_run) # ScoringResults
+            info(f"Received {len(response_data['results'])} results from API")
+        except JudgmentAPIError as e:
+            error(f"An error occurred while executing the Judgment API request: {str(e)}")
+            raise JudgmentAPIError(f"An error occurred while executing the Judgment API request: {str(e)}")
+        except ValueError as e:
+            raise ValueError(f"Please check your EvaluationRun object, one or more fields are invalid: {str(e)}")
+
+        # Convert the response data to `ScoringResult` objects
+        debug("Processing API results")
+        for idx, result in enumerate(response_data["results"]):
+            with example_logging_context(evaluation_run.examples[idx].timestamp, evaluation_run.examples[idx].example_id):
+                for scorer in judgment_scorers:
+                    debug(f"Processing API result for example {idx} and scorer {scorer.score_type}")
+            # filter for key-value pairs that are used to initialize ScoringResult
+            # there may be some stuff in here that doesn't belong in ScoringResult
+            # TODO: come back and refactor this to have ScoringResult take in **kwargs
+            filtered_result = {k: v for k, v in result.items() if k in ScoringResult.__annotations__}
+
+            # Convert scorers_data dicts to ScorerData objects
+            if "scorers_data" in filtered_result and filtered_result["scorers_data"]:
+                filtered_result["scorers_data"] = [
+                    ScorerData(**scorer_dict)
+                    for scorer_dict in filtered_result["scorers_data"]
+                ]
+
+            api_results.append(ScoringResult(**filtered_result))
+
+    # Run local evals
+    if custom_scorers: # List[CustomScorer]
+        info("Starting local evaluation")
+        for example in evaluation_run.examples:
+            with example_logging_context(example.timestamp, example.example_id):
+                debug(f"Processing example {example.example_id}: {example.input}")
+
+        results: List[ScoringResult] = asyncio.run(
+            a_execute_scoring(
+                evaluation_run.examples,
+                custom_scorers,
+                model=evaluation_run.model,
+                ignore_errors=True,
+                skip_on_missing_params=True,
+                show_indicator=True,
+                _use_bar_indicator=True,
+                throttle_value=0,
+                max_concurrent=100,
+            )
+        )
+        local_results = results
+        info(f"Local evaluation complete with {len(local_results)} results")
+
+    # Aggregate the ScorerData from the API and local evaluations
+    debug("Merging API and local results")
+    merged_results: List[ScoringResult] = merge_results(api_results, local_results)
+    merged_results = check_missing_scorer_data(merged_results)
+
+    info(f"Successfully merged {len(merged_results)} results")
+
+    if evaluation_run.log_results:
+        log_evaluation_results(merged_results, evaluation_run)
+
+    for i, result in enumerate(merged_results):
+        if not result.scorers_data: # none of the scorers could be executed on this example
+            info(f"None of the scorers could be executed on example {i}. This is usually because the Example is missing the fields needed by the scorers. Try checking that the Example has the necessary fields for your scorers.")
+    return merged_results
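Note: for reference, a minimal sketch of driving run_eval above end to end. This is illustrative only and not part of the package diff; it assumes Example and EvaluationRun accept the keyword fields that run_eval's docstring and attribute accesses imply (input, actual_output, retrieval_context, eval_name, project_name, examples, scorers, model, log_results, judgment_api_key), since their actual constructors live in judgeval/data/example.py and judgeval/evaluation_run.py, which are not shown in this hunk. The run name, project name, and judge model string are hypothetical.

    from judgeval.data import Example
    from judgeval.evaluation_run import EvaluationRun
    from judgeval.scorers import AnswerRelevancyScorer
    from judgeval.run_evaluation import run_eval

    # Hypothetical example; the field names mirror those read by run_eval's logging above.
    example = Example(
        input="What is the capital of France?",
        actual_output="Paris is the capital of France.",
        retrieval_context=["Paris is the capital and largest city of France."],
    )

    run = EvaluationRun(
        eval_name="answer-relevancy-smoke-test",         # hypothetical run name
        project_name="demo-project",                     # hypothetical project name
        examples=[example],
        scorers=[AnswerRelevancyScorer(threshold=0.5)],  # a JudgmentScorer subclass -> API branch
        model="gpt-4o",                                  # judge model string; supported models are not listed in this diff
        log_results=False,                               # skips the duplicate-name check and DB logging
        judgment_api_key="YOUR_JUDGMENT_API_KEY",
    )

    # JudgmentScorers are batched into a single Judgment API call; any CustomScorers would run
    # locally through a_execute_scoring. With no custom scorers, merge_results returns the API results.
    results = run_eval(run, override=True)
    for result in results:
        print(result.scorers_data)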
judgeval/scorers/__init__.py
@@ -0,0 +1,30 @@
+from judgeval.scorers.base_scorer import JudgmentScorer
+from judgeval.scorers.custom_scorer import CustomScorer
+from judgeval.scorers.prompt_scorer import PromptScorer, ClassifierScorer
+from judgeval.scorers.judgeval_scorers import (
+    ToolCorrectnessScorer,
+    JSONCorrectnessScorer,
+    SummarizationScorer,
+    HallucinationScorer,
+    FaithfulnessScorer,
+    ContextualRelevancyScorer,
+    ContextualPrecisionScorer,
+    ContextualRecallScorer,
+    AnswerRelevancyScorer,
+)
+
+__all__ = [
+    "JudgmentScorer",
+    "CustomScorer",
+    "PromptScorer",
+    "ClassifierScorer",
+    "ToolCorrectnessScorer",
+    "JSONCorrectnessScorer",
+    "SummarizationScorer",
+    "HallucinationScorer",
+    "FaithfulnessScorer",
+    "ContextualRelevancyScorer",
+    "ContextualPrecisionScorer",
+    "ContextualRecallScorer",
+    "AnswerRelevancyScorer",
+]
judgeval/scorers/base_scorer.py
@@ -0,0 +1,51 @@
+"""
+Judgment Scorer class.
+
+Scores `Example`s using ready-made Judgment evaluators.
+"""
+
+from pydantic import BaseModel, field_validator
+from judgeval.common.logger import debug, info, warning, error
+
+from judgeval.constants import APIScorer
+
+
+class JudgmentScorer(BaseModel):
+    """
+    Class for ready-made, "out-of-the-box" scorer that uses Judgment evaluators to score `Example`s.
+
+    Args:
+        score_type (APIScorer): The Judgment metric to use for scoring `Example`s
+        threshold (float): A value between 0 and 1 that determines the scoring threshold
+    """
+    threshold: float
+    score_type: APIScorer
+
+    @field_validator('threshold')
+    def validate_threshold(cls, v):
+        """
+        Validates that the threshold is between 0 and 1 inclusive.
+        """
+        if not 0 <= v <= 1:
+            error(f"Threshold must be between 0 and 1, got: {v}")
+            raise ValueError(f"Threshold must be between 0 and 1, got: {v}")
+        return v
+
+    @field_validator('score_type')
+    def convert_to_enum_value(cls, v):
+        """
+        Validates that the `score_type` is a valid `JudgmentMetric` enum value.
+        Converts string values to `JudgmentMetric` enum values.
+        """
+        debug(f"Attempting to convert score_type value: {v}")
+        if isinstance(v, APIScorer):
+            info(f"Using existing JudgmentMetric: {v.value}")
+            return v.value
+        elif isinstance(v, str):
+            debug(f"Converting string value to JudgmentMetric enum: {v}")
+            return APIScorer[v.upper()].value
+        error(f"Invalid score_type value: {v}")
+        raise ValueError(f"Invalid value for score_type: {v}")
+
+    def __str__(self):
+        return f"JudgmentScorer(score_type={self.score_type}, threshold={self.threshold})"
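Note: a short sketch of how the two validators above behave. It assumes APIScorer is an enum with an ANSWER_RELEVANCY member (referenced later in this diff) and that pydantic surfaces the validator's ValueError as its usual validation error (a ValueError subclass); the exact enum values and error text are not shown here.

    from judgeval.scorers.base_scorer import JudgmentScorer
    from judgeval.constants import APIScorer

    # Enum input is accepted; convert_to_enum_value stores the enum's underlying value.
    # A plain string would instead be upper-cased and looked up in APIScorer by name.
    scorer = JudgmentScorer(threshold=0.5, score_type=APIScorer.ANSWER_RELEVANCY)
    print(scorer)  # JudgmentScorer(score_type=..., threshold=0.5)

    # An out-of-range threshold is rejected by validate_threshold.
    try:
        JudgmentScorer(threshold=1.5, score_type=APIScorer.ANSWER_RELEVANCY)
    except ValueError as exc:
        print(f"rejected: {exc}")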
judgeval/scorers/custom_scorer.py
@@ -0,0 +1,134 @@
+"""
+Custom Scorer class
+
+Enables client to create custom scorers that do not fall under any of the ready-made Judgment scorers.
+To create a custom scorer, extend this class and implement the `score_example`, `a_score_example`, and `success_check` methods.
+"""
+
+from typing import Optional, Dict, Union, List
+from abc import abstractmethod
+
+from judgeval.common.logger import debug, info, warning, error
+from judgeval.judges import judgevalJudge
+from judgeval.judges.utils import create_judge
+
+
+class CustomScorer:
+    """
+    If you want to create a scorer that does not fall under any of the ready-made Judgment scorers,
+    you can create a custom scorer by extending this class. This is best used for special use cases
+    where none of Judgment's scorers are suitable.
+    """
+    score_type: str # name of your new scorer
+    threshold: float # The threshold to pass a test while using this scorer as a scorer
+    score: Optional[float] = None # The float score of the scorer run on the test case
+    score_breakdown: Dict = None
+    reason: Optional[str] = None # The reason for the score when evaluating the test case
+    success: Optional[bool] = None # Whether the test case passed or failed
+    evaluation_model: Optional[str] = None # The model used to evaluate the test case
+    strict_mode: bool = False # Whether to run the scorer in strict mode
+    async_mode: bool = True # Whether to run the scorer in async mode
+    verbose_mode: bool = True # Whether to run the scorer in verbose mode
+    include_reason: bool = False # Whether to include the reason in the output
+    error: Optional[str] = None # The error message if the scorer failed
+    evaluation_cost: Optional[float] = None # The cost of running the scorer
+    verbose_logs: Optional[str] = None # The verbose logs of the scorer
+    additional_metadata: Optional[Dict] = None # Additional metadata for the scorer
+
+    def __init__(
+        self,
+        score_type: str,
+        threshold: float,
+        score: Optional[float] = None,
+        score_breakdown: Optional[Dict] = None,
+        reason: Optional[str] = None,
+        success: Optional[bool] = None,
+        evaluation_model: Optional[str] = None,
+        strict_mode: bool = False,
+        async_mode: bool = True,
+        verbose_mode: bool = True,
+        include_reason: bool = False,
+        error: Optional[str] = None,
+        evaluation_cost: Optional[float] = None,
+        verbose_logs: Optional[str] = None,
+        additional_metadata: Optional[Dict] = None
+    ):
+        debug(f"Initializing CustomScorer with score_type={score_type}, threshold={threshold}")
+        if strict_mode:
+            warning("Strict mode enabled - scoring will be more rigorous")
+        info(f"CustomScorer initialized with evaluation_model: {evaluation_model}")
+        self.score_type = score_type
+        self.threshold = threshold
+        self.score = score
+        self.score_breakdown = score_breakdown
+        self.reason = reason
+        self.success = success
+        self.evaluation_model = evaluation_model
+        self.strict_mode = strict_mode
+        self.async_mode = async_mode
+        self.verbose_mode = verbose_mode
+        self.include_reason = include_reason
+        self.error = error
+        self.evaluation_cost = evaluation_cost
+        self.verbose_logs = verbose_logs
+        self.additional_metadata = additional_metadata
+
+    def _add_model(self, model: Optional[Union[str, List[str], judgevalJudge]] = None):
+        """
+        Adds the evaluation model to the CustomScorer instance
+
+        This method is used at eval time
+        """
+        self.model, self.using_native_model = create_judge(model)
+        self.evaluation_model = self.model.get_model_name()
+
+    @abstractmethod
+    def score_example(self, example, *args, **kwargs) -> float:
+        """
+        Measures the score on a single example
+        """
+        warning("Attempting to call unimplemented score_example method")
+        error("score_example method not implemented")
+        raise NotImplementedError("You must implement the `score` method in your custom scorer")
+
+    @abstractmethod
+    async def a_score_example(self, example, *args, **kwargs) -> float:
+        """
+        Asynchronously measures the score on a single example
+        """
+        warning("Attempting to call unimplemented a_score_example method")
+        error("a_score_example method not implemented")
+        raise NotImplementedError("You must implement the `a_score` method in your custom scorer")
+
+    @abstractmethod
+    def _success_check(self) -> bool:
+        """
+        For unit testing, determines whether the test case passes or fails
+        """
+        warning("Attempting to call unimplemented success_check method")
+        error("success_check method not implemented")
+        raise NotImplementedError("You must implement the `passes` method in your custom scorer")
+
+    def __str__(self):
+        debug("Converting CustomScorer instance to string representation")
+        if self.error:
+            warning(f"CustomScorer contains error: {self.error}")
+        info(f"CustomScorer status - success: {self.success}, score: {self.score}")
+        attributes = {
+            "score_type": self.score_type,
+            "threshold": self.threshold,
+            "score": self.score,
+            "score_breakdown": self.score_breakdown,
+            "reason": self.reason,
+            "success": self.success,
+            "evaluation_model": self.evaluation_model,
+            "strict_mode": self.strict_mode,
+            "async_mode": self.async_mode,
+            "verbose_mode": self.verbose_mode,
+            "include_reason": self.include_reason,
+            "error": self.error,
+            "evaluation_cost": self.evaluation_cost,
+            "verbose_logs": self.verbose_logs,
+            "additional_metadata": self.additional_metadata,
+        }
+        return f"CustomScorer({attributes})"
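Note: as the module docstring above says, CustomScorer is meant to be extended with score_example, a_score_example, and the success check implemented. A toy subclass is sketched below for illustration only; the length-based scoring logic and the assumption that examples expose an actual_output field are hypothetical, not part of the package.

    from judgeval.scorers.custom_scorer import CustomScorer


    class LengthScorer(CustomScorer):
        """Toy scorer: passes when the actual output is at least threshold * 100 characters long."""

        def __init__(self, threshold: float = 0.5):
            super().__init__(score_type="Length", threshold=threshold)

        def score_example(self, example, *args, **kwargs) -> float:
            # Assumes the example exposes `actual_output`, as run_evaluation.py reads above.
            self.score = min(len(example.actual_output) / 100, 1.0)
            self.success = self.score >= self.threshold
            return self.score

        async def a_score_example(self, example, *args, **kwargs) -> float:
            # Nothing to await in this toy scorer; reuse the synchronous path.
            return self.score_example(example, *args, **kwargs)

        def _success_check(self) -> bool:
            return bool(self.success)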
judgeval/scorers/judgeval_scorers/__init__.py
@@ -0,0 +1,21 @@
+from judgeval.scorers.judgeval_scorers.tool_correctness import ToolCorrectnessScorer
+from judgeval.scorers.judgeval_scorers.json_correctness import JSONCorrectnessScorer
+from judgeval.scorers.judgeval_scorers.summarization import SummarizationScorer
+from judgeval.scorers.judgeval_scorers.hallucination import HallucinationScorer
+from judgeval.scorers.judgeval_scorers.faithfulness import FaithfulnessScorer
+from judgeval.scorers.judgeval_scorers.contextual_relevancy import ContextualRelevancyScorer
+from judgeval.scorers.judgeval_scorers.contextual_precision import ContextualPrecisionScorer
+from judgeval.scorers.judgeval_scorers.contextual_recall import ContextualRecallScorer
+from judgeval.scorers.judgeval_scorers.answer_relevancy import AnswerRelevancyScorer
+
+__all__ = [
+    "ToolCorrectnessScorer",
+    "JSONCorrectnessScorer",
+    "SummarizationScorer",
+    "HallucinationScorer",
+    "FaithfulnessScorer",
+    "ContextualRelevancyScorer",
+    "ContextualPrecisionScorer",
+    "ContextualRecallScorer",
+    "AnswerRelevancyScorer",
+]
judgeval/scorers/judgeval_scorers/answer_relevancy.py
@@ -0,0 +1,19 @@
+"""
+`judgeval` answer relevancy scorer
+
+TODO add link to docs page for this scorer
+
+"""
+
+# Internal imports
+from judgeval.scorers.base_scorer import JudgmentScorer
+from judgeval.constants import APIScorer
+
+
+class AnswerRelevancyScorer(JudgmentScorer):
+    def __init__(self, threshold: float):
+        super().__init__(threshold=threshold, score_type=APIScorer.ANSWER_RELEVANCY)
+
+    @property
+    def __name__(self):
+        return "Answer Relevancy"
judgeval/scorers/judgeval_scorers/contextual_precision.py
@@ -0,0 +1,19 @@
+"""
+`judgeval` contextual precision scorer
+
+TODO add link to docs page for this scorer
+
+"""
+
+# Internal imports
+from judgeval.scorers.base_scorer import JudgmentScorer
+from judgeval.constants import APIScorer
+
+
+class ContextualPrecisionScorer(JudgmentScorer):
+    def __init__(self, threshold: float):
+        super().__init__(threshold=threshold, score_type=APIScorer.CONTEXTUAL_PRECISION)
+
+    @property
+    def __name__(self):
+        return "Contextual Precision"
judgeval/scorers/judgeval_scorers/contextual_recall.py
@@ -0,0 +1,19 @@
+"""
+`judgeval` contextual recall scorer
+
+TODO add link to docs page for this scorer
+
+"""
+
+# Internal imports
+from judgeval.scorers.base_scorer import JudgmentScorer
+from judgeval.constants import APIScorer
+
+
+class ContextualRecallScorer(JudgmentScorer):
+    def __init__(self, threshold: float):
+        super().__init__(threshold=threshold, score_type=APIScorer.CONTEXTUAL_RECALL)
+
+    @property
+    def __name__(self):
+        return "Contextual Recall"
judgeval/scorers/judgeval_scorers/contextual_relevancy.py
@@ -0,0 +1,22 @@
+"""
+`judgeval` contextual relevancy scorer
+
+TODO add link to docs page for this scorer
+
+"""
+
+# Internal imports
+from judgeval.scorers.base_scorer import JudgmentScorer
+from judgeval.constants import APIScorer
+
+
+class ContextualRelevancyScorer(JudgmentScorer):
+    """
+    Scorer that checks if the output of a model is relevant to the retrieval context
+    """
+    def __init__(self, threshold: float):
+        super().__init__(threshold=threshold, score_type=APIScorer.CONTEXTUAL_RELEVANCY)
+
+    @property
+    def __name__(self):
+        return "Contextual Relevancy"
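Note: each of the ready-made scorers above is a thin JudgmentScorer subclass that pins score_type to one APIScorer member and exposes a human-readable __name__ property. A brief illustrative snippet (how score_type prints depends on the APIScorer enum values, which this diff does not show):

    from judgeval.scorers import AnswerRelevancyScorer, ContextualRecallScorer

    # Because these are JudgmentScorer subclasses, run_eval's isinstance check routes them
    # to the Judgment API branch rather than to local scoring.
    for scorer in (AnswerRelevancyScorer(threshold=0.5), ContextualRecallScorer(threshold=0.7)):
        print(scorer.__name__, scorer.score_type, scorer.threshold)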