judgeval 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. judgeval/__init__.py +83 -0
  2. judgeval/clients.py +19 -0
  3. judgeval/common/__init__.py +8 -0
  4. judgeval/common/exceptions.py +28 -0
  5. judgeval/common/logger.py +189 -0
  6. judgeval/common/tracer.py +587 -0
  7. judgeval/common/utils.py +763 -0
  8. judgeval/constants.py +55 -0
  9. judgeval/data/__init__.py +14 -0
  10. judgeval/data/api_example.py +111 -0
  11. judgeval/data/datasets/__init__.py +4 -0
  12. judgeval/data/datasets/dataset.py +407 -0
  13. judgeval/data/datasets/ground_truth.py +54 -0
  14. judgeval/data/datasets/utils.py +74 -0
  15. judgeval/data/example.py +76 -0
  16. judgeval/data/result.py +83 -0
  17. judgeval/data/scorer_data.py +86 -0
  18. judgeval/evaluation_run.py +130 -0
  19. judgeval/judges/__init__.py +7 -0
  20. judgeval/judges/base_judge.py +44 -0
  21. judgeval/judges/litellm_judge.py +49 -0
  22. judgeval/judges/mixture_of_judges.py +248 -0
  23. judgeval/judges/together_judge.py +55 -0
  24. judgeval/judges/utils.py +45 -0
  25. judgeval/judgment_client.py +244 -0
  26. judgeval/run_evaluation.py +355 -0
  27. judgeval/scorers/__init__.py +30 -0
  28. judgeval/scorers/base_scorer.py +51 -0
  29. judgeval/scorers/custom_scorer.py +134 -0
  30. judgeval/scorers/judgeval_scorers/__init__.py +21 -0
  31. judgeval/scorers/judgeval_scorers/answer_relevancy.py +19 -0
  32. judgeval/scorers/judgeval_scorers/contextual_precision.py +19 -0
  33. judgeval/scorers/judgeval_scorers/contextual_recall.py +19 -0
  34. judgeval/scorers/judgeval_scorers/contextual_relevancy.py +22 -0
  35. judgeval/scorers/judgeval_scorers/faithfulness.py +19 -0
  36. judgeval/scorers/judgeval_scorers/hallucination.py +19 -0
  37. judgeval/scorers/judgeval_scorers/json_correctness.py +32 -0
  38. judgeval/scorers/judgeval_scorers/summarization.py +20 -0
  39. judgeval/scorers/judgeval_scorers/tool_correctness.py +19 -0
  40. judgeval/scorers/prompt_scorer.py +439 -0
  41. judgeval/scorers/score.py +427 -0
  42. judgeval/scorers/utils.py +175 -0
  43. judgeval-0.0.1.dist-info/METADATA +40 -0
  44. judgeval-0.0.1.dist-info/RECORD +46 -0
  45. judgeval-0.0.1.dist-info/WHEEL +4 -0
  46. judgeval-0.0.1.dist-info/licenses/LICENSE.md +202 -0
judgeval/run_evaluation.py
@@ -0,0 +1,355 @@
+ import asyncio
+ import requests
+ from typing import List, Dict
+ from datetime import datetime
+ from rich import print as rprint
+
+ from judgeval.data import (
+     Example,
+     ScorerData,
+     ScoringResult
+ )
+ from judgeval.scorers import (
+     CustomScorer,
+     JudgmentScorer,
+     ClassifierScorer
+ )
+ from judgeval.scorers.score import a_execute_scoring
+
+ from judgeval.constants import (
+     ROOT_API,
+     JUDGMENT_EVAL_API_URL,
+     JUDGMENT_EVAL_LOG_API_URL,
+ )
+ from judgeval.common.exceptions import JudgmentAPIError
+ from judgeval.evaluation_run import EvaluationRun
+ from judgeval.common.logger import (
+     enable_logging,
+     debug,
+     info,
+     error,
+     example_logging_context
+ )
+
+
+ def execute_api_eval(evaluation_run: EvaluationRun) -> List[Dict]:
+     """
+     Executes an evaluation of a list of `Example`s using one or more `JudgmentScorer`s via the Judgment API.
+
+     Args:
+         evaluation_run (EvaluationRun): The evaluation run object containing the examples, scorers, and metadata
+
+     Returns:
+         List[Dict]: The results of the evaluation. Each result is a dictionary containing the fields of a
+                     `ScoringResult` object.
+     """
+     try:
+         # submit API request to execute evals
+         payload = evaluation_run.model_dump(warnings=False)
+         response = requests.post(JUDGMENT_EVAL_API_URL, json=payload)
+         response_data = response.json()
+     except Exception as e:
+         # `response` may not exist (or may not contain JSON) if the request itself failed,
+         # so surface the exception message rather than re-reading the response body
+         error(f"Error: {e}")
+         raise JudgmentAPIError(f"An error occurred while executing the Judgment API request: {str(e)}")
+     # Check if the response status code is not 2XX
+     # Add check for the duplicate eval run name
+     if not response.ok:
+         error_message = response_data.get('detail', 'An unknown error occurred.')
+         error(f"Error: {error_message=}")
+         raise JudgmentAPIError(error_message)
+     return response_data
+
+
+ def merge_results(api_results: List[ScoringResult], local_results: List[ScoringResult]) -> List[ScoringResult]:
+     """
+     When executing scorers that come from both the Judgment API and custom scorers, we're left with
+     results for each type of scorer. This function merges the results from the API and local evaluations,
+     grouped by example. In particular, we merge the `scorers_data` field of each `ScoringResult` object.
+
+     Args:
+         api_results (List[ScoringResult]): The `ScoringResult`s from the API evaluation
+         local_results (List[ScoringResult]): The `ScoringResult`s from the local evaluation
+
+     Returns:
+         List[ScoringResult]: The merged `ScoringResult`s (updated `scorers_data` field)
+     """
+     # No merge required
+     if not local_results and api_results:
+         return api_results
+     if not api_results and local_results:
+         return local_results
+
+     if len(api_results) != len(local_results):
+         # Results should be of the same length because each ScoringResult is a 1-1 mapping to an Example
+         raise ValueError(f"The number of API and local results do not match: {len(api_results)} vs {len(local_results)}")
+
+     # Each ScoringResult in api and local has all the same fields besides `scorers_data`
+     for api_result, local_result in zip(api_results, local_results):
+         if api_result.input != local_result.input:
+             raise ValueError("The API and local results are not aligned.")
+         if api_result.actual_output != local_result.actual_output:
+             raise ValueError("The API and local results are not aligned.")
+         if api_result.expected_output != local_result.expected_output:
+             raise ValueError("The API and local results are not aligned.")
+         if api_result.context != local_result.context:
+             raise ValueError("The API and local results are not aligned.")
+         if api_result.retrieval_context != local_result.retrieval_context:
+             raise ValueError("The API and local results are not aligned.")
+
+         # Merge ScorerData from the API and local scorers together
+         api_scorer_data = api_result.scorers_data
+         local_scorer_data = local_result.scorers_data
+         if api_scorer_data is None and local_scorer_data is not None:
+             api_result.scorers_data = local_scorer_data
+
+         if api_scorer_data is not None and local_scorer_data is not None:
+             api_result.scorers_data = api_scorer_data + local_scorer_data
+
+     return api_results
+
+
+ def check_missing_scorer_data(results: List[ScoringResult]) -> List[ScoringResult]:
+     """
+     Checks if any `ScoringResult` objects are missing `scorers_data`.
+
+     If any are missing, logs an error and returns the results.
+     """
+     for i, result in enumerate(results):
+         if not result.scorers_data:
+             error(
+                 f"Scorer data is missing for example {i}. "
+                 "This is usually caused when the example does not contain "
+                 "the fields required by the scorer. "
+                 "Check that your example contains the fields required by the scorers. "
+                 "TODO add docs link here for reference."
+             )
+     return results
+
+
+ def check_eval_run_name_exists(eval_name: str, project_name: str, judgment_api_key: str) -> None:
+     """
+     Checks if an evaluation run name already exists for a given project.
+
+     Args:
+         eval_name (str): Name of the evaluation run
+         project_name (str): Name of the project
+         judgment_api_key (str): API key for authentication
+
+     Raises:
+         ValueError: If the evaluation run name already exists
+         JudgmentAPIError: If there's an API error during the check
+     """
+     try:
+         response = requests.post(
+             f"{ROOT_API}/eval-run-name-exists/",
+             json={
+                 "eval_name": eval_name,
+                 "project_name": project_name,
+                 "judgment_api_key": judgment_api_key,
+             }
+         )
+
+         if response.status_code == 409:
+             error(f"Evaluation run name '{eval_name}' already exists for this project")
+             raise ValueError(f"Evaluation run name '{eval_name}' already exists for this project")
+
+         if not response.ok:
+             response_data = response.json()
+             error_message = response_data.get('detail', 'An unknown error occurred.')
+             error(f"Error checking eval run name: {error_message}")
+             raise JudgmentAPIError(error_message)
+
+     except requests.exceptions.RequestException as e:
+         error(f"Failed to check if eval run name exists: {str(e)}")
+         raise JudgmentAPIError(f"Failed to check if eval run name exists: {str(e)}")
+
+
+ def log_evaluation_results(merged_results: List[ScoringResult], evaluation_run: EvaluationRun) -> None:
+     """
+     Logs evaluation results to the Judgment API database.
+
+     Args:
+         merged_results (List[ScoringResult]): The results to log
+         evaluation_run (EvaluationRun): The evaluation run containing project info and API key
+
+     Raises:
+         JudgmentAPIError: If there's an API error during logging
+         ValueError: If there's a validation error with the results
+     """
+     try:
+         res = requests.post(
+             JUDGMENT_EVAL_LOG_API_URL,
+             json={
+                 "results": [result.to_dict() for result in merged_results],
+                 "judgment_api_key": evaluation_run.judgment_api_key,
+                 "project_name": evaluation_run.project_name,
+                 "eval_name": evaluation_run.eval_name,
+             }
+         )
+
+         if not res.ok:
+             response_data = res.json()
+             error_message = response_data.get('detail', 'An unknown error occurred.')
+             error(f"Error {res.status_code}: {error_message}")
+             raise JudgmentAPIError(error_message)
+
+         if "ui_results_url" in res.json():
+             rprint(f"\n🔍 You can view your evaluation results here: [rgb(106,0,255)]{res.json()['ui_results_url']}[/]\n")
+
+     except requests.exceptions.RequestException as e:
+         error(f"Request failed while saving evaluation results to DB: {str(e)}")
+         raise JudgmentAPIError(f"Request failed while saving evaluation results to DB: {str(e)}")
+     except Exception as e:
+         error(f"Failed to save evaluation results to DB: {str(e)}")
+         raise ValueError(f"Failed to save evaluation results to DB: {str(e)}")
+
+
+ def run_eval(evaluation_run: EvaluationRun, override: bool = False) -> List[ScoringResult]:
+     """
+     Executes an evaluation of `Example`s using one or more `Scorer`s.
+
+     Args:
+         evaluation_run (EvaluationRun): Stores the examples and evaluation settings together for running. Its fields are:
+             project_name (str): The name of the project the evaluation results belong to
+             eval_name (str): The name of the evaluation run
+             examples (List[Example]): The examples to evaluate
+             scorers (List[Union[JudgmentScorer, CustomScorer]]): A list of scorers to use for evaluation
+             model (str): The model used as a judge when using LLM as a Judge
+             aggregator (Optional[str]): The aggregator to use for evaluation if using Mixture of Judges
+             metadata (Optional[Dict[str, Any]]): Additional metadata to include for this evaluation run, e.g. comments, dataset name, purpose, etc.
+             judgment_api_key (Optional[str]): The API key for running evaluations on the Judgment API
+             log_results (bool): Whether to log the results to the Judgment API
+         override (bool): Whether to skip the duplicate eval run name check when logging results
+
+     Returns:
+         List[ScoringResult]: The results of the evaluation, one `ScoringResult` per `Example`.
+     """
+     # Call endpoint to check whether the eval run name already exists
+     # (only if we DON'T want to override and DO want to log results)
+     if not override and evaluation_run.log_results:
+         check_eval_run_name_exists(
+             evaluation_run.eval_name,
+             evaluation_run.project_name,
+             evaluation_run.judgment_api_key
+         )
+
+     # Set example IDs if not already set
+     debug("Initializing examples with IDs and timestamps")
+     for idx, example in enumerate(evaluation_run.examples):
+         if example.example_id is None:
+             example.example_id = idx
+             debug(f"Set example ID {idx} for input: {example.input[:50]}...")
+         example.timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+         with example_logging_context(example.timestamp, example.example_id):
+             debug(f"Initialized example {example.example_id}")
+             debug(f"Input: {example.input}")
+             debug(f"Actual output: {example.actual_output}")
+             if example.expected_output:
+                 debug(f"Expected output: {example.expected_output}")
+             if example.context:
+                 debug(f"Context: {example.context}")
+             if example.retrieval_context:
+                 debug(f"Retrieval context: {example.retrieval_context}")
+
+     debug(f"Starting evaluation run with {len(evaluation_run.examples)} examples")
+
+     # Group JudgmentScorers and CustomScorers, then evaluate them in parallel
+     debug("Grouping scorers by type")
+     judgment_scorers: List[JudgmentScorer] = []
+     custom_scorers: List[CustomScorer] = []
+     for scorer in evaluation_run.scorers:
+         if isinstance(scorer, (JudgmentScorer, ClassifierScorer)):
+             judgment_scorers.append(scorer)
+             debug(f"Added judgment scorer: {type(scorer).__name__}")
+         else:
+             custom_scorers.append(scorer)
+             debug(f"Added custom scorer: {type(scorer).__name__}")
+
+     debug(f"Found {len(judgment_scorers)} judgment scorers and {len(custom_scorers)} custom scorers")
+
+     api_results: List[ScoringResult] = []
+     local_results: List[ScoringResult] = []
+
+     # Execute evaluation using the Judgment API
+     if judgment_scorers:
+         info("Starting API evaluation")
+         debug(f"Creating API evaluation run with {len(judgment_scorers)} scorers")
+         try:  # execute an EvaluationRun with just JudgmentScorers
+             api_evaluation_run: EvaluationRun = EvaluationRun(
+                 eval_name=evaluation_run.eval_name,
+                 project_name=evaluation_run.project_name,
+                 examples=evaluation_run.examples,
+                 scorers=judgment_scorers,
+                 model=evaluation_run.model,
+                 aggregator=evaluation_run.aggregator,
+                 metadata=evaluation_run.metadata,
+                 judgment_api_key=evaluation_run.judgment_api_key,
+                 log_results=evaluation_run.log_results
+             )
+             debug("Sending request to Judgment API")
+             response_data: Dict = execute_api_eval(api_evaluation_run)  # contains the ScoringResult dicts under "results"
+             info(f"Received {len(response_data['results'])} results from API")
+         except JudgmentAPIError as e:
+             error(f"An error occurred while executing the Judgment API request: {str(e)}")
+             raise JudgmentAPIError(f"An error occurred while executing the Judgment API request: {str(e)}")
+         except ValueError as e:
+             raise ValueError(f"Please check your EvaluationRun object, one or more fields are invalid: {str(e)}")
+
+         # Convert the response data to `ScoringResult` objects
+         debug("Processing API results")
+         for idx, result in enumerate(response_data["results"]):
+             with example_logging_context(evaluation_run.examples[idx].timestamp, evaluation_run.examples[idx].example_id):
+                 for scorer in judgment_scorers:
+                     debug(f"Processing API result for example {idx} and scorer {scorer.score_type}")
+             # filter for key-value pairs that are used to initialize ScoringResult
+             # there may be some stuff in here that doesn't belong in ScoringResult
+             # TODO: come back and refactor this to have ScoringResult take in **kwargs
+             filtered_result = {k: v for k, v in result.items() if k in ScoringResult.__annotations__}
+
+             # Convert scorers_data dicts to ScorerData objects
+             if "scorers_data" in filtered_result and filtered_result["scorers_data"]:
+                 filtered_result["scorers_data"] = [
+                     ScorerData(**scorer_dict)
+                     for scorer_dict in filtered_result["scorers_data"]
+                 ]
+
+             api_results.append(ScoringResult(**filtered_result))
+
+     # Run local evals
+     if custom_scorers:  # List[CustomScorer]
+         info("Starting local evaluation")
+         for example in evaluation_run.examples:
+             with example_logging_context(example.timestamp, example.example_id):
+                 debug(f"Processing example {example.example_id}: {example.input}")
+
+         results: List[ScoringResult] = asyncio.run(
+             a_execute_scoring(
+                 evaluation_run.examples,
+                 custom_scorers,
+                 model=evaluation_run.model,
+                 ignore_errors=True,
+                 skip_on_missing_params=True,
+                 show_indicator=True,
+                 _use_bar_indicator=True,
+                 throttle_value=0,
+                 max_concurrent=100,
+             )
+         )
+         local_results = results
+         info(f"Local evaluation complete with {len(local_results)} results")
+
+     # Aggregate the ScorerData from the API and local evaluations
+     debug("Merging API and local results")
+     merged_results: List[ScoringResult] = merge_results(api_results, local_results)
+     merged_results = check_missing_scorer_data(merged_results)
+
+     info(f"Successfully merged {len(merged_results)} results")
+
+     if evaluation_run.log_results:
+         log_evaluation_results(merged_results, evaluation_run)
+
+     for i, result in enumerate(merged_results):
+         if not result.scorers_data:  # none of the scorers could be executed on this example
+             info(f"None of the scorers could be executed on example {i}. This is usually because the Example is missing the fields needed by the scorers. Try checking that the Example has the necessary fields for your scorers.")
+     return merged_results
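
For orientation, here is a minimal, hypothetical sketch of how run_eval might be driven end to end. The Example and EvaluationRun field names are inferred from the code above; the exact constructor signatures, the judge model string, and the placeholder API key are assumptions rather than part of this diff.

# Hypothetical usage sketch; constructor signatures and the model string are assumed.
from judgeval.data import Example
from judgeval.evaluation_run import EvaluationRun
from judgeval.scorers import AnswerRelevancyScorer
from judgeval.run_evaluation import run_eval

example = Example(
    input="What is the capital of France?",
    actual_output="Paris is the capital of France.",
    retrieval_context=["Paris is the capital and largest city of France."],
)

eval_run = EvaluationRun(
    eval_name="demo-run",
    project_name="demo-project",
    examples=[example],
    scorers=[AnswerRelevancyScorer(threshold=0.5)],  # a JudgmentScorer, routed to the Judgment API
    model="gpt-4o",                                  # judge model name (assumed)
    judgment_api_key="<YOUR_JUDGMENT_API_KEY>",
    log_results=False,
)

results = run_eval(eval_run)  # List[ScoringResult]
for r in results:
    print(r.scorers_data)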
judgeval/scorers/__init__.py
@@ -0,0 +1,30 @@
+ from judgeval.scorers.base_scorer import JudgmentScorer
+ from judgeval.scorers.custom_scorer import CustomScorer
+ from judgeval.scorers.prompt_scorer import PromptScorer, ClassifierScorer
+ from judgeval.scorers.judgeval_scorers import (
+     ToolCorrectnessScorer,
+     JSONCorrectnessScorer,
+     SummarizationScorer,
+     HallucinationScorer,
+     FaithfulnessScorer,
+     ContextualRelevancyScorer,
+     ContextualPrecisionScorer,
+     ContextualRecallScorer,
+     AnswerRelevancyScorer,
+ )
+
+ __all__ = [
+     "JudgmentScorer",
+     "CustomScorer",
+     "PromptScorer",
+     "ClassifierScorer",
+     "ToolCorrectnessScorer",
+     "JSONCorrectnessScorer",
+     "SummarizationScorer",
+     "HallucinationScorer",
+     "FaithfulnessScorer",
+     "ContextualRelevancyScorer",
+     "ContextualPrecisionScorer",
+     "ContextualRecallScorer",
+     "AnswerRelevancyScorer",
+ ]
judgeval/scorers/base_scorer.py
@@ -0,0 +1,51 @@
+ """
+ Judgment Scorer class.
+
+ Scores `Example`s using ready-made Judgment evaluators.
+ """
+
+ from pydantic import BaseModel, field_validator
+ from judgeval.common.logger import debug, info, warning, error
+
+ from judgeval.constants import APIScorer
+
+
+ class JudgmentScorer(BaseModel):
+     """
+     Ready-made, "out-of-the-box" scorer that uses Judgment evaluators to score `Example`s.
+
+     Args:
+         score_type (APIScorer): The Judgment metric to use for scoring `Example`s
+         threshold (float): A value between 0 and 1 that determines the scoring threshold
+     """
+     threshold: float
+     score_type: APIScorer
+
+     @field_validator('threshold')
+     def validate_threshold(cls, v):
+         """
+         Validates that the threshold is between 0 and 1 inclusive.
+         """
+         if not 0 <= v <= 1:
+             error(f"Threshold must be between 0 and 1, got: {v}")
+             raise ValueError(f"Threshold must be between 0 and 1, got: {v}")
+         return v
+
+     @field_validator('score_type')
+     def convert_to_enum_value(cls, v):
+         """
+         Validates that `score_type` is a valid `APIScorer` enum value.
+         Converts string values to `APIScorer` enum values.
+         """
+         debug(f"Attempting to convert score_type value: {v}")
+         if isinstance(v, APIScorer):
+             info(f"Using existing APIScorer: {v.value}")
+             return v.value
+         elif isinstance(v, str):
+             debug(f"Converting string value to APIScorer enum: {v}")
+             return APIScorer[v.upper()].value
+         error(f"Invalid score_type value: {v}")
+         raise ValueError(f"Invalid value for score_type: {v}")
+
+     def __str__(self):
+         return f"JudgmentScorer(score_type={self.score_type}, threshold={self.threshold})"
judgeval/scorers/custom_scorer.py
@@ -0,0 +1,134 @@
+ """
+ Custom Scorer class
+
+ Enables clients to create custom scorers that do not fall under any of the ready-made Judgment scorers.
+ To create a custom scorer, extend this class and implement the `score_example`, `a_score_example`, and `_success_check` methods.
+ """
+
+ from typing import Optional, Dict, Union, List
+ from abc import abstractmethod
+
+ from judgeval.common.logger import debug, info, warning, error
+ from judgeval.judges import judgevalJudge
+ from judgeval.judges.utils import create_judge
+
+
+ class CustomScorer:
+     """
+     If you want to create a scorer that does not fall under any of the ready-made Judgment scorers,
+     you can create a custom scorer by extending this class. This is best used for special use cases
+     where none of Judgment's scorers are suitable.
+     """
+     score_type: str  # Name of your new scorer
+     threshold: float  # The threshold to pass a test when using this scorer
+     score: Optional[float] = None  # The float score of the scorer run on the test case
+     score_breakdown: Optional[Dict] = None
+     reason: Optional[str] = None  # The reason for the score when evaluating the test case
+     success: Optional[bool] = None  # Whether the test case passed or failed
+     evaluation_model: Optional[str] = None  # The model used to evaluate the test case
+     strict_mode: bool = False  # Whether to run the scorer in strict mode
+     async_mode: bool = True  # Whether to run the scorer in async mode
+     verbose_mode: bool = True  # Whether to run the scorer in verbose mode
+     include_reason: bool = False  # Whether to include the reason in the output
+     error: Optional[str] = None  # The error message if the scorer failed
+     evaluation_cost: Optional[float] = None  # The cost of running the scorer
+     verbose_logs: Optional[str] = None  # The verbose logs of the scorer
+     additional_metadata: Optional[Dict] = None  # Additional metadata for the scorer
+
+     def __init__(
+         self,
+         score_type: str,
+         threshold: float,
+         score: Optional[float] = None,
+         score_breakdown: Optional[Dict] = None,
+         reason: Optional[str] = None,
+         success: Optional[bool] = None,
+         evaluation_model: Optional[str] = None,
+         strict_mode: bool = False,
+         async_mode: bool = True,
+         verbose_mode: bool = True,
+         include_reason: bool = False,
+         error: Optional[str] = None,
+         evaluation_cost: Optional[float] = None,
+         verbose_logs: Optional[str] = None,
+         additional_metadata: Optional[Dict] = None
+     ):
+         debug(f"Initializing CustomScorer with score_type={score_type}, threshold={threshold}")
+         if strict_mode:
+             warning("Strict mode enabled - scoring will be more rigorous")
+         info(f"CustomScorer initialized with evaluation_model: {evaluation_model}")
+         self.score_type = score_type
+         self.threshold = threshold
+         self.score = score
+         self.score_breakdown = score_breakdown
+         self.reason = reason
+         self.success = success
+         self.evaluation_model = evaluation_model
+         self.strict_mode = strict_mode
+         self.async_mode = async_mode
+         self.verbose_mode = verbose_mode
+         self.include_reason = include_reason
+         self.error = error
+         self.evaluation_cost = evaluation_cost
+         self.verbose_logs = verbose_logs
+         self.additional_metadata = additional_metadata
+
+     def _add_model(self, model: Optional[Union[str, List[str], judgevalJudge]] = None):
+         """
+         Adds the evaluation model to the CustomScorer instance.
+
+         This method is used at eval time.
+         """
+         self.model, self.using_native_model = create_judge(model)
+         self.evaluation_model = self.model.get_model_name()
+
+     @abstractmethod
+     def score_example(self, example, *args, **kwargs) -> float:
+         """
+         Measures the score on a single example
+         """
+         warning("Attempting to call unimplemented score_example method")
+         error("score_example method not implemented")
+         raise NotImplementedError("You must implement the `score_example` method in your custom scorer")
+
+     @abstractmethod
+     async def a_score_example(self, example, *args, **kwargs) -> float:
+         """
+         Asynchronously measures the score on a single example
+         """
+         warning("Attempting to call unimplemented a_score_example method")
+         error("a_score_example method not implemented")
+         raise NotImplementedError("You must implement the `a_score_example` method in your custom scorer")
+
+     @abstractmethod
+     def _success_check(self) -> bool:
+         """
+         For unit testing, determines whether the test case passes or fails
+         """
+         warning("Attempting to call unimplemented _success_check method")
+         error("_success_check method not implemented")
+         raise NotImplementedError("You must implement the `_success_check` method in your custom scorer")
+
+     def __str__(self):
+         debug("Converting CustomScorer instance to string representation")
+         if self.error:
+             warning(f"CustomScorer contains error: {self.error}")
+         info(f"CustomScorer status - success: {self.success}, score: {self.score}")
+         attributes = {
+             "score_type": self.score_type,
+             "threshold": self.threshold,
+             "score": self.score,
+             "score_breakdown": self.score_breakdown,
+             "reason": self.reason,
+             "success": self.success,
+             "evaluation_model": self.evaluation_model,
+             "strict_mode": self.strict_mode,
+             "async_mode": self.async_mode,
+             "verbose_mode": self.verbose_mode,
+             "include_reason": self.include_reason,
+             "error": self.error,
+             "evaluation_cost": self.evaluation_cost,
+             "verbose_logs": self.verbose_logs,
+             "additional_metadata": self.additional_metadata,
+         }
+         return f"CustomScorer({attributes})"
judgeval/scorers/judgeval_scorers/__init__.py
@@ -0,0 +1,21 @@
+ from judgeval.scorers.judgeval_scorers.tool_correctness import ToolCorrectnessScorer
+ from judgeval.scorers.judgeval_scorers.json_correctness import JSONCorrectnessScorer
+ from judgeval.scorers.judgeval_scorers.summarization import SummarizationScorer
+ from judgeval.scorers.judgeval_scorers.hallucination import HallucinationScorer
+ from judgeval.scorers.judgeval_scorers.faithfulness import FaithfulnessScorer
+ from judgeval.scorers.judgeval_scorers.contextual_relevancy import ContextualRelevancyScorer
+ from judgeval.scorers.judgeval_scorers.contextual_precision import ContextualPrecisionScorer
+ from judgeval.scorers.judgeval_scorers.contextual_recall import ContextualRecallScorer
+ from judgeval.scorers.judgeval_scorers.answer_relevancy import AnswerRelevancyScorer
+
+ __all__ = [
+     "ToolCorrectnessScorer",
+     "JSONCorrectnessScorer",
+     "SummarizationScorer",
+     "HallucinationScorer",
+     "FaithfulnessScorer",
+     "ContextualRelevancyScorer",
+     "ContextualPrecisionScorer",
+     "ContextualRecallScorer",
+     "AnswerRelevancyScorer",
+ ]
judgeval/scorers/judgeval_scorers/answer_relevancy.py
@@ -0,0 +1,19 @@
+ """
+ `judgeval` answer relevancy scorer
+
+ TODO add link to docs page for this scorer
+ """
+
+ # Internal imports
+ from judgeval.scorers.base_scorer import JudgmentScorer
+ from judgeval.constants import APIScorer
+
+
+ class AnswerRelevancyScorer(JudgmentScorer):
+     def __init__(self, threshold: float):
+         super().__init__(threshold=threshold, score_type=APIScorer.ANSWER_RELEVANCY)
+
+     @property
+     def __name__(self):
+         return "Answer Relevancy"
judgeval/scorers/judgeval_scorers/contextual_precision.py
@@ -0,0 +1,19 @@
+ """
+ `judgeval` contextual precision scorer
+
+ TODO add link to docs page for this scorer
+ """
+
+ # Internal imports
+ from judgeval.scorers.base_scorer import JudgmentScorer
+ from judgeval.constants import APIScorer
+
+
+ class ContextualPrecisionScorer(JudgmentScorer):
+     def __init__(self, threshold: float):
+         super().__init__(threshold=threshold, score_type=APIScorer.CONTEXTUAL_PRECISION)
+
+     @property
+     def __name__(self):
+         return "Contextual Precision"
judgeval/scorers/judgeval_scorers/contextual_recall.py
@@ -0,0 +1,19 @@
+ """
+ `judgeval` contextual recall scorer
+
+ TODO add link to docs page for this scorer
+ """
+
+ # Internal imports
+ from judgeval.scorers.base_scorer import JudgmentScorer
+ from judgeval.constants import APIScorer
+
+
+ class ContextualRecallScorer(JudgmentScorer):
+     def __init__(self, threshold: float):
+         super().__init__(threshold=threshold, score_type=APIScorer.CONTEXTUAL_RECALL)
+
+     @property
+     def __name__(self):
+         return "Contextual Recall"
judgeval/scorers/judgeval_scorers/contextual_relevancy.py
@@ -0,0 +1,22 @@
+ """
+ `judgeval` contextual relevancy scorer
+
+ TODO add link to docs page for this scorer
+ """
+
+ # Internal imports
+ from judgeval.scorers.base_scorer import JudgmentScorer
+ from judgeval.constants import APIScorer
+
+
+ class ContextualRelevancyScorer(JudgmentScorer):
+     """
+     Scorer that checks if the output of a model is relevant to the retrieval context
+     """
+     def __init__(self, threshold: float):
+         super().__init__(threshold=threshold, score_type=APIScorer.CONTEXTUAL_RELEVANCY)
+
+     @property
+     def __name__(self):
+         return "Contextual Relevancy"