judgeval 0.0.2__py3-none-any.whl → 0.0.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67)
  1. judgeval/__init__.py +0 -71
  2. judgeval/common/tracer.py +57 -31
  3. judgeval/constants.py +1 -0
  4. judgeval/data/__init__.py +2 -1
  5. judgeval/data/scorer_data.py +2 -2
  6. judgeval/evaluation_run.py +16 -15
  7. judgeval/judges/__init__.py +2 -2
  8. judgeval/judges/base_judge.py +1 -1
  9. judgeval/judges/litellm_judge.py +2 -2
  10. judgeval/judges/mixture_of_judges.py +2 -2
  11. judgeval/judges/together_judge.py +2 -2
  12. judgeval/judges/utils.py +4 -4
  13. judgeval/judgment_client.py +67 -15
  14. judgeval/run_evaluation.py +79 -14
  15. judgeval/scorers/__init__.py +8 -4
  16. judgeval/scorers/api_scorer.py +64 -0
  17. judgeval/scorers/base_scorer.py +3 -2
  18. judgeval/scorers/exceptions.py +11 -0
  19. judgeval/scorers/{custom_scorer.py → judgeval_scorer.py} +9 -5
  20. judgeval/scorers/judgeval_scorers/__init__.py +132 -9
  21. judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +23 -0
  22. judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +19 -0
  23. judgeval/scorers/judgeval_scorers/{answer_relevancy.py → api_scorers/answer_relevancy.py} +2 -2
  24. judgeval/scorers/judgeval_scorers/{contextual_precision.py → api_scorers/contextual_precision.py} +2 -2
  25. judgeval/scorers/judgeval_scorers/{contextual_recall.py → api_scorers/contextual_recall.py} +2 -2
  26. judgeval/scorers/judgeval_scorers/{contextual_relevancy.py → api_scorers/contextual_relevancy.py} +2 -2
  27. judgeval/scorers/judgeval_scorers/{faithfulness.py → api_scorers/faithfulness.py} +2 -2
  28. judgeval/scorers/judgeval_scorers/{hallucination.py → api_scorers/hallucination.py} +2 -2
  29. judgeval/scorers/judgeval_scorers/{json_correctness.py → api_scorers/json_correctness.py} +7 -7
  30. judgeval/scorers/judgeval_scorers/{summarization.py → api_scorers/summarization.py} +2 -2
  31. judgeval/scorers/judgeval_scorers/{tool_correctness.py → api_scorers/tool_correctness.py} +2 -2
  32. judgeval/scorers/judgeval_scorers/local_implementations/__init__.py +24 -0
  33. judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/__init__.py +4 -0
  34. judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/answer_correctness_scorer.py +272 -0
  35. judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/prompts.py +169 -0
  36. judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/__init__.py +4 -0
  37. judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/answer_relevancy_scorer.py +292 -0
  38. judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/prompts.py +174 -0
  39. judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/__init__.py +3 -0
  40. judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/contextual_precision_scorer.py +259 -0
  41. judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/prompts.py +106 -0
  42. judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/__init__.py +3 -0
  43. judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/contextual_recall_scorer.py +249 -0
  44. judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/prompts.py +142 -0
  45. judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/__init__.py +3 -0
  46. judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/contextual_relevancy_scorer.py +240 -0
  47. judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/prompts.py +121 -0
  48. judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/__init__.py +3 -0
  49. judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/faithfulness_scorer.py +318 -0
  50. judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/prompts.py +265 -0
  51. judgeval/scorers/judgeval_scorers/local_implementations/hallucination/__init__.py +3 -0
  52. judgeval/scorers/judgeval_scorers/local_implementations/hallucination/hallucination_scorer.py +258 -0
  53. judgeval/scorers/judgeval_scorers/local_implementations/hallucination/prompts.py +104 -0
  54. judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/json_correctness_scorer.py +127 -0
  55. judgeval/scorers/judgeval_scorers/local_implementations/summarization/__init__.py +3 -0
  56. judgeval/scorers/judgeval_scorers/local_implementations/summarization/prompts.py +247 -0
  57. judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarization_scorer.py +541 -0
  58. judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/__init__.py +3 -0
  59. judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/tool_correctness_scorer.py +151 -0
  60. judgeval/scorers/prompt_scorer.py +4 -4
  61. judgeval/scorers/score.py +14 -14
  62. judgeval/scorers/utils.py +40 -6
  63. {judgeval-0.0.2.dist-info → judgeval-0.0.4.dist-info}/METADATA +11 -12
  64. judgeval-0.0.4.dist-info/RECORD +78 -0
  65. judgeval-0.0.2.dist-info/RECORD +0 -46
  66. {judgeval-0.0.2.dist-info → judgeval-0.0.4.dist-info}/WHEEL +0 -0
  67. {judgeval-0.0.2.dist-info → judgeval-0.0.4.dist-info}/licenses/LICENSE.md +0 -0
judgeval/run_evaluation.py
@@ -10,8 +10,8 @@ from judgeval.data import (
     ScoringResult
 )
 from judgeval.scorers import (
-    CustomScorer,
-    JudgmentScorer,
+    JudgevalScorer,
+    APIJudgmentScorer,
     ClassifierScorer
 )
 from judgeval.scorers.score import a_execute_scoring
@@ -64,7 +64,7 @@ def execute_api_eval(evaluation_run: EvaluationRun) -> List[Dict]:
 
 def merge_results(api_results: List[ScoringResult], local_results: List[ScoringResult]) -> List[ScoringResult]:
     """
-    When executing scorers that come from both the Judgment API and custom scorers, we're left with
+    When executing scorers that come from both the Judgment API and local scorers, we're left with
     results for each type of scorer. This function merges the results from the API and local evaluations,
     grouped by example. In particular, we merge the `scorers_data` field of each `ScoringResult` object.
 
@@ -127,6 +127,7 @@ def check_missing_scorer_data(results: List[ScoringResult]) -> List[ScoringResult]:
         )
     return results
 
+
 def check_eval_run_name_exists(eval_name: str, project_name: str, judgment_api_key: str) -> None:
     """
     Checks if an evaluation run name already exists for a given project.
@@ -164,6 +165,7 @@ def check_eval_run_name_exists(eval_name: str, project_name: str, judgment_api_key: str) -> None:
         error(f"Failed to check if eval run name exists: {str(e)}")
         raise JudgmentAPIError(f"Failed to check if eval run name exists: {str(e)}")
 
+
 def log_evaluation_results(merged_results: List[ScoringResult], evaluation_run: EvaluationRun) -> None:
     """
     Logs evaluation results to the Judgment API database.
@@ -203,6 +205,7 @@ def log_evaluation_results(merged_results: List[ScoringResult], evaluation_run: EvaluationRun) -> None:
         error(f"Failed to save evaluation results to DB: {str(e)}")
         raise ValueError(f"Failed to save evaluation results to DB: {str(e)}")
 
+
 def run_eval(evaluation_run: EvaluationRun, override: bool = False) -> List[ScoringResult]:
     """
     Executes an evaluation of `Example`s using one or more `Scorer`s
@@ -214,7 +217,7 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False) -> List[ScoringResult]:
         project_name (str): The name of the project the evaluation results belong to
         eval_name (str): The name of the evaluation run
         examples (List[Example]): The examples to evaluate
-        scorers (List[Union[JudgmentScorer, CustomScorer]]): A list of scorers to use for evaluation
+        scorers (List[Union[JudgmentScorer, JudgevalScorer]]): A list of scorers to use for evaluation
         model (str): The model used as a judge when using LLM as a Judge
         aggregator (Optional[str]): The aggregator to use for evaluation if using Mixture of Judges
         metadata (Optional[Dict[str, Any]]): Additional metadata to include for this evaluation run, e.g. comments, dataset name, purpose, etc.
@@ -254,19 +257,19 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False) -> List[ScoringResult]:
 
     debug(f"Starting evaluation run with {len(evaluation_run.examples)} examples")
 
-    # Group JudgmentScorers and CustomScorers, then evaluate them in parallel
+    # Group APIJudgmentScorers and JudgevalScorers, then evaluate them in parallel
     debug("Grouping scorers by type")
-    judgment_scorers: List[JudgmentScorer] = []
-    custom_scorers: List[CustomScorer] = []
+    judgment_scorers: List[APIJudgmentScorer] = []
+    local_scorers: List[JudgevalScorer] = []
     for scorer in evaluation_run.scorers:
-        if isinstance(scorer, (JudgmentScorer, ClassifierScorer)):
+        if isinstance(scorer, (APIJudgmentScorer, ClassifierScorer)):
             judgment_scorers.append(scorer)
             debug(f"Added judgment scorer: {type(scorer).__name__}")
         else:
-            custom_scorers.append(scorer)
-            debug(f"Added custom scorer: {type(scorer).__name__}")
+            local_scorers.append(scorer)
+            debug(f"Added local scorer: {type(scorer).__name__}")
 
-    debug(f"Found {len(judgment_scorers)} judgment scorers and {len(custom_scorers)} custom scorers")
+    debug(f"Found {len(judgment_scorers)} judgment scorers and {len(local_scorers)} local scorers")
 
     api_results: List[ScoringResult] = []
     local_results: List[ScoringResult] = []
@@ -288,7 +291,7 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False) -> List[ScoringResult]:
                 log_results=evaluation_run.log_results
             )
             debug("Sending request to Judgment API")
-            response_data: List[Dict] = execute_api_eval(api_evaluation_run)  # ScoringResults
+            response_data: List[Dict] = execute_api_eval(api_evaluation_run)  # Dicts are `ScoringResult` objs
             info(f"Received {len(response_data['results'])} results from API")
         except JudgmentAPIError as e:
             error(f"An error occurred while executing the Judgment API request: {str(e)}")
@@ -317,7 +320,7 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False) -> List[ScoringResult]:
             api_results.append(ScoringResult(**filtered_result))
 
     # Run local evals
-    if custom_scorers:  # List[CustomScorer]
+    if local_scorers:  # List[JudgevalScorer]
         info("Starting local evaluation")
         for example in evaluation_run.examples:
             with example_logging_context(example.timestamp, example.example_id):
@@ -326,7 +329,7 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False) -> List[ScoringResult]:
         results: List[ScoringResult] = asyncio.run(
             a_execute_scoring(
                 evaluation_run.examples,
-                custom_scorers,
+                local_scorers,
                 model=evaluation_run.model,
                 ignore_errors=True,
                 skip_on_missing_params=True,
@@ -353,3 +356,65 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False) -> List[ScoringResult]:
         if not result.scorers_data:  # none of the scorers could be executed on this example
             info(f"None of the scorers could be executed on example {i}. This is usually because the Example is missing the fields needed by the scorers. Try checking that the Example has the necessary fields for your scorers.")
     return merged_results
+
+def assert_test(scoring_results: List[ScoringResult]) -> None:
+    """
+    Collects all failed scorers from the scoring results.
+
+    Args:
+        ScoringResults (List[ScoringResult]): List of scoring results to check
+
+    Returns:
+        None. Raises exceptions for any failed test cases.
+    """
+    failed_cases: List[ScorerData] = []
+
+    for result in scoring_results:
+        if not result.success:
+
+            # Create a test case context with all relevant fields
+            test_case = {
+                'input': result.input,
+                'actual_output': result.actual_output,
+                'expected_output': result.expected_output,
+                'context': result.context,
+                'retrieval_context': result.retrieval_context,
+                'eval_run_name': result.eval_run_name,
+                'failed_scorers': []
+            }
+            if result.scorers_data:
+                # If the result was not successful, check each scorer_data
+                for scorer_data in result.scorers_data:
+                    if not scorer_data.success:
+                        test_case['failed_scorers'].append(scorer_data)
+            failed_cases.append(test_case)
+
+    if failed_cases:
+        error_msg = f"The following test cases failed: \n"
+        for fail_case in failed_cases:
+            error_msg += f"\nInput: {fail_case['input']}\n"
+            error_msg += f"Actual Output: {fail_case['actual_output']}\n"
+            error_msg += f"Expected Output: {fail_case['expected_output']}\n"
+            error_msg += f"Context: {fail_case['context']}\n"
+            error_msg += f"Retrieval Context: {fail_case['retrieval_context']}\n"
+            error_msg += f"Eval Run Name: {fail_case['eval_run_name']}\n"
+
+            for fail_scorer in fail_case['failed_scorers']:
+
+                error_msg += (
+                    f"\nScorer Name: {fail_scorer.name}\n"
+                    f"Threshold: {fail_scorer.threshold}\n"
+                    f"Success: {fail_scorer.success}\n"
+                    f"Score: {fail_scorer.score}\n"
+                    f"Reason: {fail_scorer.reason}\n"
+                    f"Strict Mode: {fail_scorer.strict_mode}\n"
+                    f"Evaluation Model: {fail_scorer.evaluation_model}\n"
+                    f"Error: {fail_scorer.error}\n"
+                    f"Evaluation Cost: {fail_scorer.evaluation_cost}\n"
+                    f"Verbose Logs: {fail_scorer.verbose_logs}\n"
+                    f"Additional Metadata: {fail_scorer.additional_metadata}\n"
+                )
+            error_msg += "-"*100
+
+        raise AssertionError(error_msg)
+
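The new `assert_test` helper turns a list of `ScoringResult`s into a hard pass/fail signal, which makes `run_eval` usable inside a test suite. A minimal usage sketch, assuming `my_run` is an `EvaluationRun` constructed elsewhere (its fields are not part of this hunk):

from judgeval.run_evaluation import run_eval, assert_test

results = run_eval(my_run, override=True)  # List[ScoringResult]
assert_test(results)  # raises AssertionError describing every failed scorer; returns None otherwise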
judgeval/scorers/__init__.py
@@ -1,5 +1,5 @@
-from judgeval.scorers.base_scorer import JudgmentScorer
-from judgeval.scorers.custom_scorer import CustomScorer
+from judgeval.scorers.api_scorer import APIJudgmentScorer
+from judgeval.scorers.judgeval_scorer import JudgevalScorer
 from judgeval.scorers.prompt_scorer import PromptScorer, ClassifierScorer
 from judgeval.scorers.judgeval_scorers import (
     ToolCorrectnessScorer,
@@ -11,11 +11,13 @@ from judgeval.scorers.judgeval_scorers import (
     ContextualPrecisionScorer,
     ContextualRecallScorer,
     AnswerRelevancyScorer,
+    ScorerWrapper,
+    AnswerCorrectnessScorer,
 )
 
 __all__ = [
-    "JudgmentScorer",
-    "CustomScorer",
+    "APIJudgmentScorer",
+    "JudgevalScorer",
     "PromptScorer",
     "ClassifierScorer",
     "ToolCorrectnessScorer",
@@ -27,4 +29,6 @@ __all__ = [
     "ContextualPrecisionScorer",
     "ContextualRecallScorer",
     "AnswerRelevancyScorer",
+    "ScorerWrapper",
+    "AnswerCorrectnessScorer",
 ]
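For callers, the main effect of this hunk is a rename of the two base classes exported from `judgeval.scorers`; code written against 0.0.2 only needs its imports updated:

# 0.0.2
from judgeval.scorers import JudgmentScorer, CustomScorer

# 0.0.4
from judgeval.scorers import APIJudgmentScorer, JudgevalScorer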
judgeval/scorers/api_scorer.py (new file)
@@ -0,0 +1,64 @@
+"""
+Judgment Scorer class.
+
+Scores `Example`s using ready-made Judgment evaluators.
+"""
+
+from pydantic import BaseModel, field_validator
+from judgeval.common.logger import debug, info, warning, error
+
+from judgeval.constants import APIScorer
+
+
+class APIJudgmentScorer(BaseModel):
+    """
+    Class for ready-made, "out-of-the-box" scorer that uses Judgment evaluators to score `Example`s.
+
+    Args:
+        score_type (APIScorer): The Judgment metric to use for scoring `Example`s
+        threshold (float): A value between 0 and 1 that determines the scoring threshold
+    """
+    threshold: float
+    score_type: APIScorer
+
+    @field_validator('threshold')
+    def validate_threshold(cls, v):
+        """
+        Validates that the threshold is between 0 and 1 inclusive.
+        """
+        if not 0 <= v <= 1:
+            error(f"Threshold must be between 0 and 1, got: {v}")
+            raise ValueError(f"Threshold must be between 0 and 1, got: {v}")
+        return v
+
+    @field_validator('score_type')
+    def convert_to_enum_value(cls, v):
+        """
+        Validates that the `score_type` is a valid `JudgmentMetric` enum value.
+        Converts string values to `JudgmentMetric` enum values.
+        """
+        debug(f"Attempting to convert score_type value: {v}")
+        if isinstance(v, APIScorer):
+            info(f"Using existing JudgmentMetric: {v.value}")
+            return v.value
+        elif isinstance(v, str):
+            debug(f"Converting string value to JudgmentMetric enum: {v}")
+            return APIScorer[v.upper()].value
+        error(f"Invalid score_type value: {v}")
+        raise ValueError(f"Invalid value for score_type: {v}")
+
+    def __str__(self):
+        return f"JudgmentScorer(score_type={self.score_type}, threshold={self.threshold})"
+
+    def to_dict(self) -> dict:
+        """
+        Converts the scorer configuration to a dictionary format.
+
+        Returns:
+            dict: A dictionary containing the scorer's configuration
+        """
+        return {
+            "score_type": self.score_type,
+            "threshold": self.threshold
+        }
+
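Since `APIJudgmentScorer` is a pydantic model, both validators run at construction time. A minimal sketch of the expected behaviour, assuming the `APIScorer.FAITHFULNESS` member defined in judgeval/constants.py:

from judgeval.scorers.api_scorer import APIJudgmentScorer
from judgeval.constants import APIScorer

scorer = APIJudgmentScorer(threshold=0.7, score_type=APIScorer.FAITHFULNESS)
print(scorer)            # JudgmentScorer(score_type=..., threshold=0.7)
print(scorer.to_dict())  # {'score_type': ..., 'threshold': 0.7}

# A threshold outside [0, 1] is rejected by validate_threshold, so construction
# with threshold=1.5 fails validation instead of creating a scorer.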
judgeval/scorers/base_scorer.py
@@ -10,7 +10,7 @@ from judgeval.common.logger import debug, info, warning, error
 from judgeval.constants import APIScorer
 
 
-class JudgmentScorer(BaseModel):
+class APIJudgmentScorer(BaseModel):
     """
     Class for ready-made, "out-of-the-box" scorer that uses Judgment evaluators to score `Example`s.
 
@@ -48,4 +48,5 @@ class JudgmentScorer(BaseModel):
         raise ValueError(f"Invalid value for score_type: {v}")
 
     def __str__(self):
-        return f"JudgmentScorer(score_type={self.score_type}, threshold={self.threshold})"
+        return f"JudgmentScorer(score_type={self.score_type}, threshold={self.threshold})"
+
judgeval/scorers/exceptions.py (new file)
@@ -0,0 +1,11 @@
+"""
+Error handling for scorers
+
+"""
+
+
+class MissingExampleParamsError(Exception):
+    """
+    Error raised when a scorer is missing required example parameters.
+    """
+    pass
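The new exception gives local scorer implementations a dedicated error type for incomplete `Example`s. A small illustrative sketch (the helper below is hypothetical, not part of the package):

from judgeval.scorers.exceptions import MissingExampleParamsError

def require_retrieval_context(example):
    # Hypothetical guard a local scorer might use before scoring.
    if getattr(example, "retrieval_context", None) is None:
        raise MissingExampleParamsError("Example is missing `retrieval_context`")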
judgeval/scorers/{custom_scorer.py → judgeval_scorer.py}
@@ -9,15 +9,19 @@ from typing import Optional, Dict, Union, List
 from abc import abstractmethod
 
 from judgeval.common.logger import debug, info, warning, error
-from judgeval.judges import judgevalJudge
+from judgeval.judges import JudgevalJudge
 from judgeval.judges.utils import create_judge
 
 
-class CustomScorer:
+class JudgevalScorer:
     """
+    Base class for scorers in `judgeval`.
+
+    In practice, you should not implement this class unless you are creating a custom scorer.
+    Judgeval offers 10+ default scorers that you can use out of the box.
+
     If you want to create a scorer that does not fall under any of the ready-made Judgment scorers,
-    you can create a custom scorer by extending this class. This is best used for special use cases
-    where none of Judgment's scorers are suitable.
+    you can create a custom scorer by extending this class.
     """
     score_type: str  # name of your new scorer
     threshold: float  # The threshold to pass a test while using this scorer as a scorer
@@ -73,7 +77,7 @@ class CustomScorer:
         self.verbose_logs = verbose_logs
         self.additional_metadata = additional_metadata
 
-    def _add_model(self, model: Optional[Union[str, List[str], judgevalJudge]] = None):
+    def _add_model(self, model: Optional[Union[str, List[str], JudgevalJudge]] = None):
         """
         Adds the evaluation model to the CustomScorer instance
 
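`JudgevalScorer` is now the base class for user-written local scorers. Only part of its interface appears in this diff, so the subclass below is a rough sketch: the `score_type` and `threshold` attributes come from the hunk above, while the `__init__` signature, the `measure` method name, and the result attributes are assumptions.

from judgeval.scorers import JudgevalScorer

class ExactMatchScorer(JudgevalScorer):
    # Hypothetical custom scorer: passes when actual output equals expected output.
    def __init__(self, threshold: float = 1.0):
        super().__init__(score_type="Exact Match", threshold=threshold)  # assumed signature

    def measure(self, example):  # method name assumed; not shown in this diff
        self.score = float(example.actual_output == example.expected_output)
        self.success = self.score >= self.threshold
        self.reason = "exact string match" if self.success else "outputs differ"
        return self.score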
judgeval/scorers/judgeval_scorers/__init__.py
@@ -1,12 +1,135 @@
-from judgeval.scorers.judgeval_scorers.tool_correctness import ToolCorrectnessScorer
-from judgeval.scorers.judgeval_scorers.json_correctness import JSONCorrectnessScorer
-from judgeval.scorers.judgeval_scorers.summarization import SummarizationScorer
-from judgeval.scorers.judgeval_scorers.hallucination import HallucinationScorer
-from judgeval.scorers.judgeval_scorers.faithfulness import FaithfulnessScorer
-from judgeval.scorers.judgeval_scorers.contextual_relevancy import ContextualRelevancyScorer
-from judgeval.scorers.judgeval_scorers.contextual_precision import ContextualPrecisionScorer
-from judgeval.scorers.judgeval_scorers.contextual_recall import ContextualRecallScorer
-from judgeval.scorers.judgeval_scorers.answer_relevancy import AnswerRelevancyScorer
+from typing import Type, Optional, Any
+from functools import wraps
+
+# Import implementations
+from judgeval.scorers.judgeval_scorers.api_scorers import (
+    ToolCorrectnessScorer as APIToolCorrectnessScorer,
+    JSONCorrectnessScorer as APIJSONCorrectnessScorer,
+    SummarizationScorer as APISummarizationScorer,
+    HallucinationScorer as APIHallucinationScorer,
+    FaithfulnessScorer as APIFaithfulnessScorer,
+    ContextualRelevancyScorer as APIContextualRelevancyScorer,
+    ContextualPrecisionScorer as APIContextualPrecisionScorer,
+    ContextualRecallScorer as APIContextualRecallScorer,
+    AnswerRelevancyScorer as APIAnswerRelevancyScorer,
+    AnswerCorrectnessScorer as APIAnswerCorrectnessScorer,
+)
+
+from judgeval.scorers.judgeval_scorers.local_implementations import (
+    AnswerRelevancyScorer as LocalAnswerRelevancyScorer,
+    ContextualPrecisionScorer as LocalContextualPrecisionScorer,
+    ContextualRecallScorer as LocalContextualRecallScorer,
+    ContextualRelevancyScorer as LocalContextualRelevancyScorer,
+    FaithfulnessScorer as LocalFaithfulnessScorer,
+    JsonCorrectnessScorer as LocalJsonCorrectnessScorer,
+    ToolCorrectnessScorer as LocalToolCorrectnessScorer,
+    HallucinationScorer as LocalHallucinationScorer,
+    SummarizationScorer as LocalSummarizationScorer,
+    AnswerCorrectnessScorer as LocalAnswerCorrectnessScorer
+)
+
+class ScorerWrapper:
+    """
+    Wrapper class that can dynamically load either API or local implementation of a scorer.
+    """
+    def __init__(self, api_implementation: Type, local_implementation: Optional[Type] = None):
+        self.api_implementation = api_implementation
+        self.local_implementation = local_implementation
+        self._instance = None
+        self._init_args = None
+        self._init_kwargs = None
+
+    def __call__(self, *args, **kwargs):
+        """Store initialization arguments for later use when implementation is loaded"""
+        self._init_args = args
+        self._init_kwargs = kwargs
+        return self
+
+    def load_implementation(self, use_judgment: bool = True) -> Any:
+        """
+        Load the appropriate implementation based on the use_judgment flag.
+
+        Args:
+            use_judgment (bool): If True, use API implementation. If False, use local implementation.
+
+        Returns:
+            Instance of the appropriate implementation
+
+        Raises:
+            ValueError: If local implementation is requested but not available
+        """
+        if self._instance is not None:
+            return self._instance
+
+        if use_judgment:
+            implementation = self.api_implementation
+        else:
+            if self.local_implementation is None:
+                raise ValueError("No local implementation available for this scorer")
+            implementation = self.local_implementation
+
+        args = self._init_args or ()
+        kwargs = self._init_kwargs or {}
+        self._instance = implementation(*args, **kwargs)
+        return self._instance
+
+    def __getattr__(self, name):
+        """Defer all attribute access to the loaded implementation"""
+        if self._instance is None:
+            raise RuntimeError("Implementation not loaded. Call load_implementation() first")
+        return getattr(self._instance, name)
+
+# Create wrapped versions of all scorers
+
+AnswerCorrectnessScorer = ScorerWrapper(
+    api_implementation=APIAnswerCorrectnessScorer,
+    local_implementation=LocalAnswerCorrectnessScorer
+)
+
+AnswerRelevancyScorer = ScorerWrapper(
+    api_implementation=APIAnswerRelevancyScorer,
+    local_implementation=LocalAnswerRelevancyScorer
+)
+
+ToolCorrectnessScorer = ScorerWrapper(
+    api_implementation=APIToolCorrectnessScorer,
+    local_implementation=LocalToolCorrectnessScorer
+)
+
+JSONCorrectnessScorer = ScorerWrapper(
+    api_implementation=APIJSONCorrectnessScorer,
+    local_implementation=LocalJsonCorrectnessScorer
+)
+
+SummarizationScorer = ScorerWrapper(
+    api_implementation=APISummarizationScorer,
+    local_implementation=LocalSummarizationScorer
+)
+
+HallucinationScorer = ScorerWrapper(
+    api_implementation=APIHallucinationScorer,
+    local_implementation=LocalHallucinationScorer
+)
+
+FaithfulnessScorer = ScorerWrapper(
+    api_implementation=APIFaithfulnessScorer,
+    local_implementation=LocalFaithfulnessScorer
+)
+
+ContextualRelevancyScorer = ScorerWrapper(
+    api_implementation=APIContextualRelevancyScorer,
+    local_implementation=LocalContextualRelevancyScorer
+)
+
+ContextualPrecisionScorer = ScorerWrapper(
+    api_implementation=APIContextualPrecisionScorer,
+    local_implementation=LocalContextualPrecisionScorer
+)
+
+ContextualRecallScorer = ScorerWrapper(
+    api_implementation=APIContextualRecallScorer,
+    local_implementation=LocalContextualRecallScorer
+)
 
 __all__ = [
     "ToolCorrectnessScorer",
judgeval/scorers/judgeval_scorers/api_scorers/__init__.py (new file)
@@ -0,0 +1,23 @@
+from judgeval.scorers.judgeval_scorers.api_scorers.tool_correctness import ToolCorrectnessScorer
+from judgeval.scorers.judgeval_scorers.api_scorers.json_correctness import JSONCorrectnessScorer
+from judgeval.scorers.judgeval_scorers.api_scorers.summarization import SummarizationScorer
+from judgeval.scorers.judgeval_scorers.api_scorers.hallucination import HallucinationScorer
+from judgeval.scorers.judgeval_scorers.api_scorers.faithfulness import FaithfulnessScorer
+from judgeval.scorers.judgeval_scorers.api_scorers.contextual_relevancy import ContextualRelevancyScorer
+from judgeval.scorers.judgeval_scorers.api_scorers.contextual_precision import ContextualPrecisionScorer
+from judgeval.scorers.judgeval_scorers.api_scorers.contextual_recall import ContextualRecallScorer
+from judgeval.scorers.judgeval_scorers.api_scorers.answer_relevancy import AnswerRelevancyScorer
+from judgeval.scorers.judgeval_scorers.api_scorers.answer_correctness import AnswerCorrectnessScorer
+
+__all__ = [
+    "ToolCorrectnessScorer",
+    "JSONCorrectnessScorer",
+    "SummarizationScorer",
+    "HallucinationScorer",
+    "FaithfulnessScorer",
+    "ContextualRelevancyScorer",
+    "ContextualPrecisionScorer",
+    "ContextualRecallScorer",
+    "AnswerRelevancyScorer",
+    "AnswerCorrectnessScorer",
+]
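Importing from the new `api_scorers` subpackage yields the plain `APIJudgmentScorer` subclasses directly, without the `ScorerWrapper` indirection used by `judgeval.scorers.judgeval_scorers`:

from judgeval.scorers.judgeval_scorers.api_scorers import AnswerRelevancyScorer

scorer = AnswerRelevancyScorer(threshold=0.5)  # a real APIJudgmentScorer; no load_implementation() needed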
judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py (new file)
@@ -0,0 +1,19 @@
+"""
+`judgeval` answer relevancy scorer
+
+TODO add link to docs page for this scorer
+
+"""
+
+# Internal imports
+from judgeval.scorers.api_scorer import APIJudgmentScorer
+from judgeval.constants import APIScorer
+
+
+class AnswerCorrectnessScorer(APIJudgmentScorer):
+    def __init__(self, threshold: float):
+        super().__init__(threshold=threshold, score_type=APIScorer.ANSWER_CORRECTNESS)
+
+    @property
+    def __name__(self):
+        return "Answer Correctness"
judgeval/scorers/judgeval_scorers/{answer_relevancy.py → api_scorers/answer_relevancy.py}
@@ -6,11 +6,11 @@ TODO add link to docs page for this scorer
 """
 
 # Internal imports
-from judgeval.scorers.base_scorer import JudgmentScorer
+from judgeval.scorers.api_scorer import APIJudgmentScorer
 from judgeval.constants import APIScorer
 
 
-class AnswerRelevancyScorer(JudgmentScorer):
+class AnswerRelevancyScorer(APIJudgmentScorer):
     def __init__(self, threshold: float):
         super().__init__(threshold=threshold, score_type=APIScorer.ANSWER_RELEVANCY)
 
judgeval/scorers/judgeval_scorers/{contextual_precision.py → api_scorers/contextual_precision.py}
@@ -6,11 +6,11 @@ TODO add link to docs page for this scorer
 """
 
 # Internal imports
-from judgeval.scorers.base_scorer import JudgmentScorer
+from judgeval.scorers.api_scorer import APIJudgmentScorer
 from judgeval.constants import APIScorer
 
 
-class ContextualPrecisionScorer(JudgmentScorer):
+class ContextualPrecisionScorer(APIJudgmentScorer):
     def __init__(self, threshold: float):
         super().__init__(threshold=threshold, score_type=APIScorer.CONTEXTUAL_PRECISION)
 
judgeval/scorers/judgeval_scorers/{contextual_recall.py → api_scorers/contextual_recall.py}
@@ -6,11 +6,11 @@ TODO add link to docs page for this scorer
 """
 
 # Internal imports
-from judgeval.scorers.base_scorer import JudgmentScorer
+from judgeval.scorers.api_scorer import APIJudgmentScorer
 from judgeval.constants import APIScorer
 
 
-class ContextualRecallScorer(JudgmentScorer):
+class ContextualRecallScorer(APIJudgmentScorer):
     def __init__(self, threshold: float):
         super().__init__(threshold=threshold, score_type=APIScorer.CONTEXTUAL_RECALL)
 
judgeval/scorers/judgeval_scorers/{contextual_relevancy.py → api_scorers/contextual_relevancy.py}
@@ -6,11 +6,11 @@ TODO add link to docs page for this scorer
 """
 
 # Internal imports
-from judgeval.scorers.base_scorer import JudgmentScorer
+from judgeval.scorers.api_scorer import APIJudgmentScorer
 from judgeval.constants import APIScorer
 
 
-class ContextualRelevancyScorer(JudgmentScorer):
+class ContextualRelevancyScorer(APIJudgmentScorer):
     """
     Scorer that checks if the output of a model is relevant to the retrieval context
     """
judgeval/scorers/judgeval_scorers/{faithfulness.py → api_scorers/faithfulness.py}
@@ -6,11 +6,11 @@ TODO add link to docs page for this scorer
 """
 
 # Internal imports
-from judgeval.scorers.base_scorer import JudgmentScorer
+from judgeval.scorers.api_scorer import APIJudgmentScorer
 from judgeval.constants import APIScorer
 
 
-class FaithfulnessScorer(JudgmentScorer):
+class FaithfulnessScorer(APIJudgmentScorer):
     def __init__(self, threshold: float):
         super().__init__(threshold=threshold, score_type=APIScorer.FAITHFULNESS)
 
judgeval/scorers/judgeval_scorers/{hallucination.py → api_scorers/hallucination.py}
@@ -6,11 +6,11 @@ TODO add link to docs page for this scorer
 """
 
 # Internal imports
-from judgeval.scorers.base_scorer import JudgmentScorer
+from judgeval.scorers.api_scorer import APIJudgmentScorer
 from judgeval.constants import APIScorer
 
 
-class HallucinationScorer(JudgmentScorer):
+class HallucinationScorer(APIJudgmentScorer):
     def __init__(self, threshold: float):
         super().__init__(threshold=threshold, score_type=APIScorer.HALLUCINATION)
 
judgeval/scorers/judgeval_scorers/{json_correctness.py → api_scorers/json_correctness.py}
@@ -9,23 +9,23 @@ TODO add link to docs page for this scorer
 # External imports
 from pydantic import BaseModel, Field
 # Internal imports
-from judgeval.scorers.base_scorer import JudgmentScorer
+from judgeval.scorers.api_scorer import APIJudgmentScorer
 from judgeval.constants import APIScorer
 
 
-class JSONCorrectnessScorer(JudgmentScorer):
+class JSONCorrectnessScorer(APIJudgmentScorer):
     json_schema: BaseModel = Field(None, exclude=True)
 
     def __init__(self, threshold: float, json_schema: BaseModel):
         super().__init__(threshold=threshold, score_type=APIScorer.JSON_CORRECTNESS)
         object.__setattr__(self, 'json_schema', json_schema)
-
+
     def to_dict(self):
-        return {
-            "score_type": self.score_type,
-            "threshold": self.threshold,
-            "kwargs": {"json_schema": self.json_schema.model_json_schema()}
+        base_dict = super().to_dict()  # Get the parent class's dictionary
+        base_dict["kwargs"] = {
+            "json_schema": self.json_schema.model_json_schema()
         }
+        return base_dict
 
     @property
     def __name__(self):
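With this refactor, `to_dict` layers the JSON-schema kwargs on top of the dictionary returned by `APIJudgmentScorer.to_dict`. A small sketch of the resulting payload, using a hypothetical pydantic schema (the exact `score_type` value comes from the `APIScorer` enum and is elided here):

from pydantic import BaseModel
from judgeval.scorers.judgeval_scorers.api_scorers import JSONCorrectnessScorer

class Answer(BaseModel):
    title: str
    body: str

# Passing the model class here; model_json_schema() is available on the class as well as instances.
scorer = JSONCorrectnessScorer(threshold=1.0, json_schema=Answer)
scorer.to_dict()
# -> {"score_type": <APIScorer.JSON_CORRECTNESS value>, "threshold": 1.0,
#     "kwargs": {"json_schema": Answer.model_json_schema()}}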
judgeval/scorers/judgeval_scorers/{summarization.py → api_scorers/summarization.py}
@@ -6,11 +6,11 @@ TODO add link to docs page for this scorer
 """
 
 # Internal imports
-from judgeval.scorers.base_scorer import JudgmentScorer
+from judgeval.scorers.api_scorer import APIJudgmentScorer
 from judgeval.constants import APIScorer
 
 
-class SummarizationScorer(JudgmentScorer):
+class SummarizationScorer(APIJudgmentScorer):
     def __init__(self, threshold: float):
         super().__init__(threshold=threshold, score_type=APIScorer.SUMMARIZATION)