judgeval 0.0.44__py3-none-any.whl → 0.0.46__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. judgeval/__init__.py +5 -4
  2. judgeval/clients.py +6 -6
  3. judgeval/common/__init__.py +7 -2
  4. judgeval/common/exceptions.py +2 -3
  5. judgeval/common/logger.py +74 -49
  6. judgeval/common/s3_storage.py +30 -23
  7. judgeval/common/tracer.py +1273 -939
  8. judgeval/common/utils.py +416 -244
  9. judgeval/constants.py +73 -61
  10. judgeval/data/__init__.py +1 -1
  11. judgeval/data/custom_example.py +3 -2
  12. judgeval/data/datasets/dataset.py +80 -54
  13. judgeval/data/datasets/eval_dataset_client.py +131 -181
  14. judgeval/data/example.py +67 -43
  15. judgeval/data/result.py +11 -9
  16. judgeval/data/scorer_data.py +4 -2
  17. judgeval/data/tool.py +25 -16
  18. judgeval/data/trace.py +57 -29
  19. judgeval/data/trace_run.py +5 -11
  20. judgeval/evaluation_run.py +22 -82
  21. judgeval/integrations/langgraph.py +546 -184
  22. judgeval/judges/base_judge.py +1 -2
  23. judgeval/judges/litellm_judge.py +33 -11
  24. judgeval/judges/mixture_of_judges.py +128 -78
  25. judgeval/judges/together_judge.py +22 -9
  26. judgeval/judges/utils.py +14 -5
  27. judgeval/judgment_client.py +259 -271
  28. judgeval/rules.py +169 -142
  29. judgeval/run_evaluation.py +462 -305
  30. judgeval/scorers/api_scorer.py +20 -11
  31. judgeval/scorers/exceptions.py +1 -0
  32. judgeval/scorers/judgeval_scorer.py +77 -58
  33. judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +46 -15
  34. judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +3 -2
  35. judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +3 -2
  36. judgeval/scorers/judgeval_scorers/api_scorers/classifier_scorer.py +12 -11
  37. judgeval/scorers/judgeval_scorers/api_scorers/comparison.py +7 -5
  38. judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py +3 -2
  39. judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py +3 -2
  40. judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py +5 -2
  41. judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py +2 -1
  42. judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +17 -8
  43. judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +3 -2
  44. judgeval/scorers/judgeval_scorers/api_scorers/groundedness.py +3 -2
  45. judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +3 -2
  46. judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +3 -2
  47. judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py +8 -9
  48. judgeval/scorers/judgeval_scorers/api_scorers/summarization.py +4 -4
  49. judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +5 -5
  50. judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +5 -2
  51. judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py +9 -10
  52. judgeval/scorers/prompt_scorer.py +48 -37
  53. judgeval/scorers/score.py +86 -53
  54. judgeval/scorers/utils.py +11 -7
  55. judgeval/tracer/__init__.py +1 -1
  56. judgeval/utils/alerts.py +23 -12
  57. judgeval/utils/{data_utils.py → file_utils.py} +5 -9
  58. judgeval/utils/requests.py +29 -0
  59. judgeval/version_check.py +5 -2
  60. {judgeval-0.0.44.dist-info → judgeval-0.0.46.dist-info}/METADATA +79 -135
  61. judgeval-0.0.46.dist-info/RECORD +69 -0
  62. judgeval-0.0.44.dist-info/RECORD +0 -68
  63. {judgeval-0.0.44.dist-info → judgeval-0.0.46.dist-info}/WHEEL +0 -0
  64. {judgeval-0.0.44.dist-info → judgeval-0.0.46.dist-info}/licenses/LICENSE.md +0 -0
judgeval/scorers/api_scorer.py

@@ -6,7 +6,7 @@ Scores `Example`s using ready-made Judgment evaluators.
 
 from pydantic import BaseModel, field_validator
 from typing import List
-from judgeval.common.logger import debug, info, warning, error
+from judgeval.common.logger import debug, info, error
 from judgeval.data import ExampleParams
 from judgeval.constants import APIScorer, UNBOUNDED_SCORERS
 
@@ -19,27 +19,34 @@ class APIJudgmentScorer(BaseModel):
         score_type (APIScorer): The Judgment metric to use for scoring `Example`s
         threshold (float): A value between 0 and 1 that determines the scoring threshold
     """
+
     score_type: APIScorer
     threshold: float
-    required_params: List[ExampleParams] = []  # List of the required parameters on examples for the scorer
+    required_params: List[
+        ExampleParams
+    ] = []  # List of the required parameters on examples for the scorer
 
-    @field_validator('threshold')
+    @field_validator("threshold")
     def validate_threshold(cls, v, info):
         """
         Validates that the threshold is between 0 and 1 inclusive.
         """
-        score_type = info.data.get('score_type')
+        score_type = info.data.get("score_type")
         if score_type in UNBOUNDED_SCORERS:
             if v < 0:
                 error(f"Threshold for {score_type} must be greater than 0, got: {v}")
-                raise ValueError(f"Threshold for {score_type} must be greater than 0, got: {v}")
+                raise ValueError(
+                    f"Threshold for {score_type} must be greater than 0, got: {v}"
+                )
         else:
             if not 0 <= v <= 1:
                 error(f"Threshold for {score_type} must be between 0 and 1, got: {v}")
-                raise ValueError(f"Threshold for {score_type} must be between 0 and 1, got: {v}")
+                raise ValueError(
+                    f"Threshold for {score_type} must be between 0 and 1, got: {v}"
+                )
         return v
 
-    @field_validator('score_type')
+    @field_validator("score_type")
     def convert_to_enum_value(cls, v):
         """
         Validates that the `score_type` is a valid `APIScorer` enum value.
@@ -61,11 +68,13 @@ class APIJudgmentScorer(BaseModel):
     def to_dict(self) -> dict:
         """
         Converts the scorer configuration to a dictionary format.
-
+
         Returns:
             dict: A dictionary containing the scorer's configuration
         """
         return {
-            "score_type": str(self.score_type.value),  # Convert enum to string for serialization
-            "threshold": self.threshold
-        }
+            "score_type": str(
+                self.score_type.value
+            ),  # Convert enum to string for serialization
+            "threshold": self.threshold,
+        }
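The api_scorer.py changes above are formatting only; the validation rule is unchanged: scorers listed in UNBOUNDED_SCORERS only need a non-negative threshold, while every other scorer must stay within [0, 1]. A minimal sketch of how that surfaces for one of the ready-made scorers defined later in this diff (the enum's serialized string and UNBOUNDED_SCORERS membership are assumptions here):

from judgeval.scorers.judgeval_scorers.api_scorers.answer_relevancy import AnswerRelevancyScorer

scorer = AnswerRelevancyScorer(threshold=0.8)
print(scorer.to_dict())  # e.g. {'score_type': 'answer_relevancy', 'threshold': 0.8} -- exact value string assumed

# Assuming ANSWER_RELEVANCY is not in UNBOUNDED_SCORERS, an out-of-range threshold
# is rejected by the field_validator shown above.
AnswerRelevancyScorer(threshold=1.5)  # raises ValueError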
judgeval/scorers/exceptions.py

@@ -8,4 +8,5 @@ class MissingExampleParamsError(Exception):
     """
     Error raised when a scorer is missing required example parameters.
     """
+
     pass
judgeval/scorers/judgeval_scorer.py

@@ -13,21 +13,26 @@ from judgeval.judges import JudgevalJudge
 from judgeval.judges.utils import create_judge
 from judgeval.constants import UNBOUNDED_SCORERS
 from judgeval.data.example import ExampleParams
+
+
 class JudgevalScorer:
     """
     Base class for scorers in `judgeval`.
 
     In practice, you should not implement this class unless you are creating a custom scorer.
     Judgeval offers 10+ default scorers that you can use out of the box.
-
+
     If you want to create a scorer that does not fall under any of the ready-made Judgment scorers,
     you can create a custom scorer by extending this class.
     """
+
     score_type: str  # name of your new scorer
     threshold: float  # The threshold to pass a test while using this scorer as a scorer
     score: Optional[float] = None  # The float score of the scorer run on the test case
-    score_breakdown: Dict = None
-    reason: Optional[str] = None  # The reason for the score when evaluating the test case
+    score_breakdown: Optional[Dict] = None
+    reason: Optional[str] = (
+        None  # The reason for the score when evaluating the test case
+    )
     success: Optional[bool] = None  # Whether the test case passed or failed
     evaluation_model: Optional[str] = None  # The model used to evaluate the test case
     strict_mode: bool = False  # Whether to run the scorer in strict mode
@@ -39,61 +44,67 @@ class JudgevalScorer:
     evaluation_cost: Optional[float] = None  # The cost of running the scorer
     verbose_logs: Optional[str] = None  # The verbose logs of the scorer
     additional_metadata: Optional[Dict] = None  # Additional metadata for the scorer
-    required_params: Optional[List[ExampleParams]] = None  # The required parameters for the scorer
-    error: Optional[str] = None
-    success: Optional[bool] = None
+    required_params: Optional[List[ExampleParams]] = (
+        None  # The required parameters for the scorer
+    )
 
     def __init__(
-        self,
-        score_type: str,
-        threshold: float,
-        score: Optional[float] = None,
-        score_breakdown: Optional[Dict] = None,
-        reason: Optional[str] = None,
-        success: Optional[bool] = None,
-        evaluation_model: Optional[str] = None,
+        self,
+        score_type: str,
+        threshold: float,
+        score: Optional[float] = None,
+        score_breakdown: Optional[Dict] = None,
+        reason: Optional[str] = None,
+        success: Optional[bool] = None,
+        evaluation_model: Optional[str] = None,
         required_params: Optional[List[ExampleParams]] = None,
-        strict_mode: bool = False,
-        async_mode: bool = True,
-        verbose_mode: bool = True,
-        include_reason: bool = False,
+        strict_mode: bool = False,
+        async_mode: bool = True,
+        verbose_mode: bool = True,
+        include_reason: bool = False,
         custom_example: bool = False,
-        error: Optional[str] = None,
-        evaluation_cost: Optional[float] = None,
-        verbose_logs: Optional[str] = None,
-        additional_metadata: Optional[Dict] = None
-    ):
-        debug(f"Initializing JudgevalScorer with score_type={score_type}, threshold={threshold}")
-        if score_type in UNBOUNDED_SCORERS:
-            if threshold < 0:
-                raise ValueError(f"Threshold for {score_type} must be greater than 0, got: {threshold}")
-        else:
-            if not 0 <= threshold <= 1:
-                raise ValueError(f"Threshold for {score_type} must be between 0 and 1, got: {threshold}")
-        if strict_mode:
-            warning("Strict mode enabled - scoring will be more rigorous")
-        info(f"JudgevalScorer initialized with evaluation_model: {evaluation_model}")
-        self.score_type = score_type
-        self.threshold = threshold
-        self.score = score
-        self.score_breakdown = score_breakdown
-        self.reason = reason
-        self.success = success
-        self.evaluation_model = evaluation_model
-        self.strict_mode = strict_mode
-        self.async_mode = async_mode
-        self.verbose_mode = verbose_mode
-        self.include_reason = include_reason
-        self.custom_example = custom_example
-        self.error = error
-        self.evaluation_cost = evaluation_cost
-        self.verbose_logs = verbose_logs
-        self.additional_metadata = additional_metadata
-        self.required_params = required_params
+        error: Optional[str] = None,
+        evaluation_cost: Optional[float] = None,
+        verbose_logs: Optional[str] = None,
+        additional_metadata: Optional[Dict] = None,
+    ):
+        debug(
+            f"Initializing JudgevalScorer with score_type={score_type}, threshold={threshold}"
+        )
+        if score_type in UNBOUNDED_SCORERS:
+            if threshold < 0:
+                raise ValueError(
+                    f"Threshold for {score_type} must be greater than 0, got: {threshold}"
+                )
+        else:
+            if not 0 <= threshold <= 1:
+                raise ValueError(
+                    f"Threshold for {score_type} must be between 0 and 1, got: {threshold}"
+                )
+        if strict_mode:
+            warning("Strict mode enabled - scoring will be more rigorous")
+        info(f"JudgevalScorer initialized with evaluation_model: {evaluation_model}")
+        self.score_type = score_type
+        self.threshold = threshold
+        self.score = score
+        self.score_breakdown = score_breakdown
+        self.reason = reason
+        self.success = success
+        self.evaluation_model = evaluation_model
+        self.strict_mode = strict_mode
+        self.async_mode = async_mode
+        self.verbose_mode = verbose_mode
+        self.include_reason = include_reason
+        self.custom_example = custom_example
+        self.error = error
+        self.evaluation_cost = evaluation_cost
+        self.verbose_logs = verbose_logs
+        self.additional_metadata = additional_metadata
+        self.required_params = required_params
 
     def _add_model(self, model: Optional[Union[str, List[str], JudgevalJudge]] = None):
         """
-        Adds the evaluation model to the JudgevalScorer instance
+        Adds the evaluation model to the JudgevalScorer instance
 
         This method is used at eval time
         """
@@ -107,7 +118,9 @@ class JudgevalScorer:
         """
         warning("Attempting to call unimplemented score_example method")
         error("score_example method not implemented")
-        raise NotImplementedError("You must implement the `score` method in your custom scorer")
+        raise NotImplementedError(
+            "You must implement the `score` method in your custom scorer"
+        )
 
     @abstractmethod
     async def a_score_example(self, example, *args, **kwargs) -> float:
@@ -116,8 +129,10 @@ class JudgevalScorer:
         """
         warning("Attempting to call unimplemented a_score_example method")
        error("a_score_example method not implemented")
-        raise NotImplementedError("You must implement the `a_score` method in your custom scorer")
-
+        raise NotImplementedError(
+            "You must implement the `a_score` method in your custom scorer"
+        )
+
     @abstractmethod
     def _success_check(self) -> bool:
         """
@@ -125,7 +140,9 @@ class JudgevalScorer:
         """
         warning("Attempting to call unimplemented success_check method")
         error("_success_check method not implemented")
-        raise NotImplementedError("You must implement the `_success_check` method in your custom scorer")
+        raise NotImplementedError(
+            "You must implement the `_success_check` method in your custom scorer"
+        )
 
     def __str__(self):
         debug("Converting JudgevalScorer instance to string representation")
@@ -150,9 +167,11 @@ class JudgevalScorer:
             "additional_metadata": self.additional_metadata,
         }
         return f"JudgevalScorer({attributes})"
-
+
     def to_dict(self):
         return {
-            "score_type": str(self.score_type),  # Convert enum to string for serialization
-            "threshold": self.threshold
+            "score_type": str(
+                self.score_type
+            ),  # Convert enum to string for serialization
+            "threshold": self.threshold,
         }
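Functionally, the JudgevalScorer diff narrows `score_breakdown` to `Optional[Dict]` and drops the duplicate `error`/`success` class attributes; the rest is reformatting. A minimal custom-scorer sketch against the abstract methods shown above (the exact-match logic is illustrative, and `JudgevalScorer` is assumed to be re-exported from `judgeval.scorers`; otherwise import it from `judgeval.scorers.judgeval_scorer`):

from judgeval.scorers import JudgevalScorer


class ExactMatchScorer(JudgevalScorer):
    def __init__(self, threshold: float = 1.0):
        super().__init__(score_type="Exact Match", threshold=threshold)

    def score_example(self, example, *args, **kwargs) -> float:
        # Illustrative scoring: 1.0 only when the output matches the expectation exactly.
        self.score = float(example.actual_output == example.expected_output)
        self.success = self.score >= self.threshold
        return self.score

    async def a_score_example(self, example, *args, **kwargs) -> float:
        return self.score_example(example, *args, **kwargs)

    def _success_check(self) -> bool:
        return bool(self.success)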
judgeval/scorers/judgeval_scorers/api_scorers/__init__.py

@@ -1,20 +1,51 @@
-from judgeval.scorers.judgeval_scorers.api_scorers.execution_order import ExecutionOrderScorer
-from judgeval.scorers.judgeval_scorers.api_scorers.json_correctness import JSONCorrectnessScorer
-from judgeval.scorers.judgeval_scorers.api_scorers.summarization import SummarizationScorer
-from judgeval.scorers.judgeval_scorers.api_scorers.hallucination import HallucinationScorer
-from judgeval.scorers.judgeval_scorers.api_scorers.faithfulness import FaithfulnessScorer
-from judgeval.scorers.judgeval_scorers.api_scorers.contextual_relevancy import ContextualRelevancyScorer
-from judgeval.scorers.judgeval_scorers.api_scorers.contextual_precision import ContextualPrecisionScorer
-from judgeval.scorers.judgeval_scorers.api_scorers.contextual_recall import ContextualRecallScorer
-from judgeval.scorers.judgeval_scorers.api_scorers.answer_relevancy import AnswerRelevancyScorer
-from judgeval.scorers.judgeval_scorers.api_scorers.answer_correctness import AnswerCorrectnessScorer
+from judgeval.scorers.judgeval_scorers.api_scorers.execution_order import (
+    ExecutionOrderScorer,
+)
+from judgeval.scorers.judgeval_scorers.api_scorers.json_correctness import (
+    JSONCorrectnessScorer,
+)
+from judgeval.scorers.judgeval_scorers.api_scorers.summarization import (
+    SummarizationScorer,
+)
+from judgeval.scorers.judgeval_scorers.api_scorers.hallucination import (
+    HallucinationScorer,
+)
+from judgeval.scorers.judgeval_scorers.api_scorers.faithfulness import (
+    FaithfulnessScorer,
+)
+from judgeval.scorers.judgeval_scorers.api_scorers.contextual_relevancy import (
+    ContextualRelevancyScorer,
+)
+from judgeval.scorers.judgeval_scorers.api_scorers.contextual_precision import (
+    ContextualPrecisionScorer,
+)
+from judgeval.scorers.judgeval_scorers.api_scorers.contextual_recall import (
+    ContextualRecallScorer,
+)
+from judgeval.scorers.judgeval_scorers.api_scorers.answer_relevancy import (
+    AnswerRelevancyScorer,
+)
+from judgeval.scorers.judgeval_scorers.api_scorers.answer_correctness import (
+    AnswerCorrectnessScorer,
+)
 from judgeval.scorers.judgeval_scorers.api_scorers.comparison import ComparisonScorer
-from judgeval.scorers.judgeval_scorers.api_scorers.instruction_adherence import InstructionAdherenceScorer
-from judgeval.scorers.judgeval_scorers.api_scorers.groundedness import GroundednessScorer
-from judgeval.scorers.judgeval_scorers.api_scorers.derailment_scorer import DerailmentScorer
+from judgeval.scorers.judgeval_scorers.api_scorers.instruction_adherence import (
+    InstructionAdherenceScorer,
+)
+from judgeval.scorers.judgeval_scorers.api_scorers.groundedness import (
+    GroundednessScorer,
+)
+from judgeval.scorers.judgeval_scorers.api_scorers.derailment_scorer import (
+    DerailmentScorer,
+)
 from judgeval.scorers.judgeval_scorers.api_scorers.tool_order import ToolOrderScorer
-from judgeval.scorers.judgeval_scorers.api_scorers.classifier_scorer import ClassifierScorer
-from judgeval.scorers.judgeval_scorers.api_scorers.tool_dependency import ToolDependencyScorer
+from judgeval.scorers.judgeval_scorers.api_scorers.classifier_scorer import (
+    ClassifierScorer,
+)
+from judgeval.scorers.judgeval_scorers.api_scorers.tool_dependency import (
+    ToolDependencyScorer,
+)
+
 __all__ = [
     "ExecutionOrderScorer",
     "JSONCorrectnessScorer",
judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py

@@ -10,16 +10,17 @@ from judgeval.scorers.api_scorer import APIJudgmentScorer
 from judgeval.constants import APIScorer
 from judgeval.data import ExampleParams
 
+
 class AnswerCorrectnessScorer(APIJudgmentScorer):
     def __init__(self, threshold: float):
         super().__init__(
-            threshold=threshold,
+            threshold=threshold,
             score_type=APIScorer.ANSWER_CORRECTNESS,
             required_params=[
                 ExampleParams.INPUT,
                 ExampleParams.ACTUAL_OUTPUT,
                 ExampleParams.EXPECTED_OUTPUT,
-            ]
+            ],
         )
 
     @property
judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py

@@ -10,15 +10,16 @@ from judgeval.scorers.api_scorer import APIJudgmentScorer
 from judgeval.constants import APIScorer
 from judgeval.data import ExampleParams
 
+
 class AnswerRelevancyScorer(APIJudgmentScorer):
     def __init__(self, threshold: float):
         super().__init__(
-            threshold=threshold,
+            threshold=threshold,
             score_type=APIScorer.ANSWER_RELEVANCY,
             required_params=[
                 ExampleParams.INPUT,
                 ExampleParams.ACTUAL_OUTPUT,
-            ]
+            ],
         )
 
     @property
judgeval/scorers/judgeval_scorers/api_scorers/classifier_scorer.py

@@ -1,11 +1,11 @@
 from judgeval.scorers.api_scorer import APIJudgmentScorer
 from judgeval.constants import APIScorer
-from typing import List, Mapping, Optional, Dict
-from pydantic import model_serializer
+from typing import List, Mapping, Optional
+
 
 class ClassifierScorer(APIJudgmentScorer):
     """
-    In the Judgment backend, this scorer is implemented as a PromptScorer that takes
+    In the Judgment backend, this scorer is implemented as a PromptScorer that takes
     1. a system role that may involve the Example object
     2. options for scores on the example
 
@@ -14,7 +14,7 @@ class ClassifierScorer(APIJudgmentScorer):
     ex:
     system_role = "You are a judge that evaluates whether the response is positive or negative. The response is: {example.actual_output}"
     options = {"positive": 1, "negative": 0}
-
+
     Args:
         name (str): The name of the scorer
         slug (str): A unique identifier for the scorer
@@ -25,14 +25,15 @@ class ClassifierScorer(APIJudgmentScorer):
         strict_mode (bool): Whether to use strict mode (default: False)
         verbose_mode (bool): Whether to include verbose logging (default: False)
     """
+
     name: Optional[str] = None
     slug: Optional[str] = None
     conversation: Optional[List[dict]] = None
     options: Optional[Mapping[str, float]] = None
     verbose_mode: bool = False
     strict_mode: bool = False
-    include_reason: bool = True,
-    async_mode: bool = True,
+    include_reason: bool = True
+    async_mode: bool = True
     threshold: float = 0.5
 
     def __init__(
@@ -65,26 +66,26 @@ class ClassifierScorer(APIJudgmentScorer):
         Updates the name of the scorer.
         """
         self.name = name
-
+
     def update_threshold(self, threshold: float):
         """
         Updates the threshold of the scorer.
         """
         self.threshold = threshold
-
+
     def update_conversation(self, conversation: List[dict]):
         """
         Updates the conversation with the new conversation.
-
+
         Sample conversation:
         [{'role': 'system', 'content': "Did the chatbot answer the user's question in a kind way?: {{actual_output}}."}]
         """
         self.conversation = conversation
-
+
     def update_options(self, options: Mapping[str, float]):
         """
         Updates the options with the new options.
-
+
         Sample options:
         {"yes": 1, "no": 0}
         """
judgeval/scorers/judgeval_scorers/api_scorers/comparison.py

@@ -10,34 +10,36 @@ from judgeval.scorers.api_scorer import APIJudgmentScorer
 from judgeval.constants import APIScorer
 from typing import Optional, Dict
 from judgeval.data import ExampleParams
+
+
 class ComparisonScorer(APIJudgmentScorer):
     kwargs: Optional[Dict] = None
 
     def __init__(self, threshold: float, criteria: str, description: str):
         super().__init__(
-            threshold=threshold,
+            threshold=threshold,
             score_type=APIScorer.COMPARISON,
             required_params=[
                 ExampleParams.INPUT,
                 ExampleParams.ACTUAL_OUTPUT,
                 ExampleParams.EXPECTED_OUTPUT,
-            ]
+            ],
         )
         self.kwargs = {"criteria": criteria, "description": description}
 
     @property
     def __name__(self):
         return f"Comparison-{self.kwargs['criteria']}"
-
+
     def to_dict(self) -> dict:
         """
         Converts the scorer configuration to a dictionary format.
-
+
         Returns:
             dict: A dictionary containing the scorer's configuration
         """
         return {
             "score_type": self.score_type,
             "threshold": self.threshold,
-            "kwargs": self.kwargs
+            "kwargs": self.kwargs,
         }
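ComparisonScorer keeps its `(threshold, criteria, description)` constructor; only wrapping and trailing commas change. A usage sketch (the criteria and description strings are illustrative):

from judgeval.scorers.judgeval_scorers.api_scorers.comparison import ComparisonScorer

tone = ComparisonScorer(
    threshold=0.5,
    criteria="tone",
    description="Compare how closely the answer's tone matches the expected output.",
)
print(tone.__name__)   # "Comparison-tone"
print(tone.to_dict())  # includes criteria/description under "kwargs"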
judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py

@@ -10,17 +10,18 @@ from judgeval.scorers.api_scorer import APIJudgmentScorer
 from judgeval.constants import APIScorer
 from judgeval.data import ExampleParams
 
+
 class ContextualPrecisionScorer(APIJudgmentScorer):
     def __init__(self, threshold: float):
         super().__init__(
-            threshold=threshold,
+            threshold=threshold,
             score_type=APIScorer.CONTEXTUAL_PRECISION,
             required_params=[
                 ExampleParams.INPUT,
                 ExampleParams.ACTUAL_OUTPUT,
                 ExampleParams.RETRIEVAL_CONTEXT,
                 ExampleParams.EXPECTED_OUTPUT,
-            ]
+            ],
         )
 
     @property
judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py

@@ -14,15 +14,16 @@ from judgeval.data import ExampleParams
 class ContextualRecallScorer(APIJudgmentScorer):
     def __init__(self, threshold: float):
         super().__init__(
-            threshold=threshold,
+            threshold=threshold,
             score_type=APIScorer.CONTEXTUAL_RECALL,
             required_params=[
                 ExampleParams.INPUT,
                 ExampleParams.ACTUAL_OUTPUT,
                 ExampleParams.EXPECTED_OUTPUT,
                 ExampleParams.RETRIEVAL_CONTEXT,
-            ]
+            ],
         )
+
     @property
     def __name__(self):
         return "Contextual Recall"
judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py

@@ -10,20 +10,23 @@ from judgeval.scorers.api_scorer import APIJudgmentScorer
 from judgeval.constants import APIScorer
 from judgeval.data import ExampleParams
 
+
 class ContextualRelevancyScorer(APIJudgmentScorer):
     """
     Scorer that checks if the output of a model is relevant to the retrieval context
     """
+
     def __init__(self, threshold: float):
         super().__init__(
-            threshold=threshold,
+            threshold=threshold,
             score_type=APIScorer.CONTEXTUAL_RELEVANCY,
             required_params=[
                 ExampleParams.INPUT,
                 ExampleParams.ACTUAL_OUTPUT,
                 ExampleParams.RETRIEVAL_CONTEXT,
-            ]
+            ],
         )
+
     @property
     def __name__(self):
         return "Contextual Relevancy"
judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py

@@ -9,10 +9,11 @@ TODO add link to docs page for this scorer
 from judgeval.scorers.api_scorer import APIJudgmentScorer
 from judgeval.constants import APIScorer
 
+
 class DerailmentScorer(APIJudgmentScorer):
     def __init__(self, threshold: float):
         super().__init__(
-            threshold=threshold,
+            threshold=threshold,
             score_type=APIScorer.DERAILMENT,
         )
 
judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py

@@ -8,22 +8,31 @@ TODO add link to docs page for this scorer
 # Internal imports
 from judgeval.scorers.api_scorer import APIJudgmentScorer
 from judgeval.constants import APIScorer
-from typing import Optional, Dict, List
+from typing import Optional, Dict
 from judgeval.data import ExampleParams
 
+
 class ExecutionOrderScorer(APIJudgmentScorer):
     kwargs: Optional[Dict] = None
 
-    def __init__(self, threshold: float, should_exact_match: bool = False, should_consider_ordering: bool = False):
+    def __init__(
+        self,
+        threshold: float,
+        should_exact_match: bool = False,
+        should_consider_ordering: bool = False,
+    ):
         super().__init__(
-            threshold=threshold,
+            threshold=threshold,
             score_type=APIScorer.EXECUTION_ORDER,
             required_params=[
                 ExampleParams.ACTUAL_OUTPUT,
                 ExampleParams.EXPECTED_OUTPUT,
-            ]
+            ],
         )
-        self.kwargs = {"should_exact_match": should_exact_match, "should_consider_ordering": should_consider_ordering}
+        self.kwargs = {
+            "should_exact_match": should_exact_match,
+            "should_consider_ordering": should_consider_ordering,
+        }
 
     @property
     def __name__(self):
@@ -32,12 +41,12 @@ class ExecutionOrderScorer(APIJudgmentScorer):
     def to_dict(self) -> dict:
         """
         Converts the scorer configuration to a dictionary format.
-
+
         Returns:
             dict: A dictionary containing the scorer's configuration
         """
         return {
             "score_type": self.score_type,
             "threshold": self.threshold,
-            "kwargs": self.kwargs
-        }
+            "kwargs": self.kwargs,
+        }
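ExecutionOrderScorer's constructor is rewrapped one keyword per line with the same defaults, and the two flags still travel in `kwargs`. A usage sketch (the flag semantics in the comments are assumptions):

from judgeval.scorers.judgeval_scorers.api_scorers.execution_order import ExecutionOrderScorer

order = ExecutionOrderScorer(
    threshold=1.0,
    should_exact_match=True,        # assumed: require exactly the expected steps
    should_consider_ordering=True,  # assumed: and in the expected order
)
print(order.to_dict()["kwargs"])
# {'should_exact_match': True, 'should_consider_ordering': True}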
judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py

@@ -10,16 +10,17 @@ from judgeval.scorers.api_scorer import APIJudgmentScorer
 from judgeval.constants import APIScorer
 from judgeval.data import ExampleParams
 
+
 class FaithfulnessScorer(APIJudgmentScorer):
     def __init__(self, threshold: float):
         super().__init__(
-            threshold=threshold,
+            threshold=threshold,
             score_type=APIScorer.FAITHFULNESS,
             required_params=[
                 ExampleParams.INPUT,
                 ExampleParams.ACTUAL_OUTPUT,
                 ExampleParams.RETRIEVAL_CONTEXT,
-            ]
+            ],
         )
 
     @property
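FaithfulnessScorer, like the other ready-made scorers in this diff, only gains a trailing comma and a second blank line before the class; it still requires input, actual_output, and retrieval_context on each Example. A minimal sketch confirming the configuration set by the constructor above:

from judgeval.scorers.judgeval_scorers.api_scorers.faithfulness import FaithfulnessScorer

scorer = FaithfulnessScorer(threshold=0.75)
print(scorer.required_params)
# [ExampleParams.INPUT, ExampleParams.ACTUAL_OUTPUT, ExampleParams.RETRIEVAL_CONTEXT]
print(scorer.threshold)  # 0.75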