judgeval 0.0.51__py3-none-any.whl → 0.0.53__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60)
  1. judgeval/common/logger.py +46 -199
  2. judgeval/common/s3_storage.py +2 -6
  3. judgeval/common/tracer.py +182 -262
  4. judgeval/common/utils.py +16 -36
  5. judgeval/constants.py +14 -20
  6. judgeval/data/__init__.py +0 -2
  7. judgeval/data/datasets/dataset.py +6 -10
  8. judgeval/data/datasets/eval_dataset_client.py +25 -27
  9. judgeval/data/example.py +5 -138
  10. judgeval/data/judgment_types.py +214 -0
  11. judgeval/data/result.py +7 -25
  12. judgeval/data/scorer_data.py +28 -40
  13. judgeval/data/scripts/fix_default_factory.py +23 -0
  14. judgeval/data/scripts/openapi_transform.py +123 -0
  15. judgeval/data/tool.py +3 -54
  16. judgeval/data/trace.py +31 -50
  17. judgeval/data/trace_run.py +3 -3
  18. judgeval/evaluation_run.py +16 -23
  19. judgeval/integrations/langgraph.py +11 -12
  20. judgeval/judges/litellm_judge.py +3 -6
  21. judgeval/judges/mixture_of_judges.py +8 -25
  22. judgeval/judges/together_judge.py +3 -6
  23. judgeval/judgment_client.py +22 -24
  24. judgeval/rules.py +7 -19
  25. judgeval/run_evaluation.py +79 -242
  26. judgeval/scorers/__init__.py +4 -20
  27. judgeval/scorers/agent_scorer.py +21 -0
  28. judgeval/scorers/api_scorer.py +28 -38
  29. judgeval/scorers/base_scorer.py +98 -0
  30. judgeval/scorers/example_scorer.py +19 -0
  31. judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +0 -20
  32. judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +10 -17
  33. judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +9 -24
  34. judgeval/scorers/judgeval_scorers/api_scorers/classifier_scorer.py +16 -68
  35. judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py +4 -12
  36. judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +4 -4
  37. judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +10 -17
  38. judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +4 -4
  39. judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +4 -4
  40. judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +4 -4
  41. judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +18 -14
  42. judgeval/scorers/score.py +45 -330
  43. judgeval/scorers/utils.py +6 -88
  44. judgeval/utils/file_utils.py +4 -6
  45. judgeval/version_check.py +3 -2
  46. {judgeval-0.0.51.dist-info → judgeval-0.0.53.dist-info}/METADATA +3 -2
  47. judgeval-0.0.53.dist-info/RECORD +65 -0
  48. judgeval/data/custom_example.py +0 -19
  49. judgeval/scorers/judgeval_scorer.py +0 -177
  50. judgeval/scorers/judgeval_scorers/api_scorers/comparison.py +0 -45
  51. judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py +0 -29
  52. judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py +0 -29
  53. judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py +0 -32
  54. judgeval/scorers/judgeval_scorers/api_scorers/groundedness.py +0 -28
  55. judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py +0 -38
  56. judgeval/scorers/judgeval_scorers/api_scorers/summarization.py +0 -27
  57. judgeval/scorers/prompt_scorer.py +0 -296
  58. judgeval-0.0.51.dist-info/RECORD +0 -69
  59. {judgeval-0.0.51.dist-info → judgeval-0.0.53.dist-info}/WHEEL +0 -0
  60. {judgeval-0.0.51.dist-info → judgeval-0.0.53.dist-info}/licenses/LICENSE.md +0 -0
judgeval/scorers/agent_scorer.py
@@ -0,0 +1,21 @@
+from judgeval.scorers.base_scorer import BaseScorer
+from judgeval.data import Trace
+from typing import List, Optional
+from abc import abstractmethod
+
+from judgeval.common.logger import warning, error
+
+
+class AgentScorer(BaseScorer):
+    @abstractmethod
+    async def a_score_trace(
+        self, trace: Trace, tools: Optional[List] = None, *args, **kwargs
+    ) -> float:
+        """
+        Asynchronously measures the score on a trace
+        """
+        warning("Attempting to call unimplemented a_score_trace method")
+        error("a_score_trace method not implemented")
+        raise NotImplementedError(
+            "You must implement the `a_score_trace` method in your custom scorer"
+        )
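The new AgentScorer is an abstract trace-level scorer: a subclass must override a_score_trace before it can be used. A minimal sketch of a subclass (the ToolUsageScorer name and its scoring heuristic are invented for illustration):

    from typing import List, Optional

    from judgeval.data import Trace
    from judgeval.scorers.agent_scorer import AgentScorer


    class ToolUsageScorer(AgentScorer):  # hypothetical subclass
        score_type: str = "Tool Usage"

        async def a_score_trace(
            self, trace: Trace, tools: Optional[List] = None, *args, **kwargs
        ) -> float:
            # Toy heuristic: full score whenever the agent had tools available
            return 1.0 if tools else 0.0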
judgeval/scorers/api_scorer.py
@@ -6,27 +6,35 @@ Scores `Example`s using ready-made Judgment evaluators.
 
 from pydantic import BaseModel, field_validator
 from typing import List
-from judgeval.common.logger import debug, info, error
 from judgeval.data import ExampleParams
-from judgeval.constants import APIScorer, UNBOUNDED_SCORERS
+from judgeval.constants import APIScorerType, UNBOUNDED_SCORERS
+from judgeval.common.logger import judgeval_logger
 
 
-class APIJudgmentScorer(BaseModel):
+class APIScorerConfig(BaseModel):
     """
-    Class for ready-made, "out-of-the-box" scorer that uses Judgment evaluators to score `Example`s.
+    Scorer config that is used to send to our Judgment server.
 
     Args:
         score_type (APIScorer): The Judgment metric to use for scoring `Example`s
+        name (str): The name of the scorer, usually this is the same as the score_type
         threshold (float): A value between 0 and 1 that determines the scoring threshold
+        strict_mode (bool): Whether to use strict mode for the scorer
+        required_params (List[ExampleParams]): List of the required parameters on examples for the scorer
+        kwargs (dict): Additional keyword arguments to pass to the scorer
     """
 
-    score_type: APIScorer
-    threshold: float
+    score_type: APIScorerType
+    name: str = ""
+    threshold: float = 0.5
+    strict_mode: bool = False
     required_params: List[
         ExampleParams
-    ] = []  # List of the required parameters on examples for the scorer
+    ] = []  # This is used to check if the example has the required parameters before running the scorer
+    kwargs: dict = {}
 
     @field_validator("threshold")
+    @classmethod
     def validate_threshold(cls, v, info):
         """
         Validates that the threshold is between 0 and 1 inclusive.
@@ -34,47 +42,29 @@ class APIJudgmentScorer(BaseModel):
         score_type = info.data.get("score_type")
         if score_type in UNBOUNDED_SCORERS:
             if v < 0:
-                error(f"Threshold for {score_type} must be greater than 0, got: {v}")
+                judgeval_logger.error(
+                    f"Threshold for {score_type} must be greater than 0, got: {v}"
+                )
                 raise ValueError(
                     f"Threshold for {score_type} must be greater than 0, got: {v}"
                 )
         else:
             if not 0 <= v <= 1:
-                error(f"Threshold for {score_type} must be between 0 and 1, got: {v}")
+                judgeval_logger.error(
+                    f"Threshold for {score_type} must be between 0 and 1, got: {v}"
+                )
                 raise ValueError(
                     f"Threshold for {score_type} must be between 0 and 1, got: {v}"
                 )
         return v
 
-    @field_validator("score_type")
-    def convert_to_enum_value(cls, v):
-        """
-        Validates that the `score_type` is a valid `APIScorer` enum value.
-        Converts string values to `APIScorer` enum values.
-        """
-        debug(f"Attempting to convert score_type value: {v}")
-        if isinstance(v, APIScorer):
-            info(f"Using existing APIScorer: {v}")
-            return v
-        elif isinstance(v, str):
-            debug(f"Converting string value to APIScorer enum: {v}")
-            return APIScorer[v.upper()]
-        error(f"Invalid score_type value: {v}")
-        raise ValueError(f"Invalid value for score_type: {v}")
+    @field_validator("name", mode="after")
+    @classmethod
+    def set_name_to_score_type_if_none(cls, v, info):
+        """Set name to score_type if not provided"""
+        if v is None:
+            return info.data.get("score_type")
+        return v
 
     def __str__(self):
         return f"JudgmentScorer(score_type={self.score_type.value}, threshold={self.threshold})"
-
-    def to_dict(self) -> dict:
-        """
-        Converts the scorer configuration to a dictionary format.
-
-        Returns:
-            dict: A dictionary containing the scorer's configuration
-        """
-        return {
-            "score_type": str(
-                self.score_type.value
-            ),  # Convert enum to string for serialization
-            "threshold": self.threshold,
-        }
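The renamed APIScorerConfig now validates thresholds but no longer coerces score_type strings. Note also that the new name validator checks for None while the field defaults to an empty string, so the score_type fallback only fires when name=None is passed explicitly. A sketch of the threshold validation, assuming FAITHFULNESS is not among the UNBOUNDED_SCORERS:

    from judgeval.constants import APIScorerType
    from judgeval.scorers.api_scorer import APIScorerConfig

    config = APIScorerConfig(score_type=APIScorerType.FAITHFULNESS, threshold=0.7)

    try:
        # Bounded scorers reject thresholds outside [0, 1]
        APIScorerConfig(score_type=APIScorerType.FAITHFULNESS, threshold=1.5)
    except ValueError as exc:
        print(exc)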
judgeval/scorers/base_scorer.py
@@ -0,0 +1,98 @@
+"""
+Base class for all scorers.
+"""
+
+from typing import Dict, Optional
+
+from pydantic import BaseModel
+
+
+from judgeval.judges.utils import create_judge
+from typing import Any
+from pydantic import model_validator, Field
+from judgeval.common.logger import judgeval_logger
+
+
+class BaseScorer(BaseModel):
+    """
+    If you want to create a scorer that does not fall under any of the ready-made Judgment scorers,
+    you can create a custom scorer by extending this class. This is best used for special use cases
+    where none of Judgment's scorers are suitable.
+    """
+
+    score_type: str  # type of your scorer (Faithfulness, PromptScorer)
+    threshold: float = (
+        0.5  # The threshold to pass a test while using this scorer as a scorer
+    )
+    name: Optional[str] = (
+        None  # name of your scorer (Faithfulness, PromptScorer-randomslug)
+    )
+    score: Optional[float] = None  # The float score of the scorer run on the test case
+    score_breakdown: Optional[Dict] = None
+    reason: Optional[str] = ""
+    using_native_model: Optional[bool] = None  # Whether the model is a native model
+    success: Optional[bool] = None  # Whether the test case passed or failed
+    model: Optional[Any] = Field(
+        default=None, exclude=True
+    )  # The model used to evaluate the test case
+    evaluation_model: Optional[str] = None  # The model used to evaluate the test case
+    strict_mode: bool = False  # Whether to run the scorer in strict mode
+    error: Optional[str] = None  # The error message if the scorer failed
+    additional_metadata: Optional[Dict] = None  # Additional metadata for the scorer
+    user: Optional[str] = None  # The user ID of the scorer
+
+    @model_validator(mode="before")
+    @classmethod
+    def enforce_strict_threshold(cls, data: dict):
+        if data.get("strict_mode"):
+            data["threshold"] = 1.0
+        return data
+
+    @model_validator(mode="after")
+    @classmethod
+    def default_name(cls, m: "BaseScorer") -> "BaseScorer":
+        if not m.name:
+            # Try to use the class name if it exists and is not empty
+            class_name = getattr(m, "__class__", None)
+            if class_name and getattr(m.__class__, "__name__", None):
+                m.name = m.__class__.__name__
+            else:
+                m.name = m.score_type
+        return m
+
+    def _add_model(self, model: str):
+        """
+        Adds the evaluation model to the BaseScorer instance
+
+        This method is used at eval time
+        """
+        self.model, self.using_native_model = create_judge(model)
+        self.evaluation_model = self.model.get_model_name()
+
+    def success_check(self) -> bool:
+        """
+        For unit testing, determines whether the test case passes or fails
+        """
+        if self.error:
+            return False
+        if self.score is None:
+            return False
+        return self.score >= self.threshold
+
+    def __str__(self):
+        if self.error:
+            judgeval_logger.warning(f"BaseScorer contains error: {self.error}")
+        attributes = {
+            "score_type": self.score_type,
+            "threshold": self.threshold,
+            "score": self.score,
+            "score_breakdown": self.score_breakdown,
+            "reason": self.reason,
+            "success": self.success,
+            "model": self.model,
+            "evaluation_model": self.evaluation_model,
+            "strict_mode": self.strict_mode,
+            "error": self.error,
+            "additional_metadata": self.additional_metadata,
+        }
+        return f"BaseScorer({attributes})"
@@ -0,0 +1,19 @@
1
+ from judgeval.scorers.base_scorer import BaseScorer
2
+ from judgeval.data import Example
3
+ from typing import List
4
+ from pydantic import Field
5
+ from judgeval.common.logger import judgeval_logger
6
+
7
+
8
+ class ExampleScorer(BaseScorer):
9
+ score_type: str = "Custom" # default to custom score type
10
+ required_params: List[str] = Field(default_factory=list)
11
+
12
+ async def a_score_example(self, example: Example, *args, **kwargs) -> float:
13
+ """
14
+ Asynchronously measures the score on a single example
15
+ """
16
+ judgeval_logger.error("a_score_example method not implemented")
17
+ raise NotImplementedError(
18
+ "You must implement the `a_score_example` method in your custom scorer"
19
+ )
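ExampleScorer is the example-level counterpart of AgentScorer. A minimal custom-scorer sketch (the ExactMatchScorer is hypothetical, and it assumes Example exposes actual_output and expected_output fields, consistent with the ExampleParams used elsewhere in this diff):

    from judgeval.data import Example
    from judgeval.scorers.example_scorer import ExampleScorer


    class ExactMatchScorer(ExampleScorer):  # hypothetical custom scorer
        name: str = "Exact Match"

        async def a_score_example(self, example: Example, *args, **kwargs) -> float:
            # Full score only when the output matches the expectation exactly
            return 1.0 if example.actual_output == example.expected_output else 0.0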
judgeval/scorers/judgeval_scorers/api_scorers/__init__.py
@@ -1,40 +1,21 @@
 from judgeval.scorers.judgeval_scorers.api_scorers.execution_order import (
     ExecutionOrderScorer,
 )
-from judgeval.scorers.judgeval_scorers.api_scorers.json_correctness import (
-    JSONCorrectnessScorer,
-)
-from judgeval.scorers.judgeval_scorers.api_scorers.summarization import (
-    SummarizationScorer,
-)
 from judgeval.scorers.judgeval_scorers.api_scorers.hallucination import (
     HallucinationScorer,
 )
 from judgeval.scorers.judgeval_scorers.api_scorers.faithfulness import (
     FaithfulnessScorer,
 )
-from judgeval.scorers.judgeval_scorers.api_scorers.contextual_relevancy import (
-    ContextualRelevancyScorer,
-)
-from judgeval.scorers.judgeval_scorers.api_scorers.contextual_precision import (
-    ContextualPrecisionScorer,
-)
-from judgeval.scorers.judgeval_scorers.api_scorers.contextual_recall import (
-    ContextualRecallScorer,
-)
 from judgeval.scorers.judgeval_scorers.api_scorers.answer_relevancy import (
     AnswerRelevancyScorer,
 )
 from judgeval.scorers.judgeval_scorers.api_scorers.answer_correctness import (
     AnswerCorrectnessScorer,
 )
-from judgeval.scorers.judgeval_scorers.api_scorers.comparison import ComparisonScorer
 from judgeval.scorers.judgeval_scorers.api_scorers.instruction_adherence import (
     InstructionAdherenceScorer,
 )
-from judgeval.scorers.judgeval_scorers.api_scorers.groundedness import (
-    GroundednessScorer,
-)
 from judgeval.scorers.judgeval_scorers.api_scorers.derailment_scorer import (
     DerailmentScorer,
 )
@@ -57,7 +38,6 @@ __all__ = [
     "ContextualRecallScorer",
     "AnswerRelevancyScorer",
     "AnswerCorrectnessScorer",
-    "ComparisonScorer",
     "InstructionAdherenceScorer",
     "GroundednessScorer",
     "DerailmentScorer",
judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py
@@ -6,23 +6,16 @@ TODO add link to docs page for this scorer
 """
 
 # Internal imports
-from judgeval.scorers.api_scorer import APIJudgmentScorer
-from judgeval.constants import APIScorer
+from judgeval.scorers.api_scorer import APIScorerConfig
+from judgeval.constants import APIScorerType
 from judgeval.data import ExampleParams
+from typing import List
 
 
-class AnswerCorrectnessScorer(APIJudgmentScorer):
-    def __init__(self, threshold: float):
-        super().__init__(
-            threshold=threshold,
-            score_type=APIScorer.ANSWER_CORRECTNESS,
-            required_params=[
-                ExampleParams.INPUT,
-                ExampleParams.ACTUAL_OUTPUT,
-                ExampleParams.EXPECTED_OUTPUT,
-            ],
-        )
-
-    @property
-    def __name__(self):
-        return "Answer Correctness"
+class AnswerCorrectnessScorer(APIScorerConfig):
+    score_type: APIScorerType = APIScorerType.ANSWER_CORRECTNESS
+    required_params: List[ExampleParams] = [
+        ExampleParams.INPUT,
+        ExampleParams.ACTUAL_OUTPUT,
+        ExampleParams.EXPECTED_OUTPUT,
+    ]
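This is the recurring migration pattern in this release (also applied to AnswerRelevancyScorer, FaithfulnessScorer, and DerailmentScorer below): the __init__ override is replaced by declarative pydantic fields. One practical consequence, sketched here, is that the old positional call no longer works and the threshold falls back to the APIScorerConfig default of 0.5:

    from judgeval.scorers.judgeval_scorers.api_scorers.answer_correctness import (
        AnswerCorrectnessScorer,
    )

    scorer = AnswerCorrectnessScorer(threshold=0.8)  # 0.0.51 allowed AnswerCorrectnessScorer(0.8)
    default = AnswerCorrectnessScorer()              # threshold now defaults to 0.5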
judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py
@@ -1,27 +1,12 @@
-"""
-`judgeval` answer relevancy scorer
-
-TODO add link to docs page for this scorer
-
-"""
-
-# Internal imports
-from judgeval.scorers.api_scorer import APIJudgmentScorer
-from judgeval.constants import APIScorer
+from judgeval.scorers.api_scorer import APIScorerConfig
+from judgeval.constants import APIScorerType
 from judgeval.data import ExampleParams
+from typing import List
 
 
-class AnswerRelevancyScorer(APIJudgmentScorer):
-    def __init__(self, threshold: float):
-        super().__init__(
-            threshold=threshold,
-            score_type=APIScorer.ANSWER_RELEVANCY,
-            required_params=[
-                ExampleParams.INPUT,
-                ExampleParams.ACTUAL_OUTPUT,
-            ],
-        )
-
-    @property
-    def __name__(self):
-        return "Answer Relevancy"
+class AnswerRelevancyScorer(APIScorerConfig):
+    score_type: APIScorerType = APIScorerType.ANSWER_RELEVANCY
+    required_params: List[ExampleParams] = [
+        ExampleParams.INPUT,
+        ExampleParams.ACTUAL_OUTPUT,
+    ]
judgeval/scorers/judgeval_scorers/api_scorers/classifier_scorer.py
@@ -1,9 +1,9 @@
-from judgeval.scorers.api_scorer import APIJudgmentScorer
-from judgeval.constants import APIScorer
-from typing import List, Mapping, Optional
+from judgeval.scorers.api_scorer import APIScorerConfig
+from judgeval.constants import APIScorerType
+from typing import List, Mapping, Optional, Dict, Any
 
 
-class ClassifierScorer(APIJudgmentScorer):
+class ClassifierScorer(APIScorerConfig):
     """
     In the Judgment backend, this scorer is implemented as a PromptScorer that takes
     1. a system role that may involve the Example object
@@ -16,50 +16,15 @@ class ClassifierScorer(APIJudgmentScorer):
     options = {"positive": 1, "negative": 0}
 
     Args:
-        name (str): The name of the scorer
         slug (str): A unique identifier for the scorer
         conversation (List[dict]): The conversation template with placeholders (e.g., {{actual_output}})
        options (Mapping[str, float]): A mapping of classification options to their corresponding scores
-        threshold (float): The threshold for determining success (default: 0.5)
-        include_reason (bool): Whether to include reasoning in the response (default: True)
-        strict_mode (bool): Whether to use strict mode (default: False)
-        verbose_mode (bool): Whether to include verbose logging (default: False)
     """
 
-    name: Optional[str] = None
     slug: Optional[str] = None
     conversation: Optional[List[dict]] = None
     options: Optional[Mapping[str, float]] = None
-    verbose_mode: bool = False
-    strict_mode: bool = False
-    include_reason: bool = True
-    async_mode: bool = True
-    threshold: float = 0.5
-
-    def __init__(
-        self,
-        name: str,
-        slug: str,
-        conversation: List[dict],
-        options: Mapping[str, float],
-        threshold: float = 0.5,
-        include_reason: bool = True,
-        strict_mode: bool = False,
-        verbose_mode: bool = False,
-        async_mode: bool = True,
-    ):
-        super().__init__(
-            threshold=threshold,
-            score_type=APIScorer.CLASSIFIER,
-        )
-        self.name = name
-        self.verbose_mode = verbose_mode
-        self.strict_mode = strict_mode
-        self.include_reason = include_reason
-        self.slug = slug
-        self.conversation = conversation
-        self.options = options
-        self.async_mode = async_mode
+    score_type: APIScorerType = APIScorerType.PROMPT_SCORER
 
     def update_name(self, name: str):
         """
@@ -94,32 +59,15 @@ class ClassifierScorer(APIJudgmentScorer):
     def __str__(self):
         return f"ClassifierScorer(name={self.name}, slug={self.slug}, conversation={self.conversation}, threshold={self.threshold}, options={self.options})"
 
-    # @model_serializer
-    # def serialize_model(self) -> dict:
-    #     """
-    #     Defines how the ClassifierScorer should be serialized when model_dump() is called.
-    #     """
-    #     return {
-    #         "name": self.name,
-    #         "score_type": self.name,
-    #         "conversation": self.conversation,
-    #         "options": self.options,
-    #         "threshold": self.threshold,
-    #         "include_reason": self.include_reason,
-    #         "async_mode": self.async_mode,
-    #         "strict_mode": self.strict_mode,
-    #         "verbose_mode": self.verbose_mode,
-    #     }
-
-    def to_dict(self) -> dict:
-        return {
-            "name": self.name,
-            "score_type": self.name,
-            "conversation": self.conversation,
-            "options": self.options,
-            "threshold": self.threshold,
-            "include_reason": self.include_reason,
-            "async_mode": self.async_mode,
-            "strict_mode": self.strict_mode,
-            "verbose_mode": self.verbose_mode,
+    def model_dump(self, *args, **kwargs) -> Dict[str, Any]:
+        base = super().model_dump(*args, **kwargs)
+        base_fields = set(APIScorerConfig.model_fields.keys())
+        all_fields = set(self.__class__.model_fields.keys())
+
+        extra_fields = all_fields - base_fields - {"kwargs"}
+
+        base["kwargs"] = {
+            k: getattr(self, k) for k in extra_fields if getattr(self, k) is not None
         }
+
+        return base
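The removed to_dict is superseded by a model_dump override that folds subclass-only fields into the kwargs payload sent to the server. A sketch of the effect (the field values here are invented):

    from judgeval.scorers.judgeval_scorers.api_scorers.classifier_scorer import (
        ClassifierScorer,
    )

    scorer = ClassifierScorer(
        name="Sentiment",
        slug="sentiment-v1",
        conversation=[{"role": "system", "content": "Classify: {{actual_output}}"}],
        options={"positive": 1.0, "negative": 0.0},
    )

    payload = scorer.model_dump()
    # slug, conversation, and options are non-None subclass-only fields,
    # so they are collected under payload["kwargs"]; base APIScorerConfig
    # fields (name, threshold, score_type, ...) stay at the top level.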
judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py
@@ -6,17 +6,9 @@ TODO add link to docs page for this scorer
 """
 
 # Internal imports
-from judgeval.scorers.api_scorer import APIJudgmentScorer
-from judgeval.constants import APIScorer
+from judgeval.scorers.api_scorer import APIScorerConfig
+from judgeval.constants import APIScorerType
 
 
-class DerailmentScorer(APIJudgmentScorer):
-    def __init__(self, threshold: float):
-        super().__init__(
-            threshold=threshold,
-            score_type=APIScorer.DERAILMENT,
-        )
-
-    @property
-    def __name__(self):
-        return "Derailment"
+class DerailmentScorer(APIScorerConfig):
+    score_type: APIScorerType = APIScorerType.DERAILMENT
judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py
@@ -6,13 +6,13 @@ TODO add link to docs page for this scorer
 """
 
 # Internal imports
-from judgeval.scorers.api_scorer import APIJudgmentScorer
-from judgeval.constants import APIScorer
+from judgeval.scorers.api_scorer import APIScorerConfig
+from judgeval.constants import APIScorerType
 from typing import Optional, Dict
 from judgeval.data import ExampleParams
 
 
-class ExecutionOrderScorer(APIJudgmentScorer):
+class ExecutionOrderScorer(APIScorerConfig):
     kwargs: Optional[Dict] = None
 
     def __init__(
@@ -23,7 +23,7 @@ class ExecutionOrderScorer(APIJudgmentScorer):
     ):
         super().__init__(
             threshold=threshold,
-            score_type=APIScorer.EXECUTION_ORDER,
+            score_type=APIScorerType.EXECUTION_ORDER,
             required_params=[
                 ExampleParams.ACTUAL_OUTPUT,
                 ExampleParams.EXPECTED_OUTPUT,
judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py
@@ -6,23 +6,16 @@ TODO add link to docs page for this scorer
 """
 
 # Internal imports
-from judgeval.scorers.api_scorer import APIJudgmentScorer
-from judgeval.constants import APIScorer
+from judgeval.scorers.api_scorer import APIScorerConfig
+from judgeval.constants import APIScorerType
 from judgeval.data import ExampleParams
+from typing import List
 
 
-class FaithfulnessScorer(APIJudgmentScorer):
-    def __init__(self, threshold: float):
-        super().__init__(
-            threshold=threshold,
-            score_type=APIScorer.FAITHFULNESS,
-            required_params=[
-                ExampleParams.INPUT,
-                ExampleParams.ACTUAL_OUTPUT,
-                ExampleParams.RETRIEVAL_CONTEXT,
-            ],
-        )
-
-    @property
-    def __name__(self):
-        return "Faithfulness"
+class FaithfulnessScorer(APIScorerConfig):
+    score_type: APIScorerType = APIScorerType.FAITHFULNESS
+    required_params: List[ExampleParams] = [
+        ExampleParams.INPUT,
+        ExampleParams.ACTUAL_OUTPUT,
+        ExampleParams.RETRIEVAL_CONTEXT,
+    ]
judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py
@@ -6,16 +6,16 @@ TODO add link to docs page for this scorer
 """
 
 # Internal imports
-from judgeval.scorers.api_scorer import APIJudgmentScorer
-from judgeval.constants import APIScorer
+from judgeval.scorers.api_scorer import APIScorerConfig
+from judgeval.constants import APIScorerType
 from judgeval.data import ExampleParams
 
 
-class HallucinationScorer(APIJudgmentScorer):
+class HallucinationScorer(APIScorerConfig):
     def __init__(self, threshold: float):
         super().__init__(
             threshold=threshold,
-            score_type=APIScorer.HALLUCINATION,
+            score_type=APIScorerType.HALLUCINATION,
             required_params=[
                 ExampleParams.INPUT,
                 ExampleParams.ACTUAL_OUTPUT,
judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py
@@ -6,16 +6,16 @@ TODO add link to docs page for this scorer
 """
 
 # Internal imports
-from judgeval.scorers.api_scorer import APIJudgmentScorer
-from judgeval.constants import APIScorer
+from judgeval.scorers.api_scorer import APIScorerConfig
+from judgeval.constants import APIScorerType
 from judgeval.data import ExampleParams
 
 
-class InstructionAdherenceScorer(APIJudgmentScorer):
+class InstructionAdherenceScorer(APIScorerConfig):
     def __init__(self, threshold: float):
         super().__init__(
             threshold=threshold,
-            score_type=APIScorer.INSTRUCTION_ADHERENCE,
+            score_type=APIScorerType.INSTRUCTION_ADHERENCE,
             required_params=[
                 ExampleParams.INPUT,
                 ExampleParams.ACTUAL_OUTPUT,
judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py
@@ -3,16 +3,16 @@
 """
 
 # Internal imports
-from judgeval.scorers.api_scorer import APIJudgmentScorer
-from judgeval.constants import APIScorer
+from judgeval.scorers.api_scorer import APIScorerConfig
+from judgeval.constants import APIScorerType
 from typing import Optional, Dict
 
 
-class ToolDependencyScorer(APIJudgmentScorer):
+class ToolDependencyScorer(APIScorerConfig):
     kwargs: Optional[Dict] = None
 
     def __init__(self, threshold: float = 1.0, enable_param_checking: bool = True):
-        super().__init__(threshold=threshold, score_type=APIScorer.TOOL_DEPENDENCY)
+        super().__init__(threshold=threshold, score_type=APIScorerType.TOOL_DEPENDENCY)
         self.kwargs = {"enable_param_checking": enable_param_checking}
 
     @property
judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py
@@ -3,21 +3,25 @@
 """
 
 # Internal imports
-from judgeval.scorers.api_scorer import APIJudgmentScorer
-from judgeval.constants import APIScorer
-from typing import Optional, Dict
+from judgeval.scorers.api_scorer import APIScorerConfig
+from judgeval.constants import APIScorerType
+from typing import Dict, Any
 
 
-class ToolOrderScorer(APIJudgmentScorer):
-    kwargs: Optional[Dict] = None
+class ToolOrderScorer(APIScorerConfig):
+    score_type: APIScorerType = APIScorerType.TOOL_ORDER
+    threshold: float = 1.0
+    exact_match: bool = False
 
-    def __init__(self, threshold: float = 1.0, exact_match: bool = False):
-        super().__init__(
-            threshold=threshold,
-            score_type=APIScorer.TOOL_ORDER,
-        )
-        self.kwargs = {"exact_match": exact_match}
+    def model_dump(self, *args, **kwargs) -> Dict[str, Any]:
+        base = super().model_dump(*args, **kwargs)
+        base_fields = set(APIScorerConfig.model_fields.keys())
+        all_fields = set(self.__class__.model_fields.keys())
 
-    @property
-    def __name__(self):
-        return "Tool Order"
+        extra_fields = all_fields - base_fields - {"kwargs"}
+
+        base["kwargs"] = {
+            k: getattr(self, k) for k in extra_fields if getattr(self, k) is not None
+        }
+
+        return base
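ToolOrderScorer gets the same model_dump treatment as ClassifierScorer: exact_match moves from a hand-built kwargs dict in __init__ to a declared field that serialization folds back under kwargs. Expected usage, as a sketch:

    from judgeval.scorers.judgeval_scorers.api_scorers.tool_order import ToolOrderScorer

    scorer = ToolOrderScorer(exact_match=True)
    payload = scorer.model_dump()
    # payload["kwargs"] should come out as {"exact_match": True}, since
    # exact_match is the only field not declared on APIScorerConfig;
    # threshold now defaults to 1.0 via the field declaration.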