judgeval 0.8.0__py3-none-any.whl → 0.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96)
  1. judgeval/__init__.py +139 -12
  2. judgeval/api/__init__.py +501 -0
  3. judgeval/api/api_types.py +344 -0
  4. judgeval/cli.py +2 -4
  5. judgeval/constants.py +10 -26
  6. judgeval/data/evaluation_run.py +49 -26
  7. judgeval/data/example.py +2 -2
  8. judgeval/data/judgment_types.py +266 -82
  9. judgeval/data/result.py +4 -5
  10. judgeval/data/scorer_data.py +4 -2
  11. judgeval/data/tool.py +2 -2
  12. judgeval/data/trace.py +7 -50
  13. judgeval/data/trace_run.py +7 -4
  14. judgeval/{dataset.py → dataset/__init__.py} +43 -28
  15. judgeval/env.py +67 -0
  16. judgeval/{run_evaluation.py → evaluation/__init__.py} +29 -95
  17. judgeval/exceptions.py +27 -0
  18. judgeval/integrations/langgraph/__init__.py +788 -0
  19. judgeval/judges/__init__.py +2 -2
  20. judgeval/judges/litellm_judge.py +75 -15
  21. judgeval/judges/together_judge.py +86 -18
  22. judgeval/judges/utils.py +7 -21
  23. judgeval/{common/logger.py → logger.py} +8 -6
  24. judgeval/scorers/__init__.py +0 -4
  25. judgeval/scorers/agent_scorer.py +3 -7
  26. judgeval/scorers/api_scorer.py +8 -13
  27. judgeval/scorers/base_scorer.py +52 -32
  28. judgeval/scorers/example_scorer.py +1 -3
  29. judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +0 -14
  30. judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +45 -20
  31. judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +2 -2
  32. judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +3 -3
  33. judgeval/scorers/score.py +21 -31
  34. judgeval/scorers/trace_api_scorer.py +5 -0
  35. judgeval/scorers/utils.py +1 -103
  36. judgeval/tracer/__init__.py +1075 -2
  37. judgeval/tracer/constants.py +1 -0
  38. judgeval/tracer/exporters/__init__.py +37 -0
  39. judgeval/tracer/exporters/s3.py +119 -0
  40. judgeval/tracer/exporters/store.py +43 -0
  41. judgeval/tracer/exporters/utils.py +32 -0
  42. judgeval/tracer/keys.py +67 -0
  43. judgeval/tracer/llm/__init__.py +1233 -0
  44. judgeval/{common/tracer → tracer/llm}/providers.py +5 -10
  45. judgeval/{local_eval_queue.py → tracer/local_eval_queue.py} +15 -10
  46. judgeval/tracer/managers.py +188 -0
  47. judgeval/tracer/processors/__init__.py +181 -0
  48. judgeval/tracer/utils.py +20 -0
  49. judgeval/trainer/__init__.py +5 -0
  50. judgeval/{common/trainer → trainer}/config.py +12 -9
  51. judgeval/{common/trainer → trainer}/console.py +2 -9
  52. judgeval/{common/trainer → trainer}/trainable_model.py +12 -7
  53. judgeval/{common/trainer → trainer}/trainer.py +119 -17
  54. judgeval/utils/async_utils.py +2 -3
  55. judgeval/utils/decorators.py +24 -0
  56. judgeval/utils/file_utils.py +37 -4
  57. judgeval/utils/guards.py +32 -0
  58. judgeval/utils/meta.py +14 -0
  59. judgeval/{common/api/json_encoder.py → utils/serialize.py} +7 -1
  60. judgeval/utils/testing.py +88 -0
  61. judgeval/utils/url.py +10 -0
  62. judgeval/{version_check.py → utils/version_check.py} +3 -3
  63. judgeval/version.py +5 -0
  64. judgeval/warnings.py +4 -0
  65. {judgeval-0.8.0.dist-info → judgeval-0.9.0.dist-info}/METADATA +12 -14
  66. judgeval-0.9.0.dist-info/RECORD +80 -0
  67. judgeval/clients.py +0 -35
  68. judgeval/common/__init__.py +0 -13
  69. judgeval/common/api/__init__.py +0 -3
  70. judgeval/common/api/api.py +0 -375
  71. judgeval/common/api/constants.py +0 -186
  72. judgeval/common/exceptions.py +0 -27
  73. judgeval/common/storage/__init__.py +0 -6
  74. judgeval/common/storage/s3_storage.py +0 -97
  75. judgeval/common/tracer/__init__.py +0 -31
  76. judgeval/common/tracer/constants.py +0 -22
  77. judgeval/common/tracer/core.py +0 -2427
  78. judgeval/common/tracer/otel_exporter.py +0 -108
  79. judgeval/common/tracer/otel_span_processor.py +0 -188
  80. judgeval/common/tracer/span_processor.py +0 -37
  81. judgeval/common/tracer/span_transformer.py +0 -207
  82. judgeval/common/tracer/trace_manager.py +0 -101
  83. judgeval/common/trainer/__init__.py +0 -5
  84. judgeval/common/utils.py +0 -948
  85. judgeval/integrations/langgraph.py +0 -844
  86. judgeval/judges/mixture_of_judges.py +0 -287
  87. judgeval/judgment_client.py +0 -267
  88. judgeval/rules.py +0 -521
  89. judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +0 -52
  90. judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -28
  91. judgeval/utils/alerts.py +0 -93
  92. judgeval/utils/requests.py +0 -50
  93. judgeval-0.8.0.dist-info/RECORD +0 -82
  94. {judgeval-0.8.0.dist-info → judgeval-0.9.0.dist-info}/WHEEL +0 -0
  95. {judgeval-0.8.0.dist-info → judgeval-0.9.0.dist-info}/entry_points.txt +0 -0
  96. {judgeval-0.8.0.dist-info → judgeval-0.9.0.dist-info}/licenses/LICENSE.md +0 -0
judgeval/judges/__init__.py CHANGED
@@ -1,6 +1,6 @@
  from judgeval.judges.base_judge import JudgevalJudge
  from judgeval.judges.litellm_judge import LiteLLMJudge
  from judgeval.judges.together_judge import TogetherJudge
- from judgeval.judges.mixture_of_judges import MixtureOfJudges

- __all__ = ["JudgevalJudge", "LiteLLMJudge", "TogetherJudge", "MixtureOfJudges"]
+
+ __all__ = ["JudgevalJudge", "LiteLLMJudge", "TogetherJudge"]
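Note: with MixtureOfJudges removed in 0.9.0, only the three remaining judge classes are exported from judgeval.judges. A minimal sketch of the new import surface, using only names shown in the diff above:

from judgeval.judges import JudgevalJudge, LiteLLMJudge, TogetherJudge

# Code that previously imported MixtureOfJudges must migrate to a single
# LiteLLMJudge or TogetherJudge (or a custom JudgevalJudge subclass).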
judgeval/judges/litellm_judge.py CHANGED
@@ -1,21 +1,77 @@
  import pydantic
- from typing import List, Union, Mapping
+ from typing import Dict, List, Union, Mapping, Any

+ from judgeval.constants import ACCEPTABLE_MODELS
  from judgeval.judges import JudgevalJudge
- from judgeval.common.utils import (
-     afetch_litellm_api_response,
-     fetch_litellm_api_response,
- )
- from judgeval.common.logger import judgeval_logger
- from judgeval.constants import DEFAULT_GPT_MODEL
+ from judgeval.env import JUDGMENT_DEFAULT_GPT_MODEL
+
+ try:
+     import litellm
+ except ImportError:
+     raise ImportError(
+         "Litellm is not installed and required for the litellm judge. Please install it with `pip install litellm`."
+     )
+
+
+ def fetch_litellm_api_response(
+     model: str,
+     messages: List[Dict[str, str]],
+     response_format: Union[Dict[str, Any], None] = None,
+ ) -> str:
+     if response_format is not None:
+         response = litellm.completion(
+             model=model,
+             messages=messages,
+             response_format=response_format,
+         )
+     else:
+         response = litellm.completion(
+             model=model,
+             messages=messages,
+         )
+
+     content = response.choices[0].message.content  # type: ignore[attr-defined]
+     if content is None:
+         raise ValueError("Received empty response from litellm")
+     return content
+
+
+ async def afetch_litellm_api_response(
+     model: str,
+     messages: List[Dict[str, str]],
+     response_format: Union[Dict[str, Any], None] = None,
+ ) -> str:
+     if not messages:
+         raise ValueError("Messages cannot be empty")
+
+     if model not in ACCEPTABLE_MODELS:
+         raise ValueError(
+             f"Model {model} is not in the list of supported models: {ACCEPTABLE_MODELS}."
+         )
+
+     if response_format is not None:
+         response = await litellm.acompletion(
+             model=model, messages=messages, response_format=response_format
+         )
+     else:
+         response = await litellm.acompletion(
+             model=model,
+             messages=messages,
+         )
+
+     content = response.choices[0].message.content  # type: ignore[attr-defined]
+     if content is None:
+         raise ValueError("Received empty response from litellm")
+     return content
+

  BASE_CONVERSATION = [
      {"role": "system", "content": "You are a helpful assistant."},
- ]  # for string inputs, we need to add the user query to a base conversation, since LiteLLM only accepts a list of dictionaries as a chat history
+ ]


  class LiteLLMJudge(JudgevalJudge):
-     def __init__(self, model: str = DEFAULT_GPT_MODEL, **kwargs):
+     def __init__(self, model: str = JUDGMENT_DEFAULT_GPT_MODEL, **kwargs):
          self.model = model
          self.kwargs = kwargs
          super().__init__(model_name=model)
@@ -25,17 +81,19 @@ class LiteLLMJudge(JudgevalJudge):
          input: Union[str, List[Mapping[str, str]]],
          schema: Union[pydantic.BaseModel, None] = None,
      ) -> str:
+         response_format = schema.model_json_schema() if schema else None
+
          if isinstance(input, str):
              convo = BASE_CONVERSATION + [{"role": "user", "content": input}]
              return fetch_litellm_api_response(
-                 model=self.model, messages=convo, response_format=schema
+                 model=self.model, messages=convo, response_format=response_format
              )
          elif isinstance(input, list):
+             messages = [dict(msg) for msg in input]
              return fetch_litellm_api_response(
-                 model=self.model, messages=input, response_format=schema
+                 model=self.model, messages=messages, response_format=response_format
              )
          else:
-             judgeval_logger.error(f"Invalid input type received: {type(input)}")
              raise TypeError(
                  f"Input must be a string or a list of dictionaries. Input type of: {type(input)}"
              )
@@ -45,19 +103,21 @@ class LiteLLMJudge(JudgevalJudge):
          input: Union[str, List[Mapping[str, str]]],
          schema: Union[pydantic.BaseModel, None] = None,
      ) -> str:
+         response_format = schema.model_json_schema() if schema else None
+
          if isinstance(input, str):
              convo = BASE_CONVERSATION + [{"role": "user", "content": input}]
              response = await afetch_litellm_api_response(
-                 model=self.model, messages=convo, response_format=schema
+                 model=self.model, messages=convo, response_format=response_format
              )
              return response
          elif isinstance(input, list):
+             messages = [dict(msg) for msg in input]
              response = await afetch_litellm_api_response(
-                 model=self.model, messages=input, response_format=schema
+                 model=self.model, messages=messages, response_format=response_format
              )
              return response
          else:
-             judgeval_logger.error(f"Invalid input type received: {type(input)}")
              raise TypeError(
                  f"Input must be a string or a list of dictionaries. Input type of: {type(input)}"
              )
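As the hunks above show, a pydantic schema is now converted to a JSON-schema response_format before the LiteLLM call instead of being passed through directly. A hedged sketch of calling the new surface; the Answer model and the "gpt-4o" model string are illustrative assumptions, not taken from the package:

import pydantic

from judgeval.judges import LiteLLMJudge


class Answer(pydantic.BaseModel):  # hypothetical schema for illustration
    verdict: str
    reason: str


judge = LiteLLMJudge(model="gpt-4o")  # any entry in ACCEPTABLE_MODELS; "gpt-4o" is an assumption
# generate() converts the schema via Answer.model_json_schema() before calling litellm.completion
raw = judge.generate("Is the retrieved context faithful to the answer?", schema=Answer)
# the raw string can then be validated, e.g. Answer.model_validate_json(raw)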
judgeval/judges/together_judge.py CHANGED
@@ -3,15 +3,77 @@ Implementation of using TogetherAI inference for judges.
  """

  from pydantic import BaseModel
- from typing import List, Union
-
+ from typing import Dict, List, Union, Any, cast
  from judgeval.judges import JudgevalJudge
- from judgeval.common.utils import (
-     fetch_together_api_response,
-     afetch_together_api_response,
+ from judgeval.logger import judgeval_logger
+ from judgeval.env import (
+     JUDGMENT_DEFAULT_TOGETHER_MODEL,
+     TOGETHERAI_API_KEY,
+     TOGETHER_API_KEY,
  )
- from judgeval.common.logger import judgeval_logger
- from judgeval.constants import DEFAULT_TOGETHER_MODEL
+
+ together_api_key = TOGETHERAI_API_KEY or TOGETHER_API_KEY
+ if together_api_key:
+     try:
+         from together import Together, AsyncTogether  # type: ignore[import-untyped]
+
+         together_client = Together(api_key=together_api_key)
+         async_together_client = AsyncTogether(api_key=together_api_key)
+     except Exception:
+         pass
+
+
+ def fetch_together_api_response(
+     model: str,
+     messages: List[Dict[str, str]],
+     response_format: Union[Dict[str, Any], None] = None,
+ ) -> str:
+     if not messages:
+         raise ValueError("Messages cannot be empty")
+
+     if response_format is not None:
+         response = together_client.chat.completions.create(
+             model=model,
+             messages=messages,
+             response_format=response_format,
+         )
+     else:
+         response = together_client.chat.completions.create(
+             model=model,
+             messages=messages,
+         )
+
+     content = response.choices[0].message.content  # type: ignore[attr-defined]
+     if content is None:
+         raise ValueError("Received empty response from TogetherAI")
+     return cast(str, content)
+
+
+ async def afetch_together_api_response(
+     model: str,
+     messages: List[Dict[str, str]],
+     response_format: Union[Dict[str, Any], None] = None,
+ ) -> str:
+     if not messages:
+         raise ValueError("Messages cannot be empty")
+
+     if response_format is not None:
+         response = await async_together_client.chat.completions.create(
+             model=model,
+             messages=messages,
+             response_format=response_format,
+         )
+     else:
+         response = await async_together_client.chat.completions.create(
+             model=model,
+             messages=messages,
+         )
+
+     content = response.choices[0].message.content  # type: ignore[attr-defined]
+     if content is None:
+         raise ValueError("Received empty response from TogetherAI")
+     return cast(str, content)
+

  BASE_CONVERSATION = [
      {"role": "system", "content": "You are a helpful assistant."},
@@ -19,46 +81,52 @@ BASE_CONVERSATION = [


  class TogetherJudge(JudgevalJudge):
-     def __init__(self, model: str = DEFAULT_TOGETHER_MODEL, **kwargs):
+     def __init__(self, model: str = JUDGMENT_DEFAULT_TOGETHER_MODEL, **kwargs):
          self.model = model
          self.kwargs = kwargs
          super().__init__(model_name=model)

-     # TODO: Fix cost for generate and a_generate
      def generate(
-         self, input: Union[str, List[dict]], schema: Union[BaseModel, None] = None
+         self,
+         input: Union[str, List[Dict[str, str]]],
+         schema: Union[BaseModel, None] = None,
      ) -> str:
+         response_format = schema.model_json_schema() if schema else None
+
          if isinstance(input, str):
              convo = BASE_CONVERSATION + [{"role": "user", "content": input}]
              return fetch_together_api_response(
-                 self.model, convo, response_format=schema
+                 self.model, convo, response_format=response_format
              )
          elif isinstance(input, list):
-             convo = input
+             messages = [dict(msg) for msg in input]
              return fetch_together_api_response(
-                 self.model, convo, response_format=schema
+                 self.model, messages, response_format=response_format
              )
          else:
              judgeval_logger.error(f"Invalid input type received: {type(input)}")
              raise TypeError("Input must be a string or a list of dictionaries.")

      async def a_generate(
-         self, input: Union[str, List[dict]], schema: Union[BaseModel, None] = None
+         self,
+         input: Union[str, List[Dict[str, str]]],
+         schema: Union[BaseModel, None] = None,
      ) -> str:
+         response_format = schema.model_json_schema() if schema else None
+
          if isinstance(input, str):
              convo = BASE_CONVERSATION + [{"role": "user", "content": input}]
              res = await afetch_together_api_response(
-                 self.model, convo, response_format=schema
+                 self.model, convo, response_format=response_format
              )
              return res
          elif isinstance(input, list):
-             convo = input
+             messages = [dict(msg) for msg in input]
              res = await afetch_together_api_response(
-                 self.model, convo, response_format=schema
+                 self.model, messages, response_format=response_format
              )
              return res
          else:
-             judgeval_logger.error(f"Invalid input type received: {type(input)}")
              raise TypeError("Input must be a string or a list of dictionaries.")

      def load_model(self) -> str:
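Note that the Together client is now built at import time from values read in judgeval.env, with TOGETHERAI_API_KEY taking precedence over TOGETHER_API_KEY. A hedged sketch of driving TogetherJudge under that assumption; the key value and prompt are placeholders:

import os

# The key must be present in the environment before judgeval is imported,
# since the module-level Together client is created from judgeval.env values.
os.environ.setdefault("TOGETHERAI_API_KEY", "tgp-...")  # placeholder

from judgeval.judges import TogetherJudge

judge = TogetherJudge()  # defaults to JUDGMENT_DEFAULT_TOGETHER_MODEL
reply = judge.generate([{"role": "user", "content": "Rate this answer from 0 to 1."}])
print(reply)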
judgeval/judges/utils.py CHANGED
@@ -3,22 +3,21 @@ This module contains utility functions for judge models.
  """

  import litellm
- from typing import Optional, Union, Tuple, List
+ from typing import Optional, Union, Tuple

- from judgeval.common.exceptions import InvalidJudgeModelError
- from judgeval.judges import JudgevalJudge, LiteLLMJudge, TogetherJudge, MixtureOfJudges
- from judgeval.constants import DEFAULT_GPT_MODEL
+ from judgeval.exceptions import InvalidJudgeModelError
+ from judgeval.judges import JudgevalJudge, LiteLLMJudge, TogetherJudge
+ from judgeval.env import JUDGMENT_DEFAULT_GPT_MODEL
  from judgeval.constants import (
      TOGETHER_SUPPORTED_MODELS,
      JUDGMENT_SUPPORTED_MODELS,
-     ACCEPTABLE_MODELS,
  )

  LITELLM_SUPPORTED_MODELS = set(litellm.model_list)


  def create_judge(
-     model: Optional[Union[str, List[str], JudgevalJudge]] = None,
+     model: Optional[Union[str, JudgevalJudge]] = None,
  ) -> Tuple[JudgevalJudge, bool]:
      """
      Creates a judge model from string(s) or a judgeval judge object.
@@ -31,28 +30,15 @@ def create_judge(
      If no model is provided, uses GPT4o as the default judge.
      """
      if model is None:  # default option
-         return LiteLLMJudge(model=DEFAULT_GPT_MODEL), True
+         return LiteLLMJudge(model=JUDGMENT_DEFAULT_GPT_MODEL), True
      if not isinstance(model, (str, list, JudgevalJudge)):
          raise InvalidJudgeModelError(
              f"Model must be a string, list of strings, or a judgeval judge object. Got: {type(model)} instead."
          )
      # If model is already a valid judge type, return it and mark native
-     if isinstance(model, (JudgevalJudge, LiteLLMJudge, TogetherJudge, MixtureOfJudges)):
+     if isinstance(model, (JudgevalJudge, LiteLLMJudge, TogetherJudge)):
          return model, True

-     # Either string or List[str]
-     if isinstance(model, list):
-         for m in model:
-             if m in JUDGMENT_SUPPORTED_MODELS:
-                 raise NotImplementedError(
-                     """Judgment models are not yet supported for local scoring.
-                     Please either set the `use_judgment` flag to True or use
-                     non-Judgment models."""
-                 )
-             if m not in ACCEPTABLE_MODELS:
-                 raise InvalidJudgeModelError(f"Invalid judge model chosen: {m}")
-         return MixtureOfJudges(models=model), True
-     # If model is a string, check that it corresponds to a valid model
      if model in LITELLM_SUPPORTED_MODELS:
          return LiteLLMJudge(model=model), True
      if model in TOGETHER_SUPPORTED_MODELS:
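create_judge() therefore accepts only a single model string or judge instance in 0.9.0; list inputs that previously produced a MixtureOfJudges no longer have a code path. A minimal sketch of the remaining behavior (the "gpt-4o" string is an example, not mandated by the diff):

from judgeval.judges.utils import create_judge

judge, is_native = create_judge("gpt-4o")  # resolves to a LiteLLMJudge for litellm-listed models
default_judge, _ = create_judge()          # None falls back to JUDGMENT_DEFAULT_GPT_MODEL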
judgeval/{common/logger.py → logger.py} CHANGED
@@ -1,10 +1,9 @@
- # logger.py
-
  import logging
  import sys
- import os

- # ANSI escape sequences
+ from judgeval.env import JUDGMENT_NO_COLOR
+ from judgeval.utils.decorators import use_once
+
  RESET = "\033[0m"
  RED = "\033[31m"
  YELLOW = "\033[33m"
@@ -38,8 +37,9 @@ class ColorFormatter(logging.Formatter):
          return message


+ @use_once
  def _setup_judgeval_logger():
-     use_color = sys.stdout.isatty() and os.getenv("NO_COLOR") is None
+     use_color = sys.stdout.isatty() and JUDGMENT_NO_COLOR is None
      handler = logging.StreamHandler(sys.stdout)
      handler.setLevel(logging.DEBUG)
      handler.setFormatter(
@@ -56,5 +56,7 @@ def _setup_judgeval_logger():
      return logger


- # Global logger you can import elsewhere
  judgeval_logger = _setup_judgeval_logger()
+
+
+ __all__ = ("judgeval_logger",)
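The shared logger now lives at judgeval/logger.py, reads JUDGMENT_NO_COLOR through judgeval.env, and guards its setup with the new use_once decorator. A brief sketch of consuming it, assuming nothing beyond the names exported above:

from judgeval.logger import judgeval_logger

judgeval_logger.info("starting evaluation run")
judgeval_logger.warning("scorer returned no reason")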
judgeval/scorers/__init__.py CHANGED
@@ -1,8 +1,6 @@
  from judgeval.scorers.api_scorer import APIScorerConfig
  from judgeval.scorers.base_scorer import BaseScorer
  from judgeval.scorers.judgeval_scorers.api_scorers import (
-     ExecutionOrderScorer,
-     HallucinationScorer,
      FaithfulnessScorer,
      AnswerRelevancyScorer,
      AnswerCorrectnessScorer,
@@ -17,8 +15,6 @@ __all__ = [
      "APIScorerConfig",
      "BaseScorer",
      "PromptScorer",
-     "ExecutionOrderScorer",
-     "HallucinationScorer",
      "FaithfulnessScorer",
      "AnswerRelevancyScorer",
      "AnswerCorrectnessScorer",
judgeval/scorers/agent_scorer.py CHANGED
@@ -1,21 +1,17 @@
  from judgeval.scorers.base_scorer import BaseScorer
- from judgeval.data import Trace
+ from judgeval.data.judgment_types import Trace as JudgmentTrace
  from typing import List, Optional
  from abc import abstractmethod

- from judgeval.common.logger import warning, error

-
- class AgentScorer(BaseScorer):
+ class TraceScorer(BaseScorer):
      @abstractmethod
      async def a_score_trace(
-         self, trace: Trace, tools: Optional[List] = None, *args, **kwargs
+         self, trace: JudgmentTrace, tools: Optional[List] = None, *args, **kwargs
      ) -> float:
          """
          Asynchronously measures the score on a trace
          """
-         warning("Attempting to call unimplemented a_score_trace method")
-         error("a_score_trace method not implemented")
          raise NotImplementedError(
              "You must implement the `a_score_trace` method in your custom scorer"
          )
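AgentScorer is renamed to TraceScorer and now types its trace argument with judgeval.data.judgment_types.Trace. A hedged sketch of a custom subclass; the scorer name and the constant score it returns are purely illustrative:

from typing import List, Optional

from judgeval.data.judgment_types import Trace
from judgeval.scorers.agent_scorer import TraceScorer


class AlwaysPassTraceScorer(TraceScorer):  # hypothetical custom scorer
    score_type: str = "AlwaysPass"

    async def a_score_trace(
        self, trace: Trace, tools: Optional[List] = None, *args, **kwargs
    ) -> float:
        # Placeholder logic; a real scorer would inspect the trace's spans and tool calls.
        return 1.0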
judgeval/scorers/api_scorer.py CHANGED
@@ -4,11 +4,12 @@ Judgment Scorer class.

  Scores `Example`s using ready-made Judgment evaluators.
  """
+ from __future__ import annotations
+
  from pydantic import BaseModel, field_validator
  from typing import List
- from judgeval.data import ExampleParams
- from judgeval.constants import APIScorerType, UNBOUNDED_SCORERS
- from judgeval.common.logger import judgeval_logger
+ from judgeval.constants import UNBOUNDED_SCORERS, APIScorerType
+ from judgeval.data.example import ExampleParams


  class APIScorerConfig(BaseModel):
@@ -28,9 +29,10 @@ class APIScorerConfig(BaseModel):
      name: str = ""
      threshold: float = 0.5
      strict_mode: bool = False
-     required_params: List[
-         ExampleParams
-     ] = []  # This is used to check if the example has the required parameters before running the scorer
+
+     # This is used to check if the example has the required parameters before running the scorer
+     required_params: List[ExampleParams] = []
+
      kwargs: dict = {}

      @field_validator("threshold")
@@ -42,17 +44,11 @@ class APIScorerConfig(BaseModel):
          score_type = info.data.get("score_type")
          if score_type in UNBOUNDED_SCORERS:
              if v < 0:
-                 judgeval_logger.error(
-                     f"Threshold for {score_type} must be greater than 0, got: {v}"
-                 )
                  raise ValueError(
                      f"Threshold for {score_type} must be greater than 0, got: {v}"
                  )
          else:
              if not 0 <= v <= 1:
-                 judgeval_logger.error(
-                     f"Threshold for {score_type} must be between 0 and 1, got: {v}"
-                 )
                  raise ValueError(
                      f"Threshold for {score_type} must be between 0 and 1, got: {v}"
                  )
@@ -61,7 +57,6 @@ class APIScorerConfig(BaseModel):
      @field_validator("name", mode="after")
      @classmethod
      def set_name_to_score_type_if_none(cls, v, info):
-         """Set name to score_type if not provided"""
          if v is None:
              return info.data.get("score_type")
          return v
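The threshold validator itself is unchanged apart from dropping the duplicate log call: bounded scorers must stay within [0, 1], while UNBOUNDED_SCORERS only need a non-negative threshold. A hedged sketch of the observable behavior, using FaithfulnessScorer as a representative APIScorerConfig subclass:

from judgeval.scorers import FaithfulnessScorer

ok = FaithfulnessScorer(threshold=0.8)  # within the [0, 1] bound, accepted

try:
    FaithfulnessScorer(threshold=1.5)   # rejected by the threshold field_validator
except Exception as exc:                # pydantic surfaces the ValueError as a ValidationError
    print(exc)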
judgeval/scorers/base_scorer.py CHANGED
@@ -2,6 +2,7 @@
  Base class for all scorers.
  """

+ from __future__ import annotations
  from typing import Dict, Optional

  from pydantic import BaseModel
@@ -19,44 +20,63 @@ class BaseScorer(BaseModel):
      where none of Judgment's scorers are suitable.
      """

-     score_type: str  # type of your scorer (Faithfulness, PromptScorer)
-     threshold: float = (
-         0.5  # The threshold to pass a test while using this scorer as a scorer
-     )
-     name: Optional[str] = (
-         None  # name of your scorer (Faithfulness, PromptScorer-randomslug)
-     )
-     class_name: Optional[str] = None  # The name of the class of the scorer
-     score: Optional[float] = None  # The float score of the scorer run on the test case
+     # type of your scorer (Faithfulness, PromptScorer)
+     score_type: str
+
+     # The threshold to pass a test while using this scorer as a scorer
+     threshold: float = 0.5
+
+     # name of your scorer (Faithfulness, PromptScorer-randomslug)
+     name: Optional[str] = None
+
+     # The name of the class of the scorer
+     class_name: Optional[str] = None
+
+     # The float score of the scorer run on the test case
+     score: Optional[float] = None
+
      score_breakdown: Optional[Dict] = None
      reason: Optional[str] = ""
-     using_native_model: Optional[bool] = None  # Whether the model is a native model
-     success: Optional[bool] = None  # Whether the test case passed or failed
-     model: Optional[str] = None  # The name of the model used to evaluate the test case
-     model_client: Optional[Any] = Field(
-         default=None, exclude=True
-     )  # The model used to evaluate the test case
-     strict_mode: bool = False  # Whether to run the scorer in strict mode
-     error: Optional[str] = None  # The error message if the scorer failed
-     additional_metadata: Optional[Dict] = None  # Additional metadata for the scorer
-     user: Optional[str] = None  # The user ID of the scorer
-     server_hosted: bool = False  # Whether the scorer is enabled for e2b
+
+     # Whether the model is a native model
+     using_native_model: Optional[bool] = None
+
+     # Whether the test case passed or failed
+     success: Optional[bool] = None
+
+     # The name of the model used to evaluate the test case
+     model: Optional[str] = None
+
+     # The model used to evaluate the test case
+     model_client: Optional[Any] = Field(default=None, exclude=True)
+
+     # Whether to run the scorer in strict mode
+     strict_mode: bool = False
+
+     # The error message if the scorer failed
+     error: Optional[str] = None
+
+     # Additional metadata for the scorer
+     additional_metadata: Optional[Dict] = None
+
+     # The user ID of the scorer
+     user: Optional[str] = None
+
+     # Whether the scorer is hosted on the server
+     server_hosted: bool = False

      @model_validator(mode="after")
-     @classmethod
-     def enforce_strict_threshold(cls, data: "BaseScorer"):
-         if data.strict_mode:
-             data.threshold = 1.0
-         return data
+     def enforce_strict_threshold(self):
+         if self.strict_mode:
+             self.threshold = 1.0
+         return self

      @model_validator(mode="after")
-     @classmethod
-     def default_name(cls, m: "BaseScorer") -> "BaseScorer":
-         # Always set class_name to the string name of the class
-         m.class_name = m.__class__.__name__
-         if not m.name:
-             m.name = m.class_name
-         return m
+     def default_name(self):
+         self.class_name = self.__class__.__name__
+         if not self.name:
+             self.name = self.class_name
+         return self

      def _add_model(self, model: str):
          """
judgeval/scorers/example_scorer.py CHANGED
@@ -2,18 +2,16 @@ from judgeval.scorers.base_scorer import BaseScorer
  from judgeval.data import Example
  from typing import List
  from pydantic import Field
- from judgeval.common.logger import judgeval_logger


  class ExampleScorer(BaseScorer):
-     score_type: str = "Custom"  # default to custom score type
+     score_type: str = "Custom"
      required_params: List[str] = Field(default_factory=list)

      async def a_score_example(self, example: Example, *args, **kwargs) -> float:
          """
          Asynchronously measures the score on a single example
          """
-         judgeval_logger.error("a_score_example method not implemented")
          raise NotImplementedError(
              "You must implement the `a_score_example` method in your custom scorer"
          )
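Custom example scorers still subclass ExampleScorer and override a_score_example; only the redundant logging on the unimplemented base method was dropped. A hedged sketch of a minimal subclass; the non-empty-output heuristic is illustrative only:

from judgeval.data import Example
from judgeval.scorers.example_scorer import ExampleScorer


class NonEmptyOutputScorer(ExampleScorer):  # hypothetical custom scorer
    name: str = "NonEmptyOutput"

    async def a_score_example(self, example: Example, *args, **kwargs) -> float:
        # 1.0 when the example carries a non-empty actual_output, else 0.0.
        return 1.0 if getattr(example, "actual_output", None) else 0.0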
judgeval/scorers/judgeval_scorers/api_scorers/__init__.py CHANGED
@@ -1,9 +1,3 @@
- from judgeval.scorers.judgeval_scorers.api_scorers.execution_order import (
-     ExecutionOrderScorer,
- )
- from judgeval.scorers.judgeval_scorers.api_scorers.hallucination import (
-     HallucinationScorer,
- )
  from judgeval.scorers.judgeval_scorers.api_scorers.faithfulness import (
      FaithfulnessScorer,
  )
@@ -28,18 +22,10 @@ from judgeval.scorers.judgeval_scorers.api_scorers.tool_dependency import (
  )

  __all__ = [
-     "ExecutionOrderScorer",
-     "JSONCorrectnessScorer",
-     "SummarizationScorer",
-     "HallucinationScorer",
      "FaithfulnessScorer",
-     "ContextualRelevancyScorer",
-     "ContextualPrecisionScorer",
-     "ContextualRecallScorer",
      "AnswerRelevancyScorer",
      "AnswerCorrectnessScorer",
      "InstructionAdherenceScorer",
-     "GroundednessScorer",
      "DerailmentScorer",
      "ToolOrderScorer",
      "PromptScorer",