judgeval 0.0.44__py3-none-any.whl → 0.0.46__py3-none-any.whl

This diff compares the contents of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between versions exactly as they appear in their public registries.
Files changed (64)
  1. judgeval/__init__.py +5 -4
  2. judgeval/clients.py +6 -6
  3. judgeval/common/__init__.py +7 -2
  4. judgeval/common/exceptions.py +2 -3
  5. judgeval/common/logger.py +74 -49
  6. judgeval/common/s3_storage.py +30 -23
  7. judgeval/common/tracer.py +1273 -939
  8. judgeval/common/utils.py +416 -244
  9. judgeval/constants.py +73 -61
  10. judgeval/data/__init__.py +1 -1
  11. judgeval/data/custom_example.py +3 -2
  12. judgeval/data/datasets/dataset.py +80 -54
  13. judgeval/data/datasets/eval_dataset_client.py +131 -181
  14. judgeval/data/example.py +67 -43
  15. judgeval/data/result.py +11 -9
  16. judgeval/data/scorer_data.py +4 -2
  17. judgeval/data/tool.py +25 -16
  18. judgeval/data/trace.py +57 -29
  19. judgeval/data/trace_run.py +5 -11
  20. judgeval/evaluation_run.py +22 -82
  21. judgeval/integrations/langgraph.py +546 -184
  22. judgeval/judges/base_judge.py +1 -2
  23. judgeval/judges/litellm_judge.py +33 -11
  24. judgeval/judges/mixture_of_judges.py +128 -78
  25. judgeval/judges/together_judge.py +22 -9
  26. judgeval/judges/utils.py +14 -5
  27. judgeval/judgment_client.py +259 -271
  28. judgeval/rules.py +169 -142
  29. judgeval/run_evaluation.py +462 -305
  30. judgeval/scorers/api_scorer.py +20 -11
  31. judgeval/scorers/exceptions.py +1 -0
  32. judgeval/scorers/judgeval_scorer.py +77 -58
  33. judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +46 -15
  34. judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +3 -2
  35. judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +3 -2
  36. judgeval/scorers/judgeval_scorers/api_scorers/classifier_scorer.py +12 -11
  37. judgeval/scorers/judgeval_scorers/api_scorers/comparison.py +7 -5
  38. judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py +3 -2
  39. judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py +3 -2
  40. judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py +5 -2
  41. judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py +2 -1
  42. judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +17 -8
  43. judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +3 -2
  44. judgeval/scorers/judgeval_scorers/api_scorers/groundedness.py +3 -2
  45. judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +3 -2
  46. judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +3 -2
  47. judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py +8 -9
  48. judgeval/scorers/judgeval_scorers/api_scorers/summarization.py +4 -4
  49. judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +5 -5
  50. judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +5 -2
  51. judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py +9 -10
  52. judgeval/scorers/prompt_scorer.py +48 -37
  53. judgeval/scorers/score.py +86 -53
  54. judgeval/scorers/utils.py +11 -7
  55. judgeval/tracer/__init__.py +1 -1
  56. judgeval/utils/alerts.py +23 -12
  57. judgeval/utils/{data_utils.py → file_utils.py} +5 -9
  58. judgeval/utils/requests.py +29 -0
  59. judgeval/version_check.py +5 -2
  60. {judgeval-0.0.44.dist-info → judgeval-0.0.46.dist-info}/METADATA +79 -135
  61. judgeval-0.0.46.dist-info/RECORD +69 -0
  62. judgeval-0.0.44.dist-info/RECORD +0 -68
  63. {judgeval-0.0.44.dist-info → judgeval-0.0.46.dist-info}/WHEEL +0 -0
  64. {judgeval-0.0.44.dist-info → judgeval-0.0.46.dist-info}/licenses/LICENSE.md +0 -0

judgeval/scorers/judgeval_scorers/api_scorers/groundedness.py
@@ -10,16 +10,17 @@ from judgeval.scorers.api_scorer import APIJudgmentScorer
 from judgeval.constants import APIScorer
 from judgeval.data import ExampleParams
 
+
 class GroundednessScorer(APIJudgmentScorer):
     def __init__(self, threshold: float):
         super().__init__(
-            threshold=threshold,
+            threshold=threshold,
             score_type=APIScorer.GROUNDEDNESS,
             required_params=[
                 ExampleParams.INPUT,
                 ExampleParams.ACTUAL_OUTPUT,
                 ExampleParams.RETRIEVAL_CONTEXT,
-            ]
+            ],
         )
 
     @property

judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py
@@ -10,16 +10,17 @@ from judgeval.scorers.api_scorer import APIJudgmentScorer
 from judgeval.constants import APIScorer
 from judgeval.data import ExampleParams
 
+
 class HallucinationScorer(APIJudgmentScorer):
     def __init__(self, threshold: float):
         super().__init__(
-            threshold=threshold,
+            threshold=threshold,
             score_type=APIScorer.HALLUCINATION,
             required_params=[
                 ExampleParams.INPUT,
                 ExampleParams.ACTUAL_OUTPUT,
                 ExampleParams.CONTEXT,
-            ]
+            ],
         )
 
     @property

judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py
@@ -10,15 +10,16 @@ from judgeval.scorers.api_scorer import APIJudgmentScorer
 from judgeval.constants import APIScorer
 from judgeval.data import ExampleParams
 
+
 class InstructionAdherenceScorer(APIJudgmentScorer):
     def __init__(self, threshold: float):
         super().__init__(
-            threshold=threshold,
+            threshold=threshold,
             score_type=APIScorer.INSTRUCTION_ADHERENCE,
             required_params=[
                 ExampleParams.INPUT,
                 ExampleParams.ACTUAL_OUTPUT,
-            ]
+            ],
         )
 
     @property

judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py
@@ -5,33 +5,32 @@ TODO add link to docs page for this scorer
 
 """
 
-
 # External imports
 from pydantic import BaseModel, Field
+
 # Internal imports
 from judgeval.scorers.api_scorer import APIJudgmentScorer
 from judgeval.constants import APIScorer
 from judgeval.data import ExampleParams
 
+
 class JSONCorrectnessScorer(APIJudgmentScorer):
     json_schema: BaseModel = Field(None, exclude=True)
-
+
     def __init__(self, threshold: float, json_schema: BaseModel):
         super().__init__(
-            threshold=threshold,
+            threshold=threshold,
             score_type=APIScorer.JSON_CORRECTNESS,
             required_params=[
                 ExampleParams.INPUT,
                 ExampleParams.ACTUAL_OUTPUT,
-            ]
+            ],
         )
-        object.__setattr__(self, 'json_schema', json_schema)
-
+        object.__setattr__(self, "json_schema", json_schema)
+
     def to_dict(self):
         base_dict = super().to_dict()  # Get the parent class's dictionary
-        base_dict["kwargs"] = {
-            "json_schema": self.json_schema.model_json_schema()
-        }
+        base_dict["kwargs"] = {"json_schema": self.json_schema.model_json_schema()}
         return base_dict
 
     @property
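
Note: a minimal usage sketch of the reformatted JSONCorrectnessScorer (not part of the diff). The judgeval.scorers import path and passing a Pydantic model class as json_schema are assumptions; to_dict() only needs an object exposing model_json_schema().

# Hypothetical usage sketch; import path and json_schema type are assumptions.
from pydantic import BaseModel
from judgeval.scorers import JSONCorrectnessScorer


class AnswerSchema(BaseModel):  # hypothetical schema for the expected LLM output
    title: str
    score: float


scorer = JSONCorrectnessScorer(threshold=1.0, json_schema=AnswerSchema)
payload = scorer.to_dict()
# payload["kwargs"]["json_schema"] now holds AnswerSchema.model_json_schema()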

judgeval/scorers/judgeval_scorers/api_scorers/summarization.py
@@ -7,21 +7,21 @@ TODO add link to docs page for this scorer
 
 # Internal imports
 from judgeval.scorers.api_scorer import APIJudgmentScorer
-from judgeval.constants import APIScorer
+from judgeval.constants import APIScorer
 from judgeval.data import ExampleParams
 
+
 class SummarizationScorer(APIJudgmentScorer):
     def __init__(self, threshold: float):
         super().__init__(
-            threshold=threshold,
+            threshold=threshold,
             score_type=APIScorer.SUMMARIZATION,
             required_params=[
                 ExampleParams.INPUT,
                 ExampleParams.ACTUAL_OUTPUT,
-            ]
+            ],
         )
 
     @property
     def __name__(self):
         return "Summarization"
-

judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py
@@ -6,13 +6,13 @@
 from judgeval.scorers.api_scorer import APIJudgmentScorer
 from judgeval.constants import APIScorer
 from typing import Optional, Dict
+
+
 class ToolDependencyScorer(APIJudgmentScorer):
     kwargs: Optional[Dict] = None
-    def __init__(self, threshold: float=1.0, enable_param_checking: bool = True):
-        super().__init__(
-            threshold=threshold,
-            score_type=APIScorer.TOOL_DEPENDENCY
-        )
+
+    def __init__(self, threshold: float = 1.0, enable_param_checking: bool = True):
+        super().__init__(threshold=threshold, score_type=APIScorer.TOOL_DEPENDENCY)
         self.kwargs = {"enable_param_checking": enable_param_checking}
 
     @property

judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py
@@ -6,11 +6,14 @@
 from judgeval.scorers.api_scorer import APIJudgmentScorer
 from judgeval.constants import APIScorer
 from typing import Optional, Dict
+
+
 class ToolOrderScorer(APIJudgmentScorer):
     kwargs: Optional[Dict] = None
-    def __init__(self, threshold: float=1.0, exact_match: bool=False):
+
+    def __init__(self, threshold: float = 1.0, exact_match: bool = False):
         super().__init__(
-            threshold=threshold,
+            threshold=threshold,
             score_type=APIScorer.TOOL_ORDER,
         )
         self.kwargs = {"exact_match": exact_match}
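
Note: a usage sketch (not from the diff) of the reworked tool-scorer constructors. Defaults are unchanged and extra options still land on .kwargs; the judgeval.scorers import path is an assumption.

# Sketch only: exercising the reformatted constructors shown above.
from judgeval.scorers import ToolDependencyScorer, ToolOrderScorer

dep_scorer = ToolDependencyScorer()               # threshold defaults to 1.0
order_scorer = ToolOrderScorer(exact_match=True)  # threshold defaults to 1.0

# Extra options are stored on .kwargs for the API payload:
assert dep_scorer.kwargs == {"enable_param_checking": True}
assert order_scorer.kwargs == {"exact_match": True}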

judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py
@@ -4,15 +4,17 @@ ClassifierScorer implementation for basic Text-to-SQL evaluation.
 Takes a natural language query, a corresponding LLM-generated SQL query, and a table schema + (optional) metadata.
 Determines if the LLM-generated SQL query is valid and works for the natural language query.
 """
+
 from judgeval.scorers import ClassifierScorer
 
 Text2SQLScorer = ClassifierScorer(
     name="Text to SQL",
     slug="text2sql-1010101010",
     threshold=1.0,
-    conversation=[{
-        "role": "system",
-        "content": """You will be given a natural language query, a corresponding LLM-generated SQL query, and a table schema + (optional) metadata.
+    conversation=[
+        {
+            "role": "system",
+            "content": """You will be given a natural language query, a corresponding LLM-generated SQL query, and a table schema + (optional) metadata.
 
 ** TASK INSTRUCTIONS **
 Your task is to decide whether the LLM generated SQL query properly filters for what the natural language query is asking, based on the table schema + (optional) metadata.
@@ -44,11 +46,8 @@ LLM generated SQL query:
 
 Table schema:
 {{context}}
-    """
-    }],
-    options={
-        "Y": 1.0,
-        "N": 0.0
-    }
+    """,
+        }
+    ],
+    options={"Y": 1.0, "N": 0.0},
 )
-
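
Note: the restructured ClassifierScorer call above is behaviorally unchanged: a single system message with template slots, plus an options map that converts the judge's "Y"/"N" verdict into a 1.0/0.0 score compared against threshold=1.0. A sketch of the Example such an evaluation would consume (field names follow judgeval.data.Example; how the run is submitted, e.g. via JudgmentClient, is omitted here):

# Illustrative data only; the query, SQL, and schema below are made up.
from judgeval.data import Example

example = Example(
    input="How many orders were placed in 2023?",
    actual_output="SELECT COUNT(*) FROM orders WHERE strftime('%Y', order_date) = '2023';",
    context=["orders(order_id INTEGER, order_date DATE, customer_id INTEGER)"],
)
# Text2SQLScorer's options={"Y": 1.0, "N": 0.0} maps the judge's verdict onto the score.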

judgeval/scorers/prompt_scorer.py
@@ -9,7 +9,7 @@ To implement a subclass of PromptScorer, you need to implement the following met
 - success_check(): determines whether the evaluation was successful
 
 The core idea of PromptScorer is to provide a flexible way to create custom scoring metrics
-by leveraging LLM judges to evaluate examples. The scorer constructs a prompt, sends it to
+by leveraging LLM judges to evaluate examples. The scorer constructs a prompt, sends it to
 the judge, and parses the structured response to determine a score.
 
 For example, the SentimentScorer subclass uses PromptScorer to detect negative sentiment in responses
@@ -26,17 +26,17 @@ NOTE: When implementing build_measure_prompt and build_schema:
 """
 
 from abc import abstractmethod
-from typing import List, Optional, Tuple, Any, Mapping
-from pydantic import BaseModel, model_serializer, Field
+from typing import List, Optional, Tuple, Any
+from pydantic import BaseModel, Field
 
 from judgeval.data import Example
 from judgeval.data.example import ExampleParams
 from judgeval.scorers import JudgevalScorer
 from judgeval.scorers.utils import (
-    scorer_progress_meter,
+    scorer_progress_meter,
     parse_response_json,
     get_or_create_event_loop,
-    create_verbose_logs
+    create_verbose_logs,
 )
 from judgeval.judges import JudgevalJudge
 
@@ -56,10 +56,10 @@ class PromptScorer(JudgevalScorer, BaseModel):
     # DO NOT SET THESE FIELDS MANUALLY, THEY ARE SET BY THE SCORE_EXAMPLE METHOD
     _response: Optional[dict] = None
     _result: Optional[float] = None
-
+
     def __init__(
         self,
-        name: str,
+        name: str,
         threshold: float = 0.5,
         include_reason: bool = True,
         async_mode: bool = True,
@@ -91,10 +91,8 @@ class PromptScorer(JudgevalScorer, BaseModel):
         )
 
     def score_example(
-        self,
-        example: Example,
-        _show_indicator: bool = True
-    ) -> float:
+        self, example: Example, _show_indicator: bool = True
+    ) -> float | None:
         """
         Synchronous method for scoring an example using the prompt criteria.
         """
@@ -104,6 +102,7 @@ class PromptScorer(JudgevalScorer, BaseModel):
                 loop.run_until_complete(
                     self.a_score_example(example, _show_indicator=False)
                 )
+                return self._result
             else:
                 result, reason = self.evaluate(example)
                 self.reason = reason
@@ -117,10 +116,10 @@ class PromptScorer(JudgevalScorer, BaseModel):
             return result
 
     async def a_score_example(
-        self,
-        example: Example,
-        _show_indicator: bool = True,
-    ) -> float:
+        self,
+        example: Example,
+        _show_indicator: bool = True,
+    ) -> float:
         """
         Async method for scoring an example using the prompt criteria.
         """
@@ -135,30 +134,32 @@ class PromptScorer(JudgevalScorer, BaseModel):
                 ],
             )
             return result
-
+
     def evaluate(self, example: Example) -> Tuple[Any, str]:
         """
         Synchronous helper method for evaluating an example using the prompt criteria.
 
-        Builds a custom prompt using `build_measure_prompt` and sends it to the judge model
+        Builds a custom prompt using `build_measure_prompt` and sends it to the judge model
         for evaluation. The result is then parsed as JSON and returned.
 
        NOTE: It is assumed that the model response will be JSON and contain a "score" and "reason" field.
        """
         prompt = self._build_measure_prompt(example)
-        if self.using_native_model:
+        if self.using_native_model and self.model:
             res = self.model.generate(prompt)
             response = parse_response_json(res, self)
             result, reason = self._process_response(response)
             return result, reason
         else:
-            raise NotImplementedError("Non-native judge models are not supported in synchronous mode yet.")
+            raise NotImplementedError(
+                "Non-native judge models are not supported in synchronous mode yet."
+            )
 
     async def a_evaluate(self, example: Example) -> Tuple[Any, str]:
         """
         Asynchronous helper method for evaluating an example using the prompt criteria.
 
-        Builds a custom prompt using `build_measure_prompt` and sends it to the judge model
+        Builds a custom prompt using `build_measure_prompt` and sends it to the judge model
         for evaluation. The result is then parsed as JSON and returned.
 
        NOTE: It is assumed that the model response will be JSON and contain a "score" and "reason" field.
@@ -166,7 +167,7 @@ class PromptScorer(JudgevalScorer, BaseModel):
         judge_prompt = self._build_measure_prompt(example)
         schema = self._build_schema()
         prompt = self._enforce_prompt_format(judge_prompt=judge_prompt, schema=schema)
-        if self.using_native_model:
+        if self.using_native_model and self.model:
             res = await self.model.a_generate(prompt)
             response = parse_response_json(res, self)
             self._response = response
@@ -177,7 +178,9 @@ class PromptScorer(JudgevalScorer, BaseModel):
             self._response = response
             return result, reason
         else:
-            raise NotImplementedError("Non-native judge models are not supported in async mode yet.")
+            raise NotImplementedError(
+                "Non-native judge models are not supported in async mode yet."
+            )
 
     # TODO: can we make this take *args and **kwargs? How does that work with a_evaluate() since we'd have to pass the same args
     @abstractmethod
@@ -190,7 +193,7 @@ class PromptScorer(JudgevalScorer, BaseModel):
 
         The prompt is typically a set of instructions that the judge model uses to evaluate the example.
 
-        This function returns a conversation prompt of the form
+        This function returns a conversation prompt of the form
         [{"role": "system", "content": "..."}, {"role": "user", "content": "..."}]
 
         A basic version of implementing this function could be as follows:
@@ -201,7 +204,7 @@ class PromptScorer(JudgevalScorer, BaseModel):
            ]
         """
         pass
-
+
     # TODO: does this need to take *args and **kwargs? How does that work with a_evaluate() since we'd have to pass the same args
     @abstractmethod
     def _build_schema(self) -> dict:
@@ -214,23 +217,23 @@ class PromptScorer(JudgevalScorer, BaseModel):
            return {"score": int, "reason": str}
         """
         pass
-
+
     def _enforce_prompt_format(self, judge_prompt: List[dict], schema: dict):
         """
         Formats the final prompt to the judge model.
 
-        This function takes a list of dictionaries (`judge_prompt`) and a schema dictionary (`schema`),
-        and appends a schema enforcement prompt to the content of the first dictionary in the list, which is assumed to be the system prompt.
+        This function takes a list of dictionaries (`judge_prompt`) and a schema dictionary (`schema`),
+        and appends a schema enforcement prompt to the content of the first dictionary in the list, which is assumed to be the system prompt.
         The schema enforcement prompt instructs the judge model to provide its response in a specific JSON format.
 
         Args:
-            judge_prompt (List[dict]): A list of dictionaries representing the judge prompt.
+            judge_prompt (List[dict]): A list of dictionaries representing the judge prompt.
                 Each dictionary should contain a "content" key.
-            schema (dict): A dictionary representing the schema. The keys are the expected keys in the response,
+            schema (dict): A dictionary representing the schema. The keys are the expected keys in the response,
                 and the values are the types of the corresponding values.
 
         Returns:
-            List[dict]: The modified judge prompt with the schema enforcement prompt appended to the content
+            List[dict]: The modified judge prompt with the schema enforcement prompt appended to the content
                 of the first dictionary.
 
         Raises:
@@ -242,19 +245,27 @@ class PromptScorer(JudgevalScorer, BaseModel):
            formatted_prompt = format_measure_prompt(judge_prompt, schema)
            # formatted_prompt[0]["content"] will include the schema enforcement prompt
        """
-        SCHEMA_ENFORCEMENT_PROMPT = "\n\nPlease provide your response in the following JSON format: {"
-        if isinstance(judge_prompt, list) and all(isinstance(item, dict) for item in judge_prompt):
+        SCHEMA_ENFORCEMENT_PROMPT = (
+            "\n\nPlease provide your response in the following JSON format: {"
+        )
+        if isinstance(judge_prompt, list) and all(
+            isinstance(item, dict) for item in judge_prompt
+        ):
             # create formatting string for schema enforcement
-            # schema is a map between key and type of the value
+            # schema is a map between key and type of the value
             for key, key_type in schema.items():
                 SCHEMA_ENFORCEMENT_PROMPT += f'"{key}": <{key}> ({key_type.__name__}), '
-            SCHEMA_ENFORCEMENT_PROMPT = SCHEMA_ENFORCEMENT_PROMPT[:-2] + "}"  # remove trailing comma and space
+            SCHEMA_ENFORCEMENT_PROMPT = (
+                SCHEMA_ENFORCEMENT_PROMPT[:-2] + "}"
+            )  # remove trailing comma and space
             judge_prompt[0]["content"] += SCHEMA_ENFORCEMENT_PROMPT
             return judge_prompt
         else:
-            raise TypeError(f"Prompt must be a list of dictionaries. Got {type(judge_prompt)} instead.")
+            raise TypeError(
+                f"Prompt must be a list of dictionaries. Got {type(judge_prompt)} instead."
+            )
 
-    @abstractmethod
+    @abstractmethod
     def _process_response(self, response: dict):
         """
         Customizable method for processing the response from the judge model.
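
Note: for clarity, here is what the rewrapped enforcement logic above produces for the {"score": int, "reason": str} schema used in the docstring (a standalone reproduction of the loop, not new behavior):

# Standalone reproduction of the schema-enforcement suffix built in _enforce_prompt_format.
schema = {"score": int, "reason": str}

suffix = "\n\nPlease provide your response in the following JSON format: {"
for key, key_type in schema.items():
    suffix += f'"{key}": <{key}> ({key_type.__name__}), '
suffix = suffix[:-2] + "}"  # remove trailing comma and space

# suffix ends with:
# {"score": <score> (int), "reason": <reason> (str)}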
@@ -276,7 +287,7 @@ class PromptScorer(JudgevalScorer, BaseModel):
         Determines whether or not the PromptScorer should consider the evaluation of a single example successful.
         """
         pass
-
+
     @property
     def __name__(self):
         return self.name
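
Note: the module docstring above describes subclassing PromptScorer by supplying a prompt builder, a schema, a response processor, and a success check. Below is a minimal sketch along the lines of the SentimentScorer it mentions. The hook names _build_measure_prompt, _build_schema, and _process_response are taken from the hunks above; the success-check method name (_success_check) is an assumption, since this diff only shows its docstring.

# Minimal PromptScorer subclass sketch; _success_check is an assumed method name.
from judgeval.data import Example
from judgeval.scorers.prompt_scorer import PromptScorer


class SentimentScorer(PromptScorer):
    def _build_measure_prompt(self, example: Example) -> list:
        return [
            {"role": "system", "content": "Rate how negative the response is on a 1-5 scale."},
            {"role": "user", "content": f"Response: {example.actual_output}"},
        ]

    def _build_schema(self) -> dict:
        return {"score": int, "reason": str}

    def _process_response(self, response: dict):
        return response["score"], response["reason"]

    def _success_check(self, **kwargs) -> bool:
        return self._result is not None and self._result <= self.threshold


scorer = SentimentScorer(name="Sentiment", threshold=3)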