judgeval 0.0.39__py3-none-any.whl → 0.0.40__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,124 @@
+ from judgeval.scorers.api_scorer import APIJudgmentScorer
+ from judgeval.constants import APIScorer
+ from typing import List, Mapping, Optional, Dict
+ from pydantic import model_serializer
+ 
+ class ClassifierScorer(APIJudgmentScorer):
+     """
+     In the Judgment backend, this scorer is implemented as a PromptScorer that takes
+     1. a system role that may involve the Example object
+     2. options for scores on the example
+ 
+     and uses a judge to execute the evaluation from the system role and classify into one of the options
+ 
+     ex:
+     system_role = "You are a judge that evaluates whether the response is positive or negative. The response is: {example.actual_output}"
+     options = {"positive": 1, "negative": 0}
+ 
+     Args:
+         name (str): The name of the scorer
+         slug (str): A unique identifier for the scorer
+         conversation (List[dict]): The conversation template with placeholders (e.g., {{actual_output}})
+         options (Mapping[str, float]): A mapping of classification options to their corresponding scores
+         threshold (float): The threshold for determining success (default: 0.5)
+         include_reason (bool): Whether to include reasoning in the response (default: True)
+         strict_mode (bool): Whether to use strict mode (default: False)
+         verbose_mode (bool): Whether to include verbose logging (default: False)
+     """
+     name: Optional[str] = None
+     slug: Optional[str] = None
+     conversation: Optional[List[dict]] = None
+     options: Optional[Mapping[str, float]] = None
+     verbose_mode: bool = False
+     strict_mode: bool = False
+     include_reason: bool = True
+     async_mode: bool = True
+     threshold: float = 0.5
+ 
+     def __init__(
+         self,
+         name: str,
+         slug: str,
+         conversation: List[dict],
+         options: Mapping[str, float],
+         threshold: float = 0.5,
+         include_reason: bool = True,
+         strict_mode: bool = False,
+         verbose_mode: bool = False,
+         async_mode: bool = True,
+     ):
+         super().__init__(
+             threshold=threshold,
+             score_type=APIScorer.CLASSIFIER,
+         )
+         self.name = name
+         self.verbose_mode = verbose_mode
+         self.strict_mode = strict_mode
+         self.include_reason = include_reason
+         self.slug = slug
+         self.conversation = conversation
+         self.options = options
+         self.async_mode = async_mode
+ 
+     def update_name(self, name: str):
+         """
+         Updates the name of the scorer.
+         """
+         self.name = name
+ 
+     def update_threshold(self, threshold: float):
+         """
+         Updates the threshold of the scorer.
+         """
+         self.threshold = threshold
+ 
+     def update_conversation(self, conversation: List[dict]):
+         """
+         Updates the conversation with the new conversation.
+ 
+         Sample conversation:
+         [{'role': 'system', 'content': "Did the chatbot answer the user's question in a kind way?: {{actual_output}}."}]
+         """
+         self.conversation = conversation
+ 
+     def update_options(self, options: Mapping[str, float]):
+         """
+         Updates the options with the new options.
+ 
+         Sample options:
+         {"yes": 1, "no": 0}
+         """
+         self.options = options
+ 
+     def __str__(self):
+         return f"ClassifierScorer(name={self.name}, slug={self.slug}, conversation={self.conversation}, threshold={self.threshold}, options={self.options})"
+ 
+     # @model_serializer
+     # def serialize_model(self) -> dict:
+     #     """
+     #     Defines how the ClassifierScorer should be serialized when model_dump() is called.
+     #     """
+     #     return {
+     #         "name": self.name,
+     #         "score_type": self.name,
+     #         "conversation": self.conversation,
+     #         "options": self.options,
+     #         "threshold": self.threshold,
+     #         "include_reason": self.include_reason,
+     #         "async_mode": self.async_mode,
+     #         "strict_mode": self.strict_mode,
+     #         "verbose_mode": self.verbose_mode,
+     #     }
+ 
+     def to_dict(self) -> dict:
+         return {
+             "name": self.name,
+             "score_type": self.name,
+             "conversation": self.conversation,
+             "options": self.options,
+             "threshold": self.threshold,
+             "include_reason": self.include_reason,
+             "async_mode": self.async_mode,
+             "strict_mode": self.strict_mode,
+             "verbose_mode": self.verbose_mode,
+         }
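For orientation, a minimal usage sketch of the ClassifierScorer defined above (not part of the diff). It relies only on the constructor shown in this file and on the `from judgeval.scorers import ClassifierScorer` import that appears in the Text2SQL hunk below; the name, slug, conversation, and options values are illustrative placeholders.

from judgeval.scorers import ClassifierScorer

# Placeholder values for illustration; only the parameter names come from the diff above.
tone_scorer = ClassifierScorer(
    name="Tone Check",
    slug="tone-check-0000000000",
    conversation=[{
        "role": "system",
        "content": "Is the chatbot's response positive or negative?: {{actual_output}}",
    }],
    options={"positive": 1.0, "negative": 0.0},
    threshold=0.5,          # default shown in the class definition above
    include_reason=True,
)
print(tone_scorer)  # ClassifierScorer(name=Tone Check, slug=tone-check-0000000000, ...)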
@@ -0,0 +1,20 @@
+ """
+ `judgeval` tool dependency scorer
+ """
+ 
+ # Internal imports
+ from judgeval.scorers.api_scorer import APIJudgmentScorer
+ from judgeval.constants import APIScorer
+ from typing import Optional, Dict
+ class ToolDependencyScorer(APIJudgmentScorer):
+     kwargs: Optional[Dict] = None
+     def __init__(self, threshold: float = 1.0, enable_param_checking: bool = True):
+         super().__init__(
+             threshold=threshold,
+             score_type=APIScorer.TOOL_DEPENDENCY
+         )
+         self.kwargs = {"enable_param_checking": enable_param_checking}
+ 
+     @property
+     def __name__(self):
+         return "Tool Dependency"
@@ -7,7 +7,7 @@ Determines if the LLM-generated SQL query is valid and works for the natural lan
  from judgeval.scorers import ClassifierScorer
  
  Text2SQLScorer = ClassifierScorer(
-     "Text to SQL",
+     name="Text to SQL",
      slug="text2sql-1010101010",
      threshold=1.0,
      conversation=[{
@@ -37,6 +37,7 @@ from judgeval.scorers.utils import (
      get_or_create_event_loop,
      create_verbose_logs
  )
+ from judgeval.judges import JudgevalJudge
  
  
  class ReasonScore(BaseModel):
@@ -49,7 +50,8 @@ class PromptScorer(JudgevalScorer, BaseModel):
      score_type: str
      threshold: float = Field(default=0.5)
      using_native_model: bool = Field(default=True)
- 
+     model: Optional[JudgevalJudge] = Field(default=None)
+     skipped: bool = Field(default=False)
      # DO NOT SET THESE FIELDS MANUALLY, THEY ARE SET BY THE SCORE_EXAMPLE METHOD
      _response: Optional[dict] = None
      _result: Optional[float] = None
@@ -276,166 +278,5 @@ class PromptScorer(JudgevalScorer, BaseModel):
      def __name__(self):
          return self.name
  
- 
- class ClassifierScorer(PromptScorer):
- 
-     """
-     This is a PromptScorer that takes
-     1. a system role that may involve the Example object
-     2. options for scores on the example
- 
-     and uses a judge to execute the evaluation from the system role and classify into one of the options
- 
-     ex:
-     system_role = "You are a judge that evaluates whether the response is positive or negative. The response is: {example.actual_output}"
-     options = {"positive": 1, "negative": 0}
-     """
- 
-     conversation: List[dict]
-     options: Mapping[str, float]
- 
-     def __init__(self, name: str, slug: str, conversation: List[dict], options: Mapping[str, float],
-                  threshold: float = 0.5, include_reason: bool = True,
-                  async_mode: bool = True, strict_mode: bool = False, verbose_mode: bool = False):
-         # Initialize BaseModel first with all fields
-         BaseModel.__init__(
-             self,
-             name=name,
-             slug=slug,
-             score_type=name,
-             conversation=conversation,
-             options=options,
-             threshold=threshold,
-             include_reason=include_reason,
-             async_mode=async_mode,
-             strict_mode=strict_mode,
-             verbose_mode=verbose_mode,
-         )
-         # Then initialize JudgevalScorer
-         JudgevalScorer.__init__(
-             self,
-             score_type=name,
-             threshold=threshold,
-             include_reason=include_reason,
-             async_mode=async_mode,
-             strict_mode=strict_mode,
-             verbose_mode=verbose_mode,
-         )
- 
-     def _build_measure_prompt(self, example: Example) -> List[dict]:
-         """
-         Builds the measure prompt for the classifier scorer.
- 
-         Args:
-             example (Example): The example to build the prompt for
- 
-         Returns:
-             List[dict]: The measure prompt for the classifier scorer
-         """
-         replacement_words = {
-             "{{actual_output}}": example.actual_output,
-             "{{expected_output}}": example.expected_output,
-             "{{context}}": example.context,
-             "{{retrieval_context}}": example.retrieval_context,
-             "{{tools_called}}": example.tools_called,
-             "{{expected_tools}}": example.expected_tools,
-         }
-         # Make a copy of the conversation to avoid modifying the original
-         conversation_copy = [dict(message) for message in self.conversation]
- 
-         # Only replace if double brackets are found in the content
-         for message in conversation_copy:
-             content = message["content"]
-             if "{{" in content:
-                 for key, value in replacement_words.items():
-                     if key in content:
-                         message["content"] = content.replace(key, str(value))
-         return conversation_copy
- 
-     def _build_schema(self) -> dict:
-         return self.options
- 
-     def _enforce_prompt_format(self, judge_prompt: List[dict], schema: dict) -> List[dict]:
-         """
-         Enforces the judge model to choose an option from the schema.
- 
-         We want the model to choose an option from the schema and a reason for the choice.
-         """
-         options = list(schema.keys())
-         options_str = ", ".join(options)
- 
-         system_role = judge_prompt[0]["content"]
-         system_role += (
-             f"\n\nYou must choose one of the following options: {options_str}. "
-             "Format your response as a JSON object with two fields:\n"
-             "1. 'choice': Your selected option (must be one of the provided choices)\n"
-             "2. 'reason': A brief explanation for why you made this choice\n\n"
-             "Example response format:\n"
-             "{\n"
-             ' "choice": "<one of the valid options>",\n'
-             ' "reason": "<your explanation>"\n'
-             "}"
-         )
- 
-         judge_prompt[0]["content"] = system_role
-         return judge_prompt
- 
-     def _process_response(self, response: dict) -> Tuple[float, str]:
-         choice = response.get("choice")
-         if choice not in self.options:
-             raise ValueError(f"Invalid choice: {choice}. Expected one of: {self.options.keys()}")
-         reason = response.get("reason", "No reason could be found in model response.")
-         return self.options[choice], reason
- 
-     def _success_check(self, **kwargs) -> bool:
-         return self.score >= self.threshold
- 
-     def update_name(self, name: str):
-         """
-         Updates the name of the scorer.
-         """
-         self.name = name
- 
-     def update_threshold(self, threshold: float):
-         """
-         Updates the threshold of the scorer.
-         """
-         self.threshold = threshold
- 
-     def update_conversation(self, conversation: List[dict]):
-         """
-         Updates the conversation with the new conversation.
- 
-         Sample conversation:
-         [{'role': 'system', 'content': "Did the chatbot answer the user's question in a kind way?: {{actual_output}}."}]
-         """
-         self.conversation = conversation
- 
-     def update_options(self, options: Mapping[str, float]):
-         """
-         Updates the options with the new options.
- 
-         Sample options:
-         {"yes": 1, "no": 0}
-         """
-         self.options = options
- 
-     def __str__(self):
-         return f"ClassifierScorer(name={self.name}, slug={self.slug}, conversation={self.conversation}, threshold={self.threshold}, options={self.options})"
- 
-     @model_serializer
-     def serialize_model(self) -> dict:
-         """
-         Defines how the ClassifierScorer should be serialized when model_dump() is called.
-         """
-         return {
-             "name": self.name,
-             "score_type": self.score_type,
-             "conversation": self.conversation,
-             "options": self.options,
-             "threshold": self.threshold,
-             "include_reason": self.include_reason,
-             "async_mode": self.async_mode,
-             "strict_mode": self.strict_mode,
-             "verbose_mode": self.verbose_mode,
-         }
+     class Config:
+         arbitrary_types_allowed = True
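The `model: Optional[JudgevalJudge]` field added earlier holds an arbitrary (non-Pydantic) judge object, which is what the new `Config.arbitrary_types_allowed = True` enables. A minimal standalone sketch of that Pydantic pattern, using a stand-in `Judge` class rather than judgeval code:

from typing import Optional
from pydantic import BaseModel

class Judge:  # stand-in for an arbitrary, non-Pydantic type such as a judge client
    pass

class Scorer(BaseModel):
    threshold: float = 0.5
    model: Optional[Judge] = None  # rejected at class-definition time without the Config below

    class Config:
        arbitrary_types_allowed = True

print(Scorer(model=Judge()).threshold)  # 0.5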
judgeval/scorers/score.py CHANGED
@@ -48,7 +48,7 @@ async def safe_a_score_example(
  info(f"Successfully scored example {example.example_id}")
  except MissingTestCaseParamsError as e:
  if skip_on_missing_params: # Skip the example if the scorer requires parameters that are missing
- with example_logging_context(example.timestamp, example.example_id):
+ with example_logging_context(example.created_at, example.example_id):
  warning(f"Skipping example {example.example_id} due to missing parameters")
  scorer.skipped = True
  return
@@ -56,10 +56,10 @@ async def safe_a_score_example(
  if ignore_errors: # Gracefully handle the error, does not stop the evaluation
  scorer.error = str(e)
  scorer.success = False
- with example_logging_context(example.timestamp, example.example_id):
+ with example_logging_context(example.created_at, example.example_id):
  warning(f"Ignoring errors for example {example.example_id}: {str(e)} due to missing parameters")
  else: # Raise the error and stop the evaluation
- with example_logging_context(example.timestamp, example.example_id):
+ with example_logging_context(example.created_at, example.example_id):
  error(f"Stopping example {example.example_id}: {str(e)} due to missing parameters")
  raise
  except TypeError: # in case a_score_example does not accept _show_indicator
@@ -68,27 +68,27 @@ async def safe_a_score_example(
  except MissingTestCaseParamsError as e:
  if skip_on_missing_params:
  scorer.skipped = True
- with example_logging_context(example.timestamp, example.example_id):
+ with example_logging_context(example.created_at, example.example_id):
  warning(f"Skipping example {example.example_id} due to missing parameters")
  return
  else:
  if ignore_errors:
  scorer.error = str(e)
  scorer.success = False
- with example_logging_context(example.timestamp, example.example_id):
+ with example_logging_context(example.created_at, example.example_id):
  warning(f"Ignoring errors for example {example.example_id}: {str(e)} due to missing parameters")
  else:
- with example_logging_context(example.timestamp, example.example_id):
+ with example_logging_context(example.created_at, example.example_id):
  error(f"Stopping example {example.example_id}: {str(e)} due to missing parameters")
  raise
  except Exception as e:
  if ignore_errors:
  scorer.error = str(e)
  scorer.success = False # Assuming you want to set success to False
- with example_logging_context(example.timestamp, example.example_id):
+ with example_logging_context(example.created_at, example.example_id):
  warning(f"Ignoring errors for example {example.example_id}: {str(e)}")
  else:
- with example_logging_context(example.timestamp, example.example_id):
+ with example_logging_context(example.created_at, example.example_id):
  error(f"Stopping example {example.example_id}: {str(e)}")
  raise
  
@@ -128,7 +128,7 @@ async def score_task(
  except MissingTestCaseParamsError as e:
  if skip_on_missing_params:
  scorer.skipped = True
- with example_logging_context(example.timestamp, example.example_id):
+ with example_logging_context(example.created_at, example.example_id):
  debug(f"Skipping example {example.example_id} due to missing parameters")
  return
  else:
@@ -137,7 +137,7 @@ async def score_task(
  scorer.success = False # Override success
  finish_text = "Failed"
  else:
- with example_logging_context(example.timestamp, example.example_id):
+ with example_logging_context(example.created_at, example.example_id):
  error(f"Stopping example {example.example_id}: {str(e)} due to missing parameters")
  raise
  except TypeError:
@@ -147,7 +147,7 @@ async def score_task(
  except MissingTestCaseParamsError as e:
  if skip_on_missing_params:
  scorer.skipped = True
- with example_logging_context(example.timestamp, example.example_id):
+ with example_logging_context(example.created_at, example.example_id):
  debug(f"Skipping example {example.example_id} due to missing parameters")
  return
  else:
@@ -156,7 +156,7 @@ async def score_task(
  scorer.success = False # Override success
  finish_text = "Failed"
  else:
- with example_logging_context(example.timestamp, example.example_id):
+ with example_logging_context(example.created_at, example.example_id):
  error(f"Stopping example {example.example_id}: {str(e)} due to missing parameters")
  raise
  except Exception as e:
@@ -164,10 +164,10 @@ async def score_task(
  scorer.error = str(e)
  scorer.success = False # Override success
  finish_text = "Failed"
- with example_logging_context(example.timestamp, example.example_id):
+ with example_logging_context(example.created_at, example.example_id):
  warning(f"Ignoring errors for example {example.example_id}: {str(e)}")
  else:
- with example_logging_context(example.timestamp, example.example_id):
+ with example_logging_context(example.created_at, example.example_id):
  error(f"Stopping example {example.example_id}: {str(e)}")
  raise
  
@@ -305,7 +305,7 @@ async def a_execute_scoring(
  bar_format="{desc}: |{bar}|{percentage:3.0f}% ({n_fmt}/{total_fmt}) [Time Taken: {elapsed}, {rate_fmt}{postfix}]",
  ) as pbar:
  for i, ex in enumerate(examples):
- with example_logging_context(ex.timestamp, ex.example_id):
+ with example_logging_context(ex.created_at, ex.example_id):
  debug(f"Starting scoring for example {ex.example_id}")
  debug(f"Input: {ex.input}")
  debug(f"Using {len(scorers)} scorers")