judgeval 0.0.38__py3-none-any.whl → 0.0.40__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/clients.py +6 -4
- judgeval/common/tracer.py +361 -236
- judgeval/constants.py +3 -0
- judgeval/data/__init__.py +2 -1
- judgeval/data/example.py +14 -13
- judgeval/data/tool.py +47 -0
- judgeval/data/trace.py +28 -39
- judgeval/data/trace_run.py +2 -1
- judgeval/evaluation_run.py +4 -7
- judgeval/judgment_client.py +27 -6
- judgeval/run_evaluation.py +395 -37
- judgeval/scorers/__init__.py +4 -1
- judgeval/scorers/judgeval_scorer.py +8 -0
- judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +4 -0
- judgeval/scorers/judgeval_scorers/api_scorers/classifier_scorer.py +124 -0
- judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +20 -0
- judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py +1 -1
- judgeval/scorers/prompt_scorer.py +5 -164
- judgeval/scorers/score.py +15 -15
- judgeval-0.0.40.dist-info/METADATA +1441 -0
- {judgeval-0.0.38.dist-info → judgeval-0.0.40.dist-info}/RECORD +23 -20
- judgeval-0.0.38.dist-info/METADATA +0 -247
- {judgeval-0.0.38.dist-info → judgeval-0.0.40.dist-info}/WHEEL +0 -0
- {judgeval-0.0.38.dist-info → judgeval-0.0.40.dist-info}/licenses/LICENSE.md +0 -0
judgeval/scorers/judgeval_scorers/api_scorers/classifier_scorer.py
ADDED
@@ -0,0 +1,124 @@
+from judgeval.scorers.api_scorer import APIJudgmentScorer
+from judgeval.constants import APIScorer
+from typing import List, Mapping, Optional, Dict
+from pydantic import model_serializer
+
+class ClassifierScorer(APIJudgmentScorer):
+    """
+    In the Judgment backend, this scorer is implemented as a PromptScorer that takes
+    1. a system role that may involve the Example object
+    2. options for scores on the example
+
+    and uses a judge to execute the evaluation from the system role and classify into one of the options
+
+    ex:
+    system_role = "You are a judge that evaluates whether the response is positive or negative. The response is: {example.actual_output}"
+    options = {"positive": 1, "negative": 0}
+
+    Args:
+        name (str): The name of the scorer
+        slug (str): A unique identifier for the scorer
+        conversation (List[dict]): The conversation template with placeholders (e.g., {{actual_output}})
+        options (Mapping[str, float]): A mapping of classification options to their corresponding scores
+        threshold (float): The threshold for determining success (default: 0.5)
+        include_reason (bool): Whether to include reasoning in the response (default: True)
+        strict_mode (bool): Whether to use strict mode (default: False)
+        verbose_mode (bool): Whether to include verbose logging (default: False)
+    """
+    name: Optional[str] = None
+    slug: Optional[str] = None
+    conversation: Optional[List[dict]] = None
+    options: Optional[Mapping[str, float]] = None
+    verbose_mode: bool = False
+    strict_mode: bool = False
+    include_reason: bool = True,
+    async_mode: bool = True,
+    threshold: float = 0.5
+
+    def __init__(
+        self,
+        name: str,
+        slug: str,
+        conversation: List[dict],
+        options: Mapping[str, float],
+        threshold: float = 0.5,
+        include_reason: bool = True,
+        strict_mode: bool = False,
+        verbose_mode: bool = False,
+        async_mode: bool = True,
+    ):
+        super().__init__(
+            threshold=threshold,
+            score_type=APIScorer.CLASSIFIER,
+        )
+        self.name = name
+        self.verbose_mode = verbose_mode
+        self.strict_mode = strict_mode
+        self.include_reason = include_reason
+        self.slug = slug
+        self.conversation = conversation
+        self.options = options
+        self.async_mode = async_mode
+
+    def update_name(self, name: str):
+        """
+        Updates the name of the scorer.
+        """
+        self.name = name
+
+    def update_threshold(self, threshold: float):
+        """
+        Updates the threshold of the scorer.
+        """
+        self.threshold = threshold
+
+    def update_conversation(self, conversation: List[dict]):
+        """
+        Updates the conversation with the new conversation.
+
+        Sample conversation:
+        [{'role': 'system', 'content': "Did the chatbot answer the user's question in a kind way?: {{actual_output}}."}]
+        """
+        self.conversation = conversation
+
+    def update_options(self, options: Mapping[str, float]):
+        """
+        Updates the options with the new options.
+
+        Sample options:
+        {"yes": 1, "no": 0}
+        """
+        self.options = options
+
+    def __str__(self):
+        return f"ClassifierScorer(name={self.name}, slug={self.slug}, conversation={self.conversation}, threshold={self.threshold}, options={self.options})"
+
+    # @model_serializer
+    # def serialize_model(self) -> dict:
+    #     """
+    #     Defines how the ClassifierScorer should be serialized when model_dump() is called.
+    #     """
+    #     return {
+    #         "name": self.name,
+    #         "score_type": self.name,
+    #         "conversation": self.conversation,
+    #         "options": self.options,
+    #         "threshold": self.threshold,
+    #         "include_reason": self.include_reason,
+    #         "async_mode": self.async_mode,
+    #         "strict_mode": self.strict_mode,
+    #         "verbose_mode": self.verbose_mode,
+    #     }
+
+    def to_dict(self) -> dict:
+        return {
+            "name": self.name,
+            "score_type": self.name,
+            "conversation": self.conversation,
+            "options": self.options,
+            "threshold": self.threshold,
+            "include_reason": self.include_reason,
+            "async_mode": self.async_mode,
+            "strict_mode": self.strict_mode,
+            "verbose_mode": self.verbose_mode,
+        }
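For orientation, here is a minimal usage sketch of the new ClassifierScorer. The sentiment prompt, option labels, and slug are illustrative values, not anything shipped with the package; the import path follows the existing `from judgeval.scorers import ClassifierScorer` line visible in the Text2SQL hunk further down.

from judgeval.scorers import ClassifierScorer

# Illustrative values only -- the conversation template, options, and slug
# below are not part of judgeval itself.
sentiment_scorer = ClassifierScorer(
    name="Sentiment",
    slug="sentiment-example-1",
    conversation=[{
        "role": "system",
        "content": "Is the following response positive or negative?: {{actual_output}}",
    }],
    options={"positive": 1.0, "negative": 0.0},
    threshold=0.5,
)

print(sentiment_scorer)            # ClassifierScorer(name=Sentiment, slug=sentiment-example-1, ...)
print(sentiment_scorer.to_dict())  # plain-dict view of the scorer configuration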
judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py
ADDED
@@ -0,0 +1,20 @@
+"""
+`judgeval` tool dependency scorer
+"""
+
+# Internal imports
+from judgeval.scorers.api_scorer import APIJudgmentScorer
+from judgeval.constants import APIScorer
+from typing import Optional, Dict
+class ToolDependencyScorer(APIJudgmentScorer):
+    kwargs: Optional[Dict] = None
+    def __init__(self, threshold: float=1.0, enable_param_checking: bool = True):
+        super().__init__(
+            threshold=threshold,
+            score_type=APIScorer.TOOL_DEPENDENCY
+        )
+        self.kwargs = {"enable_param_checking": enable_param_checking}
+
+    @property
+    def __name__(self):
+        return "Tool Dependency"
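A corresponding sketch for the new ToolDependencyScorer. Treat the top-level import as an assumption: judgeval/scorers/__init__.py and api_scorers/__init__.py both gained exports in this release (see the file list above), but the exact re-export names are not shown in this diff.

# Assumed re-export; fall back to the full module path if it is not exposed
# at the package level.
from judgeval.scorers import ToolDependencyScorer

scorer = ToolDependencyScorer(threshold=1.0, enable_param_checking=True)
print(scorer.__name__)  # "Tool Dependency"
print(scorer.kwargs)    # {"enable_param_checking": True}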
judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py
CHANGED
@@ -7,7 +7,7 @@ Determines if the LLM-generated SQL query is valid and works for the natural lan
 from judgeval.scorers import ClassifierScorer
 
 Text2SQLScorer = ClassifierScorer(
-    "Text to SQL",
+    name="Text to SQL",
     slug="text2sql-1010101010",
     threshold=1.0,
     conversation=[{
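The only change here is that the scorer name is now passed as a keyword argument. Both call forms are equivalent under the ClassifierScorer.__init__ signature shown above; the keyword form adopted in 0.0.40 is simply self-documenting. In the sketch below, conversation and options are placeholders, not the real Text2SQL template.

from judgeval.scorers import ClassifierScorer

# Placeholder values for illustration; see text2sql_scorer.py for the real
# conversation template and options.
conversation = [{"role": "system", "content": "..."}]
options = {"valid": 1.0, "invalid": 0.0}

ClassifierScorer("Text to SQL", slug="text2sql-1010101010",
                 conversation=conversation, options=options, threshold=1.0)   # 0.0.38 call (positional name)
ClassifierScorer(name="Text to SQL", slug="text2sql-1010101010",
                 conversation=conversation, options=options, threshold=1.0)   # 0.0.40 call (keyword name)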
judgeval/scorers/prompt_scorer.py
CHANGED
@@ -37,6 +37,7 @@ from judgeval.scorers.utils import (
     get_or_create_event_loop,
     create_verbose_logs
 )
+from judgeval.judges import JudgevalJudge
 
 
 class ReasonScore(BaseModel):
@@ -49,7 +50,8 @@ class PromptScorer(JudgevalScorer, BaseModel):
     score_type: str
     threshold: float = Field(default=0.5)
     using_native_model: bool = Field(default=True)
-
+    model: Optional[JudgevalJudge] = Field(default=None)
+    skipped: bool = Field(default=False)
     # DO NOT SET THESE FIELDS MANUALLY, THEY ARE SET BY THE SCORE_EXAMPLE METHOD
     _response: Optional[dict] = None
     _result: Optional[float] = None
@@ -276,166 +278,5 @@ class PromptScorer(JudgevalScorer, BaseModel):
     def __name__(self):
         return self.name
 
-
-
-
-    """
-    This is a PromptScorer that takes
-    1. a system role that may involve the Example object
-    2. options for scores on the example
-
-    and uses a judge to execute the evaluation from the system role and classify into one of the options
-
-    ex:
-    system_role = "You are a judge that evaluates whether the response is positive or negative. The response is: {example.actual_output}"
-    options = {"positive": 1, "negative": 0}
-    """
-
-    conversation: List[dict]
-    options: Mapping[str, float]
-
-    def __init__(self, name: str, slug: str, conversation: List[dict], options: Mapping[str, float],
-                 threshold: float = 0.5, include_reason: bool = True,
-                 async_mode: bool = True, strict_mode: bool = False, verbose_mode: bool = False):
-        # Initialize BaseModel first with all fields
-        BaseModel.__init__(
-            self,
-            name=name,
-            slug=slug,
-            score_type=name,
-            conversation=conversation,
-            options=options,
-            threshold=threshold,
-            include_reason=include_reason,
-            async_mode=async_mode,
-            strict_mode=strict_mode,
-            verbose_mode=verbose_mode,
-        )
-        # Then initialize JudgevalScorer
-        JudgevalScorer.__init__(
-            self,
-            score_type=name,
-            threshold=threshold,
-            include_reason=include_reason,
-            async_mode=async_mode,
-            strict_mode=strict_mode,
-            verbose_mode=verbose_mode,
-        )
-
-    def _build_measure_prompt(self, example: Example) -> List[dict]:
-        """
-        Builds the measure prompt for the classifier scorer.
-
-        Args:
-            example (Example): The example to build the prompt for
-
-        Returns:
-            List[dict]: The measure prompt for the classifier scorer
-        """
-        replacement_words = {
-            "{{actual_output}}": example.actual_output,
-            "{{expected_output}}": example.expected_output,
-            "{{context}}": example.context,
-            "{{retrieval_context}}": example.retrieval_context,
-            "{{tools_called}}": example.tools_called,
-            "{{expected_tools}}": example.expected_tools,
-        }
-        # Make a copy of the conversation to avoid modifying the original
-        conversation_copy = [dict(message) for message in self.conversation]
-
-        # Only replace if double brackets are found in the content
-        for message in conversation_copy:
-            content = message["content"]
-            if "{{" in content:
-                for key, value in replacement_words.items():
-                    if key in content:
-                        message["content"] = content.replace(key, str(value))
-        return conversation_copy
-
-    def _build_schema(self) -> dict:
-        return self.options
-
-    def _enforce_prompt_format(self, judge_prompt: List[dict], schema: dict) -> List[dict]:
-        """
-        Enforces the judge model to choose an option from the schema.
-
-        We want the model to choose an option from the schema and a reason for the choice.
-        """
-        options = list(schema.keys())
-        options_str = ", ".join(options)
-
-        system_role = judge_prompt[0]["content"]
-        system_role += (
-            f"\n\nYou must choose one of the following options: {options_str}. "
-            "Format your response as a JSON object with two fields:\n"
-            "1. 'choice': Your selected option (must be one of the provided choices)\n"
-            "2. 'reason': A brief explanation for why you made this choice\n\n"
-            "Example response format:\n"
-            "{\n"
-            '    "choice": "<one of the valid options>",\n'
-            '    "reason": "<your explanation>"\n'
-            "}"
-        )
-
-        judge_prompt[0]["content"] = system_role
-        return judge_prompt
-
-    def _process_response(self, response: dict) -> Tuple[float, str]:
-        choice = response.get("choice")
-        if choice not in self.options:
-            raise ValueError(f"Invalid choice: {choice}. Expected one of: {self.options.keys()}")
-        reason = response.get("reason", "No reason could be found in model response.")
-        return self.options[choice], reason
-
-    def _success_check(self, **kwargs) -> bool:
-        return self.score >= self.threshold
-
-    def update_name(self, name: str):
-        """
-        Updates the name of the scorer.
-        """
-        self.name = name
-
-    def update_threshold(self, threshold: float):
-        """
-        Updates the threshold of the scorer.
-        """
-        self.threshold = threshold
-
-    def update_conversation(self, conversation: List[dict]):
-        """
-        Updates the conversation with the new conversation.
-
-        Sample conversation:
-        [{'role': 'system', 'content': "Did the chatbot answer the user's question in a kind way?: {{actual_output}}."}]
-        """
-        self.conversation = conversation
-
-    def update_options(self, options: Mapping[str, float]):
-        """
-        Updates the options with the new options.
-
-        Sample options:
-        {"yes": 1, "no": 0}
-        """
-        self.options = options
-
-    def __str__(self):
-        return f"ClassifierScorer(name={self.name}, slug={self.slug}, conversation={self.conversation}, threshold={self.threshold}, options={self.options})"
-
-    @model_serializer
-    def serialize_model(self) -> dict:
-        """
-        Defines how the ClassifierScorer should be serialized when model_dump() is called.
-        """
-        return {
-            "name": self.name,
-            "score_type": self.score_type,
-            "conversation": self.conversation,
-            "options": self.options,
-            "threshold": self.threshold,
-            "include_reason": self.include_reason,
-            "async_mode": self.async_mode,
-            "strict_mode": self.strict_mode,
-            "verbose_mode": self.verbose_mode,
-        }
+    class Config:
+        arbitrary_types_allowed = True
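Two things happen in this file: PromptScorer gains a model field typed as JudgevalJudge plus a skipped flag (the new class Config with arbitrary_types_allowed = True is what lets pydantic accept the non-pydantic judge type), and the old in-library ClassifierScorer implementation is deleted in favour of the API-backed version added above. The deleted _build_measure_prompt remains a useful reference for how conversation placeholders are filled; below is a standalone sketch of that substitution, written against a plain dict of values rather than an Example object and accumulating replacements across tokens.

from typing import Dict, List

def fill_placeholders(conversation: List[dict], values: Dict[str, str]) -> List[dict]:
    """Return a copy of `conversation` with {{...}} tokens replaced.

    Sketch of the substitution performed by the removed _build_measure_prompt;
    `values` maps placeholder tokens such as "{{actual_output}}" to strings.
    """
    filled = [dict(message) for message in conversation]  # don't mutate the original
    for message in filled:
        content = message["content"]
        if "{{" in content:
            for token, value in values.items():
                if token in content:
                    content = content.replace(token, str(value))
            message["content"] = content
    return filled

conversation = [{"role": "system",
                 "content": "Did the chatbot answer kindly?: {{actual_output}}"}]
print(fill_placeholders(conversation, {"{{actual_output}}": "Happy to help!"}))
# [{'role': 'system', 'content': 'Did the chatbot answer kindly?: Happy to help!'}]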
judgeval/scorers/score.py
CHANGED
@@ -48,7 +48,7 @@ async def safe_a_score_example(
         info(f"Successfully scored example {example.example_id}")
     except MissingTestCaseParamsError as e:
         if skip_on_missing_params: # Skip the example if the scorer requires parameters that are missing
-            with example_logging_context(example.
+            with example_logging_context(example.created_at, example.example_id):
                 warning(f"Skipping example {example.example_id} due to missing parameters")
             scorer.skipped = True
             return
@@ -56,10 +56,10 @@ async def safe_a_score_example(
            if ignore_errors: # Gracefully handle the error, does not stop the evaluation
                scorer.error = str(e)
                scorer.success = False
-                with example_logging_context(example.
+                with example_logging_context(example.created_at, example.example_id):
                    warning(f"Ignoring errors for example {example.example_id}: {str(e)} due to missing parameters")
            else: # Raise the error and stop the evaluation
-                with example_logging_context(example.
+                with example_logging_context(example.created_at, example.example_id):
                    error(f"Stopping example {example.example_id}: {str(e)} due to missing parameters")
                raise
    except TypeError: # in case a_score_example does not accept _show_indicator
@@ -68,27 +68,27 @@ async def safe_a_score_example(
        except MissingTestCaseParamsError as e:
            if skip_on_missing_params:
                scorer.skipped = True
-                with example_logging_context(example.
+                with example_logging_context(example.created_at, example.example_id):
                    warning(f"Skipping example {example.example_id} due to missing parameters")
                return
            else:
                if ignore_errors:
                    scorer.error = str(e)
                    scorer.success = False
-                    with example_logging_context(example.
+                    with example_logging_context(example.created_at, example.example_id):
                        warning(f"Ignoring errors for example {example.example_id}: {str(e)} due to missing parameters")
                else:
-                    with example_logging_context(example.
+                    with example_logging_context(example.created_at, example.example_id):
                        error(f"Stopping example {example.example_id}: {str(e)} due to missing parameters")
                    raise
    except Exception as e:
        if ignore_errors:
            scorer.error = str(e)
            scorer.success = False # Assuming you want to set success to False
-            with example_logging_context(example.
+            with example_logging_context(example.created_at, example.example_id):
                warning(f"Ignoring errors for example {example.example_id}: {str(e)}")
        else:
-            with example_logging_context(example.
+            with example_logging_context(example.created_at, example.example_id):
                error(f"Stopping example {example.example_id}: {str(e)}")
            raise
 
@@ -128,7 +128,7 @@ async def score_task(
    except MissingTestCaseParamsError as e:
        if skip_on_missing_params:
            scorer.skipped = True
-            with example_logging_context(example.
+            with example_logging_context(example.created_at, example.example_id):
                debug(f"Skipping example {example.example_id} due to missing parameters")
            return
        else:
@@ -137,7 +137,7 @@ async def score_task(
                scorer.success = False # Override success
                finish_text = "Failed"
            else:
-                with example_logging_context(example.
+                with example_logging_context(example.created_at, example.example_id):
                    error(f"Stopping example {example.example_id}: {str(e)} due to missing parameters")
                raise
    except TypeError:
@@ -147,7 +147,7 @@ async def score_task(
        except MissingTestCaseParamsError as e:
            if skip_on_missing_params:
                scorer.skipped = True
-                with example_logging_context(example.
+                with example_logging_context(example.created_at, example.example_id):
                    debug(f"Skipping example {example.example_id} due to missing parameters")
                return
            else:
@@ -156,7 +156,7 @@ async def score_task(
                    scorer.success = False # Override success
                    finish_text = "Failed"
                else:
-                    with example_logging_context(example.
+                    with example_logging_context(example.created_at, example.example_id):
                        error(f"Stopping example {example.example_id}: {str(e)} due to missing parameters")
                    raise
    except Exception as e:
@@ -164,10 +164,10 @@ async def score_task(
            scorer.error = str(e)
            scorer.success = False # Override success
            finish_text = "Failed"
-            with example_logging_context(example.
+            with example_logging_context(example.created_at, example.example_id):
                warning(f"Ignoring errors for example {example.example_id}: {str(e)}")
        else:
-            with example_logging_context(example.
+            with example_logging_context(example.created_at, example.example_id):
                error(f"Stopping example {example.example_id}: {str(e)}")
            raise
 
@@ -305,7 +305,7 @@ async def a_execute_scoring(
        bar_format="{desc}: |{bar}|{percentage:3.0f}% ({n_fmt}/{total_fmt}) [Time Taken: {elapsed}, {rate_fmt}{postfix}]",
    ) as pbar:
        for i, ex in enumerate(examples):
-            with example_logging_context(ex.
+            with example_logging_context(ex.created_at, ex.example_id):
                debug(f"Starting scoring for example {ex.example_id}")
                debug(f"Input: {ex.input}")
                debug(f"Using {len(scorers)} scorers")
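Every hunk in this file is the same one-line fix: the call sites now pass example.created_at and example.example_id to example_logging_context (the old attribute name is truncated by the diff viewer, so it is not reproduced here), which lines up with the reworked Example model in judgeval/data/example.py (+14 -13 above). For readers unfamiliar with the pattern, a generic sketch of an example-scoped logging context manager follows; it is illustrative only and not judgeval's actual implementation.

import logging
from contextlib import contextmanager

@contextmanager
def example_logging_context(created_at, example_id):
    """Illustrative stand-in: tag log records with the example being scored."""
    logger = logging.getLogger("judgeval.example")
    adapter = logging.LoggerAdapter(
        logger, {"created_at": created_at, "example_id": example_id}
    )
    adapter.debug("entering example scope")
    try:
        yield adapter
    finally:
        adapter.debug("leaving example scope")

# Call sites in score.py do not bind the yielded value:
# with example_logging_context(example.created_at, example.example_id):
#     warning(f"Skipping example {example.example_id} due to missing parameters")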