judgeval 0.0.44__py3-none-any.whl → 0.0.46__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/__init__.py +5 -4
- judgeval/clients.py +6 -6
- judgeval/common/__init__.py +7 -2
- judgeval/common/exceptions.py +2 -3
- judgeval/common/logger.py +74 -49
- judgeval/common/s3_storage.py +30 -23
- judgeval/common/tracer.py +1273 -939
- judgeval/common/utils.py +416 -244
- judgeval/constants.py +73 -61
- judgeval/data/__init__.py +1 -1
- judgeval/data/custom_example.py +3 -2
- judgeval/data/datasets/dataset.py +80 -54
- judgeval/data/datasets/eval_dataset_client.py +131 -181
- judgeval/data/example.py +67 -43
- judgeval/data/result.py +11 -9
- judgeval/data/scorer_data.py +4 -2
- judgeval/data/tool.py +25 -16
- judgeval/data/trace.py +57 -29
- judgeval/data/trace_run.py +5 -11
- judgeval/evaluation_run.py +22 -82
- judgeval/integrations/langgraph.py +546 -184
- judgeval/judges/base_judge.py +1 -2
- judgeval/judges/litellm_judge.py +33 -11
- judgeval/judges/mixture_of_judges.py +128 -78
- judgeval/judges/together_judge.py +22 -9
- judgeval/judges/utils.py +14 -5
- judgeval/judgment_client.py +259 -271
- judgeval/rules.py +169 -142
- judgeval/run_evaluation.py +462 -305
- judgeval/scorers/api_scorer.py +20 -11
- judgeval/scorers/exceptions.py +1 -0
- judgeval/scorers/judgeval_scorer.py +77 -58
- judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +46 -15
- judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +3 -2
- judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +3 -2
- judgeval/scorers/judgeval_scorers/api_scorers/classifier_scorer.py +12 -11
- judgeval/scorers/judgeval_scorers/api_scorers/comparison.py +7 -5
- judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py +3 -2
- judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py +3 -2
- judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py +5 -2
- judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py +2 -1
- judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +17 -8
- judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +3 -2
- judgeval/scorers/judgeval_scorers/api_scorers/groundedness.py +3 -2
- judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +3 -2
- judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +3 -2
- judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py +8 -9
- judgeval/scorers/judgeval_scorers/api_scorers/summarization.py +4 -4
- judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +5 -5
- judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +5 -2
- judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py +9 -10
- judgeval/scorers/prompt_scorer.py +48 -37
- judgeval/scorers/score.py +86 -53
- judgeval/scorers/utils.py +11 -7
- judgeval/tracer/__init__.py +1 -1
- judgeval/utils/alerts.py +23 -12
- judgeval/utils/{data_utils.py → file_utils.py} +5 -9
- judgeval/utils/requests.py +29 -0
- judgeval/version_check.py +5 -2
- {judgeval-0.0.44.dist-info → judgeval-0.0.46.dist-info}/METADATA +79 -135
- judgeval-0.0.46.dist-info/RECORD +69 -0
- judgeval-0.0.44.dist-info/RECORD +0 -68
- {judgeval-0.0.44.dist-info → judgeval-0.0.46.dist-info}/WHEEL +0 -0
- {judgeval-0.0.44.dist-info → judgeval-0.0.46.dist-info}/licenses/LICENSE.md +0 -0
judgeval/scorers/judgeval_scorers/api_scorers/groundedness.py

```diff
@@ -10,16 +10,17 @@ from judgeval.scorers.api_scorer import APIJudgmentScorer
 from judgeval.constants import APIScorer
 from judgeval.data import ExampleParams

+
 class GroundednessScorer(APIJudgmentScorer):
     def __init__(self, threshold: float):
         super().__init__(
-            threshold=threshold,
+            threshold=threshold,
             score_type=APIScorer.GROUNDEDNESS,
             required_params=[
                 ExampleParams.INPUT,
                 ExampleParams.ACTUAL_OUTPUT,
                 ExampleParams.RETRIEVAL_CONTEXT,
-            ]
+            ],
         )

     @property
```
judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py

```diff
@@ -10,16 +10,17 @@ from judgeval.scorers.api_scorer import APIJudgmentScorer
 from judgeval.constants import APIScorer
 from judgeval.data import ExampleParams

+
 class HallucinationScorer(APIJudgmentScorer):
     def __init__(self, threshold: float):
         super().__init__(
-            threshold=threshold,
+            threshold=threshold,
             score_type=APIScorer.HALLUCINATION,
             required_params=[
                 ExampleParams.INPUT,
                 ExampleParams.ACTUAL_OUTPUT,
                 ExampleParams.CONTEXT,
-            ]
+            ],
         )

     @property
```
judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py

```diff
@@ -10,15 +10,16 @@ from judgeval.scorers.api_scorer import APIJudgmentScorer
 from judgeval.constants import APIScorer
 from judgeval.data import ExampleParams

+
 class InstructionAdherenceScorer(APIJudgmentScorer):
     def __init__(self, threshold: float):
         super().__init__(
-            threshold=threshold,
+            threshold=threshold,
             score_type=APIScorer.INSTRUCTION_ADHERENCE,
             required_params=[
                 ExampleParams.INPUT,
                 ExampleParams.ACTUAL_OUTPUT,
-            ]
+            ],
         )

     @property
```
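The three hunks above are formatting-only changes, but they show the shared constructor pattern for these API scorers: a single `threshold` argument plus a fixed `score_type` and `required_params` list. A minimal usage sketch, assuming `Example` accepts the corresponding fields (`input`, `actual_output`, `retrieval_context`) as keyword arguments, which this diff does not itself show:

```python
# Hypothetical usage sketch; the Example keyword names are assumed to mirror
# ExampleParams.INPUT / ACTUAL_OUTPUT / RETRIEVAL_CONTEXT.
from judgeval.data import Example
from judgeval.scorers.judgeval_scorers.api_scorers.groundedness import GroundednessScorer

scorer = GroundednessScorer(threshold=0.8)  # threshold is the only constructor argument shown

example = Example(
    input="What is the capital of France?",
    actual_output="Paris is the capital of France.",
    retrieval_context=["Paris has been the capital of France since 987."],
)
```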
judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py

```diff
@@ -5,33 +5,32 @@ TODO add link to docs page for this scorer

 """

-
 # External imports
 from pydantic import BaseModel, Field
+
 # Internal imports
 from judgeval.scorers.api_scorer import APIJudgmentScorer
 from judgeval.constants import APIScorer
 from judgeval.data import ExampleParams

+
 class JSONCorrectnessScorer(APIJudgmentScorer):
     json_schema: BaseModel = Field(None, exclude=True)
-
+
     def __init__(self, threshold: float, json_schema: BaseModel):
         super().__init__(
-            threshold=threshold,
+            threshold=threshold,
             score_type=APIScorer.JSON_CORRECTNESS,
             required_params=[
                 ExampleParams.INPUT,
                 ExampleParams.ACTUAL_OUTPUT,
-            ]
+            ],
         )
-        object.__setattr__(self,
-
+        object.__setattr__(self, "json_schema", json_schema)
+
     def to_dict(self):
         base_dict = super().to_dict()  # Get the parent class's dictionary
-        base_dict["kwargs"] = {
-            "json_schema": self.json_schema.model_json_schema()
-        }
+        base_dict["kwargs"] = {"json_schema": self.json_schema.model_json_schema()}
         return base_dict

     @property
```
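The reworked `JSONCorrectnessScorer` stores a pydantic model on the scorer (via `object.__setattr__`, bypassing pydantic field validation) and serializes it with `model_json_schema()` inside `to_dict()`. A hedged sketch of how it might be constructed; the `Recipe` model is invented, and the diff does not show whether a model class or an instance is expected (both expose `model_json_schema()`):

```python
# Hypothetical sketch: pairing JSONCorrectnessScorer with a user-defined schema.
from pydantic import BaseModel

from judgeval.scorers.judgeval_scorers.api_scorers.json_correctness import (
    JSONCorrectnessScorer,
)


class Recipe(BaseModel):  # invented schema the LLM output should conform to
    title: str
    ingredients: list[str]


scorer = JSONCorrectnessScorer(threshold=1.0, json_schema=Recipe(title="", ingredients=[]))
print(scorer.to_dict()["kwargs"]["json_schema"])  # JSON schema dict forwarded to the API
```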
judgeval/scorers/judgeval_scorers/api_scorers/summarization.py

```diff
@@ -7,21 +7,21 @@ TODO add link to docs page for this scorer

 # Internal imports
 from judgeval.scorers.api_scorer import APIJudgmentScorer
-from judgeval.constants import APIScorer
+from judgeval.constants import APIScorer
 from judgeval.data import ExampleParams

+
 class SummarizationScorer(APIJudgmentScorer):
     def __init__(self, threshold: float):
         super().__init__(
-            threshold=threshold,
+            threshold=threshold,
             score_type=APIScorer.SUMMARIZATION,
             required_params=[
                 ExampleParams.INPUT,
                 ExampleParams.ACTUAL_OUTPUT,
-            ]
+            ],
         )

     @property
     def __name__(self):
         return "Summarization"
-
```
judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py

```diff
@@ -6,13 +6,13 @@
 from judgeval.scorers.api_scorer import APIJudgmentScorer
 from judgeval.constants import APIScorer
 from typing import Optional, Dict
+
+
 class ToolDependencyScorer(APIJudgmentScorer):
     kwargs: Optional[Dict] = None
-
-
-
-            score_type=APIScorer.TOOL_DEPENDENCY
-        )
+
+    def __init__(self, threshold: float = 1.0, enable_param_checking: bool = True):
+        super().__init__(threshold=threshold, score_type=APIScorer.TOOL_DEPENDENCY)
         self.kwargs = {"enable_param_checking": enable_param_checking}

     @property
```
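The new `ToolDependencyScorer.__init__` shows its defaults (`threshold=1.0`, `enable_param_checking=True`) and forwards the flag through `self.kwargs`. A small usage sketch based only on what the hunk shows:

```python
# Hypothetical usage sketch based on the constructor shown above.
from judgeval.scorers.judgeval_scorers.api_scorers.tool_dependency import (
    ToolDependencyScorer,
)

scorer = ToolDependencyScorer(enable_param_checking=False)  # threshold defaults to 1.0
print(scorer.kwargs)  # {'enable_param_checking': False}
```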
judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py

```diff
@@ -6,11 +6,14 @@
 from judgeval.scorers.api_scorer import APIJudgmentScorer
 from judgeval.constants import APIScorer
 from typing import Optional, Dict
+
+
 class ToolOrderScorer(APIJudgmentScorer):
     kwargs: Optional[Dict] = None
-
+
+    def __init__(self, threshold: float = 1.0, exact_match: bool = False):
         super().__init__(
-            threshold=threshold,
+            threshold=threshold,
             score_type=APIScorer.TOOL_ORDER,
         )
         self.kwargs = {"exact_match": exact_match}
```
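`ToolOrderScorer` follows the same shape, with an `exact_match` flag instead. A matching sketch:

```python
# Hypothetical usage sketch based on the constructor shown above.
from judgeval.scorers.judgeval_scorers.api_scorers.tool_order import ToolOrderScorer

scorer = ToolOrderScorer(exact_match=True)  # threshold defaults to 1.0
print(scorer.kwargs)  # {'exact_match': True}
```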
judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py

```diff
@@ -4,15 +4,17 @@ ClassifierScorer implementation for basic Text-to-SQL evaluation.
 Takes a natural language query, a corresponding LLM-generated SQL query, and a table schema + (optional) metadata.
 Determines if the LLM-generated SQL query is valid and works for the natural language query.
 """
+
 from judgeval.scorers import ClassifierScorer

 Text2SQLScorer = ClassifierScorer(
     name="Text to SQL",
     slug="text2sql-1010101010",
     threshold=1.0,
-    conversation=[
-
-
+    conversation=[
+        {
+            "role": "system",
+            "content": """You will be given a natural language query, a corresponding LLM-generated SQL query, and a table schema + (optional) metadata.

 ** TASK INSTRUCTIONS **
 Your task is to decide whether the LLM generated SQL query properly filters for what the natural language query is asking, based on the table schema + (optional) metadata.
@@ -44,11 +46,8 @@ LLM generated SQL query:

 Table schema:
 {{context}}
-"""
-
-
-
-        "N": 0.0
-    }
+""",
+        }
+    ],
+    options={"Y": 1.0, "N": 0.0},
 )
-
```
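The restructured `Text2SQLScorer` illustrates the `ClassifierScorer` pattern: a `conversation` whose system prompt contains template placeholders (such as `{{context}}`) and an `options` map from the judge's answer choices to scores. A sketch of defining a similar classifier; the name, slug, and prompt text are invented, and it assumes `{{actual_output}}` is substituted from the evaluated example the same way `{{context}}` is:

```python
# Hypothetical ClassifierScorer following the same pattern as Text2SQLScorer above.
from judgeval.scorers import ClassifierScorer

PoliteToneScorer = ClassifierScorer(
    name="Polite Tone",             # invented
    slug="polite-tone-0000000000",  # invented
    threshold=1.0,
    conversation=[
        {
            "role": "system",
            "content": """Decide whether the response below is polite.

Response:
{{actual_output}}

Answer Y if the response is polite, N otherwise.""",
        }
    ],
    options={"Y": 1.0, "N": 0.0},
)
```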
judgeval/scorers/prompt_scorer.py

```diff
@@ -9,7 +9,7 @@ To implement a subclass of PromptScorer, you need to implement the following met
 - success_check(): determines whether the evaluation was successful

 The core idea of PromptScorer is to provide a flexible way to create custom scoring metrics
-by leveraging LLM judges to evaluate examples. The scorer constructs a prompt, sends it to
+by leveraging LLM judges to evaluate examples. The scorer constructs a prompt, sends it to
 the judge, and parses the structured response to determine a score.

 For example, the SentimentScorer subclass uses PromptScorer to detect negative sentiment in responses
@@ -26,17 +26,17 @@ NOTE: When implementing build_measure_prompt and build_schema:
 """

 from abc import abstractmethod
-from typing import List, Optional, Tuple, Any
-from pydantic import BaseModel,
+from typing import List, Optional, Tuple, Any
+from pydantic import BaseModel, Field

 from judgeval.data import Example
 from judgeval.data.example import ExampleParams
 from judgeval.scorers import JudgevalScorer
 from judgeval.scorers.utils import (
-    scorer_progress_meter,
+    scorer_progress_meter,
     parse_response_json,
     get_or_create_event_loop,
-    create_verbose_logs
+    create_verbose_logs,
 )
 from judgeval.judges import JudgevalJudge

@@ -56,10 +56,10 @@ class PromptScorer(JudgevalScorer, BaseModel):
     # DO NOT SET THESE FIELDS MANUALLY, THEY ARE SET BY THE SCORE_EXAMPLE METHOD
     _response: Optional[dict] = None
     _result: Optional[float] = None
-
+
     def __init__(
         self,
-        name: str,
+        name: str,
         threshold: float = 0.5,
         include_reason: bool = True,
         async_mode: bool = True,
@@ -91,10 +91,8 @@ class PromptScorer(JudgevalScorer, BaseModel):
         )

     def score_example(
-
-
-        _show_indicator: bool = True
-    ) -> float:
+        self, example: Example, _show_indicator: bool = True
+    ) -> float | None:
         """
         Synchronous method for scoring an example using the prompt criteria.
         """
@@ -104,6 +102,7 @@ class PromptScorer(JudgevalScorer, BaseModel):
                 loop.run_until_complete(
                     self.a_score_example(example, _show_indicator=False)
                 )
+                return self._result
             else:
                 result, reason = self.evaluate(example)
                 self.reason = reason
@@ -117,10 +116,10 @@ class PromptScorer(JudgevalScorer, BaseModel):
                 return result

     async def a_score_example(
-
-
-
-
+        self,
+        example: Example,
+        _show_indicator: bool = True,
+    ) -> float:
         """
         Async method for scoring an example using the prompt criteria.
         """
@@ -135,30 +134,32 @@ class PromptScorer(JudgevalScorer, BaseModel):
                 ],
             )
             return result
-
+
     def evaluate(self, example: Example) -> Tuple[Any, str]:
         """
         Synchronous helper method for evaluating an example using the prompt criteria.

-        Builds a custom prompt using `build_measure_prompt` and sends it to the judge model
+        Builds a custom prompt using `build_measure_prompt` and sends it to the judge model
         for evaluation. The result is then parsed as JSON and returned.

         NOTE: It is assumed that the model response will be JSON and contain a "score" and "reason" field.
         """
         prompt = self._build_measure_prompt(example)
-        if self.using_native_model:
+        if self.using_native_model and self.model:
             res = self.model.generate(prompt)
             response = parse_response_json(res, self)
             result, reason = self._process_response(response)
             return result, reason
         else:
-            raise NotImplementedError(
+            raise NotImplementedError(
+                "Non-native judge models are not supported in synchronous mode yet."
+            )

     async def a_evaluate(self, example: Example) -> Tuple[Any, str]:
         """
         Asynchronous helper method for evaluating an example using the prompt criteria.

-        Builds a custom prompt using `build_measure_prompt` and sends it to the judge model
+        Builds a custom prompt using `build_measure_prompt` and sends it to the judge model
         for evaluation. The result is then parsed as JSON and returned.

         NOTE: It is assumed that the model response will be JSON and contain a "score" and "reason" field.
@@ -166,7 +167,7 @@ class PromptScorer(JudgevalScorer, BaseModel):
         judge_prompt = self._build_measure_prompt(example)
         schema = self._build_schema()
         prompt = self._enforce_prompt_format(judge_prompt=judge_prompt, schema=schema)
-        if self.using_native_model:
+        if self.using_native_model and self.model:
             res = await self.model.a_generate(prompt)
             response = parse_response_json(res, self)
             self._response = response
@@ -177,7 +178,9 @@ class PromptScorer(JudgevalScorer, BaseModel):
             self._response = response
             return result, reason
         else:
-            raise NotImplementedError(
+            raise NotImplementedError(
+                "Non-native judge models are not supported in async mode yet."
+            )

     # TODO: can we make this take *args and **kwargs? How does that work with a_evaluate() since we'd have to pass the same args
     @abstractmethod
@@ -190,7 +193,7 @@ class PromptScorer(JudgevalScorer, BaseModel):

         The prompt is typically a set of instructions that the judge model uses to evaluate the example.

-        This function returns a conversation prompt of the form
+        This function returns a conversation prompt of the form
         [{"role": "system", "content": "..."}, {"role": "user", "content": "..."}]

         A basic version of implementing this function could be as follows:
@@ -201,7 +204,7 @@ class PromptScorer(JudgevalScorer, BaseModel):
         ]
         """
         pass
-
+
     # TODO: does this need to take *args and **kwargs? How does that work with a_evaluate() since we'd have to pass the same args
     @abstractmethod
     def _build_schema(self) -> dict:
@@ -214,23 +217,23 @@ class PromptScorer(JudgevalScorer, BaseModel):
         return {"score": int, "reason": str}
         """
         pass
-
+
     def _enforce_prompt_format(self, judge_prompt: List[dict], schema: dict):
         """
         Formats the final prompt to the judge model.

-        This function takes a list of dictionaries (`judge_prompt`) and a schema dictionary (`schema`),
-        and appends a schema enforcement prompt to the content of the first dictionary in the list, which is assumed to be the system prompt.
+        This function takes a list of dictionaries (`judge_prompt`) and a schema dictionary (`schema`),
+        and appends a schema enforcement prompt to the content of the first dictionary in the list, which is assumed to be the system prompt.
         The schema enforcement prompt instructs the judge model to provide its response in a specific JSON format.

         Args:
-            judge_prompt (List[dict]): A list of dictionaries representing the judge prompt.
+            judge_prompt (List[dict]): A list of dictionaries representing the judge prompt.
                 Each dictionary should contain a "content" key.
-            schema (dict): A dictionary representing the schema. The keys are the expected keys in the response,
+            schema (dict): A dictionary representing the schema. The keys are the expected keys in the response,
                 and the values are the types of the corresponding values.

         Returns:
-            List[dict]: The modified judge prompt with the schema enforcement prompt appended to the content
+            List[dict]: The modified judge prompt with the schema enforcement prompt appended to the content
             of the first dictionary.

         Raises:
@@ -242,19 +245,27 @@ class PromptScorer(JudgevalScorer, BaseModel):
         formatted_prompt = format_measure_prompt(judge_prompt, schema)
         # formatted_prompt[0]["content"] will include the schema enforcement prompt
         """
-        SCHEMA_ENFORCEMENT_PROMPT =
-
+        SCHEMA_ENFORCEMENT_PROMPT = (
+            "\n\nPlease provide your response in the following JSON format: {"
+        )
+        if isinstance(judge_prompt, list) and all(
+            isinstance(item, dict) for item in judge_prompt
+        ):
             # create formatting string for schema enforcement
-            # schema is a map between key and type of the value
+            # schema is a map between key and type of the value
             for key, key_type in schema.items():
                 SCHEMA_ENFORCEMENT_PROMPT += f'"{key}": <{key}> ({key_type.__name__}), '
-            SCHEMA_ENFORCEMENT_PROMPT =
+            SCHEMA_ENFORCEMENT_PROMPT = (
+                SCHEMA_ENFORCEMENT_PROMPT[:-2] + "}"
+            )  # remove trailing comma and space
             judge_prompt[0]["content"] += SCHEMA_ENFORCEMENT_PROMPT
             return judge_prompt
         else:
-            raise TypeError(
+            raise TypeError(
+                f"Prompt must be a list of dictionaries. Got {type(judge_prompt)} instead."
+            )

-    @abstractmethod
+    @abstractmethod
     def _process_response(self, response: dict):
         """
         Customizable method for processing the response from the judge model.
```
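The rewritten `_enforce_prompt_format` body builds a JSON-format instruction from the schema and appends it to the first (system) message. A standalone sketch of just that string-building step, using the `{"score": int, "reason": str}` schema from the docstring above (the helper name here is purely illustrative):

```python
# Standalone sketch of the schema-enforcement suffix built in _enforce_prompt_format.
def build_enforcement_suffix(schema: dict) -> str:
    suffix = "\n\nPlease provide your response in the following JSON format: {"
    for key, key_type in schema.items():
        suffix += f'"{key}": <{key}> ({key_type.__name__}), '
    return suffix[:-2] + "}"  # drop the trailing comma and space


print(build_enforcement_suffix({"score": int, "reason": str}))
# (after two leading newlines)
# Please provide your response in the following JSON format: {"score": <score> (int), "reason": <reason> (str)}
```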
judgeval/scorers/prompt_scorer.py (continued)

```diff
@@ -276,7 +287,7 @@ class PromptScorer(JudgevalScorer, BaseModel):
         Determines whether or not the PromptScorer should consider the evaluation of a single example successful.
         """
         pass
-
+
     @property
     def __name__(self):
         return self.name
```
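Pulling the `prompt_scorer.py` hunks together: a subclass supplies `_build_measure_prompt`, `_build_schema`, `_process_response`, and `success_check`, while the base class prompts the judge and parses its JSON reply. A minimal sketch loosely following the `SentimentScorer` example mentioned in the module docstring; the prompt wording and pass/fail convention are invented, and `self.threshold` / `example.actual_output` are assumed attributes not shown in these hunks:

```python
# Hedged sketch of a PromptScorer subclass; method names and signatures come from
# the hunks above, everything else is illustrative.
from typing import List, Tuple

from judgeval.data import Example
from judgeval.scorers.prompt_scorer import PromptScorer


class SentimentScorer(PromptScorer):
    def _build_measure_prompt(self, example: Example) -> List[dict]:
        return [
            {
                "role": "system",
                "content": "Rate the negativity of the response from 0 (not negative) "
                "to 1 (extremely negative) and explain your rating.",
            },
            {"role": "user", "content": f"Response: {example.actual_output}"},
        ]

    def _build_schema(self) -> dict:
        return {"score": int, "reason": str}

    def _process_response(self, response: dict) -> Tuple[float, str]:
        return response["score"], response["reason"]

    def success_check(self) -> bool:
        # Invented convention: lower negativity scores count as a pass.
        return self._result is not None and self._result <= self.threshold


scorer = SentimentScorer(name="Sentiment")  # threshold defaults to 0.5 per the diff
```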