judgeval 0.0.9__py3-none-any.whl → 0.0.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/common/tracer.py +229 -44
- judgeval/constants.py +15 -3
- judgeval/data/datasets/__init__.py +2 -1
- judgeval/data/datasets/dataset.py +1 -122
- judgeval/data/datasets/eval_dataset_client.py +193 -0
- judgeval/data/result.py +16 -1
- judgeval/evaluation_run.py +2 -1
- judgeval/judges/utils.py +14 -2
- judgeval/judgment_client.py +64 -7
- judgeval/run_evaluation.py +19 -0
- judgeval/scorers/judgeval_scorer.py +8 -8
- judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/faithfulness_scorer.py +3 -1
- judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/prompts.py +6 -3
- judgeval/scorers/prompt_scorer.py +2 -2
- judgeval/scorers/score.py +11 -11
- judgeval/scorers/utils.py +3 -3
- judgeval/tracer/__init__.py +3 -0
- {judgeval-0.0.9.dist-info → judgeval-0.0.11.dist-info}/METADATA +5 -4
- {judgeval-0.0.9.dist-info → judgeval-0.0.11.dist-info}/RECORD +21 -19
- {judgeval-0.0.9.dist-info → judgeval-0.0.11.dist-info}/WHEEL +0 -0
- {judgeval-0.0.9.dist-info → judgeval-0.0.11.dist-info}/licenses/LICENSE.md +0 -0
judgeval/data/datasets/eval_dataset_client.py
ADDED
@@ -0,0 +1,193 @@
+
+from typing import Optional
+import requests
+from rich.progress import Progress, SpinnerColumn, TextColumn
+
+from judgeval.common.logger import debug, error, warning, info
+from judgeval.constants import (
+    JUDGMENT_DATASETS_PUSH_API_URL,
+    JUDGMENT_DATASETS_PULL_API_URL,
+    JUDGMENT_DATASETS_PULL_ALL_API_URL
+)
+from judgeval.data import Example
+from judgeval.data.datasets import EvalDataset
+from judgeval.data.datasets.ground_truth import GroundTruthExample
+
+
+
+
+class EvalDatasetClient:
+    def __init__(self, judgment_api_key: str):
+        self.judgment_api_key = judgment_api_key
+
+    def create_dataset(self) -> EvalDataset:
+        return EvalDataset(judgment_api_key=self.judgment_api_key)
+
+    def push(self, dataset: EvalDataset, alias: str, overwrite: Optional[bool] = False) -> bool:
+        debug(f"Pushing dataset with alias '{alias}' (overwrite={overwrite})")
+        if overwrite:
+            warning(f"Overwrite enabled for alias '{alias}'")
+        """
+        Pushes the dataset to Judgment platform
+
+        Mock request:
+        dataset = {
+            "alias": alias,
+            "ground_truths": [...],
+            "examples": [...],
+            "overwrite": overwrite
+        } ==>
+        {
+            "_alias": alias,
+            "_id": "..."  # ID of the dataset
+        }
+        """
+        with Progress(
+            SpinnerColumn(style="rgb(106,0,255)"),
+            TextColumn("[progress.description]{task.description}"),
+            transient=False,
+        ) as progress:
+            task_id = progress.add_task(
+                f"Pushing [rgb(106,0,255)]'{alias}' to Judgment...",
+                total=100,
+            )
+            content = {
+                "alias": alias,
+                "ground_truths": [g.to_dict() for g in dataset.ground_truths],
+                "examples": [e.to_dict() for e in dataset.examples],
+                "overwrite": overwrite,
+                "judgment_api_key": dataset.judgment_api_key
+            }
+            try:
+                response = requests.post(
+                    JUDGMENT_DATASETS_PUSH_API_URL,
+                    json=content
+                )
+                if response.status_code == 500:
+                    error(f"Server error during push: {content.get('message')}")
+                    return False
+                response.raise_for_status()
+            except requests.exceptions.HTTPError as err:
+                if response.status_code == 422:
+                    error(f"Validation error during push: {err.response.json()}")
+                else:
+                    error(f"HTTP error during push: {err}")
+
+            info(f"Successfully pushed dataset with alias '{alias}'")
+            payload = response.json()
+            dataset._alias = payload.get("_alias")
+            dataset._id = payload.get("_id")
+            progress.update(
+                task_id,
+                description=f"{progress.tasks[task_id].description} [rgb(25,227,160)]Done!)",
+            )
+            return True
+
+    def pull(self, alias: str) -> EvalDataset:
+        debug(f"Pulling dataset with alias '{alias}'")
+        """
+        Pulls the dataset from Judgment platform
+
+        Mock request:
+        {
+            "alias": alias,
+            "user_id": user_id
+        }
+        ==>
+        {
+            "ground_truths": [...],
+            "examples": [...],
+            "_alias": alias,
+            "_id": "..."  # ID of the dataset
+        }
+        """
+        # Make a POST request to the Judgment API to get the dataset
+        dataset = self.create_dataset()
+
+        with Progress(
+            SpinnerColumn(style="rgb(106,0,255)"),
+            TextColumn("[progress.description]{task.description}"),
+            transient=False,
+        ) as progress:
+            task_id = progress.add_task(
+                f"Pulling [rgb(106,0,255)]'{alias}'[/rgb(106,0,255)] from Judgment...",
+                total=100,
+            )
+            request_body = {
+                "alias": alias,
+                "judgment_api_key": self.judgment_api_key
+            }
+
+            try:
+                response = requests.post(
+                    JUDGMENT_DATASETS_PULL_API_URL,
+                    json=request_body
+                )
+                response.raise_for_status()
+            except requests.exceptions.RequestException as e:
+                error(f"Error pulling dataset: {str(e)}")
+                raise
+
+            info(f"Successfully pulled dataset with alias '{alias}'")
+            payload = response.json()
+            dataset.ground_truths = [GroundTruthExample(**g) for g in payload.get("ground_truths", [])]
+            dataset.examples = [Example(**e) for e in payload.get("examples", [])]
+            dataset._alias = payload.get("_alias")
+            dataset._id = payload.get("_id")
+            progress.update(
+                task_id,
+                description=f"{progress.tasks[task_id].description} [rgb(25,227,160)]Done!)",
+            )
+
+            return dataset
+
+    def pull_all_user_dataset_stats(self) -> dict:
+        debug(f"Pulling user datasets stats for user_id: {self.judgment_api_key}'")
+        """
+        Pulls the user datasets stats from Judgment platform
+
+        Mock request:
+        {
+            "user_id": user_id
+        }
+        ==>
+        {
+            "test_dataset_1": {"examples_count": len(dataset1.examples), "ground_truths_count": len(dataset1.ground_truths)},
+            "test_dataset_2": {"examples_count": len(dataset2.examples), "ground_truths_count": len(dataset2.ground_truths)},
+            ...
+        }
+        """
+        # Make a POST request to the Judgment API to get the dataset
+
+        with Progress(
+            SpinnerColumn(style="rgb(106,0,255)"),
+            TextColumn("[progress.description]{task.description}"),
+            transient=False,
+        ) as progress:
+            task_id = progress.add_task(
+                f"Pulling [rgb(106,0,255)]' datasets'[/rgb(106,0,255)] from Judgment...",
+                total=100,
+            )
+            request_body = {
+                "judgment_api_key": self.judgment_api_key
+            }
+
+            try:
+                response = requests.post(
+                    JUDGMENT_DATASETS_PULL_ALL_API_URL,
+                    json=request_body
+                )
+                response.raise_for_status()
+            except requests.exceptions.RequestException as e:
+                error(f"Error pulling dataset: {str(e)}")
+                raise
+
+            info(f"Successfully pulled datasets for userid: {self.judgment_api_key}'")
+            payload = response.json()
+
+            progress.update(
+                task_id,
+                description=f"{progress.tasks[task_id].description} [rgb(25,227,160)]Done!)",
+            )
+
+            return payload
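For orientation, here is a minimal usage sketch of the new EvalDatasetClient (not part of the diff; the alias and example fields are illustrative, a valid JUDGMENT_API_KEY is assumed, and examples are assigned as a plain list exactly as pull() does above):

```python
import os

from judgeval.data import Example
from judgeval.data.datasets.eval_dataset_client import EvalDatasetClient

# The client only needs the Judgment API key.
client = EvalDatasetClient(judgment_api_key=os.getenv("JUDGMENT_API_KEY"))

# Build a dataset locally; the Example fields here are placeholders.
dataset = client.create_dataset()
dataset.examples = [Example(input="What is RAG?", actual_output="Retrieval-augmented generation...")]

# push() returns True on success; overwrite controls whether an existing alias is replaced.
client.push(dataset, alias="demo-dataset", overwrite=False)

# pull() rebuilds an EvalDataset from the server payload, including _alias and _id.
restored = client.pull("demo-dataset")
print(len(restored.examples), restored._alias)
```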
judgeval/data/result.py
CHANGED
@@ -1,5 +1,5 @@
 from dataclasses import dataclass
-from typing import List, Union, Optional
+from typing import List, Union, Optional, Dict, Any
 
 from judgeval.data import ScorerData, ProcessExample
 
@@ -18,6 +18,9 @@ class ScoringResult:
         expected_output (Optional[str]): The expected output of the example
         context (Optional[List[str]]): The context of the example
         retrieval_context (Optional[List[str]]): The retrieval context of the example
+        additional_metadata (Optional[Dict[str, Any]]): The additional metadata of the example
+        tools_called (Optional[List[str]]): The tools called by the example
+        expected_tools (Optional[List[str]]): The expected tools of the example
         trace_id (Optional[str]): The trace id of the example
 
     """
@@ -31,6 +34,9 @@ class ScoringResult:
     expected_output: Optional[str] = None
     context: Optional[List[str]] = None
    retrieval_context: Optional[List[str]] = None
+    additional_metadata: Optional[Dict[str, Any]] = None
+    tools_called: Optional[List[str]] = None
+    expected_tools: Optional[List[str]] = None
     trace_id: Optional[str] = None
 
     example_id: Optional[str] = None
@@ -46,6 +52,9 @@ class ScoringResult:
             "expected_output": self.expected_output,
             "context": self.context,
             "retrieval_context": self.retrieval_context,
+            "additional_metadata": self.additional_metadata,
+            "tools_called": self.tools_called,
+            "expected_tools": self.expected_tools,
             "trace_id": self.trace_id,
             "example_id": self.example_id
         }
@@ -59,6 +68,9 @@ class ScoringResult:
             expected_output={self.expected_output}, \
             context={self.context}, \
             retrieval_context={self.retrieval_context}, \
+            additional_metadata={self.additional_metadata}, \
+            tools_called={self.tools_called}, \
+            expected_tools={self.expected_tools}, \
             trace_id={self.trace_id})"
 
 
@@ -79,5 +91,8 @@ def generate_scoring_result(
         expected_output=process_example.expected_output,
         context=process_example.context,
         retrieval_context=process_example.retrieval_context,
+        additional_metadata=process_example.additional_metadata,
+        tools_called=process_example.tools_called,
+        expected_tools=process_example.expected_tools,
         trace_id=process_example.trace_id
     )
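A quick sketch of how the new fields surface on ScoringResult (not part of the diff; it assumes `success` and `scorers_data` are the only required constructor arguments, and the values are illustrative):

```python
from judgeval.data.result import ScoringResult

# Real results are produced by an evaluation run; this constructs one by hand.
result = ScoringResult(
    success=True,
    scorers_data=[],
    tools_called=["web_search"],
    expected_tools=["web_search"],
    additional_metadata={"claims": 3},
)

# The new fields now round-trip through to_dict() alongside the existing ones.
print(result.to_dict()["expected_tools"])
```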
judgeval/evaluation_run.py
CHANGED
@@ -15,7 +15,7 @@ class EvaluationRun(BaseModel):
         project_name (str): The name of the project the evaluation results belong to
         eval_name (str): A name for this evaluation run
         examples (List[Example]): The examples to evaluate
-        scorers (List[Union[JudgmentScorer,
+        scorers (List[Union[JudgmentScorer, JudgevalScorer]]): A list of scorers to use for evaluation
         model (str): The model used as a judge when using LLM as a Judge
         aggregator (Optional[str]): The aggregator to use for evaluation if using Mixture of Judges
         metadata (Optional[Dict[str, Any]]): Additional metadata to include for this evaluation run, e.g. comments, dataset name, purpose, etc.
@@ -33,6 +33,7 @@ class EvaluationRun(BaseModel):
     metadata: Optional[Dict[str, Any]] = None
     # API Key will be "" until user calls client.run_eval(), then API Key will be set
     judgment_api_key: Optional[str] = ""
+    override: Optional[bool] = False
 
     def model_dump(self, **kwargs):
         data = super().model_dump(**kwargs)
judgeval/judges/utils.py
CHANGED
@@ -6,7 +6,7 @@ from typing import Optional, Union, Tuple, List
 
 from judgeval.common.exceptions import InvalidJudgeModelError
 from judgeval.judges import JudgevalJudge, LiteLLMJudge, TogetherJudge, MixtureOfJudges
-from judgeval.constants import TOGETHER_SUPPORTED_MODELS
+from judgeval.constants import TOGETHER_SUPPORTED_MODELS, JUDGMENT_SUPPORTED_MODELS, ACCEPTABLE_MODELS
 
 LITELLM_SUPPORTED_MODELS = set(litellm.model_list)
 
@@ -33,7 +33,13 @@ def create_judge(
     # Either string or List[str]
     if isinstance(model, list):
         for m in model:
-            if m
+            if m in JUDGMENT_SUPPORTED_MODELS:
+                raise NotImplementedError(
+                    """Judgment models are not yet supported for local scoring.
+                    Please either set the `use_judgment` flag to True or use
+                    non-Judgment models."""
+                )
+            if m not in LITELLM_SUPPORTED_MODELS and m not in TOGETHER_SUPPORTED_MODELS:
                 raise InvalidJudgeModelError(f"Invalid judge model chosen: {m}")
         return MixtureOfJudges(models=model), True
     # If model is a string, check that it corresponds to a valid model
@@ -41,5 +47,11 @@
         return LiteLLMJudge(model=model), True
     if model in TOGETHER_SUPPORTED_MODELS:
         return TogetherJudge(model=model), True
+    if model in JUDGMENT_SUPPORTED_MODELS:
+        raise NotImplementedError(
+            """Judgment models are not yet supported for local scoring.
+            Please either set the `use_judgment` flag to True or use
+            non-Judgment models."""
+        )
     else:
         raise InvalidJudgeModelError(f"Invalid judge model chosen: {model}")
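The practical effect of the new guard: asking for a Judgment-hosted model during local judge construction now fails fast. A sketch (the model name is a placeholder, so the call may raise InvalidJudgeModelError instead; both are handled):

```python
from judgeval.common.exceptions import InvalidJudgeModelError
from judgeval.judges.utils import create_judge

try:
    # "judgment-hosted-model" stands in for an entry in JUDGMENT_SUPPORTED_MODELS.
    judge, is_native = create_judge("judgment-hosted-model")
except NotImplementedError:
    print("Judgment models are API-only for now; set use_judgment=True on the client instead.")
except InvalidJudgeModelError as e:
    print(f"Not a recognized judge model: {e}")
```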
judgeval/judgment_client.py
CHANGED
@@ -6,7 +6,7 @@ from typing import Optional, List, Dict, Any, Union
 import requests
 
 from judgeval.constants import ROOT_API
-from judgeval.data.datasets import EvalDataset
+from judgeval.data.datasets import EvalDataset, EvalDatasetClient
 from judgeval.data import (
     ScoringResult,
     Example
@@ -23,7 +23,7 @@ from judgeval.run_evaluation import (
     assert_test
 )
 from judgeval.judges import JudgevalJudge
-from judgeval.constants import JUDGMENT_EVAL_FETCH_API_URL
+from judgeval.constants import JUDGMENT_EVAL_FETCH_API_URL, JUDGMENT_EVAL_DELETE_API_URL, JUDGMENT_EVAL_DELETE_PROJECT_API_URL
 from judgeval.common.exceptions import JudgmentAPIError
 from pydantic import BaseModel
 
@@ -36,6 +36,7 @@ class EvalRunRequestBody(BaseModel):
 class JudgmentClient:
     def __init__(self, judgment_api_key: str = os.getenv("JUDGMENT_API_KEY")):
         self.judgment_api_key = judgment_api_key
+        self.eval_dataset_client = EvalDatasetClient(judgment_api_key)
 
         # Verify API key is valid
         result, response = self._validate_api_key()
@@ -121,7 +122,7 @@ class JudgmentClient:
             raise ValueError(f"Please check your EvaluationRun object, one or more fields are invalid: \n{str(e)}")
 
     def create_dataset(self) -> EvalDataset:
-        return
+        return self.eval_dataset_client.create_dataset()
 
     def push_dataset(self, alias: str, dataset: EvalDataset, overwrite: Optional[bool] = False) -> bool:
         """
@@ -137,7 +138,7 @@ class JudgmentClient:
         """
         # Set judgment_api_key just in case it was not set
         dataset.judgment_api_key = self.judgment_api_key
-        return
+        return self.eval_dataset_client.push(dataset, alias, overwrite)
 
     def pull_dataset(self, alias: str) -> EvalDataset:
         """
@@ -149,9 +150,20 @@ class JudgmentClient:
         Returns:
             EvalDataset: The retrieved dataset
         """
-
-
-
+        return self.eval_dataset_client.pull(alias)
+
+    def pull_all_user_dataset_stats(self) -> dict:
+        """
+        Retrieves all dataset stats from the Judgment platform for the user.
+
+        Args:
+            alias (str): The name of the dataset to retrieve
+
+        Returns:
+            EvalDataset: The retrieved dataset
+        """
+        return self.eval_dataset_client.pull_all_user_dataset_stats()
+
 
     # Maybe add option where you can pass in the EvaluationRun object and it will pull the eval results from the backend
     def pull_eval(self, project_name: str, eval_run_name: str) -> List[Dict[str, Union[str, List[ScoringResult]]]]:
@@ -182,6 +194,51 @@ class JudgmentClient:
         eval_run_result[0]["id"] = result_id
         eval_run_result[0]["results"] = [ScoringResult(**filtered_result)]
         return eval_run_result
+
+    def delete_eval(self, project_name: str, eval_run_name: str) -> bool:
+        """
+        Deletes an evaluation from the server by project and run name.
+
+        Args:
+            project_name (str): Name of the project
+            eval_run_name (str): Name of the evaluation run
+
+        Returns:
+            bool: Whether the evaluation was successfully deleted
+        """
+        eval_run_request_body = EvalRunRequestBody(project_name=project_name,
+                                                   eval_name=eval_run_name,
+                                                   judgment_api_key=self.judgment_api_key)
+        response = requests.delete(JUDGMENT_EVAL_DELETE_API_URL,
+                                   json=eval_run_request_body.model_dump(),
+                                   headers={
+                                       "Content-Type": "application/json",
+                                   })
+        if response.status_code != requests.codes.ok:
+            raise ValueError(f"Error deleting eval results: {response.json()}")
+        return response.json()
+
+    def delete_project_evals(self, project_name: str) -> bool:
+        """
+        Deletes all evaluations from the server for a given project.
+
+        Args:
+            project_name (str): Name of the project
+
+        Returns:
+            bool: Whether the evaluations were successfully deleted
+        """
+        response = requests.delete(JUDGMENT_EVAL_DELETE_PROJECT_API_URL,
+                                   json={
+                                       "project_name": project_name,
+                                       "judgment_api_key": self.judgment_api_key
+                                   },
+                                   headers={
+                                       "Content-Type": "application/json",
+                                   })
+        if response.status_code != requests.codes.ok:
+            raise ValueError(f"Error deleting eval results: {response.json()}")
+        return response.json()
 
     def _validate_api_key(self):
         """
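A usage sketch for the new deletion helpers on JudgmentClient (not part of the diff; project and run names are illustrative, and the constructor reads JUDGMENT_API_KEY from the environment as its default shows):

```python
from judgeval.judgment_client import JudgmentClient

client = JudgmentClient()  # validates the API key against the server on construction

# Remove a single evaluation run by project + run name...
client.delete_eval(project_name="demo-project", eval_run_name="run-2025-01-01")

# ...or every evaluation stored under a project.
client.delete_project_evals(project_name="demo-project")
```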
judgeval/run_evaluation.py
CHANGED
@@ -97,6 +97,13 @@ def merge_results(api_results: List[ScoringResult], local_results: List[ScoringR
             raise ValueError("The API and local results are not aligned.")
         if api_result.retrieval_context != local_result.retrieval_context:
             raise ValueError("The API and local results are not aligned.")
+        if api_result.additional_metadata != local_result.additional_metadata:
+            raise ValueError("The API and local results are not aligned.")
+        if api_result.tools_called != local_result.tools_called:
+            raise ValueError("The API and local results are not aligned.")
+        if api_result.expected_tools != local_result.expected_tools:
+            raise ValueError("The API and local results are not aligned.")
+
 
         # Merge ScorerData from the API and local scorers together
         api_scorer_data = api_result.scorers_data
@@ -254,6 +261,12 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False) -> List[Scor
             debug(f"Context: {example.context}")
         if example.retrieval_context:
             debug(f"Retrieval context: {example.retrieval_context}")
+        if example.additional_metadata:
+            debug(f"Additional metadata: {example.additional_metadata}")
+        if example.tools_called:
+            debug(f"Tools called: {example.tools_called}")
+        if example.expected_tools:
+            debug(f"Expected tools: {example.expected_tools}")
 
     debug(f"Starting evaluation run with {len(evaluation_run.examples)} examples")
 
@@ -379,6 +392,9 @@ def assert_test(scoring_results: List[ScoringResult]) -> None:
                 'expected_output': result.expected_output,
                 'context': result.context,
                 'retrieval_context': result.retrieval_context,
+                'additional_metadata': result.additional_metadata,
+                'tools_called': result.tools_called,
+                'expected_tools': result.expected_tools,
                 'eval_run_name': result.eval_run_name,
                 'failed_scorers': []
             }
@@ -397,6 +413,9 @@ def assert_test(scoring_results: List[ScoringResult]) -> None:
             error_msg += f"Expected Output: {fail_case['expected_output']}\n"
             error_msg += f"Context: {fail_case['context']}\n"
             error_msg += f"Retrieval Context: {fail_case['retrieval_context']}\n"
+            error_msg += f"Additional Metadata: {fail_case['additional_metadata']}\n"
+            error_msg += f"Tools Called: {fail_case['tools_called']}\n"
+            error_msg += f"Expected Tools: {fail_case['expected_tools']}\n"
             error_msg += f"Eval Run Name: {fail_case['eval_run_name']}\n"
 
             for fail_scorer in fail_case['failed_scorers']:
judgeval/scorers/judgeval_scorer.py
CHANGED
@@ -1,5 +1,5 @@
 """
-
+Judgeval Scorer class
 
 Enables client to create custom scorers that do not fall under any of the ready-made Judgment scorers.
 To create a custom scorer, extend this class and implement the `score_example`, `a_score_example`, and `success_check` methods.
@@ -57,12 +57,12 @@ class JudgevalScorer:
         verbose_logs: Optional[str] = None,
         additional_metadata: Optional[Dict] = None
     ):
-        debug(f"Initializing
+        debug(f"Initializing JudgevalScorer with score_type={score_type}, threshold={threshold}")
         if not 0 <= threshold <= 1:
             raise ValueError("Threshold must be between 0 and 1")
         if strict_mode:
             warning("Strict mode enabled - scoring will be more rigorous")
-        info(f"
+        info(f"JudgevalScorer initialized with evaluation_model: {evaluation_model}")
         self.score_type = score_type
         self.threshold = threshold
         self.score = score
@@ -81,7 +81,7 @@ class JudgevalScorer:
 
     def _add_model(self, model: Optional[Union[str, List[str], JudgevalJudge]] = None):
         """
-        Adds the evaluation model to the
+        Adds the evaluation model to the JudgevalScorer instance
 
         This method is used at eval time
         """
@@ -116,10 +116,10 @@ class JudgevalScorer:
         raise NotImplementedError("You must implement the `passes` method in your custom scorer")
 
     def __str__(self):
-        debug("Converting
+        debug("Converting JudgevalScorer instance to string representation")
         if self.error:
-            warning(f"
-            info(f"
+            warning(f"JudgevalScorer contains error: {self.error}")
+        info(f"JudgevalScorer status - success: {self.success}, score: {self.score}")
         attributes = {
             "score_type": self.score_type,
             "threshold": self.threshold,
@@ -137,4 +137,4 @@ class JudgevalScorer:
             "verbose_logs": self.verbose_logs,
             "additional_metadata": self.additional_metadata,
         }
-        return f"
+        return f"JudgevalScorer({attributes})"
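Since the restored docstring points at custom scorers, here is a minimal sketch of one (not part of the diff; the constructor arguments and method signatures are assumptions based on the `__init__` and docstring shown above):

```python
from judgeval.data import Example
from judgeval.scorers.judgeval_scorer import JudgevalScorer


class LengthScorer(JudgevalScorer):
    """Passes when the model output stays under a character budget."""

    def __init__(self, threshold: float = 0.5, max_chars: int = 500):
        # score_type and threshold are forwarded to JudgevalScorer as shown in its __init__ above.
        super().__init__(score_type="Length Check", threshold=threshold)
        self.max_chars = max_chars

    def score_example(self, example: Example) -> float:
        # Binary score: 1.0 when the output fits the budget, 0.0 otherwise.
        self.score = 1.0 if len(example.actual_output or "") <= self.max_chars else 0.0
        self.success = self.score >= self.threshold
        return self.score

    async def a_score_example(self, example: Example) -> float:
        return self.score_example(example)

    def success_check(self) -> bool:
        return bool(self.success)
```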
judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/faithfulness_scorer.py
CHANGED
@@ -2,7 +2,7 @@
 Code for the local implementation of the Faithfulness metric.
 """
 from typing import List, Optional, Union
-
+from pprint import pprint
 from judgeval.constants import APIScorer
 from judgeval.data import (
     Example,
@@ -114,11 +114,13 @@ class FaithfulnessScorer(JudgevalScorer):
         ):
             self.claims = await self._a_generate_claims(example.actual_output)
 
+
             if self.additional_metadata is None:
                 self.additional_metadata = {}
             self.additional_metadata["claims"] = self.claims
 
             self.verdicts = await self._a_generate_verdicts(example.retrieval_context)
+
             self.additional_metadata["verdicts"] = [v.model_dump() for v in self.verdicts]  # Add verdicts generated to metadata
 
             self.score = self._calculate_score()
judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/prompts.py
CHANGED
@@ -129,10 +129,13 @@ JSON:
 def create_verdicts(claims, retrieval_context):
     return f"""==== TASK INSTRUCTIONS ====
 You will be provided with a list of claims from an LLM's output text, accompanied by the retrieval documents that the LLM used to generate the output.
-
-
+I'm pretty sure that many of the claims are factually contradictory to the retrieval context, but I want you to double check that I'm right.
+For each claim, choose one of ("yes", "no", or "idk") to represent whether the claim is correct based on the retrieval context.
+YOU SHOULD be very scrutinous--if any part of the claim is contradicted by the retrieval context, you should choose "no". Think really hard about finding the contradictions, since they can be subtle!
+
+Choose 'no' if the retrieval context CONTRADICTS the claims. YOU SHOULD NEVER USE YOUR PRIOR KNOWLEDGE IN YOUR JUDGMENT.
 Claims made using vague, suggestive, or speculative language such as 'may have', 'possibility due to', do NOT count as a contradiction.
-Claims that are
+Claims that are fuzzy based on lack of information MUST BE ANSWERED with 'idk'.
 
 ==== FORMATTING YOUR ANSWER ====
 Please return your answer in JSON format, with the 'verdicts' key as a list of JSON objects. Each JSON object should have 2 fields: 'verdict' and 'reason'.
judgeval/scorers/prompt_scorer.py
CHANGED
@@ -72,7 +72,7 @@ class PromptScorer(JudgevalScorer, BaseModel):
             strict_mode=strict_mode,
             verbose_mode=verbose_mode,
         )
-        # Then initialize
+        # Then initialize JudgevalScorer
         JudgevalScorer.__init__(
             self,
             score_type=name,
@@ -309,7 +309,7 @@ class ClassifierScorer(PromptScorer):
             strict_mode=strict_mode,
             verbose_mode=verbose_mode,
         )
-        # Then initialize
+        # Then initialize JudgevalScorer
         JudgevalScorer.__init__(
             self,
             score_type=name,
judgeval/scorers/score.py
CHANGED
@@ -1,5 +1,5 @@
 """
-Infrastructure for executing evaluations of `Example`s using one or more `
+Infrastructure for executing evaluations of `Example`s using one or more `JudgevalScorer`s.
 """
 
 
@@ -30,15 +30,15 @@ async def safe_a_score_example(
 ):
     """
     Scoring task function when not using a progress indicator!
-    "Safely" scores an `Example` using a `
+    "Safely" scores an `Example` using a `JudgevalScorer` by gracefully handling any exceptions that may occur.
 
     Args:
-        scorer (
+        scorer (JudgevalScorer): The `JudgevalScorer` to use for scoring the example.
         example (Example): The `Example` to be scored.
 
         ignore_errors (bool): Whether to ignore errors during the evaluation.
                             If set to false, any error will be raised and stop the evaluation.
-                            If set to true, the error will be stored in the `error` attribute of the `
+                            If set to true, the error will be stored in the `error` attribute of the `JudgevalScorer` and the `success` attribute will be set to False.
 
         skip_on_missing_params (bool): Whether to skip the test case if required parameters are missing.
     """
@@ -102,12 +102,12 @@ async def score_task(
     skip_on_missing_params: bool = True,
 ):
     """
-    Task function for asynchronously measuring a given example using a
+    Task function for asynchronously measuring a given example using a JudgevalScorer.
 
     Args:
         task_id (int): The ID of the task being measured.
         progress (Progress): An instance of the Progress class to track task progress.
-        scorer (
+        scorer (JudgevalScorer): An instance of the JudgevalScorer class used to score the example.
         example (Example): The example to be scored.
         ignore_errors (bool, optional): Whether to ignore errors during scoring. Defaults to True.
         skip_on_missing_params (bool, optional): Whether to skip scoring if there are missing parameters. Defaults to True.
@@ -189,10 +189,10 @@ async def score_with_indicator(
     show_indicator: bool,
 ):
     """
-    Scores an example using a list of
+    Scores an example using a list of JudgevalScorers, optionally displaying a progress indicator.
 
     Args:
-        scorers (List[
+        scorers (List[JudgevalScorer]): A list of JudgevalScorer objects to evaluate the example.
         example (Example): The example to be scored.
         ignore_errors (bool): If True, errors during scoring will be ignored.
         skip_on_missing_params (bool): If True, scoring will be skipped if required parameters are missing.
@@ -253,8 +253,8 @@ async def a_execute_scoring(
     _use_bar_indicator: bool = True,
 ) -> List[ScoringResult]:
     """
-    Executes evaluations of `Example`s asynchronously using one or more `
-    Each `Example` will be evaluated by all of the `
+    Executes evaluations of `Example`s asynchronously using one or more `JudgevalScorer`s.
+    Each `Example` will be evaluated by all of the `JudgevalScorer`s in the `scorers` list.
 
     Args:
         examples (List[Example]): A list of `Example` objects to be evaluated.
@@ -379,7 +379,7 @@ async def a_eval_examples_helper(
     Evaluate a single example asynchronously using a list of scorers.
 
     Args:
-        scorers (List[
+        scorers (List[JudgevalScorer]): List of JudgevalScorer objects to evaluate the example.
         example (Example): The example to be evaluated.
         scoring_results (List[ScoringResult]): List to store the scoring results.
         score_index (int): Index at which the result should be stored in scoring_results.
|