judgeval 0.0.26__py3-none-any.whl → 0.0.28__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/common/tracer.py +515 -193
- judgeval/constants.py +4 -2
- judgeval/data/__init__.py +0 -3
- judgeval/data/{api_example.py → custom_api_example.py} +12 -19
- judgeval/data/datasets/eval_dataset_client.py +59 -20
- judgeval/data/result.py +34 -56
- judgeval/evaluation_run.py +1 -0
- judgeval/judgment_client.py +47 -15
- judgeval/run_evaluation.py +20 -36
- judgeval/scorers/score.py +9 -11
- {judgeval-0.0.26.dist-info → judgeval-0.0.28.dist-info}/METADATA +1 -1
- {judgeval-0.0.26.dist-info → judgeval-0.0.28.dist-info}/RECORD +14 -14
- {judgeval-0.0.26.dist-info → judgeval-0.0.28.dist-info}/WHEEL +0 -0
- {judgeval-0.0.26.dist-info → judgeval-0.0.28.dist-info}/licenses/LICENSE.md +0 -0
judgeval/constants.py
CHANGED
@@ -41,14 +41,16 @@ ROOT_API = os.getenv("JUDGMENT_API_URL", "https://api.judgmentlabs.ai")
 JUDGMENT_EVAL_API_URL = f"{ROOT_API}/evaluate/"
 JUDGMENT_DATASETS_PUSH_API_URL = f"{ROOT_API}/datasets/push/"
 JUDGMENT_DATASETS_PULL_API_URL = f"{ROOT_API}/datasets/pull/"
+JUDGMENT_DATASETS_DELETE_API_URL = f"{ROOT_API}/datasets/delete/"
 JUDGMENT_DATASETS_EXPORT_JSONL_API_URL = f"{ROOT_API}/datasets/export_jsonl/"
-
-
+JUDGMENT_DATASETS_PROJECT_STATS_API_URL = f"{ROOT_API}/datasets/fetch_stats_by_project/"
+JUDGMENT_DATASETS_INSERT_API_URL = f"{ROOT_API}/datasets/insert_examples/"
 JUDGMENT_EVAL_LOG_API_URL = f"{ROOT_API}/log_eval_results/"
 JUDGMENT_EVAL_FETCH_API_URL = f"{ROOT_API}/fetch_eval_results/"
 JUDGMENT_EVAL_DELETE_API_URL = f"{ROOT_API}/delete_eval_results_by_project_and_run_names/"
 JUDGMENT_EVAL_DELETE_PROJECT_API_URL = f"{ROOT_API}/delete_eval_results_by_project/"
 JUDGMENT_PROJECT_DELETE_API_URL = f"{ROOT_API}/projects/delete/"
+JUDGMENT_PROJECT_CREATE_API_URL = f"{ROOT_API}/projects/add/"
 JUDGMENT_TRACES_FETCH_API_URL = f"{ROOT_API}/traces/fetch/"
 JUDGMENT_TRACES_SAVE_API_URL = f"{ROOT_API}/traces/save/"
 JUDGMENT_TRACES_DELETE_API_URL = f"{ROOT_API}/traces/delete/"
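The new dataset and project endpoints are plain f-string constants derived from ROOT_API, so pointing JUDGMENT_API_URL at a different host re-targets every route at once. A minimal standalone sketch of that composition, using only the constant names visible in the hunk above (the printed value assumes the environment variable is unset):

import os

# Same default as in constants.py above; override JUDGMENT_API_URL to target another backend.
ROOT_API = os.getenv("JUDGMENT_API_URL", "https://api.judgmentlabs.ai")
JUDGMENT_DATASETS_DELETE_API_URL = f"{ROOT_API}/datasets/delete/"
JUDGMENT_PROJECT_CREATE_API_URL = f"{ROOT_API}/projects/add/"

print(JUDGMENT_DATASETS_DELETE_API_URL)  # https://api.judgmentlabs.ai/datasets/delete/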
judgeval/data/__init__.py
CHANGED
@@ -1,13 +1,10 @@
 from judgeval.data.example import Example, ExampleParams
-from judgeval.data.api_example import ProcessExample, create_process_example
 from judgeval.data.scorer_data import ScorerData, create_scorer_data
 from judgeval.data.result import ScoringResult, generate_scoring_result
 
 __all__ = [
     "Example",
     "ExampleParams",
-    "ProcessExample",
-    "create_process_example",
     "ScorerData",
     "create_scorer_data",
     "ScoringResult",
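Since ProcessExample and create_process_example are no longer re-exported from judgeval.data, imports from the package root break on 0.0.28. A hedged migration sketch, assuming the class remains importable from the renamed module shown in the next file:

# judgeval 0.0.26 style -- removed from the package root in 0.0.28:
# from judgeval.data import ProcessExample, create_process_example

# judgeval 0.0.28 style -- assumption: import directly from the renamed module.
from judgeval.data.custom_api_example import ProcessExample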
judgeval/data/{api_example.py → custom_api_example.py}
RENAMED
@@ -2,6 +2,7 @@ from typing import List, Optional, Dict, Any, Union
 from pydantic import BaseModel, ConfigDict, model_validator
 
 from judgeval.data.example import Example
+from judgeval.data.custom_example import CustomExample
 from judgeval.data.scorer_data import ScorerData
 from judgeval.common.logger import debug, error
 
@@ -12,13 +13,13 @@ class ProcessExample(BaseModel):
     internal operations and keeping track of the evaluation process.
     """
     name: str
-    input: Optional[str] = None
-    actual_output: Optional[Union[str, List[str]]] = None
-    expected_output: Optional[Union[str, List[str]]] = None
-    context: Optional[list] = None
-    retrieval_context: Optional[list] = None
-    tools_called: Optional[list] = None
-    expected_tools: Optional[list] = None
+    # input: Optional[str] = None
+    # actual_output: Optional[Union[str, List[str]]] = None
+    # expected_output: Optional[Union[str, List[str]]] = None
+    # context: Optional[list] = None
+    # retrieval_context: Optional[list] = None
+    # tools_called: Optional[list] = None
+    # expected_tools: Optional[list] = None
 
     # make these optional, not all test cases in a conversation will be evaluated
     success: Optional[bool] = None
@@ -57,10 +58,10 @@ class ProcessExample(BaseModel):
 
     def update_run_duration(self, run_duration: float):
         self.run_duration = run_duration
-
 
-
-
+
+def create_process_custom_example(
+    example: CustomExample,
 ) -> ProcessExample:
     """
     When an LLM Test Case is executed, we track its progress using an ProcessExample.
@@ -79,13 +80,6 @@ def create_process_example(
     debug(f"Creating ProcessExample for: {name}")
     process_ex = ProcessExample(
         name=name,
-        input=example.input,
-        actual_output=example.actual_output,
-        expected_output=example.expected_output,
-        context=example.context,
-        retrieval_context=example.retrieval_context,
-        tools_called=example.tools_called,
-        expected_tools=example.expected_tools,
         success=success,
         scorers_data=scorers_data,
         run_duration=None,
@@ -94,5 +88,4 @@ def create_process_example(
         additional_metadata=example.additional_metadata,
         trace_id=example.trace_id
     )
-    return process_ex
-
+    return process_ex
judgeval/data/datasets/eval_dataset_client.py
CHANGED
@@ -7,8 +7,9 @@ from judgeval.common.logger import debug, error, warning, info
 from judgeval.constants import (
     JUDGMENT_DATASETS_PUSH_API_URL,
     JUDGMENT_DATASETS_PULL_API_URL,
-
-
+    JUDGMENT_DATASETS_PROJECT_STATS_API_URL,
+    JUDGMENT_DATASETS_DELETE_API_URL,
+    JUDGMENT_DATASETS_INSERT_API_URL,
     JUDGMENT_DATASETS_EXPORT_JSONL_API_URL
 )
 from judgeval.data import Example
@@ -25,7 +26,7 @@ class EvalDatasetClient:
     def create_dataset(self) -> EvalDataset:
         return EvalDataset(judgment_api_key=self.judgment_api_key)
 
-    def push(self, dataset: EvalDataset, alias: str, overwrite: Optional[bool] = False) -> bool:
+    def push(self, dataset: EvalDataset, alias: str, project_name: str, overwrite: Optional[bool] = False) -> bool:
         debug(f"Pushing dataset with alias '{alias}' (overwrite={overwrite})")
         if overwrite:
             warning(f"Overwrite enabled for alias '{alias}'")
@@ -53,7 +54,8 @@ class EvalDatasetClient:
                 total=100,
             )
             content = {
-                "
+                "dataset_alias": alias,
+                "project_name": project_name,
                 "examples": [e.to_dict() for e in dataset.examples],
                 "overwrite": overwrite,
             }
@@ -88,7 +90,7 @@ class EvalDatasetClient:
             )
             return True
 
-    def pull(self, alias: str) -> EvalDataset:
+    def pull(self, alias: str, project_name: str) -> EvalDataset:
         debug(f"Pulling dataset with alias '{alias}'")
         """
         Pulls the dataset from Judgment platform
@@ -96,7 +98,7 @@ class EvalDatasetClient:
         Mock request:
         {
             "alias": alias,
-            "
+            "project_name": project_name
         }
         ==>
         {
@@ -118,7 +120,8 @@ class EvalDatasetClient:
                 total=100,
             )
             request_body = {
-                "
+                "dataset_alias": alias,
+                "project_name": project_name
             }
 
             try:
@@ -139,24 +142,58 @@ class EvalDatasetClient:
 
             info(f"Successfully pulled dataset with alias '{alias}'")
             payload = response.json()
+
             dataset.examples = [Example(**e) for e in payload.get("examples", [])]
-            dataset._alias = payload.get("
-            dataset._id = payload.get("
+            dataset._alias = payload.get("alias")
+            dataset._id = payload.get("id")
             progress.update(
                 task_id,
                 description=f"{progress.tasks[task_id].description} [rgb(25,227,160)]Done!)",
             )
 
             return dataset
+
+    def delete(self, alias: str, project_name: str) -> bool:
+        with Progress(
+            SpinnerColumn(style="rgb(106,0,255)"),
+            TextColumn("[progress.description]{task.description}"),
+            transient=False,
+        ) as progress:
+            task_id = progress.add_task(
+                f"Deleting [rgb(106,0,255)]'{alias}'[/rgb(106,0,255)] from Judgment...",
+                total=100,
+            )
+            request_body = {
+                "dataset_alias": alias,
+                "project_name": project_name
+            }
 
-
-
+            try:
+                response = requests.post(
+                    JUDGMENT_DATASETS_DELETE_API_URL,
+                    json=request_body,
+                    headers={
+                        "Content-Type": "application/json",
+                        "Authorization": f"Bearer {self.judgment_api_key}",
+                        "X-Organization-Id": self.organization_id
+                    },
+                    verify=True
+                )
+                response.raise_for_status()
+            except requests.exceptions.RequestException as e:
+                error(f"Error deleting dataset: {str(e)}")
+                raise
+
+            return True
+
+    def pull_project_dataset_stats(self, project_name: str) -> dict:
+        debug(f"Pulling project datasets stats for project_name: {project_name}'")
         """
-        Pulls the
+        Pulls the project datasets stats from Judgment platform
 
         Mock request:
         {
-            "
+            "project_name": project_name
         }
         ==>
         {
@@ -177,11 +214,12 @@ class EvalDatasetClient:
                 total=100,
             )
             request_body = {
+                "project_name": project_name
             }
 
             try:
                 response = requests.post(
-
+                    JUDGMENT_DATASETS_PROJECT_STATS_API_URL,
                     json=request_body,
                     headers={
                         "Content-Type": "application/json",
@@ -205,7 +243,7 @@ class EvalDatasetClient:
 
             return payload
 
-    def
+    def insert_dataset(self, alias: str, examples: List[Example], project_name: str) -> bool:
         """
         Edits the dataset on Judgment platform by adding new examples
 
@@ -213,7 +251,7 @@ class EvalDatasetClient:
         {
             "alias": alias,
             "examples": [...],
-            "
+            "project_name": project_name
         }
         """
         with Progress(
@@ -227,13 +265,14 @@ class EvalDatasetClient:
             )
 
             content = {
-                "
+                "dataset_alias": alias,
                 "examples": [e.to_dict() for e in examples],
+                "project_name": project_name
             }
 
             try:
                 response = requests.post(
-
+                    JUDGMENT_DATASETS_INSERT_API_URL,
                     json=content,
                     headers={
                         "Content-Type": "application/json",
@@ -250,7 +289,7 @@ class EvalDatasetClient:
             info(f"Successfully edited dataset '{alias}'")
             return True
 
-    def export_jsonl(self, alias: str) -> requests.Response:
+    def export_jsonl(self, alias: str, project_name: str) -> requests.Response:
         """Export dataset in JSONL format from Judgment platform"""
         debug(f"Exporting dataset with alias '{alias}' as JSONL")
         with Progress(
@@ -265,7 +304,7 @@ class EvalDatasetClient:
             try:
                 response = requests.post(
                     JUDGMENT_DATASETS_EXPORT_JSONL_API_URL,
-                    json={"alias":
+                    json={"dataset_alias": alias, "project_name": project_name},
                     headers={
                         "Content-Type": "application/json",
                         "Authorization": f"Bearer {self.judgment_api_key}",
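Every dataset operation on EvalDatasetClient is now project-scoped: push, pull, delete, insert_dataset, export_jsonl, and the new pull_project_dataset_stats all take a project_name and send it (alongside dataset_alias) to the matching endpoint. A hedged usage sketch against the signatures shown above; the EvalDatasetClient constructor arguments and the Example field values are assumptions for illustration, not taken from this diff:

from judgeval.data import Example
from judgeval.data.datasets.eval_dataset_client import EvalDatasetClient

# Assumption: the client takes an API key and organization id, matching the
# self.judgment_api_key / self.organization_id attributes used in the requests above.
client = EvalDatasetClient(judgment_api_key="...", organization_id="...")

dataset = client.create_dataset()
dataset.examples = [Example(input="What is 2+2?", actual_output="4")]

# Every call below now carries the project scope.
client.push(dataset, alias="math-qa", project_name="demo-project", overwrite=False)
pulled = client.pull("math-qa", project_name="demo-project")
stats = client.pull_project_dataset_stats("demo-project")
client.insert_dataset("math-qa", [Example(input="3+3?", actual_output="6")], "demo-project")
client.delete("math-qa", project_name="demo-project")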
judgeval/data/result.py
CHANGED
@@ -1,10 +1,11 @@
 from dataclasses import dataclass
 from typing import List, Union, Optional, Dict, Any, Union
+from judgeval.common.logger import debug, error
+from pydantic import BaseModel
+from judgeval.data import ScorerData, Example
 
-from judgeval.data import ScorerData, ProcessExample
 
-
-class ScoringResult:
+class ScoringResult(BaseModel):
     """
     A ScoringResult contains the output of one or more scorers applied to a single example.
     Ie: One input, one actual_output, one expected_output, etc..., and 1+ scorer (Faithfulness, Hallucination, Summarization, etc...)
@@ -13,69 +14,44 @@ class ScoringResult:
         success (bool): Whether the evaluation was successful.
             This means that all scorers applied to this example returned a success.
         scorer_data (List[ScorerData]): The scorers data for the evaluated example
-
-        actual_output (Optional[str]): The actual output of the example
-        expected_output (Optional[str]): The expected output of the example
-        context (Optional[List[str]]): The context of the example
-        retrieval_context (Optional[List[str]]): The retrieval context of the example
-        additional_metadata (Optional[Dict[str, Any]]): The additional metadata of the example
-        tools_called (Optional[List[str]]): The tools called by the example
-        expected_tools (Optional[List[str]]): The expected tools of the example
-        trace_id (Optional[str]): The trace id of the example
+        data_object (Optional[Example]): The original example object that was used to create the ScoringResult, can be Example, CustomExample (future), WorkflowRun (future)
 
     """
     # Fields for scoring outputs
     success: bool # used for unit testing
    scorers_data: Union[List[ScorerData], None]
+    name: Optional[str] = None
 
-    #
-
-    actual_output: Optional[Union[str, List[str]]] = None
-    expected_output: Optional[Union[str, List[str]]] = None
-    context: Optional[List[str]] = None
-    retrieval_context: Optional[List[str]] = None
-    additional_metadata: Optional[Dict[str, Any]] = None
-    tools_called: Optional[List[str]] = None
-    expected_tools: Optional[List[str]] = None
+    # The original example object that was used to create the ScoringResult
+    data_object: Optional[Example] = None #can be Example, CustomExample (future), WorkflowRun (future)
     trace_id: Optional[str] = None
 
-
-
+    # Additional fields for internal use
+    run_duration: Optional[float] = None
+    evaluation_cost: Optional[float] = None
 
     def to_dict(self) -> dict:
         """Convert the ScoringResult instance to a dictionary, properly serializing scorer_data."""
         return {
             "success": self.success,
             "scorers_data": [scorer_data.to_dict() for scorer_data in self.scorers_data] if self.scorers_data else None,
-            "
-            "actual_output": self.actual_output,
-            "expected_output": self.expected_output,
-            "context": self.context,
-            "retrieval_context": self.retrieval_context,
-            "additional_metadata": self.additional_metadata,
-            "tools_called": self.tools_called,
-            "expected_tools": self.expected_tools,
-            "trace_id": self.trace_id,
-            "example_id": self.example_id
+            "data_object": self.data_object.to_dict() if self.data_object else None,
         }
-
+
     def __str__(self) -> str:
         return f"ScoringResult(\
             success={self.success}, \
             scorer_data={self.scorers_data}, \
-
-
-
-            context={self.context}, \
-            retrieval_context={self.retrieval_context}, \
-            additional_metadata={self.additional_metadata}, \
-            tools_called={self.tools_called}, \
-            expected_tools={self.expected_tools}, \
-            trace_id={self.trace_id})"
+            data_object={self.data_object}, \
+            run_duration={self.run_duration}, \
+            evaluation_cost={self.evaluation_cost})"
 
 
 def generate_scoring_result(
-
+    example: Example,
+    scorers_data: List[ScorerData],
+    run_duration: float,
+    success: bool,
 ) -> ScoringResult:
     """
     Creates a final ScoringResult object for an evaluation run based on the results from a completed LLMApiTestCase.
@@ -83,16 +59,18 @@ def generate_scoring_result(
     When an LLMTestCase is executed, it turns into an LLMApiTestCase and the progress of the evaluation run is tracked.
     At the end of the evaluation run, we create a TestResult object out of the completed LLMApiTestCase.
     """
-
-
-
-
-
-
-
-
-
-
-
-
+    if example.name is not None:
+        name = example.name
+    else:
+        name = "Test Case Placeholder"
+        debug(f"No name provided for example, using default name: {name}")
+    debug(f"Creating ScoringResult for: {name}")
+    scoring_result = ScoringResult(
+        name=name,
+        data_object=example,
+        success=success,
+        scorers_data=scorers_data,
+        run_duration=run_duration,
+        evaluation_cost=None,
     )
+    return scoring_result
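ScoringResult is now a pydantic BaseModel that keeps the evaluated example nested under data_object instead of flattening its fields, and generate_scoring_result assembles one from an Example plus scorer outputs. A small sketch of the new shape; the Example keyword arguments are the field names referenced elsewhere in this diff, and the empty scorers_data list is purely illustrative:

from judgeval.data import Example
from judgeval.data.result import generate_scoring_result

example = Example(input="What is 2+2?", actual_output="4", name="arithmetic-check")

# Real runs pass ScorerData objects; an empty list keeps the sketch self-contained.
result = generate_scoring_result(example, scorers_data=[], run_duration=0.42, success=True)

print(result.name)                       # "arithmetic-check"
print(result.data_object.actual_output)  # "4" -- example fields now live under data_object
print(result.to_dict()["data_object"])   # serialized via Example.to_dict()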
judgeval/evaluation_run.py
CHANGED
@@ -34,6 +34,7 @@ class EvaluationRun(BaseModel):
     model: Union[str, List[str], JudgevalJudge]
     aggregator: Optional[str] = None
     metadata: Optional[Dict[str, Any]] = None
+    trace_span_id: Optional[str] = None
     # API Key will be "" until user calls client.run_eval(), then API Key will be set
     judgment_api_key: Optional[str] = ""
     override: Optional[bool] = False
judgeval/judgment_client.py
CHANGED
@@ -27,7 +27,8 @@ from judgeval.constants import (
     JUDGMENT_EVAL_FETCH_API_URL,
     JUDGMENT_EVAL_DELETE_API_URL,
     JUDGMENT_EVAL_DELETE_PROJECT_API_URL,
-    JUDGMENT_PROJECT_DELETE_API_URL
+    JUDGMENT_PROJECT_DELETE_API_URL,
+    JUDGMENT_PROJECT_CREATE_API_URL
 )
 from judgeval.common.exceptions import JudgmentAPIError
 from pydantic import BaseModel
@@ -43,8 +44,16 @@ class DeleteEvalRunRequestBody(BaseModel):
     project_name: str
     judgment_api_key: str
 
+class SingletonMeta(type):
+    _instances = {}
 
-
+    def __call__(cls, *args, **kwargs):
+        if cls not in cls._instances:
+            instance = super().__call__(*args, **kwargs)
+            cls._instances[cls] = instance
+        return cls._instances[cls]
+
+class JudgmentClient(metaclass=SingletonMeta):
     def __init__(self, judgment_api_key: str = os.getenv("JUDGMENT_API_KEY"), organization_id: str = os.getenv("JUDGMENT_ORG_ID")):
         self.judgment_api_key = judgment_api_key
         self.organization_id = organization_id
@@ -56,8 +65,8 @@ class JudgmentClient:
             # May be bad to output their invalid API key...
             raise JudgmentAPIError(f"Issue with passed in Judgment API key: {response}")
         else:
-            print(f"Successfully initialized JudgmentClient
-
+            print(f"Successfully initialized JudgmentClient!")
+
     def a_run_evaluation(
         self,
         examples: List[Example],
@@ -267,7 +276,7 @@ class JudgmentClient:
     def create_dataset(self) -> EvalDataset:
         return self.eval_dataset_client.create_dataset()
 
-    def push_dataset(self, alias: str, dataset: EvalDataset, overwrite: Optional[bool] = False) -> bool:
+    def push_dataset(self, alias: str, dataset: EvalDataset, project_name: str, overwrite: Optional[bool] = False) -> bool:
         """
         Uploads an `EvalDataset` to the Judgment platform for storage.
 
@@ -281,9 +290,9 @@ class JudgmentClient:
         """
         # Set judgment_api_key just in case it was not set
         dataset.judgment_api_key = self.judgment_api_key
-        return self.eval_dataset_client.push(dataset, alias, overwrite)
+        return self.eval_dataset_client.push(dataset, alias, project_name, overwrite)
 
-    def pull_dataset(self, alias: str) -> EvalDataset:
+    def pull_dataset(self, alias: str, project_name: str) -> EvalDataset:
         """
         Retrieves a saved `EvalDataset` from the Judgment platform.
 
@@ -293,25 +302,31 @@ class JudgmentClient:
         Returns:
             EvalDataset: The retrieved dataset
         """
-        return self.eval_dataset_client.pull(alias)
+        return self.eval_dataset_client.pull(alias, project_name)
+
+    def delete_dataset(self, alias: str, project_name: str) -> bool:
+        """
+        Deletes a saved `EvalDataset` from the Judgment platform.
+        """
+        return self.eval_dataset_client.delete(alias, project_name)
 
-    def
+    def pull_project_dataset_stats(self, project_name: str) -> dict:
         """
-        Retrieves all dataset stats from the Judgment platform for the
+        Retrieves all dataset stats from the Judgment platform for the project.
 
         Args:
-
+            project_name (str): The name of the project to retrieve
 
         Returns:
-
+            dict: The retrieved dataset stats
         """
-        return self.eval_dataset_client.
+        return self.eval_dataset_client.pull_project_dataset_stats(project_name)
 
-    def
+    def insert_dataset(self, alias: str, examples: List[Example], project_name: str) -> bool:
         """
         Edits the dataset on Judgment platform by adding new examples
         """
-        return self.eval_dataset_client.
+        return self.eval_dataset_client.insert_dataset(alias, examples, project_name)
 
     # Maybe add option where you can pass in the EvaluationRun object and it will pull the eval results from the backend
     def pull_eval(self, project_name: str, eval_run_name: str) -> List[Dict[str, Union[str, List[ScoringResult]]]]:
@@ -402,6 +417,23 @@ class JudgmentClient:
             raise ValueError(f"Error deleting eval results: {response.json()}")
         return response.json()
 
+    def create_project(self, project_name: str) -> bool:
+        """
+        Creates a project on the server.
+        """
+        response = requests.post(JUDGMENT_PROJECT_CREATE_API_URL,
+                                 json={
+                                     "project_name": project_name,
+                                 },
+                                 headers={
+                                     "Content-Type": "application/json",
+                                     "Authorization": f"Bearer {self.judgment_api_key}",
+                                     "X-Organization-Id": self.organization_id
+                                 })
+        if response.status_code != requests.codes.ok:
+            raise ValueError(f"Error creating project: {response.json()}")
+        return response.json()
+
     def delete_project(self, project_name: str) -> bool:
         """
         Deletes a project from the server. Which also deletes all evaluations and traces associated with the project.
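JudgmentClient is now a singleton via the SingletonMeta metaclass, so repeated constructions return the same instance, and its dataset helpers mirror the project-scoped EvalDatasetClient API. A hedged sketch of the 0.0.28 surface; it assumes JUDGMENT_API_KEY and JUDGMENT_ORG_ID are set in the environment and uses illustrative project and alias names:

from judgeval.judgment_client import JudgmentClient
from judgeval.data import Example

# Both constructions resolve to the same object because of SingletonMeta.
client_a = JudgmentClient()  # reads JUDGMENT_API_KEY / JUDGMENT_ORG_ID from the environment
client_b = JudgmentClient()
assert client_a is client_b

client_a.create_project("demo-project")

dataset = client_a.create_dataset()
dataset.examples = [Example(input="What is 2+2?", actual_output="4")]
client_a.push_dataset("math-qa", dataset, project_name="demo-project", overwrite=False)

pulled = client_a.pull_dataset("math-qa", project_name="demo-project")
stats = client_a.pull_project_dataset_stats("demo-project")
client_a.delete_dataset("math-qa", project_name="demo-project")
client_a.delete_project("demo-project")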
judgeval/run_evaluation.py
CHANGED
@@ -117,21 +117,23 @@ def merge_results(api_results: List[ScoringResult], local_results: List[ScoringR
 
     # Each ScoringResult in api and local have all the same fields besides `scorers_data`
     for api_result, local_result in zip(api_results, local_results):
-        if api_result.
+        if not (api_result.data_object and local_result.data_object):
+            raise ValueError("Data object is None in one of the results.")
+        if api_result.data_object.input != local_result.data_object.input:
             raise ValueError("The API and local results are not aligned.")
-        if api_result.actual_output != local_result.actual_output:
+        if api_result.data_object.actual_output != local_result.data_object.actual_output:
             raise ValueError("The API and local results are not aligned.")
-        if api_result.expected_output != local_result.expected_output:
+        if api_result.data_object.expected_output != local_result.data_object.expected_output:
             raise ValueError("The API and local results are not aligned.")
-        if api_result.context != local_result.context:
+        if api_result.data_object.context != local_result.data_object.context:
             raise ValueError("The API and local results are not aligned.")
-        if api_result.retrieval_context != local_result.retrieval_context:
+        if api_result.data_object.retrieval_context != local_result.data_object.retrieval_context:
             raise ValueError("The API and local results are not aligned.")
-        if api_result.additional_metadata != local_result.additional_metadata:
+        if api_result.data_object.additional_metadata != local_result.data_object.additional_metadata:
             raise ValueError("The API and local results are not aligned.")
-        if api_result.tools_called != local_result.tools_called:
+        if api_result.data_object.tools_called != local_result.data_object.tools_called:
             raise ValueError("The API and local results are not aligned.")
-        if api_result.expected_tools != local_result.expected_tools:
+        if api_result.data_object.expected_tools != local_result.data_object.expected_tools:
             raise ValueError("The API and local results are not aligned.")
 
 
@@ -422,23 +424,7 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_error
 
         # Convert the response data to `ScoringResult` objects
         debug("Processing API results")
-
-            with example_logging_context(evaluation_run.examples[idx].timestamp, evaluation_run.examples[idx].example_id):
-                for scorer in judgment_scorers:
-                    debug(f"Processing API result for example {idx} and scorer {scorer.score_type}")
-            # filter for key-value pairs that are used to initialize ScoringResult
-            # there may be some stuff in here that doesn't belong in ScoringResult
-            # TODO: come back and refactor this to have ScoringResult take in **kwargs
-            filtered_result = {k: v for k, v in result.items() if k in ScoringResult.__annotations__}
-
-            # Convert scorers_data dicts to ScorerData objects
-            if "scorers_data" in filtered_result and filtered_result["scorers_data"]:
-                filtered_result["scorers_data"] = [
-                    ScorerData(**scorer_dict)
-                    for scorer_dict in filtered_result["scorers_data"]
-                ]
-
-            api_results.append(ScoringResult(**filtered_result))
+        api_results = [ScoringResult(**result) for result in response_data["results"]]
     # Run local evals
     if local_scorers: # List[JudgevalScorer]
         # We should be removing local scorers soon
@@ -477,7 +463,7 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_error
     # judgment_api_key=evaluation_run.judgment_api_key,
     # organization_id=evaluation_run.organization_id
     # )
-
+    # print(merged_results)
     if evaluation_run.log_results:
         pretty_str = run_with_spinner("Logging Results: ", log_evaluation_results, merged_results, evaluation_run)
         rprint(pretty_str)
@@ -504,15 +490,14 @@ def assert_test(scoring_results: List[ScoringResult]) -> None:
 
         # Create a test case context with all relevant fields
         test_case = {
-            'input': result.input,
-            'actual_output': result.actual_output,
-            'expected_output': result.expected_output,
-            'context': result.context,
-            'retrieval_context': result.retrieval_context,
-            'additional_metadata': result.additional_metadata,
-            'tools_called': result.tools_called,
-            'expected_tools': result.expected_tools,
-            'eval_run_name': result.eval_run_name,
+            'input': result.data_object.input,
+            'actual_output': result.data_object.actual_output,
+            'expected_output': result.data_object.expected_output,
+            'context': result.data_object.context,
+            'retrieval_context': result.data_object.retrieval_context,
+            'additional_metadata': result.data_object.additional_metadata,
+            'tools_called': result.data_object.tools_called,
+            'expected_tools': result.data_object.expected_tools,
             'failed_scorers': []
         }
         if result.scorers_data:
@@ -533,7 +518,6 @@ def assert_test(scoring_results: List[ScoringResult]) -> None:
             error_msg += f"Additional Metadata: {fail_case['additional_metadata']}\n"
             error_msg += f"Tools Called: {fail_case['tools_called']}\n"
             error_msg += f"Expected Tools: {fail_case['expected_tools']}\n"
-            error_msg += f"Eval Run Name: {fail_case['eval_run_name']}\n"
 
             for fail_scorer in fail_case['failed_scorers']:
 
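Because ScoringResult now nests the example under data_object, run_eval can parse each API result dict in a single ScoringResult(**result) call and let pydantic validate the payload, replacing the old field-filtering loop. A hedged sketch of that parsing path; the payload below is illustrative and assumes the nested data_object dict validates into an Example:

from judgeval.data import ScoringResult

# Illustrative payload shaped like the `response_data["results"]` entries parsed above.
response_data = {
    "results": [
        {
            "success": True,
            "scorers_data": None,
            "name": "arithmetic-check",
            "data_object": {"input": "What is 2+2?", "actual_output": "4"},
        }
    ]
}

# Mirrors the new one-liner in run_eval.
api_results = [ScoringResult(**result) for result in response_data["results"]]
print(api_results[0].data_object.input)  # "What is 2+2?" (assuming pydantic coerces the dict into an Example)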