judgeval 0.0.44__py3-none-any.whl → 0.0.46__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/__init__.py +5 -4
- judgeval/clients.py +6 -6
- judgeval/common/__init__.py +7 -2
- judgeval/common/exceptions.py +2 -3
- judgeval/common/logger.py +74 -49
- judgeval/common/s3_storage.py +30 -23
- judgeval/common/tracer.py +1273 -939
- judgeval/common/utils.py +416 -244
- judgeval/constants.py +73 -61
- judgeval/data/__init__.py +1 -1
- judgeval/data/custom_example.py +3 -2
- judgeval/data/datasets/dataset.py +80 -54
- judgeval/data/datasets/eval_dataset_client.py +131 -181
- judgeval/data/example.py +67 -43
- judgeval/data/result.py +11 -9
- judgeval/data/scorer_data.py +4 -2
- judgeval/data/tool.py +25 -16
- judgeval/data/trace.py +57 -29
- judgeval/data/trace_run.py +5 -11
- judgeval/evaluation_run.py +22 -82
- judgeval/integrations/langgraph.py +546 -184
- judgeval/judges/base_judge.py +1 -2
- judgeval/judges/litellm_judge.py +33 -11
- judgeval/judges/mixture_of_judges.py +128 -78
- judgeval/judges/together_judge.py +22 -9
- judgeval/judges/utils.py +14 -5
- judgeval/judgment_client.py +259 -271
- judgeval/rules.py +169 -142
- judgeval/run_evaluation.py +462 -305
- judgeval/scorers/api_scorer.py +20 -11
- judgeval/scorers/exceptions.py +1 -0
- judgeval/scorers/judgeval_scorer.py +77 -58
- judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +46 -15
- judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +3 -2
- judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +3 -2
- judgeval/scorers/judgeval_scorers/api_scorers/classifier_scorer.py +12 -11
- judgeval/scorers/judgeval_scorers/api_scorers/comparison.py +7 -5
- judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py +3 -2
- judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py +3 -2
- judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py +5 -2
- judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py +2 -1
- judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +17 -8
- judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +3 -2
- judgeval/scorers/judgeval_scorers/api_scorers/groundedness.py +3 -2
- judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +3 -2
- judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +3 -2
- judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py +8 -9
- judgeval/scorers/judgeval_scorers/api_scorers/summarization.py +4 -4
- judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +5 -5
- judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +5 -2
- judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py +9 -10
- judgeval/scorers/prompt_scorer.py +48 -37
- judgeval/scorers/score.py +86 -53
- judgeval/scorers/utils.py +11 -7
- judgeval/tracer/__init__.py +1 -1
- judgeval/utils/alerts.py +23 -12
- judgeval/utils/{data_utils.py → file_utils.py} +5 -9
- judgeval/utils/requests.py +29 -0
- judgeval/version_check.py +5 -2
- {judgeval-0.0.44.dist-info → judgeval-0.0.46.dist-info}/METADATA +79 -135
- judgeval-0.0.46.dist-info/RECORD +69 -0
- judgeval-0.0.44.dist-info/RECORD +0 -68
- {judgeval-0.0.44.dist-info → judgeval-0.0.46.dist-info}/WHEEL +0 -0
- {judgeval-0.0.44.dist-info → judgeval-0.0.46.dist-info}/licenses/LICENSE.md +0 -0
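One notable addition in 0.0.46 is the new judgeval/utils/requests.py module (+29 lines). Its contents are not shown in this diff, but judgment_client.py below switches from a bare `import requests` to `from requests import codes` plus `from judgeval.utils.requests import requests`, which suggests a thin wrapper around the requests library with shared defaults. A minimal sketch of what such a wrapper could look like (the session, default timeout, and method set here are assumptions for illustration, not the actual module):

# Hypothetical sketch of a judgeval/utils/requests.py-style wrapper; the real
# module is not shown in this diff, only the import that consumes it.
import requests as _requests


class _RequestsWrapper:
    """Delegates HTTP calls to a shared requests.Session with a default timeout."""

    def __init__(self, timeout: float = 30.0) -> None:
        self._session = _requests.Session()
        self._timeout = timeout

    def _request(self, method: str, url: str, **kwargs):
        # Apply a default timeout unless the caller supplies one.
        kwargs.setdefault("timeout", self._timeout)
        return self._session.request(method, url, **kwargs)

    def get(self, url: str, **kwargs):
        return self._request("GET", url, **kwargs)

    def post(self, url: str, **kwargs):
        return self._request("POST", url, **kwargs)

    def delete(self, url: str, **kwargs):
        return self._request("DELETE", url, **kwargs)


# Consumed elsewhere as: from judgeval.utils.requests import requests
requests = _RequestsWrapper()

Callers keep the familiar requests.post(...) / requests.delete(...) call shape (as seen in judgment_client.py below), while status codes now come from `from requests import codes` instead of `requests.codes`.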
judgeval/judgment_client.py
CHANGED
@@ -1,60 +1,62 @@
 """
 Implements the JudgmentClient to interact with the Judgment API.
 """
+
 import os
 from uuid import uuid4
 from typing import Optional, List, Dict, Any, Union, Callable
-import requests
+from requests import codes
+from judgeval.utils.requests import requests
 import asyncio
 
 from judgeval.constants import ROOT_API
 from judgeval.data.datasets import EvalDataset, EvalDatasetClient
 from judgeval.data import (
-    ScoringResult,
+    ScoringResult,
     Example,
     CustomExample,
     Trace,
 )
 from judgeval.scorers import (
-    APIJudgmentScorer,
-    JudgevalScorer,
-    ClassifierScorer,
+    APIJudgmentScorer,
+    JudgevalScorer,
+    ClassifierScorer,
 )
 from judgeval.evaluation_run import EvaluationRun
 from judgeval.run_evaluation import (
-    run_eval,
+    run_eval,
     assert_test,
-    run_trace_eval
+    run_trace_eval,
+    safe_run_async,
 )
 from judgeval.data.trace_run import TraceRun
-from judgeval.judges import JudgevalJudge
 from judgeval.constants import (
-    JUDGMENT_EVAL_FETCH_API_URL,
-    JUDGMENT_EVAL_DELETE_API_URL,
-    JUDGMENT_EVAL_DELETE_PROJECT_API_URL,
+    JUDGMENT_EVAL_FETCH_API_URL,
     JUDGMENT_PROJECT_DELETE_API_URL,
-    JUDGMENT_PROJECT_CREATE_API_URL
+    JUDGMENT_PROJECT_CREATE_API_URL,
 )
-from judgeval.utils.data_utils import add_from_yaml
 from judgeval.common.exceptions import JudgmentAPIError
 from langchain_core.callbacks import BaseCallbackHandler
 from judgeval.common.tracer import Tracer
 from judgeval.common.utils import validate_api_key
 from pydantic import BaseModel
-from judgeval.rules import Rule
+from judgeval.run_evaluation import SpinnerWrappedTask
+
 
 class EvalRunRequestBody(BaseModel):
     eval_name: str
     project_name: str
     judgment_api_key: str
 
+
 class DeleteEvalRunRequestBody(BaseModel):
     eval_names: List[str]
     project_name: str
     judgment_api_key: str
 
+
 class SingletonMeta(type):
-    _instances = {}
+    _instances: Dict[type, "JudgmentClient"] = {}
 
     def __call__(cls, *args, **kwargs):
         if cls not in cls._instances:
@@ -62,179 +64,168 @@ class SingletonMeta(type):
             cls._instances[cls] = instance
         return cls._instances[cls]
 
+
 class JudgmentClient(metaclass=SingletonMeta):
-    def __init__(
+    def __init__(
+        self,
+        judgment_api_key: Optional[str] = os.getenv("JUDGMENT_API_KEY"),
+        organization_id: Optional[str] = os.getenv("JUDGMENT_ORG_ID"),
+    ):
         # Check if API key is None
         if judgment_api_key is None:
-            raise ValueError("JUDGMENT_API_KEY cannot be None. Please provide a valid API key or set the JUDGMENT_API_KEY environment variable.")
-
+            raise ValueError(
+                "JUDGMENT_API_KEY cannot be None. Please provide a valid API key or set the JUDGMENT_API_KEY environment variable."
+            )
+
         # Check if organization ID is None
         if organization_id is None:
-            raise ValueError("JUDGMENT_ORG_ID cannot be None. Please provide a valid organization ID or set the JUDGMENT_ORG_ID environment variable.")
-
+            raise ValueError(
+                "JUDGMENT_ORG_ID cannot be None. Please provide a valid organization ID or set the JUDGMENT_ORG_ID environment variable."
+            )
+
         self.judgment_api_key = judgment_api_key
         self.organization_id = organization_id
         self.eval_dataset_client = EvalDatasetClient(judgment_api_key, organization_id)
-
+
         # Verify API key is valid
         result, response = validate_api_key(judgment_api_key)
         if not result:
             # May be bad to output their invalid API key...
             raise JudgmentAPIError(f"Issue with passed in Judgment API key: {response}")
         else:
-            print(
+            print("Successfully initialized JudgmentClient!")
 
     def a_run_evaluation(
-        self,
+        self,
         examples: List[Example],
         scorers: List[Union[APIJudgmentScorer, JudgevalScorer]],
-        model: Optional[Union[str, List[str], JudgevalJudge]] = "gpt-4.1",
-        aggregator: Optional[str] = None,
-        metadata: Optional[Dict[str, Any]] = None,
-        log_results: bool = True,
+        model: Optional[str] = "gpt-4.1",
         project_name: str = "default_project",
         eval_run_name: str = "default_eval_run",
         override: bool = False,
         append: bool = False,
-        ignore_errors: bool = True,
-        rules: Optional[List[Rule]] = None
    ) -> List[ScoringResult]:
-        return self.run_evaluation(
-            examples=examples,
-            scorers=scorers,
-            model=model,
-            aggregator=aggregator,
-            metadata=metadata,
-            log_results=log_results,
-            project_name=project_name,
-            eval_run_name=eval_run_name,
+        result = self.run_evaluation(
+            examples=examples,
+            scorers=scorers,
+            model=model,
+            project_name=project_name,
+            eval_run_name=eval_run_name,
             override=override,
-            append=append,
-            ignore_errors=ignore_errors,
-            rules=rules
+            append=append,
+            async_execution=True,
         )
+        assert not isinstance(result, (asyncio.Task, SpinnerWrappedTask))
+        return result
 
     def run_trace_evaluation(
         self,
         scorers: List[Union[APIJudgmentScorer, JudgevalScorer]],
-        model: Optional[Union[str, List[str], JudgevalJudge]] = "gpt-4.1",
-        traces: Optional[List[Trace]] = None,
         examples: Optional[List[Example]] = None,
-        test_file: Optional[str] = None,
-        aggregator: Optional[str] = None,
+        function: Optional[Callable] = None,
+        tracer: Optional[Union[Tracer, BaseCallbackHandler]] = None,
+        traces: Optional[List[Trace]] = None,
+        tools: Optional[List[Dict[str, Any]]] = None,
         project_name: str = "default_project",
         eval_run_name: str = "default_eval_trace",
-        log_results: bool = True,
+        model: Optional[str] = "gpt-4.1",
         append: bool = False,
         override: bool = False,
-        ignore_errors: bool = True,
-        rules: Optional[List[Rule]] = None,
-        function: Optional[Callable] = None,
-        tracer: Optional[Union[Tracer, BaseCallbackHandler]] = None,
-        tools: Optional[List[Dict[str, Any]]] = None
     ) -> List[ScoringResult]:
-        try:
-
-            if test_file:
-                try:
-                    examples = add_from_yaml(test_file)
-                except FileNotFoundError:
-                    raise FileNotFoundError(f"Test file not found: {test_file}")
-
+        try:
             if examples and not function:
                 raise ValueError("Cannot pass in examples without a function")
-
+
             if traces and function:
                 raise ValueError("Cannot pass in traces and function")
-
+
             if examples and traces:
                 raise ValueError("Cannot pass in both examples and traces")
-
+
             trace_run = TraceRun(
                 project_name=project_name,
                 eval_name=eval_run_name,
                 traces=traces,
                 scorers=scorers,
                 model=model,
-                aggregator=aggregator,
-                log_results=log_results,
                 append=append,
                 judgment_api_key=self.judgment_api_key,
                 organization_id=self.organization_id,
-                tools=tools
+                tools=tools,
             )
-            return run_trace_eval(trace_run, override,
+            return run_trace_eval(trace_run, override, function, tracer, examples)
         except ValueError as e:
-            raise ValueError(
+            raise ValueError(
+                f"Please check your TraceRun object, one or more fields are invalid: \n{str(e)}"
+            )
         except Exception as e:
             raise Exception(f"An unexpected error occurred during evaluation: {str(e)}")
 
     def run_evaluation(
-        self,
+        self,
         examples: Union[List[Example], List[CustomExample]],
         scorers: List[Union[APIJudgmentScorer, JudgevalScorer]],
-        model: Optional[Union[str, List[str], JudgevalJudge]] = "gpt-4.1",
-        aggregator: Optional[str] = None,
-        metadata: Optional[Dict[str, Any]] = None,
-        log_results: bool = True,
+        model: Optional[str] = "gpt-4.1",
         project_name: str = "default_project",
         eval_run_name: str = "default_eval_run",
         override: bool = False,
         append: bool = False,
-        ignore_errors: bool = True,
         async_execution: bool = False,
-        rules: Optional[List[Rule]] = None
-    ) -> Union[List[ScoringResult], asyncio.Task]:
+    ) -> Union[List[ScoringResult], asyncio.Task | SpinnerWrappedTask]:
         """
         Executes an evaluation of `Example`s using one or more `Scorer`s
-
+
         Args:
             examples (Union[List[Example], List[CustomExample]]): The examples to evaluate
             scorers (List[Union[APIJudgmentScorer, JudgevalScorer]]): A list of scorers to use for evaluation
-            model (
-            aggregator (Optional[str]): The aggregator to use for evaluation if using Mixture of Judges
-            metadata (Optional[Dict[str, Any]]): Additional metadata to include for this evaluation run
-            log_results (bool): Whether to log the results to the Judgment API
+            model (str): The model used as a judge when using LLM as a Judge
            project_name (str): The name of the project the evaluation results belong to
            eval_run_name (str): A name for this evaluation run
            override (bool): Whether to override an existing evaluation run with the same name
-
-
-
+            append (bool): Whether to append to an existing evaluation run with the same name
+            async_execution (bool): Whether to execute the evaluation asynchronously
+
         Returns:
            List[ScoringResult]: The results of the evaluation
         """
         if override and append:
-            raise ValueError(
+            raise ValueError(
+                "Cannot set both override and append to True. Please choose one."
+            )
 
         try:
-            if rules and any(isinstance(scorer, JudgevalScorer) for scorer in scorers):
-                raise ValueError("Cannot use Judgeval scorers (only API scorers) when using rules. Please either remove rules or use only APIJudgmentScorer types.")
-
             eval = EvaluationRun(
-                log_results=log_results,
                 append=append,
                 project_name=project_name,
                 eval_name=eval_run_name,
                 examples=examples,
                 scorers=scorers,
                 model=model,
-                aggregator=aggregator,
-                metadata=metadata,
                 judgment_api_key=self.judgment_api_key,
-                organization_id=self.organization_id,
-                rules=rules
+                organization_id=self.organization_id,
+            )
+            return run_eval(
+                eval,
+                override,
+                async_execution=async_execution,
            )
-            return run_eval(eval, override, ignore_errors=ignore_errors, async_execution=async_execution)
        except ValueError as e:
-            raise ValueError(
+            raise ValueError(
+                f"Please check your EvaluationRun object, one or more fields are invalid: \n{str(e)}"
+            )
        except Exception as e:
            raise Exception(f"An unexpected error occurred during evaluation: {str(e)}")
 
    def create_dataset(self) -> EvalDataset:
        return self.eval_dataset_client.create_dataset()
 
-    def push_dataset(self, alias: str, dataset: EvalDataset, project_name: str, overwrite: Optional[bool] = False) -> bool:
+    def push_dataset(
+        self,
+        alias: str,
+        dataset: EvalDataset,
+        project_name: str,
+        overwrite: Optional[bool] = False,
+    ) -> bool:
        """
        Uploads an `EvalDataset` to the Judgment platform for storage.
 
@@ -249,13 +240,15 @@ class JudgmentClient(metaclass=SingletonMeta):
        # Set judgment_api_key just in case it was not set
        dataset.judgment_api_key = self.judgment_api_key
        return self.eval_dataset_client.push(dataset, alias, project_name, overwrite)
-
-    def append_dataset(self, alias: str, examples: List[Example], project_name: str) -> bool:
+
+    def append_dataset(
+        self, alias: str, examples: List[Example], project_name: str
+    ) -> bool:
        """
        Appends an `EvalDataset` to the Judgment platform for storage.
        """
        return self.eval_dataset_client.append_examples(alias, examples, project_name)
-
+
    def pull_dataset(self, alias: str, project_name: str) -> EvalDataset:
        """
        Retrieves a saved `EvalDataset` from the Judgment platform.
@@ -273,7 +266,7 @@ class JudgmentClient(metaclass=SingletonMeta):
        Deletes a saved `EvalDataset` from the Judgment platform.
        """
        return self.eval_dataset_client.delete(alias, project_name)
-
+
    def pull_project_dataset_stats(self, project_name: str) -> dict:
        """
        Retrieves all dataset stats from the Judgment platform for the project.
@@ -285,15 +278,11 @@ class JudgmentClient(metaclass=SingletonMeta):
            dict: The retrieved dataset stats
        """
        return self.eval_dataset_client.pull_project_dataset_stats(project_name)
-
-    def insert_dataset(self, alias: str, examples: List[Example], project_name: str) -> bool:
-        """
-        Edits the dataset on Judgment platform by adding new examples
-        """
-        return self.eval_dataset_client.insert_dataset(alias, examples, project_name)
-
+
    # Maybe add option where you can pass in the EvaluationRun object and it will pull the eval results from the backend
-    def pull_eval(self, project_name: str, eval_run_name: str) -> List[Dict[str, Union[str, List[ScoringResult]]]]:
+    def pull_eval(
+        self, project_name: str, eval_run_name: str
+    ) -> List[Dict[str, Union[str, List[ScoringResult]]]]:
        """Pull evaluation results from the server.
 
        Args:
@@ -305,109 +294,64 @@ class JudgmentClient(metaclass=SingletonMeta):
                - id (str): The evaluation run ID
                - results (List[ScoringResult]): List of scoring results
        """
-        eval_run_request_body = EvalRunRequestBody(project_name=project_name,
-                                                   eval_name=eval_run_name,
-                                                   judgment_api_key=self.judgment_api_key)
-        eval_run = requests.post(JUDGMENT_EVAL_FETCH_API_URL,
-                                 headers={
-                                     "Content-Type": "application/json",
-                                     "Authorization": f"Bearer {self.judgment_api_key}",
-                                     "X-Organization-Id": self.organization_id
-                                 },
-                                 json=eval_run_request_body.model_dump(),
-                                 verify=True)
-        if eval_run.status_code != requests.codes.ok:
+        eval_run_request_body = EvalRunRequestBody(
+            project_name=project_name,
+            eval_name=eval_run_name,
+            judgment_api_key=self.judgment_api_key,
+        )
+        eval_run = requests.post(
+            JUDGMENT_EVAL_FETCH_API_URL,
+            headers={
+                "Content-Type": "application/json",
+                "Authorization": f"Bearer {self.judgment_api_key}",
+                "X-Organization-Id": self.organization_id,
+            },
+            json=eval_run_request_body.model_dump(),
+            verify=True,
+        )
+        if eval_run.status_code != codes.ok:
            raise ValueError(f"Error fetching eval results: {eval_run.json()}")
 
        return eval_run.json()
-
-    def delete_eval(self, project_name: str, eval_run_names: List[str]) -> bool:
-        """
-        Deletes an evaluation from the server by project and run names.
 
-        Args:
-            project_name (str): Name of the project
-            eval_run_names (List[str]): List of names of the evaluation runs
-
-        Returns:
-            bool: Whether the evaluation was successfully deleted
-        """
-        if not eval_run_names:
-            raise ValueError("No evaluation run names provided")
-
-        eval_run_request_body = DeleteEvalRunRequestBody(project_name=project_name,
-                                                         eval_names=eval_run_names,
-                                                         judgment_api_key=self.judgment_api_key)
-        response = requests.delete(JUDGMENT_EVAL_DELETE_API_URL,
-                                   json=eval_run_request_body.model_dump(),
-                                   headers={
-                                       "Content-Type": "application/json",
-                                       "Authorization": f"Bearer {self.judgment_api_key}",
-                                       "X-Organization-Id": self.organization_id
-                                   })
-        if response.status_code == 404:
-            raise ValueError(f"Eval results not found: {response.json()}")
-        elif response.status_code == 500:
-            raise ValueError(f"Error deleting eval results: {response.json()}")
-        return bool(response.json())
-
-    def delete_project_evals(self, project_name: str) -> bool:
-        """
-        Deletes all evaluations from the server for a given project.
-
-        Args:
-            project_name (str): Name of the project
-
-        Returns:
-            bool: Whether the evaluations were successfully deleted
-        """
-        response = requests.delete(JUDGMENT_EVAL_DELETE_PROJECT_API_URL,
-                                   json={
-                                       "project_name": project_name,
-                                   },
-                                   headers={
-                                       "Content-Type": "application/json",
-                                       "Authorization": f"Bearer {self.judgment_api_key}",
-                                       "X-Organization-Id": self.organization_id
-                                   })
-        if response.status_code != requests.codes.ok:
-            raise ValueError(f"Error deleting eval results: {response.json()}")
-        return response.json()
-
    def create_project(self, project_name: str) -> bool:
        """
        Creates a project on the server.
        """
-        response = requests.post(JUDGMENT_PROJECT_CREATE_API_URL,
-                                 json={
-                                     "project_name": project_name,
-                                 },
-                                 headers={
-                                     "Content-Type": "application/json",
-                                     "Authorization": f"Bearer {self.judgment_api_key}",
-                                     "X-Organization-Id": self.organization_id
-                                 })
-        if response.status_code != requests.codes.ok:
+        response = requests.post(
+            JUDGMENT_PROJECT_CREATE_API_URL,
+            json={
+                "project_name": project_name,
+            },
+            headers={
+                "Content-Type": "application/json",
+                "Authorization": f"Bearer {self.judgment_api_key}",
+                "X-Organization-Id": self.organization_id,
+            },
+        )
+        if response.status_code != codes.ok:
            raise ValueError(f"Error creating project: {response.json()}")
        return response.json()
-
+
    def delete_project(self, project_name: str) -> bool:
        """
        Deletes a project from the server. Which also deletes all evaluations and traces associated with the project.
        """
-        response = requests.delete(JUDGMENT_PROJECT_DELETE_API_URL,
-                                   json={
-                                       "project_name": project_name,
-                                   },
-                                   headers={
-                                       "Content-Type": "application/json",
-                                       "Authorization": f"Bearer {self.judgment_api_key}",
-                                       "X-Organization-Id": self.organization_id
-                                   })
-        if response.status_code != requests.codes.ok:
+        response = requests.delete(
+            JUDGMENT_PROJECT_DELETE_API_URL,
+            json={
+                "project_name": project_name,
+            },
+            headers={
+                "Content-Type": "application/json",
+                "Authorization": f"Bearer {self.judgment_api_key}",
+                "X-Organization-Id": self.organization_id,
+            },
+        )
+        if response.status_code != codes.ok:
            raise ValueError(f"Error deleting project: {response.json()}")
        return response.json()
-
+
    def fetch_classifier_scorer(self, slug: str) -> ClassifierScorer:
        """
        Fetches a classifier scorer configuration from the Judgment API.
@@ -424,33 +368,41 @@ class JudgmentClient(metaclass=SingletonMeta):
        request_body = {
            "slug": slug,
        }
-
+
        response = requests.post(
            f"{ROOT_API}/fetch_scorer/",
            json=request_body,
            headers={
                "Content-Type": "application/json",
                "Authorization": f"Bearer {self.judgment_api_key}",
-                "X-Organization-Id": self.organization_id
+                "X-Organization-Id": self.organization_id,
            },
-            verify=True
+            verify=True,
        )
-
+
        if response.status_code == 500:
-            raise JudgmentAPIError(f"The server is temporarily unavailable. Please try your request again in a few moments. Error details: {response.json().get('detail', '')}")
+            raise JudgmentAPIError(
+                f"The server is temporarily unavailable. Please try your request again in a few moments. Error details: {response.json().get('detail', '')}"
+            )
        elif response.status_code != 200:
-            raise JudgmentAPIError(
-
+            raise JudgmentAPIError(
+                f"Failed to fetch classifier scorer '{slug}': {response.json().get('detail', '')}"
+            )
+
        scorer_config = response.json()
-
-
-
+        scorer_config.pop("created_at")
+        scorer_config.pop("updated_at")
+
        try:
            return ClassifierScorer(**scorer_config)
        except Exception as e:
-            raise JudgmentAPIError(
+            raise JudgmentAPIError(
+                f"Failed to create classifier scorer '{slug}' with config {scorer_config}: {str(e)}"
+            )
 
-    def push_classifier_scorer(
+    def push_classifier_scorer(
+        self, scorer: ClassifierScorer, slug: str | None = None
+    ) -> str:
        """
        Pushes a classifier scorer configuration to the Judgment API.
 
@@ -468,62 +420,112 @@ class JudgmentClient(metaclass=SingletonMeta):
            "name": scorer.name,
            "conversation": scorer.conversation,
            "options": scorer.options,
-            "slug": slug
+            "slug": slug,
        }
-
+
        response = requests.post(
            f"{ROOT_API}/save_scorer/",
            json=request_body,
            headers={
                "Content-Type": "application/json",
                "Authorization": f"Bearer {self.judgment_api_key}",
-                "X-Organization-Id": self.organization_id
+                "X-Organization-Id": self.organization_id,
            },
-            verify=True
+            verify=True,
        )
-
+
        if response.status_code == 500:
-            raise JudgmentAPIError(f"The server is temporarily unavailable. \
+            raise JudgmentAPIError(
+                f"The server is temporarily unavailable. \
                Please try your request again in a few moments. \
-                Error details: {response.json().get('detail', '')}")
+                Error details: {response.json().get('detail', '')}"
+            )
        elif response.status_code != 200:
-            raise JudgmentAPIError(
-
+            raise JudgmentAPIError(
+                f"Failed to save classifier scorer: {response.json().get('detail', '')}"
+            )
+
        return response.json()["slug"]
-
+
    def assert_test(
-        self,
+        self,
+        examples: List[Example],
        scorers: List[Union[APIJudgmentScorer, JudgevalScorer]],
-        examples: Optional[List[Example]] = None,
-        model: Optional[Union[str, List[str], JudgevalJudge]] = "gpt-4.1",
-        test_file: Optional[str] = None,
-        aggregator: Optional[str] = None,
-        metadata: Optional[Dict[str, Any]] = None,
-        log_results: bool = True,
+        model: Optional[str] = "gpt-4.1",
        project_name: str = "default_test",
        eval_run_name: str = str(uuid4()),
        override: bool = False,
-        rules: Optional[List[Rule]] = None,
+        append: bool = False,
+        async_execution: bool = False,
+    ) -> None:
+        """
+        Asserts a test by running the evaluation and checking the results for success
+
+        Args:
+            examples (List[Example]): The examples to evaluate.
+            scorers (List[Union[APIJudgmentScorer, JudgevalScorer]]): A list of scorers to use for evaluation
+            model (str): The model used as a judge when using LLM as a Judge
+            project_name (str): The name of the project the evaluation results belong to
+            eval_run_name (str): A name for this evaluation run
+            override (bool): Whether to override an existing evaluation run with the same name
+            append (bool): Whether to append to an existing evaluation run with the same name
+            async_execution (bool): Whether to run the evaluation asynchronously
+        """
+
+        results: Union[List[ScoringResult], asyncio.Task | SpinnerWrappedTask]
+
+        results = self.run_evaluation(
+            examples=examples,
+            scorers=scorers,
+            model=model,
+            project_name=project_name,
+            eval_run_name=eval_run_name,
+            override=override,
+            append=append,
+            async_execution=async_execution,
+        )
+
+        if async_execution and isinstance(results, (asyncio.Task, SpinnerWrappedTask)):
+
+            async def run_async(): # Using wrapper here to resolve mypy error with passing Task into asyncio.run
+                return await results
+
+            actual_results = safe_run_async(run_async())
+            assert_test(actual_results) # Call the synchronous imported function
+        else:
+            # 'results' is already List[ScoringResult] here (synchronous path)
+            assert_test(results) # Call the synchronous imported function
+
+    def assert_trace_test(
+        self,
+        scorers: List[Union[APIJudgmentScorer, JudgevalScorer]],
+        examples: Optional[List[Example]] = None,
        function: Optional[Callable] = None,
        tracer: Optional[Union[Tracer, BaseCallbackHandler]] = None,
+        traces: Optional[List[Trace]] = None,
        tools: Optional[List[Dict[str, Any]]] = None,
-        async_execution: bool = False
+        model: Optional[str] = "gpt-4.1",
+        project_name: str = "default_test",
+        eval_run_name: str = str(uuid4()),
+        override: bool = False,
+        append: bool = False,
+        async_execution: bool = False,
    ) -> None:
        """
        Asserts a test by running the evaluation and checking the results for success
-
+
        Args:
-            examples (
-            test_file (Optional[str]): Path to a YAML file containing test examples. Must be provided if examples is not.
+            examples (List[Example]): The examples to evaluate.
            scorers (List[Union[APIJudgmentScorer, JudgevalScorer]]): A list of scorers to use for evaluation
-            model (
-            aggregator (Optional[str]): The aggregator to use for evaluation if using Mixture of Judges
-            metadata (Optional[Dict[str, Any]]): Additional metadata to include for this evaluation run
-            log_results (bool): Whether to log the results to the Judgment API
+            model (str): The model used as a judge when using LLM as a Judge
            project_name (str): The name of the project the evaluation results belong to
            eval_run_name (str): A name for this evaluation run
            override (bool): Whether to override an existing evaluation run with the same name
-
+            append (bool): Whether to append to an existing evaluation run with the same name
+            function (Optional[Callable]): A function to use for evaluation
+            tracer (Optional[Union[Tracer, BaseCallbackHandler]]): A tracer to use for evaluation
+            tools (Optional[List[Dict[str, Any]]]): A list of tools to use for evaluation
+            async_execution (bool): Whether to run the evaluation asynchronously
        """
 
        # Check for enable_param_checking and tools
@@ -531,46 +533,32 @@ class JudgmentClient(metaclass=SingletonMeta):
            if hasattr(scorer, "kwargs") and scorer.kwargs is not None:
                if scorer.kwargs.get("enable_param_checking") is True:
                    if not tools:
-                        raise ValueError(f"You must provide the 'tools' argument to assert_test when using a scorer with enable_param_checking=True. If you do not want to do param checking, explicitly set enable_param_checking=False for the {scorer.__name__} scorer.")
+                        raise ValueError(
+                            f"You must provide the 'tools' argument to assert_test when using a scorer with enable_param_checking=True. If you do not want to do param checking, explicitly set enable_param_checking=False for the {scorer.__name__} scorer."
+                        )
+
+        results: Union[List[ScoringResult], asyncio.Task | SpinnerWrappedTask]
+
+        results = self.run_trace_evaluation(
+            examples=examples,
+            traces=traces,
+            scorers=scorers,
+            model=model,
+            project_name=project_name,
+            eval_run_name=eval_run_name,
+            override=override,
+            append=append,
+            function=function,
+            tracer=tracer,
+            tools=tools,
+        )
 
-
-        if (examples is None and test_file is None) or (examples is not None and test_file is not None):
-            raise ValueError("Exactly one of 'examples' or 'test_file' must be provided, but not both")
+        if async_execution and isinstance(results, (asyncio.Task, SpinnerWrappedTask)):
 
-
-
-
-
-                model=model,
-                aggregator=aggregator,
-                log_results=log_results,
-                project_name=project_name,
-                eval_run_name=eval_run_name,
-                override=override,
-                rules=rules,
-                function=function,
-                tracer=tracer,
-                test_file=test_file,
-                tools=tools
-            )
-        else:
-            results = self.run_evaluation(
-                examples=examples,
-                scorers=scorers,
-                model=model,
-                aggregator=aggregator,
-                metadata=metadata,
-                log_results=log_results,
-                project_name=project_name,
-                eval_run_name=eval_run_name,
-                override=override,
-                rules=rules,
-                async_execution=async_execution
-            )
-
-        if async_execution:
-            # 'results' is an asyncio.Task here, awaiting it gives List[ScoringResult]
-            actual_results = asyncio.run(results)
+            async def run_async(): # Using wrapper here to resolve mypy error with passing Task into asyncio.run
+                return await results
+
+            actual_results = safe_run_async(run_async())
            assert_test(actual_results) # Call the synchronous imported function
        else:
            # 'results' is already List[ScoringResult] here (synchronous path)
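Taken together, the judgment_client.py changes above narrow the client's public signatures: the aggregator, metadata, log_results, ignore_errors, rules, and test_file parameters are removed, model is typed as a plain string, trace assertions move to a dedicated assert_trace_test method, and asynchronous results are resolved through safe_run_async/SpinnerWrappedTask instead of asyncio.run. An illustrative caller-side sketch against 0.0.46, based only on the signatures visible in this diff (the scorer class and Example fields are placeholders for whatever a project actually uses):

# Illustrative usage against judgeval 0.0.46; FaithfulnessScorer and the Example
# fields below are placeholders, not taken from the judgment_client.py diff above.
from judgeval import JudgmentClient
from judgeval.data import Example
from judgeval.scorers import FaithfulnessScorer  # placeholder scorer

client = JudgmentClient()  # reads JUDGMENT_API_KEY / JUDGMENT_ORG_ID from the environment

example = Example(
    input="What is the capital of France?",
    actual_output="Paris is the capital of France.",
)

# 0.0.44 also accepted aggregator=, metadata=, log_results=, ignore_errors=, rules=;
# 0.0.46 keeps model / project_name / eval_run_name / override / append / async_execution.
results = client.run_evaluation(
    examples=[example],
    scorers=[FaithfulnessScorer(threshold=0.8)],
    model="gpt-4.1",
    project_name="default_project",
    eval_run_name="demo_run",
)

# assert_test now takes examples directly (no test_file YAML path) and checks the
# results for success, as its docstring above describes.
client.assert_test(
    examples=[example],
    scorers=[FaithfulnessScorer(threshold=0.8)],
    model="gpt-4.1",
)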