judgeval 0.0.44__py3-none-any.whl → 0.0.46__py3-none-any.whl

This diff shows the changes between two package versions that have been publicly released to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in those registries.
Files changed (64)
  1. judgeval/__init__.py +5 -4
  2. judgeval/clients.py +6 -6
  3. judgeval/common/__init__.py +7 -2
  4. judgeval/common/exceptions.py +2 -3
  5. judgeval/common/logger.py +74 -49
  6. judgeval/common/s3_storage.py +30 -23
  7. judgeval/common/tracer.py +1273 -939
  8. judgeval/common/utils.py +416 -244
  9. judgeval/constants.py +73 -61
  10. judgeval/data/__init__.py +1 -1
  11. judgeval/data/custom_example.py +3 -2
  12. judgeval/data/datasets/dataset.py +80 -54
  13. judgeval/data/datasets/eval_dataset_client.py +131 -181
  14. judgeval/data/example.py +67 -43
  15. judgeval/data/result.py +11 -9
  16. judgeval/data/scorer_data.py +4 -2
  17. judgeval/data/tool.py +25 -16
  18. judgeval/data/trace.py +57 -29
  19. judgeval/data/trace_run.py +5 -11
  20. judgeval/evaluation_run.py +22 -82
  21. judgeval/integrations/langgraph.py +546 -184
  22. judgeval/judges/base_judge.py +1 -2
  23. judgeval/judges/litellm_judge.py +33 -11
  24. judgeval/judges/mixture_of_judges.py +128 -78
  25. judgeval/judges/together_judge.py +22 -9
  26. judgeval/judges/utils.py +14 -5
  27. judgeval/judgment_client.py +259 -271
  28. judgeval/rules.py +169 -142
  29. judgeval/run_evaluation.py +462 -305
  30. judgeval/scorers/api_scorer.py +20 -11
  31. judgeval/scorers/exceptions.py +1 -0
  32. judgeval/scorers/judgeval_scorer.py +77 -58
  33. judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +46 -15
  34. judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +3 -2
  35. judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +3 -2
  36. judgeval/scorers/judgeval_scorers/api_scorers/classifier_scorer.py +12 -11
  37. judgeval/scorers/judgeval_scorers/api_scorers/comparison.py +7 -5
  38. judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py +3 -2
  39. judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py +3 -2
  40. judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py +5 -2
  41. judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py +2 -1
  42. judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +17 -8
  43. judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +3 -2
  44. judgeval/scorers/judgeval_scorers/api_scorers/groundedness.py +3 -2
  45. judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +3 -2
  46. judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +3 -2
  47. judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py +8 -9
  48. judgeval/scorers/judgeval_scorers/api_scorers/summarization.py +4 -4
  49. judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +5 -5
  50. judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +5 -2
  51. judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py +9 -10
  52. judgeval/scorers/prompt_scorer.py +48 -37
  53. judgeval/scorers/score.py +86 -53
  54. judgeval/scorers/utils.py +11 -7
  55. judgeval/tracer/__init__.py +1 -1
  56. judgeval/utils/alerts.py +23 -12
  57. judgeval/utils/{data_utils.py → file_utils.py} +5 -9
  58. judgeval/utils/requests.py +29 -0
  59. judgeval/version_check.py +5 -2
  60. {judgeval-0.0.44.dist-info → judgeval-0.0.46.dist-info}/METADATA +79 -135
  61. judgeval-0.0.46.dist-info/RECORD +69 -0
  62. judgeval-0.0.44.dist-info/RECORD +0 -68
  63. {judgeval-0.0.44.dist-info → judgeval-0.0.46.dist-info}/WHEEL +0 -0
  64. {judgeval-0.0.44.dist-info → judgeval-0.0.46.dist-info}/licenses/LICENSE.md +0 -0
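
The largest change in this release is the slimmed-down JudgmentClient API in judgeval/judgment_client.py (diff below): aggregator, metadata, log_results, ignore_errors, and rules are dropped from the evaluation entry points, async results may now come back as a SpinnerWrappedTask, and HTTP calls go through the new judgeval.utils.requests wrapper. A minimal, hedged usage sketch against the 0.0.46 run_evaluation signature follows; the scorer class, Example fields, and threshold keyword are assumptions about the package's public exports, not shown in this diff.

# Hedged usage sketch of the 0.0.46 JudgmentClient.run_evaluation signature shown in the
# judgment_client.py diff below. The scorer class, Example fields, and threshold keyword
# are illustrative assumptions about the package's public exports, not taken from this diff.
from judgeval.judgment_client import JudgmentClient
from judgeval.data import Example
from judgeval.scorers import FaithfulnessScorer  # assumed export (see api_scorers/faithfulness.py)

client = JudgmentClient()  # reads JUDGMENT_API_KEY and JUDGMENT_ORG_ID from the environment

example = Example(
    input="What is the capital of France?",  # assumed field names
    actual_output="Paris is the capital of France.",
    retrieval_context=["Paris is the capital and largest city of France."],
)

# 0.0.46 drops aggregator, metadata, log_results, ignore_errors, and rules from this call.
results = client.run_evaluation(
    examples=[example],
    scorers=[FaithfulnessScorer(threshold=0.9)],
    model="gpt-4.1",
    project_name="default_project",
    eval_run_name="demo_eval_run",
)
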
judgeval/judgment_client.py
@@ -1,60 +1,62 @@
  """
  Implements the JudgmentClient to interact with the Judgment API.
  """
+
  import os
  from uuid import uuid4
  from typing import Optional, List, Dict, Any, Union, Callable
- import requests
+ from requests import codes
+ from judgeval.utils.requests import requests
  import asyncio

  from judgeval.constants import ROOT_API
  from judgeval.data.datasets import EvalDataset, EvalDatasetClient
  from judgeval.data import (
- ScoringResult,
+ ScoringResult,
  Example,
  CustomExample,
  Trace,
  )
  from judgeval.scorers import (
- APIJudgmentScorer,
- JudgevalScorer,
- ClassifierScorer,
+ APIJudgmentScorer,
+ JudgevalScorer,
+ ClassifierScorer,
  )
  from judgeval.evaluation_run import EvaluationRun
  from judgeval.run_evaluation import (
- run_eval,
+ run_eval,
  assert_test,
- run_trace_eval
+ run_trace_eval,
+ safe_run_async,
  )
  from judgeval.data.trace_run import TraceRun
- from judgeval.judges import JudgevalJudge
  from judgeval.constants import (
- JUDGMENT_EVAL_FETCH_API_URL,
- JUDGMENT_EVAL_DELETE_API_URL,
- JUDGMENT_EVAL_DELETE_PROJECT_API_URL,
+ JUDGMENT_EVAL_FETCH_API_URL,
  JUDGMENT_PROJECT_DELETE_API_URL,
- JUDGMENT_PROJECT_CREATE_API_URL
+ JUDGMENT_PROJECT_CREATE_API_URL,
  )
- from judgeval.utils.data_utils import add_from_yaml
  from judgeval.common.exceptions import JudgmentAPIError
  from langchain_core.callbacks import BaseCallbackHandler
  from judgeval.common.tracer import Tracer
  from judgeval.common.utils import validate_api_key
  from pydantic import BaseModel
- from judgeval.rules import Rule
+ from judgeval.run_evaluation import SpinnerWrappedTask
+

  class EvalRunRequestBody(BaseModel):
  eval_name: str
  project_name: str
  judgment_api_key: str

+
  class DeleteEvalRunRequestBody(BaseModel):
  eval_names: List[str]
  project_name: str
  judgment_api_key: str

+
  class SingletonMeta(type):
- _instances = {}
+ _instances: Dict[type, "JudgmentClient"] = {}

  def __call__(cls, *args, **kwargs):
  if cls not in cls._instances:
@@ -62,179 +64,168 @@ class SingletonMeta(type):
  cls._instances[cls] = instance
  return cls._instances[cls]

+
  class JudgmentClient(metaclass=SingletonMeta):
- def __init__(self, judgment_api_key: Optional[str] = os.getenv("JUDGMENT_API_KEY"), organization_id: Optional[str] = os.getenv("JUDGMENT_ORG_ID")):
+ def __init__(
+ self,
+ judgment_api_key: Optional[str] = os.getenv("JUDGMENT_API_KEY"),
+ organization_id: Optional[str] = os.getenv("JUDGMENT_ORG_ID"),
+ ):
  # Check if API key is None
  if judgment_api_key is None:
- raise ValueError("JUDGMENT_API_KEY cannot be None. Please provide a valid API key or set the JUDGMENT_API_KEY environment variable.")
-
+ raise ValueError(
+ "JUDGMENT_API_KEY cannot be None. Please provide a valid API key or set the JUDGMENT_API_KEY environment variable."
+ )
+
  # Check if organization ID is None
  if organization_id is None:
- raise ValueError("JUDGMENT_ORG_ID cannot be None. Please provide a valid organization ID or set the JUDGMENT_ORG_ID environment variable.")
-
+ raise ValueError(
+ "JUDGMENT_ORG_ID cannot be None. Please provide a valid organization ID or set the JUDGMENT_ORG_ID environment variable."
+ )
+
  self.judgment_api_key = judgment_api_key
  self.organization_id = organization_id
  self.eval_dataset_client = EvalDatasetClient(judgment_api_key, organization_id)
-
+
  # Verify API key is valid
  result, response = validate_api_key(judgment_api_key)
  if not result:
  # May be bad to output their invalid API key...
  raise JudgmentAPIError(f"Issue with passed in Judgment API key: {response}")
  else:
- print(f"Successfully initialized JudgmentClient!")
+ print("Successfully initialized JudgmentClient!")

  def a_run_evaluation(
- self,
+ self,
  examples: List[Example],
  scorers: List[Union[APIJudgmentScorer, JudgevalScorer]],
- model: Optional[Union[str, List[str], JudgevalJudge]] = "gpt-4.1",
- aggregator: Optional[str] = None,
- metadata: Optional[Dict[str, Any]] = None,
- log_results: bool = True,
+ model: Optional[str] = "gpt-4.1",
  project_name: str = "default_project",
  eval_run_name: str = "default_eval_run",
  override: bool = False,
  append: bool = False,
- ignore_errors: bool = True,
- rules: Optional[List[Rule]] = None
  ) -> List[ScoringResult]:
- return self.run_evaluation(
- examples=examples,
- scorers=scorers,
- model=model,
- aggregator=aggregator,
- metadata=metadata,
- log_results=log_results,
- project_name=project_name,
- eval_run_name=eval_run_name,
+ result = self.run_evaluation(
+ examples=examples,
+ scorers=scorers,
+ model=model,
+ project_name=project_name,
+ eval_run_name=eval_run_name,
  override=override,
- append=append,
- ignore_errors=ignore_errors,
- rules=rules
+ append=append,
+ async_execution=True,
  )
+ assert not isinstance(result, (asyncio.Task, SpinnerWrappedTask))
+ return result

  def run_trace_evaluation(
  self,
  scorers: List[Union[APIJudgmentScorer, JudgevalScorer]],
- model: Optional[Union[str, List[str], JudgevalJudge]] = "gpt-4.1",
- traces: Optional[List[Trace]] = None,
  examples: Optional[List[Example]] = None,
- test_file: Optional[str] = None,
- aggregator: Optional[str] = None,
+ function: Optional[Callable] = None,
+ tracer: Optional[Union[Tracer, BaseCallbackHandler]] = None,
+ traces: Optional[List[Trace]] = None,
+ tools: Optional[List[Dict[str, Any]]] = None,
  project_name: str = "default_project",
  eval_run_name: str = "default_eval_trace",
- log_results: bool = True,
+ model: Optional[str] = "gpt-4.1",
  append: bool = False,
  override: bool = False,
- ignore_errors: bool = True,
- rules: Optional[List[Rule]] = None,
- function: Optional[Callable] = None,
- tracer: Optional[Union[Tracer, BaseCallbackHandler]] = None,
- tools: Optional[List[Dict[str, Any]]] = None
  ) -> List[ScoringResult]:
- try:
-
- if test_file:
- try:
- examples = add_from_yaml(test_file)
- except FileNotFoundError:
- raise FileNotFoundError(f"Test file not found: {test_file}")
-
+ try:
  if examples and not function:
  raise ValueError("Cannot pass in examples without a function")
-
+
  if traces and function:
  raise ValueError("Cannot pass in traces and function")
-
+
  if examples and traces:
  raise ValueError("Cannot pass in both examples and traces")
-
+
  trace_run = TraceRun(
  project_name=project_name,
  eval_name=eval_run_name,
  traces=traces,
  scorers=scorers,
  model=model,
- aggregator=aggregator,
- log_results=log_results,
  append=append,
  judgment_api_key=self.judgment_api_key,
  organization_id=self.organization_id,
- tools=tools
+ tools=tools,
  )
- return run_trace_eval(trace_run, override, ignore_errors, function, tracer, examples)
+ return run_trace_eval(trace_run, override, function, tracer, examples)
  except ValueError as e:
- raise ValueError(f"Please check your TraceRun object, one or more fields are invalid: \n{str(e)}")
+ raise ValueError(
+ f"Please check your TraceRun object, one or more fields are invalid: \n{str(e)}"
+ )
  except Exception as e:
  raise Exception(f"An unexpected error occurred during evaluation: {str(e)}")

  def run_evaluation(
- self,
+ self,
  examples: Union[List[Example], List[CustomExample]],
  scorers: List[Union[APIJudgmentScorer, JudgevalScorer]],
- model: Optional[Union[str, List[str], JudgevalJudge]] = "gpt-4.1",
- aggregator: Optional[str] = None,
- metadata: Optional[Dict[str, Any]] = None,
- log_results: bool = True,
+ model: Optional[str] = "gpt-4.1",
  project_name: str = "default_project",
  eval_run_name: str = "default_eval_run",
  override: bool = False,
  append: bool = False,
- ignore_errors: bool = True,
  async_execution: bool = False,
- rules: Optional[List[Rule]] = None
- ) -> Union[List[ScoringResult], asyncio.Task]:
+ ) -> Union[List[ScoringResult], asyncio.Task | SpinnerWrappedTask]:
  """
  Executes an evaluation of `Example`s using one or more `Scorer`s
-
+
  Args:
  examples (Union[List[Example], List[CustomExample]]): The examples to evaluate
  scorers (List[Union[APIJudgmentScorer, JudgevalScorer]]): A list of scorers to use for evaluation
- model (Union[str, List[str], JudgevalJudge]): The model used as a judge when using LLM as a Judge
- aggregator (Optional[str]): The aggregator to use for evaluation if using Mixture of Judges
- metadata (Optional[Dict[str, Any]]): Additional metadata to include for this evaluation run
- log_results (bool): Whether to log the results to the Judgment API
+ model (str): The model used as a judge when using LLM as a Judge
  project_name (str): The name of the project the evaluation results belong to
  eval_run_name (str): A name for this evaluation run
  override (bool): Whether to override an existing evaluation run with the same name
- ignore_errors (bool): Whether to ignore errors during evaluation (safely handled)
- rules (Optional[List[Rule]]): Rules to evaluate against scoring results
-
+ append (bool): Whether to append to an existing evaluation run with the same name
+ async_execution (bool): Whether to execute the evaluation asynchronously
+
  Returns:
  List[ScoringResult]: The results of the evaluation
  """
  if override and append:
- raise ValueError("Cannot set both override and append to True. Please choose one.")
+ raise ValueError(
+ "Cannot set both override and append to True. Please choose one."
+ )

  try:
- if rules and any(isinstance(scorer, JudgevalScorer) for scorer in scorers):
- raise ValueError("Cannot use Judgeval scorers (only API scorers) when using rules. Please either remove rules or use only APIJudgmentScorer types.")
-
  eval = EvaluationRun(
- log_results=log_results,
  append=append,
  project_name=project_name,
  eval_name=eval_run_name,
  examples=examples,
  scorers=scorers,
  model=model,
- aggregator=aggregator,
- metadata=metadata,
  judgment_api_key=self.judgment_api_key,
- rules=rules,
- organization_id=self.organization_id
+ organization_id=self.organization_id,
+ )
+ return run_eval(
+ eval,
+ override,
+ async_execution=async_execution,
  )
- return run_eval(eval, override, ignore_errors=ignore_errors, async_execution=async_execution)
  except ValueError as e:
- raise ValueError(f"Please check your EvaluationRun object, one or more fields are invalid: \n{str(e)}")
+ raise ValueError(
+ f"Please check your EvaluationRun object, one or more fields are invalid: \n{str(e)}"
+ )
  except Exception as e:
  raise Exception(f"An unexpected error occurred during evaluation: {str(e)}")

  def create_dataset(self) -> EvalDataset:
  return self.eval_dataset_client.create_dataset()

- def push_dataset(self, alias: str, dataset: EvalDataset, project_name: str, overwrite: Optional[bool] = False) -> bool:
+ def push_dataset(
+ self,
+ alias: str,
+ dataset: EvalDataset,
+ project_name: str,
+ overwrite: Optional[bool] = False,
+ ) -> bool:
  """
  Uploads an `EvalDataset` to the Judgment platform for storage.

@@ -249,13 +240,15 @@ class JudgmentClient(metaclass=SingletonMeta):
  # Set judgment_api_key just in case it was not set
  dataset.judgment_api_key = self.judgment_api_key
  return self.eval_dataset_client.push(dataset, alias, project_name, overwrite)
-
- def append_example_dataset(self, alias: str, examples: List[Example], project_name: str) -> bool:
+
+ def append_dataset(
+ self, alias: str, examples: List[Example], project_name: str
+ ) -> bool:
  """
  Appends an `EvalDataset` to the Judgment platform for storage.
  """
  return self.eval_dataset_client.append_examples(alias, examples, project_name)
-
+
  def pull_dataset(self, alias: str, project_name: str) -> EvalDataset:
  """
  Retrieves a saved `EvalDataset` from the Judgment platform.
@@ -273,7 +266,7 @@ class JudgmentClient(metaclass=SingletonMeta):
  Deletes a saved `EvalDataset` from the Judgment platform.
  """
  return self.eval_dataset_client.delete(alias, project_name)
-
+
  def pull_project_dataset_stats(self, project_name: str) -> dict:
  """
  Retrieves all dataset stats from the Judgment platform for the project.
@@ -285,15 +278,11 @@ class JudgmentClient(metaclass=SingletonMeta):
  dict: The retrieved dataset stats
  """
  return self.eval_dataset_client.pull_project_dataset_stats(project_name)
-
- def insert_dataset(self, alias: str, examples: List[Example], project_name: str) -> bool:
- """
- Edits the dataset on Judgment platform by adding new examples
- """
- return self.eval_dataset_client.insert_dataset(alias, examples, project_name)
-
+
  # Maybe add option where you can pass in the EvaluationRun object and it will pull the eval results from the backend
- def pull_eval(self, project_name: str, eval_run_name: str) -> List[Dict[str, Union[str, List[ScoringResult]]]]:
+ def pull_eval(
+ self, project_name: str, eval_run_name: str
+ ) -> List[Dict[str, Union[str, List[ScoringResult]]]]:
  """Pull evaluation results from the server.

  Args:
@@ -305,109 +294,64 @@ class JudgmentClient(metaclass=SingletonMeta):
  - id (str): The evaluation run ID
  - results (List[ScoringResult]): List of scoring results
  """
- eval_run_request_body = EvalRunRequestBody(project_name=project_name,
- eval_name=eval_run_name,
- judgment_api_key=self.judgment_api_key)
- eval_run = requests.post(JUDGMENT_EVAL_FETCH_API_URL,
- headers={
- "Content-Type": "application/json",
- "Authorization": f"Bearer {self.judgment_api_key}",
- "X-Organization-Id": self.organization_id
- },
- json=eval_run_request_body.model_dump(),
- verify=True)
- if eval_run.status_code != requests.codes.ok:
+ eval_run_request_body = EvalRunRequestBody(
+ project_name=project_name,
+ eval_name=eval_run_name,
+ judgment_api_key=self.judgment_api_key,
+ )
+ eval_run = requests.post(
+ JUDGMENT_EVAL_FETCH_API_URL,
+ headers={
+ "Content-Type": "application/json",
+ "Authorization": f"Bearer {self.judgment_api_key}",
+ "X-Organization-Id": self.organization_id,
+ },
+ json=eval_run_request_body.model_dump(),
+ verify=True,
+ )
+ if eval_run.status_code != codes.ok:
  raise ValueError(f"Error fetching eval results: {eval_run.json()}")

  return eval_run.json()
-
- def delete_eval(self, project_name: str, eval_run_names: List[str]) -> bool:
- """
- Deletes an evaluation from the server by project and run names.

- Args:
- project_name (str): Name of the project
- eval_run_names (List[str]): List of names of the evaluation runs
-
- Returns:
- bool: Whether the evaluation was successfully deleted
- """
- if not eval_run_names:
- raise ValueError("No evaluation run names provided")
-
- eval_run_request_body = DeleteEvalRunRequestBody(project_name=project_name,
- eval_names=eval_run_names,
- judgment_api_key=self.judgment_api_key)
- response = requests.delete(JUDGMENT_EVAL_DELETE_API_URL,
- json=eval_run_request_body.model_dump(),
- headers={
- "Content-Type": "application/json",
- "Authorization": f"Bearer {self.judgment_api_key}",
- "X-Organization-Id": self.organization_id
- })
- if response.status_code == 404:
- raise ValueError(f"Eval results not found: {response.json()}")
- elif response.status_code == 500:
- raise ValueError(f"Error deleting eval results: {response.json()}")
- return bool(response.json())
-
- def delete_project_evals(self, project_name: str) -> bool:
- """
- Deletes all evaluations from the server for a given project.
-
- Args:
- project_name (str): Name of the project
-
- Returns:
- bool: Whether the evaluations were successfully deleted
- """
- response = requests.delete(JUDGMENT_EVAL_DELETE_PROJECT_API_URL,
- json={
- "project_name": project_name,
- },
- headers={
- "Content-Type": "application/json",
- "Authorization": f"Bearer {self.judgment_api_key}",
- "X-Organization-Id": self.organization_id
- })
- if response.status_code != requests.codes.ok:
- raise ValueError(f"Error deleting eval results: {response.json()}")
- return response.json()
-
  def create_project(self, project_name: str) -> bool:
  """
  Creates a project on the server.
  """
- response = requests.post(JUDGMENT_PROJECT_CREATE_API_URL,
- json={
- "project_name": project_name,
- },
- headers={
- "Content-Type": "application/json",
- "Authorization": f"Bearer {self.judgment_api_key}",
- "X-Organization-Id": self.organization_id
- })
- if response.status_code != requests.codes.ok:
+ response = requests.post(
+ JUDGMENT_PROJECT_CREATE_API_URL,
+ json={
+ "project_name": project_name,
+ },
+ headers={
+ "Content-Type": "application/json",
+ "Authorization": f"Bearer {self.judgment_api_key}",
+ "X-Organization-Id": self.organization_id,
+ },
+ )
+ if response.status_code != codes.ok:
  raise ValueError(f"Error creating project: {response.json()}")
  return response.json()
-
+
  def delete_project(self, project_name: str) -> bool:
  """
  Deletes a project from the server. Which also deletes all evaluations and traces associated with the project.
  """
- response = requests.delete(JUDGMENT_PROJECT_DELETE_API_URL,
- json={
- "project_name": project_name,
- },
- headers={
- "Content-Type": "application/json",
- "Authorization": f"Bearer {self.judgment_api_key}",
- "X-Organization-Id": self.organization_id
- })
- if response.status_code != requests.codes.ok:
+ response = requests.delete(
+ JUDGMENT_PROJECT_DELETE_API_URL,
+ json={
+ "project_name": project_name,
+ },
+ headers={
+ "Content-Type": "application/json",
+ "Authorization": f"Bearer {self.judgment_api_key}",
+ "X-Organization-Id": self.organization_id,
+ },
+ )
+ if response.status_code != codes.ok:
  raise ValueError(f"Error deleting project: {response.json()}")
  return response.json()
-
+
  def fetch_classifier_scorer(self, slug: str) -> ClassifierScorer:
  """
  Fetches a classifier scorer configuration from the Judgment API.
@@ -424,33 +368,41 @@ class JudgmentClient(metaclass=SingletonMeta):
  request_body = {
  "slug": slug,
  }
-
+
  response = requests.post(
  f"{ROOT_API}/fetch_scorer/",
  json=request_body,
  headers={
  "Content-Type": "application/json",
  "Authorization": f"Bearer {self.judgment_api_key}",
- "X-Organization-Id": self.organization_id
+ "X-Organization-Id": self.organization_id,
  },
- verify=True
+ verify=True,
  )
-
+
  if response.status_code == 500:
- raise JudgmentAPIError(f"The server is temporarily unavailable. Please try your request again in a few moments. Error details: {response.json().get('detail', '')}")
+ raise JudgmentAPIError(
+ f"The server is temporarily unavailable. Please try your request again in a few moments. Error details: {response.json().get('detail', '')}"
+ )
  elif response.status_code != 200:
- raise JudgmentAPIError(f"Failed to fetch classifier scorer '{slug}': {response.json().get('detail', '')}")
-
+ raise JudgmentAPIError(
+ f"Failed to fetch classifier scorer '{slug}': {response.json().get('detail', '')}"
+ )
+
  scorer_config = response.json()
- created_at = scorer_config.pop("created_at")
- updated_at = scorer_config.pop("updated_at")
-
+ scorer_config.pop("created_at")
+ scorer_config.pop("updated_at")
+
  try:
  return ClassifierScorer(**scorer_config)
  except Exception as e:
- raise JudgmentAPIError(f"Failed to create classifier scorer '{slug}' with config {scorer_config}: {str(e)}")
+ raise JudgmentAPIError(
+ f"Failed to create classifier scorer '{slug}' with config {scorer_config}: {str(e)}"
+ )

- def push_classifier_scorer(self, scorer: ClassifierScorer, slug: str = None) -> str:
+ def push_classifier_scorer(
+ self, scorer: ClassifierScorer, slug: str | None = None
+ ) -> str:
  """
  Pushes a classifier scorer configuration to the Judgment API.

@@ -468,62 +420,112 @@ class JudgmentClient(metaclass=SingletonMeta):
  "name": scorer.name,
  "conversation": scorer.conversation,
  "options": scorer.options,
- "slug": slug
+ "slug": slug,
  }
-
+
  response = requests.post(
  f"{ROOT_API}/save_scorer/",
  json=request_body,
  headers={
  "Content-Type": "application/json",
  "Authorization": f"Bearer {self.judgment_api_key}",
- "X-Organization-Id": self.organization_id
+ "X-Organization-Id": self.organization_id,
  },
- verify=True
+ verify=True,
  )
-
+
  if response.status_code == 500:
- raise JudgmentAPIError(f"The server is temporarily unavailable. \
+ raise JudgmentAPIError(
+ f"The server is temporarily unavailable. \
  Please try your request again in a few moments. \
- Error details: {response.json().get('detail', '')}")
+ Error details: {response.json().get('detail', '')}"
+ )
  elif response.status_code != 200:
- raise JudgmentAPIError(f"Failed to save classifier scorer: {response.json().get('detail', '')}")
-
+ raise JudgmentAPIError(
+ f"Failed to save classifier scorer: {response.json().get('detail', '')}"
+ )
+
  return response.json()["slug"]
-
+
  def assert_test(
- self,
+ self,
+ examples: List[Example],
  scorers: List[Union[APIJudgmentScorer, JudgevalScorer]],
- examples: Optional[List[Example]] = None,
- model: Optional[Union[str, List[str], JudgevalJudge]] = "gpt-4.1",
- test_file: Optional[str] = None,
- aggregator: Optional[str] = None,
- metadata: Optional[Dict[str, Any]] = None,
- log_results: bool = True,
+ model: Optional[str] = "gpt-4.1",
  project_name: str = "default_test",
  eval_run_name: str = str(uuid4()),
  override: bool = False,
- rules: Optional[List[Rule]] = None,
+ append: bool = False,
+ async_execution: bool = False,
+ ) -> None:
+ """
+ Asserts a test by running the evaluation and checking the results for success
+
+ Args:
+ examples (List[Example]): The examples to evaluate.
+ scorers (List[Union[APIJudgmentScorer, JudgevalScorer]]): A list of scorers to use for evaluation
+ model (str): The model used as a judge when using LLM as a Judge
+ project_name (str): The name of the project the evaluation results belong to
+ eval_run_name (str): A name for this evaluation run
+ override (bool): Whether to override an existing evaluation run with the same name
+ append (bool): Whether to append to an existing evaluation run with the same name
+ async_execution (bool): Whether to run the evaluation asynchronously
+ """
+
+ results: Union[List[ScoringResult], asyncio.Task | SpinnerWrappedTask]
+
+ results = self.run_evaluation(
+ examples=examples,
+ scorers=scorers,
+ model=model,
+ project_name=project_name,
+ eval_run_name=eval_run_name,
+ override=override,
+ append=append,
+ async_execution=async_execution,
+ )
+
+ if async_execution and isinstance(results, (asyncio.Task, SpinnerWrappedTask)):
+
+ async def run_async(): # Using wrapper here to resolve mypy error with passing Task into asyncio.run
+ return await results
+
+ actual_results = safe_run_async(run_async())
+ assert_test(actual_results) # Call the synchronous imported function
+ else:
+ # 'results' is already List[ScoringResult] here (synchronous path)
+ assert_test(results) # Call the synchronous imported function
+
+ def assert_trace_test(
+ self,
+ scorers: List[Union[APIJudgmentScorer, JudgevalScorer]],
+ examples: Optional[List[Example]] = None,
  function: Optional[Callable] = None,
  tracer: Optional[Union[Tracer, BaseCallbackHandler]] = None,
+ traces: Optional[List[Trace]] = None,
  tools: Optional[List[Dict[str, Any]]] = None,
- async_execution: bool = False
+ model: Optional[str] = "gpt-4.1",
+ project_name: str = "default_test",
+ eval_run_name: str = str(uuid4()),
+ override: bool = False,
+ append: bool = False,
+ async_execution: bool = False,
  ) -> None:
  """
  Asserts a test by running the evaluation and checking the results for success
-
+
  Args:
- examples (Optional[List[Example]]): The examples to evaluate. Must be provided if test_file is not.
- test_file (Optional[str]): Path to a YAML file containing test examples. Must be provided if examples is not.
+ examples (List[Example]): The examples to evaluate.
  scorers (List[Union[APIJudgmentScorer, JudgevalScorer]]): A list of scorers to use for evaluation
- model (Union[str, List[str], JudgevalJudge]): The model used as a judge when using LLM as a Judge
- aggregator (Optional[str]): The aggregator to use for evaluation if using Mixture of Judges
- metadata (Optional[Dict[str, Any]]): Additional metadata to include for this evaluation run
- log_results (bool): Whether to log the results to the Judgment API
+ model (str): The model used as a judge when using LLM as a Judge
  project_name (str): The name of the project the evaluation results belong to
  eval_run_name (str): A name for this evaluation run
  override (bool): Whether to override an existing evaluation run with the same name
- rules (Optional[List[Rule]]): Rules to evaluate against scoring results
+ append (bool): Whether to append to an existing evaluation run with the same name
+ function (Optional[Callable]): A function to use for evaluation
+ tracer (Optional[Union[Tracer, BaseCallbackHandler]]): A tracer to use for evaluation
+ tools (Optional[List[Dict[str, Any]]]): A list of tools to use for evaluation
+ async_execution (bool): Whether to run the evaluation asynchronously
  """

  # Check for enable_param_checking and tools
@@ -531,46 +533,32 @@ class JudgmentClient(metaclass=SingletonMeta):
  if hasattr(scorer, "kwargs") and scorer.kwargs is not None:
  if scorer.kwargs.get("enable_param_checking") is True:
  if not tools:
- raise ValueError(f"You must provide the 'tools' argument to assert_test when using a scorer with enable_param_checking=True. If you do not want to do param checking, explicitly set enable_param_checking=False for the {scorer.__name__} scorer.")
+ raise ValueError(
+ f"You must provide the 'tools' argument to assert_test when using a scorer with enable_param_checking=True. If you do not want to do param checking, explicitly set enable_param_checking=False for the {scorer.__name__} scorer."
+ )
+
+ results: Union[List[ScoringResult], asyncio.Task | SpinnerWrappedTask]
+
+ results = self.run_trace_evaluation(
+ examples=examples,
+ traces=traces,
+ scorers=scorers,
+ model=model,
+ project_name=project_name,
+ eval_run_name=eval_run_name,
+ override=override,
+ append=append,
+ function=function,
+ tracer=tracer,
+ tools=tools,
+ )

- # Validate that exactly one of examples or test_file is provided
- if (examples is None and test_file is None) or (examples is not None and test_file is not None):
- raise ValueError("Exactly one of 'examples' or 'test_file' must be provided, but not both")
+ if async_execution and isinstance(results, (asyncio.Task, SpinnerWrappedTask)):

- if function:
- results = self.run_trace_evaluation(
- examples=examples,
- scorers=scorers,
- model=model,
- aggregator=aggregator,
- log_results=log_results,
- project_name=project_name,
- eval_run_name=eval_run_name,
- override=override,
- rules=rules,
- function=function,
- tracer=tracer,
- test_file=test_file,
- tools=tools
- )
- else:
- results = self.run_evaluation(
- examples=examples,
- scorers=scorers,
- model=model,
- aggregator=aggregator,
- metadata=metadata,
- log_results=log_results,
- project_name=project_name,
- eval_run_name=eval_run_name,
- override=override,
- rules=rules,
- async_execution=async_execution
- )
-
- if async_execution:
- # 'results' is an asyncio.Task here, awaiting it gives List[ScoringResult]
- actual_results = asyncio.run(results)
+ async def run_async(): # Using wrapper here to resolve mypy error with passing Task into asyncio.run
+ return await results
+
+ actual_results = safe_run_async(run_async())
  assert_test(actual_results) # Call the synchronous imported function
  else:
  # 'results' is already List[ScoringResult] here (synchronous path)
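
For the new assert_trace_test path above, examples must be paired with a function (and optionally a tracer), mirroring the validation in run_trace_evaluation. A minimal sketch follows, assuming a Tracer constructed with a project name, an observe decorator, and a ToolOrderScorer export; none of those details appear in this diff and they may differ from the actual tracer API.

# Hedged sketch of the assert_trace_test flow added in 0.0.46. The method signature comes
# from the diff above; the Tracer constructor, observe decorator, and scorer class are
# assumptions about the rest of the package and may differ from the actual API.
from judgeval.judgment_client import JudgmentClient
from judgeval.common.tracer import Tracer
from judgeval.data import Example
from judgeval.scorers import ToolOrderScorer  # assumed export (see api_scorers/tool_order.py)

client = JudgmentClient()
tracer = Tracer(project_name="default_test")  # assumed constructor argument

@tracer.observe(span_type="function")  # assumed decorator API
def answer(question: str) -> str:
    # Stand-in agent; assert_trace_test invokes it through the function argument.
    return "Paris is the capital of France."

client.assert_trace_test(
    examples=[Example(input="What is the capital of France?")],  # assumed Example field
    scorers=[ToolOrderScorer()],
    function=answer,
    tracer=tracer,
    project_name="default_test",
    eval_run_name="trace_assert_demo",
)
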