judgeval 0.0.13__py3-none-any.whl → 0.0.15__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only.
judgeval/data/example.py CHANGED
@@ -5,7 +5,7 @@ Classes for representing examples in a dataset.
 
 from typing import TypeVar, Optional, Any, Dict, List
 from uuid import uuid4
-from pydantic import BaseModel, Field
+from pydantic import BaseModel, Field, field_validator
 from enum import Enum
 from datetime import datetime
 import time
@@ -40,6 +40,13 @@ class Example(BaseModel):
     timestamp: Optional[str] = None
     trace_id: Optional[str] = None
 
+    @field_validator('input', 'actual_output', mode='before')
+    def convert_to_str(cls, value):
+        try:
+            return str(value)
+        except Exception:
+            return repr(value)
+
     def __init__(self, **data):
         if 'example_id' not in data:
             data['example_id'] = str(uuid4())
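
The new before-mode validator coerces `input` and `actual_output` to strings before Pydantic's own type checks run, so callers can pass dicts, numbers, or other objects without tripping validation. A minimal, self-contained sketch of the same pattern (a stand-in model, not judgeval's actual `Example`, whose other fields are not shown in this hunk):

```python
from pydantic import BaseModel, field_validator

class CoercingModel(BaseModel):
    input: str
    actual_output: str

    @field_validator('input', 'actual_output', mode='before')
    def convert_to_str(cls, value):
        # Runs before type validation, so non-string values are stringified
        try:
            return str(value)
        except Exception:
            return repr(value)

m = CoercingModel(input={"question": "What is 2+2?"}, actual_output=4)
print(m.input)          # "{'question': 'What is 2+2?'}"
print(m.actual_output)  # "4"
```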
@@ -6,6 +6,7 @@ from judgeval.scorers import JudgevalScorer, APIJudgmentScorer
 from judgeval.constants import ACCEPTABLE_MODELS
 from judgeval.common.logger import debug, error
 from judgeval.judges import JudgevalJudge
+from judgeval.rules import Rule
 
 class EvaluationRun(BaseModel):
     """
@@ -20,10 +21,12 @@ class EvaluationRun(BaseModel):
         aggregator (Optional[str]): The aggregator to use for evaluation if using Mixture of Judges
         metadata (Optional[Dict[str, Any]]): Additional metadata to include for this evaluation run, e.g. comments, dataset name, purpose, etc.
         judgment_api_key (Optional[str]): The API key for running evaluations on the Judgment API
+        rules (Optional[List[Rule]]): Rules to evaluate against scoring results
     """
 
     # The user will specify whether they want log_results when they call run_eval
     log_results: bool = False # NOTE: log_results has to be set first because it is used to validate project_name and eval_name
+    organization_id: Optional[str] = None
     project_name: Optional[str] = None
     eval_name: Optional[str] = None
     examples: List[Example]
@@ -34,6 +37,7 @@ class EvaluationRun(BaseModel):
     # API Key will be "" until user calls client.run_eval(), then API Key will be set
     judgment_api_key: Optional[str] = ""
     override: Optional[bool] = False
+    rules: Optional[List[Rule]] = None
 
     def model_dump(self, **kwargs):
         data = super().model_dump(**kwargs)
@@ -44,6 +48,11 @@ class EvaluationRun(BaseModel):
                 else {"score_type": scorer.score_type, "threshold": scorer.threshold}
                 for scorer in self.scorers
             ]
+
+        if self.rules:
+            # Process rules to ensure proper serialization
+            data["rules"] = [rule.model_dump() for rule in self.rules]
+
         return data
 
     @field_validator('log_results', mode='before')
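
`EvaluationRun` now carries two new fields, `organization_id` and `rules`, and its `model_dump` override serializes each attached `Rule` explicitly. A stand-alone sketch of that serialization pattern (the `Rule` and `Run` classes below are simplified stand-ins, not judgeval's definitions; for plain nested models the explicit pass is redundant, but it guarantees any custom `Rule.model_dump` logic is applied):

```python
from typing import List, Optional
from pydantic import BaseModel

class Rule(BaseModel):      # hypothetical stand-in
    name: str
    threshold: float

class Run(BaseModel):       # hypothetical stand-in for EvaluationRun
    eval_name: str
    rules: Optional[List[Rule]] = None

    def model_dump(self, **kwargs):
        data = super().model_dump(**kwargs)
        if self.rules:
            # Mirror of the diff: expand each rule into a plain dict
            data["rules"] = [rule.model_dump() for rule in self.rules]
        return data

run = Run(eval_name="demo", rules=[Rule(name="min_faithfulness", threshold=0.7)])
print(run.model_dump())
# {'eval_name': 'demo', 'rules': [{'name': 'min_faithfulness', 'threshold': 0.7}]}
```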
@@ -14,7 +14,7 @@ BASE_CONVERSATION = [
 ]
 
 class TogetherJudge(JudgevalJudge):
-    def __init__(self, model: str = "QWEN", **kwargs):
+    def __init__(self, model: str = "Qwen/Qwen2.5-72B-Instruct-Turbo", **kwargs):
         debug(f"Initializing TogetherJudge with model={model}")
         self.model = model
         self.kwargs = kwargs
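
The placeholder default `"QWEN"` is replaced with a concrete Together AI model identifier, so constructing the judge without arguments now points at a real model. A hedged usage sketch (the import path is assumed and may differ in the package):

```python
from judgeval.judges import TogetherJudge  # import path assumed

judge = TogetherJudge()  # defaults to "Qwen/Qwen2.5-72B-Instruct-Turbo" as of this release
print(judge.model)
```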
judgeval/judges/utils.py CHANGED
@@ -39,7 +39,7 @@ def create_judge(
                 Please either set the `use_judgment` flag to True or use
                 non-Judgment models."""
             )
            if m not in LITELLM_SUPPORTED_MODELS and m not in TOGETHER_SUPPORTED_MODELS:
            if m not in ACCEPTABLE_MODELS:
                raise InvalidJudgeModelError(f"Invalid judge model chosen: {m}")
        return MixtureOfJudges(models=model), True
    # If model is a string, check that it corresponds to a valid model
@@ -15,7 +15,8 @@ from judgeval.scorers import (
     APIJudgmentScorer,
     JudgevalScorer,
     ClassifierScorer,
-    ScorerWrapper
+    ScorerWrapper,
+    score,
 )
 from judgeval.evaluation_run import EvaluationRun
 from judgeval.run_evaluation import (
@@ -26,6 +27,7 @@ from judgeval.judges import JudgevalJudge
 from judgeval.constants import JUDGMENT_EVAL_FETCH_API_URL, JUDGMENT_EVAL_DELETE_API_URL, JUDGMENT_EVAL_DELETE_PROJECT_API_URL
 from judgeval.common.exceptions import JudgmentAPIError
 from pydantic import BaseModel
+from judgeval.rules import Rule
 
 class EvalRunRequestBody(BaseModel):
     eval_name: str
@@ -34,9 +36,10 @@ class EvalRunRequestBody(BaseModel):
 
 
 class JudgmentClient:
-    def __init__(self, judgment_api_key: str = os.getenv("JUDGMENT_API_KEY")):
+    def __init__(self, judgment_api_key: str = os.getenv("JUDGMENT_API_KEY"), organization_id: str = os.getenv("JUDGMENT_ORG_ID")):
         self.judgment_api_key = judgment_api_key
-        self.eval_dataset_client = EvalDatasetClient(judgment_api_key)
+        self.organization_id = organization_id
+        self.eval_dataset_client = EvalDatasetClient(judgment_api_key, organization_id)
 
         # Verify API key is valid
         result, response = self._validate_api_key()
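
`JudgmentClient` now takes an `organization_id` (defaulting to the `JUDGMENT_ORG_ID` environment variable) and threads it into the dataset client and, further down, into request headers as `X-Organization-Id`. A hedged construction sketch (the import path is assumed; the environment variable names are the defaults shown above):

```python
import os
from judgeval import JudgmentClient  # import path assumed

# Explicit arguments...
client = JudgmentClient(
    judgment_api_key=os.getenv("JUDGMENT_API_KEY"),
    organization_id=os.getenv("JUDGMENT_ORG_ID"),
)

# ...or rely on the defaults, which read the same environment variables.
client = JudgmentClient()
```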
@@ -57,17 +60,69 @@ class JudgmentClient:
         project_name: str = "default_project",
         eval_run_name: str = "default_eval_run",
         override: bool = False,
-        use_judgment: bool = True
+        use_judgment: bool = True,
+        rules: Optional[List[Rule]] = None
     ) -> List[ScoringResult]:
         """
         Executes an evaluation of `Example`s using one or more `Scorer`s
+
+        Args:
+            examples (List[Example]): The examples to evaluate
+            scorers (List[Union[ScorerWrapper, JudgevalScorer]]): A list of scorers to use for evaluation
+            model (Union[str, List[str], JudgevalJudge]): The model used as a judge when using LLM as a Judge
+            aggregator (Optional[str]): The aggregator to use for evaluation if using Mixture of Judges
+            metadata (Optional[Dict[str, Any]]): Additional metadata to include for this evaluation run
+            log_results (bool): Whether to log the results to the Judgment API
+            project_name (str): The name of the project the evaluation results belong to
+            eval_run_name (str): A name for this evaluation run
+            override (bool): Whether to override an existing evaluation run with the same name
+            use_judgment (bool): Whether to use Judgment API for evaluation
+            rules (Optional[List[Rule]]): Rules to evaluate against scoring results
+
+        Returns:
+            List[ScoringResult]: The results of the evaluation
         """
         try:
             # Load appropriate implementations for all scorers
-            loaded_scorers: List[Union[JudgevalScorer, APIJudgmentScorer]] = [
-                scorer.load_implementation(use_judgment=use_judgment) if isinstance(scorer, ScorerWrapper) else scorer
-                for scorer in scorers
-            ]
+            loaded_scorers: List[Union[JudgevalScorer, APIJudgmentScorer]] = []
+            for scorer in scorers:
+                try:
+                    if isinstance(scorer, ScorerWrapper):
+                        loaded_scorers.append(scorer.load_implementation(use_judgment=use_judgment))
+                    else:
+                        loaded_scorers.append(scorer)
+                except Exception as e:
+                    raise ValueError(f"Failed to load implementation for scorer {scorer}: {str(e)}")
+
+            # Prevent using JudgevalScorer with rules - only APIJudgmentScorer allowed with rules
+            if rules and any(isinstance(scorer, JudgevalScorer) for scorer in loaded_scorers):
+                raise ValueError("Cannot use Judgeval scorers (only API scorers) when using rules. Please either remove rules or use only APIJudgmentScorer types.")
+
+            # Convert ScorerWrapper in rules to their implementations
+            loaded_rules = None
+            if rules:
+                loaded_rules = []
+                for rule in rules:
+                    try:
+                        processed_conditions = []
+                        for condition in rule.conditions:
+                            # Convert metric if it's a ScorerWrapper
+                            if isinstance(condition.metric, ScorerWrapper):
+                                try:
+                                    condition_copy = condition.model_copy()
+                                    condition_copy.metric = condition.metric.load_implementation(use_judgment=use_judgment)
+                                    processed_conditions.append(condition_copy)
+                                except Exception as e:
+                                    raise ValueError(f"Failed to convert ScorerWrapper to implementation in rule '{rule.name}', condition metric '{condition.metric}': {str(e)}")
+                            else:
+                                processed_conditions.append(condition)
+
+                        # Create new rule with processed conditions
+                        new_rule = rule.model_copy()
+                        new_rule.conditions = processed_conditions
+                        loaded_rules.append(new_rule)
+                    except Exception as e:
+                        raise ValueError(f"Failed to process rule '{rule.name}': {str(e)}")
 
             eval = EvaluationRun(
                 log_results=log_results,
@@ -78,11 +133,15 @@ class JudgmentClient:
                 model=model,
                 aggregator=aggregator,
                 metadata=metadata,
-                judgment_api_key=self.judgment_api_key
+                judgment_api_key=self.judgment_api_key,
+                rules=loaded_rules,
+                organization_id=self.organization_id
             )
             return run_eval(eval, override)
         except ValueError as e:
             raise ValueError(f"Please check your EvaluationRun object, one or more fields are invalid: \n{str(e)}")
+        except Exception as e:
+            raise Exception(f"An unexpected error occurred during evaluation: {str(e)}")
 
     def evaluate_dataset(
         self,
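
`run_evaluation` (and, below, `evaluate_dataset`) now accepts a `rules` argument. Rules may only be combined with API-backed scorers: a `ValueError` is raised if any `JudgevalScorer` is in the loaded scorer list, and any `ScorerWrapper` used as a condition metric is resolved to its implementation before the `EvaluationRun` is built. A hedged usage sketch; how `Rule` objects are constructed is not shown in this diff, so `my_rules` is assumed to be a list of `judgeval.rules.Rule` instances built elsewhere, and the judge model string is illustrative:

```python
my_rules = build_rules()  # hypothetical helper returning a List[Rule]

results = client.run_evaluation(
    examples=examples,            # List[Example], as before
    scorers=api_scorers,          # API scorers only when rules are supplied
    model="gpt-4o",               # illustrative; must be in ACCEPTABLE_MODELS
    log_results=True,
    project_name="default_project",
    eval_run_name="rules_demo",
    rules=my_rules,               # new in this release
)
```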
@@ -94,17 +153,68 @@ class JudgmentClient:
         project_name: str = "",
         eval_run_name: str = "",
         log_results: bool = False,
-        use_judgment: bool = True
+        use_judgment: bool = True,
+        rules: Optional[List[Rule]] = None
     ) -> List[ScoringResult]:
         """
         Executes an evaluation of a `EvalDataset` using one or more `Scorer`s
+
+        Args:
+            dataset (EvalDataset): The dataset containing examples to evaluate
+            scorers (List[Union[ScorerWrapper, JudgevalScorer]]): A list of scorers to use for evaluation
+            model (Union[str, List[str], JudgevalJudge]): The model used as a judge when using LLM as a Judge
+            aggregator (Optional[str]): The aggregator to use for evaluation if using Mixture of Judges
+            metadata (Optional[Dict[str, Any]]): Additional metadata to include for this evaluation run
+            project_name (str): The name of the project the evaluation results belong to
+            eval_run_name (str): A name for this evaluation run
+            log_results (bool): Whether to log the results to the Judgment API
+            use_judgment (bool): Whether to use Judgment API for evaluation
+            rules (Optional[List[Rule]]): Rules to evaluate against scoring results
+
+        Returns:
+            List[ScoringResult]: The results of the evaluation
         """
         try:
             # Load appropriate implementations for all scorers
-            loaded_scorers: List[Union[JudgevalScorer, APIJudgmentScorer]] = [
-                scorer.load_implementation(use_judgment=use_judgment) if isinstance(scorer, ScorerWrapper) else scorer
-                for scorer in scorers
-            ]
+            loaded_scorers: List[Union[JudgevalScorer, APIJudgmentScorer]] = []
+            for scorer in scorers:
+                try:
+                    if isinstance(scorer, ScorerWrapper):
+                        loaded_scorers.append(scorer.load_implementation(use_judgment=use_judgment))
+                    else:
+                        loaded_scorers.append(scorer)
+                except Exception as e:
+                    raise ValueError(f"Failed to load implementation for scorer {scorer}: {str(e)}")
+
+            # Prevent using JudgevalScorer with rules - only APIJudgmentScorer allowed with rules
+            if rules and any(isinstance(scorer, JudgevalScorer) for scorer in loaded_scorers):
+                raise ValueError("Cannot use Judgeval scorers (only API scorers) when using rules. Please either remove rules or use only APIJudgmentScorer types.")
+
+            # Convert ScorerWrapper in rules to their implementations
+            loaded_rules = None
+            if rules:
+                loaded_rules = []
+                for rule in rules:
+                    try:
+                        processed_conditions = []
+                        for condition in rule.conditions:
+                            # Convert metric if it's a ScorerWrapper
+                            if isinstance(condition.metric, ScorerWrapper):
+                                try:
+                                    condition_copy = condition.model_copy()
+                                    condition_copy.metric = condition.metric.load_implementation(use_judgment=use_judgment)
+                                    processed_conditions.append(condition_copy)
+                                except Exception as e:
+                                    raise ValueError(f"Failed to convert ScorerWrapper to implementation in rule '{rule.name}', condition metric '{condition.metric}': {str(e)}")
+                            else:
+                                processed_conditions.append(condition)
+
+                        # Create new rule with processed conditions
+                        new_rule = rule.model_copy()
+                        new_rule.conditions = processed_conditions
+                        loaded_rules.append(new_rule)
+                    except Exception as e:
+                        raise ValueError(f"Failed to process rule '{rule.name}': {str(e)}")
 
             evaluation_run = EvaluationRun(
                 log_results=log_results,
@@ -115,11 +225,15 @@ class JudgmentClient:
                 model=model,
                 aggregator=aggregator,
                 metadata=metadata,
-                judgment_api_key=self.judgment_api_key
+                judgment_api_key=self.judgment_api_key,
+                rules=loaded_rules,
+                organization_id=self.organization_id
             )
             return run_eval(evaluation_run)
         except ValueError as e:
             raise ValueError(f"Please check your EvaluationRun object, one or more fields are invalid: \n{str(e)}")
+        except Exception as e:
+            raise Exception(f"An unexpected error occurred during evaluation: {str(e)}")
 
     def create_dataset(self) -> EvalDataset:
         return self.eval_dataset_client.create_dataset()
@@ -189,9 +303,11 @@ class JudgmentClient:
         eval_run = requests.post(JUDGMENT_EVAL_FETCH_API_URL,
             headers={
                 "Content-Type": "application/json",
-                "Authorization": f"Bearer {self.judgment_api_key}"
+                "Authorization": f"Bearer {self.judgment_api_key}",
+                "X-Organization-Id": self.organization_id
             },
-            json=eval_run_request_body.model_dump())
+            json=eval_run_request_body.model_dump(),
+            verify=False)
         if eval_run.status_code != requests.codes.ok:
             raise ValueError(f"Error fetching eval results: {eval_run.json()}")
 
@@ -222,7 +338,8 @@ class JudgmentClient:
             json=eval_run_request_body.model_dump(),
             headers={
                 "Content-Type": "application/json",
-                "Authorization": f"Bearer {self.judgment_api_key}"
+                "Authorization": f"Bearer {self.judgment_api_key}",
+                "X-Organization-Id": self.organization_id
             })
         if response.status_code != requests.codes.ok:
             raise ValueError(f"Error deleting eval results: {response.json()}")
@@ -241,11 +358,12 @@ class JudgmentClient:
         response = requests.delete(JUDGMENT_EVAL_DELETE_PROJECT_API_URL,
             json={
                 "project_name": project_name,
-                "judgment_api_key": self.judgment_api_key
+                "judgment_api_key": self.judgment_api_key,
             },
             headers={
                 "Content-Type": "application/json",
-                "Authorization": f"Bearer {self.judgment_api_key}"
+                "Authorization": f"Bearer {self.judgment_api_key}",
+                "X-Organization-Id": self.organization_id
             })
         if response.status_code != requests.codes.ok:
             raise ValueError(f"Error deleting eval results: {response.json()}")
@@ -261,7 +379,8 @@ class JudgmentClient:
                 "Content-Type": "application/json",
                 "Authorization": f"Bearer {self.judgment_api_key}",
             },
-            json={} # Empty body now
+            json={}, # Empty body now
+            verify=False
         )
         if response.status_code == 200:
             return True, response.json()
@@ -283,7 +402,6 @@ class JudgmentClient:
         """
         request_body = {
             "slug": slug,
-            # "judgment_api_key": self.judgment_api_key
         }
 
         response = requests.post(
@@ -291,8 +409,10 @@ class JudgmentClient:
             json=request_body,
             headers={
                 "Content-Type": "application/json",
-                "Authorization": f"Bearer {self.judgment_api_key}"
-            }
+                "Authorization": f"Bearer {self.judgment_api_key}",
+                "X-Organization-Id": self.organization_id
+            },
+            verify=False
         )
 
         if response.status_code == 500:
@@ -325,7 +445,6 @@ class JudgmentClient:
             "name": scorer.name,
             "conversation": scorer.conversation,
             "options": scorer.options,
-            # "judgment_api_key": self.judgment_api_key,
             "slug": slug
         }
 
@@ -334,8 +453,10 @@ class JudgmentClient:
             json=request_body,
             headers={
                 "Content-Type": "application/json",
-                "Authorization": f"Bearer {self.judgment_api_key}"
-            }
+                "Authorization": f"Bearer {self.judgment_api_key}",
+                "X-Organization-Id": self.organization_id
+            },
+            verify=False
         )
 
         if response.status_code == 500:
@@ -358,9 +479,22 @@ class JudgmentClient:
         project_name: str = "default_project",
         eval_run_name: str = "default_eval_run",
         override: bool = False,
+        rules: Optional[List[Rule]] = None
     ) -> None:
         """
         Asserts a test by running the evaluation and checking the results for success
+
+        Args:
+            examples (List[Example]): The examples to evaluate
+            scorers (List[Union[APIJudgmentScorer, JudgevalScorer]]): A list of scorers to use for evaluation
+            model (Union[str, List[str], JudgevalJudge]): The model used as a judge when using LLM as a Judge
+            aggregator (Optional[str]): The aggregator to use for evaluation if using Mixture of Judges
+            metadata (Optional[Dict[str, Any]]): Additional metadata to include for this evaluation run
+            log_results (bool): Whether to log the results to the Judgment API
+            project_name (str): The name of the project the evaluation results belong to
+            eval_run_name (str): A name for this evaluation run
+            override (bool): Whether to override an existing evaluation run with the same name
+            rules (Optional[List[Rule]]): Rules to evaluate against scoring results
         """
         results = self.run_evaluation(
             examples=examples,
@@ -371,7 +505,8 @@ class JudgmentClient:
             log_results=log_results,
             project_name=project_name,
             eval_run_name=eval_run_name,
-            override=override
+            override=override,
+            rules=rules
         )
 
         assert_test(results)
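
`assert_test` gains the same `rules` parameter and simply forwards it to `run_evaluation` before asserting on the returned `ScoringResult`s. A hedged sketch reusing the objects from the earlier examples (client, examples, scorers, and rules are assumed to be constructed as shown above; the judge model string is illustrative):

```python
client.assert_test(
    examples=examples,
    scorers=api_scorers,
    model="gpt-4o",          # illustrative judge model
    eval_run_name="ci_gate",
    override=True,
    rules=my_rules,          # forwarded to run_evaluation, new in this release
)
```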