judgeval 0.0.14__py3-none-any.whl → 0.0.15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
judgeval/common/tracer.py CHANGED
@@ -41,11 +41,13 @@ from judgeval.constants import JUDGMENT_TRACES_SAVE_API_URL, JUDGMENT_TRACES_FET
  from judgeval.judgment_client import JudgmentClient
  from judgeval.data import Example
  from judgeval.scorers import APIJudgmentScorer, JudgevalScorer, ScorerWrapper
+ from judgeval.rules import Rule
+ from judgeval.evaluation_run import EvaluationRun
+ from judgeval.judges import JudgevalJudge

  from rich import print as rprint

  from judgeval.data.result import ScoringResult
- from judgeval.evaluation_run import EvaluationRun

  # Define type aliases for better code readability and maintainability
  ApiClient: TypeAlias = Union[OpenAI, Together, Anthropic] # Supported API clients
@@ -205,7 +207,8 @@ class TraceManagerClient:
  "Content-Type": "application/json",
  "Authorization": f"Bearer {self.judgment_api_key}",
  "X-Organization-Id": self.organization_id
- }
+ },
+ verify=False
  )

  if response.status_code != HTTPStatus.OK:
@@ -229,7 +232,8 @@ class TraceManagerClient:
  "Content-Type": "application/json",
  "Authorization": f"Bearer {self.judgment_api_key}",
  "X-Organization-Id": self.organization_id
- }
+ },
+ verify=False
  )

  if response.status_code == HTTPStatus.BAD_REQUEST:
@@ -285,17 +289,29 @@ class TraceManagerClient:

  class TraceClient:
  """Client for managing a single trace context"""
- def __init__(self, tracer, trace_id: str, name: str, project_name: str = "default_project", overwrite: bool = False):
- self.tracer = tracer
- self.trace_id = trace_id
+
+ def __init__(
+ self,
+ tracer: Optional["Tracer"],
+ trace_id: Optional[str] = None,
+ name: str = "default",
+ project_name: str = "default_project",
+ overwrite: bool = False,
+ rules: Optional[List[Rule]] = None,
+ ):
  self.name = name
+ self.trace_id = trace_id or str(uuid.uuid4())
  self.project_name = project_name
+ self.overwrite = overwrite
+ self.tracer = tracer
+ # Initialize rules with either provided rules or an empty list
+ self.rules = rules or []
+
  self.client: JudgmentClient = tracer.client
  self.entries: List[TraceEntry] = []
  self.start_time = time.time()
  self.span_type = None
  self._current_span: Optional[TraceEntry] = None
- self.overwrite = overwrite
  self.trace_manager_client = TraceManagerClient(tracer.api_key, tracer.organization_id) # Manages DB operations for trace data

  @contextmanager
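The constructor now generates a trace_id when one is not supplied and stores trace-level rules. A minimal construction sketch, assuming an already-initialized Tracer instance and a hypothetical list of judgeval.rules.Rule objects named my_rules (neither appears in this diff):

    # Hedged sketch based on the new signature above.
    trace = TraceClient(
        tracer,                          # an existing Tracer instance (assumed)
        name="checkout-flow",            # defaults to "default" if omitted
        project_name="default_project",
        rules=my_rules,                  # stored as [] when omitted
    )
    print(trace.trace_id)                # falls back to str(uuid.uuid4()) when trace_id is None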
@@ -348,7 +364,7 @@ class TraceClient:
  expected_tools: Optional[List[str]] = None,
  additional_metadata: Optional[Dict[str, Any]] = None,
  model: Optional[str] = None,
- log_results: Optional[bool] = True,
+ log_results: Optional[bool] = True
  ):
  start_time = time.time() # Record start time
  example = Example(
@@ -362,29 +378,68 @@ class TraceClient:
  additional_metadata=additional_metadata,
  trace_id=self.trace_id
  )
-
+ loaded_rules = None
+ if self.rules:
+ loaded_rules = []
+ for rule in self.rules:
+ processed_conditions = []
+ for condition in rule.conditions:
+ # Convert metric if it's a ScorerWrapper
+ try:
+ if isinstance(condition.metric, ScorerWrapper):
+ condition_copy = condition.model_copy()
+ condition_copy.metric = condition.metric.load_implementation(use_judgment=True)
+ processed_conditions.append(condition_copy)
+ else:
+ processed_conditions.append(condition)
+ except Exception as e:
+ warnings.warn(f"Failed to convert ScorerWrapper in rule '{rule.name}', condition metric '{condition.metric_name}': {str(e)}")
+ processed_conditions.append(condition) # Keep original condition as fallback
+
+ # Create new rule with processed conditions
+ new_rule = rule.model_copy()
+ new_rule.conditions = processed_conditions
+ loaded_rules.append(new_rule)
  try:
  # Load appropriate implementations for all scorers
- loaded_scorers: List[Union[JudgevalScorer, APIJudgmentScorer]] = [
- scorer.load_implementation(use_judgment=True) if isinstance(scorer, ScorerWrapper) else scorer
- for scorer in scorers
- ]
+ loaded_scorers: List[Union[JudgevalScorer, APIJudgmentScorer]] = []
+ for scorer in scorers:
+ try:
+ if isinstance(scorer, ScorerWrapper):
+ loaded_scorers.append(scorer.load_implementation(use_judgment=True))
+ else:
+ loaded_scorers.append(scorer)
+ except Exception as e:
+ warnings.warn(f"Failed to load implementation for scorer {scorer}: {str(e)}")
+ # Skip this scorer
+
+ if not loaded_scorers:
+ warnings.warn("No valid scorers available for evaluation")
+ return
+
+ # Prevent using JudgevalScorer with rules - only APIJudgmentScorer allowed with rules
+ if loaded_rules and any(isinstance(scorer, JudgevalScorer) for scorer in loaded_scorers):
+ raise ValueError("Cannot use Judgeval scorers (only API scorers) when using rules. Please either remove rules or use only APIJudgmentScorer types.")
+
  except Exception as e:
- raise ValueError(f"Failed to load scorers: {str(e)}")
+ warnings.warn(f"Failed to load scorers: {str(e)}")
+ return

+ # Combine the trace-level rules with any evaluation-specific rules)
  eval_run = EvaluationRun(
  organization_id=self.tracer.organization_id,
  log_results=log_results,
  project_name=self.project_name,
  eval_name=f"{self.name.capitalize()}-"
  f"{self._current_span}-"
- f"[{','.join(scorer.load_implementation().score_type.capitalize() for scorer in scorers)}]",
+ f"[{','.join(scorer.score_type.capitalize() for scorer in loaded_scorers)}]",
  examples=[example],
  scorers=loaded_scorers,
  model=model,
  metadata={},
  judgment_api_key=self.tracer.api_key,
- override=self.overwrite
+ override=self.overwrite,
+ rules=loaded_rules # Use the combined rules
  )

  self.add_eval_run(eval_run, start_time) # Pass start_time to record_evaluation
@@ -562,7 +617,6 @@ class TraceClient:
  "empty_save": empty_save,
  "overwrite": overwrite
  }
-
  # Execute asynchrous evaluation in the background
  if not empty_save: # Only send to RabbitMQ if the trace is not empty
  connection = pika.BlockingConnection(
@@ -572,13 +626,16 @@ class TraceClient:
  channel.queue_declare(queue=RABBITMQ_QUEUE, durable=True)
  trace_data["judgment_api_key"] = self.tracer.api_key
  trace_data["organization_id"] = self.tracer.organization_id
-
  channel.basic_publish(
  exchange='',
  routing_key=RABBITMQ_QUEUE,
  body=json.dumps(trace_data),
  properties=pika.BasicProperties(
- delivery_mode=pika.DeliveryMode.Transient # Changed from Persistent to Transient
+ delivery_mode=pika.DeliveryMode.Transient, # Changed from Persistent to Transient
+ headers={
+ 'api_key': self.tracer.api_key,
+ 'organization_id': self.tracer.organization_id
+ }
  ))
  connection.close()
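The API key and organization id now ride in the message headers as well as in the JSON body, so a queue consumer can read them without parsing the payload first. A minimal pika consumer sketch, reusing the same constants as the producer above; the callback logic is illustrative:

    import json
    import pika
    from judgeval.constants import RABBITMQ_HOST, RABBITMQ_PORT, RABBITMQ_QUEUE

    def handle_trace(ch, method, properties, body):
        # Headers added in this release; fall back gracefully for older producers.
        headers = properties.headers or {}
        org_id = headers.get("organization_id")
        trace_data = json.loads(body)
        print(f"trace {trace_data.get('trace_id')} from org {org_id}")
        ch.basic_ack(delivery_tag=method.delivery_tag)

    connection = pika.BlockingConnection(
        pika.ConnectionParameters(host=RABBITMQ_HOST, port=RABBITMQ_PORT))
    channel = connection.channel()
    channel.queue_declare(queue=RABBITMQ_QUEUE, durable=True)
    channel.basic_consume(queue=RABBITMQ_QUEUE, on_message_callback=handle_trace)
    channel.start_consuming()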
@@ -597,7 +654,12 @@ class Tracer:
  cls._instance = super(Tracer, cls).__new__(cls)
  return cls._instance

- def __init__(self, api_key: str = os.getenv("JUDGMENT_API_KEY"), project_name: str = "default_project", organization_id: str = os.getenv("ORGANIZATION_ID")):
+ def __init__(
+ self,
+ api_key: str = os.getenv("JUDGMENT_API_KEY"),
+ project_name: str = "default_project",
+ rules: Optional[List[Rule]] = None, # Added rules parameter
+ organization_id: str = os.getenv("JUDGMENT_ORG_ID")):
  if not hasattr(self, 'initialized'):
  if not api_key:
  raise ValueError("Tracer must be configured with a Judgment API key")
@@ -611,6 +673,7 @@ class Tracer:
  self.organization_id: str = organization_id
  self.depth: int = 0
  self._current_trace: Optional[str] = None
+ self.rules: List[Rule] = rules or [] # Store rules at tracer level
  self.initialized: bool = True
  elif hasattr(self, 'project_name') and self.project_name != project_name:
  warnings.warn(
@@ -621,11 +684,25 @@ class Tracer:
  )

  @contextmanager
- def trace(self, name: str, project_name: str = None, overwrite: bool = False) -> Generator[TraceClient, None, None]:
+ def trace(
+ self,
+ name: str,
+ project_name: str = None,
+ overwrite: bool = False,
+ rules: Optional[List[Rule]] = None # Added rules parameter
+ ) -> Generator[TraceClient, None, None]:
  """Start a new trace context using a context manager"""
  trace_id = str(uuid.uuid4())
  project = project_name if project_name is not None else self.project_name
- trace = TraceClient(self, trace_id, name, project_name=project, overwrite=overwrite)
+
+ trace = TraceClient(
+ self,
+ trace_id,
+ name,
+ project_name=project,
+ overwrite=overwrite,
+ rules=self.rules # Pass combined rules to the trace client
+ )
  prev_trace = self._current_trace
  self._current_trace = trace
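Tracer-level rules are stored once on the singleton and handed to every TraceClient it creates. A hedged setup sketch; my_rules stands in for a pre-built List[Rule] and is not taken from this diff:

    import os
    from judgeval.common.tracer import Tracer

    tracer = Tracer(
        api_key=os.getenv("JUDGMENT_API_KEY"),
        project_name="default_project",
        rules=my_rules,                           # assumed, pre-built Rule objects
        organization_id=os.getenv("JUDGMENT_ORG_ID"),
    )

    with tracer.trace("nightly-run") as trace:
        ...  # spans recorded here; the TraceClient receives tracer.rules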
@@ -669,9 +746,9 @@ class Tracer:
  trace = self._current_trace
  else:
  trace_id = str(uuid.uuid4())
- trace_name = str(uuid.uuid4())
+ trace_name = func.__name__
  project = project_name if project_name is not None else self.project_name
- trace = TraceClient(self, trace_id, trace_name, project_name=project, overwrite=overwrite)
+ trace = TraceClient(self, trace_id, trace_name, project_name=project, overwrite=overwrite, rules=self.rules)
  self._current_trace = trace
  # Only save empty trace for the root call
  trace.save(empty_save=True, overwrite=overwrite)
@@ -706,9 +783,9 @@ class Tracer:
  trace = self._current_trace
  else:
  trace_id = str(uuid.uuid4())
- trace_name = str(uuid.uuid4())
+ trace_name = func.__name__
  project = project_name if project_name is not None else self.project_name
- trace = TraceClient(self, trace_id, trace_name, project_name=project, overwrite=overwrite)
+ trace = TraceClient(self, trace_id, trace_name, project_name=project, overwrite=overwrite, rules=self.rules)
  self._current_trace = trace
  # Only save empty trace for the root call
  trace.save(empty_save=True, overwrite=overwrite)
@@ -741,14 +818,15 @@ def wrap(client: Any) -> Any:
  Wraps an API client to add tracing capabilities.
  Supports OpenAI, Together, and Anthropic clients.
  """
- tracer = Tracer._instance # Get the global tracer instance
-
  # Get the appropriate configuration for this client type
  span_name, original_create = _get_client_config(client)

  def traced_create(*args, **kwargs):
- # Skip tracing if no active trace
- if not (tracer and tracer._current_trace):
+ # Get the current tracer instance (might be created after client was wrapped)
+ tracer = Tracer._instance
+
+ # Skip tracing if no tracer exists or no active trace
+ if not tracer or not tracer._current_trace:
  return original_create(*args, **kwargs)

  with tracer._current_trace.span(span_name, span_type="llm") as span:
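Because traced_create now resolves Tracer._instance at call time, a client may be wrapped before the Tracer singleton exists. A hedged ordering sketch using an OpenAI client; the model name and message are illustrative, and the Tracer is assumed to read its keys from the environment:

    from openai import OpenAI
    from judgeval.common.tracer import Tracer, wrap

    client = wrap(OpenAI())   # wrapping first is fine: the tracer is looked up per call
    tracer = Tracer()         # singleton created afterwards

    with tracer.trace("chat-demo") as trace:
        resp = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{"role": "user", "content": "Hello"}],
        )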
judgeval/common/utils.py CHANGED
@@ -21,7 +21,6 @@ from judgeval.clients import async_together_client, together_client
  from judgeval.constants import *
  from judgeval.common.logger import debug, error

- LITELLM_SUPPORTED_MODELS = set(litellm.model_list)

  class CustomModelParameters(pydantic.BaseModel):
  model_name: str
@@ -72,7 +71,7 @@ class ChatCompletionRequest(pydantic.BaseModel):
  def validate_model(cls, model):
  if not model:
  raise ValueError("Model cannot be empty")
- if model not in TOGETHER_SUPPORTED_MODELS and model not in LITELLM_SUPPORTED_MODELS:
+ if model not in ACCEPTABLE_MODELS:
  raise ValueError(f"Model {model} is not in the list of supported models.")
  return model
@@ -114,13 +113,13 @@ def fetch_together_api_response(model: str, messages: List[Mapping], response_fo
  if request.response_format is not None:
  debug(f"Using response format: {request.response_format}")
  response = together_client.chat.completions.create(
- model=TOGETHER_SUPPORTED_MODELS.get(request.model),
+ model=request.model,
  messages=request.messages,
  response_format=request.response_format
  )
  else:
  response = together_client.chat.completions.create(
- model=TOGETHER_SUPPORTED_MODELS.get(request.model),
+ model=request.model,
  messages=request.messages,
  )
@@ -144,13 +143,13 @@ async def afetch_together_api_response(model: str, messages: List[Mapping], resp
  if request.response_format is not None:
  debug(f"Using response format: {request.response_format}")
  response = await async_together_client.chat.completions.create(
- model=TOGETHER_SUPPORTED_MODELS.get(request.model),
+ model=request.model,
  messages=request.messages,
  response_format=request.response_format
  )
  else:
  response = await async_together_client.chat.completions.create(
- model=TOGETHER_SUPPORTED_MODELS.get(request.model),
+ model=request.model,
  messages=request.messages,
  )
  return response.choices[0].message.content
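Both the synchronous and async helpers now forward the full Together model id unchanged instead of resolving an alias key. A hedged call sketch for the synchronous variant; the prompt content is illustrative:

    from judgeval.common.utils import fetch_together_api_response

    # The model string is passed to together_client as-is in this release.
    reply = fetch_together_api_response(
        model="Qwen/Qwen2-72B-Instruct",
        messages=[{"role": "user", "content": "Summarize the change in one sentence."}],
    )
    print(reply)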
@@ -174,8 +173,8 @@ def query_together_api_multiple_calls(models: List[str], messages: List[List[Map

  # Validate all models are supported
  for model in models:
- if model not in TOGETHER_SUPPORTED_MODELS:
- raise ValueError(f"Model {model} is not in the list of supported TogetherAI models: {TOGETHER_SUPPORTED_MODELS}.")
+ if model not in ACCEPTABLE_MODELS:
+ raise ValueError(f"Model {model} is not in the list of supported models: {ACCEPTABLE_MODELS}.")

  # Validate input lengths match
  if response_formats is None:
@@ -223,8 +222,8 @@ async def aquery_together_api_multiple_calls(models: List[str], messages: List[L

  # Validate all models are supported
  for model in models:
- if model not in TOGETHER_SUPPORTED_MODELS:
- raise ValueError(f"Model {model} is not in the list of supported TogetherAI models: {TOGETHER_SUPPORTED_MODELS}.")
+ if model not in ACCEPTABLE_MODELS:
+ raise ValueError(f"Model {model} is not in the list of supported models: {ACCEPTABLE_MODELS}.")

  # Validate input lengths match
  if response_formats is None:
@@ -322,8 +321,8 @@ async def afetch_litellm_api_response(model: str, messages: List[Mapping], respo
  # Add validation
  validate_chat_messages(messages)

- if model not in LITELLM_SUPPORTED_MODELS:
- raise ValueError(f"Model {model} is not in the list of supported Litellm models: {LITELLM_SUPPORTED_MODELS}.")
+ if model not in ACCEPTABLE_MODELS:
+ raise ValueError(f"Model {model} is not in the list of supported models: {ACCEPTABLE_MODELS}.")

  if response_format is not None:
  response = await litellm.acompletion(
@@ -409,7 +408,7 @@ async def aquery_litellm_api_multiple_calls(models: List[str], messages: List[Ma
  models (List[str]): List of models to query
  messages (List[Mapping]): List of messages to query
  response_formats (List[pydantic.BaseModel], optional): A list of the format of the response if JSON forcing. Defaults to None.
-
+
  Returns:
  List[str]: Litellm responses for each model and message pair in order. Any exceptions in the thread call result in a None.
  """
judgeval/constants.py CHANGED
@@ -51,20 +51,71 @@ JUDGMENT_TRACES_DELETE_API_URL = f"{ROOT_API}/traces/delete/"
  RABBITMQ_HOST = os.getenv("RABBITMQ_HOST", "rabbitmq-networklb-faa155df16ec9085.elb.us-west-1.amazonaws.com")
  RABBITMQ_PORT = os.getenv("RABBITMQ_PORT", 5672)
  RABBITMQ_QUEUE = os.getenv("RABBITMQ_QUEUE", "task_queue")
-
  # Models
- TOGETHER_SUPPORTED_MODELS = {
- "QWEN": "Qwen/Qwen2-72B-Instruct",
- "LLAMA3_70B_INSTRUCT_TURBO": "meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo",
- "LLAMA3_405B_INSTRUCT_TURBO": "meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo",
- "LLAMA3_8B_INSTRUCT_TURBO": "meta-llama/Llama-3.2-3B-Instruct-Turbo",
- "MISTRAL_8x22B_INSTRUCT": "mistralai/Mixtral-8x22B-Instruct-v0.1",
- "MISTRAL_8x7B_INSTRUCT": "mistralai/Mixtral-8x7B-Instruct-v0.1",
- }
+ LITELLM_SUPPORTED_MODELS = set(litellm.model_list)
+
+ TOGETHER_SUPPORTED_MODELS = [
+ "meta-llama/Meta-Llama-3-70B-Instruct-Turbo",
+ "Qwen/Qwen2-VL-72B-Instruct",
+ "meta-llama/Llama-Vision-Free",
+ "Gryphe/MythoMax-L2-13b",
+ "Qwen/Qwen2.5-72B-Instruct-Turbo",
+ "meta-llama/Llama-3.3-70B-Instruct-Turbo-Free",
+ "deepseek-ai/DeepSeek-R1",
+ "meta-llama/Llama-3.2-90B-Vision-Instruct-Turbo",
+ "meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo",
+ "google/gemma-2-27b-it",
+ "mistralai/Mistral-Small-24B-Instruct-2501",
+ "mistralai/Mixtral-8x22B-Instruct-v0.1",
+ "meta-llama/Meta-Llama-3-8B-Instruct-Turbo",
+ "NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO",
+ "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo-classifier",
+ "deepseek-ai/DeepSeek-V3",
+ "Qwen/Qwen2-72B-Instruct",
+ "meta-llama/Meta-Llama-3-8B-Instruct-Lite",
+ "deepseek-ai/DeepSeek-R1-Distill-Llama-70B",
+ "upstage/SOLAR-10.7B-Instruct-v1.0",
+ "togethercomputer/MoA-1",
+ "Qwen/QwQ-32B-Preview",
+ "meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo",
+ "meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo",
+ "mistralai/Mistral-7B-Instruct-v0.2",
+ "databricks/dbrx-instruct",
+ "meta-llama/Llama-3-8b-chat-hf",
+ "google/gemma-2b-it",
+ "meta-llama/Meta-Llama-3-70B-Instruct-Lite",
+ "google/gemma-2-9b-it",
+ "meta-llama/Llama-3.3-70B-Instruct-Turbo",
+ "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo-p",
+ "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
+ "Gryphe/MythoMax-L2-13b-Lite",
+ "meta-llama/Llama-2-7b-chat-hf",
+ "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
+ "meta-llama/Llama-2-13b-chat-hf",
+ "scb10x/scb10x-llama3-typhoon-v1-5-8b-instruct",
+ "scb10x/scb10x-llama3-typhoon-v1-5x-4f316",
+ "nvidia/Llama-3.1-Nemotron-70B-Instruct-HF",
+ "Qwen/Qwen2.5-Coder-32B-Instruct",
+ "microsoft/WizardLM-2-8x22B",
+ "mistralai/Mistral-7B-Instruct-v0.3",
+ "scb10x/scb10x-llama3-1-typhoon2-60256",
+ "Qwen/Qwen2.5-7B-Instruct-Turbo",
+ "scb10x/scb10x-llama3-1-typhoon-18370",
+ "meta-llama/Llama-3.2-3B-Instruct-Turbo",
+ "meta-llama/Llama-3-70b-chat-hf",
+ "mistralai/Mixtral-8x7B-Instruct-v0.1",
+ "togethercomputer/MoA-1-Turbo",
+ "deepseek-ai/DeepSeek-R1-Distill-Llama-70B-free",
+ "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B",
+ "mistralai/Mistral-7B-Instruct-v0.1"
+ ]

  JUDGMENT_SUPPORTED_MODELS = {"osiris-large", "osiris-mini"}

- ACCEPTABLE_MODELS = set(litellm.model_list) | set(TOGETHER_SUPPORTED_MODELS.keys()) | JUDGMENT_SUPPORTED_MODELS
+ ACCEPTABLE_MODELS = set(litellm.model_list) | set(TOGETHER_SUPPORTED_MODELS) | JUDGMENT_SUPPORTED_MODELS

  ## System settings
  MAX_WORKER_THREADS = 10
+
+ # Maximum number of concurrent operations for evaluation runs
+ MAX_CONCURRENT_EVALUATIONS = 50 # Adjust based on system capabilities
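TOGETHER_SUPPORTED_MODELS is now a flat list of full repository ids rather than an alias-to-id dict, and ACCEPTABLE_MODELS unions it with litellm's catalogue and the Judgment-hosted models. A quick membership-check sketch mirroring the validation used in utils.py:

    from judgeval.constants import ACCEPTABLE_MODELS, TOGETHER_SUPPORTED_MODELS

    assert "Qwen/Qwen2-72B-Instruct" in TOGETHER_SUPPORTED_MODELS

    def check_model(model: str) -> str:
        # Same check utils.py now performs before dispatching a request.
        if model not in ACCEPTABLE_MODELS:
            raise ValueError(f"Model {model} is not in the list of supported models.")
        return model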
@@ -20,7 +20,7 @@ class EvalDataset:
  organization_id: str = field(default="")
  def __init__(self,
  judgment_api_key: str = os.getenv("JUDGMENT_API_KEY"),
- organization_id: str = os.getenv("ORGANIZATION_ID"),
+ organization_id: str = os.getenv("JUDGMENT_ORG_ID"),
  ground_truths: List[GroundTruthExample] = [],
  examples: List[Example] = [],
  ):
@@ -68,7 +68,8 @@ class EvalDatasetClient:
  "Content-Type": "application/json",
  "Authorization": f"Bearer {self.judgment_api_key}",
  "X-Organization-Id": self.organization_id
- }
+ },
+ verify=False
  )
  if response.status_code == 500:
  error(f"Server error during push: {content.get('message')}")
@@ -132,7 +133,8 @@ class EvalDatasetClient:
  "Content-Type": "application/json",
  "Authorization": f"Bearer {self.judgment_api_key}",
  "X-Organization-Id": self.organization_id
- }
+ },
+ verify=False
  )
  response.raise_for_status()
  except requests.exceptions.RequestException as e:
@@ -190,7 +192,8 @@ class EvalDatasetClient:
  "Content-Type": "application/json",
  "Authorization": f"Bearer {self.judgment_api_key}",
  "X-Organization-Id": self.organization_id
- }
+ },
+ verify=False
  )
  response.raise_for_status()
  except requests.exceptions.RequestException as e:
@@ -233,7 +236,6 @@ class EvalDatasetClient:
  "alias": alias,
  "examples": [e.to_dict() for e in examples],
  "ground_truths": [g.to_dict() for g in ground_truths],
- "judgment_api_key": self.judgment_api_key
  }

  try:
@@ -244,7 +246,8 @@ class EvalDatasetClient:
  "Content-Type": "application/json",
  "Authorization": f"Bearer {self.judgment_api_key}",
  "X-Organization-Id": self.organization_id
- }
+ },
+ verify=False
  )
  response.raise_for_status()
  except requests.exceptions.RequestException as e:
@@ -275,7 +278,8 @@ class EvalDatasetClient:
  "Authorization": f"Bearer {self.judgment_api_key}",
  "X-Organization-Id": self.organization_id
  },
- stream=True
+ stream=True,
+ verify=False
  )
  response.raise_for_status()
  except requests.exceptions.HTTPError as err:
@@ -6,6 +6,7 @@ from judgeval.scorers import JudgevalScorer, APIJudgmentScorer
  from judgeval.constants import ACCEPTABLE_MODELS
  from judgeval.common.logger import debug, error
  from judgeval.judges import JudgevalJudge
+ from judgeval.rules import Rule

  class EvaluationRun(BaseModel):
  """
@@ -20,6 +21,7 @@ class EvaluationRun(BaseModel):
  aggregator (Optional[str]): The aggregator to use for evaluation if using Mixture of Judges
  metadata (Optional[Dict[str, Any]]): Additional metadata to include for this evaluation run, e.g. comments, dataset name, purpose, etc.
  judgment_api_key (Optional[str]): The API key for running evaluations on the Judgment API
+ rules (Optional[List[Rule]]): Rules to evaluate against scoring results
  """

  # The user will specify whether they want log_results when they call run_eval
@@ -35,6 +37,7 @@ class EvaluationRun(BaseModel):
  # API Key will be "" until user calls client.run_eval(), then API Key will be set
  judgment_api_key: Optional[str] = ""
  override: Optional[bool] = False
+ rules: Optional[List[Rule]] = None

  def model_dump(self, **kwargs):
  data = super().model_dump(**kwargs)
@@ -45,6 +48,11 @@ class EvaluationRun(BaseModel):
  else {"score_type": scorer.score_type, "threshold": scorer.threshold}
  for scorer in self.scorers
  ]
+
+ if self.rules:
+ # Process rules to ensure proper serialization
+ data["rules"] = [rule.model_dump() for rule in self.rules]
+
  return data

  @field_validator('log_results', mode='before')
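The override serializes rules the same way it already serializes scorers: each nested pydantic model is dumped explicitly. A self-contained sketch of that pattern; the Demo* classes are illustrative stand-ins, not the real Rule schema from judgeval.rules:

    from typing import List, Optional
    from pydantic import BaseModel

    class DemoCondition(BaseModel):
        metric_name: str
        threshold: float

    class DemoRule(BaseModel):
        name: str
        conditions: List[DemoCondition]

    class DemoRun(BaseModel):
        eval_name: str
        rules: Optional[List[DemoRule]] = None

        def model_dump(self, **kwargs):
            data = super().model_dump(**kwargs)
            if self.rules:
                # Same pattern as EvaluationRun.model_dump above.
                data["rules"] = [rule.model_dump() for rule in self.rules]
            return data

    run = DemoRun(
        eval_name="demo",
        rules=[DemoRule(name="faithfulness-gate",
                        conditions=[DemoCondition(metric_name="faithfulness", threshold=0.8)])],
    )
    print(run.model_dump()["rules"][0]["conditions"][0]["threshold"])  # 0.8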
@@ -14,7 +14,7 @@ BASE_CONVERSATION = [
  ]

  class TogetherJudge(JudgevalJudge):
- def __init__(self, model: str = "QWEN", **kwargs):
+ def __init__(self, model: str = "Qwen/Qwen2.5-72B-Instruct-Turbo", **kwargs):
  debug(f"Initializing TogetherJudge with model={model}")
  self.model = model
  self.kwargs = kwargs
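With the alias map gone, the default judge model is a full Together repo id, and custom models should be passed the same way. A brief sketch; the import path for TogetherJudge is not shown in this diff excerpt, so only the construction is illustrated:

    judge = TogetherJudge()  # defaults to "Qwen/Qwen2.5-72B-Instruct-Turbo"
    coding_judge = TogetherJudge(model="Qwen/Qwen2.5-Coder-32B-Instruct")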
judgeval/judges/utils.py CHANGED
@@ -39,7 +39,7 @@ def create_judge(
  Please either set the `use_judgment` flag to True or use
  non-Judgment models."""
  )
- if m not in LITELLM_SUPPORTED_MODELS and m not in TOGETHER_SUPPORTED_MODELS:
+ if m not in ACCEPTABLE_MODELS:
  raise InvalidJudgeModelError(f"Invalid judge model chosen: {m}")
  return MixtureOfJudges(models=model), True
  # If model is a string, check that it corresponds to a valid model