judgeval 0.0.13__py3-none-any.whl → 0.0.15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
judgeval/common/tracer.py CHANGED
@@ -41,11 +41,13 @@ from judgeval.constants import JUDGMENT_TRACES_SAVE_API_URL, JUDGMENT_TRACES_FET
  from judgeval.judgment_client import JudgmentClient
  from judgeval.data import Example
  from judgeval.scorers import APIJudgmentScorer, JudgevalScorer, ScorerWrapper
+ from judgeval.rules import Rule
+ from judgeval.evaluation_run import EvaluationRun
+ from judgeval.judges import JudgevalJudge
 
  from rich import print as rprint
 
  from judgeval.data.result import ScoringResult
- from judgeval.evaluation_run import EvaluationRun
 
  # Define type aliases for better code readability and maintainability
  ApiClient: TypeAlias = Union[OpenAI, Together, Anthropic]  # Supported API clients
@@ -188,8 +190,9 @@ class TraceManagerClient:
      - Saving a trace
      - Deleting a trace
      """
-     def __init__(self, judgment_api_key: str):
+     def __init__(self, judgment_api_key: str, organization_id: str):
          self.judgment_api_key = judgment_api_key
+         self.organization_id = organization_id
 
      def fetch_trace(self, trace_id: str):
          """
@@ -199,12 +202,13 @@ class TraceManagerClient:
              JUDGMENT_TRACES_FETCH_API_URL,
              json={
                  "trace_id": trace_id,
-                 # "judgment_api_key": self.judgment_api_key,
              },
              headers={
                  "Content-Type": "application/json",
-                 "Authorization": f"Bearer {self.judgment_api_key}"
-             }
+                 "Authorization": f"Bearer {self.judgment_api_key}",
+                 "X-Organization-Id": self.organization_id
+             },
+             verify=False
          )
 
          if response.status_code != HTTPStatus.OK:
@@ -226,8 +230,10 @@ class TraceManagerClient:
              json=trace_data,
              headers={
                  "Content-Type": "application/json",
-                 "Authorization": f"Bearer {self.judgment_api_key}"
-             }
+                 "Authorization": f"Bearer {self.judgment_api_key}",
+                 "X-Organization-Id": self.organization_id
+             },
+             verify=False
          )
 
          if response.status_code == HTTPStatus.BAD_REQUEST:
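
For reference, the request shape these save/fetch calls now produce looks roughly like the sketch below; the URL and credentials are placeholders, and note that verify=False disables TLS certificate verification in requests:

    import requests

    response = requests.post(
        "https://api.example.com/traces/save/",  # placeholder endpoint URL
        json={"trace_id": "..."},                # trace document payload
        headers={
            "Content-Type": "application/json",
            "Authorization": "Bearer sk-placeholder",  # API key auth
            "X-Organization-Id": "org-placeholder",    # new org-scoping header
        },
        verify=False,  # skips TLS certificate verification, matching the diff
    )
    response.raise_for_status()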
@@ -245,12 +251,12 @@ class TraceManagerClient:
          response = requests.delete(
              JUDGMENT_TRACES_DELETE_API_URL,
              json={
-                 "judgment_api_key": self.judgment_api_key,
                  "trace_ids": [trace_id],
              },
              headers={
                  "Content-Type": "application/json",
-                 "Authorization": f"Bearer {self.judgment_api_key}"
+                 "Authorization": f"Bearer {self.judgment_api_key}",
+                 "X-Organization-Id": self.organization_id
              }
          )
 
@@ -266,12 +272,12 @@ class TraceManagerClient:
          response = requests.delete(
              JUDGMENT_TRACES_DELETE_API_URL,
              json={
-                 # "judgment_api_key": self.judgment_api_key,
                  "trace_ids": trace_ids,
              },
              headers={
                  "Content-Type": "application/json",
-                 "Authorization": f"Bearer {self.judgment_api_key}"
+                 "Authorization": f"Bearer {self.judgment_api_key}",
+                 "X-Organization-Id": self.organization_id
              }
          )
 
@@ -283,18 +289,30 @@ class TraceManagerClient:
 
  class TraceClient:
      """Client for managing a single trace context"""
-     def __init__(self, tracer, trace_id: str, name: str, project_name: str = "default_project", overwrite: bool = False):
-         self.tracer = tracer
-         self.trace_id = trace_id
+
+     def __init__(
+         self,
+         tracer: Optional["Tracer"],
+         trace_id: Optional[str] = None,
+         name: str = "default",
+         project_name: str = "default_project",
+         overwrite: bool = False,
+         rules: Optional[List[Rule]] = None,
+     ):
          self.name = name
+         self.trace_id = trace_id or str(uuid.uuid4())
          self.project_name = project_name
+         self.overwrite = overwrite
+         self.tracer = tracer
+         # Initialize rules with either provided rules or an empty list
+         self.rules = rules or []
+
          self.client: JudgmentClient = tracer.client
          self.entries: List[TraceEntry] = []
          self.start_time = time.time()
          self.span_type = None
          self._current_span: Optional[TraceEntry] = None
-         self.overwrite = overwrite
-         self.trace_manager_client = TraceManagerClient(tracer.api_key)  # Manages DB operations for trace data
+         self.trace_manager_client = TraceManagerClient(tracer.api_key, tracer.organization_id)  # Manages DB operations for trace data
 
      @contextmanager
      def span(self, name: str, span_type: SpanType = "span"):
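
Given the new defaults, a construction sketch (assumes JUDGMENT_API_KEY and JUDGMENT_ORG_ID are set in the environment so the Tracer can initialize):

    import os
    from judgeval.common.tracer import Tracer, TraceClient

    tracer = Tracer(api_key=os.getenv("JUDGMENT_API_KEY"),
                    organization_id=os.getenv("JUDGMENT_ORG_ID"))
    # trace_id falls back to a fresh uuid4 and rules to [] when omitted
    trace = TraceClient(tracer, name="checkout-flow")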
@@ -346,7 +364,7 @@ class TraceClient:
          expected_tools: Optional[List[str]] = None,
          additional_metadata: Optional[Dict[str, Any]] = None,
          model: Optional[str] = None,
-         log_results: Optional[bool] = True,
+         log_results: Optional[bool] = True
      ):
          start_time = time.time()  # Record start time
          example = Example(
@@ -360,28 +378,68 @@ class TraceClient:
              additional_metadata=additional_metadata,
              trace_id=self.trace_id
          )
-
+         loaded_rules = None
+         if self.rules:
+             loaded_rules = []
+             for rule in self.rules:
+                 processed_conditions = []
+                 for condition in rule.conditions:
+                     # Convert metric if it's a ScorerWrapper
+                     try:
+                         if isinstance(condition.metric, ScorerWrapper):
+                             condition_copy = condition.model_copy()
+                             condition_copy.metric = condition.metric.load_implementation(use_judgment=True)
+                             processed_conditions.append(condition_copy)
+                         else:
+                             processed_conditions.append(condition)
+                     except Exception as e:
+                         warnings.warn(f"Failed to convert ScorerWrapper in rule '{rule.name}', condition metric '{condition.metric_name}': {str(e)}")
+                         processed_conditions.append(condition)  # Keep original condition as fallback
+
+                 # Create new rule with processed conditions
+                 new_rule = rule.model_copy()
+                 new_rule.conditions = processed_conditions
+                 loaded_rules.append(new_rule)
          try:
              # Load appropriate implementations for all scorers
-             loaded_scorers: List[Union[JudgevalScorer, APIJudgmentScorer]] = [
-                 scorer.load_implementation(use_judgment=True) if isinstance(scorer, ScorerWrapper) else scorer
-                 for scorer in scorers
-             ]
+             loaded_scorers: List[Union[JudgevalScorer, APIJudgmentScorer]] = []
+             for scorer in scorers:
+                 try:
+                     if isinstance(scorer, ScorerWrapper):
+                         loaded_scorers.append(scorer.load_implementation(use_judgment=True))
+                     else:
+                         loaded_scorers.append(scorer)
+                 except Exception as e:
+                     warnings.warn(f"Failed to load implementation for scorer {scorer}: {str(e)}")
+                     # Skip this scorer
+
+             if not loaded_scorers:
+                 warnings.warn("No valid scorers available for evaluation")
+                 return
+
+             # Prevent using JudgevalScorer with rules - only APIJudgmentScorer allowed with rules
+             if loaded_rules and any(isinstance(scorer, JudgevalScorer) for scorer in loaded_scorers):
+                 raise ValueError("Cannot use Judgeval scorers (only API scorers) when using rules. Please either remove rules or use only APIJudgmentScorer types.")
+
          except Exception as e:
-             raise ValueError(f"Failed to load scorers: {str(e)}")
+             warnings.warn(f"Failed to load scorers: {str(e)}")
+             return
 
+         # Combine the trace-level rules with any evaluation-specific rules
          eval_run = EvaluationRun(
+             organization_id=self.tracer.organization_id,
              log_results=log_results,
              project_name=self.project_name,
              eval_name=f"{self.name.capitalize()}-"
                        f"{self._current_span}-"
-                       f"[{','.join(scorer.load_implementation().score_type.capitalize() for scorer in scorers)}]",
+                       f"[{','.join(scorer.score_type.capitalize() for scorer in loaded_scorers)}]",
              examples=[example],
              scorers=loaded_scorers,
              model=model,
              metadata={},
              judgment_api_key=self.tracer.api_key,
-             override=self.overwrite
+             override=self.overwrite,
+             rules=loaded_rules  # Use the combined rules
          )
 
          self.add_eval_run(eval_run, start_time)  # Pass start_time to record_evaluation
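
The convert-with-fallback pattern applied to both rules and scorers above can be summarized in isolation. A hedged, standalone sketch of the idea (load_with_fallback, convert, and is_wrapper are illustrative names, not library code):

    import warnings

    def load_with_fallback(items, convert, is_wrapper):
        """Convert wrapped items, warning and skipping (or keeping) on failure."""
        loaded = []
        for item in items:
            try:
                loaded.append(convert(item) if is_wrapper(item) else item)
            except Exception as exc:
                # Mirrors the diff: degrade gracefully instead of raising
                warnings.warn(f"Failed to load implementation for {item}: {exc}")
        return loaded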
@@ -546,7 +604,6 @@ class TraceClient:
          # Create trace document
          trace_data = {
              "trace_id": self.trace_id,
-             "api_key": self.tracer.api_key,
              "name": self.name,
              "project_name": self.project_name,
              "created_at": datetime.fromtimestamp(self.start_time).isoformat(),
@@ -560,7 +617,6 @@ class TraceClient:
              "empty_save": empty_save,
              "overwrite": overwrite
          }
-
          # Execute asynchronous evaluation in the background
          if not empty_save:  # Only send to RabbitMQ if the trace is not empty
              connection = pika.BlockingConnection(
@@ -568,37 +624,23 @@ class TraceClient:
              channel = connection.channel()
 
              channel.queue_declare(queue=RABBITMQ_QUEUE, durable=True)
-
+             trace_data["judgment_api_key"] = self.tracer.api_key
+             trace_data["organization_id"] = self.tracer.organization_id
              channel.basic_publish(
                  exchange='',
                  routing_key=RABBITMQ_QUEUE,
                  body=json.dumps(trace_data),
                  properties=pika.BasicProperties(
-                     delivery_mode=pika.DeliveryMode.Transient  # Changed from Persistent to Transient
+                     delivery_mode=pika.DeliveryMode.Transient,  # Changed from Persistent to Transient
+                     headers={
+                         'api_key': self.tracer.api_key,
+                         'organization_id': self.tracer.organization_id
+                     }
                  ))
              connection.close()
 
          self.trace_manager_client.save_trace(trace_data, empty_save)
 
-
-         # Save trace data by making POST request to API
-         response = requests.post(
-             JUDGMENT_TRACES_SAVE_API_URL,
-             json=trace_data,
-             headers={
-                 "Content-Type": "application/json",
-                 "Authorization": f"Bearer {self.tracer.api_key}"  # Bearer token format
-             }
-         )
-
-         if response.status_code == HTTPStatus.BAD_REQUEST:
-             raise ValueError(f"Failed to save trace data: Check your Trace name for conflicts, set overwrite=True to overwrite existing traces: {response.text}")
-         elif response.status_code != HTTPStatus.OK:
-             raise ValueError(f"Failed to save trace data: {response.text}")
-
-         if not empty_save and "ui_results_url" in response.json():
-             rprint(f"\n🔍 You can view your trace data here: [rgb(106,0,255)]{response.json()['ui_results_url']}[/]\n")
-
          return self.trace_id, trace_data
 
      def delete(self):
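
A standalone sketch of the publish path above using pika's public API (host, queue, and credential values are placeholders; pika.DeliveryMode is available in pika 1.3+):

    import json
    import pika

    connection = pika.BlockingConnection(pika.ConnectionParameters(host="localhost", port=5672))
    channel = connection.channel()
    channel.queue_declare(queue="task_queue", durable=True)
    channel.basic_publish(
        exchange="",
        routing_key="task_queue",
        body=json.dumps({"trace_id": "placeholder"}),
        properties=pika.BasicProperties(
            delivery_mode=pika.DeliveryMode.Transient,  # not persisted to disk
            headers={"api_key": "sk-placeholder", "organization_id": "org-placeholder"},
        ),
    )
    connection.close()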
@@ -612,16 +654,26 @@ class Tracer:
              cls._instance = super(Tracer, cls).__new__(cls)
          return cls._instance
 
-     def __init__(self, api_key: str = os.getenv("JUDGMENT_API_KEY"), project_name: str = "default_project"):
+     def __init__(
+         self,
+         api_key: str = os.getenv("JUDGMENT_API_KEY"),
+         project_name: str = "default_project",
+         rules: Optional[List[Rule]] = None,  # Added rules parameter
+         organization_id: str = os.getenv("JUDGMENT_ORG_ID")):
          if not hasattr(self, 'initialized'):
              if not api_key:
                  raise ValueError("Tracer must be configured with a Judgment API key")
 
+             if not organization_id:
+                 raise ValueError("Tracer must be configured with an Organization ID")
+
              self.api_key: str = api_key
              self.project_name: str = project_name
              self.client: JudgmentClient = JudgmentClient(judgment_api_key=api_key)
+             self.organization_id: str = organization_id
              self.depth: int = 0
              self._current_trace: Optional[str] = None
+             self.rules: List[Rule] = rules or []  # Store rules at tracer level
              self.initialized: bool = True
          elif hasattr(self, 'project_name') and self.project_name != project_name:
              warnings.warn(
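
With the new signature, initialization can rely on the environment defaults; a minimal sketch (assumes JUDGMENT_API_KEY and JUDGMENT_ORG_ID are set):

    from judgeval.common.tracer import Tracer

    # Tracer is a singleton (see __new__ above): repeated construction returns
    # the same instance, and a differing project_name only triggers a warning.
    tracer = Tracer(project_name="my_project")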
@@ -632,11 +684,25 @@ class Tracer:
      )
 
      @contextmanager
-     def trace(self, name: str, project_name: str = None, overwrite: bool = False) -> Generator[TraceClient, None, None]:
+     def trace(
+         self,
+         name: str,
+         project_name: str = None,
+         overwrite: bool = False,
+         rules: Optional[List[Rule]] = None  # Added rules parameter
+     ) -> Generator[TraceClient, None, None]:
          """Start a new trace context using a context manager"""
          trace_id = str(uuid.uuid4())
          project = project_name if project_name is not None else self.project_name
-         trace = TraceClient(self, trace_id, name, project_name=project, overwrite=overwrite)
+
+         trace = TraceClient(
+             self,
+             trace_id,
+             name,
+             project_name=project,
+             overwrite=overwrite,
+             rules=self.rules  # Pass combined rules to the trace client
+         )
          prev_trace = self._current_trace
          self._current_trace = trace
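Caller-side usage is unchanged; trace-level rules now flow into the TraceClient automatically. A short sketch, continuing the Tracer from the previous example:

    with tracer.trace("checkout-flow") as trace:
        pass  # traced work happens here; trace carries tracer.rules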
 
@@ -680,9 +746,9 @@ class Tracer:
                  trace = self._current_trace
              else:
                  trace_id = str(uuid.uuid4())
-                 trace_name = str(uuid.uuid4())
+                 trace_name = func.__name__
                  project = project_name if project_name is not None else self.project_name
-                 trace = TraceClient(self, trace_id, trace_name, project_name=project, overwrite=overwrite)
+                 trace = TraceClient(self, trace_id, trace_name, project_name=project, overwrite=overwrite, rules=self.rules)
                  self._current_trace = trace
                  # Only save empty trace for the root call
                  trace.save(empty_save=True, overwrite=overwrite)
@@ -717,9 +783,9 @@ class Tracer:
                  trace = self._current_trace
              else:
                  trace_id = str(uuid.uuid4())
-                 trace_name = str(uuid.uuid4())
+                 trace_name = func.__name__
                  project = project_name if project_name is not None else self.project_name
-                 trace = TraceClient(self, trace_id, trace_name, project_name=project, overwrite=overwrite)
+                 trace = TraceClient(self, trace_id, trace_name, project_name=project, overwrite=overwrite, rules=self.rules)
                  self._current_trace = trace
                  # Only save empty trace for the root call
                  trace.save(empty_save=True, overwrite=overwrite)
@@ -752,14 +818,15 @@ def wrap(client: Any) -> Any:
      Wraps an API client to add tracing capabilities.
      Supports OpenAI, Together, and Anthropic clients.
      """
-     tracer = Tracer._instance  # Get the global tracer instance
-
      # Get the appropriate configuration for this client type
      span_name, original_create = _get_client_config(client)
 
      def traced_create(*args, **kwargs):
-         # Skip tracing if no active trace
-         if not (tracer and tracer._current_trace):
+         # Get the current tracer instance (might be created after the client was wrapped)
+         tracer = Tracer._instance
+
+         # Skip tracing if no tracer exists or no active trace
+         if not tracer or not tracer._current_trace:
              return original_create(*args, **kwargs)
 
          with tracer._current_trace.span(span_name, span_type="llm") as span:
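
A usage sketch for wrap. Because traced_create now resolves Tracer._instance at call time, the client may be wrapped before any Tracer exists; the OpenAI call shape is standard, the model name is a placeholder, and OPENAI_API_KEY plus the Judgment env vars are assumed to be set:

    from openai import OpenAI
    from judgeval.common.tracer import Tracer, wrap

    client = wrap(OpenAI())  # safe even though no Tracer exists yet
    tracer = Tracer()        # created after wrapping; still picked up

    with tracer.trace("llm-call"):
        client.chat.completions.create(
            model="gpt-4o-mini",  # placeholder model name
            messages=[{"role": "user", "content": "hello"}],
        )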
judgeval/common/utils.py CHANGED
@@ -21,7 +21,6 @@ from judgeval.clients import async_together_client, together_client
  from judgeval.constants import *
  from judgeval.common.logger import debug, error
 
- LITELLM_SUPPORTED_MODELS = set(litellm.model_list)
 
  class CustomModelParameters(pydantic.BaseModel):
      model_name: str
@@ -72,7 +71,7 @@ class ChatCompletionRequest(pydantic.BaseModel):
      def validate_model(cls, model):
          if not model:
              raise ValueError("Model cannot be empty")
-         if model not in TOGETHER_SUPPORTED_MODELS and model not in LITELLM_SUPPORTED_MODELS:
+         if model not in ACCEPTABLE_MODELS:
              raise ValueError(f"Model {model} is not in the list of supported models.")
          return model
 
@@ -114,13 +113,13 @@ def fetch_together_api_response(model: str, messages: List[Mapping], response_fo
      if request.response_format is not None:
          debug(f"Using response format: {request.response_format}")
          response = together_client.chat.completions.create(
-             model=TOGETHER_SUPPORTED_MODELS.get(request.model),
+             model=request.model,
              messages=request.messages,
              response_format=request.response_format
          )
      else:
          response = together_client.chat.completions.create(
-             model=TOGETHER_SUPPORTED_MODELS.get(request.model),
+             model=request.model,
              messages=request.messages,
          )
 
@@ -144,13 +143,13 @@ async def afetch_together_api_response(model: str, messages: List[Mapping], resp
      if request.response_format is not None:
          debug(f"Using response format: {request.response_format}")
          response = await async_together_client.chat.completions.create(
-             model=TOGETHER_SUPPORTED_MODELS.get(request.model),
+             model=request.model,
              messages=request.messages,
              response_format=request.response_format
          )
      else:
          response = await async_together_client.chat.completions.create(
-             model=TOGETHER_SUPPORTED_MODELS.get(request.model),
+             model=request.model,
              messages=request.messages,
          )
      return response.choices[0].message.content
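
Callers now pass the provider's full model id directly instead of an alias key; a hedged call sketch using one of the ids from the TOGETHER_SUPPORTED_MODELS list in constants.py below:

    from judgeval.common.utils import fetch_together_api_response

    answer = fetch_together_api_response(
        model="mistralai/Mixtral-8x7B-Instruct-v0.1",  # full id, no alias lookup
        messages=[{"role": "user", "content": "Say hi"}],
    )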
@@ -174,8 +173,8 @@ def query_together_api_multiple_calls(models: List[str], messages: List[List[Map
 
      # Validate all models are supported
      for model in models:
-         if model not in TOGETHER_SUPPORTED_MODELS:
-             raise ValueError(f"Model {model} is not in the list of supported TogetherAI models: {TOGETHER_SUPPORTED_MODELS}.")
+         if model not in ACCEPTABLE_MODELS:
+             raise ValueError(f"Model {model} is not in the list of supported models: {ACCEPTABLE_MODELS}.")
 
      # Validate input lengths match
      if response_formats is None:
@@ -223,8 +222,8 @@ async def aquery_together_api_multiple_calls(models: List[str], messages: List[L
 
      # Validate all models are supported
      for model in models:
-         if model not in TOGETHER_SUPPORTED_MODELS:
-             raise ValueError(f"Model {model} is not in the list of supported TogetherAI models: {TOGETHER_SUPPORTED_MODELS}.")
+         if model not in ACCEPTABLE_MODELS:
+             raise ValueError(f"Model {model} is not in the list of supported models: {ACCEPTABLE_MODELS}.")
 
      # Validate input lengths match
      if response_formats is None:
@@ -322,8 +321,8 @@ async def afetch_litellm_api_response(model: str, messages: List[Mapping], respo
      # Add validation
      validate_chat_messages(messages)
 
-     if model not in LITELLM_SUPPORTED_MODELS:
-         raise ValueError(f"Model {model} is not in the list of supported Litellm models: {LITELLM_SUPPORTED_MODELS}.")
+     if model not in ACCEPTABLE_MODELS:
+         raise ValueError(f"Model {model} is not in the list of supported models: {ACCEPTABLE_MODELS}.")
 
      if response_format is not None:
          response = await litellm.acompletion(
@@ -409,7 +408,7 @@ async def aquery_litellm_api_multiple_calls(models: List[str], messages: List[Ma
          models (List[str]): List of models to query
          messages (List[Mapping]): List of messages to query
          response_formats (List[pydantic.BaseModel], optional): A list of the format of the response if JSON forcing. Defaults to None.
-
+
      Returns:
          List[str]: Litellm responses for each model and message pair in order. Any exceptions in the thread call result in a None.
      """
judgeval/constants.py CHANGED
@@ -51,20 +51,71 @@ JUDGMENT_TRACES_DELETE_API_URL = f"{ROOT_API}/traces/delete/"
  RABBITMQ_HOST = os.getenv("RABBITMQ_HOST", "rabbitmq-networklb-faa155df16ec9085.elb.us-west-1.amazonaws.com")
  RABBITMQ_PORT = os.getenv("RABBITMQ_PORT", 5672)
  RABBITMQ_QUEUE = os.getenv("RABBITMQ_QUEUE", "task_queue")
-
  # Models
- TOGETHER_SUPPORTED_MODELS = {
-     "QWEN": "Qwen/Qwen2-72B-Instruct",
-     "LLAMA3_70B_INSTRUCT_TURBO": "meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo",
-     "LLAMA3_405B_INSTRUCT_TURBO": "meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo",
-     "LLAMA3_8B_INSTRUCT_TURBO": "meta-llama/Llama-3.2-3B-Instruct-Turbo",
-     "MISTRAL_8x22B_INSTRUCT": "mistralai/Mixtral-8x22B-Instruct-v0.1",
-     "MISTRAL_8x7B_INSTRUCT": "mistralai/Mixtral-8x7B-Instruct-v0.1",
- }
+ LITELLM_SUPPORTED_MODELS = set(litellm.model_list)
+
+ TOGETHER_SUPPORTED_MODELS = [
+     "meta-llama/Meta-Llama-3-70B-Instruct-Turbo",
+     "Qwen/Qwen2-VL-72B-Instruct",
+     "meta-llama/Llama-Vision-Free",
+     "Gryphe/MythoMax-L2-13b",
+     "Qwen/Qwen2.5-72B-Instruct-Turbo",
+     "meta-llama/Llama-3.3-70B-Instruct-Turbo-Free",
+     "deepseek-ai/DeepSeek-R1",
+     "meta-llama/Llama-3.2-90B-Vision-Instruct-Turbo",
+     "meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo",
+     "google/gemma-2-27b-it",
+     "mistralai/Mistral-Small-24B-Instruct-2501",
+     "mistralai/Mixtral-8x22B-Instruct-v0.1",
+     "meta-llama/Meta-Llama-3-8B-Instruct-Turbo",
+     "NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO",
+     "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo-classifier",
+     "deepseek-ai/DeepSeek-V3",
+     "Qwen/Qwen2-72B-Instruct",
+     "meta-llama/Meta-Llama-3-8B-Instruct-Lite",
+     "deepseek-ai/DeepSeek-R1-Distill-Llama-70B",
+     "upstage/SOLAR-10.7B-Instruct-v1.0",
+     "togethercomputer/MoA-1",
+     "Qwen/QwQ-32B-Preview",
+     "meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo",
+     "meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo",
+     "mistralai/Mistral-7B-Instruct-v0.2",
+     "databricks/dbrx-instruct",
+     "meta-llama/Llama-3-8b-chat-hf",
+     "google/gemma-2b-it",
+     "meta-llama/Meta-Llama-3-70B-Instruct-Lite",
+     "google/gemma-2-9b-it",
+     "meta-llama/Llama-3.3-70B-Instruct-Turbo",
+     "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo-p",
+     "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
+     "Gryphe/MythoMax-L2-13b-Lite",
+     "meta-llama/Llama-2-7b-chat-hf",
+     "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
+     "meta-llama/Llama-2-13b-chat-hf",
+     "scb10x/scb10x-llama3-typhoon-v1-5-8b-instruct",
+     "scb10x/scb10x-llama3-typhoon-v1-5x-4f316",
+     "nvidia/Llama-3.1-Nemotron-70B-Instruct-HF",
+     "Qwen/Qwen2.5-Coder-32B-Instruct",
+     "microsoft/WizardLM-2-8x22B",
+     "mistralai/Mistral-7B-Instruct-v0.3",
+     "scb10x/scb10x-llama3-1-typhoon2-60256",
+     "Qwen/Qwen2.5-7B-Instruct-Turbo",
+     "scb10x/scb10x-llama3-1-typhoon-18370",
+     "meta-llama/Llama-3.2-3B-Instruct-Turbo",
+     "meta-llama/Llama-3-70b-chat-hf",
+     "mistralai/Mixtral-8x7B-Instruct-v0.1",
+     "togethercomputer/MoA-1-Turbo",
+     "deepseek-ai/DeepSeek-R1-Distill-Llama-70B-free",
+     "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B",
+     "mistralai/Mistral-7B-Instruct-v0.1"
+ ]
 
  JUDGMENT_SUPPORTED_MODELS = {"osiris-large", "osiris-mini"}
 
- ACCEPTABLE_MODELS = set(litellm.model_list) | set(TOGETHER_SUPPORTED_MODELS.keys()) | JUDGMENT_SUPPORTED_MODELS
+ ACCEPTABLE_MODELS = set(litellm.model_list) | set(TOGETHER_SUPPORTED_MODELS) | JUDGMENT_SUPPORTED_MODELS
 
  ## System settings
  MAX_WORKER_THREADS = 10
+
+ # Maximum number of concurrent operations for evaluation runs
+ MAX_CONCURRENT_EVALUATIONS = 50  # Adjust based on system capabilities
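
Since TOGETHER_SUPPORTED_MODELS is now a flat list of provider model ids, it unions into ACCEPTABLE_MODELS directly rather than via .keys(); a quick membership sketch:

    from judgeval.constants import ACCEPTABLE_MODELS, TOGETHER_SUPPORTED_MODELS

    model_id = "mistralai/Mixtral-8x7B-Instruct-v0.1"
    assert model_id in TOGETHER_SUPPORTED_MODELS
    assert model_id in ACCEPTABLE_MODELS  # the union now includes Together ids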
@@ -17,9 +17,10 @@ class EvalDataset:
      _alias: Union[str, None] = field(default=None)
      _id: Union[str, None] = field(default=None)
      judgment_api_key: str = field(default="")
-
+     organization_id: str = field(default="")
      def __init__(self,
                   judgment_api_key: str = os.getenv("JUDGMENT_API_KEY"),
+                  organization_id: str = os.getenv("JUDGMENT_ORG_ID"),
                   ground_truths: List[GroundTruthExample] = [],
                   examples: List[Example] = [],
                   ):
@@ -31,7 +32,7 @@ class EvalDataset:
          self._alias = None
          self._id = None
          self.judgment_api_key = judgment_api_key
-
+         self.organization_id = organization_id
 
      def add_from_json(self, file_path: str) -> None:
          debug(f"Loading dataset from JSON file: {file_path}")
@@ -19,8 +19,9 @@ from judgeval.data.datasets.ground_truth import GroundTruthExample
 
 
  class EvalDatasetClient:
-     def __init__(self, judgment_api_key: str):
+     def __init__(self, judgment_api_key: str, organization_id: str):
          self.judgment_api_key = judgment_api_key
+         self.organization_id = organization_id
 
      def create_dataset(self) -> EvalDataset:
          return EvalDataset(judgment_api_key=self.judgment_api_key)
@@ -58,7 +59,6 @@ class EvalDatasetClient:
              "ground_truths": [g.to_dict() for g in dataset.ground_truths],
              "examples": [e.to_dict() for e in dataset.examples],
              "overwrite": overwrite,
-             # "judgment_api_key": dataset.judgment_api_key
          }
          try:
              response = requests.post(
@@ -66,8 +66,10 @@ class EvalDatasetClient:
                  json=content,
                  headers={
                      "Content-Type": "application/json",
-                     "Authorization": f"Bearer {self.judgment_api_key}"
-                 }
+                     "Authorization": f"Bearer {self.judgment_api_key}",
+                     "X-Organization-Id": self.organization_id
+                 },
+                 verify=False
              )
              if response.status_code == 500:
                  error(f"Server error during push: {content.get('message')}")
@@ -121,7 +123,6 @@ class EvalDatasetClient:
          )
          request_body = {
              "alias": alias,
-             # "judgment_api_key": self.judgment_api_key
          }
 
          try:
@@ -130,8 +131,10 @@ class EvalDatasetClient:
                  json=request_body,
                  headers={
                      "Content-Type": "application/json",
-                     "Authorization": f"Bearer {self.judgment_api_key}"
-                 }
+                     "Authorization": f"Bearer {self.judgment_api_key}",
+                     "X-Organization-Id": self.organization_id
+                 },
+                 verify=False
              )
              response.raise_for_status()
          except requests.exceptions.RequestException as e:
@@ -179,7 +182,6 @@ class EvalDatasetClient:
              total=100,
          )
          request_body = {
-             # "judgment_api_key": self.judgment_api_key
          }
 
          try:
@@ -188,8 +190,10 @@ class EvalDatasetClient:
                  json=request_body,
                  headers={
                      "Content-Type": "application/json",
-                     "Authorization": f"Bearer {self.judgment_api_key}"
-                 }
+                     "Authorization": f"Bearer {self.judgment_api_key}",
+                     "X-Organization-Id": self.organization_id
+                 },
+                 verify=False
              )
              response.raise_for_status()
          except requests.exceptions.RequestException as e:
@@ -232,13 +236,18 @@ class EvalDatasetClient:
              "alias": alias,
              "examples": [e.to_dict() for e in examples],
              "ground_truths": [g.to_dict() for g in ground_truths],
-             "judgment_api_key": self.judgment_api_key
          }
 
          try:
              response = requests.post(
                  JUDGMENT_DATASETS_EDIT_API_URL,
-                 json=content
+                 json=content,
+                 headers={
+                     "Content-Type": "application/json",
+                     "Authorization": f"Bearer {self.judgment_api_key}",
+                     "X-Organization-Id": self.organization_id
+                 },
+                 verify=False
              )
              response.raise_for_status()
          except requests.exceptions.RequestException as e:
@@ -266,9 +275,11 @@ class EvalDatasetClient:
              json={"alias": alias},
              headers={
                  "Content-Type": "application/json",
-                 "Authorization": f"Bearer {self.judgment_api_key}"
+                 "Authorization": f"Bearer {self.judgment_api_key}",
+                 "X-Organization-Id": self.organization_id
              },
-             stream=True
+             stream=True,
+             verify=False
          )
          response.raise_for_status()
      except requests.exceptions.HTTPError as err:
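
For the streaming pull above, the response body is consumed incrementally; a generic requests sketch with a placeholder URL:

    import requests

    with requests.post(
        "https://api.example.com/datasets/pull/",  # placeholder endpoint
        json={"alias": "my-dataset"},
        headers={
            "Authorization": "Bearer sk-placeholder",
            "X-Organization-Id": "org-placeholder",
        },
        stream=True,   # download the body lazily
        verify=False,
    ) as response:
        response.raise_for_status()
        for chunk in response.iter_content(chunk_size=8192):
            pass  # e.g., write chunk to a local file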