judgeval 0.0.13__py3-none-any.whl → 0.0.15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/common/tracer.py +126 -59
- judgeval/common/utils.py +12 -13
- judgeval/constants.py +61 -10
- judgeval/data/datasets/dataset.py +3 -2
- judgeval/data/datasets/eval_dataset_client.py +25 -14
- judgeval/data/example.py +8 -1
- judgeval/evaluation_run.py +9 -0
- judgeval/judges/together_judge.py +1 -1
- judgeval/judges/utils.py +1 -1
- judgeval/judgment_client.py +163 -28
- judgeval/rules.py +384 -0
- judgeval/run_evaluation.py +32 -14
- judgeval/scorers/api_scorer.py +11 -12
- judgeval/scorers/base_scorer.py +1 -1
- judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/answer_relevancy_scorer.py +0 -1
- judgeval/utils/alerts.py +43 -0
- {judgeval-0.0.13.dist-info → judgeval-0.0.15.dist-info}/METADATA +1 -1
- {judgeval-0.0.13.dist-info → judgeval-0.0.15.dist-info}/RECORD +20 -18
- {judgeval-0.0.13.dist-info → judgeval-0.0.15.dist-info}/WHEEL +0 -0
- {judgeval-0.0.13.dist-info → judgeval-0.0.15.dist-info}/licenses/LICENSE.md +0 -0
judgeval/common/tracer.py
CHANGED
@@ -41,11 +41,13 @@ from judgeval.constants import JUDGMENT_TRACES_SAVE_API_URL, JUDGMENT_TRACES_FET
 from judgeval.judgment_client import JudgmentClient
 from judgeval.data import Example
 from judgeval.scorers import APIJudgmentScorer, JudgevalScorer, ScorerWrapper
+from judgeval.rules import Rule
+from judgeval.evaluation_run import EvaluationRun
+from judgeval.judges import JudgevalJudge

 from rich import print as rprint

 from judgeval.data.result import ScoringResult
-from judgeval.evaluation_run import EvaluationRun

 # Define type aliases for better code readability and maintainability
 ApiClient: TypeAlias = Union[OpenAI, Together, Anthropic]  # Supported API clients
@@ -188,8 +190,9 @@ class TraceManagerClient:
         - Saving a trace
         - Deleting a trace
     """
-    def __init__(self, judgment_api_key: str):
+    def __init__(self, judgment_api_key: str, organization_id: str):
         self.judgment_api_key = judgment_api_key
+        self.organization_id = organization_id

     def fetch_trace(self, trace_id: str):
         """
@@ -199,12 +202,13 @@ class TraceManagerClient:
             JUDGMENT_TRACES_FETCH_API_URL,
             json={
                 "trace_id": trace_id,
-                # "judgment_api_key": self.judgment_api_key,
             },
             headers={
                 "Content-Type": "application/json",
-                "Authorization": f"Bearer {self.judgment_api_key}"
-            }
+                "Authorization": f"Bearer {self.judgment_api_key}",
+                "X-Organization-Id": self.organization_id
+            },
+            verify=False
         )

         if response.status_code != HTTPStatus.OK:
@@ -226,8 +230,10 @@ class TraceManagerClient:
             json=trace_data,
             headers={
                 "Content-Type": "application/json",
-                "Authorization": f"Bearer {self.judgment_api_key}"
-            }
+                "Authorization": f"Bearer {self.judgment_api_key}",
+                "X-Organization-Id": self.organization_id
+            },
+            verify=False
         )

         if response.status_code == HTTPStatus.BAD_REQUEST:
@@ -245,12 +251,12 @@ class TraceManagerClient:
         response = requests.delete(
             JUDGMENT_TRACES_DELETE_API_URL,
             json={
-                "judgment_api_key": self.judgment_api_key,
                 "trace_ids": [trace_id],
             },
             headers={
                 "Content-Type": "application/json",
-                "Authorization": f"Bearer {self.judgment_api_key}"
+                "Authorization": f"Bearer {self.judgment_api_key}",
+                "X-Organization-Id": self.organization_id
             }
         )

@@ -266,12 +272,12 @@ class TraceManagerClient:
         response = requests.delete(
             JUDGMENT_TRACES_DELETE_API_URL,
             json={
-                # "judgment_api_key": self.judgment_api_key,
                 "trace_ids": trace_ids,
             },
             headers={
                 "Content-Type": "application/json",
-                "Authorization": f"Bearer {self.judgment_api_key}"
+                "Authorization": f"Bearer {self.judgment_api_key}",
+                "X-Organization-Id": self.organization_id
             }
         )

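All TraceManagerClient endpoints now send the same header pair. A minimal sketch of the new request shape, with placeholder URL and credentials (the real client reads these from judgeval.constants and its stored key):

import requests

JUDGMENT_TRACES_FETCH_API_URL = "https://api.example.com/traces/fetch/"  # placeholder
judgment_api_key = "sk-placeholder"
organization_id = "org-placeholder"

response = requests.post(
    JUDGMENT_TRACES_FETCH_API_URL,
    json={"trace_id": "some-trace-id"},
    headers={
        "Content-Type": "application/json",
        "Authorization": f"Bearer {judgment_api_key}",
        "X-Organization-Id": organization_id,  # new in 0.0.15: requests are scoped to an organization
    },
    verify=False,  # mirrors the diff; note this disables TLS certificate verification
)
response.raise_for_status()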
@@ -283,18 +289,30 @@ class TraceManagerClient:

 class TraceClient:
     """Client for managing a single trace context"""
-
-
-        self
+
+    def __init__(
+        self,
+        tracer: Optional["Tracer"],
+        trace_id: Optional[str] = None,
+        name: str = "default",
+        project_name: str = "default_project",
+        overwrite: bool = False,
+        rules: Optional[List[Rule]] = None,
+    ):
         self.name = name
+        self.trace_id = trace_id or str(uuid.uuid4())
         self.project_name = project_name
+        self.overwrite = overwrite
+        self.tracer = tracer
+        # Initialize rules with either provided rules or an empty list
+        self.rules = rules or []
+
         self.client: JudgmentClient = tracer.client
         self.entries: List[TraceEntry] = []
         self.start_time = time.time()
         self.span_type = None
         self._current_span: Optional[TraceEntry] = None
-        self.
-        self.trace_manager_client = TraceManagerClient(tracer.api_key)  # Manages DB operations for trace data
+        self.trace_manager_client = TraceManagerClient(tracer.api_key, tracer.organization_id)  # Manages DB operations for trace data

     @contextmanager
     def span(self, name: str, span_type: SpanType = "span"):
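The widened constructor defaults trace_id with `trace_id or str(uuid.uuid4())`, so callers may omit the id. A self-contained illustration of that idiom (the helper name is hypothetical, not a judgeval API):

import uuid
from typing import Optional

def make_trace_id(trace_id: Optional[str] = None) -> str:
    # Mirrors TraceClient.__init__: keep a caller-supplied id, otherwise mint one
    return trace_id or str(uuid.uuid4())

assert make_trace_id("abc-123") == "abc-123"
assert len(make_trace_id()) == 36  # canonical UUID4 string length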
@@ -346,7 +364,7 @@ class TraceClient:
         expected_tools: Optional[List[str]] = None,
         additional_metadata: Optional[Dict[str, Any]] = None,
         model: Optional[str] = None,
-        log_results: Optional[bool] = True
+        log_results: Optional[bool] = True
     ):
         start_time = time.time()  # Record start time
         example = Example(
@@ -360,28 +378,68 @@ class TraceClient:
             additional_metadata=additional_metadata,
             trace_id=self.trace_id
         )
-
+        loaded_rules = None
+        if self.rules:
+            loaded_rules = []
+            for rule in self.rules:
+                processed_conditions = []
+                for condition in rule.conditions:
+                    # Convert metric if it's a ScorerWrapper
+                    try:
+                        if isinstance(condition.metric, ScorerWrapper):
+                            condition_copy = condition.model_copy()
+                            condition_copy.metric = condition.metric.load_implementation(use_judgment=True)
+                            processed_conditions.append(condition_copy)
+                        else:
+                            processed_conditions.append(condition)
+                    except Exception as e:
+                        warnings.warn(f"Failed to convert ScorerWrapper in rule '{rule.name}', condition metric '{condition.metric_name}': {str(e)}")
+                        processed_conditions.append(condition)  # Keep original condition as fallback
+
+                # Create new rule with processed conditions
+                new_rule = rule.model_copy()
+                new_rule.conditions = processed_conditions
+                loaded_rules.append(new_rule)
         try:
             # Load appropriate implementations for all scorers
-            loaded_scorers: List[Union[JudgevalScorer, APIJudgmentScorer]] = [
-
-
-
+            loaded_scorers: List[Union[JudgevalScorer, APIJudgmentScorer]] = []
+            for scorer in scorers:
+                try:
+                    if isinstance(scorer, ScorerWrapper):
+                        loaded_scorers.append(scorer.load_implementation(use_judgment=True))
+                    else:
+                        loaded_scorers.append(scorer)
+                except Exception as e:
+                    warnings.warn(f"Failed to load implementation for scorer {scorer}: {str(e)}")
+                    # Skip this scorer
+
+            if not loaded_scorers:
+                warnings.warn("No valid scorers available for evaluation")
+                return
+
+            # Prevent using JudgevalScorer with rules - only APIJudgmentScorer allowed with rules
+            if loaded_rules and any(isinstance(scorer, JudgevalScorer) for scorer in loaded_scorers):
+                raise ValueError("Cannot use Judgeval scorers (only API scorers) when using rules. Please either remove rules or use only APIJudgmentScorer types.")
+
         except Exception as e:
-
+            warnings.warn(f"Failed to load scorers: {str(e)}")
+            return

+        # Combine the trace-level rules with any evaluation-specific rules)
         eval_run = EvaluationRun(
+            organization_id=self.tracer.organization_id,
             log_results=log_results,
             project_name=self.project_name,
             eval_name=f"{self.name.capitalize()}-"
                       f"{self._current_span}-"
-                      f"[{','.join(scorer.
+                      f"[{','.join(scorer.score_type.capitalize() for scorer in loaded_scorers)}]",
             examples=[example],
             scorers=loaded_scorers,
             model=model,
             metadata={},
             judgment_api_key=self.tracer.api_key,
-            override=self.overwrite
+            override=self.overwrite,
+            rules=loaded_rules  # Use the combined rules
         )

         self.add_eval_run(eval_run, start_time)  # Pass start_time to record_evaluation
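Both loading loops above share one pattern: attempt a conversion, then warn and fall back (or skip) on failure rather than aborting the trace. A self-contained sketch of that pattern with stand-in types (these names are illustrative, not judgeval APIs):

import warnings

class Wrapper:
    # Stand-in for ScorerWrapper: resolves to a concrete implementation on demand
    def __init__(self, name: str, broken: bool = False):
        self.name, self.broken = name, broken

    def load_implementation(self):
        if self.broken:
            raise RuntimeError(f"no implementation for {self.name}")
        return f"{self.name}-impl"

def load_all(scorers):
    loaded = []
    for scorer in scorers:
        try:
            loaded.append(scorer.load_implementation() if isinstance(scorer, Wrapper) else scorer)
        except Exception as e:
            warnings.warn(f"Failed to load implementation for {scorer.name}: {e}")
            # Skip this scorer, as async_evaluate does above
    if not loaded:
        warnings.warn("No valid scorers available for evaluation")
    return loaded

print(load_all([Wrapper("faithfulness"), Wrapper("relevancy", broken=True)]))
# ['faithfulness-impl'], plus a warning for the scorer that failed to load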
@@ -546,7 +604,6 @@ class TraceClient:
         # Create trace document
         trace_data = {
             "trace_id": self.trace_id,
-            "api_key": self.tracer.api_key,
             "name": self.name,
             "project_name": self.project_name,
             "created_at": datetime.fromtimestamp(self.start_time).isoformat(),
@@ -560,7 +617,6 @@ class TraceClient:
             "empty_save": empty_save,
             "overwrite": overwrite
         }
-
         # Execute asynchrous evaluation in the background
         if not empty_save:  # Only send to RabbitMQ if the trace is not empty
             connection = pika.BlockingConnection(
@@ -568,37 +624,23 @@ class TraceClient:
             channel = connection.channel()

             channel.queue_declare(queue=RABBITMQ_QUEUE, durable=True)
-
+            trace_data["judgment_api_key"] = self.tracer.api_key
+            trace_data["organization_id"] = self.tracer.organization_id
             channel.basic_publish(
                 exchange='',
                 routing_key=RABBITMQ_QUEUE,
                 body=json.dumps(trace_data),
                 properties=pika.BasicProperties(
-                    delivery_mode=pika.DeliveryMode.Transient  # Changed from Persistent to Transient
+                    delivery_mode=pika.DeliveryMode.Transient,  # Changed from Persistent to Transient
+                    headers={
+                        'api_key': self.tracer.api_key,
+                        'organization_id': self.tracer.organization_id
+                    }
                 ))
             connection.close()

         self.trace_manager_client.save_trace(trace_data, empty_save)

-
-        # Save trace data by making POST request to API
-        response = requests.post(
-            JUDGMENT_TRACES_SAVE_API_URL,
-            json=trace_data,
-            headers={
-                "Content-Type": "application/json",
-                "Authorization": f"Bearer {self.tracer.api_key}"  # Bearer token format
-            }
-        )
-
-        if response.status_code == HTTPStatus.BAD_REQUEST:
-            raise ValueError(f"Failed to save trace data: Check your Trace name for conflicts, set overwrite=True to overwrite existing traces: {response.text}")
-        elif response.status_code != HTTPStatus.OK:
-            raise ValueError(f"Failed to save trace data: {response.text}")
-
-        if not empty_save and "ui_results_url" in response.json():
-            rprint(f"\n🔍 You can view your trace data here: [rgb(106,0,255)]{response.json()['ui_results_url']}[/]\n")
-
         return self.trace_id, trace_data

     def delete(self):
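The save path now queues the payload with the credentials duplicated into the AMQP message headers instead of POSTing directly to the API. A minimal pika sketch of that publish, assuming a local broker (host, queue, and credentials are placeholders):

import json
import pika

connection = pika.BlockingConnection(pika.ConnectionParameters(host="localhost"))
channel = connection.channel()
channel.queue_declare(queue="task_queue", durable=True)

trace_data = {"trace_id": "t-1", "judgment_api_key": "sk-placeholder", "organization_id": "org-placeholder"}
channel.basic_publish(
    exchange="",
    routing_key="task_queue",
    body=json.dumps(trace_data),
    properties=pika.BasicProperties(
        delivery_mode=pika.DeliveryMode.Transient,  # fire-and-forget, as in the diff
        headers={"api_key": "sk-placeholder", "organization_id": "org-placeholder"},
    ),
)
connection.close()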
@@ -612,16 +654,26 @@ class Tracer:
             cls._instance = super(Tracer, cls).__new__(cls)
         return cls._instance

-    def __init__(
+    def __init__(
+        self,
+        api_key: str = os.getenv("JUDGMENT_API_KEY"),
+        project_name: str = "default_project",
+        rules: Optional[List[Rule]] = None,  # Added rules parameter
+        organization_id: str = os.getenv("JUDGMENT_ORG_ID")):
         if not hasattr(self, 'initialized'):
             if not api_key:
                 raise ValueError("Tracer must be configured with a Judgment API key")

+            if not organization_id:
+                raise ValueError("Tracer must be configured with an Organization ID")
+
             self.api_key: str = api_key
             self.project_name: str = project_name
             self.client: JudgmentClient = JudgmentClient(judgment_api_key=api_key)
+            self.organization_id: str = organization_id
             self.depth: int = 0
             self._current_trace: Optional[str] = None
+            self.rules: List[Rule] = rules or []  # Store rules at tracer level
             self.initialized: bool = True
         elif hasattr(self, 'project_name') and self.project_name != project_name:
             warnings.warn(
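With the new signature, initializing the singleton requires both credentials. A usage sketch (values are placeholders; both default to the JUDGMENT_API_KEY and JUDGMENT_ORG_ID environment variables):

from judgeval.common.tracer import Tracer

tracer = Tracer(
    api_key="sk-placeholder",           # or export JUDGMENT_API_KEY
    organization_id="org-placeholder",  # or export JUDGMENT_ORG_ID; omitting it now raises ValueError
    project_name="default_project",
    rules=None,                         # optional tracer-level Rule list
)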
@@ -632,11 +684,25 @@ class Tracer:
         )

     @contextmanager
-    def trace(
+    def trace(
+        self,
+        name: str,
+        project_name: str = None,
+        overwrite: bool = False,
+        rules: Optional[List[Rule]] = None  # Added rules parameter
+    ) -> Generator[TraceClient, None, None]:
         """Start a new trace context using a context manager"""
         trace_id = str(uuid.uuid4())
         project = project_name if project_name is not None else self.project_name
-
+
+        trace = TraceClient(
+            self,
+            trace_id,
+            name,
+            project_name=project,
+            overwrite=overwrite,
+            rules=self.rules  # Pass combined rules to the trace client
+        )
         prev_trace = self._current_trace
         self._current_trace = trace

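Callers use the context manager as before; the trace now simply inherits the tracer-level rules. A sketch, continuing from the tracer above:

with tracer.trace("checkout-flow", overwrite=True) as trace:
    # spans and evaluations recorded in this block belong to the trace
    pass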
@@ -680,9 +746,9 @@ class Tracer:
                 trace = self._current_trace
             else:
                 trace_id = str(uuid.uuid4())
-                trace_name =
+                trace_name = func.__name__
                 project = project_name if project_name is not None else self.project_name
-                trace = TraceClient(self, trace_id, trace_name, project_name=project, overwrite=overwrite)
+                trace = TraceClient(self, trace_id, trace_name, project_name=project, overwrite=overwrite, rules=self.rules)
                 self._current_trace = trace
                 # Only save empty trace for the root call
                 trace.save(empty_save=True, overwrite=overwrite)
@@ -717,9 +783,9 @@ class Tracer:
                 trace = self._current_trace
             else:
                 trace_id = str(uuid.uuid4())
-                trace_name =
+                trace_name = func.__name__
                 project = project_name if project_name is not None else self.project_name
-                trace = TraceClient(self, trace_id, trace_name, project_name=project, overwrite=overwrite)
+                trace = TraceClient(self, trace_id, trace_name, project_name=project, overwrite=overwrite, rules=self.rules)
                 self._current_trace = trace
                 # Only save empty trace for the root call
                 trace.save(empty_save=True, overwrite=overwrite)
@@ -752,14 +818,15 @@ def wrap(client: Any) -> Any:
     Wraps an API client to add tracing capabilities.
     Supports OpenAI, Together, and Anthropic clients.
     """
-    tracer = Tracer._instance  # Get the global tracer instance
-
     # Get the appropriate configuration for this client type
     span_name, original_create = _get_client_config(client)

     def traced_create(*args, **kwargs):
-        #
-
+        # Get the current tracer instance (might be created after client was wrapped)
+        tracer = Tracer._instance
+
+        # Skip tracing if no tracer exists or no active trace
+        if not tracer or not tracer._current_trace:
             return original_create(*args, **kwargs)

         with tracer._current_trace.span(span_name, span_type="llm") as span:
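Because traced_create now looks the tracer up per call, wrap can run before any Tracer exists and simply passes calls through until a trace is active. A sketch, assuming an OpenAI client and the tracer from above:

from openai import OpenAI
from judgeval.common.tracer import wrap

client = wrap(OpenAI())  # safe even if no Tracer has been constructed yet

with tracer.trace("qa-run") as trace:
    # recorded as an "llm" span on the active trace
    client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": "hello"}],
    )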
judgeval/common/utils.py
CHANGED
@@ -21,7 +21,6 @@ from judgeval.clients import async_together_client, together_client
 from judgeval.constants import *
 from judgeval.common.logger import debug, error

-LITELLM_SUPPORTED_MODELS = set(litellm.model_list)

 class CustomModelParameters(pydantic.BaseModel):
     model_name: str
|
|
72
71
|
def validate_model(cls, model):
|
73
72
|
if not model:
|
74
73
|
raise ValueError("Model cannot be empty")
|
75
|
-
if model not in
|
74
|
+
if model not in ACCEPTABLE_MODELS:
|
76
75
|
raise ValueError(f"Model {model} is not in the list of supported models.")
|
77
76
|
return model
|
78
77
|
|
@@ -114,13 +113,13 @@ def fetch_together_api_response(model: str, messages: List[Mapping], response_fo
     if request.response_format is not None:
         debug(f"Using response format: {request.response_format}")
         response = together_client.chat.completions.create(
-            model=
+            model=request.model,
             messages=request.messages,
             response_format=request.response_format
         )
     else:
         response = together_client.chat.completions.create(
-            model=
+            model=request.model,
             messages=request.messages,
         )

@@ -144,13 +143,13 @@ async def afetch_together_api_response(model: str, messages: List[Mapping], resp
     if request.response_format is not None:
         debug(f"Using response format: {request.response_format}")
         response = await async_together_client.chat.completions.create(
-            model=
+            model=request.model,
             messages=request.messages,
             response_format=request.response_format
         )
     else:
         response = await async_together_client.chat.completions.create(
-            model=
+            model=request.model,
             messages=request.messages,
         )
     return response.choices[0].message.content
@@ -174,8 +173,8 @@ def query_together_api_multiple_calls(models: List[str], messages: List[List[Map

     # Validate all models are supported
     for model in models:
-        if model not in
-            raise ValueError(f"Model {model} is not in the list of supported
+        if model not in ACCEPTABLE_MODELS:
+            raise ValueError(f"Model {model} is not in the list of supported models: {ACCEPTABLE_MODELS}.")

     # Validate input lengths match
     if response_formats is None:
@@ -223,8 +222,8 @@ async def aquery_together_api_multiple_calls(models: List[str], messages: List[L

     # Validate all models are supported
     for model in models:
-        if model not in
-            raise ValueError(f"Model {model} is not in the list of supported
+        if model not in ACCEPTABLE_MODELS:
+            raise ValueError(f"Model {model} is not in the list of supported models: {ACCEPTABLE_MODELS}.")

     # Validate input lengths match
     if response_formats is None:
@@ -322,8 +321,8 @@ async def afetch_litellm_api_response(model: str, messages: List[Mapping], respo
     # Add validation
     validate_chat_messages(messages)

-    if model not in
-        raise ValueError(f"Model {model} is not in the list of supported
+    if model not in ACCEPTABLE_MODELS:
+        raise ValueError(f"Model {model} is not in the list of supported models: {ACCEPTABLE_MODELS}.")

     if response_format is not None:
         response = await litellm.acompletion(
@@ -409,7 +408,7 @@ async def aquery_litellm_api_multiple_calls(models: List[str], messages: List[Ma
         models (List[str]): List of models to query
         messages (List[Mapping]): List of messages to query
         response_formats (List[pydantic.BaseModel], optional): A list of the format of the response if JSON forcing. Defaults to None.
-
+
     Returns:
         List[str]: Litellm responses for each model and message pair in order. Any exceptions in the thread call result in a None.
     """
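Every call site above now validates against the shared ACCEPTABLE_MODELS set from judgeval.constants instead of a module-local list. A standalone sketch of the check (the set here is a stand-in, not the real constant):

ACCEPTABLE_MODELS = {"gpt-4o", "deepseek-ai/DeepSeek-V3", "osiris-mini"}  # stand-in

def validate_models(models):
    for model in models:
        if model not in ACCEPTABLE_MODELS:
            raise ValueError(f"Model {model} is not in the list of supported models: {ACCEPTABLE_MODELS}.")

validate_models(["gpt-4o"])           # passes silently
# validate_models(["unknown-model"])  # would raise ValueError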
judgeval/constants.py
CHANGED
@@ -51,20 +51,71 @@ JUDGMENT_TRACES_DELETE_API_URL = f"{ROOT_API}/traces/delete/"
 RABBITMQ_HOST = os.getenv("RABBITMQ_HOST", "rabbitmq-networklb-faa155df16ec9085.elb.us-west-1.amazonaws.com")
 RABBITMQ_PORT = os.getenv("RABBITMQ_PORT", 5672)
 RABBITMQ_QUEUE = os.getenv("RABBITMQ_QUEUE", "task_queue")
-
 # Models
-
-
-
-
-
-
-
-
+LITELLM_SUPPORTED_MODELS = set(litellm.model_list)
+
+TOGETHER_SUPPORTED_MODELS = [
+    "meta-llama/Meta-Llama-3-70B-Instruct-Turbo",
+    "Qwen/Qwen2-VL-72B-Instruct",
+    "meta-llama/Llama-Vision-Free",
+    "Gryphe/MythoMax-L2-13b",
+    "Qwen/Qwen2.5-72B-Instruct-Turbo",
+    "meta-llama/Llama-3.3-70B-Instruct-Turbo-Free",
+    "deepseek-ai/DeepSeek-R1",
+    "meta-llama/Llama-3.2-90B-Vision-Instruct-Turbo",
+    "meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo",
+    "google/gemma-2-27b-it",
+    "mistralai/Mistral-Small-24B-Instruct-2501",
+    "mistralai/Mixtral-8x22B-Instruct-v0.1",
+    "meta-llama/Meta-Llama-3-8B-Instruct-Turbo",
+    "NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO",
+    "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo-classifier",
+    "deepseek-ai/DeepSeek-V3",
+    "Qwen/Qwen2-72B-Instruct",
+    "meta-llama/Meta-Llama-3-8B-Instruct-Lite",
+    "deepseek-ai/DeepSeek-R1-Distill-Llama-70B",
+    "upstage/SOLAR-10.7B-Instruct-v1.0",
+    "togethercomputer/MoA-1",
+    "Qwen/QwQ-32B-Preview",
+    "meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo",
+    "meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo",
+    "mistralai/Mistral-7B-Instruct-v0.2",
+    "databricks/dbrx-instruct",
+    "meta-llama/Llama-3-8b-chat-hf",
+    "google/gemma-2b-it",
+    "meta-llama/Meta-Llama-3-70B-Instruct-Lite",
+    "google/gemma-2-9b-it",
+    "meta-llama/Llama-3.3-70B-Instruct-Turbo",
+    "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo-p",
+    "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
+    "Gryphe/MythoMax-L2-13b-Lite",
+    "meta-llama/Llama-2-7b-chat-hf",
+    "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
+    "meta-llama/Llama-2-13b-chat-hf",
+    "scb10x/scb10x-llama3-typhoon-v1-5-8b-instruct",
+    "scb10x/scb10x-llama3-typhoon-v1-5x-4f316",
+    "nvidia/Llama-3.1-Nemotron-70B-Instruct-HF",
+    "Qwen/Qwen2.5-Coder-32B-Instruct",
+    "microsoft/WizardLM-2-8x22B",
+    "mistralai/Mistral-7B-Instruct-v0.3",
+    "scb10x/scb10x-llama3-1-typhoon2-60256",
+    "Qwen/Qwen2.5-7B-Instruct-Turbo",
+    "scb10x/scb10x-llama3-1-typhoon-18370",
+    "meta-llama/Llama-3.2-3B-Instruct-Turbo",
+    "meta-llama/Llama-3-70b-chat-hf",
+    "mistralai/Mixtral-8x7B-Instruct-v0.1",
+    "togethercomputer/MoA-1-Turbo",
+    "deepseek-ai/DeepSeek-R1-Distill-Llama-70B-free",
+    "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B",
+    "mistralai/Mistral-7B-Instruct-v0.1"
+]

 JUDGMENT_SUPPORTED_MODELS = {"osiris-large", "osiris-mini"}

-ACCEPTABLE_MODELS = set(litellm.model_list) | set(TOGETHER_SUPPORTED_MODELS
+ACCEPTABLE_MODELS = set(litellm.model_list) | set(TOGETHER_SUPPORTED_MODELS) | JUDGMENT_SUPPORTED_MODELS

 ## System settings
 MAX_WORKER_THREADS = 10
+
+# Maximum number of concurrent operations for evaluation runs
+MAX_CONCURRENT_EVALUATIONS = 50  # Adjust based on system capabilities
judgeval/data/datasets/dataset.py
CHANGED
@@ -17,9 +17,10 @@ class EvalDataset:
     _alias: Union[str, None] = field(default=None)
     _id: Union[str, None] = field(default=None)
     judgment_api_key: str = field(default="")
-
+    organization_id: str = field(default="")
     def __init__(self,
                  judgment_api_key: str = os.getenv("JUDGMENT_API_KEY"),
+                 organization_id: str = os.getenv("JUDGMENT_ORG_ID"),
                  ground_truths: List[GroundTruthExample] = [],
                  examples: List[Example] = [],
                  ):
@@ -31,7 +32,7 @@ class EvalDataset:
         self._alias = None
         self._id = None
         self.judgment_api_key = judgment_api_key
-
+        self.organization_id = organization_id

     def add_from_json(self, file_path: str) -> None:
         debug(f"Loading dataset from JSON file: {file_path}")
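The dataset now carries a second credential that also defaults from the environment. A construction sketch (placeholder values; the module path is taken from the file shown):

from judgeval.data.datasets.dataset import EvalDataset

dataset = EvalDataset(
    judgment_api_key="sk-placeholder",   # default: os.getenv("JUDGMENT_API_KEY")
    organization_id="org-placeholder",   # default: os.getenv("JUDGMENT_ORG_ID")
    ground_truths=[],
    examples=[],
)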
judgeval/data/datasets/eval_dataset_client.py
CHANGED
@@ -19,8 +19,9 @@ from judgeval.data.datasets.ground_truth import GroundTruthExample


 class EvalDatasetClient:
-    def __init__(self, judgment_api_key: str):
+    def __init__(self, judgment_api_key: str, organization_id: str):
         self.judgment_api_key = judgment_api_key
+        self.organization_id = organization_id

     def create_dataset(self) -> EvalDataset:
         return EvalDataset(judgment_api_key=self.judgment_api_key)
@@ -58,7 +59,6 @@ class EvalDatasetClient:
             "ground_truths": [g.to_dict() for g in dataset.ground_truths],
             "examples": [e.to_dict() for e in dataset.examples],
             "overwrite": overwrite,
-            # "judgment_api_key": dataset.judgment_api_key
         }
         try:
             response = requests.post(
@@ -66,8 +66,10 @@ class EvalDatasetClient:
                 json=content,
                 headers={
                     "Content-Type": "application/json",
-                    "Authorization": f"Bearer {self.judgment_api_key}"
-                }
+                    "Authorization": f"Bearer {self.judgment_api_key}",
+                    "X-Organization-Id": self.organization_id
+                },
+                verify=False
             )
             if response.status_code == 500:
                 error(f"Server error during push: {content.get('message')}")
@@ -121,7 +123,6 @@ class EvalDatasetClient:
         )
         request_body = {
             "alias": alias,
-            # "judgment_api_key": self.judgment_api_key
         }

         try:
@@ -130,8 +131,10 @@ class EvalDatasetClient:
                 json=request_body,
                 headers={
                     "Content-Type": "application/json",
-                    "Authorization": f"Bearer {self.judgment_api_key}"
-                }
+                    "Authorization": f"Bearer {self.judgment_api_key}",
+                    "X-Organization-Id": self.organization_id
+                },
+                verify=False
             )
             response.raise_for_status()
         except requests.exceptions.RequestException as e:
@@ -179,7 +182,6 @@ class EvalDatasetClient:
             total=100,
         )
         request_body = {
-            # "judgment_api_key": self.judgment_api_key
         }

         try:
@@ -188,8 +190,10 @@ class EvalDatasetClient:
                 json=request_body,
                 headers={
                     "Content-Type": "application/json",
-                    "Authorization": f"Bearer {self.judgment_api_key}"
-                }
+                    "Authorization": f"Bearer {self.judgment_api_key}",
+                    "X-Organization-Id": self.organization_id
+                },
+                verify=False
             )
             response.raise_for_status()
         except requests.exceptions.RequestException as e:
@@ -232,13 +236,18 @@ class EvalDatasetClient:
             "alias": alias,
             "examples": [e.to_dict() for e in examples],
             "ground_truths": [g.to_dict() for g in ground_truths],
-            "judgment_api_key": self.judgment_api_key
         }

         try:
             response = requests.post(
                 JUDGMENT_DATASETS_EDIT_API_URL,
-                json=content
+                json=content,
+                headers={
+                    "Content-Type": "application/json",
+                    "Authorization": f"Bearer {self.judgment_api_key}",
+                    "X-Organization-Id": self.organization_id
+                },
+                verify=False
             )
             response.raise_for_status()
         except requests.exceptions.RequestException as e:
@@ -266,9 +275,11 @@ class EvalDatasetClient:
                 json={"alias": alias},
                 headers={
                     "Content-Type": "application/json",
-                    "Authorization": f"Bearer {self.judgment_api_key}"
+                    "Authorization": f"Bearer {self.judgment_api_key}",
+                    "X-Organization-Id": self.organization_id
                 },
-                stream=True
+                stream=True,
+                verify=False
             )
             response.raise_for_status()
         except requests.exceptions.HTTPError as err:
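Every dataset endpoint now sends the same Authorization and X-Organization-Id pair as the trace endpoints. A construction sketch (placeholder credentials; the module path is taken from the file shown, and the commented call is hypothetical since the push signature is not visible in this diff):

from judgeval.data.datasets.eval_dataset_client import EvalDatasetClient

client = EvalDatasetClient(judgment_api_key="sk-placeholder", organization_id="org-placeholder")
dataset = client.create_dataset()
# client.push(dataset, alias="my-dataset", overwrite=False)  # hypothetical signature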
|