judgeval 0.0.14__py3-none-any.whl → 0.0.15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/common/tracer.py +108 -30
- judgeval/common/utils.py +12 -13
- judgeval/constants.py +61 -10
- judgeval/data/datasets/dataset.py +1 -1
- judgeval/data/datasets/eval_dataset_client.py +10 -6
- judgeval/evaluation_run.py +8 -0
- judgeval/judges/together_judge.py +1 -1
- judgeval/judges/utils.py +1 -1
- judgeval/judgment_client.py +147 -18
- judgeval/rules.py +384 -0
- judgeval/run_evaluation.py +22 -8
- judgeval/scorers/api_scorer.py +11 -12
- judgeval/scorers/base_scorer.py +1 -1
- judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/answer_relevancy_scorer.py +0 -1
- judgeval/utils/alerts.py +43 -0
- {judgeval-0.0.14.dist-info → judgeval-0.0.15.dist-info}/METADATA +1 -1
- {judgeval-0.0.14.dist-info → judgeval-0.0.15.dist-info}/RECORD +19 -17
- {judgeval-0.0.14.dist-info → judgeval-0.0.15.dist-info}/WHEEL +0 -0
- {judgeval-0.0.14.dist-info → judgeval-0.0.15.dist-info}/licenses/LICENSE.md +0 -0
judgeval/common/tracer.py
CHANGED
@@ -41,11 +41,13 @@ from judgeval.constants import JUDGMENT_TRACES_SAVE_API_URL, JUDGMENT_TRACES_FET
 from judgeval.judgment_client import JudgmentClient
 from judgeval.data import Example
 from judgeval.scorers import APIJudgmentScorer, JudgevalScorer, ScorerWrapper
+from judgeval.rules import Rule
+from judgeval.evaluation_run import EvaluationRun
+from judgeval.judges import JudgevalJudge
 
 from rich import print as rprint
 
 from judgeval.data.result import ScoringResult
-from judgeval.evaluation_run import EvaluationRun
 
 # Define type aliases for better code readability and maintainability
 ApiClient: TypeAlias = Union[OpenAI, Together, Anthropic]  # Supported API clients
@@ -205,7 +207,8 @@ class TraceManagerClient:
                 "Content-Type": "application/json",
                 "Authorization": f"Bearer {self.judgment_api_key}",
                 "X-Organization-Id": self.organization_id
-            }
+            },
+            verify=False
         )
 
         if response.status_code != HTTPStatus.OK:
@@ -229,7 +232,8 @@ class TraceManagerClient:
                 "Content-Type": "application/json",
                 "Authorization": f"Bearer {self.judgment_api_key}",
                 "X-Organization-Id": self.organization_id
-            }
+            },
+            verify=False
         )
 
         if response.status_code == HTTPStatus.BAD_REQUEST:
@@ -285,17 +289,29 @@ class TraceManagerClient:
 
 class TraceClient:
     """Client for managing a single trace context"""
-
-
-        self
+
+    def __init__(
+        self,
+        tracer: Optional["Tracer"],
+        trace_id: Optional[str] = None,
+        name: str = "default",
+        project_name: str = "default_project",
+        overwrite: bool = False,
+        rules: Optional[List[Rule]] = None,
+    ):
         self.name = name
+        self.trace_id = trace_id or str(uuid.uuid4())
         self.project_name = project_name
+        self.overwrite = overwrite
+        self.tracer = tracer
+        # Initialize rules with either provided rules or an empty list
+        self.rules = rules or []
+
         self.client: JudgmentClient = tracer.client
         self.entries: List[TraceEntry] = []
         self.start_time = time.time()
         self.span_type = None
         self._current_span: Optional[TraceEntry] = None
-        self.overwrite = overwrite
         self.trace_manager_client = TraceManagerClient(tracer.api_key, tracer.organization_id)  # Manages DB operations for trace data
 
     @contextmanager
@@ -348,7 +364,7 @@ class TraceClient:
         expected_tools: Optional[List[str]] = None,
         additional_metadata: Optional[Dict[str, Any]] = None,
         model: Optional[str] = None,
-        log_results: Optional[bool] = True
+        log_results: Optional[bool] = True
     ):
         start_time = time.time()  # Record start time
         example = Example(
@@ -362,29 +378,68 @@ class TraceClient:
             additional_metadata=additional_metadata,
             trace_id=self.trace_id
         )
-
+        loaded_rules = None
+        if self.rules:
+            loaded_rules = []
+            for rule in self.rules:
+                processed_conditions = []
+                for condition in rule.conditions:
+                    # Convert metric if it's a ScorerWrapper
+                    try:
+                        if isinstance(condition.metric, ScorerWrapper):
+                            condition_copy = condition.model_copy()
+                            condition_copy.metric = condition.metric.load_implementation(use_judgment=True)
+                            processed_conditions.append(condition_copy)
+                        else:
+                            processed_conditions.append(condition)
+                    except Exception as e:
+                        warnings.warn(f"Failed to convert ScorerWrapper in rule '{rule.name}', condition metric '{condition.metric_name}': {str(e)}")
+                        processed_conditions.append(condition)  # Keep original condition as fallback
+
+                # Create new rule with processed conditions
+                new_rule = rule.model_copy()
+                new_rule.conditions = processed_conditions
+                loaded_rules.append(new_rule)
         try:
             # Load appropriate implementations for all scorers
-            loaded_scorers: List[Union[JudgevalScorer, APIJudgmentScorer]] = [
-
-
-
+            loaded_scorers: List[Union[JudgevalScorer, APIJudgmentScorer]] = []
+            for scorer in scorers:
+                try:
+                    if isinstance(scorer, ScorerWrapper):
+                        loaded_scorers.append(scorer.load_implementation(use_judgment=True))
+                    else:
+                        loaded_scorers.append(scorer)
+                except Exception as e:
+                    warnings.warn(f"Failed to load implementation for scorer {scorer}: {str(e)}")
+                    # Skip this scorer
+
+            if not loaded_scorers:
+                warnings.warn("No valid scorers available for evaluation")
+                return
+
+            # Prevent using JudgevalScorer with rules - only APIJudgmentScorer allowed with rules
+            if loaded_rules and any(isinstance(scorer, JudgevalScorer) for scorer in loaded_scorers):
+                raise ValueError("Cannot use Judgeval scorers (only API scorers) when using rules. Please either remove rules or use only APIJudgmentScorer types.")
+
         except Exception as e:
-
+            warnings.warn(f"Failed to load scorers: {str(e)}")
+            return
 
+        # Combine the trace-level rules with any evaluation-specific rules)
         eval_run = EvaluationRun(
             organization_id=self.tracer.organization_id,
             log_results=log_results,
             project_name=self.project_name,
             eval_name=f"{self.name.capitalize()}-"
                       f"{self._current_span}-"
-                      f"[{','.join(scorer.
+                      f"[{','.join(scorer.score_type.capitalize() for scorer in loaded_scorers)}]",
             examples=[example],
             scorers=loaded_scorers,
             model=model,
             metadata={},
             judgment_api_key=self.tracer.api_key,
-            override=self.overwrite
+            override=self.overwrite,
+            rules=loaded_rules  # Use the combined rules
         )
 
         self.add_eval_run(eval_run, start_time)  # Pass start_time to record_evaluation
@@ -562,7 +617,6 @@ class TraceClient:
             "empty_save": empty_save,
             "overwrite": overwrite
         }
-
         # Execute asynchrous evaluation in the background
         if not empty_save:  # Only send to RabbitMQ if the trace is not empty
             connection = pika.BlockingConnection(
@@ -572,13 +626,16 @@ class TraceClient:
             channel.queue_declare(queue=RABBITMQ_QUEUE, durable=True)
             trace_data["judgment_api_key"] = self.tracer.api_key
             trace_data["organization_id"] = self.tracer.organization_id
-
             channel.basic_publish(
                 exchange='',
                 routing_key=RABBITMQ_QUEUE,
                 body=json.dumps(trace_data),
                 properties=pika.BasicProperties(
-                    delivery_mode=pika.DeliveryMode.Transient  # Changed from Persistent to Transient
+                    delivery_mode=pika.DeliveryMode.Transient,  # Changed from Persistent to Transient
+                    headers={
+                        'api_key': self.tracer.api_key,
+                        'organization_id': self.tracer.organization_id
+                    }
                 ))
             connection.close()
 
@@ -597,7 +654,12 @@ class Tracer:
             cls._instance = super(Tracer, cls).__new__(cls)
         return cls._instance
 
-    def __init__(
+    def __init__(
+        self,
+        api_key: str = os.getenv("JUDGMENT_API_KEY"),
+        project_name: str = "default_project",
+        rules: Optional[List[Rule]] = None,  # Added rules parameter
+        organization_id: str = os.getenv("JUDGMENT_ORG_ID")):
         if not hasattr(self, 'initialized'):
             if not api_key:
                 raise ValueError("Tracer must be configured with a Judgment API key")
@@ -611,6 +673,7 @@ class Tracer:
             self.organization_id: str = organization_id
             self.depth: int = 0
             self._current_trace: Optional[str] = None
+            self.rules: List[Rule] = rules or []  # Store rules at tracer level
             self.initialized: bool = True
         elif hasattr(self, 'project_name') and self.project_name != project_name:
             warnings.warn(
@@ -621,11 +684,25 @@ class Tracer:
             )
 
     @contextmanager
-    def trace(
+    def trace(
+        self,
+        name: str,
+        project_name: str = None,
+        overwrite: bool = False,
+        rules: Optional[List[Rule]] = None  # Added rules parameter
+    ) -> Generator[TraceClient, None, None]:
         """Start a new trace context using a context manager"""
         trace_id = str(uuid.uuid4())
         project = project_name if project_name is not None else self.project_name
-
+
+        trace = TraceClient(
+            self,
+            trace_id,
+            name,
+            project_name=project,
+            overwrite=overwrite,
+            rules=self.rules  # Pass combined rules to the trace client
+        )
         prev_trace = self._current_trace
         self._current_trace = trace
 
@@ -669,9 +746,9 @@ class Tracer:
                 trace = self._current_trace
             else:
                 trace_id = str(uuid.uuid4())
-                trace_name =
+                trace_name = func.__name__
                 project = project_name if project_name is not None else self.project_name
-                trace = TraceClient(self, trace_id, trace_name, project_name=project, overwrite=overwrite)
+                trace = TraceClient(self, trace_id, trace_name, project_name=project, overwrite=overwrite, rules=self.rules)
                 self._current_trace = trace
                 # Only save empty trace for the root call
                 trace.save(empty_save=True, overwrite=overwrite)
@@ -706,9 +783,9 @@ class Tracer:
                 trace = self._current_trace
             else:
                 trace_id = str(uuid.uuid4())
-                trace_name =
+                trace_name = func.__name__
                 project = project_name if project_name is not None else self.project_name
-                trace = TraceClient(self, trace_id, trace_name, project_name=project, overwrite=overwrite)
+                trace = TraceClient(self, trace_id, trace_name, project_name=project, overwrite=overwrite, rules=self.rules)
                 self._current_trace = trace
                 # Only save empty trace for the root call
                 trace.save(empty_save=True, overwrite=overwrite)
@@ -741,14 +818,15 @@ def wrap(client: Any) -> Any:
     Wraps an API client to add tracing capabilities.
     Supports OpenAI, Together, and Anthropic clients.
     """
-    tracer = Tracer._instance  # Get the global tracer instance
-
     # Get the appropriate configuration for this client type
     span_name, original_create = _get_client_config(client)
 
     def traced_create(*args, **kwargs):
-        #
-
+        # Get the current tracer instance (might be created after client was wrapped)
+        tracer = Tracer._instance
+
+        # Skip tracing if no tracer exists or no active trace
+        if not tracer or not tracer._current_trace:
             return original_create(*args, **kwargs)
 
         with tracer._current_trace.span(span_name, span_type="llm") as span:
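Note: taken together, the tracer.py changes thread a new rules parameter from Tracer through TraceClient and into each EvaluationRun, and client wrapping now resolves the Tracer singleton lazily at call time. A minimal usage sketch under assumptions — the Rule constructor fields shown are illustrative, since rules.py itself is not shown in this diff:

    import os
    from judgeval.common.tracer import Tracer
    from judgeval.rules import Rule  # new module in 0.0.15

    # Hypothetical rule; the real Rule fields are defined in judgeval/rules.py.
    rules = [Rule(name="quality-gate", conditions=[])]

    tracer = Tracer(
        api_key=os.getenv("JUDGMENT_API_KEY"),
        project_name="default_project",
        rules=rules,  # stored on the tracer and passed to every TraceClient
    )

    with tracer.trace("checkout-flow") as trace:
        ...  # spans recorded here are evaluated against the trace-level rules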
judgeval/common/utils.py
CHANGED
@@ -21,7 +21,6 @@ from judgeval.clients import async_together_client, together_client
 from judgeval.constants import *
 from judgeval.common.logger import debug, error
 
-LITELLM_SUPPORTED_MODELS = set(litellm.model_list)
 
 class CustomModelParameters(pydantic.BaseModel):
     model_name: str
@@ -72,7 +71,7 @@ class ChatCompletionRequest(pydantic.BaseModel):
     def validate_model(cls, model):
         if not model:
             raise ValueError("Model cannot be empty")
-        if model not in
+        if model not in ACCEPTABLE_MODELS:
             raise ValueError(f"Model {model} is not in the list of supported models.")
         return model
 
@@ -114,13 +113,13 @@ def fetch_together_api_response(model: str, messages: List[Mapping], response_fo
     if request.response_format is not None:
         debug(f"Using response format: {request.response_format}")
         response = together_client.chat.completions.create(
-            model=
+            model=request.model,
             messages=request.messages,
             response_format=request.response_format
         )
     else:
         response = together_client.chat.completions.create(
-            model=
+            model=request.model,
             messages=request.messages,
         )
 
@@ -144,13 +143,13 @@ async def afetch_together_api_response(model: str, messages: List[Mapping], resp
     if request.response_format is not None:
         debug(f"Using response format: {request.response_format}")
         response = await async_together_client.chat.completions.create(
-            model=
+            model=request.model,
             messages=request.messages,
             response_format=request.response_format
         )
     else:
         response = await async_together_client.chat.completions.create(
-            model=
+            model=request.model,
             messages=request.messages,
         )
     return response.choices[0].message.content
@@ -174,8 +173,8 @@ def query_together_api_multiple_calls(models: List[str], messages: List[List[Map
 
     # Validate all models are supported
     for model in models:
-        if model not in
-            raise ValueError(f"Model {model} is not in the list of supported
+        if model not in ACCEPTABLE_MODELS:
+            raise ValueError(f"Model {model} is not in the list of supported models: {ACCEPTABLE_MODELS}.")
 
     # Validate input lengths match
     if response_formats is None:
@@ -223,8 +222,8 @@ async def aquery_together_api_multiple_calls(models: List[str], messages: List[L
 
     # Validate all models are supported
     for model in models:
-        if model not in
-            raise ValueError(f"Model {model} is not in the list of supported
+        if model not in ACCEPTABLE_MODELS:
+            raise ValueError(f"Model {model} is not in the list of supported models: {ACCEPTABLE_MODELS}.")
 
     # Validate input lengths match
     if response_formats is None:
@@ -322,8 +321,8 @@ async def afetch_litellm_api_response(model: str, messages: List[Mapping], respo
     # Add validation
     validate_chat_messages(messages)
 
-    if model not in
-        raise ValueError(f"Model {model} is not in the list of supported
+    if model not in ACCEPTABLE_MODELS:
+        raise ValueError(f"Model {model} is not in the list of supported models: {ACCEPTABLE_MODELS}.")
 
     if response_format is not None:
         response = await litellm.acompletion(
@@ -409,7 +408,7 @@ async def aquery_litellm_api_multiple_calls(models: List[str], messages: List[Ma
         models (List[str]): List of models to query
         messages (List[Mapping]): List of messages to query
         response_formats (List[pydantic.BaseModel], optional): A list of the format of the response if JSON forcing. Defaults to None.
-
+
     Returns:
         List[str]: Litellm responses for each model and message pair in order. Any exceptions in the thread call result in a None.
     """
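Note: the utils.py changes replace the module-local LITELLM_SUPPORTED_MODELS set with the shared ACCEPTABLE_MODELS constant and pass the validated request.model through to the Together client. A minimal call sketch (assuming response_format defaults to None, as the truncated signature suggests):

    from judgeval.common.utils import fetch_together_api_response

    # Any model outside ACCEPTABLE_MODELS now raises ValueError during validation.
    reply = fetch_together_api_response(
        model="Qwen/Qwen2.5-72B-Instruct-Turbo",
        messages=[{"role": "user", "content": "Say hello."}],
    )
    print(reply)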
judgeval/constants.py
CHANGED
@@ -51,20 +51,71 @@ JUDGMENT_TRACES_DELETE_API_URL = f"{ROOT_API}/traces/delete/"
 RABBITMQ_HOST = os.getenv("RABBITMQ_HOST", "rabbitmq-networklb-faa155df16ec9085.elb.us-west-1.amazonaws.com")
 RABBITMQ_PORT = os.getenv("RABBITMQ_PORT", 5672)
 RABBITMQ_QUEUE = os.getenv("RABBITMQ_QUEUE", "task_queue")
-
 # Models
-
-
-
-
-
-
-
-
+LITELLM_SUPPORTED_MODELS = set(litellm.model_list)
+
+TOGETHER_SUPPORTED_MODELS = [
+    "meta-llama/Meta-Llama-3-70B-Instruct-Turbo",
+    "Qwen/Qwen2-VL-72B-Instruct",
+    "meta-llama/Llama-Vision-Free",
+    "Gryphe/MythoMax-L2-13b",
+    "Qwen/Qwen2.5-72B-Instruct-Turbo",
+    "meta-llama/Llama-3.3-70B-Instruct-Turbo-Free",
+    "deepseek-ai/DeepSeek-R1",
+    "meta-llama/Llama-3.2-90B-Vision-Instruct-Turbo",
+    "meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo",
+    "google/gemma-2-27b-it",
+    "mistralai/Mistral-Small-24B-Instruct-2501",
+    "mistralai/Mixtral-8x22B-Instruct-v0.1",
+    "meta-llama/Meta-Llama-3-8B-Instruct-Turbo",
+    "NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO",
+    "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo-classifier",
+    "deepseek-ai/DeepSeek-V3",
+    "Qwen/Qwen2-72B-Instruct",
+    "meta-llama/Meta-Llama-3-8B-Instruct-Lite",
+    "deepseek-ai/DeepSeek-R1-Distill-Llama-70B",
+    "upstage/SOLAR-10.7B-Instruct-v1.0",
+    "togethercomputer/MoA-1",
+    "Qwen/QwQ-32B-Preview",
+    "meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo",
+    "meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo",
+    "mistralai/Mistral-7B-Instruct-v0.2",
+    "databricks/dbrx-instruct",
+    "meta-llama/Llama-3-8b-chat-hf",
+    "google/gemma-2b-it",
+    "meta-llama/Meta-Llama-3-70B-Instruct-Lite",
+    "google/gemma-2-9b-it",
+    "meta-llama/Llama-3.3-70B-Instruct-Turbo",
+    "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo-p",
+    "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
+    "Gryphe/MythoMax-L2-13b-Lite",
+    "meta-llama/Llama-2-7b-chat-hf",
+    "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
+    "meta-llama/Llama-2-13b-chat-hf",
+    "scb10x/scb10x-llama3-typhoon-v1-5-8b-instruct",
+    "scb10x/scb10x-llama3-typhoon-v1-5x-4f316",
+    "nvidia/Llama-3.1-Nemotron-70B-Instruct-HF",
+    "Qwen/Qwen2.5-Coder-32B-Instruct",
+    "microsoft/WizardLM-2-8x22B",
+    "mistralai/Mistral-7B-Instruct-v0.3",
+    "scb10x/scb10x-llama3-1-typhoon2-60256",
+    "Qwen/Qwen2.5-7B-Instruct-Turbo",
+    "scb10x/scb10x-llama3-1-typhoon-18370",
+    "meta-llama/Llama-3.2-3B-Instruct-Turbo",
+    "meta-llama/Llama-3-70b-chat-hf",
+    "mistralai/Mixtral-8x7B-Instruct-v0.1",
+    "togethercomputer/MoA-1-Turbo",
+    "deepseek-ai/DeepSeek-R1-Distill-Llama-70B-free",
+    "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B",
+    "mistralai/Mistral-7B-Instruct-v0.1"
+]
 
 JUDGMENT_SUPPORTED_MODELS = {"osiris-large", "osiris-mini"}
 
-ACCEPTABLE_MODELS = set(litellm.model_list) | set(TOGETHER_SUPPORTED_MODELS
+ACCEPTABLE_MODELS = set(litellm.model_list) | set(TOGETHER_SUPPORTED_MODELS) | JUDGMENT_SUPPORTED_MODELS
 
 ## System settings
 MAX_WORKER_THREADS = 10
+
+# Maximum number of concurrent operations for evaluation runs
+MAX_CONCURRENT_EVALUATIONS = 50  # Adjust based on system capabilities
judgeval/data/datasets/dataset.py
CHANGED
@@ -20,7 +20,7 @@ class EvalDataset:
     organization_id: str = field(default="")
     def __init__(self,
                  judgment_api_key: str = os.getenv("JUDGMENT_API_KEY"),
-                 organization_id: str = os.getenv("
+                 organization_id: str = os.getenv("JUDGMENT_ORG_ID"),
                  ground_truths: List[GroundTruthExample] = [],
                  examples: List[Example] = [],
                  ):
judgeval/data/datasets/eval_dataset_client.py
CHANGED
@@ -68,7 +68,8 @@ class EvalDatasetClient:
                 "Content-Type": "application/json",
                 "Authorization": f"Bearer {self.judgment_api_key}",
                 "X-Organization-Id": self.organization_id
-            }
+            },
+            verify=False
         )
         if response.status_code == 500:
             error(f"Server error during push: {content.get('message')}")
@@ -132,7 +133,8 @@ class EvalDatasetClient:
                 "Content-Type": "application/json",
                 "Authorization": f"Bearer {self.judgment_api_key}",
                 "X-Organization-Id": self.organization_id
-            }
+            },
+            verify=False
         )
         response.raise_for_status()
     except requests.exceptions.RequestException as e:
@@ -190,7 +192,8 @@ class EvalDatasetClient:
                 "Content-Type": "application/json",
                 "Authorization": f"Bearer {self.judgment_api_key}",
                 "X-Organization-Id": self.organization_id
-            }
+            },
+            verify=False
         )
         response.raise_for_status()
     except requests.exceptions.RequestException as e:
@@ -233,7 +236,6 @@ class EvalDatasetClient:
             "alias": alias,
             "examples": [e.to_dict() for e in examples],
             "ground_truths": [g.to_dict() for g in ground_truths],
-            "judgment_api_key": self.judgment_api_key
         }
 
         try:
@@ -244,7 +246,8 @@ class EvalDatasetClient:
                 "Content-Type": "application/json",
                 "Authorization": f"Bearer {self.judgment_api_key}",
                 "X-Organization-Id": self.organization_id
-            }
+            },
+            verify=False
         )
         response.raise_for_status()
     except requests.exceptions.RequestException as e:
@@ -275,7 +278,8 @@ class EvalDatasetClient:
                 "Authorization": f"Bearer {self.judgment_api_key}",
                 "X-Organization-Id": self.organization_id
             },
-            stream=True
+            stream=True,
+            verify=False
         )
         response.raise_for_status()
     except requests.exceptions.HTTPError as err:
judgeval/evaluation_run.py
CHANGED
@@ -6,6 +6,7 @@ from judgeval.scorers import JudgevalScorer, APIJudgmentScorer
 from judgeval.constants import ACCEPTABLE_MODELS
 from judgeval.common.logger import debug, error
 from judgeval.judges import JudgevalJudge
+from judgeval.rules import Rule
 
 class EvaluationRun(BaseModel):
     """
@@ -20,6 +21,7 @@ class EvaluationRun(BaseModel):
         aggregator (Optional[str]): The aggregator to use for evaluation if using Mixture of Judges
         metadata (Optional[Dict[str, Any]]): Additional metadata to include for this evaluation run, e.g. comments, dataset name, purpose, etc.
         judgment_api_key (Optional[str]): The API key for running evaluations on the Judgment API
+        rules (Optional[List[Rule]]): Rules to evaluate against scoring results
     """
 
     # The user will specify whether they want log_results when they call run_eval
@@ -35,6 +37,7 @@ class EvaluationRun(BaseModel):
     # API Key will be "" until user calls client.run_eval(), then API Key will be set
     judgment_api_key: Optional[str] = ""
     override: Optional[bool] = False
+    rules: Optional[List[Rule]] = None
 
     def model_dump(self, **kwargs):
         data = super().model_dump(**kwargs)
@@ -45,6 +48,11 @@ class EvaluationRun(BaseModel):
             else {"score_type": scorer.score_type, "threshold": scorer.threshold}
             for scorer in self.scorers
         ]
+
+        if self.rules:
+            # Process rules to ensure proper serialization
+            data["rules"] = [rule.model_dump() for rule in self.rules]
+
         return data
 
     @field_validator('log_results', mode='before')
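Note: the serialization hook mirrors the existing scorer handling — when rules are present, model_dump() emits them as plain dicts. A self-contained toy sketch of the pattern (ToyRule/ToyRun are stand-ins, not judgeval classes):

    from typing import List, Optional
    from pydantic import BaseModel

    class ToyRule(BaseModel):
        name: str

    class ToyRun(BaseModel):
        rules: Optional[List[ToyRule]] = None

        def model_dump(self, **kwargs):
            data = super().model_dump(**kwargs)
            if self.rules:
                # Same idea as EvaluationRun: dump nested rule models explicitly.
                data["rules"] = [rule.model_dump() for rule in self.rules]
            return data

    print(ToyRun(rules=[ToyRule(name="quality-gate")]).model_dump())
    # -> {'rules': [{'name': 'quality-gate'}]}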
judgeval/judges/together_judge.py
CHANGED
@@ -14,7 +14,7 @@ BASE_CONVERSATION = [
 ]
 
 class TogetherJudge(JudgevalJudge):
-    def __init__(self, model: str = "
+    def __init__(self, model: str = "Qwen/Qwen2.5-72B-Instruct-Turbo", **kwargs):
         debug(f"Initializing TogetherJudge with model={model}")
         self.model = model
         self.kwargs = kwargs
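Note: the default judge model becomes Qwen/Qwen2.5-72B-Instruct-Turbo, one of the Together models listed in constants.py. A minimal usage sketch (import path inferred from the file path):

    from judgeval.judges.together_judge import TogetherJudge

    judge = TogetherJudge()  # now defaults to "Qwen/Qwen2.5-72B-Instruct-Turbo"
    print(judge.model)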
judgeval/judges/utils.py
CHANGED
@@ -39,7 +39,7 @@ def create_judge(
             Please either set the `use_judgment` flag to True or use
             non-Judgment models."""
             )
-        if m not in
+        if m not in ACCEPTABLE_MODELS:
             raise InvalidJudgeModelError(f"Invalid judge model chosen: {m}")
     return MixtureOfJudges(models=model), True
     # If model is a string, check that it corresponds to a valid model
# If model is a string, check that it corresponds to a valid model
|