judgeval 0.0.14__py3-none-any.whl → 0.0.15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/common/tracer.py +108 -30
- judgeval/common/utils.py +12 -13
- judgeval/constants.py +61 -10
- judgeval/data/datasets/dataset.py +1 -1
- judgeval/data/datasets/eval_dataset_client.py +10 -6
- judgeval/evaluation_run.py +8 -0
- judgeval/judges/together_judge.py +1 -1
- judgeval/judges/utils.py +1 -1
- judgeval/judgment_client.py +147 -18
- judgeval/rules.py +384 -0
- judgeval/run_evaluation.py +22 -8
- judgeval/scorers/api_scorer.py +11 -12
- judgeval/scorers/base_scorer.py +1 -1
- judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/answer_relevancy_scorer.py +0 -1
- judgeval/utils/alerts.py +43 -0
- {judgeval-0.0.14.dist-info → judgeval-0.0.15.dist-info}/METADATA +1 -1
- {judgeval-0.0.14.dist-info → judgeval-0.0.15.dist-info}/RECORD +19 -17
- {judgeval-0.0.14.dist-info → judgeval-0.0.15.dist-info}/WHEEL +0 -0
- {judgeval-0.0.14.dist-info → judgeval-0.0.15.dist-info}/licenses/LICENSE.md +0 -0
judgeval/common/tracer.py
CHANGED
@@ -41,11 +41,13 @@ from judgeval.constants import JUDGMENT_TRACES_SAVE_API_URL, JUDGMENT_TRACES_FET
 from judgeval.judgment_client import JudgmentClient
 from judgeval.data import Example
 from judgeval.scorers import APIJudgmentScorer, JudgevalScorer, ScorerWrapper
+from judgeval.rules import Rule
+from judgeval.evaluation_run import EvaluationRun
+from judgeval.judges import JudgevalJudge
 
 from rich import print as rprint
 
 from judgeval.data.result import ScoringResult
-from judgeval.evaluation_run import EvaluationRun
 
 # Define type aliases for better code readability and maintainability
 ApiClient: TypeAlias = Union[OpenAI, Together, Anthropic]  # Supported API clients
@@ -205,7 +207,8 @@ class TraceManagerClient:
                 "Content-Type": "application/json",
                 "Authorization": f"Bearer {self.judgment_api_key}",
                 "X-Organization-Id": self.organization_id
-            }
+            },
+            verify=False
         )
 
         if response.status_code != HTTPStatus.OK:
@@ -229,7 +232,8 @@ class TraceManagerClient:
                 "Content-Type": "application/json",
                 "Authorization": f"Bearer {self.judgment_api_key}",
                 "X-Organization-Id": self.organization_id
-            }
+            },
+            verify=False
         )
 
         if response.status_code == HTTPStatus.BAD_REQUEST:
@@ -285,17 +289,29 @@ class TraceManagerClient:
 
 class TraceClient:
     """Client for managing a single trace context"""
-
-
-        self
+
+    def __init__(
+        self,
+        tracer: Optional["Tracer"],
+        trace_id: Optional[str] = None,
+        name: str = "default",
+        project_name: str = "default_project",
+        overwrite: bool = False,
+        rules: Optional[List[Rule]] = None,
+    ):
         self.name = name
+        self.trace_id = trace_id or str(uuid.uuid4())
         self.project_name = project_name
+        self.overwrite = overwrite
+        self.tracer = tracer
+        # Initialize rules with either provided rules or an empty list
+        self.rules = rules or []
+
         self.client: JudgmentClient = tracer.client
         self.entries: List[TraceEntry] = []
         self.start_time = time.time()
         self.span_type = None
         self._current_span: Optional[TraceEntry] = None
-        self.overwrite = overwrite
         self.trace_manager_client = TraceManagerClient(tracer.api_key, tracer.organization_id)  # Manages DB operations for trace data
 
     @contextmanager
@@ -348,7 +364,7 @@ class TraceClient:
         expected_tools: Optional[List[str]] = None,
         additional_metadata: Optional[Dict[str, Any]] = None,
         model: Optional[str] = None,
-        log_results: Optional[bool] = True
+        log_results: Optional[bool] = True
     ):
         start_time = time.time()  # Record start time
         example = Example(
@@ -362,29 +378,68 @@ class TraceClient:
             additional_metadata=additional_metadata,
             trace_id=self.trace_id
         )
-
+        loaded_rules = None
+        if self.rules:
+            loaded_rules = []
+            for rule in self.rules:
+                processed_conditions = []
+                for condition in rule.conditions:
+                    # Convert metric if it's a ScorerWrapper
+                    try:
+                        if isinstance(condition.metric, ScorerWrapper):
+                            condition_copy = condition.model_copy()
+                            condition_copy.metric = condition.metric.load_implementation(use_judgment=True)
+                            processed_conditions.append(condition_copy)
+                        else:
+                            processed_conditions.append(condition)
+                    except Exception as e:
+                        warnings.warn(f"Failed to convert ScorerWrapper in rule '{rule.name}', condition metric '{condition.metric_name}': {str(e)}")
+                        processed_conditions.append(condition)  # Keep original condition as fallback
+
+                # Create new rule with processed conditions
+                new_rule = rule.model_copy()
+                new_rule.conditions = processed_conditions
+                loaded_rules.append(new_rule)
         try:
             # Load appropriate implementations for all scorers
-            loaded_scorers: List[Union[JudgevalScorer, APIJudgmentScorer]] = [
-
-
-
+            loaded_scorers: List[Union[JudgevalScorer, APIJudgmentScorer]] = []
+            for scorer in scorers:
+                try:
+                    if isinstance(scorer, ScorerWrapper):
+                        loaded_scorers.append(scorer.load_implementation(use_judgment=True))
+                    else:
+                        loaded_scorers.append(scorer)
+                except Exception as e:
+                    warnings.warn(f"Failed to load implementation for scorer {scorer}: {str(e)}")
+                    # Skip this scorer
+
+            if not loaded_scorers:
+                warnings.warn("No valid scorers available for evaluation")
+                return
+
+            # Prevent using JudgevalScorer with rules - only APIJudgmentScorer allowed with rules
+            if loaded_rules and any(isinstance(scorer, JudgevalScorer) for scorer in loaded_scorers):
+                raise ValueError("Cannot use Judgeval scorers (only API scorers) when using rules. Please either remove rules or use only APIJudgmentScorer types.")
+
         except Exception as e:
-
+            warnings.warn(f"Failed to load scorers: {str(e)}")
+            return
 
+        # Combine the trace-level rules with any evaluation-specific rules)
         eval_run = EvaluationRun(
             organization_id=self.tracer.organization_id,
             log_results=log_results,
             project_name=self.project_name,
             eval_name=f"{self.name.capitalize()}-"
                       f"{self._current_span}-"
-                      f"[{','.join(scorer.
+                      f"[{','.join(scorer.score_type.capitalize() for scorer in loaded_scorers)}]",
             examples=[example],
             scorers=loaded_scorers,
             model=model,
             metadata={},
             judgment_api_key=self.tracer.api_key,
-            override=self.overwrite
+            override=self.overwrite,
+            rules=loaded_rules  # Use the combined rules
         )
 
         self.add_eval_run(eval_run, start_time)  # Pass start_time to record_evaluation
@@ -562,7 +617,6 @@ class TraceClient:
             "empty_save": empty_save,
             "overwrite": overwrite
         }
-
         # Execute asynchrous evaluation in the background
         if not empty_save:  # Only send to RabbitMQ if the trace is not empty
             connection = pika.BlockingConnection(
@@ -572,13 +626,16 @@ class TraceClient:
             channel.queue_declare(queue=RABBITMQ_QUEUE, durable=True)
             trace_data["judgment_api_key"] = self.tracer.api_key
             trace_data["organization_id"] = self.tracer.organization_id
-
             channel.basic_publish(
                 exchange='',
                 routing_key=RABBITMQ_QUEUE,
                 body=json.dumps(trace_data),
                 properties=pika.BasicProperties(
-                    delivery_mode=pika.DeliveryMode.Transient  # Changed from Persistent to Transient
+                    delivery_mode=pika.DeliveryMode.Transient,  # Changed from Persistent to Transient
+                    headers={
+                        'api_key': self.tracer.api_key,
+                        'organization_id': self.tracer.organization_id
+                    }
                 ))
             connection.close()
 
@@ -597,7 +654,12 @@ class Tracer:
             cls._instance = super(Tracer, cls).__new__(cls)
         return cls._instance
 
-    def __init__(
+    def __init__(
+        self,
+        api_key: str = os.getenv("JUDGMENT_API_KEY"),
+        project_name: str = "default_project",
+        rules: Optional[List[Rule]] = None,  # Added rules parameter
+        organization_id: str = os.getenv("JUDGMENT_ORG_ID")):
         if not hasattr(self, 'initialized'):
             if not api_key:
                 raise ValueError("Tracer must be configured with a Judgment API key")
@@ -611,6 +673,7 @@ class Tracer:
             self.organization_id: str = organization_id
             self.depth: int = 0
             self._current_trace: Optional[str] = None
+            self.rules: List[Rule] = rules or []  # Store rules at tracer level
             self.initialized: bool = True
         elif hasattr(self, 'project_name') and self.project_name != project_name:
             warnings.warn(
@@ -621,11 +684,25 @@ class Tracer:
             )
 
     @contextmanager
-    def trace(
+    def trace(
+        self,
+        name: str,
+        project_name: str = None,
+        overwrite: bool = False,
+        rules: Optional[List[Rule]] = None  # Added rules parameter
+    ) -> Generator[TraceClient, None, None]:
         """Start a new trace context using a context manager"""
         trace_id = str(uuid.uuid4())
         project = project_name if project_name is not None else self.project_name
-
+
+        trace = TraceClient(
+            self,
+            trace_id,
+            name,
+            project_name=project,
+            overwrite=overwrite,
+            rules=self.rules  # Pass combined rules to the trace client
+        )
         prev_trace = self._current_trace
         self._current_trace = trace
 
@@ -669,9 +746,9 @@ class Tracer:
                 trace = self._current_trace
             else:
                 trace_id = str(uuid.uuid4())
-                trace_name =
+                trace_name = func.__name__
                 project = project_name if project_name is not None else self.project_name
-                trace = TraceClient(self, trace_id, trace_name, project_name=project, overwrite=overwrite)
+                trace = TraceClient(self, trace_id, trace_name, project_name=project, overwrite=overwrite, rules=self.rules)
                 self._current_trace = trace
                 # Only save empty trace for the root call
                 trace.save(empty_save=True, overwrite=overwrite)
@@ -706,9 +783,9 @@ class Tracer:
                 trace = self._current_trace
             else:
                 trace_id = str(uuid.uuid4())
-                trace_name =
+                trace_name = func.__name__
                 project = project_name if project_name is not None else self.project_name
-                trace = TraceClient(self, trace_id, trace_name, project_name=project, overwrite=overwrite)
+                trace = TraceClient(self, trace_id, trace_name, project_name=project, overwrite=overwrite, rules=self.rules)
                 self._current_trace = trace
                 # Only save empty trace for the root call
                 trace.save(empty_save=True, overwrite=overwrite)
@@ -741,14 +818,15 @@ def wrap(client: Any) -> Any:
     Wraps an API client to add tracing capabilities.
     Supports OpenAI, Together, and Anthropic clients.
     """
-    tracer = Tracer._instance  # Get the global tracer instance
-
     # Get the appropriate configuration for this client type
     span_name, original_create = _get_client_config(client)
 
     def traced_create(*args, **kwargs):
-        #
-
+        # Get the current tracer instance (might be created after client was wrapped)
+        tracer = Tracer._instance
+
+        # Skip tracing if no tracer exists or no active trace
+        if not tracer or not tracer._current_trace:
             return original_create(*args, **kwargs)
 
         with tracer._current_trace.span(span_name, span_type="llm") as span:
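Note: taken together, the tracer.py changes thread a new rules parameter from Tracer through TraceClient and into each EvaluationRun, and client wrapping now resolves the Tracer singleton lazily at call time. A minimal usage sketch under assumptions — the Rule constructor fields shown are illustrative, since rules.py itself is not shown in this diff:

    import os
    from judgeval.common.tracer import Tracer
    from judgeval.rules import Rule  # new module in 0.0.15

    # Hypothetical rule; the real Rule fields are defined in judgeval/rules.py.
    rules = [Rule(name="quality-gate", conditions=[])]

    tracer = Tracer(
        api_key=os.getenv("JUDGMENT_API_KEY"),
        project_name="default_project",
        rules=rules,  # stored on the tracer and passed to every TraceClient
    )

    with tracer.trace("checkout-flow") as trace:
        ...  # spans recorded here are evaluated against the trace-level rules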
judgeval/common/utils.py
CHANGED
@@ -21,7 +21,6 @@ from judgeval.clients import async_together_client, together_client
 from judgeval.constants import *
 from judgeval.common.logger import debug, error
 
-LITELLM_SUPPORTED_MODELS = set(litellm.model_list)
 
 class CustomModelParameters(pydantic.BaseModel):
     model_name: str
@@ -72,7 +71,7 @@ class ChatCompletionRequest(pydantic.BaseModel):
     def validate_model(cls, model):
         if not model:
             raise ValueError("Model cannot be empty")
-        if model not in
+        if model not in ACCEPTABLE_MODELS:
             raise ValueError(f"Model {model} is not in the list of supported models.")
         return model
 
@@ -114,13 +113,13 @@ def fetch_together_api_response(model: str, messages: List[Mapping], response_fo
     if request.response_format is not None:
         debug(f"Using response format: {request.response_format}")
         response = together_client.chat.completions.create(
-            model=
+            model=request.model,
             messages=request.messages,
             response_format=request.response_format
         )
     else:
         response = together_client.chat.completions.create(
-            model=
+            model=request.model,
             messages=request.messages,
         )
 
@@ -144,13 +143,13 @@ async def afetch_together_api_response(model: str, messages: List[Mapping], resp
     if request.response_format is not None:
         debug(f"Using response format: {request.response_format}")
         response = await async_together_client.chat.completions.create(
-            model=
+            model=request.model,
             messages=request.messages,
             response_format=request.response_format
         )
     else:
         response = await async_together_client.chat.completions.create(
-            model=
+            model=request.model,
             messages=request.messages,
         )
     return response.choices[0].message.content
@@ -174,8 +173,8 @@ def query_together_api_multiple_calls(models: List[str], messages: List[List[Map
 
     # Validate all models are supported
     for model in models:
-        if model not in
-            raise ValueError(f"Model {model} is not in the list of supported
+        if model not in ACCEPTABLE_MODELS:
+            raise ValueError(f"Model {model} is not in the list of supported models: {ACCEPTABLE_MODELS}.")
 
     # Validate input lengths match
     if response_formats is None:
@@ -223,8 +222,8 @@ async def aquery_together_api_multiple_calls(models: List[str], messages: List[L
 
     # Validate all models are supported
     for model in models:
-        if model not in
-            raise ValueError(f"Model {model} is not in the list of supported
+        if model not in ACCEPTABLE_MODELS:
+            raise ValueError(f"Model {model} is not in the list of supported models: {ACCEPTABLE_MODELS}.")
 
     # Validate input lengths match
     if response_formats is None:
@@ -322,8 +321,8 @@ async def afetch_litellm_api_response(model: str, messages: List[Mapping], respo
     # Add validation
     validate_chat_messages(messages)
 
-    if model not in
-        raise ValueError(f"Model {model} is not in the list of supported
+    if model not in ACCEPTABLE_MODELS:
+        raise ValueError(f"Model {model} is not in the list of supported models: {ACCEPTABLE_MODELS}.")
 
     if response_format is not None:
         response = await litellm.acompletion(
@@ -409,7 +408,7 @@ async def aquery_litellm_api_multiple_calls(models: List[str], messages: List[Ma
         models (List[str]): List of models to query
         messages (List[Mapping]): List of messages to query
         response_formats (List[pydantic.BaseModel], optional): A list of the format of the response if JSON forcing. Defaults to None.
-
+
     Returns:
         List[str]: Litellm responses for each model and message pair in order. Any exceptions in the thread call result in a None.
     """
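Note: the utils.py changes replace the module-local LITELLM_SUPPORTED_MODELS set with the shared ACCEPTABLE_MODELS constant and pass the validated request.model through to the Together client. A minimal call sketch (assuming response_format defaults to None, as the truncated signature suggests):

    from judgeval.common.utils import fetch_together_api_response

    # Any model outside ACCEPTABLE_MODELS now raises ValueError during validation.
    reply = fetch_together_api_response(
        model="Qwen/Qwen2.5-72B-Instruct-Turbo",
        messages=[{"role": "user", "content": "Say hello."}],
    )
    print(reply)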
judgeval/constants.py
CHANGED
@@ -51,20 +51,71 @@ JUDGMENT_TRACES_DELETE_API_URL = f"{ROOT_API}/traces/delete/"
 RABBITMQ_HOST = os.getenv("RABBITMQ_HOST", "rabbitmq-networklb-faa155df16ec9085.elb.us-west-1.amazonaws.com")
 RABBITMQ_PORT = os.getenv("RABBITMQ_PORT", 5672)
 RABBITMQ_QUEUE = os.getenv("RABBITMQ_QUEUE", "task_queue")
-
 # Models
-
-
-
-
-
-
-
-
+LITELLM_SUPPORTED_MODELS = set(litellm.model_list)
+
+TOGETHER_SUPPORTED_MODELS = [
+    "meta-llama/Meta-Llama-3-70B-Instruct-Turbo",
+    "Qwen/Qwen2-VL-72B-Instruct",
+    "meta-llama/Llama-Vision-Free",
+    "Gryphe/MythoMax-L2-13b",
+    "Qwen/Qwen2.5-72B-Instruct-Turbo",
+    "meta-llama/Llama-3.3-70B-Instruct-Turbo-Free",
+    "deepseek-ai/DeepSeek-R1",
+    "meta-llama/Llama-3.2-90B-Vision-Instruct-Turbo",
+    "meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo",
+    "google/gemma-2-27b-it",
+    "mistralai/Mistral-Small-24B-Instruct-2501",
+    "mistralai/Mixtral-8x22B-Instruct-v0.1",
+    "meta-llama/Meta-Llama-3-8B-Instruct-Turbo",
+    "NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO",
+    "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo-classifier",
+    "deepseek-ai/DeepSeek-V3",
+    "Qwen/Qwen2-72B-Instruct",
+    "meta-llama/Meta-Llama-3-8B-Instruct-Lite",
+    "deepseek-ai/DeepSeek-R1-Distill-Llama-70B",
+    "upstage/SOLAR-10.7B-Instruct-v1.0",
+    "togethercomputer/MoA-1",
+    "Qwen/QwQ-32B-Preview",
+    "meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo",
+    "meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo",
+    "mistralai/Mistral-7B-Instruct-v0.2",
+    "databricks/dbrx-instruct",
+    "meta-llama/Llama-3-8b-chat-hf",
+    "google/gemma-2b-it",
+    "meta-llama/Meta-Llama-3-70B-Instruct-Lite",
+    "google/gemma-2-9b-it",
+    "meta-llama/Llama-3.3-70B-Instruct-Turbo",
+    "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo-p",
+    "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
+    "Gryphe/MythoMax-L2-13b-Lite",
+    "meta-llama/Llama-2-7b-chat-hf",
+    "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
+    "meta-llama/Llama-2-13b-chat-hf",
+    "scb10x/scb10x-llama3-typhoon-v1-5-8b-instruct",
+    "scb10x/scb10x-llama3-typhoon-v1-5x-4f316",
+    "nvidia/Llama-3.1-Nemotron-70B-Instruct-HF",
+    "Qwen/Qwen2.5-Coder-32B-Instruct",
+    "microsoft/WizardLM-2-8x22B",
+    "mistralai/Mistral-7B-Instruct-v0.3",
+    "scb10x/scb10x-llama3-1-typhoon2-60256",
+    "Qwen/Qwen2.5-7B-Instruct-Turbo",
+    "scb10x/scb10x-llama3-1-typhoon-18370",
+    "meta-llama/Llama-3.2-3B-Instruct-Turbo",
+    "meta-llama/Llama-3-70b-chat-hf",
+    "mistralai/Mixtral-8x7B-Instruct-v0.1",
+    "togethercomputer/MoA-1-Turbo",
+    "deepseek-ai/DeepSeek-R1-Distill-Llama-70B-free",
+    "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B",
+    "mistralai/Mistral-7B-Instruct-v0.1"
+]
 
 JUDGMENT_SUPPORTED_MODELS = {"osiris-large", "osiris-mini"}
 
-ACCEPTABLE_MODELS = set(litellm.model_list) | set(TOGETHER_SUPPORTED_MODELS
+ACCEPTABLE_MODELS = set(litellm.model_list) | set(TOGETHER_SUPPORTED_MODELS) | JUDGMENT_SUPPORTED_MODELS
 
 ## System settings
 MAX_WORKER_THREADS = 10
+
+# Maximum number of concurrent operations for evaluation runs
+MAX_CONCURRENT_EVALUATIONS = 50  # Adjust based on system capabilities
judgeval/data/datasets/dataset.py
CHANGED
@@ -20,7 +20,7 @@ class EvalDataset:
     organization_id: str = field(default="")
     def __init__(self,
                  judgment_api_key: str = os.getenv("JUDGMENT_API_KEY"),
-                 organization_id: str = os.getenv("
+                 organization_id: str = os.getenv("JUDGMENT_ORG_ID"),
                  ground_truths: List[GroundTruthExample] = [],
                  examples: List[Example] = [],
                  ):
judgeval/data/datasets/eval_dataset_client.py
CHANGED
@@ -68,7 +68,8 @@ class EvalDatasetClient:
                 "Content-Type": "application/json",
                 "Authorization": f"Bearer {self.judgment_api_key}",
                 "X-Organization-Id": self.organization_id
-            }
+            },
+            verify=False
         )
         if response.status_code == 500:
             error(f"Server error during push: {content.get('message')}")
@@ -132,7 +133,8 @@ class EvalDatasetClient:
                 "Content-Type": "application/json",
                 "Authorization": f"Bearer {self.judgment_api_key}",
                 "X-Organization-Id": self.organization_id
-            }
+            },
+            verify=False
         )
         response.raise_for_status()
     except requests.exceptions.RequestException as e:
@@ -190,7 +192,8 @@ class EvalDatasetClient:
                 "Content-Type": "application/json",
                 "Authorization": f"Bearer {self.judgment_api_key}",
                 "X-Organization-Id": self.organization_id
-            }
+            },
+            verify=False
         )
         response.raise_for_status()
     except requests.exceptions.RequestException as e:
@@ -233,7 +236,6 @@ class EvalDatasetClient:
             "alias": alias,
             "examples": [e.to_dict() for e in examples],
             "ground_truths": [g.to_dict() for g in ground_truths],
-            "judgment_api_key": self.judgment_api_key
         }
 
         try:
@@ -244,7 +246,8 @@ class EvalDatasetClient:
                 "Content-Type": "application/json",
                 "Authorization": f"Bearer {self.judgment_api_key}",
                 "X-Organization-Id": self.organization_id
-            }
+            },
+            verify=False
         )
         response.raise_for_status()
     except requests.exceptions.RequestException as e:
@@ -275,7 +278,8 @@ class EvalDatasetClient:
                 "Authorization": f"Bearer {self.judgment_api_key}",
                 "X-Organization-Id": self.organization_id
             },
-            stream=True
+            stream=True,
+            verify=False
         )
         response.raise_for_status()
     except requests.exceptions.HTTPError as err:
judgeval/evaluation_run.py
CHANGED
@@ -6,6 +6,7 @@ from judgeval.scorers import JudgevalScorer, APIJudgmentScorer
 from judgeval.constants import ACCEPTABLE_MODELS
 from judgeval.common.logger import debug, error
 from judgeval.judges import JudgevalJudge
+from judgeval.rules import Rule
 
 class EvaluationRun(BaseModel):
     """
@@ -20,6 +21,7 @@ class EvaluationRun(BaseModel):
         aggregator (Optional[str]): The aggregator to use for evaluation if using Mixture of Judges
         metadata (Optional[Dict[str, Any]]): Additional metadata to include for this evaluation run, e.g. comments, dataset name, purpose, etc.
         judgment_api_key (Optional[str]): The API key for running evaluations on the Judgment API
+        rules (Optional[List[Rule]]): Rules to evaluate against scoring results
     """
 
     # The user will specify whether they want log_results when they call run_eval
@@ -35,6 +37,7 @@ class EvaluationRun(BaseModel):
     # API Key will be "" until user calls client.run_eval(), then API Key will be set
     judgment_api_key: Optional[str] = ""
     override: Optional[bool] = False
+    rules: Optional[List[Rule]] = None
 
     def model_dump(self, **kwargs):
         data = super().model_dump(**kwargs)
@@ -45,6 +48,11 @@ class EvaluationRun(BaseModel):
             else {"score_type": scorer.score_type, "threshold": scorer.threshold}
             for scorer in self.scorers
         ]
+
+        if self.rules:
+            # Process rules to ensure proper serialization
+            data["rules"] = [rule.model_dump() for rule in self.rules]
+
         return data
 
     @field_validator('log_results', mode='before')
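Note: the serialization hook mirrors the existing scorer handling — when rules are present, model_dump() emits them as plain dicts. A self-contained toy sketch of the pattern (ToyRule/ToyRun are stand-ins, not judgeval classes):

    from typing import List, Optional
    from pydantic import BaseModel

    class ToyRule(BaseModel):
        name: str

    class ToyRun(BaseModel):
        rules: Optional[List[ToyRule]] = None

        def model_dump(self, **kwargs):
            data = super().model_dump(**kwargs)
            if self.rules:
                # Same idea as EvaluationRun: dump nested rule models explicitly.
                data["rules"] = [rule.model_dump() for rule in self.rules]
            return data

    print(ToyRun(rules=[ToyRule(name="quality-gate")]).model_dump())
    # -> {'rules': [{'name': 'quality-gate'}]}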
judgeval/judges/together_judge.py
CHANGED
@@ -14,7 +14,7 @@ BASE_CONVERSATION = [
 ]
 
 class TogetherJudge(JudgevalJudge):
-    def __init__(self, model: str = "
+    def __init__(self, model: str = "Qwen/Qwen2.5-72B-Instruct-Turbo", **kwargs):
         debug(f"Initializing TogetherJudge with model={model}")
         self.model = model
         self.kwargs = kwargs
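Note: the default judge model becomes Qwen/Qwen2.5-72B-Instruct-Turbo, one of the Together models listed in constants.py. A minimal usage sketch (import path inferred from the file path):

    from judgeval.judges.together_judge import TogetherJudge

    judge = TogetherJudge()  # now defaults to "Qwen/Qwen2.5-72B-Instruct-Turbo"
    print(judge.model)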
judgeval/judges/utils.py
CHANGED
@@ -39,7 +39,7 @@ def create_judge(
             Please either set the `use_judgment` flag to True or use
             non-Judgment models."""
             )
-        if m not in
+        if m not in ACCEPTABLE_MODELS:
             raise InvalidJudgeModelError(f"Invalid judge model chosen: {m}")
     return MixtureOfJudges(models=model), True
     # If model is a string, check that it corresponds to a valid model
# If model is a string, check that it corresponds to a valid model
|