judgeval 0.0.13__py3-none-any.whl → 0.0.15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/common/tracer.py +126 -59
- judgeval/common/utils.py +12 -13
- judgeval/constants.py +61 -10
- judgeval/data/datasets/dataset.py +3 -2
- judgeval/data/datasets/eval_dataset_client.py +25 -14
- judgeval/data/example.py +8 -1
- judgeval/evaluation_run.py +9 -0
- judgeval/judges/together_judge.py +1 -1
- judgeval/judges/utils.py +1 -1
- judgeval/judgment_client.py +163 -28
- judgeval/rules.py +384 -0
- judgeval/run_evaluation.py +32 -14
- judgeval/scorers/api_scorer.py +11 -12
- judgeval/scorers/base_scorer.py +1 -1
- judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/answer_relevancy_scorer.py +0 -1
- judgeval/utils/alerts.py +43 -0
- {judgeval-0.0.13.dist-info → judgeval-0.0.15.dist-info}/METADATA +1 -1
- {judgeval-0.0.13.dist-info → judgeval-0.0.15.dist-info}/RECORD +20 -18
- {judgeval-0.0.13.dist-info → judgeval-0.0.15.dist-info}/WHEEL +0 -0
- {judgeval-0.0.13.dist-info → judgeval-0.0.15.dist-info}/licenses/LICENSE.md +0 -0
judgeval/common/tracer.py
CHANGED
@@ -41,11 +41,13 @@ from judgeval.constants import JUDGMENT_TRACES_SAVE_API_URL, JUDGMENT_TRACES_FET
 from judgeval.judgment_client import JudgmentClient
 from judgeval.data import Example
 from judgeval.scorers import APIJudgmentScorer, JudgevalScorer, ScorerWrapper
+from judgeval.rules import Rule
+from judgeval.evaluation_run import EvaluationRun
+from judgeval.judges import JudgevalJudge

 from rich import print as rprint

 from judgeval.data.result import ScoringResult
-from judgeval.evaluation_run import EvaluationRun

 # Define type aliases for better code readability and maintainability
 ApiClient: TypeAlias = Union[OpenAI, Together, Anthropic]  # Supported API clients
@@ -188,8 +190,9 @@ class TraceManagerClient:
         - Saving a trace
         - Deleting a trace
     """
-    def __init__(self, judgment_api_key: str):
+    def __init__(self, judgment_api_key: str, organization_id: str):
         self.judgment_api_key = judgment_api_key
+        self.organization_id = organization_id

     def fetch_trace(self, trace_id: str):
         """
@@ -199,12 +202,13 @@ class TraceManagerClient:
             JUDGMENT_TRACES_FETCH_API_URL,
             json={
                 "trace_id": trace_id,
-                # "judgment_api_key": self.judgment_api_key,
             },
             headers={
                 "Content-Type": "application/json",
-                "Authorization": f"Bearer {self.judgment_api_key}"
-            }
+                "Authorization": f"Bearer {self.judgment_api_key}",
+                "X-Organization-Id": self.organization_id
+            },
+            verify=False
         )

         if response.status_code != HTTPStatus.OK:
@@ -226,8 +230,10 @@ class TraceManagerClient:
             json=trace_data,
             headers={
                 "Content-Type": "application/json",
-                "Authorization": f"Bearer {self.judgment_api_key}"
-            }
+                "Authorization": f"Bearer {self.judgment_api_key}",
+                "X-Organization-Id": self.organization_id
+            },
+            verify=False
         )

         if response.status_code == HTTPStatus.BAD_REQUEST:
@@ -245,12 +251,12 @@ class TraceManagerClient:
         response = requests.delete(
             JUDGMENT_TRACES_DELETE_API_URL,
             json={
-                "judgment_api_key": self.judgment_api_key,
                 "trace_ids": [trace_id],
             },
             headers={
                 "Content-Type": "application/json",
-                "Authorization": f"Bearer {self.judgment_api_key}"
+                "Authorization": f"Bearer {self.judgment_api_key}",
+                "X-Organization-Id": self.organization_id
             }
         )

@@ -266,12 +272,12 @@ class TraceManagerClient:
         response = requests.delete(
             JUDGMENT_TRACES_DELETE_API_URL,
             json={
-                # "judgment_api_key": self.judgment_api_key,
                 "trace_ids": trace_ids,
             },
             headers={
                 "Content-Type": "application/json",
-                "Authorization": f"Bearer {self.judgment_api_key}"
+                "Authorization": f"Bearer {self.judgment_api_key}",
+                "X-Organization-Id": self.organization_id
             }
         )

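All TraceManagerClient endpoints now send the same header pair. A minimal sketch of the new request shape, with placeholder URL and credentials (the real client reads these from judgeval.constants and its stored key):

import requests

JUDGMENT_TRACES_FETCH_API_URL = "https://api.example.com/traces/fetch/"  # placeholder
judgment_api_key = "sk-placeholder"
organization_id = "org-placeholder"

response = requests.post(
    JUDGMENT_TRACES_FETCH_API_URL,
    json={"trace_id": "some-trace-id"},
    headers={
        "Content-Type": "application/json",
        "Authorization": f"Bearer {judgment_api_key}",
        "X-Organization-Id": organization_id,  # new in 0.0.15: requests are scoped to an organization
    },
    verify=False,  # mirrors the diff; note this disables TLS certificate verification
)
response.raise_for_status()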
@@ -283,18 +289,30 @@ class TraceManagerClient:

 class TraceClient:
     """Client for managing a single trace context"""
-
-
-        self
+
+    def __init__(
+        self,
+        tracer: Optional["Tracer"],
+        trace_id: Optional[str] = None,
+        name: str = "default",
+        project_name: str = "default_project",
+        overwrite: bool = False,
+        rules: Optional[List[Rule]] = None,
+    ):
         self.name = name
+        self.trace_id = trace_id or str(uuid.uuid4())
         self.project_name = project_name
+        self.overwrite = overwrite
+        self.tracer = tracer
+        # Initialize rules with either provided rules or an empty list
+        self.rules = rules or []
+
         self.client: JudgmentClient = tracer.client
         self.entries: List[TraceEntry] = []
         self.start_time = time.time()
         self.span_type = None
         self._current_span: Optional[TraceEntry] = None
-        self.
-        self.trace_manager_client = TraceManagerClient(tracer.api_key)  # Manages DB operations for trace data
+        self.trace_manager_client = TraceManagerClient(tracer.api_key, tracer.organization_id)  # Manages DB operations for trace data

     @contextmanager
     def span(self, name: str, span_type: SpanType = "span"):
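The widened constructor defaults trace_id with `trace_id or str(uuid.uuid4())`, so callers may omit the id. A self-contained illustration of that idiom (the helper name is hypothetical, not a judgeval API):

import uuid
from typing import Optional

def make_trace_id(trace_id: Optional[str] = None) -> str:
    # Mirrors TraceClient.__init__: keep a caller-supplied id, otherwise mint one
    return trace_id or str(uuid.uuid4())

assert make_trace_id("abc-123") == "abc-123"
assert len(make_trace_id()) == 36  # canonical UUID4 string length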
@@ -346,7 +364,7 @@ class TraceClient:
         expected_tools: Optional[List[str]] = None,
         additional_metadata: Optional[Dict[str, Any]] = None,
         model: Optional[str] = None,
-        log_results: Optional[bool] = True
+        log_results: Optional[bool] = True
     ):
         start_time = time.time()  # Record start time
         example = Example(
@@ -360,28 +378,68 @@ class TraceClient:
             additional_metadata=additional_metadata,
             trace_id=self.trace_id
         )
-
+        loaded_rules = None
+        if self.rules:
+            loaded_rules = []
+            for rule in self.rules:
+                processed_conditions = []
+                for condition in rule.conditions:
+                    # Convert metric if it's a ScorerWrapper
+                    try:
+                        if isinstance(condition.metric, ScorerWrapper):
+                            condition_copy = condition.model_copy()
+                            condition_copy.metric = condition.metric.load_implementation(use_judgment=True)
+                            processed_conditions.append(condition_copy)
+                        else:
+                            processed_conditions.append(condition)
+                    except Exception as e:
+                        warnings.warn(f"Failed to convert ScorerWrapper in rule '{rule.name}', condition metric '{condition.metric_name}': {str(e)}")
+                        processed_conditions.append(condition)  # Keep original condition as fallback
+
+                # Create new rule with processed conditions
+                new_rule = rule.model_copy()
+                new_rule.conditions = processed_conditions
+                loaded_rules.append(new_rule)
         try:
             # Load appropriate implementations for all scorers
-            loaded_scorers: List[Union[JudgevalScorer, APIJudgmentScorer]] = [
-
-
-
+            loaded_scorers: List[Union[JudgevalScorer, APIJudgmentScorer]] = []
+            for scorer in scorers:
+                try:
+                    if isinstance(scorer, ScorerWrapper):
+                        loaded_scorers.append(scorer.load_implementation(use_judgment=True))
+                    else:
+                        loaded_scorers.append(scorer)
+                except Exception as e:
+                    warnings.warn(f"Failed to load implementation for scorer {scorer}: {str(e)}")
+                    # Skip this scorer
+
+            if not loaded_scorers:
+                warnings.warn("No valid scorers available for evaluation")
+                return
+
+            # Prevent using JudgevalScorer with rules - only APIJudgmentScorer allowed with rules
+            if loaded_rules and any(isinstance(scorer, JudgevalScorer) for scorer in loaded_scorers):
+                raise ValueError("Cannot use Judgeval scorers (only API scorers) when using rules. Please either remove rules or use only APIJudgmentScorer types.")
+
         except Exception as e:
-
+            warnings.warn(f"Failed to load scorers: {str(e)}")
+            return

+        # Combine the trace-level rules with any evaluation-specific rules)
         eval_run = EvaluationRun(
+            organization_id=self.tracer.organization_id,
             log_results=log_results,
             project_name=self.project_name,
             eval_name=f"{self.name.capitalize()}-"
                       f"{self._current_span}-"
-                      f"[{','.join(scorer.
+                      f"[{','.join(scorer.score_type.capitalize() for scorer in loaded_scorers)}]",
             examples=[example],
             scorers=loaded_scorers,
             model=model,
             metadata={},
             judgment_api_key=self.tracer.api_key,
-            override=self.overwrite
+            override=self.overwrite,
+            rules=loaded_rules  # Use the combined rules
         )

         self.add_eval_run(eval_run, start_time)  # Pass start_time to record_evaluation
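Both loading loops above share one pattern: attempt a conversion, then warn and fall back (or skip) on failure rather than aborting the trace. A self-contained sketch of that pattern with stand-in types (these names are illustrative, not judgeval APIs):

import warnings

class Wrapper:
    # Stand-in for ScorerWrapper: resolves to a concrete implementation on demand
    def __init__(self, name: str, broken: bool = False):
        self.name, self.broken = name, broken

    def load_implementation(self):
        if self.broken:
            raise RuntimeError(f"no implementation for {self.name}")
        return f"{self.name}-impl"

def load_all(scorers):
    loaded = []
    for scorer in scorers:
        try:
            loaded.append(scorer.load_implementation() if isinstance(scorer, Wrapper) else scorer)
        except Exception as e:
            warnings.warn(f"Failed to load implementation for {scorer.name}: {e}")
            # Skip this scorer, as async_evaluate does above
    if not loaded:
        warnings.warn("No valid scorers available for evaluation")
    return loaded

print(load_all([Wrapper("faithfulness"), Wrapper("relevancy", broken=True)]))
# ['faithfulness-impl'], plus a warning for the scorer that failed to load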
@@ -546,7 +604,6 @@ class TraceClient:
         # Create trace document
         trace_data = {
             "trace_id": self.trace_id,
-            "api_key": self.tracer.api_key,
             "name": self.name,
             "project_name": self.project_name,
             "created_at": datetime.fromtimestamp(self.start_time).isoformat(),
@@ -560,7 +617,6 @@ class TraceClient:
             "empty_save": empty_save,
             "overwrite": overwrite
         }
-
         # Execute asynchrous evaluation in the background
         if not empty_save:  # Only send to RabbitMQ if the trace is not empty
             connection = pika.BlockingConnection(
@@ -568,37 +624,23 @@ class TraceClient:
             channel = connection.channel()

             channel.queue_declare(queue=RABBITMQ_QUEUE, durable=True)
-
+            trace_data["judgment_api_key"] = self.tracer.api_key
+            trace_data["organization_id"] = self.tracer.organization_id
             channel.basic_publish(
                 exchange='',
                 routing_key=RABBITMQ_QUEUE,
                 body=json.dumps(trace_data),
                 properties=pika.BasicProperties(
-                    delivery_mode=pika.DeliveryMode.Transient  # Changed from Persistent to Transient
+                    delivery_mode=pika.DeliveryMode.Transient,  # Changed from Persistent to Transient
+                    headers={
+                        'api_key': self.tracer.api_key,
+                        'organization_id': self.tracer.organization_id
+                    }
                 ))
             connection.close()

         self.trace_manager_client.save_trace(trace_data, empty_save)

-
-        # Save trace data by making POST request to API
-        response = requests.post(
-            JUDGMENT_TRACES_SAVE_API_URL,
-            json=trace_data,
-            headers={
-                "Content-Type": "application/json",
-                "Authorization": f"Bearer {self.tracer.api_key}"  # Bearer token format
-            }
-        )
-
-        if response.status_code == HTTPStatus.BAD_REQUEST:
-            raise ValueError(f"Failed to save trace data: Check your Trace name for conflicts, set overwrite=True to overwrite existing traces: {response.text}")
-        elif response.status_code != HTTPStatus.OK:
-            raise ValueError(f"Failed to save trace data: {response.text}")
-
-        if not empty_save and "ui_results_url" in response.json():
-            rprint(f"\n🔍 You can view your trace data here: [rgb(106,0,255)]{response.json()['ui_results_url']}[/]\n")
-
         return self.trace_id, trace_data

     def delete(self):
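The save path now queues the payload with the credentials duplicated into the AMQP message headers instead of POSTing directly to the API. A minimal pika sketch of that publish, assuming a local broker (host, queue, and credentials are placeholders):

import json
import pika

connection = pika.BlockingConnection(pika.ConnectionParameters(host="localhost"))
channel = connection.channel()
channel.queue_declare(queue="task_queue", durable=True)

trace_data = {"trace_id": "t-1", "judgment_api_key": "sk-placeholder", "organization_id": "org-placeholder"}
channel.basic_publish(
    exchange="",
    routing_key="task_queue",
    body=json.dumps(trace_data),
    properties=pika.BasicProperties(
        delivery_mode=pika.DeliveryMode.Transient,  # fire-and-forget, as in the diff
        headers={"api_key": "sk-placeholder", "organization_id": "org-placeholder"},
    ),
)
connection.close()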
@@ -612,16 +654,26 @@ class Tracer:
             cls._instance = super(Tracer, cls).__new__(cls)
         return cls._instance

-    def __init__(
+    def __init__(
+        self,
+        api_key: str = os.getenv("JUDGMENT_API_KEY"),
+        project_name: str = "default_project",
+        rules: Optional[List[Rule]] = None,  # Added rules parameter
+        organization_id: str = os.getenv("JUDGMENT_ORG_ID")):
         if not hasattr(self, 'initialized'):
             if not api_key:
                 raise ValueError("Tracer must be configured with a Judgment API key")

+            if not organization_id:
+                raise ValueError("Tracer must be configured with an Organization ID")
+
             self.api_key: str = api_key
             self.project_name: str = project_name
             self.client: JudgmentClient = JudgmentClient(judgment_api_key=api_key)
+            self.organization_id: str = organization_id
             self.depth: int = 0
             self._current_trace: Optional[str] = None
+            self.rules: List[Rule] = rules or []  # Store rules at tracer level
             self.initialized: bool = True
         elif hasattr(self, 'project_name') and self.project_name != project_name:
             warnings.warn(
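With the new signature, initializing the singleton requires both credentials. A usage sketch (values are placeholders; both default to the JUDGMENT_API_KEY and JUDGMENT_ORG_ID environment variables):

from judgeval.common.tracer import Tracer

tracer = Tracer(
    api_key="sk-placeholder",           # or export JUDGMENT_API_KEY
    organization_id="org-placeholder",  # or export JUDGMENT_ORG_ID; omitting it now raises ValueError
    project_name="default_project",
    rules=None,                         # optional tracer-level Rule list
)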
@@ -632,11 +684,25 @@ class Tracer:
         )

     @contextmanager
-    def trace(
+    def trace(
+        self,
+        name: str,
+        project_name: str = None,
+        overwrite: bool = False,
+        rules: Optional[List[Rule]] = None  # Added rules parameter
+    ) -> Generator[TraceClient, None, None]:
         """Start a new trace context using a context manager"""
         trace_id = str(uuid.uuid4())
         project = project_name if project_name is not None else self.project_name
-
+
+        trace = TraceClient(
+            self,
+            trace_id,
+            name,
+            project_name=project,
+            overwrite=overwrite,
+            rules=self.rules  # Pass combined rules to the trace client
+        )
         prev_trace = self._current_trace
         self._current_trace = trace

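Callers use the context manager as before; the trace now simply inherits the tracer-level rules. A sketch, continuing from the tracer above:

with tracer.trace("checkout-flow", overwrite=True) as trace:
    # spans and evaluations recorded in this block belong to the trace
    pass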
@@ -680,9 +746,9 @@ class Tracer:
                 trace = self._current_trace
             else:
                 trace_id = str(uuid.uuid4())
-                trace_name =
+                trace_name = func.__name__
                 project = project_name if project_name is not None else self.project_name
-                trace = TraceClient(self, trace_id, trace_name, project_name=project, overwrite=overwrite)
+                trace = TraceClient(self, trace_id, trace_name, project_name=project, overwrite=overwrite, rules=self.rules)
                 self._current_trace = trace
                 # Only save empty trace for the root call
                 trace.save(empty_save=True, overwrite=overwrite)
@@ -717,9 +783,9 @@ class Tracer:
                 trace = self._current_trace
             else:
                 trace_id = str(uuid.uuid4())
-                trace_name =
+                trace_name = func.__name__
                 project = project_name if project_name is not None else self.project_name
-                trace = TraceClient(self, trace_id, trace_name, project_name=project, overwrite=overwrite)
+                trace = TraceClient(self, trace_id, trace_name, project_name=project, overwrite=overwrite, rules=self.rules)
                 self._current_trace = trace
                 # Only save empty trace for the root call
                 trace.save(empty_save=True, overwrite=overwrite)
@@ -752,14 +818,15 @@ def wrap(client: Any) -> Any:
     Wraps an API client to add tracing capabilities.
     Supports OpenAI, Together, and Anthropic clients.
     """
-    tracer = Tracer._instance  # Get the global tracer instance
-
     # Get the appropriate configuration for this client type
     span_name, original_create = _get_client_config(client)

     def traced_create(*args, **kwargs):
-        #
-
+        # Get the current tracer instance (might be created after client was wrapped)
+        tracer = Tracer._instance
+
+        # Skip tracing if no tracer exists or no active trace
+        if not tracer or not tracer._current_trace:
             return original_create(*args, **kwargs)

         with tracer._current_trace.span(span_name, span_type="llm") as span:
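Because traced_create now looks the tracer up per call, wrap can run before any Tracer exists and simply passes calls through until a trace is active. A sketch, assuming an OpenAI client and the tracer from above:

from openai import OpenAI
from judgeval.common.tracer import wrap

client = wrap(OpenAI())  # safe even if no Tracer has been constructed yet

with tracer.trace("qa-run") as trace:
    # recorded as an "llm" span on the active trace
    client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": "hello"}],
    )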
judgeval/common/utils.py
CHANGED
@@ -21,7 +21,6 @@ from judgeval.clients import async_together_client, together_client
 from judgeval.constants import *
 from judgeval.common.logger import debug, error

-LITELLM_SUPPORTED_MODELS = set(litellm.model_list)

 class CustomModelParameters(pydantic.BaseModel):
     model_name: str
|
|
72
71
|
def validate_model(cls, model):
|
73
72
|
if not model:
|
74
73
|
raise ValueError("Model cannot be empty")
|
75
|
-
if model not in
|
74
|
+
if model not in ACCEPTABLE_MODELS:
|
76
75
|
raise ValueError(f"Model {model} is not in the list of supported models.")
|
77
76
|
return model
|
78
77
|
|
@@ -114,13 +113,13 @@ def fetch_together_api_response(model: str, messages: List[Mapping], response_fo
     if request.response_format is not None:
         debug(f"Using response format: {request.response_format}")
         response = together_client.chat.completions.create(
-            model=
+            model=request.model,
             messages=request.messages,
             response_format=request.response_format
         )
     else:
         response = together_client.chat.completions.create(
-            model=
+            model=request.model,
             messages=request.messages,
         )

@@ -144,13 +143,13 @@ async def afetch_together_api_response(model: str, messages: List[Mapping], resp
     if request.response_format is not None:
         debug(f"Using response format: {request.response_format}")
         response = await async_together_client.chat.completions.create(
-            model=
+            model=request.model,
             messages=request.messages,
             response_format=request.response_format
         )
     else:
         response = await async_together_client.chat.completions.create(
-            model=
+            model=request.model,
             messages=request.messages,
         )
     return response.choices[0].message.content
@@ -174,8 +173,8 @@ def query_together_api_multiple_calls(models: List[str], messages: List[List[Map

     # Validate all models are supported
     for model in models:
-        if model not in
-            raise ValueError(f"Model {model} is not in the list of supported
+        if model not in ACCEPTABLE_MODELS:
+            raise ValueError(f"Model {model} is not in the list of supported models: {ACCEPTABLE_MODELS}.")

     # Validate input lengths match
     if response_formats is None:
@@ -223,8 +222,8 @@ async def aquery_together_api_multiple_calls(models: List[str], messages: List[L

     # Validate all models are supported
     for model in models:
-        if model not in
-            raise ValueError(f"Model {model} is not in the list of supported
+        if model not in ACCEPTABLE_MODELS:
+            raise ValueError(f"Model {model} is not in the list of supported models: {ACCEPTABLE_MODELS}.")

     # Validate input lengths match
     if response_formats is None:
@@ -322,8 +321,8 @@ async def afetch_litellm_api_response(model: str, messages: List[Mapping], respo
     # Add validation
     validate_chat_messages(messages)

-    if model not in
-        raise ValueError(f"Model {model} is not in the list of supported
+    if model not in ACCEPTABLE_MODELS:
+        raise ValueError(f"Model {model} is not in the list of supported models: {ACCEPTABLE_MODELS}.")

     if response_format is not None:
         response = await litellm.acompletion(
@@ -409,7 +408,7 @@ async def aquery_litellm_api_multiple_calls(models: List[str], messages: List[Ma
         models (List[str]): List of models to query
         messages (List[Mapping]): List of messages to query
         response_formats (List[pydantic.BaseModel], optional): A list of the format of the response if JSON forcing. Defaults to None.
-
+
     Returns:
         List[str]: Litellm responses for each model and message pair in order. Any exceptions in the thread call result in a None.
     """
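Every call site above now validates against the shared ACCEPTABLE_MODELS set from judgeval.constants instead of a module-local list. A standalone sketch of the check (the set here is a stand-in, not the real constant):

ACCEPTABLE_MODELS = {"gpt-4o", "deepseek-ai/DeepSeek-V3", "osiris-mini"}  # stand-in

def validate_models(models):
    for model in models:
        if model not in ACCEPTABLE_MODELS:
            raise ValueError(f"Model {model} is not in the list of supported models: {ACCEPTABLE_MODELS}.")

validate_models(["gpt-4o"])           # passes silently
# validate_models(["unknown-model"])  # would raise ValueError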
judgeval/constants.py
CHANGED
@@ -51,20 +51,71 @@ JUDGMENT_TRACES_DELETE_API_URL = f"{ROOT_API}/traces/delete/"
 RABBITMQ_HOST = os.getenv("RABBITMQ_HOST", "rabbitmq-networklb-faa155df16ec9085.elb.us-west-1.amazonaws.com")
 RABBITMQ_PORT = os.getenv("RABBITMQ_PORT", 5672)
 RABBITMQ_QUEUE = os.getenv("RABBITMQ_QUEUE", "task_queue")
-
 # Models
-
-
-
-
-
-
-
-
+LITELLM_SUPPORTED_MODELS = set(litellm.model_list)
+
+TOGETHER_SUPPORTED_MODELS = [
+    "meta-llama/Meta-Llama-3-70B-Instruct-Turbo",
+    "Qwen/Qwen2-VL-72B-Instruct",
+    "meta-llama/Llama-Vision-Free",
+    "Gryphe/MythoMax-L2-13b",
+    "Qwen/Qwen2.5-72B-Instruct-Turbo",
+    "meta-llama/Llama-3.3-70B-Instruct-Turbo-Free",
+    "deepseek-ai/DeepSeek-R1",
+    "meta-llama/Llama-3.2-90B-Vision-Instruct-Turbo",
+    "meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo",
+    "google/gemma-2-27b-it",
+    "mistralai/Mistral-Small-24B-Instruct-2501",
+    "mistralai/Mixtral-8x22B-Instruct-v0.1",
+    "meta-llama/Meta-Llama-3-8B-Instruct-Turbo",
+    "NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO",
+    "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo-classifier",
+    "deepseek-ai/DeepSeek-V3",
+    "Qwen/Qwen2-72B-Instruct",
+    "meta-llama/Meta-Llama-3-8B-Instruct-Lite",
+    "deepseek-ai/DeepSeek-R1-Distill-Llama-70B",
+    "upstage/SOLAR-10.7B-Instruct-v1.0",
+    "togethercomputer/MoA-1",
+    "Qwen/QwQ-32B-Preview",
+    "meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo",
+    "meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo",
+    "mistralai/Mistral-7B-Instruct-v0.2",
+    "databricks/dbrx-instruct",
+    "meta-llama/Llama-3-8b-chat-hf",
+    "google/gemma-2b-it",
+    "meta-llama/Meta-Llama-3-70B-Instruct-Lite",
+    "google/gemma-2-9b-it",
+    "meta-llama/Llama-3.3-70B-Instruct-Turbo",
+    "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo-p",
+    "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
+    "Gryphe/MythoMax-L2-13b-Lite",
+    "meta-llama/Llama-2-7b-chat-hf",
+    "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
+    "meta-llama/Llama-2-13b-chat-hf",
+    "scb10x/scb10x-llama3-typhoon-v1-5-8b-instruct",
+    "scb10x/scb10x-llama3-typhoon-v1-5x-4f316",
+    "nvidia/Llama-3.1-Nemotron-70B-Instruct-HF",
+    "Qwen/Qwen2.5-Coder-32B-Instruct",
+    "microsoft/WizardLM-2-8x22B",
+    "mistralai/Mistral-7B-Instruct-v0.3",
+    "scb10x/scb10x-llama3-1-typhoon2-60256",
+    "Qwen/Qwen2.5-7B-Instruct-Turbo",
+    "scb10x/scb10x-llama3-1-typhoon-18370",
+    "meta-llama/Llama-3.2-3B-Instruct-Turbo",
+    "meta-llama/Llama-3-70b-chat-hf",
+    "mistralai/Mixtral-8x7B-Instruct-v0.1",
+    "togethercomputer/MoA-1-Turbo",
+    "deepseek-ai/DeepSeek-R1-Distill-Llama-70B-free",
+    "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B",
+    "mistralai/Mistral-7B-Instruct-v0.1"
+]

 JUDGMENT_SUPPORTED_MODELS = {"osiris-large", "osiris-mini"}

-ACCEPTABLE_MODELS = set(litellm.model_list) | set(TOGETHER_SUPPORTED_MODELS
+ACCEPTABLE_MODELS = set(litellm.model_list) | set(TOGETHER_SUPPORTED_MODELS) | JUDGMENT_SUPPORTED_MODELS

 ## System settings
 MAX_WORKER_THREADS = 10
+
+# Maximum number of concurrent operations for evaluation runs
+MAX_CONCURRENT_EVALUATIONS = 50  # Adjust based on system capabilities
judgeval/data/datasets/dataset.py
CHANGED
@@ -17,9 +17,10 @@ class EvalDataset:
     _alias: Union[str, None] = field(default=None)
     _id: Union[str, None] = field(default=None)
     judgment_api_key: str = field(default="")
-
+    organization_id: str = field(default="")
     def __init__(self,
                  judgment_api_key: str = os.getenv("JUDGMENT_API_KEY"),
+                 organization_id: str = os.getenv("JUDGMENT_ORG_ID"),
                  ground_truths: List[GroundTruthExample] = [],
                  examples: List[Example] = [],
                  ):
@@ -31,7 +32,7 @@ class EvalDataset:
         self._alias = None
         self._id = None
         self.judgment_api_key = judgment_api_key
-
+        self.organization_id = organization_id

     def add_from_json(self, file_path: str) -> None:
         debug(f"Loading dataset from JSON file: {file_path}")
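The dataset now carries a second credential that also defaults from the environment. A construction sketch (placeholder values; the module path is taken from the file shown):

from judgeval.data.datasets.dataset import EvalDataset

dataset = EvalDataset(
    judgment_api_key="sk-placeholder",   # default: os.getenv("JUDGMENT_API_KEY")
    organization_id="org-placeholder",   # default: os.getenv("JUDGMENT_ORG_ID")
    ground_truths=[],
    examples=[],
)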
judgeval/data/datasets/eval_dataset_client.py
CHANGED
@@ -19,8 +19,9 @@ from judgeval.data.datasets.ground_truth import GroundTruthExample


 class EvalDatasetClient:
-    def __init__(self, judgment_api_key: str):
+    def __init__(self, judgment_api_key: str, organization_id: str):
         self.judgment_api_key = judgment_api_key
+        self.organization_id = organization_id

     def create_dataset(self) -> EvalDataset:
         return EvalDataset(judgment_api_key=self.judgment_api_key)
@@ -58,7 +59,6 @@ class EvalDatasetClient:
             "ground_truths": [g.to_dict() for g in dataset.ground_truths],
             "examples": [e.to_dict() for e in dataset.examples],
             "overwrite": overwrite,
-            # "judgment_api_key": dataset.judgment_api_key
         }
         try:
             response = requests.post(
@@ -66,8 +66,10 @@ class EvalDatasetClient:
                 json=content,
                 headers={
                     "Content-Type": "application/json",
-                    "Authorization": f"Bearer {self.judgment_api_key}"
-                }
+                    "Authorization": f"Bearer {self.judgment_api_key}",
+                    "X-Organization-Id": self.organization_id
+                },
+                verify=False
             )
             if response.status_code == 500:
                 error(f"Server error during push: {content.get('message')}")
@@ -121,7 +123,6 @@ class EvalDatasetClient:
         )
         request_body = {
             "alias": alias,
-            # "judgment_api_key": self.judgment_api_key
         }

         try:
@@ -130,8 +131,10 @@ class EvalDatasetClient:
                 json=request_body,
                 headers={
                     "Content-Type": "application/json",
-                    "Authorization": f"Bearer {self.judgment_api_key}"
-                }
+                    "Authorization": f"Bearer {self.judgment_api_key}",
+                    "X-Organization-Id": self.organization_id
+                },
+                verify=False
             )
             response.raise_for_status()
         except requests.exceptions.RequestException as e:
@@ -179,7 +182,6 @@ class EvalDatasetClient:
             total=100,
         )
         request_body = {
-            # "judgment_api_key": self.judgment_api_key
         }

         try:
@@ -188,8 +190,10 @@ class EvalDatasetClient:
                 json=request_body,
                 headers={
                     "Content-Type": "application/json",
-                    "Authorization": f"Bearer {self.judgment_api_key}"
-                }
+                    "Authorization": f"Bearer {self.judgment_api_key}",
+                    "X-Organization-Id": self.organization_id
+                },
+                verify=False
             )
             response.raise_for_status()
         except requests.exceptions.RequestException as e:
@@ -232,13 +236,18 @@ class EvalDatasetClient:
             "alias": alias,
             "examples": [e.to_dict() for e in examples],
             "ground_truths": [g.to_dict() for g in ground_truths],
-            "judgment_api_key": self.judgment_api_key
         }

         try:
             response = requests.post(
                 JUDGMENT_DATASETS_EDIT_API_URL,
-                json=content
+                json=content,
+                headers={
+                    "Content-Type": "application/json",
+                    "Authorization": f"Bearer {self.judgment_api_key}",
+                    "X-Organization-Id": self.organization_id
+                },
+                verify=False
             )
             response.raise_for_status()
         except requests.exceptions.RequestException as e:
@@ -266,9 +275,11 @@ class EvalDatasetClient:
                 json={"alias": alias},
                 headers={
                     "Content-Type": "application/json",
-                    "Authorization": f"Bearer {self.judgment_api_key}"
+                    "Authorization": f"Bearer {self.judgment_api_key}",
+                    "X-Organization-Id": self.organization_id
                 },
-                stream=True
+                stream=True,
+                verify=False
             )
             response.raise_for_status()
         except requests.exceptions.HTTPError as err:
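Every dataset endpoint now sends the same Authorization and X-Organization-Id pair as the trace endpoints. A construction sketch (placeholder credentials; the module path is taken from the file shown, and the commented call is hypothetical since the push signature is not visible in this diff):

from judgeval.data.datasets.eval_dataset_client import EvalDatasetClient

client = EvalDatasetClient(judgment_api_key="sk-placeholder", organization_id="org-placeholder")
dataset = client.create_dataset()
# client.push(dataset, alias="my-dataset", overwrite=False)  # hypothetical signature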
|