judgeval 0.0.44__py3-none-any.whl → 0.0.46__py3-none-any.whl
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between these versions as they appear in their respective public registries.
- judgeval/__init__.py +5 -4
- judgeval/clients.py +6 -6
- judgeval/common/__init__.py +7 -2
- judgeval/common/exceptions.py +2 -3
- judgeval/common/logger.py +74 -49
- judgeval/common/s3_storage.py +30 -23
- judgeval/common/tracer.py +1273 -939
- judgeval/common/utils.py +416 -244
- judgeval/constants.py +73 -61
- judgeval/data/__init__.py +1 -1
- judgeval/data/custom_example.py +3 -2
- judgeval/data/datasets/dataset.py +80 -54
- judgeval/data/datasets/eval_dataset_client.py +131 -181
- judgeval/data/example.py +67 -43
- judgeval/data/result.py +11 -9
- judgeval/data/scorer_data.py +4 -2
- judgeval/data/tool.py +25 -16
- judgeval/data/trace.py +57 -29
- judgeval/data/trace_run.py +5 -11
- judgeval/evaluation_run.py +22 -82
- judgeval/integrations/langgraph.py +546 -184
- judgeval/judges/base_judge.py +1 -2
- judgeval/judges/litellm_judge.py +33 -11
- judgeval/judges/mixture_of_judges.py +128 -78
- judgeval/judges/together_judge.py +22 -9
- judgeval/judges/utils.py +14 -5
- judgeval/judgment_client.py +259 -271
- judgeval/rules.py +169 -142
- judgeval/run_evaluation.py +462 -305
- judgeval/scorers/api_scorer.py +20 -11
- judgeval/scorers/exceptions.py +1 -0
- judgeval/scorers/judgeval_scorer.py +77 -58
- judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +46 -15
- judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +3 -2
- judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +3 -2
- judgeval/scorers/judgeval_scorers/api_scorers/classifier_scorer.py +12 -11
- judgeval/scorers/judgeval_scorers/api_scorers/comparison.py +7 -5
- judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py +3 -2
- judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py +3 -2
- judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py +5 -2
- judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py +2 -1
- judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +17 -8
- judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +3 -2
- judgeval/scorers/judgeval_scorers/api_scorers/groundedness.py +3 -2
- judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +3 -2
- judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +3 -2
- judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py +8 -9
- judgeval/scorers/judgeval_scorers/api_scorers/summarization.py +4 -4
- judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +5 -5
- judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +5 -2
- judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py +9 -10
- judgeval/scorers/prompt_scorer.py +48 -37
- judgeval/scorers/score.py +86 -53
- judgeval/scorers/utils.py +11 -7
- judgeval/tracer/__init__.py +1 -1
- judgeval/utils/alerts.py +23 -12
- judgeval/utils/{data_utils.py → file_utils.py} +5 -9
- judgeval/utils/requests.py +29 -0
- judgeval/version_check.py +5 -2
- {judgeval-0.0.44.dist-info → judgeval-0.0.46.dist-info}/METADATA +79 -135
- judgeval-0.0.46.dist-info/RECORD +69 -0
- judgeval-0.0.44.dist-info/RECORD +0 -68
- {judgeval-0.0.44.dist-info → judgeval-0.0.46.dist-info}/WHEEL +0 -0
- {judgeval-0.0.44.dist-info → judgeval-0.0.46.dist-info}/licenses/LICENSE.md +0 -0
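One change worth calling out from the list above: HTTP calls now go through a new `judgeval/utils/requests.py` module, and the `judgeval/common/utils.py` diff below switches from `import requests` to `from judgeval.utils.requests import requests`. The contents of that new module are not shown in this section; the sketch below is only a guess at what such a wrapper commonly looks like (a `requests.Session` preconfigured with retries, re-exported under the name `requests`) and is not the package's actual code.

```python
# Hypothetical sketch of a judgeval/utils/requests.py-style wrapper.
# The real module is not shown in this diff; the retry policy and the
# exposed name are illustrative assumptions only.
import requests as _requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry


def _build_session() -> _requests.Session:
    # Retry transient failures before surfacing an error to the caller.
    session = _requests.Session()
    retry = Retry(total=3, backoff_factor=0.5, status_forcelist=(502, 503, 504))
    adapter = HTTPAdapter(max_retries=retry)
    session.mount("http://", adapter)
    session.mount("https://", adapter)
    return session


# Re-exported as `requests` so call sites such as validate_api_key() can keep
# writing `requests.post(...)` after only changing their import.
requests = _build_session()
```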
judgeval/common/utils.py
CHANGED
```diff
@@ -2,7 +2,7 @@
 
 This file contains utility functions used in repo scripts
 
 For API calling, we support:
-    - parallelized model calls on the same prompt
+    - parallelized model calls on the same prompt
     - batched model calls on different prompts
 
 NOTE: any function beginning with 'a', e.g. 'afetch_together_api_response', is an asynchronous function
@@ -13,9 +13,9 @@ import asyncio
 import concurrent.futures
 import os
 from types import TracebackType
-import requests
+from judgeval.utils.requests import requests
 import pprint
-from typing import Any, Dict, List,
+from typing import Any, Dict, List, Mapping, Optional, TypeAlias, Union, TypeGuard
 
 # Third-party imports
 import litellm
@@ -24,7 +24,13 @@ from dotenv import load_dotenv
 
 # Local application/library-specific imports
 from judgeval.clients import async_together_client, together_client
-from judgeval.constants import
+from judgeval.constants import (
+    ACCEPTABLE_MODELS,
+    MAX_WORKER_THREADS,
+    ROOT_API,
+    TOGETHER_SUPPORTED_MODELS,
+    LITELLM_SUPPORTED_MODELS,
+)
 from judgeval.common.logger import debug, error
 
 
@@ -32,72 +38,80 @@ class CustomModelParameters(pydantic.BaseModel):
     model_name: str
     secret_key: str
     litellm_base_url: str
-
-    @pydantic.field_validator(
+
+    @pydantic.field_validator("model_name")
     def validate_model_name(cls, v):
         if not v:
             raise ValueError("Model name cannot be empty")
         return v
-
-    @pydantic.field_validator(
+
+    @pydantic.field_validator("secret_key")
     def validate_secret_key(cls, v):
         if not v:
             raise ValueError("Secret key cannot be empty")
         return v
-
-    @pydantic.field_validator(
+
+    @pydantic.field_validator("litellm_base_url")
     def validate_litellm_base_url(cls, v):
         if not v:
             raise ValueError("Litellm base URL cannot be empty")
         return v
 
+
 class ChatCompletionRequest(pydantic.BaseModel):
     model: str
     messages: List[Dict[str, str]]
     response_format: Optional[Union[pydantic.BaseModel, Dict[str, Any]]] = None
-
-    @pydantic.field_validator(
+
+    @pydantic.field_validator("messages")
    def validate_messages(cls, messages):
        if not messages:
            raise ValueError("Messages cannot be empty")
-
+
        for msg in messages:
            if not isinstance(msg, dict):
                raise TypeError("Message must be a dictionary")
-            if
+            if "role" not in msg:
                raise ValueError("Message missing required 'role' field")
-            if
+            if "content" not in msg:
                raise ValueError("Message missing required 'content' field")
-            if msg[
-                raise ValueError(
-
+            if msg["role"] not in ["system", "user", "assistant"]:
+                raise ValueError(
+                    f"Invalid role '{msg['role']}'. Must be 'system', 'user', or 'assistant'"
+                )
+
        return messages
 
-    @pydantic.field_validator(
+    @pydantic.field_validator("model")
    def validate_model(cls, model):
        if not model:
            raise ValueError("Model cannot be empty")
        if model not in ACCEPTABLE_MODELS:
            raise ValueError(f"Model {model} is not in the list of supported models.")
        return model
-
-    @pydantic.field_validator(
+
+    @pydantic.field_validator("response_format", mode="before")
    def validate_response_format(cls, response_format):
        if response_format is not None:
            if not isinstance(response_format, (dict, pydantic.BaseModel)):
-                raise TypeError(
+                raise TypeError(
+                    "Response format must be a dictionary or pydantic model"
+                )
            # Optional: Add additional validation for required fields if needed
            # For example, checking for 'type': 'json' in OpenAI's format
        return response_format
 
-
+
+os.environ["LITELLM_LOG"] = "DEBUG"
 
 load_dotenv()
 
+
 def read_file(file_path: str) -> str:
-    with open(file_path, "r", encoding=
+    with open(file_path, "r", encoding="utf-8") as file:
        return file.read()
 
+
 def validate_api_key(judgment_api_key: str):
    """
    Validates that the user api key is valid
@@ -109,66 +123,67 @@ def validate_api_key(judgment_api_key: str):
            "Authorization": f"Bearer {judgment_api_key}",
        },
        json={},  # Empty body now
-        verify=True
+        verify=True,
    )
    if response.status_code == 200:
        return True, response.json()
    else:
        return False, response.json().get("detail", "Error validating API key")
 
-
+
+def fetch_together_api_response(
+    model: str, messages: List[Mapping], response_format: pydantic.BaseModel = None
+) -> str:
    """
    Fetches a single response from the Together API for a given model and messages.
    """
    # Validate request
    if messages is None or messages == []:
        raise ValueError("Messages cannot be empty")
-
+
    request = ChatCompletionRequest(
-
-        messages=messages,
-        response_format=response_format
+        model=model, messages=messages, response_format=response_format
    )
-
+
    debug(f"Calling Together API with model: {request.model}")
    debug(f"Messages: {request.messages}")
-
+
    if request.response_format is not None:
        debug(f"Using response format: {request.response_format}")
        response = together_client.chat.completions.create(
            model=request.model,
            messages=request.messages,
-            response_format=request.response_format
+            response_format=request.response_format,
        )
    else:
        response = together_client.chat.completions.create(
            model=request.model,
            messages=request.messages,
        )
-
+
    debug(f"Received response: {response.choices[0].message.content[:100]}...")
    return response.choices[0].message.content
 
 
-async def afetch_together_api_response(
+async def afetch_together_api_response(
+    model: str, messages: List[Mapping], response_format: pydantic.BaseModel = None
+) -> str:
    """
    ASYNCHRONOUSLY Fetches a single response from the Together API for a given model and messages.
    """
    request = ChatCompletionRequest(
-        model=model,
-        messages=messages,
-        response_format=response_format
+        model=model, messages=messages, response_format=response_format
    )
-
+
    debug(f"Calling Together API with model: {request.model}")
    debug(f"Messages: {request.messages}")
-
+
    if request.response_format is not None:
        debug(f"Using response format: {request.response_format}")
        response = await async_together_client.chat.completions.create(
            model=request.model,
            messages=request.messages,
-            response_format=request.response_format
+            response_format=request.response_format,
        )
    else:
        response = await async_together_client.chat.completions.create(
@@ -178,7 +193,11 @@ async def afetch_together_api_response(model: str, messages: List[Mapping], resp
    return response.choices[0].message.content
 
 
-def query_together_api_multiple_calls(
+def query_together_api_multiple_calls(
+    models: List[str],
+    messages: List[List[Mapping]],
+    response_formats: List[pydantic.BaseModel] | None = None,
+) -> List[Union[str, None]]:
    """
    Queries the Together API for multiple calls in parallel
 
@@ -197,25 +216,35 @@ def query_together_api_multiple_calls(models: List[str], messages: List[List[Map
    # Validate all models are supported
    for model in models:
        if model not in ACCEPTABLE_MODELS:
-            raise ValueError(
+            raise ValueError(
+                f"Model {model} is not in the list of supported models: {ACCEPTABLE_MODELS}."
+            )
 
    # Validate input lengths match
    if response_formats is None:
        response_formats = [None] * len(models)
    if not (len(models) == len(messages) == len(response_formats)):
-        raise ValueError(
+        raise ValueError(
+            "Number of models, messages, and response formats must be the same"
+        )
 
    # Validate message format
    validate_batched_chat_messages(messages)
 
-    num_workers = int(os.getenv(
+    num_workers = int(os.getenv("NUM_WORKER_THREADS", MAX_WORKER_THREADS))
    # Initialize results to maintain ordered outputs
-    out = [None] * len(messages)
+    out: List[str | None] = [None] * len(messages)
    with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:
        # Submit all queries to together API with index, gets back the response content
-        futures = {
-
-
+        futures = {
+            executor.submit(
+                fetch_together_api_response, model, message, response_format
+            ): idx
+            for idx, (model, message, response_format) in enumerate(
+                zip(models, messages, response_formats)
+            )
+        }
+
        # Collect results as they complete -- result is response content
        for future in concurrent.futures.as_completed(futures):
            idx = futures[future]
@@ -223,11 +252,15 @@ def query_together_api_multiple_calls(models: List[str], messages: List[List[Map
                out[idx] = future.result()
            except Exception as e:
                error(f"Error in parallel call {idx}: {str(e)}")
-                out[idx] = None
+                out[idx] = None
    return out
 
 
-async def aquery_together_api_multiple_calls(
+async def aquery_together_api_multiple_calls(
+    models: List[str],
+    messages: List[List[Mapping]],
+    response_formats: List[pydantic.BaseModel] | None = None,
+) -> List[Union[str, None]]:
    """
    Queries the Together API for multiple calls in parallel
 
@@ -246,57 +279,65 @@ async def aquery_together_api_multiple_calls(models: List[str], messages: List[L
    # Validate all models are supported
    for model in models:
        if model not in ACCEPTABLE_MODELS:
-            raise ValueError(
+            raise ValueError(
+                f"Model {model} is not in the list of supported models: {ACCEPTABLE_MODELS}."
+            )
 
    # Validate input lengths match
    if response_formats is None:
        response_formats = [None] * len(models)
    if not (len(models) == len(messages) == len(response_formats)):
-        raise ValueError(
+        raise ValueError(
+            "Number of models, messages, and response formats must be the same"
+        )
 
    # Validate message format
    validate_batched_chat_messages(messages)
 
    debug(f"Starting parallel Together API calls for {len(messages)} messages")
-    out = [None] * len(messages)
-
+    out: List[Union[str, None]] = [None] * len(messages)
+
    async def fetch_and_store(idx, model, message, response_format):
        try:
            debug(f"Processing call {idx} with model {model}")
-            out[idx] = await afetch_together_api_response(
+            out[idx] = await afetch_together_api_response(
+                model, message, response_format
+            )
        except Exception as e:
            error(f"Error in parallel call {idx}: {str(e)}")
            out[idx] = None
 
    tasks = [
        fetch_and_store(idx, model, message, response_format)
-        for idx, (model, message, response_format) in enumerate(
+        for idx, (model, message, response_format) in enumerate(
+            zip(models, messages, response_formats)
+        )
    ]
-
+
    await asyncio.gather(*tasks)
    debug(f"Completed {len(messages)} parallel calls")
    return out
 
 
-def fetch_litellm_api_response(
+def fetch_litellm_api_response(
+    model: str, messages: List[Mapping], response_format: pydantic.BaseModel = None
+) -> str:
    """
    Fetches a single response from the Litellm API for a given model and messages.
    """
    request = ChatCompletionRequest(
-        model=model,
-        messages=messages,
-        response_format=response_format
+        model=model, messages=messages, response_format=response_format
    )
-
+
    debug(f"Calling LiteLLM API with model: {request.model}")
    debug(f"Messages: {request.messages}")
-
+
    if request.response_format is not None:
        debug(f"Using response format: {request.response_format}")
        response = litellm.completion(
            model=request.model,
            messages=request.messages,
-            response_format=request.response_format
+            response_format=request.response_format,
        )
    else:
        response = litellm.completion(
@@ -306,23 +347,29 @@ def fetch_litellm_api_response(model: str, messages: List[Mapping], response_for
    return response.choices[0].message.content
 
 
-def fetch_custom_litellm_api_response(
+def fetch_custom_litellm_api_response(
+    custom_model_parameters: CustomModelParameters,
+    messages: List[Mapping],
+    response_format: pydantic.BaseModel = None,
+) -> str:
    if messages is None or messages == []:
        raise ValueError("Messages cannot be empty")
-
+
    if custom_model_parameters is None:
        raise ValueError("Custom model parameters cannot be empty")
-
+
    if not isinstance(custom_model_parameters, CustomModelParameters):
-        raise ValueError(
-
+        raise ValueError(
+            "Custom model parameters must be a CustomModelParameters object"
+        )
+
    if response_format is not None:
        response = litellm.completion(
            model=custom_model_parameters.model_name,
            messages=messages,
            api_key=custom_model_parameters.secret_key,
            base_url=custom_model_parameters.litellm_base_url,
-            response_format=response_format
+            response_format=response_format,
        )
    else:
        response = litellm.completion(
@@ -334,53 +381,61 @@ def fetch_custom_litellm_api_response(custom_model_parameters: CustomModelParame
    return response.choices[0].message.content
 
 
-async def afetch_litellm_api_response(
+async def afetch_litellm_api_response(
+    model: str, messages: List[Mapping], response_format: pydantic.BaseModel = None
+) -> str:
    """
    ASYNCHRONOUSLY Fetches a single response from the Litellm API for a given model and messages.
    """
    if messages is None or messages == []:
        raise ValueError("Messages cannot be empty")
-
+
    # Add validation
    validate_chat_messages(messages)
-
+
    if model not in ACCEPTABLE_MODELS:
-        raise ValueError(
-
+        raise ValueError(
+            f"Model {model} is not in the list of supported models: {ACCEPTABLE_MODELS}."
+        )
+
    if response_format is not None:
        response = await litellm.acompletion(
-            model=model,
-            messages=messages,
-            response_format=response_format
+            model=model, messages=messages, response_format=response_format
        )
    else:
        response = await litellm.acompletion(
            model=model,
-            messages=messages,
+            messages=messages,
        )
    return response.choices[0].message.content
 
 
-async def afetch_custom_litellm_api_response(
+async def afetch_custom_litellm_api_response(
+    custom_model_parameters: CustomModelParameters,
+    messages: List[Mapping],
+    response_format: pydantic.BaseModel = None,
+) -> str:
    """
    ASYNCHRONOUSLY Fetches a single response from the Litellm API for a given model and messages.
    """
    if messages is None or messages == []:
        raise ValueError("Messages cannot be empty")
-
+
    if custom_model_parameters is None:
        raise ValueError("Custom model parameters cannot be empty")
-
+
    if not isinstance(custom_model_parameters, CustomModelParameters):
-        raise ValueError(
-
+        raise ValueError(
+            "Custom model parameters must be a CustomModelParameters object"
+        )
+
    if response_format is not None:
        response = await litellm.acompletion(
            model=custom_model_parameters.model_name,
            messages=messages,
            api_key=custom_model_parameters.secret_key,
            base_url=custom_model_parameters.litellm_base_url,
-            response_format=response_format
+            response_format=response_format,
        )
    else:
        response = await litellm.acompletion(
@@ -392,26 +447,36 @@ async def afetch_custom_litellm_api_response(custom_model_parameters: CustomMode
    return response.choices[0].message.content
 
 
-def query_litellm_api_multiple_calls(
+def query_litellm_api_multiple_calls(
+    models: List[str],
+    messages: List[List[Mapping]],
+    response_formats: List[pydantic.BaseModel] | None = None,
+) -> List[Union[str, None]]:
    """
    Queries the Litellm API for multiple calls in parallel
 
    Args:
        models (List[str]): List of models to query
-        messages (List[Mapping]): List of messages to query
+        messages (List[List[Mapping]]): List of messages to query
        response_formats (List[pydantic.BaseModel], optional): A list of the format of the response if JSON forcing. Defaults to None.
 
    Returns:
        List[str]: Litellm responses for each model and message pair in order. Any exceptions in the thread call result in a None.
    """
-    num_workers = int(os.getenv(
+    num_workers = int(os.getenv("NUM_WORKER_THREADS", MAX_WORKER_THREADS))
    # Initialize results to maintain ordered outputs
-    out = [None] * len(messages)
+    out: List[Union[str, None]] = [None] * len(messages)
    with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:
        # Submit all queries to Litellm API with index, gets back the response content
-        futures = {
-
-
+        futures = {
+            executor.submit(
+                fetch_litellm_api_response, model, message, response_format
+            ): idx
+            for idx, (model, message, response_format) in enumerate(
+                zip(models, messages, response_formats or [None] * len(messages))
+            )
+        }
+
        # Collect results as they complete -- result is response content
        for future in concurrent.futures.as_completed(futures):
            idx = futures[future]
@@ -419,37 +484,45 @@ def query_litellm_api_multiple_calls(models: List[str], messages: List[Mapping],
                out[idx] = future.result()
            except Exception as e:
                error(f"Error in parallel call {idx}: {str(e)}")
-                out[idx] = None
+                out[idx] = None
    return out
 
 
-async def aquery_litellm_api_multiple_calls(
+async def aquery_litellm_api_multiple_calls(
+    models: List[str],
+    messages: List[List[Mapping]],
+    response_formats: List[pydantic.BaseModel] | None = None,
+) -> List[Union[str, None]]:
    """
    Queries the Litellm API for multiple calls in parallel
 
    Args:
        models (List[str]): List of models to query
-        messages (List[Mapping]): List of messages to query
+        messages (List[List[Mapping]]): List of messages to query
        response_formats (List[pydantic.BaseModel], optional): A list of the format of the response if JSON forcing. Defaults to None.
-
+
    Returns:
        List[str]: Litellm responses for each model and message pair in order. Any exceptions in the thread call result in a None.
    """
    # Initialize results to maintain ordered outputs
-    out = [None] * len(messages)
-
+    out: List[Union[str, None]] = [None] * len(messages)
+
    async def fetch_and_store(idx, model, message, response_format):
        try:
-            out[idx] = await afetch_litellm_api_response(
+            out[idx] = await afetch_litellm_api_response(
+                model, message, response_format
+            )
        except Exception as e:
            error(f"Error in parallel call {idx}: {str(e)}")
            out[idx] = None
 
    tasks = [
        fetch_and_store(idx, model, message, response_format)
-        for idx, (model, message, response_format) in enumerate(
+        for idx, (model, message, response_format) in enumerate(
+            zip(models, messages, response_formats or [None] * len(messages))
+        )
    ]
-
+
    await asyncio.gather(*tasks)
    return out
 
@@ -458,56 +531,75 @@ def validate_chat_messages(messages, batched: bool = False):
    """Validate chat message format before API call"""
    if not isinstance(messages, list):
        raise TypeError("Messages must be a list")
-
+
    for msg in messages:
        if not isinstance(msg, dict):
            if batched and not isinstance(msg, list):
                raise TypeError("Each message must be a list")
            elif not batched:
                raise TypeError("Message must be a dictionary")
-        if
+        if "role" not in msg:
            raise ValueError("Message missing required 'role' field")
-        if
+        if "content" not in msg:
            raise ValueError("Message missing required 'content' field")
-        if msg[
-            raise ValueError(
+        if msg["role"] not in ["system", "user", "assistant"]:
+            raise ValueError(
+                f"Invalid role '{msg['role']}'. Must be 'system', 'user', or 'assistant'"
+            )
+
 
-def validate_batched_chat_messages(messages
+def validate_batched_chat_messages(messages):
    """
    Validate format of batched chat messages before API call
-
+
    Args:
        messages (List[List[Mapping]]): List of message lists, where each inner list contains
            message dictionaries with 'role' and 'content' fields
-
+
    Raises:
        TypeError: If messages format is invalid
        ValueError: If message content is invalid
    """
    if not isinstance(messages, list):
        raise TypeError("Batched messages must be a list")
-
+
    if not messages:
        raise ValueError("Batched messages cannot be empty")
-
+
    for message_list in messages:
        if not isinstance(message_list, list):
            raise TypeError("Each batch item must be a list of messages")
-
+
        # Validate individual messages using existing function
        validate_chat_messages(message_list)
 
-
-
-
-
-
+
+def is_batched_messages(
+    messages: Union[List[Mapping], List[List[Mapping]]],
+) -> TypeGuard[List[List[Mapping]]]:
+    return isinstance(messages, list) and all(isinstance(msg, list) for msg in messages)
+
+
+def is_simple_messages(
+    messages: Union[List[Mapping], List[List[Mapping]]],
+) -> TypeGuard[List[Mapping]]:
+    return isinstance(messages, list) and all(
+        not isinstance(msg, list) for msg in messages
+    )
+
+
+def get_chat_completion(
+    model_type: str,
+    messages: Union[List[Mapping], List[List[Mapping]]],
+    response_format: pydantic.BaseModel = None,
+    batched: bool = False,
+) -> Union[str, List[str | None]]:
    """
    Generates chat completions using a single model and potentially several messages. Supports closed-source and OSS models.
 
    Parameters:
    - model_type (str): The type of model to use for generating completions.
-    - messages (Union[List[Mapping], List[List[Mapping]]]): The messages to be used for generating completions.
+    - messages (Union[List[Mapping], List[List[Mapping]]]): The messages to be used for generating completions.
        If batched is True, this should be a list of lists of mappings.
    - response_format (pydantic.BaseModel, optional): The format of the response. Defaults to None.
    - batched (bool, optional): Whether to process messages in batch mode. Defaults to False.
@@ -516,50 +608,71 @@ def get_chat_completion(model_type: str,
    Raises:
    - ValueError: If requested model is not supported by Litellm or TogetherAI.
    """
-
+
    # Check for empty messages list
    if not messages or messages == []:
        raise ValueError("Messages cannot be empty")
-
+
    # Add validation
    if batched:
        validate_batched_chat_messages(messages)
    else:
        validate_chat_messages(messages)
-
-    if
-
-
-
-
-        return
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+    if (
+        batched
+        and is_batched_messages(messages)
+        and model_type in TOGETHER_SUPPORTED_MODELS
+    ):
+        return query_together_api_multiple_calls(
+            models=[model_type] * len(messages),
+            messages=messages,
+            response_formats=[response_format] * len(messages),
+        )
+    elif (
+        batched
+        and is_batched_messages(messages)
+        and model_type in LITELLM_SUPPORTED_MODELS
+    ):
+        return query_litellm_api_multiple_calls(
+            models=[model_type] * len(messages),
+            messages=messages,
+            response_formats=[response_format] * len(messages),
+        )
+    elif (
+        not batched
+        and is_simple_messages(messages)
+        and model_type in TOGETHER_SUPPORTED_MODELS
+    ):
+        return fetch_together_api_response(
+            model=model_type, messages=messages, response_format=response_format
+        )
+    elif (
+        not batched
+        and is_simple_messages(messages)
+        and model_type in LITELLM_SUPPORTED_MODELS
+    ):
+        return fetch_litellm_api_response(
+            model=model_type, messages=messages, response_format=response_format
+        )
+
+    raise ValueError(
+        f"Model {model_type} is not supported by Litellm or TogetherAI for chat completions. Please check the model name and try again."
+    )
+
+
+async def aget_chat_completion(
+    model_type: str,
+    messages: Union[List[Mapping], List[List[Mapping]]],
+    response_format: pydantic.BaseModel = None,
+    batched: bool = False,
+) -> Union[str, List[str | None]]:
    """
    ASYNCHRONOUSLY generates chat completions using a single model and potentially several messages. Supports closed-source and OSS models.
 
    Parameters:
    - model_type (str): The type of model to use for generating completions.
-    - messages (Union[List[Mapping], List[List[Mapping]]]): The messages to be used for generating completions.
+    - messages (Union[List[Mapping], List[List[Mapping]]]): The messages to be used for generating completions.
        If batched is True, this should be a list of lists of mappings.
    - response_format (pydantic.BaseModel, optional): The format of the response. Defaults to None.
    - batched (bool, optional): Whether to process messages in batch mode. Defaults to False.
@@ -569,38 +682,64 @@ async def aget_chat_completion(model_type: str,
    - ValueError: If requested model is not supported by Litellm or TogetherAI.
    """
    debug(f"Starting chat completion for model {model_type}, batched={batched}")
-
+
    if batched:
        validate_batched_chat_messages(messages)
    else:
        validate_chat_messages(messages)
-
-    if
+
+    if (
+        batched
+        and is_batched_messages(messages)
+        and model_type in TOGETHER_SUPPORTED_MODELS
+    ):
        debug("Using batched Together API call")
-        return await aquery_together_api_multiple_calls(
-
-
-
+        return await aquery_together_api_multiple_calls(
+            models=[model_type] * len(messages),
+            messages=messages,
+            response_formats=[response_format] * len(messages),
+        )
+    elif (
+        batched
+        and is_batched_messages(messages)
+        and model_type in LITELLM_SUPPORTED_MODELS
+    ):
        debug("Using batched LiteLLM API call")
-        return await aquery_litellm_api_multiple_calls(
-
-
-
+        return await aquery_litellm_api_multiple_calls(
+            models=[model_type] * len(messages),
+            messages=messages,
+            response_formats=[response_format] * len(messages),
+        )
+    elif (
+        not batched
+        and is_simple_messages(messages)
+        and model_type in TOGETHER_SUPPORTED_MODELS
+    ):
        debug("Using single Together API call")
-        return await afetch_together_api_response(
-
-
-    elif
+        return await afetch_together_api_response(
+            model=model_type, messages=messages, response_format=response_format
+        )
+    elif (
+        not batched
+        and is_simple_messages(messages)
+        and model_type in LITELLM_SUPPORTED_MODELS
+    ):
        debug("Using single LiteLLM API call")
-        return await afetch_litellm_api_response(
-
-
-
+        return await afetch_litellm_api_response(
+            model=model_type, messages=messages, response_format=response_format
+        )
+
    error(f"Model {model_type} not supported by either API")
-    raise ValueError(
+    raise ValueError(
+        f"Model {model_type} is not supported by Litellm or TogetherAI for chat completions. Please check the model name and try again."
+    )
 
 
-def get_completion_multiple_models(
+def get_completion_multiple_models(
+    models: List[str],
+    messages: List[List[Mapping]],
+    response_formats: List[pydantic.BaseModel] | None = None,
+) -> List[str | None]:
    """
    Retrieves completions for a single prompt from multiple models in parallel. Supports closed-source and OSS models.
 
@@ -608,28 +747,32 @@ def get_completion_multiple_models(models: List[str], messages: List[List[Mappin
        models (List[str]): List of models to query
        messages (List[List[Mapping]]): List of messages to query. Each inner object corresponds to a single prompt.
        response_formats (List[pydantic.BaseModel], optional): A list of the format of the response if JSON forcing. Defaults to None.
-
+
    Returns:
        List[str]: List of completions from the models in the order of the input models
    Raises:
        ValueError: If a model is not supported by Litellm or Together
    """
    debug(f"Starting multiple model completion for {len(models)} models")
-
+
    if models is None or models == []:
        raise ValueError("Models list cannot be empty")
-
+
    validate_batched_chat_messages(messages)
-
+
    if len(models) != len(messages):
        error(f"Model/message count mismatch: {len(models)} vs {len(messages)}")
-        raise ValueError(
+        raise ValueError(
+            f"Number of models and messages must be the same: {len(models)} != {len(messages)}"
+        )
    if response_formats is None:
        response_formats = [None] * len(models)
    # Partition the model requests into TogetherAI and Litellm models, but keep the ordering saved
    together_calls, litellm_calls = {}, {}  # index -> model, message, response_format
    together_responses, litellm_responses = [], []
-    for idx, (model, message, r_format) in enumerate(
+    for idx, (model, message, r_format) in enumerate(
+        zip(models, messages, response_formats)
+    ):
        if model in TOGETHER_SUPPORTED_MODELS:
            debug(f"Model {model} routed to Together API")
            together_calls[idx] = (model, message, r_format)
@@ -638,39 +781,49 @@ def get_completion_multiple_models(models: List[str], messages: List[List[Mappin
            litellm_calls[idx] = (model, message, r_format)
        else:
            error(f"Model {model} not supported by either API")
-            raise ValueError(
-
+            raise ValueError(
+                f"Model {model} is not supported by Litellm or TogetherAI for chat completions. Please check the model name and try again."
+            )
+
    # Add validation before processing
    for msg_list in messages:
        validate_chat_messages(msg_list)
-
+
    # Get the responses from the TogetherAI models
    # List of responses from the TogetherAI models in order of the together_calls dict
    if together_calls:
        debug(f"Executing {len(together_calls)} Together API calls")
-        together_responses = query_together_api_multiple_calls(
-
-
-
+        together_responses = query_together_api_multiple_calls(
+            models=[model for model, _, _ in together_calls.values()],
+            messages=[message for _, message, _ in together_calls.values()],
+            response_formats=[format for _, _, format in together_calls.values()],
+        )
+
    # Get the responses from the Litellm models
    if litellm_calls:
        debug(f"Executing {len(litellm_calls)} LiteLLM API calls")
-        litellm_responses = query_litellm_api_multiple_calls(
-
-
+        litellm_responses = query_litellm_api_multiple_calls(
+            models=[model for model, _, _ in litellm_calls.values()],
+            messages=[message for _, message, _ in litellm_calls.values()],
+            response_formats=[format for _, _, format in litellm_calls.values()],
+        )
 
    # Merge the responses in the order of the original models
    debug("Merging responses")
-    out = [None] * len(models)
+    out: List[Union[str, None]] = [None] * len(models)
    for idx, (model, message, r_format) in together_calls.items():
        out[idx] = together_responses.pop(0)
    for idx, (model, message, r_format) in litellm_calls.items():
        out[idx] = litellm_responses.pop(0)
    debug("Multiple model completion finished")
-    return out
+    return out
 
 
-async def aget_completion_multiple_models(
+async def aget_completion_multiple_models(
+    models: List[str],
+    messages: List[List[Mapping]],
+    response_formats: List[pydantic.BaseModel] | None = None,
+) -> List[str | None]:
    """
    ASYNCHRONOUSLY retrieves completions for a single prompt from multiple models in parallel. Supports closed-source and OSS models.
 
@@ -678,7 +831,7 @@ async def aget_completion_multiple_models(models: List[str], messages: List[List
        models (List[str]): List of models to query
        messages (List[List[Mapping]]): List of messages to query. Each inner object corresponds to a single prompt.
        response_formats (List[pydantic.BaseModel], optional): A list of the format of the response if JSON forcing. Defaults to None.
-
+
    Returns:
        List[str]: List of completions from the models in the order of the input models
    Raises:
@@ -686,48 +839,54 @@ async def aget_completion_multiple_models(models: List[str], messages: List[List
    """
    if models is None or models == []:
        raise ValueError("Models list cannot be empty")
-
+
    if len(models) != len(messages):
-        raise ValueError(
+        raise ValueError(
+            f"Number of models and messages must be the same: {len(models)} != {len(messages)}"
+        )
    if response_formats is None:
        response_formats = [None] * len(models)
 
    validate_batched_chat_messages(messages)
-
+
    # Partition the model requests into TogetherAI and Litellm models, but keep the ordering saved
    together_calls, litellm_calls = {}, {}  # index -> model, message, response_format
    together_responses, litellm_responses = [], []
-    for idx, (model, message, r_format) in enumerate(
+    for idx, (model, message, r_format) in enumerate(
+        zip(models, messages, response_formats)
+    ):
        if model in TOGETHER_SUPPORTED_MODELS:
            together_calls[idx] = (model, message, r_format)
        elif model in LITELLM_SUPPORTED_MODELS:
            litellm_calls[idx] = (model, message, r_format)
        else:
-            raise ValueError(
-
+            raise ValueError(
+                f"Model {model} is not supported by Litellm or TogetherAI for chat completions. Please check the model name and try again."
+            )
+
    # Add validation before processing
    for msg_list in messages:
        validate_chat_messages(msg_list)
-
+
    # Get the responses from the TogetherAI models
    # List of responses from the TogetherAI models in order of the together_calls dict
    if together_calls:
        together_responses = await aquery_together_api_multiple_calls(
-            models=[model for model, _, _ in together_calls.values()],
-            messages=[message for _, message, _ in together_calls.values()],
-            response_formats=[format for _, _, format in together_calls.values()]
+            models=[model for model, _, _ in together_calls.values()],
+            messages=[message for _, message, _ in together_calls.values()],
+            response_formats=[format for _, _, format in together_calls.values()],
        )
-
+
    # Get the responses from the Litellm models
    if litellm_calls:
        litellm_responses = await aquery_litellm_api_multiple_calls(
            models=[model for model, _, _ in litellm_calls.values()],
            messages=[message for _, message, _ in litellm_calls.values()],
-            response_formats=[format for _, _, format in litellm_calls.values()]
+            response_formats=[format for _, _, format in litellm_calls.values()],
        )
 
    # Merge the responses in the order of the original models
-    out = [None] * len(models)
+    out: List[Union[str, None]] = [None] * len(models)
    for idx, (model, message, r_format) in together_calls.items():
        out[idx] = together_responses.pop(0)
    for idx, (model, message, r_format) in litellm_calls.items():
@@ -736,53 +895,66 @@ async def aget_completion_multiple_models(models: List[str], messages: List[List
 
 
 if __name__ == "__main__":
-
-
-
-
-    messages=[
-        [
-            {"role": "system", "content": "You are a helpful assistant."},
-            {"role": "user", "content": "What is the capital of France?"},
-        ],
-        [
-            {"role": "system", "content": "You are a helpful assistant."},
-            {"role": "user", "content": "What is the capital of Japan?"},
-        ]
+    batched_messages: List[List[Mapping]] = [
+        [
+            {"role": "system", "content": "You are a helpful assistant."},
+            {"role": "user", "content": "What is the capital of France?"},
        ],
-
-
+        [
+            {"role": "system", "content": "You are a helpful assistant."},
+            {"role": "user", "content": "What is the capital of Japan?"},
+        ],
+    ]
 
-
-
-
-
+    non_batched_messages: List[Mapping] = [
+        {"role": "system", "content": "You are a helpful assistant."},
+        {"role": "user", "content": "What is the capital of France?"},
+    ]
+
+    batched_messages_2: List[List[Mapping]] = [
+        [
+            {"role": "system", "content": "You are a helpful assistant."},
+            {"role": "user", "content": "What is the capital of China?"},
+        ],
+        [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "What is the capital of France?"},
        ],
-
-
+        [
+            {"role": "system", "content": "You are a helpful assistant."},
+            {"role": "user", "content": "What is the capital of Japan?"},
+        ],
+    ]
+
+    # Batched
+    pprint.pprint(
+        get_chat_completion(
+            model_type="LLAMA3_405B_INSTRUCT_TURBO",
+            messages=batched_messages,
+            batched=True,
+        )
+    )
+
+    # Non batched
+    pprint.pprint(
+        get_chat_completion(
+            model_type="LLAMA3_8B_INSTRUCT_TURBO",
+            messages=non_batched_messages,
+            batched=False,
+        )
+    )
 
    # Batched single completion to multiple models
-    pprint.pprint(
-
-
-
-
-
-            {"role": "system", "content": "You are a helpful assistant."},
-            {"role": "user", "content": "What is the capital of China?"},
+    pprint.pprint(
+        get_completion_multiple_models(
+            models=[
+                "LLAMA3_70B_INSTRUCT_TURBO",
+                "LLAMA3_405B_INSTRUCT_TURBO",
+                "gpt-4.1-mini",
            ],
-
-
-
-
-            [
-                {"role": "system", "content": "You are a helpful assistant."},
-                {"role": "user", "content": "What is the capital of Japan?"},
-            ]
-        ]
-    ))
-
+            messages=batched_messages_2,
+        )
+    )
+
 
 ExcInfo: TypeAlias = tuple[type[BaseException], BaseException, TracebackType]
 OptExcInfo: TypeAlias = ExcInfo | tuple[None, None, None]
```