judgeval 0.1.0__py3-none-any.whl → 0.23.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (234)
  1. judgeval/__init__.py +173 -10
  2. judgeval/api/__init__.py +523 -0
  3. judgeval/api/api_types.py +413 -0
  4. judgeval/cli.py +112 -0
  5. judgeval/constants.py +7 -30
  6. judgeval/data/__init__.py +1 -3
  7. judgeval/data/evaluation_run.py +125 -0
  8. judgeval/data/example.py +14 -40
  9. judgeval/data/judgment_types.py +396 -146
  10. judgeval/data/result.py +11 -18
  11. judgeval/data/scorer_data.py +3 -26
  12. judgeval/data/scripts/openapi_transform.py +5 -5
  13. judgeval/data/trace.py +115 -194
  14. judgeval/dataset/__init__.py +335 -0
  15. judgeval/env.py +55 -0
  16. judgeval/evaluation/__init__.py +346 -0
  17. judgeval/exceptions.py +28 -0
  18. judgeval/integrations/langgraph/__init__.py +13 -0
  19. judgeval/integrations/openlit/__init__.py +51 -0
  20. judgeval/judges/__init__.py +2 -2
  21. judgeval/judges/litellm_judge.py +77 -16
  22. judgeval/judges/together_judge.py +88 -17
  23. judgeval/judges/utils.py +7 -20
  24. judgeval/judgment_attribute_keys.py +55 -0
  25. judgeval/{common/logger.py → logger.py} +24 -8
  26. judgeval/prompt/__init__.py +330 -0
  27. judgeval/scorers/__init__.py +11 -11
  28. judgeval/scorers/agent_scorer.py +15 -19
  29. judgeval/scorers/api_scorer.py +21 -23
  30. judgeval/scorers/base_scorer.py +54 -36
  31. judgeval/scorers/example_scorer.py +1 -3
  32. judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +2 -24
  33. judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +2 -10
  34. judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +2 -2
  35. judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +2 -10
  36. judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +2 -14
  37. judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +171 -59
  38. judgeval/scorers/score.py +64 -47
  39. judgeval/scorers/utils.py +2 -107
  40. judgeval/tracer/__init__.py +1111 -2
  41. judgeval/tracer/constants.py +1 -0
  42. judgeval/tracer/exporters/__init__.py +40 -0
  43. judgeval/tracer/exporters/s3.py +119 -0
  44. judgeval/tracer/exporters/store.py +59 -0
  45. judgeval/tracer/exporters/utils.py +32 -0
  46. judgeval/tracer/keys.py +63 -0
  47. judgeval/tracer/llm/__init__.py +7 -0
  48. judgeval/tracer/llm/config.py +78 -0
  49. judgeval/tracer/llm/constants.py +9 -0
  50. judgeval/tracer/llm/llm_anthropic/__init__.py +3 -0
  51. judgeval/tracer/llm/llm_anthropic/config.py +6 -0
  52. judgeval/tracer/llm/llm_anthropic/messages.py +452 -0
  53. judgeval/tracer/llm/llm_anthropic/messages_stream.py +322 -0
  54. judgeval/tracer/llm/llm_anthropic/wrapper.py +59 -0
  55. judgeval/tracer/llm/llm_google/__init__.py +3 -0
  56. judgeval/tracer/llm/llm_google/config.py +6 -0
  57. judgeval/tracer/llm/llm_google/generate_content.py +127 -0
  58. judgeval/tracer/llm/llm_google/wrapper.py +30 -0
  59. judgeval/tracer/llm/llm_openai/__init__.py +3 -0
  60. judgeval/tracer/llm/llm_openai/beta_chat_completions.py +216 -0
  61. judgeval/tracer/llm/llm_openai/chat_completions.py +501 -0
  62. judgeval/tracer/llm/llm_openai/config.py +6 -0
  63. judgeval/tracer/llm/llm_openai/responses.py +506 -0
  64. judgeval/tracer/llm/llm_openai/utils.py +42 -0
  65. judgeval/tracer/llm/llm_openai/wrapper.py +63 -0
  66. judgeval/tracer/llm/llm_together/__init__.py +3 -0
  67. judgeval/tracer/llm/llm_together/chat_completions.py +406 -0
  68. judgeval/tracer/llm/llm_together/config.py +6 -0
  69. judgeval/tracer/llm/llm_together/wrapper.py +52 -0
  70. judgeval/tracer/llm/providers.py +19 -0
  71. judgeval/tracer/managers.py +167 -0
  72. judgeval/tracer/processors/__init__.py +220 -0
  73. judgeval/tracer/utils.py +19 -0
  74. judgeval/trainer/__init__.py +14 -0
  75. judgeval/trainer/base_trainer.py +122 -0
  76. judgeval/trainer/config.py +123 -0
  77. judgeval/trainer/console.py +144 -0
  78. judgeval/trainer/fireworks_trainer.py +392 -0
  79. judgeval/trainer/trainable_model.py +252 -0
  80. judgeval/trainer/trainer.py +70 -0
  81. judgeval/utils/async_utils.py +39 -0
  82. judgeval/utils/decorators/__init__.py +0 -0
  83. judgeval/utils/decorators/dont_throw.py +37 -0
  84. judgeval/utils/decorators/use_once.py +13 -0
  85. judgeval/utils/file_utils.py +74 -28
  86. judgeval/utils/guards.py +36 -0
  87. judgeval/utils/meta.py +27 -0
  88. judgeval/utils/project.py +15 -0
  89. judgeval/utils/serialize.py +253 -0
  90. judgeval/utils/testing.py +70 -0
  91. judgeval/utils/url.py +10 -0
  92. judgeval/{version_check.py → utils/version_check.py} +5 -3
  93. judgeval/utils/wrappers/README.md +3 -0
  94. judgeval/utils/wrappers/__init__.py +15 -0
  95. judgeval/utils/wrappers/immutable_wrap_async.py +74 -0
  96. judgeval/utils/wrappers/immutable_wrap_async_iterator.py +84 -0
  97. judgeval/utils/wrappers/immutable_wrap_sync.py +66 -0
  98. judgeval/utils/wrappers/immutable_wrap_sync_iterator.py +84 -0
  99. judgeval/utils/wrappers/mutable_wrap_async.py +67 -0
  100. judgeval/utils/wrappers/mutable_wrap_sync.py +67 -0
  101. judgeval/utils/wrappers/py.typed +0 -0
  102. judgeval/utils/wrappers/utils.py +35 -0
  103. judgeval/v1/__init__.py +88 -0
  104. judgeval/v1/data/__init__.py +7 -0
  105. judgeval/v1/data/example.py +44 -0
  106. judgeval/v1/data/scorer_data.py +42 -0
  107. judgeval/v1/data/scoring_result.py +44 -0
  108. judgeval/v1/datasets/__init__.py +6 -0
  109. judgeval/v1/datasets/dataset.py +214 -0
  110. judgeval/v1/datasets/dataset_factory.py +94 -0
  111. judgeval/v1/evaluation/__init__.py +6 -0
  112. judgeval/v1/evaluation/evaluation.py +182 -0
  113. judgeval/v1/evaluation/evaluation_factory.py +17 -0
  114. judgeval/v1/instrumentation/__init__.py +6 -0
  115. judgeval/v1/instrumentation/llm/__init__.py +7 -0
  116. judgeval/v1/instrumentation/llm/config.py +78 -0
  117. judgeval/v1/instrumentation/llm/constants.py +11 -0
  118. judgeval/v1/instrumentation/llm/llm_anthropic/__init__.py +5 -0
  119. judgeval/v1/instrumentation/llm/llm_anthropic/config.py +6 -0
  120. judgeval/v1/instrumentation/llm/llm_anthropic/messages.py +414 -0
  121. judgeval/v1/instrumentation/llm/llm_anthropic/messages_stream.py +307 -0
  122. judgeval/v1/instrumentation/llm/llm_anthropic/wrapper.py +61 -0
  123. judgeval/v1/instrumentation/llm/llm_google/__init__.py +5 -0
  124. judgeval/v1/instrumentation/llm/llm_google/config.py +6 -0
  125. judgeval/v1/instrumentation/llm/llm_google/generate_content.py +121 -0
  126. judgeval/v1/instrumentation/llm/llm_google/wrapper.py +30 -0
  127. judgeval/v1/instrumentation/llm/llm_openai/__init__.py +5 -0
  128. judgeval/v1/instrumentation/llm/llm_openai/beta_chat_completions.py +212 -0
  129. judgeval/v1/instrumentation/llm/llm_openai/chat_completions.py +477 -0
  130. judgeval/v1/instrumentation/llm/llm_openai/config.py +6 -0
  131. judgeval/v1/instrumentation/llm/llm_openai/responses.py +472 -0
  132. judgeval/v1/instrumentation/llm/llm_openai/utils.py +41 -0
  133. judgeval/v1/instrumentation/llm/llm_openai/wrapper.py +63 -0
  134. judgeval/v1/instrumentation/llm/llm_together/__init__.py +5 -0
  135. judgeval/v1/instrumentation/llm/llm_together/chat_completions.py +382 -0
  136. judgeval/v1/instrumentation/llm/llm_together/config.py +6 -0
  137. judgeval/v1/instrumentation/llm/llm_together/wrapper.py +57 -0
  138. judgeval/v1/instrumentation/llm/providers.py +19 -0
  139. judgeval/v1/integrations/claude_agent_sdk/__init__.py +119 -0
  140. judgeval/v1/integrations/claude_agent_sdk/wrapper.py +564 -0
  141. judgeval/v1/integrations/langgraph/__init__.py +13 -0
  142. judgeval/v1/integrations/openlit/__init__.py +47 -0
  143. judgeval/v1/internal/api/__init__.py +525 -0
  144. judgeval/v1/internal/api/api_types.py +413 -0
  145. judgeval/v1/prompts/__init__.py +6 -0
  146. judgeval/v1/prompts/prompt.py +29 -0
  147. judgeval/v1/prompts/prompt_factory.py +189 -0
  148. judgeval/v1/py.typed +0 -0
  149. judgeval/v1/scorers/__init__.py +6 -0
  150. judgeval/v1/scorers/api_scorer.py +82 -0
  151. judgeval/v1/scorers/base_scorer.py +17 -0
  152. judgeval/v1/scorers/built_in/__init__.py +17 -0
  153. judgeval/v1/scorers/built_in/answer_correctness.py +28 -0
  154. judgeval/v1/scorers/built_in/answer_relevancy.py +28 -0
  155. judgeval/v1/scorers/built_in/built_in_factory.py +26 -0
  156. judgeval/v1/scorers/built_in/faithfulness.py +28 -0
  157. judgeval/v1/scorers/built_in/instruction_adherence.py +28 -0
  158. judgeval/v1/scorers/custom_scorer/__init__.py +6 -0
  159. judgeval/v1/scorers/custom_scorer/custom_scorer.py +50 -0
  160. judgeval/v1/scorers/custom_scorer/custom_scorer_factory.py +16 -0
  161. judgeval/v1/scorers/prompt_scorer/__init__.py +6 -0
  162. judgeval/v1/scorers/prompt_scorer/prompt_scorer.py +86 -0
  163. judgeval/v1/scorers/prompt_scorer/prompt_scorer_factory.py +85 -0
  164. judgeval/v1/scorers/scorers_factory.py +49 -0
  165. judgeval/v1/tracer/__init__.py +7 -0
  166. judgeval/v1/tracer/base_tracer.py +520 -0
  167. judgeval/v1/tracer/exporters/__init__.py +14 -0
  168. judgeval/v1/tracer/exporters/in_memory_span_exporter.py +25 -0
  169. judgeval/v1/tracer/exporters/judgment_span_exporter.py +42 -0
  170. judgeval/v1/tracer/exporters/noop_span_exporter.py +19 -0
  171. judgeval/v1/tracer/exporters/span_store.py +50 -0
  172. judgeval/v1/tracer/judgment_tracer_provider.py +70 -0
  173. judgeval/v1/tracer/processors/__init__.py +6 -0
  174. judgeval/v1/tracer/processors/_lifecycles/__init__.py +28 -0
  175. judgeval/v1/tracer/processors/_lifecycles/agent_id_processor.py +53 -0
  176. judgeval/v1/tracer/processors/_lifecycles/context_keys.py +11 -0
  177. judgeval/v1/tracer/processors/_lifecycles/customer_id_processor.py +29 -0
  178. judgeval/v1/tracer/processors/_lifecycles/registry.py +18 -0
  179. judgeval/v1/tracer/processors/judgment_span_processor.py +165 -0
  180. judgeval/v1/tracer/processors/noop_span_processor.py +42 -0
  181. judgeval/v1/tracer/tracer.py +67 -0
  182. judgeval/v1/tracer/tracer_factory.py +38 -0
  183. judgeval/v1/trainers/__init__.py +5 -0
  184. judgeval/v1/trainers/base_trainer.py +62 -0
  185. judgeval/v1/trainers/config.py +123 -0
  186. judgeval/v1/trainers/console.py +144 -0
  187. judgeval/v1/trainers/fireworks_trainer.py +392 -0
  188. judgeval/v1/trainers/trainable_model.py +252 -0
  189. judgeval/v1/trainers/trainers_factory.py +37 -0
  190. judgeval/v1/utils.py +18 -0
  191. judgeval/version.py +5 -0
  192. judgeval/warnings.py +4 -0
  193. judgeval-0.23.0.dist-info/METADATA +266 -0
  194. judgeval-0.23.0.dist-info/RECORD +201 -0
  195. judgeval-0.23.0.dist-info/entry_points.txt +2 -0
  196. judgeval/clients.py +0 -34
  197. judgeval/common/__init__.py +0 -13
  198. judgeval/common/api/__init__.py +0 -3
  199. judgeval/common/api/api.py +0 -352
  200. judgeval/common/api/constants.py +0 -165
  201. judgeval/common/exceptions.py +0 -27
  202. judgeval/common/storage/__init__.py +0 -6
  203. judgeval/common/storage/s3_storage.py +0 -98
  204. judgeval/common/tracer/__init__.py +0 -31
  205. judgeval/common/tracer/constants.py +0 -22
  206. judgeval/common/tracer/core.py +0 -1916
  207. judgeval/common/tracer/otel_exporter.py +0 -108
  208. judgeval/common/tracer/otel_span_processor.py +0 -234
  209. judgeval/common/tracer/span_processor.py +0 -37
  210. judgeval/common/tracer/span_transformer.py +0 -211
  211. judgeval/common/tracer/trace_manager.py +0 -92
  212. judgeval/common/utils.py +0 -940
  213. judgeval/data/datasets/__init__.py +0 -4
  214. judgeval/data/datasets/dataset.py +0 -341
  215. judgeval/data/datasets/eval_dataset_client.py +0 -214
  216. judgeval/data/tool.py +0 -5
  217. judgeval/data/trace_run.py +0 -37
  218. judgeval/evaluation_run.py +0 -75
  219. judgeval/integrations/langgraph.py +0 -843
  220. judgeval/judges/mixture_of_judges.py +0 -286
  221. judgeval/judgment_client.py +0 -369
  222. judgeval/rules.py +0 -521
  223. judgeval/run_evaluation.py +0 -684
  224. judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py +0 -14
  225. judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +0 -52
  226. judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -28
  227. judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +0 -20
  228. judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +0 -27
  229. judgeval/utils/alerts.py +0 -93
  230. judgeval/utils/requests.py +0 -50
  231. judgeval-0.1.0.dist-info/METADATA +0 -202
  232. judgeval-0.1.0.dist-info/RECORD +0 -73
  233. {judgeval-0.1.0.dist-info → judgeval-0.23.0.dist-info}/WHEEL +0 -0
  234. {judgeval-0.1.0.dist-info → judgeval-0.23.0.dist-info}/licenses/LICENSE.md +0 -0
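Note on the renamed modules in the list above (`judgeval/{common/logger.py → logger.py}`, `judgeval/{version_check.py → utils/version_check.py}`): for downstream code these are import-path changes. A minimal migration sketch, assuming the logger's exported name `judgeval_logger` (visible in the deleted 0.1.0 sources below) is unchanged in 0.23.0; verify against the 0.23.0 sources before relying on it:

# Hypothetical migration for the logger module rename listed above.
# The 0.1.0 path and the `judgeval_logger` name are confirmed by the
# deleted utils.py below; the 0.23.0 path is inferred from the rename entry.

# 0.1.0:
# from judgeval.common.logger import judgeval_logger

# 0.23.0 (assumed):
from judgeval.logger import judgeval_logger

judgeval_logger.info("logger import migrated across the rename")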
judgeval/common/utils.py DELETED
@@ -1,940 +0,0 @@
- """
- This file contains utility functions used in repo scripts
-
- For API calling, we support:
- - parallelized model calls on the same prompt
- - batched model calls on different prompts
-
- NOTE: any function beginning with 'a', e.g. 'afetch_together_api_response', is an asynchronous function
- """
-
- # Standard library imports
- import asyncio
- import concurrent.futures
- import os
- from types import TracebackType
- from judgeval.common.api.constants import ROOT_API
- from judgeval.utils.requests import requests
- import pprint
- from typing import Any, Dict, List, Mapping, Optional, TypeAlias, Union, TypeGuard
-
- # Third-party imports
- import litellm
- import pydantic
- from dotenv import load_dotenv
-
- # Local application/library-specific imports
- from judgeval.clients import async_together_client, together_client
- from judgeval.constants import (
-     ACCEPTABLE_MODELS,
-     MAX_WORKER_THREADS,
-     TOGETHER_SUPPORTED_MODELS,
-     LITELLM_SUPPORTED_MODELS,
- )
- from judgeval.common.logger import judgeval_logger
-
-
- class CustomModelParameters(pydantic.BaseModel):
-     model_name: str
-     secret_key: str
-     litellm_base_url: str
-
-     @pydantic.field_validator("model_name")
-     @classmethod
-     def validate_model_name(cls, v):
-         if not v:
-             raise ValueError("Model name cannot be empty")
-         return v
-
-     @pydantic.field_validator("secret_key")
-     @classmethod
-     def validate_secret_key(cls, v):
-         if not v:
-             raise ValueError("Secret key cannot be empty")
-         return v
-
-     @pydantic.field_validator("litellm_base_url")
-     @classmethod
-     def validate_litellm_base_url(cls, v):
-         if not v:
-             raise ValueError("Litellm base URL cannot be empty")
-         return v
-
-
- class ChatCompletionRequest(pydantic.BaseModel):
-     model: str
-     messages: List[Dict[str, str]]
-     response_format: Optional[Union[pydantic.BaseModel, Dict[str, Any]]] = None
-
-     @pydantic.field_validator("messages")
-     @classmethod
-     def validate_messages(cls, messages):
-         if not messages:
-             raise ValueError("Messages cannot be empty")
-
-         for msg in messages:
-             if not isinstance(msg, dict):
-                 raise TypeError("Message must be a dictionary")
-             if "role" not in msg:
-                 raise ValueError("Message missing required 'role' field")
-             if "content" not in msg:
-                 raise ValueError("Message missing required 'content' field")
-             if msg["role"] not in ["system", "user", "assistant"]:
-                 raise ValueError(
-                     f"Invalid role '{msg['role']}'. Must be 'system', 'user', or 'assistant'"
-                 )
-
-         return messages
-
-     @pydantic.field_validator("model")
-     @classmethod
-     def validate_model(cls, model):
-         if not model:
-             raise ValueError("Model cannot be empty")
-         if model not in ACCEPTABLE_MODELS:
-             raise ValueError(f"Model {model} is not in the list of supported models.")
-         return model
-
-     @pydantic.field_validator("response_format", mode="before")
-     @classmethod
-     def validate_response_format(cls, response_format):
-         if response_format is not None:
-             if not isinstance(response_format, (dict, pydantic.BaseModel)):
-                 raise TypeError(
-                     "Response format must be a dictionary or pydantic model"
-                 )
-             # Optional: Add additional validation for required fields if needed
-             # For example, checking for 'type': 'json' in OpenAI's format
-         return response_format
-
-
- os.environ["LITELLM_LOG"] = "DEBUG"
-
- load_dotenv()
-
-
- def read_file(file_path: str) -> str:
-     with open(file_path, "r", encoding="utf-8") as file:
-         return file.read()
-
-
- def validate_api_key(judgment_api_key: str):
-     """
-     Validates that the user api key is valid
-     """
-     response = requests.post(
-         f"{ROOT_API}/auth/validate_api_key/",
-         headers={
-             "Content-Type": "application/json",
-             "Authorization": f"Bearer {judgment_api_key}",
-         },
-         json={},
-         verify=True,
-     )
-     if response.status_code == 200:
-         return True, response.json()
-     else:
-         return False, response.json().get("detail", "Error validating API key")
-
-
- def fetch_together_api_response(
-     model: str, messages: List[Mapping], response_format: pydantic.BaseModel = None
- ) -> str:
-     """
-     Fetches a single response from the Together API for a given model and messages.
-     """
-     # Validate request
-     if messages is None or messages == []:
-         raise ValueError("Messages cannot be empty")
-
-     request = ChatCompletionRequest(
-         model=model, messages=messages, response_format=response_format
-     )
-
-     if request.response_format is not None:
-         response = together_client.chat.completions.create(
-             model=request.model,
-             messages=request.messages,
-             response_format=request.response_format,
-         )
-     else:
-         response = together_client.chat.completions.create(
-             model=request.model,
-             messages=request.messages,
-         )
-
-     return response.choices[0].message.content
-
-
- async def afetch_together_api_response(
-     model: str, messages: List[Mapping], response_format: pydantic.BaseModel = None
- ) -> str:
-     """
-     ASYNCHRONOUSLY Fetches a single response from the Together API for a given model and messages.
-     """
-     request = ChatCompletionRequest(
-         model=model, messages=messages, response_format=response_format
-     )
-
-     if request.response_format is not None:
-         response = await async_together_client.chat.completions.create(
-             model=request.model,
-             messages=request.messages,
-             response_format=request.response_format,
-         )
-     else:
-         response = await async_together_client.chat.completions.create(
-             model=request.model,
-             messages=request.messages,
-         )
-     return response.choices[0].message.content
-
-
- def query_together_api_multiple_calls(
-     models: List[str],
-     messages: List[List[Mapping]],
-     response_formats: List[pydantic.BaseModel] | None = None,
- ) -> List[Union[str, None]]:
-     """
-     Queries the Together API for multiple calls in parallel
-
-     Args:
-         models (List[str]): List of models to query
-         messages (List[List[Mapping]]): List of messages to query. Each inner object corresponds to a single prompt.
-         response_formats (List[pydantic.BaseModel], optional): A list of the format of the response if JSON forcing. Defaults to None.
-
-     Returns:
-         List[str]: TogetherAI responses for each model and message pair in order. Any exceptions in the thread call result in a None.
-     """
-     # Check for empty models list
-     if not models:
-         raise ValueError("Models list cannot be empty")
-
-     # Validate all models are supported
-     for model in models:
-         if model not in ACCEPTABLE_MODELS:
-             raise ValueError(
-                 f"Model {model} is not in the list of supported models: {ACCEPTABLE_MODELS}."
-             )
-
-     # Validate input lengths match
-     if response_formats is None:
-         response_formats = [None] * len(models)
-     if not (len(models) == len(messages) == len(response_formats)):
-         raise ValueError(
-             "Number of models, messages, and response formats must be the same"
-         )
-
-     # Validate message format
-     validate_batched_chat_messages(messages)
-
-     num_workers = int(os.getenv("NUM_WORKER_THREADS", MAX_WORKER_THREADS))
-     # Initialize results to maintain ordered outputs
-     out: List[str | None] = [None] * len(messages)
-     with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:
-         # Submit all queries to together API with index, gets back the response content
-         futures = {
-             executor.submit(
-                 fetch_together_api_response, model, message, response_format
-             ): idx
-             for idx, (model, message, response_format) in enumerate(
-                 zip(models, messages, response_formats)
-             )
-         }
-
-         # Collect results as they complete -- result is response content
-         for future in concurrent.futures.as_completed(futures):
-             idx = futures[future]
-             try:
-                 out[idx] = future.result()
-             except Exception as e:
-                 judgeval_logger.error(f"Error in parallel call {idx}: {str(e)}")
-                 out[idx] = None
-     return out
-
-
- async def aquery_together_api_multiple_calls(
-     models: List[str],
-     messages: List[List[Mapping]],
-     response_formats: List[pydantic.BaseModel] | None = None,
- ) -> List[Union[str, None]]:
-     """
-     Queries the Together API for multiple calls in parallel
-
-     Args:
-         models (List[str]): List of models to query
-         messages (List[List[Mapping]]): List of messages to query. Each inner object corresponds to a single prompt.
-         response_formats (List[pydantic.BaseModel], optional): A list of the format of the response if JSON forcing. Defaults to None.
-
-     Returns:
-         List[str]: TogetherAI responses for each model and message pair in order. Any exceptions in the thread call result in a None.
-     """
-     # Check for empty models list
-     if not models:
-         raise ValueError("Models list cannot be empty")
-
-     # Validate all models are supported
-     for model in models:
-         if model not in ACCEPTABLE_MODELS:
-             raise ValueError(
-                 f"Model {model} is not in the list of supported models: {ACCEPTABLE_MODELS}."
-             )
-
-     # Validate input lengths match
-     if response_formats is None:
-         response_formats = [None] * len(models)
-     if not (len(models) == len(messages) == len(response_formats)):
-         raise ValueError(
-             "Number of models, messages, and response formats must be the same"
-         )
-
-     # Validate message format
-     validate_batched_chat_messages(messages)
-
-     out: List[Union[str, None]] = [None] * len(messages)
-
-     async def fetch_and_store(idx, model, message, response_format):
-         try:
-             out[idx] = await afetch_together_api_response(
-                 model, message, response_format
-             )
-         except Exception as e:
-             judgeval_logger.error(f"Error in parallel call {idx}: {str(e)}")
-             out[idx] = None
-
-     tasks = [
-         fetch_and_store(idx, model, message, response_format)
-         for idx, (model, message, response_format) in enumerate(
-             zip(models, messages, response_formats)
-         )
-     ]
-
-     await asyncio.gather(*tasks)
-     return out
-
-
- def fetch_litellm_api_response(
-     model: str, messages: List[Mapping], response_format: pydantic.BaseModel = None
- ) -> str:
-     """
-     Fetches a single response from the Litellm API for a given model and messages.
-     """
-     request = ChatCompletionRequest(
-         model=model, messages=messages, response_format=response_format
-     )
-
-     if request.response_format is not None:
-         response = litellm.completion(
-             model=request.model,
-             messages=request.messages,
-             response_format=request.response_format,
-         )
-     else:
-         response = litellm.completion(
-             model=request.model,
-             messages=request.messages,
-         )
-     return response.choices[0].message.content
-
-
- def fetch_custom_litellm_api_response(
-     custom_model_parameters: CustomModelParameters,
-     messages: List[Mapping],
-     response_format: pydantic.BaseModel = None,
- ) -> str:
-     if messages is None or messages == []:
-         raise ValueError("Messages cannot be empty")
-
-     if custom_model_parameters is None:
-         raise ValueError("Custom model parameters cannot be empty")
-
-     if not isinstance(custom_model_parameters, CustomModelParameters):
-         raise ValueError(
-             "Custom model parameters must be a CustomModelParameters object"
-         )
-
-     if response_format is not None:
-         response = litellm.completion(
-             model=custom_model_parameters.model_name,
-             messages=messages,
-             api_key=custom_model_parameters.secret_key,
-             base_url=custom_model_parameters.litellm_base_url,
-             response_format=response_format,
-         )
-     else:
-         response = litellm.completion(
-             model=custom_model_parameters.model_name,
-             messages=messages,
-             api_key=custom_model_parameters.secret_key,
-             base_url=custom_model_parameters.litellm_base_url,
-         )
-     return response.choices[0].message.content
-
-
- async def afetch_litellm_api_response(
-     model: str, messages: List[Mapping], response_format: pydantic.BaseModel = None
- ) -> str:
-     """
-     ASYNCHRONOUSLY Fetches a single response from the Litellm API for a given model and messages.
-     """
-     if messages is None or messages == []:
-         raise ValueError("Messages cannot be empty")
-
-     # Add validation
-     validate_chat_messages(messages)
-
-     if model not in ACCEPTABLE_MODELS:
-         raise ValueError(
-             f"Model {model} is not in the list of supported models: {ACCEPTABLE_MODELS}."
-         )
-
-     if response_format is not None:
-         response = await litellm.acompletion(
-             model=model, messages=messages, response_format=response_format
-         )
-     else:
-         response = await litellm.acompletion(
-             model=model,
-             messages=messages,
-         )
-     return response.choices[0].message.content
-
-
- async def afetch_custom_litellm_api_response(
-     custom_model_parameters: CustomModelParameters,
-     messages: List[Mapping],
-     response_format: pydantic.BaseModel = None,
- ) -> str:
-     """
-     ASYNCHRONOUSLY Fetches a single response from the Litellm API for a given model and messages.
-     """
-     if messages is None or messages == []:
-         raise ValueError("Messages cannot be empty")
-
-     if custom_model_parameters is None:
-         raise ValueError("Custom model parameters cannot be empty")
-
-     if not isinstance(custom_model_parameters, CustomModelParameters):
-         raise ValueError(
-             "Custom model parameters must be a CustomModelParameters object"
-         )
-
-     if response_format is not None:
-         response = await litellm.acompletion(
-             model=custom_model_parameters.model_name,
-             messages=messages,
-             api_key=custom_model_parameters.secret_key,
-             base_url=custom_model_parameters.litellm_base_url,
-             response_format=response_format,
-         )
-     else:
-         response = await litellm.acompletion(
-             model=custom_model_parameters.model_name,
-             messages=messages,
-             api_key=custom_model_parameters.secret_key,
-             base_url=custom_model_parameters.litellm_base_url,
-         )
-     return response.choices[0].message.content
-
-
- def query_litellm_api_multiple_calls(
-     models: List[str],
-     messages: List[List[Mapping]],
-     response_formats: List[pydantic.BaseModel] | None = None,
- ) -> List[Union[str, None]]:
-     """
-     Queries the Litellm API for multiple calls in parallel
-
-     Args:
-         models (List[str]): List of models to query
-         messages (List[List[Mapping]]): List of messages to query
-         response_formats (List[pydantic.BaseModel], optional): A list of the format of the response if JSON forcing. Defaults to None.
-
-     Returns:
-         List[str]: Litellm responses for each model and message pair in order. Any exceptions in the thread call result in a None.
-     """
-     num_workers = int(os.getenv("NUM_WORKER_THREADS", MAX_WORKER_THREADS))
-     # Initialize results to maintain ordered outputs
-     out: List[Union[str, None]] = [None] * len(messages)
-     with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:
-         # Submit all queries to Litellm API with index, gets back the response content
-         futures = {
-             executor.submit(
-                 fetch_litellm_api_response, model, message, response_format
-             ): idx
-             for idx, (model, message, response_format) in enumerate(
-                 zip(models, messages, response_formats or [None] * len(messages))
-             )
-         }
-
-         # Collect results as they complete -- result is response content
-         for future in concurrent.futures.as_completed(futures):
-             idx = futures[future]
-             try:
-                 out[idx] = future.result()
-             except Exception as e:
-                 judgeval_logger.error(f"Error in parallel call {idx}: {str(e)}")
-                 out[idx] = None
-     return out
-
-
- async def aquery_litellm_api_multiple_calls(
-     models: List[str],
-     messages: List[List[Mapping]],
-     response_formats: List[pydantic.BaseModel] | None = None,
- ) -> List[Union[str, None]]:
-     """
-     Queries the Litellm API for multiple calls in parallel
-
-     Args:
-         models (List[str]): List of models to query
-         messages (List[List[Mapping]]): List of messages to query
-         response_formats (List[pydantic.BaseModel], optional): A list of the format of the response if JSON forcing. Defaults to None.
-
-     Returns:
-         List[str]: Litellm responses for each model and message pair in order. Any exceptions in the thread call result in a None.
-     """
-     # Initialize results to maintain ordered outputs
-     out: List[Union[str, None]] = [None] * len(messages)
-
-     async def fetch_and_store(idx, model, message, response_format):
-         try:
-             out[idx] = await afetch_litellm_api_response(
-                 model, message, response_format
-             )
-         except Exception as e:
-             judgeval_logger.error(f"Error in parallel call {idx}: {str(e)}")
-             out[idx] = None
-
-     tasks = [
-         fetch_and_store(idx, model, message, response_format)
-         for idx, (model, message, response_format) in enumerate(
-             zip(models, messages, response_formats or [None] * len(messages))
-         )
-     ]
-
-     await asyncio.gather(*tasks)
-     return out
-
-
- def validate_chat_messages(messages, batched: bool = False):
-     """Validate chat message format before API call"""
-     if not isinstance(messages, list):
-         raise TypeError("Messages must be a list")
-
-     for msg in messages:
-         if not isinstance(msg, dict):
-             if batched and not isinstance(msg, list):
-                 raise TypeError("Each message must be a list")
-             elif not batched:
-                 raise TypeError("Message must be a dictionary")
-         if "role" not in msg:
-             raise ValueError("Message missing required 'role' field")
-         if "content" not in msg:
-             raise ValueError("Message missing required 'content' field")
-         if msg["role"] not in ["system", "user", "assistant"]:
-             raise ValueError(
-                 f"Invalid role '{msg['role']}'. Must be 'system', 'user', or 'assistant'"
-             )
-
-
- def validate_batched_chat_messages(messages):
-     """
-     Validate format of batched chat messages before API call
-
-     Args:
-         messages (List[List[Mapping]]): List of message lists, where each inner list contains
-             message dictionaries with 'role' and 'content' fields
-
-     Raises:
-         TypeError: If messages format is invalid
-         ValueError: If message content is invalid
-     """
-     if not isinstance(messages, list):
-         raise TypeError("Batched messages must be a list")
-
-     if not messages:
-         raise ValueError("Batched messages cannot be empty")
-
-     for message_list in messages:
-         if not isinstance(message_list, list):
-             raise TypeError("Each batch item must be a list of messages")
-
-         # Validate individual messages using existing function
-         validate_chat_messages(message_list)
-
-
- def is_batched_messages(
-     messages: Union[List[Mapping], List[List[Mapping]]],
- ) -> TypeGuard[List[List[Mapping]]]:
-     return isinstance(messages, list) and all(isinstance(msg, list) for msg in messages)
-
-
- def is_simple_messages(
-     messages: Union[List[Mapping], List[List[Mapping]]],
- ) -> TypeGuard[List[Mapping]]:
-     return isinstance(messages, list) and all(
-         not isinstance(msg, list) for msg in messages
-     )
-
-
- def get_chat_completion(
-     model_type: str,
-     messages: Union[List[Mapping], List[List[Mapping]]],
-     response_format: pydantic.BaseModel = None,
-     batched: bool = False,
- ) -> Union[str, List[str | None]]:
-     """
-     Generates chat completions using a single model and potentially several messages. Supports closed-source and OSS models.
-
-     Parameters:
-         - model_type (str): The type of model to use for generating completions.
-         - messages (Union[List[Mapping], List[List[Mapping]]]): The messages to be used for generating completions.
-             If batched is True, this should be a list of lists of mappings.
-         - response_format (pydantic.BaseModel, optional): The format of the response. Defaults to None.
-         - batched (bool, optional): Whether to process messages in batch mode. Defaults to False.
-     Returns:
-         - str: The generated chat completion(s). If batched is True, returns a list of strings.
-     Raises:
-         - ValueError: If requested model is not supported by Litellm or TogetherAI.
-     """
-
-     # Check for empty messages list
-     if not messages or messages == []:
-         raise ValueError("Messages cannot be empty")
-
-     # Add validation
-     if batched:
-         validate_batched_chat_messages(messages)
-     else:
-         validate_chat_messages(messages)
-
-     if (
-         batched
-         and is_batched_messages(messages)
-         and model_type in TOGETHER_SUPPORTED_MODELS
-     ):
-         return query_together_api_multiple_calls(
-             models=[model_type] * len(messages),
-             messages=messages,
-             response_formats=[response_format] * len(messages),
-         )
-     elif (
-         batched
-         and is_batched_messages(messages)
-         and model_type in LITELLM_SUPPORTED_MODELS
-     ):
-         return query_litellm_api_multiple_calls(
-             models=[model_type] * len(messages),
-             messages=messages,
-             response_formats=[response_format] * len(messages),
-         )
-     elif (
-         not batched
-         and is_simple_messages(messages)
-         and model_type in TOGETHER_SUPPORTED_MODELS
-     ):
-         return fetch_together_api_response(
-             model=model_type, messages=messages, response_format=response_format
-         )
-     elif (
-         not batched
-         and is_simple_messages(messages)
-         and model_type in LITELLM_SUPPORTED_MODELS
-     ):
-         return fetch_litellm_api_response(
-             model=model_type, messages=messages, response_format=response_format
-         )
-
-     raise ValueError(
-         f"Model {model_type} is not supported by Litellm or TogetherAI for chat completions. Please check the model name and try again."
-     )
-
-
- async def aget_chat_completion(
-     model_type: str,
-     messages: Union[List[Mapping], List[List[Mapping]]],
-     response_format: pydantic.BaseModel = None,
-     batched: bool = False,
- ) -> Union[str, List[str | None]]:
-     """
-     ASYNCHRONOUSLY generates chat completions using a single model and potentially several messages. Supports closed-source and OSS models.
-
-     Parameters:
-         - model_type (str): The type of model to use for generating completions.
-         - messages (Union[List[Mapping], List[List[Mapping]]]): The messages to be used for generating completions.
-             If batched is True, this should be a list of lists of mappings.
-         - response_format (pydantic.BaseModel, optional): The format of the response. Defaults to None.
-         - batched (bool, optional): Whether to process messages in batch mode. Defaults to False.
-     Returns:
-         - str: The generated chat completion(s). If batched is True, returns a list of strings.
-     Raises:
-         - ValueError: If requested model is not supported by Litellm or TogetherAI.
-     """
-
-     if batched:
-         validate_batched_chat_messages(messages)
-     else:
-         validate_chat_messages(messages)
-
-     if (
-         batched
-         and is_batched_messages(messages)
-         and model_type in TOGETHER_SUPPORTED_MODELS
-     ):
-         return await aquery_together_api_multiple_calls(
-             models=[model_type] * len(messages),
-             messages=messages,
-             response_formats=[response_format] * len(messages),
-         )
-     elif (
-         batched
-         and is_batched_messages(messages)
-         and model_type in LITELLM_SUPPORTED_MODELS
-     ):
-         return await aquery_litellm_api_multiple_calls(
-             models=[model_type] * len(messages),
-             messages=messages,
-             response_formats=[response_format] * len(messages),
-         )
-     elif (
-         not batched
-         and is_simple_messages(messages)
-         and model_type in TOGETHER_SUPPORTED_MODELS
-     ):
-         return await afetch_together_api_response(
-             model=model_type, messages=messages, response_format=response_format
-         )
-     elif (
-         not batched
-         and is_simple_messages(messages)
-         and model_type in LITELLM_SUPPORTED_MODELS
-     ):
-         return await afetch_litellm_api_response(
-             model=model_type, messages=messages, response_format=response_format
-         )
-
-     judgeval_logger.error(f"Model {model_type} not supported by either API")
-     raise ValueError(
-         f"Model {model_type} is not supported by Litellm or TogetherAI for chat completions. Please check the model name and try again."
-     )
-
-
- def get_completion_multiple_models(
-     models: List[str],
-     messages: List[List[Mapping]],
-     response_formats: List[pydantic.BaseModel] | None = None,
- ) -> List[str | None]:
-     """
-     Retrieves completions for a single prompt from multiple models in parallel. Supports closed-source and OSS models.
-
-     Args:
-         models (List[str]): List of models to query
-         messages (List[List[Mapping]]): List of messages to query. Each inner object corresponds to a single prompt.
-         response_formats (List[pydantic.BaseModel], optional): A list of the format of the response if JSON forcing. Defaults to None.
-
-     Returns:
-         List[str]: List of completions from the models in the order of the input models
-     Raises:
-         ValueError: If a model is not supported by Litellm or Together
-     """
-
-     if models is None or models == []:
-         raise ValueError("Models list cannot be empty")
-
-     validate_batched_chat_messages(messages)
-
-     if len(models) != len(messages):
-         judgeval_logger.error(
-             f"Model/message count mismatch: {len(models)} vs {len(messages)}"
-         )
-         raise ValueError(
-             f"Number of models and messages must be the same: {len(models)} != {len(messages)}"
-         )
-     if response_formats is None:
-         response_formats = [None] * len(models)
-     # Partition the model requests into TogetherAI and Litellm models, but keep the ordering saved
-     together_calls, litellm_calls = {}, {}  # index -> model, message, response_format
-     together_responses, litellm_responses = [], []
-     for idx, (model, message, r_format) in enumerate(
-         zip(models, messages, response_formats)
-     ):
-         if model in TOGETHER_SUPPORTED_MODELS:
-             together_calls[idx] = (model, message, r_format)
-         elif model in LITELLM_SUPPORTED_MODELS:
-             litellm_calls[idx] = (model, message, r_format)
-         else:
-             judgeval_logger.error(f"Model {model} not supported by either API")
-             raise ValueError(
-                 f"Model {model} is not supported by Litellm or TogetherAI for chat completions. Please check the model name and try again."
-             )
-
-     # Add validation before processing
-     for msg_list in messages:
-         validate_chat_messages(msg_list)
-
-     # Get the responses from the TogetherAI models
-     # List of responses from the TogetherAI models in order of the together_calls dict
-     if together_calls:
-         together_responses = query_together_api_multiple_calls(
-             models=[model for model, _, _ in together_calls.values()],
-             messages=[message for _, message, _ in together_calls.values()],
-             response_formats=[format for _, _, format in together_calls.values()],
-         )
-
-     # Get the responses from the Litellm models
-     if litellm_calls:
-         litellm_responses = query_litellm_api_multiple_calls(
-             models=[model for model, _, _ in litellm_calls.values()],
-             messages=[message for _, message, _ in litellm_calls.values()],
-             response_formats=[format for _, _, format in litellm_calls.values()],
-         )
-
-     # Merge the responses in the order of the original models
-     out: List[Union[str, None]] = [None] * len(models)
-     for idx, (model, message, r_format) in together_calls.items():
-         out[idx] = together_responses.pop(0)
-     for idx, (model, message, r_format) in litellm_calls.items():
-         out[idx] = litellm_responses.pop(0)
-     return out
-
-
- async def aget_completion_multiple_models(
-     models: List[str],
-     messages: List[List[Mapping]],
-     response_formats: List[pydantic.BaseModel] | None = None,
- ) -> List[str | None]:
-     """
-     ASYNCHRONOUSLY retrieves completions for a single prompt from multiple models in parallel. Supports closed-source and OSS models.
-
-     Args:
-         models (List[str]): List of models to query
-         messages (List[List[Mapping]]): List of messages to query. Each inner object corresponds to a single prompt.
-         response_formats (List[pydantic.BaseModel], optional): A list of the format of the response if JSON forcing. Defaults to None.
-
-     Returns:
-         List[str]: List of completions from the models in the order of the input models
-     Raises:
-         ValueError: If a model is not supported by Litellm or Together
-     """
-     if models is None or models == []:
-         raise ValueError("Models list cannot be empty")
-
-     if len(models) != len(messages):
-         raise ValueError(
-             f"Number of models and messages must be the same: {len(models)} != {len(messages)}"
-         )
-     if response_formats is None:
-         response_formats = [None] * len(models)
-
-     validate_batched_chat_messages(messages)
-
-     # Partition the model requests into TogetherAI and Litellm models, but keep the ordering saved
-     together_calls, litellm_calls = {}, {}  # index -> model, message, response_format
-     together_responses, litellm_responses = [], []
-     for idx, (model, message, r_format) in enumerate(
-         zip(models, messages, response_formats)
-     ):
-         if model in TOGETHER_SUPPORTED_MODELS:
-             together_calls[idx] = (model, message, r_format)
-         elif model in LITELLM_SUPPORTED_MODELS:
-             litellm_calls[idx] = (model, message, r_format)
-         else:
-             raise ValueError(
-                 f"Model {model} is not supported by Litellm or TogetherAI for chat completions. Please check the model name and try again."
-             )
-
-     # Add validation before processing
-     for msg_list in messages:
-         validate_chat_messages(msg_list)
-
-     # Get the responses from the TogetherAI models
-     # List of responses from the TogetherAI models in order of the together_calls dict
-     if together_calls:
-         together_responses = await aquery_together_api_multiple_calls(
-             models=[model for model, _, _ in together_calls.values()],
-             messages=[message for _, message, _ in together_calls.values()],
-             response_formats=[format for _, _, format in together_calls.values()],
-         )
-
-     # Get the responses from the Litellm models
-     if litellm_calls:
-         litellm_responses = await aquery_litellm_api_multiple_calls(
-             models=[model for model, _, _ in litellm_calls.values()],
-             messages=[message for _, message, _ in litellm_calls.values()],
-             response_formats=[format for _, _, format in litellm_calls.values()],
-         )
-
-     # Merge the responses in the order of the original models
-     out: List[Union[str, None]] = [None] * len(models)
-     for idx, (model, message, r_format) in together_calls.items():
-         out[idx] = together_responses.pop(0)
-     for idx, (model, message, r_format) in litellm_calls.items():
-         out[idx] = litellm_responses.pop(0)
-     return out
-
-
- if __name__ == "__main__":
-     batched_messages: List[List[Mapping]] = [
-         [
-             {"role": "system", "content": "You are a helpful assistant."},
-             {"role": "user", "content": "What is the capital of France?"},
-         ],
-         [
-             {"role": "system", "content": "You are a helpful assistant."},
-             {"role": "user", "content": "What is the capital of Japan?"},
-         ],
-     ]
-
-     non_batched_messages: List[Mapping] = [
-         {"role": "system", "content": "You are a helpful assistant."},
-         {"role": "user", "content": "What is the capital of France?"},
-     ]
-
-     batched_messages_2: List[List[Mapping]] = [
-         [
-             {"role": "system", "content": "You are a helpful assistant."},
-             {"role": "user", "content": "What is the capital of China?"},
-         ],
-         [
-             {"role": "system", "content": "You are a helpful assistant."},
-             {"role": "user", "content": "What is the capital of France?"},
-         ],
-         [
-             {"role": "system", "content": "You are a helpful assistant."},
-             {"role": "user", "content": "What is the capital of Japan?"},
-         ],
-     ]
-
-     # Batched
-     pprint.pprint(
-         get_chat_completion(
-             model_type="LLAMA3_405B_INSTRUCT_TURBO",
-             messages=batched_messages,
-             batched=True,
-         )
-     )
-
-     # Non batched
-     pprint.pprint(
-         get_chat_completion(
-             model_type="LLAMA3_8B_INSTRUCT_TURBO",
-             messages=non_batched_messages,
-             batched=False,
-         )
-     )
-
-     # Batched single completion to multiple models
-     pprint.pprint(
-         get_completion_multiple_models(
-             models=[
-                 "LLAMA3_70B_INSTRUCT_TURBO",
-                 "LLAMA3_405B_INSTRUCT_TURBO",
-                 "gpt-4.1-mini",
-             ],
-             messages=batched_messages_2,
-         )
-     )
-
- ExcInfo: TypeAlias = tuple[type[BaseException], BaseException, TracebackType]
- OptExcInfo: TypeAlias = ExcInfo | tuple[None, None, None]
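The deleted module wrapped litellm and the Together SDK behind helpers such as fetch_litellm_api_response. Code that imported those helpers from judgeval.common.utils can call litellm directly; a minimal sketch under that assumption, using only the calls the deleted code itself made (the model name is taken from the example block above, not a recommendation):

# Minimal stand-in for the removed fetch_litellm_api_response helper.
import litellm

def fetch_response(model: str, messages: list) -> str:
    # Same call the deleted helper made, minus the ChatCompletionRequest validation.
    response = litellm.completion(model=model, messages=messages)
    return response.choices[0].message.content

print(fetch_response("gpt-4.1-mini", [{"role": "user", "content": "Hello"}]))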