judgeval 0.1.0__py3-none-any.whl → 0.23.0__py3-none-any.whl

This diff compares publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (234)
  1. judgeval/__init__.py +173 -10
  2. judgeval/api/__init__.py +523 -0
  3. judgeval/api/api_types.py +413 -0
  4. judgeval/cli.py +112 -0
  5. judgeval/constants.py +7 -30
  6. judgeval/data/__init__.py +1 -3
  7. judgeval/data/evaluation_run.py +125 -0
  8. judgeval/data/example.py +14 -40
  9. judgeval/data/judgment_types.py +396 -146
  10. judgeval/data/result.py +11 -18
  11. judgeval/data/scorer_data.py +3 -26
  12. judgeval/data/scripts/openapi_transform.py +5 -5
  13. judgeval/data/trace.py +115 -194
  14. judgeval/dataset/__init__.py +335 -0
  15. judgeval/env.py +55 -0
  16. judgeval/evaluation/__init__.py +346 -0
  17. judgeval/exceptions.py +28 -0
  18. judgeval/integrations/langgraph/__init__.py +13 -0
  19. judgeval/integrations/openlit/__init__.py +51 -0
  20. judgeval/judges/__init__.py +2 -2
  21. judgeval/judges/litellm_judge.py +77 -16
  22. judgeval/judges/together_judge.py +88 -17
  23. judgeval/judges/utils.py +7 -20
  24. judgeval/judgment_attribute_keys.py +55 -0
  25. judgeval/{common/logger.py → logger.py} +24 -8
  26. judgeval/prompt/__init__.py +330 -0
  27. judgeval/scorers/__init__.py +11 -11
  28. judgeval/scorers/agent_scorer.py +15 -19
  29. judgeval/scorers/api_scorer.py +21 -23
  30. judgeval/scorers/base_scorer.py +54 -36
  31. judgeval/scorers/example_scorer.py +1 -3
  32. judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +2 -24
  33. judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +2 -10
  34. judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +2 -2
  35. judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +2 -10
  36. judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +2 -14
  37. judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +171 -59
  38. judgeval/scorers/score.py +64 -47
  39. judgeval/scorers/utils.py +2 -107
  40. judgeval/tracer/__init__.py +1111 -2
  41. judgeval/tracer/constants.py +1 -0
  42. judgeval/tracer/exporters/__init__.py +40 -0
  43. judgeval/tracer/exporters/s3.py +119 -0
  44. judgeval/tracer/exporters/store.py +59 -0
  45. judgeval/tracer/exporters/utils.py +32 -0
  46. judgeval/tracer/keys.py +63 -0
  47. judgeval/tracer/llm/__init__.py +7 -0
  48. judgeval/tracer/llm/config.py +78 -0
  49. judgeval/tracer/llm/constants.py +9 -0
  50. judgeval/tracer/llm/llm_anthropic/__init__.py +3 -0
  51. judgeval/tracer/llm/llm_anthropic/config.py +6 -0
  52. judgeval/tracer/llm/llm_anthropic/messages.py +452 -0
  53. judgeval/tracer/llm/llm_anthropic/messages_stream.py +322 -0
  54. judgeval/tracer/llm/llm_anthropic/wrapper.py +59 -0
  55. judgeval/tracer/llm/llm_google/__init__.py +3 -0
  56. judgeval/tracer/llm/llm_google/config.py +6 -0
  57. judgeval/tracer/llm/llm_google/generate_content.py +127 -0
  58. judgeval/tracer/llm/llm_google/wrapper.py +30 -0
  59. judgeval/tracer/llm/llm_openai/__init__.py +3 -0
  60. judgeval/tracer/llm/llm_openai/beta_chat_completions.py +216 -0
  61. judgeval/tracer/llm/llm_openai/chat_completions.py +501 -0
  62. judgeval/tracer/llm/llm_openai/config.py +6 -0
  63. judgeval/tracer/llm/llm_openai/responses.py +506 -0
  64. judgeval/tracer/llm/llm_openai/utils.py +42 -0
  65. judgeval/tracer/llm/llm_openai/wrapper.py +63 -0
  66. judgeval/tracer/llm/llm_together/__init__.py +3 -0
  67. judgeval/tracer/llm/llm_together/chat_completions.py +406 -0
  68. judgeval/tracer/llm/llm_together/config.py +6 -0
  69. judgeval/tracer/llm/llm_together/wrapper.py +52 -0
  70. judgeval/tracer/llm/providers.py +19 -0
  71. judgeval/tracer/managers.py +167 -0
  72. judgeval/tracer/processors/__init__.py +220 -0
  73. judgeval/tracer/utils.py +19 -0
  74. judgeval/trainer/__init__.py +14 -0
  75. judgeval/trainer/base_trainer.py +122 -0
  76. judgeval/trainer/config.py +123 -0
  77. judgeval/trainer/console.py +144 -0
  78. judgeval/trainer/fireworks_trainer.py +392 -0
  79. judgeval/trainer/trainable_model.py +252 -0
  80. judgeval/trainer/trainer.py +70 -0
  81. judgeval/utils/async_utils.py +39 -0
  82. judgeval/utils/decorators/__init__.py +0 -0
  83. judgeval/utils/decorators/dont_throw.py +37 -0
  84. judgeval/utils/decorators/use_once.py +13 -0
  85. judgeval/utils/file_utils.py +74 -28
  86. judgeval/utils/guards.py +36 -0
  87. judgeval/utils/meta.py +27 -0
  88. judgeval/utils/project.py +15 -0
  89. judgeval/utils/serialize.py +253 -0
  90. judgeval/utils/testing.py +70 -0
  91. judgeval/utils/url.py +10 -0
  92. judgeval/{version_check.py → utils/version_check.py} +5 -3
  93. judgeval/utils/wrappers/README.md +3 -0
  94. judgeval/utils/wrappers/__init__.py +15 -0
  95. judgeval/utils/wrappers/immutable_wrap_async.py +74 -0
  96. judgeval/utils/wrappers/immutable_wrap_async_iterator.py +84 -0
  97. judgeval/utils/wrappers/immutable_wrap_sync.py +66 -0
  98. judgeval/utils/wrappers/immutable_wrap_sync_iterator.py +84 -0
  99. judgeval/utils/wrappers/mutable_wrap_async.py +67 -0
  100. judgeval/utils/wrappers/mutable_wrap_sync.py +67 -0
  101. judgeval/utils/wrappers/py.typed +0 -0
  102. judgeval/utils/wrappers/utils.py +35 -0
  103. judgeval/v1/__init__.py +88 -0
  104. judgeval/v1/data/__init__.py +7 -0
  105. judgeval/v1/data/example.py +44 -0
  106. judgeval/v1/data/scorer_data.py +42 -0
  107. judgeval/v1/data/scoring_result.py +44 -0
  108. judgeval/v1/datasets/__init__.py +6 -0
  109. judgeval/v1/datasets/dataset.py +214 -0
  110. judgeval/v1/datasets/dataset_factory.py +94 -0
  111. judgeval/v1/evaluation/__init__.py +6 -0
  112. judgeval/v1/evaluation/evaluation.py +182 -0
  113. judgeval/v1/evaluation/evaluation_factory.py +17 -0
  114. judgeval/v1/instrumentation/__init__.py +6 -0
  115. judgeval/v1/instrumentation/llm/__init__.py +7 -0
  116. judgeval/v1/instrumentation/llm/config.py +78 -0
  117. judgeval/v1/instrumentation/llm/constants.py +11 -0
  118. judgeval/v1/instrumentation/llm/llm_anthropic/__init__.py +5 -0
  119. judgeval/v1/instrumentation/llm/llm_anthropic/config.py +6 -0
  120. judgeval/v1/instrumentation/llm/llm_anthropic/messages.py +414 -0
  121. judgeval/v1/instrumentation/llm/llm_anthropic/messages_stream.py +307 -0
  122. judgeval/v1/instrumentation/llm/llm_anthropic/wrapper.py +61 -0
  123. judgeval/v1/instrumentation/llm/llm_google/__init__.py +5 -0
  124. judgeval/v1/instrumentation/llm/llm_google/config.py +6 -0
  125. judgeval/v1/instrumentation/llm/llm_google/generate_content.py +121 -0
  126. judgeval/v1/instrumentation/llm/llm_google/wrapper.py +30 -0
  127. judgeval/v1/instrumentation/llm/llm_openai/__init__.py +5 -0
  128. judgeval/v1/instrumentation/llm/llm_openai/beta_chat_completions.py +212 -0
  129. judgeval/v1/instrumentation/llm/llm_openai/chat_completions.py +477 -0
  130. judgeval/v1/instrumentation/llm/llm_openai/config.py +6 -0
  131. judgeval/v1/instrumentation/llm/llm_openai/responses.py +472 -0
  132. judgeval/v1/instrumentation/llm/llm_openai/utils.py +41 -0
  133. judgeval/v1/instrumentation/llm/llm_openai/wrapper.py +63 -0
  134. judgeval/v1/instrumentation/llm/llm_together/__init__.py +5 -0
  135. judgeval/v1/instrumentation/llm/llm_together/chat_completions.py +382 -0
  136. judgeval/v1/instrumentation/llm/llm_together/config.py +6 -0
  137. judgeval/v1/instrumentation/llm/llm_together/wrapper.py +57 -0
  138. judgeval/v1/instrumentation/llm/providers.py +19 -0
  139. judgeval/v1/integrations/claude_agent_sdk/__init__.py +119 -0
  140. judgeval/v1/integrations/claude_agent_sdk/wrapper.py +564 -0
  141. judgeval/v1/integrations/langgraph/__init__.py +13 -0
  142. judgeval/v1/integrations/openlit/__init__.py +47 -0
  143. judgeval/v1/internal/api/__init__.py +525 -0
  144. judgeval/v1/internal/api/api_types.py +413 -0
  145. judgeval/v1/prompts/__init__.py +6 -0
  146. judgeval/v1/prompts/prompt.py +29 -0
  147. judgeval/v1/prompts/prompt_factory.py +189 -0
  148. judgeval/v1/py.typed +0 -0
  149. judgeval/v1/scorers/__init__.py +6 -0
  150. judgeval/v1/scorers/api_scorer.py +82 -0
  151. judgeval/v1/scorers/base_scorer.py +17 -0
  152. judgeval/v1/scorers/built_in/__init__.py +17 -0
  153. judgeval/v1/scorers/built_in/answer_correctness.py +28 -0
  154. judgeval/v1/scorers/built_in/answer_relevancy.py +28 -0
  155. judgeval/v1/scorers/built_in/built_in_factory.py +26 -0
  156. judgeval/v1/scorers/built_in/faithfulness.py +28 -0
  157. judgeval/v1/scorers/built_in/instruction_adherence.py +28 -0
  158. judgeval/v1/scorers/custom_scorer/__init__.py +6 -0
  159. judgeval/v1/scorers/custom_scorer/custom_scorer.py +50 -0
  160. judgeval/v1/scorers/custom_scorer/custom_scorer_factory.py +16 -0
  161. judgeval/v1/scorers/prompt_scorer/__init__.py +6 -0
  162. judgeval/v1/scorers/prompt_scorer/prompt_scorer.py +86 -0
  163. judgeval/v1/scorers/prompt_scorer/prompt_scorer_factory.py +85 -0
  164. judgeval/v1/scorers/scorers_factory.py +49 -0
  165. judgeval/v1/tracer/__init__.py +7 -0
  166. judgeval/v1/tracer/base_tracer.py +520 -0
  167. judgeval/v1/tracer/exporters/__init__.py +14 -0
  168. judgeval/v1/tracer/exporters/in_memory_span_exporter.py +25 -0
  169. judgeval/v1/tracer/exporters/judgment_span_exporter.py +42 -0
  170. judgeval/v1/tracer/exporters/noop_span_exporter.py +19 -0
  171. judgeval/v1/tracer/exporters/span_store.py +50 -0
  172. judgeval/v1/tracer/judgment_tracer_provider.py +70 -0
  173. judgeval/v1/tracer/processors/__init__.py +6 -0
  174. judgeval/v1/tracer/processors/_lifecycles/__init__.py +28 -0
  175. judgeval/v1/tracer/processors/_lifecycles/agent_id_processor.py +53 -0
  176. judgeval/v1/tracer/processors/_lifecycles/context_keys.py +11 -0
  177. judgeval/v1/tracer/processors/_lifecycles/customer_id_processor.py +29 -0
  178. judgeval/v1/tracer/processors/_lifecycles/registry.py +18 -0
  179. judgeval/v1/tracer/processors/judgment_span_processor.py +165 -0
  180. judgeval/v1/tracer/processors/noop_span_processor.py +42 -0
  181. judgeval/v1/tracer/tracer.py +67 -0
  182. judgeval/v1/tracer/tracer_factory.py +38 -0
  183. judgeval/v1/trainers/__init__.py +5 -0
  184. judgeval/v1/trainers/base_trainer.py +62 -0
  185. judgeval/v1/trainers/config.py +123 -0
  186. judgeval/v1/trainers/console.py +144 -0
  187. judgeval/v1/trainers/fireworks_trainer.py +392 -0
  188. judgeval/v1/trainers/trainable_model.py +252 -0
  189. judgeval/v1/trainers/trainers_factory.py +37 -0
  190. judgeval/v1/utils.py +18 -0
  191. judgeval/version.py +5 -0
  192. judgeval/warnings.py +4 -0
  193. judgeval-0.23.0.dist-info/METADATA +266 -0
  194. judgeval-0.23.0.dist-info/RECORD +201 -0
  195. judgeval-0.23.0.dist-info/entry_points.txt +2 -0
  196. judgeval/clients.py +0 -34
  197. judgeval/common/__init__.py +0 -13
  198. judgeval/common/api/__init__.py +0 -3
  199. judgeval/common/api/api.py +0 -352
  200. judgeval/common/api/constants.py +0 -165
  201. judgeval/common/exceptions.py +0 -27
  202. judgeval/common/storage/__init__.py +0 -6
  203. judgeval/common/storage/s3_storage.py +0 -98
  204. judgeval/common/tracer/__init__.py +0 -31
  205. judgeval/common/tracer/constants.py +0 -22
  206. judgeval/common/tracer/core.py +0 -1916
  207. judgeval/common/tracer/otel_exporter.py +0 -108
  208. judgeval/common/tracer/otel_span_processor.py +0 -234
  209. judgeval/common/tracer/span_processor.py +0 -37
  210. judgeval/common/tracer/span_transformer.py +0 -211
  211. judgeval/common/tracer/trace_manager.py +0 -92
  212. judgeval/common/utils.py +0 -940
  213. judgeval/data/datasets/__init__.py +0 -4
  214. judgeval/data/datasets/dataset.py +0 -341
  215. judgeval/data/datasets/eval_dataset_client.py +0 -214
  216. judgeval/data/tool.py +0 -5
  217. judgeval/data/trace_run.py +0 -37
  218. judgeval/evaluation_run.py +0 -75
  219. judgeval/integrations/langgraph.py +0 -843
  220. judgeval/judges/mixture_of_judges.py +0 -286
  221. judgeval/judgment_client.py +0 -369
  222. judgeval/rules.py +0 -521
  223. judgeval/run_evaluation.py +0 -684
  224. judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py +0 -14
  225. judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +0 -52
  226. judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -28
  227. judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +0 -20
  228. judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +0 -27
  229. judgeval/utils/alerts.py +0 -93
  230. judgeval/utils/requests.py +0 -50
  231. judgeval-0.1.0.dist-info/METADATA +0 -202
  232. judgeval-0.1.0.dist-info/RECORD +0 -73
  233. {judgeval-0.1.0.dist-info → judgeval-0.23.0.dist-info}/WHEEL +0 -0
  234. {judgeval-0.1.0.dist-info → judgeval-0.23.0.dist-info}/licenses/LICENSE.md +0 -0
judgeval/judges/mixture_of_judges.py (file removed in 0.23.0)
@@ -1,286 +0,0 @@
- """
- Implementation for Mixture of Judges model through Judgeval
-
- Enables client to use multiple models to generate responses and then aggregate them into a single response.
- """
-
- import pydantic
- from typing import List, Union
- from judgeval.judges import JudgevalJudge
- from judgeval.common.utils import (
-     get_completion_multiple_models,
-     get_chat_completion,
-     aget_completion_multiple_models,
-     aget_chat_completion,
- )
- from judgeval.common.logger import judgeval_logger
-
-
- def build_dynamic_mixture_prompt(
-     judge_responses: List[str],
-     custom_system_prompt: str | None = None,
-     custom_conversation_history: List[dict] | None = None,
- ) -> List[dict]:
-     """
-     Dynamically builds a prompt to mix judge responses together for the Mixture of Judges model.
-
-     In this implementation, we simply concatenate the judge responses into a formatted string, then
-     pass it into a default prompt template. This template can be customized by providing a custom prompt.
-
-     Args:
-         judge_responses (List[str]): List of responses from individual judges to be synthesized
-         custom_system_prompt (str, optional): Custom system prompt to override the default one. Defaults to None.
-         custom_conversation_history (List[dict], optional): Custom conversation history to override the default one. Defaults to None.
-     """
-     formatted_responses = "\n".join(
-         [
-             f"# Judge {i + 1}'s response: #\n{response}"
-             for i, response in enumerate(judge_responses)
-         ]
-     )
-
-     # This is the default prompt for the Mixture of Judges model
-     """
-     You are tasked with synthesizing responses from multiple expert judges. You will receive N individual answers on the same topic. Your job is to:
-
-     1. Analyze and compare the key points, patterns, and agreements between the answers.
-     2. Identify the consensus by focusing on areas where most or all of the answers align. Consider common reasoning and frequently mentioned conclusions.
-     3. Condense the responses into a single, coherent, and concise answer that represents the collective judgment of the group.
-     4. When opinions differ or contradict, highlight the most supported viewpoint while briefly acknowledging the dissenting perspectives.
-     5. Ensure the final answer is balanced and clear, providing a comprehensive summary that captures the wisdom of all judges while avoiding repetition.
-
-     ## Start of Judge Responses ##
-     {{judge_responses}}
-     ## End of Judge Responses ##
-     Synthesized response:
-     """
-
-     default_conversation = [ # inject the judge responses into the default prompt
-         {
-             "role": "system",
-             "content": "You are tasked with synthesizing responses from multiple expert judges. You will receive N individual answers on the same topic. Your job is to:\n1. Analyze and compare the key points, patterns, and agreements between the answers.\n2. Identify the consensus by focusing on areas where most or all of the answers align. Consider common reasoning and frequently mentioned conclusions.\n3. Condense the responses into a single, coherent, and concise answer that represents the collective judgment of the group.\n4. When opinions differ or contradict, highlight the most supported viewpoint while briefly acknowledging the dissenting perspectives.\n5. Ensure the final answer is balanced and clear, providing a comprehensive summary that captures the wisdom of all judges while avoiding repetition.\n\n**IMPORTANT**: IF THE JUDGE RESPONSES ARE IN JSON FORMAT, YOU MUST RESPOND USING THE SAME JSON FORMAT THAT THE RESPONSES ARE IN. If the judge responses are in JSON, you MUST RESPOND IN VALID JSON FORMAT. ",
-         },
-         {
-             "role": "user",
-             "content": '## Start of Judge Responses ## \n# Judge 1\'s response: #\n{\n"claims": [\n{\n"claim": "A 30-day full refund is offered.",\n"quote": "We offer a 30-day full refund at no extra cost."\n},\n{\n"claim": "The 30-day full refund comes at no extra cost.",\n"quote": "We offer a 30-day full refund at no extra cost."\n}\n]\n}\n\n# Judge 2\'s response: #\n{\n "claims": [\n {\n "claim": "A full refund is offered within 30 days.",\n "quote": "We offer a 30-day full refund at no extra cost."\n },\n {\n "claim": "The 30-day full refund is offered at no extra cost.",\n "quote": "We offer a 30-day full refund at no extra cost."\n }\n ]\n}\n# Judge 3\'s response: #\n {\n "claims": [\n {\n "claim": "A 30-day full refund is offered.",\n "quote": "We offer a 30-day full refund at no extra cost."\n },\n {\n "claim": "The 30-day full refund is offered at no extra cost.",\n "quote": "We offer a 30-day full refund at no extra cost."\n }\n ]\n}\n## End of Judge Responses ##\nSynthesized response:',
-         },
-         {
-             "role": "assistant",
-             "content": 'The consensus among the judges is clear and unanimous. All three judges agree that a 30-day full refund is offered, and this refund is available at no extra cost. This conclusion is consistently supported by their statements, as each of their claims is directly quoted as: "We offer a 30-day full refund at no extra cost." There are no dissenting perspectives or opposing views provided in any of the responses, indicating complete alignment on this topic.\n\nJSON:\n{\n "claims": [\n {\n "claim": "A full refund is offered within 30 days.",\n "quote": "We offer a 30-day full refund at no extra cost."\n },\n {\n "claim": "The 30-day full refund is offered at no extra cost.",\n "quote": "We offer a 30-day full refund at no extra cost."\n }\n ]\n}',
-         },
-         {
-             "role": "user",
-             "content": "## Start of Judge Responses ##\n# Judge 1's response: # \nThe capital of France is Paris.\n\n# Judge 2's response: #\nThe capital of France is Paris.\n\n# Judge 3's response: # \nThe capital of France is Paris. It's one of the most popular tourist destinations in the world, known for its art, culture, and history. It's also famous for its iconic landmarks such as the Eiffel Tower, Louvre Museum, and Notre-Dame Cathedral.\n\n## End of Judge Responses ##\nSynthesized response:",
-         },
-         {
-             "role": "assistant",
-             "content": "The capital of France is Paris. It is widely recognized as one of the world's most popular tourist destinations, celebrated for its rich art, culture, and history. Paris is renowned for its iconic landmarks, including the Eiffel Tower, Louvre Museum, and Notre-Dame Cathedral.",
-         },
-         {
-             "role": "user",
-             "content": f"## Start of Judge Responses ##\n{formatted_responses}\n## End of Judge Responses ##\nSynthesized response:\n",
-         },
-     ]
-
-     # If a custom system prompt is provided, validate and use it
-     if custom_system_prompt is not None:
-         if not isinstance(custom_system_prompt, str):
-             judgeval_logger.error(
-                 f"TypeError: Custom system prompt must be a string. Received: {type(custom_system_prompt)}."
-             )
-             raise TypeError(
-                 f"Custom system prompt must be a string. Received: {type(custom_system_prompt)}."
-             )
-         if not custom_system_prompt:
-             raise ValueError("Custom system prompt cannot be empty")
-         # Override the default system prompt, but also add special instructions for handling JSON
-         default_conversation[0]["content"] = (
-             custom_system_prompt
-             + "\n\n**IMPORTANT**: IF THE JUDGE RESPONSES ARE IN JSON FORMAT, YOU MUST RESPOND USING THE SAME JSON FORMAT THAT THE RESPONSES ARE IN. If the judge responses are in JSON, you MUST RESPOND IN VALID JSON FORMAT."
-         )
-
-     # If a custom conversation history is provided, append the judge responses to it
-     if custom_conversation_history is not None:
-         # Validate custom conversation history format
-         for message in custom_conversation_history:
-             if not isinstance(message, dict):
-                 raise TypeError(
-                     f"Custom conversation history must be a list of dictionaries. Received: {message}."
-                 )
-
-             if "role" not in message or "content" not in message:
-                 raise ValueError("Each message must have 'role' and 'content' keys")
-
-             if not isinstance(message["role"], str) or not isinstance(
-                 message["content"], str
-             ):
-                 raise TypeError(
-                     f"Message role and content must be strings. Received: {type(message['role'])}, {type(message['content'])}."
-                 )
-
-             if message["role"] not in ["system", "user", "assistant"]:
-                 raise ValueError(
-                     f"Message role must be one of: 'system', 'user', 'assistant'. Received: {message['role']}."
-                 )
-
-         judge_responses_prompt = {
-             "role": "user",
-             "content": f"## Start of Judge Responses ##\n{formatted_responses}\n## End of Judge Responses ##\nSynthesized response:\n",
-         }
-         return custom_conversation_history + [judge_responses_prompt]
-
-     # Otherwise return the default conversation with system prompt and examples
-     # No customization, return the default conversation with system prompt and examples
-     return default_conversation
-
-
- BASE_CONVERSATION = [
-     {"role": "system", "content": "You are a helpful assistant."},
- ] # for string inputs, we need to add the user query to a base conversation, since LiteLLM only accepts a list of dictionaries as a chat history
-
-
- class MixtureOfJudges(JudgevalJudge):
-     """
-     IMPORTANT: When supplying custom prompts and conversation histories for aggregation, supply them in the following format:
-     in kwargs:
-     {
-         "custom_prompt": "Your custom prompt here",
-         "custom_conversation": [
-             {"role": "system", "content": "System message 1"},
-             {"role": "user", "content": "User message 1"},
-             {"role": "assistant", "content": "Assistant message 1"},
-             ...
-         ]
-     }
-     """
-
-     def __init__(
-         self,
-         models: List[str] = [
-             "QWEN",
-             "LLAMA3_70B_INSTRUCT_TURBO",
-             "MISTRAL_8x22B_INSTRUCT",
-         ],
-         aggregator: str = "gpt-4.1",
-         **kwargs,
-     ):
-         """
-         `models` are the individual judge models to be used for generating responses.
-         `aggregator` is the model that will aggregate the responses from the individual judges.
-
-         kwargs include "custom_prompt" and "custom_conversation" for customizing the prompt for the Mixture of Judges model.
-         """
-         self.models = models
-         self.aggregator = aggregator
-         self.kwargs = kwargs
-         super().__init__(model_name=models)
-
-     def generate(
-         self,
-         input: Union[str, List[dict]],
-         response_schema: pydantic.BaseModel = None,
-         aggregation_schema: pydantic.BaseModel = None,
-         **kwargs,
-     ) -> str:
-         """
-         Args:
-             input (Union[str, List[Mapping[str, str]]]): Input query or conversation history to the model.
-             response_schema (pydantic.BaseModel): Response schema for individual judge models.
-             aggregation_schema (pydantic.BaseModel): Response schema for the aggregator model.
-             kwargs: Additional keyword arguments.
-         """
-
-         # Convert input to conversation format if needed
-         if isinstance(input, str):
-             convo = BASE_CONVERSATION + [{"role": "user", "content": input}]
-         elif isinstance(input, list):
-             convo = input
-         else:
-             judgeval_logger.error(f"Invalid input type received: {type(input)}")
-             raise TypeError(
-                 f"Input must be a string or a list of dictionaries. Input type of: {type(input)}"
-             )
-
-         try:
-             responses = get_completion_multiple_models(
-                 models=self.models,
-                 messages=[convo] * len(self.models),
-                 response_formats=[response_schema] * len(self.models),
-             )
-         except Exception:
-             raise
-
-         compiled_mixture_prompt = build_dynamic_mixture_prompt(
-             responses,
-             self.kwargs.get("custom_prompt"),
-             self.kwargs.get("custom_conversation"),
-         )
-
-         try:
-             mixed_response = get_chat_completion(
-                 model_type=self.aggregator,
-                 messages=compiled_mixture_prompt,
-                 response_format=aggregation_schema,
-             )
-         except Exception:
-             raise
-
-         return mixed_response
-
-     async def a_generate(
-         self,
-         input: Union[str, List[dict]],
-         response_schema: pydantic.BaseModel = None,
-         aggregation_schema: pydantic.BaseModel = None,
-         **kwargs,
-     ) -> str:
-         """
-         Args:
-             input (Union[str, List[Mapping[str, str]]]): Input query or conversation history to the model.
-             response_schema (pydantic.BaseModel): Response schema for individual judge models.
-             aggregation_schema (pydantic.BaseModel): Response schema for the aggregator model.
-             kwargs: Additional keyword arguments.
-         """
-
-         # Convert input to conversation format if needed
-         if isinstance(input, str):
-             convo = BASE_CONVERSATION + [{"role": "user", "content": input}]
-         elif isinstance(input, list):
-             convo = input
-         else:
-             judgeval_logger.error(f"Invalid input type received: {type(input)}")
-             raise TypeError(
-                 f"Input must be a string or a list of dictionaries. Input type of: {type(input)}"
-             )
-
-         try:
-             responses = await aget_completion_multiple_models(
-                 models=self.models,
-                 messages=[convo] * len(self.models),
-                 response_formats=[response_schema] * len(self.models),
-             )
-         except Exception:
-             raise
-
-         compiled_mixture_prompt = build_dynamic_mixture_prompt(
-             responses,
-             self.kwargs.get("custom_prompt"),
-             self.kwargs.get("custom_conversation"),
-         )
-
-         try:
-             mixed_response = await aget_chat_completion(
-                 model_type=self.aggregator,
-                 messages=compiled_mixture_prompt,
-                 response_format=aggregation_schema,
-             )
-         except Exception:
-             raise
-
-         return mixed_response
-
-     def load_model(self):
-         return self.models
-
-     def get_model_name(self) -> List[str]:
-         return self.models
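
The hunk above deletes the 0.1.0-era Mixture of Judges judge. For reference, a minimal usage sketch of that removed API, reconstructed only from the code shown above, follows; the prompt text is an illustrative placeholder, and the snippet applies only to a judgeval 0.1.0 install with the relevant provider credentials configured.

from judgeval.judges.mixture_of_judges import MixtureOfJudges

# Build the judge from several individual judge models plus an aggregator model.
# "custom_prompt" is forwarded through **kwargs and replaces the default system prompt.
judge = MixtureOfJudges(
    models=["QWEN", "LLAMA3_70B_INSTRUCT_TURBO", "MISTRAL_8x22B_INSTRUCT"],
    aggregator="gpt-4.1",
    custom_prompt="Synthesize the judges' answers into a single concise response.",
)

# generate() fans the conversation out to every judge model, then asks the
# aggregator to merge the individual answers via build_dynamic_mixture_prompt().
answer = judge.generate("Is a 30-day refund offered at no extra cost?")
print(answer)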
judgeval/judgment_client.py (file removed in 0.23.0)
@@ -1,369 +0,0 @@
- """
- Implements the JudgmentClient to interact with the Judgment API.
- """
-
- import os
- from uuid import uuid4
- from typing import Optional, List, Dict, Any, Union, Callable
-
- from judgeval.data.datasets import EvalDataset, EvalDatasetClient
- from judgeval.data import (
-     ScoringResult,
-     Example,
-     Trace,
- )
- from judgeval.scorers import (
-     APIScorerConfig,
-     BaseScorer,
- )
- from judgeval.evaluation_run import EvaluationRun
- from judgeval.run_evaluation import (
-     run_eval,
-     assert_test,
-     run_trace_eval,
- )
- from judgeval.data.trace_run import TraceRun
- from judgeval.common.api import JudgmentApiClient
- from judgeval.common.exceptions import JudgmentAPIError
- from langchain_core.callbacks import BaseCallbackHandler
- from judgeval.common.tracer import Tracer
- from judgeval.common.utils import validate_api_key
- from pydantic import BaseModel
- from judgeval.common.logger import judgeval_logger
-
-
- class EvalRunRequestBody(BaseModel):
-     eval_name: str
-     project_name: str
-
-
- class DeleteEvalRunRequestBody(BaseModel):
-     eval_names: List[str]
-     project_name: str
-
-
- class SingletonMeta(type):
-     _instances: Dict[type, "JudgmentClient"] = {}
-
-     def __call__(cls, *args, **kwargs):
-         if cls not in cls._instances:
-             instance = super().__call__(*args, **kwargs)
-             cls._instances[cls] = instance
-         return cls._instances[cls]
-
-
- class JudgmentClient(metaclass=SingletonMeta):
-     def __init__(
-         self,
-         api_key: Optional[str] = os.getenv("JUDGMENT_API_KEY"),
-         organization_id: Optional[str] = os.getenv("JUDGMENT_ORG_ID"),
-     ):
-         if not api_key:
-             raise ValueError(
-                 "api_key parameter must be provided. Please provide a valid API key value or set the JUDGMENT_API_KEY environment variable."
-             )
-
-         if not organization_id:
-             raise ValueError(
-                 "organization_id parameter must be provided. Please provide a valid organization ID value or set the JUDGMENT_ORG_ID environment variable."
-             )
-
-         self.judgment_api_key = api_key
-         self.organization_id = organization_id
-         self.api_client = JudgmentApiClient(api_key, organization_id)
-         self.eval_dataset_client = EvalDatasetClient(api_key, organization_id)
-
-         # Verify API key is valid
-         result, response = validate_api_key(api_key)
-         if not result:
-             # May be bad to output their invalid API key...
-             raise JudgmentAPIError(f"Issue with passed in Judgment API key: {response}")
-         else:
-             judgeval_logger.info("Successfully initialized JudgmentClient!")
-
-     def run_trace_evaluation(
-         self,
-         scorers: List[Union[APIScorerConfig, BaseScorer]],
-         examples: Optional[List[Example]] = None,
-         function: Optional[Callable] = None,
-         tracer: Optional[Union[Tracer, BaseCallbackHandler]] = None,
-         traces: Optional[List[Trace]] = None,
-         tools: Optional[List[Dict[str, Any]]] = None,
-         project_name: str = "default_project",
-         eval_run_name: str = "default_eval_trace",
-         model: Optional[str] = "gpt-4.1",
-         append: bool = False,
-         override: bool = False,
-     ) -> List[ScoringResult]:
-         try:
-             if examples and not function:
-                 raise ValueError("Cannot pass in examples without a function")
-
-             if traces and function:
-                 raise ValueError("Cannot pass in traces and function")
-
-             if examples and traces:
-                 raise ValueError("Cannot pass in both examples and traces")
-
-             trace_run = TraceRun(
-                 project_name=project_name,
-                 eval_name=eval_run_name,
-                 traces=traces,
-                 scorers=scorers,
-                 model=model,
-                 append=append,
-                 organization_id=self.organization_id,
-                 tools=tools,
-             )
-             return run_trace_eval(
-                 trace_run, self.judgment_api_key, override, function, tracer, examples
-             )
-         except ValueError as e:
-             raise ValueError(
-                 f"Please check your TraceRun object, one or more fields are invalid: \n{str(e)}"
-             )
-         except Exception as e:
-             raise Exception(f"An unexpected error occurred during evaluation: {str(e)}")
-
-     def run_evaluation(
-         self,
-         examples: List[Example],
-         scorers: List[Union[APIScorerConfig, BaseScorer]],
-         model: Optional[str] = "gpt-4.1",
-         project_name: str = "default_project",
-         eval_run_name: str = "default_eval_run",
-         override: bool = False,
-         append: bool = False,
-     ) -> List[ScoringResult]:
-         """
-         Executes an evaluation of `Example`s using one or more `Scorer`s
-
-         Args:
-             examples (List[Example]): The examples to evaluate
-             scorers (List[Union[APIScorerConfig, BaseScorer]]): A list of scorers to use for evaluation
-             model (str): The model used as a judge when using LLM as a Judge
-             project_name (str): The name of the project the evaluation results belong to
-             eval_run_name (str): A name for this evaluation run
-             override (bool): Whether to override an existing evaluation run with the same name
-             append (bool): Whether to append to an existing evaluation run with the same name
-
-         Returns:
-             List[ScoringResult]: The results of the evaluation
-         """
-         if override and append:
-             raise ValueError(
-                 "Cannot set both override and append to True. Please choose one."
-             )
-
-         try:
-             eval = EvaluationRun(
-                 append=append,
-                 override=override,
-                 project_name=project_name,
-                 eval_name=eval_run_name,
-                 examples=examples,
-                 scorers=scorers,
-                 model=model,
-                 organization_id=self.organization_id,
-             )
-             return run_eval(
-                 eval,
-                 self.judgment_api_key,
-                 override,
-             )
-         except ValueError as e:
-             raise ValueError(
-                 f"Please check your EvaluationRun object, one or more fields are invalid: \n{str(e)}"
-             )
-         except Exception as e:
-             raise Exception(f"An unexpected error occurred during evaluation: {str(e)}")
-
-     def create_dataset(self) -> EvalDataset:
-         return self.eval_dataset_client.create_dataset()
-
-     def push_dataset(
-         self,
-         alias: str,
-         dataset: EvalDataset,
-         project_name: str,
-         overwrite: Optional[bool] = False,
-     ) -> bool:
-         """
-         Uploads an `EvalDataset` to the Judgment platform for storage.
-
-         Args:
-             alias (str): The name to use for the dataset
-             dataset (EvalDataset): The dataset to upload to Judgment
-             overwrite (Optional[bool]): Whether to overwrite the dataset if it already exists
-
-         Returns:
-             bool: Whether the dataset was successfully uploaded
-         """
-         # Set judgment_api_key just in case it was not set
-         dataset.judgment_api_key = self.judgment_api_key
-         return self.eval_dataset_client.push(dataset, alias, project_name, overwrite)
-
-     def append_dataset(
-         self, alias: str, examples: List[Example], project_name: str
-     ) -> bool:
-         """
-         Appends an `EvalDataset` to the Judgment platform for storage.
-         """
-         return self.eval_dataset_client.append_examples(alias, examples, project_name)
-
-     def pull_dataset(self, alias: str, project_name: str) -> EvalDataset:
-         """
-         Retrieves a saved `EvalDataset` from the Judgment platform.
-
-         Args:
-             alias (str): The name of the dataset to retrieve
-
-         Returns:
-             EvalDataset: The retrieved dataset
-         """
-         return self.eval_dataset_client.pull(alias, project_name)
-
-     def delete_dataset(self, alias: str, project_name: str) -> bool:
-         """
-         Deletes a saved `EvalDataset` from the Judgment platform.
-         """
-         return self.eval_dataset_client.delete(alias, project_name)
-
-     def pull_project_dataset_stats(self, project_name: str) -> dict:
-         """
-         Retrieves all dataset stats from the Judgment platform for the project.
-
-         Args:
-             project_name (str): The name of the project to retrieve
-
-         Returns:
-             dict: The retrieved dataset stats
-         """
-         return self.eval_dataset_client.pull_project_dataset_stats(project_name)
-
-     # Maybe add option where you can pass in the EvaluationRun object and it will pull the eval results from the backend
-     def pull_eval(
-         self, project_name: str, eval_run_name: str
-     ) -> List[Dict[str, Union[str, List[ScoringResult]]]]:
-         """Pull evaluation results from the server.
-
-         Args:
-             project_name (str): Name of the project
-             eval_run_name (str): Name of the evaluation run
-
-         Returns:
-             Dict[str, Union[str, List[ScoringResult]]]: Dictionary containing:
-                 - id (str): The evaluation run ID
-                 - results (List[ScoringResult]): List of scoring results
-         """
-         return self.api_client.fetch_evaluation_results(project_name, eval_run_name)
-
-     def create_project(self, project_name: str) -> bool:
-         """
-         Creates a project on the server.
-         """
-         self.api_client.create_project(project_name)
-         return True
-
-     def delete_project(self, project_name: str) -> bool:
-         """
-         Deletes a project from the server. Which also deletes all evaluations and traces associated with the project.
-         """
-         self.api_client.delete_project(project_name)
-         return True
-
-     def assert_test(
-         self,
-         examples: List[Example],
-         scorers: List[Union[APIScorerConfig, BaseScorer]],
-         model: Optional[str] = "gpt-4.1",
-         project_name: str = "default_test",
-         eval_run_name: str = str(uuid4()),
-         override: bool = False,
-         append: bool = False,
-     ) -> None:
-         """
-         Asserts a test by running the evaluation and checking the results for success
-
-         Args:
-             examples (List[Example]): The examples to evaluate.
-             scorers (List[Union[APIScorerConfig, BaseScorer]]): A list of scorers to use for evaluation
-             model (str): The model used as a judge when using LLM as a Judge
-             project_name (str): The name of the project the evaluation results belong to
-             eval_run_name (str): A name for this evaluation run
-             override (bool): Whether to override an existing evaluation run with the same name
-             append (bool): Whether to append to an existing evaluation run with the same name
-             async_execution (bool): Whether to run the evaluation asynchronously
-         """
-
-         results: List[ScoringResult]
-
-         results = self.run_evaluation(
-             examples=examples,
-             scorers=scorers,
-             model=model,
-             project_name=project_name,
-             eval_run_name=eval_run_name,
-             override=override,
-             append=append,
-         )
-         assert_test(results)
-
-     def assert_trace_test(
-         self,
-         scorers: List[Union[APIScorerConfig, BaseScorer]],
-         examples: Optional[List[Example]] = None,
-         function: Optional[Callable] = None,
-         tracer: Optional[Union[Tracer, BaseCallbackHandler]] = None,
-         traces: Optional[List[Trace]] = None,
-         tools: Optional[List[Dict[str, Any]]] = None,
-         model: Optional[str] = "gpt-4.1",
-         project_name: str = "default_test",
-         eval_run_name: str = str(uuid4()),
-         override: bool = False,
-         append: bool = False,
-         async_execution: bool = False,
-     ) -> None:
-         """
-         Asserts a test by running the evaluation and checking the results for success
-
-         Args:
-             examples (List[Example]): The examples to evaluate.
-             scorers (List[Union[APIScorerConfig, BaseScorer]]): A list of scorers to use for evaluation
-             model (str): The model used as a judge when using LLM as a Judge
-             project_name (str): The name of the project the evaluation results belong to
-             eval_run_name (str): A name for this evaluation run
-             override (bool): Whether to override an existing evaluation run with the same name
-             append (bool): Whether to append to an existing evaluation run with the same name
-             function (Optional[Callable]): A function to use for evaluation
-             tracer (Optional[Union[Tracer, BaseCallbackHandler]]): A tracer to use for evaluation
-             tools (Optional[List[Dict[str, Any]]]): A list of tools to use for evaluation
-             async_execution (bool): Whether to run the evaluation asynchronously
-         """
-
-         # Check for enable_param_checking and tools
-         for scorer in scorers:
-             if hasattr(scorer, "kwargs") and scorer.kwargs is not None:
-                 if scorer.kwargs.get("enable_param_checking") is True:
-                     if not tools:
-                         raise ValueError(
-                             f"You must provide the 'tools' argument to assert_test when using a scorer with enable_param_checking=True. If you do not want to do param checking, explicitly set enable_param_checking=False for the {scorer.__name__} scorer."
-                         )
-
-         results: List[ScoringResult]
-
-         results = self.run_trace_evaluation(
-             examples=examples,
-             traces=traces,
-             scorers=scorers,
-             model=model,
-             project_name=project_name,
-             eval_run_name=eval_run_name,
-             override=override,
-             append=append,
-             function=function,
-             tracer=tracer,
-             tools=tools,
-         )
-
-         assert_test(results)
- assert_test(results)