judgeval 0.0.51__py3-none-any.whl → 0.0.53__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60)
  1. judgeval/common/logger.py +46 -199
  2. judgeval/common/s3_storage.py +2 -6
  3. judgeval/common/tracer.py +182 -262
  4. judgeval/common/utils.py +16 -36
  5. judgeval/constants.py +14 -20
  6. judgeval/data/__init__.py +0 -2
  7. judgeval/data/datasets/dataset.py +6 -10
  8. judgeval/data/datasets/eval_dataset_client.py +25 -27
  9. judgeval/data/example.py +5 -138
  10. judgeval/data/judgment_types.py +214 -0
  11. judgeval/data/result.py +7 -25
  12. judgeval/data/scorer_data.py +28 -40
  13. judgeval/data/scripts/fix_default_factory.py +23 -0
  14. judgeval/data/scripts/openapi_transform.py +123 -0
  15. judgeval/data/tool.py +3 -54
  16. judgeval/data/trace.py +31 -50
  17. judgeval/data/trace_run.py +3 -3
  18. judgeval/evaluation_run.py +16 -23
  19. judgeval/integrations/langgraph.py +11 -12
  20. judgeval/judges/litellm_judge.py +3 -6
  21. judgeval/judges/mixture_of_judges.py +8 -25
  22. judgeval/judges/together_judge.py +3 -6
  23. judgeval/judgment_client.py +22 -24
  24. judgeval/rules.py +7 -19
  25. judgeval/run_evaluation.py +79 -242
  26. judgeval/scorers/__init__.py +4 -20
  27. judgeval/scorers/agent_scorer.py +21 -0
  28. judgeval/scorers/api_scorer.py +28 -38
  29. judgeval/scorers/base_scorer.py +98 -0
  30. judgeval/scorers/example_scorer.py +19 -0
  31. judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +0 -20
  32. judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +10 -17
  33. judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +9 -24
  34. judgeval/scorers/judgeval_scorers/api_scorers/classifier_scorer.py +16 -68
  35. judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py +4 -12
  36. judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +4 -4
  37. judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +10 -17
  38. judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +4 -4
  39. judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +4 -4
  40. judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +4 -4
  41. judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +18 -14
  42. judgeval/scorers/score.py +45 -330
  43. judgeval/scorers/utils.py +6 -88
  44. judgeval/utils/file_utils.py +4 -6
  45. judgeval/version_check.py +3 -2
  46. {judgeval-0.0.51.dist-info → judgeval-0.0.53.dist-info}/METADATA +3 -2
  47. judgeval-0.0.53.dist-info/RECORD +65 -0
  48. judgeval/data/custom_example.py +0 -19
  49. judgeval/scorers/judgeval_scorer.py +0 -177
  50. judgeval/scorers/judgeval_scorers/api_scorers/comparison.py +0 -45
  51. judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py +0 -29
  52. judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py +0 -29
  53. judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py +0 -32
  54. judgeval/scorers/judgeval_scorers/api_scorers/groundedness.py +0 -28
  55. judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py +0 -38
  56. judgeval/scorers/judgeval_scorers/api_scorers/summarization.py +0 -27
  57. judgeval/scorers/prompt_scorer.py +0 -296
  58. judgeval-0.0.51.dist-info/RECORD +0 -69
  59. {judgeval-0.0.51.dist-info → judgeval-0.0.53.dist-info}/WHEEL +0 -0
  60. {judgeval-0.0.51.dist-info → judgeval-0.0.53.dist-info}/licenses/LICENSE.md +0 -0
judgeval/judges/mixture_of_judges.py CHANGED
@@ -13,7 +13,7 @@ from judgeval.common.utils import (
     aget_completion_multiple_models,
     aget_chat_completion,
 )
-from judgeval.common.logger import debug, error
+from judgeval.common.logger import judgeval_logger
 
 
 def build_dynamic_mixture_prompt(
@@ -85,14 +85,13 @@ def build_dynamic_mixture_prompt(
     # If a custom system prompt is provided, validate and use it
     if custom_system_prompt is not None:
         if not isinstance(custom_system_prompt, str):
-            error(
+            judgeval_logger.error(
                 f"TypeError: Custom system prompt must be a string. Received: {type(custom_system_prompt)}."
             )
             raise TypeError(
                 f"Custom system prompt must be a string. Received: {type(custom_system_prompt)}."
             )
         if not custom_system_prompt:
-            error("ValueError: Custom system prompt cannot be empty")
             raise ValueError("Custom system prompt cannot be empty")
         # Override the default system prompt, but also add special instructions for handling JSON
         default_conversation[0]["content"] = (
@@ -105,31 +104,21 @@
     # Validate custom conversation history format
     for message in custom_conversation_history:
         if not isinstance(message, dict):
-            error(
-                f"TypeError: Custom conversation history must be a list of dictionaries. Received: {message}."
-            )
             raise TypeError(
                 f"Custom conversation history must be a list of dictionaries. Received: {message}."
             )
 
         if "role" not in message or "content" not in message:
-            error("ValueError: Each message must have 'role' and 'content' keys")
             raise ValueError("Each message must have 'role' and 'content' keys")
 
         if not isinstance(message["role"], str) or not isinstance(
             message["content"], str
         ):
-            error(
-                f"TypeError: Message role and content must be strings. Received: {type(message['role'])}, {type(message['content'])}."
-            )
             raise TypeError(
                 f"Message role and content must be strings. Received: {type(message['role'])}, {type(message['content'])}."
             )
 
         if message["role"] not in ["system", "user", "assistant"]:
-            error(
-                f"ValueError: Message role must be one of: 'system', 'user', 'assistant'. Received: {message['role']}."
-            )
             raise ValueError(
                 f"Message role must be one of: 'system', 'user', 'assistant'. Received: {message['role']}."
             )
@@ -200,7 +189,6 @@ class MixtureOfJudges(JudgevalJudge):
             aggregation_schema (pydantic.BaseModel): Response schema for the aggregator model.
             kwargs: Additional keyword arguments.
         """
-        debug(f"Generating response for input type: {type(input)}")
 
         # Convert input to conversation format if needed
         if isinstance(input, str):
@@ -208,7 +196,7 @@ class MixtureOfJudges(JudgevalJudge):
         elif isinstance(input, list):
             convo = input
         else:
-            error(f"Invalid input type received: {type(input)}")
+            judgeval_logger.error(f"Invalid input type received: {type(input)}")
             raise TypeError(
                 f"Input must be a string or a list of dictionaries. Input type of: {type(input)}"
             )
@@ -219,8 +207,7 @@ class MixtureOfJudges(JudgevalJudge):
                 messages=[convo] * len(self.models),
                 response_formats=[response_schema] * len(self.models),
             )
-        except Exception as e:
-            error(f"Error getting completions from multiple models: {str(e)}")
+        except Exception:
             raise
 
         compiled_mixture_prompt = build_dynamic_mixture_prompt(
@@ -235,8 +222,7 @@ class MixtureOfJudges(JudgevalJudge):
                 messages=compiled_mixture_prompt,
                 response_format=aggregation_schema,
             )
-        except Exception as e:
-            error(f"Error getting chat completion from aggregator: {str(e)}")
+        except Exception:
             raise
 
         return mixed_response
@@ -255,7 +241,6 @@ class MixtureOfJudges(JudgevalJudge):
             aggregation_schema (pydantic.BaseModel): Response schema for the aggregator model.
             kwargs: Additional keyword arguments.
         """
-        debug(f"Generating response for input type: {type(input)}")
 
         # Convert input to conversation format if needed
         if isinstance(input, str):
@@ -263,7 +248,7 @@ class MixtureOfJudges(JudgevalJudge):
         elif isinstance(input, list):
             convo = input
         else:
-            error(f"Invalid input type received: {type(input)}")
+            judgeval_logger.error(f"Invalid input type received: {type(input)}")
             raise TypeError(
                 f"Input must be a string or a list of dictionaries. Input type of: {type(input)}"
             )
@@ -274,8 +259,7 @@ class MixtureOfJudges(JudgevalJudge):
                 messages=[convo] * len(self.models),
                 response_formats=[response_schema] * len(self.models),
             )
-        except Exception as e:
-            error(f"Error getting async completions from multiple models: {str(e)}")
+        except Exception:
             raise
 
         compiled_mixture_prompt = build_dynamic_mixture_prompt(
@@ -290,8 +274,7 @@ class MixtureOfJudges(JudgevalJudge):
                 messages=compiled_mixture_prompt,
                 response_format=aggregation_schema,
             )
-        except Exception as e:
-            error(f"Error getting async chat completion from aggregator: {str(e)}")
+        except Exception:
             raise
 
         return mixed_response
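
Across both the sync and async paths above, 0.0.53 swaps the module-level `debug`/`error` helpers for a shared `judgeval_logger`. A minimal sketch of the new call pattern, assuming `judgeval_logger` behaves like a standard `logging.Logger` (its configuration lives in `judgeval/common/logger.py` and is not shown in this diff):

```python
# Sketch only: mirrors the 0.0.53 logging style shown in the hunks above.
from judgeval.common.logger import judgeval_logger


def normalize_input(input):
    """Illustrative helper (not part of judgeval) using the new logger."""
    if isinstance(input, str):
        return [{"role": "user", "content": input}]
    elif isinstance(input, list):
        return input
    # 0.0.51 called error(...); 0.0.53 routes the same message through judgeval_logger.
    judgeval_logger.error(f"Invalid input type received: {type(input)}")
    raise TypeError("Input must be a string or a list of dictionaries.")
```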
judgeval/judges/together_judge.py CHANGED
@@ -4,13 +4,13 @@ Implementation of using TogetherAI inference for judges.
 
 from pydantic import BaseModel
 from typing import List, Union
-from judgeval.common.logger import debug, error
 
 from judgeval.judges import JudgevalJudge
 from judgeval.common.utils import (
     fetch_together_api_response,
     afetch_together_api_response,
 )
+from judgeval.common.logger import judgeval_logger
 
 BASE_CONVERSATION = [
     {"role": "system", "content": "You are a helpful assistant."},
@@ -19,14 +19,12 @@ BASE_CONVERSATION = [
 
 class TogetherJudge(JudgevalJudge):
     def __init__(self, model: str = "Qwen/Qwen2.5-72B-Instruct-Turbo", **kwargs):
-        debug(f"Initializing TogetherJudge with model={model}")
         self.model = model
         self.kwargs = kwargs
         super().__init__(model_name=model)
 
     # TODO: Fix cost for generate and a_generate
     def generate(self, input: Union[str, List[dict]], schema: BaseModel = None) -> str:
-        debug(f"Generating response for input type: {type(input)}")
         if isinstance(input, str):
             convo = BASE_CONVERSATION + [{"role": "user", "content": input}]
             return fetch_together_api_response(
@@ -38,13 +36,12 @@ class TogetherJudge(JudgevalJudge):
                 self.model, convo, response_format=schema
             )
         else:
-            error(f"Invalid input type received: {type(input)}")
+            judgeval_logger.error(f"Invalid input type received: {type(input)}")
             raise TypeError("Input must be a string or a list of dictionaries.")
 
     async def a_generate(
         self, input: Union[str, List[dict]], schema: BaseModel = None
     ) -> str:
-        debug(f"Async generating response for input type: {type(input)}")
         if isinstance(input, str):
             convo = BASE_CONVERSATION + [{"role": "user", "content": input}]
             res = await afetch_together_api_response(
@@ -58,7 +55,7 @@ class TogetherJudge(JudgevalJudge):
             )
             return res
         else:
-            error(f"Invalid input type received: {type(input)}")
+            judgeval_logger.error(f"Invalid input type received: {type(input)}")
             raise TypeError("Input must be a string or a list of dictionaries.")
 
     def load_model(self) -> str:
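
For context, a minimal usage sketch of `TogetherJudge` after this change; the module path comes from the file list above, and the prompt and call pattern are illustrative assumptions:

```python
# Sketch only: exercises the generate() signature shown in the hunks above.
from judgeval.judges.together_judge import TogetherJudge

judge = TogetherJudge()  # defaults to "Qwen/Qwen2.5-72B-Instruct-Turbo" per __init__ above

# generate() accepts a plain string or a list of {"role": ..., "content": ...} dicts;
# any other type now logs via judgeval_logger.error before raising TypeError.
answer = judge.generate("Give one sentence on why evaluating LLM outputs matters.")
print(answer)
```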
judgeval/judgment_client.py CHANGED
@@ -14,12 +14,11 @@ from judgeval.data.datasets import EvalDataset, EvalDatasetClient
 from judgeval.data import (
     ScoringResult,
     Example,
-    CustomExample,
     Trace,
 )
 from judgeval.scorers import (
-    APIJudgmentScorer,
-    JudgevalScorer,
+    APIScorerConfig,
+    BaseScorer,
     ClassifierScorer,
 )
 from judgeval.evaluation_run import EvaluationRun
@@ -41,6 +40,7 @@ from judgeval.common.tracer import Tracer
 from judgeval.common.utils import validate_api_key
 from pydantic import BaseModel
 from judgeval.run_evaluation import SpinnerWrappedTask
+from judgeval.common.logger import judgeval_logger
 
 
 class EvalRunRequestBody(BaseModel):
@@ -68,37 +68,35 @@ class SingletonMeta(type):
 class JudgmentClient(metaclass=SingletonMeta):
     def __init__(
         self,
-        judgment_api_key: Optional[str] = os.getenv("JUDGMENT_API_KEY"),
+        api_key: Optional[str] = os.getenv("JUDGMENT_API_KEY"),
         organization_id: Optional[str] = os.getenv("JUDGMENT_ORG_ID"),
     ):
-        # Check if API key is None
-        if judgment_api_key is None:
+        if not api_key:
             raise ValueError(
-                "JUDGMENT_API_KEY cannot be None. Please provide a valid API key or set the JUDGMENT_API_KEY environment variable."
+                "api_key parameter must be provided. Please provide a valid API key value or set the JUDGMENT_API_KEY environment variable."
             )
 
-        # Check if organization ID is None
-        if organization_id is None:
+        if not organization_id:
             raise ValueError(
-                "JUDGMENT_ORG_ID cannot be None. Please provide a valid organization ID or set the JUDGMENT_ORG_ID environment variable."
+                "organization_id parameter must be provided. Please provide a valid organization ID value or set the JUDGMENT_ORG_ID environment variable."
             )
 
-        self.judgment_api_key = judgment_api_key
+        self.judgment_api_key = api_key
         self.organization_id = organization_id
-        self.eval_dataset_client = EvalDatasetClient(judgment_api_key, organization_id)
+        self.eval_dataset_client = EvalDatasetClient(api_key, organization_id)
 
         # Verify API key is valid
-        result, response = validate_api_key(judgment_api_key)
+        result, response = validate_api_key(api_key)
         if not result:
             # May be bad to output their invalid API key...
             raise JudgmentAPIError(f"Issue with passed in Judgment API key: {response}")
         else:
-            print("Successfully initialized JudgmentClient!")
+            judgeval_logger.info("Successfully initialized JudgmentClient!")
 
     def a_run_evaluation(
         self,
         examples: List[Example],
-        scorers: List[Union[APIJudgmentScorer, JudgevalScorer]],
+        scorers: List[Union[APIScorerConfig, BaseScorer]],
         model: Optional[str] = "gpt-4.1",
         project_name: str = "default_project",
         eval_run_name: str = "default_eval_run",
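
The constructor parameter is renamed from `judgment_api_key` to `api_key`, empty strings are now rejected alongside `None`, and success is reported through `judgeval_logger.info` rather than `print`. A minimal initialization sketch under those signatures (callers that passed `judgment_api_key=` by keyword would need to update):

```python
import os

from judgeval.judgment_client import JudgmentClient  # module path from the file list above

# Either export JUDGMENT_API_KEY / JUDGMENT_ORG_ID or pass the values explicitly.
client = JudgmentClient(
    api_key=os.getenv("JUDGMENT_API_KEY"),        # was judgment_api_key in 0.0.51
    organization_id=os.getenv("JUDGMENT_ORG_ID"),
)
```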
@@ -120,7 +118,7 @@ class JudgmentClient(metaclass=SingletonMeta):
 
     def run_trace_evaluation(
         self,
-        scorers: List[Union[APIJudgmentScorer, JudgevalScorer]],
+        scorers: List[Union[APIScorerConfig, BaseScorer]],
         examples: Optional[List[Example]] = None,
         function: Optional[Callable] = None,
         tracer: Optional[Union[Tracer, BaseCallbackHandler]] = None,
@@ -163,8 +161,8 @@ class JudgmentClient(metaclass=SingletonMeta):
 
     def run_evaluation(
         self,
-        examples: Union[List[Example], List[CustomExample]],
-        scorers: List[Union[APIJudgmentScorer, JudgevalScorer]],
+        examples: List[Example],
+        scorers: List[Union[APIScorerConfig, BaseScorer]],
         model: Optional[str] = "gpt-4.1",
         project_name: str = "default_project",
         eval_run_name: str = "default_eval_run",
@@ -176,8 +174,8 @@ class JudgmentClient(metaclass=SingletonMeta):
         Executes an evaluation of `Example`s using one or more `Scorer`s
 
         Args:
-            examples (Union[List[Example], List[CustomExample]]): The examples to evaluate
-            scorers (List[Union[APIJudgmentScorer, JudgevalScorer]]): A list of scorers to use for evaluation
+            examples (List[Example]): The examples to evaluate
+            scorers (List[Union[APIScorerConfig, BaseScorer]]): A list of scorers to use for evaluation
             model (str): The model used as a judge when using LLM as a Judge
             project_name (str): The name of the project the evaluation results belong to
             eval_run_name (str): A name for this evaluation run
@@ -450,7 +448,7 @@ class JudgmentClient(metaclass=SingletonMeta):
     def assert_test(
         self,
         examples: List[Example],
-        scorers: List[Union[APIJudgmentScorer, JudgevalScorer]],
+        scorers: List[Union[APIScorerConfig, BaseScorer]],
         model: Optional[str] = "gpt-4.1",
         project_name: str = "default_test",
         eval_run_name: str = str(uuid4()),
@@ -463,7 +461,7 @@ class JudgmentClient(metaclass=SingletonMeta):
 
         Args:
             examples (List[Example]): The examples to evaluate.
-            scorers (List[Union[APIJudgmentScorer, JudgevalScorer]]): A list of scorers to use for evaluation
+            scorers (List[Union[APIScorerConfig, BaseScorer]]): A list of scorers to use for evaluation
             model (str): The model used as a judge when using LLM as a Judge
             project_name (str): The name of the project the evaluation results belong to
             eval_run_name (str): A name for this evaluation run
@@ -498,7 +496,7 @@ class JudgmentClient(metaclass=SingletonMeta):
 
     def assert_trace_test(
         self,
-        scorers: List[Union[APIJudgmentScorer, JudgevalScorer]],
+        scorers: List[Union[APIScorerConfig, BaseScorer]],
         examples: Optional[List[Example]] = None,
         function: Optional[Callable] = None,
         tracer: Optional[Union[Tracer, BaseCallbackHandler]] = None,
@@ -516,7 +514,7 @@ class JudgmentClient(metaclass=SingletonMeta):
 
         Args:
             examples (List[Example]): The examples to evaluate.
-            scorers (List[Union[APIJudgmentScorer, JudgevalScorer]]): A list of scorers to use for evaluation
+            scorers (List[Union[APIScorerConfig, BaseScorer]]): A list of scorers to use for evaluation
             model (str): The model used as a judge when using LLM as a Judge
             project_name (str): The name of the project the evaluation results belong to
             eval_run_name (str): A name for this evaluation run
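
All of these method signatures now take `List[Union[APIScorerConfig, BaseScorer]]` for scorers and plain `List[Example]` for examples (`CustomExample` is removed). A hedged sketch of a `run_evaluation` call under the new types; the `FaithfulnessScorer` import location and the `Example` fields are assumptions based on the files listed above, not shown in this diff:

```python
from judgeval.judgment_client import JudgmentClient
from judgeval.data import Example
from judgeval.scorers import FaithfulnessScorer  # assumed re-export; defined under api_scorers/faithfulness.py

client = JudgmentClient()  # reads JUDGMENT_API_KEY / JUDGMENT_ORG_ID from the environment

# Example fields below are illustrative assumptions, not taken from this diff.
example = Example(
    input="What is the capital of France?",
    actual_output="Paris is the capital of France.",
    retrieval_context=["Paris is the capital and largest city of France."],
)

results = client.run_evaluation(
    examples=[example],                           # List[Example]; CustomExample was removed
    scorers=[FaithfulnessScorer(threshold=0.7)],  # APIScorerConfig / BaseScorer instances
    model="gpt-4.1",
    project_name="default_project",
    eval_run_name="faithfulness_smoke_test",
)
```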
judgeval/rules.py CHANGED
@@ -3,12 +3,12 @@ Rules system for Judgeval that enables alerts based on metric thresholds.
 """
 
 from typing import Dict, List, Optional, Union, Any, Tuple
-from pydantic import BaseModel, Field, field_validator, ConfigDict
+from pydantic import BaseModel, Field, ConfigDict
 import asyncio
 from concurrent.futures import ThreadPoolExecutor
 import uuid
 
-from judgeval.scorers import APIJudgmentScorer, JudgevalScorer
+from judgeval.scorers import APIScorerConfig, BaseScorer
 from judgeval.utils.alerts import AlertStatus, AlertResult
 
 
@@ -18,7 +18,7 @@ class Condition(BaseModel):
 
     Example:
     {
-        "metric": FaithfulnessScorer(threshold=0.7) # Must be a scorer object: APIJudgmentScorer, JudgevalScorer
+        "metric": FaithfulnessScorer(threshold=0.7) # Must be a scorer object: APIScorerConfig, BaseScorer
     }
 
     The Condition class uses the scorer's threshold and success function internally.
@@ -26,13 +26,13 @@ class Condition(BaseModel):
 
     model_config = ConfigDict(arbitrary_types_allowed=True)
 
-    metric: Union[APIJudgmentScorer, JudgevalScorer]
+    metric: Union[APIScorerConfig, BaseScorer]
 
     @property
     def metric_name(self) -> str:
         """Get the name of the metric for lookups in scores dictionary."""
         if hasattr(self.metric, "score_type"):
-            # Handle APIJudgmentScorer and JudgevalScorer which have score_type
+            # Handle APIScorerConfig and BaseScorer which have score_type
             return self.metric.score_type
         elif hasattr(self.metric, "__name__"):
             # Handle cases where metric has a __name__ attribute
@@ -58,8 +58,8 @@ class Condition(BaseModel):
         # Use the scorer's success check function if available
         if hasattr(self.metric, "success_check"):
             return self.metric.success_check()
-        elif hasattr(self.metric, "_success_check"):
-            return self.metric._success_check()
+        elif hasattr(self.metric, "success_check"):
+            return self.metric.success_check()
         else:
             # Fallback to default comparison (greater than or equal)
             return value >= self.threshold if self.threshold is not None else False
@@ -241,18 +241,6 @@ class Rule(BaseModel):
 
         return data
 
-    @field_validator("conditions")
-    def validate_conditions_not_empty(cls, v):
-        if not v:
-            raise ValueError("Conditions list cannot be empty")
-        return v
-
-    @field_validator("combine_type")
-    def validate_combine_type(cls, v):
-        if v not in ["all", "any"]:
-            raise ValueError(f"combine_type must be 'all' or 'any', got: {v}")
-        return v
-
 
 class RulesEngine:
     """