judgeval 0.0.38__py3-none-any.whl → 0.0.40__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/clients.py +6 -4
- judgeval/common/tracer.py +361 -236
- judgeval/constants.py +3 -0
- judgeval/data/__init__.py +2 -1
- judgeval/data/example.py +14 -13
- judgeval/data/tool.py +47 -0
- judgeval/data/trace.py +28 -39
- judgeval/data/trace_run.py +2 -1
- judgeval/evaluation_run.py +4 -7
- judgeval/judgment_client.py +27 -6
- judgeval/run_evaluation.py +395 -37
- judgeval/scorers/__init__.py +4 -1
- judgeval/scorers/judgeval_scorer.py +8 -0
- judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +4 -0
- judgeval/scorers/judgeval_scorers/api_scorers/classifier_scorer.py +124 -0
- judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +20 -0
- judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py +1 -1
- judgeval/scorers/prompt_scorer.py +5 -164
- judgeval/scorers/score.py +15 -15
- judgeval-0.0.40.dist-info/METADATA +1441 -0
- {judgeval-0.0.38.dist-info → judgeval-0.0.40.dist-info}/RECORD +23 -20
- judgeval-0.0.38.dist-info/METADATA +0 -247
- {judgeval-0.0.38.dist-info → judgeval-0.0.40.dist-info}/WHEEL +0 -0
- {judgeval-0.0.38.dist-info → judgeval-0.0.40.dist-info}/licenses/LICENSE.md +0 -0
judgeval/scorers/judgeval_scorers/api_scorers/classifier_scorer.py
ADDED
@@ -0,0 +1,124 @@
+from judgeval.scorers.api_scorer import APIJudgmentScorer
+from judgeval.constants import APIScorer
+from typing import List, Mapping, Optional, Dict
+from pydantic import model_serializer
+
+class ClassifierScorer(APIJudgmentScorer):
+    """
+    In the Judgment backend, this scorer is implemented as a PromptScorer that takes
+    1. a system role that may involve the Example object
+    2. options for scores on the example
+
+    and uses a judge to execute the evaluation from the system role and classify into one of the options
+
+    ex:
+    system_role = "You are a judge that evaluates whether the response is positive or negative. The response is: {example.actual_output}"
+    options = {"positive": 1, "negative": 0}
+
+    Args:
+        name (str): The name of the scorer
+        slug (str): A unique identifier for the scorer
+        conversation (List[dict]): The conversation template with placeholders (e.g., {{actual_output}})
+        options (Mapping[str, float]): A mapping of classification options to their corresponding scores
+        threshold (float): The threshold for determining success (default: 0.5)
+        include_reason (bool): Whether to include reasoning in the response (default: True)
+        strict_mode (bool): Whether to use strict mode (default: False)
+        verbose_mode (bool): Whether to include verbose logging (default: False)
+    """
+    name: Optional[str] = None
+    slug: Optional[str] = None
+    conversation: Optional[List[dict]] = None
+    options: Optional[Mapping[str, float]] = None
+    verbose_mode: bool = False
+    strict_mode: bool = False
+    include_reason: bool = True,
+    async_mode: bool = True,
+    threshold: float = 0.5
+
+    def __init__(
+        self,
+        name: str,
+        slug: str,
+        conversation: List[dict],
+        options: Mapping[str, float],
+        threshold: float = 0.5,
+        include_reason: bool = True,
+        strict_mode: bool = False,
+        verbose_mode: bool = False,
+        async_mode: bool = True,
+    ):
+        super().__init__(
+            threshold=threshold,
+            score_type=APIScorer.CLASSIFIER,
+        )
+        self.name = name
+        self.verbose_mode = verbose_mode
+        self.strict_mode = strict_mode
+        self.include_reason = include_reason
+        self.slug = slug
+        self.conversation = conversation
+        self.options = options
+        self.async_mode = async_mode
+
+    def update_name(self, name: str):
+        """
+        Updates the name of the scorer.
+        """
+        self.name = name
+
+    def update_threshold(self, threshold: float):
+        """
+        Updates the threshold of the scorer.
+        """
+        self.threshold = threshold
+
+    def update_conversation(self, conversation: List[dict]):
+        """
+        Updates the conversation with the new conversation.
+
+        Sample conversation:
+        [{'role': 'system', 'content': "Did the chatbot answer the user's question in a kind way?: {{actual_output}}."}]
+        """
+        self.conversation = conversation
+
+    def update_options(self, options: Mapping[str, float]):
+        """
+        Updates the options with the new options.
+
+        Sample options:
+        {"yes": 1, "no": 0}
+        """
+        self.options = options
+
+    def __str__(self):
+        return f"ClassifierScorer(name={self.name}, slug={self.slug}, conversation={self.conversation}, threshold={self.threshold}, options={self.options})"
+
+    # @model_serializer
+    # def serialize_model(self) -> dict:
+    #     """
+    #     Defines how the ClassifierScorer should be serialized when model_dump() is called.
+    #     """
+    #     return {
+    #         "name": self.name,
+    #         "score_type": self.name,
+    #         "conversation": self.conversation,
+    #         "options": self.options,
+    #         "threshold": self.threshold,
+    #         "include_reason": self.include_reason,
+    #         "async_mode": self.async_mode,
+    #         "strict_mode": self.strict_mode,
+    #         "verbose_mode": self.verbose_mode,
+    #     }
+
+    def to_dict(self) -> dict:
+        return {
+            "name": self.name,
+            "score_type": self.name,
+            "conversation": self.conversation,
+            "options": self.options,
+            "threshold": self.threshold,
+            "include_reason": self.include_reason,
+            "async_mode": self.async_mode,
+            "strict_mode": self.strict_mode,
+            "verbose_mode": self.verbose_mode,
+        }
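For orientation, here is a minimal usage sketch of the new ClassifierScorer. The sentiment prompt, option labels, and slug are illustrative values, not anything shipped with the package; the import path follows the existing `from judgeval.scorers import ClassifierScorer` line visible in the Text2SQL hunk further down.

from judgeval.scorers import ClassifierScorer

# Illustrative values only -- the conversation template, options, and slug
# below are not part of judgeval itself.
sentiment_scorer = ClassifierScorer(
    name="Sentiment",
    slug="sentiment-example-1",
    conversation=[{
        "role": "system",
        "content": "Is the following response positive or negative?: {{actual_output}}",
    }],
    options={"positive": 1.0, "negative": 0.0},
    threshold=0.5,
)

print(sentiment_scorer)            # ClassifierScorer(name=Sentiment, slug=sentiment-example-1, ...)
print(sentiment_scorer.to_dict())  # plain-dict view of the scorer configuration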
judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py
ADDED
@@ -0,0 +1,20 @@
+"""
+`judgeval` tool dependency scorer
+"""
+
+# Internal imports
+from judgeval.scorers.api_scorer import APIJudgmentScorer
+from judgeval.constants import APIScorer
+from typing import Optional, Dict
+class ToolDependencyScorer(APIJudgmentScorer):
+    kwargs: Optional[Dict] = None
+    def __init__(self, threshold: float=1.0, enable_param_checking: bool = True):
+        super().__init__(
+            threshold=threshold,
+            score_type=APIScorer.TOOL_DEPENDENCY
+        )
+        self.kwargs = {"enable_param_checking": enable_param_checking}
+
+    @property
+    def __name__(self):
+        return "Tool Dependency"
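A corresponding sketch for the new ToolDependencyScorer. Treat the top-level import as an assumption: judgeval/scorers/__init__.py and api_scorers/__init__.py both gained exports in this release (see the file list above), but the exact re-export names are not shown in this diff.

# Assumed re-export; fall back to the full module path if it is not exposed
# at the package level.
from judgeval.scorers import ToolDependencyScorer

scorer = ToolDependencyScorer(threshold=1.0, enable_param_checking=True)
print(scorer.__name__)  # "Tool Dependency"
print(scorer.kwargs)    # {"enable_param_checking": True}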
judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py
CHANGED
@@ -7,7 +7,7 @@ Determines if the LLM-generated SQL query is valid and works for the natural lan
 from judgeval.scorers import ClassifierScorer
 
 Text2SQLScorer = ClassifierScorer(
-    "Text to SQL",
+    name="Text to SQL",
     slug="text2sql-1010101010",
     threshold=1.0,
     conversation=[{
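The only change here is that the scorer name is now passed as a keyword argument. Both call forms are equivalent under the ClassifierScorer.__init__ signature shown above; the keyword form adopted in 0.0.40 is simply self-documenting. In the sketch below, conversation and options are placeholders, not the real Text2SQL template.

from judgeval.scorers import ClassifierScorer

# Placeholder values for illustration; see text2sql_scorer.py for the real
# conversation template and options.
conversation = [{"role": "system", "content": "..."}]
options = {"valid": 1.0, "invalid": 0.0}

ClassifierScorer("Text to SQL", slug="text2sql-1010101010",
                 conversation=conversation, options=options, threshold=1.0)   # 0.0.38 call (positional name)
ClassifierScorer(name="Text to SQL", slug="text2sql-1010101010",
                 conversation=conversation, options=options, threshold=1.0)   # 0.0.40 call (keyword name)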
judgeval/scorers/prompt_scorer.py
CHANGED
@@ -37,6 +37,7 @@ from judgeval.scorers.utils import (
     get_or_create_event_loop,
     create_verbose_logs
 )
+from judgeval.judges import JudgevalJudge
 
 
 class ReasonScore(BaseModel):
@@ -49,7 +50,8 @@ class PromptScorer(JudgevalScorer, BaseModel):
     score_type: str
     threshold: float = Field(default=0.5)
     using_native_model: bool = Field(default=True)
-
+    model: Optional[JudgevalJudge] = Field(default=None)
+    skipped: bool = Field(default=False)
     # DO NOT SET THESE FIELDS MANUALLY, THEY ARE SET BY THE SCORE_EXAMPLE METHOD
     _response: Optional[dict] = None
     _result: Optional[float] = None
@@ -276,166 +278,5 @@ class PromptScorer(JudgevalScorer, BaseModel):
     def __name__(self):
         return self.name
 
-
-
-
-    """
-    This is a PromptScorer that takes
-    1. a system role that may involve the Example object
-    2. options for scores on the example
-
-    and uses a judge to execute the evaluation from the system role and classify into one of the options
-
-    ex:
-    system_role = "You are a judge that evaluates whether the response is positive or negative. The response is: {example.actual_output}"
-    options = {"positive": 1, "negative": 0}
-    """
-
-    conversation: List[dict]
-    options: Mapping[str, float]
-
-    def __init__(self, name: str, slug: str, conversation: List[dict], options: Mapping[str, float],
-                 threshold: float = 0.5, include_reason: bool = True,
-                 async_mode: bool = True, strict_mode: bool = False, verbose_mode: bool = False):
-        # Initialize BaseModel first with all fields
-        BaseModel.__init__(
-            self,
-            name=name,
-            slug=slug,
-            score_type=name,
-            conversation=conversation,
-            options=options,
-            threshold=threshold,
-            include_reason=include_reason,
-            async_mode=async_mode,
-            strict_mode=strict_mode,
-            verbose_mode=verbose_mode,
-        )
-        # Then initialize JudgevalScorer
-        JudgevalScorer.__init__(
-            self,
-            score_type=name,
-            threshold=threshold,
-            include_reason=include_reason,
-            async_mode=async_mode,
-            strict_mode=strict_mode,
-            verbose_mode=verbose_mode,
-        )
-
-    def _build_measure_prompt(self, example: Example) -> List[dict]:
-        """
-        Builds the measure prompt for the classifier scorer.
-
-        Args:
-            example (Example): The example to build the prompt for
-
-        Returns:
-            List[dict]: The measure prompt for the classifier scorer
-        """
-        replacement_words = {
-            "{{actual_output}}": example.actual_output,
-            "{{expected_output}}": example.expected_output,
-            "{{context}}": example.context,
-            "{{retrieval_context}}": example.retrieval_context,
-            "{{tools_called}}": example.tools_called,
-            "{{expected_tools}}": example.expected_tools,
-        }
-        # Make a copy of the conversation to avoid modifying the original
-        conversation_copy = [dict(message) for message in self.conversation]
-
-        # Only replace if double brackets are found in the content
-        for message in conversation_copy:
-            content = message["content"]
-            if "{{" in content:
-                for key, value in replacement_words.items():
-                    if key in content:
-                        message["content"] = content.replace(key, str(value))
-        return conversation_copy
-
-    def _build_schema(self) -> dict:
-        return self.options
-
-    def _enforce_prompt_format(self, judge_prompt: List[dict], schema: dict) -> List[dict]:
-        """
-        Enforces the judge model to choose an option from the schema.
-
-        We want the model to choose an option from the schema and a reason for the choice.
-        """
-        options = list(schema.keys())
-        options_str = ", ".join(options)
-
-        system_role = judge_prompt[0]["content"]
-        system_role += (
-            f"\n\nYou must choose one of the following options: {options_str}. "
-            "Format your response as a JSON object with two fields:\n"
-            "1. 'choice': Your selected option (must be one of the provided choices)\n"
-            "2. 'reason': A brief explanation for why you made this choice\n\n"
-            "Example response format:\n"
-            "{\n"
-            '    "choice": "<one of the valid options>",\n'
-            '    "reason": "<your explanation>"\n'
-            "}"
-        )
-
-        judge_prompt[0]["content"] = system_role
-        return judge_prompt
-
-    def _process_response(self, response: dict) -> Tuple[float, str]:
-        choice = response.get("choice")
-        if choice not in self.options:
-            raise ValueError(f"Invalid choice: {choice}. Expected one of: {self.options.keys()}")
-        reason = response.get("reason", "No reason could be found in model response.")
-        return self.options[choice], reason
-
-    def _success_check(self, **kwargs) -> bool:
-        return self.score >= self.threshold
-
-    def update_name(self, name: str):
-        """
-        Updates the name of the scorer.
-        """
-        self.name = name
-
-    def update_threshold(self, threshold: float):
-        """
-        Updates the threshold of the scorer.
-        """
-        self.threshold = threshold
-
-    def update_conversation(self, conversation: List[dict]):
-        """
-        Updates the conversation with the new conversation.
-
-        Sample conversation:
-        [{'role': 'system', 'content': "Did the chatbot answer the user's question in a kind way?: {{actual_output}}."}]
-        """
-        self.conversation = conversation
-
-    def update_options(self, options: Mapping[str, float]):
-        """
-        Updates the options with the new options.
-
-        Sample options:
-        {"yes": 1, "no": 0}
-        """
-        self.options = options
-
-    def __str__(self):
-        return f"ClassifierScorer(name={self.name}, slug={self.slug}, conversation={self.conversation}, threshold={self.threshold}, options={self.options})"
-
-    @model_serializer
-    def serialize_model(self) -> dict:
-        """
-        Defines how the ClassifierScorer should be serialized when model_dump() is called.
-        """
-        return {
-            "name": self.name,
-            "score_type": self.score_type,
-            "conversation": self.conversation,
-            "options": self.options,
-            "threshold": self.threshold,
-            "include_reason": self.include_reason,
-            "async_mode": self.async_mode,
-            "strict_mode": self.strict_mode,
-            "verbose_mode": self.verbose_mode,
-        }
+    class Config:
+        arbitrary_types_allowed = True
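Two things happen in this file: PromptScorer gains a model field typed as JudgevalJudge plus a skipped flag (the new class Config with arbitrary_types_allowed = True is what lets pydantic accept the non-pydantic judge type), and the old in-library ClassifierScorer implementation is deleted in favour of the API-backed version added above. The deleted _build_measure_prompt remains a useful reference for how conversation placeholders are filled; below is a standalone sketch of that substitution, written against a plain dict of values rather than an Example object and accumulating replacements across tokens.

from typing import Dict, List

def fill_placeholders(conversation: List[dict], values: Dict[str, str]) -> List[dict]:
    """Return a copy of `conversation` with {{...}} tokens replaced.

    Sketch of the substitution performed by the removed _build_measure_prompt;
    `values` maps placeholder tokens such as "{{actual_output}}" to strings.
    """
    filled = [dict(message) for message in conversation]  # don't mutate the original
    for message in filled:
        content = message["content"]
        if "{{" in content:
            for token, value in values.items():
                if token in content:
                    content = content.replace(token, str(value))
            message["content"] = content
    return filled

conversation = [{"role": "system",
                 "content": "Did the chatbot answer kindly?: {{actual_output}}"}]
print(fill_placeholders(conversation, {"{{actual_output}}": "Happy to help!"}))
# [{'role': 'system', 'content': 'Did the chatbot answer kindly?: Happy to help!'}]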
judgeval/scorers/score.py
CHANGED
@@ -48,7 +48,7 @@ async def safe_a_score_example(
         info(f"Successfully scored example {example.example_id}")
     except MissingTestCaseParamsError as e:
         if skip_on_missing_params: # Skip the example if the scorer requires parameters that are missing
-            with example_logging_context(example.
+            with example_logging_context(example.created_at, example.example_id):
                 warning(f"Skipping example {example.example_id} due to missing parameters")
             scorer.skipped = True
             return
@@ -56,10 +56,10 @@ async def safe_a_score_example(
            if ignore_errors: # Gracefully handle the error, does not stop the evaluation
                scorer.error = str(e)
                scorer.success = False
-                with example_logging_context(example.
+                with example_logging_context(example.created_at, example.example_id):
                    warning(f"Ignoring errors for example {example.example_id}: {str(e)} due to missing parameters")
            else: # Raise the error and stop the evaluation
-                with example_logging_context(example.
+                with example_logging_context(example.created_at, example.example_id):
                    error(f"Stopping example {example.example_id}: {str(e)} due to missing parameters")
                raise
    except TypeError: # in case a_score_example does not accept _show_indicator
@@ -68,27 +68,27 @@ async def safe_a_score_example(
        except MissingTestCaseParamsError as e:
            if skip_on_missing_params:
                scorer.skipped = True
-                with example_logging_context(example.
+                with example_logging_context(example.created_at, example.example_id):
                    warning(f"Skipping example {example.example_id} due to missing parameters")
                return
            else:
                if ignore_errors:
                    scorer.error = str(e)
                    scorer.success = False
-                    with example_logging_context(example.
+                    with example_logging_context(example.created_at, example.example_id):
                        warning(f"Ignoring errors for example {example.example_id}: {str(e)} due to missing parameters")
                else:
-                    with example_logging_context(example.
+                    with example_logging_context(example.created_at, example.example_id):
                        error(f"Stopping example {example.example_id}: {str(e)} due to missing parameters")
                    raise
    except Exception as e:
        if ignore_errors:
            scorer.error = str(e)
            scorer.success = False # Assuming you want to set success to False
-            with example_logging_context(example.
+            with example_logging_context(example.created_at, example.example_id):
                warning(f"Ignoring errors for example {example.example_id}: {str(e)}")
        else:
-            with example_logging_context(example.
+            with example_logging_context(example.created_at, example.example_id):
                error(f"Stopping example {example.example_id}: {str(e)}")
            raise
 
@@ -128,7 +128,7 @@ async def score_task(
    except MissingTestCaseParamsError as e:
        if skip_on_missing_params:
            scorer.skipped = True
-            with example_logging_context(example.
+            with example_logging_context(example.created_at, example.example_id):
                debug(f"Skipping example {example.example_id} due to missing parameters")
            return
        else:
@@ -137,7 +137,7 @@ async def score_task(
                scorer.success = False # Override success
                finish_text = "Failed"
            else:
-                with example_logging_context(example.
+                with example_logging_context(example.created_at, example.example_id):
                    error(f"Stopping example {example.example_id}: {str(e)} due to missing parameters")
                raise
    except TypeError:
@@ -147,7 +147,7 @@ async def score_task(
        except MissingTestCaseParamsError as e:
            if skip_on_missing_params:
                scorer.skipped = True
-                with example_logging_context(example.
+                with example_logging_context(example.created_at, example.example_id):
                    debug(f"Skipping example {example.example_id} due to missing parameters")
                return
            else:
@@ -156,7 +156,7 @@ async def score_task(
                    scorer.success = False # Override success
                    finish_text = "Failed"
                else:
-                    with example_logging_context(example.
+                    with example_logging_context(example.created_at, example.example_id):
                        error(f"Stopping example {example.example_id}: {str(e)} due to missing parameters")
                    raise
    except Exception as e:
@@ -164,10 +164,10 @@ async def score_task(
            scorer.error = str(e)
            scorer.success = False # Override success
            finish_text = "Failed"
-            with example_logging_context(example.
+            with example_logging_context(example.created_at, example.example_id):
                warning(f"Ignoring errors for example {example.example_id}: {str(e)}")
        else:
-            with example_logging_context(example.
+            with example_logging_context(example.created_at, example.example_id):
                error(f"Stopping example {example.example_id}: {str(e)}")
            raise
 
@@ -305,7 +305,7 @@ async def a_execute_scoring(
        bar_format="{desc}: |{bar}|{percentage:3.0f}% ({n_fmt}/{total_fmt}) [Time Taken: {elapsed}, {rate_fmt}{postfix}]",
    ) as pbar:
        for i, ex in enumerate(examples):
-            with example_logging_context(ex.
+            with example_logging_context(ex.created_at, ex.example_id):
                debug(f"Starting scoring for example {ex.example_id}")
                debug(f"Input: {ex.input}")
                debug(f"Using {len(scorers)} scorers")
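Every hunk in this file is the same one-line fix: the call sites now pass example.created_at and example.example_id to example_logging_context (the old attribute name is truncated by the diff viewer, so it is not reproduced here), which lines up with the reworked Example model in judgeval/data/example.py (+14 -13 above). For readers unfamiliar with the pattern, a generic sketch of an example-scoped logging context manager follows; it is illustrative only and not judgeval's actual implementation.

import logging
from contextlib import contextmanager

@contextmanager
def example_logging_context(created_at, example_id):
    """Illustrative stand-in: tag log records with the example being scored."""
    logger = logging.getLogger("judgeval.example")
    adapter = logging.LoggerAdapter(
        logger, {"created_at": created_at, "example_id": example_id}
    )
    adapter.debug("entering example scope")
    try:
        yield adapter
    finally:
        adapter.debug("leaving example scope")

# Call sites in score.py do not bind the yielded value:
# with example_logging_context(example.created_at, example.example_id):
#     warning(f"Skipping example {example.example_id} due to missing parameters")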