judgeval 0.0.39__py3-none-any.whl → 0.0.41__py3-none-any.whl
This diff shows the changes between publicly available package versions released to one of the supported registries. The information is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
- judgeval/clients.py +6 -4
- judgeval/common/tracer.py +504 -257
- judgeval/common/utils.py +5 -1
- judgeval/constants.py +2 -0
- judgeval/data/__init__.py +2 -1
- judgeval/data/datasets/dataset.py +12 -6
- judgeval/data/datasets/eval_dataset_client.py +3 -1
- judgeval/data/example.py +7 -7
- judgeval/data/tool.py +29 -1
- judgeval/data/trace.py +31 -39
- judgeval/data/trace_run.py +2 -1
- judgeval/evaluation_run.py +4 -7
- judgeval/judgment_client.py +34 -7
- judgeval/run_evaluation.py +67 -19
- judgeval/scorers/__init__.py +4 -1
- judgeval/scorers/judgeval_scorer.py +12 -1
- judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +4 -0
- judgeval/scorers/judgeval_scorers/api_scorers/classifier_scorer.py +124 -0
- judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +20 -0
- judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py +1 -1
- judgeval/scorers/prompt_scorer.py +8 -164
- judgeval/scorers/score.py +15 -15
- judgeval-0.0.41.dist-info/METADATA +1450 -0
- {judgeval-0.0.39.dist-info → judgeval-0.0.41.dist-info}/RECORD +26 -24
- judgeval-0.0.39.dist-info/METADATA +0 -247
- {judgeval-0.0.39.dist-info → judgeval-0.0.41.dist-info}/WHEEL +0 -0
- {judgeval-0.0.39.dist-info → judgeval-0.0.41.dist-info}/licenses/LICENSE.md +0 -0
judgeval/run_evaluation.py
CHANGED
@@ -1,6 +1,7 @@
 import asyncio
 import requests
 import time
+import json
 import sys
 import itertools
 import threading
@@ -204,9 +205,9 @@ def check_missing_scorer_data(results: List[ScoringResult]) -> List[ScoringResul
     )
     return results

-def check_experiment_type(eval_name: str, project_name: str, judgment_api_key: str, organization_id: str,
+def check_experiment_type(eval_name: str, project_name: str, judgment_api_key: str, organization_id: str, is_trace: bool) -> None:
     """
-    Checks if the current experiment, if one exists, has the same type (examples of
+    Checks if the current experiment, if one exists, has the same type (examples of traces)
     """
     try:
         response = requests.post(
@@ -220,7 +221,7 @@ def check_experiment_type(eval_name: str, project_name: str, judgment_api_key: s
                 "eval_name": eval_name,
                 "project_name": project_name,
                 "judgment_api_key": judgment_api_key,
-                "
+                "is_trace": is_trace
             },
             verify=True
         )
@@ -362,14 +363,26 @@ def check_examples(examples: List[Example], scorers: List[Union[APIJudgmentScore
     """
     Checks if the example contains the necessary parameters for the scorer.
     """
+    prompt_user = False
     for scorer in scorers:
         for example in examples:
             missing_params = []
             for param in scorer.required_params:
                 if getattr(example, param.value) is None:
-                    missing_params.append(f"
+                    missing_params.append(f"{param.value}")
             if missing_params:
-
+                rprint(f"[yellow]⚠️ WARNING:[/yellow] Example is missing required parameters for scorer [bold]{scorer.score_type.value}[/bold]")
+                rprint(f"Missing parameters: {', '.join(missing_params)}")
+                rprint(f"Example: {json.dumps(example.model_dump(), indent=2)}")
+                rprint("-"*40)
+                prompt_user = True
+
+    if prompt_user:
+        user_input = input("Do you want to continue? (y/n)")
+        if user_input.lower() != "y":
+            sys.exit(0)
+        else:
+            rprint("[green]Continuing...[/green]")

 def run_trace_eval(trace_run: TraceRun, override: bool = False, ignore_errors: bool = True, function: Optional[Callable] = None, tracer: Optional[Union[Tracer, BaseCallbackHandler]] = None, examples: Optional[List[Example]] = None) -> List[ScoringResult]:
     # Call endpoint to check to see if eval run name exists (if we DON'T want to override and DO want to log results)
@@ -382,7 +395,7 @@ def run_trace_eval(trace_run: TraceRun, override: bool = False, ignore_errors: b
         )

     if trace_run.append:
-        # Check that the current experiment, if one exists, has the same type (examples
+        # Check that the current experiment, if one exists, has the same type (examples or traces)
         check_experiment_type(
             trace_run.eval_name,
             trace_run.project_name,
@@ -390,21 +403,27 @@ def run_trace_eval(trace_run: TraceRun, override: bool = False, ignore_errors: b
             trace_run.organization_id,
             True
         )
-
     if function and tracer:
         new_traces: List[Trace] = []
         tracer.offline_mode = True
+        tracer.traces = []
         for example in examples:
             if example.input:
-
+                if isinstance(example.input, str):
+                    result = run_with_spinner("Running agent function: ", function, example.input)
+                elif isinstance(example.input, dict):
+                    result = run_with_spinner("Running agent function: ", function, **example.input)
+                else:
+                    raise ValueError(f"Input must be string or dict, got {type(example.input)}")
             else:
                 result = run_with_spinner("Running agent function: ", function)
         for i, trace in enumerate(tracer.traces):
             # We set the root-level trace span with the expected tools of the Trace
             trace = Trace(**trace)
-            trace.
+            trace.trace_spans[0].expected_tools = examples[i].expected_tools
             new_traces.append(trace)
         trace_run.traces = new_traces
+        tracer.traces = []

     # Execute evaluation using Judgment API
     info("Starting API evaluation")
@@ -423,7 +442,7 @@ def run_trace_eval(trace_run: TraceRun, override: bool = False, ignore_errors: b
     debug("Processing API results")
     # TODO: allow for custom scorer on traces
     if trace_run.log_results:
-        pretty_str = run_with_spinner("Logging Results: ", log_evaluation_results, response_data["
+        pretty_str = run_with_spinner("Logging Results: ", log_evaluation_results, response_data["agent_results"], trace_run)
         rprint(pretty_str)

     return scoring_results
@@ -504,7 +523,8 @@ async def _poll_evaluation_until_complete(eval_name: str, project_name: str, jud
         info(f"Polling for evaluation '{eval_name}' in project '{project_name}' (attempt {poll_count})")

         # Check status
-        response =
+        response = await asyncio.to_thread(
+            requests.get,
             JUDGMENT_GET_EVAL_STATUS_API_URL,
             headers={
                 "Content-Type": "application/json",
@@ -531,7 +551,8 @@ async def _poll_evaluation_until_complete(eval_name: str, project_name: str, jud
         # If complete, get results and return
         if status == "completed" or status == "complete":
             info(f"Evaluation '{eval_name}' reported as completed, fetching and verifying results...")
-            results_response =
+            results_response = await asyncio.to_thread(
+                requests.post,
                 JUDGMENT_EVAL_FETCH_API_URL,
                 headers={
                     "Content-Type": "application/json",
@@ -723,7 +744,18 @@ class SpinnerWrappedTask:

     def __await__(self):
         async def _spin_and_await():
-
+            # self.task resolves to (scoring_results, pretty_str_to_print)
+            task_result_tuple = await await_with_spinner(self.task, self.message)
+
+            # Unpack the tuple
+            scoring_results, pretty_str_to_print = task_result_tuple
+
+            # Print the pretty string if it exists, after spinner is cleared
+            if pretty_str_to_print:
+                rprint(pretty_str_to_print)
+
+            # Return only the scoring_results to the original awaiter
+            return scoring_results
         return _spin_and_await().__await__()

     # Proxy all Task attributes and methods to the underlying task
@@ -756,7 +788,7 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_error
         )

     if evaluation_run.append:
-        # Check that the current experiment, if one exists, has the same type (examples of
+        # Check that the current experiment, if one exists, has the same type (examples of traces)
         check_experiment_type(
             evaluation_run.eval_name,
             evaluation_run.project_name,
@@ -769,8 +801,7 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_error
     debug("Initializing examples with IDs and timestamps")
     for idx, example in enumerate(evaluation_run.examples):
         example.example_index = idx # Set numeric index
-        example.
-        with example_logging_context(example.timestamp, example.example_id):
+        with example_logging_context(example.created_at, example.example_id):
             debug(f"Initialized example {example.example_id} (index: {example.example_index})")
             debug(f"Input: {example.input}")
             debug(f"Actual output: {example.actual_output}")
@@ -824,7 +855,8 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_error
            payload = evaluation_run.model_dump(warnings=False)

            # Send the evaluation to the queue
-            response =
+            response = await asyncio.to_thread(
+                requests.post,
                JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL,
                headers={
                    "Content-Type": "application/json",
@@ -843,13 +875,28 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_error
            info(f"Successfully added evaluation '{evaluation_run.eval_name}' to queue")

            # Poll until the evaluation is complete
-
+            results = await _poll_evaluation_until_complete(
                eval_name=evaluation_run.eval_name,
                project_name=evaluation_run.project_name,
                judgment_api_key=evaluation_run.judgment_api_key,
                organization_id=evaluation_run.organization_id,
                original_examples=evaluation_run.examples # Pass the original examples
            )
+
+            pretty_str_to_print = None
+            if evaluation_run.log_results and results: # Ensure results exist before logging
+                send_results = [scoring_result.model_dump(warnings=False) for scoring_result in results]
+                try:
+                    # Run the blocking log_evaluation_results in a separate thread
+                    pretty_str_to_print = await asyncio.to_thread(
+                        log_evaluation_results,
+                        send_results,
+                        evaluation_run
+                    )
+                except Exception as e:
+                    error(f"Error logging results after async evaluation: {str(e)}")
+
+            return results, pretty_str_to_print

        # Create a regular task
        task = asyncio.create_task(_async_evaluation_workflow())
@@ -860,6 +907,7 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_error
            f"Processing evaluation '{evaluation_run.eval_name}': "
        )
    else:
+        check_examples(evaluation_run.examples, evaluation_run.scorers)
        if judgment_scorers:
            # Execute evaluation using Judgment API
            info("Starting API evaluation")
@@ -895,7 +943,7 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_error
        # We should be removing local scorers soon
        info("Starting local evaluation")
        for example in evaluation_run.examples:
-            with example_logging_context(example.
+            with example_logging_context(example.created_at, example.example_id):
                debug(f"Processing example {example.example_id}: {example.input}")

        results: List[ScoringResult] = asyncio.run(
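The recurring change in this file is that blocking `requests` calls made from async code (status polling, result fetching, queueing) are now wrapped in `asyncio.to_thread(...)` so they run in a worker thread instead of stalling the event loop. Below is a minimal, self-contained sketch of that pattern; the URL and payload are placeholders, not judgeval's real endpoints.

```python
import asyncio
import requests

async def poll_status(url: str, params: dict, interval: float = 2.0) -> dict:
    """Poll a blocking HTTP endpoint from async code without blocking the event loop."""
    while True:
        # requests.get is synchronous; asyncio.to_thread (Python 3.9+) runs it in a worker thread
        response = await asyncio.to_thread(requests.get, url, params=params, timeout=10)
        response.raise_for_status()
        data = response.json()
        if data.get("status") in ("completed", "complete"):
            return data
        await asyncio.sleep(interval)  # yield control between polls

# Hypothetical usage:
# result = asyncio.run(poll_status("https://api.example.com/status", {"eval_name": "demo"}))
```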
judgeval/scorers/__init__.py
CHANGED
@@ -1,6 +1,6 @@
 from judgeval.scorers.api_scorer import APIJudgmentScorer
 from judgeval.scorers.judgeval_scorer import JudgevalScorer
-from judgeval.scorers.prompt_scorer import PromptScorer
+from judgeval.scorers.prompt_scorer import PromptScorer
 from judgeval.scorers.judgeval_scorers.api_scorers import (
     ExecutionOrderScorer,
     JSONCorrectnessScorer,
@@ -17,6 +17,8 @@ from judgeval.scorers.judgeval_scorers.api_scorers import (
     GroundednessScorer,
     DerailmentScorer,
     ToolOrderScorer,
+    ClassifierScorer,
+    ToolDependencyScorer,
 )
 from judgeval.scorers.judgeval_scorers.classifiers import (
     Text2SQLScorer,
@@ -43,4 +45,5 @@ __all__ = [
     "GroundednessScorer",
     "DerailmentScorer",
     "ToolOrderScorer",
+    "ToolDependencyScorer",
 ]
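With these re-exports in place, both new scorers resolve from the package root; a minimal sketch:

```python
# Both additions are importable alongside the existing scorers
from judgeval.scorers import ClassifierScorer, ToolDependencyScorer
```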
judgeval/scorers/judgeval_scorer.py
CHANGED
@@ -12,7 +12,7 @@ from judgeval.common.logger import debug, info, warning, error
 from judgeval.judges import JudgevalJudge
 from judgeval.judges.utils import create_judge
 from judgeval.constants import UNBOUNDED_SCORERS
-
+from judgeval.data.example import ExampleParams
 class JudgevalScorer:
     """
     Base class for scorers in `judgeval`.
@@ -39,6 +39,9 @@ class JudgevalScorer:
     evaluation_cost: Optional[float] = None # The cost of running the scorer
     verbose_logs: Optional[str] = None # The verbose logs of the scorer
     additional_metadata: Optional[Dict] = None # Additional metadata for the scorer
+    required_params: Optional[List[ExampleParams]] = None # The required parameters for the scorer
+    error: Optional[str] = None
+    success: Optional[bool] = None

     def __init__(
         self,
@@ -49,6 +52,7 @@ class JudgevalScorer:
         reason: Optional[str] = None,
         success: Optional[bool] = None,
         evaluation_model: Optional[str] = None,
+        required_params: Optional[List[ExampleParams]] = None,
         strict_mode: bool = False,
         async_mode: bool = True,
         verbose_mode: bool = True,
@@ -85,6 +89,7 @@ class JudgevalScorer:
         self.evaluation_cost = evaluation_cost
         self.verbose_logs = verbose_logs
         self.additional_metadata = additional_metadata
+        self.required_params = required_params

     def _add_model(self, model: Optional[Union[str, List[str], JudgevalJudge]] = None):
         """
@@ -145,3 +150,9 @@ class JudgevalScorer:
             "additional_metadata": self.additional_metadata,
         }
         return f"JudgevalScorer({attributes})"
+
+    def to_dict(self):
+        return {
+            "score_type": str(self.score_type), # Convert enum to string for serialization
+            "threshold": self.threshold
+        }
judgeval/scorers/judgeval_scorers/api_scorers/__init__.py
CHANGED
@@ -13,6 +13,8 @@ from judgeval.scorers.judgeval_scorers.api_scorers.instruction_adherence import
 from judgeval.scorers.judgeval_scorers.api_scorers.groundedness import GroundednessScorer
 from judgeval.scorers.judgeval_scorers.api_scorers.derailment_scorer import DerailmentScorer
 from judgeval.scorers.judgeval_scorers.api_scorers.tool_order import ToolOrderScorer
+from judgeval.scorers.judgeval_scorers.api_scorers.classifier_scorer import ClassifierScorer
+from judgeval.scorers.judgeval_scorers.api_scorers.tool_dependency import ToolDependencyScorer
 __all__ = [
     "ExecutionOrderScorer",
     "JSONCorrectnessScorer",
@@ -29,4 +31,6 @@ __all__ = [
     "GroundednessScorer",
     "DerailmentScorer",
     "ToolOrderScorer",
+    "ClassifierScorer",
+    "ToolDependencyScorer",
 ]
judgeval/scorers/judgeval_scorers/api_scorers/classifier_scorer.py
ADDED
@@ -0,0 +1,124 @@
+from judgeval.scorers.api_scorer import APIJudgmentScorer
+from judgeval.constants import APIScorer
+from typing import List, Mapping, Optional, Dict
+from pydantic import model_serializer
+
+class ClassifierScorer(APIJudgmentScorer):
+    """
+    In the Judgment backend, this scorer is implemented as a PromptScorer that takes
+    1. a system role that may involve the Example object
+    2. options for scores on the example
+
+    and uses a judge to execute the evaluation from the system role and classify into one of the options
+
+    ex:
+    system_role = "You are a judge that evaluates whether the response is positive or negative. The response is: {example.actual_output}"
+    options = {"positive": 1, "negative": 0}
+
+    Args:
+        name (str): The name of the scorer
+        slug (str): A unique identifier for the scorer
+        conversation (List[dict]): The conversation template with placeholders (e.g., {{actual_output}})
+        options (Mapping[str, float]): A mapping of classification options to their corresponding scores
+        threshold (float): The threshold for determining success (default: 0.5)
+        include_reason (bool): Whether to include reasoning in the response (default: True)
+        strict_mode (bool): Whether to use strict mode (default: False)
+        verbose_mode (bool): Whether to include verbose logging (default: False)
+    """
+    name: Optional[str] = None
+    slug: Optional[str] = None
+    conversation: Optional[List[dict]] = None
+    options: Optional[Mapping[str, float]] = None
+    verbose_mode: bool = False
+    strict_mode: bool = False
+    include_reason: bool = True,
+    async_mode: bool = True,
+    threshold: float = 0.5
+
+    def __init__(
+        self,
+        name: str,
+        slug: str,
+        conversation: List[dict],
+        options: Mapping[str, float],
+        threshold: float = 0.5,
+        include_reason: bool = True,
+        strict_mode: bool = False,
+        verbose_mode: bool = False,
+        async_mode: bool = True,
+    ):
+        super().__init__(
+            threshold=threshold,
+            score_type=APIScorer.CLASSIFIER,
+        )
+        self.name = name
+        self.verbose_mode = verbose_mode
+        self.strict_mode = strict_mode
+        self.include_reason = include_reason
+        self.slug = slug
+        self.conversation = conversation
+        self.options = options
+        self.async_mode = async_mode
+
+    def update_name(self, name: str):
+        """
+        Updates the name of the scorer.
+        """
+        self.name = name
+
+    def update_threshold(self, threshold: float):
+        """
+        Updates the threshold of the scorer.
+        """
+        self.threshold = threshold
+
+    def update_conversation(self, conversation: List[dict]):
+        """
+        Updates the conversation with the new conversation.
+
+        Sample conversation:
+        [{'role': 'system', 'content': "Did the chatbot answer the user's question in a kind way?: {{actual_output}}."}]
+        """
+        self.conversation = conversation
+
+    def update_options(self, options: Mapping[str, float]):
+        """
+        Updates the options with the new options.
+
+        Sample options:
+        {"yes": 1, "no": 0}
+        """
+        self.options = options
+
+    def __str__(self):
+        return f"ClassifierScorer(name={self.name}, slug={self.slug}, conversation={self.conversation}, threshold={self.threshold}, options={self.options})"
+
+    # @model_serializer
+    # def serialize_model(self) -> dict:
+    #     """
+    #     Defines how the ClassifierScorer should be serialized when model_dump() is called.
+    #     """
+    #     return {
+    #         "name": self.name,
+    #         "score_type": self.name,
+    #         "conversation": self.conversation,
+    #         "options": self.options,
+    #         "threshold": self.threshold,
+    #         "include_reason": self.include_reason,
+    #         "async_mode": self.async_mode,
+    #         "strict_mode": self.strict_mode,
+    #         "verbose_mode": self.verbose_mode,
+    #     }
+
+    def to_dict(self) -> dict:
+        return {
+            "name": self.name,
+            "score_type": self.name,
+            "conversation": self.conversation,
+            "options": self.options,
+            "threshold": self.threshold,
+            "include_reason": self.include_reason,
+            "async_mode": self.async_mode,
+            "strict_mode": self.strict_mode,
+            "verbose_mode": self.verbose_mode,
+        }
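The docstring above maps directly to construction: a conversation template with `{{...}}` placeholders plus an options-to-score mapping. A small sketch built from the samples in `update_conversation`/`update_options`; the slug is a made-up placeholder:

```python
from judgeval.scorers import ClassifierScorer

kindness_scorer = ClassifierScorer(
    name="Kindness",
    slug="kindness-demo",  # hypothetical slug
    conversation=[{
        "role": "system",
        "content": "Did the chatbot answer the user's question in a kind way?: {{actual_output}}.",
    }],
    options={"yes": 1.0, "no": 0.0},
    threshold=1.0,
)

# Templates and options can be swapped after construction
kindness_scorer.update_options({"yes": 1.0, "somewhat": 0.5, "no": 0.0})
```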
judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py
ADDED
@@ -0,0 +1,20 @@
+"""
+`judgeval` tool dependency scorer
+"""
+
+# Internal imports
+from judgeval.scorers.api_scorer import APIJudgmentScorer
+from judgeval.constants import APIScorer
+from typing import Optional, Dict
+class ToolDependencyScorer(APIJudgmentScorer):
+    kwargs: Optional[Dict] = None
+    def __init__(self, threshold: float=1.0, enable_param_checking: bool = True):
+        super().__init__(
+            threshold=threshold,
+            score_type=APIScorer.TOOL_DEPENDENCY
+        )
+        self.kwargs = {"enable_param_checking": enable_param_checking}
+
+    @property
+    def __name__(self):
+        return "Tool Dependency"
judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py
CHANGED
@@ -7,7 +7,7 @@ Determines if the LLM-generated SQL query is valid and works for the natural lan
 from judgeval.scorers import ClassifierScorer

 Text2SQLScorer = ClassifierScorer(
-    "Text to SQL",
+    name="Text to SQL",
     slug="text2sql-1010101010",
     threshold=1.0,
     conversation=[{
judgeval/scorers/prompt_scorer.py
CHANGED
@@ -30,6 +30,7 @@ from typing import List, Optional, Tuple, Any, Mapping
 from pydantic import BaseModel, model_serializer, Field

 from judgeval.data import Example
+from judgeval.data.example import ExampleParams
 from judgeval.scorers import JudgevalScorer
 from judgeval.scorers.utils import (
     scorer_progress_meter,
@@ -37,6 +38,7 @@ from judgeval.scorers.utils import (
     get_or_create_event_loop,
     create_verbose_logs
 )
+from judgeval.judges import JudgevalJudge


 class ReasonScore(BaseModel):
@@ -49,7 +51,8 @@ class PromptScorer(JudgevalScorer, BaseModel):
     score_type: str
     threshold: float = Field(default=0.5)
     using_native_model: bool = Field(default=True)
-
+    model: Optional[JudgevalJudge] = Field(default=None)
+    skipped: bool = Field(default=False)
     # DO NOT SET THESE FIELDS MANUALLY, THEY ARE SET BY THE SCORE_EXAMPLE METHOD
     _response: Optional[dict] = None
     _result: Optional[float] = None
@@ -62,6 +65,7 @@ class PromptScorer(JudgevalScorer, BaseModel):
         async_mode: bool = True,
         strict_mode: bool = False,
         verbose_mode: bool = False,
+        required_params: Optional[List[ExampleParams]] = None,
     ):
         # Initialize BaseModel first
         BaseModel.__init__(
@@ -83,6 +87,7 @@ class PromptScorer(JudgevalScorer, BaseModel):
             async_mode=async_mode,
             strict_mode=strict_mode,
             verbose_mode=verbose_mode,
+            required_params=required_params,
         )

     def score_example(
@@ -276,166 +281,5 @@ class PromptScorer(JudgevalScorer, BaseModel):
     def __name__(self):
         return self.name

-
-
-
-    """
-    This is a PromptScorer that takes
-    1. a system role that may involve the Example object
-    2. options for scores on the example
-
-    and uses a judge to execute the evaluation from the system role and classify into one of the options
-
-    ex:
-    system_role = "You are a judge that evaluates whether the response is positive or negative. The response is: {example.actual_output}"
-    options = {"positive": 1, "negative": 0}
-    """
-
-    conversation: List[dict]
-    options: Mapping[str, float]
-
-    def __init__(self, name: str, slug: str, conversation: List[dict], options: Mapping[str, float],
-                 threshold: float = 0.5, include_reason: bool = True,
-                 async_mode: bool = True, strict_mode: bool = False, verbose_mode: bool = False):
-        # Initialize BaseModel first with all fields
-        BaseModel.__init__(
-            self,
-            name=name,
-            slug=slug,
-            score_type=name,
-            conversation=conversation,
-            options=options,
-            threshold=threshold,
-            include_reason=include_reason,
-            async_mode=async_mode,
-            strict_mode=strict_mode,
-            verbose_mode=verbose_mode,
-        )
-        # Then initialize JudgevalScorer
-        JudgevalScorer.__init__(
-            self,
-            score_type=name,
-            threshold=threshold,
-            include_reason=include_reason,
-            async_mode=async_mode,
-            strict_mode=strict_mode,
-            verbose_mode=verbose_mode,
-        )
-
-    def _build_measure_prompt(self, example: Example) -> List[dict]:
-        """
-        Builds the measure prompt for the classifier scorer.
-
-        Args:
-            example (Example): The example to build the prompt for
-
-        Returns:
-            List[dict]: The measure prompt for the classifier scorer
-        """
-        replacement_words = {
-            "{{actual_output}}": example.actual_output,
-            "{{expected_output}}": example.expected_output,
-            "{{context}}": example.context,
-            "{{retrieval_context}}": example.retrieval_context,
-            "{{tools_called}}": example.tools_called,
-            "{{expected_tools}}": example.expected_tools,
-        }
-        # Make a copy of the conversation to avoid modifying the original
-        conversation_copy = [dict(message) for message in self.conversation]
-
-        # Only replace if double brackets are found in the content
-        for message in conversation_copy:
-            content = message["content"]
-            if "{{" in content:
-                for key, value in replacement_words.items():
-                    if key in content:
-                        message["content"] = content.replace(key, str(value))
-        return conversation_copy
-
-    def _build_schema(self) -> dict:
-        return self.options
-
-    def _enforce_prompt_format(self, judge_prompt: List[dict], schema: dict) -> List[dict]:
-        """
-        Enforces the judge model to choose an option from the schema.
-
-        We want the model to choose an option from the schema and a reason for the choice.
-        """
-        options = list(schema.keys())
-        options_str = ", ".join(options)
-
-        system_role = judge_prompt[0]["content"]
-        system_role += (
-            f"\n\nYou must choose one of the following options: {options_str}. "
-            "Format your response as a JSON object with two fields:\n"
-            "1. 'choice': Your selected option (must be one of the provided choices)\n"
-            "2. 'reason': A brief explanation for why you made this choice\n\n"
-            "Example response format:\n"
-            "{\n"
-            ' "choice": "<one of the valid options>",\n'
-            ' "reason": "<your explanation>"\n'
-            "}"
-        )
-
-        judge_prompt[0]["content"] = system_role
-        return judge_prompt
-
-    def _process_response(self, response: dict) -> Tuple[float, str]:
-        choice = response.get("choice")
-        if choice not in self.options:
-            raise ValueError(f"Invalid choice: {choice}. Expected one of: {self.options.keys()}")
-        reason = response.get("reason", "No reason could be found in model response.")
-        return self.options[choice], reason
-
-    def _success_check(self, **kwargs) -> bool:
-        return self.score >= self.threshold
-
-    def update_name(self, name: str):
-        """
-        Updates the name of the scorer.
-        """
-        self.name = name
-
-    def update_threshold(self, threshold: float):
-        """
-        Updates the threshold of the scorer.
-        """
-        self.threshold = threshold
-
-    def update_conversation(self, conversation: List[dict]):
-        """
-        Updates the conversation with the new conversation.
-
-        Sample conversation:
-        [{'role': 'system', 'content': "Did the chatbot answer the user's question in a kind way?: {{actual_output}}."}]
-        """
-        self.conversation = conversation
-
-    def update_options(self, options: Mapping[str, float]):
-        """
-        Updates the options with the new options.
-
-        Sample options:
-        {"yes": 1, "no": 0}
-        """
-        self.options = options
-
-    def __str__(self):
-        return f"ClassifierScorer(name={self.name}, slug={self.slug}, conversation={self.conversation}, threshold={self.threshold}, options={self.options})"
-
-    @model_serializer
-    def serialize_model(self) -> dict:
-        """
-        Defines how the ClassifierScorer should be serialized when model_dump() is called.
-        """
-        return {
-            "name": self.name,
-            "score_type": self.score_type,
-            "conversation": self.conversation,
-            "options": self.options,
-            "threshold": self.threshold,
-            "include_reason": self.include_reason,
-            "async_mode": self.async_mode,
-            "strict_mode": self.strict_mode,
-            "verbose_mode": self.verbose_mode,
-        }
+    class Config:
+        arbitrary_types_allowed = True