judgeval 0.0.39__py3-none-any.whl → 0.0.40__py3-none-any.whl
This diff covers publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
- judgeval/clients.py +6 -4
- judgeval/common/tracer.py +361 -236
- judgeval/constants.py +2 -0
- judgeval/data/__init__.py +2 -1
- judgeval/data/example.py +7 -7
- judgeval/data/tool.py +29 -1
- judgeval/data/trace.py +26 -38
- judgeval/data/trace_run.py +2 -1
- judgeval/evaluation_run.py +4 -7
- judgeval/judgment_client.py +25 -6
- judgeval/run_evaluation.py +50 -16
- judgeval/scorers/__init__.py +4 -1
- judgeval/scorers/judgeval_scorer.py +8 -0
- judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +4 -0
- judgeval/scorers/judgeval_scorers/api_scorers/classifier_scorer.py +124 -0
- judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +20 -0
- judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py +1 -1
- judgeval/scorers/prompt_scorer.py +5 -164
- judgeval/scorers/score.py +15 -15
- judgeval-0.0.40.dist-info/METADATA +1441 -0
- {judgeval-0.0.39.dist-info → judgeval-0.0.40.dist-info}/RECORD +23 -21
- judgeval-0.0.39.dist-info/METADATA +0 -247
- {judgeval-0.0.39.dist-info → judgeval-0.0.40.dist-info}/WHEEL +0 -0
- {judgeval-0.0.39.dist-info → judgeval-0.0.40.dist-info}/licenses/LICENSE.md +0 -0
judgeval/constants.py
CHANGED
@@ -28,6 +28,8 @@ class APIScorer(str, Enum):
     GROUNDEDNESS = "groundedness"
     DERAILMENT = "derailment"
     TOOL_ORDER = "tool_order"
+    CLASSIFIER = "classifier"
+    TOOL_DEPENDENCY = "tool_dependency"
     @classmethod
     def _missing_(cls, value):
         # Handle case-insensitive lookup
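Usage sketch of the two new enum members (not part of the diff itself); the last assertion assumes the existing `_missing_` hook lowercases unknown values, which the diff only hints at via its comment.

    from judgeval.constants import APIScorer

    assert APIScorer.CLASSIFIER.value == "classifier"
    assert APIScorer.TOOL_DEPENDENCY.value == "tool_dependency"
    # Case-insensitive lookup via _missing_ (assumed behavior):
    assert APIScorer("Tool_Dependency") is APIScorer.TOOL_DEPENDENCY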
judgeval/data/__init__.py
CHANGED
@@ -2,7 +2,7 @@ from judgeval.data.example import Example, ExampleParams
 from judgeval.data.custom_example import CustomExample
 from judgeval.data.scorer_data import ScorerData, create_scorer_data
 from judgeval.data.result import ScoringResult, generate_scoring_result
-from judgeval.data.trace import Trace, TraceSpan
+from judgeval.data.trace import Trace, TraceSpan, TraceUsage


 __all__ = [
@@ -15,4 +15,5 @@ __all__ = [
     "generate_scoring_result",
     "Trace",
     "TraceSpan",
+    "TraceUsage"
 ]
judgeval/data/example.py
CHANGED
@@ -36,15 +36,15 @@ class Example(BaseModel):
     name: Optional[str] = None
     example_id: str = Field(default_factory=lambda: str(uuid4()))
     example_index: Optional[int] = None
-    timestamp: Optional[str] = None
+    created_at: Optional[str] = None
     trace_id: Optional[str] = None

     def __init__(self, **data):
         if 'example_id' not in data:
             data['example_id'] = str(uuid4())
         # Set timestamp if not provided
-        if 'timestamp' not in data:
-            data['timestamp'] = datetime.now().isoformat()
+        if 'created_at' not in data:
+            data['created_at'] = datetime.now().isoformat()
         super().__init__(**data)

     @field_validator('input', mode='before')
@@ -123,9 +123,9 @@ class Example(BaseModel):
             raise ValueError(f"Example index must be an integer or None but got {v} of type {type(v)}")
         return v

-    @field_validator('timestamp', mode='before')
+    @field_validator('created_at', mode='before')
     @classmethod
-    def validate_timestamp(cls, v):
+    def validate_created_at(cls, v):
         if v is not None and not isinstance(v, str):
             raise ValueError(f"Timestamp must be a string or None but got {v} of type {type(v)}")
         return v
@@ -150,7 +150,7 @@ class Example(BaseModel):
             "name": self.name,
             "example_id": self.example_id,
             "example_index": self.example_index,
-            "timestamp": self.timestamp,
+            "created_at": self.created_at,
         }

     def __str__(self):
@@ -166,5 +166,5 @@ class Example(BaseModel):
             f"name={self.name}, "
             f"example_id={self.example_id}, "
             f"example_index={self.example_index}, "
-            f"timestamp={self.timestamp}, "
+            f"created_at={self.created_at}, "
         )
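A minimal sketch of the renamed field (illustrative, not from the diff); it assumes `Example` still accepts `input` and `actual_output` as plain strings, as the validators in this file suggest.

    from judgeval.data import Example

    ex = Example(input="What is the capital of France?", actual_output="Paris")
    print(ex.created_at)  # ISO-8601 string auto-filled in __init__ when not provided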
judgeval/data/tool.py
CHANGED
@@ -1,10 +1,14 @@
 from pydantic import BaseModel, field_validator
-from typing import Dict, Any, Optional
+from typing import Dict, Any, Optional, List
 import warnings

 class Tool(BaseModel):
     tool_name: str
     parameters: Optional[Dict[str, Any]] = None
+    agent_name: Optional[str] = None
+    result_dependencies: Optional[List[Dict[str, Any]]] = None
+    action_dependencies: Optional[List[Dict[str, Any]]] = None
+    require_all: Optional[bool] = None

     @field_validator('tool_name')
     def validate_tool_name(cls, v):
@@ -16,4 +20,28 @@ class Tool(BaseModel):
     def validate_parameters(cls, v):
         if v is not None and not isinstance(v, dict):
             warnings.warn(f"Parameters should be a dictionary, got {type(v)}", UserWarning)
+        return v
+
+    @field_validator('agent_name')
+    def validate_agent_name(cls, v):
+        if v is not None and not isinstance(v, str):
+            warnings.warn(f"Agent name should be a string, got {type(v)}", UserWarning)
+        return v
+
+    @field_validator('result_dependencies')
+    def validate_result_dependencies(cls, v):
+        if v is not None and not isinstance(v, list):
+            warnings.warn(f"Result dependencies should be a list, got {type(v)}", UserWarning)
+        return v
+
+    @field_validator('action_dependencies')
+    def validate_action_dependencies(cls, v):
+        if v is not None and not isinstance(v, list):
+            warnings.warn(f"Action dependencies should be a list, got {type(v)}", UserWarning)
+        return v
+
+    @field_validator('require_all')
+    def validate_require_all(cls, v):
+        if v is not None and not isinstance(v, bool):
+            warnings.warn(f"Require all should be a boolean, got {type(v)}", UserWarning)
         return v
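A minimal construction sketch of the expanded Tool model (not from the diff); the field names come from the diff above, but the dependency dict shapes and values are illustrative only since the diff does not specify them.

    from judgeval.data.tool import Tool

    tool = Tool(
        tool_name="search_flights",
        parameters={"origin": "SFO", "destination": "JFK"},
        agent_name="travel_agent",                            # new in 0.0.40
        result_dependencies=[{"tool_name": "get_airports"}],  # new; dict shape is illustrative
        action_dependencies=[],                               # new
        require_all=True,                                     # new
    )
    print(tool.model_dump())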
judgeval/data/trace.py
CHANGED
@@ -5,36 +5,52 @@ from judgeval.data.tool import Tool
 import json
 from datetime import datetime, timezone

+class TraceUsage(BaseModel):
+    prompt_tokens: Optional[int] = None
+    completion_tokens: Optional[int] = None
+    total_tokens: Optional[int] = None
+    prompt_tokens_cost_usd: Optional[float] = None
+    completion_tokens_cost_usd: Optional[float] = None
+    total_cost_usd: Optional[float] = None
+    model_name: Optional[str] = None
+
 class TraceSpan(BaseModel):
     span_id: str
     trace_id: str
-    function: …
+    function: str
     depth: int
     created_at: Optional[Any] = None
     parent_span_id: Optional[str] = None
     span_type: Optional[str] = "span"
     inputs: Optional[Dict[str, Any]] = None
+    error: Optional[Dict[str, Any]] = None
     output: Optional[Any] = None
+    usage: Optional[TraceUsage] = None
     duration: Optional[float] = None
     annotation: Optional[List[Dict[str, Any]]] = None
     evaluation_runs: Optional[List[EvaluationRun]] = []
     expected_tools: Optional[List[Tool]] = None
     additional_metadata: Optional[Dict[str, Any]] = None
+    has_evaluation: Optional[bool] = False
+    agent_name: Optional[str] = None

     def model_dump(self, **kwargs):
         return {
             "span_id": self.span_id,
             "trace_id": self.trace_id,
             "depth": self.depth,
-            # "created_at": datetime.fromtimestamp(self.created_at).isoformat(),
             "created_at": datetime.fromtimestamp(self.created_at, tz=timezone.utc).isoformat(),
-            "inputs": self.…
-            "output": self.…
+            "inputs": self._serialize_value(self.inputs),
+            "output": self._serialize_value(self.output),
+            "error": self._serialize_value(self.error),
             "evaluation_runs": [run.model_dump() for run in self.evaluation_runs] if self.evaluation_runs else [],
             "parent_span_id": self.parent_span_id,
             "function": self.function,
             "duration": self.duration,
-            "span_type": self.span_type
+            "span_type": self.span_type,
+            "usage": self.usage.model_dump() if self.usage else None,
+            "has_evaluation": self.has_evaluation,
+            "agent_name": self.agent_name
         }

     def print_span(self):
@@ -42,30 +58,6 @@ class TraceSpan(BaseModel):
         indent = " " * self.depth
         parent_info = f" (parent_id: {self.parent_span_id})" if self.parent_span_id else ""
         print(f"{indent}→ {self.function} (id: {self.span_id}){parent_info}")
-
-    def _serialize_inputs(self) -> dict:
-        """Helper method to serialize input data safely."""
-        if self.inputs is None:
-            return {}
-
-        serialized_inputs = {}
-        for key, value in self.inputs.items():
-            if isinstance(value, BaseModel):
-                serialized_inputs[key] = value.model_dump()
-            elif isinstance(value, (list, tuple)):
-                # Handle lists/tuples of arguments
-                serialized_inputs[key] = [
-                    item.model_dump() if isinstance(item, BaseModel)
-                    else None if not self._is_json_serializable(item)
-                    else item
-                    for item in value
-                ]
-            else:
-                if self._is_json_serializable(value):
-                    serialized_inputs[key] = value
-                else:
-                    serialized_inputs[key] = self.safe_stringify(value, self.function)
-        return serialized_inputs

     def _is_json_serializable(self, obj: Any) -> bool:
         """Helper method to check if an object is JSON serializable."""
@@ -88,15 +80,11 @@ class TraceSpan(BaseModel):
             return repr(output)
         except (TypeError, OverflowError, ValueError):
             pass
-
-        warnings.warn(
-            f"Output for function {function_name} is not JSON serializable and could not be converted to string. Setting to None."
-        )
         return None

-    def …
-        """Helper method to serialize …
-        if …
+    def _serialize_value(self, value: Any) -> Any:
+        """Helper method to deep serialize a value safely supporting Pydantic Models / regular PyObjects."""
+        if value is None:
             return None

         def serialize_value(value):
@@ -117,8 +105,8 @@ class TraceSpan(BaseModel):
             # Fallback to safe stringification
             return self.safe_stringify(value, self.function)

-        # Start serialization with the top-level …
-        return serialize_value(…
+        # Start serialization with the top-level value
+        return serialize_value(value)

 class Trace(BaseModel):
     trace_id: str
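A minimal sketch (not from the diff) of attaching the new TraceUsage to a span; all values are illustrative, and it assumes `created_at` is a Unix timestamp, which is what `model_dump()` implies via `datetime.fromtimestamp(...)`.

    import time
    from judgeval.data import TraceSpan, TraceUsage

    usage = TraceUsage(
        prompt_tokens=120,
        completion_tokens=48,
        total_tokens=168,
        prompt_tokens_cost_usd=0.00012,
        completion_tokens_cost_usd=0.00010,
        total_cost_usd=0.00022,
        model_name="gpt-4.1",
    )
    span = TraceSpan(
        span_id="span-1",
        trace_id="trace-1",
        function="call_llm",
        depth=0,
        created_at=time.time(),   # converted to UTC ISO-8601 in model_dump()
        usage=usage,              # new field
        agent_name="travel_agent" # new field
    )
    print(span.model_dump()["usage"])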
judgeval/data/trace_run.py
CHANGED
@@ -1,4 +1,3 @@
-
 from pydantic import BaseModel
 from typing import List, Optional, Dict, Any, Union, Callable
 from judgeval.data import Trace
@@ -22,6 +21,7 @@ class TraceRun(BaseModel):
         judgment_api_key (Optional[str]): The API key for running evaluations on the Judgment API
         rules (Optional[List[Rule]]): Rules to evaluate against scoring results
         append (Optional[bool]): Whether to append to existing evaluation results
+        tools (Optional[List[Dict[str, Any]]]): List of tools to use for evaluation
     """

     # The user will specify whether they want log_results when they call run_eval
@@ -40,6 +40,7 @@ class TraceRun(BaseModel):
     judgment_api_key: Optional[str] = ""
     override: Optional[bool] = False
     rules: Optional[List[Rule]] = None
+    tools: Optional[List[Dict[str, Any]]] = None

     class Config:
         arbitrary_types_allowed = True
judgeval/evaluation_run.py
CHANGED
@@ -1,5 +1,5 @@
 from typing import List, Optional, Dict, Any, Union
-from pydantic import BaseModel, field_validator
+from pydantic import BaseModel, field_validator, Field

 from judgeval.data import Example, CustomExample
 from judgeval.scorers import JudgevalScorer, APIJudgmentScorer
@@ -27,12 +27,12 @@ class EvaluationRun(BaseModel):
     # The user will specify whether they want log_results when they call run_eval
     log_results: bool = False # NOTE: log_results has to be set first because it is used to validate project_name and eval_name
     organization_id: Optional[str] = None
-    project_name: Optional[str] = None
-    eval_name: Optional[str] = None
+    project_name: Optional[str] = Field(default=None, validate_default=True)
+    eval_name: Optional[str] = Field(default=None, validate_default=True)
     examples: Union[List[Example], List[CustomExample]]
     scorers: List[Union[APIJudgmentScorer, JudgevalScorer]]
     model: Optional[Union[str, List[str], JudgevalJudge]] = "gpt-4.1"
-    aggregator: Optional[str] = None
+    aggregator: Optional[str] = Field(default=None, validate_default=True)
     metadata: Optional[Dict[str, Any]] = None
     trace_span_id: Optional[str] = None
     # API Key will be "" until user calls client.run_eval(), then API Key will be set
@@ -96,9 +96,6 @@ class EvaluationRun(BaseModel):
     def validate_scorers(cls, v):
         if not v:
             raise ValueError("Scorers cannot be empty.")
-        for s in v:
-            if not isinstance(s, APIJudgmentScorer) and not isinstance(s, JudgevalScorer):
-                raise ValueError(f"Invalid type for Scorer: {type(s)}")
        return v

     @field_validator('model')
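A generic pydantic v2 sketch of why the switch to Field(default=None, validate_default=True) matters here: without it, a field validator is skipped when the field keeps its default. The model and validator below are illustrative, not judgeval's actual code.

    from typing import Optional
    from pydantic import BaseModel, Field, field_validator

    class Run(BaseModel):
        log_results: bool = False
        project_name: Optional[str] = Field(default=None, validate_default=True)

        @field_validator("project_name")
        @classmethod
        def require_name_when_logging(cls, v, info):
            # log_results is declared first, so it is already available in info.data
            if info.data.get("log_results") and v is None:
                raise ValueError("project_name is required when log_results=True")
            return v

    Run(log_results=True)  # raises, because the default None is now validated too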
judgeval/judgment_client.py
CHANGED
@@ -5,6 +5,7 @@ import os
 from uuid import uuid4
 from typing import Optional, List, Dict, Any, Union, Callable
 import requests
+import asyncio

 from judgeval.constants import ROOT_API
 from judgeval.data.datasets import EvalDataset, EvalDatasetClient
@@ -121,7 +122,8 @@ class JudgmentClient(metaclass=SingletonMeta):
         ignore_errors: bool = True,
         rules: Optional[List[Rule]] = None,
         function: Optional[Callable] = None,
-        tracer: Optional[Union[Tracer, BaseCallbackHandler]] = None
+        tracer: Optional[Union[Tracer, BaseCallbackHandler]] = None,
+        tools: Optional[List[Dict[str, Any]]] = None
     ) -> List[ScoringResult]:
         try:

@@ -151,6 +153,7 @@ class JudgmentClient(metaclass=SingletonMeta):
                 append=append,
                 judgment_api_key=self.judgment_api_key,
                 organization_id=self.organization_id,
+                tools=tools
             )
             return run_trace_eval(trace_run, override, ignore_errors, function, tracer, examples)
         except ValueError as e:
@@ -173,7 +176,7 @@ class JudgmentClient(metaclass=SingletonMeta):
         ignore_errors: bool = True,
         async_execution: bool = False,
         rules: Optional[List[Rule]] = None
-    ) -> List[ScoringResult]:
+    ) -> Union[List[ScoringResult], asyncio.Task]:
         """
         Executes an evaluation of `Example`s using one or more `Scorer`s

@@ -480,7 +483,7 @@ class JudgmentClient(metaclass=SingletonMeta):

         return response.json()["slug"]

-    …
+    def assert_test(
         self,
         scorers: List[Union[APIJudgmentScorer, JudgevalScorer]],
         examples: Optional[List[Example]] = None,
@@ -495,6 +498,7 @@ class JudgmentClient(metaclass=SingletonMeta):
         rules: Optional[List[Rule]] = None,
         function: Optional[Callable] = None,
         tracer: Optional[Union[Tracer, BaseCallbackHandler]] = None,
+        tools: Optional[List[Dict[str, Any]]] = None,
         async_execution: bool = False
     ) -> None:
         """
@@ -513,6 +517,14 @@ class JudgmentClient(metaclass=SingletonMeta):
            override (bool): Whether to override an existing evaluation run with the same name
            rules (Optional[List[Rule]]): Rules to evaluate against scoring results
        """
+
+        # Check for enable_param_checking and tools
+        for scorer in scorers:
+            if hasattr(scorer, "kwargs") and scorer.kwargs is not None:
+                if scorer.kwargs.get("enable_param_checking") is True:
+                    if not tools:
+                        raise ValueError(f"You must provide the 'tools' argument to assert_test when using a scorer with enable_param_checking=True. If you do not want to do param checking, explicitly set enable_param_checking=False for the {scorer.__name__} scorer.")
+
        # Validate that exactly one of examples or test_file is provided
        if (examples is None and test_file is None) or (examples is not None and test_file is not None):
            raise ValueError("Exactly one of 'examples' or 'test_file' must be provided, but not both")
@@ -530,10 +542,11 @@ class JudgmentClient(metaclass=SingletonMeta):
                rules=rules,
                function=function,
                tracer=tracer,
-                test_file=test_file
+                test_file=test_file,
+                tools=tools
            )
        else:
-            results = …
+            results = self.run_evaluation(
                examples=examples,
                scorers=scorers,
                model=model,
@@ -547,4 +560,10 @@ class JudgmentClient(metaclass=SingletonMeta):
                async_execution=async_execution
            )

-        …
+        if async_execution:
+            # 'results' is an asyncio.Task here, awaiting it gives List[ScoringResult]
+            actual_results = asyncio.run(results)
+            assert_test(actual_results) # Call the synchronous imported function
+        else:
+            # 'results' is already List[ScoringResult] here (synchronous path)
+            assert_test(results) # Call the synchronous imported function
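A heavily hedged sketch of the new `tools` argument, which assert_test now requires when a scorer sets enable_param_checking=True. The `JudgmentClient` import path, the ToolDependencyScorer constructor, and the Example fields are assumptions; only the assert_test parameters themselves come from the diff.

    from judgeval import JudgmentClient              # assumed import path
    from judgeval.data import Example
    from judgeval.data.tool import Tool
    from judgeval.scorers import ToolDependencyScorer  # constructor signature not shown in the diff

    client = JudgmentClient()
    client.assert_test(
        scorers=[ToolDependencyScorer()],                            # assumed default constructor
        examples=[Example(input="Book me a flight to NYC")],
        tools=[Tool(tool_name="search_flights").model_dump()],       # tools is List[Dict[str, Any]]
    )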
judgeval/run_evaluation.py
CHANGED
@@ -204,9 +204,9 @@ def check_missing_scorer_data(results: List[ScoringResult]) -> List[ScoringResul
     )
     return results

-def check_experiment_type(eval_name: str, project_name: str, judgment_api_key: str, organization_id: str, …
+def check_experiment_type(eval_name: str, project_name: str, judgment_api_key: str, organization_id: str, is_trace: bool) -> None:
     """
-    Checks if the current experiment, if one exists, has the same type (examples of …
+    Checks if the current experiment, if one exists, has the same type (examples of traces)
     """
     try:
         response = requests.post(
@@ -220,7 +220,7 @@ def check_experiment_type(eval_name: str, project_name: str, judgment_api_key: s
                 "eval_name": eval_name,
                 "project_name": project_name,
                 "judgment_api_key": judgment_api_key,
-                "…
+                "is_trace": is_trace
             },
             verify=True
         )
@@ -382,7 +382,7 @@ def run_trace_eval(trace_run: TraceRun, override: bool = False, ignore_errors: b
     )

     if trace_run.append:
-        # Check that the current experiment, if one exists, has the same type (examples …
+        # Check that the current experiment, if one exists, has the same type (examples or traces)
         check_experiment_type(
             trace_run.eval_name,
             trace_run.project_name,
@@ -390,13 +390,18 @@ def run_trace_eval(trace_run: TraceRun, override: bool = False, ignore_errors: b
             trace_run.organization_id,
             True
         )
-
     if function and tracer:
         new_traces: List[Trace] = []
         tracer.offline_mode = True
+        tracer.traces = []
         for example in examples:
             if example.input:
-                …
+                if isinstance(example.input, str):
+                    result = run_with_spinner("Running agent function: ", function, example.input)
+                elif isinstance(example.input, dict):
+                    result = run_with_spinner("Running agent function: ", function, **example.input)
+                else:
+                    raise ValueError(f"Input must be string or dict, got {type(example.input)}")
             else:
                 result = run_with_spinner("Running agent function: ", function)
         for i, trace in enumerate(tracer.traces):
@@ -405,6 +410,7 @@ def run_trace_eval(trace_run: TraceRun, override: bool = False, ignore_errors: b
             trace.entries[0].expected_tools = examples[i].expected_tools
             new_traces.append(trace)
         trace_run.traces = new_traces
+        tracer.traces = []

     # Execute evaluation using Judgment API
     info("Starting API evaluation")
@@ -423,7 +429,7 @@ def run_trace_eval(trace_run: TraceRun, override: bool = False, ignore_errors: b
     debug("Processing API results")
     # TODO: allow for custom scorer on traces
     if trace_run.log_results:
-        pretty_str = run_with_spinner("Logging Results: ", log_evaluation_results, response_data["…
+        pretty_str = run_with_spinner("Logging Results: ", log_evaluation_results, response_data["agent_results"], trace_run)
         rprint(pretty_str)

     return scoring_results
@@ -504,7 +510,8 @@ async def _poll_evaluation_until_complete(eval_name: str, project_name: str, jud
            info(f"Polling for evaluation '{eval_name}' in project '{project_name}' (attempt {poll_count})")

            # Check status
-            response = …
+            response = await asyncio.to_thread(
+                requests.get,
                JUDGMENT_GET_EVAL_STATUS_API_URL,
                headers={
                    "Content-Type": "application/json",
@@ -531,7 +538,8 @@ async def _poll_evaluation_until_complete(eval_name: str, project_name: str, jud
            # If complete, get results and return
            if status == "completed" or status == "complete":
                info(f"Evaluation '{eval_name}' reported as completed, fetching and verifying results...")
-                results_response = …
+                results_response = await asyncio.to_thread(
+                    requests.post,
                    JUDGMENT_EVAL_FETCH_API_URL,
                    headers={
                        "Content-Type": "application/json",
@@ -723,7 +731,18 @@ class SpinnerWrappedTask:

    def __await__(self):
        async def _spin_and_await():
-            …
+            # self.task resolves to (scoring_results, pretty_str_to_print)
+            task_result_tuple = await await_with_spinner(self.task, self.message)
+
+            # Unpack the tuple
+            scoring_results, pretty_str_to_print = task_result_tuple
+
+            # Print the pretty string if it exists, after spinner is cleared
+            if pretty_str_to_print:
+                rprint(pretty_str_to_print)
+
+            # Return only the scoring_results to the original awaiter
+            return scoring_results
        return _spin_and_await().__await__()

    # Proxy all Task attributes and methods to the underlying task
@@ -756,7 +775,7 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_error
    )

    if evaluation_run.append:
-        # Check that the current experiment, if one exists, has the same type (examples of …
+        # Check that the current experiment, if one exists, has the same type (examples of traces)
        check_experiment_type(
            evaluation_run.eval_name,
            evaluation_run.project_name,
@@ -769,8 +788,7 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_error
    debug("Initializing examples with IDs and timestamps")
    for idx, example in enumerate(evaluation_run.examples):
        example.example_index = idx # Set numeric index
-        example.…
-        with example_logging_context(example.timestamp, example.example_id):
+        with example_logging_context(example.created_at, example.example_id):
            debug(f"Initialized example {example.example_id} (index: {example.example_index})")
            debug(f"Input: {example.input}")
            debug(f"Actual output: {example.actual_output}")
@@ -824,7 +842,8 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_error
        payload = evaluation_run.model_dump(warnings=False)

        # Send the evaluation to the queue
-        response = …
+        response = await asyncio.to_thread(
+            requests.post,
            JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL,
            headers={
                "Content-Type": "application/json",
@@ -843,13 +862,28 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_error
        info(f"Successfully added evaluation '{evaluation_run.eval_name}' to queue")

        # Poll until the evaluation is complete
-        …
+        results = await _poll_evaluation_until_complete(
            eval_name=evaluation_run.eval_name,
            project_name=evaluation_run.project_name,
            judgment_api_key=evaluation_run.judgment_api_key,
            organization_id=evaluation_run.organization_id,
            original_examples=evaluation_run.examples # Pass the original examples
        )
+
+        pretty_str_to_print = None
+        if evaluation_run.log_results and results: # Ensure results exist before logging
+            send_results = [scoring_result.model_dump(warnings=False) for scoring_result in results]
+            try:
+                # Run the blocking log_evaluation_results in a separate thread
+                pretty_str_to_print = await asyncio.to_thread(
+                    log_evaluation_results,
+                    send_results,
+                    evaluation_run
+                )
+            except Exception as e:
+                error(f"Error logging results after async evaluation: {str(e)}")
+
+        return results, pretty_str_to_print

        # Create a regular task
        task = asyncio.create_task(_async_evaluation_workflow())
@@ -895,7 +929,7 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_error
    # We should be removing local scorers soon
    info("Starting local evaluation")
    for example in evaluation_run.examples:
-        with example_logging_context(example.…
+        with example_logging_context(example.created_at, example.example_id):
            debug(f"Processing example {example.example_id}: {example.input}")

            results: List[ScoringResult] = asyncio.run(
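A generic sketch of the pattern this file adopts for status polling and queueing: running a blocking `requests` call on a worker thread via `asyncio.to_thread` so the coroutine does not block the event loop. The URL and headers below are placeholders, not judgeval endpoints.

    import asyncio
    import requests

    async def poll_status(url: str, api_key: str) -> dict:
        # requests.get runs in a thread; the event loop stays free while it blocks
        response = await asyncio.to_thread(
            requests.get,
            url,
            headers={"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"},
        )
        response.raise_for_status()
        return response.json()

    # asyncio.run(poll_status("https://example.com/status", "placeholder-key"))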
judgeval/scorers/__init__.py
CHANGED
@@ -1,6 +1,6 @@
 from judgeval.scorers.api_scorer import APIJudgmentScorer
 from judgeval.scorers.judgeval_scorer import JudgevalScorer
-from judgeval.scorers.prompt_scorer import PromptScorer
+from judgeval.scorers.prompt_scorer import PromptScorer
 from judgeval.scorers.judgeval_scorers.api_scorers import (
     ExecutionOrderScorer,
     JSONCorrectnessScorer,
@@ -17,6 +17,8 @@ from judgeval.scorers.judgeval_scorers.api_scorers import (
     GroundednessScorer,
     DerailmentScorer,
     ToolOrderScorer,
+    ClassifierScorer,
+    ToolDependencyScorer,
 )
 from judgeval.scorers.judgeval_scorers.classifiers import (
     Text2SQLScorer,
@@ -43,4 +45,5 @@ __all__ = [
     "GroundednessScorer",
     "DerailmentScorer",
     "ToolOrderScorer",
+    "ToolDependencyScorer",
 ]
judgeval/scorers/judgeval_scorer.py
CHANGED
@@ -39,6 +39,8 @@ class JudgevalScorer:
     evaluation_cost: Optional[float] = None # The cost of running the scorer
     verbose_logs: Optional[str] = None # The verbose logs of the scorer
     additional_metadata: Optional[Dict] = None # Additional metadata for the scorer
+    error: Optional[str] = None
+    success: Optional[bool] = None

     def __init__(
         self,
@@ -145,3 +147,9 @@ class JudgevalScorer:
             "additional_metadata": self.additional_metadata,
         }
         return f"JudgevalScorer({attributes})"
+
+    def to_dict(self):
+        return {
+            "score_type": str(self.score_type), # Convert enum to string for serialization
+            "threshold": self.threshold
+        }
judgeval/scorers/judgeval_scorers/api_scorers/__init__.py
CHANGED
@@ -13,6 +13,8 @@ from judgeval.scorers.judgeval_scorers.api_scorers.instruction_adherence import
 from judgeval.scorers.judgeval_scorers.api_scorers.groundedness import GroundednessScorer
 from judgeval.scorers.judgeval_scorers.api_scorers.derailment_scorer import DerailmentScorer
 from judgeval.scorers.judgeval_scorers.api_scorers.tool_order import ToolOrderScorer
+from judgeval.scorers.judgeval_scorers.api_scorers.classifier_scorer import ClassifierScorer
+from judgeval.scorers.judgeval_scorers.api_scorers.tool_dependency import ToolDependencyScorer
 __all__ = [
     "ExecutionOrderScorer",
     "JSONCorrectnessScorer",
@@ -29,4 +31,6 @@ __all__ = [
     "GroundednessScorer",
     "DerailmentScorer",
     "ToolOrderScorer",
+    "ClassifierScorer",
+    "ToolDependencyScorer",
 ]