judgeval 0.0.38__py3-none-any.whl → 0.0.40__py3-none-any.whl

This diff reflects the changes between two publicly released package versions as they appear in their public registry, and is provided for informational purposes only.
judgeval/constants.py CHANGED
@@ -28,6 +28,8 @@ class APIScorer(str, Enum):
     GROUNDEDNESS = "groundedness"
     DERAILMENT = "derailment"
     TOOL_ORDER = "tool_order"
+    CLASSIFIER = "classifier"
+    TOOL_DEPENDENCY = "tool_dependency"
     @classmethod
     def _missing_(cls, value):
         # Handle case-insensitive lookup
@@ -59,6 +61,7 @@ JUDGMENT_TRACES_SAVE_API_URL = f"{ROOT_API}/traces/save/"
 JUDGMENT_TRACES_DELETE_API_URL = f"{ROOT_API}/traces/delete/"
 JUDGMENT_TRACES_ADD_ANNOTATION_API_URL = f"{ROOT_API}/traces/add_annotation/"
 JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL = f"{ROOT_API}/add_to_run_eval_queue/"
+JUDGMENT_GET_EVAL_STATUS_API_URL = f"{ROOT_API}/get_evaluation_status/"
 # RabbitMQ
 RABBITMQ_HOST = os.getenv("RABBITMQ_HOST", "rabbitmq-networklb-faa155df16ec9085.elb.us-west-1.amazonaws.com")
 RABBITMQ_PORT = os.getenv("RABBITMQ_PORT", 5672)
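The two new scorer identifiers are plain string-valued enum members, so they behave like the existing ones. A minimal sketch of using them (the case-insensitive behaviour is implied by the `_missing_` comment above, not shown in full here):

from judgeval.constants import APIScorer

print(APIScorer.CLASSIFIER.value)        # "classifier"
print(APIScorer.TOOL_DEPENDENCY.value)   # "tool_dependency"
# The _missing_ hook above is documented as a case-insensitive lookup,
# so APIScorer("TOOL_DEPENDENCY") should resolve to APIScorer.TOOL_DEPENDENCY.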
judgeval/data/__init__.py CHANGED
@@ -2,7 +2,7 @@ from judgeval.data.example import Example, ExampleParams
 from judgeval.data.custom_example import CustomExample
 from judgeval.data.scorer_data import ScorerData, create_scorer_data
 from judgeval.data.result import ScoringResult, generate_scoring_result
-from judgeval.data.trace import Trace, TraceSpan
+from judgeval.data.trace import Trace, TraceSpan, TraceUsage


 __all__ = [
@@ -15,4 +15,5 @@ __all__ = [
     "generate_scoring_result",
     "Trace",
     "TraceSpan",
+    "TraceUsage"
 ]
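With this export added, the usage model can be imported alongside the other data models, e.g. `from judgeval.data import Trace, TraceSpan, TraceUsage`.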
judgeval/data/example.py CHANGED
@@ -8,6 +8,7 @@ from uuid import uuid4
 from pydantic import BaseModel, Field, field_validator
 from enum import Enum
 from datetime import datetime
+from judgeval.data.tool import Tool
 import time


@@ -31,19 +32,19 @@ class Example(BaseModel):
     retrieval_context: Optional[List[str]] = None
     additional_metadata: Optional[Dict[str, Any]] = None
     tools_called: Optional[List[str]] = None
-    expected_tools: Optional[List[Dict[str, Any]]] = None
+    expected_tools: Optional[List[Tool]] = None
     name: Optional[str] = None
     example_id: str = Field(default_factory=lambda: str(uuid4()))
     example_index: Optional[int] = None
-    timestamp: Optional[str] = None
+    created_at: Optional[str] = None
     trace_id: Optional[str] = None

     def __init__(self, **data):
         if 'example_id' not in data:
             data['example_id'] = str(uuid4())
         # Set timestamp if not provided
-        if 'timestamp' not in data:
-            data['timestamp'] = datetime.now().strftime("%Y%m%d_%H%M%S")
+        if 'created_at' not in data:
+            data['created_at'] = datetime.now().isoformat()
         super().__init__(**data)

     @field_validator('input', mode='before')
@@ -82,17 +83,17 @@ class Example(BaseModel):
                 raise ValueError(f"All items in expected_output must be strings but got {v}")
         return v

-    @field_validator('expected_tools', mode='before')
+    @field_validator('expected_tools')
     @classmethod
     def validate_expected_tools(cls, v):
         if v is not None:
             if not isinstance(v, list):
-                raise ValueError(f"Expected tools must be a list of dictionaries or None but got {v} of type {type(v)}")
+                raise ValueError(f"Expected tools must be a list of Tools or None but got {v} of type {type(v)}")

-            # Check that each item in the list is a dictionary
+            # Check that each item in the list is a Tool
             for i, item in enumerate(v):
-                if not isinstance(item, dict):
-                    raise ValueError(f"Expected tools must be a list of dictionaries, but item at index {i} is {item} of type {type(item)}")
+                if not isinstance(item, Tool):
+                    raise ValueError(f"Expected tools must be a list of Tools, but item at index {i} is {item} of type {type(item)}")

         return v

@@ -122,9 +123,9 @@ class Example(BaseModel):
             raise ValueError(f"Example index must be an integer or None but got {v} of type {type(v)}")
         return v

-    @field_validator('timestamp', mode='before')
+    @field_validator('created_at', mode='before')
     @classmethod
-    def validate_timestamp(cls, v):
+    def validate_created_at(cls, v):
         if v is not None and not isinstance(v, str):
             raise ValueError(f"Timestamp must be a string or None but got {v} of type {type(v)}")
         return v
@@ -149,7 +150,7 @@ class Example(BaseModel):
            "name": self.name,
            "example_id": self.example_id,
            "example_index": self.example_index,
-           "timestamp": self.timestamp,
+           "created_at": self.created_at,
        }

    def __str__(self):
@@ -165,5 +166,5 @@ class Example(BaseModel):
            f"name={self.name}, "
            f"example_id={self.example_id}, "
            f"example_index={self.example_index}, "
-           f"timestamp={self.timestamp}, "
+           f"created_at={self.created_at}, "
        )
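With this change, `expected_tools` is validated as a list of `Tool` models rather than plain dictionaries, and the old `timestamp` field is replaced by an ISO-8601 `created_at`. A minimal sketch of constructing an Example under the new schema (field values are illustrative):

from judgeval.data.example import Example
from judgeval.data.tool import Tool

example = Example(
    input="What is the weather in Paris?",
    actual_output="It is 20 degrees and sunny.",
    tools_called=["get_weather"],
    expected_tools=[Tool(tool_name="get_weather", parameters={"city": "Paris"})],
)
print(example.created_at)  # ISO-8601 string, set in __init__ when not provided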
judgeval/data/tool.py ADDED
@@ -0,0 +1,47 @@
+from pydantic import BaseModel, field_validator
+from typing import Dict, Any, Optional, List
+import warnings
+
+class Tool(BaseModel):
+    tool_name: str
+    parameters: Optional[Dict[str, Any]] = None
+    agent_name: Optional[str] = None
+    result_dependencies: Optional[List[Dict[str, Any]]] = None
+    action_dependencies: Optional[List[Dict[str, Any]]] = None
+    require_all: Optional[bool] = None
+
+    @field_validator('tool_name')
+    def validate_tool_name(cls, v):
+        if not v:
+            warnings.warn("Tool name is empty or None", UserWarning)
+        return v
+
+    @field_validator('parameters')
+    def validate_parameters(cls, v):
+        if v is not None and not isinstance(v, dict):
+            warnings.warn(f"Parameters should be a dictionary, got {type(v)}", UserWarning)
+        return v
+
+    @field_validator('agent_name')
+    def validate_agent_name(cls, v):
+        if v is not None and not isinstance(v, str):
+            warnings.warn(f"Agent name should be a string, got {type(v)}", UserWarning)
+        return v
+
+    @field_validator('result_dependencies')
+    def validate_result_dependencies(cls, v):
+        if v is not None and not isinstance(v, list):
+            warnings.warn(f"Result dependencies should be a list, got {type(v)}", UserWarning)
+        return v
+
+    @field_validator('action_dependencies')
+    def validate_action_dependencies(cls, v):
+        if v is not None and not isinstance(v, list):
+            warnings.warn(f"Action dependencies should be a list, got {type(v)}", UserWarning)
+        return v
+
+    @field_validator('require_all')
+    def validate_require_all(cls, v):
+        if v is not None and not isinstance(v, bool):
+            warnings.warn(f"Require all should be a boolean, got {type(v)}", UserWarning)
+        return v
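The validators on the new `Tool` model emit `UserWarning`s rather than raising, so a questionable value such as an empty tool name still constructs. A minimal sketch of that behaviour (values are illustrative):

import warnings
from judgeval.data.tool import Tool

tool = Tool(tool_name="get_weather", parameters={"city": "Paris"})
print(tool.model_dump())

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    Tool(tool_name="")  # empty name: the validator warns instead of raising
    print([str(w.message) for w in caught])  # ["Tool name is empty or None"]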
judgeval/data/trace.py CHANGED
@@ -1,39 +1,56 @@
 from pydantic import BaseModel
 from typing import Optional, Dict, Any, List
 from judgeval.evaluation_run import EvaluationRun
+from judgeval.data.tool import Tool
 import json
 from datetime import datetime, timezone

+class TraceUsage(BaseModel):
+    prompt_tokens: Optional[int] = None
+    completion_tokens: Optional[int] = None
+    total_tokens: Optional[int] = None
+    prompt_tokens_cost_usd: Optional[float] = None
+    completion_tokens_cost_usd: Optional[float] = None
+    total_cost_usd: Optional[float] = None
+    model_name: Optional[str] = None
+
 class TraceSpan(BaseModel):
     span_id: str
     trace_id: str
-    function: Optional[str] = None
+    function: str
     depth: int
     created_at: Optional[Any] = None
     parent_span_id: Optional[str] = None
     span_type: Optional[str] = "span"
     inputs: Optional[Dict[str, Any]] = None
+    error: Optional[Dict[str, Any]] = None
     output: Optional[Any] = None
+    usage: Optional[TraceUsage] = None
     duration: Optional[float] = None
     annotation: Optional[List[Dict[str, Any]]] = None
     evaluation_runs: Optional[List[EvaluationRun]] = []
-    expected_tools: Optional[List[Dict[str, Any]]] = None
+    expected_tools: Optional[List[Tool]] = None
     additional_metadata: Optional[Dict[str, Any]] = None
+    has_evaluation: Optional[bool] = False
+    agent_name: Optional[str] = None

     def model_dump(self, **kwargs):
         return {
             "span_id": self.span_id,
             "trace_id": self.trace_id,
             "depth": self.depth,
-            # "created_at": datetime.fromtimestamp(self.created_at).isoformat(),
             "created_at": datetime.fromtimestamp(self.created_at, tz=timezone.utc).isoformat(),
-            "inputs": self._serialize_inputs(),
-            "output": self._serialize_output(),
+            "inputs": self._serialize_value(self.inputs),
+            "output": self._serialize_value(self.output),
+            "error": self._serialize_value(self.error),
             "evaluation_runs": [run.model_dump() for run in self.evaluation_runs] if self.evaluation_runs else [],
             "parent_span_id": self.parent_span_id,
             "function": self.function,
             "duration": self.duration,
-            "span_type": self.span_type
+            "span_type": self.span_type,
+            "usage": self.usage.model_dump() if self.usage else None,
+            "has_evaluation": self.has_evaluation,
+            "agent_name": self.agent_name
         }

     def print_span(self):
@@ -41,30 +58,6 @@ class TraceSpan(BaseModel):
         indent = " " * self.depth
         parent_info = f" (parent_id: {self.parent_span_id})" if self.parent_span_id else ""
         print(f"{indent}→ {self.function} (id: {self.span_id}){parent_info}")
-
-    def _serialize_inputs(self) -> dict:
-        """Helper method to serialize input data safely."""
-        if self.inputs is None:
-            return {}
-
-        serialized_inputs = {}
-        for key, value in self.inputs.items():
-            if isinstance(value, BaseModel):
-                serialized_inputs[key] = value.model_dump()
-            elif isinstance(value, (list, tuple)):
-                # Handle lists/tuples of arguments
-                serialized_inputs[key] = [
-                    item.model_dump() if isinstance(item, BaseModel)
-                    else None if not self._is_json_serializable(item)
-                    else item
-                    for item in value
-                ]
-            else:
-                if self._is_json_serializable(value):
-                    serialized_inputs[key] = value
-                else:
-                    serialized_inputs[key] = self.safe_stringify(value, self.function)
-        return serialized_inputs

     def _is_json_serializable(self, obj: Any) -> bool:
         """Helper method to check if an object is JSON serializable."""
@@ -87,15 +80,11 @@ class TraceSpan(BaseModel):
             return repr(output)
         except (TypeError, OverflowError, ValueError):
             pass
-
-        warnings.warn(
-            f"Output for function {function_name} is not JSON serializable and could not be converted to string. Setting to None."
-        )
         return None

-    def _serialize_output(self) -> Any:
-        """Helper method to serialize output data safely."""
-        if self.output is None:
+    def _serialize_value(self, value: Any) -> Any:
+        """Helper method to deep serialize a value safely supporting Pydantic Models / regular PyObjects."""
+        if value is None:
             return None

         def serialize_value(value):
@@ -116,8 +105,8 @@ class TraceSpan(BaseModel):
            # Fallback to safe stringification
            return self.safe_stringify(value, self.function)

-        # Start serialization with the top-level output
-        return serialize_value(self.output)
+        # Start serialization with the top-level value
+        return serialize_value(value)

 class Trace(BaseModel):
     trace_id: str
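`TraceSpan.model_dump` now serializes token usage, error payloads, and the agent name alongside the existing fields, and the former `_serialize_inputs`/`_serialize_output` pair is collapsed into a single `_serialize_value` helper. A rough sketch of the resulting payload (field values are illustrative; `created_at` is a Unix timestamp, as implied by `datetime.fromtimestamp` above):

import time
from judgeval.data import TraceSpan, TraceUsage

span = TraceSpan(
    span_id="span-1",
    trace_id="trace-1",
    function="call_llm",   # now a required field
    depth=0,
    created_at=time.time(),
    usage=TraceUsage(prompt_tokens=12, completion_tokens=30, total_tokens=42, model_name="gpt-4.1"),
)
payload = span.model_dump()
# payload["usage"] is the TraceUsage dict; payload["error"] is None and
# payload["has_evaluation"] is False when those fields are left at their defaults.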
@@ -1,4 +1,3 @@
-
 from pydantic import BaseModel
 from typing import List, Optional, Dict, Any, Union, Callable
 from judgeval.data import Trace
@@ -22,6 +21,7 @@ class TraceRun(BaseModel):
         judgment_api_key (Optional[str]): The API key for running evaluations on the Judgment API
         rules (Optional[List[Rule]]): Rules to evaluate against scoring results
         append (Optional[bool]): Whether to append to existing evaluation results
+        tools (Optional[List[Dict[str, Any]]]): List of tools to use for evaluation
     """

     # The user will specify whether they want log_results when they call run_eval
@@ -40,6 +40,7 @@ class TraceRun(BaseModel):
     judgment_api_key: Optional[str] = ""
     override: Optional[bool] = False
     rules: Optional[List[Rule]] = None
+    tools: Optional[List[Dict[str, Any]]] = None

     class Config:
         arbitrary_types_allowed = True
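The new `tools` field is typed only as `Optional[List[Dict[str, Any]]]`, so no dictionary shape is enforced here; the client code further below simply forwards its `tools` keyword into this field. An illustrative (assumed) shape mirroring the `Tool` model's field names:

tools = [{"tool_name": "get_weather", "parameters": {"city": "Paris"}}]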
@@ -1,5 +1,5 @@
 from typing import List, Optional, Dict, Any, Union
-from pydantic import BaseModel, field_validator
+from pydantic import BaseModel, field_validator, Field

 from judgeval.data import Example, CustomExample
 from judgeval.scorers import JudgevalScorer, APIJudgmentScorer
@@ -27,12 +27,12 @@ class EvaluationRun(BaseModel):
     # The user will specify whether they want log_results when they call run_eval
     log_results: bool = False # NOTE: log_results has to be set first because it is used to validate project_name and eval_name
     organization_id: Optional[str] = None
-    project_name: Optional[str] = None
-    eval_name: Optional[str] = None
+    project_name: Optional[str] = Field(default=None, validate_default=True)
+    eval_name: Optional[str] = Field(default=None, validate_default=True)
     examples: Union[List[Example], List[CustomExample]]
     scorers: List[Union[APIJudgmentScorer, JudgevalScorer]]
     model: Optional[Union[str, List[str], JudgevalJudge]] = "gpt-4.1"
-    aggregator: Optional[str] = None
+    aggregator: Optional[str] = Field(default=None, validate_default=True)
     metadata: Optional[Dict[str, Any]] = None
     trace_span_id: Optional[str] = None
     # API Key will be "" until user calls client.run_eval(), then API Key will be set
@@ -96,9 +96,6 @@ class EvaluationRun(BaseModel):
     def validate_scorers(cls, v):
         if not v:
             raise ValueError("Scorers cannot be empty.")
-        for s in v:
-            if not isinstance(s, APIJudgmentScorer) and not isinstance(s, JudgevalScorer):
-                raise ValueError(f"Invalid type for Scorer: {type(s)}")
         return v

     @field_validator('model')
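Switching these fields to `Field(default=None, validate_default=True)` means their validators now run even when the default `None` is used; in pydantic v2, field validators are otherwise skipped for defaults. A small, self-contained sketch of that mechanism (this is not the actual EvaluationRun class, just an illustration of the pydantic behaviour):

from typing import Optional
from pydantic import BaseModel, Field, field_validator

class Run(BaseModel):
    log_results: bool = False
    # validate_default=True forces the validator below to run for the default None too
    project_name: Optional[str] = Field(default=None, validate_default=True)

    @field_validator("project_name")
    @classmethod
    def check_project_name(cls, v, info):
        if info.data.get("log_results") and v is None:
            raise ValueError("project_name is required when log_results is True")
        return v

Run()  # fine: log_results defaults to False, validator still runs on the default
try:
    Run(log_results=True)  # the default None for project_name is now validated too
except Exception as err:
    print(type(err).__name__)  # ValidationError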
@@ -5,6 +5,7 @@ import os
 from uuid import uuid4
 from typing import Optional, List, Dict, Any, Union, Callable
 import requests
+import asyncio

 from judgeval.constants import ROOT_API
 from judgeval.data.datasets import EvalDataset, EvalDatasetClient
@@ -121,7 +122,8 @@ class JudgmentClient(metaclass=SingletonMeta):
         ignore_errors: bool = True,
         rules: Optional[List[Rule]] = None,
         function: Optional[Callable] = None,
-        tracer: Optional[Union[Tracer, BaseCallbackHandler]] = None
+        tracer: Optional[Union[Tracer, BaseCallbackHandler]] = None,
+        tools: Optional[List[Dict[str, Any]]] = None
     ) -> List[ScoringResult]:
         try:

@@ -151,6 +153,7 @@ class JudgmentClient(metaclass=SingletonMeta):
                 append=append,
                 judgment_api_key=self.judgment_api_key,
                 organization_id=self.organization_id,
+                tools=tools
             )
             return run_trace_eval(trace_run, override, ignore_errors, function, tracer, examples)
         except ValueError as e:
@@ -173,7 +176,7 @@ class JudgmentClient(metaclass=SingletonMeta):
         ignore_errors: bool = True,
         async_execution: bool = False,
         rules: Optional[List[Rule]] = None
-    ) -> List[ScoringResult]:
+    ) -> Union[List[ScoringResult], asyncio.Task]:
         """
         Executes an evaluation of `Example`s using one or more `Scorer`s

@@ -494,7 +497,9 @@ class JudgmentClient(metaclass=SingletonMeta):
         override: bool = False,
         rules: Optional[List[Rule]] = None,
         function: Optional[Callable] = None,
-        tracer: Optional[Union[Tracer, BaseCallbackHandler]] = None
+        tracer: Optional[Union[Tracer, BaseCallbackHandler]] = None,
+        tools: Optional[List[Dict[str, Any]]] = None,
+        async_execution: bool = False
     ) -> None:
         """
         Asserts a test by running the evaluation and checking the results for success
@@ -512,6 +517,14 @@ class JudgmentClient(metaclass=SingletonMeta):
             override (bool): Whether to override an existing evaluation run with the same name
             rules (Optional[List[Rule]]): Rules to evaluate against scoring results
         """
+
+        # Check for enable_param_checking and tools
+        for scorer in scorers:
+            if hasattr(scorer, "kwargs") and scorer.kwargs is not None:
+                if scorer.kwargs.get("enable_param_checking") is True:
+                    if not tools:
+                        raise ValueError(f"You must provide the 'tools' argument to assert_test when using a scorer with enable_param_checking=True. If you do not want to do param checking, explicitly set enable_param_checking=False for the {scorer.__name__} scorer.")
+
         # Validate that exactly one of examples or test_file is provided
         if (examples is None and test_file is None) or (examples is not None and test_file is not None):
             raise ValueError("Exactly one of 'examples' or 'test_file' must be provided, but not both")
@@ -529,7 +542,8 @@ class JudgmentClient(metaclass=SingletonMeta):
                 rules=rules,
                 function=function,
                 tracer=tracer,
-                test_file=test_file
+                test_file=test_file,
+                tools=tools
             )
         else:
             results = self.run_evaluation(
@@ -542,7 +556,14 @@ class JudgmentClient(metaclass=SingletonMeta):
                 project_name=project_name,
                 eval_run_name=eval_run_name,
                 override=override,
-                rules=rules
+                rules=rules,
+                async_execution=async_execution
             )

-        assert_test(results)
+        if async_execution:
+            # 'results' is an asyncio.Task here, awaiting it gives List[ScoringResult]
+            actual_results = asyncio.run(results)
+            assert_test(actual_results) # Call the synchronous imported function
+        else:
+            # 'results' is already List[ScoringResult] here (synchronous path)
+            assert_test(results) # Call the synchronous imported function
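Taken together, `assert_test` now accepts `tools` (required whenever a scorer is configured with `enable_param_checking=True`, per the check above) and an `async_execution` flag, in which case `run_evaluation` returns an `asyncio.Task` that `assert_test` resolves with `asyncio.run`. A hedged usage sketch; the scorer choice, example values, and import paths are assumptions, and running it requires valid Judgment API credentials:

from judgeval import JudgmentClient          # assumed public import path
from judgeval.data import Example
from judgeval.scorers import FaithfulnessScorer   # placeholder for whichever scorer you run

client = JudgmentClient()

example = Example(
    input="What's the weather in Paris?",
    actual_output="It is 20 degrees and sunny.",
    retrieval_context=["Paris forecast: 20C, clear skies."],
    tools_called=["get_weather"],
)

client.assert_test(
    examples=[example],
    scorers=[FaithfulnessScorer(threshold=0.5)],
    project_name="demo-project",
    eval_run_name="demo-run",
    # Required whenever a scorer sets enable_param_checking=True; the dict shape is illustrative.
    tools=[{"tool_name": "get_weather", "parameters": {"city": "Paris"}}],
    async_execution=True,  # run_evaluation returns an asyncio.Task; assert_test resolves it
)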