judgeval 0.0.44__py3-none-any.whl → 0.0.46__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/__init__.py +5 -4
- judgeval/clients.py +6 -6
- judgeval/common/__init__.py +7 -2
- judgeval/common/exceptions.py +2 -3
- judgeval/common/logger.py +74 -49
- judgeval/common/s3_storage.py +30 -23
- judgeval/common/tracer.py +1273 -939
- judgeval/common/utils.py +416 -244
- judgeval/constants.py +73 -61
- judgeval/data/__init__.py +1 -1
- judgeval/data/custom_example.py +3 -2
- judgeval/data/datasets/dataset.py +80 -54
- judgeval/data/datasets/eval_dataset_client.py +131 -181
- judgeval/data/example.py +67 -43
- judgeval/data/result.py +11 -9
- judgeval/data/scorer_data.py +4 -2
- judgeval/data/tool.py +25 -16
- judgeval/data/trace.py +57 -29
- judgeval/data/trace_run.py +5 -11
- judgeval/evaluation_run.py +22 -82
- judgeval/integrations/langgraph.py +546 -184
- judgeval/judges/base_judge.py +1 -2
- judgeval/judges/litellm_judge.py +33 -11
- judgeval/judges/mixture_of_judges.py +128 -78
- judgeval/judges/together_judge.py +22 -9
- judgeval/judges/utils.py +14 -5
- judgeval/judgment_client.py +259 -271
- judgeval/rules.py +169 -142
- judgeval/run_evaluation.py +462 -305
- judgeval/scorers/api_scorer.py +20 -11
- judgeval/scorers/exceptions.py +1 -0
- judgeval/scorers/judgeval_scorer.py +77 -58
- judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +46 -15
- judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +3 -2
- judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +3 -2
- judgeval/scorers/judgeval_scorers/api_scorers/classifier_scorer.py +12 -11
- judgeval/scorers/judgeval_scorers/api_scorers/comparison.py +7 -5
- judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py +3 -2
- judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py +3 -2
- judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py +5 -2
- judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py +2 -1
- judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +17 -8
- judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +3 -2
- judgeval/scorers/judgeval_scorers/api_scorers/groundedness.py +3 -2
- judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +3 -2
- judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +3 -2
- judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py +8 -9
- judgeval/scorers/judgeval_scorers/api_scorers/summarization.py +4 -4
- judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +5 -5
- judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +5 -2
- judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py +9 -10
- judgeval/scorers/prompt_scorer.py +48 -37
- judgeval/scorers/score.py +86 -53
- judgeval/scorers/utils.py +11 -7
- judgeval/tracer/__init__.py +1 -1
- judgeval/utils/alerts.py +23 -12
- judgeval/utils/{data_utils.py → file_utils.py} +5 -9
- judgeval/utils/requests.py +29 -0
- judgeval/version_check.py +5 -2
- {judgeval-0.0.44.dist-info → judgeval-0.0.46.dist-info}/METADATA +79 -135
- judgeval-0.0.46.dist-info/RECORD +69 -0
- judgeval-0.0.44.dist-info/RECORD +0 -68
- {judgeval-0.0.44.dist-info → judgeval-0.0.46.dist-info}/WHEEL +0 -0
- {judgeval-0.0.44.dist-info → judgeval-0.0.46.dist-info}/licenses/LICENSE.md +0 -0
judgeval/data/result.py
CHANGED
@@ -1,6 +1,5 @@
-from
-from
-from judgeval.common.logger import debug, error
+from typing import List, Optional, Union
+from judgeval.common.logger import debug
 from pydantic import BaseModel
 from judgeval.data import ScorerData, Example, CustomExample
 from judgeval.data.trace import TraceSpan
@@ -12,13 +11,14 @@ class ScoringResult(BaseModel):
     Ie: One input, one actual_output, one expected_output, etc..., and 1+ scorer (Faithfulness, Hallucination, Summarization, etc...)

     Args:
-        success (bool): Whether the evaluation was successful.
+        success (bool): Whether the evaluation was successful.
             This means that all scorers applied to this example returned a success.
         scorer_data (List[ScorerData]): The scorers data for the evaluated example
         data_object (Optional[Example]): The original example object that was used to create the ScoringResult, can be Example, CustomExample (future), WorkflowRun (future)
-
+
     """
-
+
+    # Fields for scoring outputs
     success: bool # used for unit testing
     scorers_data: Union[List[ScorerData], None]
     name: Optional[str] = None
@@ -26,16 +26,18 @@ class ScoringResult(BaseModel):
     # The original example object that was used to create the ScoringResult
     data_object: Optional[Union[TraceSpan, CustomExample, Example]] = None
     trace_id: Optional[str] = None
-
+
     # Additional fields for internal use
     run_duration: Optional[float] = None
     evaluation_cost: Optional[float] = None
-
+
     def to_dict(self) -> dict:
         """Convert the ScoringResult instance to a dictionary, properly serializing scorer_data."""
         return {
             "success": self.success,
-            "scorers_data": [scorer_data.to_dict() for scorer_data in self.scorers_data]
+            "scorers_data": [scorer_data.to_dict() for scorer_data in self.scorers_data]
+            if self.scorers_data
+            else None,
             "data_object": self.data_object.to_dict() if self.data_object else None,
         }

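
Note on the to_dict change above: serializing scorers_data is now guarded, so a result with no scorer data serializes to None rather than failing when the list is iterated. A minimal, self-contained sketch of the same guard pattern (the ResultSketch/ItemSketch models are illustrative, not the judgeval classes):

from typing import List, Optional
from pydantic import BaseModel


class ItemSketch(BaseModel):
    name: str
    score: float

    def to_dict(self) -> dict:
        return {"name": self.name, "score": self.score}


class ResultSketch(BaseModel):
    success: bool
    items: Optional[List[ItemSketch]] = None

    def to_dict(self) -> dict:
        # Serialize the optional list only when it is present, mirroring the
        # None-guard added to ScoringResult.to_dict in this release.
        return {
            "success": self.success,
            "items": [item.to_dict() for item in self.items] if self.items else None,
        }


print(ResultSketch(success=True).to_dict())
# {'success': True, 'items': None}
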
judgeval/data/scorer_data.py
CHANGED
@@ -9,6 +9,7 @@ from pydantic import BaseModel

 from judgeval.scorers import JudgevalScorer

+
 class ScorerData(BaseModel):
     """
     ScorerData holds the information related to a single, completed Scorer evaluation run.
@@ -18,13 +19,14 @@ class ScorerData(BaseModel):
     information surrounding the evaluation run such as the claims and verdicts generated by the
     judge model(s).
     """
+
     name: str
     threshold: float
     success: bool
     score: Optional[float] = None
     reason: Optional[str] = None
     strict_mode: Optional[bool] = None
-    evaluation_model: Union[List[str], str] = None
+    evaluation_model: Union[List[str], str] | None = None
     error: Optional[str] = None
     evaluation_cost: Union[float, None] = None
     verbose_logs: Optional[str] = None
@@ -43,7 +45,7 @@ class ScorerData(BaseModel):
             "error": self.error,
             "evaluation_cost": self.evaluation_cost,
             "verbose_logs": self.verbose_logs,
-            "additional_metadata": self.additional_metadata
+            "additional_metadata": self.additional_metadata,
         }

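
The new evaluation_model annotation above mixes typing.Union with the PEP 604 `|` operator, which only works on Python 3.10+ (older interpreters raise TypeError when `|` is applied to a typing.Union object). A small sketch of the equivalent Optional[...] spelling (illustrative model, not judgeval code):

from typing import List, Optional, Union
from pydantic import BaseModel


class ScorerDataSketch(BaseModel):
    # `Union[List[str], str] | None` (as written in the diff) requires Python 3.10+.
    # This Optional[...] spelling is equivalent and also works on older interpreters.
    evaluation_model: Optional[Union[List[str], str]] = None


print(ScorerDataSketch(evaluation_model=["model-a", "model-b"]).evaluation_model)
print(ScorerDataSketch().evaluation_model)  # None
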
judgeval/data/tool.py
CHANGED
@@ -2,6 +2,7 @@ from pydantic import BaseModel, field_validator
 from typing import Dict, Any, Optional, List
 import warnings

+
 class Tool(BaseModel):
     tool_name: str
     parameters: Optional[Dict[str, Any]] = None
@@ -9,39 +10,47 @@ class Tool(BaseModel):
     result_dependencies: Optional[List[Dict[str, Any]]] = None
     action_dependencies: Optional[List[Dict[str, Any]]] = None
     require_all: Optional[bool] = None
-
-    @field_validator(
+
+    @field_validator("tool_name")
     def validate_tool_name(cls, v):
         if not v:
             warnings.warn("Tool name is empty or None", UserWarning)
         return v
-
-    @field_validator(
+
+    @field_validator("parameters")
     def validate_parameters(cls, v):
         if v is not None and not isinstance(v, dict):
-            warnings.warn(
+            warnings.warn(
+                f"Parameters should be a dictionary, got {type(v)}", UserWarning
+            )
         return v
-
-    @field_validator(
+
+    @field_validator("agent_name")
     def validate_agent_name(cls, v):
         if v is not None and not isinstance(v, str):
             warnings.warn(f"Agent name should be a string, got {type(v)}", UserWarning)
         return v
-
-    @field_validator(
+
+    @field_validator("result_dependencies")
     def validate_result_dependencies(cls, v):
         if v is not None and not isinstance(v, list):
-            warnings.warn(
+            warnings.warn(
+                f"Result dependencies should be a list, got {type(v)}", UserWarning
+            )
        return v
-
-    @field_validator(
+
+    @field_validator("action_dependencies")
     def validate_action_dependencies(cls, v):
         if v is not None and not isinstance(v, list):
-            warnings.warn(
+            warnings.warn(
+                f"Action dependencies should be a list, got {type(v)}", UserWarning
+            )
         return v

-    @field_validator(
+    @field_validator("require_all")
     def validate_require_all(cls, v):
         if v is not None and not isinstance(v, bool):
-            warnings.warn(
-
+            warnings.warn(
+                f"Require all should be a boolean, got {type(v)}", UserWarning
+            )
+        return v
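
The Tool validators above warn instead of raising, so malformed tool metadata still constructs the model. A minimal sketch of that warn-don't-raise field_validator pattern with pydantic v2 (hypothetical ToolSketch model, not the shipped class):

import warnings
from typing import Any, Dict, Optional
from pydantic import BaseModel, field_validator


class ToolSketch(BaseModel):
    tool_name: str
    parameters: Optional[Dict[str, Any]] = None

    @field_validator("tool_name")
    def validate_tool_name(cls, v):
        # Warn rather than raise, so an empty name still constructs the model.
        if not v:
            warnings.warn("Tool name is empty or None", UserWarning)
        return v


with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    tool = ToolSketch(tool_name="")  # emits a UserWarning instead of failing validation
    print(tool.tool_name == "", [str(w.message) for w in caught])
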
judgeval/data/trace.py
CHANGED
@@ -1,10 +1,12 @@
-from pydantic import BaseModel
+from pydantic import BaseModel, Field
 from typing import Optional, Dict, Any, List
 from judgeval.evaluation_run import EvaluationRun
 from judgeval.data.tool import Tool
 import json
+import sys
 from datetime import datetime, timezone

+
 class TraceUsage(BaseModel):
     prompt_tokens: Optional[int] = None
     completion_tokens: Optional[int] = None
@@ -14,6 +16,7 @@ class TraceUsage(BaseModel):
     total_cost_usd: Optional[float] = None
     model_name: Optional[str] = None

+
 class TraceSpan(BaseModel):
     span_id: str
     trace_id: str
@@ -41,11 +44,15 @@ class TraceSpan(BaseModel):
             "span_id": self.span_id,
             "trace_id": self.trace_id,
             "depth": self.depth,
-            "created_at": datetime.fromtimestamp(
+            "created_at": datetime.fromtimestamp(
+                self.created_at, tz=timezone.utc
+            ).isoformat(),
             "inputs": self._serialize_value(self.inputs),
             "output": self._serialize_value(self.output),
             "error": self._serialize_value(self.error),
-            "evaluation_runs": [run.model_dump() for run in self.evaluation_runs]
+            "evaluation_runs": [run.model_dump() for run in self.evaluation_runs]
+            if self.evaluation_runs
+            else [],
             "parent_span_id": self.parent_span_id,
             "function": self.function,
             "duration": self.duration,
@@ -55,13 +62,15 @@ class TraceSpan(BaseModel):
             "agent_name": self.agent_name,
             "state_before": self.state_before,
             "state_after": self.state_after,
-            "additional_metadata": self._serialize_value(self.additional_metadata)
+            "additional_metadata": self._serialize_value(self.additional_metadata),
         }
-
+
     def print_span(self):
         """Print the span with proper formatting and parent relationship information."""
         indent = " " * self.depth
-        parent_info =
+        parent_info = (
+            f" (parent_id: {self.parent_span_id})" if self.parent_span_id else ""
+        )
         print(f"{indent}→ {self.function} (id: {self.span_id}){parent_info}")

     def _is_json_serializable(self, obj: Any) -> bool:
@@ -80,38 +89,56 @@ class TraceSpan(BaseModel):
             return str(output)
         except (TypeError, OverflowError, ValueError):
             pass
-
+
         try:
             return repr(output)
         except (TypeError, OverflowError, ValueError):
             pass
         return None
-
+
     def _serialize_value(self, value: Any) -> Any:
         """Helper method to deep serialize a value safely supporting Pydantic Models / regular PyObjects."""
         if value is None:
             return None
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+        recursion_limit = sys.getrecursionlimit()
+        recursion_limit = int(recursion_limit * 0.75)
+
+        def serialize_value(value, current_depth=0):
+            try:
+                if current_depth > recursion_limit:
+                    return {"error": "max_depth_reached: " + type(value).__name__}
+
+                if isinstance(value, BaseModel):
+                    return value.model_dump()
+                elif isinstance(value, dict):
+                    # Recursively serialize dictionary values
+                    return {
+                        k: serialize_value(v, current_depth + 1)
+                        for k, v in value.items()
+                    }
+                elif isinstance(value, (list, tuple)):
+                    # Recursively serialize list/tuple items
+                    return [serialize_value(item, current_depth + 1) for item in value]
+                else:
+                    # Try direct JSON serialization first
+                    try:
+                        json.dumps(value)
+                        return value
+                    except (TypeError, OverflowError, ValueError):
+                        # Fallback to safe stringification
+                        return self.safe_stringify(value, self.function)
+                    except Exception:
+                        return {"error": "Unable to serialize"}
+            except Exception:
+                return {"error": "Unable to serialize"}

         # Start serialization with the top-level value
-
+        try:
+            return serialize_value(value, current_depth=0)
+        except Exception:
+            return {"error": "Unable to serialize"}
+

 class Trace(BaseModel):
     trace_id: str
@@ -121,6 +148,7 @@ class Trace(BaseModel):
     trace_spans: List[TraceSpan]
     overwrite: bool = False
     offline_mode: bool = False
-    rules:
+    rules: Dict[str, Any] = Field(default_factory=dict)
     has_notification: Optional[bool] = False
-
+    customer_id: Optional[str] = None
+    tags: List[str] = Field(default_factory=list)
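
The new TraceSpan._serialize_value above replaces the previous implementation with a depth-limited recursive walk: the depth cap is 75% of sys.getrecursionlimit(), pydantic models are dumped, dicts/lists/tuples are recursed, and anything that is not JSON-serializable falls back to stringification. A standalone sketch of the same pattern (function and model names here are illustrative, not the judgeval API):

import json
import sys
from typing import Any

from pydantic import BaseModel

# Cap traversal depth below the interpreter's recursion limit, as in the new
# TraceSpan._serialize_value above (75% of sys.getrecursionlimit()).
MAX_DEPTH = int(sys.getrecursionlimit() * 0.75)


def serialize(value: Any, depth: int = 0) -> Any:
    """Best-effort JSON-safe serialization of nested pydantic models and containers."""
    if depth > MAX_DEPTH:
        return {"error": "max_depth_reached: " + type(value).__name__}
    if isinstance(value, BaseModel):
        return value.model_dump()
    if isinstance(value, dict):
        return {k: serialize(v, depth + 1) for k, v in value.items()}
    if isinstance(value, (list, tuple)):
        return [serialize(item, depth + 1) for item in value]
    try:
        json.dumps(value)          # already JSON-serializable?
        return value
    except (TypeError, OverflowError, ValueError):
        return repr(value)         # fallback stringification


class Usage(BaseModel):
    prompt_tokens: int = 0


print(serialize({"usage": Usage(prompt_tokens=3), "raw": {1, 2}}))
# {'usage': {'prompt_tokens': 3}, 'raw': '{1, 2}'}
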
judgeval/data/trace_run.py
CHANGED
@@ -1,22 +1,20 @@
 from pydantic import BaseModel
-from typing import List, Optional, Dict, Any, Union
+from typing import List, Optional, Dict, Any, Union
 from judgeval.data import Trace
 from judgeval.scorers import APIJudgmentScorer, JudgevalScorer
-from judgeval.judges import JudgevalJudge
 from judgeval.rules import Rule


 class TraceRun(BaseModel):
     """
     Stores example and evaluation scorers together for running an eval task
-
-    Args:
+
+    Args:
         project_name (str): The name of the project the evaluation results belong to
         eval_name (str): A name for this evaluation run
         traces (List[Trace]): The traces to evaluate
         scorers (List[Union[JudgmentScorer, JudgevalScorer]]): A list of scorers to use for evaluation
         model (str): The model used as a judge when using LLM as a Judge
-        aggregator (Optional[str]): The aggregator to use for evaluation if using Mixture of Judges
         metadata (Optional[Dict[str, Any]]): Additional metadata to include for this evaluation run, e.g. comments, dataset name, purpose, etc.
         judgment_api_key (Optional[str]): The API key for running evaluations on the Judgment API
         rules (Optional[List[Rule]]): Rules to evaluate against scoring results
@@ -24,16 +22,12 @@ class TraceRun(BaseModel):
         tools (Optional[List[Dict[str, Any]]]): List of tools to use for evaluation
     """

-    # The user will specify whether they want log_results when they call run_eval
-    log_results: bool = False # NOTE: log_results has to be set first because it is used to validate project_name and eval_name
     organization_id: Optional[str] = None
     project_name: Optional[str] = None
     eval_name: Optional[str] = None
     traces: Optional[List[Trace]] = None
     scorers: List[Union[APIJudgmentScorer, JudgevalScorer]]
-    model: Optional[
-    aggregator: Optional[str] = None
-    metadata: Optional[Dict[str, Any]] = None
+    model: Optional[str] = "gpt-4.1"
     trace_span_id: Optional[str] = None
     append: Optional[bool] = False
     # API Key will be "" until user calls client.run_eval(), then API Key will be set
@@ -43,4 +37,4 @@ class TraceRun(BaseModel):
     tools: Optional[List[Dict[str, Any]]] = None

     class Config:
-        arbitrary_types_allowed = True
+        arbitrary_types_allowed = True
judgeval/evaluation_run.py
CHANGED
@@ -1,144 +1,84 @@
-from typing import List, Optional,
+from typing import List, Optional, Union
 from pydantic import BaseModel, field_validator, Field

 from judgeval.data import Example, CustomExample
 from judgeval.scorers import JudgevalScorer, APIJudgmentScorer
 from judgeval.constants import ACCEPTABLE_MODELS
-
-from judgeval.judges import JudgevalJudge
-from judgeval.rules import Rule
+

 class EvaluationRun(BaseModel):
     """
     Stores example and evaluation scorers together for running an eval task
-
-    Args:
+
+    Args:
         project_name (str): The name of the project the evaluation results belong to
         eval_name (str): A name for this evaluation run
         examples (Union[List[Example], List[CustomExample]]): The examples to evaluate
         scorers (List[Union[JudgmentScorer, JudgevalScorer]]): A list of scorers to use for evaluation
         model (str): The model used as a judge when using LLM as a Judge
-        aggregator (Optional[str]): The aggregator to use for evaluation if using Mixture of Judges
         metadata (Optional[Dict[str, Any]]): Additional metadata to include for this evaluation run, e.g. comments, dataset name, purpose, etc.
         judgment_api_key (Optional[str]): The API key for running evaluations on the Judgment API
-        rules (Optional[List[Rule]]): Rules to evaluate against scoring results
     """

-    # The user will specify whether they want log_results when they call run_eval
-    log_results: bool = False # NOTE: log_results has to be set first because it is used to validate project_name and eval_name
     organization_id: Optional[str] = None
     project_name: Optional[str] = Field(default=None, validate_default=True)
     eval_name: Optional[str] = Field(default=None, validate_default=True)
     examples: Union[List[Example], List[CustomExample]]
     scorers: List[Union[APIJudgmentScorer, JudgevalScorer]]
-    model: Optional[
-    aggregator: Optional[str] = Field(default=None, validate_default=True)
-    metadata: Optional[Dict[str, Any]] = None
+    model: Optional[str] = "gpt-4.1"
     trace_span_id: Optional[str] = None
     # API Key will be "" until user calls client.run_eval(), then API Key will be set
     judgment_api_key: Optional[str] = ""
     override: Optional[bool] = False
     append: Optional[bool] = False
-
-
+
     def model_dump(self, **kwargs):
         data = super().model_dump(**kwargs)

         data["scorers"] = [
-            scorer.to_dict()
-
+            scorer.to_dict()
+            if hasattr(scorer, "to_dict")
+            else scorer.model_dump()
+            if hasattr(scorer, "model_dump")
             else {"score_type": scorer.score_type, "threshold": scorer.threshold}
             for scorer in self.scorers
         ]

-        if self.rules:
-            # Process rules to ensure proper serialization
-            data["rules"] = [rule.model_dump() for rule in self.rules]
-
         return data

-    @field_validator(
-    def validate_log_results(cls, v):
-        if not isinstance(v, bool):
-            raise ValueError(f"log_results must be a boolean. Received {v} of type {type(v)}")
-        return v
-
-    @field_validator('project_name')
-    def validate_project_name(cls, v, values):
-        if values.data.get('log_results', False) and not v:
-            debug("No project name provided when log_results is True")
-            error("Validation failed: Project name required when logging results")
-            raise ValueError("Project name is required when log_results is True. Please include the project_name argument.")
-        return v
-
-    @field_validator('eval_name')
-    def validate_eval_name(cls, v, values):
-        if values.data.get('log_results', False) and not v:
-            debug("No eval name provided when log_results is True")
-            error("Validation failed: Eval name required when logging results")
-            raise ValueError("Eval name is required when log_results is True. Please include the eval_run_name argument.")
-        return v
-
-    @field_validator('examples')
+    @field_validator("examples")
     def validate_examples(cls, v):
         if not v:
             raise ValueError("Examples cannot be empty.")
-
+
         first_type = type(v[0])
         if first_type not in (Example, CustomExample):
             raise ValueError(f"Invalid type for Example/CustomExample: {first_type}")
         if not all(isinstance(ex, first_type) for ex in v):
-            raise ValueError(
-
+            raise ValueError(
+                "All examples must be of the same type, either all Example or all CustomExample."
+            )
+
         return v

-    @field_validator(
+    @field_validator("scorers")
     def validate_scorers(cls, v):
         if not v:
             raise ValueError("Scorers cannot be empty.")
         return v

-    @field_validator(
+    @field_validator("model")
     def validate_model(cls, v, values):
         if not v:
             raise ValueError("Model cannot be empty.")
-
-        # Check if model is a judgevalJudge
-        if isinstance(v, JudgevalJudge):
-            # Verify all scorers are JudgevalScorer when using judgevalJudge
-            scorers = values.data.get('scorers', [])
-            if not all(isinstance(s, JudgevalScorer) for s in scorers):
-                raise ValueError("When using a judgevalJudge model, all scorers must be JudgevalScorer type")
-            return v
-
+
         # Check if model is string or list of strings
         if isinstance(v, str):
             if v not in ACCEPTABLE_MODELS:
-                raise ValueError(
-
-
-        if isinstance(v, list):
-            if not all(isinstance(m, str) for m in v):
-                raise ValueError("When providing a list of models, all elements must be strings")
-            for m in v:
-                if m not in ACCEPTABLE_MODELS:
-                    raise ValueError(f"Model name {m} not recognized. Please select a valid model name.")
+                raise ValueError(
+                    f"Model name {v} not recognized. Please select a valid model name.)"
+                )
         return v
-        raise ValueError(f"Model must be one of: string, list of strings, or JudgevalJudge instance. Received type {type(v)}.")

-    @field_validator('aggregator', mode='before')
-    def validate_aggregator(cls, v, values):
-        model = values.data.get('model')
-        if isinstance(model, list) and v is None:
-            raise ValueError("Aggregator cannot be empty.")
-
-        if isinstance(model, list) and not isinstance(v, str):
-            raise ValueError("Aggregator must be a string if provided.")
-
-        if v is not None and v not in ACCEPTABLE_MODELS:
-            raise ValueError(f"Model name {v} not recognized.")
-
-        return v
-
     class Config:
         arbitrary_types_allowed = True
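
Across EvaluationRun and TraceRun, the judge model field is now a single string defaulting to "gpt-4.1" and checked against ACCEPTABLE_MODELS; the list-of-models branch, the aggregator field and its validator, and the JudgevalJudge path are removed. A minimal sketch of that simplified validation (the ACCEPTABLE_MODELS set below is an illustrative stand-in, not the real constant):

from typing import Optional
from pydantic import BaseModel, field_validator

# Stand-in for judgeval.constants.ACCEPTABLE_MODELS (illustrative values only).
ACCEPTABLE_MODELS = {"gpt-4.1", "gpt-4o-mini"}


class RunConfigSketch(BaseModel):
    # Same shape as the simplified EvaluationRun/TraceRun field: one judge-model
    # string with a default; no aggregator, no list of models, no judge objects.
    model: Optional[str] = "gpt-4.1"

    @field_validator("model")
    def validate_model(cls, v):
        if not v:
            raise ValueError("Model cannot be empty.")
        if v not in ACCEPTABLE_MODELS:
            raise ValueError(f"Model name {v} not recognized.")
        return v


print(RunConfigSketch().model)  # gpt-4.1
try:
    RunConfigSketch(model="not-a-model")
except Exception as exc:
    print(type(exc).__name__)  # ValidationError
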