judgeval 0.0.44__py3-none-any.whl → 0.0.46__py3-none-any.whl

This diff compares publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in those registries.
Files changed (64)
  1. judgeval/__init__.py +5 -4
  2. judgeval/clients.py +6 -6
  3. judgeval/common/__init__.py +7 -2
  4. judgeval/common/exceptions.py +2 -3
  5. judgeval/common/logger.py +74 -49
  6. judgeval/common/s3_storage.py +30 -23
  7. judgeval/common/tracer.py +1273 -939
  8. judgeval/common/utils.py +416 -244
  9. judgeval/constants.py +73 -61
  10. judgeval/data/__init__.py +1 -1
  11. judgeval/data/custom_example.py +3 -2
  12. judgeval/data/datasets/dataset.py +80 -54
  13. judgeval/data/datasets/eval_dataset_client.py +131 -181
  14. judgeval/data/example.py +67 -43
  15. judgeval/data/result.py +11 -9
  16. judgeval/data/scorer_data.py +4 -2
  17. judgeval/data/tool.py +25 -16
  18. judgeval/data/trace.py +57 -29
  19. judgeval/data/trace_run.py +5 -11
  20. judgeval/evaluation_run.py +22 -82
  21. judgeval/integrations/langgraph.py +546 -184
  22. judgeval/judges/base_judge.py +1 -2
  23. judgeval/judges/litellm_judge.py +33 -11
  24. judgeval/judges/mixture_of_judges.py +128 -78
  25. judgeval/judges/together_judge.py +22 -9
  26. judgeval/judges/utils.py +14 -5
  27. judgeval/judgment_client.py +259 -271
  28. judgeval/rules.py +169 -142
  29. judgeval/run_evaluation.py +462 -305
  30. judgeval/scorers/api_scorer.py +20 -11
  31. judgeval/scorers/exceptions.py +1 -0
  32. judgeval/scorers/judgeval_scorer.py +77 -58
  33. judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +46 -15
  34. judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +3 -2
  35. judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +3 -2
  36. judgeval/scorers/judgeval_scorers/api_scorers/classifier_scorer.py +12 -11
  37. judgeval/scorers/judgeval_scorers/api_scorers/comparison.py +7 -5
  38. judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py +3 -2
  39. judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py +3 -2
  40. judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py +5 -2
  41. judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py +2 -1
  42. judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +17 -8
  43. judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +3 -2
  44. judgeval/scorers/judgeval_scorers/api_scorers/groundedness.py +3 -2
  45. judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +3 -2
  46. judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +3 -2
  47. judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py +8 -9
  48. judgeval/scorers/judgeval_scorers/api_scorers/summarization.py +4 -4
  49. judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +5 -5
  50. judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +5 -2
  51. judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py +9 -10
  52. judgeval/scorers/prompt_scorer.py +48 -37
  53. judgeval/scorers/score.py +86 -53
  54. judgeval/scorers/utils.py +11 -7
  55. judgeval/tracer/__init__.py +1 -1
  56. judgeval/utils/alerts.py +23 -12
  57. judgeval/utils/{data_utils.py → file_utils.py} +5 -9
  58. judgeval/utils/requests.py +29 -0
  59. judgeval/version_check.py +5 -2
  60. {judgeval-0.0.44.dist-info → judgeval-0.0.46.dist-info}/METADATA +79 -135
  61. judgeval-0.0.46.dist-info/RECORD +69 -0
  62. judgeval-0.0.44.dist-info/RECORD +0 -68
  63. {judgeval-0.0.44.dist-info → judgeval-0.0.46.dist-info}/WHEEL +0 -0
  64. {judgeval-0.0.44.dist-info → judgeval-0.0.46.dist-info}/licenses/LICENSE.md +0 -0
judgeval/rules.py CHANGED
@@ -2,43 +2,39 @@
 Rules system for Judgeval that enables alerts based on metric thresholds.
 """
 
-from typing import Dict, List, Optional, Union, Any, Set, Tuple
+from typing import Dict, List, Optional, Union, Any, Tuple
 from pydantic import BaseModel, Field, field_validator, ConfigDict
-from enum import Enum
 import asyncio
 from concurrent.futures import ThreadPoolExecutor
-import time
 import uuid
-import os
-import re
-import json
-from datetime import datetime
 
 from judgeval.scorers import APIJudgmentScorer, JudgevalScorer
 from judgeval.utils.alerts import AlertStatus, AlertResult
 
+
 class Condition(BaseModel):
     """
     A single metric condition.
-
+
     Example:
         {
             "metric": FaithfulnessScorer(threshold=0.7)  # Must be a scorer object: APIJudgmentScorer, JudgevalScorer
         }
-
+
     The Condition class uses the scorer's threshold and success function internally.
     """
+
     model_config = ConfigDict(arbitrary_types_allowed=True)
-
+
     metric: Union[APIJudgmentScorer, JudgevalScorer]
 
     @property
     def metric_name(self) -> str:
         """Get the name of the metric for lookups in scores dictionary."""
-        if hasattr(self.metric, 'score_type'):
+        if hasattr(self.metric, "score_type"):
             # Handle APIJudgmentScorer and JudgevalScorer which have score_type
             return self.metric.score_type
-        elif hasattr(self.metric, '__name__'):
+        elif hasattr(self.metric, "__name__"):
             # Handle cases where metric has a __name__ attribute
             return self.metric.__name__
         # Fallback to string representation
@@ -47,7 +43,7 @@ class Condition(BaseModel):
     @property
     def threshold(self) -> float:
         """Get the threshold from the metric."""
-        return self.metric.threshold if hasattr(self.metric, 'threshold') else 0.5
+        return self.metric.threshold if hasattr(self.metric, "threshold") else 0.5
 
     def evaluate(self, value: float) -> bool:
         """
@@ -56,22 +52,23 @@ class Condition(BaseModel):
         Uses the scorer's success check function if available.
         """
         # Store the value in the scorer
-        if hasattr(self.metric, 'score'):
+        if hasattr(self.metric, "score"):
             self.metric.score = value
-
+
         # Use the scorer's success check function if available
-        if hasattr(self.metric, 'success_check'):
+        if hasattr(self.metric, "success_check"):
             return self.metric.success_check()
-        elif hasattr(self.metric, '_success_check'):
+        elif hasattr(self.metric, "_success_check"):
             return self.metric._success_check()
         else:
             # Fallback to default comparison (greater than or equal)
             return value >= self.threshold if self.threshold is not None else False
 
+
 class PagerDutyConfig(BaseModel):
     """
     Configuration for PagerDuty notifications.
-
+
     Attributes:
         routing_key: PagerDuty integration routing key
         severity: Severity level (critical, error, warning, info)
@@ -80,13 +77,14 @@ class PagerDutyConfig(BaseModel):
         group: Optional logical grouping for the alert
         class_type: Optional class/type of alert event
     """
+
     routing_key: str
     severity: str = "error"  # critical, error, warning, info
     source: str = "judgeval"
     component: Optional[str] = None
     group: Optional[str] = None
     class_type: Optional[str] = None
-
+
     def model_dump(self, **kwargs):
         """Convert the PagerDutyConfig to a dictionary for JSON serialization."""
         return {
@@ -95,13 +93,14 @@ class PagerDutyConfig(BaseModel):
             "source": self.source,
             "component": self.component,
             "group": self.group,
-            "class_type": self.class_type
+            "class_type": self.class_type,
         }
 
+
 class NotificationConfig(BaseModel):
     """
     Configuration for notifications when a rule is triggered.
-
+
     Example:
         {
             "enabled": true,
@@ -113,33 +112,37 @@ class NotificationConfig(BaseModel):
             },
             "send_at": 1632150000  # Unix timestamp (specific date/time)
         }
-
+
     Communication Methods:
     - "email": Send emails to specified email addresses
    - "broadcast_slack": Send broadcast notifications to all configured Slack channels
     - "broadcast_email": Send broadcast emails to all organization emails
     - "pagerduty": Send alerts to PagerDuty using the configured routing key
     """
+
     enabled: bool = True
     communication_methods: List[str] = []
     email_addresses: Optional[List[str]] = None
     pagerduty_config: Optional[PagerDutyConfig] = None
     send_at: Optional[int] = None  # Unix timestamp for scheduled notifications
-
+
     def model_dump(self, **kwargs):
         """Convert the NotificationConfig to a dictionary for JSON serialization."""
         return {
             "enabled": self.enabled,
             "communication_methods": self.communication_methods,
             "email_addresses": self.email_addresses,
-            "pagerduty_config": self.pagerduty_config.model_dump() if self.pagerduty_config else None,
-            "send_at": self.send_at
+            "pagerduty_config": self.pagerduty_config.model_dump()
+            if self.pagerduty_config
+            else None,
+            "send_at": self.send_at,
         }
 
+
 class Rule(BaseModel):
     """
     Configuration for a single rule.
-
+
     Example:
         {
             "rule_id": "123e4567-e89b-12d3-a456-426614174000",
@@ -157,34 +160,32 @@ class Rule(BaseModel):
             }
         }
     """
-    rule_id: str = Field(default_factory=lambda: str(uuid.uuid4()))  # Random UUID string as default value
+
+    rule_id: str = Field(
+        default_factory=lambda: str(uuid.uuid4())
+    )  # Random UUID string as default value
     name: str
     description: Optional[str] = None
     conditions: List[Condition]
     combine_type: str = Field(..., pattern="^(all|any)$")  # all = AND, any = OR
     notification: Optional[NotificationConfig] = None  # Configuration for notifications
-
 
     def model_dump(self, **kwargs):
         """
         Custom serialization that properly handles condition serialization.
         """
         data = super().model_dump(**kwargs)
-
+
         # Special handling for conditions with complex metric objects
         if "conditions" in data:
             for i, condition in enumerate(data["conditions"]):
                 if "metric" in condition:
                     # Get the actual metric object
                     metric_obj = self.conditions[i].metric
-
+
                     # Create standardized metric representation needed by server API
-                    metric_data = {
-                        "score_type": "",
-                        "threshold": 0.0,
-                        "name": ""
-                    }
-
+                    metric_data = {"score_type": "", "threshold": 0.0, "name": ""}
+
                     # First try to use object's own serialization methods
                     if hasattr(metric_obj, "to_dict"):
                         orig_data = metric_obj.to_dict()
@@ -196,61 +197,67 @@ class Rule(BaseModel):
                     # Copy any existing fields
                     for key, value in orig_data.items():
                         metric_data[key] = value
-
+
                     # If we already have data from original serialization methods but missing required fields
-                    if 'name' in metric_data and 'score_type' not in metric_data:
-                        metric_data['score_type'] = metric_data['name']
-
+                    if "name" in metric_data and "score_type" not in metric_data:
+                        metric_data["score_type"] = metric_data["name"]
+
                     # Ensure required fields have values by checking various sources
-                    if not metric_data['score_type']:
+                    if not metric_data["score_type"]:
                         # Try to get score_type from different possible attributes
-                        if hasattr(metric_obj, 'score_type'):
-                            metric_data['score_type'] = metric_obj.score_type
-                        elif hasattr(metric_obj, 'name'):
-                            metric_data['score_type'] = metric_obj.name
+                        if hasattr(metric_obj, "score_type"):
+                            metric_data["score_type"] = metric_obj.score_type
+                        elif hasattr(metric_obj, "name"):
+                            metric_data["score_type"] = metric_obj.name
                         else:
                             # Last resort: use string representation
-                            metric_data['score_type'] = str(metric_obj)
-
+                            metric_data["score_type"] = str(metric_obj)
+
                     # Make sure threshold is set
-                    if not metric_data.get('threshold') and metric_data.get('threshold') != 0.0:
-                        if hasattr(metric_obj, 'threshold'):
-                            metric_data['threshold'] = metric_obj.threshold
+                    if (
+                        not metric_data.get("threshold")
+                        and metric_data.get("threshold") != 0.0
+                    ):
+                        if hasattr(metric_obj, "threshold"):
+                            metric_data["threshold"] = metric_obj.threshold
                         else:
                             # Use condition threshold if metric doesn't have one
-                            metric_data['threshold'] = self.conditions[i].threshold
-
+                            metric_data["threshold"] = self.conditions[i].threshold
+
                     # Make sure name is set
-                    if not metric_data.get('name'):
-                        if hasattr(metric_obj, '__name__'):
-                            metric_data['name'] = metric_obj.__name__
-                        elif hasattr(metric_obj, 'name'):
-                            metric_data['name'] = metric_obj.name
+                    if not metric_data.get("name"):
+                        if hasattr(metric_obj, "__name__"):
+                            metric_data["name"] = metric_obj.__name__
+                        elif hasattr(metric_obj, "name"):
+                            metric_data["name"] = metric_obj.name
                         else:
                             # Fallback to score_type if available
-                            metric_data['name'] = metric_data.get('score_type', str(metric_obj))
-
+                            metric_data["name"] = metric_data.get(
+                                "score_type", str(metric_obj)
+                            )
+
                     # Update the condition with our properly serialized metric
                     condition["metric"] = metric_data
-
+
         return data
 
-    @field_validator('conditions')
+    @field_validator("conditions")
     def validate_conditions_not_empty(cls, v):
         if not v:
             raise ValueError("Conditions list cannot be empty")
         return v
 
-    @field_validator('combine_type')
+    @field_validator("combine_type")
     def validate_combine_type(cls, v):
         if v not in ["all", "any"]:
             raise ValueError(f"combine_type must be 'all' or 'any', got: {v}")
         return v
 
+
 class RulesEngine:
     """
     Engine for creating and evaluating rules against metrics.
-
+
     Example:
         ```python
         # Define rules
@@ -265,10 +272,10 @@ class RulesEngine:
                 combine_type="all"
             )
         }
-
+
         # Create rules engine
         engine = RulesEngine(rules)
-
+
         # Configure notifications
         engine.configure_notification(
             rule_id="1",
@@ -276,29 +283,33 @@ class RulesEngine:
            communication_methods=["slack", "email"],
            email_addresses=["user@example.com"]
         )
-
+
         # Evaluate rules
         scores = {"faithfulness": 0.65, "relevancy": 0.85}
         results = engine.evaluate_rules(scores, {"example_id": "example_123"})
         ```
     """
-
+
     def __init__(self, rules: Dict[str, Rule]):
         """
        Initialize the rules engine.
-
+
         Args:
             rules: Dictionary mapping rule IDs to Rule objects
         """
         self.rules = rules
 
-    def configure_notification(self, rule_id: str, enabled: bool = True,
-                               communication_methods: List[str] = None,
-                               email_addresses: List[str] = None,
-                               send_at: Optional[int] = None) -> None:
+    def configure_notification(
+        self,
+        rule_id: str,
+        enabled: bool = True,
+        communication_methods: List[str] | None = None,
+        email_addresses: List[str] | None = None,
+        send_at: Optional[int] = None,
+    ) -> None:
         """
         Configure notification settings for a specific rule.
-
+
         Args:
             rule_id: ID of the rule to configure notifications for
             enabled: Whether notifications are enabled for this rule
@@ -308,32 +319,35 @@ class RulesEngine:
         """
         if rule_id not in self.rules:
             raise ValueError(f"Rule ID '{rule_id}' not found")
-
+
         rule = self.rules[rule_id]
-
+
         # Create notification configuration if it doesn't exist
         if rule.notification is None:
             rule.notification = NotificationConfig()
-
+
         # Set notification parameters
         rule.notification.enabled = enabled
-
+
         if communication_methods is not None:
             rule.notification.communication_methods = communication_methods
-
+
         if email_addresses is not None:
             rule.notification.email_addresses = email_addresses
-
+
         if send_at is not None:
             rule.notification.send_at = send_at
-
-    def configure_all_notifications(self, enabled: bool = True,
-                                    communication_methods: List[str] = None,
-                                    email_addresses: List[str] = None,
-                                    send_at: Optional[int] = None) -> None:
+
+    def configure_all_notifications(
+        self,
+        enabled: bool = True,
+        communication_methods: List[str] | None = None,
+        email_addresses: List[str] | None = None,
+        send_at: Optional[int] = None,
+    ) -> None:
         """
         Configure notification settings for all rules.
-
+
         Args:
             enabled: Whether notifications are enabled
             communication_methods: List of notification methods (e.g., ["slack", "email"])
@@ -346,14 +360,18 @@ class RulesEngine:
                 enabled=enabled,
                 communication_methods=communication_methods,
                 email_addresses=email_addresses,
-                send_at=send_at
+                send_at=send_at,
             )
-
-    def evaluate_rules(self, scores: Dict[str, float], example_metadata: Optional[Dict[str, Any]] = None) -> Dict[str, AlertResult]:
+
+    def evaluate_rules(
+        self,
+        scores: Dict[str, float],
+        example_metadata: Optional[Dict[str, Any]] = None,
+    ) -> Dict[str, AlertResult]:
         """
         Evaluate all rules against a set of scores.
         Returns mapping of rule IDs to their alert results.
-
+
         Args:
             scores: Dictionary of metric names to their score values
             example_metadata: Optional dictionary containing example metadata (example_id, timestamp)
@@ -364,33 +382,37 @@ class RulesEngine:
             # Evaluate each condition
            condition_results = []
             passed_conditions = []
-
+
             for condition in rule.conditions:
                 # Get the metric name for lookup
                 metric_name = condition.metric_name
                 value = scores.get(metric_name)
-
+
                 if value is None:
                     # Skip this condition instead of evaluating it as false
-                    condition_results.append({
-                        "metric": metric_name,
-                        "value": None,
-                        "threshold": condition.threshold,
-                        "passed": None,  # Using None to indicate the condition was skipped
-                        "skipped": True  # Add a flag to indicate this condition was skipped
-                    })
+                    condition_results.append(
+                        {
+                            "metric": metric_name,
+                            "value": None,
+                            "threshold": condition.threshold,
+                            "passed": None,  # Using None to indicate the condition was skipped
+                            "skipped": True,  # Add a flag to indicate this condition was skipped
+                        }
+                    )
                     continue  # Skip adding to passed_conditions
                 else:
                     passed = condition.evaluate(value)
-                    condition_results.append({
-                        "metric": metric_name,
-                        "value": value,
-                        "threshold": condition.threshold,
-                        "passed": passed,
-                        "skipped": False  # Indicate this condition was evaluated
-                    })
+                    condition_results.append(
+                        {
+                            "metric": metric_name,
+                            "value": value,
+                            "threshold": condition.threshold,
+                            "passed": passed,
+                            "skipped": False,  # Indicate this condition was evaluated
+                        }
+                    )
                     passed_conditions.append(passed)
-
+
             # Determine if alert should trigger - only consider conditions that weren't skipped
             if not passed_conditions:
                 # If all conditions were skipped, the rule doesn't trigger
@@ -401,21 +423,23 @@ class RulesEngine:
                 # - All evaluated conditions must pass
                 # - All conditions must have been evaluated (none skipped)
                 all_conditions_passed = all(passed_conditions)
-                all_conditions_evaluated = len(passed_conditions) == len(rule.conditions)
+                all_conditions_evaluated = len(passed_conditions) == len(
+                    rule.conditions
+                )
                 triggered = all_conditions_passed and all_conditions_evaluated
             else:
                 # For "any" combine_type, at least one condition must pass
                 triggered = any(passed_conditions)
-
+
             # Create alert result with example metadata
             notification_config = None
             if triggered and rule.notification:
                 # If rule has a notification config and the alert is triggered, include it in the result
                 notification_config = rule.notification
-
+
             # Set the alert status based on whether the rule was triggered using proper enum values
             status = AlertStatus.TRIGGERED if triggered else AlertStatus.NOT_TRIGGERED
-
+
             # Create the alert result
             alert_result = AlertResult(
                 status=status,
@@ -425,26 +449,32 @@ class RulesEngine:
                 notification=notification_config,
                 metadata=example_metadata or {},
                 combine_type=rule.combine_type,
-                project_id=example_metadata.get("project_id") if example_metadata else None,
-                trace_span_id=example_metadata.get("trace_span_id") if example_metadata else None
+                project_id=example_metadata.get("project_id")
+                if example_metadata
+                else None,
+                trace_span_id=example_metadata.get("trace_span_id")
+                if example_metadata
+                else None,
             )
-
+
             results[rule_id] = alert_result
-
+
         return results
-
-    async def evaluate_rules_parallel(self,
-                                      example_scores: Dict[str, Dict[str, float]],
-                                      example_metadata: Dict[str, Dict[str, Any]],
-                                      max_concurrent: int = 100) -> Dict[str, Dict[str, AlertResult]]:
+
+    async def evaluate_rules_parallel(
+        self,
+        example_scores: Dict[str, Dict[str, float]],
+        example_metadata: Dict[str, Dict[str, Any]],
+        max_concurrent: int = 100,
+    ) -> Dict[str, Dict[str, AlertResult]]:
         """
         Evaluate all rules against multiple examples in parallel.
-
+
         Args:
             example_scores: Dictionary mapping example_ids to their score dictionaries
             example_metadata: Dictionary mapping example_ids to their metadata
             max_concurrent: Maximum number of concurrent evaluations
-
+
         Returns:
             Dictionary mapping example_ids to dictionaries of rule_ids and their alert results
         """
@@ -452,7 +482,7 @@ class RulesEngine:
         semaphore = asyncio.Semaphore(max_concurrent)
         results = {}
         tasks = []
-
+
         # Create a task for each example
         for example_id, scores in example_scores.items():
             metadata = example_metadata.get(example_id, {})
@@ -460,33 +490,35 @@ class RulesEngine:
                 semaphore=semaphore,
                 example_id=example_id,
                 scores=scores,
-                metadata=metadata
+                metadata=metadata,
             )
             tasks.append(task)
-
+
         # Run all tasks and collect results
         example_results = await asyncio.gather(*tasks)
-
+
         # Organize results by example_id
         for example_id, result in example_results:
             results[example_id] = result
-
+
         return results
-
-    async def _evaluate_with_semaphore(self,
-                                       semaphore: asyncio.Semaphore,
-                                       example_id: str,
-                                       scores: Dict[str, float],
-                                       metadata: Dict[str, Any]) -> Tuple[str, Dict[str, AlertResult]]:
+
+    async def _evaluate_with_semaphore(
+        self,
+        semaphore: asyncio.Semaphore,
+        example_id: str,
+        scores: Dict[str, float],
+        metadata: Dict[str, Any],
+    ) -> Tuple[str, Dict[str, AlertResult]]:
         """
         Helper method to evaluate rules for an example with semaphore control.
-
+
         Args:
             semaphore: Semaphore to control concurrency
             example_id: ID of the example being evaluated
             scores: Dictionary of scores for this example
            metadata: Metadata for this example
-
+
         Returns:
             Tuple of (example_id, rule_results)
         """
@@ -494,13 +526,8 @@ class RulesEngine:
             # Run the evaluation in a thread pool to avoid blocking the event loop
             # for CPU-bound operations
             with ThreadPoolExecutor() as executor:
-                start_time = time.perf_counter()
                 rule_results = await asyncio.get_event_loop().run_in_executor(
-                    executor,
-                    self.evaluate_rules,
-                    scores,
-                    metadata
+                    executor, self.evaluate_rules, scores, metadata
                 )
-                end_time = time.perf_counter()
-
-                return (example_id, rule_results)
+
+            return (example_id, rule_results)
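
For orientation, here is a minimal usage sketch of the rules API as it stands after this reformat, assembled from the `RulesEngine` docstring example and the signatures above. The `FaithfulnessScorer` import path and the score values are illustrative assumptions, not taken from the wheel itself:

```python
from judgeval.rules import Condition, Rule, RulesEngine
from judgeval.scorers import FaithfulnessScorer  # assumed export path

# One rule with a single condition; combine_type "all" means every
# condition must pass (AND), "any" means at least one must pass (OR).
rules = {
    "1": Rule(
        name="Quality Check",
        conditions=[Condition(metric=FaithfulnessScorer(threshold=0.7))],
        combine_type="all",
    )
}

engine = RulesEngine(rules)

# The reformatted signature: the list parameters are now annotated
# List[str] | None and default to None instead of an untyped None default.
engine.configure_notification(
    rule_id="1",
    enabled=True,
    communication_methods=["email"],
    email_addresses=["user@example.com"],
)

# Metrics missing from `scores` are skipped rather than counted as failures,
# and a skipped condition prevents an "all" rule from triggering.
results = engine.evaluate_rules({"faithfulness": 0.65}, {"example_id": "example_123"})
print(results["1"].status)  # AlertStatus.TRIGGERED or AlertStatus.NOT_TRIGGERED
```

Beyond formatting, the `| None` annotations make the optional parameters honest to type checkers; the runtime behavior (treating `None` as "leave unchanged") is the same as before.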
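The batch path is worth a sketch as well: `evaluate_rules_parallel` bounds concurrency with an `asyncio.Semaphore`, and `_evaluate_with_semaphore` runs the synchronous `evaluate_rules` in a `ThreadPoolExecutor` so it does not block the event loop. This continues the `engine` from the sketch above; the example IDs and scores are made up:

```python
import asyncio

example_scores = {
    "ex_1": {"faithfulness": 0.91},
    "ex_2": {"faithfulness": 0.42},
}
example_metadata = {
    "ex_1": {"example_id": "ex_1", "project_id": "proj_demo"},
    "ex_2": {"example_id": "ex_2", "project_id": "proj_demo"},
}

# max_concurrent caps how many examples are evaluated at once (default 100).
batch_results = asyncio.run(
    engine.evaluate_rules_parallel(
        example_scores=example_scores,
        example_metadata=example_metadata,
        max_concurrent=10,
    )
)

# Results come back keyed by example_id, then by rule_id.
for example_id, rule_results in batch_results.items():
    for rule_id, alert in rule_results.items():
        print(example_id, rule_id, alert.status)
```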