judgeval 0.0.25__py3-none-any.whl → 0.0.27__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/common/tracer.py +528 -166
- judgeval/constants.py +7 -4
- judgeval/data/__init__.py +0 -3
- judgeval/data/datasets/dataset.py +42 -19
- judgeval/data/datasets/eval_dataset_client.py +59 -20
- judgeval/data/result.py +34 -56
- judgeval/integrations/langgraph.py +16 -12
- judgeval/judgment_client.py +85 -23
- judgeval/rules.py +177 -60
- judgeval/run_evaluation.py +143 -122
- judgeval/scorers/score.py +21 -18
- judgeval/utils/alerts.py +32 -1
- {judgeval-0.0.25.dist-info → judgeval-0.0.27.dist-info}/METADATA +1 -1
- {judgeval-0.0.25.dist-info → judgeval-0.0.27.dist-info}/RECORD +16 -17
- judgeval/data/api_example.py +0 -98
- {judgeval-0.0.25.dist-info → judgeval-0.0.27.dist-info}/WHEEL +0 -0
- {judgeval-0.0.25.dist-info → judgeval-0.0.27.dist-info}/licenses/LICENSE.md +0 -0
judgeval/judgment_client.py
CHANGED
@@ -27,7 +27,8 @@ from judgeval.constants import (
     JUDGMENT_EVAL_FETCH_API_URL,
     JUDGMENT_EVAL_DELETE_API_URL,
     JUDGMENT_EVAL_DELETE_PROJECT_API_URL,
-    JUDGMENT_PROJECT_DELETE_API_URL
+    JUDGMENT_PROJECT_DELETE_API_URL,
+    JUDGMENT_PROJECT_CREATE_API_URL
 )
 from judgeval.common.exceptions import JudgmentAPIError
 from pydantic import BaseModel
@@ -38,8 +39,21 @@ class EvalRunRequestBody(BaseModel):
     project_name: str
     judgment_api_key: str
 
+class DeleteEvalRunRequestBody(BaseModel):
+    eval_names: List[str]
+    project_name: str
+    judgment_api_key: str
+
+class SingletonMeta(type):
+    _instances = {}
 
-class JudgmentClient:
+    def __call__(cls, *args, **kwargs):
+        if cls not in cls._instances:
+            instance = super().__call__(*args, **kwargs)
+            cls._instances[cls] = instance
+        return cls._instances[cls]
+
+class JudgmentClient(metaclass=SingletonMeta):
     def __init__(self, judgment_api_key: str = os.getenv("JUDGMENT_API_KEY"), organization_id: str = os.getenv("JUDGMENT_ORG_ID")):
         self.judgment_api_key = judgment_api_key
         self.organization_id = organization_id
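The new `SingletonMeta` metaclass makes `JudgmentClient` a process-wide singleton. A minimal sketch of the caller-visible consequence (import path taken from this diff; the key values are placeholders):

```python
from judgeval.judgment_client import JudgmentClient

a = JudgmentClient(judgment_api_key="key-one")
b = JudgmentClient(judgment_api_key="key-two")  # args ignored: the cached instance is returned

assert a is b
assert a.judgment_api_key == "key-one"  # the second key never reaches __init__
```

Note that constructor arguments passed after the first instantiation are silently dropped, so the first call wins.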
@@ -51,8 +65,25 @@ class JudgmentClient:
             # May be bad to output their invalid API key...
             raise JudgmentAPIError(f"Issue with passed in Judgment API key: {response}")
         else:
-            print(f"Successfully initialized JudgmentClient
-
+            print(f"Successfully initialized JudgmentClient!")
+
+    def a_run_evaluation(
+        self,
+        examples: List[Example],
+        scorers: List[Union[ScorerWrapper, JudgevalScorer]],
+        model: Union[str, List[str], JudgevalJudge],
+        aggregator: Optional[str] = None,
+        metadata: Optional[Dict[str, Any]] = None,
+        log_results: bool = True,
+        project_name: str = "default_project",
+        eval_run_name: str = "default_eval_run",
+        override: bool = False,
+        use_judgment: bool = True,
+        ignore_errors: bool = True,
+        rules: Optional[List[Rule]] = None
+    ) -> List[ScoringResult]:
+        return self.run_evaluation(examples, scorers, model, aggregator, metadata, log_results, project_name, eval_run_name, override, use_judgment, ignore_errors, True, rules)
+
     def run_evaluation(
         self,
         examples: List[Example],
@@ -65,6 +96,8 @@ class JudgmentClient:
         eval_run_name: str = "default_eval_run",
         override: bool = False,
         use_judgment: bool = True,
+        ignore_errors: bool = True,
+        async_execution: bool = False,
         rules: Optional[List[Rule]] = None
     ) -> List[ScoringResult]:
         """
@@ -81,6 +114,7 @@ class JudgmentClient:
             eval_run_name (str): A name for this evaluation run
             override (bool): Whether to override an existing evaluation run with the same name
             use_judgment (bool): Whether to use Judgment API for evaluation
+            ignore_errors (bool): Whether to ignore errors during evaluation (safely handled)
             rules (Optional[List[Rule]]): Rules to evaluate against scoring results
 
         Returns:
@@ -141,7 +175,7 @@ class JudgmentClient:
                 rules=loaded_rules,
                 organization_id=self.organization_id
             )
-            return run_eval(eval, override)
+            return run_eval(eval, override, ignore_errors=ignore_errors, async_execution=async_execution)
         except ValueError as e:
             raise ValueError(f"Please check your EvaluationRun object, one or more fields are invalid: \n{str(e)}")
         except Exception as e:
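`run_evaluation` gains `ignore_errors` and `async_execution` flags, and `a_run_evaluation` is a thin wrapper that forwards `async_execution=True`. A sketch of both call styles, assuming `Example` and `FaithfulnessScorer` live at the `judgeval.data` / `judgeval.scorers` paths; the model name is a placeholder:

```python
from judgeval.judgment_client import JudgmentClient
from judgeval.data import Example                 # import path assumed
from judgeval.scorers import FaithfulnessScorer   # import path assumed

client = JudgmentClient()
examples = [Example(input="What is the capital of France?", actual_output="Paris")]

# Synchronous run; scorer failures are handled safely instead of aborting the run.
results = client.run_evaluation(
    examples=examples,
    scorers=[FaithfulnessScorer(threshold=0.7)],
    model="gpt-4o",
    ignore_errors=True,
)

# Equivalent asynchronous submission: a_run_evaluation forwards async_execution=True.
results = client.a_run_evaluation(
    examples=examples,
    scorers=[FaithfulnessScorer(threshold=0.7)],
    model="gpt-4o",
)
```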
@@ -242,7 +276,7 @@ class JudgmentClient:
     def create_dataset(self) -> EvalDataset:
         return self.eval_dataset_client.create_dataset()
 
-    def push_dataset(self, alias: str, dataset: EvalDataset, overwrite: Optional[bool] = False) -> bool:
+    def push_dataset(self, alias: str, dataset: EvalDataset, project_name: str, overwrite: Optional[bool] = False) -> bool:
         """
         Uploads an `EvalDataset` to the Judgment platform for storage.
 
@@ -256,9 +290,9 @@ class JudgmentClient:
         """
         # Set judgment_api_key just in case it was not set
         dataset.judgment_api_key = self.judgment_api_key
-        return self.eval_dataset_client.push(dataset, alias, overwrite)
+        return self.eval_dataset_client.push(dataset, alias, project_name, overwrite)
 
-    def pull_dataset(self, alias: str) -> EvalDataset:
+    def pull_dataset(self, alias: str, project_name: str) -> EvalDataset:
         """
         Retrieves a saved `EvalDataset` from the Judgment platform.
 
@@ -268,25 +302,31 @@ class JudgmentClient:
         Returns:
             EvalDataset: The retrieved dataset
         """
-        return self.eval_dataset_client.pull(alias)
+        return self.eval_dataset_client.pull(alias, project_name)
+
+    def delete_dataset(self, alias: str, project_name: str) -> bool:
+        """
+        Deletes a saved `EvalDataset` from the Judgment platform.
+        """
+        return self.eval_dataset_client.delete(alias, project_name)
 
-    def
+    def pull_project_dataset_stats(self, project_name: str) -> dict:
         """
-        Retrieves all dataset stats from the Judgment platform for the
+        Retrieves all dataset stats from the Judgment platform for the project.
 
         Args:
-
+            project_name (str): The name of the project to retrieve
 
         Returns:
-
+            dict: The retrieved dataset stats
         """
-        return self.eval_dataset_client.
+        return self.eval_dataset_client.pull_project_dataset_stats(project_name)
 
-    def
+    def insert_dataset(self, alias: str, examples: List[Example], project_name: str) -> bool:
         """
         Edits the dataset on Judgment platform by adding new examples
         """
-        return self.eval_dataset_client.
+        return self.eval_dataset_client.insert_dataset(alias, examples, project_name)
 
     # Maybe add option where you can pass in the EvaluationRun object and it will pull the eval results from the backend
     def pull_eval(self, project_name: str, eval_run_name: str) -> List[Dict[str, Union[str, List[ScoringResult]]]]:
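All dataset operations are now scoped to a project. A sketch of the updated call sites (dataset contents elided; the `Example` import path is assumed):

```python
from judgeval.judgment_client import JudgmentClient
from judgeval.data import Example  # import path assumed

client = JudgmentClient()

dataset = client.create_dataset()
client.push_dataset("qa_dataset", dataset, project_name="my_project", overwrite=False)

pulled = client.pull_dataset("qa_dataset", project_name="my_project")
stats = client.pull_project_dataset_stats("my_project")  # stats for every dataset in the project

# Append examples to the stored dataset, then remove it entirely.
client.insert_dataset("qa_dataset", [Example(input="hi", actual_output="hello")], "my_project")
client.delete_dataset("qa_dataset", project_name="my_project")
```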
@@ -324,19 +364,22 @@ class JudgmentClient:
             eval_run_result[0]["results"] = [ScoringResult(**filtered_result)]
         return eval_run_result
 
-    def delete_eval(self, project_name: str,
+    def delete_eval(self, project_name: str, eval_run_names: List[str]) -> bool:
         """
-        Deletes an evaluation from the server by project and run
+        Deletes an evaluation from the server by project and run names.
 
         Args:
             project_name (str): Name of the project
-
+            eval_run_names (List[str]): List of names of the evaluation runs
 
         Returns:
             bool: Whether the evaluation was successfully deleted
         """
-
-
+        if not eval_run_names:
+            raise ValueError("No evaluation run names provided")
+
+        eval_run_request_body = DeleteEvalRunRequestBody(project_name=project_name,
+                                                         eval_names=eval_run_names,
                                                          judgment_api_key=self.judgment_api_key)
         response = requests.delete(JUDGMENT_EVAL_DELETE_API_URL,
                                    json=eval_run_request_body.model_dump(),
@@ -345,9 +388,11 @@ class JudgmentClient:
                                        "Authorization": f"Bearer {self.judgment_api_key}",
                                        "X-Organization-Id": self.organization_id
                                    })
-        if response.status_code
+        if response.status_code == 404:
+            raise ValueError(f"Eval results not found: {response.json()}")
+        elif response.status_code == 500:
             raise ValueError(f"Error deleting eval results: {response.json()}")
-        return response.json()
+        return bool(response.json())
 
     def delete_project_evals(self, project_name: str) -> bool:
         """
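`delete_eval` now deletes multiple runs at once, validates its input before any network call, and distinguishes a 404 from a 500 in the errors it raises. Sketch:

```python
from judgeval.judgment_client import JudgmentClient

client = JudgmentClient()

# Delete two runs in one request; returns a bool derived from the response body.
client.delete_eval("my_project", ["run_a", "run_b"])

# An empty list fails fast, before the HTTP request is made.
client.delete_eval("my_project", [])  # raises ValueError("No evaluation run names provided")
```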
@@ -372,6 +417,23 @@ class JudgmentClient:
             raise ValueError(f"Error deleting eval results: {response.json()}")
         return response.json()
 
+    def create_project(self, project_name: str) -> bool:
+        """
+        Creates a project on the server.
+        """
+        response = requests.post(JUDGMENT_PROJECT_CREATE_API_URL,
+                                 json={
+                                     "project_name": project_name,
+                                 },
+                                 headers={
+                                     "Content-Type": "application/json",
+                                     "Authorization": f"Bearer {self.judgment_api_key}",
+                                     "X-Organization-Id": self.organization_id
+                                 })
+        if response.status_code != requests.codes.ok:
+            raise ValueError(f"Error creating project: {response.json()}")
+        return response.json()
+
     def delete_project(self, project_name: str) -> bool:
         """
         Deletes a project from the server. Which also deletes all evaluations and traces associated with the project.
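Together with the existing `delete_project`, the new `create_project` completes the project lifecycle. Sketch:

```python
from judgeval.judgment_client import JudgmentClient

client = JudgmentClient()
client.create_project("my_project")  # raises ValueError on any non-200 response

# ... push datasets and run evaluations against "my_project" ...

client.delete_project("my_project")  # also removes the project's evaluations and traces
```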
judgeval/rules.py
CHANGED
@@ -17,15 +17,6 @@ class AlertStatus(str, Enum):
     TRIGGERED = "triggered"
     NOT_TRIGGERED = "not_triggered"
 
-class Operator(str, Enum):
-    """Comparison operators for conditions."""
-    GT = ">"
-    GTE = ">="
-    LT = "<"
-    LTE = "<="
-    EQ = "=="
-    NEQ = "!="
-
 class Condition(BaseModel):
     """
     A single metric condition.
@@ -33,15 +24,13 @@ class Condition(BaseModel):
     Example:
     {
         "metric": FaithfulnessScorer(threshold=0.7)  # Must be a scorer object: APIJudgmentScorer, JudgevalScorer, or ScorerWrapper
-        "operator": ">=",
-        "threshold": 0.7
     }
+
+    The Condition class uses the scorer's threshold and success function internally.
     """
     model_config = ConfigDict(arbitrary_types_allowed=True)
 
-    metric: Union[APIJudgmentScorer, JudgevalScorer, ScorerWrapper]
-    operator: Operator
-    threshold: float
+    metric: Union[APIJudgmentScorer, JudgevalScorer, ScorerWrapper]
 
     @property
     def metric_name(self) -> str:
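`Condition` no longer carries its own `operator` and `threshold`; the `Operator` enum is gone and the scorer is now the single source of truth. A before/after sketch (scorer import path assumed):

```python
from judgeval.rules import Condition
from judgeval.scorers import FaithfulnessScorer  # import path assumed

# 0.0.25 (no longer valid):
#   Condition(metric=FaithfulnessScorer(threshold=0.7), operator=Operator.GTE, threshold=0.7)

# 0.0.27: the scorer alone defines the threshold and pass/fail logic.
condition = Condition(metric=FaithfulnessScorer(threshold=0.7))
```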
@@ -58,22 +47,60 @@ class Condition(BaseModel):
         # Fallback to string representation
         return str(self.metric)
 
+    @property
+    def threshold(self) -> float:
+        """Get the threshold from the metric."""
+        return self.metric.threshold if hasattr(self.metric, 'threshold') else 0.5
+
     def evaluate(self, value: float) -> bool:
-        """
-
-
-
-
-
-
-
-
-
-
-
+        """
+        Evaluate the condition against a value.
+        Returns True if the condition passes, False otherwise.
+        Uses the scorer's success check function if available.
+        """
+        # Store the value in the scorer
+        if hasattr(self.metric, 'score'):
+            self.metric.score = value
+
+        # Use the scorer's success check function if available
+        if hasattr(self.metric, 'success_check'):
+            return self.metric.success_check()
+        elif hasattr(self.metric, '_success_check'):
+            return self.metric._success_check()
         else:
-
+            # Fallback to default comparison (greater than or equal)
+            return value >= self.threshold if self.threshold is not None else False
+
+class NotificationConfig(BaseModel):
+    """
+    Configuration for notifications when a rule is triggered.
+
+    Example:
+    {
+        "enabled": true,
+        "communication_methods": ["email", "broadcast_slack", "broadcast_email"],
+        "email_addresses": ["user1@example.com", "user2@example.com"],
+        "send_at": 1632150000  # Unix timestamp (specific date/time)
+    }
+
+    Communication Methods:
+    - "email": Send emails to specified email addresses
+    - "broadcast_slack": Send broadcast notifications to all configured Slack channels
+    - "broadcast_email": Send broadcast emails to all organization emails
+    """
+    enabled: bool = True
+    communication_methods: List[str] = []
+    email_addresses: Optional[List[str]] = None
+    send_at: Optional[int] = None  # Unix timestamp for scheduled notifications
+
+    def model_dump(self, **kwargs):
+        """Convert the NotificationConfig to a dictionary for JSON serialization."""
+        return {
+            "enabled": self.enabled,
+            "communication_methods": self.communication_methods,
+            "email_addresses": self.email_addresses,
+            "send_at": self.send_at
+        }
 
 class Rule(BaseModel):
     """
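`Condition.evaluate` now defers to the scorer: it stores the score, calls `success_check()` (or `_success_check()`) when present, and otherwise falls back to a `value >= threshold` comparison. The new `threshold` property reads through to the scorer, so the two stay in sync. A sketch (scorer import path assumed; the printed results assume a >=-style scorer):

```python
from judgeval.rules import Condition
from judgeval.scorers import FaithfulnessScorer  # import path assumed

cond = Condition(metric=FaithfulnessScorer(threshold=0.7))
print(cond.threshold)      # 0.7, read from the scorer rather than stored on the Condition

# For a >=-style scorer with threshold 0.7:
print(cond.evaluate(0.8))  # True
print(cond.evaluate(0.6))  # False
```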
@@ -85,10 +112,15 @@ class Rule(BaseModel):
         "name": "Quality Check",
         "description": "Check if quality metrics meet thresholds",
         "conditions": [
-            {"metric": FaithfulnessScorer(threshold=0.7)
-            {"metric": AnswerRelevancyScorer(threshold=0.8)
+            {"metric": FaithfulnessScorer(threshold=0.7)},
+            {"metric": AnswerRelevancyScorer(threshold=0.8)}
         ],
-        "combine_type": "all"  # "all" or "any"
+        "combine_type": "all",  # "all" or "any"
+        "notification": {
+            "enabled": true,
+            "communication_methods": ["slack", "email"],
+            "email_addresses": ["user1@example.com", "user2@example.com"]
+        }
     }
     """
     rule_id: str = Field(default_factory=lambda: str(uuid.uuid4()))  # Random UUID string as default value
@@ -96,6 +128,8 @@ class Rule(BaseModel):
     description: Optional[str] = None
     conditions: List[Condition]
     combine_type: str = Field(..., pattern="^(all|any)$")  # all = AND, any = OR
+    notification: Optional[NotificationConfig] = None  # Configuration for notifications
+
 
     def model_dump(self, **kwargs):
         """
@@ -168,7 +202,6 @@ class Rule(BaseModel):
             raise ValueError(f"combine_type must be 'all' or 'any', got: {v}")
         return v
 
-
 class AlertResult(BaseModel):
     """
     Result of evaluating a rule.
@@ -185,6 +218,11 @@ class AlertResult(BaseModel):
         "metadata": {
            "example_id": "example_123",
            "timestamp": "20240321_123456"
+        },
+        "notification": {
+            "enabled": true,
+            "communication_methods": ["slack", "email"],
+            "email_addresses": ["user1@example.com", "user2@example.com"]
         }
     }
     """
@@ -193,6 +231,7 @@ class AlertResult(BaseModel):
     rule_name: str
     conditions_result: List[Dict[str, Any]]
     metadata: Dict[str, Any] = {}
+    notification: Optional[NotificationConfig] = None  # Configuration for notifications
 
     @property
     def example_id(self) -> Optional[str]:
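With the `notification` field on `Rule` (mirrored on `AlertResult`), a rule can carry its alerting policy inline. A construction sketch (scorer import path assumed; the address is a placeholder):

```python
from judgeval.rules import Rule, Condition, NotificationConfig
from judgeval.scorers import FaithfulnessScorer  # import path assumed

rule = Rule(
    name="Quality Check",
    conditions=[Condition(metric=FaithfulnessScorer(threshold=0.7))],
    combine_type="all",
    notification=NotificationConfig(
        communication_methods=["email"],
        email_addresses=["oncall@example.com"],  # placeholder address
    ),
)
```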
@@ -206,36 +245,105 @@ class AlertResult(BaseModel):
 
 class RulesEngine:
     """
-    Engine for evaluating rules
+    Engine for creating and evaluating rules against metrics.
 
-    Example
+    Example:
+    ```python
+    # Define rules
     rules = {
-        "
+        "1": Rule(
             name="Quality Check",
+            description="Check if quality metrics meet thresholds",
             conditions=[
-                Condition(metric=FaithfulnessScorer(threshold=0.7)
-                Condition(metric=AnswerRelevancyScorer(threshold=0.8)
+                Condition(metric=FaithfulnessScorer(threshold=0.7)),
+                Condition(metric=AnswerRelevancyScorer(threshold=0.8))
             ],
             combine_type="all"
         )
     }
 
+    # Create rules engine
     engine = RulesEngine(rules)
-
-
-
-    "
-
+
+    # Configure notifications
+    engine.configure_notification(
+        rule_id="1",
+        enabled=True,
+        communication_methods=["slack", "email"],
+        email_addresses=["user@example.com"]
+    )
+
+    # Evaluate rules
+    scores = {"faithfulness": 0.65, "relevancy": 0.85}
+    results = engine.evaluate_rules(scores, {"example_id": "example_123"})
+    ```
     """
 
     def __init__(self, rules: Dict[str, Rule]):
         """
-        Initialize the
+        Initialize the rules engine.
 
         Args:
-            rules: Dictionary mapping rule IDs to
+            rules: Dictionary mapping rule IDs to Rule objects
         """
         self.rules = rules
+
+    def configure_notification(self, rule_id: str, enabled: bool = True,
+                               communication_methods: List[str] = None,
+                               email_addresses: List[str] = None,
+                               send_at: Optional[int] = None) -> None:
+        """
+        Configure notification settings for a specific rule.
+
+        Args:
+            rule_id: ID of the rule to configure notifications for
+            enabled: Whether notifications are enabled for this rule
+            communication_methods: List of notification methods (e.g., ["slack", "email"])
+            email_addresses: List of email addresses to send notifications to
+            send_at: Optional Unix timestamp for when to send the notification
+        """
+        if rule_id not in self.rules:
+            raise ValueError(f"Rule ID '{rule_id}' not found")
+
+        rule = self.rules[rule_id]
+
+        # Create notification configuration if it doesn't exist
+        if rule.notification is None:
+            rule.notification = NotificationConfig()
+
+        # Set notification parameters
+        rule.notification.enabled = enabled
+
+        if communication_methods is not None:
+            rule.notification.communication_methods = communication_methods
+
+        if email_addresses is not None:
+            rule.notification.email_addresses = email_addresses
+
+        if send_at is not None:
+            rule.notification.send_at = send_at
+
+    def configure_all_notifications(self, enabled: bool = True,
+                                    communication_methods: List[str] = None,
+                                    email_addresses: List[str] = None,
+                                    send_at: Optional[int] = None) -> None:
+        """
+        Configure notification settings for all rules.
+
+        Args:
+            enabled: Whether notifications are enabled
+            communication_methods: List of notification methods (e.g., ["slack", "email"])
+            email_addresses: List of email addresses to send notifications to
+            send_at: Optional Unix timestamp for when to send the notification
+        """
+        for rule_id, rule in self.rules.items():
+            self.configure_notification(
+                rule_id=rule_id,
+                enabled=enabled,
+                communication_methods=communication_methods,
+                email_addresses=email_addresses,
+                send_at=send_at
+            )
 
     def evaluate_rules(self, scores: Dict[str, float], example_metadata: Optional[Dict[str, Any]] = None) -> Dict[str, AlertResult]:
         """
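`configure_all_notifications` applies one policy across every registered rule by looping over `configure_notification`:

```python
# Reusing the `rules` dict from the docstring example above:
engine = RulesEngine(rules)
engine.configure_all_notifications(
    enabled=True,
    communication_methods=["broadcast_slack"],
)
```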
@@ -257,13 +365,13 @@ class RulesEngine:
             # Get the metric name for lookup
             metric_name = condition.metric_name
             value = scores.get(metric_name)
+
             if value is None:
                 # Skip this condition instead of evaluating it as false
                 condition_results.append({
                     "metric": metric_name,
                     "value": None,
                     "threshold": condition.threshold,
-                    "operator": condition.operator,
                     "passed": None,  # Using None to indicate the condition was skipped
                     "skipped": True  # Add a flag to indicate this condition was skipped
                 })
@@ -274,7 +382,6 @@ class RulesEngine:
                     "metric": metric_name,
                     "value": value,
                     "threshold": condition.threshold,
-                    "operator": condition.operator,
                     "passed": passed,
                     "skipped": False  # Indicate this condition was evaluated
                 })
@@ -285,23 +392,36 @@ class RulesEngine:
                 # If all conditions were skipped, the rule doesn't trigger
                 triggered = False
             else:
-
+                if rule.combine_type == "all":
+                    # For "all" combine_type:
+                    # - All evaluated conditions must pass
+                    # - All conditions must have been evaluated (none skipped)
+                    all_conditions_passed = all(passed_conditions)
+                    all_conditions_evaluated = len(passed_conditions) == len(rule.conditions)
+                    triggered = all_conditions_passed and all_conditions_evaluated
+                else:
+                    # For "any" combine_type, at least one condition must pass
+                    triggered = any(passed_conditions)
 
             # Create alert result with example metadata
+            notification_config = None
+            if triggered and rule.notification:
+                # If rule has a notification config and the alert is triggered, include it in the result
+                notification_config = rule.notification
+
+            # Set the alert status based on whether the rule was triggered
+            status = AlertStatus.TRIGGERED if triggered else AlertStatus.NOT_TRIGGERED
+
+            # Create the alert result
             alert_result = AlertResult(
-                status=
-                rule_id=rule.rule_id,
+                status=status,
+                rule_id=rule.rule_id,
                 rule_name=rule.name,
-                conditions_result=condition_results
+                conditions_result=condition_results,
+                notification=notification_config,
+                metadata=example_metadata or {}
             )
 
-            # Add example metadata if provided
-            if example_metadata:
-                if "example_id" in example_metadata:
-                    alert_result.metadata["example_id"] = example_metadata["example_id"]
-                if "timestamp" in example_metadata:
-                    alert_result.metadata["timestamp"] = example_metadata["timestamp"]
-
             results[rule_id] = alert_result
 
         return results
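The trigger logic is now explicit about skipped conditions: under `combine_type="all"`, a condition whose metric is missing from `scores` blocks triggering, while under `"any"` one passing condition suffices. A sketch (score keys follow the docstring example above; scorer imports assumed):

```python
from judgeval.rules import Rule, Condition, RulesEngine
from judgeval.scorers import FaithfulnessScorer, AnswerRelevancyScorer  # imports assumed

engine = RulesEngine({
    "strict": Rule(
        name="strict",
        conditions=[
            Condition(metric=FaithfulnessScorer(threshold=0.7)),
            Condition(metric=AnswerRelevancyScorer(threshold=0.8)),
        ],
        combine_type="all",
    ),
})

# "relevancy" is absent, so that condition is skipped; with combine_type="all"
# an unevaluated condition prevents the rule from triggering.
results = engine.evaluate_rules({"faithfulness": 0.9}, {"example_id": "example_123"})
print(results["strict"].status)  # AlertStatus.NOT_TRIGGERED
```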
@@ -376,7 +496,4 @@ class RulesEngine:
             )
             end_time = time.perf_counter()
 
-            # Could log performance metrics here if needed
-            # debug(f"Rule evaluation for example {example_id} took {end_time - start_time:.4f} seconds")
-
             return (example_id, rule_results)