judgeval 0.0.24__py3-none-any.whl → 0.0.26__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/common/tracer.py +55 -8
- judgeval/constants.py +3 -2
- judgeval/data/datasets/dataset.py +42 -19
- judgeval/integrations/langgraph.py +16 -12
- judgeval/judgment_client.py +39 -9
- judgeval/rules.py +177 -60
- judgeval/run_evaluation.py +140 -103
- judgeval/scorers/score.py +16 -11
- judgeval/utils/alerts.py +32 -1
- {judgeval-0.0.24.dist-info → judgeval-0.0.26.dist-info}/METADATA +1 -1
- {judgeval-0.0.24.dist-info → judgeval-0.0.26.dist-info}/RECORD +13 -13
- {judgeval-0.0.24.dist-info → judgeval-0.0.26.dist-info}/WHEEL +0 -0
- {judgeval-0.0.24.dist-info → judgeval-0.0.26.dist-info}/licenses/LICENSE.md +0 -0
judgeval/common/tracer.py
CHANGED
@@ -20,6 +20,7 @@ from rich import print as rprint
 # Third-party imports
 import pika
 import requests
+from litellm import cost_per_token
 from pydantic import BaseModel
 from rich import print as rprint
 from openai import OpenAI
@@ -332,6 +333,9 @@ class TraceClient:
         self.span_type = None
         self._current_span: Optional[TraceEntry] = None
         self.trace_manager_client = TraceManagerClient(tracer.api_key, tracer.organization_id) # Manages DB operations for trace data
+        self.visited_nodes = [] # Track nodes visited through langgraph_node spans
+        self.executed_tools = [] # Track tools executed through tool spans
+        self.executed_node_tools = [] # Track node:tool combinations
 
     @contextmanager
     def span(self, name: str, span_type: SpanType = "span"):
@@ -618,30 +622,70 @@ class TraceClient:
         total_completion_tokens = 0
         total_tokens = 0
 
+        total_prompt_tokens_cost = 0.0
+        total_completion_tokens_cost = 0.0
+        total_cost = 0.0
+
         for entry in condensed_entries:
             if entry.get("span_type") == "llm" and isinstance(entry.get("output"), dict):
-
+                output = entry["output"]
+                usage = output.get("usage", {})
+                model_name = entry.get("inputs", {}).get("model", "")
+                prompt_tokens = 0
+                completion_tokens = 0
+
                 # Handle OpenAI/Together format
                 if "prompt_tokens" in usage:
-
-
+                    prompt_tokens = usage.get("prompt_tokens", 0)
+                    completion_tokens = usage.get("completion_tokens", 0)
+                    total_prompt_tokens += prompt_tokens
+                    total_completion_tokens += completion_tokens
                 # Handle Anthropic format
                 elif "input_tokens" in usage:
-
-
+                    prompt_tokens = usage.get("input_tokens", 0)
+                    completion_tokens = usage.get("output_tokens", 0)
+                    total_prompt_tokens += prompt_tokens
+                    total_completion_tokens += completion_tokens
 
                 total_tokens += usage.get("total_tokens", 0)
+
+                # Calculate costs if model name is available
+                if model_name:
+                    try:
+                        prompt_cost, completion_cost = cost_per_token(
+                            model=model_name,
+                            prompt_tokens=prompt_tokens,
+                            completion_tokens=completion_tokens
+                        )
+                        total_prompt_tokens_cost += prompt_cost
+                        total_completion_tokens_cost += completion_cost
+                        total_cost += prompt_cost + completion_cost
+
+                        # Add cost information directly to the usage dictionary in the condensed entry
+                        if "usage" not in output:
+                            output["usage"] = {}
+                        output["usage"]["prompt_tokens_cost_usd"] = prompt_cost
+                        output["usage"]["completion_tokens_cost_usd"] = completion_cost
+                        output["usage"]["total_cost_usd"] = prompt_cost + completion_cost
+                    except Exception as e:
+                        # If cost calculation fails, continue without adding costs
+                        print(f"Error calculating cost for model '{model_name}': {str(e)}")
+                        pass
 
         # Create trace document
         trace_data = {
             "trace_id": self.trace_id,
             "name": self.name,
             "project_name": self.project_name,
-            "created_at": datetime.
+            "created_at": datetime.utcfromtimestamp(self.start_time).isoformat(),
             "duration": total_duration,
             "token_counts": {
                 "prompt_tokens": total_prompt_tokens,
                 "completion_tokens": total_completion_tokens,
                 "total_tokens": total_tokens,
+                "prompt_tokens_cost_usd": total_prompt_tokens_cost,
+                "completion_tokens_cost_usd": total_completion_tokens_cost,
+                "total_cost_usd": total_cost
             },
             "entries": condensed_entries,
             "empty_save": empty_save,
@@ -697,7 +741,6 @@ class Tracer:
 
         if not organization_id:
             raise ValueError("Tracer must be configured with an Organization ID")
-
         self.api_key: str = api_key
         self.project_name: str = project_name
         self.client: JudgmentClient = JudgmentClient(judgment_api_key=api_key)
@@ -767,8 +810,9 @@ class Tracer:
             project_name: Optional project name override
            overwrite: Whether to overwrite existing traces
        """
+        # If monitoring is disabled, return the function as is
        if not self.enable_monitoring:
-            return
+            return func if func else lambda f: f
 
        if func is None:
            return lambda f: self.observe(f, name=name, span_type=span_type, project_name=project_name, overwrite=overwrite)
@@ -872,6 +916,9 @@ class Tracer:
            return wrapper
 
    def async_evaluate(self, *args, **kwargs):
+        if not self.enable_evaluations:
+            return
+
        if self._current_trace:
            self._current_trace.async_evaluate(*args, **kwargs)
        else:
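The trace save path now prices each LLM span with litellm's `cost_per_token` and rolls the totals up into `token_counts`. Below is a minimal standalone sketch of that roll-up, handling both OpenAI-style and Anthropic-style usage keys; the model names and token counts are illustrative values, not data from this diff.

```python
# Minimal sketch of the cost roll-up performed during trace saving (illustrative values only).
from litellm import cost_per_token

entries = [
    {"model": "gpt-4o-mini", "usage": {"prompt_tokens": 120, "completion_tokens": 48}},          # OpenAI-style keys
    {"model": "claude-3-haiku-20240307", "usage": {"input_tokens": 200, "output_tokens": 80}},   # Anthropic-style keys
]

total_prompt_cost = total_completion_cost = 0.0
for entry in entries:
    usage = entry["usage"]
    prompt_tokens = usage.get("prompt_tokens", usage.get("input_tokens", 0))
    completion_tokens = usage.get("completion_tokens", usage.get("output_tokens", 0))
    try:
        prompt_cost, completion_cost = cost_per_token(
            model=entry["model"],
            prompt_tokens=prompt_tokens,
            completion_tokens=completion_tokens,
        )
    except Exception:
        continue  # unknown model: skip cost attribution, keep the token counts
    total_prompt_cost += prompt_cost
    total_completion_cost += completion_cost

print(f"total_cost_usd={total_prompt_cost + total_completion_cost:.6f}")
```

Wrapping the pricing call in a try/except mirrors the diff's design choice: a model litellm cannot price should never break trace persistence.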
judgeval/constants.py
CHANGED
@@ -46,13 +46,14 @@ JUDGMENT_DATASETS_PULL_ALL_API_URL = f"{ROOT_API}/datasets/get_all_stats/"
 JUDGMENT_DATASETS_EDIT_API_URL = f"{ROOT_API}/datasets/edit/"
 JUDGMENT_EVAL_LOG_API_URL = f"{ROOT_API}/log_eval_results/"
 JUDGMENT_EVAL_FETCH_API_URL = f"{ROOT_API}/fetch_eval_results/"
-JUDGMENT_EVAL_DELETE_API_URL = f"{ROOT_API}/
+JUDGMENT_EVAL_DELETE_API_URL = f"{ROOT_API}/delete_eval_results_by_project_and_run_names/"
 JUDGMENT_EVAL_DELETE_PROJECT_API_URL = f"{ROOT_API}/delete_eval_results_by_project/"
 JUDGMENT_PROJECT_DELETE_API_URL = f"{ROOT_API}/projects/delete/"
 JUDGMENT_TRACES_FETCH_API_URL = f"{ROOT_API}/traces/fetch/"
 JUDGMENT_TRACES_SAVE_API_URL = f"{ROOT_API}/traces/save/"
 JUDGMENT_TRACES_DELETE_API_URL = f"{ROOT_API}/traces/delete/"
-JUDGMENT_TRACES_ADD_TO_EVAL_QUEUE_API_URL = f"{ROOT_API}/traces/
+JUDGMENT_TRACES_ADD_TO_EVAL_QUEUE_API_URL = f"{ROOT_API}/traces/add_to_trace_eval_queue/"
+JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL = f"{ROOT_API}/add_to_run_eval_queue/"
 # RabbitMQ
 RABBITMQ_HOST = os.getenv("RABBITMQ_HOST", "rabbitmq-networklb-faa155df16ec9085.elb.us-west-1.amazonaws.com")
 RABBITMQ_PORT = os.getenv("RABBITMQ_PORT", 5672)
judgeval/data/datasets/dataset.py
CHANGED
@@ -90,9 +90,18 @@ class EvalDataset:
     def add_from_csv(
         self,
         file_path: str,
+        header_mapping: dict,
+        primary_delimiter: str = ",",
+        secondary_delimiter: str = ";"
     ) -> None:
         """
         Add Examples from a CSV file.
+
+        Args:
+            file_path (str): Path to the CSV file
+            header_mapping (dict): Dictionary mapping Example headers to custom headers
+            primary_delimiter (str, optional): Main delimiter used in CSV file. Defaults to ","
+            secondary_delimiter (str, optional): Secondary delimiter for list fields. Defaults to ";"
         """
         try:
             import pandas as pd
@@ -102,9 +111,10 @@ class EvalDataset:
             )
 
         # Pandas naturally reads numbers in data files as ints, not strings (can lead to unexpected behavior)
-        df = pd.read_csv(file_path, dtype={'trace_id': str})
+        df = pd.read_csv(file_path, dtype={'trace_id': str}, sep=primary_delimiter)
         """
-
+        The user should pass in a dict mapping from Judgment Example headers to their custom defined headers.
+        Available headers for Example objects are as follows:
 
         "input", "actual_output", "expected_output", "context", \
         "retrieval_context", "additional_metadata", "tools_called", \
@@ -113,35 +123,48 @@ class EvalDataset:
 
         We want to collect the examples separately which can
         be determined by the "example" column. If the value is True, then it is an
-        example
+        example, and we expect the `input` and `actual_output` fields to be non-null.
 
-        We also assume that if there are multiple retrieval contexts or
-        This can be adjusted using the `
+        We also assume that if there are multiple retrieval contexts, contexts, or tools called, they are separated by semicolons.
+        This can be adjusted using the `secondary_delimiter` parameter.
         """
         examples = []
-
+
+        def process_csv_row(value, header):
+            """
+            Maps a singular value in the CSV file to the appropriate type based on the header.
+            If value exists and can be split into type List[*], we will split upon the user's provided secondary delimiter.
+            """
+            # check that the CSV value is not null for entry
+            null_replacement = dict() if header == 'additional_metadata' else None
+            if pd.isna(value) or value == '':
+                return null_replacement
+            try:
+                value = ast.literal_eval(value) if header == 'additional_metadata' else str(value)
+            except (ValueError, SyntaxError):
+                value = str(value)
+            if header in ["context", "retrieval_context", "tools_called", "expected_tools"]:
+                # attempt to split the value by the secondary delimiter
+                value = value.split(secondary_delimiter)
+
+            return value
+
         for _, row in df.iterrows():
             data = {
-
-
-
-
-                "retrieval_context": row["retrieval_context"].split(";") if pd.notna(row["retrieval_context"]) else [],
-                "additional_metadata": ast.literal_eval(row["additional_metadata"]) if pd.notna(row["additional_metadata"]) else dict(),
-                "tools_called": row["tools_called"].split(";") if pd.notna(row["tools_called"]) else [],
-                "expected_tools": row["expected_tools"].split(";") if pd.notna(row["expected_tools"]) else [],
-                "trace_id": row["trace_id"] if pd.notna(row["trace_id"]) else None,
-                "example_id": str(row["example_id"]) if pd.notna(row["example_id"]) else None
+                header: process_csv_row(
+                    row[header_mapping[header]], header
+                )
+                for header in header_mapping
             }
-            if row["example"]:
-
+            if "example" in header_mapping and row[header_mapping["example"]]:
+                if "name" in header_mapping:
+                    data["name"] = row[header_mapping["name"]] if pd.notna(row[header_mapping["name"]]) else None
                 # every Example has `input` and `actual_output` fields
                 if data["input"] is not None and data["actual_output"] is not None:
                     e = Example(**data)
                     examples.append(e)
                 else:
                     raise ValueError("Every example must have an 'input' and 'actual_output' field.")
-
 
         for e in examples:
             self.add_example(e)
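`add_from_csv` now takes an explicit `header_mapping` plus configurable delimiters instead of hard-coded column names. A hedged usage sketch follows; the `EvalDataset` import path, its constructor arguments, and the CSV column names (`question`, `answer`, `docs`, `is_example`) are assumptions for illustration, not taken from the diff.

```python
# Hypothetical usage of the reworked add_from_csv; import path and CSV columns are assumptions.
from judgeval.data.datasets import EvalDataset  # assumed public path

dataset = EvalDataset()  # constructor arguments omitted; assumed to default sensibly

dataset.add_from_csv(
    "examples.csv",
    header_mapping={
        "input": "question",          # Example field -> CSV column
        "actual_output": "answer",
        "retrieval_context": "docs",  # list field, split on secondary_delimiter
        "example": "is_example",      # truthy rows become Examples
    },
    primary_delimiter=",",    # cell separator, passed through to pandas.read_csv(sep=...)
    secondary_delimiter=";",  # separator inside list-valued cells such as retrieval_context
)
```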
judgeval/integrations/langgraph.py
CHANGED
@@ -146,16 +146,17 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
 
             self.start_span("LangGraph", span_type="Main Function")
 
-
-        if node
-        self.
-
-
-
-
-
-
-
+        metadata = kwargs.get("metadata", {})
+        if node := metadata.get("langgraph_node"):
+            if node != self.previous_node:
+                # Track node execution
+                self.trace_client.visited_nodes.append(node)
+                self.trace_client.executed_node_tools.append(node)
+                self.trace_client.record_input({
+                    'args': inputs,
+                    'kwargs': kwargs
+                })
+            self.previous_node = node
 
     def on_chain_end(
         self,
@@ -198,8 +199,11 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
     ):
         name = serialized["name"]
         self.start_span(name, span_type="tool")
-
-
+        if name:
+            # Track tool execution
+            self.trace_client.executed_tools.append(name)
+            node_tool = f"{self.previous_node}:{name}" if self.previous_node else name
+            self.trace_client.executed_node_tools.append(node_tool)
         self.trace_client.record_input({
             'args': input_str,
             'kwargs': kwargs
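The callback handler now mirrors LangGraph execution onto the three new lists on the trace client: `visited_nodes`, `executed_tools`, and combined `node:tool` keys in `executed_node_tools`. The standalone sketch below illustrates only that bookkeeping; it is not the judgeval handler itself, and the class and method names are made up for the illustration.

```python
# Standalone illustration of the node/tool bookkeeping; not the judgeval classes.
class _TrackingStub:
    def __init__(self):
        self.visited_nodes = []        # every langgraph_node entered
        self.executed_tools = []       # every tool span started
        self.executed_node_tools = []  # "node" and "node:tool" entries, in order

    def on_node(self, node, previous_node=None):
        if node != previous_node:
            self.visited_nodes.append(node)
            self.executed_node_tools.append(node)
        return node  # becomes the new previous_node

    def on_tool(self, name, previous_node=None):
        self.executed_tools.append(name)
        self.executed_node_tools.append(f"{previous_node}:{name}" if previous_node else name)


stub = _TrackingStub()
prev = stub.on_node("retrieve")
stub.on_tool("web_search", previous_node=prev)
prev = stub.on_node("answer", previous_node=prev)
print(stub.executed_node_tools)  # ['retrieve', 'retrieve:web_search', 'answer']
```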
judgeval/judgment_client.py
CHANGED
@@ -38,6 +38,11 @@ class EvalRunRequestBody(BaseModel):
     project_name: str
     judgment_api_key: str
 
+class DeleteEvalRunRequestBody(BaseModel):
+    eval_names: List[str]
+    project_name: str
+    judgment_api_key: str
+
 
 class JudgmentClient:
     def __init__(self, judgment_api_key: str = os.getenv("JUDGMENT_API_KEY"), organization_id: str = os.getenv("JUDGMENT_ORG_ID")):
@@ -52,7 +57,24 @@ class JudgmentClient:
             raise JudgmentAPIError(f"Issue with passed in Judgment API key: {response}")
         else:
             print(f"Successfully initialized JudgmentClient, welcome back {response.get('detail', {}).get('user_name', 'user')}!")
-
+
+    def a_run_evaluation(
+        self,
+        examples: List[Example],
+        scorers: List[Union[ScorerWrapper, JudgevalScorer]],
+        model: Union[str, List[str], JudgevalJudge],
+        aggregator: Optional[str] = None,
+        metadata: Optional[Dict[str, Any]] = None,
+        log_results: bool = True,
+        project_name: str = "default_project",
+        eval_run_name: str = "default_eval_run",
+        override: bool = False,
+        use_judgment: bool = True,
+        ignore_errors: bool = True,
+        rules: Optional[List[Rule]] = None
+    ) -> List[ScoringResult]:
+        return self.run_evaluation(examples, scorers, model, aggregator, metadata, log_results, project_name, eval_run_name, override, use_judgment, ignore_errors, True, rules)
+
     def run_evaluation(
         self,
         examples: List[Example],
@@ -65,6 +87,8 @@ class JudgmentClient:
         eval_run_name: str = "default_eval_run",
         override: bool = False,
         use_judgment: bool = True,
+        ignore_errors: bool = True,
+        async_execution: bool = False,
         rules: Optional[List[Rule]] = None
     ) -> List[ScoringResult]:
         """
@@ -81,6 +105,7 @@ class JudgmentClient:
             eval_run_name (str): A name for this evaluation run
             override (bool): Whether to override an existing evaluation run with the same name
             use_judgment (bool): Whether to use Judgment API for evaluation
+            ignore_errors (bool): Whether to ignore errors during evaluation (safely handled)
             rules (Optional[List[Rule]]): Rules to evaluate against scoring results
 
         Returns:
@@ -141,7 +166,7 @@ class JudgmentClient:
                 rules=loaded_rules,
                 organization_id=self.organization_id
             )
-            return run_eval(eval, override)
+            return run_eval(eval, override, ignore_errors=ignore_errors, async_execution=async_execution)
         except ValueError as e:
             raise ValueError(f"Please check your EvaluationRun object, one or more fields are invalid: \n{str(e)}")
         except Exception as e:
@@ -324,19 +349,22 @@ class JudgmentClient:
             eval_run_result[0]["results"] = [ScoringResult(**filtered_result)]
         return eval_run_result
 
-    def delete_eval(self, project_name: str,
+    def delete_eval(self, project_name: str, eval_run_names: List[str]) -> bool:
         """
-        Deletes an evaluation from the server by project and run
+        Deletes an evaluation from the server by project and run names.
 
         Args:
             project_name (str): Name of the project
-
+            eval_run_names (List[str]): List of names of the evaluation runs
 
         Returns:
             bool: Whether the evaluation was successfully deleted
         """
-
-
+        if not eval_run_names:
+            raise ValueError("No evaluation run names provided")
+
+        eval_run_request_body = DeleteEvalRunRequestBody(project_name=project_name,
+                                                         eval_names=eval_run_names,
                                                          judgment_api_key=self.judgment_api_key)
         response = requests.delete(JUDGMENT_EVAL_DELETE_API_URL,
                                    json=eval_run_request_body.model_dump(),
@@ -345,9 +373,11 @@ class JudgmentClient:
                                        "Authorization": f"Bearer {self.judgment_api_key}",
                                        "X-Organization-Id": self.organization_id
                                    })
-        if response.status_code
+        if response.status_code == 404:
+            raise ValueError(f"Eval results not found: {response.json()}")
+        elif response.status_code == 500:
             raise ValueError(f"Error deleting eval results: {response.json()}")
-        return response.json()
+        return bool(response.json())
 
     def delete_project_evals(self, project_name: str) -> bool:
         """
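Putting the new client surface together: `run_evaluation` gains `ignore_errors` and `async_execution`, `a_run_evaluation` is a thin wrapper that forces the async path, and `delete_eval` now deletes by a list of run names. The sketch below is a hedged usage example; only `judgeval.judgment_client.JudgmentClient` is taken directly from the module layout above, while the `Example` and scorer import paths are assumptions.

```python
# Hedged usage sketch; Example/scorer import paths are assumptions for illustration.
from judgeval.judgment_client import JudgmentClient
from judgeval.data import Example                # assumed public path
from judgeval.scorers import FaithfulnessScorer  # assumed public path

client = JudgmentClient()  # reads JUDGMENT_API_KEY / JUDGMENT_ORG_ID from the environment

example = Example(input="What is the capital of France?", actual_output="Paris.")

# Queue the run server-side instead of waiting for results locally.
client.run_evaluation(
    examples=[example],
    scorers=[FaithfulnessScorer(threshold=0.7)],
    model="gpt-4o-mini",
    project_name="demo_project",
    eval_run_name="demo_run",
    ignore_errors=True,       # new: scorer errors are handled instead of raised
    async_execution=True,     # new: push the run onto the eval queue and return
)

# Evaluation runs can now be deleted in bulk, by name.
client.delete_eval("demo_project", ["demo_run", "old_run"])
```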
judgeval/rules.py
CHANGED
@@ -17,15 +17,6 @@ class AlertStatus(str, Enum):
     TRIGGERED = "triggered"
     NOT_TRIGGERED = "not_triggered"
 
-class Operator(str, Enum):
-    """Comparison operators for conditions."""
-    GT = ">"
-    GTE = ">="
-    LT = "<"
-    LTE = "<="
-    EQ = "=="
-    NEQ = "!="
-
 class Condition(BaseModel):
     """
     A single metric condition.
@@ -33,15 +24,13 @@ class Condition(BaseModel):
     Example:
     {
         "metric": FaithfulnessScorer(threshold=0.7) # Must be a scorer object: APIJudgmentScorer, JudgevalScorer, or ScorerWrapper
-        "operator": ">=",
-        "threshold": 0.7
     }
+
+    The Condition class uses the scorer's threshold and success function internally.
     """
     model_config = ConfigDict(arbitrary_types_allowed=True)
 
-    metric: Union[APIJudgmentScorer, JudgevalScorer, ScorerWrapper]
-    operator: Operator
-    threshold: float
+    metric: Union[APIJudgmentScorer, JudgevalScorer, ScorerWrapper]
 
     @property
     def metric_name(self) -> str:
@@ -58,22 +47,60 @@ class Condition(BaseModel):
         # Fallback to string representation
         return str(self.metric)
 
+    @property
+    def threshold(self) -> float:
+        """Get the threshold from the metric."""
+        return self.metric.threshold if hasattr(self.metric, 'threshold') else 0.5
+
     def evaluate(self, value: float) -> bool:
-        """
-
-
-
-
-
-
-
-
-
-
-
+        """
+        Evaluate the condition against a value.
+        Returns True if the condition passes, False otherwise.
+        Uses the scorer's success check function if available.
+        """
+        # Store the value in the scorer
+        if hasattr(self.metric, 'score'):
+            self.metric.score = value
+
+        # Use the scorer's success check function if available
+        if hasattr(self.metric, 'success_check'):
+            return self.metric.success_check()
+        elif hasattr(self.metric, '_success_check'):
+            return self.metric._success_check()
         else:
-
+            # Fallback to default comparison (greater than or equal)
+            return value >= self.threshold if self.threshold is not None else False
+
+class NotificationConfig(BaseModel):
+    """
+    Configuration for notifications when a rule is triggered.
+
+    Example:
+    {
+        "enabled": true,
+        "communication_methods": ["email", "broadcast_slack", "broadcast_email"],
+        "email_addresses": ["user1@example.com", "user2@example.com"],
+        "send_at": 1632150000  # Unix timestamp (specific date/time)
+    }
+
+    Communication Methods:
+    - "email": Send emails to specified email addresses
+    - "broadcast_slack": Send broadcast notifications to all configured Slack channels
+    - "broadcast_email": Send broadcast emails to all organization emails
+    """
+    enabled: bool = True
+    communication_methods: List[str] = []
+    email_addresses: Optional[List[str]] = None
+    send_at: Optional[int] = None  # Unix timestamp for scheduled notifications
+
+    def model_dump(self, **kwargs):
+        """Convert the NotificationConfig to a dictionary for JSON serialization."""
+        return {
+            "enabled": self.enabled,
+            "communication_methods": self.communication_methods,
+            "email_addresses": self.email_addresses,
+            "send_at": self.send_at
+        }
 
 class Rule(BaseModel):
     """
@@ -85,10 +112,15 @@ class Rule(BaseModel):
     "name": "Quality Check",
     "description": "Check if quality metrics meet thresholds",
     "conditions": [
-        {"metric": FaithfulnessScorer(threshold=0.7)
-        {"metric": AnswerRelevancyScorer(threshold=0.8)
+        {"metric": FaithfulnessScorer(threshold=0.7)},
+        {"metric": AnswerRelevancyScorer(threshold=0.8)}
     ],
-    "combine_type": "all"  # "all" or "any"
+    "combine_type": "all",  # "all" or "any"
+    "notification": {
+        "enabled": true,
+        "communication_methods": ["slack", "email"],
+        "email_addresses": ["user1@example.com", "user2@example.com"]
+    }
     }
     """
     rule_id: str = Field(default_factory=lambda: str(uuid.uuid4()))  # Random UUID string as default value
@@ -96,6 +128,8 @@ class Rule(BaseModel):
     description: Optional[str] = None
     conditions: List[Condition]
     combine_type: str = Field(..., pattern="^(all|any)$")  # all = AND, any = OR
+    notification: Optional[NotificationConfig] = None  # Configuration for notifications
+
 
     def model_dump(self, **kwargs):
         """
@@ -168,7 +202,6 @@ class Rule(BaseModel):
             raise ValueError(f"combine_type must be 'all' or 'any', got: {v}")
         return v
 
-
 class AlertResult(BaseModel):
     """
     Result of evaluating a rule.
@@ -185,6 +218,11 @@ class AlertResult(BaseModel):
         "metadata": {
             "example_id": "example_123",
             "timestamp": "20240321_123456"
+        },
+        "notification": {
+            "enabled": true,
+            "communication_methods": ["slack", "email"],
+            "email_addresses": ["user1@example.com", "user2@example.com"]
         }
     }
     """
@@ -193,6 +231,7 @@ class AlertResult(BaseModel):
     rule_name: str
     conditions_result: List[Dict[str, Any]]
     metadata: Dict[str, Any] = {}
+    notification: Optional[NotificationConfig] = None  # Configuration for notifications
 
     @property
     def example_id(self) -> Optional[str]:
@@ -206,36 +245,105 @@ class AlertResult(BaseModel):
 
 class RulesEngine:
     """
-    Engine for evaluating rules
+    Engine for creating and evaluating rules against metrics.
 
-    Example
+    Example:
+    ```python
+    # Define rules
     rules = {
-        "
+        "1": Rule(
             name="Quality Check",
+            description="Check if quality metrics meet thresholds",
             conditions=[
-                Condition(metric=FaithfulnessScorer(threshold=0.7)
-                Condition(metric=AnswerRelevancyScorer(threshold=0.8)
+                Condition(metric=FaithfulnessScorer(threshold=0.7)),
+                Condition(metric=AnswerRelevancyScorer(threshold=0.8))
            ],
            combine_type="all"
        )
    }
 
+    # Create rules engine
    engine = RulesEngine(rules)
-
-
-
-    "
-
+
+    # Configure notifications
+    engine.configure_notification(
+        rule_id="1",
+        enabled=True,
+        communication_methods=["slack", "email"],
+        email_addresses=["user@example.com"]
+    )
+
+    # Evaluate rules
+    scores = {"faithfulness": 0.65, "relevancy": 0.85}
+    results = engine.evaluate_rules(scores, {"example_id": "example_123"})
+    ```
     """
 
     def __init__(self, rules: Dict[str, Rule]):
         """
-        Initialize the
+        Initialize the rules engine.
 
         Args:
-            rules: Dictionary mapping rule IDs to
+            rules: Dictionary mapping rule IDs to Rule objects
         """
         self.rules = rules
+
+    def configure_notification(self, rule_id: str, enabled: bool = True,
+                               communication_methods: List[str] = None,
+                               email_addresses: List[str] = None,
+                               send_at: Optional[int] = None) -> None:
+        """
+        Configure notification settings for a specific rule.
+
+        Args:
+            rule_id: ID of the rule to configure notifications for
+            enabled: Whether notifications are enabled for this rule
+            communication_methods: List of notification methods (e.g., ["slack", "email"])
+            email_addresses: List of email addresses to send notifications to
+            send_at: Optional Unix timestamp for when to send the notification
+        """
+        if rule_id not in self.rules:
+            raise ValueError(f"Rule ID '{rule_id}' not found")
+
+        rule = self.rules[rule_id]
+
+        # Create notification configuration if it doesn't exist
+        if rule.notification is None:
+            rule.notification = NotificationConfig()
+
+        # Set notification parameters
+        rule.notification.enabled = enabled
+
+        if communication_methods is not None:
+            rule.notification.communication_methods = communication_methods
+
+        if email_addresses is not None:
+            rule.notification.email_addresses = email_addresses
+
+        if send_at is not None:
+            rule.notification.send_at = send_at
+
+    def configure_all_notifications(self, enabled: bool = True,
+                                    communication_methods: List[str] = None,
+                                    email_addresses: List[str] = None,
+                                    send_at: Optional[int] = None) -> None:
+        """
+        Configure notification settings for all rules.
+
+        Args:
+            enabled: Whether notifications are enabled
+            communication_methods: List of notification methods (e.g., ["slack", "email"])
+            email_addresses: List of email addresses to send notifications to
+            send_at: Optional Unix timestamp for when to send the notification
+        """
+        for rule_id, rule in self.rules.items():
+            self.configure_notification(
+                rule_id=rule_id,
+                enabled=enabled,
+                communication_methods=communication_methods,
+                email_addresses=email_addresses,
+                send_at=send_at
+            )
 
     def evaluate_rules(self, scores: Dict[str, float], example_metadata: Optional[Dict[str, Any]] = None) -> Dict[str, AlertResult]:
         """
@@ -257,13 +365,13 @@ class RulesEngine:
             # Get the metric name for lookup
             metric_name = condition.metric_name
             value = scores.get(metric_name)
+
             if value is None:
                 # Skip this condition instead of evaluating it as false
                 condition_results.append({
                     "metric": metric_name,
                     "value": None,
                     "threshold": condition.threshold,
-                    "operator": condition.operator,
                     "passed": None,  # Using None to indicate the condition was skipped
                     "skipped": True  # Add a flag to indicate this condition was skipped
                 })
@@ -274,7 +382,6 @@ class RulesEngine:
                     "metric": metric_name,
                     "value": value,
                     "threshold": condition.threshold,
-                    "operator": condition.operator,
                     "passed": passed,
                     "skipped": False  # Indicate this condition was evaluated
                 })
@@ -285,23 +392,36 @@ class RulesEngine:
                 # If all conditions were skipped, the rule doesn't trigger
                 triggered = False
             else:
-
+                if rule.combine_type == "all":
+                    # For "all" combine_type:
+                    # - All evaluated conditions must pass
+                    # - All conditions must have been evaluated (none skipped)
+                    all_conditions_passed = all(passed_conditions)
+                    all_conditions_evaluated = len(passed_conditions) == len(rule.conditions)
+                    triggered = all_conditions_passed and all_conditions_evaluated
+                else:
+                    # For "any" combine_type, at least one condition must pass
+                    triggered = any(passed_conditions)
 
             # Create alert result with example metadata
+            notification_config = None
+            if triggered and rule.notification:
+                # If rule has a notification config and the alert is triggered, include it in the result
+                notification_config = rule.notification
+
+            # Set the alert status based on whether the rule was triggered
+            status = AlertStatus.TRIGGERED if triggered else AlertStatus.NOT_TRIGGERED
+
+            # Create the alert result
            alert_result = AlertResult(
-                status=
-                rule_id=rule.rule_id,
+                status=status,
+                rule_id=rule.rule_id,
                rule_name=rule.name,
-                conditions_result=condition_results
+                conditions_result=condition_results,
+                notification=notification_config,
+                metadata=example_metadata or {}
            )
 
-            # Add example metadata if provided
-            if example_metadata:
-                if "example_id" in example_metadata:
-                    alert_result.metadata["example_id"] = example_metadata["example_id"]
-                if "timestamp" in example_metadata:
-                    alert_result.metadata["timestamp"] = example_metadata["timestamp"]
-
            results[rule_id] = alert_result
 
        return results
@@ -376,7 +496,4 @@ class RulesEngine:
         )
         end_time = time.perf_counter()
 
-        # Could log performance metrics here if needed
-        # debug(f"Rule evaluation for example {example_id} took {end_time - start_time:.4f} seconds")
-
         return (example_id, rule_results)
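A usage sketch assembled from the docstrings in this diff: a `Condition` now carries only a scorer (the threshold and pass/fail check come from the scorer itself, replacing the removed `Operator` enum), and notifications are configured per rule. The `judgeval.rules` import matches the module path above; the scorer import path is an assumption.

```python
# Usage sketch based on the docstrings in this file; scorer import path is an assumption.
from judgeval.rules import Rule, Condition, RulesEngine
from judgeval.scorers import FaithfulnessScorer, AnswerRelevancyScorer  # assumed path

rules = {
    "1": Rule(
        name="Quality Check",
        description="Check if quality metrics meet thresholds",
        conditions=[
            Condition(metric=FaithfulnessScorer(threshold=0.7)),
            Condition(metric=AnswerRelevancyScorer(threshold=0.8)),
        ],
        combine_type="all",  # every evaluated condition must pass, and none may be skipped
    )
}

engine = RulesEngine(rules)
engine.configure_notification(
    rule_id="1",
    enabled=True,
    communication_methods=["slack", "email"],
    email_addresses=["user@example.com"],
)

results = engine.evaluate_rules(
    {"faithfulness": 0.65, "relevancy": 0.85},
    {"example_id": "example_123"},
)
alert = results["1"]
print(alert.status, alert.notification)  # rule not triggered here, so no notification payload is attached
```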
judgeval/run_evaluation.py
CHANGED
@@ -23,17 +23,35 @@ from judgeval.constants import (
     ROOT_API,
     JUDGMENT_EVAL_API_URL,
     JUDGMENT_EVAL_LOG_API_URL,
-    MAX_CONCURRENT_EVALUATIONS
+    MAX_CONCURRENT_EVALUATIONS,
+    JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL
 )
 from judgeval.common.exceptions import JudgmentAPIError
-from judgeval.evaluation_run import EvaluationRun
 from judgeval.common.logger import (
     debug,
     info,
     error,
     example_logging_context
 )
+from judgeval.evaluation_run import EvaluationRun
+
 
+def send_to_rabbitmq(evaluation_run: EvaluationRun) -> None:
+    """
+    Sends an evaluation run to the RabbitMQ evaluation queue.
+    """
+    payload = evaluation_run.model_dump(warnings=False)
+    response = requests.post(
+        JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL,
+        headers={
+            "Content-Type": "application/json",
+            "Authorization": f"Bearer {evaluation_run.judgment_api_key}",
+            "X-Organization-Id": evaluation_run.organization_id
+        },
+        json=payload,
+        verify=True
+    )
+    return response.json()
 
 def execute_api_eval(evaluation_run: EvaluationRun) -> List[Dict]:
     """
@@ -51,13 +69,15 @@ def execute_api_eval(evaluation_run: EvaluationRun) -> List[Dict]:
         # submit API request to execute evals
         payload = evaluation_run.model_dump(warnings=False)
         response = requests.post(
-            JUDGMENT_EVAL_API_URL,
-
-
-
-
-
+            JUDGMENT_EVAL_API_URL,
+            headers={
+                "Content-Type": "application/json",
+                "Authorization": f"Bearer {evaluation_run.judgment_api_key}",
+                "X-Organization-Id": evaluation_run.organization_id
+            },
+            json=payload,
+            verify=True
+        )
         response_data = response.json()
     except Exception as e:
         error(f"Error: {e}")
@@ -281,13 +301,14 @@ def check_examples(examples: List[Example], scorers: List[APIJudgmentScorer]) ->
                 # Example ID (usually random UUID) does not provide any helpful information for the user but printing the entire example is overdoing it
                 print(f"WARNING: Example {example.example_id} is missing the following parameters: {missing_params} for scorer {scorer.score_type.value}")
 
-
-def run_eval(evaluation_run: EvaluationRun, override: bool = False) -> List[ScoringResult]:
+def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_errors: bool = True, async_execution: bool = False) -> List[ScoringResult]:
     """
     Executes an evaluation of `Example`s using one or more `Scorer`s
 
     Args:
         evaluation_run (EvaluationRun): Stores example and evaluation together for running
+        override (bool, optional): Whether to override existing evaluation run with same name. Defaults to False.
+        ignore_errors (bool, optional): Whether to ignore scorer errors during evaluation. Defaults to True.
 
     Args:
         project_name (str): The name of the project the evaluation results belong to
@@ -354,101 +375,117 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False) -> List[Scor
 
     api_results: List[ScoringResult] = []
     local_results: List[ScoringResult] = []
-
-
-    if judgment_scorers:
+
+    if async_execution:
         check_examples(evaluation_run.examples, evaluation_run.scorers)
-        info("Starting
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        info("Starting async evaluation")
+        payload = evaluation_run.model_dump(warnings=False)
+        requests.post(
+            JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL,
+            headers={
+                "Content-Type": "application/json",
+                "Authorization": f"Bearer {evaluation_run.judgment_api_key}",
+                "X-Organization-Id": evaluation_run.organization_id
+            },
+            json=payload,
+            verify=True
+        )
+        print("Successfully added evaluation to queue")
+    else:
+        if judgment_scorers:
+            # Execute evaluation using Judgment API
+            check_examples(evaluation_run.examples, evaluation_run.scorers)
+            info("Starting API evaluation")
+            debug(f"Creating API evaluation run with {len(judgment_scorers)} scorers")
+            try:  # execute an EvaluationRun with just JudgmentScorers
+                api_evaluation_run: EvaluationRun = EvaluationRun(
+                    eval_name=evaluation_run.eval_name,
+                    project_name=evaluation_run.project_name,
+                    examples=evaluation_run.examples,
+                    scorers=judgment_scorers,
+                    model=evaluation_run.model,
+                    aggregator=evaluation_run.aggregator,
+                    metadata=evaluation_run.metadata,
+                    judgment_api_key=evaluation_run.judgment_api_key,
+                    organization_id=evaluation_run.organization_id,
+                    log_results=evaluation_run.log_results,
+                    rules=evaluation_run.rules
+                )
+                debug("Sending request to Judgment API")
+                response_data: List[Dict] = run_with_spinner("Running Evaluation: ", execute_api_eval, api_evaluation_run)
+                info(f"Received {len(response_data['results'])} results from API")
+            except JudgmentAPIError as e:
+                error(f"An error occurred while executing the Judgment API request: {str(e)}")
+                raise JudgmentAPIError(f"An error occurred while executing the Judgment API request: {str(e)}")
+            except ValueError as e:
+                raise ValueError(f"Please check your EvaluationRun object, one or more fields are invalid: {str(e)}")
+
+            # Convert the response data to `ScoringResult` objects
+            debug("Processing API results")
+            for idx, result in enumerate(response_data["results"]):
+                with example_logging_context(evaluation_run.examples[idx].timestamp, evaluation_run.examples[idx].example_id):
+                    for scorer in judgment_scorers:
+                        debug(f"Processing API result for example {idx} and scorer {scorer.score_type}")
+                # filter for key-value pairs that are used to initialize ScoringResult
+                # there may be some stuff in here that doesn't belong in ScoringResult
+                # TODO: come back and refactor this to have ScoringResult take in **kwargs
+                filtered_result = {k: v for k, v in result.items() if k in ScoringResult.__annotations__}
+
+                # Convert scorers_data dicts to ScorerData objects
+                if "scorers_data" in filtered_result and filtered_result["scorers_data"]:
+                    filtered_result["scorers_data"] = [
+                        ScorerData(**scorer_dict)
+                        for scorer_dict in filtered_result["scorers_data"]
+                    ]
+
+                api_results.append(ScoringResult(**filtered_result))
+        # Run local evals
+        if local_scorers:  # List[JudgevalScorer]
+            # We should be removing local scorers soon
+            info("Starting local evaluation")
+            for example in evaluation_run.examples:
+                with example_logging_context(example.timestamp, example.example_id):
+                    debug(f"Processing example {example.example_id}: {example.input}")
+
+            results: List[ScoringResult] = asyncio.run(
+                a_execute_scoring(
+                    evaluation_run.examples,
+                    local_scorers,
+                    model=evaluation_run.model,
+                    ignore_errors=ignore_errors,
+                    skip_on_missing_params=True,
+                    show_indicator=True,
+                    _use_bar_indicator=True,
+                    throttle_value=0,
+                    max_concurrent=MAX_CONCURRENT_EVALUATIONS,
+                )
             )
-
-
-
-
-
-
-
-
+            local_results = results
+            info(f"Local evaluation complete with {len(local_results)} results")
+        # Aggregate the ScorerData from the API and local evaluations
+        debug("Merging API and local results")
+        merged_results: List[ScoringResult] = merge_results(api_results, local_results)
+        merged_results = check_missing_scorer_data(merged_results)
+
+        info(f"Successfully merged {len(merged_results)} results")
+
+        # Evaluate rules against local scoring results if rules exist (this cant be done just yet)
+        # if evaluation_run.rules and merged_results:
+        #     run_rules(
+        #         local_results=merged_results,
+        #         rules=evaluation_run.rules,
+        #         judgment_api_key=evaluation_run.judgment_api_key,
+        #         organization_id=evaluation_run.organization_id
+        #     )
 
-
-
-
-
-
-
-
-
-            # TODO: come back and refactor this to have ScoringResult take in **kwargs
-            filtered_result = {k: v for k, v in result.items() if k in ScoringResult.__annotations__}
-
-            # Convert scorers_data dicts to ScorerData objects
-            if "scorers_data" in filtered_result and filtered_result["scorers_data"]:
-                filtered_result["scorers_data"] = [
-                    ScorerData(**scorer_dict)
-                    for scorer_dict in filtered_result["scorers_data"]
-                ]
-
-            api_results.append(ScoringResult(**filtered_result))
-    # Run local evals
-    if local_scorers: # List[JudgevalScorer]
-        # We should be removing local scorers soon
-        info("Starting local evaluation")
-        for example in evaluation_run.examples:
-            with example_logging_context(example.timestamp, example.example_id):
-                debug(f"Processing example {example.example_id}: {example.input}")
-
-        results: List[ScoringResult] = asyncio.run(
-            a_execute_scoring(
-                evaluation_run.examples,
-                local_scorers,
-                model=evaluation_run.model,
-                ignore_errors=True,
-                skip_on_missing_params=True,
-                show_indicator=True,
-                _use_bar_indicator=True,
-                throttle_value=0,
-                max_concurrent=MAX_CONCURRENT_EVALUATIONS,
-            )
-        )
-        local_results = results
-        info(f"Local evaluation complete with {len(local_results)} results")
-    # Aggregate the ScorerData from the API and local evaluations
-    debug("Merging API and local results")
-    merged_results: List[ScoringResult] = merge_results(api_results, local_results)
-    merged_results = check_missing_scorer_data(merged_results)
-
-    info(f"Successfully merged {len(merged_results)} results")
-
-    # Evaluate rules against local scoring results if rules exist (this cant be done just yet)
-    # if evaluation_run.rules and merged_results:
-    #     run_rules(
-    #         local_results=merged_results,
-    #         rules=evaluation_run.rules,
-    #         judgment_api_key=evaluation_run.judgment_api_key,
-    #         organization_id=evaluation_run.organization_id
-    #     )
-
-    if evaluation_run.log_results:
-        pretty_str = run_with_spinner("Logging Results: ", log_evaluation_results, merged_results, evaluation_run)
-        rprint(pretty_str)
-
-    for i, result in enumerate(merged_results):
-        if not result.scorers_data: # none of the scorers could be executed on this example
-            info(f"None of the scorers could be executed on example {i}. This is usually because the Example is missing the fields needed by the scorers. Try checking that the Example has the necessary fields for your scorers.")
-    return merged_results
+        if evaluation_run.log_results:
+            pretty_str = run_with_spinner("Logging Results: ", log_evaluation_results, merged_results, evaluation_run)
+            rprint(pretty_str)
+
+        for i, result in enumerate(merged_results):
+            if not result.scorers_data:  # none of the scorers could be executed on this example
+                info(f"None of the scorers could be executed on example {i}. This is usually because the Example is missing the fields needed by the scorers. Try checking that the Example has the necessary fields for your scorers.")
+        return merged_results
 
 def assert_test(scoring_results: List[ScoringResult]) -> None:
     """
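When `async_execution=True`, the run is not scored locally at all; the serialized `EvaluationRun` is posted to the run-eval queue endpoint with bearer auth and an organization header. Below is a sketch of that request shape. The header names and the URL constant are taken from the diff; the credentials are placeholders and the payload is a trimmed stand-in (a real call sends `EvaluationRun.model_dump(warnings=False)`).

```python
# Sketch of the queue submission performed for async_execution=True (placeholder credentials).
import requests
from judgeval.constants import JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL

payload = {"eval_name": "demo_run", "project_name": "demo_project"}  # illustrative subset only

response = requests.post(
    JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL,
    headers={
        "Content-Type": "application/json",
        "Authorization": "Bearer <JUDGMENT_API_KEY>",   # placeholder
        "X-Organization-Id": "<JUDGMENT_ORG_ID>",       # placeholder
    },
    json=payload,
    verify=True,
)
response.raise_for_status()
```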
judgeval/scorers/score.py
CHANGED
@@ -274,15 +274,16 @@ async def a_execute_scoring(
     semaphore = asyncio.Semaphore(max_concurrent)
 
     async def execute_with_semaphore(func: Callable, *args, **kwargs):
-
-
+        async with semaphore:
+            try:
                 return await func(*args, **kwargs)
-
-
-
-
-
-
+            except Exception as e:
+                print(f"Error executing function: {e}")
+                if kwargs.get('ignore_errors', False):
+                    # Simply return None when ignoring errors, as expected by the test
+                    return None
+                # If we're not ignoring errors, propagate the exception
+                raise
 
     if verbose_mode is not None:
         for scorer in scorers:
@@ -391,6 +392,7 @@ async def a_eval_examples_helper(
     Returns:
         None
     """
+
     show_metrics_indicator = show_indicator and not _use_bar_indicator
 
     for scorer in scorers:
@@ -416,12 +418,15 @@ async def a_eval_examples_helper(
             continue
         scorer_data = create_scorer_data(scorer)  # Fetch scorer data from completed scorer evaluation
         process_example.update_scorer_data(scorer_data)  # Update process example with the same scorer data
-
+
     test_end_time = time.perf_counter()
     run_duration = test_end_time - scoring_start_time
 
     process_example.update_run_duration(run_duration)  # Update process example with execution time duration
-
-
+
+    # Generate the scoring result and store it safely (to avoid race conditions)
+    result = generate_scoring_result(process_example)
+    scoring_results[score_index] = result
+
     if pbar is not None:
         pbar.update(1)
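The semaphore wrapper above bounds scorer concurrency and, when `ignore_errors` is set, converts failures into `None` results instead of aborting the whole batch. Here is a self-contained asyncio sketch of the same pattern, with no judgeval imports; `flaky` and the literal values are made up for the demonstration.

```python
# Standalone sketch of the bounded-concurrency, error-tolerant execution pattern.
import asyncio

async def flaky(i: int) -> int:
    if i == 2:
        raise RuntimeError("scorer failed")
    await asyncio.sleep(0.01)
    return i * i

async def main(max_concurrent: int = 2, ignore_errors: bool = True):
    semaphore = asyncio.Semaphore(max_concurrent)

    async def execute_with_semaphore(func, *args, **kwargs):
        async with semaphore:                     # at most max_concurrent tasks run at once
            try:
                return await func(*args, **kwargs)
            except Exception as exc:
                print(f"Error executing function: {exc}")
                if ignore_errors:
                    return None                   # swallow the failure, keep the batch going
                raise

    tasks = [execute_with_semaphore(flaky, i) for i in range(5)]
    return await asyncio.gather(*tasks)

print(asyncio.run(main()))  # [0, 1, None, 9, 16]
```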
judgeval/utils/alerts.py
CHANGED
@@ -40,4 +40,35 @@ class AlertResult(BaseModel):
     @property
     def conditions_results(self) -> List[Dict[str, Any]]:
         """Backwards compatibility property for the conditions_result field"""
-        return self.conditions_result
+        return self.conditions_result
+
+    def model_dump(self, **kwargs):
+        """
+        Convert the AlertResult to a dictionary for JSON serialization.
+
+        Args:
+            **kwargs: Additional arguments to pass to Pydantic's model_dump
+
+        Returns:
+            dict: Dictionary representation of the AlertResult
+        """
+        data = super().model_dump(**kwargs) if hasattr(super(), "model_dump") else super().dict(**kwargs)
+
+        # Handle the NotificationConfig object if it exists
+        if hasattr(self, "notification") and self.notification is not None:
+            if hasattr(self.notification, "model_dump"):
+                data["notification"] = self.notification.model_dump()
+            elif hasattr(self.notification, "dict"):
+                data["notification"] = self.notification.dict()
+            else:
+                # Manually convert the notification to a dictionary
+                notif = self.notification
+                data["notification"] = {
+                    "enabled": notif.enabled,
+                    "communication_methods": notif.communication_methods,
+                    "email_addresses": notif.email_addresses,
+                    "slack_channels": getattr(notif, "slack_channels", []),
+                    "send_at": notif.send_at
+                }
+
+        return data
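The new `model_dump` defends against both pydantic v1 and v2 objects and serializes the nested notification config explicitly. Below is a standalone sketch of that fallback pattern using small stand-in models (not the judgeval classes), so the behaviour can be run and inspected in isolation.

```python
# Standalone sketch of the defensive serialization pattern, with stand-in models.
from typing import List, Optional
from pydantic import BaseModel

class Notification(BaseModel):
    enabled: bool = True
    email_addresses: Optional[List[str]] = None

class Alert(BaseModel):
    rule_name: str
    notification: Optional[Notification] = None

    def model_dump(self, **kwargs):
        # Prefer pydantic v2's model_dump, fall back to v1's dict(), then serialize
        # the nested notification explicitly so it always round-trips as a plain dict.
        data = super().model_dump(**kwargs) if hasattr(super(), "model_dump") else super().dict(**kwargs)
        if self.notification is not None:
            notif = self.notification
            data["notification"] = (
                notif.model_dump() if hasattr(notif, "model_dump") else notif.dict()
            )
        return data

alert = Alert(rule_name="Quality Check", notification=Notification(email_addresses=["user@example.com"]))
print(alert.model_dump())
```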
{judgeval-0.0.24.dist-info → judgeval-0.0.26.dist-info}/RECORD
CHANGED
@@ -1,14 +1,14 @@
 judgeval/__init__.py,sha256=dtXxsCmI4eEsZdGSUMy8P_pA0bc2-OSGAgb2C__yJoA,252
 judgeval/clients.py,sha256=6VQmEqmfCngUdS2MuPBIpHvtDFqOENm8-_BmMvjLyRQ,944
-judgeval/constants.py,sha256=
+judgeval/constants.py,sha256=iTUro5SdXcYX00W18l32zL_EEEqHf5OT9uA5yZAme_s,5158
 judgeval/evaluation_run.py,sha256=RgJD60lJsunNQzObjo7iXnAzXWgubCLOAAuuamAAuoI,6354
-judgeval/judgment_client.py,sha256=
-judgeval/rules.py,sha256=
-judgeval/run_evaluation.py,sha256=
+judgeval/judgment_client.py,sha256=2z134M0GeW3CdOZDx688UXmqJUlU31hlcFlLwUhF_Tg,25429
+judgeval/rules.py,sha256=B0ZL0pn72D4Jnlr0zMQ6CPHi7D8AQQRariXCVsiCMiI,20542
+judgeval/run_evaluation.py,sha256=8FZ-shJ0120iTuT2S1rXzmVcoIHPsFPb0THTGOtKoHM,25772
 judgeval/common/__init__.py,sha256=7d24BRxtncpMj3AAJCj8RS7TqgjXmW777HVZH6-3sBs,289
 judgeval/common/exceptions.py,sha256=U-TxHLn7oVMezsMuoYouNDb2XuS8RCggfntYf5_6u4E,565
 judgeval/common/logger.py,sha256=KO75wWXCxhUHUMvLaTU31ZzOk6tkZBa7heQ7y0f-zFE,6062
-judgeval/common/tracer.py,sha256=
+judgeval/common/tracer.py,sha256=Z87Q3pQrtfHYvE1vsTMdIUfR-iz_IM8dqvW9VwVdtMQ,42434
 judgeval/common/utils.py,sha256=LUQV5JfDr6wj7xHAJoNq-gofNZ6mjXbeKrGKzBME1KM,33533
 judgeval/data/__init__.py,sha256=YferxwmUqoBi18hrdgro0BD0h4pt20LAqISeUzGMcVU,474
 judgeval/data/api_example.py,sha256=dzkrQ0xno08y6qNfqL2djXbapUyc2B2aQ5iANn0o4CY,3667
@@ -16,9 +16,9 @@ judgeval/data/example.py,sha256=BhGBhamFWgH6wtvrRYM8dGtDfXh-cDxDhtNL5Gbdz_M,5892
 judgeval/data/result.py,sha256=4fgjKtUmT3br7K6fkRiNIxTGKUuwMeGyRLqzkpxwXKE,4436
 judgeval/data/scorer_data.py,sha256=JVlaTx1EP2jw2gh3Vgx1CSEsvIFABAN26IquKyxwiJQ,3273
 judgeval/data/datasets/__init__.py,sha256=IdNKhQv9yYZ_op0rdBacrFaFVmiiYQ3JTzXzxOTsEVQ,176
-judgeval/data/datasets/dataset.py,sha256=
+judgeval/data/datasets/dataset.py,sha256=AFYjksV_wXx5CqFYJsl3aN8yZ6hC50O1myRuOJ8s8_E,12867
 judgeval/data/datasets/eval_dataset_client.py,sha256=B4bRy0Di2oFlaBbvp4_hRx2g_9e6Cs0y3ZUT9reMyhw,10926
-judgeval/integrations/langgraph.py,sha256=
+judgeval/integrations/langgraph.py,sha256=fGDZOTlVbxTO4ErC-m9OSg3h-RkOIIWXCfhjgkKRh4E,11187
 judgeval/judges/__init__.py,sha256=6X7VSwrwsdxGBNxCyapVRWGghhKOy3MVxFNMQ62kCXM,308
 judgeval/judges/base_judge.py,sha256=ch_S7uBB7lyv44Lf1d7mIGFpveOO58zOkkpImKgd9_4,994
 judgeval/judges/litellm_judge.py,sha256=EIL58Teptv8DzZUO3yP2RDQCDq-aoBB6HPZzPdK6KTg,2424
@@ -31,7 +31,7 @@ judgeval/scorers/base_scorer.py,sha256=xdUlY3CnLdCQ1Z5iUeY22Bim5v-OQruZmaVF_4Y1m
 judgeval/scorers/exceptions.py,sha256=eGW5CuJgZ5YJBFrE4FHDSF651PO1dKAZ379mJ8gOsfo,178
 judgeval/scorers/judgeval_scorer.py,sha256=jq_rzfTG0XBTuLCaa6TlaK4YcT-LlgsO1LEm6hpOYdg,6601
 judgeval/scorers/prompt_scorer.py,sha256=PaAs2qRolw1P3_I061Xvk9qzvF4O-JR8g_39RqXnHcM,17728
-judgeval/scorers/score.py,sha256=
+judgeval/scorers/score.py,sha256=PhyAyMkc7KO_DZpFSN1HD_FS3BvdleQPZhYvQkNAdxI,18816
 judgeval/scorers/utils.py,sha256=iHQVTlIANbmCTXz9kTeSdOytgUZ_T74Re61ajqsk_WQ,6827
 judgeval/scorers/judgeval_scorers/__init__.py,sha256=xFRb62sp4JmBUSeuAB_pC_7kEGp-lGdqCRIu9--Bbdg,5992
 judgeval/scorers/judgeval_scorers/api_scorers/__init__.py,sha256=mZ6b_5Dl04k3PaG24ICBajB_j43ody1II1OJhO1DkXo,1648
@@ -86,8 +86,8 @@ judgeval/scorers/judgeval_scorers/local_implementations/summarization/__init__.p
 judgeval/scorers/judgeval_scorers/local_implementations/summarization/prompts.py,sha256=6GnRz2h-6Fwt4sl__0RgQOyo3n3iDO4MNuHWxdu-rrM,10242
 judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarization_scorer.py,sha256=Qk7lwHgRPYeGoxTOyclAh1VfGItfvHJ6l1t7Nk3SWFM,20927
 judgeval/tracer/__init__.py,sha256=wy3DYpH8U_z0GO_K_gOSkK0tTTD-u5eLDo0T5xIBoAc,147
-judgeval/utils/alerts.py,sha256=
-judgeval-0.0.
-judgeval-0.0.
-judgeval-0.0.
-judgeval-0.0.
+judgeval/utils/alerts.py,sha256=O19Xj7DA0YVjl8PWiuH4zfdZeu3yiLVvHfY8ah2wG0g,2759
+judgeval-0.0.26.dist-info/METADATA,sha256=rhTpfY5GRclxtkkXU4RrUj1ckpuxd2xsgF53oQyK6qo,5418
+judgeval-0.0.26.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+judgeval-0.0.26.dist-info/licenses/LICENSE.md,sha256=tKmCg7k5QOmxPK19XMfzim04QiQJPmgIm0pAn55IJwk,11352
+judgeval-0.0.26.dist-info/RECORD,,
{judgeval-0.0.24.dist-info → judgeval-0.0.26.dist-info}/WHEEL
File without changes
{judgeval-0.0.24.dist-info → judgeval-0.0.26.dist-info}/licenses/LICENSE.md
File without changes