judgeval 0.0.40__py3-none-any.whl → 0.0.42__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/common/s3_storage.py +3 -1
- judgeval/common/tracer.py +1079 -139
- judgeval/common/utils.py +6 -2
- judgeval/constants.py +5 -0
- judgeval/data/datasets/dataset.py +12 -6
- judgeval/data/datasets/eval_dataset_client.py +3 -1
- judgeval/data/trace.py +7 -2
- judgeval/integrations/langgraph.py +218 -34
- judgeval/judgment_client.py +9 -1
- judgeval/rules.py +60 -50
- judgeval/run_evaluation.py +53 -29
- judgeval/scorers/judgeval_scorer.py +4 -1
- judgeval/scorers/prompt_scorer.py +3 -0
- judgeval/utils/alerts.py +8 -0
- {judgeval-0.0.40.dist-info → judgeval-0.0.42.dist-info}/METADATA +48 -50
- {judgeval-0.0.40.dist-info → judgeval-0.0.42.dist-info}/RECORD +18 -18
- {judgeval-0.0.40.dist-info → judgeval-0.0.42.dist-info}/WHEEL +0 -0
- {judgeval-0.0.40.dist-info → judgeval-0.0.42.dist-info}/licenses/LICENSE.md +0 -0
judgeval/rules.py
CHANGED
@@ -9,13 +9,13 @@ import asyncio
 from concurrent.futures import ThreadPoolExecutor
 import time
 import uuid
+import os
+import re
+import json
+from datetime import datetime
 
 from judgeval.scorers import APIJudgmentScorer, JudgevalScorer
-
-class AlertStatus(str, Enum):
-    """Status of an alert evaluation."""
-    TRIGGERED = "triggered"
-    NOT_TRIGGERED = "not_triggered"
+from judgeval.utils.alerts import AlertStatus, AlertResult
 
 class Condition(BaseModel):
     """
@@ -68,6 +68,36 @@ class Condition(BaseModel):
         # Fallback to default comparison (greater than or equal)
         return value >= self.threshold if self.threshold is not None else False
 
+class PagerDutyConfig(BaseModel):
+    """
+    Configuration for PagerDuty notifications.
+
+    Attributes:
+        routing_key: PagerDuty integration routing key
+        severity: Severity level (critical, error, warning, info)
+        source: Source of the alert (defaults to "judgeval")
+        component: Optional component that triggered the alert
+        group: Optional logical grouping for the alert
+        class_type: Optional class/type of alert event
+    """
+    routing_key: str
+    severity: str = "error" # critical, error, warning, info
+    source: str = "judgeval"
+    component: Optional[str] = None
+    group: Optional[str] = None
+    class_type: Optional[str] = None
+
+    def model_dump(self, **kwargs):
+        """Convert the PagerDutyConfig to a dictionary for JSON serialization."""
+        return {
+            "routing_key": self.routing_key,
+            "severity": self.severity,
+            "source": self.source,
+            "component": self.component,
+            "group": self.group,
+            "class_type": self.class_type
+        }
+
 class NotificationConfig(BaseModel):
     """
     Configuration for notifications when a rule is triggered.
@@ -75,8 +105,12 @@ class NotificationConfig(BaseModel):
     Example:
         {
             "enabled": true,
-            "communication_methods": ["email", "broadcast_slack", "broadcast_email"],
+            "communication_methods": ["email", "broadcast_slack", "broadcast_email", "pagerduty"],
             "email_addresses": ["user1@example.com", "user2@example.com"],
+            "pagerduty_config": {
+                "routing_key": "R0ABCD1234567890123456789",
+                "severity": "error"
+            },
             "send_at": 1632150000 # Unix timestamp (specific date/time)
         }
 
@@ -84,10 +118,12 @@ class NotificationConfig(BaseModel):
     - "email": Send emails to specified email addresses
    - "broadcast_slack": Send broadcast notifications to all configured Slack channels
     - "broadcast_email": Send broadcast emails to all organization emails
+    - "pagerduty": Send alerts to PagerDuty using the configured routing key
     """
     enabled: bool = True
     communication_methods: List[str] = []
     email_addresses: Optional[List[str]] = None
+    pagerduty_config: Optional[PagerDutyConfig] = None
     send_at: Optional[int] = None # Unix timestamp for scheduled notifications
 
     def model_dump(self, **kwargs):
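The new `pagerduty_config` field slots into the existing notification flow. A minimal sketch of wiring it up (the `Rule`/`Condition` constructor arguments and the scorer shown are illustrative assumptions, not part of this diff; only the `PagerDutyConfig` and `NotificationConfig` fields come from the change):

```python
# Illustrative sketch only: Rule/Condition argument names are assumed from this
# module's context; PagerDutyConfig and NotificationConfig fields match the diff.
from judgeval.rules import Rule, Condition, NotificationConfig, PagerDutyConfig
from judgeval.scorers import FaithfulnessScorer

rule = Rule(
    name="Quality Check",
    conditions=[Condition(metric=FaithfulnessScorer(threshold=0.7))],
    combine_type="all",
    notification=NotificationConfig(
        enabled=True,
        communication_methods=["email", "pagerduty"],
        email_addresses=["oncall@example.com"],
        pagerduty_config=PagerDutyConfig(
            routing_key="R0ABCD1234567890123456789",  # PagerDuty integration routing key
            severity="error",                         # critical, error, warning, info
        ),
    ),
)
```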
@@ -96,6 +132,7 @@ class NotificationConfig(BaseModel):
             "enabled": self.enabled,
             "communication_methods": self.communication_methods,
             "email_addresses": self.email_addresses,
+            "pagerduty_config": self.pagerduty_config.model_dump() if self.pagerduty_config else None,
             "send_at": self.send_at
         }
 
@@ -144,7 +181,8 @@ class Rule(BaseModel):
             # Create standardized metric representation needed by server API
             metric_data = {
                 "score_type": "",
-                "threshold": 0.0
+                "threshold": 0.0,
+                "name": ""
             }
 
             # First try to use object's own serialization methods
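With the extra key, each serialized condition now carries a display name next to its score type and threshold; the hunk below shows how that name is back-filled. Illustrative shape of the resulting payload (placeholder values):

```python
# Example of the serialized metric entry after this change (placeholder values).
metric_data = {
    "score_type": "faithfulness",
    "threshold": 0.7,
    "name": "FaithfulnessScorer",  # falls back to score_type or str(metric_obj) if unset
}
```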
@@ -182,6 +220,16 @@ class Rule(BaseModel):
                 # Use condition threshold if metric doesn't have one
                 metric_data['threshold'] = self.conditions[i].threshold
 
+            # Make sure name is set
+            if not metric_data.get('name'):
+                if hasattr(metric_obj, '__name__'):
+                    metric_data['name'] = metric_obj.__name__
+                elif hasattr(metric_obj, 'name'):
+                    metric_data['name'] = metric_obj.name
+                else:
+                    # Fallback to score_type if available
+                    metric_data['name'] = metric_data.get('score_type', str(metric_obj))
+
             # Update the condition with our properly serialized metric
             condition["metric"] = metric_data
 
@@ -199,47 +247,6 @@ class Rule(BaseModel):
             raise ValueError(f"combine_type must be 'all' or 'any', got: {v}")
         return v
 
-class AlertResult(BaseModel):
-    """
-    Result of evaluating a rule.
-
-    Example:
-        {
-            "status": "triggered",
-            "rule_name": "Quality Check",
-            "conditions_result": [
-                {"metric": "faithfulness", "value": 0.6, "threshold": 0.7, "passed": False},
-                {"metric": "relevancy", "value": 0.9, "threshold": 0.8, "passed": True}
-            ],
-            "rule_id": "123e4567-e89b-12d3-a456-426614174000",
-            "metadata": {
-                "example_id": "example_123",
-                "timestamp": "20240321_123456"
-            },
-            "notification": {
-                "enabled": true,
-                "communication_methods": ["slack", "email"],
-                "email_addresses": ["user1@example.com", "user2@example.com"]
-            }
-        }
-    """
-    status: AlertStatus
-    rule_id: Optional[str] = None # The unique identifier of the rule
-    rule_name: str
-    conditions_result: List[Dict[str, Any]]
-    metadata: Dict[str, Any] = {}
-    notification: Optional[NotificationConfig] = None # Configuration for notifications
-
-    @property
-    def example_id(self) -> Optional[str]:
-        """Get example_id from metadata for backward compatibility"""
-        return self.metadata.get("example_id")
-
-    @property
-    def timestamp(self) -> Optional[str]:
-        """Get timestamp from metadata for backward compatibility"""
-        return self.metadata.get("timestamp")
-
 class RulesEngine:
     """
     Engine for creating and evaluating rules against metrics.
@@ -406,7 +413,7 @@ class RulesEngine:
         # If rule has a notification config and the alert is triggered, include it in the result
         notification_config = rule.notification
 
-        # Set the alert status based on whether the rule was triggered
+        # Set the alert status based on whether the rule was triggered using proper enum values
         status = AlertStatus.TRIGGERED if triggered else AlertStatus.NOT_TRIGGERED
 
         # Create the alert result
@@ -416,7 +423,10 @@ class RulesEngine:
             rule_name=rule.name,
             conditions_result=condition_results,
             notification=notification_config,
-            metadata=example_metadata or {}
+            metadata=example_metadata or {},
+            combine_type=rule.combine_type,
+            project_id=example_metadata.get("project_id") if example_metadata else None,
+            trace_span_id=example_metadata.get("trace_span_id") if example_metadata else None
         )
 
         results[rule_id] = alert_result
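Net effect in `rules.py`: alert creation now forwards the rule's `combine_type` plus any `project_id`/`trace_span_id` found in `example_metadata`. A sketch of supplying that metadata; the `RulesEngine` constructor and `evaluate_rules` signature shown here are assumptions for illustration and are not confirmed by this diff:

```python
# Hypothetical call: method name and arguments are assumptions; only the
# example_metadata keys are grounded in the hunk above.
engine = RulesEngine(rules={"quality": rule})
alerts = engine.evaluate_rules(
    scores={"faithfulness": 0.62},
    example_metadata={
        "example_id": "example_123",
        "project_id": "proj_abc",      # copied onto AlertResult.project_id
        "trace_span_id": "span_456",   # copied onto AlertResult.trace_span_id
    },
)
```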
judgeval/run_evaluation.py
CHANGED
@@ -1,6 +1,7 @@
 import asyncio
 import requests
 import time
+import json
 import sys
 import itertools
 import threading
@@ -99,9 +100,9 @@ def execute_api_eval(evaluation_run: EvaluationRun) -> List[Dict]:
         raise JudgmentAPIError(error_message)
     return response_data
 
-def execute_api_trace_eval(trace_run: TraceRun) ->
+def execute_api_trace_eval(trace_run: TraceRun) -> Dict:
     """
-    Executes an evaluation of a list of `
+    Executes an evaluation of a list of `Trace`s using one or more `JudgmentScorer`s via the Judgment API.
     """
 
     try:
@@ -145,46 +146,47 @@ def merge_results(api_results: List[ScoringResult], local_results: List[ScoringR
     """
     # No merge required
     if not local_results and api_results:
-        return api_results
+        return [result.model_copy() for result in api_results]
     if not api_results and local_results:
-        return local_results
+        return [result.model_copy() for result in local_results]
 
     if len(api_results) != len(local_results):
         # Results should be of same length because each ScoringResult is a 1-1 mapping to an Example
         raise ValueError(f"The number of API and local results do not match: {len(api_results)} vs {len(local_results)}")
 
+    # Create a copy of api_results to avoid modifying the input
+    merged_results = [result.model_copy() for result in api_results]
+
     # Each ScoringResult in api and local have all the same fields besides `scorers_data`
-    for
-    if not (
+    for merged_result, local_result in zip(merged_results, local_results):
+        if not (merged_result.data_object and local_result.data_object):
            raise ValueError("Data object is None in one of the results.")
-        if
+        if merged_result.data_object.input != local_result.data_object.input:
            raise ValueError("The API and local results are not aligned.")
-        if
+        if merged_result.data_object.actual_output != local_result.data_object.actual_output:
            raise ValueError("The API and local results are not aligned.")
-        if
+        if merged_result.data_object.expected_output != local_result.data_object.expected_output:
            raise ValueError("The API and local results are not aligned.")
-        if
+        if merged_result.data_object.context != local_result.data_object.context:
            raise ValueError("The API and local results are not aligned.")
-        if
+        if merged_result.data_object.retrieval_context != local_result.data_object.retrieval_context:
            raise ValueError("The API and local results are not aligned.")
-        if
+        if merged_result.data_object.additional_metadata != local_result.data_object.additional_metadata:
            raise ValueError("The API and local results are not aligned.")
-        if
+        if merged_result.data_object.tools_called != local_result.data_object.tools_called:
            raise ValueError("The API and local results are not aligned.")
-        if
+        if merged_result.data_object.expected_tools != local_result.data_object.expected_tools:
            raise ValueError("The API and local results are not aligned.")
 
-
         # Merge ScorerData from the API and local scorers together
-        api_scorer_data =
+        api_scorer_data = merged_result.scorers_data
         local_scorer_data = local_result.scorers_data
         if api_scorer_data is None and local_scorer_data is not None:
-
-
-
-            api_result.scorers_data = api_scorer_data + local_scorer_data
+            merged_result.scorers_data = local_scorer_data
+        elif api_scorer_data is not None and local_scorer_data is not None:
+            merged_result.scorers_data = api_scorer_data + local_scorer_data
 
-    return
+    return merged_results
 
 
 def check_missing_scorer_data(results: List[ScoringResult]) -> List[ScoringResult]:
@@ -362,14 +364,26 @@ def check_examples(examples: List[Example], scorers: List[Union[APIJudgmentScore
     """
     Checks if the example contains the necessary parameters for the scorer.
     """
+    prompt_user = False
     for scorer in scorers:
         for example in examples:
             missing_params = []
             for param in scorer.required_params:
                 if getattr(example, param.value) is None:
-                    missing_params.append(f"
+                    missing_params.append(f"{param.value}")
             if missing_params:
-
+                rprint(f"[yellow]⚠️ WARNING:[/yellow] Example is missing required parameters for scorer [bold]{scorer.score_type.value}[/bold]")
+                rprint(f"Missing parameters: {', '.join(missing_params)}")
+                rprint(f"Example: {json.dumps(example.model_dump(), indent=2)}")
+                rprint("-"*40)
+                prompt_user = True
+
+    if prompt_user:
+        user_input = input("Do you want to continue? (y/n)")
+        if user_input.lower() != "y":
+            sys.exit(0)
+        else:
+            rprint("[green]Continuing...[/green]")
 
 def run_trace_eval(trace_run: TraceRun, override: bool = False, ignore_errors: bool = True, function: Optional[Callable] = None, tracer: Optional[Union[Tracer, BaseCallbackHandler]] = None, examples: Optional[List[Example]] = None) -> List[ScoringResult]:
     # Call endpoint to check to see if eval run name exists (if we DON'T want to override and DO want to log results)
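`check_examples` now warns about every example that is missing one of a scorer's `required_params` and asks whether to continue (it is also called automatically from `run_eval`, see the hunk further down). A minimal sketch of calling it directly; it assumes the chosen scorer declares `required_params`, and the example values are illustrative:

```python
# Sketch: assumes FaithfulnessScorer declares required_params (e.g. retrieval_context).
from judgeval.data import Example
from judgeval.scorers import FaithfulnessScorer
from judgeval.run_evaluation import check_examples

example = Example(
    input="What is your refund policy?",
    actual_output="We offer a 30 day full refund at no extra cost.",
    # retrieval_context intentionally omitted to exercise the new warning path
)
check_examples([example], [FaithfulnessScorer(threshold=0.5)])
```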
@@ -392,8 +406,15 @@ def run_trace_eval(trace_run: TraceRun, override: bool = False, ignore_errors: b
         )
     if function and tracer:
         new_traces: List[Trace] = []
-
-        tracer
+
+        # Handle case where tracer is actually a callback handler
+        actual_tracer = tracer
+        if hasattr(tracer, 'tracer') and hasattr(tracer.tracer, 'traces'):
+            # This is a callback handler, get the underlying tracer
+            actual_tracer = tracer.tracer
+
+        actual_tracer.offline_mode = True
+        actual_tracer.traces = []
         for example in examples:
             if example.input:
                 if isinstance(example.input, str):
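Because of the unwrapping above, `run_trace_eval` accepts either a `Tracer` or a framework callback handler that exposes the underlying tracer through a `.tracer` attribute. A sketch of both call styles; the handler class name and the `my_agent`/`trace_run`/`examples` placeholders are assumptions, only the `.tracer`/`.traces` attributes are implied by the diff:

```python
# Both now work as the `tracer` argument (handler name and placeholders assumed).
judgment = Tracer(project_name="agent_tests")
handler = JudgevalCallbackHandler(judgment)  # wraps the tracer and exposes .tracer

run_trace_eval(trace_run, function=my_agent, tracer=judgment, examples=examples)
run_trace_eval(trace_run, function=my_agent, tracer=handler, examples=examples)
```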
@@ -404,19 +425,21 @@ def run_trace_eval(trace_run: TraceRun, override: bool = False, ignore_errors: b
                     raise ValueError(f"Input must be string or dict, got {type(example.input)}")
             else:
                 result = run_with_spinner("Running agent function: ", function)
-
+
+
+        for i, trace in enumerate(actual_tracer.traces):
             # We set the root-level trace span with the expected tools of the Trace
             trace = Trace(**trace)
-            trace.
+            trace.trace_spans[0].expected_tools = examples[i].expected_tools
             new_traces.append(trace)
         trace_run.traces = new_traces
-
+        actual_tracer.traces = []
 
     # Execute evaluation using Judgment API
     info("Starting API evaluation")
     try: # execute an EvaluationRun with just JudgmentScorers
         debug("Sending request to Judgment API")
-        response_data:
+        response_data: Dict = run_with_spinner("Running Trace Evaluation: ", execute_api_trace_eval, trace_run)
         scoring_results = [ScoringResult(**result) for result in response_data["results"]]
         info(f"Received {len(scoring_results)} results from API")
     except JudgmentAPIError as e:
@@ -894,6 +917,7 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_error
             f"Processing evaluation '{evaluation_run.eval_name}': "
         )
     else:
+        check_examples(evaluation_run.examples, evaluation_run.scorers)
         if judgment_scorers:
             # Execute evaluation using Judgment API
             info("Starting API evaluation")
judgeval/scorers/judgeval_scorer.py
CHANGED
@@ -12,7 +12,7 @@ from judgeval.common.logger import debug, info, warning, error
 from judgeval.judges import JudgevalJudge
 from judgeval.judges.utils import create_judge
 from judgeval.constants import UNBOUNDED_SCORERS
-
+from judgeval.data.example import ExampleParams
 class JudgevalScorer:
     """
     Base class for scorers in `judgeval`.
@@ -39,6 +39,7 @@ class JudgevalScorer:
     evaluation_cost: Optional[float] = None # The cost of running the scorer
     verbose_logs: Optional[str] = None # The verbose logs of the scorer
     additional_metadata: Optional[Dict] = None # Additional metadata for the scorer
+    required_params: Optional[List[ExampleParams]] = None # The required parameters for the scorer
     error: Optional[str] = None
     success: Optional[bool] = None
 
@@ -51,6 +52,7 @@ class JudgevalScorer:
         reason: Optional[str] = None,
         success: Optional[bool] = None,
         evaluation_model: Optional[str] = None,
+        required_params: Optional[List[ExampleParams]] = None,
         strict_mode: bool = False,
         async_mode: bool = True,
         verbose_mode: bool = True,
@@ -87,6 +89,7 @@ class JudgevalScorer:
         self.evaluation_cost = evaluation_cost
         self.verbose_logs = verbose_logs
         self.additional_metadata = additional_metadata
+        self.required_params = required_params
 
     def _add_model(self, model: Optional[Union[str, List[str], JudgevalJudge]] = None):
         """
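Custom scorers can now advertise which `Example` fields they need, which is exactly what the new `check_examples` pre-flight in `run_evaluation.py` reads. A minimal sketch of a custom scorer passing `required_params`; the `ExampleParams` member names and the `score_type`/`threshold` base-class keywords are assumptions beyond what this hunk adds:

```python
# Sketch of a custom scorer declaring required_params (enum members assumed).
from judgeval.scorers import JudgevalScorer
from judgeval.data.example import ExampleParams


class ExactMatchScorer(JudgevalScorer):
    def __init__(self, threshold: float = 1.0):
        super().__init__(
            score_type="exact_match",
            threshold=threshold,
            required_params=[
                ExampleParams.INPUT,
                ExampleParams.ACTUAL_OUTPUT,
                ExampleParams.EXPECTED_OUTPUT,
            ],
        )

    def score_example(self, example, *args, **kwargs):
        # Deterministic metric: exact string match against the expected output.
        self.score = float(example.actual_output == example.expected_output)
        self.success = self.score >= self.threshold
        return self.score
```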
judgeval/scorers/prompt_scorer.py
CHANGED
@@ -30,6 +30,7 @@ from typing import List, Optional, Tuple, Any, Mapping
 from pydantic import BaseModel, model_serializer, Field
 
 from judgeval.data import Example
+from judgeval.data.example import ExampleParams
 from judgeval.scorers import JudgevalScorer
 from judgeval.scorers.utils import (
     scorer_progress_meter,
@@ -64,6 +65,7 @@ class PromptScorer(JudgevalScorer, BaseModel):
         async_mode: bool = True,
         strict_mode: bool = False,
         verbose_mode: bool = False,
+        required_params: Optional[List[ExampleParams]] = None,
     ):
         # Initialize BaseModel first
         BaseModel.__init__(
@@ -85,6 +87,7 @@ class PromptScorer(JudgevalScorer, BaseModel):
             async_mode=async_mode,
             strict_mode=strict_mode,
             verbose_mode=verbose_mode,
+            required_params=required_params,
         )
 
     def score_example(
judgeval/utils/alerts.py
CHANGED
@@ -20,12 +20,20 @@ class AlertResult(BaseModel):
         status: Status of the alert (triggered or not)
         conditions_result: List of condition evaluation results
         metadata: Dictionary containing example_id, timestamp, and other metadata
+        notification: Optional notification configuration for triggered alerts
+        combine_type: The combination type used ("all" or "any")
+        project_id: Optional project identifier
+        trace_span_id: Optional trace span identifier
     """
     rule_name: str
     rule_id: Optional[str] = None # The unique identifier of the rule
     status: AlertStatus
     conditions_result: List[Dict[str, Any]] = []
     metadata: Dict[str, Any] = {}
+    notification: Optional[Any] = None # NotificationConfig when triggered, None otherwise
+    combine_type: Optional[str] = None # "all" or "any"
+    project_id: Optional[str] = None # Project identifier
+    trace_span_id: Optional[str] = None # Trace span identifier
 
     @property
     def example_id(self) -> Optional[str]:
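`AlertResult` now lives in `judgeval.utils.alerts` (re-exported into `rules.py` via the import added there) and carries the new fields. Constructing one directly looks like this; the values are illustrative and mirror the example docstring removed from `rules.py`:

```python
from judgeval.utils.alerts import AlertResult, AlertStatus

alert = AlertResult(
    rule_name="Quality Check",
    rule_id="123e4567-e89b-12d3-a456-426614174000",
    status=AlertStatus.TRIGGERED,
    conditions_result=[
        {"metric": "faithfulness", "value": 0.6, "threshold": 0.7, "passed": False},
    ],
    metadata={"example_id": "example_123", "timestamp": "20240321_123456"},
    combine_type="all",        # new field
    project_id="proj_abc",     # new field
    trace_span_id="span_456",  # new field
)
print(alert.example_id)  # "example_123", via the backward-compatibility property
```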
{judgeval-0.0.40.dist-info → judgeval-0.0.42.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: judgeval
-Version: 0.0.40
+Version: 0.0.42
 Summary: Judgeval Package
 Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
 Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues
@@ -18,6 +18,7 @@ Requires-Dist: langchain-core
 Requires-Dist: langchain-huggingface
 Requires-Dist: langchain-openai
 Requires-Dist: litellm==1.61.15
+Requires-Dist: matplotlib>=3.10.3
 Requires-Dist: nest-asyncio
 Requires-Dist: openai
 Requires-Dist: pandas
@@ -31,44 +32,47 @@ Description-Content-Type: text/markdown
 <img src="assets/new_lightmode.svg#gh-light-mode-only" alt="Judgment Logo" width="400" />
 <img src="assets/new_darkmode.svg#gh-dark-mode-only" alt="Judgment Logo" width="400" />
 
-
+<br>
+<div style="font-size: 1.5em;">
+Open source tracing, evals, and metrics to debug, test, and monitor LLM agents.
+</div>
 
-
+## [Judgment Cloud](https://app.judgmentlabs.ai/register) • [Self-Host](https://docs.judgmentlabs.ai/self-hosting/get_started) • [Demo](https://www.youtube.com/watch?v=1S4LixpVbcc)
 
-
+[Docs](https://docs.judgmentlabs.ai/introduction) • [Bug Reports](https://github.com/JudgmentLabs/judgeval/issues) • [Changelog](https://docs.judgmentlabs.ai/changelog/2025-04-21)
 
-
+We're hiring! Join us in our mission to unleash optimized agents.
 
 [](https://x.com/JudgmentLabs)
 [](https://www.linkedin.com/company/judgmentlabs)
-[](https://discord.gg/
+[](https://discord.gg/ZCnSXYug)
 
-
+<img src="assets/experiments_pagev2.png" alt="Judgment Platform Experiments Page" width="800" />
 
-
+</div>
 
-Judgeval offers robust tooling for evaluating and tracing LLM agent systems. It is dev-friendly and open-source (licensed under Apache 2.0).
 
-Judgeval
+Judgeval offers **robust open-source tooling** for tracing, evaluating, and monitoring LLM agents. It helps AI teams effectively **test and monitor** agents in development and production, **closing the agent feedback loop**.
 
-
+Judgeval can be set up **(cloud-hosted or self-hosted) in 5 minutes**!
+> 🎁 Generous monthly [free tier](https://judgmentlabs.ai/pricing) (10k traces, 1k evals) - No credit card required!
 
 Judgeval is created and maintained by [Judgment Labs](https://judgmentlabs.ai/).
 
 ## 📋 Table of Contents
-
-
-
-
-
-
-
-
-
-
-
-
-
+- [✨ Features](#-features)
+- [🛠️ Installation](#️-installation)
+- [🏁 Quickstarts](#-quickstarts)
+- [🛰️ Tracing](#️-tracing)
+- [📝 Offline Evaluations](#-offline-evaluations)
+- [📡 Online Evaluations](#-online-evaluations)
+- [🏢 Self-Hosting](#-self-hosting)
+- [Key Features](#key-features)
+- [Getting Started](#getting-started)
+- [📚 Cookbooks](#-cookbooks)
+- [💻 Development with Cursor](#-development-with-cursor)
+- [⭐ Star Us on GitHub](#-star-us-on-github)
+- [❤️ Contributors](#️-contributors)
 
 <!-- Created by https://github.com/ekalinin/github-markdown-toc -->
 
@@ -77,11 +81,10 @@ Judgeval is created and maintained by [Judgment Labs](https://judgmentlabs.ai/).
 
 | | |
 |:---|:---:|
-| <h3>🔍 Tracing</h3>Automatic agent tracing integrated with common frameworks (LangGraph, OpenAI, Anthropic): **tracking inputs/outputs, latency, and cost** at every step.<br><br>Online evals can be applied to traces to measure quality on production data in real-time.<br><br>Export trace data to the Judgment Platform or your own S3 buckets, {Parquet, JSON, YAML} files, or data warehouse.<br><br>**Useful for:**<br>• 🐛 Debugging agent runs <br>• 👤 Tracking user activity <br>• 🔬 Pinpointing performance bottlenecks| <p align="center"><img src="assets/trace_screenshot.png" alt="Tracing visualization" width="1200"/></p> |
-| <h3>🧪 Evals</h3>15+ research-backed metrics including tool call accuracy, hallucinations, instruction adherence, and retrieval context recall.<br><br>
-| <h3>📡 Monitoring</h3>
-| <h3>📊 Datasets</h3>Export trace data or import external testcases to datasets
-| <h3>💡 Insights</h3>Cluster on your data to reveal common use cases and failure modes.<br><br>Trace failures to their exact source with Judgment's Osiris agent, which localizes errors to specific components for precise fixes.<br><br> **Useful for:**<br>•🔮 Surfacing common inputs that lead to error<br>•🤖 Investigating agent/user behavior for optimization <br>| <p align="center"><img src="assets/dataset_clustering_screenshot_dm.png" alt="Insights dashboard" width="1200"/></p> |
+| <h3>🔍 Tracing</h3>Automatic agent tracing integrated with common frameworks (LangGraph, OpenAI, Anthropic): **tracking inputs/outputs, agent tool calls, latency, and cost** at every step.<br><br>Online evals can be applied to traces to measure quality on production data in real-time.<br><br>Export trace data to the Judgment Platform or your own S3 buckets, {Parquet, JSON, YAML} files, or data warehouse.<br><br>**Useful for:**<br>• 🐛 Debugging agent runs <br>• 👤 Tracking user activity <br>• 🔬 Pinpointing performance bottlenecks| <p align="center"><img src="assets/trace_screenshot.png" alt="Tracing visualization" width="1200"/></p> |
+| <h3>🧪 Evals</h3>Evals are the key to regression testing for agents. Judgeval provides 15+ research-backed metrics including tool call accuracy, hallucinations, instruction adherence, and retrieval context recall.<br><br>Judgeval supports LLM-as-a-judge, manual labeling, and custom evaluators that connect with our metric-tracking infrastructure. <br><br>**Useful for:**<br>• ⚠️ Unit-testing <br>• 🔬 Experimental prompt testing<br>• 🛡️ Online guardrails | <p align="center"><img src="assets/experiments_page.png" alt="Evaluation metrics" width="800"/></p> |
+| <h3>📡 Monitoring</h3>Track all your agent metrics in production. **Catch production regressions early.**<br><br>Configure alerts to trigger automated actions when metric thresholds are exceeded (add agent trace to review queue/dataset, Slack notification, etc.).<br><br> **Useful for:** <br>• 📉 Identifying degradation early <br>• 📈 Visualizing performance trends across agent versions and time | <p align="center"><img src="assets/monitoring_screenshot.png" alt="Monitoring Dashboard" width="1200"/></p> |
+| <h3>📊 Datasets</h3>Export trace data or import external testcases to datasets for scaled unit testing and structured experiments. Move datasets to/from Parquet, S3, etc. <br><br>Run evals on datasets as unit tests or to A/B test different agent configurations. <br><br> **Useful for:**<br>• 🗃️ Filtered agent runtime data for fine tuning<br>• 🔄 Scaled analysis for A/B tests | <p align="center"><img src="assets/datasets_preview_screenshot.png" alt="Dataset management" width="1200"/></p> |
 
 ## 🛠️ Installation
 
@@ -91,17 +94,19 @@ Get started with Judgeval by installing our SDK using pip:
 pip install judgeval
 ```
 
-Ensure you have your `JUDGMENT_API_KEY` and `JUDGMENT_ORG_ID` environment variables set to connect to the [Judgment platform](https://app.judgmentlabs.ai/).
+Ensure you have your `JUDGMENT_API_KEY` and `JUDGMENT_ORG_ID` environment variables set to connect to the [Judgment platform](https://app.judgmentlabs.ai/).
 
-
+```bash
+export JUDGMENT_API_KEY=...
+export JUDGMENT_ORG_ID=...
+```
 
-
+**If you don't have keys, [create an account](https://app.judgmentlabs.ai/register) on the platform!**
 
-
+## 🏁 Quickstarts
 
 ### 🛰️ Tracing
 
-Track your agent execution with full observability with just a few lines of code.
 Create a file named `traces.py` with the following code:
 
 ```python
@@ -126,12 +131,15 @@ def main():
 
 main()
 ```
+You'll see your trace exported to the Judgment Platform:
+
+<p align="center"><img src="assets/trace_demo.png" alt="Judgment Platform Trace Example" width="800" /></p>
+
 
 [Click here](https://docs.judgmentlabs.ai/getting-started#create-your-first-trace) for a more detailed explanation.
 
 ### 📝 Offline Evaluations
 
-You can evaluate your agent's execution to measure quality metrics such as hallucination.
 Create a file named `evaluate.py` with the following code:
 
 ```python evaluate.py
@@ -147,7 +155,7 @@ example = Example(
     retrieval_context=["All customers are eligible for a 30 day full refund at no extra cost."],
 )
 
-scorer = FaithfulnessScorer(threshold=0.5)
+scorer = FaithfulnessScorer(threshold=0.5) # Hallucination detector
 results = client.run_evaluation(
     examples=[example],
     scorers=[scorer],
@@ -196,6 +204,8 @@ def main():
 main()
 ```
 
+You should see an evaluation attached to your trace on the Judgment Platform.
+
 [Click here](https://docs.judgmentlabs.ai/getting-started#create-your-first-online-evaluation) for a more detailed explanation.
 
 ## 🏢 Self-Hosting
@@ -220,20 +230,8 @@ You can access our repo of cookbooks [here](https://github.com/JudgmentLabs/judg
 
 ### Sample Agents
 
-####
-A
-
-#### ✈️ [OpenAI Travel Agent](https://github.com/JudgmentLabs/judgment-cookbook/blob/main/cookbooks/openai_travel_agent/agent.py)
-A travel planning agent using OpenAI API calls, custom tool functions, and RAG with a vector database for up-to-date and contextual travel information. Evaluated for itinerary quality and information relevance.
-
-### Custom Evaluators
-
-#### 🔍 [PII Detection](https://github.com/JudgmentLabs/judgment-cookbook/blob/main/cookbooks/classifier_scorer/pii_checker.py)
-Detecting and evaluating Personal Identifiable Information (PII) leakage.
-
-#### 📧 [Cold Email Generation](https://github.com/JudgmentLabs/judgment-cookbook/blob/main/cookbooks/custom_scorers/cold_email_scorer.py)
-
-Evaluates if a cold email generator properly utilizes all relevant information about the target recipient.
+#### [Multi-Agent System](https://github.com/JudgmentLabs/judgment-cookbook/tree/main/cookbooks/agents/multi-agent)
+A multi-agent system augmented with tool calls designed for general purpose tasks like financial research and math. Traced and evaluated on Faithfulness (factual adherence to retrieval context).
 
 ## 💻 Development with Cursor
 When building agents and LLM workflows in Cursor, providing proper context to your coding assistant helps ensure seamless integration with Judgment. This rule file supplies the essential context your coding assistant needs for successful implementation.