judgeval 0.5.0__py3-none-any.whl → 0.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/cli.py +65 -0
- judgeval/common/api/api.py +44 -38
- judgeval/common/api/constants.py +18 -5
- judgeval/common/api/json_encoder.py +8 -9
- judgeval/common/tracer/core.py +448 -256
- judgeval/common/tracer/otel_span_processor.py +1 -1
- judgeval/common/tracer/span_processor.py +1 -1
- judgeval/common/tracer/span_transformer.py +2 -1
- judgeval/common/tracer/trace_manager.py +6 -1
- judgeval/common/trainer/__init__.py +5 -0
- judgeval/common/trainer/config.py +125 -0
- judgeval/common/trainer/console.py +151 -0
- judgeval/common/trainer/trainable_model.py +238 -0
- judgeval/common/trainer/trainer.py +301 -0
- judgeval/data/evaluation_run.py +104 -0
- judgeval/data/judgment_types.py +37 -8
- judgeval/data/trace.py +1 -0
- judgeval/data/trace_run.py +0 -2
- judgeval/integrations/langgraph.py +2 -1
- judgeval/judgment_client.py +90 -135
- judgeval/local_eval_queue.py +3 -5
- judgeval/run_evaluation.py +43 -299
- judgeval/scorers/base_scorer.py +9 -10
- judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +17 -3
- {judgeval-0.5.0.dist-info → judgeval-0.7.0.dist-info}/METADATA +10 -47
- {judgeval-0.5.0.dist-info → judgeval-0.7.0.dist-info}/RECORD +29 -22
- judgeval-0.7.0.dist-info/entry_points.txt +2 -0
- judgeval/evaluation_run.py +0 -80
- {judgeval-0.5.0.dist-info → judgeval-0.7.0.dist-info}/WHEEL +0 -0
- {judgeval-0.5.0.dist-info → judgeval-0.7.0.dist-info}/licenses/LICENSE.md +0 -0
judgeval/run_evaluation.py
CHANGED
```diff
@@ -6,10 +6,10 @@ import time
 import orjson
 import sys
 import threading
-from typing import List, Dict, Union,
+from typing import List, Dict, Union, Tuple, Any, TYPE_CHECKING
 from rich import print as rprint

-from judgeval.data import ScorerData, ScoringResult, Example
+from judgeval.data import ScorerData, ScoringResult, Example
 from judgeval.scorers import BaseScorer, APIScorerConfig
 from judgeval.scorers.score import a_execute_scoring
 from judgeval.common.api import JudgmentApiClient
@@ -22,10 +22,7 @@ from judgeval.common.logger import judgeval_logger


 if TYPE_CHECKING:
-    from judgeval.
-    from judgeval.data.trace_run import TraceRun
-    from judgeval.evaluation_run import EvaluationRun
-    from judgeval.integrations.langgraph import JudgevalCallbackHandler
+    from judgeval.data.evaluation_run import EvaluationRun


 def safe_run_async(coro):
@@ -99,29 +96,6 @@ def execute_api_eval(evaluation_run: EvaluationRun) -> Dict:
     )


-def execute_api_trace_eval(trace_run: TraceRun, judgment_api_key: str) -> Dict:
-    """
-    Executes an evaluation of a list of `Trace`s using one or more `JudgmentScorer`s via the Judgment API.
-    """
-
-    try:
-        # submit API request to execute evals
-        if not judgment_api_key or not trace_run.organization_id:
-            raise ValueError("API key and organization ID are required")
-        api_client = JudgmentApiClient(judgment_api_key, trace_run.organization_id)
-        return api_client.run_trace_evaluation(trace_run.model_dump(warnings=False))
-    except Exception as e:
-        judgeval_logger.error(f"Error: {e}")
-
-        details = "An unknown error occurred."
-        if isinstance(e, JudgmentAPIException):
-            details = e.response_json.get("detail", "An unknown error occurred.")
-
-        raise JudgmentAPIError(
-            "An error occurred while executing the Judgment API request: " + details
-        )
-
-
 def check_missing_scorer_data(results: List[ScoringResult]) -> List[ScoringResult]:
     """
     Checks if any `ScoringResult` objects are missing `scorers_data`.
@@ -140,83 +114,9 @@ def check_missing_scorer_data(results: List[ScoringResult]) -> List[ScoringResul
     return results


-def check_experiment_type(
-    eval_name: str,
-    project_name: str,
-    judgment_api_key: str,
-    organization_id: str,
-    is_trace: bool,
-) -> None:
-    """
-    Checks if the current experiment, if one exists, has the same type (examples of traces)
-    """
-    api_client = JudgmentApiClient(judgment_api_key, organization_id)
-
-    try:
-        api_client.check_experiment_type(eval_name, project_name, is_trace)
-    except JudgmentAPIException as e:
-        if e.response.status_code == 422:
-            judgeval_logger.error(f"{e.response_json}")
-            raise ValueError(f"{e.response_json}")
-        else:
-            raise e
-    except Exception as e:
-        judgeval_logger.error(f"Failed to check if experiment type exists: {str(e)}")
-        raise JudgmentAPIError(f"Failed to check if experiment type exists: {str(e)}")
-
-
-def check_eval_run_name_exists(
-    eval_name: str, project_name: str, judgment_api_key: str, organization_id: str
-) -> None:
-    """
-    Checks if an evaluation run name already exists for a given project.
-
-    Args:
-        eval_name (str): Name of the evaluation run
-        project_name (str): Name of the project
-        judgment_api_key (str): API key for authentication
-
-    Raises:
-        ValueError: If the evaluation run name already exists
-        JudgmentAPIError: If there's an API error during the check
-    """
-    api_client = JudgmentApiClient(judgment_api_key, organization_id)
-    try:
-        api_client.check_eval_run_name_exists(eval_name, project_name)
-    except JudgmentAPIException as e:
-        if e.response.status_code == 409:
-            error_str = f"Eval run name '{eval_name}' already exists for this project. Please choose a different name, set the `override` flag to true, or set the `append` flag to true. See https://docs.judgmentlabs.ai/sdk-reference/judgment-client#override for more information."
-            judgeval_logger.error(error_str)
-            raise ValueError(error_str)
-        else:
-            raise e
-
-    except Exception as e:
-        judgeval_logger.error(f"Failed to check if eval run name exists: {str(e)}")
-        raise JudgmentAPIError(f"Failed to check if eval run name exists: {str(e)}")
-
-
-def check_example_keys(
-    keys: List[str],
-    eval_name: str,
-    project_name: str,
-    judgment_api_key: str,
-    organization_id: str,
-) -> None:
-    """
-    Checks if the current experiment (if one exists) has the same keys for example
-    """
-    api_client = JudgmentApiClient(judgment_api_key, organization_id)
-    try:
-        api_client.check_example_keys(keys, eval_name, project_name)
-    except Exception as e:
-        judgeval_logger.error(f"Failed to check if example keys match: {str(e)}")
-        raise JudgmentAPIError(f"Failed to check if example keys match: {str(e)}")
-
-
 def log_evaluation_results(
     scoring_results: List[ScoringResult],
-    run:
+    run: EvaluationRun,
     judgment_api_key: str,
 ) -> str:
     """
@@ -282,137 +182,8 @@ def check_examples(
         rprint("[green]Continuing...[/green]")


-def run_trace_eval(
-    trace_run: TraceRun,
-    judgment_api_key: str,
-    override: bool = False,
-    function: Optional[Callable] = None,
-    tracer: Optional[Union[Tracer, "JudgevalCallbackHandler"]] = None,
-    examples: Optional[List[Example]] = None,
-) -> List[ScoringResult]:
-    # Call endpoint to check to see if eval run name exists (if we DON'T want to override and DO want to log results)
-    if not override and not trace_run.append:
-        check_eval_run_name_exists(
-            trace_run.eval_name,
-            trace_run.project_name,
-            judgment_api_key,
-            trace_run.organization_id,
-        )
-
-    if trace_run.append:
-        # Check that the current experiment, if one exists, has the same type (examples or traces)
-        check_experiment_type(
-            trace_run.eval_name,
-            trace_run.project_name,
-            judgment_api_key,
-            trace_run.organization_id,
-            True,
-        )
-    if function and tracer and examples is not None:
-        new_traces: List[Trace] = []
-
-        # Handle case where tracer is actually a callback handler
-        actual_tracer = tracer
-        if hasattr(tracer, "tracer") and hasattr(tracer.tracer, "traces"):
-            # This is a callback handler, get the underlying tracer
-            actual_tracer = tracer.tracer
-
-        if trace_run.project_name != actual_tracer.project_name:
-            raise ValueError(
-                f"Project name mismatch between run_trace_eval and tracer. "
-                f"Trace run: {trace_run.project_name}, "
-                f"Tracer: {actual_tracer.project_name}"
-            )
-
-        actual_tracer.offline_mode = True
-        actual_tracer.traces = []
-        judgeval_logger.info("Running agent function: ")
-        for example in examples:
-            if example.input:
-                if isinstance(example.input, str):
-                    function(example.input)
-                elif isinstance(example.input, dict):
-                    function(**example.input)
-                else:
-                    raise ValueError(
-                        f"Input must be string or dict, got {type(example.input)}"
-                    )
-            else:
-                function()
-
-        for i, trace in enumerate(actual_tracer.traces):
-            # We set the root-level trace span with the expected tools of the Trace
-            trace = Trace(**trace)
-            trace.trace_spans[0].expected_tools = examples[i].expected_tools
-            new_traces.append(trace)
-        trace_run.traces = new_traces
-        actual_tracer.traces = []
-
-    # Execute evaluation using Judgment API
-    try:  # execute an EvaluationRun with just JudgmentScorers
-        judgeval_logger.info("Executing Trace Evaluation... ")
-        response_data: Dict = execute_api_trace_eval(trace_run, judgment_api_key)
-        scoring_results = [
-            ScoringResult(**result) for result in response_data["results"]
-        ]
-    except JudgmentAPIError as e:
-        raise JudgmentAPIError(
-            f"An error occurred while executing the Judgment API request: {str(e)}"
-        )
-    except ValueError as e:
-        raise ValueError(
-            f"Please check your TraceRun object, one or more fields are invalid: {str(e)}"
-        )
-
-    # Convert the response data to `ScoringResult` objects
-    # TODO: allow for custom scorer on traces
-
-    url = log_evaluation_results(
-        response_data["agent_results"], trace_run, judgment_api_key
-    )
-    rprint(
-        f"\n🔍 You can view your evaluation results here: [rgb(106,0,255)][link={url}]View Results[/link]\n"
-    )
-    return scoring_results
-
-
-async def get_evaluation_status(
-    eval_name: str, project_name: str, judgment_api_key: str, organization_id: str
-) -> Dict:
-    """
-    Gets the status of an async evaluation run.
-
-    Args:
-        eval_name (str): Name of the evaluation run
-        project_name (str): Name of the project
-        judgment_api_key (str): API key for authentication
-        organization_id (str): Organization ID for the evaluation
-
-    Returns:
-        Dict: Status information including:
-            - status: 'pending', 'running', 'completed', or 'failed'
-            - results: List of ScoringResult objects if completed
-            - error: Error message if failed
-    """
-    api_client = JudgmentApiClient(judgment_api_key, organization_id)
-    try:
-        return api_client.get_evaluation_status(eval_name, project_name)
-    except Exception as e:
-        raise JudgmentAPIError(
-            f"An error occurred while checking evaluation status: {str(e)}"
-        )
-
-
-def retrieve_counts(result: Dict):
-    scorer_data_count = 0
-    for example in result.get("examples", []):
-        for scorer in example.get("scorer_data", []):
-            scorer_data_count += 1
-    return scorer_data_count
-
-
 def _poll_evaluation_until_complete(
-
+    experiment_run_id: str,
     project_name: str,
     judgment_api_key: str,
     organization_id: str,
@@ -443,14 +214,16 @@ def _poll_evaluation_until_complete(
         poll_count += 1
         try:
             # Check status
-            status_response = api_client.get_evaluation_status(
+            status_response = api_client.get_evaluation_status(
+                experiment_run_id, project_name
+            )

             if status_response.get("status") != "completed":
                 time.sleep(poll_interval_seconds)
                 continue

             results_response = api_client.fetch_evaluation_results(
-
+                experiment_run_id, project_name
             )
             url = results_response.get("ui_results_url")

@@ -513,14 +286,15 @@ def progress_logger(stop_event, msg="Working...", interval=5):
 def run_eval(
     evaluation_run: EvaluationRun,
     judgment_api_key: str,
-
+    show_url: bool = True,
 ) -> List[ScoringResult]:
     """
     Executes an evaluation of `Example`s using one or more `Scorer`s

     Args:
         evaluation_run (EvaluationRun): Stores example and evaluation together for running
-
+        judgment_api_key (str): API key for authentication
+        show_url (bool): Whether to display the evaluation results URL. Defaults to True.

     Returns:
         List[ScoringResult]: A list of ScoringResult objects
@@ -534,52 +308,31 @@ def run_eval(
                 f"All examples must have the same keys: {current_keys} != {keys}"
             )

-    # Call endpoint to check to see if eval run name exists (if we DON'T want to override and DO want to log results)
-    if not override and not evaluation_run.append:
-        check_eval_run_name_exists(
-            evaluation_run.eval_name,
-            evaluation_run.project_name,
-            judgment_api_key,
-            evaluation_run.organization_id,
-        )
-
-    if evaluation_run.append:
-        # Check that the current experiment, if one exists, has the same type (examples of traces)
-        check_experiment_type(
-            evaluation_run.eval_name,
-            evaluation_run.project_name,
-            judgment_api_key,
-            evaluation_run.organization_id,
-            False,
-        )
-
-    # Ensure that current experiment (if one exists) has the same keys for example
-    check_example_keys(
-        keys=list(keys),
-        eval_name=evaluation_run.eval_name,
-        project_name=evaluation_run.project_name,
-        judgment_api_key=judgment_api_key,
-        organization_id=evaluation_run.organization_id,
-    )
-
-    judgment_scorers: List[APIScorerConfig] = []
-    local_scorers: List[BaseScorer] = []
-    for scorer in evaluation_run.scorers:
-        if isinstance(scorer, APIScorerConfig):
-            judgment_scorers.append(scorer)
-        else:
-            local_scorers.append(scorer)
-
     results: List[ScoringResult] = []
     url = ""

-    if
+    if (
+        len(evaluation_run.custom_scorers) > 0
+        and len(evaluation_run.judgment_scorers) > 0
+    ):
         error_msg = "We currently do not support running both local and Judgment API scorers at the same time. Please run your evaluation with either local scorers or Judgment API scorers, but not both."
         judgeval_logger.error(error_msg)
         raise ValueError(error_msg)

-
-
+    e2b_scorers = [cs for cs in evaluation_run.custom_scorers if cs.server_hosted]
+
+    if evaluation_run.judgment_scorers or e2b_scorers:
+        if evaluation_run.judgment_scorers and e2b_scorers:
+            error_msg = "We currently do not support running both hosted custom scorers and Judgment API scorers at the same time. Please run your evaluation with one or the other, but not both."
+            judgeval_logger.error(error_msg)
+            raise ValueError(error_msg)
+
+        if len(e2b_scorers) > 1:
+            error_msg = "We currently do not support running multiple hosted custom scorers at the same time."
+            judgeval_logger.error(error_msg)
+            raise ValueError(error_msg)
+
+        check_examples(evaluation_run.examples, evaluation_run.judgment_scorers)
        stop_event = threading.Event()
        t = threading.Thread(
            target=progress_logger, args=(stop_event, "Running evaluation...")
@@ -600,36 +353,26 @@ def run_eval(
            )
            raise JudgmentAPIError(error_message)

-
-
-
-
-
-                )
-                old_scorer_data_count = retrieve_counts(results_response)
-            except Exception:
-                # This usually means the user did append = True but the eval run name doesn't exist yet
-                pass
-
+            num_scorers = (
+                len(evaluation_run.judgment_scorers)
+                if evaluation_run.judgment_scorers
+                else sum(1 for cs in evaluation_run.custom_scorers if cs.server_hosted)
+            )
             results, url = _poll_evaluation_until_complete(
-
+                experiment_run_id=evaluation_run.id,
                 project_name=evaluation_run.project_name,
                 judgment_api_key=judgment_api_key,
                 organization_id=evaluation_run.organization_id,
-                expected_scorer_data_count=(
-                    len(evaluation_run.scorers) * len(evaluation_run.examples)
-                )
-                + old_scorer_data_count,
+                expected_scorer_data_count=(num_scorers * len(evaluation_run.examples)),
             )
         finally:
             stop_event.set()
             t.join()
-
-    if len(local_scorers) > 0:
+    else:
         results = safe_run_async(
             a_execute_scoring(
                 evaluation_run.examples,
-
+                evaluation_run.custom_scorers,
                 model=evaluation_run.model,
                 throttle_value=0,
                 max_concurrent=MAX_CONCURRENT_EVALUATIONS,
@@ -640,9 +383,10 @@ def run_eval(
             scoring_result.model_dump(warnings=False) for scoring_result in results
         ]
         url = log_evaluation_results(send_results, evaluation_run, judgment_api_key)
-
-
-
+    if show_url:
+        rprint(
+            f"\n🔍 You can view your evaluation results here: [rgb(106,0,255)][link={url}]View Results[/link]\n"
+        )
     return results

```
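Taken together, these hunks make `run_eval` the single entry point in `run_evaluation.py`: the trace-eval path and the pre-flight `check_*` helpers are removed, `EvaluationRun` now lives in `judgeval.data.evaluation_run`, polling is keyed by `experiment_run_id`, scorers are read from `evaluation_run.custom_scorers` / `evaluation_run.judgment_scorers`, and a new `show_url` flag controls whether the results link is printed. A minimal call-site sketch under those assumptions (the `EvaluationRun` construction itself is defined in `judgeval/data/evaluation_run.py`, which is not shown in this diff, so the run object is assumed to be built elsewhere):

```python
import os

from judgeval.data.evaluation_run import EvaluationRun  # new import path in 0.7.0
from judgeval.run_evaluation import run_eval


def score(run: EvaluationRun):
    # `run` is assumed to be constructed elsewhere; this only illustrates the
    # updated run_eval signature shown in the hunks above.
    return run_eval(
        evaluation_run=run,
        judgment_api_key=os.environ["JUDGMENT_API_KEY"],
        show_url=False,  # new in 0.7.0; defaults to True
    )
```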
judgeval/scorers/base_scorer.py
CHANGED
```diff
@@ -26,6 +26,7 @@ class BaseScorer(BaseModel):
     name: Optional[str] = (
         None  # name of your scorer (Faithfulness, PromptScorer-randomslug)
     )
+    class_name: Optional[str] = None  # The name of the class of the scorer
     score: Optional[float] = None  # The float score of the scorer run on the test case
     score_breakdown: Optional[Dict] = None
     reason: Optional[str] = ""
@@ -39,24 +40,22 @@ class BaseScorer(BaseModel):
     error: Optional[str] = None  # The error message if the scorer failed
     additional_metadata: Optional[Dict] = None  # Additional metadata for the scorer
     user: Optional[str] = None  # The user ID of the scorer
+    server_hosted: bool = False  # Whether the scorer is enabled for e2b

-    @model_validator(mode="
+    @model_validator(mode="after")
     @classmethod
-    def enforce_strict_threshold(cls, data:
-        if data.
-            data
+    def enforce_strict_threshold(cls, data: "BaseScorer"):
+        if data.strict_mode:
+            data.threshold = 1.0
         return data

     @model_validator(mode="after")
     @classmethod
     def default_name(cls, m: "BaseScorer") -> "BaseScorer":
+        # Always set class_name to the string name of the class
+        m.class_name = m.__class__.__name__
         if not m.name:
-
-            class_name = getattr(m, "__class__", None)
-            if class_name and getattr(m.__class__, "__name__", None):
-                m.name = m.__class__.__name__
-            else:
-                m.name = m.score_type
+            m.name = m.class_name
         return m

     def _add_model(self, model: str):
```
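`BaseScorer` gains `class_name` and `server_hosted` fields, and both model validators now run in Pydantic's `after` mode on the constructed instance: `strict_mode` pins `threshold` to 1.0, and `class_name`/`name` are filled from the subclass name. A self-contained sketch of that pattern (simplified to plain instance-method validators; the field set is trimmed to what the hunks show, and everything else about the real class is omitted):

```python
from typing import Optional

from pydantic import BaseModel, model_validator


class ScorerSketch(BaseModel):
    name: Optional[str] = None
    class_name: Optional[str] = None
    threshold: float = 0.5
    strict_mode: bool = False
    server_hosted: bool = False  # new flag: scorer runs hosted (e2b) rather than locally

    @model_validator(mode="after")
    def _enforce_strict_threshold(self) -> "ScorerSketch":
        # mirrors enforce_strict_threshold: strict mode forces a perfect threshold
        if self.strict_mode:
            self.threshold = 1.0
        return self

    @model_validator(mode="after")
    def _default_name(self) -> "ScorerSketch":
        # mirrors default_name: class_name always reflects the subclass,
        # and name falls back to it when not provided
        self.class_name = self.__class__.__name__
        if not self.name:
            self.name = self.class_name
        return self


class AnswerQualityScorer(ScorerSketch):
    pass


s = AnswerQualityScorer(strict_mode=True)
print(s.name, s.class_name, s.threshold)  # AnswerQualityScorer AnswerQualityScorer 1.0
```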
judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py
CHANGED
```diff
@@ -11,13 +11,14 @@ from judgeval.common.logger import judgeval_logger
 def push_prompt_scorer(
     name: str,
     prompt: str,
+    threshold: float,
     options: Optional[Dict[str, float]] = None,
     judgment_api_key: str = os.getenv("JUDGMENT_API_KEY") or "",
     organization_id: str = os.getenv("JUDGMENT_ORG_ID") or "",
 ) -> str:
     client = JudgmentApiClient(judgment_api_key, organization_id)
     try:
-        r = client.save_scorer(name, prompt, options)
+        r = client.save_scorer(name, prompt, threshold, options)
     except JudgmentAPIException as e:
         if e.status_code == 500:
             raise JudgmentAPIError(
@@ -90,6 +91,7 @@ class PromptScorer(APIScorerConfig):
         return cls(
             name=name,
             prompt=scorer_config["prompt"],
+            threshold=scorer_config["threshold"],
             options=scorer_config.get("options"),
             judgment_api_key=judgment_api_key,
             organization_id=organization_id,
@@ -100,16 +102,20 @@ class PromptScorer(APIScorerConfig):
         cls,
         name: str,
         prompt: str,
+        threshold: Optional[float] = 0.5,
         options: Optional[Dict[str, float]] = None,
         judgment_api_key: str = os.getenv("JUDGMENT_API_KEY") or "",
         organization_id: str = os.getenv("JUDGMENT_ORG_ID") or "",
     ):
         if not scorer_exists(name, judgment_api_key, organization_id):
-            push_prompt_scorer(
+            push_prompt_scorer(
+                name, prompt, threshold, options, judgment_api_key, organization_id
+            )
             judgeval_logger.info(f"Successfully created PromptScorer: {name}")
         return cls(
             name=name,
             prompt=prompt,
+            threshold=threshold,
             options=options,
             judgment_api_key=judgment_api_key,
             organization_id=organization_id,
@@ -158,6 +164,12 @@ class PromptScorer(APIScorerConfig):
         judgeval_logger.info(f"Successfully appended to prompt for {self.name}")

     # Getters
+    def get_threshold(self) -> float | None:
+        """
+        Returns the threshold of the scorer.
+        """
+        return self.threshold
+
     def get_prompt(self) -> str | None:
         """
         Returns the prompt of the scorer.
@@ -183,6 +195,7 @@ class PromptScorer(APIScorerConfig):
         return {
             "name": self.name,
             "prompt": self.prompt,
+            "threshold": self.threshold,
             "options": self.options,
         }

@@ -193,13 +206,14 @@ class PromptScorer(APIScorerConfig):
         push_prompt_scorer(
             self.name,
             self.prompt,
+            self.threshold,
             self.options,
             self.judgment_api_key,
             self.organization_id,
         )

     def __str__(self):
-        return f"PromptScorer(name={self.name}, prompt={self.prompt}, options={self.options})"
+        return f"PromptScorer(name={self.name}, prompt={self.prompt}, threshold={self.threshold}, options={self.options})"

     def model_dump(self, *args, **kwargs) -> Dict[str, Any]:
         base = super().model_dump(*args, **kwargs)
```
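`threshold` is now threaded through `push_prompt_scorer`, the scorer's config dict, and its `__str__` representation. A minimal sketch of pushing a scorer with the new required argument, based on the signature in the first hunk above (the name, prompt, and options values here are made up; the API key and org ID default to the `JUDGMENT_API_KEY` / `JUDGMENT_ORG_ID` environment variables):

```python
from judgeval.scorers.judgeval_scorers.api_scorers.prompt_scorer import push_prompt_scorer

# Hypothetical scorer definition; only the keyword names come from the diff.
result = push_prompt_scorer(
    name="helpfulness-judge",
    prompt="Rate how helpful the assistant's response is to the user's question.",
    threshold=0.7,  # new required parameter in 0.7.0, stored alongside the scorer
    options={"helpful": 1.0, "unhelpful": 0.0},
)
```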
{judgeval-0.5.0.dist-info → judgeval-0.7.0.dist-info}/METADATA
CHANGED
````diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: judgeval
-Version: 0.
+Version: 0.7.0
 Summary: Judgeval Package
 Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
 Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues
@@ -11,6 +11,8 @@ Classifier: Operating System :: OS Independent
 Classifier: Programming Language :: Python :: 3
 Requires-Python: >=3.11
 Requires-Dist: boto3
+Requires-Dist: click<8.2.0
+Requires-Dist: fireworks-ai>=0.19.18
 Requires-Dist: langchain-anthropic
 Requires-Dist: langchain-core
 Requires-Dist: langchain-huggingface
@@ -23,6 +25,7 @@ Requires-Dist: orjson>=3.9.0
 Requires-Dist: python-dotenv
 Requires-Dist: requests
 Requires-Dist: rich
+Requires-Dist: typer>=0.9.0
 Provides-Extra: langchain
 Requires-Dist: langchain-anthropic; extra == 'langchain'
 Requires-Dist: langchain-core; extra == 'langchain'
@@ -37,7 +40,7 @@ Description-Content-Type: text/markdown

 <br>
 <div style="font-size: 1.5em;">
-    Enable self-learning agents with
+    Enable self-learning agents with environment data and evals.
 </div>

 ## [Docs](https://docs.judgmentlabs.ai/) • [Judgment Cloud](https://app.judgmentlabs.ai/register) • [Self-Host](https://docs.judgmentlabs.ai/documentation/self-hosting/get-started) • [Landing Page](https://judgmentlabs.ai/)
@@ -54,11 +57,11 @@ We're hiring! Join us in our mission to enable self-learning agents by providing

 </div>

-Judgeval offers **open-source tooling** for
+Judgeval offers **open-source tooling** for evaluating autonomous, stateful agents. It **provides runtime data from agent-environment interactions** for continuous learning and self-improvement.

 ## 🎬 See Judgeval in Action

-**[Multi-Agent System](https://github.com/JudgmentLabs/judgment-cookbook/tree/main/cookbooks/agents/multi-agent) with complete observability:** (1) A multi-agent system spawns agents to research topics on the internet. (2) With just **3 lines of code**, Judgeval
+**[Multi-Agent System](https://github.com/JudgmentLabs/judgment-cookbook/tree/main/cookbooks/agents/multi-agent) with complete observability:** (1) A multi-agent system spawns agents to research topics on the internet. (2) With just **3 lines of code**, Judgeval captures all environment responses across all agent tool calls for monitoring. (3) After completion, (4) export all interaction data to enable further environment-specific learning and optimization.

 <table style="width: 100%; max-width: 800px; table-layout: fixed;">
 <tr>
@@ -67,8 +70,8 @@ Judgeval offers **open-source tooling** for tracing and evaluating autonomous, s
     <br><strong>🤖 Agents Running</strong>
   </td>
   <td align="center" style="padding: 8px; width: 50%;">
-    <img src="assets/trace.gif" alt="
-    <br><strong>📊
+    <img src="assets/trace.gif" alt="Capturing Environment Data Demo" style="width: 100%; max-width: 350px; height: auto;" />
+    <br><strong>📊 Capturing Environment Data </strong>
   </td>
 </tr>
 <tr>
@@ -109,54 +112,14 @@ export JUDGMENT_ORG_ID=...

 **If you don't have keys, [create an account](https://app.judgmentlabs.ai/register) on the platform!**

-## 🏁 Quickstarts
-
-### 🛰️ Tracing
-
-Create a file named `agent.py` with the following code:
-
-```python
-from judgeval.tracer import Tracer, wrap
-from openai import OpenAI
-
-client = wrap(OpenAI())  # tracks all LLM calls
-judgment = Tracer(project_name="my_project")
-
-@judgment.observe(span_type="tool")
-def format_question(question: str) -> str:
-    # dummy tool
-    return f"Question : {question}"
-
-@judgment.observe(span_type="function")
-def run_agent(prompt: str) -> str:
-    task = format_question(prompt)
-    response = client.chat.completions.create(
-        model="gpt-4.1",
-        messages=[{"role": "user", "content": task}]
-    )
-    return response.choices[0].message.content
-
-run_agent("What is the capital of the United States?")
-```
-You'll see your trace exported to the Judgment Platform:
-
-<p align="center"><img src="assets/online_eval.png" alt="Judgment Platform Trace Example" width="1500" /></p>
-
-
-[Click here](https://docs.judgmentlabs.ai/documentation/tracing/introduction) for a more detailed explanation.
-
-
-<!-- Created by https://github.com/ekalinin/github-markdown-toc -->
-

 ## ✨ Features

 | | |
 |:---|:---:|
-| <h3>🔍 Tracing</h3>Automatic agent tracing integrated with common frameworks (LangGraph, OpenAI, Anthropic). **Tracks inputs/outputs, agent tool calls, latency, cost, and custom metadata** at every step.<br><br>**Useful for:**<br>• 🐛 Debugging agent runs <br>• 📋 Collecting agent environment data <br>• 🔬 Pinpointing performance bottlenecks| <p align="center"><img src="assets/agent_trace_example.png" alt="Tracing visualization" width="1200"/></p> |
 | <h3>🧪 Evals</h3>Build custom evaluators on top of your agents. Judgeval supports LLM-as-a-judge, manual labeling, and code-based evaluators that connect with our metric-tracking infrastructure. <br><br>**Useful for:**<br>• ⚠️ Unit-testing <br>• 🔬 A/B testing <br>• 🛡️ Online guardrails | <p align="center"><img src="assets/test.png" alt="Evaluation metrics" width="800"/></p> |
 | <h3>📡 Monitoring</h3>Get Slack alerts for agent failures in production. Add custom hooks to address production regressions.<br><br> **Useful for:** <br>• 📉 Identifying degradation early <br>• 📈 Visualizing performance trends across agent versions and time | <p align="center"><img src="assets/errors.png" alt="Monitoring Dashboard" width="1200"/></p> |
-| <h3>📊 Datasets</h3>Export
+| <h3>📊 Datasets</h3>Export environment interactions and test cases to datasets for scaled analysis and optimization. Move datasets to/from Parquet, S3, etc. <br><br>Run evals on datasets as unit tests or to A/B test different agent configurations, enabling continuous learning from production interactions. <br><br> **Useful for:**<br>• 🗃️ Agent environment interaction data for optimization<br>• 🔄 Scaled analysis for A/B tests | <p align="center"><img src="assets/datasets_preview_screenshot.png" alt="Dataset management" width="1200"/></p> |

 ## 🏢 Self-Hosting

````