judgeval 0.7.1__py3-none-any.whl → 0.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96)
  1. judgeval/__init__.py +139 -12
  2. judgeval/api/__init__.py +501 -0
  3. judgeval/api/api_types.py +344 -0
  4. judgeval/cli.py +2 -4
  5. judgeval/constants.py +10 -26
  6. judgeval/data/evaluation_run.py +49 -26
  7. judgeval/data/example.py +2 -2
  8. judgeval/data/judgment_types.py +266 -82
  9. judgeval/data/result.py +4 -5
  10. judgeval/data/scorer_data.py +4 -2
  11. judgeval/data/tool.py +2 -2
  12. judgeval/data/trace.py +7 -50
  13. judgeval/data/trace_run.py +7 -4
  14. judgeval/{dataset.py → dataset/__init__.py} +43 -28
  15. judgeval/env.py +67 -0
  16. judgeval/{run_evaluation.py → evaluation/__init__.py} +29 -95
  17. judgeval/exceptions.py +27 -0
  18. judgeval/integrations/langgraph/__init__.py +788 -0
  19. judgeval/judges/__init__.py +2 -2
  20. judgeval/judges/litellm_judge.py +75 -15
  21. judgeval/judges/together_judge.py +86 -18
  22. judgeval/judges/utils.py +7 -21
  23. judgeval/{common/logger.py → logger.py} +8 -6
  24. judgeval/scorers/__init__.py +0 -4
  25. judgeval/scorers/agent_scorer.py +3 -7
  26. judgeval/scorers/api_scorer.py +8 -13
  27. judgeval/scorers/base_scorer.py +52 -32
  28. judgeval/scorers/example_scorer.py +1 -3
  29. judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +0 -14
  30. judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +45 -20
  31. judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +2 -2
  32. judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +3 -3
  33. judgeval/scorers/score.py +21 -31
  34. judgeval/scorers/trace_api_scorer.py +5 -0
  35. judgeval/scorers/utils.py +1 -103
  36. judgeval/tracer/__init__.py +1075 -2
  37. judgeval/tracer/constants.py +1 -0
  38. judgeval/tracer/exporters/__init__.py +37 -0
  39. judgeval/tracer/exporters/s3.py +119 -0
  40. judgeval/tracer/exporters/store.py +43 -0
  41. judgeval/tracer/exporters/utils.py +32 -0
  42. judgeval/tracer/keys.py +67 -0
  43. judgeval/tracer/llm/__init__.py +1233 -0
  44. judgeval/{common/tracer → tracer/llm}/providers.py +5 -10
  45. judgeval/{local_eval_queue.py → tracer/local_eval_queue.py} +15 -10
  46. judgeval/tracer/managers.py +188 -0
  47. judgeval/tracer/processors/__init__.py +181 -0
  48. judgeval/tracer/utils.py +20 -0
  49. judgeval/trainer/__init__.py +5 -0
  50. judgeval/{common/trainer → trainer}/config.py +12 -9
  51. judgeval/{common/trainer → trainer}/console.py +2 -9
  52. judgeval/{common/trainer → trainer}/trainable_model.py +12 -7
  53. judgeval/{common/trainer → trainer}/trainer.py +119 -17
  54. judgeval/utils/async_utils.py +2 -3
  55. judgeval/utils/decorators.py +24 -0
  56. judgeval/utils/file_utils.py +37 -4
  57. judgeval/utils/guards.py +32 -0
  58. judgeval/utils/meta.py +14 -0
  59. judgeval/{common/api/json_encoder.py → utils/serialize.py} +7 -1
  60. judgeval/utils/testing.py +88 -0
  61. judgeval/utils/url.py +10 -0
  62. judgeval/{version_check.py → utils/version_check.py} +3 -3
  63. judgeval/version.py +5 -0
  64. judgeval/warnings.py +4 -0
  65. {judgeval-0.7.1.dist-info → judgeval-0.9.0.dist-info}/METADATA +12 -14
  66. judgeval-0.9.0.dist-info/RECORD +80 -0
  67. judgeval/clients.py +0 -35
  68. judgeval/common/__init__.py +0 -13
  69. judgeval/common/api/__init__.py +0 -3
  70. judgeval/common/api/api.py +0 -375
  71. judgeval/common/api/constants.py +0 -186
  72. judgeval/common/exceptions.py +0 -27
  73. judgeval/common/storage/__init__.py +0 -6
  74. judgeval/common/storage/s3_storage.py +0 -97
  75. judgeval/common/tracer/__init__.py +0 -31
  76. judgeval/common/tracer/constants.py +0 -22
  77. judgeval/common/tracer/core.py +0 -2427
  78. judgeval/common/tracer/otel_exporter.py +0 -108
  79. judgeval/common/tracer/otel_span_processor.py +0 -188
  80. judgeval/common/tracer/span_processor.py +0 -37
  81. judgeval/common/tracer/span_transformer.py +0 -207
  82. judgeval/common/tracer/trace_manager.py +0 -101
  83. judgeval/common/trainer/__init__.py +0 -5
  84. judgeval/common/utils.py +0 -948
  85. judgeval/integrations/langgraph.py +0 -844
  86. judgeval/judges/mixture_of_judges.py +0 -287
  87. judgeval/judgment_client.py +0 -267
  88. judgeval/rules.py +0 -521
  89. judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +0 -52
  90. judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -28
  91. judgeval/utils/alerts.py +0 -93
  92. judgeval/utils/requests.py +0 -50
  93. judgeval-0.7.1.dist-info/RECORD +0 -82
  94. {judgeval-0.7.1.dist-info → judgeval-0.9.0.dist-info}/WHEEL +0 -0
  95. {judgeval-0.7.1.dist-info → judgeval-0.9.0.dist-info}/entry_points.txt +0 -0
  96. {judgeval-0.7.1.dist-info → judgeval-0.9.0.dist-info}/licenses/LICENSE.md +0 -0
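Before the per-file hunks, the renames and deletions listed above imply a set of import-path changes for 0.9.0: the `judgeval.common` namespace is flattened and the hand-written `JudgmentApiClient` is replaced by what appears to be a generated client (`judgeval/api/__init__.py`, `api_types.py`). A minimal sketch of the new top-level imports, using only modules that appear in this diff (the commented lines show the 0.7.1 equivalents):

```python
# 0.7.1 import paths (modules removed in 0.9.0):
# from judgeval.common.logger import judgeval_logger
# from judgeval.common.api import JudgmentApiClient
# from judgeval.common.exceptions import JudgmentAPIError

# 0.9.0 import paths (modules added or renamed in this diff):
from judgeval.logger import judgeval_logger
from judgeval.api import JudgmentSyncClient
from judgeval.exceptions import JudgmentAPIError, JudgmentRuntimeError
from judgeval.env import JUDGMENT_API_KEY, JUDGMENT_ORG_ID
```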
judgeval/{dataset.py → dataset/__init__.py} RENAMED
@@ -7,8 +7,9 @@ from typing import List, Literal, Optional
 
 from judgeval.data import Example, Trace
 from judgeval.utils.file_utils import get_examples_from_yaml, get_examples_from_json
-from judgeval.common.api.api import JudgmentApiClient
-from judgeval.common.logger import judgeval_logger
+from judgeval.api import JudgmentSyncClient
+from judgeval.logger import judgeval_logger
+from judgeval.env import JUDGMENT_API_KEY, JUDGMENT_ORG_ID
 
 
 @dataclass
@@ -17,8 +18,8 @@ class Dataset:
     traces: List[Trace]
     name: str
     project_name: str
-    judgment_api_key: str = os.getenv("JUDGMENT_API_KEY") or ""
-    organization_id: str = os.getenv("JUDGMENT_ORG_ID") or ""
+    judgment_api_key: str = JUDGMENT_API_KEY or ""
+    organization_id: str = JUDGMENT_ORG_ID or ""
 
     @classmethod
     def get(
@@ -26,10 +27,14 @@ class Dataset:
         name: str,
         project_name: str,
     ):
-        client = JudgmentApiClient(cls.judgment_api_key, cls.organization_id)
-        dataset = client.pull_dataset(name, project_name)
+        client = JudgmentSyncClient(cls.judgment_api_key, cls.organization_id)
+        dataset = client.datasets_pull_for_judgeval(
+            {
+                "dataset_alias": name,
+                "project_name": project_name,
+            },
+        )
         if not dataset:
-            judgeval_logger.error(f"Dataset {name} not found in project {project_name}")
             raise ValueError(f"Dataset {name} not found in project {project_name}")
         examples = dataset.get("examples", [])
         for e in examples:
@@ -61,14 +66,17 @@ class Dataset:
         if not traces:
             traces = []
 
-        client = JudgmentApiClient(cls.judgment_api_key, cls.organization_id)
-        client.push_dataset(
-            name,
-            project_name,
-            examples=[e.model_dump() for e in examples],
-            traces=[t.model_dump() for t in traces],
-            overwrite=overwrite,
+        client = JudgmentSyncClient(cls.judgment_api_key, cls.organization_id)
+        client.datasets_push(
+            {
+                "dataset_alias": name,
+                "project_name": project_name,
+                "examples": [e.model_dump() for e in examples],  # type: ignore
+                "traces": [t.model_dump() for t in traces],  # type: ignore
+                "overwrite": overwrite,
+            }
         )
+
         judgeval_logger.info(f"Succesfull created dataset {name}!")
         return cls(
             name=name,
@@ -115,19 +123,30 @@ class Dataset:
         self.add_examples(examples)
 
     def add_examples(self, examples: List[Example]) -> None:
-        client = JudgmentApiClient(self.judgment_api_key, self.organization_id)
-        client.append_examples(
-            dataset_alias=self.name,
-            project_name=self.project_name,
-            examples=[e.model_dump() for e in examples],
+        client = JudgmentSyncClient(self.judgment_api_key, self.organization_id)
+        client.datasets_insert_examples(
+            {
+                "dataset_alias": self.name,
+                "project_name": self.project_name,
+                "examples": [
+                    {
+                        "name": e.name,
+                        "created_at": e.created_at,
+                        "example_id": e.example_id,
+                    }
+                    for e in examples
+                ],
+            }
         )
 
     def add_traces(self, traces: List[Trace]) -> None:
-        client = JudgmentApiClient(self.judgment_api_key, self.organization_id)
-        client.append_traces(
-            dataset_alias=self.name,
-            project_name=self.project_name,
-            traces=[t.model_dump() for t in traces],
+        client = JudgmentSyncClient(self.judgment_api_key, self.organization_id)
+        client.traces_add_to_dataset(
+            {
+                "dataset_alias": self.name,
+                "project_name": self.project_name,
+                "traces": [t.model_dump() for t in traces],  # type: ignore
+            }
        )
 
     def save_as(
@@ -174,10 +193,6 @@ class Dataset:
                 f"Invalid file type: {file_type}. Please choose from {ACCEPTABLE_FILE_TYPES}"
             )
 
-    def delete(self):
-        client = JudgmentApiClient(self.judgment_api_key, self.organization_id)
-        client.delete_dataset(self.name, self.project_name)
-
     def __iter__(self):
         return iter(self.examples)
 
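Every dataset operation above switches from keyword arguments on `JudgmentApiClient` to a single request-body dict passed to `JudgmentSyncClient`. A minimal sketch of the new call shape, using only methods visible in this hunk; the dataset and project names are placeholders:

```python
from judgeval.api import JudgmentSyncClient
from judgeval.env import JUDGMENT_API_KEY, JUDGMENT_ORG_ID

client = JudgmentSyncClient(JUDGMENT_API_KEY or "", JUDGMENT_ORG_ID or "")

# 0.7.1: client.pull_dataset("my_dataset", "my_project")
# 0.9.0: one request-body dict per endpoint.
dataset = client.datasets_pull_for_judgeval(
    {
        "dataset_alias": "my_dataset",   # placeholder
        "project_name": "my_project",    # placeholder
    }
)
examples = dataset.get("examples", [])  # the payload carries an "examples" list
```

Note also that the standalone `delete()` method is removed in 0.9.0, so deleting a dataset is no longer part of the surface shown in this file.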
judgeval/env.py ADDED
@@ -0,0 +1,67 @@
+from __future__ import annotations
+from dotenv import load_dotenv
+
+load_dotenv()
+
+import os
+from typing import overload
+
+
+@overload
+def optional_env_var(var_name: str) -> str | None: ...
+
+
+@overload
+def optional_env_var(var_name: str, default: str) -> str: ...
+
+
+def optional_env_var(var_name: str, default: str | None = None) -> str | None:
+    return os.getenv(var_name, default)
+
+
+JUDGMENT_API_KEY = optional_env_var("JUDGMENT_API_KEY")
+JUDGMENT_ORG_ID = optional_env_var("JUDGMENT_ORG_ID")
+JUDGMENT_API_URL = optional_env_var("JUDGMENT_API_URL", "https://api.judgmentlabs.ai")
+
+JUDGMENT_DEFAULT_GPT_MODEL = optional_env_var("JUDGMENT_DEFAULT_GPT_MODEL", "gpt-4.1")
+JUDGMENT_DEFAULT_TOGETHER_MODEL = optional_env_var(
+    "JUDGMENT_DEFAULT_TOGETHER_MODEL", "meta-llama/Meta-Llama-3-8B-Instruct-Lite"
+)
+JUDGMENT_MAX_CONCURRENT_EVALUATIONS = int(
+    optional_env_var("JUDGMENT_MAX_CONCURRENT_EVALUATIONS", "10")
+)
+
+JUDGMENT_S3_ACCESS_KEY_ID = optional_env_var("JUDGMENT_S3_ACCESS_KEY_ID")
+JUDGMENT_S3_SECRET_ACCESS_KEY = optional_env_var("JUDGMENT_S3_SECRET_ACCESS_KEY")
+JUDGMENT_S3_REGION_NAME = optional_env_var("JUDGMENT_S3_REGION_NAME")
+JUDGMENT_S3_BUCKET_NAME = optional_env_var("JUDGMENT_S3_BUCKET_NAME")
+JUDGMENT_S3_PREFIX = optional_env_var("JUDGMENT_S3_PREFIX", "spans/")
+JUDGMENT_S3_ENDPOINT_URL = optional_env_var("JUDGMENT_S3_ENDPOINT_URL")
+JUDGMENT_S3_SIGNATURE_VERSION = optional_env_var("JUDGMENT_S3_SIGNATURE_VERSION", "s3")
+JUDGMENT_S3_ADDRESSING_STYLE = optional_env_var("JUDGMENT_S3_ADDRESSING_STYLE", "auto")
+
+
+JUDGMENT_NO_COLOR = optional_env_var("JUDGMENT_NO_COLOR")
+
+
+TOGETHERAI_API_KEY = optional_env_var("TOGETHERAI_API_KEY")
+TOGETHER_API_KEY = optional_env_var("TOGETHER_API_KEY")
+
+__all__ = (
+    "JUDGMENT_API_KEY",
+    "JUDGMENT_ORG_ID",
+    "JUDGMENT_API_URL",
+    "JUDGMENT_DEFAULT_GPT_MODEL",
+    "JUDGMENT_DEFAULT_TOGETHER_MODEL",
+    "JUDGMENT_MAX_CONCURRENT_EVALUATIONS",
+    "JUDGMENT_S3_ACCESS_KEY_ID",
+    "JUDGMENT_S3_SECRET_ACCESS_KEY",
+    "JUDGMENT_S3_REGION_NAME",
+    "JUDGMENT_S3_BUCKET_NAME",
+    "JUDGMENT_S3_PREFIX",
+    "JUDGMENT_S3_ENDPOINT_URL",
+    "JUDGMENT_S3_ADDRESSING_STYLE",
+    "JUDGMENT_NO_COLOR",
+    "TOGETHERAI_API_KEY",
+    "TOGETHER_API_KEY",
+)
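`judgeval/env.py` centralizes environment configuration behind `optional_env_var`, whose two `@overload`s make variables declared with a default resolve to `str` while those without stay `str | None`. A short illustration of how that typing plays out for callers; the `MY_TIMEOUT`/`MY_TOKEN` names are hypothetical and not part of judgeval:

```python
from __future__ import annotations

from judgeval.env import JUDGMENT_API_KEY, JUDGMENT_API_URL, optional_env_var

base_url: str = JUDGMENT_API_URL          # default provided above -> typed str
api_key: str | None = JUDGMENT_API_KEY    # no default -> typed str | None

# Hypothetical variables, showing the two overloads directly:
timeout: str = optional_env_var("MY_TIMEOUT", "30")      # default given -> str
maybe_token: str | None = optional_env_var("MY_TOKEN")   # no default -> str | None
```

Note that `load_dotenv()` runs at import time, before any of the constants are read, so values from a local `.env` file participate in this resolution; `JUDGMENT_S3_SIGNATURE_VERSION` is defined but not listed in `__all__`.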
judgeval/{run_evaluation.py → evaluation/__init__.py} RENAMED
@@ -6,19 +6,18 @@ import time
 import orjson
 import sys
 import threading
-from typing import List, Dict, Union, Tuple, Any, TYPE_CHECKING
+from typing import List, Dict, Union, Tuple, TYPE_CHECKING
 from rich import print as rprint
 
 from judgeval.data import ScorerData, ScoringResult, Example
 from judgeval.scorers import BaseScorer, APIScorerConfig
 from judgeval.scorers.score import a_execute_scoring
-from judgeval.common.api import JudgmentApiClient
-from judgeval.constants import (
-    MAX_CONCURRENT_EVALUATIONS,
+from judgeval.api import JudgmentSyncClient
+from judgeval.env import (
+    JUDGMENT_MAX_CONCURRENT_EVALUATIONS,
 )
-from judgeval.common.exceptions import JudgmentAPIError
-from judgeval.common.api.api import JudgmentAPIException
-from judgeval.common.logger import judgeval_logger
+from judgeval.exceptions import JudgmentAPIError, JudgmentRuntimeError
+from judgeval.logger import judgeval_logger
 
 
 if TYPE_CHECKING:
@@ -48,72 +47,6 @@ def safe_run_async(coro):
     return asyncio.run(coro)
 
 
-def send_to_rabbitmq(evaluation_run: EvaluationRun) -> Dict[str, Any]:
-    """
-    Sends an evaluation run to the RabbitMQ evaluation queue.
-    """
-    if not evaluation_run.judgment_api_key or not evaluation_run.organization_id:
-        raise ValueError("API key and organization ID are required")
-    if not evaluation_run.eval_name or not evaluation_run.project_name:
-        raise ValueError("Eval name and project name are required")
-    api_client = JudgmentApiClient(
-        evaluation_run.judgment_api_key, evaluation_run.organization_id
-    )
-    return api_client.add_to_evaluation_queue(
-        evaluation_run.eval_name, evaluation_run.project_name
-    )
-
-
-def execute_api_eval(evaluation_run: EvaluationRun) -> Dict:
-    """
-    Executes an evaluation of a list of `Example`s using one or more `JudgmentScorer`s via the Judgment API.
-
-    Args:
-        evaluation_run (EvaluationRun): The evaluation run object containing the examples, scorers, and metadata
-
-    Returns:
-        List[Dict]: The results of the evaluation. Each result is a dictionary containing the fields of a `ScoringResult`
-        object.
-    """
-
-    try:
-        # submit API request to execute evals
-        if not evaluation_run.judgment_api_key or not evaluation_run.organization_id:
-            raise ValueError("API key and organization ID are required")
-        api_client = JudgmentApiClient(
-            evaluation_run.judgment_api_key, evaluation_run.organization_id
-        )
-        return api_client.run_evaluation(evaluation_run.model_dump())
-    except Exception as e:
-        judgeval_logger.error(f"Error: {e}")
-
-        details = "No details provided"
-        if isinstance(e, JudgmentAPIException):
-            details = e.response_json.get("detail", "No details provided")
-
-        raise JudgmentAPIError(
-            "An error occurred while executing the Judgment API request: " + details
-        )
-
-
-def check_missing_scorer_data(results: List[ScoringResult]) -> List[ScoringResult]:
-    """
-    Checks if any `ScoringResult` objects are missing `scorers_data`.
-
-    If any are missing, logs an error and returns the results.
-    """
-    for i, result in enumerate(results):
-        if not result.scorers_data:
-            judgeval_logger.error(
-                f"Scorer data is missing for example {i}. "
-                "This is usually caused when the example does not contain "
-                "the fields required by the scorer. "
-                "Check that your example contains the fields required by the scorers. "
-                "TODO add docs link here for reference."
-            )
-    return results
-
-
 def log_evaluation_results(
     scoring_results: List[ScoringResult],
     run: EvaluationRun,
@@ -135,17 +68,19 @@ def log_evaluation_results(
         if not judgment_api_key or not run.organization_id:
             raise ValueError("API key and organization ID are required")
 
-        api_client = JudgmentApiClient(judgment_api_key, run.organization_id)
-        response = api_client.log_evaluation_results(
-            scoring_results,
-            run.model_dump(warnings=False),
+        api_client = JudgmentSyncClient(judgment_api_key, run.organization_id)
+        response = api_client.log_eval_results(
+            {
+                "results": scoring_results,  # type: ignore
+                "run": run.model_dump(warnings=False),  # type: ignore
+            }
         )
         url = response.get("ui_results_url")
         return url
 
     except Exception as e:
         judgeval_logger.error(f"Failed to save evaluation results to DB: {str(e)}")
-        raise JudgmentAPIError(
+        raise JudgmentRuntimeError(
            f"Request failed while saving evaluation results to DB: {str(e)}"
         )
 
@@ -209,7 +144,7 @@ def _poll_evaluation_until_complete(
     """
     poll_count = 0
     exception_count = 0
-    api_client = JudgmentApiClient(judgment_api_key, organization_id)
+    api_client = JudgmentSyncClient(judgment_api_key, organization_id)
     while poll_count < max_poll_count:
         poll_count += 1
         try:
@@ -222,8 +157,11 @@
                 time.sleep(poll_interval_seconds)
                 continue
 
-            results_response = api_client.fetch_evaluation_results(
-                experiment_run_id, project_name
+            results_response = api_client.fetch_experiment_run(
+                {
+                    "experiment_run_id": experiment_run_id,
+                    "project_name": project_name,
+                }
             )
             url = results_response.get("ui_results_url")
 
@@ -264,13 +202,13 @@
 
             judgeval_logger.error(f"Error checking evaluation status: {str(e)}")
             if exception_count > max_failures:
-                raise JudgmentAPIError(
+                raise JudgmentRuntimeError(
                     f"Error checking evaluation status after {poll_count} attempts: {str(e)}"
                 )
 
             time.sleep(poll_interval_seconds)
 
-    raise JudgmentAPIError(
+    raise JudgmentRuntimeError(
        f"Error checking evaluation status after {poll_count} attempts"
    )
 
@@ -286,15 +224,12 @@ def progress_logger(stop_event, msg="Working...", interval=5):
 def run_eval(
     evaluation_run: EvaluationRun,
     judgment_api_key: str,
-    show_url: bool = True,
 ) -> List[ScoringResult]:
     """
     Executes an evaluation of `Example`s using one or more `Scorer`s
 
     Args:
         evaluation_run (EvaluationRun): Stores example and evaluation together for running
-        judgment_api_key (str): API key for authentication
-        show_url (bool): Whether to display the evaluation results URL. Defaults to True.
 
     Returns:
         List[ScoringResult]: A list of ScoringResult objects
@@ -339,11 +274,11 @@
         )
         t.start()
         try:
-            api_client = JudgmentApiClient(
+            api_client = JudgmentSyncClient(
                 judgment_api_key, evaluation_run.organization_id
             )
-            response = api_client.add_to_evaluation_queue(
-                evaluation_run.model_dump(warnings=False)
+            response = api_client.add_to_run_eval_queue_examples(
+                evaluation_run.model_dump(warnings=False)  # type: ignore
             )
 
             if not response.get("success", False):
@@ -351,7 +286,7 @@
                 judgeval_logger.error(
                     f"Error adding evaluation to queue: {error_message}"
                 )
-                raise JudgmentAPIError(error_message)
+                raise JudgmentRuntimeError(error_message)
 
             num_scorers = (
                 len(evaluation_run.judgment_scorers)
@@ -375,7 +310,7 @@
                 evaluation_run.custom_scorers,
                 model=evaluation_run.model,
                 throttle_value=0,
-                max_concurrent=MAX_CONCURRENT_EVALUATIONS,
+                max_concurrent=JUDGMENT_MAX_CONCURRENT_EVALUATIONS,
             )
         )
 
@@ -383,10 +318,9 @@
         scoring_result.model_dump(warnings=False) for scoring_result in results
     ]
     url = log_evaluation_results(send_results, evaluation_run, judgment_api_key)
-    if show_url:
-        rprint(
-            f"\n🔍 You can view your evaluation results here: [rgb(106,0,255)][link={url}]View Results[/link]\n"
-        )
+    rprint(
+        f"\n🔍 You can view your evaluation results here: [rgb(106,0,255)][link={url}]View Results[/link]\n"
+    )
     return results
 
 
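The polling and logging paths now talk to `JudgmentSyncClient` with request-body dicts and raise `JudgmentRuntimeError` for client-side failures. A rough sketch of the polling request shape, based only on the calls in these hunks; `wait_for_results_url` is a hypothetical helper, and the status-check call that precedes the fetch in `_poll_evaluation_until_complete` is omitted:

```python
import time

from judgeval.api import JudgmentSyncClient
from judgeval.exceptions import JudgmentRuntimeError


def wait_for_results_url(
    client: JudgmentSyncClient,
    experiment_run_id: str,
    project_name: str,
    attempts: int = 10,
    poll_interval_seconds: float = 5.0,
) -> str:
    for _ in range(attempts):
        # Same request-body shape as the fetch_experiment_run call above.
        response = client.fetch_experiment_run(
            {
                "experiment_run_id": experiment_run_id,
                "project_name": project_name,
            }
        )
        url = response.get("ui_results_url")
        if url:
            return url
        time.sleep(poll_interval_seconds)
    raise JudgmentRuntimeError(f"No results URL after {attempts} attempts")
```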
judgeval/exceptions.py ADDED
@@ -0,0 +1,27 @@
+from __future__ import annotations
+
+from httpx import HTTPError, Response
+
+
+class JudgmentAPIError(HTTPError):
+    status_code: int
+    detail: str
+    response: Response
+
+    def __init__(self, status_code: int, detail: str, response: Response):
+        self.status_code = status_code
+        self.detail = detail
+        self.response = response
+        super().__init__(f"{status_code}: {detail}")
+
+
+class JudgmentTestError(Exception): ...
+
+
+class JudgmentRuntimeError(RuntimeError): ...
+
+
+class InvalidJudgeModelError(Exception): ...
+
+
+__all__ = ("JudgmentAPIError", "JudgmentRuntimeError", "InvalidJudgeModelError")