judgeval 0.0.11__py3-none-any.whl → 0.22.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of judgeval might be problematic.

Files changed (171)
  1. judgeval/__init__.py +177 -12
  2. judgeval/api/__init__.py +519 -0
  3. judgeval/api/api_types.py +407 -0
  4. judgeval/cli.py +79 -0
  5. judgeval/constants.py +76 -47
  6. judgeval/data/__init__.py +3 -3
  7. judgeval/data/evaluation_run.py +125 -0
  8. judgeval/data/example.py +15 -56
  9. judgeval/data/judgment_types.py +450 -0
  10. judgeval/data/result.py +29 -73
  11. judgeval/data/scorer_data.py +29 -62
  12. judgeval/data/scripts/fix_default_factory.py +23 -0
  13. judgeval/data/scripts/openapi_transform.py +123 -0
  14. judgeval/data/trace.py +121 -0
  15. judgeval/dataset/__init__.py +264 -0
  16. judgeval/env.py +52 -0
  17. judgeval/evaluation/__init__.py +344 -0
  18. judgeval/exceptions.py +27 -0
  19. judgeval/integrations/langgraph/__init__.py +13 -0
  20. judgeval/integrations/openlit/__init__.py +50 -0
  21. judgeval/judges/__init__.py +2 -3
  22. judgeval/judges/base_judge.py +2 -3
  23. judgeval/judges/litellm_judge.py +100 -20
  24. judgeval/judges/together_judge.py +101 -20
  25. judgeval/judges/utils.py +20 -24
  26. judgeval/logger.py +62 -0
  27. judgeval/prompt/__init__.py +330 -0
  28. judgeval/scorers/__init__.py +18 -25
  29. judgeval/scorers/agent_scorer.py +17 -0
  30. judgeval/scorers/api_scorer.py +45 -41
  31. judgeval/scorers/base_scorer.py +83 -38
  32. judgeval/scorers/example_scorer.py +17 -0
  33. judgeval/scorers/exceptions.py +1 -0
  34. judgeval/scorers/judgeval_scorers/__init__.py +0 -148
  35. judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +19 -17
  36. judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +13 -19
  37. judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +12 -19
  38. judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +13 -19
  39. judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +15 -0
  40. judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +327 -0
  41. judgeval/scorers/score.py +77 -306
  42. judgeval/scorers/utils.py +4 -199
  43. judgeval/tracer/__init__.py +1122 -2
  44. judgeval/tracer/constants.py +1 -0
  45. judgeval/tracer/exporters/__init__.py +40 -0
  46. judgeval/tracer/exporters/s3.py +119 -0
  47. judgeval/tracer/exporters/store.py +59 -0
  48. judgeval/tracer/exporters/utils.py +32 -0
  49. judgeval/tracer/keys.py +63 -0
  50. judgeval/tracer/llm/__init__.py +7 -0
  51. judgeval/tracer/llm/config.py +78 -0
  52. judgeval/tracer/llm/constants.py +9 -0
  53. judgeval/tracer/llm/llm_anthropic/__init__.py +3 -0
  54. judgeval/tracer/llm/llm_anthropic/config.py +6 -0
  55. judgeval/tracer/llm/llm_anthropic/messages.py +452 -0
  56. judgeval/tracer/llm/llm_anthropic/messages_stream.py +322 -0
  57. judgeval/tracer/llm/llm_anthropic/wrapper.py +59 -0
  58. judgeval/tracer/llm/llm_google/__init__.py +3 -0
  59. judgeval/tracer/llm/llm_google/config.py +6 -0
  60. judgeval/tracer/llm/llm_google/generate_content.py +127 -0
  61. judgeval/tracer/llm/llm_google/wrapper.py +30 -0
  62. judgeval/tracer/llm/llm_openai/__init__.py +3 -0
  63. judgeval/tracer/llm/llm_openai/beta_chat_completions.py +216 -0
  64. judgeval/tracer/llm/llm_openai/chat_completions.py +501 -0
  65. judgeval/tracer/llm/llm_openai/config.py +6 -0
  66. judgeval/tracer/llm/llm_openai/responses.py +506 -0
  67. judgeval/tracer/llm/llm_openai/utils.py +42 -0
  68. judgeval/tracer/llm/llm_openai/wrapper.py +63 -0
  69. judgeval/tracer/llm/llm_together/__init__.py +3 -0
  70. judgeval/tracer/llm/llm_together/chat_completions.py +406 -0
  71. judgeval/tracer/llm/llm_together/config.py +6 -0
  72. judgeval/tracer/llm/llm_together/wrapper.py +52 -0
  73. judgeval/tracer/llm/providers.py +19 -0
  74. judgeval/tracer/managers.py +167 -0
  75. judgeval/tracer/processors/__init__.py +220 -0
  76. judgeval/tracer/utils.py +19 -0
  77. judgeval/trainer/__init__.py +14 -0
  78. judgeval/trainer/base_trainer.py +122 -0
  79. judgeval/trainer/config.py +128 -0
  80. judgeval/trainer/console.py +144 -0
  81. judgeval/trainer/fireworks_trainer.py +396 -0
  82. judgeval/trainer/trainable_model.py +243 -0
  83. judgeval/trainer/trainer.py +70 -0
  84. judgeval/utils/async_utils.py +39 -0
  85. judgeval/utils/decorators/__init__.py +0 -0
  86. judgeval/utils/decorators/dont_throw.py +37 -0
  87. judgeval/utils/decorators/use_once.py +13 -0
  88. judgeval/utils/file_utils.py +97 -0
  89. judgeval/utils/guards.py +36 -0
  90. judgeval/utils/meta.py +27 -0
  91. judgeval/utils/project.py +15 -0
  92. judgeval/utils/serialize.py +253 -0
  93. judgeval/utils/testing.py +70 -0
  94. judgeval/utils/url.py +10 -0
  95. judgeval/utils/version_check.py +28 -0
  96. judgeval/utils/wrappers/README.md +3 -0
  97. judgeval/utils/wrappers/__init__.py +15 -0
  98. judgeval/utils/wrappers/immutable_wrap_async.py +74 -0
  99. judgeval/utils/wrappers/immutable_wrap_async_iterator.py +84 -0
  100. judgeval/utils/wrappers/immutable_wrap_sync.py +66 -0
  101. judgeval/utils/wrappers/immutable_wrap_sync_iterator.py +84 -0
  102. judgeval/utils/wrappers/mutable_wrap_async.py +67 -0
  103. judgeval/utils/wrappers/mutable_wrap_sync.py +67 -0
  104. judgeval/utils/wrappers/py.typed +0 -0
  105. judgeval/utils/wrappers/utils.py +35 -0
  106. judgeval/version.py +5 -0
  107. judgeval/warnings.py +4 -0
  108. judgeval-0.22.2.dist-info/METADATA +265 -0
  109. judgeval-0.22.2.dist-info/RECORD +112 -0
  110. judgeval-0.22.2.dist-info/entry_points.txt +2 -0
  111. judgeval/clients.py +0 -39
  112. judgeval/common/__init__.py +0 -8
  113. judgeval/common/exceptions.py +0 -28
  114. judgeval/common/logger.py +0 -189
  115. judgeval/common/tracer.py +0 -798
  116. judgeval/common/utils.py +0 -763
  117. judgeval/data/api_example.py +0 -111
  118. judgeval/data/datasets/__init__.py +0 -5
  119. judgeval/data/datasets/dataset.py +0 -286
  120. judgeval/data/datasets/eval_dataset_client.py +0 -193
  121. judgeval/data/datasets/ground_truth.py +0 -54
  122. judgeval/data/datasets/utils.py +0 -74
  123. judgeval/evaluation_run.py +0 -132
  124. judgeval/judges/mixture_of_judges.py +0 -248
  125. judgeval/judgment_client.py +0 -354
  126. judgeval/run_evaluation.py +0 -439
  127. judgeval/scorers/judgeval_scorer.py +0 -140
  128. judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py +0 -19
  129. judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py +0 -19
  130. judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py +0 -22
  131. judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -19
  132. judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py +0 -32
  133. judgeval/scorers/judgeval_scorers/api_scorers/summarization.py +0 -20
  134. judgeval/scorers/judgeval_scorers/api_scorers/tool_correctness.py +0 -19
  135. judgeval/scorers/judgeval_scorers/classifiers/__init__.py +0 -3
  136. judgeval/scorers/judgeval_scorers/classifiers/text2sql/__init__.py +0 -3
  137. judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py +0 -54
  138. judgeval/scorers/judgeval_scorers/local_implementations/__init__.py +0 -24
  139. judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/__init__.py +0 -4
  140. judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/answer_correctness_scorer.py +0 -277
  141. judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/prompts.py +0 -169
  142. judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/__init__.py +0 -4
  143. judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/answer_relevancy_scorer.py +0 -298
  144. judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/prompts.py +0 -174
  145. judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/__init__.py +0 -3
  146. judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/contextual_precision_scorer.py +0 -264
  147. judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/prompts.py +0 -106
  148. judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/__init__.py +0 -3
  149. judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/contextual_recall_scorer.py +0 -254
  150. judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/prompts.py +0 -142
  151. judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/__init__.py +0 -3
  152. judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/contextual_relevancy_scorer.py +0 -245
  153. judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/prompts.py +0 -121
  154. judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/__init__.py +0 -3
  155. judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/faithfulness_scorer.py +0 -325
  156. judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/prompts.py +0 -268
  157. judgeval/scorers/judgeval_scorers/local_implementations/hallucination/__init__.py +0 -3
  158. judgeval/scorers/judgeval_scorers/local_implementations/hallucination/hallucination_scorer.py +0 -263
  159. judgeval/scorers/judgeval_scorers/local_implementations/hallucination/prompts.py +0 -104
  160. judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/__init__.py +0 -5
  161. judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/json_correctness_scorer.py +0 -134
  162. judgeval/scorers/judgeval_scorers/local_implementations/summarization/__init__.py +0 -3
  163. judgeval/scorers/judgeval_scorers/local_implementations/summarization/prompts.py +0 -247
  164. judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarization_scorer.py +0 -550
  165. judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/__init__.py +0 -3
  166. judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/tool_correctness_scorer.py +0 -157
  167. judgeval/scorers/prompt_scorer.py +0 -439
  168. judgeval-0.0.11.dist-info/METADATA +0 -36
  169. judgeval-0.0.11.dist-info/RECORD +0 -84
  170. {judgeval-0.0.11.dist-info → judgeval-0.22.2.dist-info}/WHEEL +0 -0
  171. {judgeval-0.0.11.dist-info → judgeval-0.22.2.dist-info}/licenses/LICENSE.md +0 -0
judgeval/dataset/__init__.py ADDED
@@ -0,0 +1,264 @@
+ import datetime
+ import orjson
+ import os
+ import yaml
+ from dataclasses import dataclass
+ from typing import List, Literal, Optional
+
+ from judgeval.data import Example
+ from judgeval.data.trace import Trace
+ from judgeval.utils.file_utils import get_examples_from_yaml, get_examples_from_json
+ from judgeval.api import JudgmentSyncClient
+ from judgeval.logger import judgeval_logger
+ from judgeval.env import JUDGMENT_API_KEY, JUDGMENT_ORG_ID
+
+ from judgeval.data.judgment_types import DatasetKind
+
+
+ @dataclass
+ class DatasetInfo:
+     dataset_id: str
+     name: str
+     created_at: str
+     kind: DatasetKind
+     entries: int
+     creator: str
+
+
+ @dataclass
+ class Dataset:
+     name: str
+     project_name: str
+     dataset_kind: DatasetKind = DatasetKind.example
+     examples: Optional[List[Example]] = None
+     traces: Optional[List[Trace]] = None
+     judgment_api_key: str | None = JUDGMENT_API_KEY
+     organization_id: str | None = JUDGMENT_ORG_ID
+
+     @classmethod
+     def get(
+         cls,
+         name: str,
+         project_name: str,
+     ):
+         if not cls.judgment_api_key or not cls.organization_id:
+             raise ValueError("Judgment API key and organization ID are required")
+         client = JudgmentSyncClient(cls.judgment_api_key, cls.organization_id)
+         dataset = client.datasets_pull_for_judgeval(
+             {
+                 "dataset_name": name,
+                 "project_name": project_name,
+             },
+         )
+         if not dataset:
+             raise ValueError(f"Dataset {name} not found in project {project_name}")
+
+         dataset_kind = DatasetKind(dataset.get("dataset_kind", "example"))
+
+         if dataset_kind == DatasetKind.example:
+             examples = dataset.get("examples", [])
+             if examples is None:
+                 examples = []
+
+             for e in examples:
+                 if isinstance(e, dict) and isinstance(e.get("data", {}), dict):
+                     e.update(e.pop("data"))  # type: ignore
+                     e.pop(
+                         "example_id"
+                     )  # TODO: remove once scorer data migration is complete
+             judgeval_logger.info(f"Successfully retrieved example dataset {name}!")
+             return cls(
+                 name=name,
+                 project_name=project_name,
+                 dataset_kind=dataset_kind,
+                 examples=[Example(**e) for e in examples],
+             )
+
+         elif dataset_kind == DatasetKind.trace:
+             trace_data = dataset.get("traces", [])
+             if trace_data is None:
+                 trace_data = []
+
+             traces = []
+             for trace_item in trace_data:
+                 if isinstance(trace_item, dict):
+                     trace = Trace.from_dataset_trace_with_spans(trace_item)
+                     traces.append(trace)
+
+             judgeval_logger.info(f"Successfully retrieved trace dataset {name}!")
+             return cls(
+                 name=name,
+                 project_name=project_name,
+                 dataset_kind=dataset_kind,
+                 traces=traces,
+             )
+
+         else:
+             raise ValueError(f"Unsupported dataset kind: {dataset_kind}")
+
+     @classmethod
+     def create(
+         cls,
+         name: str,
+         project_name: str,
+         examples: List[Example] = [],
+         overwrite: bool = False,
+     ):
+         if not cls.judgment_api_key or not cls.organization_id:
+             raise ValueError("Judgment API key and organization ID are required")
+         if not examples:
+             examples = []
+
+         client = JudgmentSyncClient(cls.judgment_api_key, cls.organization_id)
+         client.datasets_create_for_judgeval(
+             {
+                 "name": name,
+                 "project_name": project_name,
+                 "examples": examples,  # type: ignore
+                 "dataset_kind": "example",
+                 "overwrite": overwrite,
+             }
+         )
+
+         judgeval_logger.info(f"Successfully created dataset {name}!")
+         return cls(
+             name=name,
+             project_name=project_name,
+             examples=examples,
+         )
+
+     @classmethod
+     def list(cls, project_name: str):
+         if not cls.judgment_api_key or not cls.organization_id:
+             raise ValueError("Judgment API key and organization ID are required")
+         client = JudgmentSyncClient(cls.judgment_api_key, cls.organization_id)
+         datasets = client.datasets_pull_all_for_judgeval({"project_name": project_name})
+
+         judgeval_logger.info(f"Fetched all datasets for project {project_name}!")
+
+         return [DatasetInfo(**dataset_info) for dataset_info in datasets]
+
+     def add_from_json(self, file_path: str) -> None:
+         """
+         Adds examples from a JSON file.
+
+         The JSON file is expected to have the following format:
+         [
+             {
+                 "key_01": "value_01",
+                 "key_02": "value_02"
+             },
+             {
+                 "key_11": "value_11",
+                 "key_12": "value_12",
+                 "key_13": "value_13"
+             },
+             ...
+         ]
+         """
+         examples = get_examples_from_json(file_path)
+         self.add_examples(examples)
+
+     def add_from_yaml(self, file_path: str) -> None:
+         """
+         Adds examples from a YAML file.
+
+         The YAML file is expected to have the following format:
+         - key_01: value_01
+           key_02: value_02
+         - key_11: value_11
+           key_12: value_12
+           key_13: value_13
+         ...
+         """
+
+         examples = get_examples_from_yaml(file_path)
+         self.add_examples(examples)
+
+     def add_examples(self, examples: List[Example]) -> None:
+         if not isinstance(examples, list):
+             raise TypeError("examples must be a list")
+
+         if not self.judgment_api_key or not self.organization_id:
+             raise ValueError("Judgment API key and organization ID are required")
+
+         client = JudgmentSyncClient(self.judgment_api_key, self.organization_id)
+         client.datasets_insert_examples_for_judgeval(
+             {
+                 "dataset_name": self.name,
+                 "project_name": self.project_name,
+                 "examples": examples,  # type: ignore
+             }
+         )
+
+     def save_as(
+         self,
+         file_type: Literal["json", "yaml"],
+         dir_path: str,
+         save_name: str | None = None,
+     ) -> None:
+         """
+         Saves the dataset as a file. Saves only the examples.
+
+         Args:
+             file_type (Literal["json", "yaml"]): The file type to save the dataset as.
+             dir_path (str): The directory path to save the file to.
+             save_name (str, optional): The name of the file to save. Defaults to None.
+         """
+         if not os.path.exists(dir_path):
+             os.makedirs(dir_path)
+         file_name = (
+             datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
+             if save_name is None
+             else save_name
+         )
+         complete_path = os.path.join(dir_path, f"{file_name}.{file_type}")
+         if file_type == "json":
+             with open(complete_path, "wb") as file:
+                 file.write(
+                     orjson.dumps(
+                         {
+                             "examples": [e.to_dict() for e in self.examples]
+                             if self.examples
+                             else [],
+                         },
+                         option=orjson.OPT_INDENT_2,
+                     )
+                 )
+         elif file_type == "yaml":
+             with open(complete_path, "w") as file:
+                 yaml_data = {
+                     "examples": [e.to_dict() for e in self.examples]
+                     if self.examples
+                     else [],
+                 }
+                 yaml.dump(yaml_data, file, default_flow_style=False)
+         else:
+             ACCEPTABLE_FILE_TYPES = ["json", "yaml"]
+             raise TypeError(
+                 f"Invalid file type: {file_type}. Please choose from {ACCEPTABLE_FILE_TYPES}"
+             )
+
+     def __iter__(self):
+         if self.dataset_kind == DatasetKind.example and self.examples:
+             return iter(self.examples)
+         elif self.dataset_kind == DatasetKind.trace and self.traces:
+             return iter(self.traces)
+         else:
+             return iter([])
+
+     def __len__(self):
+         if self.dataset_kind == DatasetKind.example and self.examples:
+             return len(self.examples)
+         elif self.dataset_kind == DatasetKind.trace and self.traces:
+             return len(self.traces)
+         else:
+             return 0
+
+     def __str__(self):
+         if self.dataset_kind == DatasetKind.example:
+             return (
+                 f"{self.__class__.__name__}(examples={self.examples}, name={self.name})"
+             )
+         else:
+             return f"{self.__class__.__name__}(traces={self.traces}, name={self.name})"
judgeval/env.py ADDED
@@ -0,0 +1,52 @@
+ from __future__ import annotations
+ from dotenv import load_dotenv
+
+ load_dotenv()
+
+ import os
+ from typing import overload
+
+
+ @overload
+ def optional_env_var(var_name: str) -> str | None: ...
+
+
+ @overload
+ def optional_env_var(var_name: str, default: str) -> str: ...
+
+
+ def optional_env_var(var_name: str, default: str | None = None) -> str | None:
+     return os.getenv(var_name, default)
+
+
+ JUDGMENT_API_KEY = optional_env_var("JUDGMENT_API_KEY")
+ JUDGMENT_ORG_ID = optional_env_var("JUDGMENT_ORG_ID")
+ JUDGMENT_API_URL = optional_env_var("JUDGMENT_API_URL", "https://api.judgmentlabs.ai")
+
+ JUDGMENT_DEFAULT_GPT_MODEL = optional_env_var("JUDGMENT_DEFAULT_GPT_MODEL", "gpt-5")
+ JUDGMENT_DEFAULT_TOGETHER_MODEL = optional_env_var(
+     "JUDGMENT_DEFAULT_TOGETHER_MODEL", "meta-llama/Meta-Llama-3-8B-Instruct-Lite"
+ )
+ JUDGMENT_MAX_CONCURRENT_EVALUATIONS = int(
+     optional_env_var("JUDGMENT_MAX_CONCURRENT_EVALUATIONS", "10")
+ )
+
+
+ JUDGMENT_ENABLE_MONITORING = optional_env_var("JUDGMENT_ENABLE_MONITORING", "true")
+ JUDGMENT_ENABLE_EVALUATIONS = optional_env_var("JUDGMENT_ENABLE_EVALUATIONS", "true")
+
+ JUDGMENT_S3_ACCESS_KEY_ID = optional_env_var("JUDGMENT_S3_ACCESS_KEY_ID")
+ JUDGMENT_S3_SECRET_ACCESS_KEY = optional_env_var("JUDGMENT_S3_SECRET_ACCESS_KEY")
+ JUDGMENT_S3_REGION_NAME = optional_env_var("JUDGMENT_S3_REGION_NAME")
+ JUDGMENT_S3_BUCKET_NAME = optional_env_var("JUDGMENT_S3_BUCKET_NAME")
+ JUDGMENT_S3_PREFIX = optional_env_var("JUDGMENT_S3_PREFIX", "spans/")
+ JUDGMENT_S3_ENDPOINT_URL = optional_env_var("JUDGMENT_S3_ENDPOINT_URL")
+ JUDGMENT_S3_SIGNATURE_VERSION = optional_env_var("JUDGMENT_S3_SIGNATURE_VERSION", "s3")
+ JUDGMENT_S3_ADDRESSING_STYLE = optional_env_var("JUDGMENT_S3_ADDRESSING_STYLE", "auto")
+
+
+ JUDGMENT_NO_COLOR = optional_env_var("JUDGMENT_NO_COLOR")
+
+
+ TOGETHERAI_API_KEY = optional_env_var("TOGETHERAI_API_KEY")
+ TOGETHER_API_KEY = optional_env_var("TOGETHER_API_KEY")
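judgeval/env.py resolves all of these settings once at import time (after load_dotenv()), so overrides have to be in place before the module is imported. A short sketch under that assumption; the key values below are placeholders, not real credentials:

import os

# Placeholder credentials; real values would come from your environment or a .env file.
os.environ.setdefault("JUDGMENT_API_KEY", "placeholder-key")
os.environ.setdefault("JUDGMENT_ORG_ID", "placeholder-org")
os.environ.setdefault("JUDGMENT_DEFAULT_GPT_MODEL", "gpt-4o")  # override the default judge model

from judgeval.env import JUDGMENT_API_KEY, JUDGMENT_API_URL, JUDGMENT_DEFAULT_GPT_MODEL

print(JUDGMENT_API_URL)            # falls back to https://api.judgmentlabs.ai
print(JUDGMENT_DEFAULT_GPT_MODEL)  # picks up the override above
print(JUDGMENT_API_KEY is not None)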
judgeval/evaluation/__init__.py ADDED
@@ -0,0 +1,344 @@
+ from __future__ import annotations
+
+ import asyncio
+ import concurrent.futures
+ import time
+ import threading
+ from typing import List, Tuple, TYPE_CHECKING
+ from rich import print as rprint
+
+ from judgeval.data import ScorerData, ScoringResult
+ from judgeval.scorers.score import a_execute_scoring
+ from judgeval.api import JudgmentSyncClient
+ from judgeval.env import (
+     JUDGMENT_MAX_CONCURRENT_EVALUATIONS,
+ )
+ from judgeval.exceptions import JudgmentAPIError, JudgmentRuntimeError
+ from judgeval.logger import judgeval_logger
+
+ from judgeval.env import JUDGMENT_API_KEY, JUDGMENT_ORG_ID
+
+ if TYPE_CHECKING:
+     from judgeval.data.evaluation_run import ExampleEvaluationRun
+
+
+ def safe_run_async(coro):
+     """
+     Safely run an async coroutine whether or not there's already an event loop running.
+
+     Args:
+         coro: The coroutine to run
+
+     Returns:
+         The result of the coroutine
+     """
+     try:
+         # Try to get the running loop
+         asyncio.get_running_loop()
+         # If we get here, there's already a loop running.
+         # Run in a separate thread to avoid "asyncio.run() cannot be called from a running event loop"
+         with concurrent.futures.ThreadPoolExecutor() as executor:
+             future = executor.submit(asyncio.run, coro)
+             return future.result()
+     except RuntimeError:
+         # No event loop is running, safe to use asyncio.run()
+         return asyncio.run(coro)
+
+
+ def log_evaluation_results(
+     scoring_results: List[ScoringResult],
+     run: ExampleEvaluationRun,
+ ) -> str:
+     """
+     Logs evaluation results to the Judgment API database.
+
+     Args:
+         scoring_results (List[ScoringResult]): The results to log
+         run (ExampleEvaluationRun): The evaluation run containing project info
+
+     Raises:
+         JudgmentAPIError: If there's an API error during logging
+         ValueError: If there's a validation error with the results
+     """
+     try:
+         if not JUDGMENT_API_KEY or not JUDGMENT_ORG_ID:
+             raise ValueError("API key and organization ID are required")
+
+         api_client = JudgmentSyncClient(JUDGMENT_API_KEY, JUDGMENT_ORG_ID)
+         response = api_client.log_eval_results(
+             {
+                 "results": scoring_results,  # type: ignore
+                 "run": run.model_dump(warnings=False),  # type: ignore
+             }
+         )
+         url = response.get("ui_results_url")
+         return url
+
+     except Exception as e:
+         judgeval_logger.error(f"Failed to save evaluation results to DB: {str(e)}")
+         raise JudgmentRuntimeError(
+             f"Request failed while saving evaluation results to DB: {str(e)}"
+         )
+
+
+ def _poll_evaluation_until_complete(
+     evaluation_run: ExampleEvaluationRun,
+     expected_examples_count: int,
+     poll_interval_seconds: float = 5,
+     max_failures: int = 5,
+     max_poll_count: int = 60,  # This should be equivalent to 5 minutes
+ ) -> Tuple[List[ScoringResult], str]:
+     """
+     Polls until the evaluation is complete and returns the results.
+
+     Args:
+         evaluation_run (ExampleEvaluationRun): The evaluation run to poll for
+         expected_examples_count (int): The number of example results expected
+         poll_interval_seconds (float, optional): Time between status checks in seconds. Defaults to 5.
+         max_failures (int, optional): Maximum consecutive errors tolerated before giving up. Defaults to 5.
+         max_poll_count (int, optional): Maximum number of status checks before giving up. Defaults to 60.
+
+     Returns:
+         Tuple[List[ScoringResult], str]: The evaluation results and the UI results URL
+     """
+     project_name = evaluation_run.project_name
+     experiment_run_id = evaluation_run.id
+
+     if not project_name or not experiment_run_id:
+         raise ValueError("Project name and experiment run ID are required")
+
+     poll_count = 0
+     exception_count = 0
+     if not JUDGMENT_API_KEY or not JUDGMENT_ORG_ID:
+         raise ValueError("Judgment API key and organization ID are required")
+     api_client = JudgmentSyncClient(JUDGMENT_API_KEY, JUDGMENT_ORG_ID)
+     while poll_count < max_poll_count:
+         poll_count += 1
+         try:
+             # Check status
+             results_response = api_client.fetch_experiment_run(
+                 {
+                     "experiment_run_id": experiment_run_id,
+                     "project_name": project_name,
+                 }
+             )
+
+             example_scorer_pairings = results_response.get("results", [])
+             if len(example_scorer_pairings) != expected_examples_count:
+                 time.sleep(poll_interval_seconds)
+                 continue
+
+             url = results_response.get("ui_results_url")
+
+             scoring_result_list = []
+             for res in example_scorer_pairings:
+                 example = res.get("data", {}).copy()
+                 example["example_id"] = res.get("example_id")
+                 scoring_result = ScoringResult(
+                     scorers_data=res.get("scorers", []),
+                     success=all(
+                         t.get("success", False) for t in res.get("scorers", [])
+                     ),
+                     data_object=example,
+                 )
+                 scoring_result_list.append(scoring_result)
+
+             return scoring_result_list, url
+         except Exception as e:
+             exception_count += 1
+             if isinstance(e, JudgmentAPIError):
+                 raise
+
+             judgeval_logger.error(f"Error checking evaluation status: {str(e)}")
+             if exception_count > max_failures:
+                 raise JudgmentRuntimeError(
+                     f"Error checking evaluation status after {poll_count} attempts: {str(e)}"
+                 )
+
+             time.sleep(poll_interval_seconds)
+
+     raise JudgmentRuntimeError(
+         f"Error checking evaluation status after {poll_count} attempts"
+     )
+
+
+ def progress_logger(stop_event, msg="Working...", interval=5):
+     start = time.time()
+     while not stop_event.is_set():
+         elapsed = int(time.time() - start)
+         judgeval_logger.info(f"{msg} ({elapsed} sec)")
+         stop_event.wait(interval)
+
+
+ def run_eval(
+     evaluation_run: ExampleEvaluationRun,
+ ) -> List[ScoringResult]:
+     """
+     Executes an evaluation of `Example`s using one or more `Scorer`s.
+
+     Args:
+         evaluation_run (ExampleEvaluationRun): Stores the examples and scorers together for running
+
+     Returns:
+         List[ScoringResult]: A list of ScoringResult objects
+     """
+     # Check that every example has the same keys
+     keys = evaluation_run.examples[0].get_fields().keys()
+     for example in evaluation_run.examples:
+         current_keys = example.get_fields().keys()
+         if current_keys != keys:
+             raise ValueError(
+                 f"All examples must have the same keys: {current_keys} != {keys}"
+             )
+
+     results: List[ScoringResult] = []
+     url = ""
+
+     if (
+         len(evaluation_run.custom_scorers) > 0
+         and len(evaluation_run.judgment_scorers) > 0
+     ):
+         error_msg = "We currently do not support running both local and Judgment API scorers at the same time. Please run your evaluation with either local scorers or Judgment API scorers, but not both."
+         judgeval_logger.error(error_msg)
+         raise ValueError(error_msg)
+
+     e2b_scorers = [cs for cs in evaluation_run.custom_scorers if cs.server_hosted]
+
+     if evaluation_run.judgment_scorers or e2b_scorers:
+         if evaluation_run.judgment_scorers and e2b_scorers:
+             error_msg = "We currently do not support running both hosted custom scorers and Judgment API scorers at the same time. Please run your evaluation with one or the other, but not both."
+             judgeval_logger.error(error_msg)
+             raise ValueError(error_msg)
+
+         if len(e2b_scorers) > 1:
+             error_msg = "We currently do not support running multiple hosted custom scorers at the same time."
+             judgeval_logger.error(error_msg)
+             raise ValueError(error_msg)
+
+         stop_event = threading.Event()
+         t = threading.Thread(
+             target=progress_logger, args=(stop_event, "Running evaluation...")
+         )
+         t.start()
+         try:
+             if not JUDGMENT_API_KEY or not JUDGMENT_ORG_ID:
+                 raise ValueError("Judgment API key and organization ID are required")
+             api_client = JudgmentSyncClient(JUDGMENT_API_KEY, JUDGMENT_ORG_ID)
+             response = api_client.add_to_run_eval_queue_examples(
+                 evaluation_run.model_dump(warnings=False)  # type: ignore
+             )
+
+             if not response.get("success", False):
+                 error_message = response.get("error")
+                 judgeval_logger.error(
+                     f"Error adding evaluation to queue: {error_message}"
+                 )
+                 raise JudgmentRuntimeError(error_message)
+
+             results, url = _poll_evaluation_until_complete(
+                 evaluation_run=evaluation_run,
+                 expected_examples_count=len(evaluation_run.examples),
+             )
+         finally:
+             stop_event.set()
+             t.join()
+     else:
+         results = safe_run_async(
+             a_execute_scoring(
+                 evaluation_run.examples,
+                 evaluation_run.custom_scorers,
+                 model=evaluation_run.model,
+                 throttle_value=0,
+                 max_concurrent=JUDGMENT_MAX_CONCURRENT_EVALUATIONS,
+             )
+         )
+
+         send_results = [
+             scoring_result.model_dump(warnings=False) for scoring_result in results
+         ]
+         url = log_evaluation_results(send_results, evaluation_run)
+     rprint(
+         f"\nšŸ” You can view your evaluation results here: [rgb(106,0,255)][link={url}]View Results[/link]\n"
+     )
+     return results
+
+
+ def assert_test(scoring_results: List[ScoringResult]) -> None:
+     """
+     Collects all failed scorers from the scoring results.
+
+     Args:
+         scoring_results (List[ScoringResult]): List of scoring results to check
+
+     Returns:
+         None. Raises exceptions for any failed test cases.
+     """
+     failed_cases: List[List[ScorerData]] = []
+
+     for result in scoring_results:
+         if not result.success:
+             # Create a test case context with all relevant fields
+             test_case: List[ScorerData] = []
+             if result.scorers_data:
+                 # If the result was not successful, check each scorer_data
+                 for scorer_data in result.scorers_data:
+                     if not scorer_data.success:
+                         test_case.append(scorer_data)
+             failed_cases.append(test_case)
+
+     if failed_cases:
+         error_msg = "The following test cases failed: \n"
+         for fail_case in failed_cases:
+             for fail_scorer in fail_case:
+                 error_msg += (
+                     f"\nScorer Name: {fail_scorer.name}\n"
+                     f"Threshold: {fail_scorer.threshold}\n"
+                     f"Success: {fail_scorer.success}\n"
+                     f"Score: {fail_scorer.score}\n"
+                     f"Reason: {fail_scorer.reason}\n"
+                     f"Strict Mode: {fail_scorer.strict_mode}\n"
+                     f"Evaluation Model: {fail_scorer.evaluation_model}\n"
+                     f"Error: {fail_scorer.error}\n"
+                     f"Additional Metadata: {fail_scorer.additional_metadata}\n"
+                 )
+             error_msg += "-" * 100
+
+     total_tests = len(scoring_results)
+     failed_tests = len(failed_cases)
+     passed_tests = total_tests - failed_tests
+
+     # Print summary with colors
+     rprint("\n" + "=" * 80)
+     if failed_tests == 0:
+         rprint(
+             f"[bold green]šŸŽ‰ ALL TESTS PASSED! {passed_tests}/{total_tests} tests successful[/bold green]"
+         )
+     else:
+         rprint(
+             f"[bold red]āš ļø TEST RESULTS: {passed_tests}/{total_tests} passed ({failed_tests} failed)[/bold red]"
+         )
+     rprint("=" * 80 + "\n")
+
+     # Print individual test cases
+     for i, result in enumerate(scoring_results):
+         test_num = i + 1
+         if result.success:
+             rprint(f"[green]āœ“ Test {test_num}: PASSED[/green]")
+         else:
+             rprint(f"[red]āœ— Test {test_num}: FAILED[/red]")
+             if result.scorers_data:
+                 for scorer_data in result.scorers_data:
+                     if not scorer_data.success:
+                         rprint(f" [yellow]Scorer: {scorer_data.name}[/yellow]")
+                         rprint(f" [red] Score: {scorer_data.score}[/red]")
+                         rprint(f" [red] Reason: {scorer_data.reason}[/red]")
+                         if scorer_data.error:
+                             rprint(f" [red] Error: {scorer_data.error}[/red]")
+                         rprint(" " + "-" * 40)
+
+     rprint("\n" + "=" * 80)
+     if failed_tests > 0:
+         raise AssertionError(failed_cases)
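The safe_run_async helper above is what lets run_eval execute local scorers from both plain scripts and code that is already inside an event loop (for example a notebook). A small, self-contained sketch of that behavior:

import asyncio

from judgeval.evaluation import safe_run_async


async def compute() -> int:
    await asyncio.sleep(0.1)
    return 42


# No loop is running here, so the helper simply calls asyncio.run().
print(safe_run_async(compute()))


async def main() -> None:
    # A loop is already running here, so the helper runs the coroutine
    # in a worker thread instead of raising
    # "asyncio.run() cannot be called from a running event loop".
    print(safe_run_async(compute()))


asyncio.run(main())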
judgeval/exceptions.py ADDED
@@ -0,0 +1,27 @@
+ from __future__ import annotations
+
+ from httpx import HTTPError, Response
+
+
+ class JudgmentAPIError(HTTPError):
+     status_code: int
+     detail: str
+     response: Response
+
+     def __init__(self, status_code: int, detail: str, response: Response):
+         self.status_code = status_code
+         self.detail = detail
+         self.response = response
+         super().__init__(f"{status_code}: {detail}")
+
+
+ class JudgmentTestError(Exception): ...
+
+
+ class JudgmentRuntimeError(RuntimeError): ...
+
+
+ class InvalidJudgeModelError(Exception): ...
+
+
+ __all__ = ("JudgmentAPIError", "JudgmentRuntimeError", "InvalidJudgeModelError")
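A hedged sketch of how callers are expected to handle these exception types; the failing call below is a stand-in, not an API from this release:

import httpx

from judgeval.exceptions import JudgmentAPIError, JudgmentRuntimeError


def flaky_call() -> None:
    # Stand-in for any judgeval operation that talks to the Judgment API.
    raise JudgmentAPIError(401, "invalid API key", httpx.Response(401))


try:
    flaky_call()
except JudgmentAPIError as e:
    # The HTTP status code and detail are preserved on the exception.
    print(f"API error {e.status_code}: {e.detail}")
except JudgmentRuntimeError as e:
    # Raised for client-side failures such as queueing or polling errors.
    print(f"Runtime failure: {e}")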
judgeval/integrations/langgraph/__init__.py ADDED
@@ -0,0 +1,13 @@
+ from __future__ import annotations
+
+ from abc import ABC
+ import os
+
+
+ class Langgraph(ABC):
+     @staticmethod
+     def initialize(otel_only: bool = True):
+         os.environ["LANGSMITH_OTEL_ENABLED"] = "true"
+         os.environ["LANGSMITH_TRACING"] = "true"
+         if otel_only:
+             os.environ["LANGSMITH_OTEL_ONLY"] = "true"