judgeval 0.4.0__py3-none-any.whl → 0.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/__init__.py +2 -0
- judgeval/cli.py +65 -0
- judgeval/clients.py +2 -1
- judgeval/common/api/api.py +46 -54
- judgeval/common/api/constants.py +18 -5
- judgeval/common/api/json_encoder.py +241 -0
- judgeval/common/tracer/core.py +772 -467
- judgeval/common/tracer/otel_span_processor.py +1 -1
- judgeval/common/tracer/providers.py +119 -0
- judgeval/common/tracer/span_processor.py +1 -1
- judgeval/common/tracer/span_transformer.py +16 -26
- judgeval/constants.py +1 -0
- judgeval/data/evaluation_run.py +104 -0
- judgeval/data/judgment_types.py +38 -8
- judgeval/data/trace.py +6 -122
- judgeval/data/trace_run.py +2 -3
- judgeval/dataset.py +2 -0
- judgeval/integrations/langgraph.py +2 -1
- judgeval/judges/litellm_judge.py +2 -1
- judgeval/judges/mixture_of_judges.py +2 -1
- judgeval/judges/utils.py +2 -1
- judgeval/judgment_client.py +113 -53
- judgeval/local_eval_queue.py +190 -0
- judgeval/run_evaluation.py +43 -197
- judgeval/scorers/base_scorer.py +9 -10
- judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +17 -3
- judgeval/scorers/score.py +33 -11
- judgeval/utils/async_utils.py +36 -0
- {judgeval-0.4.0.dist-info → judgeval-0.6.0.dist-info}/METADATA +11 -12
- {judgeval-0.4.0.dist-info → judgeval-0.6.0.dist-info}/RECORD +33 -27
- judgeval-0.6.0.dist-info/entry_points.txt +2 -0
- judgeval/evaluation_run.py +0 -76
- {judgeval-0.4.0.dist-info → judgeval-0.6.0.dist-info}/WHEEL +0 -0
- {judgeval-0.4.0.dist-info → judgeval-0.6.0.dist-info}/licenses/LICENSE.md +0 -0
judgeval/run_evaluation.py
CHANGED
@@ -1,10 +1,12 @@
+from __future__ import annotations
+
 import asyncio
 import concurrent.futures
 import time
 import orjson
 import sys
 import threading
-from typing import List, Dict, Union, Optional, Callable, Tuple, Any
+from typing import List, Dict, Union, Optional, Callable, Tuple, Any, TYPE_CHECKING
 from rich import print as rprint

 from judgeval.data import ScorerData, ScoringResult, Example, Trace
@@ -17,10 +19,13 @@ from judgeval.constants import (
 from judgeval.common.exceptions import JudgmentAPIError
 from judgeval.common.api.api import JudgmentAPIException
 from judgeval.common.logger import judgeval_logger
-
-
-
-from judgeval.
+
+
+if TYPE_CHECKING:
+    from judgeval.common.tracer import Tracer
+    from judgeval.data.trace_run import TraceRun
+    from judgeval.data.evaluation_run import EvaluationRun
+    from judgeval.integrations.langgraph import JudgevalCallbackHandler


 def safe_run_async(coro):
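The new `TYPE_CHECKING` block, together with `from __future__ import annotations`, lets run_evaluation.py reference `Tracer`, `TraceRun`, `EvaluationRun`, and `JudgevalCallbackHandler` in annotations without importing them at runtime, which avoids import cycles and startup cost. A minimal sketch of the same pattern — the `mypkg.tracer` module and `run_with_tracer` function are invented for illustration:

```python
from __future__ import annotations

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Imported only for static type checkers; never executed at runtime,
    # so circular or heavy imports are avoided.
    from mypkg.tracer import Tracer  # hypothetical module


def run_with_tracer(tracer: Tracer) -> None:
    # With postponed annotation evaluation (PEP 563), Tracer does not need to
    # exist at runtime unless a type checker inspects this signature.
    print(f"using tracer: {tracer!r}")
```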
@@ -135,80 +140,6 @@ def check_missing_scorer_data(results: List[ScoringResult]) -> List[ScoringResult]:
     return results


-def check_experiment_type(
-    eval_name: str,
-    project_name: str,
-    judgment_api_key: str,
-    organization_id: str,
-    is_trace: bool,
-) -> None:
-    """
-    Checks if the current experiment, if one exists, has the same type (examples of traces)
-    """
-    api_client = JudgmentApiClient(judgment_api_key, organization_id)
-
-    try:
-        api_client.check_experiment_type(eval_name, project_name, is_trace)
-    except JudgmentAPIException as e:
-        if e.response.status_code == 422:
-            judgeval_logger.error(f"{e.response_json}")
-            raise ValueError(f"{e.response_json}")
-        else:
-            raise e
-    except Exception as e:
-        judgeval_logger.error(f"Failed to check if experiment type exists: {str(e)}")
-        raise JudgmentAPIError(f"Failed to check if experiment type exists: {str(e)}")
-
-
-def check_eval_run_name_exists(
-    eval_name: str, project_name: str, judgment_api_key: str, organization_id: str
-) -> None:
-    """
-    Checks if an evaluation run name already exists for a given project.
-
-    Args:
-        eval_name (str): Name of the evaluation run
-        project_name (str): Name of the project
-        judgment_api_key (str): API key for authentication
-
-    Raises:
-        ValueError: If the evaluation run name already exists
-        JudgmentAPIError: If there's an API error during the check
-    """
-    api_client = JudgmentApiClient(judgment_api_key, organization_id)
-    try:
-        api_client.check_eval_run_name_exists(eval_name, project_name)
-    except JudgmentAPIException as e:
-        if e.response.status_code == 409:
-            error_str = f"Eval run name '{eval_name}' already exists for this project. Please choose a different name, set the `override` flag to true, or set the `append` flag to true. See https://docs.judgmentlabs.ai/sdk-reference/judgment-client#override for more information."
-            judgeval_logger.error(error_str)
-            raise ValueError(error_str)
-        else:
-            raise e
-
-    except Exception as e:
-        judgeval_logger.error(f"Failed to check if eval run name exists: {str(e)}")
-        raise JudgmentAPIError(f"Failed to check if eval run name exists: {str(e)}")
-
-
-def check_example_keys(
-    keys: List[str],
-    eval_name: str,
-    project_name: str,
-    judgment_api_key: str,
-    organization_id: str,
-) -> None:
-    """
-    Checks if the current experiment (if one exists) has the same keys for example
-    """
-    api_client = JudgmentApiClient(judgment_api_key, organization_id)
-    try:
-        api_client.check_example_keys(keys, eval_name, project_name)
-    except Exception as e:
-        judgeval_logger.error(f"Failed to check if example keys match: {str(e)}")
-        raise JudgmentAPIError(f"Failed to check if example keys match: {str(e)}")
-
-
 def log_evaluation_results(
     scoring_results: List[ScoringResult],
     run: Union[EvaluationRun, TraceRun],
@@ -280,29 +211,10 @@ def check_examples(
 def run_trace_eval(
     trace_run: TraceRun,
     judgment_api_key: str,
-    override: bool = False,
     function: Optional[Callable] = None,
-    tracer: Optional[Union[Tracer, JudgevalCallbackHandler]] = None,
+    tracer: Optional[Union[Tracer, "JudgevalCallbackHandler"]] = None,
     examples: Optional[List[Example]] = None,
 ) -> List[ScoringResult]:
-    # Call endpoint to check to see if eval run name exists (if we DON'T want to override and DO want to log results)
-    if not override and not trace_run.append:
-        check_eval_run_name_exists(
-            trace_run.eval_name,
-            trace_run.project_name,
-            judgment_api_key,
-            trace_run.organization_id,
-        )
-
-    if trace_run.append:
-        # Check that the current experiment, if one exists, has the same type (examples or traces)
-        check_experiment_type(
-            trace_run.eval_name,
-            trace_run.project_name,
-            judgment_api_key,
-            trace_run.organization_id,
-            True,
-        )
     if function and tracer and examples is not None:
         new_traces: List[Trace] = []

@@ -371,43 +283,8 @@ def run_trace_eval(
     return scoring_results


-async def get_evaluation_status(
-    eval_name: str, project_name: str, judgment_api_key: str, organization_id: str
-) -> Dict:
-    """
-    Gets the status of an async evaluation run.
-
-    Args:
-        eval_name (str): Name of the evaluation run
-        project_name (str): Name of the project
-        judgment_api_key (str): API key for authentication
-        organization_id (str): Organization ID for the evaluation
-
-    Returns:
-        Dict: Status information including:
-            - status: 'pending', 'running', 'completed', or 'failed'
-            - results: List of ScoringResult objects if completed
-            - error: Error message if failed
-    """
-    api_client = JudgmentApiClient(judgment_api_key, organization_id)
-    try:
-        return api_client.get_evaluation_status(eval_name, project_name)
-    except Exception as e:
-        raise JudgmentAPIError(
-            f"An error occurred while checking evaluation status: {str(e)}"
-        )
-
-
-def retrieve_counts(result: Dict):
-    scorer_data_count = 0
-    for example in result.get("examples", []):
-        for scorer in example.get("scorer_data", []):
-            scorer_data_count += 1
-    return scorer_data_count
-
-
 def _poll_evaluation_until_complete(
-
+    experiment_run_id: str,
     project_name: str,
     judgment_api_key: str,
     organization_id: str,
@@ -438,14 +315,16 @@ def _poll_evaluation_until_complete(
         poll_count += 1
         try:
            # Check status
-            status_response = api_client.get_evaluation_status(
+            status_response = api_client.get_evaluation_status(
+                experiment_run_id, project_name
+            )

            if status_response.get("status") != "completed":
                time.sleep(poll_interval_seconds)
                continue

            results_response = api_client.fetch_evaluation_results(
-
+                experiment_run_id, project_name
            )
            url = results_response.get("ui_results_url")

@@ -508,14 +387,12 @@ def progress_logger(stop_event, msg="Working...", interval=5):
 def run_eval(
     evaluation_run: EvaluationRun,
     judgment_api_key: str,
-    override: bool = False,
 ) -> List[ScoringResult]:
     """
     Executes an evaluation of `Example`s using one or more `Scorer`s

     Args:
         evaluation_run (EvaluationRun): Stores example and evaluation together for running
-        override (bool, optional): Whether to override existing evaluation run with same name. Defaults to False.

     Returns:
         List[ScoringResult]: A list of ScoringResult objects
@@ -529,52 +406,31 @@ def run_eval(
                 f"All examples must have the same keys: {current_keys} != {keys}"
             )

-    # Call endpoint to check to see if eval run name exists (if we DON'T want to override and DO want to log results)
-    if not override and not evaluation_run.append:
-        check_eval_run_name_exists(
-            evaluation_run.eval_name,
-            evaluation_run.project_name,
-            judgment_api_key,
-            evaluation_run.organization_id,
-        )
-
-    if evaluation_run.append:
-        # Check that the current experiment, if one exists, has the same type (examples of traces)
-        check_experiment_type(
-            evaluation_run.eval_name,
-            evaluation_run.project_name,
-            judgment_api_key,
-            evaluation_run.organization_id,
-            False,
-        )
-
-        # Ensure that current experiment (if one exists) has the same keys for example
-        check_example_keys(
-            keys=list(keys),
-            eval_name=evaluation_run.eval_name,
-            project_name=evaluation_run.project_name,
-            judgment_api_key=judgment_api_key,
-            organization_id=evaluation_run.organization_id,
-        )
-
-    judgment_scorers: List[APIScorerConfig] = []
-    local_scorers: List[BaseScorer] = []
-    for scorer in evaluation_run.scorers:
-        if isinstance(scorer, APIScorerConfig):
-            judgment_scorers.append(scorer)
-        else:
-            local_scorers.append(scorer)
-
     results: List[ScoringResult] = []
     url = ""

-    if
+    if (
+        len(evaluation_run.custom_scorers) > 0
+        and len(evaluation_run.judgment_scorers) > 0
+    ):
         error_msg = "We currently do not support running both local and Judgment API scorers at the same time. Please run your evaluation with either local scorers or Judgment API scorers, but not both."
         judgeval_logger.error(error_msg)
         raise ValueError(error_msg)

-
-
+    e2b_scorers = [cs for cs in evaluation_run.custom_scorers if cs.server_hosted]
+
+    if evaluation_run.judgment_scorers or e2b_scorers:
+        if evaluation_run.judgment_scorers and e2b_scorers:
+            error_msg = "We currently do not support running both hosted custom scorers and Judgment API scorers at the same time. Please run your evaluation with one or the other, but not both."
+            judgeval_logger.error(error_msg)
+            raise ValueError(error_msg)
+
+        if len(e2b_scorers) > 1:
+            error_msg = "We currently do not support running multiple hosted custom scorers at the same time."
+            judgeval_logger.error(error_msg)
+            raise ValueError(error_msg)
+
+        check_examples(evaluation_run.examples, evaluation_run.judgment_scorers)
         stop_event = threading.Event()
         t = threading.Thread(
             target=progress_logger, args=(stop_event, "Running evaluation...")
@@ -595,36 +451,26 @@ def run_eval(
                )
                raise JudgmentAPIError(error_message)

-
-
-
-
-
-                )
-                old_scorer_data_count = retrieve_counts(results_response)
-            except Exception:
-                # This usually means the user did append = True but the eval run name doesn't exist yet
-                pass
-
+            num_scorers = (
+                len(evaluation_run.judgment_scorers)
+                if evaluation_run.judgment_scorers
+                else sum(1 for cs in evaluation_run.custom_scorers if cs.server_hosted)
+            )
            results, url = _poll_evaluation_until_complete(
-
+                experiment_run_id=evaluation_run.id,
                project_name=evaluation_run.project_name,
                judgment_api_key=judgment_api_key,
                organization_id=evaluation_run.organization_id,
-                expected_scorer_data_count=(
-                    len(evaluation_run.scorers) * len(evaluation_run.examples)
-                )
-                + old_scorer_data_count,
+                expected_scorer_data_count=(num_scorers * len(evaluation_run.examples)),
            )
        finally:
            stop_event.set()
            t.join()
-
-    if len(local_scorers) > 0:
+    else:
        results = safe_run_async(
            a_execute_scoring(
                evaluation_run.examples,
-
+                evaluation_run.custom_scorers,
                model=evaluation_run.model,
                throttle_value=0,
                max_concurrent=MAX_CONCURRENT_EVALUATIONS,
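With the pre-flight name/type checks gone, `run_eval` decides how to execute purely from the run object: mixed local and Judgment API scorers are rejected, at most one hosted (`server_hosted`) custom scorer is allowed, server-side runs are polled by `experiment_run_id`, and everything else falls through to local scoring. A self-contained sketch that mirrors that decision logic with plain dataclasses — these are stand-ins, not the judgeval types:

```python
from dataclasses import dataclass, field
from typing import List


@dataclass
class FakeScorer:
    server_hosted: bool = False


@dataclass
class FakeRun:
    custom_scorers: List[FakeScorer] = field(default_factory=list)
    judgment_scorers: List[object] = field(default_factory=list)


def choose_path(run: FakeRun) -> str:
    # Mirrors the checks added in run_eval above.
    if run.custom_scorers and run.judgment_scorers:
        raise ValueError("local and Judgment API scorers cannot be mixed")
    hosted = [s for s in run.custom_scorers if s.server_hosted]
    if len(hosted) > 1:
        raise ValueError("only one hosted custom scorer per run")
    if run.judgment_scorers or hosted:
        return "server-side run, polled by experiment_run_id"
    return "local run via a_execute_scoring"


print(choose_path(FakeRun(custom_scorers=[FakeScorer()])))                    # local
print(choose_path(FakeRun(custom_scorers=[FakeScorer(server_hosted=True)])))  # server-side
```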
judgeval/scorers/base_scorer.py
CHANGED
@@ -26,6 +26,7 @@ class BaseScorer(BaseModel):
     name: Optional[str] = (
         None  # name of your scorer (Faithfulness, PromptScorer-randomslug)
     )
+    class_name: Optional[str] = None  # The name of the class of the scorer
     score: Optional[float] = None  # The float score of the scorer run on the test case
     score_breakdown: Optional[Dict] = None
     reason: Optional[str] = ""
@@ -39,24 +40,22 @@ class BaseScorer(BaseModel):
     error: Optional[str] = None  # The error message if the scorer failed
     additional_metadata: Optional[Dict] = None  # Additional metadata for the scorer
     user: Optional[str] = None  # The user ID of the scorer
+    server_hosted: bool = False  # Whether the scorer is enabled for e2b

-    @model_validator(mode="
+    @model_validator(mode="after")
     @classmethod
-    def enforce_strict_threshold(cls, data:
-        if data.
-            data
+    def enforce_strict_threshold(cls, data: "BaseScorer"):
+        if data.strict_mode:
+            data.threshold = 1.0
         return data

     @model_validator(mode="after")
     @classmethod
     def default_name(cls, m: "BaseScorer") -> "BaseScorer":
+        # Always set class_name to the string name of the class
+        m.class_name = m.__class__.__name__
         if not m.name:
-
-            class_name = getattr(m, "__class__", None)
-            if class_name and getattr(m.__class__, "__name__", None):
-                m.name = m.__class__.__name__
-            else:
-                m.name = m.score_type
+            m.name = m.class_name
         return m

     def _add_model(self, model: str):
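The reworked validators mean every `BaseScorer` instance records its class name, uses it as the fallback display name, and defaults to running locally (`server_hosted=False`), while `strict_mode` pins the threshold to 1.0. A quick illustration with a hypothetical subclass — the field defaults below are assumptions for the sketch, and a real custom scorer would also implement its scoring hook:

```python
from judgeval.scorers import BaseScorer


class MyCustomScorer(BaseScorer):
    # Illustrative defaults only; real scorers define their own score_type/threshold.
    score_type: str = "my_custom"
    threshold: float = 0.5


scorer = MyCustomScorer()
print(scorer.class_name)     # "MyCustomScorer" - set by the default_name validator
print(scorer.name)           # "MyCustomScorer" - falls back to class_name when unset
print(scorer.server_hosted)  # False - local by default; hosted scorers flip this to True

strict = MyCustomScorer(strict_mode=True)
print(strict.threshold)      # 1.0 - enforce_strict_threshold pins it in strict mode
```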
judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py
CHANGED
@@ -11,13 +11,14 @@ from judgeval.common.logger import judgeval_logger
 def push_prompt_scorer(
     name: str,
     prompt: str,
+    threshold: float,
     options: Optional[Dict[str, float]] = None,
     judgment_api_key: str = os.getenv("JUDGMENT_API_KEY") or "",
     organization_id: str = os.getenv("JUDGMENT_ORG_ID") or "",
 ) -> str:
     client = JudgmentApiClient(judgment_api_key, organization_id)
     try:
-        r = client.save_scorer(name, prompt, options)
+        r = client.save_scorer(name, prompt, threshold, options)
     except JudgmentAPIException as e:
         if e.status_code == 500:
             raise JudgmentAPIError(
@@ -90,6 +91,7 @@ class PromptScorer(APIScorerConfig):
         return cls(
             name=name,
             prompt=scorer_config["prompt"],
+            threshold=scorer_config["threshold"],
             options=scorer_config.get("options"),
             judgment_api_key=judgment_api_key,
             organization_id=organization_id,
@@ -100,16 +102,20 @@ class PromptScorer(APIScorerConfig):
         cls,
         name: str,
         prompt: str,
+        threshold: Optional[float] = 0.5,
         options: Optional[Dict[str, float]] = None,
         judgment_api_key: str = os.getenv("JUDGMENT_API_KEY") or "",
         organization_id: str = os.getenv("JUDGMENT_ORG_ID") or "",
     ):
         if not scorer_exists(name, judgment_api_key, organization_id):
-            push_prompt_scorer(
+            push_prompt_scorer(
+                name, prompt, threshold, options, judgment_api_key, organization_id
+            )
             judgeval_logger.info(f"Successfully created PromptScorer: {name}")
         return cls(
             name=name,
             prompt=prompt,
+            threshold=threshold,
             options=options,
             judgment_api_key=judgment_api_key,
             organization_id=organization_id,
@@ -158,6 +164,12 @@ class PromptScorer(APIScorerConfig):
         judgeval_logger.info(f"Successfully appended to prompt for {self.name}")

     # Getters
+    def get_threshold(self) -> float | None:
+        """
+        Returns the threshold of the scorer.
+        """
+        return self.threshold
+
     def get_prompt(self) -> str | None:
         """
         Returns the prompt of the scorer.
@@ -183,6 +195,7 @@ class PromptScorer(APIScorerConfig):
         return {
             "name": self.name,
             "prompt": self.prompt,
+            "threshold": self.threshold,
             "options": self.options,
         }

@@ -193,13 +206,14 @@ class PromptScorer(APIScorerConfig):
         push_prompt_scorer(
             self.name,
             self.prompt,
+            self.threshold,
             self.options,
             self.judgment_api_key,
             self.organization_id,
         )

     def __str__(self):
-        return f"PromptScorer(name={self.name}, prompt={self.prompt}, options={self.options})"
+        return f"PromptScorer(name={self.name}, prompt={self.prompt}, threshold={self.threshold}, options={self.options})"

     def model_dump(self, *args, **kwargs) -> Dict[str, Any]:
         base = super().model_dump(*args, **kwargs)
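`threshold` now travels with a prompt scorer end to end: it is pushed to the server, read back from the fetched config, exposed via `get_threshold()`, and included in `__str__` and `model_dump`. A hedged sketch of registering a scorer with the new required argument — the scorer name, prompt, and options are invented for the example, and in practice you would normally go through the class factory shown in the hunk above:

```python
import os

from judgeval.scorers.judgeval_scorers.api_scorers.prompt_scorer import push_prompt_scorer

# threshold is now a required third argument, stored alongside the prompt and options.
result = push_prompt_scorer(
    name="helpfulness-judge",
    prompt="Rate the response for helpfulness.",
    threshold=0.7,
    options={"helpful": 1.0, "unhelpful": 0.0},
    judgment_api_key=os.getenv("JUDGMENT_API_KEY", ""),
    organization_id=os.getenv("JUDGMENT_ORG_ID", ""),
)
print(result)  # push_prompt_scorer is annotated to return a str
```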
judgeval/scorers/score.py
CHANGED
@@ -17,6 +17,7 @@ from judgeval.scorers import BaseScorer
 from judgeval.scorers.utils import clone_scorers
 from judgeval.common.logger import judgeval_logger
 from judgeval.judges import JudgevalJudge
+from judgeval.constants import DEFAULT_GPT_MODEL


 async def safe_a_score_example(
@@ -55,10 +56,11 @@ async def safe_a_score_example(
 async def a_execute_scoring(
     examples: List[Example],
     scorers: List[BaseScorer],
-    model: Optional[Union[str, List[str], JudgevalJudge]] =
+    model: Optional[Union[str, List[str], JudgevalJudge]] = DEFAULT_GPT_MODEL,
     ignore_errors: bool = False,
     throttle_value: int = 0,
     max_concurrent: int = 100,
+    show_progress: bool = True,
 ) -> List[ScoringResult]:
     """
     Executes evaluations of `Example`s asynchronously using one or more `BaseScorer`s.
@@ -71,8 +73,7 @@ async def a_execute_scoring(
         ignore_errors (bool): Whether to ignore errors during evaluation.
         throttle_value (int): The amount of time to wait between starting each task.
         max_concurrent (int): The maximum number of concurrent tasks.
-
-        _use_bar_indicator (bool): Whether to use a progress bar indicator.
+        show_progress (bool): Whether to show the progress bar indicator.

     Returns:
         List[ScoringResult]: A list of `ScoringResult` objects containing the evaluation results.
@@ -101,16 +102,37 @@ async def a_execute_scoring(
     tasks = []
     cloned_scorers: List[BaseScorer]

-
-
-
-
-
-
+    if show_progress:
+        with tqdm_asyncio(
+            desc=f"Evaluating {len(examples)} example(s) in parallel",
+            unit="Example",
+            total=len(examples),
+            bar_format="{desc}: |{bar}|{percentage:3.0f}% ({n_fmt}/{total_fmt}) [Time Taken: {elapsed}, {rate_fmt}{postfix}]",
+        ) as pbar:
+            for i, ex in enumerate(examples):
+                if isinstance(ex, Example):
+                    if len(scorers) == 0:
+                        pbar.update(1)
+                        continue
+
+                    cloned_scorers = clone_scorers(scorers)
+                    task = execute_with_semaphore(
+                        func=a_eval_examples_helper,
+                        scorers=cloned_scorers,
+                        example=ex,
+                        scoring_results=scoring_results,
+                        score_index=i,
+                        ignore_errors=ignore_errors,
+                        pbar=pbar,
+                    )
+                    tasks.append(asyncio.create_task(task))
+
+                await asyncio.sleep(throttle_value)
+            await asyncio.gather(*tasks)
+    else:
         for i, ex in enumerate(examples):
             if isinstance(ex, Example):
                 if len(scorers) == 0:
-                    pbar.update(1)
                     continue

                 cloned_scorers = clone_scorers(scorers)
@@ -121,7 +143,7 @@ async def a_execute_scoring(
                     scoring_results=scoring_results,
                     score_index=i,
                     ignore_errors=ignore_errors,
-                    pbar=
+                    pbar=None,
                 )
                 tasks.append(asyncio.create_task(task))

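`a_execute_scoring` now defaults its model to `DEFAULT_GPT_MODEL` and makes the tqdm progress bar optional via `show_progress`. A minimal sketch of calling it with the bar suppressed — the `Example` field names are assumptions, and an empty scorer list is used so the sketch stays runnable without committing to a concrete scorer implementation:

```python
import asyncio

from judgeval.data import Example
from judgeval.scorers.score import a_execute_scoring


async def main() -> None:
    # Field names on Example (input/actual_output) are assumed for illustration.
    examples = [Example(input="2+2?", actual_output="4")]
    # With scorers=[], each example is simply skipped, exercising the new
    # show_progress=False code path without a tqdm bar (useful in scripts/CI).
    results = await a_execute_scoring(
        examples,
        scorers=[],
        show_progress=False,
    )
    print(results)


asyncio.run(main())
```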
judgeval/utils/async_utils.py
ADDED
@@ -0,0 +1,36 @@
+"""Async utilities for judgeval."""
+
+import asyncio
+import concurrent.futures
+from typing import Awaitable, TypeVar
+
+
+# Generic type variable for coroutine return type
+T = TypeVar("T")
+
+
+def safe_run_async(coro: Awaitable[T]) -> T:  # type: ignore[type-var]
+    """Safely execute an async *coro* from synchronous code.
+
+    This helper handles two common situations:
+
+    1. **No running event loop** – Simply delegates to ``asyncio.run``.
+    2. **Existing running loop** – Executes the coroutine in a separate
+       thread so that we don't attempt to nest event loops (which would raise
+       ``RuntimeError``).
+
+    Args:
+        coro: The coroutine to execute.
+
+    Returns:
+        The result returned by *coro*.
+    """
+
+    try:
+        asyncio.get_running_loop()
+    except RuntimeError:
+        return asyncio.run(coro)
+
+    with concurrent.futures.ThreadPoolExecutor() as executor:
+        future = executor.submit(lambda: asyncio.run(coro))
+        return future.result()
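The new module factors the run-sync-or-thread pattern into a reusable `safe_run_async`, giving synchronous call sites one entry point whether or not an event loop is already running. A small usage sketch based on the implementation above:

```python
import asyncio

from judgeval.utils.async_utils import safe_run_async


async def fetch_answer() -> str:
    await asyncio.sleep(0.1)  # stand-in for real async work
    return "42"


# Plain synchronous context: delegates to asyncio.run().
print(safe_run_async(fetch_answer()))


async def already_in_a_loop() -> None:
    # Inside a running loop (e.g. a notebook or web handler), the coroutine is
    # shipped to a worker thread with its own asyncio.run(), avoiding the
    # "asyncio.run() cannot be called from a running event loop" error.
    print(safe_run_async(fetch_answer()))


asyncio.run(already_in_a_loop())
```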
{judgeval-0.4.0.dist-info → judgeval-0.6.0.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: judgeval
-Version: 0.4.0
+Version: 0.6.0
 Summary: Judgeval Package
 Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
 Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues
@@ -10,27 +10,26 @@ License-File: LICENSE.md
 Classifier: Operating System :: OS Independent
 Classifier: Programming Language :: Python :: 3
 Requires-Python: >=3.11
-Requires-Dist: anthropic
 Requires-Dist: boto3
-Requires-Dist:
-Requires-Dist: google-genai
-Requires-Dist: groq>=0.30.0
+Requires-Dist: click<8.2.0
 Requires-Dist: langchain-anthropic
 Requires-Dist: langchain-core
 Requires-Dist: langchain-huggingface
 Requires-Dist: langchain-openai
 Requires-Dist: litellm>=1.61.15
-Requires-Dist:
-Requires-Dist: nest-asyncio
-Requires-Dist: openai
+Requires-Dist: nest-asyncio>=1.6.0
 Requires-Dist: opentelemetry-api>=1.34.1
 Requires-Dist: opentelemetry-sdk>=1.34.1
 Requires-Dist: orjson>=3.9.0
-Requires-Dist:
-Requires-Dist: python-dotenv==1.0.1
-Requires-Dist: python-slugify>=8.0.4
+Requires-Dist: python-dotenv
 Requires-Dist: requests
-Requires-Dist:
+Requires-Dist: rich
+Requires-Dist: typer>=0.9.0
+Provides-Extra: langchain
+Requires-Dist: langchain-anthropic; extra == 'langchain'
+Requires-Dist: langchain-core; extra == 'langchain'
+Requires-Dist: langchain-huggingface; extra == 'langchain'
+Requires-Dist: langchain-openai; extra == 'langchain'
 Description-Content-Type: text/markdown

 <div align="center">
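After upgrading, the new metadata (version 0.6.0, the `click`/`typer`/`rich` requirements, and the `langchain` extra) can be checked against the installed distribution from the standard library; a hedged sketch, assuming judgeval 0.6.0 is installed in the current environment:

```python
from importlib.metadata import metadata, version

# The installed distribution should report the version and dependency set
# shown in the METADATA diff above.
print(version("judgeval"))                      # e.g. "0.6.0"
meta = metadata("judgeval")
print(meta["Requires-Python"])                  # ">=3.11"
print([r for r in meta.get_all("Requires-Dist") if "typer" in r or "click" in r])
```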