judgeval 0.4.0__py3-none-any.whl → 0.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
judgeval/judges/utils.py CHANGED
@@ -7,6 +7,7 @@ from typing import Optional, Union, Tuple, List
 
 from judgeval.common.exceptions import InvalidJudgeModelError
 from judgeval.judges import JudgevalJudge, LiteLLMJudge, TogetherJudge, MixtureOfJudges
+from judgeval.constants import DEFAULT_GPT_MODEL
 from judgeval.constants import (
     TOGETHER_SUPPORTED_MODELS,
     JUDGMENT_SUPPORTED_MODELS,
@@ -30,7 +31,7 @@ def create_judge(
     If no model is provided, uses GPT4o as the default judge.
     """
     if model is None:  # default option
-        return LiteLLMJudge(model="gpt-4.1"), True
+        return LiteLLMJudge(model=DEFAULT_GPT_MODEL), True
     if not isinstance(model, (str, list, JudgevalJudge)):
        raise InvalidJudgeModelError(
            f"Model must be a string, list of strings, or a judgeval judge object. Got: {type(model)} instead."
judgeval/judgment_client.py CHANGED
@@ -2,9 +2,12 @@
 Implements the JudgmentClient to interact with the Judgment API.
 """
 
+from __future__ import annotations
 import os
+import importlib.util
+from pathlib import Path
 from uuid import uuid4
-from typing import Optional, List, Dict, Any, Union, Callable
+from typing import Optional, List, Dict, Any, Union, Callable, TYPE_CHECKING
 
 from judgeval.data import (
     ScoringResult,
@@ -15,7 +18,7 @@ from judgeval.scorers import (
     APIScorerConfig,
     BaseScorer,
 )
-from judgeval.evaluation_run import EvaluationRun
+from judgeval.data.evaluation_run import EvaluationRun
 from judgeval.run_evaluation import (
     run_eval,
     assert_test,
@@ -28,7 +31,11 @@ from judgeval.common.tracer import Tracer
 from judgeval.common.utils import validate_api_key
 from pydantic import BaseModel
 from judgeval.common.logger import judgeval_logger
-from judgeval.integrations.langgraph import JudgevalCallbackHandler
+
+
+if TYPE_CHECKING:
+    from judgeval.integrations.langgraph import JudgevalCallbackHandler
+from judgeval.constants import DEFAULT_GPT_MODEL
 
 
 class EvalRunRequestBody(BaseModel):
@@ -89,9 +96,7 @@ class JudgmentClient(metaclass=SingletonMeta):
         tools: Optional[List[Dict[str, Any]]] = None,
         project_name: str = "default_project",
         eval_run_name: str = "default_eval_trace",
-        model: Optional[str] = "gpt-4.1",
-        append: bool = False,
-        override: bool = False,
+        model: Optional[str] = DEFAULT_GPT_MODEL,
     ) -> List[ScoringResult]:
         try:
             if examples and not function:
@@ -109,12 +114,11 @@ class JudgmentClient(metaclass=SingletonMeta):
                 traces=traces,
                 scorers=scorers,
                 model=model,
-                append=append,
                 organization_id=self.organization_id,
                 tools=tools,
             )
             return run_trace_eval(
-                trace_run, self.judgment_api_key, override, function, tracer, examples
+                trace_run, self.judgment_api_key, function, tracer, examples
             )
         except ValueError as e:
             raise ValueError(
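As the hunk above shows, run_trace_evaluation() drops append/override and run_trace_eval() is now called without the override argument. A hedged sketch of a 0.6.0-style call; client, scorers, agent_fn, and tracer are assumed to be set up elsewhere and are not taken from this diff:

    # 0.4.0 accepted append=/override= here; passing them in 0.6.0 raises TypeError.
    results = client.run_trace_evaluation(
        scorers=scorers,          # APIScorerConfig / BaseScorer instances
        function=agent_fn,        # callable whose trace will be evaluated
        tracer=tracer,            # Tracer or JudgevalCallbackHandler
        project_name="default_project",
        eval_run_name="default_eval_trace",
    )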
@@ -127,11 +131,9 @@ class JudgmentClient(metaclass=SingletonMeta):
         self,
         examples: List[Example],
         scorers: List[Union[APIScorerConfig, BaseScorer]],
-        model: Optional[str] = "gpt-4.1",
+        model: Optional[str] = DEFAULT_GPT_MODEL,
         project_name: str = "default_project",
         eval_run_name: str = "default_eval_run",
-        override: bool = False,
-        append: bool = False,
     ) -> List[ScoringResult]:
         """
         Executes an evaluation of `Example`s using one or more `Scorer`s
@@ -142,21 +144,13 @@ class JudgmentClient(metaclass=SingletonMeta):
             model (str): The model used as a judge when using LLM as a Judge
             project_name (str): The name of the project the evaluation results belong to
             eval_run_name (str): A name for this evaluation run
-            override (bool): Whether to override an existing evaluation run with the same name
-            append (bool): Whether to append to an existing evaluation run with the same name
 
         Returns:
             List[ScoringResult]: The results of the evaluation
         """
-        if override and append:
-            raise ValueError(
-                "Cannot set both override and append to True. Please choose one."
-            )
 
         try:
             eval = EvaluationRun(
-                append=append,
-                override=override,
                 project_name=project_name,
                 eval_name=eval_run_name,
                 examples=examples,
@@ -167,7 +161,6 @@ class JudgmentClient(metaclass=SingletonMeta):
             return run_eval(
                 eval,
                 self.judgment_api_key,
-                override,
             )
         except ValueError as e:
             raise ValueError(
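With append/override (and the override argument to run_eval) removed, a 0.6.0 evaluation call reduces to examples, scorers, model, and the project/run names. A hedged sketch; the Example field names and FaithfulnessScorer are written from memory of judgeval's public API rather than taken from this diff:

    from judgeval import JudgmentClient           # assumed top-level export
    from judgeval.data import Example
    from judgeval.scorers import FaithfulnessScorer

    client = JudgmentClient()
    example = Example(input="What is the capital of France?", actual_output="Paris.")
    results = client.run_evaluation(
        examples=[example],
        scorers=[FaithfulnessScorer()],
        project_name="default_project",
        eval_run_name="quickstart_run",           # model omitted -> DEFAULT_GPT_MODEL
    )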
@@ -176,22 +169,6 @@ class JudgmentClient(metaclass=SingletonMeta):
         except Exception as e:
             raise Exception(f"An unexpected error occurred during evaluation: {str(e)}")
 
-    def pull_eval(
-        self, project_name: str, eval_run_name: str
-    ) -> List[Dict[str, Union[str, List[ScoringResult]]]]:
-        """Pull evaluation results from the server.
-
-        Args:
-            project_name (str): Name of the project
-            eval_run_name (str): Name of the evaluation run
-
-        Returns:
-            Dict[str, Union[str, List[ScoringResult]]]: Dictionary containing:
-                - id (str): The evaluation run ID
-                - results (List[ScoringResult]): List of scoring results
-        """
-        return self.api_client.fetch_evaluation_results(project_name, eval_run_name)
-
     def create_project(self, project_name: str) -> bool:
         """
         Creates a project on the server.
@@ -214,11 +191,9 @@ class JudgmentClient(metaclass=SingletonMeta):
         self,
         examples: List[Example],
         scorers: List[Union[APIScorerConfig, BaseScorer]],
-        model: Optional[str] = "gpt-4.1",
+        model: Optional[str] = DEFAULT_GPT_MODEL,
         project_name: str = "default_test",
         eval_run_name: str = str(uuid4()),
-        override: bool = False,
-        append: bool = False,
     ) -> None:
         """
         Asserts a test by running the evaluation and checking the results for success
@@ -229,9 +204,6 @@ class JudgmentClient(metaclass=SingletonMeta):
             model (str): The model used as a judge when using LLM as a Judge
             project_name (str): The name of the project the evaluation results belong to
             eval_run_name (str): A name for this evaluation run
-            override (bool): Whether to override an existing evaluation run with the same name
-            append (bool): Whether to append to an existing evaluation run with the same name
-            async_execution (bool): Whether to run the evaluation asynchronously
         """
 
         results: List[ScoringResult]
@@ -242,8 +214,6 @@ class JudgmentClient(metaclass=SingletonMeta):
             model=model,
             project_name=project_name,
             eval_run_name=eval_run_name,
-            override=override,
-            append=append,
         )
         assert_test(results)
 
@@ -255,12 +225,9 @@ class JudgmentClient(metaclass=SingletonMeta):
         tracer: Optional[Union[Tracer, JudgevalCallbackHandler]] = None,
         traces: Optional[List[Trace]] = None,
         tools: Optional[List[Dict[str, Any]]] = None,
-        model: Optional[str] = "gpt-4.1",
+        model: Optional[str] = DEFAULT_GPT_MODEL,
         project_name: str = "default_test",
         eval_run_name: str = str(uuid4()),
-        override: bool = False,
-        append: bool = False,
-        async_execution: bool = False,
     ) -> None:
         """
         Asserts a test by running the evaluation and checking the results for success
@@ -271,12 +238,9 @@ class JudgmentClient(metaclass=SingletonMeta):
             model (str): The model used as a judge when using LLM as a Judge
             project_name (str): The name of the project the evaluation results belong to
             eval_run_name (str): A name for this evaluation run
-            override (bool): Whether to override an existing evaluation run with the same name
-            append (bool): Whether to append to an existing evaluation run with the same name
             function (Optional[Callable]): A function to use for evaluation
             tracer (Optional[Union[Tracer, BaseCallbackHandler]]): A tracer to use for evaluation
             tools (Optional[List[Dict[str, Any]]]): A list of tools to use for evaluation
-            async_execution (bool): Whether to run the evaluation asynchronously
         """
 
         # Check for enable_param_checking and tools
@@ -297,11 +261,107 @@ class JudgmentClient(metaclass=SingletonMeta):
             model=model,
             project_name=project_name,
             eval_run_name=eval_run_name,
-            override=override,
-            append=append,
             function=function,
             tracer=tracer,
             tools=tools,
         )
 
         assert_test(results)
+
+    def _extract_scorer_name(self, scorer_file_path: str) -> str:
+        """Extract scorer name from the scorer file by importing it."""
+        try:
+            spec = importlib.util.spec_from_file_location(
+                "scorer_module", scorer_file_path
+            )
+            if spec is None or spec.loader is None:
+                raise ImportError(f"Could not load spec from {scorer_file_path}")
+
+            module = importlib.util.module_from_spec(spec)
+            spec.loader.exec_module(module)
+
+            for attr_name in dir(module):
+                attr = getattr(module, attr_name)
+                if (
+                    isinstance(attr, type)
+                    and any("Scorer" in str(base) for base in attr.__mro__)
+                    and attr.__module__ == "scorer_module"
+                ):
+                    try:
+                        # Instantiate the scorer and get its name
+                        scorer_instance = attr()
+                        if hasattr(scorer_instance, "name"):
+                            return scorer_instance.name
+                    except Exception:
+                        # Skip if instantiation fails
+                        continue
+
+            raise AttributeError("No scorer class found or could be instantiated")
+        except Exception as e:
+            judgeval_logger.warning(f"Could not extract scorer name: {e}")
+            return Path(scorer_file_path).stem
+
+    def save_custom_scorer(
+        self,
+        scorer_file_path: str,
+        requirements_file_path: Optional[str] = None,
+        unique_name: Optional[str] = None,
+    ) -> bool:
+        """
+        Upload custom ExampleScorer from files to backend.
+
+        Args:
+            scorer_file_path: Path to Python file containing CustomScorer class
+            requirements_file_path: Optional path to requirements.txt
+            unique_name: Optional unique identifier (auto-detected from scorer.name if not provided)
+
+        Returns:
+            bool: True if upload successful
+
+        Raises:
+            ValueError: If scorer file is invalid
+            FileNotFoundError: If scorer file doesn't exist
+        """
+        import os
+
+        if not os.path.exists(scorer_file_path):
+            raise FileNotFoundError(f"Scorer file not found: {scorer_file_path}")
+
+        # Auto-detect scorer name if not provided
+        if unique_name is None:
+            unique_name = self._extract_scorer_name(scorer_file_path)
+            judgeval_logger.info(f"Auto-detected scorer name: '{unique_name}'")
+
+        # Read scorer code
+        with open(scorer_file_path, "r") as f:
+            scorer_code = f.read()
+
+        # Read requirements (optional)
+        requirements_text = ""
+        if requirements_file_path and os.path.exists(requirements_file_path):
+            with open(requirements_file_path, "r") as f:
+                requirements_text = f.read()
+
+        # Upload to backend
+        judgeval_logger.info(
+            f"Uploading custom scorer: {unique_name}, this can take a couple of minutes..."
+        )
+        try:
+            response = self.api_client.upload_custom_scorer(
+                scorer_name=unique_name,
+                scorer_code=scorer_code,
+                requirements_text=requirements_text,
+            )
+
+            if response.get("status") == "success":
+                judgeval_logger.info(
+                    f"Successfully uploaded custom scorer: {unique_name}"
+                )
+                return True
+            else:
+                judgeval_logger.error(f"Failed to upload custom scorer: {unique_name}")
+                return False
+
+        except Exception as e:
+            judgeval_logger.error(f"Error uploading custom scorer: {e}")
+            raise
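The new save_custom_scorer() method uploads a scorer defined in a standalone Python file; when unique_name is omitted, _extract_scorer_name() imports the file and reads the scorer's name attribute. A hedged usage sketch; the file paths are illustrative and my_scorer.py is assumed to define an instantiable BaseScorer subclass:

    from judgeval import JudgmentClient               # assumed top-level export

    client = JudgmentClient()
    uploaded = client.save_custom_scorer(
        scorer_file_path="my_scorer.py",              # defines the custom scorer class
        requirements_file_path="requirements.txt",    # optional extra dependencies
        # unique_name omitted -> auto-detected from the scorer's .name
    )
    if not uploaded:
        raise RuntimeError("custom scorer upload was rejected by the backend")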
@@ -0,0 +1,190 @@
+"""Local evaluation queue for batching custom scorer evaluations.
+
+This module provides a simple in-memory queue for EvaluationRun objects that contain
+only local (BaseScorer) scorers. Useful for batching evaluations and processing them
+either synchronously or in a background thread.
+"""
+
+import queue
+import threading
+from typing import Callable, List, Optional
+import time
+
+from judgeval.common.logger import judgeval_logger
+from judgeval.constants import MAX_CONCURRENT_EVALUATIONS
+from judgeval.data import ScoringResult
+from judgeval.data.evaluation_run import EvaluationRun
+from judgeval.utils.async_utils import safe_run_async
+from judgeval.scorers.score import a_execute_scoring
+
+
+class LocalEvaluationQueue:
+    """Lightweight in-memory queue for local evaluation runs.
+
+    Only supports EvaluationRuns with local scorers (BaseScorer instances).
+    API scorers (APIScorerConfig) are not supported as they have their own queue.
+    """
+
+    def __init__(
+        self, max_concurrent: int = MAX_CONCURRENT_EVALUATIONS, num_workers: int = 4
+    ):
+        if num_workers <= 0:
+            raise ValueError("num_workers must be a positive integer.")
+        self._queue: queue.Queue[Optional[EvaluationRun]] = queue.Queue()
+        self._max_concurrent = max_concurrent
+        self._num_workers = num_workers  # Number of worker threads
+        self._worker_threads: List[threading.Thread] = []
+        self._shutdown_event = threading.Event()
+
+    def enqueue(self, evaluation_run: EvaluationRun) -> None:
+        """Add evaluation run to the queue."""
+        self._queue.put(evaluation_run)
+
+    def _process_run(self, evaluation_run: EvaluationRun) -> List[ScoringResult]:
+        """Execute evaluation run locally and return results."""
+
+        if not evaluation_run.custom_scorers:
+            raise ValueError(
+                "LocalEvaluationQueue only supports runs with local scorers (BaseScorer). "
+                "Found only APIScorerConfig instances."
+            )
+
+        return safe_run_async(
+            a_execute_scoring(
+                evaluation_run.examples,
+                evaluation_run.custom_scorers,
+                model=evaluation_run.model,
+                throttle_value=0,
+                max_concurrent=self._max_concurrent // self._num_workers,
+                show_progress=False,
+            )
+        )
+
+    def run_all(
+        self,
+        callback: Optional[Callable[[EvaluationRun, List[ScoringResult]], None]] = None,
+    ) -> None:
+        """Process all queued runs synchronously.
+
+        Args:
+            callback: Optional function called after each run with (run, results).
+        """
+        while not self._queue.empty():
+            run = self._queue.get()
+            if run is None:  # Sentinel for worker shutdown
+                self._queue.put(None)
+                break
+            results = self._process_run(run)
+            if callback:
+                callback(run, results)
+            self._queue.task_done()
+
+    def start_workers(
+        self,
+        callback: Optional[Callable[[EvaluationRun, List[ScoringResult]], None]] = None,
+    ) -> List[threading.Thread]:
+        """Start multiple background threads to process runs in parallel.
+
+        Args:
+            callback: Optional function called after each run with (run, results).
+
+        Returns:
+            List of started worker threads.
+        """
+
+        def _worker(worker_id: int) -> None:
+            while not self._shutdown_event.is_set():
+                try:
+                    # Use timeout so workers can check shutdown event periodically
+                    run = self._queue.get(timeout=1.0)
+                    if run is None:  # Sentinel to stop worker
+                        # Put sentinel back for other workers
+                        self._queue.put(None)
+                        self._queue.task_done()
+                        break
+
+                    try:
+                        results = self._process_run(run)
+                        if callback:
+                            callback(run, results)
+                    except Exception as exc:
+                        judgeval_logger.error(
+                            f"Worker {worker_id} error processing {run.eval_name}: {exc}"
+                        )
+                        # Continue processing other runs instead of shutting down all workers
+                    finally:
+                        self._queue.task_done()
+
+                except queue.Empty:
+                    # Timeout - check shutdown event and continue
+                    continue
+
+        # Start worker threads
+        for i in range(self._num_workers):
+            thread = threading.Thread(target=_worker, args=(i,), daemon=True)
+            thread.start()
+            self._worker_threads.append(thread)
+
+        return self._worker_threads
+
+    def start_worker(
+        self,
+        callback: Optional[Callable[[EvaluationRun, List[ScoringResult]], None]] = None,
+    ) -> Optional[threading.Thread]:
+        """Start a single background thread to process runs (backward compatibility).
+
+        Args:
+            callback: Optional function called after each run with (run, results).
+
+        Returns:
+            The started thread, or None if no threads were started.
+        """
+        threads = self.start_workers(callback)
+        return threads[0] if threads else None
+
+    def wait_for_completion(self, timeout: Optional[float] = None) -> bool:
+        """Wait for all queued tasks to complete.
+
+        Args:
+            timeout: Maximum time to wait in seconds. None means wait indefinitely.
+
+        Returns:
+            True if all tasks completed, False if timeout occurred.
+        """
+        try:
+            if timeout is None:
+                self._queue.join()
+                return True
+            else:
+                start_time = time.time()
+                while not self._queue.empty() or self._queue.unfinished_tasks > 0:
+                    if time.time() - start_time > timeout:
+                        return False
+                    time.sleep(0.1)
+                return True
+        except Exception:
+            return False
+
+    def stop_workers(self) -> None:
+        """Signal all background workers to stop after current tasks complete."""
+        if not self._worker_threads:
+            return
+
+        # Signal shutdown
+        self._shutdown_event.set()
+
+        # Send sentinel to wake up any blocking workers
+        for _ in range(self._num_workers):
+            self._queue.put(None)
+
+        # Wait for all workers to finish with timeout
+        for thread in self._worker_threads:
+            if thread.is_alive():
+                thread.join(timeout=5.0)
+                if thread.is_alive():
+                    judgeval_logger.warning(
+                        f"Worker thread {thread.name} did not shut down gracefully"
+                    )
+
+        self._worker_threads.clear()
+        self._shutdown_event.clear()
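This new module (its path is not shown in this diff) adds an in-memory queue for batching EvaluationRuns that use only local BaseScorer scorers. A hedged sketch of the intended flow; the import location, my_runs, and the use of ScoringResult.success are assumptions:

    # from judgeval... import LocalEvaluationQueue   # module path not visible in this diff

    def on_done(run, results):
        print(run.eval_name, [r.success for r in results])

    local_queue = LocalEvaluationQueue(num_workers=2)
    for run in my_runs:                           # EvaluationRun objects with custom_scorers
        local_queue.enqueue(run)

    local_queue.start_workers(callback=on_done)   # background threads...
    local_queue.wait_for_completion(timeout=300)  # ...or local_queue.run_all(on_done) synchronously
    local_queue.stop_workers()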