judgeval 0.5.0__py3-none-any.whl → 0.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/cli.py +65 -0
- judgeval/common/api/api.py +44 -38
- judgeval/common/api/constants.py +18 -5
- judgeval/common/api/json_encoder.py +8 -9
- judgeval/common/tracer/core.py +448 -256
- judgeval/common/tracer/otel_span_processor.py +1 -1
- judgeval/common/tracer/span_processor.py +1 -1
- judgeval/common/tracer/span_transformer.py +2 -1
- judgeval/common/tracer/trace_manager.py +6 -1
- judgeval/common/trainer/__init__.py +5 -0
- judgeval/common/trainer/config.py +125 -0
- judgeval/common/trainer/console.py +151 -0
- judgeval/common/trainer/trainable_model.py +238 -0
- judgeval/common/trainer/trainer.py +301 -0
- judgeval/data/evaluation_run.py +104 -0
- judgeval/data/judgment_types.py +37 -8
- judgeval/data/trace.py +1 -0
- judgeval/data/trace_run.py +0 -2
- judgeval/integrations/langgraph.py +2 -1
- judgeval/judgment_client.py +90 -135
- judgeval/local_eval_queue.py +3 -5
- judgeval/run_evaluation.py +43 -299
- judgeval/scorers/base_scorer.py +9 -10
- judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +17 -3
- {judgeval-0.5.0.dist-info → judgeval-0.7.0.dist-info}/METADATA +10 -47
- {judgeval-0.5.0.dist-info → judgeval-0.7.0.dist-info}/RECORD +29 -22
- judgeval-0.7.0.dist-info/entry_points.txt +2 -0
- judgeval/evaluation_run.py +0 -80
- {judgeval-0.5.0.dist-info → judgeval-0.7.0.dist-info}/WHEEL +0 -0
- {judgeval-0.5.0.dist-info → judgeval-0.7.0.dist-info}/licenses/LICENSE.md +0 -0
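The listing already tells a migration story before the per-file diffs below: judgeval/evaluation_run.py (+0 -80) is deleted while judgeval/data/evaluation_run.py (+104 -0) is added, a judgeval/common/trainer package and a CLI entry point appear, and judgment_client.py and run_evaluation.py shrink substantially. A minimal import-migration sketch for the relocated EvaluationRun (assuming only the module path changed, which the diffs below support):

    # judgeval 0.5.0
    # from judgeval.evaluation_run import EvaluationRun

    # judgeval 0.7.0: the module now lives under judgeval.data
    from judgeval.data.evaluation_run import EvaluationRun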
judgeval/judgment_client.py
CHANGED
@@ -4,35 +4,31 @@ Implements the JudgmentClient to interact with the Judgment API.
 
 from __future__ import annotations
 import os
+import importlib.util
+from pathlib import Path
 from uuid import uuid4
-from typing import Optional, List, Dict,
+from typing import Optional, List, Dict, Union
 
 from judgeval.data import (
     ScoringResult,
     Example,
-    Trace,
 )
 from judgeval.scorers import (
     APIScorerConfig,
     BaseScorer,
 )
-from judgeval.evaluation_run import EvaluationRun
+from judgeval.data.evaluation_run import EvaluationRun
 from judgeval.run_evaluation import (
     run_eval,
     assert_test,
-    run_trace_eval,
 )
-from judgeval.data.trace_run import TraceRun
 from judgeval.common.api import JudgmentApiClient
 from judgeval.common.exceptions import JudgmentAPIError
-from judgeval.common.tracer import Tracer
 from judgeval.common.utils import validate_api_key
 from pydantic import BaseModel
 from judgeval.common.logger import judgeval_logger
 
 
-if TYPE_CHECKING:
-    from judgeval.integrations.langgraph import JudgevalCallbackHandler
 from judgeval.constants import DEFAULT_GPT_MODEL
 
 
@@ -84,50 +80,6 @@ class JudgmentClient(metaclass=SingletonMeta):
         else:
             judgeval_logger.info("Successfully initialized JudgmentClient!")
 
-    def run_trace_evaluation(
-        self,
-        scorers: List[Union[APIScorerConfig, BaseScorer]],
-        examples: Optional[List[Example]] = None,
-        function: Optional[Callable] = None,
-        tracer: Optional[Union[Tracer, JudgevalCallbackHandler]] = None,
-        traces: Optional[List[Trace]] = None,
-        tools: Optional[List[Dict[str, Any]]] = None,
-        project_name: str = "default_project",
-        eval_run_name: str = "default_eval_trace",
-        model: Optional[str] = DEFAULT_GPT_MODEL,
-        append: bool = False,
-        override: bool = False,
-    ) -> List[ScoringResult]:
-        try:
-            if examples and not function:
-                raise ValueError("Cannot pass in examples without a function")
-
-            if traces and function:
-                raise ValueError("Cannot pass in traces and function")
-
-            if examples and traces:
-                raise ValueError("Cannot pass in both examples and traces")
-
-            trace_run = TraceRun(
-                project_name=project_name,
-                eval_name=eval_run_name,
-                traces=traces,
-                scorers=scorers,
-                model=model,
-                append=append,
-                organization_id=self.organization_id,
-                tools=tools,
-            )
-            return run_trace_eval(
-                trace_run, self.judgment_api_key, override, function, tracer, examples
-            )
-        except ValueError as e:
-            raise ValueError(
-                f"Please check your TraceRun object, one or more fields are invalid: \n{str(e)}"
-            )
-        except Exception as e:
-            raise Exception(f"An unexpected error occurred during evaluation: {str(e)}")
-
     def run_evaluation(
         self,
         examples: List[Example],
@@ -135,8 +87,7 @@ class JudgmentClient(metaclass=SingletonMeta):
         model: Optional[str] = DEFAULT_GPT_MODEL,
         project_name: str = "default_project",
         eval_run_name: str = "default_eval_run",
-
-        append: bool = False,
+        show_url: bool = True,
     ) -> List[ScoringResult]:
         """
         Executes an evaluation of `Example`s using one or more `Scorer`s
@@ -147,21 +98,13 @@ class JudgmentClient(metaclass=SingletonMeta):
             model (str): The model used as a judge when using LLM as a Judge
             project_name (str): The name of the project the evaluation results belong to
             eval_run_name (str): A name for this evaluation run
-            override (bool): Whether to override an existing evaluation run with the same name
-            append (bool): Whether to append to an existing evaluation run with the same name
 
         Returns:
             List[ScoringResult]: The results of the evaluation
         """
-        if override and append:
-            raise ValueError(
-                "Cannot set both override and append to True. Please choose one."
-            )
 
         try:
             eval = EvaluationRun(
-                append=append,
-                override=override,
                 project_name=project_name,
                 eval_name=eval_run_name,
                 examples=examples,
@@ -172,7 +115,7 @@ class JudgmentClient(metaclass=SingletonMeta):
             return run_eval(
                 eval,
                 self.judgment_api_key,
-
+                show_url=show_url,
             )
         except ValueError as e:
             raise ValueError(
@@ -181,22 +124,6 @@ class JudgmentClient(metaclass=SingletonMeta):
         except Exception as e:
             raise Exception(f"An unexpected error occurred during evaluation: {str(e)}")
 
-    def pull_eval(
-        self, project_name: str, eval_run_name: str
-    ) -> List[Dict[str, Union[str, List[ScoringResult]]]]:
-        """Pull evaluation results from the server.
-
-        Args:
-            project_name (str): Name of the project
-            eval_run_name (str): Name of the evaluation run
-
-        Returns:
-            Dict[str, Union[str, List[ScoringResult]]]: Dictionary containing:
-                - id (str): The evaluation run ID
-                - results (List[ScoringResult]): List of scoring results
-        """
-        return self.api_client.fetch_evaluation_results(project_name, eval_run_name)
-
     def create_project(self, project_name: str) -> bool:
         """
         Creates a project on the server.
@@ -222,8 +149,6 @@ class JudgmentClient(metaclass=SingletonMeta):
         model: Optional[str] = DEFAULT_GPT_MODEL,
         project_name: str = "default_test",
         eval_run_name: str = str(uuid4()),
-        override: bool = False,
-        append: bool = False,
     ) -> None:
         """
         Asserts a test by running the evaluation and checking the results for success
@@ -234,9 +159,6 @@ class JudgmentClient(metaclass=SingletonMeta):
             model (str): The model used as a judge when using LLM as a Judge
            project_name (str): The name of the project the evaluation results belong to
            eval_run_name (str): A name for this evaluation run
-            override (bool): Whether to override an existing evaluation run with the same name
-            append (bool): Whether to append to an existing evaluation run with the same name
-            async_execution (bool): Whether to run the evaluation asynchronously
         """
 
         results: List[ScoringResult]
@@ -247,66 +169,99 @@ class JudgmentClient(metaclass=SingletonMeta):
             model=model,
             project_name=project_name,
             eval_run_name=eval_run_name,
-            override=override,
-            append=append,
         )
         assert_test(results)
 
-    def
+    def _extract_scorer_name(self, scorer_file_path: str) -> str:
+        """Extract scorer name from the scorer file by importing it."""
+        try:
+            spec = importlib.util.spec_from_file_location(
+                "scorer_module", scorer_file_path
+            )
+            if spec is None or spec.loader is None:
+                raise ImportError(f"Could not load spec from {scorer_file_path}")
+
+            module = importlib.util.module_from_spec(spec)
+            spec.loader.exec_module(module)
+
+            for attr_name in dir(module):
+                attr = getattr(module, attr_name)
+                if (
+                    isinstance(attr, type)
+                    and any("Scorer" in str(base) for base in attr.__mro__)
+                    and attr.__module__ == "scorer_module"
+                ):
+                    try:
+                        # Instantiate the scorer and get its name
+                        scorer_instance = attr()
+                        if hasattr(scorer_instance, "name"):
+                            return scorer_instance.name
+                    except Exception:
+                        # Skip if instantiation fails
+                        continue
+
+            raise AttributeError("No scorer class found or could be instantiated")
+        except Exception as e:
+            judgeval_logger.warning(f"Could not extract scorer name: {e}")
+            return Path(scorer_file_path).stem
+
+    def upload_custom_scorer(
         self,
-
-
-
-
-        traces: Optional[List[Trace]] = None,
-        tools: Optional[List[Dict[str, Any]]] = None,
-        model: Optional[str] = DEFAULT_GPT_MODEL,
-        project_name: str = "default_test",
-        eval_run_name: str = str(uuid4()),
-        override: bool = False,
-        append: bool = False,
-        async_execution: bool = False,
-    ) -> None:
+        scorer_file_path: str,
+        requirements_file_path: Optional[str] = None,
+        unique_name: Optional[str] = None,
+    ) -> bool:
         """
-
+        Upload custom ExampleScorer from files to backend.
 
         Args:
-
-
-
-
-
-
-
-
-
-
-            async_execution (bool): Whether to run the evaluation asynchronously
+            scorer_file_path: Path to Python file containing CustomScorer class
+            requirements_file_path: Optional path to requirements.txt
+            unique_name: Optional unique identifier (auto-detected from scorer.name if not provided)
+
+        Returns:
+            bool: True if upload successful
+
+        Raises:
+            ValueError: If scorer file is invalid
+            FileNotFoundError: If scorer file doesn't exist
         """
+        import os
 
-
-
-        if hasattr(scorer, "kwargs") and scorer.kwargs is not None:
-            if scorer.kwargs.get("enable_param_checking") is True:
-                if not tools:
-                    raise ValueError(
-                        f"You must provide the 'tools' argument to assert_test when using a scorer with enable_param_checking=True. If you do not want to do param checking, explicitly set enable_param_checking=False for the {scorer.__name__} scorer."
-                    )
+        if not os.path.exists(scorer_file_path):
+            raise FileNotFoundError(f"Scorer file not found: {scorer_file_path}")
 
-
+        # Auto-detect scorer name if not provided
+        if unique_name is None:
+            unique_name = self._extract_scorer_name(scorer_file_path)
+            judgeval_logger.info(f"Auto-detected scorer name: '{unique_name}'")
 
-
-
-
-            scorers=scorers,
-            model=model,
-            project_name=project_name,
-            eval_run_name=eval_run_name,
-            override=override,
-            append=append,
-            function=function,
-            tracer=tracer,
-            tools=tools,
-        )
+        # Read scorer code
+        with open(scorer_file_path, "r") as f:
+            scorer_code = f.read()
 
-
+        # Read requirements (optional)
+        requirements_text = ""
+        if requirements_file_path and os.path.exists(requirements_file_path):
+            with open(requirements_file_path, "r") as f:
+                requirements_text = f.read()
+
+        try:
+            response = self.api_client.upload_custom_scorer(
+                scorer_name=unique_name,
+                scorer_code=scorer_code,
+                requirements_text=requirements_text,
+            )
+
+            if response.get("status") == "success":
+                judgeval_logger.info(
+                    f"Successfully uploaded custom scorer: {unique_name}"
+                )
+                return True
+            else:
+                judgeval_logger.error(f"Failed to upload custom scorer: {unique_name}")
+                return False
+
+        except Exception as e:
+            judgeval_logger.error(f"Error uploading custom scorer: {e}")
+            raise
judgeval/local_eval_queue.py
CHANGED
@@ -13,9 +13,8 @@ import time
 from judgeval.common.logger import judgeval_logger
 from judgeval.constants import MAX_CONCURRENT_EVALUATIONS
 from judgeval.data import ScoringResult
-from judgeval.evaluation_run import EvaluationRun
+from judgeval.data.evaluation_run import EvaluationRun
 from judgeval.utils.async_utils import safe_run_async
-from judgeval.scorers import BaseScorer
 from judgeval.scorers.score import a_execute_scoring
 
 
@@ -43,9 +42,8 @@ class LocalEvaluationQueue:
 
     def _process_run(self, evaluation_run: EvaluationRun) -> List[ScoringResult]:
         """Execute evaluation run locally and return results."""
-        local_scorers = [s for s in evaluation_run.scorers if isinstance(s, BaseScorer)]
 
-        if not
+        if not evaluation_run.custom_scorers:
             raise ValueError(
                 "LocalEvaluationQueue only supports runs with local scorers (BaseScorer). "
                 "Found only APIScorerConfig instances."
@@ -54,7 +52,7 @@ class LocalEvaluationQueue:
         return safe_run_async(
             a_execute_scoring(
                 evaluation_run.examples,
-
+                evaluation_run.custom_scorers,
                 model=evaluation_run.model,
                 throttle_value=0,
                 max_concurrent=self._max_concurrent // self._num_workers,