judgeval 0.5.0__py3-none-any.whl → 0.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/cli.py +65 -0
- judgeval/common/api/api.py +44 -38
- judgeval/common/api/constants.py +18 -5
- judgeval/common/api/json_encoder.py +8 -9
- judgeval/common/tracer/core.py +278 -256
- judgeval/common/tracer/otel_span_processor.py +1 -1
- judgeval/common/tracer/span_processor.py +1 -1
- judgeval/common/tracer/span_transformer.py +2 -1
- judgeval/data/evaluation_run.py +104 -0
- judgeval/data/judgment_types.py +37 -8
- judgeval/data/trace.py +1 -0
- judgeval/data/trace_run.py +0 -2
- judgeval/integrations/langgraph.py +2 -1
- judgeval/judgment_client.py +102 -47
- judgeval/local_eval_queue.py +3 -5
- judgeval/run_evaluation.py +33 -192
- judgeval/scorers/base_scorer.py +9 -10
- judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +17 -3
- {judgeval-0.5.0.dist-info → judgeval-0.6.0.dist-info}/METADATA +3 -1
- {judgeval-0.5.0.dist-info → judgeval-0.6.0.dist-info}/RECORD +23 -21
- judgeval-0.6.0.dist-info/entry_points.txt +2 -0
- judgeval/evaluation_run.py +0 -80
- {judgeval-0.5.0.dist-info → judgeval-0.6.0.dist-info}/WHEEL +0 -0
- {judgeval-0.5.0.dist-info → judgeval-0.6.0.dist-info}/licenses/LICENSE.md +0 -0
judgeval/common/tracer/span_transformer.py
CHANGED
@@ -11,7 +11,7 @@ from pydantic import BaseModel
 
 from judgeval.common.api.json_encoder import json_encoder
 from judgeval.data import TraceSpan
-from judgeval.evaluation_run import EvaluationRun
+from judgeval.data.evaluation_run import EvaluationRun
 
 
 class SpanTransformer:
@@ -150,6 +150,7 @@ class SpanTransformer:
             "additional_metadata": judgment_data.get("additional_metadata"),
             "has_evaluation": judgment_data.get("has_evaluation", False),
             "agent_name": judgment_data.get("agent_name"),
+            "class_name": judgment_data.get("class_name"),
             "state_before": judgment_data.get("state_before"),
             "state_after": judgment_data.get("state_after"),
             "update_id": judgment_data.get("update_id", 1),
judgeval/data/evaluation_run.py
ADDED
@@ -0,0 +1,104 @@
+from typing import List, Optional, Union
+from pydantic import field_validator, model_validator, Field
+from datetime import datetime, timezone
+import uuid
+
+from judgeval.data import Example
+from judgeval.scorers import BaseScorer, APIScorerConfig
+from judgeval.constants import ACCEPTABLE_MODELS
+from judgeval.data.judgment_types import EvaluationRunJudgmentType
+
+
+class EvaluationRun(EvaluationRunJudgmentType):
+    """
+    Stores example and evaluation scorers together for running an eval task
+
+    Args:
+        project_name (str): The name of the project the evaluation results belong to
+        eval_name (str): A name for this evaluation run
+        examples (List[Example]): The examples to evaluate
+        scorers (List[Union[BaseScorer, APIScorerConfig]]): A list of scorers to use for evaluation
+        model (str): The model used as a judge when using LLM as a Judge
+        metadata (Optional[Dict[str, Any]]): Additional metadata to include for this evaluation run, e.g. comments, dataset name, purpose, etc.
+    """
+
+    id: Optional[str] = Field(default_factory=lambda: str(uuid.uuid4()))
+    created_at: Optional[str] = Field(
+        default_factory=lambda: datetime.now(timezone.utc).isoformat()
+    )
+    custom_scorers: Optional[List[BaseScorer]] = None
+    judgment_scorers: Optional[List[APIScorerConfig]] = None
+    organization_id: Optional[str] = None
+
+    def __init__(
+        self,
+        scorers: Optional[List[Union[BaseScorer, APIScorerConfig]]] = None,
+        **kwargs,
+    ):
+        """
+        Initialize EvaluationRun with automatic scorer classification.
+
+        Args:
+            scorers: List of scorers that will be automatically sorted into custom_scorers or judgment_scorers
+            **kwargs: Other initialization arguments
+        """
+        if scorers is not None:
+            # Automatically sort scorers into appropriate fields
+            custom_scorers = [s for s in scorers if isinstance(s, BaseScorer)]
+            judgment_scorers = [s for s in scorers if isinstance(s, APIScorerConfig)]
+
+            # Always set both fields as lists (even if empty) to satisfy validation
+            kwargs["custom_scorers"] = custom_scorers
+            kwargs["judgment_scorers"] = judgment_scorers
+
+        super().__init__(**kwargs)
+
+    def model_dump(self, **kwargs):
+        data = super().model_dump(**kwargs)
+        data["custom_scorers"] = [s.model_dump() for s in self.custom_scorers]
+        data["judgment_scorers"] = [s.model_dump() for s in self.judgment_scorers]
+        data["examples"] = [example.model_dump() for example in self.examples]
+
+        return data
+
+    @field_validator("examples")
+    def validate_examples(cls, v):
+        if not v:
+            raise ValueError("Examples cannot be empty.")
+        for item in v:
+            if not isinstance(item, Example):
+                raise ValueError(f"Item of type {type(item)} is not a Example")
+        return v
+
+    @model_validator(mode="after")
+    @classmethod
+    def validate_scorer_lists(cls, values):
+        custom_scorers = values.custom_scorers
+        judgment_scorers = values.judgment_scorers
+
+        # Check that both lists are not empty
+        if not custom_scorers and not judgment_scorers:
+            raise ValueError(
+                "At least one of custom_scorers or judgment_scorers must be provided."
+            )
+
+        # Check that only one list is filled
+        if custom_scorers and judgment_scorers:
+            raise ValueError(
+                "Only one of custom_scorers or judgment_scorers can be provided, not both."
+            )
+
+        return values
+
+    @field_validator("model")
+    def validate_model(cls, v, values):
+        if not v:
+            raise ValueError("Model cannot be empty.")
+
+        # Check if model is string or list of strings
+        if isinstance(v, str):
+            if v not in ACCEPTABLE_MODELS:
+                raise ValueError(
+                    f"Model name {v} not recognized. Please select a valid model name.)"
+                )
+        return v
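For orientation (not part of the diff): a minimal sketch of how the relocated EvaluationRun might be used in 0.6.0. The __init__ above sorts a mixed scorers list into custom_scorers (BaseScorer instances) and judgment_scorers (APIScorerConfig instances), and the validators then require exactly one of those lists to be non-empty. The example values and the FaithfulnessScorer/Example usage are assumptions carried over from earlier judgeval releases, not something this diff confirms.

# Hedged sketch: illustrative values; assumes FaithfulnessScorer and Example
# behave as in earlier judgeval releases.
from judgeval.data import Example
from judgeval.data.evaluation_run import EvaluationRun  # new import path in 0.6.0
from judgeval.scorers import FaithfulnessScorer

run = EvaluationRun(
    project_name="demo_project",
    eval_name="demo_run",
    examples=[Example(input="What if these shoes don't fit?",
                      actual_output="We offer a 30-day full refund.")],
    scorers=[FaithfulnessScorer(threshold=0.5)],  # routed into judgment_scorers
    model="gpt-4o",
)
assert run.judgment_scorers and not run.custom_scorers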
judgeval/data/judgment_types.py
CHANGED
@@ -1,6 +1,6 @@
 # generated by datamodel-codegen:
 # filename: openapi_new.json
-# timestamp: 2025-08-
+# timestamp: 2025-08-08T18:50:51+00:00
 
 from __future__ import annotations
 
@@ -51,6 +51,31 @@ class ScorerConfigJudgmentType(BaseModel):
     kwargs: Annotated[Optional[Dict[str, Any]], Field(title="Kwargs")] = None
 
 
+class BaseScorerJudgmentType(BaseModel):
+    score_type: Annotated[str, Field(title="Score Type")]
+    threshold: Annotated[Optional[float], Field(title="Threshold")] = 0.5
+    name: Annotated[Optional[str], Field(title="Name")] = None
+    class_name: Annotated[Optional[str], Field(title="Class Name")] = None
+    score: Annotated[Optional[float], Field(title="Score")] = None
+    score_breakdown: Annotated[
+        Optional[Dict[str, Any]], Field(title="Score Breakdown")
+    ] = None
+    reason: Annotated[Optional[str], Field(title="Reason")] = ""
+    using_native_model: Annotated[Optional[bool], Field(title="Using Native Model")] = (
+        None
+    )
+    success: Annotated[Optional[bool], Field(title="Success")] = None
+    model: Annotated[Optional[str], Field(title="Model")] = None
+    model_client: Annotated[Any, Field(title="Model Client")] = None
+    strict_mode: Annotated[Optional[bool], Field(title="Strict Mode")] = False
+    error: Annotated[Optional[str], Field(title="Error")] = None
+    additional_metadata: Annotated[
+        Optional[Dict[str, Any]], Field(title="Additional Metadata")
+    ] = None
+    user: Annotated[Optional[str], Field(title="User")] = None
+    server_hosted: Annotated[Optional[bool], Field(title="Server Hosted")] = False
+
+
 class TraceUsageJudgmentType(BaseModel):
     prompt_tokens: Annotated[Optional[int], Field(title="Prompt Tokens")] = None
     completion_tokens: Annotated[Optional[int], Field(title="Completion Tokens")] = None
@@ -90,16 +115,21 @@ class HTTPValidationErrorJudgmentType(BaseModel):
     ] = None
 
 
-class
+class EvaluationRunJudgmentType(BaseModel):
+    id: Annotated[Optional[str], Field(title="Id")] = None
     project_name: Annotated[Optional[str], Field(title="Project Name")] = None
     eval_name: Annotated[Optional[str], Field(title="Eval Name")] = None
     examples: Annotated[List[ExampleJudgmentType], Field(title="Examples")]
-
+    custom_scorers: Annotated[
+        Optional[List[BaseScorerJudgmentType]], Field(title="Custom Scorers")
+    ] = Field(default_factory=list)
+    judgment_scorers: Annotated[
+        Optional[List[ScorerConfigJudgmentType]], Field(title="Judgment Scorers")
+    ] = Field(default_factory=list)
     model: Annotated[str, Field(title="Model")]
-    append: Annotated[Optional[bool], Field(title="Append")] = False
-    override: Annotated[Optional[bool], Field(title="Override")] = False
     trace_span_id: Annotated[Optional[str], Field(title="Trace Span Id")] = None
     trace_id: Annotated[Optional[str], Field(title="Trace Id")] = None
+    created_at: Annotated[Optional[str], Field(title="Created At")] = None
 
 
 class TraceSpanJudgmentType(BaseModel):
@@ -123,6 +153,7 @@ class TraceSpanJudgmentType(BaseModel):
     ] = None
     has_evaluation: Annotated[Optional[bool], Field(title="Has Evaluation")] = False
    agent_name: Annotated[Optional[str], Field(title="Agent Name")] = None
+    class_name: Annotated[Optional[str], Field(title="Class Name")] = None
     state_before: Annotated[Optional[Dict[str, Any]], Field(title="State Before")] = (
         None
     )
@@ -172,8 +203,6 @@ class TraceRunJudgmentType(BaseModel):
     traces: Annotated[List[TraceJudgmentType], Field(title="Traces")]
     scorers: Annotated[List[ScorerConfigJudgmentType], Field(title="Scorers")]
     model: Annotated[str, Field(title="Model")]
-    append: Annotated[Optional[bool], Field(title="Append")] = False
-    override: Annotated[Optional[bool], Field(title="Override")] = False
     trace_span_id: Annotated[Optional[str], Field(title="Trace Span Id")] = None
     tools: Annotated[Optional[List[Dict[str, Any]]], Field(title="Tools")] = None
 
@@ -181,5 +210,5 @@
 class EvalResultsJudgmentType(BaseModel):
     results: Annotated[List[ScoringResultJudgmentType], Field(title="Results")]
     run: Annotated[
-        Union[TraceRunJudgmentType,
+        Union[TraceRunJudgmentType, EvaluationRunJudgmentType], Field(title="Run")
     ]
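As a quick illustration (not part of the diff), the newly generated BaseScorerJudgmentType only requires score_type, so a minimal instance serializes roughly as sketched below; the field values are illustrative.

from judgeval.data.judgment_types import BaseScorerJudgmentType

scorer = BaseScorerJudgmentType(score_type="my_custom_scorer", threshold=0.7)
# Only non-None fields survive; non-None defaults like reason="" and strict_mode=False remain.
print(scorer.model_dump(exclude_none=True))
# {'score_type': 'my_custom_scorer', 'threshold': 0.7, 'reason': '', 'strict_mode': False, 'server_hosted': False}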
judgeval/data/trace.py
CHANGED
@@ -32,6 +32,7 @@ class TraceSpan(TraceSpanJudgmentType):
             "usage": self.usage.model_dump() if self.usage else None,
             "has_evaluation": self.has_evaluation,
             "agent_name": self.agent_name,
+            "class_name": self.class_name,
             "state_before": self.state_before,
             "state_after": self.state_after,
             "additional_metadata": json_encoder(self.additional_metadata),
judgeval/data/trace_run.py
CHANGED
@@ -29,8 +29,6 @@ class TraceRun(BaseModel):
     scorers: List[Union[APIScorerConfig, BaseScorer]]
     model: Optional[str] = DEFAULT_GPT_MODEL
     trace_span_id: Optional[str] = None
-    append: Optional[bool] = False
-    override: Optional[bool] = False
     rules: Optional[List[Rule]] = None
     tools: Optional[List[Dict[str, Any]]] = None
 
judgeval/integrations/langgraph.py
CHANGED
@@ -133,7 +133,8 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
         inputs: Optional[Dict[str, Any]] = None,
     ) -> None:
         """Start tracking a span, ensuring trace client exists"""
-
+        if name.startswith("__") and name.endswith("__"):
+            return
         start_time = time.time()
         span_id = str(uuid.uuid4())
         parent_span_id: Optional[str] = None
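The langgraph handler change above guards against starting spans for dunder-named events such as LangGraph's internal __start__ and __end__ nodes. A tiny illustration of the guard; the node names here are assumptions, not values from this diff.

# Names wrapped in double underscores are now skipped when a span would start.
for name in ["__start__", "__end__", "agent", "tool_call"]:
    skipped = name.startswith("__") and name.endswith("__")
    print(f"{name}: {'skipped' if skipped else 'traced'}")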
judgeval/judgment_client.py
CHANGED
@@ -4,6 +4,8 @@ Implements the JudgmentClient to interact with the Judgment API.
 
 from __future__ import annotations
 import os
+import importlib.util
+from pathlib import Path
 from uuid import uuid4
 from typing import Optional, List, Dict, Any, Union, Callable, TYPE_CHECKING
 
@@ -16,7 +18,7 @@ from judgeval.scorers import (
     APIScorerConfig,
     BaseScorer,
 )
-from judgeval.evaluation_run import EvaluationRun
+from judgeval.data.evaluation_run import EvaluationRun
 from judgeval.run_evaluation import (
     run_eval,
     assert_test,
@@ -95,8 +97,6 @@ class JudgmentClient(metaclass=SingletonMeta):
         project_name: str = "default_project",
         eval_run_name: str = "default_eval_trace",
         model: Optional[str] = DEFAULT_GPT_MODEL,
-        append: bool = False,
-        override: bool = False,
     ) -> List[ScoringResult]:
         try:
             if examples and not function:
@@ -114,12 +114,11 @@
                 traces=traces,
                 scorers=scorers,
                 model=model,
-                append=append,
                 organization_id=self.organization_id,
                 tools=tools,
             )
             return run_trace_eval(
-                trace_run, self.judgment_api_key,
+                trace_run, self.judgment_api_key, function, tracer, examples
             )
         except ValueError as e:
             raise ValueError(
@@ -135,8 +134,6 @@
         model: Optional[str] = DEFAULT_GPT_MODEL,
         project_name: str = "default_project",
         eval_run_name: str = "default_eval_run",
-        override: bool = False,
-        append: bool = False,
     ) -> List[ScoringResult]:
         """
         Executes an evaluation of `Example`s using one or more `Scorer`s
@@ -147,21 +144,13 @@
             model (str): The model used as a judge when using LLM as a Judge
             project_name (str): The name of the project the evaluation results belong to
             eval_run_name (str): A name for this evaluation run
-            override (bool): Whether to override an existing evaluation run with the same name
-            append (bool): Whether to append to an existing evaluation run with the same name
 
         Returns:
             List[ScoringResult]: The results of the evaluation
         """
-        if override and append:
-            raise ValueError(
-                "Cannot set both override and append to True. Please choose one."
-            )
 
         try:
             eval = EvaluationRun(
-                append=append,
-                override=override,
                 project_name=project_name,
                 eval_name=eval_run_name,
                 examples=examples,
@@ -172,7 +161,6 @@
             return run_eval(
                 eval,
                 self.judgment_api_key,
-                override,
             )
         except ValueError as e:
             raise ValueError(
@@ -181,22 +169,6 @@
         except Exception as e:
             raise Exception(f"An unexpected error occurred during evaluation: {str(e)}")
 
-    def pull_eval(
-        self, project_name: str, eval_run_name: str
-    ) -> List[Dict[str, Union[str, List[ScoringResult]]]]:
-        """Pull evaluation results from the server.
-
-        Args:
-            project_name (str): Name of the project
-            eval_run_name (str): Name of the evaluation run
-
-        Returns:
-            Dict[str, Union[str, List[ScoringResult]]]: Dictionary containing:
-                - id (str): The evaluation run ID
-                - results (List[ScoringResult]): List of scoring results
-        """
-        return self.api_client.fetch_evaluation_results(project_name, eval_run_name)
-
     def create_project(self, project_name: str) -> bool:
         """
         Creates a project on the server.
@@ -222,8 +194,6 @@
         model: Optional[str] = DEFAULT_GPT_MODEL,
         project_name: str = "default_test",
         eval_run_name: str = str(uuid4()),
-        override: bool = False,
-        append: bool = False,
     ) -> None:
         """
         Asserts a test by running the evaluation and checking the results for success
@@ -234,9 +204,6 @@
             model (str): The model used as a judge when using LLM as a Judge
             project_name (str): The name of the project the evaluation results belong to
             eval_run_name (str): A name for this evaluation run
-            override (bool): Whether to override an existing evaluation run with the same name
-            append (bool): Whether to append to an existing evaluation run with the same name
-            async_execution (bool): Whether to run the evaluation asynchronously
         """
 
         results: List[ScoringResult]
@@ -247,8 +214,6 @@
             model=model,
             project_name=project_name,
            eval_run_name=eval_run_name,
-            override=override,
-            append=append,
         )
         assert_test(results)
 
@@ -263,9 +228,6 @@
         model: Optional[str] = DEFAULT_GPT_MODEL,
         project_name: str = "default_test",
         eval_run_name: str = str(uuid4()),
-        override: bool = False,
-        append: bool = False,
-        async_execution: bool = False,
     ) -> None:
         """
         Asserts a test by running the evaluation and checking the results for success
@@ -276,12 +238,9 @@
             model (str): The model used as a judge when using LLM as a Judge
             project_name (str): The name of the project the evaluation results belong to
             eval_run_name (str): A name for this evaluation run
-            override (bool): Whether to override an existing evaluation run with the same name
-            append (bool): Whether to append to an existing evaluation run with the same name
             function (Optional[Callable]): A function to use for evaluation
             tracer (Optional[Union[Tracer, BaseCallbackHandler]]): A tracer to use for evaluation
             tools (Optional[List[Dict[str, Any]]]): A list of tools to use for evaluation
-            async_execution (bool): Whether to run the evaluation asynchronously
         """
 
         # Check for enable_param_checking and tools
@@ -302,11 +261,107 @@
             model=model,
             project_name=project_name,
             eval_run_name=eval_run_name,
-            override=override,
-            append=append,
             function=function,
             tracer=tracer,
             tools=tools,
         )
 
         assert_test(results)
+
+    def _extract_scorer_name(self, scorer_file_path: str) -> str:
+        """Extract scorer name from the scorer file by importing it."""
+        try:
+            spec = importlib.util.spec_from_file_location(
+                "scorer_module", scorer_file_path
+            )
+            if spec is None or spec.loader is None:
+                raise ImportError(f"Could not load spec from {scorer_file_path}")
+
+            module = importlib.util.module_from_spec(spec)
+            spec.loader.exec_module(module)
+
+            for attr_name in dir(module):
+                attr = getattr(module, attr_name)
+                if (
+                    isinstance(attr, type)
+                    and any("Scorer" in str(base) for base in attr.__mro__)
+                    and attr.__module__ == "scorer_module"
+                ):
+                    try:
+                        # Instantiate the scorer and get its name
+                        scorer_instance = attr()
+                        if hasattr(scorer_instance, "name"):
+                            return scorer_instance.name
+                    except Exception:
+                        # Skip if instantiation fails
+                        continue
+
+            raise AttributeError("No scorer class found or could be instantiated")
+        except Exception as e:
+            judgeval_logger.warning(f"Could not extract scorer name: {e}")
+            return Path(scorer_file_path).stem
+
+    def save_custom_scorer(
+        self,
+        scorer_file_path: str,
+        requirements_file_path: Optional[str] = None,
+        unique_name: Optional[str] = None,
+    ) -> bool:
+        """
+        Upload custom ExampleScorer from files to backend.
+
+        Args:
+            scorer_file_path: Path to Python file containing CustomScorer class
+            requirements_file_path: Optional path to requirements.txt
+            unique_name: Optional unique identifier (auto-detected from scorer.name if not provided)
+
+        Returns:
+            bool: True if upload successful
+
+        Raises:
+            ValueError: If scorer file is invalid
+            FileNotFoundError: If scorer file doesn't exist
+        """
+        import os
+
+        if not os.path.exists(scorer_file_path):
+            raise FileNotFoundError(f"Scorer file not found: {scorer_file_path}")
+
+        # Auto-detect scorer name if not provided
+        if unique_name is None:
+            unique_name = self._extract_scorer_name(scorer_file_path)
+            judgeval_logger.info(f"Auto-detected scorer name: '{unique_name}'")
+
+        # Read scorer code
+        with open(scorer_file_path, "r") as f:
+            scorer_code = f.read()
+
+        # Read requirements (optional)
+        requirements_text = ""
+        if requirements_file_path and os.path.exists(requirements_file_path):
+            with open(requirements_file_path, "r") as f:
+                requirements_text = f.read()
+
+        # Upload to backend
+        judgeval_logger.info(
+            f"Uploading custom scorer: {unique_name}, this can take a couple of minutes..."
+        )
+        try:
+            response = self.api_client.upload_custom_scorer(
+                scorer_name=unique_name,
+                scorer_code=scorer_code,
+                requirements_text=requirements_text,
+            )
+
+            if response.get("status") == "success":
+                judgeval_logger.info(
+                    f"Successfully uploaded custom scorer: {unique_name}"
+                )
+                return True
+            else:
+                judgeval_logger.error(f"Failed to upload custom scorer: {unique_name}")
+                return False
+
+        except Exception as e:
+            judgeval_logger.error(f"Error uploading custom scorer: {e}")
+            raise
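A hedged usage sketch (not from the diff) of the two client-facing changes in this file: the new save_custom_scorer upload helper, and run_evaluation after the append/override parameters were removed. The file paths, example values, and the FaithfulnessScorer/Example usage are hypothetical placeholders carried over from earlier judgeval releases.

from judgeval import JudgmentClient
from judgeval.data import Example
from judgeval.scorers import FaithfulnessScorer

client = JudgmentClient()

# Upload a locally defined custom scorer; "my_scorer.py" and "requirements.txt"
# are hypothetical paths. When unique_name is omitted, it is auto-detected from
# the scorer's `name` attribute.
client.save_custom_scorer(
    scorer_file_path="my_scorer.py",
    requirements_file_path="requirements.txt",
)

# run_evaluation in 0.6.0 no longer takes append/override.
results = client.run_evaluation(
    examples=[Example(input="What if these shoes don't fit?",
                      actual_output="We offer a 30-day full refund.")],
    scorers=[FaithfulnessScorer(threshold=0.5)],
    model="gpt-4o",
    project_name="demo_project",
    eval_run_name="demo_run",
)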
judgeval/local_eval_queue.py
CHANGED
@@ -13,9 +13,8 @@ import time
 from judgeval.common.logger import judgeval_logger
 from judgeval.constants import MAX_CONCURRENT_EVALUATIONS
 from judgeval.data import ScoringResult
-from judgeval.evaluation_run import EvaluationRun
+from judgeval.data.evaluation_run import EvaluationRun
 from judgeval.utils.async_utils import safe_run_async
-from judgeval.scorers import BaseScorer
 from judgeval.scorers.score import a_execute_scoring
 
 
@@ -43,9 +42,8 @@ class LocalEvaluationQueue:
 
     def _process_run(self, evaluation_run: EvaluationRun) -> List[ScoringResult]:
         """Execute evaluation run locally and return results."""
-        local_scorers = [s for s in evaluation_run.scorers if isinstance(s, BaseScorer)]
 
-        if not
+        if not evaluation_run.custom_scorers:
             raise ValueError(
                 "LocalEvaluationQueue only supports runs with local scorers (BaseScorer). "
                 "Found only APIScorerConfig instances."
@@ -54,7 +52,7 @@ class LocalEvaluationQueue:
         return safe_run_async(
             a_execute_scoring(
                 evaluation_run.examples,
-
+                evaluation_run.custom_scorers,
                 model=evaluation_run.model,
                 throttle_value=0,
                 max_concurrent=self._max_concurrent // self._num_workers,