judgeval 0.0.3__py3-none-any.whl → 0.0.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67)
  1. judgeval/__init__.py +0 -71
  2. judgeval/common/tracer.py +57 -31
  3. judgeval/constants.py +1 -0
  4. judgeval/data/__init__.py +2 -1
  5. judgeval/data/scorer_data.py +2 -2
  6. judgeval/evaluation_run.py +16 -15
  7. judgeval/judges/__init__.py +2 -2
  8. judgeval/judges/base_judge.py +1 -1
  9. judgeval/judges/litellm_judge.py +2 -2
  10. judgeval/judges/mixture_of_judges.py +2 -2
  11. judgeval/judges/together_judge.py +2 -2
  12. judgeval/judges/utils.py +4 -4
  13. judgeval/judgment_client.py +67 -15
  14. judgeval/run_evaluation.py +79 -14
  15. judgeval/scorers/__init__.py +8 -4
  16. judgeval/scorers/api_scorer.py +64 -0
  17. judgeval/scorers/base_scorer.py +3 -2
  18. judgeval/scorers/exceptions.py +11 -0
  19. judgeval/scorers/{custom_scorer.py → judgeval_scorer.py} +9 -5
  20. judgeval/scorers/judgeval_scorers/__init__.py +132 -9
  21. judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +23 -0
  22. judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +19 -0
  23. judgeval/scorers/judgeval_scorers/{answer_relevancy.py → api_scorers/answer_relevancy.py} +2 -2
  24. judgeval/scorers/judgeval_scorers/{contextual_precision.py → api_scorers/contextual_precision.py} +2 -2
  25. judgeval/scorers/judgeval_scorers/{contextual_recall.py → api_scorers/contextual_recall.py} +2 -2
  26. judgeval/scorers/judgeval_scorers/{contextual_relevancy.py → api_scorers/contextual_relevancy.py} +2 -2
  27. judgeval/scorers/judgeval_scorers/{faithfulness.py → api_scorers/faithfulness.py} +2 -2
  28. judgeval/scorers/judgeval_scorers/{hallucination.py → api_scorers/hallucination.py} +2 -2
  29. judgeval/scorers/judgeval_scorers/{json_correctness.py → api_scorers/json_correctness.py} +7 -7
  30. judgeval/scorers/judgeval_scorers/{summarization.py → api_scorers/summarization.py} +2 -2
  31. judgeval/scorers/judgeval_scorers/{tool_correctness.py → api_scorers/tool_correctness.py} +2 -2
  32. judgeval/scorers/judgeval_scorers/local_implementations/__init__.py +24 -0
  33. judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/__init__.py +4 -0
  34. judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/answer_correctness_scorer.py +272 -0
  35. judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/prompts.py +169 -0
  36. judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/__init__.py +4 -0
  37. judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/answer_relevancy_scorer.py +292 -0
  38. judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/prompts.py +174 -0
  39. judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/__init__.py +3 -0
  40. judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/contextual_precision_scorer.py +259 -0
  41. judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/prompts.py +106 -0
  42. judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/__init__.py +3 -0
  43. judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/contextual_recall_scorer.py +249 -0
  44. judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/prompts.py +142 -0
  45. judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/__init__.py +3 -0
  46. judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/contextual_relevancy_scorer.py +240 -0
  47. judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/prompts.py +121 -0
  48. judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/__init__.py +3 -0
  49. judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/faithfulness_scorer.py +318 -0
  50. judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/prompts.py +265 -0
  51. judgeval/scorers/judgeval_scorers/local_implementations/hallucination/__init__.py +3 -0
  52. judgeval/scorers/judgeval_scorers/local_implementations/hallucination/hallucination_scorer.py +258 -0
  53. judgeval/scorers/judgeval_scorers/local_implementations/hallucination/prompts.py +104 -0
  54. judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/json_correctness_scorer.py +127 -0
  55. judgeval/scorers/judgeval_scorers/local_implementations/summarization/__init__.py +3 -0
  56. judgeval/scorers/judgeval_scorers/local_implementations/summarization/prompts.py +247 -0
  57. judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarization_scorer.py +541 -0
  58. judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/__init__.py +3 -0
  59. judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/tool_correctness_scorer.py +151 -0
  60. judgeval/scorers/prompt_scorer.py +4 -4
  61. judgeval/scorers/score.py +14 -14
  62. judgeval/scorers/utils.py +40 -6
  63. {judgeval-0.0.3.dist-info → judgeval-0.0.4.dist-info}/METADATA +1 -1
  64. judgeval-0.0.4.dist-info/RECORD +78 -0
  65. judgeval-0.0.3.dist-info/RECORD +0 -46
  66. {judgeval-0.0.3.dist-info → judgeval-0.0.4.dist-info}/WHEEL +0 -0
  67. {judgeval-0.0.3.dist-info → judgeval-0.0.4.dist-info}/licenses/LICENSE.md +0 -0
judgeval/__init__.py CHANGED
@@ -1,42 +1,4 @@
  # Import key components that should be publicly accessible
- from judgeval.common.utils import (
-     get_chat_completion,
-     aget_chat_completion,
-     get_completion_multiple_models,
-     aget_completion_multiple_models
- )
- from judgeval.data import (
-     Example,
-     ProcessExample,
-     ScorerData,
-     ScoringResult,
- )
- from judgeval.data.datasets import (
-     EvalDataset,
-     GroundTruthExample
- )
-
- from judgeval.judges import (
-     judgevalJudge,
-     LiteLLMJudge,
-     TogetherJudge,
-     MixtureOfJudges
- )
- from judgeval.scorers import (
-     JudgmentScorer,
-     CustomScorer,
-     PromptScorer,
-     ClassifierScorer,
-     ToolCorrectnessScorer,
-     JSONCorrectnessScorer,
-     SummarizationScorer,
-     HallucinationScorer,
-     FaithfulnessScorer,
-     ContextualRelevancyScorer,
-     ContextualPrecisionScorer,
-     ContextualRecallScorer,
-     AnswerRelevancyScorer
- )
  from judgeval.clients import client, langfuse, together_client
  from judgeval.judgment_client import JudgmentClient

@@ -46,38 +8,5 @@ __all__ = [
      'langfuse',
      'together_client',

-     # # Common utilities
-     # 'get_chat_completion',
-     # 'aget_chat_completion',
-     # 'get_completion_multiple_models',
-     # 'aget_completion_multiple_models',
-
-     # # Data classes
-     # 'Example',
-     # 'ProcessExample',
-     # 'ScorerData',
-     # 'ScoringResult',
-
-     # # Judges
-     # 'judgevalJudge',
-     # 'LiteLLMJudge',
-     # 'TogetherJudge',
-     # 'MixtureOfJudges',
-
-     # # Scorers
-     # 'JudgmentScorer',
-     # 'CustomScorer',
-     # 'PromptScorer',
-     # 'ClassifierScorer',
-     # 'ToolCorrectnessScorer',
-     # 'JSONCorrectnessScorer',
-     # 'SummarizationScorer',
-     # 'HallucinationScorer',
-     # 'FaithfulnessScorer',
-     # 'ContextualRelevancyScorer',
-     # 'ContextualPrecisionScorer',
-     # 'ContextualRecallScorer',
-     # 'AnswerRelevancyScorer',
-
      'JudgmentClient',
  ]
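With this change the top-level judgeval package no longer re-exports the data classes, judges, and scorers; only client, langfuse, together_client, and JudgmentClient remain in __all__. A minimal sketch of what downstream imports would look like after upgrading, inferred only from the module paths visible elsewhere in this diff (the exact public import paths are an assumption):

# Sketch only: import paths inferred from this diff, not from judgeval documentation.
from judgeval import JudgmentClient                       # still re-exported at the top level
from judgeval.data import Example, ScoringResult          # no longer importable from the bare judgeval package
from judgeval.judges import JudgevalJudge, TogetherJudge  # JudgevalJudge is the renamed judgevalJudge (see judges diffs below)
from judgeval.scorers import APIJudgmentScorer, JudgevalScorer  # renamed from JudgmentScorer / CustomScorer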
judgeval/common/tracer.py CHANGED
@@ -7,16 +7,7 @@ import functools
  import requests
  import uuid
  from contextlib import contextmanager
- from typing import (
-     Optional,
-     Any,
-     List,
-     Literal,
-     Tuple,
-     Generator,
-     TypeAlias,
-     Union
- )
+ from typing import Optional, Any, List, Literal, Tuple, Generator, TypeAlias, Union
  from dataclasses import dataclass, field
  from datetime import datetime
  from openai import OpenAI
@@ -33,7 +24,7 @@ from http import HTTPStatus
  from judgeval.constants import JUDGMENT_TRACES_SAVE_API_URL
  from judgeval.judgment_client import JudgmentClient
  from judgeval.data import Example
- from judgeval.scorers import JudgmentScorer, CustomScorer
+ from judgeval.scorers import APIJudgmentScorer, JudgevalScorer
  from judgeval.data.result import ScoringResult

  # Define type aliases for better code readability and maintainability
@@ -76,16 +67,42 @@ class TraceEntry:
          elif self.type == "evaluation":
              print(f"{indent}Evaluation: {self.evaluation_result} ({self.duration:.3f}s)")

-     def to_dict(self) -> dict:
-         """Convert the trace entry to a dictionary format for storage/transmission."""
+     def _serialize_inputs(self) -> dict:
+         """Helper method to serialize input data safely.
+
+         Returns a dict with serializable versions of inputs, converting non-serializable
+         objects to None with a warning.
+         """
+         serialized_inputs = {}
+         for key, value in self.inputs.items():
+             if isinstance(value, BaseModel):
+                 serialized_inputs[key] = value.model_dump()
+             elif isinstance(value, (list, tuple)):
+                 # Handle lists/tuples of arguments
+                 serialized_inputs[key] = [
+                     item.model_dump() if isinstance(item, BaseModel)
+                     else None if not self._is_json_serializable(item)
+                     else item
+                     for item in value
+                 ]
+             else:
+                 if self._is_json_serializable(value):
+                     serialized_inputs[key] = value
+                 else:
+                     warnings.warn(f"Input '{key}' for function {self.function} is not JSON serializable. Setting to None.")
+                     serialized_inputs[key] = None
+         return serialized_inputs
+
+     def _is_json_serializable(self, obj: Any) -> bool:
+         """Helper method to check if an object is JSON serializable."""
          try:
-             output = self._serialize_output()
+             json.dumps(obj)
+             return True
          except (TypeError, OverflowError, ValueError):
-             # Handle cases where output cannot be serialized
-             warnings.warn(f"Output for function {self.function} is not JSON serializable. Setting to None.")
-             output = None
+             return False

-         # Build a complete dictionary representation of the trace entry
+     def to_dict(self) -> dict:
+         """Convert the trace entry to a dictionary format for storage/transmission."""
          return {
              "type": self.type,
              "function": self.function,
@@ -93,8 +110,8 @@ class TraceEntry:
              "message": self.message,
              "timestamp": self.timestamp,
              "duration": self.duration,
-             "output": output,
-             "inputs": self.inputs or None, # Convert empty dict to None
+             "output": self._serialize_output(),
+             "inputs": self._serialize_inputs(),
              "evaluation_result": [result.to_dict() for result in self.evaluation_result] if self.evaluation_result else None,
              "span_type": self.span_type
          }
@@ -104,18 +121,22 @@ class TraceEntry:

          Handles special cases:
          - Pydantic models are converted using model_dump()
-         - Other objects must be JSON serializable
+         - Non-serializable objects return None with a warning
          """
          if isinstance(self.output, BaseModel):
              return self.output.model_dump()

-         # Verify JSON serialization is possible
-         json.dumps(self.output)
-         return self.output
+         try:
+             # Try to serialize the output to verify it's JSON compatible
+             json.dumps(self.output)
+             return self.output
+         except (TypeError, OverflowError, ValueError):
+             warnings.warn(f"Output for function {self.function} is not JSON serializable. Setting to None.")
+             return None

  class TraceClient:
      """Client for managing a single trace context"""
-     def __init__(self, tracer, trace_id: str, name: str, project_name: str = "default_project"):
+     def __init__(self, tracer, trace_id: str, name: str, project_name: str = "default_project", overwrite: bool = False):
          self.tracer = tracer
          self.trace_id = trace_id
          self.name = name
@@ -125,6 +146,7 @@ class TraceClient:
          self.start_time = time.time()
          self.span_type = None
          self._current_span: Optional[TraceEntry] = None
+         self.overwrite = overwrite

      @contextmanager
      def span(self, name: str, span_type: SpanType = "span"):
@@ -165,7 +187,7 @@ class TraceClient:

      async def async_evaluate(
          self,
-         scorers: List[Union[JudgmentScorer, CustomScorer]],
+         scorers: List[Union[APIJudgmentScorer, JudgevalScorer]],
          input: Optional[str] = None,
          actual_output: Optional[str] = None,
          expected_output: Optional[str] = None,
@@ -175,7 +197,7 @@ class TraceClient:
          expected_tools: Optional[List[str]] = None,
          additional_metadata: Optional[Dict[str, Any]] = None,
          model: Optional[str] = None,
-         log_results: Optional[bool] = False,
+         log_results: Optional[bool] = True,
      ):
          start_time = time.time() # Record start time
          example = Example(
@@ -195,9 +217,13 @@ class TraceClient:
              model=model,
              metadata={},
              log_results=log_results,
-             project_name="TestSpanLevel1", # TODO this should be dynamic
-             eval_run_name="TestSpanLevel1",
-             override=True,
+             project_name=self.project_name,
+             eval_run_name=(
+                 f"{self.name.capitalize()}-"
+                 f"{self._current_span}-"
+                 f"[{','.join(scorer.load_implementation().score_type.capitalize() for scorer in scorers)}]"
+             ),
+             override=self.overwrite
          )

          self.record_evaluation(scoring_results, start_time) # Pass start_time to record_evaluation
@@ -393,7 +419,7 @@ class Tracer:
      def trace(self, name: str, project_name: str = "default_project", overwrite: bool = False) -> Generator[TraceClient, None, None]:
          """Start a new trace context using a context manager"""
          trace_id = str(uuid.uuid4())
-         trace = TraceClient(self, trace_id, name, project_name=project_name)
+         trace = TraceClient(self, trace_id, name, project_name=project_name, overwrite=overwrite)
          prev_trace = self._current_trace
          self._current_trace = trace

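Taken together, the tracer changes thread the overwrite flag from Tracer.trace() into TraceClient and the EvaluationRun's override field, switch the accepted scorer types to APIJudgmentScorer/JudgevalScorer, derive the eval run name from the trace instead of the hard-coded "TestSpanLevel1", and flip async_evaluate's log_results default to True. A hedged sketch of how a span-level evaluation might be invoked under the new signature; Tracer() construction and the scorer constructor arguments are assumptions not shown in this diff:

# Sketch only: exercises Tracer.trace(overwrite=...) and TraceClient.async_evaluate
# as they appear in this diff. Tracer() and FaithfulnessScorer(threshold=...) are
# assumed constructor calls, not signatures confirmed by the diff.
import asyncio
from judgeval.common.tracer import Tracer
from judgeval.scorers import FaithfulnessScorer

async def main():
    tracer = Tracer()  # hypothetical constructor call
    with tracer.trace("support_bot", project_name="demo_project", overwrite=True) as trace:
        with trace.span("generate_answer"):
            answer = "You can return items within 30 days."  # stand-in for real application output
        await trace.async_evaluate(
            scorers=[FaithfulnessScorer(threshold=0.8)],  # hypothetical arguments
            input="What is the return policy?",
            actual_output=answer,
            model="gpt-4o-mini",
            # log_results now defaults to True; project and run name come from the trace.
        )

asyncio.run(main())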
judgeval/constants.py CHANGED
@@ -15,6 +15,7 @@ class APIScorer(str, Enum):
      """
      FAITHFULNESS = "faithfulness"
      ANSWER_RELEVANCY = "answer_relevancy"
+     ANSWER_CORRECTNESS = "answer_correctness"
      HALLUCINATION = "hallucination"
      SUMMARIZATION = "summarization"
      CONTEXTUAL_RECALL = "contextual_recall"
judgeval/data/__init__.py CHANGED
@@ -1,10 +1,11 @@
- from judgeval.data.example import Example
+ from judgeval.data.example import Example, ExampleParams
  from judgeval.data.api_example import ProcessExample, create_process_example
  from judgeval.data.scorer_data import ScorerData, create_scorer_data
  from judgeval.data.result import ScoringResult, generate_scoring_result

  __all__ = [
      "Example",
+     "ExampleParams",
      "ProcessExample",
      "create_process_example",
      "ScorerData",
judgeval/data/scorer_data.py CHANGED
@@ -7,7 +7,7 @@ ScorerData holds the information related to a single, completed Scorer evaluatio
  from typing import List, Union, Optional, Dict
  from pydantic import BaseModel, Field

- from judgeval.scorers import CustomScorer
+ from judgeval.scorers import JudgevalScorer

  class ScorerData(BaseModel):
      """
@@ -47,7 +47,7 @@ class ScorerData(BaseModel):
          }


- def create_scorer_data(scorer: CustomScorer) -> ScorerData:
+ def create_scorer_data(scorer: JudgevalScorer) -> ScorerData:
      """
      After a `scorer` is run, it contains information about the example that was evaluated
      using the scorer. For example, after computing Faithfulness, the `scorer` object will contain
judgeval/evaluation_run.py CHANGED
@@ -2,11 +2,10 @@ from typing import List, Optional, Dict, Any, Union
  from pydantic import BaseModel, field_validator

  from judgeval.data import Example
- from judgeval.scorers import CustomScorer, JudgmentScorer
- from judgeval.judges import judgevalJudge
+ from judgeval.scorers import JudgevalScorer, APIJudgmentScorer
  from judgeval.constants import ACCEPTABLE_MODELS
  from judgeval.common.logger import debug, error
-
+ from judgeval.judges import JudgevalJudge

  class EvaluationRun(BaseModel):
      """
@@ -28,8 +27,8 @@ class EvaluationRun(BaseModel):
      project_name: Optional[str] = None
      eval_name: Optional[str] = None
      examples: List[Example]
-     scorers: List[Union[JudgmentScorer, CustomScorer]]
-     model: Union[str, List[str], judgevalJudge]
+     scorers: List[Union[APIJudgmentScorer, JudgevalScorer]]
+     model: Union[str, List[str], JudgevalJudge]
      aggregator: Optional[str] = None
      metadata: Optional[Dict[str, Any]] = None
      # API Key will be "" until user calls client.run_eval(), then API Key will be set
@@ -39,8 +38,9 @@ class EvaluationRun(BaseModel):
          data = super().model_dump(**kwargs)

          data["scorers"] = [
-             scorer.to_dict() \
-             if hasattr(scorer, "to_dict") else {"score_type": scorer.score_type, "threshold": scorer.threshold}
+             scorer.to_dict() if hasattr(scorer, "to_dict")
+             else scorer.model_dump() if hasattr(scorer, "model_dump")
+             else {"score_type": scorer.score_type, "threshold": scorer.threshold}
              for scorer in self.scorers
          ]
          return data
@@ -81,7 +81,7 @@ class EvaluationRun(BaseModel):
          if not v:
              raise ValueError("Scorers cannot be empty.")
          for s in v:
-             if not isinstance(s, JudgmentScorer) and not isinstance(s, CustomScorer):
+             if not isinstance(s, APIJudgmentScorer) and not isinstance(s, JudgevalScorer):
                  raise ValueError(f"Invalid type for Scorer: {type(s)}")
          return v

@@ -89,20 +89,21 @@ class EvaluationRun(BaseModel):
      def validate_model(cls, v, values):
          if not v:
              raise ValueError("Model cannot be empty.")
+
          # Check if model is a judgevalJudge
-         if isinstance(v, judgevalJudge):
-             # Verify all scorers are CustomScorer when using judgevalJudge
+         if isinstance(v, JudgevalJudge):
+             # Verify all scorers are JudgevalScorer when using judgevalJudge
              scorers = values.data.get('scorers', [])
-             if not all(isinstance(s, CustomScorer) for s in scorers):
-                 raise ValueError("When using a judgevalJudge model, all scorers must be CustomScorer type")
+             if not all(isinstance(s, JudgevalScorer) for s in scorers):
+                 raise ValueError("When using a judgevalJudge model, all scorers must be JudgevalScorer type")
              return v
-
+
          # Check if model is string or list of strings
          if isinstance(v, str):
              if v not in ACCEPTABLE_MODELS:
                  raise ValueError(f"Model name {v} not recognized.")
              return v
-
+
          if isinstance(v, list):
              if not all(isinstance(m, str) for m in v):
                  raise ValueError("When providing a list of models, all elements must be strings")
@@ -110,7 +111,7 @@ class EvaluationRun(BaseModel):
                  if m not in ACCEPTABLE_MODELS:
                      raise ValueError(f"Model name {m} not recognized.")
              return v
-         raise ValueError(f"Model must be one of: string, list of strings, or judgevalJudge instance. Received type {type(v)}.")
+         raise ValueError(f"Model must be one of: string, list of strings, or JudgevalJudge instance. Received type {type(v)}.")

      @field_validator('aggregator', mode='before')
      def validate_aggregator(cls, v, values):
judgeval/judges/__init__.py CHANGED
@@ -1,7 +1,7 @@
  from pydantic import BaseModel
- from judgeval.judges.base_judge import judgevalJudge
+ from judgeval.judges.base_judge import JudgevalJudge
  from judgeval.judges.litellm_judge import LiteLLMJudge
  from judgeval.judges.together_judge import TogetherJudge
  from judgeval.judges.mixture_of_judges import MixtureOfJudges

- __all__ = ["judgevalJudge", "LiteLLMJudge", "TogetherJudge", "MixtureOfJudges"]
+ __all__ = ["JudgevalJudge", "LiteLLMJudge", "TogetherJudge", "MixtureOfJudges"]
judgeval/judges/base_judge.py CHANGED
@@ -6,7 +6,7 @@ from abc import ABC, abstractmethod
  from typing import Optional, List


- class judgevalJudge(ABC):
+ class JudgevalJudge(ABC):
      def __init__(self, model_name: Optional[str] = None, *args, **kwargs):
          self.model_name = model_name
          self.model = self.load_model(*args, **kwargs)
judgeval/judges/litellm_judge.py CHANGED
@@ -2,7 +2,7 @@ import pydantic
  from typing import List, Union, Mapping

  from judgeval import *
- from judgeval.judges import judgevalJudge
+ from judgeval.judges import JudgevalJudge
  from judgeval.common.utils import afetch_litellm_api_response, fetch_litellm_api_response
  from judgeval.common.logger import debug, error

@@ -11,7 +11,7 @@ BASE_CONVERSATION = [
  ] # for string inputs, we need to add the user query to a base conversation, since LiteLLM only accepts a list of dictionaries as a chat history


- class LiteLLMJudge(judgevalJudge):
+ class LiteLLMJudge(JudgevalJudge):
      def __init__(self, model: str = "gpt-4o-mini", **kwargs):
          debug(f"Initializing LiteLLMJudge with model={model}")
          self.model = model
judgeval/judges/mixture_of_judges.py CHANGED
@@ -6,7 +6,7 @@ Enables client to use multiple models to generate responses and then aggregate t
  from judgeval import *
  import pydantic
  from typing import List, Union, Mapping, Dict
- from judgeval.judges import judgevalJudge
+ from judgeval.judges import JudgevalJudge
  from judgeval.common.utils import get_completion_multiple_models, get_chat_completion, aget_completion_multiple_models, aget_chat_completion
  from judgeval.common.logger import debug, error

@@ -115,7 +115,7 @@ def build_dynamic_mixture_prompt(
  BASE_CONVERSATION = [
      {"role": "system", "content": "You are a helpful assistant."},
  ] # for string inputs, we need to add the user query to a base conversation, since LiteLLM only accepts a list of dictionaries as a chat history
- class MixtureOfJudges(judgevalJudge):
+ class MixtureOfJudges(JudgevalJudge):
      """
      IMPORTANT: When supplying custom prompts and conversation histories for aggregation, supply them in the following format:
      in kwargs:
judgeval/judges/together_judge.py CHANGED
@@ -6,14 +6,14 @@ from pydantic import BaseModel
  from typing import List, Union, Mapping
  from judgeval.common.logger import debug, error

- from judgeval.judges import judgevalJudge
+ from judgeval.judges import JudgevalJudge
  from judgeval.common.utils import fetch_together_api_response, afetch_together_api_response

  BASE_CONVERSATION = [
      {"role": "system", "content": "You are a helpful assistant."},
  ]

- class TogetherJudge(judgevalJudge):
+ class TogetherJudge(JudgevalJudge):
      def __init__(self, model: str = "QWEN", **kwargs):
          debug(f"Initializing TogetherJudge with model={model}")
          self.model = model
judgeval/judges/utils.py CHANGED
@@ -5,13 +5,13 @@ import litellm
  from typing import Optional, Union, Tuple, List

  from judgeval.common.exceptions import InvalidJudgeModelError
- from judgeval.judges import judgevalJudge, LiteLLMJudge, TogetherJudge, MixtureOfJudges
+ from judgeval.judges import JudgevalJudge, LiteLLMJudge, TogetherJudge, MixtureOfJudges
  from judgeval.constants import TOGETHER_SUPPORTED_MODELS

  LITELLM_SUPPORTED_MODELS = set(litellm.model_list)

  def create_judge(
-     model: Optional[Union[str, List[str], judgevalJudge]] = None) -> Tuple[judgevalJudge, bool]:
+     model: Optional[Union[str, List[str], JudgevalJudge]] = None) -> Tuple[JudgevalJudge, bool]:
      """
      Creates a judge model from string(s) or a judgeval judge object.

@@ -24,10 +24,10 @@ def create_judge(
      """
      if model is None: # default option
          return LiteLLMJudge(model="gpt-4o"), True
-     if not isinstance(model, (str, list, judgevalJudge)):
+     if not isinstance(model, (str, list, JudgevalJudge)):
          raise InvalidJudgeModelError(f"Model must be a string, list of strings, or a judgeval judge object. Got: {type(model)} instead.")
      # If model is already a valid judge type, return it and mark native
-     if isinstance(model, (judgevalJudge, LiteLLMJudge, TogetherJudge, MixtureOfJudges)):
+     if isinstance(model, (JudgevalJudge, LiteLLMJudge, TogetherJudge, MixtureOfJudges)):
          return model, True

      # Either string or List[str]
judgeval/judgment_client.py CHANGED
@@ -7,11 +7,22 @@ import requests

  from judgeval.constants import ROOT_API
  from judgeval.data.datasets import EvalDataset
- from judgeval.data import ScoringResult, Example
- from judgeval.judges import judgevalJudge
- from judgeval.scorers import JudgmentScorer, CustomScorer, ClassifierScorer
+ from judgeval.data import (
+     ScoringResult,
+     Example
+ )
+ from judgeval.scorers import (
+     APIJudgmentScorer,
+     JudgevalScorer,
+     ClassifierScorer,
+     ScorerWrapper
+ )
  from judgeval.evaluation_run import EvaluationRun
- from judgeval.run_evaluation import run_eval
+ from judgeval.run_evaluation import (
+     run_eval,
+     assert_test
+ )
+ from judgeval.judges import JudgevalJudge
  from judgeval.constants import JUDGMENT_EVAL_FETCH_API_URL
  from judgeval.common.exceptions import JudgmentAPIError
  from pydantic import BaseModel
@@ -37,25 +48,32 @@ class JudgmentClient:
      def run_evaluation(
          self,
          examples: List[Example],
-         scorers: List[Union[JudgmentScorer, CustomScorer]],
-         model: Union[str, List[str], judgevalJudge],
+         scorers: List[Union[ScorerWrapper, JudgevalScorer]],
+         model: Union[str, List[str], JudgevalJudge],
          aggregator: Optional[str] = None,
          metadata: Optional[Dict[str, Any]] = None,
-         log_results: bool = False,
-         project_name: str = "",
-         eval_run_name: str = "",
+         log_results: bool = True,
+         project_name: str = "default_project",
+         eval_run_name: str = "default_eval_run",
          override: bool = False,
+         use_judgment: bool = True
      ) -> List[ScoringResult]:
          """
          Executes an evaluation of `Example`s using one or more `Scorer`s
          """
          try:
+             # Load appropriate implementations for all scorers
+             loaded_scorers: List[Union[JudgevalScorer, APIJudgmentScorer]] = [
+                 scorer.load_implementation(use_judgment=use_judgment) if isinstance(scorer, ScorerWrapper) else scorer
+                 for scorer in scorers
+             ]
+
              eval = EvaluationRun(
                  log_results=log_results,
                  project_name=project_name,
                  eval_name=eval_run_name,
                  examples=examples,
-                 scorers=scorers,
+                 scorers=loaded_scorers,
                  model=model,
                  aggregator=aggregator,
                  metadata=metadata,
@@ -68,24 +86,31 @@ class JudgmentClient:
      def evaluate_dataset(
          self,
          dataset: EvalDataset,
-         scorers: List[Union[JudgmentScorer, CustomScorer]],
-         model: Union[str, List[str]],
+         scorers: List[Union[ScorerWrapper, JudgevalScorer]],
+         model: Union[str, List[str], JudgevalJudge],
          aggregator: Optional[str] = None,
          metadata: Optional[Dict[str, Any]] = None,
          project_name: str = "",
          eval_run_name: str = "",
-         log_results: bool = False
+         log_results: bool = False,
+         use_judgment: bool = True
      ) -> List[ScoringResult]:
          """
          Executes an evaluation of a `EvalDataset` using one or more `Scorer`s
          """
          try:
+             # Load appropriate implementations for all scorers
+             loaded_scorers: List[Union[JudgevalScorer, APIJudgmentScorer]] = [
+                 scorer.load_implementation(use_judgment=use_judgment) if isinstance(scorer, ScorerWrapper) else scorer
+                 for scorer in scorers
+             ]
+
              evaluation_run = EvaluationRun(
                  log_results=log_results,
                  project_name=project_name,
                  eval_name=eval_run_name,
                  examples=dataset.examples,
-                 scorers=scorers,
+                 scorers=loaded_scorers,
                  model=model,
                  aggregator=aggregator,
                  metadata=metadata,
@@ -241,4 +266,31 @@ class JudgmentClient:
              raise JudgmentAPIError(f"Failed to save classifier scorer: {response.json().get('detail', '')}")

          return response.json()["slug"]
-
+
+
+     def assert_test(
+         self,
+         examples: List[Example],
+         scorers: List[Union[APIJudgmentScorer, JudgevalScorer]],
+         model: Union[str, List[str], JudgevalJudge],
+         aggregator: Optional[str] = None,
+         metadata: Optional[Dict[str, Any]] = None,
+         log_results: bool = False,
+         project_name: str = "",
+         eval_run_name: str = "",
+         override: bool = False,
+     ) -> None:
+
+         results = self.run_evaluation(
+             examples=examples,
+             scorers=scorers,
+             model=model,
+             aggregator=aggregator,
+             metadata=metadata,
+             log_results=log_results,
+             project_name=project_name,
+             eval_run_name=eval_run_name,
+             override=override
+         )
+
+         assert_test(results)
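The client changes resolve ScorerWrapper instances into API-backed or local implementations via load_implementation(use_judgment=...), switch run_evaluation's defaults to log_results=True with default_project/default_eval_run, and add an assert_test() method that forwards to run_evaluation() and hands the results to run_evaluation.assert_test. A hedged usage sketch; JudgmentClient() construction, Example's field values, and the scorer's constructor arguments are assumptions not confirmed by this diff:

# Sketch only: based on the run_evaluation/assert_test signatures added in this diff.
# JudgmentClient(), Example(...), and AnswerRelevancyScorer(threshold=...) are assumed
# constructor calls for illustration.
from judgeval import JudgmentClient
from judgeval.data import Example
from judgeval.scorers import AnswerRelevancyScorer

client = JudgmentClient()  # hypothetical constructor call
example = Example(
    input="What is the capital of France?",
    actual_output="Paris is the capital of France.",
)

# Scorers may be ScorerWrapper or JudgevalScorer instances; use_judgment selects the
# API-backed implementation (True) or the new local_implementations (False).
results = client.run_evaluation(
    examples=[example],
    scorers=[AnswerRelevancyScorer(threshold=0.7)],
    model="gpt-4o-mini",
    use_judgment=True,
)

# New in 0.0.4: run the same evaluation and pass the results to
# judgeval.run_evaluation.assert_test, which is expected to flag failing scorers.
client.assert_test(
    examples=[example],
    scorers=[AnswerRelevancyScorer(threshold=0.7)],
    model="gpt-4o-mini",
)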