judgeval 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/__init__.py +83 -0
- judgeval/clients.py +19 -0
- judgeval/common/__init__.py +8 -0
- judgeval/common/exceptions.py +28 -0
- judgeval/common/logger.py +189 -0
- judgeval/common/tracer.py +587 -0
- judgeval/common/utils.py +763 -0
- judgeval/constants.py +55 -0
- judgeval/data/__init__.py +14 -0
- judgeval/data/api_example.py +111 -0
- judgeval/data/datasets/__init__.py +4 -0
- judgeval/data/datasets/dataset.py +407 -0
- judgeval/data/datasets/ground_truth.py +54 -0
- judgeval/data/datasets/utils.py +74 -0
- judgeval/data/example.py +76 -0
- judgeval/data/result.py +83 -0
- judgeval/data/scorer_data.py +86 -0
- judgeval/evaluation_run.py +130 -0
- judgeval/judges/__init__.py +7 -0
- judgeval/judges/base_judge.py +44 -0
- judgeval/judges/litellm_judge.py +49 -0
- judgeval/judges/mixture_of_judges.py +248 -0
- judgeval/judges/together_judge.py +55 -0
- judgeval/judges/utils.py +45 -0
- judgeval/judgment_client.py +244 -0
- judgeval/run_evaluation.py +355 -0
- judgeval/scorers/__init__.py +30 -0
- judgeval/scorers/base_scorer.py +51 -0
- judgeval/scorers/custom_scorer.py +134 -0
- judgeval/scorers/judgeval_scorers/__init__.py +21 -0
- judgeval/scorers/judgeval_scorers/answer_relevancy.py +19 -0
- judgeval/scorers/judgeval_scorers/contextual_precision.py +19 -0
- judgeval/scorers/judgeval_scorers/contextual_recall.py +19 -0
- judgeval/scorers/judgeval_scorers/contextual_relevancy.py +22 -0
- judgeval/scorers/judgeval_scorers/faithfulness.py +19 -0
- judgeval/scorers/judgeval_scorers/hallucination.py +19 -0
- judgeval/scorers/judgeval_scorers/json_correctness.py +32 -0
- judgeval/scorers/judgeval_scorers/summarization.py +20 -0
- judgeval/scorers/judgeval_scorers/tool_correctness.py +19 -0
- judgeval/scorers/prompt_scorer.py +439 -0
- judgeval/scorers/score.py +427 -0
- judgeval/scorers/utils.py +175 -0
- judgeval-0.0.1.dist-info/METADATA +40 -0
- judgeval-0.0.1.dist-info/RECORD +46 -0
- judgeval-0.0.1.dist-info/WHEEL +4 -0
- judgeval-0.0.1.dist-info/licenses/LICENSE.md +202 -0
judgeval/data/datasets/utils.py
ADDED
@@ -0,0 +1,74 @@
from typing import List, Optional

from judgeval.data.datasets.ground_truth import GroundTruthExample
from judgeval.data import Example


def examples_to_ground_truths(examples: List[Example]) -> List[GroundTruthExample]:
    """
    Convert a list of `Example` objects to a list of `GroundTruthExample` objects.

    Args:
        examples (List[Example]): A list of `Example` objects to convert.

    Returns:
        List[GroundTruthExample]: A list of `GroundTruthExample` objects.
    """

    if not isinstance(examples, list):
        raise TypeError("Input should be a list of `Example` objects")

    ground_truths = []
    ground_truths = []
    for e in examples:
        g_truth = {
            "input": e.input,
            "actual_output": e.actual_output,
            "expected_output": e.expected_output,
            "context": e.context,
            "retrieval_context": e.retrieval_context,
            "tools_called": e.tools_called,
            "expected_tools": e.expected_tools,
        }
        ground_truths.append(GroundTruthExample(**g_truth))
    return ground_truths


def ground_truths_to_examples(
    ground_truths: List[GroundTruthExample],
    _alias: Optional[str] = None,
    _id: Optional[str] = None,
) -> List[Example]:
    """
    Converts a list of `GroundTruthExample` objects to a list of `Example` objects.

    Args:
        ground_truths (List[GroundTruthExample]): A list of `GroundTruthExample` objects to convert.
        _alias (Optional[str]): The alias of the dataset.
        _id (Optional[str]): The ID of the dataset.

    Returns:
        List[Example]: A list of `Example` objects.
    """

    if not isinstance(ground_truths, list):
        raise TypeError("Input should be a list of `GroundTruthExample` objects")

    examples = []
    for index, ground_truth in enumerate(ground_truths):
        e = Example(
            input=ground_truth.input,
            actual_output=ground_truth.actual_output,
            expected_output=ground_truth.expected_output,
            context=ground_truth.context,
            retrieval_context=ground_truth.retrieval_context,
            additional_metadata=ground_truth.additional_metadata,
            tools_called=ground_truth.tools_called,
            expected_tools=ground_truth.expected_tools,
            comments=ground_truth.comments,
            _dataset_alias=_alias,
            _dataset_id=_id,
            _dataset_rank=index,
        )
        examples.append(e)
    return examples
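As a usage illustration for the two converters above, here is a minimal round-trip sketch. It assumes judgeval 0.0.1 is installed and relies only on the fields these functions themselves read and write; `GroundTruthExample`'s full schema (judgeval/data/datasets/ground_truth.py) is not shown in this diff, so its defaults are an assumption.

```python
# Hypothetical round trip through the converters above; field values are
# illustrative and GroundTruthExample's exact schema is assumed, not shown here.
from judgeval.data import Example
from judgeval.data.datasets.utils import (
    examples_to_ground_truths,
    ground_truths_to_examples,
)

examples = [
    Example(
        input="What is the capital of France?",
        actual_output="Paris",
        expected_output="Paris",
        retrieval_context=["France's capital is Paris."],
    )
]

ground_truths = examples_to_ground_truths(examples)       # Example -> GroundTruthExample
round_tripped = ground_truths_to_examples(ground_truths,  # GroundTruthExample -> Example
                                          _alias="demo_dataset")
print(round_tripped[0].input)  # "What is the capital of France?"
```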
judgeval/data/example.py
ADDED
@@ -0,0 +1,76 @@
"""
Classes for representing examples in a dataset.
"""


from typing import TypeVar, Optional, Any, Dict, List
from pydantic import BaseModel
from enum import Enum
from datetime import datetime


Input = TypeVar('Input')
Output = TypeVar('Output')

class ExampleParams(Enum):
    INPUT = "input"
    ACTUAL_OUTPUT = "actual_output"
    EXPECTED_OUTPUT = "expected_output"
    CONTEXT = "context"
    RETRIEVAL_CONTEXT = "retrieval_context"
    TOOLS_CALLED = "tools_called"
    EXPECTED_TOOLS = "expected_tools"
    REASONING = "reasoning"


class Example(BaseModel):
    input: Input
    actual_output: Output
    expected_output: Optional[str] = None
    context: Optional[List[str]] = None
    retrieval_context: Optional[List[str]] = None
    additional_metadata: Optional[Dict[str, Any]] = None
    tools_called: Optional[List[str]] = None
    expected_tools: Optional[List[str]] = None
    name: Optional[str] = None
    example_id: Optional[str] = None
    timestamp: Optional[str] = None
    trace_id: Optional[str] = None

    def __init__(self, **data):
        super().__init__(**data)
        # Set timestamp if not provided
        if self.timestamp is None:
            self.timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    def to_dict(self):
        return {
            "input": self.input,
            "actual_output": self.actual_output,
            "expected_output": self.expected_output,
            "context": self.context,
            "retrieval_context": self.retrieval_context,
            "additional_metadata": self.additional_metadata,
            "tools_called": self.tools_called,
            "expected_tools": self.expected_tools,
            "name": self.name,
            "example_id": self.example_id,
            "timestamp": self.timestamp,
            "trace_id": self.trace_id
        }

    def __str__(self):
        return (
            f"Example(input={self.input}, "
            f"actual_output={self.actual_output}, "
            f"expected_output={self.expected_output}, "
            f"context={self.context}, "
            f"retrieval_context={self.retrieval_context}, "
            f"additional_metadata={self.additional_metadata}, "
            f"tools_called={self.tools_called}, "
            f"expected_tools={self.expected_tools}, "
            f"name={self.name}, "
            f"example_id={self.example_id}, "
            f"timestamp={self.timestamp}, "
            f"trace_id={self.trace_id})"
        )
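For context, a short sketch of constructing an `Example` and serializing it with the `to_dict` helper above; the field values are illustrative only.

```python
from judgeval.data import Example

example = Example(
    input="Summarize the meeting notes.",
    actual_output="The team agreed to ship v0.0.1 on Friday.",
    expected_output="Team ships v0.0.1 Friday.",
    retrieval_context=["Meeting notes: release scheduled for Friday."],
    name="summarization-smoke-test",
)

# timestamp is auto-filled by __init__ when not supplied
print(example.timestamp)          # e.g. "20240101_120000"
print(example.to_dict()["name"])  # "summarization-smoke-test"
```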
judgeval/data/result.py
ADDED
@@ -0,0 +1,83 @@
from dataclasses import dataclass
from typing import List, Union, Optional

from judgeval.data import ScorerData, ProcessExample

@dataclass
class ScoringResult:
    """
    A ScoringResult contains the output of one or more scorers applied to a single example.
    Ie: One input, one actual_output, one expected_output, etc..., and 1+ scorer (Faithfulness, Hallucination, Summarization, etc...)

    Args:
        success (bool): Whether the evaluation was successful.
            This means that all scorers applied to this example returned a success.
        scorers_data (List[ScorerData]): The scorer data for the evaluated example
        input (Optional[str]): The input to the example
        actual_output (Optional[str]): The actual output of the example
        expected_output (Optional[str]): The expected output of the example
        context (Optional[List[str]]): The context of the example
        retrieval_context (Optional[List[str]]): The retrieval context of the example
        trace_id (Optional[str]): The trace id of the example

    """
    # Fields for scoring outputs
    success: bool  # used for unit testing
    scorers_data: Union[List[ScorerData], None]

    # Inputs from the original example
    input: Optional[str] = None
    actual_output: Optional[str] = None
    expected_output: Optional[str] = None
    context: Optional[List[str]] = None
    retrieval_context: Optional[List[str]] = None
    trace_id: Optional[str] = None

    example_id: Optional[str] = None
    eval_run_name: Optional[str] = None

    def to_dict(self) -> dict:
        """Convert the ScoringResult instance to a dictionary, properly serializing scorer_data."""
        return {
            "success": self.success,
            "scorers_data": [scorer_data.to_dict() for scorer_data in self.scorers_data] if self.scorers_data else None,
            "input": self.input,
            "actual_output": self.actual_output,
            "expected_output": self.expected_output,
            "context": self.context,
            "retrieval_context": self.retrieval_context,
            "trace_id": self.trace_id,
            "example_id": self.example_id
        }

    def __str__(self) -> str:
        return f"ScoringResult(\
            success={self.success}, \
            scorer_data={self.scorers_data}, \
            input={self.input}, \
            actual_output={self.actual_output}, \
            expected_output={self.expected_output}, \
            context={self.context}, \
            retrieval_context={self.retrieval_context}, \
            trace_id={self.trace_id})"


def generate_scoring_result(
    process_example: ProcessExample,
) -> ScoringResult:
    """
    Creates a final ScoringResult object for an evaluation run based on the results from a completed LLMApiTestCase.

    When an LLMTestCase is executed, it turns into an LLMApiTestCase and the progress of the evaluation run is tracked.
    At the end of the evaluation run, we create a TestResult object out of the completed LLMApiTestCase.
    """
    return ScoringResult(
        success=process_example.success,
        scorers_data=process_example.scorers_data,
        input=process_example.input,
        actual_output=process_example.actual_output,
        expected_output=process_example.expected_output,
        context=process_example.context,
        retrieval_context=process_example.retrieval_context,
        trace_id=process_example.trace_id
    )
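A hand-assembled `ScoringResult` for illustration; the `ScorerData` values below are fabricated rather than produced by an actual scorer run.

```python
from judgeval.data import ScorerData
from judgeval.data.result import ScoringResult

scorer_data = ScorerData(
    name="Faithfulness",
    threshold=0.7,
    success=True,
    score=0.92,
    reason="All claims are supported by the retrieval context.",
    evaluation_model="gpt-4o-mini",
)

result = ScoringResult(
    success=True,                 # all scorers passed their thresholds
    scorers_data=[scorer_data],
    input="What is the capital of France?",
    actual_output="Paris",
)

print(result.to_dict()["scorers_data"][0]["score"])  # 0.92
```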
judgeval/data/scorer_data.py
ADDED
@@ -0,0 +1,86 @@
"""
Implementation of the ScorerData class.

ScorerData holds the information related to a single, completed Scorer evaluation run.
"""

from typing import List, Union, Optional, Dict
from pydantic import BaseModel, Field

from judgeval.scorers import CustomScorer

class ScorerData(BaseModel):
    """
    ScorerData holds the information related to a single, completed Scorer evaluation run.

    For example, if running the Judgment Faithfulness scorer on an example, the ScorerData
    object will contain whether the example passed its threshold expectation, as well as more detailed
    information surrounding the evaluation run such as the claims and verdicts generated by the
    judge model(s).
    """
    name: str
    threshold: float
    success: bool
    score: Optional[float] = None
    reason: Optional[str] = None
    strict_mode: Optional[bool] = None
    evaluation_model: Union[List[str], str] = None
    error: Optional[str] = None
    evaluation_cost: Union[float, None] = None
    verbose_logs: Optional[str] = None
    additional_metadata: Optional[Dict] = None

    def to_dict(self) -> dict:
        """Convert the ScorerData instance to a JSON-serializable dictionary."""
        return {
            "name": self.name,
            "threshold": self.threshold,
            "success": self.success,
            "score": self.score,
            "reason": self.reason,
            "strict_mode": self.strict_mode,
            "evaluation_model": self.evaluation_model,
            "error": self.error,
            "evaluation_cost": self.evaluation_cost,
            "verbose_logs": self.verbose_logs,
            "additional_metadata": self.additional_metadata
        }


def create_scorer_data(scorer: CustomScorer) -> ScorerData:
    """
    After a `scorer` is run, it contains information about the example that was evaluated
    using the scorer. For example, after computing Faithfulness, the `scorer` object will contain
    whether the example passed its threshold, the score, the reason for score, etc.

    This function takes an executed `scorer` object and produces a ScorerData object that
    contains the output of the scorer run that can be exported to be logged as a part of
    the ScorerResult.
    """
    if scorer.error is not None:  # error occurred during eval run
        return ScorerData(
            name=scorer.__name__,
            threshold=scorer.threshold,
            score=None,
            reason=None,
            success=False,
            strict_mode=scorer.strict_mode,
            evaluation_model=scorer.evaluation_model,
            error=scorer.error,
            evaluation_cost=scorer.evaluation_cost,
            verbose_logs=scorer.verbose_logs,
        )
    else:  # standard execution, no error
        return ScorerData(
            name=scorer.__name__,
            score=scorer.score,
            threshold=scorer.threshold,
            reason=scorer.reason,
            success=scorer._success_check(),
            strict_mode=scorer.strict_mode,
            evaluation_model=scorer.evaluation_model,
            error=None,
            evaluation_cost=scorer.evaluation_cost,
            verbose_logs=scorer.verbose_logs,
            additional_metadata=scorer.additional_metadata,
        )
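To make explicit which attributes `create_scorer_data` reads off an executed scorer, here is a sketch using a hypothetical stand-in object in place of a real `CustomScorer` (whose API, in judgeval/scorers/custom_scorer.py, is not part of this excerpt).

```python
from types import SimpleNamespace

from judgeval.data.scorer_data import create_scorer_data

# Hypothetical stand-in that mimics the attribute surface accessed above
# after a scorer has finished running; values are made up.
finished_scorer = SimpleNamespace(
    __name__="Faithfulness",
    threshold=0.7,
    score=0.85,
    reason="3/3 claims supported.",
    strict_mode=False,
    evaluation_model="gpt-4o-mini",
    error=None,                      # no error -> takes the "standard execution" branch
    evaluation_cost=0.0012,
    verbose_logs=None,
    additional_metadata={"claims": 3},
    _success_check=lambda: True,
)

scorer_data = create_scorer_data(finished_scorer)
print(scorer_data.success, scorer_data.score)  # True 0.85
```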
judgeval/evaluation_run.py
ADDED
@@ -0,0 +1,130 @@
from typing import List, Optional, Dict, Any, Union
from pydantic import BaseModel, field_validator

from judgeval.data import Example
from judgeval.scorers import CustomScorer, JudgmentScorer
from judgeval.judges import judgevalJudge
from judgeval.constants import ACCEPTABLE_MODELS
from judgeval.common.logger import debug, error


class EvaluationRun(BaseModel):
    """
    Stores example and evaluation scorers together for running an eval task

    Args:
        project_name (str): The name of the project the evaluation results belong to
        eval_name (str): A name for this evaluation run
        examples (List[Example]): The examples to evaluate
        scorers (List[Union[JudgmentScorer, CustomScorer]]): A list of scorers to use for evaluation
        model (str): The model used as a judge when using LLM as a Judge
        aggregator (Optional[str]): The aggregator to use for evaluation if using Mixture of Judges
        metadata (Optional[Dict[str, Any]]): Additional metadata to include for this evaluation run, e.g. comments, dataset name, purpose, etc.
        judgment_api_key (Optional[str]): The API key for running evaluations on the Judgment API
    """

    # The user will specify whether they want log_results when they call run_eval
    log_results: bool = False  # NOTE: log_results has to be set first because it is used to validate project_name and eval_name
    project_name: Optional[str] = None
    eval_name: Optional[str] = None
    examples: List[Example]
    scorers: List[Union[JudgmentScorer, CustomScorer]]
    model: Union[str, List[str], judgevalJudge]
    aggregator: Optional[str] = None
    metadata: Optional[Dict[str, Any]] = None
    # API Key will be "" until user calls client.run_eval(), then API Key will be set
    judgment_api_key: Optional[str] = ""

    def model_dump(self, **kwargs):
        data = super().model_dump(**kwargs)

        data["scorers"] = [
            scorer.to_dict() \
            if hasattr(scorer, "to_dict") else {"score_type": scorer.score_type, "threshold": scorer.threshold}
            for scorer in self.scorers
        ]
        return data

    @field_validator('log_results', mode='before')
    def validate_log_results(cls, v):
        if not isinstance(v, bool):
            raise ValueError(f"log_results must be a boolean. Received {v} of type {type(v)}")
        return v

    @field_validator('project_name')
    def validate_project_name(cls, v, values):
        if values.data.get('log_results', False) and not v:
            debug("No project name provided when log_results is True")
            error("Validation failed: Project name required when logging results")
            raise ValueError("Project name is required when log_results is True. Please include the project_name argument.")
        return v

    @field_validator('eval_name')
    def validate_eval_name(cls, v, values):
        if values.data.get('log_results', False) and not v:
            debug("No eval name provided when log_results is True")
            error("Validation failed: Eval name required when logging results")
            raise ValueError("Eval name is required when log_results is True. Please include the eval_run_name argument.")
        return v

    @field_validator('examples')
    def validate_examples(cls, v):
        if not v:
            raise ValueError("Examples cannot be empty.")
        for ex in v:
            if not isinstance(ex, Example):
                raise ValueError(f"Invalid type for Example: {type(ex)}")
        return v

    @field_validator('scorers')
    def validate_scorers(cls, v):
        if not v:
            raise ValueError("Scorers cannot be empty.")
        for s in v:
            if not isinstance(s, JudgmentScorer) and not isinstance(s, CustomScorer):
                raise ValueError(f"Invalid type for Scorer: {type(s)}")
        return v

    @field_validator('model')
    def validate_model(cls, v, values):
        if not v:
            raise ValueError("Model cannot be empty.")
        # Check if model is a judgevalJudge
        if isinstance(v, judgevalJudge):
            # Verify all scorers are CustomScorer when using judgevalJudge
            scorers = values.data.get('scorers', [])
            if not all(isinstance(s, CustomScorer) for s in scorers):
                raise ValueError("When using a judgevalJudge model, all scorers must be CustomScorer type")
            return v

        # Check if model is string or list of strings
        if isinstance(v, str):
            if v not in ACCEPTABLE_MODELS:
                raise ValueError(f"Model name {v} not recognized.")
            return v

        if isinstance(v, list):
            if not all(isinstance(m, str) for m in v):
                raise ValueError("When providing a list of models, all elements must be strings")
            for m in v:
                if m not in ACCEPTABLE_MODELS:
                    raise ValueError(f"Model name {m} not recognized.")
            return v
        raise ValueError(f"Model must be one of: string, list of strings, or judgevalJudge instance. Received type {type(v)}.")

    @field_validator('aggregator', mode='before')
    def validate_aggregator(cls, v, values):
        model = values.data.get('model')
        if isinstance(model, list) and v is None:
            raise ValueError("Aggregator cannot be empty.")

        if isinstance(model, list) and not isinstance(v, str):
            raise ValueError("Aggregator must be a string if provided.")

        if v is not None and v not in ACCEPTABLE_MODELS:
            raise ValueError(f"Model name {v} not recognized.")

        return v

    class Config:
        arbitrary_types_allowed = True
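A sketch of how the validators above surface errors at construction time. Whether "gpt-4o-mini" appears in `ACCEPTABLE_MODELS` is an assumption (judgeval/constants.py is not shown); the empty `examples` and `scorers` lists are rejected either way, so a `ValidationError` is raised regardless.

```python
from pydantic import ValidationError

from judgeval.evaluation_run import EvaluationRun

try:
    EvaluationRun(
        project_name="demo",
        eval_name="smoke-test",
        log_results=True,
        examples=[],          # rejected by validate_examples: "Examples cannot be empty."
        scorers=[],           # rejected by validate_scorers: "Scorers cannot be empty."
        model="gpt-4o-mini",  # must appear in ACCEPTABLE_MODELS
    )
except ValidationError as e:
    print(e)  # aggregated messages from the field validators above
```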
judgeval/judges/__init__.py
ADDED
@@ -0,0 +1,7 @@
from pydantic import BaseModel
from judgeval.judges.base_judge import judgevalJudge
from judgeval.judges.litellm_judge import LiteLLMJudge
from judgeval.judges.together_judge import TogetherJudge
from judgeval.judges.mixture_of_judges import MixtureOfJudges

__all__ = ["judgevalJudge", "LiteLLMJudge", "TogetherJudge", "MixtureOfJudges"]
judgeval/judges/base_judge.py
ADDED
@@ -0,0 +1,44 @@
"""
Implements the base class for all Judgeval Judge models.
"""

from abc import ABC, abstractmethod
from typing import Optional, List


class judgevalJudge(ABC):
    def __init__(self, model_name: Optional[str] = None, *args, **kwargs):
        self.model_name = model_name
        self.model = self.load_model(*args, **kwargs)

    @abstractmethod
    def load_model(self, *args, **kwargs):
        """Loads a model, that will be responsible for scoring.

        Returns:
            A model object
        """
        pass

    @abstractmethod
    def generate(self, *args, **kwargs) -> str:
        """Runs the model to output LLM response.

        Returns:
            A string.
        """
        pass

    @abstractmethod
    async def a_generate(self, *args, **kwargs) -> str:
        """Runs the model to output LLM response.

        Returns:
            A string.
        """
        pass

    @abstractmethod
    def get_model_name(self, *args, **kwargs) -> str:
        pass
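Because `judgevalJudge` is abstract, a concrete judge must implement all four methods. A minimal toy subclass, purely for illustration:

```python
from judgeval.judges.base_judge import judgevalJudge


class EchoJudge(judgevalJudge):
    """Toy judge that echoes prompts back; shows the required interface only."""

    def load_model(self, *args, **kwargs):
        # Called from judgevalJudge.__init__; returns whatever object does the scoring.
        return None

    def generate(self, prompt: str) -> str:
        return f"echo: {prompt}"

    async def a_generate(self, prompt: str) -> str:
        return f"echo: {prompt}"

    def get_model_name(self) -> str:
        return self.model_name or "echo-judge"


judge = EchoJudge(model_name="echo-judge")
print(judge.generate("Is the sky blue?"))  # "echo: Is the sky blue?"
```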
judgeval/judges/litellm_judge.py
ADDED
@@ -0,0 +1,49 @@
import pydantic
from typing import List, Union, Mapping

from judgeval import *
from judgeval.judges import judgevalJudge
from judgeval.common.utils import afetch_litellm_api_response, fetch_litellm_api_response
from judgeval.common.logger import debug, error

BASE_CONVERSATION = [
    {"role": "system", "content": "You are a helpful assistant."},
]  # for string inputs, we need to add the user query to a base conversation, since LiteLLM only accepts a list of dictionaries as a chat history


class LiteLLMJudge(judgevalJudge):
    def __init__(self, model: str = "gpt-4o-mini", **kwargs):
        debug(f"Initializing LiteLLMJudge with model={model}")
        self.model = model
        self.kwargs = kwargs
        super().__init__(model_name=model)

    def generate(self, input: Union[str, List[Mapping[str, str]]], schema: pydantic.BaseModel = None) -> str:
        debug(f"Generating response for input type: {type(input)}")
        if isinstance(input, str):
            convo = BASE_CONVERSATION + [{"role": "user", "content": input}]
            return fetch_litellm_api_response(model=self.model, messages=convo, response_format=schema)
        elif isinstance(input, list):
            return fetch_litellm_api_response(model=self.model, messages=input, response_format=schema)
        else:
            error(f"Invalid input type received: {type(input)}")
            raise TypeError(f"Input must be a string or a list of dictionaries. Input type of: {type(input)}")

    async def a_generate(self, input: Union[str, List[Mapping[str, str]]], schema: pydantic.BaseModel = None) -> str:
        debug(f"Async generating response for input type: {type(input)}")
        if isinstance(input, str):
            convo = BASE_CONVERSATION + [{"role": "user", "content": input}]
            response = await afetch_litellm_api_response(model=self.model, messages=convo, response_format=schema)
            return response
        elif isinstance(input, list):
            response = await afetch_litellm_api_response(model=self.model, messages=input, response_format=schema)
            return response
        else:
            error(f"Invalid input type received: {type(input)}")
            raise TypeError(f"Input must be a string or a list of dictionaries. Input type of: {type(input)}")

    def load_model(self):
        return self.model

    def get_model_name(self) -> str:
        return self.model