judgeval 0.0.29__py3-none-any.whl → 0.0.31__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -34,6 +34,7 @@ class JudgevalScorer:
34
34
  async_mode: bool = True # Whether to run the scorer in async mode
35
35
  verbose_mode: bool = True # Whether to run the scorer in verbose mode
36
36
  include_reason: bool = False # Whether to include the reason in the output
37
+ custom_example: bool = False # Whether the scorer corresponds to CustomExamples
37
38
  error: Optional[str] = None # The error message if the scorer failed
38
39
  evaluation_cost: Optional[float] = None # The cost of running the scorer
39
40
  verbose_logs: Optional[str] = None # The verbose logs of the scorer
@@ -52,6 +53,7 @@ class JudgevalScorer:
52
53
  async_mode: bool = True,
53
54
  verbose_mode: bool = True,
54
55
  include_reason: bool = False,
56
+ custom_example: bool = False,
55
57
  error: Optional[str] = None,
56
58
  evaluation_cost: Optional[float] = None,
57
59
  verbose_logs: Optional[str] = None,
@@ -78,6 +80,7 @@ class JudgevalScorer:
78
80
  self.async_mode = async_mode
79
81
  self.verbose_mode = verbose_mode
80
82
  self.include_reason = include_reason
83
+ self.custom_example = custom_example
81
84
  self.error = error
82
85
  self.evaluation_cost = evaluation_cost
83
86
  self.verbose_logs = verbose_logs
@@ -15,6 +15,7 @@ from judgeval.scorers.judgeval_scorers.api_scorers import (
15
15
  ComparisonScorer as APIComparisonScorer,
16
16
  InstructionAdherenceScorer as APIInstructionAdherenceScorer,
17
17
  GroundednessScorer as APIGroundednessScorer,
18
+ DerailmentScorer as APIDerailmentScorer,
18
19
  )
19
20
 
20
21
  from judgeval.scorers.judgeval_scorers.local_implementations import (
@@ -153,6 +154,11 @@ GroundednessScorer = ScorerWrapper(
153
154
  api_implementation=APIGroundednessScorer,
154
155
  )
155
156
 
157
+ DerailmentScorer = ScorerWrapper(
158
+ api_implementation=APIDerailmentScorer,
159
+ local_implementation=LocalInstructionAdherenceScorer # TODO: add local implementation
160
+ )
161
+
156
162
  __all__ = [
157
163
  "ExecutionOrderScorer",
158
164
  "JSONCorrectnessScorer",
@@ -166,4 +172,5 @@ __all__ = [
166
172
  "Text2SQLScorer",
167
173
  "ComparisonScorer",
168
174
  "GroundednessScorer",
175
+ "DerailmentScorer",
169
176
  ]
@@ -11,7 +11,7 @@ from judgeval.scorers.judgeval_scorers.api_scorers.answer_correctness import Ans
11
11
  from judgeval.scorers.judgeval_scorers.api_scorers.comparison import ComparisonScorer
12
12
  from judgeval.scorers.judgeval_scorers.api_scorers.instruction_adherence import InstructionAdherenceScorer
13
13
  from judgeval.scorers.judgeval_scorers.api_scorers.groundedness import GroundednessScorer
14
-
14
+ from judgeval.scorers.judgeval_scorers.api_scorers.derailment_scorer import DerailmentScorer
15
15
  __all__ = [
16
16
  "ExecutionOrderScorer",
17
17
  "JSONCorrectnessScorer",
@@ -26,4 +26,5 @@ __all__ = [
26
26
  "ComparisonScorer",
27
27
  "InstructionAdherenceScorer",
28
28
  "GroundednessScorer",
29
+ "DerailmentScorer",
29
30
  ]
@@ -0,0 +1,21 @@
1
+ """
2
+ `judgeval` derailment scorer
3
+
4
+ TODO add link to docs page for this scorer
5
+
6
+ """
7
+
8
+ # Internal imports
9
+ from judgeval.scorers.api_scorer import APIJudgmentScorer
10
+ from judgeval.constants import APIScorer
11
+
12
class DerailmentScorer(APIJudgmentScorer):
    """Scorer that flags conversation derailment via the Judgment API.

    This is a thin wrapper around `APIJudgmentScorer`: it carries no local
    scoring logic and only pins the score type to `APIScorer.DERAILMENT`.
    """

    def __init__(self, threshold: float = 0.5):
        """
        Args:
            threshold (float): Minimum score for the example to count as
                passing. Defaults to 0.5 so the scorer can be constructed
                without arguments; callers may still pass an explicit value.
        """
        super().__init__(
            threshold=threshold,
            score_type=APIScorer.DERAILMENT,
        )

    @property
    def __name__(self):
        # Human-readable display name used when reporting results.
        return "Derailment"
judgeval/scorers/score.py CHANGED
@@ -11,6 +11,7 @@ from rich.progress import Progress, SpinnerColumn, TextColumn
11
11
 
12
12
  from judgeval.data import (
13
13
  Example,
14
+ CustomExample,
14
15
  ScoringResult,
15
16
  generate_scoring_result,
16
17
  create_scorer_data,
@@ -240,7 +241,7 @@ async def score_with_indicator(
240
241
 
241
242
 
242
243
  async def a_execute_scoring(
243
- examples: List[Example],
244
+ examples: Union[List[Example], List[CustomExample]],
244
245
  scorers: List[JudgevalScorer],
245
246
  model: Optional[Union[str, List[str], JudgevalJudge]] = None,
246
247
  ignore_errors: bool = True,
@@ -256,7 +257,7 @@ async def a_execute_scoring(
256
257
  Each `Example` will be evaluated by all of the `JudgevalScorer`s in the `scorers` list.
257
258
 
258
259
  Args:
259
- examples (List[Example]): A list of `Example` objects to be evaluated.
260
+ examples (Union[List[Example], List[CustomExample]]): A list of `Example` objects to be evaluated.
260
261
  scorers (List[JudgevalScorer]): A list of `JudgevalScorer` objects to evaluate the examples.
261
262
  model (Union[str, List[str], JudgevalJudge]): The model to use for evaluation.
262
263
  ignore_errors (bool): Whether to ignore errors during evaluation.
@@ -313,7 +314,7 @@ async def a_execute_scoring(
313
314
  debug(f"Scorer threshold: {scorer.threshold}")
314
315
  if hasattr(scorer, 'model'):
315
316
  debug(f"Scorer model: {type(scorer.model).__name__}")
316
- if isinstance(ex, Example):
317
+ if isinstance(ex, Example) or isinstance(ex, CustomExample):
317
318
  if len(scorers) == 0:
318
319
  pbar.update(1)
319
320
  continue
@@ -339,7 +340,7 @@ async def a_execute_scoring(
339
340
  await asyncio.gather(*tasks)
340
341
  else:
341
342
  for i, ex in enumerate(examples):
342
- if isinstance(ex, Example):
343
+ if isinstance(ex, Example) or isinstance(ex, CustomExample):
343
344
  if len(scorers) == 0:
344
345
  continue
345
346
 
@@ -366,7 +367,7 @@ async def a_execute_scoring(
366
367
 
367
368
  async def a_eval_examples_helper(
368
369
  scorers: List[JudgevalScorer],
369
- example: Example,
370
+ example: Union[Example, CustomExample],
370
371
  scoring_results: List[ScoringResult],
371
372
  score_index: int,
372
373
  ignore_errors: bool,
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: judgeval
3
- Version: 0.0.29
3
+ Version: 0.0.31
4
4
  Summary: Judgeval Package
5
5
  Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
6
6
  Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues
@@ -1,46 +1,49 @@
1
1
  judgeval/__init__.py,sha256=dtXxsCmI4eEsZdGSUMy8P_pA0bc2-OSGAgb2C__yJoA,252
2
2
  judgeval/clients.py,sha256=6VQmEqmfCngUdS2MuPBIpHvtDFqOENm8-_BmMvjLyRQ,944
3
- judgeval/constants.py,sha256=ksAXhAXovzJKH0uHOdQtREs168uCJRG79PooHNmEbYQ,5313
4
- judgeval/evaluation_run.py,sha256=6Kft3wZDWkdBDZoMwOhWf7zSAOF4naI7Pcg_YlZaZY4,6394
5
- judgeval/judgment_client.py,sha256=uf0V1-eu3qnFTwrQ_Ckcv8IiWRVv7dbvou4P4KjU6hM,26794
3
+ judgeval/constants.py,sha256=XTqijsuuLEhUBXTjzNJVsee5U_Gl14ULLO5uQVW_nEE,5398
4
+ judgeval/evaluation_run.py,sha256=WGzx-Ug2qhSmunFo8NrmSstBRsOUc5KpKq0Lc51rqsM,6739
5
+ judgeval/judgment_client.py,sha256=FncHkjyFx2vfXv4cu4DzbOO0ideHNOWtHVbc8pSXNxk,29754
6
6
  judgeval/rules.py,sha256=B0ZL0pn72D4Jnlr0zMQ6CPHi7D8AQQRariXCVsiCMiI,20542
7
- judgeval/run_evaluation.py,sha256=N2ppmEE5WoSReChKjr_n0NcdAUlUR6Nua7M1C_3zHQ8,24949
7
+ judgeval/run_evaluation.py,sha256=2Mv1iLthJeFQZSVhjLOcJKRZ52Sy6OxLb2KyQ_yVwnA,28484
8
8
  judgeval/common/__init__.py,sha256=7d24BRxtncpMj3AAJCj8RS7TqgjXmW777HVZH6-3sBs,289
9
9
  judgeval/common/exceptions.py,sha256=U-TxHLn7oVMezsMuoYouNDb2XuS8RCggfntYf5_6u4E,565
10
10
  judgeval/common/logger.py,sha256=KO75wWXCxhUHUMvLaTU31ZzOk6tkZBa7heQ7y0f-zFE,6062
11
- judgeval/common/tracer.py,sha256=b-eQyC_MPwMAVQJS6wtVW0_7hzk8tC9EV6NZZoNjWos,58188
11
+ judgeval/common/tracer.py,sha256=9Qga-7rLFlQK-oM5eK1O_8Mn1SewIrPtFwWbSZFtSII,59651
12
12
  judgeval/common/utils.py,sha256=LUQV5JfDr6wj7xHAJoNq-gofNZ6mjXbeKrGKzBME1KM,33533
13
- judgeval/data/__init__.py,sha256=dG5ytBOeOWCTd5o0KP7IblqtW4G1EBaGreLWepM3jas,345
14
- judgeval/data/custom_api_example.py,sha256=uW_ZBzkDLWumtudmfRHAJQkVYpm2qWgcDf7vBNLpS-o,3444
15
- judgeval/data/example.py,sha256=BhGBhamFWgH6wtvrRYM8dGtDfXh-cDxDhtNL5Gbdz_M,5892
16
- judgeval/data/result.py,sha256=BT4f2FF5EFuiRjOmS4vuIXsrEwSlG16Vw3QaWi6PZzc,3122
13
+ judgeval/data/__init__.py,sha256=xuKx_KCVHGp6CXvQuVmKl3v7pJp-qDaz0NccKxwjtO0,481
14
+ judgeval/data/custom_example.py,sha256=QRBqiRiZS8UgVeTRHY0r1Jzm6yAYsyg6zmHxQGxdiQs,739
15
+ judgeval/data/example.py,sha256=cJrmPGLel_P2sy1UaRvuVSAi35EnA9XMR11Lhp4aDLo,5930
16
+ judgeval/data/result.py,sha256=Gb9tiSDsk1amXgh0cFG6JmlW_BMKxS2kuTwNA0rrHjA,3184
17
17
  judgeval/data/scorer_data.py,sha256=JVlaTx1EP2jw2gh3Vgx1CSEsvIFABAN26IquKyxwiJQ,3273
18
+ judgeval/data/sequence.py,sha256=DlQUjyWQJB6iNmiftDZ9N6C-nPtrOC1e0JZ57U00zZk,2387
19
+ judgeval/data/sequence_run.py,sha256=GrnYSZBcZmt4tKQYA_1v09MFB8n3ccrkOJd4qyweHMg,1987
18
20
  judgeval/data/datasets/__init__.py,sha256=IdNKhQv9yYZ_op0rdBacrFaFVmiiYQ3JTzXzxOTsEVQ,176
19
21
  judgeval/data/datasets/dataset.py,sha256=AFYjksV_wXx5CqFYJsl3aN8yZ6hC50O1myRuOJ8s8_E,12867
20
- judgeval/data/datasets/eval_dataset_client.py,sha256=P9fEmcNrjPPaiYbbLiEiBziZrIexA39HN9qzClt6uPE,12691
21
- judgeval/integrations/langgraph.py,sha256=fGDZOTlVbxTO4ErC-m9OSg3h-RkOIIWXCfhjgkKRh4E,11187
22
+ judgeval/data/datasets/eval_dataset_client.py,sha256=xzXlBJRBEEmwsB79_eepm0Da-Bz8yRodX7ttk-u-BxU,14986
23
+ judgeval/integrations/langgraph.py,sha256=J-cQfFP52TjJewdSTe-fcsUC4HDvjNbXoxmbmF0SgiE,11743
22
24
  judgeval/judges/__init__.py,sha256=6X7VSwrwsdxGBNxCyapVRWGghhKOy3MVxFNMQ62kCXM,308
23
25
  judgeval/judges/base_judge.py,sha256=ch_S7uBB7lyv44Lf1d7mIGFpveOO58zOkkpImKgd9_4,994
24
26
  judgeval/judges/litellm_judge.py,sha256=EIL58Teptv8DzZUO3yP2RDQCDq-aoBB6HPZzPdK6KTg,2424
25
27
  judgeval/judges/mixture_of_judges.py,sha256=IJoi4Twk8ze1CJWVEp69k6TSqTCTGrmVYQ0qdffer60,15549
26
28
  judgeval/judges/together_judge.py,sha256=l00hhPerAZXg3oYBd8cyMtWsOTNt_0FIqoxhKJKQe3k,2302
27
29
  judgeval/judges/utils.py,sha256=9lvUxziGV86ISvVFxYBWc09TWFyAQgUTyPf_a9mD5Rs,2686
28
- judgeval/scorers/__init__.py,sha256=gkeKJvjXhswCnkEyjijrVvGVM3Om86egrZ-PUOGvNvI,1158
30
+ judgeval/scorers/__init__.py,sha256=Z_88Sr45gLFAIbMHzG1BF24TUQGCDiuP9QpmVFvSYJM,1204
29
31
  judgeval/scorers/api_scorer.py,sha256=NQ_CrrUPhSUk1k2Q8rKpCG_TU2FT32sFEqvb-Yi54B0,2688
30
32
  judgeval/scorers/base_scorer.py,sha256=xdUlY3CnLdCQ1Z5iUeY22Bim5v-OQruZmaVF_4Y1mC0,2183
31
33
  judgeval/scorers/exceptions.py,sha256=eGW5CuJgZ5YJBFrE4FHDSF651PO1dKAZ379mJ8gOsfo,178
32
- judgeval/scorers/judgeval_scorer.py,sha256=jq_rzfTG0XBTuLCaa6TlaK4YcT-LlgsO1LEm6hpOYdg,6601
34
+ judgeval/scorers/judgeval_scorer.py,sha256=79-JJurqHP-qTaWNWInx4SjvQYwXc9lvfPPNgwsh2yA,6773
33
35
  judgeval/scorers/prompt_scorer.py,sha256=PaAs2qRolw1P3_I061Xvk9qzvF4O-JR8g_39RqXnHcM,17728
34
- judgeval/scorers/score.py,sha256=ObFAlMbNRcGrfBpH4WW_6OA3CjrneC539xSWhGH60GQ,18578
36
+ judgeval/scorers/score.py,sha256=r9QiT4-LIvivcJ6XxByrbswKSO8eQTtAD1UlXT_lcmo,18741
35
37
  judgeval/scorers/utils.py,sha256=iHQVTlIANbmCTXz9kTeSdOytgUZ_T74Re61ajqsk_WQ,6827
36
- judgeval/scorers/judgeval_scorers/__init__.py,sha256=xFRb62sp4JmBUSeuAB_pC_7kEGp-lGdqCRIu9--Bbdg,5992
37
- judgeval/scorers/judgeval_scorers/api_scorers/__init__.py,sha256=mZ6b_5Dl04k3PaG24ICBajB_j43ody1II1OJhO1DkXo,1648
38
+ judgeval/scorers/judgeval_scorers/__init__.py,sha256=kSmQWKeBvLeZMfLYNQSc2qbJYo1MFIQnf3P-D4ltuSM,6232
39
+ judgeval/scorers/judgeval_scorers/api_scorers/__init__.py,sha256=_sDUBxSG536KGqXNi6dFpaYKghjEAadxBxaaxV9HuuE,1764
38
40
  judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py,sha256=Fnd9CVIOZ73sWEWymsU5eBrrZqPFjMZ0BKpeW-PDyTg,711
39
41
  judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py,sha256=oETeN9K0HSIRdL2SDqn82Vskpwh5SlKnZvs5VDm2OBU,658
40
42
  judgeval/scorers/judgeval_scorers/api_scorers/comparison.py,sha256=kuzf9OWvpY38yYSwlBgneLkUZwJNM4FQqvbS66keA90,1249
41
43
  judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py,sha256=tpSuzFAaW8X9xqA0aLLKwh7qmBK0Pc_bJZMIe_q412U,770
42
44
  judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py,sha256=pFVhk4pLtQ-FnNlbI-dFF-SIh69Jza7erHqiPkFWoBo,758
43
45
  judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py,sha256=RQ6DZwEhChfecd89Ey-T7ke--7qTaXZlRsNxwH8gaME,823
46
+ judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py,sha256=V9WPuwNMm097V7IknKs8UkmAk0yjnBXTcJha_BHXxTA,475
44
47
  judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py,sha256=Pb3CiNF2Ca826B92wJCVAi_68lJjLhqqCKwQKaflSUg,1294
45
48
  judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py,sha256=-BwOapqjryYNKNydtdkUiKIij76dY0O1jBmdc6dKazQ,692
46
49
  judgeval/scorers/judgeval_scorers/api_scorers/groundedness.py,sha256=ntEEeTANEOsGlcbiTAF_3r6BeSJEaVDns8po8T0L6Vg,692
@@ -87,7 +90,7 @@ judgeval/scorers/judgeval_scorers/local_implementations/summarization/prompts.py
87
90
  judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarization_scorer.py,sha256=Qk7lwHgRPYeGoxTOyclAh1VfGItfvHJ6l1t7Nk3SWFM,20927
88
91
  judgeval/tracer/__init__.py,sha256=wy3DYpH8U_z0GO_K_gOSkK0tTTD-u5eLDo0T5xIBoAc,147
89
92
  judgeval/utils/alerts.py,sha256=O19Xj7DA0YVjl8PWiuH4zfdZeu3yiLVvHfY8ah2wG0g,2759
90
- judgeval-0.0.29.dist-info/METADATA,sha256=UAJZhCDCKCMOSnvYVZ9M2v3O0QdMErKdYWeN2NBp2_M,5418
91
- judgeval-0.0.29.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
92
- judgeval-0.0.29.dist-info/licenses/LICENSE.md,sha256=tKmCg7k5QOmxPK19XMfzim04QiQJPmgIm0pAn55IJwk,11352
93
- judgeval-0.0.29.dist-info/RECORD,,
93
+ judgeval-0.0.31.dist-info/METADATA,sha256=g9288fIE7NDwXuqUylqCV0mby5hAY7yEztR8TOn5sNk,5418
94
+ judgeval-0.0.31.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
95
+ judgeval-0.0.31.dist-info/licenses/LICENSE.md,sha256=tKmCg7k5QOmxPK19XMfzim04QiQJPmgIm0pAn55IJwk,11352
96
+ judgeval-0.0.31.dist-info/RECORD,,
@@ -1,91 +0,0 @@
1
- from typing import List, Optional, Dict, Any, Union
2
- from pydantic import BaseModel, ConfigDict, model_validator
3
-
4
- from judgeval.data.example import Example
5
- from judgeval.data.custom_example import CustomExample
6
- from judgeval.data.scorer_data import ScorerData
7
- from judgeval.common.logger import debug, error
8
-
9
class ProcessExample(BaseModel):
    """
    Internal bookkeeping model for an `Example` that is being evaluated.

    Tracks the running success flag, the accumulated `ScorerData` produced by
    each scorer, and timing/cost metadata. It is never exposed to callers;
    it only exists while an evaluation is in flight.
    """
    name: str

    # Optional because not every test case in a conversation gets evaluated.
    success: Optional[bool] = None
    scorers_data: Optional[List[ScorerData]] = None
    run_duration: Optional[float] = None
    evaluation_cost: Optional[float] = None

    order: Optional[int] = None
    # These should map 1 to 1 from golden
    additional_metadata: Optional[Dict] = None
    comments: Optional[str] = None
    trace_id: Optional[str] = None
    model_config = ConfigDict(arbitrary_types_allowed=True)

    def update_scorer_data(self, scorer_data: ScorerData) -> None:
        """
        Record one scorer's result on this example.

        Appends `scorer_data` to `scorers_data` and folds its success flag
        into the example-level `success`: once any scorer reports failure,
        the example stays failed.
        """
        debug(f"Updating scorer data for example '{self.name}' with scorer: {scorer_data}")
        if self.scorers_data is None:
            self.scorers_data = []
        self.scorers_data.append(scorer_data)

        if self.success is None:
            # First result seen (e.g. when this entry was a bare message):
            # adopt the scorer's verdict as-is.
            self.success = scorer_data.success
        elif scorer_data.success is False:
            debug(f"Example '{self.name}' marked as failed due to scorer: {scorer_data}")
            self.success = False

    def update_run_duration(self, run_duration: float) -> None:
        """Store the wall-clock duration of this example's evaluation."""
        self.run_duration = run_duration
61
-
62
-
63
def create_process_custom_example(
    example: CustomExample,
) -> ProcessExample:
    """
    Build the `ProcessExample` used to track evaluation progress of a
    `CustomExample` (success flag, per-scorer metadata such as verdicts
    and claims, timing, cost).

    Args:
        example (CustomExample): The example about to be evaluated.

    Returns:
        ProcessExample: A fresh tracker, optimistically marked successful;
        `update_scorer_data` flips it to failed if any scorer fails.
    """
    if example.name is not None:
        name = example.name
    else:
        name = "Test Case Placeholder"
        debug(f"No name provided for example, using default name: {name}")

    debug(f"Creating ProcessExample for: {name}")
    return ProcessExample(
        name=name,
        success=True,
        scorers_data=[],
        run_duration=None,
        evaluation_cost=None,
        order=None,
        additional_metadata=example.additional_metadata,
        trace_id=example.trace_id,
    )