judgeval 0.0.44__py3-none-any.whl → 0.0.46__py3-none-any.whl

This diff compares the contents of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between versions exactly as they appear in their public registries.
Files changed (64)
  1. judgeval/__init__.py +5 -4
  2. judgeval/clients.py +6 -6
  3. judgeval/common/__init__.py +7 -2
  4. judgeval/common/exceptions.py +2 -3
  5. judgeval/common/logger.py +74 -49
  6. judgeval/common/s3_storage.py +30 -23
  7. judgeval/common/tracer.py +1273 -939
  8. judgeval/common/utils.py +416 -244
  9. judgeval/constants.py +73 -61
  10. judgeval/data/__init__.py +1 -1
  11. judgeval/data/custom_example.py +3 -2
  12. judgeval/data/datasets/dataset.py +80 -54
  13. judgeval/data/datasets/eval_dataset_client.py +131 -181
  14. judgeval/data/example.py +67 -43
  15. judgeval/data/result.py +11 -9
  16. judgeval/data/scorer_data.py +4 -2
  17. judgeval/data/tool.py +25 -16
  18. judgeval/data/trace.py +57 -29
  19. judgeval/data/trace_run.py +5 -11
  20. judgeval/evaluation_run.py +22 -82
  21. judgeval/integrations/langgraph.py +546 -184
  22. judgeval/judges/base_judge.py +1 -2
  23. judgeval/judges/litellm_judge.py +33 -11
  24. judgeval/judges/mixture_of_judges.py +128 -78
  25. judgeval/judges/together_judge.py +22 -9
  26. judgeval/judges/utils.py +14 -5
  27. judgeval/judgment_client.py +259 -271
  28. judgeval/rules.py +169 -142
  29. judgeval/run_evaluation.py +462 -305
  30. judgeval/scorers/api_scorer.py +20 -11
  31. judgeval/scorers/exceptions.py +1 -0
  32. judgeval/scorers/judgeval_scorer.py +77 -58
  33. judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +46 -15
  34. judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +3 -2
  35. judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +3 -2
  36. judgeval/scorers/judgeval_scorers/api_scorers/classifier_scorer.py +12 -11
  37. judgeval/scorers/judgeval_scorers/api_scorers/comparison.py +7 -5
  38. judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py +3 -2
  39. judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py +3 -2
  40. judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py +5 -2
  41. judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py +2 -1
  42. judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +17 -8
  43. judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +3 -2
  44. judgeval/scorers/judgeval_scorers/api_scorers/groundedness.py +3 -2
  45. judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +3 -2
  46. judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +3 -2
  47. judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py +8 -9
  48. judgeval/scorers/judgeval_scorers/api_scorers/summarization.py +4 -4
  49. judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +5 -5
  50. judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +5 -2
  51. judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py +9 -10
  52. judgeval/scorers/prompt_scorer.py +48 -37
  53. judgeval/scorers/score.py +86 -53
  54. judgeval/scorers/utils.py +11 -7
  55. judgeval/tracer/__init__.py +1 -1
  56. judgeval/utils/alerts.py +23 -12
  57. judgeval/utils/{data_utils.py → file_utils.py} +5 -9
  58. judgeval/utils/requests.py +29 -0
  59. judgeval/version_check.py +5 -2
  60. {judgeval-0.0.44.dist-info → judgeval-0.0.46.dist-info}/METADATA +79 -135
  61. judgeval-0.0.46.dist-info/RECORD +69 -0
  62. judgeval-0.0.44.dist-info/RECORD +0 -68
  63. {judgeval-0.0.44.dist-info → judgeval-0.0.46.dist-info}/WHEEL +0 -0
  64. {judgeval-0.0.44.dist-info → judgeval-0.0.46.dist-info}/licenses/LICENSE.md +0 -0

judgeval/scorers/judgeval_scorers/api_scorers/groundedness.py
@@ -10,16 +10,17 @@ from judgeval.scorers.api_scorer import APIJudgmentScorer
 from judgeval.constants import APIScorer
 from judgeval.data import ExampleParams
 
+
 class GroundednessScorer(APIJudgmentScorer):
     def __init__(self, threshold: float):
         super().__init__(
-            threshold=threshold,
+            threshold=threshold,
             score_type=APIScorer.GROUNDEDNESS,
             required_params=[
                 ExampleParams.INPUT,
                 ExampleParams.ACTUAL_OUTPUT,
                 ExampleParams.RETRIEVAL_CONTEXT,
-            ]
+            ],
         )
 
     @property

judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py
@@ -10,16 +10,17 @@ from judgeval.scorers.api_scorer import APIJudgmentScorer
 from judgeval.constants import APIScorer
 from judgeval.data import ExampleParams
 
+
 class HallucinationScorer(APIJudgmentScorer):
     def __init__(self, threshold: float):
         super().__init__(
-            threshold=threshold,
+            threshold=threshold,
             score_type=APIScorer.HALLUCINATION,
             required_params=[
                 ExampleParams.INPUT,
                 ExampleParams.ACTUAL_OUTPUT,
                 ExampleParams.CONTEXT,
-            ]
+            ],
         )
 
     @property

judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py
@@ -10,15 +10,16 @@ from judgeval.scorers.api_scorer import APIJudgmentScorer
 from judgeval.constants import APIScorer
 from judgeval.data import ExampleParams
 
+
 class InstructionAdherenceScorer(APIJudgmentScorer):
     def __init__(self, threshold: float):
         super().__init__(
-            threshold=threshold,
+            threshold=threshold,
             score_type=APIScorer.INSTRUCTION_ADHERENCE,
             required_params=[
                 ExampleParams.INPUT,
                 ExampleParams.ACTUAL_OUTPUT,
-            ]
+            ],
         )
 
     @property

judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py
@@ -5,33 +5,32 @@ TODO add link to docs page for this scorer
 
 """
 
-
 # External imports
 from pydantic import BaseModel, Field
+
 # Internal imports
 from judgeval.scorers.api_scorer import APIJudgmentScorer
 from judgeval.constants import APIScorer
 from judgeval.data import ExampleParams
 
+
 class JSONCorrectnessScorer(APIJudgmentScorer):
     json_schema: BaseModel = Field(None, exclude=True)
-
+
     def __init__(self, threshold: float, json_schema: BaseModel):
         super().__init__(
-            threshold=threshold,
+            threshold=threshold,
             score_type=APIScorer.JSON_CORRECTNESS,
             required_params=[
                 ExampleParams.INPUT,
                 ExampleParams.ACTUAL_OUTPUT,
-            ]
+            ],
         )
-        object.__setattr__(self, 'json_schema', json_schema)
-
+        object.__setattr__(self, "json_schema", json_schema)
+
     def to_dict(self):
         base_dict = super().to_dict()  # Get the parent class's dictionary
-        base_dict["kwargs"] = {
-            "json_schema": self.json_schema.model_json_schema()
-        }
+        base_dict["kwargs"] = {"json_schema": self.json_schema.model_json_schema()}
         return base_dict
 
     @property
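
Note: a minimal usage sketch of the reformatted JSONCorrectnessScorer (not part of the diff). The judgeval.scorers import path and passing a Pydantic model class as json_schema are assumptions; to_dict() only needs an object exposing model_json_schema().

# Hypothetical usage sketch; import path and json_schema type are assumptions.
from pydantic import BaseModel
from judgeval.scorers import JSONCorrectnessScorer


class AnswerSchema(BaseModel):  # hypothetical schema for the expected LLM output
    title: str
    score: float


scorer = JSONCorrectnessScorer(threshold=1.0, json_schema=AnswerSchema)
payload = scorer.to_dict()
# payload["kwargs"]["json_schema"] now holds AnswerSchema.model_json_schema()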

judgeval/scorers/judgeval_scorers/api_scorers/summarization.py
@@ -7,21 +7,21 @@ TODO add link to docs page for this scorer
 
 # Internal imports
 from judgeval.scorers.api_scorer import APIJudgmentScorer
-from judgeval.constants import APIScorer
+from judgeval.constants import APIScorer
 from judgeval.data import ExampleParams
 
+
 class SummarizationScorer(APIJudgmentScorer):
     def __init__(self, threshold: float):
         super().__init__(
-            threshold=threshold,
+            threshold=threshold,
             score_type=APIScorer.SUMMARIZATION,
             required_params=[
                 ExampleParams.INPUT,
                 ExampleParams.ACTUAL_OUTPUT,
-            ]
+            ],
         )
 
     @property
     def __name__(self):
         return "Summarization"
-

judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py
@@ -6,13 +6,13 @@
 from judgeval.scorers.api_scorer import APIJudgmentScorer
 from judgeval.constants import APIScorer
 from typing import Optional, Dict
+
+
 class ToolDependencyScorer(APIJudgmentScorer):
     kwargs: Optional[Dict] = None
-    def __init__(self, threshold: float=1.0, enable_param_checking: bool = True):
-        super().__init__(
-            threshold=threshold,
-            score_type=APIScorer.TOOL_DEPENDENCY
-        )
+
+    def __init__(self, threshold: float = 1.0, enable_param_checking: bool = True):
+        super().__init__(threshold=threshold, score_type=APIScorer.TOOL_DEPENDENCY)
         self.kwargs = {"enable_param_checking": enable_param_checking}
 
     @property

judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py
@@ -6,11 +6,14 @@
 from judgeval.scorers.api_scorer import APIJudgmentScorer
 from judgeval.constants import APIScorer
 from typing import Optional, Dict
+
+
 class ToolOrderScorer(APIJudgmentScorer):
     kwargs: Optional[Dict] = None
-    def __init__(self, threshold: float=1.0, exact_match: bool=False):
+
+    def __init__(self, threshold: float = 1.0, exact_match: bool = False):
         super().__init__(
-            threshold=threshold,
+            threshold=threshold,
             score_type=APIScorer.TOOL_ORDER,
         )
         self.kwargs = {"exact_match": exact_match}
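
Note: a usage sketch (not from the diff) of the reworked tool-scorer constructors. Defaults are unchanged and extra options still land on .kwargs; the judgeval.scorers import path is an assumption.

# Sketch only: exercising the reformatted constructors shown above.
from judgeval.scorers import ToolDependencyScorer, ToolOrderScorer

dep_scorer = ToolDependencyScorer()               # threshold defaults to 1.0
order_scorer = ToolOrderScorer(exact_match=True)  # threshold defaults to 1.0

# Extra options are stored on .kwargs for the API payload:
assert dep_scorer.kwargs == {"enable_param_checking": True}
assert order_scorer.kwargs == {"exact_match": True}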

judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py
@@ -4,15 +4,17 @@ ClassifierScorer implementation for basic Text-to-SQL evaluation.
 Takes a natural language query, a corresponding LLM-generated SQL query, and a table schema + (optional) metadata.
 Determines if the LLM-generated SQL query is valid and works for the natural language query.
 """
+
 from judgeval.scorers import ClassifierScorer
 
 Text2SQLScorer = ClassifierScorer(
     name="Text to SQL",
     slug="text2sql-1010101010",
     threshold=1.0,
-    conversation=[{
-        "role": "system",
-        "content": """You will be given a natural language query, a corresponding LLM-generated SQL query, and a table schema + (optional) metadata.
+    conversation=[
+        {
+            "role": "system",
+            "content": """You will be given a natural language query, a corresponding LLM-generated SQL query, and a table schema + (optional) metadata.
 
 ** TASK INSTRUCTIONS **
 Your task is to decide whether the LLM generated SQL query properly filters for what the natural language query is asking, based on the table schema + (optional) metadata.
@@ -44,11 +46,8 @@ LLM generated SQL query:
 
 Table schema:
 {{context}}
-    """
-    }],
-    options={
-        "Y": 1.0,
-        "N": 0.0
-    }
+    """,
+        }
+    ],
+    options={"Y": 1.0, "N": 0.0},
 )
-
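
Note: the restructured ClassifierScorer call above is behaviorally unchanged: a single system message with template slots, plus an options map that converts the judge's "Y"/"N" verdict into a 1.0/0.0 score compared against threshold=1.0. A sketch of the Example such an evaluation would consume (field names follow judgeval.data.Example; how the run is submitted, e.g. via JudgmentClient, is omitted here):

# Illustrative data only; the query, SQL, and schema below are made up.
from judgeval.data import Example

example = Example(
    input="How many orders were placed in 2023?",
    actual_output="SELECT COUNT(*) FROM orders WHERE strftime('%Y', order_date) = '2023';",
    context=["orders(order_id INTEGER, order_date DATE, customer_id INTEGER)"],
)
# Text2SQLScorer's options={"Y": 1.0, "N": 0.0} maps the judge's verdict onto the score.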

judgeval/scorers/prompt_scorer.py
@@ -9,7 +9,7 @@ To implement a subclass of PromptScorer, you need to implement the following met
 - success_check(): determines whether the evaluation was successful
 
 The core idea of PromptScorer is to provide a flexible way to create custom scoring metrics
-by leveraging LLM judges to evaluate examples. The scorer constructs a prompt, sends it to
+by leveraging LLM judges to evaluate examples. The scorer constructs a prompt, sends it to
 the judge, and parses the structured response to determine a score.
 
 For example, the SentimentScorer subclass uses PromptScorer to detect negative sentiment in responses
@@ -26,17 +26,17 @@ NOTE: When implementing build_measure_prompt and build_schema:
 """
 
 from abc import abstractmethod
-from typing import List, Optional, Tuple, Any, Mapping
-from pydantic import BaseModel, model_serializer, Field
+from typing import List, Optional, Tuple, Any
+from pydantic import BaseModel, Field
 
 from judgeval.data import Example
 from judgeval.data.example import ExampleParams
 from judgeval.scorers import JudgevalScorer
 from judgeval.scorers.utils import (
-    scorer_progress_meter,
+    scorer_progress_meter,
     parse_response_json,
     get_or_create_event_loop,
-    create_verbose_logs
+    create_verbose_logs,
 )
 from judgeval.judges import JudgevalJudge
 
@@ -56,10 +56,10 @@ class PromptScorer(JudgevalScorer, BaseModel):
     # DO NOT SET THESE FIELDS MANUALLY, THEY ARE SET BY THE SCORE_EXAMPLE METHOD
     _response: Optional[dict] = None
     _result: Optional[float] = None
-
+
     def __init__(
         self,
-        name: str,
+        name: str,
         threshold: float = 0.5,
         include_reason: bool = True,
         async_mode: bool = True,
@@ -91,10 +91,8 @@ class PromptScorer(JudgevalScorer, BaseModel):
         )
 
     def score_example(
-        self,
-        example: Example,
-        _show_indicator: bool = True
-    ) -> float:
+        self, example: Example, _show_indicator: bool = True
+    ) -> float | None:
         """
         Synchronous method for scoring an example using the prompt criteria.
         """
@@ -104,6 +102,7 @@ class PromptScorer(JudgevalScorer, BaseModel):
                 loop.run_until_complete(
                     self.a_score_example(example, _show_indicator=False)
                 )
+                return self._result
             else:
                 result, reason = self.evaluate(example)
                 self.reason = reason
@@ -117,10 +116,10 @@ class PromptScorer(JudgevalScorer, BaseModel):
             return result
 
     async def a_score_example(
-        self,
-        example: Example,
-        _show_indicator: bool = True,
-    ) -> float:
+        self,
+        example: Example,
+        _show_indicator: bool = True,
+    ) -> float:
         """
         Async method for scoring an example using the prompt criteria.
         """
@@ -135,30 +134,32 @@ class PromptScorer(JudgevalScorer, BaseModel):
                 ],
             )
             return result
-
+
     def evaluate(self, example: Example) -> Tuple[Any, str]:
         """
         Synchronous helper method for evaluating an example using the prompt criteria.
 
-        Builds a custom prompt using `build_measure_prompt` and sends it to the judge model
+        Builds a custom prompt using `build_measure_prompt` and sends it to the judge model
         for evaluation. The result is then parsed as JSON and returned.
 
        NOTE: It is assumed that the model response will be JSON and contain a "score" and "reason" field.
        """
         prompt = self._build_measure_prompt(example)
-        if self.using_native_model:
+        if self.using_native_model and self.model:
             res = self.model.generate(prompt)
             response = parse_response_json(res, self)
             result, reason = self._process_response(response)
             return result, reason
         else:
-            raise NotImplementedError("Non-native judge models are not supported in synchronous mode yet.")
+            raise NotImplementedError(
+                "Non-native judge models are not supported in synchronous mode yet."
+            )
 
     async def a_evaluate(self, example: Example) -> Tuple[Any, str]:
         """
         Asynchronous helper method for evaluating an example using the prompt criteria.
 
-        Builds a custom prompt using `build_measure_prompt` and sends it to the judge model
+        Builds a custom prompt using `build_measure_prompt` and sends it to the judge model
         for evaluation. The result is then parsed as JSON and returned.
 
        NOTE: It is assumed that the model response will be JSON and contain a "score" and "reason" field.
@@ -166,7 +167,7 @@ class PromptScorer(JudgevalScorer, BaseModel):
         judge_prompt = self._build_measure_prompt(example)
         schema = self._build_schema()
         prompt = self._enforce_prompt_format(judge_prompt=judge_prompt, schema=schema)
-        if self.using_native_model:
+        if self.using_native_model and self.model:
             res = await self.model.a_generate(prompt)
             response = parse_response_json(res, self)
             self._response = response
@@ -177,7 +178,9 @@ class PromptScorer(JudgevalScorer, BaseModel):
             self._response = response
             return result, reason
         else:
-            raise NotImplementedError("Non-native judge models are not supported in async mode yet.")
+            raise NotImplementedError(
+                "Non-native judge models are not supported in async mode yet."
+            )
 
     # TODO: can we make this take *args and **kwargs? How does that work with a_evaluate() since we'd have to pass the same args
     @abstractmethod
@@ -190,7 +193,7 @@ class PromptScorer(JudgevalScorer, BaseModel):
 
         The prompt is typically a set of instructions that the judge model uses to evaluate the example.
 
-        This function returns a conversation prompt of the form
+        This function returns a conversation prompt of the form
         [{"role": "system", "content": "..."}, {"role": "user", "content": "..."}]
 
         A basic version of implementing this function could be as follows:
@@ -201,7 +204,7 @@ class PromptScorer(JudgevalScorer, BaseModel):
            ]
         """
         pass
-
+
     # TODO: does this need to take *args and **kwargs? How does that work with a_evaluate() since we'd have to pass the same args
     @abstractmethod
     def _build_schema(self) -> dict:
@@ -214,23 +217,23 @@ class PromptScorer(JudgevalScorer, BaseModel):
            return {"score": int, "reason": str}
         """
         pass
-
+
     def _enforce_prompt_format(self, judge_prompt: List[dict], schema: dict):
         """
         Formats the final prompt to the judge model.
 
-        This function takes a list of dictionaries (`judge_prompt`) and a schema dictionary (`schema`),
-        and appends a schema enforcement prompt to the content of the first dictionary in the list, which is assumed to be the system prompt.
+        This function takes a list of dictionaries (`judge_prompt`) and a schema dictionary (`schema`),
+        and appends a schema enforcement prompt to the content of the first dictionary in the list, which is assumed to be the system prompt.
         The schema enforcement prompt instructs the judge model to provide its response in a specific JSON format.
 
         Args:
-            judge_prompt (List[dict]): A list of dictionaries representing the judge prompt.
+            judge_prompt (List[dict]): A list of dictionaries representing the judge prompt.
                 Each dictionary should contain a "content" key.
-            schema (dict): A dictionary representing the schema. The keys are the expected keys in the response,
+            schema (dict): A dictionary representing the schema. The keys are the expected keys in the response,
                 and the values are the types of the corresponding values.
 
         Returns:
-            List[dict]: The modified judge prompt with the schema enforcement prompt appended to the content
+            List[dict]: The modified judge prompt with the schema enforcement prompt appended to the content
                 of the first dictionary.
 
         Raises:
@@ -242,19 +245,27 @@ class PromptScorer(JudgevalScorer, BaseModel):
            formatted_prompt = format_measure_prompt(judge_prompt, schema)
            # formatted_prompt[0]["content"] will include the schema enforcement prompt
        """
-        SCHEMA_ENFORCEMENT_PROMPT = "\n\nPlease provide your response in the following JSON format: {"
-        if isinstance(judge_prompt, list) and all(isinstance(item, dict) for item in judge_prompt):
+        SCHEMA_ENFORCEMENT_PROMPT = (
+            "\n\nPlease provide your response in the following JSON format: {"
+        )
+        if isinstance(judge_prompt, list) and all(
+            isinstance(item, dict) for item in judge_prompt
+        ):
             # create formatting string for schema enforcement
-            # schema is a map between key and type of the value
+            # schema is a map between key and type of the value
             for key, key_type in schema.items():
                 SCHEMA_ENFORCEMENT_PROMPT += f'"{key}": <{key}> ({key_type.__name__}), '
-            SCHEMA_ENFORCEMENT_PROMPT = SCHEMA_ENFORCEMENT_PROMPT[:-2] + "}"  # remove trailing comma and space
+            SCHEMA_ENFORCEMENT_PROMPT = (
+                SCHEMA_ENFORCEMENT_PROMPT[:-2] + "}"
+            )  # remove trailing comma and space
             judge_prompt[0]["content"] += SCHEMA_ENFORCEMENT_PROMPT
             return judge_prompt
         else:
-            raise TypeError(f"Prompt must be a list of dictionaries. Got {type(judge_prompt)} instead.")
+            raise TypeError(
+                f"Prompt must be a list of dictionaries. Got {type(judge_prompt)} instead."
+            )
 
-    @abstractmethod
+    @abstractmethod
     def _process_response(self, response: dict):
         """
         Customizable method for processing the response from the judge model.
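
Note: for clarity, here is what the rewrapped enforcement logic above produces for the {"score": int, "reason": str} schema used in the docstring (a standalone reproduction of the loop, not new behavior):

# Standalone reproduction of the schema-enforcement suffix built in _enforce_prompt_format.
schema = {"score": int, "reason": str}

suffix = "\n\nPlease provide your response in the following JSON format: {"
for key, key_type in schema.items():
    suffix += f'"{key}": <{key}> ({key_type.__name__}), '
suffix = suffix[:-2] + "}"  # remove trailing comma and space

# suffix ends with:
# {"score": <score> (int), "reason": <reason> (str)}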
@@ -276,7 +287,7 @@ class PromptScorer(JudgevalScorer, BaseModel):
         Determines whether or not the PromptScorer should consider the evaluation of a single example successful.
         """
         pass
-
+
     @property
     def __name__(self):
         return self.name
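
Note: the module docstring above describes subclassing PromptScorer by supplying a prompt builder, a schema, a response processor, and a success check. Below is a minimal sketch along the lines of the SentimentScorer it mentions. The hook names _build_measure_prompt, _build_schema, and _process_response are taken from the hunks above; the success-check method name (_success_check) is an assumption, since this diff only shows its docstring.

# Minimal PromptScorer subclass sketch; _success_check is an assumed method name.
from judgeval.data import Example
from judgeval.scorers.prompt_scorer import PromptScorer


class SentimentScorer(PromptScorer):
    def _build_measure_prompt(self, example: Example) -> list:
        return [
            {"role": "system", "content": "Rate how negative the response is on a 1-5 scale."},
            {"role": "user", "content": f"Response: {example.actual_output}"},
        ]

    def _build_schema(self) -> dict:
        return {"score": int, "reason": str}

    def _process_response(self, response: dict):
        return response["score"], response["reason"]

    def _success_check(self, **kwargs) -> bool:
        return self._result is not None and self._result <= self.threshold


scorer = SentimentScorer(name="Sentiment", threshold=3)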