judgeval 0.0.9__py3-none-any.whl → 0.0.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/common/tracer.py +229 -44
- judgeval/constants.py +15 -3
- judgeval/data/datasets/__init__.py +2 -1
- judgeval/data/datasets/dataset.py +1 -122
- judgeval/data/datasets/eval_dataset_client.py +193 -0
- judgeval/data/result.py +16 -1
- judgeval/evaluation_run.py +2 -1
- judgeval/judges/utils.py +14 -2
- judgeval/judgment_client.py +64 -7
- judgeval/run_evaluation.py +19 -0
- judgeval/scorers/judgeval_scorer.py +8 -8
- judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/faithfulness_scorer.py +3 -1
- judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/prompts.py +6 -3
- judgeval/scorers/prompt_scorer.py +2 -2
- judgeval/scorers/score.py +11 -11
- judgeval/scorers/utils.py +3 -3
- judgeval/tracer/__init__.py +3 -0
- {judgeval-0.0.9.dist-info → judgeval-0.0.11.dist-info}/METADATA +5 -4
- {judgeval-0.0.9.dist-info → judgeval-0.0.11.dist-info}/RECORD +21 -19
- {judgeval-0.0.9.dist-info → judgeval-0.0.11.dist-info}/WHEEL +0 -0
- {judgeval-0.0.9.dist-info → judgeval-0.0.11.dist-info}/licenses/LICENSE.md +0 -0
judgeval/data/datasets/eval_dataset_client.py
ADDED
@@ -0,0 +1,193 @@
+
+from typing import Optional
+import requests
+from rich.progress import Progress, SpinnerColumn, TextColumn
+
+from judgeval.common.logger import debug, error, warning, info
+from judgeval.constants import (
+    JUDGMENT_DATASETS_PUSH_API_URL,
+    JUDGMENT_DATASETS_PULL_API_URL,
+    JUDGMENT_DATASETS_PULL_ALL_API_URL
+)
+from judgeval.data import Example
+from judgeval.data.datasets import EvalDataset
+from judgeval.data.datasets.ground_truth import GroundTruthExample
+
+
+
+
+class EvalDatasetClient:
+    def __init__(self, judgment_api_key: str):
+        self.judgment_api_key = judgment_api_key
+
+    def create_dataset(self) -> EvalDataset:
+        return EvalDataset(judgment_api_key=self.judgment_api_key)
+
+    def push(self, dataset: EvalDataset, alias: str, overwrite: Optional[bool] = False) -> bool:
+        debug(f"Pushing dataset with alias '{alias}' (overwrite={overwrite})")
+        if overwrite:
+            warning(f"Overwrite enabled for alias '{alias}'")
+        """
+        Pushes the dataset to Judgment platform
+
+        Mock request:
+        dataset = {
+            "alias": alias,
+            "ground_truths": [...],
+            "examples": [...],
+            "overwrite": overwrite
+        } ==>
+        {
+            "_alias": alias,
+            "_id": "..."  # ID of the dataset
+        }
+        """
+        with Progress(
+            SpinnerColumn(style="rgb(106,0,255)"),
+            TextColumn("[progress.description]{task.description}"),
+            transient=False,
+        ) as progress:
+            task_id = progress.add_task(
+                f"Pushing [rgb(106,0,255)]'{alias}' to Judgment...",
+                total=100,
+            )
+            content = {
+                "alias": alias,
+                "ground_truths": [g.to_dict() for g in dataset.ground_truths],
+                "examples": [e.to_dict() for e in dataset.examples],
+                "overwrite": overwrite,
+                "judgment_api_key": dataset.judgment_api_key
+            }
+            try:
+                response = requests.post(
+                    JUDGMENT_DATASETS_PUSH_API_URL,
+                    json=content
+                )
+                if response.status_code == 500:
+                    error(f"Server error during push: {content.get('message')}")
+                    return False
+                response.raise_for_status()
+            except requests.exceptions.HTTPError as err:
+                if response.status_code == 422:
+                    error(f"Validation error during push: {err.response.json()}")
+                else:
+                    error(f"HTTP error during push: {err}")
+
+            info(f"Successfully pushed dataset with alias '{alias}'")
+            payload = response.json()
+            dataset._alias = payload.get("_alias")
+            dataset._id = payload.get("_id")
+            progress.update(
+                task_id,
+                description=f"{progress.tasks[task_id].description} [rgb(25,227,160)]Done!)",
+            )
+            return True
+
+    def pull(self, alias: str) -> EvalDataset:
+        debug(f"Pulling dataset with alias '{alias}'")
+        """
+        Pulls the dataset from Judgment platform
+
+        Mock request:
+        {
+            "alias": alias,
+            "user_id": user_id
+        }
+        ==>
+        {
+            "ground_truths": [...],
+            "examples": [...],
+            "_alias": alias,
+            "_id": "..."  # ID of the dataset
+        }
+        """
+        # Make a POST request to the Judgment API to get the dataset
+        dataset = self.create_dataset()
+
+        with Progress(
+            SpinnerColumn(style="rgb(106,0,255)"),
+            TextColumn("[progress.description]{task.description}"),
+            transient=False,
+        ) as progress:
+            task_id = progress.add_task(
+                f"Pulling [rgb(106,0,255)]'{alias}'[/rgb(106,0,255)] from Judgment...",
+                total=100,
+            )
+            request_body = {
+                "alias": alias,
+                "judgment_api_key": self.judgment_api_key
+            }
+
+            try:
+                response = requests.post(
+                    JUDGMENT_DATASETS_PULL_API_URL,
+                    json=request_body
+                )
+                response.raise_for_status()
+            except requests.exceptions.RequestException as e:
+                error(f"Error pulling dataset: {str(e)}")
+                raise
+
+            info(f"Successfully pulled dataset with alias '{alias}'")
+            payload = response.json()
+            dataset.ground_truths = [GroundTruthExample(**g) for g in payload.get("ground_truths", [])]
+            dataset.examples = [Example(**e) for e in payload.get("examples", [])]
+            dataset._alias = payload.get("_alias")
+            dataset._id = payload.get("_id")
+            progress.update(
+                task_id,
+                description=f"{progress.tasks[task_id].description} [rgb(25,227,160)]Done!)",
+            )
+
+            return dataset
+
+    def pull_all_user_dataset_stats(self) -> dict:
+        debug(f"Pulling user datasets stats for user_id: {self.judgment_api_key}'")
+        """
+        Pulls the user datasets stats from Judgment platform
+
+        Mock request:
+        {
+            "user_id": user_id
+        }
+        ==>
+        {
+            "test_dataset_1": {"examples_count": len(dataset1.examples), "ground_truths_count": len(dataset1.ground_truths)},
+            "test_dataset_2": {"examples_count": len(dataset2.examples), "ground_truths_count": len(dataset2.ground_truths)},
+            ...
+        }
+        """
+        # Make a POST request to the Judgment API to get the dataset
+
+        with Progress(
+            SpinnerColumn(style="rgb(106,0,255)"),
+            TextColumn("[progress.description]{task.description}"),
+            transient=False,
+        ) as progress:
+            task_id = progress.add_task(
+                f"Pulling [rgb(106,0,255)]' datasets'[/rgb(106,0,255)] from Judgment...",
+                total=100,
+            )
+            request_body = {
+                "judgment_api_key": self.judgment_api_key
+            }
+
+            try:
+                response = requests.post(
+                    JUDGMENT_DATASETS_PULL_ALL_API_URL,
+                    json=request_body
+                )
+                response.raise_for_status()
+            except requests.exceptions.RequestException as e:
+                error(f"Error pulling dataset: {str(e)}")
+                raise
+
+            info(f"Successfully pulled datasets for userid: {self.judgment_api_key}'")
+            payload = response.json()
+
+            progress.update(
+                task_id,
+                description=f"{progress.tasks[task_id].description} [rgb(25,227,160)]Done!)",
+            )
+
+            return payload
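For orientation, here is a minimal usage sketch of the new EvalDatasetClient (not part of the diff; the alias and example fields are illustrative, a valid JUDGMENT_API_KEY is assumed, and examples are assigned as a plain list exactly as pull() does above):

```python
import os

from judgeval.data import Example
from judgeval.data.datasets.eval_dataset_client import EvalDatasetClient

# The client only needs the Judgment API key.
client = EvalDatasetClient(judgment_api_key=os.getenv("JUDGMENT_API_KEY"))

# Build a dataset locally; the Example fields here are placeholders.
dataset = client.create_dataset()
dataset.examples = [Example(input="What is RAG?", actual_output="Retrieval-augmented generation...")]

# push() returns True on success; overwrite controls whether an existing alias is replaced.
client.push(dataset, alias="demo-dataset", overwrite=False)

# pull() rebuilds an EvalDataset from the server payload, including _alias and _id.
restored = client.pull("demo-dataset")
print(len(restored.examples), restored._alias)
```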
judgeval/data/result.py
CHANGED
@@ -1,5 +1,5 @@
 from dataclasses import dataclass
-from typing import List, Union, Optional
+from typing import List, Union, Optional, Dict, Any
 
 from judgeval.data import ScorerData, ProcessExample
 
@@ -18,6 +18,9 @@ class ScoringResult:
         expected_output (Optional[str]): The expected output of the example
         context (Optional[List[str]]): The context of the example
         retrieval_context (Optional[List[str]]): The retrieval context of the example
+        additional_metadata (Optional[Dict[str, Any]]): The additional metadata of the example
+        tools_called (Optional[List[str]]): The tools called by the example
+        expected_tools (Optional[List[str]]): The expected tools of the example
         trace_id (Optional[str]): The trace id of the example
 
     """
@@ -31,6 +34,9 @@ class ScoringResult:
     expected_output: Optional[str] = None
     context: Optional[List[str]] = None
    retrieval_context: Optional[List[str]] = None
+    additional_metadata: Optional[Dict[str, Any]] = None
+    tools_called: Optional[List[str]] = None
+    expected_tools: Optional[List[str]] = None
     trace_id: Optional[str] = None
 
     example_id: Optional[str] = None
@@ -46,6 +52,9 @@ class ScoringResult:
             "expected_output": self.expected_output,
             "context": self.context,
             "retrieval_context": self.retrieval_context,
+            "additional_metadata": self.additional_metadata,
+            "tools_called": self.tools_called,
+            "expected_tools": self.expected_tools,
             "trace_id": self.trace_id,
             "example_id": self.example_id
         }
@@ -59,6 +68,9 @@ class ScoringResult:
             expected_output={self.expected_output}, \
             context={self.context}, \
             retrieval_context={self.retrieval_context}, \
+            additional_metadata={self.additional_metadata}, \
+            tools_called={self.tools_called}, \
+            expected_tools={self.expected_tools}, \
             trace_id={self.trace_id})"
 
 
@@ -79,5 +91,8 @@ def generate_scoring_result(
         expected_output=process_example.expected_output,
         context=process_example.context,
         retrieval_context=process_example.retrieval_context,
+        additional_metadata=process_example.additional_metadata,
+        tools_called=process_example.tools_called,
+        expected_tools=process_example.expected_tools,
         trace_id=process_example.trace_id
     )
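A quick sketch of how the new fields surface on ScoringResult (not part of the diff; it assumes `success` and `scorers_data` are the only required constructor arguments, and the values are illustrative):

```python
from judgeval.data.result import ScoringResult

# Real results are produced by an evaluation run; this constructs one by hand.
result = ScoringResult(
    success=True,
    scorers_data=[],
    tools_called=["web_search"],
    expected_tools=["web_search"],
    additional_metadata={"claims": 3},
)

# The new fields now round-trip through to_dict() alongside the existing ones.
print(result.to_dict()["expected_tools"])
```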
judgeval/evaluation_run.py
CHANGED
@@ -15,7 +15,7 @@ class EvaluationRun(BaseModel):
         project_name (str): The name of the project the evaluation results belong to
         eval_name (str): A name for this evaluation run
         examples (List[Example]): The examples to evaluate
-        scorers (List[Union[JudgmentScorer,
+        scorers (List[Union[JudgmentScorer, JudgevalScorer]]): A list of scorers to use for evaluation
         model (str): The model used as a judge when using LLM as a Judge
         aggregator (Optional[str]): The aggregator to use for evaluation if using Mixture of Judges
         metadata (Optional[Dict[str, Any]]): Additional metadata to include for this evaluation run, e.g. comments, dataset name, purpose, etc.
@@ -33,6 +33,7 @@ class EvaluationRun(BaseModel):
     metadata: Optional[Dict[str, Any]] = None
     # API Key will be "" until user calls client.run_eval(), then API Key will be set
     judgment_api_key: Optional[str] = ""
+    override: Optional[bool] = False
 
     def model_dump(self, **kwargs):
         data = super().model_dump(**kwargs)
judgeval/judges/utils.py
CHANGED
@@ -6,7 +6,7 @@ from typing import Optional, Union, Tuple, List
 
 from judgeval.common.exceptions import InvalidJudgeModelError
 from judgeval.judges import JudgevalJudge, LiteLLMJudge, TogetherJudge, MixtureOfJudges
-from judgeval.constants import TOGETHER_SUPPORTED_MODELS
+from judgeval.constants import TOGETHER_SUPPORTED_MODELS, JUDGMENT_SUPPORTED_MODELS, ACCEPTABLE_MODELS
 
 LITELLM_SUPPORTED_MODELS = set(litellm.model_list)
 
@@ -33,7 +33,13 @@ def create_judge(
     # Either string or List[str]
     if isinstance(model, list):
         for m in model:
-            if m
+            if m in JUDGMENT_SUPPORTED_MODELS:
+                raise NotImplementedError(
+                    """Judgment models are not yet supported for local scoring.
+                    Please either set the `use_judgment` flag to True or use
+                    non-Judgment models."""
+                )
+            if m not in LITELLM_SUPPORTED_MODELS and m not in TOGETHER_SUPPORTED_MODELS:
                 raise InvalidJudgeModelError(f"Invalid judge model chosen: {m}")
         return MixtureOfJudges(models=model), True
     # If model is a string, check that it corresponds to a valid model
@@ -41,5 +47,11 @@
         return LiteLLMJudge(model=model), True
     if model in TOGETHER_SUPPORTED_MODELS:
         return TogetherJudge(model=model), True
+    if model in JUDGMENT_SUPPORTED_MODELS:
+        raise NotImplementedError(
+            """Judgment models are not yet supported for local scoring.
+            Please either set the `use_judgment` flag to True or use
+            non-Judgment models."""
+        )
     else:
         raise InvalidJudgeModelError(f"Invalid judge model chosen: {model}")
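The practical effect of the new guard: asking for a Judgment-hosted model during local judge construction now fails fast. A sketch (the model name is a placeholder, so the call may raise InvalidJudgeModelError instead; both are handled):

```python
from judgeval.common.exceptions import InvalidJudgeModelError
from judgeval.judges.utils import create_judge

try:
    # "judgment-hosted-model" stands in for an entry in JUDGMENT_SUPPORTED_MODELS.
    judge, is_native = create_judge("judgment-hosted-model")
except NotImplementedError:
    print("Judgment models are API-only for now; set use_judgment=True on the client instead.")
except InvalidJudgeModelError as e:
    print(f"Not a recognized judge model: {e}")
```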
judgeval/judgment_client.py
CHANGED
@@ -6,7 +6,7 @@ from typing import Optional, List, Dict, Any, Union
 import requests
 
 from judgeval.constants import ROOT_API
-from judgeval.data.datasets import EvalDataset
+from judgeval.data.datasets import EvalDataset, EvalDatasetClient
 from judgeval.data import (
     ScoringResult,
     Example
@@ -23,7 +23,7 @@ from judgeval.run_evaluation import (
     assert_test
 )
 from judgeval.judges import JudgevalJudge
-from judgeval.constants import JUDGMENT_EVAL_FETCH_API_URL
+from judgeval.constants import JUDGMENT_EVAL_FETCH_API_URL, JUDGMENT_EVAL_DELETE_API_URL, JUDGMENT_EVAL_DELETE_PROJECT_API_URL
 from judgeval.common.exceptions import JudgmentAPIError
 from pydantic import BaseModel
 
@@ -36,6 +36,7 @@ class EvalRunRequestBody(BaseModel):
 class JudgmentClient:
     def __init__(self, judgment_api_key: str = os.getenv("JUDGMENT_API_KEY")):
         self.judgment_api_key = judgment_api_key
+        self.eval_dataset_client = EvalDatasetClient(judgment_api_key)
 
         # Verify API key is valid
         result, response = self._validate_api_key()
@@ -121,7 +122,7 @@ class JudgmentClient:
             raise ValueError(f"Please check your EvaluationRun object, one or more fields are invalid: \n{str(e)}")
 
     def create_dataset(self) -> EvalDataset:
-        return
+        return self.eval_dataset_client.create_dataset()
 
     def push_dataset(self, alias: str, dataset: EvalDataset, overwrite: Optional[bool] = False) -> bool:
         """
@@ -137,7 +138,7 @@ class JudgmentClient:
         """
         # Set judgment_api_key just in case it was not set
         dataset.judgment_api_key = self.judgment_api_key
-        return
+        return self.eval_dataset_client.push(dataset, alias, overwrite)
 
     def pull_dataset(self, alias: str) -> EvalDataset:
         """
@@ -149,9 +150,20 @@ class JudgmentClient:
         Returns:
             EvalDataset: The retrieved dataset
         """
-
-
-
+        return self.eval_dataset_client.pull(alias)
+
+    def pull_all_user_dataset_stats(self) -> dict:
+        """
+        Retrieves all dataset stats from the Judgment platform for the user.
+
+        Args:
+            alias (str): The name of the dataset to retrieve
+
+        Returns:
+            EvalDataset: The retrieved dataset
+        """
+        return self.eval_dataset_client.pull_all_user_dataset_stats()
+
 
     # Maybe add option where you can pass in the EvaluationRun object and it will pull the eval results from the backend
     def pull_eval(self, project_name: str, eval_run_name: str) -> List[Dict[str, Union[str, List[ScoringResult]]]]:
@@ -182,6 +194,51 @@ class JudgmentClient:
         eval_run_result[0]["id"] = result_id
         eval_run_result[0]["results"] = [ScoringResult(**filtered_result)]
         return eval_run_result
+
+    def delete_eval(self, project_name: str, eval_run_name: str) -> bool:
+        """
+        Deletes an evaluation from the server by project and run name.
+
+        Args:
+            project_name (str): Name of the project
+            eval_run_name (str): Name of the evaluation run
+
+        Returns:
+            bool: Whether the evaluation was successfully deleted
+        """
+        eval_run_request_body = EvalRunRequestBody(project_name=project_name,
+                                                   eval_name=eval_run_name,
+                                                   judgment_api_key=self.judgment_api_key)
+        response = requests.delete(JUDGMENT_EVAL_DELETE_API_URL,
+                                   json=eval_run_request_body.model_dump(),
+                                   headers={
+                                       "Content-Type": "application/json",
+                                   })
+        if response.status_code != requests.codes.ok:
+            raise ValueError(f"Error deleting eval results: {response.json()}")
+        return response.json()
+
+    def delete_project_evals(self, project_name: str) -> bool:
+        """
+        Deletes all evaluations from the server for a given project.
+
+        Args:
+            project_name (str): Name of the project
+
+        Returns:
+            bool: Whether the evaluations were successfully deleted
+        """
+        response = requests.delete(JUDGMENT_EVAL_DELETE_PROJECT_API_URL,
+                                   json={
+                                       "project_name": project_name,
+                                       "judgment_api_key": self.judgment_api_key
+                                   },
+                                   headers={
+                                       "Content-Type": "application/json",
+                                   })
+        if response.status_code != requests.codes.ok:
+            raise ValueError(f"Error deleting eval results: {response.json()}")
+        return response.json()
 
     def _validate_api_key(self):
         """
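A usage sketch for the new deletion helpers on JudgmentClient (not part of the diff; project and run names are illustrative, and the constructor reads JUDGMENT_API_KEY from the environment as its default shows):

```python
from judgeval.judgment_client import JudgmentClient

client = JudgmentClient()  # validates the API key against the server on construction

# Remove a single evaluation run by project + run name...
client.delete_eval(project_name="demo-project", eval_run_name="run-2025-01-01")

# ...or every evaluation stored under a project.
client.delete_project_evals(project_name="demo-project")
```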
judgeval/run_evaluation.py
CHANGED
@@ -97,6 +97,13 @@ def merge_results(api_results: List[ScoringResult], local_results: List[ScoringR
             raise ValueError("The API and local results are not aligned.")
         if api_result.retrieval_context != local_result.retrieval_context:
             raise ValueError("The API and local results are not aligned.")
+        if api_result.additional_metadata != local_result.additional_metadata:
+            raise ValueError("The API and local results are not aligned.")
+        if api_result.tools_called != local_result.tools_called:
+            raise ValueError("The API and local results are not aligned.")
+        if api_result.expected_tools != local_result.expected_tools:
+            raise ValueError("The API and local results are not aligned.")
+
 
         # Merge ScorerData from the API and local scorers together
         api_scorer_data = api_result.scorers_data
@@ -254,6 +261,12 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False) -> List[Scor
             debug(f"Context: {example.context}")
         if example.retrieval_context:
             debug(f"Retrieval context: {example.retrieval_context}")
+        if example.additional_metadata:
+            debug(f"Additional metadata: {example.additional_metadata}")
+        if example.tools_called:
+            debug(f"Tools called: {example.tools_called}")
+        if example.expected_tools:
+            debug(f"Expected tools: {example.expected_tools}")
 
     debug(f"Starting evaluation run with {len(evaluation_run.examples)} examples")
 
@@ -379,6 +392,9 @@ def assert_test(scoring_results: List[ScoringResult]) -> None:
                 'expected_output': result.expected_output,
                 'context': result.context,
                 'retrieval_context': result.retrieval_context,
+                'additional_metadata': result.additional_metadata,
+                'tools_called': result.tools_called,
+                'expected_tools': result.expected_tools,
                 'eval_run_name': result.eval_run_name,
                 'failed_scorers': []
             }
@@ -397,6 +413,9 @@ def assert_test(scoring_results: List[ScoringResult]) -> None:
             error_msg += f"Expected Output: {fail_case['expected_output']}\n"
             error_msg += f"Context: {fail_case['context']}\n"
             error_msg += f"Retrieval Context: {fail_case['retrieval_context']}\n"
+            error_msg += f"Additional Metadata: {fail_case['additional_metadata']}\n"
+            error_msg += f"Tools Called: {fail_case['tools_called']}\n"
+            error_msg += f"Expected Tools: {fail_case['expected_tools']}\n"
             error_msg += f"Eval Run Name: {fail_case['eval_run_name']}\n"
 
             for fail_scorer in fail_case['failed_scorers']:
judgeval/scorers/judgeval_scorer.py
CHANGED
@@ -1,5 +1,5 @@
 """
-
+Judgeval Scorer class
 
 Enables client to create custom scorers that do not fall under any of the ready-made Judgment scorers.
 To create a custom scorer, extend this class and implement the `score_example`, `a_score_example`, and `success_check` methods.
@@ -57,12 +57,12 @@ class JudgevalScorer:
         verbose_logs: Optional[str] = None,
         additional_metadata: Optional[Dict] = None
     ):
-        debug(f"Initializing
+        debug(f"Initializing JudgevalScorer with score_type={score_type}, threshold={threshold}")
         if not 0 <= threshold <= 1:
             raise ValueError("Threshold must be between 0 and 1")
         if strict_mode:
             warning("Strict mode enabled - scoring will be more rigorous")
-        info(f"
+        info(f"JudgevalScorer initialized with evaluation_model: {evaluation_model}")
         self.score_type = score_type
         self.threshold = threshold
         self.score = score
@@ -81,7 +81,7 @@ class JudgevalScorer:
 
     def _add_model(self, model: Optional[Union[str, List[str], JudgevalJudge]] = None):
         """
-        Adds the evaluation model to the
+        Adds the evaluation model to the JudgevalScorer instance
 
         This method is used at eval time
         """
@@ -116,10 +116,10 @@ class JudgevalScorer:
         raise NotImplementedError("You must implement the `passes` method in your custom scorer")
 
     def __str__(self):
-        debug("Converting
+        debug("Converting JudgevalScorer instance to string representation")
         if self.error:
-            warning(f"
-            info(f"
+            warning(f"JudgevalScorer contains error: {self.error}")
+        info(f"JudgevalScorer status - success: {self.success}, score: {self.score}")
         attributes = {
             "score_type": self.score_type,
             "threshold": self.threshold,
@@ -137,4 +137,4 @@ class JudgevalScorer:
             "verbose_logs": self.verbose_logs,
             "additional_metadata": self.additional_metadata,
         }
-        return f"
+        return f"JudgevalScorer({attributes})"
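Since the restored docstring points at custom scorers, here is a minimal sketch of one (not part of the diff; the constructor arguments and method signatures are assumptions based on the `__init__` and docstring shown above):

```python
from judgeval.data import Example
from judgeval.scorers.judgeval_scorer import JudgevalScorer


class LengthScorer(JudgevalScorer):
    """Passes when the model output stays under a character budget."""

    def __init__(self, threshold: float = 0.5, max_chars: int = 500):
        # score_type and threshold are forwarded to JudgevalScorer as shown in its __init__ above.
        super().__init__(score_type="Length Check", threshold=threshold)
        self.max_chars = max_chars

    def score_example(self, example: Example) -> float:
        # Binary score: 1.0 when the output fits the budget, 0.0 otherwise.
        self.score = 1.0 if len(example.actual_output or "") <= self.max_chars else 0.0
        self.success = self.score >= self.threshold
        return self.score

    async def a_score_example(self, example: Example) -> float:
        return self.score_example(example)

    def success_check(self) -> bool:
        return bool(self.success)
```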
judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/faithfulness_scorer.py
CHANGED
@@ -2,7 +2,7 @@
 Code for the local implementation of the Faithfulness metric.
 """
 from typing import List, Optional, Union
-
+from pprint import pprint
 from judgeval.constants import APIScorer
 from judgeval.data import (
     Example,
@@ -114,11 +114,13 @@ class FaithfulnessScorer(JudgevalScorer):
         ):
             self.claims = await self._a_generate_claims(example.actual_output)
 
+
             if self.additional_metadata is None:
                 self.additional_metadata = {}
             self.additional_metadata["claims"] = self.claims
 
             self.verdicts = await self._a_generate_verdicts(example.retrieval_context)
+
             self.additional_metadata["verdicts"] = [v.model_dump() for v in self.verdicts]  # Add verdicts generated to metadata
 
             self.score = self._calculate_score()
judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/prompts.py
CHANGED
@@ -129,10 +129,13 @@ JSON:
 def create_verdicts(claims, retrieval_context):
     return f"""==== TASK INSTRUCTIONS ====
 You will be provided with a list of claims from an LLM's output text, accompanied by the retrieval documents that the LLM used to generate the output.
-
-
+I'm pretty sure that many of the claims are factually contradictory to the retrieval context, but I want you to double check that I'm right.
+For each claim, choose one of ("yes", "no", or "idk") to represent whether the claim is correct based on the retrieval context.
+YOU SHOULD be very scrutinous--if any part of the claim is contradicted by the retrieval context, you should choose "no". Think really hard about finding the contradictions, since they can be subtle!
+
+Choose 'no' if the retrieval context CONTRADICTS the claims. YOU SHOULD NEVER USE YOUR PRIOR KNOWLEDGE IN YOUR JUDGMENT.
 Claims made using vague, suggestive, or speculative language such as 'may have', 'possibility due to', do NOT count as a contradiction.
-Claims that are
+Claims that are fuzzy based on lack of information MUST BE ANSWERED with 'idk'.
 
 ==== FORMATTING YOUR ANSWER ====
 Please return your answer in JSON format, with the 'verdicts' key as a list of JSON objects. Each JSON object should have 2 fields: 'verdict' and 'reason'.
judgeval/scorers/prompt_scorer.py
CHANGED
@@ -72,7 +72,7 @@ class PromptScorer(JudgevalScorer, BaseModel):
             strict_mode=strict_mode,
             verbose_mode=verbose_mode,
         )
-        # Then initialize
+        # Then initialize JudgevalScorer
         JudgevalScorer.__init__(
             self,
             score_type=name,
@@ -309,7 +309,7 @@ class ClassifierScorer(PromptScorer):
             strict_mode=strict_mode,
             verbose_mode=verbose_mode,
         )
-        # Then initialize
+        # Then initialize JudgevalScorer
         JudgevalScorer.__init__(
             self,
             score_type=name,
judgeval/scorers/score.py
CHANGED
@@ -1,5 +1,5 @@
 """
-Infrastructure for executing evaluations of `Example`s using one or more `
+Infrastructure for executing evaluations of `Example`s using one or more `JudgevalScorer`s.
 """
 
 
@@ -30,15 +30,15 @@ async def safe_a_score_example(
 ):
     """
     Scoring task function when not using a progress indicator!
-    "Safely" scores an `Example` using a `
+    "Safely" scores an `Example` using a `JudgevalScorer` by gracefully handling any exceptions that may occur.
 
     Args:
-        scorer (
+        scorer (JudgevalScorer): The `JudgevalScorer` to use for scoring the example.
         example (Example): The `Example` to be scored.
 
         ignore_errors (bool): Whether to ignore errors during the evaluation.
                             If set to false, any error will be raised and stop the evaluation.
-                            If set to true, the error will be stored in the `error` attribute of the `
+                            If set to true, the error will be stored in the `error` attribute of the `JudgevalScorer` and the `success` attribute will be set to False.
 
         skip_on_missing_params (bool): Whether to skip the test case if required parameters are missing.
     """
@@ -102,12 +102,12 @@ async def score_task(
     skip_on_missing_params: bool = True,
 ):
     """
-    Task function for asynchronously measuring a given example using a
+    Task function for asynchronously measuring a given example using a JudgevalScorer.
 
     Args:
         task_id (int): The ID of the task being measured.
         progress (Progress): An instance of the Progress class to track task progress.
-        scorer (
+        scorer (JudgevalScorer): An instance of the JudgevalScorer class used to score the example.
         example (Example): The example to be scored.
         ignore_errors (bool, optional): Whether to ignore errors during scoring. Defaults to True.
         skip_on_missing_params (bool, optional): Whether to skip scoring if there are missing parameters. Defaults to True.
@@ -189,10 +189,10 @@ async def score_with_indicator(
     show_indicator: bool,
 ):
     """
-    Scores an example using a list of
+    Scores an example using a list of JudgevalScorers, optionally displaying a progress indicator.
 
     Args:
-        scorers (List[
+        scorers (List[JudgevalScorer]): A list of JudgevalScorer objects to evaluate the example.
         example (Example): The example to be scored.
         ignore_errors (bool): If True, errors during scoring will be ignored.
         skip_on_missing_params (bool): If True, scoring will be skipped if required parameters are missing.
@@ -253,8 +253,8 @@ async def a_execute_scoring(
     _use_bar_indicator: bool = True,
 ) -> List[ScoringResult]:
     """
-    Executes evaluations of `Example`s asynchronously using one or more `
-    Each `Example` will be evaluated by all of the `
+    Executes evaluations of `Example`s asynchronously using one or more `JudgevalScorer`s.
+    Each `Example` will be evaluated by all of the `JudgevalScorer`s in the `scorers` list.
 
     Args:
         examples (List[Example]): A list of `Example` objects to be evaluated.
@@ -379,7 +379,7 @@ async def a_eval_examples_helper(
     Evaluate a single example asynchronously using a list of scorers.
 
     Args:
-        scorers (List[
+        scorers (List[JudgevalScorer]): List of JudgevalScorer objects to evaluate the example.
         example (Example): The example to be evaluated.
         scoring_results (List[ScoringResult]): List to store the scoring results.
         score_index (int): Index at which the result should be stored in scoring_results.
|