judgeval 0.0.20__py3-none-any.whl → 0.0.22__py3-none-any.whl

This diff compares publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as they appear in their respective public registries.
Files changed (28)
  1. judgeval/common/tracer.py +41 -2
  2. judgeval/constants.py +1 -0
  3. judgeval/data/__init__.py +2 -3
  4. judgeval/data/custom_example.py +98 -0
  5. judgeval/data/datasets/dataset.py +17 -124
  6. judgeval/data/datasets/eval_dataset_client.py +5 -11
  7. judgeval/data/datasets/utils.py +0 -73
  8. judgeval/data/ground_truth.py +0 -54
  9. judgeval/judgment_client.py +23 -7
  10. judgeval/run_evaluation.py +62 -8
  11. judgeval/scorers/api_scorer.py +3 -1
  12. judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +10 -2
  13. judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +9 -2
  14. judgeval/scorers/judgeval_scorers/api_scorers/comparison.py +10 -2
  15. judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py +11 -2
  16. judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py +11 -2
  17. judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py +10 -3
  18. judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +10 -2
  19. judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +10 -2
  20. judgeval/scorers/judgeval_scorers/api_scorers/groundedness.py +10 -2
  21. judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +10 -2
  22. judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +9 -2
  23. judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py +9 -2
  24. judgeval/scorers/judgeval_scorers/api_scorers/summarization.py +10 -3
  25. {judgeval-0.0.20.dist-info → judgeval-0.0.22.dist-info}/METADATA +7 -3
  26. {judgeval-0.0.20.dist-info → judgeval-0.0.22.dist-info}/RECORD +28 -27
  27. {judgeval-0.0.20.dist-info → judgeval-0.0.22.dist-info}/WHEEL +0 -0
  28. {judgeval-0.0.20.dist-info → judgeval-0.0.22.dist-info}/licenses/LICENSE.md +0 -0
judgeval/judgment_client.py

@@ -10,7 +10,6 @@ from judgeval.data.datasets import EvalDataset, EvalDatasetClient
 from judgeval.data import (
     ScoringResult,
     Example,
-    GroundTruthExample
 )
 from judgeval.scorers import (
     APIJudgmentScorer,
@@ -27,7 +26,8 @@ from judgeval.judges import JudgevalJudge
 from judgeval.constants import (
     JUDGMENT_EVAL_FETCH_API_URL,
     JUDGMENT_EVAL_DELETE_API_URL,
-    JUDGMENT_EVAL_DELETE_PROJECT_API_URL
+    JUDGMENT_EVAL_DELETE_PROJECT_API_URL,
+    JUDGMENT_PROJECT_DELETE_API_URL
 )
 from judgeval.common.exceptions import JudgmentAPIError
 from pydantic import BaseModel
@@ -156,7 +156,7 @@ class JudgmentClient:
         metadata: Optional[Dict[str, Any]] = None,
         project_name: str = "",
         eval_run_name: str = "",
-        log_results: bool = False,
+        log_results: bool = True,
         use_judgment: bool = True,
         rules: Optional[List[Rule]] = None
     ) -> List[ScoringResult]:
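Note the flipped default: evaluation runs are now logged to the Judgment platform unless the caller opts out. A minimal usage sketch, assuming the parameter list above belongs to JudgmentClient.run_evaluation and that examples, scorers, and model are its remaining keyword arguments (the Example constructor fields and the top-level scorer re-export are also assumptions):

from judgeval.judgment_client import JudgmentClient   # module path taken from the file list
from judgeval.data import Example
from judgeval.scorers import AnswerRelevancyScorer    # top-level re-export assumed

client = JudgmentClient()  # assumed to pick up credentials from the environment
results = client.run_evaluation(        # method name assumed; not shown in the hunk above
    examples=[Example(input="What is 2 + 2?", actual_output="4")],  # constructor fields assumed
    scorers=[AnswerRelevancyScorer(threshold=0.5)],
    model="gpt-4o",                     # placeholder model name
    project_name="demo_project",
    eval_run_name="demo_run",
    # log_results now defaults to True; pass log_results=False to skip logging.
)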
@@ -282,11 +282,11 @@ class JudgmentClient:
         """
         return self.eval_dataset_client.pull_all_user_dataset_stats()

-    def edit_dataset(self, alias: str, examples: List[Example], ground_truths: List[GroundTruthExample]) -> bool:
+    def edit_dataset(self, alias: str, examples: List[Example]) -> bool:
         """
-        Edits the dataset on Judgment platform by adding new examples and ground truths
+        Edits the dataset on Judgment platform by adding new examples
         """
-        return self.eval_dataset_client.edit_dataset(alias, examples, ground_truths)
+        return self.eval_dataset_client.edit_dataset(alias, examples)

     # Maybe add option where you can pass in the EvaluationRun object and it will pull the eval results from the backend
     def pull_eval(self, project_name: str, eval_run_name: str) -> List[Dict[str, Union[str, List[ScoringResult]]]]:
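The dataset edit path drops the separate ground-truth list; examples are now the only payload. A sketch of the updated call against the signature shown above (client construction and the Example constructor fields are assumptions):

from judgeval.judgment_client import JudgmentClient   # module path taken from the file list
from judgeval.data import Example

client = JudgmentClient()  # assumed to pick up credentials from the environment
new_examples = [
    Example(input="Summarize the ticket", actual_output="Customer reports a login failure"),  # fields assumed
]
# ground_truths is no longer accepted; examples are the only content passed through.
client.edit_dataset(alias="support_tickets", examples=new_examples)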
@@ -362,7 +362,6 @@ class JudgmentClient:
         response = requests.delete(JUDGMENT_EVAL_DELETE_PROJECT_API_URL,
                                    json={
                                        "project_name": project_name,
-                                       "judgment_api_key": self.judgment_api_key,
                                    },
                                    headers={
                                        "Content-Type": "application/json",
@@ -372,6 +371,23 @@ class JudgmentClient:
         if response.status_code != requests.codes.ok:
             raise ValueError(f"Error deleting eval results: {response.json()}")
         return response.json()
+
+    def delete_project(self, project_name: str) -> bool:
+        """
+        Deletes a project from the server. Which also deletes all evaluations and traces associated with the project.
+        """
+        response = requests.delete(JUDGMENT_PROJECT_DELETE_API_URL,
+                                   json={
+                                       "project_name": project_name,
+                                   },
+                                   headers={
+                                       "Content-Type": "application/json",
+                                       "Authorization": f"Bearer {self.judgment_api_key}",
+                                       "X-Organization-Id": self.organization_id
+                                   })
+        if response.status_code != requests.codes.ok:
+            raise ValueError(f"Error deleting project: {response.json()}")
+        return response.json()

     def _validate_api_key(self):
         """
judgeval/run_evaluation.py

@@ -1,12 +1,17 @@
 import asyncio
 import requests
-from typing import List, Dict
+import time
+import sys
+import itertools
+import threading
+from typing import List, Dict, Any
 from datetime import datetime
 from rich import print as rprint

 from judgeval.data import (
     ScorerData,
-    ScoringResult
+    ScoringResult,
+    Example
 )
 from judgeval.scorers import (
     JudgevalScorer,
@@ -14,7 +19,6 @@ from judgeval.scorers import (
     ClassifierScorer
 )
 from judgeval.scorers.score import a_execute_scoring
-
 from judgeval.constants import (
     ROOT_API,
     JUDGMENT_EVAL_API_URL,
@@ -185,7 +189,7 @@ def check_eval_run_name_exists(eval_name: str, project_name: str, judgment_api_k
         raise JudgmentAPIError(f"Failed to check if eval run name exists: {str(e)}")


-def log_evaluation_results(merged_results: List[ScoringResult], evaluation_run: EvaluationRun) -> None:
+def log_evaluation_results(merged_results: List[ScoringResult], evaluation_run: EvaluationRun) -> str:
     """
     Logs evaluation results to the Judgment API database.

@@ -220,7 +224,9 @@ def log_evaluation_results(merged_results: List[ScoringResult], evaluation_run:
             raise JudgmentAPIError(error_message)

         if "ui_results_url" in res.json():
-            rprint(f"\n🔍 You can view your evaluation results here: [rgb(106,0,255)]{res.json()['ui_results_url']}[/]\n")
+            url = res.json()['ui_results_url']
+            pretty_str = f"\n🔍 You can view your evaluation results here: [rgb(106,0,255)][link={url}]View Results[/link]\n"
+            return pretty_str

     except requests.exceptions.RequestException as e:
         error(f"Request failed while saving evaluation results to DB: {str(e)}")
@@ -229,6 +235,51 @@ def log_evaluation_results(merged_results: List[ScoringResult], evaluation_run:
         error(f"Failed to save evaluation results to DB: {str(e)}")
         raise ValueError(f"Failed to save evaluation results to DB: {str(e)}")

+def run_with_spinner(message: str, func, *args, **kwargs) -> Any:
+    """Run a function with a spinner in the terminal."""
+    spinner = itertools.cycle(['|', '/', '-', '\\'])
+
+    def display_spinner():
+        while not stop_spinner_event.is_set():
+            sys.stdout.write(f'\r{message}{next(spinner)}')
+            sys.stdout.flush()
+            time.sleep(0.1)
+
+    stop_spinner_event = threading.Event()
+    spinner_thread = threading.Thread(target=display_spinner)
+    spinner_thread.start()
+
+    try:
+        result = func(*args, **kwargs)
+    except Exception as e:
+        error(f"An error occurred: {str(e)}")
+        stop_spinner_event.set()
+        spinner_thread.join()
+        raise e
+    finally:
+        stop_spinner_event.set()
+        spinner_thread.join()
+
+    sys.stdout.write('\r' + ' ' * (len(message) + 1) + '\r')
+    sys.stdout.flush()
+
+    return result
+
+def check_examples(examples: List[Example], scorers: List[APIJudgmentScorer]) -> None:
+    """
+    Checks if the example contains the necessary parameters for the scorer.
+    """
+    for scorer in scorers:
+        if isinstance(scorer, APIJudgmentScorer):
+            for example in examples:
+                missing_params = []
+                for param in scorer.required_params:
+                    if getattr(example, param.value) is None:
+                        missing_params.append(f"'{param.value}'")
+                if missing_params:
+                    # We do this because we want to inform users that an example is missing parameters for a scorer
+                    # Example ID (usually random UUID) does not provide any helpful information for the user but printing the entire example is overdoing it
+                    print(f"WARNING: Example {example.example_id} is missing the following parameters: {missing_params} for scorer {scorer.score_type.value}")


 def run_eval(evaluation_run: EvaluationRun, override: bool = False) -> List[ScoringResult]:
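Both new helpers are plain module-level functions, so they can be exercised on their own. A minimal sketch of run_with_spinner, with slow_task standing in for a long call such as execute_api_eval (the import path follows the file list above):

import time
from judgeval.run_evaluation import run_with_spinner  # module path taken from the file list

def slow_task(n: int) -> int:
    # Stand-in for a long-running call; the spinner animates on stdout while this sleeps.
    time.sleep(2)
    return n * 2

result = run_with_spinner("Running Evaluation: ", slow_task, 21)
print(result)  # 42, printed after the spinner line has been cleared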
@@ -253,7 +304,7 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False) -> List[Scor
     Returns:
         List[ScoringResult]: The results of the evaluation. Each result is a dictionary containing the fields of a `ScoringResult` object.
     """
-
+    
     # Call endpoint to check to see if eval run name exists (if we DON'T want to override and DO want to log results)
     if not override and evaluation_run.log_results:
         check_eval_run_name_exists(
@@ -306,6 +357,7 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False) -> List[Scor

     # Execute evaluation using Judgment API
     if judgment_scorers:
+        check_examples(evaluation_run.examples, evaluation_run.scorers)
         info("Starting API evaluation")
         debug(f"Creating API evaluation run with {len(judgment_scorers)} scorers")
         try: # execute an EvaluationRun with just JudgmentScorers
@@ -323,7 +375,7 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False) -> List[Scor
                 rules=evaluation_run.rules
             )
             debug("Sending request to Judgment API")
-            response_data: List[Dict] = execute_api_eval(api_evaluation_run) # Dicts are `ScoringResult` objs
+            response_data: List[Dict] = run_with_spinner("Running Evaluation: ", execute_api_eval, api_evaluation_run)
             info(f"Received {len(response_data['results'])} results from API")
         except JudgmentAPIError as e:
             error(f"An error occurred while executing the Judgment API request: {str(e)}")
@@ -352,6 +404,7 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False) -> List[Scor
             api_results.append(ScoringResult(**filtered_result))
     # Run local evals
     if local_scorers: # List[JudgevalScorer]
+        # We should be removing local scorers soon
         info("Starting local evaluation")
         for example in evaluation_run.examples:
             with example_logging_context(example.timestamp, example.example_id):
@@ -389,7 +442,8 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False) -> List[Scor
     # )

     if evaluation_run.log_results:
-        log_evaluation_results(merged_results, evaluation_run)
+        pretty_str = run_with_spinner("Logging Results: ", log_evaluation_results, merged_results, evaluation_run)
+        rprint(pretty_str)

     for i, result in enumerate(merged_results):
         if not result.scorers_data: # none of the scorers could be executed on this example
judgeval/scorers/api_scorer.py

@@ -5,8 +5,9 @@ Scores `Example`s using ready-made Judgment evaluators.
 """

 from pydantic import BaseModel, field_validator
+from typing import List
 from judgeval.common.logger import debug, info, warning, error
-
+from judgeval.data import ExampleParams
 from judgeval.constants import APIScorer, UNBOUNDED_SCORERS


@@ -20,6 +21,7 @@ class APIJudgmentScorer(BaseModel):
     """
     score_type: APIScorer
     threshold: float
+    required_params: List[ExampleParams] = [] # List of the required parameters on examples for the scorer

     @field_validator('threshold')
     def validate_threshold(cls, v, info):
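required_params is the field that check_examples in run_evaluation.py reads to warn about under-specified examples before an API run. A sketch of that interaction (the Example constructor fields are an assumption; the enum members and imports are the ones used elsewhere in this diff):

from judgeval.constants import APIScorer
from judgeval.data import Example, ExampleParams
from judgeval.scorers.api_scorer import APIJudgmentScorer
from judgeval.run_evaluation import check_examples

# A bare API scorer declaring which Example fields it needs.
scorer = APIJudgmentScorer(
    score_type=APIScorer.FAITHFULNESS,
    threshold=0.7,
    required_params=[
        ExampleParams.INPUT,
        ExampleParams.ACTUAL_OUTPUT,
        ExampleParams.RETRIEVAL_CONTEXT,
    ],
)

# retrieval_context is omitted, so check_examples prints a WARNING naming the
# example id, the missing parameter, and the scorer's score_type.
example = Example(input="What is the capital of France?", actual_output="Paris")  # fields assumed
check_examples([example], [scorer])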
judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py

@@ -8,11 +8,19 @@ TODO add link to docs page for this scorer
 # Internal imports
 from judgeval.scorers.api_scorer import APIJudgmentScorer
 from judgeval.constants import APIScorer
-
+from judgeval.data import ExampleParams

 class AnswerCorrectnessScorer(APIJudgmentScorer):
     def __init__(self, threshold: float):
-        super().__init__(threshold=threshold, score_type=APIScorer.ANSWER_CORRECTNESS)
+        super().__init__(
+            threshold=threshold,
+            score_type=APIScorer.ANSWER_CORRECTNESS,
+            required_params=[
+                ExampleParams.INPUT,
+                ExampleParams.ACTUAL_OUTPUT,
+                ExampleParams.EXPECTED_OUTPUT,
+            ]
+        )

     @property
     def __name__(self):
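For callers, the practical effect is that an Example scored with AnswerCorrectnessScorer should carry all three declared fields, otherwise the new check_examples pass warns before the run. A sketch (Example constructor fields assumed; the scorer import path follows the file list):

from judgeval.data import Example
from judgeval.scorers.judgeval_scorers.api_scorers.answer_correctness import AnswerCorrectnessScorer

scorer = AnswerCorrectnessScorer(threshold=0.8)
example = Example(
    input="Who wrote The Old Man and the Sea?",
    actual_output="Ernest Hemingway wrote it.",
    expected_output="Ernest Hemingway",  # required by this scorer alongside input and actual_output
)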
judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py

@@ -8,11 +8,18 @@ TODO add link to docs page for this scorer
 # Internal imports
 from judgeval.scorers.api_scorer import APIJudgmentScorer
 from judgeval.constants import APIScorer
-
+from judgeval.data import ExampleParams

 class AnswerRelevancyScorer(APIJudgmentScorer):
     def __init__(self, threshold: float):
-        super().__init__(threshold=threshold, score_type=APIScorer.ANSWER_RELEVANCY)
+        super().__init__(
+            threshold=threshold,
+            score_type=APIScorer.ANSWER_RELEVANCY,
+            required_params=[
+                ExampleParams.INPUT,
+                ExampleParams.ACTUAL_OUTPUT,
+            ]
+        )

     @property
     def __name__(self):
judgeval/scorers/judgeval_scorers/api_scorers/comparison.py

@@ -9,12 +9,20 @@ TODO add link to docs page for this scorer
 from judgeval.scorers.api_scorer import APIJudgmentScorer
 from judgeval.constants import APIScorer
 from typing import Optional, Dict
-
+from judgeval.data import ExampleParams
 class ComparisonScorer(APIJudgmentScorer):
     kwargs: Optional[Dict] = None

     def __init__(self, threshold: float, criteria: str, description: str):
-        super().__init__(threshold=threshold, score_type=APIScorer.COMPARISON)
+        super().__init__(
+            threshold=threshold,
+            score_type=APIScorer.COMPARISON,
+            required_params=[
+                ExampleParams.INPUT,
+                ExampleParams.ACTUAL_OUTPUT,
+                ExampleParams.EXPECTED_OUTPUT,
+            ]
+        )
         self.kwargs = {"criteria": criteria, "description": description}

     @property
judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py

@@ -8,11 +8,20 @@ TODO add link to docs page for this scorer
 # Internal imports
 from judgeval.scorers.api_scorer import APIJudgmentScorer
 from judgeval.constants import APIScorer
-
+from judgeval.data import ExampleParams

 class ContextualPrecisionScorer(APIJudgmentScorer):
     def __init__(self, threshold: float):
-        super().__init__(threshold=threshold, score_type=APIScorer.CONTEXTUAL_PRECISION)
+        super().__init__(
+            threshold=threshold,
+            score_type=APIScorer.CONTEXTUAL_PRECISION,
+            required_params=[
+                ExampleParams.INPUT,
+                ExampleParams.ACTUAL_OUTPUT,
+                ExampleParams.RETRIEVAL_CONTEXT,
+                ExampleParams.EXPECTED_OUTPUT,
+            ]
+        )

     @property
     def __name__(self):
judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py

@@ -8,12 +8,21 @@ TODO add link to docs page for this scorer
 # Internal imports
 from judgeval.scorers.api_scorer import APIJudgmentScorer
 from judgeval.constants import APIScorer
+from judgeval.data import ExampleParams


 class ContextualRecallScorer(APIJudgmentScorer):
     def __init__(self, threshold: float):
-        super().__init__(threshold=threshold, score_type=APIScorer.CONTEXTUAL_RECALL)
-
+        super().__init__(
+            threshold=threshold,
+            score_type=APIScorer.CONTEXTUAL_RECALL,
+            required_params=[
+                ExampleParams.INPUT,
+                ExampleParams.ACTUAL_OUTPUT,
+                ExampleParams.EXPECTED_OUTPUT,
+                ExampleParams.RETRIEVAL_CONTEXT,
+            ]
+        )
     @property
     def __name__(self):
         return "Contextual Recall"
judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py

@@ -8,15 +8,22 @@ TODO add link to docs page for this scorer
 # Internal imports
 from judgeval.scorers.api_scorer import APIJudgmentScorer
 from judgeval.constants import APIScorer
-
+from judgeval.data import ExampleParams

 class ContextualRelevancyScorer(APIJudgmentScorer):
     """
     Scorer that checks if the output of a model is relevant to the retrieval context
     """
     def __init__(self, threshold: float):
-        super().__init__(threshold=threshold, score_type=APIScorer.CONTEXTUAL_RELEVANCY)
-
+        super().__init__(
+            threshold=threshold,
+            score_type=APIScorer.CONTEXTUAL_RELEVANCY,
+            required_params=[
+                ExampleParams.INPUT,
+                ExampleParams.ACTUAL_OUTPUT,
+                ExampleParams.RETRIEVAL_CONTEXT,
+            ]
+        )
     @property
     def __name__(self):
         return "Contextual Relevancy"
judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py

@@ -8,13 +8,21 @@ TODO add link to docs page for this scorer
 # Internal imports
 from judgeval.scorers.api_scorer import APIJudgmentScorer
 from judgeval.constants import APIScorer
-from typing import Optional, Dict
+from typing import Optional, Dict, List
+from judgeval.data import ExampleParams

 class ExecutionOrderScorer(APIJudgmentScorer):
     kwargs: Optional[Dict] = None

     def __init__(self, threshold: float, should_exact_match: bool = False, should_consider_ordering: bool = False):
-        super().__init__(threshold=threshold, score_type=APIScorer.EXECUTION_ORDER)
+        super().__init__(
+            threshold=threshold,
+            score_type=APIScorer.EXECUTION_ORDER,
+            required_params=[
+                ExampleParams.ACTUAL_OUTPUT,
+                ExampleParams.EXPECTED_OUTPUT,
+            ]
+        )
         self.kwargs = {"should_exact_match": should_exact_match, "should_consider_ordering": should_consider_ordering}

     @property
judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py

@@ -8,11 +8,19 @@ TODO add link to docs page for this scorer
 # Internal imports
 from judgeval.scorers.api_scorer import APIJudgmentScorer
 from judgeval.constants import APIScorer
-
+from judgeval.data import ExampleParams

 class FaithfulnessScorer(APIJudgmentScorer):
     def __init__(self, threshold: float):
-        super().__init__(threshold=threshold, score_type=APIScorer.FAITHFULNESS)
+        super().__init__(
+            threshold=threshold,
+            score_type=APIScorer.FAITHFULNESS,
+            required_params=[
+                ExampleParams.INPUT,
+                ExampleParams.ACTUAL_OUTPUT,
+                ExampleParams.RETRIEVAL_CONTEXT,
+            ]
+        )

     @property
     def __name__(self):
judgeval/scorers/judgeval_scorers/api_scorers/groundedness.py

@@ -8,11 +8,19 @@ TODO add link to docs page for this scorer
 # Internal imports
 from judgeval.scorers.api_scorer import APIJudgmentScorer
 from judgeval.constants import APIScorer
-
+from judgeval.data import ExampleParams

 class GroundednessScorer(APIJudgmentScorer):
     def __init__(self, threshold: float):
-        super().__init__(threshold=threshold, score_type=APIScorer.GROUNDEDNESS)
+        super().__init__(
+            threshold=threshold,
+            score_type=APIScorer.GROUNDEDNESS,
+            required_params=[
+                ExampleParams.INPUT,
+                ExampleParams.ACTUAL_OUTPUT,
+                ExampleParams.RETRIEVAL_CONTEXT,
+            ]
+        )

     @property
     def __name__(self):
judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py

@@ -8,11 +8,19 @@ TODO add link to docs page for this scorer
 # Internal imports
 from judgeval.scorers.api_scorer import APIJudgmentScorer
 from judgeval.constants import APIScorer
-
+from judgeval.data import ExampleParams

 class HallucinationScorer(APIJudgmentScorer):
     def __init__(self, threshold: float):
-        super().__init__(threshold=threshold, score_type=APIScorer.HALLUCINATION)
+        super().__init__(
+            threshold=threshold,
+            score_type=APIScorer.HALLUCINATION,
+            required_params=[
+                ExampleParams.INPUT,
+                ExampleParams.ACTUAL_OUTPUT,
+                ExampleParams.CONTEXT,
+            ]
+        )

     @property
     def __name__(self):
judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py

@@ -8,11 +8,18 @@ TODO add link to docs page for this scorer
 # Internal imports
 from judgeval.scorers.api_scorer import APIJudgmentScorer
 from judgeval.constants import APIScorer
-
+from judgeval.data import ExampleParams

 class InstructionAdherenceScorer(APIJudgmentScorer):
     def __init__(self, threshold: float):
-        super().__init__(threshold=threshold, score_type=APIScorer.INSTRUCTION_ADHERENCE)
+        super().__init__(
+            threshold=threshold,
+            score_type=APIScorer.INSTRUCTION_ADHERENCE,
+            required_params=[
+                ExampleParams.INPUT,
+                ExampleParams.ACTUAL_OUTPUT,
+            ]
+        )

     @property
     def __name__(self):
judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py

@@ -11,13 +11,20 @@ from pydantic import BaseModel, Field
 # Internal imports
 from judgeval.scorers.api_scorer import APIJudgmentScorer
 from judgeval.constants import APIScorer
-
+from judgeval.data import ExampleParams

 class JSONCorrectnessScorer(APIJudgmentScorer):
     json_schema: BaseModel = Field(None, exclude=True)

     def __init__(self, threshold: float, json_schema: BaseModel):
-        super().__init__(threshold=threshold, score_type=APIScorer.JSON_CORRECTNESS)
+        super().__init__(
+            threshold=threshold,
+            score_type=APIScorer.JSON_CORRECTNESS,
+            required_params=[
+                ExampleParams.INPUT,
+                ExampleParams.ACTUAL_OUTPUT,
+            ]
+        )
         object.__setattr__(self, 'json_schema', json_schema)

     def to_dict(self):
judgeval/scorers/judgeval_scorers/api_scorers/summarization.py

@@ -7,12 +7,19 @@ TODO add link to docs page for this scorer

 # Internal imports
 from judgeval.scorers.api_scorer import APIJudgmentScorer
-from judgeval.constants import APIScorer
-
+from judgeval.constants import APIScorer
+from judgeval.data import ExampleParams

 class SummarizationScorer(APIJudgmentScorer):
     def __init__(self, threshold: float):
-        super().__init__(threshold=threshold, score_type=APIScorer.SUMMARIZATION)
+        super().__init__(
+            threshold=threshold,
+            score_type=APIScorer.SUMMARIZATION,
+            required_params=[
+                ExampleParams.INPUT,
+                ExampleParams.ACTUAL_OUTPUT,
+            ]
+        )

     @property
     def __name__(self):
{judgeval-0.0.20.dist-info → judgeval-0.0.22.dist-info}/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: judgeval
-Version: 0.0.20
+Version: 0.0.22
 Summary: Judgeval Package
 Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
 Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues
@@ -12,9 +12,15 @@ Classifier: Programming Language :: Python :: 3
 Requires-Python: >=3.11
 Requires-Dist: anthropic
 Requires-Dist: fastapi
+Requires-Dist: langchain
+Requires-Dist: langchain-anthropic
+Requires-Dist: langchain-core
+Requires-Dist: langchain-huggingface
+Requires-Dist: langchain-openai
 Requires-Dist: litellm
 Requires-Dist: nest-asyncio
 Requires-Dist: openai
+Requires-Dist: openpyxl
 Requires-Dist: pandas
 Requires-Dist: pika
 Requires-Dist: python-dotenv==1.0.1
@@ -23,8 +29,6 @@ Requires-Dist: supabase
 Requires-Dist: together
 Requires-Dist: uvicorn
 Provides-Extra: dev
-Requires-Dist: langfuse==2.50.3; extra == 'dev'
-Requires-Dist: patronus; extra == 'dev'
 Requires-Dist: pytest-asyncio>=0.25.0; extra == 'dev'
 Requires-Dist: pytest-mock>=3.14.0; extra == 'dev'
 Requires-Dist: pytest>=8.3.4; extra == 'dev'