judgeval 0.0.35__py3-none-any.whl → 0.0.36__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -232,11 +232,17 @@ class JudgmentClient(metaclass=SingletonMeta):
232
232
  dataset.judgment_api_key = self.judgment_api_key
233
233
  return self.eval_dataset_client.push(dataset, alias, project_name, overwrite)
234
234
 
235
- def append_dataset(self, alias: str, examples: List[Example], project_name: str) -> bool:
235
+ def append_example_dataset(self, alias: str, examples: List[Example], project_name: str) -> bool:
236
236
  """
237
237
  Appends an `EvalDataset` to the Judgment platform for storage.
238
238
  """
239
- return self.eval_dataset_client.append(alias, examples, project_name)
239
+ return self.eval_dataset_client.append_examples(alias, examples, project_name)
240
+
241
+ def append_sequence_dataset(self, alias: str, sequences: List[Sequence], project_name: str) -> bool:
242
+ """
243
+ Appends a `Sequence` to the Judgment platform for storage.
244
+ """
245
+ return self.eval_dataset_client.append_sequences(alias, sequences, project_name)
240
246
 
241
247
  def pull_dataset(self, alias: str, project_name: str) -> EvalDataset:
242
248
  """
@@ -198,6 +198,40 @@ def check_missing_scorer_data(results: List[ScoringResult]) -> List[ScoringResul
198
198
  )
199
199
  return results
200
200
 
201
+ def check_experiment_type(eval_name: str, project_name: str, judgment_api_key: str, organization_id: str, is_sequence: bool) -> None:
202
+ """
203
+ Checks if the current experiment, if one exists, has the same type (examples or sequences)
204
+ """
205
+ try:
206
+ response = requests.post(
207
+ f"{ROOT_API}/check_experiment_type/",
208
+ headers={
209
+ "Content-Type": "application/json",
210
+ "Authorization": f"Bearer {judgment_api_key}",
211
+ "X-Organization-Id": organization_id
212
+ },
213
+ json={
214
+ "eval_name": eval_name,
215
+ "project_name": project_name,
216
+ "judgment_api_key": judgment_api_key,
217
+ "is_sequence": is_sequence
218
+ },
219
+ verify=True
220
+ )
221
+
222
+ if response.status_code == 422:
223
+ error(f"{response.json()}")
224
+ raise ValueError(f"{response.json()}")
225
+
226
+ if not response.ok:
227
+ response_data = response.json()
228
+ error_message = response_data.get('detail', 'An unknown error occurred.')
229
+ error(f"Error checking eval run name: {error_message}")
230
+ raise JudgmentAPIError(error_message)
231
+
232
+ except requests.exceptions.RequestException as e:
233
+ error(f"Failed to check if experiment type exists: {str(e)}")
234
+ raise JudgmentAPIError(f"Failed to check if experiment type exists: {str(e)}")
201
235
 
202
236
  def check_eval_run_name_exists(eval_name: str, project_name: str, judgment_api_key: str, organization_id: str) -> None:
203
237
  """
@@ -264,7 +298,7 @@ def log_evaluation_results(merged_results: List[ScoringResult], run: Union[Evalu
264
298
  "X-Organization-Id": run.organization_id
265
299
  },
266
300
  json={
267
- "results": [result.model_dump(warnings=False) for result in merged_results],
301
+ "results": merged_results,
268
302
  "run": run.model_dump(warnings=False)
269
303
  },
270
304
  verify=True
@@ -318,21 +352,18 @@ def run_with_spinner(message: str, func, *args, **kwargs) -> Any:
318
352
 
319
353
  return result
320
354
 
321
- def check_examples(examples: List[Example], scorers: List[APIJudgmentScorer]) -> None:
355
+ def check_examples(examples: List[Example], scorers: List[Union[APIJudgmentScorer, JudgevalScorer]]) -> None:
322
356
  """
323
357
  Checks if the example contains the necessary parameters for the scorer.
324
358
  """
325
359
  for scorer in scorers:
326
- if isinstance(scorer, APIJudgmentScorer):
327
- for example in examples:
328
- missing_params = []
329
- for param in scorer.required_params:
330
- if getattr(example, param.value) is None:
331
- missing_params.append(f"'{param.value}'")
332
- if missing_params:
333
- # We do this because we want to inform users that an example is missing parameters for a scorer
334
- # Example ID (usually random UUID) does not provide any helpful information for the user but printing the entire example is overdoing it
335
- print(f"WARNING: Example {example.example_id} is missing the following parameters: {missing_params} for scorer {scorer.score_type.value}")
360
+ for example in examples:
361
+ missing_params = []
362
+ for param in scorer.required_params:
363
+ if getattr(example, param.value) is None:
364
+ missing_params.append(f"'{param.value}'")
365
+ if missing_params:
366
+ print(f"WARNING: Example {example.example_id} is missing the following parameters: {missing_params} for scorer {scorer.score_type.value}")
336
367
 
337
368
  def run_sequence_eval(sequence_run: SequenceRun, override: bool = False, ignore_errors: bool = True) -> List[ScoringResult]:
338
369
  # Call endpoint to check to see if eval run name exists (if we DON'T want to override and DO want to log results)
@@ -344,6 +375,17 @@ def run_sequence_eval(sequence_run: SequenceRun, override: bool = False, ignore_
344
375
  sequence_run.organization_id
345
376
  )
346
377
 
378
+ if sequence_run.append:
379
+ # Check that the current experiment, if one exists, has the same type (examples or sequences)
380
+ check_experiment_type(
381
+ sequence_run.eval_name,
382
+ sequence_run.project_name,
383
+ sequence_run.judgment_api_key,
384
+ sequence_run.organization_id,
385
+ True
386
+ )
387
+
388
+
347
389
  # Execute evaluation using Judgment API
348
390
  info("Starting API evaluation")
349
391
  try: # execute an EvaluationRun with just JudgmentScorers
@@ -359,13 +401,9 @@ def run_sequence_eval(sequence_run: SequenceRun, override: bool = False, ignore_
359
401
 
360
402
  # Convert the response data to `ScoringResult` objects
361
403
  debug("Processing API results")
362
- api_results = []
363
- for result in response_data["results"]:
364
- api_results.append(ScoringResult(**result))
365
-
366
404
  # TODO: allow for custom scorer on sequences
367
405
  if sequence_run.log_results:
368
- pretty_str = run_with_spinner("Logging Results: ", log_evaluation_results, api_results, sequence_run)
406
+ pretty_str = run_with_spinner("Logging Results: ", log_evaluation_results, response_data["results"], sequence_run)
369
407
  rprint(pretty_str)
370
408
 
371
409
 
@@ -404,6 +442,16 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_error
404
442
  evaluation_run.organization_id
405
443
  )
406
444
 
445
+ if evaluation_run.append:
446
+ # Check that the current experiment, if one exists, has the same type (examples or sequences)
447
+ check_experiment_type(
448
+ evaluation_run.eval_name,
449
+ evaluation_run.project_name,
450
+ evaluation_run.judgment_api_key,
451
+ evaluation_run.organization_id,
452
+ False
453
+ )
454
+
407
455
  # Set example IDs if not already set
408
456
  debug("Initializing examples with IDs and timestamps")
409
457
  for idx, example in enumerate(evaluation_run.examples):
@@ -539,7 +587,8 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_error
539
587
  # )
540
588
  # print(merged_results)
541
589
  if evaluation_run.log_results:
542
- pretty_str = run_with_spinner("Logging Results: ", log_evaluation_results, merged_results, evaluation_run)
590
+ send_results = [result.model_dump(warnings=False) for result in merged_results]
591
+ pretty_str = run_with_spinner("Logging Results: ", log_evaluation_results, send_results, evaluation_run)
543
592
  rprint(pretty_str)
544
593
 
545
594
  for i, result in enumerate(merged_results):
judgeval/scorers/score.py CHANGED
@@ -271,6 +271,7 @@ async def a_execute_scoring(
271
271
  Returns:
272
272
  List[ScoringResult]: A list of `ScoringResult` objects containing the evaluation results.
273
273
  """
274
+
274
275
  semaphore = asyncio.Semaphore(max_concurrent)
275
276
 
276
277
  async def execute_with_semaphore(func: Callable, *args, **kwargs):
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: judgeval
3
- Version: 0.0.35
3
+ Version: 0.0.36
4
4
  Summary: Judgeval Package
5
5
  Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
6
6
  Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues
@@ -11,7 +11,6 @@ Classifier: Operating System :: OS Independent
11
11
  Classifier: Programming Language :: Python :: 3
12
12
  Requires-Python: >=3.11
13
13
  Requires-Dist: anthropic
14
- Requires-Dist: boto3==1.38.3
15
14
  Requires-Dist: fastapi
16
15
  Requires-Dist: google-genai
17
16
  Requires-Dist: langchain
@@ -1,16 +1,16 @@
1
1
  judgeval/__init__.py,sha256=x9HWt4waJwJMAqTuJSg2MezF9Zg-macEjeU-ajbly-8,330
2
2
  judgeval/clients.py,sha256=6VQmEqmfCngUdS2MuPBIpHvtDFqOENm8-_BmMvjLyRQ,944
3
- judgeval/constants.py,sha256=_XmVAkebMyGrDvvanAVlMgVd4p6MLHdEVsTQFI0kz1k,5411
3
+ judgeval/constants.py,sha256=Gc1xpft2BkFRUIjj-puCzILsG1EUOEs8V-bUWP9b1WM,5508
4
4
  judgeval/evaluation_run.py,sha256=WGzx-Ug2qhSmunFo8NrmSstBRsOUc5KpKq0Lc51rqsM,6739
5
- judgeval/judgment_client.py,sha256=brRYmphZR-2IUre9kdOhfse1mYDilcIqUzzH21ROAdk,22208
5
+ judgeval/judgment_client.py,sha256=slYLE80FqEIsqgShMtML4I64p-RrEfELbMgZnlXhxP0,22515
6
6
  judgeval/rules.py,sha256=jkh1cXXcUf8oRY7xJUZfcQBYWn_rjUW4GvrhRt15PeU,20265
7
- judgeval/run_evaluation.py,sha256=elMpFHahyeukKKa09fmJM3c_afwJ00mbZRqm18l5f00,28481
7
+ judgeval/run_evaluation.py,sha256=1G-KYNHowfMKTD5j3cDd4EuEme00AqZkn6wpP3zMKUo,30241
8
8
  judgeval/version_check.py,sha256=bvJEidB7rAeXozoUbN9Yb97QOR_s2hgvpvj74jJ5HlY,943
9
9
  judgeval/common/__init__.py,sha256=7d24BRxtncpMj3AAJCj8RS7TqgjXmW777HVZH6-3sBs,289
10
10
  judgeval/common/exceptions.py,sha256=U-TxHLn7oVMezsMuoYouNDb2XuS8RCggfntYf5_6u4E,565
11
11
  judgeval/common/logger.py,sha256=KO75wWXCxhUHUMvLaTU31ZzOk6tkZBa7heQ7y0f-zFE,6062
12
12
  judgeval/common/s3_storage.py,sha256=W8wq9S7qJZdqdBR4sk3aEZ4K3-pz40DOoolOJrWs9Vo,3768
13
- judgeval/common/tracer.py,sha256=YsObK8VQXp1DDbU9xncU8NjuY-JUI54BqmG4olezrZc,92507
13
+ judgeval/common/tracer.py,sha256=bkN0Jol0mNosJeEJMtjM54jJDhEYL3OSBtkS4FB1m8E,105461
14
14
  judgeval/common/utils.py,sha256=LUQV5JfDr6wj7xHAJoNq-gofNZ6mjXbeKrGKzBME1KM,33533
15
15
  judgeval/data/__init__.py,sha256=xuKx_KCVHGp6CXvQuVmKl3v7pJp-qDaz0NccKxwjtO0,481
16
16
  judgeval/data/custom_example.py,sha256=QRBqiRiZS8UgVeTRHY0r1Jzm6yAYsyg6zmHxQGxdiQs,739
@@ -20,9 +20,9 @@ judgeval/data/scorer_data.py,sha256=JVlaTx1EP2jw2gh3Vgx1CSEsvIFABAN26IquKyxwiJQ,
20
20
  judgeval/data/sequence.py,sha256=FmKVdzQP5VTujRCHDWk097MKRR-rJgbsdrxyCKee6tA,1994
21
21
  judgeval/data/sequence_run.py,sha256=RmYjfWKMWg-pcF5PLeiWfrhuDkjDZi5VEmAIEXN3Ib0,2104
22
22
  judgeval/data/datasets/__init__.py,sha256=IdNKhQv9yYZ_op0rdBacrFaFVmiiYQ3JTzXzxOTsEVQ,176
23
- judgeval/data/datasets/dataset.py,sha256=dhLo30hvpmmOK2R6O5wDs_neawUJ4lS8bb4S42SufNQ,13034
24
- judgeval/data/datasets/eval_dataset_client.py,sha256=xjj66BO9Es9IxXqzQe1RT_e0kpeKlt7OrhRoSuj4KHM,15085
25
- judgeval/integrations/langgraph.py,sha256=J-cQfFP52TjJewdSTe-fcsUC4HDvjNbXoxmbmF0SgiE,11743
23
+ judgeval/data/datasets/dataset.py,sha256=RmZ28oyDPfRsCx4k5ftMscoq0M0LN78MW6ofTiM81BI,13134
24
+ judgeval/data/datasets/eval_dataset_client.py,sha256=uirHpkpLOfygXIz0xKAGTPx1qjbBTzdLFQK6yyoZduU,17544
25
+ judgeval/integrations/langgraph.py,sha256=7LpWDpb8wgOkeRJvlr2COvF_O1f01zm-cwsI5trKoiw,123150
26
26
  judgeval/judges/__init__.py,sha256=6X7VSwrwsdxGBNxCyapVRWGghhKOy3MVxFNMQ62kCXM,308
27
27
  judgeval/judges/base_judge.py,sha256=ch_S7uBB7lyv44Lf1d7mIGFpveOO58zOkkpImKgd9_4,994
28
28
  judgeval/judges/litellm_judge.py,sha256=EIL58Teptv8DzZUO3yP2RDQCDq-aoBB6HPZzPdK6KTg,2424
@@ -34,7 +34,7 @@ judgeval/scorers/api_scorer.py,sha256=NQ_CrrUPhSUk1k2Q8rKpCG_TU2FT32sFEqvb-Yi54B
34
34
  judgeval/scorers/exceptions.py,sha256=eGW5CuJgZ5YJBFrE4FHDSF651PO1dKAZ379mJ8gOsfo,178
35
35
  judgeval/scorers/judgeval_scorer.py,sha256=79-JJurqHP-qTaWNWInx4SjvQYwXc9lvfPPNgwsh2yA,6773
36
36
  judgeval/scorers/prompt_scorer.py,sha256=PaAs2qRolw1P3_I061Xvk9qzvF4O-JR8g_39RqXnHcM,17728
37
- judgeval/scorers/score.py,sha256=r9QiT4-LIvivcJ6XxByrbswKSO8eQTtAD1UlXT_lcmo,18741
37
+ judgeval/scorers/score.py,sha256=fZuaZPumqkLCWcZdpTn3bJeHPNHXaDqgyb0WBp2EYgE,18742
38
38
  judgeval/scorers/utils.py,sha256=iHQVTlIANbmCTXz9kTeSdOytgUZ_T74Re61ajqsk_WQ,6827
39
39
  judgeval/scorers/judgeval_scorers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
40
40
  judgeval/scorers/judgeval_scorers/api_scorers/__init__.py,sha256=_sDUBxSG536KGqXNi6dFpaYKghjEAadxBxaaxV9HuuE,1764
@@ -57,7 +57,7 @@ judgeval/scorers/judgeval_scorers/classifiers/text2sql/__init__.py,sha256=8iTzMv
57
57
  judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py,sha256=ly72Z7s_c8NID6-nQnuW8qEGEW2MqdvpJ-5WfXzbAQg,2579
58
58
  judgeval/tracer/__init__.py,sha256=wy3DYpH8U_z0GO_K_gOSkK0tTTD-u5eLDo0T5xIBoAc,147
59
59
  judgeval/utils/alerts.py,sha256=O19Xj7DA0YVjl8PWiuH4zfdZeu3yiLVvHfY8ah2wG0g,2759
60
- judgeval-0.0.35.dist-info/METADATA,sha256=oAaDqpJCCZxUBOoVPTFbSjZgZ5xJMpGTxjngoJqmTO8,6126
61
- judgeval-0.0.35.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
62
- judgeval-0.0.35.dist-info/licenses/LICENSE.md,sha256=tKmCg7k5QOmxPK19XMfzim04QiQJPmgIm0pAn55IJwk,11352
63
- judgeval-0.0.35.dist-info/RECORD,,
60
+ judgeval-0.0.36.dist-info/METADATA,sha256=oexg66X9idECkevPAF2VkuQJBt-hYHvKmsZz5p5Y-LI,6097
61
+ judgeval-0.0.36.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
62
+ judgeval-0.0.36.dist-info/licenses/LICENSE.md,sha256=tKmCg7k5QOmxPK19XMfzim04QiQJPmgIm0pAn55IJwk,11352
63
+ judgeval-0.0.36.dist-info/RECORD,,