judgeval 0.0.25__py3-none-any.whl → 0.0.27__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/common/tracer.py +528 -166
- judgeval/constants.py +7 -4
- judgeval/data/__init__.py +0 -3
- judgeval/data/datasets/dataset.py +42 -19
- judgeval/data/datasets/eval_dataset_client.py +59 -20
- judgeval/data/result.py +34 -56
- judgeval/integrations/langgraph.py +16 -12
- judgeval/judgment_client.py +85 -23
- judgeval/rules.py +177 -60
- judgeval/run_evaluation.py +143 -122
- judgeval/scorers/score.py +21 -18
- judgeval/utils/alerts.py +32 -1
- {judgeval-0.0.25.dist-info → judgeval-0.0.27.dist-info}/METADATA +1 -1
- {judgeval-0.0.25.dist-info → judgeval-0.0.27.dist-info}/RECORD +16 -17
- judgeval/data/api_example.py +0 -98
- {judgeval-0.0.25.dist-info → judgeval-0.0.27.dist-info}/WHEEL +0 -0
- {judgeval-0.0.25.dist-info → judgeval-0.0.27.dist-info}/licenses/LICENSE.md +0 -0
judgeval/run_evaluation.py
CHANGED
@@ -23,17 +23,35 @@ from judgeval.constants import (
     ROOT_API,
     JUDGMENT_EVAL_API_URL,
     JUDGMENT_EVAL_LOG_API_URL,
-    MAX_CONCURRENT_EVALUATIONS
+    MAX_CONCURRENT_EVALUATIONS,
+    JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL
 )
 from judgeval.common.exceptions import JudgmentAPIError
-from judgeval.evaluation_run import EvaluationRun
 from judgeval.common.logger import (
     debug,
     info,
     error,
     example_logging_context
 )
+from judgeval.evaluation_run import EvaluationRun
+

+def send_to_rabbitmq(evaluation_run: EvaluationRun) -> None:
+    """
+    Sends an evaluation run to the RabbitMQ evaluation queue.
+    """
+    payload = evaluation_run.model_dump(warnings=False)
+    response = requests.post(
+        JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL,
+        headers={
+            "Content-Type": "application/json",
+            "Authorization": f"Bearer {evaluation_run.judgment_api_key}",
+            "X-Organization-Id": evaluation_run.organization_id
+        },
+        json=payload,
+        verify=True
+    )
+    return response.json()

 def execute_api_eval(evaluation_run: EvaluationRun) -> List[Dict]:
     """

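The new `send_to_rabbitmq` helper posts the serialized `EvaluationRun` to the queue endpoint with bearer auth and an `X-Organization-Id` header. A minimal sketch of an equivalent call outside the library, assuming placeholder values for the endpoint URL, API key, and organization id:

```python
# Illustrative sketch only; the URL, key, and org id below are placeholders,
# not values shipped with judgeval.
import requests

QUEUE_URL = "<JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL>"  # resolved from judgeval.constants in the real code

def enqueue_eval(payload: dict, api_key: str, org_id: str) -> dict:
    # Same header contract the diff introduces: JSON body, bearer token, organization id.
    response = requests.post(
        QUEUE_URL,
        headers={
            "Content-Type": "application/json",
            "Authorization": f"Bearer {api_key}",
            "X-Organization-Id": org_id,
        },
        json=payload,
        verify=True,
    )
    response.raise_for_status()
    return response.json()
```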
@@ -51,13 +69,15 @@ def execute_api_eval(evaluation_run: EvaluationRun) -> List[Dict]:
         # submit API request to execute evals
         payload = evaluation_run.model_dump(warnings=False)
         response = requests.post(
-            JUDGMENT_EVAL_API_URL,
-
-
-
-
-
-
+            JUDGMENT_EVAL_API_URL,
+            headers={
+                "Content-Type": "application/json",
+                "Authorization": f"Bearer {evaluation_run.judgment_api_key}",
+                "X-Organization-Id": evaluation_run.organization_id
+            },
+            json=payload,
+            verify=True
+        )
         response_data = response.json()
     except Exception as e:
         error(f"Error: {e}")

@@ -97,21 +117,23 @@ def merge_results(api_results: List[ScoringResult], local_results: List[ScoringR

     # Each ScoringResult in api and local have all the same fields besides `scorers_data`
     for api_result, local_result in zip(api_results, local_results):
-        if api_result.
+        if not (api_result.data_object and local_result.data_object):
+            raise ValueError("Data object is None in one of the results.")
+        if api_result.data_object.input != local_result.data_object.input:
             raise ValueError("The API and local results are not aligned.")
-        if api_result.actual_output != local_result.actual_output:
+        if api_result.data_object.actual_output != local_result.data_object.actual_output:
             raise ValueError("The API and local results are not aligned.")
-        if api_result.expected_output != local_result.expected_output:
+        if api_result.data_object.expected_output != local_result.data_object.expected_output:
             raise ValueError("The API and local results are not aligned.")
-        if api_result.context != local_result.context:
+        if api_result.data_object.context != local_result.data_object.context:
             raise ValueError("The API and local results are not aligned.")
-        if api_result.retrieval_context != local_result.retrieval_context:
+        if api_result.data_object.retrieval_context != local_result.data_object.retrieval_context:
             raise ValueError("The API and local results are not aligned.")
-        if api_result.additional_metadata != local_result.additional_metadata:
+        if api_result.data_object.additional_metadata != local_result.data_object.additional_metadata:
             raise ValueError("The API and local results are not aligned.")
-        if api_result.tools_called != local_result.tools_called:
+        if api_result.data_object.tools_called != local_result.data_object.tools_called:
             raise ValueError("The API and local results are not aligned.")
-        if api_result.expected_tools != local_result.expected_tools:
+        if api_result.data_object.expected_tools != local_result.data_object.expected_tools:
             raise ValueError("The API and local results are not aligned.")


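Every alignment check in `merge_results` now goes through `data_object` on both results. A hypothetical refactor (not the library's code) that expresses the same comparisons as a loop over field names:

```python
# Hypothetical helper, shown only to summarize the field-by-field comparison the diff performs.
ALIGNMENT_FIELDS = (
    "input", "actual_output", "expected_output", "context",
    "retrieval_context", "additional_metadata", "tools_called", "expected_tools",
)

def check_alignment(api_result, local_result) -> None:
    if not (api_result.data_object and local_result.data_object):
        raise ValueError("Data object is None in one of the results.")
    for field in ALIGNMENT_FIELDS:
        if getattr(api_result.data_object, field) != getattr(local_result.data_object, field):
            raise ValueError("The API and local results are not aligned.")
```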
@@ -281,13 +303,14 @@ def check_examples(examples: List[Example], scorers: List[APIJudgmentScorer]) ->
                 # Example ID (usually random UUID) does not provide any helpful information for the user but printing the entire example is overdoing it
                 print(f"WARNING: Example {example.example_id} is missing the following parameters: {missing_params} for scorer {scorer.score_type.value}")

-
-def run_eval(evaluation_run: EvaluationRun, override: bool = False) -> List[ScoringResult]:
+def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_errors: bool = True, async_execution: bool = False) -> List[ScoringResult]:
     """
     Executes an evaluation of `Example`s using one or more `Scorer`s

     Args:
         evaluation_run (EvaluationRun): Stores example and evaluation together for running
+        override (bool, optional): Whether to override existing evaluation run with same name. Defaults to False.
+        ignore_errors (bool, optional): Whether to ignore scorer errors during evaluation. Defaults to True.

     Args:
         project_name (str): The name of the project the evaluation results belong to

@@ -354,101 +377,101 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False) -> List[Scor

     api_results: List[ScoringResult] = []
     local_results: List[ScoringResult] = []
-
-
-    if judgment_scorers:
+
+    if async_execution:
         check_examples(evaluation_run.examples, evaluation_run.scorers)
-        info("Starting
-
-
-
-
-
-
-
-
-
-
-                judgment_api_key=evaluation_run.judgment_api_key,
-                organization_id=evaluation_run.organization_id,
-                log_results=evaluation_run.log_results,
-                rules=evaluation_run.rules
-            )
-            debug("Sending request to Judgment API")
-            response_data: List[Dict] = run_with_spinner("Running Evaluation: ", execute_api_eval, api_evaluation_run)
-            info(f"Received {len(response_data['results'])} results from API")
-        except JudgmentAPIError as e:
-            error(f"An error occurred while executing the Judgment API request: {str(e)}")
-            raise JudgmentAPIError(f"An error occurred while executing the Judgment API request: {str(e)}")
-        except ValueError as e:
-            raise ValueError(f"Please check your EvaluationRun object, one or more fields are invalid: {str(e)}")
-
-        # Convert the response data to `ScoringResult` objects
-        debug("Processing API results")
-        for idx, result in enumerate(response_data["results"]):
-            with example_logging_context(evaluation_run.examples[idx].timestamp, evaluation_run.examples[idx].example_id):
-                for scorer in judgment_scorers:
-                    debug(f"Processing API result for example {idx} and scorer {scorer.score_type}")
-            # filter for key-value pairs that are used to initialize ScoringResult
-            # there may be some stuff in here that doesn't belong in ScoringResult
-            # TODO: come back and refactor this to have ScoringResult take in **kwargs
-            filtered_result = {k: v for k, v in result.items() if k in ScoringResult.__annotations__}
-
-            # Convert scorers_data dicts to ScorerData objects
-            if "scorers_data" in filtered_result and filtered_result["scorers_data"]:
-                filtered_result["scorers_data"] = [
-                    ScorerData(**scorer_dict)
-                    for scorer_dict in filtered_result["scorers_data"]
-                ]
-
-            api_results.append(ScoringResult(**filtered_result))
-    # Run local evals
-    if local_scorers: # List[JudgevalScorer]
-        # We should be removing local scorers soon
-        info("Starting local evaluation")
-        for example in evaluation_run.examples:
-            with example_logging_context(example.timestamp, example.example_id):
-                debug(f"Processing example {example.example_id}: {example.input}")
-
-        results: List[ScoringResult] = asyncio.run(
-            a_execute_scoring(
-                evaluation_run.examples,
-                local_scorers,
-                model=evaluation_run.model,
-                ignore_errors=True,
-                skip_on_missing_params=True,
-                show_indicator=True,
-                _use_bar_indicator=True,
-                throttle_value=0,
-                max_concurrent=MAX_CONCURRENT_EVALUATIONS,
-            )
+        info("Starting async evaluation")
+        payload = evaluation_run.model_dump(warnings=False)
+        requests.post(
+            JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL,
+            headers={
+                "Content-Type": "application/json",
+                "Authorization": f"Bearer {evaluation_run.judgment_api_key}",
+                "X-Organization-Id": evaluation_run.organization_id
+            },
+            json=payload,
+            verify=True
         )
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        print("Successfully added evaluation to queue")
+    else:
+        if judgment_scorers:
+            # Execute evaluation using Judgment API
+            check_examples(evaluation_run.examples, evaluation_run.scorers)
+            info("Starting API evaluation")
+            debug(f"Creating API evaluation run with {len(judgment_scorers)} scorers")
+            try: # execute an EvaluationRun with just JudgmentScorers
+                api_evaluation_run: EvaluationRun = EvaluationRun(
+                    eval_name=evaluation_run.eval_name,
+                    project_name=evaluation_run.project_name,
+                    examples=evaluation_run.examples,
+                    scorers=judgment_scorers,
+                    model=evaluation_run.model,
+                    aggregator=evaluation_run.aggregator,
+                    metadata=evaluation_run.metadata,
+                    judgment_api_key=evaluation_run.judgment_api_key,
+                    organization_id=evaluation_run.organization_id,
+                    log_results=evaluation_run.log_results,
+                    rules=evaluation_run.rules
+                )
+                debug("Sending request to Judgment API")
+                response_data: List[Dict] = run_with_spinner("Running Evaluation: ", execute_api_eval, api_evaluation_run)
+                info(f"Received {len(response_data['results'])} results from API")
+            except JudgmentAPIError as e:
+                error(f"An error occurred while executing the Judgment API request: {str(e)}")
+                raise JudgmentAPIError(f"An error occurred while executing the Judgment API request: {str(e)}")
+            except ValueError as e:
+                raise ValueError(f"Please check your EvaluationRun object, one or more fields are invalid: {str(e)}")
+
+            # Convert the response data to `ScoringResult` objects
+            debug("Processing API results")
+            api_results = [ScoringResult(**result) for result in response_data["results"]]
+        # Run local evals
+        if local_scorers: # List[JudgevalScorer]
+            # We should be removing local scorers soon
+            info("Starting local evaluation")
+            for example in evaluation_run.examples:
+                with example_logging_context(example.timestamp, example.example_id):
+                    debug(f"Processing example {example.example_id}: {example.input}")
+
+            results: List[ScoringResult] = asyncio.run(
+                a_execute_scoring(
+                    evaluation_run.examples,
+                    local_scorers,
+                    model=evaluation_run.model,
+                    ignore_errors=ignore_errors,
+                    skip_on_missing_params=True,
+                    show_indicator=True,
+                    _use_bar_indicator=True,
+                    throttle_value=0,
+                    max_concurrent=MAX_CONCURRENT_EVALUATIONS,
+                )
+            )
+            local_results = results
+            info(f"Local evaluation complete with {len(local_results)} results")
+        # Aggregate the ScorerData from the API and local evaluations
+        debug("Merging API and local results")
+        merged_results: List[ScoringResult] = merge_results(api_results, local_results)
+        merged_results = check_missing_scorer_data(merged_results)
+
+        info(f"Successfully merged {len(merged_results)} results")
+
+        # Evaluate rules against local scoring results if rules exist (this cant be done just yet)
+        # if evaluation_run.rules and merged_results:
+        #     run_rules(
+        #         local_results=merged_results,
+        #         rules=evaluation_run.rules,
+        #         judgment_api_key=evaluation_run.judgment_api_key,
+        #         organization_id=evaluation_run.organization_id
+        #     )
+        # print(merged_results)
+        if evaluation_run.log_results:
+            pretty_str = run_with_spinner("Logging Results: ", log_evaluation_results, merged_results, evaluation_run)
+            rprint(pretty_str)
+
+        for i, result in enumerate(merged_results):
+            if not result.scorers_data: # none of the scorers could be executed on this example
+                info(f"None of the scorers could be executed on example {i}. This is usually because the Example is missing the fields needed by the scorers. Try checking that the Example has the necessary fields for your scorers.")
+        return merged_results

 def assert_test(scoring_results: List[ScoringResult]) -> None:
     """

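`run_eval` now accepts `ignore_errors` and `async_execution`; with `async_execution=True` the run is posted to the evaluation queue instead of being executed inline. A usage sketch, assuming an `EvaluationRun` has already been constructed elsewhere:

```python
# Sketch of the new flags; `evaluation_run` is assumed to be a fully populated EvaluationRun.
from judgeval.run_evaluation import run_eval

def submit(evaluation_run, wait_for_results: bool = True):
    if wait_for_results:
        # Blocking path: returns List[ScoringResult]; scorer errors are swallowed by default.
        return run_eval(evaluation_run, override=False, ignore_errors=True)
    # Queue path added in 0.0.27: enqueue the run and return without waiting for results.
    return run_eval(evaluation_run, async_execution=True)
```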
@@ -467,15 +490,14 @@ def assert_test(scoring_results: List[ScoringResult]) -> None:

             # Create a test case context with all relevant fields
             test_case = {
-                'input': result.input,
-                'actual_output': result.actual_output,
-                'expected_output': result.expected_output,
-                'context': result.context,
-                'retrieval_context': result.retrieval_context,
-                'additional_metadata': result.additional_metadata,
-                'tools_called': result.tools_called,
-                'expected_tools': result.expected_tools,
-                'eval_run_name': result.eval_run_name,
+                'input': result.data_object.input,
+                'actual_output': result.data_object.actual_output,
+                'expected_output': result.data_object.expected_output,
+                'context': result.data_object.context,
+                'retrieval_context': result.data_object.retrieval_context,
+                'additional_metadata': result.data_object.additional_metadata,
+                'tools_called': result.data_object.tools_called,
+                'expected_tools': result.data_object.expected_tools,
                 'failed_scorers': []
             }
             if result.scorers_data:

@@ -496,7 +518,6 @@ def assert_test(scoring_results: List[ScoringResult]) -> None:
             error_msg += f"Additional Metadata: {fail_case['additional_metadata']}\n"
             error_msg += f"Tools Called: {fail_case['tools_called']}\n"
             error_msg += f"Expected Tools: {fail_case['expected_tools']}\n"
-            error_msg += f"Eval Run Name: {fail_case['eval_run_name']}\n"

             for fail_scorer in fail_case['failed_scorers']:

judgeval/scorers/score.py
CHANGED
@@ -13,7 +13,6 @@ from judgeval.data import (
     Example,
     ScoringResult,
     generate_scoring_result,
-    create_process_example,
     create_scorer_data,
 )
 from judgeval.scorers import JudgevalScorer

@@ -274,15 +273,16 @@ async def a_execute_scoring(
     semaphore = asyncio.Semaphore(max_concurrent)

     async def execute_with_semaphore(func: Callable, *args, **kwargs):
-
-
+        async with semaphore:
+            try:
                 return await func(*args, **kwargs)
-
-
-
-
-
-
+            except Exception as e:
+                print(f"Error executing function: {e}")
+                if kwargs.get('ignore_errors', False):
+                    # Simply return None when ignoring errors, as expected by the test
+                    return None
+                # If we're not ignoring errors, propagate the exception
+                raise

     if verbose_mode is not None:
         for scorer in scorers:

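`execute_with_semaphore` now holds the shared semaphore for the whole call and, when `ignore_errors` is passed, turns a failed scorer task into a `None` result instead of failing the batch. A self-contained sketch of the same pattern outside judgeval:

```python
# Standalone illustration of the concurrency pattern, not judgeval's implementation.
import asyncio

async def run_all(coroutine_factories, max_concurrent: int = 10, ignore_errors: bool = True):
    semaphore = asyncio.Semaphore(max_concurrent)

    async def run_one(factory):
        async with semaphore:          # at most `max_concurrent` coroutines run at once
            try:
                return await factory()
            except Exception as exc:   # broad catch, mirroring the diff
                if ignore_errors:
                    print(f"Error executing function: {exc}")
                    return None        # a failed task contributes a None result
                raise

    return await asyncio.gather(*(run_one(f) for f in coroutine_factories))

# Example: asyncio.run(run_all([lambda i=i: asyncio.sleep(0.01, result=i) for i in range(5)]))
```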
@@ -391,6 +391,7 @@ async def a_eval_examples_helper(
     Returns:
         None
     """
+
     show_metrics_indicator = show_indicator and not _use_bar_indicator

     for scorer in scorers:

@@ -398,7 +399,6 @@ async def a_eval_examples_helper(
         scorer.error = None # Reset scorer error

     # scoring the Example
-    process_example = create_process_example(example) # Creates process example to track progress
     scoring_start_time = time.perf_counter()
     await score_with_indicator(
         scorers=scorers,

@@ -409,19 +409,22 @@
     ) # execute the scoring functions of each scorer on the example

     # Now that all the scoring functions of each scorer have executed, we collect
-    # the results and update the
+    # the results and update the ScoringResult with the scorer data
+    success = True
+    scorer_data_list = []
     for scorer in scorers:
         # At this point, the scorer has been executed and already contains data.
         if getattr(scorer, 'skipped', False):
             continue
         scorer_data = create_scorer_data(scorer) # Fetch scorer data from completed scorer evaluation
-
-
-
-
+        success = success and scorer_data.success
+        scorer_data_list.append(scorer_data)
+
+    scoring_end_time = time.perf_counter()
+    run_duration = scoring_end_time - scoring_start_time
+
+    scoring_result = generate_scoring_result(example, scorer_data_list, run_duration, success)
+    scoring_results[score_index] = scoring_result

-    process_example.update_run_duration(run_duration) # Update process example with execution time duration
-    scoring_results[score_index] = generate_scoring_result(process_example) # Converts the outcomes of the executed test to a ScoringResult and saves it
-
     if pbar is not None:
         pbar.update(1)

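The scoring helper now folds each scorer's outcome into a running `success` flag and a `scorer_data_list`, then builds the `ScoringResult` directly from the `Example`, replacing the old `ProcessExample` intermediary. A toy sketch of the aggregation step with a stand-in data shape:

```python
# Stand-in type for illustration; the real ScorerData comes from judgeval.data.
from dataclasses import dataclass
from typing import List

@dataclass
class ScorerDataStub:
    name: str
    success: bool

def aggregate_success(scorer_data_list: List[ScorerDataStub]) -> bool:
    # Overall success is the AND of every scorer's success flag, as in the diff.
    success = True
    for scorer_data in scorer_data_list:
        success = success and scorer_data.success
    return success

print(aggregate_success([ScorerDataStub("faithfulness", True), ScorerDataStub("relevancy", False)]))  # False
```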
judgeval/utils/alerts.py
CHANGED
@@ -40,4 +40,35 @@ class AlertResult(BaseModel):
     @property
     def conditions_results(self) -> List[Dict[str, Any]]:
         """Backwards compatibility property for the conditions_result field"""
-        return self.conditions_result
+        return self.conditions_result
+
+    def model_dump(self, **kwargs):
+        """
+        Convert the AlertResult to a dictionary for JSON serialization.
+
+        Args:
+            **kwargs: Additional arguments to pass to Pydantic's model_dump
+
+        Returns:
+            dict: Dictionary representation of the AlertResult
+        """
+        data = super().model_dump(**kwargs) if hasattr(super(), "model_dump") else super().dict(**kwargs)
+
+        # Handle the NotificationConfig object if it exists
+        if hasattr(self, "notification") and self.notification is not None:
+            if hasattr(self.notification, "model_dump"):
+                data["notification"] = self.notification.model_dump()
+            elif hasattr(self.notification, "dict"):
+                data["notification"] = self.notification.dict()
+            else:
+                # Manually convert the notification to a dictionary
+                notif = self.notification
+                data["notification"] = {
+                    "enabled": notif.enabled,
+                    "communication_methods": notif.communication_methods,
+                    "email_addresses": notif.email_addresses,
+                    "slack_channels": getattr(notif, "slack_channels", []),
+                    "send_at": notif.send_at
+                }
+
+        return data

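`AlertResult` gains a `model_dump` override so a nested notification config always serializes to a plain dict, whether it is a Pydantic model or a bare object. A self-contained sketch of the same idea with illustrative class names:

```python
# Illustrative classes only; judgeval's AlertResult and NotificationConfig differ in detail.
from typing import Any, Dict, List, Optional
from pydantic import BaseModel

class NotificationConfigStub(BaseModel):
    enabled: bool = True
    communication_methods: List[str] = []
    email_addresses: List[str] = []

class AlertResultStub(BaseModel):
    name: str
    notification: Optional[NotificationConfigStub] = None

    def model_dump(self, **kwargs) -> Dict[str, Any]:
        data = super().model_dump(**kwargs)
        if self.notification is not None:
            # Pydantic already dumps nested models; doing it explicitly mirrors the
            # defensive conversion added in the diff.
            data["notification"] = self.notification.model_dump()
        return data

print(AlertResultStub(name="latency-alert").model_dump())
```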
{judgeval-0.0.25.dist-info → judgeval-0.0.27.dist-info}/RECORD
CHANGED
@@ -1,24 +1,23 @@
 judgeval/__init__.py,sha256=dtXxsCmI4eEsZdGSUMy8P_pA0bc2-OSGAgb2C__yJoA,252
 judgeval/clients.py,sha256=6VQmEqmfCngUdS2MuPBIpHvtDFqOENm8-_BmMvjLyRQ,944
-judgeval/constants.py,sha256=
+judgeval/constants.py,sha256=ksAXhAXovzJKH0uHOdQtREs168uCJRG79PooHNmEbYQ,5313
 judgeval/evaluation_run.py,sha256=RgJD60lJsunNQzObjo7iXnAzXWgubCLOAAuuamAAuoI,6354
-judgeval/judgment_client.py,sha256=
-judgeval/rules.py,sha256=
-judgeval/run_evaluation.py,sha256=
+judgeval/judgment_client.py,sha256=uf0V1-eu3qnFTwrQ_Ckcv8IiWRVv7dbvou4P4KjU6hM,26794
+judgeval/rules.py,sha256=B0ZL0pn72D4Jnlr0zMQ6CPHi7D8AQQRariXCVsiCMiI,20542
+judgeval/run_evaluation.py,sha256=N2ppmEE5WoSReChKjr_n0NcdAUlUR6Nua7M1C_3zHQ8,24949
 judgeval/common/__init__.py,sha256=7d24BRxtncpMj3AAJCj8RS7TqgjXmW777HVZH6-3sBs,289
 judgeval/common/exceptions.py,sha256=U-TxHLn7oVMezsMuoYouNDb2XuS8RCggfntYf5_6u4E,565
 judgeval/common/logger.py,sha256=KO75wWXCxhUHUMvLaTU31ZzOk6tkZBa7heQ7y0f-zFE,6062
-judgeval/common/tracer.py,sha256=
+judgeval/common/tracer.py,sha256=L6JkCHj6kxhtDzf9OPg5ZC-NUUH4VDvDcV4utPi_I38,57544
 judgeval/common/utils.py,sha256=LUQV5JfDr6wj7xHAJoNq-gofNZ6mjXbeKrGKzBME1KM,33533
-judgeval/data/__init__.py,sha256=
-judgeval/data/api_example.py,sha256=dzkrQ0xno08y6qNfqL2djXbapUyc2B2aQ5iANn0o4CY,3667
+judgeval/data/__init__.py,sha256=dG5ytBOeOWCTd5o0KP7IblqtW4G1EBaGreLWepM3jas,345
 judgeval/data/example.py,sha256=BhGBhamFWgH6wtvrRYM8dGtDfXh-cDxDhtNL5Gbdz_M,5892
-judgeval/data/result.py,sha256=
+judgeval/data/result.py,sha256=YHD-dVYJN4JFpM-YCGgBtSdFcGAOyWYL41sf0TE9Hzg,3122
 judgeval/data/scorer_data.py,sha256=JVlaTx1EP2jw2gh3Vgx1CSEsvIFABAN26IquKyxwiJQ,3273
 judgeval/data/datasets/__init__.py,sha256=IdNKhQv9yYZ_op0rdBacrFaFVmiiYQ3JTzXzxOTsEVQ,176
-judgeval/data/datasets/dataset.py,sha256=
-judgeval/data/datasets/eval_dataset_client.py,sha256=
-judgeval/integrations/langgraph.py,sha256=
+judgeval/data/datasets/dataset.py,sha256=AFYjksV_wXx5CqFYJsl3aN8yZ6hC50O1myRuOJ8s8_E,12867
+judgeval/data/datasets/eval_dataset_client.py,sha256=P9fEmcNrjPPaiYbbLiEiBziZrIexA39HN9qzClt6uPE,12691
+judgeval/integrations/langgraph.py,sha256=fGDZOTlVbxTO4ErC-m9OSg3h-RkOIIWXCfhjgkKRh4E,11187
 judgeval/judges/__init__.py,sha256=6X7VSwrwsdxGBNxCyapVRWGghhKOy3MVxFNMQ62kCXM,308
 judgeval/judges/base_judge.py,sha256=ch_S7uBB7lyv44Lf1d7mIGFpveOO58zOkkpImKgd9_4,994
 judgeval/judges/litellm_judge.py,sha256=EIL58Teptv8DzZUO3yP2RDQCDq-aoBB6HPZzPdK6KTg,2424

@@ -31,7 +30,7 @@ judgeval/scorers/base_scorer.py,sha256=xdUlY3CnLdCQ1Z5iUeY22Bim5v-OQruZmaVF_4Y1m
 judgeval/scorers/exceptions.py,sha256=eGW5CuJgZ5YJBFrE4FHDSF651PO1dKAZ379mJ8gOsfo,178
 judgeval/scorers/judgeval_scorer.py,sha256=jq_rzfTG0XBTuLCaa6TlaK4YcT-LlgsO1LEm6hpOYdg,6601
 judgeval/scorers/prompt_scorer.py,sha256=PaAs2qRolw1P3_I061Xvk9qzvF4O-JR8g_39RqXnHcM,17728
-judgeval/scorers/score.py,sha256=
+judgeval/scorers/score.py,sha256=ObFAlMbNRcGrfBpH4WW_6OA3CjrneC539xSWhGH60GQ,18578
 judgeval/scorers/utils.py,sha256=iHQVTlIANbmCTXz9kTeSdOytgUZ_T74Re61ajqsk_WQ,6827
 judgeval/scorers/judgeval_scorers/__init__.py,sha256=xFRb62sp4JmBUSeuAB_pC_7kEGp-lGdqCRIu9--Bbdg,5992
 judgeval/scorers/judgeval_scorers/api_scorers/__init__.py,sha256=mZ6b_5Dl04k3PaG24ICBajB_j43ody1II1OJhO1DkXo,1648

@@ -86,8 +85,8 @@ judgeval/scorers/judgeval_scorers/local_implementations/summarization/__init__.p
 judgeval/scorers/judgeval_scorers/local_implementations/summarization/prompts.py,sha256=6GnRz2h-6Fwt4sl__0RgQOyo3n3iDO4MNuHWxdu-rrM,10242
 judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarization_scorer.py,sha256=Qk7lwHgRPYeGoxTOyclAh1VfGItfvHJ6l1t7Nk3SWFM,20927
 judgeval/tracer/__init__.py,sha256=wy3DYpH8U_z0GO_K_gOSkK0tTTD-u5eLDo0T5xIBoAc,147
-judgeval/utils/alerts.py,sha256=
-judgeval-0.0.
-judgeval-0.0.
-judgeval-0.0.
-judgeval-0.0.
+judgeval/utils/alerts.py,sha256=O19Xj7DA0YVjl8PWiuH4zfdZeu3yiLVvHfY8ah2wG0g,2759
+judgeval-0.0.27.dist-info/METADATA,sha256=yoUWIaLIDPksMYQSxDIbVFjtFVCxim6-5LSQ2P13a-U,5418
+judgeval-0.0.27.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+judgeval-0.0.27.dist-info/licenses/LICENSE.md,sha256=tKmCg7k5QOmxPK19XMfzim04QiQJPmgIm0pAn55IJwk,11352
+judgeval-0.0.27.dist-info/RECORD,,

judgeval/data/api_example.py
DELETED
@@ -1,98 +0,0 @@
-from typing import List, Optional, Dict, Any, Union
-from pydantic import BaseModel, ConfigDict, model_validator
-
-from judgeval.data.example import Example
-from judgeval.data.scorer_data import ScorerData
-from judgeval.common.logger import debug, error
-
-class ProcessExample(BaseModel):
-    """
-    ProcessExample is an `Example` object that contains intermediate information
-    about an undergoing evaluation on the original `Example`. It is used purely for
-    internal operations and keeping track of the evaluation process.
-    """
-    name: str
-    input: Optional[str] = None
-    actual_output: Optional[Union[str, List[str]]] = None
-    expected_output: Optional[Union[str, List[str]]] = None
-    context: Optional[list] = None
-    retrieval_context: Optional[list] = None
-    tools_called: Optional[list] = None
-    expected_tools: Optional[list] = None
-
-    # make these optional, not all test cases in a conversation will be evaluated
-    success: Optional[bool] = None
-    scorers_data: Optional[List[ScorerData]] = None
-    run_duration: Optional[float] = None
-    evaluation_cost: Optional[float] = None
-
-    order: Optional[int] = None
-    # These should map 1 to 1 from golden
-    additional_metadata: Optional[Dict] = None
-    comments: Optional[str] = None
-    trace_id: Optional[str] = None
-    model_config = ConfigDict(arbitrary_types_allowed=True)
-
-    def update_scorer_data(self, scorer_data: ScorerData):
-        """
-        Updates scorer data field of test case after the scorers have been
-        evaluated on this test case.
-        """
-        debug(f"Updating scorer data for example '{self.name}' with scorer: {scorer_data}")
-        # self.scorers_data is a list of ScorerData objects that contain the
-        # evaluation results of each scorer on this test case
-        if self.scorers_data is None:
-            self.scorers_data = [scorer_data]
-        else:
-            self.scorers_data.append(scorer_data)
-
-        if self.success is None:
-            # self.success will be None when it is a message
-            # in that case we will be setting success for the first time
-            self.success = scorer_data.success
-        else:
-            if scorer_data.success is False:
-                debug(f"Example '{self.name}' marked as failed due to scorer: {scorer_data}")
-                self.success = False
-
-    def update_run_duration(self, run_duration: float):
-        self.run_duration = run_duration
-
-
-def create_process_example(
-    example: Example,
-) -> ProcessExample:
-    """
-    When an LLM Test Case is executed, we track its progress using an ProcessExample.
-
-    This will track things like the success of the test case, as well as the metadata (such as verdicts and claims in Faithfulness).
-    """
-    success = True
-    if example.name is not None:
-        name = example.name
-    else:
-        name = "Test Case Placeholder"
-        debug(f"No name provided for example, using default name: {name}")
-    order = None
-    scorers_data = []
-
-    debug(f"Creating ProcessExample for: {name}")
-    process_ex = ProcessExample(
-        name=name,
-        input=example.input,
-        actual_output=example.actual_output,
-        expected_output=example.expected_output,
-        context=example.context,
-        retrieval_context=example.retrieval_context,
-        tools_called=example.tools_called,
-        expected_tools=example.expected_tools,
-        success=success,
-        scorers_data=scorers_data,
-        run_duration=None,
-        evaluation_cost=None,
-        order=order,
-        additional_metadata=example.additional_metadata,
-        trace_id=example.trace_id
-    )
-    return process_ex
-

{judgeval-0.0.25.dist-info → judgeval-0.0.27.dist-info}/WHEEL
File without changes

{judgeval-0.0.25.dist-info → judgeval-0.0.27.dist-info}/licenses/LICENSE.md
File without changes