judgeval 0.0.26__py3-none-any.whl → 0.0.27__py3-none-any.whl

This diff shows the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the changes between those versions as they appear in the public registry.
judgeval/constants.py CHANGED
@@ -41,14 +41,16 @@ ROOT_API = os.getenv("JUDGMENT_API_URL", "https://api.judgmentlabs.ai")
  JUDGMENT_EVAL_API_URL = f"{ROOT_API}/evaluate/"
  JUDGMENT_DATASETS_PUSH_API_URL = f"{ROOT_API}/datasets/push/"
  JUDGMENT_DATASETS_PULL_API_URL = f"{ROOT_API}/datasets/pull/"
+ JUDGMENT_DATASETS_DELETE_API_URL = f"{ROOT_API}/datasets/delete/"
  JUDGMENT_DATASETS_EXPORT_JSONL_API_URL = f"{ROOT_API}/datasets/export_jsonl/"
- JUDGMENT_DATASETS_PULL_ALL_API_URL = f"{ROOT_API}/datasets/get_all_stats/"
- JUDGMENT_DATASETS_EDIT_API_URL = f"{ROOT_API}/datasets/edit/"
+ JUDGMENT_DATASETS_PROJECT_STATS_API_URL = f"{ROOT_API}/datasets/fetch_stats_by_project/"
+ JUDGMENT_DATASETS_INSERT_API_URL = f"{ROOT_API}/datasets/insert_examples/"
  JUDGMENT_EVAL_LOG_API_URL = f"{ROOT_API}/log_eval_results/"
  JUDGMENT_EVAL_FETCH_API_URL = f"{ROOT_API}/fetch_eval_results/"
  JUDGMENT_EVAL_DELETE_API_URL = f"{ROOT_API}/delete_eval_results_by_project_and_run_names/"
  JUDGMENT_EVAL_DELETE_PROJECT_API_URL = f"{ROOT_API}/delete_eval_results_by_project/"
  JUDGMENT_PROJECT_DELETE_API_URL = f"{ROOT_API}/projects/delete/"
+ JUDGMENT_PROJECT_CREATE_API_URL = f"{ROOT_API}/projects/add/"
  JUDGMENT_TRACES_FETCH_API_URL = f"{ROOT_API}/traces/fetch/"
  JUDGMENT_TRACES_SAVE_API_URL = f"{ROOT_API}/traces/save/"
  JUDGMENT_TRACES_DELETE_API_URL = f"{ROOT_API}/traces/delete/"
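
The endpoint constants above are all built from ROOT_API, which is read from the JUDGMENT_API_URL environment variable with a hosted default (visible in the hunk header). A minimal sketch of how one of the new constants resolves:

import os

# Same pattern as judgeval/constants.py: environment override with a hosted default.
ROOT_API = os.getenv("JUDGMENT_API_URL", "https://api.judgmentlabs.ai")
JUDGMENT_DATASETS_DELETE_API_URL = f"{ROOT_API}/datasets/delete/"

print(JUDGMENT_DATASETS_DELETE_API_URL)
# https://api.judgmentlabs.ai/datasets/delete/ unless JUDGMENT_API_URL overrides the base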
judgeval/data/__init__.py CHANGED
@@ -1,13 +1,10 @@
  from judgeval.data.example import Example, ExampleParams
- from judgeval.data.api_example import ProcessExample, create_process_example
  from judgeval.data.scorer_data import ScorerData, create_scorer_data
  from judgeval.data.result import ScoringResult, generate_scoring_result

  __all__ = [
  "Example",
  "ExampleParams",
- "ProcessExample",
- "create_process_example",
  "ScorerData",
  "create_scorer_data",
  "ScoringResult",
@@ -7,8 +7,9 @@ from judgeval.common.logger import debug, error, warning, info
  from judgeval.constants import (
  JUDGMENT_DATASETS_PUSH_API_URL,
  JUDGMENT_DATASETS_PULL_API_URL,
- JUDGMENT_DATASETS_PULL_ALL_API_URL,
- JUDGMENT_DATASETS_EDIT_API_URL,
+ JUDGMENT_DATASETS_PROJECT_STATS_API_URL,
+ JUDGMENT_DATASETS_DELETE_API_URL,
+ JUDGMENT_DATASETS_INSERT_API_URL,
  JUDGMENT_DATASETS_EXPORT_JSONL_API_URL
  )
  from judgeval.data import Example
@@ -25,7 +26,7 @@ class EvalDatasetClient:
  def create_dataset(self) -> EvalDataset:
  return EvalDataset(judgment_api_key=self.judgment_api_key)

- def push(self, dataset: EvalDataset, alias: str, overwrite: Optional[bool] = False) -> bool:
+ def push(self, dataset: EvalDataset, alias: str, project_name: str, overwrite: Optional[bool] = False) -> bool:
  debug(f"Pushing dataset with alias '{alias}' (overwrite={overwrite})")
  if overwrite:
  warning(f"Overwrite enabled for alias '{alias}'")
@@ -53,7 +54,8 @@ class EvalDatasetClient:
  total=100,
  )
  content = {
- "alias": alias,
+ "dataset_alias": alias,
+ "project_name": project_name,
  "examples": [e.to_dict() for e in dataset.examples],
  "overwrite": overwrite,
  }
@@ -88,7 +90,7 @@ class EvalDatasetClient:
  )
  return True

- def pull(self, alias: str) -> EvalDataset:
+ def pull(self, alias: str, project_name: str) -> EvalDataset:
  debug(f"Pulling dataset with alias '{alias}'")
  """
  Pulls the dataset from Judgment platform
@@ -96,7 +98,7 @@ class EvalDatasetClient:
  Mock request:
  {
  "alias": alias,
- "user_id": user_id
+ "project_name": project_name
  }
  ==>
  {
@@ -118,7 +120,8 @@ class EvalDatasetClient:
  total=100,
  )
  request_body = {
- "alias": alias,
+ "dataset_alias": alias,
+ "project_name": project_name
  }

  try:
@@ -139,24 +142,58 @@ class EvalDatasetClient:

  info(f"Successfully pulled dataset with alias '{alias}'")
  payload = response.json()
+
  dataset.examples = [Example(**e) for e in payload.get("examples", [])]
- dataset._alias = payload.get("_alias")
- dataset._id = payload.get("_id")
+ dataset._alias = payload.get("alias")
+ dataset._id = payload.get("id")
  progress.update(
  task_id,
  description=f"{progress.tasks[task_id].description} [rgb(25,227,160)]Done!)",
  )

  return dataset
+
+ def delete(self, alias: str, project_name: str) -> bool:
+ with Progress(
+ SpinnerColumn(style="rgb(106,0,255)"),
+ TextColumn("[progress.description]{task.description}"),
+ transient=False,
+ ) as progress:
+ task_id = progress.add_task(
+ f"Deleting [rgb(106,0,255)]'{alias}'[/rgb(106,0,255)] from Judgment...",
+ total=100,
+ )
+ request_body = {
+ "dataset_alias": alias,
+ "project_name": project_name
+ }

- def pull_all_user_dataset_stats(self) -> dict:
- debug(f"Pulling user datasets stats for user_id: {self.judgment_api_key}'")
+ try:
+ response = requests.post(
+ JUDGMENT_DATASETS_DELETE_API_URL,
+ json=request_body,
+ headers={
+ "Content-Type": "application/json",
+ "Authorization": f"Bearer {self.judgment_api_key}",
+ "X-Organization-Id": self.organization_id
+ },
+ verify=True
+ )
+ response.raise_for_status()
+ except requests.exceptions.RequestException as e:
+ error(f"Error deleting dataset: {str(e)}")
+ raise
+
+ return True
+
+ def pull_project_dataset_stats(self, project_name: str) -> dict:
+ debug(f"Pulling project datasets stats for project_name: {project_name}'")
  """
- Pulls the user datasets stats from Judgment platform
+ Pulls the project datasets stats from Judgment platform

  Mock request:
  {
- "user_id": user_id
+ "project_name": project_name
  }
  ==>
  {
@@ -177,11 +214,12 @@ class EvalDatasetClient:
  total=100,
  )
  request_body = {
+ "project_name": project_name
  }

  try:
  response = requests.post(
- JUDGMENT_DATASETS_PULL_ALL_API_URL,
+ JUDGMENT_DATASETS_PROJECT_STATS_API_URL,
  json=request_body,
  headers={
  "Content-Type": "application/json",
@@ -205,7 +243,7 @@ class EvalDatasetClient:

  return payload

- def edit_dataset(self, alias: str, examples: List[Example]) -> bool:
+ def insert_dataset(self, alias: str, examples: List[Example], project_name: str) -> bool:
  """
  Edits the dataset on Judgment platform by adding new examples

@@ -213,7 +251,7 @@ class EvalDatasetClient:
  {
  "alias": alias,
  "examples": [...],
- "judgment_api_key": self.judgment_api_key
+ "project_name": project_name
  }
  """
  with Progress(
@@ -227,13 +265,14 @@ class EvalDatasetClient:
  )

  content = {
- "alias": alias,
+ "dataset_alias": alias,
  "examples": [e.to_dict() for e in examples],
+ "project_name": project_name
  }

  try:
  response = requests.post(
- JUDGMENT_DATASETS_EDIT_API_URL,
+ JUDGMENT_DATASETS_INSERT_API_URL,
  json=content,
  headers={
  "Content-Type": "application/json",
@@ -250,7 +289,7 @@ class EvalDatasetClient:
  info(f"Successfully edited dataset '{alias}'")
  return True

- def export_jsonl(self, alias: str) -> requests.Response:
+ def export_jsonl(self, alias: str, project_name: str) -> requests.Response:
  """Export dataset in JSONL format from Judgment platform"""
  debug(f"Exporting dataset with alias '{alias}' as JSONL")
  with Progress(
@@ -265,7 +304,7 @@ class EvalDatasetClient:
  try:
  response = requests.post(
  JUDGMENT_DATASETS_EXPORT_JSONL_API_URL,
- json={"alias": alias},
+ json={"dataset_alias": alias, "project_name": project_name},
  headers={
  "Content-Type": "application/json",
  "Authorization": f"Bearer {self.judgment_api_key}",
judgeval/data/result.py CHANGED
@@ -1,10 +1,11 @@
  from dataclasses import dataclass
  from typing import List, Union, Optional, Dict, Any, Union
+ from judgeval.common.logger import debug, error
+ from pydantic import BaseModel
+ from judgeval.data import ScorerData, Example

- from judgeval.data import ScorerData, ProcessExample

- @dataclass
- class ScoringResult:
+ class ScoringResult(BaseModel):
  """
  A ScoringResult contains the output of one or more scorers applied to a single example.
  Ie: One input, one actual_output, one expected_output, etc..., and 1+ scorer (Faithfulness, Hallucination, Summarization, etc...)
@@ -13,69 +14,44 @@ class ScoringResult:
  success (bool): Whether the evaluation was successful.
  This means that all scorers applied to this example returned a success.
  scorer_data (List[ScorerData]): The scorers data for the evaluated example
- input (Optional[str]): The input to the example
- actual_output (Optional[str]): The actual output of the example
- expected_output (Optional[str]): The expected output of the example
- context (Optional[List[str]]): The context of the example
- retrieval_context (Optional[List[str]]): The retrieval context of the example
- additional_metadata (Optional[Dict[str, Any]]): The additional metadata of the example
- tools_called (Optional[List[str]]): The tools called by the example
- expected_tools (Optional[List[str]]): The expected tools of the example
- trace_id (Optional[str]): The trace id of the example
+ data_object (Optional[Example]): The original example object that was used to create the ScoringResult, can be Example, CustomExample (future), WorkflowRun (future)

  """
  # Fields for scoring outputs
  success: bool # used for unit testing
  scorers_data: Union[List[ScorerData], None]
+ name: Optional[str] = None

- # Inputs from the original example
- input: Optional[str] = None
- actual_output: Optional[Union[str, List[str]]] = None
- expected_output: Optional[Union[str, List[str]]] = None
- context: Optional[List[str]] = None
- retrieval_context: Optional[List[str]] = None
- additional_metadata: Optional[Dict[str, Any]] = None
- tools_called: Optional[List[str]] = None
- expected_tools: Optional[List[str]] = None
+ # The original example object that was used to create the ScoringResult
+ data_object: Optional[Example] = None #can be Example, CustomExample (future), WorkflowRun (future)
  trace_id: Optional[str] = None

- example_id: Optional[str] = None
- eval_run_name: Optional[str] = None
+ # Additional fields for internal use
+ run_duration: Optional[float] = None
+ evaluation_cost: Optional[float] = None

  def to_dict(self) -> dict:
  """Convert the ScoringResult instance to a dictionary, properly serializing scorer_data."""
  return {
  "success": self.success,
  "scorers_data": [scorer_data.to_dict() for scorer_data in self.scorers_data] if self.scorers_data else None,
- "input": self.input,
- "actual_output": self.actual_output,
- "expected_output": self.expected_output,
- "context": self.context,
- "retrieval_context": self.retrieval_context,
- "additional_metadata": self.additional_metadata,
- "tools_called": self.tools_called,
- "expected_tools": self.expected_tools,
- "trace_id": self.trace_id,
- "example_id": self.example_id
+ "data_object": self.data_object.to_dict() if self.data_object else None,
  }
-
+
  def __str__(self) -> str:
  return f"ScoringResult(\
  success={self.success}, \
  scorer_data={self.scorers_data}, \
- input={self.input}, \
- actual_output={self.actual_output}, \
- expected_output={self.expected_output}, \
- context={self.context}, \
- retrieval_context={self.retrieval_context}, \
- additional_metadata={self.additional_metadata}, \
- tools_called={self.tools_called}, \
- expected_tools={self.expected_tools}, \
- trace_id={self.trace_id})"
+ data_object={self.data_object}, \
+ run_duration={self.run_duration}, \
+ evaluation_cost={self.evaluation_cost})"


  def generate_scoring_result(
- process_example: ProcessExample,
+ example: Example,
+ success: bool,
+ scorers_data: List[ScorerData],
+ run_duration: float,
  ) -> ScoringResult:
  """
  Creates a final ScoringResult object for an evaluation run based on the results from a completed LLMApiTestCase.
@@ -83,16 +59,18 @@ def generate_scoring_result(
  When an LLMTestCase is executed, it turns into an LLMApiTestCase and the progress of the evaluation run is tracked.
  At the end of the evaluation run, we create a TestResult object out of the completed LLMApiTestCase.
  """
- return ScoringResult(
- success=process_example.success,
- scorers_data=process_example.scorers_data,
- input=process_example.input,
- actual_output=process_example.actual_output,
- expected_output=process_example.expected_output,
- context=process_example.context,
- retrieval_context=process_example.retrieval_context,
- additional_metadata=process_example.additional_metadata,
- tools_called=process_example.tools_called,
- expected_tools=process_example.expected_tools,
- trace_id=process_example.trace_id
+ if example.name is not None:
+ name = example.name
+ else:
+ name = "Test Case Placeholder"
+ debug(f"No name provided for example, using default name: {name}")
+ debug(f"Creating ScoringResult for: {name}")
+ scoring_result = ScoringResult(
+ name=name,
+ data_object=example,
+ success=success,
+ scorers_data=scorers_data,
+ run_duration=run_duration,
+ evaluation_cost=None,
  )
+ return scoring_result
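
ScoringResult is now a Pydantic model that keeps the original Example nested under data_object instead of flattening its fields, and generate_scoring_result takes the example, success flag, scorer data, and run duration directly. A small sketch of the new shape, assuming the imports resolve through judgeval.data as in the __init__ changes above (field values illustrative):

from judgeval.data import Example, generate_scoring_result

example = Example(input="What is 2 + 2?", actual_output="4", name="arithmetic-check")

# scorers_data would normally come from create_scorer_data() after scorers run;
# an empty list keeps the sketch self-contained.
result = generate_scoring_result(example, success=True, scorers_data=[], run_duration=0.42)

# Example fields now live on the nested data_object rather than on the result itself.
print(result.name, result.success, result.data_object.input)
print(result.to_dict()["data_object"])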
@@ -27,7 +27,8 @@ from judgeval.constants import (
  JUDGMENT_EVAL_FETCH_API_URL,
  JUDGMENT_EVAL_DELETE_API_URL,
  JUDGMENT_EVAL_DELETE_PROJECT_API_URL,
- JUDGMENT_PROJECT_DELETE_API_URL
+ JUDGMENT_PROJECT_DELETE_API_URL,
+ JUDGMENT_PROJECT_CREATE_API_URL
  )
  from judgeval.common.exceptions import JudgmentAPIError
  from pydantic import BaseModel
@@ -43,8 +44,16 @@ class DeleteEvalRunRequestBody(BaseModel):
  project_name: str
  judgment_api_key: str

+ class SingletonMeta(type):
+ _instances = {}

- class JudgmentClient:
+ def __call__(cls, *args, **kwargs):
+ if cls not in cls._instances:
+ instance = super().__call__(*args, **kwargs)
+ cls._instances[cls] = instance
+ return cls._instances[cls]
+
+ class JudgmentClient(metaclass=SingletonMeta):
  def __init__(self, judgment_api_key: str = os.getenv("JUDGMENT_API_KEY"), organization_id: str = os.getenv("JUDGMENT_ORG_ID")):
  self.judgment_api_key = judgment_api_key
  self.organization_id = organization_id
@@ -56,8 +65,8 @@ class JudgmentClient:
  # May be bad to output their invalid API key...
  raise JudgmentAPIError(f"Issue with passed in Judgment API key: {response}")
  else:
- print(f"Successfully initialized JudgmentClient, welcome back {response.get('detail', {}).get('user_name', 'user')}!")
-
+ print(f"Successfully initialized JudgmentClient!")
+
  def a_run_evaluation(
  self,
  examples: List[Example],
@@ -267,7 +276,7 @@ class JudgmentClient:
  def create_dataset(self) -> EvalDataset:
  return self.eval_dataset_client.create_dataset()

- def push_dataset(self, alias: str, dataset: EvalDataset, overwrite: Optional[bool] = False) -> bool:
+ def push_dataset(self, alias: str, dataset: EvalDataset, project_name: str, overwrite: Optional[bool] = False) -> bool:
  """
  Uploads an `EvalDataset` to the Judgment platform for storage.

@@ -281,9 +290,9 @@ class JudgmentClient:
  """
  # Set judgment_api_key just in case it was not set
  dataset.judgment_api_key = self.judgment_api_key
- return self.eval_dataset_client.push(dataset, alias, overwrite)
+ return self.eval_dataset_client.push(dataset, alias, project_name, overwrite)

- def pull_dataset(self, alias: str) -> EvalDataset:
+ def pull_dataset(self, alias: str, project_name: str) -> EvalDataset:
  """
  Retrieves a saved `EvalDataset` from the Judgment platform.

@@ -293,25 +302,31 @@ class JudgmentClient:
  Returns:
  EvalDataset: The retrieved dataset
  """
- return self.eval_dataset_client.pull(alias)
+ return self.eval_dataset_client.pull(alias, project_name)
+
+ def delete_dataset(self, alias: str, project_name: str) -> bool:
+ """
+ Deletes a saved `EvalDataset` from the Judgment platform.
+ """
+ return self.eval_dataset_client.delete(alias, project_name)

- def pull_all_user_dataset_stats(self) -> dict:
+ def pull_project_dataset_stats(self, project_name: str) -> dict:
  """
- Retrieves all dataset stats from the Judgment platform for the user.
+ Retrieves all dataset stats from the Judgment platform for the project.

  Args:
- alias (str): The name of the dataset to retrieve
+ project_name (str): The name of the project to retrieve

  Returns:
- EvalDataset: The retrieved dataset
+ dict: The retrieved dataset stats
  """
- return self.eval_dataset_client.pull_all_user_dataset_stats()
+ return self.eval_dataset_client.pull_project_dataset_stats(project_name)

- def edit_dataset(self, alias: str, examples: List[Example]) -> bool:
+ def insert_dataset(self, alias: str, examples: List[Example], project_name: str) -> bool:
  """
  Edits the dataset on Judgment platform by adding new examples
  """
- return self.eval_dataset_client.edit_dataset(alias, examples)
+ return self.eval_dataset_client.insert_dataset(alias, examples, project_name)

  # Maybe add option where you can pass in the EvaluationRun object and it will pull the eval results from the backend
  def pull_eval(self, project_name: str, eval_run_name: str) -> List[Dict[str, Union[str, List[ScoringResult]]]]:
@@ -402,6 +417,23 @@ class JudgmentClient:
  raise ValueError(f"Error deleting eval results: {response.json()}")
  return response.json()

+ def create_project(self, project_name: str) -> bool:
+ """
+ Creates a project on the server.
+ """
+ response = requests.post(JUDGMENT_PROJECT_CREATE_API_URL,
+ json={
+ "project_name": project_name,
+ },
+ headers={
+ "Content-Type": "application/json",
+ "Authorization": f"Bearer {self.judgment_api_key}",
+ "X-Organization-Id": self.organization_id
+ })
+ if response.status_code != requests.codes.ok:
+ raise ValueError(f"Error creating project: {response.json()}")
+ return response.json()
+
  def delete_project(self, project_name: str) -> bool:
  """
  Deletes a project from the server. Which also deletes all evaluations and traces associated with the project.
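
On the client side, JudgmentClient is now a singleton and its dataset helpers mirror the project-scoped API, with create_project added alongside delete_project. A sketch of the 0.0.27 workflow, assuming JudgmentClient is importable from the package root and JUDGMENT_API_KEY / JUDGMENT_ORG_ID are set in the environment:

from judgeval import JudgmentClient  # import path assumed
from judgeval.data import Example

client = JudgmentClient()          # reads JUDGMENT_API_KEY / JUDGMENT_ORG_ID by default
assert client is JudgmentClient()  # SingletonMeta returns the same instance

client.create_project("demo-project")

dataset = client.create_dataset()
dataset.examples = [Example(input="What is 2 + 2?", actual_output="4")]

# Dataset operations are now keyed by (alias, project_name).
client.push_dataset("smoke-tests", dataset, "demo-project", overwrite=False)
pulled = client.pull_dataset("smoke-tests", "demo-project")
stats = client.pull_project_dataset_stats("demo-project")
client.insert_dataset("smoke-tests", pulled.examples, "demo-project")
client.delete_dataset("smoke-tests", "demo-project")

client.delete_project("demo-project")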
@@ -117,21 +117,23 @@ def merge_results(api_results: List[ScoringResult], local_results: List[ScoringR

  # Each ScoringResult in api and local have all the same fields besides `scorers_data`
  for api_result, local_result in zip(api_results, local_results):
- if api_result.input != local_result.input:
+ if not (api_result.data_object and local_result.data_object):
+ raise ValueError("Data object is None in one of the results.")
+ if api_result.data_object.input != local_result.data_object.input:
  raise ValueError("The API and local results are not aligned.")
- if api_result.actual_output != local_result.actual_output:
+ if api_result.data_object.actual_output != local_result.data_object.actual_output:
  raise ValueError("The API and local results are not aligned.")
- if api_result.expected_output != local_result.expected_output:
+ if api_result.data_object.expected_output != local_result.data_object.expected_output:
  raise ValueError("The API and local results are not aligned.")
- if api_result.context != local_result.context:
+ if api_result.data_object.context != local_result.data_object.context:
  raise ValueError("The API and local results are not aligned.")
- if api_result.retrieval_context != local_result.retrieval_context:
+ if api_result.data_object.retrieval_context != local_result.data_object.retrieval_context:
  raise ValueError("The API and local results are not aligned.")
- if api_result.additional_metadata != local_result.additional_metadata:
+ if api_result.data_object.additional_metadata != local_result.data_object.additional_metadata:
  raise ValueError("The API and local results are not aligned.")
- if api_result.tools_called != local_result.tools_called:
+ if api_result.data_object.tools_called != local_result.data_object.tools_called:
  raise ValueError("The API and local results are not aligned.")
- if api_result.expected_tools != local_result.expected_tools:
+ if api_result.data_object.expected_tools != local_result.data_object.expected_tools:
  raise ValueError("The API and local results are not aligned.")

@@ -422,23 +424,7 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_error

  # Convert the response data to `ScoringResult` objects
  debug("Processing API results")
- for idx, result in enumerate(response_data["results"]):
- with example_logging_context(evaluation_run.examples[idx].timestamp, evaluation_run.examples[idx].example_id):
- for scorer in judgment_scorers:
- debug(f"Processing API result for example {idx} and scorer {scorer.score_type}")
- # filter for key-value pairs that are used to initialize ScoringResult
- # there may be some stuff in here that doesn't belong in ScoringResult
- # TODO: come back and refactor this to have ScoringResult take in **kwargs
- filtered_result = {k: v for k, v in result.items() if k in ScoringResult.__annotations__}
-
- # Convert scorers_data dicts to ScorerData objects
- if "scorers_data" in filtered_result and filtered_result["scorers_data"]:
- filtered_result["scorers_data"] = [
- ScorerData(**scorer_dict)
- for scorer_dict in filtered_result["scorers_data"]
- ]
-
- api_results.append(ScoringResult(**filtered_result))
+ api_results = [ScoringResult(**result) for result in response_data["results"]]
  # Run local evals
  if local_scorers: # List[JudgevalScorer]
  # We should be removing local scorers soon
@@ -477,7 +463,7 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_error
  # judgment_api_key=evaluation_run.judgment_api_key,
  # organization_id=evaluation_run.organization_id
  # )
-
+ # print(merged_results)
  if evaluation_run.log_results:
  pretty_str = run_with_spinner("Logging Results: ", log_evaluation_results, merged_results, evaluation_run)
  rprint(pretty_str)
@@ -504,15 +490,14 @@ def assert_test(scoring_results: List[ScoringResult]) -> None:

  # Create a test case context with all relevant fields
  test_case = {
- 'input': result.input,
- 'actual_output': result.actual_output,
- 'expected_output': result.expected_output,
- 'context': result.context,
- 'retrieval_context': result.retrieval_context,
- 'additional_metadata': result.additional_metadata,
- 'tools_called': result.tools_called,
- 'expected_tools': result.expected_tools,
- 'eval_run_name': result.eval_run_name,
+ 'input': result.data_object.input,
+ 'actual_output': result.data_object.actual_output,
+ 'expected_output': result.data_object.expected_output,
+ 'context': result.data_object.context,
+ 'retrieval_context': result.data_object.retrieval_context,
+ 'additional_metadata': result.data_object.additional_metadata,
+ 'tools_called': result.data_object.tools_called,
+ 'expected_tools': result.data_object.expected_tools,
  'failed_scorers': []
  }
  if result.scorers_data:
@@ -533,7 +518,6 @@ def assert_test(scoring_results: List[ScoringResult]) -> None:
  error_msg += f"Additional Metadata: {fail_case['additional_metadata']}\n"
  error_msg += f"Tools Called: {fail_case['tools_called']}\n"
  error_msg += f"Expected Tools: {fail_case['expected_tools']}\n"
- error_msg += f"Eval Run Name: {fail_case['eval_run_name']}\n"

  for fail_scorer in fail_case['failed_scorers']:

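
Because example fields now hang off result.data_object, downstream code that previously read result.input or result.actual_output has to go through the nested object, as merge_results and assert_test do above. A small sketch of the access pattern (values illustrative):

from typing import List
from judgeval.data import Example, ScoringResult

def summarize(results: List[ScoringResult]) -> None:
    for result in results:
        if result.data_object is None:
            # merge_results raises in this case; a report can simply skip it.
            continue
        # 0.0.26 read result.input / result.actual_output directly.
        print(result.data_object.input, "->", result.data_object.actual_output,
              "success:", result.success)

example = Example(input="What is 2 + 2?", actual_output="4")
summarize([ScoringResult(success=True, scorers_data=None, data_object=example)])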
judgeval/scorers/score.py CHANGED
@@ -13,7 +13,6 @@ from judgeval.data import (
  Example,
  ScoringResult,
  generate_scoring_result,
- create_process_example,
  create_scorer_data,
  )
  from judgeval.scorers import JudgevalScorer
@@ -400,7 +399,6 @@ async def a_eval_examples_helper(
  scorer.error = None # Reset scorer error

  # scoring the Example
- process_example = create_process_example(example) # Creates process example to track progress
  scoring_start_time = time.perf_counter()
  await score_with_indicator(
  scorers=scorers,
@@ -411,22 +409,22 @@ async def a_eval_examples_helper(
  ) # execute the scoring functions of each scorer on the example

  # Now that all the scoring functions of each scorer have executed, we collect
- # the results and update the process example with the scorer data
+ # the results and update the ScoringResult with the scorer data
+ success = True
+ scorer_data_list = []
  for scorer in scorers:
  # At this point, the scorer has been executed and already contains data.
  if getattr(scorer, 'skipped', False):
  continue
  scorer_data = create_scorer_data(scorer) # Fetch scorer data from completed scorer evaluation
- process_example.update_scorer_data(scorer_data) # Update process example with the same scorer data
+ success = success and scorer_data.success
+ scorer_data_list.append(scorer_data)

- test_end_time = time.perf_counter()
- run_duration = test_end_time - scoring_start_time
+ scoring_end_time = time.perf_counter()
+ run_duration = scoring_end_time - scoring_start_time

- process_example.update_run_duration(run_duration) # Update process example with execution time duration
-
- # Generate the scoring result and store it safely (to avoid race conditions)
- result = generate_scoring_result(process_example)
- scoring_results[score_index] = result
+ scoring_result = generate_scoring_result(example, scorer_data_list, run_duration, success)
+ scoring_results[score_index] = scoring_result

  if pbar is not None:
  pbar.update(1)
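
The helper no longer routes results through a ProcessExample; it aggregates ScorerData objects and an overall success flag itself and hands them to generate_scoring_result. A stripped-down sketch of that aggregation, with already-executed scorer objects assumed (the actual scoring call is omitted):

import time
from judgeval.data import Example, ScoringResult, create_scorer_data, generate_scoring_result

def collect_results(example: Example, scorers) -> ScoringResult:
    # Mirrors the new a_eval_examples_helper flow: gather ScorerData from each
    # finished scorer and AND together the per-scorer successes.
    scoring_start_time = time.perf_counter()
    success = True
    scorer_data_list = []
    for scorer in scorers:
        if getattr(scorer, "skipped", False):
            continue
        scorer_data = create_scorer_data(scorer)
        success = success and scorer_data.success
        scorer_data_list.append(scorer_data)
    run_duration = time.perf_counter() - scoring_start_time
    # Keyword arguments make the mapping onto generate_scoring_result explicit.
    return generate_scoring_result(example, success=success,
                                   scorers_data=scorer_data_list,
                                   run_duration=run_duration)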
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: judgeval
- Version: 0.0.26
+ Version: 0.0.27
  Summary: Judgeval Package
  Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
  Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues