judgeval 0.0.26__py3-none-any.whl → 0.0.28__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/common/tracer.py +515 -193
- judgeval/constants.py +4 -2
- judgeval/data/__init__.py +0 -3
- judgeval/data/{api_example.py → custom_api_example.py} +12 -19
- judgeval/data/datasets/eval_dataset_client.py +59 -20
- judgeval/data/result.py +34 -56
- judgeval/evaluation_run.py +1 -0
- judgeval/judgment_client.py +47 -15
- judgeval/run_evaluation.py +20 -36
- judgeval/scorers/score.py +9 -11
- {judgeval-0.0.26.dist-info → judgeval-0.0.28.dist-info}/METADATA +1 -1
- {judgeval-0.0.26.dist-info → judgeval-0.0.28.dist-info}/RECORD +14 -14
- {judgeval-0.0.26.dist-info → judgeval-0.0.28.dist-info}/WHEEL +0 -0
- {judgeval-0.0.26.dist-info → judgeval-0.0.28.dist-info}/licenses/LICENSE.md +0 -0
judgeval/constants.py
CHANGED
@@ -41,14 +41,16 @@ ROOT_API = os.getenv("JUDGMENT_API_URL", "https://api.judgmentlabs.ai")
 JUDGMENT_EVAL_API_URL = f"{ROOT_API}/evaluate/"
 JUDGMENT_DATASETS_PUSH_API_URL = f"{ROOT_API}/datasets/push/"
 JUDGMENT_DATASETS_PULL_API_URL = f"{ROOT_API}/datasets/pull/"
+JUDGMENT_DATASETS_DELETE_API_URL = f"{ROOT_API}/datasets/delete/"
 JUDGMENT_DATASETS_EXPORT_JSONL_API_URL = f"{ROOT_API}/datasets/export_jsonl/"
-
-
+JUDGMENT_DATASETS_PROJECT_STATS_API_URL = f"{ROOT_API}/datasets/fetch_stats_by_project/"
+JUDGMENT_DATASETS_INSERT_API_URL = f"{ROOT_API}/datasets/insert_examples/"
 JUDGMENT_EVAL_LOG_API_URL = f"{ROOT_API}/log_eval_results/"
 JUDGMENT_EVAL_FETCH_API_URL = f"{ROOT_API}/fetch_eval_results/"
 JUDGMENT_EVAL_DELETE_API_URL = f"{ROOT_API}/delete_eval_results_by_project_and_run_names/"
 JUDGMENT_EVAL_DELETE_PROJECT_API_URL = f"{ROOT_API}/delete_eval_results_by_project/"
 JUDGMENT_PROJECT_DELETE_API_URL = f"{ROOT_API}/projects/delete/"
+JUDGMENT_PROJECT_CREATE_API_URL = f"{ROOT_API}/projects/add/"
 JUDGMENT_TRACES_FETCH_API_URL = f"{ROOT_API}/traces/fetch/"
 JUDGMENT_TRACES_SAVE_API_URL = f"{ROOT_API}/traces/save/"
 JUDGMENT_TRACES_DELETE_API_URL = f"{ROOT_API}/traces/delete/"
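The new dataset and project endpoints are plain f-string constants derived from ROOT_API, so pointing JUDGMENT_API_URL at a different host re-targets every route at once. A minimal standalone sketch of that composition, using only the constant names visible in the hunk above (the printed value assumes the environment variable is unset):

import os

# Same default as in constants.py above; override JUDGMENT_API_URL to target another backend.
ROOT_API = os.getenv("JUDGMENT_API_URL", "https://api.judgmentlabs.ai")
JUDGMENT_DATASETS_DELETE_API_URL = f"{ROOT_API}/datasets/delete/"
JUDGMENT_PROJECT_CREATE_API_URL = f"{ROOT_API}/projects/add/"

print(JUDGMENT_DATASETS_DELETE_API_URL)  # https://api.judgmentlabs.ai/datasets/delete/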
judgeval/data/__init__.py
CHANGED
@@ -1,13 +1,10 @@
 from judgeval.data.example import Example, ExampleParams
-from judgeval.data.api_example import ProcessExample, create_process_example
 from judgeval.data.scorer_data import ScorerData, create_scorer_data
 from judgeval.data.result import ScoringResult, generate_scoring_result
 
 __all__ = [
     "Example",
     "ExampleParams",
-    "ProcessExample",
-    "create_process_example",
     "ScorerData",
     "create_scorer_data",
     "ScoringResult",
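Since ProcessExample and create_process_example are no longer re-exported from judgeval.data, imports from the package root break on 0.0.28. A hedged migration sketch, assuming the class remains importable from the renamed module shown in the next file:

# judgeval 0.0.26 style -- removed from the package root in 0.0.28:
# from judgeval.data import ProcessExample, create_process_example

# judgeval 0.0.28 style -- assumption: import directly from the renamed module.
from judgeval.data.custom_api_example import ProcessExample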
judgeval/data/{api_example.py → custom_api_example.py}
RENAMED
@@ -2,6 +2,7 @@ from typing import List, Optional, Dict, Any, Union
 from pydantic import BaseModel, ConfigDict, model_validator
 
 from judgeval.data.example import Example
+from judgeval.data.custom_example import CustomExample
 from judgeval.data.scorer_data import ScorerData
 from judgeval.common.logger import debug, error
 
@@ -12,13 +13,13 @@ class ProcessExample(BaseModel):
     internal operations and keeping track of the evaluation process.
     """
     name: str
-    input: Optional[str] = None
-    actual_output: Optional[Union[str, List[str]]] = None
-    expected_output: Optional[Union[str, List[str]]] = None
-    context: Optional[list] = None
-    retrieval_context: Optional[list] = None
-    tools_called: Optional[list] = None
-    expected_tools: Optional[list] = None
+    # input: Optional[str] = None
+    # actual_output: Optional[Union[str, List[str]]] = None
+    # expected_output: Optional[Union[str, List[str]]] = None
+    # context: Optional[list] = None
+    # retrieval_context: Optional[list] = None
+    # tools_called: Optional[list] = None
+    # expected_tools: Optional[list] = None
 
     # make these optional, not all test cases in a conversation will be evaluated
     success: Optional[bool] = None
@@ -57,10 +58,10 @@ class ProcessExample(BaseModel):
 
     def update_run_duration(self, run_duration: float):
         self.run_duration = run_duration
-
 
-
-
+
+def create_process_custom_example(
+    example: CustomExample,
 ) -> ProcessExample:
     """
     When an LLM Test Case is executed, we track its progress using an ProcessExample.
@@ -79,13 +80,6 @@ def create_process_example(
     debug(f"Creating ProcessExample for: {name}")
     process_ex = ProcessExample(
         name=name,
-        input=example.input,
-        actual_output=example.actual_output,
-        expected_output=example.expected_output,
-        context=example.context,
-        retrieval_context=example.retrieval_context,
-        tools_called=example.tools_called,
-        expected_tools=example.expected_tools,
         success=success,
         scorers_data=scorers_data,
         run_duration=None,
@@ -94,5 +88,4 @@ def create_process_example(
         additional_metadata=example.additional_metadata,
         trace_id=example.trace_id
     )
-    return process_ex
-
+    return process_ex
judgeval/data/datasets/eval_dataset_client.py
CHANGED
@@ -7,8 +7,9 @@ from judgeval.common.logger import debug, error, warning, info
 from judgeval.constants import (
     JUDGMENT_DATASETS_PUSH_API_URL,
     JUDGMENT_DATASETS_PULL_API_URL,
-
-
+    JUDGMENT_DATASETS_PROJECT_STATS_API_URL,
+    JUDGMENT_DATASETS_DELETE_API_URL,
+    JUDGMENT_DATASETS_INSERT_API_URL,
     JUDGMENT_DATASETS_EXPORT_JSONL_API_URL
 )
 from judgeval.data import Example
@@ -25,7 +26,7 @@ class EvalDatasetClient:
     def create_dataset(self) -> EvalDataset:
         return EvalDataset(judgment_api_key=self.judgment_api_key)
 
-    def push(self, dataset: EvalDataset, alias: str, overwrite: Optional[bool] = False) -> bool:
+    def push(self, dataset: EvalDataset, alias: str, project_name: str, overwrite: Optional[bool] = False) -> bool:
         debug(f"Pushing dataset with alias '{alias}' (overwrite={overwrite})")
         if overwrite:
             warning(f"Overwrite enabled for alias '{alias}'")
@@ -53,7 +54,8 @@ class EvalDatasetClient:
                 total=100,
             )
             content = {
-                "
+                "dataset_alias": alias,
+                "project_name": project_name,
                 "examples": [e.to_dict() for e in dataset.examples],
                 "overwrite": overwrite,
             }
@@ -88,7 +90,7 @@ class EvalDatasetClient:
             )
             return True
 
-    def pull(self, alias: str) -> EvalDataset:
+    def pull(self, alias: str, project_name: str) -> EvalDataset:
         debug(f"Pulling dataset with alias '{alias}'")
         """
         Pulls the dataset from Judgment platform
@@ -96,7 +98,7 @@ class EvalDatasetClient:
         Mock request:
         {
             "alias": alias,
-            "
+            "project_name": project_name
         }
         ==>
         {
@@ -118,7 +120,8 @@ class EvalDatasetClient:
                 total=100,
             )
             request_body = {
-                "
+                "dataset_alias": alias,
+                "project_name": project_name
             }
 
             try:
@@ -139,24 +142,58 @@ class EvalDatasetClient:
 
             info(f"Successfully pulled dataset with alias '{alias}'")
             payload = response.json()
+
             dataset.examples = [Example(**e) for e in payload.get("examples", [])]
-            dataset._alias = payload.get("
-            dataset._id = payload.get("
+            dataset._alias = payload.get("alias")
+            dataset._id = payload.get("id")
             progress.update(
                 task_id,
                 description=f"{progress.tasks[task_id].description} [rgb(25,227,160)]Done!)",
             )
 
             return dataset
+
+    def delete(self, alias: str, project_name: str) -> bool:
+        with Progress(
+            SpinnerColumn(style="rgb(106,0,255)"),
+            TextColumn("[progress.description]{task.description}"),
+            transient=False,
+        ) as progress:
+            task_id = progress.add_task(
+                f"Deleting [rgb(106,0,255)]'{alias}'[/rgb(106,0,255)] from Judgment...",
+                total=100,
+            )
+            request_body = {
+                "dataset_alias": alias,
+                "project_name": project_name
+            }
 
-
-
+            try:
+                response = requests.post(
+                    JUDGMENT_DATASETS_DELETE_API_URL,
+                    json=request_body,
+                    headers={
+                        "Content-Type": "application/json",
+                        "Authorization": f"Bearer {self.judgment_api_key}",
+                        "X-Organization-Id": self.organization_id
+                    },
+                    verify=True
+                )
+                response.raise_for_status()
+            except requests.exceptions.RequestException as e:
+                error(f"Error deleting dataset: {str(e)}")
+                raise
+
+            return True
+
+    def pull_project_dataset_stats(self, project_name: str) -> dict:
+        debug(f"Pulling project datasets stats for project_name: {project_name}'")
         """
-        Pulls the
+        Pulls the project datasets stats from Judgment platform
 
         Mock request:
         {
-            "
+            "project_name": project_name
         }
         ==>
         {
@@ -177,11 +214,12 @@ class EvalDatasetClient:
                 total=100,
             )
             request_body = {
+                "project_name": project_name
             }
 
             try:
                 response = requests.post(
-
+                    JUDGMENT_DATASETS_PROJECT_STATS_API_URL,
                     json=request_body,
                     headers={
                         "Content-Type": "application/json",
@@ -205,7 +243,7 @@ class EvalDatasetClient:
 
             return payload
 
-    def
+    def insert_dataset(self, alias: str, examples: List[Example], project_name: str) -> bool:
         """
         Edits the dataset on Judgment platform by adding new examples
 
@@ -213,7 +251,7 @@ class EvalDatasetClient:
         {
             "alias": alias,
             "examples": [...],
-            "
+            "project_name": project_name
         }
         """
         with Progress(
@@ -227,13 +265,14 @@ class EvalDatasetClient:
             )
 
             content = {
-                "
+                "dataset_alias": alias,
                 "examples": [e.to_dict() for e in examples],
+                "project_name": project_name
             }
 
             try:
                 response = requests.post(
-
+                    JUDGMENT_DATASETS_INSERT_API_URL,
                     json=content,
                     headers={
                         "Content-Type": "application/json",
@@ -250,7 +289,7 @@ class EvalDatasetClient:
             info(f"Successfully edited dataset '{alias}'")
             return True
 
-    def export_jsonl(self, alias: str) -> requests.Response:
+    def export_jsonl(self, alias: str, project_name: str) -> requests.Response:
         """Export dataset in JSONL format from Judgment platform"""
         debug(f"Exporting dataset with alias '{alias}' as JSONL")
         with Progress(
@@ -265,7 +304,7 @@ class EvalDatasetClient:
             try:
                 response = requests.post(
                     JUDGMENT_DATASETS_EXPORT_JSONL_API_URL,
-                    json={"alias":
+                    json={"dataset_alias": alias, "project_name": project_name},
                     headers={
                         "Content-Type": "application/json",
                         "Authorization": f"Bearer {self.judgment_api_key}",
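Every dataset operation on EvalDatasetClient is now project-scoped: push, pull, delete, insert_dataset, export_jsonl, and the new pull_project_dataset_stats all take a project_name and send it (alongside dataset_alias) to the matching endpoint. A hedged usage sketch against the signatures shown above; the EvalDatasetClient constructor arguments and the Example field values are assumptions for illustration, not taken from this diff:

from judgeval.data import Example
from judgeval.data.datasets.eval_dataset_client import EvalDatasetClient

# Assumption: the client takes an API key and organization id, matching the
# self.judgment_api_key / self.organization_id attributes used in the requests above.
client = EvalDatasetClient(judgment_api_key="...", organization_id="...")

dataset = client.create_dataset()
dataset.examples = [Example(input="What is 2+2?", actual_output="4")]

# Every call below now carries the project scope.
client.push(dataset, alias="math-qa", project_name="demo-project", overwrite=False)
pulled = client.pull("math-qa", project_name="demo-project")
stats = client.pull_project_dataset_stats("demo-project")
client.insert_dataset("math-qa", [Example(input="3+3?", actual_output="6")], "demo-project")
client.delete("math-qa", project_name="demo-project")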
judgeval/data/result.py
CHANGED
@@ -1,10 +1,11 @@
 from dataclasses import dataclass
 from typing import List, Union, Optional, Dict, Any, Union
+from judgeval.common.logger import debug, error
+from pydantic import BaseModel
+from judgeval.data import ScorerData, Example
 
-from judgeval.data import ScorerData, ProcessExample
 
-
-class ScoringResult:
+class ScoringResult(BaseModel):
     """
     A ScoringResult contains the output of one or more scorers applied to a single example.
     Ie: One input, one actual_output, one expected_output, etc..., and 1+ scorer (Faithfulness, Hallucination, Summarization, etc...)
@@ -13,69 +14,44 @@ class ScoringResult:
         success (bool): Whether the evaluation was successful.
             This means that all scorers applied to this example returned a success.
         scorer_data (List[ScorerData]): The scorers data for the evaluated example
-
-        actual_output (Optional[str]): The actual output of the example
-        expected_output (Optional[str]): The expected output of the example
-        context (Optional[List[str]]): The context of the example
-        retrieval_context (Optional[List[str]]): The retrieval context of the example
-        additional_metadata (Optional[Dict[str, Any]]): The additional metadata of the example
-        tools_called (Optional[List[str]]): The tools called by the example
-        expected_tools (Optional[List[str]]): The expected tools of the example
-        trace_id (Optional[str]): The trace id of the example
+        data_object (Optional[Example]): The original example object that was used to create the ScoringResult, can be Example, CustomExample (future), WorkflowRun (future)
 
     """
     # Fields for scoring outputs
     success: bool # used for unit testing
    scorers_data: Union[List[ScorerData], None]
+    name: Optional[str] = None
 
-    #
-
-    actual_output: Optional[Union[str, List[str]]] = None
-    expected_output: Optional[Union[str, List[str]]] = None
-    context: Optional[List[str]] = None
-    retrieval_context: Optional[List[str]] = None
-    additional_metadata: Optional[Dict[str, Any]] = None
-    tools_called: Optional[List[str]] = None
-    expected_tools: Optional[List[str]] = None
+    # The original example object that was used to create the ScoringResult
+    data_object: Optional[Example] = None #can be Example, CustomExample (future), WorkflowRun (future)
     trace_id: Optional[str] = None
 
-
-
+    # Additional fields for internal use
+    run_duration: Optional[float] = None
+    evaluation_cost: Optional[float] = None
 
     def to_dict(self) -> dict:
         """Convert the ScoringResult instance to a dictionary, properly serializing scorer_data."""
         return {
             "success": self.success,
             "scorers_data": [scorer_data.to_dict() for scorer_data in self.scorers_data] if self.scorers_data else None,
-            "
-            "actual_output": self.actual_output,
-            "expected_output": self.expected_output,
-            "context": self.context,
-            "retrieval_context": self.retrieval_context,
-            "additional_metadata": self.additional_metadata,
-            "tools_called": self.tools_called,
-            "expected_tools": self.expected_tools,
-            "trace_id": self.trace_id,
-            "example_id": self.example_id
+            "data_object": self.data_object.to_dict() if self.data_object else None,
         }
-
+
     def __str__(self) -> str:
         return f"ScoringResult(\
             success={self.success}, \
             scorer_data={self.scorers_data}, \
-
-
-
-            context={self.context}, \
-            retrieval_context={self.retrieval_context}, \
-            additional_metadata={self.additional_metadata}, \
-            tools_called={self.tools_called}, \
-            expected_tools={self.expected_tools}, \
-            trace_id={self.trace_id})"
+            data_object={self.data_object}, \
+            run_duration={self.run_duration}, \
+            evaluation_cost={self.evaluation_cost})"
 
 
 def generate_scoring_result(
-
+    example: Example,
+    scorers_data: List[ScorerData],
+    run_duration: float,
+    success: bool,
 ) -> ScoringResult:
     """
     Creates a final ScoringResult object for an evaluation run based on the results from a completed LLMApiTestCase.
@@ -83,16 +59,18 @@ def generate_scoring_result(
     When an LLMTestCase is executed, it turns into an LLMApiTestCase and the progress of the evaluation run is tracked.
     At the end of the evaluation run, we create a TestResult object out of the completed LLMApiTestCase.
     """
-
-
-
-
-
-
-
-
-
-
-
-
+    if example.name is not None:
+        name = example.name
+    else:
+        name = "Test Case Placeholder"
+        debug(f"No name provided for example, using default name: {name}")
+    debug(f"Creating ScoringResult for: {name}")
+    scoring_result = ScoringResult(
+        name=name,
+        data_object=example,
+        success=success,
+        scorers_data=scorers_data,
+        run_duration=run_duration,
+        evaluation_cost=None,
     )
+    return scoring_result
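ScoringResult is now a pydantic BaseModel that keeps the evaluated example nested under data_object instead of flattening its fields, and generate_scoring_result assembles one from an Example plus scorer outputs. A small sketch of the new shape; the Example keyword arguments are the field names referenced elsewhere in this diff, and the empty scorers_data list is purely illustrative:

from judgeval.data import Example
from judgeval.data.result import generate_scoring_result

example = Example(input="What is 2+2?", actual_output="4", name="arithmetic-check")

# Real runs pass ScorerData objects; an empty list keeps the sketch self-contained.
result = generate_scoring_result(example, scorers_data=[], run_duration=0.42, success=True)

print(result.name)                       # "arithmetic-check"
print(result.data_object.actual_output)  # "4" -- example fields now live under data_object
print(result.to_dict()["data_object"])   # serialized via Example.to_dict()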
judgeval/evaluation_run.py
CHANGED
@@ -34,6 +34,7 @@ class EvaluationRun(BaseModel):
     model: Union[str, List[str], JudgevalJudge]
     aggregator: Optional[str] = None
     metadata: Optional[Dict[str, Any]] = None
+    trace_span_id: Optional[str] = None
     # API Key will be "" until user calls client.run_eval(), then API Key will be set
     judgment_api_key: Optional[str] = ""
     override: Optional[bool] = False
judgeval/judgment_client.py
CHANGED
@@ -27,7 +27,8 @@ from judgeval.constants import (
     JUDGMENT_EVAL_FETCH_API_URL,
     JUDGMENT_EVAL_DELETE_API_URL,
     JUDGMENT_EVAL_DELETE_PROJECT_API_URL,
-    JUDGMENT_PROJECT_DELETE_API_URL
+    JUDGMENT_PROJECT_DELETE_API_URL,
+    JUDGMENT_PROJECT_CREATE_API_URL
 )
 from judgeval.common.exceptions import JudgmentAPIError
 from pydantic import BaseModel
@@ -43,8 +44,16 @@ class DeleteEvalRunRequestBody(BaseModel):
     project_name: str
     judgment_api_key: str
 
+class SingletonMeta(type):
+    _instances = {}
 
-
+    def __call__(cls, *args, **kwargs):
+        if cls not in cls._instances:
+            instance = super().__call__(*args, **kwargs)
+            cls._instances[cls] = instance
+        return cls._instances[cls]
+
+class JudgmentClient(metaclass=SingletonMeta):
     def __init__(self, judgment_api_key: str = os.getenv("JUDGMENT_API_KEY"), organization_id: str = os.getenv("JUDGMENT_ORG_ID")):
         self.judgment_api_key = judgment_api_key
         self.organization_id = organization_id
@@ -56,8 +65,8 @@ class JudgmentClient:
             # May be bad to output their invalid API key...
             raise JudgmentAPIError(f"Issue with passed in Judgment API key: {response}")
         else:
-            print(f"Successfully initialized JudgmentClient
-
+            print(f"Successfully initialized JudgmentClient!")
+
     def a_run_evaluation(
         self,
         examples: List[Example],
@@ -267,7 +276,7 @@ class JudgmentClient:
     def create_dataset(self) -> EvalDataset:
         return self.eval_dataset_client.create_dataset()
 
-    def push_dataset(self, alias: str, dataset: EvalDataset, overwrite: Optional[bool] = False) -> bool:
+    def push_dataset(self, alias: str, dataset: EvalDataset, project_name: str, overwrite: Optional[bool] = False) -> bool:
         """
         Uploads an `EvalDataset` to the Judgment platform for storage.
 
@@ -281,9 +290,9 @@ class JudgmentClient:
         """
         # Set judgment_api_key just in case it was not set
         dataset.judgment_api_key = self.judgment_api_key
-        return self.eval_dataset_client.push(dataset, alias, overwrite)
+        return self.eval_dataset_client.push(dataset, alias, project_name, overwrite)
 
-    def pull_dataset(self, alias: str) -> EvalDataset:
+    def pull_dataset(self, alias: str, project_name: str) -> EvalDataset:
         """
         Retrieves a saved `EvalDataset` from the Judgment platform.
 
@@ -293,25 +302,31 @@ class JudgmentClient:
         Returns:
             EvalDataset: The retrieved dataset
         """
-        return self.eval_dataset_client.pull(alias)
+        return self.eval_dataset_client.pull(alias, project_name)
+
+    def delete_dataset(self, alias: str, project_name: str) -> bool:
+        """
+        Deletes a saved `EvalDataset` from the Judgment platform.
+        """
+        return self.eval_dataset_client.delete(alias, project_name)
 
-    def
+    def pull_project_dataset_stats(self, project_name: str) -> dict:
         """
-        Retrieves all dataset stats from the Judgment platform for the
+        Retrieves all dataset stats from the Judgment platform for the project.
 
         Args:
-
+            project_name (str): The name of the project to retrieve
 
         Returns:
-
+            dict: The retrieved dataset stats
         """
-        return self.eval_dataset_client.
+        return self.eval_dataset_client.pull_project_dataset_stats(project_name)
 
-    def
+    def insert_dataset(self, alias: str, examples: List[Example], project_name: str) -> bool:
         """
         Edits the dataset on Judgment platform by adding new examples
         """
-        return self.eval_dataset_client.
+        return self.eval_dataset_client.insert_dataset(alias, examples, project_name)
 
     # Maybe add option where you can pass in the EvaluationRun object and it will pull the eval results from the backend
     def pull_eval(self, project_name: str, eval_run_name: str) -> List[Dict[str, Union[str, List[ScoringResult]]]]:
@@ -402,6 +417,23 @@ class JudgmentClient:
             raise ValueError(f"Error deleting eval results: {response.json()}")
         return response.json()
 
+    def create_project(self, project_name: str) -> bool:
+        """
+        Creates a project on the server.
+        """
+        response = requests.post(JUDGMENT_PROJECT_CREATE_API_URL,
+                                 json={
+                                     "project_name": project_name,
+                                 },
+                                 headers={
+                                     "Content-Type": "application/json",
+                                     "Authorization": f"Bearer {self.judgment_api_key}",
+                                     "X-Organization-Id": self.organization_id
+                                 })
+        if response.status_code != requests.codes.ok:
+            raise ValueError(f"Error creating project: {response.json()}")
+        return response.json()
+
     def delete_project(self, project_name: str) -> bool:
         """
         Deletes a project from the server. Which also deletes all evaluations and traces associated with the project.
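JudgmentClient is now a singleton via the SingletonMeta metaclass, so repeated constructions return the same instance, and its dataset helpers mirror the project-scoped EvalDatasetClient API. A hedged sketch of the 0.0.28 surface; it assumes JUDGMENT_API_KEY and JUDGMENT_ORG_ID are set in the environment and uses illustrative project and alias names:

from judgeval.judgment_client import JudgmentClient
from judgeval.data import Example

# Both constructions resolve to the same object because of SingletonMeta.
client_a = JudgmentClient()  # reads JUDGMENT_API_KEY / JUDGMENT_ORG_ID from the environment
client_b = JudgmentClient()
assert client_a is client_b

client_a.create_project("demo-project")

dataset = client_a.create_dataset()
dataset.examples = [Example(input="What is 2+2?", actual_output="4")]
client_a.push_dataset("math-qa", dataset, project_name="demo-project", overwrite=False)

pulled = client_a.pull_dataset("math-qa", project_name="demo-project")
stats = client_a.pull_project_dataset_stats("demo-project")
client_a.delete_dataset("math-qa", project_name="demo-project")
client_a.delete_project("demo-project")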
judgeval/run_evaluation.py
CHANGED
@@ -117,21 +117,23 @@ def merge_results(api_results: List[ScoringResult], local_results: List[ScoringR
 
     # Each ScoringResult in api and local have all the same fields besides `scorers_data`
     for api_result, local_result in zip(api_results, local_results):
-        if api_result.
+        if not (api_result.data_object and local_result.data_object):
+            raise ValueError("Data object is None in one of the results.")
+        if api_result.data_object.input != local_result.data_object.input:
             raise ValueError("The API and local results are not aligned.")
-        if api_result.actual_output != local_result.actual_output:
+        if api_result.data_object.actual_output != local_result.data_object.actual_output:
             raise ValueError("The API and local results are not aligned.")
-        if api_result.expected_output != local_result.expected_output:
+        if api_result.data_object.expected_output != local_result.data_object.expected_output:
             raise ValueError("The API and local results are not aligned.")
-        if api_result.context != local_result.context:
+        if api_result.data_object.context != local_result.data_object.context:
             raise ValueError("The API and local results are not aligned.")
-        if api_result.retrieval_context != local_result.retrieval_context:
+        if api_result.data_object.retrieval_context != local_result.data_object.retrieval_context:
             raise ValueError("The API and local results are not aligned.")
-        if api_result.additional_metadata != local_result.additional_metadata:
+        if api_result.data_object.additional_metadata != local_result.data_object.additional_metadata:
             raise ValueError("The API and local results are not aligned.")
-        if api_result.tools_called != local_result.tools_called:
+        if api_result.data_object.tools_called != local_result.data_object.tools_called:
             raise ValueError("The API and local results are not aligned.")
-        if api_result.expected_tools != local_result.expected_tools:
+        if api_result.data_object.expected_tools != local_result.data_object.expected_tools:
             raise ValueError("The API and local results are not aligned.")
 
 
@@ -422,23 +424,7 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_error
 
         # Convert the response data to `ScoringResult` objects
         debug("Processing API results")
-
-            with example_logging_context(evaluation_run.examples[idx].timestamp, evaluation_run.examples[idx].example_id):
-                for scorer in judgment_scorers:
-                    debug(f"Processing API result for example {idx} and scorer {scorer.score_type}")
-            # filter for key-value pairs that are used to initialize ScoringResult
-            # there may be some stuff in here that doesn't belong in ScoringResult
-            # TODO: come back and refactor this to have ScoringResult take in **kwargs
-            filtered_result = {k: v for k, v in result.items() if k in ScoringResult.__annotations__}
-
-            # Convert scorers_data dicts to ScorerData objects
-            if "scorers_data" in filtered_result and filtered_result["scorers_data"]:
-                filtered_result["scorers_data"] = [
-                    ScorerData(**scorer_dict)
-                    for scorer_dict in filtered_result["scorers_data"]
-                ]
-
-            api_results.append(ScoringResult(**filtered_result))
+        api_results = [ScoringResult(**result) for result in response_data["results"]]
     # Run local evals
     if local_scorers: # List[JudgevalScorer]
         # We should be removing local scorers soon
@@ -477,7 +463,7 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_error
     # judgment_api_key=evaluation_run.judgment_api_key,
     # organization_id=evaluation_run.organization_id
     # )
-
+    # print(merged_results)
     if evaluation_run.log_results:
         pretty_str = run_with_spinner("Logging Results: ", log_evaluation_results, merged_results, evaluation_run)
         rprint(pretty_str)
@@ -504,15 +490,14 @@ def assert_test(scoring_results: List[ScoringResult]) -> None:
 
         # Create a test case context with all relevant fields
         test_case = {
-            'input': result.input,
-            'actual_output': result.actual_output,
-            'expected_output': result.expected_output,
-            'context': result.context,
-            'retrieval_context': result.retrieval_context,
-            'additional_metadata': result.additional_metadata,
-            'tools_called': result.tools_called,
-            'expected_tools': result.expected_tools,
-            'eval_run_name': result.eval_run_name,
+            'input': result.data_object.input,
+            'actual_output': result.data_object.actual_output,
+            'expected_output': result.data_object.expected_output,
+            'context': result.data_object.context,
+            'retrieval_context': result.data_object.retrieval_context,
+            'additional_metadata': result.data_object.additional_metadata,
+            'tools_called': result.data_object.tools_called,
+            'expected_tools': result.data_object.expected_tools,
             'failed_scorers': []
         }
         if result.scorers_data:
@@ -533,7 +518,6 @@ def assert_test(scoring_results: List[ScoringResult]) -> None:
             error_msg += f"Additional Metadata: {fail_case['additional_metadata']}\n"
             error_msg += f"Tools Called: {fail_case['tools_called']}\n"
             error_msg += f"Expected Tools: {fail_case['expected_tools']}\n"
-            error_msg += f"Eval Run Name: {fail_case['eval_run_name']}\n"
 
             for fail_scorer in fail_case['failed_scorers']:
 
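Because ScoringResult now nests the example under data_object, run_eval can parse each API result dict in a single ScoringResult(**result) call and let pydantic validate the payload, replacing the old field-filtering loop. A hedged sketch of that parsing path; the payload below is illustrative and assumes the nested data_object dict validates into an Example:

from judgeval.data import ScoringResult

# Illustrative payload shaped like the `response_data["results"]` entries parsed above.
response_data = {
    "results": [
        {
            "success": True,
            "scorers_data": None,
            "name": "arithmetic-check",
            "data_object": {"input": "What is 2+2?", "actual_output": "4"},
        }
    ]
}

# Mirrors the new one-liner in run_eval.
api_results = [ScoringResult(**result) for result in response_data["results"]]
print(api_results[0].data_object.input)  # "What is 2+2?" (assuming pydantic coerces the dict into an Example)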