judgeval 0.2.0__py3-none-any.whl → 0.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/common/api/api.py +38 -7
- judgeval/common/api/constants.py +9 -1
- judgeval/common/storage/s3_storage.py +2 -3
- judgeval/common/tracer/core.py +66 -32
- judgeval/common/tracer/otel_span_processor.py +4 -50
- judgeval/common/tracer/span_transformer.py +16 -10
- judgeval/common/utils.py +46 -38
- judgeval/constants.py +2 -0
- judgeval/data/example.py +9 -37
- judgeval/data/judgment_types.py +23 -45
- judgeval/data/result.py +8 -14
- judgeval/data/scripts/openapi_transform.py +5 -5
- judgeval/data/trace.py +3 -4
- judgeval/dataset.py +192 -0
- judgeval/evaluation_run.py +1 -0
- judgeval/judges/litellm_judge.py +2 -2
- judgeval/judges/mixture_of_judges.py +6 -6
- judgeval/judges/together_judge.py +6 -3
- judgeval/judgment_client.py +9 -71
- judgeval/run_evaluation.py +41 -9
- judgeval/scorers/score.py +11 -7
- judgeval/scorers/utils.py +3 -3
- judgeval/utils/file_utils.py +40 -25
- {judgeval-0.2.0.dist-info → judgeval-0.3.1.dist-info}/METADATA +10 -6
- {judgeval-0.2.0.dist-info → judgeval-0.3.1.dist-info}/RECORD +27 -29
- judgeval/data/datasets/__init__.py +0 -4
- judgeval/data/datasets/dataset.py +0 -341
- judgeval/data/datasets/eval_dataset_client.py +0 -214
- {judgeval-0.2.0.dist-info → judgeval-0.3.1.dist-info}/WHEEL +0 -0
- {judgeval-0.2.0.dist-info → judgeval-0.3.1.dist-info}/licenses/LICENSE.md +0 -0
judgeval/dataset.py
ADDED
@@ -0,0 +1,192 @@
+import datetime
+import orjson
+import os
+import yaml
+from dataclasses import dataclass
+from typing import List, Literal, Optional
+
+from judgeval.data import Example, Trace
+from judgeval.utils.file_utils import get_examples_from_yaml, get_examples_from_json
+from judgeval.common.api.api import JudgmentApiClient
+from judgeval.common.logger import judgeval_logger
+
+
+@dataclass
+class Dataset:
+    examples: List[Example]
+    traces: List[Trace]
+    name: str
+    project_name: str
+    judgment_api_key: str = os.getenv("JUDGMENT_API_KEY") or ""
+    organization_id: str = os.getenv("JUDGMENT_ORG_ID") or ""
+
+    @classmethod
+    def get(
+        cls,
+        name: str,
+        project_name: str,
+    ):
+        client = JudgmentApiClient(cls.judgment_api_key, cls.organization_id)
+        dataset = client.pull_dataset(name, project_name)
+        if not dataset:
+            judgeval_logger.error(f"Dataset {name} not found in project {project_name}")
+            raise ValueError(f"Dataset {name} not found in project {project_name}")
+        examples = dataset.get("examples", [])
+        for e in examples:
+            if isinstance(e, dict) and isinstance(e.get("data"), dict):
+                e.update(e.pop("data"))
+        return cls(
+            name=name,
+            project_name=project_name,
+            examples=[Example(**e) for e in examples],
+            traces=[Trace(**t) for t in dataset.get("traces", [])],
+        )
+
+    @classmethod
+    def create(
+        cls,
+        name: str,
+        project_name: str,
+        examples: Optional[List[Example]] = None,
+        traces: Optional[List[Trace]] = None,
+        overwrite: bool = False,
+    ):
+        if examples and traces:
+            raise ValueError("Only one of examples or traces must be provided")
+
+        if not examples:
+            examples = []
+
+        if not traces:
+            traces = []
+
+        client = JudgmentApiClient(cls.judgment_api_key, cls.organization_id)
+        client.push_dataset(
+            name,
+            project_name,
+            examples=[e.model_dump() for e in examples],
+            traces=[t.model_dump() for t in traces],
+            overwrite=overwrite,
+        )
+        return cls(
+            name=name,
+            project_name=project_name,
+            examples=examples,
+            traces=traces,
+        )
+
+    def add_from_json(self, file_path: str) -> None:
+        """
+        Adds examples from a JSON file.
+
+        The JSON file is expected to have the following format:
+        [
+            {
+                "key_01": "value_01",
+                "key_02": "value_02"
+            },
+            {
+                "key_11": "value_11",
+                "key_12": "value_12",
+                "key_13": "value_13"
+            },
+            ...
+        ]
+        """
+        examples = get_examples_from_json(file_path)
+        self.add_examples(examples)
+
+    def add_from_yaml(self, file_path: str) -> None:
+        """
+        Adds examples from a YAML file.
+
+        The YAML file is expected to have the following format:
+        - key_01: value_01
+          key_02: value_02
+        - key_11: value_11
+          key_12: value_12
+          key_13: value_13
+        ...
+        """
+
+        examples = get_examples_from_yaml(file_path)
+        self.add_examples(examples)
+
+    def add_examples(self, examples: List[Example]) -> None:
+        client = JudgmentApiClient(self.judgment_api_key, self.organization_id)
+        client.append_examples(
+            dataset_alias=self.name,
+            project_name=self.project_name,
+            examples=[e.model_dump() for e in examples],
+        )
+
+    def add_traces(self, traces: List[Trace]) -> None:
+        client = JudgmentApiClient(self.judgment_api_key, self.organization_id)
+        client.append_traces(
+            dataset_alias=self.name,
+            project_name=self.project_name,
+            traces=[t.model_dump() for t in traces],
+        )
+
+    def save_as(
+        self,
+        file_type: Literal["json", "yaml"],
+        dir_path: str,
+        save_name: str | None = None,
+    ) -> None:
+        """
+        Saves the dataset as a file. Save only the examples.
+
+        Args:
+            file_type (Literal["json", "csv"]): The file type to save the dataset as.
+            dir_path (str): The directory path to save the file to.
+            save_name (str, optional): The name of the file to save. Defaults to None.
+        """
+        if not os.path.exists(dir_path):
+            os.makedirs(dir_path)
+        file_name = (
+            datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
+            if save_name is None
+            else save_name
+        )
+        complete_path = os.path.join(dir_path, f"{file_name}.{file_type}")
+        if file_type == "json":
+            with open(complete_path, "wb") as file:
+                file.write(
+                    orjson.dumps(
+                        {
+                            "examples": [e.to_dict() for e in self.examples],
+                        },
+                        option=orjson.OPT_INDENT_2,
+                    )
+                )
+        elif file_type == "yaml":
+            with open(complete_path, "w") as file:
+                yaml_data = {
+                    "examples": [e.to_dict() for e in self.examples],
+                }
+                yaml.dump(yaml_data, file, default_flow_style=False)
+        else:
+            ACCEPTABLE_FILE_TYPES = ["json", "yaml"]
+            raise TypeError(
+                f"Invalid file type: {file_type}. Please choose from {ACCEPTABLE_FILE_TYPES}"
+            )
+
+    def delete(self):
+        client = JudgmentApiClient(self.judgment_api_key, self.organization_id)
+        client.delete_dataset(self.name, self.project_name)
+
+    def __iter__(self):
+        return iter(self.examples)
+
+    def __len__(self):
+        return len(self.examples)
+
+    def __str__(self):
+        return (
+            f"{self.__class__.__name__}("
+            f"examples={self.examples}, "
+            f"traces={self.traces}, "
+            f"name={self.name}"
+            f")"
+        )
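Taken together, the new judgeval/dataset.py replaces the EvalDataset / EvalDatasetClient pair removed further down in this diff. A minimal usage sketch of the new class, assuming JUDGMENT_API_KEY and JUDGMENT_ORG_ID are set in the environment and that Example accepts arbitrary keyword fields (the field names below are illustrative, not part of the package):

from judgeval.dataset import Dataset
from judgeval.data import Example

# Push a small dataset to the Judgment platform, then pull it back later.
examples = [
    Example(input="What is 2 + 2?", expected_output="4"),
    Example(input="Capital of France?", expected_output="Paris"),
]
dataset = Dataset.create(
    name="qa_smoke_test",
    project_name="default_project",
    examples=examples,
)

dataset = Dataset.get(name="qa_smoke_test", project_name="default_project")
dataset.add_from_json("more_examples.json")  # or dataset.add_from_yaml("more_examples.yaml")
dataset.save_as("json", dir_path="./exports", save_name="qa_smoke_test")
print(len(dataset), "examples")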
judgeval/evaluation_run.py
CHANGED
@@ -36,6 +36,7 @@ class EvaluationRun(BaseModel):
         data["scorers"] = [
             scorer.model_dump() for scorer in self.scorers
         ]  # Pydantic has problems with properly calling model_dump() on the scorers, so we need to do it manually
+        data["examples"] = [example.model_dump() for example in self.examples]

         return data

judgeval/judges/litellm_judge.py
CHANGED
@@ -22,7 +22,7 @@ class LiteLLMJudge(JudgevalJudge):
     def generate(
         self,
         input: Union[str, List[Mapping[str, str]]],
-        schema: pydantic.BaseModel = None,
+        schema: Union[pydantic.BaseModel, None] = None,
     ) -> str:
         if isinstance(input, str):
             convo = BASE_CONVERSATION + [{"role": "user", "content": input}]
@@ -42,7 +42,7 @@ class LiteLLMJudge(JudgevalJudge):
     async def a_generate(
         self,
         input: Union[str, List[Mapping[str, str]]],
-        schema: pydantic.BaseModel = None,
+        schema: Union[pydantic.BaseModel, None] = None,
     ) -> str:
         if isinstance(input, str):
             convo = BASE_CONVERSATION + [{"role": "user", "content": input}]
judgeval/judges/mixture_of_judges.py
CHANGED
@@ -18,8 +18,8 @@ from judgeval.common.logger import judgeval_logger

 def build_dynamic_mixture_prompt(
     judge_responses: List[str],
-    custom_system_prompt: str
-    custom_conversation_history: List[dict]
+    custom_system_prompt: Union[str, None] = None,
+    custom_conversation_history: Union[List[dict], None] = None,
 ) -> List[dict]:
     """
     Dynamically builds a prompt to mix judge responses together for the Mixture of Judges model.
@@ -178,8 +178,8 @@ class MixtureOfJudges(JudgevalJudge):
     def generate(
         self,
         input: Union[str, List[dict]],
-        response_schema: pydantic.BaseModel = None,
-        aggregation_schema: pydantic.BaseModel = None,
+        response_schema: Union[pydantic.BaseModel, None] = None,
+        aggregation_schema: Union[pydantic.BaseModel, None] = None,
         **kwargs,
     ) -> str:
         """
@@ -230,8 +230,8 @@ class MixtureOfJudges(JudgevalJudge):
     async def a_generate(
         self,
         input: Union[str, List[dict]],
-        response_schema: pydantic.BaseModel = None,
-        aggregation_schema: pydantic.BaseModel = None,
+        response_schema: Union[pydantic.BaseModel, None] = None,
+        aggregation_schema: Union[pydantic.BaseModel, None] = None,
         **kwargs,
     ) -> str:
         """
judgeval/judges/together_judge.py
CHANGED
@@ -11,6 +11,7 @@ from judgeval.common.utils import (
     afetch_together_api_response,
 )
 from judgeval.common.logger import judgeval_logger
+from judgeval.constants import DEFAULT_TOGETHER_MODEL

 BASE_CONVERSATION = [
     {"role": "system", "content": "You are a helpful assistant."},
@@ -18,13 +19,15 @@ BASE_CONVERSATION = [


 class TogetherJudge(JudgevalJudge):
-    def __init__(self, model: str =
+    def __init__(self, model: str = DEFAULT_TOGETHER_MODEL, **kwargs):
         self.model = model
         self.kwargs = kwargs
         super().__init__(model_name=model)

     # TODO: Fix cost for generate and a_generate
-    def generate(
+    def generate(
+        self, input: Union[str, List[dict]], schema: Union[BaseModel, None] = None
+    ) -> str:
         if isinstance(input, str):
             convo = BASE_CONVERSATION + [{"role": "user", "content": input}]
             return fetch_together_api_response(
@@ -40,7 +43,7 @@ class TogetherJudge(JudgevalJudge):
            raise TypeError("Input must be a string or a list of dictionaries.")

     async def a_generate(
-        self, input: Union[str, List[dict]], schema: BaseModel = None
+        self, input: Union[str, List[dict]], schema: Union[BaseModel, None] = None
     ) -> str:
         if isinstance(input, str):
             convo = BASE_CONVERSATION + [{"role": "user", "content": input}]
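With DEFAULT_TOGETHER_MODEL now imported from judgeval.constants, the judge can be constructed without naming a model. A quick sketch, assuming the Together credentials that fetch_together_api_response relies on are already configured:

from judgeval.judges.together_judge import TogetherJudge

judge = TogetherJudge()  # falls back to DEFAULT_TOGETHER_MODEL from judgeval.constants
reply = judge.generate("Summarize the 0.3.1 dataset changes in one sentence.")
print(reply)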
judgeval/judgment_client.py
CHANGED
@@ -6,7 +6,6 @@ import os
 from uuid import uuid4
 from typing import Optional, List, Dict, Any, Union, Callable

-from judgeval.data.datasets import EvalDataset, EvalDatasetClient
 from judgeval.data import (
     ScoringResult,
     Example,
@@ -25,11 +24,11 @@ from judgeval.run_evaluation import (
 from judgeval.data.trace_run import TraceRun
 from judgeval.common.api import JudgmentApiClient
 from judgeval.common.exceptions import JudgmentAPIError
-from langchain_core.callbacks import BaseCallbackHandler
 from judgeval.common.tracer import Tracer
 from judgeval.common.utils import validate_api_key
 from pydantic import BaseModel
 from judgeval.common.logger import judgeval_logger
+from judgeval.integrations.langgraph import JudgevalCallbackHandler


 class EvalRunRequestBody(BaseModel):
@@ -71,7 +70,6 @@ class JudgmentClient(metaclass=SingletonMeta):
         self.judgment_api_key = api_key
         self.organization_id = organization_id
         self.api_client = JudgmentApiClient(api_key, organization_id)
-        self.eval_dataset_client = EvalDatasetClient(api_key, organization_id)

         # Verify API key is valid
         result, response = validate_api_key(api_key)
@@ -86,7 +84,7 @@ class JudgmentClient(metaclass=SingletonMeta):
         scorers: List[Union[APIScorerConfig, BaseScorer]],
         examples: Optional[List[Example]] = None,
         function: Optional[Callable] = None,
-        tracer: Optional[Union[Tracer,
+        tracer: Optional[Union[Tracer, JudgevalCallbackHandler]] = None,
         traces: Optional[List[Trace]] = None,
         tools: Optional[List[Dict[str, Any]]] = None,
         project_name: str = "default_project",
@@ -178,70 +176,6 @@ class JudgmentClient(metaclass=SingletonMeta):
         except Exception as e:
             raise Exception(f"An unexpected error occurred during evaluation: {str(e)}")

-    def create_dataset(self) -> EvalDataset:
-        return self.eval_dataset_client.create_dataset()
-
-    def push_dataset(
-        self,
-        alias: str,
-        dataset: EvalDataset,
-        project_name: str,
-        overwrite: Optional[bool] = False,
-    ) -> bool:
-        """
-        Uploads an `EvalDataset` to the Judgment platform for storage.
-
-        Args:
-            alias (str): The name to use for the dataset
-            dataset (EvalDataset): The dataset to upload to Judgment
-            overwrite (Optional[bool]): Whether to overwrite the dataset if it already exists
-
-        Returns:
-            bool: Whether the dataset was successfully uploaded
-        """
-        # Set judgment_api_key just in case it was not set
-        dataset.judgment_api_key = self.judgment_api_key
-        return self.eval_dataset_client.push(dataset, alias, project_name, overwrite)
-
-    def append_dataset(
-        self, alias: str, examples: List[Example], project_name: str
-    ) -> bool:
-        """
-        Appends an `EvalDataset` to the Judgment platform for storage.
-        """
-        return self.eval_dataset_client.append_examples(alias, examples, project_name)
-
-    def pull_dataset(self, alias: str, project_name: str) -> EvalDataset:
-        """
-        Retrieves a saved `EvalDataset` from the Judgment platform.
-
-        Args:
-            alias (str): The name of the dataset to retrieve
-
-        Returns:
-            EvalDataset: The retrieved dataset
-        """
-        return self.eval_dataset_client.pull(alias, project_name)
-
-    def delete_dataset(self, alias: str, project_name: str) -> bool:
-        """
-        Deletes a saved `EvalDataset` from the Judgment platform.
-        """
-        return self.eval_dataset_client.delete(alias, project_name)
-
-    def pull_project_dataset_stats(self, project_name: str) -> dict:
-        """
-        Retrieves all dataset stats from the Judgment platform for the project.
-
-        Args:
-            project_name (str): The name of the project to retrieve
-
-        Returns:
-            dict: The retrieved dataset stats
-        """
-        return self.eval_dataset_client.pull_project_dataset_stats(project_name)
-
-    # Maybe add option where you can pass in the EvaluationRun object and it will pull the eval results from the backend
     def pull_eval(
         self, project_name: str, eval_run_name: str
     ) -> List[Dict[str, Union[str, List[ScoringResult]]]]:
@@ -262,8 +196,12 @@ class JudgmentClient(metaclass=SingletonMeta):
         """
         Creates a project on the server.
         """
-
-
+        try:
+            self.api_client.create_project(project_name)
+            return True
+        except Exception as e:
+            judgeval_logger.error(f"Error creating project: {e}")
+            return False

     def delete_project(self, project_name: str) -> bool:
         """
@@ -314,7 +252,7 @@ class JudgmentClient(metaclass=SingletonMeta):
         scorers: List[Union[APIScorerConfig, BaseScorer]],
         examples: Optional[List[Example]] = None,
         function: Optional[Callable] = None,
-        tracer: Optional[Union[Tracer,
+        tracer: Optional[Union[Tracer, JudgevalCallbackHandler]] = None,
         traces: Optional[List[Trace]] = None,
         tools: Optional[List[Dict[str, Any]]] = None,
         model: Optional[str] = "gpt-4.1",
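Callers of the removed JudgmentClient dataset helpers can switch to the Dataset class added in judgeval/dataset.py above. A rough migration sketch (the old calls are taken from the removed code; the new ones assume the Dataset API shown earlier in this diff, and the example fields are illustrative):

from judgeval.dataset import Dataset
from judgeval.data import Example

new_examples = [Example(input="q", expected_output="a")]  # illustrative fields

# 0.2.0: dataset operations went through JudgmentClient.
# client = JudgmentClient()
# dataset = client.pull_dataset(alias="qa_smoke_test", project_name="default_project")
# client.append_dataset(alias="qa_smoke_test", examples=new_examples, project_name="default_project")
# client.delete_dataset(alias="qa_smoke_test", project_name="default_project")

# 0.3.1: use Dataset directly.
dataset = Dataset.get(name="qa_smoke_test", project_name="default_project")
dataset.add_examples(new_examples)  # replaces JudgmentClient.append_dataset
dataset.delete()                    # replaces JudgmentClient.delete_dataset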
|
judgeval/run_evaluation.py
CHANGED
@@ -1,7 +1,7 @@
 import asyncio
 import concurrent.futures
 import time
-import
+import orjson
 import sys
 import threading
 from typing import List, Dict, Union, Optional, Callable, Tuple, Any
@@ -20,7 +20,7 @@ from judgeval.common.logger import judgeval_logger
 from judgeval.evaluation_run import EvaluationRun
 from judgeval.data.trace_run import TraceRun
 from judgeval.common.tracer import Tracer
-from
+from judgeval.integrations.langgraph import JudgevalCallbackHandler


 def safe_run_async(coro):
@@ -191,6 +191,24 @@ def check_eval_run_name_exists(
         raise JudgmentAPIError(f"Failed to check if eval run name exists: {str(e)}")


+def check_example_keys(
+    keys: List[str],
+    eval_name: str,
+    project_name: str,
+    judgment_api_key: str,
+    organization_id: str,
+) -> None:
+    """
+    Checks if the current experiment (if one exists) has the same keys for example
+    """
+    api_client = JudgmentApiClient(judgment_api_key, organization_id)
+    try:
+        api_client.check_example_keys(keys, eval_name, project_name)
+    except Exception as e:
+        judgeval_logger.error(f"Failed to check if example keys match: {str(e)}")
+        raise JudgmentAPIError(f"Failed to check if example keys match: {str(e)}")
+
+
 def log_evaluation_results(
     scoring_results: List[ScoringResult],
     run: Union[EvaluationRun, TraceRun],
@@ -245,7 +263,9 @@ def check_examples(
                 f"[yellow]⚠️ WARNING:[/yellow] Example is missing required parameters for scorer [bold]{scorer.score_type.value}[/bold]"
             )
             rprint(f"Missing parameters: {', '.join(missing_params)}")
-            rprint(
+            rprint(
+                f"Example: {orjson.dumps(example.model_dump(), option=orjson.OPT_INDENT_2).decode('utf-8')}"
+            )
             rprint("-" * 40)
             prompt_user = True

@@ -262,7 +282,7 @@ def run_trace_eval(
     judgment_api_key: str,
     override: bool = False,
     function: Optional[Callable] = None,
-    tracer: Optional[Union[Tracer,
+    tracer: Optional[Union[Tracer, JudgevalCallbackHandler]] = None,
     examples: Optional[List[Example]] = None,
 ) -> List[ScoringResult]:
     # Call endpoint to check to see if eval run name exists (if we DON'T want to override and DO want to log results)
@@ -394,7 +414,7 @@ def _poll_evaluation_until_complete(
     expected_scorer_data_count: int,
     poll_interval_seconds: float = 5,
     max_failures: int = 5,
-    max_poll_count: int =
+    max_poll_count: int = 60,  # This should be equivalent to 5 minutes
 ) -> Tuple[List[ScoringResult], str]:
     """
     Polls until the evaluation is complete and returns the results.
@@ -500,6 +520,14 @@ def run_eval(
     Returns:
         List[ScoringResult]: A list of ScoringResult objects
     """
+    # Check that every example has the same keys
+    keys = evaluation_run.examples[0].get_fields().keys()
+    for example in evaluation_run.examples:
+        current_keys = example.get_fields().keys()
+        if current_keys != keys:
+            raise ValueError(
+                f"All examples must have the same keys: {current_keys} != {keys}"
+            )

     # Call endpoint to check to see if eval run name exists (if we DON'T want to override and DO want to log results)
     if not override and not evaluation_run.append:
@@ -520,9 +548,14 @@ def run_eval(
         False,
     )

-
-
-
+    # Ensure that current experiment (if one exists) has the same keys for example
+    check_example_keys(
+        keys=list(keys),
+        eval_name=evaluation_run.eval_name,
+        project_name=evaluation_run.project_name,
+        judgment_api_key=judgment_api_key,
+        organization_id=evaluation_run.organization_id,
+    )

     judgment_scorers: List[APIScorerConfig] = []
     local_scorers: List[BaseScorer] = []
@@ -601,7 +634,6 @@ def run_eval(
     send_results = [
         scoring_result.model_dump(warnings=False) for scoring_result in results
     ]
-
     url = log_evaluation_results(send_results, evaluation_run, judgment_api_key)
     rprint(
         f"\n🔍 You can view your evaluation results here: [rgb(106,0,255)][link={url}]View Results[/link]\n"
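The new key check in run_eval means every Example in a run must expose the same set of fields before any scoring starts. A small illustration, assuming Example accepts arbitrary keyword fields and reports them via get_fields() as the added code suggests:

from judgeval.data import Example

consistent = [
    Example(input="q1", expected_output="a1"),
    Example(input="q2", expected_output="a2"),
]  # same keys on every example: accepted

mismatched = [
    Example(input="q1", expected_output="a1"),
    Example(input="q2"),  # missing expected_output
]  # run_eval now raises ValueError("All examples must have the same keys: ...")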
judgeval/scorers/score.py
CHANGED
@@ -30,15 +30,19 @@ async def safe_a_score_example(
     Args:
         scorer (BaseScorer): The `BaseScorer` to use for scoring the example.
         example (Example): The `Example` to be scored.
-
-        ignore_errors (bool): Whether to ignore errors during the evaluation.
-            If set to false, any error will be raised and stop the evaluation.
-            If set to true, the error will be stored in the `error` attribute of the `BaseScorer` and the `success` attribute will be set to False.
-
-        skip_on_missing_params (bool): Whether to skip the test case if required parameters are missing.
     """
     try:
-
+        score = await scorer.a_score_example(example)
+        if score is None:
+            raise Exception("a_score_example need to return a score")
+        elif score < 0:
+            judgeval_logger.warning("score cannot be less than 0 , setting to 0")
+            score = 0
+        elif score > 1:
+            judgeval_logger.warning("score cannot be greater than 1 , setting to 1")
+            score = 1
+        else:
+            scorer.score = score
         scorer.success = scorer.success_check()
     except Exception as e:
         judgeval_logger.error(f"Error during scoring: {str(e)}")
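The rewritten safe_a_score_example now requires a_score_example to return a numeric score and clamps out-of-range values into [0, 1], logging a warning when it does. A standalone restatement of that rule (illustrative only, not the library API):

def clamp_score(score):
    # Mirrors the checks added to safe_a_score_example: None is an error,
    # values outside [0, 1] are clamped.
    if score is None:
        raise ValueError("a_score_example must return a score")
    return max(0, min(1, score))

assert clamp_score(1.7) == 1
assert clamp_score(-0.2) == 0
assert clamp_score(0.35) == 0.35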
judgeval/scorers/utils.py
CHANGED
@@ -4,7 +4,7 @@ Util functions for Scorer objects

 import asyncio
 import nest_asyncio
-import
+import orjson
 import re
 from typing import List, Optional

@@ -48,8 +48,8 @@ def parse_response_json(llm_response: str, scorer: Optional[BaseScorer] = None)
     )  # Remove trailing comma if present

     try:
-        return
-    except
+        return orjson.loads(json_str)
+    except orjson.JSONDecodeError:
         error_str = "Evaluation LLM outputted an invalid JSON. Please use a stronger evaluation model."
         if scorer is not None:
             scorer.error = error_str
judgeval/utils/file_utils.py
CHANGED
@@ -1,4 +1,5 @@
 import yaml
+import orjson
 from typing import List
 from judgeval.common.logger import judgeval_logger

@@ -9,37 +10,19 @@ def get_examples_from_yaml(file_path: str) -> List[Example] | None:
     """
     Adds examples from a YAML file.

-    The format of the YAML file is expected to be a dictionary with one key: "examples".
-    The value of the key is a list of dictionaries, where each dictionary represents an example.
-
     The YAML file is expected to have the following format:
-
-
-
-
-
-
-        - "context2"
-    retrieval_context:
-        - "retrieval1"
-    additional_metadata:
-        key: "value"
-    tools_called:
-        - "tool1"
-    expected_tools:
-        - {tool_name: "tool1", parameters: {"query": "test query 1"}}
-        - {tool_name: "tool2", parameters: {"query": "test query 2"}}
-    name: "test example"
-    example_id: null
-    timestamp: "20241230_160117"
-    trace_id: "123"
+    - key_01: value_01
+      key_02: value_02
+    - key_11: value_11
+      key_12: value_12
+      key_13: value_13
+    ...
     """
     try:
         with open(file_path, "r") as file:
             payload = yaml.safe_load(file)
         if payload is None:
             raise ValueError("The YAML file is empty.")
-        examples = payload.get("examples", [])
     except FileNotFoundError:
         judgeval_logger.error(f"YAML file not found: {file_path}")
         raise FileNotFoundError(f"The file {file_path} was not found.")
@@ -47,5 +30,37 @@ def get_examples_from_yaml(file_path: str) -> List[Example] | None:
         judgeval_logger.error(f"Invalid YAML file: {file_path}")
         raise ValueError(f"The file {file_path} is not a valid YAML file.")

-    new_examples = [Example(**e) for e in
+    new_examples = [Example(**e) for e in payload]
+    return new_examples
+
+
+def get_examples_from_json(file_path: str) -> List[Example] | None:
+    """
+    Adds examples from a JSON file.
+
+    The JSON file is expected to have the following format:
+    [
+        {
+            "key_01": "value_01",
+            "key_02": "value_02"
+        },
+        {
+            "key_11": "value_11",
+            "key_12": "value_12",
+            "key_13": "value_13"
+        },
+        ...
+    ]
+    """
+    try:
+        with open(file_path, "rb") as file:
+            payload = orjson.loads(file.read())
+    except FileNotFoundError:
+        judgeval_logger.error(f"JSON file not found: {file_path}")
+        raise FileNotFoundError(f"The file {file_path} was not found.")
+    except orjson.JSONDecodeError:
+        judgeval_logger.error(f"Invalid JSON file: {file_path}")
+        raise ValueError(f"The file {file_path} is not a valid JSON file.")
+
+    new_examples = [Example(**e) for e in payload]
     return new_examples
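The loaders now expect a flat list of records rather than the old "examples:"-keyed layout. A minimal sketch of matching input files and how they might be loaded, assuming field names like input and expected_output are acceptable Example fields (these helpers are also what Dataset.add_from_json / add_from_yaml call):

# examples.json
# [
#     {"input": "What is 2 + 2?", "expected_output": "4"},
#     {"input": "Capital of France?", "expected_output": "Paris"}
# ]
#
# examples.yaml
# - input: What is 2 + 2?
#   expected_output: "4"
# - input: Capital of France?
#   expected_output: Paris

from judgeval.utils.file_utils import get_examples_from_json, get_examples_from_yaml

json_examples = get_examples_from_json("examples.json")
yaml_examples = get_examples_from_yaml("examples.yaml")
print(len(json_examples), len(yaml_examples))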