judgeval 0.0.11__py3-none-any.whl → 0.22.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of judgeval might be problematic.

Files changed (171)
  1. judgeval/__init__.py +177 -12
  2. judgeval/api/__init__.py +519 -0
  3. judgeval/api/api_types.py +407 -0
  4. judgeval/cli.py +79 -0
  5. judgeval/constants.py +76 -47
  6. judgeval/data/__init__.py +3 -3
  7. judgeval/data/evaluation_run.py +125 -0
  8. judgeval/data/example.py +15 -56
  9. judgeval/data/judgment_types.py +450 -0
  10. judgeval/data/result.py +29 -73
  11. judgeval/data/scorer_data.py +29 -62
  12. judgeval/data/scripts/fix_default_factory.py +23 -0
  13. judgeval/data/scripts/openapi_transform.py +123 -0
  14. judgeval/data/trace.py +121 -0
  15. judgeval/dataset/__init__.py +264 -0
  16. judgeval/env.py +52 -0
  17. judgeval/evaluation/__init__.py +344 -0
  18. judgeval/exceptions.py +27 -0
  19. judgeval/integrations/langgraph/__init__.py +13 -0
  20. judgeval/integrations/openlit/__init__.py +50 -0
  21. judgeval/judges/__init__.py +2 -3
  22. judgeval/judges/base_judge.py +2 -3
  23. judgeval/judges/litellm_judge.py +100 -20
  24. judgeval/judges/together_judge.py +101 -20
  25. judgeval/judges/utils.py +20 -24
  26. judgeval/logger.py +62 -0
  27. judgeval/prompt/__init__.py +330 -0
  28. judgeval/scorers/__init__.py +18 -25
  29. judgeval/scorers/agent_scorer.py +17 -0
  30. judgeval/scorers/api_scorer.py +45 -41
  31. judgeval/scorers/base_scorer.py +83 -38
  32. judgeval/scorers/example_scorer.py +17 -0
  33. judgeval/scorers/exceptions.py +1 -0
  34. judgeval/scorers/judgeval_scorers/__init__.py +0 -148
  35. judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +19 -17
  36. judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +13 -19
  37. judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +12 -19
  38. judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +13 -19
  39. judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +15 -0
  40. judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +327 -0
  41. judgeval/scorers/score.py +77 -306
  42. judgeval/scorers/utils.py +4 -199
  43. judgeval/tracer/__init__.py +1122 -2
  44. judgeval/tracer/constants.py +1 -0
  45. judgeval/tracer/exporters/__init__.py +40 -0
  46. judgeval/tracer/exporters/s3.py +119 -0
  47. judgeval/tracer/exporters/store.py +59 -0
  48. judgeval/tracer/exporters/utils.py +32 -0
  49. judgeval/tracer/keys.py +63 -0
  50. judgeval/tracer/llm/__init__.py +7 -0
  51. judgeval/tracer/llm/config.py +78 -0
  52. judgeval/tracer/llm/constants.py +9 -0
  53. judgeval/tracer/llm/llm_anthropic/__init__.py +3 -0
  54. judgeval/tracer/llm/llm_anthropic/config.py +6 -0
  55. judgeval/tracer/llm/llm_anthropic/messages.py +452 -0
  56. judgeval/tracer/llm/llm_anthropic/messages_stream.py +322 -0
  57. judgeval/tracer/llm/llm_anthropic/wrapper.py +59 -0
  58. judgeval/tracer/llm/llm_google/__init__.py +3 -0
  59. judgeval/tracer/llm/llm_google/config.py +6 -0
  60. judgeval/tracer/llm/llm_google/generate_content.py +127 -0
  61. judgeval/tracer/llm/llm_google/wrapper.py +30 -0
  62. judgeval/tracer/llm/llm_openai/__init__.py +3 -0
  63. judgeval/tracer/llm/llm_openai/beta_chat_completions.py +216 -0
  64. judgeval/tracer/llm/llm_openai/chat_completions.py +501 -0
  65. judgeval/tracer/llm/llm_openai/config.py +6 -0
  66. judgeval/tracer/llm/llm_openai/responses.py +506 -0
  67. judgeval/tracer/llm/llm_openai/utils.py +42 -0
  68. judgeval/tracer/llm/llm_openai/wrapper.py +63 -0
  69. judgeval/tracer/llm/llm_together/__init__.py +3 -0
  70. judgeval/tracer/llm/llm_together/chat_completions.py +406 -0
  71. judgeval/tracer/llm/llm_together/config.py +6 -0
  72. judgeval/tracer/llm/llm_together/wrapper.py +52 -0
  73. judgeval/tracer/llm/providers.py +19 -0
  74. judgeval/tracer/managers.py +167 -0
  75. judgeval/tracer/processors/__init__.py +220 -0
  76. judgeval/tracer/utils.py +19 -0
  77. judgeval/trainer/__init__.py +14 -0
  78. judgeval/trainer/base_trainer.py +122 -0
  79. judgeval/trainer/config.py +128 -0
  80. judgeval/trainer/console.py +144 -0
  81. judgeval/trainer/fireworks_trainer.py +396 -0
  82. judgeval/trainer/trainable_model.py +243 -0
  83. judgeval/trainer/trainer.py +70 -0
  84. judgeval/utils/async_utils.py +39 -0
  85. judgeval/utils/decorators/__init__.py +0 -0
  86. judgeval/utils/decorators/dont_throw.py +37 -0
  87. judgeval/utils/decorators/use_once.py +13 -0
  88. judgeval/utils/file_utils.py +97 -0
  89. judgeval/utils/guards.py +36 -0
  90. judgeval/utils/meta.py +27 -0
  91. judgeval/utils/project.py +15 -0
  92. judgeval/utils/serialize.py +253 -0
  93. judgeval/utils/testing.py +70 -0
  94. judgeval/utils/url.py +10 -0
  95. judgeval/utils/version_check.py +28 -0
  96. judgeval/utils/wrappers/README.md +3 -0
  97. judgeval/utils/wrappers/__init__.py +15 -0
  98. judgeval/utils/wrappers/immutable_wrap_async.py +74 -0
  99. judgeval/utils/wrappers/immutable_wrap_async_iterator.py +84 -0
  100. judgeval/utils/wrappers/immutable_wrap_sync.py +66 -0
  101. judgeval/utils/wrappers/immutable_wrap_sync_iterator.py +84 -0
  102. judgeval/utils/wrappers/mutable_wrap_async.py +67 -0
  103. judgeval/utils/wrappers/mutable_wrap_sync.py +67 -0
  104. judgeval/utils/wrappers/py.typed +0 -0
  105. judgeval/utils/wrappers/utils.py +35 -0
  106. judgeval/version.py +5 -0
  107. judgeval/warnings.py +4 -0
  108. judgeval-0.22.2.dist-info/METADATA +265 -0
  109. judgeval-0.22.2.dist-info/RECORD +112 -0
  110. judgeval-0.22.2.dist-info/entry_points.txt +2 -0
  111. judgeval/clients.py +0 -39
  112. judgeval/common/__init__.py +0 -8
  113. judgeval/common/exceptions.py +0 -28
  114. judgeval/common/logger.py +0 -189
  115. judgeval/common/tracer.py +0 -798
  116. judgeval/common/utils.py +0 -763
  117. judgeval/data/api_example.py +0 -111
  118. judgeval/data/datasets/__init__.py +0 -5
  119. judgeval/data/datasets/dataset.py +0 -286
  120. judgeval/data/datasets/eval_dataset_client.py +0 -193
  121. judgeval/data/datasets/ground_truth.py +0 -54
  122. judgeval/data/datasets/utils.py +0 -74
  123. judgeval/evaluation_run.py +0 -132
  124. judgeval/judges/mixture_of_judges.py +0 -248
  125. judgeval/judgment_client.py +0 -354
  126. judgeval/run_evaluation.py +0 -439
  127. judgeval/scorers/judgeval_scorer.py +0 -140
  128. judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py +0 -19
  129. judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py +0 -19
  130. judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py +0 -22
  131. judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -19
  132. judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py +0 -32
  133. judgeval/scorers/judgeval_scorers/api_scorers/summarization.py +0 -20
  134. judgeval/scorers/judgeval_scorers/api_scorers/tool_correctness.py +0 -19
  135. judgeval/scorers/judgeval_scorers/classifiers/__init__.py +0 -3
  136. judgeval/scorers/judgeval_scorers/classifiers/text2sql/__init__.py +0 -3
  137. judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py +0 -54
  138. judgeval/scorers/judgeval_scorers/local_implementations/__init__.py +0 -24
  139. judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/__init__.py +0 -4
  140. judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/answer_correctness_scorer.py +0 -277
  141. judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/prompts.py +0 -169
  142. judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/__init__.py +0 -4
  143. judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/answer_relevancy_scorer.py +0 -298
  144. judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/prompts.py +0 -174
  145. judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/__init__.py +0 -3
  146. judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/contextual_precision_scorer.py +0 -264
  147. judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/prompts.py +0 -106
  148. judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/__init__.py +0 -3
  149. judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/contextual_recall_scorer.py +0 -254
  150. judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/prompts.py +0 -142
  151. judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/__init__.py +0 -3
  152. judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/contextual_relevancy_scorer.py +0 -245
  153. judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/prompts.py +0 -121
  154. judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/__init__.py +0 -3
  155. judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/faithfulness_scorer.py +0 -325
  156. judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/prompts.py +0 -268
  157. judgeval/scorers/judgeval_scorers/local_implementations/hallucination/__init__.py +0 -3
  158. judgeval/scorers/judgeval_scorers/local_implementations/hallucination/hallucination_scorer.py +0 -263
  159. judgeval/scorers/judgeval_scorers/local_implementations/hallucination/prompts.py +0 -104
  160. judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/__init__.py +0 -5
  161. judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/json_correctness_scorer.py +0 -134
  162. judgeval/scorers/judgeval_scorers/local_implementations/summarization/__init__.py +0 -3
  163. judgeval/scorers/judgeval_scorers/local_implementations/summarization/prompts.py +0 -247
  164. judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarization_scorer.py +0 -550
  165. judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/__init__.py +0 -3
  166. judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/tool_correctness_scorer.py +0 -157
  167. judgeval/scorers/prompt_scorer.py +0 -439
  168. judgeval-0.0.11.dist-info/METADATA +0 -36
  169. judgeval-0.0.11.dist-info/RECORD +0 -84
  170. {judgeval-0.0.11.dist-info → judgeval-0.22.2.dist-info}/WHEEL +0 -0
  171. {judgeval-0.0.11.dist-info → judgeval-0.22.2.dist-info}/licenses/LICENSE.md +0 -0
judgeval/data/api_example.py
@@ -1,111 +0,0 @@
- from typing import List, Optional, Dict, Any, Union
- from pydantic import BaseModel, Field, ConfigDict, model_validator
-
- from judgeval.data.example import Example
- from judgeval.data.scorer_data import ScorerData
- from judgeval.common.logger import debug, error
-
- class ProcessExample(BaseModel):
-     """
-     ProcessExample is an `Example` object that contains intermediate information
-     about an undergoing evaluation on the original `Example`. It is used purely for
-     internal operations and keeping track of the evaluation process.
-     """
-     name: str
-     input: Optional[str] = None
-     actual_output: Optional[str] = None
-     expected_output: Optional[str] = None
-     context: Optional[list] = None
-     retrieval_context: Optional[list] = None
-     tools_called: Optional[list] = None
-     expected_tools: Optional[list] = None
-
-     # make these optional, not all test cases in a conversation will be evaluated
-     success: Optional[bool] = None
-     scorers_data: Optional[List[ScorerData]] = None
-     run_duration: Optional[float] = None
-     evaluation_cost: Optional[float] = None
-
-     order: Optional[int] = None
-     # These should map 1 to 1 from golden
-     additional_metadata: Optional[Dict] = None
-     comments: Optional[str] = None
-     trace_id: Optional[str] = None
-     model_config = ConfigDict(arbitrary_types_allowed=True)
-
-     def update_scorer_data(self, scorer_data: ScorerData):
-         """
-         Updates scorer data field of test case after the scorers have been
-         evaluated on this test case.
-         """
-         debug(f"Updating scorer data for example '{self.name}' with scorer: {scorer_data}")
-         # self.scorers_data is a list of ScorerData objects that contain the
-         # evaluation results of each scorer on this test case
-         if self.scorers_data is None:
-             self.scorers_data = [scorer_data]
-         else:
-             self.scorers_data.append(scorer_data)
-
-         if self.success is None:
-             # self.success will be None when it is a message
-             # in that case we will be setting success for the first time
-             self.success = scorer_data.success
-         else:
-             if scorer_data.success is False:
-                 debug(f"Example '{self.name}' marked as failed due to scorer: {scorer_data}")
-                 self.success = False
-
-     def update_run_duration(self, run_duration: float):
-         self.run_duration = run_duration
-
-     @model_validator(mode="before")
-     def check_input(cls, values: Dict[str, Any]):
-         input = values.get("input")
-         actual_output = values.get("actual_output")
-
-         if (input is None or actual_output is None):
-             error(f"Validation error: Required fields missing. input={input}, actual_output={actual_output}")
-             raise ValueError(
-                 "'input' and 'actual_output' must be provided."
-             )
-
-         return values
-
-
- def create_process_example(
-     example: Example,
- ) -> ProcessExample:
-     """
-     When an LLM Test Case is executed, we track its progress using an ProcessExample.
-
-     This will track things like the success of the test case, as well as the metadata (such as verdicts and claims in Faithfulness).
-     """
-     success = True
-     if example.name is not None:
-         name = example.name
-     else:
-         name = "Test Case Placeholder"
-         debug(f"No name provided for example, using default name: {name}")
-     order = None
-     scorers_data = []
-
-     debug(f"Creating ProcessExample for: {name}")
-     process_ex = ProcessExample(
-         name=name,
-         input=example.input,
-         actual_output=example.actual_output,
-         expected_output=example.expected_output,
-         context=example.context,
-         retrieval_context=example.retrieval_context,
-         tools_called=example.tools_called,
-         expected_tools=example.expected_tools,
-         success=success,
-         scorers_data=scorers_data,
-         run_duration=None,
-         evaluation_cost=None,
-         order=order,
-         additional_metadata=example.additional_metadata,
-         trace_id=example.trace_id
-     )
-     return process_ex
-
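Note: for orientation, a minimal usage sketch of the removed 0.0.11 module above. The Example fields and the ScorerData type come from elsewhere in the old package; the literal values are illustrative only.

# Sketch only: exercises the ProcessExample helpers removed in this diff (judgeval 0.0.11).
from judgeval.data import Example
from judgeval.data.api_example import create_process_example

# The model validator above requires both `input` and `actual_output`.
example = Example(input="What is 2 + 2?", actual_output="4", expected_output="4")

process_ex = create_process_example(example)   # wraps the Example for progress tracking
process_ex.update_run_duration(0.42)           # seconds spent evaluating this example

# As each scorer finishes, its ScorerData result would be appended; overall
# success flips to False as soon as any scorer reports success=False:
#   process_ex.update_scorer_data(scorer_data)
print(process_ex.success, process_ex.run_duration)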
judgeval/data/datasets/__init__.py
@@ -1,5 +0,0 @@
- from judgeval.data.datasets.dataset import EvalDataset
- from judgeval.data.datasets.ground_truth import GroundTruthExample
- from judgeval.data.datasets.eval_dataset_client import EvalDatasetClient
-
- __all__ = ["EvalDataset", "EvalDatasetClient", "GroundTruthExample"]
judgeval/data/datasets/dataset.py
@@ -1,286 +0,0 @@
- import ast
- import csv
- import datetime
- import json
- from dataclasses import dataclass, field
- import os
- from typing import List, Optional, Union, Literal
-
- from judgeval.data.datasets.ground_truth import GroundTruthExample
- from judgeval.data import Example
- from judgeval.common.logger import debug, error, warning, info
-
- @dataclass
- class EvalDataset:
-     ground_truths: List[GroundTruthExample]
-     examples: List[Example]
-     _alias: Union[str, None] = field(default=None)
-     _id: Union[str, None] = field(default=None)
-     judgment_api_key: str = field(default="")
-
-     def __init__(self,
-                  judgment_api_key: str = os.getenv("JUDGMENT_API_KEY"),
-                  ground_truths: List[GroundTruthExample] = [],
-                  examples: List[Example] = [],
-                  ):
-         debug(f"Initializing EvalDataset with {len(ground_truths)} ground truths and {len(examples)} examples")
-         if not judgment_api_key:
-             warning("No judgment_api_key provided")
-         self.ground_truths = ground_truths
-         self.examples = examples
-         self._alias = None
-         self._id = None
-         self.judgment_api_key = judgment_api_key
-
-
-     def add_from_json(self, file_path: str) -> None:
-         debug(f"Loading dataset from JSON file: {file_path}")
-         """
-         Adds examples and ground truths from a JSON file.
-
-         The format of the JSON file is expected to be a dictionary with two keys: "examples" and "ground_truths".
-         The value of each key is a list of dictionaries, where each dictionary represents an example or ground truth.
-
-         The JSON file is expected to have the following format:
-         {
-             "ground_truths": [
-                 {
-                     "input": "test input",
-                     "actual_output": null,
-                     "expected_output": "expected output",
-                     "context": [
-                         "context1"
-                     ],
-                     "retrieval_context": [
-                         "retrieval1"
-                     ],
-                     "additional_metadata": {
-                         "key": "value"
-                     },
-                     "comments": "test comment",
-                     "tools_called": [
-                         "tool1"
-                     ],
-                     "expected_tools": [
-                         "tool1"
-                     ],
-                     "source_file": "test.py",
-                     "trace_id": "094121"
-                 }
-             ],
-             "examples": [
-                 {
-                     "input": "test input",
-                     "actual_output": "test output",
-                     "expected_output": "expected output",
-                     "context": [
-                         "context1",
-                         "context2"
-                     ],
-                     "retrieval_context": [
-                         "retrieval1"
-                     ],
-                     "additional_metadata": {
-                         "key": "value"
-                     },
-                     "tools_called": [
-                         "tool1"
-                     ],
-                     "expected_tools": [
-                         "tool1",
-                         "tool2"
-                     ],
-                     "name": "test example",
-                     "example_id": null,
-                     "timestamp": "20241230_160117",
-                     "trace_id": "123"
-                 }
-             ]
-         }
-         """
-         try:
-             with open(file_path, "r") as file:
-                 payload = json.load(file)
-                 examples = payload.get("examples", [])
-                 ground_truths = payload.get("ground_truths", [])
-         except FileNotFoundError:
-             error(f"JSON file not found: {file_path}")
-             raise FileNotFoundError(f"The file {file_path} was not found.")
-         except json.JSONDecodeError:
-             error(f"Invalid JSON file: {file_path}")
-             raise ValueError(f"The file {file_path} is not a valid JSON file.")
-
-         info(f"Added {len(examples)} examples and {len(ground_truths)} ground truths from JSON")
-         new_examples = [Example(**e) for e in examples]
-         for e in new_examples:
-             self.add_example(e)
-
-         new_ground_truths = [GroundTruthExample(**g) for g in ground_truths]
-         for g in new_ground_truths:
-             self.add_ground_truth(g)
-
-     def add_from_csv(
-         self,
-         file_path: str,
-     ) -> None:
-         """
-         Add Examples and GroundTruthExamples from a CSV file.
-         """
-         try:
-             import pandas as pd
-         except ModuleNotFoundError:
-             raise ModuleNotFoundError(
-                 "Please install pandas to use this method. 'pip install pandas'"
-             )
-
-         # Pandas naturally reads numbers in data files as ints, not strings (can lead to unexpected behavior)
-         df = pd.read_csv(file_path, dtype={'trace_id': str})
-         """
-         Expect the CSV to have headers
-
-         "input", "actual_output", "expected_output", "context", \
-         "retrieval_context", "additional_metadata", "tools_called", \
-         "expected_tools", "name", "comments", "source_file", "example", \
-         "trace_id"
-
-         We want to collect the examples and ground truths separately which can
-         be determined by the "example" column. If the value is True, then it is an
-         example, otherwise it is a ground truth.
-
-         We also assume that if there are multiple retrieval contexts or contexts, they are separated by semicolons.
-         This can be adjusted using the `context_delimiter` and `retrieval_context_delimiter` parameters.
-         """
-         examples, ground_truths = [], []
-
-         for _, row in df.iterrows():
-             data = {
-                 "input": row["input"],
-                 "actual_output": row["actual_output"] if pd.notna(row["actual_output"]) else None,
-                 "expected_output": row["expected_output"] if pd.notna(row["expected_output"]) else None,
-                 "context": row["context"].split(";") if pd.notna(row["context"]) else [],
-                 "retrieval_context": row["retrieval_context"].split(";") if pd.notna(row["retrieval_context"]) else [],
-                 "additional_metadata": ast.literal_eval(row["additional_metadata"]) if pd.notna(row["additional_metadata"]) else dict(),
-                 "tools_called": row["tools_called"].split(";") if pd.notna(row["tools_called"]) else [],
-                 "expected_tools": row["expected_tools"].split(";") if pd.notna(row["expected_tools"]) else [],
-                 "trace_id": row["trace_id"] if pd.notna(row["trace_id"]) else None
-             }
-             if row["example"]:
-                 data["name"] = row["name"] if pd.notna(row["name"]) else None
-                 # every Example has `input` and `actual_output` fields
-                 if data["input"] is not None and data["actual_output"] is not None:
-                     e = Example(**data)
-                     examples.append(e)
-                 else:
-                     raise ValueError("Every example must have an 'input' and 'actual_output' field.")
-             else:
-                 # GroundTruthExample has `comments` and `source_file` fields
-                 data["comments"] = row["comments"] if pd.notna(row["comments"]) else None
-                 data["source_file"] = row["source_file"] if pd.notna(row["source_file"]) else None
-                 # every GroundTruthExample has `input` field
-                 if data["input"] is not None:
-                     g = GroundTruthExample(**data)
-                     ground_truths.append(g)
-                 else:
-                     raise ValueError("Every ground truth must have an 'input' field.")
-
-         for e in examples:
-             self.add_example(e)
-
-         for g in ground_truths:
-             self.add_ground_truth(g)
-
-     def add_example(self, e: Example) -> None:
-         self.examples = self.examples + [e]
-         # TODO if we need to add rank, then we need to do it here
-
-     def add_ground_truth(self, g: GroundTruthExample) -> None:
-         self.ground_truths = self.ground_truths + [g]
-
-     def save_as(self, file_type: Literal["json", "csv"], dir_path: str, save_name: str = None) -> None:
-         """
-         Saves the dataset as a file. Save both the ground truths and examples.
-
-         Args:
-             file_type (Literal["json", "csv"]): The file type to save the dataset as.
-             dir_path (str): The directory path to save the file to.
-             save_name (str, optional): The name of the file to save. Defaults to None.
-         """
-         if not os.path.exists(dir_path):
-             os.makedirs(dir_path)
-         file_name = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") if save_name is None else save_name
-         complete_path = os.path.join(dir_path, f"{file_name}.{file_type}")
-         if file_type == "json":
-             with open(complete_path, "w") as file:
-                 json.dump(
-                     {
-                         "ground_truths": [g.to_dict() for g in self.ground_truths],
-                         "examples": [e.to_dict() for e in self.examples],
-                     },
-                     file,
-                     indent=4,
-                 )
-         elif file_type == "csv":
-             with open(complete_path, "w", newline="") as file:
-                 writer = csv.writer(file)
-                 writer.writerow([
-                     "input", "actual_output", "expected_output", "context", \
-                     "retrieval_context", "additional_metadata", "tools_called", \
-                     "expected_tools", "name", "comments", "source_file", "example", \
-                     "trace_id"
-                 ])
-                 for e in self.examples:
-                     writer.writerow(
-                         [
-                             e.input,
-                             e.actual_output,
-                             e.expected_output,
-                             ";".join(e.context),
-                             ";".join(e.retrieval_context),
-                             e.additional_metadata,
-                             ";".join(e.tools_called),
-                             ";".join(e.expected_tools),
-                             e.name,
-                             None, # Example does not have comments
-                             None, # Example does not have source file
-                             True, # Adding an Example
-                             e.trace_id
-                         ]
-                     )
-
-                 for g in self.ground_truths:
-                     writer.writerow(
-                         [
-                             g.input,
-                             g.actual_output,
-                             g.expected_output,
-                             ";".join(g.context),
-                             ";".join(g.retrieval_context),
-                             g.additional_metadata,
-                             ";".join(g.tools_called),
-                             ";".join(g.expected_tools),
-                             None, # GroundTruthExample does not have name
-                             g.comments,
-                             g.source_file,
-                             False, # Adding a GroundTruthExample, not an Example
-                             g.trace_id
-                         ]
-                     )
-         else:
-             ACCEPTABLE_FILE_TYPES = ["json", "csv"]
-             raise TypeError(f"Invalid file type: {file_type}. Please choose from {ACCEPTABLE_FILE_TYPES}")
-
-     def __iter__(self):
-         return iter(self.examples)
-
-     def __len__(self):
-         return len(self.examples)
-
-     def __str__(self):
-         return (
-             f"{self.__class__.__name__}("
-             f"ground_truths={self.ground_truths}, "
-             f"examples={self.examples}, "
-             f"_alias={self._alias}, "
-             f"_id={self._id}"
-             f")"
-         )
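Note: a short sketch of the removed EvalDataset workflow, assuming the 0.0.11 layout shown above. The file paths, API key, and example values are placeholders.

# Sketch only: the EvalDataset class removed in this diff (judgeval 0.0.11).
from judgeval.data import Example
from judgeval.data.datasets import EvalDataset

dataset = EvalDataset(judgment_api_key="YOUR_JUDGMENT_API_KEY")  # placeholder key
dataset.add_example(Example(input="test input", actual_output="test output"))

# Bulk-load more records using the JSON layout documented in add_from_json above,
# then persist examples and ground truths together.
dataset.add_from_json("existing_dataset.json")
dataset.save_as("json", dir_path="./datasets", save_name="my_dataset")
print(len(dataset))  # __len__ counts examples only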
judgeval/data/datasets/eval_dataset_client.py
@@ -1,193 +0,0 @@
-
- from typing import Optional
- import requests
- from rich.progress import Progress, SpinnerColumn, TextColumn
-
- from judgeval.common.logger import debug, error, warning, info
- from judgeval.constants import (
-     JUDGMENT_DATASETS_PUSH_API_URL,
-     JUDGMENT_DATASETS_PULL_API_URL,
-     JUDGMENT_DATASETS_PULL_ALL_API_URL
- )
- from judgeval.data import Example
- from judgeval.data.datasets import EvalDataset
- from judgeval.data.datasets.ground_truth import GroundTruthExample
-
-
-
-
- class EvalDatasetClient:
-     def __init__(self, judgment_api_key: str):
-         self.judgment_api_key = judgment_api_key
-
-     def create_dataset(self) -> EvalDataset:
-         return EvalDataset(judgment_api_key=self.judgment_api_key)
-
-     def push(self, dataset: EvalDataset, alias: str,overwrite: Optional[bool] = False) -> bool:
-         debug(f"Pushing dataset with alias '{alias}' (overwrite={overwrite})")
-         if overwrite:
-             warning(f"Overwrite enabled for alias '{alias}'")
-         """
-         Pushes the dataset to Judgment platform
-
-         Mock request:
-         dataset = {
-             "alias": alias,
-             "ground_truths": [...],
-             "examples": [...],
-             "overwrite": overwrite
-         } ==>
-         {
-             "_alias": alias,
-             "_id": "..." # ID of the dataset
-         }
-         """
-         with Progress(
-             SpinnerColumn(style="rgb(106,0,255)"),
-             TextColumn("[progress.description]{task.description}"),
-             transient=False,
-         ) as progress:
-             task_id = progress.add_task(
-                 f"Pushing [rgb(106,0,255)]'{alias}' to Judgment...",
-                 total=100,
-             )
-             content = {
-                 "alias": alias,
-                 "ground_truths": [g.to_dict() for g in dataset.ground_truths],
-                 "examples": [e.to_dict() for e in dataset.examples],
-                 "overwrite": overwrite,
-                 "judgment_api_key": dataset.judgment_api_key
-             }
-             try:
-                 response = requests.post(
-                     JUDGMENT_DATASETS_PUSH_API_URL,
-                     json=content
-                 )
-                 if response.status_code == 500:
-                     error(f"Server error during push: {content.get('message')}")
-                     return False
-                 response.raise_for_status()
-             except requests.exceptions.HTTPError as err:
-                 if response.status_code == 422:
-                     error(f"Validation error during push: {err.response.json()}")
-                 else:
-                     error(f"HTTP error during push: {err}")
-
-             info(f"Successfully pushed dataset with alias '{alias}'")
-             payload = response.json()
-             dataset._alias = payload.get("_alias")
-             dataset._id = payload.get("_id")
-             progress.update(
-                 task_id,
-                 description=f"{progress.tasks[task_id].description} [rgb(25,227,160)]Done!)",
-             )
-             return True
-
-     def pull(self, alias: str) -> EvalDataset:
-         debug(f"Pulling dataset with alias '{alias}'")
-         """
-         Pulls the dataset from Judgment platform
-
-         Mock request:
-         {
-             "alias": alias,
-             "user_id": user_id
-         }
-         ==>
-         {
-             "ground_truths": [...],
-             "examples": [...],
-             "_alias": alias,
-             "_id": "..." # ID of the dataset
-         }
-         """
-         # Make a POST request to the Judgment API to get the dataset
-         dataset = self.create_dataset()
-
-         with Progress(
-             SpinnerColumn(style="rgb(106,0,255)"),
-             TextColumn("[progress.description]{task.description}"),
-             transient=False,
-         ) as progress:
-             task_id = progress.add_task(
-                 f"Pulling [rgb(106,0,255)]'{alias}'[/rgb(106,0,255)] from Judgment...",
-                 total=100,
-             )
-             request_body = {
-                 "alias": alias,
-                 "judgment_api_key": self.judgment_api_key
-             }
-
-             try:
-                 response = requests.post(
-                     JUDGMENT_DATASETS_PULL_API_URL,
-                     json=request_body
-                 )
-                 response.raise_for_status()
-             except requests.exceptions.RequestException as e:
-                 error(f"Error pulling dataset: {str(e)}")
-                 raise
-
-             info(f"Successfully pulled dataset with alias '{alias}'")
-             payload = response.json()
-             dataset.ground_truths = [GroundTruthExample(**g) for g in payload.get("ground_truths", [])]
-             dataset.examples = [Example(**e) for e in payload.get("examples", [])]
-             dataset._alias = payload.get("_alias")
-             dataset._id = payload.get("_id")
-             progress.update(
-                 task_id,
-                 description=f"{progress.tasks[task_id].description} [rgb(25,227,160)]Done!)",
-             )
-
-         return dataset
-
-     def pull_all_user_dataset_stats(self) -> dict:
-         debug(f"Pulling user datasets stats for user_id: {self.judgment_api_key}'")
-         """
-         Pulls the user datasets stats from Judgment platform
-
-         Mock request:
-         {
-             "user_id": user_id
-         }
-         ==>
-         {
-             "test_dataset_1": {"examples_count": len(dataset1.examples), "ground_truths_count": len(dataset1.ground_truths)},
-             "test_dataset_2": {"examples_count": len(dataset2.examples), "ground_truths_count": len(dataset2.ground_truths)},
-             ...
-         }
-         """
-         # Make a POST request to the Judgment API to get the dataset
-
-         with Progress(
-             SpinnerColumn(style="rgb(106,0,255)"),
-             TextColumn("[progress.description]{task.description}"),
-             transient=False,
-         ) as progress:
-             task_id = progress.add_task(
-                 f"Pulling [rgb(106,0,255)]' datasets'[/rgb(106,0,255)] from Judgment...",
-                 total=100,
-             )
-             request_body = {
-                 "judgment_api_key": self.judgment_api_key
-             }
-
-             try:
-                 response = requests.post(
-                     JUDGMENT_DATASETS_PULL_ALL_API_URL,
-                     json=request_body
-                 )
-                 response.raise_for_status()
-             except requests.exceptions.RequestException as e:
-                 error(f"Error pulling dataset: {str(e)}")
-                 raise
-
-             info(f"Successfully pulled datasets for userid: {self.judgment_api_key}'")
-             payload = response.json()
-
-             progress.update(
-                 task_id,
-                 description=f"{progress.tasks[task_id].description} [rgb(25,227,160)]Done!)",
-             )
-
-         return payload
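Note: the corresponding client-side flow that the removed EvalDatasetClient provided, as a sketch. The alias and API key are placeholders; the endpoint URLs come from judgeval.constants in 0.0.11.

# Sketch only: push/pull against the Judgment platform as implemented in 0.0.11.
from judgeval.data.datasets import EvalDatasetClient

client = EvalDatasetClient(judgment_api_key="YOUR_JUDGMENT_API_KEY")
dataset = client.create_dataset()
# ... populate dataset.examples / dataset.ground_truths here ...

client.push(dataset, alias="qa-regression-set", overwrite=False)   # upload, returns bool
pulled = client.pull("qa-regression-set")                          # rebuilds an EvalDataset
stats = client.pull_all_user_dataset_stats()                       # per-dataset example counts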
judgeval/data/datasets/ground_truth.py
@@ -1,54 +0,0 @@
- from pydantic import BaseModel
- from typing import Optional, Dict, List
-
-
- class GroundTruthExample(BaseModel):
-     """
-     GroundTruthExample is the atomic unit of a `Dataset`. It is essentially the same
-     as an `Example`, but the `actual_output` field is optional to enable users to
-     run their workflow on the `input` field at test-time to evaluate their current
-     workflow's performance.
-     """
-     input: str
-     actual_output: Optional[str] = None
-     expected_output: Optional[str] = None
-     context: Optional[List[str]] = None
-     retrieval_context: Optional[List[str]] = None
-     additional_metadata: Optional[Dict] = None
-     comments: Optional[str] = None
-     tools_called: Optional[List[str]] = None
-     expected_tools: Optional[List[str]] = None
-     source_file: Optional[str] = None
-     trace_id: Optional[str] = None
-
-     def to_dict(self):
-         return {
-             "input": self.input,
-             "actual_output": self.actual_output,
-             "expected_output": self.expected_output,
-             "context": self.context,
-             "retrieval_context": self.retrieval_context,
-             "additional_metadata": self.additional_metadata,
-             "comments": self.comments,
-             "tools_called": self.tools_called,
-             "expected_tools": self.expected_tools,
-             "source_file": self.source_file,
-             "trace_id": self.trace_id,
-         }
-
-     def __str__(self):
-         return (
-             f"{self.__class__.__name__}("
-             f"input={self.input}, "
-             f"actual_output={self.actual_output}, "
-             f"expected_output={self.expected_output}, "
-             f"context={self.context}, "
-             f"retrieval_context={self.retrieval_context}, "
-             f"additional_metadata={self.additional_metadata}, "
-             f"comments={self.comments}, "
-             f"tools_called={self.tools_called}, "
-             f"expected_tools={self.expected_tools}, "
-             f"source_file={self.source_file}, "
-             f"trace_id={self.trace_id}"
-             f")"
-         )
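Note: finally, a small sketch of the removed GroundTruthExample, whose actual_output is deliberately left unset so the workflow under test can fill it in at evaluation time. The values are illustrative.

# Sketch only: GroundTruthExample as defined in 0.0.11 above.
from judgeval.data.datasets import GroundTruthExample

gt = GroundTruthExample(
    input="Summarize the refund policy.",
    expected_output="Refunds are issued within 30 days of purchase.",
    context=["Refund policy: customers may request a refund within 30 days."],
)
print(gt.to_dict()["actual_output"])  # None until the workflow produces an output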