judgeval 0.0.25__py3-none-any.whl → 0.0.27__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/common/tracer.py +528 -166
- judgeval/constants.py +7 -4
- judgeval/data/__init__.py +0 -3
- judgeval/data/datasets/dataset.py +42 -19
- judgeval/data/datasets/eval_dataset_client.py +59 -20
- judgeval/data/result.py +34 -56
- judgeval/integrations/langgraph.py +16 -12
- judgeval/judgment_client.py +85 -23
- judgeval/rules.py +177 -60
- judgeval/run_evaluation.py +143 -122
- judgeval/scorers/score.py +21 -18
- judgeval/utils/alerts.py +32 -1
- {judgeval-0.0.25.dist-info → judgeval-0.0.27.dist-info}/METADATA +1 -1
- {judgeval-0.0.25.dist-info → judgeval-0.0.27.dist-info}/RECORD +16 -17
- judgeval/data/api_example.py +0 -98
- {judgeval-0.0.25.dist-info → judgeval-0.0.27.dist-info}/WHEEL +0 -0
- {judgeval-0.0.25.dist-info → judgeval-0.0.27.dist-info}/licenses/LICENSE.md +0 -0
judgeval/constants.py
CHANGED
@@ -41,18 +41,21 @@ ROOT_API = os.getenv("JUDGMENT_API_URL", "https://api.judgmentlabs.ai")
 JUDGMENT_EVAL_API_URL = f"{ROOT_API}/evaluate/"
 JUDGMENT_DATASETS_PUSH_API_URL = f"{ROOT_API}/datasets/push/"
 JUDGMENT_DATASETS_PULL_API_URL = f"{ROOT_API}/datasets/pull/"
+JUDGMENT_DATASETS_DELETE_API_URL = f"{ROOT_API}/datasets/delete/"
 JUDGMENT_DATASETS_EXPORT_JSONL_API_URL = f"{ROOT_API}/datasets/export_jsonl/"
-
-
+JUDGMENT_DATASETS_PROJECT_STATS_API_URL = f"{ROOT_API}/datasets/fetch_stats_by_project/"
+JUDGMENT_DATASETS_INSERT_API_URL = f"{ROOT_API}/datasets/insert_examples/"
 JUDGMENT_EVAL_LOG_API_URL = f"{ROOT_API}/log_eval_results/"
 JUDGMENT_EVAL_FETCH_API_URL = f"{ROOT_API}/fetch_eval_results/"
-JUDGMENT_EVAL_DELETE_API_URL = f"{ROOT_API}/
+JUDGMENT_EVAL_DELETE_API_URL = f"{ROOT_API}/delete_eval_results_by_project_and_run_names/"
 JUDGMENT_EVAL_DELETE_PROJECT_API_URL = f"{ROOT_API}/delete_eval_results_by_project/"
 JUDGMENT_PROJECT_DELETE_API_URL = f"{ROOT_API}/projects/delete/"
+JUDGMENT_PROJECT_CREATE_API_URL = f"{ROOT_API}/projects/add/"
 JUDGMENT_TRACES_FETCH_API_URL = f"{ROOT_API}/traces/fetch/"
 JUDGMENT_TRACES_SAVE_API_URL = f"{ROOT_API}/traces/save/"
 JUDGMENT_TRACES_DELETE_API_URL = f"{ROOT_API}/traces/delete/"
-JUDGMENT_TRACES_ADD_TO_EVAL_QUEUE_API_URL = f"{ROOT_API}/traces/
+JUDGMENT_TRACES_ADD_TO_EVAL_QUEUE_API_URL = f"{ROOT_API}/traces/add_to_trace_eval_queue/"
+JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL = f"{ROOT_API}/add_to_run_eval_queue/"
 # RabbitMQ
 RABBITMQ_HOST = os.getenv("RABBITMQ_HOST", "rabbitmq-networklb-faa155df16ec9085.elb.us-west-1.amazonaws.com")
 RABBITMQ_PORT = os.getenv("RABBITMQ_PORT", 5672)
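The hunk above adds several dataset and project endpoint constants. A minimal sketch of how they resolve, assuming `JUDGMENT_API_URL` is unset so `ROOT_API` falls back to the default shown above; the paths are copied from the added lines:

```python
import os

# Assumption: JUDGMENT_API_URL is not set, so ROOT_API falls back to the default in the diff.
ROOT_API = os.getenv("JUDGMENT_API_URL", "https://api.judgmentlabs.ai")

# New in 0.0.27 (paths taken from the added lines above)
JUDGMENT_DATASETS_PROJECT_STATS_API_URL = f"{ROOT_API}/datasets/fetch_stats_by_project/"
JUDGMENT_DATASETS_INSERT_API_URL = f"{ROOT_API}/datasets/insert_examples/"
JUDGMENT_PROJECT_CREATE_API_URL = f"{ROOT_API}/projects/add/"
JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL = f"{ROOT_API}/add_to_run_eval_queue/"

print(JUDGMENT_DATASETS_INSERT_API_URL)
# -> https://api.judgmentlabs.ai/datasets/insert_examples/
```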
judgeval/data/__init__.py
CHANGED
@@ -1,13 +1,10 @@
 from judgeval.data.example import Example, ExampleParams
-from judgeval.data.api_example import ProcessExample, create_process_example
 from judgeval.data.scorer_data import ScorerData, create_scorer_data
 from judgeval.data.result import ScoringResult, generate_scoring_result
 
 __all__ = [
     "Example",
     "ExampleParams",
-    "ProcessExample",
-    "create_process_example",
     "ScorerData",
     "create_scorer_data",
     "ScoringResult",
judgeval/data/datasets/dataset.py
CHANGED
@@ -90,9 +90,18 @@ class EvalDataset:
     def add_from_csv(
         self,
         file_path: str,
+        header_mapping: dict,
+        primary_delimiter: str = ",",
+        secondary_delimiter: str = ";"
     ) -> None:
         """
         Add Examples from a CSV file.
+
+        Args:
+            file_path (str): Path to the CSV file
+            header_mapping (dict): Dictionary mapping Example headers to custom headers
+            primary_delimiter (str, optional): Main delimiter used in CSV file. Defaults to ","
+            secondary_delimiter (str, optional): Secondary delimiter for list fields. Defaults to ";"
         """
         try:
             import pandas as pd
@@ -102,9 +111,10 @@ class EvalDataset:
             )
 
         # Pandas naturally reads numbers in data files as ints, not strings (can lead to unexpected behavior)
-        df = pd.read_csv(file_path, dtype={'trace_id': str})
+        df = pd.read_csv(file_path, dtype={'trace_id': str}, sep=primary_delimiter)
         """
-
+        The user should pass in a dict mapping from Judgment Example headers to their custom defined headers.
+        Available headers for Example objects are as follows:
 
         "input", "actual_output", "expected_output", "context", \
         "retrieval_context", "additional_metadata", "tools_called", \
@@ -113,35 +123,48 @@ class EvalDataset:
 
         We want to collect the examples separately which can
         be determined by the "example" column. If the value is True, then it is an
-        example
+        example, and we expect the `input` and `actual_output` fields to be non-null.
 
-        We also assume that if there are multiple retrieval contexts or
-        This can be adjusted using the `
+        We also assume that if there are multiple retrieval contexts, contexts, or tools called, they are separated by semicolons.
+        This can be adjusted using the `secondary_delimiter` parameter.
         """
         examples = []
-
+
+        def process_csv_row(value, header):
+            """
+            Maps a singular value in the CSV file to the appropriate type based on the header.
+            If value exists and can be split into type List[*], we will split upon the user's provided secondary delimiter.
+            """
+            # check that the CSV value is not null for entry
+            null_replacement = dict() if header == 'additional_metadata' else None
+            if pd.isna(value) or value == '':
+                return null_replacement
+            try:
+                value = ast.literal_eval(value) if header == 'additional_metadata' else str(value)
+            except (ValueError, SyntaxError):
+                value = str(value)
+            if header in ["context", "retrieval_context", "tools_called", "expected_tools"]:
+                # attempt to split the value by the secondary delimiter
+                value = value.split(secondary_delimiter)
+
+            return value
+
         for _, row in df.iterrows():
             data = {
-
-
-
-
-                "retrieval_context": row["retrieval_context"].split(";") if pd.notna(row["retrieval_context"]) else [],
-                "additional_metadata": ast.literal_eval(row["additional_metadata"]) if pd.notna(row["additional_metadata"]) else dict(),
-                "tools_called": row["tools_called"].split(";") if pd.notna(row["tools_called"]) else [],
-                "expected_tools": row["expected_tools"].split(";") if pd.notna(row["expected_tools"]) else [],
-                "trace_id": row["trace_id"] if pd.notna(row["trace_id"]) else None,
-                "example_id": str(row["example_id"]) if pd.notna(row["example_id"]) else None
+                header: process_csv_row(
+                    row[header_mapping[header]], header
+                )
+                for header in header_mapping
             }
-            if row["example"]:
-
+            if "example" in header_mapping and row[header_mapping["example"]]:
+                if "name" in header_mapping:
+                    data["name"] = row[header_mapping["name"]] if pd.notna(row[header_mapping["name"]]) else None
                 # every Example has `input` and `actual_output` fields
                 if data["input"] is not None and data["actual_output"] is not None:
                     e = Example(**data)
                     examples.append(e)
                 else:
                     raise ValueError("Every example must have an 'input' and 'actual_output' field.")
-
 
         for e in examples:
             self.add_example(e)
judgeval/data/datasets/eval_dataset_client.py
CHANGED
@@ -7,8 +7,9 @@ from judgeval.common.logger import debug, error, warning, info
 from judgeval.constants import (
     JUDGMENT_DATASETS_PUSH_API_URL,
     JUDGMENT_DATASETS_PULL_API_URL,
-
-
+    JUDGMENT_DATASETS_PROJECT_STATS_API_URL,
+    JUDGMENT_DATASETS_DELETE_API_URL,
+    JUDGMENT_DATASETS_INSERT_API_URL,
     JUDGMENT_DATASETS_EXPORT_JSONL_API_URL
 )
 from judgeval.data import Example
@@ -25,7 +26,7 @@ class EvalDatasetClient:
     def create_dataset(self) -> EvalDataset:
         return EvalDataset(judgment_api_key=self.judgment_api_key)
 
-    def push(self, dataset: EvalDataset, alias: str, overwrite: Optional[bool] = False) -> bool:
+    def push(self, dataset: EvalDataset, alias: str, project_name: str, overwrite: Optional[bool] = False) -> bool:
         debug(f"Pushing dataset with alias '{alias}' (overwrite={overwrite})")
         if overwrite:
             warning(f"Overwrite enabled for alias '{alias}'")
@@ -53,7 +54,8 @@ class EvalDatasetClient:
                 total=100,
             )
             content = {
-                "
+                "dataset_alias": alias,
+                "project_name": project_name,
                 "examples": [e.to_dict() for e in dataset.examples],
                 "overwrite": overwrite,
             }
@@ -88,7 +90,7 @@ class EvalDatasetClient:
             )
             return True
 
-    def pull(self, alias: str) -> EvalDataset:
+    def pull(self, alias: str, project_name: str) -> EvalDataset:
         debug(f"Pulling dataset with alias '{alias}'")
         """
         Pulls the dataset from Judgment platform
@@ -96,7 +98,7 @@ class EvalDatasetClient:
         Mock request:
         {
             "alias": alias,
-            "
+            "project_name": project_name
         }
         ==>
         {
@@ -118,7 +120,8 @@ class EvalDatasetClient:
                 total=100,
             )
             request_body = {
-                "
+                "dataset_alias": alias,
+                "project_name": project_name
             }
 
             try:
@@ -139,24 +142,58 @@ class EvalDatasetClient:
 
             info(f"Successfully pulled dataset with alias '{alias}'")
             payload = response.json()
+
             dataset.examples = [Example(**e) for e in payload.get("examples", [])]
-            dataset._alias = payload.get("
-            dataset._id = payload.get("
+            dataset._alias = payload.get("alias")
+            dataset._id = payload.get("id")
             progress.update(
                 task_id,
                 description=f"{progress.tasks[task_id].description} [rgb(25,227,160)]Done!)",
             )
 
             return dataset
+
+    def delete(self, alias: str, project_name: str) -> bool:
+        with Progress(
+            SpinnerColumn(style="rgb(106,0,255)"),
+            TextColumn("[progress.description]{task.description}"),
+            transient=False,
+        ) as progress:
+            task_id = progress.add_task(
+                f"Deleting [rgb(106,0,255)]'{alias}'[/rgb(106,0,255)] from Judgment...",
+                total=100,
+            )
+            request_body = {
+                "dataset_alias": alias,
+                "project_name": project_name
+            }
 
-
-
+            try:
+                response = requests.post(
+                    JUDGMENT_DATASETS_DELETE_API_URL,
+                    json=request_body,
+                    headers={
+                        "Content-Type": "application/json",
+                        "Authorization": f"Bearer {self.judgment_api_key}",
+                        "X-Organization-Id": self.organization_id
+                    },
+                    verify=True
+                )
+                response.raise_for_status()
+            except requests.exceptions.RequestException as e:
+                error(f"Error deleting dataset: {str(e)}")
+                raise
+
+            return True
+
+    def pull_project_dataset_stats(self, project_name: str) -> dict:
+        debug(f"Pulling project datasets stats for project_name: {project_name}'")
         """
-        Pulls the
+        Pulls the project datasets stats from Judgment platform
 
         Mock request:
         {
-            "
+            "project_name": project_name
         }
         ==>
         {
@@ -177,11 +214,12 @@ class EvalDatasetClient:
                 total=100,
             )
             request_body = {
+                "project_name": project_name
             }
 
             try:
                 response = requests.post(
-
+                    JUDGMENT_DATASETS_PROJECT_STATS_API_URL,
                     json=request_body,
                     headers={
                         "Content-Type": "application/json",
@@ -205,7 +243,7 @@ class EvalDatasetClient:
 
             return payload
 
-    def
+    def insert_dataset(self, alias: str, examples: List[Example], project_name: str) -> bool:
         """
         Edits the dataset on Judgment platform by adding new examples
 
@@ -213,7 +251,7 @@ class EvalDatasetClient:
         {
             "alias": alias,
             "examples": [...],
-            "
+            "project_name": project_name
         }
         """
         with Progress(
@@ -227,13 +265,14 @@ class EvalDatasetClient:
             )
 
             content = {
-                "
+                "dataset_alias": alias,
                 "examples": [e.to_dict() for e in examples],
+                "project_name": project_name
             }
 
             try:
                 response = requests.post(
-
+                    JUDGMENT_DATASETS_INSERT_API_URL,
                     json=content,
                     headers={
                         "Content-Type": "application/json",
@@ -250,7 +289,7 @@ class EvalDatasetClient:
             info(f"Successfully edited dataset '{alias}'")
             return True
 
-    def export_jsonl(self, alias: str) -> requests.Response:
+    def export_jsonl(self, alias: str, project_name: str) -> requests.Response:
         """Export dataset in JSONL format from Judgment platform"""
         debug(f"Exporting dataset with alias '{alias}' as JSONL")
         with Progress(
@@ -265,7 +304,7 @@ class EvalDatasetClient:
             try:
                 response = requests.post(
                     JUDGMENT_DATASETS_EXPORT_JSONL_API_URL,
-                    json={"alias":
+                    json={"dataset_alias": alias, "project_name": project_name},
                     headers={
                         "Content-Type": "application/json",
                         "Authorization": f"Bearer {self.judgment_api_key}",
judgeval/data/result.py
CHANGED
@@ -1,10 +1,11 @@
 from dataclasses import dataclass
 from typing import List, Union, Optional, Dict, Any, Union
+from judgeval.common.logger import debug, error
+from pydantic import BaseModel
+from judgeval.data import ScorerData, Example
 
-from judgeval.data import ScorerData, ProcessExample
 
-
-class ScoringResult:
+class ScoringResult(BaseModel):
     """
     A ScoringResult contains the output of one or more scorers applied to a single example.
     Ie: One input, one actual_output, one expected_output, etc..., and 1+ scorer (Faithfulness, Hallucination, Summarization, etc...)
@@ -13,69 +14,44 @@ class ScoringResult:
         success (bool): Whether the evaluation was successful.
             This means that all scorers applied to this example returned a success.
         scorer_data (List[ScorerData]): The scorers data for the evaluated example
-
-        actual_output (Optional[str]): The actual output of the example
-        expected_output (Optional[str]): The expected output of the example
-        context (Optional[List[str]]): The context of the example
-        retrieval_context (Optional[List[str]]): The retrieval context of the example
-        additional_metadata (Optional[Dict[str, Any]]): The additional metadata of the example
-        tools_called (Optional[List[str]]): The tools called by the example
-        expected_tools (Optional[List[str]]): The expected tools of the example
-        trace_id (Optional[str]): The trace id of the example
+        data_object (Optional[Example]): The original example object that was used to create the ScoringResult, can be Example, CustomExample (future), WorkflowRun (future)
 
     """
     # Fields for scoring outputs
     success: bool # used for unit testing
     scorers_data: Union[List[ScorerData], None]
+    name: Optional[str] = None
 
-    #
-
-    actual_output: Optional[Union[str, List[str]]] = None
-    expected_output: Optional[Union[str, List[str]]] = None
-    context: Optional[List[str]] = None
-    retrieval_context: Optional[List[str]] = None
-    additional_metadata: Optional[Dict[str, Any]] = None
-    tools_called: Optional[List[str]] = None
-    expected_tools: Optional[List[str]] = None
+    # The original example object that was used to create the ScoringResult
+    data_object: Optional[Example] = None #can be Example, CustomExample (future), WorkflowRun (future)
     trace_id: Optional[str] = None
 
-
-
+    # Additional fields for internal use
+    run_duration: Optional[float] = None
+    evaluation_cost: Optional[float] = None
 
     def to_dict(self) -> dict:
         """Convert the ScoringResult instance to a dictionary, properly serializing scorer_data."""
         return {
             "success": self.success,
             "scorers_data": [scorer_data.to_dict() for scorer_data in self.scorers_data] if self.scorers_data else None,
-            "
-            "actual_output": self.actual_output,
-            "expected_output": self.expected_output,
-            "context": self.context,
-            "retrieval_context": self.retrieval_context,
-            "additional_metadata": self.additional_metadata,
-            "tools_called": self.tools_called,
-            "expected_tools": self.expected_tools,
-            "trace_id": self.trace_id,
-            "example_id": self.example_id
+            "data_object": self.data_object.to_dict() if self.data_object else None,
         }
-
+
 
     def __str__(self) -> str:
         return f"ScoringResult(\
             success={self.success}, \
             scorer_data={self.scorers_data}, \
-
-
-
-            context={self.context}, \
-            retrieval_context={self.retrieval_context}, \
-            additional_metadata={self.additional_metadata}, \
-            tools_called={self.tools_called}, \
-            expected_tools={self.expected_tools}, \
-            trace_id={self.trace_id})"
+            data_object={self.data_object}, \
+            run_duration={self.run_duration}, \
+            evaluation_cost={self.evaluation_cost})"
 
 
 def generate_scoring_result(
-
+    example: Example,
+    success: bool,
+    scorers_data: List[ScorerData],
+    run_duration: float,
 ) -> ScoringResult:
     """
     Creates a final ScoringResult object for an evaluation run based on the results from a completed LLMApiTestCase.
@@ -83,16 +59,18 @@ def generate_scoring_result(
     When an LLMTestCase is executed, it turns into an LLMApiTestCase and the progress of the evaluation run is tracked.
     At the end of the evaluation run, we create a TestResult object out of the completed LLMApiTestCase.
     """
-
-
-
-
-
-
-
-
-
-
-
-
+    if example.name is not None:
+        name = example.name
+    else:
+        name = "Test Case Placeholder"
+        debug(f"No name provided for example, using default name: {name}")
+    debug(f"Creating ScoringResult for: {name}")
+    scoring_result = ScoringResult(
+        name=name,
+        data_object=example,
+        success=success,
+        scorers_data=scorers_data,
+        run_duration=run_duration,
+        evaluation_cost=None,
+    )
     )
+    return scoring_result
judgeval/integrations/langgraph.py
CHANGED
@@ -146,16 +146,17 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
 
             self.start_span("LangGraph", span_type="Main Function")
 
-
-        if node
-            self.
-
-
-
-
-
-
+        metadata = kwargs.get("metadata", {})
+        if node := metadata.get("langgraph_node"):
+            if node != self.previous_node:
+                # Track node execution
+                self.trace_client.visited_nodes.append(node)
+                self.trace_client.executed_node_tools.append(node)
+            self.trace_client.record_input({
+                'args': inputs,
+                'kwargs': kwargs
+            })
+            self.previous_node = node
 
     def on_chain_end(
         self,
@@ -198,8 +199,11 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
     ):
         name = serialized["name"]
         self.start_span(name, span_type="tool")
-
-
+        if name:
+            # Track tool execution
+            self.trace_client.executed_tools.append(name)
+            node_tool = f"{self.previous_node}:{name}" if self.previous_node else name
+            self.trace_client.executed_node_tools.append(node_tool)
         self.trace_client.record_input({
             'args': input_str,
             'kwargs': kwargs