judgeval 0.0.9__py3-none-any.whl → 0.0.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/common/tracer.py +229 -44
- judgeval/constants.py +15 -3
- judgeval/data/datasets/__init__.py +2 -1
- judgeval/data/datasets/dataset.py +1 -122
- judgeval/data/datasets/eval_dataset_client.py +193 -0
- judgeval/data/result.py +16 -1
- judgeval/evaluation_run.py +2 -1
- judgeval/judges/utils.py +14 -2
- judgeval/judgment_client.py +64 -7
- judgeval/run_evaluation.py +19 -0
- judgeval/scorers/judgeval_scorer.py +8 -8
- judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/faithfulness_scorer.py +3 -1
- judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/prompts.py +6 -3
- judgeval/scorers/prompt_scorer.py +2 -2
- judgeval/scorers/score.py +11 -11
- judgeval/scorers/utils.py +3 -3
- judgeval/tracer/__init__.py +3 -0
- {judgeval-0.0.9.dist-info → judgeval-0.0.11.dist-info}/METADATA +5 -4
- {judgeval-0.0.9.dist-info → judgeval-0.0.11.dist-info}/RECORD +21 -19
- {judgeval-0.0.9.dist-info → judgeval-0.0.11.dist-info}/WHEEL +0 -0
- {judgeval-0.0.9.dist-info → judgeval-0.0.11.dist-info}/licenses/LICENSE.md +0 -0
judgeval/common/tracer.py
CHANGED
@@ -2,13 +2,26 @@
 Tracing system for judgeval that allows for function tracing using decorators.
 """

+import os
 import time
 import functools
 import requests
 import uuid
 from contextlib import contextmanager
-from typing import
-
+from typing import (
+    Optional,
+    Any,
+    List,
+    Literal,
+    Tuple,
+    Generator,
+    TypeAlias,
+    Union
+)
+from dataclasses import (
+    dataclass,
+    field
+)
 from datetime import datetime
 from openai import OpenAI
 from together import Together
@@ -21,16 +34,25 @@ import warnings
 from pydantic import BaseModel
 from http import HTTPStatus

-
+import pika
+import os
+
+from judgeval.constants import JUDGMENT_TRACES_SAVE_API_URL, JUDGMENT_TRACES_FETCH_API_URL, RABBITMQ_HOST, RABBITMQ_PORT, RABBITMQ_QUEUE, JUDGMENT_TRACES_DELETE_API_URL
 from judgeval.judgment_client import JudgmentClient
 from judgeval.data import Example
-from judgeval.scorers import APIJudgmentScorer, JudgevalScorer
+from judgeval.scorers import APIJudgmentScorer, JudgevalScorer, ScorerWrapper
+
+from rich import print as rprint
+
 from judgeval.data.result import ScoringResult
+from judgeval.evaluation_run import EvaluationRun

 # Define type aliases for better code readability and maintainability
 ApiClient: TypeAlias = Union[OpenAI, Together, Anthropic] # Supported API clients
 TraceEntryType = Literal['enter', 'exit', 'output', 'input', 'evaluation'] # Valid trace entry types
 SpanType = Literal['span', 'tool', 'llm', 'evaluation']
+
+
 @dataclass
 class TraceEntry:
     """Represents a single trace entry with its visual representation.
@@ -52,7 +74,7 @@ class TraceEntry:
     # Use field() for mutable defaults to avoid shared state issues
     inputs: dict = field(default_factory=dict)
     span_type: SpanType = "span"
-
+    evaluation_runs: List[Optional[EvaluationRun]] = field(default=None)

     def print_entry(self):
         indent = " " * self.depth
@@ -65,7 +87,8 @@ class TraceEntry:
         elif self.type == "input":
             print(f"{indent}Input: {self.inputs}")
         elif self.type == "evaluation":
-
+            for evaluation_run in self.evaluation_runs:
+                print(f"{indent}Evaluation: {evaluation_run.model_dump()}")

     def _serialize_inputs(self) -> dict:
         """Helper method to serialize input data safely.
@@ -112,7 +135,7 @@ class TraceEntry:
             "duration": self.duration,
             "output": self._serialize_output(),
             "inputs": self._serialize_inputs(),
-            "
+            "evaluation_runs": [evaluation_run.model_dump() for evaluation_run in self.evaluation_runs] if self.evaluation_runs else [],
             "span_type": self.span_type
         }

@@ -121,8 +144,29 @@ class TraceEntry:

         Handles special cases:
         - Pydantic models are converted using model_dump()
+        - We try to serialize into JSON, then string, then the base representation (__repr__)
         - Non-serializable objects return None with a warning
         """
+
+        def safe_stringify(output, function_name):
+            """
+            Safely converts an object to a string or repr, handling serialization issues gracefully.
+            """
+            try:
+                return str(output)
+            except (TypeError, OverflowError, ValueError):
+                pass
+
+            try:
+                return repr(output)
+            except (TypeError, OverflowError, ValueError):
+                pass
+
+            warnings.warn(
+                f"Output for function {function_name} is not JSON serializable and could not be converted to string. Setting to None."
+            )
+            return None
+
         if isinstance(self.output, BaseModel):
             return self.output.model_dump()

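The serialization fallback added here degrades gracefully: JSON first, then str(), then repr(), then None with a warning. A minimal standalone sketch of the same cascade outside the TraceEntry class (the helper name and demo value are illustrative only, not part of the package):

    import json
    import warnings

    def fallback_serialize(output, function_name):
        # Same order as _serialize_output: JSON -> str -> repr -> None
        try:
            json.dumps(output)
            return output
        except (TypeError, OverflowError, ValueError):
            pass
        for converter in (str, repr):
            try:
                return converter(output)
            except (TypeError, OverflowError, ValueError):
                continue
        warnings.warn(f"Output for function {function_name} is not serializable. Setting to None.")
        return None

    print(fallback_serialize({"a", "b"}, "demo"))  # sets are not JSON-serializable, so str() is used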
@@ -131,8 +175,107 @@ class TraceEntry:
             json.dumps(self.output)
             return self.output
         except (TypeError, OverflowError, ValueError):
-
-
+            return safe_stringify(self.output, self.function)
+
+
+class TraceManagerClient:
+    """
+    Client for handling trace endpoints with the Judgment API
+
+
+    Operations include:
+    - Fetching a trace by id
+    - Saving a trace
+    - Deleting a trace
+    """
+    def __init__(self, judgment_api_key: str):
+        self.judgment_api_key = judgment_api_key
+
+    def fetch_trace(self, trace_id: str):
+        """
+        Fetch a trace by its id
+        """
+        response = requests.post(
+            JUDGMENT_TRACES_FETCH_API_URL,
+            json={
+                "trace_id": trace_id,
+                "judgment_api_key": self.judgment_api_key,
+            },
+            headers={
+                "Content-Type": "application/json",
+            }
+        )
+
+        if response.status_code != HTTPStatus.OK:
+            raise ValueError(f"Failed to fetch traces: {response.text}")
+
+        return response.json()
+
+    def save_trace(self, trace_data: dict, empty_save: bool):
+        """
+        Saves a trace to the database
+
+        Args:
+            trace_data: The trace data to save
+            empty_save: Whether to save an empty trace
+            NOTE we save empty traces in order to properly handle async operations; we need something in the DB to associate the async results with
+        """
+        response = requests.post(
+            JUDGMENT_TRACES_SAVE_API_URL,
+            json=trace_data,
+            headers={
+                "Content-Type": "application/json",
+            }
+        )
+
+        if response.status_code == HTTPStatus.BAD_REQUEST:
+            raise ValueError(f"Failed to save trace data: Check your Trace name for conflicts, set overwrite=True to overwrite existing traces: {response.text}")
+        elif response.status_code != HTTPStatus.OK:
+            raise ValueError(f"Failed to save trace data: {response.text}")
+
+        if not empty_save and "ui_results_url" in response.json():
+            rprint(f"\n🔍 You can view your trace data here: [rgb(106,0,255)]{response.json()['ui_results_url']}[/]\n")
+
+    def delete_trace(self, trace_id: str):
+        """
+        Delete a trace from the database.
+        """
+        response = requests.delete(
+            JUDGMENT_TRACES_DELETE_API_URL,
+            json={
+                "judgment_api_key": self.judgment_api_key,
+                "trace_ids": [trace_id],
+            },
+            headers={
+                "Content-Type": "application/json",
+            }
+        )
+
+        if response.status_code != HTTPStatus.OK:
+            raise ValueError(f"Failed to delete trace: {response.text}")
+
+        return response.json()
+
+    def delete_traces(self, trace_ids: List[str]):
+        """
+        Delete a batch of traces from the database.
+        """
+        response = requests.delete(
+            JUDGMENT_TRACES_DELETE_API_URL,
+            json={
+                "judgment_api_key": self.judgment_api_key,
+                "trace_ids": trace_ids,
+            },
+            headers={
+                "Content-Type": "application/json",
+            }
+        )
+
+        if response.status_code != HTTPStatus.OK:
+            raise ValueError(f"Failed to delete trace: {response.text}")
+
+        return response.json()
+

 class TraceClient:
     """Client for managing a single trace context"""
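The new TraceManagerClient gathers the fetch/save/delete trace endpoints behind a single object keyed by the API key. A minimal usage sketch, assuming a valid Judgment API key and an existing trace id (both placeholders here):

    from judgeval.common.tracer import TraceManagerClient

    client = TraceManagerClient(judgment_api_key="<your-api-key>")

    trace = client.fetch_trace(trace_id="<existing-trace-id>")       # raises ValueError on a non-200 response
    client.save_trace(trace_data=trace, empty_save=False)            # prints the UI link when the API returns one
    client.delete_traces(trace_ids=["<old-trace-1>", "<old-trace-2>"])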
@@ -147,6 +290,7 @@ class TraceClient:
         self.span_type = None
         self._current_span: Optional[TraceEntry] = None
         self.overwrite = overwrite
+        self.trace_manager_client = TraceManagerClient(tracer.api_key) # Manages DB operations for trace data

     @contextmanager
     def span(self, name: str, span_type: SpanType = "span"):
@@ -163,6 +307,7 @@ class TraceClient:
             span_type=span_type
         ))

+        # Increment nested depth and set current span
         self.tracer.depth += 1
         prev_span = self._current_span
         self._current_span = name
@@ -185,7 +330,7 @@ class TraceClient:
         ))
         self._current_span = prev_span

-
+    def async_evaluate(
         self,
         scorers: List[Union[APIJudgmentScorer, JudgevalScorer]],
         input: Optional[str] = None,
@@ -211,25 +356,40 @@ class TraceClient:
             additional_metadata=additional_metadata,
             trace_id=self.trace_id
         )
-
-
-            scorers
-
-
+
+        try:
+            # Load appropriate implementations for all scorers
+            loaded_scorers: List[Union[JudgevalScorer, APIJudgmentScorer]] = [
+                scorer.load_implementation(use_judgment=True) if isinstance(scorer, ScorerWrapper) else scorer
+                for scorer in scorers
+            ]
+        except Exception as e:
+            raise ValueError(f"Failed to load scorers: {str(e)}")
+
+        eval_run = EvaluationRun(
             log_results=log_results,
             project_name=self.project_name,
-
-            f"{self.name.capitalize()}-"
+            eval_name=f"{self.name.capitalize()}-"
                       f"{self._current_span}-"
-                      f"[{','.join(scorer.load_implementation().score_type.capitalize() for scorer in scorers)}]"
-
+                      f"[{','.join(scorer.load_implementation().score_type.capitalize() for scorer in scorers)}]",
+            examples=[example],
+            scorers=loaded_scorers,
+            model=model,
+            metadata={},
+            judgment_api_key=self.tracer.api_key,
             override=self.overwrite
         )

-        self.
+        self.add_eval_run(eval_run, start_time) # Pass start_time to record_evaluation

-    def
-        """
+    def add_eval_run(self, eval_run: EvaluationRun, start_time: float):
+        """
+        Add evaluation run data to the trace
+
+        Args:
+            eval_run (EvaluationRun): The evaluation run to add to the trace
+            start_time (float): The start time of the evaluation run
+        """
         if self._current_span:
             duration = time.time() - start_time # Calculate duration from start_time

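With scorer loading and EvaluationRun construction now handled inside async_evaluate, a call site only needs to supply scorers plus the example fields. A hedged sketch of how that might look from inside a traced span; only scorers and input appear in this hunk, so the remaining keyword arguments, the scorer import path, and the model name are assumptions:

    # trace_client is assumed to be an active TraceClient for the current trace
    from judgeval.scorers import FaithfulnessScorer  # assumed import path

    with trace_client.span("answer_question", span_type="llm"):
        answer = "Paris is the capital of France."   # stand-in for a real LLM call
        trace_client.async_evaluate(
            scorers=[FaithfulnessScorer()],
            input="What is the capital of France?",
            actual_output=answer,                    # assumed parameter name
            model="gpt-4o-mini",                     # placeholder model name
            log_results=True,
        )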
@@ -239,7 +399,7 @@ class TraceClient:
                 depth=self.tracer.depth,
                 message=f"Evaluation results for {self._current_span}",
                 timestamp=time.time(),
-
+                evaluation_runs=[eval_run],
                 duration=duration,
                 span_type="evaluation"
             ))
@@ -320,7 +480,7 @@ class TraceClient:
                     "timestamp": entry["timestamp"],
                     "inputs": None,
                     "output": None,
-                    "
+                    "evaluation_runs": [],
                     "span_type": entry.get("span_type", "span")
                 }
                 active_functions.append(function)
@@ -343,8 +503,8 @@ class TraceClient:
             if entry["type"] == "output" and entry["output"]:
                 current_entry["output"] = entry["output"]

-            if entry["type"] == "evaluation" and entry["
-                current_entry["
+            if entry["type"] == "evaluation" and entry["evaluation_runs"]:
+                current_entry["evaluation_runs"] = entry["evaluation_runs"]

         # Sort by timestamp
         condensed.sort(key=lambda x: x["timestamp"])
@@ -361,6 +521,24 @@ class TraceClient:
         raw_entries = [entry.to_dict() for entry in self.entries]
         condensed_entries = self.condense_trace(raw_entries)

+        # Calculate total token counts from LLM API calls
+        total_prompt_tokens = 0
+        total_completion_tokens = 0
+        total_tokens = 0
+
+        for entry in condensed_entries:
+            if entry.get("span_type") == "llm" and isinstance(entry.get("output"), dict):
+                usage = entry["output"].get("usage", {})
+                # Handle OpenAI/Together format
+                if "prompt_tokens" in usage:
+                    total_prompt_tokens += usage.get("prompt_tokens", 0)
+                    total_completion_tokens += usage.get("completion_tokens", 0)
+                # Handle Anthropic format
+                elif "input_tokens" in usage:
+                    total_prompt_tokens += usage.get("input_tokens", 0)
+                    total_completion_tokens += usage.get("output_tokens", 0)
+                total_tokens += usage.get("total_tokens", 0)
+
         # Create trace document
         trace_data = {
             "trace_id": self.trace_id,
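The totals above are computed per provider shape: OpenAI and Together report prompt_tokens/completion_tokens, Anthropic reports input_tokens/output_tokens, and total_tokens is only added when the usage block carries it. Two example usage blocks with made-up numbers:

    # OpenAI / Together style usage on an "llm" span output
    openai_usage = {"prompt_tokens": 120, "completion_tokens": 45, "total_tokens": 165}

    # Anthropic style usage; no total_tokens key, so the running total gets 0 from this entry
    anthropic_usage = {"input_tokens": 120, "output_tokens": 45}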
@@ -370,31 +548,38 @@ class TraceClient:
             "created_at": datetime.fromtimestamp(self.start_time).isoformat(),
             "duration": total_duration,
             "token_counts": {
-                "prompt_tokens":
-                "completion_tokens":
-                "total_tokens":
-            },
+                "prompt_tokens": total_prompt_tokens,
+                "completion_tokens": total_completion_tokens,
+                "total_tokens": total_tokens,
+            },
             "entries": condensed_entries,
             "empty_save": empty_save,
             "overwrite": overwrite
         }
-
-        # Save trace data by making POST request to API
-        response = requests.post(
-            JUDGMENT_TRACES_SAVE_API_URL,
-            json=trace_data,
-            headers={
-                "Content-Type": "application/json",
-            }
-        )

-        if
-
-
-
+        if not empty_save:
+            connection = pika.BlockingConnection(
+                pika.ConnectionParameters(host=RABBITMQ_HOST, port=RABBITMQ_PORT))
+            channel = connection.channel()
+
+            channel.queue_declare(queue=RABBITMQ_QUEUE, durable=True)
+
+            channel.basic_publish(
+                exchange='',
+                routing_key=RABBITMQ_QUEUE,
+                body=json.dumps(trace_data),
+                properties=pika.BasicProperties(
+                    delivery_mode=pika.DeliveryMode.Transient # Changed from Persistent to Transient
+                ))
+            connection.close()

+        self.trace_manager_client.save_trace(trace_data, empty_save)
+
         return self.trace_id, trace_data

+    def delete(self):
+        return self.trace_manager_client.delete_trace(self.trace_id)
+
 class Tracer:
     _instance = None

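Besides the POST to the save endpoint, non-empty traces are now also published to a RabbitMQ queue. A sketch of what a matching consumer could look like with pika, assuming the same RABBITMQ_* settings from judgeval.constants; the handler body is a placeholder:

    import json
    import pika

    from judgeval.constants import RABBITMQ_HOST, RABBITMQ_PORT, RABBITMQ_QUEUE

    def handle_trace(ch, method, properties, body):
        # Placeholder handler: inspect the trace payload and acknowledge it
        trace_data = json.loads(body)
        print(f"received trace {trace_data['trace_id']} with {len(trace_data['entries'])} entries")
        ch.basic_ack(delivery_tag=method.delivery_tag)

    connection = pika.BlockingConnection(
        pika.ConnectionParameters(host=RABBITMQ_HOST, port=RABBITMQ_PORT))
    channel = connection.channel()
    channel.queue_declare(queue=RABBITMQ_QUEUE, durable=True)  # must match the producer's declaration
    channel.basic_consume(queue=RABBITMQ_QUEUE, on_message_callback=handle_trace)
    channel.start_consuming()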
@@ -403,7 +588,7 @@ class Tracer:
             cls._instance = super(Tracer, cls).__new__(cls)
         return cls._instance

-    def __init__(self, api_key: str):
+    def __init__(self, api_key: str = os.getenv("JUDGMENT_API_KEY")):
         if not hasattr(self, 'initialized'):

             if not api_key:
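Because api_key now defaults to os.getenv("JUDGMENT_API_KEY"), the Tracer can be constructed with no arguments when that variable is set. A minimal sketch; the key value is a placeholder, and since the default is evaluated when tracer.py is imported, the variable must be set beforehand:

    import os
    os.environ.setdefault("JUDGMENT_API_KEY", "<your-api-key>")  # or export it in the shell

    from judgeval.common.tracer import Tracer

    tracer = Tracer()  # falls back to JUDGMENT_API_KEY; Tracer is a singleton via __new__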
judgeval/constants.py
CHANGED
@@ -32,15 +32,25 @@ class APIScorer(str, Enum):
         return member

 ROOT_API = os.getenv("JUDGMENT_API_URL", "https://api.judgmentlabs.ai")
-
+# API URLs
 JUDGMENT_EVAL_API_URL = f"{ROOT_API}/evaluate/"
 JUDGMENT_DATASETS_PUSH_API_URL = f"{ROOT_API}/datasets/push/"
 JUDGMENT_DATASETS_PULL_API_URL = f"{ROOT_API}/datasets/pull/"
+JUDGMENT_DATASETS_PULL_ALL_API_URL = f"{ROOT_API}/datasets/get_all_stats/"
 JUDGMENT_EVAL_LOG_API_URL = f"{ROOT_API}/log_eval_results/"
 JUDGMENT_EVAL_FETCH_API_URL = f"{ROOT_API}/fetch_eval_results/"
+JUDGMENT_EVAL_DELETE_API_URL = f"{ROOT_API}/delete_eval_results_by_project_and_run_name/"
+JUDGMENT_EVAL_DELETE_PROJECT_API_URL = f"{ROOT_API}/delete_eval_results_by_project/"
+JUDGMENT_TRACES_FETCH_API_URL = f"{ROOT_API}/traces/fetch/"
 JUDGMENT_TRACES_SAVE_API_URL = f"{ROOT_API}/traces/save/"
+JUDGMENT_TRACES_DELETE_API_URL = f"{ROOT_API}/traces/delete/"

-
+# RabbitMQ
+RABBITMQ_HOST = os.getenv("RABBITMQ_HOST", "rabbitmq-networklb-faa155df16ec9085.elb.us-west-1.amazonaws.com")
+RABBITMQ_PORT = os.getenv("RABBITMQ_PORT", 5672)
+RABBITMQ_QUEUE = os.getenv("RABBITMQ_QUEUE", "task_queue")
+
+# Models
 TOGETHER_SUPPORTED_MODELS = {
     "QWEN": "Qwen/Qwen2-72B-Instruct",
     "LLAMA3_70B_INSTRUCT_TURBO": "meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo",
@@ -50,7 +60,9 @@ TOGETHER_SUPPORTED_MODELS = {
     "MISTRAL_8x7B_INSTRUCT": "mistralai/Mixtral-8x7B-Instruct-v0.1",
 }

-
+JUDGMENT_SUPPORTED_MODELS = {"osiris-large", "osiris-mini"}
+
+ACCEPTABLE_MODELS = set(litellm.model_list) | set(TOGETHER_SUPPORTED_MODELS.keys()) | JUDGMENT_SUPPORTED_MODELS

 ## System settings
 MAX_WORKER_THREADS = 10
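ACCEPTABLE_MODELS now unions litellm's model list, the Together aliases, and the new Judgment-hosted models. A small sketch of the kind of membership check this enables; the helper function is illustrative, not part of the package:

    from judgeval.constants import ACCEPTABLE_MODELS

    def validate_model(name: str) -> str:
        # Reject anything not known to litellm, Together, or Judgment ("osiris-*")
        if name not in ACCEPTABLE_MODELS:
            raise ValueError(f"Unsupported model: {name}")
        return name

    validate_model("osiris-mini")  # one of JUDGMENT_SUPPORTED_MODELS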
judgeval/data/datasets/__init__.py
CHANGED
@@ -1,4 +1,5 @@
 from judgeval.data.datasets.dataset import EvalDataset
 from judgeval.data.datasets.ground_truth import GroundTruthExample
+from judgeval.data.datasets.eval_dataset_client import EvalDatasetClient

-__all__ = ["EvalDataset", "GroundTruthExample"]
+__all__ = ["EvalDataset", "EvalDatasetClient", "GroundTruthExample"]
judgeval/data/datasets/dataset.py
CHANGED
@@ -2,16 +2,11 @@ import ast
 import csv
 import datetime
 import json
-from rich.console import Console
-from rich.progress import Progress, SpinnerColumn, TextColumn
-import requests
 from dataclasses import dataclass, field
 import os
 from typing import List, Optional, Union, Literal

-from judgeval.constants import JUDGMENT_DATASETS_PUSH_API_URL, JUDGMENT_DATASETS_PULL_API_URL
 from judgeval.data.datasets.ground_truth import GroundTruthExample
-from judgeval.data.datasets.utils import ground_truths_to_examples, examples_to_ground_truths
 from judgeval.data import Example
 from judgeval.common.logger import debug, error, warning, info

@@ -37,120 +32,6 @@ class EvalDataset:
         self._id = None
         self.judgment_api_key = judgment_api_key

-    def push(self, alias: str, overwrite: Optional[bool] = False) -> bool:
-        debug(f"Pushing dataset with alias '{alias}' (overwrite={overwrite})")
-        if overwrite:
-            warning(f"Overwrite enabled for alias '{alias}'")
-        """
-        Pushes the dataset to Judgment platform
-
-        Mock request:
-        {
-            "alias": alias,
-            "ground_truths": [...],
-            "examples": [...],
-            "overwrite": overwrite
-        } ==>
-        {
-            "_alias": alias,
-            "_id": "..." # ID of the dataset
-        }
-        """
-        with Progress(
-            SpinnerColumn(style="rgb(106,0,255)"),
-            TextColumn("[progress.description]{task.description}"),
-            transient=False,
-        ) as progress:
-            task_id = progress.add_task(
-                f"Pushing [rgb(106,0,255)]'{alias}' to Judgment...",
-                total=100,
-            )
-            content = {
-                "alias": alias,
-                "ground_truths": [g.to_dict() for g in self.ground_truths],
-                "examples": [e.to_dict() for e in self.examples],
-                "overwrite": overwrite,
-                "judgment_api_key": self.judgment_api_key
-            }
-            try:
-                response = requests.post(
-                    JUDGMENT_DATASETS_PUSH_API_URL,
-                    json=content
-                )
-                if response.status_code == 500:
-                    error(f"Server error during push: {content.get('message')}")
-                    return False
-                response.raise_for_status()
-            except requests.exceptions.HTTPError as err:
-                if response.status_code == 422:
-                    error(f"Validation error during push: {err.response.json()}")
-                else:
-                    error(f"HTTP error during push: {err}")
-
-            info(f"Successfully pushed dataset with alias '{alias}'")
-            payload = response.json()
-            self._alias = payload.get("_alias")
-            self._id = payload.get("_id")
-            progress.update(
-                task_id,
-                description=f"{progress.tasks[task_id].description} [rgb(25,227,160)]Done!)",
-            )
-            return True
-
-    def pull(self, alias: str):
-        debug(f"Pulling dataset with alias '{alias}'")
-        """
-        Pulls the dataset from Judgment platform
-
-        Mock request:
-        {
-            "alias": alias,
-            "user_id": user_id
-        }
-        ==>
-        {
-            "ground_truths": [...],
-            "examples": [...],
-            "_alias": alias,
-            "_id": "..." # ID of the dataset
-        }
-        """
-        # Make a POST request to the Judgment API to get the dataset
-
-        with Progress(
-            SpinnerColumn(style="rgb(106,0,255)"),
-            TextColumn("[progress.description]{task.description}"),
-            transient=False,
-        ) as progress:
-            task_id = progress.add_task(
-                f"Pulling [rgb(106,0,255)]'{alias}'[/rgb(106,0,255)] from Judgment...",
-                total=100,
-            )
-            request_body = {
-                "alias": alias,
-                "judgment_api_key": self.judgment_api_key
-            }
-
-            try:
-                response = requests.post(
-                    JUDGMENT_DATASETS_PULL_API_URL,
-                    json=request_body
-                )
-                response.raise_for_status()
-            except requests.exceptions.RequestException as e:
-                error(f"Error pulling dataset: {str(e)}")
-                raise
-
-            info(f"Successfully pulled dataset with alias '{alias}'")
-            payload = response.json()
-            self.ground_truths = [GroundTruthExample(**g) for g in payload.get("ground_truths", [])]
-            self.examples = [Example(**e) for e in payload.get("examples", [])]
-            self._alias = payload.get("_alias")
-            self._id = payload.get("_id")
-            progress.update(
-                task_id,
-                description=f"{progress.tasks[task_id].description} [rgb(25,227,160)]Done!)",
-            )

     def add_from_json(self, file_path: str) -> None:
         debug(f"Loading dataset from JSON file: {file_path}")
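The push/pull logic removed from EvalDataset corresponds to the new eval_dataset_client.py (+193 lines) listed at the top, which exports EvalDatasetClient from judgeval.data.datasets. That file's contents are not shown in this section, so the sketch below is purely hypothetical; the constructor and method signatures are assumptions:

    from judgeval.data.datasets import EvalDataset, EvalDatasetClient

    client = EvalDatasetClient(judgment_api_key="<your-api-key>")  # assumed constructor
    dataset = EvalDataset(judgment_api_key="<your-api-key>")

    client.push(dataset, alias="my-dataset", overwrite=False)      # assumed signature
    pulled = client.pull(alias="my-dataset")                       # assumed signature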
@@ -402,6 +283,4 @@ class EvalDataset:
             f"_alias={self._alias}, "
             f"_id={self._id}"
             f")"
-        )
-
-
+        )