judgeval 0.0.11__py3-none-any.whl → 0.22.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of judgeval may warrant closer review.

Files changed (171)
  1. judgeval/__init__.py +177 -12
  2. judgeval/api/__init__.py +519 -0
  3. judgeval/api/api_types.py +407 -0
  4. judgeval/cli.py +79 -0
  5. judgeval/constants.py +76 -47
  6. judgeval/data/__init__.py +3 -3
  7. judgeval/data/evaluation_run.py +125 -0
  8. judgeval/data/example.py +15 -56
  9. judgeval/data/judgment_types.py +450 -0
  10. judgeval/data/result.py +29 -73
  11. judgeval/data/scorer_data.py +29 -62
  12. judgeval/data/scripts/fix_default_factory.py +23 -0
  13. judgeval/data/scripts/openapi_transform.py +123 -0
  14. judgeval/data/trace.py +121 -0
  15. judgeval/dataset/__init__.py +264 -0
  16. judgeval/env.py +52 -0
  17. judgeval/evaluation/__init__.py +344 -0
  18. judgeval/exceptions.py +27 -0
  19. judgeval/integrations/langgraph/__init__.py +13 -0
  20. judgeval/integrations/openlit/__init__.py +50 -0
  21. judgeval/judges/__init__.py +2 -3
  22. judgeval/judges/base_judge.py +2 -3
  23. judgeval/judges/litellm_judge.py +100 -20
  24. judgeval/judges/together_judge.py +101 -20
  25. judgeval/judges/utils.py +20 -24
  26. judgeval/logger.py +62 -0
  27. judgeval/prompt/__init__.py +330 -0
  28. judgeval/scorers/__init__.py +18 -25
  29. judgeval/scorers/agent_scorer.py +17 -0
  30. judgeval/scorers/api_scorer.py +45 -41
  31. judgeval/scorers/base_scorer.py +83 -38
  32. judgeval/scorers/example_scorer.py +17 -0
  33. judgeval/scorers/exceptions.py +1 -0
  34. judgeval/scorers/judgeval_scorers/__init__.py +0 -148
  35. judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +19 -17
  36. judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +13 -19
  37. judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +12 -19
  38. judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +13 -19
  39. judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +15 -0
  40. judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +327 -0
  41. judgeval/scorers/score.py +77 -306
  42. judgeval/scorers/utils.py +4 -199
  43. judgeval/tracer/__init__.py +1122 -2
  44. judgeval/tracer/constants.py +1 -0
  45. judgeval/tracer/exporters/__init__.py +40 -0
  46. judgeval/tracer/exporters/s3.py +119 -0
  47. judgeval/tracer/exporters/store.py +59 -0
  48. judgeval/tracer/exporters/utils.py +32 -0
  49. judgeval/tracer/keys.py +63 -0
  50. judgeval/tracer/llm/__init__.py +7 -0
  51. judgeval/tracer/llm/config.py +78 -0
  52. judgeval/tracer/llm/constants.py +9 -0
  53. judgeval/tracer/llm/llm_anthropic/__init__.py +3 -0
  54. judgeval/tracer/llm/llm_anthropic/config.py +6 -0
  55. judgeval/tracer/llm/llm_anthropic/messages.py +452 -0
  56. judgeval/tracer/llm/llm_anthropic/messages_stream.py +322 -0
  57. judgeval/tracer/llm/llm_anthropic/wrapper.py +59 -0
  58. judgeval/tracer/llm/llm_google/__init__.py +3 -0
  59. judgeval/tracer/llm/llm_google/config.py +6 -0
  60. judgeval/tracer/llm/llm_google/generate_content.py +127 -0
  61. judgeval/tracer/llm/llm_google/wrapper.py +30 -0
  62. judgeval/tracer/llm/llm_openai/__init__.py +3 -0
  63. judgeval/tracer/llm/llm_openai/beta_chat_completions.py +216 -0
  64. judgeval/tracer/llm/llm_openai/chat_completions.py +501 -0
  65. judgeval/tracer/llm/llm_openai/config.py +6 -0
  66. judgeval/tracer/llm/llm_openai/responses.py +506 -0
  67. judgeval/tracer/llm/llm_openai/utils.py +42 -0
  68. judgeval/tracer/llm/llm_openai/wrapper.py +63 -0
  69. judgeval/tracer/llm/llm_together/__init__.py +3 -0
  70. judgeval/tracer/llm/llm_together/chat_completions.py +406 -0
  71. judgeval/tracer/llm/llm_together/config.py +6 -0
  72. judgeval/tracer/llm/llm_together/wrapper.py +52 -0
  73. judgeval/tracer/llm/providers.py +19 -0
  74. judgeval/tracer/managers.py +167 -0
  75. judgeval/tracer/processors/__init__.py +220 -0
  76. judgeval/tracer/utils.py +19 -0
  77. judgeval/trainer/__init__.py +14 -0
  78. judgeval/trainer/base_trainer.py +122 -0
  79. judgeval/trainer/config.py +128 -0
  80. judgeval/trainer/console.py +144 -0
  81. judgeval/trainer/fireworks_trainer.py +396 -0
  82. judgeval/trainer/trainable_model.py +243 -0
  83. judgeval/trainer/trainer.py +70 -0
  84. judgeval/utils/async_utils.py +39 -0
  85. judgeval/utils/decorators/__init__.py +0 -0
  86. judgeval/utils/decorators/dont_throw.py +37 -0
  87. judgeval/utils/decorators/use_once.py +13 -0
  88. judgeval/utils/file_utils.py +97 -0
  89. judgeval/utils/guards.py +36 -0
  90. judgeval/utils/meta.py +27 -0
  91. judgeval/utils/project.py +15 -0
  92. judgeval/utils/serialize.py +253 -0
  93. judgeval/utils/testing.py +70 -0
  94. judgeval/utils/url.py +10 -0
  95. judgeval/utils/version_check.py +28 -0
  96. judgeval/utils/wrappers/README.md +3 -0
  97. judgeval/utils/wrappers/__init__.py +15 -0
  98. judgeval/utils/wrappers/immutable_wrap_async.py +74 -0
  99. judgeval/utils/wrappers/immutable_wrap_async_iterator.py +84 -0
  100. judgeval/utils/wrappers/immutable_wrap_sync.py +66 -0
  101. judgeval/utils/wrappers/immutable_wrap_sync_iterator.py +84 -0
  102. judgeval/utils/wrappers/mutable_wrap_async.py +67 -0
  103. judgeval/utils/wrappers/mutable_wrap_sync.py +67 -0
  104. judgeval/utils/wrappers/py.typed +0 -0
  105. judgeval/utils/wrappers/utils.py +35 -0
  106. judgeval/version.py +5 -0
  107. judgeval/warnings.py +4 -0
  108. judgeval-0.22.2.dist-info/METADATA +265 -0
  109. judgeval-0.22.2.dist-info/RECORD +112 -0
  110. judgeval-0.22.2.dist-info/entry_points.txt +2 -0
  111. judgeval/clients.py +0 -39
  112. judgeval/common/__init__.py +0 -8
  113. judgeval/common/exceptions.py +0 -28
  114. judgeval/common/logger.py +0 -189
  115. judgeval/common/tracer.py +0 -798
  116. judgeval/common/utils.py +0 -763
  117. judgeval/data/api_example.py +0 -111
  118. judgeval/data/datasets/__init__.py +0 -5
  119. judgeval/data/datasets/dataset.py +0 -286
  120. judgeval/data/datasets/eval_dataset_client.py +0 -193
  121. judgeval/data/datasets/ground_truth.py +0 -54
  122. judgeval/data/datasets/utils.py +0 -74
  123. judgeval/evaluation_run.py +0 -132
  124. judgeval/judges/mixture_of_judges.py +0 -248
  125. judgeval/judgment_client.py +0 -354
  126. judgeval/run_evaluation.py +0 -439
  127. judgeval/scorers/judgeval_scorer.py +0 -140
  128. judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py +0 -19
  129. judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py +0 -19
  130. judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py +0 -22
  131. judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -19
  132. judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py +0 -32
  133. judgeval/scorers/judgeval_scorers/api_scorers/summarization.py +0 -20
  134. judgeval/scorers/judgeval_scorers/api_scorers/tool_correctness.py +0 -19
  135. judgeval/scorers/judgeval_scorers/classifiers/__init__.py +0 -3
  136. judgeval/scorers/judgeval_scorers/classifiers/text2sql/__init__.py +0 -3
  137. judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py +0 -54
  138. judgeval/scorers/judgeval_scorers/local_implementations/__init__.py +0 -24
  139. judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/__init__.py +0 -4
  140. judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/answer_correctness_scorer.py +0 -277
  141. judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/prompts.py +0 -169
  142. judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/__init__.py +0 -4
  143. judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/answer_relevancy_scorer.py +0 -298
  144. judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/prompts.py +0 -174
  145. judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/__init__.py +0 -3
  146. judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/contextual_precision_scorer.py +0 -264
  147. judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/prompts.py +0 -106
  148. judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/__init__.py +0 -3
  149. judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/contextual_recall_scorer.py +0 -254
  150. judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/prompts.py +0 -142
  151. judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/__init__.py +0 -3
  152. judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/contextual_relevancy_scorer.py +0 -245
  153. judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/prompts.py +0 -121
  154. judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/__init__.py +0 -3
  155. judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/faithfulness_scorer.py +0 -325
  156. judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/prompts.py +0 -268
  157. judgeval/scorers/judgeval_scorers/local_implementations/hallucination/__init__.py +0 -3
  158. judgeval/scorers/judgeval_scorers/local_implementations/hallucination/hallucination_scorer.py +0 -263
  159. judgeval/scorers/judgeval_scorers/local_implementations/hallucination/prompts.py +0 -104
  160. judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/__init__.py +0 -5
  161. judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/json_correctness_scorer.py +0 -134
  162. judgeval/scorers/judgeval_scorers/local_implementations/summarization/__init__.py +0 -3
  163. judgeval/scorers/judgeval_scorers/local_implementations/summarization/prompts.py +0 -247
  164. judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarization_scorer.py +0 -550
  165. judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/__init__.py +0 -3
  166. judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/tool_correctness_scorer.py +0 -157
  167. judgeval/scorers/prompt_scorer.py +0 -439
  168. judgeval-0.0.11.dist-info/METADATA +0 -36
  169. judgeval-0.0.11.dist-info/RECORD +0 -84
  170. {judgeval-0.0.11.dist-info → judgeval-0.22.2.dist-info}/WHEEL +0 -0
  171. {judgeval-0.0.11.dist-info → judgeval-0.22.2.dist-info}/licenses/LICENSE.md +0 -0
judgeval/common/tracer.py DELETED
@@ -1,798 +0,0 @@
- """
- Tracing system for judgeval that allows for function tracing using decorators.
- """
-
- import os
- import time
- import functools
- import requests
- import uuid
- from contextlib import contextmanager
- from typing import (
-     Optional,
-     Any,
-     List,
-     Literal,
-     Tuple,
-     Generator,
-     TypeAlias,
-     Union
- )
- from dataclasses import (
-     dataclass,
-     field
- )
- from datetime import datetime
- from openai import OpenAI
- from together import Together
- from anthropic import Anthropic
- from typing import Dict
- import inspect
- import asyncio
- import json
- import warnings
- from pydantic import BaseModel
- from http import HTTPStatus
-
- import pika
- import os
-
- from judgeval.constants import JUDGMENT_TRACES_SAVE_API_URL, JUDGMENT_TRACES_FETCH_API_URL, RABBITMQ_HOST, RABBITMQ_PORT, RABBITMQ_QUEUE, JUDGMENT_TRACES_DELETE_API_URL
- from judgeval.judgment_client import JudgmentClient
- from judgeval.data import Example
- from judgeval.scorers import APIJudgmentScorer, JudgevalScorer, ScorerWrapper
-
- from rich import print as rprint
-
- from judgeval.data.result import ScoringResult
- from judgeval.evaluation_run import EvaluationRun
-
- # Define type aliases for better code readability and maintainability
- ApiClient: TypeAlias = Union[OpenAI, Together, Anthropic]  # Supported API clients
- TraceEntryType = Literal['enter', 'exit', 'output', 'input', 'evaluation']  # Valid trace entry types
- SpanType = Literal['span', 'tool', 'llm', 'evaluation']
-
-
- @dataclass
- class TraceEntry:
-     """Represents a single trace entry with its visual representation.
-
-     Visual representations:
-     - enter: → (function entry)
-     - exit: ← (function exit)
-     - output: Output: (function return value)
-     - input: Input: (function parameters)
-     - evaluation: Evaluation: (evaluation results)
-     """
-     type: TraceEntryType
-     function: str  # Name of the function being traced
-     depth: int  # Indentation level for nested calls
-     message: str  # Human-readable description
-     timestamp: float  # Unix timestamp when entry was created
-     duration: Optional[float] = None  # Time taken (for exit/evaluation entries)
-     output: Any = None  # Function output value
-     # Use field() for mutable defaults to avoid shared state issues
-     inputs: dict = field(default_factory=dict)
-     span_type: SpanType = "span"
-     evaluation_runs: List[Optional[EvaluationRun]] = field(default=None)
-
-     def print_entry(self):
-         indent = " " * self.depth
-         if self.type == "enter":
-             print(f"{indent}→ {self.function} (trace: {self.message})")
-         elif self.type == "exit":
-             print(f"{indent}← {self.function} ({self.duration:.3f}s)")
-         elif self.type == "output":
-             print(f"{indent}Output: {self.output}")
-         elif self.type == "input":
-             print(f"{indent}Input: {self.inputs}")
-         elif self.type == "evaluation":
-             for evaluation_run in self.evaluation_runs:
-                 print(f"{indent}Evaluation: {evaluation_run.model_dump()}")
-
-     def _serialize_inputs(self) -> dict:
-         """Helper method to serialize input data safely.
-
-         Returns a dict with serializable versions of inputs, converting non-serializable
-         objects to None with a warning.
-         """
-         serialized_inputs = {}
-         for key, value in self.inputs.items():
-             if isinstance(value, BaseModel):
-                 serialized_inputs[key] = value.model_dump()
-             elif isinstance(value, (list, tuple)):
-                 # Handle lists/tuples of arguments
-                 serialized_inputs[key] = [
-                     item.model_dump() if isinstance(item, BaseModel)
-                     else None if not self._is_json_serializable(item)
-                     else item
-                     for item in value
-                 ]
-             else:
-                 if self._is_json_serializable(value):
-                     serialized_inputs[key] = value
-                 else:
-                     warnings.warn(f"Input '{key}' for function {self.function} is not JSON serializable. Setting to None.")
-                     serialized_inputs[key] = None
-         return serialized_inputs
-
-     def _is_json_serializable(self, obj: Any) -> bool:
-         """Helper method to check if an object is JSON serializable."""
-         try:
-             json.dumps(obj)
-             return True
-         except (TypeError, OverflowError, ValueError):
-             return False
-
-     def to_dict(self) -> dict:
-         """Convert the trace entry to a dictionary format for storage/transmission."""
-         return {
-             "type": self.type,
-             "function": self.function,
-             "depth": self.depth,
-             "message": self.message,
-             "timestamp": self.timestamp,
-             "duration": self.duration,
-             "output": self._serialize_output(),
-             "inputs": self._serialize_inputs(),
-             "evaluation_runs": [evaluation_run.model_dump() for evaluation_run in self.evaluation_runs] if self.evaluation_runs else [],
-             "span_type": self.span_type
-         }
-
-     def _serialize_output(self) -> Any:
-         """Helper method to serialize output data safely.
-
-         Handles special cases:
-         - Pydantic models are converted using model_dump()
-         - We try to serialize into JSON, then string, then the base representation (__repr__)
-         - Non-serializable objects return None with a warning
-         """
-
-         def safe_stringify(output, function_name):
-             """
-             Safely converts an object to a string or repr, handling serialization issues gracefully.
-             """
-             try:
-                 return str(output)
-             except (TypeError, OverflowError, ValueError):
-                 pass
-
-             try:
-                 return repr(output)
-             except (TypeError, OverflowError, ValueError):
-                 pass
-
-             warnings.warn(
-                 f"Output for function {function_name} is not JSON serializable and could not be converted to string. Setting to None."
-             )
-             return None
-
-         if isinstance(self.output, BaseModel):
-             return self.output.model_dump()
-
-         try:
-             # Try to serialize the output to verify it's JSON compatible
-             json.dumps(self.output)
-             return self.output
-         except (TypeError, OverflowError, ValueError):
-             return safe_stringify(self.output, self.function)
-
-
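For context, a minimal sketch (not taken from the package's documentation) of how the deleted TraceEntry serialization behaved: JSON-compatible inputs pass through, Pydantic models are dumped via model_dump(), and anything unserializable is replaced with None alongside a warning. The function and input names are illustrative only.

import time

entry = TraceEntry(
    type="input",
    function="lookup",          # illustrative function name
    depth=1,
    message="Inputs to lookup",
    timestamp=time.time(),
    inputs={"query": "weather", "conn": object()},  # object() is not JSON serializable
)
data = entry.to_dict()
# data["inputs"] -> {"query": "weather", "conn": None}; a warning is emitted for "conn"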
- class TraceManagerClient:
-     """
-     Client for handling trace endpoints with the Judgment API
-
-
-     Operations include:
-     - Fetching a trace by id
-     - Saving a trace
-     - Deleting a trace
-     """
-     def __init__(self, judgment_api_key: str):
-         self.judgment_api_key = judgment_api_key
-
-     def fetch_trace(self, trace_id: str):
-         """
-         Fetch a trace by its id
-         """
-         response = requests.post(
-             JUDGMENT_TRACES_FETCH_API_URL,
-             json={
-                 "trace_id": trace_id,
-                 "judgment_api_key": self.judgment_api_key,
-             },
-             headers={
-                 "Content-Type": "application/json",
-             }
-         )
-
-         if response.status_code != HTTPStatus.OK:
-             raise ValueError(f"Failed to fetch traces: {response.text}")
-
-         return response.json()
-
-     def save_trace(self, trace_data: dict, empty_save: bool):
-         """
-         Saves a trace to the database
-
-         Args:
-             trace_data: The trace data to save
-             empty_save: Whether to save an empty trace
-         NOTE we save empty traces in order to properly handle async operations; we need something in the DB to associate the async results with
-         """
-         response = requests.post(
-             JUDGMENT_TRACES_SAVE_API_URL,
-             json=trace_data,
-             headers={
-                 "Content-Type": "application/json",
-             }
-         )
-
-         if response.status_code == HTTPStatus.BAD_REQUEST:
-             raise ValueError(f"Failed to save trace data: Check your Trace name for conflicts, set overwrite=True to overwrite existing traces: {response.text}")
-         elif response.status_code != HTTPStatus.OK:
-             raise ValueError(f"Failed to save trace data: {response.text}")
-
-         if not empty_save and "ui_results_url" in response.json():
-             rprint(f"\n🔍 You can view your trace data here: [rgb(106,0,255)]{response.json()['ui_results_url']}[/]\n")
-
-     def delete_trace(self, trace_id: str):
-         """
-         Delete a trace from the database.
-         """
-         response = requests.delete(
-             JUDGMENT_TRACES_DELETE_API_URL,
-             json={
-                 "judgment_api_key": self.judgment_api_key,
-                 "trace_ids": [trace_id],
-             },
-             headers={
-                 "Content-Type": "application/json",
-             }
-         )
-
-         if response.status_code != HTTPStatus.OK:
-             raise ValueError(f"Failed to delete trace: {response.text}")
-
-         return response.json()
-
-     def delete_traces(self, trace_ids: List[str]):
-         """
-         Delete a batch of traces from the database.
-         """
-         response = requests.delete(
-             JUDGMENT_TRACES_DELETE_API_URL,
-             json={
-                 "judgment_api_key": self.judgment_api_key,
-                 "trace_ids": trace_ids,
-             },
-             headers={
-                 "Content-Type": "application/json",
-             }
-         )
-
-         if response.status_code != HTTPStatus.OK:
-             raise ValueError(f"Failed to delete trace: {response.text}")
-
-         return response.json()
-
-
- class TraceClient:
-     """Client for managing a single trace context"""
-     def __init__(self, tracer, trace_id: str, name: str, project_name: str = "default_project", overwrite: bool = False):
-         self.tracer = tracer
-         self.trace_id = trace_id
-         self.name = name
-         self.project_name = project_name
-         self.client: JudgmentClient = tracer.client
-         self.entries: List[TraceEntry] = []
-         self.start_time = time.time()
-         self.span_type = None
-         self._current_span: Optional[TraceEntry] = None
-         self.overwrite = overwrite
-         self.trace_manager_client = TraceManagerClient(tracer.api_key)  # Manages DB operations for trace data
-
-     @contextmanager
-     def span(self, name: str, span_type: SpanType = "span"):
-         """Context manager for creating a trace span"""
-         start_time = time.time()
-
-         # Record span entry
-         self.add_entry(TraceEntry(
-             type="enter",
-             function=name,
-             depth=self.tracer.depth,
-             message=name,
-             timestamp=start_time,
-             span_type=span_type
-         ))
-
-         # Increment nested depth and set current span
-         self.tracer.depth += 1
-         prev_span = self._current_span
-         self._current_span = name
-
-         try:
-             yield self
-         finally:
-             self.tracer.depth -= 1
-             duration = time.time() - start_time
-
-             # Record span exit
-             self.add_entry(TraceEntry(
-                 type="exit",
-                 function=name,
-                 depth=self.tracer.depth,
-                 message=f"← {name}",
-                 timestamp=time.time(),
-                 duration=duration,
-                 span_type=span_type
-             ))
-             self._current_span = prev_span
-
-     def async_evaluate(
-         self,
-         scorers: List[Union[APIJudgmentScorer, JudgevalScorer]],
-         input: Optional[str] = None,
-         actual_output: Optional[str] = None,
-         expected_output: Optional[str] = None,
-         context: Optional[List[str]] = None,
-         retrieval_context: Optional[List[str]] = None,
-         tools_called: Optional[List[str]] = None,
-         expected_tools: Optional[List[str]] = None,
-         additional_metadata: Optional[Dict[str, Any]] = None,
-         model: Optional[str] = None,
-         log_results: Optional[bool] = True,
-     ):
-         start_time = time.time()  # Record start time
-         example = Example(
-             input=input,
-             actual_output=actual_output,
-             expected_output=expected_output,
-             context=context,
-             retrieval_context=retrieval_context,
-             tools_called=tools_called,
-             expected_tools=expected_tools,
-             additional_metadata=additional_metadata,
-             trace_id=self.trace_id
-         )
-
-         try:
-             # Load appropriate implementations for all scorers
-             loaded_scorers: List[Union[JudgevalScorer, APIJudgmentScorer]] = [
-                 scorer.load_implementation(use_judgment=True) if isinstance(scorer, ScorerWrapper) else scorer
-                 for scorer in scorers
-             ]
-         except Exception as e:
-             raise ValueError(f"Failed to load scorers: {str(e)}")
-
-         eval_run = EvaluationRun(
-             log_results=log_results,
-             project_name=self.project_name,
-             eval_name=f"{self.name.capitalize()}-"
-                       f"{self._current_span}-"
-                       f"[{','.join(scorer.load_implementation().score_type.capitalize() for scorer in scorers)}]",
-             examples=[example],
-             scorers=loaded_scorers,
-             model=model,
-             metadata={},
-             judgment_api_key=self.tracer.api_key,
-             override=self.overwrite
-         )
-
-         self.add_eval_run(eval_run, start_time)  # Pass start_time to record_evaluation
-
-     def add_eval_run(self, eval_run: EvaluationRun, start_time: float):
-         """
-         Add evaluation run data to the trace
-
-         Args:
-             eval_run (EvaluationRun): The evaluation run to add to the trace
-             start_time (float): The start time of the evaluation run
-         """
-         if self._current_span:
-             duration = time.time() - start_time  # Calculate duration from start_time
-
-             self.add_entry(TraceEntry(
-                 type="evaluation",
-                 function=self._current_span,
-                 depth=self.tracer.depth,
-                 message=f"Evaluation results for {self._current_span}",
-                 timestamp=time.time(),
-                 evaluation_runs=[eval_run],
-                 duration=duration,
-                 span_type="evaluation"
-             ))
-
-     def record_input(self, inputs: dict):
-         """Record input parameters for the current span"""
-         if self._current_span:
-             self.add_entry(TraceEntry(
-                 type="input",
-                 function=self._current_span,
-                 depth=self.tracer.depth,
-                 message=f"Inputs to {self._current_span}",
-                 timestamp=time.time(),
-                 inputs=inputs,
-                 span_type=self.span_type
-             ))
-
-     async def _update_coroutine_output(self, entry: TraceEntry, coroutine: Any):
-         """Helper method to update the output of a trace entry once the coroutine completes"""
-         try:
-             result = await coroutine
-             entry.output = result
-             return result
-         except Exception as e:
-             entry.output = f"Error: {str(e)}"
-             raise
-
-     def record_output(self, output: Any):
-         """Record output for the current span"""
-         if self._current_span:
-             entry = TraceEntry(
-                 type="output",
-                 function=self._current_span,
-                 depth=self.tracer.depth,
-                 message=f"Output from {self._current_span}",
-                 timestamp=time.time(),
-                 output="<pending>" if inspect.iscoroutine(output) else output,
-                 span_type=self.span_type
-             )
-             self.add_entry(entry)
-
-             if inspect.iscoroutine(output):
-                 # Create a task to update the output once the coroutine completes
-                 asyncio.create_task(self._update_coroutine_output(entry, output))
-
-     def add_entry(self, entry: TraceEntry):
-         """Add a trace entry to this trace context"""
-         self.entries.append(entry)
-         return self
-
-     def print(self):
-         """Print the complete trace with proper visual structure"""
-         for entry in self.entries:
-             entry.print_entry()
-
-     def get_duration(self) -> float:
-         """
-         Get the total duration of this trace
-         """
-         return time.time() - self.start_time
-
-     def condense_trace(self, entries: List[dict]) -> List[dict]:
-         """
-         Condenses trace entries into a single entry for each function call.
-         """
-         condensed = []
-         active_functions = []  # Stack to track nested function calls
-         function_entries = {}  # Store entries for each function
-
-         for entry in entries:
-             function = entry["function"]
-
-             if entry["type"] == "enter":
-                 # Initialize new function entry
-                 function_entries[function] = {
-                     "depth": entry["depth"],
-                     "function": function,
-                     "timestamp": entry["timestamp"],
-                     "inputs": None,
-                     "output": None,
-                     "evaluation_runs": [],
-                     "span_type": entry.get("span_type", "span")
-                 }
-                 active_functions.append(function)
-
-             elif entry["type"] == "exit" and function in active_functions:
-                 # Complete function entry
-                 current_entry = function_entries[function]
-                 current_entry["duration"] = entry["timestamp"] - current_entry["timestamp"]
-                 condensed.append(current_entry)
-                 active_functions.remove(function)
-                 del function_entries[function]
-
-             elif function in active_functions:
-                 # Update existing function entry with additional data
-                 current_entry = function_entries[function]
-
-                 if entry["type"] == "input" and entry["inputs"]:
-                     current_entry["inputs"] = entry["inputs"]
-
-                 if entry["type"] == "output" and entry["output"]:
-                     current_entry["output"] = entry["output"]
-
-                 if entry["type"] == "evaluation" and entry["evaluation_runs"]:
-                     current_entry["evaluation_runs"] = entry["evaluation_runs"]
-
-         # Sort by timestamp
-         condensed.sort(key=lambda x: x["timestamp"])
-         return condensed
-
-     def save(self, empty_save: bool = False, overwrite: bool = False) -> Tuple[str, dict]:
-         """
-         Save the current trace to the database.
-         Returns a tuple of (trace_id, trace_data) where trace_data is the trace data that was saved.
-         """
-         # Calculate total elapsed time
-         total_duration = self.get_duration()
-
-         raw_entries = [entry.to_dict() for entry in self.entries]
-         condensed_entries = self.condense_trace(raw_entries)
-
-         # Calculate total token counts from LLM API calls
-         total_prompt_tokens = 0
-         total_completion_tokens = 0
-         total_tokens = 0
-
-         for entry in condensed_entries:
-             if entry.get("span_type") == "llm" and isinstance(entry.get("output"), dict):
-                 usage = entry["output"].get("usage", {})
-                 # Handle OpenAI/Together format
-                 if "prompt_tokens" in usage:
-                     total_prompt_tokens += usage.get("prompt_tokens", 0)
-                     total_completion_tokens += usage.get("completion_tokens", 0)
-                 # Handle Anthropic format
-                 elif "input_tokens" in usage:
-                     total_prompt_tokens += usage.get("input_tokens", 0)
-                     total_completion_tokens += usage.get("output_tokens", 0)
-                 total_tokens += usage.get("total_tokens", 0)
-
-         # Create trace document
-         trace_data = {
-             "trace_id": self.trace_id,
-             "api_key": self.tracer.api_key,
-             "name": self.name,
-             "project_name": self.project_name,
-             "created_at": datetime.fromtimestamp(self.start_time).isoformat(),
-             "duration": total_duration,
-             "token_counts": {
-                 "prompt_tokens": total_prompt_tokens,
-                 "completion_tokens": total_completion_tokens,
-                 "total_tokens": total_tokens,
-             },
-             "entries": condensed_entries,
-             "empty_save": empty_save,
-             "overwrite": overwrite
-         }
-
-         if not empty_save:
-             connection = pika.BlockingConnection(
-                 pika.ConnectionParameters(host=RABBITMQ_HOST, port=RABBITMQ_PORT))
-             channel = connection.channel()
-
-             channel.queue_declare(queue=RABBITMQ_QUEUE, durable=True)
-
-             channel.basic_publish(
-                 exchange='',
-                 routing_key=RABBITMQ_QUEUE,
-                 body=json.dumps(trace_data),
-                 properties=pika.BasicProperties(
-                     delivery_mode=pika.DeliveryMode.Transient  # Changed from Persistent to Transient
-                 ))
-             connection.close()
-
-         self.trace_manager_client.save_trace(trace_data, empty_save)
-
-         return self.trace_id, trace_data
-
-     def delete(self):
-         return self.trace_manager_client.delete_trace(self.trace_id)
-
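To make the condense_trace() contract above concrete, here is a hypothetical sketch of the folding it performs. The entry dicts are abbreviated for illustration; real entries come from TraceEntry.to_dict() and carry every key.

flat = [
    {"type": "enter", "function": "retrieve", "depth": 1, "timestamp": 100.0, "span_type": "tool"},
    {"type": "input", "function": "retrieve", "inputs": {"query": "weather"}},
    {"type": "output", "function": "retrieve", "output": ["doc1"]},
    {"type": "exit", "function": "retrieve", "timestamp": 100.4},
]
# trace_client.condense_trace(flat) returns roughly:
# [{"depth": 1, "function": "retrieve", "timestamp": 100.0,
#   "inputs": {"query": "weather"}, "output": ["doc1"],
#   "evaluation_runs": [], "span_type": "tool", "duration": 0.4}]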
- class Tracer:
-     _instance = None
-
-     def __new__(cls, *args, **kwargs):
-         if cls._instance is None:
-             cls._instance = super(Tracer, cls).__new__(cls)
-         return cls._instance
-
-     def __init__(self, api_key: str = os.getenv("JUDGMENT_API_KEY")):
-         if not hasattr(self, 'initialized'):
-
-             if not api_key:
-                 raise ValueError("Tracer must be configured with a Judgment API key")
-
-             self.api_key: str = api_key
-             self.client: JudgmentClient = JudgmentClient(judgment_api_key=api_key)
-             self.depth: int = 0
-             self._current_trace: Optional[str] = None
-             self.initialized: bool = True
-
-     @contextmanager
-     def trace(self, name: str, project_name: str = "default_project", overwrite: bool = False) -> Generator[TraceClient, None, None]:
-         """Start a new trace context using a context manager"""
-         trace_id = str(uuid.uuid4())
-         trace = TraceClient(self, trace_id, name, project_name=project_name, overwrite=overwrite)
-         prev_trace = self._current_trace
-         self._current_trace = trace
-
-         # Automatically create top-level span
-         with trace.span(name or "unnamed_trace") as span:
-             try:
-                 # Save the trace to the database to handle Evaluations' trace_id referential integrity
-                 trace.save(empty_save=True, overwrite=overwrite)
-                 yield trace
-             finally:
-                 self._current_trace = prev_trace
-
-     def get_current_trace(self) -> Optional[TraceClient]:
-         """
-         Get the current trace context
-         """
-         return self._current_trace
-
-     def observe(self, func=None, *, name=None, span_type: SpanType = "span"):
-         """
-         Decorator to trace function execution with detailed entry/exit information.
-
-         Args:
-             func: The function to trace
-             name: Optional custom name for the function
-             span_type: The type of span to use for this observation (default: "span")
-         """
-         if func is None:
-             return lambda f: self.observe(f, name=name, span_type=span_type)
-
-         if asyncio.iscoroutinefunction(func):
-             @functools.wraps(func)
-             async def async_wrapper(*args, **kwargs):
-                 if self._current_trace:
-                     span_name = name or func.__name__
-
-                     with self._current_trace.span(span_name, span_type=span_type) as span:
-                         # Set the span type
-                         span.span_type = span_type
-
-                         # Record inputs
-                         span.record_input({
-                             'args': list(args),
-                             'kwargs': kwargs
-                         })
-
-                         # Execute function
-                         result = await func(*args, **kwargs)
-
-                         # Record output
-                         span.record_output(result)
-
-                         return result
-
-                 return await func(*args, **kwargs)
-             return async_wrapper
-         else:
-             @functools.wraps(func)
-             def wrapper(*args, **kwargs):
-                 if self._current_trace:
-                     span_name = name or func.__name__
-
-                     with self._current_trace.span(span_name, span_type=span_type) as span:
-                         # Set the span type
-                         span.span_type = span_type
-
-                         # Record inputs
-                         span.record_input({
-                             'args': list(args),
-                             'kwargs': kwargs
-                         })
-
-                         # Execute function
-                         result = func(*args, **kwargs)
-
-                         # Record output
-                         span.record_output(result)
-
-                         return result
-
-                 return func(*args, **kwargs)
-             return wrapper
-
- def wrap(client: Any) -> Any:
-     """
-     Wraps an API client to add tracing capabilities.
-     Supports OpenAI, Together, and Anthropic clients.
-     """
-     tracer = Tracer._instance  # Get the global tracer instance
-
-     # Get the appropriate configuration for this client type
-     span_name, original_create = _get_client_config(client)
-
-     def traced_create(*args, **kwargs):
-         # Skip tracing if no active trace
-         if not (tracer and tracer._current_trace):
-             return original_create(*args, **kwargs)
-
-         with tracer._current_trace.span(span_name, span_type="llm") as span:
-             # Format and record the input parameters
-             input_data = _format_input_data(client, **kwargs)
-             span.record_input(input_data)
-
-             # Make the actual API call
-             response = original_create(*args, **kwargs)
-
-             # Format and record the output
-             output_data = _format_output_data(client, response)
-             span.record_output(output_data)
-
-             return response
-
-     # Replace the original method with our traced version
-     if isinstance(client, (OpenAI, Together)):
-         client.chat.completions.create = traced_create
-     elif isinstance(client, Anthropic):
-         client.messages.create = traced_create
-
-     return client
-
- # Helper functions for client-specific operations
-
- def _get_client_config(client: ApiClient) -> tuple[str, callable]:
-     """Returns configuration tuple for the given API client.
-
-     Args:
-         client: An instance of OpenAI, Together, or Anthropic client
-
-     Returns:
-         tuple: (span_name, create_method)
-             - span_name: String identifier for tracing
-             - create_method: Reference to the client's creation method
-
-     Raises:
-         ValueError: If client type is not supported
-     """
-     if isinstance(client, OpenAI):
-         return "OPENAI_API_CALL", client.chat.completions.create
-     elif isinstance(client, Together):
-         return "TOGETHER_API_CALL", client.chat.completions.create
-     elif isinstance(client, Anthropic):
-         return "ANTHROPIC_API_CALL", client.messages.create
-     raise ValueError(f"Unsupported client type: {type(client)}")
-
- def _format_input_data(client: ApiClient, **kwargs) -> dict:
-     """Format input parameters based on client type.
-
-     Extracts relevant parameters from kwargs based on the client type
-     to ensure consistent tracing across different APIs.
-     """
-     if isinstance(client, (OpenAI, Together)):
-         return {
-             "model": kwargs.get("model"),
-             "messages": kwargs.get("messages"),
-         }
-     # Anthropic requires additional max_tokens parameter
-     return {
-         "model": kwargs.get("model"),
-         "messages": kwargs.get("messages"),
-         "max_tokens": kwargs.get("max_tokens")
-     }
-
- def _format_output_data(client: ApiClient, response: Any) -> dict:
-     """Format API response data based on client type.
-
-     Normalizes different response formats into a consistent structure
-     for tracing purposes.
-
-     Returns:
-         dict containing:
-             - content: The generated text
-             - usage: Token usage statistics
-     """
-     if isinstance(client, (OpenAI, Together)):
-         return {
-             "content": response.choices[0].message.content,
-             "usage": {
-                 "prompt_tokens": response.usage.prompt_tokens,
-                 "completion_tokens": response.usage.completion_tokens,
-                 "total_tokens": response.usage.total_tokens
-             }
-         }
-     # Anthropic has a different response structure
-     return {
-         "content": response.content[0].text,
-         "usage": {
-             "input_tokens": response.usage.input_tokens,
-             "output_tokens": response.usage.output_tokens,
-             "total_tokens": response.usage.input_tokens + response.usage.output_tokens
-         }
-     }