judgeval 0.0.17__py3-none-any.whl → 0.0.19__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/__init__.py +1 -3
- judgeval/clients.py +0 -7
- judgeval/common/logger.py +0 -1
- judgeval/common/tracer.py +250 -42
- judgeval/common/utils.py +9 -5
- judgeval/constants.py +6 -1
- judgeval/data/__init__.py +2 -0
- judgeval/data/api_example.py +2 -2
- judgeval/data/datasets/__init__.py +1 -2
- judgeval/data/datasets/dataset.py +4 -5
- judgeval/data/datasets/eval_dataset_client.py +1 -2
- judgeval/data/datasets/utils.py +1 -2
- judgeval/data/example.py +71 -16
- judgeval/data/scorer_data.py +1 -1
- judgeval/evaluation_run.py +2 -2
- judgeval/judges/__init__.py +0 -1
- judgeval/judges/base_judge.py +1 -1
- judgeval/judges/mixture_of_judges.py +7 -2
- judgeval/judgment_client.py +8 -4
- judgeval/rules.py +2 -4
- judgeval/run_evaluation.py +2 -5
- judgeval/scorers/__init__.py +6 -0
- judgeval/scorers/api_scorer.py +12 -6
- judgeval/scorers/base_scorer.py +12 -6
- judgeval/scorers/judgeval_scorer.py +7 -3
- judgeval/scorers/judgeval_scorers/__init__.py +24 -3
- judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +6 -0
- judgeval/scorers/judgeval_scorers/api_scorers/comparison.py +35 -0
- judgeval/scorers/judgeval_scorers/api_scorers/groundedness.py +19 -0
- judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +19 -0
- judgeval/scorers/judgeval_scorers/local_implementations/__init__.py +4 -1
- judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/answer_correctness_scorer.py +0 -1
- judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/prompts.py +2 -2
- judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/answer_relevancy_scorer.py +7 -6
- judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/prompts.py +2 -2
- judgeval/scorers/judgeval_scorers/local_implementations/comparison/__init__.py +0 -0
- judgeval/scorers/judgeval_scorers/local_implementations/comparison/comparison_scorer.py +161 -0
- judgeval/scorers/judgeval_scorers/local_implementations/comparison/prompts.py +222 -0
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/prompts.py +2 -2
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/prompts.py +2 -2
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/prompts.py +2 -2
- judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/faithfulness_scorer.py +1 -8
- judgeval/scorers/judgeval_scorers/local_implementations/hallucination/hallucination_scorer.py +7 -6
- judgeval/scorers/judgeval_scorers/local_implementations/hallucination/prompts.py +2 -2
- judgeval/scorers/judgeval_scorers/local_implementations/instruction_adherence/instruction_adherence.py +232 -0
- judgeval/scorers/judgeval_scorers/local_implementations/instruction_adherence/prompt.py +102 -0
- judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/json_correctness_scorer.py +7 -7
- judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarization_scorer.py +7 -6
- judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/tool_correctness_scorer.py +1 -2
- judgeval/scorers/prompt_scorer.py +7 -5
- judgeval/scorers/utils.py +1 -1
- {judgeval-0.0.17.dist-info → judgeval-0.0.19.dist-info}/METADATA +1 -1
- {judgeval-0.0.17.dist-info → judgeval-0.0.19.dist-info}/RECORD +56 -48
- /judgeval/data/{datasets/ground_truth.py → ground_truth.py} +0 -0
- {judgeval-0.0.17.dist-info → judgeval-0.0.19.dist-info}/WHEEL +0 -0
- {judgeval-0.0.17.dist-info → judgeval-0.0.19.dist-info}/licenses/LICENSE.md +0 -0
judgeval/__init__.py
CHANGED
@@ -1,12 +1,10 @@
 # Import key components that should be publicly accessible
-from judgeval.clients import client,
+from judgeval.clients import client, together_client
 from judgeval.judgment_client import JudgmentClient
 
 __all__ = [
     # Clients
     'client',
-    'langfuse',
     'together_client',
-
     'JudgmentClient',
 ]
judgeval/clients.py
CHANGED
@@ -1,19 +1,12 @@
 import os
 from dotenv import load_dotenv
 from openai import OpenAI
-from langfuse import Langfuse
 from typing import Optional
 from together import Together, AsyncTogether
 
 PATH_TO_DOTENV = os.path.join(os.path.dirname(__file__), ".env")
 load_dotenv(dotenv_path=PATH_TO_DOTENV)
 
-# Initialize required clients
-langfuse = Langfuse(
-    secret_key=os.getenv("LANGFUSE_SECRET_KEY"),
-    public_key=os.getenv("LANGFUSE_PUBLIC_KEY"),
-    host=os.getenv("LANGFUSE_HOST"),
-)
 
 # Initialize optional OpenAI client
 client: Optional['OpenAI'] = None
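Taken together, the two hunks above drop Langfuse from the package's public import surface. A minimal sketch of what still imports cleanly in 0.0.19 (nothing here beyond what the new __all__ exports):

    # 0.0.19 public imports (per the new __all__)
    from judgeval import client, together_client, JudgmentClient

    # No longer available:
    # from judgeval import langfuse   # raises ImportError in 0.0.19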
judgeval/common/logger.py
CHANGED
judgeval/common/tracer.py
CHANGED
@@ -1,60 +1,68 @@
 """
 Tracing system for judgeval that allows for function tracing using decorators.
 """
-
-import
-import time
+# Standard library imports
+import asyncio
 import functools
-import requests
-import uuid
-from contextlib import contextmanager
-from typing import (
-    Optional,
-    Any,
-    List,
-    Literal,
-    Tuple,
-    Generator,
-    TypeAlias,
-    Union
-)
-from dataclasses import (
-    dataclass,
-    field
-)
-from datetime import datetime
-from openai import OpenAI
-from together import Together
-from anthropic import Anthropic
-from typing import Dict
 import inspect
-import asyncio
 import json
+import os
+import time
+import uuid
 import warnings
-from
+from contextlib import contextmanager
+from dataclasses import dataclass, field
+from datetime import datetime
 from http import HTTPStatus
+from typing import Any, Dict, Generator, List, Literal, Optional, Tuple, TypeAlias, Union
+from rich import print as rprint
+from uuid import UUID
+from collections.abc import Sequence
 
+# Third-party imports
 import pika
-import
+import requests
+from pydantic import BaseModel
+from rich import print as rprint
+from openai import OpenAI
+from together import Together
+from anthropic import Anthropic
 
-
+# Local application/library-specific imports
+from judgeval.constants import (
+    JUDGMENT_TRACES_SAVE_API_URL,
+    JUDGMENT_TRACES_FETCH_API_URL,
+    RABBITMQ_HOST,
+    RABBITMQ_PORT,
+    RABBITMQ_QUEUE,
+    JUDGMENT_TRACES_DELETE_API_URL,
+    JUDGMENT_TRACES_ADD_TO_EVAL_QUEUE_API_URL
+)
 from judgeval.judgment_client import JudgmentClient
 from judgeval.data import Example
 from judgeval.scorers import APIJudgmentScorer, JudgevalScorer, ScorerWrapper
 from judgeval.rules import Rule
 from judgeval.evaluation_run import EvaluationRun
-from judgeval.
+from judgeval.data.result import ScoringResult
 
-from
+from langchain_core.language_models import BaseChatModel
+from langchain_huggingface import ChatHuggingFace
+from langchain_openai import ChatOpenAI
+from langchain_anthropic import ChatAnthropic
+from langchain_core.utils.function_calling import convert_to_openai_tool
+from langchain_core.callbacks import CallbackManager, BaseCallbackHandler
+from langchain_core.agents import AgentAction, AgentFinish
+from langchain_core.outputs import LLMResult
 
-from
+from langchain_core.messages.ai import AIMessage
+from langchain_core.messages.tool import ToolMessage
+from langchain_core.messages.base import BaseMessage
+from langchain_core.documents import Document
 
 # Define type aliases for better code readability and maintainability
 ApiClient: TypeAlias = Union[OpenAI, Together, Anthropic]  # Supported API clients
 TraceEntryType = Literal['enter', 'exit', 'output', 'input', 'evaluation']  # Valid trace entry types
-SpanType = Literal['span', 'tool', 'llm', 'evaluation']
-
-
+SpanType = Literal['span', 'tool', 'llm', 'evaluation', 'chain']
 @dataclass
 class TraceEntry:
     """Represents a single trace entry with its visual representation.
@@ -419,7 +427,7 @@ class TraceClient:
 
             # Prevent using JudgevalScorer with rules - only APIJudgmentScorer allowed with rules
             if loaded_rules and any(isinstance(scorer, JudgevalScorer) for scorer in loaded_scorers):
-                raise ValueError("Cannot use Judgeval scorers
+                raise ValueError("Cannot use Judgeval scorers, you can only use API scorers when using rules. Please either remove rules or use only APIJudgmentScorer types.")
 
         except Exception as e:
             warnings.warn(f"Failed to load scorers: {str(e)}")
@@ -455,9 +463,12 @@ class TraceClient:
         if self._current_span:
             duration = time.time() - start_time  # Calculate duration from start_time
 
+            prev_entry = self.entries[-1]
+
+            # Select the last entry in the trace if it's an LLM call, otherwise use the current span
             self.add_entry(TraceEntry(
                 type="evaluation",
-                function=self._current_span,
+                function=prev_entry.function if prev_entry.span_type == "llm" else self._current_span,
                 depth=self.tracer.depth,
                 message=f"Evaluation results for {self._current_span}",
                 timestamp=time.time(),
@@ -531,7 +542,7 @@ class TraceClient:
         active_functions = []  # Stack to track nested function calls
         function_entries = {}  # Store entries for each function
 
-        for entry in entries:
+        for i, entry in enumerate(entries):
            function = entry["function"]
 
            if entry["type"] == "enter":
@@ -553,9 +564,12 @@ class TraceClient:
                current_entry["duration"] = entry["timestamp"] - current_entry["timestamp"]
                condensed.append(current_entry)
                active_functions.remove(function)
-               del function_entries[function]
+               # del function_entries[function]
 
-
+           # The OR condition is to handle the LLM client case.
+           # LLM client is a special case where we exit the span, so when we attach evaluations to it,
+           # we have to check if the previous entry is an LLM call.
+           elif function in active_functions or entry["type"] == "evaluation" and entries[i-1]["function"] == entry["function"]:
                # Update existing function entry with additional data
                current_entry = function_entries[function]
 
@@ -570,6 +584,7 @@ class TraceClient:
 
        # Sort by timestamp
        condensed.sort(key=lambda x: x["timestamp"])
+
       return condensed
 
    def save(self, empty_save: bool = False, overwrite: bool = False) -> Tuple[str, dict]:
@@ -581,6 +596,7 @@ class TraceClient:
       total_duration = self.get_duration()
 
       raw_entries = [entry.to_dict() for entry in self.entries]
+
       condensed_entries = self.condense_trace(raw_entries)
 
       # Calculate total token counts from LLM API calls
@@ -755,7 +771,7 @@ class Tracer:
             with trace.span(span_name, span_type=span_type) as span:
                 # Record inputs
                 span.record_input({
-                    'args':
+                    'args': str(args),
                     'kwargs': kwargs
                 })
 
@@ -792,7 +808,7 @@ class Tracer:
             with trace.span(span_name, span_type=span_type) as span:
                 # Record inputs
                 span.record_input({
-                    'args':
+                    'args': str(args),
                     'kwargs': kwargs
                 })
 
@@ -810,6 +826,28 @@ class Tracer:
             self._current_trace = None
 
         return wrapper
+
+    def score(self, func=None, scorers: List[Union[APIJudgmentScorer, JudgevalScorer]] = None, model: str = None, log_results: bool = True, *, name: str = None, span_type: SpanType = "span"):
+        """
+        Decorator to trace function execution with detailed entry/exit information.
+        """
+        if func is None:
+            return lambda f: self.observe(f, name=name, span_type=span_type)
+
+        if asyncio.iscoroutinefunction(func):
+            @functools.wraps(func)
+            async def async_wrapper(*args, **kwargs):
+                if self._current_trace:
+                    self._current_trace.async_evaluate(scorers=[scorers], input=args, actual_output=kwargs, model=model, log_results=log_results)
+            return async_wrapper
+        else:
+            @functools.wraps(func)
+            def wrapper(*args, **kwargs):
+                if self._current_trace:
+                    self._current_trace.async_evaluate(scorers=[scorers], input=args, actual_output=kwargs, model="gpt-4o-mini", log_results=True)
+            return wrapper
+
+
 
 def wrap(client: Any) -> Any:
     """
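A rough usage sketch for the new Tracer.score decorator (assumed usage, not taken from this diff; `judgment` is assumed to be an existing Tracer instance and AnswerRelevancyScorer an assumed scorer class):

    # Assumed usage of Tracer.score; the names below are illustrative only.
    @judgment.score(scorers=[AnswerRelevancyScorer(threshold=0.7)], model="gpt-4o-mini")
    def answer(question: str) -> str:
        ...

    # Note: as written in the hunk above, the wrapper only calls
    # self._current_trace.async_evaluate when a trace is active and does not
    # invoke the wrapped function itself.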
@@ -920,3 +958,173 @@ def _format_output_data(client: ApiClient, response: Any) -> dict:
             "total_tokens": response.usage.input_tokens + response.usage.output_tokens
         }
     }
+
+class JudgevalCallbackHandler(BaseCallbackHandler):
+    def __init__(self, trace_client: TraceClient):
+        self.trace_client = trace_client
+        self.openai_count = 1
+
+    def start_span(self, name: str, span_type: SpanType = "span"):
+        start_time = time.time()
+
+        # Record span entry
+        self.trace_client.add_entry(TraceEntry(
+            type="enter",
+            function=name,
+            depth=self.trace_client.tracer.depth,
+            message=name,
+            timestamp=start_time,
+            span_type=span_type
+        ))
+
+        self.trace_client.tracer.depth += 1
+        self.trace_client.prev_span = self.trace_client._current_span
+        self.trace_client._current_span = name
+        self._start_time = start_time
+
+    def end_span(self, name: str, span_type: SpanType = "span"):
+        self.trace_client.tracer.depth -= 1
+        duration = time.time() - self._start_time
+
+        # Record span exit
+        self.trace_client.add_entry(TraceEntry(
+            type="exit",
+            function=name,
+            depth=self.trace_client.tracer.depth,
+            message=f"← {name}",
+            timestamp=time.time(),
+            duration=duration,
+            span_type=span_type
+        ))
+        self.trace_client._current_span = self.trace_client.prev_span
+
+    def on_retriever_start(
+        self,
+        serialized: Optional[dict[str, Any]],
+        query: str,
+        *,
+        run_id: UUID,
+        parent_run_id: Optional[UUID] = None,
+        tags: Optional[list[str]] = None,
+        metadata: Optional[dict[str, Any]] = None,
+        **kwargs: Any,
+    ) -> Any:
+        name = "RETRIEVER_CALL"
+        if serialized and "name" in serialized:
+            name = f"RETRIEVER_{serialized['name'].upper()}"
+
+        self.start_span(name, span_type="retriever")
+        self.trace_client.record_input({
+            'query': query,
+            'tags': tags,
+            'metadata': metadata,
+            'kwargs': kwargs
+        })
+
+    def on_retriever_end(
+        self,
+        documents: Sequence[Document],
+        *,
+        run_id: UUID,
+        parent_run_id: Optional[UUID] = None,
+        **kwargs: Any
+    ) -> Any:
+        # Process the retrieved documents into a format suitable for logging
+        doc_summary = []
+        for i, doc in enumerate(documents):
+            # Extract key information from each document
+            doc_data = {
+                "index": i,
+                "page_content": doc.page_content[:100] + "..." if len(doc.page_content) > 100 else doc.page_content,
+                "metadata": doc.metadata
+            }
+            doc_summary.append(doc_data)
+
+        # Record the document data
+        self.trace_client.record_output({
+            "document_count": len(documents),
+            "documents": doc_summary
+        })
+
+        # End the retriever span
+        self.end_span(self.trace_client._current_span, span_type="retriever")
+
+    def on_tool_start(
+        self,
+        serialized: Optional[dict[str, Any]],
+        input_str: str,
+        run_id: Optional[UUID] = None,
+        parent_run_id: Optional[UUID] = None,
+        inputs: Optional[dict[str, Any]] = None,
+        **kwargs: Any,
+    ):
+        name = serialized["name"]
+        self.start_span(name, span_type="tool")
+        self.trace_client.record_input({
+            'args': input_str,
+            'kwargs': kwargs
+        })
+
+    def on_tool_end(self, output: Any, *, run_id: UUID, parent_run_id: Optional[UUID] = None, **kwargs: Any) -> Any:
+        self.trace_client.record_output(output)
+        self.end_span(self.trace_client._current_span, span_type="tool")
+
+    def on_agent_action(self, action: AgentAction, **kwargs: Any) -> Any:
+        print(f"Agent action: {action}")
+
+    def on_agent_finish(
+        self,
+        finish: AgentFinish,
+        *,
+        run_id: UUID,
+        parent_run_id: Optional[UUID] = None,
+        tags: Optional[list[str]] = None,
+        **kwargs: Any,
+    ) -> None:
+        print(f"Agent action: {finish}")
+
+    def on_llm_start(
+        self,
+        serialized: Optional[dict[str, Any]],
+        prompts: list[str],
+        *,
+        run_id: UUID,
+        parent_run_id: Optional[UUID] = None,
+        **kwargs: Any,
+    ) -> Any:
+        name = "LLM call"
+        self.start_span(name, span_type="llm")
+        self.trace_client.record_input({
+            'args': prompts,
+            'kwargs': kwargs
+        })
+
+    def on_llm_end(self, response: LLMResult, *, run_id: UUID, parent_run_id: Optional[UUID] = None, **kwargs: Any):
+        self.trace_client.record_output(response.generations[0][0].text)
+        self.end_span(self.trace_client._current_span, span_type="llm")
+
+    def on_chat_model_start(
+        self,
+        serialized: Optional[dict[str, Any]],
+        messages: list[list[BaseMessage]],
+        *,
+        run_id: UUID,
+        parent_run_id: Optional[UUID] = None,
+        **kwargs: Any,
+    ) -> Any:
+
+        if "openai" in serialized["id"]:
+            name = f"OPENAI_API_CALL_{self.openai_count}"
+            self.openai_count += 1
+        elif "anthropic" in serialized["id"]:
+            name = "ANTHROPIC_API_CALL"
+        elif "together" in serialized["id"]:
+            name = "TOGETHER_API_CALL"
+        else:
+            name = "LLM call"
+
+        self.start_span(name, span_type="llm")
+        self.trace_client.record_input({
+            'args': str(messages),
+            'kwargs': kwargs
+        })
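A minimal sketch of wiring the new handler into a LangChain model (assumed usage; `trace_client` is an existing TraceClient instance, which this hunk does not show how to construct):

    from judgeval.common.tracer import JudgevalCallbackHandler
    from langchain_openai import ChatOpenAI

    handler = JudgevalCallbackHandler(trace_client)   # trace_client: TraceClient (assumed)
    llm = ChatOpenAI(model="gpt-4o-mini", callbacks=[handler])
    llm.invoke("Hello")   # on_chat_model_start / on_llm_end open and close an 'llm' span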
judgeval/common/utils.py
CHANGED
@@ -8,15 +8,19 @@ For API calling, we support:
 NOTE: any function beginning with 'a', e.g. 'afetch_together_api_response', is an asynchronous function
 """
 
-
-from typing import List, Mapping, Dict, Union, Optional, Literal, Any
+# Standard library imports
 import asyncio
+import concurrent.futures
+import os
+import pprint
+from typing import Any, Dict, List, Literal, Mapping, Optional, Union
+
+# Third-party imports
 import litellm
 import pydantic
-import
-import os
-from dotenv import load_dotenv
+from dotenv import load_dotenv
 
+# Local application/library-specific imports
 from judgeval.clients import async_together_client, together_client
 from judgeval.constants import *
 from judgeval.common.logger import debug, error
judgeval/constants.py
CHANGED
@@ -21,8 +21,11 @@ class APIScorer(str, Enum):
     CONTEXTUAL_RECALL = "contextual_recall"
     CONTEXTUAL_RELEVANCY = "contextual_relevancy"
     CONTEXTUAL_PRECISION = "contextual_precision"
+    INSTRUCTION_ADHERENCE = "instruction_adherence"
     TOOL_CORRECTNESS = "tool_correctness"
     JSON_CORRECTNESS = "json_correctness"
+    COMPARISON = "comparison"
+    GROUNDEDNESS = "groundedness"
 
     @classmethod
     def _missing_(cls, value):
@@ -31,6 +34,8 @@ class APIScorer(str, Enum):
             if member.value == value.lower():
                 return member
 
+UNBOUNDED_SCORERS = set([APIScorer.COMPARISON])  # scorers whose scores are not bounded between 0-1
+
 ROOT_API = os.getenv("JUDGMENT_API_URL", "https://api.judgmentlabs.ai")
 # API URLs
 JUDGMENT_EVAL_API_URL = f"{ROOT_API}/evaluate/"
@@ -110,7 +115,7 @@ TOGETHER_SUPPORTED_MODELS = [
     "mistralai/Mistral-7B-Instruct-v0.1"
 ]
 
-JUDGMENT_SUPPORTED_MODELS = {"osiris-large", "osiris-mini"}
+JUDGMENT_SUPPORTED_MODELS = {"osiris-large", "osiris-mini", "osiris"}
 
 ACCEPTABLE_MODELS = set(litellm.model_list) | set(TOGETHER_SUPPORTED_MODELS) | JUDGMENT_SUPPORTED_MODELS
 
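A small sketch of how the new UNBOUNDED_SCORERS set can be used; the helper below is illustrative, not part of the package:

    from judgeval.constants import APIScorer, UNBOUNDED_SCORERS

    def is_bounded(scorer: APIScorer) -> bool:
        # Comparison scores are not restricted to the 0-1 range
        return scorer not in UNBOUNDED_SCORERS

    assert is_bounded(APIScorer.GROUNDEDNESS)
    assert not is_bounded(APIScorer.COMPARISON)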
judgeval/data/__init__.py
CHANGED
@@ -2,6 +2,7 @@ from judgeval.data.example import Example, ExampleParams
 from judgeval.data.api_example import ProcessExample, create_process_example
 from judgeval.data.scorer_data import ScorerData, create_scorer_data
 from judgeval.data.result import ScoringResult, generate_scoring_result
+from judgeval.data.ground_truth import GroundTruthExample
 
 __all__ = [
     "Example",
@@ -12,4 +13,5 @@ __all__ = [
     "create_scorer_data",
     "ScoringResult",
     "generate_scoring_result",
+    "GroundTruthExample",
 ]
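With GroundTruthExample now re-exported from judgeval.data (and the old datasets/ground_truth.py module relocated, per the file list above), downstream imports can switch to the new path:

    # 0.0.19
    from judgeval.data import Example, GroundTruthExample
    # 0.0.17 (old path, removed in this release)
    # from judgeval.data.datasets.ground_truth import GroundTruthExample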
judgeval/data/api_example.py
CHANGED
@@ -1,5 +1,5 @@
-from typing import List, Optional, Dict, Any
-from pydantic import BaseModel,
+from typing import List, Optional, Dict, Any
+from pydantic import BaseModel, ConfigDict, model_validator
 
 from judgeval.data.example import Example
 from judgeval.data.scorer_data import ScorerData
judgeval/data/datasets/__init__.py
CHANGED
@@ -1,5 +1,4 @@
 from judgeval.data.datasets.dataset import EvalDataset
-from judgeval.data.datasets.ground_truth import GroundTruthExample
 from judgeval.data.datasets.eval_dataset_client import EvalDatasetClient
 
-__all__ = ["EvalDataset", "EvalDatasetClient"
+__all__ = ["EvalDataset", "EvalDatasetClient"]
judgeval/data/datasets/dataset.py
CHANGED
@@ -1,13 +1,12 @@
 import ast
 import csv
-import datetime
+import datetime
 import json
-from dataclasses import dataclass, field
 import os
-from
+from dataclasses import dataclass, field
+from typing import List, Union, Literal
 
-from judgeval.data
-from judgeval.data import Example
+from judgeval.data import Example, GroundTruthExample
 from judgeval.common.logger import debug, error, warning, info
 
 @dataclass
judgeval/data/datasets/eval_dataset_client.py
CHANGED
@@ -11,9 +11,8 @@ from judgeval.constants import (
     JUDGMENT_DATASETS_EDIT_API_URL,
     JUDGMENT_DATASETS_EXPORT_JSONL_API_URL
 )
-from judgeval.data import Example
+from judgeval.data import Example, GroundTruthExample
 from judgeval.data.datasets import EvalDataset
-from judgeval.data.datasets.ground_truth import GroundTruthExample
 
 
 
judgeval/data/datasets/utils.py
CHANGED
@@ -1,7 +1,6 @@
 from typing import List, Optional
 
-from judgeval.data
-from judgeval.data import Example
+from judgeval.data import Example, GroundTruthExample
 
 
 def examples_to_ground_truths(examples: List[Example]) -> List[GroundTruthExample]:
judgeval/data/example.py
CHANGED
@@ -2,17 +2,12 @@
 Classes for representing examples in a dataset.
 """
 
-
-from typing import TypeVar, Optional, Any, Dict, List
+from typing import Optional, Any, Dict, List
 from uuid import uuid4
 from pydantic import BaseModel, Field, field_validator
 from enum import Enum
 from datetime import datetime
-import time
-
 
-Input = TypeVar('Input')
-Output = TypeVar('Output')
 
 class ExampleParams(Enum):
     INPUT = "input"
@@ -23,11 +18,12 @@ class ExampleParams(Enum):
     TOOLS_CALLED = "tools_called"
     EXPECTED_TOOLS = "expected_tools"
     REASONING = "reasoning"
+    ADDITIONAL_METADATA = "additional_metadata"
 
 
 class Example(BaseModel):
-    input:
-    actual_output:
+    input: str
+    actual_output: str
     expected_output: Optional[str] = None
     context: Optional[List[str]] = None
     retrieval_context: Optional[List[str]] = None
@@ -39,22 +35,81 @@ class Example(BaseModel):
     example_index: Optional[int] = None
     timestamp: Optional[str] = None
     trace_id: Optional[str] = None
-
-    @field_validator('input', 'actual_output', mode='before')
-    def convert_to_str(cls, value):
-        try:
-            return str(value)
-        except Exception:
-            return repr(value)
 
     def __init__(self, **data):
+        # Check that required fields are provided
+        if 'input' not in data:
+            raise ValueError("Example must be initialized with 'input' field.")
+        if 'actual_output' not in data:
+            raise ValueError("Example must be initialized with 'actual_output' field.")
+
         if 'example_id' not in data:
             data['example_id'] = str(uuid4())
         # Set timestamp if not provided
         if 'timestamp' not in data:
             data['timestamp'] = datetime.now().strftime("%Y%m%d_%H%M%S")
         super().__init__(**data)
-
+
+    @field_validator('input', mode='before')
+    @classmethod
+    def validate_input(cls, v):
+        if not v or not isinstance(v, str):
+            raise ValueError(f"Input must be a non-empty string but got '{v}' of type {type(v)}")
+        return v
+
+    @field_validator('actual_output', mode='before')
+    @classmethod
+    def validate_actual_output(cls, v):
+        if not isinstance(v, str):
+            raise ValueError(f"Actual output must be a string but got '{v}' of type {type(v)}")
+        return v
+
+    @field_validator('expected_output', mode='before')
+    @classmethod
+    def validate_expected_output(cls, v):
+        if v is not None and not isinstance(v, str):
+            raise ValueError(f"Expected output must be a string or None but got {v} of type {type(v)}")
+        return v
+
+    @field_validator('context', 'retrieval_context', 'tools_called', 'expected_tools', mode='before')
+    @classmethod
+    def validate_string_lists(cls, v, info):
+        field_name = info.field_name
+        if v is not None:
+            if not isinstance(v, list):
+                raise ValueError(f"{field_name} must be a list of strings or None but got {v} of type {type(v)}")
+            for i, item in enumerate(v):
+                if not isinstance(item, str):
+                    raise ValueError(f"All items in {field_name} must be strings but item at index {i} is {item} of type {type(item)}")
+        return v
+
+    @field_validator('additional_metadata', mode='before')
+    @classmethod
+    def validate_additional_metadata(cls, v):
+        if v is not None and not isinstance(v, dict):
+            raise ValueError(f"Additional metadata must be a dictionary or None but got {v} of type {type(v)}")
+        return v
+
+    @field_validator('example_index', mode='before')
+    @classmethod
+    def validate_example_index(cls, v):
+        if v is not None and not isinstance(v, int):
+            raise ValueError(f"Example index must be an integer or None but got {v} of type {type(v)}")
+        return v
+
+    @field_validator('timestamp', mode='before')
+    @classmethod
+    def validate_timestamp(cls, v):
+        if v is not None and not isinstance(v, str):
+            raise ValueError(f"Timestamp must be a string or None but got {v} of type {type(v)}")
+        return v
+
+    @field_validator('trace_id', mode='before')
+    @classmethod
+    def validate_trace_id(cls, v):
+        if v is not None and not isinstance(v, str):
+            raise ValueError(f"Trace ID must be a string or None but got {v} of type {type(v)}")
+        return v
 
     def to_dict(self):
         return {