judgeval 0.0.23__py3-none-any.whl → 0.0.25__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
judgeval/common/tracer.py CHANGED
@@ -10,16 +10,12 @@ import os
  import time
  import uuid
  import warnings
- from contextvars import ContextVar
  from contextlib import contextmanager
- from collections import defaultdict
  from dataclasses import dataclass, field
  from datetime import datetime
  from http import HTTPStatus
  from typing import Any, Dict, Generator, List, Literal, Optional, Tuple, TypeAlias, Union
  from rich import print as rprint
- from uuid import UUID
- from collections.abc import Sequence

  # Third-party imports
  import pika
@@ -48,19 +44,6 @@ from judgeval.rules import Rule
  from judgeval.evaluation_run import EvaluationRun
  from judgeval.data.result import ScoringResult

- from langchain_core.language_models import BaseChatModel
- from langchain_huggingface import ChatHuggingFace
- from langchain_openai import ChatOpenAI
- from langchain_anthropic import ChatAnthropic
- from langchain_core.utils.function_calling import convert_to_openai_tool
- from langchain_core.callbacks import CallbackManager, BaseCallbackHandler
- from langchain_core.agents import AgentAction, AgentFinish
- from langchain_core.outputs import LLMResult
- from langchain_core.tracers.context import register_configure_hook
- from langchain_core.messages.ai import AIMessage
- from langchain_core.messages.tool import ToolMessage
- from langchain_core.messages.base import BaseMessage
- from langchain_core.documents import Document

  # Define type aliases for better code readability and maintainability
  ApiClient: TypeAlias = Union[OpenAI, Together, Anthropic] # Supported API clients
@@ -125,8 +108,7 @@ class TraceEntry:
  if self._is_json_serializable(value):
  serialized_inputs[key] = value
  else:
- warnings.warn(f"Input '{key}' for function {self.function} is not JSON serializable. Setting to None.")
- serialized_inputs[key] = None
+ serialized_inputs[key] = self.safe_stringify(value, self.function)
  return serialized_inputs

  def _is_json_serializable(self, obj: Any) -> bool:
@@ -137,6 +119,25 @@ class TraceEntry:
  except (TypeError, OverflowError, ValueError):
  return False

+ def safe_stringify(self, output, function_name):
+ """
+ Safely converts an object to a string or repr, handling serialization issues gracefully.
+ """
+ try:
+ return str(output)
+ except (TypeError, OverflowError, ValueError):
+ pass
+
+ try:
+ return repr(output)
+ except (TypeError, OverflowError, ValueError):
+ pass
+
+ warnings.warn(
+ f"Output for function {function_name} is not JSON serializable and could not be converted to string. Setting to None."
+ )
+ return None
+
  def to_dict(self) -> dict:
  """Convert the trace entry to a dictionary format for storage/transmission."""
  return {
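The promoted `safe_stringify` method falls back from `str()` to `repr()` before giving up with a warning. A minimal sketch of the resulting behavior; the `Opaque` class and the `entry` instance are hypothetical, purely to illustrate the fallback chain:

```python
# Hypothetical illustration of TraceEntry.safe_stringify's fallback chain.
class Opaque:
    def __str__(self):
        raise TypeError("cannot stringify")  # forces the str() branch to fail
    def __repr__(self):
        return "<Opaque>"

# Assuming `entry` is a TraceEntry:
# entry.safe_stringify(Opaque(), "my_tool")   -> "<Opaque>"  (str() failed, repr() succeeded)
# entry.safe_stringify({1, 2, 3}, "my_tool")  -> "{1, 2, 3}" (str() works even though sets are not JSON serializable)
```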
@@ -160,25 +161,6 @@ class TraceEntry:
  - We try to serialize into JSON, then string, then the base representation (__repr__)
  - Non-serializable objects return None with a warning
  """
-
- def safe_stringify(output, function_name):
- """
- Safely converts an object to a string or repr, handling serialization issues gracefully.
- """
- try:
- return str(output)
- except (TypeError, OverflowError, ValueError):
- pass
-
- try:
- return repr(output)
- except (TypeError, OverflowError, ValueError):
- pass
-
- warnings.warn(
- f"Output for function {function_name} is not JSON serializable and could not be converted to string. Setting to None."
- )
- return None

  if isinstance(self.output, BaseModel):
  return self.output.model_dump()
@@ -188,7 +170,7 @@ class TraceEntry:
  json.dumps(self.output)
  return self.output
  except (TypeError, OverflowError, ValueError):
- return safe_stringify(self.output, self.function)
+ return self.safe_stringify(self.output, self.function)


  class TraceManagerClient:
@@ -331,6 +313,8 @@ class TraceClient:
  project_name: str = "default_project",
  overwrite: bool = False,
  rules: Optional[List[Rule]] = None,
+ enable_monitoring: bool = True,
+ enable_evaluations: bool = True
  ):
  self.name = name
  self.trace_id = trace_id or str(uuid.uuid4())
@@ -339,6 +323,8 @@ class TraceClient:
  self.tracer = tracer
  # Initialize rules with either provided rules or an empty list
  self.rules = rules or []
+ self.enable_monitoring = enable_monitoring
+ self.enable_evaluations = enable_evaluations

  self.client: JudgmentClient = tracer.client
  self.entries: List[TraceEntry] = []
@@ -399,6 +385,9 @@ class TraceClient:
  model: Optional[str] = None,
  log_results: Optional[bool] = True
  ):
+ if not self.enable_evaluations:
+ return
+
  start_time = time.time() # Record start time
  example = Example(
  input=input,
@@ -698,7 +687,10 @@ class Tracer:
  api_key: str = os.getenv("JUDGMENT_API_KEY"),
  project_name: str = "default_project",
  rules: Optional[List[Rule]] = None, # Added rules parameter
- organization_id: str = os.getenv("JUDGMENT_ORG_ID")):
+ organization_id: str = os.getenv("JUDGMENT_ORG_ID"),
+ enable_monitoring: bool = os.getenv("JUDGMENT_MONITORING", "true").lower() == "true",
+ enable_evaluations: bool = os.getenv("JUDGMENT_EVALUATIONS", "true").lower() == "true"
+ ):
  if not hasattr(self, 'initialized'):
  if not api_key:
  raise ValueError("Tracer must be configured with a Judgment API key")
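Note that both toggles are read from the environment as default argument values, so the `os.getenv` calls run when `judgeval.common.tracer` is first imported; to drive them via `JUDGMENT_MONITORING` / `JUDGMENT_EVALUATIONS`, the variables must be set before that import. A hedged sketch (the project name is illustrative):

```python
import os

# Must be set before judgeval.common.tracer is imported, because the
# os.getenv(...) defaults above are evaluated at module import time.
os.environ["JUDGMENT_MONITORING"] = "false"   # any value other than "true" disables
os.environ["JUDGMENT_EVALUATIONS"] = "false"

from judgeval.common.tracer import Tracer

# The flags can also be passed explicitly, which sidesteps the
# import-order concern entirely:
judgment = Tracer(project_name="my_project",
                  enable_monitoring=False,
                  enable_evaluations=False)
```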
@@ -714,6 +706,8 @@ class Tracer:
  self._current_trace: Optional[str] = None
  self.rules: List[Rule] = rules or [] # Store rules at tracer level
  self.initialized: bool = True
+ self.enable_monitoring: bool = enable_monitoring
+ self.enable_evaluations: bool = enable_evaluations
  elif hasattr(self, 'project_name') and self.project_name != project_name:
  warnings.warn(
  f"Attempting to initialize Tracer with project_name='{project_name}' but it was already initialized with "
@@ -740,7 +734,9 @@ class Tracer:
  name,
  project_name=project,
  overwrite=overwrite,
- rules=self.rules # Pass combined rules to the trace client
+ rules=self.rules, # Pass combined rules to the trace client
+ enable_monitoring=self.enable_monitoring,
+ enable_evaluations=self.enable_evaluations
  )
  prev_trace = self._current_trace
  self._current_trace = trace
@@ -771,6 +767,9 @@ class Tracer:
  project_name: Optional project name override
  overwrite: Whether to overwrite existing traces
  """
+ if not self.enable_monitoring:
+ return
+
  if func is None:
  return lambda f: self.observe(f, name=name, span_type=span_type, project_name=project_name, overwrite=overwrite)

@@ -787,7 +786,7 @@ class Tracer:
  trace_id = str(uuid.uuid4())
  trace_name = func.__name__
  project = project_name if project_name is not None else self.project_name
- trace = TraceClient(self, trace_id, trace_name, project_name=project, overwrite=overwrite, rules=self.rules)
+ trace = TraceClient(self, trace_id, trace_name, project_name=project, overwrite=overwrite, rules=self.rules, enable_monitoring=self.enable_monitoring, enable_evaluations=self.enable_evaluations)
  self._current_trace = trace
  # Only save empty trace for the root call
  trace.save(empty_save=True, overwrite=overwrite)
@@ -824,7 +823,7 @@ class Tracer:
  trace_id = str(uuid.uuid4())
  trace_name = func.__name__
  project = project_name if project_name is not None else self.project_name
- trace = TraceClient(self, trace_id, trace_name, project_name=project, overwrite=overwrite, rules=self.rules)
+ trace = TraceClient(self, trace_id, trace_name, project_name=project, overwrite=overwrite, rules=self.rules, enable_monitoring=self.enable_monitoring)
  self._current_trace = trace
  # Only save empty trace for the root call
  trace.save(empty_save=True, overwrite=overwrite)
@@ -872,6 +871,11 @@ class Tracer:
  self._current_trace.async_evaluate(scorers=[scorers], input=args, actual_output=kwargs, model="gpt-4o-mini", log_results=True)
  return wrapper

+ def async_evaluate(self, *args, **kwargs):
+ if self._current_trace:
+ self._current_trace.async_evaluate(*args, **kwargs)
+ else:
+ warnings.warn("No trace found, skipping evaluation")


  def wrap(client: Any) -> Any:
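The new `Tracer.async_evaluate` forwards to whatever trace is currently open (and warns if there is none), so instrumented code can request an online evaluation without holding a `TraceClient` reference. A minimal sketch of the calling pattern, with the scorer choice and strings purely illustrative:

```python
from judgeval.common.tracer import Tracer
from judgeval.scorers import AnswerRelevancyScorer  # scorer choice is illustrative

judgment = Tracer(project_name="my_project")

@judgment.observe(span_type="function")
def answer(question: str) -> str:
    response = "We offer a 30-day full refund."  # stand-in for a real LLM call
    # Delegates to the active trace; emits a warning and skips if no trace is open.
    judgment.async_evaluate(
        scorers=[AnswerRelevancyScorer(threshold=0.5)],
        input=question,
        actual_output=response,
        model="gpt-4o",
    )
    return response
```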
@@ -982,212 +986,4 @@ def _format_output_data(client: ApiClient, response: Any) -> dict:
  "output_tokens": response.usage.output_tokens,
  "total_tokens": response.usage.input_tokens + response.usage.output_tokens
  }
- }
-
- class JudgevalCallbackHandler(BaseCallbackHandler):
- def __init__(self, trace_client: TraceClient):
- self.trace_client = trace_client
- self.previous_node = "__start__"
- self.executed_node_tools = []
- self.executed_nodes = []
- self.executed_tools = []
- self.openai_count = 1
-
- def start_span(self, name: str, span_type: SpanType = "span"):
- start_time = time.time()
-
- # Record span entry
- self.trace_client.add_entry(TraceEntry(
- type="enter",
- function=name,
- depth=self.trace_client.tracer.depth,
- message=name,
- timestamp=start_time,
- span_type=span_type
- ))
-
- self.trace_client.tracer.depth += 1
- self.trace_client.prev_span = self.trace_client._current_span
- self.trace_client._current_span = name
- self._start_time = start_time
-
- def end_span(self, name: str, span_type: SpanType = "span"):
- self.trace_client.tracer.depth -= 1
- duration = time.time() - self._start_time
-
- # Record span exit
- self.trace_client.add_entry(TraceEntry(
- type="exit",
- function=name,
- depth=self.trace_client.tracer.depth,
- message=f"← {name}",
- timestamp=time.time(),
- duration=duration,
- span_type=span_type
- ))
- self.trace_client._current_span = self.trace_client.prev_span
-
- def on_retriever_start(
- self,
- serialized: Optional[dict[str, Any]],
- query: str,
- *,
- run_id: UUID,
- parent_run_id: Optional[UUID] = None,
- tags: Optional[list[str]] = None,
- metadata: Optional[dict[str, Any]] = None,
- **kwargs: Any,
- ) -> Any:
- name = "RETRIEVER_CALL"
- if serialized and "name" in serialized:
- name = f"RETRIEVER_{serialized['name'].upper()}"
-
- self.start_span(name, span_type="retriever")
- self.trace_client.record_input({
- 'query': query,
- 'tags': tags,
- 'metadata': metadata,
- 'kwargs': kwargs
- })
-
- def on_retriever_end(
- self,
- documents: Sequence[Document],
- *,
- run_id: UUID,
- parent_run_id: Optional[UUID] = None,
- **kwargs: Any
- ) -> Any:
- # Process the retrieved documents into a format suitable for logging
- doc_summary = []
- for i, doc in enumerate(documents):
- # Extract key information from each document
- doc_data = {
- "index": i,
- "page_content": doc.page_content[:100] + "..." if len(doc.page_content) > 100 else doc.page_content,
- "metadata": doc.metadata
- }
- doc_summary.append(doc_data)
-
- # Record the document data
- self.trace_client.record_output({
- "document_count": len(documents),
- "documents": doc_summary
- })
-
- # End the retriever span
- self.end_span(self.trace_client._current_span, span_type="retriever")
-
- def on_chain_start(
- self,
- serialized: Dict[str, Any],
- inputs: Dict[str, Any],
- *,
- run_id: UUID,
- parent_run_id: Optional[UUID] = None,
- tags: Optional[List[str]] = None,
- metadata: Optional[Dict[str, Any]] = None,
- **kwargs: Any
- ) -> None:
- node = metadata.get("langgraph_node")
- if node != None and node != "__start__" and node != self.previous_node:
- self.executed_node_tools.append(node)
- self.executed_nodes.append(node)
- self.previous_node = node
-
- def on_tool_start(
- self,
- serialized: Optional[dict[str, Any]],
- input_str: str,
- run_id: Optional[UUID] = None,
- parent_run_id: Optional[UUID] = None,
- inputs: Optional[dict[str, Any]] = None,
- **kwargs: Any,
- ):
- name = serialized["name"]
- self.start_span(name, span_type="tool")
- self.executed_node_tools.append(f"{self.previous_node}:{name}")
- self.executed_tools.append(name)
- self.trace_client.record_input({
- 'args': input_str,
- 'kwargs': kwargs
- })
-
- def on_tool_end(self, output: Any, *, run_id: UUID, parent_run_id: Optional[UUID] = None, **kwargs: Any) -> Any:
- self.trace_client.record_output(output)
- self.end_span(self.trace_client._current_span, span_type="tool")
-
- def on_agent_action (self, action: AgentAction, **kwargs: Any) -> Any:
- print(f"Agent action: {action}")
-
- def on_agent_finish(
- self,
- finish: AgentFinish,
- *,
- run_id: UUID,
- parent_run_id: Optional[UUID] = None,
- tags: Optional[list[str]] = None,
- **kwargs: Any,
- ) -> None:
- print(f"Agent action: {finish}")
-
- def on_llm_start(
- self,
- serialized: Optional[dict[str, Any]],
- prompts: list[str],
- *,
- run_id: UUID,
- parent_run_id: Optional[UUID] = None,
- **kwargs: Any,
- ) -> Any:
- name = "LLM call"
- self.start_span(name, span_type="llm")
- self.trace_client.record_input({
- 'args': prompts,
- 'kwargs': kwargs
- })
-
- def on_llm_end(self, response: LLMResult, *, run_id: UUID, parent_run_id: Optional[UUID] = None, **kwargs: Any):
- self.trace_client.record_output(response.generations[0][0].text)
- self.end_span(self.trace_client._current_span, span_type="llm")
-
- def on_chat_model_start(
- self,
- serialized: Optional[dict[str, Any]],
- messages: list[list[BaseMessage]],
- *,
- run_id: UUID,
- parent_run_id: Optional[UUID] = None,
- **kwargs: Any,
- ) -> Any:
-
- if "openai" in serialized["id"]:
- name = f"OPENAI_API_CALL_{self.openai_count}"
- self.openai_count += 1
- elif "anthropic" in serialized["id"]:
- name = "ANTHROPIC_API_CALL"
- elif "together" in serialized["id"]:
- name = "TOGETHER_API_CALL"
- else:
- name = "LLM call"
-
- self.start_span(name, span_type="llm")
- self.trace_client.record_input({
- 'args': str(messages),
- 'kwargs': kwargs
- })
-
- judgeval_callback_handler_var: ContextVar[Optional[JudgevalCallbackHandler]] = ContextVar(
- "judgeval_callback_handler", default=None
- )
-
- def set_global_handler(handler: JudgevalCallbackHandler):
- judgeval_callback_handler_var.set(handler)
-
- def clear_global_handler():
- judgeval_callback_handler_var.set(None)
-
- register_configure_hook(
- context_var=judgeval_callback_handler_var,
- inheritable=True,
- )
+ }
judgeval/data/__init__.py CHANGED
@@ -2,7 +2,7 @@ from judgeval.data.example import Example, ExampleParams
  from judgeval.data.api_example import ProcessExample, create_process_example
  from judgeval.data.scorer_data import ScorerData, create_scorer_data
  from judgeval.data.result import ScoringResult, generate_scoring_result
- from judgeval.data.custom_example import CustomExample
+
  __all__ = [
  "Example",
  "ExampleParams",
@@ -12,5 +12,4 @@ __all__ = [
  "create_scorer_data",
  "ScoringResult",
  "generate_scoring_result",
- "CustomExample",
  ]
judgeval/integrations/langgraph.py ADDED
@@ -0,0 +1,316 @@
+ from typing import Any, Dict, List, Optional, Sequence
+ from uuid import UUID
+ import time
+ import uuid
+ from contextvars import ContextVar
+ from judgeval.common.tracer import TraceClient, TraceEntry, Tracer, SpanType
+
+ from langchain_core.language_models import BaseChatModel
+ from langchain_huggingface import ChatHuggingFace
+ from langchain_openai import ChatOpenAI
+ from langchain_anthropic import ChatAnthropic
+ from langchain_core.utils.function_calling import convert_to_openai_tool
+ from langchain_core.callbacks import CallbackManager, BaseCallbackHandler
+ from langchain_core.agents import AgentAction, AgentFinish
+ from langchain_core.outputs import LLMResult
+ from langchain_core.tracers.context import register_configure_hook
+ from langchain_core.messages.ai import AIMessage
+ from langchain_core.messages.tool import ToolMessage
+ from langchain_core.messages.base import BaseMessage
+ from langchain_core.documents import Document
+
+ class JudgevalCallbackHandler(BaseCallbackHandler):
+ def __init__(self, tracer: Tracer):
+ self.tracer = tracer
+ self.trace_client = tracer.get_current_trace() if tracer.get_current_trace() else None
+ self.previous_spans = [] # stack of previous spans
+ self.finished = False
+
+ # Attributes for users to access
+ self.previous_node = None
+ self.executed_node_tools = []
+ self.executed_nodes = []
+ self.executed_tools = []
+
+ def start_span(self, name: str, span_type: SpanType = "span"):
+ start_time = time.time()
+
+ # Record span entry
+ self.trace_client.add_entry(TraceEntry(
+ type="enter",
+ function=name,
+ depth=self.trace_client.tracer.depth,
+ message=name,
+ timestamp=start_time,
+ span_type=span_type
+ ))
+
+ self.trace_client.tracer.depth += 1
+ self.previous_spans.append(self.trace_client._current_span)
+ self.trace_client._current_span = name
+ self._start_time = start_time
+
+ def end_span(self, name: str, span_type: SpanType = "span"):
+ self.trace_client.tracer.depth -= 1
+ duration = time.time() - self._start_time
+
+ # Record span exit
+ self.trace_client.add_entry(TraceEntry(
+ type="exit",
+ function=name,
+ depth=self.trace_client.tracer.depth,
+ message=f"{name}",
+ timestamp=time.time(),
+ duration=duration,
+ span_type=span_type
+ ))
+ self.trace_client._current_span = self.previous_spans.pop()
+
+ if self.trace_client.tracer.depth == 0:
+ # Save the trace if we are the root; this happens when users don't use any @observe decorators
+ self.trace_client.save(empty_save=False, overwrite=True)
+ self.trace_client._current_trace = None
+
+ def on_retriever_start(
+ self,
+ serialized: Optional[dict[str, Any]],
+ query: str,
+ *,
+ run_id: UUID,
+ parent_run_id: Optional[UUID] = None,
+ tags: Optional[list[str]] = None,
+ metadata: Optional[dict[str, Any]] = None,
+ **kwargs: Any,
+ ) -> Any:
+ name = "RETRIEVER_CALL"
+ if serialized and "name" in serialized:
+ name = f"RETRIEVER_{serialized['name'].upper()}"
+
+ self.start_span(name, span_type="retriever")
+ self.trace_client.record_input({
+ 'query': query,
+ 'tags': tags,
+ 'metadata': metadata,
+ 'kwargs': kwargs
+ })
+
+ def on_retriever_end(
+ self,
+ documents: Sequence[Document],
+ *,
+ run_id: UUID,
+ parent_run_id: Optional[UUID] = None,
+ **kwargs: Any
+ ) -> Any:
+ # Process the retrieved documents into a format suitable for logging
+ doc_summary = []
+ for i, doc in enumerate(documents):
+ # Extract key information from each document
+ doc_data = {
+ "index": i,
+ "page_content": doc.page_content[:100] + "..." if len(doc.page_content) > 100 else doc.page_content,
+ "metadata": doc.metadata
+ }
+ doc_summary.append(doc_data)
+
+ # Record the document data
+ self.trace_client.record_output({
+ "document_count": len(documents),
+ "documents": doc_summary
+ })
+
+ # End the retriever span
+ self.end_span(self.trace_client._current_span, span_type="retriever")
+
+ def on_chain_start(
+ self,
+ serialized: Dict[str, Any],
+ inputs: Dict[str, Any],
+ *,
+ run_id: UUID,
+ parent_run_id: Optional[UUID] = None,
+ tags: Optional[List[str]] = None,
+ metadata: Optional[Dict[str, Any]] = None,
+ **kwargs: Any
+ ) -> None:
+ # If the user doesn't use any @observe decorators, the first action in LangGraph workflows seems to have this attribute, so we initialize our trace client here
+ if kwargs.get('name') == 'LangGraph':
+ if not self.trace_client:
+ trace_id = str(uuid.uuid4())
+ project = self.tracer.project_name
+ trace = TraceClient(self.tracer, trace_id, trace_id, project_name=project, overwrite=False, rules=self.tracer.rules, enable_monitoring=self.tracer.enable_monitoring, enable_evaluations=self.tracer.enable_evaluations)
+ self.trace_client = trace
+ self.tracer._current_trace = trace # set the trace in the original tracer object
+ # Only save empty trace for the root call
+ self.trace_client.save(empty_save=True, overwrite=False)
+
+ self.start_span("LangGraph", span_type="Main Function")
+
+ node = metadata.get("langgraph_node")
+ if node != None and node != self.previous_node:
+ self.start_span(node, span_type="node")
+ self.executed_node_tools.append(node)
+ self.executed_nodes.append(node)
+ self.trace_client.record_input({
+ 'args': inputs,
+ 'kwargs': kwargs
+ })
+ self.previous_node = node
+
+ def on_chain_end(
+ self,
+ outputs: Dict[str, Any],
+ *,
+ run_id: UUID,
+ parent_run_id: Optional[UUID] = None,
+ tags: Optional[List[str]] = None,
+ **kwargs: Any,
+ ) -> Any:
+ if outputs == "__end__":
+ self.finished = True
+ if tags is not None and any("graph:step" in tag for tag in tags):
+ self.trace_client.record_output(outputs)
+ self.end_span(self.trace_client._current_span, span_type="node")
+
+ if self.finished:
+ self.end_span(self.trace_client._current_span, span_type="Main Function")
+
+ def on_chain_error(
+ self,
+ error: BaseException,
+ *,
+ run_id: UUID,
+ parent_run_id: Optional[UUID] = None,
+ **kwargs: Any,
+ ) -> Any:
+ print(f"Chain error: {error}")
+ self.trace_client.record_output(error)
+ self.end_span(self.trace_client._current_span, span_type="node")
+
+ def on_tool_start(
+ self,
+ serialized: Optional[dict[str, Any]],
+ input_str: str,
+ run_id: Optional[UUID] = None,
+ parent_run_id: Optional[UUID] = None,
+ inputs: Optional[dict[str, Any]] = None,
+ **kwargs: Any,
+ ):
+ name = serialized["name"]
+ self.start_span(name, span_type="tool")
+ self.executed_node_tools.append(f"{self.previous_node}:{name}")
+ self.executed_tools.append(name)
+ self.trace_client.record_input({
+ 'args': input_str,
+ 'kwargs': kwargs
+ })
+
+ def on_tool_end(self, output: Any, *, run_id: UUID, parent_run_id: Optional[UUID] = None, **kwargs: Any) -> Any:
+ self.trace_client.record_output(output)
+ self.end_span(self.trace_client._current_span, span_type="tool")
+
+ def on_tool_error(
+ self,
+ error: BaseException,
+ *,
+ run_id: UUID,
+ parent_run_id: Optional[UUID] = None,
+ **kwargs: Any,
+ ) -> Any:
+ print(f"Tool error: {error}")
+ self.trace_client.record_output(error)
+ self.end_span(self.trace_client._current_span, span_type="tool")
+
+ def on_agent_action(
+ self,
+ action: AgentAction,
+ *,
+ run_id: UUID,
+ parent_run_id: Optional[UUID] = None,
+ **kwargs: Any,
+ ) -> Any:
+ print(f"Agent action: {action}")
+
+ def on_agent_finish(
+ self,
+ finish: AgentFinish,
+ *,
+ run_id: UUID,
+ parent_run_id: Optional[UUID] = None,
+ **kwargs: Any,
+ ) -> Any:
+ print(f"Agent finish: {finish}")
+
+ def on_llm_start(
+ self,
+ serialized: Optional[dict[str, Any]],
+ prompts: list[str],
+ *,
+ run_id: UUID,
+ parent_run_id: Optional[UUID] = None,
+ **kwargs: Any,
+ ) -> Any:
+ name = "LLM call"
+ self.start_span(name, span_type="llm")
+ self.trace_client.record_input({
+ 'args': prompts,
+ 'kwargs': kwargs
+ })
+
+ def on_llm_end(self, response: LLMResult, *, run_id: UUID, parent_run_id: Optional[UUID] = None, **kwargs: Any):
+ self.trace_client.record_output(response.generations[0][0].text)
+ self.end_span(self.trace_client._current_span, span_type="llm")
+
+ def on_llm_error(
+ self,
+ error: BaseException,
+ *,
+ run_id: UUID,
+ parent_run_id: Optional[UUID] = None,
+ **kwargs: Any,
+ ) -> Any:
+ print(f"LLM error: {error}")
+ self.trace_client.record_output(error)
+ self.end_span(self.trace_client._current_span, span_type="llm")
+
+ def on_chat_model_start(
+ self,
+ serialized: Optional[dict[str, Any]],
+ messages: list[list[BaseMessage]],
+ *,
+ run_id: UUID,
+ parent_run_id: Optional[UUID] = None,
+ **kwargs: Any,
+ ) -> Any:
+
+ if "openai" in serialized["id"]:
+ name = f"OPENAI_API_CALL"
+ elif "anthropic" in serialized["id"]:
+ name = "ANTHROPIC_API_CALL"
+ elif "together" in serialized["id"]:
+ name = "TOGETHER_API_CALL"
+ else:
+ name = "LLM call"
+
+ self.start_span(name, span_type="llm")
+ self.trace_client.record_input({
+ 'args': str(messages),
+ 'kwargs': kwargs
+ })
+
+ judgeval_callback_handler_var: ContextVar[Optional[JudgevalCallbackHandler]] = ContextVar(
+ "judgeval_callback_handler", default=None
+ )
+
+ def set_global_handler(handler: JudgevalCallbackHandler):
+ if not handler.tracer.enable_monitoring:
+ return
+ judgeval_callback_handler_var.set(handler)
+
+ def clear_global_handler():
+ judgeval_callback_handler_var.set(None)
+
+ register_configure_hook(
+ context_var=judgeval_callback_handler_var,
+ inheritable=True,
+ )
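Because `on_chain_start` bootstraps its own `TraceClient` when none is active, the relocated handler can be used without any `@observe` decorators. A hedged usage sketch; graph construction is omitted and the project name is illustrative:

```python
from judgeval.common.tracer import Tracer
from judgeval.integrations.langgraph import JudgevalCallbackHandler, set_global_handler

judgment = Tracer(project_name="my_project")
handler = JudgevalCallbackHandler(judgment)

# Register for all LangChain runs in this context; per the code above,
# this is a no-op when monitoring is disabled on the tracer.
set_global_handler(handler)

# Alternatively, pass the handler per invocation through LangChain's
# standard config mechanism, e.g.:
#   graph.invoke({"messages": [...]}, config={"callbacks": [handler]})

# After a run, the handler exposes what executed:
#   handler.executed_nodes, handler.executed_tools, handler.executed_node_tools
```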
judgeval-0.0.25.dist-info/METADATA ADDED
@@ -0,0 +1,156 @@
+ Metadata-Version: 2.4
+ Name: judgeval
+ Version: 0.0.25
+ Summary: Judgeval Package
+ Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
+ Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues
+ Author-email: Andrew Li <andrew@judgmentlabs.ai>, Alex Shan <alex@judgmentlabs.ai>, Joseph Camyre <joseph@judgmentlabs.ai>
+ License-Expression: Apache-2.0
+ License-File: LICENSE.md
+ Classifier: Operating System :: OS Independent
+ Classifier: Programming Language :: Python :: 3
+ Requires-Python: >=3.11
+ Requires-Dist: anthropic
+ Requires-Dist: fastapi
+ Requires-Dist: langchain
+ Requires-Dist: langchain-anthropic
+ Requires-Dist: langchain-core
+ Requires-Dist: langchain-huggingface
+ Requires-Dist: langchain-openai
+ Requires-Dist: litellm
+ Requires-Dist: nest-asyncio
+ Requires-Dist: openai
+ Requires-Dist: openpyxl
+ Requires-Dist: pandas
+ Requires-Dist: pika
+ Requires-Dist: python-dotenv==1.0.1
+ Requires-Dist: requests
+ Requires-Dist: supabase
+ Requires-Dist: together
+ Requires-Dist: uvicorn
+ Provides-Extra: dev
+ Requires-Dist: pytest-asyncio>=0.25.0; extra == 'dev'
+ Requires-Dist: pytest-mock>=3.14.0; extra == 'dev'
+ Requires-Dist: pytest>=8.3.4; extra == 'dev'
+ Requires-Dist: tavily-python; extra == 'dev'
+ Description-Content-Type: text/markdown
+
+ # Judgeval SDK
+
+ Judgeval is an open-source framework for building evaluation pipelines for multi-step agent workflows, supporting both real-time and experimental evaluation setups. To learn more about Judgment or sign up for free, visit our [website](https://www.judgmentlabs.ai/) or check out our [developer docs](https://judgment.mintlify.app/getting_started).
+
+ ## Features
+
+ - **Development and Production Evaluation Layer**: Offers a robust evaluation layer for multi-step agent applications, including unit-testing and performance monitoring.
+ - **Plug-and-Evaluate**: Integrate LLM systems with 10+ research-backed metrics, including:
+ - Hallucination detection
+ - RAG retriever quality
+ - And more
+ - **Custom Evaluation Pipelines**: Construct powerful custom evaluation pipelines tailored for your LLM systems.
+ - **Monitoring in Production**: Utilize state-of-the-art real-time evaluation foundation models to monitor LLM systems effectively.
+
+ ## Installation
+
+ ```bash
+ pip install judgeval
+ ```
+
+ ## Quickstart: Evaluations
+
+ You can evaluate your workflow execution data to measure quality metrics such as hallucination.
+
+ Create a file named `evaluate.py` with the following code:
+
+ ```python
+ from judgeval import JudgmentClient
+ from judgeval.data import Example
+ from judgeval.scorers import FaithfulnessScorer
+
+ client = JudgmentClient()
+
+ example = Example(
+ input="What if these shoes don't fit?",
+ actual_output="We offer a 30-day full refund at no extra cost.",
+ retrieval_context=["All customers are eligible for a 30 day full refund at no extra cost."],
+ )
+
+ scorer = FaithfulnessScorer(threshold=0.5)
+ results = client.run_evaluation(
+ examples=[example],
+ scorers=[scorer],
+ model="gpt-4o",
+ )
+ print(results)
+ ```
+ Click [here](https://judgment.mintlify.app/getting_started#create-your-first-experiment) for a more detailed explanation
+
+ ## Quickstart: Traces
+
+ Track your workflow execution for full observability with just a few lines of code.
+
+ Create a file named `traces.py` with the following code:
+
+ ```python
+ from judgeval.common.tracer import Tracer, wrap
+ from openai import OpenAI
+
+ client = wrap(OpenAI())
+ judgment = Tracer(project_name="my_project")
+
+ @judgment.observe(span_type="tool")
+ def my_tool():
+ return "Hello world!"
+
+ @judgment.observe(span_type="function")
+ def main():
+ task_input = my_tool()
+ res = client.chat.completions.create(
+ model="gpt-4o",
+ messages=[{"role": "user", "content": f"{task_input}"}]
+ )
+ return res.choices[0].message.content
+ ```
+ Click [here](https://judgment.mintlify.app/getting_started#create-your-first-trace) for a more detailed explanation
+
+ ## Quickstart: Online Evaluations
+
+ Apply performance monitoring to measure the quality of your systems in production, not just on historical data.
+
+ Using the same traces.py file we created earlier:
+
+ ```python
+ from judgeval.common.tracer import Tracer, wrap
+ from judgeval.scorers import AnswerRelevancyScorer
+ from openai import OpenAI
+
+ client = wrap(OpenAI())
+ judgment = Tracer(project_name="my_project")
+
+ @judgment.observe(span_type="tool")
+ def my_tool():
+ return "Hello world!"
+
+ @judgment.observe(span_type="function")
+ def main():
+ task_input = my_tool()
+ res = client.chat.completions.create(
+ model="gpt-4o",
+ messages=[{"role": "user", "content": f"{task_input}"}]
+ ).choices[0].message.content
+
+ judgment.get_current_trace().async_evaluate(
+ scorers=[AnswerRelevancyScorer(threshold=0.5)],
+ input=task_input,
+ actual_output=res,
+ model="gpt-4o"
+ )
+
+ return res
+ ```
+ Click [here](https://judgment.mintlify.app/getting_started#create-your-first-online-evaluation) for a more detailed explanation
+
+ ## Documentation and Demos
+
+ For more detailed documentation, please check out our [docs](https://judgment.mintlify.app/getting_started) and some of our [demo videos](https://www.youtube.com/@AlexShan-j3o) for reference!
+
+ ##
judgeval-0.0.23.dist-info/RECORD → judgeval-0.0.25.dist-info/RECORD RENAMED
@@ -8,19 +8,17 @@ judgeval/run_evaluation.py,sha256=YOzkyeWl-r3vaz0jB5nM-1VULi7ALmJ9_f58ENqexXk,23
  judgeval/common/__init__.py,sha256=7d24BRxtncpMj3AAJCj8RS7TqgjXmW777HVZH6-3sBs,289
  judgeval/common/exceptions.py,sha256=U-TxHLn7oVMezsMuoYouNDb2XuS8RCggfntYf5_6u4E,565
  judgeval/common/logger.py,sha256=KO75wWXCxhUHUMvLaTU31ZzOk6tkZBa7heQ7y0f-zFE,6062
- judgeval/common/tracer.py,sha256=WFjFNf3NZ2BN8UAu2MG0F3Om9LgJNma3m_GrxyXgJqE,46655
+ judgeval/common/tracer.py,sha256=cc_K1poBg3Vzl2Nf7yhHlklrOe6Fb_TEekvjAVAQFSc,39958
  judgeval/common/utils.py,sha256=LUQV5JfDr6wj7xHAJoNq-gofNZ6mjXbeKrGKzBME1KM,33533
- judgeval/data/__init__.py,sha256=6ADbugtS3AporRv23Hxm67qcghU4tj0OScS8t3xLd6U,549
+ judgeval/data/__init__.py,sha256=YferxwmUqoBi18hrdgro0BD0h4pt20LAqISeUzGMcVU,474
  judgeval/data/api_example.py,sha256=dzkrQ0xno08y6qNfqL2djXbapUyc2B2aQ5iANn0o4CY,3667
- judgeval/data/custom_example.py,sha256=C-j9iVenBy52dwnL6PIjJAdKsBO1ajKjsaRr4RJthUo,3676
  judgeval/data/example.py,sha256=BhGBhamFWgH6wtvrRYM8dGtDfXh-cDxDhtNL5Gbdz_M,5892
- judgeval/data/ground_truth.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  judgeval/data/result.py,sha256=4fgjKtUmT3br7K6fkRiNIxTGKUuwMeGyRLqzkpxwXKE,4436
  judgeval/data/scorer_data.py,sha256=JVlaTx1EP2jw2gh3Vgx1CSEsvIFABAN26IquKyxwiJQ,3273
  judgeval/data/datasets/__init__.py,sha256=IdNKhQv9yYZ_op0rdBacrFaFVmiiYQ3JTzXzxOTsEVQ,176
  judgeval/data/datasets/dataset.py,sha256=DjJNy-qvviXMGBl_JhiBzvgiJH1_3rYtAWeHP6Daw6E,11897
  judgeval/data/datasets/eval_dataset_client.py,sha256=B4bRy0Di2oFlaBbvp4_hRx2g_9e6Cs0y3ZUT9reMyhw,10926
- judgeval/data/datasets/utils.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ judgeval/integrations/langgraph.py,sha256=yBbZrePkY19dLLgleeIYFVzakEPaiko6YuccLbwSYcE,10957
  judgeval/judges/__init__.py,sha256=6X7VSwrwsdxGBNxCyapVRWGghhKOy3MVxFNMQ62kCXM,308
  judgeval/judges/base_judge.py,sha256=ch_S7uBB7lyv44Lf1d7mIGFpveOO58zOkkpImKgd9_4,994
  judgeval/judges/litellm_judge.py,sha256=EIL58Teptv8DzZUO3yP2RDQCDq-aoBB6HPZzPdK6KTg,2424
@@ -89,7 +87,7 @@ judgeval/scorers/judgeval_scorers/local_implementations/summarization/prompts.py
  judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarization_scorer.py,sha256=Qk7lwHgRPYeGoxTOyclAh1VfGItfvHJ6l1t7Nk3SWFM,20927
  judgeval/tracer/__init__.py,sha256=wy3DYpH8U_z0GO_K_gOSkK0tTTD-u5eLDo0T5xIBoAc,147
  judgeval/utils/alerts.py,sha256=RgW5R9Dn3Jtim0OyAYDbNzjoX2s6SA4Mw16GyyaikjI,1424
- judgeval-0.0.23.dist-info/METADATA,sha256=EkRIGemm8UvM5J4RBR5KVzBfn0XTBBYvJjRM4-F0s0w,1378
- judgeval-0.0.23.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
- judgeval-0.0.23.dist-info/licenses/LICENSE.md,sha256=tKmCg7k5QOmxPK19XMfzim04QiQJPmgIm0pAn55IJwk,11352
- judgeval-0.0.23.dist-info/RECORD,,
+ judgeval-0.0.25.dist-info/METADATA,sha256=09S16QU5qwYqwvrsdg36KVvv9-tnVcSKccgDldPqWpQ,5418
+ judgeval-0.0.25.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+ judgeval-0.0.25.dist-info/licenses/LICENSE.md,sha256=tKmCg7k5QOmxPK19XMfzim04QiQJPmgIm0pAn55IJwk,11352
+ judgeval-0.0.25.dist-info/RECORD,,
judgeval/data/custom_example.py DELETED
@@ -1,98 +0,0 @@
- from pydantic import BaseModel, Field, field_validator
- from typing import Optional, Dict, Any
- from uuid import uuid4
- from datetime import datetime
- import json
- import warnings
-
- # Brainstorming what are the requirements for the fields?
- class CustomExample(BaseModel):
- name: Optional[str] = None
- additional_metadata: Optional[Dict[str, Any]] = None
- example_id: str = Field(default_factory=lambda: str(uuid4()))
- example_index: Optional[int] = None
- timestamp: Optional[str] = None
- trace_id: Optional[str] = None
-
- model_config = {
- "extra": "allow", # Allow extra fields with any types
- }
-
- def __init__(self, **data):
- if 'example_id' not in data:
- data['example_id'] = str(uuid4())
- # Set timestamp if not provided
- if 'timestamp' not in data:
- data['timestamp'] = datetime.now().isoformat()
- super().__init__(**data)
-
- @field_validator('additional_metadata', mode='before')
- @classmethod
- def validate_additional_metadata(cls, v):
- if v is not None and not isinstance(v, dict):
- raise ValueError(f"Additional metadata must be a dictionary or None but got {v} of type {type(v)}")
- return v
-
- @field_validator('example_index', mode='before')
- @classmethod
- def validate_example_index(cls, v):
- if v is not None and not isinstance(v, int):
- raise ValueError(f"Example index must be an integer or None but got {v} of type {type(v)}")
- return v
-
- @field_validator('timestamp', mode='before')
- @classmethod
- def validate_timestamp(cls, v):
- if v is not None and not isinstance(v, str):
- raise ValueError(f"Timestamp must be a string or None but got {v} of type {type(v)}")
- return v
-
- @field_validator('trace_id', mode='before')
- @classmethod
- def validate_trace_id(cls, v):
- if v is not None and not isinstance(v, str):
- raise ValueError(f"Trace ID must be a string or None but got {v} of type {type(v)}")
- return v
-
- def to_dict(self):
- return self.model_dump()
-
- def __str__(self):
- return str(self.model_dump())
-
- def model_dump(self, **kwargs):
- """
- Custom serialization that handles special cases for fields that might fail standard serialization.
- """
- data = super().model_dump(**kwargs)
-
- # Get all fields including custom ones
- all_fields = self.__dict__
-
- for field_name, value in all_fields.items():
- try:
- # Check if the field has its own serialization method
- if hasattr(value, 'to_dict'):
- data[field_name] = value.to_dict()
- elif hasattr(value, 'model_dump'):
- data[field_name] = value.model_dump()
- # Field is already in data from super().model_dump()
- elif field_name in data:
- continue
- else:
- # Try standard JSON serialization
- json.dumps(value)
- data[field_name] = value
- except (TypeError, OverflowError, ValueError):
- # Handle non-serializable objects
- try:
- # Try converting to string
- data[field_name] = str(value)
- except Exception as _:
- # If all else fails, store as None and optionally warn
- warnings.warn(f"Could not serialize field {field_name}, setting to None")
- data[field_name] = None
-
- return data
-
-
File without changes
File without changes
judgeval-0.0.23.dist-info/METADATA DELETED
@@ -1,40 +0,0 @@
- Metadata-Version: 2.4
- Name: judgeval
- Version: 0.0.23
- Summary: Judgeval Package
- Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
- Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues
- Author-email: Andrew Li <andrew@judgmentlabs.ai>, Alex Shan <alex@judgmentlabs.ai>, Joseph Camyre <joseph@judgmentlabs.ai>
- License-Expression: Apache-2.0
- License-File: LICENSE.md
- Classifier: Operating System :: OS Independent
- Classifier: Programming Language :: Python :: 3
- Requires-Python: >=3.11
- Requires-Dist: anthropic
- Requires-Dist: fastapi
- Requires-Dist: langchain
- Requires-Dist: langchain-anthropic
- Requires-Dist: langchain-core
- Requires-Dist: langchain-huggingface
- Requires-Dist: langchain-openai
- Requires-Dist: litellm
- Requires-Dist: nest-asyncio
- Requires-Dist: openai
- Requires-Dist: openpyxl
- Requires-Dist: pandas
- Requires-Dist: pika
- Requires-Dist: python-dotenv==1.0.1
- Requires-Dist: requests
- Requires-Dist: supabase
- Requires-Dist: together
- Requires-Dist: uvicorn
- Provides-Extra: dev
- Requires-Dist: pytest-asyncio>=0.25.0; extra == 'dev'
- Requires-Dist: pytest-mock>=3.14.0; extra == 'dev'
- Requires-Dist: pytest>=8.3.4; extra == 'dev'
- Requires-Dist: tavily-python; extra == 'dev'
- Description-Content-Type: text/markdown
-
- # judgeval
-
- Judgeval is an open-source evaluation framework for multi-agent LLM workflows, for both real-time and offline evaluations.