judgeval 0.12.0__py3-none-any.whl → 0.13.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/__init__.py +2 -2
- judgeval/api/api_types.py +81 -12
- judgeval/cli.py +2 -1
- judgeval/constants.py +0 -6
- judgeval/data/evaluation_run.py +2 -5
- judgeval/data/judgment_types.py +97 -12
- judgeval/data/trace.py +108 -1
- judgeval/dataset/__init__.py +72 -23
- judgeval/env.py +5 -20
- judgeval/integrations/langgraph/__init__.py +9 -785
- judgeval/scorers/api_scorer.py +7 -12
- judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +0 -8
- judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +0 -8
- judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +0 -12
- judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +22 -33
- judgeval/scorers/score.py +1 -1
- judgeval/scorers/utils.py +1 -4
- judgeval/tracer/__init__.py +175 -156
- judgeval/tracer/exporters/__init__.py +4 -1
- judgeval/tracer/keys.py +15 -25
- judgeval/tracer/llm/__init__.py +0 -1
- judgeval/tracer/llm/anthropic/__init__.py +20 -0
- judgeval/tracer/llm/google/__init__.py +21 -0
- judgeval/tracer/llm/groq/__init__.py +20 -0
- judgeval/tracer/llm/openai/__init__.py +32 -0
- judgeval/tracer/llm/providers.py +28 -79
- judgeval/tracer/llm/together/__init__.py +20 -0
- judgeval/tracer/managers.py +23 -48
- judgeval/tracer/processors/__init__.py +36 -75
- judgeval/tracer/utils.py +1 -2
- judgeval/utils/file_utils.py +0 -2
- judgeval/utils/meta.py +18 -5
- judgeval/utils/testing.py +0 -14
- judgeval/utils/version_check.py +2 -0
- judgeval/version.py +1 -1
- {judgeval-0.12.0.dist-info → judgeval-0.13.0.dist-info}/METADATA +1 -7
- {judgeval-0.12.0.dist-info → judgeval-0.13.0.dist-info}/RECORD +40 -35
- {judgeval-0.12.0.dist-info → judgeval-0.13.0.dist-info}/WHEEL +0 -0
- {judgeval-0.12.0.dist-info → judgeval-0.13.0.dist-info}/entry_points.txt +0 -0
- {judgeval-0.12.0.dist-info → judgeval-0.13.0.dist-info}/licenses/LICENSE.md +0 -0
judgeval/tracer/llm/groq/__init__.py ADDED
@@ -0,0 +1,20 @@
+from __future__ import annotations
+
+HAS_GROQ = False
+groq_Groq = None
+groq_AsyncGroq = None
+
+try:
+    from groq import Groq, AsyncGroq  # type: ignore[import-untyped]
+
+    groq_Groq = Groq
+    groq_AsyncGroq = AsyncGroq
+    HAS_GROQ = True
+except ImportError:
+    pass
+
+__all__ = [
+    "HAS_GROQ",
+    "groq_Groq",
+    "groq_AsyncGroq",
+]
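Each of the new provider modules follows the same optional-import pattern: module-level `None` fallbacks, a guarded import, and a `HAS_*` flag. A minimal consumer sketch, assuming only the names exported above (the wrapper function itself is illustrative, not part of the SDK):

```python
# Sketch of consuming the optional-import pattern above; only HAS_GROQ and
# groq_Groq come from the diff, make_groq_client is a hypothetical helper.
from judgeval.tracer.llm.groq import HAS_GROQ, groq_Groq


def make_groq_client(api_key: str):
    if not HAS_GROQ or groq_Groq is None:
        # groq is an optional dependency; callers must handle its absence.
        raise RuntimeError("groq is not installed; `pip install groq` to enable it")
    return groq_Groq(api_key=api_key)
```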
judgeval/tracer/llm/openai/__init__.py ADDED
@@ -0,0 +1,32 @@
+from __future__ import annotations
+
+HAS_OPENAI = False
+openai_OpenAI = None
+openai_AsyncOpenAI = None
+openai_ChatCompletion = None
+openai_Response = None
+openai_ParsedChatCompletion = None
+
+try:
+    from openai import OpenAI, AsyncOpenAI
+    from openai.types.chat.chat_completion import ChatCompletion
+    from openai.types.responses.response import Response
+    from openai.types.chat import ParsedChatCompletion
+
+    openai_OpenAI = OpenAI
+    openai_AsyncOpenAI = AsyncOpenAI
+    openai_ChatCompletion = ChatCompletion
+    openai_Response = Response
+    openai_ParsedChatCompletion = ParsedChatCompletion
+    HAS_OPENAI = True
+except ImportError:
+    pass
+
+__all__ = [
+    "HAS_OPENAI",
+    "openai_OpenAI",
+    "openai_AsyncOpenAI",
+    "openai_ChatCompletion",
+    "openai_Response",
+    "openai_ParsedChatCompletion",
+]
judgeval/tracer/llm/providers.py CHANGED
@@ -1,85 +1,34 @@
 from __future__ import annotations
 from typing import Any, TypeAlias
 
-
-HAS_OPENAI
-openai_OpenAI
-openai_AsyncOpenAI
-openai_ChatCompletion
-openai_Response
-openai_ParsedChatCompletion
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-try:
-    from together import Together, AsyncTogether  # type: ignore[import-untyped]
-
-    together_Together = Together
-    together_AsyncTogether = AsyncTogether
-    HAS_TOGETHER = True
-except ImportError:
-    pass
-
-
-HAS_ANTHROPIC = False
-anthropic_Anthropic = None
-anthropic_AsyncAnthropic = None
-
-try:
-    from anthropic import Anthropic, AsyncAnthropic  # type: ignore[import-untyped]
-
-    anthropic_Anthropic = Anthropic
-    anthropic_AsyncAnthropic = AsyncAnthropic
-    HAS_ANTHROPIC = True
-except ImportError:
-    pass
-
-
-HAS_GOOGLE_GENAI = False
-google_genai_Client = None
-google_genai_cleint_AsyncClient = None
-
-try:
-    from google.genai import Client  # type: ignore[import-untyped]
-    from google.genai.client import AsyncClient  # type: ignore[import-untyped]
-
-    google_genai_Client = Client
-    google_genai_AsyncClient = AsyncClient
-    HAS_GOOGLE_GENAI = True
-except ImportError:
-    pass
-
-
-HAS_GROQ = False
-groq_Groq = None
-groq_AsyncGroq = None
-
-try:
-    from groq import Groq, AsyncGroq  # type: ignore[import-untyped]
-
-    groq_Groq = Groq
-    groq_AsyncGroq = AsyncGroq
-    HAS_GROQ = True
-except ImportError:
-    pass
+from judgeval.tracer.llm.openai import (
+    HAS_OPENAI,
+    openai_OpenAI,
+    openai_AsyncOpenAI,
+    openai_ChatCompletion,
+    openai_Response,
+    openai_ParsedChatCompletion,
+)
+from judgeval.tracer.llm.together import (
+    HAS_TOGETHER,
+    together_Together,
+    together_AsyncTogether,
+)
+from judgeval.tracer.llm.anthropic import (
+    HAS_ANTHROPIC,
+    anthropic_Anthropic,
+    anthropic_AsyncAnthropic,
+)
+from judgeval.tracer.llm.google import (
+    HAS_GOOGLE_GENAI,
+    google_genai_Client,
+    google_genai_AsyncClient,
+)
+from judgeval.tracer.llm.groq import (
+    HAS_GROQ,
+    groq_Groq,
+    groq_AsyncGroq,
+)
 
 
 # TODO: if we support dependency groups we can have this better type, but during runtime, we do
judgeval/tracer/llm/together/__init__.py ADDED
@@ -0,0 +1,20 @@
+from __future__ import annotations
+
+HAS_TOGETHER = False
+together_Together = None
+together_AsyncTogether = None
+
+try:
+    from together import Together, AsyncTogether  # type: ignore[import-untyped]
+
+    together_Together = Together
+    together_AsyncTogether = AsyncTogether
+    HAS_TOGETHER = True
+except ImportError:
+    pass
+
+__all__ = [
+    "HAS_TOGETHER",
+    "together_Together",
+    "together_AsyncTogether",
+]
judgeval/tracer/managers.py CHANGED
@@ -2,10 +2,9 @@ from __future__ import annotations
 
 from contextlib import asynccontextmanager, contextmanager
 from typing import TYPE_CHECKING, Dict, Optional, List, Any
-from judgeval.tracer.keys import
+from judgeval.tracer.keys import InternalAttributeKeys
 import uuid
 from judgeval.exceptions import JudgmentRuntimeError
-from judgeval.tracer.utils import set_span_attribute
 
 if TYPE_CHECKING:
     from judgeval.tracer import Tracer
@@ -21,29 +20,17 @@ def sync_span_context(
     if span_attributes is None:
         span_attributes = {}
 
-
-
-
-
-
-
-
-
-
-
-
-            set_span_attribute(span, AttributeKeys.JUDGMENT_CUMULATIVE_LLM_COST, 0.0)
-            if disable_partial_emit:
-                tracer.judgment_processor.set_internal_attribute(
-                    span_context=span.get_span_context(),
-                    key=InternalAttributeKeys.DISABLE_PARTIAL_EMIT,
-                    value=True,
-                )
-            yield span
-    finally:
-        current_cost_context.reset(cost_token)
-        child_cost = float(cost_context.get("cumulative_cost", 0.0))
-        tracer.add_cost_to_current_context(child_cost)
+    with tracer.get_tracer().start_as_current_span(
+        name=name,
+        attributes=span_attributes,
+    ) as span:
+        if disable_partial_emit:
+            tracer.judgment_processor.set_internal_attribute(
+                span_context=span.get_span_context(),
+                key=InternalAttributeKeys.DISABLE_PARTIAL_EMIT,
+                value=True,
+            )
+        yield span
 
 
 @asynccontextmanager
@@ -56,29 +43,17 @@ async def async_span_context(
     if span_attributes is None:
         span_attributes = {}
 
-
-
-
-
-
-
-
-
-
-
-
-            set_span_attribute(span, AttributeKeys.JUDGMENT_CUMULATIVE_LLM_COST, 0.0)
-            if disable_partial_emit:
-                tracer.judgment_processor.set_internal_attribute(
-                    span_context=span.get_span_context(),
-                    key=InternalAttributeKeys.DISABLE_PARTIAL_EMIT,
-                    value=True,
-                )
-            yield span
-    finally:
-        current_cost_context.reset(cost_token)
-        child_cost = float(cost_context.get("cumulative_cost", 0.0))
-        tracer.add_cost_to_current_context(child_cost)
+    with tracer.get_tracer().start_as_current_span(
+        name=name,
+        attributes=span_attributes,
+    ) as span:
+        if disable_partial_emit:
+            tracer.judgment_processor.set_internal_attribute(
+                span_context=span.get_span_context(),
+                key=InternalAttributeKeys.DISABLE_PARTIAL_EMIT,
+                value=True,
+            )
+        yield span
 
 
 def create_agent_context(
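Both helpers remain generator-based context managers, so call sites are unchanged by the refactor. A hedged usage sketch, assuming an already-configured judgeval `Tracer` instance named `tracer`; the parameter names come from the diff, but the exact signature order is not shown there:

```python
# Hedged sketch: `tracer` is an assumed, pre-configured judgeval Tracer.
# name/span_attributes are the parameter names visible in the diff above.
from judgeval.tracer.managers import sync_span_context

with sync_span_context(tracer, name="retrieve-docs", span_attributes={"k": 5}) as span:
    span.set_attribute("judgment.custom_flag", True)  # standard OTel span API
```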
judgeval/tracer/processors/__init__.py CHANGED
@@ -2,16 +2,15 @@ from __future__ import annotations
 from typing import Optional, TYPE_CHECKING, Any
 from collections import defaultdict
 from opentelemetry.context import Context
-from opentelemetry.sdk.trace import ReadableSpan, Span, SpanProcessor
+from opentelemetry.sdk.trace import ReadableSpan, Span, SpanProcessor
+from opentelemetry.trace.span import SpanContext
 from opentelemetry.sdk.trace.export import (
     BatchSpanProcessor,
 )
-from opentelemetry.sdk.resources import Resource
 from judgeval.tracer.exporters import JudgmentSpanExporter
 from judgeval.tracer.keys import AttributeKeys, InternalAttributeKeys, ResourceKeys
-from judgeval.api import JudgmentSyncClient
-from judgeval.logger import judgeval_logger
 from judgeval.utils.url import url_for
+from judgeval.utils.decorators import dont_throw
 from judgeval.version import get_version
 
 if TYPE_CHECKING:
@@ -33,75 +32,50 @@ class NoOpSpanProcessor(SpanProcessor):
 
 
 class JudgmentSpanProcessor(BatchSpanProcessor):
+    __slots__ = ("tracer", "resource_attributes", "_internal_attributes")
+
     def __init__(
         self,
         tracer: Tracer,
         project_name: str,
+        project_id: str,
         api_key: str,
         organization_id: str,
         /,
         *,
-        max_queue_size: int =
-
+        max_queue_size: int | None = None,
+        schedule_delay_millis: float | None = None,
+        max_export_batch_size: int | None = None,
+        export_timeout_millis: float | None = None,
        resource_attributes: Optional[dict[str, Any]] = None,
    ):
        self.tracer = tracer
-        self.project_name = project_name
-        self.api_key = api_key
-        self.organization_id = organization_id
-
-        # Resolve project_id
-        self.project_id = self._resolve_project_id()
 
-
-
+        attrs = {
+            ResourceKeys.SERVICE_NAME: project_name,
+            ResourceKeys.TELEMETRY_SDK_NAME: "judgeval",
+            ResourceKeys.TELEMETRY_SDK_VERSION: get_version(),
+            ResourceKeys.JUDGMENT_PROJECT_ID: project_id,
+            **(resource_attributes or {}),
+        }
+        self.resource_attributes = attrs
 
-        endpoint = url_for("/otel/v1/traces")
        super().__init__(
            JudgmentSpanExporter(
-                endpoint=
+                endpoint=url_for("/otel/v1/traces"),
                api_key=api_key,
                organization_id=organization_id,
+                project_id=project_id,
            ),
            max_queue_size=max_queue_size,
+            schedule_delay_millis=schedule_delay_millis,
+            max_export_batch_size=max_export_batch_size,
            export_timeout_millis=export_timeout_millis,
        )
        self._internal_attributes: defaultdict[tuple[int, int], dict[str, Any]] = (
            defaultdict(dict)
        )
 
-    def _resolve_project_id(self) -> str | None:
-        """Resolve project_id from project_name using the API."""
-        try:
-            client = JudgmentSyncClient(
-                api_key=self.api_key,
-                organization_id=self.organization_id,
-            )
-            return client.projects_resolve({"project_name": self.project_name})[
-                "project_id"
-            ]
-        except Exception:
-            return None
-
-    def _setup_resource_attributes(self, resource_attributes: dict[str, Any]) -> None:
-        """Set up resource attributes including project_id."""
-        resource_attributes.update(
-            {
-                ResourceKeys.SERVICE_NAME: self.project_name,
-                ResourceKeys.TELEMETRY_SDK_NAME: "judgeval",
-                ResourceKeys.TELEMETRY_SDK_VERSION: get_version(),
-            }
-        )
-
-        if self.project_id is not None:
-            resource_attributes[ResourceKeys.JUDGMENT_PROJECT_ID] = self.project_id
-        else:
-            judgeval_logger.error(
-                f"Failed to resolve project {self.project_name}, please create it first at https://app.judgmentlabs.ai/org/{self.organization_id}/projects. Skipping Judgment export."
-            )
-
-        self.resource_attributes = resource_attributes
-
    def _get_span_key(self, span_context: SpanContext) -> tuple[int, int]:
        return (span_context.trace_id, span_context.span_id)
 
@@ -132,38 +106,32 @@ class JudgmentSpanProcessor(BatchSpanProcessor):
    def _cleanup_span_state(self, span_key: tuple[int, int]) -> None:
        self._internal_attributes.pop(span_key, None)
 
+    @dont_throw
    def emit_partial(self) -> None:
        current_span = self.tracer.get_current_span()
-        if
-
-
-
+        if (
+            not current_span
+            or not current_span.is_recording()
+            or not isinstance(current_span, ReadableSpan)
+        ):
            return
 
        span_context = current_span.get_span_context()
        if self.get_internal_attribute(
-            span_context
-            key=InternalAttributeKeys.DISABLE_PARTIAL_EMIT,
-            default=False,
+            span_context, InternalAttributeKeys.DISABLE_PARTIAL_EMIT, False
        ):
            return
 
-        current_update_id = self.increment_update_id(span_context=span_context)
-
        attributes = dict(current_span.attributes or {})
-        attributes[AttributeKeys.JUDGMENT_UPDATE_ID] =
-
-        existing_resource_attrs = (
-            dict(current_span.resource.attributes) if current_span.resource else {}
+        attributes[AttributeKeys.JUDGMENT_UPDATE_ID] = self.increment_update_id(
+            span_context
        )
-        merged_resource_attrs = {**existing_resource_attrs, **self.resource_attributes}
-        merged_resource = Resource.create(merged_resource_attrs)
 
        partial_span = ReadableSpan(
            name=current_span.name,
            context=span_context,
            parent=current_span.parent,
-            resource=
+            resource=current_span.resource,
            attributes=attributes,
            events=current_span.events,
            links=current_span.links,
@@ -193,20 +161,11 @@ class JudgmentSpanProcessor(BatchSpanProcessor):
        attributes = dict(span.attributes or {})
        attributes[AttributeKeys.JUDGMENT_UPDATE_ID] = 20
 
-        existing_resource_attrs = (
-            dict(span.resource.attributes) if span.resource else {}
-        )
-        merged_resource_attrs = {
-            **existing_resource_attrs,
-            **self.resource_attributes,
-        }
-        merged_resource = Resource.create(merged_resource_attrs)
-
        final_span = ReadableSpan(
            name=span.name,
            context=span.context,
            parent=span.parent,
-            resource=
+            resource=span.resource,
            attributes=attributes,
            events=span.events,
            links=span.links,
@@ -224,8 +183,10 @@ class JudgmentSpanProcessor(BatchSpanProcessor):
 
 
 class NoOpJudgmentSpanProcessor(JudgmentSpanProcessor):
+    __slots__ = ("resource_attributes",)
+
    def __init__(self):
-
+        self.resource_attributes = {}
 
    def on_start(self, span: Span, parent_context: Optional[Context] = None) -> None:
        pass
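The processor keys its per-span bookkeeping by `(trace_id, span_id)` (see `_get_span_key` and `_internal_attributes` above). A standalone sketch of that pattern, with illustrative names rather than the SDK's:

```python
# Standalone illustration of per-span state keyed by (trace_id, span_id),
# mirroring the _internal_attributes/_get_span_key pattern in the diff above.
from collections import defaultdict
from typing import Any, Dict, Tuple

SpanKey = Tuple[int, int]


class SpanStateStore:
    def __init__(self) -> None:
        self._state: Dict[SpanKey, Dict[str, Any]] = defaultdict(dict)

    def set(self, key: SpanKey, name: str, value: Any) -> None:
        self._state[key][name] = value

    def get(self, key: SpanKey, name: str, default: Any = None) -> Any:
        return self._state.get(key, {}).get(name, default)

    def cleanup(self, key: SpanKey) -> None:
        # Drop state when the span ends so the map does not grow without bound.
        self._state.pop(key, None)
```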
judgeval/tracer/utils.py CHANGED
@@ -3,7 +3,6 @@ from opentelemetry.trace import Span
 from pydantic import BaseModel
 from typing import Callable, Optional
 from judgeval.scorers.api_scorer import TraceAPIScorerConfig
-from judgeval.env import JUDGMENT_DEFAULT_GPT_MODEL
 
 
 def set_span_attribute(span: Span, name: str, value: Any):
@@ -15,6 +14,6 @@ def set_span_attribute(span: Span, name: str, value: Any):
 
 class TraceScorerConfig(BaseModel):
     scorer: TraceAPIScorerConfig
-    model: str =
+    model: Optional[str] = None
     sampling_rate: float = 1.0
     run_condition: Optional[Callable[..., bool]] = None
judgeval/utils/file_utils.py CHANGED
@@ -85,12 +85,10 @@ def extract_scorer_name(scorer_file_path: str) -> str:
             and attr.__module__ == "scorer_module"
         ):
             try:
-                # Instantiate the scorer and get its name
                 scorer_instance = attr()
                 if hasattr(scorer_instance, "name"):
                     return scorer_instance.name
             except Exception:
-                # Skip if instantiation fails
                 continue
 
     raise AttributeError("No scorer class found or could be instantiated")
judgeval/utils/meta.py CHANGED
@@ -1,4 +1,7 @@
 from __future__ import annotations
+from typing import TypeVar, Dict, cast, Type
+
+T = TypeVar("T")
 
 
 class SingletonMeta(type):
@@ -6,9 +9,19 @@ class SingletonMeta(type):
     Metaclass for creating singleton classes.
     """
 
-    _instances:
+    _instances: Dict[type, object] = {}
+
+    def __call__(cls, *args, **kwargs) -> object:
+        if cls not in SingletonMeta._instances:
+            SingletonMeta._instances[cls] = super(SingletonMeta, cls).__call__(
+                *args, **kwargs
+            )
+        return SingletonMeta._instances[cls]
+
+    def get_instance(cls: Type[T]) -> T | None:
+        """Get the singleton instance if it exists, otherwise return None"""
+        instance = SingletonMeta._instances.get(cls, None)
+        return cast(T, instance) if instance is not None else None
+
 
-
-        if cls not in cls._instances:
-            cls._instances[cls] = super().__call__(*args, **kwargs)
-        return cls._instances[cls]
+__all__ = ("SingletonMeta",)
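The reworked `SingletonMeta` stores instances on the metaclass itself and adds a `get_instance` lookup that returns `None` before first construction. A small usage sketch; the `Config` class is hypothetical:

```python
# Usage sketch for SingletonMeta as shown in the diff; Config is a made-up example.
from judgeval.utils.meta import SingletonMeta


class Config(metaclass=SingletonMeta):
    def __init__(self, value: int = 0) -> None:
        self.value = value


a = Config(value=1)
b = Config(value=2)                # second call returns the cached instance
assert a is b and a.value == 1     # __init__ is not run again
assert Config.get_instance() is a  # would be None before the first Config(...) call
```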
judgeval/utils/testing.py CHANGED
@@ -7,23 +7,11 @@ from judgeval.exceptions import JudgmentTestError
 
 
 def assert_test_results(scoring_results: List[ScoringResult]) -> None:
-    """
-    Collects all failed scorers from the scoring results.
-
-    Args:
-        ScoringResults (List[ScoringResult]): List of scoring results to check
-
-    Returns:
-        None. Raises exceptions for any failed test cases.
-    """
     failed_cases: List[List[ScorerData]] = []
-
     for result in scoring_results:
         if not result.success:
-            # Create a test case context with all relevant fields
             test_case = []
             if result.scorers_data:
-                # If the result was not successful, check each scorer_data
                 for scorer_data in result.scorers_data:
                     if not scorer_data.success:
                         test_case.append(scorer_data)
@@ -50,7 +38,6 @@ def assert_test_results(scoring_results: List[ScoringResult]) -> None:
     failed_tests = len(failed_cases)
     passed_tests = total_tests - failed_tests
 
-    # Print summary with colors
     rprint("\n" + "=" * 80)
     if failed_tests == 0:
         rprint(
@@ -62,7 +49,6 @@ def assert_test_results(scoring_results: List[ScoringResult]) -> None:
         )
     rprint("=" * 80 + "\n")
 
-    # Print individual test cases
     for i, result in enumerate(scoring_results):
         test_num = i + 1
         if result.success:
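With the docstring and comments stripped, the behavior of `assert_test_results` is unchanged: it prints a summary and raises when any scorer failed. A hedged sketch of wiring it into a test, assuming `scoring_results` comes from an evaluation run elsewhere:

```python
# Hedged sketch: scoring_results is assumed to come from a judgeval evaluation
# run; assert_test_results prints the summary and raises on failures (the module
# imports JudgmentTestError for that path, as the hunk header above shows).
from judgeval.utils.testing import assert_test_results


def test_agent_quality(scoring_results):
    assert_test_results(scoring_results)  # the test fails if any scorer failed
```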
judgeval/utils/version_check.py CHANGED
judgeval/version.py CHANGED
{judgeval-0.12.0.dist-info → judgeval-0.13.0.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: judgeval
-Version: 0.12.0
+Version: 0.13.0
 Summary: Judgeval Package
 Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
 Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues
@@ -17,14 +17,8 @@ Requires-Dist: httpx>=0.28.1
 Requires-Dist: litellm<1.75.0
 Requires-Dist: opentelemetry-exporter-otlp>=1.36.0
 Requires-Dist: opentelemetry-sdk>=1.36.0
-Requires-Dist: opentelemetry-semantic-conventions>=0.57b0
 Requires-Dist: orjson>=3.9.0
 Requires-Dist: typer>=0.9.0
-Provides-Extra: langchain
-Requires-Dist: langchain-anthropic; extra == 'langchain'
-Requires-Dist: langchain-core; extra == 'langchain'
-Requires-Dist: langchain-huggingface; extra == 'langchain'
-Requires-Dist: langchain-openai; extra == 'langchain'
 Provides-Extra: s3
 Requires-Dist: boto3>=1.40.11; extra == 's3'
 Provides-Extra: trainer