PyPI - judgeval - Versions diffs - 0.16.5__tar.gz → 0.16.7__tar.gz - Mend

judgeval 0.16.5__tar.gz → 0.16.7__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of judgeval might be problematic. Click here for more details.

Files changed (149) hide show

{judgeval-0.16.5 → judgeval-0.16.7}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: judgeval
-Version: 0.16.5
+Version: 0.16.7
 Summary: Judgeval Package
 Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
 Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues

{judgeval-0.16.5 → judgeval-0.16.7}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "judgeval"
-version = "0.16.5"
+version = "0.16.7"
 authors = [
     { name = "Andrew Li", email = "andrew@judgmentlabs.ai" },
     { name = "Alex Shan", email = "alex@judgmentlabs.ai" },

{judgeval-0.16.5 → judgeval-0.16.7}/src/judgeval/__init__.py RENAMED Viewed

@@ -39,18 +39,23 @@ class JudgmentClient(metaclass=SingletonMeta):
     def run_evaluation(
         self,
         examples: List[Example],
-        scorers: Sequence[Union[ExampleAPIScorerConfig, ExampleScorer]],
+        scorers: Sequence[Union[ExampleAPIScorerConfig, ExampleScorer, None]],
         project_name: str = "default_project",
         eval_run_name: str = "default_eval_run",
         model: Optional[str] = None,
         assert_test: bool = False,
     ) -> List[ScoringResult]:
         try:
+            for scorer in scorers:
+                if scorer is None:
+                    raise ValueError(
+                        "Failed to run evaluation: At least one Prompt Scorer was not successfuly retrieved."
+                    )
             eval = ExampleEvaluationRun(
                 project_name=project_name,
                 eval_name=eval_run_name,
                 examples=examples,
-                scorers=scorers,
+                scorers=scorers,  # type: ignore
                 model=model,
             )

{judgeval-0.16.5 → judgeval-0.16.7}/src/judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py RENAMED Viewed

@@ -12,6 +12,7 @@ from judgeval.logger import judgeval_logger
 from abc import ABC
 from judgeval.env import JUDGMENT_DEFAULT_GPT_MODEL
 from copy import copy
+from judgeval.utils.decorators.dont_throw import dont_throw
 def push_prompt_scorer(
@@ -60,10 +61,19 @@ def fetch_prompt_scorer(
 ):
     client = JudgmentSyncClient(judgment_api_key, organization_id)
     try:
-        scorer_config = client.fetch_scorers({"names": [name]})["scorers"][0]
-        scorer_config.pop("created_at")
-        scorer_config.pop("updated_at")
-        return scorer_config
+        fetched_scorers = client.fetch_scorers({"names": [name]})
+        if len(fetched_scorers["scorers"]) == 0:
+            judgeval_logger.error(f"Prompt scorer '{name}' not found")
+            raise JudgmentAPIError(
+                status_code=404,
+                detail=f"Prompt scorer '{name}' not found",
+                response=None,  # type: ignore
+            )
+        else:
+            scorer_config = fetched_scorers["scorers"][0]
+            scorer_config.pop("created_at")
+            scorer_config.pop("updated_at")
+            return scorer_config
     except JudgmentAPIError as e:
         if e.status_code == 500:
             raise JudgmentAPIError(
@@ -109,6 +119,7 @@ class BasePromptScorer(ABC, APIScorerConfig):
     organization_id: str = os.getenv("JUDGMENT_ORG_ID") or ""
     @classmethod
+    @dont_throw
     def get(
         cls,
         name: str,

{judgeval-0.16.5 → judgeval-0.16.7}/src/judgeval/tracer/__init__.py RENAMED Viewed

@@ -267,6 +267,7 @@ class Tracer(metaclass=SingletonMeta):
         if span and span.is_recording():
             set_span_attribute(span, AttributeKeys.JUDGMENT_CUSTOMER_ID, customer_id)
+    @dont_throw
     def add_agent_attributes_to_span(self, span):
         """Add agent ID, class name, and instance name to span if they exist in context"""
         current_agent_context = self.agent_context.get()
@@ -342,6 +343,9 @@ class Tracer(metaclass=SingletonMeta):
         run_condition = scorer_config.run_condition
         sampling_rate = scorer_config.sampling_rate
+        if scorer is None:
+            judgeval_logger.error("Prompt Scorer was not found, skipping evaluation.")
+            return
         if not isinstance(scorer, (TraceAPIScorerConfig)):
             judgeval_logger.error(
                 "Scorer must be an instance of TraceAPIScorerConfig, got %s, skipping evaluation."
@@ -877,7 +881,7 @@ class Tracer(metaclass=SingletonMeta):
         self,
         /,
         *,
-        scorer: Union[ExampleAPIScorerConfig, ExampleScorer],
+        scorer: Union[ExampleAPIScorerConfig, ExampleScorer, None],
         example: Example,
         model: Optional[str] = None,
         sampling_rate: float = 1.0,
@@ -886,6 +890,10 @@ class Tracer(metaclass=SingletonMeta):
             judgeval_logger.info("Evaluation is not enabled, skipping evaluation")
             return
+        if scorer is None:
+            judgeval_logger.error("Prompt Scorer was not found, skipping evaluation.")
+            return
         if not isinstance(scorer, (ExampleAPIScorerConfig, ExampleScorer)):
             judgeval_logger.error(
                 "Scorer must be an instance of ExampleAPIScorerConfig or ExampleScorer, got %s, skipping evaluation."

{judgeval-0.16.5 → judgeval-0.16.7}/src/judgeval/tracer/llm/llm_anthropic/wrapper.py RENAMED Viewed

@@ -1,11 +1,11 @@
 from __future__ import annotations
 import functools
-import orjson
 from typing import (
     TYPE_CHECKING,
     Callable,
     Optional,
     Protocol,
+    TypeVar,
     Tuple,
     Union,
     Iterator,
@@ -19,6 +19,7 @@ from judgeval.tracer.llm.llm_anthropic.config import (
     anthropic_AsyncAnthropic,
 )
 from judgeval.tracer.managers import sync_span_context, async_span_context
+from judgeval.logger import judgeval_logger
 from judgeval.tracer.keys import AttributeKeys
 from judgeval.tracer.utils import set_span_attribute
 from judgeval.utils.serialize import safe_serialize
@@ -28,10 +29,6 @@ if TYPE_CHECKING:
     from opentelemetry.trace import Span
-# Keep the original client type for runtime compatibility
-AnthropicClientType = Union[anthropic_Anthropic, anthropic_AsyncAnthropic]
 # Content block protocols
 @runtime_checkable
 class AnthropicContentBlock(Protocol):
@@ -81,6 +78,10 @@ class AnthropicAsyncClient(Protocol):
     pass
+# Generic client type bound to both sync and async client protocols
+TClient = TypeVar("TClient", bound=Union[AnthropicClient, AnthropicAsyncClient])
 # Union types
 AnthropicResponseType = AnthropicMessage
 AnthropicStreamType = Union[
@@ -193,7 +194,7 @@ class TracedAnthropicGenerator:
         self,
         tracer: Tracer,
         generator: Iterator[AnthropicStreamEvent],
-        client: AnthropicClientType,
+        client: AnthropicClient,
         span: Span,
         model_name: str,
     ):
@@ -261,7 +262,7 @@ class TracedAnthropicAsyncGenerator:
         self,
         tracer: Tracer,
         async_generator: AsyncIterator[AnthropicStreamEvent],
-        client: AnthropicClientType,
+        client: AnthropicAsyncClient,
         span: Span,
         model_name: str,
     ):
@@ -278,6 +279,19 @@ class TracedAnthropicAsyncGenerator:
     async def __anext__(self) -> AnthropicStreamEvent:
         try:
             chunk = await self.async_generator.__anext__()
+        except StopAsyncIteration:
+            set_span_attribute(
+                self.span, AttributeKeys.GEN_AI_COMPLETION, self.accumulated_content
+            )
+            self.span.end()
+            raise
+        except Exception as e:
+            if self.span:
+                self.span.record_exception(e)
+                self.span.end()
+            raise
+        try:
             content = _extract_anthropic_content(chunk)
             if content:
                 self.accumulated_content += content
@@ -310,18 +324,14 @@ class TracedAnthropicAsyncGenerator:
                     AttributeKeys.JUDGMENT_USAGE_METADATA,
                     safe_serialize(usage_data),
                 )
-            return chunk
-        except StopAsyncIteration:
-            set_span_attribute(
-                self.span, AttributeKeys.GEN_AI_COMPLETION, self.accumulated_content
-            )
-            self.span.end()
-            raise
         except Exception as e:
             if self.span:
-                self.span.record_exception(e)
                 self.span.end()
-            raise
+            judgeval_logger.error(
+                f"[anthropic wrapped_async] Error adding span metadata: {e}"
+            )
+        finally:
+            return chunk
 class TracedAnthropicSyncContextManager:
@@ -329,7 +339,7 @@ class TracedAnthropicSyncContextManager:
         self,
         tracer: Tracer,
         context_manager,
-        client: AnthropicClientType,
+        client: AnthropicClient,
         span: Span,
         model_name: str,
     ):
@@ -354,7 +364,7 @@ class TracedAnthropicAsyncContextManager:
         self,
         tracer: Tracer,
         context_manager,
-        client: AnthropicClientType,
+        client: AnthropicAsyncClient,
         span: Span,
         model_name: str,
     ):
@@ -374,9 +384,7 @@ class TracedAnthropicAsyncContextManager:
         return await self.context_manager.__aexit__(exc_type, exc_val, exc_tb)
-def wrap_anthropic_client(
-    tracer: Tracer, client: AnthropicClientType
-) -> AnthropicClientType:
+def wrap_anthropic_client(tracer: Tracer, client: TClient) -> TClient:
     def wrapped(function: Callable, span_name: str):
         @functools.wraps(function)
         def wrapper(*args, **kwargs):
@@ -398,68 +406,77 @@ def wrap_anthropic_client(
                 with sync_span_context(
                     tracer, span_name, {AttributeKeys.JUDGMENT_SPAN_KIND: "llm"}
                 ) as span:
-                    tracer.add_agent_attributes_to_span(span)
-                    set_span_attribute(
-                        span, AttributeKeys.GEN_AI_PROMPT, safe_serialize(kwargs)
-                    )
-                    model_name = kwargs.get("model", "")
-                    set_span_attribute(
-                        span, AttributeKeys.GEN_AI_REQUEST_MODEL, model_name
-                    )
-                    response = function(*args, **kwargs)
-                    if isinstance(response, AnthropicMessage):
-                        output, usage_data = _format_anthropic_output(response)
-                        # Serialize structured data to JSON for span attribute
-                        if isinstance(output, list):
-                            output_str = orjson.dumps(
-                                output, option=orjson.OPT_INDENT_2
-                            ).decode()
-                        else:
-                            output_str = str(output) if output is not None else None
+                    try:
+                        tracer.add_agent_attributes_to_span(span)
+                        set_span_attribute(
+                            span, AttributeKeys.GEN_AI_PROMPT, safe_serialize(kwargs)
+                        )
+                        model_name = kwargs.get("model", "")
                         set_span_attribute(
-                            span, AttributeKeys.GEN_AI_COMPLETION, output_str
+                            span, AttributeKeys.GEN_AI_REQUEST_MODEL, model_name
+                        )
+                    except Exception as e:
+                        judgeval_logger.error(
+                            f"[anthropic wrapped] Error adding span metadata: {e}"
                         )
-                        if usage_data:
-                            (
-                                prompt_tokens,
-                                completion_tokens,
-                                cache_read,
-                                cache_creation,
-                            ) = _extract_anthropic_tokens(usage_data)
-                            set_span_attribute(
-                                span,
-                                AttributeKeys.GEN_AI_USAGE_INPUT_TOKENS,
-                                prompt_tokens,
-                            )
-                            set_span_attribute(
-                                span,
-                                AttributeKeys.GEN_AI_USAGE_OUTPUT_TOKENS,
-                                completion_tokens,
-                            )
-                            set_span_attribute(
-                                span,
-                                AttributeKeys.GEN_AI_USAGE_CACHE_READ_INPUT_TOKENS,
-                                cache_read,
-                            )
+                    response = function(*args, **kwargs)
+                    try:
+                        if isinstance(response, AnthropicMessage):
+                            output, usage_data = _format_anthropic_output(response)
+                            # Serialize structured data to JSON for span attribute
+                            if isinstance(output, list):
+                                output_str = safe_serialize(output)
+                            else:
+                                output_str = str(output) if output is not None else None
                             set_span_attribute(
-                                span,
-                                AttributeKeys.GEN_AI_USAGE_CACHE_CREATION_INPUT_TOKENS,
-                                cache_creation,
+                                span, AttributeKeys.GEN_AI_COMPLETION, output_str
                             )
+                            if usage_data:
+                                (
+                                    prompt_tokens,
+                                    completion_tokens,
+                                    cache_read,
+                                    cache_creation,
+                                ) = _extract_anthropic_tokens(usage_data)
+                                set_span_attribute(
+                                    span,
+                                    AttributeKeys.GEN_AI_USAGE_INPUT_TOKENS,
+                                    prompt_tokens,
+                                )
+                                set_span_attribute(
+                                    span,
+                                    AttributeKeys.GEN_AI_USAGE_OUTPUT_TOKENS,
+                                    completion_tokens,
+                                )
+                                set_span_attribute(
+                                    span,
+                                    AttributeKeys.GEN_AI_USAGE_CACHE_READ_INPUT_TOKENS,
+                                    cache_read,
+                                )
+                                set_span_attribute(
+                                    span,
+                                    AttributeKeys.GEN_AI_USAGE_CACHE_CREATION_INPUT_TOKENS,
+                                    cache_creation,
+                                )
+                                set_span_attribute(
+                                    span,
+                                    AttributeKeys.JUDGMENT_USAGE_METADATA,
+                                    safe_serialize(usage_data),
+                                )
                             set_span_attribute(
                                 span,
-                                AttributeKeys.JUDGMENT_USAGE_METADATA,
-                                safe_serialize(usage_data),
+                                AttributeKeys.GEN_AI_RESPONSE_MODEL,
+                                getattr(response, "model", model_name),
                             )
-                        set_span_attribute(
-                            span,
-                            AttributeKeys.GEN_AI_RESPONSE_MODEL,
-                            getattr(response, "model", model_name),
+                    except Exception as e:
+                        judgeval_logger.error(
+                            f"[anthropic wrapped] Error adding span metadata: {e}"
                         )
-                    return response
+                    finally:
+                        return response
         return wrapper
@@ -484,68 +501,77 @@ def wrap_anthropic_client(
                 async with async_span_context(
                     tracer, span_name, {AttributeKeys.JUDGMENT_SPAN_KIND: "llm"}
                 ) as span:
-                    tracer.add_agent_attributes_to_span(span)
-                    set_span_attribute(
-                        span, AttributeKeys.GEN_AI_PROMPT, safe_serialize(kwargs)
-                    )
-                    model_name = kwargs.get("model", "")
-                    set_span_attribute(
-                        span, AttributeKeys.GEN_AI_REQUEST_MODEL, model_name
-                    )
-                    response = await function(*args, **kwargs)
-                    if isinstance(response, AnthropicMessage):
-                        output, usage_data = _format_anthropic_output(response)
-                        # Serialize structured data to JSON for span attribute
-                        if isinstance(output, list):
-                            output_str = orjson.dumps(
-                                output, option=orjson.OPT_INDENT_2
-                            ).decode()
-                        else:
-                            output_str = str(output) if output is not None else None
+                    try:
+                        tracer.add_agent_attributes_to_span(span)
+                        set_span_attribute(
+                            span, AttributeKeys.GEN_AI_PROMPT, safe_serialize(kwargs)
+                        )
+                        model_name = kwargs.get("model", "")
                         set_span_attribute(
-                            span, AttributeKeys.GEN_AI_COMPLETION, output_str
+                            span, AttributeKeys.GEN_AI_REQUEST_MODEL, model_name
+                        )
+                    except Exception as e:
+                        judgeval_logger.error(
+                            f"[anthropic wrapped_async] Error adding span metadata: {e}"
                         )
-                        if usage_data:
-                            (
-                                prompt_tokens,
-                                completion_tokens,
-                                cache_read,
-                                cache_creation,
-                            ) = _extract_anthropic_tokens(usage_data)
-                            set_span_attribute(
-                                span,
-                                AttributeKeys.GEN_AI_USAGE_INPUT_TOKENS,
-                                prompt_tokens,
-                            )
-                            set_span_attribute(
-                                span,
-                                AttributeKeys.GEN_AI_USAGE_OUTPUT_TOKENS,
-                                completion_tokens,
-                            )
-                            set_span_attribute(
-                                span,
-                                AttributeKeys.GEN_AI_USAGE_CACHE_READ_INPUT_TOKENS,
-                                cache_read,
-                            )
+                    response = await function(*args, **kwargs)
+                    try:
+                        if isinstance(response, AnthropicMessage):
+                            output, usage_data = _format_anthropic_output(response)
+                            # Serialize structured data to JSON for span attribute
+                            if isinstance(output, list):
+                                output_str = safe_serialize(output)
+                            else:
+                                output_str = str(output) if output is not None else None
                             set_span_attribute(
-                                span,
-                                AttributeKeys.GEN_AI_USAGE_CACHE_CREATION_INPUT_TOKENS,
-                                cache_creation,
+                                span, AttributeKeys.GEN_AI_COMPLETION, output_str
                             )
+                            if usage_data:
+                                (
+                                    prompt_tokens,
+                                    completion_tokens,
+                                    cache_read,
+                                    cache_creation,
+                                ) = _extract_anthropic_tokens(usage_data)
+                                set_span_attribute(
+                                    span,
+                                    AttributeKeys.GEN_AI_USAGE_INPUT_TOKENS,
+                                    prompt_tokens,
+                                )
+                                set_span_attribute(
+                                    span,
+                                    AttributeKeys.GEN_AI_USAGE_OUTPUT_TOKENS,
+                                    completion_tokens,
+                                )
+                                set_span_attribute(
+                                    span,
+                                    AttributeKeys.GEN_AI_USAGE_CACHE_READ_INPUT_TOKENS,
+                                    cache_read,
+                                )
+                                set_span_attribute(
+                                    span,
+                                    AttributeKeys.GEN_AI_USAGE_CACHE_CREATION_INPUT_TOKENS,
+                                    cache_creation,
+                                )
+                                set_span_attribute(
+                                    span,
+                                    AttributeKeys.JUDGMENT_USAGE_METADATA,
+                                    safe_serialize(usage_data),
+                                )
                             set_span_attribute(
                                 span,
-                                AttributeKeys.JUDGMENT_USAGE_METADATA,
-                                safe_serialize(usage_data),
+                                AttributeKeys.GEN_AI_RESPONSE_MODEL,
+                                getattr(response, "model", model_name),
                             )
-                        set_span_attribute(
-                            span,
-                            AttributeKeys.GEN_AI_RESPONSE_MODEL,
-                            getattr(response, "model", model_name),
+                    except Exception as e:
+                        judgeval_logger.error(
+                            f"[anthropic wrapped_async] Error adding span metadata: {e}"
                         )
-                    return response
+                    finally:
+                        return response
         return wrapper
@@ -590,16 +616,20 @@ def wrap_anthropic_client(
         return wrapper
     span_name = "ANTHROPIC_API_CALL"
-    if anthropic_Anthropic and isinstance(client, anthropic_Anthropic):
+    if anthropic_Anthropic is not None and isinstance(client, anthropic_Anthropic):
         setattr(client.messages, "create", wrapped(client.messages.create, span_name))
         setattr(
             client.messages,
             "stream",
             wrapped_sync_context_manager(client.messages.stream, span_name),
         )
-    elif anthropic_AsyncAnthropic and isinstance(client, anthropic_AsyncAnthropic):
+    elif anthropic_AsyncAnthropic is not None and isinstance(
+        client, anthropic_AsyncAnthropic
+    ):
         setattr(
-            client.messages, "create", wrapped_async(client.messages.create, span_name)
+            client.messages,
+            "create",
+            wrapped_async(client.messages.create, span_name),
         )
         setattr(
             client.messages,

judgeval 0.16.5__tar.gz → 0.16.7__tar.gz

Potentially problematic release.

judgeval 0.16.5__tar.gz → 0.16.7__tar.gz