PyPI - judgeval - Versions diffs - 0.1.0__py3-none-any.whl → 0.23.0__py3-none-any.whl - Mend

judgeval 0.1.0__py3-none-any.whl → 0.23.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (234) hide show

judgeval/__init__.py +173 -10
judgeval/api/__init__.py +523 -0
judgeval/api/api_types.py +413 -0
judgeval/cli.py +112 -0
judgeval/constants.py +7 -30
judgeval/data/__init__.py +1 -3
judgeval/data/evaluation_run.py +125 -0
judgeval/data/example.py +14 -40
judgeval/data/judgment_types.py +396 -146
judgeval/data/result.py +11 -18
judgeval/data/scorer_data.py +3 -26
judgeval/data/scripts/openapi_transform.py +5 -5
judgeval/data/trace.py +115 -194
judgeval/dataset/__init__.py +335 -0
judgeval/env.py +55 -0
judgeval/evaluation/__init__.py +346 -0
judgeval/exceptions.py +28 -0
judgeval/integrations/langgraph/__init__.py +13 -0
judgeval/integrations/openlit/__init__.py +51 -0
judgeval/judges/__init__.py +2 -2
judgeval/judges/litellm_judge.py +77 -16
judgeval/judges/together_judge.py +88 -17
judgeval/judges/utils.py +7 -20
judgeval/judgment_attribute_keys.py +55 -0
judgeval/{common/logger.py → logger.py} +24 -8
judgeval/prompt/__init__.py +330 -0
judgeval/scorers/__init__.py +11 -11
judgeval/scorers/agent_scorer.py +15 -19
judgeval/scorers/api_scorer.py +21 -23
judgeval/scorers/base_scorer.py +54 -36
judgeval/scorers/example_scorer.py +1 -3
judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +2 -24
judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +2 -10
judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +2 -2
judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +2 -10
judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +2 -14
judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +171 -59
judgeval/scorers/score.py +64 -47
judgeval/scorers/utils.py +2 -107
judgeval/tracer/__init__.py +1111 -2
judgeval/tracer/constants.py +1 -0
judgeval/tracer/exporters/__init__.py +40 -0
judgeval/tracer/exporters/s3.py +119 -0
judgeval/tracer/exporters/store.py +59 -0
judgeval/tracer/exporters/utils.py +32 -0
judgeval/tracer/keys.py +63 -0
judgeval/tracer/llm/__init__.py +7 -0
judgeval/tracer/llm/config.py +78 -0
judgeval/tracer/llm/constants.py +9 -0
judgeval/tracer/llm/llm_anthropic/__init__.py +3 -0
judgeval/tracer/llm/llm_anthropic/config.py +6 -0
judgeval/tracer/llm/llm_anthropic/messages.py +452 -0
judgeval/tracer/llm/llm_anthropic/messages_stream.py +322 -0
judgeval/tracer/llm/llm_anthropic/wrapper.py +59 -0
judgeval/tracer/llm/llm_google/__init__.py +3 -0
judgeval/tracer/llm/llm_google/config.py +6 -0
judgeval/tracer/llm/llm_google/generate_content.py +127 -0
judgeval/tracer/llm/llm_google/wrapper.py +30 -0
judgeval/tracer/llm/llm_openai/__init__.py +3 -0
judgeval/tracer/llm/llm_openai/beta_chat_completions.py +216 -0
judgeval/tracer/llm/llm_openai/chat_completions.py +501 -0
judgeval/tracer/llm/llm_openai/config.py +6 -0
judgeval/tracer/llm/llm_openai/responses.py +506 -0
judgeval/tracer/llm/llm_openai/utils.py +42 -0
judgeval/tracer/llm/llm_openai/wrapper.py +63 -0
judgeval/tracer/llm/llm_together/__init__.py +3 -0
judgeval/tracer/llm/llm_together/chat_completions.py +406 -0
judgeval/tracer/llm/llm_together/config.py +6 -0
judgeval/tracer/llm/llm_together/wrapper.py +52 -0
judgeval/tracer/llm/providers.py +19 -0
judgeval/tracer/managers.py +167 -0
judgeval/tracer/processors/__init__.py +220 -0
judgeval/tracer/utils.py +19 -0
judgeval/trainer/__init__.py +14 -0
judgeval/trainer/base_trainer.py +122 -0
judgeval/trainer/config.py +123 -0
judgeval/trainer/console.py +144 -0
judgeval/trainer/fireworks_trainer.py +392 -0
judgeval/trainer/trainable_model.py +252 -0
judgeval/trainer/trainer.py +70 -0
judgeval/utils/async_utils.py +39 -0
judgeval/utils/decorators/__init__.py +0 -0
judgeval/utils/decorators/dont_throw.py +37 -0
judgeval/utils/decorators/use_once.py +13 -0
judgeval/utils/file_utils.py +74 -28
judgeval/utils/guards.py +36 -0
judgeval/utils/meta.py +27 -0
judgeval/utils/project.py +15 -0
judgeval/utils/serialize.py +253 -0
judgeval/utils/testing.py +70 -0
judgeval/utils/url.py +10 -0
judgeval/{version_check.py → utils/version_check.py} +5 -3
judgeval/utils/wrappers/README.md +3 -0
judgeval/utils/wrappers/__init__.py +15 -0
judgeval/utils/wrappers/immutable_wrap_async.py +74 -0
judgeval/utils/wrappers/immutable_wrap_async_iterator.py +84 -0
judgeval/utils/wrappers/immutable_wrap_sync.py +66 -0
judgeval/utils/wrappers/immutable_wrap_sync_iterator.py +84 -0
judgeval/utils/wrappers/mutable_wrap_async.py +67 -0
judgeval/utils/wrappers/mutable_wrap_sync.py +67 -0
judgeval/utils/wrappers/py.typed +0 -0
judgeval/utils/wrappers/utils.py +35 -0
judgeval/v1/__init__.py +88 -0
judgeval/v1/data/__init__.py +7 -0
judgeval/v1/data/example.py +44 -0
judgeval/v1/data/scorer_data.py +42 -0
judgeval/v1/data/scoring_result.py +44 -0
judgeval/v1/datasets/__init__.py +6 -0
judgeval/v1/datasets/dataset.py +214 -0
judgeval/v1/datasets/dataset_factory.py +94 -0
judgeval/v1/evaluation/__init__.py +6 -0
judgeval/v1/evaluation/evaluation.py +182 -0
judgeval/v1/evaluation/evaluation_factory.py +17 -0
judgeval/v1/instrumentation/__init__.py +6 -0
judgeval/v1/instrumentation/llm/__init__.py +7 -0
judgeval/v1/instrumentation/llm/config.py +78 -0
judgeval/v1/instrumentation/llm/constants.py +11 -0
judgeval/v1/instrumentation/llm/llm_anthropic/__init__.py +5 -0
judgeval/v1/instrumentation/llm/llm_anthropic/config.py +6 -0
judgeval/v1/instrumentation/llm/llm_anthropic/messages.py +414 -0
judgeval/v1/instrumentation/llm/llm_anthropic/messages_stream.py +307 -0
judgeval/v1/instrumentation/llm/llm_anthropic/wrapper.py +61 -0
judgeval/v1/instrumentation/llm/llm_google/__init__.py +5 -0
judgeval/v1/instrumentation/llm/llm_google/config.py +6 -0
judgeval/v1/instrumentation/llm/llm_google/generate_content.py +121 -0
judgeval/v1/instrumentation/llm/llm_google/wrapper.py +30 -0
judgeval/v1/instrumentation/llm/llm_openai/__init__.py +5 -0
judgeval/v1/instrumentation/llm/llm_openai/beta_chat_completions.py +212 -0
judgeval/v1/instrumentation/llm/llm_openai/chat_completions.py +477 -0
judgeval/v1/instrumentation/llm/llm_openai/config.py +6 -0
judgeval/v1/instrumentation/llm/llm_openai/responses.py +472 -0
judgeval/v1/instrumentation/llm/llm_openai/utils.py +41 -0
judgeval/v1/instrumentation/llm/llm_openai/wrapper.py +63 -0
judgeval/v1/instrumentation/llm/llm_together/__init__.py +5 -0
judgeval/v1/instrumentation/llm/llm_together/chat_completions.py +382 -0
judgeval/v1/instrumentation/llm/llm_together/config.py +6 -0
judgeval/v1/instrumentation/llm/llm_together/wrapper.py +57 -0
judgeval/v1/instrumentation/llm/providers.py +19 -0
judgeval/v1/integrations/claude_agent_sdk/__init__.py +119 -0
judgeval/v1/integrations/claude_agent_sdk/wrapper.py +564 -0
judgeval/v1/integrations/langgraph/__init__.py +13 -0
judgeval/v1/integrations/openlit/__init__.py +47 -0
judgeval/v1/internal/api/__init__.py +525 -0
judgeval/v1/internal/api/api_types.py +413 -0
judgeval/v1/prompts/__init__.py +6 -0
judgeval/v1/prompts/prompt.py +29 -0
judgeval/v1/prompts/prompt_factory.py +189 -0
judgeval/v1/py.typed +0 -0
judgeval/v1/scorers/__init__.py +6 -0
judgeval/v1/scorers/api_scorer.py +82 -0
judgeval/v1/scorers/base_scorer.py +17 -0
judgeval/v1/scorers/built_in/__init__.py +17 -0
judgeval/v1/scorers/built_in/answer_correctness.py +28 -0
judgeval/v1/scorers/built_in/answer_relevancy.py +28 -0
judgeval/v1/scorers/built_in/built_in_factory.py +26 -0
judgeval/v1/scorers/built_in/faithfulness.py +28 -0
judgeval/v1/scorers/built_in/instruction_adherence.py +28 -0
judgeval/v1/scorers/custom_scorer/__init__.py +6 -0
judgeval/v1/scorers/custom_scorer/custom_scorer.py +50 -0
judgeval/v1/scorers/custom_scorer/custom_scorer_factory.py +16 -0
judgeval/v1/scorers/prompt_scorer/__init__.py +6 -0
judgeval/v1/scorers/prompt_scorer/prompt_scorer.py +86 -0
judgeval/v1/scorers/prompt_scorer/prompt_scorer_factory.py +85 -0
judgeval/v1/scorers/scorers_factory.py +49 -0
judgeval/v1/tracer/__init__.py +7 -0
judgeval/v1/tracer/base_tracer.py +520 -0
judgeval/v1/tracer/exporters/__init__.py +14 -0
judgeval/v1/tracer/exporters/in_memory_span_exporter.py +25 -0
judgeval/v1/tracer/exporters/judgment_span_exporter.py +42 -0
judgeval/v1/tracer/exporters/noop_span_exporter.py +19 -0
judgeval/v1/tracer/exporters/span_store.py +50 -0
judgeval/v1/tracer/judgment_tracer_provider.py +70 -0
judgeval/v1/tracer/processors/__init__.py +6 -0
judgeval/v1/tracer/processors/_lifecycles/__init__.py +28 -0
judgeval/v1/tracer/processors/_lifecycles/agent_id_processor.py +53 -0
judgeval/v1/tracer/processors/_lifecycles/context_keys.py +11 -0
judgeval/v1/tracer/processors/_lifecycles/customer_id_processor.py +29 -0
judgeval/v1/tracer/processors/_lifecycles/registry.py +18 -0
judgeval/v1/tracer/processors/judgment_span_processor.py +165 -0
judgeval/v1/tracer/processors/noop_span_processor.py +42 -0
judgeval/v1/tracer/tracer.py +67 -0
judgeval/v1/tracer/tracer_factory.py +38 -0
judgeval/v1/trainers/__init__.py +5 -0
judgeval/v1/trainers/base_trainer.py +62 -0
judgeval/v1/trainers/config.py +123 -0
judgeval/v1/trainers/console.py +144 -0
judgeval/v1/trainers/fireworks_trainer.py +392 -0
judgeval/v1/trainers/trainable_model.py +252 -0
judgeval/v1/trainers/trainers_factory.py +37 -0
judgeval/v1/utils.py +18 -0
judgeval/version.py +5 -0
judgeval/warnings.py +4 -0
judgeval-0.23.0.dist-info/METADATA +266 -0
judgeval-0.23.0.dist-info/RECORD +201 -0
judgeval-0.23.0.dist-info/entry_points.txt +2 -0
judgeval/clients.py +0 -34
judgeval/common/__init__.py +0 -13
judgeval/common/api/__init__.py +0 -3
judgeval/common/api/api.py +0 -352
judgeval/common/api/constants.py +0 -165
judgeval/common/exceptions.py +0 -27
judgeval/common/storage/__init__.py +0 -6
judgeval/common/storage/s3_storage.py +0 -98
judgeval/common/tracer/__init__.py +0 -31
judgeval/common/tracer/constants.py +0 -22
judgeval/common/tracer/core.py +0 -1916
judgeval/common/tracer/otel_exporter.py +0 -108
judgeval/common/tracer/otel_span_processor.py +0 -234
judgeval/common/tracer/span_processor.py +0 -37
judgeval/common/tracer/span_transformer.py +0 -211
judgeval/common/tracer/trace_manager.py +0 -92
judgeval/common/utils.py +0 -940
judgeval/data/datasets/__init__.py +0 -4
judgeval/data/datasets/dataset.py +0 -341
judgeval/data/datasets/eval_dataset_client.py +0 -214
judgeval/data/tool.py +0 -5
judgeval/data/trace_run.py +0 -37
judgeval/evaluation_run.py +0 -75
judgeval/integrations/langgraph.py +0 -843
judgeval/judges/mixture_of_judges.py +0 -286
judgeval/judgment_client.py +0 -369
judgeval/rules.py +0 -521
judgeval/run_evaluation.py +0 -684
judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py +0 -14
judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +0 -52
judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -28
judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +0 -20
judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +0 -27
judgeval/utils/alerts.py +0 -93
judgeval/utils/requests.py +0 -50
judgeval-0.1.0.dist-info/METADATA +0 -202
judgeval-0.1.0.dist-info/RECORD +0 -73
{judgeval-0.1.0.dist-info → judgeval-0.23.0.dist-info}/WHEEL +0 -0
{judgeval-0.1.0.dist-info → judgeval-0.23.0.dist-info}/licenses/LICENSE.md +0 -0

judgeval/v1/instrumentation/llm/config.py ADDED Viewed

@@ -0,0 +1,78 @@
+from __future__ import annotations
+from typing import TYPE_CHECKING
+from judgeval.logger import judgeval_logger
+from judgeval.v1.instrumentation.llm.constants import ProviderType
+from judgeval.v1.instrumentation.llm.providers import (
+    HAS_OPENAI,
+    HAS_TOGETHER,
+    HAS_ANTHROPIC,
+    HAS_GOOGLE_GENAI,
+    ApiClient,
+)
+if TYPE_CHECKING:
+    from judgeval.v1.tracer.base_tracer import BaseTracer
+def _detect_provider(client: ApiClient) -> ProviderType:
+    """
+    Work out which supported SDK a client object comes from.
+    Each SDK is imported lazily, and only when its ``HAS_*`` flag is truthy.
+    The checks run in the order OpenAI, Anthropic, Together, Google GenAI and
+    the first ``isinstance`` match wins. When nothing matches, a warning is
+    logged and ``ProviderType.DEFAULT`` is returned.
+    """
+    if HAS_OPENAI:
+        import openai
+        if isinstance(client, (openai.OpenAI, openai.AsyncOpenAI)):
+            return ProviderType.OPENAI
+    if HAS_ANTHROPIC:
+        import anthropic
+        if isinstance(client, (anthropic.Anthropic, anthropic.AsyncAnthropic)):
+            return ProviderType.ANTHROPIC
+    if HAS_TOGETHER:
+        import together  # type: ignore[import-untyped]
+        if isinstance(client, (together.Together, together.AsyncTogether)):
+            return ProviderType.TOGETHER
+    if HAS_GOOGLE_GENAI:
+        from google import genai
+        if isinstance(client, genai.Client):
+            return ProviderType.GOOGLE
+    # Nothing matched: tell the user, then let the caller fall back.
+    unknown_client_message = (
+        f"Unknown client type {type(client)}, Trying to wrap as OpenAI-compatible. "
+        "If this is a mistake or you think we should support this client, please file an issue at https://github.com/JudgmentLabs/judgeval/issues!"
+    )
+    judgeval_logger.warning(unknown_client_message)
+    return ProviderType.DEFAULT
+def wrap_provider(tracer: BaseTracer, client: ApiClient) -> ApiClient:
+    """
+    Add tracing to an LLM API client and return the wrapped client.
+    The provider is detected from the client's type. Anthropic, Together and
+    Google GenAI clients are passed to their dedicated wrappers; OpenAI clients
+    and unrecognised clients are both passed to the OpenAI wrapper.
+    """
+    detected = _detect_provider(client)
+    if detected == ProviderType.ANTHROPIC:
+        from .llm_anthropic.wrapper import wrap_anthropic_client
+        return wrap_anthropic_client(tracer, client)
+    if detected == ProviderType.TOGETHER:
+        from .llm_together.wrapper import wrap_together_client
+        return wrap_together_client(tracer, client)
+    if detected == ProviderType.GOOGLE:
+        from .llm_google.wrapper import wrap_google_client
+        return wrap_google_client(tracer, client)
+    # ProviderType.OPENAI and the fallback for unknown clients share the
+    # OpenAI-compatible wrapper.
+    from .llm_openai.wrapper import wrap_openai_client
+    return wrap_openai_client(tracer, client)

judgeval/v1/instrumentation/llm/constants.py ADDED Viewed

@@ -0,0 +1,11 @@
+from __future__ import annotations
+from enum import Enum
+class ProviderType(Enum):
+    """Identifiers for the LLM client SDKs the instrumentation layer tells apart."""
+    OPENAI = "openai"
+    ANTHROPIC = "anthropic"
+    TOGETHER = "together"
+    GOOGLE = "google"
+    # Used for clients that match none of the SDKs above.
+    DEFAULT = "default"

judgeval/v1/instrumentation/llm/llm_anthropic/__init__.py ADDED Viewed

@@ -0,0 +1,5 @@
+from __future__ import annotations
+from .wrapper import wrap_anthropic_client
+__all__ = ["wrap_anthropic_client"]

judgeval/v1/instrumentation/llm/llm_anthropic/config.py ADDED Viewed

@@ -0,0 +1,6 @@
+from __future__ import annotations
+import importlib.util
+# True when a top-level ``anthropic`` package can be located; find_spec looks
+# the package up without importing it.
+HAS_ANTHROPIC = importlib.util.find_spec("anthropic") is not None
+__all__ = ["HAS_ANTHROPIC"]

judgeval/v1/instrumentation/llm/llm_anthropic/messages.py ADDED Viewed

@@ -0,0 +1,414 @@
+from __future__ import annotations
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Awaitable,
+    Callable,
+    Dict,
+    Iterator,
+    AsyncIterator,
+    Generator,
+    AsyncGenerator,
+    Tuple,
+)
+from opentelemetry.trace import Status, StatusCode
+from judgeval.judgment_attribute_keys import AttributeKeys
+from judgeval.utils.serialize import safe_serialize
+from judgeval.utils.wrappers import (
+    immutable_wrap_sync,
+    immutable_wrap_async,
+    mutable_wrap_sync,
+    mutable_wrap_async,
+    immutable_wrap_sync_iterator,
+    immutable_wrap_async_iterator,
+)
+if TYPE_CHECKING:
+    from judgeval.v1.tracer import BaseTracer
+    from anthropic import Anthropic, AsyncAnthropic
+    from anthropic.types import (
+        Message,
+        Usage,
+        MessageDeltaUsage,
+        RawMessageStreamEvent,
+    )
+def _extract_anthropic_content(chunk: RawMessageStreamEvent) -> str:
+    """Return the text carried by a streaming event, or "" when it has none.
+    Only a ``content_block_delta`` event whose delta is a non-empty
+    ``text_delta`` contributes text; every other event yields "".
+    """
+    if chunk.type != "content_block_delta":
+        return ""
+    block_delta = chunk.delta
+    if block_delta.type != "text_delta":
+        return ""
+    # A missing/empty text payload is reported as the empty string.
+    return block_delta.text or ""
+def _extract_anthropic_tokens(
+    usage: Usage | MessageDeltaUsage,
+) -> Tuple[int, int, int, int]:
+    """Read the four token counters from an Anthropic usage object.
+    Returns ``(input_tokens, output_tokens, cache_read_input_tokens,
+    cache_creation_input_tokens)``. A counter that is ``None`` or that the
+    object does not define at all is reported as 0, so a usage payload that
+    carries only some of the fields does not raise ``AttributeError`` here.
+    """
+    def _count(field_name: str) -> int:
+        # getattr with a default tolerates usage objects lacking the field.
+        value = getattr(usage, field_name, None)
+        return value if value is not None else 0
+    return (
+        _count("input_tokens"),
+        _count("output_tokens"),
+        _count("cache_read_input_tokens"),
+        _count("cache_creation_input_tokens"),
+    )
+def _extract_anthropic_chunk_usage(
+    chunk: RawMessageStreamEvent,
+) -> Usage | MessageDeltaUsage | None:
+    """Return the usage object attached to a streaming event, if there is one.
+    ``message_start`` events expose it on the nested message (``None`` when the
+    message is falsy); ``message_delta`` events expose it directly (``None``
+    when the attribute is absent). All other event types return ``None``.
+    """
+    event_type = chunk.type
+    if event_type == "message_start":
+        start_message = chunk.message
+        return start_message.usage if start_message else None
+    if event_type == "message_delta":
+        return getattr(chunk, "usage", None)
+    return None
+def wrap_messages_create_sync(tracer: BaseTracer, client: Anthropic) -> None:
+    """Replace ``client.messages.create`` with a traced dispatcher, in place.
+    For every call the dispatcher looks at the ``stream`` keyword and sends the
+    call through the streaming or the non-streaming tracing wrapper built
+    around the original method.
+    """
+    untraced_create = client.messages.create
+    def dispatcher(*args: Any, **kwargs: Any) -> Any:
+        build_wrapper = (
+            _wrap_streaming_sync
+            if kwargs.get("stream", False)
+            else _wrap_non_streaming_sync
+        )
+        return build_wrapper(tracer, untraced_create)(*args, **kwargs)
+    setattr(client.messages, "create", dispatcher)
+def _wrap_non_streaming_sync(
+    tracer: BaseTracer, original_func: Callable[..., Message]
+) -> Callable[..., Message]:
+    """Wrap a blocking, non-streaming ``messages.create`` call in an LLM span.
+    Four hooks are handed to ``immutable_wrap_sync``: one opens the span and
+    records the request, one records the response and its token usage, one
+    records a raised exception, and one ends the span.
+    """
+    def pre_hook(ctx: Dict[str, Any], *args: Any, **kwargs: Any) -> None:
+        # Open the span first and stash it so the later hooks can find it.
+        llm_span = tracer.get_tracer().start_span(
+            "ANTHROPIC_API_CALL", attributes={AttributeKeys.JUDGMENT_SPAN_KIND: "llm"}
+        )
+        ctx["span"] = llm_span
+        llm_span.set_attribute(AttributeKeys.GEN_AI_PROMPT, safe_serialize(kwargs))
+        requested_model = kwargs.get("model", "")
+        ctx["model_name"] = requested_model
+        llm_span.set_attribute(AttributeKeys.JUDGMENT_LLM_MODEL_NAME, requested_model)
+    def post_hook(ctx: Dict[str, Any], result: Message) -> None:
+        llm_span = ctx.get("span")
+        if not llm_span:
+            return
+        llm_span.set_attribute(
+            AttributeKeys.GEN_AI_COMPLETION, safe_serialize(result)
+        )
+        if result.usage:
+            token_counts = _extract_anthropic_tokens(result.usage)
+            token_keys = (
+                AttributeKeys.JUDGMENT_USAGE_NON_CACHED_INPUT_TOKENS,
+                AttributeKeys.JUDGMENT_USAGE_OUTPUT_TOKENS,
+                AttributeKeys.JUDGMENT_USAGE_CACHE_READ_INPUT_TOKENS,
+                AttributeKeys.JUDGMENT_USAGE_CACHE_CREATION_INPUT_TOKENS,
+            )
+            # token_counts is (input, output, cache read, cache creation).
+            for attribute_key, token_count in zip(token_keys, token_counts):
+                llm_span.set_attribute(attribute_key, token_count)
+            llm_span.set_attribute(
+                AttributeKeys.JUDGMENT_USAGE_METADATA,
+                safe_serialize(result.usage),
+            )
+        # Overwrite the requested model name with the one the response reports.
+        llm_span.set_attribute(AttributeKeys.JUDGMENT_LLM_MODEL_NAME, result.model)
+    def error_hook(ctx: Dict[str, Any], error: Exception) -> None:
+        llm_span = ctx.get("span")
+        if not llm_span:
+            return
+        llm_span.record_exception(error)
+        llm_span.set_status(Status(StatusCode.ERROR))
+    def finally_hook(ctx: Dict[str, Any]) -> None:
+        llm_span = ctx.get("span")
+        if not llm_span:
+            return
+        llm_span.end()
+    return immutable_wrap_sync(
+        original_func,
+        pre_hook=pre_hook,
+        post_hook=post_hook,
+        error_hook=error_hook,
+        finally_hook=finally_hook,
+    )
+def _wrap_streaming_sync(
+    tracer: BaseTracer, original_func: Callable[..., Iterator[RawMessageStreamEvent]]
+) -> Callable[..., Iterator[RawMessageStreamEvent]]:
+    """Wrap a blocking ``messages.create(stream=True)`` call in an LLM span.
+    The span is opened before the request. The returned event iterator is
+    re-wrapped so that each event updates the accumulated text and the usage
+    counters, and so that the iterator wrapper's finally hook ends the span.
+    If the request itself raises, no iterator is handed back, so the outer
+    error hook ends the span instead.
+    """
+    # Span attribute -> name of the usage field that feeds it.
+    usage_fields = (
+        (AttributeKeys.JUDGMENT_USAGE_NON_CACHED_INPUT_TOKENS, "input_tokens"),
+        (AttributeKeys.JUDGMENT_USAGE_OUTPUT_TOKENS, "output_tokens"),
+        (
+            AttributeKeys.JUDGMENT_USAGE_CACHE_READ_INPUT_TOKENS,
+            "cache_read_input_tokens",
+        ),
+        (
+            AttributeKeys.JUDGMENT_USAGE_CACHE_CREATION_INPUT_TOKENS,
+            "cache_creation_input_tokens",
+        ),
+    )
+    def pre_hook(ctx: Dict[str, Any], *args: Any, **kwargs: Any) -> None:
+        ctx["span"] = tracer.get_tracer().start_span(
+            "ANTHROPIC_API_CALL", attributes={AttributeKeys.JUDGMENT_SPAN_KIND: "llm"}
+        )
+        ctx["span"].set_attribute(AttributeKeys.GEN_AI_PROMPT, safe_serialize(kwargs))
+        ctx["model_name"] = kwargs.get("model", "")
+        ctx["span"].set_attribute(
+            AttributeKeys.JUDGMENT_LLM_MODEL_NAME, ctx["model_name"]
+        )
+        ctx["accumulated_content"] = ""
+        # Last known value of each usage counter across the stream's events.
+        ctx["usage_totals"] = {}
+    def mutate_hook(
+        ctx: Dict[str, Any], result: Iterator[RawMessageStreamEvent]
+    ) -> Iterator[RawMessageStreamEvent]:
+        def traced_generator() -> Generator[RawMessageStreamEvent, None, None]:
+            for chunk in result:
+                yield chunk
+        def yield_hook(inner_ctx: Dict[str, Any], chunk: RawMessageStreamEvent) -> None:
+            span = ctx.get("span")
+            if not span:
+                return
+            content = _extract_anthropic_content(chunk)
+            if content:
+                ctx["accumulated_content"] = (
+                    ctx.get("accumulated_content", "") + content
+                )
+            usage_data = _extract_anthropic_chunk_usage(chunk)
+            if usage_data:
+                # A later event (message_delta) may report only some counters.
+                # Keep the last known value for the rest instead of resetting
+                # what message_start already reported to 0.
+                totals = ctx.setdefault("usage_totals", {})
+                for attribute_key, field_name in usage_fields:
+                    reported = getattr(usage_data, field_name, None)
+                    if reported is not None:
+                        totals[attribute_key] = reported
+                    span.set_attribute(attribute_key, totals.get(attribute_key, 0))
+                span.set_attribute(
+                    AttributeKeys.JUDGMENT_USAGE_METADATA, safe_serialize(usage_data)
+                )
+        def post_hook_inner(inner_ctx: Dict[str, Any]) -> None:
+            span = ctx.get("span")
+            if span:
+                accumulated = ctx.get("accumulated_content", "")
+                span.set_attribute(AttributeKeys.GEN_AI_COMPLETION, accumulated)
+        def error_hook_inner(inner_ctx: Dict[str, Any], error: Exception) -> None:
+            span = ctx.get("span")
+            if span:
+                span.record_exception(error)
+                span.set_status(Status(StatusCode.ERROR))
+        def finally_hook_inner(inner_ctx: Dict[str, Any]) -> None:
+            span = ctx.get("span")
+            if span:
+                span.end()
+        wrapped_generator = immutable_wrap_sync_iterator(
+            traced_generator,
+            yield_hook=yield_hook,
+            post_hook=post_hook_inner,
+            error_hook=error_hook_inner,
+            finally_hook=finally_hook_inner,
+        )
+        return wrapped_generator()
+    def error_hook(ctx: Dict[str, Any], error: Exception) -> None:
+        span = ctx.get("span")
+        if span:
+            span.record_exception(error)
+            span.set_status(Status(StatusCode.ERROR))
+            # The call failed before an iterator reached the caller, so
+            # finally_hook_inner will not run; end the span here so it is
+            # not left open.
+            span.end()
+    return mutable_wrap_sync(
+        original_func,
+        pre_hook=pre_hook,
+        mutate_hook=mutate_hook,
+        error_hook=error_hook,
+    )
+def wrap_messages_create_async(tracer: BaseTracer, client: AsyncAnthropic) -> None:
+    """Replace ``client.messages.create`` with a traced async dispatcher, in place.
+    For every call the dispatcher looks at the ``stream`` keyword, builds the
+    streaming or the non-streaming tracing wrapper around the original
+    coroutine function, and awaits it.
+    """
+    untraced_create = client.messages.create
+    async def dispatcher(*args: Any, **kwargs: Any) -> Any:
+        build_wrapper = (
+            _wrap_streaming_async
+            if kwargs.get("stream", False)
+            else _wrap_non_streaming_async
+        )
+        return await build_wrapper(tracer, untraced_create)(*args, **kwargs)
+    setattr(client.messages, "create", dispatcher)
+def _wrap_non_streaming_async(
+    tracer: BaseTracer, original_func: Callable[..., Awaitable[Message]]
+) -> Callable[..., Awaitable[Message]]:
+    """Wrap an awaitable, non-streaming ``messages.create`` call in an LLM span.
+    Four hooks are handed to ``immutable_wrap_async``: one opens the span and
+    records the request, one records the response and its token usage, one
+    records a raised exception, and one ends the span.
+    """
+    def pre_hook(ctx: Dict[str, Any], *args: Any, **kwargs: Any) -> None:
+        # Open the span first and stash it so the later hooks can find it.
+        llm_span = tracer.get_tracer().start_span(
+            "ANTHROPIC_API_CALL", attributes={AttributeKeys.JUDGMENT_SPAN_KIND: "llm"}
+        )
+        ctx["span"] = llm_span
+        llm_span.set_attribute(AttributeKeys.GEN_AI_PROMPT, safe_serialize(kwargs))
+        requested_model = kwargs.get("model", "")
+        ctx["model_name"] = requested_model
+        llm_span.set_attribute(AttributeKeys.JUDGMENT_LLM_MODEL_NAME, requested_model)
+    def post_hook(ctx: Dict[str, Any], result: Message) -> None:
+        llm_span = ctx.get("span")
+        if not llm_span:
+            return
+        llm_span.set_attribute(
+            AttributeKeys.GEN_AI_COMPLETION, safe_serialize(result)
+        )
+        if result.usage:
+            token_counts = _extract_anthropic_tokens(result.usage)
+            token_keys = (
+                AttributeKeys.JUDGMENT_USAGE_NON_CACHED_INPUT_TOKENS,
+                AttributeKeys.JUDGMENT_USAGE_OUTPUT_TOKENS,
+                AttributeKeys.JUDGMENT_USAGE_CACHE_READ_INPUT_TOKENS,
+                AttributeKeys.JUDGMENT_USAGE_CACHE_CREATION_INPUT_TOKENS,
+            )
+            # token_counts is (input, output, cache read, cache creation).
+            for attribute_key, token_count in zip(token_keys, token_counts):
+                llm_span.set_attribute(attribute_key, token_count)
+            llm_span.set_attribute(
+                AttributeKeys.JUDGMENT_USAGE_METADATA,
+                safe_serialize(result.usage),
+            )
+        # Overwrite the requested model name with the one the response reports.
+        llm_span.set_attribute(AttributeKeys.JUDGMENT_LLM_MODEL_NAME, result.model)
+    def error_hook(ctx: Dict[str, Any], error: Exception) -> None:
+        llm_span = ctx.get("span")
+        if not llm_span:
+            return
+        llm_span.record_exception(error)
+        llm_span.set_status(Status(StatusCode.ERROR))
+    def finally_hook(ctx: Dict[str, Any]) -> None:
+        llm_span = ctx.get("span")
+        if not llm_span:
+            return
+        llm_span.end()
+    return immutable_wrap_async(
+        original_func,
+        pre_hook=pre_hook,
+        post_hook=post_hook,
+        error_hook=error_hook,
+        finally_hook=finally_hook,
+    )
+def _wrap_streaming_async(
+    tracer: BaseTracer,
+    original_func: Callable[..., Awaitable[AsyncIterator[RawMessageStreamEvent]]],
+) -> Callable[..., Awaitable[AsyncIterator[RawMessageStreamEvent]]]:
+    """Wrap an awaitable ``messages.create(stream=True)`` call in an LLM span.
+    The span is opened before the request. The returned async event iterator
+    is re-wrapped so that each event updates the accumulated text and the usage
+    counters, and so that the iterator wrapper's finally hook ends the span.
+    If the request itself raises, no iterator is handed back, so the outer
+    error hook ends the span instead.
+    """
+    # Span attribute -> name of the usage field that feeds it.
+    usage_fields = (
+        (AttributeKeys.JUDGMENT_USAGE_NON_CACHED_INPUT_TOKENS, "input_tokens"),
+        (AttributeKeys.JUDGMENT_USAGE_OUTPUT_TOKENS, "output_tokens"),
+        (
+            AttributeKeys.JUDGMENT_USAGE_CACHE_READ_INPUT_TOKENS,
+            "cache_read_input_tokens",
+        ),
+        (
+            AttributeKeys.JUDGMENT_USAGE_CACHE_CREATION_INPUT_TOKENS,
+            "cache_creation_input_tokens",
+        ),
+    )
+    def pre_hook(ctx: Dict[str, Any], *args: Any, **kwargs: Any) -> None:
+        ctx["span"] = tracer.get_tracer().start_span(
+            "ANTHROPIC_API_CALL", attributes={AttributeKeys.JUDGMENT_SPAN_KIND: "llm"}
+        )
+        ctx["span"].set_attribute(AttributeKeys.GEN_AI_PROMPT, safe_serialize(kwargs))
+        ctx["model_name"] = kwargs.get("model", "")
+        ctx["span"].set_attribute(
+            AttributeKeys.JUDGMENT_LLM_MODEL_NAME, ctx["model_name"]
+        )
+        ctx["accumulated_content"] = ""
+        # Last known value of each usage counter across the stream's events.
+        ctx["usage_totals"] = {}
+    def mutate_hook(
+        ctx: Dict[str, Any], result: AsyncIterator[RawMessageStreamEvent]
+    ) -> AsyncIterator[RawMessageStreamEvent]:
+        async def traced_generator() -> AsyncGenerator[RawMessageStreamEvent, None]:
+            async for chunk in result:
+                yield chunk
+        def yield_hook(inner_ctx: Dict[str, Any], chunk: RawMessageStreamEvent) -> None:
+            span = ctx.get("span")
+            if not span:
+                return
+            content = _extract_anthropic_content(chunk)
+            if content:
+                ctx["accumulated_content"] = (
+                    ctx.get("accumulated_content", "") + content
+                )
+            usage_data = _extract_anthropic_chunk_usage(chunk)
+            if usage_data:
+                # A later event (message_delta) may report only some counters.
+                # Keep the last known value for the rest instead of resetting
+                # what message_start already reported to 0.
+                totals = ctx.setdefault("usage_totals", {})
+                for attribute_key, field_name in usage_fields:
+                    reported = getattr(usage_data, field_name, None)
+                    if reported is not None:
+                        totals[attribute_key] = reported
+                    span.set_attribute(attribute_key, totals.get(attribute_key, 0))
+                span.set_attribute(
+                    AttributeKeys.JUDGMENT_USAGE_METADATA, safe_serialize(usage_data)
+                )
+        def post_hook_inner(inner_ctx: Dict[str, Any]) -> None:
+            span = ctx.get("span")
+            if span:
+                accumulated = ctx.get("accumulated_content", "")
+                span.set_attribute(AttributeKeys.GEN_AI_COMPLETION, accumulated)
+        def error_hook_inner(inner_ctx: Dict[str, Any], error: Exception) -> None:
+            span = ctx.get("span")
+            if span:
+                span.record_exception(error)
+                span.set_status(Status(StatusCode.ERROR))
+        def finally_hook_inner(inner_ctx: Dict[str, Any]) -> None:
+            span = ctx.get("span")
+            if span:
+                span.end()
+        wrapped_generator = immutable_wrap_async_iterator(
+            traced_generator,
+            yield_hook=yield_hook,
+            post_hook=post_hook_inner,
+            error_hook=error_hook_inner,
+            finally_hook=finally_hook_inner,
+        )
+        return wrapped_generator()
+    def error_hook(ctx: Dict[str, Any], error: Exception) -> None:
+        span = ctx.get("span")
+        if span:
+            span.record_exception(error)
+            span.set_status(Status(StatusCode.ERROR))
+            # The call failed before an iterator reached the caller, so
+            # finally_hook_inner will not run; end the span here so it is
+            # not left open.
+            span.end()
+    return mutable_wrap_async(
+        original_func,
+        pre_hook=pre_hook,
+        mutate_hook=mutate_hook,
+        error_hook=error_hook,
+    )

judgeval 0.1.0__py3-none-any.whl → 0.23.0__py3-none-any.whl

judgeval 0.1.0__py3-none-any.whl → 0.23.0__py3-none-any.whl