judgeval 0.16.9__py3-none-any.whl → 0.22.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of judgeval might be problematic; consult the registry's advisory page for more details.

Files changed (37):
  1. judgeval/__init__.py +32 -2
  2. judgeval/api/__init__.py +108 -0
  3. judgeval/api/api_types.py +76 -15
  4. judgeval/cli.py +16 -1
  5. judgeval/data/judgment_types.py +76 -20
  6. judgeval/dataset/__init__.py +11 -2
  7. judgeval/env.py +2 -11
  8. judgeval/evaluation/__init__.py +4 -0
  9. judgeval/prompt/__init__.py +330 -0
  10. judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +1 -13
  11. judgeval/tracer/__init__.py +371 -257
  12. judgeval/tracer/constants.py +1 -1
  13. judgeval/tracer/exporters/store.py +32 -16
  14. judgeval/tracer/keys.py +11 -9
  15. judgeval/tracer/llm/llm_anthropic/messages.py +38 -26
  16. judgeval/tracer/llm/llm_anthropic/messages_stream.py +14 -14
  17. judgeval/tracer/llm/llm_google/generate_content.py +9 -7
  18. judgeval/tracer/llm/llm_openai/beta_chat_completions.py +38 -14
  19. judgeval/tracer/llm/llm_openai/chat_completions.py +90 -26
  20. judgeval/tracer/llm/llm_openai/responses.py +88 -26
  21. judgeval/tracer/llm/llm_openai/utils.py +42 -0
  22. judgeval/tracer/llm/llm_together/chat_completions.py +26 -18
  23. judgeval/tracer/managers.py +4 -0
  24. judgeval/trainer/__init__.py +10 -1
  25. judgeval/trainer/base_trainer.py +122 -0
  26. judgeval/trainer/config.py +1 -1
  27. judgeval/trainer/fireworks_trainer.py +396 -0
  28. judgeval/trainer/trainer.py +52 -387
  29. judgeval/utils/guards.py +9 -5
  30. judgeval/utils/project.py +15 -0
  31. judgeval/utils/serialize.py +2 -2
  32. judgeval/version.py +1 -1
  33. {judgeval-0.16.9.dist-info → judgeval-0.22.2.dist-info}/METADATA +2 -3
  34. {judgeval-0.16.9.dist-info → judgeval-0.22.2.dist-info}/RECORD +37 -32
  35. {judgeval-0.16.9.dist-info → judgeval-0.22.2.dist-info}/WHEEL +0 -0
  36. {judgeval-0.16.9.dist-info → judgeval-0.22.2.dist-info}/entry_points.txt +0 -0
  37. {judgeval-0.16.9.dist-info → judgeval-0.22.2.dist-info}/licenses/LICENSE.md +0 -0
@@ -1 +1 @@
1
- JUDGEVAL_TRACER_INSTRUMENTING_MODULE_NAME = "opentelemetry.instrumentation.judgeval"
1
+ JUDGEVAL_TRACER_INSTRUMENTING_MODULE_NAME = "judgeval"
@@ -1,5 +1,5 @@
1
1
  from abc import ABC, abstractmethod
2
- from typing import List
2
+ from typing import List, Dict
3
3
 
4
4
  from opentelemetry.sdk.trace import ReadableSpan
5
5
 
@@ -9,35 +9,51 @@ class ABCSpanStore(ABC):
9
9
  def add(self, *spans: ReadableSpan): ...
10
10
 
11
11
  @abstractmethod
12
- def get(self, id: str) -> ReadableSpan: ...
12
+ def get_all(self) -> List[ReadableSpan]: ...
13
13
 
14
14
  @abstractmethod
15
- def get_all(self) -> List[ReadableSpan]: ...
15
+ def get_by_trace_id(self, trace_id: str) -> List[ReadableSpan]: ...
16
+
17
+ @abstractmethod
18
+ def clear_trace(self, trace_id: str): ...
16
19
 
17
20
 
18
21
  class SpanStore(ABCSpanStore):
19
- __slots__ = ("spans",)
22
+ __slots__ = ("_spans_by_trace",)
20
23
 
21
- spans: List[ReadableSpan]
24
+ _spans_by_trace: Dict[str, List[ReadableSpan]]
22
25
 
23
26
  def __init__(self):
24
- self.spans = []
27
+ self._spans_by_trace = {}
25
28
 
26
29
  def add(self, *spans: ReadableSpan):
27
- self.spans.extend(spans)
28
-
29
- def get(self, id: str) -> ReadableSpan:
30
- for span in self.spans:
30
+ for span in spans:
31
31
  context = span.get_span_context()
32
32
  if context is None:
33
33
  continue
34
- if context.span_id == id:
35
- return span
36
-
37
- raise ValueError(f"Span with id {id} not found")
34
+ # Convert trace_id to hex string per OTEL spec
35
+ trace_id = format(context.trace_id, "032x")
36
+ if trace_id not in self._spans_by_trace:
37
+ self._spans_by_trace[trace_id] = []
38
+ self._spans_by_trace[trace_id].append(span)
38
39
 
39
40
  def get_all(self) -> List[ReadableSpan]:
40
- return self.spans
41
+ all_spans = []
42
+ for spans in self._spans_by_trace.values():
43
+ all_spans.extend(spans)
44
+ return all_spans
45
+
46
+ def get_by_trace_id(self, trace_id: str) -> List[ReadableSpan]:
47
+ """Get all spans for a specific trace ID (32-char hex string)."""
48
+ return self._spans_by_trace.get(trace_id, [])
49
+
50
+ def clear_trace(self, trace_id: str):
51
+ """Clear all spans for a specific trace ID (32-char hex string)."""
52
+ if trace_id in self._spans_by_trace:
53
+ del self._spans_by_trace[trace_id]
41
54
 
42
55
  def __repr__(self) -> str:
43
- return f"SpanStore(spans={self.spans})"
56
+ total_spans = sum(len(spans) for spans in self._spans_by_trace.values())
57
+ return (
58
+ f"SpanStore(traces={len(self._spans_by_trace)}, total_spans={total_spans})"
59
+ )
judgeval/tracer/keys.py CHANGED
@@ -26,18 +26,19 @@ class AttributeKeys(str, Enum):
26
26
 
27
27
  PENDING_TRACE_EVAL = "judgment.pending_trace_eval"
28
28
 
29
+ JUDGMENT_LLM_PROVIDER = "judgment.llm.provider"
30
+ JUDGMENT_LLM_MODEL_NAME = "judgment.llm.model"
31
+ JUDGMENT_USAGE_NON_CACHED_INPUT_TOKENS = "judgment.usage.non_cached_input_tokens"
32
+ JUDGMENT_USAGE_CACHE_CREATION_INPUT_TOKENS = (
33
+ "judgment.usage.cache_creation_input_tokens"
34
+ )
35
+ JUDGMENT_USAGE_CACHE_READ_INPUT_TOKENS = "judgment.usage.cache_read_input_tokens"
36
+ JUDGMENT_USAGE_OUTPUT_TOKENS = "judgment.usage.output_tokens"
37
+ JUDGMENT_USAGE_TOTAL_COST_USD = "judgment.usage.total_cost_usd"
38
+
29
39
  GEN_AI_PROMPT = "gen_ai.prompt"
30
40
  GEN_AI_COMPLETION = "gen_ai.completion"
31
- GEN_AI_REQUEST_MODEL = "gen_ai.request.model"
32
- GEN_AI_RESPONSE_MODEL = "gen_ai.response.model"
33
41
  GEN_AI_SYSTEM = "gen_ai.system"
34
- GEN_AI_USAGE_INPUT_TOKENS = "gen_ai.usage.input_tokens"
35
- GEN_AI_USAGE_OUTPUT_TOKENS = "gen_ai.usage.output_tokens"
36
- GEN_AI_USAGE_CACHE_CREATION_INPUT_TOKENS = (
37
- "gen_ai.usage.cache_creation_input_tokens"
38
- )
39
- GEN_AI_USAGE_CACHE_READ_INPUT_TOKENS = "gen_ai.usage.cache_read_input_tokens"
40
-
41
42
  GEN_AI_REQUEST_TEMPERATURE = "gen_ai.request.temperature"
42
43
  GEN_AI_REQUEST_MAX_TOKENS = "gen_ai.request.max_tokens"
43
44
  GEN_AI_RESPONSE_FINISH_REASONS = "gen_ai.response.finish_reasons"
@@ -51,6 +52,7 @@ class InternalAttributeKeys(str, Enum):
51
52
 
52
53
  DISABLE_PARTIAL_EMIT = "disable_partial_emit"
53
54
  CANCELLED = "cancelled"
55
+ IS_CUSTOMER_CONTEXT_OWNER = "is_customer_context_owner"
54
56
 
55
57
 
56
58
  class ResourceKeys(str, Enum):
@@ -89,13 +89,13 @@ def _wrap_non_streaming_sync(
89
89
  ctx["span"] = tracer.get_tracer().start_span(
90
90
  "ANTHROPIC_API_CALL", attributes={AttributeKeys.JUDGMENT_SPAN_KIND: "llm"}
91
91
  )
92
- tracer.add_agent_attributes_to_span(ctx["span"])
92
+ tracer._inject_judgment_context(ctx["span"])
93
93
  set_span_attribute(
94
94
  ctx["span"], AttributeKeys.GEN_AI_PROMPT, safe_serialize(kwargs)
95
95
  )
96
96
  ctx["model_name"] = kwargs.get("model", "")
97
97
  set_span_attribute(
98
- ctx["span"], AttributeKeys.GEN_AI_REQUEST_MODEL, ctx["model_name"]
98
+ ctx["span"], AttributeKeys.JUDGMENT_LLM_MODEL_NAME, ctx["model_name"]
99
99
  )
100
100
 
101
101
  def post_hook(ctx: Dict[str, Any], result: Message) -> None:
@@ -112,17 +112,19 @@ def _wrap_non_streaming_sync(
112
112
  _extract_anthropic_tokens(result.usage)
113
113
  )
114
114
  set_span_attribute(
115
- span, AttributeKeys.GEN_AI_USAGE_INPUT_TOKENS, prompt_tokens
115
+ span,
116
+ AttributeKeys.JUDGMENT_USAGE_NON_CACHED_INPUT_TOKENS,
117
+ prompt_tokens,
116
118
  )
117
119
  set_span_attribute(
118
- span, AttributeKeys.GEN_AI_USAGE_OUTPUT_TOKENS, completion_tokens
120
+ span, AttributeKeys.JUDGMENT_USAGE_OUTPUT_TOKENS, completion_tokens
119
121
  )
120
122
  set_span_attribute(
121
- span, AttributeKeys.GEN_AI_USAGE_CACHE_READ_INPUT_TOKENS, cache_read
123
+ span, AttributeKeys.JUDGMENT_USAGE_CACHE_READ_INPUT_TOKENS, cache_read
122
124
  )
123
125
  set_span_attribute(
124
126
  span,
125
- AttributeKeys.GEN_AI_USAGE_CACHE_CREATION_INPUT_TOKENS,
127
+ AttributeKeys.JUDGMENT_USAGE_CACHE_CREATION_INPUT_TOKENS,
126
128
  cache_creation,
127
129
  )
128
130
  set_span_attribute(
@@ -133,7 +135,7 @@ def _wrap_non_streaming_sync(
133
135
 
134
136
  set_span_attribute(
135
137
  span,
136
- AttributeKeys.GEN_AI_RESPONSE_MODEL,
138
+ AttributeKeys.JUDGMENT_LLM_MODEL_NAME,
137
139
  result.model,
138
140
  )
139
141
 
@@ -163,13 +165,13 @@ def _wrap_streaming_sync(
163
165
  ctx["span"] = tracer.get_tracer().start_span(
164
166
  "ANTHROPIC_API_CALL", attributes={AttributeKeys.JUDGMENT_SPAN_KIND: "llm"}
165
167
  )
166
- tracer.add_agent_attributes_to_span(ctx["span"])
168
+ tracer._inject_judgment_context(ctx["span"])
167
169
  set_span_attribute(
168
170
  ctx["span"], AttributeKeys.GEN_AI_PROMPT, safe_serialize(kwargs)
169
171
  )
170
172
  ctx["model_name"] = kwargs.get("model", "")
171
173
  set_span_attribute(
172
- ctx["span"], AttributeKeys.GEN_AI_REQUEST_MODEL, ctx["model_name"]
174
+ ctx["span"], AttributeKeys.JUDGMENT_LLM_MODEL_NAME, ctx["model_name"]
173
175
  )
174
176
  ctx["accumulated_content"] = ""
175
177
 
@@ -197,17 +199,21 @@ def _wrap_streaming_sync(
197
199
  _extract_anthropic_tokens(usage_data)
198
200
  )
199
201
  set_span_attribute(
200
- span, AttributeKeys.GEN_AI_USAGE_INPUT_TOKENS, prompt_tokens
202
+ span,
203
+ AttributeKeys.JUDGMENT_USAGE_NON_CACHED_INPUT_TOKENS,
204
+ prompt_tokens,
201
205
  )
202
206
  set_span_attribute(
203
- span, AttributeKeys.GEN_AI_USAGE_OUTPUT_TOKENS, completion_tokens
207
+ span, AttributeKeys.JUDGMENT_USAGE_OUTPUT_TOKENS, completion_tokens
204
208
  )
205
209
  set_span_attribute(
206
- span, AttributeKeys.GEN_AI_USAGE_CACHE_READ_INPUT_TOKENS, cache_read
210
+ span,
211
+ AttributeKeys.JUDGMENT_USAGE_CACHE_READ_INPUT_TOKENS,
212
+ cache_read,
207
213
  )
208
214
  set_span_attribute(
209
215
  span,
210
- AttributeKeys.GEN_AI_USAGE_CACHE_CREATION_INPUT_TOKENS,
216
+ AttributeKeys.JUDGMENT_USAGE_CACHE_CREATION_INPUT_TOKENS,
211
217
  cache_creation,
212
218
  )
213
219
  set_span_attribute(
@@ -273,13 +279,13 @@ def _wrap_non_streaming_async(
273
279
  ctx["span"] = tracer.get_tracer().start_span(
274
280
  "ANTHROPIC_API_CALL", attributes={AttributeKeys.JUDGMENT_SPAN_KIND: "llm"}
275
281
  )
276
- tracer.add_agent_attributes_to_span(ctx["span"])
282
+ tracer._inject_judgment_context(ctx["span"])
277
283
  set_span_attribute(
278
284
  ctx["span"], AttributeKeys.GEN_AI_PROMPT, safe_serialize(kwargs)
279
285
  )
280
286
  ctx["model_name"] = kwargs.get("model", "")
281
287
  set_span_attribute(
282
- ctx["span"], AttributeKeys.GEN_AI_REQUEST_MODEL, ctx["model_name"]
288
+ ctx["span"], AttributeKeys.JUDGMENT_LLM_MODEL_NAME, ctx["model_name"]
283
289
  )
284
290
 
285
291
  def post_hook(ctx: Dict[str, Any], result: Message) -> None:
@@ -296,17 +302,19 @@ def _wrap_non_streaming_async(
296
302
  _extract_anthropic_tokens(result.usage)
297
303
  )
298
304
  set_span_attribute(
299
- span, AttributeKeys.GEN_AI_USAGE_INPUT_TOKENS, prompt_tokens
305
+ span,
306
+ AttributeKeys.JUDGMENT_USAGE_NON_CACHED_INPUT_TOKENS,
307
+ prompt_tokens,
300
308
  )
301
309
  set_span_attribute(
302
- span, AttributeKeys.GEN_AI_USAGE_OUTPUT_TOKENS, completion_tokens
310
+ span, AttributeKeys.JUDGMENT_USAGE_OUTPUT_TOKENS, completion_tokens
303
311
  )
304
312
  set_span_attribute(
305
- span, AttributeKeys.GEN_AI_USAGE_CACHE_READ_INPUT_TOKENS, cache_read
313
+ span, AttributeKeys.JUDGMENT_USAGE_CACHE_READ_INPUT_TOKENS, cache_read
306
314
  )
307
315
  set_span_attribute(
308
316
  span,
309
- AttributeKeys.GEN_AI_USAGE_CACHE_CREATION_INPUT_TOKENS,
317
+ AttributeKeys.JUDGMENT_USAGE_CACHE_CREATION_INPUT_TOKENS,
310
318
  cache_creation,
311
319
  )
312
320
  set_span_attribute(
@@ -317,7 +325,7 @@ def _wrap_non_streaming_async(
317
325
 
318
326
  set_span_attribute(
319
327
  span,
320
- AttributeKeys.GEN_AI_RESPONSE_MODEL,
328
+ AttributeKeys.JUDGMENT_LLM_MODEL_NAME,
321
329
  result.model,
322
330
  )
323
331
 
@@ -348,13 +356,13 @@ def _wrap_streaming_async(
348
356
  ctx["span"] = tracer.get_tracer().start_span(
349
357
  "ANTHROPIC_API_CALL", attributes={AttributeKeys.JUDGMENT_SPAN_KIND: "llm"}
350
358
  )
351
- tracer.add_agent_attributes_to_span(ctx["span"])
359
+ tracer._inject_judgment_context(ctx["span"])
352
360
  set_span_attribute(
353
361
  ctx["span"], AttributeKeys.GEN_AI_PROMPT, safe_serialize(kwargs)
354
362
  )
355
363
  ctx["model_name"] = kwargs.get("model", "")
356
364
  set_span_attribute(
357
- ctx["span"], AttributeKeys.GEN_AI_REQUEST_MODEL, ctx["model_name"]
365
+ ctx["span"], AttributeKeys.JUDGMENT_LLM_MODEL_NAME, ctx["model_name"]
358
366
  )
359
367
  ctx["accumulated_content"] = ""
360
368
 
@@ -382,17 +390,21 @@ def _wrap_streaming_async(
382
390
  _extract_anthropic_tokens(usage_data)
383
391
  )
384
392
  set_span_attribute(
385
- span, AttributeKeys.GEN_AI_USAGE_INPUT_TOKENS, prompt_tokens
393
+ span,
394
+ AttributeKeys.JUDGMENT_USAGE_NON_CACHED_INPUT_TOKENS,
395
+ prompt_tokens,
386
396
  )
387
397
  set_span_attribute(
388
- span, AttributeKeys.GEN_AI_USAGE_OUTPUT_TOKENS, completion_tokens
398
+ span, AttributeKeys.JUDGMENT_USAGE_OUTPUT_TOKENS, completion_tokens
389
399
  )
390
400
  set_span_attribute(
391
- span, AttributeKeys.GEN_AI_USAGE_CACHE_READ_INPUT_TOKENS, cache_read
401
+ span,
402
+ AttributeKeys.JUDGMENT_USAGE_CACHE_READ_INPUT_TOKENS,
403
+ cache_read,
392
404
  )
393
405
  set_span_attribute(
394
406
  span,
395
- AttributeKeys.GEN_AI_USAGE_CACHE_CREATION_INPUT_TOKENS,
407
+ AttributeKeys.JUDGMENT_USAGE_CACHE_CREATION_INPUT_TOKENS,
396
408
  cache_creation,
397
409
  )
398
410
  set_span_attribute(
@@ -37,14 +37,14 @@ def wrap_messages_stream_sync(tracer: Tracer, client: Anthropic) -> None:
37
37
  ctx["span"] = tracer.get_tracer().start_span(
38
38
  "ANTHROPIC_API_CALL", attributes={AttributeKeys.JUDGMENT_SPAN_KIND: "llm"}
39
39
  )
40
- tracer.add_agent_attributes_to_span(ctx["span"])
40
+ tracer._inject_judgment_context(ctx["span"])
41
41
  set_span_attribute(
42
42
  ctx["span"], AttributeKeys.GEN_AI_PROMPT, safe_serialize(kwargs)
43
43
  )
44
44
 
45
45
  ctx["model_name"] = kwargs.get("model", "")
46
46
  set_span_attribute(
47
- ctx["span"], AttributeKeys.GEN_AI_REQUEST_MODEL, ctx["model_name"]
47
+ ctx["span"], AttributeKeys.JUDGMENT_LLM_MODEL_NAME, ctx["model_name"]
48
48
  )
49
49
  ctx["accumulated_content"] = ""
50
50
 
@@ -125,22 +125,22 @@ def wrap_messages_stream_sync(tracer: Tracer, client: Anthropic) -> None:
125
125
  ) = _extract_anthropic_tokens(final_message.usage)
126
126
  set_span_attribute(
127
127
  span,
128
- AttributeKeys.GEN_AI_USAGE_INPUT_TOKENS,
128
+ AttributeKeys.JUDGMENT_USAGE_NON_CACHED_INPUT_TOKENS,
129
129
  prompt_tokens,
130
130
  )
131
131
  set_span_attribute(
132
132
  span,
133
- AttributeKeys.GEN_AI_USAGE_OUTPUT_TOKENS,
133
+ AttributeKeys.JUDGMENT_USAGE_OUTPUT_TOKENS,
134
134
  completion_tokens,
135
135
  )
136
136
  set_span_attribute(
137
137
  span,
138
- AttributeKeys.GEN_AI_USAGE_CACHE_READ_INPUT_TOKENS,
138
+ AttributeKeys.JUDGMENT_USAGE_CACHE_READ_INPUT_TOKENS,
139
139
  cache_read,
140
140
  )
141
141
  set_span_attribute(
142
142
  span,
143
- AttributeKeys.GEN_AI_USAGE_CACHE_CREATION_INPUT_TOKENS,
143
+ AttributeKeys.JUDGMENT_USAGE_CACHE_CREATION_INPUT_TOKENS,
144
144
  cache_creation,
145
145
  )
146
146
  set_span_attribute(
@@ -151,7 +151,7 @@ def wrap_messages_stream_sync(tracer: Tracer, client: Anthropic) -> None:
151
151
 
152
152
  set_span_attribute(
153
153
  span,
154
- AttributeKeys.GEN_AI_RESPONSE_MODEL,
154
+ AttributeKeys.JUDGMENT_LLM_MODEL_NAME,
155
155
  final_message.model,
156
156
  )
157
157
  except Exception:
@@ -183,14 +183,14 @@ def wrap_messages_stream_async(tracer: Tracer, client: AsyncAnthropic) -> None:
183
183
  ctx["span"] = tracer.get_tracer().start_span(
184
184
  "ANTHROPIC_API_CALL", attributes={AttributeKeys.JUDGMENT_SPAN_KIND: "llm"}
185
185
  )
186
- tracer.add_agent_attributes_to_span(ctx["span"])
186
+ tracer._inject_judgment_context(ctx["span"])
187
187
  set_span_attribute(
188
188
  ctx["span"], AttributeKeys.GEN_AI_PROMPT, safe_serialize(kwargs)
189
189
  )
190
190
 
191
191
  ctx["model_name"] = kwargs.get("model", "")
192
192
  set_span_attribute(
193
- ctx["span"], AttributeKeys.GEN_AI_REQUEST_MODEL, ctx["model_name"]
193
+ ctx["span"], AttributeKeys.JUDGMENT_LLM_MODEL_NAME, ctx["model_name"]
194
194
  )
195
195
  ctx["accumulated_content"] = ""
196
196
 
@@ -271,22 +271,22 @@ def wrap_messages_stream_async(tracer: Tracer, client: AsyncAnthropic) -> None:
271
271
  ) = _extract_anthropic_tokens(final_message.usage)
272
272
  set_span_attribute(
273
273
  span,
274
- AttributeKeys.GEN_AI_USAGE_INPUT_TOKENS,
274
+ AttributeKeys.JUDGMENT_USAGE_NON_CACHED_INPUT_TOKENS,
275
275
  prompt_tokens,
276
276
  )
277
277
  set_span_attribute(
278
278
  span,
279
- AttributeKeys.GEN_AI_USAGE_OUTPUT_TOKENS,
279
+ AttributeKeys.JUDGMENT_USAGE_OUTPUT_TOKENS,
280
280
  completion_tokens,
281
281
  )
282
282
  set_span_attribute(
283
283
  span,
284
- AttributeKeys.GEN_AI_USAGE_CACHE_READ_INPUT_TOKENS,
284
+ AttributeKeys.JUDGMENT_USAGE_CACHE_READ_INPUT_TOKENS,
285
285
  cache_read,
286
286
  )
287
287
  set_span_attribute(
288
288
  span,
289
- AttributeKeys.GEN_AI_USAGE_CACHE_CREATION_INPUT_TOKENS,
289
+ AttributeKeys.JUDGMENT_USAGE_CACHE_CREATION_INPUT_TOKENS,
290
290
  cache_creation,
291
291
  )
292
292
  set_span_attribute(
@@ -297,7 +297,7 @@ def wrap_messages_stream_async(tracer: Tracer, client: AsyncAnthropic) -> None:
297
297
 
298
298
  set_span_attribute(
299
299
  span,
300
- AttributeKeys.GEN_AI_RESPONSE_MODEL,
300
+ AttributeKeys.JUDGMENT_LLM_MODEL_NAME,
301
301
  final_message.model,
302
302
  )
303
303
  except Exception:
@@ -57,13 +57,13 @@ def wrap_generate_content_sync(tracer: Tracer, client: Client) -> None:
57
57
  ctx["span"] = tracer.get_tracer().start_span(
58
58
  "GOOGLE_API_CALL", attributes={AttributeKeys.JUDGMENT_SPAN_KIND: "llm"}
59
59
  )
60
- tracer.add_agent_attributes_to_span(ctx["span"])
60
+ tracer._inject_judgment_context(ctx["span"])
61
61
  set_span_attribute(
62
62
  ctx["span"], AttributeKeys.GEN_AI_PROMPT, safe_serialize(kwargs)
63
63
  )
64
64
  ctx["model_name"] = kwargs.get("model", "")
65
65
  set_span_attribute(
66
- ctx["span"], AttributeKeys.GEN_AI_REQUEST_MODEL, ctx["model_name"]
66
+ ctx["span"], AttributeKeys.JUDGMENT_LLM_MODEL_NAME, ctx["model_name"]
67
67
  )
68
68
 
69
69
  def post_hook(ctx: Dict[str, Any], result: GenerateContentResponse) -> None:
@@ -79,17 +79,19 @@ def wrap_generate_content_sync(tracer: Tracer, client: Client) -> None:
79
79
  _extract_google_tokens(usage_data)
80
80
  )
81
81
  set_span_attribute(
82
- span, AttributeKeys.GEN_AI_USAGE_INPUT_TOKENS, prompt_tokens
82
+ span,
83
+ AttributeKeys.JUDGMENT_USAGE_NON_CACHED_INPUT_TOKENS,
84
+ prompt_tokens,
83
85
  )
84
86
  set_span_attribute(
85
- span, AttributeKeys.GEN_AI_USAGE_OUTPUT_TOKENS, completion_tokens
87
+ span, AttributeKeys.JUDGMENT_USAGE_OUTPUT_TOKENS, completion_tokens
86
88
  )
87
89
  set_span_attribute(
88
- span, AttributeKeys.GEN_AI_USAGE_CACHE_READ_INPUT_TOKENS, cache_read
90
+ span, AttributeKeys.JUDGMENT_USAGE_CACHE_READ_INPUT_TOKENS, cache_read
89
91
  )
90
92
  set_span_attribute(
91
93
  span,
92
- AttributeKeys.GEN_AI_USAGE_CACHE_CREATION_INPUT_TOKENS,
94
+ AttributeKeys.JUDGMENT_USAGE_CACHE_CREATION_INPUT_TOKENS,
93
95
  cache_creation,
94
96
  )
95
97
  set_span_attribute(
@@ -100,7 +102,7 @@ def wrap_generate_content_sync(tracer: Tracer, client: Client) -> None:
100
102
 
101
103
  set_span_attribute(
102
104
  span,
103
- AttributeKeys.GEN_AI_RESPONSE_MODEL,
105
+ AttributeKeys.JUDGMENT_LLM_MODEL_NAME,
104
106
  result.model_version if result.model_version else ctx["model_name"],
105
107
  )
106
108
 
@@ -16,6 +16,7 @@ from judgeval.utils.wrappers import (
16
16
  immutable_wrap_sync,
17
17
  immutable_wrap_async,
18
18
  )
19
+ from judgeval.tracer.llm.llm_openai.utils import openai_tokens_converter
19
20
 
20
21
  if TYPE_CHECKING:
21
22
  from judgeval.tracer import Tracer
@@ -39,13 +40,13 @@ def _wrap_beta_non_streaming_sync(
39
40
  ctx["span"] = tracer.get_tracer().start_span(
40
41
  "OPENAI_API_CALL", attributes={AttributeKeys.JUDGMENT_SPAN_KIND: "llm"}
41
42
  )
42
- tracer.add_agent_attributes_to_span(ctx["span"])
43
+ tracer._inject_judgment_context(ctx["span"])
43
44
  set_span_attribute(
44
45
  ctx["span"], AttributeKeys.GEN_AI_PROMPT, safe_serialize(kwargs)
45
46
  )
46
47
  ctx["model_name"] = kwargs.get("model", "")
47
48
  set_span_attribute(
48
- ctx["span"], AttributeKeys.GEN_AI_REQUEST_MODEL, ctx["model_name"]
49
+ ctx["span"], AttributeKeys.JUDGMENT_LLM_MODEL_NAME, ctx["model_name"]
49
50
  )
50
51
 
51
52
  def post_hook(ctx: Dict[str, Any], result: ParsedChatCompletion[T]) -> None:
@@ -66,17 +67,29 @@ def _wrap_beta_non_streaming_sync(
66
67
  if prompt_tokens_details:
67
68
  cache_read = prompt_tokens_details.cached_tokens or 0
68
69
 
70
+ prompt_tokens, completion_tokens, cache_read, cache_creation = (
71
+ openai_tokens_converter(
72
+ prompt_tokens,
73
+ completion_tokens,
74
+ cache_read,
75
+ 0,
76
+ usage_data.total_tokens,
77
+ )
78
+ )
79
+
69
80
  set_span_attribute(
70
- span, AttributeKeys.GEN_AI_USAGE_INPUT_TOKENS, prompt_tokens
81
+ span,
82
+ AttributeKeys.JUDGMENT_USAGE_NON_CACHED_INPUT_TOKENS,
83
+ prompt_tokens,
71
84
  )
72
85
  set_span_attribute(
73
- span, AttributeKeys.GEN_AI_USAGE_OUTPUT_TOKENS, completion_tokens
86
+ span, AttributeKeys.JUDGMENT_USAGE_OUTPUT_TOKENS, completion_tokens
74
87
  )
75
88
  set_span_attribute(
76
- span, AttributeKeys.GEN_AI_USAGE_CACHE_READ_INPUT_TOKENS, cache_read
89
+ span, AttributeKeys.JUDGMENT_USAGE_CACHE_READ_INPUT_TOKENS, cache_read
77
90
  )
78
91
  set_span_attribute(
79
- span, AttributeKeys.GEN_AI_USAGE_CACHE_CREATION_INPUT_TOKENS, 0
92
+ span, AttributeKeys.JUDGMENT_USAGE_CACHE_CREATION_INPUT_TOKENS, 0
80
93
  )
81
94
  set_span_attribute(
82
95
  span,
@@ -86,7 +99,7 @@ def _wrap_beta_non_streaming_sync(
86
99
 
87
100
  set_span_attribute(
88
101
  span,
89
- AttributeKeys.GEN_AI_RESPONSE_MODEL,
102
+ AttributeKeys.JUDGMENT_LLM_MODEL_NAME,
90
103
  result.model or ctx["model_name"],
91
104
  )
92
105
 
@@ -122,13 +135,13 @@ def _wrap_beta_non_streaming_async(
122
135
  ctx["span"] = tracer.get_tracer().start_span(
123
136
  "OPENAI_API_CALL", attributes={AttributeKeys.JUDGMENT_SPAN_KIND: "llm"}
124
137
  )
125
- tracer.add_agent_attributes_to_span(ctx["span"])
138
+ tracer._inject_judgment_context(ctx["span"])
126
139
  set_span_attribute(
127
140
  ctx["span"], AttributeKeys.GEN_AI_PROMPT, safe_serialize(kwargs)
128
141
  )
129
142
  ctx["model_name"] = kwargs.get("model", "")
130
143
  set_span_attribute(
131
- ctx["span"], AttributeKeys.GEN_AI_REQUEST_MODEL, ctx["model_name"]
144
+ ctx["span"], AttributeKeys.JUDGMENT_LLM_MODEL_NAME, ctx["model_name"]
132
145
  )
133
146
 
134
147
  def post_hook(ctx: Dict[str, Any], result: ParsedChatCompletion[T]) -> None:
@@ -149,17 +162,28 @@ def _wrap_beta_non_streaming_async(
149
162
  if prompt_tokens_details:
150
163
  cache_read = prompt_tokens_details.cached_tokens or 0
151
164
 
165
+ prompt_tokens, completion_tokens, cache_read, cache_creation = (
166
+ openai_tokens_converter(
167
+ prompt_tokens,
168
+ completion_tokens,
169
+ cache_read,
170
+ 0,
171
+ usage_data.total_tokens,
172
+ )
173
+ )
152
174
  set_span_attribute(
153
- span, AttributeKeys.GEN_AI_USAGE_INPUT_TOKENS, prompt_tokens
175
+ span,
176
+ AttributeKeys.JUDGMENT_USAGE_NON_CACHED_INPUT_TOKENS,
177
+ prompt_tokens,
154
178
  )
155
179
  set_span_attribute(
156
- span, AttributeKeys.GEN_AI_USAGE_OUTPUT_TOKENS, completion_tokens
180
+ span, AttributeKeys.JUDGMENT_USAGE_OUTPUT_TOKENS, completion_tokens
157
181
  )
158
182
  set_span_attribute(
159
- span, AttributeKeys.GEN_AI_USAGE_CACHE_READ_INPUT_TOKENS, cache_read
183
+ span, AttributeKeys.JUDGMENT_USAGE_CACHE_READ_INPUT_TOKENS, cache_read
160
184
  )
161
185
  set_span_attribute(
162
- span, AttributeKeys.GEN_AI_USAGE_CACHE_CREATION_INPUT_TOKENS, 0
186
+ span, AttributeKeys.JUDGMENT_USAGE_CACHE_CREATION_INPUT_TOKENS, 0
163
187
  )
164
188
  set_span_attribute(
165
189
  span,
@@ -169,7 +193,7 @@ def _wrap_beta_non_streaming_async(
169
193
 
170
194
  set_span_attribute(
171
195
  span,
172
- AttributeKeys.GEN_AI_RESPONSE_MODEL,
196
+ AttributeKeys.JUDGMENT_LLM_MODEL_NAME,
173
197
  result.model or ctx["model_name"],
174
198
  )
175
199