opentelemetry-instrumentation-openai 0.34.1__py3-none-any.whl → 0.49.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (22)
  1. opentelemetry/instrumentation/openai/__init__.py +11 -6
  2. opentelemetry/instrumentation/openai/shared/__init__.py +167 -68
  3. opentelemetry/instrumentation/openai/shared/chat_wrappers.py +544 -231
  4. opentelemetry/instrumentation/openai/shared/completion_wrappers.py +143 -81
  5. opentelemetry/instrumentation/openai/shared/config.py +8 -3
  6. opentelemetry/instrumentation/openai/shared/embeddings_wrappers.py +91 -30
  7. opentelemetry/instrumentation/openai/shared/event_emitter.py +108 -0
  8. opentelemetry/instrumentation/openai/shared/event_models.py +41 -0
  9. opentelemetry/instrumentation/openai/shared/image_gen_wrappers.py +1 -1
  10. opentelemetry/instrumentation/openai/shared/span_utils.py +0 -0
  11. opentelemetry/instrumentation/openai/utils.py +42 -9
  12. opentelemetry/instrumentation/openai/v0/__init__.py +32 -11
  13. opentelemetry/instrumentation/openai/v1/__init__.py +177 -69
  14. opentelemetry/instrumentation/openai/v1/assistant_wrappers.py +208 -109
  15. opentelemetry/instrumentation/openai/v1/event_handler_wrapper.py +41 -19
  16. opentelemetry/instrumentation/openai/v1/responses_wrappers.py +1073 -0
  17. opentelemetry/instrumentation/openai/version.py +1 -1
  18. {opentelemetry_instrumentation_openai-0.34.1.dist-info → opentelemetry_instrumentation_openai-0.49.3.dist-info}/METADATA +7 -8
  19. opentelemetry_instrumentation_openai-0.49.3.dist-info/RECORD +21 -0
  20. {opentelemetry_instrumentation_openai-0.34.1.dist-info → opentelemetry_instrumentation_openai-0.49.3.dist-info}/WHEEL +1 -1
  21. opentelemetry_instrumentation_openai-0.34.1.dist-info/RECORD +0 -17
  22. {opentelemetry_instrumentation_openai-0.34.1.dist-info → opentelemetry_instrumentation_openai-0.49.3.dist-info}/entry_points.txt +0 -0
@@ -1,48 +1,59 @@
  import copy
  import json
  import logging
+ import threading
  import time
- from opentelemetry.instrumentation.openai.shared.config import Config
- from wrapt import ObjectProxy
-
+ from functools import singledispatch
+ from typing import List, Optional, Union

  from opentelemetry import context as context_api
- from opentelemetry.metrics import Counter, Histogram
- from opentelemetry.semconv_ai import (
- SUPPRESS_LANGUAGE_MODEL_INSTRUMENTATION_KEY,
- SpanAttributes,
- LLMRequestTypeValues,
- )
-
- from opentelemetry.instrumentation.utils import _SUPPRESS_INSTRUMENTATION_KEY
- from opentelemetry.instrumentation.openai.utils import (
- _with_chat_telemetry_wrapper,
- dont_throw,
- run_async,
- )
+ import pydantic
  from opentelemetry.instrumentation.openai.shared import (
- metric_shared_attributes,
+ OPENAI_LLM_USAGE_TOKEN_TYPES,
+ _get_openai_base_url,
  _set_client_attributes,
+ _set_functions_attributes,
  _set_request_attributes,
+ _set_response_attributes,
  _set_span_attribute,
- _set_functions_attributes,
+ _set_span_stream_usage,
  _token_type,
- set_tools_attributes,
- _set_response_attributes,
  is_streaming_response,
- should_send_prompts,
+ metric_shared_attributes,
  model_as_dict,
- _get_openai_base_url,
- OPENAI_LLM_USAGE_TOKEN_TYPES,
- should_record_stream_token_usage,
- get_token_count_from_string,
- _set_span_stream_usage,
  propagate_trace_context,
+ set_tools_attributes,
+ )
+ from opentelemetry.instrumentation.openai.shared.config import Config
+ from opentelemetry.instrumentation.openai.shared.event_emitter import emit_event
+ from opentelemetry.instrumentation.openai.shared.event_models import (
+ ChoiceEvent,
+ MessageEvent,
+ ToolCall,
+ )
+ from opentelemetry.instrumentation.openai.utils import (
+ _with_chat_telemetry_wrapper,
+ dont_throw,
+ is_openai_v1,
+ run_async,
+ should_emit_events,
+ should_send_prompts,
+ )
+ from opentelemetry.instrumentation.utils import _SUPPRESS_INSTRUMENTATION_KEY
+ from opentelemetry.metrics import Counter, Histogram
+ from opentelemetry.semconv.attributes.error_attributes import ERROR_TYPE
+ from opentelemetry.semconv._incubating.attributes import (
+ gen_ai_attributes as GenAIAttributes,
+ )
+ from opentelemetry.semconv_ai import (
+ SUPPRESS_LANGUAGE_MODEL_INSTRUMENTATION_KEY,
+ LLMRequestTypeValues,
+ SpanAttributes,
  )
  from opentelemetry.trace import SpanKind, Tracer
+ from opentelemetry import trace
  from opentelemetry.trace.status import Status, StatusCode
-
- from opentelemetry.instrumentation.openai.utils import is_openai_v1
+ from wrapt import ObjectProxy

  SPAN_NAME = "openai.chat"
  PROMPT_FILTER_KEY = "prompt_filter_results"
@@ -79,70 +90,77 @@ def chat_wrapper(
  attributes={SpanAttributes.LLM_REQUEST_TYPE: LLM_REQUEST_TYPE.value},
  )

- run_async(_handle_request(span, kwargs, instance))
+ # Use the span as current context to ensure events get proper trace context
+ with trace.use_span(span, end_on_exit=False):
+ run_async(_handle_request(span, kwargs, instance))
+ try:
+ start_time = time.time()
+ response = wrapped(*args, **kwargs)
+ end_time = time.time()
+ except Exception as e: # pylint: disable=broad-except
+ end_time = time.time()
+ duration = end_time - start_time if "start_time" in locals() else 0
+
+ attributes = {
+ "error.type": e.__class__.__name__,
+ }

- try:
- start_time = time.time()
- response = wrapped(*args, **kwargs)
- end_time = time.time()
- except Exception as e: # pylint: disable=broad-except
- end_time = time.time()
- duration = end_time - start_time if "start_time" in locals() else 0
-
- attributes = {
- "error.type": e.__class__.__name__,
- }
-
- if duration > 0 and duration_histogram:
- duration_histogram.record(duration, attributes=attributes)
- if exception_counter:
- exception_counter.add(1, attributes=attributes)
-
- raise e
-
- if is_streaming_response(response):
- # span will be closed after the generator is done
- if is_openai_v1():
- return ChatStream(
- span,
- response,
- instance,
- token_counter,
- choice_counter,
- duration_histogram,
- streaming_time_to_first_token,
- streaming_time_to_generate,
- start_time,
- kwargs,
- )
- else:
- return _build_from_streaming_response(
- span,
- response,
- instance,
- token_counter,
- choice_counter,
- duration_histogram,
- streaming_time_to_first_token,
- streaming_time_to_generate,
- start_time,
- kwargs,
- )
+ if duration > 0 and duration_histogram:
+ duration_histogram.record(duration, attributes=attributes)
+ if exception_counter:
+ exception_counter.add(1, attributes=attributes)

- duration = end_time - start_time
+ span.set_attribute(ERROR_TYPE, e.__class__.__name__)
+ span.record_exception(e)
+ span.set_status(Status(StatusCode.ERROR, str(e)))
+ span.end()

- _handle_response(
- response,
- span,
- instance,
- token_counter,
- choice_counter,
- duration_histogram,
- duration,
- )
- span.end()
+ raise

- return response
+ if is_streaming_response(response):
+ # span will be closed after the generator is done
+ if is_openai_v1():
+ return ChatStream(
+ span,
+ response,
+ instance,
+ token_counter,
+ choice_counter,
+ duration_histogram,
+ streaming_time_to_first_token,
+ streaming_time_to_generate,
+ start_time,
+ kwargs,
+ )
+ else:
+ return _build_from_streaming_response(
+ span,
+ response,
+ instance,
+ token_counter,
+ choice_counter,
+ duration_histogram,
+ streaming_time_to_first_token,
+ streaming_time_to_generate,
+ start_time,
+ kwargs,
+ )
+
+ duration = end_time - start_time
+
+ _handle_response(
+ response,
+ span,
+ instance,
+ token_counter,
+ choice_counter,
+ duration_histogram,
+ duration,
+ )
+
+ span.end()
+
+ return response


  @_with_chat_telemetry_wrapper
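Note on the hunk above: the reworked `chat_wrapper` does its failure accounting inside the `except` branch before re-raising, recording the duration histogram and exception counter with an `error.type` attribute and marking the span as an error. A minimal, self-contained sketch of that pattern (the meter, metric names, and `call_openai` callable are illustrative, not taken from the package):

```python
import time

from opentelemetry import metrics

# Illustrative meter and metric names; the package wires these up differently.
meter = metrics.get_meter("openai.instrumentation.example")
duration_histogram = meter.create_histogram("llm.operation.duration", unit="s")
exception_counter = meter.create_counter("llm.openai.chat_completions.exceptions")


def call_with_error_metrics(call_openai, *args, **kwargs):
    """Record duration and an exception count tagged with error.type, then re-raise."""
    start_time = time.time()
    try:
        return call_openai(*args, **kwargs)
    except Exception as e:
        duration = time.time() - start_time
        attributes = {"error.type": e.__class__.__name__}
        if duration > 0:
            duration_histogram.record(duration, attributes=attributes)
        exception_counter.add(1, attributes=attributes)
        raise
```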
@@ -169,87 +187,115 @@ async def achat_wrapper(
  kind=SpanKind.CLIENT,
  attributes={SpanAttributes.LLM_REQUEST_TYPE: LLM_REQUEST_TYPE.value},
  )
- await _handle_request(span, kwargs, instance)

- try:
- start_time = time.time()
- response = await wrapped(*args, **kwargs)
- end_time = time.time()
- except Exception as e: # pylint: disable=broad-except
- end_time = time.time()
- duration = end_time - start_time if "start_time" in locals() else 0
-
- common_attributes = Config.get_common_metrics_attributes()
- attributes = {
- **common_attributes,
- "error.type": e.__class__.__name__,
- }
-
- if duration > 0 and duration_histogram:
- duration_histogram.record(duration, attributes=attributes)
- if exception_counter:
- exception_counter.add(1, attributes=attributes)
-
- raise e
-
- if is_streaming_response(response):
- # span will be closed after the generator is done
- if is_openai_v1():
- return ChatStream(
- span,
- response,
- instance,
- token_counter,
- choice_counter,
- duration_histogram,
- streaming_time_to_first_token,
- streaming_time_to_generate,
- start_time,
- kwargs,
- )
- else:
- return _abuild_from_streaming_response(
- span,
- response,
- instance,
- token_counter,
- choice_counter,
- duration_histogram,
- streaming_time_to_first_token,
- streaming_time_to_generate,
- start_time,
- kwargs,
- )
+ # Use the span as current context to ensure events get proper trace context
+ with trace.use_span(span, end_on_exit=False):
+ await _handle_request(span, kwargs, instance)

- duration = end_time - start_time
+ try:
+ start_time = time.time()
+ response = await wrapped(*args, **kwargs)
+ end_time = time.time()
+ except Exception as e: # pylint: disable=broad-except
+ end_time = time.time()
+ duration = end_time - start_time if "start_time" in locals() else 0
+
+ common_attributes = Config.get_common_metrics_attributes()
+ attributes = {
+ **common_attributes,
+ "error.type": e.__class__.__name__,
+ }

- _handle_response(
- response,
- span,
- instance,
- token_counter,
- choice_counter,
- duration_histogram,
- duration,
- )
- span.end()
+ if duration > 0 and duration_histogram:
+ duration_histogram.record(duration, attributes=attributes)
+ if exception_counter:
+ exception_counter.add(1, attributes=attributes)

- return response
+ span.set_attribute(ERROR_TYPE, e.__class__.__name__)
+ span.record_exception(e)
+ span.set_status(Status(StatusCode.ERROR, str(e)))
+ span.end()
+
+ raise
+
+ if is_streaming_response(response):
+ # span will be closed after the generator is done
+ if is_openai_v1():
+ return ChatStream(
+ span,
+ response,
+ instance,
+ token_counter,
+ choice_counter,
+ duration_histogram,
+ streaming_time_to_first_token,
+ streaming_time_to_generate,
+ start_time,
+ kwargs,
+ )
+ else:
+ return _abuild_from_streaming_response(
+ span,
+ response,
+ instance,
+ token_counter,
+ choice_counter,
+ duration_histogram,
+ streaming_time_to_first_token,
+ streaming_time_to_generate,
+ start_time,
+ kwargs,
+ )
+
+ duration = end_time - start_time
+
+ _handle_response(
+ response,
+ span,
+ instance,
+ token_counter,
+ choice_counter,
+ duration_histogram,
+ duration,
+ )
+
+ span.end()
+
+ return response


  @dont_throw
  async def _handle_request(span, kwargs, instance):
- _set_request_attributes(span, kwargs)
+ _set_request_attributes(span, kwargs, instance)
  _set_client_attributes(span, instance)
- if should_send_prompts():
- await _set_prompts(span, kwargs.get("messages"))
- if kwargs.get("functions"):
- _set_functions_attributes(span, kwargs.get("functions"))
- elif kwargs.get("tools"):
- set_tools_attributes(span, kwargs.get("tools"))
+ if should_emit_events():
+ for message in kwargs.get("messages", []):
+ emit_event(
+ MessageEvent(
+ content=message.get("content"),
+ role=message.get("role"),
+ tool_calls=_parse_tool_calls(
+ message.get("tool_calls", None)),
+ )
+ )
+ else:
+ if should_send_prompts():
+ await _set_prompts(span, kwargs.get("messages"))
+ if kwargs.get("functions"):
+ _set_functions_attributes(span, kwargs.get("functions"))
+ elif kwargs.get("tools"):
+ set_tools_attributes(span, kwargs.get("tools"))
  if Config.enable_trace_context_propagation:
  propagate_trace_context(span, kwargs)

+ # Reasoning request attributes
+ reasoning_effort = kwargs.get("reasoning_effort")
+ _set_span_attribute(
+ span,
+ SpanAttributes.LLM_REQUEST_REASONING_EFFORT,
+ reasoning_effort or ()
+ )
+

  @dont_throw
  def _handle_response(
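Note on the hunk above: `_handle_request` now chooses between two telemetry modes; when `should_emit_events()` is true it emits one event per request message, otherwise it falls back to recording prompts and tool definitions as span attributes. A rough sketch of that branching, using stand-in types whose field names mirror the `MessageEvent(...)` call shown above but are not the package's actual event model:

```python
from dataclasses import dataclass
from typing import Any, List, Optional


# Stand-ins only: these are not the classes from
# opentelemetry.instrumentation.openai.shared.event_models.
@dataclass
class MessageEvent:
    content: Any
    role: str
    tool_calls: Optional[List[dict]] = None


def emit_event(event: MessageEvent) -> None:
    # The real emitter hands this to an OpenTelemetry event logger; print for illustration.
    print("emit:", event)


def handle_request(messages: List[dict], emit_events: bool, record_prompts: bool) -> None:
    """Event mode emits one event per message; legacy mode records span attributes instead."""
    if emit_events:
        for message in messages:
            emit_event(
                MessageEvent(
                    content=message.get("content"),
                    role=message.get("role"),
                    tool_calls=message.get("tool_calls"),
                )
            )
    elif record_prompts:
        for i, message in enumerate(messages):
            # legacy path: gen_ai.prompt.{i}.role / .content span attributes
            print(f"gen_ai.prompt.{i}.role", message.get("role"))
```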
@@ -260,6 +306,7 @@ def _handle_response(
  choice_counter=None,
  duration_histogram=None,
  duration=None,
+ is_streaming: bool = False,
  ):
  if is_openai_v1():
  response_dict = model_as_dict(response)
@@ -274,25 +321,59 @@ def _handle_response(
  duration_histogram,
  response_dict,
  duration,
+ is_streaming,
  )

  # span attributes
  _set_response_attributes(span, response_dict)

- if should_send_prompts():
- _set_completions(span, response_dict.get("choices"))
+ # Reasoning usage attributes
+ usage = response_dict.get("usage")
+ reasoning_tokens = None
+ if usage:
+ # Support both dict-style and object-style `usage`
+ tokens_details = (
+ usage.get("completion_tokens_details") if isinstance(usage, dict)
+ else getattr(usage, "completion_tokens_details", None)
+ )
+
+ if tokens_details:
+ reasoning_tokens = (
+ tokens_details.get("reasoning_tokens", None) if isinstance(tokens_details, dict)
+ else getattr(tokens_details, "reasoning_tokens", None)
+ )
+
+ _set_span_attribute(
+ span,
+ SpanAttributes.LLM_USAGE_REASONING_TOKENS,
+ reasoning_tokens or 0,
+ )
+
+ if should_emit_events():
+ if response.choices is not None:
+ for choice in response.choices:
+ emit_event(_parse_choice_event(choice))
+ else:
+ if should_send_prompts():
+ _set_completions(span, response_dict.get("choices"))

  return response


  def _set_chat_metrics(
- instance, token_counter, choice_counter, duration_histogram, response_dict, duration
+ instance,
+ token_counter,
+ choice_counter,
+ duration_histogram,
+ response_dict,
+ duration,
+ is_streaming: bool = False,
  ):
  shared_attributes = metric_shared_attributes(
  response_model=response_dict.get("model") or None,
  operation="chat",
  server_address=_get_openai_base_url(instance),
- is_streaming=False,
+ is_streaming=is_streaming,
  )

  # token metrics
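Note on the hunk above: the new reasoning-usage block reads `completion_tokens_details.reasoning_tokens` while tolerating both plain dicts and pydantic-style objects. The same dict-or-object handling, isolated into a small helper for illustration (the helper name is hypothetical, not part of the package):

```python
from typing import Any, Optional


def extract_reasoning_tokens(usage: Any) -> Optional[int]:
    """Mirror of the dict-or-object handling used in _handle_response."""
    if not usage:
        return None
    details = (
        usage.get("completion_tokens_details")
        if isinstance(usage, dict)
        else getattr(usage, "completion_tokens_details", None)
    )
    if not details:
        return None
    return (
        details.get("reasoning_tokens")
        if isinstance(details, dict)
        else getattr(details, "reasoning_tokens", None)
    )


# Works for a raw response dict...
assert extract_reasoning_tokens({"completion_tokens_details": {"reasoning_tokens": 128}}) == 128
# ...and returns None when the provider reports no reasoning details.
assert extract_reasoning_tokens({"prompt_tokens": 10, "completion_tokens": 5}) is None
```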
@@ -326,7 +407,7 @@ def _set_token_counter_metrics(token_counter, usage, shared_attributes):
  if name in OPENAI_LLM_USAGE_TOKEN_TYPES:
  attributes_with_token_type = {
  **shared_attributes,
- SpanAttributes.LLM_TOKEN_TYPE: _token_type(name),
+ GenAIAttributes.GEN_AI_TOKEN_TYPE: _token_type(name),
  }
  token_counter.record(val, attributes=attributes_with_token_type)

@@ -351,7 +432,8 @@ async def _process_image_item(item, trace_id, span_id, message_index, content_in
  image_format = item["image_url"]["url"].split(";")[0].split("/")[1]
  image_name = f"message_{message_index}_content_{content_index}.{image_format}"
  base64_string = item["image_url"]["url"].split(",")[1]
- url = await Config.upload_base64_image(trace_id, span_id, image_name, base64_string)
+ # Convert trace_id and span_id to strings as expected by upload function
+ url = await Config.upload_base64_image(str(trace_id), str(span_id), image_name, base64_string)

  return {"type": "image_url", "image_url": {"url": url}}

@@ -362,7 +444,8 @@ async def _set_prompts(span, messages):
  return

  for i, msg in enumerate(messages):
- prefix = f"{SpanAttributes.LLM_PROMPTS}.{i}"
+ prefix = f"{GenAIAttributes.GEN_AI_PROMPT}.{i}"
+ msg = msg if isinstance(msg, dict) else model_as_dict(msg)

  _set_span_attribute(span, f"{prefix}.role", msg.get("role"))
  if msg.get("content"):
@@ -382,7 +465,8 @@ async def _set_prompts(span, messages):
  content = json.dumps(content)
  _set_span_attribute(span, f"{prefix}.content", content)
  if msg.get("tool_call_id"):
- _set_span_attribute(span, f"{prefix}.tool_call_id", msg.get("tool_call_id"))
+ _set_span_attribute(
+ span, f"{prefix}.tool_call_id", msg.get("tool_call_id"))
  tool_calls = msg.get("tool_calls")
  if tool_calls:
  for i, tool_call in enumerate(tool_calls):
@@ -413,7 +497,7 @@ def _set_completions(span, choices):

  for choice in choices:
  index = choice.get("index")
- prefix = f"{SpanAttributes.LLM_COMPLETIONS}.{index}"
+ prefix = f"{GenAIAttributes.GEN_AI_COMPLETION}.{index}"
  _set_span_attribute(
  span, f"{prefix}.finish_reason", choice.get("finish_reason")
  )
@@ -438,9 +522,11 @@ def _set_completions(span, choices):
  _set_span_attribute(span, f"{prefix}.role", message.get("role"))

  if message.get("refusal"):
- _set_span_attribute(span, f"{prefix}.refusal", message.get("refusal"))
+ _set_span_attribute(
+ span, f"{prefix}.refusal", message.get("refusal"))
  else:
- _set_span_attribute(span, f"{prefix}.content", message.get("content"))
+ _set_span_attribute(
+ span, f"{prefix}.content", message.get("content"))

  function_call = message.get("function_call")
  if function_call:
@@ -478,60 +564,34 @@ def _set_completions(span, choices):
  def _set_streaming_token_metrics(
  request_kwargs, complete_response, span, token_counter, shared_attributes
  ):
- # use tiktoken calculate token usage
- if not should_record_stream_token_usage():
- return
-
- # kwargs={'model': 'gpt-3.5', 'messages': [{'role': 'user', 'content': '...'}], 'stream': True}
  prompt_usage = -1
  completion_usage = -1

- # prompt_usage
- if request_kwargs and request_kwargs.get("messages"):
- prompt_content = ""
- # setting the default model_name as gpt-4. As this uses the embedding "cl100k_base" that
- # is used by most of the other model.
- model_name = (
- complete_response.get("model") or request_kwargs.get("model") or "gpt-4"
- )
- for msg in request_kwargs.get("messages"):
- if msg.get("content"):
- prompt_content += msg.get("content")
- if model_name:
- prompt_usage = get_token_count_from_string(prompt_content, model_name)
-
- # completion_usage
- if complete_response.get("choices"):
- completion_content = ""
- # setting the default model_name as gpt-4. As this uses the embedding "cl100k_base" that
- # is used by most of the other model.
- model_name = complete_response.get("model") or "gpt-4"
-
- for choice in complete_response.get("choices"):
- if choice.get("message") and choice.get("message").get("content"):
- completion_content += choice["message"]["content"]
-
- if model_name:
- completion_usage = get_token_count_from_string(
- completion_content, model_name
- )
+ # Use token usage from API response only
+ if complete_response.get("usage"):
+ usage = complete_response["usage"]
+ if usage.get("prompt_tokens"):
+ prompt_usage = usage["prompt_tokens"]
+ if usage.get("completion_tokens"):
+ completion_usage = usage["completion_tokens"]

  # span record
  _set_span_stream_usage(span, prompt_usage, completion_usage)

  # metrics record
  if token_counter:
- if type(prompt_usage) is int and prompt_usage >= 0:
+ if isinstance(prompt_usage, int) and prompt_usage >= 0:
  attributes_with_token_type = {
  **shared_attributes,
- SpanAttributes.LLM_TOKEN_TYPE: "input",
+ GenAIAttributes.GEN_AI_TOKEN_TYPE: "input",
  }
- token_counter.record(prompt_usage, attributes=attributes_with_token_type)
+ token_counter.record(
+ prompt_usage, attributes=attributes_with_token_type)

- if type(completion_usage) is int and completion_usage >= 0:
+ if isinstance(completion_usage, int) and completion_usage >= 0:
  attributes_with_token_type = {
  **shared_attributes,
- SpanAttributes.LLM_TOKEN_TYPE: "output",
+ GenAIAttributes.GEN_AI_TOKEN_TYPE: "output",
  }
  token_counter.record(
  completion_usage, attributes=attributes_with_token_type
@@ -579,11 +639,34 @@ class ChatStream(ObjectProxy):
  self._time_of_first_token = self._start_time
  self._complete_response = {"choices": [], "model": ""}

+ # Cleanup state tracking to prevent duplicate operations
+ self._cleanup_completed = False
+ self._cleanup_lock = threading.Lock()
+
+ def __del__(self):
+ """Cleanup when object is garbage collected"""
+ if hasattr(self, '_cleanup_completed') and not self._cleanup_completed:
+ self._ensure_cleanup()
+
  def __enter__(self):
  return self

  def __exit__(self, exc_type, exc_val, exc_tb):
- self.__wrapped__.__exit__(exc_type, exc_val, exc_tb)
+ cleanup_exception = None
+ try:
+ self._ensure_cleanup()
+ except Exception as e:
+ cleanup_exception = e
+ # Don't re-raise to avoid masking original exception
+
+ result = self.__wrapped__.__exit__(exc_type, exc_val, exc_tb)
+
+ if cleanup_exception:
+ # Log cleanup exception but don't affect context manager behavior
+ logger.debug(
+ "Error during ChatStream cleanup in __exit__: %s", cleanup_exception)
+
+ return result

  async def __aenter__(self):
  return self
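Note on the hunk above: `ChatStream.__exit__` now runs its own cleanup before delegating to the wrapped stream and swallows cleanup errors so they cannot mask the caller's exception. A stripped-down sketch of that idempotent, lock-guarded pattern (a plain wrapper class for illustration, rather than the package's `ObjectProxy` subclass):

```python
import logging
import threading

logger = logging.getLogger(__name__)


class InstrumentedStream:
    """Illustrative wrapper only; not the package's ChatStream."""

    def __init__(self, wrapped):
        self._wrapped = wrapped
        self._cleanup_completed = False
        self._cleanup_lock = threading.Lock()

    def _ensure_cleanup(self):
        # Idempotent and thread-safe: spans/metrics are finalized at most once.
        with self._cleanup_lock:
            if self._cleanup_completed:
                return
            # ...close the span and flush partial metrics here...
            self._cleanup_completed = True

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        try:
            self._ensure_cleanup()
        except Exception as e:  # never let telemetry cleanup mask the caller's exception
            logger.debug("cleanup failed: %s", e)
        return self._wrapped.__exit__(exc_type, exc_val, exc_tb)
```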
@@ -602,8 +685,13 @@ class ChatStream(ObjectProxy):
  chunk = self.__wrapped__.__next__()
  except Exception as e:
  if isinstance(e, StopIteration):
- self._close_span()
- raise e
+ self._process_complete_response()
+ else:
+ # Handle cleanup for other exceptions during stream iteration
+ self._ensure_cleanup()
+ if self._span and self._span.is_recording():
+ self._span.set_status(Status(StatusCode.ERROR, str(e)))
+ raise
  else:
  self._process_item(chunk)
  return chunk
@@ -613,14 +701,20 @@ class ChatStream(ObjectProxy):
  chunk = await self.__wrapped__.__anext__()
  except Exception as e:
  if isinstance(e, StopAsyncIteration):
- self._close_span()
- raise e
+ self._process_complete_response()
+ else:
+ # Handle cleanup for other exceptions during stream iteration
+ self._ensure_cleanup()
+ if self._span and self._span.is_recording():
+ self._span.set_status(Status(StatusCode.ERROR, str(e)))
+ raise
  else:
  self._process_item(chunk)
  return chunk

  def _process_item(self, item):
- self._span.add_event(name=f"{SpanAttributes.LLM_CONTENT_COMPLETION_CHUNK}")
+ self._span.add_event(
+ name=f"{SpanAttributes.LLM_CONTENT_COMPLETION_CHUNK}")

  if self._first_token and self._streaming_time_to_first_token:
  self._time_of_first_token = time.time()
@@ -643,7 +737,7 @@ class ChatStream(ObjectProxy):
  )

  @dont_throw
- def _close_span(self):
+ def _process_complete_response(self):
  _set_streaming_token_metrics(
  self._request_kwargs,
  self._complete_response,
@@ -676,12 +770,87 @@ class ChatStream(ObjectProxy):
  )

  _set_response_attributes(self._span, self._complete_response)
-
- if should_send_prompts():
- _set_completions(self._span, self._complete_response.get("choices"))
+ if should_emit_events():
+ for choice in self._complete_response.get("choices", []):
+ emit_event(_parse_choice_event(choice))
+ else:
+ if should_send_prompts():
+ _set_completions(
+ self._span, self._complete_response.get("choices"))

  self._span.set_status(Status(StatusCode.OK))
  self._span.end()
+ self._cleanup_completed = True
+
+ @dont_throw
+ def _ensure_cleanup(self):
+ """Thread-safe cleanup method that handles different cleanup scenarios"""
+ with self._cleanup_lock:
+ if self._cleanup_completed:
+ logger.debug("ChatStream cleanup already completed, skipping")
+ return
+
+ try:
+ logger.debug("Starting ChatStream cleanup")
+
+ # Calculate partial metrics based on available data
+ self._record_partial_metrics()
+
+ # Set span status and close it
+ if self._span and self._span.is_recording():
+ self._span.set_status(Status(StatusCode.OK))
+ self._span.end()
+ logger.debug("ChatStream span closed successfully")
+
+ self._cleanup_completed = True
+ logger.debug("ChatStream cleanup completed successfully")
+
+ except Exception as e:
+ # Log cleanup errors but don't propagate to avoid masking original issues
+ logger.debug("Error during ChatStream cleanup: %s", str(e))
+
+ # Still try to close the span even if metrics recording failed
+ try:
+ if self._span and self._span.is_recording():
+ self._span.set_status(
+ Status(StatusCode.ERROR, "Cleanup failed"))
+ self._span.end()
+ self._cleanup_completed = True
+ except Exception:
+ # Final fallback - just mark as completed to prevent infinite loops
+ self._cleanup_completed = True
+
+ @dont_throw
+ def _record_partial_metrics(self):
+ """Record metrics based on available partial data"""
+ # Always record duration if we have start time
+ if self._start_time and isinstance(self._start_time, (float, int)) and self._duration_histogram:
+ duration = time.time() - self._start_time
+ self._duration_histogram.record(
+ duration, attributes=self._shared_attributes()
+ )
+
+ # Record basic span attributes even without complete response
+ if self._span and self._span.is_recording():
+ _set_response_attributes(self._span, self._complete_response)
+
+ # Record partial token metrics if we have any data
+ if self._complete_response.get("choices") or self._request_kwargs:
+ _set_streaming_token_metrics(
+ self._request_kwargs,
+ self._complete_response,
+ self._span,
+ self._token_counter,
+ self._shared_attributes(),
+ )
+
+ # Record choice metrics if we have any choices processed
+ if self._choice_counter and self._complete_response.get("choices"):
+ _set_choice_counter_metrics(
+ self._choice_counter,
+ self._complete_response.get("choices"),
+ self._shared_attributes(),
+ )


  # Backward compatibility with OpenAI v0
@@ -700,7 +869,7 @@ def _build_from_streaming_response(
  start_time=None,
  request_kwargs=None,
  ):
- complete_response = {"choices": [], "model": ""}
+ complete_response = {"choices": [], "model": "", "id": ""}

  first_token = True
  time_of_first_token = start_time # will be updated when first token is received
@@ -712,7 +881,8 @@ def _build_from_streaming_response(

  if first_token and streaming_time_to_first_token:
  time_of_first_token = time.time()
- streaming_time_to_first_token.record(time_of_first_token - start_time)
+ streaming_time_to_first_token.record(
+ time_of_first_token - start_time)
  first_token = False

  _accumulate_stream_items(item, complete_response)
@@ -720,7 +890,7 @@ def _build_from_streaming_response(
  yield item_to_yield

  shared_attributes = {
- SpanAttributes.LLM_RESPONSE_MODEL: complete_response.get("model") or None,
+ GenAIAttributes.GEN_AI_RESPONSE_MODEL: complete_response.get("model") or None,
  "server.address": _get_openai_base_url(instance),
  "stream": True,
  }
@@ -746,9 +916,12 @@ def _build_from_streaming_response(
  streaming_time_to_generate.record(time.time() - time_of_first_token)

  _set_response_attributes(span, complete_response)
-
- if should_send_prompts():
- _set_completions(span, complete_response.get("choices"))
+ if should_emit_events():
+ for choice in complete_response.get("choices", []):
+ emit_event(_parse_choice_event(choice))
+ else:
+ if should_send_prompts():
+ _set_completions(span, complete_response.get("choices"))

  span.set_status(Status(StatusCode.OK))
  span.end()
@@ -767,7 +940,7 @@ async def _abuild_from_streaming_response(
  start_time=None,
  request_kwargs=None,
  ):
- complete_response = {"choices": [], "model": ""}
+ complete_response = {"choices": [], "model": "", "id": ""}
  first_token = True
  time_of_first_token = start_time # will be updated when first token is received

@@ -779,7 +952,8 @@ async def _abuild_from_streaming_response(

  if first_token and streaming_time_to_first_token:
  time_of_first_token = time.time()
- streaming_time_to_first_token.record(time_of_first_token - start_time)
+ streaming_time_to_first_token.record(
+ time_of_first_token - start_time)
  first_token = False

  _accumulate_stream_items(item, complete_response)
@@ -787,7 +961,7 @@ async def _abuild_from_streaming_response(
  yield item_to_yield

  shared_attributes = {
- SpanAttributes.LLM_RESPONSE_MODEL: complete_response.get("model") or None,
+ GenAIAttributes.GEN_AI_RESPONSE_MODEL: complete_response.get("model") or None,
  "server.address": _get_openai_base_url(instance),
  "stream": True,
  }
@@ -813,23 +987,161 @@ async def _abuild_from_streaming_response(
  streaming_time_to_generate.record(time.time() - time_of_first_token)

  _set_response_attributes(span, complete_response)
-
- if should_send_prompts():
- _set_completions(span, complete_response.get("choices"))
+ if should_emit_events():
+ for choice in complete_response.get("choices", []):
+ emit_event(_parse_choice_event(choice))
+ else:
+ if should_send_prompts():
+ _set_completions(span, complete_response.get("choices"))

  span.set_status(Status(StatusCode.OK))
  span.end()


+ # pydantic.BaseModel here is ChatCompletionMessageFunctionToolCall (as of openai 1.99.7)
+ # but we keep to a parent type to support older versions
+ def _parse_tool_calls(
+ tool_calls: Optional[List[Union[dict, pydantic.BaseModel]]],
+ ) -> Union[List[ToolCall], None]:
+ """
+ Util to correctly parse the tool calls data from the OpenAI API to this module's
+ standard `ToolCall`.
+ """
+ if tool_calls is None:
+ return tool_calls
+
+ result = []
+
+ for tool_call in tool_calls:
+ tool_call_data = None
+
+ if isinstance(tool_call, dict):
+ tool_call_data = copy.deepcopy(tool_call)
+ elif _is_chat_message_function_tool_call(tool_call):
+ tool_call_data = tool_call.model_dump()
+ elif _is_function_call(tool_call):
+ function_call = tool_call.model_dump()
+ tool_call_data = ToolCall(
+ id="",
+ function={
+ "name": function_call.get("name"),
+ "arguments": function_call.get("arguments"),
+ },
+ type="function",
+ )
+
+ result.append(tool_call_data)
+ return result
+
+
+ def _is_chat_message_function_tool_call(model: Union[dict, pydantic.BaseModel]) -> bool:
+ try:
+ from openai.types.chat.chat_completion_message_function_tool_call import (
+ ChatCompletionMessageFunctionToolCall,
+ )
+
+ return isinstance(model, ChatCompletionMessageFunctionToolCall)
+ except Exception:
+ try:
+ # Since OpenAI 1.99.3, ChatCompletionMessageToolCall is a Union,
+ # and the isinstance check will fail. This is fine, because in all
+ # those versions, the check above will succeed.
+ from openai.types.chat.chat_completion_message_tool_call import (
+ ChatCompletionMessageToolCall,
+ )
+ return isinstance(model, ChatCompletionMessageToolCall)
+ except Exception:
+ return False
+
+
+ def _is_function_call(model: Union[dict, pydantic.BaseModel]) -> bool:
+ try:
+ from openai.types.chat.chat_completion_message import FunctionCall
+ return isinstance(model, FunctionCall)
+ except Exception:
+ return False
+
+
+ @singledispatch
+ def _parse_choice_event(choice) -> ChoiceEvent:
+ has_message = choice.message is not None
+ has_finish_reason = choice.finish_reason is not None
+ has_tool_calls = has_message and choice.message.tool_calls
+ has_function_call = has_message and choice.message.function_call
+
+ content = choice.message.content if has_message else None
+ role = choice.message.role if has_message else "unknown"
+ finish_reason = choice.finish_reason if has_finish_reason else "unknown"
+
+ if has_tool_calls and has_function_call:
+ tool_calls = choice.message.tool_calls + [choice.message.function_call]
+ elif has_tool_calls:
+ tool_calls = choice.message.tool_calls
+ elif has_function_call:
+ tool_calls = [choice.message.function_call]
+ else:
+ tool_calls = None
+
+ return ChoiceEvent(
+ index=choice.index,
+ message={"content": content, "role": role},
+ finish_reason=finish_reason,
+ tool_calls=_parse_tool_calls(tool_calls),
+ )
+
+
+ @_parse_choice_event.register
+ def _(choice: dict) -> ChoiceEvent:
+ message = choice.get("message")
+ has_message = message is not None
+ has_finish_reason = choice.get("finish_reason") is not None
+ has_tool_calls = has_message and message.get("tool_calls")
+ has_function_call = has_message and message.get("function_call")
+
+ content = choice.get("message").get("content", "") if has_message else None
+ role = choice.get("message").get("role") if has_message else "unknown"
+ finish_reason = choice.get(
+ "finish_reason") if has_finish_reason else "unknown"
+
+ if has_tool_calls and has_function_call:
+ tool_calls = message.get("tool_calls") + [message.get("function_call")]
+ elif has_tool_calls:
+ tool_calls = message.get("tool_calls")
+ elif has_function_call:
+ tool_calls = [message.get("function_call")]
+ else:
+ tool_calls = None
+
+ if tool_calls is not None:
+ for tool_call in tool_calls:
+ tool_call["type"] = "function"
+
+ return ChoiceEvent(
+ index=choice.get("index"),
+ message={"content": content, "role": role},
+ finish_reason=finish_reason,
+ tool_calls=tool_calls,
+ )
+
+
  def _accumulate_stream_items(item, complete_response):
  if is_openai_v1():
  item = model_as_dict(item)

  complete_response["model"] = item.get("model")
+ complete_response["id"] = item.get("id")
+
+ # capture usage information from the last stream chunks
+ if item.get("usage"):
+ complete_response["usage"] = item.get("usage")
+ elif item.get("choices") and item["choices"][0].get("usage"):
+ # Some LLM providers like moonshot mistakenly place token usage information within choices[0], handle this.
+ complete_response["usage"] = item["choices"][0].get("usage")

  # prompt filter results
  if item.get("prompt_filter_results"):
- complete_response["prompt_filter_results"] = item.get("prompt_filter_results")
+ complete_response["prompt_filter_results"] = item.get(
+ "prompt_filter_results")

  for choice in item.get("choices"):
  index = choice.get("index")
@@ -876,4 +1188,5 @@ def _accumulate_stream_items(item, complete_response):
  if tool_call_function and tool_call_function.get("name"):
  span_function["name"] = tool_call_function.get("name")
  if tool_call_function and tool_call_function.get("arguments"):
- span_function["arguments"] += tool_call_function.get("arguments")
+ span_function["arguments"] += tool_call_function.get(
+ "arguments")
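Note on the streaming hunks above: the instrumentation no longer estimates streamed token usage with tiktoken; `_set_streaming_token_metrics` now relies on whatever `usage` block the API reports, which `_accumulate_stream_items` captures from the final stream chunks. On the client side, usage only appears in a streamed chat completion when it is requested; a hedged example, assuming an openai-python version recent enough to support `stream_options` (model name and prompt are illustrative):

```python
from openai import OpenAI

client = OpenAI()  # assumes OPENAI_API_KEY is set; purely illustrative

stream = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[{"role": "user", "content": "hello"}],
    stream=True,
    # Ask the API to append a final chunk that carries the usage block.
    stream_options={"include_usage": True},
)

usage = None
for chunk in stream:
    if chunk.usage is not None:  # only the last chunk carries usage
        usage = chunk.usage

if usage is not None:
    print(usage.prompt_tokens, usage.completion_tokens)
```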