openlit 1.33.7__py3-none-any.whl → 1.33.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78)
  1. openlit/__helpers.py +83 -0
  2. openlit/__init__.py +1 -1
  3. openlit/instrumentation/ag2/ag2.py +2 -2
  4. openlit/instrumentation/ai21/__init__.py +4 -4
  5. openlit/instrumentation/ai21/ai21.py +370 -319
  6. openlit/instrumentation/ai21/async_ai21.py +371 -319
  7. openlit/instrumentation/anthropic/__init__.py +4 -4
  8. openlit/instrumentation/anthropic/anthropic.py +321 -189
  9. openlit/instrumentation/anthropic/async_anthropic.py +323 -190
  10. openlit/instrumentation/assemblyai/__init__.py +1 -1
  11. openlit/instrumentation/assemblyai/assemblyai.py +59 -43
  12. openlit/instrumentation/astra/astra.py +4 -4
  13. openlit/instrumentation/astra/async_astra.py +4 -4
  14. openlit/instrumentation/azure_ai_inference/__init__.py +4 -4
  15. openlit/instrumentation/azure_ai_inference/async_azure_ai_inference.py +406 -252
  16. openlit/instrumentation/azure_ai_inference/azure_ai_inference.py +406 -252
  17. openlit/instrumentation/bedrock/__init__.py +1 -1
  18. openlit/instrumentation/bedrock/bedrock.py +115 -58
  19. openlit/instrumentation/chroma/chroma.py +4 -4
  20. openlit/instrumentation/cohere/__init__.py +33 -10
  21. openlit/instrumentation/cohere/async_cohere.py +610 -0
  22. openlit/instrumentation/cohere/cohere.py +410 -219
  23. openlit/instrumentation/controlflow/controlflow.py +2 -2
  24. openlit/instrumentation/crawl4ai/async_crawl4ai.py +2 -2
  25. openlit/instrumentation/crawl4ai/crawl4ai.py +2 -2
  26. openlit/instrumentation/crewai/crewai.py +2 -2
  27. openlit/instrumentation/dynamiq/dynamiq.py +2 -2
  28. openlit/instrumentation/elevenlabs/async_elevenlabs.py +73 -47
  29. openlit/instrumentation/elevenlabs/elevenlabs.py +73 -52
  30. openlit/instrumentation/embedchain/embedchain.py +4 -4
  31. openlit/instrumentation/firecrawl/firecrawl.py +2 -2
  32. openlit/instrumentation/google_ai_studio/__init__.py +9 -9
  33. openlit/instrumentation/google_ai_studio/async_google_ai_studio.py +183 -219
  34. openlit/instrumentation/google_ai_studio/google_ai_studio.py +183 -220
  35. openlit/instrumentation/gpt4all/gpt4all.py +17 -17
  36. openlit/instrumentation/groq/async_groq.py +14 -14
  37. openlit/instrumentation/groq/groq.py +14 -14
  38. openlit/instrumentation/haystack/haystack.py +2 -2
  39. openlit/instrumentation/julep/async_julep.py +2 -2
  40. openlit/instrumentation/julep/julep.py +2 -2
  41. openlit/instrumentation/langchain/langchain.py +36 -31
  42. openlit/instrumentation/letta/letta.py +6 -6
  43. openlit/instrumentation/litellm/async_litellm.py +20 -20
  44. openlit/instrumentation/litellm/litellm.py +20 -20
  45. openlit/instrumentation/llamaindex/llamaindex.py +2 -2
  46. openlit/instrumentation/mem0/mem0.py +2 -2
  47. openlit/instrumentation/milvus/milvus.py +4 -4
  48. openlit/instrumentation/mistral/async_mistral.py +18 -18
  49. openlit/instrumentation/mistral/mistral.py +18 -18
  50. openlit/instrumentation/multion/async_multion.py +2 -2
  51. openlit/instrumentation/multion/multion.py +2 -2
  52. openlit/instrumentation/ollama/async_ollama.py +29 -29
  53. openlit/instrumentation/ollama/ollama.py +29 -29
  54. openlit/instrumentation/openai/__init__.py +11 -230
  55. openlit/instrumentation/openai/async_openai.py +434 -409
  56. openlit/instrumentation/openai/openai.py +415 -393
  57. openlit/instrumentation/phidata/phidata.py +2 -2
  58. openlit/instrumentation/pinecone/pinecone.py +4 -4
  59. openlit/instrumentation/premai/premai.py +20 -20
  60. openlit/instrumentation/qdrant/async_qdrant.py +4 -4
  61. openlit/instrumentation/qdrant/qdrant.py +4 -4
  62. openlit/instrumentation/reka/async_reka.py +6 -6
  63. openlit/instrumentation/reka/reka.py +6 -6
  64. openlit/instrumentation/together/async_together.py +18 -18
  65. openlit/instrumentation/together/together.py +18 -18
  66. openlit/instrumentation/transformers/transformers.py +6 -6
  67. openlit/instrumentation/vertexai/async_vertexai.py +53 -53
  68. openlit/instrumentation/vertexai/vertexai.py +53 -53
  69. openlit/instrumentation/vllm/vllm.py +6 -6
  70. openlit/otel/metrics.py +98 -7
  71. openlit/semcov/__init__.py +113 -80
  72. {openlit-1.33.7.dist-info → openlit-1.33.9.dist-info}/METADATA +2 -1
  73. openlit-1.33.9.dist-info/RECORD +121 -0
  74. {openlit-1.33.7.dist-info → openlit-1.33.9.dist-info}/WHEEL +1 -1
  75. openlit/instrumentation/openai/async_azure_openai.py +0 -900
  76. openlit/instrumentation/openai/azure_openai.py +0 -898
  77. openlit-1.33.7.dist-info/RECORD +0 -122
  78. {openlit-1.33.7.dist-info → openlit-1.33.9.dist-info}/LICENSE +0 -0
openlit/instrumentation/azure_ai_inference/async_azure_ai_inference.py
@@ -1,15 +1,20 @@
-# pylint: disable=duplicate-code, broad-exception-caught, too-many-statements, unused-argument, possibly-used-before-assignment, protected-access
 """
 Module for monitoring Azure AI Inference API calls.
 """
 
 import logging
+import time
 from opentelemetry.trace import SpanKind, Status, StatusCode
-from opentelemetry.sdk.resources import TELEMETRY_SDK_NAME
+from opentelemetry.sdk.resources import SERVICE_NAME, TELEMETRY_SDK_NAME, DEPLOYMENT_ENVIRONMENT
 from openlit.__helpers import (
-    handle_exception,
     get_chat_model_cost,
     get_embed_model_cost,
+    handle_exception,
+    response_as_dict,
+    calculate_ttft,
+    calculate_tbt,
+    create_metrics_attributes,
+    set_server_address_and_port,
     general_tokens
 )
 from openlit.semcov import SemanticConvetion
@@ -17,13 +22,12 @@ from openlit.semcov import SemanticConvetion
 # Initialize logger for logging potential issues and operations
 logger = logging.getLogger(__name__)
 
-def async_complete(gen_ai_endpoint, version, environment, application_name,
+def async_complete(version, environment, application_name,
                    tracer, pricing_info, trace_content, metrics, disable_metrics):
     """
     Generates a telemetry wrapper for chat to collect metrics.
 
     Args:
-        gen_ai_endpoint: Endpoint identifier for logging and tracing.
         version: Version of the monitoring package.
         environment: Deployment environment (e.g., production, staging).
         application_name: Name of the application using the Azure AI Inference API.
@@ -35,165 +39,275 @@ def async_complete(gen_ai_endpoint, version, environment, application_name,
         A function that wraps the chat method to add telemetry.
     """
 
+    class TracedAsyncStream:
+        """
+        Wrapper for streaming responses to collect metrics and trace data.
+        Wraps the 'az.ai.inference.AsyncStream' response to collect message IDs and aggregated response.
+
+        This class implements the '__aiter__' and '__anext__' methods that
+        handle asynchronous streaming responses.
+
+        This class also implements '__aenter__' and '__aexit__' methods that
+        handle asynchronous context management protocol.
+        """
+        def __init__(
+                self,
+                wrapped,
+                span,
+                kwargs,
+                server_address,
+                server_port,
+                **args,
+            ):
+            self.__wrapped__ = wrapped
+            self._span = span
+            # Placeholder for aggregating streaming response
+            self._llmresponse = ""
+            self._response_id = ""
+            self._response_model = ""
+            self._finish_reason = ""
+            self._system_fingerprint = ""
+
+            self._args = args
+            self._kwargs = kwargs
+            self._start_time = time.time()
+            self._end_time = None
+            self._timestamps = []
+            self._ttft = 0
+            self._tbt = 0
+            self._server_address = server_address
+            self._server_port = server_port
+
+        async def __aenter__(self):
+            await self.__wrapped__.__aenter__()
+            return self
+
+        async def __aexit__(self, exc_type, exc_value, traceback):
+            await self.__wrapped__.__aexit__(exc_type, exc_value, traceback)
+
+        def __aiter__(self):
+            return self
+
+        async def __getattr__(self, name):
+            """Delegate attribute access to the wrapped object."""
+            return getattr(await self.__wrapped__, name)
+
+        async def __anext__(self):
+            try:
+                chunk = await self.__wrapped__.__anext__()
+                end_time = time.time()
+                # Record the timestamp for the current chunk
+                self._timestamps.append(end_time)
+
+                if len(self._timestamps) == 1:
+                    # Calculate time to first chunk
+                    self._ttft = calculate_ttft(self._timestamps, self._start_time)
+
+                chunked = response_as_dict(chunk)
+                # Collect message IDs and aggregated response from events
+                if (len(chunked.get('choices')) > 0 and ('delta' in chunked.get('choices')[0] and
+                        'content' in chunked.get('choices')[0].get('delta'))):
+
+                    content = chunked.get('choices')[0].get('delta').get('content')
+                    if content:
+                        self._llmresponse += content
+                    self._response_id = chunked.get('id')
+                    self._response_model = chunked.get('model')
+                    self._finish_reason = chunked.get('choices')[0].get('finish_reason')
+                    self._system_fingerprint = chunked.get('system_fingerprint')
+                return chunk
+            except StopAsyncIteration:
+                # Handling exception ensure observability without disrupting operation
+                try:
+                    self._end_time = time.time()
+                    if len(self._timestamps) > 1:
+                        self._tbt = calculate_tbt(self._timestamps)
+
+                    # Format 'messages' into a single string
+                    message_prompt = self._kwargs.get("messages", "")
+                    formatted_messages = []
+                    for message in message_prompt:
+                        role = message["role"]
+                        content = message["content"]
+
+                        if isinstance(content, list):
+                            content_str_list = []
+                            for item in content:
+                                if item["type"] == "text":
+                                    content_str_list.append(f'text: {item["text"]}')
+                                elif (item["type"] == "image_url" and
+                                      not item["image_url"]["url"].startswith("data:")):
+                                    content_str_list.append(f'image_url: {item["image_url"]["url"]}')
+                            content_str = ", ".join(content_str_list)
+                            formatted_messages.append(f"{role}: {content_str}")
+                        else:
+                            formatted_messages.append(f"{role}: {content}")
+                    prompt = "\n".join(formatted_messages)
+
+                    request_model = self._kwargs.get("model", "gpt-4o")
+
+                    # Calculate tokens using input prompt and aggregated response
+                    input_tokens = general_tokens(prompt)
+                    output_tokens = general_tokens(self._llmresponse)
+
+                    # Calculate cost of the operation
+                    cost = get_chat_model_cost(request_model,
+                                               pricing_info, input_tokens,
+                                               output_tokens)
+
+                    # Set Span attributes (OTel Semconv)
+                    self._span.set_attribute(TELEMETRY_SDK_NAME, "openlit")
+                    self._span.set_attribute(SemanticConvetion.GEN_AI_OPERATION,
+                                             SemanticConvetion.GEN_AI_OPERATION_TYPE_CHAT)
+                    self._span.set_attribute(SemanticConvetion.GEN_AI_SYSTEM,
+                                             SemanticConvetion.GEN_AI_SYSTEM_AZURE_AI_INFERENCE)
+                    self._span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_MODEL,
+                                             request_model)
+                    self._span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_SEED,
+                                             self._kwargs.get("seed", ""))
+                    self._span.set_attribute(SemanticConvetion.SERVER_PORT,
+                                             self._server_port)
+                    self._span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_FREQUENCY_PENALTY,
+                                             self._kwargs.get("frequency_penalty", 0.0))
+                    self._span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_MAX_TOKENS,
+                                             self._kwargs.get("max_tokens", -1))
+                    self._span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_PRESENCE_PENALTY,
+                                             self._kwargs.get("presence_penalty", 0.0))
+                    self._span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_STOP_SEQUENCES,
+                                             self._kwargs.get("stop", []))
+                    self._span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_TEMPERATURE,
+                                             self._kwargs.get("temperature", 1.0))
+                    self._span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_TOP_P,
+                                             self._kwargs.get("top_p", 1.0))
+                    self._span.set_attribute(SemanticConvetion.GEN_AI_RESPONSE_FINISH_REASON,
+                                             [self._finish_reason])
+                    self._span.set_attribute(SemanticConvetion.GEN_AI_RESPONSE_ID,
+                                             self._response_id)
+                    self._span.set_attribute(SemanticConvetion.GEN_AI_RESPONSE_MODEL,
+                                             self._response_model)
+                    self._span.set_attribute(SemanticConvetion.GEN_AI_USAGE_INPUT_TOKENS,
+                                             input_tokens)
+                    self._span.set_attribute(SemanticConvetion.GEN_AI_USAGE_OUTPUT_TOKENS,
+                                             output_tokens)
+                    self._span.set_attribute(SemanticConvetion.SERVER_ADDRESS,
+                                             self._server_address)
+                    self._span.set_attribute(SemanticConvetion.GEN_AI_OPENAI_RESPONSE_SYSTEM_FINGERPRINT,
+                                             self._system_fingerprint)
+                    if isinstance(self._llmresponse, str):
+                        self._span.set_attribute(SemanticConvetion.GEN_AI_OUTPUT_TYPE,
+                                                 "text")
+                    else:
+                        self._span.set_attribute(SemanticConvetion.GEN_AI_OUTPUT_TYPE,
+                                                 "json")
+
+                    # Set Span attributes (Extra)
+                    self._span.set_attribute(DEPLOYMENT_ENVIRONMENT,
+                                             environment)
+                    self._span.set_attribute(SERVICE_NAME,
+                                             application_name)
+                    self._span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_IS_STREAM,
+                                             True)
+                    self._span.set_attribute(SemanticConvetion.GEN_AI_USAGE_TOTAL_TOKENS,
+                                             input_tokens + output_tokens)
+                    self._span.set_attribute(SemanticConvetion.GEN_AI_USAGE_COST,
+                                             cost)
+                    self._span.set_attribute(SemanticConvetion.GEN_AI_SERVER_TBT,
+                                             self._tbt)
+                    self._span.set_attribute(SemanticConvetion.GEN_AI_SERVER_TTFT,
+                                             self._ttft)
+                    self._span.set_attribute(SemanticConvetion.GEN_AI_SDK_VERSION,
+                                             version)
+                    if trace_content:
+                        self._span.add_event(
+                            name=SemanticConvetion.GEN_AI_CONTENT_PROMPT_EVENT,
+                            attributes={
+                                SemanticConvetion.GEN_AI_CONTENT_PROMPT: prompt,
+                            },
+                        )
+                        self._span.add_event(
+                            name=SemanticConvetion.GEN_AI_CONTENT_COMPLETION_EVENT,
+                            attributes={
+                                SemanticConvetion.GEN_AI_CONTENT_COMPLETION: self._llmresponse,
+                            },
+                        )
+                    self._span.set_status(Status(StatusCode.OK))
+
+                    if disable_metrics is False:
+                        attributes = create_metrics_attributes(
+                            service_name=application_name,
+                            deployment_environment=environment,
+                            operation=SemanticConvetion.GEN_AI_OPERATION_TYPE_CHAT,
+                            system=SemanticConvetion.GEN_AI_SYSTEM_AZURE_AI_INFERENCE,
+                            request_model=request_model,
+                            server_address=self._server_address,
+                            server_port=self._server_port,
+                            response_model=self._response_model,
+                        )
+
+                        metrics["genai_client_usage_tokens"].record(
+                            input_tokens + output_tokens, attributes
+                        )
+                        metrics["genai_client_operation_duration"].record(
+                            self._end_time - self._start_time, attributes
+                        )
+                        metrics["genai_server_tbt"].record(
+                            self._tbt, attributes
+                        )
+                        metrics["genai_server_ttft"].record(
+                            self._ttft, attributes
+                        )
+                        metrics["genai_requests"].add(1, attributes)
+                        metrics["genai_completion_tokens"].add(output_tokens, attributes)
+                        metrics["genai_prompt_tokens"].add(input_tokens, attributes)
+                        metrics["genai_cost"].record(cost, attributes)
+
+                except Exception as e:
+                    handle_exception(self._span, e)
+                    logger.error("Error in trace creation: %s", e)
+                finally:
+                    self._span.end()
+                raise
+
     async def wrapper(wrapped, instance, args, kwargs):
         """
-        Wraps the 'chat' API call to add telemetry.
-
+        Wraps the 'chat.completions' API call to add telemetry.
+
         This collects metrics such as execution time, cost, and token usage, and handles errors
         gracefully, adding details to the trace for observability.
 
         Args:
-            wrapped: The original 'chat' method to be wrapped.
+            wrapped: The original 'chat.completions' method to be wrapped.
             instance: The instance of the class where the original method is defined.
-            args: Positional arguments for the 'chat' method.
-            kwargs: Keyword arguments for the 'chat' method.
+            args: Positional arguments for the 'chat.completions' method.
+            kwargs: Keyword arguments for the 'chat.completions' method.
 
         Returns:
-            The response from the original 'chat' method.
+            The response from the original 'chat.completions' method.
         """
-        # pylint: disable=no-else-return
-        if kwargs.get("stream", False) is True:
-            # Special handling for streaming response to accommodate the nature of data flow
-            async def stream_generator():
-                with tracer.start_as_current_span(gen_ai_endpoint, kind= SpanKind.CLIENT) as span:
-                    # Placeholder for aggregating streaming response
-                    llmresponse = ""
-
-                    # Loop through streaming events capturing relevant details
-                    async for chunk in await wrapped(*args, **kwargs):
-                        if chunk.choices:
-                            # Collect message IDs and aggregated response from events
-                            content = chunk.choices[0].delta.content
-                            if content:
-                                llmresponse += content
-
-                        yield chunk
-                        response_id = chunk.id
-
-                    # Handling exception ensure observability without disrupting operation
-                    try:
-                        # Format 'messages' into a single string
-                        message_prompt = kwargs.get("messages", "")
-                        formatted_messages = []
-                        for message in message_prompt:
-                            role = message["role"]
-                            content = message["content"]
-
-                            if isinstance(content, list):
-                                content_str = ", ".join(
-                                    # pylint: disable=line-too-long
-                                    f'{item["type"]}: {item["text"] if "text" in item else item["image_url"]}'
-                                    if "type" in item else f'text: {item["text"]}'
-                                    for item in content
-                                )
-                                formatted_messages.append(f"{role}: {content_str}")
-                            else:
-                                formatted_messages.append(f"{role}: {content}")
-                        prompt = "\n".join(formatted_messages)
-
-                        model = kwargs.get("model", "phi3-mini-4k")
-
-                        # Calculate tokens using input prompt and aggregated response
-                        input_tokens = general_tokens(prompt)
-                        output_tokens = general_tokens(llmresponse)
-
-                        total_tokens = input_tokens + output_tokens
-                        # Calculate cost of the operation
-                        cost = get_chat_model_cost(model,
-                                                   pricing_info, input_tokens,
-                                                   output_tokens)
-
-                        # Set base span attribues
-                        span.set_attribute(TELEMETRY_SDK_NAME, "openlit")
-                        span.set_attribute(SemanticConvetion.GEN_AI_SYSTEM,
-                                           SemanticConvetion.GEN_AI_SYSTEM_AZURE_AI_INFERENCE)
-                        span.set_attribute(SemanticConvetion.GEN_AI_TYPE,
-                                           SemanticConvetion.GEN_AI_TYPE_CHAT)
-                        span.set_attribute(SemanticConvetion.GEN_AI_ENDPOINT,
-                                           gen_ai_endpoint)
-                        span.set_attribute(SemanticConvetion.GEN_AI_ENVIRONMENT,
-                                           environment)
-                        span.set_attribute(SemanticConvetion.GEN_AI_APPLICATION_NAME,
-                                           application_name)
-                        span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_MODEL,
-                                           model)
-                        span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_IS_STREAM,
-                                           True)
-                        span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_USER,
-                                           kwargs.get("user", ""))
-                        span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_TOP_P,
-                                           kwargs.get("top_p", 1.0))
-                        span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_MAX_TOKENS,
-                                           kwargs.get("max_tokens", -1))
-                        span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_TEMPERATURE,
-                                           kwargs.get("temperature", 1.0))
-                        span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_PRESENCE_PENALTY,
-                                           kwargs.get("presence_penalty", 0.0))
-                        span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_FREQUENCY_PENALTY,
-                                           kwargs.get("frequency_penalty", 0.0))
-                        span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_SEED,
-                                           kwargs.get("seed", ""))
-                        span.set_attribute(SemanticConvetion.GEN_AI_RESPONSE_ID,
-                                           response_id)
-                        span.set_attribute(SemanticConvetion.GEN_AI_USAGE_PROMPT_TOKENS,
-                                           input_tokens)
-                        span.set_attribute(SemanticConvetion.GEN_AI_USAGE_COMPLETION_TOKENS,
-                                           output_tokens)
-                        span.set_attribute(SemanticConvetion.GEN_AI_USAGE_TOTAL_TOKENS,
-                                           total_tokens)
-                        span.set_attribute(SemanticConvetion.GEN_AI_USAGE_COST,
-                                           cost)
 
+        # Check if streaming is enabled for the API call
+        streaming = kwargs.get("stream", False)
+        server_address, server_port = set_server_address_and_port(instance, "models.github.ai", 443)
+        request_model = kwargs.get("model", "gpt-4o")
 
-                        if trace_content:
-                            span.add_event(
-                                name=SemanticConvetion.GEN_AI_CONTENT_PROMPT_EVENT,
-                                attributes={
-                                    SemanticConvetion.GEN_AI_CONTENT_PROMPT: prompt,
-                                },
-                            )
-                            span.add_event(
-                                name=SemanticConvetion.GEN_AI_CONTENT_COMPLETION_EVENT,
-                                attributes={
-                                    SemanticConvetion.GEN_AI_CONTENT_COMPLETION: llmresponse,
-                                },
-                            )
+        span_name = f"{SemanticConvetion.GEN_AI_OPERATION_TYPE_CHAT} {request_model}"
 
-                        span.set_status(Status(StatusCode.OK))
-
-                        if disable_metrics is False:
-                            attributes = {
-                                TELEMETRY_SDK_NAME:
-                                    "openlit",
-                                SemanticConvetion.GEN_AI_APPLICATION_NAME:
-                                    application_name,
-                                SemanticConvetion.GEN_AI_SYSTEM:
-                                    SemanticConvetion.GEN_AI_SYSTEM_AZURE_AI_INFERENCE,
-                                SemanticConvetion.GEN_AI_ENVIRONMENT:
-                                    environment,
-                                SemanticConvetion.GEN_AI_TYPE:
-                                    SemanticConvetion.GEN_AI_TYPE_CHAT,
-                                SemanticConvetion.GEN_AI_REQUEST_MODEL:
-                                    model
-                            }
-
-                            metrics["genai_requests"].add(1, attributes)
-                            metrics["genai_total_tokens"].add(
-                                total_tokens, attributes
-                            )
-                            metrics["genai_completion_tokens"].add(output_tokens, attributes)
-                            metrics["genai_prompt_tokens"].add(input_tokens, attributes)
-                            metrics["genai_cost"].record(cost, attributes)
+        # pylint: disable=no-else-return
+        if streaming:
+            # Special handling for streaming response to accommodate the nature of data flow
+            awaited_wrapped = await wrapped(*args, **kwargs)
+            span = tracer.start_span(span_name, kind=SpanKind.CLIENT)
 
-                    except Exception as e:
-                        handle_exception(span, e)
-                        logger.error("Error in trace creation: %s", e)
+            return TracedAsyncStream(awaited_wrapped, span, kwargs, server_address, server_port)
 
-            return stream_generator()
+        # Handling for non-streaming responses
         else:
-            # pylint: disable=line-too-long
-            with tracer.start_as_current_span(gen_ai_endpoint, kind= SpanKind.CLIENT) as span:
+            with tracer.start_as_current_span(span_name, kind= SpanKind.CLIENT) as span:
+                start_time = time.time()
                 response = await wrapped(*args, **kwargs)
+                end_time = time.time()
+
+                response_dict = response_as_dict(response)
 
                 try:
                     # Format 'messages' into a single string
@@ -205,7 +319,6 @@ def async_complete(gen_ai_endpoint, version, environment, application_name,
 
                     if isinstance(content, list):
                         content_str = ", ".join(
-                            # pylint: disable=line-too-long
                             f'{item["type"]}: {item["text"] if "text" in item else item["image_url"]}'
                             if "type" in item else f'text: {item["text"]}'
                             for item in content
@@ -215,43 +328,66 @@ def async_complete(gen_ai_endpoint, version, environment, application_name,
                             formatted_messages.append(f"{role}: {content}")
                     prompt = "\n".join(formatted_messages)
 
-                    model = kwargs.get("model", "phi3-mini-4k")
+                    input_tokens = response_dict.get('usage').get('prompt_tokens')
+                    output_tokens = response_dict.get('usage').get('completion_tokens')
 
-                    # Set base span attribues
+                    # Calculate cost of the operation
+                    cost = get_chat_model_cost(request_model,
+                                               pricing_info, input_tokens,
+                                               output_tokens)
+
+                    # Set base span attribues (OTel Semconv)
                     span.set_attribute(TELEMETRY_SDK_NAME, "openlit")
+                    span.set_attribute(SemanticConvetion.GEN_AI_OPERATION,
+                                       SemanticConvetion.GEN_AI_OPERATION_TYPE_CHAT)
                     span.set_attribute(SemanticConvetion.GEN_AI_SYSTEM,
                                        SemanticConvetion.GEN_AI_SYSTEM_AZURE_AI_INFERENCE)
-                    span.set_attribute(SemanticConvetion.GEN_AI_TYPE,
-                                       SemanticConvetion.GEN_AI_TYPE_CHAT)
-                    span.set_attribute(SemanticConvetion.GEN_AI_ENDPOINT,
-                                       gen_ai_endpoint)
-                    span.set_attribute(SemanticConvetion.GEN_AI_ENVIRONMENT,
-                                       environment)
-                    span.set_attribute(SemanticConvetion.GEN_AI_APPLICATION_NAME,
-                                       application_name)
                     span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_MODEL,
-                                       model)
-                    span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_IS_STREAM,
-                                       False)
-                    span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_USER,
-                                       kwargs.get("user", ""))
-                    span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_TOP_P,
-                                       kwargs.get("top_p", 1.0))
+                                       request_model)
+                    span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_SEED,
+                                       kwargs.get("seed", ""))
+                    span.set_attribute(SemanticConvetion.SERVER_PORT,
+                                       server_port)
+                    span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_FREQUENCY_PENALTY,
+                                       kwargs.get("frequency_penalty", 0.0))
                     span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_MAX_TOKENS,
                                        kwargs.get("max_tokens", -1))
-                    span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_TEMPERATURE,
-                                       kwargs.get("temperature", 1.0))
                     span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_PRESENCE_PENALTY,
                                        kwargs.get("presence_penalty", 0.0))
-                    span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_FREQUENCY_PENALTY,
-                                       kwargs.get("frequency_penalty", 0.0))
-                    span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_SEED,
-                                       kwargs.get("seed", ""))
+                    span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_STOP_SEQUENCES,
+                                       kwargs.get("stop", []))
+                    span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_TEMPERATURE,
+                                       kwargs.get("temperature", 1.0))
+                    span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_TOP_P,
+                                       kwargs.get("top_p", 1.0))
                     span.set_attribute(SemanticConvetion.GEN_AI_RESPONSE_ID,
-                                       response.id)
-                    span.set_attribute(SemanticConvetion.GEN_AI_RESPONSE_FINISH_REASON,
-                                       [response.choices[0]["finish_reason"]])
+                                       response_dict.get("id"))
+                    span.set_attribute(SemanticConvetion.GEN_AI_RESPONSE_MODEL,
+                                       response_dict.get('model'))
+                    span.set_attribute(SemanticConvetion.GEN_AI_USAGE_INPUT_TOKENS,
+                                       input_tokens)
+                    span.set_attribute(SemanticConvetion.GEN_AI_USAGE_OUTPUT_TOKENS,
+                                       output_tokens)
+                    span.set_attribute(SemanticConvetion.SERVER_ADDRESS,
+                                       server_address)
+                    span.set_attribute(SemanticConvetion.GEN_AI_OPENAI_RESPONSE_SYSTEM_FINGERPRINT,
+                                       response_dict.get('system_fingerprint'))
 
+                    # Set base span attribues (Extras)
+                    span.set_attribute(DEPLOYMENT_ENVIRONMENT,
+                                       environment)
+                    span.set_attribute(SERVICE_NAME,
+                                       application_name)
+                    span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_IS_STREAM,
+                                       False)
+                    span.set_attribute(SemanticConvetion.GEN_AI_USAGE_TOTAL_TOKENS,
+                                       input_tokens + output_tokens)
+                    span.set_attribute(SemanticConvetion.GEN_AI_USAGE_COST,
+                                       cost)
+                    span.set_attribute(SemanticConvetion.GEN_AI_SERVER_TTFT,
+                                       end_time - start_time)
+                    span.set_attribute(SemanticConvetion.GEN_AI_SDK_VERSION,
+                                       version)
                     if trace_content:
                         span.add_event(
                             name=SemanticConvetion.GEN_AI_CONTENT_PROMPT_EVENT,
@@ -259,49 +395,53 @@ def async_complete(gen_ai_endpoint, version, environment, application_name,
                                 SemanticConvetion.GEN_AI_CONTENT_PROMPT: prompt,
                             },
                         )
-                        span.add_event(
-                            name=SemanticConvetion.GEN_AI_CONTENT_COMPLETION_EVENT,
-                            attributes={
-                                SemanticConvetion.GEN_AI_CONTENT_COMPLETION: response.choices[0].message.content,
-                            },
-                        )
 
-                    input_tokens = response.usage.prompt_tokens
-                    output_tokens = response.usage.completion_tokens
-                    total_tokens = response.usage.total_tokens
-                    # Calculate cost of the operation
-                    cost = get_chat_model_cost(model,
-                                               pricing_info, input_tokens, output_tokens)
+                    for i in range(kwargs.get('n',1)):
+                        span.set_attribute(SemanticConvetion.GEN_AI_RESPONSE_FINISH_REASON,
+                                           [response_dict.get('choices')[i].get('finish_reason')])
+                        if trace_content:
+                            span.add_event(
+                                name=SemanticConvetion.GEN_AI_CONTENT_COMPLETION_EVENT,
+                                attributes={
+                                    # pylint: disable=line-too-long
+                                    SemanticConvetion.GEN_AI_CONTENT_COMPLETION: str(response_dict.get('choices')[i].get('message').get('content')),
+                                },
+                            )
+                        if kwargs.get('tools'):
+                            span.set_attribute(SemanticConvetion.GEN_AI_TOOL_CALLS,
+                                               str(response_dict.get('choices')[i].get('message').get('tool_calls')))
 
-                    span.set_attribute(SemanticConvetion.GEN_AI_USAGE_PROMPT_TOKENS,
-                                       input_tokens)
-                    span.set_attribute(SemanticConvetion.GEN_AI_USAGE_COMPLETION_TOKENS,
-                                       output_tokens)
-                    span.set_attribute(SemanticConvetion.GEN_AI_USAGE_TOTAL_TOKENS,
-                                       total_tokens)
-                    span.set_attribute(SemanticConvetion.GEN_AI_USAGE_COST,
-                                       cost)
+                        if isinstance(response_dict.get('choices')[i].get('message').get('content'), str):
+                            span.set_attribute(SemanticConvetion.GEN_AI_OUTPUT_TYPE,
+                                               "text")
+                        elif response_dict.get('choices')[i].get('message').get('content') is not None:
+                            span.set_attribute(SemanticConvetion.GEN_AI_OUTPUT_TYPE,
+                                               "json")
 
                     span.set_status(Status(StatusCode.OK))
 
                     if disable_metrics is False:
-                        attributes = {
-                            TELEMETRY_SDK_NAME:
-                                "openlit",
-                            SemanticConvetion.GEN_AI_APPLICATION_NAME:
-                                application_name,
-                            SemanticConvetion.GEN_AI_SYSTEM:
-                                SemanticConvetion.GEN_AI_SYSTEM_AZURE_AI_INFERENCE,
-                            SemanticConvetion.GEN_AI_ENVIRONMENT:
-                                environment,
-                            SemanticConvetion.GEN_AI_TYPE:
-                                SemanticConvetion.GEN_AI_TYPE_CHAT,
-                            SemanticConvetion.GEN_AI_REQUEST_MODEL:
-                                model
-                        }
+                        attributes = create_metrics_attributes(
+                            service_name=application_name,
+                            deployment_environment=environment,
+                            operation=SemanticConvetion.GEN_AI_OPERATION_TYPE_CHAT,
+                            system=SemanticConvetion.GEN_AI_SYSTEM_AZURE_AI_INFERENCE,
+                            request_model=request_model,
+                            server_address=server_address,
+                            server_port=server_port,
+                            response_model=response_dict.get('model'),
+                        )
 
+                        metrics["genai_client_usage_tokens"].record(
+                            input_tokens + output_tokens, attributes
+                        )
+                        metrics["genai_client_operation_duration"].record(
+                            end_time - start_time, attributes
+                        )
+                        metrics["genai_server_ttft"].record(
+                            end_time - start_time, attributes
+                        )
                         metrics["genai_requests"].add(1, attributes)
-                        metrics["genai_total_tokens"].add(total_tokens, attributes)
                         metrics["genai_completion_tokens"].add(output_tokens, attributes)
                         metrics["genai_prompt_tokens"].add(input_tokens, attributes)
                         metrics["genai_cost"].record(cost, attributes)
@@ -318,18 +458,17 @@ def async_complete(gen_ai_endpoint, version, environment, application_name,
 
     return wrapper
 
-def async_embedding(gen_ai_endpoint, version, environment, application_name,
+def async_embedding(version, environment, application_name,
                     tracer, pricing_info, trace_content, metrics, disable_metrics):
     """
     Generates a telemetry wrapper for embeddings to collect metrics.
 
     Args:
-        gen_ai_endpoint: Endpoint identifier for logging and tracing.
         version: Version of the monitoring package.
         environment: Deployment environment (e.g., production, staging).
-        application_name: Name of the application using the Azure AI Inference API.
+        application_name: Name of the application using the Azure Inference API.
         tracer: OpenTelemetry tracer for creating spans.
-        pricing_info: Information used for calculating the cost of Azure AI Inference usage.
+        pricing_info: Information used for calculating the cost of Azure Inference usage.
         trace_content: Flag indicating whether to trace the actual content.
 
     Returns:
@@ -353,69 +492,84 @@ def async_embedding(gen_ai_endpoint, version, environment, application_name,
             The response from the original 'embeddings' method.
         """
 
-        with tracer.start_as_current_span(gen_ai_endpoint, kind= SpanKind.CLIENT) as span:
+        server_address, server_port = set_server_address_and_port(instance, "models.github.ai", 443)
+        request_model = kwargs.get("model", "text-embedding-ada-002")
+
+        span_name = f"{SemanticConvetion.GEN_AI_OPERATION_TYPE_EMBEDDING} {request_model}"
+
+        with tracer.start_as_current_span(span_name, kind= SpanKind.CLIENT) as span:
+            start_time = time.time()
             response = await wrapped(*args, **kwargs)
+            end_time = time.time()
 
+            response_dict = response_as_dict(response)
             try:
+                input_tokens = response_dict.get('usage').get('prompt_tokens')
+
                 # Calculate cost of the operation
-                cost = get_embed_model_cost(kwargs.get("model", "text-embedding-ada-002"),
-                                            pricing_info, response.usage.prompt_tokens)
+                cost = get_embed_model_cost(request_model,
+                                            pricing_info, input_tokens)
 
-                # Set Span attributes
+                # Set Span attributes (OTel Semconv)
                 span.set_attribute(TELEMETRY_SDK_NAME, "openlit")
+                span.set_attribute(SemanticConvetion.GEN_AI_OPERATION,
+                                   SemanticConvetion.GEN_AI_OPERATION_TYPE_EMBEDDING)
                 span.set_attribute(SemanticConvetion.GEN_AI_SYSTEM,
                                    SemanticConvetion.GEN_AI_SYSTEM_AZURE_AI_INFERENCE)
-                span.set_attribute(SemanticConvetion.GEN_AI_TYPE,
-                                   SemanticConvetion.GEN_AI_TYPE_EMBEDDING)
-                span.set_attribute(SemanticConvetion.GEN_AI_ENDPOINT,
-                                   gen_ai_endpoint)
-                span.set_attribute(SemanticConvetion.GEN_AI_ENVIRONMENT,
+                span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_MODEL,
+                                   request_model)
+                span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_ENCODING_FORMATS,
+                                   [kwargs.get('encoding_format', 'float')])
+                span.set_attribute(SemanticConvetion.GEN_AI_RESPONSE_MODEL,
+                                   request_model)
+                span.set_attribute(SemanticConvetion.SERVER_ADDRESS,
+                                   server_address)
+                span.set_attribute(SemanticConvetion.SERVER_PORT,
+                                   server_port)
+                span.set_attribute(SemanticConvetion.GEN_AI_USAGE_INPUT_TOKENS,
+                                   input_tokens)
+
+                # Set Span attributes (Extras)
+                span.set_attribute(DEPLOYMENT_ENVIRONMENT,
                                    environment)
-                span.set_attribute(SemanticConvetion.GEN_AI_APPLICATION_NAME,
+                span.set_attribute(SERVICE_NAME,
                                    application_name)
-                span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_MODEL,
-                                   kwargs.get("model", "text-embedding-ada-002"))
-                span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_EMBEDDING_FORMAT,
-                                   kwargs.get("encoding_format", "float"))
-                span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_EMBEDDING_DIMENSION,
-                                   kwargs.get("dimensions", ""))
-                span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_USER,
-                                   kwargs.get("user", ""))
-                span.set_attribute(SemanticConvetion.GEN_AI_USAGE_PROMPT_TOKENS,
-                                   response.usage.prompt_tokens)
                 span.set_attribute(SemanticConvetion.GEN_AI_USAGE_TOTAL_TOKENS,
-                                   response.usage.total_tokens)
+                                   input_tokens)
                 span.set_attribute(SemanticConvetion.GEN_AI_USAGE_COST,
                                    cost)
+                span.set_attribute(SemanticConvetion.GEN_AI_SDK_VERSION,
+                                   version)
+
                 if trace_content:
                     span.add_event(
                         name=SemanticConvetion.GEN_AI_CONTENT_PROMPT_EVENT,
                         attributes={
-                            SemanticConvetion.GEN_AI_CONTENT_PROMPT: kwargs.get("input", ""),
+                            SemanticConvetion.GEN_AI_CONTENT_PROMPT: str(kwargs.get("input", "")),
                         },
                     )
 
                 span.set_status(Status(StatusCode.OK))
 
                 if disable_metrics is False:
-                    attributes = {
-                        TELEMETRY_SDK_NAME:
-                            "openlit",
-                        SemanticConvetion.GEN_AI_APPLICATION_NAME:
-                            application_name,
-                        SemanticConvetion.GEN_AI_SYSTEM:
-                            SemanticConvetion.GEN_AI_SYSTEM_AZURE_AI_INFERENCE,
-                        SemanticConvetion.GEN_AI_ENVIRONMENT:
-                            environment,
-                        SemanticConvetion.GEN_AI_TYPE:
-                            SemanticConvetion.GEN_AI_TYPE_EMBEDDING,
-                        SemanticConvetion.GEN_AI_REQUEST_MODEL:
-                            kwargs.get("model", "text-embedding-ada-002")
-                    }
-
+                    attributes = create_metrics_attributes(
+                        service_name=application_name,
+                        deployment_environment=environment,
+                        operation=SemanticConvetion.GEN_AI_OPERATION_TYPE_EMBEDDING,
+                        system=SemanticConvetion.GEN_AI_SYSTEM_AZURE_AI_INFERENCE,
+                        request_model=request_model,
+                        server_address=server_address,
+                        server_port=server_port,
+                        response_model=request_model,
+                    )
+                    metrics["genai_client_usage_tokens"].record(
+                        input_tokens, attributes
+                    )
+                    metrics["genai_client_operation_duration"].record(
+                        end_time - start_time, attributes
+                    )
                    metrics["genai_requests"].add(1, attributes)
-                    metrics["genai_total_tokens"].add(response.usage.total_tokens, attributes)
-                    metrics["genai_prompt_tokens"].add(response.usage.prompt_tokens, attributes)
+                    metrics["genai_prompt_tokens"].add(input_tokens, attributes)
                     metrics["genai_cost"].record(cost, attributes)
 
             # Return original response
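
A note on the timing helpers used above: calculate_ttft and calculate_tbt come from openlit/__helpers.py (+83 lines in this release), whose diff is not expanded here. The sketch below is only a plausible reading of their semantics as implied by the call sites in TracedAsyncStream.__anext__ (calculate_ttft(timestamps, start_time) once the first chunk arrives, calculate_tbt(timestamps) when more than one chunk has arrived); it is not the package's actual implementation.

def calculate_ttft(timestamps, start_time):
    # Time to first token: delay between the request start and the first streamed chunk.
    return timestamps[0] - start_time if timestamps else 0

def calculate_tbt(timestamps):
    # Time between tokens: mean gap between consecutive chunk arrival times.
    gaps = [later - earlier for earlier, later in zip(timestamps, timestamps[1:])]
    return sum(gaps) / len(gaps) if gaps else 0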