openlit 1.33.17__py3-none-any.whl → 1.33.18__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openlit/instrumentation/azure_ai_inference/__init__.py +5 -22
- openlit/instrumentation/azure_ai_inference/async_azure_ai_inference.py +48 -489
- openlit/instrumentation/azure_ai_inference/azure_ai_inference.py +48 -489
- openlit/instrumentation/azure_ai_inference/utils.py +225 -0
- {openlit-1.33.17.dist-info → openlit-1.33.18.dist-info}/METADATA +1 -1
- {openlit-1.33.17.dist-info → openlit-1.33.18.dist-info}/RECORD +8 -7
- {openlit-1.33.17.dist-info → openlit-1.33.18.dist-info}/LICENSE +0 -0
- {openlit-1.33.17.dist-info → openlit-1.33.18.dist-info}/WHEEL +0 -0
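The main functional change in this release is a new `event_provider` argument: `AzureAIInferenceInstrumentor._instrument` now reads it from its kwargs and forwards it into both the sync and async chat-completion wrappers, while the separate embedding wrappers are removed. A rough usage sketch, assuming the standard `BaseInstrumentor.instrument(**kwargs)` entry point; the `event_provider=None` stand-in and the literal values below are illustrative only, not part of this diff:

```python
from opentelemetry import trace
from openlit.instrumentation.azure_ai_inference import AzureAIInferenceInstrumentor

# Tracer comes from whatever TracerProvider the application has configured.
tracer = trace.get_tracer(__name__)

# In real use this would be an OpenTelemetry event/log emitter; None is a placeholder.
event_provider = None

AzureAIInferenceInstrumentor().instrument(
    application_name="my-app",      # illustrative value
    environment="production",       # illustrative value
    tracer=tracer,
    event_provider=event_provider,  # new kwarg picked up in 1.33.18
    pricing_info={},
    capture_message_content=False,
)
```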
openlit/instrumentation/azure_ai_inference/__init__.py

@@ -4,13 +4,11 @@ from typing import Collection
 import importlib.metadata
 from opentelemetry.instrumentation.instrumentor import BaseInstrumentor
 from wrapt import wrap_function_wrapper
-
 from openlit.instrumentation.azure_ai_inference.azure_ai_inference import (
-    complete
+    complete
 )
-
 from openlit.instrumentation.azure_ai_inference.async_azure_ai_inference import (
-    async_complete
+    async_complete
 )

 _instruments = ('azure-ai-inference >= 1.0.0b4',)
@@ -27,6 +25,7 @@ class AzureAIInferenceInstrumentor(BaseInstrumentor):
         application_name = kwargs.get('application_name', 'default')
         environment = kwargs.get('environment', 'default')
         tracer = kwargs.get('tracer')
+        event_provider = kwargs.get('event_provider')
         metrics = kwargs.get('metrics_dict')
         pricing_info = kwargs.get('pricing_info', {})
         capture_message_content = kwargs.get('capture_message_content', False)
@@ -38,15 +37,7 @@ class AzureAIInferenceInstrumentor(BaseInstrumentor):
             'azure.ai.inference',
             'ChatCompletionsClient.complete',
             complete(version, environment, application_name,
-                tracer, pricing_info, capture_message_content, metrics, disable_metrics),
-        )
-
-        # sync embedding
-        wrap_function_wrapper(
-            'azure.ai.inference',
-            'EmbeddingsClient.embed',
-            embedding(version, environment, application_name,
-                tracer, pricing_info, capture_message_content, metrics, disable_metrics),
+                tracer, event_provider, pricing_info, capture_message_content, metrics, disable_metrics),
         )

         # async generate
@@ -54,15 +45,7 @@ class AzureAIInferenceInstrumentor(BaseInstrumentor):
             'azure.ai.inference.aio',
             'ChatCompletionsClient.complete',
             async_complete(version, environment, application_name,
-                tracer, pricing_info, capture_message_content, metrics, disable_metrics),
-        )
-
-        # async embedding
-        wrap_function_wrapper(
-            'azure.ai.inference.aio',
-            'EmbeddingsClient.embed',
-            async_embedding(version, environment, application_name,
-                tracer, pricing_info, capture_message_content, metrics, disable_metrics),
+                tracer, event_provider, pricing_info, capture_message_content, metrics, disable_metrics),
         )

     def _uninstrument(self, **kwargs):
openlit/instrumentation/azure_ai_inference/async_azure_ai_inference.py

@@ -4,18 +4,15 @@ Module for monitoring Azure AI Inference API calls.

 import logging
 import time
-from opentelemetry.trace import SpanKind
-from opentelemetry.sdk.resources import SERVICE_NAME, TELEMETRY_SDK_NAME, DEPLOYMENT_ENVIRONMENT
+from opentelemetry.trace import SpanKind
 from openlit.__helpers import (
-    get_chat_model_cost,
-    get_embed_model_cost,
     handle_exception,
-    response_as_dict,
-    calculate_ttft,
-    calculate_tbt,
-    create_metrics_attributes,
     set_server_address_and_port,
-
+)
+from openlit.instrumentation.azure_ai_inference.utils import (
+    process_chunk,
+    process_chat_response,
+    process_streaming_chat_response,
 )
 from openlit.semcov import SemanticConvetion

@@ -23,37 +20,21 @@ from openlit.semcov import SemanticConvetion
 logger = logging.getLogger(__name__)

 def async_complete(version, environment, application_name,
-
+    tracer, event_provider, pricing_info, capture_message_content, metrics, disable_metrics):
     """
-    Generates a telemetry wrapper for
-
-    Args:
-        version: Version of the monitoring package.
-        environment: Deployment environment (e.g., production, staging).
-        application_name: Name of the application using the Azure AI Inference API.
-        tracer: OpenTelemetry tracer for creating spans.
-        pricing_info: Information used for calculating the cost of Azure AI Inference usage.
-        capture_message_content: Flag indicating whether to trace the actual content.
-
-    Returns:
-        A function that wraps the chat method to add telemetry.
+    Generates a telemetry wrapper for GenAI function call
     """

     class TracedAsyncStream:
         """
-        Wrapper for streaming responses to collect
-        Wraps the response to collect message IDs and aggregated response.
-
-        This class implements the '__aiter__' and '__anext__' methods that
-        handle asynchronous streaming responses.
-
-        This class also implements '__aenter__' and '__aexit__' methods that
-        handle asynchronous context management protocol.
+        Wrapper for streaming responses to collect telemetry.
         """
+
         def __init__(
             self,
             wrapped,
             span,
+            span_name,
             kwargs,
             server_address,
             server_port,
@@ -61,12 +42,13 @@ def async_complete(version, environment, application_name,
         ):
             self.__wrapped__ = wrapped
             self._span = span
-
+            self._span_name = span_name
             self._llmresponse = ""
             self._response_id = ""
             self._response_model = ""
             self._finish_reason = ""
-            self.
+            self._input_tokens = 0
+            self._output_tokens = 0

             self._args = args
             self._kwargs = kwargs
@@ -95,197 +77,33 @@ def async_complete(version, environment, application_name,
         async def __anext__(self):
             try:
                 chunk = await self.__wrapped__.__anext__()
-
-                # Record the timestamp for the current chunk
-                self._timestamps.append(end_time)
-
-                if len(self._timestamps) == 1:
-                    # Calculate time to first chunk
-                    self._ttft = calculate_ttft(self._timestamps, self._start_time)
-
-                chunked = response_as_dict(chunk)
-                # Collect message IDs and aggregated response from events
-                if (len(chunked.get('choices')) > 0 and ('delta' in chunked.get('choices')[0] and
-                        'content' in chunked.get('choices')[0].get('delta'))):
-
-                    content = chunked.get('choices')[0].get('delta').get('content')
-                    if content:
-                        self._llmresponse += content
-                    self._response_id = chunked.get('id')
-                    self._response_model = chunked.get('model')
-                    self._finish_reason = chunked.get('choices')[0].get('finish_reason')
-                    self._system_fingerprint = chunked.get('system_fingerprint')
+                process_chunk(self, chunk)
                 return chunk
             except StopAsyncIteration:
-                # Handling exception ensure observability without disrupting operation
                 try:
-                    self.
-
-
-
-
-
-
-
-
-
-
-                    if isinstance(content, list):
-                        content_str_list = []
-                        for item in content:
-                            if item["type"] == "text":
-                                content_str_list.append(f'text: {item["text"]}')
-                            elif (item["type"] == "image_url" and
-                                    not item["image_url"]["url"].startswith("data:")):
-                                content_str_list.append(f'image_url: {item["image_url"]["url"]}')
-                        content_str = ", ".join(content_str_list)
-                        formatted_messages.append(f"{role}: {content_str}")
-                    else:
-                        formatted_messages.append(f"{role}: {content}")
-                    prompt = "\n".join(formatted_messages)
-
-                    request_model = self._kwargs.get("model", "gpt-4o")
-
-                    # Calculate tokens using input prompt and aggregated response
-                    input_tokens = general_tokens(prompt)
-                    output_tokens = general_tokens(self._llmresponse)
-
-                    # Calculate cost of the operation
-                    cost = get_chat_model_cost(request_model,
-                        pricing_info, input_tokens,
-                        output_tokens)
-
-                    # Set Span attributes (OTel Semconv)
-                    self._span.set_attribute(TELEMETRY_SDK_NAME, "openlit")
-                    self._span.set_attribute(SemanticConvetion.GEN_AI_OPERATION,
-                        SemanticConvetion.GEN_AI_OPERATION_TYPE_CHAT)
-                    self._span.set_attribute(SemanticConvetion.GEN_AI_SYSTEM,
-                        SemanticConvetion.GEN_AI_SYSTEM_AZURE_AI_INFERENCE)
-                    self._span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_MODEL,
-                        request_model)
-                    self._span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_SEED,
-                        self._kwargs.get("seed", ""))
-                    self._span.set_attribute(SemanticConvetion.SERVER_PORT,
-                        self._server_port)
-                    self._span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_FREQUENCY_PENALTY,
-                        self._kwargs.get("frequency_penalty", 0.0))
-                    self._span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_MAX_TOKENS,
-                        self._kwargs.get("max_tokens", -1))
-                    self._span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_PRESENCE_PENALTY,
-                        self._kwargs.get("presence_penalty", 0.0))
-                    self._span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_STOP_SEQUENCES,
-                        self._kwargs.get("stop", []))
-                    self._span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_TEMPERATURE,
-                        self._kwargs.get("temperature", 1.0))
-                    self._span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_TOP_P,
-                        self._kwargs.get("top_p", 1.0))
-                    self._span.set_attribute(SemanticConvetion.GEN_AI_RESPONSE_FINISH_REASON,
-                        [self._finish_reason])
-                    self._span.set_attribute(SemanticConvetion.GEN_AI_RESPONSE_ID,
-                        self._response_id)
-                    self._span.set_attribute(SemanticConvetion.GEN_AI_RESPONSE_MODEL,
-                        self._response_model)
-                    self._span.set_attribute(SemanticConvetion.GEN_AI_USAGE_INPUT_TOKENS,
-                        input_tokens)
-                    self._span.set_attribute(SemanticConvetion.GEN_AI_USAGE_OUTPUT_TOKENS,
-                        output_tokens)
-                    self._span.set_attribute(SemanticConvetion.SERVER_ADDRESS,
-                        self._server_address)
-                    self._span.set_attribute(SemanticConvetion.GEN_AI_RESPONSE_SYSTEM_FINGERPRINT,
-                        self._system_fingerprint)
-                    if isinstance(self._llmresponse, str):
-                        self._span.set_attribute(SemanticConvetion.GEN_AI_OUTPUT_TYPE,
-                            "text")
-                    else:
-                        self._span.set_attribute(SemanticConvetion.GEN_AI_OUTPUT_TYPE,
-                            "json")
-
-                    # Set Span attributes (Extra)
-                    self._span.set_attribute(DEPLOYMENT_ENVIRONMENT,
-                        environment)
-                    self._span.set_attribute(SERVICE_NAME,
-                        application_name)
-                    self._span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_IS_STREAM,
-                        True)
-                    self._span.set_attribute(SemanticConvetion.GEN_AI_USAGE_TOTAL_TOKENS,
-                        input_tokens + output_tokens)
-                    self._span.set_attribute(SemanticConvetion.GEN_AI_USAGE_COST,
-                        cost)
-                    self._span.set_attribute(SemanticConvetion.GEN_AI_SERVER_TBT,
-                        self._tbt)
-                    self._span.set_attribute(SemanticConvetion.GEN_AI_SERVER_TTFT,
-                        self._ttft)
-                    self._span.set_attribute(SemanticConvetion.GEN_AI_SDK_VERSION,
-                        version)
-                    if capture_message_content:
-                        self._span.add_event(
-                            name=SemanticConvetion.GEN_AI_CONTENT_PROMPT_EVENT,
-                            attributes={
-                                SemanticConvetion.GEN_AI_CONTENT_PROMPT: prompt,
-                            },
-                        )
-                        self._span.add_event(
-                            name=SemanticConvetion.GEN_AI_CONTENT_COMPLETION_EVENT,
-                            attributes={
-                                SemanticConvetion.GEN_AI_CONTENT_COMPLETION: self._llmresponse,
-                            },
+                    with tracer.start_as_current_span(self._span_name, kind= SpanKind.CLIENT) as self._span:
+                        process_streaming_chat_response(
+                            self,
+                            pricing_info=pricing_info,
+                            environment=environment,
+                            application_name=application_name,
+                            metrics=metrics,
+                            event_provider=event_provider,
+                            capture_message_content=capture_message_content,
+                            disable_metrics=disable_metrics,
+                            version=version
                         )
-                    self._span.set_status(Status(StatusCode.OK))
-
-                    if disable_metrics is False:
-                        attributes = create_metrics_attributes(
-                            service_name=application_name,
-                            deployment_environment=environment,
-                            operation=SemanticConvetion.GEN_AI_OPERATION_TYPE_CHAT,
-                            system=SemanticConvetion.GEN_AI_SYSTEM_AZURE_AI_INFERENCE,
-                            request_model=request_model,
-                            server_address=self._server_address,
-                            server_port=self._server_port,
-                            response_model=self._response_model,
-                        )
-
-                        metrics["genai_client_usage_tokens"].record(
-                            input_tokens + output_tokens, attributes
-                        )
-                        metrics["genai_client_operation_duration"].record(
-                            self._end_time - self._start_time, attributes
-                        )
-                        metrics["genai_server_tbt"].record(
-                            self._tbt, attributes
-                        )
-                        metrics["genai_server_ttft"].record(
-                            self._ttft, attributes
-                        )
-                        metrics["genai_requests"].add(1, attributes)
-                        metrics["genai_completion_tokens"].add(output_tokens, attributes)
-                        metrics["genai_prompt_tokens"].add(input_tokens, attributes)
-                        metrics["genai_cost"].record(cost, attributes)

                 except Exception as e:
                     handle_exception(self._span, e)
                     logger.error("Error in trace creation: %s", e)
-                finally:
-                    self._span.end()
                 raise

     async def wrapper(wrapped, instance, args, kwargs):
         """
-        Wraps the
-
-        This collects metrics such as execution time, cost, and token usage, and handles errors
-        gracefully, adding details to the trace for observability.
-
-        Args:
-            wrapped: The original 'chat.completions' method to be wrapped.
-            instance: The instance of the class where the original method is defined.
-            args: Positional arguments for the 'chat.completions' method.
-            kwargs: Keyword arguments for the 'chat.completions' method.
-
-        Returns:
-            The response from the original 'chat.completions' method.
+        Wraps the GenAI function call.
         """

-        # Check if streaming is enabled for the API call
         streaming = kwargs.get("stream", False)
         server_address, server_port = set_server_address_and_port(instance, "models.github.ai", 443)
         request_model = kwargs.get("model", "gpt-4o")
@@ -294,292 +112,33 @@ def async_complete(version, environment, application_name,

         # pylint: disable=no-else-return
         if streaming:
-            # Special handling for streaming response to accommodate the nature of data flow
             awaited_wrapped = await wrapped(*args, **kwargs)
             span = tracer.start_span(span_name, kind=SpanKind.CLIENT)

-            return TracedAsyncStream(awaited_wrapped, span, kwargs, server_address, server_port)
+            return TracedAsyncStream(awaited_wrapped, span, span_name, kwargs, server_address, server_port)

-        # Handling for non-streaming responses
         else:
-            with tracer.start_as_current_span(span_name, kind=
+            with tracer.start_as_current_span(span_name, kind=SpanKind.CLIENT) as span:
                 start_time = time.time()
                 response = await wrapped(*args, **kwargs)
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                        else:
-                            formatted_messages.append(f"{role}: {content}")
-                    prompt = "\n".join(formatted_messages)
-
-                    input_tokens = response_dict.get('usage').get('prompt_tokens')
-                    output_tokens = response_dict.get('usage').get('completion_tokens')
-
-                    # Calculate cost of the operation
-                    cost = get_chat_model_cost(request_model,
-                        pricing_info, input_tokens,
-                        output_tokens)
-
-                    # Set base span attribues (OTel Semconv)
-                    span.set_attribute(TELEMETRY_SDK_NAME, "openlit")
-                    span.set_attribute(SemanticConvetion.GEN_AI_OPERATION,
-                        SemanticConvetion.GEN_AI_OPERATION_TYPE_CHAT)
-                    span.set_attribute(SemanticConvetion.GEN_AI_SYSTEM,
-                        SemanticConvetion.GEN_AI_SYSTEM_AZURE_AI_INFERENCE)
-                    span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_MODEL,
-                        request_model)
-                    span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_SEED,
-                        kwargs.get("seed", ""))
-                    span.set_attribute(SemanticConvetion.SERVER_PORT,
-                        server_port)
-                    span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_FREQUENCY_PENALTY,
-                        kwargs.get("frequency_penalty", 0.0))
-                    span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_MAX_TOKENS,
-                        kwargs.get("max_tokens", -1))
-                    span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_PRESENCE_PENALTY,
-                        kwargs.get("presence_penalty", 0.0))
-                    span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_STOP_SEQUENCES,
-                        kwargs.get("stop", []))
-                    span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_TEMPERATURE,
-                        kwargs.get("temperature", 1.0))
-                    span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_TOP_P,
-                        kwargs.get("top_p", 1.0))
-                    span.set_attribute(SemanticConvetion.GEN_AI_RESPONSE_ID,
-                        response_dict.get("id"))
-                    span.set_attribute(SemanticConvetion.GEN_AI_RESPONSE_MODEL,
-                        response_dict.get('model'))
-                    span.set_attribute(SemanticConvetion.GEN_AI_USAGE_INPUT_TOKENS,
-                        input_tokens)
-                    span.set_attribute(SemanticConvetion.GEN_AI_USAGE_OUTPUT_TOKENS,
-                        output_tokens)
-                    span.set_attribute(SemanticConvetion.SERVER_ADDRESS,
-                        server_address)
-                    span.set_attribute(SemanticConvetion.GEN_AI_RESPONSE_SYSTEM_FINGERPRINT,
-                        response_dict.get('system_fingerprint'))
-
-                    # Set base span attribues (Extras)
-                    span.set_attribute(DEPLOYMENT_ENVIRONMENT,
-                        environment)
-                    span.set_attribute(SERVICE_NAME,
-                        application_name)
-                    span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_IS_STREAM,
-                        False)
-                    span.set_attribute(SemanticConvetion.GEN_AI_USAGE_TOTAL_TOKENS,
-                        input_tokens + output_tokens)
-                    span.set_attribute(SemanticConvetion.GEN_AI_USAGE_COST,
-                        cost)
-                    span.set_attribute(SemanticConvetion.GEN_AI_SERVER_TTFT,
-                        end_time - start_time)
-                    span.set_attribute(SemanticConvetion.GEN_AI_SDK_VERSION,
-                        version)
-                    if capture_message_content:
-                        span.add_event(
-                            name=SemanticConvetion.GEN_AI_CONTENT_PROMPT_EVENT,
-                            attributes={
-                                SemanticConvetion.GEN_AI_CONTENT_PROMPT: prompt,
-                            },
-                        )
-
-                    for i in range(kwargs.get('n',1)):
-                        span.set_attribute(SemanticConvetion.GEN_AI_RESPONSE_FINISH_REASON,
-                            [response_dict.get('choices')[i].get('finish_reason')])
-                        if capture_message_content:
-                            span.add_event(
-                                name=SemanticConvetion.GEN_AI_CONTENT_COMPLETION_EVENT,
-                                attributes={
-                                    # pylint: disable=line-too-long
-                                    SemanticConvetion.GEN_AI_CONTENT_COMPLETION: str(response_dict.get('choices')[i].get('message').get('content')),
-                                },
-                            )
-                        if kwargs.get('tools'):
-                            span.set_attribute(SemanticConvetion.GEN_AI_TOOL_CALLS,
-                                str(response_dict.get('choices')[i].get('message').get('tool_calls')))
-
-                        if isinstance(response_dict.get('choices')[i].get('message').get('content'), str):
-                            span.set_attribute(SemanticConvetion.GEN_AI_OUTPUT_TYPE,
-                                "text")
-                        elif response_dict.get('choices')[i].get('message').get('content') is not None:
-                            span.set_attribute(SemanticConvetion.GEN_AI_OUTPUT_TYPE,
-                                "json")
-
-                    span.set_status(Status(StatusCode.OK))
-
-                    if disable_metrics is False:
-                        attributes = create_metrics_attributes(
-                            service_name=application_name,
-                            deployment_environment=environment,
-                            operation=SemanticConvetion.GEN_AI_OPERATION_TYPE_CHAT,
-                            system=SemanticConvetion.GEN_AI_SYSTEM_AZURE_AI_INFERENCE,
-                            request_model=request_model,
-                            server_address=server_address,
-                            server_port=server_port,
-                            response_model=response_dict.get('model'),
-                        )
-
-                        metrics["genai_client_usage_tokens"].record(
-                            input_tokens + output_tokens, attributes
-                        )
-                        metrics["genai_client_operation_duration"].record(
-                            end_time - start_time, attributes
-                        )
-                        metrics["genai_server_ttft"].record(
-                            end_time - start_time, attributes
-                        )
-                        metrics["genai_requests"].add(1, attributes)
-                        metrics["genai_completion_tokens"].add(output_tokens, attributes)
-                        metrics["genai_prompt_tokens"].add(input_tokens, attributes)
-                        metrics["genai_cost"].record(cost, attributes)
-
-                    # Return original response
-                    return response
-
-                except Exception as e:
-                    handle_exception(span, e)
-                    logger.error("Error in trace creation: %s", e)
-
-                    # Return original response
-                    return response
-
-    return wrapper
-
-def async_embedding(version, environment, application_name,
-                    tracer, pricing_info, capture_message_content, metrics, disable_metrics):
-    """
-    Generates a telemetry wrapper for embeddings to collect metrics.
-
-    Args:
-        version: Version of the monitoring package.
-        environment: Deployment environment (e.g., production, staging).
-        application_name: Name of the application using the Azure Inference API.
-        tracer: OpenTelemetry tracer for creating spans.
-        pricing_info: Information used for calculating the cost of Azure Inference usage.
-        capture_message_content: Flag indicating whether to trace the actual content.
-
-    Returns:
-        A function that wraps the embeddings method to add telemetry.
-    """
-
-    async def wrapper(wrapped, instance, args, kwargs):
-        """
-        Wraps the 'embeddings' API call to add telemetry.
-
-        This collects metrics such as execution time, cost, and token usage, and handles errors
-        gracefully, adding details to the trace for observability.
-
-        Args:
-            wrapped: The original 'embeddings' method to be wrapped.
-            instance: The instance of the class where the original method is defined.
-            args: Positional arguments for the 'embeddings' method.
-            kwargs: Keyword arguments for the 'embeddings' method.
-
-        Returns:
-            The response from the original 'embeddings' method.
-        """
-
-        server_address, server_port = set_server_address_and_port(instance, "models.github.ai", 443)
-        request_model = kwargs.get("model", "text-embedding-ada-002")
-
-        span_name = f"{SemanticConvetion.GEN_AI_OPERATION_TYPE_EMBEDDING} {request_model}"
-
-        with tracer.start_as_current_span(span_name, kind= SpanKind.CLIENT) as span:
-            start_time = time.time()
-            response = await wrapped(*args, **kwargs)
-            end_time = time.time()
-
-            response_dict = response_as_dict(response)
-            try:
-                input_tokens = response_dict.get('usage').get('prompt_tokens')
-
-                # Calculate cost of the operation
-                cost = get_embed_model_cost(request_model,
-                    pricing_info, input_tokens)
-
-                # Set Span attributes (OTel Semconv)
-                span.set_attribute(TELEMETRY_SDK_NAME, "openlit")
-                span.set_attribute(SemanticConvetion.GEN_AI_OPERATION,
-                    SemanticConvetion.GEN_AI_OPERATION_TYPE_EMBEDDING)
-                span.set_attribute(SemanticConvetion.GEN_AI_SYSTEM,
-                    SemanticConvetion.GEN_AI_SYSTEM_AZURE_AI_INFERENCE)
-                span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_MODEL,
-                    request_model)
-                span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_ENCODING_FORMATS,
-                    [kwargs.get('encoding_format', 'float')])
-                span.set_attribute(SemanticConvetion.GEN_AI_RESPONSE_MODEL,
-                    request_model)
-                span.set_attribute(SemanticConvetion.SERVER_ADDRESS,
-                    server_address)
-                span.set_attribute(SemanticConvetion.SERVER_PORT,
-                    server_port)
-                span.set_attribute(SemanticConvetion.GEN_AI_USAGE_INPUT_TOKENS,
-                    input_tokens)
-
-                # Set Span attributes (Extras)
-                span.set_attribute(DEPLOYMENT_ENVIRONMENT,
-                    environment)
-                span.set_attribute(SERVICE_NAME,
-                    application_name)
-                span.set_attribute(SemanticConvetion.GEN_AI_USAGE_TOTAL_TOKENS,
-                    input_tokens)
-                span.set_attribute(SemanticConvetion.GEN_AI_USAGE_COST,
-                    cost)
-                span.set_attribute(SemanticConvetion.GEN_AI_SDK_VERSION,
-                    version)
-
-                if capture_message_content:
-                    span.add_event(
-                        name=SemanticConvetion.GEN_AI_CONTENT_PROMPT_EVENT,
-                        attributes={
-                            SemanticConvetion.GEN_AI_CONTENT_PROMPT: str(kwargs.get("input", "")),
-                        },
-                    )
-
-                span.set_status(Status(StatusCode.OK))
-
-                if disable_metrics is False:
-                    attributes = create_metrics_attributes(
-                        service_name=application_name,
-                        deployment_environment=environment,
-                        operation=SemanticConvetion.GEN_AI_OPERATION_TYPE_EMBEDDING,
-                        system=SemanticConvetion.GEN_AI_SYSTEM_AZURE_AI_INFERENCE,
-                        request_model=request_model,
-                        server_address=server_address,
-                        server_port=server_port,
-                        response_model=request_model,
-                    )
-                    metrics["genai_client_usage_tokens"].record(
-                        input_tokens, attributes
-                    )
-                    metrics["genai_client_operation_duration"].record(
-                        end_time - start_time, attributes
-                    )
-                    metrics["genai_requests"].add(1, attributes)
-                    metrics["genai_prompt_tokens"].add(input_tokens, attributes)
-                    metrics["genai_cost"].record(cost, attributes)
-
-                # Return original response
-                return response
-
-            except Exception as e:
-                handle_exception(span, e)
-                logger.error("Error in trace creation: %s", e)
-
-                # Return original response
-                return response
+                response = process_chat_response(
+                    response=response,
+                    request_model=request_model,
+                    pricing_info=pricing_info,
+                    server_port=server_port,
+                    server_address=server_address,
+                    environment=environment,
+                    application_name=application_name,
+                    metrics=metrics,
+                    event_provider=event_provider,
+                    start_time=start_time,
+                    span=span,
+                    capture_message_content=capture_message_content,
+                    disable_metrics=disable_metrics,
+                    version=version,
+                    **kwargs
+                )
+
+                return response

     return wrapper
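The per-chunk and per-response logic that used to be inlined above now lives in the new `openlit/instrumentation/azure_ai_inference/utils.py` (+225 lines, not reproduced in this diff). As a rough orientation only, here is a minimal sketch of what `process_chunk` plausibly does, inferred from the inline streaming code it replaces; the real helper's name is taken from the imports above, but its exact body and signature are assumptions:

```python
# Hypothetical sketch of utils.process_chunk, reconstructed from the removed
# inline logic in TracedAsyncStream.__anext__; not the actual 1.33.18 source.
import time

from openlit.__helpers import calculate_ttft, response_as_dict


def process_chunk(scope, chunk):
    """Fold one streaming chunk into the TracedAsyncStream wrapper state."""
    end_time = time.time()
    # Record the timestamp for the current chunk
    scope._timestamps.append(end_time)
    if len(scope._timestamps) == 1:
        # Time to first chunk, measured from the wrapper's start time
        scope._ttft = calculate_ttft(scope._timestamps, scope._start_time)

    chunked = response_as_dict(chunk)
    # Collect message IDs and the aggregated response from streamed events
    if (len(chunked.get('choices', [])) > 0 and
            'content' in chunked.get('choices')[0].get('delta', {})):
        content = chunked.get('choices')[0].get('delta').get('content')
        if content:
            scope._llmresponse += content
        scope._response_id = chunked.get('id')
        scope._response_model = chunked.get('model')
        scope._finish_reason = chunked.get('choices')[0].get('finish_reason')
```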