openlit 1.33.8__py3-none-any.whl → 1.33.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openlit/__helpers.py +83 -0
- openlit/__init__.py +1 -1
- openlit/instrumentation/ag2/ag2.py +2 -2
- openlit/instrumentation/ai21/__init__.py +4 -4
- openlit/instrumentation/ai21/ai21.py +370 -319
- openlit/instrumentation/ai21/async_ai21.py +371 -319
- openlit/instrumentation/anthropic/__init__.py +4 -4
- openlit/instrumentation/anthropic/anthropic.py +321 -189
- openlit/instrumentation/anthropic/async_anthropic.py +323 -190
- openlit/instrumentation/assemblyai/__init__.py +1 -1
- openlit/instrumentation/assemblyai/assemblyai.py +59 -43
- openlit/instrumentation/astra/astra.py +4 -4
- openlit/instrumentation/astra/async_astra.py +4 -4
- openlit/instrumentation/azure_ai_inference/__init__.py +4 -4
- openlit/instrumentation/azure_ai_inference/async_azure_ai_inference.py +406 -252
- openlit/instrumentation/azure_ai_inference/azure_ai_inference.py +406 -252
- openlit/instrumentation/bedrock/__init__.py +1 -1
- openlit/instrumentation/bedrock/bedrock.py +115 -58
- openlit/instrumentation/chroma/chroma.py +4 -4
- openlit/instrumentation/cohere/__init__.py +33 -10
- openlit/instrumentation/cohere/async_cohere.py +610 -0
- openlit/instrumentation/cohere/cohere.py +410 -219
- openlit/instrumentation/controlflow/controlflow.py +2 -2
- openlit/instrumentation/crawl4ai/async_crawl4ai.py +2 -2
- openlit/instrumentation/crawl4ai/crawl4ai.py +2 -2
- openlit/instrumentation/crewai/crewai.py +2 -2
- openlit/instrumentation/dynamiq/dynamiq.py +2 -2
- openlit/instrumentation/elevenlabs/async_elevenlabs.py +73 -47
- openlit/instrumentation/elevenlabs/elevenlabs.py +73 -52
- openlit/instrumentation/embedchain/embedchain.py +4 -4
- openlit/instrumentation/firecrawl/firecrawl.py +2 -2
- openlit/instrumentation/google_ai_studio/__init__.py +9 -9
- openlit/instrumentation/google_ai_studio/async_google_ai_studio.py +183 -219
- openlit/instrumentation/google_ai_studio/google_ai_studio.py +183 -220
- openlit/instrumentation/gpt4all/gpt4all.py +17 -17
- openlit/instrumentation/groq/async_groq.py +14 -14
- openlit/instrumentation/groq/groq.py +14 -14
- openlit/instrumentation/haystack/haystack.py +2 -2
- openlit/instrumentation/julep/async_julep.py +2 -2
- openlit/instrumentation/julep/julep.py +2 -2
- openlit/instrumentation/langchain/langchain.py +36 -31
- openlit/instrumentation/letta/letta.py +6 -6
- openlit/instrumentation/litellm/async_litellm.py +20 -20
- openlit/instrumentation/litellm/litellm.py +20 -20
- openlit/instrumentation/llamaindex/llamaindex.py +2 -2
- openlit/instrumentation/mem0/mem0.py +2 -2
- openlit/instrumentation/milvus/milvus.py +4 -4
- openlit/instrumentation/mistral/async_mistral.py +18 -18
- openlit/instrumentation/mistral/mistral.py +18 -18
- openlit/instrumentation/multion/async_multion.py +2 -2
- openlit/instrumentation/multion/multion.py +2 -2
- openlit/instrumentation/ollama/async_ollama.py +29 -29
- openlit/instrumentation/ollama/ollama.py +29 -29
- openlit/instrumentation/openai/__init__.py +11 -230
- openlit/instrumentation/openai/async_openai.py +434 -409
- openlit/instrumentation/openai/openai.py +415 -393
- openlit/instrumentation/phidata/phidata.py +2 -2
- openlit/instrumentation/pinecone/pinecone.py +4 -4
- openlit/instrumentation/premai/premai.py +20 -20
- openlit/instrumentation/qdrant/async_qdrant.py +4 -4
- openlit/instrumentation/qdrant/qdrant.py +4 -4
- openlit/instrumentation/reka/async_reka.py +6 -6
- openlit/instrumentation/reka/reka.py +6 -6
- openlit/instrumentation/together/async_together.py +18 -18
- openlit/instrumentation/together/together.py +18 -18
- openlit/instrumentation/transformers/transformers.py +6 -6
- openlit/instrumentation/vertexai/async_vertexai.py +53 -53
- openlit/instrumentation/vertexai/vertexai.py +53 -53
- openlit/instrumentation/vllm/vllm.py +6 -6
- openlit/otel/metrics.py +98 -7
- openlit/semcov/__init__.py +113 -80
- {openlit-1.33.8.dist-info → openlit-1.33.9.dist-info}/METADATA +1 -1
- openlit-1.33.9.dist-info/RECORD +121 -0
- {openlit-1.33.8.dist-info → openlit-1.33.9.dist-info}/WHEEL +1 -1
- openlit/instrumentation/openai/async_azure_openai.py +0 -900
- openlit/instrumentation/openai/azure_openai.py +0 -898
- openlit-1.33.8.dist-info/RECORD +0 -122
- {openlit-1.33.8.dist-info → openlit-1.33.9.dist-info}/LICENSE +0 -0
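What follows is the largest change in this release: the rewritten openlit/instrumentation/azure_ai_inference/async_azure_ai_inference.py. The same pattern repeats across the other instrumentation modules listed above: the wrapper factories drop the old `gen_ai_endpoint` argument, spans are named from the operation type plus the requested model, and metric attributes come from a shared `create_metrics_attributes` helper. A minimal illustration of the new span-naming convention, using only names that appear in the diff below:

    # Spans are now named "<operation> <model>", e.g. "chat gpt-4o",
    # rather than after a fixed gen_ai_endpoint string.
    request_model = kwargs.get("model", "gpt-4o")
    span_name = f"{SemanticConvetion.GEN_AI_OPERATION_TYPE_CHAT} {request_model}"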
openlit/instrumentation/azure_ai_inference/async_azure_ai_inference.py

@@ -1,15 +1,20 @@
-# pylint: disable=duplicate-code, broad-exception-caught, too-many-statements, unused-argument, possibly-used-before-assignment, protected-access
 """
 Module for monitoring Azure AI Inference API calls.
 """

 import logging
+import time
 from opentelemetry.trace import SpanKind, Status, StatusCode
-from opentelemetry.sdk.resources import TELEMETRY_SDK_NAME
+from opentelemetry.sdk.resources import SERVICE_NAME, TELEMETRY_SDK_NAME, DEPLOYMENT_ENVIRONMENT
 from openlit.__helpers import (
-    handle_exception,
     get_chat_model_cost,
     get_embed_model_cost,
+    handle_exception,
+    response_as_dict,
+    calculate_ttft,
+    calculate_tbt,
+    create_metrics_attributes,
+    set_server_address_and_port,
     general_tokens
 )
 from openlit.semcov import SemanticConvetion
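The new imports pull in streaming-latency helpers from openlit/__helpers.py (+83 lines in this release, whose bodies are not shown in this diff). Judging only from their call sites further down, a plausible sketch of what the two timing helpers compute is:

    # Hypothetical sketch inferred from the call sites below; the actual
    # implementations in openlit/__helpers.py may differ.
    def calculate_ttft(timestamps, start_time):
        # Time to first token: arrival of the first chunk minus request start.
        return timestamps[0] - start_time if timestamps else 0

    def calculate_tbt(timestamps):
        # Time between tokens: average gap between consecutive chunk arrivals.
        gaps = [t2 - t1 for t1, t2 in zip(timestamps, timestamps[1:])]
        return sum(gaps) / len(gaps) if gaps else 0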
@@ -17,13 +22,12 @@ from openlit.semcov import SemanticConvetion
 # Initialize logger for logging potential issues and operations
 logger = logging.getLogger(__name__)

-def async_complete(gen_ai_endpoint, version, environment, application_name,
+def async_complete(version, environment, application_name,
                    tracer, pricing_info, trace_content, metrics, disable_metrics):
     """
     Generates a telemetry wrapper for chat to collect metrics.

     Args:
-        gen_ai_endpoint: Endpoint identifier for logging and tracing.
         version: Version of the monitoring package.
         environment: Deployment environment (e.g., production, staging).
         application_name: Name of the application using the Azure AI Inference API.
@@ -35,165 +39,275 @@ def async_complete(gen_ai_endpoint, version, environment, application_name,
         A function that wraps the chat method to add telemetry.
     """

+    class TracedAsyncStream:
+        """
+        Wrapper for streaming responses to collect metrics and trace data.
+        Wraps the 'az.ai.inference.AsyncStream' response to collect message IDs and aggregated response.
+
+        This class implements the '__aiter__' and '__anext__' methods that
+        handle asynchronous streaming responses.
+
+        This class also implements '__aenter__' and '__aexit__' methods that
+        handle asynchronous context management protocol.
+        """
+        def __init__(
+                self,
+                wrapped,
+                span,
+                kwargs,
+                server_address,
+                server_port,
+                **args,
+            ):
+            self.__wrapped__ = wrapped
+            self._span = span
+            # Placeholder for aggregating streaming response
+            self._llmresponse = ""
+            self._response_id = ""
+            self._response_model = ""
+            self._finish_reason = ""
+            self._system_fingerprint = ""
+
+            self._args = args
+            self._kwargs = kwargs
+            self._start_time = time.time()
+            self._end_time = None
+            self._timestamps = []
+            self._ttft = 0
+            self._tbt = 0
+            self._server_address = server_address
+            self._server_port = server_port
+
+        async def __aenter__(self):
+            await self.__wrapped__.__aenter__()
+            return self
+
+        async def __aexit__(self, exc_type, exc_value, traceback):
+            await self.__wrapped__.__aexit__(exc_type, exc_value, traceback)
+
+        def __aiter__(self):
+            return self
+
+        async def __getattr__(self, name):
+            """Delegate attribute access to the wrapped object."""
+            return getattr(await self.__wrapped__, name)
+
+        async def __anext__(self):
+            try:
+                chunk = await self.__wrapped__.__anext__()
+                end_time = time.time()
+                # Record the timestamp for the current chunk
+                self._timestamps.append(end_time)
+
+                if len(self._timestamps) == 1:
+                    # Calculate time to first chunk
+                    self._ttft = calculate_ttft(self._timestamps, self._start_time)
+
+                chunked = response_as_dict(chunk)
+                # Collect message IDs and aggregated response from events
+                if (len(chunked.get('choices')) > 0 and ('delta' in chunked.get('choices')[0] and
+                    'content' in chunked.get('choices')[0].get('delta'))):
+
+                    content = chunked.get('choices')[0].get('delta').get('content')
+                    if content:
+                        self._llmresponse += content
+                self._response_id = chunked.get('id')
+                self._response_model = chunked.get('model')
+                self._finish_reason = chunked.get('choices')[0].get('finish_reason')
+                self._system_fingerprint = chunked.get('system_fingerprint')
+                return chunk
+            except StopAsyncIteration:
+                # Handling exception ensure observability without disrupting operation
+                try:
+                    self._end_time = time.time()
+                    if len(self._timestamps) > 1:
+                        self._tbt = calculate_tbt(self._timestamps)
+
+                    # Format 'messages' into a single string
+                    message_prompt = self._kwargs.get("messages", "")
+                    formatted_messages = []
+                    for message in message_prompt:
+                        role = message["role"]
+                        content = message["content"]
+
+                        if isinstance(content, list):
+                            content_str_list = []
+                            for item in content:
+                                if item["type"] == "text":
+                                    content_str_list.append(f'text: {item["text"]}')
+                                elif (item["type"] == "image_url" and
+                                      not item["image_url"]["url"].startswith("data:")):
+                                    content_str_list.append(f'image_url: {item["image_url"]["url"]}')
+                            content_str = ", ".join(content_str_list)
+                            formatted_messages.append(f"{role}: {content_str}")
+                        else:
+                            formatted_messages.append(f"{role}: {content}")
+                    prompt = "\n".join(formatted_messages)
+
+                    request_model = self._kwargs.get("model", "gpt-4o")
+
+                    # Calculate tokens using input prompt and aggregated response
+                    input_tokens = general_tokens(prompt)
+                    output_tokens = general_tokens(self._llmresponse)
+
+                    # Calculate cost of the operation
+                    cost = get_chat_model_cost(request_model,
+                                               pricing_info, input_tokens,
+                                               output_tokens)
+
+                    # Set Span attributes (OTel Semconv)
+                    self._span.set_attribute(TELEMETRY_SDK_NAME, "openlit")
+                    self._span.set_attribute(SemanticConvetion.GEN_AI_OPERATION,
+                                             SemanticConvetion.GEN_AI_OPERATION_TYPE_CHAT)
+                    self._span.set_attribute(SemanticConvetion.GEN_AI_SYSTEM,
+                                             SemanticConvetion.GEN_AI_SYSTEM_AZURE_AI_INFERENCE)
+                    self._span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_MODEL,
+                                             request_model)
+                    self._span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_SEED,
+                                             self._kwargs.get("seed", ""))
+                    self._span.set_attribute(SemanticConvetion.SERVER_PORT,
+                                             self._server_port)
+                    self._span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_FREQUENCY_PENALTY,
+                                             self._kwargs.get("frequency_penalty", 0.0))
+                    self._span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_MAX_TOKENS,
+                                             self._kwargs.get("max_tokens", -1))
+                    self._span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_PRESENCE_PENALTY,
+                                             self._kwargs.get("presence_penalty", 0.0))
+                    self._span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_STOP_SEQUENCES,
+                                             self._kwargs.get("stop", []))
+                    self._span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_TEMPERATURE,
+                                             self._kwargs.get("temperature", 1.0))
+                    self._span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_TOP_P,
+                                             self._kwargs.get("top_p", 1.0))
+                    self._span.set_attribute(SemanticConvetion.GEN_AI_RESPONSE_FINISH_REASON,
+                                             [self._finish_reason])
+                    self._span.set_attribute(SemanticConvetion.GEN_AI_RESPONSE_ID,
+                                             self._response_id)
+                    self._span.set_attribute(SemanticConvetion.GEN_AI_RESPONSE_MODEL,
+                                             self._response_model)
+                    self._span.set_attribute(SemanticConvetion.GEN_AI_USAGE_INPUT_TOKENS,
+                                             input_tokens)
+                    self._span.set_attribute(SemanticConvetion.GEN_AI_USAGE_OUTPUT_TOKENS,
+                                             output_tokens)
+                    self._span.set_attribute(SemanticConvetion.SERVER_ADDRESS,
+                                             self._server_address)
+                    self._span.set_attribute(SemanticConvetion.GEN_AI_OPENAI_RESPONSE_SYSTEM_FINGERPRINT,
+                                             self._system_fingerprint)
+                    if isinstance(self._llmresponse, str):
+                        self._span.set_attribute(SemanticConvetion.GEN_AI_OUTPUT_TYPE,
+                                                 "text")
+                    else:
+                        self._span.set_attribute(SemanticConvetion.GEN_AI_OUTPUT_TYPE,
+                                                 "json")
+
+                    # Set Span attributes (Extra)
+                    self._span.set_attribute(DEPLOYMENT_ENVIRONMENT,
+                                             environment)
+                    self._span.set_attribute(SERVICE_NAME,
+                                             application_name)
+                    self._span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_IS_STREAM,
+                                             True)
+                    self._span.set_attribute(SemanticConvetion.GEN_AI_USAGE_TOTAL_TOKENS,
+                                             input_tokens + output_tokens)
+                    self._span.set_attribute(SemanticConvetion.GEN_AI_USAGE_COST,
+                                             cost)
+                    self._span.set_attribute(SemanticConvetion.GEN_AI_SERVER_TBT,
+                                             self._tbt)
+                    self._span.set_attribute(SemanticConvetion.GEN_AI_SERVER_TTFT,
+                                             self._ttft)
+                    self._span.set_attribute(SemanticConvetion.GEN_AI_SDK_VERSION,
+                                             version)
+                    if trace_content:
+                        self._span.add_event(
+                            name=SemanticConvetion.GEN_AI_CONTENT_PROMPT_EVENT,
+                            attributes={
+                                SemanticConvetion.GEN_AI_CONTENT_PROMPT: prompt,
+                            },
+                        )
+                        self._span.add_event(
+                            name=SemanticConvetion.GEN_AI_CONTENT_COMPLETION_EVENT,
+                            attributes={
+                                SemanticConvetion.GEN_AI_CONTENT_COMPLETION: self._llmresponse,
+                            },
+                        )
+                    self._span.set_status(Status(StatusCode.OK))
+
+                    if disable_metrics is False:
+                        attributes = create_metrics_attributes(
+                            service_name=application_name,
+                            deployment_environment=environment,
+                            operation=SemanticConvetion.GEN_AI_OPERATION_TYPE_CHAT,
+                            system=SemanticConvetion.GEN_AI_SYSTEM_AZURE_AI_INFERENCE,
+                            request_model=request_model,
+                            server_address=self._server_address,
+                            server_port=self._server_port,
+                            response_model=self._response_model,
+                        )
+
+                        metrics["genai_client_usage_tokens"].record(
+                            input_tokens + output_tokens, attributes
+                        )
+                        metrics["genai_client_operation_duration"].record(
+                            self._end_time - self._start_time, attributes
+                        )
+                        metrics["genai_server_tbt"].record(
+                            self._tbt, attributes
+                        )
+                        metrics["genai_server_ttft"].record(
+                            self._ttft, attributes
+                        )
+                        metrics["genai_requests"].add(1, attributes)
+                        metrics["genai_completion_tokens"].add(output_tokens, attributes)
+                        metrics["genai_prompt_tokens"].add(input_tokens, attributes)
+                        metrics["genai_cost"].record(cost, attributes)
+
+                except Exception as e:
+                    handle_exception(self._span, e)
+                    logger.error("Error in trace creation: %s", e)
+                finally:
+                    self._span.end()
+                raise
+
     async def wrapper(wrapped, instance, args, kwargs):
         """
-        Wraps the 'chat' API call to add telemetry.
-
+        Wraps the 'chat.completions' API call to add telemetry.
+
         This collects metrics such as execution time, cost, and token usage, and handles errors
         gracefully, adding details to the trace for observability.

         Args:
-            wrapped: The original 'chat' method to be wrapped.
+            wrapped: The original 'chat.completions' method to be wrapped.
             instance: The instance of the class where the original method is defined.
-            args: Positional arguments for the 'chat' method.
-            kwargs: Keyword arguments for the 'chat' method.
+            args: Positional arguments for the 'chat.completions' method.
+            kwargs: Keyword arguments for the 'chat.completions' method.

         Returns:
-            The response from the original 'chat' method.
+            The response from the original 'chat.completions' method.
         """
-        # pylint: disable=no-else-return
-        if kwargs.get("stream", False) is True:
-            # Special handling for streaming response to accommodate the nature of data flow
-            async def stream_generator():
-                with tracer.start_as_current_span(gen_ai_endpoint, kind= SpanKind.CLIENT) as span:
-                    # Placeholder for aggregating streaming response
-                    llmresponse = ""
-
-                    # Loop through streaming events capturing relevant details
-                    async for chunk in await wrapped(*args, **kwargs):
-                        if chunk.choices:
-                            # Collect message IDs and aggregated response from events
-                            content = chunk.choices[0].delta.content
-                            if content:
-                                llmresponse += content
-
-                        yield chunk
-                    response_id = chunk.id
-
-                    # Handling exception ensure observability without disrupting operation
-                    try:
-                        # Format 'messages' into a single string
-                        message_prompt = kwargs.get("messages", "")
-                        formatted_messages = []
-                        for message in message_prompt:
-                            role = message["role"]
-                            content = message["content"]
-
-                            if isinstance(content, list):
-                                content_str = ", ".join(
-                                    # pylint: disable=line-too-long
-                                    f'{item["type"]}: {item["text"] if "text" in item else item["image_url"]}'
-                                    if "type" in item else f'text: {item["text"]}'
-                                    for item in content
-                                )
-                                formatted_messages.append(f"{role}: {content_str}")
-                            else:
-                                formatted_messages.append(f"{role}: {content}")
-                        prompt = "\n".join(formatted_messages)
-
-                        model = kwargs.get("model", "phi3-mini-4k")
-
-                        # Calculate tokens using input prompt and aggregated response
-                        input_tokens = general_tokens(prompt)
-                        output_tokens = general_tokens(llmresponse)
-
-                        total_tokens = input_tokens + output_tokens
-                        # Calculate cost of the operation
-                        cost = get_chat_model_cost(model,
-                                                   pricing_info, input_tokens,
-                                                   output_tokens)
-
-                        # Set base span attribues
-                        span.set_attribute(TELEMETRY_SDK_NAME, "openlit")
-                        span.set_attribute(SemanticConvetion.GEN_AI_SYSTEM,
-                                           SemanticConvetion.GEN_AI_SYSTEM_AZURE_AI_INFERENCE)
-                        span.set_attribute(SemanticConvetion.GEN_AI_TYPE,
-                                           SemanticConvetion.GEN_AI_TYPE_CHAT)
-                        span.set_attribute(SemanticConvetion.GEN_AI_ENDPOINT,
-                                           gen_ai_endpoint)
-                        span.set_attribute(SemanticConvetion.GEN_AI_ENVIRONMENT,
-                                           environment)
-                        span.set_attribute(SemanticConvetion.GEN_AI_APPLICATION_NAME,
-                                           application_name)
-                        span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_MODEL,
-                                           model)
-                        span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_IS_STREAM,
-                                           True)
-                        span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_USER,
-                                           kwargs.get("user", ""))
-                        span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_TOP_P,
-                                           kwargs.get("top_p", 1.0))
-                        span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_MAX_TOKENS,
-                                           kwargs.get("max_tokens", -1))
-                        span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_TEMPERATURE,
-                                           kwargs.get("temperature", 1.0))
-                        span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_PRESENCE_PENALTY,
-                                           kwargs.get("presence_penalty", 0.0))
-                        span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_FREQUENCY_PENALTY,
-                                           kwargs.get("frequency_penalty", 0.0))
-                        span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_SEED,
-                                           kwargs.get("seed", ""))
-                        span.set_attribute(SemanticConvetion.GEN_AI_RESPONSE_ID,
-                                           response_id)
-                        span.set_attribute(SemanticConvetion.GEN_AI_USAGE_PROMPT_TOKENS,
-                                           input_tokens)
-                        span.set_attribute(SemanticConvetion.GEN_AI_USAGE_COMPLETION_TOKENS,
-                                           output_tokens)
-                        span.set_attribute(SemanticConvetion.GEN_AI_USAGE_TOTAL_TOKENS,
-                                           total_tokens)
-                        span.set_attribute(SemanticConvetion.GEN_AI_USAGE_COST,
-                                           cost)

+        # Check if streaming is enabled for the API call
+        streaming = kwargs.get("stream", False)
+        server_address, server_port = set_server_address_and_port(instance, "models.github.ai", 443)
+        request_model = kwargs.get("model", "gpt-4o")

-
-                        span.add_event(
-                            name=SemanticConvetion.GEN_AI_CONTENT_PROMPT_EVENT,
-                            attributes={
-                                SemanticConvetion.GEN_AI_CONTENT_PROMPT: prompt,
-                            },
-                        )
-                        span.add_event(
-                            name=SemanticConvetion.GEN_AI_CONTENT_COMPLETION_EVENT,
-                            attributes={
-                                SemanticConvetion.GEN_AI_CONTENT_COMPLETION: llmresponse,
-                            },
-                        )
+        span_name = f"{SemanticConvetion.GEN_AI_OPERATION_TYPE_CHAT} {request_model}"

-                        span.set_status(Status(StatusCode.OK))
-
-                        if disable_metrics is False:
-                            attributes = {
-                                TELEMETRY_SDK_NAME:
-                                    "openlit",
-                                SemanticConvetion.GEN_AI_APPLICATION_NAME:
-                                    application_name,
-                                SemanticConvetion.GEN_AI_SYSTEM:
-                                    SemanticConvetion.GEN_AI_SYSTEM_AZURE_AI_INFERENCE,
-                                SemanticConvetion.GEN_AI_ENVIRONMENT:
-                                    environment,
-                                SemanticConvetion.GEN_AI_TYPE:
-                                    SemanticConvetion.GEN_AI_TYPE_CHAT,
-                                SemanticConvetion.GEN_AI_REQUEST_MODEL:
-                                    model
-                            }
-
-                            metrics["genai_requests"].add(1, attributes)
-                            metrics["genai_total_tokens"].add(
-                                total_tokens, attributes
-                            )
-                            metrics["genai_completion_tokens"].add(output_tokens, attributes)
-                            metrics["genai_prompt_tokens"].add(input_tokens, attributes)
-                            metrics["genai_cost"].record(cost, attributes)
+        # pylint: disable=no-else-return
+        if streaming:
+            # Special handling for streaming response to accommodate the nature of data flow
+            awaited_wrapped = await wrapped(*args, **kwargs)
+            span = tracer.start_span(span_name, kind=SpanKind.CLIENT)

-                    except Exception as e:
-                        handle_exception(span, e)
-                        logger.error("Error in trace creation: %s", e)
+            return TracedAsyncStream(awaited_wrapped, span, kwargs, server_address, server_port)

-            return stream_generator()
+        # Handling for non-streaming responses
         else:
-            with tracer.start_as_current_span(gen_ai_endpoint, kind= SpanKind.CLIENT) as span:
-
+            with tracer.start_as_current_span(span_name, kind= SpanKind.CLIENT) as span:
+                start_time = time.time()
                 response = await wrapped(*args, **kwargs)
+                end_time = time.time()
+
+                response_dict = response_as_dict(response)

                 try:
                     # Format 'messages' into a single string
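The TracedAsyncStream class added above is an async-iterator proxy: telemetry can only be finalized once the provider's stream is exhausted, so the span is ended inside the StopAsyncIteration handler rather than when the call returns. A standalone sketch of the pattern (illustrative names, not openlit code):

    import time

    class TimedAsyncStream:
        # Proxy an async stream and record per-chunk arrival times.
        def __init__(self, wrapped):
            self.__wrapped__ = wrapped
            self._start = time.time()
            self._timestamps = []

        def __aiter__(self):
            return self

        async def __anext__(self):
            try:
                chunk = await self.__wrapped__.__anext__()
                self._timestamps.append(time.time())  # arrival of this chunk
                return chunk
            except StopAsyncIteration:
                # The stream is done: the last chance to compute TTFT/TBT and
                # close the span before re-raising to end the caller's loop.
                raise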
@@ -205,7 +319,6 @@ def async_complete(gen_ai_endpoint, version, environment, application_name,

                     if isinstance(content, list):
                         content_str = ", ".join(
-                            # pylint: disable=line-too-long
                             f'{item["type"]}: {item["text"] if "text" in item else item["image_url"]}'
                             if "type" in item else f'text: {item["text"]}'
                             for item in content
@@ -215,43 +328,66 @@ def async_complete(gen_ai_endpoint, version, environment, application_name,
                         formatted_messages.append(f"{role}: {content}")
                     prompt = "\n".join(formatted_messages)

-                    model = kwargs.get("model", "phi3-mini-4k")
+                    input_tokens = response_dict.get('usage').get('prompt_tokens')
+                    output_tokens = response_dict.get('usage').get('completion_tokens')

-                    # Set base span attribues
+                    # Calculate cost of the operation
+                    cost = get_chat_model_cost(request_model,
+                                               pricing_info, input_tokens,
+                                               output_tokens)
+
+                    # Set base span attribues (OTel Semconv)
                     span.set_attribute(TELEMETRY_SDK_NAME, "openlit")
+                    span.set_attribute(SemanticConvetion.GEN_AI_OPERATION,
+                                       SemanticConvetion.GEN_AI_OPERATION_TYPE_CHAT)
                     span.set_attribute(SemanticConvetion.GEN_AI_SYSTEM,
                                        SemanticConvetion.GEN_AI_SYSTEM_AZURE_AI_INFERENCE)
-                    span.set_attribute(SemanticConvetion.GEN_AI_TYPE,
-                                       SemanticConvetion.GEN_AI_TYPE_CHAT)
-                    span.set_attribute(SemanticConvetion.GEN_AI_ENDPOINT,
-                                       gen_ai_endpoint)
-                    span.set_attribute(SemanticConvetion.GEN_AI_ENVIRONMENT,
-                                       environment)
-                    span.set_attribute(SemanticConvetion.GEN_AI_APPLICATION_NAME,
-                                       application_name)
                     span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_MODEL,
-                                       model)
-                    span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_IS_STREAM,
-                                       False)
-                    span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_USER,
-                                       kwargs.get("user", ""))
-                    span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_TOP_P,
-                                       kwargs.get("top_p", 1.0))
+                                       request_model)
+                    span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_SEED,
+                                       kwargs.get("seed", ""))
+                    span.set_attribute(SemanticConvetion.SERVER_PORT,
+                                       server_port)
+                    span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_FREQUENCY_PENALTY,
+                                       kwargs.get("frequency_penalty", 0.0))
                     span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_MAX_TOKENS,
                                        kwargs.get("max_tokens", -1))
-                    span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_TEMPERATURE,
-                                       kwargs.get("temperature", 1.0))
                     span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_PRESENCE_PENALTY,
                                        kwargs.get("presence_penalty", 0.0))
-                    span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_FREQUENCY_PENALTY,
-                                       kwargs.get("frequency_penalty", 0.0))
-                    span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_SEED,
-                                       kwargs.get("seed", ""))
+                    span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_STOP_SEQUENCES,
+                                       kwargs.get("stop", []))
+                    span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_TEMPERATURE,
+                                       kwargs.get("temperature", 1.0))
+                    span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_TOP_P,
+                                       kwargs.get("top_p", 1.0))
                     span.set_attribute(SemanticConvetion.GEN_AI_RESPONSE_ID,
-                                       response.id)
-                    span.set_attribute(SemanticConvetion.
-
+                                       response_dict.get("id"))
+                    span.set_attribute(SemanticConvetion.GEN_AI_RESPONSE_MODEL,
+                                       response_dict.get('model'))
+                    span.set_attribute(SemanticConvetion.GEN_AI_USAGE_INPUT_TOKENS,
+                                       input_tokens)
+                    span.set_attribute(SemanticConvetion.GEN_AI_USAGE_OUTPUT_TOKENS,
+                                       output_tokens)
+                    span.set_attribute(SemanticConvetion.SERVER_ADDRESS,
+                                       server_address)
+                    span.set_attribute(SemanticConvetion.GEN_AI_OPENAI_RESPONSE_SYSTEM_FINGERPRINT,
+                                       response_dict.get('system_fingerprint'))

+                    # Set base span attribues (Extras)
+                    span.set_attribute(DEPLOYMENT_ENVIRONMENT,
+                                       environment)
+                    span.set_attribute(SERVICE_NAME,
+                                       application_name)
+                    span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_IS_STREAM,
+                                       False)
+                    span.set_attribute(SemanticConvetion.GEN_AI_USAGE_TOTAL_TOKENS,
+                                       input_tokens + output_tokens)
+                    span.set_attribute(SemanticConvetion.GEN_AI_USAGE_COST,
+                                       cost)
+                    span.set_attribute(SemanticConvetion.GEN_AI_SERVER_TTFT,
+                                       end_time - start_time)
+                    span.set_attribute(SemanticConvetion.GEN_AI_SDK_VERSION,
+                                       version)
                     if trace_content:
                         span.add_event(
                             name=SemanticConvetion.GEN_AI_CONTENT_PROMPT_EVENT,
@@ -259,49 +395,53 @@ def async_complete(gen_ai_endpoint, version, environment, application_name,
                                 SemanticConvetion.GEN_AI_CONTENT_PROMPT: prompt,
                             },
                         )
-                        span.add_event(
-                            name=SemanticConvetion.GEN_AI_CONTENT_COMPLETION_EVENT,
-                            attributes={
-                                SemanticConvetion.GEN_AI_CONTENT_COMPLETION: response.choices[0].message.content,
-                            },
-                        )

-
-
-
-
-
-
+                    for i in range(kwargs.get('n',1)):
+                        span.set_attribute(SemanticConvetion.GEN_AI_RESPONSE_FINISH_REASON,
+                                           [response_dict.get('choices')[i].get('finish_reason')])
+                        if trace_content:
+                            span.add_event(
+                                name=SemanticConvetion.GEN_AI_CONTENT_COMPLETION_EVENT,
+                                attributes={
+                                    # pylint: disable=line-too-long
+                                    SemanticConvetion.GEN_AI_CONTENT_COMPLETION: str(response_dict.get('choices')[i].get('message').get('content')),
+                                },
+                            )
+                        if kwargs.get('tools'):
+                            span.set_attribute(SemanticConvetion.GEN_AI_TOOL_CALLS,
+                                               str(response_dict.get('choices')[i].get('message').get('tool_calls')))

-
-
-
-
-
-
-                    span.set_attribute(SemanticConvetion.GEN_AI_USAGE_COST,
-                                       cost)
+                        if isinstance(response_dict.get('choices')[i].get('message').get('content'), str):
+                            span.set_attribute(SemanticConvetion.GEN_AI_OUTPUT_TYPE,
+                                               "text")
+                        elif response_dict.get('choices')[i].get('message').get('content') is not None:
+                            span.set_attribute(SemanticConvetion.GEN_AI_OUTPUT_TYPE,
+                                               "json")

                     span.set_status(Status(StatusCode.OK))

                     if disable_metrics is False:
-                        attributes = {
-                            TELEMETRY_SDK_NAME:
-                                "openlit",
-                            SemanticConvetion.GEN_AI_APPLICATION_NAME:
-                                application_name,
-                            SemanticConvetion.GEN_AI_SYSTEM:
-                                SemanticConvetion.GEN_AI_SYSTEM_AZURE_AI_INFERENCE,
-                            SemanticConvetion.GEN_AI_ENVIRONMENT:
-                                environment,
-                            SemanticConvetion.GEN_AI_TYPE:
-                                SemanticConvetion.GEN_AI_TYPE_CHAT,
-                            SemanticConvetion.GEN_AI_REQUEST_MODEL:
-                                model
-                        }
+                        attributes = create_metrics_attributes(
+                            service_name=application_name,
+                            deployment_environment=environment,
+                            operation=SemanticConvetion.GEN_AI_OPERATION_TYPE_CHAT,
+                            system=SemanticConvetion.GEN_AI_SYSTEM_AZURE_AI_INFERENCE,
+                            request_model=request_model,
+                            server_address=server_address,
+                            server_port=server_port,
+                            response_model=response_dict.get('model'),
+                        )

+                        metrics["genai_client_usage_tokens"].record(
+                            input_tokens + output_tokens, attributes
+                        )
+                        metrics["genai_client_operation_duration"].record(
+                            end_time - start_time, attributes
+                        )
+                        metrics["genai_server_ttft"].record(
+                            end_time - start_time, attributes
+                        )
                         metrics["genai_requests"].add(1, attributes)
-                        metrics["genai_total_tokens"].add(total_tokens, attributes)
                         metrics["genai_completion_tokens"].add(output_tokens, attributes)
                         metrics["genai_prompt_tokens"].add(input_tokens, attributes)
                         metrics["genai_cost"].record(cost, attributes)
@@ -318,18 +458,17 @@ def async_complete(gen_ai_endpoint, version, environment, application_name,

     return wrapper

-def async_embedding(gen_ai_endpoint, version, environment, application_name,
+def async_embedding(version, environment, application_name,
                     tracer, pricing_info, trace_content, metrics, disable_metrics):
     """
     Generates a telemetry wrapper for embeddings to collect metrics.

     Args:
-        gen_ai_endpoint: Endpoint identifier for logging and tracing.
         version: Version of the monitoring package.
         environment: Deployment environment (e.g., production, staging).
-        application_name: Name of the application using the Azure
+        application_name: Name of the application using the Azure Inference API.
         tracer: OpenTelemetry tracer for creating spans.
-        pricing_info: Information used for calculating the cost of Azure
+        pricing_info: Information used for calculating the cost of Azure Inference usage.
         trace_content: Flag indicating whether to trace the actual content.

     Returns:
@@ -353,69 +492,84 @@ def async_embedding(gen_ai_endpoint, version, environment, application_name,
            The response from the original 'embeddings' method.
        """

-        with tracer.start_as_current_span(gen_ai_endpoint, kind= SpanKind.CLIENT) as span:
+        server_address, server_port = set_server_address_and_port(instance, "models.github.ai", 443)
+        request_model = kwargs.get("model", "text-embedding-ada-002")
+
+        span_name = f"{SemanticConvetion.GEN_AI_OPERATION_TYPE_EMBEDDING} {request_model}"
+
+        with tracer.start_as_current_span(span_name, kind= SpanKind.CLIENT) as span:
+            start_time = time.time()
             response = await wrapped(*args, **kwargs)
+            end_time = time.time()

+            response_dict = response_as_dict(response)
             try:
+                input_tokens = response_dict.get('usage').get('prompt_tokens')
+
                 # Calculate cost of the operation
-                cost = get_embed_model_cost(kwargs.get("model", "text-embedding-ada-002"),
-                                            pricing_info, response.usage.prompt_tokens)
+                cost = get_embed_model_cost(request_model,
+                                            pricing_info, input_tokens)

-                # Set Span attributes
+                # Set Span attributes (OTel Semconv)
                 span.set_attribute(TELEMETRY_SDK_NAME, "openlit")
+                span.set_attribute(SemanticConvetion.GEN_AI_OPERATION,
+                                   SemanticConvetion.GEN_AI_OPERATION_TYPE_EMBEDDING)
                 span.set_attribute(SemanticConvetion.GEN_AI_SYSTEM,
                                    SemanticConvetion.GEN_AI_SYSTEM_AZURE_AI_INFERENCE)
-                span.set_attribute(SemanticConvetion.GEN_AI_TYPE,
-                                   SemanticConvetion.GEN_AI_TYPE_EMBEDDING)
-                span.set_attribute(SemanticConvetion.GEN_AI_ENDPOINT,
-                                   gen_ai_endpoint)
-                span.set_attribute(SemanticConvetion.GEN_AI_ENVIRONMENT,
+                span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_MODEL,
+                                   request_model)
+                span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_ENCODING_FORMATS,
+                                   [kwargs.get('encoding_format', 'float')])
+                span.set_attribute(SemanticConvetion.GEN_AI_RESPONSE_MODEL,
+                                   request_model)
+                span.set_attribute(SemanticConvetion.SERVER_ADDRESS,
+                                   server_address)
+                span.set_attribute(SemanticConvetion.SERVER_PORT,
+                                   server_port)
+                span.set_attribute(SemanticConvetion.GEN_AI_USAGE_INPUT_TOKENS,
+                                   input_tokens)
+
+                # Set Span attributes (Extras)
+                span.set_attribute(DEPLOYMENT_ENVIRONMENT,
                                    environment)
-                span.set_attribute(SemanticConvetion.GEN_AI_APPLICATION_NAME,
+                span.set_attribute(SERVICE_NAME,
                                    application_name)
-                span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_MODEL,
-                                   kwargs.get("model", "text-embedding-ada-002"))
-                span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_EMBEDDING_FORMAT,
-                                   kwargs.get("encoding_format", "float"))
-                span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_EMBEDDING_DIMENSION,
-                                   kwargs.get("dimensions", ""))
-                span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_USER,
-                                   kwargs.get("user", ""))
-                span.set_attribute(SemanticConvetion.GEN_AI_USAGE_PROMPT_TOKENS,
-                                   response.usage.prompt_tokens)
                 span.set_attribute(SemanticConvetion.GEN_AI_USAGE_TOTAL_TOKENS,
-                                   response.usage.total_tokens)
+                                   input_tokens)
                 span.set_attribute(SemanticConvetion.GEN_AI_USAGE_COST,
                                    cost)
+                span.set_attribute(SemanticConvetion.GEN_AI_SDK_VERSION,
+                                   version)
+
                 if trace_content:
                     span.add_event(
                         name=SemanticConvetion.GEN_AI_CONTENT_PROMPT_EVENT,
                         attributes={
-                            SemanticConvetion.GEN_AI_CONTENT_PROMPT: kwargs.get("input", ""),
+                            SemanticConvetion.GEN_AI_CONTENT_PROMPT: str(kwargs.get("input", "")),
                         },
                     )

                 span.set_status(Status(StatusCode.OK))

                 if disable_metrics is False:
-                    attributes = {
-                        TELEMETRY_SDK_NAME:
-                            "openlit",
-                        SemanticConvetion.GEN_AI_APPLICATION_NAME:
-                            application_name,
-                        SemanticConvetion.GEN_AI_SYSTEM:
-                            SemanticConvetion.GEN_AI_SYSTEM_AZURE_AI_INFERENCE,
-                        SemanticConvetion.GEN_AI_ENVIRONMENT:
-                            environment,
-                        SemanticConvetion.GEN_AI_TYPE:
-                            SemanticConvetion.GEN_AI_TYPE_EMBEDDING,
-                        SemanticConvetion.GEN_AI_REQUEST_MODEL:
-                            kwargs.get("model", "text-embedding-ada-002")
-                    }
-
+                    attributes = create_metrics_attributes(
+                        service_name=application_name,
+                        deployment_environment=environment,
+                        operation=SemanticConvetion.GEN_AI_OPERATION_TYPE_EMBEDDING,
+                        system=SemanticConvetion.GEN_AI_SYSTEM_AZURE_AI_INFERENCE,
+                        request_model=request_model,
+                        server_address=server_address,
+                        server_port=server_port,
+                        response_model=request_model,
+                    )
+                    metrics["genai_client_usage_tokens"].record(
+                        input_tokens, attributes
+                    )
+                    metrics["genai_client_operation_duration"].record(
+                        end_time - start_time, attributes
+                    )
                     metrics["genai_requests"].add(1, attributes)
-                    metrics["genai_total_tokens"].add(response.usage.total_tokens, attributes)
-                    metrics["genai_prompt_tokens"].add(response.usage.prompt_tokens, attributes)
+                    metrics["genai_prompt_tokens"].add(input_tokens, attributes)
                    metrics["genai_cost"].record(cost, attributes)

            # Return original response
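Neither wrapper factory is invoked directly; the companion openlit/instrumentation/azure_ai_inference/__init__.py (+4 −4 in the file list above) attaches them to the client methods. openlit instrumentors typically do this with wrapt's wrap_function_wrapper; the module and method names below are assumptions for illustration, not taken from this diff:

    from wrapt import wrap_function_wrapper

    # Assumed target: the async Azure AI Inference chat client.
    wrap_function_wrapper(
        "azure.ai.inference.aio",
        "ChatCompletionsClient.complete",
        async_complete(version, environment, application_name,
                       tracer, pricing_info, trace_content,
                       metrics, disable_metrics),
    )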