openlit 1.34.13__py3-none-any.whl → 1.34.15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openlit/instrumentation/litellm/__init__.py +7 -6
- openlit/instrumentation/litellm/async_litellm.py +89 -493
- openlit/instrumentation/litellm/litellm.py +87 -491
- openlit/instrumentation/litellm/utils.py +288 -0
- openlit/instrumentation/transformers/__init__.py +12 -5
- openlit/instrumentation/transformers/transformers.py +21 -28
- openlit/instrumentation/transformers/utils.py +126 -110
- {openlit-1.34.13.dist-info → openlit-1.34.15.dist-info}/METADATA +1 -1
- {openlit-1.34.13.dist-info → openlit-1.34.15.dist-info}/RECORD +11 -10
- {openlit-1.34.13.dist-info → openlit-1.34.15.dist-info}/LICENSE +0 -0
- {openlit-1.34.13.dist-info → openlit-1.34.15.dist-info}/WHEEL +0 -0
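The hunks below are from `openlit/instrumentation/litellm/litellm.py`, the synchronous wrapper; `async_litellm.py` receives the equivalent change. The release is essentially a refactor: the span-attribute, cost, and metrics bookkeeping that 1.34.13 inlined in `completion()` and `embedding()` moves into the new shared module `openlit/instrumentation/litellm/utils.py` (+288 lines, not shown in this section). Only the helper names and call signatures appear in the diff; a minimal sketch of the shape they imply (bodies hypothetical) looks like this:

```python
# Hypothetical sketch of openlit/instrumentation/litellm/utils.py.
# The function names and signatures come from the diff's call sites;
# the bodies are illustrative, not the actual 1.34.15 implementation.


def process_chunk(scope, chunk):
    """Accumulate streaming state on the TracedSyncStream instance.

    The fields mirror the defaults added to __init__ in this release
    (_llmresponse, _response_id, _response_model, _finish_reason, ...).
    """
    chunked = chunk if isinstance(chunk, dict) else chunk.model_dump()
    choices = chunked.get("choices") or []
    if choices:
        delta = choices[0].get("delta") or {}
        if delta.get("content"):
            scope._llmresponse += delta["content"]
        if choices[0].get("finish_reason"):
            scope._finish_reason = choices[0]["finish_reason"]
    scope._response_id = chunked.get("id") or scope._response_id
    scope._response_model = chunked.get("model") or scope._response_model


def process_chat_response(response, request_model, pricing_info, server_port,
                          server_address, environment, application_name,
                          metrics, start_time, span, capture_message_content,
                          disable_metrics, version, **kwargs):
    """Centralize the span attributes, cost calculation, and metric
    recording that 1.34.13 repeated at every call site, then return the
    original response unchanged. (The real code presumably uses the
    SemanticConvention constants rather than the literal keys below.)"""
    span.set_attribute("gen_ai.request.model", request_model)
    response_dict = (response.model_dump()
                     if hasattr(response, "model_dump") else response)
    usage = response_dict.get("usage") or {}
    span.set_attribute("gen_ai.usage.input_tokens", usage.get("prompt_tokens", 0))
    span.set_attribute("gen_ai.usage.output_tokens", usage.get("completion_tokens", 0))
    return response
```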
@@ -1,57 +1,37 @@
 """
-Module for monitoring LiteLLM calls.
+Module for monitoring LiteLLM API calls.
 """

-import logging
 import time
-from opentelemetry.trace import SpanKind
-from opentelemetry.sdk.resources import SERVICE_NAME, TELEMETRY_SDK_NAME, DEPLOYMENT_ENVIRONMENT
+from opentelemetry.trace import SpanKind
 from openlit.__helpers import (
-    get_chat_model_cost,
-    get_embed_model_cost,
-    general_tokens,
     handle_exception,
-
-
-
-
+    set_server_address_and_port
+)
+from openlit.instrumentation.litellm.utils import (
+    process_chunk,
+    process_streaming_chat_response,
+    process_chat_response,
+    process_embedding_response
 )
 from openlit.semcov import SemanticConvention

-
-
-
-def completion(version, environment, application_name,
-               tracer, pricing_info, capture_message_content, metrics, disable_metrics):
+def completion(version, environment, application_name, tracer, pricing_info,
+    capture_message_content, metrics, disable_metrics):
     """
-    Generates a telemetry wrapper for
-
-    Args:
-        version: Version of the monitoring package.
-        environment: Deployment environment (e.g., production, staging).
-        application_name: Name of the application using the LiteLLM SDK.
-        tracer: OpenTelemetry tracer for creating spans.
-        pricing_info: Information used for calculating the cost of LiteLLM usage.
-        capture_message_content: Flag indicating whether to trace the actual content.
-
-    Returns:
-        A function that wraps the chat completions method to add telemetry.
+    Generates a telemetry wrapper for GenAI function call
     """

     class TracedSyncStream:
         """
-        Wrapper for streaming responses to collect
-
-        This class implements the '__aiter__' and '__anext__' methods that
-        handle asynchronous streaming responses.
-
-        This class also implements '__aenter__' and '__aexit__' methods that
-        handle asynchronous context management protocol.
+        Wrapper for streaming responses to collect telemetry.
         """
+
         def __init__(
             self,
             wrapped,
             span,
+            span_name,
             kwargs,
             server_address,
             server_port,
@@ -59,12 +39,15 @@ def completion(version, environment, application_name,
         ):
             self.__wrapped__ = wrapped
             self._span = span
-            self.
-            self.
-            self.
-            self.
-            self.
-
+            self._span_name = span_name
+            self._llmresponse = ""
+            self._response_id = ""
+            self._response_model = ""
+            self._finish_reason = ""
+            self._response_service_tier = ""
+            self._tools = None
+            self._input_tokens = 0
+            self._output_tokens = 0
             self._args = args
             self._kwargs = kwargs
             self._start_time = time.time()
@@ -92,501 +75,114 @@ def completion(version, environment, application_name,
         def __next__(self):
             try:
                 chunk = self.__wrapped__.__next__()
-
-                # Record the timestamp for the current chunk
-                self._timestamps.append(end_time)
-
-                if len(self._timestamps) == 1:
-                    # Calculate time to first chunk
-                    self._ttft = calculate_ttft(self._timestamps, self._start_time)
-
-                chunked = response_as_dict(chunk)
-                # Collect message IDs and aggregated response from events
-                if (len(chunked.get('choices')) > 0 and ('delta' in chunked.get('choices')[0] and
-                        'content' in chunked.get('choices')[0].get('delta'))):
-
-                    content = chunked.get('choices')[0].get('delta').get('content')
-                    if content:
-                        self._llmresponse += content
-                self._response_id = chunked.get('id')
-                self._response_model = chunked.get('model')
-                self._finish_reason = chunked.get('choices')[0].get('finish_reason')
-                self._response_service_tier = str(chunked.get('system_fingerprint'))
+                process_chunk(self, chunk)
                 return chunk
             except StopIteration:
-                # Handling exception ensure observability without disrupting operation
                 try:
-                    self.
-
-
-
-
-
-
-
-
-
-
-                    if isinstance(content, list):
-                        content_str = ", ".join(
-                            f'{item["type"]}: {item["text"] if "text" in item else item["image_url"]}'
-                            if "type" in item else f'text: {item["text"]}'
-                            for item in content
-                        )
-                        formatted_messages.append(f'{role}: {content_str}')
-                    else:
-                        formatted_messages.append(f'{role}: {content}')
-                    prompt = '\n'.join(formatted_messages)
-
-                    request_model = self._kwargs.get('model', 'openai/gpt-4o')
-
-                    # Calculate tokens using input prompt and aggregated response
-                    input_tokens = general_tokens(prompt)
-                    output_tokens = general_tokens(self._llmresponse)
-
-                    # Calculate cost of the operation
-                    cost = get_chat_model_cost(request_model,
-                                               pricing_info, input_tokens,
-                                               output_tokens)
-
-                    # Set Span attributes (OTel Semconv)
-                    self._span.set_attribute(TELEMETRY_SDK_NAME, 'openlit')
-                    self._span.set_attribute(SemanticConvention.GEN_AI_OPERATION,
-                                             SemanticConvention.GEN_AI_OPERATION_TYPE_CHAT)
-                    self._span.set_attribute(SemanticConvention.GEN_AI_SYSTEM,
-                                             SemanticConvention.GEN_AI_SYSTEM_LITELLM)
-                    self._span.set_attribute(SemanticConvention.GEN_AI_REQUEST_MODEL,
-                                             request_model)
-                    self._span.set_attribute(SemanticConvention.GEN_AI_REQUEST_SEED,
-                                             self._kwargs.get('seed', ''))
-                    self._span.set_attribute(SemanticConvention.SERVER_PORT,
-                                             self._server_port)
-                    self._span.set_attribute(SemanticConvention.GEN_AI_REQUEST_FREQUENCY_PENALTY,
-                                             self._kwargs.get('frequency_penalty', 0.0))
-                    self._span.set_attribute(SemanticConvention.GEN_AI_REQUEST_MAX_TOKENS,
-                                             self._kwargs.get('max_tokens', -1))
-                    self._span.set_attribute(SemanticConvention.GEN_AI_REQUEST_PRESENCE_PENALTY,
-                                             self._kwargs.get('presence_penalty', 0.0))
-                    self._span.set_attribute(SemanticConvention.GEN_AI_REQUEST_STOP_SEQUENCES,
-                                             self._kwargs.get('stop', []))
-                    self._span.set_attribute(SemanticConvention.GEN_AI_REQUEST_TEMPERATURE,
-                                             self._kwargs.get('temperature', 1.0))
-                    self._span.set_attribute(SemanticConvention.GEN_AI_REQUEST_TOP_P,
-                                             self._kwargs.get('top_p', 1.0))
-                    self._span.set_attribute(SemanticConvention.GEN_AI_RESPONSE_FINISH_REASON,
-                                             [self._finish_reason])
-                    self._span.set_attribute(SemanticConvention.GEN_AI_RESPONSE_ID,
-                                             self._response_id)
-                    self._span.set_attribute(SemanticConvention.GEN_AI_RESPONSE_MODEL,
-                                             self._response_model)
-                    self._span.set_attribute(SemanticConvention.GEN_AI_USAGE_INPUT_TOKENS,
-                                             input_tokens)
-                    self._span.set_attribute(SemanticConvention.GEN_AI_USAGE_OUTPUT_TOKENS,
-                                             output_tokens)
-                    self._span.set_attribute(SemanticConvention.SERVER_ADDRESS,
-                                             self._server_address)
-                    self._span.set_attribute(SemanticConvention.GEN_AI_REQUEST_SERVICE_TIER,
-                                             self._kwargs.get('service_tier', 'auto'))
-                    self._span.set_attribute(SemanticConvention.GEN_AI_RESPONSE_SERVICE_TIER,
-                                             self._response_service_tier)
-                    self._span.set_attribute(SemanticConvention.GEN_AI_RESPONSE_SYSTEM_FINGERPRINT,
-                                             self._response_service_tier)
-                    if isinstance(self._llmresponse, str):
-                        self._span.set_attribute(SemanticConvention.GEN_AI_OUTPUT_TYPE,
-                                                 'text')
-                    else:
-                        self._span.set_attribute(SemanticConvention.GEN_AI_OUTPUT_TYPE,
-                                                 'json')
-
-                    # Set Span attributes (Extra)
-                    self._span.set_attribute(DEPLOYMENT_ENVIRONMENT,
-                                             environment)
-                    self._span.set_attribute(SERVICE_NAME,
-                                             application_name)
-                    self._span.set_attribute(SemanticConvention.GEN_AI_REQUEST_USER,
-                                             self._kwargs.get('user', ''))
-                    self._span.set_attribute(SemanticConvention.GEN_AI_REQUEST_IS_STREAM,
-                                             True)
-                    self._span.set_attribute(SemanticConvention.GEN_AI_USAGE_TOTAL_TOKENS,
-                                             input_tokens + output_tokens)
-                    self._span.set_attribute(SemanticConvention.GEN_AI_USAGE_COST,
-                                             cost)
-                    self._span.set_attribute(SemanticConvention.GEN_AI_SERVER_TBT,
-                                             self._tbt)
-                    self._span.set_attribute(SemanticConvention.GEN_AI_SERVER_TTFT,
-                                             self._ttft)
-                    self._span.set_attribute(SemanticConvention.GEN_AI_SDK_VERSION,
-                                             version)
-                    if capture_message_content:
-                        self._span.add_event(
-                            name=SemanticConvention.GEN_AI_CONTENT_PROMPT_EVENT,
-                            attributes={
-                                SemanticConvention.GEN_AI_CONTENT_PROMPT: prompt,
-                            },
-                        )
-                        self._span.add_event(
-                            name=SemanticConvention.GEN_AI_CONTENT_COMPLETION_EVENT,
-                            attributes={
-                                SemanticConvention.GEN_AI_CONTENT_COMPLETION: self._llmresponse,
-                            },
+                    with tracer.start_as_current_span(self._span_name, kind=SpanKind.CLIENT) as self._span:
+                        process_streaming_chat_response(
+                            self,
+                            pricing_info=pricing_info,
+                            environment=environment,
+                            application_name=application_name,
+                            metrics=metrics,
+                            capture_message_content=capture_message_content,
+                            disable_metrics=disable_metrics,
+                            version=version
                         )
-                    self._span.set_status(Status(StatusCode.OK))
-
-                    if disable_metrics is False:
-                        attributes = create_metrics_attributes(
-                            service_name=application_name,
-                            deployment_environment=environment,
-                            operation=SemanticConvention.GEN_AI_OPERATION_TYPE_CHAT,
-                            system=SemanticConvention.GEN_AI_SYSTEM_LITELLM,
-                            request_model=request_model,
-                            server_address=self._server_address,
-                            server_port=self._server_port,
-                            response_model=self._response_model,
-                        )
-
-                        metrics['genai_client_usage_tokens'].record(
-                            input_tokens + output_tokens, attributes
-                        )
-                        metrics['genai_client_operation_duration'].record(
-                            self._end_time - self._start_time, attributes
-                        )
-                        metrics['genai_server_tbt'].record(
-                            self._tbt, attributes
-                        )
-                        metrics['genai_server_ttft'].record(
-                            self._ttft, attributes
-                        )
-                        metrics['genai_requests'].add(1, attributes)
-                        metrics['genai_completion_tokens'].add(output_tokens, attributes)
-                        metrics['genai_prompt_tokens'].add(input_tokens, attributes)
-                        metrics['genai_cost'].record(cost, attributes)

                 except Exception as e:
                     handle_exception(self._span, e)
-
-                finally:
-                    self._span.end()
+
                 raise

         def wrapper(wrapped, instance, args, kwargs):
             """
-            Wraps the
-
-            This collects metrics such as execution time, cost, and token usage, and handles errors
-            gracefully, adding details to the trace for observability.
-
-            Args:
-                wrapped: The original 'chat.completions' method to be wrapped.
-                instance: The instance of the class where the original method is defined.
-                args: Positional arguments for the 'chat.completions' method.
-                kwargs: Keyword arguments for the 'chat.completions' method.
-
-            Returns:
-                The response from the original 'chat.completions' method.
+            Wraps the GenAI function call.
             """
-
         # Check if streaming is enabled for the API call
-        streaming = kwargs.get(
-        server_address, server_port =
-        request_model = kwargs.get(
+        streaming = kwargs.get("stream", False)
+        server_address, server_port = set_server_address_and_port(instance, "NOT_FOUND", "NOT_FOUND")
+        request_model = kwargs.get("model", "openai/gpt-4o")

-        span_name = f
+        span_name = f"{SemanticConvention.GEN_AI_OPERATION_TYPE_CHAT} {request_model}"

-        # pylint: disable=no-else-return
        if streaming:
-            # Special handling for streaming response
+            # Special handling for streaming response
            awaited_wrapped = wrapped(*args, **kwargs)
            span = tracer.start_span(span_name, kind=SpanKind.CLIENT)
-
-            return TracedSyncStream(awaited_wrapped, span, kwargs, server_address, server_port)
-
-            # Handling for non-streaming responses
-            # Handling for non-streaming responses
+            return TracedSyncStream(awaited_wrapped, span, span_name, kwargs, server_address, server_port)
        else:
-
+            # Handling for non-streaming responses
+            with tracer.start_as_current_span(span_name, kind=SpanKind.CLIENT) as span:
                start_time = time.time()
                response = wrapped(*args, **kwargs)
-                end_time = time.time()
-
-                response_dict = response_as_dict(response)

                try:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                    prompt = '\n'.join(formatted_messages)
-
-                    input_tokens = response_dict.get('usage').get('prompt_tokens')
-                    output_tokens = response_dict.get('usage').get('completion_tokens')
-
-                    # Calculate cost of the operation
-                    cost = get_chat_model_cost(request_model,
-                                               pricing_info, input_tokens,
-                                               output_tokens)
-
-                    # Set base span attribues (OTel Semconv)
-                    span.set_attribute(TELEMETRY_SDK_NAME, 'openlit')
-                    span.set_attribute(SemanticConvention.GEN_AI_OPERATION,
-                                       SemanticConvention.GEN_AI_OPERATION_TYPE_CHAT)
-                    span.set_attribute(SemanticConvention.GEN_AI_SYSTEM,
-                                       SemanticConvention.GEN_AI_SYSTEM_LITELLM)
-                    span.set_attribute(SemanticConvention.GEN_AI_REQUEST_MODEL,
-                                       request_model)
-                    span.set_attribute(SemanticConvention.GEN_AI_REQUEST_SEED,
-                                       kwargs.get('seed', ''))
-                    span.set_attribute(SemanticConvention.SERVER_PORT,
-                                       server_port)
-                    span.set_attribute(SemanticConvention.GEN_AI_REQUEST_FREQUENCY_PENALTY,
-                                       kwargs.get('frequency_penalty', 0.0))
-                    span.set_attribute(SemanticConvention.GEN_AI_REQUEST_MAX_TOKENS,
-                                       kwargs.get('max_tokens', -1))
-                    span.set_attribute(SemanticConvention.GEN_AI_REQUEST_PRESENCE_PENALTY,
-                                       kwargs.get('presence_penalty', 0.0))
-                    span.set_attribute(SemanticConvention.GEN_AI_REQUEST_STOP_SEQUENCES,
-                                       kwargs.get('stop', []))
-                    span.set_attribute(SemanticConvention.GEN_AI_REQUEST_TEMPERATURE,
-                                       kwargs.get('temperature', 1.0))
-                    span.set_attribute(SemanticConvention.GEN_AI_REQUEST_TOP_P,
-                                       kwargs.get('top_p', 1.0))
-                    span.set_attribute(SemanticConvention.GEN_AI_RESPONSE_ID,
-                                       response_dict.get('id'))
-                    span.set_attribute(SemanticConvention.GEN_AI_RESPONSE_MODEL,
-                                       response_dict.get('model'))
-                    span.set_attribute(SemanticConvention.GEN_AI_USAGE_INPUT_TOKENS,
-                                       input_tokens)
-                    span.set_attribute(SemanticConvention.GEN_AI_USAGE_OUTPUT_TOKENS,
-                                       output_tokens)
-                    span.set_attribute(SemanticConvention.SERVER_ADDRESS,
-                                       server_address)
-                    span.set_attribute(SemanticConvention.GEN_AI_REQUEST_SERVICE_TIER,
-                                       kwargs.get('service_tier', 'auto'))
-                    span.set_attribute(SemanticConvention.GEN_AI_RESPONSE_SYSTEM_FINGERPRINT,
-                                       str(response_dict.get('system_fingerprint')))
-
-                    # Set base span attribues (Extras)
-                    span.set_attribute(DEPLOYMENT_ENVIRONMENT,
-                                       environment)
-                    span.set_attribute(SERVICE_NAME,
-                                       application_name)
-                    span.set_attribute(SemanticConvention.GEN_AI_REQUEST_USER,
-                                       kwargs.get('user', ''))
-                    span.set_attribute(SemanticConvention.GEN_AI_REQUEST_IS_STREAM,
-                                       False)
-                    span.set_attribute(SemanticConvention.GEN_AI_USAGE_TOTAL_TOKENS,
-                                       input_tokens + output_tokens)
-                    span.set_attribute(SemanticConvention.GEN_AI_USAGE_COST,
-                                       cost)
-                    span.set_attribute(SemanticConvention.GEN_AI_SERVER_TTFT,
-                                       end_time - start_time)
-                    span.set_attribute(SemanticConvention.GEN_AI_SDK_VERSION,
-                                       version)
-                    if capture_message_content:
-                        span.add_event(
-                            name=SemanticConvention.GEN_AI_CONTENT_PROMPT_EVENT,
-                            attributes={
-                                SemanticConvention.GEN_AI_CONTENT_PROMPT: prompt,
-                            },
-                        )
-
-                    for i in range(kwargs.get('n',1)):
-                        span.set_attribute(SemanticConvention.GEN_AI_RESPONSE_FINISH_REASON,
-                                           [response_dict.get('choices')[i].get('finish_reason')])
-                        if capture_message_content:
-                            span.add_event(
-                                name=SemanticConvention.GEN_AI_CONTENT_COMPLETION_EVENT,
-                                attributes={
-                                    # pylint: disable=line-too-long
-                                    SemanticConvention.GEN_AI_CONTENT_COMPLETION: str(response_dict.get('choices')[i].get('message').get('content')),
-                                },
-                            )
-                        if kwargs.get('tools'):
-                            span.set_attribute(SemanticConvention.GEN_AI_TOOL_CALLS,
-                                               str(response_dict.get('choices')[i].get('message').get('tool_calls')))
-
-                        if isinstance(response_dict.get('choices')[i].get('message').get('content'), str):
-                            span.set_attribute(SemanticConvention.GEN_AI_OUTPUT_TYPE,
-                                               'text')
-                        elif response_dict.get('choices')[i].get('message').get('content') is not None:
-                            span.set_attribute(SemanticConvention.GEN_AI_OUTPUT_TYPE,
-                                               'json')
-
-                    span.set_status(Status(StatusCode.OK))
-
-                    if disable_metrics is False:
-                        attributes = create_metrics_attributes(
-                            service_name=application_name,
-                            deployment_environment=environment,
-                            operation=SemanticConvention.GEN_AI_OPERATION_TYPE_CHAT,
-                            system=SemanticConvention.GEN_AI_SYSTEM_LITELLM,
-                            request_model=request_model,
-                            server_address=server_address,
-                            server_port=server_port,
-                            response_model=response_dict.get('model'),
-                        )
-
-                        metrics['genai_client_usage_tokens'].record(
-                            input_tokens + output_tokens, attributes
-                        )
-                        metrics['genai_client_operation_duration'].record(
-                            end_time - start_time, attributes
-                        )
-                        metrics['genai_server_ttft'].record(
-                            end_time - start_time, attributes
-                        )
-                        metrics['genai_requests'].add(1, attributes)
-                        metrics['genai_completion_tokens'].add(output_tokens, attributes)
-                        metrics['genai_prompt_tokens'].add(input_tokens, attributes)
-                        metrics['genai_cost'].record(cost, attributes)
-
-                    # Return original response
-                    return response
+                    response = process_chat_response(
+                        response=response,
+                        request_model=request_model,
+                        pricing_info=pricing_info,
+                        server_port=server_port,
+                        server_address=server_address,
+                        environment=environment,
+                        application_name=application_name,
+                        metrics=metrics,
+                        start_time=start_time,
+                        span=span,
+                        capture_message_content=capture_message_content,
+                        disable_metrics=disable_metrics,
+                        version=version,
+                        **kwargs
+                    )

                except Exception as e:
                    handle_exception(span, e)
-                    logger.error('Error in trace creation: %s', e)

-
-                return response
+                return response

    return wrapper

-def embedding(version, environment, application_name,
-
+def embedding(version, environment, application_name, tracer, pricing_info,
+    capture_message_content, metrics, disable_metrics):
    """
-    Generates a telemetry wrapper for
-
-    Args:
-        version: Version of the monitoring package.
-        environment: Deployment environment (e.g., production, staging).
-        application_name: Name of the application using the LiteLLM API.
-        tracer: OpenTelemetry tracer for creating spans.
-        pricing_info: Information used for calculating the cost of LiteLLM usage.
-        capture_message_content: Flag indicating whether to trace the actual content.
-
-    Returns:
-        A function that wraps the embeddings method to add telemetry.
+    Generates a telemetry wrapper for GenAI embedding function call
    """

    def wrapper(wrapped, instance, args, kwargs):
        """
-        Wraps the
-
-        This collects metrics such as execution time, cost, and token usage, and handles errors
-        gracefully, adding details to the trace for observability.
-
-        Args:
-            wrapped: The original 'embeddings' method to be wrapped.
-            instance: The instance of the class where the original method is defined.
-            args: Positional arguments for the 'embeddings' method.
-            kwargs: Keyword arguments for the 'embeddings' method.
-
-        Returns:
-            The response from the original 'embeddings' method.
+        Wraps the GenAI embedding function call.
        """
+        server_address, server_port = set_server_address_and_port(instance, "NOT_FOUND", "NOT_FOUND")
+        request_model = kwargs.get("model", "text-embedding-ada-002")

-
-        request_model = kwargs.get('model', 'text-embedding-ada-002')
-
-        span_name = f'{SemanticConvention.GEN_AI_OPERATION_TYPE_EMBEDDING} {request_model}'
+        span_name = f"{SemanticConvention.GEN_AI_OPERATION_TYPE_EMBEDDING} {request_model}"

-        with tracer.start_as_current_span(span_name, kind=
+        with tracer.start_as_current_span(span_name, kind=SpanKind.CLIENT) as span:
            start_time = time.time()
            response = wrapped(*args, **kwargs)
-            end_time = time.time()

-            response_dict = response_as_dict(response)
            try:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                span.set_attribute(SemanticConvention.GEN_AI_RESPONSE_MODEL,
-                                   response_dict.get('model'))
-                span.set_attribute(SemanticConvention.SERVER_ADDRESS,
-                                   server_address)
-                span.set_attribute(SemanticConvention.SERVER_PORT,
-                                   server_port)
-                span.set_attribute(SemanticConvention.GEN_AI_USAGE_INPUT_TOKENS,
-                                   input_tokens)
-
-                # Set Span attributes (Extras)
-                span.set_attribute(DEPLOYMENT_ENVIRONMENT,
-                                   environment)
-                span.set_attribute(SERVICE_NAME,
-                                   application_name)
-                span.set_attribute(SemanticConvention.GEN_AI_REQUEST_USER,
-                                   kwargs.get('user', ''))
-                span.set_attribute(SemanticConvention.GEN_AI_USAGE_TOTAL_TOKENS,
-                                   input_tokens)
-                span.set_attribute(SemanticConvention.GEN_AI_USAGE_COST,
-                                   cost)
-                span.set_attribute(SemanticConvention.GEN_AI_SDK_VERSION,
-                                   version)
-
-                if capture_message_content:
-                    span.add_event(
-                        name=SemanticConvention.GEN_AI_CONTENT_PROMPT_EVENT,
-                        attributes={
-                            SemanticConvention.GEN_AI_CONTENT_PROMPT: str(kwargs.get('input', '')),
-                        },
-                    )
-
-                span.set_status(Status(StatusCode.OK))
-
-                if disable_metrics is False:
-                    attributes = create_metrics_attributes(
-                        service_name=application_name,
-                        deployment_environment=environment,
-                        operation=SemanticConvention.GEN_AI_OPERATION_TYPE_EMBEDDING,
-                        system=SemanticConvention.GEN_AI_SYSTEM_LITELLM,
-                        request_model=request_model,
-                        server_address=server_address,
-                        server_port=server_port,
-                        response_model=response_dict.get('model'),
-                    )
-                    metrics['genai_client_usage_tokens'].record(
-                        input_tokens, attributes
-                    )
-                    metrics['genai_client_operation_duration'].record(
-                        end_time - start_time, attributes
-                    )
-                    metrics['genai_requests'].add(1, attributes)
-                    metrics['genai_prompt_tokens'].add(input_tokens, attributes)
-                    metrics['genai_cost'].record(cost, attributes)
-
-                # Return original response
-                return response
+                response = process_embedding_response(
+                    response=response,
+                    request_model=request_model,
+                    pricing_info=pricing_info,
+                    server_port=server_port,
+                    server_address=server_address,
+                    environment=environment,
+                    application_name=application_name,
+                    metrics=metrics,
+                    start_time=start_time,
+                    span=span,
+                    capture_message_content=capture_message_content,
+                    disable_metrics=disable_metrics,
+                    version=version,
+                    **kwargs
+                )

            except Exception as e:
                handle_exception(span, e)
-                logger.error('Error in trace creation: %s', e)

-
-            return response
+            return response

    return wrapper
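The streaming path deserves a note: per-chunk aggregation and final span population are now decoupled. `__next__` only calls `process_chunk()` as chunks arrive; when the stream raises `StopIteration`, the accumulated state is flushed inside a fresh `tracer.start_as_current_span(self._span_name, ...)` block, which replaces the old `finally: self._span.end()` bookkeeping. The `logging` import and the `logger.error('Error in trace creation: %s', e)` calls are dropped as well; errors are still recorded on the span through `handle_exception()`. None of this changes the caller-facing API, so instrumented usage is identical across the two versions:

```python
# Usage sketch, unchanged across 1.34.13 -> 1.34.15 (assumes litellm is
# installed and an OpenTelemetry endpoint is configured for openlit).
import openlit
import litellm

openlit.init(application_name="demo-app", environment="dev")

# completion() is wrapped by the instrumentation shown above; spans and
# metrics are emitted without any change to this calling code.
response = litellm.completion(
    model="openai/gpt-4o",
    messages=[{"role": "user", "content": "Hello"}],
)
print(response.choices[0].message.content)
```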