openlit 1.33.9__py3-none-any.whl → 1.33.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openlit/__helpers.py +5 -0
- openlit/__init__.py +3 -2
- openlit/instrumentation/ag2/ag2.py +3 -3
- openlit/instrumentation/ai21/ai21.py +1 -1
- openlit/instrumentation/ai21/async_ai21.py +1 -1
- openlit/instrumentation/anthropic/anthropic.py +1 -1
- openlit/instrumentation/anthropic/async_anthropic.py +1 -1
- openlit/instrumentation/astra/astra.py +5 -5
- openlit/instrumentation/astra/async_astra.py +5 -5
- openlit/instrumentation/azure_ai_inference/async_azure_ai_inference.py +3 -3
- openlit/instrumentation/azure_ai_inference/azure_ai_inference.py +3 -3
- openlit/instrumentation/chroma/chroma.py +5 -5
- openlit/instrumentation/cohere/async_cohere.py +1 -1
- openlit/instrumentation/cohere/cohere.py +2 -2
- openlit/instrumentation/controlflow/controlflow.py +3 -3
- openlit/instrumentation/crawl4ai/async_crawl4ai.py +3 -3
- openlit/instrumentation/crawl4ai/crawl4ai.py +3 -3
- openlit/instrumentation/crewai/crewai.py +4 -2
- openlit/instrumentation/dynamiq/dynamiq.py +3 -3
- openlit/instrumentation/elevenlabs/async_elevenlabs.py +1 -2
- openlit/instrumentation/elevenlabs/elevenlabs.py +1 -2
- openlit/instrumentation/embedchain/embedchain.py +5 -5
- openlit/instrumentation/firecrawl/firecrawl.py +3 -3
- openlit/instrumentation/gpt4all/__init__.py +2 -2
- openlit/instrumentation/gpt4all/gpt4all.py +345 -220
- openlit/instrumentation/gpu/__init__.py +5 -5
- openlit/instrumentation/groq/__init__.py +2 -2
- openlit/instrumentation/groq/async_groq.py +356 -240
- openlit/instrumentation/groq/groq.py +356 -240
- openlit/instrumentation/haystack/haystack.py +3 -3
- openlit/instrumentation/julep/async_julep.py +3 -3
- openlit/instrumentation/julep/julep.py +3 -3
- openlit/instrumentation/langchain/__init__.py +13 -7
- openlit/instrumentation/langchain/async_langchain.py +384 -0
- openlit/instrumentation/langchain/langchain.py +98 -490
- openlit/instrumentation/letta/letta.py +5 -3
- openlit/instrumentation/litellm/__init__.py +4 -5
- openlit/instrumentation/litellm/async_litellm.py +316 -245
- openlit/instrumentation/litellm/litellm.py +312 -241
- openlit/instrumentation/llamaindex/llamaindex.py +3 -3
- openlit/instrumentation/mem0/mem0.py +3 -3
- openlit/instrumentation/milvus/milvus.py +5 -5
- openlit/instrumentation/mistral/__init__.py +6 -6
- openlit/instrumentation/mistral/async_mistral.py +421 -248
- openlit/instrumentation/mistral/mistral.py +418 -244
- openlit/instrumentation/multion/async_multion.py +4 -2
- openlit/instrumentation/multion/multion.py +4 -2
- openlit/instrumentation/ollama/__init__.py +8 -30
- openlit/instrumentation/ollama/async_ollama.py +385 -417
- openlit/instrumentation/ollama/ollama.py +384 -417
- openlit/instrumentation/openai/async_openai.py +7 -9
- openlit/instrumentation/openai/openai.py +7 -9
- openlit/instrumentation/phidata/phidata.py +4 -2
- openlit/instrumentation/pinecone/pinecone.py +5 -5
- openlit/instrumentation/premai/__init__.py +2 -2
- openlit/instrumentation/premai/premai.py +262 -213
- openlit/instrumentation/qdrant/async_qdrant.py +5 -5
- openlit/instrumentation/qdrant/qdrant.py +5 -5
- openlit/instrumentation/reka/__init__.py +2 -2
- openlit/instrumentation/reka/async_reka.py +90 -52
- openlit/instrumentation/reka/reka.py +90 -52
- openlit/instrumentation/together/__init__.py +4 -4
- openlit/instrumentation/together/async_together.py +278 -236
- openlit/instrumentation/together/together.py +278 -236
- openlit/instrumentation/transformers/__init__.py +1 -1
- openlit/instrumentation/transformers/transformers.py +75 -44
- openlit/instrumentation/vertexai/__init__.py +14 -64
- openlit/instrumentation/vertexai/async_vertexai.py +329 -986
- openlit/instrumentation/vertexai/vertexai.py +329 -986
- openlit/instrumentation/vllm/__init__.py +1 -1
- openlit/instrumentation/vllm/vllm.py +62 -32
- openlit/semcov/__init__.py +3 -3
- {openlit-1.33.9.dist-info → openlit-1.33.10.dist-info}/METADATA +1 -1
- openlit-1.33.10.dist-info/RECORD +122 -0
- openlit-1.33.9.dist-info/RECORD +0 -121
- {openlit-1.33.9.dist-info → openlit-1.33.10.dist-info}/LICENSE +0 -0
- {openlit-1.33.9.dist-info → openlit-1.33.10.dist-info}/WHEEL +0 -0
openlit/instrumentation/litellm/async_litellm.py

```diff
@@ -1,30 +1,32 @@
-# pylint: disable=duplicate-code, broad-exception-caught, too-many-statements, unused-argument, too-many-branches
 """
 Module for monitoring LiteLLM calls.
 """
 
 import logging
+import time
 from opentelemetry.trace import SpanKind, Status, StatusCode
-from opentelemetry.sdk.resources import TELEMETRY_SDK_NAME
+from opentelemetry.sdk.resources import SERVICE_NAME, TELEMETRY_SDK_NAME, DEPLOYMENT_ENVIRONMENT
 from openlit.__helpers import (
 get_chat_model_cost,
 get_embed_model_cost,
-
+general_tokens,
 handle_exception,
 response_as_dict,
+calculate_ttft,
+calculate_tbt,
+create_metrics_attributes,
 )
 from openlit.semcov import SemanticConvetion
 
 # Initialize logger for logging potential issues and operations
 logger = logging.getLogger(__name__)
 
-def acompletion(
-
+def acompletion(version, environment, application_name,
+tracer, pricing_info, trace_content, metrics, disable_metrics):
 """
 Generates a telemetry wrapper for chat completions to collect metrics.
 
 Args:
-gen_ai_endpoint: Endpoint identifier for logging and tracing.
 version: Version of the monitoring package.
 environment: Deployment environment (e.g., production, staging).
 application_name: Name of the application using the LiteLLM SDK.
```
```diff
@@ -51,16 +53,27 @@ def acompletion(gen_ai_endpoint, version, environment, application_name,
 wrapped,
 span,
 kwargs,
+server_address,
+server_port,
 **args,
 ):
 self.__wrapped__ = wrapped
 self._span = span
-
-self.
-self.
+self._llmresponse = ''
+self._response_id = ''
+self._response_model = ''
+self._finish_reason = ''
+self._response_service_tier = ''
 
 self._args = args
 self._kwargs = kwargs
+self._start_time = time.time()
+self._end_time = None
+self._timestamps = []
+self._ttft = 0
+self._tbt = 0
+self._server_address = server_address
+self._server_port = server_port
 
 async def __aenter__(self):
 await self.__wrapped__.__aenter__()
```
```diff
@@ -79,6 +92,14 @@ def acompletion(gen_ai_endpoint, version, environment, application_name,
 async def __anext__(self):
 try:
 chunk = await self.__wrapped__.__anext__()
+end_time = time.time()
+# Record the timestamp for the current chunk
+self._timestamps.append(end_time)
+
+if len(self._timestamps) == 1:
+# Calculate time to first chunk
+self._ttft = calculate_ttft(self._timestamps, self._start_time)
+
 chunked = response_as_dict(chunk)
 # Collect message IDs and aggregated response from events
 if (len(chunked.get('choices')) > 0 and ('delta' in chunked.get('choices')[0] and
```
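The streaming wrapper now timestamps every chunk in `__anext__` and derives time-to-first-token from the first entry; time-between-tokens is computed later from the full list. The helpers `calculate_ttft` and `calculate_tbt` live in `openlit/__helpers.py` and are not shown in this diff, so the sketch below is only a plausible reading of how they are used here (first-chunk latency and average inter-chunk gap), not their actual implementation.

```python
# Illustrative stand-ins for calculate_ttft / calculate_tbt as used above;
# the real helpers in openlit/__helpers.py may differ.
from typing import List

def calculate_ttft(timestamps: List[float], start_time: float) -> float:
    # Time to first token: gap between request start and the first streamed chunk.
    return timestamps[0] - start_time if timestamps else 0.0

def calculate_tbt(timestamps: List[float]) -> float:
    # Time between tokens: average gap between consecutive streamed chunks.
    gaps = [b - a for a, b in zip(timestamps, timestamps[1:])]
    return sum(gaps) / len(gaps) if gaps else 0.0
```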
```diff
@@ -88,80 +109,114 @@ def acompletion(gen_ai_endpoint, version, environment, application_name,
 if content:
 self._llmresponse += content
 self._response_id = chunked.get('id')
+self._response_model = chunked.get('model')
+self._finish_reason = chunked.get('choices')[0].get('finish_reason')
+self._response_service_tier = str(chunked.get('system_fingerprint'))
 return chunk
 except StopAsyncIteration:
 # Handling exception ensure observability without disrupting operation
 try:
+self._end_time = time.time()
+if len(self._timestamps) > 1:
+self._tbt = calculate_tbt(self._timestamps)
+
 # Format 'messages' into a single string
-message_prompt = self._kwargs.get(
+message_prompt = self._kwargs.get('messages', '')
 formatted_messages = []
 for message in message_prompt:
-role = message[
-content = message[
+role = message['role']
+content = message['content']
 
 if isinstance(content, list):
 content_str = ", ".join(
-# pylint: disable=line-too-long
 f'{item["type"]}: {item["text"] if "text" in item else item["image_url"]}'
 if "type" in item else f'text: {item["text"]}'
 for item in content
 )
-formatted_messages.append(f
+formatted_messages.append(f'{role}: {content_str}')
 else:
-formatted_messages.append(f
-prompt =
+formatted_messages.append(f'{role}: {content}')
+prompt = '\n'.join(formatted_messages)
+
+request_model = self._kwargs.get('model', 'openai/gpt-4o')
 
 # Calculate tokens using input prompt and aggregated response
-
-
-completion_tokens = openai_tokens(self._llmresponse,
-self._kwargs.get("model", "gpt-3.5-turbo"))
+input_tokens = general_tokens(prompt)
+output_tokens = general_tokens(self._llmresponse)
 
 # Calculate cost of the operation
-cost = get_chat_model_cost(
-pricing_info,
-
+cost = get_chat_model_cost(request_model,
+pricing_info, input_tokens,
+output_tokens)
 
-# Set Span attributes
-self._span.set_attribute(TELEMETRY_SDK_NAME,
-self._span.set_attribute(SemanticConvetion.GEN_AI_SYSTEM,
-SemanticConvetion.GEN_AI_SYSTEM_LITELLM)
+# Set Span attributes (OTel Semconv)
+self._span.set_attribute(TELEMETRY_SDK_NAME, 'openlit')
 self._span.set_attribute(SemanticConvetion.GEN_AI_OPERATION,
 SemanticConvetion.GEN_AI_OPERATION_TYPE_CHAT)
-self._span.set_attribute(SemanticConvetion.
-
+self._span.set_attribute(SemanticConvetion.GEN_AI_SYSTEM,
+SemanticConvetion.GEN_AI_SYSTEM_LITELLM)
+self._span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_MODEL,
+request_model)
+self._span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_SEED,
+self._kwargs.get('seed', ''))
+self._span.set_attribute(SemanticConvetion.SERVER_PORT,
+self._server_port)
+self._span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_FREQUENCY_PENALTY,
+self._kwargs.get('frequency_penalty', 0.0))
+self._span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_MAX_TOKENS,
+self._kwargs.get('max_tokens', -1))
+self._span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_PRESENCE_PENALTY,
+self._kwargs.get('presence_penalty', 0.0))
+self._span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_STOP_SEQUENCES,
+self._kwargs.get('stop', []))
+self._span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_TEMPERATURE,
+self._kwargs.get('temperature', 1.0))
+self._span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_TOP_P,
+self._kwargs.get('top_p', 1.0))
+self._span.set_attribute(SemanticConvetion.GEN_AI_RESPONSE_FINISH_REASON,
+[self._finish_reason])
 self._span.set_attribute(SemanticConvetion.GEN_AI_RESPONSE_ID,
 self._response_id)
-self._span.set_attribute(SemanticConvetion.
+self._span.set_attribute(SemanticConvetion.GEN_AI_RESPONSE_MODEL,
+self._response_model)
+self._span.set_attribute(SemanticConvetion.GEN_AI_USAGE_INPUT_TOKENS,
+input_tokens)
+self._span.set_attribute(SemanticConvetion.GEN_AI_USAGE_OUTPUT_TOKENS,
+output_tokens)
+self._span.set_attribute(SemanticConvetion.SERVER_ADDRESS,
+self._server_address)
+self._span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_SERVICE_TIER,
+self._kwargs.get('service_tier', 'auto'))
+self._span.set_attribute(SemanticConvetion.GEN_AI_RESPONSE_SERVICE_TIER,
+self._response_service_tier)
+self._span.set_attribute(SemanticConvetion.GEN_AI_RESPONSE_SYSTEM_FINGERPRINT,
+self._response_service_tier)
+if isinstance(self._llmresponse, str):
+self._span.set_attribute(SemanticConvetion.GEN_AI_OUTPUT_TYPE,
+'text')
+else:
+self._span.set_attribute(SemanticConvetion.GEN_AI_OUTPUT_TYPE,
+'json')
+
+# Set Span attributes (Extra)
+self._span.set_attribute(DEPLOYMENT_ENVIRONMENT,
 environment)
-self._span.set_attribute(
+self._span.set_attribute(SERVICE_NAME,
 application_name)
-self._span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_MODEL,
-self._kwargs.get("model", "gpt-3.5-turbo"))
 self._span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_USER,
-self._kwargs.get(
-self._span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_TOP_P,
-self._kwargs.get("top_p", 1.0))
-self._span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_MAX_TOKENS,
-self._kwargs.get("max_tokens", -1))
-self._span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_TEMPERATURE,
-self._kwargs.get("temperature", 1.0))
-self._span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_PRESENCE_PENALTY,
-self._kwargs.get("presence_penalty", 0.0))
-self._span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_FREQUENCY_PENALTY,
-self._kwargs.get("frequency_penalty", 0.0))
-self._span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_SEED,
-self._kwargs.get("seed", ""))
+self._kwargs.get('user', ''))
 self._span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_IS_STREAM,
 True)
-self._span.set_attribute(SemanticConvetion.GEN_AI_USAGE_INPUT_TOKENS,
-prompt_tokens)
-self._span.set_attribute(SemanticConvetion.GEN_AI_USAGE_OUTPUT_TOKENS,
-completion_tokens)
 self._span.set_attribute(SemanticConvetion.GEN_AI_USAGE_TOTAL_TOKENS,
-
+input_tokens + output_tokens)
 self._span.set_attribute(SemanticConvetion.GEN_AI_USAGE_COST,
 cost)
+self._span.set_attribute(SemanticConvetion.GEN_AI_SERVER_TBT,
+self._tbt)
+self._span.set_attribute(SemanticConvetion.GEN_AI_SERVER_TTFT,
+self._ttft)
+self._span.set_attribute(SemanticConvetion.GEN_AI_SDK_VERSION,
+version)
 if trace_content:
 self._span.add_event(
 name=SemanticConvetion.GEN_AI_CONTENT_PROMPT_EVENT,
```
```diff
@@ -175,36 +230,40 @@ def acompletion(gen_ai_endpoint, version, environment, application_name,
 SemanticConvetion.GEN_AI_CONTENT_COMPLETION: self._llmresponse,
 },
 )
-
 self._span.set_status(Status(StatusCode.OK))
 
 if disable_metrics is False:
-attributes =
-
-
-SemanticConvetion.
-
-
-
-
-
-SemanticConvetion.GEN_AI_OPERATION:
-SemanticConvetion.GEN_AI_OPERATION_TYPE_CHAT,
-SemanticConvetion.GEN_AI_REQUEST_MODEL:
-self._kwargs.get("model", "gpt-3.5-turbo")
-}
-
-metrics["genai_requests"].add(1, attributes)
-metrics["genai_total_tokens"].add(
-prompt_tokens + completion_tokens, attributes
+attributes = create_metrics_attributes(
+service_name=application_name,
+deployment_environment=environment,
+operation=SemanticConvetion.GEN_AI_OPERATION_TYPE_CHAT,
+system=SemanticConvetion.GEN_AI_SYSTEM_LITELLM,
+request_model=request_model,
+server_address=self._server_address,
+server_port=self._server_port,
+response_model=self._response_model,
 )
-
-metrics[
-
+
+metrics['genai_client_usage_tokens'].record(
+input_tokens + output_tokens, attributes
+)
+metrics['genai_client_operation_duration'].record(
+self._end_time - self._start_time, attributes
+)
+metrics['genai_server_tbt'].record(
+self._tbt, attributes
+)
+metrics['genai_server_ttft'].record(
+self._ttft, attributes
+)
+metrics['genai_requests'].add(1, attributes)
+metrics['genai_completion_tokens'].add(output_tokens, attributes)
+metrics['genai_prompt_tokens'].add(input_tokens, attributes)
+metrics['genai_cost'].record(cost, attributes)
 
 except Exception as e:
 handle_exception(self._span, e)
-logger.error(
+logger.error('Error in trace creation: %s', e)
 finally:
 self._span.end()
 raise
```
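The hunk above replaces the hand-built metrics attribute dictionary with a single `create_metrics_attributes(...)` call and records the new TTFT/TBT histograms alongside the existing counters. The helper is defined in `openlit/__helpers.py` (also touched in this release) and its body is not part of this diff; a rough sketch of what such a factory might return, assuming OTel GenAI-style attribute keys, is:

```python
# Rough sketch based only on the keyword arguments visible in this diff;
# the actual helper in openlit/__helpers.py may use different key names.
from opentelemetry.sdk.resources import SERVICE_NAME, TELEMETRY_SDK_NAME, DEPLOYMENT_ENVIRONMENT

def create_metrics_attributes(service_name, deployment_environment, operation,
                              system, request_model, server_address, server_port,
                              response_model):
    return {
        TELEMETRY_SDK_NAME: 'openlit',
        SERVICE_NAME: service_name,
        DEPLOYMENT_ENVIRONMENT: deployment_environment,
        'gen_ai.operation.name': operation,      # illustrative key names
        'gen_ai.system': system,
        'gen_ai.request.model': request_model,
        'gen_ai.response.model': response_model,
        'server.address': server_address,
        'server.port': server_port,
    }
```

Recording the same dimension set on every counter and histogram keeps `genai_requests`, `genai_client_usage_tokens`, and the duration histograms aggregable by model, system, and deployment.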
```diff
@@ -227,76 +286,113 @@ def acompletion(gen_ai_endpoint, version, environment, application_name,
 """
 
 # Check if streaming is enabled for the API call
-streaming = kwargs.get(
+streaming = kwargs.get('stream', False)
+server_address, server_port = 'NOT_FOUND', 'NOT_FOUND'
+request_model = kwargs.get('model', 'openai/gpt-4o')
+
+span_name = f'{SemanticConvetion.GEN_AI_OPERATION_TYPE_CHAT} {request_model}'
 
 # pylint: disable=no-else-return
 if streaming:
 # Special handling for streaming response to accommodate the nature of data flow
 awaited_wrapped = await wrapped(*args, **kwargs)
-span = tracer.start_span(
+span = tracer.start_span(span_name, kind=SpanKind.CLIENT)
 
-return TracedAsyncStream(awaited_wrapped, span, kwargs)
+return TracedAsyncStream(awaited_wrapped, span, kwargs, server_address, server_port)
 
+# Handling for non-streaming responses
 # Handling for non-streaming responses
 else:
-
-
+with tracer.start_as_current_span(span_name, kind= SpanKind.CLIENT) as span:
+start_time = time.time()
 response = await wrapped(*args, **kwargs)
+end_time = time.time()
 
 response_dict = response_as_dict(response)
 
 try:
 # Format 'messages' into a single string
-message_prompt = kwargs.get(
+message_prompt = kwargs.get('messages', '')
 formatted_messages = []
 for message in message_prompt:
-role = message[
-content = message[
+role = message['role']
+content = message['content']
 
 if isinstance(content, list):
 content_str = ", ".join(
-# pylint: disable=line-too-long
 f'{item["type"]}: {item["text"] if "text" in item else item["image_url"]}'
 if "type" in item else f'text: {item["text"]}'
 for item in content
 )
-formatted_messages.append(f
+formatted_messages.append(f'{role}: {content_str}')
 else:
-formatted_messages.append(f
-prompt =
+formatted_messages.append(f'{role}: {content}')
+prompt = '\n'.join(formatted_messages)
 
-
-
-
-
+input_tokens = response_dict.get('usage').get('prompt_tokens')
+output_tokens = response_dict.get('usage').get('completion_tokens')
+
+# Calculate cost of the operation
+cost = get_chat_model_cost(request_model,
+pricing_info, input_tokens,
+output_tokens)
+
+# Set base span attribues (OTel Semconv)
+span.set_attribute(TELEMETRY_SDK_NAME, 'openlit')
 span.set_attribute(SemanticConvetion.GEN_AI_OPERATION,
 SemanticConvetion.GEN_AI_OPERATION_TYPE_CHAT)
-span.set_attribute(SemanticConvetion.
-
+span.set_attribute(SemanticConvetion.GEN_AI_SYSTEM,
+SemanticConvetion.GEN_AI_SYSTEM_LITELLM)
+span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_MODEL,
+request_model)
+span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_SEED,
+kwargs.get('seed', ''))
+span.set_attribute(SemanticConvetion.SERVER_PORT,
+server_port)
+span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_FREQUENCY_PENALTY,
+kwargs.get('frequency_penalty', 0.0))
+span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_MAX_TOKENS,
+kwargs.get('max_tokens', -1))
+span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_PRESENCE_PENALTY,
+kwargs.get('presence_penalty', 0.0))
+span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_STOP_SEQUENCES,
+kwargs.get('stop', []))
+span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_TEMPERATURE,
+kwargs.get('temperature', 1.0))
+span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_TOP_P,
+kwargs.get('top_p', 1.0))
 span.set_attribute(SemanticConvetion.GEN_AI_RESPONSE_ID,
-response_dict.get(
-span.set_attribute(SemanticConvetion.
+response_dict.get('id'))
+span.set_attribute(SemanticConvetion.GEN_AI_RESPONSE_MODEL,
+response_dict.get('model'))
+span.set_attribute(SemanticConvetion.GEN_AI_USAGE_INPUT_TOKENS,
+input_tokens)
+span.set_attribute(SemanticConvetion.GEN_AI_USAGE_OUTPUT_TOKENS,
+output_tokens)
+span.set_attribute(SemanticConvetion.SERVER_ADDRESS,
+server_address)
+span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_SERVICE_TIER,
+kwargs.get('service_tier', 'auto'))
+span.set_attribute(SemanticConvetion.GEN_AI_RESPONSE_SYSTEM_FINGERPRINT,
+str(response_dict.get('system_fingerprint')))
+
+# Set base span attribues (Extras)
+span.set_attribute(DEPLOYMENT_ENVIRONMENT,
 environment)
-span.set_attribute(
+span.set_attribute(SERVICE_NAME,
 application_name)
-span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_MODEL,
-kwargs.get("model", "gpt-3.5-turbo"))
-span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_TOP_P,
-kwargs.get("top_p", 1.0))
-span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_MAX_TOKENS,
-kwargs.get("max_tokens", -1))
 span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_USER,
-kwargs.get(
-span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_TEMPERATURE,
-kwargs.get("temperature", 1.0))
-span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_PRESENCE_PENALTY,
-kwargs.get("presence_penalty", 0.0))
-span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_FREQUENCY_PENALTY,
-kwargs.get("frequency_penalty", 0.0))
-span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_SEED,
-kwargs.get("seed", ""))
+kwargs.get('user', ''))
 span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_IS_STREAM,
 False)
+span.set_attribute(SemanticConvetion.GEN_AI_USAGE_TOTAL_TOKENS,
+input_tokens + output_tokens)
+span.set_attribute(SemanticConvetion.GEN_AI_USAGE_COST,
+cost)
+span.set_attribute(SemanticConvetion.GEN_AI_SERVER_TTFT,
+end_time - start_time)
+span.set_attribute(SemanticConvetion.GEN_AI_SDK_VERSION,
+version)
 if trace_content:
 span.add_event(
 name=SemanticConvetion.GEN_AI_CONTENT_PROMPT_EVENT,
```
```diff
@@ -305,121 +401,81 @@ def acompletion(gen_ai_endpoint, version, environment, application_name,
 },
 )
 
-
-if "tools" not in kwargs:
-# Calculate cost of the operation
-cost = get_chat_model_cost(kwargs.get("model", "gpt-3.5-turbo"),
-pricing_info, response_dict.get('usage', {}).get('prompt_tokens', None),
-response_dict.get('usage', {}).get('completion_tokens', None))
-
-span.set_attribute(SemanticConvetion.GEN_AI_USAGE_INPUT_TOKENS,
-response_dict.get('usage', {}).get('prompt_tokens', None))
-span.set_attribute(SemanticConvetion.GEN_AI_USAGE_OUTPUT_TOKENS,
-response_dict.get('usage', {}).get('completion_tokens', None))
-span.set_attribute(SemanticConvetion.GEN_AI_USAGE_TOTAL_TOKENS,
-response_dict.get('usage', {}).get('total_tokens', None))
+for i in range(kwargs.get('n',1)):
 span.set_attribute(SemanticConvetion.GEN_AI_RESPONSE_FINISH_REASON,
-[response_dict.get('choices'
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-span.add_event(
-name=attribute_name,
-attributes={
-SemanticConvetion.GEN_AI_CONTENT_COMPLETION: response_dict.get('choices')[i].get("message").get("content"),
-},
-)
-i += 1
-
-# Return original response
-return response
-
-# Set span attributes when tools is passed to the function call
-elif "tools" in kwargs:
-# Calculate cost of the operation
-cost = get_chat_model_cost(kwargs.get("model", "gpt-3.5-turbo"),
-pricing_info, response_dict.get('usage').get('prompt_tokens'),
-response_dict.get('usage').get('completion_tokens'))
-span.add_event(
-name=SemanticConvetion.GEN_AI_CONTENT_COMPLETION_EVENT,
-attributes={
-SemanticConvetion.GEN_AI_CONTENT_COMPLETION: "Function called with tools",
-},
-)
-span.set_attribute(SemanticConvetion.GEN_AI_USAGE_INPUT_TOKENS,
-response_dict.get('usage').get('prompt_tokens'))
-span.set_attribute(SemanticConvetion.GEN_AI_USAGE_OUTPUT_TOKENS,
-response_dict.get('usage').get('completion_tokens'))
-span.set_attribute(SemanticConvetion.GEN_AI_USAGE_TOTAL_TOKENS,
-response_dict.get('usage').get('total_tokens'))
-span.set_attribute(SemanticConvetion.GEN_AI_USAGE_COST,
-cost)
+[response_dict.get('choices')[i].get('finish_reason')])
+if trace_content:
+span.add_event(
+name=SemanticConvetion.GEN_AI_CONTENT_COMPLETION_EVENT,
+attributes={
+# pylint: disable=line-too-long
+SemanticConvetion.GEN_AI_CONTENT_COMPLETION: str(response_dict.get('choices')[i].get('message').get('content')),
+},
+)
+if kwargs.get('tools'):
+span.set_attribute(SemanticConvetion.GEN_AI_TOOL_CALLS,
+str(response_dict.get('choices')[i].get('message').get('tool_calls')))
+
+if isinstance(response_dict.get('choices')[i].get('message').get('content'), str):
+span.set_attribute(SemanticConvetion.GEN_AI_OUTPUT_TYPE,
+'text')
+elif response_dict.get('choices')[i].get('message').get('content') is not None:
+span.set_attribute(SemanticConvetion.GEN_AI_OUTPUT_TYPE,
+'json')
 
 span.set_status(Status(StatusCode.OK))
 
 if disable_metrics is False:
-attributes =
-
-
-SemanticConvetion.
-
-
-
-
-
-
-
-
-
-
-
-
-
-metrics[
-
-
+attributes = create_metrics_attributes(
+service_name=application_name,
+deployment_environment=environment,
+operation=SemanticConvetion.GEN_AI_OPERATION_TYPE_CHAT,
+system=SemanticConvetion.GEN_AI_SYSTEM_LITELLM,
+request_model=request_model,
+server_address=server_address,
+server_port=server_port,
+response_model=response_dict.get('model'),
+)
+
+metrics['genai_client_usage_tokens'].record(
+input_tokens + output_tokens, attributes
+)
+metrics['genai_client_operation_duration'].record(
+end_time - start_time, attributes
+)
+metrics['genai_server_ttft'].record(
+end_time - start_time, attributes
+)
+metrics['genai_requests'].add(1, attributes)
+metrics['genai_completion_tokens'].add(output_tokens, attributes)
+metrics['genai_prompt_tokens'].add(input_tokens, attributes)
+metrics['genai_cost'].record(cost, attributes)
 
 # Return original response
 return response
 
 except Exception as e:
 handle_exception(span, e)
-logger.error(
+logger.error('Error in trace creation: %s', e)
 
 # Return original response
 return response
 
 return wrapper
 
-def aembedding(
-
+def aembedding(version, environment, application_name,
+tracer, pricing_info, trace_content, metrics, disable_metrics):
 """
 Generates a telemetry wrapper for embeddings to collect metrics.
-
+
 Args:
-gen_ai_endpoint: Endpoint identifier for logging and tracing.
 version: Version of the monitoring package.
 environment: Deployment environment (e.g., production, staging).
-application_name: Name of the application using the
+application_name: Name of the application using the LiteLLM API.
 tracer: OpenTelemetry tracer for creating spans.
-pricing_info: Information used for calculating the cost of
+pricing_info: Information used for calculating the cost of LiteLLM usage.
 trace_content: Flag indicating whether to trace the actual content.
-
+
 Returns:
 A function that wraps the embeddings method to add telemetry.
 """
```
```diff
@@ -441,79 +497,94 @@ def aembedding(gen_ai_endpoint, version, environment, application_name,
 The response from the original 'embeddings' method.
 """
 
-
+server_address, server_port = 'NOT_FOUND', 'NOT_FOUND'
+request_model = kwargs.get('model', 'text-embedding-ada-002')
+
+span_name = f'{SemanticConvetion.GEN_AI_OPERATION_TYPE_EMBEDDING} {request_model}'
+
+with tracer.start_as_current_span(span_name, kind= SpanKind.CLIENT) as span:
+start_time = time.time()
 response = await wrapped(*args, **kwargs)
+end_time = time.time()
+
 response_dict = response_as_dict(response)
 try:
+input_tokens = response_dict.get('usage').get('prompt_tokens')
+
 # Calculate cost of the operation
-cost = get_embed_model_cost(
-
+cost = get_embed_model_cost(request_model,
+pricing_info, input_tokens)
 
-# Set Span attributes
-span.set_attribute(TELEMETRY_SDK_NAME,
-span.set_attribute(SemanticConvetion.GEN_AI_SYSTEM,
-SemanticConvetion.GEN_AI_SYSTEM_OPENAI)
+# Set Span attributes (OTel Semconv)
+span.set_attribute(TELEMETRY_SDK_NAME, 'openlit')
 span.set_attribute(SemanticConvetion.GEN_AI_OPERATION,
 SemanticConvetion.GEN_AI_OPERATION_TYPE_EMBEDDING)
-span.set_attribute(SemanticConvetion.
-
-span.set_attribute(SemanticConvetion.GEN_AI_ENVIRONMENT,
-environment)
-span.set_attribute(SemanticConvetion.GEN_AI_APPLICATION_NAME,
-application_name)
+span.set_attribute(SemanticConvetion.GEN_AI_SYSTEM,
+SemanticConvetion.GEN_AI_SYSTEM_LITELLM)
 span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_MODEL,
-
+request_model)
 span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_ENCODING_FORMATS,
-kwargs.get(
-
-
-span.set_attribute(SemanticConvetion.
-
+[kwargs.get('encoding_format', 'float')])
+span.set_attribute(SemanticConvetion.GEN_AI_RESPONSE_MODEL,
+response_dict.get('model'))
+span.set_attribute(SemanticConvetion.SERVER_ADDRESS,
+server_address)
+span.set_attribute(SemanticConvetion.SERVER_PORT,
+server_port)
 span.set_attribute(SemanticConvetion.GEN_AI_USAGE_INPUT_TOKENS,
-
+input_tokens)
+
+# Set Span attributes (Extras)
+span.set_attribute(DEPLOYMENT_ENVIRONMENT,
+environment)
+span.set_attribute(SERVICE_NAME,
+application_name)
+span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_USER,
+kwargs.get('user', ''))
 span.set_attribute(SemanticConvetion.GEN_AI_USAGE_TOTAL_TOKENS,
-
+input_tokens)
 span.set_attribute(SemanticConvetion.GEN_AI_USAGE_COST,
 cost)
+span.set_attribute(SemanticConvetion.GEN_AI_SDK_VERSION,
+version)
+
 if trace_content:
 span.add_event(
 name=SemanticConvetion.GEN_AI_CONTENT_PROMPT_EVENT,
 attributes={
-SemanticConvetion.GEN_AI_CONTENT_PROMPT: kwargs.get(
+SemanticConvetion.GEN_AI_CONTENT_PROMPT: str(kwargs.get('input', '')),
 },
 )
 
 span.set_status(Status(StatusCode.OK))
 
 if disable_metrics is False:
-attributes =
-
-
-SemanticConvetion.
-
-
-
-
-
-
-
-
-
-
-
-
-metrics[
-
-metrics[
-response_dict.get('usage').get('prompt_tokens'), attributes)
-metrics["genai_cost"].record(cost, attributes)
+attributes = create_metrics_attributes(
+service_name=application_name,
+deployment_environment=environment,
+operation=SemanticConvetion.GEN_AI_OPERATION_TYPE_EMBEDDING,
+system=SemanticConvetion.GEN_AI_SYSTEM_LITELLM,
+request_model=request_model,
+server_address=server_address,
+server_port=server_port,
+response_model=response_dict.get('model'),
+)
+metrics['genai_client_usage_tokens'].record(
+input_tokens, attributes
+)
+metrics['genai_client_operation_duration'].record(
+end_time - start_time, attributes
+)
+metrics['genai_requests'].add(1, attributes)
+metrics['genai_prompt_tokens'].add(input_tokens, attributes)
+metrics['genai_cost'].record(cost, attributes)
 
 # Return original response
 return response
 
 except Exception as e:
 handle_exception(span, e)
-logger.error(
+logger.error('Error in trace creation: %s', e)
 
 # Return original response
 return response
```