openlit 1.34.11__py3-none-any.whl → 1.34.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openlit/__helpers.py +3 -3
- openlit/instrumentation/ai21/__init__.py +10 -8
- openlit/instrumentation/ai21/ai21.py +15 -27
- openlit/instrumentation/ai21/async_ai21.py +15 -27
- openlit/instrumentation/ai21/utils.py +229 -212
- openlit/instrumentation/openai/__init__.py +3 -3
- openlit/instrumentation/vllm/__init__.py +5 -7
- openlit/instrumentation/vllm/utils.py +85 -103
- openlit/instrumentation/vllm/vllm.py +3 -8
- {openlit-1.34.11.dist-info → openlit-1.34.13.dist-info}/METADATA +1 -1
- {openlit-1.34.11.dist-info → openlit-1.34.13.dist-info}/RECORD +13 -13
- {openlit-1.34.11.dist-info → openlit-1.34.13.dist-info}/LICENSE +0 -0
- {openlit-1.34.11.dist-info → openlit-1.34.13.dist-info}/WHEEL +0 -0
openlit/__helpers.py
CHANGED
```diff
@@ -346,12 +346,12 @@ def common_span_attributes(scope, gen_ai_operation, gen_ai_system, server_address
 scope._span.set_attribute(SemanticConvention.SERVER_ADDRESS, server_address)
 scope._span.set_attribute(SemanticConvention.SERVER_PORT, server_port)
 scope._span.set_attribute(SemanticConvention.GEN_AI_REQUEST_MODEL, request_model)
-scope._span.set_attribute(SemanticConvention.GEN_AI_RESPONSE_MODEL,
+scope._span.set_attribute(SemanticConvention.GEN_AI_RESPONSE_MODEL, response_model)
 scope._span.set_attribute(DEPLOYMENT_ENVIRONMENT, environment)
 scope._span.set_attribute(SERVICE_NAME, application_name)
 scope._span.set_attribute(SemanticConvention.GEN_AI_REQUEST_IS_STREAM, is_stream)
-scope._span.set_attribute(SemanticConvention.GEN_AI_SERVER_TBT,
-scope._span.set_attribute(SemanticConvention.GEN_AI_SERVER_TTFT,
+scope._span.set_attribute(SemanticConvention.GEN_AI_SERVER_TBT, tbt)
+scope._span.set_attribute(SemanticConvention.GEN_AI_SERVER_TTFT, ttft)
 scope._span.set_attribute(SemanticConvention.GEN_AI_SDK_VERSION, version)

 def record_completion_metrics(metrics, gen_ai_operation, gen_ai_system, server_address, server_port,
```
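The corrected helper now receives the response model and the TBT/TTFT timings as explicit arguments. A minimal sketch of a call site, modelled on the `common_span_attributes(scope, ...)` invocations that the AI21 and vLLM utility modules add later in this diff; the tracer setup and the `SimpleNamespace` scope are illustrative, not part of the package (the portion of the helper visible here only touches `scope._span`, so a bare namespace object is enough for illustration):

```python
# Illustrative only: mirrors the call sites added in ai21/utils.py and vllm/utils.py below.
from types import SimpleNamespace

from opentelemetry import trace

from openlit.__helpers import common_span_attributes
from openlit.semcov import SemanticConvention

tracer = trace.get_tracer(__name__)

with tracer.start_as_current_span("chat jamba-1.5-mini") as span:
    # The helper expects a "scope" object carrying the active span.
    scope = SimpleNamespace(_span=span)
    common_span_attributes(
        scope,
        SemanticConvention.GEN_AI_OPERATION_TYPE_CHAT,
        SemanticConvention.GEN_AI_SYSTEM_AI21,
        "api.ai21.com", 443,                  # server_address, server_port
        "jamba-1.5-mini", "jamba-1.5-mini",   # request_model, response_model
        "production", "my-app",               # environment, application_name
        False,                                # is_stream
        0.0, 0.12,                            # tbt, ttft (seconds)
        "1.34.13",                            # SDK version
    )
```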
openlit/instrumentation/ai21/__init__.py
CHANGED

```diff
@@ -1,4 +1,3 @@
-# pylint: disable=useless-return, bad-staticmethod-argument, disable=duplicate-code
 """Initializer of Auto Instrumentation of AI21 Functions"""

 from typing import Collection
@@ -33,34 +32,37 @@ class AI21Instrumentor(BaseInstrumentor):
 disable_metrics = kwargs.get("disable_metrics")
 version = importlib.metadata.version("ai21")

-#
+# Chat completions
 wrap_function_wrapper(
 "ai21.clients.studio.resources.chat.chat_completions",
 "ChatCompletions.create",
 chat(version, environment, application_name,
-
+tracer, pricing_info, capture_message_content, metrics, disable_metrics),
 )
+
+# RAG completions
 wrap_function_wrapper(
 "ai21.clients.studio.resources.studio_conversational_rag",
 "StudioConversationalRag.create",
 chat_rag(version, environment, application_name,
-
+tracer, pricing_info, capture_message_content, metrics, disable_metrics),
 )

-#Async
+# Async chat completions
 wrap_function_wrapper(
 "ai21.clients.studio.resources.chat.async_chat_completions",
 "AsyncChatCompletions.create",
 async_chat(version, environment, application_name,
-
+tracer, pricing_info, capture_message_content, metrics, disable_metrics),
 )
+
+# Async RAG completions
 wrap_function_wrapper(
 "ai21.clients.studio.resources.studio_conversational_rag",
 "AsyncStudioConversationalRag.create",
 async_chat_rag(version, environment, application_name,
-
+tracer, pricing_info, capture_message_content, metrics, disable_metrics),
 )

 def _uninstrument(self, **kwargs):
-# Proper uninstrumentation logic to revert patched methods
 pass
```
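In normal use these wrappers are not registered by hand; the instrumentor is driven by the package's top-level `openlit.init()` entry point, which forwards the `application_name`, `environment`, tracer, pricing and metrics objects consumed via `kwargs` above. A minimal sketch, assuming the public `init()` call accepts these two keyword arguments:

```python
# Hypothetical setup; the argument names mirror the kwargs read by _instrument above.
import openlit

openlit.init(
    application_name="my-app",   # surfaced on spans as the service name
    environment="production",    # surfaced as the deployment environment
)

# From this point on, AI21 ChatCompletions.create / StudioConversationalRag.create
# calls (sync and async) are wrapped by the telemetry wrappers registered above.
```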
openlit/instrumentation/ai21/ai21.py
CHANGED

```diff
@@ -1,8 +1,7 @@
 """
-Module for monitoring AI21 calls.
+Module for monitoring AI21 API calls.
 """

-import logging
 import time
 from opentelemetry.trace import SpanKind
 from openlit.__helpers import (
@@ -15,14 +14,10 @@ from openlit.instrumentation.ai21.utils import (
 process_streaming_chat_response,
 process_chat_rag_response
 )
-
 from openlit.semcov import SemanticConvention

-
-
-
-def chat(version, environment, application_name,
-tracer, pricing_info, capture_message_content, metrics, disable_metrics):
+def chat(version, environment, application_name, tracer, pricing_info,
+capture_message_content, metrics, disable_metrics):
 """
 Generates a telemetry wrapper for GenAI function call
 """
@@ -45,14 +40,12 @@ def chat(version, environment, application_name,
 self.__wrapped__ = wrapped
 self._span = span
 self._span_name = span_name
-# Placeholder for aggregating streaming response
 self._llmresponse = ""
 self._response_id = ""
 self._finish_reason = ""
+self._tools = None
 self._input_tokens = 0
 self._output_tokens = 0
-self._choices = []
-
 self._args = args
 self._kwargs = kwargs
 self._start_time = time.time()
@@ -83,9 +76,8 @@ def chat(version, environment, application_name,
 process_chunk(self, chunk)
 return chunk
 except StopIteration:
-# Handling exception ensure observability without disrupting operation
 try:
-with tracer.start_as_current_span(self._span_name, kind=
+with tracer.start_as_current_span(self._span_name, kind=SpanKind.CLIENT) as self._span:
 process_streaming_chat_response(
 self,
 pricing_info=pricing_info,
@@ -96,34 +88,31 @@ def chat(version, environment, application_name,
 disable_metrics=disable_metrics,
 version=version
 )
+
 except Exception as e:
 handle_exception(self._span, e)
-
+
 raise

 def wrapper(wrapped, instance, args, kwargs):
 """
 Wraps the GenAI function call.
 """
-
 # Check if streaming is enabled for the API call
 streaming = kwargs.get("stream", False)
-
 server_address, server_port = set_server_address_and_port(instance, "api.ai21.com", 443)
 request_model = kwargs.get("model", "jamba-1.5-mini")

 span_name = f"{SemanticConvention.GEN_AI_OPERATION_TYPE_CHAT} {request_model}"

-# pylint: disable=no-else-return
 if streaming:
-# Special handling for streaming response
+# Special handling for streaming response
 awaited_wrapped = wrapped(*args, **kwargs)
 span = tracer.start_span(span_name, kind=SpanKind.CLIENT)
 return TracedSyncStream(awaited_wrapped, span, span_name, kwargs, server_address, server_port)
-
-# Handling for non-streaming responses
 else:
-
+# Handling for non-streaming responses
+with tracer.start_as_current_span(span_name, kind=SpanKind.CLIENT) as span:
 start_time = time.time()
 response = wrapped(*args, **kwargs)

@@ -152,23 +141,22 @@ def chat(version, environment, application_name,

 return wrapper

-def chat_rag(version, environment, application_name,
-
+def chat_rag(version, environment, application_name, tracer, pricing_info,
+capture_message_content, metrics, disable_metrics):
 """
-Generates a telemetry wrapper for GenAI function call
+Generates a telemetry wrapper for GenAI RAG function call
 """

 def wrapper(wrapped, instance, args, kwargs):
 """
-Wraps the GenAI function call.
+Wraps the GenAI RAG function call.
 """
-
 server_address, server_port = set_server_address_and_port(instance, "api.ai21.com", 443)
 request_model = kwargs.get("model", "jamba-1.5-mini")

 span_name = f"{SemanticConvention.GEN_AI_OPERATION_TYPE_CHAT} {request_model}"

-with tracer.start_as_current_span(span_name, kind=
+with tracer.start_as_current_span(span_name, kind=SpanKind.CLIENT) as span:
 start_time = time.time()
 response = wrapped(*args, **kwargs)

```
openlit/instrumentation/ai21/async_ai21.py
CHANGED

```diff
@@ -1,8 +1,7 @@
 """
-Module for monitoring AI21 calls.
+Module for monitoring AI21 API calls (async version).
 """

-import logging
 import time
 from opentelemetry.trace import SpanKind
 from openlit.__helpers import (
@@ -15,21 +14,17 @@ from openlit.instrumentation.ai21.utils import (
 process_streaming_chat_response,
 process_chat_rag_response
 )
-
 from openlit.semcov import SemanticConvention

-
-
-
-def async_chat(version, environment, application_name,
-tracer, pricing_info, capture_message_content, metrics, disable_metrics):
+def async_chat(version, environment, application_name, tracer, pricing_info,
+capture_message_content, metrics, disable_metrics):
 """
 Generates a telemetry wrapper for GenAI function call
 """

 class TracedAsyncStream:
 """
-Wrapper for streaming responses to collect telemetry.
+Wrapper for async streaming responses to collect telemetry.
 """

 def __init__(
@@ -45,14 +40,12 @@ def async_chat(version, environment, application_name,
 self.__wrapped__ = wrapped
 self._span = span
 self._span_name = span_name
-# Placeholder for aggregating streaming response
 self._llmresponse = ""
 self._response_id = ""
 self._finish_reason = ""
+self._tools = None
 self._input_tokens = 0
 self._output_tokens = 0
-self._choices = []
-
 self._args = args
 self._kwargs = kwargs
 self._start_time = time.time()
@@ -83,9 +76,8 @@ def async_chat(version, environment, application_name,
 process_chunk(self, chunk)
 return chunk
 except StopAsyncIteration:
-# Handling exception ensure observability without disrupting operation
 try:
-with tracer.start_as_current_span(self._span_name, kind=
+with tracer.start_as_current_span(self._span_name, kind=SpanKind.CLIENT) as self._span:
 process_streaming_chat_response(
 self,
 pricing_info=pricing_info,
@@ -96,6 +88,7 @@ def async_chat(version, environment, application_name,
 disable_metrics=disable_metrics,
 version=version
 )
+
 except Exception as e:
 handle_exception(self._span, e)

@@ -105,25 +98,21 @@ def async_chat(version, environment, application_name,
 """
 Wraps the GenAI function call.
 """
-
 # Check if streaming is enabled for the API call
 streaming = kwargs.get("stream", False)
-
 server_address, server_port = set_server_address_and_port(instance, "api.ai21.com", 443)
 request_model = kwargs.get("model", "jamba-1.5-mini")

 span_name = f"{SemanticConvention.GEN_AI_OPERATION_TYPE_CHAT} {request_model}"

-# pylint: disable=no-else-return
 if streaming:
-# Special handling for streaming response
+# Special handling for streaming response
 awaited_wrapped = await wrapped(*args, **kwargs)
 span = tracer.start_span(span_name, kind=SpanKind.CLIENT)
 return TracedAsyncStream(awaited_wrapped, span, span_name, kwargs, server_address, server_port)
-
-# Handling for non-streaming responses
 else:
-
+# Handling for non-streaming responses
+with tracer.start_as_current_span(span_name, kind=SpanKind.CLIENT) as span:
 start_time = time.time()
 response = await wrapped(*args, **kwargs)

@@ -152,23 +141,22 @@ def async_chat(version, environment, application_name,

 return wrapper

-def async_chat_rag(version, environment, application_name,
-
+def async_chat_rag(version, environment, application_name, tracer, pricing_info,
+capture_message_content, metrics, disable_metrics):
 """
-Generates a telemetry wrapper for GenAI function call
+Generates a telemetry wrapper for GenAI RAG function call
 """

 async def wrapper(wrapped, instance, args, kwargs):
 """
-Wraps the GenAI function call.
+Wraps the GenAI RAG function call.
 """
-
 server_address, server_port = set_server_address_and_port(instance, "api.ai21.com", 443)
 request_model = kwargs.get("model", "jamba-1.5-mini")

 span_name = f"{SemanticConvention.GEN_AI_OPERATION_TYPE_CHAT} {request_model}"

-with tracer.start_as_current_span(span_name, kind=
+with tracer.start_as_current_span(span_name, kind=SpanKind.CLIENT) as span:
 start_time = time.time()
 response = await wrapped(*args, **kwargs)

```
openlit/instrumentation/ai21/utils.py
CHANGED

```diff
@@ -4,7 +4,6 @@ AI21 OpenTelemetry instrumentation utility functions

 import time

-from opentelemetry.sdk.resources import SERVICE_NAME, TELEMETRY_SDK_NAME, DEPLOYMENT_ENVIRONMENT
 from opentelemetry.trace import Status, StatusCode

 from openlit.__helpers import (
@@ -12,289 +11,307 @@ from openlit.__helpers import (
 response_as_dict,
 calculate_tbt,
 general_tokens,
-extract_and_format_input,
 get_chat_model_cost,
-
-
+common_span_attributes,
+record_completion_metrics,
 )
 from openlit.semcov import SemanticConvention

-def
-server_port, server_address, environment,
-application_name, extra_attrs):
+def format_content(messages):
 """
-
+Process a list of messages to extract content.
 """

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-# Environment and service identifiers.
-span.set_attribute(DEPLOYMENT_ENVIRONMENT, environment)
-span.set_attribute(SERVICE_NAME, application_name)
-
-# Set any extra attributes passed in.
-for key, value in extra_attrs.items():
-span.set_attribute(key, value)
-
-def record_common_metrics(metrics, application_name, environment, request_model,
-server_address, server_port, start_time, end_time,
-input_tokens, output_tokens, cost, include_tbt=False, tbt_value=None):
-"""
-Record common metrics for the operation.
-"""
-
-attributes = create_metrics_attributes(
-service_name=application_name,
-deployment_environment=environment,
-operation=SemanticConvention.GEN_AI_OPERATION_TYPE_CHAT,
-system=SemanticConvention.GEN_AI_SYSTEM_AI21,
-request_model=request_model,
-server_address=server_address,
-server_port=server_port,
-response_model=request_model,
-)
-metrics["genai_client_usage_tokens"].record(input_tokens + output_tokens, attributes)
-metrics["genai_client_operation_duration"].record(end_time - start_time, attributes)
-if include_tbt and tbt_value is not None:
-metrics["genai_server_tbt"].record(tbt_value, attributes)
-metrics["genai_server_ttft"].record(end_time - start_time, attributes)
-metrics["genai_requests"].add(1, attributes)
-metrics["genai_completion_tokens"].add(output_tokens, attributes)
-metrics["genai_prompt_tokens"].add(input_tokens, attributes)
-metrics["genai_cost"].record(cost, attributes)
-
-def process_chunk(self, chunk):
+formatted_messages = []
+for message in messages:
+# Handle different message formats
+if hasattr(message, "role") and (hasattr(message, "content") or hasattr(message, "text")):
+# ChatMessage object (AI21 format)
+role = str(message.role) if hasattr(message.role, 'value') else str(message.role)
+content = getattr(message, "content", None) or getattr(message, "text", "")
+elif isinstance(message, dict):
+# Dictionary format
+role = message["role"]
+content = message["content"]
+else:
+# Fallback - try to extract as string
+role = str(getattr(message, "role", "unknown"))
+content = str(getattr(message, "content", "") or getattr(message, "text", ""))
+
+if isinstance(content, list):
+content_str = ", ".join(
+f'{item["type"]}: {item["text"] if "text" in item else item["image_url"]}'
+if "type" in item else f'text: {item["text"]}'
+for item in content
+)
+formatted_messages.append(f"{role}: {content_str}")
+else:
+formatted_messages.append(f"{role}: {content}")
+
+return "\n".join(formatted_messages)
+
+def process_chunk(scope, chunk):
 """
 Process a chunk of response data and update state.
 """

 end_time = time.time()
-# Record the timestamp for the current chunk
-
-
-
-
+# Record the timestamp for the current chunk
+scope._timestamps.append(end_time)
+
+if len(scope._timestamps) == 1:
+# Calculate time to first chunk
+scope._ttft = calculate_ttft(scope._timestamps, scope._start_time)

 chunked = response_as_dict(chunk)
-
-
-
-
-
-
-
-
-
-
-
+
+# Collect message IDs and aggregated response from events
+if (len(chunked.get("choices", [])) > 0 and
+"delta" in chunked.get("choices")[0] and
+"content" in chunked.get("choices")[0].get("delta", {})):
+
+content = chunked.get("choices")[0].get("delta").get("content")
+if content:
+scope._llmresponse += content
+
+if chunked.get("usage"):
+scope._input_tokens = chunked.get("usage").get("prompt_tokens")
+scope._output_tokens = chunked.get("usage").get("completion_tokens")
+scope._response_id = chunked.get("id")
+scope._finish_reason = chunked.get("choices", [{}])[0].get("finish_reason")
+scope._end_time = time.time()

 def common_chat_logic(scope, pricing_info, environment, application_name, metrics,
 capture_message_content, disable_metrics, version, is_stream):
 """
-Process chat request and generate Telemetry
+Process chat request and generate Telemetry
 """

-scope._end_time = time.time()
 if len(scope._timestamps) > 1:
 scope._tbt = calculate_tbt(scope._timestamps)

-
-formatted_messages = extract_and_format_input(scope._kwargs.get("messages", ""))
-prompt = concatenate_all_contents(formatted_messages)
+prompt = format_content(scope._kwargs.get("messages", []))
 request_model = scope._kwargs.get("model", "jamba-1.5-mini")

-# Calculate cost based on token usage.
 cost = get_chat_model_cost(request_model, pricing_info, scope._input_tokens, scope._output_tokens)
-# Prepare tokens dictionary.
-tokens = {
-"finish_reason": scope._finish_reason,
-"response_id": scope._response_id,
-"input_tokens": scope._input_tokens,
-"output_tokens": scope._output_tokens,
-"total_tokens": scope._input_tokens + scope._output_tokens,
-}
-extra_attrs = {
-SemanticConvention.GEN_AI_REQUEST_IS_STREAM: is_stream,
-SemanticConvention.GEN_AI_CLIENT_TOKEN_USAGE: scope._input_tokens + scope._output_tokens,
-SemanticConvention.GEN_AI_USAGE_COST: cost,
-SemanticConvention.GEN_AI_SERVER_TBT: scope._tbt,
-SemanticConvention.GEN_AI_SERVER_TTFT: scope._ttft,
-SemanticConvention.GEN_AI_SDK_VERSION: version,
-SemanticConvention.GEN_AI_OUTPUT_TYPE: "text" if isinstance(scope._llmresponse, str) else "json"
-}
-# Set span attributes.
-setup_common_span_attributes(scope._span, request_model, scope._kwargs, tokens,
-scope._server_port, scope._server_address, environment,
-application_name, extra_attrs)

+# Common Span Attributes
+common_span_attributes(scope,
+SemanticConvention.GEN_AI_OPERATION_TYPE_CHAT, SemanticConvention.GEN_AI_SYSTEM_AI21,
+scope._server_address, scope._server_port, request_model, request_model,
+environment, application_name, is_stream, scope._tbt, scope._ttft, version)
+
+# Span Attributes for Request parameters
+scope._span.set_attribute(SemanticConvention.GEN_AI_REQUEST_SEED, scope._kwargs.get("seed", ""))
+scope._span.set_attribute(SemanticConvention.GEN_AI_REQUEST_FREQUENCY_PENALTY, scope._kwargs.get("frequency_penalty", 0.0))
+scope._span.set_attribute(SemanticConvention.GEN_AI_REQUEST_MAX_TOKENS, scope._kwargs.get("max_tokens", -1))
+scope._span.set_attribute(SemanticConvention.GEN_AI_REQUEST_PRESENCE_PENALTY, scope._kwargs.get("presence_penalty", 0.0))
+scope._span.set_attribute(SemanticConvention.GEN_AI_REQUEST_STOP_SEQUENCES, scope._kwargs.get("stop", []))
+scope._span.set_attribute(SemanticConvention.GEN_AI_REQUEST_TEMPERATURE, scope._kwargs.get("temperature", 0.4))
+scope._span.set_attribute(SemanticConvention.GEN_AI_REQUEST_TOP_P, scope._kwargs.get("top_p", 1.0))
+
+# Span Attributes for Response parameters
+scope._span.set_attribute(SemanticConvention.GEN_AI_RESPONSE_ID, scope._response_id)
+scope._span.set_attribute(SemanticConvention.GEN_AI_RESPONSE_FINISH_REASON, [scope._finish_reason])
+scope._span.set_attribute(SemanticConvention.GEN_AI_OUTPUT_TYPE, "text" if isinstance(scope._llmresponse, str) else "json")
+
+# Span Attributes for Cost and Tokens
+scope._span.set_attribute(SemanticConvention.GEN_AI_USAGE_INPUT_TOKENS, scope._input_tokens)
+scope._span.set_attribute(SemanticConvention.GEN_AI_USAGE_OUTPUT_TOKENS, scope._output_tokens)
+scope._span.set_attribute(SemanticConvention.GEN_AI_CLIENT_TOKEN_USAGE, scope._input_tokens + scope._output_tokens)
+scope._span.set_attribute(SemanticConvention.GEN_AI_USAGE_COST, cost)
+
+# Span Attributes for Tools
+if scope._tools:
+scope._span.set_attribute(SemanticConvention.GEN_AI_TOOL_NAME, scope._tools.get("function", {}).get("name", ""))
+scope._span.set_attribute(SemanticConvention.GEN_AI_TOOL_CALL_ID, str(scope._tools.get("id", "")))
+scope._span.set_attribute(SemanticConvention.GEN_AI_TOOL_ARGS, str(scope._tools.get("function", {}).get("arguments", "")))
+
+# Span Attributes for Content
 if capture_message_content:
+scope._span.set_attribute(SemanticConvention.GEN_AI_CONTENT_PROMPT, prompt)
+scope._span.set_attribute(SemanticConvention.GEN_AI_CONTENT_COMPLETION, scope._llmresponse)
+
+# To be removed once the change to span_attributes (from span events) is complete
 scope._span.add_event(
 name=SemanticConvention.GEN_AI_CONTENT_PROMPT_EVENT,
-attributes={
+attributes={
+SemanticConvention.GEN_AI_CONTENT_PROMPT: prompt,
+},
 )
 scope._span.add_event(
 name=SemanticConvention.GEN_AI_CONTENT_COMPLETION_EVENT,
-attributes={
+attributes={
+SemanticConvention.GEN_AI_CONTENT_COMPLETION: scope._llmresponse,
+},
 )

 scope._span.set_status(Status(StatusCode.OK))

+# Metrics
 if not disable_metrics:
-
-
-
-
-include_tbt=True, tbt_value=scope._tbt)
+record_completion_metrics(metrics, SemanticConvention.GEN_AI_OPERATION_TYPE_CHAT, SemanticConvention.GEN_AI_SYSTEM_AI21,
+scope._server_address, scope._server_port, request_model, request_model, environment,
+application_name, scope._start_time, scope._end_time, scope._input_tokens, scope._output_tokens,
+cost, scope._tbt, scope._ttft)

-def process_streaming_chat_response(
+def process_streaming_chat_response(scope, pricing_info, environment, application_name, metrics,
 capture_message_content=False, disable_metrics=False, version=""):
 """
-Process
+Process streaming chat request and generate Telemetry
 """

-common_chat_logic(
+common_chat_logic(scope, pricing_info, environment, application_name, metrics,
 capture_message_content, disable_metrics, version, is_stream=True)

 def process_chat_response(response, request_model, pricing_info, server_port, server_address,
-
-
+environment, application_name, metrics, start_time, span, capture_message_content=False,
+disable_metrics=False, version="1.0.0", **kwargs):
 """
-Process
+Process chat request and generate Telemetry
 """

-# Create
-
+# Create scope object
+scope = type("GenericScope", (), {})()
 response_dict = response_as_dict(response)

-
-
-
-
-self._span = span
-# Concatenate content from all choices.
-self._llmresponse = "".join(
+scope._start_time = start_time
+scope._end_time = time.time()
+scope._span = span
+scope._llmresponse = " ".join(
 (choice.get("message", {}).get("content") or "")
 for choice in response_dict.get("choices", [])
 )
-
-
-
-
-
-
-
-
-
-
-
-
-
+scope._response_id = response_dict.get("id")
+scope._input_tokens = response_dict.get("usage", {}).get("prompt_tokens", 0)
+scope._output_tokens = response_dict.get("usage", {}).get("completion_tokens", 0)
+scope._timestamps = []
+scope._ttft, scope._tbt = scope._end_time - scope._start_time, 0
+scope._server_address, scope._server_port = server_address, server_port
+scope._kwargs = kwargs
+scope._finish_reason = str(response_dict.get("choices", [])[0].get("finish_reason", ""))
+
+# Handle tool calls
+if scope._kwargs.get("tools"):
+scope._tools = response_dict.get("choices", [{}])[0].get("message", {}).get("tool_calls")
+else:
+scope._tools = None
+
+common_chat_logic(scope, pricing_info, environment, application_name, metrics,
 capture_message_content, disable_metrics, version, is_stream=False)

 return response

-def
-
-span, capture_message_content=False, disable_metrics=False, version="1.0.0", **kwargs):
+def common_chat_rag_logic(scope, pricing_info, environment, application_name, metrics,
+capture_message_content, disable_metrics, version):
 """
-Process
+Process RAG chat request and generate Telemetry
 """
-end_time = time.time()
-response_dict = response_as_dict(response)
-# Format input messages into a single prompt string.
-messages_input = kwargs.get("messages", "")
-formatted_messages = extract_and_format_input(messages_input)
-prompt = concatenate_all_contents(formatted_messages)
-input_tokens = general_tokens(prompt)

-
-
-extra_attrs = {
-SemanticConvention.GEN_AI_REQUEST_IS_STREAM: False,
-SemanticConvention.GEN_AI_SERVER_TTFT: end_time - start_time,
-SemanticConvention.GEN_AI_SDK_VERSION: version,
-SemanticConvention.GEN_AI_RAG_MAX_SEGMENTS: kwargs.get("max_segments", -1),
-SemanticConvention.GEN_AI_RAG_STRATEGY: kwargs.get("retrieval_strategy", "segments"),
-SemanticConvention.GEN_AI_RAG_SIMILARITY_THRESHOLD: kwargs.get("retrieval_similarity_threshold", -1),
-SemanticConvention.GEN_AI_RAG_MAX_NEIGHBORS: kwargs.get("max_neighbors", -1),
-SemanticConvention.GEN_AI_RAG_FILE_IDS: str(kwargs.get("file_ids", "")),
-SemanticConvention.GEN_AI_RAG_DOCUMENTS_PATH: kwargs.get("path", "")
-}
-# Set common span attributes.
-setup_common_span_attributes(span, request_model, kwargs, tokens,
-server_port, server_address, environment, application_name,
-extra_attrs)
+prompt = format_content(scope._kwargs.get("messages", []))
+request_model = scope._kwargs.get("model", "jamba-1.5-mini")

+cost = get_chat_model_cost(request_model, pricing_info, scope._input_tokens, scope._output_tokens)
+
+# Common Span Attributes
+common_span_attributes(scope,
+SemanticConvention.GEN_AI_OPERATION_TYPE_CHAT, SemanticConvention.GEN_AI_SYSTEM_AI21,
+scope._server_address, scope._server_port, request_model, scope._response_model,
+environment, application_name, False, scope._tbt, scope._ttft, version)
+
+# RAG-specific span attributes
+scope._span.set_attribute(SemanticConvention.GEN_AI_RAG_MAX_SEGMENTS, scope._kwargs.get("max_segments", -1))
+scope._span.set_attribute(SemanticConvention.GEN_AI_RAG_STRATEGY, scope._kwargs.get("retrieval_strategy", "segments"))
+scope._span.set_attribute(SemanticConvention.GEN_AI_RAG_MAX_NEIGHBORS, scope._kwargs.get("max_neighbors", -1))
+scope._span.set_attribute(SemanticConvention.GEN_AI_RAG_FILE_IDS, str(scope._kwargs.get("file_ids", "")))
+scope._span.set_attribute(SemanticConvention.GEN_AI_RAG_DOCUMENTS_PATH, scope._kwargs.get("path", ""))
+scope._span.set_attribute(SemanticConvention.GEN_AI_RAG_SIMILARITY_THRESHOLD,
+scope._kwargs.get("retrieval_similarity_threshold", -1))
+
+# Standard span attributes
+scope._span.set_attribute(SemanticConvention.GEN_AI_RESPONSE_ID, scope._response_id)
+scope._span.set_attribute(SemanticConvention.GEN_AI_OUTPUT_TYPE, "text" if isinstance(scope._llmresponse, str) else "json")
+scope._span.set_attribute(SemanticConvention.GEN_AI_USAGE_INPUT_TOKENS, scope._input_tokens)
+scope._span.set_attribute(SemanticConvention.GEN_AI_USAGE_OUTPUT_TOKENS, scope._output_tokens)
+scope._span.set_attribute(SemanticConvention.GEN_AI_CLIENT_TOKEN_USAGE, scope._input_tokens + scope._output_tokens)
+scope._span.set_attribute(SemanticConvention.GEN_AI_USAGE_COST, cost)
+
+# Handle tool calls
+if scope._kwargs.get("tools"):
+scope._span.set_attribute(SemanticConvention.GEN_AI_TOOL_CALLS,
+str(scope._choices[0].get("message", {}).get("tool_calls", "")))
+
+# Content attributes
 if capture_message_content:
-
+scope._span.set_attribute(SemanticConvention.GEN_AI_CONTENT_PROMPT, prompt)
+scope._span.set_attribute(SemanticConvention.GEN_AI_CONTENT_COMPLETION, scope._llmresponse)
+
+# To be removed once the change to span_attributes (from span events) is complete
+scope._span.add_event(
 name=SemanticConvention.GEN_AI_CONTENT_PROMPT_EVENT,
-attributes={
+attributes={
+SemanticConvention.GEN_AI_CONTENT_PROMPT: prompt,
+},
+)
+scope._span.add_event(
+name=SemanticConvention.GEN_AI_CONTENT_COMPLETION_EVENT,
+attributes={
+SemanticConvention.GEN_AI_CONTENT_COMPLETION: scope._llmresponse,
+},
 )

-
+scope._span.set_status(Status(StatusCode.OK))
+
+# Metrics
+if not disable_metrics:
+record_completion_metrics(metrics, SemanticConvention.GEN_AI_OPERATION_TYPE_CHAT, SemanticConvention.GEN_AI_SYSTEM_AI21,
+scope._server_address, scope._server_port, request_model, scope._response_model, environment,
+application_name, scope._start_time, scope._end_time, scope._input_tokens, scope._output_tokens,
+cost, scope._tbt, scope._ttft)
+
+def process_chat_rag_response(response, request_model, pricing_info, server_port, server_address,
+environment, application_name, metrics, start_time, span, capture_message_content=False,
+disable_metrics=False, version="1.0.0", **kwargs):
+"""
+Process RAG chat request and generate Telemetry
+"""
+
+# Create scope object
+scope = type("GenericScope", (), {})()
+response_dict = response_as_dict(response)
+
+scope._start_time = start_time
+scope._end_time = time.time()
+scope._span = span
+
+# Format input messages and calculate input tokens
+prompt = format_content(kwargs.get("messages", []))
+input_tokens = general_tokens(prompt)
+
+# Process response choices
 choices = response_dict.get("choices", [])
 aggregated_completion = []
+output_tokens = 0
+
 for i in range(kwargs.get("n", 1)):
-# Get the response content from each choice and count tokens.
 content = choices[i].get("content", "")
 aggregated_completion.append(content)
 output_tokens += general_tokens(content)
-if kwargs.get("tools"):
-span.set_attribute(SemanticConvention.GEN_AI_TOOL_CALLS,
-str(choices[i].get("message", {}).get("tool_calls")))
-# Set output type based on actual content type.
-if isinstance(content, str):
-span.set_attribute(SemanticConvention.GEN_AI_OUTPUT_TYPE, "text")
-elif content is not None:
-span.set_attribute(SemanticConvention.GEN_AI_OUTPUT_TYPE, "json")
-
-# Concatenate completion responses.
-llmresponse = "".join(aggregated_completion)
-tokens["output_tokens"] = output_tokens
-tokens["total_tokens"] = input_tokens + output_tokens
-
-cost = get_chat_model_cost(request_model, pricing_info, input_tokens, output_tokens)
-span.set_attribute(SemanticConvention.GEN_AI_USAGE_COST, cost)
-span.set_attribute(SemanticConvention.GEN_AI_USAGE_OUTPUT_TOKENS, output_tokens)
-span.set_attribute(SemanticConvention.GEN_AI_CLIENT_TOKEN_USAGE, input_tokens + output_tokens)
-
-span.set_status(Status(StatusCode.OK))

-
-
-
-
-
+scope._llmresponse = "".join(aggregated_completion)
+scope._response_id = response_dict.get("id", "")
+scope._response_model = request_model
+scope._input_tokens = input_tokens
+scope._output_tokens = output_tokens
+scope._timestamps = []
+scope._ttft, scope._tbt = scope._end_time - scope._start_time, 0
+scope._server_address, scope._server_port = server_address, server_port
+scope._kwargs = kwargs
+scope._finish_reason = ""
+scope._tools = None
+scope._choices = choices
+
+common_chat_rag_logic(scope, pricing_info, environment, application_name, metrics,
+capture_message_content, disable_metrics, version)

-if not disable_metrics:
-record_common_metrics(metrics, application_name, environment, request_model,
-server_address, server_port, start_time, end_time,
-input_tokens, output_tokens, cost, include_tbt=False)
 return response
```
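The `format_content` helper introduced above replaces the removed `extract_and_format_input`/`concatenate_all_contents` pair with a single function that flattens a message list into one prompt string. A small usage sketch, assuming the helper behaves exactly as written in the hunk (the message values are made up for illustration):

```python
# Exercising the new format_content helper from the hunk above.
from openlit.instrumentation.ai21.utils import format_content

messages = [
    {"role": "system", "content": "You are concise."},
    {"role": "user", "content": [{"type": "text", "text": "Summarise the release notes."}]},
]

print(format_content(messages))
# system: You are concise.
# user: text: Summarise the release notes.
```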
openlit/instrumentation/openai/__init__.py
CHANGED

```diff
@@ -12,7 +12,7 @@ from openlit.instrumentation.openai.async_openai import (async_chat_completions,
 from openlit.instrumentation.openai.async_openai import async_image_generate, async_image_variatons
 from openlit.instrumentation.openai.async_openai import async_audio_create, async_responses

-_instruments = ("openai >= 1.
+_instruments = ("openai >= 1.92.0",)

 class OpenAIInstrumentor(BaseInstrumentor):
 """An instrumentor for OpenAI's client library."""
@@ -129,14 +129,14 @@ class OpenAIInstrumentor(BaseInstrumentor):
 )

 wrap_function_wrapper(
-"openai.resources.
+"openai.resources.chat.completions",
 "Completions.parse",
 chat_completions_parse(version, environment, application_name, tracer, pricing_info,
 capture_message_content, metrics, disable_metrics),
 )

 wrap_function_wrapper(
-"openai.resources.
+"openai.resources.chat.completions",
 "AsyncCompletions.parse",
 async_chat_completions_parse(version, environment, application_name, tracer, pricing_info,
 capture_message_content, metrics, disable_metrics),
```
openlit/instrumentation/vllm/__init__.py
CHANGED

```diff
@@ -1,4 +1,3 @@
-# pylint: disable=useless-return, bad-staticmethod-argument, disable=duplicate-code
 """Initializer of Auto Instrumentation of vLLM Functions"""

 from typing import Collection
@@ -14,15 +13,15 @@ _instruments = ("vllm >= 0.5.4",)

 class VLLMInstrumentor(BaseInstrumentor):
 """
-An instrumentor for vLLM
+An instrumentor for vLLM client library.
 """

 def instrumentation_dependencies(self) -> Collection[str]:
 return _instruments

 def _instrument(self, **kwargs):
-application_name = kwargs.get("application_name", "
-environment = kwargs.get("environment", "
+application_name = kwargs.get("application_name", "default")
+environment = kwargs.get("environment", "default")
 tracer = kwargs.get("tracer")
 metrics = kwargs.get("metrics_dict")
 pricing_info = kwargs.get("pricing_info", {})
@@ -30,14 +29,13 @@ class VLLMInstrumentor(BaseInstrumentor):
 disable_metrics = kwargs.get("disable_metrics")
 version = importlib.metadata.version("vllm")

-#
+# Chat completions
 wrap_function_wrapper(
 "vllm.entrypoints.llm",
 "LLM.generate",
 generate(version, environment, application_name,
-
+tracer, pricing_info, capture_message_content, metrics, disable_metrics),
 )

 def _uninstrument(self, **kwargs):
-# Proper uninstrumentation logic to revert patched methods
 pass
```
openlit/instrumentation/vllm/utils.py
CHANGED

```diff
@@ -1,15 +1,15 @@
 """
-
+vLLM OpenTelemetry instrumentation utility functions
 """
-
 import time
-
+
 from opentelemetry.trace import Status, StatusCode
+
 from openlit.__helpers import (
-calculate_tbt,
-get_chat_model_cost,
 general_tokens,
-
+get_chat_model_cost,
+common_span_attributes,
+record_completion_metrics,
 )
 from openlit.semcov import SemanticConvention

@@ -24,77 +24,81 @@ def get_inference_config(args, kwargs):
 return args[1]
 return None

+def format_content(prompts):
+"""
+Process a list of prompts to extract content.
+"""
+
+if isinstance(prompts, str):
+return prompts
+elif isinstance(prompts, list):
+return "\n".join(str(prompt) for prompt in prompts)
+else:
+return str(prompts)
+
 def common_chat_logic(scope, pricing_info, environment, application_name, metrics,
 capture_message_content, disable_metrics, version, is_stream):
 """
 Process chat request and generate Telemetry
 """

-
-
-
-
-
-
-
-
-
-
-
-
+request_model = scope._request_model
+
+# Extract prompts and completions from vLLM response
+input_tokens = 0
+output_tokens = 0
+prompt = ""
+completion = ""
+
+for output in scope._response:
+prompt += output.prompt + "\n"
+if output.outputs and len(output.outputs) > 0:
+completion += output.outputs[0].text + "\n"
+input_tokens += general_tokens(output.prompt)
+output_tokens += general_tokens(output.outputs[0].text)
+
+cost = get_chat_model_cost(request_model, pricing_info, input_tokens, output_tokens)
+
+# Common Span Attributes
+common_span_attributes(scope,
+SemanticConvention.GEN_AI_OPERATION_TYPE_CHAT, SemanticConvention.GEN_AI_SYSTEM_VLLM,
+scope._server_address, scope._server_port, request_model, request_model,
+environment, application_name, is_stream, scope._tbt, scope._ttft, version)
+
+# Span Attributes for Request parameters
 inference_config = get_inference_config(scope._args, scope._kwargs)
 if inference_config:
-
-
-
-
-
-
-(
-
-
-
-
-value = getattr(inference_config, key, None)
-if value is not None:
-scope._span.set_attribute(attribute, value)
-
-scope._span.set_attribute(SemanticConvention.GEN_AI_RESPONSE_MODEL, scope._request_model)
+scope._span.set_attribute(SemanticConvention.GEN_AI_REQUEST_MAX_TOKENS, getattr(inference_config, 'max_tokens', -1))
+scope._span.set_attribute(SemanticConvention.GEN_AI_REQUEST_STOP_SEQUENCES, getattr(inference_config, 'stop_sequences', []))
+scope._span.set_attribute(SemanticConvention.GEN_AI_REQUEST_TEMPERATURE, getattr(inference_config, 'temperature', 1.0))
+scope._span.set_attribute(SemanticConvention.GEN_AI_REQUEST_TOP_P, getattr(inference_config, 'top_p', 1.0))
+scope._span.set_attribute(SemanticConvention.GEN_AI_REQUEST_TOP_K, getattr(inference_config, 'top_k', -1))
+scope._span.set_attribute(SemanticConvention.GEN_AI_REQUEST_PRESENCE_PENALTY,
+getattr(inference_config, 'presence_penalty', 0.0))
+scope._span.set_attribute(SemanticConvention.GEN_AI_REQUEST_FREQUENCY_PENALTY,
+getattr(inference_config, 'frequency_penalty', 0.0))
+
+# Span Attributes for Response parameters
 scope._span.set_attribute(SemanticConvention.GEN_AI_OUTPUT_TYPE, "text")

-#
-scope._span.set_attribute(
-scope._span.set_attribute(
-scope._span.set_attribute(SemanticConvention.
-scope._span.set_attribute(SemanticConvention.
-scope._span.set_attribute(SemanticConvention.GEN_AI_SERVER_TTFT, scope._ttft)
-scope._span.set_attribute(SemanticConvention.GEN_AI_SDK_VERSION, version)
-
-input_tokens = 0
-output_tokens = 0
-cost = 0
+# Span Attributes for Cost and Tokens
+scope._span.set_attribute(SemanticConvention.GEN_AI_USAGE_INPUT_TOKENS, input_tokens)
+scope._span.set_attribute(SemanticConvention.GEN_AI_USAGE_OUTPUT_TOKENS, output_tokens)
+scope._span.set_attribute(SemanticConvention.GEN_AI_CLIENT_TOKEN_USAGE, input_tokens + output_tokens)
+scope._span.set_attribute(SemanticConvention.GEN_AI_USAGE_COST, cost)

+# Span Attributes for Content
 if capture_message_content:
-prompt
-completion
+scope._span.set_attribute(SemanticConvention.GEN_AI_CONTENT_PROMPT, prompt)
+scope._span.set_attribute(SemanticConvention.GEN_AI_CONTENT_COMPLETION, completion)

-
-prompt += output.prompt + "\n"
-if output.outputs and len(output.outputs) > 0:
-completion += output.outputs[0].text + "\n"
-input_tokens += general_tokens(output.prompt)
-output_tokens += general_tokens(output.outputs[0].text)
-
-# Add a single event for prompt
+# To be removed once the change to span_attributes (from span events) is complete
 scope._span.add_event(
 name=SemanticConvention.GEN_AI_CONTENT_PROMPT_EVENT,
 attributes={
 SemanticConvention.GEN_AI_CONTENT_PROMPT: prompt,
 },
 )
-
-# Add a single event for completion
 scope._span.add_event(
 name=SemanticConvention.GEN_AI_CONTENT_COMPLETION_EVENT,
 attributes={
@@ -102,39 +106,14 @@ def common_chat_logic(scope, pricing_info, environment, application_name, metric
 },
 )

-scope._span.set_attribute(SemanticConvention.GEN_AI_USAGE_INPUT_TOKENS,
-input_tokens)
-scope._span.set_attribute(SemanticConvention.GEN_AI_USAGE_OUTPUT_TOKENS,
-output_tokens)
-scope._span.set_attribute(SemanticConvention.GEN_AI_CLIENT_TOKEN_USAGE,
-input_tokens + output_tokens)
-
-# Calculate cost of the operation
-cost = get_chat_model_cost(scope._request_model, pricing_info, input_tokens, output_tokens)
-scope._span.set_attribute(SemanticConvention.GEN_AI_USAGE_COST, cost)
-
 scope._span.set_status(Status(StatusCode.OK))

-
-
-
-
-
-
-request_model=scope._request_model,
-server_address=scope._server_address,
-server_port=scope._server_port,
-response_model=scope._request_model,
-)
-metrics['genai_client_operation_duration'].record(scope._end_time - scope._start_time, metrics_attributes)
-metrics['genai_server_tbt'].record(scope._tbt, metrics_attributes)
-metrics['genai_server_ttft'].record(scope._ttft, metrics_attributes)
-metrics['genai_requests'].add(1, metrics_attributes)
-metrics['genai_completion_tokens'].add(output_tokens, metrics_attributes)
-metrics['genai_prompt_tokens'].add(input_tokens, metrics_attributes)
-metrics['genai_cost'].record(cost, metrics_attributes)
-metrics['genai_client_usage_tokens'].record(
-input_tokens + output_tokens, metrics_attributes)
+# Metrics
+if not disable_metrics:
+record_completion_metrics(metrics, SemanticConvention.GEN_AI_OPERATION_TYPE_CHAT, SemanticConvention.GEN_AI_SYSTEM_VLLM,
+scope._server_address, scope._server_port, request_model, request_model, environment,
+application_name, scope._start_time, scope._end_time, input_tokens, output_tokens,
+cost, scope._tbt, scope._ttft)

 def process_chat_response(instance, response, request_model, pricing_info, server_port, server_address,
 environment, application_name, metrics, start_time, span, args, kwargs,
@@ -142,20 +121,23 @@ def process_chat_response(instance, response, request_model, pricing_info, serve
 """
 Process chat request and generate Telemetry
 """
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+# Create scope object
+scope = type("GenericScope", (), {})()
+
+scope._response = response
+scope._start_time = start_time
+scope._end_time = time.time()
+scope._span = span
+scope._ttft, scope._tbt = scope._end_time - scope._start_time, 0
+scope._server_address = server_address
+scope._server_port = server_port
+scope._request_model = request_model
+scope._timestamps = []
+scope._args = args
+scope._kwargs = kwargs
+
+common_chat_logic(scope, pricing_info, environment, application_name, metrics,
 capture_message_content, disable_metrics, version, is_stream=False)

 return response
```
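Both new `process_chat_response` implementations in this release (AI21 and vLLM) build their state container with the same `type("GenericScope", (), {})()` idiom before handing it to `common_chat_logic`. A short standalone sketch of what that idiom does; the attribute values here are placeholders, not taken from the package:

```python
import time

# An empty class instance that accepts arbitrary attribute assignment,
# used as a lightweight per-request state container ("scope").
scope = type("GenericScope", (), {})()

scope._start_time = time.time()
scope._end_time = time.time()
scope._ttft, scope._tbt = scope._end_time - scope._start_time, 0
scope._server_address, scope._server_port = "127.0.0.1", 443

print(scope._ttft >= 0)  # True
```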
openlit/instrumentation/vllm/vllm.py
CHANGED

```diff
@@ -2,7 +2,6 @@
 Module for monitoring vLLM API calls.
 """

-import logging
 import time
 from opentelemetry.trace import SpanKind
 from openlit.__helpers import (
@@ -14,11 +13,8 @@ from openlit.instrumentation.vllm.utils import (
 )
 from openlit.semcov import SemanticConvention

-
-
-
-def generate(version, environment, application_name,
-tracer, pricing_info, capture_message_content, metrics, disable_metrics):
+def generate(version, environment, application_name, tracer, pricing_info,
+capture_message_content, metrics, disable_metrics):
 """
 Generates a telemetry wrapper for GenAI function call
 """
@@ -27,7 +23,6 @@ def generate(version, environment, application_name,
 """
 Wraps the GenAI function call.
 """
-
 server_address, server_port = set_server_address_and_port(instance, "http://127.0.0.1", 443)
 request_model = instance.llm_engine.model_config.model or "facebook/opt-125m"

@@ -56,9 +51,9 @@ def generate(version, environment, application_name,
 disable_metrics=disable_metrics,
 version=version,
 )
+
 except Exception as e:
 handle_exception(span, e)
-logger.error("Error in trace creation: %s", e)

 return response

```
{openlit-1.34.11.dist-info → openlit-1.34.13.dist-info}/METADATA
CHANGED

```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: openlit
-Version: 1.34.
+Version: 1.34.13
 Summary: OpenTelemetry-native Auto instrumentation library for monitoring LLM Applications and GPUs, facilitating the integration of observability into your GenAI-driven projects
 License: Apache-2.0
 Keywords: OpenTelemetry,otel,otlp,llm,tracing,openai,anthropic,claude,cohere,llm monitoring,observability,monitoring,gpt,Generative AI,chatGPT,gpu
```
{openlit-1.34.11.dist-info → openlit-1.34.13.dist-info}/RECORD
CHANGED

```diff
@@ -1,4 +1,4 @@
-openlit/__helpers.py,sha256=
+openlit/__helpers.py,sha256=x_HA-B3v0lawXeg3_yASXAzN0P0hChrgWyYXdLGY0Pw,14862
 openlit/__init__.py,sha256=ris6-GY0ePSbK_jvawHTXymGClVF7yeKdIT95IRBl18,24086
 openlit/evals/__init__.py,sha256=nJe99nuLo1b5rf7pt9U9BCdSDedzbVi2Fj96cgl7msM,380
 openlit/evals/all.py,sha256=oWrue3PotE-rB5WePG3MRYSA-ro6WivkclSHjYlAqGs,7154
@@ -14,10 +14,10 @@ openlit/guard/sensitive_topic.py,sha256=RgVw_laFERv0nNdzBsAd2_3yLomMOK-gVq-P7oj1
 openlit/guard/utils.py,sha256=6hE3rCRjFXYjKRQYUo8YsqUSlvod48nOWp8MwoQEYdw,7670
 openlit/instrumentation/ag2/__init__.py,sha256=KgyLJBmwAxRWu7Z0S8FDDK4TZ13EFoAAIalvG5Oq4wc,1839
 openlit/instrumentation/ag2/ag2.py,sha256=eNQziyeZl4396GsIp5qI1Dne2KcnQMmhftW7joKQvNU,6934
-openlit/instrumentation/ai21/__init__.py,sha256=
-openlit/instrumentation/ai21/ai21.py,sha256=
-openlit/instrumentation/ai21/async_ai21.py,sha256=
-openlit/instrumentation/ai21/utils.py,sha256=
+openlit/instrumentation/ai21/__init__.py,sha256=tKX643fwxPWPJq1EXEZd0Xpd6B0jl_ViPFmJ87f5B08,2539
+openlit/instrumentation/ai21/ai21.py,sha256=zyQMfCLcOFG1tQWrZmGeMaVAmj8MtCUeXQtPHmlUAO0,6533
+openlit/instrumentation/ai21/async_ai21.py,sha256=q1Dhxru4tUJu0U1Px3PptNqrSGW0-VfRGcqkLKFR8vQ,6659
+openlit/instrumentation/ai21/utils.py,sha256=5zf69uw_TT8u-q-6R6rBeGm1bX0WpsbrAq-MTTZJ9Bk,14309
 openlit/instrumentation/anthropic/__init__.py,sha256=QEsiwdxcQDzzlVYR4_x7KTdf0-UJDJt8FjwNQMspnxM,1929
 openlit/instrumentation/anthropic/anthropic.py,sha256=NxJJjhsu9sSFIlBp322olGkPlLt9Bn5sndaugYA68dE,5149
 openlit/instrumentation/anthropic/async_anthropic.py,sha256=ivJGygKWVTS2hWWX12_g1tiq-5mpeHXETZsWoFZL3UE,5235
@@ -99,7 +99,7 @@ openlit/instrumentation/ollama/__init__.py,sha256=WxjqjuR8ovMU5dR08OELNqClbuM7ns
 openlit/instrumentation/ollama/async_ollama.py,sha256=ORXwem8lgSrhOcci55NkChIK9SNc3IYIpLjF_ogsGA8,6666
 openlit/instrumentation/ollama/ollama.py,sha256=8mvrWfU1c5h1L7lxWo47YBJ7g2u7QZmSZuuP0URtTDo,6538
 openlit/instrumentation/ollama/utils.py,sha256=TIE3_ur2U-iyCclna7TzwjDIFC9PZjRnZqNDV6NfG-0,11958
-openlit/instrumentation/openai/__init__.py,sha256=
+openlit/instrumentation/openai/__init__.py,sha256=KI3ncllea3VzK0lvBfZXBhs2EClSLe38WEPdIL4_SOo,6311
 openlit/instrumentation/openai/async_openai.py,sha256=JyA8MDxWCM38Te6mJzBdfonRgIIlo2ziLn7HOmzqxxo,81398
 openlit/instrumentation/openai/openai.py,sha256=5fgRyK5dUN2zUdrN0vBSZFnSEAXf2dKS0qnq_85-mQE,81175
 openlit/instrumentation/openai_agents/__init__.py,sha256=tRTSIrUtkXc_lfQnVanXmQLd2Sy9RqBNTHF5FhhZx7o,1530
@@ -131,14 +131,14 @@ openlit/instrumentation/transformers/utils.py,sha256=3f-ewpUpduaBrTVIFJKaabACjz-
 openlit/instrumentation/vertexai/__init__.py,sha256=mT28WCBvQfRCkAWGL6bd0EjEPHvMjaNcz6T3jsLZh8k,3745
 openlit/instrumentation/vertexai/async_vertexai.py,sha256=-kpg-eiL76O5_XopUPghCYwJHf0Nrxi00_Z5tCwq6zM,23086
 openlit/instrumentation/vertexai/vertexai.py,sha256=5NB090aWlm9DnlccNNLRO6A97P_RN-JnHb5JS01tYyw,23000
-openlit/instrumentation/vllm/__init__.py,sha256=
-openlit/instrumentation/vllm/utils.py,sha256=
-openlit/instrumentation/vllm/vllm.py,sha256=
+openlit/instrumentation/vllm/__init__.py,sha256=uaSzQmgDuKJ-sh61sfVdzVt2qAZaozZIQ8sbmQ0XpZE,1357
+openlit/instrumentation/vllm/utils.py,sha256=HuCPNBgChWg9vA7DHNFCij_y8qj27DjZxdZ0Nvdt2fg,5751
+openlit/instrumentation/vllm/vllm.py,sha256=VzazF2f4LLwjZDO_G8lIN_d622oSJM0fIO9wjxXbhyg,2004
 openlit/otel/events.py,sha256=VrMjTpvnLtYRBHCiFwJojTQqqNpRCxoD4yJYeQrtPsk,3560
 openlit/otel/metrics.py,sha256=GM2PDloBGRhBTkHHkYaqmOwIAQkY124ZhW4sEqW1Fgk,7086
 openlit/otel/tracing.py,sha256=tjV2bEbEDPUB1Z46gE-UsJsb04sRdFrfbhIDkxViZc0,3103
 openlit/semcov/__init__.py,sha256=ptyo37PY-FHDx_PShEvbdns71cD4YvvXw15bCRXKCKM,13461
-openlit-1.34.
-openlit-1.34.
-openlit-1.34.
-openlit-1.34.
+openlit-1.34.13.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+openlit-1.34.13.dist-info/METADATA,sha256=4uHfQSKnuT-yfoNz7kj78yd53TBFDCDYVhOIsz7XF8k,23470
+openlit-1.34.13.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
+openlit-1.34.13.dist-info/RECORD,,
```

{openlit-1.34.11.dist-info → openlit-1.34.13.dist-info}/LICENSE
File without changes

{openlit-1.34.11.dist-info → openlit-1.34.13.dist-info}/WHEEL
File without changes