openlit 1.33.8__py3-none-any.whl → 1.33.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openlit/__helpers.py +88 -0
- openlit/__init__.py +4 -3
- openlit/instrumentation/ag2/ag2.py +5 -5
- openlit/instrumentation/ai21/__init__.py +4 -4
- openlit/instrumentation/ai21/ai21.py +370 -319
- openlit/instrumentation/ai21/async_ai21.py +371 -319
- openlit/instrumentation/anthropic/__init__.py +4 -4
- openlit/instrumentation/anthropic/anthropic.py +321 -189
- openlit/instrumentation/anthropic/async_anthropic.py +323 -190
- openlit/instrumentation/assemblyai/__init__.py +1 -1
- openlit/instrumentation/assemblyai/assemblyai.py +59 -43
- openlit/instrumentation/astra/astra.py +9 -9
- openlit/instrumentation/astra/async_astra.py +9 -9
- openlit/instrumentation/azure_ai_inference/__init__.py +4 -4
- openlit/instrumentation/azure_ai_inference/async_azure_ai_inference.py +406 -252
- openlit/instrumentation/azure_ai_inference/azure_ai_inference.py +406 -252
- openlit/instrumentation/bedrock/__init__.py +1 -1
- openlit/instrumentation/bedrock/bedrock.py +115 -58
- openlit/instrumentation/chroma/chroma.py +9 -9
- openlit/instrumentation/cohere/__init__.py +33 -10
- openlit/instrumentation/cohere/async_cohere.py +610 -0
- openlit/instrumentation/cohere/cohere.py +410 -219
- openlit/instrumentation/controlflow/controlflow.py +5 -5
- openlit/instrumentation/crawl4ai/async_crawl4ai.py +5 -5
- openlit/instrumentation/crawl4ai/crawl4ai.py +5 -5
- openlit/instrumentation/crewai/crewai.py +6 -4
- openlit/instrumentation/dynamiq/dynamiq.py +5 -5
- openlit/instrumentation/elevenlabs/async_elevenlabs.py +71 -46
- openlit/instrumentation/elevenlabs/elevenlabs.py +71 -51
- openlit/instrumentation/embedchain/embedchain.py +9 -9
- openlit/instrumentation/firecrawl/firecrawl.py +5 -5
- openlit/instrumentation/google_ai_studio/__init__.py +9 -9
- openlit/instrumentation/google_ai_studio/async_google_ai_studio.py +183 -219
- openlit/instrumentation/google_ai_studio/google_ai_studio.py +183 -220
- openlit/instrumentation/gpt4all/__init__.py +2 -2
- openlit/instrumentation/gpt4all/gpt4all.py +345 -220
- openlit/instrumentation/gpu/__init__.py +5 -5
- openlit/instrumentation/groq/__init__.py +2 -2
- openlit/instrumentation/groq/async_groq.py +356 -240
- openlit/instrumentation/groq/groq.py +356 -240
- openlit/instrumentation/haystack/haystack.py +5 -5
- openlit/instrumentation/julep/async_julep.py +5 -5
- openlit/instrumentation/julep/julep.py +5 -5
- openlit/instrumentation/langchain/__init__.py +13 -7
- openlit/instrumentation/langchain/async_langchain.py +384 -0
- openlit/instrumentation/langchain/langchain.py +105 -492
- openlit/instrumentation/letta/letta.py +11 -9
- openlit/instrumentation/litellm/__init__.py +4 -5
- openlit/instrumentation/litellm/async_litellm.py +318 -247
- openlit/instrumentation/litellm/litellm.py +314 -243
- openlit/instrumentation/llamaindex/llamaindex.py +5 -5
- openlit/instrumentation/mem0/mem0.py +5 -5
- openlit/instrumentation/milvus/milvus.py +9 -9
- openlit/instrumentation/mistral/__init__.py +6 -6
- openlit/instrumentation/mistral/async_mistral.py +423 -250
- openlit/instrumentation/mistral/mistral.py +420 -246
- openlit/instrumentation/multion/async_multion.py +6 -4
- openlit/instrumentation/multion/multion.py +6 -4
- openlit/instrumentation/ollama/__init__.py +8 -30
- openlit/instrumentation/ollama/async_ollama.py +385 -417
- openlit/instrumentation/ollama/ollama.py +384 -417
- openlit/instrumentation/openai/__init__.py +11 -230
- openlit/instrumentation/openai/async_openai.py +433 -410
- openlit/instrumentation/openai/openai.py +414 -394
- openlit/instrumentation/phidata/phidata.py +6 -4
- openlit/instrumentation/pinecone/pinecone.py +9 -9
- openlit/instrumentation/premai/__init__.py +2 -2
- openlit/instrumentation/premai/premai.py +262 -213
- openlit/instrumentation/qdrant/async_qdrant.py +9 -9
- openlit/instrumentation/qdrant/qdrant.py +9 -9
- openlit/instrumentation/reka/__init__.py +2 -2
- openlit/instrumentation/reka/async_reka.py +90 -52
- openlit/instrumentation/reka/reka.py +90 -52
- openlit/instrumentation/together/__init__.py +4 -4
- openlit/instrumentation/together/async_together.py +278 -236
- openlit/instrumentation/together/together.py +278 -236
- openlit/instrumentation/transformers/__init__.py +1 -1
- openlit/instrumentation/transformers/transformers.py +76 -45
- openlit/instrumentation/vertexai/__init__.py +14 -64
- openlit/instrumentation/vertexai/async_vertexai.py +330 -987
- openlit/instrumentation/vertexai/vertexai.py +330 -987
- openlit/instrumentation/vllm/__init__.py +1 -1
- openlit/instrumentation/vllm/vllm.py +66 -36
- openlit/otel/metrics.py +98 -7
- openlit/semcov/__init__.py +113 -80
- {openlit-1.33.8.dist-info → openlit-1.33.10.dist-info}/METADATA +1 -1
- openlit-1.33.10.dist-info/RECORD +122 -0
- {openlit-1.33.8.dist-info → openlit-1.33.10.dist-info}/WHEEL +1 -1
- openlit/instrumentation/openai/async_azure_openai.py +0 -900
- openlit/instrumentation/openai/azure_openai.py +0 -898
- openlit-1.33.8.dist-info/RECORD +0 -122
- {openlit-1.33.8.dist-info → openlit-1.33.10.dist-info}/LICENSE +0 -0
openlit/instrumentation/ollama/async_ollama.py

@@ -1,28 +1,33 @@
-# pylint: disable=duplicate-code, broad-exception-caught, too-many-statements, unused-argument, possibly-used-before-assignment
 """
 Module for monitoring Ollama API calls.
 """

 import logging
+import time
 from opentelemetry.trace import SpanKind, Status, StatusCode
-from opentelemetry.sdk.resources import TELEMETRY_SDK_NAME
+from opentelemetry.sdk.resources import SERVICE_NAME, TELEMETRY_SDK_NAME, DEPLOYMENT_ENVIRONMENT
 from openlit.__helpers import (
+    get_chat_model_cost,
+    get_embed_model_cost,
     handle_exception,
+    response_as_dict,
     general_tokens,
-
-
+    calculate_ttft,
+    calculate_tbt,
+    create_metrics_attributes,
+    set_server_address_and_port
+)
 from openlit.semcov import SemanticConvetion

 # Initialize logger for logging potential issues and operations
 logger = logging.getLogger(__name__)

-def async_chat(
+def async_chat(version, environment, application_name,
                tracer, pricing_info, trace_content, metrics, disable_metrics):
     """
-    Generates a telemetry wrapper for chat to collect metrics.
+    Generates a telemetry wrapper for chat completions to collect metrics.

     Args:
-        gen_ai_endpoint: Endpoint identifier for logging and tracing.
         version: Version of the monitoring package.
         environment: Deployment environment (e.g., production, staging).
         application_name: Name of the application using the Ollama API.
@@ -31,464 +36,410 @@ def async_chat(gen_ai_endpoint, version, environment, application_name,
         trace_content: Flag indicating whether to trace the actual content.

     Returns:
-        A function that wraps the chat method to add telemetry.
+        A function that wraps the chat completions method to add telemetry.
     """

-
+    class TracedAsyncStream:
         """
-
-
-        This collects metrics such as execution time, cost, and token usage, and handles errors
-        gracefully, adding details to the trace for observability.
+        Wrapper for streaming responses to collect metrics and trace data.
+        Wraps the response to collect message IDs and aggregated response.

-
-
-            instance: The instance of the class where the original method is defined.
-            args: Positional arguments for the 'chat' method.
-            kwargs: Keyword arguments for the 'chat' method.
+        This class implements the '__aiter__' and '__anext__' methods that
+        handle asynchronous streaming responses.

-
-
+        This class also implements '__aenter__' and '__aexit__' methods that
+        handle asynchronous context management protocol.
         """
-[… 64 removed lines (old 53-116) not rendered in this view …]
-                    span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_IS_STREAM,
-                        True)
-                    span.set_attribute(SemanticConvetion.GEN_AI_USAGE_PROMPT_TOKENS,
-                        prompt_tokens)
-                    span.set_attribute(SemanticConvetion.GEN_AI_USAGE_COMPLETION_TOKENS,
-                        completion_tokens)
-                    span.set_attribute(SemanticConvetion.GEN_AI_USAGE_TOTAL_TOKENS,
-                        total_tokens)
-                    span.set_attribute(SemanticConvetion.GEN_AI_USAGE_COST,
-                        cost)
-                    if trace_content:
-                        span.add_event(
-                            name=SemanticConvetion.GEN_AI_CONTENT_PROMPT_EVENT,
-                            attributes={
-                                SemanticConvetion.GEN_AI_CONTENT_PROMPT: prompt,
-                            },
-                        )
-                        span.add_event(
-                            name=SemanticConvetion.GEN_AI_CONTENT_COMPLETION_EVENT,
-                            attributes={
-                                SemanticConvetion.GEN_AI_CONTENT_COMPLETION: llmresponse,
-                            },
-                        )
-
-                    span.set_status(Status(StatusCode.OK))
-
-                    if disable_metrics is False:
-                        attributes = {
-                            TELEMETRY_SDK_NAME:
-                                "openlit",
-                            SemanticConvetion.GEN_AI_APPLICATION_NAME:
-                                application_name,
-                            SemanticConvetion.GEN_AI_SYSTEM:
-                                SemanticConvetion.GEN_AI_SYSTEM_OLLAMA,
-                            SemanticConvetion.GEN_AI_ENVIRONMENT:
-                                environment,
-                            SemanticConvetion.GEN_AI_TYPE:
-                                SemanticConvetion.GEN_AI_TYPE_CHAT,
-                            SemanticConvetion.GEN_AI_REQUEST_MODEL:
-                                kwargs.get("model", "llama3")
-                        }
-
-                        metrics["genai_requests"].add(1, attributes)
-                        metrics["genai_total_tokens"].add(total_tokens, attributes)
-                        metrics["genai_completion_tokens"].add(completion_tokens, attributes)
-                        metrics["genai_prompt_tokens"].add(prompt_tokens, attributes)
-                        metrics["genai_cost"].record(cost, attributes)
-
-                except Exception as e:
-                    handle_exception(span, e)
-                    logger.error("Error in trace creation: %s", e)
-
-            return stream_generator()
-
-        # Handling for non-streaming responses
-        else:
-            # pylint: disable=line-too-long
-            with tracer.start_as_current_span(gen_ai_endpoint, kind= SpanKind.CLIENT) as span:
-                response = await wrapped(*args, **kwargs)
-
+        def __init__(
+                self,
+                wrapped,
+                span,
+                kwargs,
+                server_address,
+                server_port,
+                **args,
+            ):
+            self.__wrapped__ = wrapped
+            self._span = span
+            # Placeholder for aggregating streaming response
+            self._llmresponse = ""
+            self._response_model = ""
+            self._finish_reason = ""
+            self._input_tokens = 0
+            self._output_tokens = 0
+
+            self._args = args
+            self._kwargs = kwargs
+            self._start_time = time.time()
+            self._end_time = None
+            self._timestamps = []
+            self._ttft = 0
+            self._tbt = 0
+            self._server_address = server_address
+            self._server_port = server_port
+
+        async def __aenter__(self):
+            await self.__wrapped__.__aenter__()
+            return self
+
+        async def __aexit__(self, exc_type, exc_value, traceback):
+            await self.__wrapped__.__aexit__(exc_type, exc_value, traceback)
+
+        def __aiter__(self):
+            return self
+
+        async def __getattr__(self, name):
+            """Delegate attribute access to the wrapped object."""
+            return getattr(await self.__wrapped__, name)
+
+        async def __anext__(self):
+            try:
+                chunk = await self.__wrapped__.__anext__()
+                end_time = time.time()
+                # Record the timestamp for the current chunk
+                self._timestamps.append(end_time)
+
+                if len(self._timestamps) == 1:
+                    # Calculate time to first chunk
+                    self._ttft = calculate_ttft(self._timestamps, self._start_time)
+
+                chunked = response_as_dict(chunk)
+                self._llmresponse += chunked.get('message').get('content')
+
+                if chunked.get('eval_count'):
+                    self._input_tokens = chunked.get('prompt_eval_count')
+                    self._output_tokens = chunked.get('eval_count')
+                    self._response_model = chunked.get('model')
+                    self._finish_reason = chunked.get('done_reason')
+                return chunk
+            except StopAsyncIteration:
+                # Handling exception ensure observability without disrupting operation
                 try:
+                    self._end_time = time.time()
+                    if len(self._timestamps) > 1:
+                        self._tbt = calculate_tbt(self._timestamps)
+
                     # Format 'messages' into a single string
-                    message_prompt =
+                    message_prompt = self._kwargs.get("messages", "")
                     formatted_messages = []
                     for message in message_prompt:
                         role = message["role"]
                         content = message["content"]

                         if isinstance(content, list):
-
-
-
-
-
-
+                            content_str_list = []
+                            for item in content:
+                                if item["type"] == "text":
+                                    content_str_list.append(f'text: {item["text"]}')
+                                elif (item["type"] == "image_url" and
+                                      not item["image_url"]["url"].startswith("data:")):
+                                    content_str_list.append(f'image_url: {item["image_url"]["url"]}')
+                            content_str = ", ".join(content_str_list)
                             formatted_messages.append(f"{role}: {content_str}")
                         else:
                             formatted_messages.append(f"{role}: {content}")
                     prompt = "\n".join(formatted_messages)

-
-
-
+                    request_model = self._kwargs.get("model", "gpt-4o")
+
+                    # Calculate cost of the operation
+                    cost = get_chat_model_cost(request_model,
+                        pricing_info, self._input_tokens,
+                        self._output_tokens)
+
+                    # Set Span attributes (OTel Semconv)
+                    self._span.set_attribute(TELEMETRY_SDK_NAME, "openlit")
+                    self._span.set_attribute(SemanticConvetion.GEN_AI_OPERATION,
+                        SemanticConvetion.GEN_AI_OPERATION_TYPE_CHAT)
+                    self._span.set_attribute(SemanticConvetion.GEN_AI_SYSTEM,
                         SemanticConvetion.GEN_AI_SYSTEM_OLLAMA)
-
-
-
-
-
+                    self._span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_MODEL,
+                        request_model)
+                    self._span.set_attribute(SemanticConvetion.SERVER_PORT,
+                        self._server_port)
+
+                    # List of attributes and their config keys
+                    attributes = [
+                        (SemanticConvetion.GEN_AI_REQUEST_FREQUENCY_PENALTY, 'repeat_penalty'),
+                        (SemanticConvetion.GEN_AI_REQUEST_MAX_TOKENS, 'max_tokens'),
+                        (SemanticConvetion.GEN_AI_REQUEST_SEED, 'seed'),
+                        (SemanticConvetion.GEN_AI_REQUEST_STOP_SEQUENCES, 'stop'),
+                        (SemanticConvetion.GEN_AI_REQUEST_TEMPERATURE, 'temperature'),
+                        (SemanticConvetion.GEN_AI_REQUEST_TOP_P, 'top_p'),
+                        (SemanticConvetion.GEN_AI_REQUEST_TOP_K, 'top_k'),
+                    ]
+
+                    # Safely get the options dictionary from kwargs
+                    options = self._kwargs.get('options', {})
+
+                    # Set each attribute if the corresponding value exists and is not None
+                    for attribute, key in attributes:
+                        # Use dictionary `get` to retrieve values from the options dictionary
+                        value = options.get(key)
+                        if value is not None:
+                            self._span.set_attribute(attribute, value)
+
+                    self._span.set_attribute(SemanticConvetion.GEN_AI_RESPONSE_FINISH_REASON,
+                        [self._finish_reason])
+                    self._span.set_attribute(SemanticConvetion.GEN_AI_RESPONSE_MODEL,
+                        self._response_model)
+                    self._span.set_attribute(SemanticConvetion.GEN_AI_USAGE_INPUT_TOKENS,
+                        self._input_tokens)
+                    self._span.set_attribute(SemanticConvetion.GEN_AI_USAGE_OUTPUT_TOKENS,
+                        self._output_tokens)
+                    self._span.set_attribute(SemanticConvetion.SERVER_ADDRESS,
+                        self._server_address)
+                    if isinstance(self._llmresponse, str):
+                        self._span.set_attribute(SemanticConvetion.GEN_AI_OUTPUT_TYPE,
+                            "text")
+                    else:
+                        self._span.set_attribute(SemanticConvetion.GEN_AI_OUTPUT_TYPE,
+                            "json")
+
+                    # Set Span attributes (Extra)
+                    self._span.set_attribute(DEPLOYMENT_ENVIRONMENT,
                         environment)
-
+                    self._span.set_attribute(SERVICE_NAME,
                         application_name)
-
-
-
-
+                    self._span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_IS_STREAM,
+                        True)
+                    self._span.set_attribute(SemanticConvetion.GEN_AI_USAGE_TOTAL_TOKENS,
+                        self._input_tokens + self._output_tokens)
+                    self._span.set_attribute(SemanticConvetion.GEN_AI_USAGE_COST,
+                        cost)
+                    self._span.set_attribute(SemanticConvetion.GEN_AI_SERVER_TBT,
+                        self._tbt)
+                    self._span.set_attribute(SemanticConvetion.GEN_AI_SERVER_TTFT,
+                        self._ttft)
+                    self._span.set_attribute(SemanticConvetion.GEN_AI_SDK_VERSION,
+                        version)
                     if trace_content:
-
+                        self._span.add_event(
                             name=SemanticConvetion.GEN_AI_CONTENT_PROMPT_EVENT,
                             attributes={
                                 SemanticConvetion.GEN_AI_CONTENT_PROMPT: prompt,
                             },
                         )
-
+                        self._span.add_event(
                             name=SemanticConvetion.GEN_AI_CONTENT_COMPLETION_EVENT,
                             attributes={
-                                SemanticConvetion.GEN_AI_CONTENT_COMPLETION:
+                                SemanticConvetion.GEN_AI_CONTENT_COMPLETION: self._llmresponse,
                             },
                         )
-
-                    prompt_tokens = general_tokens(prompt)
-                    completion_tokens = response["eval_count"]
-                    total_tokens = prompt_tokens + completion_tokens
-                    # Calculate cost of the operation
-                    cost = get_chat_model_cost(kwargs.get("model", "llama3"),
-                        pricing_info, prompt_tokens, completion_tokens)
-
-                    span.set_attribute(SemanticConvetion.GEN_AI_USAGE_PROMPT_TOKENS,
-                        prompt_tokens)
-                    span.set_attribute(SemanticConvetion.GEN_AI_USAGE_COMPLETION_TOKENS,
-                        completion_tokens)
-                    span.set_attribute(SemanticConvetion.GEN_AI_USAGE_TOTAL_TOKENS,
-                        total_tokens)
-                    span.set_attribute(SemanticConvetion.GEN_AI_RESPONSE_FINISH_REASON,
-                        [response["done_reason"]])
-                    span.set_attribute(SemanticConvetion.GEN_AI_USAGE_COST,
-                        cost)
-
-                    span.set_status(Status(StatusCode.OK))
+                    self._span.set_status(Status(StatusCode.OK))

                     if disable_metrics is False:
-                        attributes =
-
-
-                            SemanticConvetion.
-
-
-
-
-
-
-                            SemanticConvetion.GEN_AI_TYPE_CHAT,
-                            SemanticConvetion.GEN_AI_REQUEST_MODEL:
-                                kwargs.get("model", "llama3")
-                        }
+                        attributes = create_metrics_attributes(
+                            service_name=application_name,
+                            deployment_environment=environment,
+                            operation=SemanticConvetion.GEN_AI_OPERATION_TYPE_CHAT,
+                            system=SemanticConvetion.GEN_AI_SYSTEM_OLLAMA,
+                            request_model=request_model,
+                            server_address=self._server_address,
+                            server_port=self._server_port,
+                            response_model=self._response_model,
+                        )

+                        metrics["genai_client_usage_tokens"].record(
+                            self._input_tokens + self._output_tokens, attributes
+                        )
+                        metrics["genai_client_operation_duration"].record(
+                            self._end_time - self._start_time, attributes
+                        )
+                        metrics["genai_server_tbt"].record(
+                            self._tbt, attributes
+                        )
+                        metrics["genai_server_ttft"].record(
+                            self._ttft, attributes
+                        )
                         metrics["genai_requests"].add(1, attributes)
-                        metrics["
-                        metrics["
-                        metrics["genai_prompt_tokens"].add(prompt_tokens, attributes)
+                        metrics["genai_completion_tokens"].add(self._output_tokens, attributes)
+                        metrics["genai_prompt_tokens"].add(self._input_tokens, attributes)
                         metrics["genai_cost"].record(cost, attributes)

-                    # Return original response
-                    return response
-
                 except Exception as e:
-                    handle_exception(
+                    handle_exception(self._span, e)
                     logger.error("Error in trace creation: %s", e)
-
-
-
-
-            return wrapper
-
-def async_generate(gen_ai_endpoint, version, environment, application_name,
-                   tracer, pricing_info, trace_content, metrics, disable_metrics):
-    """
-    Generates a telemetry wrapper for generate to collect metrics.
-
-    Args:
-        gen_ai_endpoint: Endpoint identifier for logging and tracing.
-        version: Version of the monitoring package.
-        environment: Deployment environment (e.g., production, staging).
-        application_name: Name of the application using the Ollama API.
-        tracer: OpenTelemetry tracer for creating spans.
-        pricing_info: Information used for calculating the cost of Ollama usage.
-        trace_content: Flag indicating whether to trace the actual content.
-
-    Returns:
-        A function that wraps the generate method to add telemetry.
-    """
+                finally:
+                    self._span.end()
+                raise

     async def wrapper(wrapped, instance, args, kwargs):
         """
-        Wraps the '
-
+        Wraps the 'chat.completions' API call to add telemetry.
+
         This collects metrics such as execution time, cost, and token usage, and handles errors
         gracefully, adding details to the trace for observability.

         Args:
-            wrapped: The original '
+            wrapped: The original 'chat.completions' method to be wrapped.
             instance: The instance of the class where the original method is defined.
-            args: Positional arguments for the '
-            kwargs: Keyword arguments for the '
+            args: Positional arguments for the 'chat.completions' method.
+            kwargs: Keyword arguments for the 'chat.completions' method.

         Returns:
-            The response from the original '
+            The response from the original 'chat.completions' method.
         """

         # Check if streaming is enabled for the API call
         streaming = kwargs.get("stream", False)
+        server_address, server_port = set_server_address_and_port(instance, "127.0.0.1", 11434)
+        request_model = kwargs.get("model", "gpt-4o")
+
+        span_name = f"{SemanticConvetion.GEN_AI_OPERATION_TYPE_CHAT} {request_model}"

         # pylint: disable=no-else-return
         if streaming:
             # Special handling for streaming response to accommodate the nature of data flow
-
-
-            # Placeholder for aggregating streaming response
-            llmresponse = ""
-
-            # Loop through streaming events capturing relevant details
-            async for chunk in await wrapped(*args, **kwargs):
-                # Collect aggregated response from events
-                content = chunk['response']
-                llmresponse += content
-
-                if chunk['done'] is True:
-                    completion_tokens = chunk["eval_count"]
-
-                yield chunk
-
-            # Handling exception ensure observability without disrupting operation
-            try:
-                prompt_tokens = general_tokens(kwargs.get("prompt", ""))
-                total_tokens = prompt_tokens + completion_tokens
-                # Calculate cost of the operation
-                cost = get_chat_model_cost(kwargs.get("model", "llama3"),
-                    pricing_info, prompt_tokens, completion_tokens)
-
-                # Set Span attributes
-                span.set_attribute(TELEMETRY_SDK_NAME, "openlit")
-                span.set_attribute(SemanticConvetion.GEN_AI_SYSTEM,
-                    SemanticConvetion.GEN_AI_SYSTEM_OLLAMA)
-                span.set_attribute(SemanticConvetion.GEN_AI_TYPE,
-                    SemanticConvetion.GEN_AI_TYPE_CHAT)
-                span.set_attribute(SemanticConvetion.GEN_AI_ENDPOINT,
-                    gen_ai_endpoint)
-                span.set_attribute(SemanticConvetion.GEN_AI_ENVIRONMENT,
-                    environment)
-                span.set_attribute(SemanticConvetion.GEN_AI_APPLICATION_NAME,
-                    application_name)
-                span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_MODEL,
-                    kwargs.get("model", "llama3"))
-                span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_IS_STREAM,
-                    True)
-                span.set_attribute(SemanticConvetion.GEN_AI_USAGE_PROMPT_TOKENS,
-                    prompt_tokens)
-                span.set_attribute(SemanticConvetion.GEN_AI_USAGE_COMPLETION_TOKENS,
-                    completion_tokens)
-                span.set_attribute(SemanticConvetion.GEN_AI_USAGE_TOTAL_TOKENS,
-                    total_tokens)
-                span.set_attribute(SemanticConvetion.GEN_AI_USAGE_COST,
-                    cost)
-                if trace_content:
-                    span.add_event(
-                        name=SemanticConvetion.GEN_AI_CONTENT_PROMPT_EVENT,
-                        attributes={
-                            # pylint: disable=line-too-long
-                            SemanticConvetion.GEN_AI_CONTENT_PROMPT: kwargs.get("prompt", ""),
-                        },
-                    )
-                    span.add_event(
-                        name=SemanticConvetion.GEN_AI_CONTENT_COMPLETION_EVENT,
-                        attributes={
-                            SemanticConvetion.GEN_AI_CONTENT_COMPLETION: llmresponse,
-                        },
-                    )
+            awaited_wrapped = await wrapped(*args, **kwargs)
+            span = tracer.start_span(span_name, kind=SpanKind.CLIENT)

-
-
-                if disable_metrics is False:
-                    attributes = {
-                        TELEMETRY_SDK_NAME:
-                            "openlit",
-                        SemanticConvetion.GEN_AI_APPLICATION_NAME:
-                            application_name,
-                        SemanticConvetion.GEN_AI_SYSTEM:
-                            SemanticConvetion.GEN_AI_SYSTEM_OLLAMA,
-                        SemanticConvetion.GEN_AI_ENVIRONMENT:
-                            environment,
-                        SemanticConvetion.GEN_AI_TYPE:
-                            SemanticConvetion.GEN_AI_TYPE_CHAT,
-                        SemanticConvetion.GEN_AI_REQUEST_MODEL:
-                            kwargs.get("model", "llama3")
-                    }
-
-                    metrics["genai_requests"].add(1, attributes)
-                    metrics["genai_total_tokens"].add(total_tokens, attributes)
-                    metrics["genai_completion_tokens"].add(completion_tokens, attributes)
-                    metrics["genai_prompt_tokens"].add(prompt_tokens, attributes)
-                    metrics["genai_cost"].record(cost, attributes)
-
-            except Exception as e:
-                handle_exception(span, e)
-                logger.error("Error in trace creation: %s", e)
-
-            return stream_generator()
+            return TracedAsyncStream(awaited_wrapped, span, kwargs, server_address, server_port)

         # Handling for non-streaming responses
         else:
-
-
+            with tracer.start_as_current_span(span_name, kind= SpanKind.CLIENT) as span:
+                start_time = time.time()
                 response = await wrapped(*args, **kwargs)
+                end_time = time.time()
+
+                response_dict = response_as_dict(response)

                 try:
-                    #
+                    # Format 'messages' into a single string
+                    message_prompt = kwargs.get("messages", "")
+                    formatted_messages = []
+                    for message in message_prompt:
+                        role = message["role"]
+                        content = message["content"]
+
+                        if isinstance(content, list):
+                            content_str = ", ".join(
+                                f'{item["type"]}: {item["text"] if "text" in item else item["image_url"]}'
+                                if "type" in item else f'text: {item["text"]}'
+                                for item in content
+                            )
+                            formatted_messages.append(f"{role}: {content_str}")
+                        else:
+                            formatted_messages.append(f"{role}: {content}")
+                    prompt = "\n".join(formatted_messages)
+
+                    input_tokens = response_dict.get('prompt_eval_count')
+                    output_tokens = response_dict.get('eval_count')
+
+                    # Calculate cost of the operation
+                    cost = get_chat_model_cost(request_model,
+                        pricing_info, input_tokens,
+                        output_tokens)
+
+                    # Set base span attribues (OTel Semconv)
                     span.set_attribute(TELEMETRY_SDK_NAME, "openlit")
+                    span.set_attribute(SemanticConvetion.GEN_AI_OPERATION,
+                        SemanticConvetion.GEN_AI_OPERATION_TYPE_CHAT)
                     span.set_attribute(SemanticConvetion.GEN_AI_SYSTEM,
                         SemanticConvetion.GEN_AI_SYSTEM_OLLAMA)
-                    span.set_attribute(SemanticConvetion.
-
-                    span.set_attribute(SemanticConvetion.
-
-
+                    span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_MODEL,
+                        request_model)
+                    span.set_attribute(SemanticConvetion.SERVER_PORT,
+                        server_port)
+
+                    # List of attributes and their config keys
+                    attributes = [
+                        (SemanticConvetion.GEN_AI_REQUEST_FREQUENCY_PENALTY, 'repeat_penalty'),
+                        (SemanticConvetion.GEN_AI_REQUEST_MAX_TOKENS, 'max_tokens'),
+                        (SemanticConvetion.GEN_AI_REQUEST_SEED, 'seed'),
+                        (SemanticConvetion.GEN_AI_REQUEST_STOP_SEQUENCES, 'stop'),
+                        (SemanticConvetion.GEN_AI_REQUEST_TEMPERATURE, 'temperature'),
+                        (SemanticConvetion.GEN_AI_REQUEST_TOP_P, 'top_p'),
+                        (SemanticConvetion.GEN_AI_REQUEST_TOP_K, 'top_k'),
+                    ]
+
+                    # Safely get the options dictionary from kwargs
+                    options = kwargs.get('options', {})
+
+                    # Set each attribute if the corresponding value exists and is not None
+                    for attribute, key in attributes:
+                        # Use dictionary `get` to retrieve values from the options dictionary
+                        value = options.get(key)
+                        if value is not None:
+                            span.set_attribute(attribute, value)
+
+                    span.set_attribute(SemanticConvetion.GEN_AI_RESPONSE_MODEL,
+                        response_dict.get('model'))
+                    span.set_attribute(SemanticConvetion.GEN_AI_USAGE_INPUT_TOKENS,
+                        input_tokens)
+                    span.set_attribute(SemanticConvetion.GEN_AI_USAGE_OUTPUT_TOKENS,
+                        output_tokens)
+                    span.set_attribute(SemanticConvetion.SERVER_ADDRESS,
+                        server_address)
+                    span.set_attribute(SemanticConvetion.GEN_AI_RESPONSE_FINISH_REASON,
+                        [response_dict.get('done_reason')])
+                    if kwargs.get('format'):
+                        span.set_attribute(SemanticConvetion.GEN_AI_OUTPUT_TYPE,
+                            'json')
+                    else:
+                        span.set_attribute(SemanticConvetion.GEN_AI_OUTPUT_TYPE,
+                            'text')
+
+                    # Set base span attribues (Extras)
+                    span.set_attribute(DEPLOYMENT_ENVIRONMENT,
                         environment)
-                    span.set_attribute(
+                    span.set_attribute(SERVICE_NAME,
                         application_name)
-                    span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_MODEL,
-                        kwargs.get("model", "llama3"))
                     span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_IS_STREAM,
                         False)
+                    span.set_attribute(SemanticConvetion.GEN_AI_USAGE_TOTAL_TOKENS,
+                        input_tokens + output_tokens)
+                    span.set_attribute(SemanticConvetion.GEN_AI_USAGE_COST,
+                        cost)
+                    span.set_attribute(SemanticConvetion.GEN_AI_SERVER_TTFT,
+                        end_time - start_time)
+                    span.set_attribute(SemanticConvetion.GEN_AI_SDK_VERSION,
+                        version)
                     if trace_content:
                         span.add_event(
                             name=SemanticConvetion.GEN_AI_CONTENT_PROMPT_EVENT,
                             attributes={
-                                SemanticConvetion.GEN_AI_CONTENT_PROMPT:
+                                SemanticConvetion.GEN_AI_CONTENT_PROMPT: prompt,
                             },
                         )
                         span.add_event(
                             name=SemanticConvetion.GEN_AI_CONTENT_COMPLETION_EVENT,
                             attributes={
-
+                                # pylint: disable=line-too-long
+                                SemanticConvetion.GEN_AI_CONTENT_COMPLETION: str(response_dict.get('message').get('content')),
                             },
                         )
-
-
-
-                    total_tokens = prompt_tokens + completion_tokens
-                    # Calculate cost of the operation
-                    cost = get_chat_model_cost(kwargs.get("model", "llama3"),
-                        pricing_info, prompt_tokens, completion_tokens)
-
-                    span.set_attribute(SemanticConvetion.GEN_AI_USAGE_PROMPT_TOKENS,
-                        prompt_tokens)
-                    span.set_attribute(SemanticConvetion.GEN_AI_USAGE_COMPLETION_TOKENS,
-                        completion_tokens)
-                    span.set_attribute(SemanticConvetion.GEN_AI_USAGE_TOTAL_TOKENS,
-                        total_tokens)
-                    span.set_attribute(SemanticConvetion.GEN_AI_RESPONSE_FINISH_REASON,
-                        [response["done_reason"]])
-                    span.set_attribute(SemanticConvetion.GEN_AI_USAGE_COST,
-                        cost)
+                    if kwargs.get('tools'):
+                        span.set_attribute(SemanticConvetion.GEN_AI_TOOL_CALLS,
+                            str(response_dict.get('message').get('tool_calls')))

                     span.set_status(Status(StatusCode.OK))

                     if disable_metrics is False:
-                        attributes =
-
-
-                            SemanticConvetion.
-
-
-
-
-
-
-                            SemanticConvetion.GEN_AI_TYPE_CHAT,
-                            SemanticConvetion.GEN_AI_REQUEST_MODEL:
-                                kwargs.get("model", "llama3")
-                        }
+                        attributes = create_metrics_attributes(
+                            service_name=application_name,
+                            deployment_environment=environment,
+                            operation=SemanticConvetion.GEN_AI_OPERATION_TYPE_CHAT,
+                            system=SemanticConvetion.GEN_AI_SYSTEM_OLLAMA,
+                            request_model=request_model,
+                            server_address=server_address,
+                            server_port=server_port,
+                            response_model=response_dict.get('model'),
+                        )

+                        metrics["genai_client_usage_tokens"].record(
+                            input_tokens + output_tokens, attributes
+                        )
+                        metrics["genai_client_operation_duration"].record(
+                            end_time - start_time, attributes
+                        )
+                        metrics["genai_server_ttft"].record(
+                            end_time - start_time, attributes
+                        )
                         metrics["genai_requests"].add(1, attributes)
-                        metrics["
-                        metrics["
-                        metrics["genai_prompt_tokens"].add(prompt_tokens, attributes)
+                        metrics["genai_completion_tokens"].add(output_tokens, attributes)
+                        metrics["genai_prompt_tokens"].add(input_tokens, attributes)
                         metrics["genai_cost"].record(cost, attributes)

                     # Return original response
@@ -503,13 +454,12 @@ def async_generate(gen_ai_endpoint, version, environment, application_name,

     return wrapper

-def async_embeddings(
-
+def async_embeddings(version, environment, application_name,
+                     tracer, pricing_info, trace_content, metrics, disable_metrics):
     """
     Generates a telemetry wrapper for embeddings to collect metrics.

     Args:
-        gen_ai_endpoint: Endpoint identifier for logging and tracing.
         version: Version of the monitoring package.
         environment: Deployment environment (e.g., production, staging).
         application_name: Name of the application using the Ollama API.
@@ -538,71 +488,89 @@ def async_embeddings(gen_ai_endpoint, version, environment, application_name,
             The response from the original 'embeddings' method.
         """

-
+        server_address, server_port = set_server_address_and_port(instance, '127.0.0.1', 11434)
+        request_model = kwargs.get('model', 'all-minilm')
+
+        span_name = f'{SemanticConvetion.GEN_AI_OPERATION_TYPE_EMBEDDING} {request_model}'
+
+        with tracer.start_as_current_span(span_name, kind= SpanKind.CLIENT) as span:
+            start_time = time.time()
             response = await wrapped(*args, **kwargs)
+            end_time = time.time()

             try:
-
+                input_tokens = general_tokens(str(kwargs.get('prompt')))
+
                 # Calculate cost of the operation
-                cost = get_embed_model_cost(
-
-
-
+                cost = get_embed_model_cost(request_model,
+                    pricing_info, input_tokens)
+
+                # Set Span attributes (OTel Semconv)
+                span.set_attribute(TELEMETRY_SDK_NAME, 'openlit')
+                span.set_attribute(SemanticConvetion.GEN_AI_OPERATION,
+                    SemanticConvetion.GEN_AI_OPERATION_TYPE_EMBEDDING)
                 span.set_attribute(SemanticConvetion.GEN_AI_SYSTEM,
                     SemanticConvetion.GEN_AI_SYSTEM_OLLAMA)
-                span.set_attribute(SemanticConvetion.
-
-                span.set_attribute(SemanticConvetion.
-
-                span.set_attribute(SemanticConvetion.
+                span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_MODEL,
+                    request_model)
+                span.set_attribute(SemanticConvetion.GEN_AI_RESPONSE_MODEL,
+                    request_model)
+                span.set_attribute(SemanticConvetion.SERVER_ADDRESS,
+                    server_address)
+                span.set_attribute(SemanticConvetion.SERVER_PORT,
+                    server_port)
+                span.set_attribute(SemanticConvetion.GEN_AI_USAGE_INPUT_TOKENS,
+                    input_tokens)
+
+                # Set Span attributes (Extras)
+                span.set_attribute(DEPLOYMENT_ENVIRONMENT,
                     environment)
-                span.set_attribute(
+                span.set_attribute(SERVICE_NAME,
                     application_name)
-                span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_MODEL,
-                    kwargs.get('model', "llama3"))
-                span.set_attribute(SemanticConvetion.GEN_AI_USAGE_PROMPT_TOKENS,
-                    prompt_tokens)
                 span.set_attribute(SemanticConvetion.GEN_AI_USAGE_TOTAL_TOKENS,
-
+                    input_tokens)
                 span.set_attribute(SemanticConvetion.GEN_AI_USAGE_COST,
                     cost)
+                span.set_attribute(SemanticConvetion.GEN_AI_SDK_VERSION,
+                    version)
+
                 if trace_content:
                     span.add_event(
                         name=SemanticConvetion.GEN_AI_CONTENT_PROMPT_EVENT,
                         attributes={
-                            SemanticConvetion.GEN_AI_CONTENT_PROMPT: kwargs.get(
+                            SemanticConvetion.GEN_AI_CONTENT_PROMPT: str(kwargs.get('prompt', '')),
                         },
                     )

                 span.set_status(Status(StatusCode.OK))

                 if disable_metrics is False:
-                    attributes =
-
-
-                        SemanticConvetion.
-
-
-
-
-
-
-
-
-
-
-
-
-                    metrics[
-                    metrics[
-                    metrics[
+                    attributes = create_metrics_attributes(
+                        service_name=application_name,
+                        deployment_environment=environment,
+                        operation=SemanticConvetion.GEN_AI_OPERATION_TYPE_EMBEDDING,
+                        system=SemanticConvetion.GEN_AI_SYSTEM_OLLAMA,
+                        request_model=request_model,
+                        server_address=server_address,
+                        server_port=server_port,
+                        response_model=request_model,
+                    )
+                    metrics['genai_client_usage_tokens'].record(
+                        input_tokens, attributes
+                    )
+                    metrics['genai_client_operation_duration'].record(
+                        end_time - start_time, attributes
+                    )
+                    metrics['genai_requests'].add(1, attributes)
+                    metrics['genai_prompt_tokens'].add(input_tokens, attributes)
+                    metrics['genai_cost'].record(cost, attributes)

                 # Return original response
                 return response

             except Exception as e:
                 handle_exception(span, e)
-                logger.error(
+                logger.error('Error in trace creation: %s', e)

         # Return original response
         return response