openlit 1.33.7__py3-none-any.whl → 1.33.9__py3-none-any.whl
This diff compares the contents of two publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in the public registry.
- openlit/__helpers.py +83 -0
- openlit/__init__.py +1 -1
- openlit/instrumentation/ag2/ag2.py +2 -2
- openlit/instrumentation/ai21/__init__.py +4 -4
- openlit/instrumentation/ai21/ai21.py +370 -319
- openlit/instrumentation/ai21/async_ai21.py +371 -319
- openlit/instrumentation/anthropic/__init__.py +4 -4
- openlit/instrumentation/anthropic/anthropic.py +321 -189
- openlit/instrumentation/anthropic/async_anthropic.py +323 -190
- openlit/instrumentation/assemblyai/__init__.py +1 -1
- openlit/instrumentation/assemblyai/assemblyai.py +59 -43
- openlit/instrumentation/astra/astra.py +4 -4
- openlit/instrumentation/astra/async_astra.py +4 -4
- openlit/instrumentation/azure_ai_inference/__init__.py +4 -4
- openlit/instrumentation/azure_ai_inference/async_azure_ai_inference.py +406 -252
- openlit/instrumentation/azure_ai_inference/azure_ai_inference.py +406 -252
- openlit/instrumentation/bedrock/__init__.py +1 -1
- openlit/instrumentation/bedrock/bedrock.py +115 -58
- openlit/instrumentation/chroma/chroma.py +4 -4
- openlit/instrumentation/cohere/__init__.py +33 -10
- openlit/instrumentation/cohere/async_cohere.py +610 -0
- openlit/instrumentation/cohere/cohere.py +410 -219
- openlit/instrumentation/controlflow/controlflow.py +2 -2
- openlit/instrumentation/crawl4ai/async_crawl4ai.py +2 -2
- openlit/instrumentation/crawl4ai/crawl4ai.py +2 -2
- openlit/instrumentation/crewai/crewai.py +2 -2
- openlit/instrumentation/dynamiq/dynamiq.py +2 -2
- openlit/instrumentation/elevenlabs/async_elevenlabs.py +73 -47
- openlit/instrumentation/elevenlabs/elevenlabs.py +73 -52
- openlit/instrumentation/embedchain/embedchain.py +4 -4
- openlit/instrumentation/firecrawl/firecrawl.py +2 -2
- openlit/instrumentation/google_ai_studio/__init__.py +9 -9
- openlit/instrumentation/google_ai_studio/async_google_ai_studio.py +183 -219
- openlit/instrumentation/google_ai_studio/google_ai_studio.py +183 -220
- openlit/instrumentation/gpt4all/gpt4all.py +17 -17
- openlit/instrumentation/groq/async_groq.py +14 -14
- openlit/instrumentation/groq/groq.py +14 -14
- openlit/instrumentation/haystack/haystack.py +2 -2
- openlit/instrumentation/julep/async_julep.py +2 -2
- openlit/instrumentation/julep/julep.py +2 -2
- openlit/instrumentation/langchain/langchain.py +36 -31
- openlit/instrumentation/letta/letta.py +6 -6
- openlit/instrumentation/litellm/async_litellm.py +20 -20
- openlit/instrumentation/litellm/litellm.py +20 -20
- openlit/instrumentation/llamaindex/llamaindex.py +2 -2
- openlit/instrumentation/mem0/mem0.py +2 -2
- openlit/instrumentation/milvus/milvus.py +4 -4
- openlit/instrumentation/mistral/async_mistral.py +18 -18
- openlit/instrumentation/mistral/mistral.py +18 -18
- openlit/instrumentation/multion/async_multion.py +2 -2
- openlit/instrumentation/multion/multion.py +2 -2
- openlit/instrumentation/ollama/async_ollama.py +29 -29
- openlit/instrumentation/ollama/ollama.py +29 -29
- openlit/instrumentation/openai/__init__.py +11 -230
- openlit/instrumentation/openai/async_openai.py +434 -409
- openlit/instrumentation/openai/openai.py +415 -393
- openlit/instrumentation/phidata/phidata.py +2 -2
- openlit/instrumentation/pinecone/pinecone.py +4 -4
- openlit/instrumentation/premai/premai.py +20 -20
- openlit/instrumentation/qdrant/async_qdrant.py +4 -4
- openlit/instrumentation/qdrant/qdrant.py +4 -4
- openlit/instrumentation/reka/async_reka.py +6 -6
- openlit/instrumentation/reka/reka.py +6 -6
- openlit/instrumentation/together/async_together.py +18 -18
- openlit/instrumentation/together/together.py +18 -18
- openlit/instrumentation/transformers/transformers.py +6 -6
- openlit/instrumentation/vertexai/async_vertexai.py +53 -53
- openlit/instrumentation/vertexai/vertexai.py +53 -53
- openlit/instrumentation/vllm/vllm.py +6 -6
- openlit/otel/metrics.py +98 -7
- openlit/semcov/__init__.py +113 -80
- {openlit-1.33.7.dist-info → openlit-1.33.9.dist-info}/METADATA +2 -1
- openlit-1.33.9.dist-info/RECORD +121 -0
- {openlit-1.33.7.dist-info → openlit-1.33.9.dist-info}/WHEEL +1 -1
- openlit/instrumentation/openai/async_azure_openai.py +0 -900
- openlit/instrumentation/openai/azure_openai.py +0 -898
- openlit-1.33.7.dist-info/RECORD +0 -122
- {openlit-1.33.7.dist-info → openlit-1.33.9.dist-info}/LICENSE +0 -0
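
The bulk of this release is a rewrite of the per-provider instrumentation toward the OpenTelemetry GenAI semantic conventions, shown below for openlit/instrumentation/cohere/cohere.py. As orientation, here is a minimal sketch of exercising the instrumented Cohere path; it assumes openlit's documented `openlit.init()` entry point and the `cohere` SDK's `ClientV2`, and the model name and OTLP endpoint are illustrative only:

# Minimal sketch: exercising the instrumented Cohere client path.
# Assumes openlit's documented `openlit.init()` entry point and the
# `cohere` SDK; the model name and OTLP endpoint are illustrative.
import openlit
import cohere

# Configures tracers/meters; spans and metrics export via OTLP.
openlit.init(otlp_endpoint="http://127.0.0.1:4318")

client = cohere.ClientV2()  # reads CO_API_KEY from the environment

# Per the diff below, the `chat` wrapper names its span
# "chat {request_model}" and records token usage, cost, and TTFT.
response = client.chat(
    model="command-r-plus-08-2024",
    messages=[{"role": "user", "content": "Hello!"}],
)
print(response.message.content[0].text)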
openlit/instrumentation/cohere/cohere.py +410 -219

@@ -1,29 +1,37 @@
-# pylint: disable=duplicate-code, broad-exception-caught, too-many-statements, unused-argument, possibly-used-before-assignment
 """
 Module for monitoring Cohere API calls.
 """

 import logging
+import time
 from opentelemetry.trace import SpanKind, Status, StatusCode
-from opentelemetry.sdk.resources import TELEMETRY_SDK_NAME
-from openlit.__helpers import
+from opentelemetry.sdk.resources import SERVICE_NAME, TELEMETRY_SDK_NAME, DEPLOYMENT_ENVIRONMENT
+from openlit.__helpers import (
+    get_chat_model_cost,
+    get_embed_model_cost,
+    handle_exception,
+    response_as_dict,
+    calculate_ttft,
+    calculate_tbt,
+    create_metrics_attributes,
+    set_server_address_and_port
+)
 from openlit.semcov import SemanticConvetion

 # Initialize logger for logging potential issues and operations
 logger = logging.getLogger(__name__)

-def embed(gen_ai_endpoint, version, environment, application_name, tracer,
+def embed(version, environment, application_name, tracer,
           pricing_info, trace_content, metrics, disable_metrics):
     """
     Generates a telemetry wrapper for embeddings to collect metrics.

     Args:
-        gen_ai_endpoint: Endpoint identifier for logging and tracing.
         version: Version of the monitoring package.
         environment: Deployment environment (e.g., production, staging).
-        application_name: Name of the application using the
+        application_name: Name of the application using the Cohere API.
         tracer: OpenTelemetry tracer for creating spans.
-        pricing_info: Information used for calculating the cost of
+        pricing_info: Information used for calculating the cost of Cohere usage.
         trace_content: Flag indicating whether to trace the actual content.

     Returns:
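
The signature change above (dropping `gen_ai_endpoint`) applies to all three factories in this file; spans are now named from the operation type and model rather than a fixed endpoint string. The factories follow the wrapt wrapper-factory pattern: each returns a closure with the `(wrapped, instance, args, kwargs)` signature that openlit's cohere `__init__.py` installs onto the client. A minimal sketch of that shape; `wrap_function_wrapper` is the real wrapt API, but the exact module and method path patched for Cohere is an assumption here:

# Sketch of the wrapper-factory pattern used by `embed`, `chat`, and
# `chat_stream` below; a simplified stand-in, not openlit's actual body.
from wrapt import wrap_function_wrapper

def embed(version, environment, application_name, tracer,
          pricing_info, trace_content, metrics, disable_metrics):
    """Factory: returns the closure that wrapt installs on the client."""
    def wrapper(wrapped, instance, args, kwargs):
        # `wrapped` is the original client method, `instance` the client;
        # telemetry is recorded around this call, as in the diff.
        response = wrapped(*args, **kwargs)
        return response
    return wrapper

# Hypothetical registration, mirroring openlit's instrumentor pattern
# (the module/method path is an assumption):
# wrap_function_wrapper("cohere.client_v2", "ClientV2.embed",
#                       embed(version, environment, application_name, tracer,
#                             pricing_info, trace_content, metrics,
#                             disable_metrics))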
@@ -47,80 +55,85 @@ def embed(gen_ai_endpoint, version, environment, application_name, tracer,
         The response from the original 'embed' method.
         """

-
-
+        server_address, server_port = set_server_address_and_port(instance, "api.cohere.com", 443)
+        request_model = kwargs.get("model", "mbed-english-v3.0")

-
-        # Get prompt from kwargs and store as a single string
-        prompt = " ".join(kwargs.get("texts", []))
+        span_name = f"{SemanticConvetion.GEN_AI_OPERATION_TYPE_EMBEDDING} {request_model}"

+        with tracer.start_as_current_span(span_name, kind= SpanKind.CLIENT) as span:
+            start_time = time.time()
+            response = wrapped(*args, **kwargs)
+            end_time = time.time()

+            response_dict = response_as_dict(response)
+            try:
+                input_tokens = response_dict.get('meta').get('billed_units').get('input_tokens')
                 # Calculate cost of the operation
                 cost = get_embed_model_cost(kwargs.get("model", "embed-english-v2.0"),
-                                            pricing_info,
-                                            response.meta.billed_units.input_tokens)
+                                            pricing_info, input_tokens)

-                # Set Span attributes
+                # Set Span attributes (OTel Semconv)
                 span.set_attribute(TELEMETRY_SDK_NAME, "openlit")
+                span.set_attribute(SemanticConvetion.GEN_AI_OPERATION,
+                                    SemanticConvetion.GEN_AI_OPERATION_TYPE_EMBEDDING)
                 span.set_attribute(SemanticConvetion.GEN_AI_SYSTEM,
                                     SemanticConvetion.GEN_AI_SYSTEM_COHERE)
-                span.set_attribute(SemanticConvetion.
-
-                span.set_attribute(SemanticConvetion.
-
-                span.set_attribute(SemanticConvetion.
+                span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_MODEL,
+                                    request_model)
+                span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_ENCODING_FORMATS,
+                                    kwargs.get('embedding_types', ['float']))
+                span.set_attribute(SemanticConvetion.GEN_AI_RESPONSE_MODEL,
+                                    request_model)
+                span.set_attribute(SemanticConvetion.SERVER_ADDRESS,
+                                    server_address)
+                span.set_attribute(SemanticConvetion.SERVER_PORT,
+                                    server_port)
+                span.set_attribute(SemanticConvetion.GEN_AI_USAGE_INPUT_TOKENS,
+                                    input_tokens)
+                span.set_attribute(SemanticConvetion.GEN_AI_OUTPUT_TYPE,
+                                    response_dict.get('response_type'))
+
+                # Set Span attributes (Extras)
+                span.set_attribute(DEPLOYMENT_ENVIRONMENT,
                                     environment)
-                span.set_attribute(
+                span.set_attribute(SERVICE_NAME,
                                     application_name)
-                span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_MODEL,
-                                    kwargs.get("model", "embed-english-v2.0"))
-                span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_EMBEDDING_FORMAT,
-                                    kwargs.get("embedding_types", "float"))
-                span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_EMBEDDING_DIMENSION,
-                                    kwargs.get("input_type", ""))
-                span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_USER,
-                                    kwargs.get("user", ""))
-                span.set_attribute(SemanticConvetion.GEN_AI_RESPONSE_ID,
-                                    response.id)
-                span.set_attribute(SemanticConvetion.GEN_AI_USAGE_PROMPT_TOKENS,
-                                    response.meta.billed_units.input_tokens)
                 span.set_attribute(SemanticConvetion.GEN_AI_USAGE_TOTAL_TOKENS,
-
+                                    input_tokens)
                 span.set_attribute(SemanticConvetion.GEN_AI_USAGE_COST,
                                     cost)
+                span.set_attribute(SemanticConvetion.GEN_AI_SDK_VERSION,
+                                    version)
+
                 if trace_content:
                     span.add_event(
                         name=SemanticConvetion.GEN_AI_CONTENT_PROMPT_EVENT,
                         attributes={
-                            SemanticConvetion.GEN_AI_CONTENT_PROMPT:
+                            SemanticConvetion.GEN_AI_CONTENT_PROMPT: str(kwargs.get("texts", "")),
                         },
                     )

                 span.set_status(Status(StatusCode.OK))

                 if disable_metrics is False:
-                    attributes =
-
-
-                        SemanticConvetion.
-
-
-
-
-
-
-                        SemanticConvetion.GEN_AI_TYPE:
-                            SemanticConvetion.GEN_AI_TYPE_EMBEDDING,
-                        SemanticConvetion.GEN_AI_REQUEST_MODEL:
-                            kwargs.get("model", "embed-english-v2.0")
-                    }
-
-                    metrics["genai_requests"].add(1, attributes)
-                    metrics["genai_total_tokens"].add(
-                        response.meta.billed_units.input_tokens, attributes
+                    attributes = create_metrics_attributes(
+                        service_name=application_name,
+                        deployment_environment=environment,
+                        operation=SemanticConvetion.GEN_AI_OPERATION_TYPE_EMBEDDING,
+                        system=SemanticConvetion.GEN_AI_SYSTEM_COHERE,
+                        request_model=request_model,
+                        server_address=server_address,
+                        server_port=server_port,
+                        response_model=request_model,
                     )
-                    metrics["
-
+                    metrics["genai_client_usage_tokens"].record(
+                        input_tokens, attributes
+                    )
+                    metrics["genai_client_operation_duration"].record(
+                        end_time - start_time, attributes
                     )
+                    metrics["genai_requests"].add(1, attributes)
+                    metrics["genai_prompt_tokens"].add(input_tokens, attributes)
                     metrics["genai_cost"].record(cost, attributes)

         # Return original response
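
Two helpers new to openlit/__helpers.py (extended by +83 lines per the file list) do much of the work above: `response_as_dict` normalizes the SDK response before the `billed_units` lookups, and `set_server_address_and_port` resolves the server address/port attributes. Their actual bodies are not part of this diff; the following are illustrative approximations inferred only from the call sites shown here:

# Illustrative approximations of the new helpers, inferred from their
# call sites above; the real bodies in openlit/__helpers.py may differ.
from urllib.parse import urlparse

def response_as_dict(response):
    # Best-effort conversion of an SDK response object to a plain dict.
    if isinstance(response, dict):
        return response
    if hasattr(response, "model_dump"):  # pydantic v2 models
        return response.model_dump()
    if hasattr(response, "dict"):  # pydantic v1 models
        return response.dict()
    return vars(response)

def set_server_address_and_port(instance, default_address, default_port):
    # Prefer the client's configured base URL, falling back to the
    # defaults seen at the call sites ("api.cohere.com", 443).
    base_url = getattr(instance, "base_url", None)  # attribute name assumed
    if not base_url:
        return default_address, default_port
    parsed = urlparse(str(base_url))
    return parsed.hostname or default_address, parsed.port or default_port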
@@ -135,18 +148,17 @@ def embed(gen_ai_endpoint, version, environment, application_name, tracer,

     return wrapper

-def chat(gen_ai_endpoint, version, environment, application_name, tracer,
+def chat(version, environment, application_name, tracer,
          pricing_info, trace_content, metrics, disable_metrics):
     """
     Generates a telemetry wrapper for chat to collect metrics.

     Args:
-        gen_ai_endpoint: Endpoint identifier for logging and tracing.
         version: Version of the monitoring package.
         environment: Deployment environment (e.g., production, staging).
-        application_name: Name of the application using the
+        application_name: Name of the application using the Cohere API.
         tracer: OpenTelemetry tracer for creating spans.
-        pricing_info: Information used for calculating the cost of
+        pricing_info: Information used for calculating the cost of Cohere usage.
         trace_content: Flag indicating whether to trace the actual content.

     Returns:
@@ -170,96 +182,145 @@ def chat(gen_ai_endpoint, version, environment, application_name, tracer,
         The response from the original 'chat' method.
         """

-
+        server_address, server_port = set_server_address_and_port(instance, "api.cohere.com", 443)
+        request_model = kwargs.get("model", "command-r-plus-08-2024")
+
+        span_name = f"{SemanticConvetion.GEN_AI_OPERATION_TYPE_CHAT} {request_model}"
+
+        with tracer.start_as_current_span(span_name, kind=SpanKind.CLIENT) as span:
+            start_time = time.time()
             response = wrapped(*args, **kwargs)
+            end_time = time.time()
+
+            response_dict = response_as_dict(response)

             try:
+                # Format 'messages' into a single string
+                message_prompt = kwargs.get("messages", "")
+                formatted_messages = []
+                for message in message_prompt:
+                    role = message["role"]
+                    content = message["content"]
+
+                    if isinstance(content, list):
+                        content_str = ", ".join(
+                            f'{item["type"]}: {item["text"] if "text" in item else item["image_url"]}'
+                            if "type" in item else f'text: {item["text"]}'
+                            for item in content
+                        )
+                        formatted_messages.append(f"{role}: {content_str}")
+                    else:
+                        formatted_messages.append(f"{role}: {content}")
+                prompt = "\n".join(formatted_messages)
+
+                input_tokens = response_dict.get('usage').get('billed_units').get('input_tokens')
+                output_tokens = response_dict.get('usage').get('billed_units').get('output_tokens')
+
                 # Calculate cost of the operation
-                cost = get_chat_model_cost(
-
-
-
+                cost = get_chat_model_cost(request_model, pricing_info,
+                                           input_tokens, output_tokens)
+
+                llm_response = response_dict.get('message').get('content')[0].get('text')

-
+                # Set base span attribues (OTel Semconv)
                 span.set_attribute(TELEMETRY_SDK_NAME, "openlit")
+                span.set_attribute(SemanticConvetion.GEN_AI_OPERATION,
+                                    SemanticConvetion.GEN_AI_OPERATION_TYPE_CHAT)
                 span.set_attribute(SemanticConvetion.GEN_AI_SYSTEM,
                                     SemanticConvetion.GEN_AI_SYSTEM_COHERE)
-                span.set_attribute(SemanticConvetion.GEN_AI_TYPE,
-                                    SemanticConvetion.GEN_AI_TYPE_CHAT)
-                span.set_attribute(SemanticConvetion.GEN_AI_ENDPOINT,
-                                    gen_ai_endpoint)
-                span.set_attribute(SemanticConvetion.GEN_AI_ENVIRONMENT,
-                                    environment)
-                span.set_attribute(SemanticConvetion.GEN_AI_APPLICATION_NAME,
-                                    application_name)
                 span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_MODEL,
-
-                span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_TEMPERATURE,
-                                    kwargs.get("temperature", 0.3))
-                span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_MAX_TOKENS,
-                                    kwargs.get("max_tokens", -1))
+                                    request_model)
                 span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_SEED,
                                     kwargs.get("seed", ""))
+                span.set_attribute(SemanticConvetion.SERVER_PORT,
+                                    server_port)
                 span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_FREQUENCY_PENALTY,
                                     kwargs.get("frequency_penalty", 0.0))
+                span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_MAX_TOKENS,
+                                    kwargs.get("max_tokens", -1))
                 span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_PRESENCE_PENALTY,
                                     kwargs.get("presence_penalty", 0.0))
+                span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_STOP_SEQUENCES,
+                                    kwargs.get("stop_sequences", []))
+                span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_TEMPERATURE,
+                                    kwargs.get("temperature", 0.3))
+                span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_TOP_K,
+                                    kwargs.get("k", 1.0))
+                span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_TOP_P,
+                                    kwargs.get("p", 1.0))
+                span.set_attribute(SemanticConvetion.GEN_AI_RESPONSE_ID,
+                                    response_dict.get("id"))
+                span.set_attribute(SemanticConvetion.GEN_AI_RESPONSE_MODEL,
+                                    request_model)
+                span.set_attribute(SemanticConvetion.GEN_AI_USAGE_INPUT_TOKENS,
+                                    input_tokens)
+                span.set_attribute(SemanticConvetion.GEN_AI_USAGE_OUTPUT_TOKENS,
+                                    output_tokens)
+                span.set_attribute(SemanticConvetion.SERVER_ADDRESS,
+                                    server_address)
+                if isinstance(llm_response, str):
+                    span.set_attribute(SemanticConvetion.GEN_AI_OUTPUT_TYPE,
+                                        "text")
+                else:
+                    span.set_attribute(SemanticConvetion.GEN_AI_OUTPUT_TYPE,
+                                        "json")
+
+                # Set base span attribues (Extras)
+                span.set_attribute(DEPLOYMENT_ENVIRONMENT,
+                                    environment)
+                span.set_attribute(SERVICE_NAME,
+                                    application_name)
                 span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_IS_STREAM,
                                     False)
-                span.set_attribute(SemanticConvetion.GEN_AI_RESPONSE_ID,
-                                    response.generation_id)
-                span.set_attribute(SemanticConvetion.GEN_AI_RESPONSE_FINISH_REASON,
-                                    [response.finish_reason])
-                span.set_attribute(SemanticConvetion.GEN_AI_USAGE_PROMPT_TOKENS,
-                                    response.meta.billed_units.input_tokens)
-                span.set_attribute(SemanticConvetion.GEN_AI_USAGE_COMPLETION_TOKENS,
-                                    response.meta.billed_units.output_tokens)
                 span.set_attribute(SemanticConvetion.GEN_AI_USAGE_TOTAL_TOKENS,
-
-                                    response.meta.billed_units.output_tokens)
+                                    input_tokens + output_tokens)
                 span.set_attribute(SemanticConvetion.GEN_AI_USAGE_COST,
                                     cost)
+                span.set_attribute(SemanticConvetion.GEN_AI_SERVER_TTFT,
+                                    end_time - start_time)
+                span.set_attribute(SemanticConvetion.GEN_AI_SDK_VERSION,
+                                    version)

                 if trace_content:
                     span.add_event(
                         name=SemanticConvetion.GEN_AI_CONTENT_PROMPT_EVENT,
                         attributes={
-                            SemanticConvetion.GEN_AI_CONTENT_PROMPT:
+                            SemanticConvetion.GEN_AI_CONTENT_PROMPT: prompt,
                         },
                     )
                     span.add_event(
                         name=SemanticConvetion.GEN_AI_CONTENT_COMPLETION_EVENT,
                         attributes={
-                            SemanticConvetion.GEN_AI_CONTENT_COMPLETION:
+                            SemanticConvetion.GEN_AI_CONTENT_COMPLETION: llm_response,
                         },
                     )

                 span.set_status(Status(StatusCode.OK))

                 if disable_metrics is False:
-                    attributes =
-
-
-                        SemanticConvetion.
-
-
-
-
-
-
-                            SemanticConvetion.GEN_AI_TYPE_CHAT,
-                        SemanticConvetion.GEN_AI_REQUEST_MODEL:
-                            kwargs.get("model", "command")
-                    }
+                    attributes = create_metrics_attributes(
+                        service_name=application_name,
+                        deployment_environment=environment,
+                        operation=SemanticConvetion.GEN_AI_OPERATION_TYPE_CHAT,
+                        system=SemanticConvetion.GEN_AI_SYSTEM_COHERE,
+                        request_model=request_model,
+                        server_address=server_address,
+                        server_port=server_port,
+                        response_model=request_model,
+                    )

+                    metrics["genai_client_usage_tokens"].record(
+                        input_tokens + output_tokens, attributes
+                    )
+                    metrics["genai_client_operation_duration"].record(
+                        end_time - start_time, attributes
+                    )
+                    metrics["genai_server_ttft"].record(
+                        end_time - start_time, attributes
+                    )
                     metrics["genai_requests"].add(1, attributes)
-                    metrics["
-
-                        response.meta.billed_units.output_tokens, attributes)
-                    metrics["genai_completion_tokens"].add(
-                        response.meta.billed_units.output_tokens, attributes)
-                    metrics["genai_prompt_tokens"].add(
-                        response.meta.billed_units.input_tokens, attributes)
+                    metrics["genai_completion_tokens"].add(output_tokens, attributes)
+                    metrics["genai_prompt_tokens"].add(input_tokens, attributes)
                     metrics["genai_cost"].record(cost, attributes)

         # Return original response
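
The message-flattening loop added to `chat` reduces the `messages` kwarg to a single newline-joined string for the prompt event. A worked example of that exact logic on a small multimodal input (the sample messages are invented for illustration):

# Worked example of the message-flattening loop added in `chat` above.
messages = [
    {"role": "system", "content": "You are terse."},
    {"role": "user", "content": [
        {"type": "text", "text": "Describe this"},
        {"type": "image_url", "image_url": "https://example.com/cat.png"},
    ]},
]

formatted_messages = []
for message in messages:
    role = message["role"]
    content = message["content"]
    if isinstance(content, list):
        # Multimodal content: render each item as "type: value".
        content_str = ", ".join(
            f'{item["type"]}: {item["text"] if "text" in item else item["image_url"]}'
            if "type" in item else f'text: {item["text"]}'
            for item in content
        )
        formatted_messages.append(f"{role}: {content_str}")
    else:
        formatted_messages.append(f"{role}: {content}")

print("\n".join(formatted_messages))
# system: You are terse.
# user: text: Describe this, image_url: https://example.com/cat.png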
@@ -274,18 +335,17 @@ def chat(gen_ai_endpoint, version, environment, application_name, tracer,

     return wrapper

-def chat_stream(gen_ai_endpoint, version, environment, application_name,
+def chat_stream(version, environment, application_name,
                 tracer, pricing_info, trace_content, metrics, disable_metrics):
     """
     Generates a telemetry wrapper for chat_stream to collect metrics.

     Args:
-        gen_ai_endpoint: Endpoint identifier for logging and tracing.
         version: Version of the monitoring package.
         environment: Deployment environment (e.g., production, staging).
-        application_name: Name of the application using the
+        application_name: Name of the application using the Cohere API.
         tracer: OpenTelemetry tracer for creating spans.
-        pricing_info: Information used for calculating the cost of
+        pricing_info: Information used for calculating the cost of Cohere usage.
         trace_content: Flag indicating whether to trace the actual content.

     Returns:
@@ -309,111 +369,242 @@ def chat_stream(gen_ai_endpoint, version, environment, application_name,
         The response from the original 'chat_stream' method.
         """

-
-
+        class TracedSyncStream:
+            """
+            Wrapper for streaming responses to collect metrics and trace data.
+            Wraps the 'cohere.AsyncStream' response to collect message IDs and aggregated response.
+
+            This class implements the '__aiter__' and '__anext__' methods that
+            handle asynchronous streaming responses.
+
+            This class also implements '__aenter__' and '__aexit__' methods that
+            handle asynchronous context management protocol.
+            """
+            def __init__(
+                    self,
+                    wrapped,
+                    span,
+                    kwargs,
+                    server_address,
+                    server_port,
+                    **args,
+                ):
+                self.__wrapped__ = wrapped
+                self._span = span
                 # Placeholder for aggregating streaming response
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                self._llmresponse = ""
+                self._response_id = ""
+                self._finish_reason = ""
+                self._input_tokens = ""
+                self._output_tokens = ""
+
+                self._args = args
+                self._kwargs = kwargs
+                self._start_time = time.time()
+                self._end_time = None
+                self._timestamps = []
+                self._ttft = 0
+                self._tbt = 0
+                self._server_address = server_address
+                self._server_port = server_port
+
+            def __enter__(self):
+                self.__wrapped__.__enter__()
+                return self
+
+            def __exit__(self, exc_type, exc_value, traceback):
+                self.__wrapped__.__exit__(exc_type, exc_value, traceback)
+
+            def __iter__(self):
+                return self
+
+            def __getattr__(self, name):
+                """Delegate attribute access to the wrapped object."""
+                return getattr(self.__wrapped__, name)
+
+            def __next__(self):
                 try:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                    chunk = self.__wrapped__.__next__()
+                    end_time = time.time()
+                    # Record the timestamp for the current chunk
+                    self._timestamps.append(end_time)
+
+                    if len(self._timestamps) == 1:
+                        # Calculate time to first chunk
+                        self._ttft = calculate_ttft(self._timestamps, self._start_time)
+
+                    chunked = response_as_dict(chunk)
+
+                    if chunked.get('type') == 'message-start':
+                        self._response_id = chunked.get('id')
+
+                    if chunked.get('type') == 'content-delta':
+                        content = chunked.get('delta').get('message').get('text')
+                        if content:
+                            self._llmresponse += content
+
+                    if chunked.get('type') == 'message-end':
+                        self._finish_reason = chunked.get('delta').get('finish_reason')
+                        self._input_tokens = chunked.get('delta').get('usage').get('billed_units').get('input_tokens')
+                        self._output_tokens = chunked.get('delta').get('usage').get('billed_units').get('output_tokens')
+
+                    return chunk
+                except StopIteration:
+                    # Handling exception ensure observability without disrupting operation
+                    try:
+                        self._end_time = time.time()
+                        if len(self._timestamps) > 1:
+                            self._tbt = calculate_tbt(self._timestamps)
+
+                        # Format 'messages' into a single string
+                        message_prompt = self._kwargs.get("messages", "")
+                        formatted_messages = []
+                        for message in message_prompt:
+                            role = message["role"]
+                            content = message["content"]
+
+                            if isinstance(content, list):
+                                content_str_list = []
+                                for item in content:
+                                    if item["type"] == "text":
+                                        content_str_list.append(f'text: {item["text"]}')
+                                    elif (item["type"] == "image_url" and
+                                          not item["image_url"]["url"].startswith("data:")):
+                                        content_str_list.append(f'image_url: {item["image_url"]["url"]}')
+                                content_str = ", ".join(content_str_list)
+                                formatted_messages.append(f"{role}: {content_str}")
+                            else:
+                                formatted_messages.append(f"{role}: {content}")
+                        prompt = "\n".join(formatted_messages)
+
+                        request_model = self._kwargs.get("model", "command-r-plus")
+
+                        # Calculate cost of the operation
+                        cost = get_chat_model_cost(request_model,
+                                                   pricing_info, self._input_tokens,
+                                                   self._output_tokens)
+
+                        # Set Span attributes (OTel Semconv)
+                        self._span.set_attribute(TELEMETRY_SDK_NAME, "openlit")
+                        self._span.set_attribute(SemanticConvetion.GEN_AI_OPERATION,
+                                                 SemanticConvetion.GEN_AI_OPERATION_TYPE_CHAT)
+                        self._span.set_attribute(SemanticConvetion.GEN_AI_SYSTEM,
+                                                 SemanticConvetion.GEN_AI_SYSTEM_COHERE)
+                        self._span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_MODEL,
+                                                 request_model)
+                        self._span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_SEED,
+                                                 self._kwargs.get("seed", ""))
+                        self._span.set_attribute(SemanticConvetion.SERVER_PORT,
+                                                 self._server_port)
+                        self._span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_FREQUENCY_PENALTY,
+                                                 self._kwargs.get("frequency_penalty", 0.0))
+                        self._span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_MAX_TOKENS,
+                                                 self._kwargs.get("max_tokens", -1))
+                        self._span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_PRESENCE_PENALTY,
+                                                 self._kwargs.get("presence_penalty", 0.0))
+                        self._span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_STOP_SEQUENCES,
+                                                 self._kwargs.get("stop_sequences", []))
+                        self._span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_TEMPERATURE,
+                                                 self._kwargs.get("temperature", 0.3))
+                        self._span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_TOP_K,
+                                                 self._kwargs.get("k", 1.0))
+                        self._span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_TOP_P,
+                                                 self._kwargs.get("p", 1.0))
+                        self._span.set_attribute(SemanticConvetion.GEN_AI_RESPONSE_FINISH_REASON,
+                                                 [self._finish_reason])
+                        self._span.set_attribute(SemanticConvetion.GEN_AI_RESPONSE_ID,
+                                                 self._response_id)
+                        self._span.set_attribute(SemanticConvetion.GEN_AI_RESPONSE_MODEL,
+                                                 request_model)
+                        self._span.set_attribute(SemanticConvetion.GEN_AI_USAGE_INPUT_TOKENS,
+                                                 self._input_tokens)
+                        self._span.set_attribute(SemanticConvetion.GEN_AI_USAGE_OUTPUT_TOKENS,
+                                                 self._output_tokens)
+                        self._span.set_attribute(SemanticConvetion.SERVER_ADDRESS,
+                                                 self._server_address)
+
+                        if isinstance(self._llmresponse, str):
+                            self._span.set_attribute(SemanticConvetion.GEN_AI_OUTPUT_TYPE,
+                                                     "text")
+                        else:
+                            self._span.set_attribute(SemanticConvetion.GEN_AI_OUTPUT_TYPE,
+                                                     "json")
+
+                        # Set Span attributes (Extra)
+                        self._span.set_attribute(DEPLOYMENT_ENVIRONMENT,
+                                                 environment)
+                        self._span.set_attribute(SERVICE_NAME,
+                                                 application_name)
+                        self._span.set_attribute(SemanticConvetion.GEN_AI_REQUEST_IS_STREAM,
+                                                 True)
+                        self._span.set_attribute(SemanticConvetion.GEN_AI_USAGE_TOTAL_TOKENS,
+                                                 self._input_tokens + self._output_tokens)
+                        self._span.set_attribute(SemanticConvetion.GEN_AI_USAGE_COST,
+                                                 cost)
+                        self._span.set_attribute(SemanticConvetion.GEN_AI_SERVER_TBT,
+                                                 self._tbt)
+                        self._span.set_attribute(SemanticConvetion.GEN_AI_SERVER_TTFT,
+                                                 self._ttft)
+                        self._span.set_attribute(SemanticConvetion.GEN_AI_SDK_VERSION,
+                                                 version)
+                        if trace_content:
+                            self._span.add_event(
+                                name=SemanticConvetion.GEN_AI_CONTENT_PROMPT_EVENT,
+                                attributes={
+                                    SemanticConvetion.GEN_AI_CONTENT_PROMPT: prompt,
+                                },
+                            )
+                            self._span.add_event(
+                                name=SemanticConvetion.GEN_AI_CONTENT_COMPLETION_EVENT,
+                                attributes={
+                                    SemanticConvetion.GEN_AI_CONTENT_COMPLETION: self._llmresponse,
+                                },
+                            )
+                        self._span.set_status(Status(StatusCode.OK))
+
+                        if disable_metrics is False:
+                            attributes = create_metrics_attributes(
+                                service_name=application_name,
+                                deployment_environment=environment,
+                                operation=SemanticConvetion.GEN_AI_OPERATION_TYPE_CHAT,
+                                system=SemanticConvetion.GEN_AI_SYSTEM_COHERE,
+                                request_model=request_model,
+                                server_address=self._server_address,
+                                server_port=self._server_port,
+                                response_model=request_model,
+                            )
+
+                            metrics["genai_client_usage_tokens"].record(
+                                self._input_tokens + self._output_tokens, attributes
+                            )
+                            metrics["genai_client_operation_duration"].record(
+                                self._end_time - self._start_time, attributes
+                            )
+                            metrics["genai_server_tbt"].record(
+                                self._tbt, attributes
+                            )
+                            metrics["genai_server_ttft"].record(
+                                self._ttft, attributes
+                            )
+                            metrics["genai_requests"].add(1, attributes)
+                            metrics["genai_completion_tokens"].add(self._output_tokens, attributes)
+                            metrics["genai_prompt_tokens"].add(self._input_tokens, attributes)
+                            metrics["genai_cost"].record(cost, attributes)
+
+                    except Exception as e:
+                        handle_exception(self._span, e)
+                        logger.error("Error in trace creation: %s", e)
+                    finally:
+                        self._span.end()
+                    raise
+
+        server_address, server_port = set_server_address_and_port(instance, "api.cohere.com", 443)
+        request_model = kwargs.get("model", "command-r-plus")
+
+        span_name = f"{SemanticConvetion.GEN_AI_OPERATION_TYPE_CHAT} {request_model}"
+
+        awaited_wrapped = wrapped(*args, **kwargs)
+        span = tracer.start_span(span_name, kind=SpanKind.CLIENT)
+        return TracedSyncStream(awaited_wrapped, span, kwargs, server_address, server_port)

     return wrapper
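
`TracedSyncStream` feeds per-chunk timestamps into `calculate_ttft` and `calculate_tbt`, two more helpers imported from `openlit.__helpers` whose bodies are outside this diff. Consistent with the call sites (`calculate_ttft(timestamps, start_time)` once the first chunk arrives, `calculate_tbt(timestamps)` at stream end), plausible shapes are:

# Plausible shapes for the timing helpers used by TracedSyncStream above;
# inferred from their call sites, not copied from openlit/__helpers.py.
def calculate_ttft(timestamps, start_time):
    # Time to first token: delay from request start to the first chunk.
    return timestamps[0] - start_time if timestamps else 0

def calculate_tbt(timestamps):
    # Average time between tokens across consecutive chunk arrivals.
    if len(timestamps) < 2:
        return 0
    gaps = [b - a for a, b in zip(timestamps, timestamps[1:])]
    return sum(gaps) / len(gaps)

Recording TTFT as soon as the first chunk arrives and TBT only when more than one chunk was seen matches the `len(self._timestamps)` guards in the wrapper above.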