genai-otel-instrument 0.1.1.dev0__py3-none-any.whl → 0.1.4.dev0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of genai-otel-instrument might be problematic.
- genai_otel/__version__.py +2 -2
- genai_otel/auto_instrument.py +18 -1
- genai_otel/config.py +3 -0
- genai_otel/cost_calculator.py +136 -11
- genai_otel/cost_enrichment_processor.py +177 -0
- genai_otel/gpu_metrics.py +34 -0
- genai_otel/instrumentors/base.py +74 -42
- genai_otel/instrumentors/cohere_instrumentor.py +80 -16
- genai_otel/instrumentors/huggingface_instrumentor.py +138 -13
- genai_otel/instrumentors/mistralai_instrumentor.py +249 -37
- genai_otel/instrumentors/ollama_instrumentor.py +104 -35
- genai_otel/instrumentors/replicate_instrumentor.py +59 -14
- genai_otel/instrumentors/togetherai_instrumentor.py +120 -16
- genai_otel/instrumentors/vertexai_instrumentor.py +79 -15
- genai_otel/llm_pricing.json +866 -586
- {genai_otel_instrument-0.1.1.dev0.dist-info → genai_otel_instrument-0.1.4.dev0.dist-info}/METADATA +230 -10
- {genai_otel_instrument-0.1.1.dev0.dist-info → genai_otel_instrument-0.1.4.dev0.dist-info}/RECORD +21 -20
- {genai_otel_instrument-0.1.1.dev0.dist-info → genai_otel_instrument-0.1.4.dev0.dist-info}/WHEEL +0 -0
- {genai_otel_instrument-0.1.1.dev0.dist-info → genai_otel_instrument-0.1.4.dev0.dist-info}/entry_points.txt +0 -0
- {genai_otel_instrument-0.1.1.dev0.dist-info → genai_otel_instrument-0.1.4.dev0.dist-info}/licenses/LICENSE +0 -0
- {genai_otel_instrument-0.1.1.dev0.dist-info → genai_otel_instrument-0.1.4.dev0.dist-info}/top_level.txt +0 -0
genai_otel/__version__.py
CHANGED
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
 commit_id: COMMIT_ID
 __commit_id__: COMMIT_ID
 
-__version__ = version = '0.1.1.dev0'
-__version_tuple__ = version_tuple = (0, 1, 1, 'dev0')
+__version__ = version = '0.1.4.dev0'
+__version_tuple__ = version_tuple = (0, 1, 4, 'dev0')
 
 __commit_id__ = commit_id = None
genai_otel/auto_instrument.py
CHANGED
@@ -17,6 +17,8 @@ from opentelemetry.sdk.trace import TracerProvider
 from opentelemetry.sdk.trace.export import BatchSpanProcessor, ConsoleSpanExporter
 
 from .config import OTelConfig
+from .cost_calculator import CostCalculator
+from .cost_enrichment_processor import CostEnrichmentSpanProcessor
 from .gpu_metrics import GPUMetricsCollector
 from .mcp_instrumentors import MCPInstrumentorManager
 from .metrics import (
@@ -117,12 +119,16 @@ INSTRUMENTORS = {
 }
 
 # Add OpenInference instrumentors if available (requires Python >= 3.10)
+# IMPORTANT: Order matters! Load in this specific sequence:
+# 1. smolagents - instruments the agent framework
+# 2. litellm - instruments LLM calls made by agents
+# 3. mcp - instruments Model Context Protocol tools
 if OPENINFERENCE_AVAILABLE:
     INSTRUMENTORS.update(
         {
             "smolagents": SmolagentsInstrumentor,
-            "mcp": MCPInstrumentor,
             "litellm": LiteLLMInstrumentor,
+            "mcp": MCPInstrumentor,
         }
     )
 
@@ -163,6 +169,17 @@ def setup_auto_instrumentation(config: OTelConfig):
 
     set_global_textmap(TraceContextTextMapPropagator())
 
+    # Add cost enrichment processor for OpenInference instrumentors
+    # This enriches spans from smolagents, litellm, mcp with cost attributes
+    if config.enable_cost_tracking:
+        try:
+            cost_calculator = CostCalculator()
+            cost_processor = CostEnrichmentSpanProcessor(cost_calculator)
+            tracer_provider.add_span_processor(cost_processor)
+            logger.info("Cost enrichment processor added for OpenInference instrumentors")
+        except Exception as e:
+            logger.warning(f"Failed to add cost enrichment processor: {e}", exc_info=True)
+
     logger.debug(f"OTelConfig endpoint: {config.endpoint}")
     if config.endpoint:
         # Convert timeout to float safely
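Taken together, these hunks make cost tracking a config switch. A minimal usage sketch (only setup_auto_instrumentation(config: OTelConfig) and the enable_cost_tracking flag are visible in this diff; the constructor keyword is an assumption):

# Hypothetical sketch: enabling cost tracking turns on the
# CostEnrichmentSpanProcessor registration shown in the hunk above.
from genai_otel.config import OTelConfig
from genai_otel.auto_instrument import setup_auto_instrumentation

config = OTelConfig(enable_cost_tracking=True)  # assumed kwarg; flag name is from the hunk
setup_auto_instrumentation(config)  # registers the cost processor when the flag is set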
genai_otel/config.py
CHANGED
@@ -44,6 +44,9 @@ DEFAULT_INSTRUMENTORS = [
 ]
 
 # Add OpenInference instrumentors only for Python >= 3.10
+# IMPORTANT: Order matters! Load in this specific sequence:
+# 1. smolagents - instruments the agent framework
+# 2. litellm - instruments the LLM calls made by agents
 if sys.version_info >= (3, 10):
     DEFAULT_INSTRUMENTORS.extend(["smolagents", "litellm"])
 
genai_otel/cost_calculator.py
CHANGED
@@ -2,6 +2,7 @@
 
 import json
 import logging
+import re
 from typing import Any, Dict, Optional
 
 logger = logging.getLogger(__name__)
@@ -137,18 +138,32 @@
             Dict with keys: total, prompt, completion, reasoning, cache_read, cache_write
         """
         model_key = self._normalize_model_name(model, "chat")
-        if not model_key:
-            logger.debug("Pricing not found for chat model: %s", model)
-            return {
-                "total": 0.0,
-                "prompt": 0.0,
-                "completion": 0.0,
-                "reasoning": 0.0,
-                "cache_read": 0.0,
-                "cache_write": 0.0,
-            }
 
-        pricing = self.pricing_data["chat"][model_key]
+        # Fallback for unknown local models (Ollama, HuggingFace): estimate pricing based on parameter count
+        if not model_key:
+            param_count = self._extract_param_count_from_model_name(model)
+            if param_count is not None:
+                pricing = self._get_local_model_price_tier(param_count)
+                logger.info(
+                    "Using fallback pricing for unknown local model '%s' with %.2fB parameters: "
+                    "$%.4f prompt / $%.4f completion per 1k tokens",
+                    model,
+                    param_count,
+                    pricing["promptPrice"],
+                    pricing["completionPrice"]
+                )
+            else:
+                logger.debug("Pricing not found for chat model: %s", model)
+                return {
+                    "total": 0.0,
+                    "prompt": 0.0,
+                    "completion": 0.0,
+                    "reasoning": 0.0,
+                    "cache_read": 0.0,
+                    "cache_write": 0.0,
+                }
+        else:
+            pricing = self.pricing_data["chat"][model_key]
 
         # Standard prompt and completion tokens
         prompt_tokens = usage.get("prompt_tokens", 0)
@@ -274,3 +289,113 @@
             if key.lower() in normalized_model:
                 return key
         return None
+
+    def _extract_param_count_from_model_name(self, model: str) -> Optional[float]:
+        """Extract parameter count from Ollama or HuggingFace model name.
+
+        Supports both explicit size indicators and common model size names.
+
+        Examples:
+            Ollama models:
+                "smollm2:360m" -> 0.36
+                "llama3:7b" -> 7.0
+                "llama3.1:70b" -> 70.0
+                "deepseek-r1:32b" -> 32.0
+
+            HuggingFace models:
+                "gpt2" -> 0.124 (base)
+                "gpt2-xl" -> 1.5
+                "bert-base-uncased" -> 0.11
+                "bert-large-uncased" -> 0.34
+                "t5-small" -> 0.06
+                "t5-xxl" -> 11.0
+                "llama-2-7b" -> 7.0
+                "mistral-7b-v0.1" -> 7.0
+
+        Returns:
+            Parameter count in billions, or None if not parseable.
+        """
+        model_lower = model.lower()
+
+        # First try explicit parameter count patterns (e.g., 135m, 7b, 70b)
+        # Matches: digits followed by optional decimal, then 'm' or 'b'
+        pattern = r'(\d+(?:\.\d+)?)(m|b)(?:\s|:|$|-)'
+        match = re.search(pattern, model_lower)
+        if match:
+            value = float(match.group(1))
+            unit = match.group(2)
+            if unit == 'm':
+                return value / 1000  # Convert millions to billions
+            elif unit == 'b':
+                return value
+
+        # Fallback to common model size indicators for HuggingFace models
+        # These are approximate values based on typical model sizes
+        size_map = {
+            # T5 family
+            "t5-small": 0.06,
+            "t5-base": 0.22,
+            "t5-large": 0.77,
+            "t5-xl": 3.0,
+            "t5-xxl": 11.0,
+            # GPT-2 family
+            "gpt2-small": 0.124,
+            "gpt2-medium": 0.355,
+            "gpt2-large": 0.774,
+            "gpt2-xl": 1.5,
+            "gpt2": 0.124,  # default GPT-2 is small
+            # BERT family
+            "bert-tiny": 0.004,
+            "bert-mini": 0.011,
+            "bert-small": 0.029,
+            "bert-medium": 0.041,
+            "bert-base": 0.11,
+            "bert-large": 0.34,
+            # Generic size indicators (fallback)
+            "tiny": 0.01,
+            "mini": 0.02,
+            "small": 0.06,
+            "base": 0.11,
+            "medium": 0.35,
+            "large": 0.77,
+            "xl": 1.5,
+            "xxl": 11.0,
+        }
+
+        # Check for size indicators in the model name
+        for size_key, param_count in size_map.items():
+            if size_key in model_lower:
+                return param_count
+
+        return None
+
+    def _get_local_model_price_tier(self, param_count_billions: float) -> Dict[str, float]:
+        """Get pricing tier based on parameter count for local models (Ollama, HuggingFace).
+
+        Local models (Ollama, HuggingFace Transformers) are free but consume GPU power
+        and electricity. We estimate costs based on parameter count and comparable
+        cloud API pricing.
+
+        Price Tiers (based on parameter count):
+        - Tiny (< 1B params): $0.0001 / $0.0002 (prompt/completion)
+        - Small (1-10B): $0.0003 / $0.0006
+        - Medium (10-20B): $0.0005 / $0.001
+        - Large (20-80B): $0.0008 / $0.0008
+        - XLarge (80B+): $0.0012 / $0.0012
+
+        Args:
+            param_count_billions: Model parameter count in billions
+
+        Returns:
+            Dict with promptPrice and completionPrice
+        """
+        if param_count_billions < 1.0:
+            return {"promptPrice": 0.0001, "completionPrice": 0.0002}
+        elif param_count_billions < 10.0:
+            return {"promptPrice": 0.0003, "completionPrice": 0.0006}
+        elif param_count_billions < 20.0:
+            return {"promptPrice": 0.0005, "completionPrice": 0.001}
+        elif param_count_billions < 80.0:
+            return {"promptPrice": 0.0008, "completionPrice": 0.0008}
+        else:
+            return {"promptPrice": 0.0012, "completionPrice": 0.0012}
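A worked example of the new fallback path (a sketch; it assumes, per the log message above, that tier prices are per 1k tokens, and that "llama3:7b" is absent from llm_pricing.json):

from genai_otel.cost_calculator import CostCalculator

calc = CostCalculator()
# "llama3:7b" -> regex captures "7b" -> 7.0B params -> small tier ($0.0003 / $0.0006 per 1k)
costs = calc.calculate_granular_cost(
    model="llama3:7b",
    usage={"prompt_tokens": 1000, "completion_tokens": 500},
    call_type="chat",
)
# Expected under those assumptions:
#   prompt     = 1000/1000 * 0.0003 = 0.0003
#   completion =  500/1000 * 0.0006 = 0.0003
#   total      = 0.0006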
genai_otel/cost_enrichment_processor.py
ADDED
@@ -0,0 +1,177 @@
+"""Custom SpanProcessor to enrich OpenInference spans with cost tracking.
+
+This processor adds cost attributes to spans created by OpenInference instrumentors
+(smolagents, litellm, mcp) by extracting token usage and model information from
+existing span attributes and calculating costs using our CostCalculator.
+
+Supports both OpenTelemetry GenAI and OpenInference semantic conventions:
+- GenAI: gen_ai.request.model, gen_ai.usage.{prompt_tokens,completion_tokens}
+- OpenInference: llm.model_name, llm.token_count.{prompt,completion}
+"""
+
+import logging
+from typing import Optional
+
+from opentelemetry.sdk.trace import ReadableSpan, Span, SpanProcessor
+from opentelemetry.trace import SpanContext
+
+from .cost_calculator import CostCalculator
+
+logger = logging.getLogger(__name__)
+
+
+class CostEnrichmentSpanProcessor(SpanProcessor):
+    """Enriches spans with cost tracking attributes.
+
+    This processor:
+    1. Identifies spans from OpenInference instrumentors (smolagents, litellm, mcp)
+    2. Extracts model name and token usage from span attributes
+    3. Calculates cost using CostCalculator
+    4. Adds cost attributes (gen_ai.usage.cost.total, etc.) to the span
+    """
+
+    def __init__(self, cost_calculator: Optional[CostCalculator] = None):
+        """Initialize the cost enrichment processor.
+
+        Args:
+            cost_calculator: CostCalculator instance to use for cost calculations.
+                If None, creates a new instance.
+        """
+        self.cost_calculator = cost_calculator or CostCalculator()
+        logger.info("CostEnrichmentSpanProcessor initialized")
+
+    def on_start(self, span: Span, parent_context: Optional[SpanContext] = None) -> None:
+        """Called when a span starts. No action needed."""
+        pass
+
+    def on_end(self, span: ReadableSpan) -> None:
+        """Called when a span ends. Enriches with cost attributes if applicable.
+
+        Args:
+            span: The span that just ended.
+        """
+        try:
+            # Only process spans that have LLM-related attributes
+            if not span.attributes:
+                return
+
+            attributes = span.attributes
+
+            # Check for model name - support both GenAI and OpenInference conventions
+            model = (
+                attributes.get("gen_ai.request.model")
+                or attributes.get("llm.model_name")
+                or attributes.get("embedding.model_name")
+            )
+            if not model:
+                return
+
+            # Skip if cost attributes are already present (added by instrumentor)
+            if "gen_ai.usage.cost.total" in attributes:
+                logger.debug(
+                    f"Span '{span.name}' already has cost attributes, skipping enrichment"
+                )
+                return
+
+            # Extract token usage - support GenAI, OpenInference, and legacy conventions
+            prompt_tokens = (
+                attributes.get("gen_ai.usage.prompt_tokens")
+                or attributes.get("gen_ai.usage.input_tokens")
+                or attributes.get("llm.token_count.prompt")  # OpenInference
+                or 0
+            )
+            completion_tokens = (
+                attributes.get("gen_ai.usage.completion_tokens")
+                or attributes.get("gen_ai.usage.output_tokens")
+                or attributes.get("llm.token_count.completion")  # OpenInference
+                or 0
+            )
+
+            # Skip if no tokens recorded
+            if prompt_tokens == 0 and completion_tokens == 0:
+                return
+
+            # Get call type - support both GenAI and OpenInference conventions
+            # OpenInference uses openinference.span.kind (values: LLM, EMBEDDING, etc.)
+            span_kind = attributes.get("openinference.span.kind", "").upper()
+            call_type = attributes.get("gen_ai.operation.name") or span_kind.lower() or "chat"
+
+            # Map operation names to call types for cost calculator
+            # Supports both GenAI and OpenInference conventions
+            call_type_mapping = {
+                # GenAI conventions
+                "chat": "chat",
+                "completion": "chat",
+                "embedding": "embedding",
+                "embeddings": "embedding",
+                "text_generation": "chat",
+                "image_generation": "image",
+                "audio": "audio",
+                # OpenInference conventions (span.kind values)
+                "llm": "chat",
+                "embedding": "embedding",
+                "chain": "chat",
+                "retriever": "embedding",
+                "reranker": "embedding",
+                "tool": "chat",
+                "agent": "chat",
+            }
+            normalized_call_type = call_type_mapping.get(str(call_type).lower(), "chat")
+
+            # Calculate cost
+            usage = {
+                "prompt_tokens": int(prompt_tokens),
+                "completion_tokens": int(completion_tokens),
+                "total_tokens": int(prompt_tokens) + int(completion_tokens),
+            }
+
+            # Use calculate_granular_cost to get detailed breakdown
+            cost_info = self.cost_calculator.calculate_granular_cost(
+                model=str(model),
+                usage=usage,
+                call_type=normalized_call_type,
+            )
+
+            if cost_info and cost_info.get("total", 0.0) > 0:
+                # Add cost attributes to the span
+                # Note: We can't modify ReadableSpan attributes directly,
+                # but we can if span is still a Span instance
+                if isinstance(span, Span):
+                    span.set_attribute("gen_ai.usage.cost.total", cost_info["total"])
+
+                    if cost_info.get("prompt", 0.0) > 0:
+                        span.set_attribute("gen_ai.usage.cost.prompt", cost_info["prompt"])
+                    if cost_info.get("completion", 0.0) > 0:
+                        span.set_attribute("gen_ai.usage.cost.completion", cost_info["completion"])
+
+                    logger.info(
+                        f"Enriched span '{span.name}' with cost: {cost_info['total']:.6f} USD "
+                        f"for model {model} ({usage['total_tokens']} tokens)"
+                    )
+                else:
+                    logger.warning(
+                        f"Span '{span.name}' is not mutable (type: {type(span).__name__}), "
+                        "cannot add cost attributes"
+                    )
+
+        except Exception as e:
+            # Don't fail span processing due to cost enrichment errors
+            logger.warning(
+                f"Failed to enrich span '{getattr(span, 'name', 'unknown')}' with cost: {e}",
+                exc_info=True,
+            )
+
+    def shutdown(self) -> None:
+        """Called when the processor is shutdown."""
+        logger.info("CostEnrichmentSpanProcessor shutdown")
+
+    def force_flush(self, timeout_millis: int = 30000) -> bool:
+        """Force flush any pending spans.
+
+        Args:
+            timeout_millis: Timeout in milliseconds.
+
+        Returns:
+            True if flush succeeded.
+        """
+        return True
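A minimal end-to-end sketch of the new processor (assumed demo setup, not from the package docs; in the OpenTelemetry Python SDK the span handed to on_end is still the mutable Span object, which is what the isinstance check above relies on):

from opentelemetry.sdk.trace import TracerProvider

from genai_otel.cost_enrichment_processor import CostEnrichmentSpanProcessor

provider = TracerProvider()
provider.add_span_processor(CostEnrichmentSpanProcessor())
tracer = provider.get_tracer("demo")

with tracer.start_as_current_span("llm-call") as span:
    # OpenInference-style attributes, as litellm/smolagents would emit them
    span.set_attribute("llm.model_name", "gpt-4o-mini")  # illustrative model name
    span.set_attribute("llm.token_count.prompt", 1200)
    span.set_attribute("llm.token_count.completion", 350)
    span.set_attribute("openinference.span.kind", "LLM")
# When the span ends, on_end() maps the attributes, calls the cost calculator,
# and (for a priced model) attaches gen_ai.usage.cost.* to the span.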
genai_otel/gpu_metrics.py
CHANGED
@@ -43,6 +43,7 @@ class GPUMetricsCollector:
         self.gpu_utilization_counter: Optional[ObservableCounter] = None
         self.gpu_memory_used_gauge: Optional[ObservableGauge] = None
         self.gpu_temperature_gauge: Optional[ObservableGauge] = None
+        self.gpu_power_gauge: Optional[ObservableGauge] = None
         self.config = config
         self.interval = interval  # seconds
         self.gpu_available = False
@@ -93,6 +94,12 @@ class GPUMetricsCollector:
                 description="GPU temperature in Celsius",
                 unit="Cel",
             )
+            self.gpu_power_gauge = self.meter.create_observable_gauge(
+                "gen_ai.gpu.power",  # Fixed metric name
+                callbacks=[self._observe_gpu_power],
+                description="GPU power consumption in Watts",
+                unit="W",
+            )
         except Exception as e:
             logger.error("Failed to create GPU metrics instruments: %s", e, exc_info=True)
 
@@ -185,6 +192,33 @@ class GPUMetricsCollector:
         except Exception as e:
             logger.error("Error observing GPU temperature: %s", e)
 
+    def _observe_gpu_power(self, options):
+        """Observable callback for GPU power consumption."""
+        if not NVML_AVAILABLE or not self.gpu_available:
+            return
+
+        try:
+            pynvml.nvmlInit()
+            device_count = pynvml.nvmlDeviceGetCount()
+
+            for i in range(device_count):
+                handle = pynvml.nvmlDeviceGetHandleByIndex(i)
+                device_name = self._get_device_name(handle, i)
+
+                try:
+                    # Power usage is returned in milliwatts, convert to watts
+                    power_mw = pynvml.nvmlDeviceGetPowerUsage(handle)
+                    power_w = power_mw / 1000.0
+                    yield Observation(
+                        value=power_w, attributes={"gpu_id": str(i), "gpu_name": device_name}
+                    )
+                except Exception as e:
+                    logger.debug("Failed to get GPU power for GPU %d: %s", i, e)
+
+            pynvml.nvmlShutdown()
+        except Exception as e:
+            logger.error("Error observing GPU power: %s", e)
+
     def start(self):
         """Starts the GPU metrics collection.
 
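The NVML calls the new callback depends on, isolated into a standalone sketch (pynvml is the nvidia-ml-py binding this module already uses; requires an NVIDIA GPU and driver):

import pynvml

pynvml.nvmlInit()
handle = pynvml.nvmlDeviceGetHandleByIndex(0)
# NVML reports power in milliwatts; the collector divides by 1000 to emit Watts
watts = pynvml.nvmlDeviceGetPowerUsage(handle) / 1000.0
print(f"GPU 0 power draw: {watts:.1f} W")
pynvml.nvmlShutdown()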
genai_otel/instrumentors/base.py
CHANGED
@@ -82,6 +82,12 @@ class BaseInstrumentor(ABC): # pylint: disable=R0902
     _shared_latency_histogram = None
     _shared_cost_counter = None
     _shared_error_counter = None
+    # Granular cost counters (Phase 3.2)
+    _shared_prompt_cost_counter = None
+    _shared_completion_cost_counter = None
+    _shared_reasoning_cost_counter = None
+    _shared_cache_read_cost_counter = None
+    _shared_cache_write_cost_counter = None
     # Streaming metrics (Phase 3.4)
     _shared_ttft_histogram = None
     _shared_tbt_histogram = None
@@ -103,6 +109,12 @@ class BaseInstrumentor(ABC): # pylint: disable=R0902
         self.latency_histogram = self._shared_latency_histogram
         self.cost_counter = self._shared_cost_counter
         self.error_counter = self._shared_error_counter
+        # Granular cost counters (Phase 3.2)
+        self.prompt_cost_counter = self._shared_prompt_cost_counter
+        self.completion_cost_counter = self._shared_completion_cost_counter
+        self.reasoning_cost_counter = self._shared_reasoning_cost_counter
+        self.cache_read_cost_counter = self._shared_cache_read_cost_counter
+        self.cache_write_cost_counter = self._shared_cache_write_cost_counter
         # Streaming metrics
         self.ttft_histogram = self._shared_ttft_histogram
         self.tbt_histogram = self._shared_tbt_histogram
@@ -346,45 +358,54 @@ class BaseInstrumentor(ABC): # pylint: disable=R0902
             and "dup" in self.config.semconv_stability_opt_in
         )
 
-        if isinstance(prompt_tokens, (int, float)) and prompt_tokens > 0:
-            self.token_counter.add(
-                prompt_tokens,
-                {
-                    "token_type": "prompt",
-                    "operation": span.name,
-                },
-            )
-            # New semantic convention
+        # Record prompt tokens
+        if isinstance(prompt_tokens, (int, float)) and prompt_tokens > 0:
+            # Record metric if available
+            if self.token_counter:
+                self.token_counter.add(
+                    prompt_tokens, {"token_type": "prompt", "operation": span.name}
+                )
+            # Always set span attributes (needed for cost calculation)
             span.set_attribute("gen_ai.usage.prompt_tokens", int(prompt_tokens))
             # Old semantic convention (if dual emission enabled)
             if emit_old_attrs:
                 span.set_attribute("gen_ai.usage.input_tokens", int(prompt_tokens))
 
-        if isinstance(completion_tokens, (int, float)) and completion_tokens > 0:
-            self.token_counter.add(
-                completion_tokens,
-                {
-                    "token_type": "completion",
-                    "operation": span.name,
-                },
-            )
-            # New semantic convention
+        # Record completion tokens
+        if isinstance(completion_tokens, (int, float)) and completion_tokens > 0:
+            # Record metric if available
+            if self.token_counter:
+                self.token_counter.add(
+                    completion_tokens, {"token_type": "completion", "operation": span.name}
+                )
+            # Always set span attributes (needed for cost calculation)
             span.set_attribute("gen_ai.usage.completion_tokens", int(completion_tokens))
             # Old semantic convention (if dual emission enabled)
             if emit_old_attrs:
                 span.set_attribute("gen_ai.usage.output_tokens", int(completion_tokens))
 
+        # Record total tokens
         if isinstance(total_tokens, (int, float)) and total_tokens > 0:
            span.set_attribute("gen_ai.usage.total_tokens", int(total_tokens))
 
         # Calculate and record cost if enabled and applicable
-        if self.config and self.config.enable_cost_tracking and self.cost_counter:
+        logger.debug(
+            f"Cost tracking check: config={self.config is not None}, "
+            f"enable_cost_tracking={self.config.enable_cost_tracking if self.config else 'N/A'}"
+        )
+        if self.config and self.config.enable_cost_tracking:
             try:
                 model = span.attributes.get("gen_ai.request.model", "unknown")
                 # Assuming 'chat' as a default call_type for generic base instrumentor tests.
                 # Specific instrumentors will provide the actual call_type.
                 call_type = span.attributes.get("gen_ai.request.type", "chat")
 
+                logger.debug(
+                    f"Calculating cost for model={model}, call_type={call_type}, "
+                    f"prompt_tokens={usage.get('prompt_tokens')}, "
+                    f"completion_tokens={usage.get('completion_tokens')}"
+                )
+
                 # Use granular cost calculation for chat requests
                 if call_type == "chat":
                     costs = self.cost_calculator.calculate_granular_cost(
@@ -394,45 +415,55 @@ class BaseInstrumentor(ABC): # pylint: disable=R0902
 
                     # Record total cost
                     if total_cost > 0:
-                        self.cost_counter.add(total_cost, {"model": str(model)})
-
+                        if self.cost_counter:
+                            self.cost_counter.add(total_cost, {"model": str(model)})
+                        # Always set span attributes (needed for cost tracking)
                         span.set_attribute("gen_ai.usage.cost.total", total_cost)
+                        logger.debug(f"Set cost attribute: gen_ai.usage.cost.total={total_cost}")
+                    else:
+                        logger.debug(f"Cost is zero, not setting attributes. Costs: {costs}")
 
                     # Record and set attributes for granular costs
-                    if costs["prompt"] > 0:
-                        self.prompt_cost_counter.add(
-                            costs["prompt"], {"model": str(model)}
-                        )
+                    # Note: Metrics recording is optional, span attributes are always set
+                    if costs["prompt"] > 0:
+                        if self.prompt_cost_counter:
+                            self.prompt_cost_counter.add(
+                                costs["prompt"], {"model": str(model)}
+                            )
                         span.set_attribute("gen_ai.usage.cost.prompt", costs["prompt"])
 
-                    if costs["completion"] > 0:
-                        self.completion_cost_counter.add(
-                            costs["completion"], {"model": str(model)}
-                        )
+                    if costs["completion"] > 0:
+                        if self.completion_cost_counter:
+                            self.completion_cost_counter.add(
+                                costs["completion"], {"model": str(model)}
+                            )
                         span.set_attribute(
                             "gen_ai.usage.cost.completion", costs["completion"]
                         )
 
-                    if costs["reasoning"] > 0:
-                        self.reasoning_cost_counter.add(
-                            costs["reasoning"], {"model": str(model)}
-                        )
+                    if costs["reasoning"] > 0:
+                        if self.reasoning_cost_counter:
+                            self.reasoning_cost_counter.add(
+                                costs["reasoning"], {"model": str(model)}
+                            )
                         span.set_attribute(
                             "gen_ai.usage.cost.reasoning", costs["reasoning"]
                         )
 
-                    if costs["cache_read"] > 0:
-                        self.cache_read_cost_counter.add(
-                            costs["cache_read"], {"model": str(model)}
-                        )
+                    if costs["cache_read"] > 0:
+                        if self.cache_read_cost_counter:
+                            self.cache_read_cost_counter.add(
+                                costs["cache_read"], {"model": str(model)}
+                            )
                         span.set_attribute(
                             "gen_ai.usage.cost.cache_read", costs["cache_read"]
                        )
 
-                    if costs["cache_write"] > 0:
-                        self.cache_write_cost_counter.add(
-                            costs["cache_write"], {"model": str(model)}
-                        )
+                    if costs["cache_write"] > 0:
+                        if self.cache_write_cost_counter:
+                            self.cache_write_cost_counter.add(
+                                costs["cache_write"], {"model": str(model)}
+                            )
                         span.set_attribute(
                             "gen_ai.usage.cost.cache_write", costs["cache_write"]
                         )
@@ -440,7 +471,8 @@ class BaseInstrumentor(ABC): # pylint: disable=R0902
                     # For non-chat requests, use simple cost calculation
                     cost = self.cost_calculator.calculate_cost(model, usage, call_type)
                     if cost and cost > 0:
-                        self.cost_counter.add(cost, {"model": str(model)})
+                        if self.cost_counter:
+                            self.cost_counter.add(cost, {"model": str(model)})
             except Exception as e:
                 logger.warning("Failed to calculate cost for span '%s': %s", span.name, e)
 