genai-otel-instrument 0.1.24__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- genai_otel/__init__.py +132 -0
- genai_otel/__version__.py +34 -0
- genai_otel/auto_instrument.py +602 -0
- genai_otel/cli.py +92 -0
- genai_otel/config.py +333 -0
- genai_otel/cost_calculator.py +467 -0
- genai_otel/cost_enriching_exporter.py +207 -0
- genai_otel/cost_enrichment_processor.py +174 -0
- genai_otel/evaluation/__init__.py +76 -0
- genai_otel/evaluation/bias_detector.py +364 -0
- genai_otel/evaluation/config.py +261 -0
- genai_otel/evaluation/hallucination_detector.py +525 -0
- genai_otel/evaluation/pii_detector.py +356 -0
- genai_otel/evaluation/prompt_injection_detector.py +262 -0
- genai_otel/evaluation/restricted_topics_detector.py +316 -0
- genai_otel/evaluation/span_processor.py +962 -0
- genai_otel/evaluation/toxicity_detector.py +406 -0
- genai_otel/exceptions.py +17 -0
- genai_otel/gpu_metrics.py +516 -0
- genai_otel/instrumentors/__init__.py +71 -0
- genai_otel/instrumentors/anthropic_instrumentor.py +134 -0
- genai_otel/instrumentors/anyscale_instrumentor.py +27 -0
- genai_otel/instrumentors/autogen_instrumentor.py +394 -0
- genai_otel/instrumentors/aws_bedrock_instrumentor.py +94 -0
- genai_otel/instrumentors/azure_openai_instrumentor.py +69 -0
- genai_otel/instrumentors/base.py +919 -0
- genai_otel/instrumentors/bedrock_agents_instrumentor.py +398 -0
- genai_otel/instrumentors/cohere_instrumentor.py +140 -0
- genai_otel/instrumentors/crewai_instrumentor.py +311 -0
- genai_otel/instrumentors/dspy_instrumentor.py +661 -0
- genai_otel/instrumentors/google_ai_instrumentor.py +310 -0
- genai_otel/instrumentors/groq_instrumentor.py +106 -0
- genai_otel/instrumentors/guardrails_ai_instrumentor.py +510 -0
- genai_otel/instrumentors/haystack_instrumentor.py +503 -0
- genai_otel/instrumentors/huggingface_instrumentor.py +399 -0
- genai_otel/instrumentors/hyperbolic_instrumentor.py +236 -0
- genai_otel/instrumentors/instructor_instrumentor.py +425 -0
- genai_otel/instrumentors/langchain_instrumentor.py +340 -0
- genai_otel/instrumentors/langgraph_instrumentor.py +328 -0
- genai_otel/instrumentors/llamaindex_instrumentor.py +36 -0
- genai_otel/instrumentors/mistralai_instrumentor.py +315 -0
- genai_otel/instrumentors/ollama_instrumentor.py +197 -0
- genai_otel/instrumentors/ollama_server_metrics_poller.py +336 -0
- genai_otel/instrumentors/openai_agents_instrumentor.py +291 -0
- genai_otel/instrumentors/openai_instrumentor.py +260 -0
- genai_otel/instrumentors/pydantic_ai_instrumentor.py +362 -0
- genai_otel/instrumentors/replicate_instrumentor.py +87 -0
- genai_otel/instrumentors/sambanova_instrumentor.py +196 -0
- genai_otel/instrumentors/togetherai_instrumentor.py +146 -0
- genai_otel/instrumentors/vertexai_instrumentor.py +106 -0
- genai_otel/llm_pricing.json +1676 -0
- genai_otel/logging_config.py +45 -0
- genai_otel/mcp_instrumentors/__init__.py +14 -0
- genai_otel/mcp_instrumentors/api_instrumentor.py +144 -0
- genai_otel/mcp_instrumentors/base.py +105 -0
- genai_otel/mcp_instrumentors/database_instrumentor.py +336 -0
- genai_otel/mcp_instrumentors/kafka_instrumentor.py +31 -0
- genai_otel/mcp_instrumentors/manager.py +139 -0
- genai_otel/mcp_instrumentors/redis_instrumentor.py +31 -0
- genai_otel/mcp_instrumentors/vector_db_instrumentor.py +265 -0
- genai_otel/metrics.py +148 -0
- genai_otel/py.typed +2 -0
- genai_otel/server_metrics.py +197 -0
- genai_otel_instrument-0.1.24.dist-info/METADATA +1404 -0
- genai_otel_instrument-0.1.24.dist-info/RECORD +69 -0
- genai_otel_instrument-0.1.24.dist-info/WHEEL +5 -0
- genai_otel_instrument-0.1.24.dist-info/entry_points.txt +2 -0
- genai_otel_instrument-0.1.24.dist-info/licenses/LICENSE +680 -0
- genai_otel_instrument-0.1.24.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,516 @@
|
|
|
1
|
+
"""Module for collecting GPU metrics using nvidia-ml-py and reporting them via OpenTelemetry.
|
|
2
|
+
|
|
3
|
+
This module provides the `GPUMetricsCollector` class, which periodically collects
|
|
4
|
+
GPU utilization, memory usage, and temperature, and exports these as OpenTelemetry
|
|
5
|
+
metrics. It relies on the `nvidia-ml-py` library for interacting with NVIDIA GPUs.
|
|
6
|
+
|
|
7
|
+
CO2 emissions tracking is provided via codecarbon integration, which offers:
|
|
8
|
+
- Automatic region-based carbon intensity lookup
|
|
9
|
+
- Cloud provider carbon intensity data
|
|
10
|
+
- More accurate emission factors based on location
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
import logging
|
|
14
|
+
import threading
|
|
15
|
+
import time
|
|
16
|
+
from typing import Optional
|
|
17
|
+
|
|
18
|
+
from opentelemetry.metrics import Meter, ObservableCounter, ObservableGauge, Observation
|
|
19
|
+
|
|
20
|
+
from genai_otel.config import OTelConfig
|
|
21
|
+
|
|
22
|
+
logger = logging.getLogger(__name__)
|
|
23
|
+
|
|
24
|
+
# Try to import nvidia-ml-py (official replacement for pynvml)
|
|
25
|
+
try:
|
|
26
|
+
import pynvml
|
|
27
|
+
|
|
28
|
+
NVML_AVAILABLE = True
|
|
29
|
+
except ImportError:
|
|
30
|
+
NVML_AVAILABLE = False
|
|
31
|
+
logger.debug("nvidia-ml-py not available, GPU metrics will be disabled")
|
|
32
|
+
|
|
33
|
+
# Try to import codecarbon for CO2 emissions tracking
|
|
34
|
+
try:
|
|
35
|
+
from codecarbon import EmissionsTracker, OfflineEmissionsTracker
|
|
36
|
+
|
|
37
|
+
CODECARBON_AVAILABLE = True
|
|
38
|
+
except ImportError:
|
|
39
|
+
CODECARBON_AVAILABLE = False
|
|
40
|
+
EmissionsTracker = None # type: ignore
|
|
41
|
+
OfflineEmissionsTracker = None # type: ignore
|
|
42
|
+
logger.debug("codecarbon not available, will use manual CO2 calculation")
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
class GPUMetricsCollector:
|
|
46
|
+
"""Collects and reports GPU metrics using nvidia-ml-py and codecarbon for CO2 tracking."""
|
|
47
|
+
|
|
48
|
+
def __init__(self, meter: Meter, config: OTelConfig, interval: int = 10):
    """Initializes the GPUMetricsCollector.

    Probes NVML for the number of devices, creates the CO2 and power-cost
    counters, sets up codecarbon via ``_init_codecarbon()``, and registers
    one observable gauge per GPU metric.

    Args:
        meter (Meter): The OpenTelemetry meter to use for recording metrics.
        config (OTelConfig): Configuration for the collector.
        interval (int): Collection interval in seconds.
    """
    self.meter = meter
    self.config = config
    self.interval = interval  # seconds
    self.running = False
    self.thread: Optional[threading.Thread] = None
    self._thread: Optional[threading.Thread] = None  # worker created by start()
    self._stop_event = threading.Event()

    # Instrument handles. Every handle defaults to None so the attribute
    # exists even when NVML is missing or instrument creation fails below.
    # `gpu_utilization_counter` is never assigned in this class; it is kept
    # only so existing attribute access keeps working.
    self.gpu_utilization_counter: Optional[ObservableCounter] = None
    self.gpu_utilization_gauge: Optional[ObservableGauge] = None
    self.gpu_memory_used_gauge: Optional[ObservableGauge] = None
    self.gpu_memory_total_gauge: Optional[ObservableGauge] = None
    self.gpu_temperature_gauge: Optional[ObservableGauge] = None
    self.gpu_power_gauge: Optional[ObservableGauge] = None

    # Codecarbon emissions tracker state (populated by _init_codecarbon).
    self._emissions_tracker: Optional["EmissionsTracker"] = None
    self._last_emissions_kg: float = 0.0
    self._use_codecarbon: bool = False

    # Probe NVML once to learn how many devices exist.
    self.gpu_available = False
    self.device_count = 0
    self.nvml = None
    if NVML_AVAILABLE:
        try:
            pynvml.nvmlInit()
            self.device_count = pynvml.nvmlDeviceGetCount()
            if self.device_count > 0:
                self.gpu_available = True
                self.nvml = pynvml
        except Exception as e:
            logger.error("Failed to initialize NVML to get device count: %s", e)

    # Per-GPU accumulators used by the background collection loop.
    self.cumulative_energy_wh = [0.0] * self.device_count  # Per GPU, in Wh
    self.last_timestamp = [time.time()] * self.device_count

    self.co2_counter = meter.create_counter(
        "gen_ai.co2.emissions",
        description="Cumulative CO2 equivalent emissions in grams",
        unit="gCO2e",
    )
    self.power_cost_counter = meter.create_counter(
        "gen_ai.power.cost",
        description="Cumulative electricity cost in USD based on GPU power consumption",
        unit="USD",
    )

    # Initialize codecarbon if available and CO2 tracking is enabled
    self._init_codecarbon()

    if not NVML_AVAILABLE:
        logger.warning(
            "GPU metrics collection not available - nvidia-ml-py not installed. "
            "Install with: pip install genai-otel-instrument[gpu]"
        )
        return

    # (attribute, metric name, callback, description, unit) for each gauge.
    gauge_specs = (
        (
            "gpu_utilization_gauge",
            "gen_ai.gpu.utilization",
            self._observe_gpu_utilization,
            "GPU utilization percentage",
            "%",
        ),
        (
            "gpu_memory_used_gauge",
            "gen_ai.gpu.memory.used",
            self._observe_gpu_memory,
            "GPU memory used in MiB",
            "MiB",
        ),
        (
            "gpu_memory_total_gauge",
            "gen_ai.gpu.memory.total",
            self._observe_gpu_memory_total,
            "Total GPU memory capacity in MiB",
            "MiB",
        ),
        (
            "gpu_temperature_gauge",
            "gen_ai.gpu.temperature",
            self._observe_gpu_temperature,
            "GPU temperature in Celsius",
            "Cel",
        ),
        (
            "gpu_power_gauge",
            "gen_ai.gpu.power",
            self._observe_gpu_power,
            "GPU power consumption in Watts",
            "W",
        ),
    )

    try:
        # Use ObservableGauge for all GPU metrics (not Counter!)
        for attr, metric_name, callback, description, unit in gauge_specs:
            gauge = self.meter.create_observable_gauge(
                metric_name,
                callbacks=[callback],
                description=description,
                unit=unit,
            )
            setattr(self, attr, gauge)
    except Exception as e:
        logger.error("Failed to create GPU metrics instruments: %s", e, exc_info=True)
|
|
144
|
+
|
|
145
|
+
def _get_device_name(self, handle, index):
    """Return the NVML device name for ``handle`` as text.

    A ``bytes`` name is decoded as UTF-8. If the lookup or the decode
    raises, the placeholder ``GPU_<index>`` is returned instead.
    """
    placeholder = f"GPU_{index}"
    try:
        name = pynvml.nvmlDeviceGetName(handle)
        return name.decode("utf-8") if isinstance(name, bytes) else name
    except Exception as e:
        logger.debug("Failed to get GPU name: %s", e)
    return placeholder
|
|
155
|
+
|
|
156
|
+
def _init_codecarbon(self):
    """Set up the codecarbon tracker when CO2 tracking can use it.

    Leaves ``_use_codecarbon`` False (manual calculation) when CO2 tracking
    is disabled, manual mode is forced, codecarbon is not importable, or
    tracker construction raises. Otherwise stores the tracker in
    ``_emissions_tracker`` and sets ``_use_codecarbon`` to True.
    """
    cfg = self.config

    if not cfg.enable_co2_tracking:
        logger.debug("CO2 tracking disabled, skipping codecarbon initialization")
        return

    # The user may force the manual calculation even if codecarbon exists.
    if cfg.co2_use_manual:
        logger.info(
            "Using manual CO2 calculation (GENAI_CO2_USE_MANUAL=true) with "
            "carbon_intensity=%s gCO2e/kWh",
            cfg.carbon_intensity,
        )
        return

    if not CODECARBON_AVAILABLE:
        logger.info(
            "codecarbon not installed, using manual CO2 calculation with "
            "carbon_intensity=%s gCO2e/kWh. Install codecarbon for automatic "
            "region-based carbon intensity: pip install genai-otel-instrument[co2]",
            cfg.carbon_intensity,
        )
        return

    try:
        offline = cfg.co2_offline_mode

        # Base codecarbon configuration derived from OTelConfig.
        tracker_kwargs = {
            "project_name": cfg.service_name,
            "measure_power_secs": cfg.gpu_collection_interval,
            "save_to_file": False,  # reported via OpenTelemetry, not CSV
            "save_to_api": False,  # nothing is sent to the codecarbon API
            "logging_logger": logger,
            "log_level": "warning",  # keep codecarbon's own logging quiet
            # "machine" (all processes) or "process" (current only)
            "tracking_mode": cfg.co2_tracking_mode,
        }

        # Offline mode needs a country; fall back to USA when none is set.
        country_code = cfg.co2_country_iso_code
        if offline and not country_code:
            country_code = "USA"
            logger.debug(
                "No country ISO code specified for offline mode, defaulting to USA. "
                "Set GENAI_CO2_COUNTRY_ISO_CODE for accurate carbon intensity."
            )

        if offline:
            # OfflineEmissionsTracker requires country_iso_code
            tracker_kwargs["country_iso_code"] = country_code
            # Optional region within the country (e.g., "california")
            if cfg.co2_region:
                tracker_kwargs["region"] = cfg.co2_region

        # Cloud provider settings apply to both tracker flavours.
        if cfg.co2_cloud_provider:
            tracker_kwargs["cloud_provider"] = cfg.co2_cloud_provider
        if cfg.co2_cloud_region:
            tracker_kwargs["cloud_region"] = cfg.co2_cloud_region

        tracker_cls = OfflineEmissionsTracker if offline else EmissionsTracker
        self._emissions_tracker = tracker_cls(**tracker_kwargs)

        self._use_codecarbon = True
        logger.info(
            "Codecarbon initialized for CO2 tracking (offline=%s, country=%s, region=%s)",
            cfg.co2_offline_mode,
            country_code or "auto-detect",
            cfg.co2_region or "auto-detect",
        )
    except Exception as e:
        logger.warning(
            "Failed to initialize codecarbon, falling back to manual CO2 calculation: %s", e
        )
        self._use_codecarbon = False
|
|
241
|
+
|
|
242
|
+
def _observe_gpu_utilization(self, options):
    """Observable callback for GPU utilization.

    Yields one ``Observation`` per GPU with the ``gpu`` field of NVML's
    utilization rates, tagged with ``gpu_id`` and ``gpu_name``. Yields
    nothing when NVML is unavailable or no GPU was detected.

    Args:
        options: Callback options argument; not used here.
    """
    if not NVML_AVAILABLE or not self.gpu_available:
        return

    nvml_initialized = False
    try:
        pynvml.nvmlInit()
        nvml_initialized = True

        for i in range(pynvml.nvmlDeviceGetCount()):
            handle = pynvml.nvmlDeviceGetHandleByIndex(i)
            device_name = self._get_device_name(handle, i)

            # A failure on one GPU must not hide the readings of the others.
            try:
                utilization = pynvml.nvmlDeviceGetUtilizationRates(handle)
                yield Observation(
                    value=utilization.gpu,
                    attributes={"gpu_id": str(i), "gpu_name": device_name},
                )
            except Exception as e:
                logger.debug("Failed to get GPU utilization for GPU %d: %s", i, e)
    except Exception as e:
        logger.error("Error observing GPU utilization: %s", e)
    finally:
        # Always pair the nvmlInit() above with a shutdown, including when a
        # device lookup raised or the consumer stopped iterating early, so
        # the init/shutdown calls stay balanced.
        if nvml_initialized:
            try:
                pynvml.nvmlShutdown()
            except Exception as e:
                logger.error("Error observing GPU utilization: %s", e)
|
|
267
|
+
|
|
268
|
+
def _observe_gpu_memory(self, options):
    """Observable callback for GPU memory usage.

    Yields one ``Observation`` per GPU with the used memory converted to
    MiB, tagged with ``gpu_id`` and ``gpu_name``. Yields nothing when NVML
    is unavailable or no GPU was detected.

    Args:
        options: Callback options argument; not used here.
    """
    if not NVML_AVAILABLE or not self.gpu_available:
        return

    nvml_initialized = False
    try:
        pynvml.nvmlInit()
        nvml_initialized = True

        for i in range(pynvml.nvmlDeviceGetCount()):
            handle = pynvml.nvmlDeviceGetHandleByIndex(i)
            device_name = self._get_device_name(handle, i)

            # A failure on one GPU must not hide the readings of the others.
            try:
                memory_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
                gpu_memory_used = memory_info.used / (1024**2)  # Convert to MiB
                yield Observation(
                    value=gpu_memory_used,
                    attributes={"gpu_id": str(i), "gpu_name": device_name},
                )
            except Exception as e:
                logger.debug("Failed to get GPU memory for GPU %d: %s", i, e)
    except Exception as e:
        logger.error("Error observing GPU memory: %s", e)
    finally:
        # Always pair the nvmlInit() above with a shutdown, including when a
        # device lookup raised or the consumer stopped iterating early, so
        # the init/shutdown calls stay balanced.
        if nvml_initialized:
            try:
                pynvml.nvmlShutdown()
            except Exception as e:
                logger.error("Error observing GPU memory: %s", e)
|
|
294
|
+
|
|
295
|
+
def _observe_gpu_memory_total(self, options):
    """Observable callback for total GPU memory capacity.

    Yields one ``Observation`` per GPU with the total memory converted to
    MiB, tagged with ``gpu_id`` and ``gpu_name``. Yields nothing when NVML
    is unavailable or no GPU was detected.

    Args:
        options: Callback options argument; not used here.
    """
    if not NVML_AVAILABLE or not self.gpu_available:
        return

    nvml_initialized = False
    try:
        pynvml.nvmlInit()
        nvml_initialized = True

        for i in range(pynvml.nvmlDeviceGetCount()):
            handle = pynvml.nvmlDeviceGetHandleByIndex(i)
            device_name = self._get_device_name(handle, i)

            # A failure on one GPU must not hide the readings of the others.
            try:
                memory_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
                gpu_memory_total = memory_info.total / (1024**2)  # Convert to MiB
                yield Observation(
                    value=gpu_memory_total,
                    attributes={"gpu_id": str(i), "gpu_name": device_name},
                )
            except Exception as e:
                logger.debug("Failed to get total GPU memory for GPU %d: %s", i, e)
    except Exception as e:
        logger.error("Error observing total GPU memory: %s", e)
    finally:
        # Always pair the nvmlInit() above with a shutdown, including when a
        # device lookup raised or the consumer stopped iterating early, so
        # the init/shutdown calls stay balanced.
        if nvml_initialized:
            try:
                pynvml.nvmlShutdown()
            except Exception as e:
                logger.error("Error observing total GPU memory: %s", e)
|
|
321
|
+
|
|
322
|
+
def _observe_gpu_temperature(self, options):
    """Observable callback for GPU temperature.

    Yields one ``Observation`` per GPU with the ``NVML_TEMPERATURE_GPU``
    sensor reading, tagged with ``gpu_id`` and ``gpu_name``. Yields nothing
    when NVML is unavailable or no GPU was detected.

    Args:
        options: Callback options argument; not used here.
    """
    if not NVML_AVAILABLE or not self.gpu_available:
        return

    nvml_initialized = False
    try:
        pynvml.nvmlInit()
        nvml_initialized = True

        for i in range(pynvml.nvmlDeviceGetCount()):
            handle = pynvml.nvmlDeviceGetHandleByIndex(i)
            device_name = self._get_device_name(handle, i)

            # A failure on one GPU must not hide the readings of the others.
            try:
                gpu_temp = pynvml.nvmlDeviceGetTemperature(handle, pynvml.NVML_TEMPERATURE_GPU)
                yield Observation(
                    value=gpu_temp, attributes={"gpu_id": str(i), "gpu_name": device_name}
                )
            except Exception as e:
                logger.debug("Failed to get GPU temperature for GPU %d: %s", i, e)
    except Exception as e:
        logger.error("Error observing GPU temperature: %s", e)
    finally:
        # Always pair the nvmlInit() above with a shutdown, including when a
        # device lookup raised or the consumer stopped iterating early, so
        # the init/shutdown calls stay balanced.
        if nvml_initialized:
            try:
                pynvml.nvmlShutdown()
            except Exception as e:
                logger.error("Error observing GPU temperature: %s", e)
|
|
346
|
+
|
|
347
|
+
def _observe_gpu_power(self, options):
    """Observable callback for GPU power consumption.

    Yields one ``Observation`` per GPU with the power draw in watts, tagged
    with ``gpu_id`` and ``gpu_name``. Yields nothing when NVML is
    unavailable or no GPU was detected.

    Args:
        options: Callback options argument; not used here.
    """
    if not NVML_AVAILABLE or not self.gpu_available:
        return

    nvml_initialized = False
    try:
        pynvml.nvmlInit()
        nvml_initialized = True

        for i in range(pynvml.nvmlDeviceGetCount()):
            handle = pynvml.nvmlDeviceGetHandleByIndex(i)
            device_name = self._get_device_name(handle, i)

            # A failure on one GPU must not hide the readings of the others.
            try:
                # Power usage is returned in milliwatts, convert to watts
                power_mw = pynvml.nvmlDeviceGetPowerUsage(handle)
                power_w = power_mw / 1000.0
                yield Observation(
                    value=power_w, attributes={"gpu_id": str(i), "gpu_name": device_name}
                )
            except Exception as e:
                logger.debug("Failed to get GPU power for GPU %d: %s", i, e)
    except Exception as e:
        logger.error("Error observing GPU power: %s", e)
    finally:
        # Always pair the nvmlInit() above with a shutdown, including when a
        # device lookup raised or the consumer stopped iterating early, so
        # the init/shutdown calls stay balanced.
        if nvml_initialized:
            try:
                pynvml.nvmlShutdown()
            except Exception as e:
                logger.error("Error observing GPU power: %s", e)
|
|
373
|
+
|
|
374
|
+
def start(self):
    """Begin background collection of CO2 and power-cost metrics.

    The observable gauges are read through their registered callbacks, so
    the only thing started here is the codecarbon tracker (when in use) and
    the daemon thread running ``_collect_loop``. Does nothing when NVML is
    unavailable or no GPU was detected.
    """
    if not NVML_AVAILABLE:
        logger.warning("Cannot start GPU metrics collection - nvidia-ml-py not available")
        return
    if not self.gpu_available:
        return

    # Bring up the codecarbon tracker first; on failure fall back to the
    # manual CO2 calculation performed by the collection loop.
    tracker = self._emissions_tracker
    if self._use_codecarbon and tracker:
        try:
            tracker.start()
            self._last_emissions_kg = 0.0
            logger.info("Codecarbon emissions tracker started")
        except Exception as e:
            logger.warning("Failed to start codecarbon tracker: %s", e)
            self._use_codecarbon = False

    logger.info("Starting GPU metrics collection (CO2 tracking)")
    worker = threading.Thread(target=self._collect_loop, daemon=True)
    self._thread = worker
    worker.start()
|
|
401
|
+
|
|
402
|
+
def _collect_loop(self):
    """Background loop that accumulates GPU energy, CO2 and power cost.

    Wakes every ``self.interval`` seconds until the stop event is set. Each
    pass first forwards codecarbon emissions (when CO2 tracking is enabled).
    Then, for every GPU, it multiplies the current power draw by the time
    elapsed since that GPU's previous sample to get energy in Wh, and adds
    the derived CO2 (manual mode only) and electricity cost to their
    counters. An error on one GPU is logged and does not stop the loop.
    """
    while not self._stop_event.wait(self.interval):
        current_time = time.time()

        # Collect CO2 emissions from codecarbon if available
        if self.config.enable_co2_tracking:
            self._collect_codecarbon_emissions()

        for i in range(self.device_count):
            try:
                handle = self.nvml.nvmlDeviceGetHandleByIndex(i)
                power_w = self.nvml.nvmlDeviceGetPowerUsage(handle) / 1000.0  # mW -> W
                delta_time_hours = (current_time - self.last_timestamp[i]) / 3600.0

                # Energy in Wh is watts * hours. The previous expression
                # multiplied kilowatts by seconds, which overstated energy
                # (and the CO2 and cost derived from it) by a factor of 3.6.
                delta_energy_wh = power_w * delta_time_hours
                self.cumulative_energy_wh[i] += delta_energy_wh

                # Carbon intensity and electricity price are both per kWh.
                delta_energy_kwh = delta_energy_wh / 1000.0

                # Calculate and record CO2 emissions using manual calculation
                # (only if codecarbon is not available/enabled)
                if self.config.enable_co2_tracking and not self._use_codecarbon:
                    delta_co2_g = delta_energy_kwh * self.config.carbon_intensity  # gCO2e
                    self.co2_counter.add(delta_co2_g, {"gpu_id": str(i)})

                # Calculate and record power cost
                delta_cost_usd = delta_energy_kwh * self.config.power_cost_per_kwh
                device_name = self._get_device_name(handle, i)
                self.power_cost_counter.add(
                    delta_cost_usd, {"gpu_id": str(i), "gpu_name": device_name}
                )

                # Only advance the sample time after a successful reading so
                # a failed pass is covered by the next interval.
                self.last_timestamp[i] = current_time
            except Exception as e:
                logger.error("Error collecting GPU %d metrics: %s", i, e)
|
|
439
|
+
|
|
440
|
+
def _collect_codecarbon_emissions(self):
    """Report the emissions codecarbon accumulated since the last call.

    Flushes the tracker, reads its running total (kg CO2e), and adds the
    positive difference from the previously seen total to ``co2_counter``
    in grams. Any exception is logged at debug level and swallowed.
    """
    tracker = self._emissions_tracker
    if not (self._use_codecarbon and tracker):
        return

    try:
        # Flush so the tracker refreshes its state without being stopped.
        tracker.flush()

        # NOTE(review): this reads a private codecarbon attribute; confirm it
        # exists in the codecarbon version this package pins.
        total_kg = tracker._total_emissions.total  # kg CO2e

        increase_kg = total_kg - self._last_emissions_kg
        if increase_kg > 0:
            # The counter is expressed in grams.
            increase_g = increase_kg * 1000.0
            attributes = {
                "source": "codecarbon",
                "country": self.config.co2_country_iso_code or "auto",
                "region": self.config.co2_region or "auto",
            }
            self.co2_counter.add(increase_g, attributes)
            logger.debug(
                "Recorded %.4f gCO2e emissions from codecarbon (total: %.4f kg)",
                increase_g,
                total_kg,
            )

        self._last_emissions_kg = total_kg

    except Exception as e:
        logger.debug("Error collecting codecarbon emissions: %s", e)
|
|
478
|
+
|
|
479
|
+
def stop(self):
    """Stop background collection and release codecarbon and NVML.

    Signals the collection thread to exit and waits up to five seconds for
    it, stops the codecarbon tracker (recording any emissions not yet
    reported), and finally shuts NVML down. Errors from codecarbon or NVML
    are logged at debug level and not raised.
    """
    # Ask the CO2 collection thread to finish.
    self._stop_event.set()
    worker = self._thread
    if worker and worker.is_alive():
        worker.join(timeout=5)
        logger.info("GPU CO2 metrics collection thread stopped.")

    # Stop codecarbon and account for whatever it measured since the last
    # periodic collection.
    tracker = self._emissions_tracker
    if self._use_codecarbon and tracker:
        try:
            final_emissions_kg = tracker.stop()
            if final_emissions_kg is not None:
                remaining_kg = final_emissions_kg - self._last_emissions_kg
                if remaining_kg > 0:
                    remaining_g = remaining_kg * 1000.0
                    attributes = {
                        "source": "codecarbon",
                        "country": self.config.co2_country_iso_code or "auto",
                        "region": self.config.co2_region or "auto",
                    }
                    self.co2_counter.add(remaining_g, attributes)
                logger.info(
                    "Codecarbon emissions tracker stopped. Total emissions: %.4f kg CO2e",
                    final_emissions_kg,
                )
        except Exception as e:
            logger.debug("Error stopping codecarbon tracker: %s", e)

    # ObservableGauges will automatically stop when MeterProvider is shutdown
    if self.gpu_available and NVML_AVAILABLE:
        try:
            pynvml.nvmlShutdown()
        except Exception as e:
            logger.debug("Error shutting down NVML: %s", e)
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
"""Module for OpenTelemetry instrumentors for various LLM providers and frameworks.
|
|
2
|
+
|
|
3
|
+
This package contains individual instrumentor classes for different Generative AI
|
|
4
|
+
libraries and frameworks, allowing for automatic tracing and metric collection
|
|
5
|
+
of their operations.
|
|
6
|
+
|
|
7
|
+
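Every instrumentor module is imported eagerly by this package. The instrumentors
are expected to check for their own optional dependencies, so importing this
package should not fail when a provider library is not installed.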
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from .anthropic_instrumentor import AnthropicInstrumentor
|
|
12
|
+
from .anyscale_instrumentor import AnyscaleInstrumentor
|
|
13
|
+
from .autogen_instrumentor import AutoGenInstrumentor
|
|
14
|
+
from .aws_bedrock_instrumentor import AWSBedrockInstrumentor
|
|
15
|
+
from .azure_openai_instrumentor import AzureOpenAIInstrumentor
|
|
16
|
+
from .bedrock_agents_instrumentor import BedrockAgentsInstrumentor
|
|
17
|
+
from .cohere_instrumentor import CohereInstrumentor
|
|
18
|
+
from .crewai_instrumentor import CrewAIInstrumentor
|
|
19
|
+
from .dspy_instrumentor import DSPyInstrumentor
|
|
20
|
+
from .google_ai_instrumentor import GoogleAIInstrumentor
|
|
21
|
+
from .groq_instrumentor import GroqInstrumentor
|
|
22
|
+
from .guardrails_ai_instrumentor import GuardrailsAIInstrumentor
|
|
23
|
+
from .haystack_instrumentor import HaystackInstrumentor
|
|
24
|
+
from .huggingface_instrumentor import HuggingFaceInstrumentor
|
|
25
|
+
from .hyperbolic_instrumentor import HyperbolicInstrumentor
|
|
26
|
+
from .instructor_instrumentor import InstructorInstrumentor
|
|
27
|
+
from .langchain_instrumentor import LangChainInstrumentor
|
|
28
|
+
from .langgraph_instrumentor import LangGraphInstrumentor
|
|
29
|
+
from .llamaindex_instrumentor import LlamaIndexInstrumentor
|
|
30
|
+
from .mistralai_instrumentor import MistralAIInstrumentor
|
|
31
|
+
from .ollama_instrumentor import OllamaInstrumentor
|
|
32
|
+
from .openai_agents_instrumentor import OpenAIAgentsInstrumentor
|
|
33
|
+
|
|
34
|
+
# Import instrumentors only - they handle their own dependency checking
|
|
35
|
+
from .openai_instrumentor import OpenAIInstrumentor
|
|
36
|
+
from .pydantic_ai_instrumentor import PydanticAIInstrumentor
|
|
37
|
+
from .replicate_instrumentor import ReplicateInstrumentor
|
|
38
|
+
from .sambanova_instrumentor import SambaNovaInstrumentor
|
|
39
|
+
from .togetherai_instrumentor import TogetherAIInstrumentor
|
|
40
|
+
from .vertexai_instrumentor import VertexAIInstrumentor
|
|
41
|
+
|
|
42
|
+
# Public API of the instrumentors package: the instrumentor classes imported
# by this module, re-exported for `from genai_otel.instrumentors import *`.
__all__ = [
    "OpenAIInstrumentor",
    "OpenAIAgentsInstrumentor",
    "AnthropicInstrumentor",
    "GoogleAIInstrumentor",
    "AWSBedrockInstrumentor",
    "AzureOpenAIInstrumentor",
    "AutoGenInstrumentor",
    "BedrockAgentsInstrumentor",
    "CohereInstrumentor",
    "CrewAIInstrumentor",
    "DSPyInstrumentor",
    "MistralAIInstrumentor",
    "TogetherAIInstrumentor",
    "GroqInstrumentor",
    "GuardrailsAIInstrumentor",
    "HaystackInstrumentor",
    "InstructorInstrumentor",
    "OllamaInstrumentor",
    "VertexAIInstrumentor",
    "ReplicateInstrumentor",
    "AnyscaleInstrumentor",
    "SambaNovaInstrumentor",
    "HyperbolicInstrumentor",
    "LangChainInstrumentor",
    "LangGraphInstrumentor",
    "LlamaIndexInstrumentor",
    "HuggingFaceInstrumentor",
    "PydanticAIInstrumentor",
]
|