openlit 1.27.1.tar.gz → 1.29.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78)
  1. {openlit-1.27.1 → openlit-1.29.0}/PKG-INFO +4 -4
  2. {openlit-1.27.1 → openlit-1.29.0}/README.md +2 -2
  3. {openlit-1.27.1 → openlit-1.29.0}/pyproject.toml +2 -2
  4. {openlit-1.27.1 → openlit-1.29.0}/src/openlit/__init__.py +2 -2
  5. openlit-1.29.0/src/openlit/instrumentation/gpu/__init__.py +213 -0
  6. openlit-1.27.1/src/openlit/instrumentation/gpu/__init__.py +0 -132
  7. {openlit-1.27.1 → openlit-1.29.0}/LICENSE +0 -0
  8. {openlit-1.27.1 → openlit-1.29.0}/src/openlit/__helpers.py +0 -0
  9. {openlit-1.27.1 → openlit-1.29.0}/src/openlit/evals/__init__.py +0 -0
  10. {openlit-1.27.1 → openlit-1.29.0}/src/openlit/evals/all.py +0 -0
  11. {openlit-1.27.1 → openlit-1.29.0}/src/openlit/evals/bias_detection.py +0 -0
  12. {openlit-1.27.1 → openlit-1.29.0}/src/openlit/evals/hallucination.py +0 -0
  13. {openlit-1.27.1 → openlit-1.29.0}/src/openlit/evals/toxicity.py +0 -0
  14. {openlit-1.27.1 → openlit-1.29.0}/src/openlit/evals/utils.py +0 -0
  15. {openlit-1.27.1 → openlit-1.29.0}/src/openlit/guard/__init__.py +0 -0
  16. {openlit-1.27.1 → openlit-1.29.0}/src/openlit/guard/all.py +0 -0
  17. {openlit-1.27.1 → openlit-1.29.0}/src/openlit/guard/prompt_injection.py +0 -0
  18. {openlit-1.27.1 → openlit-1.29.0}/src/openlit/guard/restrict_topic.py +0 -0
  19. {openlit-1.27.1 → openlit-1.29.0}/src/openlit/guard/sensitive_topic.py +0 -0
  20. {openlit-1.27.1 → openlit-1.29.0}/src/openlit/guard/utils.py +0 -0
  21. {openlit-1.27.1 → openlit-1.29.0}/src/openlit/instrumentation/anthropic/__init__.py +0 -0
  22. {openlit-1.27.1 → openlit-1.29.0}/src/openlit/instrumentation/anthropic/anthropic.py +0 -0
  23. {openlit-1.27.1 → openlit-1.29.0}/src/openlit/instrumentation/anthropic/async_anthropic.py +0 -0
  24. {openlit-1.27.1 → openlit-1.29.0}/src/openlit/instrumentation/azure_ai_inference/__init__.py +0 -0
  25. {openlit-1.27.1 → openlit-1.29.0}/src/openlit/instrumentation/azure_ai_inference/async_azure_ai_inference.py +0 -0
  26. {openlit-1.27.1 → openlit-1.29.0}/src/openlit/instrumentation/azure_ai_inference/azure_ai_inference.py +0 -0
  27. {openlit-1.27.1 → openlit-1.29.0}/src/openlit/instrumentation/bedrock/__init__.py +0 -0
  28. {openlit-1.27.1 → openlit-1.29.0}/src/openlit/instrumentation/bedrock/bedrock.py +0 -0
  29. {openlit-1.27.1 → openlit-1.29.0}/src/openlit/instrumentation/chroma/__init__.py +0 -0
  30. {openlit-1.27.1 → openlit-1.29.0}/src/openlit/instrumentation/chroma/chroma.py +0 -0
  31. {openlit-1.27.1 → openlit-1.29.0}/src/openlit/instrumentation/cohere/__init__.py +0 -0
  32. {openlit-1.27.1 → openlit-1.29.0}/src/openlit/instrumentation/cohere/cohere.py +0 -0
  33. {openlit-1.27.1 → openlit-1.29.0}/src/openlit/instrumentation/elevenlabs/__init__.py +0 -0
  34. {openlit-1.27.1 → openlit-1.29.0}/src/openlit/instrumentation/elevenlabs/async_elevenlabs.py +0 -0
  35. {openlit-1.27.1 → openlit-1.29.0}/src/openlit/instrumentation/elevenlabs/elevenlabs.py +0 -0
  36. {openlit-1.27.1 → openlit-1.29.0}/src/openlit/instrumentation/embedchain/__init__.py +0 -0
  37. {openlit-1.27.1 → openlit-1.29.0}/src/openlit/instrumentation/embedchain/embedchain.py +0 -0
  38. {openlit-1.27.1 → openlit-1.29.0}/src/openlit/instrumentation/google_ai_studio/__init__.py +0 -0
  39. {openlit-1.27.1 → openlit-1.29.0}/src/openlit/instrumentation/google_ai_studio/async_google_ai_studio.py +0 -0
  40. {openlit-1.27.1 → openlit-1.29.0}/src/openlit/instrumentation/google_ai_studio/google_ai_studio.py +0 -0
  41. {openlit-1.27.1 → openlit-1.29.0}/src/openlit/instrumentation/gpt4all/__init__.py +0 -0
  42. {openlit-1.27.1 → openlit-1.29.0}/src/openlit/instrumentation/gpt4all/gpt4all.py +0 -0
  43. {openlit-1.27.1 → openlit-1.29.0}/src/openlit/instrumentation/groq/__init__.py +0 -0
  44. {openlit-1.27.1 → openlit-1.29.0}/src/openlit/instrumentation/groq/async_groq.py +0 -0
  45. {openlit-1.27.1 → openlit-1.29.0}/src/openlit/instrumentation/groq/groq.py +0 -0
  46. {openlit-1.27.1 → openlit-1.29.0}/src/openlit/instrumentation/haystack/__init__.py +0 -0
  47. {openlit-1.27.1 → openlit-1.29.0}/src/openlit/instrumentation/haystack/haystack.py +0 -0
  48. {openlit-1.27.1 → openlit-1.29.0}/src/openlit/instrumentation/langchain/__init__.py +0 -0
  49. {openlit-1.27.1 → openlit-1.29.0}/src/openlit/instrumentation/langchain/langchain.py +0 -0
  50. {openlit-1.27.1 → openlit-1.29.0}/src/openlit/instrumentation/llamaindex/__init__.py +0 -0
  51. {openlit-1.27.1 → openlit-1.29.0}/src/openlit/instrumentation/llamaindex/llamaindex.py +0 -0
  52. {openlit-1.27.1 → openlit-1.29.0}/src/openlit/instrumentation/milvus/__init__.py +0 -0
  53. {openlit-1.27.1 → openlit-1.29.0}/src/openlit/instrumentation/milvus/milvus.py +0 -0
  54. {openlit-1.27.1 → openlit-1.29.0}/src/openlit/instrumentation/mistral/__init__.py +0 -0
  55. {openlit-1.27.1 → openlit-1.29.0}/src/openlit/instrumentation/mistral/async_mistral.py +0 -0
  56. {openlit-1.27.1 → openlit-1.29.0}/src/openlit/instrumentation/mistral/mistral.py +0 -0
  57. {openlit-1.27.1 → openlit-1.29.0}/src/openlit/instrumentation/ollama/__init__.py +0 -0
  58. {openlit-1.27.1 → openlit-1.29.0}/src/openlit/instrumentation/ollama/async_ollama.py +0 -0
  59. {openlit-1.27.1 → openlit-1.29.0}/src/openlit/instrumentation/ollama/ollama.py +0 -0
  60. {openlit-1.27.1 → openlit-1.29.0}/src/openlit/instrumentation/openai/__init__.py +0 -0
  61. {openlit-1.27.1 → openlit-1.29.0}/src/openlit/instrumentation/openai/async_azure_openai.py +0 -0
  62. {openlit-1.27.1 → openlit-1.29.0}/src/openlit/instrumentation/openai/async_openai.py +0 -0
  63. {openlit-1.27.1 → openlit-1.29.0}/src/openlit/instrumentation/openai/azure_openai.py +0 -0
  64. {openlit-1.27.1 → openlit-1.29.0}/src/openlit/instrumentation/openai/openai.py +0 -0
  65. {openlit-1.27.1 → openlit-1.29.0}/src/openlit/instrumentation/pinecone/__init__.py +0 -0
  66. {openlit-1.27.1 → openlit-1.29.0}/src/openlit/instrumentation/pinecone/pinecone.py +0 -0
  67. {openlit-1.27.1 → openlit-1.29.0}/src/openlit/instrumentation/qdrant/__init__.py +0 -0
  68. {openlit-1.27.1 → openlit-1.29.0}/src/openlit/instrumentation/qdrant/qdrant.py +0 -0
  69. {openlit-1.27.1 → openlit-1.29.0}/src/openlit/instrumentation/transformers/__init__.py +0 -0
  70. {openlit-1.27.1 → openlit-1.29.0}/src/openlit/instrumentation/transformers/transformers.py +0 -0
  71. {openlit-1.27.1 → openlit-1.29.0}/src/openlit/instrumentation/vertexai/__init__.py +0 -0
  72. {openlit-1.27.1 → openlit-1.29.0}/src/openlit/instrumentation/vertexai/async_vertexai.py +0 -0
  73. {openlit-1.27.1 → openlit-1.29.0}/src/openlit/instrumentation/vertexai/vertexai.py +0 -0
  74. {openlit-1.27.1 → openlit-1.29.0}/src/openlit/instrumentation/vllm/__init__.py +0 -0
  75. {openlit-1.27.1 → openlit-1.29.0}/src/openlit/instrumentation/vllm/vllm.py +0 -0
  76. {openlit-1.27.1 → openlit-1.29.0}/src/openlit/otel/metrics.py +0 -0
  77. {openlit-1.27.1 → openlit-1.29.0}/src/openlit/otel/tracing.py +0 -0
  78. {openlit-1.27.1 → openlit-1.29.0}/src/openlit/semcov/__init__.py +0 -0
{openlit-1.27.1 → openlit-1.29.0}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: openlit
-Version: 1.27.1
+Version: 1.29.0
 Summary: OpenTelemetry-native Auto instrumentation library for monitoring LLM Applications and GPUs, facilitating the integration of observability into your GenAI-driven projects
 Home-page: https://github.com/openlit/openlit/tree/main/openlit/python
 Keywords: OpenTelemetry,otel,otlp,llm,tracing,openai,anthropic,claude,cohere,llm monitoring,observability,monitoring,gpt,Generative AI,chatGPT,gpu
@@ -16,7 +16,6 @@ Classifier: Programming Language :: Python :: 3.13
 Requires-Dist: anthropic (>=0.21.0,<0.22.0)
 Requires-Dist: boto3 (>=1.34.0,<2.0.0)
 Requires-Dist: botocore (>=1.34.0,<2.0.0)
-Requires-Dist: gpustat (>=1.1.1,<2.0.0)
 Requires-Dist: openai (>=1.1.1,<2.0.0)
 Requires-Dist: opentelemetry-api (>=1.27.0,<2.0.0)
 Requires-Dist: opentelemetry-exporter-otlp (>=1.27.0,<2.0.0)
@@ -26,6 +25,7 @@ Requires-Dist: pydantic (>=2.0.0,<3.0.0)
 Requires-Dist: requests (>=2.26.0,<3.0.0)
 Requires-Dist: schedule (>=1.2.2,<2.0.0)
 Requires-Dist: tiktoken (>=0.7.0,<0.8.0)
+Requires-Dist: xmltodict (>=0.13.0,<0.14.0)
 Project-URL: Repository, https://github.com/openlit/openlit/tree/main/openlit/python
 Description-Content-Type: text/markdown
 
@@ -65,8 +65,8 @@ This project proudly follows and maintains the [Semantic Conventions](https://gi
 
 | LLMs | Vector DBs | Frameworks | GPUs |
 |--------------------------------------------------------------------------|----------------------------------------------|----------------------------------------------|---------------|
-| [✅ OpenAI](https://docs.openlit.io/latest/integrations/openai) | [✅ ChromaDB](https://docs.openlit.io/latest/integrations/chromadb) | [✅ Langchain](https://docs.openlit.io/latest/integrations/langchain) | [✅ NVIDIA GPUs](https://docs.openlit.io/latest/integrations/nvidia-gpu) |
-| [✅ Ollama](https://docs.openlit.io/latest/integrations/ollama) | [✅ Pinecone](https://docs.openlit.io/latest/integrations/pinecone) | [✅ LiteLLM](https://docs.openlit.io/latest/integrations/litellm) | |
+| [✅ OpenAI](https://docs.openlit.io/latest/integrations/openai) | [✅ ChromaDB](https://docs.openlit.io/latest/integrations/chromadb) | [✅ Langchain](https://docs.openlit.io/latest/integrations/langchain) | [✅ NVIDIA](https://docs.openlit.io/latest/integrations/nvidia-gpu) |
+| [✅ Ollama](https://docs.openlit.io/latest/integrations/ollama) | [✅ Pinecone](https://docs.openlit.io/latest/integrations/pinecone) | [✅ LiteLLM](https://docs.openlit.io/latest/integrations/litellm) | [✅ AMD](#) |
 | [✅ Anthropic](https://docs.openlit.io/latest/integrations/anthropic) | [✅ Qdrant](https://docs.openlit.io/latest/integrations/qdrant) | [✅ LlamaIndex](https://docs.openlit.io/latest/integrations/llama-index) | |
 | [✅ GPT4All](https://docs.openlit.io/latest/integrations/gpt4all) | [✅ Milvus](https://docs.openlit.io/latest/integrations/milvus) | [✅ Haystack](https://docs.openlit.io/latest/integrations/haystack) | |
 | [✅ Cohere](https://docs.openlit.io/latest/integrations/cohere) | | [✅ EmbedChain](https://docs.openlit.io/latest/integrations/embedchain) | |
{openlit-1.27.1 → openlit-1.29.0}/README.md
@@ -34,8 +34,8 @@ This project proudly follows and maintains the [Semantic Conventions](https://gi
 
 | LLMs | Vector DBs | Frameworks | GPUs |
 |--------------------------------------------------------------------------|----------------------------------------------|----------------------------------------------|---------------|
-| [✅ OpenAI](https://docs.openlit.io/latest/integrations/openai) | [✅ ChromaDB](https://docs.openlit.io/latest/integrations/chromadb) | [✅ Langchain](https://docs.openlit.io/latest/integrations/langchain) | [✅ NVIDIA GPUs](https://docs.openlit.io/latest/integrations/nvidia-gpu) |
-| [✅ Ollama](https://docs.openlit.io/latest/integrations/ollama) | [✅ Pinecone](https://docs.openlit.io/latest/integrations/pinecone) | [✅ LiteLLM](https://docs.openlit.io/latest/integrations/litellm) | |
+| [✅ OpenAI](https://docs.openlit.io/latest/integrations/openai) | [✅ ChromaDB](https://docs.openlit.io/latest/integrations/chromadb) | [✅ Langchain](https://docs.openlit.io/latest/integrations/langchain) | [✅ NVIDIA](https://docs.openlit.io/latest/integrations/nvidia-gpu) |
+| [✅ Ollama](https://docs.openlit.io/latest/integrations/ollama) | [✅ Pinecone](https://docs.openlit.io/latest/integrations/pinecone) | [✅ LiteLLM](https://docs.openlit.io/latest/integrations/litellm) | [✅ AMD](#) |
 | [✅ Anthropic](https://docs.openlit.io/latest/integrations/anthropic) | [✅ Qdrant](https://docs.openlit.io/latest/integrations/qdrant) | [✅ LlamaIndex](https://docs.openlit.io/latest/integrations/llama-index) | |
 | [✅ GPT4All](https://docs.openlit.io/latest/integrations/gpt4all) | [✅ Milvus](https://docs.openlit.io/latest/integrations/milvus) | [✅ Haystack](https://docs.openlit.io/latest/integrations/haystack) | |
 | [✅ Cohere](https://docs.openlit.io/latest/integrations/cohere) | | [✅ EmbedChain](https://docs.openlit.io/latest/integrations/embedchain) | |
{openlit-1.27.1 → openlit-1.29.0}/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "openlit"
-version = "1.27.1"
+version = "1.29.0"
 description = "OpenTelemetry-native Auto instrumentation library for monitoring LLM Applications and GPUs, facilitating the integration of observability into your GenAI-driven projects"
 authors = ["OpenLIT"]
 repository = "https://github.com/openlit/openlit/tree/main/openlit/python"
@@ -14,7 +14,7 @@ requests = "^2.26.0"
 schedule = "^1.2.2"
 tiktoken = "^0.7.0"
 pydantic = "^2.0.0"
-gpustat = "^1.1.1"
+xmltodict = "^0.13.0"
 boto3 = "^1.34.0"
 botocore = "^1.34.0"
 opentelemetry-api = "^1.27.0"
{openlit-1.27.1 → openlit-1.29.0}/src/openlit/__init__.py
@@ -46,7 +46,7 @@ from openlit.instrumentation.pinecone import PineconeInstrumentor
 from openlit.instrumentation.qdrant import QdrantInstrumentor
 from openlit.instrumentation.milvus import MilvusInstrumentor
 from openlit.instrumentation.transformers import TransformersInstrumentor
-from openlit.instrumentation.gpu import NvidiaGPUInstrumentor
+from openlit.instrumentation.gpu import GPUInstrumentor
 import openlit.guard
 import openlit.evals
 
@@ -313,7 +313,7 @@ def init(environment="default", application_name="default", tracer=None, otlp_en
         disabled_instrumentors, module_name_map)
 
     if not disable_metrics and collect_gpu_stats:
-        NvidiaGPUInstrumentor().instrument(
+        GPUInstrumentor().instrument(
            environment=config.environment,
            application_name=config.application_name,
        )
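
From the application side the wiring is unchanged; only the instrumentor class behind it differs. A minimal sketch of enabling GPU stats, using only the keyword arguments visible in the `init` signature and hunk above (the values passed are illustrative placeholders, not taken from this diff):

```python
# Minimal sketch: enabling GPU metrics with openlit 1.29.0.
import openlit

openlit.init(
    environment="production",         # hypothetical deployment name
    application_name="my-genai-app",  # hypothetical application name
    collect_gpu_stats=True,           # in 1.29.0 this starts GPUInstrumentor (NVIDIA or AMD)
)
```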
openlit-1.29.0/src/openlit/instrumentation/gpu/__init__.py
@@ -0,0 +1,213 @@
+# pylint: disable=useless-return, bad-staticmethod-argument, duplicate-code, import-outside-toplevel, broad-exception-caught, unused-argument, import-error, too-many-return-statements, superfluous-parens
+"""Initializer of Auto Instrumentation of GPU Metrics"""
+
+from typing import Collection, Iterable
+import logging
+from functools import partial
+from opentelemetry.instrumentation.instrumentor import BaseInstrumentor
+from opentelemetry.sdk.resources import TELEMETRY_SDK_NAME
+from opentelemetry.metrics import get_meter, CallbackOptions, Observation
+from openlit.semcov import SemanticConvetion
+
+# Initialize logger for logging potential issues and operations
+logger = logging.getLogger(__name__)
+
+class GPUInstrumentor(BaseInstrumentor):
+    """
+    An instrumentor for collecting GPU metrics.
+    """
+
+    def instrumentation_dependencies(self) -> Collection[str]:
+        return []
+
+    def _instrument(self, **kwargs):
+        application_name = kwargs.get("application_name", "default")
+        environment = kwargs.get("environment", "default")
+        # pylint: disable=attribute-defined-outside-init
+        self.gpu_type = self._get_gpu_type()
+        meter = get_meter(
+            __name__,
+            "0.1.0",
+            schema_url="https://opentelemetry.io/schemas/1.11.0",
+        )
+
+        if not self.gpu_type:
+            logger.error(
+                "OpenLIT GPU Instrumentation Error: No supported GPUs found."
+                "If this is a non-GPU host, set `collect_gpu_stats=False` to disable GPU stats."
+            )
+            return
+
+        metric_names = [
+            ("GPU_UTILIZATION", "utilization"),
+            ("GPU_UTILIZATION_ENC", "utilization_enc"),
+            ("GPU_UTILIZATION_DEC", "utilization_dec"),
+            ("GPU_TEMPERATURE", "temperature"),
+            ("GPU_FAN_SPEED", "fan_speed"),
+            ("GPU_MEMORY_AVAILABLE", "memory_available"),
+            ("GPU_MEMORY_TOTAL", "memory_total"),
+            ("GPU_MEMORY_USED", "memory_used"),
+            ("GPU_MEMORY_FREE", "memory_free"),
+            ("GPU_POWER_DRAW", "power_draw"),
+            ("GPU_POWER_LIMIT", "power_limit"),
+        ]
+
+        for semantic_name, internal_name in metric_names:
+            meter.create_observable_gauge(
+                name=getattr(SemanticConvetion, semantic_name),
+                callbacks=[partial(self._collect_metric,
+                                   environment, application_name, internal_name)],
+                description=f"GPU {internal_name.replace('_', ' ').title()}",
+            )
+
+    def _uninstrument(self, **kwargs):
+        # Proper uninstrumentation logic to revert patched methods
+        pass
+
+    def _get_gpu_type(self) -> str:
+        try:
+            import pynvml
+            pynvml.nvmlInit()
+            return "nvidia"
+        except Exception:
+            try:
+                import amdsmi
+                amdsmi.amdsmi_init()
+                return "amd"
+            except Exception:
+                return None
+
+
+    def _collect_metric(self, environment, application_name,
+                        metric_name,
+                        options: CallbackOptions) -> Iterable[Observation]:
+        # pylint: disable=no-else-return
+        if self.gpu_type == "nvidia":
+            return self._collect_nvidia_metrics(environment, application_name, metric_name, options)
+        elif self.gpu_type == "amd":
+            return self._collect_amd_metrics(environment, application_name, metric_name, options)
+        return []
+
+    def _collect_nvidia_metrics(self, environment, application_name,
+                                metric_name,
+                                options: CallbackOptions) -> Iterable[Observation]:
+        try:
+            import pynvml
+            gpu_count = pynvml.nvmlDeviceGetCount()
+            mega_bytes = 1024 * 1024
+            gpu_index = 0
+            for gpu_index in range(gpu_count):
+                handle = pynvml.nvmlDeviceGetHandleByIndex(gpu_index)
+
+                def get_metric_value(handle, metric_name):
+                    try:
+                        # pylint: disable=no-else-return
+                        if metric_name == "temperature":
+                            return pynvml.nvmlDeviceGetTemperature(handle,
+                                                                   pynvml.NVML_TEMPERATURE_GPU)
+                        elif metric_name == "utilization":
+                            return pynvml.nvmlDeviceGetUtilizationRates(handle).gpu
+                        elif metric_name == "utilization_enc":
+                            return pynvml.nvmlDeviceGetEncoderUtilization(handle)[0]
+                        elif metric_name == "utilization_dec":
+                            return pynvml.nvmlDeviceGetDecoderUtilization(handle)[0]
+                        elif metric_name == "fan_speed":
+                            return 0
+                        elif metric_name == "memory_available":
+                            memory_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
+                            return (memory_info.free // mega_bytes) # Assuming reserved memory is 0
+                        elif metric_name == "memory_total":
+                            return (pynvml.nvmlDeviceGetMemoryInfo(handle).total // mega_bytes)
+                        elif metric_name == "memory_used":
+                            return (pynvml.nvmlDeviceGetMemoryInfo(handle).used // mega_bytes)
+                        elif metric_name == "memory_free":
+                            return (pynvml.nvmlDeviceGetMemoryInfo(handle).free // mega_bytes)
+                        elif metric_name == "power_draw":
+                            return (pynvml.nvmlDeviceGetPowerUsage(handle) // 1000.0)
+                        elif metric_name == "power_limit":
+                            return (pynvml.nvmlDeviceGetEnforcedPowerLimit(handle) // 1000.0)
+                    except Exception as e:
+                        # pylint: disable=cell-var-from-loop
+                        logger.error("Error collecting metric %s for GPU %d: %s", metric_name,
+                                     gpu_index, e)
+                    return 0
+
+                def safe_decode(byte_string):
+                    if isinstance(byte_string, bytes):
+                        return byte_string.decode('utf-8')
+                    return byte_string
+
+                attributes = {
+                    TELEMETRY_SDK_NAME: "openlit",
+                    SemanticConvetion.GEN_AI_APPLICATION_NAME: application_name,
+                    SemanticConvetion.GEN_AI_ENVIRONMENT: environment,
+                    SemanticConvetion.GPU_INDEX: str(gpu_index),
+                    SemanticConvetion.GPU_UUID: safe_decode(pynvml.nvmlDeviceGetUUID(handle)),
+                    SemanticConvetion.GPU_NAME: safe_decode(pynvml.nvmlDeviceGetName(handle))
+                }
+                yield Observation(get_metric_value(handle, metric_name), attributes)
+
+        except Exception as e:
+            logger.error("Error in GPU metrics collection: %s", e)
+
+    def _collect_amd_metrics(self, environment, application_name,
+                             metric_name,
+                             options: CallbackOptions) -> Iterable[Observation]:
+        try:
+            import amdsmi
+            # Get the number of AMD GPUs
+            devices = amdsmi.amdsmi_get_processor_handles()
+            mega_bytes = 1024 * 1024
+            for device_handle in devices:
+
+                def get_metric_value(device_handle, metric_name):
+                    try:
+                        # pylint: disable=no-else-return
+                        if metric_name == "temperature":
+                            # pylint: disable=line-too-long
+                            return amdsmi.amdsmi_get_temp_metric(device_handle,
+                                                                 amdsmi.AmdSmiTemperatureType.EDGE,
+                                                                 amdsmi.AmdSmiTemperatureMetric.CURRENT)
+                        elif metric_name == "utilization":
+                            # pylint: disable=line-too-long
+                            return amdsmi.amdsmi_get_utilization_count(device_handle,
+                                                                       amdsmi.AmdSmiUtilizationCounterType.COARSE_GRAIN_GFX_ACTIVITY)
+                        elif metric_name in ["utilization_enc", "utilization_dec"]:
+                            return 0 # Placeholder if unsupported
+                        elif metric_name == "fan_speed":
+                            return amdsmi.amdsmi_get_gpu_fan_speed(device_handle, 0)
+                        elif metric_name == "memory_available":
+                            return (amdsmi.amdsmi_get_gpu_memory_total(device_handle) // mega_bytes)
+                        elif metric_name == "memory_total":
+                            return (amdsmi.amdsmi_get_gpu_memory_total(device_handle) // mega_bytes)
+                        elif metric_name == "memory_used":
+                            return (amdsmi.amdsmi_get_gpu_memory_usage(device_handle) // mega_bytes)
+                        elif metric_name == "memory_free":
+                            total_mem = (amdsmi.amdsmi_get_gpu_memory_total(device_handle) // mega_bytes)
+                            used_mem = (amdsmi.amdsmi_get_gpu_memory_usage(device_handle) // mega_bytes)
+                            return (total_mem - used_mem)
+                        elif metric_name == "power_draw":
+                            # pylint: disable=line-too-long
+                            return (amdsmi.amdsmi_get_power_info(device_handle)['average_socket_power'] // 1000.0)
+                        elif metric_name == "power_limit":
+                            # pylint: disable=line-too-long
+                            return (amdsmi.amdsmi_get_power_info(device_handle)['power_limit'] // 1000.0)
+                    except Exception as e:
+                        logger.error("Error collecting metric %s for AMD GPU %d: %s", metric_name,
+                                     amdsmi.amdsmi_get_xgmi_info(device_handle)['index'], e)
+                    return 0
+
+                attributes = {
+                    TELEMETRY_SDK_NAME: "openlit",
+                    SemanticConvetion.GEN_AI_APPLICATION_NAME: application_name,
+                    SemanticConvetion.GEN_AI_ENVIRONMENT: environment,
+                    # pylint: disable=line-too-long
+                    SemanticConvetion.GPU_INDEX: amdsmi.amdsmi_get_xgmi_info(device_handle)['index'],
+                    # pylint: disable=line-too-long
+                    SemanticConvetion.GPU_UUID: amdsmi.amdsmi_get_gpu_asic_info(device_handle)['market_name'],
+                    SemanticConvetion.GPU_NAME: amdsmi.amdsmi_get_device_name(device_handle)
+                }
+                yield Observation(get_metric_value(device_handle, metric_name), attributes)
+
+        except Exception as e:
+            logger.error("Error in AMD GPU metrics collection: %s", e)
openlit-1.27.1/src/openlit/instrumentation/gpu/__init__.py
@@ -1,132 +0,0 @@
-# pylint: disable=useless-return, bad-staticmethod-argument, duplicate-code, import-outside-toplevel, broad-exception-caught, unused-argument
-"""Initializer of Auto Instrumentation of GPU Metrics"""
-
-from typing import Collection, Iterable
-import logging
-from functools import partial
-
-from opentelemetry.instrumentation.instrumentor import BaseInstrumentor
-from opentelemetry.sdk.resources import TELEMETRY_SDK_NAME
-from opentelemetry.metrics import get_meter, CallbackOptions, Observation
-
-from openlit.semcov import SemanticConvetion
-
-# Initialize logger for logging potential issues and operations
-logger = logging.getLogger(__name__)
-
-class NvidiaGPUInstrumentor(BaseInstrumentor):
-    """
-    An instrumentor for collecting NVIDIA GPU metrics.
-    """
-
-    def instrumentation_dependencies(self) -> Collection[str]:
-        return []
-
-    def _instrument(self, **kwargs):
-
-        application_name = kwargs.get("application_name", "default")
-        environment = kwargs.get("environment", "default")
-
-        meter = get_meter(
-            __name__,
-            "0.1.0",
-            schema_url="https://opentelemetry.io/schemas/1.11.0",
-        )
-
-        def check_and_record(value):
-            return value if value is not None else 0
-
-        meter.create_observable_gauge(
-            name=SemanticConvetion.GPU_UTILIZATION,
-            callbacks=[partial(self._collect_metric, environment,
-                               application_name, check_and_record, "utilization")],
-            description="GPU Utilization",
-        )
-        meter.create_observable_gauge(
-            name=SemanticConvetion.GPU_UTILIZATION_ENC,
-            callbacks=[partial(self._collect_metric, environment,
-                               application_name, check_and_record, "utilization_enc")],
-            description="GPU Encoder Utilization",
-        )
-        meter.create_observable_gauge(
-            name=SemanticConvetion.GPU_UTILIZATION_DEC,
-            callbacks=[partial(self._collect_metric, environment,
-                               application_name, check_and_record, "utilization_dec")],
-            description="GPU Decoder Utilization",
-        )
-        meter.create_observable_gauge(
-            name=SemanticConvetion.GPU_TEMPERATURE,
-            callbacks=[partial(self._collect_metric, environment,
-                               application_name, check_and_record, "temperature")],
-            description="GPU Temperature",
-        )
-        meter.create_observable_gauge(
-            name=SemanticConvetion.GPU_FAN_SPEED,
-            callbacks=[partial(self._collect_metric, environment,
-                               application_name, check_and_record, "fan_speed")],
-            description="GPU Fan Speed",
-        )
-        meter.create_observable_gauge(
-            name=SemanticConvetion.GPU_MEMORY_AVAILABLE,
-            callbacks=[partial(self._collect_metric, environment,
-                               application_name, check_and_record, "memory_available")],
-            description="GPU Memory Available",
-        )
-        meter.create_observable_gauge(
-            name=SemanticConvetion.GPU_MEMORY_TOTAL,
-            callbacks=[partial(self._collect_metric, environment,
-                               application_name, check_and_record, "memory_total")],
-            description="GPU Memory Total",
-        )
-        meter.create_observable_gauge(
-            name=SemanticConvetion.GPU_MEMORY_USED,
-            callbacks=[partial(self._collect_metric, environment,
-                               application_name, check_and_record, "memory_used")],
-            description="GPU Memory Used",
-        )
-        meter.create_observable_gauge(
-            name=SemanticConvetion.GPU_MEMORY_FREE,
-            callbacks=[partial(self._collect_metric, environment,
-                               application_name, check_and_record, "memory_free")],
-            description="GPU Memory Free",
-        )
-        meter.create_observable_gauge(
-            name=SemanticConvetion.GPU_POWER_DRAW,
-            callbacks=[partial(self._collect_metric, environment,
-                               application_name, check_and_record, "power_draw")],
-            description="GPU Power Draw",
-        )
-        meter.create_observable_gauge(
-            name=SemanticConvetion.GPU_POWER_LIMIT,
-            callbacks=[partial(self._collect_metric, environment,
-                               application_name, check_and_record, "power_limit")],
-            description="GPU Power Limit",
-        )
-
-    def _uninstrument(self, **kwargs):
-        # Proper uninstrumentation logic to revert patched methods
-        pass
-
-    def _collect_metric(self, environment, application_name,
-                        check_and_record, metric_name,
-                        options: CallbackOptions) -> Iterable[Observation]:
-
-        import gpustat
-
-        try:
-            gpu_stats = gpustat.GPUStatCollection.new_query()
-
-            for gpu in gpu_stats.gpus:
-                attributes = {
-                    TELEMETRY_SDK_NAME: "openlit",
-                    SemanticConvetion.GEN_AI_APPLICATION_NAME: application_name,
-                    SemanticConvetion.GEN_AI_ENVIRONMENT: environment,
-                    SemanticConvetion.GPU_INDEX: gpu.index,
-                    SemanticConvetion.GPU_UUID: gpu.uuid,
-                    SemanticConvetion.GPU_NAME: gpu.name,
-                }
-
-                yield Observation(check_and_record(getattr(gpu, metric_name, 0)), attributes)
-
-        except Exception as e:
-            logger.error("Error in GPU metrics collection: %s", e)