openlit 1.27.1__tar.gz → 1.28.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {openlit-1.27.1 → openlit-1.28.0}/PKG-INFO +4 -4
- {openlit-1.27.1 → openlit-1.28.0}/README.md +2 -2
- {openlit-1.27.1 → openlit-1.28.0}/pyproject.toml +2 -2
- {openlit-1.27.1 → openlit-1.28.0}/src/openlit/__init__.py +2 -2
- openlit-1.28.0/src/openlit/instrumentation/gpu/__init__.py +208 -0
- openlit-1.27.1/src/openlit/instrumentation/gpu/__init__.py +0 -132
- {openlit-1.27.1 → openlit-1.28.0}/LICENSE +0 -0
- {openlit-1.27.1 → openlit-1.28.0}/src/openlit/__helpers.py +0 -0
- {openlit-1.27.1 → openlit-1.28.0}/src/openlit/evals/__init__.py +0 -0
- {openlit-1.27.1 → openlit-1.28.0}/src/openlit/evals/all.py +0 -0
- {openlit-1.27.1 → openlit-1.28.0}/src/openlit/evals/bias_detection.py +0 -0
- {openlit-1.27.1 → openlit-1.28.0}/src/openlit/evals/hallucination.py +0 -0
- {openlit-1.27.1 → openlit-1.28.0}/src/openlit/evals/toxicity.py +0 -0
- {openlit-1.27.1 → openlit-1.28.0}/src/openlit/evals/utils.py +0 -0
- {openlit-1.27.1 → openlit-1.28.0}/src/openlit/guard/__init__.py +0 -0
- {openlit-1.27.1 → openlit-1.28.0}/src/openlit/guard/all.py +0 -0
- {openlit-1.27.1 → openlit-1.28.0}/src/openlit/guard/prompt_injection.py +0 -0
- {openlit-1.27.1 → openlit-1.28.0}/src/openlit/guard/restrict_topic.py +0 -0
- {openlit-1.27.1 → openlit-1.28.0}/src/openlit/guard/sensitive_topic.py +0 -0
- {openlit-1.27.1 → openlit-1.28.0}/src/openlit/guard/utils.py +0 -0
- {openlit-1.27.1 → openlit-1.28.0}/src/openlit/instrumentation/anthropic/__init__.py +0 -0
- {openlit-1.27.1 → openlit-1.28.0}/src/openlit/instrumentation/anthropic/anthropic.py +0 -0
- {openlit-1.27.1 → openlit-1.28.0}/src/openlit/instrumentation/anthropic/async_anthropic.py +0 -0
- {openlit-1.27.1 → openlit-1.28.0}/src/openlit/instrumentation/azure_ai_inference/__init__.py +0 -0
- {openlit-1.27.1 → openlit-1.28.0}/src/openlit/instrumentation/azure_ai_inference/async_azure_ai_inference.py +0 -0
- {openlit-1.27.1 → openlit-1.28.0}/src/openlit/instrumentation/azure_ai_inference/azure_ai_inference.py +0 -0
- {openlit-1.27.1 → openlit-1.28.0}/src/openlit/instrumentation/bedrock/__init__.py +0 -0
- {openlit-1.27.1 → openlit-1.28.0}/src/openlit/instrumentation/bedrock/bedrock.py +0 -0
- {openlit-1.27.1 → openlit-1.28.0}/src/openlit/instrumentation/chroma/__init__.py +0 -0
- {openlit-1.27.1 → openlit-1.28.0}/src/openlit/instrumentation/chroma/chroma.py +0 -0
- {openlit-1.27.1 → openlit-1.28.0}/src/openlit/instrumentation/cohere/__init__.py +0 -0
- {openlit-1.27.1 → openlit-1.28.0}/src/openlit/instrumentation/cohere/cohere.py +0 -0
- {openlit-1.27.1 → openlit-1.28.0}/src/openlit/instrumentation/elevenlabs/__init__.py +0 -0
- {openlit-1.27.1 → openlit-1.28.0}/src/openlit/instrumentation/elevenlabs/async_elevenlabs.py +0 -0
- {openlit-1.27.1 → openlit-1.28.0}/src/openlit/instrumentation/elevenlabs/elevenlabs.py +0 -0
- {openlit-1.27.1 → openlit-1.28.0}/src/openlit/instrumentation/embedchain/__init__.py +0 -0
- {openlit-1.27.1 → openlit-1.28.0}/src/openlit/instrumentation/embedchain/embedchain.py +0 -0
- {openlit-1.27.1 → openlit-1.28.0}/src/openlit/instrumentation/google_ai_studio/__init__.py +0 -0
- {openlit-1.27.1 → openlit-1.28.0}/src/openlit/instrumentation/google_ai_studio/async_google_ai_studio.py +0 -0
- {openlit-1.27.1 → openlit-1.28.0}/src/openlit/instrumentation/google_ai_studio/google_ai_studio.py +0 -0
- {openlit-1.27.1 → openlit-1.28.0}/src/openlit/instrumentation/gpt4all/__init__.py +0 -0
- {openlit-1.27.1 → openlit-1.28.0}/src/openlit/instrumentation/gpt4all/gpt4all.py +0 -0
- {openlit-1.27.1 → openlit-1.28.0}/src/openlit/instrumentation/groq/__init__.py +0 -0
- {openlit-1.27.1 → openlit-1.28.0}/src/openlit/instrumentation/groq/async_groq.py +0 -0
- {openlit-1.27.1 → openlit-1.28.0}/src/openlit/instrumentation/groq/groq.py +0 -0
- {openlit-1.27.1 → openlit-1.28.0}/src/openlit/instrumentation/haystack/__init__.py +0 -0
- {openlit-1.27.1 → openlit-1.28.0}/src/openlit/instrumentation/haystack/haystack.py +0 -0
- {openlit-1.27.1 → openlit-1.28.0}/src/openlit/instrumentation/langchain/__init__.py +0 -0
- {openlit-1.27.1 → openlit-1.28.0}/src/openlit/instrumentation/langchain/langchain.py +0 -0
- {openlit-1.27.1 → openlit-1.28.0}/src/openlit/instrumentation/llamaindex/__init__.py +0 -0
- {openlit-1.27.1 → openlit-1.28.0}/src/openlit/instrumentation/llamaindex/llamaindex.py +0 -0
- {openlit-1.27.1 → openlit-1.28.0}/src/openlit/instrumentation/milvus/__init__.py +0 -0
- {openlit-1.27.1 → openlit-1.28.0}/src/openlit/instrumentation/milvus/milvus.py +0 -0
- {openlit-1.27.1 → openlit-1.28.0}/src/openlit/instrumentation/mistral/__init__.py +0 -0
- {openlit-1.27.1 → openlit-1.28.0}/src/openlit/instrumentation/mistral/async_mistral.py +0 -0
- {openlit-1.27.1 → openlit-1.28.0}/src/openlit/instrumentation/mistral/mistral.py +0 -0
- {openlit-1.27.1 → openlit-1.28.0}/src/openlit/instrumentation/ollama/__init__.py +0 -0
- {openlit-1.27.1 → openlit-1.28.0}/src/openlit/instrumentation/ollama/async_ollama.py +0 -0
- {openlit-1.27.1 → openlit-1.28.0}/src/openlit/instrumentation/ollama/ollama.py +0 -0
- {openlit-1.27.1 → openlit-1.28.0}/src/openlit/instrumentation/openai/__init__.py +0 -0
- {openlit-1.27.1 → openlit-1.28.0}/src/openlit/instrumentation/openai/async_azure_openai.py +0 -0
- {openlit-1.27.1 → openlit-1.28.0}/src/openlit/instrumentation/openai/async_openai.py +0 -0
- {openlit-1.27.1 → openlit-1.28.0}/src/openlit/instrumentation/openai/azure_openai.py +0 -0
- {openlit-1.27.1 → openlit-1.28.0}/src/openlit/instrumentation/openai/openai.py +0 -0
- {openlit-1.27.1 → openlit-1.28.0}/src/openlit/instrumentation/pinecone/__init__.py +0 -0
- {openlit-1.27.1 → openlit-1.28.0}/src/openlit/instrumentation/pinecone/pinecone.py +0 -0
- {openlit-1.27.1 → openlit-1.28.0}/src/openlit/instrumentation/qdrant/__init__.py +0 -0
- {openlit-1.27.1 → openlit-1.28.0}/src/openlit/instrumentation/qdrant/qdrant.py +0 -0
- {openlit-1.27.1 → openlit-1.28.0}/src/openlit/instrumentation/transformers/__init__.py +0 -0
- {openlit-1.27.1 → openlit-1.28.0}/src/openlit/instrumentation/transformers/transformers.py +0 -0
- {openlit-1.27.1 → openlit-1.28.0}/src/openlit/instrumentation/vertexai/__init__.py +0 -0
- {openlit-1.27.1 → openlit-1.28.0}/src/openlit/instrumentation/vertexai/async_vertexai.py +0 -0
- {openlit-1.27.1 → openlit-1.28.0}/src/openlit/instrumentation/vertexai/vertexai.py +0 -0
- {openlit-1.27.1 → openlit-1.28.0}/src/openlit/instrumentation/vllm/__init__.py +0 -0
- {openlit-1.27.1 → openlit-1.28.0}/src/openlit/instrumentation/vllm/vllm.py +0 -0
- {openlit-1.27.1 → openlit-1.28.0}/src/openlit/otel/metrics.py +0 -0
- {openlit-1.27.1 → openlit-1.28.0}/src/openlit/otel/tracing.py +0 -0
- {openlit-1.27.1 → openlit-1.28.0}/src/openlit/semcov/__init__.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: openlit
|
3
|
-
Version: 1.27.1
|
3
|
+
Version: 1.28.0
|
4
4
|
Summary: OpenTelemetry-native Auto instrumentation library for monitoring LLM Applications and GPUs, facilitating the integration of observability into your GenAI-driven projects
|
5
5
|
Home-page: https://github.com/openlit/openlit/tree/main/openlit/python
|
6
6
|
Keywords: OpenTelemetry,otel,otlp,llm,tracing,openai,anthropic,claude,cohere,llm monitoring,observability,monitoring,gpt,Generative AI,chatGPT,gpu
|
@@ -16,7 +16,6 @@ Classifier: Programming Language :: Python :: 3.13
|
|
16
16
|
Requires-Dist: anthropic (>=0.21.0,<0.22.0)
|
17
17
|
Requires-Dist: boto3 (>=1.34.0,<2.0.0)
|
18
18
|
Requires-Dist: botocore (>=1.34.0,<2.0.0)
|
19
|
-
Requires-Dist: gpustat (>=1.1.1,<2.0.0)
|
20
19
|
Requires-Dist: openai (>=1.1.1,<2.0.0)
|
21
20
|
Requires-Dist: opentelemetry-api (>=1.27.0,<2.0.0)
|
22
21
|
Requires-Dist: opentelemetry-exporter-otlp (>=1.27.0,<2.0.0)
|
@@ -26,6 +25,7 @@ Requires-Dist: pydantic (>=2.0.0,<3.0.0)
|
|
26
25
|
Requires-Dist: requests (>=2.26.0,<3.0.0)
|
27
26
|
Requires-Dist: schedule (>=1.2.2,<2.0.0)
|
28
27
|
Requires-Dist: tiktoken (>=0.7.0,<0.8.0)
|
28
|
+
Requires-Dist: xmltodict (>=0.13.0,<0.14.0)
|
29
29
|
Project-URL: Repository, https://github.com/openlit/openlit/tree/main/openlit/python
|
30
30
|
Description-Content-Type: text/markdown
|
31
31
|
|
@@ -65,8 +65,8 @@ This project proudly follows and maintains the [Semantic Conventions](https://gi
|
|
65
65
|
|
66
66
|
| LLMs | Vector DBs | Frameworks | GPUs |
|
67
67
|
|--------------------------------------------------------------------------|----------------------------------------------|----------------------------------------------|---------------|
|
68
|
-
| [✅ OpenAI](https://docs.openlit.io/latest/integrations/openai) | [✅ ChromaDB](https://docs.openlit.io/latest/integrations/chromadb) | [✅ Langchain](https://docs.openlit.io/latest/integrations/langchain) | [✅ NVIDIA
|
69
|
-
| [✅ Ollama](https://docs.openlit.io/latest/integrations/ollama) | [✅ Pinecone](https://docs.openlit.io/latest/integrations/pinecone) | [✅ LiteLLM](https://docs.openlit.io/latest/integrations/litellm) |
|
68
|
+
| [✅ OpenAI](https://docs.openlit.io/latest/integrations/openai) | [✅ ChromaDB](https://docs.openlit.io/latest/integrations/chromadb) | [✅ Langchain](https://docs.openlit.io/latest/integrations/langchain) | [✅ NVIDIA](https://docs.openlit.io/latest/integrations/nvidia-gpu) |
|
69
|
+
| [✅ Ollama](https://docs.openlit.io/latest/integrations/ollama) | [✅ Pinecone](https://docs.openlit.io/latest/integrations/pinecone) | [✅ LiteLLM](https://docs.openlit.io/latest/integrations/litellm) | [✅ AMD](#) |
|
70
70
|
| [✅ Anthropic](https://docs.openlit.io/latest/integrations/anthropic) | [✅ Qdrant](https://docs.openlit.io/latest/integrations/qdrant) | [✅ LlamaIndex](https://docs.openlit.io/latest/integrations/llama-index) | |
|
71
71
|
| [✅ GPT4All](https://docs.openlit.io/latest/integrations/gpt4all) | [✅ Milvus](https://docs.openlit.io/latest/integrations/milvus) | [✅ Haystack](https://docs.openlit.io/latest/integrations/haystack) | |
|
72
72
|
| [✅ Cohere](https://docs.openlit.io/latest/integrations/cohere) | | [✅ EmbedChain](https://docs.openlit.io/latest/integrations/embedchain) | |
|
@@ -34,8 +34,8 @@ This project proudly follows and maintains the [Semantic Conventions](https://gi
|
|
34
34
|
|
35
35
|
| LLMs | Vector DBs | Frameworks | GPUs |
|
36
36
|
|--------------------------------------------------------------------------|----------------------------------------------|----------------------------------------------|---------------|
|
37
|
-
| [✅ OpenAI](https://docs.openlit.io/latest/integrations/openai) | [✅ ChromaDB](https://docs.openlit.io/latest/integrations/chromadb) | [✅ Langchain](https://docs.openlit.io/latest/integrations/langchain) | [✅ NVIDIA
|
38
|
-
| [✅ Ollama](https://docs.openlit.io/latest/integrations/ollama) | [✅ Pinecone](https://docs.openlit.io/latest/integrations/pinecone) | [✅ LiteLLM](https://docs.openlit.io/latest/integrations/litellm) |
|
37
|
+
| [✅ OpenAI](https://docs.openlit.io/latest/integrations/openai) | [✅ ChromaDB](https://docs.openlit.io/latest/integrations/chromadb) | [✅ Langchain](https://docs.openlit.io/latest/integrations/langchain) | [✅ NVIDIA](https://docs.openlit.io/latest/integrations/nvidia-gpu) |
|
38
|
+
| [✅ Ollama](https://docs.openlit.io/latest/integrations/ollama) | [✅ Pinecone](https://docs.openlit.io/latest/integrations/pinecone) | [✅ LiteLLM](https://docs.openlit.io/latest/integrations/litellm) | [✅ AMD](#) |
|
39
39
|
| [✅ Anthropic](https://docs.openlit.io/latest/integrations/anthropic) | [✅ Qdrant](https://docs.openlit.io/latest/integrations/qdrant) | [✅ LlamaIndex](https://docs.openlit.io/latest/integrations/llama-index) | |
|
40
40
|
| [✅ GPT4All](https://docs.openlit.io/latest/integrations/gpt4all) | [✅ Milvus](https://docs.openlit.io/latest/integrations/milvus) | [✅ Haystack](https://docs.openlit.io/latest/integrations/haystack) | |
|
41
41
|
| [✅ Cohere](https://docs.openlit.io/latest/integrations/cohere) | | [✅ EmbedChain](https://docs.openlit.io/latest/integrations/embedchain) | |
|
@@ -1,6 +1,6 @@
|
|
1
1
|
[tool.poetry]
|
2
2
|
name = "openlit"
|
3
|
-
version = "1.27.1"
|
3
|
+
version = "1.28.0"
|
4
4
|
description = "OpenTelemetry-native Auto instrumentation library for monitoring LLM Applications and GPUs, facilitating the integration of observability into your GenAI-driven projects"
|
5
5
|
authors = ["OpenLIT"]
|
6
6
|
repository = "https://github.com/openlit/openlit/tree/main/openlit/python"
|
@@ -14,7 +14,7 @@ requests = "^2.26.0"
|
|
14
14
|
schedule = "^1.2.2"
|
15
15
|
tiktoken = "^0.7.0"
|
16
16
|
pydantic = "^2.0.0"
|
17
|
-
gpustat = "^1.1.1"
|
17
|
+
xmltodict = "^0.13.0"
|
18
18
|
boto3 = "^1.34.0"
|
19
19
|
botocore = "^1.34.0"
|
20
20
|
opentelemetry-api = "^1.27.0"
|
@@ -46,7 +46,7 @@ from openlit.instrumentation.pinecone import PineconeInstrumentor
|
|
46
46
|
from openlit.instrumentation.qdrant import QdrantInstrumentor
|
47
47
|
from openlit.instrumentation.milvus import MilvusInstrumentor
|
48
48
|
from openlit.instrumentation.transformers import TransformersInstrumentor
|
49
|
-
from openlit.instrumentation.gpu import NvidiaGPUInstrumentor
|
49
|
+
from openlit.instrumentation.gpu import GPUInstrumentor
|
50
50
|
import openlit.guard
|
51
51
|
import openlit.evals
|
52
52
|
|
@@ -313,7 +313,7 @@ def init(environment="default", application_name="default", tracer=None, otlp_en
|
|
313
313
|
disabled_instrumentors, module_name_map)
|
314
314
|
|
315
315
|
if not disable_metrics and collect_gpu_stats:
|
316
|
-
NvidiaGPUInstrumentor().instrument(
|
316
|
+
GPUInstrumentor().instrument(
|
317
317
|
environment=config.environment,
|
318
318
|
application_name=config.application_name,
|
319
319
|
)
|
@@ -0,0 +1,208 @@
|
|
1
|
+
# pylint: disable=useless-return, bad-staticmethod-argument, duplicate-code, import-outside-toplevel, broad-exception-caught, unused-argument, import-error, too-many-return-statements, superfluous-parens
|
2
|
+
"""Initializer of Auto Instrumentation of GPU Metrics"""
|
3
|
+
|
4
|
+
from typing import Collection, Iterable
|
5
|
+
import logging
|
6
|
+
from functools import partial
|
7
|
+
from opentelemetry.instrumentation.instrumentor import BaseInstrumentor
|
8
|
+
from opentelemetry.sdk.resources import TELEMETRY_SDK_NAME
|
9
|
+
from opentelemetry.metrics import get_meter, CallbackOptions, Observation
|
10
|
+
from openlit.semcov import SemanticConvetion
|
11
|
+
|
12
|
+
# Initialize logger for logging potential issues and operations
|
13
|
+
logger = logging.getLogger(__name__)
|
14
|
+
|
15
|
+
class GPUInstrumentor(BaseInstrumentor):
|
16
|
+
"""
|
17
|
+
An instrumentor for collecting GPU metrics.
|
18
|
+
"""
|
19
|
+
|
20
|
+
def instrumentation_dependencies(self) -> Collection[str]:
|
21
|
+
return []
|
22
|
+
|
23
|
+
def _instrument(self, **kwargs):
|
24
|
+
application_name = kwargs.get("application_name", "default")
|
25
|
+
environment = kwargs.get("environment", "default")
|
26
|
+
# pylint: disable=attribute-defined-outside-init
|
27
|
+
self.gpu_type = self._get_gpu_type()
|
28
|
+
meter = get_meter(
|
29
|
+
__name__,
|
30
|
+
"0.1.0",
|
31
|
+
schema_url="https://opentelemetry.io/schemas/1.11.0",
|
32
|
+
)
|
33
|
+
|
34
|
+
if not self.gpu_type:
|
35
|
+
logger.error(
|
36
|
+
"OpenLIT GPU Instrumentation Error: No supported GPUs found."
|
37
|
+
"If this is a non-GPU host, set `collect_gpu_stats=False` to disable GPU stats."
|
38
|
+
)
|
39
|
+
return
|
40
|
+
|
41
|
+
metric_names = [
|
42
|
+
("GPU_UTILIZATION", "utilization"),
|
43
|
+
("GPU_UTILIZATION_ENC", "utilization_enc"),
|
44
|
+
("GPU_UTILIZATION_DEC", "utilization_dec"),
|
45
|
+
("GPU_TEMPERATURE", "temperature"),
|
46
|
+
("GPU_FAN_SPEED", "fan_speed"),
|
47
|
+
("GPU_MEMORY_AVAILABLE", "memory_available"),
|
48
|
+
("GPU_MEMORY_TOTAL", "memory_total"),
|
49
|
+
("GPU_MEMORY_USED", "memory_used"),
|
50
|
+
("GPU_MEMORY_FREE", "memory_free"),
|
51
|
+
("GPU_POWER_DRAW", "power_draw"),
|
52
|
+
("GPU_POWER_LIMIT", "power_limit"),
|
53
|
+
]
|
54
|
+
|
55
|
+
for semantic_name, internal_name in metric_names:
|
56
|
+
meter.create_observable_gauge(
|
57
|
+
name=getattr(SemanticConvetion, semantic_name),
|
58
|
+
callbacks=[partial(self._collect_metric,
|
59
|
+
environment, application_name, internal_name)],
|
60
|
+
description=f"GPU {internal_name.replace('_', ' ').title()}",
|
61
|
+
)
|
62
|
+
|
63
|
+
def _uninstrument(self, **kwargs):
|
64
|
+
# Proper uninstrumentation logic to revert patched methods
|
65
|
+
pass
|
66
|
+
|
67
|
+
def _get_gpu_type(self) -> str:
|
68
|
+
try:
|
69
|
+
import pynvml
|
70
|
+
pynvml.nvmlInit()
|
71
|
+
return "nvidia"
|
72
|
+
except Exception:
|
73
|
+
try:
|
74
|
+
import amdsmi
|
75
|
+
amdsmi.amdsmi_init()
|
76
|
+
return "amd"
|
77
|
+
except Exception:
|
78
|
+
return None
|
79
|
+
|
80
|
+
|
81
|
+
def _collect_metric(self, environment, application_name,
|
82
|
+
metric_name,
|
83
|
+
options: CallbackOptions) -> Iterable[Observation]:
|
84
|
+
# pylint: disable=no-else-return
|
85
|
+
if self.gpu_type == "nvidia":
|
86
|
+
return self._collect_nvidia_metrics(environment, application_name, metric_name, options)
|
87
|
+
elif self.gpu_type == "amd":
|
88
|
+
return self._collect_amd_metrics(environment, application_name, metric_name, options)
|
89
|
+
return []
|
90
|
+
|
91
|
+
def _collect_nvidia_metrics(self, environment, application_name,
|
92
|
+
metric_name,
|
93
|
+
options: CallbackOptions) -> Iterable[Observation]:
|
94
|
+
try:
|
95
|
+
import pynvml
|
96
|
+
gpu_count = pynvml.nvmlDeviceGetCount()
|
97
|
+
mega_bytes = 1024 * 1024
|
98
|
+
gpu_index = 0
|
99
|
+
for gpu_index in range(gpu_count):
|
100
|
+
handle = pynvml.nvmlDeviceGetHandleByIndex(gpu_index)
|
101
|
+
|
102
|
+
def get_metric_value(handle, metric_name):
|
103
|
+
try:
|
104
|
+
# pylint: disable=no-else-return
|
105
|
+
if metric_name == "temperature":
|
106
|
+
return pynvml.nvmlDeviceGetTemperature(handle,
|
107
|
+
pynvml.NVML_TEMPERATURE_GPU)
|
108
|
+
elif metric_name == "utilization":
|
109
|
+
return pynvml.nvmlDeviceGetUtilizationRates(handle).gpu
|
110
|
+
elif metric_name == "utilization_enc":
|
111
|
+
return pynvml.nvmlDeviceGetEncoderUtilization(handle)[0]
|
112
|
+
elif metric_name == "utilization_dec":
|
113
|
+
return pynvml.nvmlDeviceGetDecoderUtilization(handle)[0]
|
114
|
+
elif metric_name == "fan_speed":
|
115
|
+
return 0
|
116
|
+
elif metric_name == "memory_available":
|
117
|
+
memory_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
|
118
|
+
return (memory_info.free // mega_bytes) # Assuming reserved memory is 0
|
119
|
+
elif metric_name == "memory_total":
|
120
|
+
return (pynvml.nvmlDeviceGetMemoryInfo(handle).total // mega_bytes)
|
121
|
+
elif metric_name == "memory_used":
|
122
|
+
return (pynvml.nvmlDeviceGetMemoryInfo(handle).used // mega_bytes)
|
123
|
+
elif metric_name == "memory_free":
|
124
|
+
return (pynvml.nvmlDeviceGetMemoryInfo(handle).free // mega_bytes)
|
125
|
+
elif metric_name == "power_draw":
|
126
|
+
return (pynvml.nvmlDeviceGetPowerUsage(handle) // 1000.0)
|
127
|
+
elif metric_name == "power_limit":
|
128
|
+
return (pynvml.nvmlDeviceGetEnforcedPowerLimit(handle) // 1000.0)
|
129
|
+
except Exception as e:
|
130
|
+
# pylint: disable=cell-var-from-loop
|
131
|
+
logger.error("Error collecting metric %s for GPU %d: %s", metric_name,
|
132
|
+
gpu_index, e)
|
133
|
+
return 0
|
134
|
+
|
135
|
+
attributes = {
|
136
|
+
TELEMETRY_SDK_NAME: "openlit",
|
137
|
+
SemanticConvetion.GEN_AI_APPLICATION_NAME: application_name,
|
138
|
+
SemanticConvetion.GEN_AI_ENVIRONMENT: environment,
|
139
|
+
SemanticConvetion.GPU_INDEX: str(gpu_index),
|
140
|
+
SemanticConvetion.GPU_UUID: pynvml.nvmlDeviceGetUUID(handle).decode('utf-8'),
|
141
|
+
SemanticConvetion.GPU_NAME: pynvml.nvmlDeviceGetName(handle).decode('utf-8')
|
142
|
+
}
|
143
|
+
yield Observation(get_metric_value(handle, metric_name), attributes)
|
144
|
+
|
145
|
+
except Exception as e:
|
146
|
+
logger.error("Error in GPU metrics collection: %s", e)
|
147
|
+
|
148
|
+
def _collect_amd_metrics(self, environment, application_name,
|
149
|
+
metric_name,
|
150
|
+
options: CallbackOptions) -> Iterable[Observation]:
|
151
|
+
try:
|
152
|
+
import amdsmi
|
153
|
+
# Get the number of AMD GPUs
|
154
|
+
devices = amdsmi.amdsmi_get_processor_handles()
|
155
|
+
mega_bytes = 1024 * 1024
|
156
|
+
for device_handle in devices:
|
157
|
+
|
158
|
+
def get_metric_value(device_handle, metric_name):
|
159
|
+
try:
|
160
|
+
# pylint: disable=no-else-return
|
161
|
+
if metric_name == "temperature":
|
162
|
+
# pylint: disable=line-too-long
|
163
|
+
return amdsmi.amdsmi_get_temp_metric(device_handle,
|
164
|
+
amdsmi.AmdSmiTemperatureType.EDGE,
|
165
|
+
amdsmi.AmdSmiTemperatureMetric.CURRENT)
|
166
|
+
elif metric_name == "utilization":
|
167
|
+
# pylint: disable=line-too-long
|
168
|
+
return amdsmi.amdsmi_get_utilization_count(device_handle,
|
169
|
+
amdsmi.AmdSmiUtilizationCounterType.COARSE_GRAIN_GFX_ACTIVITY)
|
170
|
+
elif metric_name in ["utilization_enc", "utilization_dec"]:
|
171
|
+
return 0 # Placeholder if unsupported
|
172
|
+
elif metric_name == "fan_speed":
|
173
|
+
return amdsmi.amdsmi_get_gpu_fan_speed(device_handle, 0)
|
174
|
+
elif metric_name == "memory_available":
|
175
|
+
return (amdsmi.amdsmi_get_gpu_memory_total(device_handle) // mega_bytes)
|
176
|
+
elif metric_name == "memory_total":
|
177
|
+
return (amdsmi.amdsmi_get_gpu_memory_total(device_handle) // mega_bytes)
|
178
|
+
elif metric_name == "memory_used":
|
179
|
+
return (amdsmi.amdsmi_get_gpu_memory_usage(device_handle) // mega_bytes)
|
180
|
+
elif metric_name == "memory_free":
|
181
|
+
total_mem = (amdsmi.amdsmi_get_gpu_memory_total(device_handle) // mega_bytes)
|
182
|
+
used_mem = (amdsmi.amdsmi_get_gpu_memory_usage(device_handle) // mega_bytes)
|
183
|
+
return (total_mem - used_mem)
|
184
|
+
elif metric_name == "power_draw":
|
185
|
+
# pylint: disable=line-too-long
|
186
|
+
return (amdsmi.amdsmi_get_power_info(device_handle)['average_socket_power'] // 1000.0)
|
187
|
+
elif metric_name == "power_limit":
|
188
|
+
# pylint: disable=line-too-long
|
189
|
+
return (amdsmi.amdsmi_get_power_info(device_handle)['power_limit'] // 1000.0)
|
190
|
+
except Exception as e:
|
191
|
+
logger.error("Error collecting metric %s for AMD GPU %d: %s", metric_name,
|
192
|
+
amdsmi.amdsmi_get_xgmi_info(device_handle)['index'], e)
|
193
|
+
return 0
|
194
|
+
|
195
|
+
attributes = {
|
196
|
+
TELEMETRY_SDK_NAME: "openlit",
|
197
|
+
SemanticConvetion.GEN_AI_APPLICATION_NAME: application_name,
|
198
|
+
SemanticConvetion.GEN_AI_ENVIRONMENT: environment,
|
199
|
+
# pylint: disable=line-too-long
|
200
|
+
SemanticConvetion.GPU_INDEX: amdsmi.amdsmi_get_xgmi_info(device_handle)['index'],
|
201
|
+
# pylint: disable=line-too-long
|
202
|
+
SemanticConvetion.GPU_UUID: amdsmi.amdsmi_get_gpu_asic_info(device_handle)['market_name'],
|
203
|
+
SemanticConvetion.GPU_NAME: amdsmi.amdsmi_get_device_name(device_handle)
|
204
|
+
}
|
205
|
+
yield Observation(get_metric_value(device_handle, metric_name), attributes)
|
206
|
+
|
207
|
+
except Exception as e:
|
208
|
+
logger.error("Error in AMD GPU metrics collection: %s", e)
|
@@ -1,132 +0,0 @@
|
|
1
|
-
# pylint: disable=useless-return, bad-staticmethod-argument, duplicate-code, import-outside-toplevel, broad-exception-caught, unused-argument
|
2
|
-
"""Initializer of Auto Instrumentation of GPU Metrics"""
|
3
|
-
|
4
|
-
from typing import Collection, Iterable
|
5
|
-
import logging
|
6
|
-
from functools import partial
|
7
|
-
|
8
|
-
from opentelemetry.instrumentation.instrumentor import BaseInstrumentor
|
9
|
-
from opentelemetry.sdk.resources import TELEMETRY_SDK_NAME
|
10
|
-
from opentelemetry.metrics import get_meter, CallbackOptions, Observation
|
11
|
-
|
12
|
-
from openlit.semcov import SemanticConvetion
|
13
|
-
|
14
|
-
# Initialize logger for logging potential issues and operations
|
15
|
-
logger = logging.getLogger(__name__)
|
16
|
-
|
17
|
-
class NvidiaGPUInstrumentor(BaseInstrumentor):
|
18
|
-
"""
|
19
|
-
An instrumentor for collecting NVIDIA GPU metrics.
|
20
|
-
"""
|
21
|
-
|
22
|
-
def instrumentation_dependencies(self) -> Collection[str]:
|
23
|
-
return []
|
24
|
-
|
25
|
-
def _instrument(self, **kwargs):
|
26
|
-
|
27
|
-
application_name = kwargs.get("application_name", "default")
|
28
|
-
environment = kwargs.get("environment", "default")
|
29
|
-
|
30
|
-
meter = get_meter(
|
31
|
-
__name__,
|
32
|
-
"0.1.0",
|
33
|
-
schema_url="https://opentelemetry.io/schemas/1.11.0",
|
34
|
-
)
|
35
|
-
|
36
|
-
def check_and_record(value):
|
37
|
-
return value if value is not None else 0
|
38
|
-
|
39
|
-
meter.create_observable_gauge(
|
40
|
-
name=SemanticConvetion.GPU_UTILIZATION,
|
41
|
-
callbacks=[partial(self._collect_metric, environment,
|
42
|
-
application_name, check_and_record, "utilization")],
|
43
|
-
description="GPU Utilization",
|
44
|
-
)
|
45
|
-
meter.create_observable_gauge(
|
46
|
-
name=SemanticConvetion.GPU_UTILIZATION_ENC,
|
47
|
-
callbacks=[partial(self._collect_metric, environment,
|
48
|
-
application_name, check_and_record, "utilization_enc")],
|
49
|
-
description="GPU Encoder Utilization",
|
50
|
-
)
|
51
|
-
meter.create_observable_gauge(
|
52
|
-
name=SemanticConvetion.GPU_UTILIZATION_DEC,
|
53
|
-
callbacks=[partial(self._collect_metric, environment,
|
54
|
-
application_name, check_and_record, "utilization_dec")],
|
55
|
-
description="GPU Decoder Utilization",
|
56
|
-
)
|
57
|
-
meter.create_observable_gauge(
|
58
|
-
name=SemanticConvetion.GPU_TEMPERATURE,
|
59
|
-
callbacks=[partial(self._collect_metric, environment,
|
60
|
-
application_name, check_and_record, "temperature")],
|
61
|
-
description="GPU Temperature",
|
62
|
-
)
|
63
|
-
meter.create_observable_gauge(
|
64
|
-
name=SemanticConvetion.GPU_FAN_SPEED,
|
65
|
-
callbacks=[partial(self._collect_metric, environment,
|
66
|
-
application_name, check_and_record, "fan_speed")],
|
67
|
-
description="GPU Fan Speed",
|
68
|
-
)
|
69
|
-
meter.create_observable_gauge(
|
70
|
-
name=SemanticConvetion.GPU_MEMORY_AVAILABLE,
|
71
|
-
callbacks=[partial(self._collect_metric, environment,
|
72
|
-
application_name, check_and_record, "memory_available")],
|
73
|
-
description="GPU Memory Available",
|
74
|
-
)
|
75
|
-
meter.create_observable_gauge(
|
76
|
-
name=SemanticConvetion.GPU_MEMORY_TOTAL,
|
77
|
-
callbacks=[partial(self._collect_metric, environment,
|
78
|
-
application_name, check_and_record, "memory_total")],
|
79
|
-
description="GPU Memory Total",
|
80
|
-
)
|
81
|
-
meter.create_observable_gauge(
|
82
|
-
name=SemanticConvetion.GPU_MEMORY_USED,
|
83
|
-
callbacks=[partial(self._collect_metric, environment,
|
84
|
-
application_name, check_and_record, "memory_used")],
|
85
|
-
description="GPU Memory Used",
|
86
|
-
)
|
87
|
-
meter.create_observable_gauge(
|
88
|
-
name=SemanticConvetion.GPU_MEMORY_FREE,
|
89
|
-
callbacks=[partial(self._collect_metric, environment,
|
90
|
-
application_name, check_and_record, "memory_free")],
|
91
|
-
description="GPU Memory Free",
|
92
|
-
)
|
93
|
-
meter.create_observable_gauge(
|
94
|
-
name=SemanticConvetion.GPU_POWER_DRAW,
|
95
|
-
callbacks=[partial(self._collect_metric, environment,
|
96
|
-
application_name, check_and_record, "power_draw")],
|
97
|
-
description="GPU Power Draw",
|
98
|
-
)
|
99
|
-
meter.create_observable_gauge(
|
100
|
-
name=SemanticConvetion.GPU_POWER_LIMIT,
|
101
|
-
callbacks=[partial(self._collect_metric, environment,
|
102
|
-
application_name, check_and_record, "power_limit")],
|
103
|
-
description="GPU Power Limit",
|
104
|
-
)
|
105
|
-
|
106
|
-
def _uninstrument(self, **kwargs):
|
107
|
-
# Proper uninstrumentation logic to revert patched methods
|
108
|
-
pass
|
109
|
-
|
110
|
-
def _collect_metric(self, environment, application_name,
|
111
|
-
check_and_record, metric_name,
|
112
|
-
options: CallbackOptions) -> Iterable[Observation]:
|
113
|
-
|
114
|
-
import gpustat
|
115
|
-
|
116
|
-
try:
|
117
|
-
gpu_stats = gpustat.GPUStatCollection.new_query()
|
118
|
-
|
119
|
-
for gpu in gpu_stats.gpus:
|
120
|
-
attributes = {
|
121
|
-
TELEMETRY_SDK_NAME: "openlit",
|
122
|
-
SemanticConvetion.GEN_AI_APPLICATION_NAME: application_name,
|
123
|
-
SemanticConvetion.GEN_AI_ENVIRONMENT: environment,
|
124
|
-
SemanticConvetion.GPU_INDEX: gpu.index,
|
125
|
-
SemanticConvetion.GPU_UUID: gpu.uuid,
|
126
|
-
SemanticConvetion.GPU_NAME: gpu.name,
|
127
|
-
}
|
128
|
-
|
129
|
-
yield Observation(check_and_record(getattr(gpu, metric_name, 0)), attributes)
|
130
|
-
|
131
|
-
except Exception as e:
|
132
|
-
logger.error("Error in GPU metrics collection: %s", e)
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{openlit-1.27.1 → openlit-1.28.0}/src/openlit/instrumentation/azure_ai_inference/__init__.py
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{openlit-1.27.1 → openlit-1.28.0}/src/openlit/instrumentation/elevenlabs/async_elevenlabs.py
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{openlit-1.27.1 → openlit-1.28.0}/src/openlit/instrumentation/google_ai_studio/google_ai_studio.py
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|