gpustack-runtime 0.1.39.post2__py3-none-any.whl → 0.1.40__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gpustack_runtime/__main__.py +7 -3
- gpustack_runtime/_version.py +2 -2
- gpustack_runtime/_version_appendix.py +1 -1
- gpustack_runtime/cmds/__init__.py +2 -0
- gpustack_runtime/cmds/deployer.py +84 -2
- gpustack_runtime/cmds/images.py +2 -0
- gpustack_runtime/deployer/__init__.py +2 -0
- gpustack_runtime/deployer/__types__.py +52 -28
- gpustack_runtime/deployer/__utils__.py +99 -112
- gpustack_runtime/deployer/cdi/__init__.py +81 -0
- gpustack_runtime/deployer/cdi/__types__.py +667 -0
- gpustack_runtime/deployer/cdi/thead.py +103 -0
- gpustack_runtime/deployer/docker.py +42 -24
- gpustack_runtime/deployer/kuberentes.py +8 -4
- gpustack_runtime/deployer/podman.py +41 -23
- gpustack_runtime/detector/__init__.py +62 -3
- gpustack_runtime/detector/__types__.py +11 -0
- gpustack_runtime/detector/__utils__.py +23 -0
- gpustack_runtime/detector/amd.py +17 -9
- gpustack_runtime/detector/hygon.py +6 -1
- gpustack_runtime/detector/iluvatar.py +20 -5
- gpustack_runtime/detector/mthreads.py +8 -12
- gpustack_runtime/detector/nvidia.py +365 -168
- gpustack_runtime/detector/pyacl/__init__.py +9 -1
- gpustack_runtime/detector/pyamdgpu/__init__.py +8 -0
- gpustack_runtime/detector/pycuda/__init__.py +9 -1
- gpustack_runtime/detector/pydcmi/__init__.py +9 -2
- gpustack_runtime/detector/pyhgml/__init__.py +5879 -0
- gpustack_runtime/detector/pyhgml/libhgml.so +0 -0
- gpustack_runtime/detector/pyhgml/libuki.so +0 -0
- gpustack_runtime/detector/pyhsa/__init__.py +9 -0
- gpustack_runtime/detector/pyixml/__init__.py +89 -164
- gpustack_runtime/detector/pyrocmcore/__init__.py +42 -24
- gpustack_runtime/detector/pyrocmsmi/__init__.py +141 -138
- gpustack_runtime/detector/thead.py +733 -0
- gpustack_runtime/envs.py +128 -55
- {gpustack_runtime-0.1.39.post2.dist-info → gpustack_runtime-0.1.40.dist-info}/METADATA +4 -2
- gpustack_runtime-0.1.40.dist-info/RECORD +55 -0
- gpustack_runtime/detector/pymtml/__init__.py +0 -770
- gpustack_runtime-0.1.39.post2.dist-info/RECORD +0 -49
- {gpustack_runtime-0.1.39.post2.dist-info → gpustack_runtime-0.1.40.dist-info}/WHEEL +0 -0
- {gpustack_runtime-0.1.39.post2.dist-info → gpustack_runtime-0.1.40.dist-info}/entry_points.txt +0 -0
- {gpustack_runtime-0.1.39.post2.dist-info → gpustack_runtime-0.1.40.dist-info}/licenses/LICENSE +0 -0
gpustack_runtime/detector/amd.py
CHANGED
|
@@ -16,6 +16,7 @@ from .__utils__ import (
|
|
|
16
16
|
get_brief_version,
|
|
17
17
|
get_numa_node_by_bdf,
|
|
18
18
|
get_pci_devices,
|
|
19
|
+
get_physical_function_by_bdf,
|
|
19
20
|
get_utilization,
|
|
20
21
|
map_numa_node_to_cpu_affinity,
|
|
21
22
|
)
|
|
@@ -107,7 +108,11 @@ class AMDDetector(Detector):
|
|
|
107
108
|
asic_serial = dev_gpu_asic_info.get("asic_serial")
|
|
108
109
|
dev_uuid = f"GPU-{(asic_serial[2:]).lower()}"
|
|
109
110
|
else:
|
|
110
|
-
dev_uuid =
|
|
111
|
+
dev_uuid = ""
|
|
112
|
+
with contextlib.suppress(pyrocmsmi.ROCMSMIError):
|
|
113
|
+
dev_uuid = (
|
|
114
|
+
f"GPU-{pyrocmsmi.rsmi_dev_unique_id_get(dev_idx)[2:]}"
|
|
115
|
+
)
|
|
111
116
|
dev_hsa_agent = hsa_agents.get(dev_uuid, pyhsa.Agent())
|
|
112
117
|
|
|
113
118
|
dev_gpu_driver_info = pyamdsmi.amdsmi_get_gpu_driver_info(dev)
|
|
@@ -119,8 +124,13 @@ class AMDDetector(Detector):
|
|
|
119
124
|
|
|
120
125
|
dev_cc = dev_hsa_agent.compute_capability
|
|
121
126
|
if not dev_cc:
|
|
122
|
-
|
|
123
|
-
dev_cc =
|
|
127
|
+
if "target_graphics_version" in dev_gpu_asic_info:
|
|
128
|
+
dev_cc = dev_gpu_asic_info.get("target_graphics_version")
|
|
129
|
+
else:
|
|
130
|
+
with contextlib.suppress(pyrocmsmi.ROCMSMIError):
|
|
131
|
+
dev_cc = pyrocmsmi.rsmi_dev_target_graphics_version_get(
|
|
132
|
+
dev_idx,
|
|
133
|
+
)
|
|
124
134
|
|
|
125
135
|
dev_bdf = None
|
|
126
136
|
dev_card_id = None
|
|
@@ -195,15 +205,13 @@ class AMDDetector(Detector):
|
|
|
195
205
|
dev_power = pyrocmsmi.rsmi_dev_power_cap_get(dev_idx)
|
|
196
206
|
dev_power_used = pyrocmsmi.rsmi_dev_power_get(dev_idx)
|
|
197
207
|
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
dev,
|
|
202
|
-
)
|
|
208
|
+
dev_is_vgpu = False
|
|
209
|
+
if dev_bdf:
|
|
210
|
+
dev_is_vgpu = get_physical_function_by_bdf(dev_bdf) != dev_bdf
|
|
203
211
|
|
|
204
212
|
dev_appendix = {
|
|
205
213
|
"arch_family": _get_arch_family(dev_asic_family_id),
|
|
206
|
-
"vgpu":
|
|
214
|
+
"vgpu": dev_is_vgpu,
|
|
207
215
|
}
|
|
208
216
|
if dev_bdf:
|
|
209
217
|
dev_appendix["bdf"] = dev_bdf
|
|
@@ -16,6 +16,7 @@ from .__utils__ import (
|
|
|
16
16
|
get_brief_version,
|
|
17
17
|
get_numa_node_by_bdf,
|
|
18
18
|
get_pci_devices,
|
|
19
|
+
get_physical_function_by_bdf,
|
|
19
20
|
get_utilization,
|
|
20
21
|
map_numa_node_to_cpu_affinity,
|
|
21
22
|
)
|
|
@@ -156,8 +157,12 @@ class HygonDetector(Detector):
|
|
|
156
157
|
dev_power = pyrocmsmi.rsmi_dev_power_cap_get(dev_idx)
|
|
157
158
|
dev_power_used = pyrocmsmi.rsmi_dev_power_get(dev_idx)
|
|
158
159
|
|
|
160
|
+
dev_is_vgpu = False
|
|
161
|
+
if dev_bdf:
|
|
162
|
+
dev_is_vgpu = get_physical_function_by_bdf(dev_bdf) != dev_bdf
|
|
163
|
+
|
|
159
164
|
dev_appendix = {
|
|
160
|
-
"vgpu":
|
|
165
|
+
"vgpu": dev_is_vgpu,
|
|
161
166
|
}
|
|
162
167
|
if dev_bdf is not None:
|
|
163
168
|
dev_appendix["bdf"] = dev_bdf
|
|
@@ -23,9 +23,9 @@ from .__utils__ import (
|
|
|
23
23
|
get_numa_node_by_bdf,
|
|
24
24
|
get_numa_nodeset_size,
|
|
25
25
|
get_pci_devices,
|
|
26
|
+
get_physical_function_by_bdf,
|
|
26
27
|
get_utilization,
|
|
27
28
|
map_numa_node_to_cpu_affinity,
|
|
28
|
-
support_command,
|
|
29
29
|
)
|
|
30
30
|
|
|
31
31
|
logger = logging.getLogger(__name__)
|
|
@@ -54,8 +54,14 @@ class IluvatarDetector(Detector):
|
|
|
54
54
|
pci_devs = IluvatarDetector.detect_pci_devices()
|
|
55
55
|
if not pci_devs and not envs.GPUSTACK_RUNTIME_DETECT_NO_PCI_CHECK:
|
|
56
56
|
logger.debug("No Iluvatar PCI devices found")
|
|
57
|
+
return supported
|
|
57
58
|
|
|
58
|
-
|
|
59
|
+
try:
|
|
60
|
+
pyixml.nvmlInit()
|
|
61
|
+
pyixml.nvmlShutdown()
|
|
62
|
+
supported = True
|
|
63
|
+
except pyixml.NVMLError:
|
|
64
|
+
debug_log_exception(logger, "Failed to initialize IXML library")
|
|
59
65
|
|
|
60
66
|
return supported
|
|
61
67
|
|
|
@@ -73,7 +79,7 @@ class IluvatarDetector(Detector):
|
|
|
73
79
|
|
|
74
80
|
def detect(self) -> Devices | None:
|
|
75
81
|
"""
|
|
76
|
-
Detect Iluvatar GPUs using
|
|
82
|
+
Detect Iluvatar GPUs using pyixml.
|
|
77
83
|
|
|
78
84
|
Returns:
|
|
79
85
|
A list of detected Iluvatar GPU devices,
|
|
@@ -165,13 +171,20 @@ class IluvatarDetector(Detector):
|
|
|
165
171
|
if dev_cc_t:
|
|
166
172
|
dev_cc = ".".join(map(str, dev_cc_t))
|
|
167
173
|
|
|
174
|
+
dev_bdf = None
|
|
175
|
+
with contextlib.suppress(pyixml.NVMLError):
|
|
176
|
+
dev_pci_info = pyixml.nvmlDeviceGetPciInfo(dev)
|
|
177
|
+
dev_bdf = str(dev_pci_info.busIdLegacy).lower()
|
|
178
|
+
|
|
168
179
|
dev_is_vgpu = False
|
|
169
|
-
|
|
180
|
+
if dev_bdf:
|
|
181
|
+
dev_is_vgpu = get_physical_function_by_bdf(dev_bdf) != dev_bdf
|
|
170
182
|
|
|
171
183
|
dev_appendix = {
|
|
172
184
|
"vgpu": dev_is_vgpu,
|
|
173
|
-
"bdf": str(dev_pci_info.busIdLegacy).lower(),
|
|
174
185
|
}
|
|
186
|
+
if dev_bdf:
|
|
187
|
+
dev_appendix["bdf"] = dev_bdf
|
|
175
188
|
|
|
176
189
|
ret.append(
|
|
177
190
|
Device(
|
|
@@ -229,6 +242,8 @@ class IluvatarDetector(Detector):
|
|
|
229
242
|
)
|
|
230
243
|
|
|
231
244
|
try:
|
|
245
|
+
pyixml.nvmlInit()
|
|
246
|
+
|
|
232
247
|
for i, dev_i in enumerate(devices):
|
|
233
248
|
dev_i_handle = pyixml.nvmlDeviceGetHandleByUUID(dev_i.uuid)
|
|
234
249
|
|
|
@@ -3,9 +3,10 @@ from __future__ import annotations
|
|
|
3
3
|
import logging
|
|
4
4
|
from functools import lru_cache
|
|
5
5
|
|
|
6
|
+
import pymtml
|
|
7
|
+
|
|
6
8
|
from .. import envs
|
|
7
9
|
from ..logging import debug_log_exception, debug_log_warning
|
|
8
|
-
from . import pymtml
|
|
9
10
|
from .__types__ import (
|
|
10
11
|
Detector,
|
|
11
12
|
Device,
|
|
@@ -105,9 +106,8 @@ class MThreadsDetector(Detector):
|
|
|
105
106
|
|
|
106
107
|
try:
|
|
107
108
|
pymtml.mtmlLibraryInit()
|
|
108
|
-
|
|
109
|
-
sys_driver_ver = pymtml.mtmlSystemGetDriverVersion()
|
|
110
|
-
|
|
109
|
+
system = pymtml.mtmlLibraryInitSystem()
|
|
110
|
+
sys_driver_ver = pymtml.mtmlSystemGetDriverVersion(system)
|
|
111
111
|
dev_count = pymtml.mtmlLibraryCountDevice()
|
|
112
112
|
for dev_idx in range(dev_count):
|
|
113
113
|
dev_index = dev_idx
|
|
@@ -139,25 +139,20 @@ class MThreadsDetector(Detector):
|
|
|
139
139
|
|
|
140
140
|
dev_mem = 0
|
|
141
141
|
dev_mem_used = 0
|
|
142
|
-
|
|
143
|
-
try:
|
|
142
|
+
with pymtml.mtmlMemoryContext(dev) as devmem:
|
|
144
143
|
dev_mem = byte_to_mebibyte( # byte to MiB
|
|
145
144
|
pymtml.mtmlMemoryGetTotal(devmem),
|
|
146
145
|
)
|
|
147
146
|
dev_mem_used = byte_to_mebibyte( # byte to MiB
|
|
148
147
|
pymtml.mtmlMemoryGetUsed(devmem),
|
|
149
148
|
)
|
|
150
|
-
finally:
|
|
151
|
-
pymtml.mtmlDeviceFreeMemory(devmem)
|
|
152
149
|
|
|
153
150
|
dev_cores_util = None
|
|
154
151
|
dev_temp = None
|
|
155
|
-
|
|
156
|
-
try:
|
|
152
|
+
with pymtml.mtmlGpuContext(dev) as devgpu:
|
|
157
153
|
dev_cores_util = pymtml.mtmlGpuGetUtilization(devgpu)
|
|
158
154
|
dev_temp = pymtml.mtmlGpuGetTemperature(devgpu)
|
|
159
|
-
|
|
160
|
-
pymtml.mtmlDeviceFreeGpu(devgpu)
|
|
155
|
+
|
|
161
156
|
if dev_cores_util is None:
|
|
162
157
|
debug_log_warning(
|
|
163
158
|
logger,
|
|
@@ -198,6 +193,7 @@ class MThreadsDetector(Detector):
|
|
|
198
193
|
debug_log_exception(logger, "Failed to process devices fetching")
|
|
199
194
|
raise
|
|
200
195
|
finally:
|
|
196
|
+
pymtml.mtmlLibraryFreeSystem(system)
|
|
201
197
|
pymtml.mtmlLibraryShutDown()
|
|
202
198
|
|
|
203
199
|
return ret
|