gpustack-runtime 0.1.39.post2__py3-none-any.whl → 0.1.40__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43):
  1. gpustack_runtime/__main__.py +7 -3
  2. gpustack_runtime/_version.py +2 -2
  3. gpustack_runtime/_version_appendix.py +1 -1
  4. gpustack_runtime/cmds/__init__.py +2 -0
  5. gpustack_runtime/cmds/deployer.py +84 -2
  6. gpustack_runtime/cmds/images.py +2 -0
  7. gpustack_runtime/deployer/__init__.py +2 -0
  8. gpustack_runtime/deployer/__types__.py +52 -28
  9. gpustack_runtime/deployer/__utils__.py +99 -112
  10. gpustack_runtime/deployer/cdi/__init__.py +81 -0
  11. gpustack_runtime/deployer/cdi/__types__.py +667 -0
  12. gpustack_runtime/deployer/cdi/thead.py +103 -0
  13. gpustack_runtime/deployer/docker.py +42 -24
  14. gpustack_runtime/deployer/kuberentes.py +8 -4
  15. gpustack_runtime/deployer/podman.py +41 -23
  16. gpustack_runtime/detector/__init__.py +62 -3
  17. gpustack_runtime/detector/__types__.py +11 -0
  18. gpustack_runtime/detector/__utils__.py +23 -0
  19. gpustack_runtime/detector/amd.py +17 -9
  20. gpustack_runtime/detector/hygon.py +6 -1
  21. gpustack_runtime/detector/iluvatar.py +20 -5
  22. gpustack_runtime/detector/mthreads.py +8 -12
  23. gpustack_runtime/detector/nvidia.py +365 -168
  24. gpustack_runtime/detector/pyacl/__init__.py +9 -1
  25. gpustack_runtime/detector/pyamdgpu/__init__.py +8 -0
  26. gpustack_runtime/detector/pycuda/__init__.py +9 -1
  27. gpustack_runtime/detector/pydcmi/__init__.py +9 -2
  28. gpustack_runtime/detector/pyhgml/__init__.py +5879 -0
  29. gpustack_runtime/detector/pyhgml/libhgml.so +0 -0
  30. gpustack_runtime/detector/pyhgml/libuki.so +0 -0
  31. gpustack_runtime/detector/pyhsa/__init__.py +9 -0
  32. gpustack_runtime/detector/pyixml/__init__.py +89 -164
  33. gpustack_runtime/detector/pyrocmcore/__init__.py +42 -24
  34. gpustack_runtime/detector/pyrocmsmi/__init__.py +141 -138
  35. gpustack_runtime/detector/thead.py +733 -0
  36. gpustack_runtime/envs.py +128 -55
  37. {gpustack_runtime-0.1.39.post2.dist-info → gpustack_runtime-0.1.40.dist-info}/METADATA +4 -2
  38. gpustack_runtime-0.1.40.dist-info/RECORD +55 -0
  39. gpustack_runtime/detector/pymtml/__init__.py +0 -770
  40. gpustack_runtime-0.1.39.post2.dist-info/RECORD +0 -49
  41. {gpustack_runtime-0.1.39.post2.dist-info → gpustack_runtime-0.1.40.dist-info}/WHEEL +0 -0
  42. {gpustack_runtime-0.1.39.post2.dist-info → gpustack_runtime-0.1.40.dist-info}/entry_points.txt +0 -0
  43. {gpustack_runtime-0.1.39.post2.dist-info → gpustack_runtime-0.1.40.dist-info}/licenses/LICENSE +0 -0
@@ -16,6 +16,7 @@ from .__utils__ import (
16
16
  get_brief_version,
17
17
  get_numa_node_by_bdf,
18
18
  get_pci_devices,
19
+ get_physical_function_by_bdf,
19
20
  get_utilization,
20
21
  map_numa_node_to_cpu_affinity,
21
22
  )
@@ -107,7 +108,11 @@ class AMDDetector(Detector):
107
108
  asic_serial = dev_gpu_asic_info.get("asic_serial")
108
109
  dev_uuid = f"GPU-{(asic_serial[2:]).lower()}"
109
110
  else:
110
- dev_uuid = f"GPU-{pyrocmsmi.rsmi_dev_unique_id_get(dev_idx)[2:]}"
111
+ dev_uuid = ""
112
+ with contextlib.suppress(pyrocmsmi.ROCMSMIError):
113
+ dev_uuid = (
114
+ f"GPU-{pyrocmsmi.rsmi_dev_unique_id_get(dev_idx)[2:]}"
115
+ )
111
116
  dev_hsa_agent = hsa_agents.get(dev_uuid, pyhsa.Agent())
112
117
 
113
118
  dev_gpu_driver_info = pyamdsmi.amdsmi_get_gpu_driver_info(dev)
@@ -119,8 +124,13 @@ class AMDDetector(Detector):
119
124
 
120
125
  dev_cc = dev_hsa_agent.compute_capability
121
126
  if not dev_cc:
122
- with contextlib.suppress(pyrocmsmi.ROCMSMIError):
123
- dev_cc = pyrocmsmi.rsmi_dev_target_graphics_version_get(dev_idx)
127
+ if "target_graphics_version" in dev_gpu_asic_info:
128
+ dev_cc = dev_gpu_asic_info.get("target_graphics_version")
129
+ else:
130
+ with contextlib.suppress(pyrocmsmi.ROCMSMIError):
131
+ dev_cc = pyrocmsmi.rsmi_dev_target_graphics_version_get(
132
+ dev_idx,
133
+ )
124
134
 
125
135
  dev_bdf = None
126
136
  dev_card_id = None
@@ -195,15 +205,13 @@ class AMDDetector(Detector):
195
205
  dev_power = pyrocmsmi.rsmi_dev_power_cap_get(dev_idx)
196
206
  dev_power_used = pyrocmsmi.rsmi_dev_power_get(dev_idx)
197
207
 
198
- dev_compute_partition = None
199
- with contextlib.suppress(pyamdsmi.AmdSmiException):
200
- dev_compute_partition = pyamdsmi.amdsmi_get_gpu_compute_partition(
201
- dev,
202
- )
208
+ dev_is_vgpu = False
209
+ if dev_bdf:
210
+ dev_is_vgpu = get_physical_function_by_bdf(dev_bdf) != dev_bdf
203
211
 
204
212
  dev_appendix = {
205
213
  "arch_family": _get_arch_family(dev_asic_family_id),
206
- "vgpu": dev_compute_partition is not None,
214
+ "vgpu": dev_is_vgpu,
207
215
  }
208
216
  if dev_bdf:
209
217
  dev_appendix["bdf"] = dev_bdf
@@ -16,6 +16,7 @@ from .__utils__ import (
16
16
  get_brief_version,
17
17
  get_numa_node_by_bdf,
18
18
  get_pci_devices,
19
+ get_physical_function_by_bdf,
19
20
  get_utilization,
20
21
  map_numa_node_to_cpu_affinity,
21
22
  )
@@ -156,8 +157,12 @@ class HygonDetector(Detector):
156
157
  dev_power = pyrocmsmi.rsmi_dev_power_cap_get(dev_idx)
157
158
  dev_power_used = pyrocmsmi.rsmi_dev_power_get(dev_idx)
158
159
 
160
+ dev_is_vgpu = False
161
+ if dev_bdf:
162
+ dev_is_vgpu = get_physical_function_by_bdf(dev_bdf) != dev_bdf
163
+
159
164
  dev_appendix = {
160
- "vgpu": False,
165
+ "vgpu": dev_is_vgpu,
161
166
  }
162
167
  if dev_bdf is not None:
163
168
  dev_appendix["bdf"] = dev_bdf
@@ -23,9 +23,9 @@ from .__utils__ import (
23
23
  get_numa_node_by_bdf,
24
24
  get_numa_nodeset_size,
25
25
  get_pci_devices,
26
+ get_physical_function_by_bdf,
26
27
  get_utilization,
27
28
  map_numa_node_to_cpu_affinity,
28
- support_command,
29
29
  )
30
30
 
31
31
  logger = logging.getLogger(__name__)
@@ -54,8 +54,14 @@ class IluvatarDetector(Detector):
54
54
  pci_devs = IluvatarDetector.detect_pci_devices()
55
55
  if not pci_devs and not envs.GPUSTACK_RUNTIME_DETECT_NO_PCI_CHECK:
56
56
  logger.debug("No Iluvatar PCI devices found")
57
+ return supported
57
58
 
58
- supported = support_command("ixsmi")
59
+ try:
60
+ pyixml.nvmlInit()
61
+ pyixml.nvmlShutdown()
62
+ supported = True
63
+ except pyixml.NVMLError:
64
+ debug_log_exception(logger, "Failed to initialize IXML library")
59
65
 
60
66
  return supported
61
67
 
@@ -73,7 +79,7 @@ class IluvatarDetector(Detector):
73
79
 
74
80
  def detect(self) -> Devices | None:
75
81
  """
76
- Detect Iluvatar GPUs using ixsmi tool.
82
+ Detect Iluvatar GPUs using pyixml.
77
83
 
78
84
  Returns:
79
85
  A list of detected Iluvatar GPU devices,
@@ -165,13 +171,20 @@ class IluvatarDetector(Detector):
165
171
  if dev_cc_t:
166
172
  dev_cc = ".".join(map(str, dev_cc_t))
167
173
 
174
+ dev_bdf = None
175
+ with contextlib.suppress(pyixml.NVMLError):
176
+ dev_pci_info = pyixml.nvmlDeviceGetPciInfo(dev)
177
+ dev_bdf = str(dev_pci_info.busIdLegacy).lower()
178
+
168
179
  dev_is_vgpu = False
169
- dev_pci_info = pyixml.nvmlDeviceGetPciInfo(dev)
180
+ if dev_bdf:
181
+ dev_is_vgpu = get_physical_function_by_bdf(dev_bdf) != dev_bdf
170
182
 
171
183
  dev_appendix = {
172
184
  "vgpu": dev_is_vgpu,
173
- "bdf": str(dev_pci_info.busIdLegacy).lower(),
174
185
  }
186
+ if dev_bdf:
187
+ dev_appendix["bdf"] = dev_bdf
175
188
 
176
189
  ret.append(
177
190
  Device(
@@ -229,6 +242,8 @@ class IluvatarDetector(Detector):
229
242
  )
230
243
 
231
244
  try:
245
+ pyixml.nvmlInit()
246
+
232
247
  for i, dev_i in enumerate(devices):
233
248
  dev_i_handle = pyixml.nvmlDeviceGetHandleByUUID(dev_i.uuid)
234
249
 
@@ -3,9 +3,10 @@ from __future__ import annotations
3
3
  import logging
4
4
  from functools import lru_cache
5
5
 
6
+ import pymtml
7
+
6
8
  from .. import envs
7
9
  from ..logging import debug_log_exception, debug_log_warning
8
- from . import pymtml
9
10
  from .__types__ import (
10
11
  Detector,
11
12
  Device,
@@ -105,9 +106,8 @@ class MThreadsDetector(Detector):
105
106
 
106
107
  try:
107
108
  pymtml.mtmlLibraryInit()
108
-
109
- sys_driver_ver = pymtml.mtmlSystemGetDriverVersion()
110
-
109
+ system = pymtml.mtmlLibraryInitSystem()
110
+ sys_driver_ver = pymtml.mtmlSystemGetDriverVersion(system)
111
111
  dev_count = pymtml.mtmlLibraryCountDevice()
112
112
  for dev_idx in range(dev_count):
113
113
  dev_index = dev_idx
@@ -139,25 +139,20 @@ class MThreadsDetector(Detector):
139
139
 
140
140
  dev_mem = 0
141
141
  dev_mem_used = 0
142
- devmem = pymtml.mtmlDeviceInitMemory(dev)
143
- try:
142
+ with pymtml.mtmlMemoryContext(dev) as devmem:
144
143
  dev_mem = byte_to_mebibyte( # byte to MiB
145
144
  pymtml.mtmlMemoryGetTotal(devmem),
146
145
  )
147
146
  dev_mem_used = byte_to_mebibyte( # byte to MiB
148
147
  pymtml.mtmlMemoryGetUsed(devmem),
149
148
  )
150
- finally:
151
- pymtml.mtmlDeviceFreeMemory(devmem)
152
149
 
153
150
  dev_cores_util = None
154
151
  dev_temp = None
155
- devgpu = pymtml.mtmlDeviceInitGpu(dev)
156
- try:
152
+ with pymtml.mtmlGpuContext(dev) as devgpu:
157
153
  dev_cores_util = pymtml.mtmlGpuGetUtilization(devgpu)
158
154
  dev_temp = pymtml.mtmlGpuGetTemperature(devgpu)
159
- finally:
160
- pymtml.mtmlDeviceFreeGpu(devgpu)
155
+
161
156
  if dev_cores_util is None:
162
157
  debug_log_warning(
163
158
  logger,
@@ -198,6 +193,7 @@ class MThreadsDetector(Detector):
198
193
  debug_log_exception(logger, "Failed to process devices fetching")
199
194
  raise
200
195
  finally:
196
+ pymtml.mtmlLibraryFreeSystem(system)
201
197
  pymtml.mtmlLibraryShutDown()
202
198
 
203
199
  return ret