gpustack-runtime 0.1.41.post3__py3-none-any.whl → 0.1.42__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29) hide show
  1. gpustack_runtime/_version.py +2 -2
  2. gpustack_runtime/_version_appendix.py +1 -1
  3. gpustack_runtime/cmds/detector.py +3 -1
  4. gpustack_runtime/deployer/__types__.py +314 -233
  5. gpustack_runtime/deployer/cdi/__utils__.py +4 -1
  6. gpustack_runtime/deployer/docker.py +109 -148
  7. gpustack_runtime/deployer/k8s/deviceplugin/__init__.py +1 -1
  8. gpustack_runtime/deployer/k8s/deviceplugin/plugin.py +20 -19
  9. gpustack_runtime/deployer/kuberentes.py +89 -108
  10. gpustack_runtime/deployer/podman.py +89 -122
  11. gpustack_runtime/detector/__init__.py +2 -0
  12. gpustack_runtime/detector/__types__.py +26 -0
  13. gpustack_runtime/detector/amd.py +28 -8
  14. gpustack_runtime/detector/ascend.py +49 -4
  15. gpustack_runtime/detector/cambricon.py +3 -0
  16. gpustack_runtime/detector/hygon.py +16 -1
  17. gpustack_runtime/detector/iluvatar.py +6 -0
  18. gpustack_runtime/detector/metax.py +8 -0
  19. gpustack_runtime/detector/mthreads.py +11 -0
  20. gpustack_runtime/detector/nvidia.py +139 -134
  21. gpustack_runtime/detector/pyixml/__init__.py +16 -0
  22. gpustack_runtime/detector/pyrocmsmi/__init__.py +14 -0
  23. gpustack_runtime/detector/thead.py +135 -127
  24. gpustack_runtime/envs.py +7 -6
  25. {gpustack_runtime-0.1.41.post3.dist-info → gpustack_runtime-0.1.42.dist-info}/METADATA +2 -2
  26. {gpustack_runtime-0.1.41.post3.dist-info → gpustack_runtime-0.1.42.dist-info}/RECORD +29 -29
  27. {gpustack_runtime-0.1.41.post3.dist-info → gpustack_runtime-0.1.42.dist-info}/WHEEL +0 -0
  28. {gpustack_runtime-0.1.41.post3.dist-info → gpustack_runtime-0.1.42.dist-info}/entry_points.txt +0 -0
  29. {gpustack_runtime-0.1.41.post3.dist-info → gpustack_runtime-0.1.42.dist-info}/licenses/LICENSE +0 -0
@@ -960,6 +960,14 @@ NVML_HOST_VGPU_MODE_SRIOV = 1
960
960
  # GSP firmware
961
961
  NVML_GSP_FIRMWARE_VERSION_BUF_SIZE = 0x40
962
962
 
963
+ # Health
964
+ IXML_HEALTH_SYSHUB_ERROR = 0x0000000000000001
965
+ IXML_HEALTH_MC_ERROR = 0x0000000000000002
966
+ IXML_HEALTH_ECC_ERROR = 0x0000000000000010
967
+ IXML_HEALTH_MEMORY_ERROR = 0x0000000000000020
968
+ IXML_HEALTH_PCIE_ERROR = 0x0000000000000040
969
+ IXML_HEALTH_OK = 0x0000000000000000
970
+
963
971
 
964
972
  ## Error Checking ##
965
973
  class NVMLError(Exception):
@@ -5267,3 +5275,11 @@ def nvmlDeviceGetGpuFabricInfo(device, gpuFabricInfo):
5267
5275
  ret = fn(device, gpuFabricInfo)
5268
5276
  _nvmlCheckReturn(ret)
5269
5277
  return ret
5278
+
5279
+
5280
+ def ixmlDeviceGetHealth(device):
5281
+ c_health = c_longlong()
5282
+ fn = _nvmlGetFunctionPointer("ixmlDeviceGetHealth")
5283
+ ret = fn(device, byref(c_health))
5284
+ _nvmlCheckReturn(ret)
5285
+ return c_health.value
@@ -393,3 +393,17 @@ def rsmi_is_p2p_accessible(device_a=0, device_b=0):
393
393
  )
394
394
  _rocmsmiCheckReturn(ret)
395
395
  return c_accessible.value
396
+
397
+
398
+ def rsmi_dev_ecc_count_get(device=0, gpu_block=None):
399
+ if gpu_block is None:
400
+ gpu_block = rsmi_gpu_block_t.RSMI_GPU_BLOCK_UMC
401
+ c_error_count = rsmi_error_count_t()
402
+ fn = _rocmsmiGetFunctionPointer("rsmi_dev_ecc_count_get")
403
+ ret = fn(
404
+ device,
405
+ gpu_block,
406
+ byref(c_error_count),
407
+ )
408
+ _rocmsmiCheckReturn(ret)
409
+ return c_error_count
@@ -12,6 +12,7 @@ from . import pyhgml
12
12
  from .__types__ import (
13
13
  Detector,
14
14
  Device,
15
+ DeviceMemoryStatusEnum,
15
16
  Devices,
16
17
  ManufacturerEnum,
17
18
  Topology,
@@ -138,17 +139,33 @@ class THeadDetector(Detector):
138
139
  )
139
140
  dev_numa = bitmask_to_str(list(dev_node_affinity))
140
141
 
142
+ dev_temp = None
143
+ with contextlib.suppress(pyhgml.HGMLError):
144
+ dev_temp = pyhgml.hgmlDeviceGetTemperature(
145
+ dev,
146
+ pyhgml.HGML_TEMPERATURE_GPU,
147
+ )
148
+
149
+ dev_power = None
150
+ dev_power_used = None
151
+ with contextlib.suppress(pyhgml.HGMLError):
152
+ dev_power = pyhgml.hgmlDeviceGetPowerManagementDefaultLimit(dev)
153
+ dev_power = dev_power // 1000 # mW to W
154
+ dev_power_used = (
155
+ pyhgml.hgmlDeviceGetPowerUsage(dev) // 1000
156
+ ) # mW to W
157
+
141
158
  dev_mig_mode = pyhgml.HGML_DEVICE_MIG_DISABLE
142
159
  with contextlib.suppress(pyhgml.HGMLError):
143
160
  dev_mig_mode, _ = pyhgml.hgmlDeviceGetMigMode(dev)
144
161
 
162
+ dev_index = dev_idx
163
+ if envs.GPUSTACK_RUNTIME_DETECT_PHYSICAL_INDEX_PRIORITY:
164
+ dev_index = pyhgml.hgmlDeviceGetMinorNumber(dev)
165
+
145
166
  # With MIG disabled, treat as a single device.
146
167
 
147
168
  if dev_mig_mode == pyhgml.HGML_DEVICE_MIG_DISABLE:
148
- dev_index = dev_idx
149
- if envs.GPUSTACK_RUNTIME_DETECT_PHYSICAL_INDEX_PRIORITY:
150
- dev_index = pyhgml.hgmlDeviceGetMinorNumber(dev)
151
-
152
169
  dev_name = pyhgml.hgmlDeviceGetName(dev)
153
170
 
154
171
  dev_uuid = pyhgml.hgmlDeviceGetUUID(dev)
@@ -171,6 +188,7 @@ class THeadDetector(Detector):
171
188
 
172
189
  dev_mem = 0
173
190
  dev_mem_used = 0
191
+ dev_mem_status = DeviceMemoryStatusEnum.HEALTHY
174
192
  with contextlib.suppress(pyhgml.HGMLError):
175
193
  dev_mem_info = pyhgml.hgmlDeviceGetMemoryInfo(dev)
176
194
  dev_mem = byte_to_mebibyte( # byte to MiB
@@ -179,22 +197,14 @@ class THeadDetector(Detector):
179
197
  dev_mem_used = byte_to_mebibyte( # byte to MiB
180
198
  dev_mem_info.used,
181
199
  )
182
-
183
- dev_temp = None
184
- with contextlib.suppress(pyhgml.HGMLError):
185
- dev_temp = pyhgml.hgmlDeviceGetTemperature(
200
+ dev_mem_ecc_errors = pyhgml.hgmlDeviceGetMemoryErrorCounter(
186
201
  dev,
187
- pyhgml.HGML_TEMPERATURE_GPU,
202
+ pyhgml.HGML_MEMORY_ERROR_TYPE_UNCORRECTED,
203
+ pyhgml.HGML_VOLATILE_ECC,
204
+ pyhgml.HGML_MEMORY_LOCATION_DRAM,
188
205
  )
189
-
190
- dev_power = None
191
- dev_power_used = None
192
- with contextlib.suppress(pyhgml.HGMLError):
193
- dev_power = pyhgml.hgmlDeviceGetPowerManagementDefaultLimit(dev)
194
- dev_power = dev_power // 1000 # mW to W
195
- dev_power_used = (
196
- pyhgml.hgmlDeviceGetPowerUsage(dev) // 1000
197
- ) # mW to W
206
+ if dev_mem_ecc_errors > 0:
207
+ dev_mem_status = DeviceMemoryStatusEnum.UNHEALTHY
198
208
 
199
209
  dev_is_vgpu = False
200
210
  if dev_bdf:
@@ -221,6 +231,7 @@ class THeadDetector(Detector):
221
231
  memory=dev_mem,
222
232
  memory_used=dev_mem_used,
223
233
  memory_utilization=get_utilization(dev_mem_used, dev_mem),
234
+ memory_status=dev_mem_status,
224
235
  temperature=dev_temp,
225
236
  power=dev_power,
226
237
  power_used=dev_power_used,
@@ -236,35 +247,34 @@ class THeadDetector(Detector):
236
247
  mdev_cores = None
237
248
  mdev_count = pyhgml.hgmlDeviceGetMaxMigDeviceCount(dev)
238
249
  for mdev_idx in range(mdev_count):
239
- mdev = pyhgml.hgmlDeviceGetMigDeviceHandleByIndex(dev, mdev_idx)
250
+ mdev = None
251
+ with contextlib.suppress(pyhgml.HGMLError):
252
+ mdev = pyhgml.hgmlDeviceGetMigDeviceHandleByIndex(dev, mdev_idx)
253
+ if not mdev:
254
+ continue
240
255
 
241
- mdev_index = mdev_idx
256
+ mdev_index = mdev_idx + dev_count * (dev_idx + 1)
242
257
  mdev_uuid = pyhgml.hgmlDeviceGetUUID(mdev)
243
258
 
244
- mdev_mem, mdev_mem_used = 0, 0
259
+ mdev_mem = 0
260
+ mdev_mem_used = 0
261
+ mdev_mem_status = DeviceMemoryStatusEnum.HEALTHY
245
262
  with contextlib.suppress(pyhgml.HGMLError):
246
263
  mdev_mem_info = pyhgml.hgmlDeviceGetMemoryInfo(mdev)
247
- byte_to_mebibyte( # byte to MiB
264
+ mdev_mem = byte_to_mebibyte( # byte to MiB
248
265
  mdev_mem_info.total,
249
266
  )
250
- byte_to_mebibyte( # byte to MiB
267
+ mdev_mem_used = byte_to_mebibyte( # byte to MiB
251
268
  mdev_mem_info.used,
252
269
  )
253
-
254
- mdev_temp = pyhgml.hgmlDeviceGetTemperature(
255
- mdev,
256
- pyhgml.HGML_TEMPERATURE_GPU,
257
- )
258
-
259
- mdev_power = None
260
- with contextlib.suppress(pyhgml.HGMLError):
261
- mdev_power = pyhgml.hgmlDeviceGetPowerManagementDefaultLimit(
270
+ mdev_mem_ecc_errors = pyhgml.hgmlDeviceGetMemoryErrorCounter(
262
271
  mdev,
272
+ pyhgml.HGML_MEMORY_ERROR_TYPE_UNCORRECTED,
273
+ pyhgml.HGML_AGGREGATE_ECC,
274
+ pyhgml.HGML_MEMORY_LOCATION_SRAM,
263
275
  )
264
- mdev_power = mdev_power // 1000 # mW to W
265
- mdev_power_used = (
266
- pyhgml.hgmlDeviceGetPowerUsage(mdev) // 1000
267
- ) # mW to W
276
+ if mdev_mem_ecc_errors > 0:
277
+ mdev_mem_status = DeviceMemoryStatusEnum.UNHEALTHY
268
278
 
269
279
  mdev_appendix = {
270
280
  "vgpu": True,
@@ -279,63 +289,64 @@ class THeadDetector(Detector):
279
289
 
280
290
  mdev_cores_util = _get_sm_util_from_gpm_metrics(dev, mdev_gi_id)
281
291
 
282
- if not mdev_name:
283
- mdev_gi = pyhgml.hgmlDeviceGetGpuInstanceById(dev, mdev_gi_id)
284
- mdev_ci = pyhgml.hgmlGpuInstanceGetComputeInstanceById(
285
- mdev_gi,
286
- mdev_ci_id,
287
- )
288
- mdev_gi_info = pyhgml.hgmlGpuInstanceGetInfo(mdev_gi)
289
- mdev_ci_info = pyhgml.hgmlComputeInstanceGetInfo(mdev_ci)
290
- for dev_gi_prf_id in range(
291
- pyhgml.HGML_GPU_INSTANCE_PROFILE_COUNT,
292
- ):
293
- try:
294
- dev_gi_prf = pyhgml.hgmlDeviceGetGpuInstanceProfileInfo(
295
- dev,
296
- dev_gi_prf_id,
297
- )
298
- if dev_gi_prf.id != mdev_gi_info.profileId:
299
- continue
300
- except pyhgml.HGMLError:
292
+ mdev_gi = pyhgml.hgmlDeviceGetGpuInstanceById(dev, mdev_gi_id)
293
+ mdev_ci = pyhgml.hgmlGpuInstanceGetComputeInstanceById(
294
+ mdev_gi,
295
+ mdev_ci_id,
296
+ )
297
+ mdev_gi_info = pyhgml.hgmlGpuInstanceGetInfo(mdev_gi)
298
+ mdev_ci_info = pyhgml.hgmlComputeInstanceGetInfo(mdev_ci)
299
+ for dev_gi_prf_id in range(
300
+ pyhgml.HGML_GPU_INSTANCE_PROFILE_COUNT,
301
+ ):
302
+ try:
303
+ dev_gi_prf = pyhgml.hgmlDeviceGetGpuInstanceProfileInfo(
304
+ dev,
305
+ dev_gi_prf_id,
306
+ )
307
+ if dev_gi_prf.id != mdev_gi_info.profileId:
301
308
  continue
309
+ except pyhgml.HGMLError:
310
+ continue
302
311
 
303
- for dev_ci_prf_id in range(
304
- pyhgml.HGML_COMPUTE_INSTANCE_PROFILE_COUNT,
312
+ for dev_ci_prf_id in range(
313
+ pyhgml.HGML_COMPUTE_INSTANCE_PROFILE_COUNT,
314
+ ):
315
+ for dev_cig_prf_id in range(
316
+ pyhgml.HGML_COMPUTE_INSTANCE_ENGINE_PROFILE_COUNT,
305
317
  ):
306
- for dev_cig_prf_id in range(
307
- pyhgml.HGML_COMPUTE_INSTANCE_ENGINE_PROFILE_COUNT,
308
- ):
309
- try:
310
- mdev_ci_prf = pyhgml.hgmlGpuInstanceGetComputeInstanceProfileInfo(
311
- mdev_gi,
312
- dev_ci_prf_id,
313
- dev_cig_prf_id,
314
- )
315
- if mdev_ci_prf.id != mdev_ci_info.profileId:
316
- continue
317
- except pyhgml.HGMLError:
318
- continue
319
-
320
- ci_slice = _get_compute_instance_slice(
318
+ try:
319
+ mdev_ci_prf = pyhgml.hgmlGpuInstanceGetComputeInstanceProfileInfo(
320
+ mdev_gi,
321
321
  dev_ci_prf_id,
322
+ dev_cig_prf_id,
322
323
  )
323
- gi_slice = _get_gpu_instance_slice(dev_gi_prf_id)
324
- gi_mem = _get_gpu_instance_memory(
325
- dev_mem_info,
326
- dev_gi_prf,
327
- )
324
+ if mdev_ci_prf.id != mdev_ci_info.profileId:
325
+ continue
326
+ except pyhgml.HGMLError:
327
+ continue
328
328
 
329
- if ci_slice == gi_slice:
330
- mdev_name = f"{gi_slice}g.{gi_mem}gb"
329
+ ci_slice = _get_compute_instance_slice(dev_ci_prf_id)
330
+ gi_slice = _get_gpu_instance_slice(dev_gi_prf_id)
331
+ if ci_slice == gi_slice:
332
+ if hasattr(dev_gi_prf, "name"):
333
+ mdev_name = dev_gi_prf.name
331
334
  else:
332
- mdev_name = (
333
- f"{ci_slice}u.{gi_slice}g.{gi_mem}gb"
335
+ gi_mem = round(
336
+ math.ceil(dev_gi_prf.memorySizeMB >> 10),
334
337
  )
338
+ mdev_name = f"{gi_slice}g.{gi_mem}gb"
339
+ elif hasattr(mdev_ci_prf, "name"):
340
+ mdev_name = mdev_ci_prf.name
341
+ else:
342
+ gi_mem = round(
343
+ math.ceil(dev_gi_prf.memorySizeMB >> 10),
344
+ )
345
+ mdev_name = f"{ci_slice}u.{gi_slice}g.{gi_mem}gb"
335
346
 
336
- mdev_cores = mdev_ci_prf.multiprocessorCount
347
+ mdev_cores = mdev_ci_prf.multiprocessorCount
337
348
 
338
- break
349
+ break
339
350
 
340
351
  ret.append(
341
352
  Device(
@@ -352,9 +363,10 @@ class THeadDetector(Detector):
352
363
  memory=mdev_mem,
353
364
  memory_used=mdev_mem_used,
354
365
  memory_utilization=get_utilization(mdev_mem_used, mdev_mem),
355
- temperature=mdev_temp,
356
- power=mdev_power,
357
- power_used=mdev_power_used,
366
+ memory_status=mdev_mem_status,
367
+ temperature=dev_temp,
368
+ power=dev_power,
369
+ power_used=dev_power_used,
358
370
  appendix=mdev_appendix,
359
371
  ),
360
372
  )
@@ -392,11 +404,17 @@ class THeadDetector(Detector):
392
404
  devices_count=len(devices),
393
405
  )
394
406
 
407
+ get_links_cache = {}
408
+
395
409
  try:
396
410
  pyhgml.hgmlInit()
397
411
 
398
412
  for i, dev_i in enumerate(devices):
399
- dev_i_handle = pyhgml.hgmlDeviceGetHandleByUUID(dev_i.uuid)
413
+ dev_i_bdf = dev_i.appendix.get("bdf")
414
+ if dev_i.appendix.get("vgpu", False):
415
+ dev_i_handle = pyhgml.hgmlDeviceGetHandleByPciBusId(dev_i_bdf)
416
+ else:
417
+ dev_i_handle = pyhgml.hgmlDeviceGetHandleByUUID(dev_i.uuid)
400
418
 
401
419
  # Get NUMA and CPU affinities.
402
420
  ret.devices_numa_affinities[i] = dev_i.appendix.get("numa", "")
@@ -405,7 +423,12 @@ class THeadDetector(Detector):
405
423
  )
406
424
 
407
425
  # Get links state if applicable.
408
- if dev_i_links_state := _get_links_state(dev_i_handle):
426
+ if dev_i_bdf in get_links_cache:
427
+ dev_i_links_state = get_links_cache[dev_i_bdf]
428
+ else:
429
+ dev_i_links_state = _get_links_state(dev_i_handle)
430
+ get_links_cache[dev_i_bdf] = dev_i_links_state
431
+ if dev_i_links_state:
409
432
  ret.appendices[i].update(dev_i_links_state)
410
433
  # In practice, if a card has an active *Link,
411
434
  # then other cards in the same machine should be interconnected with it through the *Link.
@@ -422,21 +445,30 @@ class THeadDetector(Detector):
422
445
  if dev_i.index == dev_j.index or ret.devices_distances[i][j] != 0:
423
446
  continue
424
447
 
425
- dev_j_handle = pyhgml.hgmlDeviceGetHandleByUUID(dev_j.uuid)
426
-
427
- distance = TopologyDistanceEnum.UNK
428
- try:
429
- distance = pyhgml.hgmlDeviceGetTopologyCommonAncestor(
430
- dev_i_handle,
431
- dev_j_handle,
432
- )
433
- except pyhgml.HGMLError:
434
- debug_log_exception(
435
- logger,
436
- "Failed to get distance between device %d and %d",
437
- dev_i.index,
438
- dev_j.index,
439
- )
448
+ dev_j_bdf = dev_j.appendix.get("bdf")
449
+ if dev_i_bdf == dev_j_bdf:
450
+ distance = TopologyDistanceEnum.SELF
451
+ else:
452
+ if dev_j.appendix.get("vgpu", False):
453
+ dev_j_handle = pyhgml.hgmlDeviceGetHandleByPciBusId(
454
+ dev_j_bdf,
455
+ )
456
+ else:
457
+ dev_j_handle = pyhgml.hgmlDeviceGetHandleByUUID(dev_j.uuid)
458
+
459
+ distance = TopologyDistanceEnum.UNK
460
+ try:
461
+ distance = pyhgml.hgmlDeviceGetTopologyCommonAncestor(
462
+ dev_i_handle,
463
+ dev_j_handle,
464
+ )
465
+ except pyhgml.HGMLError:
466
+ debug_log_exception(
467
+ logger,
468
+ "Failed to get distance between device %d and %d",
469
+ dev_i.index,
470
+ dev_j.index,
471
+ )
440
472
 
441
473
  ret.devices_distances[i][j] = distance
442
474
  ret.devices_distances[j][i] = distance
@@ -655,30 +687,6 @@ def _get_gpu_instance_slice(dev_gi_prf_id: int) -> int:
655
687
  raise AttributeError(msg)
656
688
 
657
689
 
658
- def _get_gpu_instance_memory(dev_mem, dev_gi_prf) -> int:
659
- """
660
- Compute the memory size of a MIG compute instance in GiB.
661
-
662
- Args:
663
- dev_mem:
664
- The total memory info of the parent GPU device.
665
- dev_gi_prf:
666
- The profile info of the GPU instance.
667
-
668
- Returns:
669
- The memory size in GiB.
670
-
671
- """
672
- mem = dev_gi_prf.memorySizeMB * (1 << 20) # MiB to byte
673
-
674
- gib = round(
675
- math.ceil(mem / dev_mem.total * 8)
676
- / 8
677
- * ((dev_mem.total + (1 << 30) - 1) / (1 << 30)),
678
- )
679
- return gib
680
-
681
-
682
690
  def _get_compute_instance_slice(dev_ci_prf_id: int) -> int:
683
691
  """
684
692
  Get the number of slice for a given Compute Instance Profile ID.
gpustack_runtime/envs.py CHANGED
@@ -246,7 +246,7 @@ if TYPE_CHECKING:
246
246
  GPUSTACK_RUNTIME_DOCKER_CDI_SPECS_GENERATE: bool = True
247
247
  """
248
248
  Generate CDI specifications during deployment when using CDI resource injection policy,
249
- requires `GPUSTACK_RUNTIME_DEPLOY_CDI_SPECS_DIRECTORY` to be existed.
249
+ requires `GPUSTACK_RUNTIME_DEPLOY_CDI_SPECS_DIRECTORY` to exist.
250
250
  Works only when `GPUSTACK_RUNTIME_DOCKER_RESOURCE_INJECTION_POLICY` is set to `CDI`.
251
251
  Using internal knowledge to generate the CDI specifications for deployer,
252
252
  if the output file conflicts with other tools generating CDI specifications(e.g., NVIDIA Container Toolkit),
@@ -283,7 +283,7 @@ if TYPE_CHECKING:
283
283
  Resource injection policy for the Kubernetes deployer (e.g., Auto, Env, KDP).
284
284
  `Auto`: Automatically choose the resource injection policy based on the environment.
285
285
  `Env`: Injects resources using standard environment variable, depends on underlying Container Toolkit, based on `GPUSTACK_RUNTIME_DEPLOY_RESOURCE_KEY_MAP_RUNTIME_VISIBLE_DEVICES`.
286
- `KDP`: Injects resources using Kubernetes Device Plugin, based on `GPUSTACK_RUNTIME_DEPLOY_RESOURCE_KEY_MAP_CDI`.
286
+ `KDP`: Injects resources using Kubernetes Device Plugin.
287
287
  """
288
288
  GPUSTACK_RUNTIME_KUBERNETES_KDP_PER_DEVICE_MAX_ALLOCATIONS: int | None = None
289
289
  """
@@ -294,14 +294,14 @@ if TYPE_CHECKING:
294
294
  """
295
295
  Device allocation policy for the Kubernetes Device Plugin (e.g., CDI, Env, Opaque).
296
296
  `Auto`: Automatically choose the device allocation policy based on the environment.
297
- `Env`: Allocates devices using runtime-visible environment variables; requires Container Toolkit support.
298
- `CDI`: Allocates devices using generated CDI specifications, making it easy to debug and troubleshoot; requires `GPUSTACK_RUNTIME_DEPLOY_CDI_SPECS_DIRECTORY` to exist.
297
+ `Env`: Allocates devices using runtime-visible environment variables, based on `GPUSTACK_RUNTIME_DEPLOY_RESOURCE_KEY_MAP_RUNTIME_VISIBLE_DEVICES`; requires Container Toolkit support.
298
+ `CDI`: Allocates devices using generated CDI specifications, based on `GPUSTACK_RUNTIME_DEPLOY_RESOURCE_KEY_MAP_CDI`, making it easy to debug and troubleshoot; requires `GPUSTACK_RUNTIME_DEPLOY_CDI_SPECS_DIRECTORY` to exist.
299
299
  `Opaque`: Uses internal logic for allocation, which is convenient for deployment but difficult to troubleshoot.
300
300
  """
301
301
  GPUSTACK_RUNTIME_KUBERNETES_KDP_CDI_SPECS_GENERATE: bool = True
302
302
  """
303
303
  Generate CDI specifications during deployment,
304
- requires `GPUSTACK_RUNTIME_DEPLOY_CDI_SPECS_DIRECTORY` to be existed.
304
+ requires `GPUSTACK_RUNTIME_DEPLOY_CDI_SPECS_DIRECTORY` to exist.
305
305
  Works only when `GPUSTACK_RUNTIME_KUBERNETES_KDP_DEVICE_ALLOCATION_POLICY` is set to `CDI`.
306
306
  Using internal knowledge to generate the CDI specifications for deployer,
307
307
  if the output file conflicts with other tools generating CDI specifications(e.g., NVIDIA Container Toolkit),
@@ -344,7 +344,7 @@ if TYPE_CHECKING:
344
344
  GPUSTACK_RUNTIME_PODMAN_CDI_SPECS_GENERATE: bool = True
345
345
  """
346
346
  Generate CDI specifications during deployment,
347
- requires `GPUSTACK_RUNTIME_DEPLOY_CDI_SPECS_DIRECTORY` to be existed.
347
+ requires `GPUSTACK_RUNTIME_DEPLOY_CDI_SPECS_DIRECTORY` to exist.
348
348
  Using internal knowledge to generate the CDI specifications for deployer,
349
349
  if the output file conflicts with other tools generating CDI specifications(e.g., NVIDIA Container Toolkit),
350
350
  please disable this and remove the output file manually.
@@ -577,6 +577,7 @@ variables: dict[str, Callable[[], Any]] = {
577
577
  "GPUSTACK_RUNTIME_DEPLOY_RUNTIME_VISIBLE_DEVICES_VALUE_UUID": lambda: to_set(
578
578
  getenv(
579
579
  "GPUSTACK_RUNTIME_DEPLOY_RUNTIME_VISIBLE_DEVICES_VALUE_UUID",
580
+ "NVIDIA_VISIBLE_DEVICES",
580
581
  ),
581
582
  sep=",",
582
583
  ),
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: gpustack-runtime
3
- Version: 0.1.41.post3
3
+ Version: 0.1.42
4
4
  Summary: GPUStack Runtime is library for detecting GPU resources and launching GPU workloads.
5
5
  Project-URL: Homepage, https://github.com/gpustack/runtime
6
6
  Project-URL: Bug Tracker, https://github.com/gpustack/gpustack/issues
@@ -16,7 +16,7 @@ Requires-Python: >=3.10
16
16
  Requires-Dist: argcomplete>=3.6.3
17
17
  Requires-Dist: cachetools>=5.5.2
18
18
  Requires-Dist: docker>=7.1.0
19
- Requires-Dist: gpustack-runner>=0.1.24.post1
19
+ Requires-Dist: gpustack-runner>=0.1.24.post4
20
20
  Requires-Dist: grpc-interceptor>=0.15.4
21
21
  Requires-Dist: grpcio>=1.76.0
22
22
  Requires-Dist: kubernetes>=33.1.0
@@ -1,51 +1,51 @@
1
1
  gpustack_runtime/__init__.py,sha256=Xw_PVWneitx-8QmW6sJQeymj6zVbEgEndGhIB_km6TI,186
2
2
  gpustack_runtime/__main__.py,sha256=O9yJKcN7vg0Ppgc13qesxHwST2wkH3ccOkTQXPWHnNA,3939
3
- gpustack_runtime/_version.py,sha256=avOg2k2wJyqvBs2osdxAhmsDLf9lzQxB4hFsfDnq90s,792
3
+ gpustack_runtime/_version.py,sha256=Vvw3zQNp4FXvAmZn_g3ZyhEUudGslUaT_mMkqwv_Tdg,777
4
4
  gpustack_runtime/_version.pyi,sha256=A42NoSgcqEXVy2OeNm4LXC9CbyonbooYrSUBlPm2lGY,156
5
- gpustack_runtime/envs.py,sha256=Q8vK42OpkY4T72zN6pOz_eCS_hnQElhAmxZ1wdks0xQ,38794
5
+ gpustack_runtime/envs.py,sha256=gkr30NnIq3USzSYvbW6ipf8tKdbXx-BrDcY3-4Oc-Hg,38894
6
6
  gpustack_runtime/logging.py,sha256=wMPriPpOuVsuClsjMh0qwEPQKyJiJa89ggdDjqkk7i0,6934
7
7
  gpustack_runtime/cmds/__init__.py,sha256=-_X2O2lBn6KcdLGUzhL3lEjQC4_cwA36fvWDnFAgtVM,1382
8
8
  gpustack_runtime/cmds/__types__.py,sha256=TBnUWUqzTkDtJnsMv363kdw-H8fOf-XQYbOvrmQif-M,815
9
9
  gpustack_runtime/cmds/deployer.py,sha256=KvhPhU6ZW-UV6vLykI5adKI1ThgVFFJqWaII3n4OhL8,32846
10
- gpustack_runtime/cmds/detector.py,sha256=AALcoqCiNuwYucKBnyj7r5ScOWc_BSzAhHR2C0QbEHE,8750
10
+ gpustack_runtime/cmds/detector.py,sha256=2ORRF53q3goyYRB4T8gu3X8u0VZ4v0xeEndJqtuktyQ,8872
11
11
  gpustack_runtime/cmds/images.py,sha256=7tb-D3G4yqLPkbS9aSuWI1bD3DYK8BLbPbgqac56blI,594
12
12
  gpustack_runtime/deployer/__init__.py,sha256=impMrmvkMjuCBthsn3QUz3LuwpmmNAymHJKJ2o6SZoc,16249
13
13
  gpustack_runtime/deployer/__patches__.py,sha256=cTBge8BT6IsY5MzETKY3kN28k3igYfNj7pcpgDzfDzw,17849
14
- gpustack_runtime/deployer/__types__.py,sha256=PgIWogHOvHKsHoeBjmKFEEM3JrKck89Mmnwlfx01BbE,72248
14
+ gpustack_runtime/deployer/__types__.py,sha256=J2YX8X7EYY_56_L9WL5YMmdsyJ572uOIhMoHCVjPaog,72469
15
15
  gpustack_runtime/deployer/__utils__.py,sha256=paQu2M1UeoSfQPsiskmAqJSiln-8qwibTssEoWFMLec,21109
16
- gpustack_runtime/deployer/docker.py,sha256=bOaXbTnaalbO42FlyWR1Ha26Y30LGWPzWKPV5Q-Nk7g,85039
17
- gpustack_runtime/deployer/kuberentes.py,sha256=V7_lPMFaLCJz3vqFBGKomOs9EZs7nGjrSV9EJ5lLyVM,89323
18
- gpustack_runtime/deployer/podman.py,sha256=_qdbsTezacRmiXa3n04OUPUsgVy1pSFgJSKxous4s14,82156
16
+ gpustack_runtime/deployer/docker.py,sha256=e48conm3gfu8dlwcIhvTvM5NhlhdgKlvk6Ix8xGYVeI,81448
17
+ gpustack_runtime/deployer/kuberentes.py,sha256=-G7eYuqTDDi3T9u2Jqr6j0Ut-8vkP5u2lxzSyDx0EWM,86776
18
+ gpustack_runtime/deployer/podman.py,sha256=9lo4AvXzD3HUteY17-Fuz9A0ItScPb_D1tweDgm7PVo,79090
19
19
  gpustack_runtime/deployer/cdi/__init__.py,sha256=2wHrxkud3GJokE3ytNc3jvjddemXkNuuz_oIKzxD3-I,4000
20
20
  gpustack_runtime/deployer/cdi/__types__.py,sha256=04DKvcogk7OoHS7TU2Bmht3VVMu7iOEBWTEOvxpHt4w,18399
21
- gpustack_runtime/deployer/cdi/__utils__.py,sha256=mvdOqkbhaSkphl0K-VpNwtFviAkttS9UrmKEA285kRw,3908
21
+ gpustack_runtime/deployer/cdi/__utils__.py,sha256=CAYUv76akZiHJYZO_VY0NXKhEI2jrP7G3OgvQa8Pg4U,4050
22
22
  gpustack_runtime/deployer/cdi/amd.py,sha256=-eq_SOlC56VX2QscZXvnoeffWSRindhr8zFZmaIcKrE,4082
23
23
  gpustack_runtime/deployer/cdi/ascend.py,sha256=lDs75a9--c0lM34xfJqu-_QbfWNFrf4zE-GXPKReBe4,4538
24
24
  gpustack_runtime/deployer/cdi/hygon.py,sha256=h6-vQfv03sgxYjMJAf_JOMq9cHFPaNjK1YbUYIiSXck,4117
25
25
  gpustack_runtime/deployer/cdi/iluvatar.py,sha256=6nNECZpU5IPP6-5l-O1rzU-ib-WcuwKvDg7ZV__1NE4,3650
26
26
  gpustack_runtime/deployer/cdi/metax.py,sha256=tmJBvr-n9pERAp-dXsa54qv6xmxt0rJoJwY36TFdoWk,4143
27
27
  gpustack_runtime/deployer/cdi/thead.py,sha256=SvIDKNYZx7FwMPTTxyJ2RRjlr9LXLN8BUYCUhidmiQk,3671
28
- gpustack_runtime/deployer/k8s/deviceplugin/__init__.py,sha256=cCP8Swtz_LzeIrKnwNszD54fj8ApAMgffUym4Wcyc_g,10975
28
+ gpustack_runtime/deployer/k8s/deviceplugin/__init__.py,sha256=kvjsDx_8kNt3h8a5MOx5A7qPvqRsk1amvFr_ZYDA1l0,10931
29
29
  gpustack_runtime/deployer/k8s/deviceplugin/__types__.py,sha256=LCkgPDZ64Mra7bo5jmtsAO2Ypbc4qK99lMl6R_nQhnY,3043
30
- gpustack_runtime/deployer/k8s/deviceplugin/plugin.py,sha256=ipZ_V6pgJ2pzyEYUgAizZ7_W3a4noKEdTiZ9GAeuiRY,17728
30
+ gpustack_runtime/deployer/k8s/deviceplugin/plugin.py,sha256=20eUDvM_SBFCi5WDR3AfyDJpnL7CJxxcPdW4p626I_M,17671
31
31
  gpustack_runtime/deployer/k8s/types/kubelet/deviceplugin/v1beta1/__init__.py,sha256=3rOYmgDIIJ4idEtwgnumGStH7PaK-J7EYrOnLa9A-8o,118
32
32
  gpustack_runtime/deployer/k8s/types/kubelet/deviceplugin/v1beta1/api.proto,sha256=rmB8RDe4LN5FCVkQ608uS-pl32mk5tt6iGe-g2lKtPs,7919
33
33
  gpustack_runtime/deployer/k8s/types/kubelet/deviceplugin/v1beta1/api_pb2.py,sha256=dNkzjTE-2y25q8NF0QRznNJ5r1-5ZxxJS598MHbjx98,45998
34
34
  gpustack_runtime/deployer/k8s/types/kubelet/deviceplugin/v1beta1/api_pb2.pyi,sha256=lq1dbSgBYqJ7zyGfoKKHCyfr6R5vcCGzJxteeyQpbuI,8232
35
35
  gpustack_runtime/deployer/k8s/types/kubelet/deviceplugin/v1beta1/api_pb2_grpc.py,sha256=GM6EyCEFeyOjL0XOCisbcHurRoLKqKDUI5obsUyTxpE,17446
36
36
  gpustack_runtime/deployer/k8s/types/kubelet/deviceplugin/v1beta1/constants.py,sha256=tpNk3e_cvY67C9RwVsdTNl75YuNXBgsn53fSJIzeTR4,828
37
- gpustack_runtime/detector/__init__.py,sha256=9i6KOd3Qp_BmnSyPURlOBHlHJMSInqlDTh1kpAbs4_U,8104
38
- gpustack_runtime/detector/__types__.py,sha256=tiYbxPD6gV5wS79K3d2dUzy6btJl4QcsgunyxtJ240E,13162
37
+ gpustack_runtime/detector/__init__.py,sha256=2GaczS5lWLuPsighvq7oPhlPFQSUJfBKLAliXPfl4i0,8162
38
+ gpustack_runtime/detector/__types__.py,sha256=ZiKuYZb0jeXqzyTCUWTDppoHnRmkaZuljAWbnR_ohX8,13626
39
39
  gpustack_runtime/detector/__utils__.py,sha256=QdLWXwsU1_EMxXG5Y29psqnttWJyXWMphHDjiC_6Byc,25153
40
- gpustack_runtime/detector/amd.py,sha256=qh86xGhPJRIXwiKaHmeyIrsxchUDRpyggR6yc0cLuKw,17553
41
- gpustack_runtime/detector/ascend.py,sha256=E6YPoREI5r2HZIegUaQb0XwC3Qau1mnkNeCRbgtlE5k,17992
42
- gpustack_runtime/detector/cambricon.py,sha256=GzXlS4et8zape0rr19d1EwSV9cfFEmHgaElTVMjR3IY,3947
43
- gpustack_runtime/detector/hygon.py,sha256=R4I8h66YHJC00iAtDJhBX772VMKUdZ8nleRXZFPUt3Q,12299
44
- gpustack_runtime/detector/iluvatar.py,sha256=bqciqjYE_tIxBg2PSOlIzH3WcFYBgTDjfN6nT90LSGg,10206
45
- gpustack_runtime/detector/metax.py,sha256=W4NSZD7Kf5431B63UBpYnKIk2Jz1SutEpwXkwgYfmfE,10374
46
- gpustack_runtime/detector/mthreads.py,sha256=GcElUIMvU5C6P4Fx_X_kSOsJps8WZ47tkZ2B2MJZmk4,12131
47
- gpustack_runtime/detector/nvidia.py,sha256=xdvoMdNx943on5fd_mI3WI_85kMY-0dYm5NU3vqXb9M,33400
48
- gpustack_runtime/detector/thead.py,sha256=43TGPq78FulpYBUeEMVxDTY-0X3ve2FsX4Hsd0Lswy0,25561
40
+ gpustack_runtime/detector/amd.py,sha256=ywJMDKFnmF2REdJc1F8_zGYK6O4K0o5xh5eqWU-X2EE,18294
41
+ gpustack_runtime/detector/ascend.py,sha256=a6QRnJfXdyU2tyGiiUKy0fgsp6NF652Zr2fFQgVg1Xw,19470
42
+ gpustack_runtime/detector/cambricon.py,sha256=5AXILG9NAMYiWjaLRZ5h8lXtFk7FLC7LB_aFQz0ZtYU,4102
43
+ gpustack_runtime/detector/hygon.py,sha256=3AcHBlPXTFiH0JQ0VS_xZcqjX-FXy-cdle6Nc-rNj5w,12795
44
+ gpustack_runtime/detector/iluvatar.py,sha256=klFl5H607w8ksTvYSt21QkHMRzzeg-TkJKfoh9CMzqc,10551
45
+ gpustack_runtime/detector/metax.py,sha256=P24WiqK2Ngjpu6AQt0Fp1wEVNra2Xgs-C8JAAwpYews,10801
46
+ gpustack_runtime/detector/mthreads.py,sha256=mwNdsc42nebnSJMPFo6ue1tbiOwHmvPw6dF2CrLwdIQ,12714
47
+ gpustack_runtime/detector/nvidia.py,sha256=oD3HUPfYWXRIRZ87iidNTW2Tg8CTVNIJh8qW1Z3HBO4,34535
48
+ gpustack_runtime/detector/thead.py,sha256=hIRtlZNPa7xzAT0W_2XgFiDVH3YHSGi8NqCdaFaqQcA,26818
49
49
  gpustack_runtime/detector/pyacl/__init__.py,sha256=UQjaBxP7nJNyzr08N8_lH-5wPtnFmUY9pyQhs6vIChU,16232
50
50
  gpustack_runtime/detector/pyamdgpu/__init__.py,sha256=x-UO07EpKEgfTLmXQOD6j9f6kibuvDC7riQFof3YGdw,8617
51
51
  gpustack_runtime/detector/pyamdsmi/__init__.py,sha256=800-khq2w6HLgXM12RkhcdvXBGeAJ4s1_TWJyHebCMk,955
@@ -55,13 +55,13 @@ gpustack_runtime/detector/pyhgml/__init__.py,sha256=Yp9s-QhHS4ck7Iq9kd4v6a4BruyJ
55
55
  gpustack_runtime/detector/pyhgml/libhgml.so,sha256=BPzGVBpzrMX1tSvbXddq8Q0Qhi8w-No2JXX8sRxTioI,2101640
56
56
  gpustack_runtime/detector/pyhgml/libuki.so,sha256=EE6v1vIYYT4FSDMMm9rSfAqwrwIPFD-4_6KtP51lSps,702352
57
57
  gpustack_runtime/detector/pyhsa/__init__.py,sha256=4DuGnBBMUVOCPa6vTx3XT5mffGrKk6M6CYbUWBoMTJ0,15792
58
- gpustack_runtime/detector/pyixml/__init__.py,sha256=6ss_Dyl8lIT4WrKpfwmQqzBmg4Bxi38vg_eey_wsSY0,162681
58
+ gpustack_runtime/detector/pyixml/__init__.py,sha256=2YmNoYhcIvc4CbRZgORM9o-GKdQ6O05J-5L3JbMZdhA,163157
59
59
  gpustack_runtime/detector/pymxsml/__init__.py,sha256=YxfNHq7TWd7CpNroP45BGXhcWNpY_sXgVzNGtx68DII,45409
60
60
  gpustack_runtime/detector/pyrocmcore/__init__.py,sha256=rgwIdPS-7GG7_5luRMR1XG9QyNM3lJh5ryD7kfZqpWg,2523
61
- gpustack_runtime/detector/pyrocmsmi/__init__.py,sha256=ACwRtJWVIuJ4NTcBJxk0zrVb_qtDOMkApMdbJoag5g0,11906
62
- gpustack_runtime/_version_appendix.py,sha256=NCkcFhpU5_5eIeG-39gvsVgVabq_3NDFCToVnvsn6EU,23
63
- gpustack_runtime-0.1.41.post3.dist-info/METADATA,sha256=RLNYsBXAICxX3S0JO0t_V-73-MfKoJoLH8Mcq5kessc,2364
64
- gpustack_runtime-0.1.41.post3.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
65
- gpustack_runtime-0.1.41.post3.dist-info/entry_points.txt,sha256=bBO_61GxP6dIT74uZwbSDgW5Vt2pTePUS3CgjUJkUgg,68
66
- gpustack_runtime-0.1.41.post3.dist-info/licenses/LICENSE,sha256=OiPibowBvB-NHV3TP_NOj18XNBlXcshXZFMpa3uvKVE,10362
67
- gpustack_runtime-0.1.41.post3.dist-info/RECORD,,
61
+ gpustack_runtime/detector/pyrocmsmi/__init__.py,sha256=Gk4pTadOMzLCZJvQJ2S1N_1ivogtYokfVPHj_9Y874Y,12286
62
+ gpustack_runtime/_version_appendix.py,sha256=2B6zFAHFYbVzMJ1w6ZW4XpqNz2XaMa-cAueeeQ4OfJk,23
63
+ gpustack_runtime-0.1.42.dist-info/METADATA,sha256=sUS5YnNvheiK-tDT-rGSzuOKrOIFIKKhUBH1Jxc7lPE,2358
64
+ gpustack_runtime-0.1.42.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
65
+ gpustack_runtime-0.1.42.dist-info/entry_points.txt,sha256=bBO_61GxP6dIT74uZwbSDgW5Vt2pTePUS3CgjUJkUgg,68
66
+ gpustack_runtime-0.1.42.dist-info/licenses/LICENSE,sha256=OiPibowBvB-NHV3TP_NOj18XNBlXcshXZFMpa3uvKVE,10362
67
+ gpustack_runtime-0.1.42.dist-info/RECORD,,