PyPI - xpk - Versions diffs - 0.15.0__py3-none-any.whl → 0.16.0__py3-none-any.whl - Mend

xpk 0.15.0__py3-none-any.whl → 0.16.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (68) hide show

integration/README.md +19 -0
xpk/blueprints/a3mega/config-map.yaml.tftpl +15 -0
xpk/blueprints/a3mega/storage_crd.yaml +52 -0
xpk/blueprints/a3ultra/config-map.yaml.tftpl +15 -0
xpk/blueprints/a3ultra/mlgru-disable.yaml +59 -0
xpk/blueprints/a3ultra/nccl-installer.yaml +95 -0
xpk/blueprints/a3ultra/storage_crd.yaml +52 -0
xpk/blueprints/a4/config-map.yaml.tftpl +15 -0
xpk/blueprints/a4/nccl-rdma-installer-a4.yaml +66 -0
xpk/blueprints/a4/storage_crd.yaml +52 -0
xpk/commands/cluster.py +33 -12
xpk/commands/cluster_gcluster_test.py +5 -1
xpk/commands/cluster_test.py +125 -0
xpk/commands/config.py +3 -3
xpk/commands/inspector.py +5 -3
xpk/commands/kind.py +2 -0
xpk/commands/managed_ml_diagnostics.py +249 -0
xpk/commands/managed_ml_diagnostics_test.py +146 -0
xpk/commands/workload.py +124 -139
xpk/commands/workload_test.py +160 -118
xpk/core/blueprint/blueprint_generator.py +3 -0
xpk/core/blueprint/testing/data/a3_mega.yaml +129 -0
xpk/core/blueprint/testing/data/a3_mega_spot.yaml +125 -0
xpk/core/blueprint/testing/data/a3_ultra.yaml +173 -0
xpk/core/blueprint/testing/data/a4.yaml +185 -0
xpk/core/capacity.py +2 -0
xpk/core/cluster.py +18 -47
xpk/core/cluster_test.py +76 -1
xpk/core/config.py +81 -7
xpk/core/config_test.py +67 -11
xpk/core/docker_container.py +3 -1
xpk/core/docker_image.py +10 -6
xpk/core/docker_resources.py +1 -10
xpk/core/kjob.py +17 -16
xpk/core/kueue_manager.py +13 -19
xpk/core/kueue_manager_test.py +27 -1
xpk/core/nap.py +13 -14
xpk/core/nodepool.py +17 -15
xpk/core/nodepool_test.py +25 -4
xpk/core/pathways.py +23 -0
xpk/core/pathways_test.py +57 -0
xpk/core/resources.py +84 -27
xpk/core/scheduling.py +128 -132
xpk/core/scheduling_test.py +215 -2
xpk/core/system_characteristics.py +179 -0
xpk/core/system_characteristics_test.py +49 -1
xpk/core/telemetry.py +4 -4
xpk/core/telemetry_test.py +9 -9
xpk/core/vertex.py +4 -3
xpk/core/workload_decorators/tcpx_decorator.py +5 -1
xpk/main.py +2 -0
xpk/parser/cluster.py +22 -88
xpk/parser/cluster_test.py +41 -0
xpk/parser/common.py +84 -0
xpk/parser/storage.py +10 -0
xpk/parser/storage_test.py +47 -0
xpk/parser/workload.py +14 -41
xpk/parser/workload_test.py +2 -48
xpk/templates/arm_gpu_workload_crate.yaml.j2 +46 -0
xpk/utils/feature_flags.py +3 -0
xpk/utils/validation.py +2 -2
xpk-0.16.0.dist-info/METADATA +127 -0
{xpk-0.15.0.dist-info → xpk-0.16.0.dist-info}/RECORD +67 -48
xpk-0.15.0.dist-info/METADATA +0 -1666
{xpk-0.15.0.dist-info → xpk-0.16.0.dist-info}/WHEEL +0 -0
{xpk-0.15.0.dist-info → xpk-0.16.0.dist-info}/entry_points.txt +0 -0
{xpk-0.15.0.dist-info → xpk-0.16.0.dist-info}/licenses/LICENSE +0 -0
{xpk-0.15.0.dist-info → xpk-0.16.0.dist-info}/top_level.txt +0 -0

xpk/core/system_characteristics.py CHANGED Viewed

@@ -15,11 +15,29 @@ limitations under the License.
 """
 from dataclasses import dataclass
+import dataclasses
+from typing import Callable, Literal, Optional
+from ..core.workload_decorators import rdma_decorator, tcpxo_decorator, tcpx_decorator
 from ..utils.topology import get_topology_product
 from enum import Enum
 SUB_SLICING_TOPOLOGIES = ['2x4', '4x4', '4x8', '8x8', '8x16', '16x16']
+INSTALLER_NCCL_TCPX = 'https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/gpudirect-tcpx/nccl-tcpx-installer.yaml'
+INSTALLER_NCCL_TCPXO = 'https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/gpudirect-tcpxo/nccl-tcpxo-installer.yaml'
+INSTALLER_NCCL_RDMA = 'https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/gpudirect-rdma/nccl-rdma-installer.yaml'
+INSTALLER_NCCL_RDMA_A4X = 'https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/gpudirect-rdma/nccl-rdma-installer-a4x.yaml'
+class DockerPlatform(str, Enum):
+  AMD = 'linux/amd64'
+  ARM = 'linux/arm64'
+AMD_PLATFORM = DockerPlatform.AMD
+ARM_PLATFORM = DockerPlatform.ARM
 class AcceleratorType(Enum):
   TPU = 1
@@ -56,6 +74,45 @@ AcceleratorTypeToAcceleratorCharacteristics = {
 }
+@dataclass
+class GpuConfig:
+  """Contains GPU-specific configuration and requirements."""
+  requires_topology: bool
+  gpu_direct_name: Literal['fastrak', 'rdma', 'tcpx', 'tcpxo'] = 'fastrak'
+  kjob_decorator_fn: Optional[Callable[[dict], dict]] = None
+  """A function to decorate the kjob template for GPU-specific configurations.
+  Args:
+    job_manifest (dict): The kjob manifest as a dictionary.
+  Returns:
+    dict: The modified kjob manifest as a dictionary.
+  """
+  nccl_installer: Optional[str] = None
+  jobset_decorator_fn: Optional[Callable[[str, list[str]], str]] = None
+  """A function to decorate the jobset for GPU-specific configurations.
+  Args:
+    jobset_manifest_str (str): The JobSet manifest as a YAML string.
+    sub_networks (list[str], optional): A list of sub-network names, used by some decorators.
+  Returns:
+    str: The modified JobSet manifest as a YAML string.
+  """
+  def __repr__(self) -> str:
+    """Returns a string representation of the GpuConfig, omitting memory addresses for functions."""
+    parts = []
+    for f in dataclasses.fields(self):
+      value = getattr(self, f.name)
+      if f.name in ('kjob_decorator_fn', 'jobset_decorator_fn') and value:
+        parts.append(f'{f.name}=<function {value.__name__}>')
+      else:
+        parts.append(f'{f.name}={repr(value)}')
+    return f"GpuConfig({', '.join(parts)})"
 @dataclass
 class SystemCharacteristics:
   """Contains the defining characteristics of a specific accelerator system.
@@ -92,12 +149,28 @@ class SystemCharacteristics:
   accelerator_type: AcceleratorType
   device_type: str
   supports_sub_slicing: bool
+  docker_platform: DockerPlatform
   requires_workload_policy: bool = False
+  gpu_config: Optional[GpuConfig] = None
   def __post_init__(self):
     if self.accelerator_type == AcceleratorType.GPU:
       self.requires_workload_policy = True
+      if self.gpu_config is None:
+        raise ValueError(
+            f"Validation Error: System '{self.device_type}' is a GPU, "
+            "but 'gpu_config' was not provided."
+        )
+  @property
+  def gpu_requires_topology(self) -> bool:
+    """
+    Safely returns whether the GPU config requires topology,
+    defaulting to False if no GPU config exists.
+    """
+    return self.gpu_config.requires_topology if self.gpu_config else False
 def get_system_characteristics(
     args,
@@ -167,6 +240,7 @@ def get_tpu_system_characteristics_map(
     machine_type: str,
     supported_topologies: list[str],
     supports_sub_slicing: bool,
+    docker_platform: DockerPlatform,
     tpu_type_requires_workload_policy: bool = False,
     default_topologies: set[str] | None = None,
 ) -> dict[str, SystemCharacteristics]:
@@ -189,6 +263,7 @@ def get_tpu_system_characteristics_map(
         requires_workload_policy=tpu_type_requires_workload_policy
         and vms_per_slice > 1,
         supports_sub_slicing=supports_sub_slicing,
+        docker_platform=docker_platform,
     )
     system_characteristics_map[f'{prefix}-{topology}'] = system
     if (
@@ -231,6 +306,8 @@ UserFacingNameToSystemCharacteristics = {
         accelerator_type=AcceleratorType.GPU,
         device_type='l4-1',
         supports_sub_slicing=False,
+        gpu_config=GpuConfig(requires_topology=False),
+        docker_platform=AMD_PLATFORM,
     ),
     'l4-2': SystemCharacteristics(
         topology='N/A',
@@ -241,6 +318,8 @@ UserFacingNameToSystemCharacteristics = {
         accelerator_type=AcceleratorType.GPU,
         device_type='l4-2',
         supports_sub_slicing=False,
+        gpu_config=GpuConfig(requires_topology=False),
+        docker_platform=AMD_PLATFORM,
     ),
     'l4-4': SystemCharacteristics(
         topology='N/A',
@@ -251,6 +330,8 @@ UserFacingNameToSystemCharacteristics = {
         accelerator_type=AcceleratorType.GPU,
         device_type='l4-4',
         supports_sub_slicing=False,
+        gpu_config=GpuConfig(requires_topology=False),
+        docker_platform=AMD_PLATFORM,
     ),
     'l4-8': SystemCharacteristics(
         topology='N/A',
@@ -261,6 +342,8 @@ UserFacingNameToSystemCharacteristics = {
         accelerator_type=AcceleratorType.GPU,
         device_type='l4-8',
         supports_sub_slicing=False,
+        gpu_config=GpuConfig(requires_topology=False),
+        docker_platform=AMD_PLATFORM,
     ),
     # A100-40gb-$CHIPSc
     'a100-40gb-1': SystemCharacteristics(
@@ -272,6 +355,8 @@ UserFacingNameToSystemCharacteristics = {
         accelerator_type=AcceleratorType.GPU,
         device_type='a100-40gb-1',
         supports_sub_slicing=False,
+        gpu_config=GpuConfig(requires_topology=False),
+        docker_platform=AMD_PLATFORM,
     ),
     'a100-40gb-2': SystemCharacteristics(
         topology='N/A',
@@ -282,6 +367,8 @@ UserFacingNameToSystemCharacteristics = {
         accelerator_type=AcceleratorType.GPU,
         device_type='a100-40gb-2',
         supports_sub_slicing=False,
+        gpu_config=GpuConfig(requires_topology=False),
+        docker_platform=AMD_PLATFORM,
     ),
     'a100-40gb-4': SystemCharacteristics(
         topology='N/A',
@@ -292,6 +379,8 @@ UserFacingNameToSystemCharacteristics = {
         accelerator_type=AcceleratorType.GPU,
         device_type='a100-40gb-4',
         supports_sub_slicing=False,
+        gpu_config=GpuConfig(requires_topology=False),
+        docker_platform=AMD_PLATFORM,
     ),
     'a100-40gb-8': SystemCharacteristics(
         topology='N/A',
@@ -302,6 +391,8 @@ UserFacingNameToSystemCharacteristics = {
         accelerator_type=AcceleratorType.GPU,
         device_type='a100-40gb-8',
         supports_sub_slicing=False,
+        gpu_config=GpuConfig(requires_topology=False),
+        docker_platform=AMD_PLATFORM,
     ),
     'gb200-4': SystemCharacteristics(
         topology='1x72',
@@ -312,6 +403,14 @@ UserFacingNameToSystemCharacteristics = {
         accelerator_type=AcceleratorType.GPU,
         device_type='gb200-4',
         supports_sub_slicing=False,
+        gpu_config=GpuConfig(
+            requires_topology=True,
+            nccl_installer=INSTALLER_NCCL_RDMA_A4X,
+            kjob_decorator_fn=rdma_decorator.decorate_kjob_template,
+            jobset_decorator_fn=rdma_decorator.decorate_jobset,
+            gpu_direct_name='rdma',
+        ),
+        docker_platform=ARM_PLATFORM,
     ),
     'gb200-4-nolssd': SystemCharacteristics(
         topology='1x72',
@@ -322,6 +421,14 @@ UserFacingNameToSystemCharacteristics = {
         accelerator_type=AcceleratorType.GPU,
         device_type='gb200-4',
         supports_sub_slicing=False,
+        gpu_config=GpuConfig(
+            requires_topology=True,
+            nccl_installer=INSTALLER_NCCL_RDMA_A4X,
+            kjob_decorator_fn=rdma_decorator.decorate_kjob_template,
+            jobset_decorator_fn=rdma_decorator.decorate_jobset,
+            gpu_direct_name='rdma',
+        ),
+        docker_platform=ARM_PLATFORM,
     ),
     'b200-8': SystemCharacteristics(
         topology='N/A',
@@ -332,6 +439,14 @@ UserFacingNameToSystemCharacteristics = {
         accelerator_type=AcceleratorType.GPU,
         device_type='b200-8',
         supports_sub_slicing=False,
+        gpu_config=GpuConfig(
+            requires_topology=True,
+            nccl_installer=INSTALLER_NCCL_RDMA,
+            kjob_decorator_fn=rdma_decorator.decorate_kjob_template,
+            jobset_decorator_fn=rdma_decorator.decorate_jobset,
+            gpu_direct_name='rdma',
+        ),
+        docker_platform=AMD_PLATFORM,
     ),
     'h200-141gb-8': SystemCharacteristics(
         topology='N/A',
@@ -342,6 +457,14 @@ UserFacingNameToSystemCharacteristics = {
         accelerator_type=AcceleratorType.GPU,
         device_type='h200-141gb-8',
         supports_sub_slicing=False,
+        gpu_config=GpuConfig(
+            requires_topology=True,
+            nccl_installer=INSTALLER_NCCL_RDMA,
+            kjob_decorator_fn=rdma_decorator.decorate_kjob_template,
+            jobset_decorator_fn=rdma_decorator.decorate_jobset,
+            gpu_direct_name='rdma',
+        ),
+        docker_platform=AMD_PLATFORM,
     ),
     # H100-80gb-$CHIPS
     'h100-80gb-8': SystemCharacteristics(
@@ -353,6 +476,14 @@ UserFacingNameToSystemCharacteristics = {
         accelerator_type=AcceleratorType.GPU,
         device_type='h100-80gb-8',
         supports_sub_slicing=False,
+        gpu_config=GpuConfig(
+            requires_topology=True,
+            nccl_installer=INSTALLER_NCCL_TCPX,
+            kjob_decorator_fn=tcpx_decorator.decorate_kjob_template,
+            jobset_decorator_fn=tcpx_decorator.decorate_jobset,
+            gpu_direct_name='tcpx',
+        ),
+        docker_platform=AMD_PLATFORM,
     ),
     # H100-mega-80gb-$CHIPS
     'h100-mega-80gb-8': SystemCharacteristics(
@@ -364,6 +495,14 @@ UserFacingNameToSystemCharacteristics = {
         accelerator_type=AcceleratorType.GPU,
         device_type='h100-mega-80gb-8',
         supports_sub_slicing=False,
+        gpu_config=GpuConfig(
+            requires_topology=True,
+            nccl_installer=INSTALLER_NCCL_TCPXO,
+            kjob_decorator_fn=tcpxo_decorator.decorate_kjob_template,
+            jobset_decorator_fn=tcpxo_decorator.decorate_jobset,
+            gpu_direct_name='tcpxo',
+        ),
+        docker_platform=AMD_PLATFORM,
     ),
     # TPU system characteristics
     **get_tpu_system_characteristics_map(
@@ -374,6 +513,7 @@ UserFacingNameToSystemCharacteristics = {
         supported_topologies=['1x1x1'],
         tpu_type_requires_workload_policy=True,
         supports_sub_slicing=False,
+        docker_platform=AMD_PLATFORM,
     ),
     **get_tpu_system_characteristics_map(
         prefix='tpu7x',
@@ -382,6 +522,7 @@ UserFacingNameToSystemCharacteristics = {
         machine_type='tpu7x-standard-4t',
         tpu_type_requires_workload_policy=True,
         supports_sub_slicing=False,
+        docker_platform=AMD_PLATFORM,
         supported_topologies=generate_tpu_topologies(max_cubes=144),
         default_topologies=set([
             '12x12x12',
@@ -491,6 +632,7 @@ UserFacingNameToSystemCharacteristics = {
         machine_type='ct6e-standard-1t',
         supports_sub_slicing=False,
         supported_topologies=['1x1'],
+        docker_platform=AMD_PLATFORM,
     ),
     **get_tpu_system_characteristics_map(
         prefix='v6e',
@@ -501,6 +643,7 @@ UserFacingNameToSystemCharacteristics = {
         supported_topologies=[
             '2x2',
         ],
+        docker_platform=AMD_PLATFORM,
     ),
     **get_tpu_system_characteristics_map(
         prefix='v6e',
@@ -509,6 +652,7 @@ UserFacingNameToSystemCharacteristics = {
         machine_type='ct6e-standard-4t',
         supports_sub_slicing=True,
         supported_topologies=SUB_SLICING_TOPOLOGIES,
+        docker_platform=AMD_PLATFORM,
     ),
     **get_tpu_system_characteristics_map(
         prefix='v5p',
@@ -516,6 +660,7 @@ UserFacingNameToSystemCharacteristics = {
         gke_accelerator='tpu-v5p-slice',
         machine_type='ct5p-hightpu-4t',
         supports_sub_slicing=False,
+        docker_platform=AMD_PLATFORM,
         supported_topologies=generate_tpu_topologies(max_cubes=140),
         default_topologies=set([
             '2x2x1',
@@ -621,6 +766,7 @@ UserFacingNameToSystemCharacteristics = {
         tensorcores_per_chip=1,
         gke_accelerator='tpu-v5-lite-podslice',
         machine_type='ct5lp-hightpu-4t',
+        docker_platform=AMD_PLATFORM,
         supports_sub_slicing=False,
         supported_topologies=['2x4', '4x4', '4x8', '8x8', '8x16', '16x16'],
     ),
@@ -629,6 +775,7 @@ UserFacingNameToSystemCharacteristics = {
         tensorcores_per_chip=2,
         gke_accelerator='tpu-v4-podslice',
         machine_type='ct4p-hightpu-4t',
+        docker_platform=AMD_PLATFORM,
         supports_sub_slicing=False,
         supported_topologies=generate_tpu_topologies(
             max_cubes=64, enforce_nondecreasing=False
@@ -660,6 +807,7 @@ UserFacingNameToSystemCharacteristics = {
         accelerator_type=AcceleratorType.CPU,
         device_type='m1-megamem-96-1',
         supports_sub_slicing=False,
+        docker_platform=AMD_PLATFORM,
     ),
     # n2-standard-#vCPUs-#VMs
     'n2-standard-64-1': SystemCharacteristics(
@@ -671,6 +819,7 @@ UserFacingNameToSystemCharacteristics = {
         accelerator_type=AcceleratorType.CPU,
         device_type='n2-standard-64-1',
         supports_sub_slicing=False,
+        docker_platform=AMD_PLATFORM,
     ),
     'n2-standard-32-1': SystemCharacteristics(
         topology='N/A',
@@ -681,6 +830,7 @@ UserFacingNameToSystemCharacteristics = {
         accelerator_type=AcceleratorType.CPU,
         device_type='n2-standard-32-1',
         supports_sub_slicing=False,
+        docker_platform=AMD_PLATFORM,
     ),
     'n2-standard-32-2': SystemCharacteristics(
         topology='N/A',
@@ -691,6 +841,7 @@ UserFacingNameToSystemCharacteristics = {
         accelerator_type=AcceleratorType.CPU,
         device_type='n2-standard-32-2',
         supports_sub_slicing=False,
+        docker_platform=AMD_PLATFORM,
     ),
     'n2-standard-32-4': SystemCharacteristics(
         topology='N/A',
@@ -701,6 +852,7 @@ UserFacingNameToSystemCharacteristics = {
         accelerator_type=AcceleratorType.CPU,
         device_type='n2-standard-32-4',
         supports_sub_slicing=False,
+        docker_platform=AMD_PLATFORM,
     ),
     'n2-standard-32-8': SystemCharacteristics(
         topology='N/A',
@@ -711,6 +863,7 @@ UserFacingNameToSystemCharacteristics = {
         accelerator_type=AcceleratorType.CPU,
         device_type='n2-standard-32-8',
         supports_sub_slicing=False,
+        docker_platform=AMD_PLATFORM,
     ),
     'n2-standard-32-16': SystemCharacteristics(
         topology='N/A',
@@ -721,6 +874,7 @@ UserFacingNameToSystemCharacteristics = {
         accelerator_type=AcceleratorType.CPU,
         device_type='n2-standard-32-16',
         supports_sub_slicing=False,
+        docker_platform=AMD_PLATFORM,
     ),
     'n2-standard-32-32': SystemCharacteristics(
         topology='N/A',
@@ -731,6 +885,7 @@ UserFacingNameToSystemCharacteristics = {
         accelerator_type=AcceleratorType.CPU,
         device_type='n2-standard-32-32',
         supports_sub_slicing=False,
+        docker_platform=AMD_PLATFORM,
     ),
     'n2-standard-32-64': SystemCharacteristics(
         topology='N/A',
@@ -741,6 +896,7 @@ UserFacingNameToSystemCharacteristics = {
         accelerator_type=AcceleratorType.CPU,
         device_type='n2-standard-32-64',
         supports_sub_slicing=False,
+        docker_platform=AMD_PLATFORM,
     ),
     'n2-standard-32-128': SystemCharacteristics(
         topology='N/A',
@@ -751,6 +907,7 @@ UserFacingNameToSystemCharacteristics = {
         accelerator_type=AcceleratorType.CPU,
         device_type='n2-standard-32-128',
         supports_sub_slicing=False,
+        docker_platform=AMD_PLATFORM,
     ),
     'n2-standard-32-256': SystemCharacteristics(
         topology='N/A',
@@ -761,6 +918,7 @@ UserFacingNameToSystemCharacteristics = {
         accelerator_type=AcceleratorType.CPU,
         device_type='n2-standard-32-256',
         supports_sub_slicing=False,
+        docker_platform=AMD_PLATFORM,
     ),
     'n2-standard-32-512': SystemCharacteristics(
         topology='N/A',
@@ -771,6 +929,7 @@ UserFacingNameToSystemCharacteristics = {
         accelerator_type=AcceleratorType.CPU,
         device_type='n2-standard-32-512',
         supports_sub_slicing=False,
+        docker_platform=AMD_PLATFORM,
     ),
     'n2-standard-32-1024': SystemCharacteristics(
         topology='N/A',
@@ -781,6 +940,7 @@ UserFacingNameToSystemCharacteristics = {
         accelerator_type=AcceleratorType.CPU,
         device_type='n2-standard-32-1024',
         supports_sub_slicing=False,
+        docker_platform=AMD_PLATFORM,
     ),
     'n2-standard-32-2048': SystemCharacteristics(
         topology='N/A',
@@ -791,6 +951,7 @@ UserFacingNameToSystemCharacteristics = {
         accelerator_type=AcceleratorType.CPU,
         device_type='n2-standard-32-2048',
         supports_sub_slicing=False,
+        docker_platform=AMD_PLATFORM,
     ),
 }
 """ If you modify UserFacingNameToSystemCharacteristics you should also modify
@@ -808,3 +969,21 @@ def get_system_characteristics_keys_by_accelerator_type(
       for key, value in UserFacingNameToSystemCharacteristics.items()
       if value.accelerator_type in accelerators
   ]
+def create_accelerator_label(system: SystemCharacteristics) -> str:
+  if system.accelerator_type == AcceleratorType.CPU:
+    return ''
+  return (
+      f'{AcceleratorTypeToAcceleratorCharacteristics[system.accelerator_type].accelerator_label}:'
+      f' {system.gke_accelerator}'
+  )
+def create_machine_label(system: SystemCharacteristics) -> str:
+  if system.accelerator_type == AcceleratorType.TPU:
+    return (
+        f'{AcceleratorTypeToAcceleratorCharacteristics[system.accelerator_type].machine_label}:'
+        f' {system.topology}'
+    )
+  return ''

xpk/core/system_characteristics_test.py CHANGED Viewed

@@ -14,7 +14,15 @@ See the License for the specific language governing permissions and
 limitations under the License.
 """
-from .system_characteristics import get_tpu_system_characteristics_map, generate_tpu_topologies, SystemCharacteristics, AcceleratorType
+import pytest
+from .system_characteristics import (
+    get_tpu_system_characteristics_map,
+    generate_tpu_topologies,
+    DockerPlatform,
+    SystemCharacteristics,
+    AcceleratorType,
+    GpuConfig,
+)
 def test_get_tpu_system_characteristics_map_returns_correct_values_for_1x1_topology():
@@ -25,6 +33,7 @@ def test_get_tpu_system_characteristics_map_returns_correct_values_for_1x1_topol
       machine_type="test",
       supported_topologies=["1x1"],
       supports_sub_slicing=False,
+      docker_platform=DockerPlatform.AMD,
       tpu_type_requires_workload_policy=False,
   )
@@ -37,6 +46,7 @@ def test_get_tpu_system_characteristics_map_returns_correct_values_for_1x1_topol
       accelerator_type=AcceleratorType.TPU,
       device_type="test-1",
       supports_sub_slicing=False,
+      docker_platform=DockerPlatform.AMD,
       requires_workload_policy=False,
   )
   assert result == {
@@ -53,6 +63,7 @@ def test_get_tpu_system_characteristics_map_returns_correct_values_for_2x2_topol
       machine_type="test",
       supported_topologies=["2x2"],
       supports_sub_slicing=False,
+      docker_platform=DockerPlatform.AMD,
       tpu_type_requires_workload_policy=True,
   )
@@ -65,6 +76,7 @@ def test_get_tpu_system_characteristics_map_returns_correct_values_for_2x2_topol
       accelerator_type=AcceleratorType.TPU,
       device_type="test-8",
       supports_sub_slicing=False,
+      docker_platform=DockerPlatform.AMD,
       requires_workload_policy=False,
   )
   assert result == {
@@ -81,6 +93,7 @@ def test_get_tpu_system_characteristics_map_returns_correct_values_for_2x2x2_top
       machine_type="test",
       supported_topologies=["2x2x2"],
       supports_sub_slicing=False,
+      docker_platform=DockerPlatform.AMD,
       tpu_type_requires_workload_policy=True,
   )
@@ -93,6 +106,7 @@ def test_get_tpu_system_characteristics_map_returns_correct_values_for_2x2x2_top
       accelerator_type=AcceleratorType.TPU,
       device_type="test-16",
       supports_sub_slicing=False,
+      docker_platform=DockerPlatform.AMD,
       requires_workload_policy=True,
   )
   assert result == {
@@ -109,6 +123,7 @@ def test_get_tpu_system_characteristics_map_prefers_default_topologies():
       machine_type="test",
       supported_topologies=["4x4x4", "4x4x32", "4x8x16", "8x8x8"],
       supports_sub_slicing=False,
+      docker_platform=DockerPlatform.AMD,
       default_topologies=set(["4x8x16"]),
   )
@@ -146,3 +161,36 @@ def test_generate_tpu_topologies_contains_sub_cube_slices():
   one_cube = generate_tpu_topologies(max_cubes=1)
   assert one_cube == ["2x2x1", "2x2x2", "2x2x4", "2x4x4", "4x4x4"]
+def test_system_characteristics_post_init_sets_workload_policy_for_gpu():
+  """Tests that __post_init__ correctly sets requires_workload_policy for GPUs."""
+  gpu_system = SystemCharacteristics(
+      topology="N/A",
+      vms_per_slice=1,
+      gke_accelerator="nvidia-l4",
+      gce_machine_type="g2-standard-12",
+      chips_per_vm=1,
+      accelerator_type=AcceleratorType.GPU,
+      device_type="l4-1",
+      supports_sub_slicing=False,
+      docker_platform=DockerPlatform.AMD,
+      gpu_config=GpuConfig(requires_topology=False),
+  )
+  assert gpu_system.requires_workload_policy is True
+def test_system_characteristics_post_init_throws_for_gpu_without_config():
+  """Tests that __post_init__ raises ValueError for GPU without gpu_config."""
+  with pytest.raises(ValueError, match="'gpu_config' was not provided"):
+    SystemCharacteristics(
+        topology="N/A",
+        vms_per_slice=1,
+        gke_accelerator="nvidia-l4",
+        gce_machine_type="g2-standard-12",
+        chips_per_vm=1,
+        accelerator_type=AcceleratorType.GPU,
+        device_type="l4-1",
+        supports_sub_slicing=False,
+        docker_platform=DockerPlatform.AMD,
+    )

xpk/core/telemetry.py CHANGED Viewed

@@ -27,7 +27,7 @@ import requests
 from enum import Enum
 from typing import Any
 from dataclasses import dataclass
-from .config import xpk_config, CLIENT_ID_KEY, SEND_TELEMETRY_KEY, __version__ as xpk_version
+from .config import get_config, CLIENT_ID_KEY, SEND_TELEMETRY_KEY, __version__ as xpk_version
 from ..utils.execution_context import is_dry_run
 from ..utils.user_agent import get_user_agent
 from ..utils.feature_flags import FeatureFlags
@@ -36,7 +36,7 @@ from ..utils.feature_flags import FeatureFlags
 def should_send_telemetry():
   return (
       FeatureFlags.TELEMETRY_ENABLED
-      and xpk_config.get(SEND_TELEMETRY_KEY) != "false"
+      and get_config().get(SEND_TELEMETRY_KEY) != "false"
   )
@@ -254,10 +254,10 @@ def _get_session_id() -> str:
 def _ensure_client_id() -> str:
   """Generates Client ID and stores in configuration if not already present."""
-  current_client_id = xpk_config.get(CLIENT_ID_KEY)
+  current_client_id = get_config().get(CLIENT_ID_KEY)
   if current_client_id is not None:
     return current_client_id
   new_client_id = str(uuid.uuid4())
-  xpk_config.set(CLIENT_ID_KEY, new_client_id)
+  get_config().set(CLIENT_ID_KEY, new_client_id)
   return new_client_id

xpk/core/telemetry_test.py CHANGED Viewed

@@ -16,7 +16,7 @@ limitations under the License.
 import pytest
 import json
-from .config import xpk_config, CLIENT_ID_KEY, SEND_TELEMETRY_KEY
+from .config import get_config, CLIENT_ID_KEY, SEND_TELEMETRY_KEY
 from .telemetry import MetricsCollector, MetricsEventMetadataKey, should_send_telemetry
 from ..utils.execution_context import set_dry_run
 from ..utils.feature_flags import FeatureFlags
@@ -31,9 +31,9 @@ def setup_mocks(mocker: MockerFixture):
   mocker.patch('os.path.basename', return_value='xpk.py')
   mocker.patch('os.path.abspath', return_value='/home/xpk_user')
   set_dry_run(False)
-  xpk_config.set(CLIENT_ID_KEY, 'client_id')
+  get_config().set(CLIENT_ID_KEY, 'client_id')
   yield
-  xpk_config.set(CLIENT_ID_KEY, None)
+  get_config().set(CLIENT_ID_KEY, None)
 @pytest.mark.parametrize(
@@ -48,13 +48,13 @@ def setup_mocks(mocker: MockerFixture):
 def test_should_send_telemetry_returns_correct_value(
     feature_flag: bool, config_value: str, expected: bool
 ):
-  xpk_config.set(SEND_TELEMETRY_KEY, config_value)
+  get_config().set(SEND_TELEMETRY_KEY, config_value)
   FeatureFlags.TELEMETRY_ENABLED = feature_flag
   assert should_send_telemetry() is expected
 def test_metrics_collector_generates_client_id_if_not_present():
-  xpk_config.set(CLIENT_ID_KEY, None)
+  get_config().set(CLIENT_ID_KEY, None)
   MetricsCollector.log_start(command='test')
   payload = json.loads(MetricsCollector.flush())
   extension_json = json.loads(payload['log_event'][0]['source_extension_json'])
@@ -79,12 +79,12 @@ def test_metrics_collector_logs_start_event_correctly():
       ],
       'event_name': 'start',
       'event_type': 'commands',
-      'release_version': 'v0.15.0',
+      'release_version': 'v0.0.0',
   }
 def test_metrics_collector_generates_client_id_when_not_present():
-  xpk_config.set(CLIENT_ID_KEY, None)
+  get_config().set(CLIENT_ID_KEY, None)
   MetricsCollector.log_start(command='test')
   payload = json.loads(MetricsCollector.flush())
   extension_json = json.loads(payload['log_event'][0]['source_extension_json'])
@@ -109,7 +109,7 @@ def test_metrics_collector_logs_complete_event_correctly():
       ],
       'event_name': 'complete',
       'event_type': 'commands',
-      'release_version': 'v0.15.0',
+      'release_version': 'v0.0.0',
   }
@@ -132,7 +132,7 @@ def test_metrics_collector_logs_custom_event_correctly():
       ],
       'event_name': 'test',
       'event_type': 'custom',
-      'release_version': 'v0.15.0',
+      'release_version': 'v0.0.0',
   }

xpk/core/vertex.py CHANGED Viewed

@@ -15,7 +15,7 @@ limitations under the License.
 """
 from ..utils.console import xpk_print
-from .resources import CLUSTER_METADATA_CONFIGMAP, get_cluster_configmap
+from .resources import ConfigMapType, get_cluster_configmap
 DEFAULT_VERTEX_TENSORBOARD_NAME = 'tb-instance'
@@ -65,8 +65,9 @@ def create_vertex_experiment(args) -> dict | None:
       tensorboard,
   )
-  metadata_configmap_name = f'{args.cluster}-{CLUSTER_METADATA_CONFIGMAP}'
-  cluster_config_map = get_cluster_configmap(metadata_configmap_name)
+  cluster_config_map = get_cluster_configmap(
+      args.cluster, ConfigMapType.METADATA
+  )
   if cluster_config_map is None or 'tensorboard_name' not in cluster_config_map:
     xpk_print(

xpk 0.15.0__py3-none-any.whl → 0.16.0__py3-none-any.whl

xpk 0.15.0__py3-none-any.whl → 0.16.0__py3-none-any.whl