xpk 1.0.0__py3-none-any.whl → 1.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xpk/commands/cluster.py +29 -30
- xpk/commands/cluster_gcluster.py +19 -14
- xpk/commands/cluster_test.py +1 -21
- xpk/commands/common.py +39 -6
- xpk/commands/common_test.py +170 -0
- xpk/commands/info.py +9 -5
- xpk/commands/inspector.py +33 -4
- xpk/commands/inspector_test.py +142 -0
- xpk/commands/workload.py +35 -17
- xpk/commands/workload_test.py +70 -3
- xpk/core/blueprint/blueprint_generator.py +19 -8
- xpk/core/blueprint/testing/data/a3_ultra.yaml +3 -1
- xpk/core/blueprint/testing/data/a4.yaml +3 -1
- xpk/core/capacity.py +37 -17
- xpk/core/capacity_test.py +66 -1
- xpk/core/cluster.py +10 -10
- xpk/core/cluster_private.py +3 -3
- xpk/core/cluster_test.py +29 -2
- xpk/core/docker_container.py +55 -30
- xpk/core/docker_manager.py +4 -4
- xpk/core/docker_resources.py +4 -1
- xpk/core/kueue_manager.py +6 -8
- xpk/core/kueue_manager_test.py +4 -5
- xpk/core/nap.py +14 -3
- xpk/core/nodepool.py +46 -13
- xpk/core/nodepool_test.py +143 -8
- xpk/core/pathways.py +4 -8
- xpk/core/remote_state/fuse_remote_state.py +1 -1
- xpk/core/scheduling.py +16 -13
- xpk/core/scheduling_test.py +15 -7
- xpk/core/system_characteristics.py +6 -0
- xpk/core/telemetry.py +11 -1
- xpk/core/telemetry_test.py +39 -0
- xpk/core/testing/commands_tester.py +26 -0
- xpk/core/testing/commands_tester_test.py +20 -1
- xpk/core/workload_decorators/rdma_decorator.py +9 -0
- xpk/parser/cluster.py +11 -1
- xpk/parser/cluster_test.py +59 -1
- xpk/parser/common.py +11 -0
- xpk/parser/storage.py +3 -3
- xpk/utils/console.py +1 -1
- xpk/utils/feature_flags.py +7 -3
- {xpk-1.0.0.dist-info → xpk-1.1.1.dist-info}/METADATA +37 -21
- {xpk-1.0.0.dist-info → xpk-1.1.1.dist-info}/RECORD +48 -55
- xpk-1.1.1.dist-info/top_level.txt +1 -0
- integration/README.md +0 -19
- integration/__init__.py +0 -15
- integration/docker_manager_test.py +0 -102
- integration/gcluster_a3mega_test.py +0 -215
- integration/gcluster_a3ultra_test.py +0 -187
- integration/gcluster_a4_test.py +0 -187
- integration/gcluster_test.py +0 -107
- xpk/utils/user_input.py +0 -48
- xpk/utils/user_input_test.py +0 -92
- xpk-1.0.0.dist-info/top_level.txt +0 -2
- {xpk-1.0.0.dist-info → xpk-1.1.1.dist-info}/WHEEL +0 -0
- {xpk-1.0.0.dist-info → xpk-1.1.1.dist-info}/entry_points.txt +0 -0
- {xpk-1.0.0.dist-info → xpk-1.1.1.dist-info}/licenses/LICENSE +0 -0
xpk/core/nodepool_test.py
CHANGED

@@ -20,6 +20,7 @@ from xpk.core.nodepool import (
     ensure_resource_policy_exists,
     get_desired_node_pool_names,
     run_gke_node_pool_create_command,
+    _validate_reservation_count,
 )
 from xpk.core.system_characteristics import AcceleratorType, SystemCharacteristics, DockerPlatform, GpuConfig
 from xpk.core.commands import FailedCommand
@@ -103,6 +104,7 @@ def commands_tester(mocker):
   return CommandsTester(
       mocker,
       run_command_for_value_path="xpk.core.nodepool.run_command_for_value",
+      run_command_batch_path="xpk.core.commands.run_command_batch",
   )


@@ -119,7 +121,7 @@ def test_ensure_resource_policy_exists_with_existing_policy_retrieves_existing_p

   assert len(commands_tester.commands_history) == 1
   commands_tester.assert_command_run(
-      "gcloud compute resource-policies describe resource-policy",
+      "gcloud beta compute resource-policies describe resource-policy",
       "--project=test-project",
       "--region=us-central1",
   )
@@ -129,7 +131,7 @@ def test_ensure_resource_policy_exists_without_existing_policy_creates_policy(
     commands_tester: CommandsTester,
 ):
   commands_tester.set_result_for_command(
-      (1, ""), "gcloud compute resource-policies describe"
+      (1, ""), "gcloud beta compute resource-policies describe"
   )

   ensure_resource_policy_exists(
@@ -142,16 +144,17 @@ def test_ensure_resource_policy_exists_without_existing_policy_creates_policy(

   assert len(commands_tester.commands_history) == 2
   commands_tester.assert_command_run(
-      "gcloud compute resource-policies describe"
+      "gcloud beta compute resource-policies describe"
   )
   commands_tester.assert_command_run(
-      "gcloud compute resource-policies create workload-policy resource-policy",
+      "gcloud beta compute resource-policies create workload-policy"
+      " resource-policy",
       "--project=test-project",
       "--region=us-central1",
       "--accelerator-topology=2x2x1",
   )
   commands_tester.assert_command_not_run(
-      "gcloud compute resource-policies create workload-policy",
+      "gcloud beta compute resource-policies create workload-policy",
       "--accelerator-topology-mode",
   )

@@ -160,7 +163,7 @@ def test_ensure_resource_policy_exists_without_existing_policy_creates_policy_fo
     commands_tester: CommandsTester,
 ):
   commands_tester.set_result_for_command(
-      (1, ""), "gcloud compute resource-policies describe"
+      (1, ""), "gcloud beta compute resource-policies describe"
   )

   ensure_resource_policy_exists(
@@ -172,7 +175,7 @@ def test_ensure_resource_policy_exists_without_existing_policy_creates_policy_fo
   )

   commands_tester.assert_command_run(
-      "gcloud compute resource-policies create workload-policy",
+      "gcloud beta compute resource-policies create workload-policy",
       "--accelerator-topology-mode",
   )

@@ -182,7 +185,7 @@ def test_ensure_resource_policy_exits_without_existing_policy_throws_when_creati
 ):
   with pytest.raises(RuntimeError):
     commands_tester.set_result_for_command(
-        (1, ""), "gcloud compute resource-policies"
+        (1, ""), "gcloud beta compute resource-policies"
     )

     ensure_resource_policy_exists(
@@ -433,3 +436,135 @@ def test_display_nodepool_creation_ignores_logs_without_errors(
       mock_xpk_print.call_args_list[0].args[0]
       == "Create Nodepools returned ERROR 1"
   )
+
+
+def test_validate_reservation_count_mismatch(mock_xpk_print):
+  result = _validate_reservation_count(
+      ["res1", "res2"], num_node_pools_to_create=3
+  )
+
+  assert result == 1
+  assert mock_xpk_print.call_count == 1
+  assert (
+      "reservations (2) must match the number of NEW nodepools (3)"
+      in mock_xpk_print.call_args_list[0].args[0]
+  )
+
+
+def test_run_gke_node_pool_create_command_multiple_reservations(
+    mocker,
+    commands_tester: CommandsTester,
+):
+  mocker.patch(
+      "xpk.core.nodepool.get_cluster_location", return_value="us-central1"
+  )
+  mocker.patch("xpk.core.capacity.verify_reservations_exist", return_value=0)
+  args = mocker.Mock(
+      num_slices=2,
+      reservation="res1,res2",
+      tpu_type="v4-8",
+      device_type=None,
+      cluster="test-cluster",
+      project="test-project",
+      zone="us-central1-a",
+      on_demand=False,
+      spot=False,
+      flex=False,
+      enable_workload_identity=False,
+      enable_gcsfuse_csi_driver=False,
+      host_maintenance_interval="AS_NEEDED",
+      custom_nodepool_arguments="",
+  )
+  system = SystemCharacteristics(
+      topology="2x2x1",
+      vms_per_slice=2,
+      gke_accelerator="tpu-v4",
+      gce_machine_type="ct4p-hightpu-4t",
+      chips_per_vm=4,
+      accelerator_type=AcceleratorType.TPU,
+      device_type="v4-8",
+      requires_workload_policy=False,
+      supports_sub_slicing=False,
+      supports_super_slicing=False,
+      supports_accelerator_network_profile=True,
+      docker_platform=DockerPlatform.AMD,
+  )
+  commands_tester.set_result_for_command(
+      (0, ""), "gcloud beta container node-pools list"
+  )
+
+  result = run_gke_node_pool_create_command(args, system, "1.2.3")
+
+  assert result == 0
+  commands_tester.assert_command_run(
+      "gcloud", "node-pools create", "--tpu-topology=2x2x1", times=2
+  )
+  commands_tester.assert_command_run(
+      "gcloud", "node-pools create", "test-cluster-np-0", "--reservation=res1"
+  )
+  commands_tester.assert_command_run(
+      "gcloud", "node-pools create", "test-cluster-np-1", "--reservation=res2"
+  )
+
+
+def test_run_gke_node_pool_create_command_partial_reservations(
+    mocker,
+    commands_tester: CommandsTester,
+):
+  mocker.patch(
+      "xpk.core.nodepool.get_cluster_location", return_value="us-central1"
+  )
+  mocker.patch("xpk.core.nodepool.get_node_pools_to_delete", return_value=[])
+  mocker.patch("xpk.core.capacity.verify_reservations_exist", return_value=0)
+  args = mocker.Mock(
+      num_slices=3,
+      reservation="res1,res2",
+      tpu_type="v4-8",
+      device_type=None,
+      cluster="test-cluster",
+      project="test-project",
+      zone="us-central1-a",
+      on_demand=False,
+      spot=False,
+      flex=False,
+      enable_workload_identity=False,
+      enable_gcsfuse_csi_driver=False,
+      host_maintenance_interval="AS_NEEDED",
+      custom_nodepool_arguments="",
+  )
+  system = SystemCharacteristics(
+      topology="2x2x1",
+      vms_per_slice=2,
+      gke_accelerator="tpu-v4",
+      gce_machine_type="ct4p-hightpu-4t",
+      chips_per_vm=4,
+      accelerator_type=AcceleratorType.TPU,
+      device_type="v4-8",
+      requires_workload_policy=False,
+      supports_sub_slicing=False,
+      supports_super_slicing=False,
+      supports_accelerator_network_profile=True,
+      docker_platform=DockerPlatform.AMD,
+  )
+  commands_tester.set_result_for_command(
+      (0, "test-cluster-np-0"), "gcloud beta container node-pools list"
+  )
+  commands_tester.set_result_for_command(
+      (0, "us-central1-a"),
+      "gcloud",
+      "node-pools describe",
+      '--format="value(locations)"',
+  )
+
+  result = run_gke_node_pool_create_command(args, system, "1.2.3")
+
+  assert result == 0
+  commands_tester.assert_command_run(
+      "gcloud", "node-pools create", "--tpu-topology=2x2x1", times=2
+  )
+  commands_tester.assert_command_run(
+      "gcloud", "node-pools create", "test-cluster-np-1", "--reservation=res1"
+  )
+  commands_tester.assert_command_run(
+      "gcloud", "node-pools create", "test-cluster-np-2", "--reservation=res2"
+  )
xpk/core/pathways.py
CHANGED

@@ -245,18 +245,12 @@ def append_custom_colocated_python_sidecar(args) -> str:


 def get_user_workload_for_pathways(
-    args,
-    system: SystemCharacteristics,
+    args, system: SystemCharacteristics, parallel_containers: int
 ) -> str:
   """
   Create a user workload container for Pathways.
   Don't create one for Pathways headless mode.

-  Args:
-    args: user provided args.
-    system: system characteristics.
-
-
   Returns:
     str:
       Pathways server port as a YAML string
@@ -280,7 +274,9 @@ def get_user_workload_for_pathways(
   if args.headless:
     return ''
   else:
-    container, _ = get_user_workload_container(args, system)
+    container, _ = get_user_workload_container(
+        args, system, parallel_containers
+    )
     return user_workload_yaml.format(
         args=args,
         container=container,

xpk/core/remote_state/fuse_remote_state.py
CHANGED

@@ -56,7 +56,7 @@ class FuseStateClient(RemoteStateClient):

   def upload_state(self) -> None:
     xpk_print(
-        f'Uploading
+        f'Uploading dependencies from directory {self.state_dir} to bucket:'
         f' {self.bucket}. Path within bucket is: {self._get_bucket_path()}'
     )
     upload_directory_to_gcs(
xpk/core/scheduling.py
CHANGED

@@ -33,8 +33,11 @@ from .system_characteristics import (
 from packaging.version import Version

 _SUB_SLICING_MINIMUM_KUEUE_VERSION = Version('0.13.0')
-_SUPER_SLICING_MINIMUM_KUEUE_VERSION = Version('0.
+_SUPER_SLICING_MINIMUM_KUEUE_VERSION = Version('0.15.2')
 _SUPER_SLICING_MAX_TOPOLOGY = (16, 24, 24)
+ONE_TO_ONE_REPLICA_NODE_POOL_ASSIGNMENT_ANNOTATION = (
+    'alpha.jobset.sigs.k8s.io/exclusive-topology: cloud.google.com/gke-nodepool'
+)


 class WorkloadScheduling(Enum):
@@ -85,6 +88,18 @@ def check_if_workload_can_schedule(
       return WorkloadScheduling.UNAVAILABLE
     return WorkloadScheduling.AVAILABLE

+  if cluster_system and _check_super_slicing_availability(
+      workload_system=workload_system, cluster_system=cluster_system
+  ):
+    if _check_workload_size_fits(
+        args,
+        workload_system,
+        max_vm_in_cluster=int(resources_config_map[cluster_system.device_type]),
+    ) and _check_super_slicing_topology(workload_system):
+      return WorkloadScheduling.SUPER_SLICING_AVAILABLE
+    else:
+      return WorkloadScheduling.UNAVAILABLE
+
   if workload_system.device_type in resources_config_map:
     if _check_workload_size_fits(
         args,
@@ -109,18 +124,6 @@ def check_if_workload_can_schedule(
     else:
       return WorkloadScheduling.UNAVAILABLE

-  if cluster_system and _check_super_slicing_availability(
-      workload_system=workload_system, cluster_system=cluster_system
-  ):
-    if _check_workload_size_fits(
-        args,
-        workload_system,
-        max_vm_in_cluster=int(resources_config_map[cluster_system.device_type]),
-    ) and _check_super_slicing_topology(workload_system):
-      return WorkloadScheduling.SUPER_SLICING_AVAILABLE
-    else:
-      return WorkloadScheduling.UNAVAILABLE
-
   xpk_print(
       'Workload scheduling validation failed. XPK will not create the workload'
       f' {args.workload}.'
xpk/core/scheduling_test.py
CHANGED

@@ -208,7 +208,7 @@ SUPER_SLICING_CASE = SchedulingTestCase(
     cluster_system=_get_system_characteristics_or_die('tpu7x-4x4x4'),
     # 5 4x4x4 cubes:
     resources_config_map={'tpu7x-128': str(64 // 4 * 5)},
-    kueue_version='0.
+    kueue_version='0.15.2',
     super_slicing_feature_enabled=True,
     super_slicing_topology_set=True,
     num_slices=1,
@@ -398,15 +398,23 @@ SUPER_SLICING_CASE = SchedulingTestCase(
         WorkloadScheduling.UNAVAILABLE,
     ),
     (
-
-
-
+        'Super-slicing, but one cube',
+        dataclasses.replace(
+            SUPER_SLICING_CASE,
+            workload_system=_get_system_characteristics_or_die('tpu7x-128'),
+            cluster_system=_get_system_characteristics_or_die('tpu7x-128'),
+            resources_config_map={'tpu7x-128': '16'},
        ),
+        WorkloadScheduling.SUPER_SLICING_AVAILABLE,
+    ),
+    (
+        'Super-slicing, but one cube and no super-slicing-topology',
        dataclasses.replace(
            SUPER_SLICING_CASE,
-            workload_system=_get_system_characteristics_or_die('tpu7x-
-            cluster_system=_get_system_characteristics_or_die('tpu7x-
-            resources_config_map={'tpu7x-
+            workload_system=_get_system_characteristics_or_die('tpu7x-128'),
+            cluster_system=_get_system_characteristics_or_die('tpu7x-128'),
+            resources_config_map={'tpu7x-128': '16'},
+            super_slicing_topology_set=False,
        ),
        WorkloadScheduling.AVAILABLE,
    ),

xpk/core/system_characteristics.py
CHANGED

@@ -131,6 +131,8 @@ class SystemCharacteristics:
     supports_super_slicing: Whether the Super-slicing feature is supported.
     requires_workload_policy: A boolean indicating if a GCE resource
       workload policy is required. This is automatically set to True for GPUs.
+    parallel_containers: The number of containers running on a single VM.
+
   """

   topology: str
@@ -146,6 +148,7 @@ class SystemCharacteristics:
   docker_platform: DockerPlatform
   requires_workload_policy: bool = False
   gpu_config: Optional[GpuConfig] = None
+  parallel_containers: int = 1

   def __post_init__(self):
     if self.accelerator_type == AcceleratorType.GPU:
@@ -239,6 +242,7 @@ def get_tpu_system_characteristics_map(
     default_topologies: set[str] | None = None,
     sub_slicing_topologies: set[str] | None = None,
     super_slicing_topologies: set[str] | None = None,
+    parallel_containers: int = 1,
 ) -> dict[str, SystemCharacteristics]:
   system_characteristics_map = {}
   default_topologies = default_topologies or set()
@@ -263,6 +267,7 @@ def get_tpu_system_characteristics_map(
         supports_super_slicing=topology in super_slicing_topologies,
         supports_accelerator_network_profile=supports_accelerator_network_profile,
         docker_platform=docker_platform,
+        parallel_containers=parallel_containers,
     )
     system_characteristics_map[f'{prefix}-{topology}'] = system
     if (
@@ -544,6 +549,7 @@ UserFacingNameToSystemCharacteristics = {
     tpu_type_requires_workload_policy=True,
     supports_accelerator_network_profile=False,
     docker_platform=AMD_PLATFORM,
+    parallel_containers=2,
     supported_topologies=generate_tpu_topologies(max_cubes=144),
     super_slicing_topologies=set(['4x4x4']),
     default_topologies=set([
xpk/core/telemetry.py
CHANGED

@@ -30,7 +30,7 @@ from dataclasses import dataclass
 from .config import get_config, CLIENT_ID_KEY, SEND_TELEMETRY_KEY, __version__ as xpk_version
 from ..utils.execution_context import is_dry_run
 from ..utils.user_agent import get_user_agent
-from ..utils.feature_flags import FeatureFlags
+from ..utils.feature_flags import FeatureFlags, is_tester


 def should_send_telemetry():
@@ -114,6 +114,8 @@ def _clearcut_flush(file_path: str) -> None:


 class MetricsEventMetadataKey(Enum):
+  """Represents available metadata keys."""
+
   SESSION_ID = "XPK_SESSION_ID"
   DRY_RUN = "XPK_DRY_RUN"
   PYTHON_VERSION = "XPK_PYTHON_VERSION"
@@ -125,6 +127,7 @@ class MetricsEventMetadataKey(Enum):
   RUNNING_AS_PIP = "XPK_RUNNING_AS_PIP"
   RUNNING_FROM_SOURCE = "XPK_RUNNING_FROM_SOURCE"
   LATENCY_SECONDS = "XPK_LATENCY_SECONDS"
+  TESTER = "XPK_TESTER"


 @dataclass
@@ -230,6 +233,9 @@ def _get_base_event_metadata() -> dict[MetricsEventMetadataKey, str]:
       MetricsEventMetadataKey.RUNNING_FROM_SOURCE: str(
          _is_running_from_source()
       ).lower(),
+      MetricsEventMetadataKey.TESTER: str(
+          is_tester() or _is_trash_execution()
+      ).lower(),
   }


@@ -241,6 +247,10 @@ def _get_base_concord_event() -> dict[str, str]:
   }


+def _is_trash_execution() -> bool:
+  return os.getenv("TELEMETRY_TRASH_EXECUTION") == "true"
+
+
 def _is_running_as_pip() -> bool:
   return os.path.basename(sys.argv[0]) == "xpk"
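The new `XPK_TESTER` metadata value is true when either `is_tester()` (from `xpk.utils.feature_flags`) reports a tester or the `TELEMETRY_TRASH_EXECUTION` environment variable holds the literal string "true". A self-contained sketch of that gate (the `is_tester` stub here is a stand-in for the real feature flag):

```python
import os


def is_tester() -> bool:
  """Stand-in for xpk.utils.feature_flags.is_tester."""
  return False


def _is_trash_execution() -> bool:
  # Only the exact string "true" counts; unset, "" or "1" all stay false.
  return os.getenv("TELEMETRY_TRASH_EXECUTION") == "true"


# Mirrors the value stored under MetricsEventMetadataKey.TESTER.
tester_value = str(is_tester() or _is_trash_execution()).lower()
assert tester_value in ("true", "false")
```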
xpk/core/telemetry_test.py
CHANGED

@@ -30,7 +30,9 @@ def setup_mocks(mocker: MockerFixture):
   mocker.patch('time.time', side_effect=itertools.count())
   mocker.patch('platform.python_version', return_value='99.99.99')
   mocker.patch('os.path.basename', return_value='xpk.py')
+  mocker.patch('os.getenv', return_value='false')
   mocker.patch('os.path.abspath', return_value='/home/xpk_user')
+  mocker.patch('xpk.core.telemetry.is_tester', return_value=False)
   set_dry_run(False)
   get_config().set(CLIENT_ID_KEY, 'client_id')
   yield
@@ -76,6 +78,7 @@ def test_metrics_collector_logs_start_event_correctly():
       {'key': 'XPK_PYTHON_VERSION', 'value': '99.99.99'},
       {'key': 'XPK_RUNNING_AS_PIP', 'value': 'false'},
       {'key': 'XPK_RUNNING_FROM_SOURCE', 'value': 'true'},
+      {'key': 'XPK_TESTER', 'value': 'false'},
       {'key': 'XPK_COMMAND', 'value': 'test'},
       {'key': 'XPK_LATENCY_SECONDS', 'value': '0'},
   ],
@@ -107,6 +110,7 @@ def test_metrics_collector_logs_complete_event_correctly():
       {'key': 'XPK_PYTHON_VERSION', 'value': '99.99.99'},
       {'key': 'XPK_RUNNING_AS_PIP', 'value': 'false'},
       {'key': 'XPK_RUNNING_FROM_SOURCE', 'value': 'true'},
+      {'key': 'XPK_TESTER', 'value': 'false'},
       {'key': 'XPK_EXIT_CODE', 'value': '2'},
       {'key': 'XPK_LATENCY_SECONDS', 'value': '0'},
   ],
@@ -131,6 +135,7 @@ def test_metrics_collector_logs_custom_event_correctly():
       {'key': 'XPK_PYTHON_VERSION', 'value': '99.99.99'},
       {'key': 'XPK_RUNNING_AS_PIP', 'value': 'false'},
       {'key': 'XPK_RUNNING_FROM_SOURCE', 'value': 'true'},
+      {'key': 'XPK_TESTER', 'value': 'false'},
       {'key': 'XPK_PROVISIONING_MODE', 'value': 'flex'},
       {'key': 'XPK_LATENCY_SECONDS', 'value': '0'},
   ],
@@ -219,6 +224,40 @@ def test_metrics_collectors_logs_correct_running_from_source_value(
   assert _get_metadata_value(payload, 'XPK_RUNNING_FROM_SOURCE') == expected


+@pytest.mark.parametrize(
+    argnames='tester,expected',
+    argvalues=[
+        (True, 'true'),
+        (False, 'false'),
+    ],
+)
+def test_metrics_collectors_logs_correct_tester_value_for_is_tester_variable(
+    tester: bool, expected: str, mocker: MockerFixture
+):
+  mocker.patch('xpk.core.telemetry.is_tester', return_value=tester)
+  MetricsCollector.log_start(command='test')
+  payload = MetricsCollector.flush()
+  assert _get_metadata_value(payload, 'XPK_TESTER') == expected
+
+
+@pytest.mark.parametrize(
+    argnames='trash_execution,expected',
+    argvalues=[
+        ('true', 'true'),
+        ('false', 'false'),
+        ('', 'false'),
+        (None, 'false'),
+    ],
+)
+def test_metrics_collectors_logs_correct_tester_value_for_trash_variable(
+    trash_execution: str, expected: str, mocker: MockerFixture
+):
+  mocker.patch('os.getenv', return_value=trash_execution)
+  MetricsCollector.log_start(command='test')
+  payload = MetricsCollector.flush()
+  assert _get_metadata_value(payload, 'XPK_TESTER') == expected
+
+
 def _get_metadata_value(payload_str: str, key: str) -> str | None:
   payload = json.loads(payload_str)
   metadata = json.loads(payload['log_event'][0]['source_extension_json'])[
xpk/core/testing/commands_tester.py
CHANGED

@@ -17,6 +17,8 @@ limitations under the License.
 import re
 from pytest_mock import MockerFixture

+from ..commands import FailedCommand
+

 class CommandsTester:
   """Tester class useful for mocking and asserting command runs."""
@@ -27,6 +29,7 @@ class CommandsTester:
       run_command_for_value_path: str | None = None,
       run_command_with_updates_path: str | None = None,
       run_command_with_updates_retry_path: str | None = None,
+      run_command_batch_path: str | None = None,
   ):
     self.__results: dict[re.Pattern, tuple[int, str]] = {}
     self.commands_history: list[str] = []
@@ -45,6 +48,11 @@ class CommandsTester:
           run_command_with_updates_retry_path,
           wraps=self.__fake_run_command_with_updates_retry,
       )
+    if run_command_batch_path:
+      mocker.patch(
+          run_command_batch_path,
+          wraps=self.__fake_run_command_batch,
+      )

   def set_result_for_command(
       self, result: tuple[int, str], *command_parts: str
@@ -111,6 +119,24 @@ class CommandsTester:
   ) -> tuple[int, str]:
     return self.__common_fake_run_command(command, (0, dry_run_return_val))

+  def __fake_run_command_batch(
+      self,
+      commands: list[str],
+      jobname: str,
+      per_command_name: list[str],
+      output_logs: list[str],
+  ) -> FailedCommand | None:
+    for i, command in enumerate(commands):
+      result = self.__common_fake_run_command(command, (0, ""))[0]
+      if result != 0:
+        return FailedCommand(
+            return_code=result,
+            name=per_command_name[i],
+            command=command,
+            logfile=output_logs[i],
+        )
+    return None
+
   # pylint: enable=unused-argument

   def __common_fake_run_command(
xpk/core/testing/commands_tester_test.py
CHANGED

@@ -17,7 +17,7 @@ limitations under the License.
 import pytest
 from pytest_mock import MockerFixture

-from xpk.core.commands import run_command_for_value, run_command_with_updates_retry
+from xpk.core.commands import run_command_for_value, run_command_with_updates_retry, run_command_batch
 from xpk.core.testing.commands_tester import CommandsTester

@@ -31,6 +31,9 @@ def mock_commands(mocker: MockerFixture) -> CommandsTester:
       run_command_with_updates_retry_path=(
           "xpk.core.testing.commands_tester_test.run_command_with_updates_retry"
       ),
+      run_command_batch_path=(
+          "xpk.core.testing.commands_tester_test.run_command_batch"
+      ),
   )

@@ -54,6 +57,22 @@ def test_run_command_with_updates_retry_default_result(
   mock_commands.assert_command_run("cmd", "bar")


+def test_run_command_batch_default_result(
+    mock_commands: CommandsTester,
+):
+  result = run_command_batch(
+      commands=["cmd1 foo bar", "cmd2 foo bar"],
+      jobname="Test command",
+      per_command_name=["cmd1", "cmd2"],
+      output_logs=["log1", "log2"],
+  )
+
+  assert result is None
+  mock_commands.assert_command_run("foo bar", times=2)
+  mock_commands.assert_command_run("cmd1")
+  mock_commands.assert_command_run("cmd2")
+
+
 def test_set_result_for_command(mock_commands: CommandsTester):
   mock_commands.set_result_for_command((17, "Error!"), "cmd", "--err")

xpk/core/workload_decorators/rdma_decorator.py
CHANGED

@@ -84,6 +84,12 @@ def add_volumes(job_manifest):
   volumes.append(
       {'name': 'gib', 'hostPath': {'path': '/home/kubernetes/bin/gib'}}
   )
+  volumes.append({
+      'name': 'dshm',
+      'emptyDir': {
+          'medium': 'Memory',
+      },
+  })


 def add_tolerations(job_manifest):
@@ -111,3 +117,6 @@ def update_gpu_containers(job_manifest):
     container['volumeMounts'].append(
         {'name': 'gib', 'mountPath': '/usr/local/gib'}
     )
+    container['volumeMounts'].append(
+        {'name': 'dshm', 'mountPath': '/dev/shm'}
+    )
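Design note: a container's default `/dev/shm` is only 64 MiB, which is too small for NCCL's shared-memory transport on RDMA/GPU nodes; mounting a Memory-medium `emptyDir` at `/dev/shm` is the usual Kubernetes workaround. A toy manifest showing what the two new appends produce (not xpk's full manifest schema):

```python
# Toy job manifest; only the keys the decorator touches are shown.
job_manifest = {
    'spec': {
        'template': {
            'spec': {
                'volumes': [],
                'containers': [{'name': 'gpu', 'volumeMounts': []}],
            }
        }
    }
}

pod_spec = job_manifest['spec']['template']['spec']
# Memory-backed emptyDir replaces the 64 MiB default /dev/shm.
pod_spec['volumes'].append({'name': 'dshm', 'emptyDir': {'medium': 'Memory'}})
for container in pod_spec['containers']:
  container['volumeMounts'].append({'name': 'dshm', 'mountPath': '/dev/shm'})

assert pod_spec['containers'][0]['volumeMounts'][0]['mountPath'] == '/dev/shm'
```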
xpk/parser/cluster.py
CHANGED

@@ -338,7 +338,10 @@ def set_cluster_create_ray_parser(cluster_create_ray_parser: ArgumentParser):
   add_resource_limits(cluster_create_resource_limits)

   cluster_create_ray_parser.set_defaults(
-      func=cluster_create_ray_cluster,
+      func=cluster_create_ray_cluster,
+      sub_slicing=False,
+      super_slicing=False,
+      num_cubes=None,
   )


@@ -503,6 +506,13 @@ def set_cluster_adapt_parser(cluster_adapt_parser: ArgumentParser):
   )
   add_driver_arguments(cluster_adapt_optional_arguments)
   add_shared_arguments(cluster_adapt_optional_arguments)
+  add_resource_limits(cluster_adapt_optional_arguments)
+
+  if FeatureFlags.SUB_SLICING_ENABLED:
+    add_cluster_create_sub_slicing_arguments(cluster_adapt_optional_arguments)
+
+  if FeatureFlags.SUPER_SLICING_ENABLED:
+    add_cluster_create_super_slicing_arguments(cluster_adapt_optional_arguments)

   cluster_adapt_capacity_arguments = cluster_adapt_parser.add_argument_group(
       'Capacity Arguments', 'Arguments related to capacity for cluster create.'