xpk 1.1.0-py3-none-any.whl → 1.1.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
xpk/commands/workload.py CHANGED
@@ -481,6 +481,16 @@ def workload_create(args) -> None:
       + lustre_storages
   )
 
+  use_sub_slicing = (
+      workload_scheduling == WorkloadScheduling.SUB_SLICING_AVAILABLE
+  )
+  use_super_slicing = (
+      workload_scheduling == WorkloadScheduling.SUPER_SLICING_AVAILABLE
+  )
+  parallel_containers = workload_system.parallel_containers
+  if args.use_pathways or use_super_slicing:
+    parallel_containers = 1
+
   # Currently failure policy rules are supported for Pathways workloads. b/408465881
   failure_policy_rules = ''
   pod_failure_policy = ''
@@ -497,10 +507,8 @@ def workload_create(args) -> None:
     rules:
   """
   docker_image = get_main_container_docker_image(args, workload_system)
-  for i in range(workload_system.parallel_containers):
-    docker_image_sufix = (
-        f'-{i + 1}' if workload_system.parallel_containers > 1 else ''
-    )
+  for i in range(parallel_containers):
+    docker_image_sufix = f'-{i + 1}' if parallel_containers > 1 else ''
     pod_failure_policy += f"""
     - action: FailJob
       onPodConditions: []
@@ -533,7 +541,7 @@ def workload_create(args) -> None:
   # Create the workload file based on accelerator type or workload type.
   if workload_system.accelerator_type == AcceleratorType.GPU:
     container, debugging_dashboard_id = get_user_workload_container(
-        args, workload_system
+        args, workload_system, parallel_containers=parallel_containers
     )
     gpu_scheduler, return_code = get_gpu_scheduler(
         args, workload_system, autoprovisioning_args
@@ -624,25 +632,21 @@ def workload_create(args) -> None:
         custom_pathways_server=append_custom_pathways_server(args),
         custom_pathways_worker=append_custom_pathways_worker(args),
         colocated_python_sidecar=append_custom_colocated_python_sidecar(args),
-        user_workload=get_user_workload_for_pathways(args, workload_system),
+        user_workload=get_user_workload_for_pathways(
+            args, workload_system, parallel_containers
+        ),
         local_queue_name=LOCAL_QUEUE_NAME,
         autoprovisioning_args=autoprovisioning_args,
         placement_policy_label=placement_policy_label,
     )
   else:
-    use_sub_slicing = (
-        workload_scheduling == WorkloadScheduling.SUB_SLICING_AVAILABLE
-    )
-    use_super_slicing = (
-        workload_scheduling == WorkloadScheduling.SUPER_SLICING_AVAILABLE
-    )
     if use_sub_slicing:
       xpk_print('Workload will be scheduled using the Sub-slicing feature.')
     if use_super_slicing:
       xpk_print('Workload will be scheduled using the Super-slicing feature.')
 
     container, debugging_dashboard_id = get_user_workload_container(
-        args, workload_system
+        args, workload_system, parallel_containers
     )
 
     machine_label = (
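Taken together, the workload.py hunks hoist the container-count decision to the top of workload_create: Pathways and Super-slicing workloads are pinned to one container per VM, everything else keeps the system default, and the result is passed to every helper instead of being read (and mutated) on system.parallel_containers. A minimal sketch of the hoisted rule, using a hypothetical standalone helper; the enum member names are taken from the diff, their concrete values are assumptions:

import enum


class WorkloadScheduling(enum.Enum):
  # Member names appear in the diff; the values here are assumptions.
  AVAILABLE = enum.auto()
  SUB_SLICING_AVAILABLE = enum.auto()
  SUPER_SLICING_AVAILABLE = enum.auto()
  UNAVAILABLE = enum.auto()


def resolve_parallel_containers(
    use_pathways: bool,
    workload_scheduling: WorkloadScheduling,
    system_parallel_containers: int,
) -> int:
  # Pathways and Super-slicing runs are limited to a single container
  # per VM; all other workloads keep the system's default count.
  if use_pathways or (
      workload_scheduling == WorkloadScheduling.SUPER_SLICING_AVAILABLE
  ):
    return 1
  return system_parallel_containers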
xpk/core/docker_container.py CHANGED
@@ -30,12 +30,18 @@ from .system_characteristics import (
 )
 
 
-def get_main_and_sidecar_container(args, system, docker_image) -> str:
+def get_main_and_sidecar_container(
+    args,
+    system: SystemCharacteristics,
+    docker_image: str,
+    parallel_containers: int,
+) -> str:
   """Generate yaml for main and sidecar container.
   Args:
     args: user provided arguments for running the command.
     system: system characteristics
     docker_image: docker image
+    parallel_containers: number of containers to run per VM.
 
   Returns:
     str:
@@ -44,7 +50,9 @@ def get_main_and_sidecar_container(args, system, docker_image) -> str:
   resource_type = AcceleratorTypeToAcceleratorCharacteristics[
       system.accelerator_type
   ].resource_type
-  main_container = get_main_container(args, system, docker_image, resource_type)
+  main_container = get_main_container(
+      args, system, docker_image, resource_type, parallel_containers
+  )
   yaml = """- name: stacktrace-explorer
   image: busybox:1.28
   args: [/bin/sh, -c, "check_signal() (while [ ! -f /shared-volume/stacktrace_signal ]; do sleep 1; done; pid=$(pidof 'tail'); kill $pid;); check_signal & while [ ! -d /tmp/debugging ]; do sleep 60; done; while [ ! -e /tmp/debugging/* ]; do sleep 60; done; tail -n+1 -f /tmp/debugging/*; exit 0;"]
@@ -59,13 +67,20 @@ def get_main_and_sidecar_container(args, system, docker_image) -> str:
   return yaml.format(main_container=main_container)
 
 
-def get_main_container(args, system, docker_image, resource_type) -> str:
+def get_main_container(
+    args,
+    system: SystemCharacteristics,
+    docker_image: str,
+    resource_type,
+    parallel_containers: int,
+) -> str:
   """Generate yaml for main container including the xpk command.
   Args:
     args: user provided arguments for running the command.
     system: system characteristics
     docker_image: docker image
     resource_type: The label to describe the resource type for TPUs/GPUs/CPUs.
+    parallel_containers: number of containers to run per VM.
 
   Returns:
     str:
@@ -149,14 +164,10 @@ def get_main_container(args, system, docker_image, resource_type) -> str:
   volumeMounts:
   {volume_mounts}
   """
-  # pathways job running on 2 parallel containers is not verified yet
-  if args.use_pathways:
-    system.parallel_containers = 1
-
   env = get_env_container(args, system)
   image_pull_policy = add_image_pull_policy_for_pw_or_gpu(args, system)
-  for i in range(system.parallel_containers):
-    docker_name_sufix = f'-{i + 1}' if system.parallel_containers > 1 else ''
+  for i in range(parallel_containers):
+    docker_name_sufix = f'-{i + 1}' if parallel_containers > 1 else ''
     containers.append(
         container_yaml.format(
             args=args,
@@ -170,19 +181,24 @@ def get_main_container(args, system, docker_image, resource_type) -> str:
             tpu_stacktrace_terminate_command=tpu_stacktrace_terminate_command,
             gpu_workload_terminate_command=gpu_workload_terminate_command,
             xpk_internal_commands=xpk_internal_commands,
-            resources=get_main_container_resources(args, system, resource_type),
+            resources=get_main_container_resources(
+                args, system, resource_type, parallel_containers
+            ),
             volume_mounts=volume_mounts,
         )
     )
   return ''.join(containers)
 
 
-def get_user_workload_container(args, system: SystemCharacteristics):
+def get_user_workload_container(
+    args, system: SystemCharacteristics, parallel_containers: int
+):
   """Deploy user workload container
 
   Args:
     args: user provided args.
     system: system characteristics.
+    parallel_containers: number of containers to run per VM.
 
   Returns:
     container: main container
@@ -209,11 +225,15 @@ def get_user_workload_container(args, system: SystemCharacteristics):
         'Sidecar container to display stack traces for TPU workloads will also'
         ' be deployed.'
     )
-    container = get_main_and_sidecar_container(args, system, docker_image)
+    container = get_main_and_sidecar_container(
+        args, system, docker_image, parallel_containers
+    )
     # Get GKE debugging dashboard only when sidecar container is deployed for TPU workloads
     debugging_dashboard_id = get_gke_debugging_dashboard(args)
   else:
-    container = get_main_container(args, system, docker_image, resource_type)
+    container = get_main_container(
+        args, system, docker_image, resource_type, parallel_containers
+    )
   return container, debugging_dashboard_id
 
 
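The docker_container.py changes swap a hidden side effect (get_main_container silently setting system.parallel_containers = 1 for Pathways) for an explicit parallel_containers argument. The per-container naming convention itself is unchanged; a small sketch of it, using a hypothetical container_names helper that is not part of xpk:

def container_names(base_name: str, parallel_containers: int) -> list[str]:
  # Mirrors the docker_name_sufix logic above: a single container keeps
  # the bare name, while N > 1 containers are suffixed -1 .. -N.
  return [
      f'{base_name}-{i + 1}' if parallel_containers > 1 else base_name
      for i in range(parallel_containers)
  ]


assert container_names('jax-tpu', 1) == ['jax-tpu']
assert container_names('jax-tpu', 2) == ['jax-tpu-1', 'jax-tpu-2']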
xpk/core/docker_resources.py CHANGED
@@ -23,7 +23,10 @@ from ..utils.execution_context import is_dry_run
 
 
 def get_main_container_resources(
-    args, system: SystemCharacteristics, resource_type
+    args,
+    system: SystemCharacteristics,
+    resource_type: str,
+    parallel_containers: int,
 ) -> str:
   """Resources for the main container.
   Args:
@@ -53,10 +56,7 @@ def get_main_container_resources(
     offset_vCPUs = int(system.chips_per_vm) * 0.95
     return f'{resource_type}: {offset_vCPUs}'
 
-  return (
-      f'{resource_type}:'
-      f' {int(system.chips_per_vm / system.parallel_containers)}'
-  )
+  return f'{resource_type}: {int(system.chips_per_vm / parallel_containers)}'
 
 
 def get_env_container(args, system: SystemCharacteristics) -> str:
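In docker_resources.py the accelerator request is likewise divided by the passed-in count rather than by system.parallel_containers. The arithmetic is unchanged: each of the N containers on a VM requests chips_per_vm / N chips. A toy rendering of the returned resource line, with illustrative values (the resource name here is only an example):

def resource_line(
    resource_type: str, chips_per_vm: int, parallel_containers: int
) -> str:
  # Each container requests an equal integer share of the VM's chips.
  return f'{resource_type}: {int(chips_per_vm / parallel_containers)}'


# e.g. 4 chips per VM split across 2 containers -> 2 chips per container
assert resource_line('google.com/tpu', 4, 2) == 'google.com/tpu: 2'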
xpk/core/pathways.py CHANGED
@@ -245,18 +245,12 @@ def append_custom_colocated_python_sidecar(args) -> str:
 
 
 def get_user_workload_for_pathways(
-    args,
-    system: SystemCharacteristics,
+    args, system: SystemCharacteristics, parallel_containers: int
 ) -> str:
   """
   Create a user workload container for Pathways.
   Don't create one for Pathways headless mode.
 
-  Args:
-    args: user provided args.
-    system: system characteristics.
-
-
 
   Returns:
     str:
       Pathways server port as a YAML string
@@ -280,7 +274,9 @@ def get_user_workload_for_pathways(
   if args.headless:
     return ''
   else:
-    container, _ = get_user_workload_container(args, system)
+    container, _ = get_user_workload_container(
+        args, system, parallel_containers
+    )
     return user_workload_yaml.format(
         args=args,
         container=container,
xpk/core/scheduling.py CHANGED
@@ -88,6 +88,18 @@ def check_if_workload_can_schedule(
       return WorkloadScheduling.UNAVAILABLE
     return WorkloadScheduling.AVAILABLE
 
+  if cluster_system and _check_super_slicing_availability(
+      workload_system=workload_system, cluster_system=cluster_system
+  ):
+    if _check_workload_size_fits(
+        args,
+        workload_system,
+        max_vm_in_cluster=int(resources_config_map[cluster_system.device_type]),
+    ) and _check_super_slicing_topology(workload_system):
+      return WorkloadScheduling.SUPER_SLICING_AVAILABLE
+    else:
+      return WorkloadScheduling.UNAVAILABLE
+
   if workload_system.device_type in resources_config_map:
     if _check_workload_size_fits(
         args,
@@ -112,18 +124,6 @@ def check_if_workload_can_schedule(
     else:
       return WorkloadScheduling.UNAVAILABLE
 
-  if cluster_system and _check_super_slicing_availability(
-      workload_system=workload_system, cluster_system=cluster_system
-  ):
-    if _check_workload_size_fits(
-        args,
-        workload_system,
-        max_vm_in_cluster=int(resources_config_map[cluster_system.device_type]),
-    ) and _check_super_slicing_topology(workload_system):
-      return WorkloadScheduling.SUPER_SLICING_AVAILABLE
-    else:
-      return WorkloadScheduling.UNAVAILABLE
-
   xpk_print(
       'Workload scheduling validation failed. XPK will not create the workload'
       f' {args.workload}.'
xpk/core/scheduling_test.py CHANGED
@@ -398,15 +398,23 @@ SUPER_SLICING_CASE = SchedulingTestCase(
         WorkloadScheduling.UNAVAILABLE,
     ),
     (
-        (
-            'Super-slicing should be ignored when a given device is already'
-            ' present in the cluster'
+        'Super-slicing, but one cube',
+        dataclasses.replace(
+            SUPER_SLICING_CASE,
+            workload_system=_get_system_characteristics_or_die('tpu7x-128'),
+            cluster_system=_get_system_characteristics_or_die('tpu7x-128'),
+            resources_config_map={'tpu7x-128': '16'},
         ),
+        WorkloadScheduling.SUPER_SLICING_AVAILABLE,
+    ),
+    (
+        'Super-slicing, but one cube and no super-slicing-topology',
         dataclasses.replace(
             SUPER_SLICING_CASE,
-            workload_system=_get_system_characteristics_or_die('tpu7x-64'),
-            cluster_system=_get_system_characteristics_or_die('tpu7x-64'),
-            resources_config_map={'tpu7x-64': '16'},
+            workload_system=_get_system_characteristics_or_die('tpu7x-128'),
+            cluster_system=_get_system_characteristics_or_die('tpu7x-128'),
+            resources_config_map={'tpu7x-128': '16'},
+            super_slicing_topology_set=False,
        ),
        WorkloadScheduling.AVAILABLE,
    ),
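The scheduling.py change is a pure reordering: the super-slicing branch now runs before the exact device-type match, so a workload whose device type already appears in the cluster's resources config map is still routed to super-slicing first; without a super-slicing topology it falls through to the plain match, which the two new test cases pin down for a single-cube tpu7x-128 workload. A condensed sketch of the resulting precedence (boolean predicates stand in for the private _check_* helpers, and results are shown as strings rather than WorkloadScheduling members):

def schedule(
    super_slicing_ok: bool, device_in_cluster: bool, fits: bool
) -> str:
  # After the reorder, super-slicing is evaluated before the plain
  # device-type match, so it wins even when the device type is
  # already present in the cluster.
  if super_slicing_ok:
    return 'SUPER_SLICING_AVAILABLE' if fits else 'UNAVAILABLE'
  if device_in_cluster:
    return 'AVAILABLE' if fits else 'UNAVAILABLE'
  return 'UNAVAILABLE'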
xpk-1.1.0.dist-info/METADATA → xpk-1.1.2.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: xpk
-Version: 1.1.0
+Version: 1.1.2
 Summary: xpk helps Cloud developers to orchestrate training jobs on accelerators on GKE.
 Author-email: XPK team <xpk-code-reviewers@google.com>
 License: Apache-2.0
xpk-1.1.0.dist-info/RECORD → xpk-1.1.2.dist-info/RECORD CHANGED
@@ -27,7 +27,7 @@ xpk/commands/managed_ml_diagnostics.py,sha256=87wmFbnYQY-kEpJfPo1Up53xM5P_P5wOlX
 xpk/commands/managed_ml_diagnostics_test.py,sha256=pQ1YUGMGRQFJYTS_1o9YyGUzYdLaBdA84LjbnncaeEo,3828
 xpk/commands/storage.py,sha256=cSTJN9Mjvdsvk_Nk43kVdQFhp89nxWbanDsTOGZCkpQ,10708
 xpk/commands/version.py,sha256=k30rdLP9clUM8eeSwRFhpfzSb1qwcQImTfuC59Ed6CA,771
-xpk/commands/workload.py,sha256=gDIzul8myTHG5J45LRjeIC-iSeNJ9ATE1j3DJyt4k4A,32172
+xpk/commands/workload.py,sha256=Xhu_xNzGnKVfU3Piwf-rJbNO0r0LCjwslYjYlvOjD8Y,32347
 xpk/commands/workload_test.py,sha256=m79x6YDYn-36BX0CttTtAMdt_O-WJY40FLTGa6KwKg8,9804
 xpk/core/__init__.py,sha256=YPwWBbgLAu7L-YlTVGB2r8ZV4TzypURMRBcehSHHlLY,561
 xpk/core/capacity.py,sha256=MGiNOwBCwg8Ci-hsssbZYIJ2xXTm6Y5yKTO4J5ozqEk,11053
@@ -38,10 +38,10 @@ xpk/core/cluster_test.py,sha256=VeC1C7kN0OJe6yeoL8GCaFk4uPhijP6CjvQAcE7q9xw,6653
 xpk/core/commands.py,sha256=at73VJHdZ4rVA8uvW997tNrvnCjP9v6zaw96bU0kd74,10841
 xpk/core/config.py,sha256=U2JDXx-XBuqQpZJf2iUDoww5--E8ejZfgmIxKeGu-gU,4668
 xpk/core/config_test.py,sha256=POSuofK0LFbNNygDAo2fjtKY4NMrRjUFeGcpBh9JOS4,3569
-xpk/core/docker_container.py,sha256=Lsn6eJNN6dxvd7IbD0Ew4NnPKYM3VQyB8ursdG4jrIc,7919
+xpk/core/docker_container.py,sha256=9kJpTEholW_d_GamjcqunCWT4XwrDyZs3fcvcPNCb8Y,8294
 xpk/core/docker_image.py,sha256=9vwqbb6Mc3C5ZEOph03WS-EWI5hxMYGGigqzIMkDTjE,6909
 xpk/core/docker_manager.py,sha256=vGPCWPDB507sxEsXvSD4IM-h5HqQzYLk7WSdCUmSDb4,10568
-xpk/core/docker_resources.py,sha256=7EXV1CvwCVogE5-m6utSE1GXxwf6EpB4QDYeuGXWHmI,12547
+xpk/core/docker_resources.py,sha256=bwHGNh_gOtprVOeoFC8NObgKGD9aDjNc2XBMS6syD2Q,12562
 xpk/core/filestore.py,sha256=mcuUzsAPARbnrBG4fIGsEoN8NmzjaQ6k0tvIwMtjO9k,8068
 xpk/core/gcloud_context.py,sha256=d1wQ76zp7QMdG5BxB3sJz4b4OF5Mc8OzmPd_m0xd-Ys,6810
 xpk/core/gcloud_context_test.py,sha256=M8rp6S1zaEcAI7u4Bt8ukWKzv82HH5h9oYVojBcKgHk,5987
@@ -56,12 +56,12 @@ xpk/core/nap.py,sha256=gBxXu8Png1-BlAHbxLWZgbSXeLMGVixufkQVMR0fmvk,12963
 xpk/core/network.py,sha256=Oulb7U69lWkpOKxOC1C7ekJDpC51TLwd7XdZA3NQ7E0,10505
 xpk/core/nodepool.py,sha256=FX2ljKvwMsG3fXfn_CDCRwiKH4UAArQeDiFLq3XK9F0,25495
 xpk/core/nodepool_test.py,sha256=9xSFpn-1j9Vd0J8KFzbq8ywS_Ibsbx4CgR1er68mRnw,17542
-xpk/core/pathways.py,sha256=32GxCIPiEBqSpK6g2gMmB7Nxj_HlG4I30u1C9UyWl1A,11594
+xpk/core/pathways.py,sha256=9w_VrpLLjQSSdNd8HJLWWtIYzA0NpR7t70knRSVLK0w,11574
 xpk/core/pathways_test.py,sha256=UeuSo_g9BNI27to-wflQwc6dJFVSA5-kOK_cjmY5qgU,1809
 xpk/core/ray.py,sha256=JWhc_ToRHpF4_URGnuE_47FMgamaRsA4KVUMpqThWzw,6145
 xpk/core/resources.py,sha256=dDsG_LOtcU17p1UKgOYyjdPxbMfqcb7pJ4SjfLDA6Os,9389
-xpk/core/scheduling.py,sha256=UWEN7cstbvc_9EfSTD1efZD59L5oh7riwNs9TLbvx00,12542
-xpk/core/scheduling_test.py,sha256=0QNiucR77tl72s5FOsp_8RKRp9CjjXSrrhAkTX9kMTg,15883
+xpk/core/scheduling.py,sha256=J0yTpb4jBTQTFJ5QPyycFPFAKXC0fnmxeXRxZbvx8k8,12542
+xpk/core/scheduling_test.py,sha256=zoGLoxNYLQGeQKtWOhBPP0bj4B0zXylRKhRIjO-TyTc,16280
 xpk/core/storage.py,sha256=NILvVAcLNMLmp4wKx_TEKbMMF5X1oL-FrQV46PT0_ds,16902
 xpk/core/system_characteristics.py,sha256=8WXi48mZ7eT9r57FZ5eFtmdonik7MItGTYiuYvcjXG8,34335
 xpk/core/system_characteristics_test.py,sha256=XVaKJ5wYdNwwwUKBnuK3zd1u-Qj3VnJR7MHlOeCa-K0,8029
@@ -142,9 +142,9 @@ xpk/utils/validation.py,sha256=rE9LTkXJT7jIesodFb9pONL7ixhLqiQleyoaz7N39Dw,2765
 xpk/utils/validation_test.py,sha256=PEDSMUqZdt_Lx1FSR-LOTXKKtsJ47JH1fxugM0Gfz6Y,1168
 xpk/utils/versions.py,sha256=_Ep68W70a9605XjiaOOpBa9Is9jXlsoOiwL8v5Xt-WA,897
 xpk/utils/yaml.py,sha256=j8xuAJ9yAAwnQi6ozwZ-nMnDyDnc3xWkeBZMtSuP4RU,844
-xpk-1.1.0.dist-info/licenses/LICENSE,sha256=z8d0m5b2O9McPEK1xHG_dWgUBT6EfBDz6wA0F7xSPTA,11358
-xpk-1.1.0.dist-info/METADATA,sha256=b9BX5o6QPikxeZlBzNsCRNSVUpQm3jQs6KSuYhyz88o,10013
-xpk-1.1.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-xpk-1.1.0.dist-info/entry_points.txt,sha256=mzEtiIesFkT1kmcTUVDA1o3uOhiniX6tIz2wmOlMu1M,38
-xpk-1.1.0.dist-info/top_level.txt,sha256=aDe4N0jicmuWExx_6w0TxWQJaEuPSs9BnLU-3aF1GLo,4
-xpk-1.1.0.dist-info/RECORD,,
+xpk-1.1.2.dist-info/licenses/LICENSE,sha256=z8d0m5b2O9McPEK1xHG_dWgUBT6EfBDz6wA0F7xSPTA,11358
+xpk-1.1.2.dist-info/METADATA,sha256=qKqUDQuylrwPZI7NNzHvJWj6kJE08pZ3SbxHNGZ9qgI,10013
+xpk-1.1.2.dist-info/WHEEL,sha256=qELbo2s1Yzl39ZmrAibXA2jjPLUYfnVhUNTlyF1rq0Y,92
+xpk-1.1.2.dist-info/entry_points.txt,sha256=mzEtiIesFkT1kmcTUVDA1o3uOhiniX6tIz2wmOlMu1M,38
+xpk-1.1.2.dist-info/top_level.txt,sha256=aDe4N0jicmuWExx_6w0TxWQJaEuPSs9BnLU-3aF1GLo,4
+xpk-1.1.2.dist-info/RECORD,,
xpk-1.1.0.dist-info/WHEEL → xpk-1.1.2.dist-info/WHEEL CHANGED
@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: setuptools (80.9.0)
+Generator: setuptools (80.10.1)
 Root-Is-Purelib: true
 Tag: py3-none-any
 