xpk-0.8.0-py3-none-any.whl → xpk-0.10.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. xpk/commands/batch.py +5 -6
  2. xpk/commands/cluster.py +246 -73
  3. xpk/commands/cluster_gcluster.py +27 -0
  4. xpk/commands/common.py +40 -1
  5. xpk/commands/kjob_common.py +13 -1
  6. xpk/commands/run.py +4 -5
  7. xpk/commands/shell.py +2 -2
  8. xpk/commands/storage.py +24 -6
  9. xpk/commands/workload.py +66 -27
  10. xpk/core/blueprint/blueprint_generator.py +115 -47
  11. xpk/core/capacity.py +66 -6
  12. xpk/core/cluster.py +282 -13
  13. xpk/core/config.py +1 -65
  14. xpk/core/docker_manager.py +1 -1
  15. xpk/core/docker_resources.py +145 -72
  16. xpk/core/filestore.py +2 -6
  17. xpk/core/gcsfuse.py +22 -4
  18. xpk/core/jobset.py +143 -0
  19. xpk/core/kjob.py +21 -18
  20. xpk/core/kueue.py +194 -4
  21. xpk/core/mtc.py +195 -0
  22. xpk/core/network.py +23 -1
  23. xpk/core/nodepool.py +17 -4
  24. xpk/core/pathways.py +2 -3
  25. xpk/core/resources.py +21 -0
  26. xpk/core/storage.py +1 -95
  27. xpk/core/system_characteristics.py +1 -1
  28. xpk/core/workload.py +1 -45
  29. xpk/core/workload_decorators/rdma_decorator.py +8 -10
  30. xpk/core/workload_decorators/tcpx_decorator.py +185 -0
  31. xpk/core/workload_decorators/tcpxo_decorator.py +22 -14
  32. xpk/parser/cluster.py +589 -389
  33. xpk/parser/storage.py +12 -3
  34. xpk/parser/workload.py +21 -3
  35. xpk/utils/kubectl.py +4 -1
  36. {xpk-0.8.0.dist-info → xpk-0.10.0.dist-info}/METADATA +178 -96
  37. {xpk-0.8.0.dist-info → xpk-0.10.0.dist-info}/RECORD +41 -38
  38. {xpk-0.8.0.dist-info → xpk-0.10.0.dist-info}/WHEEL +1 -1
  39. {xpk-0.8.0.dist-info → xpk-0.10.0.dist-info}/entry_points.txt +0 -0
  40. {xpk-0.8.0.dist-info → xpk-0.10.0.dist-info}/licenses/LICENSE +0 -0
  41. {xpk-0.8.0.dist-info → xpk-0.10.0.dist-info}/top_level.txt +0 -0
xpk/commands/storage.py CHANGED
@@ -29,6 +29,7 @@ from ..core.cluster import (
     setup_k8s_env,
     update_cluster_with_parallelstore_driver_if_necessary,
     update_cluster_with_pd_driver_if_necessary,
+    update_cluster_with_lustre_driver_if_necessary,
     update_cluster_with_gcpfilestore_driver_if_necessary,
     update_cluster_with_gcsfuse_driver_if_necessary,
     update_cluster_with_workload_identity_if_necessary,
@@ -45,6 +46,7 @@ from ..core.storage import (
     GCS_FUSE_TYPE,
     GCE_PD_TYPE,
     PARALLELSTORE_TYPE,
+    LUSTRE_TYPE,
     STORAGE_CRD_PLURAL,
     XPK_API_GROUP_NAME,
     XPK_API_GROUP_VERSION,
@@ -86,7 +88,6 @@ def storage_create(args: Namespace) -> None:
         args.vol,
         args.access_mode,
         filestore_network,
-        args.mount_options,
     )
 
   k8s_api_client = setup_k8s_env(args)
@@ -162,7 +163,6 @@ def storage_attach(args: Namespace) -> None:
         args.vol,
         args.access_mode,
         filestore_network,
-        args.mount_options,
     )
 
   elif args.type == GCS_FUSE_TYPE:
@@ -178,14 +178,18 @@ def storage_attach(args: Namespace) -> None:
       manifest = list(yaml.safe_load_all(f))
     else:
       manifest = gcsfuse.manifest(
-          args.name, args.bucket, args.size, args.mount_options
+          args.name,
+          args.bucket,
+          args.size,
+          args.mount_options,
+          args.prefetch_metadata,
       )
 
-  elif args.type in [PARALLELSTORE_TYPE, GCE_PD_TYPE]:
+  elif args.type in [PARALLELSTORE_TYPE, GCE_PD_TYPE, LUSTRE_TYPE]:
     if args.manifest is None:
       xpk_print(
-          "Parallelstore and PersistentDisk are currently supported only with"
-          " --manifest"
+          "Parallelstore, PersistentDisk, and Lustre are currently supported"
+          " only with --manifest"
       )
       xpk_exit(1)
 
@@ -232,6 +236,11 @@ def enable_csi_drivers_if_necessary(args: Namespace) -> None:
     if return_code > 0:
       xpk_exit(return_code)
 
+  if args.type == LUSTRE_TYPE:
+    return_code = update_cluster_with_lustre_driver_if_necessary(args)
+    if return_code > 0:
+      xpk_exit(return_code)
+
 
 def storage_list(args: Namespace) -> None:
   k8s_api_client = setup_k8s_env(args)
@@ -323,3 +332,12 @@ def delete_storage_resources(k8s_api_client: ApiClient, storage: Storage):
       storage.name,
       "Storage",
   )
+
+  # remove kubernetes.io/pvc-protection
+  delete_resource(
+      lambda name: core_api.patch_namespaced_persistent_volume_claim(
+          name, "default", {"metadata": {"finalizers": None}}
+      ),
+      storage.pvc,
+      "Persistent Volume Claim finalizers",
+  )
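The new cleanup step in delete_storage_resources clears the claim's finalizers so deletion is not left hanging on kubernetes.io/pvc-protection. A minimal standalone sketch of the same patch with the official Kubernetes Python client (not part of the diff; the PVC name and namespace are hypothetical examples):

# Minimal sketch, assuming a reachable cluster and the `kubernetes` package.
from kubernetes import client, config

config.load_kube_config()  # or load_incluster_config() when running in a pod
core_api = client.CoreV1Api()

# Clearing the finalizers (e.g. kubernetes.io/pvc-protection) lets a pending
# delete of the PersistentVolumeClaim complete.
core_api.patch_namespaced_persistent_volume_claim(
    name="my-lustre-pvc",  # hypothetical PVC name
    namespace="default",
    body={"metadata": {"finalizers": None}},
)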
xpk/commands/workload.py CHANGED
@@ -15,27 +15,24 @@ limitations under the License.
 """
 
 from ..core.blueprint.blueprint_generator import (
-    get_subnetworks_for_a3mega,
-    get_subnetworks_for_a3ultra,
-    get_subnetworks_for_a4,
+    a3high_device_type,
+    a3mega_device_type,
+    a3ultra_device_type,
+    a4_device_type,
 )
 from ..core.cluster import (
     XPK_SA,
-    create_xpk_k8s_service_account,
+    setup_k8s_service_accounts,
     get_cluster_credentials,
     setup_k8s_env,
 )
 from ..core.commands import run_command_with_updates, run_commands
-from ..core.config import (
-    VERTEX_TENSORBOARD_FEATURE_FLAG,
-    XPK_CURRENT_VERSION,
-    parse_env_config,
-)
+from ..core.config import (VERTEX_TENSORBOARD_FEATURE_FLAG, XPK_CURRENT_VERSION)
 from ..core.docker_container import (
     get_main_container_docker_image,
     get_user_workload_container,
 )
-from ..core.docker_resources import get_volumes
+from ..core.docker_resources import get_volumes, parse_env_config
 from ..core.gcloud_context import add_zone_and_project
 from ..core.kueue import LOCAL_QUEUE_NAME
 from ..core.monitoring import get_gke_outlier_dashboard
@@ -43,6 +40,7 @@ from ..core.nap import (
     get_autoprovisioning_node_selector_args,
     is_autoprovisioning_enabled,
 )
+from ..core.network import get_cluster_subnetworks
 from ..core.pathways import (
     append_custom_colocated_python_sidecar,
     append_custom_pathways_proxy_server,
@@ -54,6 +52,10 @@ from ..core.pathways import (
     get_user_workload_for_pathways,
     try_to_delete_pathwaysjob_first,
 )
+from ..core.resources import get_cluster_capacity_type, get_cluster_system_characteristics
+from ..core.capacity import (
+    CapacityType,
+)
 from ..core.resources import CLUSTER_METADATA_CONFIGMAP, get_cluster_configmap
 from ..core.scheduling import (
     check_if_workload_can_schedule,
@@ -69,6 +71,7 @@ from ..core.storage import (
     GCP_FILESTORE_TYPE,
     GCS_FUSE_TYPE,
     PARALLELSTORE_TYPE,
+    LUSTRE_TYPE,
     Storage,
     add_bucket_iam_members,
     get_storage_annotations,
@@ -80,7 +83,6 @@ from ..core.system_characteristics import (
 )
 from ..core.vertex import create_vertex_experiment
 from ..core.workload import (
-    add_gpu_rxdm_container,
     check_if_workload_exists,
     get_workload_list,
     wait_for_job_completion,
@@ -89,11 +91,13 @@ from ..core.workload import (
 from ..core.workload_decorators import (
     rdma_decorator,
     storage_decorator,
+    tcpx_decorator,
     tcpxo_decorator,
 )
 from ..utils.console import get_user_input, xpk_exit, xpk_print
 from ..utils.file import write_tmp_file
 from . import cluster_gcluster
+from .common import is_TAS_possible
 
 WORKLOAD_CREATE_YAML = """apiVersion: jobset.x-k8s.io/v1alpha2
 kind: JobSet
@@ -126,6 +130,8 @@ spec:
             {storage_annotations}
           spec:
             schedulerName: {args.scheduler}
+            imagePullSecrets:
+            - name: {args.docker_image_pull_secret}
             restartPolicy: Never
             {affinity}
             nodeSelector:
@@ -139,6 +145,8 @@ spec:
             containers:
             {container}
             serviceAccountName: {service_account}
+            tolerations:
+            {tpu_toleration}
             volumes:
             {volumes}
 """
@@ -178,6 +186,8 @@ spec:
             {gpu_scheduler}
             priorityClassName: {args.priority}
             restartPolicy: Never
+            imagePullSecrets:
+            - name: {args.docker_image_pull_secret}
             hostNetwork: true
             dnsPolicy: ClusterFirstWithHostNet
             terminationGracePeriodSeconds: {args.termination_grace_period_seconds}
@@ -216,11 +226,12 @@ spec:
           metadata:
             labels:
               xpk.google.com/workload: {args.workload}
-            annotations:
-              kueue.x-k8s.io/podset-preferred-topology: "cloud.google.com/gce-topology-host"
+            annotations: {annotations}
           spec:
             priorityClassName: {args.priority}
             restartPolicy: Never
+            imagePullSecrets:
+            - name: {args.docker_image_pull_secret}
             dnsPolicy: ClusterFirstWithHostNet
             terminationGracePeriodSeconds: {args.termination_grace_period_seconds}
             serviceAccountName: {service_account}
@@ -294,7 +305,7 @@ def workload_create(args) -> None:
     0 if successful and 1 otherwise.
   """
   k8s_api_client = setup_k8s_env(args)
-  create_xpk_k8s_service_account()
+  setup_k8s_service_accounts()
 
   workload_exists = check_if_workload_exists(args)
 
@@ -350,7 +361,7 @@ def workload_create(args) -> None:
     if not tensorboard_config:
       xpk_exit(1)
 
-  parse_env_config(args, tensorboard_config, system)
+  parse_env_config(args, tensorboard_config)
 
   autoprovisioning_args = ''
   autoprovisioning_enabled, return_code = is_autoprovisioning_enabled(
@@ -385,6 +396,9 @@ def workload_create(args) -> None:
   pd_storages: list[Storage] = list(
       filter(lambda storage: storage.type == GCE_PD_TYPE, storages)
   )
+  lustre_storages: list[Storage] = list(
+      filter(lambda storage: storage.type == LUSTRE_TYPE, storages)
+  )
   if len(gcs_fuse_storages) > 0:
     service_account = XPK_SA
     xpk_print(f'Detected gcsfuse Storages to add: {gcs_fuse_storages}')
@@ -406,7 +420,7 @@ def workload_create(args) -> None:
         f' {parallelstore_storages}'
     )
   else:
-    xpk_print('No gcp filestore instances to add detected.')
+    xpk_print('No gcp parallelstore instances to add detected.')
 
   if len(pd_storages) > 0:
     service_account = XPK_SA
@@ -414,11 +428,18 @@ def workload_create(args) -> None:
   else:
     xpk_print('No gce persistent disk instances to add detected.')
 
+  if len(lustre_storages) > 0:
+    service_account = XPK_SA
+    xpk_print(f'Detected managed lustre instances to add: {lustre_storages}')
+  else:
+    xpk_print('No managed lustre instances to add detected.')
+
   all_storages = (
       gcs_fuse_storages
       + gcpfilestore_storages
       + parallelstore_storages
      + pd_storages
+      + lustre_storages
   )
 
   # Currently failure policy rules are supported for Pathways workloads. b/408465881
@@ -450,26 +471,41 @@ def workload_create(args) -> None:
   )
   if return_code != 0:
     xpk_exit(return_code)
+  system_characteristics = get_cluster_system_characteristics(args)
+  capacity_type = get_cluster_capacity_type(args)
+
+  annotations = (
+      ''
+      if not is_TAS_possible(
+          system_characteristics,
+          capacity_type,
+          flex=True if capacity_type == CapacityType.FLEX_START else False,
+      )
+      else (
+          'kueue.x-k8s.io/podset-preferred-topology:'
+          ' "cloud.google.com/gce-topology-host"'
+      )
+  )
 
-  if system.device_type in cluster_gcluster.supported_device_types:
+  if (
+      system.device_type in cluster_gcluster.supported_device_types
+      or system.device_type == a3high_device_type
+  ):
     yml_string = A3_GPU_WORKLOAD_CREATE_YAML.format(
         args=args,
         container=container,
         service_account=XPK_SA,
         failure_policy_rules=failure_policy_rules,
         pod_failure_policy=pod_failure_policy,
+        annotations=annotations,
     )
 
-    if args.device_type == cluster_gcluster.a3mega_device_type:
-      sub_networks = get_subnetworks_for_a3mega(args.cluster)
+    sub_networks = get_cluster_subnetworks(args)
+    if args.device_type == a3high_device_type:
+      yml_string = tcpx_decorator.decorate_jobset(yml_string)
+    elif args.device_type == a3mega_device_type:
       yml_string = tcpxo_decorator.decorate_jobset(yml_string, sub_networks)
-
-    if args.device_type == cluster_gcluster.a3ultra_device_type:
-      sub_networks = get_subnetworks_for_a3ultra(args.cluster)
-      yml_string = rdma_decorator.decorate_jobset(yml_string, sub_networks)
-
-    if args.device_type == cluster_gcluster.a4_device_type:
-      sub_networks = get_subnetworks_for_a4()
+    elif args.device_type in [a3ultra_device_type, a4_device_type]:
       yml_string = rdma_decorator.decorate_jobset(yml_string, sub_networks)
 
     if all_storages:
@@ -487,7 +523,6 @@ def workload_create(args) -> None:
         failure_policy_rules=failure_policy_rules,
         pod_failure_policy=pod_failure_policy,
     )
-    yml_string = add_gpu_rxdm_container(yml_string, system, all_storages)
 
   elif args.use_pathways and ensure_pathways_workload_prerequisites(
       args, system
@@ -524,6 +559,10 @@ def workload_create(args) -> None:
             get_storage_annotations(all_storages)
         ),
         service_account=service_account,
+        tpu_toleration="""
+            - operator: "Exists"
+              key: google.com/tpu
+        """ if system.accelerator_type == AcceleratorType['TPU'] else '',
         failure_policy_rules=failure_policy_rules,
         pod_failure_policy=pod_failure_policy,
     )
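The hard-coded kueue.x-k8s.io/podset-preferred-topology annotation is replaced by an {annotations} placeholder that is filled only when topology-aware scheduling is possible for the cluster's capacity type. A minimal sketch of that selection (not part of the diff; is_TAS_possible below is a simplified stand-in for xpk.commands.common.is_TAS_possible, and the capacity rules shown are illustrative assumptions):

# Minimal sketch, not the xpk implementation.
from enum import Enum


class CapacityType(Enum):
  ON_DEMAND = "on_demand"
  SPOT = "spot"
  RESERVATION = "reservation"
  FLEX_START = "flex_start"


def is_TAS_possible(capacity_type: CapacityType, flex: bool) -> bool:
  # Assumption for illustration: TAS applies to reservations and DWS flex-start.
  return capacity_type == CapacityType.RESERVATION or flex


def tas_annotation(capacity_type: CapacityType) -> str:
  flex = capacity_type == CapacityType.FLEX_START
  if not is_TAS_possible(capacity_type, flex=flex):
    return ''  # the template then renders "annotations:" with no entries
  return (
      'kueue.x-k8s.io/podset-preferred-topology:'
      ' "cloud.google.com/gce-topology-host"'
  )


print(tas_annotation(CapacityType.RESERVATION))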
xpk/core/blueprint/blueprint_generator.py CHANGED
@@ -22,7 +22,9 @@ from ruamel import yaml
 
 from ...utils.console import xpk_exit, xpk_print
 from ...utils.file import ensure_directory_exists
+
 from ..capacity import (
+    H100_DEVICE_TYPE,
     B200_DEVICE_TYPE,
     H100_MEGA_DEVICE_TYPE,
     H200_DEVICE_TYPE,
@@ -30,10 +32,11 @@ from ..capacity import (
 )
 from ..system_characteristics import get_system_characteristics_by_device_type
 from .blueprint_definitions import Blueprint, DeploymentGroup, DeploymentModule
-
+from ..kueue import KUEUE_VERSION
 
 yaml = yaml.YAML()
 
+a3high_device_type = H100_DEVICE_TYPE
 a3mega_device_type = H100_MEGA_DEVICE_TYPE
 a3ultra_device_type = H200_DEVICE_TYPE
 a4_device_type = B200_DEVICE_TYPE
@@ -49,21 +52,7 @@ blueprint_dependencies_dir = {
 }
 
 cluster_toolkit_url = "github.com/GoogleCloudPlatform/cluster-toolkit"
-cluster_toolkit_version = "v1.48.0"
-
-
-def get_subnetworks_for_a3mega(cluster_name: str) -> list[str]:
-  return [f"{cluster_name}-gpunet-{i}-subnet" for i in range(8)]
-
-
-def get_subnetworks_for_a3ultra(cluster_name: str) -> list[str]:
-  return [f"{cluster_name}-sub-1"] + [
-      f"{cluster_name}-rdma-sub-{i}" for i in range(8)
-  ]
-
-
-def get_subnetworks_for_a4() -> list[str]:
-  return ["gvnic-1"] + [f"rdma-{i}" for i in range(8)]
+cluster_toolkit_version = "v1.57.1"
 
 
 class BlueprintGeneratorOutput:
@@ -106,6 +95,8 @@ class BlueprintGenerator:
       group_placement_max_distance: int = 2,
       subnetwork_cidr_suffix: int = 24,
       reservation: str | None = None,
+      reservation_placement_policy: dict[str, str] | None = None,
+      reservation_maintenance_interval: str = "PERIODIC",
       gcs_bucket: Optional[str | None] = None,
       capacity_type: CapacityType = CapacityType.ON_DEMAND,
       system_node_pool_min_node_count: int = 2,
@@ -156,7 +147,6 @@ class BlueprintGenerator:
         source="modules/scheduler/gke-cluster",
         use=[primary_vpc_name, gpu_subnets_name],
         settings={
-            "release_channel": "RAPID",
            "prefix_with_deployment_name": False,
            "name_suffix": cluster_name,
            "enable_private_endpoint": False,
@@ -190,27 +180,42 @@ class BlueprintGenerator:
            "group_placement_max_distance": group_placement_max_distance,
        },
    )
+    nodepool_used_deps = ["gke_cluster", gpu_subnets_name]
 
    a3_megagpu_pool_0 = DeploymentModule(
        id="a3_megagpu_pool_0",
        source="modules/compute/gke-node-pool",
-        use=["gke_cluster", gpu_subnets_name, "group_placement_0"],
+        use=nodepool_used_deps,
        settings={
            "name": f"{cluster_name}-a3-megagpu-pool-0",
            "machine_type": system.gce_machine_type,
-            "static_node_count": num_nodes,
            "zones": [zone],
-            "host_maintenance_interval": "PERIODIC",
+            "host_maintenance_interval": reservation_maintenance_interval,
            "reservation_affinity": self._getblock_reservation_affinity(
                reservation
            ),
            "run_workload_script": False,
            "spot": capacity_type == CapacityType.SPOT,
            "max_pods_per_node": 32,
-            "auto_upgrade": True,
+            "guest_accelerator": [{
+                "type": "nvidia-h100-mega-80gb",
+                "count": 8,
+                "gpu_driver_installation_config": {
+                    "gpu_driver_version": "LATEST"
+                },
+            }],
+            "auto_upgrade": (
+                True if capacity_type != CapacityType.FLEX_START else False
+            ),
        },
        outputs=["instructions"],
    )
+    if capacity_type == CapacityType.FLEX_START:
+      a3_megagpu_pool_0.settings.update(self.get_dws_flex_start())
+    else:
+      a3_megagpu_pool_0.settings.update({"static_node_count": num_nodes})
+
+    set_placement_policy = capacity_type != CapacityType.SPOT
    num_chips = num_nodes * system.chips_per_vm
    workload = DeploymentModule(
        id="workload_component_install",
@@ -219,9 +224,17 @@ class BlueprintGenerator:
        settings={
            "kueue": {
                "install": True,
-                "version": "v0.10.0",  # TAS feature-gates is enabled in CT
+                "version": KUEUE_VERSION,  # TAS feature-gates is enabled in CT
                "config_path": f'$(ghpc_stage("{blueprint_name}"))/kueue-xpk-configuration.yaml.tftpl',
-                "config_template_vars": {"num_chips": num_chips},
+                "config_template_vars": {
+                    "num_chips": num_chips,
+                    "reservation": (
+                        1 if capacity_type == CapacityType.RESERVATION else 0
+                    ),
+                    "flex_start": (
+                        1 if capacity_type == CapacityType.FLEX_START else 0
+                    ),
+                },
            },
            "jobset": {"install": True, "version": "v0.7.2"},
            "apply_manifests": [{
@@ -251,18 +264,27 @@ class BlueprintGenerator:
            }]
        },
    )
+
+    print(reservation_placement_policy)
+    if reservation_placement_policy is not None:
+      a3_megagpu_pool_0.settings["placement_policy"] = (
+          reservation_placement_policy
+      )
+
    primary_group = DeploymentGroup(
        group="primary",
        modules=[
            primary_vpc,
            gpunets,
            gke_cluster,
-            group_placement_0,
            a3_megagpu_pool_0,
            workload,
            workload_configmap,
        ],
    )
+    if set_placement_policy and reservation_placement_policy is None:
+      a3_megagpu_pool_0.use.append(group_placement_0.id)
+      primary_group.modules.append(group_placement_0)
    a3_mega_blueprint = Blueprint(
        terraform_backend_defaults=self._getblock_terraform_backend(
            gcs_bucket, cluster_name, prefix
@@ -478,14 +500,22 @@ class BlueprintGenerator:
        source="modules/scheduler/gke-cluster",
        use=[net_0_id],
        settings={
-            "release_channel": "RAPID",
-            "version_prefix": "1.31.",
-            "maintenance_exclusions": [{
-                "name": "no-minor-or-node-upgrades-indefinite",
-                "start_time": "2024-12-01T00:00:00Z",
-                "end_time": "2025-12-22T00:00:00Z",
-                "exclusion_scope": "NO_MINOR_OR_NODE_UPGRADES",
-            }],
+            "release_channel": (
+                "UNSPECIFIED"
+                if capacity_type == CapacityType.FLEX_START
+                else "RAPID"
+            ),
+            "version_prefix": "1.32.",
+            "maintenance_exclusions": (
+                []
+                if capacity_type == CapacityType.FLEX_START
+                else [{
+                    "name": "no-minor-or-node-upgrades-indefinite",
+                    "start_time": "2024-12-01T00:00:00Z",
+                    "end_time": "2025-12-22T00:00:00Z",
+                    "exclusion_scope": "NO_MINOR_OR_NODE_UPGRADES",
+                }]
+            ),
            "prefix_with_deployment_name": False,
            "name_suffix": cluster_name,
            "system_node_pool_machine_type": system_node_pool_machine_type,
@@ -534,9 +564,10 @@ class BlueprintGenerator:
        use=[cluster_id],
        settings={
            "machine_type": system.gce_machine_type,
-            "auto_upgrade": True,
+            "auto_upgrade": (
+                True if capacity_type != CapacityType.FLEX_START else False
+            ),
            "zones": [zone],
-            "static_node_count": num_nodes,
            "spot": capacity_type == CapacityType.SPOT,
            "reservation_affinity": self._getblock_reservation_affinity(
                reservation
@@ -562,6 +593,10 @@ class BlueprintGenerator:
        },
        outputs=["instructions"],
    )
+    if capacity_type == CapacityType.FLEX_START:
+      gpu_pool.settings.update(self.get_dws_flex_start())
+    else:
+      gpu_pool.settings.update({"static_node_count": num_nodes})
 
    num_chips = num_nodes * system.chips_per_vm
    workload_manager_install_id = "workload-manager-install"
@@ -572,9 +607,14 @@ class BlueprintGenerator:
        settings={
            "kueue": {
                "install": True,
-                "version": "v0.10.0",  # TAS feature-gates is enabled in CT
+                "version": KUEUE_VERSION,  # TAS feature-gates is enabled in CT
                "config_path": f'$(ghpc_stage("{blueprint_name}"))/kueue-xpk-configuration.yaml.tftpl',
-                "config_template_vars": {"num_chips": num_chips},
+                "config_template_vars": {
+                    "num_chips": num_chips,
+                    "flex_start": (
+                        1 if capacity_type == CapacityType.FLEX_START else 0
+                    ),
+                },
            },
            "jobset": {"install": True, "version": "v0.7.2"},
            "apply_manifests": [
@@ -777,13 +817,21 @@ class BlueprintGenerator:
                f" {cluster_name}-rdma-net.subnetwork_interfaces_gke))"
            ),
            "version_prefix": "1.32.",
-            "release_channel": "RAPID",
-            "maintenance_exclusions": [{
-                "name": "no-minor-or-node-upgrades-indefinite",
-                "start_time": "2024-12-01T00:00:00Z",
-                "end_time": "2025-12-22T00:00:00Z",
-                "exclusion_scope": "NO_MINOR_OR_NODE_UPGRADES",
-            }],
+            "release_channel": (
+                "UNSPECIFIED"
+                if capacity_type == CapacityType.FLEX_START
+                else "RAPID"
+            ),
+            "maintenance_exclusions": (
+                []
+                if capacity_type == CapacityType.FLEX_START
+                else [{
+                    "name": "no-minor-or-node-upgrades-indefinite",
+                    "start_time": "2024-12-01T00:00:00Z",
+                    "end_time": "2025-12-22T00:00:00Z",
+                    "exclusion_scope": "NO_MINOR_OR_NODE_UPGRADES",
+                }]
+            ),
        },
        outputs=["instructions"],
    )
@@ -800,10 +848,11 @@ class BlueprintGenerator:
        use=[cluster_id],
        settings={
            "machine_type": system.gce_machine_type,
-            "auto_upgrade": True,
+            "auto_upgrade": (
+                True if capacity_type != CapacityType.FLEX_START else False
+            ),
            "zones": [zone],
            "disk_type": "hyperdisk-balanced",
-            "static_node_count": num_nodes,
            "local_ssd_count_ephemeral_storage": 32,
            "spot": capacity_type == CapacityType.SPOT,
            "reservation_affinity": self._getblock_reservation_affinity(
@@ -830,6 +879,10 @@ class BlueprintGenerator:
        },
        outputs=["instructions"],
    )
+    if capacity_type == CapacityType.FLEX_START:
+      gpu_pool.settings.update(self.get_dws_flex_start())
+    else:
+      gpu_pool.settings.update({"static_node_count": num_nodes})
 
    num_chips = num_nodes * system.chips_per_vm
    workload_manager_install_id = "workload-manager-install"
@@ -840,9 +893,14 @@ class BlueprintGenerator:
        settings={
            "kueue": {
                "install": True,
-                "version": "v0.10.0",  # TAS feature-gates is enabled in CT
+                "version": KUEUE_VERSION,  # TAS feature-gates is enabled in CT
                "config_path": f'$(ghpc_stage("{blueprint_name}"))/kueue-xpk-configuration.yaml.tftpl',
-                "config_template_vars": {"num_chips": num_chips},
+                "config_template_vars": {
+                    "num_chips": num_chips,
+                    "flex_start": (
+                        1 if capacity_type == CapacityType.FLEX_START else 0
+                    ),
+                },
            },
            "jobset": {"install": True, "version": "v0.7.2"},
            "apply_manifests": [
@@ -992,6 +1050,16 @@ class BlueprintGenerator:
    )
    return deployment_files_path
 
+  def get_dws_flex_start(self) -> dict:
+    return {
+        "enable_flex_start": True,
+        "enable_queued_provisioning": True,
+        "autoscaling_total_min_nodes": 0,
+        "release_channel": "UNSPECIFIED",
+        "auto_repair": False,
+        "auto_upgrade": False,
+    }
+
 
 yaml.register_class(Blueprint)
 yaml.register_class(DeploymentGroup)
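Across the blueprint generator, a FLEX_START capacity type now swaps the static node count for the DWS flex-start settings returned by get_dws_flex_start(). A minimal sketch of how a node-pool settings dict ends up configured either way (not part of the diff; the NodePool dataclass and the sample values are hypothetical stand-ins for the cluster-toolkit gke-node-pool module driven by BlueprintGenerator):

# Minimal sketch, assuming only the standard library.
from dataclasses import dataclass, field
from enum import Enum


class CapacityType(Enum):
  ON_DEMAND = "on_demand"
  SPOT = "spot"
  FLEX_START = "flex_start"


def get_dws_flex_start() -> dict:
  # Mirrors the helper added in this release.
  return {
      "enable_flex_start": True,
      "enable_queued_provisioning": True,
      "autoscaling_total_min_nodes": 0,
      "release_channel": "UNSPECIFIED",
      "auto_repair": False,
      "auto_upgrade": False,
  }


@dataclass
class NodePool:  # hypothetical stand-in for a DeploymentModule's settings holder
  settings: dict = field(default_factory=dict)


pool = NodePool(settings={"machine_type": "a3-megagpu-8g", "spot": False})
capacity_type = CapacityType.FLEX_START
if capacity_type == CapacityType.FLEX_START:
  pool.settings.update(get_dws_flex_start())  # queued provisioning, no auto-upgrade
else:
  pool.settings.update({"static_node_count": 4})
print(pool.settings)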