xpk 0.10.1__py3-none-any.whl → 0.12.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. xpk/commands/cluster.py +270 -8
  2. xpk/commands/cluster_gcluster.py +2 -1
  3. xpk/commands/common.py +3 -3
  4. xpk/commands/info.py +12 -12
  5. xpk/commands/job.py +12 -10
  6. xpk/commands/kjob_common.py +2 -1
  7. xpk/commands/storage.py +1 -1
  8. xpk/commands/workload.py +12 -6
  9. xpk/core/blueprint/blueprint_generator.py +7 -7
  10. xpk/core/blueprint/blueprint_test.py +218 -0
  11. xpk/core/capacity.py +5 -3
  12. xpk/core/cluster.py +9 -7
  13. xpk/core/cluster_private.py +5 -1
  14. xpk/core/commands.py +3 -3
  15. xpk/core/config.py +3 -4
  16. xpk/core/config_test.py +71 -0
  17. xpk/core/docker_manager.py +1 -1
  18. xpk/core/docker_resources.py +1 -1
  19. xpk/core/filestore.py +7 -2
  20. xpk/core/gcloud_context.py +2 -2
  21. xpk/core/jobset.py +1 -1
  22. xpk/core/kjob.py +2 -1
  23. xpk/core/kueue.py +12 -4
  24. xpk/core/nap.py +20 -6
  25. xpk/core/nodepool.py +52 -19
  26. xpk/core/nodepool_test.py +82 -0
  27. xpk/core/resources.py +1 -7
  28. xpk/core/scheduling.py +1 -1
  29. xpk/core/storage.py +14 -14
  30. xpk/core/system_characteristics.py +267 -1081
  31. xpk/core/workload.py +11 -0
  32. xpk/core/workload_decorators/rdma_decorator.py +3 -2
  33. xpk/core/workload_decorators/storage_decorator.py +2 -1
  34. xpk/core/workload_decorators/tcpx_decorator.py +4 -2
  35. xpk/core/workload_decorators/tcpx_decorator_test.py +267 -0
  36. xpk/core/workload_decorators/tcpxo_decorator.py +2 -1
  37. xpk/core/workload_test.py +28 -0
  38. xpk/main.py +9 -10
  39. xpk/parser/cluster.py +67 -49
  40. xpk/parser/common.py +45 -36
  41. xpk/parser/storage.py +12 -13
  42. xpk/parser/workload.py +57 -39
  43. xpk/utils/console.py +2 -1
  44. {xpk-0.10.1.dist-info → xpk-0.12.0.dist-info}/METADATA +4 -1
  45. {xpk-0.10.1.dist-info → xpk-0.12.0.dist-info}/RECORD +49 -44
  46. {xpk-0.10.1.dist-info → xpk-0.12.0.dist-info}/WHEEL +0 -0
  47. {xpk-0.10.1.dist-info → xpk-0.12.0.dist-info}/entry_points.txt +0 -0
  48. {xpk-0.10.1.dist-info → xpk-0.12.0.dist-info}/licenses/LICENSE +0 -0
  49. {xpk-0.10.1.dist-info → xpk-0.12.0.dist-info}/top_level.txt +0 -0
xpk/core/nodepool.py CHANGED
@@ -14,6 +14,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 """
 
+from typing import List
 from ..utils.console import get_user_input, xpk_print
 from .capacity import (
     AUTOPROVISIONING_CONFIG_VALUE,
@@ -32,6 +33,8 @@ from .resources import (
     create_or_update_cluster_configmap,
 )
 from .system_characteristics import AcceleratorType
+from functools import reduce
+from operator import mul
 
 CLOUD_PLATFORM_AUTH_SCOPE_URL = (
     '"https://www.googleapis.com/auth/cloud-platform"'
@@ -88,20 +91,26 @@ def run_gke_node_pool_create_command(
     xpk_print('Parsing capacity arguments failed!')
     return return_code
 
-  if system.accelerator_type == AcceleratorType['GPU']:
-    xpk_print(
-        f'Creating 1 node pool with {args.num_nodes} nodes of'
-        f' {system.device_type}\nUnderlyingly, we assume that means: {system}'
-    )
-    desired_node_pool_names = [f'{args.cluster}-np-0']
-  else:
-    xpk_print(
-        f'Creating {args.num_slices} node pool or pools of'
-        f' {system.device_type}\nUnderlyingly, we assume that means: {system}'
-    )
-    desired_node_pool_names = [
-        f'{args.cluster}-np-{slice_num}' for slice_num in range(args.num_slices)
-    ]
+  desired_node_pool_count = (
+      1
+      if system.accelerator_type == AcceleratorType['GPU']
+      else args.num_slices
+  )
+  message = (
+      (
+          f'Creating 1 node pool with {args.num_nodes} nodes of'
+          f' {system.device_type}\nUnderlyingly, we assume that means: {system}'
+      )
+      if system.accelerator_type == AcceleratorType['GPU']
+      else (
+          f'Creating {args.num_slices} node pool or pools of'
+          f' {system.device_type}\nUnderlyingly, we assume that means: {system}'
+      )
+  )
+  xpk_print(message)
+  desired_node_pool_names = get_desired_node_pool_names(
+      existing_node_pool_names, args.cluster, desired_node_pool_count
+  )
 
   node_pools_to_remain = []
   delete_commands = []
@@ -275,20 +284,24 @@ def run_gke_node_pool_create_command(
         f' --host-maintenance-interval={args.host_maintenance_interval}'
         f' {capacity_args}'
         ' --enable-gvnic'
-        f' {args.custom_nodepool_arguments}'
     )
     if system.accelerator_type == AcceleratorType['TPU']:
      command += f' --node-version={gke_node_pool_version}'
+      topology_product = reduce(
+          mul, (int(x) for x in system.topology.split('x')), 1
+      )
       if capacity_type == CapacityType.FLEX_START:
         command += ' --num-nodes=0'
-      else:
+      elif topology_product > 1:
         command += f' --num-nodes={system.vms_per_slice}'
-      command += ' --placement-type=COMPACT --max-pods-per-node 15'
       command += (
           f' --scopes=storage-full,gke-default,{CLOUD_PLATFORM_AUTH_SCOPE_URL}'
       )
-      command += f' --tpu-topology={system.topology}'
-      command += f' {args.custom_tpu_nodepool_arguments}'
+
+      if topology_product > 1:
+        command += ' --placement-type=COMPACT --max-pods-per-node 15'
+        command += f' --tpu-topology={system.topology}'
+      command += f' {args.custom_tpu_nodepool_arguments}'
     elif system.accelerator_type == AcceleratorType['GPU']:
       subnet_prefix = f'{args.cluster}-{zone_to_region(args.zone)}'
       if capacity_type == CapacityType.FLEX_START:
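
Note on the TPU branch above: the new topology_product gate means --placement-type=COMPACT, --max-pods-per-node, and --tpu-topology are now passed only for multi-host slices. A minimal sketch of the computation, using the same reduce/mul pattern as the diff (the sample topology strings are illustrative):

from functools import reduce
from operator import mul

def topology_product(topology: str) -> int:
  # '2x2x2' multiplies out to 8 hosts; a single-host '1x1' yields 1.
  return reduce(mul, (int(dim) for dim in topology.split('x')), 1)

assert topology_product('2x2x2') == 8
assert topology_product('1x1') == 1  # placement/topology flags are skipped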
@@ -319,6 +332,8 @@
     if args.enable_workload_identity or args.enable_gcsfuse_csi_driver:
       command += ' --workload-metadata=GKE_METADATA'
 
+    command += f' {args.custom_nodepool_arguments}'
+
     task = f'NodepoolCreate-{node_pool_name}'
     create_commands.append(command)
     create_task_names.append(task)
@@ -594,3 +609,21 @@ def get_nodepool_workload_metadata_mode(
     return 1, None
 
   return 0, nodepool_WI_mode.strip()
+
+
+def get_desired_node_pool_names(
+    existing_node_pool_names: List[str],
+    cluster_name: str,
+    desired_node_pool_count: int,
+) -> List[str]:
+  cluster_node_pools = [
+      np
+      for np in existing_node_pool_names
+      if np.startswith(f'{cluster_name}-np-')
+  ]
+  result = set(cluster_node_pools[:desired_node_pool_count])
+  i = 0
+  while len(result) < desired_node_pool_count:
+    result.add(f'{cluster_name}-np-{i}')
+    i += 1
+  return list(result)
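
The new get_desired_node_pool_names helper keeps existing <cluster>-np-<i> pools and fills any shortfall with the lowest unused indices, so resizing no longer forces surviving pools to be renamed. A quick usage sketch (cluster and pool names are made up; the expected output follows from the code above and the tests below):

from xpk.core.nodepool import get_desired_node_pool_names

existing = ['demo-np-0', 'demo-np-3', 'some-other-pool']
names = get_desired_node_pool_names(existing, 'demo', 3)
# 'some-other-pool' lacks the 'demo-np-' prefix and is ignored;
# the missing slot is filled with the lowest free index, 1.
assert sorted(names) == ['demo-np-0', 'demo-np-1', 'demo-np-3']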
xpk/core/nodepool_test.py ADDED
@@ -0,0 +1,82 @@
+"""
+Copyright 2025 Google LLC
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    https://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+from xpk.core.nodepool import get_desired_node_pool_names
+
+CLUSTER_NAME = "running-cucumber"
+
+
+def node_pool_name(number: int) -> str:
+  return f"{CLUSTER_NAME}-np-{number}"
+
+
+def test_compute_desired_node_pool_names_with_desired_larger_than_existing():
+  result = get_desired_node_pool_names(
+      existing_node_pool_names=[node_pool_name(0)],
+      cluster_name=CLUSTER_NAME,
+      desired_node_pool_count=2,
+  )
+
+  expected_result = [node_pool_name(0), node_pool_name(1)]
+  assert set(result) == set(expected_result)
+
+
+def test_compute_desired_node_pool_names_with_desired_smaller_than_existing():
+  result = get_desired_node_pool_names(
+      existing_node_pool_names=[node_pool_name(0), node_pool_name(1)],
+      cluster_name=CLUSTER_NAME,
+      desired_node_pool_count=1,
+  )
+
+  expected_result = [node_pool_name(0)]
+  assert set(result) == set(expected_result)
+
+
+def test_compute_desired_node_pool_names_with_consecutive_numbers_missing():
+  result = get_desired_node_pool_names(
+      existing_node_pool_names=[node_pool_name(0), node_pool_name(3)],
+      cluster_name=CLUSTER_NAME,
+      desired_node_pool_count=3,
+  )
+
+  expected_result = [node_pool_name(0), node_pool_name(1), node_pool_name(3)]
+  assert set(result) == set(expected_result)
+
+
+def test_compute_desired_node_pool_names_with_consecutive_numbers_missing_and_desired_equal_to_existing():
+  result = get_desired_node_pool_names(
+      existing_node_pool_names=[node_pool_name(0), node_pool_name(3)],
+      cluster_name=CLUSTER_NAME,
+      desired_node_pool_count=2,
+  )
+
+  expected_result = [node_pool_name(0), node_pool_name(3)]
+  assert set(result) == set(expected_result)
+
+
+def test_compute_desired_node_pool_names_with_unknown_node_pools():
+  result = get_desired_node_pool_names(
+      existing_node_pool_names=[
+          "unknown-node-pool",
+          node_pool_name(0),
+          node_pool_name(3),
+      ],
+      cluster_name=CLUSTER_NAME,
+      desired_node_pool_count=2,
+  )
+
+  expected_result = [node_pool_name(0), node_pool_name(3)]
+  assert set(result) == set(expected_result)
xpk/core/resources.py CHANGED
@@ -108,13 +108,7 @@ def create_cluster_configmaps(
   device_type = system.device_type
   if system.accelerator_type == AcceleratorType['GPU']:
     resources_data = f'{device_type}: "{int(args.num_nodes)}"'
-  elif (
-      not args.enable_pathways
-      and args.enable_autoprovisioning
-      and autoprovisioning_config
-  ):
-    # Currently autoprovisioning is not supported with Pathways.
-    # Auto provisioning will have variable topologies for a gke accelerator type.
+  elif args.enable_autoprovisioning and autoprovisioning_config:
     resources_data = (
         f'{system.gke_accelerator}: {AUTOPROVISIONING_CONFIG_VALUE}'
     )
xpk/core/scheduling.py CHANGED
@@ -49,7 +49,7 @@ def check_if_workload_can_schedule(args, system: SystemCharacteristics) -> bool:
   missing_gke_accelerator_type = False
   if not cluster_config_map.get(system.gke_accelerator):
     xpk_print(
-        f'Gke Accelerator Type Check: {args.workload} is requesting'
+        f'GKE Accelerator Type Check: {args.workload} is requesting'
        f' {system.gke_accelerator} but cluster only contains'
        f' {cluster_config_map.keys()}. '
    )
xpk/core/storage.py CHANGED
@@ -17,7 +17,7 @@ limitations under the License.
 import os
 from argparse import Namespace
 from dataclasses import dataclass
-from typing import Any
+from typing import Any, cast
 
 import ruamel.yaml
 from google.cloud import storage as gcp_storage
@@ -95,17 +95,17 @@ class Storage:
     Args:
       data: A dictionary containing the Storage resource definition.
     """
-    metadata: k8s_client.V1ObjectMeta = data.get("metadata", {})
+    metadata = data.get("metadata", {})
     self.name = metadata.get("name")
     spec = data.get("spec", {})
-    self.type: str = spec.get("type")
-    self.auto_mount: bool = spec.get("auto_mount")
-    self.mount_point: bool = spec.get("mount_point")
-    self.readonly: bool = spec.get("readonly")
-    self.manifest: str = spec.get("manifest")
-    self.pvc: str = spec.get("pvc")
-    self.pv: str = spec.get("pv")
-    self.bucket: str = self._get_bucket()
+    self.type = spec.get("type")
+    self.auto_mount = spec.get("auto_mount")
+    self.mount_point = spec.get("mount_point")
+    self.readonly = spec.get("readonly")
+    self.manifest = spec.get("manifest")
+    self.pvc = spec.get("pvc")
+    self.pv = spec.get("pv")
+    self.bucket = self._get_bucket()
 
   def fields_as_list(self) -> list[str]:
     """
@@ -117,9 +117,9 @@ class Storage:
     return [
         self.name,
         self.type,
-        self.auto_mount,
+        str(self.auto_mount),
         self.mount_point,
-        self.readonly,
+        str(self.readonly),
         self.manifest,
     ]
 
@@ -133,7 +133,7 @@ class Storage:
     client = k8s_client.CoreV1Api()
     try:
       pv: V1PersistentVolume = client.read_persistent_volume(self.pv)
-      return pv.spec.csi.volume_handle
+      return cast(str, pv.spec.csi.volume_handle)
     except ApiException as e:
       xpk_print(
           f"Exception when calling CoreV1Api->read_persistent_volume: {e}"
@@ -150,7 +150,7 @@ class Storage:
     client = k8s_client.CoreV1Api()
     try:
       pv: V1PersistentVolume = client.read_persistent_volume(self.pv)
-      return pv.spec.mount_options
+      return cast(list[str], pv.spec.mount_options)
     except ApiException as e:
       xpk_print(
          f"Exception when calling CoreV1Api->read_persistent_volume: {e}"