xpk 0.12.0__py3-none-any.whl → 0.14.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68)
  1. xpk/commands/batch.py +17 -10
  2. xpk/commands/cluster.py +137 -123
  3. xpk/commands/cluster_gcluster.py +77 -14
  4. xpk/commands/cluster_gcluster_test.py +177 -0
  5. xpk/commands/common.py +13 -27
  6. xpk/commands/info.py +11 -9
  7. xpk/commands/inspector.py +22 -11
  8. xpk/commands/job.py +53 -9
  9. xpk/commands/kind.py +38 -40
  10. xpk/commands/kjob_common.py +4 -4
  11. xpk/commands/run.py +9 -2
  12. xpk/commands/shell.py +13 -10
  13. xpk/commands/storage.py +26 -2
  14. xpk/commands/version.py +0 -4
  15. xpk/commands/workload.py +58 -30
  16. xpk/core/blueprint/blueprint_generator.py +4 -40
  17. xpk/core/blueprint/blueprint_test.py +0 -6
  18. xpk/core/capacity.py +6 -5
  19. xpk/core/cluster.py +96 -195
  20. xpk/core/cluster_private.py +9 -12
  21. xpk/core/commands.py +21 -25
  22. xpk/core/config.py +1 -1
  23. xpk/core/docker_image.py +17 -9
  24. xpk/core/docker_resources.py +9 -4
  25. xpk/core/gcloud_context.py +26 -2
  26. xpk/core/gcloud_context_test.py +96 -0
  27. xpk/core/gcluster_manager.py +0 -3
  28. xpk/core/jobset.py +5 -8
  29. xpk/core/kjob.py +19 -29
  30. xpk/core/kueue_manager.py +383 -0
  31. xpk/core/kueue_manager_test.py +542 -0
  32. xpk/core/monitoring.py +1 -1
  33. xpk/core/nap.py +11 -16
  34. xpk/core/network.py +18 -19
  35. xpk/core/nodepool.py +65 -71
  36. xpk/core/nodepool_test.py +198 -1
  37. xpk/core/pathways.py +9 -5
  38. xpk/core/ray.py +11 -15
  39. xpk/core/resources.py +15 -10
  40. xpk/core/scheduling.py +23 -1
  41. xpk/core/scheduling_test.py +31 -0
  42. xpk/core/system_characteristics.py +335 -229
  43. xpk/core/vertex.py +1 -1
  44. xpk/core/workload.py +7 -8
  45. xpk/main.py +3 -2
  46. xpk/parser/cluster.py +50 -0
  47. xpk/parser/cluster_test.py +66 -0
  48. xpk/parser/common.py +11 -0
  49. xpk/parser/workload.py +62 -25
  50. xpk/parser/workload_test.py +82 -0
  51. xpk/utils/execution_context.py +28 -0
  52. xpk/utils/feature_flags.py +28 -0
  53. xpk/utils/file.py +25 -10
  54. xpk/utils/kueue.py +20 -0
  55. xpk/utils/network.py +4 -0
  56. xpk/utils/templates.py +2 -0
  57. xpk/utils/topology.py +37 -0
  58. xpk/utils/topology_test.py +43 -0
  59. xpk/utils/validation.py +79 -55
  60. xpk/utils/validation_test.py +37 -0
  61. {xpk-0.12.0.dist-info → xpk-0.14.0.dist-info}/METADATA +6 -1
  62. xpk-0.14.0.dist-info/RECORD +112 -0
  63. xpk/core/kueue.py +0 -545
  64. xpk-0.12.0.dist-info/RECORD +0 -100
  65. {xpk-0.12.0.dist-info → xpk-0.14.0.dist-info}/WHEEL +0 -0
  66. {xpk-0.12.0.dist-info → xpk-0.14.0.dist-info}/entry_points.txt +0 -0
  67. {xpk-0.12.0.dist-info → xpk-0.14.0.dist-info}/licenses/LICENSE +0 -0
  68. {xpk-0.12.0.dist-info → xpk-0.14.0.dist-info}/top_level.txt +0 -0
xpk/core/ray.py CHANGED
@@ -102,16 +102,16 @@ def install_ray_cluster(args, system) -> int:
     0 if successful and 1 otherwise.
   """
 
-  delete_ray_cluster(args)
+  delete_ray_cluster()
 
   label = 'cloud.google.com/gke-nodepool=default-pool'
   available_head_cpu, available_head_mem = generate_available_resources(
-      label, args, HEAD_CPU
+      label, HEAD_CPU
   )
 
   label = f'cloud.google.com/gke-tpu-accelerator={system.gke_accelerator}'
   available_worker_cpu, available_worker_mem = generate_available_resources(
-      label, args, WORKER_CPU
+      label, WORKER_CPU
   )
 
   yml_string = ray_cluster_crd_yaml.format(
@@ -132,11 +132,11 @@ def install_ray_cluster(args, system) -> int:
   )
 
   tmp = write_tmp_file(yml_string)
-  command = f'kubectl apply -f {str(tmp.file.name)}'
+  command = f'kubectl apply -f {str(tmp)}'
   task = 'Applying RayCluster'
   retry_attempts = 1
   return_code = run_command_with_updates_retry(
-      command, task, args, num_retry_attempts=retry_attempts
+      command, task, num_retry_attempts=retry_attempts
   )
   if return_code != 0:
     xpk_print(f'{task} not successful.')
@@ -144,12 +144,9 @@ def install_ray_cluster(args, system) -> int:
     return return_code
 
 
-def delete_ray_cluster(args) -> None:
+def delete_ray_cluster() -> None:
   """Delete all RayClusters on the cluster
 
-  Args:
-    args: user provided arguments for running the command.
-
   Returns:
     None
   """
@@ -158,7 +155,7 @@ def delete_ray_cluster(args) -> None:
   task = 'Deleting old RayCluster'
   retry_attempts = 1
   return_code = run_command_with_updates_retry(
-      command, task, args, num_retry_attempts=retry_attempts
+      command, task, num_retry_attempts=retry_attempts
   )
 
   if return_code != 0:
@@ -168,12 +165,11 @@ def delete_ray_cluster(args) -> None:
     return
 
 
-def generate_available_resources(label, args, percent) -> tuple:
+def generate_available_resources(label, percent) -> tuple:
   """Generate the available resources for the nodes that match the given label
 
   Args:
     label: the label used to match the appropriate nodes
-    args: user provided arguments for running the command
     percent: the percent of the available resources to use
 
   Returns:
@@ -184,13 +180,13 @@ def generate_available_resources(label, args, percent) -> tuple:
       f"kubectl get nodes -l {label} -o jsonpath='{{.items[0].metadata.name}}'"
   )
   task = f'Getting nodes with label {label}'
-  _, node_name = run_command_for_value(command, task, args)
+  _, node_name = run_command_for_value(command, task)
 
   command = (
       f"kubectl get node {node_name} -o jsonpath='{{.status.allocatable.cpu}}'"
   )
   task = 'Fetching available CPU on node'
-  _, available_cpu = run_command_for_value(command, task, args)
+  _, available_cpu = run_command_for_value(command, task)
   match = re.match(r'(\d+)([a-zA-Z]+)', available_cpu)
   if not match:
     xpk_print(
@@ -207,7 +203,7 @@ def generate_available_resources(label, args, percent) -> tuple:
       " jsonpath='{.status.allocatable.memory}'"
   )
   task = 'Fetching available memory on node'
-  _, available_memory = run_command_for_value(command, task, args)
+  _, available_memory = run_command_for_value(command, task)
   match = re.match(r'(\d+)([a-zA-Z]+)', available_memory)
   if not match:
     xpk_print(
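Note on the recurring pattern above: the command helpers (run_command_for_value, run_command_with_updates_retry, run_commands) drop their args parameter throughout this release, and write_tmp_file now returns an object whose str() is the temp-file path (previously tmp.file.name). Together with the new xpk/utils/execution_context.py and the is_dry_run() check added to scheduling.py below, this points to per-invocation state living in a shared context module rather than being threaded through every call. A minimal sketch of that pattern, assuming a hypothetical set_dry_run setter (only is_dry_run is visible in this diff):

# Hypothetical sketch of an execution-context module in the spirit of
# xpk/utils/execution_context.py; this diff only confirms is_dry_run().
_dry_run: bool = False


def set_dry_run(enabled: bool) -> None:
  """Record once, at CLI entry, whether commands should really execute."""
  global _dry_run
  _dry_run = enabled


def is_dry_run() -> bool:
  """Queried by command helpers instead of passing args.dry_run around."""
  return _dry_run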
xpk/core/resources.py CHANGED
@@ -50,11 +50,10 @@ class AutoprovisioningConfig:
   maximum_chips: int
 
 
-def get_cluster_configmap(args, configmap_name) -> dict[str, str] | None:
+def get_cluster_configmap(configmap_name) -> dict[str, str] | None:
   """Run the Get GKE Cluster ConfigMap request.
 
   Args:
-    args: user provided arguments for running the command.
     configmap_name: name of the configmap.
 
   Returns:
@@ -66,7 +65,9 @@ def get_cluster_configmap(args, configmap_name) -> dict[str, str] | None:
   )
 
   return_code, return_value = run_command_for_value(
-      command, 'GKE Cluster Get ConfigMap', args
+      command,
+      'GKE Cluster Get ConfigMap',
+      dry_run_return_val='map[]',
   )
   if return_code != 0:
     xpk_print(f'GKE Cluster Get ConfigMap request returned ERROR {return_code}')
@@ -81,8 +82,10 @@ def get_cluster_configmap(args, configmap_name) -> dict[str, str] | None:
   configs = return_value[4:-1].split(' ')
 
   for config in configs:
-    key, value = config.strip().split(':')
-    config_map[key] = value
+    parts = config.strip().split(':')
+    if len(parts) != 2:
+      continue
+    config_map[parts[0]] = parts[1]
   return config_map
 
 
@@ -165,13 +168,15 @@ def create_or_update_cluster_configmap(configmap_yml: dict) -> int:
   task_names = []
   for configmap_name, yml_string in configmap_yml.items():
     tmp = write_tmp_file(yml_string)
-    command = f'kubectl apply -f {str(tmp.file.name)}'
+    command = f'kubectl apply -f {str(tmp)}'
     commands.append(command)
     task_name = f'ConfigMap CreateOrUpdate-{configmap_name}'
     task_names.append(task_name)
 
   return_code = run_commands(
-      commands, 'GKE Cluster CreateOrUpdate ConfigMap(s)', task_names
+      commands,
+      'GKE Cluster CreateOrUpdate ConfigMap(s)',
+      task_names,
   )
   if return_code != 0:
     xpk_print(
@@ -196,7 +201,7 @@ def check_cluster_resources(args, system) -> tuple[bool, bool]:
     True if device_type/gke_accelerator exists in the cluster, False otherwise.
   """
   resources_configmap_name = f'{args.cluster}-{CLUSTER_RESOURCES_CONFIGMAP}'
-  resources_config_map = get_cluster_configmap(args, resources_configmap_name)
+  resources_config_map = get_cluster_configmap(resources_configmap_name)
   if resources_config_map is None:
     xpk_print(
         f'No ConfigMap exist for cluster with the name {resources_config_map}.'
@@ -219,7 +224,7 @@ def get_cluster_system_characteristics(args) -> SystemCharacteristics | None:
     returns system characteristics
   """
   resources_configmap_name = f'{args.cluster}-{CLUSTER_RESOURCES_CONFIGMAP}'
-  cluster_config_map = get_cluster_configmap(args, resources_configmap_name)
+  cluster_config_map = get_cluster_configmap(resources_configmap_name)
 
   if cluster_config_map is None:
     return None
@@ -241,7 +246,7 @@ def get_cluster_capacity_type(args) -> CapacityType | None:
     returns system characteristics
   """
   metadata_configmap_name = f'{args.cluster}-{CLUSTER_METADATA_CONFIGMAP}'
-  cluster_config_map = get_cluster_configmap(args, metadata_configmap_name)
+  cluster_config_map = get_cluster_configmap(metadata_configmap_name)
 
   if cluster_config_map is None:
     return None
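Two behavioral changes in get_cluster_configmap stand out: the command runner now receives dry_run_return_val='map[]' (apparently the stand-in output used when the command is skipped under dry-run; 'map[]' parses to an empty dict), and the entry parser skips malformed fields instead of letting split(':') raise ValueError on unexpected output. A standalone sketch of that parsing, with a sample key invented for illustration:

def parse_configmap_data(return_value: str) -> dict[str, str]:
  """Parse kubectl output shaped like 'map[key1:value1 key2:value2]'."""
  config_map: dict[str, str] = {}
  # [4:-1] strips the leading 'map[' and the trailing ']'.
  for config in return_value[4:-1].split(' '):
    parts = config.strip().split(':')
    if len(parts) != 2:  # skip empty or malformed entries instead of raising
      continue
    config_map[parts[0]] = parts[1]
  return config_map


assert parse_configmap_data('map[]') == {}
assert parse_configmap_data('map[v5litepod-16:16]') == {'v5litepod-16': '16'}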
xpk/core/scheduling.py CHANGED
@@ -15,6 +15,7 @@ limitations under the License.
 """
 
 from ..utils.console import xpk_print
+from ..utils.execution_context import is_dry_run
 from .capacity import AUTOPROVISIONING_CONFIG_MAXIMUM_KEY, AUTOPROVISIONING_CONFIG_VALUE
 from .resources import CLUSTER_RESOURCES_CONFIGMAP, get_cluster_configmap
 from .system_characteristics import (
@@ -35,7 +36,7 @@ def check_if_workload_can_schedule(args, system: SystemCharacteristics) -> bool:
     returns true if workload can schedule, otherwise returns false.
   """
   resources_configmap_name = f'{args.cluster}-{CLUSTER_RESOURCES_CONFIGMAP}'
-  cluster_config_map = get_cluster_configmap(args, resources_configmap_name)
+  cluster_config_map = get_cluster_configmap(resources_configmap_name)
 
   # Prevents workload creation failure for existing clusters with no ConfigMap
   if cluster_config_map is None:
@@ -45,6 +46,9 @@ def check_if_workload_can_schedule(args, system: SystemCharacteristics) -> bool:
     )
     return True
 
+  if is_dry_run():
+    return True
+
   # Check for gke accelerator type:
   missing_gke_accelerator_type = False
   if not cluster_config_map.get(system.gke_accelerator):
@@ -287,3 +291,21 @@ def create_tpu_topology(
   ):
     return f'{system.topology}'
   return ''
+
+
+def create_sub_slicing_annotations(sub_slicing_topology: str) -> list[str]:
+  """Generates subslicing annotations.
+
+  Args:
+    sub_slicing_topology: subslice topology.
+
+  Returns:
+    Annotations to be rendered in deployment yaml.
+  """
+  return [
+      (
+          'kueue.x-k8s.io/podset-required-topology:'
+          f' "google.com/gke-tpu-slice-{sub_slicing_topology}-id"'
+      ),
+      f'cloud.google.com/gke-tpu-slice-topology: {sub_slicing_topology}',
+  ]
xpk/core/scheduling_test.py ADDED
@@ -0,0 +1,31 @@
+"""
+Copyright 2025 Google LLC
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+     https://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+from .scheduling import create_sub_slicing_annotations
+
+
+def test_create_sub_slicing_annotations_returns_valid_annotations():
+  subslicing_topology = '2x2'
+
+  result = create_sub_slicing_annotations(subslicing_topology)
+
+  assert result == [
+      (
+          'kueue.x-k8s.io/podset-required-topology:'
+          ' "google.com/gke-tpu-slice-2x2-id"'
+      ),
+      'cloud.google.com/gke-tpu-slice-topology: 2x2',
+  ]
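As a quick usage sketch, calling the new helper with a different topology (the function and its output format come from the diff; the print harness is illustrative):

from xpk.core.scheduling import create_sub_slicing_annotations

# Emits the two annotation lines that get rendered into a workload yaml.
for annotation in create_sub_slicing_annotations('4x4'):
  print(annotation)

# Expected, by analogy with the 2x2 case asserted above:
#   kueue.x-k8s.io/podset-required-topology: "google.com/gke-tpu-slice-4x4-id"
#   cloud.google.com/gke-tpu-slice-topology: 4x4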