xpk 0.13.0__py3-none-any.whl → 0.14.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. xpk/commands/batch.py +9 -2
  2. xpk/commands/cluster.py +128 -115
  3. xpk/commands/cluster_gcluster.py +77 -14
  4. xpk/commands/cluster_gcluster_test.py +177 -0
  5. xpk/commands/common.py +10 -28
  6. xpk/commands/info.py +11 -9
  7. xpk/commands/inspector.py +21 -10
  8. xpk/commands/job.py +25 -9
  9. xpk/commands/kind.py +38 -40
  10. xpk/commands/kjob_common.py +4 -4
  11. xpk/commands/run.py +9 -2
  12. xpk/commands/shell.py +13 -10
  13. xpk/commands/storage.py +21 -0
  14. xpk/commands/version.py +0 -4
  15. xpk/commands/workload.py +43 -22
  16. xpk/core/blueprint/blueprint_generator.py +4 -40
  17. xpk/core/blueprint/blueprint_test.py +0 -6
  18. xpk/core/capacity.py +6 -5
  19. xpk/core/cluster.py +91 -194
  20. xpk/core/cluster_private.py +6 -11
  21. xpk/core/commands.py +11 -18
  22. xpk/core/config.py +1 -1
  23. xpk/core/docker_image.py +3 -4
  24. xpk/core/gcloud_context.py +26 -2
  25. xpk/core/gcloud_context_test.py +96 -0
  26. xpk/core/gcluster_manager.py +0 -3
  27. xpk/core/jobset.py +4 -7
  28. xpk/core/kjob.py +14 -27
  29. xpk/core/kueue_manager.py +383 -0
  30. xpk/core/kueue_manager_test.py +542 -0
  31. xpk/core/monitoring.py +1 -1
  32. xpk/core/nap.py +10 -15
  33. xpk/core/network.py +17 -18
  34. xpk/core/nodepool.py +66 -77
  35. xpk/core/nodepool_test.py +198 -1
  36. xpk/core/pathways.py +5 -5
  37. xpk/core/ray.py +10 -14
  38. xpk/core/resources.py +6 -11
  39. xpk/core/scheduling.py +19 -1
  40. xpk/core/scheduling_test.py +31 -0
  41. xpk/core/system_characteristics.py +335 -229
  42. xpk/core/vertex.py +1 -1
  43. xpk/core/workload.py +7 -8
  44. xpk/main.py +2 -4
  45. xpk/parser/cluster.py +7 -0
  46. xpk/parser/cluster_test.py +66 -0
  47. xpk/parser/common.py +11 -0
  48. xpk/parser/workload.py +62 -25
  49. xpk/parser/workload_test.py +82 -0
  50. xpk/utils/feature_flags.py +28 -0
  51. xpk/utils/kueue.py +20 -0
  52. xpk/utils/templates.py +2 -0
  53. xpk/utils/topology.py +37 -0
  54. xpk/utils/topology_test.py +43 -0
  55. xpk/utils/validation.py +79 -55
  56. xpk/utils/validation_test.py +37 -0
  57. {xpk-0.13.0.dist-info → xpk-0.14.0.dist-info}/METADATA +6 -1
  58. xpk-0.14.0.dist-info/RECORD +112 -0
  59. xpk/core/kueue.py +0 -561
  60. xpk-0.13.0.dist-info/RECORD +0 -101
  61. {xpk-0.13.0.dist-info → xpk-0.14.0.dist-info}/WHEEL +0 -0
  62. {xpk-0.13.0.dist-info → xpk-0.14.0.dist-info}/entry_points.txt +0 -0
  63. {xpk-0.13.0.dist-info → xpk-0.14.0.dist-info}/licenses/LICENSE +0 -0
  64. {xpk-0.13.0.dist-info → xpk-0.14.0.dist-info}/top_level.txt +0 -0
xpk/core/ray.py CHANGED
@@ -102,16 +102,16 @@ def install_ray_cluster(args, system) -> int:
     0 if successful and 1 otherwise.
   """

-  delete_ray_cluster(args)
+  delete_ray_cluster()

   label = 'cloud.google.com/gke-nodepool=default-pool'
   available_head_cpu, available_head_mem = generate_available_resources(
-      label, args, HEAD_CPU
+      label, HEAD_CPU
   )

   label = f'cloud.google.com/gke-tpu-accelerator={system.gke_accelerator}'
   available_worker_cpu, available_worker_mem = generate_available_resources(
-      label, args, WORKER_CPU
+      label, WORKER_CPU
   )

   yml_string = ray_cluster_crd_yaml.format(
@@ -136,7 +136,7 @@ def install_ray_cluster(args, system) -> int:
   task = 'Applying RayCluster'
   retry_attempts = 1
   return_code = run_command_with_updates_retry(
-      command, task, args, num_retry_attempts=retry_attempts
+      command, task, num_retry_attempts=retry_attempts
   )
   if return_code != 0:
     xpk_print(f'{task} not successful.')
@@ -144,12 +144,9 @@ def install_ray_cluster(args, system) -> int:
     return return_code


-def delete_ray_cluster(args) -> None:
+def delete_ray_cluster() -> None:
   """Delete all RayClusters on the cluster

-  Args:
-    args: user provided arguments for running the command.
-
   Returns:
     None
   """
@@ -158,7 +155,7 @@ def delete_ray_cluster(args) -> None:
   task = 'Deleting old RayCluster'
   retry_attempts = 1
   return_code = run_command_with_updates_retry(
-      command, task, args, num_retry_attempts=retry_attempts
+      command, task, num_retry_attempts=retry_attempts
   )

   if return_code != 0:
@@ -168,12 +165,11 @@ def delete_ray_cluster(args) -> None:
     return


-def generate_available_resources(label, args, percent) -> tuple:
+def generate_available_resources(label, percent) -> tuple:
   """Generate the available resources for the nodes that match the given label

   Args:
     label: the label used to match the appropriate nodes
-    args: user provided arguments for running the command
     percent: the percent of the available resources to use

   Returns:
@@ -184,13 +180,13 @@ def generate_available_resources(label, args, percent) -> tuple:
       f"kubectl get nodes -l {label} -o jsonpath='{{.items[0].metadata.name}}'"
   )
   task = f'Getting nodes with label {label}'
-  _, node_name = run_command_for_value(command, task, args)
+  _, node_name = run_command_for_value(command, task)

   command = (
       f"kubectl get node {node_name} -o jsonpath='{{.status.allocatable.cpu}}'"
   )
   task = 'Fetching available CPU on node'
-  _, available_cpu = run_command_for_value(command, task, args)
+  _, available_cpu = run_command_for_value(command, task)
   match = re.match(r'(\d+)([a-zA-Z]+)', available_cpu)
   if not match:
     xpk_print(
@@ -207,7 +203,7 @@ def generate_available_resources(label, args, percent) -> tuple:
       " jsonpath='{.status.allocatable.memory}'"
   )
   task = 'Fetching available memory on node'
-  _, available_memory = run_command_for_value(command, task, args)
+  _, available_memory = run_command_for_value(command, task)
   match = re.match(r'(\d+)([a-zA-Z]+)', available_memory)
   if not match:
     xpk_print(
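Note: the change in these hunks is uniform — delete_ray_cluster, generate_available_resources, and the command helpers (run_command_for_value, run_command_with_updates_retry) stop threading the user args object through every call. The hunks also show how the allocatable quantities read back from kubectl are parsed: a (\d+)([a-zA-Z]+) match split into a numeric value and a unit suffix. A self-contained sketch of that parsing on typical kubelet-reported quantities (the sample values are assumptions, not taken from this diff):

import re

# Kubelet commonly reports allocatable CPU in millicores (e.g. '940m') and
# memory in kibibytes (e.g. '2802360Ki'); both fit the value-plus-unit
# pattern matched in generate_available_resources.
for quantity in ('940m', '2802360Ki'):
  match = re.match(r'(\d+)([a-zA-Z]+)', quantity)
  if not match:
    # The diff shows xpk printing an error here; a unitless quantity
    # such as '8' (whole cores) would not match this pattern.
    raise ValueError(f'unexpected quantity format: {quantity}')
  value, unit = int(match.group(1)), match.group(2)
  print(value, unit)  # prints: 940 m, then 2802360 Ki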
xpk/core/resources.py CHANGED
@@ -50,11 +50,10 @@ class AutoprovisioningConfig:
   maximum_chips: int


-def get_cluster_configmap(args, configmap_name) -> dict[str, str] | None:
+def get_cluster_configmap(configmap_name) -> dict[str, str] | None:
   """Run the Get GKE Cluster ConfigMap request.

   Args:
-    args: user provided arguments for running the command.
     configmap_name: name of the configmap.

   Returns:
@@ -68,7 +67,6 @@ def get_cluster_configmap(args, configmap_name) -> dict[str, str] | None:
   return_code, return_value = run_command_for_value(
       command,
       'GKE Cluster Get ConfigMap',
-      args,
       dry_run_return_val='map[]',
   )
   if return_code != 0:
@@ -155,12 +153,10 @@ def create_cluster_configmaps(
       args=args, name=metadata_configmap_name, data=metadata
   )
   configmap_yml[metadata_configmap_name] = metadata_yml
-  return create_or_update_cluster_configmap(configmap_yml, args.dry_run)
+  return create_or_update_cluster_configmap(configmap_yml)


-def create_or_update_cluster_configmap(
-    configmap_yml: dict, dry_run: bool
-) -> int:
+def create_or_update_cluster_configmap(configmap_yml: dict) -> int:
   """
   Args:
     configmap_yml: dict containing ConfigMap name and yml string.
@@ -181,7 +177,6 @@ def create_or_update_cluster_configmap(
       commands,
       'GKE Cluster CreateOrUpdate ConfigMap(s)',
       task_names,
-      dry_run=dry_run,
   )
   if return_code != 0:
     xpk_print(
@@ -206,7 +201,7 @@ def check_cluster_resources(args, system) -> tuple[bool, bool]:
     True if device_type/gke_accelerator exists in the cluster, False otherwise.
   """
   resources_configmap_name = f'{args.cluster}-{CLUSTER_RESOURCES_CONFIGMAP}'
-  resources_config_map = get_cluster_configmap(args, resources_configmap_name)
+  resources_config_map = get_cluster_configmap(resources_configmap_name)
   if resources_config_map is None:
     xpk_print(
         f'No ConfigMap exist for cluster with the name {resources_config_map}.'
@@ -229,7 +224,7 @@ def get_cluster_system_characteristics(args) -> SystemCharacteristics | None:
     returns system characteristics
   """
   resources_configmap_name = f'{args.cluster}-{CLUSTER_RESOURCES_CONFIGMAP}'
-  cluster_config_map = get_cluster_configmap(args, resources_configmap_name)
+  cluster_config_map = get_cluster_configmap(resources_configmap_name)

   if cluster_config_map is None:
     return None
@@ -251,7 +246,7 @@ def get_cluster_capacity_type(args) -> CapacityType | None:
     returns system characteristics
   """
   metadata_configmap_name = f'{args.cluster}-{CLUSTER_METADATA_CONFIGMAP}'
-  cluster_config_map = get_cluster_configmap(args, metadata_configmap_name)
+  cluster_config_map = get_cluster_configmap(metadata_configmap_name)

   if cluster_config_map is None:
     return None
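Note: alongside the args removal, create_or_update_cluster_configmap also stops taking dry_run explicitly while the dry_run_return_val keyword survives on run_command_for_value, and the file list adds xpk/utils/feature_flags.py. That suggests — though this diff does not show it — that dry-run state is now read from shared module state rather than passed per call. A purely illustrative sketch of that pattern; every name below is hypothetical, not xpk's actual API:

import subprocess

# Hypothetical process-wide flag standing in for whatever
# xpk/utils/feature_flags.py actually exposes.
DRY_RUN = False


def run_command_for_value(
    command: str, task: str, dry_run_return_val: str = ''
) -> tuple[int, str]:
  """Runs command, or returns a canned value when dry-run is active."""
  if DRY_RUN:
    print(f'[dry-run] {task}: {command}')
    return 0, dry_run_return_val
  proc = subprocess.run(
      command, shell=True, capture_output=True, text=True, check=False
  )
  return proc.returncode, proc.stdout.strip()


# Call sites then look like the post-diff get_cluster_configmap: no args
# object and no dry_run parameter, only the canned dry-run output.
code, value = run_command_for_value(
    'kubectl get configmap my-configmap -o=jsonpath={.data}',  # placeholder command
    'GKE Cluster Get ConfigMap',
    dry_run_return_val='map[]',
)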
xpk/core/scheduling.py CHANGED
@@ -36,7 +36,7 @@ def check_if_workload_can_schedule(args, system: SystemCharacteristics) -> bool:
     returns true if workload can schedule, otherwise returns false.
   """
   resources_configmap_name = f'{args.cluster}-{CLUSTER_RESOURCES_CONFIGMAP}'
-  cluster_config_map = get_cluster_configmap(args, resources_configmap_name)
+  cluster_config_map = get_cluster_configmap(resources_configmap_name)

   # Prevents workload creation failure for existing clusters with no ConfigMap
   if cluster_config_map is None:
@@ -291,3 +291,21 @@
   ):
     return f'{system.topology}'
   return ''
+
+
+def create_sub_slicing_annotations(sub_slicing_topology: str) -> list[str]:
+  """Generates subslicing annotations.
+
+  Args:
+    sub_slicing_topology: subslice topology.
+
+  Returns:
+    Annotations to be rendered in deployment yaml.
+  """
+  return [
+      (
+          'kueue.x-k8s.io/podset-required-topology:'
+          f' "google.com/gke-tpu-slice-{sub_slicing_topology}-id"'
+      ),
+      f'cloud.google.com/gke-tpu-slice-topology: {sub_slicing_topology}',
+  ]
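Note: kueue.x-k8s.io/podset-required-topology is Kueue's topology-aware-scheduling annotation; it asks that every pod of the PodSet land within a single domain of the named topology level, here the google.com/gke-tpu-slice-<topology>-id node label. The new helper returns plain 'key: value' strings, so downstream templates presumably splice them into pod metadata. A self-contained illustration of that rendering (the YAML indentation is an assumption; xpk's real templates are not part of this hunk):

# Mirrors the helper added in this diff, inlined so the example runs
# without an xpk checkout.
def create_sub_slicing_annotations(sub_slicing_topology: str) -> list[str]:
  return [
      (
          'kueue.x-k8s.io/podset-required-topology:'
          f' "google.com/gke-tpu-slice-{sub_slicing_topology}-id"'
      ),
      f'cloud.google.com/gke-tpu-slice-topology: {sub_slicing_topology}',
  ]


# Splice the strings into the annotations block of a pod template.
lines = '\n'.join(
    f'        {a}' for a in create_sub_slicing_annotations('2x2')
)
print(f'      annotations:\n{lines}')
# ->    annotations:
#         kueue.x-k8s.io/podset-required-topology: "google.com/gke-tpu-slice-2x2-id"
#         cloud.google.com/gke-tpu-slice-topology: 2x2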
xpk/core/scheduling_test.py ADDED
@@ -0,0 +1,31 @@
+"""
+Copyright 2025 Google LLC
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    https://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+from .scheduling import create_sub_slicing_annotations
+
+
+def test_create_sub_slicing_annotations_returns_valid_annotations():
+  subslicing_topology = '2x2'
+
+  result = create_sub_slicing_annotations(subslicing_topology)
+
+  assert result == [
+      (
+          'kueue.x-k8s.io/podset-required-topology:'
+          ' "google.com/gke-tpu-slice-2x2-id"'
+      ),
+      'cloud.google.com/gke-tpu-slice-topology: 2x2',
+  ]