xpk 0.6.0__py3-none-any.whl → 0.7.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (80) hide show
  1. xpk/api/__init__.py +15 -0
  2. xpk/api/storage_crd.yaml +52 -0
  3. xpk/commands/batch.py +27 -5
  4. xpk/commands/cluster.py +104 -80
  5. xpk/commands/cluster_gcluster.py +94 -10
  6. xpk/commands/common.py +44 -0
  7. xpk/commands/config.py +29 -0
  8. xpk/commands/info.py +8 -10
  9. xpk/commands/inspector.py +5 -11
  10. xpk/commands/job.py +9 -7
  11. xpk/commands/kind.py +34 -4
  12. xpk/commands/kjob_common.py +44 -0
  13. xpk/commands/run.py +128 -0
  14. xpk/commands/shell.py +27 -7
  15. xpk/commands/storage.py +280 -0
  16. xpk/commands/version.py +6 -18
  17. xpk/commands/workload.py +381 -184
  18. xpk/core/blueprint/blueprint_definitions.py +1 -0
  19. xpk/core/blueprint/blueprint_generator.py +132 -76
  20. xpk/core/capacity.py +185 -0
  21. xpk/core/cluster.py +564 -0
  22. xpk/core/cluster_private.py +6 -3
  23. xpk/core/commands.py +18 -14
  24. xpk/core/config.py +179 -0
  25. xpk/core/docker_container.py +225 -0
  26. xpk/core/docker_image.py +210 -0
  27. xpk/core/docker_resources.py +350 -0
  28. xpk/core/filestore.py +251 -0
  29. xpk/core/gcloud_context.py +196 -0
  30. xpk/core/gcluster_manager.py +20 -2
  31. xpk/core/gcsfuse.py +50 -0
  32. xpk/core/kjob.py +257 -18
  33. xpk/core/kueue.py +12 -6
  34. xpk/core/monitoring.py +134 -0
  35. xpk/core/nap.py +32 -20
  36. xpk/core/network.py +377 -0
  37. xpk/core/nodepool.py +581 -0
  38. xpk/core/pathways.py +124 -45
  39. xpk/core/remote_state/__init__.py +15 -0
  40. xpk/core/remote_state/fuse_remote_state.py +99 -0
  41. xpk/core/remote_state/remote_state_client.py +38 -0
  42. xpk/core/resources.py +238 -0
  43. xpk/core/scheduling.py +253 -0
  44. xpk/core/storage.py +581 -0
  45. xpk/core/system_characteristics.py +38 -1
  46. xpk/core/vertex.py +105 -0
  47. xpk/core/workload.py +209 -1
  48. xpk/core/workload_decorators/rdma_decorator.py +25 -5
  49. xpk/core/workload_decorators/storage_decorator.py +52 -0
  50. xpk/core/workload_decorators/tcpxo_decorator.py +70 -37
  51. xpk/main.py +3 -1
  52. xpk/parser/batch.py +10 -151
  53. xpk/parser/cluster.py +49 -8
  54. xpk/parser/common.py +189 -1
  55. xpk/parser/config.py +49 -0
  56. xpk/parser/core.py +27 -1
  57. xpk/parser/info.py +2 -1
  58. xpk/parser/inspector.py +3 -3
  59. xpk/parser/job.py +25 -4
  60. xpk/parser/kind.py +3 -2
  61. xpk/parser/run.py +47 -0
  62. xpk/parser/shell.py +10 -1
  63. xpk/parser/storage.py +326 -0
  64. xpk/parser/validators.py +3 -3
  65. xpk/parser/workload.py +118 -76
  66. xpk/templates/__init__.py +15 -0
  67. xpk/templates/storage.yaml +13 -0
  68. xpk/utils/gcs_utils.py +125 -0
  69. xpk/utils/kubectl.py +57 -0
  70. xpk/utils/objects.py +8 -5
  71. xpk/utils/templates.py +28 -0
  72. xpk/utils/validation.py +80 -0
  73. {xpk-0.6.0.dist-info → xpk-0.7.1.dist-info}/METADATA +169 -15
  74. xpk-0.7.1.dist-info/RECORD +92 -0
  75. {xpk-0.6.0.dist-info → xpk-0.7.1.dist-info}/WHEEL +1 -1
  76. xpk/core/core.py +0 -2824
  77. xpk-0.6.0.dist-info/RECORD +0 -57
  78. {xpk-0.6.0.dist-info → xpk-0.7.1.dist-info}/entry_points.txt +0 -0
  79. {xpk-0.6.0.dist-info → xpk-0.7.1.dist-info/licenses}/LICENSE +0 -0
  80. {xpk-0.6.0.dist-info → xpk-0.7.1.dist-info}/top_level.txt +0 -0
xpk/core/scheduling.py ADDED
@@ -0,0 +1,253 @@
1
+ """
2
+ Copyright 2025 Google LLC
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ https://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ """
16
+
17
+ from ..utils.console import xpk_print
18
+ from .capacity import AUTOPROVISIONING_CONFIG_MAXIMUM_KEY, AUTOPROVISIONING_CONFIG_VALUE
19
+ from .resources import CLUSTER_RESOURCES_CONFIGMAP, get_cluster_configmap
20
+ from .system_characteristics import (
21
+ AcceleratorType,
22
+ AcceleratorTypeToAcceleratorCharacteristics,
23
+ SystemCharacteristics,
24
+ )
25
+
26
+
27
def check_if_workload_can_schedule(args, system: SystemCharacteristics) -> bool:
  """Check if workload can schedule based on the cluster resources.

  The cluster resources ConfigMap, named
  `<args.cluster>-<CLUSTER_RESOURCES_CONFIGMAP>`, is compared against the
  request in the following order:

  1. No ConfigMap found: the check is skipped and the workload is allowed, so
     that clusters without a ConfigMap keep working.
  2. The entry for `system.gke_accelerator` equals
     AUTOPROVISIONING_CONFIG_VALUE: only the total number of chips requested
     is compared with the AUTOPROVISIONING_CONFIG_MAXIMUM_KEY entry.
  3. Neither `system.gke_accelerator` nor `system.device_type` has an entry:
     the workload is rejected.
  4. `system.device_type` has an entry: the number of VMs required is
     compared with that entry.
  5. Only `system.gke_accelerator` has an entry: there is no VM count to
     compare against, so the size check is skipped and the workload is
     allowed.

  Args:
    args: user provided arguments for running the command. Reads `cluster`,
      `workload`, `num_slices` and, for GPU systems, `num_nodes`.
    system: system characteristics

  Returns:
    returns true if workload can schedule, otherwise returns false.
  """
  resources_configmap_name = f'{args.cluster}-{CLUSTER_RESOURCES_CONFIGMAP}'
  cluster_config_map = get_cluster_configmap(args, resources_configmap_name)

  # Prevents workload creation failure for existing clusters with no ConfigMap
  if cluster_config_map is None:
    xpk_print(
        'No ConfigMap exist for cluster with the name'
        f' {resources_configmap_name}.'
    )
    return True

  # Check for gke accelerator type:
  missing_gke_accelerator_type = False
  if not cluster_config_map.get(system.gke_accelerator):
    xpk_print(
        f'Gke Accelerator Type Check: {args.workload} is requesting'
        f' {system.gke_accelerator} but cluster only contains'
        f' {cluster_config_map.keys()}. '
    )
    missing_gke_accelerator_type = True
  elif (
      cluster_config_map[system.gke_accelerator]
      == AUTOPROVISIONING_CONFIG_VALUE
  ):
    # Run total chip check when in autoprovisioning mode.
    max_chips_in_cluster = int(
        cluster_config_map[AUTOPROVISIONING_CONFIG_MAXIMUM_KEY]
    )
    num_chips_in_workload = get_total_chips_requested_from_args(args, system)

    if num_chips_in_workload > max_chips_in_cluster:
      xpk_print(
          f'{args.workload} is requesting {num_chips_in_workload} chips but'
          f' the cluster {args.cluster} supports up to {max_chips_in_cluster}.'
          ' Resize the cluster to support more chips with'
          ' `xpk cluster create --autoprovisioning-max-chips=X ...`'
      )
      return False
    return True

  # Check for device type
  device_type = system.device_type
  missing_device_type = device_type not in cluster_config_map
  if missing_device_type:
    xpk_print(
        f'Device Type Check: {args.workload} is requesting {device_type} but '
        f'cluster only contains {cluster_config_map.keys()}. '
    )

  if missing_device_type and missing_gke_accelerator_type:
    xpk_print(
        'Both Device Type and GKE Accelerator Type checks failed.'
        f' XPK will not create the workload {args.workload}.'
    )
    return False

  if missing_device_type:
    # Only the accelerator type is recorded, so there is no VM count to compare
    # against. Indexing the ConfigMap here would raise KeyError; only a failure
    # of both checks is meant to block the workload, so let it through.
    xpk_print(
        f'Skipping the VM count check for {args.workload} because the cluster'
        f' ConfigMap has no entry for {device_type}.'
    )
    return True

  # Check if the size of the workload will fit in the cluster.
  max_vm_in_cluster = int(cluster_config_map[device_type])
  if system.accelerator_type == AcceleratorType['GPU']:
    vm_required_by_workload = args.num_nodes
  else:
    vm_required_by_workload = args.num_slices * system.vms_per_slice
  if vm_required_by_workload > max_vm_in_cluster:
    xpk_print(
        f'{args.workload} is requesting {args.num_slices} slice/slices of'
        f' {device_type}, which is {vm_required_by_workload} VMs, but the'
        f' cluster only contains {max_vm_in_cluster} VMs of {device_type}.'
        ' XPK will not create this workload.'
    )
    return False

  return True
110
+
111
+
112
def get_total_chips_requested_from_args(
    args, system: SystemCharacteristics
) -> int:
  """Compute how many chips the current request asks for.

  The chips of one slice (`vms_per_slice * chips_per_vm`) are multiplied by
  `args.num_nodes` for GPU systems and by `args.num_slices` for every other
  accelerator type.

  Args:
    args: user provided arguments for running the command.
    system: system characteristics.

  Returns:
    num of chips for the current request.
  """
  is_gpu = system.accelerator_type == AcceleratorType['GPU']
  # GPU requests are sized in nodes; everything else is sized in slices.
  replica_count = args.num_nodes if is_gpu else args.num_slices
  return int(system.vms_per_slice * system.chips_per_vm * replica_count)
130
+
131
+
132
+ def get_cpu_affinity(accelerator_type) -> str:
133
+ """Generate affinity rules for CPU nodepools, so that workload pods are
134
+ not scheduled on the default pool machines.
135
+ Args:
136
+ accelerator_type: accelerator type of the workload (TPU / GPU / CPU); it is compared against AcceleratorType['CPU'].
137
+
138
+ Returns:
139
+ str: yaml containing a required node affinity that excludes the `default-pool` node pool when accelerator_type is CPU, otherwise an empty string.
140
+ """
+ # The template is built unconditionally; only CPU workloads receive it.
141
+ yaml = """affinity:
142
+ nodeAffinity:
143
+ requiredDuringSchedulingIgnoredDuringExecution:
144
+ nodeSelectorTerms:
145
+ - matchExpressions:
146
+ - key: cloud.google.com/gke-nodepool
147
+ operator: NotIn
148
+ values:
149
+ - default-pool
150
+ """
151
+ if accelerator_type == AcceleratorType['CPU']:
152
+ return yaml
+ # Every other accelerator type gets no extra affinity rules from here.
153
+ return ''
154
+
155
+
156
+ def get_gpu_scheduler(
157
+ args, system: SystemCharacteristics, autoprovisioning_args: str
158
+ ) -> tuple[str, int]:
159
+ """Get gpu scheduler configuration.
+ Only two values of `args.scheduler` are accepted: `gke.io/topology-aware-auto` and `default-scheduler`.
160
+
161
+ Args:
162
+ args: user provided arguments for running the command; reads `scheduler`, `workload` and `cluster`.
163
+ system: system characteristics, used to build the accelerator and machine labels.
164
+ autoprovisioning_args: a string of arguments for Autoprovisioning, substituted into the `default-scheduler` template after the accelerator and machine labels.
165
+
166
+ Returns:
167
+ str: yaml containing gpu scheduler configuration (empty string when the scheduler is not supported)
168
+ int of 0 if successful and 1 otherwise.
169
+ """
170
+ gpu_scheduler = ''
171
+ return_code = 0
172
+
173
+ if args.scheduler == 'gke.io/topology-aware-auto':
+ # Emit a scheduling gate named `<scheduler>-<workload>`.
174
+ gpu_scheduler = f"""schedulingGates:
175
+ - name: "{args.scheduler}-{args.workload}"
176
+ """
177
+ elif args.scheduler == 'default-scheduler':
+ # Require nodes that carry a gke-accelerator label and belong to the node pool `<cluster>-np-0`.
178
+ gpu_scheduler_yaml = """schedulerName: {scheduler_name}
179
+ affinity:
180
+ nodeAffinity:
181
+ requiredDuringSchedulingIgnoredDuringExecution:
182
+ nodeSelectorTerms:
183
+ - matchExpressions:
184
+ - key: cloud.google.com/gke-accelerator
185
+ operator: Exists
186
+ - key: cloud.google.com/gke-nodepool
187
+ operator: In
188
+ values: [{node_pool_name}]
189
+ nodeSelector:
190
+ {accelerator_label}
191
+ {machine_label}
192
+ {autoprovisioning_args}
193
+ """
194
+ gpu_scheduler = gpu_scheduler_yaml.format(
195
+ scheduler_name=args.scheduler,
196
+ accelerator_label=create_accelerator_label(
197
+ system.accelerator_type, system
198
+ ),
199
+ machine_label=create_machine_label(system.accelerator_type, system),
200
+ node_pool_name=f'{args.cluster}-np-0',
201
+ autoprovisioning_args=autoprovisioning_args,
202
+ )
203
+ else:
+ # Unsupported scheduler: report it and signal failure through the return code.
204
+ return_code = 1
205
+ xpk_print(
206
+ '--scheduler needs to be set as either `default-scheduler`'
207
+ ' or `gke.io/topology-aware-auto` in order to schedule the'
208
+ ' workloads on GPUs.'
209
+ )
210
+
211
+ return gpu_scheduler, return_code
212
+
213
+
214
def create_accelerator_label(accelerator_type, system) -> str:
  """Build the `<label key>: <gke accelerator>` selector line.

  Args:
    accelerator_type: type of accelerator.
    system: system characteristics.

  Returns:
    The accelerator label, or an empty string for CPU accelerator types.
  """
  is_cpu = accelerator_type == AcceleratorType['CPU']
  if not is_cpu:
    # The label key comes from the per-accelerator characteristics table.
    characteristics = AcceleratorTypeToAcceleratorCharacteristics[
        accelerator_type
    ]
    label_key = characteristics.accelerator_label
    return f'{label_key}: {system.gke_accelerator}'
  return ''
230
+
231
+
232
def create_machine_label(
    accelerator_type, system, autoprovisioning_enabled: bool = False
) -> str:
  """Build the `<label key>: <topology>` selector line for TPU workloads.

  Args:
    accelerator_type: type of accelerator.
    system: system characteristics.
    autoprovisioning_enabled: describes autoprovisioning enablement.

  Returns:
    The machine label, or an empty string when the accelerator is not a TPU
    or autoprovisioning is enabled.
  """
  is_tpu = accelerator_type == AcceleratorType['TPU']
  # Guard clause: only TPUs without autoprovisioning get a machine label.
  if not is_tpu or autoprovisioning_enabled:
    return ''

  characteristics = AcceleratorTypeToAcceleratorCharacteristics[
      accelerator_type
  ]
  label_key = characteristics.machine_label
  return f'{label_key}: {system.topology}'