xpk 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92) hide show
  1. xpk/__init__.py +15 -0
  2. xpk/api/__init__.py +15 -0
  3. xpk/api/storage_crd.yaml +52 -0
  4. xpk/commands/__init__.py +15 -0
  5. xpk/commands/batch.py +131 -0
  6. xpk/commands/cluster.py +808 -0
  7. xpk/commands/cluster_gcluster.py +269 -0
  8. xpk/commands/common.py +44 -0
  9. xpk/commands/config.py +29 -0
  10. xpk/commands/info.py +243 -0
  11. xpk/commands/inspector.py +357 -0
  12. xpk/commands/job.py +199 -0
  13. xpk/commands/kind.py +283 -0
  14. xpk/commands/kjob_common.py +44 -0
  15. xpk/commands/run.py +128 -0
  16. xpk/commands/shell.py +140 -0
  17. xpk/commands/storage.py +267 -0
  18. xpk/commands/version.py +27 -0
  19. xpk/commands/workload.py +889 -0
  20. xpk/core/__init__.py +15 -0
  21. xpk/core/blueprint/__init__.py +15 -0
  22. xpk/core/blueprint/blueprint_definitions.py +62 -0
  23. xpk/core/blueprint/blueprint_generator.py +708 -0
  24. xpk/core/capacity.py +185 -0
  25. xpk/core/cluster.py +564 -0
  26. xpk/core/cluster_private.py +200 -0
  27. xpk/core/commands.py +356 -0
  28. xpk/core/config.py +179 -0
  29. xpk/core/docker_container.py +225 -0
  30. xpk/core/docker_image.py +210 -0
  31. xpk/core/docker_manager.py +308 -0
  32. xpk/core/docker_resources.py +350 -0
  33. xpk/core/filestore.py +251 -0
  34. xpk/core/gcloud_context.py +196 -0
  35. xpk/core/gcluster_manager.py +176 -0
  36. xpk/core/gcsfuse.py +50 -0
  37. xpk/core/kjob.py +444 -0
  38. xpk/core/kueue.py +358 -0
  39. xpk/core/monitoring.py +134 -0
  40. xpk/core/nap.py +361 -0
  41. xpk/core/network.py +377 -0
  42. xpk/core/nodepool.py +581 -0
  43. xpk/core/pathways.py +377 -0
  44. xpk/core/ray.py +222 -0
  45. xpk/core/remote_state/__init__.py +15 -0
  46. xpk/core/remote_state/fuse_remote_state.py +99 -0
  47. xpk/core/remote_state/remote_state_client.py +38 -0
  48. xpk/core/resources.py +238 -0
  49. xpk/core/scheduling.py +253 -0
  50. xpk/core/storage.py +581 -0
  51. xpk/core/system_characteristics.py +1432 -0
  52. xpk/core/vertex.py +105 -0
  53. xpk/core/workload.py +341 -0
  54. xpk/core/workload_decorators/__init__.py +15 -0
  55. xpk/core/workload_decorators/rdma_decorator.py +129 -0
  56. xpk/core/workload_decorators/storage_decorator.py +52 -0
  57. xpk/core/workload_decorators/tcpxo_decorator.py +190 -0
  58. xpk/main.py +75 -0
  59. xpk/parser/__init__.py +15 -0
  60. xpk/parser/batch.py +43 -0
  61. xpk/parser/cluster.py +662 -0
  62. xpk/parser/common.py +259 -0
  63. xpk/parser/config.py +49 -0
  64. xpk/parser/core.py +135 -0
  65. xpk/parser/info.py +64 -0
  66. xpk/parser/inspector.py +65 -0
  67. xpk/parser/job.py +147 -0
  68. xpk/parser/kind.py +95 -0
  69. xpk/parser/run.py +47 -0
  70. xpk/parser/shell.py +59 -0
  71. xpk/parser/storage.py +316 -0
  72. xpk/parser/validators.py +39 -0
  73. xpk/parser/version.py +23 -0
  74. xpk/parser/workload.py +726 -0
  75. xpk/templates/__init__.py +15 -0
  76. xpk/templates/storage.yaml +13 -0
  77. xpk/utils/__init__.py +15 -0
  78. xpk/utils/console.py +55 -0
  79. xpk/utils/file.py +82 -0
  80. xpk/utils/gcs_utils.py +125 -0
  81. xpk/utils/kubectl.py +57 -0
  82. xpk/utils/network.py +168 -0
  83. xpk/utils/objects.py +88 -0
  84. xpk/utils/templates.py +28 -0
  85. xpk/utils/validation.py +80 -0
  86. xpk/utils/yaml.py +30 -0
  87. xpk-0.0.1.dist-info/LICENSE +202 -0
  88. xpk-0.0.1.dist-info/METADATA +1498 -0
  89. xpk-0.0.1.dist-info/RECORD +92 -0
  90. xpk-0.0.1.dist-info/WHEEL +5 -0
  91. xpk-0.0.1.dist-info/entry_points.txt +2 -0
  92. xpk-0.0.1.dist-info/top_level.txt +1 -0
xpk/core/kueue.py ADDED
@@ -0,0 +1,358 @@
1
+ """
2
+ Copyright 2024 Google LLC
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ https://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ """
16
+
17
+ from argparse import Namespace
18
+
19
+ import packaging
20
+ from packaging.version import Version
21
+
22
+ from ..utils.console import xpk_exit, xpk_print
23
+ from ..utils.file import write_tmp_file
24
+ from .commands import (
25
+ run_command_for_value,
26
+ run_command_with_updates,
27
+ run_command_with_updates_retry,
28
+ )
29
+ from .pathways import add_pw_resource_flavors, add_pw_resources_to_kueue
30
+ from .resources import AutoprovisioningConfig
31
+ from .scheduling import (
32
+ create_accelerator_label,
33
+ create_machine_label,
34
+ get_total_chips_requested_from_args,
35
+ )
36
+ from .system_characteristics import (
37
+ AcceleratorTypeToAcceleratorCharacteristics,
38
+ SystemCharacteristics,
39
+ )
40
+
41
# Kueue release tag; interpolated into the manifests URL that
# install_kueue_on_cluster() applies and compared against the installed version.
KUEUE_VERSION = 'v0.10.0'
# Queue names substituted into the ClusterQueue / LocalQueue manifests.
CLUSTER_QUEUE_NAME = 'cluster-queue'
LOCAL_QUEUE_NAME = 'multislice-queue'
# Passed verbatim to `kubectl wait --timeout=` while waiting for the
# kueue-controller-manager deployment.
WAIT_FOR_KUEUE_TIMEOUT = '5m'

# Rebinds the pattern string exported by the third-party `packaging.version`
# module for the whole process.
# NOTE(review): presumably meant to make Version() accept 'vX.Y.Z' tags —
# confirm that Version actually re-reads this global, and that no other user
# of packaging.version.VERSION_PATTERN in the process is affected.
packaging.version.VERSION_PATTERN = r'^v\d+\.\d+\.\d+$'
47
+
48
# Multi-document manifest rendered with str.format() by install_kueue_crs():
# one ResourceFlavor, one ClusterQueue, one LocalQueue and five PriorityClasses.
# Literal braces are doubled ({{}}) because of str.format().
# NOTE(review): the nesting is restored from the Kueue v1beta1 and
# scheduling.k8s.io/v1 schemas. The multi-line fragments substituted for
# {pw_resource_flavors}, {covered_resources_config} and {pw_resources_kueue}
# carry their own indentation and are produced by other helpers — confirm they
# line up with the two-space level used under `resourceGroups:`.
cluster_set_crd_yaml = """apiVersion: kueue.x-k8s.io/v1beta1
kind: ResourceFlavor
metadata:
  name: {cluster_hardware_name}
spec:
  nodeLabels:
    {accelerator_label}
    {machine_label}
---
{pw_resource_flavors}
apiVersion: kueue.x-k8s.io/v1beta1
kind: ClusterQueue
metadata:
  name: {cluster_queue_name}
spec:
  preemption:
    reclaimWithinCohort: Never # Don't preempt other queues in the cohort.
    withinClusterQueue: LowerPriority
  namespaceSelector: {{}} # match all.
  resourceGroups:
  {covered_resources_config}
  {pw_resources_kueue}
---
apiVersion: kueue.x-k8s.io/v1beta1
kind: LocalQueue
metadata:
  namespace: default
  name: {local_queue_name}
spec:
  clusterQueue: {cluster_queue_name}
---
apiVersion: scheduling.k8s.io/v1
kind: PriorityClass
metadata:
  name: very-low
value: 100
globalDefault: false
description: "Very Low"
---
apiVersion: scheduling.k8s.io/v1
kind: PriorityClass
metadata:
  name: low
value: 250
globalDefault: false
description: "Low"
---
apiVersion: scheduling.k8s.io/v1
kind: PriorityClass
metadata:
  name: medium
value: 500
globalDefault: false
description: "Medium"
---
apiVersion: scheduling.k8s.io/v1
kind: PriorityClass
metadata:
  name: high
value: 750
globalDefault: false
description: "High"
---
apiVersion: scheduling.k8s.io/v1
kind: PriorityClass
metadata:
  name: very-high
value: 1000
globalDefault: false
description: "Very High"
"""
119
+
120
# DaemonSet manifest rendered with str.format(). It runs one `sleep inf`
# container from {image_name} on every node that carries the {nodeSelectorKey}
# label, tolerating all taints. {cachekey} is used as the DaemonSet name, the
# pod labels and the container name.
# The nesting follows the apps/v1 DaemonSet schema.
cluster_preheat_yml = """
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: {cachekey}
  labels:
    k8s-app: {cachekey}
spec:
  selector:
    matchLabels:
      k8s-app: {cachekey}
  updateStrategy:
    type: RollingUpdate
  template:
    metadata:
      labels:
        name: {cachekey}
        k8s-app: {cachekey}
    spec:
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
            - matchExpressions:
              - key: {nodeSelectorKey}
                operator: Exists
      tolerations:
      - operator: "Exists"
      containers:
      - image: {image_name}
        name: {cachekey}
        command: [ "sleep", "inf" ]
"""
153
+
154
+
155
def verify_kueuectl(args: Namespace) -> None:
  """Verify that the kueuectl kubectl plugin is installed.

  Runs `kubectl kueue version`. On a zero exit code a confirmation is printed
  and the function returns; otherwise installation instructions are printed
  and xpk_exit() is called with the command's exit code.

  Args:
    args: user provided arguments.

  Returns:
    None
  """
  xpk_print('Verifying kueuectl installation')

  command = 'kubectl kueue version'
  task = 'Verify kueuectl installation on cluster'
  verify_kueuectl_installed_code, _ = run_command_for_value(command, task, args)

  if verify_kueuectl_installed_code == 0:
    xpk_print('kueuectl found')
    return

  xpk_print(
      'kueuectl not found. Please follow'
      ' https://kueue.sigs.k8s.io/docs/reference/kubectl-kueue/installation/'
      ' to install kueuectl.'
  )
  xpk_exit(verify_kueuectl_installed_code)
178
+
179
+
180
def delete_multikueueconfigs_definitions(args) -> int:
  """Delete the multikueueconfigs.kueue.x-k8s.io CRD from the cluster.

  Args:
    args: user provided arguments for running the command.

  Returns:
    The exit code reported by run_command_with_updates_retry (0 on success).
  """
  # --ignore-not-found makes the deletion idempotent: if an earlier, partially
  # completed upgrade already removed the CRD, kubectl exits 0 instead of
  # failing with NotFound and aborting the upgrade again.
  command = (
      'kubectl delete crd multikueueconfigs.kueue.x-k8s.io'
      ' --ignore-not-found'
  )
  task = 'Delete multikueueconfigs crds'
  return_code = run_command_with_updates_retry(command, task, args)
  if return_code != 0:
    xpk_print(f'{task} returned ERROR {return_code}')
  return return_code
187
+
188
+
189
def delete_multikueueclusters_definitions(args) -> int:
  """Delete the multikueueclusters.kueue.x-k8s.io CRD from the cluster.

  Args:
    args: user provided arguments for running the command.

  Returns:
    The exit code reported by run_command_with_updates_retry (0 on success).
  """
  # --ignore-not-found makes the deletion idempotent: if an earlier, partially
  # completed upgrade already removed the CRD, kubectl exits 0 instead of
  # failing with NotFound and aborting the upgrade again.
  command = (
      'kubectl delete crd multikueueclusters.kueue.x-k8s.io'
      ' --ignore-not-found'
  )
  task = 'Delete multikueueclusters crds'
  return_code = run_command_with_updates_retry(command, task, args)
  if return_code != 0:
    xpk_print(f'{task} returned ERROR {return_code}')
  return return_code
196
+
197
+
198
def get_kueue_version(args) -> tuple[int, str]:
  """Read the Kueue version tag from `kubectl kueue version`.

  The version is taken as the text after the last ':' on the second line of
  the command output.
  NOTE(review): presumably that line names the server-side manager image
  (e.g. '...kueue:v0.10.0') — confirm against real kueuectl output.

  Args:
    args: user provided arguments for running the command.

  Returns:
    A (return_code, version) tuple. On command failure the command's exit code
    and '' are returned; if the output has fewer than two lines, (1, '').
  """
  command = 'kubectl kueue version'
  task = 'Get kueue version on server'
  return_code, val = run_command_for_value(command, task, args)
  if return_code != 0:
    return return_code, ''
  lines = val.splitlines()
  # Empty output has zero lines, so guard on "fewer than two" rather than
  # "exactly one"; otherwise lines[1] below raises IndexError.
  if len(lines) < 2:
    return 1, ''
  server_version_line = lines[1]
  manager_image_version = server_version_line.split(':')[-1].strip()
  return return_code, manager_image_version
210
+
211
+
212
def install_kueue_on_cluster(args) -> int:
  """Install Kueue on the cluster.

  If a Kueue version older than v0.9.0 is detected on the cluster and the
  target KUEUE_VERSION is v0.9.0 or newer, the multikueueclusters and
  multikueueconfigs CRDs are deleted before the new manifests are applied.

  Args:
    args: user provided arguments for running the command.

  Returns:
    0 if successful, otherwise the non-zero exit code of the failing step.
  """

  err_code, kueue_version_installed = get_kueue_version(args)
  if err_code == 0:
    crd_cleanup_threshold = Version('v0.9.0')
    try:
      installed_version = Version(kueue_version_installed)
    except packaging.version.InvalidVersion:
      # An image tag that is not a plain version (e.g. 'latest' or a digest)
      # must not crash the install; without a comparable version the
      # pre-upgrade cleanup is skipped and the manifests are applied as usual.
      xpk_print(
          'Could not parse installed Kueue version'
          f' "{kueue_version_installed}"; skipping pre-upgrade CRD cleanup.'
      )
      installed_version = None

    if (
        installed_version is not None
        and installed_version < crd_cleanup_threshold <= Version(KUEUE_VERSION)
    ):
      xpk_print('Upgrading kueue on cluster from version < 0.9.0.')
      upgrade_code = delete_multikueueclusters_definitions(args)
      if upgrade_code != 0:
        return upgrade_code
      upgrade_code = delete_multikueueconfigs_definitions(args)
      if upgrade_code != 0:
        return upgrade_code

  command = (
      'kubectl apply --server-side --force-conflicts -f'
      f' https://github.com/kubernetes-sigs/kueue/releases/download/{KUEUE_VERSION}/manifests.yaml'
  )
  task = 'Set Kueue On Cluster'
  return_code = run_command_with_updates_retry(command, task, args)
  if return_code != 0:
    xpk_print(f'{task} returned ERROR {return_code}')
  return return_code
244
+
245
+
246
def wait_for_kueue_available(args: Namespace) -> int:
  """Block until the kueue-controller-manager deployment reports available.

  Issues `kubectl wait` against the deployment in the kueue-system namespace
  with a timeout of WAIT_FOR_KUEUE_TIMEOUT.

  Args:
    args: user provided arguments for running the command.

  Returns:
    The exit code reported by run_command_with_updates (0 on success).
  """
  task = 'Wait for Kueue to be available'
  command = ' '.join([
      'kubectl wait deploy/kueue-controller-manager -nkueue-system',
      '--for=condition=available',
      f'--timeout={WAIT_FOR_KUEUE_TIMEOUT}',
  ])
  exit_status = run_command_with_updates(command, task, args)
  if exit_status == 0:
    return exit_status
  xpk_print(f'{task} returned ERROR {exit_status}')
  return exit_status
264
+
265
+
266
def install_kueue_crs(
    args,
    system: SystemCharacteristics,
    autoprovisioning_config: AutoprovisioningConfig | None,
) -> int:
  """Render and apply the Kueue custom resources for this cluster.

  Fills in cluster_set_crd_yaml, writes the result to a temporary file and
  applies it with `kubectl apply -f`.

  Args:
    args: user provided arguments for running the command.
    system: system level arguments.
    autoprovisioning_config: when truthy, the quota is taken from its
      maximum_chips and the flavor is named after system.gke_accelerator;
      otherwise the quota comes from get_total_chips_requested_from_args and
      the flavor is named '<num_slices>x<device_type>'.

  Returns:
    The exit code reported by run_command_with_updates_retry (0 on success).
  """
  # Default flavor name; overridden below when autoprovisioning is configured.
  flavor_name = f'{args.num_slices}x{system.device_type}'
  accelerator = AcceleratorTypeToAcceleratorCharacteristics[
      system.accelerator_type
  ]
  resource_type = accelerator.resource_type

  autoprovisioning_enabled = bool(autoprovisioning_config)
  if autoprovisioning_enabled:
    # Quota is capped by the autoprovisioning maximum.
    chip_quota = autoprovisioning_config.maximum_chips
    flavor_name = f'{system.gke_accelerator}'
  else:
    # Quota follows the topology the user asked for.
    chip_quota = get_total_chips_requested_from_args(args, system)

  covered_resources_config = get_kueue_covered_resources_config(
      cluster_hardware_name=flavor_name,
      resource_type=resource_type,
      total_chips=chip_quota,
  )

  template_values = {
      'system': system,
      'cluster_hardware_name': flavor_name,
      'accelerator_label': create_accelerator_label(
          system.accelerator_type, system
      ),
      'machine_label': create_machine_label(
          system.accelerator_type, system, autoprovisioning_enabled
      ),
      'covered_resources_config': covered_resources_config,
      'resource_type': AcceleratorTypeToAcceleratorCharacteristics[
          system.accelerator_type
      ].resource_type,
      'pw_resource_flavors': add_pw_resource_flavors(args),
      'pw_resources_kueue': add_pw_resources_to_kueue(args),
      'cluster_queue_name': CLUSTER_QUEUE_NAME,
      'local_queue_name': LOCAL_QUEUE_NAME,
  }
  manifest = cluster_set_crd_yaml.format(**template_values)

  manifest_file = write_tmp_file(manifest)
  command = 'kubectl apply -f ' + str(manifest_file.file.name)

  task = 'Applying Kueue Custom Resources'
  exit_status = run_command_with_updates_retry(command, task, args)
  if exit_status != 0:
    xpk_print(f'{task} returned ERROR {exit_status}')
  return exit_status
330
+
331
+
332
def get_kueue_covered_resources_config(
    cluster_hardware_name, resource_type, total_chips
) -> str:
  """Gets Kueue covered resources configuration.

  Builds the single `resourceGroups` list entry for the ClusterQueue: one
  covered resource, served by one flavor, with `total_chips` as its nominal
  quota. The entry is indented by two spaces and each nested level by two
  more, so `flavors` belongs to the list item and `resources` to the flavor.

  Args:
    cluster_hardware_name: cluster hardware name, used as the flavor name.
    resource_type: resource type of tpu or gpu.
    total_chips: total number of chips for the specific resource type.

  Returns:
    A string of Kueue covered resources configuration. It starts with a
    newline and ends with a newline followed by two spaces.
  """
  config_format = """
  - coveredResources: ["{resource_type}"]
    flavors:
    - name: {cluster_hardware_name}
      resources:
      - name: "{resource_type}"
        nominalQuota: {total_chips}
  """
  config_string = config_format.format(
      cluster_hardware_name=cluster_hardware_name,
      resource_type=resource_type,
      total_chips=total_chips,
  )
  return config_string
xpk/core/monitoring.py ADDED
@@ -0,0 +1,134 @@
1
+ """
2
+ Copyright 2025 Google LLC
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ https://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ """
16
+
17
+ from ..utils.console import xpk_print
18
+ from .commands import run_command_for_value
19
+
20
+
21
def get_gke_dashboard(args, dashboard_filter) -> tuple[bool, str | None]:
  """Get the identifier of GKE dashboard deployed in the project.

  Args:
    args: user provided arguments for running the command.
    dashboard_filter: filter expression passed to
      `gcloud monitoring dashboards list --filter`.

  Returns:
    bool:
      True if 'gcloud monitoring dashboards list' returned an error or
      multiple dashboards with same filter exist in the project,
      False otherwise.
    str:
      identifier of dashboard if deployed in project (the last '/'-separated
      segment of the dashboard name),
      None otherwise.
  """
  command = (
      'gcloud monitoring dashboards list'
      f' --project={args.project} --filter="{dashboard_filter}"'
      ' --format="value(name)" --verbosity=error'
  )

  return_code, return_value = run_command_for_value(
      command, 'GKE Dashboard List', args
  )

  if return_code != 0:
    xpk_print(
        f'GKE Dashboard List request returned ERROR {return_code}. If there is'
        ' a permissions error, please check'
        ' https://github.com/google/xpk/blob/main/README.md#roles-needed-based-on-permission-errors'
        ' for possible solutions.'
    )
    return True, None

  # Ignore blank lines so that output consisting only of whitespace is treated
  # as "no dashboard" rather than as an unexplained error.
  dashboards = [
      line.strip()
      for line in (return_value or '').splitlines()
      if line.strip()
  ]

  if not dashboards:
    xpk_print(
        f'No dashboard with {dashboard_filter} found in the'
        f' project:{args.project}.'
    )
    return False, None

  if len(dashboards) > 1:
    xpk_print(
        f'Multiple dashboards with same {dashboard_filter} exist in the'
        f' project:{args.project}. Delete all but one dashboard deployed using'
        ' https://github.com/google/cloud-tpu-monitoring-debugging.'
    )
    return True, None

  return False, dashboards[0].split('/')[-1]
75
+
76
+
77
def get_gke_outlier_dashboard(args) -> str | None:
  """Look up the 'GKE - TPU Monitoring Dashboard' in the project.

  Args:
    args: user provided arguments for running the command.

  Returns:
    str:
      identifier of outlier dashboard if deployed in project,
      None otherwise (lookup error, duplicates, or no dashboard deployed).
  """
  lookup_failed, found_id = get_gke_dashboard(
      args, "displayName:'GKE - TPU Monitoring Dashboard'"
  )

  # The listing failed or matched several dashboards.
  if lookup_failed:
    return None

  if found_id:
    return str(found_id)

  # The listing succeeded but nothing matched: point the user at the deploy guide.
  xpk_print(
      'Follow https://github.com/google/cloud-tpu-monitoring-debugging to'
      ' deploy monitoring dashboard to view statistics and outlier mode of'
      ' GKE metrics.'
  )
  return None
105
+
106
+
107
def get_gke_debugging_dashboard(args) -> str | None:
  """Look up the 'GKE - TPU Logging Dashboard' in the project.

  Args:
    args: user provided arguments for running the command.

  Returns:
    str:
      identifier of debugging dashboard if deployed in project,
      None otherwise (lookup error, duplicates, or no dashboard deployed).
  """
  lookup_failed, found_id = get_gke_dashboard(
      args, "displayName:'GKE - TPU Logging Dashboard'"
  )

  # The listing failed or matched several dashboards.
  if lookup_failed:
    return None

  if found_id:
    return str(found_id)

  # The listing succeeded but nothing matched: point the user at the deploy guide.
  xpk_print(
      'Follow https://github.com/google/cloud-tpu-monitoring-debugging to'
      ' deploy debugging dashboard to view stack traces collected in Cloud'
      ' Logging.'
  )
  return None