xpk-0.5.0-py3-none-any.whl → xpk-0.6.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60)
  1. xpk/__init__.py +15 -0
  2. xpk/commands/__init__.py +15 -0
  3. xpk/commands/batch.py +109 -0
  4. xpk/commands/cluster.py +784 -0
  5. xpk/commands/cluster_gcluster.py +185 -0
  6. xpk/commands/info.py +245 -0
  7. xpk/commands/inspector.py +363 -0
  8. xpk/commands/job.py +197 -0
  9. xpk/commands/kind.py +253 -0
  10. xpk/commands/shell.py +120 -0
  11. xpk/commands/version.py +39 -0
  12. xpk/commands/workload.py +692 -0
  13. xpk/core/__init__.py +15 -0
  14. xpk/core/blueprint/__init__.py +15 -0
  15. xpk/core/blueprint/blueprint_definitions.py +61 -0
  16. xpk/core/blueprint/blueprint_generator.py +652 -0
  17. xpk/core/cluster_private.py +197 -0
  18. xpk/core/commands.py +352 -0
  19. xpk/core/core.py +2824 -0
  20. xpk/core/docker_manager.py +308 -0
  21. xpk/core/gcluster_manager.py +158 -0
  22. xpk/core/kjob.py +205 -0
  23. xpk/core/kueue.py +352 -0
  24. xpk/core/nap.py +349 -0
  25. xpk/core/pathways.py +298 -0
  26. xpk/core/ray.py +222 -0
  27. xpk/core/system_characteristics.py +1395 -0
  28. xpk/core/workload.py +133 -0
  29. xpk/core/workload_decorators/__init__.py +15 -0
  30. xpk/core/workload_decorators/rdma_decorator.py +109 -0
  31. xpk/core/workload_decorators/tcpxo_decorator.py +157 -0
  32. xpk/main.py +73 -0
  33. xpk/parser/__init__.py +15 -0
  34. xpk/parser/batch.py +184 -0
  35. xpk/parser/cluster.py +621 -0
  36. xpk/parser/common.py +71 -0
  37. xpk/parser/core.py +109 -0
  38. xpk/parser/info.py +63 -0
  39. xpk/parser/inspector.py +65 -0
  40. xpk/parser/job.py +126 -0
  41. xpk/parser/kind.py +94 -0
  42. xpk/parser/shell.py +50 -0
  43. xpk/parser/validators.py +39 -0
  44. xpk/parser/version.py +23 -0
  45. xpk/parser/workload.py +684 -0
  46. xpk/utils/__init__.py +15 -0
  47. xpk/utils/console.py +55 -0
  48. xpk/utils/file.py +82 -0
  49. xpk/utils/network.py +168 -0
  50. xpk/utils/objects.py +85 -0
  51. xpk/utils/yaml.py +30 -0
  52. {xpk-0.5.0.dist-info → xpk-0.6.0.dist-info}/METADATA +301 -28
  53. xpk-0.6.0.dist-info/RECORD +57 -0
  54. {xpk-0.5.0.dist-info → xpk-0.6.0.dist-info}/WHEEL +1 -1
  55. xpk-0.6.0.dist-info/entry_points.txt +2 -0
  56. xpk-0.5.0.dist-info/RECORD +0 -7
  57. xpk-0.5.0.dist-info/entry_points.txt +0 -2
  58. xpk.py +0 -7282
  59. {xpk-0.5.0.dist-info → xpk-0.6.0.dist-info}/LICENSE +0 -0
  60. {xpk-0.5.0.dist-info → xpk-0.6.0.dist-info}/top_level.txt +0 -0
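
The headline change in 0.6.0 is structural: the monolithic xpk.py from 0.5.0 (removed below, -7282 lines) is split into an xpk package with commands, core, parser, and utils subpackages, and the wheel's entry point moves accordingly (see entry_points.txt). A minimal import sketch of what the new layout implies, based only on the module paths listed above; this diff does not document a stable public API, so treat these paths as illustrative:

# Hypothetical import sketch: module paths come from the file list above,
# but nothing here is a documented public API of xpk 0.6.0.
from xpk.commands.workload import workload_create, workload_delete, workload_list
from xpk.core.system_characteristics import get_system_characteristics
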
xpk/commands/workload.py ADDED
@@ -0,0 +1,692 @@
+ """
+ Copyright 2024 Google LLC
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+     https://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ """
+
+ from ..core.commands import (
+     run_command_with_updates,
+     run_commands,
+ )
+ from ..core.core import (
+     CLUSTER_METADATA_CONFIGMAP,
+     VERTEX_TENSORBOARD_FEATURE_FLAG,
+     AcceleratorTypeToAcceleratorCharacteristics,
+     add_zone_and_project,
+     check_if_workload_can_schedule,
+     check_if_workload_exists,
+     create_accelerator_label,
+     create_machine_label,
+     create_vertex_experiment,
+     get_cluster_configmap,
+     get_cpu_affinity,
+     get_gke_outlier_dashboard,
+     get_gpu_rxdm_cmd,
+     get_gpu_rxdm_image,
+     get_gpu_scheduler,
+     get_gpu_tcp_volume,
+     get_gpu_volume,
+     get_user_workload_container,
+     get_volumes,
+     parse_env_config,
+     wait_for_job_completion,
+     xpk_current_version,
+     zone_to_region,
+ )
+ from ..core.kueue import LOCAL_QUEUE_NAME
+ from ..core.nap import (
+     get_autoprovisioning_node_selector_args,
+     is_autoprovisioning_enabled,
+ )
+ from ..core.pathways import (
+     ensure_pathways_workload_prerequisites,
+     get_pathways_proxy_args,
+     get_pathways_rm_args,
+     get_pathways_unified_query_link,
+     get_pathways_worker_args,
+     get_user_workload_for_pathways,
+ )
+ from ..core.system_characteristics import (
+     AcceleratorType,
+     get_system_characteristics,
+ )
+ from ..core.workload import get_workload_list
+ from ..utils.console import get_user_input, xpk_exit, xpk_print
+ from ..utils.file import write_tmp_file
+ from .cluster import set_cluster_command
+ from ..core.workload_decorators import tcpxo_decorator, rdma_decorator
+ from . import cluster_gcluster
+
+ workload_create_yaml = """apiVersion: jobset.x-k8s.io/v1alpha2
+ kind: JobSet
+ metadata:
+   name: {args.workload}
+   labels:
+     kueue.x-k8s.io/queue-name: {local_queue_name}  # Name of the LocalQueue
+     xpk.google.com/workload: {args.workload}
+   annotations:
+     alpha.jobset.sigs.k8s.io/exclusive-topology: cloud.google.com/gke-nodepool  # 1:1 job replica to node pool assignment
+ spec:
+   ttlSecondsAfterFinished: {args.ttl_seconds_after_finished}
+   failurePolicy:
+     maxRestarts: {args.max_restarts}
+   replicatedJobs:
+   - name: slice-job
+     replicas: {args.num_slices}
+     template:
+       spec:
+         parallelism: {system.vms_per_slice}  # Equal to the number of VMs per slice
+         completions: {system.vms_per_slice}  # Same as the above.
+         backoffLimit: 0  # When any pod fails, the job is failed
+         template:
+           metadata:
+             labels:
+               xpk.google.com/workload: {args.workload}
+           spec:
+             schedulerName: {args.scheduler}
+             restartPolicy: Never
+             {affinity}
+             nodeSelector:
+               {accelerator_label}
+               {machine_label}
+               {autoprovisioning_args}
+             priorityClassName: {args.priority}
+             hostNetwork: true
+             dnsPolicy: ClusterFirstWithHostNet
+             terminationGracePeriodSeconds: {args.termination_grace_period_seconds}
+             containers:
+             {container}
+             volumes:
+             {volumes}
+ """
+
+
+ gpu_workload_create_yaml = """apiVersion: jobset.x-k8s.io/v1alpha2
+ kind: JobSet
+ metadata:
+   name: {args.workload}
+   labels:
+     kueue.x-k8s.io/queue-name: multislice-queue  # Name of the LocalQueue
+     xpk.google.com/workload: {args.workload}
+ spec:
+   ttlSecondsAfterFinished: {args.ttl_seconds_after_finished}
+   failurePolicy:
+     maxRestarts: {args.max_restarts}
+   replicatedJobs:
+   - name: slice-job
+     replicas: 1
+     template:
+       spec:
+         parallelism: {args.num_nodes}
+         completions: {args.num_nodes}
+         backoffLimit: 0  # When any pod fails, the job is failed
+         template:
+           metadata:
+             labels:
+               xpk.google.com/workload: {args.workload}
+           spec:
+             {gpu_scheduler}
+             priorityClassName: {args.priority}
+             restartPolicy: Never
+             hostNetwork: true
+             dnsPolicy: ClusterFirstWithHostNet
+             terminationGracePeriodSeconds: {args.termination_grace_period_seconds}
+             tolerations:
+             - operator: "Exists"
+               key: nvidia.com/gpu
+             volumes:
+             {gpu_volume}
+             containers:
+             {gpu_rxdm_image}
+               imagePullPolicy: Always
+               command:
+               - "bash"
+               - "-c"
+               - |
+                 {gpu_rxdm_cmd} &
+                 while [ ! -e "/usr/share/workload/workload_terminated" ]; do sleep 10; echo "sleeping"; done
+               securityContext:
+                 privileged: true
+               volumeMounts:
+               {gpu_tcp_volume}
+               - name: nvidia-install-dir-host
+                 mountPath: /usr/local/nvidia/lib64
+               - name: workload-terminated-volume
+                 mountPath: /usr/share/workload
+               env:
+               - name: LD_LIBRARY_PATH
+                 value: /usr/local/nvidia/lib64
+             {container}
+ """
+
+ a3_gpu_workload_create_yaml = """apiVersion: jobset.x-k8s.io/v1alpha2
+ kind: JobSet
+ metadata:
+   name: {args.workload}
+   labels:
+     kueue.x-k8s.io/queue-name: multislice-queue  # Name of the LocalQueue
+     xpk.google.com/workload: {args.workload}
+ spec:
+   ttlSecondsAfterFinished: {args.ttl_seconds_after_finished}
+   failurePolicy:
+     maxRestarts: {args.max_restarts}
+   replicatedJobs:
+   - name: slice-job
+     replicas: 1
+     template:
+       spec:
+         parallelism: {args.num_nodes}
+         completions: {args.num_nodes}
+         backoffLimit: 0  # When any pod fails, the job is failed
+         template:
+           metadata:
+             labels:
+               xpk.google.com/workload: {args.workload}
+             annotations:
+               kueue.x-k8s.io/podset-preferred-topology: "cloud.google.com/gce-topology-host"
+           spec:
+             priorityClassName: {args.priority}
+             restartPolicy: Never
+             dnsPolicy: ClusterFirstWithHostNet
+             terminationGracePeriodSeconds: {args.termination_grace_period_seconds}
+             tolerations:
+             - operator: "Exists"
+               key: nvidia.com/gpu
+             containers:
+             {container}
+ """
+
+ pw_workload_create_yaml = """apiVersion: jobset.x-k8s.io/v1alpha2
+ kind: JobSet
+ metadata:
+   name: {args.workload}
+   labels:
+     kueue.x-k8s.io/queue-name: {local_queue_name}  # Name of the LocalQueue
+     xpk.google.com/workload: {args.workload}
+ spec:
+   ttlSecondsAfterFinished: {args.ttl_seconds_after_finished}
+   failurePolicy:
+     maxRestarts: {args.max_restarts}
+   successPolicy:
+     operator: "All"
+     targetReplicatedJobs:
+     - {args.targetReplicatedJob}
+   replicatedJobs:
+   - name: worker
+     replicas: {args.num_slices}
+     template:
+       metadata:
+         annotations:
+           alpha.jobset.sigs.k8s.io/exclusive-topology: cloud.google.com/gke-nodepool
+         labels:
+           xpk.google.com/workload: {args.workload}
+       spec:
+         backoffLimit: {backoff_limit}
+         completions: {system.vms_per_slice}
+         parallelism: {system.vms_per_slice}
+         template:
+           spec:
+             terminationGracePeriodSeconds: {args.termination_grace_period_seconds}
+             containers:
+             - args:
+               {pathways_worker_args}
+               image: {args.server_image}
+               imagePullPolicy: Always
+               name: pathways-worker
+               ports:
+               - containerPort: 29001
+               - containerPort: 8471
+               - containerPort: 8080
+               resources:
+                 limits:
+                   {resource_type}: {system.chips_per_vm}
+               securityContext:
+                 privileged: true
+               volumeMounts:
+               - mountPath: /tmp
+                 name: shared-tmp
+             nodeSelector:
+               {accelerator_label}
+               {machine_label}
+               {autoprovisioning_args}
+             priorityClassName: {args.priority}
+             hostNetwork: true
+             dnsPolicy: ClusterFirstWithHostNet
+             volumes:
+             - hostPath:
+                 path: /tmp
+                 type: DirectoryOrCreate
+               name: shared-tmp
+   - name: rm
+     replicas: 1
+     template:
+       metadata:
+         labels:
+           xpk.google.com/workload: {args.workload}
+       spec:
+         backoffLimit: 0
+         completions: 1
+         parallelism: 1
+         template:
+           spec:
+             containers:
+             - args:
+               {pathways_rm_args}
+               env:
+               - name: REPLICATED_JOB_NAME
+                 valueFrom:
+                   fieldRef:
+                     fieldPath: metadata.annotations['jobset.sigs.k8s.io/replicatedjob-name']
+               - name: JOBSET_NAME
+                 valueFrom:
+                   fieldRef:
+                     fieldPath: metadata.annotations['jobset.sigs.k8s.io/jobset-name']
+               - name: HOST_ADDRESS
+                 value: $(JOBSET_NAME)-$(REPLICATED_JOB_NAME)-0-0.$(JOBSET_NAME)
+               - name: TPU_SKIP_MDS_QUERY
+                 value: "true"
+               image: {args.server_image}
+               imagePullPolicy: Always
+               name: pathways-rm
+               ports:
+               - containerPort: 29001
+               securityContext:
+                 privileged: true
+               volumeMounts:
+               - mountPath: /tmp
+                 name: shared-tmp
+             nodeSelector:
+               cloud.google.com/gke-nodepool: cpu-rm-np
+             hostNetwork: true
+             dnsPolicy: ClusterFirstWithHostNet
+             volumes:
+             - hostPath:
+                 path: /tmp
+                 type: DirectoryOrCreate
+               name: shared-tmp
+   - name: proxy
+     replicas: 1
+     template:
+       metadata:
+         labels:
+           xpk.google.com/workload: {args.workload}
+       spec:
+         backoffLimit: 0
+         completions: 1
+         parallelism: 1
+         template:
+           spec:
+             containers:
+             - args:
+               {pathways_proxy_args}
+               image: {args.proxy_server_image}
+               imagePullPolicy: Always
+               name: pathways-proxy
+               ports:
+               - containerPort: 29000
+             hostNetwork: true
+             dnsPolicy: ClusterFirstWithHostNet
+             nodeSelector:
+               cloud.google.com/gke-nodepool: cpu-proxy-np
+   {user_workload}
+ """
+
+
+ def workload_create_pathways(args) -> None:
+   """Run jobset apply command for a file, specifically for Pathways.
+
+   Args:
+     args: user provided arguments for running the command.
+
+   Returns:
+     0 if successful and 1 otherwise.
+   """
+   args.use_pathways = True
+   workload_create(args)
+
+
+ def workload_create(args) -> None:
+   """Run jobset apply command for a file.
+
+   Args:
+     args: user provided arguments for running the command.
+
+   Returns:
+     0 if successful and 1 otherwise.
+   """
+   add_zone_and_project(args)
+
+   if args.headless:
+     xpk_print(
+         'Please use kubectl port forwarding to connect to the Pathways proxy.'
+         ' kubectl get pods kubectl port-forward <proxy-pod-name> 29000:29000'
+         ' JAX_PLATFORMS=proxy JAX_BACKEND_TARGET=grpc://127.0.0.1:29000 python'
+         " -c 'import pathwaysutils; import jax; print(jax.devices())'"
+     )
+
+   set_cluster_command_code = set_cluster_command(args)
+   if set_cluster_command_code != 0:
+     xpk_exit(set_cluster_command_code)
+
+   workload_exists = check_if_workload_exists(args)
+
+   if workload_exists:
+     xpk_print(
+         f'{args.workload} already exists, XPK will not create this workload.'
+         ' Please pick a new workload name'
+     )
+     xpk_exit(1)
+
+   xpk_print('Starting workload create', flush=True)
+   system, return_code = get_system_characteristics(args)
+
+   if return_code > 0:
+     xpk_print('Fetching system characteristics failed!')
+     xpk_exit(return_code)
+
+   if not check_if_workload_can_schedule(args, system):
+     xpk_exit(1)
+
+   xpk_print('Starting workload create', flush=True)
+
+   metadata_configmap_name = f'{args.cluster}-{CLUSTER_METADATA_CONFIGMAP}'
+   cluster_config_map = get_cluster_configmap(args, metadata_configmap_name)
+   cluster_xpk_version = None
+   if cluster_config_map is None:
+     xpk_print(
+         f'Warning: Unable to find ConfigMap: {metadata_configmap_name} for the'
+         ' cluster. We recommend to upgrade your cluster by running `xpk'
+         ' cluster create`.'
+     )
+   else:
+     cluster_xpk_version = cluster_config_map.get('xpk_version')
+   if (
+       cluster_xpk_version is not None
+       and cluster_xpk_version != xpk_current_version
+   ):
+     xpk_print(
+         'Warning: Cluster has been created using XPK version:'
+         f' {cluster_config_map["xpk_version"]} but the XPK version you are'
+         f' using to schedule workload is: {xpk_current_version}. Some features'
+         ' might not be available for this cluster. We recommend to'
+         ' upgrade/downgrade your XPK version or cluster by running `xpk'
+         ' cluster create`.'
+     )
+
+   debugging_dashboard_id = None
+
+   tensorboard_config = {}
+   if VERTEX_TENSORBOARD_FEATURE_FLAG and args.use_vertex_tensorboard:
+     tensorboard_config = create_vertex_experiment(args)
+     # exit if failed to create Experiment in Vertex AI
+     if not tensorboard_config:
+       xpk_exit(1)
+
+   parse_env_config(args, tensorboard_config, system)
+
+   # Currently autoprovisioning is not enabled for Pathways workloads.
+   autoprovisioning_args = ''
+   autoprovisioning_enabled, return_code = is_autoprovisioning_enabled(
+       args, system
+   )
+   if return_code != 0:
+     xpk_exit(return_code)
+   if autoprovisioning_enabled:
+     # Determine NAP capacity type
+     autoprovisioning_args, return_code = (
+         get_autoprovisioning_node_selector_args(args)
+     )
+     if return_code != 0:
+       xpk_exit(return_code)
+
+   # Create the workload file based on accelerator type or workload type.
+   if system.accelerator_type == AcceleratorType['GPU']:
+     container, debugging_dashboard_id = get_user_workload_container(
+         args, system
+     )
+     gpu_scheduler, return_code = get_gpu_scheduler(
+         args, system, autoprovisioning_args
+     )
+     if return_code != 0:
+       xpk_exit(return_code)
+
+     if system.device_type in cluster_gcluster.supported_device_types:
+       yml_string = a3_gpu_workload_create_yaml.format(
+           args=args, container=container
+       )
+
+       if args.device_type == cluster_gcluster.a3mega_device_type:
+         sub_networks = [f'{args.cluster}-gpunet-{i}-subnet' for i in range(8)]
+         yml_string = tcpxo_decorator.decorate_jobset(yml_string, sub_networks)
+
+       if args.device_type == cluster_gcluster.a3ultra_device_type:
+         sub_networks = [f'{args.cluster}-sub-1'] + [
+             f'{args.cluster}-rdma-sub-{i}' for i in range(8)
+         ]
+         yml_string = rdma_decorator.decorate_jobset(yml_string, sub_networks)
+     else:
+       yml_string = gpu_workload_create_yaml.format(
+           args=args,
+           container=container,
+           command=args.command,
+           chips_per_vm=system.chips_per_vm,
+           gpu_scheduler=gpu_scheduler,
+           gpu_volume=get_gpu_volume(system),
+           gpu_rxdm_image=get_gpu_rxdm_image(system),
+           gpu_rxdm_cmd=get_gpu_rxdm_cmd(system),
+           gpu_tcp_volume=get_gpu_tcp_volume(system),
+       )
+   elif args.use_pathways and ensure_pathways_workload_prerequisites(
+       args, system
+   ):
+     yml_string = pw_workload_create_yaml.format(
+         args=args,
+         system=system,
+         accelerator_label=create_accelerator_label(
+             system.accelerator_type, system
+         ),
+         machine_label=create_machine_label(system.accelerator_type, system),
+         pathways_rm_args=get_pathways_rm_args(args, system),
+         pathways_worker_args=get_pathways_worker_args(args),
+         pathways_proxy_args=get_pathways_proxy_args(args),
+         user_workload=get_user_workload_for_pathways(args, system),
+         resource_type=AcceleratorTypeToAcceleratorCharacteristics[
+             system.accelerator_type
+         ].resource_type,
+         local_queue_name=LOCAL_QUEUE_NAME,
+         autoprovisioning_args=autoprovisioning_args,
+         backoff_limit=system.vms_per_slice * 4,
+     )
+   else:
+     container, debugging_dashboard_id = get_user_workload_container(
+         args, system
+     )
+     yml_string = workload_create_yaml.format(
+         args=args,
+         system=system,
+         container=container,
+         affinity=get_cpu_affinity(system.accelerator_type),
+         accelerator_label=create_accelerator_label(
+             system.accelerator_type, system
+         ),
+         machine_label=create_machine_label(system.accelerator_type, system),
+         local_queue_name=LOCAL_QUEUE_NAME,
+         autoprovisioning_args=autoprovisioning_args,
+         volumes=get_volumes(args, system),
+     )
+   tmp = write_tmp_file(yml_string)
+   command = f'kubectl apply -f {str(tmp.file.name)}'
+   return_code = run_command_with_updates(command, 'Creating Workload', args)
+
+   if return_code != 0:
+     xpk_print(f'Create Workload request returned ERROR {return_code}')
+     xpk_exit(return_code)
+
+   # Get GKE outlier dashboard for TPU
+   outlier_dashboard_id = None
+   if system.accelerator_type == AcceleratorType['TPU']:
+     outlier_dashboard_id = get_gke_outlier_dashboard(args)
+
+   # Outlier and debugging dashboards
+   if outlier_dashboard_id is not None:
+     xpk_print(
+         'Check statistics and outlier mode of GKE metrics here:'
+         # pylint: disable=line-too-long
+         f' https://console.cloud.google.com/monitoring/dashboards/builder/{outlier_dashboard_id}?project={args.project}&f.rlabel.cluster_name.ClusterName={args.cluster}.'
+         ' To view the metric data for your workload, select'
+         f' {args.workload} from the JobName filter on the dashboard.'
+     )
+
+   if debugging_dashboard_id is not None:
+     xpk_print(
+         'Check stack traces collected in Cloud Logging here:'
+         # pylint: disable=line-too-long
+         f' https://console.cloud.google.com/monitoring/dashboards/builder/{debugging_dashboard_id}?project={args.project}&f.rlabel.cluster_name.ClusterName={args.cluster}.'
+         ' To view the stack traces for your workload, select'
+         f' {args.workload} from the JobName filter on the dashboard.'
+     )
+
+   if args.use_pathways:
+     if args.headless:
+       xpk_print(
+           ' \n ******* Please connect to your Pathways proxy at'
+           f' {args.pathways_proxy_address}, once you see "IFRT proxy server'
+           ' started with status OK" on the proxy link below.'
+           ' Remember to delete the workload once done! ****** \n'
+       )
+     pathways_proxy_link = f'https://console.cloud.google.com/kubernetes/job/{zone_to_region(args.zone)}/{args.cluster}/default/{args.workload}-proxy-0/details?project={args.project}'
+     xpk_print(
+         'Follow the proxy here:'
+         # pylint: disable=line-too-long
+         f' {pathways_proxy_link} '
+     )
+     xpk_print(
+         'Follow your Pathways workload and other resources here : '
+         f'{get_pathways_unified_query_link(args)}'
+     )
+   else:
+     xpk_print(
+         'Follow your workload here:'
+         # pylint: disable=line-too-long
+         f' https://console.cloud.google.com/kubernetes/service/{zone_to_region(args.zone)}/{args.cluster}/default/{args.workload}/details?project={args.project}'
+     )
+     duration_of_logs = 'P1D'  # Past 1 Day
+     xpk_print(
+         'Follow your worker 0, slice 0 logs here:'
+         ' Adjust the pod name'
+         ' ([prefix]-slice-job-[slice_number]-[worker_number])'
+         ' after clicking the url if you want other worker logs.'
+         # pylint: disable=line-too-long
+         f' https://console.cloud.google.com/logs/query;query=resource.type%3D%22k8s_container%22%0Aresource.labels.project_id%3D%22{args.project}%22%0Aresource.labels.location%3D%22{zone_to_region(args.zone)}%22%0Aresource.labels.cluster_name%3D%22{args.cluster}%22%0Aresource.labels.namespace_name%3D%22default%22%0Aresource.labels.pod_name:%22{args.workload}-slice-job-0-0-%22%20severity%3E%3DDEFAULT;storageScope=project;duration={duration_of_logs}?e=13802955&mods=allow_workbench_image_override&project={args.project}'
+     )
+
+   xpk_exit(0)
+
+
+ def workload_delete(args) -> None:
+   """Function around workload delete.
+
+   Args:
+     args: user provided arguments for running the command.
+
+   Returns:
+     0 if successful and 1 otherwise.
+   """
+   xpk_print('Starting Workload delete', flush=True)
+   add_zone_and_project(args)
+   set_cluster_command_code = set_cluster_command(args)
+   if set_cluster_command_code != 0:
+     xpk_exit(set_cluster_command_code)
+
+   will_delete = True
+   if not args.workload:
+     xpk_print('Get the name of the workloads in the cluster.')
+     return_code, return_value = get_workload_list(args)
+
+     if return_code != 0:
+       xpk_print(f'List Job request returned ERROR {return_code}')
+       xpk_exit(return_code)
+     # Skip the header
+     workloads = [x.split(' ')[0] for x in return_value.splitlines()][1:]
+     if workloads and not args.force:
+       will_delete = get_user_input(
+           f'Planning to delete {len(workloads)} workloads in the cluster'
+           f' {args.cluster} including {workloads}. \nDo you wish to delete: y'
+           ' (yes) / n (no):\n'
+       )
+   else:
+     workloads = [args.workload]
+
+   if not workloads:
+     xpk_print(
+         'There are no workloads to delete matching the filter in the cluster.'
+     )
+   elif not will_delete:
+     xpk_print('Skipping delete command.')
+   else:
+     commands = []
+     task_names = []
+     for workload in workloads:
+       args.workload = workload
+       command = f'kubectl delete jobset {workload} -n default'
+       task_name = f'WorkloadDelete-{workload}'
+       commands.append(command)
+       task_names.append(task_name)
+
+     # Not batching deletion for single workload
+     if len(workloads) == 1:
+       return_code = run_command_with_updates(
+           commands[0], 'Delete Workload', args
+       )
+     else:
+       return_code = run_commands(
+           commands, 'Delete Workload', task_names, batch=100
+       )
+
+     if return_code != 0:
+       xpk_print(f'Delete Workload request returned ERROR {return_code}')
+       xpk_exit(return_code)
+   xpk_exit(0)
+
+
+ def workload_list(args) -> None:
+   """Function around workload list.
+
+   Args:
+     args: user provided arguments for running the command.
+
+   Returns:
+     0 if successful and 1 otherwise.
+   """
+   xpk_print(args)
+
+   xpk_print('Starting workload list', flush=True)
+   add_zone_and_project(args)
+   set_cluster_command_code = set_cluster_command(args)
+   if set_cluster_command_code != 0:
+     xpk_exit(set_cluster_command_code)
+
+   if args.wait_for_job_completion:
+     return_code = wait_for_job_completion(args)
+     if return_code != 0:
+       xpk_print(f'Wait for job completion returned ERROR {return_code}')
+       xpk_exit(return_code)
+     args.filter_by_job = args.wait_for_job_completion
+
+   return_code, return_value = get_workload_list(args)
+
+   if return_code != 0:
+     xpk_print(f'List Job request returned ERROR {return_code}')
+     xpk_exit(return_code)
+   xpk_print(f'Workload List Output:\n{return_value}')
+   xpk_exit(0)
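
A note on the JobSet templates in this module: they are ordinary Python format strings, not f-strings, so placeholders like {args.workload} and {system.vms_per_slice} rely on str.format's attribute-access syntax and are only resolved when workload_create calls .format(args=args, system=system, ...). A minimal, self-contained sketch of that mechanism (the names below are illustrative stand-ins, not xpk objects):

from types import SimpleNamespace

# Stand-ins for xpk's parsed CLI args and system characteristics.
args = SimpleNamespace(workload='demo-job')
system = SimpleNamespace(vms_per_slice=4)

template = """metadata:
  name: {args.workload}
spec:
  parallelism: {system.vms_per_slice}
"""

# str.format supports dotted attribute lookup inside replacement fields.
print(template.format(args=args, system=system))
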
xpk/core/__init__.py ADDED
@@ -0,0 +1,15 @@
+ """
+ Copyright 2024 Google LLC
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+     https://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ """