xpk 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92) hide show
  1. xpk/__init__.py +15 -0
  2. xpk/api/__init__.py +15 -0
  3. xpk/api/storage_crd.yaml +52 -0
  4. xpk/commands/__init__.py +15 -0
  5. xpk/commands/batch.py +131 -0
  6. xpk/commands/cluster.py +808 -0
  7. xpk/commands/cluster_gcluster.py +269 -0
  8. xpk/commands/common.py +44 -0
  9. xpk/commands/config.py +29 -0
  10. xpk/commands/info.py +243 -0
  11. xpk/commands/inspector.py +357 -0
  12. xpk/commands/job.py +199 -0
  13. xpk/commands/kind.py +283 -0
  14. xpk/commands/kjob_common.py +44 -0
  15. xpk/commands/run.py +128 -0
  16. xpk/commands/shell.py +140 -0
  17. xpk/commands/storage.py +267 -0
  18. xpk/commands/version.py +27 -0
  19. xpk/commands/workload.py +889 -0
  20. xpk/core/__init__.py +15 -0
  21. xpk/core/blueprint/__init__.py +15 -0
  22. xpk/core/blueprint/blueprint_definitions.py +62 -0
  23. xpk/core/blueprint/blueprint_generator.py +708 -0
  24. xpk/core/capacity.py +185 -0
  25. xpk/core/cluster.py +564 -0
  26. xpk/core/cluster_private.py +200 -0
  27. xpk/core/commands.py +356 -0
  28. xpk/core/config.py +179 -0
  29. xpk/core/docker_container.py +225 -0
  30. xpk/core/docker_image.py +210 -0
  31. xpk/core/docker_manager.py +308 -0
  32. xpk/core/docker_resources.py +350 -0
  33. xpk/core/filestore.py +251 -0
  34. xpk/core/gcloud_context.py +196 -0
  35. xpk/core/gcluster_manager.py +176 -0
  36. xpk/core/gcsfuse.py +50 -0
  37. xpk/core/kjob.py +444 -0
  38. xpk/core/kueue.py +358 -0
  39. xpk/core/monitoring.py +134 -0
  40. xpk/core/nap.py +361 -0
  41. xpk/core/network.py +377 -0
  42. xpk/core/nodepool.py +581 -0
  43. xpk/core/pathways.py +377 -0
  44. xpk/core/ray.py +222 -0
  45. xpk/core/remote_state/__init__.py +15 -0
  46. xpk/core/remote_state/fuse_remote_state.py +99 -0
  47. xpk/core/remote_state/remote_state_client.py +38 -0
  48. xpk/core/resources.py +238 -0
  49. xpk/core/scheduling.py +253 -0
  50. xpk/core/storage.py +581 -0
  51. xpk/core/system_characteristics.py +1432 -0
  52. xpk/core/vertex.py +105 -0
  53. xpk/core/workload.py +341 -0
  54. xpk/core/workload_decorators/__init__.py +15 -0
  55. xpk/core/workload_decorators/rdma_decorator.py +129 -0
  56. xpk/core/workload_decorators/storage_decorator.py +52 -0
  57. xpk/core/workload_decorators/tcpxo_decorator.py +190 -0
  58. xpk/main.py +75 -0
  59. xpk/parser/__init__.py +15 -0
  60. xpk/parser/batch.py +43 -0
  61. xpk/parser/cluster.py +662 -0
  62. xpk/parser/common.py +259 -0
  63. xpk/parser/config.py +49 -0
  64. xpk/parser/core.py +135 -0
  65. xpk/parser/info.py +64 -0
  66. xpk/parser/inspector.py +65 -0
  67. xpk/parser/job.py +147 -0
  68. xpk/parser/kind.py +95 -0
  69. xpk/parser/run.py +47 -0
  70. xpk/parser/shell.py +59 -0
  71. xpk/parser/storage.py +316 -0
  72. xpk/parser/validators.py +39 -0
  73. xpk/parser/version.py +23 -0
  74. xpk/parser/workload.py +726 -0
  75. xpk/templates/__init__.py +15 -0
  76. xpk/templates/storage.yaml +13 -0
  77. xpk/utils/__init__.py +15 -0
  78. xpk/utils/console.py +55 -0
  79. xpk/utils/file.py +82 -0
  80. xpk/utils/gcs_utils.py +125 -0
  81. xpk/utils/kubectl.py +57 -0
  82. xpk/utils/network.py +168 -0
  83. xpk/utils/objects.py +88 -0
  84. xpk/utils/templates.py +28 -0
  85. xpk/utils/validation.py +80 -0
  86. xpk/utils/yaml.py +30 -0
  87. xpk-0.0.1.dist-info/LICENSE +202 -0
  88. xpk-0.0.1.dist-info/METADATA +1498 -0
  89. xpk-0.0.1.dist-info/RECORD +92 -0
  90. xpk-0.0.1.dist-info/WHEEL +5 -0
  91. xpk-0.0.1.dist-info/entry_points.txt +2 -0
  92. xpk-0.0.1.dist-info/top_level.txt +1 -0
xpk/parser/workload.py ADDED
@@ -0,0 +1,726 @@
1
+ """
2
+ Copyright 2024 Google LLC
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ https://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ """
16
+
17
+ from ..commands.workload import (
18
+ workload_create,
19
+ workload_create_pathways,
20
+ workload_delete,
21
+ workload_list,
22
+ )
23
+ from ..core.docker_image import DEFAULT_DOCKER_IMAGE, DEFAULT_SCRIPT_DIR
24
+ from .common import add_shared_arguments
25
+ from .validators import directory_path_type, name_type
26
+
27
+
28
def set_workload_parsers(workload_parser):
  """Register the `workload` subcommands and all of their arguments.

  Adds the `create`, `create-pathways`, `delete` and `list` subparsers to
  `workload_parser`, declares their arguments, and binds each subparser to its
  handler through `set_defaults(func=...)`.

  Args:
    workload_parser: parser that receives the workload subcommands (its
      `add_subparsers` method is called).
  """
  workload_subcommands = workload_parser.add_subparsers(
      title='workload subcommands',
      dest='xpk_workload_subcommands',
      help=(
          '`create`, `create-pathways`, `list` and `delete` workloads on'
          ' clusters'
      ),
  )

  # Texts and declarations that are identical for more than one subcommand
  # are defined once so the copies cannot drift apart.
  command_help = (
      'Main command to run on each VM. This script runs within the docker'
      ' container. Typically this looks like "--command=\'python3'
      ' train.py\'" but if your docker container is missing the'
      ' dependencies, it might look more like "--command=\'bash setup.sh &&'
      ' python3 train.py\'".'
  )
  base_docker_image_description = (
      'User supplies a base image or by default the image is set by xpk.'
      ' Xpk will add the `script_dir` to the base image creating an anonymous'
      ' docker image. These arguments are exclusive to `--docker-image`.'
  )
  docker_image_description = (
      '`--base-docker-image` is used by default. Set this argument if the'
      ' user wants the docker image to be used directly by the xpk workload.'
  )
  status_filter_kwargs = {
      'type': str,
      'default': 'EVERYTHING',
      'choices': [
          'EVERYTHING',
          'FINISHED',
          'RUNNING',
          'QUEUED',
          'FAILED',
          'SUCCESSFUL',
      ],
      'help': (
          'Filters the arguments based on status. Selected filters are listed'
          ' above. FAILED and SUCCESSFUL are sub-states of FINISHED.'
      ),
      'required': False,
  }

  # "workload create" command parser.
  workload_create_parser = workload_subcommands.add_parser(
      'create', help='Create a new job.'
  )
  # Groups are created in the order they should appear in `--help`.
  create_required = workload_create_parser.add_argument_group(
      'Workload Built-in Arguments',
      'Configure xpk to create a Workload for you.',
  )
  create_optional = workload_create_parser.add_argument_group(
      'Optional Arguments', 'Arguments optional for `workload create`.'
  )
  create_base_docker_image = workload_create_parser.add_argument_group(
      'Base Docker Image Arguments', base_docker_image_description
  )
  create_docker_image = workload_create_parser.add_argument_group(
      'Docker Image Arguments', docker_image_description
  )
  create_autoprovisioning = workload_create_parser.add_argument_group(
      'Optional Autoprovisioning Arguments',
      'Arguments for configuring autoprovisioning.',
  )
  create_tensorboard = workload_create_parser.add_argument_group(
      'Vertex Tensorboard Arguments',
      'Arguments for creating Vertex AI Experiment in workload create.',
  )

  ### "workload create" Required arguments
  create_required.add_argument(
      '--command',
      type=str,
      default=None,
      help=command_help,
      required=True,
  )
  # Exactly one of `--tpu-type` / `--device-type` must be supplied.
  workload_device_group = create_required.add_mutually_exclusive_group(
      required=True
  )
  workload_device_group.add_argument(
      '--tpu-type',
      type=str,
      default=None,
      help='The tpu type to use, v5litepod-16, etc.',
  )
  workload_device_group.add_argument(
      '--device-type',
      type=str,
      default=None,
      help=(
          'The device type to use (can be tpu or gpu or cpu), v5litepod-16,'
          ' h100-80gb-8, n2-standard-32-4 etc.'
      ),
  )

  ### "workload create" Optional arguments
  create_optional.add_argument(
      '--storage',
      action='append',
      default=[],
      help='Names of storages the workload uses',
  )
  create_optional.add_argument(
      '--num-nodes',
      type=int,
      default=1,
      help='The number of nodes to use, default=1.',
  )
  create_optional.add_argument(
      '--scheduler',
      type=str,
      default='default-scheduler',
      help=(
          'Which scheduler you want to use. Defaults to `default-scheduler`. If'
          ' your cluster is configured for high throughput scheduling, you'
          ' might want to use `gke.io/high-throughput-scheduler`. If your'
          ' cluster is configured for topology-aware scheduling, you might want'
          ' to use `gke.io/topology-aware-auto`.'
      ),
  )
  create_optional.add_argument(
      '--debug-dump-gcs',
      type=str,
      default=None,
      help=(
          'GCS bucket or a directory within a bucket, e.g gs://bucket/subdir, '
          'where debugging information such as HLO dumps are uploaded'
      ),
  )
  create_optional.add_argument(
      '--deploy-stacktrace-sidecar',
      action='store_true',
      help=(
          'Add this argument to deploy a sidecar container that will '
          'read the stack traces collected in /tmp/debugging directory '
          'and forward them to Cloud Logging for TPU workloads.'
      ),
  )
  create_optional.add_argument(
      '--use-pathways',
      action='store_true',
      help=(
          'Please use `xpk workload create-pathways` instead to'
          ' create Pathways workloads.'
      ),
  )

  # Autoprovisioning workload arguments
  create_autoprovisioning.add_argument(
      '--on-demand',
      action='store_true',
      help=(
          'Sets autoprovisioning to use on-demand resources for the workload'
          ' request. See `--reservation` or `--spot` for other capacity types.'
      ),
  )
  create_autoprovisioning.add_argument(
      '--reservation',
      type=str,
      help=(
          'Sets autoprovisioning to use reservation resources for the workload'
          ' request. This will attempt to find the provided reservation. See'
          ' `--spot` or `--on-demand` for other capacity types.'
      ),
  )
  create_autoprovisioning.add_argument(
      '--spot',
      action='store_true',
      help=(
          'Sets autoprovisioning to use spot resources.'
          ' See `--reservation` or `--on-demand` for other capacity types.'
      ),
  )

  # "workload create-pathways" command parser.
  workload_create_pathways_parser = workload_subcommands.add_parser(
      'create-pathways', help='Create a new job.'
  )
  pathways_required = workload_create_pathways_parser.add_argument_group(
      'Workload create-pathways Built-in Arguments',
      'Configure xpk to create a Pathways Workload for you.',
  )
  pathways_optional = workload_create_pathways_parser.add_argument_group(
      'Optional Arguments',
      'Arguments optional for `workload create-pathways`.',
  )
  pathways_base_docker_image = (
      workload_create_pathways_parser.add_argument_group(
          'Base Docker Image Arguments', base_docker_image_description
      )
  )
  pathways_docker_image = workload_create_pathways_parser.add_argument_group(
      'Docker Image Arguments', docker_image_description
  )
  pathways_tensorboard = workload_create_pathways_parser.add_argument_group(
      'Vertex Tensorboard Arguments',
      'Arguments for creating Vertex AI Experiment in workload create.',
  )

  ### "workload create-pathways" Required arguments, specific to Pathways
  pathways_required.add_argument(
      '--tpu-type',
      type=str,
      default=None,
      help='The tpu type to use, v5litepod-16, etc.',
  )

  ### "workload create-pathways" Optional arguments, specific to Pathways
  pathways_optional.add_argument(
      '--headless',
      action='store_true',
      help=(
          'Please provide this argument to create Pathways workloads in'
          ' headless mode. This arg can only be used in `xpk workload'
          ' create-pathways`.'
      ),
  )
  pathways_optional.add_argument(
      '--proxy-server-image',
      type=str,
      default=(
          'us-docker.pkg.dev/cloud-tpu-v2-images/pathways/proxy_server:latest'
      ),
      help=(
          'Please provide the proxy server image for Pathways. This arg can'
          ' only be used in `xpk workload create-pathways`.'
      ),
  )
  pathways_optional.add_argument(
      '--server-image',
      type=str,
      default='us-docker.pkg.dev/cloud-tpu-v2-images/pathways/server:latest',
      help=(
          'Please provide the server image for Pathways. This arg can only be'
          ' used in `xpk workload create-pathways`.'
      ),
  )
  pathways_optional.add_argument(
      '--pathways-gcs-location',
      type=str,
      default='gs://cloud-pathways-staging/tmp',
      help=(
          'Please provide the GCS location to store Pathways artifacts. This'
          ' arg can only be used in `xpk workload create-pathways`.'
      ),
  )
  pathways_optional.add_argument(
      '--command',
      type=str,
      default=None,
      help=command_help,
      required=False,
  )
  pathways_optional.add_argument(
      '--storage',
      action='append',
      default=[],
      help='Names of storages the workload uses',
  )
  pathways_optional.add_argument(
      '--custom-pathways-server-args',
      type=str,
      default=None,
      help=(
          'Provide custom Pathways server args as follows -'
          " --custom-pathways-server-args='--arg_1=xxx --arg2=yyy'"
      ),
      required=False,
  )
  pathways_optional.add_argument(
      '--custom-pathways-proxy-server-args',
      type=str,
      default=None,
      help=(
          'Provide custom Pathways proxy server args as follows -'
          " --custom-pathways-proxy-server-args='--arg_1=xxx --arg2=yyy'"
      ),
      required=False,
  )
  pathways_optional.add_argument(
      '--custom-pathways-worker-args',
      type=str,
      default=None,
      help=(
          'Provide custom Pathways worker args as follows -'
          " --custom-pathways-worker-args='--arg_1=xxx --arg2=yyy'"
      ),
      required=False,
  )

  # Arguments common to `create` and `create-pathways`.
  add_shared_workload_create_required_arguments(
      [create_required, pathways_required]
  )
  add_shared_workload_create_optional_arguments(
      [create_optional, pathways_optional]
  )
  add_shared_workload_create_env_arguments(
      [create_optional, pathways_optional]
  )
  add_shared_workload_base_docker_image_arguments(
      [create_base_docker_image, pathways_base_docker_image]
  )
  add_shared_workload_docker_image_arguments(
      [create_docker_image, pathways_docker_image]
  )
  add_shared_workload_create_tensorboard_arguments(
      [create_tensorboard, pathways_tensorboard]
  )

  # Set defaults for both workload create and workload create-pathways after
  # adding all shared args.
  workload_create_parser.set_defaults(func=workload_create)
  workload_create_pathways_parser.set_defaults(func=workload_create_pathways)

  # "workload delete" command parser.
  workload_delete_parser = workload_subcommands.add_parser(
      'delete', help='Delete job.'
  )
  delete_required = workload_delete_parser.add_argument_group(
      'Required Arguments',
      'Arguments required for `workload delete`.',
  )
  delete_optional = workload_delete_parser.add_argument_group(
      'Optional Arguments', 'Arguments optional for `workload delete`.'
  )
  add_shared_arguments(delete_optional)

  ### "workload delete" Required arguments
  delete_required.add_argument(
      '--cluster',
      type=name_type,
      default=None,
      help='The name of the cluster to delete the job on.',
      required=True,
  )
  ### "workload delete" Optional arguments
  delete_optional.add_argument(
      '--workload',
      type=name_type,
      default=None,
      help=(
          'The name of the workload to delete. If the workload is not'
          ' specified, all workloads will be deleted from the cluster.'
      ),
  )
  delete_optional.add_argument(
      '--filter-by-job',
      type=str,
      help=(
          'Filters the arguments based on job name. Provide a regex'
          ' expression to parse jobs that match the pattern or provide a job'
          ' name to delete a single job.'
      ),
  )
  delete_optional.add_argument('--filter-by-status', **status_filter_kwargs)
  delete_optional.add_argument(
      '--force',
      action='store_true',
      help=(
          'Forces workload deletion command to run without additional approval.'
      ),
  )

  workload_delete_parser.set_defaults(func=workload_delete)

  # "workload list" command parser.
  workload_list_parser = workload_subcommands.add_parser(
      'list', help='List jobs.'
  )
  workload_list_parser.add_argument(
      '--cluster',
      type=name_type,
      default=None,
      help='The name of the cluster to list jobs on.',
      required=True,
  )
  workload_list_parser.add_argument(
      '--filter-by-status', **status_filter_kwargs
  )
  workload_list_parser.add_argument(
      '--filter-by-job',
      type=str,
      help=(
          'Filters the arguments based on job name. Provide a regex'
          ' expression to parse jobs that match the pattern or provide a job'
          ' name to view a single job.'
      ),
      required=False,
  )

  list_wait_for_completion = workload_list_parser.add_argument_group(
      'Wait for Job Completion Arguments',
      'Arguments for waiting on the completion of a job.',
  )
  list_wait_for_completion.add_argument(
      '--wait-for-job-completion',
      type=str,
      default=None,
      help='The name of the job to wait on.',
      required=False,
  )
  list_wait_for_completion.add_argument(
      '--timeout',
      type=int,
      default=None,
      help=(
          'Amount of time to wait for job in seconds. Default is the max wait'
          ' time, 1 week.'
      ),
      required=False,
  )

  add_shared_arguments(workload_list_parser)

  workload_list_parser.set_defaults(func=workload_list)
499
+
500
+
501
def add_shared_workload_create_required_arguments(args_parsers):
  """Declare the required flags common to both workload create commands.

  Every parser in `args_parsers` receives `--workload` followed by
  `--cluster`; both are mandatory and validated with `name_type`.

  Args:
    args_parsers: iterable of parsers (or argument groups) for the required
      arguments of `workload create` and `workload create-pathways`.
  """
  # (flag, help text) pairs, in the order they are registered.
  required_flags = (
      ('--workload', 'The name of the workload to run.'),
      ('--cluster', 'The name of the cluster to run the job on.'),
  )
  for target in args_parsers:
    for flag, description in required_flags:
      target.add_argument(
          flag,
          type=name_type,
          default=None,
          help=description,
          required=True,
      )
522
+
523
+
524
def add_shared_workload_create_optional_arguments(args_parsers):
  """Add the optional arguments shared by both workload create commands.

  For each parser this first registers the project-wide shared arguments via
  `add_shared_arguments`, then the workload-specific optional flags below.

  Args:
    args_parsers: iterable of parsers (or argument groups) for the optional
      arguments of `workload create` and `workload create-pathways`.
  """
  for custom_parser in args_parsers:
    add_shared_arguments(custom_parser)
    custom_parser.add_argument(
        '--docker-name',
        type=str,
        default='jax-tpu',
        help=(
            'The name of the docker-image to use, default and typically'
            ' `jax-tpu`.'
        ),
    )
    custom_parser.add_argument(
        '--num-slices',
        type=int,
        default=1,
        help='The number of slices to use, default=1.',
    )
    custom_parser.add_argument(
        '--priority',
        type=str,
        default='medium',
        choices=['very-low', 'low', 'medium', 'high', 'very-high'],
        help=(
            'A priority, one of `very-low`, `low`, `medium`, `high` or'
            ' `very-high`. Defaults to `medium`.'
        ),
    )
    # NOTE: parsed as a string, not an int; kept as-is because consumers
    # outside this block receive the value in that form.
    custom_parser.add_argument(
        '--max-restarts',
        type=str,
        default='0',
        help=(
            'Maximum number of times the JobSet will be restarted upon failure.'
            ' Defaults to 0.'
        ),
    )
    custom_parser.add_argument(
        '--ttl-seconds-after-finished',
        type=int,
        default=12 * 60 * 60,  # 12 hours, expressed in seconds.
        help=(
            'Set the number of seconds to clean up finished Jobsets (either'
            ' Complete or Failed). This is by default set to 12 hours.'
        ),
    )
    # NOTE: also parsed as a string, same reasoning as `--max-restarts`.
    custom_parser.add_argument(
        '-tgps',
        '--termination-grace-period-seconds',
        type=str,
        default='30',
        help=(
            'Maximum wait time for a workload Pod to wrap up after a disruption'
            ' event or deletion request. Defaults to 30 seconds.'
        ),
    )
    custom_parser.add_argument(
        '--remote-python-sidecar-image',
        type=str,
        default=None,
        help='Remote Python sidecar server image.',
    )
    custom_parser.add_argument(
        '--enable-debug-logs',
        action='store_true',
        help=(
            'Set this flag to get verbose logging to investigate the issue in'
            ' the workload.'
        ),
    )
    custom_parser.add_argument(
        '--restart-on-exit-codes',
        type=str,
        default=None,
        help=(
            'Adding this argument specifies additional user-defined exit codes'
            ' that allow restarting the workload when --max-restarts is set to'
            ' a value greater than 0. By default, workloads restart on exit'
            ' codes 42 and 127-255. Any exit codes provided through this flag'
            ' will be included alongside the default codes for restarting'
            ' conditions.'
        ),
    )
    custom_parser.add_argument(
        '--ramdisk-directory',
        type=str,
        default='',
        help=(
            'The directory of the locally mounted RAM disk. This is only to'
            ' be used with the CSI driver provided by GKE.'
        ),
    )
621
+
622
+
623
def add_shared_workload_create_env_arguments(args_parsers):
  """Declare the environment flags common to both workload create commands.

  Each parser gets its own mutually exclusive group holding `--env-file` and
  the repeatable `--env`, so only one of the two can be used per invocation.

  Args:
    args_parsers: iterable of parsers (or argument groups) that receive the
      environment arguments.
  """
  env_file_help = (
      'Environment file to be applied to the container. This file should'
      ' use the syntax <variable>=value (which sets the variable to the'
      ' given value) or <variable> (which takes the value from the local'
      ' environment), and # for comments.'
  )
  env_help = (
      'Environment variable to set in the container environment. '
      'The format is <variable>=value'
  )
  for target in args_parsers:
    exclusive_env = target.add_mutually_exclusive_group()
    exclusive_env.add_argument(
        '--env-file', type=str, default=None, help=env_file_help
    )
    exclusive_env.add_argument(
        '--env', action='append', type=str, help=env_help
    )
651
+
652
+
653
def add_shared_workload_base_docker_image_arguments(args_parsers):
  """Declare the base-image flags common to both workload create commands.

  Each parser receives `--base-docker-image` (defaulting to
  `DEFAULT_DOCKER_IMAGE`) and `--script-dir` (defaulting to
  `DEFAULT_SCRIPT_DIR` and converted with `directory_path_type`).

  Args:
    args_parsers: iterable of parsers (or argument groups) that receive the
      base docker image arguments.
  """
  base_image_help = (
      'The base docker-image to use, default'
      f' {DEFAULT_DOCKER_IMAGE}. If using a custom docker image it'
      ' is typically addressed as gcr.io/${PROJECT}/${NAME}:latest.'
      ' This docker image will be used as a base image by default and'
      ' the `--script-dir` by default will be added to the image.'
  )
  script_dir_help = (
      'The local location of the directory to copy to the docker image'
      ' and run the main command from. Defaults to current working'
      ' directory.'
  )
  for target in args_parsers:
    target.add_argument(
        '--base-docker-image',
        type=str,
        default=DEFAULT_DOCKER_IMAGE,
        help=base_image_help,
    )
    target.add_argument(
        '--script-dir',
        type=directory_path_type,
        default=DEFAULT_SCRIPT_DIR,
        help=script_dir_help,
    )
682
+
683
+
684
def add_shared_workload_docker_image_arguments(args_parsers):
  """Add the `--docker-image` argument shared by both workload create commands.

  Args:
    args_parsers: iterable of parsers (or argument groups) that receive the
      docker image argument.
  """
  # The custom-image sentence uses the same wording as the
  # `--base-docker-image` help so the two read consistently.
  docker_image_help = (
      'The version of the docker-image to use. By default,'
      ' `--base-docker-image` is used. Set this argument if the user'
      ' wants the docker image to be used directly by the xpk workload. If'
      ' using a custom docker image it is typically addressed as'
      ' gcr.io/${PROJECT}/${NAME}:latest. This docker image will be used'
      ' directly by the xpk workload.'
  )
  for custom_parser in args_parsers:
    custom_parser.add_argument(
        '--docker-image',
        type=str,
        help=docker_image_help,
    )
703
+
704
+
705
def add_shared_workload_create_tensorboard_arguments(args_parsers):
  """Declare the Vertex Tensorboard flags common to both create commands.

  Each parser receives the boolean `--use-vertex-tensorboard` switch and the
  optional `--experiment-name` string.

  Args:
    args_parsers: iterable of parsers (or argument groups) that receive the
      tensorboard arguments.
  """
  experiment_name_help = (
      'The name of Vertex Experiment to create. '
      'If not specified, a Vertex Experiment with the name '
      '<cluster>-<workload> will be created.'
  )
  for target in args_parsers:
    target.add_argument(
        '--use-vertex-tensorboard',
        action='store_true',
        help='Set this flag to view workload data on Vertex Tensorboard.',
    )
    target.add_argument(
        '--experiment-name',
        type=str,
        required=False,
        help=experiment_name_help,
    )