xpk 0.8.0__py3-none-any.whl → 0.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
xpk/parser/cluster.py CHANGED
@@ -14,7 +14,10 @@ See the License for the specific language governing permissions and
14
14
  limitations under the License.
15
15
  """
16
16
 
17
+ from argparse import ArgumentParser
18
+
17
19
  from ..commands.cluster import (
20
+ cluster_adapt,
18
21
  cluster_cacheimage,
19
22
  cluster_create,
20
23
  cluster_create_pathways,
@@ -23,14 +26,14 @@ from ..commands.cluster import (
23
26
  cluster_describe,
24
27
  cluster_list,
25
28
  )
29
+ from ..commands.config import xpk_cfg
30
+ from ..core.config import CFG_BUCKET_KEY
26
31
  from ..core.vertex import DEFAULT_VERTEX_TENSORBOARD_NAME
27
32
  from .common import add_shared_arguments
28
33
  from .validators import name_type
29
- from ..commands.config import xpk_cfg
30
- from ..core.config import CFG_BUCKET_KEY
31
34
 
32
35
 
33
- def set_cluster_parser(cluster_parser):
36
+ def set_cluster_parser(cluster_parser: ArgumentParser):
34
37
  cluster_subcommands = cluster_parser.add_subparsers(
35
38
  title='cluster subcommands',
36
39
  dest='xpk_cluster_subcommands',
@@ -40,28 +43,54 @@ def set_cluster_parser(cluster_parser):
40
43
  ),
41
44
  )
42
45
 
43
- ### "cluster create" command parser ###
44
46
  cluster_create_parser = cluster_subcommands.add_parser(
45
47
  'create', help='Create cloud clusters.'
46
48
  )
47
- cluster_create_required_arguments = cluster_create_parser.add_argument_group(
48
- 'Required Arguments',
49
- 'Arguments required for cluster create.',
49
+ cluster_create_pathways_parser = cluster_subcommands.add_parser(
50
+ 'create-pathways',
51
+ help='Create Pathways-on-Cloud clusters.',
50
52
  )
51
- cluster_create_optional_arguments = cluster_create_parser.add_argument_group(
52
- 'Optional Arguments', 'Arguments optional for cluster create.'
53
+ cluster_create_ray_cluster_parser = cluster_subcommands.add_parser(
54
+ 'create-ray',
55
+ help='Create RayCluster',
53
56
  )
54
- cluster_create_capacity_arguments = cluster_create_parser.add_argument_group(
55
- 'Capacity Arguments', 'Arguments related to capacity for cluster create.'
57
+ cluster_delete_parser = cluster_subcommands.add_parser(
58
+ 'delete',
59
+ help='Delete cloud clusters.',
56
60
  )
57
- cluster_create_tensorboard_arguments = (
58
- cluster_create_parser.add_argument_group(
59
- 'Optional Vertex AI Tensorboard Arguments',
60
- 'Arguments for creating Vertex AI Tensorboard in cluster create.',
61
- )
61
+ cluster_cacheimage_parser = cluster_subcommands.add_parser(
62
+ 'cacheimage',
63
+ help='Cache image.',
64
+ )
65
+ cluster_describe_parser = cluster_subcommands.add_parser(
66
+ 'describe',
67
+ help='Describe a cluster.',
68
+ )
69
+ cluster_list_parser = cluster_subcommands.add_parser(
70
+ 'list', help='List cloud clusters.'
62
71
  )
72
+ cluster_adapt_parser = cluster_subcommands.add_parser(
73
+ 'adapt', help='Adapt an existing cluster for XPK.'
74
+ )
75
+
76
+ set_cluster_create_parser(cluster_create_parser)
77
+ set_cluster_create_pathways_parser(cluster_create_pathways_parser)
78
+ set_cluster_create_ray_parser(cluster_create_ray_cluster_parser)
79
+ set_cluster_delete_parser(cluster_delete_parser)
80
+ set_cluster_cacheimage_parser(cluster_cacheimage_parser)
81
+ set_cluster_describe_parser(cluster_describe_parser)
82
+ set_cluster_list_parser(cluster_list_parser)
83
+ set_cluster_adapt_parser(cluster_adapt_parser)
84
+
63
85
 
86
+ def set_cluster_create_parser(cluster_create_parser: ArgumentParser):
64
87
  ### Required arguments specific to "cluster create"
88
+ cluster_create_required_arguments = cluster_create_parser.add_argument_group(
89
+ 'Required Arguments', 'Arguments required for cluster create.'
90
+ )
91
+ add_shared_cluster_create_required_arguments(
92
+ cluster_create_required_arguments
93
+ )
65
94
 
66
95
  cluster_device_group = (
67
96
  cluster_create_required_arguments.add_mutually_exclusive_group(
@@ -85,6 +114,12 @@ def set_cluster_parser(cluster_parser):
85
114
  )
86
115
 
87
116
  ### Optional arguments specific to "cluster create"
117
+ cluster_create_optional_arguments = cluster_create_parser.add_argument_group(
118
+ 'Optional Arguments', 'Arguments optional for cluster create.'
119
+ )
120
+ add_shared_cluster_create_optional_arguments(
121
+ cluster_create_optional_arguments
122
+ )
88
123
  cluster_create_optional_arguments.add_argument(
89
124
  '--cluster-state-gcs-bucket',
90
125
  type=str,
@@ -108,111 +143,114 @@ def set_cluster_parser(cluster_parser):
108
143
  ),
109
144
  )
110
145
 
111
- ### Autoprovisioning arguments specific to "cluster create"
112
- cluster_create_autoprovisioning_arguments = (
146
+ autoprovisioning_arguments = cluster_create_parser.add_argument_group(
147
+ 'Autoprovisioning Arguments',
148
+ 'Optional arguments for enabling autoprovisioning.',
149
+ )
150
+ add_autoprovisioning_arguments(autoprovisioning_arguments)
151
+
152
+ ### Capacity arguments specific to "cluster create"
153
+ cluster_create_capacity_arguments = cluster_create_parser.add_argument_group(
154
+ 'Capacity Arguments', 'Arguments related to capacity for cluster create.'
155
+ )
156
+ add_shared_cluster_create_capacity_arguments(
157
+ cluster_create_capacity_arguments
158
+ )
159
+
160
+ ### Tensorboard arguments specific to "cluster create"
161
+ cluster_create_tensorboard_arguments = (
113
162
  cluster_create_parser.add_argument_group(
114
- 'Optional Autoprovisioning Arguments',
115
- 'Arguments optional for enabling autoprovisioning.',
163
+ 'Optional Vertex AI Tensorboard Arguments',
164
+ 'Arguments for creating Vertex AI Tensorboard in cluster create.',
116
165
  )
117
166
  )
118
- cluster_create_autoprovisioning_arguments.add_argument(
119
- '--enable-autoprovisioning',
120
- action='store_true',
121
- help=(
122
- 'Enable GKE features for autoprovisioning node pools in GKE clusters.'
123
- ),
167
+ add_shared_cluster_create_tensorboard_arguments(
168
+ cluster_create_tensorboard_arguments
124
169
  )
125
- cluster_create_autoprovisioning_arguments.add_argument(
126
- '--autoprovisioning-min-chips',
127
- type=int,
128
- help=(
129
- 'Optionally set the minimum autoprovisioning accelerator resources in'
130
- ' units of chips.By default, autoprovisioning will use the number of'
131
- ' resources in the cluster as the minimum, and maximum.'
132
- ),
133
- )
134
- cluster_create_autoprovisioning_arguments.add_argument(
135
- '--autoprovisioning-max-chips',
136
- type=int,
137
- help=(
138
- 'Optionally set the maximum autoprovisioning accelerator resources in'
139
- ' units of chips.By default, autoprovisioning will use the number of'
140
- ' resources in the cluster as the minimum, and maximum.'
141
- ),
170
+
171
+ ### MTC arguments specific to "cluster create"
172
+ cluster_create_mtc_arguments = cluster_create_parser.add_argument_group(
173
+ 'Optional MTC Arguments',
174
+ 'Arguments for configuring MTC in cluster create.',
142
175
  )
176
+ add_shared_cluster_create_mtc_arguments(cluster_create_mtc_arguments)
177
+ cluster_create_parser.set_defaults(func=cluster_create)
143
178
 
144
- ### "cluster create-pathways" command parser ###
145
179
 
146
- cluster_create_pathways_parser = cluster_subcommands.add_parser(
147
- 'create-pathways',
148
- help='Create Pathways-on-Cloud clusters.',
149
- )
180
+ def set_cluster_create_pathways_parser(
181
+ cluster_create_pathways_parser: ArgumentParser,
182
+ ):
183
+ ### Required arguments specific to "cluster create-pathways"
150
184
  cluster_create_pathways_required_arguments = (
151
185
  cluster_create_pathways_parser.add_argument_group(
152
186
  'Required Arguments',
153
187
  'Arguments required for cluster create-pathways.',
154
188
  )
155
189
  )
190
+ add_shared_cluster_create_required_arguments(
191
+ cluster_create_pathways_required_arguments
192
+ )
193
+ cluster_create_pathways_required_arguments.add_argument(
194
+ '--tpu-type',
195
+ type=str,
196
+ default=None,
197
+ help='The tpu type to use, v5litepod-16, etc.',
198
+ )
199
+
200
+ ### Optional arguments specific to "cluster create-pathways"
156
201
  cluster_create_pathways_optional_arguments = (
157
202
  cluster_create_pathways_parser.add_argument_group(
158
203
  'Optional Arguments',
159
204
  'Arguments optional for cluster create-pathways.',
160
205
  )
161
206
  )
207
+ add_shared_cluster_create_optional_arguments(
208
+ cluster_create_pathways_optional_arguments
209
+ )
210
+
211
+ ### Capacity arguments specific to "cluster create-pathways"
162
212
  cluster_create_pathways_capacity_arguments = (
163
213
  cluster_create_pathways_parser.add_argument_group(
164
214
  'Capacity Arguments',
165
215
  'Arguments related to capacity for cluster create-pathways.',
166
216
  )
167
217
  )
168
- cluster_create_pathways_tensorboard_arguments = (
169
- cluster_create_pathways_parser.add_argument_group(
170
- 'Optional Vertex AI Tensorboard Arguments',
171
- 'Arguments for creating Vertex AI Tensorboard in cluster create.',
172
- )
173
- )
174
-
175
- ### Pathways required arguments specific to "cluster create"
176
- cluster_create_pathways_required_arguments.add_argument(
177
- '--tpu-type',
178
- type=str,
179
- default=None,
180
- help='The tpu type to use, v5litepod-16, etc.',
218
+ add_shared_cluster_create_capacity_arguments(
219
+ cluster_create_pathways_capacity_arguments
181
220
  )
182
221
 
183
- ### "cluster create-ray" command parser
184
-
185
- cluster_create_ray_cluster_parser = cluster_subcommands.add_parser(
186
- 'create-ray',
187
- help='Create RayCluster',
222
+ ### Tensorboard arguments specific to "cluster create-pathways"
223
+ cluster_create_pathways_tensorboard_arguments = cluster_create_pathways_parser.add_argument_group(
224
+ 'Optional Vertex AI Tensorboard Arguments',
225
+ 'Arguments for creating Vertex AI Tensorboard in cluster'
226
+ ' create-pathways.',
188
227
  )
189
- cluster_create_ray_cluster_required_arguments = (
190
- cluster_create_ray_cluster_parser.add_argument_group(
191
- 'Required Arguments',
192
- 'Arguments required for cluster create-ray.',
193
- )
228
+ add_shared_cluster_create_tensorboard_arguments(
229
+ cluster_create_pathways_tensorboard_arguments
194
230
  )
195
- cluster_create_ray_cluster_optional_arguments = (
196
- cluster_create_ray_cluster_parser.add_argument_group(
197
- 'Optional Arguments',
198
- 'Arguments optional for cluster create-ray.',
231
+
232
+ ### MTC arguments specific to "cluster create"
233
+ cluster_create_mtc_arguments = (
234
+ cluster_create_pathways_parser.add_argument_group(
235
+ 'Optional MTC Arguments',
236
+ 'Arguments for configuring MTC in cluster create.',
199
237
  )
200
238
  )
201
- cluster_create_ray_cluster_capacity_arguments = (
202
- cluster_create_ray_cluster_parser.add_argument_group(
203
- 'Capacity Arguments',
204
- 'Arguments related to capacity for cluster create-ray.',
239
+ add_shared_cluster_create_mtc_arguments(cluster_create_mtc_arguments)
240
+ cluster_create_pathways_parser.set_defaults(func=cluster_create_pathways)
241
+
242
+
243
+ def set_cluster_create_ray_parser(cluster_create_ray_parser: ArgumentParser):
244
+ ### Required arguments specific to "cluster create-ray"
245
+ cluster_create_ray_required_arguments = (
246
+ cluster_create_ray_parser.add_argument_group(
247
+ 'Required Arguments', 'Arguments required for cluster create-ray.'
205
248
  )
206
249
  )
207
- cluster_create_ray_cluster_tensorboard_arguments = (
208
- cluster_create_ray_cluster_parser.add_argument_group(
209
- 'Optional Vertex AI Tensorboard Arguments',
210
- 'Arguments for creating Vertex AI Tensorboard in cluster create.',
211
- )
250
+ add_shared_cluster_create_required_arguments(
251
+ cluster_create_ray_required_arguments
212
252
  )
213
-
214
- ### RayCluster required arguments specific to "cluster create"
215
- cluster_create_ray_cluster_required_arguments.add_argument(
253
+ cluster_create_ray_required_arguments.add_argument(
216
254
  '--tpu-type',
217
255
  type=str,
218
256
  default=None,
@@ -220,14 +258,24 @@ def set_cluster_parser(cluster_parser):
220
258
  required=True,
221
259
  )
222
260
  # TODO(bzmarke): Add --device-type to support GPU/CPU
223
- cluster_create_ray_cluster_required_arguments.add_argument(
261
+ cluster_create_ray_required_arguments.add_argument(
224
262
  '--ray-version',
225
263
  type=str,
226
264
  default=None,
227
265
  help="The Ray version to use, e.g. '2.38.0'",
228
266
  required=True,
229
267
  )
230
- cluster_create_ray_cluster_optional_arguments.add_argument(
268
+
269
+ ### Optional arguments specific to "cluster create-ray"
270
+ cluster_create_ray_optional_arguments = (
271
+ cluster_create_ray_parser.add_argument_group(
272
+ 'Optional Arguments', 'Arguments optional for cluster create-ray.'
273
+ )
274
+ )
275
+ add_shared_cluster_create_optional_arguments(
276
+ cluster_create_ray_optional_arguments
277
+ )
278
+ cluster_create_ray_optional_arguments.add_argument(
231
279
  '--enable-pathways',
232
280
  action='store_true',
233
281
  help=(
@@ -236,38 +284,38 @@ def set_cluster_parser(cluster_parser):
236
284
  ),
237
285
  )
238
286
 
239
- add_shared_cluster_create_required_arguments([
240
- cluster_create_required_arguments,
241
- cluster_create_pathways_required_arguments,
242
- cluster_create_ray_cluster_required_arguments,
243
- ])
244
- add_shared_cluster_create_optional_arguments([
245
- cluster_create_optional_arguments,
246
- cluster_create_pathways_optional_arguments,
247
- cluster_create_ray_cluster_optional_arguments,
248
- ])
249
- add_shared_cluster_create_capacity_arguments([
250
- cluster_create_capacity_arguments,
251
- cluster_create_pathways_capacity_arguments,
252
- cluster_create_ray_cluster_capacity_arguments,
253
- ])
254
- add_shared_cluster_create_tensorboard_arguments([
255
- cluster_create_tensorboard_arguments,
256
- cluster_create_pathways_tensorboard_arguments,
257
- cluster_create_ray_cluster_tensorboard_arguments,
258
- ])
287
+ ### Capacity arguments specific to "cluster create-ray"
288
+ cluster_create_ray_capacity_arguments = (
289
+ cluster_create_ray_parser.add_argument_group(
290
+ 'Capacity Arguments',
291
+ 'Arguments related to capacity for cluster create-ray.',
292
+ )
293
+ )
294
+ add_shared_cluster_create_capacity_arguments(
295
+ cluster_create_ray_capacity_arguments
296
+ )
259
297
 
260
- cluster_create_parser.set_defaults(func=cluster_create)
261
- cluster_create_pathways_parser.set_defaults(func=cluster_create_pathways)
262
- cluster_create_ray_cluster_parser.set_defaults(
263
- func=cluster_create_ray_cluster
298
+ ### Tensorboard arguments specific to "cluster create-ray"
299
+ cluster_create_ray_tensorboard_arguments = (
300
+ cluster_create_ray_parser.add_argument_group(
301
+ 'Optional Vertex AI Tensorboard Arguments',
302
+ 'Arguments for creating Vertex AI Tensorboard in cluster create-ray.',
303
+ )
304
+ )
305
+ add_shared_cluster_create_tensorboard_arguments(
306
+ cluster_create_ray_tensorboard_arguments
264
307
  )
265
308
 
266
- ### "cluster delete" command parser ###
267
- cluster_delete_parser = cluster_subcommands.add_parser(
268
- 'delete',
269
- help='Delete cloud clusters.',
309
+ ### MTC arguments specific to "cluster create"
310
+ cluster_create_mtc_arguments = cluster_create_ray_parser.add_argument_group(
311
+ 'Optional MTC Arguments',
312
+ 'Arguments for configuring MTC in cluster create.',
270
313
  )
314
+ add_shared_cluster_create_mtc_arguments(cluster_create_mtc_arguments)
315
+ cluster_create_ray_parser.set_defaults(func=cluster_create_ray_cluster)
316
+
317
+
318
+ def set_cluster_delete_parser(cluster_delete_parser: ArgumentParser):
271
319
  cluster_delete_required_arguments = cluster_delete_parser.add_argument_group(
272
320
  'Required Arguments',
273
321
  'Arguments required for cluster delete.',
@@ -294,31 +342,25 @@ def set_cluster_parser(cluster_parser):
294
342
  required=False,
295
343
  )
296
344
  add_shared_arguments(cluster_delete_optional_arguments)
297
- cluster_delete_parser.set_defaults(func=cluster_delete)
298
- cluster_delete_parser.add_argument(
345
+ cluster_delete_optional_arguments.add_argument(
299
346
  '--force',
300
347
  action='store_true',
301
348
  help=(
302
- 'Forces workload deletion command to run without additional approval.'
349
+ 'Forces cluster deletion command to run without additional approval.'
303
350
  ),
304
351
  )
305
352
 
306
- ### "cluster cacheimage" command parser ###
307
- cluster_cacheimage_parser = cluster_subcommands.add_parser(
308
- 'cacheimage',
309
- help='Cache image.',
310
- )
353
+ cluster_delete_parser.set_defaults(func=cluster_delete)
354
+
355
+
356
+ def set_cluster_cacheimage_parser(cluster_cacheimage_parser: ArgumentParser):
311
357
  cluster_cacheimage_required_arguments = (
312
358
  cluster_cacheimage_parser.add_argument_group(
313
359
  'Required Arguments',
314
360
  'Arguments required for cluster cacheimage.',
315
361
  )
316
362
  )
317
- cluster_cacheimage_optional_arguments = (
318
- cluster_cacheimage_parser.add_argument_group(
319
- 'Optional Arguments', 'Arguments optional for cluster cacheimage.'
320
- )
321
- )
363
+
322
364
  cluster_cacheimage_group = (
323
365
  cluster_cacheimage_parser.add_mutually_exclusive_group(required=True)
324
366
  )
@@ -357,6 +399,11 @@ def set_cluster_parser(cluster_parser):
357
399
  )
358
400
 
359
401
  ### Optional Arguments
402
+ cluster_cacheimage_optional_arguments = (
403
+ cluster_cacheimage_parser.add_argument_group(
404
+ 'Optional Arguments', 'Arguments optional for cluster cacheimage.'
405
+ )
406
+ )
360
407
  add_shared_arguments(cluster_cacheimage_optional_arguments)
361
408
  cluster_cacheimage_optional_arguments.add_argument(
362
409
  '--cache-key',
@@ -365,26 +412,18 @@ def set_cluster_parser(cluster_parser):
365
412
  help='The key to cache the docker image under.',
366
413
  required=False,
367
414
  )
415
+
368
416
  cluster_cacheimage_parser.set_defaults(func=cluster_cacheimage)
369
417
 
370
- ### "cluster describe" command parser ###
371
- cluster_describe_parser = cluster_subcommands.add_parser(
372
- 'describe',
373
- help='Describe a cluster.',
374
- )
418
+
419
+ def set_cluster_describe_parser(cluster_describe_parser: ArgumentParser):
420
+ ### Required arguments
375
421
  cluster_describe_required_arguments = (
376
422
  cluster_describe_parser.add_argument_group(
377
423
  'Required Arguments',
378
424
  'Arguments required for cluster describe.',
379
425
  )
380
426
  )
381
- cluster_describe_optional_arguments = (
382
- cluster_describe_parser.add_argument_group(
383
- 'Optional Arguments', 'Arguments optional for cluster describe.'
384
- )
385
- )
386
-
387
- ### Required arguments
388
427
  cluster_describe_required_arguments.add_argument(
389
428
  '--cluster',
390
429
  type=name_type,
@@ -392,280 +431,425 @@ def set_cluster_parser(cluster_parser):
392
431
  help='The name of the cluster to be describe.',
393
432
  required=True,
394
433
  )
434
+
395
435
  ### Optional Arguments
436
+ cluster_describe_optional_arguments = (
437
+ cluster_describe_parser.add_argument_group(
438
+ 'Optional Arguments', 'Arguments optional for cluster describe.'
439
+ )
440
+ )
396
441
  add_shared_arguments(cluster_describe_optional_arguments)
397
442
 
398
443
  cluster_describe_parser.set_defaults(func=cluster_describe)
399
444
 
400
- # "cluster list" command parser.
401
- cluster_list_parser = cluster_subcommands.add_parser(
402
- 'list', help='List cloud clusters.'
403
- )
445
+
446
+ def set_cluster_list_parser(cluster_list_parser: ArgumentParser):
447
+ ### Optional Arguments
404
448
  cluster_list_optional_arguments = cluster_list_parser.add_argument_group(
405
449
  'Optional Arguments', 'Arguments optional for cluster list.'
406
450
  )
407
- ### Optional Arguments
408
451
  add_shared_arguments(cluster_list_optional_arguments)
409
452
 
410
453
  cluster_list_parser.set_defaults(func=cluster_list)
411
454
 
412
455
 
413
- def add_shared_cluster_create_required_arguments(args_parsers):
456
+ def set_cluster_adapt_parser(cluster_adapt_parser: ArgumentParser):
457
+ cluster_adapt_required_arguments = cluster_adapt_parser.add_argument_group(
458
+ 'Required Arguments',
459
+ 'Arguments required for cluster adapt.',
460
+ )
461
+ add_shared_cluster_create_required_arguments(cluster_adapt_required_arguments)
462
+
463
+ cluster_adapt_device_group = (
464
+ cluster_adapt_required_arguments.add_mutually_exclusive_group(
465
+ required=True
466
+ )
467
+ )
468
+ cluster_adapt_device_group.add_argument(
469
+ '--tpu-type',
470
+ type=str,
471
+ default=None,
472
+ help='The tpu type used on cluster, v5litepod-16, etc.',
473
+ )
474
+ cluster_adapt_device_group.add_argument(
475
+ '--device-type',
476
+ type=str,
477
+ default=None,
478
+ help=(
479
+ 'The device type used on cluster (can be tpu or gpu or cpu), eg.'
480
+ ' h100-80gb-8, n2-standard-32-4 etc.'
481
+ ),
482
+ )
483
+
484
+ cluster_adapt_optional_arguments = cluster_adapt_parser.add_argument_group(
485
+ 'Optional Arguments',
486
+ 'Arguments optional for cluster adapt.',
487
+ )
488
+ cluster_adapt_optional_arguments.add_argument(
489
+ '--num-nodes',
490
+ type=int,
491
+ help='The number of nodes of a cluster.',
492
+ )
493
+ cluster_adapt_optional_arguments.add_argument(
494
+ '--enable-workload-identity',
495
+ action='store_true',
496
+ help='Enable Workload Identity Federation on the cluster and node-pools.',
497
+ )
498
+ cluster_adapt_optional_arguments.add_argument(
499
+ '--num-slices',
500
+ type=int,
501
+ default=1,
502
+ help='The number of slices to run the job on, defaults to 1.',
503
+ required=False,
504
+ )
505
+ add_driver_arguments(cluster_adapt_optional_arguments)
506
+ add_shared_arguments(cluster_adapt_optional_arguments)
507
+
508
+ cluster_adapt_capacity_arguments = cluster_adapt_parser.add_argument_group(
509
+ 'Capacity Arguments', 'Arguments related to capacity for cluster create.'
510
+ )
511
+ add_shared_cluster_create_capacity_arguments(cluster_adapt_capacity_arguments)
512
+
513
+ cluster_adapt_autoprovisioning_arguments = (
514
+ cluster_adapt_parser.add_argument_group(
515
+ 'Autoprovisioning Arguments',
516
+ 'Optional arguments for enabling autoprovisioning.',
517
+ )
518
+ )
519
+ add_autoprovisioning_arguments(cluster_adapt_autoprovisioning_arguments)
520
+
521
+ cluster_adapt_tensorboard_arguments = cluster_adapt_parser.add_argument_group(
522
+ 'Optional Vertex AI Tensorboard Arguments',
523
+ 'Arguments for creating Vertex AI Tensorboard in cluster adapt.',
524
+ )
525
+ add_shared_cluster_create_tensorboard_arguments(
526
+ cluster_adapt_tensorboard_arguments
527
+ )
528
+
529
+ cluster_adapt_parser.set_defaults(func=cluster_adapt)
530
+
531
+
532
+ def add_autoprovisioning_arguments(parser: ArgumentParser):
533
+ parser.add_argument(
534
+ '--enable-autoprovisioning',
535
+ action='store_true',
536
+ help=(
537
+ 'Enable GKE features for autoprovisioning node pools in GKE clusters.'
538
+ ),
539
+ )
540
+ parser.add_argument(
541
+ '--autoprovisioning-min-chips',
542
+ type=int,
543
+ help=(
544
+ 'Optionally set the minimum autoprovisioning accelerator resources in'
545
+ ' units of chips.By default, autoprovisioning will use the number of'
546
+ ' resources in the cluster as the minimum, and maximum.'
547
+ ),
548
+ )
549
+ parser.add_argument(
550
+ '--autoprovisioning-max-chips',
551
+ type=int,
552
+ help=(
553
+ 'Optionally set the maximum autoprovisioning accelerator resources in'
554
+ ' units of chips.By default, autoprovisioning will use the number of'
555
+ ' resources in the cluster as the minimum, and maximum.'
556
+ ),
557
+ )
558
+
559
+
560
+ def add_shared_cluster_create_required_arguments(parser: ArgumentParser):
414
561
  """Add shared required arguments in cluster create and Pathways cluster create.
415
562
 
416
563
  Args:
417
- List of cluster create required arguments parsers
564
+ parser: cluster create argument parser or argument group
418
565
  """
419
- for custom_parser in args_parsers:
420
- custom_parser.add_argument(
421
- '--cluster',
422
- type=name_type,
423
- default=None,
424
- help=(
425
- 'The name of the cluster. Will be used as the prefix for internal'
426
- ' objects in the cluster.'
427
- ),
428
- required=True,
429
- )
430
-
431
-
432
- def add_shared_cluster_create_optional_arguments(args_parsers):
566
+ parser.add_argument(
567
+ '--cluster',
568
+ type=name_type,
569
+ default=None,
570
+ help=(
571
+ 'The name of the cluster. Will be used as the prefix for internal'
572
+ ' objects in the cluster.'
573
+ ),
574
+ required=True,
575
+ )
576
+
577
+
578
+ def add_shared_cluster_create_optional_arguments(parser: ArgumentParser):
433
579
  """Add shared optional arguments in cluster create and Pathways cluster create.
434
580
 
435
581
  Args:
436
- List of cluster create optional arguments parsers
582
+ parser: cluster create argument parser or argument group
437
583
  """
438
- for custom_parser in args_parsers:
439
- add_shared_arguments(custom_parser)
440
- custom_parser.add_argument(
441
- '--host-maintenance-interval',
442
- type=str,
443
- choices=['AS_NEEDED', 'PERIODIC'],
444
- default='AS_NEEDED',
445
- help='The maintenance policy of the cluster and respective clusters.',
446
- )
447
- custom_parser.add_argument(
448
- '--gke-version',
449
- type=str,
450
- help=(
451
- 'The GKE version of the cluster and respective clusters. The'
452
- ' default is determined dynamically based on RAPID channel'
453
- ' recommended version.'
454
- ),
455
- )
456
- custom_parser.add_argument(
457
- '--num-slices',
458
- type=int,
459
- default=1,
460
- help='The number of slices to run the job on, defaults to 1.',
461
- required=False,
462
- )
463
- custom_parser.add_argument(
464
- '--pathways-gce-machine-type',
465
- type=str,
466
- default='n2-standard-64',
467
- help='The CPU type for Pathways CPU nodepools',
468
- )
469
- custom_parser.add_argument(
470
- '--default-pool-cpu-machine-type',
471
- type=str,
472
- default='e2-standard-16',
473
- help=(
474
- 'Set the machine type within the default cpu node pool. For'
475
- ' regional clusters, all zones must support the machine type.'
476
- ),
477
- )
478
- custom_parser.add_argument(
479
- '--cluster-cpu-machine-type',
480
- type=str,
481
- default='',
482
- help=(
483
- 'Getting deprecated soon! Please use'
484
- ' --default-pool-cpu-machine-typeinstead, to denote the machine'
485
- ' type of the default cpu node pool. Set the machine type of other'
486
- ' cpu nodepools using --device-type.'
487
- ),
488
- )
489
- custom_parser.add_argument(
490
- '--default-pool-cpu-num-nodes',
491
- type=int,
492
- default=6,
493
- help=(
494
- 'Set the number of nodes within the default cpu node pool. This is'
495
- ' set to 6 by default. Autoscaling is enabled to scale this value'
496
- ' over time.'
497
- ),
498
- )
499
- custom_parser.add_argument(
500
- '--custom-cluster-arguments',
501
- type=str,
502
- default='',
503
- help=(
504
- 'Users can add their own arguments to customize their cluster'
505
- ' create command. Do note, these will not override already used'
506
- ' cluster creation arguments. e.g.'
507
- " --custom-cluster-arguments='--network=mtu9k --subnetwork=mtu9k'"
508
- ),
509
- )
510
- custom_parser.add_argument(
511
- '--custom-nodepool-arguments',
512
- type=str,
513
- default='',
514
- help=(
515
- 'Users can add their own arguments to customize their node pool '
516
- ' create command. Do note, these will not override already used'
517
- ' node pool creation arguments. e.g.'
518
- ' --custom-nodepool-arguments="--disk-size=300"'
519
- ),
520
- )
521
- custom_parser.add_argument(
522
- '--force',
523
- action='store_true',
524
- help=(
525
- 'Forces node pool creation and delete commands to run without'
526
- ' additional approval.'
527
- ),
528
- )
529
- custom_parser.add_argument(
530
- '--custom-tpu-nodepool-arguments',
531
- type=str,
532
- default='',
533
- help=(
534
- 'DEPRECATING SOON! Please use --custom-nodepool-arguments to'
535
- ' customize node pool create command. Do note, these will not'
536
- ' override already used node pool creation arguments. Example usage'
537
- ' --custom-tpu-nodepool-arguments="--enable-ip-alias"'
538
- ),
539
- )
540
- custom_parser.add_argument(
541
- '--private',
542
- action='store_true',
543
- help=(
544
- 'Creates a private GKE cluster, a VPC-native cluster in which Nodes'
545
- ' and Pods are isolated from the internet. If set,'
546
- ' master_authorized_networks will also be enabled and access to the'
547
- " cluster's control plane will be restricted only to current"
548
- " machine's IP address unless more IP ranges are authorized by"
549
- ' providing --authorized-networks. This works only on creating new'
550
- ' clusters.'
551
- ),
552
- )
553
- custom_parser.add_argument(
554
- '--authorized-networks',
555
- action='extend',
556
- nargs='+',
557
- help=(
558
- 'Sets the provided cidrs as authorized IP ranges to access the'
559
- " private cluster's control plan. Access to the control plane will"
560
- " be provided to current machine's IP address even if"
561
- ' --authorized-networks is not set or it does not cover the IP'
562
- ' address. If set, --private is considered true and a private'
563
- ' cluster will be provisioned. It replaces existing authorized'
564
- ' networks if used with an existing private cluster.'
565
- ' Example usage: --authorized-networks 1.2.3.0/24 1.2.4.5/32'
566
- ),
567
- )
568
- custom_parser.add_argument(
569
- '--enable-workload-identity',
570
- action='store_true',
571
- help=(
572
- 'Enable Workload Identity Federation on the cluster and node-pools.'
573
- ),
574
- )
575
- custom_parser.add_argument(
576
- '--enable-gcsfuse-csi-driver',
577
- action='store_true',
578
- help=(
579
- 'Enable GSCFuse driver on the cluster. This enables Workload'
580
- ' Identity Federation. When using A3 ultra/A3 mega Workload'
581
- ' Identity is enabled by default.'
582
- ),
583
- )
584
- custom_parser.add_argument(
585
- '--enable-gcpfilestore-csi-driver',
586
- action='store_true',
587
- help='Enable GCPFilestore driver on the cluster.',
588
- )
589
-
590
- custom_parser.add_argument(
591
- '--enable-parallelstore-csi-driver',
592
- action='store_true',
593
- help='Enable Parallelstore CSI driver on the cluster.',
594
- )
595
-
596
- custom_parser.add_argument(
597
- '--enable-pd-csi-driver',
598
- action='store_true',
599
- help='Enable PersistentDisk CSI driver on the cluster.',
600
- )
601
-
602
-
603
- def add_shared_cluster_create_tensorboard_arguments(args_parsers):
584
+ add_shared_arguments(parser)
585
+ parser.add_argument(
586
+ '--host-maintenance-interval',
587
+ type=str,
588
+ choices=['AS_NEEDED', 'PERIODIC'],
589
+ default='AS_NEEDED',
590
+ help='The maintenance policy of the cluster and respective clusters.',
591
+ )
592
+ parser.add_argument(
593
+ '--gke-version',
594
+ type=str,
595
+ help=(
596
+ 'The GKE version of the cluster and respective clusters. The'
597
+ ' default is determined dynamically based on RAPID channel'
598
+ ' recommended version.'
599
+ ),
600
+ )
601
+ parser.add_argument(
602
+ '--num-slices',
603
+ type=int,
604
+ default=1,
605
+ help='The number of slices to run the job on, defaults to 1.',
606
+ required=False,
607
+ )
608
+ parser.add_argument(
609
+ '--pathways-gce-machine-type',
610
+ type=str,
611
+ default='n2-standard-64',
612
+ help='The CPU type for Pathways CPU nodepools',
613
+ )
614
+ parser.add_argument(
615
+ '--default-pool-cpu-machine-type',
616
+ type=str,
617
+ default='e2-standard-16',
618
+ help=(
619
+ 'Set the machine type within the default cpu node pool. For'
620
+ ' regional clusters, all zones must support the machine type.'
621
+ ),
622
+ )
623
+ parser.add_argument(
624
+ '--cluster-cpu-machine-type',
625
+ type=str,
626
+ default='',
627
+ help=(
628
+ 'Getting deprecated soon! Please use'
629
+ ' --default-pool-cpu-machine-type instead, to denote the machine'
630
+ ' type of the default cpu node pool. Set the machine type of other'
631
+ ' cpu nodepools using --device-type.'
632
+ ),
633
+ )
634
+ parser.add_argument(
635
+ '--default-pool-cpu-num-nodes',
636
+ type=int,
637
+ default=6,
638
+ help=(
639
+ 'Set the number of nodes within the default cpu node pool. This is'
640
+ ' set to 6 by default. Autoscaling is enabled to scale this value'
641
+ ' over time.'
642
+ ),
643
+ )
644
+ parser.add_argument(
645
+ '--custom-cluster-arguments',
646
+ type=str,
647
+ default='',
648
+ help=(
649
+ 'Users can add their own arguments to customize their cluster'
650
+ ' create command. Do note, these will not override already used'
651
+ ' cluster creation arguments. e.g.'
652
+ " --custom-cluster-arguments='--network=mtu9k --subnetwork=mtu9k'"
653
+ ),
654
+ )
655
+ parser.add_argument(
656
+ '--custom-nodepool-arguments',
657
+ type=str,
658
+ default='',
659
+ help=(
660
+ 'Users can add their own arguments to customize their node pool '
661
+ ' create command. Do note, these will not override already used'
662
+ ' node pool creation arguments. e.g.'
663
+ ' --custom-nodepool-arguments="--disk-size=300"'
664
+ ),
665
+ )
666
+ parser.add_argument(
667
+ '--force',
668
+ action='store_true',
669
+ help=(
670
+ 'Forces node pool creation and delete commands to run without'
671
+ ' additional approval.'
672
+ ),
673
+ )
674
+ parser.add_argument(
675
+ '--custom-tpu-nodepool-arguments',
676
+ type=str,
677
+ default='',
678
+ help=(
679
+ 'DEPRECATING SOON! Please use --custom-nodepool-arguments to'
680
+ ' customize node pool create command. Do note, these will not'
681
+ ' override already used node pool creation arguments. Example usage'
682
+ ' --custom-tpu-nodepool-arguments="--enable-ip-alias"'
683
+ ),
684
+ )
685
+ parser.add_argument(
686
+ '--private',
687
+ action='store_true',
688
+ help=(
689
+ 'Creates a private GKE cluster, a VPC-native cluster in which Nodes'
690
+ ' and Pods are isolated from the internet. If set,'
691
+ ' master_authorized_networks will also be enabled and access to the'
692
+ " cluster's control plane will be restricted only to current"
693
+ " machine's IP address unless more IP ranges are authorized by"
694
+ ' providing --authorized-networks. This works only on creating new'
695
+ ' clusters.'
696
+ ),
697
+ )
698
+ parser.add_argument(
699
+ '--authorized-networks',
700
+ action='extend',
701
+ nargs='+',
702
+ help=(
703
+ 'Sets the provided cidrs as authorized IP ranges to access the'
704
+ " private cluster's control plane. Access to the control plane will"
705
+ " be provided to current machine's IP address even if"
706
+ ' --authorized-networks is not set or it does not cover the IP'
707
+ ' address. If set, --private is considered true and a private'
708
+ ' cluster will be provisioned. It replaces existing authorized'
709
+ ' networks if used with an existing private cluster.'
710
+ ' Example usage: --authorized-networks 1.2.3.0/24 1.2.4.5/32'
711
+ ),
712
+ )
713
+ parser.add_argument(
714
+ '--enable-workload-identity',
715
+ action='store_true',
716
+ help='Enable Workload Identity Federation on the cluster and node-pools.',
717
+ )
718
+ add_driver_arguments(parser)
719
+
720
+
721
+ def add_driver_arguments(parser: ArgumentParser):
722
+ parser.add_argument(
723
+ '--enable-gcsfuse-csi-driver',
724
+ action='store_true',
725
+ help=(
726
+ 'Enable GCSFuse driver on the cluster. This enables Workload'
727
+ ' Identity Federation. When using A3 ultra/A3 mega Workload'
728
+ ' Identity is enabled by default.'
729
+ ),
730
+ )
731
+ parser.add_argument(
732
+ '--enable-gcpfilestore-csi-driver',
733
+ action='store_true',
734
+ help='Enable GCPFilestore driver on the cluster.',
735
+ )
736
+ parser.add_argument(
737
+ '--enable-parallelstore-csi-driver',
738
+ action='store_true',
739
+ help='Enable Parallelstore CSI driver on the cluster.',
740
+ )
741
+ parser.add_argument(
742
+ '--enable-pd-csi-driver',
743
+ action='store_true',
744
+ help='Enable PersistentDisk CSI driver on the cluster.',
745
+ )
746
+
747
+
748
+ def add_shared_cluster_create_tensorboard_arguments(parser: ArgumentParser):
604
749
  """Add shared tensorboard arguments in cluster create and Pathways cluster create.
605
750
  Note that this feature enables non-Pathways workloads to use tensorboard arguments
606
751
  on a Pathways cluster.
752
+
607
753
  Args:
608
- List of cluster create tensorboard arguments parsers
754
+ parser: cluster create argument parser or argument group
609
755
  """
610
- for custom_parser in args_parsers:
611
- custom_parser.add_argument(
612
- '--create-vertex-tensorboard',
613
- action='store_true',
614
- help='Set this flag to create a Tensorboard instance in Vertex AI.',
615
- )
616
- custom_parser.add_argument(
617
- '--tensorboard-region',
618
- type=str,
619
- default='us-central1',
620
- help=(
621
- 'The region to create Vertex Tensorboard instance in. Visit'
622
- ' https://cloud.google.com/vertex-ai/docs/general/locations#available-regions'
623
- ' to view regions supported by Vertex AI. By default, Tensorboard'
624
- ' instance will be created in us-central1.'
625
- ),
626
- )
627
- custom_parser.add_argument(
628
- '--tensorboard-name',
629
- type=str,
630
- required=False,
631
- help=(
632
- 'The name of Vertex Tensorboard instance to create. If not'
633
- ' specified, a Tensorboard instance with the name'
634
- f' <cluster>-{DEFAULT_VERTEX_TENSORBOARD_NAME} will be'
635
- ' created.'
636
- ),
637
- )
638
-
639
-
640
- def add_shared_cluster_create_capacity_arguments(args_parsers):
756
+ parser.add_argument(
757
+ '--create-vertex-tensorboard',
758
+ action='store_true',
759
+ help='Set this flag to create a Tensorboard instance in Vertex AI.',
760
+ )
761
+ parser.add_argument(
762
+ '--tensorboard-region',
763
+ type=str,
764
+ default='us-central1',
765
+ help=(
766
+ 'The region to create Vertex Tensorboard instance in. Visit'
767
+ ' https://cloud.google.com/vertex-ai/docs/general/locations#available-regions'
768
+ ' to view regions supported by Vertex AI. By default, Tensorboard'
769
+ ' instance will be created in us-central1.'
770
+ ),
771
+ )
772
+ parser.add_argument(
773
+ '--tensorboard-name',
774
+ type=str,
775
+ required=False,
776
+ help=(
777
+ 'The name of Vertex Tensorboard instance to create. If not'
778
+ ' specified, a Tensorboard instance with the name'
779
+ f' <cluster>-{DEFAULT_VERTEX_TENSORBOARD_NAME} will be'
780
+ ' created.'
781
+ ),
782
+ )
783
+
784
+
785
+ def add_shared_cluster_create_capacity_arguments(parser: ArgumentParser):
641
786
  """Add shared capacity arguments in cluster create and Pathways cluster create.
642
787
 
643
788
  Args:
644
- List of cluster create capacity arguments parsers
789
+ parser: cluster create argument parser or argument group
790
+ """
791
+ parser.add_argument(
792
+ '--on-demand',
793
+ action='store_true',
794
+ help=(
795
+ 'Sets node pool creation to use on-demand resources. '
796
+ ' See `--reservation` or `--spot` for other capacity types.'
797
+ ),
798
+ )
799
+ parser.add_argument(
800
+ '--reservation',
801
+ type=str,
802
+ help=(
803
+ 'The reservation to be used for acquiring resources in the'
804
+ ' cluster. This will attempt to find the provided reservation.'
805
+ ' See `--spot` or `--on-demand` for other capacity types.'
806
+ ),
807
+ )
808
+ parser.add_argument(
809
+ '--spot',
810
+ action='store_true',
811
+ help=(
812
+ 'Sets node pool creation to use spot resources.'
813
+ ' See `--reservation` or `--on-demand` for other capacity types.'
814
+ ),
815
+ )
816
+
817
+
818
+ def add_shared_cluster_create_mtc_arguments(parser: ArgumentParser):
819
+ """Add shared Multi-tier Checkpointing arguments in cluster create and Pathways cluster create.
820
+
821
+ Args:
822
+ parser: cluster create argument parser or argument group
645
823
  """
646
- for custom_parser in args_parsers:
647
- custom_parser.add_argument(
648
- '--on-demand',
649
- action='store_true',
650
- help=(
651
- 'Sets node pool creation to use on-demand resources. '
652
- ' See `--reservation` or `--spot` for other capacity types.'
653
- ),
654
- )
655
- custom_parser.add_argument(
656
- '--reservation',
657
- type=str,
658
- help=(
659
- 'The reservation to be used for acquiring resources in the'
660
- ' cluster. This will attempt to find the provided reservation.'
661
- ' See `--spot` or `--on-demand` for other capacity types.'
662
- ),
663
- )
664
- custom_parser.add_argument(
665
- '--spot',
666
- action='store_true',
667
- help=(
668
- 'Sets node pool creation to use spot resources.'
669
- ' See `--reservation` or `--on-demand` for other capacity types.'
670
- ),
671
- )
824
+ parser.add_argument(
825
+ '--enable-mtc',
826
+ action='store_true',
827
+ help='Enable MTC on the cluster.',
828
+ )
829
+ parser.add_argument(
830
+ '--mtc-ramdisk-size',
831
+ type=str,
832
+ default=None,
833
+ help=(
834
+ '(Required if --enable-mtc is true) The size of the RAM disk to be'
835
+ ' used for multi-tier checkpointing. e.g. "64Mi" '
836
+ ),
837
+ )
838
+ parser.add_argument(
839
+ '--mtc-gcs-bucket',
840
+ type=str,
841
+ default=None,
842
+ help=(
843
+ '(Required if --enable-mtc is true) The GCS bucket to be used for'
844
+ ' multi-tier checkpointing.'
845
+ ),
846
+ )
847
+ parser.add_argument(
848
+ '--mtc-toleration-key',
849
+ type=str,
850
+ default=None,
851
+ help=(
852
+ '(Optional) The tolerance key to be used for multi-tier'
853
+ ' checkpointing. By default, it is set to "google.com/tpu".'
854
+ ),
855
+ )