xpk 0.7.2__py3-none-any.whl → 0.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. xpk/commands/batch.py +19 -13
  2. xpk/commands/cluster.py +240 -71
  3. xpk/commands/cluster_gcluster.py +22 -5
  4. xpk/commands/common.py +33 -1
  5. xpk/commands/info.py +2 -4
  6. xpk/commands/job.py +7 -8
  7. xpk/commands/kjob_common.py +30 -18
  8. xpk/commands/run.py +17 -12
  9. xpk/commands/shell.py +3 -4
  10. xpk/commands/storage.py +75 -19
  11. xpk/commands/workload.py +161 -324
  12. xpk/core/blueprint/blueprint_definitions.py +2 -0
  13. xpk/core/blueprint/blueprint_generator.py +335 -45
  14. xpk/core/capacity.py +1 -0
  15. xpk/core/cluster.py +193 -12
  16. xpk/core/config.py +3 -1
  17. xpk/core/docker_manager.py +1 -1
  18. xpk/core/docker_resources.py +9 -21
  19. xpk/core/filestore.py +5 -1
  20. xpk/core/gcsfuse.py +27 -6
  21. xpk/core/kjob.py +66 -20
  22. xpk/core/kueue.py +30 -0
  23. xpk/core/mtc.py +195 -0
  24. xpk/core/nap.py +4 -0
  25. xpk/core/network.py +34 -22
  26. xpk/core/nodepool.py +28 -26
  27. xpk/core/pathways.py +165 -210
  28. xpk/core/resources.py +21 -0
  29. xpk/core/scheduling.py +36 -0
  30. xpk/core/storage.py +66 -12
  31. xpk/core/system_characteristics.py +9 -0
  32. xpk/core/workload.py +28 -83
  33. xpk/core/workload_decorators/rdma_decorator.py +11 -15
  34. xpk/core/workload_decorators/storage_decorator.py +8 -3
  35. xpk/core/workload_decorators/tcpx_decorator.py +179 -0
  36. xpk/core/workload_decorators/tcpxo_decorator.py +17 -16
  37. xpk/parser/cluster.py +574 -381
  38. xpk/parser/storage.py +25 -5
  39. xpk/parser/workload.py +59 -31
  40. xpk/utils/kubectl.py +4 -1
  41. {xpk-0.7.2.dist-info → xpk-0.9.0.dist-info}/METADATA +192 -93
  42. {xpk-0.7.2.dist-info → xpk-0.9.0.dist-info}/RECORD +46 -44
  43. {xpk-0.7.2.dist-info → xpk-0.9.0.dist-info}/WHEEL +1 -1
  44. {xpk-0.7.2.dist-info → xpk-0.9.0.dist-info}/entry_points.txt +0 -0
  45. {xpk-0.7.2.dist-info → xpk-0.9.0.dist-info}/licenses/LICENSE +0 -0
  46. {xpk-0.7.2.dist-info → xpk-0.9.0.dist-info}/top_level.txt +0 -0
xpk/parser/cluster.py CHANGED
@@ -14,7 +14,10 @@ See the License for the specific language governing permissions and
14
14
  limitations under the License.
15
15
  """
16
16
 
17
+ from argparse import ArgumentParser
18
+
17
19
  from ..commands.cluster import (
20
+ cluster_adapt,
18
21
  cluster_cacheimage,
19
22
  cluster_create,
20
23
  cluster_create_pathways,
@@ -23,14 +26,14 @@ from ..commands.cluster import (
23
26
  cluster_describe,
24
27
  cluster_list,
25
28
  )
29
+ from ..commands.config import xpk_cfg
30
+ from ..core.config import CFG_BUCKET_KEY
26
31
  from ..core.vertex import DEFAULT_VERTEX_TENSORBOARD_NAME
27
32
  from .common import add_shared_arguments
28
33
  from .validators import name_type
29
- from ..commands.config import xpk_cfg
30
- from ..core.config import CFG_BUCKET_KEY
31
34
 
32
35
 
33
- def set_cluster_parser(cluster_parser):
36
+ def set_cluster_parser(cluster_parser: ArgumentParser):
34
37
  cluster_subcommands = cluster_parser.add_subparsers(
35
38
  title='cluster subcommands',
36
39
  dest='xpk_cluster_subcommands',
@@ -40,28 +43,54 @@ def set_cluster_parser(cluster_parser):
40
43
  ),
41
44
  )
42
45
 
43
- ### "cluster create" command parser ###
44
46
  cluster_create_parser = cluster_subcommands.add_parser(
45
47
  'create', help='Create cloud clusters.'
46
48
  )
47
- cluster_create_required_arguments = cluster_create_parser.add_argument_group(
48
- 'Required Arguments',
49
- 'Arguments required for cluster create.',
49
+ cluster_create_pathways_parser = cluster_subcommands.add_parser(
50
+ 'create-pathways',
51
+ help='Create Pathways-on-Cloud clusters.',
50
52
  )
51
- cluster_create_optional_arguments = cluster_create_parser.add_argument_group(
52
- 'Optional Arguments', 'Arguments optional for cluster create.'
53
+ cluster_create_ray_cluster_parser = cluster_subcommands.add_parser(
54
+ 'create-ray',
55
+ help='Create RayCluster',
53
56
  )
54
- cluster_create_capacity_arguments = cluster_create_parser.add_argument_group(
55
- 'Capacity Arguments', 'Arguments related to capacity for cluster create.'
57
+ cluster_delete_parser = cluster_subcommands.add_parser(
58
+ 'delete',
59
+ help='Delete cloud clusters.',
56
60
  )
57
- cluster_create_tensorboard_arguments = (
58
- cluster_create_parser.add_argument_group(
59
- 'Optional Vertex AI Tensorboard Arguments',
60
- 'Arguments for creating Vertex AI Tensorboard in cluster create.',
61
- )
61
+ cluster_cacheimage_parser = cluster_subcommands.add_parser(
62
+ 'cacheimage',
63
+ help='Cache image.',
64
+ )
65
+ cluster_describe_parser = cluster_subcommands.add_parser(
66
+ 'describe',
67
+ help='Describe a cluster.',
68
+ )
69
+ cluster_list_parser = cluster_subcommands.add_parser(
70
+ 'list', help='List cloud clusters.'
71
+ )
72
+ cluster_adapt_parser = cluster_subcommands.add_parser(
73
+ 'adapt', help='Adapt an existing cluster for XPK.'
62
74
  )
63
75
 
76
+ set_cluster_create_parser(cluster_create_parser)
77
+ set_cluster_create_pathways_parser(cluster_create_pathways_parser)
78
+ set_cluster_create_ray_parser(cluster_create_ray_cluster_parser)
79
+ set_cluster_delete_parser(cluster_delete_parser)
80
+ set_cluster_cacheimage_parser(cluster_cacheimage_parser)
81
+ set_cluster_describe_parser(cluster_describe_parser)
82
+ set_cluster_list_parser(cluster_list_parser)
83
+ set_cluster_adapt_parser(cluster_adapt_parser)
84
+
85
+
86
+ def set_cluster_create_parser(cluster_create_parser: ArgumentParser):
64
87
  ### Required arguments specific to "cluster create"
88
+ cluster_create_required_arguments = cluster_create_parser.add_argument_group(
89
+ 'Required Arguments', 'Arguments required for cluster create.'
90
+ )
91
+ add_shared_cluster_create_required_arguments(
92
+ cluster_create_required_arguments
93
+ )
65
94
 
66
95
  cluster_device_group = (
67
96
  cluster_create_required_arguments.add_mutually_exclusive_group(
@@ -85,6 +114,12 @@ def set_cluster_parser(cluster_parser):
85
114
  )
86
115
 
87
116
  ### Optional arguments specific to "cluster create"
117
+ cluster_create_optional_arguments = cluster_create_parser.add_argument_group(
118
+ 'Optional Arguments', 'Arguments optional for cluster create.'
119
+ )
120
+ add_shared_cluster_create_optional_arguments(
121
+ cluster_create_optional_arguments
122
+ )
88
123
  cluster_create_optional_arguments.add_argument(
89
124
  '--cluster-state-gcs-bucket',
90
125
  type=str,
@@ -107,111 +142,115 @@ def set_cluster_parser(cluster_parser):
107
142
  ' enable cluster to accept Pathways workloads.'
108
143
  ),
109
144
  )
110
- ### Autoprovisioning arguments specific to "cluster create"
111
- cluster_create_autoprovisioning_arguments = (
145
+
146
+ autoprovisioning_arguments = cluster_create_parser.add_argument_group(
147
+ 'Autoprovisioning Arguments',
148
+ 'Optional arguments for enabling autoprovisioning.',
149
+ )
150
+ add_autoprovisioning_arguments(autoprovisioning_arguments)
151
+
152
+ ### Capacity arguments specific to "cluster create"
153
+ cluster_create_capacity_arguments = cluster_create_parser.add_argument_group(
154
+ 'Capacity Arguments', 'Arguments related to capacity for cluster create.'
155
+ )
156
+ add_shared_cluster_create_capacity_arguments(
157
+ cluster_create_capacity_arguments
158
+ )
159
+
160
+ ### Tensorboard arguments specific to "cluster create"
161
+ cluster_create_tensorboard_arguments = (
112
162
  cluster_create_parser.add_argument_group(
113
- 'Optional Autoprovisioning Arguments',
114
- 'Arguments optional for enabling autoprovisioning.',
163
+ 'Optional Vertex AI Tensorboard Arguments',
164
+ 'Arguments for creating Vertex AI Tensorboard in cluster create.',
115
165
  )
116
166
  )
117
- cluster_create_autoprovisioning_arguments.add_argument(
118
- '--enable-autoprovisioning',
119
- action='store_true',
120
- help=(
121
- 'Enable GKE features for autoprovisioning node pools in GKE clusters.'
122
- ),
167
+ add_shared_cluster_create_tensorboard_arguments(
168
+ cluster_create_tensorboard_arguments
123
169
  )
124
- cluster_create_autoprovisioning_arguments.add_argument(
125
- '--autoprovisioning-min-chips',
126
- type=int,
127
- help=(
128
- 'Optionally set the minimum autoprovisioning accelerator resources in'
129
- ' units of chips.By default, autoprovisioning will use the number of'
130
- ' resources in the cluster as the minimum, and maximum.'
131
- ),
132
- )
133
- cluster_create_autoprovisioning_arguments.add_argument(
134
- '--autoprovisioning-max-chips',
135
- type=int,
136
- help=(
137
- 'Optionally set the maximum autoprovisioning accelerator resources in'
138
- ' units of chips.By default, autoprovisioning will use the number of'
139
- ' resources in the cluster as the minimum, and maximum.'
140
- ),
170
+
171
+ ### MTC arguments specific to "cluster create"
172
+ cluster_create_mtc_arguments = cluster_create_parser.add_argument_group(
173
+ 'Optional MTC Arguments',
174
+ 'Arguments for configuring MTC in cluster create.',
141
175
  )
176
+ add_shared_cluster_create_mtc_arguments(cluster_create_mtc_arguments)
177
+ cluster_create_parser.set_defaults(func=cluster_create)
142
178
 
143
- ### "cluster create-pathways" command parser ###
144
179
 
145
- cluster_create_pathways_parser = cluster_subcommands.add_parser(
146
- 'create-pathways',
147
- help='Create Pathways-on-Cloud clusters.',
148
- )
180
+ def set_cluster_create_pathways_parser(
181
+ cluster_create_pathways_parser: ArgumentParser,
182
+ ):
183
+ ### Required arguments specific to "cluster create-pathways"
149
184
  cluster_create_pathways_required_arguments = (
150
185
  cluster_create_pathways_parser.add_argument_group(
151
186
  'Required Arguments',
152
187
  'Arguments required for cluster create-pathways.',
153
188
  )
154
189
  )
190
+ add_shared_cluster_create_required_arguments(
191
+ cluster_create_pathways_required_arguments
192
+ )
193
+ cluster_create_pathways_required_arguments.add_argument(
194
+ '--tpu-type',
195
+ type=str,
196
+ default=None,
197
+ help='The tpu type to use, v5litepod-16, etc.',
198
+ )
199
+
200
+ ### Optional arguments specific to "cluster create-pathways"
155
201
  cluster_create_pathways_optional_arguments = (
156
202
  cluster_create_pathways_parser.add_argument_group(
157
203
  'Optional Arguments',
158
204
  'Arguments optional for cluster create-pathways.',
159
205
  )
160
206
  )
207
+ add_shared_cluster_create_optional_arguments(
208
+ cluster_create_pathways_optional_arguments
209
+ )
210
+
211
+ ### Capacity arguments specific to "cluster create-pathways"
161
212
  cluster_create_pathways_capacity_arguments = (
162
213
  cluster_create_pathways_parser.add_argument_group(
163
214
  'Capacity Arguments',
164
215
  'Arguments related to capacity for cluster create-pathways.',
165
216
  )
166
217
  )
167
- cluster_create_pathways_tensorboard_arguments = (
168
- cluster_create_pathways_parser.add_argument_group(
169
- 'Optional Vertex AI Tensorboard Arguments',
170
- 'Arguments for creating Vertex AI Tensorboard in cluster create.',
171
- )
218
+ add_shared_cluster_create_capacity_arguments(
219
+ cluster_create_pathways_capacity_arguments
172
220
  )
173
221
 
174
- ### Pathways required arguments specific to "cluster create"
175
- cluster_create_pathways_required_arguments.add_argument(
176
- '--tpu-type',
177
- type=str,
178
- default=None,
179
- help='The tpu type to use, v5litepod-16, etc.',
222
+ ### Tensorboard arguments specific to "cluster create-pathways"
223
+ cluster_create_pathways_tensorboard_arguments = cluster_create_pathways_parser.add_argument_group(
224
+ 'Optional Vertex AI Tensorboard Arguments',
225
+ 'Arguments for creating Vertex AI Tensorboard in cluster'
226
+ ' create-pathways.',
180
227
  )
181
-
182
- ### "cluster create-ray" command parser
183
-
184
- cluster_create_ray_cluster_parser = cluster_subcommands.add_parser(
185
- 'create-ray',
186
- help='Create RayCluster',
187
- )
188
- cluster_create_ray_cluster_required_arguments = (
189
- cluster_create_ray_cluster_parser.add_argument_group(
190
- 'Required Arguments',
191
- 'Arguments required for cluster create-ray.',
192
- )
228
+ add_shared_cluster_create_tensorboard_arguments(
229
+ cluster_create_pathways_tensorboard_arguments
193
230
  )
194
- cluster_create_ray_cluster_optional_arguments = (
195
- cluster_create_ray_cluster_parser.add_argument_group(
196
- 'Optional Arguments',
197
- 'Arguments optional for cluster create-ray.',
231
+
232
+ ### MTC arguments specific to "cluster create"
233
+ cluster_create_mtc_arguments = (
234
+ cluster_create_pathways_parser.add_argument_group(
235
+ 'Optional MTC Arguments',
236
+ 'Arguments for configuring MTC in cluster create.',
198
237
  )
199
238
  )
200
- cluster_create_ray_cluster_capacity_arguments = (
201
- cluster_create_ray_cluster_parser.add_argument_group(
202
- 'Capacity Arguments',
203
- 'Arguments related to capacity for cluster create-ray.',
239
+ add_shared_cluster_create_mtc_arguments(cluster_create_mtc_arguments)
240
+ cluster_create_pathways_parser.set_defaults(func=cluster_create_pathways)
241
+
242
+
243
+ def set_cluster_create_ray_parser(cluster_create_ray_parser: ArgumentParser):
244
+ ### Required arguments specific to "cluster create-ray"
245
+ cluster_create_ray_required_arguments = (
246
+ cluster_create_ray_parser.add_argument_group(
247
+ 'Required Arguments', 'Arguments required for cluster create-ray.'
204
248
  )
205
249
  )
206
- cluster_create_ray_cluster_tensorboard_arguments = (
207
- cluster_create_ray_cluster_parser.add_argument_group(
208
- 'Optional Vertex AI Tensorboard Arguments',
209
- 'Arguments for creating Vertex AI Tensorboard in cluster create.',
210
- )
250
+ add_shared_cluster_create_required_arguments(
251
+ cluster_create_ray_required_arguments
211
252
  )
212
-
213
- ### RayCluster required arguments specific to "cluster create"
214
- cluster_create_ray_cluster_required_arguments.add_argument(
253
+ cluster_create_ray_required_arguments.add_argument(
215
254
  '--tpu-type',
216
255
  type=str,
217
256
  default=None,
@@ -219,14 +258,24 @@ def set_cluster_parser(cluster_parser):
219
258
  required=True,
220
259
  )
221
260
  # TODO(bzmarke): Add --device-type to support GPU/CPU
222
- cluster_create_ray_cluster_required_arguments.add_argument(
261
+ cluster_create_ray_required_arguments.add_argument(
223
262
  '--ray-version',
224
263
  type=str,
225
264
  default=None,
226
265
  help="The Ray version to use, e.g. '2.38.0'",
227
266
  required=True,
228
267
  )
229
- cluster_create_ray_cluster_optional_arguments.add_argument(
268
+
269
+ ### Optional arguments specific to "cluster create-ray"
270
+ cluster_create_ray_optional_arguments = (
271
+ cluster_create_ray_parser.add_argument_group(
272
+ 'Optional Arguments', 'Arguments optional for cluster create-ray.'
273
+ )
274
+ )
275
+ add_shared_cluster_create_optional_arguments(
276
+ cluster_create_ray_optional_arguments
277
+ )
278
+ cluster_create_ray_optional_arguments.add_argument(
230
279
  '--enable-pathways',
231
280
  action='store_true',
232
281
  help=(
@@ -235,38 +284,38 @@ def set_cluster_parser(cluster_parser):
235
284
  ),
236
285
  )
237
286
 
238
- add_shared_cluster_create_required_arguments([
239
- cluster_create_required_arguments,
240
- cluster_create_pathways_required_arguments,
241
- cluster_create_ray_cluster_required_arguments,
242
- ])
243
- add_shared_cluster_create_optional_arguments([
244
- cluster_create_optional_arguments,
245
- cluster_create_pathways_optional_arguments,
246
- cluster_create_ray_cluster_optional_arguments,
247
- ])
248
- add_shared_cluster_create_capacity_arguments([
249
- cluster_create_capacity_arguments,
250
- cluster_create_pathways_capacity_arguments,
251
- cluster_create_ray_cluster_capacity_arguments,
252
- ])
253
- add_shared_cluster_create_tensorboard_arguments([
254
- cluster_create_tensorboard_arguments,
255
- cluster_create_pathways_tensorboard_arguments,
256
- cluster_create_ray_cluster_tensorboard_arguments,
257
- ])
287
+ ### Capacity arguments specific to "cluster create-ray"
288
+ cluster_create_ray_capacity_arguments = (
289
+ cluster_create_ray_parser.add_argument_group(
290
+ 'Capacity Arguments',
291
+ 'Arguments related to capacity for cluster create-ray.',
292
+ )
293
+ )
294
+ add_shared_cluster_create_capacity_arguments(
295
+ cluster_create_ray_capacity_arguments
296
+ )
258
297
 
259
- cluster_create_parser.set_defaults(func=cluster_create)
260
- cluster_create_pathways_parser.set_defaults(func=cluster_create_pathways)
261
- cluster_create_ray_cluster_parser.set_defaults(
262
- func=cluster_create_ray_cluster
298
+ ### Tensorboard arguments specific to "cluster create-ray"
299
+ cluster_create_ray_tensorboard_arguments = (
300
+ cluster_create_ray_parser.add_argument_group(
301
+ 'Optional Vertex AI Tensorboard Arguments',
302
+ 'Arguments for creating Vertex AI Tensorboard in cluster create-ray.',
303
+ )
304
+ )
305
+ add_shared_cluster_create_tensorboard_arguments(
306
+ cluster_create_ray_tensorboard_arguments
263
307
  )
264
308
 
265
- ### "cluster delete" command parser ###
266
- cluster_delete_parser = cluster_subcommands.add_parser(
267
- 'delete',
268
- help='Delete cloud clusters.',
309
+ ### MTC arguments specific to "cluster create"
310
+ cluster_create_mtc_arguments = cluster_create_ray_parser.add_argument_group(
311
+ 'Optional MTC Arguments',
312
+ 'Arguments for configuring MTC in cluster create.',
269
313
  )
314
+ add_shared_cluster_create_mtc_arguments(cluster_create_mtc_arguments)
315
+ cluster_create_ray_parser.set_defaults(func=cluster_create_ray_cluster)
316
+
317
+
318
+ def set_cluster_delete_parser(cluster_delete_parser: ArgumentParser):
270
319
  cluster_delete_required_arguments = cluster_delete_parser.add_argument_group(
271
320
  'Required Arguments',
272
321
  'Arguments required for cluster delete.',
@@ -293,31 +342,25 @@ def set_cluster_parser(cluster_parser):
293
342
  required=False,
294
343
  )
295
344
  add_shared_arguments(cluster_delete_optional_arguments)
296
- cluster_delete_parser.set_defaults(func=cluster_delete)
297
- cluster_delete_parser.add_argument(
345
+ cluster_delete_optional_arguments.add_argument(
298
346
  '--force',
299
347
  action='store_true',
300
348
  help=(
301
- 'Forces workload deletion command to run without additional approval.'
349
+ 'Forces cluster deletion command to run without additional approval.'
302
350
  ),
303
351
  )
304
352
 
305
- ### "cluster cacheimage" command parser ###
306
- cluster_cacheimage_parser = cluster_subcommands.add_parser(
307
- 'cacheimage',
308
- help='Cache image.',
309
- )
353
+ cluster_delete_parser.set_defaults(func=cluster_delete)
354
+
355
+
356
+ def set_cluster_cacheimage_parser(cluster_cacheimage_parser: ArgumentParser):
310
357
  cluster_cacheimage_required_arguments = (
311
358
  cluster_cacheimage_parser.add_argument_group(
312
359
  'Required Arguments',
313
360
  'Arguments required for cluster cacheimage.',
314
361
  )
315
362
  )
316
- cluster_cacheimage_optional_arguments = (
317
- cluster_cacheimage_parser.add_argument_group(
318
- 'Optional Arguments', 'Arguments optional for cluster cacheimage.'
319
- )
320
- )
363
+
321
364
  cluster_cacheimage_group = (
322
365
  cluster_cacheimage_parser.add_mutually_exclusive_group(required=True)
323
366
  )
@@ -356,6 +399,11 @@ def set_cluster_parser(cluster_parser):
356
399
  )
357
400
 
358
401
  ### Optional Arguments
402
+ cluster_cacheimage_optional_arguments = (
403
+ cluster_cacheimage_parser.add_argument_group(
404
+ 'Optional Arguments', 'Arguments optional for cluster cacheimage.'
405
+ )
406
+ )
359
407
  add_shared_arguments(cluster_cacheimage_optional_arguments)
360
408
  cluster_cacheimage_optional_arguments.add_argument(
361
409
  '--cache-key',
@@ -364,26 +412,18 @@ def set_cluster_parser(cluster_parser):
364
412
  help='The key to cache the docker image under.',
365
413
  required=False,
366
414
  )
415
+
367
416
  cluster_cacheimage_parser.set_defaults(func=cluster_cacheimage)
368
417
 
369
- ### "cluster describe" command parser ###
370
- cluster_describe_parser = cluster_subcommands.add_parser(
371
- 'describe',
372
- help='Describe a cluster.',
373
- )
418
+
419
+ def set_cluster_describe_parser(cluster_describe_parser: ArgumentParser):
420
+ ### Required arguments
374
421
  cluster_describe_required_arguments = (
375
422
  cluster_describe_parser.add_argument_group(
376
423
  'Required Arguments',
377
424
  'Arguments required for cluster describe.',
378
425
  )
379
426
  )
380
- cluster_describe_optional_arguments = (
381
- cluster_describe_parser.add_argument_group(
382
- 'Optional Arguments', 'Arguments optional for cluster describe.'
383
- )
384
- )
385
-
386
- ### Required arguments
387
427
  cluster_describe_required_arguments.add_argument(
388
428
  '--cluster',
389
429
  type=name_type,
@@ -391,272 +431,425 @@ def set_cluster_parser(cluster_parser):
391
431
  help='The name of the cluster to be describe.',
392
432
  required=True,
393
433
  )
434
+
394
435
  ### Optional Arguments
436
+ cluster_describe_optional_arguments = (
437
+ cluster_describe_parser.add_argument_group(
438
+ 'Optional Arguments', 'Arguments optional for cluster describe.'
439
+ )
440
+ )
395
441
  add_shared_arguments(cluster_describe_optional_arguments)
396
442
 
397
443
  cluster_describe_parser.set_defaults(func=cluster_describe)
398
444
 
399
- # "cluster list" command parser.
400
- cluster_list_parser = cluster_subcommands.add_parser(
401
- 'list', help='List cloud clusters.'
402
- )
445
+
446
+ def set_cluster_list_parser(cluster_list_parser: ArgumentParser):
447
+ ### Optional Arguments
403
448
  cluster_list_optional_arguments = cluster_list_parser.add_argument_group(
404
449
  'Optional Arguments', 'Arguments optional for cluster list.'
405
450
  )
406
- ### Optional Arguments
407
451
  add_shared_arguments(cluster_list_optional_arguments)
408
452
 
409
453
  cluster_list_parser.set_defaults(func=cluster_list)
410
454
 
411
455
 
412
- def add_shared_cluster_create_required_arguments(args_parsers):
456
+ def set_cluster_adapt_parser(cluster_adapt_parser: ArgumentParser):
457
+ cluster_adapt_required_arguments = cluster_adapt_parser.add_argument_group(
458
+ 'Required Arguments',
459
+ 'Arguments required for cluster adapt.',
460
+ )
461
+ add_shared_cluster_create_required_arguments(cluster_adapt_required_arguments)
462
+
463
+ cluster_adapt_device_group = (
464
+ cluster_adapt_required_arguments.add_mutually_exclusive_group(
465
+ required=True
466
+ )
467
+ )
468
+ cluster_adapt_device_group.add_argument(
469
+ '--tpu-type',
470
+ type=str,
471
+ default=None,
472
+ help='The tpu type used on cluster, v5litepod-16, etc.',
473
+ )
474
+ cluster_adapt_device_group.add_argument(
475
+ '--device-type',
476
+ type=str,
477
+ default=None,
478
+ help=(
479
+ 'The device type used on cluster (can be tpu or gpu or cpu), eg.'
480
+ ' h100-80gb-8, n2-standard-32-4 etc.'
481
+ ),
482
+ )
483
+
484
+ cluster_adapt_optional_arguments = cluster_adapt_parser.add_argument_group(
485
+ 'Optional Arguments',
486
+ 'Arguments optional for cluster adapt.',
487
+ )
488
+ cluster_adapt_optional_arguments.add_argument(
489
+ '--num-nodes',
490
+ type=int,
491
+ help='The number of nodes of a cluster.',
492
+ )
493
+ cluster_adapt_optional_arguments.add_argument(
494
+ '--enable-workload-identity',
495
+ action='store_true',
496
+ help='Enable Workload Identity Federation on the cluster and node-pools.',
497
+ )
498
+ cluster_adapt_optional_arguments.add_argument(
499
+ '--num-slices',
500
+ type=int,
501
+ default=1,
502
+ help='The number of slices to run the job on, defaults to 1.',
503
+ required=False,
504
+ )
505
+ add_driver_arguments(cluster_adapt_optional_arguments)
506
+ add_shared_arguments(cluster_adapt_optional_arguments)
507
+
508
+ cluster_adapt_capacity_arguments = cluster_adapt_parser.add_argument_group(
509
+ 'Capacity Arguments', 'Arguments related to capacity for cluster create.'
510
+ )
511
+ add_shared_cluster_create_capacity_arguments(cluster_adapt_capacity_arguments)
512
+
513
+ cluster_adapt_autoprovisioning_arguments = (
514
+ cluster_adapt_parser.add_argument_group(
515
+ 'Autoprovisioning Arguments',
516
+ 'Optional arguments for enabling autoprovisioning.',
517
+ )
518
+ )
519
+ add_autoprovisioning_arguments(cluster_adapt_autoprovisioning_arguments)
520
+
521
+ cluster_adapt_tensorboard_arguments = cluster_adapt_parser.add_argument_group(
522
+ 'Optional Vertex AI Tensorboard Arguments',
523
+ 'Arguments for creating Vertex AI Tensorboard in cluster adapt.',
524
+ )
525
+ add_shared_cluster_create_tensorboard_arguments(
526
+ cluster_adapt_tensorboard_arguments
527
+ )
528
+
529
+ cluster_adapt_parser.set_defaults(func=cluster_adapt)
530
+
531
+
532
+ def add_autoprovisioning_arguments(parser: ArgumentParser):
533
+ parser.add_argument(
534
+ '--enable-autoprovisioning',
535
+ action='store_true',
536
+ help=(
537
+ 'Enable GKE features for autoprovisioning node pools in GKE clusters.'
538
+ ),
539
+ )
540
+ parser.add_argument(
541
+ '--autoprovisioning-min-chips',
542
+ type=int,
543
+ help=(
544
+ 'Optionally set the minimum autoprovisioning accelerator resources in'
545
+ ' units of chips.By default, autoprovisioning will use the number of'
546
+ ' resources in the cluster as the minimum, and maximum.'
547
+ ),
548
+ )
549
+ parser.add_argument(
550
+ '--autoprovisioning-max-chips',
551
+ type=int,
552
+ help=(
553
+ 'Optionally set the maximum autoprovisioning accelerator resources in'
554
+ ' units of chips.By default, autoprovisioning will use the number of'
555
+ ' resources in the cluster as the minimum, and maximum.'
556
+ ),
557
+ )
558
+
559
+
560
+ def add_shared_cluster_create_required_arguments(parser: ArgumentParser):
413
561
  """Add shared required arguments in cluster create and Pathways cluster create.
414
562
 
415
563
  Args:
416
- List of cluster create required arguments parsers
564
+ parser: cluster create argument parser or argument group
417
565
  """
418
- for custom_parser in args_parsers:
419
- custom_parser.add_argument(
420
- '--cluster',
421
- type=name_type,
422
- default=None,
423
- help=(
424
- 'The name of the cluster. Will be used as the prefix for internal'
425
- ' objects in the cluster.'
426
- ),
427
- required=True,
428
- )
429
-
430
-
431
- def add_shared_cluster_create_optional_arguments(args_parsers):
566
+ parser.add_argument(
567
+ '--cluster',
568
+ type=name_type,
569
+ default=None,
570
+ help=(
571
+ 'The name of the cluster. Will be used as the prefix for internal'
572
+ ' objects in the cluster.'
573
+ ),
574
+ required=True,
575
+ )
576
+
577
+
578
+ def add_shared_cluster_create_optional_arguments(parser: ArgumentParser):
432
579
  """Add shared optional arguments in cluster create and Pathways cluster create.
433
580
 
434
581
  Args:
435
- List of cluster create optional arguments parsers
582
+ parser: cluster create argument parser or argument group
436
583
  """
437
- for custom_parser in args_parsers:
438
- add_shared_arguments(custom_parser)
439
- custom_parser.add_argument(
440
- '--host-maintenance-interval',
441
- type=str,
442
- choices=['AS_NEEDED', 'PERIODIC'],
443
- default='AS_NEEDED',
444
- help='The maintenance policy of the cluster and respective clusters.',
445
- )
446
- custom_parser.add_argument(
447
- '--gke-version',
448
- type=str,
449
- help=(
450
- 'The GKE version of the cluster and respective clusters. The'
451
- ' default is determined dynamically based on RAPID channel'
452
- ' recommended version.'
453
- ),
454
- )
455
- custom_parser.add_argument(
456
- '--num-slices',
457
- type=int,
458
- default=1,
459
- help='The number of slices to run the job on, defaults to 1.',
460
- required=False,
461
- )
462
- custom_parser.add_argument(
463
- '--pathways-gce-machine-type',
464
- type=str,
465
- default='n1-standard-32',
466
- help='The CPU type for Pathways CPU nodepools',
467
- )
468
- custom_parser.add_argument(
469
- '--default-pool-cpu-machine-type',
470
- type=str,
471
- default='e2-standard-16',
472
- help=(
473
- 'Set the machine type within the default cpu node pool. For'
474
- ' regional clusters, all zones must support the machine type.'
475
- ),
476
- )
477
- custom_parser.add_argument(
478
- '--cluster-cpu-machine-type',
479
- type=str,
480
- default='',
481
- help=(
482
- 'Getting deprecated soon! Please use'
483
- ' --default-pool-cpu-machine-typeinstead, to denote the machine'
484
- ' type of the default cpu node pool. Set the machine type of other'
485
- ' cpu nodepools using --device-type.'
486
- ),
487
- )
488
- custom_parser.add_argument(
489
- '--default-pool-cpu-num-nodes',
490
- type=int,
491
- default=6,
492
- help=(
493
- 'Set the number of nodes within the default cpu node pool. This is'
494
- ' set to 6 by default. Autoscaling is enabled to scale this value'
495
- ' over time.'
496
- ),
497
- )
498
- custom_parser.add_argument(
499
- '--custom-cluster-arguments',
500
- type=str,
501
- default='',
502
- help=(
503
- 'Users can add their own arguments to customize their cluster'
504
- ' create command. Do note, these will not override already used'
505
- ' cluster creation arguments. e.g.'
506
- " --custom-cluster-arguments='--network=mtu9k --subnetwork=mtu9k'"
507
- ),
508
- )
509
- custom_parser.add_argument(
510
- '--custom-nodepool-arguments',
511
- type=str,
512
- default='',
513
- help=(
514
- 'Users can add their own arguments to customize their node pool '
515
- ' create command. Do note, these will not override already used'
516
- ' node pool creation arguments. e.g.'
517
- ' --custom-nodepool-arguments="--disk-size=300"'
518
- ),
519
- )
520
- custom_parser.add_argument(
521
- '--force',
522
- action='store_true',
523
- help=(
524
- 'Forces node pool creation and delete commands to run without'
525
- ' additional approval.'
526
- ),
527
- )
528
- custom_parser.add_argument(
529
- '--custom-tpu-nodepool-arguments',
530
- type=str,
531
- default='',
532
- help=(
533
- 'DEPRECATING SOON! Please use --custom-nodepool-arguments to'
534
- ' customize node pool create command. Do note, these will not'
535
- ' override already used node pool creation arguments. Example usage'
536
- ' --custom-tpu-nodepool-arguments="--enable-ip-alias"'
537
- ),
538
- )
539
- custom_parser.add_argument(
540
- '--private',
541
- action='store_true',
542
- help=(
543
- 'Creates a private GKE cluster, a VPC-native cluster in which Nodes'
544
- ' and Pods are isolated from the internet. If set,'
545
- ' master_authorized_networks will also be enabled and access to the'
546
- " cluster's control plane will be restricted only to current"
547
- " machine's IP address unless more IP ranges are authorized by"
548
- ' providing --authorized-networks. This works only on creating new'
549
- ' clusters.'
550
- ),
551
- )
552
- custom_parser.add_argument(
553
- '--authorized-networks',
554
- action='extend',
555
- nargs='+',
556
- help=(
557
- 'Sets the provided cidrs as authorized IP ranges to access the'
558
- " private cluster's control plan. Access to the control plane will"
559
- " be provided to current machine's IP address even if"
560
- ' --authorized-networks is not set or it does not cover the IP'
561
- ' address. If set, --private is considered true and a private'
562
- ' cluster will be provisioned. It replaces existing authorized'
563
- ' networks if used with an existing private cluster.'
564
- ' Example usage: --authorized-networks 1.2.3.0/24 1.2.4.5/32'
565
- ),
566
- )
567
- custom_parser.add_argument(
568
- '--enable-workload-identity',
569
- action='store_true',
570
- help=(
571
- 'Enable Workload Identity Federation on the cluster and node-pools.'
572
- ),
573
- )
574
- custom_parser.add_argument(
575
- '--enable-gcsfuse-csi-driver',
576
- action='store_true',
577
- help=(
578
- 'Enable GSCFuse driver on the cluster. This enables Workload'
579
- ' Identity Federation. When using A3 ultra/A3 mega Workload'
580
- ' Identity is enabled by default.'
581
- ),
582
- )
583
-
584
- custom_parser.add_argument(
585
- '--enable-gcpfilestore-csi-driver',
586
- action='store_true',
587
- help=(
588
- 'Enable GCPFilestore driver on the cluster. This enables Workload'
589
- ' Identity Federation.'
590
- ),
591
- )
592
-
593
-
594
- def add_shared_cluster_create_tensorboard_arguments(args_parsers):
584
+ add_shared_arguments(parser)
585
+ parser.add_argument(
586
+ '--host-maintenance-interval',
587
+ type=str,
588
+ choices=['AS_NEEDED', 'PERIODIC'],
589
+ default='AS_NEEDED',
590
+ help='The maintenance policy of the cluster and respective clusters.',
591
+ )
592
+ parser.add_argument(
593
+ '--gke-version',
594
+ type=str,
595
+ help=(
596
+ 'The GKE version of the cluster and respective clusters. The'
597
+ ' default is determined dynamically based on RAPID channel'
598
+ ' recommended version.'
599
+ ),
600
+ )
601
+ parser.add_argument(
602
+ '--num-slices',
603
+ type=int,
604
+ default=1,
605
+ help='The number of slices to run the job on, defaults to 1.',
606
+ required=False,
607
+ )
608
+ parser.add_argument(
609
+ '--pathways-gce-machine-type',
610
+ type=str,
611
+ default='n2-standard-64',
612
+ help='The CPU type for Pathways CPU nodepools',
613
+ )
614
+ parser.add_argument(
615
+ '--default-pool-cpu-machine-type',
616
+ type=str,
617
+ default='e2-standard-16',
618
+ help=(
619
+ 'Set the machine type within the default cpu node pool. For'
620
+ ' regional clusters, all zones must support the machine type.'
621
+ ),
622
+ )
623
+ parser.add_argument(
624
+ '--cluster-cpu-machine-type',
625
+ type=str,
626
+ default='',
627
+ help=(
628
+ 'Getting deprecated soon! Please use'
629
+ ' --default-pool-cpu-machine-typeinstead, to denote the machine'
630
+ ' type of the default cpu node pool. Set the machine type of other'
631
+ ' cpu nodepools using --device-type.'
632
+ ),
633
+ )
634
+ parser.add_argument(
635
+ '--default-pool-cpu-num-nodes',
636
+ type=int,
637
+ default=6,
638
+ help=(
639
+ 'Set the number of nodes within the default cpu node pool. This is'
640
+ ' set to 6 by default. Autoscaling is enabled to scale this value'
641
+ ' over time.'
642
+ ),
643
+ )
644
+ parser.add_argument(
645
+ '--custom-cluster-arguments',
646
+ type=str,
647
+ default='',
648
+ help=(
649
+ 'Users can add their own arguments to customize their cluster'
650
+ ' create command. Do note, these will not override already used'
651
+ ' cluster creation arguments. e.g.'
652
+ " --custom-cluster-arguments='--network=mtu9k --subnetwork=mtu9k'"
653
+ ),
654
+ )
655
+ parser.add_argument(
656
+ '--custom-nodepool-arguments',
657
+ type=str,
658
+ default='',
659
+ help=(
660
+ 'Users can add their own arguments to customize their node pool '
661
+ ' create command. Do note, these will not override already used'
662
+ ' node pool creation arguments. e.g.'
663
+ ' --custom-nodepool-arguments="--disk-size=300"'
664
+ ),
665
+ )
666
+ parser.add_argument(
667
+ '--force',
668
+ action='store_true',
669
+ help=(
670
+ 'Forces node pool creation and delete commands to run without'
671
+ ' additional approval.'
672
+ ),
673
+ )
674
+ parser.add_argument(
675
+ '--custom-tpu-nodepool-arguments',
676
+ type=str,
677
+ default='',
678
+ help=(
679
+ 'DEPRECATING SOON! Please use --custom-nodepool-arguments to'
680
+ ' customize node pool create command. Do note, these will not'
681
+ ' override already used node pool creation arguments. Example usage'
682
+ ' --custom-tpu-nodepool-arguments="--enable-ip-alias"'
683
+ ),
684
+ )
685
+ parser.add_argument(
686
+ '--private',
687
+ action='store_true',
688
+ help=(
689
+ 'Creates a private GKE cluster, a VPC-native cluster in which Nodes'
690
+ ' and Pods are isolated from the internet. If set,'
691
+ ' master_authorized_networks will also be enabled and access to the'
692
+ " cluster's control plane will be restricted only to current"
693
+ " machine's IP address unless more IP ranges are authorized by"
694
+ ' providing --authorized-networks. This works only on creating new'
695
+ ' clusters.'
696
+ ),
697
+ )
698
+ parser.add_argument(
699
+ '--authorized-networks',
700
+ action='extend',
701
+ nargs='+',
702
+ help=(
703
+ 'Sets the provided cidrs as authorized IP ranges to access the'
704
+ " private cluster's control plan. Access to the control plane will"
705
+ " be provided to current machine's IP address even if"
706
+ ' --authorized-networks is not set or it does not cover the IP'
707
+ ' address. If set, --private is considered true and a private'
708
+ ' cluster will be provisioned. It replaces existing authorized'
709
+ ' networks if used with an existing private cluster.'
710
+ ' Example usage: --authorized-networks 1.2.3.0/24 1.2.4.5/32'
711
+ ),
712
+ )
713
+ parser.add_argument(
714
+ '--enable-workload-identity',
715
+ action='store_true',
716
+ help='Enable Workload Identity Federation on the cluster and node-pools.',
717
+ )
718
+ add_driver_arguments(parser)
719
+
720
+
721
def add_driver_arguments(parser: ArgumentParser):
  """Add the CSI storage driver flags to a parser.

  Registers four boolean switches (`store_true`, so each is False unless the
  flag is passed): --enable-gcsfuse-csi-driver,
  --enable-gcpfilestore-csi-driver, --enable-parallelstore-csi-driver and
  --enable-pd-csi-driver.

  Args:
      parser: argument parser to register the flags on.
  """
  parser.add_argument(
      '--enable-gcsfuse-csi-driver',
      action='store_true',
      help=(
          # The driver is "GCSFuse" (Cloud Storage FUSE); the help text used
          # to misspell it as "GSCFuse".
          'Enable GCSFuse driver on the cluster. This enables Workload'
          ' Identity Federation. When using A3 ultra/A3 mega Workload'
          ' Identity is enabled by default.'
      ),
  )
  parser.add_argument(
      '--enable-gcpfilestore-csi-driver',
      action='store_true',
      help='Enable GCPFilestore driver on the cluster.',
  )
  parser.add_argument(
      '--enable-parallelstore-csi-driver',
      action='store_true',
      help='Enable Parallelstore CSI driver on the cluster.',
  )
  parser.add_argument(
      '--enable-pd-csi-driver',
      action='store_true',
      help='Enable PersistentDisk CSI driver on the cluster.',
  )
746
+
747
+
748
def add_shared_cluster_create_tensorboard_arguments(parser: ArgumentParser):
  """Register the Vertex AI Tensorboard flags shared by cluster creation.

  The flags are common to cluster create and Pathways cluster create, which
  lets non-Pathways workloads use the tensorboard arguments on a Pathways
  cluster as well.

  Args:
      parser: cluster create argument parser or argument group
  """
  region_help = (
      'The region to create Vertex Tensorboard instance in. Visit'
      ' https://cloud.google.com/vertex-ai/docs/general/locations#available-regions'
      ' to view regions supported by Vertex AI. By default, Tensorboard'
      ' instance will be created in us-central1.'
  )
  name_help = (
      'The name of Vertex Tensorboard instance to create. If not'
      ' specified, a Tensorboard instance with the name'
      f' <cluster>-{DEFAULT_VERTEX_TENSORBOARD_NAME} will be'
      ' created.'
  )
  # (flag, add_argument keyword options), registered in declaration order.
  tensorboard_flags = (
      (
          '--create-vertex-tensorboard',
          {
              'action': 'store_true',
              'help': (
                  'Set this flag to create a Tensorboard instance in Vertex AI.'
              ),
          },
      ),
      (
          '--tensorboard-region',
          {'type': str, 'default': 'us-central1', 'help': region_help},
      ),
      (
          '--tensorboard-name',
          {'type': str, 'required': False, 'help': name_help},
      ),
  )
  for flag, options in tensorboard_flags:
    parser.add_argument(flag, **options)
783
+
784
+
785
def add_shared_cluster_create_capacity_arguments(parser: ArgumentParser):
  """Register the capacity-type flags shared by cluster creation.

  Adds --on-demand, --reservation and --spot, which are common to cluster
  create and Pathways cluster create.

  Args:
      parser: cluster create argument parser or argument group
  """
  on_demand_help = (
      'Sets node pool creation to use on-demand resources. '
      ' See `--reservation` or `--spot` for other capacity types.'
  )
  reservation_help = (
      'The reservation to be used for acquiring resources in the'
      ' cluster. This will attempt to find the provided reservation.'
      ' See `--spot` or `--on-demand` for other capacity types.'
  )
  spot_help = (
      'Sets node pool creation to use spot resources.'
      ' See `--reservation` or `--on-demand` for other capacity types.'
  )
  # (flag, add_argument keyword options), registered in declaration order.
  capacity_flags = (
      ('--on-demand', {'action': 'store_true', 'help': on_demand_help}),
      ('--reservation', {'type': str, 'help': reservation_help}),
      ('--spot', {'action': 'store_true', 'help': spot_help}),
  )
  for flag, options in capacity_flags:
    parser.add_argument(flag, **options)
816
+
817
+
818
def add_shared_cluster_create_mtc_arguments(parser: ArgumentParser):
  """Add shared Multi-tier Checkpointing arguments in cluster create and Pathways cluster create.

  Registers --enable-mtc (boolean switch) plus the string options
  --mtc-ramdisk-size, --mtc-gcs-bucket and --mtc-toleration-key, which all
  default to None. The "Required if --enable-mtc is true" wording in the help
  text is not enforced by the parser itself.

  Args:
      parser: cluster create argument parser or argument group
  """
  parser.add_argument(
      '--enable-mtc',
      action='store_true',
      help='Enable MTC on the cluster.',
  )
  parser.add_argument(
      '--mtc-ramdisk-size',
      type=str,
      default=None,
      help=(
          '(Required if --enable-mtc is true) The size of the RAM disk to be'
          ' used for multi-tier checkpointing. e.g. "64Mi" '
      ),
  )
  parser.add_argument(
      '--mtc-gcs-bucket',
      type=str,
      default=None,
      help=(
          '(Required if --enable-mtc is true) The GCS bucket to be used for'
          ' multi-tier checkpointing.'
      ),
  )
  parser.add_argument(
      '--mtc-toleration-key',
      type=str,
      # The parser default stays None; the "google.com/tpu" fallback named in
      # the help text is presumably applied by the consumer - TODO confirm.
      default=None,
      help=(
          # "toleration" matches the flag name; the help text used to say
          # "tolerance key".
          '(Optional) The toleration key to be used for multi-tier'
          ' checkpointing. By default, it is set to "google.com/tpu".'
      ),
  )