xpk 0.9.0__py3-none-any.whl → 0.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
xpk/core/kueue.py CHANGED
@@ -16,6 +16,7 @@ limitations under the License.
 
 from argparse import Namespace
 
+import math
 import packaging
 from packaging.version import Version
 
@@ -39,10 +40,12 @@ from .system_characteristics import (
     SystemCharacteristics,
 )
 
-KUEUE_VERSION = 'v0.10.0'
+KUEUE_VERSION = 'v0.12.2'
 CLUSTER_QUEUE_NAME = 'cluster-queue'
 LOCAL_QUEUE_NAME = 'multislice-queue'
 WAIT_FOR_KUEUE_TIMEOUT = '5m'
+MEMORY_SIZE_PER_VM = 1.2
+MIN_MEMORY_LIMIT_SIZE = 4096
 
 packaging.version.VERSION_PATTERN = r'^v\d+\.\d+\.\d+$'
 
@@ -69,6 +72,26 @@ spec:
   {machine_label}
   {topology_label}
 ---
+apiVersion: kueue.x-k8s.io/v1beta1
+kind: AdmissionCheck
+metadata:
+  name: dws-prov
+spec:
+  controllerName: kueue.x-k8s.io/provisioning-request
+  parameters:
+    apiGroup: kueue.x-k8s.io
+    kind: ProvisioningRequestConfig
+    name: dws-config
+---
+apiVersion: kueue.x-k8s.io/v1beta1
+kind: ProvisioningRequestConfig
+metadata:
+  name: dws-config
+spec:
+  provisioningClassName: queued-provisioning.gke.io
+  managedResources:
+  - {managed_resource}
+---
 {pw_resource_flavors}
 apiVersion: kueue.x-k8s.io/v1beta1
 kind: ClusterQueue
@@ -82,6 +105,7 @@ spec:
   resourceGroups:
   {covered_resources_config}
   {pw_resources_kueue}
+  {admission_checks}
 ---
 apiVersion: kueue.x-k8s.io/v1beta1
 kind: LocalQueue
@@ -166,6 +190,99 @@ spec:
         command: [ "sleep", "inf" ]
 """
 
+kueue_controller_manager_yml = """
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  labels:
+    app.kubernetes.io/component: controller
+    app.kubernetes.io/name: kueue
+    control-plane: controller-manager
+  name: kueue-controller-manager
+  namespace: kueue-system
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      control-plane: controller-manager
+  template:
+    metadata:
+      annotations:
+        kubectl.kubernetes.io/default-container: manager
+      labels:
+        app.kubernetes.io/component: controller
+        app.kubernetes.io/name: kueue
+        control-plane: controller-manager
+    spec:
+      containers:
+      - args:
+        - --config=/controller_manager_config.yaml
+        - --zap-log-level=2
+        command:
+        - /manager
+        image: registry.k8s.io/kueue/kueue:v0.10.0
+        imagePullPolicy: Always
+        livenessProbe:
+          httpGet:
+            path: /healthz
+            port: 8081
+          initialDelaySeconds: 15
+          periodSeconds: 20
+        name: manager
+        ports:
+        - containerPort: 8082
+          name: visibility
+          protocol: TCP
+        - containerPort: 9443
+          name: webhook-server
+          protocol: TCP
+        readinessProbe:
+          httpGet:
+            path: /readyz
+            port: 8081
+          initialDelaySeconds: 5
+          periodSeconds: 10
+        resources:
+          limits:
+            cpu: 500m
+            memory: {memory_limit_size}
+          requests:
+            cpu: 500m
+            memory: 512Mi
+        securityContext:
+          allowPrivilegeEscalation: false
+        volumeMounts:
+        - mountPath: /tmp/k8s-webhook-server/serving-certs
+          name: cert
+          readOnly: true
+        - mountPath: /controller_manager_config.yaml
+          name: manager-config
+          subPath: controller_manager_config.yaml
+      - args:
+        - --secure-listen-address=0.0.0.0:8443
+        - --upstream=http://127.0.0.1:8080/
+        - --logtostderr=true
+        - --v=10
+        image: registry.k8s.io/kubebuilder/kube-rbac-proxy:v0.16.0
+        name: kube-rbac-proxy
+        ports:
+        - containerPort: 8443
+          name: https
+          protocol: TCP
+        securityContext:
+          runAsNonRoot: true
+      serviceAccountName: kueue-controller-manager
+      terminationGracePeriodSeconds: 10
+      volumes:
+      - name: cert
+        secret:
+          defaultMode: 420
+          secretName: kueue-webhook-server-cert
+      - configMap:
+          name: kueue-manager-config
+        name: manager-config
+"""
+
 
 def verify_kueuectl(args: Namespace) -> None:
   """Verify if kueuectl is installed.
@@ -282,6 +399,7 @@ def install_kueue_crs(
     args,
     system: SystemCharacteristics,
     autoprovisioning_config: AutoprovisioningConfig | None,
+    flex_with_tpu=False,
 ) -> int:
   """Install Kueue Custom Resources.
 
@@ -309,6 +427,13 @@ def install_kueue_crs(
   else:
     # Determine total chips based on user specified topology.
     total_chips = get_total_chips_requested_from_args(args, system)
+  if args.flex and flex_with_tpu is False:
+    admission_checks = """
+  admissionChecks:
+  - dws-prov
+  """
+  else:
+    admission_checks = ''
 
   covered_resources_config = get_kueue_covered_resources_config(
       cluster_hardware_name=cluster_hardware_name,
@@ -322,7 +447,9 @@
       B200_DEVICE_TYPE,
   ]:
     topology_label = 'topologyName: "gke-default"'
-
+  res_type = AcceleratorTypeToAcceleratorCharacteristics[
+      system.accelerator_type
+  ].resource_type
   yml_string = cluster_set_crd_yaml.format(
       system=system,
       cluster_hardware_name=cluster_hardware_name,
@@ -334,11 +461,11 @@
       ),
       topology_label=topology_label,
       covered_resources_config=covered_resources_config,
-      resource_type=AcceleratorTypeToAcceleratorCharacteristics[
-          system.accelerator_type
-      ].resource_type,
+      resource_type=res_type,
       pw_resource_flavors=add_pw_resource_flavors(args),
       pw_resources_kueue=add_pw_resources_to_kueue(args),
+      admission_checks=admission_checks,
+      managed_resource=res_type,
       cluster_queue_name=CLUSTER_QUEUE_NAME,
       local_queue_name=LOCAL_QUEUE_NAME,
   )
@@ -386,3 +513,36 @@ def get_kueue_covered_resources_config(
       total_chips=total_chips,
   )
   return config_string
+
+
+def update_kueue_resources_if_necessary(args):
+  """Update the kueue manifest to increase the resources for the kueue controller manager.
+
+  Args:
+    args: user provided arguments for running the command.
+
+  Returns:
+    0 if successful and 1 otherwise.
+  """
+  # Get total number of nodes
+  cmd_total_node_num = 'kubectl get node --no-headers | wc -l'
+  return_code, out = run_command_for_value(
+      cmd_total_node_num, 'Count total nodes', args
+  )
+  if return_code != 0:
+    xpk_exit(1)
+  # 1.2MiB per VM or 4GiB (whichever is greater).
+  new_memory_limit = (
+      f'{max(math.ceil(int(out) * MEMORY_SIZE_PER_VM), MIN_MEMORY_LIMIT_SIZE)}Mi'
+  )
+  yml_string = kueue_controller_manager_yml.format(
+      memory_limit_size=new_memory_limit,
+  )
+  tmp = write_tmp_file(yml_string)
+  command = f'kubectl apply -f {str(tmp.file.name)}'
+
+  task = 'Updating Kueue Controller Manager resources'
+  return_code = run_command_with_updates_retry(command, task, args)
+  if return_code != 0:
+    xpk_print(f'{task} returned ERROR {return_code}')
+  return return_code
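Note: the new update_kueue_resources_if_necessary helper sizes the kueue-controller-manager memory limit from the cluster's node count using the two constants added above (MEMORY_SIZE_PER_VM = 1.2 MiB per VM, MIN_MEMORY_LIMIT_SIZE = 4096 MiB floor). A minimal standalone sketch of that arithmetic, not part of xpk and with an illustrative function name:

    import math

    MEMORY_SIZE_PER_VM = 1.2      # MiB budgeted per node
    MIN_MEMORY_LIMIT_SIZE = 4096  # 4 GiB floor, in MiB

    def kueue_manager_memory_limit(node_count: int) -> str:
      # Same formula as update_kueue_resources_if_necessary:
      # 1.2 MiB per VM, never below 4096 MiB.
      return f'{max(math.ceil(node_count * MEMORY_SIZE_PER_VM), MIN_MEMORY_LIMIT_SIZE)}Mi'

    print(kueue_manager_memory_limit(100))    # '4096Mi'  (floor applies)
    print(kueue_manager_memory_limit(10000))  # '12000Mi' (1.2 MiB x 10000 nodes)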
xpk/core/nodepool.py CHANGED
@@ -77,8 +77,12 @@ def run_gke_node_pool_create_command(
   if return_code > 0:
     xpk_print('Listing all reservations failed!')
     return_code = 1
+  if system.accelerator_type == AcceleratorType['TPU']:
+    max_nodes = system.vms_per_slice
+  else:
+    max_nodes = 1000
   capacity_args, return_code = get_capacity_arguments_from_capacity_type(
-      args, capacity_type
+      args, capacity_type, max_nodes
   )
   if return_code > 0:
     xpk_print('Parsing capacity arguments failed!')
@@ -275,7 +279,10 @@ def run_gke_node_pool_create_command(
   )
   if system.accelerator_type == AcceleratorType['TPU']:
     command += f' --node-version={gke_node_pool_version}'
-    command += f' --num-nodes={system.vms_per_slice}'
+    if capacity_type == CapacityType.FLEX_START:
+      command += ' --num-nodes=0'
+    else:
+      command += f' --num-nodes={system.vms_per_slice}'
     command += ' --placement-type=COMPACT --max-pods-per-node 15'
     command += (
         f' --scopes=storage-full,gke-default,{CLOUD_PLATFORM_AUTH_SCOPE_URL}'
@@ -284,7 +291,10 @@ def run_gke_node_pool_create_command(
     command += f' {args.custom_tpu_nodepool_arguments}'
   elif system.accelerator_type == AcceleratorType['GPU']:
     subnet_prefix = f'{args.cluster}-{zone_to_region(args.zone)}'
-    command += f' --num-nodes={args.num_nodes}'
+    if capacity_type == CapacityType.FLEX_START:
+      command += ' --num-nodes=0'
+    else:
+      command += f' --num-nodes={args.num_nodes}'
     command += (
         ' --accelerator'
         f' type={system.gke_accelerator},count={str(system.chips_per_vm)},gpu-driver-version=latest'
@@ -298,7 +308,10 @@ def run_gke_node_pool_create_command(
     )
     command += ' --max-pods-per-node=32'
   elif system.accelerator_type == AcceleratorType['CPU']:
-    command += f' --num-nodes={system.vms_per_slice}'
+    if capacity_type == CapacityType.FLEX_START:
+      command += ' --num-nodes=0'
+    else:
+      command += f' --num-nodes={system.vms_per_slice}'
     command += (
         f' --scopes=storage-full,gke-default,{CLOUD_PLATFORM_AUTH_SCOPE_URL}'
     )
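Note: all three accelerator branches above apply the same rule: a node pool created for DWS Flex Start capacity starts at zero nodes and is scaled up on demand, while other capacity types keep their usual node count. A minimal sketch of that shared decision, not xpk code; the stand-in enum and helper name are illustrative, with FLEX_START taken from the diff:

    from enum import Enum

    class CapacityType(Enum):  # stand-in for xpk's CapacityType referenced above
      ON_DEMAND = 'on_demand'
      RESERVATION = 'reservation'
      SPOT = 'spot'
      FLEX_START = 'flex_start'

    def num_nodes_flag(capacity_type: CapacityType, default_nodes: int) -> str:
      # Flex Start node pools are created empty; nodes are provisioned when the
      # workload's ProvisioningRequest is admitted.
      if capacity_type == CapacityType.FLEX_START:
        return ' --num-nodes=0'
      return f' --num-nodes={default_nodes}'

    assert num_nodes_flag(CapacityType.FLEX_START, 16) == ' --num-nodes=0'
    assert num_nodes_flag(CapacityType.RESERVATION, 16) == ' --num-nodes=16'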
xpk/core/pathways.py CHANGED
@@ -19,8 +19,7 @@ from ..core.docker_container import get_user_workload_container
 from ..core.gcloud_context import zone_to_region
 from ..core.nodepool import get_all_nodepools_programmatic
 from ..utils.console import xpk_exit, xpk_print
-from .config import AcceleratorType
-from .system_characteristics import SystemCharacteristics
+from .system_characteristics import AcceleratorType, SystemCharacteristics
 
 
 def add_pw_resource_flavors(args):
xpk/core/storage.py CHANGED
@@ -46,6 +46,7 @@ STORAGE_CRD_NAME = f"{XPK_API_GROUP_NAME}.{STORAGE_CRD_PLURAL}"
 GCS_FUSE_TYPE = "gcsfuse"
 GCP_FILESTORE_TYPE = "gcpfilestore"
 PARALLELSTORE_TYPE = "parallelstore"
+LUSTRE_TYPE = "lustre"
 GCE_PD_TYPE = "pd"
 MANIFESTS_PATH = os.path.abspath("xpkclusters/storage-manifests")
 GCS_FUSE_ANNOTATIONS = {
@@ -365,101 +366,6 @@ def get_storage_annotations(storages: list[Storage]) -> list[str]:
   return annotations
 
 
-def get_storage_volume_mounts_yaml(storages: list[Storage]) -> str:
-  """
-  Generates the YAML representation of the volumeMounts section for the given Storages.
-
-  This function creates the YAML snippet that defines how the storage volumes
-  should be mounted within a Pod's containers.
-
-  Args:
-    storages: A list of Storage objects.
-
-  Returns:
-    A string containing the YAML representation of the volumeMounts section.
-  """
-  yaml_str = ""
-  for storage in storages:
-    yaml_str += f"""- name: {storage.pv}
-  mountPath: {storage.mount_point}
-  readOnly: {storage.readonly}
-"""
-  return yaml_str
-
-
-def get_storage_volumes_yaml(storages: list[Storage]) -> str:
-  """
-  Generates the YAML representation of the volumes section for the given Storages.
-
-  This function creates the YAML snippet that defines the volumes to be
-  mounted in a Pod, including the PersistentVolumeClaim associated with
-  each Storage.
-
-  Args:
-    storages: A list of Storage objects.
-
-  Returns:
-    A string containing the YAML representation of the volumes section.
-  """
-  yaml_str = ""
-  for storage in storages:
-    yaml_str += f"""- name: {storage.pv}
-  persistentVolumeClaim:
-    claimName: {storage.pvc}
-  readOnly: {storage.readonly}
-"""
-  return yaml_str
-
-
-def get_storage_volume_mounts_for_gpu(
-    storages: list[Storage],
-) -> list[dict]:
-  """
-  Generates the YAML representation of the volumeMounts section for the given Storages.
-
-  This function creates the list of storage specifications that define how the storage volumes
-  should be mounted within a Pod's containers.
-
-  Args:
-    storages: A list of Storage objects.
-
-  Returns:
-    A list containing the dictionary representation of the volumeMounts section.
-  """
-  return [
-      {
-          "name": storage.pv,
-          "mountPath": storage.mount_point,
-          "readOnly": storage.readonly,
-      }
-      for storage in storages
-  ]
-
-
-def get_storage_volumes_yaml_for_gpu(storages: list[Storage]) -> str:
-  """
-  Generates the YAML representation of the volumes section for the given Storages.
-
-  This function creates the YAML snippet that defines the volumes to be
-  mounted in a Pod, including the PersistentVolumeClaim associated with
-  each Storage.
-
-  Args:
-    storages: A list of Storage objects.
-
-  Returns:
-    A string containing the YAML representation of the volumes section.
-  """
-  yaml_str = ""
-  for storage in storages:
-    yaml_str += f"""- name: {storage.pv}
-  persistentVolumeClaim:
-    claimName: {storage.pvc}
-  readOnly: {storage.readonly}
-"""
-  return yaml_str
-
-
 def get_storage_volumes_yaml_dict(storages: list[Storage]) -> list[dict]:
   vols = []
   for storage in storages:
@@ -1156,7 +1156,7 @@ UserFacingNameToSystemCharacteristics = {
         2,
         'tpu-v5-lite-podslice',
         'ct5lp-hightpu-4t',
-        8,
+        4,
         AcceleratorType['TPU'],
         'v5litepod-8',
     ),
xpk/core/workload.py CHANGED
@@ -14,18 +14,9 @@ See the License for the specific language governing permissions and
 limitations under the License.
 """
 
-import yaml
-
-from ..utils import templates
 from ..utils.console import xpk_exit, xpk_print
-from .capacity import H100_DEVICE_TYPE, H100_MEGA_DEVICE_TYPE
 from .commands import run_command_for_value
 from .gcloud_context import zone_to_region
-from .storage import Storage, get_storage_volume_mounts_for_gpu
-from .system_characteristics import SystemCharacteristics
-
-RXDM_CONTAINER_A3HIGH_PATH = '/../templates/rxdm_container_a3high.yaml'
-RXDM_CONTAINER_A3MEGA_PATH = '/../templates/rxdm_container_a3mega.yaml'
 
 
 def workload_list_awk_command(filter_key) -> str:
@@ -249,38 +240,3 @@ def wait_for_job_completion(args) -> int:
     xpk_print('Your workload did not complete successfully')
     return 125
   return 0
-
-
-def add_gpu_rxdm_container(
-    jobset_manifest_str: str,
-    system: SystemCharacteristics,
-    all_storages: list[Storage],
-) -> str:
-  """Add gpu rxdm container to jobset manifest based on user provided arguments.
-
-  Args:
-    jobset_manifest_str: the JobSet manifest as a YAML string.
-    system: system characteristics.
-    all_storages: list of all storages.
-
-  Returns:
-    str: the modified JobSet manifest as a YAML string.
-  """
-  if system.device_type == H100_DEVICE_TYPE:
-    gpu_rxdm_container = templates.load(RXDM_CONTAINER_A3HIGH_PATH)
-  elif system.device_type == H100_MEGA_DEVICE_TYPE:
-    gpu_rxdm_container = templates.load(RXDM_CONTAINER_A3MEGA_PATH)
-  else:
-    return jobset_manifest_str
-
-  storage_volume_mounts = get_storage_volume_mounts_for_gpu(all_storages)
-  gpu_rxdm_container['volumeMounts'].extend(storage_volume_mounts)
-
-  manifest = yaml.safe_load(jobset_manifest_str)
-
-  for job in manifest['spec']['replicatedJobs']:
-    job['template']['spec']['template']['spec']['containers'].append(
-        gpu_rxdm_container
-    )
-
-  return yaml.dump(manifest, sort_keys=False)
@@ -80,6 +80,8 @@ def add_annotations(job_manifest: dict, sub_networks: list[str]):
   """Adds or updates annotations in the Pod template."""
   annotations = job_manifest['spec']['template']['metadata']['annotations']
   interfaces_key, interfaces_value = get_interfaces_entry(sub_networks)
+  if annotations is None:
+    annotations = {}
   annotations.update({
       'networking.gke.io/default-interface': 'eth0',
       interfaces_key: interfaces_value,
@@ -25,7 +25,7 @@ tcpx = 'v2.0.11'
 def decorate_kjob_template(job_manifest: dict) -> dict:
   add_volumes(job_manifest)
   add_tolerations(job_manifest)
-  add_tcpxo_daemon_container(job_manifest)
+  add_tcpx_daemon_container(job_manifest)
   update_gpu_containers(job_manifest)
   return job_manifest
 
@@ -34,7 +34,7 @@ def decorate_job(job_manifest: dict) -> dict:
   add_annotations(job_manifest)
   add_volumes(job_manifest)
   add_tolerations(job_manifest)
-  add_tcpxo_daemon_container(job_manifest)
+  add_tcpx_daemon_container(job_manifest)
   update_gpu_containers(job_manifest)
   return job_manifest
 
@@ -131,10 +131,13 @@ def add_volumes(job_manifest: dict):
   })
   volumes.append({'name': 'sys', 'hostPath': {'path': '/sys'}})
   volumes.append({'name': 'proc-sys', 'hostPath': {'path': '/proc/sys'}})
+  volumes.append(
+      {'name': 'dshm', 'emptyDir': {'medium': 'Memory', 'sizeLimit': '128Gi'}}
+  )
 
 
-def add_tcpxo_daemon_container(job_manifest):
-  """Adds the tcpxo-daemon container to the Pod spec."""
+def add_tcpx_daemon_container(job_manifest):
+  """Adds the tcpx-daemon container to the Pod spec."""
   tcpxo_daemon_container = {
       'name': 'tcpx-daemon',
       'image': f'us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpx/tcpgpudmarxd-dev:{tcpx}',
@@ -177,3 +180,6 @@ def update_gpu_containers(job_manifest):
     volumeMounts.append(
         {'name': 'libraries', 'mountPath': '/usr/local/nvidia/lib64'}
     )
+    container['volumeMounts'].append(
+        {'name': 'dshm', 'mountPath': '/dev/shm'}
+    )
@@ -15,6 +15,7 @@ limitations under the License.
 """
 
 import yaml
+
 from ...utils.yaml import literal_string
 
 # Component version
@@ -141,6 +142,9 @@ def add_volumes(job_manifest):
       'name': 'aperture-devices',
       'hostPath': {'path': '/dev/aperture_devices'},
   })
+  volumes.append(
+      {'name': 'dshm', 'emptyDir': {'medium': 'Memory', 'sizeLimit': '128Gi'}}
+  )
 
 
 def add_tcpxo_daemon_container(job_manifest):
@@ -189,3 +193,6 @@ def update_gpu_containers(job_manifest):
     container['volumeMounts'].append(
         {'name': 'libraries', 'mountPath': '/usr/local/nvidia'}
     )
+    container['volumeMounts'].append(
+        {'name': 'dshm', 'mountPath': '/dev/shm'}
+    )
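Note: the decorator hunks above give GPU jobs a memory-backed /dev/shm in two steps: a 'dshm' emptyDir volume on the Pod plus a matching mount in each GPU container. A minimal sketch of the resulting structure on a pared-down Pod spec (plain dict, not xpk code):

    pod_spec = {'volumes': [], 'containers': [{'name': 'gpu', 'volumeMounts': []}]}

    # Added by add_volumes(): shared memory backed by RAM, capped at 128Gi.
    pod_spec['volumes'].append(
        {'name': 'dshm', 'emptyDir': {'medium': 'Memory', 'sizeLimit': '128Gi'}}
    )
    # Added by update_gpu_containers(): mount it at /dev/shm in each container.
    for container in pod_spec['containers']:
      container['volumeMounts'].append({'name': 'dshm', 'mountPath': '/dev/shm'})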
xpk/parser/cluster.py CHANGED
@@ -743,6 +743,11 @@ def add_driver_arguments(parser: ArgumentParser):
       action='store_true',
       help='Enable PersistentDisk CSI driver on the cluster.',
   )
+  parser.add_argument(
+      '--enable-lustre-csi-driver',
+      action='store_true',
+      help='Enable Lustre CSI driver on the cluster.',
+  )
 
 
 def add_shared_cluster_create_tensorboard_arguments(parser: ArgumentParser):
@@ -792,25 +797,36 @@ def add_shared_cluster_create_capacity_arguments(parser: ArgumentParser):
       '--on-demand',
       action='store_true',
       help=(
-          'Sets node pool creation to use on-demand resources. '
-          ' See `--reservation` or `--spot` for other capacity types.'
+          'Sets node pool creation to use on-demand resources. See'
+          ' `--reservation`, `--flex` or `--spot` for other capacity'
+          ' types.'
       ),
   )
   parser.add_argument(
       '--reservation',
       type=str,
       help=(
-          'The reservation to be used for acquiring resources in the'
-          ' cluster. This will attempt to find the provided reservation.'
-          ' See `--spot` or `--on-demand` for other capacity types.'
+          'The reservation to be used for acquiring resources in the cluster.'
+          ' This will attempt to find the provided reservation. See `--spot`,'
+          ' `--flex` or `--on-demand` for other capacity types.'
      ),
   )
   parser.add_argument(
       '--spot',
       action='store_true',
       help=(
-          'Sets node pool creation to use spot resources.'
-          ' See `--reservation` or `--on-demand` for other capacity types.'
+          'Sets node pool creation to use spot resources. See'
+          ' `--reservation`, `--flex` or `--on-demand` for other'
+          ' capacity types.'
+      ),
+  )
+  parser.add_argument(
+      '--flex',
+      action='store_true',
+      help=(
+          'Sets node pool creation to use DWS Flex Start resources. See'
+          ' `--reservation`, `--on-demand` or `--spot` for other capacity'
+          ' types.'
       ),
   )
 
xpk/parser/storage.py CHANGED
@@ -71,9 +71,9 @@ def add_storage_attach_parser(
       type=str,
       help=(
           'The type of storage. Currently supported types: "gcsfuse",'
-          ' "gcpfilestore", "parallelstore", "pd"'
+          ' "gcpfilestore", "parallelstore", "pd", "lustre"'
       ),
-      choices=['gcsfuse', 'gcpfilestore', 'parallelstore', 'pd'],
+      choices=['gcsfuse', 'gcpfilestore', 'parallelstore', 'pd', 'lustre'],
       required=True,
   )
   add_cluster_arguments(req_args, required=True)
xpk/parser/workload.py CHANGED
@@ -208,15 +208,25 @@ def set_workload_parsers(workload_parser):
       help=(
           'Sets autoprovisioning to use reservation resources for the workload'
           ' request. This will attempt to find the provided reservation. See'
-          ' `--spot` or `--on-demand` for other capacity types.'
+          ' `--spot`, `--flex` or `--on-demand` for other capacity types.'
       ),
   )
   workload_create_autoprovisioning_arguments.add_argument(
       '--spot',
       action='store_true',
       help=(
-          'Sets autoprovisioning to use spot resources.'
-          ' See `--reservation` or `--on-demand` for other capacity types.'
+          'Sets autoprovisioning to use spot resources. See `--reservation`,'
+          ' `--flex` or `--on-demand` for other capacity types.'
+      ),
+  )
+
+  workload_create_autoprovisioning_arguments.add_argument(
+      '--flex',
+      action='store_true',
+      help=(
+          'Sets autoprovisioning to use flex-start resources. See'
+          ' `--reservation`, `--spot` or `--on-demand` for other capacity'
+          ' types.'
       ),
   )
 
@@ -728,6 +738,14 @@ def add_shared_workload_docker_image_arguments(args_parsers):
           ' directly by the xpk workload.'
       ),
   )
+  custom_parser.add_argument(
+      '--docker-image-pull-secret',
+      type=str,
+      help=(
+          'Name of the secret that will be used to pull image from'
+          ' private repository'
+      ),
+  )
 
 
 def add_shared_workload_create_tensorboard_arguments(args_parsers):