xpk 0.15.0__py3-none-any.whl → 0.16.1__py3-none-any.whl

This diff shows the changes between two publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in the respective public registry.
Files changed (68)
  1. integration/README.md +19 -0
  2. xpk/blueprints/a3mega/config-map.yaml.tftpl +15 -0
  3. xpk/blueprints/a3mega/storage_crd.yaml +52 -0
  4. xpk/blueprints/a3ultra/config-map.yaml.tftpl +15 -0
  5. xpk/blueprints/a3ultra/mlgru-disable.yaml +59 -0
  6. xpk/blueprints/a3ultra/nccl-installer.yaml +95 -0
  7. xpk/blueprints/a3ultra/storage_crd.yaml +52 -0
  8. xpk/blueprints/a4/config-map.yaml.tftpl +15 -0
  9. xpk/blueprints/a4/nccl-rdma-installer-a4.yaml +66 -0
  10. xpk/blueprints/a4/storage_crd.yaml +52 -0
  11. xpk/commands/cluster.py +33 -12
  12. xpk/commands/cluster_gcluster_test.py +5 -1
  13. xpk/commands/cluster_test.py +125 -0
  14. xpk/commands/config.py +3 -3
  15. xpk/commands/inspector.py +5 -3
  16. xpk/commands/kind.py +2 -0
  17. xpk/commands/managed_ml_diagnostics.py +249 -0
  18. xpk/commands/managed_ml_diagnostics_test.py +146 -0
  19. xpk/commands/workload.py +125 -139
  20. xpk/commands/workload_test.py +160 -118
  21. xpk/core/blueprint/blueprint_generator.py +3 -0
  22. xpk/core/blueprint/testing/data/a3_mega.yaml +129 -0
  23. xpk/core/blueprint/testing/data/a3_mega_spot.yaml +125 -0
  24. xpk/core/blueprint/testing/data/a3_ultra.yaml +173 -0
  25. xpk/core/blueprint/testing/data/a4.yaml +185 -0
  26. xpk/core/capacity.py +2 -0
  27. xpk/core/cluster.py +18 -47
  28. xpk/core/cluster_test.py +76 -1
  29. xpk/core/config.py +81 -7
  30. xpk/core/config_test.py +67 -11
  31. xpk/core/docker_container.py +3 -1
  32. xpk/core/docker_image.py +10 -6
  33. xpk/core/docker_resources.py +1 -10
  34. xpk/core/kjob.py +17 -16
  35. xpk/core/kueue_manager.py +13 -19
  36. xpk/core/kueue_manager_test.py +27 -1
  37. xpk/core/nap.py +13 -14
  38. xpk/core/nodepool.py +17 -15
  39. xpk/core/nodepool_test.py +25 -4
  40. xpk/core/pathways.py +23 -0
  41. xpk/core/pathways_test.py +57 -0
  42. xpk/core/resources.py +84 -27
  43. xpk/core/scheduling.py +128 -132
  44. xpk/core/scheduling_test.py +215 -2
  45. xpk/core/system_characteristics.py +179 -0
  46. xpk/core/system_characteristics_test.py +49 -1
  47. xpk/core/telemetry.py +4 -4
  48. xpk/core/telemetry_test.py +9 -9
  49. xpk/core/vertex.py +4 -3
  50. xpk/core/workload_decorators/tcpx_decorator.py +5 -1
  51. xpk/main.py +2 -0
  52. xpk/parser/cluster.py +22 -88
  53. xpk/parser/cluster_test.py +41 -0
  54. xpk/parser/common.py +84 -0
  55. xpk/parser/storage.py +10 -0
  56. xpk/parser/storage_test.py +47 -0
  57. xpk/parser/workload.py +14 -41
  58. xpk/parser/workload_test.py +2 -48
  59. xpk/templates/arm_gpu_workload_crate.yaml.j2 +46 -0
  60. xpk/utils/feature_flags.py +3 -0
  61. xpk/utils/validation.py +2 -2
  62. xpk-0.16.1.dist-info/METADATA +127 -0
  63. {xpk-0.15.0.dist-info → xpk-0.16.1.dist-info}/RECORD +67 -48
  64. xpk-0.15.0.dist-info/METADATA +0 -1666
  65. {xpk-0.15.0.dist-info → xpk-0.16.1.dist-info}/WHEEL +0 -0
  66. {xpk-0.15.0.dist-info → xpk-0.16.1.dist-info}/entry_points.txt +0 -0
  67. {xpk-0.15.0.dist-info → xpk-0.16.1.dist-info}/licenses/LICENSE +0 -0
  68. {xpk-0.15.0.dist-info → xpk-0.16.1.dist-info}/top_level.txt +0 -0
xpk/commands/workload.py CHANGED
@@ -16,9 +16,7 @@ limitations under the License.
16
16
 
17
17
  from ..core.blueprint.blueprint_generator import (
18
18
  a3high_device_type,
19
- a3mega_device_type,
20
- a3ultra_device_type,
21
- a4_device_type,
19
+ a4x_device_types,
22
20
  )
23
21
  from ..core.cluster import (
24
22
  XPK_SA,
@@ -32,7 +30,7 @@ from ..core.docker_container import (
32
30
  get_main_container_docker_image,
33
31
  get_user_workload_container,
34
32
  )
35
- from ..core.kueue_manager import has_sub_slicing_enabled, get_installed_kueue_version, LOCAL_QUEUE_NAME
33
+ from ..core.kueue_manager import LOCAL_QUEUE_NAME
36
34
  from ..core.docker_resources import get_volumes, parse_env_config
37
35
  from ..core.gcloud_context import add_zone_and_project
38
36
  from ..core.monitoring import get_gke_outlier_dashboard
@@ -52,18 +50,19 @@ from ..core.pathways import (
52
50
  get_user_workload_for_pathways,
53
51
  try_to_delete_pathwaysjob_first,
54
52
  )
55
- from ..core.resources import get_cluster_capacity_type, get_cluster_system_characteristics, SystemCharacteristics
56
- from ..core.resources import CLUSTER_METADATA_CONFIGMAP, get_cluster_configmap
53
+ from ..core.resources import get_cluster_capacity_type, get_cluster_system_characteristics_from_config_map
54
+ from ..core.resources import ConfigMapType, get_cluster_configmap
55
+ from ..core.nodepool import ensure_resource_policy_exists
57
56
  from ..core.scheduling import (
57
+ WorkloadScheduling,
58
58
  check_if_workload_can_schedule,
59
- create_accelerator_label,
60
- create_machine_label,
61
59
  create_tpu_machine_type,
62
60
  create_tpu_topology,
63
61
  get_cpu_affinity,
64
62
  get_gpu_scheduler,
65
63
  create_sub_slicing_annotations,
66
64
  create_placement_policy_label,
65
+ get_placement_policy_name,
67
66
  is_placement_policy_supported,
68
67
  )
69
68
  from ..core.storage import (
@@ -78,10 +77,10 @@ from ..core.storage import (
78
77
  get_storages_to_mount,
79
78
  )
80
79
  from ..core.system_characteristics import (
81
- SUB_SLICING_TOPOLOGIES,
82
80
  AcceleratorType,
81
+ create_accelerator_label,
82
+ create_machine_label,
83
83
  get_system_characteristics,
84
- compute_vms_per_slice,
85
84
  )
86
85
  from ..core.vertex import create_vertex_experiment
87
86
  from ..core.workload import (
@@ -92,20 +91,16 @@ from ..core.workload import (
92
91
  get_cluster_location,
93
92
  )
94
93
  from ..core.workload_decorators import (
95
- rdma_decorator,
96
94
  storage_decorator,
97
- tcpx_decorator,
98
- tcpxo_decorator,
99
95
  )
100
96
  from ..utils.console import ask_for_user_consent, xpk_exit, xpk_print
101
- from packaging.version import Version
102
97
  from ..utils.file import write_tmp_file
103
98
  from ..utils.execution_context import is_dry_run
104
99
  from ..utils.validation import validate_dependencies_list, SystemDependency, should_validate_dependencies
105
100
  from . import cluster_gcluster
106
- from .common import is_TAS_possible, validate_sub_slicing_system
107
- from ..utils.topology import is_topology_contained
108
- from ..utils.feature_flags import FeatureFlags
101
+ from .common import is_TAS_possible
102
+ from jinja2 import Environment, FileSystemLoader
103
+ from ..utils.templates import get_templates_absolute_path
109
104
 
110
105
  WORKLOAD_CREATE_YAML = """apiVersion: jobset.x-k8s.io/v1alpha2
111
106
  kind: JobSet
@@ -292,7 +287,7 @@ PW_WORKLOAD_CREATE_YAML = """
292
287
  {user_workload}
293
288
  """
294
289
 
295
- SUB_SLICING_MINIMUM_KUEUE_VERSION = Version('0.13.0')
290
+ ARM_GPU_WORKLOAD_CREATE_JINJA_FILE = 'arm_gpu_workload_crate.yaml.j2'
296
291
 
297
292
 
298
293
  def workload_create_pathways(args) -> None:
@@ -344,26 +339,35 @@ def workload_create(args) -> None:
344
339
  )
345
340
  xpk_exit(1)
346
341
 
347
- system, return_code = get_system_characteristics(args)
348
- if return_code > 0 or system is None:
342
+ workload_system, return_code = get_system_characteristics(args)
343
+ if return_code > 0 or workload_system is None:
349
344
  xpk_print('Fetching system characteristics failed!')
350
345
  xpk_exit(return_code)
351
346
 
352
- if FeatureFlags.SUB_SLICING_ENABLED and args.sub_slicing_topology is not None:
353
- _validate_sub_slicing_availability()
354
- _validate_sub_slicing_topology(system, args.sub_slicing_topology)
355
-
356
- if not check_if_workload_can_schedule(args, system):
347
+ resources_config_map = get_cluster_configmap(
348
+ args.cluster, ConfigMapType.RESOURCES
349
+ )
350
+ cluster_system = get_cluster_system_characteristics_from_config_map(
351
+ resources_config_map
352
+ )
353
+ workload_scheduling = check_if_workload_can_schedule(
354
+ args=args,
355
+ workload_system=workload_system,
356
+ cluster_system=cluster_system,
357
+ resources_config_map=resources_config_map,
358
+ )
359
+ if workload_scheduling == WorkloadScheduling.UNAVAILABLE:
357
360
  xpk_exit(1)
358
361
 
359
362
  xpk_print('Starting workload create', flush=True)
360
363
 
361
- metadata_configmap_name = f'{args.cluster}-{CLUSTER_METADATA_CONFIGMAP}'
362
- cluster_config_map = get_cluster_configmap(metadata_configmap_name)
364
+ cluster_config_map = get_cluster_configmap(
365
+ args.cluster, ConfigMapType.METADATA
366
+ )
363
367
  cluster_xpk_version = None
364
368
  if cluster_config_map is None:
365
369
  xpk_print(
366
- f'Warning: Unable to find ConfigMap: {metadata_configmap_name} for the'
370
+ 'Warning: Unable to find ConfigMap for the'
367
371
  ' cluster. We recommend to upgrade your cluster by running `xpk'
368
372
  ' cluster create`.'
369
373
  )
@@ -395,7 +399,7 @@ def workload_create(args) -> None:
395
399
 
396
400
  autoprovisioning_args = ''
397
401
  autoprovisioning_enabled, return_code = is_autoprovisioning_enabled(
398
- args, system
402
+ args, workload_system
399
403
  )
400
404
  if return_code != 0:
401
405
  xpk_exit(return_code)
@@ -488,28 +492,36 @@ def workload_create(args) -> None:
488
492
  podFailurePolicy:
489
493
  rules:
490
494
  - action: FailJob
495
+ onPodConditions: []
491
496
  onExitCodes:
492
- containerName: {get_main_container_docker_image(args, system)}
497
+ containerName: {get_main_container_docker_image(args, workload_system)}
493
498
  operator: NotIn
494
499
  values: [{restart_on_exit_codes}]"""
495
500
 
501
+ if is_placement_policy_supported(workload_system):
502
+ ensure_resource_policy_exists(
503
+ resource_policy_name=get_placement_policy_name(workload_system),
504
+ project=args.project,
505
+ zone=args.zone,
506
+ topology=workload_system.topology,
507
+ )
508
+
496
509
  placement_policy_label = (
497
- create_placement_policy_label(system)
498
- if is_placement_policy_supported(system)
510
+ create_placement_policy_label(workload_system)
511
+ if is_placement_policy_supported(workload_system)
499
512
  else ''
500
513
  )
501
514
 
502
515
  # Create the workload file based on accelerator type or workload type.
503
- if system.accelerator_type == AcceleratorType.GPU:
516
+ if workload_system.accelerator_type == AcceleratorType.GPU:
504
517
  container, debugging_dashboard_id = get_user_workload_container(
505
- args, system
518
+ args, workload_system
506
519
  )
507
520
  gpu_scheduler, return_code = get_gpu_scheduler(
508
- args, system, autoprovisioning_args
521
+ args, workload_system, autoprovisioning_args
509
522
  )
510
523
  if return_code != 0:
511
524
  xpk_exit(return_code)
512
- system_characteristics = get_cluster_system_characteristics(args)
513
525
  capacity_type = get_cluster_capacity_type(args)
514
526
 
515
527
  annotations = (
@@ -517,31 +529,55 @@ def workload_create(args) -> None:
517
529
  'kueue.x-k8s.io/podset-preferred-topology:'
518
530
  ' "cloud.google.com/gce-topology-host"'
519
531
  )
520
- if is_TAS_possible(system_characteristics, capacity_type)
532
+ if is_TAS_possible(cluster_system, capacity_type)
521
533
  else ''
522
534
  )
523
535
 
524
536
  if (
525
- system.device_type in cluster_gcluster.supported_device_types
526
- or system.device_type == a3high_device_type
537
+ workload_system.device_type in cluster_gcluster.supported_device_types
538
+ or workload_system.device_type == a3high_device_type
539
+ or workload_system.device_type in a4x_device_types
527
540
  ):
528
- yml_string = A3_GPU_WORKLOAD_CREATE_YAML.format(
529
- args=args,
530
- container=container,
531
- service_account=XPK_SA,
532
- failure_policy_rules=failure_policy_rules,
533
- pod_failure_policy=pod_failure_policy,
534
- annotations=annotations,
535
- placement_policy_label=placement_policy_label,
536
- )
541
+ if workload_system.device_type in a4x_device_types:
542
+ template_env = Environment(
543
+ loader=FileSystemLoader(searchpath=get_templates_absolute_path())
544
+ )
545
+ workload_create_yaml = template_env.get_template(
546
+ ARM_GPU_WORKLOAD_CREATE_JINJA_FILE
547
+ )
548
+ yml_string = workload_create_yaml.render(
549
+ workload=args.workload,
550
+ num_nodes=args.num_nodes,
551
+ ttl_seconds_after_finished=args.ttl_seconds_after_finished,
552
+ max_restarts=args.max_restarts,
553
+ priority=args.priority,
554
+ termination_grace_period_seconds=args.termination_grace_period_seconds,
555
+ docker_image_pull_secret=args.docker_image_pull_secret,
556
+ container=container,
557
+ service_account=XPK_SA,
558
+ failure_policy_rules=failure_policy_rules,
559
+ pod_failure_policy=pod_failure_policy,
560
+ annotations=annotations,
561
+ placement_policy_label=placement_policy_label,
562
+ )
563
+ else:
564
+ yml_string = A3_GPU_WORKLOAD_CREATE_YAML.format(
565
+ args=args,
566
+ container=container,
567
+ service_account=XPK_SA,
568
+ failure_policy_rules=failure_policy_rules,
569
+ pod_failure_policy=pod_failure_policy,
570
+ annotations=annotations,
571
+ placement_policy_label=placement_policy_label,
572
+ )
537
573
 
538
574
  sub_networks = get_cluster_subnetworks()
539
- if args.device_type == a3high_device_type:
540
- yml_string = tcpx_decorator.decorate_jobset(yml_string)
541
- elif args.device_type == a3mega_device_type:
542
- yml_string = tcpxo_decorator.decorate_jobset(yml_string, sub_networks)
543
- elif args.device_type in [a3ultra_device_type, a4_device_type]:
544
- yml_string = rdma_decorator.decorate_jobset(yml_string, sub_networks)
575
+
576
+ if workload_system.gpu_config and callable(
577
+ workload_system.gpu_config.jobset_decorator_fn
578
+ ):
579
+ decorator_fn = workload_system.gpu_config.jobset_decorator_fn
580
+ yml_string = decorator_fn(yml_string, sub_networks)
545
581
 
546
582
  if all_storages:
547
583
  yml_string = storage_decorator.decorate_jobset(yml_string, all_storages)
@@ -550,7 +586,7 @@ def workload_create(args) -> None:
550
586
  args=args,
551
587
  container=container,
552
588
  gpu_scheduler=gpu_scheduler,
553
- volumes=get_volumes(args, system),
589
+ volumes=get_volumes(args, workload_system),
554
590
  storage_annotations=('\n' + (' ' * 12)).join(
555
591
  get_storage_annotations(all_storages)
556
592
  ),
@@ -561,53 +597,53 @@ def workload_create(args) -> None:
561
597
  )
562
598
 
563
599
  elif args.use_pathways and ensure_pathways_workload_prerequisites(
564
- args, system
600
+ args, workload_system
565
601
  ):
566
602
  yml_string = PW_WORKLOAD_CREATE_YAML.format(
567
603
  args=args,
568
- system=system,
569
- topology=create_tpu_topology(system.accelerator_type, system),
570
- machine_type=create_tpu_machine_type(system.accelerator_type, system),
604
+ topology=create_tpu_topology(workload_system),
605
+ machine_type=create_tpu_machine_type(workload_system),
571
606
  custom_pathways_proxy_server=append_custom_pathways_proxy_server(args),
572
607
  custom_pathways_server=append_custom_pathways_server(args),
573
608
  custom_pathways_worker=append_custom_pathways_worker(args),
574
609
  colocated_python_sidecar=append_custom_colocated_python_sidecar(args),
575
- user_workload=get_user_workload_for_pathways(args, system),
610
+ user_workload=get_user_workload_for_pathways(args, workload_system),
576
611
  local_queue_name=LOCAL_QUEUE_NAME,
577
612
  autoprovisioning_args=autoprovisioning_args,
578
613
  placement_policy_label=placement_policy_label,
579
614
  )
580
615
  else:
616
+ use_sub_slicing = (
617
+ workload_scheduling == WorkloadScheduling.SUB_SLICING_AVAILABLE
618
+ )
619
+ if use_sub_slicing:
620
+ xpk_print('Workload will be scheduled using the Sub-slicing feature.')
621
+
581
622
  container, debugging_dashboard_id = get_user_workload_container(
582
- args, system
623
+ args, workload_system
583
624
  )
584
625
  yml_string = WORKLOAD_CREATE_YAML.format(
585
626
  args=args,
586
627
  container=container,
587
- vms_per_slice=(
588
- compute_vms_per_slice(args.sub_slicing_topology)
589
- if system.accelerator_type == AcceleratorType.TPU
590
- and FeatureFlags.SUB_SLICING_ENABLED
591
- and args.sub_slicing_topology is not None
592
- else system.vms_per_slice
593
- ),
594
- affinity=get_cpu_affinity(system.accelerator_type),
595
- accelerator_label=create_accelerator_label(
596
- system.accelerator_type, system
597
- ),
628
+ vms_per_slice=workload_system.vms_per_slice,
629
+ affinity=get_cpu_affinity(workload_system.accelerator_type),
630
+ accelerator_label=create_accelerator_label(workload_system),
598
631
  sub_slicing_annotations=(
599
- ''
600
- if not FeatureFlags.SUB_SLICING_ENABLED
601
- or args.sub_slicing_topology is None
602
- else ('\n' + (' ' * 16)).join(
603
- create_sub_slicing_annotations(args.sub_slicing_topology)
632
+ ('\n' + (' ' * 16)).join(
633
+ create_sub_slicing_annotations(workload_system.topology)
604
634
  )
635
+ if use_sub_slicing
636
+ else ''
605
637
  ),
606
638
  placement_policy_label=placement_policy_label,
607
- machine_label=create_machine_label(system.accelerator_type, system),
639
+ machine_label=(
640
+ create_machine_label(cluster_system)
641
+ if use_sub_slicing and cluster_system
642
+ else create_machine_label(workload_system)
643
+ ),
608
644
  local_queue_name=LOCAL_QUEUE_NAME,
609
645
  autoprovisioning_args=autoprovisioning_args,
610
- volumes=get_volumes(args, system),
646
+ volumes=get_volumes(args, workload_system),
611
647
  storage_annotations=('\n' + (' ' * 16)).join(
612
648
  get_storage_annotations(all_storages)
613
649
  ),
@@ -615,10 +651,18 @@ def workload_create(args) -> None:
615
651
  tpu_toleration="""
616
652
  - operator: "Exists"
617
653
  key: google.com/tpu
618
- """ if system.accelerator_type == AcceleratorType.TPU else '',
654
+ """ if workload_system.accelerator_type == AcceleratorType.TPU else '',
619
655
  failure_policy_rules=failure_policy_rules,
620
656
  pod_failure_policy=pod_failure_policy,
621
657
  )
658
+ if args.output_manifest_file:
659
+ with open(args.output_manifest_file, 'w', encoding='utf-8') as f:
660
+ f.write(yml_string)
661
+ xpk_print(
662
+ f'Workload {args.workload} manifest written to'
663
+ f' {args.output_manifest_file}'
664
+ )
665
+
622
666
  tmp = write_tmp_file(yml_string)
623
667
  command = f'kubectl apply -f {str(tmp)}'
624
668
  return_code = run_command_with_updates(command, 'Creating Workload')
@@ -632,7 +676,7 @@ def workload_create(args) -> None:
632
676
 
633
677
  # Get GKE outlier dashboard for TPU
634
678
  outlier_dashboard_id = None
635
- if system.accelerator_type == AcceleratorType.TPU:
679
+ if workload_system.accelerator_type == AcceleratorType.TPU:
636
680
  outlier_dashboard_id = get_gke_outlier_dashboard(args)
637
681
 
638
682
  # Outlier and debugging dashboards
@@ -699,64 +743,6 @@ def workload_create(args) -> None:
699
743
  xpk_exit(0)
700
744
 
701
745
 
702
- def _validate_sub_slicing_availability():
703
- return_code, sub_slicing_enabled = has_sub_slicing_enabled()
704
- if return_code != 0:
705
- xpk_print(
706
- 'Error: Unable to validate sub-slicing support on a given cluster.'
707
- )
708
- xpk_exit(1)
709
-
710
- if not sub_slicing_enabled:
711
- xpk_print(
712
- 'Error: Cluster has not been not set up for Sub-slicing. Please enable'
713
- ' --sub-slicing in "cluster create" command first.'
714
- )
715
- xpk_exit(1)
716
-
717
- return_code, current_version = get_installed_kueue_version(
718
- dry_run_version=Version('0.13')
719
- )
720
- if return_code != 0 or not current_version:
721
- xpk_print(
722
- 'Error: Unable to validate sub-slicing support on a given cluster.'
723
- )
724
- xpk_exit(1)
725
-
726
- if current_version < SUB_SLICING_MINIMUM_KUEUE_VERSION:
727
- xpk_print(
728
- "Error: Current Kueue version ({current_version}) doesn't support"
729
- ' Sub-slicing. The minimal required version is'
730
- ' v{SUB_SLICING_MINIMUM_KUEUE_VERSION}. Please either update Kueue'
731
- ' manually, or run "cluster create --sub-slicing" on the existing'
732
- ' cluster.'
733
- )
734
- xpk_exit(1)
735
-
736
-
737
- def _validate_sub_slicing_topology(
738
- system_characteristics: SystemCharacteristics, sub_slicing_topology: str
739
- ) -> None:
740
- if sub_slicing_topology not in SUB_SLICING_TOPOLOGIES:
741
- xpk_print(
742
- f'Error: --sub-slicing-topology={sub_slicing_topology} shape is'
743
- f' invalid. It has to be one of: {", ".join(SUB_SLICING_TOPOLOGIES)}.'
744
- )
745
- xpk_exit(1)
746
-
747
- if not is_topology_contained(
748
- contained=sub_slicing_topology, container=system_characteristics.topology
749
- ):
750
- xpk_print(
751
- f'Error: --sub-slicing-topology={sub_slicing_topology} shape is too'
752
- ' large. The shape cannot be bigger than'
753
- f' {system_characteristics.topology}.'
754
- )
755
- xpk_exit(1)
756
-
757
- validate_sub_slicing_system(system_characteristics)
758
-
759
-
760
746
  def get_restart_exit_codes(args) -> list:
761
747
  exit_codes = [42]
762
748
  exit_codes.extend(range(127, 256, 1))