xpk 0.15.0__py3-none-any.whl → 0.16.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68)
  1. integration/README.md +19 -0
  2. xpk/blueprints/a3mega/config-map.yaml.tftpl +15 -0
  3. xpk/blueprints/a3mega/storage_crd.yaml +52 -0
  4. xpk/blueprints/a3ultra/config-map.yaml.tftpl +15 -0
  5. xpk/blueprints/a3ultra/mlgru-disable.yaml +59 -0
  6. xpk/blueprints/a3ultra/nccl-installer.yaml +95 -0
  7. xpk/blueprints/a3ultra/storage_crd.yaml +52 -0
  8. xpk/blueprints/a4/config-map.yaml.tftpl +15 -0
  9. xpk/blueprints/a4/nccl-rdma-installer-a4.yaml +66 -0
  10. xpk/blueprints/a4/storage_crd.yaml +52 -0
  11. xpk/commands/cluster.py +33 -12
  12. xpk/commands/cluster_gcluster_test.py +5 -1
  13. xpk/commands/cluster_test.py +125 -0
  14. xpk/commands/config.py +3 -3
  15. xpk/commands/inspector.py +5 -3
  16. xpk/commands/kind.py +2 -0
  17. xpk/commands/managed_ml_diagnostics.py +249 -0
  18. xpk/commands/managed_ml_diagnostics_test.py +146 -0
  19. xpk/commands/workload.py +124 -139
  20. xpk/commands/workload_test.py +160 -118
  21. xpk/core/blueprint/blueprint_generator.py +3 -0
  22. xpk/core/blueprint/testing/data/a3_mega.yaml +129 -0
  23. xpk/core/blueprint/testing/data/a3_mega_spot.yaml +125 -0
  24. xpk/core/blueprint/testing/data/a3_ultra.yaml +173 -0
  25. xpk/core/blueprint/testing/data/a4.yaml +185 -0
  26. xpk/core/capacity.py +2 -0
  27. xpk/core/cluster.py +18 -47
  28. xpk/core/cluster_test.py +76 -1
  29. xpk/core/config.py +81 -7
  30. xpk/core/config_test.py +67 -11
  31. xpk/core/docker_container.py +3 -1
  32. xpk/core/docker_image.py +10 -6
  33. xpk/core/docker_resources.py +1 -10
  34. xpk/core/kjob.py +17 -16
  35. xpk/core/kueue_manager.py +13 -19
  36. xpk/core/kueue_manager_test.py +27 -1
  37. xpk/core/nap.py +13 -14
  38. xpk/core/nodepool.py +17 -15
  39. xpk/core/nodepool_test.py +25 -4
  40. xpk/core/pathways.py +23 -0
  41. xpk/core/pathways_test.py +57 -0
  42. xpk/core/resources.py +84 -27
  43. xpk/core/scheduling.py +128 -132
  44. xpk/core/scheduling_test.py +215 -2
  45. xpk/core/system_characteristics.py +179 -0
  46. xpk/core/system_characteristics_test.py +49 -1
  47. xpk/core/telemetry.py +4 -4
  48. xpk/core/telemetry_test.py +9 -9
  49. xpk/core/vertex.py +4 -3
  50. xpk/core/workload_decorators/tcpx_decorator.py +5 -1
  51. xpk/main.py +2 -0
  52. xpk/parser/cluster.py +22 -88
  53. xpk/parser/cluster_test.py +41 -0
  54. xpk/parser/common.py +84 -0
  55. xpk/parser/storage.py +10 -0
  56. xpk/parser/storage_test.py +47 -0
  57. xpk/parser/workload.py +14 -41
  58. xpk/parser/workload_test.py +2 -48
  59. xpk/templates/arm_gpu_workload_crate.yaml.j2 +46 -0
  60. xpk/utils/feature_flags.py +3 -0
  61. xpk/utils/validation.py +2 -2
  62. xpk-0.16.0.dist-info/METADATA +127 -0
  63. {xpk-0.15.0.dist-info → xpk-0.16.0.dist-info}/RECORD +67 -48
  64. xpk-0.15.0.dist-info/METADATA +0 -1666
  65. {xpk-0.15.0.dist-info → xpk-0.16.0.dist-info}/WHEEL +0 -0
  66. {xpk-0.15.0.dist-info → xpk-0.16.0.dist-info}/entry_points.txt +0 -0
  67. {xpk-0.15.0.dist-info → xpk-0.16.0.dist-info}/licenses/LICENSE +0 -0
  68. {xpk-0.15.0.dist-info → xpk-0.16.0.dist-info}/top_level.txt +0 -0
xpk/commands/workload.py CHANGED
@@ -16,9 +16,7 @@ limitations under the License.
 
 from ..core.blueprint.blueprint_generator import (
     a3high_device_type,
-    a3mega_device_type,
-    a3ultra_device_type,
-    a4_device_type,
+    a4x_device_types,
 )
 from ..core.cluster import (
     XPK_SA,
@@ -32,7 +30,7 @@ from ..core.docker_container import (
     get_main_container_docker_image,
     get_user_workload_container,
 )
-from ..core.kueue_manager import has_sub_slicing_enabled, get_installed_kueue_version, LOCAL_QUEUE_NAME
+from ..core.kueue_manager import LOCAL_QUEUE_NAME
 from ..core.docker_resources import get_volumes, parse_env_config
 from ..core.gcloud_context import add_zone_and_project
 from ..core.monitoring import get_gke_outlier_dashboard
@@ -52,18 +50,19 @@ from ..core.pathways import (
     get_user_workload_for_pathways,
     try_to_delete_pathwaysjob_first,
 )
-from ..core.resources import get_cluster_capacity_type, get_cluster_system_characteristics, SystemCharacteristics
-from ..core.resources import CLUSTER_METADATA_CONFIGMAP, get_cluster_configmap
+from ..core.resources import get_cluster_capacity_type, get_cluster_system_characteristics_from_config_map
+from ..core.resources import ConfigMapType, get_cluster_configmap
+from ..core.nodepool import ensure_resource_policy_exists
 from ..core.scheduling import (
+    WorkloadScheduling,
     check_if_workload_can_schedule,
-    create_accelerator_label,
-    create_machine_label,
     create_tpu_machine_type,
     create_tpu_topology,
     get_cpu_affinity,
     get_gpu_scheduler,
     create_sub_slicing_annotations,
     create_placement_policy_label,
+    get_placement_policy_name,
     is_placement_policy_supported,
 )
 from ..core.storage import (
@@ -78,10 +77,10 @@ from ..core.storage import (
     get_storages_to_mount,
 )
 from ..core.system_characteristics import (
-    SUB_SLICING_TOPOLOGIES,
     AcceleratorType,
+    create_accelerator_label,
+    create_machine_label,
     get_system_characteristics,
-    compute_vms_per_slice,
 )
 from ..core.vertex import create_vertex_experiment
 from ..core.workload import (
@@ -92,20 +91,16 @@ from ..core.workload import (
     get_cluster_location,
 )
 from ..core.workload_decorators import (
-    rdma_decorator,
     storage_decorator,
-    tcpx_decorator,
-    tcpxo_decorator,
 )
 from ..utils.console import ask_for_user_consent, xpk_exit, xpk_print
-from packaging.version import Version
 from ..utils.file import write_tmp_file
 from ..utils.execution_context import is_dry_run
 from ..utils.validation import validate_dependencies_list, SystemDependency, should_validate_dependencies
 from . import cluster_gcluster
-from .common import is_TAS_possible, validate_sub_slicing_system
-from ..utils.topology import is_topology_contained
-from ..utils.feature_flags import FeatureFlags
+from .common import is_TAS_possible
+from jinja2 import Environment, FileSystemLoader
+from ..utils.templates import get_templates_absolute_path
 
 WORKLOAD_CREATE_YAML = """apiVersion: jobset.x-k8s.io/v1alpha2
 kind: JobSet
@@ -292,7 +287,7 @@ PW_WORKLOAD_CREATE_YAML = """
 {user_workload}
 """
 
-SUB_SLICING_MINIMUM_KUEUE_VERSION = Version('0.13.0')
+ARM_GPU_WORKLOAD_CREATE_JINJA_FILE = 'arm_gpu_workload_crate.yaml.j2'
 
 
 def workload_create_pathways(args) -> None:
@@ -344,26 +339,35 @@ def workload_create(args) -> None:
     )
     xpk_exit(1)
 
-  system, return_code = get_system_characteristics(args)
-  if return_code > 0 or system is None:
+  workload_system, return_code = get_system_characteristics(args)
+  if return_code > 0 or workload_system is None:
     xpk_print('Fetching system characteristics failed!')
     xpk_exit(return_code)
 
-  if FeatureFlags.SUB_SLICING_ENABLED and args.sub_slicing_topology is not None:
-    _validate_sub_slicing_availability()
-    _validate_sub_slicing_topology(system, args.sub_slicing_topology)
-
-  if not check_if_workload_can_schedule(args, system):
+  resources_config_map = get_cluster_configmap(
+      args.cluster, ConfigMapType.RESOURCES
+  )
+  cluster_system = get_cluster_system_characteristics_from_config_map(
+      resources_config_map
+  )
+  workload_scheduling = check_if_workload_can_schedule(
+      args=args,
+      workload_system=workload_system,
+      cluster_system=cluster_system,
+      resources_config_map=resources_config_map,
+  )
+  if workload_scheduling == WorkloadScheduling.UNAVAILABLE:
     xpk_exit(1)
 
   xpk_print('Starting workload create', flush=True)
 
-  metadata_configmap_name = f'{args.cluster}-{CLUSTER_METADATA_CONFIGMAP}'
-  cluster_config_map = get_cluster_configmap(metadata_configmap_name)
+  cluster_config_map = get_cluster_configmap(
+      args.cluster, ConfigMapType.METADATA
+  )
   cluster_xpk_version = None
   if cluster_config_map is None:
     xpk_print(
-        f'Warning: Unable to find ConfigMap: {metadata_configmap_name} for the'
+        'Warning: Unable to find ConfigMap for the'
         ' cluster. We recommend to upgrade your cluster by running `xpk'
        ' cluster create`.'
     )
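
Note: workload_create now reads the cluster's resources ConfigMap up front and receives a tri-state scheduling decision instead of a boolean. A minimal sketch of that contract, assuming the enum also carries a plain "available" member (only UNAVAILABLE and SUB_SLICING_AVAILABLE appear in this diff):

    from enum import Enum, auto

    class WorkloadScheduling(Enum):
      AVAILABLE = auto()              # assumption: normal scheduling works
      SUB_SLICING_AVAILABLE = auto()  # from the diff: schedule via Sub-slicing
      UNAVAILABLE = auto()            # from the diff: abort workload creation

    def handle(decision: WorkloadScheduling) -> None:
      # Mirrors the branching above: bail out early, or opt into Sub-slicing.
      if decision == WorkloadScheduling.UNAVAILABLE:
        raise SystemExit(1)
      if decision == WorkloadScheduling.SUB_SLICING_AVAILABLE:
        print('Workload will be scheduled using the Sub-slicing feature.')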
@@ -395,7 +399,7 @@ def workload_create(args) -> None:
 
   autoprovisioning_args = ''
   autoprovisioning_enabled, return_code = is_autoprovisioning_enabled(
-      args, system
+      args, workload_system
   )
   if return_code != 0:
     xpk_exit(return_code)
@@ -489,27 +493,34 @@ def workload_create(args) -> None:
       rules:
       - action: FailJob
         onExitCodes:
-          containerName: {get_main_container_docker_image(args, system)}
+          containerName: {get_main_container_docker_image(args, workload_system)}
           operator: NotIn
           values: [{restart_on_exit_codes}]"""
 
+  if is_placement_policy_supported(workload_system):
+    ensure_resource_policy_exists(
+        resource_policy_name=get_placement_policy_name(workload_system),
+        project=args.project,
+        zone=args.zone,
+        topology=workload_system.topology,
+    )
+
   placement_policy_label = (
-      create_placement_policy_label(system)
-      if is_placement_policy_supported(system)
+      create_placement_policy_label(workload_system)
+      if is_placement_policy_supported(workload_system)
       else ''
   )
 
   # Create the workload file based on accelerator type or workload type.
-  if system.accelerator_type == AcceleratorType.GPU:
+  if workload_system.accelerator_type == AcceleratorType.GPU:
     container, debugging_dashboard_id = get_user_workload_container(
-        args, system
+        args, workload_system
     )
     gpu_scheduler, return_code = get_gpu_scheduler(
-        args, system, autoprovisioning_args
+        args, workload_system, autoprovisioning_args
     )
     if return_code != 0:
       xpk_exit(return_code)
-    system_characteristics = get_cluster_system_characteristics(args)
     capacity_type = get_cluster_capacity_type(args)
 
     annotations = (
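
The new ensure_resource_policy_exists call provisions the placement resource policy before the workload is labeled with it, so repeated workload creates stay idempotent. A sketch of the describe-then-create pattern such a helper typically follows; the gcloud subcommand and flags below are illustrative assumptions, not xpk's exact invocation:

    import subprocess

    def ensure_resource_policy_exists(
        resource_policy_name: str, project: str, zone: str, topology: str
    ) -> None:
      # topology is consumed by the real helper when creating the policy;
      # its flag mapping is omitted here as an assumption.
      region = zone.rsplit('-', 1)[0]  # 'us-central1-a' -> 'us-central1'
      found = subprocess.run(
          ['gcloud', 'compute', 'resource-policies', 'describe',
           resource_policy_name, f'--project={project}', f'--region={region}'],
          capture_output=True,
      )
      if found.returncode == 0:
        return  # policy already exists; nothing to do
      subprocess.run(
          ['gcloud', 'compute', 'resource-policies', 'create',
           'group-placement', resource_policy_name,
           '--collocation=COLLOCATED',
           f'--project={project}', f'--region={region}'],
          check=True,
      )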
@@ -517,31 +528,55 @@ def workload_create(args) -> None:
             'kueue.x-k8s.io/podset-preferred-topology:'
             ' "cloud.google.com/gce-topology-host"'
         )
-        if is_TAS_possible(system_characteristics, capacity_type)
+        if is_TAS_possible(cluster_system, capacity_type)
         else ''
     )
 
     if (
-        system.device_type in cluster_gcluster.supported_device_types
-        or system.device_type == a3high_device_type
+        workload_system.device_type in cluster_gcluster.supported_device_types
+        or workload_system.device_type == a3high_device_type
+        or workload_system.device_type in a4x_device_types
     ):
-      yml_string = A3_GPU_WORKLOAD_CREATE_YAML.format(
-          args=args,
-          container=container,
-          service_account=XPK_SA,
-          failure_policy_rules=failure_policy_rules,
-          pod_failure_policy=pod_failure_policy,
-          annotations=annotations,
-          placement_policy_label=placement_policy_label,
-      )
+      if workload_system.device_type in a4x_device_types:
+        template_env = Environment(
+            loader=FileSystemLoader(searchpath=get_templates_absolute_path())
+        )
+        workload_create_yaml = template_env.get_template(
+            ARM_GPU_WORKLOAD_CREATE_JINJA_FILE
+        )
+        yml_string = workload_create_yaml.render(
+            workload=args.workload,
+            num_nodes=args.num_nodes,
+            ttl_seconds_after_finished=args.ttl_seconds_after_finished,
+            max_restarts=args.max_restarts,
+            priority=args.priority,
+            termination_grace_period_seconds=args.termination_grace_period_seconds,
+            docker_image_pull_secret=args.docker_image_pull_secret,
+            container=container,
+            service_account=XPK_SA,
+            failure_policy_rules=failure_policy_rules,
+            pod_failure_policy=pod_failure_policy,
+            annotations=annotations,
+            placement_policy_label=placement_policy_label,
+        )
+      else:
+        yml_string = A3_GPU_WORKLOAD_CREATE_YAML.format(
+            args=args,
+            container=container,
+            service_account=XPK_SA,
+            failure_policy_rules=failure_policy_rules,
+            pod_failure_policy=pod_failure_policy,
+            annotations=annotations,
+            placement_policy_label=placement_policy_label,
+        )
 
       sub_networks = get_cluster_subnetworks()
-      if args.device_type == a3high_device_type:
-        yml_string = tcpx_decorator.decorate_jobset(yml_string)
-      elif args.device_type == a3mega_device_type:
-        yml_string = tcpxo_decorator.decorate_jobset(yml_string, sub_networks)
-      elif args.device_type in [a3ultra_device_type, a4_device_type]:
-        yml_string = rdma_decorator.decorate_jobset(yml_string, sub_networks)
+
+      if workload_system.gpu_config and callable(
+          workload_system.gpu_config.jobset_decorator_fn
+      ):
+        decorator_fn = workload_system.gpu_config.jobset_decorator_fn
+        yml_string = decorator_fn(yml_string, sub_networks)
 
       if all_storages:
         yml_string = storage_decorator.decorate_jobset(yml_string, all_storages)
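
For a4x device types the JobSet manifest is now rendered from a packaged Jinja2 template instead of str.format. A self-contained sketch of the same Environment / get_template / render flow, with jinja2's DictLoader standing in for the packaged template directory and a trimmed-down, illustrative template body:

    from jinja2 import DictLoader, Environment

    env = Environment(loader=DictLoader({
        'arm_gpu_workload_crate.yaml.j2': (
            'apiVersion: jobset.x-k8s.io/v1alpha2\n'
            'kind: JobSet\n'
            'metadata:\n'
            '  name: {{ workload }}\n'
            'spec:\n'
            '  ttlSecondsAfterFinished: {{ ttl_seconds_after_finished }}\n'
        )
    }))
    template = env.get_template('arm_gpu_workload_crate.yaml.j2')
    print(template.render(workload='demo', ttl_seconds_after_finished=3600))

Note also that the per-device-type if/elif chain over the tcpx/tcpxo/rdma decorators is replaced by a jobset_decorator_fn carried on the system's gpu_config, so a new device type registers a decorator function rather than growing the branch.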
@@ -550,7 +585,7 @@ def workload_create(args) -> None:
         args=args,
         container=container,
         gpu_scheduler=gpu_scheduler,
-        volumes=get_volumes(args, system),
+        volumes=get_volumes(args, workload_system),
         storage_annotations=('\n' + (' ' * 12)).join(
             get_storage_annotations(all_storages)
         ),
@@ -561,53 +596,53 @@ def workload_create(args) -> None:
     )
 
   elif args.use_pathways and ensure_pathways_workload_prerequisites(
-      args, system
+      args, workload_system
   ):
     yml_string = PW_WORKLOAD_CREATE_YAML.format(
         args=args,
-        system=system,
-        topology=create_tpu_topology(system.accelerator_type, system),
-        machine_type=create_tpu_machine_type(system.accelerator_type, system),
+        topology=create_tpu_topology(workload_system),
+        machine_type=create_tpu_machine_type(workload_system),
         custom_pathways_proxy_server=append_custom_pathways_proxy_server(args),
         custom_pathways_server=append_custom_pathways_server(args),
         custom_pathways_worker=append_custom_pathways_worker(args),
         colocated_python_sidecar=append_custom_colocated_python_sidecar(args),
-        user_workload=get_user_workload_for_pathways(args, system),
+        user_workload=get_user_workload_for_pathways(args, workload_system),
         local_queue_name=LOCAL_QUEUE_NAME,
         autoprovisioning_args=autoprovisioning_args,
         placement_policy_label=placement_policy_label,
     )
   else:
+    use_sub_slicing = (
+        workload_scheduling == WorkloadScheduling.SUB_SLICING_AVAILABLE
+    )
+    if use_sub_slicing:
+      xpk_print('Workload will be scheduled using the Sub-slicing feature.')
+
     container, debugging_dashboard_id = get_user_workload_container(
-        args, system
+        args, workload_system
     )
     yml_string = WORKLOAD_CREATE_YAML.format(
         args=args,
         container=container,
-        vms_per_slice=(
-            compute_vms_per_slice(args.sub_slicing_topology)
-            if system.accelerator_type == AcceleratorType.TPU
-            and FeatureFlags.SUB_SLICING_ENABLED
-            and args.sub_slicing_topology is not None
-            else system.vms_per_slice
-        ),
-        affinity=get_cpu_affinity(system.accelerator_type),
-        accelerator_label=create_accelerator_label(
-            system.accelerator_type, system
-        ),
+        vms_per_slice=workload_system.vms_per_slice,
+        affinity=get_cpu_affinity(workload_system.accelerator_type),
+        accelerator_label=create_accelerator_label(workload_system),
         sub_slicing_annotations=(
-            ''
-            if not FeatureFlags.SUB_SLICING_ENABLED
-            or args.sub_slicing_topology is None
-            else ('\n' + (' ' * 16)).join(
-                create_sub_slicing_annotations(args.sub_slicing_topology)
+            ('\n' + (' ' * 16)).join(
+                create_sub_slicing_annotations(workload_system.topology)
             )
+            if use_sub_slicing
+            else ''
         ),
         placement_policy_label=placement_policy_label,
-        machine_label=create_machine_label(system.accelerator_type, system),
+        machine_label=(
+            create_machine_label(cluster_system)
+            if use_sub_slicing and cluster_system
+            else create_machine_label(workload_system)
+        ),
         local_queue_name=LOCAL_QUEUE_NAME,
         autoprovisioning_args=autoprovisioning_args,
-        volumes=get_volumes(args, system),
+        volumes=get_volumes(args, workload_system),
         storage_annotations=('\n' + (' ' * 16)).join(
            get_storage_annotations(all_storages)
         ),
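
The ('\n' + (' ' * 16)).join(...) idiom used for sub_slicing_annotations and storage_annotations exists because every entry after the first must be re-indented to line up under the same key once spliced into the WORKLOAD_CREATE_YAML template. A toy illustration:

    annotations = ['a: "1"', 'b: "2"', 'c: "3"']
    # The first entry inherits the template's own indentation; the join
    # prepends a newline plus 16 spaces to each later entry.
    print(('\n' + (' ' * 16)).join(annotations))
    # a: "1"
    #                 b: "2"
    #                 c: "3"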
@@ -615,10 +650,18 @@ def workload_create(args) -> None:
       tpu_toleration="""
       - operator: "Exists"
         key: google.com/tpu
-      """ if system.accelerator_type == AcceleratorType.TPU else '',
+      """ if workload_system.accelerator_type == AcceleratorType.TPU else '',
       failure_policy_rules=failure_policy_rules,
       pod_failure_policy=pod_failure_policy,
   )
+  if args.output_manifest_file:
+    with open(args.output_manifest_file, 'w', encoding='utf-8') as f:
+      f.write(yml_string)
+    xpk_print(
+        f'Workload {args.workload} manifest written to'
+        f' {args.output_manifest_file}'
+    )
+
   tmp = write_tmp_file(yml_string)
   command = f'kubectl apply -f {str(tmp)}'
   return_code = run_command_with_updates(command, 'Creating Workload')
@@ -632,7 +675,7 @@ def workload_create(args) -> None:
 
   # Get GKE outlier dashboard for TPU
   outlier_dashboard_id = None
-  if system.accelerator_type == AcceleratorType.TPU:
+  if workload_system.accelerator_type == AcceleratorType.TPU:
     outlier_dashboard_id = get_gke_outlier_dashboard(args)
 
   # Outlier and debugging dashboards
@@ -699,64 +742,6 @@ def workload_create(args) -> None:
   xpk_exit(0)
 
 
-def _validate_sub_slicing_availability():
-  return_code, sub_slicing_enabled = has_sub_slicing_enabled()
-  if return_code != 0:
-    xpk_print(
-        'Error: Unable to validate sub-slicing support on a given cluster.'
-    )
-    xpk_exit(1)
-
-  if not sub_slicing_enabled:
-    xpk_print(
-        'Error: Cluster has not been not set up for Sub-slicing. Please enable'
-        ' --sub-slicing in "cluster create" command first.'
-    )
-    xpk_exit(1)
-
-  return_code, current_version = get_installed_kueue_version(
-      dry_run_version=Version('0.13')
-  )
-  if return_code != 0 or not current_version:
-    xpk_print(
-        'Error: Unable to validate sub-slicing support on a given cluster.'
-    )
-    xpk_exit(1)
-
-  if current_version < SUB_SLICING_MINIMUM_KUEUE_VERSION:
-    xpk_print(
-        "Error: Current Kueue version ({current_version}) doesn't support"
-        ' Sub-slicing. The minimal required version is'
-        ' v{SUB_SLICING_MINIMUM_KUEUE_VERSION}. Please either update Kueue'
-        ' manually, or run "cluster create --sub-slicing" on the existing'
-        ' cluster.'
-    )
-    xpk_exit(1)
-
-
-def _validate_sub_slicing_topology(
-    system_characteristics: SystemCharacteristics, sub_slicing_topology: str
-) -> None:
-  if sub_slicing_topology not in SUB_SLICING_TOPOLOGIES:
-    xpk_print(
-        f'Error: --sub-slicing-topology={sub_slicing_topology} shape is'
-        f' invalid. It has to be one of: {", ".join(SUB_SLICING_TOPOLOGIES)}.'
-    )
-    xpk_exit(1)
-
-  if not is_topology_contained(
-      contained=sub_slicing_topology, container=system_characteristics.topology
-  ):
-    xpk_print(
-        f'Error: --sub-slicing-topology={sub_slicing_topology} shape is too'
-        ' large. The shape cannot be bigger than'
-        f' {system_characteristics.topology}.'
-    )
-    xpk_exit(1)
-
-  validate_sub_slicing_system(system_characteristics)
-
-
 def get_restart_exit_codes(args) -> list:
   exit_codes = [42]
   exit_codes.extend(range(127, 256, 1))
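
The deleted _validate_sub_slicing_topology relied on is_topology_contained to reject sub-slice shapes larger than the machine topology; that check is presumably folded into check_if_workload_can_schedule now. A hedged sketch of the containment rule, assuming 'AxBxC'-style topology strings compared axis by axis (the real helper in ..utils.topology may be more permissive):

    def is_topology_contained(contained: str, container: str) -> bool:
      # Assumed semantics: same dimensionality, and each axis must fit.
      inner = [int(n) for n in contained.split('x')]
      outer = [int(n) for n in container.split('x')]
      return len(inner) == len(outer) and all(
          a <= b for a, b in zip(inner, outer)
      )

    assert is_topology_contained('2x2x2', '4x4x4')
    assert not is_topology_contained('4x4x8', '4x4x4')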