xpk 0.13.0__py3-none-any.whl → 0.14.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. xpk/commands/batch.py +9 -2
  2. xpk/commands/cluster.py +128 -115
  3. xpk/commands/cluster_gcluster.py +77 -14
  4. xpk/commands/cluster_gcluster_test.py +177 -0
  5. xpk/commands/common.py +10 -28
  6. xpk/commands/info.py +11 -9
  7. xpk/commands/inspector.py +21 -10
  8. xpk/commands/job.py +25 -9
  9. xpk/commands/kind.py +38 -40
  10. xpk/commands/kjob_common.py +4 -4
  11. xpk/commands/run.py +9 -2
  12. xpk/commands/shell.py +13 -10
  13. xpk/commands/storage.py +21 -0
  14. xpk/commands/version.py +0 -4
  15. xpk/commands/workload.py +43 -22
  16. xpk/core/blueprint/blueprint_generator.py +4 -40
  17. xpk/core/blueprint/blueprint_test.py +0 -6
  18. xpk/core/capacity.py +6 -5
  19. xpk/core/cluster.py +91 -194
  20. xpk/core/cluster_private.py +6 -11
  21. xpk/core/commands.py +11 -18
  22. xpk/core/config.py +1 -1
  23. xpk/core/docker_image.py +3 -4
  24. xpk/core/gcloud_context.py +26 -2
  25. xpk/core/gcloud_context_test.py +96 -0
  26. xpk/core/gcluster_manager.py +0 -3
  27. xpk/core/jobset.py +4 -7
  28. xpk/core/kjob.py +14 -27
  29. xpk/core/kueue_manager.py +383 -0
  30. xpk/core/kueue_manager_test.py +542 -0
  31. xpk/core/monitoring.py +1 -1
  32. xpk/core/nap.py +10 -15
  33. xpk/core/network.py +17 -18
  34. xpk/core/nodepool.py +66 -77
  35. xpk/core/nodepool_test.py +198 -1
  36. xpk/core/pathways.py +5 -5
  37. xpk/core/ray.py +10 -14
  38. xpk/core/resources.py +6 -11
  39. xpk/core/scheduling.py +19 -1
  40. xpk/core/scheduling_test.py +31 -0
  41. xpk/core/system_characteristics.py +335 -229
  42. xpk/core/vertex.py +1 -1
  43. xpk/core/workload.py +7 -8
  44. xpk/main.py +2 -4
  45. xpk/parser/cluster.py +7 -0
  46. xpk/parser/cluster_test.py +66 -0
  47. xpk/parser/common.py +11 -0
  48. xpk/parser/workload.py +62 -25
  49. xpk/parser/workload_test.py +82 -0
  50. xpk/utils/feature_flags.py +28 -0
  51. xpk/utils/kueue.py +20 -0
  52. xpk/utils/templates.py +2 -0
  53. xpk/utils/topology.py +37 -0
  54. xpk/utils/topology_test.py +43 -0
  55. xpk/utils/validation.py +79 -55
  56. xpk/utils/validation_test.py +37 -0
  57. {xpk-0.13.0.dist-info → xpk-0.14.0.dist-info}/METADATA +6 -1
  58. xpk-0.14.0.dist-info/RECORD +112 -0
  59. xpk/core/kueue.py +0 -561
  60. xpk-0.13.0.dist-info/RECORD +0 -101
  61. {xpk-0.13.0.dist-info → xpk-0.14.0.dist-info}/WHEEL +0 -0
  62. {xpk-0.13.0.dist-info → xpk-0.14.0.dist-info}/entry_points.txt +0 -0
  63. {xpk-0.13.0.dist-info → xpk-0.14.0.dist-info}/licenses/LICENSE +0 -0
  64. {xpk-0.13.0.dist-info → xpk-0.14.0.dist-info}/top_level.txt +0 -0
xpk/core/blueprint/blueprint_generator.py CHANGED
@@ -32,7 +32,6 @@ from ..capacity import (
32
32
  )
33
33
  from ..system_characteristics import get_system_characteristics_by_device_type
34
34
  from .blueprint_definitions import Blueprint, DeploymentGroup, DeploymentModule
35
- from ..kueue import KUEUE_VERSION
36
35
 
37
36
  yaml_parser = yaml.YAML()
38
37
 
@@ -53,6 +52,7 @@ blueprint_dependencies_dir = {
53
52
 
54
53
  cluster_toolkit_url = "github.com/GoogleCloudPlatform/cluster-toolkit"
55
54
  cluster_toolkit_version = "v1.62.2"
55
+ common_cluster_labels = {"gke_product_type": "xpk"}
56
56
 
57
57
 
58
58
  class BlueprintGeneratorOutput:
@@ -216,26 +216,11 @@ class BlueprintGenerator:
216
216
  a3_megagpu_pool_0.settings.update({"static_node_count": num_nodes})
217
217
 
218
218
  set_placement_policy = capacity_type != CapacityType.SPOT
219
- num_chips = num_nodes * system.chips_per_vm
220
219
  workload = DeploymentModule(
221
220
  id="workload_component_install",
222
221
  source="modules/management/kubectl-apply",
223
222
  use=["gke_cluster"],
224
223
  settings={
225
- "kueue": {
226
- "install": True,
227
- "version": KUEUE_VERSION, # TAS feature-gates is enabled in CT
228
- "config_path": f'$(ghpc_stage("{blueprint_name}"))/kueue-xpk-configuration.yaml.tftpl',
229
- "config_template_vars": {
230
- "num_chips": num_chips,
231
- "reservation": (
232
- 1 if capacity_type == CapacityType.RESERVATION else 0
233
- ),
234
- "flex_start": (
235
- 1 if capacity_type == CapacityType.FLEX_START else 0
236
- ),
237
- },
238
- },
239
224
  "jobset": {"install": True, "version": "v0.7.2"},
240
225
  "apply_manifests": [{
241
226
  "source": f'$(ghpc_stage("{blueprint_name}"))/storage_crd.yaml'
@@ -298,6 +283,7 @@ class BlueprintGenerator:
298
283
  "deployment_name": blueprint_name,
299
284
  "region": region,
300
285
  "zone": zone,
286
+ "labels": common_cluster_labels,
301
287
  },
302
288
  )
303
289
 
@@ -598,24 +584,12 @@ class BlueprintGenerator:
598
584
  else:
599
585
  gpu_pool.settings.update({"static_node_count": num_nodes})
600
586
 
601
- num_chips = num_nodes * system.chips_per_vm
602
587
  workload_manager_install_id = "workload-manager-install"
603
588
  workload_manager_install = DeploymentModule(
604
589
  id=workload_manager_install_id,
605
590
  source="modules/management/kubectl-apply",
606
591
  use=[cluster_id],
607
592
  settings={
608
- "kueue": {
609
- "install": True,
610
- "version": KUEUE_VERSION, # TAS feature-gates is enabled in CT
611
- "config_path": f'$(ghpc_stage("{blueprint_name}"))/kueue-xpk-configuration.yaml.tftpl',
612
- "config_template_vars": {
613
- "num_chips": num_chips,
614
- "flex_start": (
615
- 1 if capacity_type == CapacityType.FLEX_START else 0
616
- ),
617
- },
618
- },
619
593
  "jobset": {"install": True, "version": "v0.7.2"},
620
594
  "apply_manifests": [
621
595
  {"source": nccl_installer_path},
@@ -676,6 +650,7 @@ class BlueprintGenerator:
676
650
  "deployment_name": blueprint_name,
677
651
  "region": region,
678
652
  "zone": zone,
653
+ "labels": common_cluster_labels,
679
654
  },
680
655
  )
681
656
 
@@ -884,24 +859,12 @@ class BlueprintGenerator:
884
859
  else:
885
860
  gpu_pool.settings.update({"static_node_count": num_nodes})
886
861
 
887
- num_chips = num_nodes * system.chips_per_vm
888
862
  workload_manager_install_id = "workload-manager-install"
889
863
  workload_manager_install = DeploymentModule(
890
864
  id=workload_manager_install_id,
891
865
  source="modules/management/kubectl-apply",
892
866
  use=[cluster_id],
893
867
  settings={
894
- "kueue": {
895
- "install": True,
896
- "version": KUEUE_VERSION, # TAS feature-gates is enabled in CT
897
- "config_path": f'$(ghpc_stage("{blueprint_name}"))/kueue-xpk-configuration.yaml.tftpl',
898
- "config_template_vars": {
899
- "num_chips": num_chips,
900
- "flex_start": (
901
- 1 if capacity_type == CapacityType.FLEX_START else 0
902
- ),
903
- },
904
- },
905
868
  "jobset": {"install": True, "version": "v0.7.2"},
906
869
  "apply_manifests": [
907
870
  {"source": nccl_installer_path},
@@ -962,6 +925,7 @@ class BlueprintGenerator:
962
925
  "deployment_name": blueprint_name,
963
926
  "region": region,
964
927
  "zone": zone,
928
+ "labels": common_cluster_labels,
965
929
  },
966
930
  )
967
931
 
xpk/core/blueprint/blueprint_test.py CHANGED
@@ -32,7 +32,6 @@ a3_spot_yaml_test_path = "src/xpk/core/blueprint/testing/data/a3_mega_spot.yaml"
32
32
  a3_ultra_yaml_test_path = "src/xpk/core/blueprint/testing/data/a3_ultra.yaml"
33
33
  a4_yaml_test_path = "src/xpk/core/blueprint/testing/data/a4.yaml"
34
34
  config_map_filename = "config-map.yaml.tftpl"
35
- kueue_conf_filename = "kueue-xpk-configuration.yaml.tftpl"
36
35
  tmp_test_dir = "/tmp/xpk_test"
37
36
 
38
37
 
@@ -82,11 +81,6 @@ def test_generate_a3_mega_blueprint():
82
81
  tmp_test_dir, "prefix", blueprint_name, config_map_filename
83
82
  )
84
83
  )
85
- assert os.path.exists(
86
- os.path.join(
87
- tmp_test_dir, "prefix", blueprint_name, kueue_conf_filename
88
- )
89
- )
90
84
 
91
85
  shutil.rmtree(tmp_test_dir)
92
86
 
xpk/core/capacity.py CHANGED
@@ -17,6 +17,7 @@ limitations under the License.
17
17
  import enum
18
18
 
19
19
  from ..utils.console import xpk_print, xpk_exit
20
+ from ..utils.kueue import is_queued_cluster
20
21
  from .commands import run_command_with_updates, run_command_for_value
21
22
 
22
23
  AUTOPROVISIONING_CONFIG_VALUE = 'AUTOPROVISION'
@@ -50,7 +51,7 @@ def print_reservations(args) -> int:
50
51
  """
51
52
  command = f'gcloud beta compute reservations list --project={args.project}'
52
53
  return_code = run_command_with_updates(
53
- command, 'Get all reservations in the project', args
54
+ command, 'Get all reservations in the project'
54
55
  )
55
56
  if return_code != 0:
56
57
  xpk_print(f'Get all reservations returned ERROR {return_code}')
@@ -119,7 +120,7 @@ def get_reservation_maintenance_interval(
119
120
  f' --project={project} --zone={zone} --format="value(specificReservation.instanceProperties.maintenanceInterval)"'
120
121
  )
121
122
  return_code, output = run_command_for_value(
122
- command, 'Get reservation maintenance interval', None
123
+ command, 'Get reservation maintenance interval'
123
124
  )
124
125
  if return_code != 0:
125
126
  xpk_print(f'Get reservation maintenance interval ERROR {return_code}')
@@ -143,7 +144,7 @@ def get_reservation_placement_policy(
143
144
  f' --project={project} --zone={zone} --format="value(resourcePolicies.policy)"'
144
145
  )
145
146
  return_code, output = run_command_for_value(
146
- command, 'Get reservation placement policy', None
147
+ command, 'Get reservation placement policy'
147
148
  )
148
149
  if return_code != 0:
149
150
  xpk_print(f'Get reservation placement policy ERROR {return_code}')
@@ -164,7 +165,7 @@ def verify_reservation_exists(args) -> int:
164
165
  f'gcloud beta compute reservations describe {args.reservation}'
165
166
  f' --project={args.project} --zone={args.zone}'
166
167
  )
167
- return_code = run_command_with_updates(command, 'Describe reservation', args)
168
+ return_code = run_command_with_updates(command, 'Describe reservation')
168
169
  if return_code != 0:
169
170
  xpk_print(f'Describe reservation returned ERROR {return_code}')
170
171
  xpk_print('Please confirm that your reservation name is correct.')
@@ -199,7 +200,7 @@ def get_capacity_arguments_from_capacity_type(
199
200
  ' --location-policy=ANY --reservation-affinity=none'
200
201
  f' --no-enable-autorepair --max-nodes={max_nodes}'
201
202
  )
202
- if args.num_slices <= 1:
203
+ if is_queued_cluster(args.num_slices):
203
204
  capacity_args += ' --enable-queued-provisioning'
204
205
  case CapacityType.RESERVATION:
205
206
  capacity_args = (