xpk 0.12.0__py3-none-any.whl → 0.14.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xpk/commands/batch.py +17 -10
- xpk/commands/cluster.py +137 -123
- xpk/commands/cluster_gcluster.py +77 -14
- xpk/commands/cluster_gcluster_test.py +177 -0
- xpk/commands/common.py +13 -27
- xpk/commands/info.py +11 -9
- xpk/commands/inspector.py +22 -11
- xpk/commands/job.py +53 -9
- xpk/commands/kind.py +38 -40
- xpk/commands/kjob_common.py +4 -4
- xpk/commands/run.py +9 -2
- xpk/commands/shell.py +13 -10
- xpk/commands/storage.py +26 -2
- xpk/commands/version.py +0 -4
- xpk/commands/workload.py +58 -30
- xpk/core/blueprint/blueprint_generator.py +4 -40
- xpk/core/blueprint/blueprint_test.py +0 -6
- xpk/core/capacity.py +6 -5
- xpk/core/cluster.py +96 -195
- xpk/core/cluster_private.py +9 -12
- xpk/core/commands.py +21 -25
- xpk/core/config.py +1 -1
- xpk/core/docker_image.py +17 -9
- xpk/core/docker_resources.py +9 -4
- xpk/core/gcloud_context.py +26 -2
- xpk/core/gcloud_context_test.py +96 -0
- xpk/core/gcluster_manager.py +0 -3
- xpk/core/jobset.py +5 -8
- xpk/core/kjob.py +19 -29
- xpk/core/kueue_manager.py +383 -0
- xpk/core/kueue_manager_test.py +542 -0
- xpk/core/monitoring.py +1 -1
- xpk/core/nap.py +11 -16
- xpk/core/network.py +18 -19
- xpk/core/nodepool.py +65 -71
- xpk/core/nodepool_test.py +198 -1
- xpk/core/pathways.py +9 -5
- xpk/core/ray.py +11 -15
- xpk/core/resources.py +15 -10
- xpk/core/scheduling.py +23 -1
- xpk/core/scheduling_test.py +31 -0
- xpk/core/system_characteristics.py +335 -229
- xpk/core/vertex.py +1 -1
- xpk/core/workload.py +7 -8
- xpk/main.py +3 -2
- xpk/parser/cluster.py +50 -0
- xpk/parser/cluster_test.py +66 -0
- xpk/parser/common.py +11 -0
- xpk/parser/workload.py +62 -25
- xpk/parser/workload_test.py +82 -0
- xpk/utils/execution_context.py +28 -0
- xpk/utils/feature_flags.py +28 -0
- xpk/utils/file.py +25 -10
- xpk/utils/kueue.py +20 -0
- xpk/utils/network.py +4 -0
- xpk/utils/templates.py +2 -0
- xpk/utils/topology.py +37 -0
- xpk/utils/topology_test.py +43 -0
- xpk/utils/validation.py +79 -55
- xpk/utils/validation_test.py +37 -0
- {xpk-0.12.0.dist-info → xpk-0.14.0.dist-info}/METADATA +6 -1
- xpk-0.14.0.dist-info/RECORD +112 -0
- xpk/core/kueue.py +0 -545
- xpk-0.12.0.dist-info/RECORD +0 -100
- {xpk-0.12.0.dist-info → xpk-0.14.0.dist-info}/WHEEL +0 -0
- {xpk-0.12.0.dist-info → xpk-0.14.0.dist-info}/entry_points.txt +0 -0
- {xpk-0.12.0.dist-info → xpk-0.14.0.dist-info}/licenses/LICENSE +0 -0
- {xpk-0.12.0.dist-info → xpk-0.14.0.dist-info}/top_level.txt +0 -0
|
@@ -32,7 +32,6 @@ from ..capacity import (
|
|
|
32
32
|
)
|
|
33
33
|
from ..system_characteristics import get_system_characteristics_by_device_type
|
|
34
34
|
from .blueprint_definitions import Blueprint, DeploymentGroup, DeploymentModule
|
|
35
|
-
from ..kueue import KUEUE_VERSION
|
|
36
35
|
|
|
37
36
|
yaml_parser = yaml.YAML()
|
|
38
37
|
|
|
@@ -53,6 +52,7 @@ blueprint_dependencies_dir = {
|
|
|
53
52
|
|
|
54
53
|
cluster_toolkit_url = "github.com/GoogleCloudPlatform/cluster-toolkit"
|
|
55
54
|
cluster_toolkit_version = "v1.62.2"
|
|
55
|
+
common_cluster_labels = {"gke_product_type": "xpk"}
|
|
56
56
|
|
|
57
57
|
|
|
58
58
|
class BlueprintGeneratorOutput:
|
|
@@ -216,26 +216,11 @@ class BlueprintGenerator:
|
|
|
216
216
|
a3_megagpu_pool_0.settings.update({"static_node_count": num_nodes})
|
|
217
217
|
|
|
218
218
|
set_placement_policy = capacity_type != CapacityType.SPOT
|
|
219
|
-
num_chips = num_nodes * system.chips_per_vm
|
|
220
219
|
workload = DeploymentModule(
|
|
221
220
|
id="workload_component_install",
|
|
222
221
|
source="modules/management/kubectl-apply",
|
|
223
222
|
use=["gke_cluster"],
|
|
224
223
|
settings={
|
|
225
|
-
"kueue": {
|
|
226
|
-
"install": True,
|
|
227
|
-
"version": KUEUE_VERSION, # TAS feature-gates is enabled in CT
|
|
228
|
-
"config_path": f'$(ghpc_stage("{blueprint_name}"))/kueue-xpk-configuration.yaml.tftpl',
|
|
229
|
-
"config_template_vars": {
|
|
230
|
-
"num_chips": num_chips,
|
|
231
|
-
"reservation": (
|
|
232
|
-
1 if capacity_type == CapacityType.RESERVATION else 0
|
|
233
|
-
),
|
|
234
|
-
"flex_start": (
|
|
235
|
-
1 if capacity_type == CapacityType.FLEX_START else 0
|
|
236
|
-
),
|
|
237
|
-
},
|
|
238
|
-
},
|
|
239
224
|
"jobset": {"install": True, "version": "v0.7.2"},
|
|
240
225
|
"apply_manifests": [{
|
|
241
226
|
"source": f'$(ghpc_stage("{blueprint_name}"))/storage_crd.yaml'
|
|
@@ -298,6 +283,7 @@ class BlueprintGenerator:
|
|
|
298
283
|
"deployment_name": blueprint_name,
|
|
299
284
|
"region": region,
|
|
300
285
|
"zone": zone,
|
|
286
|
+
"labels": common_cluster_labels,
|
|
301
287
|
},
|
|
302
288
|
)
|
|
303
289
|
|
|
@@ -598,24 +584,12 @@ class BlueprintGenerator:
|
|
|
598
584
|
else:
|
|
599
585
|
gpu_pool.settings.update({"static_node_count": num_nodes})
|
|
600
586
|
|
|
601
|
-
num_chips = num_nodes * system.chips_per_vm
|
|
602
587
|
workload_manager_install_id = "workload-manager-install"
|
|
603
588
|
workload_manager_install = DeploymentModule(
|
|
604
589
|
id=workload_manager_install_id,
|
|
605
590
|
source="modules/management/kubectl-apply",
|
|
606
591
|
use=[cluster_id],
|
|
607
592
|
settings={
|
|
608
|
-
"kueue": {
|
|
609
|
-
"install": True,
|
|
610
|
-
"version": KUEUE_VERSION, # TAS feature-gates is enabled in CT
|
|
611
|
-
"config_path": f'$(ghpc_stage("{blueprint_name}"))/kueue-xpk-configuration.yaml.tftpl',
|
|
612
|
-
"config_template_vars": {
|
|
613
|
-
"num_chips": num_chips,
|
|
614
|
-
"flex_start": (
|
|
615
|
-
1 if capacity_type == CapacityType.FLEX_START else 0
|
|
616
|
-
),
|
|
617
|
-
},
|
|
618
|
-
},
|
|
619
593
|
"jobset": {"install": True, "version": "v0.7.2"},
|
|
620
594
|
"apply_manifests": [
|
|
621
595
|
{"source": nccl_installer_path},
|
|
@@ -676,6 +650,7 @@ class BlueprintGenerator:
|
|
|
676
650
|
"deployment_name": blueprint_name,
|
|
677
651
|
"region": region,
|
|
678
652
|
"zone": zone,
|
|
653
|
+
"labels": common_cluster_labels,
|
|
679
654
|
},
|
|
680
655
|
)
|
|
681
656
|
|
|
@@ -884,24 +859,12 @@ class BlueprintGenerator:
|
|
|
884
859
|
else:
|
|
885
860
|
gpu_pool.settings.update({"static_node_count": num_nodes})
|
|
886
861
|
|
|
887
|
-
num_chips = num_nodes * system.chips_per_vm
|
|
888
862
|
workload_manager_install_id = "workload-manager-install"
|
|
889
863
|
workload_manager_install = DeploymentModule(
|
|
890
864
|
id=workload_manager_install_id,
|
|
891
865
|
source="modules/management/kubectl-apply",
|
|
892
866
|
use=[cluster_id],
|
|
893
867
|
settings={
|
|
894
|
-
"kueue": {
|
|
895
|
-
"install": True,
|
|
896
|
-
"version": KUEUE_VERSION, # TAS feature-gates is enabled in CT
|
|
897
|
-
"config_path": f'$(ghpc_stage("{blueprint_name}"))/kueue-xpk-configuration.yaml.tftpl',
|
|
898
|
-
"config_template_vars": {
|
|
899
|
-
"num_chips": num_chips,
|
|
900
|
-
"flex_start": (
|
|
901
|
-
1 if capacity_type == CapacityType.FLEX_START else 0
|
|
902
|
-
),
|
|
903
|
-
},
|
|
904
|
-
},
|
|
905
868
|
"jobset": {"install": True, "version": "v0.7.2"},
|
|
906
869
|
"apply_manifests": [
|
|
907
870
|
{"source": nccl_installer_path},
|
|
@@ -962,6 +925,7 @@ class BlueprintGenerator:
|
|
|
962
925
|
"deployment_name": blueprint_name,
|
|
963
926
|
"region": region,
|
|
964
927
|
"zone": zone,
|
|
928
|
+
"labels": common_cluster_labels,
|
|
965
929
|
},
|
|
966
930
|
)
|
|
967
931
|
|
|
@@ -32,7 +32,6 @@ a3_spot_yaml_test_path = "src/xpk/core/blueprint/testing/data/a3_mega_spot.yaml"
|
|
|
32
32
|
a3_ultra_yaml_test_path = "src/xpk/core/blueprint/testing/data/a3_ultra.yaml"
|
|
33
33
|
a4_yaml_test_path = "src/xpk/core/blueprint/testing/data/a4.yaml"
|
|
34
34
|
config_map_filename = "config-map.yaml.tftpl"
|
|
35
|
-
kueue_conf_filename = "kueue-xpk-configuration.yaml.tftpl"
|
|
36
35
|
tmp_test_dir = "/tmp/xpk_test"
|
|
37
36
|
|
|
38
37
|
|
|
@@ -82,11 +81,6 @@ def test_generate_a3_mega_blueprint():
|
|
|
82
81
|
tmp_test_dir, "prefix", blueprint_name, config_map_filename
|
|
83
82
|
)
|
|
84
83
|
)
|
|
85
|
-
assert os.path.exists(
|
|
86
|
-
os.path.join(
|
|
87
|
-
tmp_test_dir, "prefix", blueprint_name, kueue_conf_filename
|
|
88
|
-
)
|
|
89
|
-
)
|
|
90
84
|
|
|
91
85
|
shutil.rmtree(tmp_test_dir)
|
|
92
86
|
|
xpk/core/capacity.py
CHANGED
|
@@ -17,6 +17,7 @@ limitations under the License.
|
|
|
17
17
|
import enum
|
|
18
18
|
|
|
19
19
|
from ..utils.console import xpk_print, xpk_exit
|
|
20
|
+
from ..utils.kueue import is_queued_cluster
|
|
20
21
|
from .commands import run_command_with_updates, run_command_for_value
|
|
21
22
|
|
|
22
23
|
AUTOPROVISIONING_CONFIG_VALUE = 'AUTOPROVISION'
|
|
@@ -50,7 +51,7 @@ def print_reservations(args) -> int:
|
|
|
50
51
|
"""
|
|
51
52
|
command = f'gcloud beta compute reservations list --project={args.project}'
|
|
52
53
|
return_code = run_command_with_updates(
|
|
53
|
-
command, 'Get all reservations in the project'
|
|
54
|
+
command, 'Get all reservations in the project'
|
|
54
55
|
)
|
|
55
56
|
if return_code != 0:
|
|
56
57
|
xpk_print(f'Get all reservations returned ERROR {return_code}')
|
|
@@ -119,7 +120,7 @@ def get_reservation_maintenance_interval(
|
|
|
119
120
|
f' --project={project} --zone={zone} --format="value(specificReservation.instanceProperties.maintenanceInterval)"'
|
|
120
121
|
)
|
|
121
122
|
return_code, output = run_command_for_value(
|
|
122
|
-
command, 'Get reservation maintenance interval'
|
|
123
|
+
command, 'Get reservation maintenance interval'
|
|
123
124
|
)
|
|
124
125
|
if return_code != 0:
|
|
125
126
|
xpk_print(f'Get reservation maintenance interval ERROR {return_code}')
|
|
@@ -143,7 +144,7 @@ def get_reservation_placement_policy(
|
|
|
143
144
|
f' --project={project} --zone={zone} --format="value(resourcePolicies.policy)"'
|
|
144
145
|
)
|
|
145
146
|
return_code, output = run_command_for_value(
|
|
146
|
-
command, 'Get reservation placement policy'
|
|
147
|
+
command, 'Get reservation placement policy'
|
|
147
148
|
)
|
|
148
149
|
if return_code != 0:
|
|
149
150
|
xpk_print(f'Get reservation placement policy ERROR {return_code}')
|
|
@@ -164,7 +165,7 @@ def verify_reservation_exists(args) -> int:
|
|
|
164
165
|
f'gcloud beta compute reservations describe {args.reservation}'
|
|
165
166
|
f' --project={args.project} --zone={args.zone}'
|
|
166
167
|
)
|
|
167
|
-
return_code = run_command_with_updates(command, 'Describe reservation'
|
|
168
|
+
return_code = run_command_with_updates(command, 'Describe reservation')
|
|
168
169
|
if return_code != 0:
|
|
169
170
|
xpk_print(f'Describe reservation returned ERROR {return_code}')
|
|
170
171
|
xpk_print('Please confirm that your reservation name is correct.')
|
|
@@ -199,7 +200,7 @@ def get_capacity_arguments_from_capacity_type(
|
|
|
199
200
|
' --location-policy=ANY --reservation-affinity=none'
|
|
200
201
|
f' --no-enable-autorepair --max-nodes={max_nodes}'
|
|
201
202
|
)
|
|
202
|
-
if args.num_slices
|
|
203
|
+
if is_queued_cluster(args.num_slices):
|
|
203
204
|
capacity_args += ' --enable-queued-provisioning'
|
|
204
205
|
case CapacityType.RESERVATION:
|
|
205
206
|
capacity_args = (
|