xpk 0.14.4__py3-none-any.whl → 0.15.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- integration/gcluster_a3mega_test.py +11 -0
- integration/gcluster_a3ultra_test.py +11 -0
- integration/gcluster_a4_test.py +11 -0
- xpk/commands/cluster.py +57 -21
- xpk/commands/cluster_gcluster.py +25 -5
- xpk/commands/cluster_gcluster_test.py +11 -2
- xpk/commands/cluster_test.py +233 -12
- xpk/commands/config.py +3 -5
- xpk/commands/kind.py +1 -1
- xpk/commands/storage.py +8 -10
- xpk/commands/workload.py +28 -12
- xpk/commands/workload_test.py +3 -3
- xpk/core/blueprint/blueprint_generator.py +70 -33
- xpk/core/blueprint/blueprint_test.py +9 -0
- xpk/core/capacity.py +46 -8
- xpk/core/capacity_test.py +32 -1
- xpk/core/cluster.py +37 -57
- xpk/core/cluster_test.py +95 -0
- xpk/core/commands.py +4 -10
- xpk/core/config.py +9 -2
- xpk/core/gcloud_context.py +18 -12
- xpk/core/gcloud_context_test.py +111 -1
- xpk/core/kjob.py +6 -9
- xpk/core/kueue_manager.py +192 -32
- xpk/core/kueue_manager_test.py +132 -4
- xpk/core/nodepool.py +21 -29
- xpk/core/nodepool_test.py +17 -15
- xpk/core/scheduling.py +16 -1
- xpk/core/scheduling_test.py +85 -6
- xpk/core/system_characteristics.py +77 -19
- xpk/core/system_characteristics_test.py +80 -5
- xpk/core/telemetry.py +263 -0
- xpk/core/telemetry_test.py +211 -0
- xpk/main.py +31 -13
- xpk/parser/cluster.py +48 -9
- xpk/parser/cluster_test.py +42 -3
- xpk/parser/workload.py +12 -0
- xpk/parser/workload_test.py +4 -4
- xpk/telemetry_uploader.py +29 -0
- xpk/templates/kueue_gke_default_topology.yaml.j2 +1 -1
- xpk/templates/kueue_sub_slicing_topology.yaml.j2 +3 -8
- xpk/utils/console.py +41 -10
- xpk/utils/console_test.py +106 -0
- xpk/utils/feature_flags.py +7 -1
- xpk/utils/file.py +4 -1
- xpk/utils/topology.py +4 -0
- xpk/utils/user_agent.py +35 -0
- xpk/utils/user_agent_test.py +44 -0
- xpk/utils/user_input.py +48 -0
- xpk/utils/user_input_test.py +92 -0
- xpk/utils/validation.py +0 -11
- xpk/utils/versions.py +31 -0
- {xpk-0.14.4.dist-info → xpk-0.15.0.dist-info}/METADATA +113 -92
- {xpk-0.14.4.dist-info → xpk-0.15.0.dist-info}/RECORD +58 -48
- {xpk-0.14.4.dist-info → xpk-0.15.0.dist-info}/WHEEL +0 -0
- {xpk-0.14.4.dist-info → xpk-0.15.0.dist-info}/entry_points.txt +0 -0
- {xpk-0.14.4.dist-info → xpk-0.15.0.dist-info}/licenses/LICENSE +0 -0
- {xpk-0.14.4.dist-info → xpk-0.15.0.dist-info}/top_level.txt +0 -0
|
@@ -19,10 +19,13 @@ import shutil
|
|
|
19
19
|
from typing import Optional
|
|
20
20
|
|
|
21
21
|
from ruamel import yaml
|
|
22
|
+
from packaging.version import parse
|
|
22
23
|
|
|
23
24
|
from ...utils.console import xpk_exit, xpk_print
|
|
25
|
+
from ...utils.versions import ReleaseChannel
|
|
24
26
|
from ...utils.file import ensure_directory_exists
|
|
25
27
|
|
|
28
|
+
|
|
26
29
|
from ..capacity import (
|
|
27
30
|
H100_DEVICE_TYPE,
|
|
28
31
|
B200_DEVICE_TYPE,
|
|
@@ -84,6 +87,8 @@ class BlueprintGenerator:
|
|
|
84
87
|
region: str,
|
|
85
88
|
zone: str,
|
|
86
89
|
auth_cidr: str,
|
|
90
|
+
cluster_version: str,
|
|
91
|
+
release_channel: ReleaseChannel,
|
|
87
92
|
prefix: str = "",
|
|
88
93
|
num_nodes: int = 2,
|
|
89
94
|
pods_ip_cidr_range: str = "10.4.0.0/14",
|
|
@@ -142,11 +147,17 @@ class BlueprintGenerator:
|
|
|
142
147
|
},
|
|
143
148
|
)
|
|
144
149
|
|
|
150
|
+
sanitized_version = cluster_version.replace("-", "+", 1)
|
|
151
|
+
version = parse(sanitized_version)
|
|
152
|
+
version_prefix = f"{version.major}.{version.minor}"
|
|
145
153
|
gke_cluster = DeploymentModule(
|
|
146
154
|
id="gke_cluster",
|
|
147
155
|
source="modules/scheduler/gke-cluster",
|
|
148
156
|
use=[primary_vpc_name, gpu_subnets_name],
|
|
149
157
|
settings={
|
|
158
|
+
"release_channel": release_channel.value,
|
|
159
|
+
"version_prefix": version_prefix,
|
|
160
|
+
"min_master_version": cluster_version,
|
|
150
161
|
"prefix_with_deployment_name": False,
|
|
151
162
|
"name_suffix": cluster_name,
|
|
152
163
|
"enable_private_endpoint": False,
|
|
@@ -171,6 +182,16 @@ class BlueprintGenerator:
|
|
|
171
182
|
},
|
|
172
183
|
outputs=["instructions"],
|
|
173
184
|
)
|
|
185
|
+
if release_channel != ReleaseChannel.RAPID:
|
|
186
|
+
gke_cluster.set_setting(
|
|
187
|
+
"maintenance_exclusions",
|
|
188
|
+
[{
|
|
189
|
+
"name": "no-minor-or-node-upgrades-indefinite",
|
|
190
|
+
"start_time": "2024-12-01T00:00:00Z",
|
|
191
|
+
"end_time": "2026-01-16T00:00:00Z",
|
|
192
|
+
"exclusion_scope": "NO_MINOR_OR_NODE_UPGRADES",
|
|
193
|
+
}],
|
|
194
|
+
)
|
|
174
195
|
|
|
175
196
|
group_placement_0 = DeploymentModule(
|
|
176
197
|
id="group_placement_0",
|
|
@@ -215,6 +236,9 @@ class BlueprintGenerator:
|
|
|
215
236
|
else:
|
|
216
237
|
a3_megagpu_pool_0.update_settings({"static_node_count": num_nodes})
|
|
217
238
|
|
|
239
|
+
if release_channel == ReleaseChannel.RAPID:
|
|
240
|
+
a3_megagpu_pool_0.set_setting("auto_upgrade", True)
|
|
241
|
+
|
|
218
242
|
set_placement_policy = capacity_type != CapacityType.SPOT
|
|
219
243
|
workload = DeploymentModule(
|
|
220
244
|
id="workload_component_install",
|
|
@@ -391,6 +415,8 @@ class BlueprintGenerator:
|
|
|
391
415
|
zone: str,
|
|
392
416
|
auth_cidr: str,
|
|
393
417
|
system_node_pool_machine_type: str,
|
|
418
|
+
cluster_version: str,
|
|
419
|
+
release_channel: ReleaseChannel,
|
|
394
420
|
reservation: Optional[str | None] = None,
|
|
395
421
|
gcs_bucket: Optional[str | None] = None,
|
|
396
422
|
num_nodes: int = 2,
|
|
@@ -480,28 +506,19 @@ class BlueprintGenerator:
|
|
|
480
506
|
},
|
|
481
507
|
},
|
|
482
508
|
)
|
|
509
|
+
|
|
510
|
+
sanitized_version = cluster_version.replace("-", "+", 1)
|
|
511
|
+
version = parse(sanitized_version)
|
|
512
|
+
version_prefix = f"{version.major}.{version.minor}"
|
|
483
513
|
cluster_id = f"{cluster_name}-a3-ultragpu-cluster"
|
|
484
514
|
a3_ultra_cluster = DeploymentModule(
|
|
485
515
|
id=cluster_id,
|
|
486
516
|
source="modules/scheduler/gke-cluster",
|
|
487
517
|
use=[net_0_id],
|
|
488
518
|
settings={
|
|
489
|
-
"release_channel":
|
|
490
|
-
|
|
491
|
-
|
|
492
|
-
else "RAPID"
|
|
493
|
-
),
|
|
494
|
-
"version_prefix": "1.32.",
|
|
495
|
-
"maintenance_exclusions": (
|
|
496
|
-
[]
|
|
497
|
-
if capacity_type == CapacityType.FLEX_START
|
|
498
|
-
else [{
|
|
499
|
-
"name": "no-minor-or-node-upgrades-indefinite",
|
|
500
|
-
"start_time": "2024-12-01T00:00:00Z",
|
|
501
|
-
"end_time": "2025-12-22T00:00:00Z",
|
|
502
|
-
"exclusion_scope": "NO_MINOR_OR_NODE_UPGRADES",
|
|
503
|
-
}]
|
|
504
|
-
),
|
|
519
|
+
"release_channel": release_channel.value,
|
|
520
|
+
"version_prefix": version_prefix,
|
|
521
|
+
"min_cluster_version": cluster_version,
|
|
505
522
|
"prefix_with_deployment_name": False,
|
|
506
523
|
"name_suffix": cluster_name,
|
|
507
524
|
"system_node_pool_machine_type": system_node_pool_machine_type,
|
|
@@ -537,6 +554,17 @@ class BlueprintGenerator:
|
|
|
537
554
|
},
|
|
538
555
|
outputs=["instructions"],
|
|
539
556
|
)
|
|
557
|
+
if release_channel != ReleaseChannel.RAPID:
|
|
558
|
+
a3_ultra_cluster.set_setting(
|
|
559
|
+
"maintenance_exclusions",
|
|
560
|
+
[{
|
|
561
|
+
"name": "no-minor-or-node-upgrades-indefinite",
|
|
562
|
+
"start_time": "2024-12-01T00:00:00Z",
|
|
563
|
+
"end_time": "2026-01-16T00:00:00Z",
|
|
564
|
+
"exclusion_scope": "NO_MINOR_OR_NODE_UPGRADES",
|
|
565
|
+
}],
|
|
566
|
+
)
|
|
567
|
+
|
|
540
568
|
system, _ = get_system_characteristics_by_device_type(a3ultra_device_type)
|
|
541
569
|
if system is None:
|
|
542
570
|
xpk_print(
|
|
@@ -584,6 +612,9 @@ class BlueprintGenerator:
|
|
|
584
612
|
else:
|
|
585
613
|
gpu_pool.update_settings({"static_node_count": num_nodes})
|
|
586
614
|
|
|
615
|
+
if release_channel == ReleaseChannel.RAPID:
|
|
616
|
+
gpu_pool.set_setting("auto_upgrade", True)
|
|
617
|
+
|
|
587
618
|
workload_manager_install_id = "workload-manager-install"
|
|
588
619
|
workload_manager_install = DeploymentModule(
|
|
589
620
|
id=workload_manager_install_id,
|
|
@@ -674,6 +705,8 @@ class BlueprintGenerator:
|
|
|
674
705
|
zone: str,
|
|
675
706
|
auth_cidr: str,
|
|
676
707
|
system_node_pool_machine_type: str,
|
|
708
|
+
cluster_version: str,
|
|
709
|
+
release_channel: ReleaseChannel,
|
|
677
710
|
reservation: Optional[str | None] = None,
|
|
678
711
|
gcs_bucket: Optional[str | None] = None,
|
|
679
712
|
num_nodes: int = 2,
|
|
@@ -761,12 +794,19 @@ class BlueprintGenerator:
|
|
|
761
794
|
},
|
|
762
795
|
},
|
|
763
796
|
)
|
|
797
|
+
|
|
798
|
+
sanitized_version = cluster_version.replace("-", "+", 1)
|
|
799
|
+
version = parse(sanitized_version)
|
|
800
|
+
version_prefix = f"{version.major}.{version.minor}"
|
|
764
801
|
cluster_id = f"{cluster_name}-a4-cluster"
|
|
765
802
|
a4_cluster = DeploymentModule(
|
|
766
803
|
id=cluster_id,
|
|
767
804
|
source="modules/scheduler/gke-cluster",
|
|
768
805
|
use=[net_0_id],
|
|
769
806
|
settings={
|
|
807
|
+
"release_channel": release_channel.value,
|
|
808
|
+
"version_prefix": version_prefix,
|
|
809
|
+
"min_cluster_version": cluster_version,
|
|
770
810
|
"system_node_pool_machine_type": system_node_pool_machine_type,
|
|
771
811
|
"system_node_pool_node_count": {
|
|
772
812
|
"total_min_nodes": system_node_pool_min_node_count,
|
|
@@ -791,25 +831,20 @@ class BlueprintGenerator:
|
|
|
791
831
|
" alias_ip_range=[]}],"
|
|
792
832
|
f" {cluster_name}-rdma-net.subnetwork_interfaces_gke))"
|
|
793
833
|
),
|
|
794
|
-
"version_prefix": "1.32.",
|
|
795
|
-
"release_channel": (
|
|
796
|
-
"UNSPECIFIED"
|
|
797
|
-
if capacity_type == CapacityType.FLEX_START
|
|
798
|
-
else "RAPID"
|
|
799
|
-
),
|
|
800
|
-
"maintenance_exclusions": (
|
|
801
|
-
[]
|
|
802
|
-
if capacity_type == CapacityType.FLEX_START
|
|
803
|
-
else [{
|
|
804
|
-
"name": "no-minor-or-node-upgrades-indefinite",
|
|
805
|
-
"start_time": "2024-12-01T00:00:00Z",
|
|
806
|
-
"end_time": "2025-12-22T00:00:00Z",
|
|
807
|
-
"exclusion_scope": "NO_MINOR_OR_NODE_UPGRADES",
|
|
808
|
-
}]
|
|
809
|
-
),
|
|
810
834
|
},
|
|
811
835
|
outputs=["instructions"],
|
|
812
836
|
)
|
|
837
|
+
if release_channel != ReleaseChannel.RAPID:
|
|
838
|
+
a4_cluster.set_setting(
|
|
839
|
+
"maintenance_exclusions",
|
|
840
|
+
[{
|
|
841
|
+
"name": "no-minor-or-node-upgrades-indefinite",
|
|
842
|
+
"start_time": "2024-12-01T00:00:00Z",
|
|
843
|
+
"end_time": "2026-01-16T00:00:00Z",
|
|
844
|
+
"exclusion_scope": "NO_MINOR_OR_NODE_UPGRADES",
|
|
845
|
+
}],
|
|
846
|
+
)
|
|
847
|
+
|
|
813
848
|
system, _ = get_system_characteristics_by_device_type(a4_device_type)
|
|
814
849
|
if system is None:
|
|
815
850
|
xpk_print(
|
|
@@ -859,6 +894,9 @@ class BlueprintGenerator:
|
|
|
859
894
|
else:
|
|
860
895
|
gpu_pool.update_settings({"static_node_count": num_nodes})
|
|
861
896
|
|
|
897
|
+
if release_channel == ReleaseChannel.RAPID:
|
|
898
|
+
gpu_pool.set_setting("auto_upgrade", True)
|
|
899
|
+
|
|
862
900
|
workload_manager_install_id = "workload-manager-install"
|
|
863
901
|
workload_manager_install = DeploymentModule(
|
|
864
902
|
id=workload_manager_install_id,
|
|
@@ -1019,7 +1057,6 @@ class BlueprintGenerator:
|
|
|
1019
1057
|
"enable_flex_start": True,
|
|
1020
1058
|
"enable_queued_provisioning": True,
|
|
1021
1059
|
"autoscaling_total_min_nodes": 0,
|
|
1022
|
-
"release_channel": "UNSPECIFIED",
|
|
1023
1060
|
"auto_repair": False,
|
|
1024
1061
|
"auto_upgrade": False,
|
|
1025
1062
|
}
|
|
@@ -22,6 +22,7 @@ import ruamel.yaml
|
|
|
22
22
|
from xpk.core.blueprint.blueprint_definitions import Blueprint
|
|
23
23
|
from xpk.core.blueprint.blueprint_generator import BlueprintGenerator
|
|
24
24
|
from xpk.core.capacity import CapacityType
|
|
25
|
+
from xpk.utils.versions import ReleaseChannel
|
|
25
26
|
|
|
26
27
|
yaml = ruamel.yaml.YAML()
|
|
27
28
|
|
|
@@ -60,6 +61,8 @@ def test_generate_a3_mega_blueprint():
|
|
|
60
61
|
reservation="test-reservation",
|
|
61
62
|
capacity_type=CapacityType.RESERVATION,
|
|
62
63
|
system_node_pool_min_node_count=5,
|
|
64
|
+
release_channel=ReleaseChannel.RAPID,
|
|
65
|
+
cluster_version="1.2.3",
|
|
63
66
|
)
|
|
64
67
|
|
|
65
68
|
assert bp.blueprint_file.endswith("/prefix/xpk-gke-a3-megagpu.yaml")
|
|
@@ -99,6 +102,8 @@ def test_generate_a3_mega_spot_blueprint():
|
|
|
99
102
|
auth_cidr="10.0.0.0/32",
|
|
100
103
|
capacity_type=CapacityType.SPOT,
|
|
101
104
|
system_node_pool_min_node_count=5,
|
|
105
|
+
release_channel=ReleaseChannel.RAPID,
|
|
106
|
+
cluster_version="1.2.3",
|
|
102
107
|
)
|
|
103
108
|
|
|
104
109
|
assert bp.blueprint_file.endswith("/prefix/xpk-gke-a3-megagpu.yaml")
|
|
@@ -135,6 +140,8 @@ def test_generate_a3_ultra_blueprint():
|
|
|
135
140
|
capacity_type=CapacityType.RESERVATION,
|
|
136
141
|
gcs_bucket="test-bucket",
|
|
137
142
|
prefix="testdir",
|
|
143
|
+
release_channel=ReleaseChannel.RAPID,
|
|
144
|
+
cluster_version="1.2.3",
|
|
138
145
|
)
|
|
139
146
|
with open(a3_ultra_yaml_test_path, encoding="utf-8") as stream:
|
|
140
147
|
ctk_yaml = yaml.load(stream)
|
|
@@ -180,6 +187,8 @@ def test_generate_a4_blueprint():
|
|
|
180
187
|
capacity_type=CapacityType.RESERVATION,
|
|
181
188
|
gcs_bucket="test-bucket",
|
|
182
189
|
prefix="testdir",
|
|
190
|
+
release_channel=ReleaseChannel.RAPID,
|
|
191
|
+
cluster_version="1.2.3",
|
|
183
192
|
)
|
|
184
193
|
with open(a4_yaml_test_path, encoding="utf-8") as stream:
|
|
185
194
|
ctk_yaml = yaml.load(stream)
|
xpk/core/capacity.py
CHANGED
|
@@ -115,9 +115,12 @@ def get_reservation_maintenance_interval(
|
|
|
115
115
|
Returns:
|
|
116
116
|
0 if successful and 1 otherwise.
|
|
117
117
|
"""
|
|
118
|
+
reservation_project, reservation_name = get_reservation_project_and_name(
|
|
119
|
+
reservation, project
|
|
120
|
+
)
|
|
118
121
|
command = (
|
|
119
|
-
f'gcloud beta compute reservations describe {
|
|
120
|
-
f' --project={
|
|
122
|
+
f'gcloud beta compute reservations describe {reservation_name}'
|
|
123
|
+
f' --project={reservation_project} --zone={zone} --format="value(specificReservation.instanceProperties.maintenanceInterval)"'
|
|
121
124
|
)
|
|
122
125
|
return_code, output = run_command_for_value(
|
|
123
126
|
command, 'Get reservation maintenance interval'
|
|
@@ -139,9 +142,12 @@ def get_reservation_placement_policy(
|
|
|
139
142
|
Returns:
|
|
140
143
|
0 if successful and 1 otherwise.
|
|
141
144
|
"""
|
|
145
|
+
reservation_project, reservation_name = get_reservation_project_and_name(
|
|
146
|
+
reservation, project
|
|
147
|
+
)
|
|
142
148
|
command = (
|
|
143
|
-
f'gcloud beta compute reservations describe {
|
|
144
|
-
f' --project={
|
|
149
|
+
f'gcloud beta compute reservations describe {reservation_name}'
|
|
150
|
+
f' --project={reservation_project} --zone={zone} --format="value(resourcePolicies.policy)"'
|
|
145
151
|
)
|
|
146
152
|
return_code, output = run_command_for_value(
|
|
147
153
|
command, 'Get reservation placement policy'
|
|
@@ -156,9 +162,12 @@ def get_reservation_deployment_type(
|
|
|
156
162
|
reservation: str, zone: str, project: str
|
|
157
163
|
) -> str:
|
|
158
164
|
"""Get reservation deployment type."""
|
|
165
|
+
reservation_project, reservation_name = get_reservation_project_and_name(
|
|
166
|
+
reservation, project
|
|
167
|
+
)
|
|
159
168
|
command = (
|
|
160
|
-
f'gcloud beta compute reservations describe {
|
|
161
|
-
f' --project={
|
|
169
|
+
f'gcloud beta compute reservations describe {reservation_name}'
|
|
170
|
+
f' --project={reservation_project} --zone={zone} --format="value(deploymentType)"'
|
|
162
171
|
)
|
|
163
172
|
return_code, output = run_command_for_value(
|
|
164
173
|
command, 'Get reservation deployment type', dry_run_return_val='DENSE'
|
|
@@ -178,9 +187,12 @@ def verify_reservation_exists(args) -> int:
|
|
|
178
187
|
Returns:
|
|
179
188
|
0 if successful and 1 otherwise.
|
|
180
189
|
"""
|
|
190
|
+
reservation_project, reservation_name = get_reservation_project_and_name(
|
|
191
|
+
args.reservation, args.project
|
|
192
|
+
)
|
|
181
193
|
command = (
|
|
182
|
-
f'gcloud beta compute reservations describe {
|
|
183
|
-
f' --project={
|
|
194
|
+
f'gcloud beta compute reservations describe {reservation_name}'
|
|
195
|
+
f' --project={reservation_project} --zone={args.zone}'
|
|
184
196
|
)
|
|
185
197
|
return_code = run_command_with_updates(command, 'Describe reservation')
|
|
186
198
|
if return_code != 0:
|
|
@@ -264,3 +276,29 @@ def get_capacity_node_selectors_from_capacity_type(
|
|
|
264
276
|
)
|
|
265
277
|
return_code = 1
|
|
266
278
|
return node_selector, return_code
|
|
279
|
+
|
|
280
|
+
|
|
281
|
+
def get_reservation_project_and_name(
|
|
282
|
+
reservation_name_or_path: str, cluster_project: str
|
|
283
|
+
) -> tuple[str, str]:
|
|
284
|
+
"""Get the reservation project and name.
|
|
285
|
+
|
|
286
|
+
Args:
|
|
287
|
+
reservation_name_or_path: either reservation name or reservation path in format
|
|
288
|
+
projects/RESERVATION_PROJECT_ID/reservations/RESERVATION_NAME
|
|
289
|
+
cluster_project: the cluster project
|
|
290
|
+
|
|
291
|
+
Returns:
|
|
292
|
+
Tuple with reservation project and reservation name.
|
|
293
|
+
"""
|
|
294
|
+
if '/' not in reservation_name_or_path:
|
|
295
|
+
return cluster_project, reservation_name_or_path
|
|
296
|
+
reservation_parts = reservation_name_or_path.split('/')
|
|
297
|
+
if (
|
|
298
|
+
len(reservation_parts) != 4
|
|
299
|
+
or reservation_parts[0] != 'projects'
|
|
300
|
+
or reservation_parts[2] != 'reservations'
|
|
301
|
+
):
|
|
302
|
+
xpk_print('Unable to parse reservation: ', reservation_name_or_path)
|
|
303
|
+
xpk_exit(1)
|
|
304
|
+
return reservation_parts[1], reservation_parts[3]
|
xpk/core/capacity_test.py
CHANGED
|
@@ -16,7 +16,7 @@ limitations under the License.
|
|
|
16
16
|
|
|
17
17
|
import pytest
|
|
18
18
|
from unittest.mock import MagicMock, patch
|
|
19
|
-
from .capacity import get_reservation_deployment_type
|
|
19
|
+
from .capacity import get_reservation_deployment_type, get_reservation_project_and_name
|
|
20
20
|
|
|
21
21
|
|
|
22
22
|
@patch('xpk.core.capacity.xpk_print')
|
|
@@ -48,3 +48,34 @@ def test_get_reservation_deployment_type_returns_deployment_type_when_command_su
|
|
|
48
48
|
reservation='reservation', zone='zone', project='project'
|
|
49
49
|
)
|
|
50
50
|
assert result == 'DENSE'
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def test_get_reservation_project_and_name_parses_local_reservation():
|
|
54
|
+
project, name = get_reservation_project_and_name(
|
|
55
|
+
'test-reservation', 'cluster-project'
|
|
56
|
+
)
|
|
57
|
+
|
|
58
|
+
assert project == 'cluster-project'
|
|
59
|
+
assert name == 'test-reservation'
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def test_get_reservation_project_and_name_parses_shared_reservation():
|
|
63
|
+
project, name = get_reservation_project_and_name(
|
|
64
|
+
'projects/reservation-project/reservations/test-reservation',
|
|
65
|
+
'cluster-project',
|
|
66
|
+
)
|
|
67
|
+
|
|
68
|
+
assert project == 'reservation-project'
|
|
69
|
+
assert name == 'test-reservation'
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
@patch('xpk.core.capacity.xpk_print')
|
|
73
|
+
def test_get_reservation_project_and_name_fails_for_invalid_reservation(
|
|
74
|
+
xpk_print: MagicMock, mocker
|
|
75
|
+
):
|
|
76
|
+
with pytest.raises(SystemExit):
|
|
77
|
+
get_reservation_project_and_name(
|
|
78
|
+
'invalid/reservation',
|
|
79
|
+
'cluster-project',
|
|
80
|
+
)
|
|
81
|
+
assert 'Unable to parse reservation' in xpk_print.mock_calls[0].args[0]
|
xpk/core/cluster.py
CHANGED
|
@@ -729,78 +729,58 @@ def update_cluster_with_gcsfuse_driver_if_necessary(args) -> int:
|
|
|
729
729
|
return 0
|
|
730
730
|
|
|
731
731
|
|
|
732
|
-
def
|
|
733
|
-
"""
|
|
732
|
+
def get_cluster_credentials(args) -> int:
|
|
733
|
+
"""Run cluster configuration command to set the kubectl config.
|
|
734
734
|
|
|
735
735
|
Args:
|
|
736
736
|
args: user provided arguments for running the command.
|
|
737
737
|
|
|
738
738
|
Returns:
|
|
739
|
-
0 if
|
|
739
|
+
0 if successful and 1 otherwise.
|
|
740
740
|
"""
|
|
741
|
-
|
|
742
|
-
xpk_print('Testing credentials with kubectl...')
|
|
743
|
-
kubectl_command = 'kubectl get pods'
|
|
744
|
-
kubectl_return_code, kubectl_output = run_command_for_value(
|
|
745
|
-
kubectl_command, 'kubectl get pods'
|
|
746
|
-
)
|
|
747
|
-
if kubectl_return_code == 0:
|
|
748
|
-
xpk_print('Credentials test succeeded.')
|
|
749
|
-
return 0
|
|
750
|
-
|
|
751
|
-
dns_endpoint_error = (
|
|
752
|
-
'control_plane_endpoints_config.dns_endpoint_config.allow_external_traffic'
|
|
753
|
-
' is disabled'
|
|
754
|
-
)
|
|
755
|
-
if dns_endpoint_error not in kubectl_output:
|
|
756
|
-
xpk_print(f'kubectl failed with an unhandled error: {kubectl_output}')
|
|
757
|
-
xpk_exit(kubectl_return_code)
|
|
758
|
-
xpk_print(
|
|
759
|
-
'Detected DNS endpoint-related error. Retrying without --dns-endpoint'
|
|
760
|
-
' flag...'
|
|
761
|
-
)
|
|
762
|
-
|
|
763
741
|
location = get_cluster_location(args.project, args.cluster, args.zone)
|
|
764
|
-
|
|
765
|
-
|
|
766
|
-
|
|
767
|
-
|
|
768
|
-
|
|
769
|
-
|
|
770
|
-
)
|
|
771
|
-
return_code = run_command_with_updates(
|
|
772
|
-
without_dns_command, 'get-credentials to cluster', verbose=False
|
|
742
|
+
|
|
743
|
+
return_code = _get_credentials(
|
|
744
|
+
project=args.project,
|
|
745
|
+
cluster=args.cluster,
|
|
746
|
+
location=location,
|
|
747
|
+
dns_endpoint=True,
|
|
773
748
|
)
|
|
774
749
|
if return_code != 0:
|
|
775
|
-
|
|
776
|
-
|
|
777
|
-
|
|
778
|
-
|
|
750
|
+
return return_code
|
|
751
|
+
|
|
752
|
+
if not _are_credentials_valid():
|
|
753
|
+
xpk_print('Detected error. Retrying without --dns-endpoint flag...')
|
|
754
|
+
return_code = _get_credentials(
|
|
755
|
+
project=args.project,
|
|
756
|
+
cluster=args.cluster,
|
|
757
|
+
location=location,
|
|
758
|
+
dns_endpoint=False,
|
|
759
|
+
)
|
|
760
|
+
if return_code != 0:
|
|
761
|
+
return return_code
|
|
779
762
|
|
|
780
|
-
|
|
781
|
-
|
|
763
|
+
xpk_print('Finished get-credentials and kubectl setup.')
|
|
764
|
+
return 0
|
|
782
765
|
|
|
783
|
-
Args:
|
|
784
|
-
args: user provided arguments for running the command.
|
|
785
766
|
|
|
786
|
-
|
|
787
|
-
|
|
788
|
-
|
|
789
|
-
|
|
767
|
+
def _get_credentials(
|
|
768
|
+
project: str, cluster: str, location: str, dns_endpoint: bool
|
|
769
|
+
) -> int:
|
|
770
|
+
dns_endpoint_arg = '--dns-endpoint' if dns_endpoint else ''
|
|
790
771
|
command = (
|
|
791
772
|
'gcloud container clusters get-credentials'
|
|
792
|
-
f' {
|
|
793
|
-
f' --project={
|
|
773
|
+
f' {cluster} --location={location} {dns_endpoint_arg}'
|
|
774
|
+
f' --project={project} && kubectl config view && kubectl config'
|
|
794
775
|
' set-context --current --namespace=default'
|
|
795
776
|
)
|
|
796
|
-
task = f'get-credentials-dns-endpoint to cluster {
|
|
797
|
-
|
|
777
|
+
task = f'get-credentials-dns-endpoint to cluster {cluster}'
|
|
778
|
+
return run_command_with_updates(command, task, verbose=False)
|
|
798
779
|
|
|
799
|
-
if return_code != 0:
|
|
800
|
-
xpk_print(f'{task} returned ERROR {return_code}')
|
|
801
|
-
xpk_exit(return_code)
|
|
802
780
|
|
|
803
|
-
|
|
804
|
-
|
|
805
|
-
|
|
806
|
-
|
|
781
|
+
def _are_credentials_valid() -> bool:
|
|
782
|
+
kubectl_command = 'kubectl get pods'
|
|
783
|
+
kubectl_return_code = run_command_with_updates(
|
|
784
|
+
kubectl_command, 'Test kubectl credentials'
|
|
785
|
+
)
|
|
786
|
+
return kubectl_return_code == 0
|
xpk/core/cluster_test.py
ADDED
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Copyright 2025 Google LLC
|
|
3
|
+
|
|
4
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
you may not use this file except in compliance with the License.
|
|
6
|
+
You may obtain a copy of the License at
|
|
7
|
+
|
|
8
|
+
https://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
|
|
10
|
+
Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
See the License for the specific language governing permissions and
|
|
14
|
+
limitations under the License.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
import pytest
|
|
18
|
+
from .testing.commands_tester import CommandsTester
|
|
19
|
+
from .cluster import get_cluster_credentials
|
|
20
|
+
from pytest_mock import MockerFixture
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
@pytest.fixture(autouse=True)
|
|
24
|
+
def commands_tester(mocker: MockerFixture) -> CommandsTester:
|
|
25
|
+
return CommandsTester(
|
|
26
|
+
mocker=mocker,
|
|
27
|
+
run_command_for_value_path="xpk.core.cluster.run_command_for_value",
|
|
28
|
+
run_command_with_updates_path="xpk.core.cluster.run_command_with_updates",
|
|
29
|
+
)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
@pytest.fixture(autouse=True)
|
|
33
|
+
def mock_location(mocker: MockerFixture):
|
|
34
|
+
mocker.patch(
|
|
35
|
+
"xpk.core.cluster.get_cluster_location", return_value="us-central1"
|
|
36
|
+
)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
@pytest.fixture(autouse=True)
|
|
40
|
+
def command_args(mocker: MockerFixture):
|
|
41
|
+
return mocker.Mock(cluster="cluster", project="project", zone="zone")
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def test_get_cluster_credentials_returns_1_when_retrieval_command_fails(
|
|
45
|
+
commands_tester: CommandsTester, command_args
|
|
46
|
+
):
|
|
47
|
+
commands_tester.set_result_for_command(
|
|
48
|
+
(1, ""), "gcloud container clusters get-credentials"
|
|
49
|
+
)
|
|
50
|
+
assert get_cluster_credentials(command_args) == 1
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def test_get_cluster_credentials_returns_0_when_retrieval_succeeds(
|
|
54
|
+
commands_tester: CommandsTester, command_args
|
|
55
|
+
):
|
|
56
|
+
commands_tester.set_result_for_command(
|
|
57
|
+
(0, ""), "gcloud container clusters get-credentials"
|
|
58
|
+
)
|
|
59
|
+
assert get_cluster_credentials(command_args) == 0
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def test_get_cluster_credentials_does_not_retry_with_dns_when_retrieval_succeeds(
|
|
63
|
+
commands_tester: CommandsTester, command_args
|
|
64
|
+
):
|
|
65
|
+
commands_tester.set_result_for_command(
|
|
66
|
+
(0, ""), "gcloud container clusters get-credentials --dns-endpoint"
|
|
67
|
+
)
|
|
68
|
+
commands_tester.set_result_for_command((0, ""), "kubectl get pods")
|
|
69
|
+
get_cluster_credentials(command_args)
|
|
70
|
+
non_dns_endpoint_commands = [
|
|
71
|
+
c
|
|
72
|
+
for c in commands_tester.get_matching_commands(
|
|
73
|
+
"gcloud container clusters get-credentials"
|
|
74
|
+
)
|
|
75
|
+
if "dns-endpoint" not in c
|
|
76
|
+
]
|
|
77
|
+
assert len(non_dns_endpoint_commands) == 0
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def test_get_cluster_credentials_retries_without_dns_when_dns_retrieval_fails(
|
|
81
|
+
commands_tester: CommandsTester, command_args
|
|
82
|
+
):
|
|
83
|
+
commands_tester.set_result_for_command(
|
|
84
|
+
(0, ""), "gcloud container clusters get-credentials --dns-endpoint"
|
|
85
|
+
)
|
|
86
|
+
commands_tester.set_result_for_command((1, ""), "kubectl get pods")
|
|
87
|
+
get_cluster_credentials(command_args)
|
|
88
|
+
non_dns_endpoint_commands = [
|
|
89
|
+
c
|
|
90
|
+
for c in commands_tester.get_matching_commands(
|
|
91
|
+
"gcloud container clusters get-credentials"
|
|
92
|
+
)
|
|
93
|
+
if "dns-endpoint" not in c
|
|
94
|
+
]
|
|
95
|
+
assert len(non_dns_endpoint_commands) == 1
|
xpk/core/commands.py
CHANGED
|
@@ -195,16 +195,13 @@ def run_command_with_updates(command, task, verbose=True) -> int:
|
|
|
195
195
|
return_code = child.poll()
|
|
196
196
|
if return_code is None:
|
|
197
197
|
xpk_print(f'Waiting for `{task}`, for {i} seconds...', end='\r')
|
|
198
|
-
time.sleep(
|
|
199
|
-
i +=
|
|
198
|
+
time.sleep(10)
|
|
199
|
+
i += 10
|
|
200
200
|
else:
|
|
201
201
|
xpk_print(f'Task: `{task}` terminated with code `{return_code}`')
|
|
202
202
|
return return_code
|
|
203
203
|
else:
|
|
204
|
-
xpk_print(
|
|
205
|
-
f'Task: `{task}` is implemented by `{command}`, hiding output unless'
|
|
206
|
-
' there is an error.'
|
|
207
|
-
)
|
|
204
|
+
xpk_print(f'Task: `{task}` is implemented by `{command}`')
|
|
208
205
|
try:
|
|
209
206
|
subprocess.check_output(command, shell=True, stderr=subprocess.STDOUT)
|
|
210
207
|
except subprocess.CalledProcessError as e:
|
|
@@ -277,10 +274,7 @@ def run_command_for_value(
|
|
|
277
274
|
return return_code, f'{out_str}\n{err_str}'
|
|
278
275
|
else:
|
|
279
276
|
if not quiet:
|
|
280
|
-
xpk_print(
|
|
281
|
-
f'Task: `{task}` is implemented by `{command}`, hiding output unless'
|
|
282
|
-
' there is an error.'
|
|
283
|
-
)
|
|
277
|
+
xpk_print(f'Task: `{task}` is implemented by `{command}`')
|
|
284
278
|
try:
|
|
285
279
|
output = subprocess.check_output(
|
|
286
280
|
command,
|
xpk/core/config.py
CHANGED
|
@@ -22,7 +22,7 @@ from ..utils import file
|
|
|
22
22
|
from ..utils.console import xpk_print
|
|
23
23
|
|
|
24
24
|
# This is the version for XPK PyPI package
|
|
25
|
-
__version__ = 'v0.
|
|
25
|
+
__version__ = 'v0.15.0'
|
|
26
26
|
XPK_CURRENT_VERSION = __version__
|
|
27
27
|
XPK_CONFIG_FILE = os.path.expanduser('~/.config/xpk/config.yaml')
|
|
28
28
|
|
|
@@ -30,6 +30,8 @@ CONFIGS_KEY = 'configs'
|
|
|
30
30
|
CFG_BUCKET_KEY = 'cluster-state-gcs-bucket'
|
|
31
31
|
CLUSTER_NAME_KEY = 'cluster-name'
|
|
32
32
|
PROJECT_KEY = 'project-id'
|
|
33
|
+
CLIENT_ID_KEY = 'client-id'
|
|
34
|
+
SEND_TELEMETRY_KEY = 'send-telemetry'
|
|
33
35
|
ZONE_KEY = 'zone'
|
|
34
36
|
KJOB_BATCH_IMAGE = 'batch-image'
|
|
35
37
|
KJOB_BATCH_WORKING_DIRECTORY = 'batch-working-directory'
|
|
@@ -45,6 +47,8 @@ DEFAULT_KEYS = [
|
|
|
45
47
|
CFG_BUCKET_KEY,
|
|
46
48
|
CLUSTER_NAME_KEY,
|
|
47
49
|
PROJECT_KEY,
|
|
50
|
+
CLIENT_ID_KEY,
|
|
51
|
+
SEND_TELEMETRY_KEY,
|
|
48
52
|
ZONE_KEY,
|
|
49
53
|
GKE_ENDPOINT_KEY,
|
|
50
54
|
DEPENDENCIES_KEY,
|
|
@@ -82,7 +86,7 @@ class XpkConfig:
|
|
|
82
86
|
with open(self._config, encoding='utf-8', mode='w') as stream:
|
|
83
87
|
yaml.dump(config_yaml, stream)
|
|
84
88
|
|
|
85
|
-
def set(self, key: str, value: str) -> None:
|
|
89
|
+
def set(self, key: str, value: str | None) -> None:
|
|
86
90
|
if key not in self._allowed_keys:
|
|
87
91
|
xpk_print(f'Key {key} is not an allowed xpk config key.')
|
|
88
92
|
return
|
|
@@ -114,3 +118,6 @@ class XpkConfig:
|
|
|
114
118
|
return None
|
|
115
119
|
val: dict[str, str] = config_yaml[CONFIGS_KEY]
|
|
116
120
|
return val
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
xpk_config = XpkConfig()
|