xpk 0.10.1__py3-none-any.whl → 0.12.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xpk/commands/cluster.py +270 -8
- xpk/commands/cluster_gcluster.py +2 -1
- xpk/commands/common.py +3 -3
- xpk/commands/info.py +12 -12
- xpk/commands/job.py +12 -10
- xpk/commands/kjob_common.py +2 -1
- xpk/commands/storage.py +1 -1
- xpk/commands/workload.py +12 -6
- xpk/core/blueprint/blueprint_generator.py +7 -7
- xpk/core/blueprint/blueprint_test.py +218 -0
- xpk/core/capacity.py +5 -3
- xpk/core/cluster.py +9 -7
- xpk/core/cluster_private.py +5 -1
- xpk/core/commands.py +3 -3
- xpk/core/config.py +3 -4
- xpk/core/config_test.py +71 -0
- xpk/core/docker_manager.py +1 -1
- xpk/core/docker_resources.py +1 -1
- xpk/core/filestore.py +7 -2
- xpk/core/gcloud_context.py +2 -2
- xpk/core/jobset.py +1 -1
- xpk/core/kjob.py +2 -1
- xpk/core/kueue.py +12 -4
- xpk/core/nap.py +20 -6
- xpk/core/nodepool.py +52 -19
- xpk/core/nodepool_test.py +82 -0
- xpk/core/resources.py +1 -7
- xpk/core/scheduling.py +1 -1
- xpk/core/storage.py +14 -14
- xpk/core/system_characteristics.py +267 -1081
- xpk/core/workload.py +11 -0
- xpk/core/workload_decorators/rdma_decorator.py +3 -2
- xpk/core/workload_decorators/storage_decorator.py +2 -1
- xpk/core/workload_decorators/tcpx_decorator.py +4 -2
- xpk/core/workload_decorators/tcpx_decorator_test.py +267 -0
- xpk/core/workload_decorators/tcpxo_decorator.py +2 -1
- xpk/core/workload_test.py +28 -0
- xpk/main.py +9 -10
- xpk/parser/cluster.py +67 -49
- xpk/parser/common.py +45 -36
- xpk/parser/storage.py +12 -13
- xpk/parser/workload.py +57 -39
- xpk/utils/console.py +2 -1
- {xpk-0.10.1.dist-info → xpk-0.12.0.dist-info}/METADATA +4 -1
- {xpk-0.10.1.dist-info → xpk-0.12.0.dist-info}/RECORD +49 -44
- {xpk-0.10.1.dist-info → xpk-0.12.0.dist-info}/WHEEL +0 -0
- {xpk-0.10.1.dist-info → xpk-0.12.0.dist-info}/entry_points.txt +0 -0
- {xpk-0.10.1.dist-info → xpk-0.12.0.dist-info}/licenses/LICENSE +0 -0
- {xpk-0.10.1.dist-info → xpk-0.12.0.dist-info}/top_level.txt +0 -0
xpk/core/blueprint/blueprint_test.py
ADDED
@@ -0,0 +1,218 @@
+"""
+ Copyright 2024 Google LLC
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+      https://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+"""
+
+import os
+import shutil
+
+import ruamel.yaml
+
+from xpk.core.blueprint.blueprint_definitions import Blueprint
+from xpk.core.blueprint.blueprint_generator import BlueprintGenerator
+from xpk.core.capacity import CapacityType
+
+yaml = ruamel.yaml.YAML()
+
+yaml.register_class(Blueprint)
+
+a3_yaml_test_path = "src/xpk/core/blueprint/testing/data/a3_mega.yaml"
+a3_spot_yaml_test_path = "src/xpk/core/blueprint/testing/data/a3_mega_spot.yaml"
+a3_ultra_yaml_test_path = "src/xpk/core/blueprint/testing/data/a3_ultra.yaml"
+a4_yaml_test_path = "src/xpk/core/blueprint/testing/data/a4.yaml"
+config_map_filename = "config-map.yaml.tftpl"
+kueue_conf_filename = "kueue-xpk-configuration.yaml.tftpl"
+tmp_test_dir = "/tmp/xpk_test"
+
+
+def prepare_test():
+  if os.path.exists(tmp_test_dir):
+    shutil.rmtree(tmp_test_dir)
+  os.mkdir(tmp_test_dir)
+
+
+def test_generate_a3_mega_blueprint():
+  prepare_test()
+  blueprint_name = "xpk-gke-a3-megagpu"
+  bp_generator = BlueprintGenerator(tmp_test_dir)
+  bp = bp_generator.generate_a3_mega_blueprint(
+      project_id="foo",
+      cluster_name="bar",
+      blueprint_name=blueprint_name,
+      prefix="prefix",
+      region="us-central1",
+      zone="us-central1-c",
+      auth_cidr="10.0.0.0/32",
+      reservation_placement_policy={
+          "type": "COMPACT",
+          "name": "test-reservation-placement",
+      },
+      reservation="test-reservation",
+      capacity_type=CapacityType.RESERVATION,
+      system_node_pool_min_node_count=5,
+  )
+
+  assert bp.blueprint_file.endswith("/prefix/xpk-gke-a3-megagpu.yaml")
+
+  with open(a3_yaml_test_path, encoding="utf-8") as stream:
+    ctk_yaml = yaml.load(stream)
+    with open(bp.blueprint_file, encoding="utf-8") as generated_blueprint:
+      ctk_test = yaml.load(generated_blueprint)
+      assert ctk_yaml.blueprint_name == ctk_test.blueprint_name
+      assert ctk_test.terraform_backend_defaults is None
+      assert ctk_yaml.toolkit_modules_url == ctk_test.toolkit_modules_url
+      assert (
+          ctk_yaml.toolkit_modules_version == ctk_test.toolkit_modules_version
+      )
+      assert ctk_yaml.vars == ctk_test.vars
+      assert ctk_test.deployment_groups == ctk_yaml.deployment_groups
+      assert os.path.exists(
+          os.path.join(
+              tmp_test_dir, "prefix", blueprint_name, config_map_filename
+          )
+      )
+      assert os.path.exists(
+          os.path.join(
+              tmp_test_dir, "prefix", blueprint_name, kueue_conf_filename
+          )
+      )
+
+  shutil.rmtree(tmp_test_dir)
+
+
+def test_generate_a3_mega_spot_blueprint():
+  prepare_test()
+  blueprint_name = "xpk-gke-a3-megagpu"
+  bp_generator = BlueprintGenerator(tmp_test_dir)
+  bp = bp_generator.generate_a3_mega_blueprint(
+      project_id="foo",
+      cluster_name="bar",
+      blueprint_name=blueprint_name,
+      prefix="prefix",
+      region="us-central1",
+      zone="us-central1-c",
+      auth_cidr="10.0.0.0/32",
+      capacity_type=CapacityType.SPOT,
+      system_node_pool_min_node_count=5,
+  )
+
+  assert bp.blueprint_file.endswith("/prefix/xpk-gke-a3-megagpu.yaml")
+
+  with open(a3_spot_yaml_test_path, encoding="utf-8") as stream:
+    ctk_yaml = yaml.load(stream)
+    with open(bp.blueprint_file, encoding="utf-8") as generated_blueprint:
+      ctk_test = yaml.load(generated_blueprint)
+      assert ctk_yaml.blueprint_name == ctk_test.blueprint_name
+      assert ctk_test.terraform_backend_defaults is None
+      assert ctk_yaml.toolkit_modules_url == ctk_test.toolkit_modules_url
+      assert (
+          ctk_yaml.toolkit_modules_version == ctk_test.toolkit_modules_version
+      )
+      assert ctk_yaml.vars == ctk_test.vars
+      assert ctk_test.deployment_groups == ctk_yaml.deployment_groups
+
+  shutil.rmtree(tmp_test_dir)
+
+
+def test_generate_a3_ultra_blueprint():
+  prepare_test()
+  blueprint_name = "xpk-gke-a3-ultra"
+  bp_generator = BlueprintGenerator(tmp_test_dir)
+  bp = bp_generator.generate_a3_ultra_blueprint(
+      project_id="foo",
+      cluster_name="gke-a3-ultra",
+      blueprint_name=blueprint_name,
+      region="us-central1",
+      zone="us-central1-c",
+      auth_cidr="10.0.0.0/32",
+      reservation="test-reservation",
+      system_node_pool_machine_type="e2-standard-16",
+      capacity_type=CapacityType.RESERVATION,
+      gcs_bucket="test-bucket",
+      prefix="testdir",
+  )
+  with open(a3_ultra_yaml_test_path, encoding="utf-8") as stream:
+    ctk_yaml = yaml.load(stream)
+    with open(bp.blueprint_file, encoding="utf-8") as generated_blueprint:
+      ctk_test = yaml.load(generated_blueprint)
+      assert ctk_yaml.blueprint_name == ctk_test.blueprint_name
+      assert (
+          ctk_yaml.terraform_backend_defaults
+          == ctk_test.terraform_backend_defaults
+      )
+      assert ctk_yaml.toolkit_modules_url == ctk_test.toolkit_modules_url
+      assert (
+          ctk_yaml.toolkit_modules_version == ctk_test.toolkit_modules_version
+      )
+      assert ctk_test.deployment_groups == ctk_yaml.deployment_groups
+      assert os.path.exists(
+          os.path.join(
+              tmp_test_dir, "testdir", blueprint_name, "mlgru-disable.yaml"
+          )
+      )
+      assert os.path.exists(
+          os.path.join(
+              tmp_test_dir, "testdir", blueprint_name, "nccl-installer.yaml"
+          )
+      )
+
+  shutil.rmtree(tmp_test_dir)
+
+
+def test_generate_a4_blueprint():
+  prepare_test()
+  blueprint_name = "xpk-gke-a4"
+  bp_generator = BlueprintGenerator(tmp_test_dir)
+  bp = bp_generator.generate_a4_blueprint(
+      project_id="foo",
+      cluster_name="gke-a4",
+      blueprint_name=blueprint_name,
+      region="us-central1",
+      zone="us-central1-c",
+      auth_cidr="10.0.0.0/32",
+      reservation="test-reservation",
+      system_node_pool_machine_type="e2-standard-16",
+      capacity_type=CapacityType.RESERVATION,
+      gcs_bucket="test-bucket",
+      prefix="testdir",
+  )
+  with open(a4_yaml_test_path, encoding="utf-8") as stream:
+    ctk_yaml = yaml.load(stream)
+    with open(bp.blueprint_file, encoding="utf-8") as generated_blueprint:
+      ctk_test = yaml.load(generated_blueprint)
+      assert ctk_yaml.blueprint_name == ctk_test.blueprint_name
+      assert (
+          ctk_yaml.terraform_backend_defaults
+          == ctk_test.terraform_backend_defaults
+      )
+      assert ctk_yaml.toolkit_modules_url == ctk_test.toolkit_modules_url
+      assert (
+          ctk_yaml.toolkit_modules_version == ctk_test.toolkit_modules_version
+      )
+      assert ctk_test.deployment_groups == ctk_yaml.deployment_groups
+      assert os.path.exists(
+          os.path.join(
+              tmp_test_dir, "testdir", blueprint_name, "storage_crd.yaml"
+          )
+      )
+      assert os.path.exists(
+          os.path.join(
+              tmp_test_dir,
+              "testdir",
+              blueprint_name,
+              "nccl-rdma-installer-a4.yaml",
+          )
+      )
+
+  shutil.rmtree(tmp_test_dir)
xpk/core/capacity.py
CHANGED
@@ -195,10 +195,12 @@ def get_capacity_arguments_from_capacity_type(
       capacity_args = '--spot'
     case CapacityType.FLEX_START:
       capacity_args = (
-          ' --flex-start --enable-
+          ' --flex-start --enable-autoscaling'
           ' --location-policy=ANY --reservation-affinity=none'
           f' --no-enable-autorepair --max-nodes={max_nodes}'
       )
+      if args.num_slices <= 1:
+        capacity_args += ' --enable-queued-provisioning'
     case CapacityType.RESERVATION:
       capacity_args = (
           f'--reservation-affinity=specific --reservation={args.reservation}'
@@ -232,9 +234,9 @@ def get_capacity_node_selectors_from_capacity_type(
     case CapacityType.ON_DEMAND.name:
       node_selector = ''
     case CapacityType.FLEX_START.name:
-      node_selector = 'cloud.google.com/gke-queued
+      node_selector = 'cloud.google.com/gke-queued: "true"'
     case CapacityType.SPOT.name:
-      node_selector = 'cloud.google.com/gke-spot
+      node_selector = 'cloud.google.com/gke-spot: "true"'
     case CapacityType.RESERVATION.name:
       node_selector = f'cloud.google.com/reservation-name: {args.reservation}'
     case _:
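For illustration only, here is a standalone sketch (not part of the package) that mirrors the string-building logic shown in the FLEX_START hunk above: the base flags are always emitted, and the queued-provisioning flag is appended only for single-slice requests.

# Sketch mirroring the FLEX_START branch shown in the diff above; not xpk code.
def flex_start_capacity_args(num_slices: int, max_nodes: int) -> str:
  # Base flags emitted for every flex-start node pool.
  capacity_args = (
      ' --flex-start --enable-autoscaling'
      ' --location-policy=ANY --reservation-affinity=none'
      f' --no-enable-autorepair --max-nodes={max_nodes}'
  )
  # Queued provisioning is requested only for a single slice.
  if num_slices <= 1:
    capacity_args += ' --enable-queued-provisioning'
  return capacity_args


print(flex_start_capacity_args(1, 4))  # includes --enable-queued-provisioning
print(flex_start_capacity_args(2, 8))  # omits it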
xpk/core/cluster.py
CHANGED
@@ -62,8 +62,8 @@ def set_jobset_on_cluster(args) -> int:
     0 if successful and 1 otherwise.
   """
   command = (
-      'kubectl apply --server-side -
-      f' https://github.com/kubernetes-sigs/jobset/releases/download/{JOBSET_VERSION}/manifests.yaml'
+      'kubectl apply --server-side --force-conflicts'
+      f' -f https://github.com/kubernetes-sigs/jobset/releases/download/{JOBSET_VERSION}/manifests.yaml'
   )
   task = f'Install Jobset on {args.cluster}'
   return_code = run_command_with_updates_retry(command, task, args)
@@ -217,8 +217,8 @@ def get_cluster_nodes_info(args) -> list[dict]:
   )
   if err_code != 0:
     xpk_exit(err_code)
-  data = yaml.safe_load(val)
-  return data['items']
+  data: dict[str, list[dict]] = yaml.safe_load(val)
+  return data['items']


 def count_nodes_on_cluster(args, system: SystemCharacteristics) -> int:
@@ -445,7 +445,7 @@ def setup_k8s_env(args) -> k8s_client.ApiClient:
   args.project_number = project_id_to_project_number(args.project)

   config.load_kube_config()
-  return k8s_client.ApiClient()
+  return k8s_client.ApiClient()


 def get_gpu_type_from_cluster(args) -> str:
@@ -817,9 +817,11 @@ def update_cluster_with_clouddns_if_necessary(args) -> int:
   server_config_return_code, gke_server_config = get_gke_server_config(args)
   if server_config_return_code != 0:
     xpk_exit(server_config_return_code)
+  assert gke_server_config
+
   upgrade_master_return_code = upgrade_gke_control_plane_version(
       args,
-      gke_server_config.default_rapid_gke_version,
+      gke_server_config.default_rapid_gke_version,
   )
   if upgrade_master_return_code > 0:
     xpk_print("Updating GKE cluster's control plane upgrade failed!")
@@ -828,7 +830,7 @@ def update_cluster_with_clouddns_if_necessary(args) -> int:
   # Upgrade nodepools version after the master upgrade.
   node_pool_update_code = upgrade_gke_nodepools_version(
       args,
-      gke_server_config.default_rapid_gke_version,
+      gke_server_config.default_rapid_gke_version,
   )
   if node_pool_update_code > 0:
     xpk_print('Upgrading nodepools version failed!')
xpk/core/cluster_private.py
CHANGED
@@ -95,7 +95,11 @@ def add_current_machine_to_networks_if_needed(
         "Adding current machine's IP address to the authorized networks"
         ' failed!'
     )
-    return
+    return (
+        add_current_machine_to_networks_return_code,
+        False,
+        authorized_networks,
+    )

   return 0, is_current_machine_in_network, authorized_networks

xpk/core/commands.py
CHANGED
@@ -274,9 +274,9 @@ def run_command_for_value(
     else:
       if not quiet:
         xpk_print(f'Task: `{task}` terminated with code `{return_code}`')
-
-
-      return return_code, f'{
+      out_bytes, err_bytes = child.communicate()
+      out_str, err_str = str(out_bytes, 'UTF-8'), str(err_bytes, 'UTF-8')
+      return return_code, f'{out_str}\n{err_str}'
   else:
     if not quiet:
       xpk_print(
xpk/core/config.py
CHANGED
@@ -22,7 +22,7 @@ from ..utils import file
 from ..utils.console import xpk_print

 # This is the version for XPK PyPI package
-__version__ = 'v0.
+__version__ = 'v0.12.0'
 XPK_CURRENT_VERSION = __version__
 XPK_CONFIG_FILE = os.path.expanduser('~/.config/xpk/config.yaml')

@@ -71,13 +71,12 @@ class XpkConfig:
     dir_path = '/'.join(self._config.split('/')[:-1])
     file.ensure_directory_exists(dir_path)

-    config_yaml = {'version': 'v1', CONFIGS_KEY: {}}
     if not os.path.exists(self._config):
       return None

     with open(self._config, encoding='utf-8', mode='r') as stream:
       config_yaml: dict = yaml.load(stream)
-
+      return config_yaml

   def _save_configs(self, config_yaml: dict) -> None:
     with open(self._config, encoding='utf-8', mode='w') as stream:
@@ -109,7 +108,7 @@ class XpkConfig:

   def get_all(
       self,
-  ) -> dict[str,
+  ) -> dict[str, str] | None:
     config_yaml = self._open_configs()
     if config_yaml is None:
       return None
xpk/core/config_test.py
ADDED
@@ -0,0 +1,71 @@
+"""
+ Copyright 2025 Google LLC
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+      https://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+"""
+
+from xpk.core.config import XpkConfig, CFG_BUCKET_KEY, CLUSTER_NAME_KEY, PROJECT_KEY, ZONE_KEY
+
+import os
+import pytest
+
+config_tmp_path = '/tmp/config/config.yaml'
+
+
+@pytest.fixture(name='_')
+def _():
+  if os.path.exists(config_tmp_path):
+    os.remove(config_tmp_path)
+  yield
+  if os.path.exists(config_tmp_path):
+    os.remove(config_tmp_path)
+
+
+def test_config(_):
+  cfg = XpkConfig(config_tmp_path)
+  cfg.set('project-id', 'foo')
+  project_id = cfg.get('project-id')
+  assert project_id == 'foo'
+
+
+def test_config_get_all(_):
+  cfg = XpkConfig(config_tmp_path)
+  cfg.set(PROJECT_KEY, 'foo')
+  cfg.set(CLUSTER_NAME_KEY, 'bar')
+  cfg.set(ZONE_KEY, 'europe-west1-a')
+  cfg.set(CFG_BUCKET_KEY, 'cfg-bucket')
+
+  cfg_all = cfg.get_all()
+  assert cfg_all[PROJECT_KEY] == 'foo'
+  assert cfg_all[CLUSTER_NAME_KEY] == 'bar'
+  assert cfg_all[ZONE_KEY] == 'europe-west1-a'
+  assert cfg_all[CFG_BUCKET_KEY] == 'cfg-bucket'
+
+
+def test_config_get_empty(_):
+  cfg = XpkConfig(config_tmp_path)
+  val = cfg.get(PROJECT_KEY)
+  assert val is None
+
+
+def test_config_get_all_empty(_):
+  cfg = XpkConfig(config_tmp_path)
+  val = cfg.get_all()
+  assert not val
+
+
+def test_config_set_incorrect(_):
+  cfg = XpkConfig(config_tmp_path)
+  cfg.set('foo', 'bar')
+  cfg_all = cfg.get_all()
+  assert not cfg_all
xpk/core/docker_manager.py
CHANGED
@@ -30,7 +30,7 @@ import time
 DockerRunCommandExitCode = 135
 dockerBuildErrorCode = 134
 ctk_dockerfile_path = "Dockerfile"
-ctk_build_ref = "v1.
+ctk_build_ref = "v1.62.2"
 ctk_docker_image = "xpk-ctk"
 ctk_container_name = "xpk-ctk-container"
 gcloud_cfg_mount_path = "/root/.config/gcloud"
xpk/core/docker_resources.py
CHANGED
@@ -72,7 +72,7 @@ def get_env_container(args, system: SystemCharacteristics) -> str:
   if system.accelerator_type == AcceleratorType['CPU']:
     return get_cpu_env(args, system)

-  return format_env_dict(args.env, system)
+  return format_env_dict(args.env, system)


 def get_gpu_env(args, system) -> str:
xpk/core/filestore.py
CHANGED
@@ -93,11 +93,12 @@ class FilestoreClient:

     for instance in instancesZonal:
       if instance.name == fullname_zonal:
-        return instance
+        return instance

     for instance in instancesRegional:
       if instance.name == fullname_regional:
-        return instance
+        return instance
+    return None

   def check_instance_exists(self) -> bool:
     """Check if Filestore instance exists"""
@@ -111,6 +112,7 @@ class FilestoreClient:
   def get_instance_location(self) -> str:
     """Get Filestore instance's location"""
     self.load_instance()
+    assert self.instance
     return str(self.instance.name.split("/")[3])

   def create_instance(
@@ -192,6 +194,7 @@ class FilestoreClient:

   def create_sc(self, name: str, network: str) -> dict:
     """Create a yaml representing filestore StorageClass."""
+    assert self.instance
     data = templates.load(FS_SC_PATH)
     data["metadata"]["name"] = get_storage_class_name(name)
     data["parameters"]["tier"] = self.instance.tier.name
@@ -202,6 +205,7 @@ class FilestoreClient:

   def create_pv(self, name: str, vol: str, access_mode: str) -> dict:
     """Create a yaml representing filestore PersistentVolume."""
+    assert self.instance
     data = templates.load(FS_PV_PATH)
     data["metadata"]["name"] = get_pv_name(name)
     data["spec"]["storageClassName"] = get_storage_class_name(name)
@@ -219,6 +223,7 @@ class FilestoreClient:

   def create_pvc(self, name: str, access_mode: str) -> dict:
     """Create a yaml representing filestore PersistentVolumeClaim."""
+    assert self.instance
     data = templates.load(FS_PVC_PATH)
     data["metadata"]["name"] = get_pvc_name(name)
     data["spec"]["accessModes"] = [access_mode]
xpk/core/gcloud_context.py
CHANGED
@@ -75,7 +75,7 @@ def add_zone_and_project(args):
   xpk_print(f'Working on {args.project} and {args.zone}')


-def zone_to_region(zone) -> str:
+def zone_to_region(zone: str) -> str:
   """Helper function converts zone name to region name.

   Args:
@@ -85,7 +85,7 @@ def zone_to_region(zone) -> str:
     The region name.
   """
   zone_terms = zone.split('-')
-  return zone_terms[0] + '-' + zone_terms[1]
+  return zone_terms[0] + '-' + zone_terms[1]


 @dataclass
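A quick illustration of the behaviour shown in the zone_to_region hunk above (a standalone sketch, not part of the diff): the region name is simply the zone name with its final "-<letter>" suffix dropped.

# Standalone sketch of the logic shown above; the real function lives in
# xpk/core/gcloud_context.py.
def zone_to_region(zone: str) -> str:
  zone_terms = zone.split('-')
  return zone_terms[0] + '-' + zone_terms[1]


assert zone_to_region('us-central1-c') == 'us-central1'
assert zone_to_region('europe-west4-b') == 'europe-west4'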
xpk/core/jobset.py
CHANGED
xpk/core/kjob.py
CHANGED
@@ -277,7 +277,8 @@ def decorate_job_template_with_gpu(yml_string: str, gpu_type: str) -> str:
     job_spec = rdma_decorator.decorate_kjob_template(job_spec)
   job_template_dict = yaml.safe_load(yml_string)
   job_template_dict["template"] = job_spec
-
+  yaml_result: str = yaml.dump(job_template_dict, sort_keys=False)
+  return yaml_result


 def create_job_template_instance(
xpk/core/kueue.py
CHANGED
@@ -43,7 +43,7 @@ from .system_characteristics import (
 KUEUE_VERSION = 'v0.12.2'
 CLUSTER_QUEUE_NAME = 'cluster-queue'
 LOCAL_QUEUE_NAME = 'multislice-queue'
-WAIT_FOR_KUEUE_TIMEOUT = '
+WAIT_FOR_KUEUE_TIMEOUT = '10m'
 MEMORY_SIZE_PER_VM = 1.2
 MIN_MEMORY_LIMIT_SIZE = 4096

@@ -89,6 +89,10 @@ metadata:
   name: dws-config
 spec:
   provisioningClassName: queued-provisioning.gke.io
+  podSetUpdates:
+    nodeSelector:
+    - key: autoscaling.gke.io/provisioning-request
+      valueFromProvisioningClassDetail: ResizeRequestName
   managedResources:
   - {managed_resource}
 ---
@@ -244,14 +248,16 @@ spec:
           periodSeconds: 10
         resources:
           limits:
-            cpu:
+            cpu: 1000m
             memory: {memory_limit_size}
           requests:
-            cpu:
+            cpu: 1000m
             memory: 512Mi
         securityContext:
           allowPrivilegeEscalation: false
         volumeMounts:
+        - mountPath: /visibility
+          name: visibility
         - mountPath: /tmp/k8s-webhook-server/serving-certs
           name: cert
           readOnly: true
@@ -263,6 +269,8 @@ spec:
       serviceAccountName: kueue-controller-manager
       terminationGracePeriodSeconds: 10
       volumes:
+      - name: visibility
+        emptyDir: {{}}
       - name: cert
         secret:
           defaultMode: 420
@@ -316,7 +324,7 @@ def delete_multikueueclusters_definitions(args) -> int:
   return return_code


-def get_kueue_version(args) ->
+def get_kueue_version(args) -> tuple[int, str]:
   command = 'kubectl kueue version'
   task = 'Get kueue version on server'
   return_code, val = run_command_for_value(command, task, args)
xpk/core/nap.py
CHANGED
@@ -37,11 +37,14 @@ from .resources import (
 )
 from .scheduling import get_total_chips_requested_from_args
 from .system_characteristics import AcceleratorType, SystemCharacteristics
+from typing import cast

 AUTOPROVISIONING_CONFIG_FILE = """
 management:
   autoRepair: true
   autoUpgrade: true
+scopes:
+  - "https://www.googleapis.com/auth/devstorage.read_write"
 autoprovisioningLocations:
 {zones}
 {resource_limits}
@@ -106,6 +109,18 @@ def enable_autoprovisioning_on_cluster(
     xpk_print(f'{task} request returned ERROR {return_code}')
     return autoprovisioning_config, return_code

+  command = (
+      'gcloud container clusters update'
+      f' {args.cluster} --project={args.project}'
+      f' --region={zone_to_region(args.zone)}'
+      ' --autoscaling-profile=optimize-utilization'
+  )
+  task = 'Update cluster with autoscaling-profile'
+  return_code = run_command_with_updates(command, task, args)
+  if return_code != 0:
+    xpk_print(f'{task} request returned ERROR {return_code}')
+    return autoprovisioning_config, return_code
+
   # Update created accelerator node pools to support autoprovisioning.
   existing_node_pool_names, return_code = get_all_nodepools_programmatic(args)
   if return_code != 0:
@@ -171,11 +186,11 @@ def create_autoprovisioning_config(
   # is not controlled by NAP.
   cpu_limits = """
     minimum: 1
-    maximum:
+    maximum: 1000000
   """
   memory_limits = """
     minimum: 1
-    maximum:
+    maximum: 10000000
   """

   # By default, the maximum chips is set to be the current number of resources used
@@ -255,9 +270,6 @@ def is_autoprovisioning_enabled(
     bool is true if autoprovisioning is enabled, false otherwise.
     int of 0 if successful and 1 otherwise.
   """
-  # Currently autoprovisioning is not enabled for Pathways workloads. b/360898087
-  if args.use_pathways:
-    return False, 0

   resources_configmap_name = f'{args.cluster}-{CLUSTER_RESOURCES_CONFIGMAP}'
   cluster_config_map = get_cluster_configmap(args, resources_configmap_name)
@@ -325,11 +337,13 @@ def get_autoprovisioning_node_selector_args(args) -> tuple[str, int]:
     )
     return node_selector_args, 1

-  return_code,
+  return_code, optional_capacity_type_str = get_value_from_map(
       CAPACITY_TYPE_CONFIG_KEY, cluster_config_map
   )
   if return_code != 0:
     return node_selector_args, return_code
+  # return_code==0 implies capacity_type is defined
+  capacity_type_str = cast(str, optional_capacity_type_str)

   if capacity_type_str == CapacityType.RESERVATION.name:
     return_code, args.reservation = get_value_from_map(