xpk 0.11.0-py3-none-any.whl → 0.13.0-py3-none-any.whl
This diff shows the contents of two publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
- xpk/commands/batch.py +8 -8
- xpk/commands/cluster.py +19 -19
- xpk/commands/cluster_gcluster.py +2 -1
- xpk/commands/common.py +7 -3
- xpk/commands/info.py +12 -12
- xpk/commands/inspector.py +1 -1
- xpk/commands/job.py +42 -12
- xpk/commands/kjob_common.py +2 -1
- xpk/commands/storage.py +6 -3
- xpk/commands/workload.py +28 -15
- xpk/core/blueprint/blueprint_generator.py +7 -7
- xpk/core/blueprint/blueprint_test.py +218 -0
- xpk/core/capacity.py +3 -1
- xpk/core/cluster.py +14 -8
- xpk/core/cluster_private.py +8 -2
- xpk/core/commands.py +13 -10
- xpk/core/config.py +3 -4
- xpk/core/config_test.py +71 -0
- xpk/core/docker_image.py +14 -5
- xpk/core/docker_manager.py +1 -1
- xpk/core/docker_resources.py +10 -5
- xpk/core/filestore.py +7 -2
- xpk/core/gcloud_context.py +2 -2
- xpk/core/jobset.py +1 -1
- xpk/core/kjob.py +7 -3
- xpk/core/kueue.py +28 -8
- xpk/core/nap.py +5 -5
- xpk/core/network.py +1 -1
- xpk/core/nodepool.py +8 -3
- xpk/core/nodepool_test.py +82 -0
- xpk/core/pathways.py +6 -2
- xpk/core/ray.py +1 -1
- xpk/core/resources.py +18 -14
- xpk/core/scheduling.py +4 -0
- xpk/core/storage.py +14 -14
- xpk/core/system_characteristics.py +1 -1
- xpk/core/workload.py +11 -0
- xpk/core/workload_decorators/rdma_decorator.py +3 -2
- xpk/core/workload_decorators/storage_decorator.py +2 -1
- xpk/core/workload_decorators/tcpx_decorator.py +4 -2
- xpk/core/workload_decorators/tcpx_decorator_test.py +267 -0
- xpk/core/workload_decorators/tcpxo_decorator.py +2 -1
- xpk/core/workload_test.py +28 -0
- xpk/main.py +12 -10
- xpk/parser/cluster.py +110 -49
- xpk/parser/common.py +45 -36
- xpk/parser/storage.py +12 -13
- xpk/parser/workload.py +57 -39
- xpk/utils/console.py +2 -1
- xpk/utils/execution_context.py +28 -0
- xpk/utils/file.py +25 -10
- xpk/utils/network.py +4 -0
- {xpk-0.11.0.dist-info → xpk-0.13.0.dist-info}/METADATA +4 -1
- xpk-0.13.0.dist-info/RECORD +101 -0
- xpk-0.11.0.dist-info/RECORD +0 -95
- {xpk-0.11.0.dist-info → xpk-0.13.0.dist-info}/WHEEL +0 -0
- {xpk-0.11.0.dist-info → xpk-0.13.0.dist-info}/entry_points.txt +0 -0
- {xpk-0.11.0.dist-info → xpk-0.13.0.dist-info}/licenses/LICENSE +0 -0
- {xpk-0.11.0.dist-info → xpk-0.13.0.dist-info}/top_level.txt +0 -0
xpk/core/blueprint/blueprint_test.py
ADDED
@@ -0,0 +1,218 @@
+"""
+Copyright 2024 Google LLC
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    https://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import shutil
+
+import ruamel.yaml
+
+from xpk.core.blueprint.blueprint_definitions import Blueprint
+from xpk.core.blueprint.blueprint_generator import BlueprintGenerator
+from xpk.core.capacity import CapacityType
+
+yaml = ruamel.yaml.YAML()
+
+yaml.register_class(Blueprint)
+
+a3_yaml_test_path = "src/xpk/core/blueprint/testing/data/a3_mega.yaml"
+a3_spot_yaml_test_path = "src/xpk/core/blueprint/testing/data/a3_mega_spot.yaml"
+a3_ultra_yaml_test_path = "src/xpk/core/blueprint/testing/data/a3_ultra.yaml"
+a4_yaml_test_path = "src/xpk/core/blueprint/testing/data/a4.yaml"
+config_map_filename = "config-map.yaml.tftpl"
+kueue_conf_filename = "kueue-xpk-configuration.yaml.tftpl"
+tmp_test_dir = "/tmp/xpk_test"
+
+
+def prepare_test():
+  if os.path.exists(tmp_test_dir):
+    shutil.rmtree(tmp_test_dir)
+  os.mkdir(tmp_test_dir)
+
+
+def test_generate_a3_mega_blueprint():
+  prepare_test()
+  blueprint_name = "xpk-gke-a3-megagpu"
+  bp_generator = BlueprintGenerator(tmp_test_dir)
+  bp = bp_generator.generate_a3_mega_blueprint(
+      project_id="foo",
+      cluster_name="bar",
+      blueprint_name=blueprint_name,
+      prefix="prefix",
+      region="us-central1",
+      zone="us-central1-c",
+      auth_cidr="10.0.0.0/32",
+      reservation_placement_policy={
+          "type": "COMPACT",
+          "name": "test-reservation-placement",
+      },
+      reservation="test-reservation",
+      capacity_type=CapacityType.RESERVATION,
+      system_node_pool_min_node_count=5,
+  )
+
+  assert bp.blueprint_file.endswith("/prefix/xpk-gke-a3-megagpu.yaml")
+
+  with open(a3_yaml_test_path, encoding="utf-8") as stream:
+    ctk_yaml = yaml.load(stream)
+    with open(bp.blueprint_file, encoding="utf-8") as generated_blueprint:
+      ctk_test = yaml.load(generated_blueprint)
+      assert ctk_yaml.blueprint_name == ctk_test.blueprint_name
+      assert ctk_test.terraform_backend_defaults is None
+      assert ctk_yaml.toolkit_modules_url == ctk_test.toolkit_modules_url
+      assert (
+          ctk_yaml.toolkit_modules_version == ctk_test.toolkit_modules_version
+      )
+      assert ctk_yaml.vars == ctk_test.vars
+      assert ctk_test.deployment_groups == ctk_yaml.deployment_groups
+      assert os.path.exists(
+          os.path.join(
+              tmp_test_dir, "prefix", blueprint_name, config_map_filename
+          )
+      )
+      assert os.path.exists(
+          os.path.join(
+              tmp_test_dir, "prefix", blueprint_name, kueue_conf_filename
+          )
+      )
+
+  shutil.rmtree(tmp_test_dir)
+
+
+def test_generate_a3_mega_spot_blueprint():
+  prepare_test()
+  blueprint_name = "xpk-gke-a3-megagpu"
+  bp_generator = BlueprintGenerator(tmp_test_dir)
+  bp = bp_generator.generate_a3_mega_blueprint(
+      project_id="foo",
+      cluster_name="bar",
+      blueprint_name=blueprint_name,
+      prefix="prefix",
+      region="us-central1",
+      zone="us-central1-c",
+      auth_cidr="10.0.0.0/32",
+      capacity_type=CapacityType.SPOT,
+      system_node_pool_min_node_count=5,
+  )
+
+  assert bp.blueprint_file.endswith("/prefix/xpk-gke-a3-megagpu.yaml")
+
+  with open(a3_spot_yaml_test_path, encoding="utf-8") as stream:
+    ctk_yaml = yaml.load(stream)
+    with open(bp.blueprint_file, encoding="utf-8") as generated_blueprint:
+      ctk_test = yaml.load(generated_blueprint)
+      assert ctk_yaml.blueprint_name == ctk_test.blueprint_name
+      assert ctk_test.terraform_backend_defaults is None
+      assert ctk_yaml.toolkit_modules_url == ctk_test.toolkit_modules_url
+      assert (
+          ctk_yaml.toolkit_modules_version == ctk_test.toolkit_modules_version
+      )
+      assert ctk_yaml.vars == ctk_test.vars
+      assert ctk_test.deployment_groups == ctk_yaml.deployment_groups
+
+  shutil.rmtree(tmp_test_dir)
+
+
+def test_generate_a3_ultra_blueprint():
+  prepare_test()
+  blueprint_name = "xpk-gke-a3-ultra"
+  bp_generator = BlueprintGenerator(tmp_test_dir)
+  bp = bp_generator.generate_a3_ultra_blueprint(
+      project_id="foo",
+      cluster_name="gke-a3-ultra",
+      blueprint_name=blueprint_name,
+      region="us-central1",
+      zone="us-central1-c",
+      auth_cidr="10.0.0.0/32",
+      reservation="test-reservation",
+      system_node_pool_machine_type="e2-standard-16",
+      capacity_type=CapacityType.RESERVATION,
+      gcs_bucket="test-bucket",
+      prefix="testdir",
+  )
+  with open(a3_ultra_yaml_test_path, encoding="utf-8") as stream:
+    ctk_yaml = yaml.load(stream)
+    with open(bp.blueprint_file, encoding="utf-8") as generated_blueprint:
+      ctk_test = yaml.load(generated_blueprint)
+      assert ctk_yaml.blueprint_name == ctk_test.blueprint_name
+      assert (
+          ctk_yaml.terraform_backend_defaults
+          == ctk_test.terraform_backend_defaults
+      )
+      assert ctk_yaml.toolkit_modules_url == ctk_test.toolkit_modules_url
+      assert (
+          ctk_yaml.toolkit_modules_version == ctk_test.toolkit_modules_version
+      )
+      assert ctk_test.deployment_groups == ctk_yaml.deployment_groups
+      assert os.path.exists(
+          os.path.join(
+              tmp_test_dir, "testdir", blueprint_name, "mlgru-disable.yaml"
+          )
+      )
+      assert os.path.exists(
+          os.path.join(
+              tmp_test_dir, "testdir", blueprint_name, "nccl-installer.yaml"
+          )
+      )
+
+  shutil.rmtree(tmp_test_dir)
+
+
+def test_generate_a4_blueprint():
+  prepare_test()
+  blueprint_name = "xpk-gke-a4"
+  bp_generator = BlueprintGenerator(tmp_test_dir)
+  bp = bp_generator.generate_a4_blueprint(
+      project_id="foo",
+      cluster_name="gke-a4",
+      blueprint_name=blueprint_name,
+      region="us-central1",
+      zone="us-central1-c",
+      auth_cidr="10.0.0.0/32",
+      reservation="test-reservation",
+      system_node_pool_machine_type="e2-standard-16",
+      capacity_type=CapacityType.RESERVATION,
+      gcs_bucket="test-bucket",
+      prefix="testdir",
+  )
+  with open(a4_yaml_test_path, encoding="utf-8") as stream:
+    ctk_yaml = yaml.load(stream)
+    with open(bp.blueprint_file, encoding="utf-8") as generated_blueprint:
+      ctk_test = yaml.load(generated_blueprint)
+      assert ctk_yaml.blueprint_name == ctk_test.blueprint_name
+      assert (
+          ctk_yaml.terraform_backend_defaults
+          == ctk_test.terraform_backend_defaults
+      )
+      assert ctk_yaml.toolkit_modules_url == ctk_test.toolkit_modules_url
+      assert (
+          ctk_yaml.toolkit_modules_version == ctk_test.toolkit_modules_version
+      )
+      assert ctk_test.deployment_groups == ctk_yaml.deployment_groups
+      assert os.path.exists(
+          os.path.join(
+              tmp_test_dir, "testdir", blueprint_name, "storage_crd.yaml"
+          )
+      )
+      assert os.path.exists(
+          os.path.join(
+              tmp_test_dir,
+              "testdir",
+              blueprint_name,
+              "nccl-rdma-installer-a4.yaml",
+          )
+      )
+
+  shutil.rmtree(tmp_test_dir)

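Note: the comparisons above rely on ruamel.yaml's register_class, which the test file uses to round-trip the generated blueprint YAML back into Blueprint objects. A minimal, self-contained sketch of that mechanism, using a stand-in Point class rather than xpk's real Blueprint:

import io

import ruamel.yaml

yaml = ruamel.yaml.YAML()


class Point:
  """Stand-in for Blueprint; any attribute-bearing class works."""

  def __init__(self, x=0, y=0):
    self.x = x
    self.y = y


yaml.register_class(Point)  # registers a !Point tag for dump and load

buf = io.StringIO()
yaml.dump(Point(1, 2), buf)  # serializes as a !Point-tagged mapping
restored = yaml.load(buf.getvalue())  # rebuilt as a Point instance
assert (restored.x, restored.y) == (1, 2)
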
xpk/core/capacity.py
CHANGED
@@ -195,10 +195,12 @@ def get_capacity_arguments_from_capacity_type(
       capacity_args = '--spot'
     case CapacityType.FLEX_START:
       capacity_args = (
-          ' --flex-start --enable-
+          ' --flex-start --enable-autoscaling'
           ' --location-policy=ANY --reservation-affinity=none'
           f' --no-enable-autorepair --max-nodes={max_nodes}'
       )
+      if args.num_slices <= 1:
+        capacity_args += ' --enable-queued-provisioning'
     case CapacityType.RESERVATION:
       capacity_args = (
           f'--reservation-affinity=specific --reservation={args.reservation}'

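Note: with this hunk, --enable-queued-provisioning is requested only for single-slice flex-start clusters. A runnable sketch of the resulting flag assembly (the parameters stand in for fields of the real args namespace):

def flex_start_args(num_slices: int, max_nodes: int) -> str:
  capacity_args = (
      ' --flex-start --enable-autoscaling'
      ' --location-policy=ANY --reservation-affinity=none'
      f' --no-enable-autorepair --max-nodes={max_nodes}'
  )
  if num_slices <= 1:
    capacity_args += ' --enable-queued-provisioning'
  return capacity_args


assert '--enable-queued-provisioning' in flex_start_args(1, 4)
assert '--enable-queued-provisioning' not in flex_start_args(2, 4)
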
xpk/core/cluster.py
CHANGED
@@ -62,8 +62,8 @@ def set_jobset_on_cluster(args) -> int:
     0 if successful and 1 otherwise.
   """
   command = (
-      'kubectl apply --server-side -
-      f' https://github.com/kubernetes-sigs/jobset/releases/download/{JOBSET_VERSION}/manifests.yaml'
+      'kubectl apply --server-side --force-conflicts'
+      f' -f https://github.com/kubernetes-sigs/jobset/releases/download/{JOBSET_VERSION}/manifests.yaml'
   )
   task = f'Install Jobset on {args.cluster}'
   return_code = run_command_with_updates_retry(command, task, args)
@@ -217,8 +217,8 @@ def get_cluster_nodes_info(args) -> list[dict]:
   )
   if err_code != 0:
     xpk_exit(err_code)
-  data = yaml.safe_load(val)
-  return data['items']
+  data: dict[str, list[dict]] = yaml.safe_load(val)
+  return data['items']


 def count_nodes_on_cluster(args, system: SystemCharacteristics) -> int:
@@ -442,10 +442,14 @@ def setup_k8s_env(args) -> k8s_client.ApiClient:
   if not getattr(args, 'kind_cluster', False):
     add_zone_and_project(args)
     get_cluster_credentials(args)
-    args.project_number =
+    args.project_number = (
+        project_id_to_project_number(args.project)
+        if not args.dry_run
+        else abs(hash(args.project) % (10**12))  # 12 digit hash
+    )

   config.load_kube_config()
-  return k8s_client.ApiClient()
+  return k8s_client.ApiClient()


 def get_gpu_type_from_cluster(args) -> str:
@@ -817,9 +821,11 @@ def update_cluster_with_clouddns_if_necessary(args) -> int:
   server_config_return_code, gke_server_config = get_gke_server_config(args)
   if server_config_return_code != 0:
     xpk_exit(server_config_return_code)
+  assert gke_server_config
+
   upgrade_master_return_code = upgrade_gke_control_plane_version(
       args,
-      gke_server_config.default_rapid_gke_version,
+      gke_server_config.default_rapid_gke_version,
   )
   if upgrade_master_return_code > 0:
     xpk_print("Updating GKE cluster's control plane upgrade failed!")
@@ -828,7 +834,7 @@ def update_cluster_with_clouddns_if_necessary(args) -> int:
   # Upgrade nodepools version after the master upgrade.
   node_pool_update_code = upgrade_gke_nodepools_version(
       args,
-      gke_server_config.default_rapid_gke_version,
+      gke_server_config.default_rapid_gke_version,
   )
   if node_pool_update_code > 0:
     xpk_print('Upgrading nodepools version failed!')

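Note: under dry run, setup_k8s_env now skips the real project-number lookup and derives a placeholder of at most 12 digits from the project id. Isolated, that fallback is just:

def placeholder_project_number(project_id: str) -> int:
  # Dry-run stand-in for a real GCP project number; always < 10**12.
  return abs(hash(project_id) % (10**12))


assert 0 <= placeholder_project_number('my-project') < 10**12

One caveat: Python salts str hashing per interpreter run unless PYTHONHASHSEED is pinned, so the placeholder is stable within one process but not across runs.
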
xpk/core/cluster_private.py
CHANGED
@@ -19,6 +19,7 @@ from ..utils.network import (
     add_current_machine_to_networks,
     is_current_machine_in_any_network,
 )
+from ..utils.execution_context import is_dry_run
 from ..utils.objects import is_text_true
 from .commands import run_command_for_value, run_command_with_updates
 from .gcloud_context import zone_to_region
@@ -37,7 +38,7 @@ def authorize_private_cluster_access_if_necessary(args) -> int:
   if not args.private and args.authorized_networks is None:
     xpk_print('Cluster is public and no need to authorize networks.')
     return 0
-
+  elif not is_dry_run():
     xpk_print(
         'Cannot convert an existing public cluster to private. The arguments'
         ' --private and --authorized-networks are not acceptable for public'
@@ -95,7 +96,11 @@ def add_current_machine_to_networks_if_needed(
         "Adding current machine's IP address to the authorized networks"
         ' failed!'
     )
-    return
+    return (
+        add_current_machine_to_networks_return_code,
+        False,
+        authorized_networks,
+    )

   return 0, is_current_machine_in_network, authorized_networks

@@ -160,6 +165,7 @@ def get_cluster_authorized_networks(args) -> list[str]:
       command,
       'Fetching the list of authorized network from cluster describe.',
       args,
+      dry_run_return_val='127.0.0.1/32',
   )

   if return_code != 0:

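Note: is_dry_run comes from the new xpk/utils/execution_context.py (+28 lines, not shown in this excerpt). Judging only from the call sites, it exposes a process-wide dry-run flag; a purely hypothetical sketch of such a module:

# Hypothetical sketch -- the real xpk/utils/execution_context.py is not
# shown in this diff; only its is_dry_run() call sites are.
_dry_run: bool = False


def set_dry_run(value: bool) -> None:  # assumed setter, name invented
  global _dry_run
  _dry_run = value


def is_dry_run() -> bool:
  return _dry_run
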
xpk/core/commands.py
CHANGED
@@ -78,14 +78,13 @@ def run_command_batch(commands, jobname, per_command_name, output_logs):
     The max return code and a list of all the return codes.
   """

+  files = [open(f, 'w', encoding='utf-8') for f in output_logs]
   children = []
   start_time = datetime.datetime.now()
-  for
+  for command, file in zip(commands, files):
     children.append(
         # subprocess managed by list pylint: disable=consider-using-with
-        subprocess.Popen(
-            command, stdout=output_logs[i], stderr=output_logs[i], shell=True
-        )
+        subprocess.Popen(command, stdout=file, stderr=file, shell=True)
     )

   while True:
@@ -99,7 +98,7 @@ def run_command_batch(commands, jobname, per_command_name, output_logs):
       slow_worker_text = per_command_name[slow_worker_index]
       slow_str = (
           f', task {slow_worker_text} still working, logfile'
-          f' {output_logs[slow_worker_index]
+          f' {output_logs[slow_worker_index]}'
       )
     else:
       slow_str = ''
@@ -116,7 +115,7 @@ def run_command_batch(commands, jobname, per_command_name, output_logs):
       )
       xpk_print(
           f'Failure is {per_command_name[failing_index]}'
-          f' and logfile {output_logs[failing_index]
+          f' and logfile {output_logs[failing_index]}'
       )
       for child in children:
         child.terminate()
@@ -126,6 +125,10 @@ def run_command_batch(commands, jobname, per_command_name, output_logs):
       break

     time.sleep(1)
+
+  for file in files:
+    file.close()
+
   return max_returncode, returncodes


@@ -274,9 +277,9 @@ def run_command_for_value(
   else:
     if not quiet:
       xpk_print(f'Task: `{task}` terminated with code `{return_code}`')
-
-
-    return return_code, f'{
+    out_bytes, err_bytes = child.communicate()
+    out_str, err_str = str(out_bytes, 'UTF-8'), str(err_bytes, 'UTF-8')
+    return return_code, f'{out_str}\n{err_str}'
   else:
     if not quiet:
       xpk_print(
@@ -351,6 +354,6 @@ def run_command_with_full_controls(

 def run_kubectl_apply(yml_string: str, task: str, args: Namespace) -> int:
   tmp = write_tmp_file(yml_string)
-  command = f'kubectl apply -f {str(tmp
+  command = f'kubectl apply -f {str(tmp)}'
   err_code = run_command_with_updates(command, task, args)
   return err_code

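Note: run_command_batch now opens its log files up front, pairs each with a command via zip, and closes them after the polling loop (previously it passed output_logs entries by index). A condensed, runnable sketch of the new shape without the progress reporting:

import subprocess


def run_batch(commands: list[str], log_paths: list[str]) -> list[int]:
  files = [open(p, 'w', encoding='utf-8') for p in log_paths]
  children = [
      subprocess.Popen(cmd, stdout=f, stderr=f, shell=True)
      for cmd, f in zip(commands, files)
  ]
  returncodes = [child.wait() for child in children]
  for f in files:
    f.close()
  return returncodes


codes = run_batch(['echo one', 'echo two'], ['/tmp/one.log', '/tmp/two.log'])
assert codes == [0, 0]
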
xpk/core/config.py
CHANGED
@@ -22,7 +22,7 @@ from ..utils import file
 from ..utils.console import xpk_print

 # This is the version for XPK PyPI package
-__version__ = 'v0.
+__version__ = 'v0.13.0'
 XPK_CURRENT_VERSION = __version__
 XPK_CONFIG_FILE = os.path.expanduser('~/.config/xpk/config.yaml')

@@ -71,13 +71,12 @@ class XpkConfig:
     dir_path = '/'.join(self._config.split('/')[:-1])
     file.ensure_directory_exists(dir_path)

-    config_yaml = {'version': 'v1', CONFIGS_KEY: {}}
     if not os.path.exists(self._config):
       return None

     with open(self._config, encoding='utf-8', mode='r') as stream:
       config_yaml: dict = yaml.load(stream)
-
+      return config_yaml

   def _save_configs(self, config_yaml: dict) -> None:
     with open(self._config, encoding='utf-8', mode='w') as stream:
@@ -109,7 +108,7 @@ class XpkConfig:

   def get_all(
       self,
-  ) -> dict[str,
+  ) -> dict[str, str] | None:
     config_yaml = self._open_configs()
     if config_yaml is None:
       return None

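Note: _open_configs now returns the loaded mapping directly from inside the with block and drops the unused default dict, which is what makes the sharpened get_all() annotation (dict[str, str] | None) accurate. Usage as exercised by the new tests below, with an illustrative path:

from xpk.core.config import PROJECT_KEY, XpkConfig

cfg = XpkConfig('/tmp/xpk-example/config.yaml')  # illustrative path
cfg.set(PROJECT_KEY, 'my-project')
values = cfg.get_all()
assert values is not None and values[PROJECT_KEY] == 'my-project'
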
xpk/core/config_test.py
ADDED
@@ -0,0 +1,71 @@
+"""
+Copyright 2025 Google LLC
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    https://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+from xpk.core.config import XpkConfig, CFG_BUCKET_KEY, CLUSTER_NAME_KEY, PROJECT_KEY, ZONE_KEY
+
+import os
+import pytest
+
+config_tmp_path = '/tmp/config/config.yaml'
+
+
+@pytest.fixture(name='_')
+def _():
+  if os.path.exists(config_tmp_path):
+    os.remove(config_tmp_path)
+  yield
+  if os.path.exists(config_tmp_path):
+    os.remove(config_tmp_path)
+
+
+def test_config(_):
+  cfg = XpkConfig(config_tmp_path)
+  cfg.set('project-id', 'foo')
+  project_id = cfg.get('project-id')
+  assert project_id == 'foo'
+
+
+def test_config_get_all(_):
+  cfg = XpkConfig(config_tmp_path)
+  cfg.set(PROJECT_KEY, 'foo')
+  cfg.set(CLUSTER_NAME_KEY, 'bar')
+  cfg.set(ZONE_KEY, 'europe-west1-a')
+  cfg.set(CFG_BUCKET_KEY, 'cfg-bucket')
+
+  cfg_all = cfg.get_all()
+  assert cfg_all[PROJECT_KEY] == 'foo'
+  assert cfg_all[CLUSTER_NAME_KEY] == 'bar'
+  assert cfg_all[ZONE_KEY] == 'europe-west1-a'
+  assert cfg_all[CFG_BUCKET_KEY] == 'cfg-bucket'
+
+
+def test_config_get_empty(_):
+  cfg = XpkConfig(config_tmp_path)
+  val = cfg.get(PROJECT_KEY)
+  assert val is None
+
+
+def test_config_get_all_empty(_):
+  cfg = XpkConfig(config_tmp_path)
+  val = cfg.get_all()
+  assert not val
+
+
+def test_config_set_incorrect(_):
+  cfg = XpkConfig(config_tmp_path)
+  cfg.set('foo', 'bar')
+  cfg_all = cfg.get_all()
+  assert not cfg_all

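Note: the fixture deliberately named _ is a setup/teardown hook: each test requests it as a parameter so the config file is removed before and after the test, without the test body ever referencing the fixture value.
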
xpk/core/docker_image.py
CHANGED
@@ -21,6 +21,7 @@ import string

 from ..utils.console import xpk_exit, xpk_print
 from ..utils.file import write_tmp_file
+from ..utils.execution_context import is_dry_run
 from .commands import run_command_with_updates

 DEFAULT_DOCKER_IMAGE = 'python:3.10'
@@ -75,7 +76,9 @@ def build_docker_image_from_base_image(args, verbose=True) -> tuple[int, str]:
   """

   # Pick a name for the docker image.
-  docker_image_prefix =
+  docker_image_prefix = (
+      'dry-run' if is_dry_run() else os.getenv('USER', 'unknown')
+  )
   docker_name = f'{docker_image_prefix}-runner'

   script_dir_dockerfile = """FROM {base_docker_image}
@@ -94,7 +97,7 @@ def build_docker_image_from_base_image(args, verbose=True) -> tuple[int, str]:
   )
   tmp = write_tmp_file(docker_file)
   docker_build_command = (
-      f'docker buildx build --platform={PLATFORM} -f {str(tmp
+      f'docker buildx build --platform={PLATFORM} -f {str(tmp)} -t'
       f' {docker_name} {args.script_dir}'
   )
   xpk_print(f'Building {args.script_dir} into docker image.')
@@ -114,10 +117,16 @@ def build_docker_image_from_base_image(args, verbose=True) -> tuple[int, str]:

   # Pick a randomly generated `tag_length` character docker tag.
   tag_length = 4
-  tag_random_prefix =
-
+  tag_random_prefix = (
+      'prefix'
+      if is_dry_run()
+      else ''.join(random.choices(string.ascii_lowercase, k=tag_length))
+  )
+  tag_datetime = (
+      'current'
+      if is_dry_run()
+      else datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
   )
-  tag_datetime = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
   tag_name = f'{tag_random_prefix}-{tag_datetime}'
   cloud_docker_image = f'gcr.io/{args.project}/{docker_name}:{tag_name}'
   xpk_print(f'Adding Docker Image: {cloud_docker_image} to {args.project}')

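Note: in dry-run mode every run-dependent part of the image name (user, random tag, timestamp) becomes a fixed string, so repeated dry runs produce identical image names. The naming logic in isolation:

import datetime
import random
import string


def image_tag(dry_run: bool, tag_length: int = 4) -> str:
  prefix = (
      'prefix'
      if dry_run
      else ''.join(random.choices(string.ascii_lowercase, k=tag_length))
  )
  stamp = (
      'current'
      if dry_run
      else datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
  )
  return f'{prefix}-{stamp}'


assert image_tag(dry_run=True) == 'prefix-current'
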
xpk/core/docker_manager.py
CHANGED
@@ -30,7 +30,7 @@ import time

 DockerRunCommandExitCode = 135
 dockerBuildErrorCode = 134
 ctk_dockerfile_path = "Dockerfile"
-ctk_build_ref = "v1.
+ctk_build_ref = "v1.62.2"
 ctk_docker_image = "xpk-ctk"
 ctk_container_name = "xpk-ctk-container"
 gcloud_cfg_mount_path = "/root/.config/gcloud"

xpk/core/docker_resources.py
CHANGED
@@ -20,6 +20,7 @@ from .capacity import H100_DEVICE_TYPE, H100_MEGA_DEVICE_TYPE, H200_DEVICE_TYPE
 from .cluster import setup_k8s_env
 from .storage import GCS_FUSE_TYPE, GCP_FILESTORE_TYPE, PARALLELSTORE_TYPE, GCE_PD_TYPE, LUSTRE_TYPE, Storage, get_storages_to_mount
 from .system_characteristics import AcceleratorType, SystemCharacteristics
+from ..utils.execution_context import is_dry_run


 def get_main_container_resources(
@@ -72,7 +73,7 @@ def get_env_container(args, system: SystemCharacteristics) -> str:
   if system.accelerator_type == AcceleratorType['CPU']:
     return get_cpu_env(args, system)

-  return format_env_dict(args.env, system)
+  return format_env_dict(args.env, system)


 def get_gpu_env(args, system) -> str:
@@ -272,8 +273,10 @@ def get_volumes(args, system: SystemCharacteristics) -> str:
               - name: shared-data
   """

-  storages: list[Storage] =
-
+  storages: list[Storage] = (
+      []
+      if is_dry_run()
+      else get_storages_to_mount(setup_k8s_env(args), args.storage)
   )
   for storage in storages:
     if storage.type in {
@@ -325,8 +328,10 @@ def get_volume_mounts(args, system: SystemCharacteristics) -> str:
   elif system.accelerator_type == AcceleratorType['GPU']:
     volume_mount_yaml = ''

-    storages: list[Storage] =
-
+    storages: list[Storage] = (
+        []
+        if is_dry_run()
+        else get_storages_to_mount(setup_k8s_env(args), args.storage)
     )
     for storage in storages:
       if storage.type in {

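Note: this is the same dry-run guard pattern as in docker_image.py: when dry-running, xpk avoids contacting the cluster at all (setup_k8s_env is never called) and proceeds with an empty storage list.
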
xpk/core/filestore.py
CHANGED
@@ -93,11 +93,12 @@ class FilestoreClient:

     for instance in instancesZonal:
       if instance.name == fullname_zonal:
-        return instance
+        return instance

     for instance in instancesRegional:
       if instance.name == fullname_regional:
-        return instance
+        return instance
+    return None


   def check_instance_exists(self) -> bool:
     """Check if Filestore instance exists"""
@@ -111,6 +112,7 @@ class FilestoreClient:
   def get_instance_location(self) -> str:
     """Get Filestore instance's location"""
     self.load_instance()
+    assert self.instance
     return str(self.instance.name.split("/")[3])

   def create_instance(
@@ -192,6 +194,7 @@ class FilestoreClient:

   def create_sc(self, name: str, network: str) -> dict:
     """Create a yaml representing filestore StorageClass."""
+    assert self.instance
     data = templates.load(FS_SC_PATH)
     data["metadata"]["name"] = get_storage_class_name(name)
     data["parameters"]["tier"] = self.instance.tier.name
@@ -202,6 +205,7 @@ class FilestoreClient:

   def create_pv(self, name: str, vol: str, access_mode: str) -> dict:
     """Create a yaml representing filestore PersistentVolume."""
+    assert self.instance
     data = templates.load(FS_PV_PATH)
     data["metadata"]["name"] = get_pv_name(name)
     data["spec"]["storageClassName"] = get_storage_class_name(name)
@@ -219,6 +223,7 @@ class FilestoreClient:

   def create_pvc(self, name: str, access_mode: str) -> dict:
     """Create a yaml representing filestore PersistentVolumeClaim."""
+    assert self.instance
     data = templates.load(FS_PVC_PATH)
     data["metadata"]["name"] = get_pvc_name(name)
     data["spec"]["accessModes"] = [access_mode]

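Note: the repeated assert self.instance additions are Optional-narrowing guards. load_instance can legitimately leave instance as None (see the new return None above), and the asserts both document the non-None precondition and let type checkers narrow the type. The pattern in isolation:

class Holder:
  value: str | None = None

  def use(self) -> str:
    assert self.value  # narrows str | None to str for mypy/pyright
    return self.value.upper()
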
xpk/core/gcloud_context.py
CHANGED
@@ -75,7 +75,7 @@ def add_zone_and_project(args):
   xpk_print(f'Working on {args.project} and {args.zone}')


-def zone_to_region(zone) -> str:
+def zone_to_region(zone: str) -> str:
   """Helper function converts zone name to region name.

   Args:
@@ -85,7 +85,7 @@ def zone_to_region(zone) -> str:
     The region name.
   """
   zone_terms = zone.split('-')
-  return zone_terms[0] + '-' + zone_terms[1]
+  return zone_terms[0] + '-' + zone_terms[1]


 @dataclass

|
xpk/core/jobset.py
CHANGED
|
@@ -134,7 +134,7 @@ def update_jobset_resources_if_necessary(args):
|
|
|
134
134
|
memory_limit_size=new_memory_limit,
|
|
135
135
|
)
|
|
136
136
|
tmp = write_tmp_file(yml_string)
|
|
137
|
-
command = f'kubectl apply -f {str(tmp
|
|
137
|
+
command = f'kubectl apply -f {str(tmp)}'
|
|
138
138
|
|
|
139
139
|
task = 'Updating jobset Controller Manager resources'
|
|
140
140
|
return_code = run_command_with_updates_retry(command, task, args)
|