xpk 0.11.0__py3-none-any.whl → 0.12.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xpk/commands/cluster.py +10 -11
- xpk/commands/cluster_gcluster.py +2 -1
- xpk/commands/common.py +3 -3
- xpk/commands/info.py +12 -12
- xpk/commands/job.py +12 -10
- xpk/commands/kjob_common.py +2 -1
- xpk/commands/storage.py +1 -1
- xpk/commands/workload.py +12 -6
- xpk/core/blueprint/blueprint_generator.py +7 -7
- xpk/core/blueprint/blueprint_test.py +218 -0
- xpk/core/capacity.py +3 -1
- xpk/core/cluster.py +9 -7
- xpk/core/cluster_private.py +5 -1
- xpk/core/commands.py +3 -3
- xpk/core/config.py +3 -4
- xpk/core/config_test.py +71 -0
- xpk/core/docker_manager.py +1 -1
- xpk/core/docker_resources.py +1 -1
- xpk/core/filestore.py +7 -2
- xpk/core/gcloud_context.py +2 -2
- xpk/core/kjob.py +2 -1
- xpk/core/kueue.py +6 -2
- xpk/core/nap.py +4 -4
- xpk/core/nodepool_test.py +82 -0
- xpk/core/resources.py +1 -7
- xpk/core/storage.py +14 -14
- xpk/core/system_characteristics.py +1 -1
- xpk/core/workload.py +11 -0
- xpk/core/workload_decorators/rdma_decorator.py +3 -2
- xpk/core/workload_decorators/storage_decorator.py +2 -1
- xpk/core/workload_decorators/tcpx_decorator.py +4 -2
- xpk/core/workload_decorators/tcpx_decorator_test.py +267 -0
- xpk/core/workload_decorators/tcpxo_decorator.py +2 -1
- xpk/core/workload_test.py +28 -0
- xpk/main.py +9 -10
- xpk/parser/cluster.py +67 -49
- xpk/parser/common.py +45 -36
- xpk/parser/storage.py +12 -13
- xpk/parser/workload.py +57 -39
- xpk/utils/console.py +2 -1
- {xpk-0.11.0.dist-info → xpk-0.12.0.dist-info}/METADATA +4 -1
- {xpk-0.11.0.dist-info → xpk-0.12.0.dist-info}/RECORD +46 -41
- {xpk-0.11.0.dist-info → xpk-0.12.0.dist-info}/WHEEL +0 -0
- {xpk-0.11.0.dist-info → xpk-0.12.0.dist-info}/entry_points.txt +0 -0
- {xpk-0.11.0.dist-info → xpk-0.12.0.dist-info}/licenses/LICENSE +0 -0
- {xpk-0.11.0.dist-info → xpk-0.12.0.dist-info}/top_level.txt +0 -0
xpk/core/cluster.py
CHANGED
|
@@ -62,8 +62,8 @@ def set_jobset_on_cluster(args) -> int:
|
|
|
62
62
|
0 if successful and 1 otherwise.
|
|
63
63
|
"""
|
|
64
64
|
command = (
|
|
65
|
-
'kubectl apply --server-side -
|
|
66
|
-
f' https://github.com/kubernetes-sigs/jobset/releases/download/{JOBSET_VERSION}/manifests.yaml'
|
|
65
|
+
'kubectl apply --server-side --force-conflicts'
|
|
66
|
+
f' -f https://github.com/kubernetes-sigs/jobset/releases/download/{JOBSET_VERSION}/manifests.yaml'
|
|
67
67
|
)
|
|
68
68
|
task = f'Install Jobset on {args.cluster}'
|
|
69
69
|
return_code = run_command_with_updates_retry(command, task, args)
|
|
@@ -217,8 +217,8 @@ def get_cluster_nodes_info(args) -> list[dict]:
|
|
|
217
217
|
)
|
|
218
218
|
if err_code != 0:
|
|
219
219
|
xpk_exit(err_code)
|
|
220
|
-
data = yaml.safe_load(val)
|
|
221
|
-
return data['items']
|
|
220
|
+
data: dict[str, list[dict]] = yaml.safe_load(val)
|
|
221
|
+
return data['items']
|
|
222
222
|
|
|
223
223
|
|
|
224
224
|
def count_nodes_on_cluster(args, system: SystemCharacteristics) -> int:
|
|
@@ -445,7 +445,7 @@ def setup_k8s_env(args) -> k8s_client.ApiClient:
|
|
|
445
445
|
args.project_number = project_id_to_project_number(args.project)
|
|
446
446
|
|
|
447
447
|
config.load_kube_config()
|
|
448
|
-
return k8s_client.ApiClient()
|
|
448
|
+
return k8s_client.ApiClient()
|
|
449
449
|
|
|
450
450
|
|
|
451
451
|
def get_gpu_type_from_cluster(args) -> str:
|
|
@@ -817,9 +817,11 @@ def update_cluster_with_clouddns_if_necessary(args) -> int:
|
|
|
817
817
|
server_config_return_code, gke_server_config = get_gke_server_config(args)
|
|
818
818
|
if server_config_return_code != 0:
|
|
819
819
|
xpk_exit(server_config_return_code)
|
|
820
|
+
assert gke_server_config
|
|
821
|
+
|
|
820
822
|
upgrade_master_return_code = upgrade_gke_control_plane_version(
|
|
821
823
|
args,
|
|
822
|
-
gke_server_config.default_rapid_gke_version,
|
|
824
|
+
gke_server_config.default_rapid_gke_version,
|
|
823
825
|
)
|
|
824
826
|
if upgrade_master_return_code > 0:
|
|
825
827
|
xpk_print("Updating GKE cluster's control plane upgrade failed!")
|
|
@@ -828,7 +830,7 @@ def update_cluster_with_clouddns_if_necessary(args) -> int:
|
|
|
828
830
|
# Upgrade nodepools version after the master upgrade.
|
|
829
831
|
node_pool_update_code = upgrade_gke_nodepools_version(
|
|
830
832
|
args,
|
|
831
|
-
gke_server_config.default_rapid_gke_version,
|
|
833
|
+
gke_server_config.default_rapid_gke_version,
|
|
832
834
|
)
|
|
833
835
|
if node_pool_update_code > 0:
|
|
834
836
|
xpk_print('Upgrading nodepools version failed!')
|
xpk/core/cluster_private.py
CHANGED
|
@@ -95,7 +95,11 @@ def add_current_machine_to_networks_if_needed(
|
|
|
95
95
|
"Adding current machine's IP address to the authorized networks"
|
|
96
96
|
' failed!'
|
|
97
97
|
)
|
|
98
|
-
return
|
|
98
|
+
return (
|
|
99
|
+
add_current_machine_to_networks_return_code,
|
|
100
|
+
False,
|
|
101
|
+
authorized_networks,
|
|
102
|
+
)
|
|
99
103
|
|
|
100
104
|
return 0, is_current_machine_in_network, authorized_networks
|
|
101
105
|
|
xpk/core/commands.py
CHANGED
|
@@ -274,9 +274,9 @@ def run_command_for_value(
|
|
|
274
274
|
else:
|
|
275
275
|
if not quiet:
|
|
276
276
|
xpk_print(f'Task: `{task}` terminated with code `{return_code}`')
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
return return_code, f'{
|
|
277
|
+
out_bytes, err_bytes = child.communicate()
|
|
278
|
+
out_str, err_str = str(out_bytes, 'UTF-8'), str(err_bytes, 'UTF-8')
|
|
279
|
+
return return_code, f'{out_str}\n{err_str}'
|
|
280
280
|
else:
|
|
281
281
|
if not quiet:
|
|
282
282
|
xpk_print(
|
xpk/core/config.py
CHANGED
|
@@ -22,7 +22,7 @@ from ..utils import file
|
|
|
22
22
|
from ..utils.console import xpk_print
|
|
23
23
|
|
|
24
24
|
# This is the version for XPK PyPI package
|
|
25
|
-
__version__ = 'v0.
|
|
25
|
+
__version__ = 'v0.12.0'
|
|
26
26
|
XPK_CURRENT_VERSION = __version__
|
|
27
27
|
XPK_CONFIG_FILE = os.path.expanduser('~/.config/xpk/config.yaml')
|
|
28
28
|
|
|
@@ -71,13 +71,12 @@ class XpkConfig:
|
|
|
71
71
|
dir_path = '/'.join(self._config.split('/')[:-1])
|
|
72
72
|
file.ensure_directory_exists(dir_path)
|
|
73
73
|
|
|
74
|
-
config_yaml = {'version': 'v1', CONFIGS_KEY: {}}
|
|
75
74
|
if not os.path.exists(self._config):
|
|
76
75
|
return None
|
|
77
76
|
|
|
78
77
|
with open(self._config, encoding='utf-8', mode='r') as stream:
|
|
79
78
|
config_yaml: dict = yaml.load(stream)
|
|
80
|
-
|
|
79
|
+
return config_yaml
|
|
81
80
|
|
|
82
81
|
def _save_configs(self, config_yaml: dict) -> None:
|
|
83
82
|
with open(self._config, encoding='utf-8', mode='w') as stream:
|
|
@@ -109,7 +108,7 @@ class XpkConfig:
|
|
|
109
108
|
|
|
110
109
|
def get_all(
|
|
111
110
|
self,
|
|
112
|
-
) -> dict[str,
|
|
111
|
+
) -> dict[str, str] | None:
|
|
113
112
|
config_yaml = self._open_configs()
|
|
114
113
|
if config_yaml is None:
|
|
115
114
|
return None
|
xpk/core/config_test.py
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Copyright 2025 Google LLC
|
|
3
|
+
|
|
4
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
you may not use this file except in compliance with the License.
|
|
6
|
+
You may obtain a copy of the License at
|
|
7
|
+
|
|
8
|
+
https://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
|
|
10
|
+
Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
See the License for the specific language governing permissions and
|
|
14
|
+
limitations under the License.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from xpk.core.config import XpkConfig, CFG_BUCKET_KEY, CLUSTER_NAME_KEY, PROJECT_KEY, ZONE_KEY
|
|
18
|
+
|
|
19
|
+
import os
|
|
20
|
+
import pytest
|
|
21
|
+
|
|
22
|
+
config_tmp_path = '/tmp/config/config.yaml'
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
@pytest.fixture(name='_')
|
|
26
|
+
def _():
|
|
27
|
+
if os.path.exists(config_tmp_path):
|
|
28
|
+
os.remove(config_tmp_path)
|
|
29
|
+
yield
|
|
30
|
+
if os.path.exists(config_tmp_path):
|
|
31
|
+
os.remove(config_tmp_path)
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def test_config(_):
|
|
35
|
+
cfg = XpkConfig(config_tmp_path)
|
|
36
|
+
cfg.set('project-id', 'foo')
|
|
37
|
+
project_id = cfg.get('project-id')
|
|
38
|
+
assert project_id == 'foo'
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def test_config_get_all(_):
|
|
42
|
+
cfg = XpkConfig(config_tmp_path)
|
|
43
|
+
cfg.set(PROJECT_KEY, 'foo')
|
|
44
|
+
cfg.set(CLUSTER_NAME_KEY, 'bar')
|
|
45
|
+
cfg.set(ZONE_KEY, 'europe-west1-a')
|
|
46
|
+
cfg.set(CFG_BUCKET_KEY, 'cfg-bucket')
|
|
47
|
+
|
|
48
|
+
cfg_all = cfg.get_all()
|
|
49
|
+
assert cfg_all[PROJECT_KEY] == 'foo'
|
|
50
|
+
assert cfg_all[CLUSTER_NAME_KEY] == 'bar'
|
|
51
|
+
assert cfg_all[ZONE_KEY] == 'europe-west1-a'
|
|
52
|
+
assert cfg_all[CFG_BUCKET_KEY] == 'cfg-bucket'
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def test_config_get_empty(_):
|
|
56
|
+
cfg = XpkConfig(config_tmp_path)
|
|
57
|
+
val = cfg.get(PROJECT_KEY)
|
|
58
|
+
assert val is None
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def test_config_get_all_empty(_):
|
|
62
|
+
cfg = XpkConfig(config_tmp_path)
|
|
63
|
+
val = cfg.get_all()
|
|
64
|
+
assert not val
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def test_config_set_incorrect(_):
|
|
68
|
+
cfg = XpkConfig(config_tmp_path)
|
|
69
|
+
cfg.set('foo', 'bar')
|
|
70
|
+
cfg_all = cfg.get_all()
|
|
71
|
+
assert not cfg_all
|
xpk/core/docker_manager.py
CHANGED
|
@@ -30,7 +30,7 @@ import time
|
|
|
30
30
|
DockerRunCommandExitCode = 135
|
|
31
31
|
dockerBuildErrorCode = 134
|
|
32
32
|
ctk_dockerfile_path = "Dockerfile"
|
|
33
|
-
ctk_build_ref = "v1.
|
|
33
|
+
ctk_build_ref = "v1.62.2"
|
|
34
34
|
ctk_docker_image = "xpk-ctk"
|
|
35
35
|
ctk_container_name = "xpk-ctk-container"
|
|
36
36
|
gcloud_cfg_mount_path = "/root/.config/gcloud"
|
xpk/core/docker_resources.py
CHANGED
|
@@ -72,7 +72,7 @@ def get_env_container(args, system: SystemCharacteristics) -> str:
|
|
|
72
72
|
if system.accelerator_type == AcceleratorType['CPU']:
|
|
73
73
|
return get_cpu_env(args, system)
|
|
74
74
|
|
|
75
|
-
return format_env_dict(args.env, system)
|
|
75
|
+
return format_env_dict(args.env, system)
|
|
76
76
|
|
|
77
77
|
|
|
78
78
|
def get_gpu_env(args, system) -> str:
|
xpk/core/filestore.py
CHANGED
|
@@ -93,11 +93,12 @@ class FilestoreClient:
|
|
|
93
93
|
|
|
94
94
|
for instance in instancesZonal:
|
|
95
95
|
if instance.name == fullname_zonal:
|
|
96
|
-
return instance
|
|
96
|
+
return instance
|
|
97
97
|
|
|
98
98
|
for instance in instancesRegional:
|
|
99
99
|
if instance.name == fullname_regional:
|
|
100
|
-
return instance
|
|
100
|
+
return instance
|
|
101
|
+
return None
|
|
101
102
|
|
|
102
103
|
def check_instance_exists(self) -> bool:
|
|
103
104
|
"""Check if Filestore instance exists"""
|
|
@@ -111,6 +112,7 @@ class FilestoreClient:
|
|
|
111
112
|
def get_instance_location(self) -> str:
|
|
112
113
|
"""Get Filestore instance's location"""
|
|
113
114
|
self.load_instance()
|
|
115
|
+
assert self.instance
|
|
114
116
|
return str(self.instance.name.split("/")[3])
|
|
115
117
|
|
|
116
118
|
def create_instance(
|
|
@@ -192,6 +194,7 @@ class FilestoreClient:
|
|
|
192
194
|
|
|
193
195
|
def create_sc(self, name: str, network: str) -> dict:
|
|
194
196
|
"""Create a yaml representing filestore StorageClass."""
|
|
197
|
+
assert self.instance
|
|
195
198
|
data = templates.load(FS_SC_PATH)
|
|
196
199
|
data["metadata"]["name"] = get_storage_class_name(name)
|
|
197
200
|
data["parameters"]["tier"] = self.instance.tier.name
|
|
@@ -202,6 +205,7 @@ class FilestoreClient:
|
|
|
202
205
|
|
|
203
206
|
def create_pv(self, name: str, vol: str, access_mode: str) -> dict:
|
|
204
207
|
"""Create a yaml representing filestore PersistentVolume."""
|
|
208
|
+
assert self.instance
|
|
205
209
|
data = templates.load(FS_PV_PATH)
|
|
206
210
|
data["metadata"]["name"] = get_pv_name(name)
|
|
207
211
|
data["spec"]["storageClassName"] = get_storage_class_name(name)
|
|
@@ -219,6 +223,7 @@ class FilestoreClient:
|
|
|
219
223
|
|
|
220
224
|
def create_pvc(self, name: str, access_mode: str) -> dict:
|
|
221
225
|
"""Create a yaml representing filestore PersistentVolumeClaim."""
|
|
226
|
+
assert self.instance
|
|
222
227
|
data = templates.load(FS_PVC_PATH)
|
|
223
228
|
data["metadata"]["name"] = get_pvc_name(name)
|
|
224
229
|
data["spec"]["accessModes"] = [access_mode]
|
xpk/core/gcloud_context.py
CHANGED
|
@@ -75,7 +75,7 @@ def add_zone_and_project(args):
|
|
|
75
75
|
xpk_print(f'Working on {args.project} and {args.zone}')
|
|
76
76
|
|
|
77
77
|
|
|
78
|
-
def zone_to_region(zone) -> str:
|
|
78
|
+
def zone_to_region(zone: str) -> str:
|
|
79
79
|
"""Helper function converts zone name to region name.
|
|
80
80
|
|
|
81
81
|
Args:
|
|
@@ -85,7 +85,7 @@ def zone_to_region(zone) -> str:
|
|
|
85
85
|
The region name.
|
|
86
86
|
"""
|
|
87
87
|
zone_terms = zone.split('-')
|
|
88
|
-
return zone_terms[0] + '-' + zone_terms[1]
|
|
88
|
+
return zone_terms[0] + '-' + zone_terms[1]
|
|
89
89
|
|
|
90
90
|
|
|
91
91
|
@dataclass
|
xpk/core/kjob.py
CHANGED
|
@@ -277,7 +277,8 @@ def decorate_job_template_with_gpu(yml_string: str, gpu_type: str) -> str:
|
|
|
277
277
|
job_spec = rdma_decorator.decorate_kjob_template(job_spec)
|
|
278
278
|
job_template_dict = yaml.safe_load(yml_string)
|
|
279
279
|
job_template_dict["template"] = job_spec
|
|
280
|
-
|
|
280
|
+
yaml_result: str = yaml.dump(job_template_dict, sort_keys=False)
|
|
281
|
+
return yaml_result
|
|
281
282
|
|
|
282
283
|
|
|
283
284
|
def create_job_template_instance(
|
xpk/core/kueue.py
CHANGED
|
@@ -43,7 +43,7 @@ from .system_characteristics import (
|
|
|
43
43
|
KUEUE_VERSION = 'v0.12.2'
|
|
44
44
|
CLUSTER_QUEUE_NAME = 'cluster-queue'
|
|
45
45
|
LOCAL_QUEUE_NAME = 'multislice-queue'
|
|
46
|
-
WAIT_FOR_KUEUE_TIMEOUT = '
|
|
46
|
+
WAIT_FOR_KUEUE_TIMEOUT = '10m'
|
|
47
47
|
MEMORY_SIZE_PER_VM = 1.2
|
|
48
48
|
MIN_MEMORY_LIMIT_SIZE = 4096
|
|
49
49
|
|
|
@@ -89,6 +89,10 @@ metadata:
|
|
|
89
89
|
name: dws-config
|
|
90
90
|
spec:
|
|
91
91
|
provisioningClassName: queued-provisioning.gke.io
|
|
92
|
+
podSetUpdates:
|
|
93
|
+
nodeSelector:
|
|
94
|
+
- key: autoscaling.gke.io/provisioning-request
|
|
95
|
+
valueFromProvisioningClassDetail: ResizeRequestName
|
|
92
96
|
managedResources:
|
|
93
97
|
- {managed_resource}
|
|
94
98
|
---
|
|
@@ -320,7 +324,7 @@ def delete_multikueueclusters_definitions(args) -> int:
|
|
|
320
324
|
return return_code
|
|
321
325
|
|
|
322
326
|
|
|
323
|
-
def get_kueue_version(args) ->
|
|
327
|
+
def get_kueue_version(args) -> tuple[int, str]:
|
|
324
328
|
command = 'kubectl kueue version'
|
|
325
329
|
task = 'Get kueue version on server'
|
|
326
330
|
return_code, val = run_command_for_value(command, task, args)
|
xpk/core/nap.py
CHANGED
|
@@ -37,6 +37,7 @@ from .resources import (
|
|
|
37
37
|
)
|
|
38
38
|
from .scheduling import get_total_chips_requested_from_args
|
|
39
39
|
from .system_characteristics import AcceleratorType, SystemCharacteristics
|
|
40
|
+
from typing import cast
|
|
40
41
|
|
|
41
42
|
AUTOPROVISIONING_CONFIG_FILE = """
|
|
42
43
|
management:
|
|
@@ -269,9 +270,6 @@ def is_autoprovisioning_enabled(
|
|
|
269
270
|
bool is true if autoprovisioning is enabled, false otherwise.
|
|
270
271
|
int of 0 if successful and 1 otherwise.
|
|
271
272
|
"""
|
|
272
|
-
# Currently autoprovisioning is not enabled for Pathways workloads. b/360898087
|
|
273
|
-
if args.use_pathways:
|
|
274
|
-
return False, 0
|
|
275
273
|
|
|
276
274
|
resources_configmap_name = f'{args.cluster}-{CLUSTER_RESOURCES_CONFIGMAP}'
|
|
277
275
|
cluster_config_map = get_cluster_configmap(args, resources_configmap_name)
|
|
@@ -339,11 +337,13 @@ def get_autoprovisioning_node_selector_args(args) -> tuple[str, int]:
|
|
|
339
337
|
)
|
|
340
338
|
return node_selector_args, 1
|
|
341
339
|
|
|
342
|
-
return_code,
|
|
340
|
+
return_code, optional_capacity_type_str = get_value_from_map(
|
|
343
341
|
CAPACITY_TYPE_CONFIG_KEY, cluster_config_map
|
|
344
342
|
)
|
|
345
343
|
if return_code != 0:
|
|
346
344
|
return node_selector_args, return_code
|
|
345
|
+
# return_code==0 implies capacity_type is defined
|
|
346
|
+
capacity_type_str = cast(str, optional_capacity_type_str)
|
|
347
347
|
|
|
348
348
|
if capacity_type_str == CapacityType.RESERVATION.name:
|
|
349
349
|
return_code, args.reservation = get_value_from_map(
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Copyright 2025 Google LLC
|
|
3
|
+
|
|
4
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
you may not use this file except in compliance with the License.
|
|
6
|
+
You may obtain a copy of the License at
|
|
7
|
+
|
|
8
|
+
https://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
|
|
10
|
+
Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
See the License for the specific language governing permissions and
|
|
14
|
+
limitations under the License.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from xpk.core.nodepool import get_desired_node_pool_names
|
|
18
|
+
|
|
19
|
+
CLUSTER_NAME = "running-cucumber"
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def node_pool_name(number: int) -> str:
|
|
23
|
+
return f"{CLUSTER_NAME}-np-{number}"
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def test_compute_desired_node_pool_names_with_desired_larger_than_existing():
|
|
27
|
+
result = get_desired_node_pool_names(
|
|
28
|
+
existing_node_pool_names=[node_pool_name(0)],
|
|
29
|
+
cluster_name=CLUSTER_NAME,
|
|
30
|
+
desired_node_pool_count=2,
|
|
31
|
+
)
|
|
32
|
+
|
|
33
|
+
expected_result = [node_pool_name(0), node_pool_name(1)]
|
|
34
|
+
assert set(result) == set(expected_result)
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def test_compute_desired_node_pool_names_with_desired_smaller_than_existing():
|
|
38
|
+
result = get_desired_node_pool_names(
|
|
39
|
+
existing_node_pool_names=[node_pool_name(0), node_pool_name(1)],
|
|
40
|
+
cluster_name=CLUSTER_NAME,
|
|
41
|
+
desired_node_pool_count=1,
|
|
42
|
+
)
|
|
43
|
+
|
|
44
|
+
expected_result = [node_pool_name(0)]
|
|
45
|
+
assert set(result) == set(expected_result)
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def test_compute_desired_node_pool_names_with_consecutive_numbers_missing():
|
|
49
|
+
result = get_desired_node_pool_names(
|
|
50
|
+
existing_node_pool_names=[node_pool_name(0), node_pool_name(3)],
|
|
51
|
+
cluster_name=CLUSTER_NAME,
|
|
52
|
+
desired_node_pool_count=3,
|
|
53
|
+
)
|
|
54
|
+
|
|
55
|
+
expected_result = [node_pool_name(0), node_pool_name(1), node_pool_name(3)]
|
|
56
|
+
assert set(result) == set(expected_result)
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def test_compute_desired_node_pool_names_with_consecutive_numbers_missing_and_desired_equal_to_existing():
|
|
60
|
+
result = get_desired_node_pool_names(
|
|
61
|
+
existing_node_pool_names=[node_pool_name(0), node_pool_name(3)],
|
|
62
|
+
cluster_name=CLUSTER_NAME,
|
|
63
|
+
desired_node_pool_count=2,
|
|
64
|
+
)
|
|
65
|
+
|
|
66
|
+
expected_result = [node_pool_name(0), node_pool_name(3)]
|
|
67
|
+
assert set(result) == set(expected_result)
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def test_compute_desired_node_pool_names_with_unknown_node_pools():
|
|
71
|
+
result = get_desired_node_pool_names(
|
|
72
|
+
existing_node_pool_names=[
|
|
73
|
+
"unknown-node-pool",
|
|
74
|
+
node_pool_name(0),
|
|
75
|
+
node_pool_name(3),
|
|
76
|
+
],
|
|
77
|
+
cluster_name=CLUSTER_NAME,
|
|
78
|
+
desired_node_pool_count=2,
|
|
79
|
+
)
|
|
80
|
+
|
|
81
|
+
expected_result = [node_pool_name(0), node_pool_name(3)]
|
|
82
|
+
assert set(result) == set(expected_result)
|
xpk/core/resources.py
CHANGED
|
@@ -108,13 +108,7 @@ def create_cluster_configmaps(
|
|
|
108
108
|
device_type = system.device_type
|
|
109
109
|
if system.accelerator_type == AcceleratorType['GPU']:
|
|
110
110
|
resources_data = f'{device_type}: "{int(args.num_nodes)}"'
|
|
111
|
-
elif
|
|
112
|
-
not args.enable_pathways
|
|
113
|
-
and args.enable_autoprovisioning
|
|
114
|
-
and autoprovisioning_config
|
|
115
|
-
):
|
|
116
|
-
# Currently autoprovisioning is not supported with Pathways.
|
|
117
|
-
# Auto provisioning will have variable topologies for a gke accelerator type.
|
|
111
|
+
elif args.enable_autoprovisioning and autoprovisioning_config:
|
|
118
112
|
resources_data = (
|
|
119
113
|
f'{system.gke_accelerator}: {AUTOPROVISIONING_CONFIG_VALUE}'
|
|
120
114
|
)
|
xpk/core/storage.py
CHANGED
|
@@ -17,7 +17,7 @@ limitations under the License.
|
|
|
17
17
|
import os
|
|
18
18
|
from argparse import Namespace
|
|
19
19
|
from dataclasses import dataclass
|
|
20
|
-
from typing import Any
|
|
20
|
+
from typing import Any, cast
|
|
21
21
|
|
|
22
22
|
import ruamel.yaml
|
|
23
23
|
from google.cloud import storage as gcp_storage
|
|
@@ -95,17 +95,17 @@ class Storage:
|
|
|
95
95
|
Args:
|
|
96
96
|
data: A dictionary containing the Storage resource definition.
|
|
97
97
|
"""
|
|
98
|
-
metadata
|
|
98
|
+
metadata = data.get("metadata", {})
|
|
99
99
|
self.name = metadata.get("name")
|
|
100
100
|
spec = data.get("spec", {})
|
|
101
|
-
self.type
|
|
102
|
-
self.auto_mount
|
|
103
|
-
self.mount_point
|
|
104
|
-
self.readonly
|
|
105
|
-
self.manifest
|
|
106
|
-
self.pvc
|
|
107
|
-
self.pv
|
|
108
|
-
self.bucket
|
|
101
|
+
self.type = spec.get("type")
|
|
102
|
+
self.auto_mount = spec.get("auto_mount")
|
|
103
|
+
self.mount_point = spec.get("mount_point")
|
|
104
|
+
self.readonly = spec.get("readonly")
|
|
105
|
+
self.manifest = spec.get("manifest")
|
|
106
|
+
self.pvc = spec.get("pvc")
|
|
107
|
+
self.pv = spec.get("pv")
|
|
108
|
+
self.bucket = self._get_bucket()
|
|
109
109
|
|
|
110
110
|
def fields_as_list(self) -> list[str]:
|
|
111
111
|
"""
|
|
@@ -117,9 +117,9 @@ class Storage:
|
|
|
117
117
|
return [
|
|
118
118
|
self.name,
|
|
119
119
|
self.type,
|
|
120
|
-
self.auto_mount,
|
|
120
|
+
str(self.auto_mount),
|
|
121
121
|
self.mount_point,
|
|
122
|
-
self.readonly,
|
|
122
|
+
str(self.readonly),
|
|
123
123
|
self.manifest,
|
|
124
124
|
]
|
|
125
125
|
|
|
@@ -133,7 +133,7 @@ class Storage:
|
|
|
133
133
|
client = k8s_client.CoreV1Api()
|
|
134
134
|
try:
|
|
135
135
|
pv: V1PersistentVolume = client.read_persistent_volume(self.pv)
|
|
136
|
-
return pv.spec.csi.volume_handle
|
|
136
|
+
return cast(str, pv.spec.csi.volume_handle)
|
|
137
137
|
except ApiException as e:
|
|
138
138
|
xpk_print(
|
|
139
139
|
f"Exception when calling CoreV1Api->read_persistent_volume: {e}"
|
|
@@ -150,7 +150,7 @@ class Storage:
|
|
|
150
150
|
client = k8s_client.CoreV1Api()
|
|
151
151
|
try:
|
|
152
152
|
pv: V1PersistentVolume = client.read_persistent_volume(self.pv)
|
|
153
|
-
return pv.spec.mount_options
|
|
153
|
+
return cast(list[str], pv.spec.mount_options)
|
|
154
154
|
except ApiException as e:
|
|
155
155
|
xpk_print(
|
|
156
156
|
f"Exception when calling CoreV1Api->read_persistent_volume: {e}"
|
xpk/core/workload.py
CHANGED
|
@@ -14,6 +14,7 @@ See the License for the specific language governing permissions and
|
|
|
14
14
|
limitations under the License.
|
|
15
15
|
"""
|
|
16
16
|
|
|
17
|
+
import re
|
|
17
18
|
from ..utils.console import xpk_exit, xpk_print
|
|
18
19
|
from .commands import run_command_for_value
|
|
19
20
|
from .gcloud_context import zone_to_region
|
|
@@ -240,3 +241,13 @@ def wait_for_job_completion(args) -> int:
|
|
|
240
241
|
xpk_print('Your workload did not complete successfully')
|
|
241
242
|
return 125
|
|
242
243
|
return 0
|
|
244
|
+
|
|
245
|
+
|
|
246
|
+
GCP_NAME_FILTER_VALUE_REGEX = re.compile(r'[a-z0-9\-]+')
|
|
247
|
+
"""Defines correct name prefix value (contains only letters, numbers and dashes) that can be used in GCP filter chips."""
|
|
248
|
+
|
|
249
|
+
|
|
250
|
+
def get_jobsets_list_gcp_link(project: str) -> str:
|
|
251
|
+
"""Returns a link to Cloud Console JobSets list"""
|
|
252
|
+
|
|
253
|
+
return f'https://console.cloud.google.com/kubernetes/aiml/deployments/jobs?project={project}'
|
|
@@ -18,7 +18,7 @@ import yaml
|
|
|
18
18
|
from ...utils.yaml import literal_string
|
|
19
19
|
|
|
20
20
|
|
|
21
|
-
def decorate_kjob_template(job_manifest) ->
|
|
21
|
+
def decorate_kjob_template(job_manifest: dict) -> dict:
|
|
22
22
|
spec = (
|
|
23
23
|
job_manifest.setdefault('spec', {})
|
|
24
24
|
.setdefault('template', {})
|
|
@@ -64,7 +64,8 @@ def decorate_jobset(jobset_manifest_str: str, sub_networks: list[str]) -> str:
|
|
|
64
64
|
add_tolerations(job_manifest)
|
|
65
65
|
update_gpu_containers(job_manifest)
|
|
66
66
|
|
|
67
|
-
|
|
67
|
+
yaml_str: str = yaml.dump(manifest, sort_keys=False)
|
|
68
|
+
return yaml_str
|
|
68
69
|
|
|
69
70
|
|
|
70
71
|
def get_interfaces_entry(sub_networks: list[str]) -> tuple[str, str]:
|
|
@@ -36,7 +36,8 @@ def decorate_jobset(jobset_manifest_str, storages) -> str:
|
|
|
36
36
|
job_manifest = job['template']
|
|
37
37
|
add_annotations(job_manifest, storages)
|
|
38
38
|
add_volumes(job_manifest, storage_volumes)
|
|
39
|
-
|
|
39
|
+
yaml_result: str = yaml.dump(manifest, sort_keys=False)
|
|
40
|
+
return yaml_result
|
|
40
41
|
|
|
41
42
|
|
|
42
43
|
def add_annotations(job_manifest, storages):
|
|
@@ -55,7 +55,8 @@ def decorate_jobset(jobset_manifest_str: str) -> str:
|
|
|
55
55
|
for job in manifest['spec']['replicatedJobs']:
|
|
56
56
|
job_manifest = job['template']
|
|
57
57
|
job_manifest = decorate_job(job_manifest)
|
|
58
|
-
|
|
58
|
+
yaml_str: str = yaml.dump(manifest, sort_keys=False)
|
|
59
|
+
return yaml_str
|
|
59
60
|
|
|
60
61
|
|
|
61
62
|
def get_interfaces_annotation() -> dict:
|
|
@@ -131,6 +132,7 @@ def add_volumes(job_manifest: dict):
|
|
|
131
132
|
})
|
|
132
133
|
volumes.append({'name': 'sys', 'hostPath': {'path': '/sys'}})
|
|
133
134
|
volumes.append({'name': 'proc-sys', 'hostPath': {'path': '/proc/sys'}})
|
|
135
|
+
volumes.append({'name': 'tcpx-socket', 'hostPath': {'path': '/run/tcpx'}})
|
|
134
136
|
volumes.append(
|
|
135
137
|
{'name': 'dshm', 'emptyDir': {'medium': 'Memory', 'sizeLimit': '128Gi'}}
|
|
136
138
|
)
|
|
@@ -168,7 +170,7 @@ def add_tcpx_daemon_container(job_manifest):
|
|
|
168
170
|
spec['initContainers'].append(tcpxo_daemon_container)
|
|
169
171
|
|
|
170
172
|
|
|
171
|
-
def update_gpu_containers(job_manifest):
|
|
173
|
+
def update_gpu_containers(job_manifest) -> None:
|
|
172
174
|
for container in job_manifest['spec']['template']['spec']['containers']:
|
|
173
175
|
if 'nvidia.com/gpu' in container.get('resources', {}).get('limits', {}):
|
|
174
176
|
env: list = container.setdefault('env', [])
|