xpk 0.13.0__py3-none-any.whl → 0.14.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- integration/__init__.py +15 -0
- integration/docker_manager_test.py +102 -0
- integration/gcluster_a3mega_test.py +204 -0
- integration/gcluster_a3ultra_test.py +176 -0
- integration/gcluster_a4_test.py +176 -0
- integration/gcluster_test.py +107 -0
- xpk/commands/batch.py +9 -2
- xpk/commands/cluster.py +143 -117
- xpk/commands/cluster_gcluster.py +81 -14
- xpk/commands/cluster_gcluster_test.py +177 -0
- xpk/commands/cluster_test.py +92 -0
- xpk/commands/common.py +14 -26
- xpk/commands/info.py +11 -9
- xpk/commands/inspector.py +21 -10
- xpk/commands/job.py +25 -9
- xpk/commands/kind.py +39 -40
- xpk/commands/kjob_common.py +4 -4
- xpk/commands/run.py +9 -2
- xpk/commands/shell.py +13 -10
- xpk/commands/storage.py +21 -0
- xpk/commands/version.py +0 -4
- xpk/commands/workload.py +84 -29
- xpk/commands/workload_test.py +81 -0
- xpk/core/blueprint/blueprint_generator.py +4 -40
- xpk/core/blueprint/blueprint_test.py +0 -6
- xpk/core/blueprint/testing/__init__.py +15 -0
- xpk/core/capacity.py +6 -5
- xpk/core/cluster.py +91 -194
- xpk/core/cluster_private.py +6 -11
- xpk/core/commands.py +11 -18
- xpk/core/config.py +1 -1
- xpk/core/docker_image.py +3 -4
- xpk/core/gcloud_context.py +26 -2
- xpk/core/gcloud_context_test.py +96 -0
- xpk/core/gcluster_manager.py +0 -3
- xpk/core/jobset.py +4 -7
- xpk/core/kjob.py +14 -27
- xpk/core/kueue_manager.py +423 -0
- xpk/core/kueue_manager_test.py +574 -0
- xpk/core/monitoring.py +1 -1
- xpk/core/nap.py +10 -15
- xpk/core/network.py +17 -18
- xpk/core/nodepool.py +66 -77
- xpk/core/nodepool_test.py +198 -1
- xpk/core/pathways.py +5 -5
- xpk/core/ray.py +10 -14
- xpk/core/resources.py +6 -11
- xpk/core/scheduling.py +19 -1
- xpk/core/scheduling_test.py +31 -0
- xpk/core/system_characteristics.py +350 -232
- xpk/core/system_characteristics_test.py +73 -0
- xpk/core/vertex.py +1 -1
- xpk/core/workload.py +7 -8
- xpk/main.py +2 -4
- xpk/parser/cluster.py +7 -0
- xpk/parser/cluster_test.py +66 -0
- xpk/parser/common.py +11 -0
- xpk/parser/workload.py +62 -25
- xpk/parser/workload_test.py +82 -0
- xpk/templates/cluster_preheat.yaml.j2 +31 -0
- xpk/templates/filestore-pv.yaml +17 -0
- xpk/templates/filestore-pvc.yaml +11 -0
- xpk/templates/filestore-sc.yaml +10 -0
- xpk/templates/fuse-pv.yaml +17 -0
- xpk/templates/fuse-pvc.yaml +13 -0
- xpk/templates/kueue_config.yaml.j2 +95 -0
- xpk/templates/kueue_gke_default_topology.yaml.j2 +10 -0
- xpk/templates/kueue_sub_slicing_topology.yaml.j2 +14 -0
- xpk/templates/mtc-cpc.yaml +15 -0
- xpk/templates/volume_bundle.yaml +7 -0
- xpk/utils/feature_flags.py +28 -0
- xpk/utils/kueue.py +20 -0
- xpk/utils/templates.py +15 -0
- xpk/utils/topology.py +46 -0
- xpk/utils/topology_test.py +63 -0
- xpk/utils/validation.py +79 -55
- xpk/utils/validation_test.py +37 -0
- {xpk-0.13.0.dist-info → xpk-0.14.1.dist-info}/METADATA +6 -1
- xpk-0.14.1.dist-info/RECORD +133 -0
- xpk-0.14.1.dist-info/top_level.txt +2 -0
- xpk/core/kueue.py +0 -561
- xpk-0.13.0.dist-info/RECORD +0 -101
- xpk-0.13.0.dist-info/top_level.txt +0 -1
- {xpk-0.13.0.dist-info → xpk-0.14.1.dist-info}/WHEEL +0 -0
- {xpk-0.13.0.dist-info → xpk-0.14.1.dist-info}/entry_points.txt +0 -0
- {xpk-0.13.0.dist-info → xpk-0.14.1.dist-info}/licenses/LICENSE +0 -0
xpk/core/commands.py
CHANGED
|
@@ -18,14 +18,14 @@ import datetime
|
|
|
18
18
|
import subprocess
|
|
19
19
|
import sys
|
|
20
20
|
import time
|
|
21
|
-
from argparse import Namespace
|
|
22
21
|
|
|
23
22
|
from ..utils.objects import chunks
|
|
24
23
|
from ..utils.file import make_tmp_files, write_tmp_file
|
|
25
24
|
from ..utils.console import xpk_print
|
|
25
|
+
from ..utils.execution_context import is_dry_run
|
|
26
26
|
|
|
27
27
|
|
|
28
|
-
def run_commands(commands, jobname, per_command_name, batch=10, dry_run=False):
|
|
28
|
+
def run_commands(commands, jobname, per_command_name, batch=10):
|
|
29
29
|
"""Run commands in groups of `batch`.
|
|
30
30
|
|
|
31
31
|
Args:
|
|
@@ -33,7 +33,6 @@ def run_commands(commands, jobname, per_command_name, batch=10, dry_run=False):
|
|
|
33
33
|
jobname: the name of the job.
|
|
34
34
|
per_command_name: list of command names.
|
|
35
35
|
batch: number of commands to run in parallel.
|
|
36
|
-
dry_run: enables dry_run if set to true.
|
|
37
36
|
|
|
38
37
|
Returns:
|
|
39
38
|
0 if successful and 1 otherwise.
|
|
@@ -46,7 +45,7 @@ def run_commands(commands, jobname, per_command_name, batch=10, dry_run=False):
|
|
|
46
45
|
f'Breaking up a total of {len(commands)} commands into'
|
|
47
46
|
f' {len(commands_batched)} batches'
|
|
48
47
|
)
|
|
49
|
-
if dry_run:
|
|
48
|
+
if is_dry_run():
|
|
50
49
|
xpk_print('Pretending all the jobs succeeded')
|
|
51
50
|
return 0
|
|
52
51
|
|
|
@@ -133,14 +132,13 @@ def run_command_batch(commands, jobname, per_command_name, output_logs):
|
|
|
133
132
|
|
|
134
133
|
|
|
135
134
|
def run_command_with_updates_retry(
|
|
136
|
-
command, task, args, verbose=True, num_retry_attempts=5, wait_seconds=10
|
|
135
|
+
command, task, verbose=True, num_retry_attempts=5, wait_seconds=10
|
|
137
136
|
) -> int:
|
|
138
137
|
"""Generic run commands function with updates and retry logic.
|
|
139
138
|
|
|
140
139
|
Args:
|
|
141
140
|
command: command to execute
|
|
142
141
|
task: user-facing name of the task
|
|
143
|
-
args: user provided arguments for running the command.
|
|
144
142
|
verbose: shows stdout and stderr if set to true. Set to True by default.
|
|
145
143
|
num_retry_attempts: number of attempts to retry the command.
|
|
146
144
|
This has a default value in the function arguments.
|
|
@@ -160,23 +158,22 @@ def run_command_with_updates_retry(
|
|
|
160
158
|
time.sleep(wait_seconds)
|
|
161
159
|
i += 1
|
|
162
160
|
xpk_print(f'Try {i}: {task}')
|
|
163
|
-
return_code = run_command_with_updates(command, task, args, verbose=verbose)
|
|
161
|
+
return_code = run_command_with_updates(command, task, verbose=verbose)
|
|
164
162
|
return return_code
|
|
165
163
|
|
|
166
164
|
|
|
167
|
-
def run_command_with_updates(command, task, global_args, verbose=True) -> int:
|
|
165
|
+
def run_command_with_updates(command, task, verbose=True) -> int:
|
|
168
166
|
"""Generic run commands function with updates.
|
|
169
167
|
|
|
170
168
|
Args:
|
|
171
169
|
command: command to execute
|
|
172
170
|
task: user-facing name of the task
|
|
173
|
-
global_args: user provided arguments for running the command.
|
|
174
171
|
verbose: shows stdout and stderr if set to true. Set to True by default.
|
|
175
172
|
|
|
176
173
|
Returns:
|
|
177
174
|
0 if successful and 1 otherwise.
|
|
178
175
|
"""
|
|
179
|
-
if
|
|
176
|
+
if is_dry_run():
|
|
180
177
|
xpk_print(
|
|
181
178
|
f'Task: `{task}` is implemented by the following command'
|
|
182
179
|
' not running since it is a dry run.'
|
|
@@ -226,7 +223,6 @@ def run_command_with_updates(command, task, global_args, verbose=True) -> int:
|
|
|
226
223
|
def run_command_for_value(
|
|
227
224
|
command,
|
|
228
225
|
task,
|
|
229
|
-
global_args,
|
|
230
226
|
dry_run_return_val='0',
|
|
231
227
|
print_timer=False,
|
|
232
228
|
hide_error=False,
|
|
@@ -239,7 +235,6 @@ def run_command_for_value(
|
|
|
239
235
|
Args:
|
|
240
236
|
command: user provided command to run.
|
|
241
237
|
task: user provided task name for running the command.
|
|
242
|
-
global_args: user provided arguments for running the command.
|
|
243
238
|
dry_run_return_val: return value of this command for dry run.
|
|
244
239
|
print_timer: print out the time the command is running.
|
|
245
240
|
hide_error: hide the error from the command output upon success.
|
|
@@ -249,7 +244,7 @@ def run_command_for_value(
|
|
|
249
244
|
int: return_code, default is 0
|
|
250
245
|
str: return_val, default is '0'
|
|
251
246
|
"""
|
|
252
|
-
if
|
|
247
|
+
if is_dry_run():
|
|
253
248
|
xpk_print(
|
|
254
249
|
f'Task: `{task}` is implemented by the following command'
|
|
255
250
|
' not running since it is a dry run.'
|
|
@@ -305,7 +300,6 @@ def run_command_for_value(
|
|
|
305
300
|
def run_command_with_full_controls(
|
|
306
301
|
command: str,
|
|
307
302
|
task: str,
|
|
308
|
-
global_args: Namespace,
|
|
309
303
|
instructions: str | None = None,
|
|
310
304
|
) -> int:
|
|
311
305
|
"""Run command in current shell with system out, in and error handles. Wait
|
|
@@ -314,13 +308,12 @@ def run_command_with_full_controls(
|
|
|
314
308
|
Args:
|
|
315
309
|
command: command to execute
|
|
316
310
|
task: user-facing name of the task
|
|
317
|
-
global_args: user provided arguments for running the command.
|
|
318
311
|
verbose: shows stdout and stderr if set to true. Set to True by default.
|
|
319
312
|
|
|
320
313
|
Returns:
|
|
321
314
|
0 if successful and 1 otherwise.
|
|
322
315
|
"""
|
|
323
|
-
if
|
|
316
|
+
if is_dry_run():
|
|
324
317
|
xpk_print(
|
|
325
318
|
f'Task: `{task}` is implemented by the following command'
|
|
326
319
|
' not running since it is a dry run.'
|
|
@@ -352,8 +345,8 @@ def run_command_with_full_controls(
|
|
|
352
345
|
return return_code
|
|
353
346
|
|
|
354
347
|
|
|
355
|
-
def run_kubectl_apply(yml_string: str, task: str, args: Namespace) -> int:
|
|
348
|
+
def run_kubectl_apply(yml_string: str, task: str) -> int:
|
|
356
349
|
tmp = write_tmp_file(yml_string)
|
|
357
350
|
command = f'kubectl apply -f {str(tmp)}'
|
|
358
|
-
err_code = run_command_with_updates(command, task, args)
|
|
351
|
+
err_code = run_command_with_updates(command, task)
|
|
359
352
|
return err_code
|
xpk/core/config.py
CHANGED
|
@@ -22,7 +22,7 @@ from ..utils import file
|
|
|
22
22
|
from ..utils.console import xpk_print
|
|
23
23
|
|
|
24
24
|
# This is the version for XPK PyPI package
|
|
25
|
-
__version__ = 'v0.13.0'
|
|
25
|
+
__version__ = 'v0.14.1'
|
|
26
26
|
XPK_CURRENT_VERSION = __version__
|
|
27
27
|
XPK_CONFIG_FILE = os.path.expanduser('~/.config/xpk/config.yaml')
|
|
28
28
|
|
xpk/core/docker_image.py
CHANGED
|
@@ -49,7 +49,7 @@ def validate_docker_image(docker_image, args) -> int:
|
|
|
49
49
|
f'gcloud container images describe {docker_image} --project {project}'
|
|
50
50
|
)
|
|
51
51
|
return_code = run_command_with_updates(
|
|
52
|
-
command, 'Validate Docker Image',
|
|
52
|
+
command, 'Validate Docker Image', verbose=False
|
|
53
53
|
)
|
|
54
54
|
if return_code != 0:
|
|
55
55
|
xpk_print(
|
|
@@ -104,7 +104,6 @@ def build_docker_image_from_base_image(args, verbose=True) -> tuple[int, str]:
|
|
|
104
104
|
return_code = run_command_with_updates(
|
|
105
105
|
docker_build_command,
|
|
106
106
|
'Building script_dir into docker image',
|
|
107
|
-
args,
|
|
108
107
|
verbose=verbose,
|
|
109
108
|
)
|
|
110
109
|
if return_code != 0:
|
|
@@ -134,7 +133,7 @@ def build_docker_image_from_base_image(args, verbose=True) -> tuple[int, str]:
|
|
|
134
133
|
# Tag the docker image.
|
|
135
134
|
tag_docker_image_command = f'docker tag {docker_name} {cloud_docker_image}'
|
|
136
135
|
return_code = run_command_with_updates(
|
|
137
|
-
tag_docker_image_command, 'Tag Docker Image',
|
|
136
|
+
tag_docker_image_command, 'Tag Docker Image', verbose=verbose
|
|
138
137
|
)
|
|
139
138
|
if return_code != 0:
|
|
140
139
|
xpk_print(
|
|
@@ -147,7 +146,7 @@ def build_docker_image_from_base_image(args, verbose=True) -> tuple[int, str]:
|
|
|
147
146
|
# Upload image to Artifact Registry.
|
|
148
147
|
upload_docker_image_command = f'docker push {cloud_docker_image}'
|
|
149
148
|
return_code = run_command_with_updates(
|
|
150
|
-
upload_docker_image_command, 'Upload Docker Image',
|
|
149
|
+
upload_docker_image_command, 'Upload Docker Image', verbose=verbose
|
|
151
150
|
)
|
|
152
151
|
if return_code != 0:
|
|
153
152
|
xpk_print(
|
xpk/core/gcloud_context.py
CHANGED
|
@@ -18,8 +18,9 @@ import subprocess
|
|
|
18
18
|
import sys
|
|
19
19
|
from dataclasses import dataclass
|
|
20
20
|
|
|
21
|
-
from ..utils.console import xpk_print
|
|
21
|
+
from ..utils.console import xpk_print, xpk_exit
|
|
22
22
|
from .commands import run_command_for_value
|
|
23
|
+
from functools import lru_cache
|
|
23
24
|
|
|
24
25
|
|
|
25
26
|
def get_project():
|
|
@@ -85,9 +86,33 @@ def zone_to_region(zone: str) -> str:
|
|
|
85
86
|
The region name.
|
|
86
87
|
"""
|
|
87
88
|
zone_terms = zone.split('-')
|
|
89
|
+
if len(zone_terms) != 2 and len(zone_terms) != 3:
|
|
90
|
+
raise ValueError(f'Invalid zone name: {zone}')
|
|
88
91
|
return zone_terms[0] + '-' + zone_terms[1]
|
|
89
92
|
|
|
90
93
|
|
|
94
|
+
@lru_cache()
|
|
95
|
+
def get_cluster_location(project: str, name: str, zone: str) -> str:
|
|
96
|
+
"""Helper function to resolve location for a given cluster"""
|
|
97
|
+
return_code, result = run_command_for_value(
|
|
98
|
+
command=(
|
|
99
|
+
'gcloud container clusters list '
|
|
100
|
+
f'--project={project} '
|
|
101
|
+
f'--filter=name={name} '
|
|
102
|
+
'--format="value(location)"'
|
|
103
|
+
),
|
|
104
|
+
task='Find cluster region or zone',
|
|
105
|
+
dry_run_return_val=zone_to_region(zone),
|
|
106
|
+
)
|
|
107
|
+
|
|
108
|
+
if return_code != 0:
|
|
109
|
+
xpk_print('Error: Unable to determine cluster region or zone')
|
|
110
|
+
xpk_exit(return_code)
|
|
111
|
+
|
|
112
|
+
regions = result.strip().splitlines()
|
|
113
|
+
return zone if zone in regions else zone_to_region(zone)
|
|
114
|
+
|
|
115
|
+
|
|
91
116
|
@dataclass
|
|
92
117
|
class GkeServerConfig:
|
|
93
118
|
"""Stores the valid gke versions based on gcloud recommendations."""
|
|
@@ -139,7 +164,6 @@ def get_gke_server_config(args) -> tuple[int, GkeServerConfig | None]:
|
|
|
139
164
|
return_code, cmd_output = run_command_for_value(
|
|
140
165
|
command,
|
|
141
166
|
command_description,
|
|
142
|
-
args,
|
|
143
167
|
hide_error=True,
|
|
144
168
|
)
|
|
145
169
|
if return_code != 0:
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Copyright 2025 Google LLC
|
|
3
|
+
|
|
4
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
you may not use this file except in compliance with the License.
|
|
6
|
+
You may obtain a copy of the License at
|
|
7
|
+
|
|
8
|
+
https://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
|
|
10
|
+
Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
See the License for the specific language governing permissions and
|
|
14
|
+
limitations under the License.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
import pytest
|
|
18
|
+
from .gcloud_context import get_cluster_location, zone_to_region
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def test_zone_to_region_raises_when_zone_is_invalid():
|
|
22
|
+
with pytest.raises(ValueError):
|
|
23
|
+
zone_to_region("us")
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def test_zone_to_region_returns_region_when_region_given():
|
|
27
|
+
assert zone_to_region("us-central1") == "us-central1"
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def test_zone_to_region_returns_region_when_zone_is_valid():
|
|
31
|
+
assert zone_to_region("us-central1-a") == "us-central1"
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def test_get_cluster_location_returns_cluster_region_when_cluster_is_regional(
|
|
35
|
+
mocker,
|
|
36
|
+
):
|
|
37
|
+
mocker.patch(
|
|
38
|
+
"xpk.core.gcloud_context.run_command_for_value",
|
|
39
|
+
return_value=(0, "us-central1"),
|
|
40
|
+
)
|
|
41
|
+
|
|
42
|
+
result = get_cluster_location(
|
|
43
|
+
project="project1", name="name1", zone="us-central1-a"
|
|
44
|
+
)
|
|
45
|
+
|
|
46
|
+
assert result == "us-central1"
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def test_get_cluster_location_returns_cluster_zone_when_both_regional_and_zonal_clusters_exist(
|
|
50
|
+
mocker,
|
|
51
|
+
):
|
|
52
|
+
mocker.patch(
|
|
53
|
+
"xpk.core.gcloud_context.run_command_for_value",
|
|
54
|
+
return_value=(0, "us-central1\nus-central1-a"),
|
|
55
|
+
)
|
|
56
|
+
|
|
57
|
+
result = get_cluster_location(
|
|
58
|
+
project="project2", name="name2", zone="us-central1-a"
|
|
59
|
+
)
|
|
60
|
+
|
|
61
|
+
assert result == "us-central1-a"
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def test_get_cluster_location_returns_given_zone_converted_to_region_when_cluster_is_not_found(
|
|
65
|
+
mocker,
|
|
66
|
+
):
|
|
67
|
+
mocker.patch(
|
|
68
|
+
"xpk.core.gcloud_context.run_command_for_value", return_value=(0, "")
|
|
69
|
+
)
|
|
70
|
+
|
|
71
|
+
result = get_cluster_location(
|
|
72
|
+
project="project3", name="name3", zone="us-central1-a"
|
|
73
|
+
)
|
|
74
|
+
|
|
75
|
+
assert result == "us-central1"
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def test_get_cluster_location_caches_previous_command_result(mocker):
|
|
79
|
+
mock = mocker.patch(
|
|
80
|
+
"xpk.core.gcloud_context.run_command_for_value", return_value=(0, "")
|
|
81
|
+
)
|
|
82
|
+
|
|
83
|
+
get_cluster_location(project="project4", name="name4", zone="us-central1-a")
|
|
84
|
+
|
|
85
|
+
assert mock.call_count == 1
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def test_get_cluster_location_invokes_command_for_different_input_args(mocker):
|
|
89
|
+
mock = mocker.patch(
|
|
90
|
+
"xpk.core.gcloud_context.run_command_for_value", return_value=(0, "")
|
|
91
|
+
)
|
|
92
|
+
|
|
93
|
+
get_cluster_location(project="project5", name="name5", zone="us-central1-a")
|
|
94
|
+
get_cluster_location(project="project6", name="name6", zone="us-central1-a")
|
|
95
|
+
|
|
96
|
+
assert mock.call_count == 2
|
xpk/core/gcluster_manager.py
CHANGED
|
@@ -27,9 +27,6 @@ blueprint_file_name = 'xpk_blueprint.yaml'
|
|
|
27
27
|
deployment_module = '/out/xpk-deployment'
|
|
28
28
|
a3_utils_dir_name = 'a3-mega-xpk'
|
|
29
29
|
config_map_repo_path = 'src/xpk/blueprints/a3-mega-xpk/config-map.yaml.tftpl'
|
|
30
|
-
kueue_config_repo_path = (
|
|
31
|
-
'src/xpk/blueprints/a3-mega-xpk/kueue-xpk-configuration.yaml.tftpl'
|
|
32
|
-
)
|
|
33
30
|
|
|
34
31
|
|
|
35
32
|
class GclusterManager:
|
xpk/core/jobset.py
CHANGED
|
@@ -18,7 +18,7 @@ import math
|
|
|
18
18
|
|
|
19
19
|
from ..utils.console import xpk_exit, xpk_print
|
|
20
20
|
from ..utils.file import write_tmp_file
|
|
21
|
-
from ..core.kueue import (
|
|
21
|
+
from ..core.kueue_manager import (
|
|
22
22
|
MEMORY_SIZE_PER_VM,
|
|
23
23
|
MIN_MEMORY_LIMIT_SIZE,
|
|
24
24
|
)
|
|
@@ -110,19 +110,16 @@ spec:
|
|
|
110
110
|
"""
|
|
111
111
|
|
|
112
112
|
|
|
113
|
-
def update_jobset_resources_if_necessary(args):
|
|
113
|
+
def update_jobset_resources_if_necessary():
|
|
114
114
|
"""Update the jobset manifest to increase the resources for the jobset controller manager.
|
|
115
115
|
|
|
116
|
-
Args:
|
|
117
|
-
args: user provided arguments for running the command.
|
|
118
|
-
|
|
119
116
|
Returns:
|
|
120
117
|
0 if successful and 1 otherwise.
|
|
121
118
|
"""
|
|
122
119
|
# Get total number of nodes
|
|
123
120
|
cmd_total_node_num = 'kubectl get node --no-headers | wc -l'
|
|
124
121
|
return_code, out = run_command_for_value(
|
|
125
|
-
cmd_total_node_num, 'Count total nodes'
|
|
122
|
+
cmd_total_node_num, 'Count total nodes'
|
|
126
123
|
)
|
|
127
124
|
if return_code != 0:
|
|
128
125
|
xpk_exit(1)
|
|
@@ -137,7 +134,7 @@ def update_jobset_resources_if_necessary(args):
|
|
|
137
134
|
command = f'kubectl apply -f {str(tmp)}'
|
|
138
135
|
|
|
139
136
|
task = 'Updating jobset Controller Manager resources'
|
|
140
|
-
return_code = run_command_with_updates_retry(command, task
|
|
137
|
+
return_code = run_command_with_updates_retry(command, task)
|
|
141
138
|
if return_code != 0:
|
|
142
139
|
xpk_print(f'{task} returned ERROR {return_code}')
|
|
143
140
|
return return_code
|
xpk/core/kjob.py
CHANGED
|
@@ -167,8 +167,8 @@ Kueue_TAS_annotation = "kueue.x-k8s.io/podset-preferred-topology=cloud.google.co
|
|
|
167
167
|
default_interface_annotation = "networking.gke.io/default-interface=eth0"
|
|
168
168
|
|
|
169
169
|
|
|
170
|
-
def get_a4_pod_template_annotations(args) -> tuple[str, str]:
|
|
171
|
-
sub_networks = get_cluster_subnetworks(
|
|
170
|
+
def get_a4_pod_template_annotations() -> tuple[str, str]:
|
|
171
|
+
sub_networks = get_cluster_subnetworks()
|
|
172
172
|
interfaces_key, interfaces_value = rdma_decorator.get_interfaces_entry(
|
|
173
173
|
sub_networks
|
|
174
174
|
)
|
|
@@ -179,8 +179,8 @@ def get_a4_pod_template_annotations(args) -> tuple[str, str]:
|
|
|
179
179
|
)
|
|
180
180
|
|
|
181
181
|
|
|
182
|
-
def get_a3ultra_pod_template_annotations(args: Namespace) -> tuple[str, str]:
|
|
183
|
-
sub_networks = get_cluster_subnetworks(
|
|
182
|
+
def get_a3ultra_pod_template_annotations() -> tuple[str, str]:
|
|
183
|
+
sub_networks = get_cluster_subnetworks()
|
|
184
184
|
interfaces_key, interfaces_value = rdma_decorator.get_interfaces_entry(
|
|
185
185
|
sub_networks
|
|
186
186
|
)
|
|
@@ -191,11 +191,9 @@ def get_a3ultra_pod_template_annotations(args: Namespace) -> tuple[str, str]:
|
|
|
191
191
|
)
|
|
192
192
|
|
|
193
193
|
|
|
194
|
-
def get_a3mega_pod_template_annotations(
|
|
195
|
-
args: Namespace,
|
|
196
|
-
) -> tuple[str, str, str]:
|
|
194
|
+
def get_a3mega_pod_template_annotations() -> tuple[str, str, str]:
|
|
197
195
|
"""Adds or updates annotations in the Pod template."""
|
|
198
|
-
sub_networks = get_cluster_subnetworks(
|
|
196
|
+
sub_networks = get_cluster_subnetworks()
|
|
199
197
|
tcpxo_deamon_key, tcpxo_deamon_paths = get_tcpxo_deamon_entry()
|
|
200
198
|
interfaces_key, interfaces_value = tcpxo_decorator.get_interfaces_entry(
|
|
201
199
|
sub_networks
|
|
@@ -205,16 +203,14 @@ def get_a3mega_pod_template_annotations(
|
|
|
205
203
|
return tcpxo, interfaces, default_interface_annotation
|
|
206
204
|
|
|
207
205
|
|
|
208
|
-
def verify_kjob_installed(
|
|
206
|
+
def verify_kjob_installed() -> int:
|
|
209
207
|
"""Check if kjob is installed. If not provide user with proper communicate and exit.
|
|
210
|
-
Args:
|
|
211
|
-
args - user provided arguments.
|
|
212
208
|
Returns:
|
|
213
209
|
error code > if kjob not installed, otherwise 0
|
|
214
210
|
"""
|
|
215
211
|
command = "kubectl-kjob help"
|
|
216
212
|
task = "Verify kjob installation "
|
|
217
|
-
verify_kjob_installed_code, _ = run_command_for_value(command, task
|
|
213
|
+
verify_kjob_installed_code, _ = run_command_for_value(command, task)
|
|
218
214
|
|
|
219
215
|
if verify_kjob_installed_code == 0:
|
|
220
216
|
xpk_print("kjob found")
|
|
@@ -246,9 +242,7 @@ def get_pod_template_interactive_command() -> str:
|
|
|
246
242
|
return pod_command
|
|
247
243
|
|
|
248
244
|
|
|
249
|
-
def create_app_profile_instance(
|
|
250
|
-
args: Namespace, volume_bundles: list[str]
|
|
251
|
-
) -> int:
|
|
245
|
+
def create_app_profile_instance(volume_bundles: list[str]) -> int:
|
|
252
246
|
"""Create new AppProfile instance on cluster with default settings.
|
|
253
247
|
|
|
254
248
|
Args:
|
|
@@ -264,7 +258,6 @@ def create_app_profile_instance(
|
|
|
264
258
|
volume_bundles=volume_bundles,
|
|
265
259
|
),
|
|
266
260
|
task="Creating AppProfile",
|
|
267
|
-
args=args,
|
|
268
261
|
)
|
|
269
262
|
|
|
270
263
|
|
|
@@ -332,15 +325,12 @@ def create_job_template_instance(
|
|
|
332
325
|
return run_kubectl_apply(
|
|
333
326
|
yml_string,
|
|
334
327
|
task="Creating JobTemplate",
|
|
335
|
-
args=args,
|
|
336
328
|
)
|
|
337
329
|
|
|
338
330
|
|
|
339
|
-
def create_pod_template_instance(args: Namespace, service_account: str) -> int:
|
|
331
|
+
def create_pod_template_instance(service_account: str) -> int:
|
|
340
332
|
"""Create new PodTemplate instance on cluster with default settings.
|
|
341
333
|
|
|
342
|
-
Args:
|
|
343
|
-
args - user provided arguments
|
|
344
334
|
Returns:
|
|
345
335
|
exit_code > 0 if creating PodTemplate fails, 0 otherwise
|
|
346
336
|
"""
|
|
@@ -362,7 +352,6 @@ def create_pod_template_instance(args: Namespace, service_account: str) -> int:
|
|
|
362
352
|
service_account=service_account,
|
|
363
353
|
),
|
|
364
354
|
task="Creating PodTemplate",
|
|
365
|
-
args=args,
|
|
366
355
|
)
|
|
367
356
|
|
|
368
357
|
|
|
@@ -381,29 +370,27 @@ def prepare_kjob(args: Namespace) -> int:
|
|
|
381
370
|
job_err_code = create_job_template_instance(args, system, service_account)
|
|
382
371
|
if job_err_code > 0:
|
|
383
372
|
return job_err_code
|
|
384
|
-
pod_err_code = create_pod_template_instance(args, service_account)
|
|
373
|
+
pod_err_code = create_pod_template_instance(service_account)
|
|
385
374
|
if pod_err_code > 0:
|
|
386
375
|
return pod_err_code
|
|
387
376
|
|
|
388
377
|
volume_bundles = [item.name for item in storages]
|
|
389
378
|
|
|
390
|
-
return create_app_profile_instance(args, volume_bundles)
|
|
379
|
+
return create_app_profile_instance(volume_bundles)
|
|
391
380
|
|
|
392
381
|
|
|
393
|
-
def apply_kjob_crds(
|
|
382
|
+
def apply_kjob_crds() -> int:
|
|
394
383
|
"""Apply kjob CRDs on cluster.
|
|
395
384
|
|
|
396
385
|
This function install kjob CRDs files from kjobctl printcrds.
|
|
397
386
|
It creates all neccessary kjob CRDs.
|
|
398
387
|
|
|
399
|
-
Args:
|
|
400
|
-
args - user provided arguments
|
|
401
388
|
Returns:
|
|
402
389
|
None
|
|
403
390
|
"""
|
|
404
391
|
command = "kubectl kjob printcrds | kubectl apply --server-side -f -"
|
|
405
392
|
task = "Create kjob CRDs on cluster"
|
|
406
|
-
return_code = run_command_with_updates(command, task
|
|
393
|
+
return_code = run_command_with_updates(command, task)
|
|
407
394
|
if return_code != 0:
|
|
408
395
|
xpk_print(f"{task} returned ERROR {return_code}")
|
|
409
396
|
return return_code
|