xpk 0.9.0__py3-none-any.whl → 0.10.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xpk/commands/batch.py +3 -3
- xpk/commands/cluster.py +22 -1
- xpk/commands/cluster_gcluster.py +27 -0
- xpk/commands/common.py +12 -5
- xpk/commands/kjob_common.py +4 -1
- xpk/commands/run.py +2 -2
- xpk/commands/shell.py +2 -2
- xpk/commands/storage.py +10 -3
- xpk/commands/workload.py +64 -27
- xpk/core/blueprint/blueprint_generator.py +108 -40
- xpk/core/capacity.py +66 -6
- xpk/core/cluster.py +165 -7
- xpk/core/config.py +1 -65
- xpk/core/docker_manager.py +1 -1
- xpk/core/docker_resources.py +145 -72
- xpk/core/jobset.py +143 -0
- xpk/core/kjob.py +2 -6
- xpk/core/kueue.py +154 -5
- xpk/core/nodepool.py +17 -4
- xpk/core/pathways.py +1 -2
- xpk/core/storage.py +1 -95
- xpk/core/system_characteristics.py +1 -1
- xpk/core/workload.py +0 -44
- xpk/core/workload_decorators/rdma_decorator.py +2 -0
- xpk/core/workload_decorators/tcpx_decorator.py +10 -4
- xpk/core/workload_decorators/tcpxo_decorator.py +7 -0
- xpk/parser/cluster.py +23 -7
- xpk/parser/storage.py +2 -2
- xpk/parser/workload.py +21 -3
- {xpk-0.9.0.dist-info → xpk-0.10.1.dist-info}/METADATA +46 -7
- {xpk-0.9.0.dist-info → xpk-0.10.1.dist-info}/RECORD +35 -34
- {xpk-0.9.0.dist-info → xpk-0.10.1.dist-info}/WHEEL +0 -0
- {xpk-0.9.0.dist-info → xpk-0.10.1.dist-info}/entry_points.txt +0 -0
- {xpk-0.9.0.dist-info → xpk-0.10.1.dist-info}/licenses/LICENSE +0 -0
- {xpk-0.9.0.dist-info → xpk-0.10.1.dist-info}/top_level.txt +0 -0
xpk/core/config.py
CHANGED
|
@@ -15,16 +15,14 @@ limitations under the License.
|
|
|
15
15
|
"""
|
|
16
16
|
|
|
17
17
|
import os
|
|
18
|
-
import re
|
|
19
18
|
|
|
20
19
|
import ruamel.yaml
|
|
21
20
|
|
|
22
21
|
from ..utils import file
|
|
23
22
|
from ..utils.console import xpk_print
|
|
24
|
-
from .system_characteristics import AcceleratorType, SystemCharacteristics
|
|
25
23
|
|
|
26
24
|
# This is the version for XPK PyPI package
|
|
27
|
-
__version__ = 'v0.9.0'
|
|
25
|
+
__version__ = 'v0.10.1'
|
|
28
26
|
XPK_CURRENT_VERSION = __version__
|
|
29
27
|
XPK_CONFIG_FILE = os.path.expanduser('~/.config/xpk/config.yaml')
|
|
30
28
|
|
|
@@ -117,65 +115,3 @@ class XpkConfig:
|
|
|
117
115
|
return None
|
|
118
116
|
val: dict[str, str] = config_yaml[CONFIGS_KEY]
|
|
119
117
|
return val
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
def parse_env_config(args, tensorboard_config, system: SystemCharacteristics):
|
|
123
|
-
"""Parses the environment configurations to the jobset config.
|
|
124
|
-
|
|
125
|
-
Args:
|
|
126
|
-
args: user provided arguments for running the command.
|
|
127
|
-
tensorboard_config: configuration of Vertex Tensorboard.
|
|
128
|
-
system: system characteristics.
|
|
129
|
-
"""
|
|
130
|
-
env = {}
|
|
131
|
-
|
|
132
|
-
env_pat = re.compile(r'(^[a-zA-Z_][a-zA-Z0-9_]*?)(?:=(.*))?$', re.M)
|
|
133
|
-
if args.env_file:
|
|
134
|
-
print('Setting container environment from', args.env_file)
|
|
135
|
-
with open(file=args.env_file, mode='r', encoding='utf-8') as f:
|
|
136
|
-
for match in env_pat.finditer(f.read()):
|
|
137
|
-
variable = match.group(1)
|
|
138
|
-
if match.group(2) is not None:
|
|
139
|
-
env[variable] = match.group(2)
|
|
140
|
-
else:
|
|
141
|
-
assert variable in os.environ, (
|
|
142
|
-
f'Variable {variable} is not set in the current '
|
|
143
|
-
'environment, a value must be specified.'
|
|
144
|
-
)
|
|
145
|
-
env[variable] = os.environ[variable]
|
|
146
|
-
if args.env:
|
|
147
|
-
for var in args.env:
|
|
148
|
-
match = env_pat.match(var)
|
|
149
|
-
assert match and match.group(2) is not None, (
|
|
150
|
-
'Invalid environment variable, format must be '
|
|
151
|
-
f'`--env VARIABLE=value`: {var}'
|
|
152
|
-
)
|
|
153
|
-
variable = match.group(1)
|
|
154
|
-
env[variable] = match.group(2)
|
|
155
|
-
|
|
156
|
-
if not args.use_pathways:
|
|
157
|
-
if args.debug_dump_gcs:
|
|
158
|
-
if 'XLA_FLAGS' in env:
|
|
159
|
-
raise ValueError(
|
|
160
|
-
'Conflict: XLA_FLAGS defined in both --debug_dump_gcs '
|
|
161
|
-
'and environment file. Please choose one way to define '
|
|
162
|
-
'XLA_FLAGS.'
|
|
163
|
-
)
|
|
164
|
-
env['XLA_FLAGS'] = '--xla_dump_to=/tmp/xla_dump/'
|
|
165
|
-
|
|
166
|
-
if tensorboard_config:
|
|
167
|
-
env['UPLOAD_DATA_TO_TENSORBOARD'] = True
|
|
168
|
-
for key, value in tensorboard_config.items():
|
|
169
|
-
env[key.upper()] = value
|
|
170
|
-
|
|
171
|
-
if system.accelerator_type == AcceleratorType['GPU']:
|
|
172
|
-
# For GPUs, it has two more spaces ahead of name and value respectively
|
|
173
|
-
env_format = '''
|
|
174
|
-
- name: {key}
|
|
175
|
-
value: "{value}"'''
|
|
176
|
-
else:
|
|
177
|
-
env_format = '''
|
|
178
|
-
- name: {key}
|
|
179
|
-
value: "{value}"'''
|
|
180
|
-
|
|
181
|
-
args.env = ''.join(env_format.format(key=k, value=v) for k, v in env.items())
|
xpk/core/docker_manager.py
CHANGED
|
@@ -30,7 +30,7 @@ import time
|
|
|
30
30
|
DockerRunCommandExitCode = 135
|
|
31
31
|
dockerBuildErrorCode = 134
|
|
32
32
|
ctk_dockerfile_path = "Dockerfile"
|
|
33
|
-
ctk_build_ref = "v1.
|
|
33
|
+
ctk_build_ref = "v1.57.1"
|
|
34
34
|
ctk_docker_image = "xpk-ctk"
|
|
35
35
|
ctk_container_name = "xpk-ctk-container"
|
|
36
36
|
gcloud_cfg_mount_path = "/root/.config/gcloud"
|
xpk/core/docker_resources.py
CHANGED
|
@@ -14,9 +14,11 @@ See the License for the specific language governing permissions and
|
|
|
14
14
|
limitations under the License.
|
|
15
15
|
"""
|
|
16
16
|
|
|
17
|
-
|
|
17
|
+
import os
|
|
18
|
+
import re
|
|
19
|
+
from .capacity import H100_DEVICE_TYPE, H100_MEGA_DEVICE_TYPE, H200_DEVICE_TYPE
|
|
18
20
|
from .cluster import setup_k8s_env
|
|
19
|
-
from .storage import GCS_FUSE_TYPE, GCP_FILESTORE_TYPE, Storage, get_storages_to_mount
|
|
21
|
+
from .storage import GCS_FUSE_TYPE, GCP_FILESTORE_TYPE, PARALLELSTORE_TYPE, GCE_PD_TYPE, LUSTRE_TYPE, Storage, get_storages_to_mount
|
|
20
22
|
from .system_characteristics import AcceleratorType, SystemCharacteristics
|
|
21
23
|
|
|
22
24
|
|
|
@@ -64,6 +66,25 @@ def get_env_container(args, system: SystemCharacteristics) -> str:
|
|
|
64
66
|
str:
|
|
65
67
|
YAML with the env config for the main container, as a YAML string.
|
|
66
68
|
"""
|
|
69
|
+
if system.accelerator_type == AcceleratorType['GPU']:
|
|
70
|
+
return get_gpu_env(args, system)
|
|
71
|
+
|
|
72
|
+
if system.accelerator_type == AcceleratorType['CPU']:
|
|
73
|
+
return get_cpu_env(args, system)
|
|
74
|
+
|
|
75
|
+
return format_env_dict(args.env, system) # pytype: disable=bad-return-type
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def get_gpu_env(args, system) -> str:
|
|
79
|
+
"""Generate environment variables for GPU nodepools
|
|
80
|
+
Args:
|
|
81
|
+
num_slices: Number of slices to be used in the workload.
|
|
82
|
+
env_vars: Environment variables, processed from user args.
|
|
83
|
+
system: system characteristics
|
|
84
|
+
|
|
85
|
+
Returns:
|
|
86
|
+
str: yaml containing env variables
|
|
87
|
+
"""
|
|
67
88
|
gpu_env_yaml = """
|
|
68
89
|
- name: REPLICATED_JOB_NAME
|
|
69
90
|
valueFrom:
|
|
@@ -73,8 +94,6 @@ def get_env_container(args, system: SystemCharacteristics) -> str:
|
|
|
73
94
|
valueFrom:
|
|
74
95
|
fieldRef:
|
|
75
96
|
fieldPath: metadata.annotations['jobset.sigs.k8s.io/jobset-name']
|
|
76
|
-
- name: JAX_COORDINATOR_ADDRESS
|
|
77
|
-
value: "$(JOBSET_NAME)-$(REPLICATED_JOB_NAME)-0-0.$(JOBSET_NAME)"
|
|
78
97
|
- name: NNODES
|
|
79
98
|
value: "{args.num_nodes}"
|
|
80
99
|
- name: NODE_RANK
|
|
@@ -84,36 +103,37 @@ def get_env_container(args, system: SystemCharacteristics) -> str:
|
|
|
84
103
|
- name: USE_GPUDIRECT
|
|
85
104
|
value: {gpu_direct_name}
|
|
86
105
|
- name: GPUS_PER_NODE
|
|
87
|
-
value: "{
|
|
88
|
-
- name: JAX_COORDINATOR_PORT
|
|
89
|
-
value: "6002"
|
|
106
|
+
value: "{chips_per_vm}"
|
|
90
107
|
- name: COMMAND
|
|
91
108
|
value: "{args.command}"
|
|
92
|
-
{
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
109
|
+
{custom_envs}"""
|
|
110
|
+
|
|
111
|
+
gpu_direct_name = 'fastrak'
|
|
112
|
+
if args.device_type == H100_DEVICE_TYPE:
|
|
113
|
+
gpu_direct_name = 'tcpx'
|
|
114
|
+
elif args.device_type == H100_MEGA_DEVICE_TYPE:
|
|
115
|
+
gpu_direct_name = 'tcpxo'
|
|
116
|
+
elif args.device_type == H200_DEVICE_TYPE:
|
|
117
|
+
gpu_direct_name = 'rdma'
|
|
118
|
+
|
|
119
|
+
gpu_env_dic = {
|
|
120
|
+
'JAX_COORDINATOR_PORT': '6002',
|
|
121
|
+
'JAX_COORDINATOR_ADDRESS': (
|
|
122
|
+
'$(JOBSET_NAME)-$(REPLICATED_JOB_NAME)-0-0.$(JOBSET_NAME)'
|
|
123
|
+
),
|
|
124
|
+
}
|
|
125
|
+
|
|
126
|
+
args.env = gpu_env_dic | args.env
|
|
127
|
+
|
|
128
|
+
return gpu_env_yaml.format(
|
|
129
|
+
args=args,
|
|
130
|
+
chips_per_vm=system.chips_per_vm,
|
|
131
|
+
gpu_direct_name=gpu_direct_name,
|
|
132
|
+
custom_envs=format_env_dict(args.env, system),
|
|
133
|
+
)
|
|
114
134
|
|
|
115
135
|
|
|
116
|
-
def get_cpu_env(num_slices, env_vars, system) -> str:
|
|
136
|
+
def get_cpu_env(args, system) -> str:
|
|
117
137
|
"""Generate environment variables for CPU nodepools
|
|
118
138
|
Args:
|
|
119
139
|
num_slices: Number of slices to be used in the workload.
|
|
@@ -136,19 +156,87 @@ def get_cpu_env(num_slices, env_vars, system) -> str:
|
|
|
136
156
|
valueFrom:
|
|
137
157
|
fieldRef:
|
|
138
158
|
fieldPath: metadata.annotations['batch.kubernetes.io/job-completion-index']
|
|
139
|
-
|
|
140
|
-
value: "{processes_in_job}"
|
|
141
|
-
- name: JAX_PROCESS_COUNT
|
|
142
|
-
value: "{process_count}"
|
|
143
|
-
{env_vars}
|
|
144
|
-
- name: JAX_COORDINATOR_ADDRESS
|
|
145
|
-
value: "$(JOBSET_NAME)-$(REPLICATED_JOB_NAME)-0-0.$(JOBSET_NAME)"
|
|
159
|
+
{custom_envs}
|
|
146
160
|
"""
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
161
|
+
|
|
162
|
+
cpu_env_dic = {
|
|
163
|
+
'PROCESSES_IN_JOB': str(system.vms_per_slice),
|
|
164
|
+
'JAX_PROCESS_COUNT': str(
|
|
165
|
+
calculate_process_count(args.num_slices, system.vms_per_slice)
|
|
166
|
+
),
|
|
167
|
+
'JAX_COORDINATOR_ADDRESS': (
|
|
168
|
+
'$(JOBSET_NAME)-$(REPLICATED_JOB_NAME)-0-0.$(JOBSET_NAME)'
|
|
169
|
+
),
|
|
170
|
+
}
|
|
171
|
+
|
|
172
|
+
args.env = cpu_env_dic | args.env
|
|
173
|
+
|
|
174
|
+
return yaml.format(custom_envs=format_env_dict(args.env, system))
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
def format_env_dict(env, system: SystemCharacteristics) -> str:
|
|
178
|
+
if system.accelerator_type == AcceleratorType['GPU']:
|
|
179
|
+
# For GPUs, it has two more spaces ahead of name and value respectively
|
|
180
|
+
env_format = '''
|
|
181
|
+
- name: {key}
|
|
182
|
+
value: "{value}"'''
|
|
183
|
+
else:
|
|
184
|
+
env_format = '''
|
|
185
|
+
- name: {key}
|
|
186
|
+
value: "{value}"'''
|
|
187
|
+
return ''.join(env_format.format(key=k, value=v) for k, v in env.items())
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
def parse_env_config(args, tensorboard_config):
|
|
191
|
+
"""Parses the environment configurations to the a dictionary.
|
|
192
|
+
|
|
193
|
+
Args:
|
|
194
|
+
args: user provided arguments for running the command.
|
|
195
|
+
tensorboard_config: configuration of Vertex Tensorboard.
|
|
196
|
+
system: system characteristics.
|
|
197
|
+
"""
|
|
198
|
+
env = {}
|
|
199
|
+
|
|
200
|
+
env_pat = re.compile(r'(^[a-zA-Z_][a-zA-Z0-9_]*?)(?:=(.*))?$', re.M)
|
|
201
|
+
if args.env_file:
|
|
202
|
+
print('Setting container environment from', args.env_file)
|
|
203
|
+
with open(file=args.env_file, mode='r', encoding='utf-8') as f:
|
|
204
|
+
for match in env_pat.finditer(f.read()):
|
|
205
|
+
variable = match.group(1)
|
|
206
|
+
if match.group(2) is not None:
|
|
207
|
+
env[variable] = match.group(2)
|
|
208
|
+
else:
|
|
209
|
+
assert variable in os.environ, (
|
|
210
|
+
f'Variable {variable} is not set in the current '
|
|
211
|
+
'environment, a value must be specified.'
|
|
212
|
+
)
|
|
213
|
+
env[variable] = os.environ[variable]
|
|
214
|
+
if args.env:
|
|
215
|
+
for var in args.env:
|
|
216
|
+
match = env_pat.match(var)
|
|
217
|
+
assert match and match.group(2) is not None, (
|
|
218
|
+
'Invalid environment variable, format must be '
|
|
219
|
+
f'`--env VARIABLE=value`: {var}'
|
|
220
|
+
)
|
|
221
|
+
variable = match.group(1)
|
|
222
|
+
env[variable] = match.group(2)
|
|
223
|
+
|
|
224
|
+
if not args.use_pathways:
|
|
225
|
+
if args.debug_dump_gcs:
|
|
226
|
+
if 'XLA_FLAGS' in env:
|
|
227
|
+
raise ValueError(
|
|
228
|
+
'Conflict: XLA_FLAGS defined in both --debug_dump_gcs '
|
|
229
|
+
'and environment file. Please choose one way to define '
|
|
230
|
+
'XLA_FLAGS.'
|
|
231
|
+
)
|
|
232
|
+
env['XLA_FLAGS'] = '--xla_dump_to=/tmp/xla_dump/'
|
|
233
|
+
|
|
234
|
+
if tensorboard_config:
|
|
235
|
+
env['UPLOAD_DATA_TO_TENSORBOARD'] = True
|
|
236
|
+
for key, value in tensorboard_config.items():
|
|
237
|
+
env[key.upper()] = value
|
|
238
|
+
|
|
239
|
+
args.env = env
|
|
152
240
|
|
|
153
241
|
|
|
154
242
|
def get_volumes(args, system: SystemCharacteristics) -> str:
|
|
@@ -188,13 +276,13 @@ def get_volumes(args, system: SystemCharacteristics) -> str:
|
|
|
188
276
|
setup_k8s_env(args), args.storage
|
|
189
277
|
)
|
|
190
278
|
for storage in storages:
|
|
191
|
-
if storage.type
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
279
|
+
if storage.type in {
|
|
280
|
+
GCS_FUSE_TYPE,
|
|
281
|
+
GCP_FILESTORE_TYPE,
|
|
282
|
+
PARALLELSTORE_TYPE,
|
|
283
|
+
GCE_PD_TYPE,
|
|
284
|
+
LUSTRE_TYPE,
|
|
285
|
+
}:
|
|
198
286
|
volumes += f"""- name: {storage.pv}
|
|
199
287
|
persistentVolumeClaim:
|
|
200
288
|
claimName: {storage.pvc}
|
|
@@ -235,34 +323,19 @@ def get_volume_mounts(args, system: SystemCharacteristics) -> str:
|
|
|
235
323
|
mountPath: /shared-volume
|
|
236
324
|
"""
|
|
237
325
|
elif system.accelerator_type == AcceleratorType['GPU']:
|
|
238
|
-
|
|
239
|
-
volume_mount_yaml = """- name: nvidia-install-dir-host
|
|
240
|
-
mountPath: /usr/local/nvidia/lib64
|
|
241
|
-
- name: tcpx-nccl-plugin-volume
|
|
242
|
-
mountPath: /usr/local/tcpx
|
|
243
|
-
- name: tcpd-socket
|
|
244
|
-
mountPath: /tmp
|
|
245
|
-
- name: shared-memory
|
|
246
|
-
mountPath: /dev/shm
|
|
247
|
-
- name: workload-terminated-volume
|
|
248
|
-
mountPath: /usr/share/workload"""
|
|
249
|
-
elif (
|
|
250
|
-
system.device_type == H100_MEGA_DEVICE_TYPE
|
|
251
|
-
or system.device_type == H200_DEVICE_TYPE
|
|
252
|
-
or system.device_type == B200_DEVICE_TYPE
|
|
253
|
-
):
|
|
254
|
-
volume_mount_yaml = ''
|
|
326
|
+
volume_mount_yaml = ''
|
|
255
327
|
|
|
256
328
|
storages: list[Storage] = get_storages_to_mount(
|
|
257
329
|
setup_k8s_env(args), args.storage
|
|
258
330
|
)
|
|
259
331
|
for storage in storages:
|
|
260
|
-
if storage.type
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
332
|
+
if storage.type in {
|
|
333
|
+
GCS_FUSE_TYPE,
|
|
334
|
+
GCP_FILESTORE_TYPE,
|
|
335
|
+
PARALLELSTORE_TYPE,
|
|
336
|
+
GCE_PD_TYPE,
|
|
337
|
+
LUSTRE_TYPE,
|
|
338
|
+
}:
|
|
266
339
|
volume_mount_yaml += f"""- name: {storage.pv}
|
|
267
340
|
mountPath: {storage.mount_point}
|
|
268
341
|
readOnly: {storage.readonly}
|
xpk/core/jobset.py
ADDED
|
@@ -0,0 +1,143 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Copyright 2024 Google LLC
|
|
3
|
+
|
|
4
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
you may not use this file except in compliance with the License.
|
|
6
|
+
You may obtain a copy of the License at
|
|
7
|
+
|
|
8
|
+
https://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
|
|
10
|
+
Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
See the License for the specific language governing permissions and
|
|
14
|
+
limitations under the License.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
import math
|
|
18
|
+
|
|
19
|
+
from ..utils.console import xpk_exit, xpk_print
|
|
20
|
+
from ..utils.file import write_tmp_file
|
|
21
|
+
from ..core.kueue import (
|
|
22
|
+
MEMORY_SIZE_PER_VM,
|
|
23
|
+
MIN_MEMORY_LIMIT_SIZE,
|
|
24
|
+
)
|
|
25
|
+
from .commands import (
|
|
26
|
+
run_command_for_value,
|
|
27
|
+
run_command_with_updates_retry,
|
|
28
|
+
)
|
|
29
|
+
|
|
30
|
+
jobset_controller_manager_yml = """
|
|
31
|
+
apiVersion: apps/v1
|
|
32
|
+
kind: Deployment
|
|
33
|
+
metadata:
|
|
34
|
+
labels:
|
|
35
|
+
app.kubernetes.io/component: manager
|
|
36
|
+
app.kubernetes.io/created-by: jobset
|
|
37
|
+
app.kubernetes.io/instance: controller-manager
|
|
38
|
+
app.kubernetes.io/managed-by: kustomize
|
|
39
|
+
app.kubernetes.io/name: deployment
|
|
40
|
+
app.kubernetes.io/part-of: jobset
|
|
41
|
+
control-plane: controller-manager
|
|
42
|
+
name: jobset-controller-manager
|
|
43
|
+
namespace: jobset-system
|
|
44
|
+
spec:
|
|
45
|
+
replicas: 1
|
|
46
|
+
selector:
|
|
47
|
+
matchLabels:
|
|
48
|
+
control-plane: controller-manager
|
|
49
|
+
template:
|
|
50
|
+
metadata:
|
|
51
|
+
annotations:
|
|
52
|
+
kubectl.kubernetes.io/default-container: manager
|
|
53
|
+
labels:
|
|
54
|
+
control-plane: controller-manager
|
|
55
|
+
spec:
|
|
56
|
+
containers:
|
|
57
|
+
- args:
|
|
58
|
+
- --config=/controller_manager_config.yaml
|
|
59
|
+
- --zap-log-level=2
|
|
60
|
+
command:
|
|
61
|
+
- /manager
|
|
62
|
+
image: registry.k8s.io/jobset/jobset:v0.8.0
|
|
63
|
+
livenessProbe:
|
|
64
|
+
httpGet:
|
|
65
|
+
path: /healthz
|
|
66
|
+
port: 8081
|
|
67
|
+
initialDelaySeconds: 15
|
|
68
|
+
periodSeconds: 20
|
|
69
|
+
name: manager
|
|
70
|
+
ports:
|
|
71
|
+
- containerPort: 9443
|
|
72
|
+
name: webhook-server
|
|
73
|
+
protocol: TCP
|
|
74
|
+
readinessProbe:
|
|
75
|
+
httpGet:
|
|
76
|
+
path: /readyz
|
|
77
|
+
port: 8081
|
|
78
|
+
initialDelaySeconds: 5
|
|
79
|
+
periodSeconds: 10
|
|
80
|
+
resources:
|
|
81
|
+
limits:
|
|
82
|
+
memory: {memory_limit_size}
|
|
83
|
+
requests:
|
|
84
|
+
cpu: 500m
|
|
85
|
+
memory: 128Mi
|
|
86
|
+
securityContext:
|
|
87
|
+
allowPrivilegeEscalation: false
|
|
88
|
+
capabilities:
|
|
89
|
+
drop:
|
|
90
|
+
- ALL
|
|
91
|
+
volumeMounts:
|
|
92
|
+
- mountPath: /controller_manager_config.yaml
|
|
93
|
+
name: manager-config
|
|
94
|
+
subPath: controller_manager_config.yaml
|
|
95
|
+
- mountPath: /tmp/k8s-webhook-server/serving-certs
|
|
96
|
+
name: cert
|
|
97
|
+
readOnly: true
|
|
98
|
+
securityContext:
|
|
99
|
+
runAsNonRoot: true
|
|
100
|
+
serviceAccountName: jobset-controller-manager
|
|
101
|
+
terminationGracePeriodSeconds: 10
|
|
102
|
+
volumes:
|
|
103
|
+
- configMap:
|
|
104
|
+
name: jobset-manager-config
|
|
105
|
+
name: manager-config
|
|
106
|
+
- name: cert
|
|
107
|
+
secret:
|
|
108
|
+
defaultMode: 420
|
|
109
|
+
secretName: jobset-webhook-server-cert
|
|
110
|
+
"""
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
def update_jobset_resources_if_necessary(args):
|
|
114
|
+
"""Update the jobset manifest to increase the resources for the jobset controller manager.
|
|
115
|
+
|
|
116
|
+
Args:
|
|
117
|
+
args: user provided arguments for running the command.
|
|
118
|
+
|
|
119
|
+
Returns:
|
|
120
|
+
0 if successful and 1 otherwise.
|
|
121
|
+
"""
|
|
122
|
+
# Get total number of nodes
|
|
123
|
+
cmd_total_node_num = 'kubectl get node --no-headers | wc -l'
|
|
124
|
+
return_code, out = run_command_for_value(
|
|
125
|
+
cmd_total_node_num, 'Count total nodes', args
|
|
126
|
+
)
|
|
127
|
+
if return_code != 0:
|
|
128
|
+
xpk_exit(1)
|
|
129
|
+
# 1.2MiB per VM or 4GiB (whichever is greater).
|
|
130
|
+
new_memory_limit = (
|
|
131
|
+
f'{max(math.ceil(int(out) * MEMORY_SIZE_PER_VM), MIN_MEMORY_LIMIT_SIZE)}Mi'
|
|
132
|
+
)
|
|
133
|
+
yml_string = jobset_controller_manager_yml.format(
|
|
134
|
+
memory_limit_size=new_memory_limit,
|
|
135
|
+
)
|
|
136
|
+
tmp = write_tmp_file(yml_string)
|
|
137
|
+
command = f'kubectl apply -f {str(tmp.file.name)}'
|
|
138
|
+
|
|
139
|
+
task = 'Updating jobset Controller Manager resources'
|
|
140
|
+
return_code = run_command_with_updates_retry(command, task, args)
|
|
141
|
+
if return_code != 0:
|
|
142
|
+
xpk_print(f'{task} returned ERROR {return_code}')
|
|
143
|
+
return return_code
|
xpk/core/kjob.py
CHANGED
|
@@ -40,11 +40,8 @@ from .config import (
|
|
|
40
40
|
XpkConfig,
|
|
41
41
|
)
|
|
42
42
|
from .network import get_cluster_subnetworks
|
|
43
|
-
from .
|
|
44
|
-
|
|
45
|
-
SystemCharacteristics,
|
|
46
|
-
get_cluster_system_characteristics,
|
|
47
|
-
)
|
|
43
|
+
from .system_characteristics import AcceleratorType, SystemCharacteristics
|
|
44
|
+
from .resources import get_cluster_system_characteristics
|
|
48
45
|
from .storage import (
|
|
49
46
|
GCS_FUSE_ANNOTATIONS,
|
|
50
47
|
PARALLELSTORE_ANNOTATIONS,
|
|
@@ -380,7 +377,6 @@ def prepare_kjob(args: Namespace) -> int:
|
|
|
380
377
|
job_err_code = create_job_template_instance(args, system, service_account)
|
|
381
378
|
if job_err_code > 0:
|
|
382
379
|
return job_err_code
|
|
383
|
-
|
|
384
380
|
pod_err_code = create_pod_template_instance(args, service_account)
|
|
385
381
|
if pod_err_code > 0:
|
|
386
382
|
return pod_err_code
|