konduktor-nightly 0.1.0.dev20250804105449__py3-none-any.whl → 0.1.0.dev20250806105405__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of konduktor-nightly might be problematic.
- konduktor/__init__.py +4 -7
- konduktor/backends/__init__.py +2 -4
- konduktor/backends/constants.py +12 -0
- konduktor/backends/deployment.py +179 -0
- konduktor/backends/deployment_utils.py +835 -0
- konduktor/backends/jobset.py +2 -2
- konduktor/backends/jobset_utils.py +16 -266
- konduktor/backends/pod_utils.py +392 -0
- konduktor/cli.py +343 -8
- konduktor/controller/launch.py +1 -1
- konduktor/execution.py +5 -2
- konduktor/kube_client.py +8 -0
- konduktor/resource.py +20 -0
- konduktor/serving.py +149 -0
- konduktor/task.py +61 -0
- konduktor/templates/deployment.yaml.j2 +142 -0
- konduktor/templates/pod.yaml.j2 +36 -0
- konduktor/utils/accelerator_registry.py +1 -1
- konduktor/utils/log_utils.py +1 -1
- konduktor/utils/schemas.py +42 -0
- konduktor/utils/validator.py +51 -16
- {konduktor_nightly-0.1.0.dev20250804105449.dist-info → konduktor_nightly-0.1.0.dev20250806105405.dist-info}/METADATA +1 -1
- {konduktor_nightly-0.1.0.dev20250804105449.dist-info → konduktor_nightly-0.1.0.dev20250806105405.dist-info}/RECORD +26 -21
- {konduktor_nightly-0.1.0.dev20250804105449.dist-info → konduktor_nightly-0.1.0.dev20250806105405.dist-info}/LICENSE +0 -0
- {konduktor_nightly-0.1.0.dev20250804105449.dist-info → konduktor_nightly-0.1.0.dev20250806105405.dist-info}/WHEEL +0 -0
- {konduktor_nightly-0.1.0.dev20250804105449.dist-info → konduktor_nightly-0.1.0.dev20250806105405.dist-info}/entry_points.txt +0 -0
konduktor/backends/jobset.py
CHANGED

@@ -15,7 +15,7 @@ if typing.TYPE_CHECKING:
     from konduktor.data import storage as storage_lib

 from konduktor import config, logging
-from konduktor.backends import backend, jobset_utils
+from konduktor.backends import backend, jobset_utils, pod_utils
 from konduktor.utils import kubernetes_utils, log_utils, rich_utils, ux_utils

 Path = str
@@ -172,7 +172,7 @@ class JobsetBackend(backend.Backend):
         # the working container starts.

         # first define the pod spec then create the jobset definition
-        pod_spec = jobset_utils.create_pod_spec(task)
+        pod_spec = pod_utils.create_pod_spec(task)
         context = kubernetes_utils.get_current_kube_config_context_name()
         namespace = kubernetes_utils.get_kube_config_context_namespace(context)
         # TODO(asaiacai): need to set env variables in pod
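The two hunks above are the whole of the jobset.py change: pod-spec construction moves out of jobset_utils into the new pod_utils module, and the call site is updated accordingly. A minimal sketch of the resulting launch path (create_pod_spec and create_jobset are real names from this diff; this launch() body and the exact create_jobset argument list are assumptions for illustration):

    # Sketch only. create_pod_spec/create_jobset come from this diff;
    # the launch() body and create_jobset's full signature are assumed.
    from konduktor.backends import jobset_utils, pod_utils
    from konduktor.utils import kubernetes_utils

    def launch(task):
        # pod spec is now built by the shared pod_utils helper ...
        pod_spec = pod_utils.create_pod_spec(task)
        context = kubernetes_utils.get_current_kube_config_context_name()
        namespace = kubernetes_utils.get_kube_config_context_namespace(context)
        # ... while jobset_utils still owns JobSet CRUD
        jobset_utils.create_jobset(namespace, task, pod_spec)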
konduktor/backends/jobset_utils.py
CHANGED

@@ -1,32 +1,25 @@
 """Jobset utils: wraps CRUD operations for jobsets"""

-import base64
 import enum
 import json
-import os
 import tempfile
 import typing
 from datetime import datetime, timezone
 from typing import Any, Dict, Optional, Tuple
-from urllib.parse import urlparse

-import click
 import colorama

 if typing.TYPE_CHECKING:
     from datetime import timedelta

     import konduktor
-from konduktor import
+from konduktor import kube_client, logging
 from konduktor.backends import constants as backend_constants
-from konduktor.
+from konduktor.backends import pod_utils
 from konduktor.utils import (
     common_utils,
-    exceptions,
     kubernetes_utils,
     log_utils,
-    ux_utils,
-    validator,
 )

 if typing.TYPE_CHECKING:
@@ -38,13 +31,14 @@ JOBSET_API_GROUP = 'jobset.x-k8s.io'
 JOBSET_API_VERSION = 'v1alpha2'
 JOBSET_PLURAL = 'jobsets'

-
-
-
-
-
+# Use shared constants from konduktor.backends.constants
+JOBSET_NAME_LABEL = backend_constants.JOB_NAME_LABEL
+JOBSET_USERID_LABEL = backend_constants.USERID_LABEL
+JOBSET_USER_LABEL = backend_constants.USER_LABEL
+JOBSET_ACCELERATOR_LABEL = backend_constants.ACCELERATOR_LABEL
+JOBSET_NUM_ACCELERATORS_LABEL = backend_constants.NUM_ACCELERATORS_LABEL

-SECRET_BASENAME_LABEL =
+SECRET_BASENAME_LABEL = backend_constants.SECRET_BASENAME_LABEL

 _JOBSET_METADATA_LABELS = {
     'jobset_name_label': JOBSET_NAME_LABEL,
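With this hunk the JOBSET_* labels become aliases into the new shared konduktor/backends/constants.py (listed above as +12 -0), so the jobset and deployment backends can share one set of label keys. A plausible sketch of that module follows; only the constant names are confirmed by this diff, and every value shown is an illustrative guess:

    # Hypothetical sketch of konduktor/backends/constants.py.
    # The names appear in this diff; the values are guesses.
    JOB_NAME_LABEL = 'konduktor/job-name'                   # guessed value
    USERID_LABEL = 'konduktor/userid'                       # guessed value
    USER_LABEL = 'konduktor/user'                           # guessed value
    ACCELERATOR_LABEL = 'konduktor/accelerator'             # guessed value
    NUM_ACCELERATORS_LABEL = 'konduktor/num-accelerators'   # guessed value
    SECRET_BASENAME_LABEL = 'konduktor/secret-basename'     # guessed value
    KONDUKTOR_SSH_PORT = 2222  # name referenced later in this diff; value guessed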
@@ -54,8 +48,6 @@ _JOBSET_METADATA_LABELS = {
     'jobset_num_accelerators_label': JOBSET_NUM_ACCELERATORS_LABEL,
 }

-_RUN_DURATION_ANNOTATION_KEY = 'kueue.x-k8s.io/maxRunDurationSeconds'
-

 class JobNotFoundError(Exception):
     pass
@@ -73,236 +65,6 @@ if typing.TYPE_CHECKING:
     import konduktor


-def create_pod_spec(task: 'konduktor.Task') -> Dict[str, Any]:
-    """Merges the task defintion with config
-    to create a final pod spec dict for the job
-
-    Returns:
-        Dict[str, Any]: k8s pod spec
-    """
-    context = kubernetes_utils.get_current_kube_config_context_name()
-    namespace = kubernetes_utils.get_kube_config_context_namespace(context)
-
-    # fill out the templating variables
-    assert task.resources is not None, 'Task resources are required'
-    if task.resources.accelerators:
-        num_gpus = list(task.resources.accelerators.values())[0]
-    else:
-        num_gpus = 0
-    task.name = f'{task.name}-{common_utils.get_usage_run_id()[:4]}'
-    node_hostnames = ','.join(
-        [f'{task.name}-workers-0-{idx}.{task.name}' for idx in range(task.num_nodes)]
-    )
-    master_addr = f'{task.name}-workers-0-0.{task.name}'
-
-    if task.resources.accelerators:
-        accelerator_type = list(task.resources.accelerators.keys())[0]
-    else:
-        accelerator_type = None
-
-    assert task.resources.cpus is not None, 'Task resources cpus are required'
-    assert task.resources.memory is not None, 'Task resources memory are required'
-    assert task.resources.image_id is not None, 'Task resources image_id are required'
-
-    # template the commands to run on the container for syncing files. At this point
-    # task.stores is Dict[str, storage_utils.Storage] which is (dst, storage_obj_src)
-    # first we iterate through storage_mounts and then file_mounts.
-    sync_commands = []
-    mkdir_commands = []
-    storage_secrets = {}
-    # first do storage_mount sync
-    for dst, store in task.storage_mounts.items():
-        # TODO(asaiacai) idk why but theres an extra storage mount for the
-        # file mounts. Should be cleaned up eventually in
-        # maybe_translate_local_file_mounts_and_sync_up
-        assert store.source is not None and isinstance(
-            store.source, str
-        ), 'Store source is required'
-        store_scheme = urlparse(store.source).scheme
-        if '/tmp/konduktor-job-filemounts-files' in dst:
-            continue
-        # should impelement a method here instead of raw dog dict access
-        cloud_store = registry._REGISTRY[store_scheme]
-        storage_secrets[store_scheme] = cloud_store._STORE.get_k8s_credential_name()
-        exists, _ = kubernetes_utils.check_secret_exists(
-            storage_secrets[store_scheme], namespace=namespace, context=context
-        )
-        assert exists, (
-            f"secret {storage_secrets[store_scheme]} doesn't "
-            f'exist in namespace {namespace}'
-        )
-        mkdir_commands.append(
-            f'cd {constants.KONDUKTOR_REMOTE_WORKDIR};' f'mkdir -p {dst}'
-        )
-        assert store._bucket_sub_path is not None
-        sync_commands.append(
-            cloud_store.make_sync_dir_command(
-                os.path.join(store.source, store._bucket_sub_path), dst
-            )
-        )
-
-    # then do file_mount sync.
-    assert task.file_mounts is not None
-    for dst, src in task.file_mounts.items():
-        store_scheme = str(urlparse(store.source).scheme)
-        cloud_store = registry._REGISTRY[store_scheme]
-        mkdir_commands.append(
-            f'cd {constants.KONDUKTOR_REMOTE_WORKDIR};'
-            f'mkdir -p {os.path.dirname(dst)}'
-        )
-        storage_secrets[store_scheme] = cloud_store._STORE.get_k8s_credential_name()
-        exists, reason = kubernetes_utils.check_secret_exists(
-            storage_secrets[store_scheme], namespace=namespace, context=context
-        )
-        assert exists, (
-            f'secret {storage_secrets[store_scheme]} '
-            f"doesn't exist in namespace {namespace}"
-        )
-        sync_commands.append(cloud_store.make_sync_file_command(src, dst))
-
-    tailscale_secret = config.get_nested(('tailscale', 'secret_name'), None)
-    if tailscale_secret:
-        secret_exist, err = kubernetes_utils.check_secret_exists(
-            tailscale_secret, namespace, context
-        )
-        if not secret_exist:
-            with ux_utils.print_exception_no_traceback():
-                raise exceptions.MissingSecretError(
-                    f'No tailscale auth-key secret `{tailscale_secret}` found even '
-                    f'though specified by `tailscale.secret_name`: {err}'
-                )
-
-    enable_ssh = config.get_nested(('ssh', 'enable'), False) or tailscale_secret
-    secret_name = None
-    if enable_ssh:
-        private_key_path, public_key_path = authentication.get_or_generate_keys()
-        with (
-            open(private_key_path, 'rb') as private_key_file,
-            open(public_key_path, 'rb') as public_key_file,
-        ):
-            private_key, public_key = private_key_file.read(), public_key_file.read()
-        user_hash = common_utils.get_user_hash()
-        secret_name = f'konduktor-ssh-keys-{user_hash}'
-        ok, result = kubernetes_utils.set_secret(
-            secret_name=secret_name,
-            namespace=namespace,
-            context=context,
-            data={
-                'PUBKEY': base64.b64encode(public_key).decode(),
-                'PRIVKEY': base64.b64encode(private_key).decode(),
-            },
-        )
-        if not ok:
-            raise exceptions.CreateSecretError(
-                f'Failed to set k8s secret {secret_name}: \n{result}'
-            )
-
-    # Mount the user's secrets
-    git_ssh_secret_name = None
-    env_secret_envs = []
-    default_secrets = []
-
-    user_hash = common_utils.get_user_hash()
-    label_selector = f'konduktor/owner={user_hash}'
-    user_secrets = kubernetes_utils.list_secrets(
-        namespace, context, label_filter=label_selector
-    )
-
-    for secret in user_secrets:
-        kind = kubernetes_utils.get_secret_kind(secret)
-        if kind == 'git-ssh' and git_ssh_secret_name is None:
-            git_ssh_secret_name = secret.metadata.name
-        elif kind == 'env':
-            env_secret_name = secret.metadata.name
-            key = next(iter(secret.data))
-            env_secret_envs.append(
-                {
-                    'name': key,
-                    'valueFrom': {
-                        'secretKeyRef': {'name': env_secret_name, 'key': key}
-                    },
-                }
-            )
-        elif kind == 'default':
-            default_secret_name = secret.metadata.name
-            basename = secret.metadata.labels.get(
-                SECRET_BASENAME_LABEL, default_secret_name
-            )
-            default_secrets.append(
-                {'k8s_name': default_secret_name, 'mount_name': basename}
-            )
-
-    with tempfile.NamedTemporaryFile() as temp:
-        common_utils.fill_template(
-            'pod.yaml.j2',
-            {
-                # TODO(asaiacai) need to parse/round these numbers and sanity check
-                'cpu': kubernetes_utils.parse_cpu_or_gpu_resource(task.resources.cpus),
-                'memory': kubernetes_utils.parse_memory_resource(task.resources.memory),
-                'image_id': task.resources.image_id,
-                'num_gpus': num_gpus,
-                'master_addr': master_addr,
-                'num_nodes': task.num_nodes,
-                'job_name': task.name,  # append timestamp and user id here?
-                'setup_cmd': task.setup or '',
-                'run_cmd': task.run,
-                'node_hostnames': node_hostnames,
-                'accelerator_type': accelerator_type,
-                'sync_commands': sync_commands,
-                'mkdir_commands': mkdir_commands,
-                'mount_secrets': storage_secrets,
-                'remote_workdir': constants.KONDUKTOR_REMOTE_WORKDIR,
-                'user': common_utils.get_cleaned_username(),
-                # Tailscale credentials
-                'tailscale_secret': tailscale_secret,
-                # SSH
-                'enable_ssh': enable_ssh,
-                'secret_name': secret_name,
-                'konduktor_ssh_port': backend_constants.KONDUKTOR_SSH_PORT,
-                # Kinds of Secrets
-                # --kind git-ssh
-                'git_ssh': git_ssh_secret_name,
-                # --kind default
-                'default_secrets': default_secrets,
-                # KONDUKTOR_DEBUG
-                'konduktor_debug': os.getenv('KONDUKTOR_DEBUG', 0),
-            },
-            temp.name,
-        )
-        pod_config = common_utils.read_yaml(temp.name)
-        # merge with `~/.konduktor/config.yaml`
-        kubernetes_utils.combine_pod_config_fields(temp.name, pod_config)
-        pod_config = common_utils.read_yaml(temp.name)
-
-    # Priority order: task.envs > secret envs > existing pod_config envs
-    existing_envs = pod_config['kubernetes']['pod_config']['spec']['containers'][0].get(
-        'env', []
-    )
-    env_map = {env['name']: env for env in existing_envs}
-
-    # Inject secret envs
-    for env in env_secret_envs:
-        env_map[env['name']] = env
-
-    # Inject task.envs
-    for k, v in task.envs.items():
-        env_map[k] = {'name': k, 'value': v}
-
-    # Replace the container's env section with the merged and prioritized map
-    pod_config['kubernetes']['pod_config']['spec']['containers'][0]['env'] = list(
-        env_map.values()
-    )
-    logger.debug(f'rendered pod spec: \n\t{pod_config}')
-
-    # validate pod spec using json schema
-    try:
-        validator.validate_pod_spec(pod_config['kubernetes']['pod_config']['spec'])
-    except ValueError as e:
-        raise click.UsageError(str(e))
-
-    return pod_config
-
-
 def create_jobset(
     namespace: str,
     task: 'konduktor.Task',
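Everything removed above (roughly 230 lines) is the body of create_pod_spec, which per the file list now lives in the new konduktor/backends/pod_utils.py (+392 lines). Assuming the function keeps the signature and return shape shown in the removed code, callers use it like this (a usage sketch, not the pod_utils source):

    # Usage sketch. Assumes pod_utils.create_pod_spec keeps the removed
    # signature: (task: 'konduktor.Task') -> Dict[str, Any].
    from konduktor.backends import pod_utils

    def build_pod_spec(task):
        pod_config = pod_utils.create_pod_spec(task)
        # the removed code nested the rendered k8s spec under these keys
        return pod_config['kubernetes']['pod_config']['spec']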
@@ -313,12 +75,8 @@ def create_jobset(
     and returns the created jobset spec
     """
     assert task.resources is not None, 'Task resources are undefined'
-
-
-        num_accelerators = list(task.resources.accelerators.values())[0]
-    else:
-        accelerator_type = 'None'
-        num_accelerators = 0
+    accelerator_type = task.resources.get_accelerator_type() or 'None'
+    num_accelerators = task.resources.get_accelerator_count() or 0
     with tempfile.NamedTemporaryFile() as temp:
         common_utils.fill_template(
             'jobset.yaml.j2',
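get_accelerator_type() and get_accelerator_count() are new helpers on Resources (konduktor/resource.py gains 20 lines in the file list); their bodies are not shown in this diff. A sketch inferred from the removed if/else branch they replace, with the method names real and the implementations assumed:

    # Hypothetical bodies, inferred from the removed branch above.
    from dataclasses import dataclass
    from typing import Dict, Optional, Union

    @dataclass
    class Resources:
        accelerators: Optional[Dict[str, Union[int, float]]] = None  # e.g. {'H100': 8}

        def get_accelerator_type(self) -> Optional[str]:
            # first (and only) accelerator name, if any
            return next(iter(self.accelerators)) if self.accelerators else None

        def get_accelerator_count(self) -> Union[int, float, None]:
            # the count paired with that accelerator
            return next(iter(self.accelerators.values())) if self.accelerators else None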
@@ -336,19 +94,11 @@
             temp.name,
         )
         jobset_spec = common_utils.read_yaml(temp.name)
-
-
-
-
-
-        if not maxRunDurationSeconds:
-            raise ValueError('maxRunDurationSeconds is required')
-        jobset_spec['jobset']['metadata']['annotations'][
-            _RUN_DURATION_ANNOTATION_KEY
-        ] = str(maxRunDurationSeconds)
-        jobset_spec['jobset']['spec']['replicatedJobs'][0]['template']['spec'][
-            'template'
-        ] = pod_spec  # noqa: E501
+        # Inject JobSet metadata (labels and annotations)
+        pod_utils.inject_jobset_metadata(jobset_spec, task)
+
+        # Merge pod spec into JobSet template
+        pod_utils.merge_pod_into_jobset_template(jobset_spec, pod_spec)
         try:
             context = kubernetes_utils.get_current_kube_config_context_name()
             jobset = kube_client.crd_api(context=context).create_namespaced_custom_object(