konduktor-nightly 0.1.0.dev20250805105421__py3-none-any.whl → 0.1.0.dev20250807105334__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of konduktor-nightly might be problematic. Click here for more details.
- konduktor/__init__.py +4 -7
- konduktor/backends/__init__.py +2 -4
- konduktor/backends/constants.py +12 -0
- konduktor/backends/deployment.py +179 -0
- konduktor/backends/deployment_utils.py +835 -0
- konduktor/backends/jobset.py +2 -2
- konduktor/backends/jobset_utils.py +16 -266
- konduktor/backends/pod_utils.py +392 -0
- konduktor/cli.py +343 -8
- konduktor/controller/launch.py +1 -1
- konduktor/execution.py +5 -2
- konduktor/kube_client.py +8 -0
- konduktor/resource.py +20 -0
- konduktor/serving.py +149 -0
- konduktor/task.py +61 -0
- konduktor/templates/deployment.yaml.j2 +142 -0
- konduktor/templates/pod.yaml.j2 +36 -0
- konduktor/utils/accelerator_registry.py +1 -1
- konduktor/utils/log_utils.py +1 -1
- konduktor/utils/schemas.py +42 -0
- konduktor/utils/validator.py +51 -16
- {konduktor_nightly-0.1.0.dev20250805105421.dist-info → konduktor_nightly-0.1.0.dev20250807105334.dist-info}/METADATA +1 -1
- {konduktor_nightly-0.1.0.dev20250805105421.dist-info → konduktor_nightly-0.1.0.dev20250807105334.dist-info}/RECORD +26 -21
- {konduktor_nightly-0.1.0.dev20250805105421.dist-info → konduktor_nightly-0.1.0.dev20250807105334.dist-info}/LICENSE +0 -0
- {konduktor_nightly-0.1.0.dev20250805105421.dist-info → konduktor_nightly-0.1.0.dev20250807105334.dist-info}/WHEEL +0 -0
- {konduktor_nightly-0.1.0.dev20250805105421.dist-info → konduktor_nightly-0.1.0.dev20250807105334.dist-info}/entry_points.txt +0 -0
|
@@ -0,0 +1,392 @@
|
|
|
1
|
+
"""Pod utils: handles pod spec creation and manipulation"""
|
|
2
|
+
|
|
3
|
+
import base64
|
|
4
|
+
import os
|
|
5
|
+
import tempfile
|
|
6
|
+
import typing
|
|
7
|
+
from typing import Any, Dict
|
|
8
|
+
from urllib.parse import urlparse
|
|
9
|
+
|
|
10
|
+
import click
|
|
11
|
+
|
|
12
|
+
import konduktor
|
|
13
|
+
from konduktor import authentication, config, constants, logging
|
|
14
|
+
from konduktor.backends import constants as backend_constants
|
|
15
|
+
from konduktor.data import registry
|
|
16
|
+
from konduktor.utils import (
|
|
17
|
+
common_utils,
|
|
18
|
+
exceptions,
|
|
19
|
+
kubernetes_utils,
|
|
20
|
+
ux_utils,
|
|
21
|
+
validator,
|
|
22
|
+
)
|
|
23
|
+
|
|
24
|
+
if typing.TYPE_CHECKING:
|
|
25
|
+
pass
|
|
26
|
+
|
|
27
|
+
logger = logging.get_logger(__name__)
|
|
28
|
+
|
|
29
|
+
_RUN_DURATION_ANNOTATION_KEY = 'kueue.x-k8s.io/maxRunDurationSeconds'
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def create_pod_spec(task: 'konduktor.Task') -> Dict[str, Any]:
    """Merges the task definition with config to create a final pod spec dict.

    This function is shared between JobSets and Deployments.

    Args:
        task: Task to build the pod spec for. NOTE: ``task.name`` is mutated
            here (a short unique run-id suffix is appended) so repeated
            launches do not collide on k8s object names.

    Returns:
        Dict[str, Any]: k8s pod spec

    Raises:
        exceptions.MissingSecretError: a tailscale secret name is configured
            but the secret does not exist in the namespace.
        exceptions.CreateSecretError: the SSH key secret could not be set.
        ValueError: a vllm serving run command already contains a flag that
            konduktor injects itself.
        click.UsageError: the rendered pod spec fails schema validation.
    """
    context = kubernetes_utils.get_current_kube_config_context_name()
    namespace = kubernetes_utils.get_kube_config_context_namespace(context)

    # fill out the templating variables
    assert task.resources is not None, 'Task resources are required'
    num_gpus = task.resources.get_accelerator_count() or 0
    task.name = f'{task.name}-{common_utils.get_usage_run_id()[:4]}'
    node_hostnames = ','.join(
        [f'{task.name}-workers-0-{idx}.{task.name}' for idx in range(task.num_nodes)]
    )
    master_addr = f'{task.name}-workers-0-0.{task.name}'

    accelerator_type = task.resources.get_accelerator_type()

    assert task.resources.cpus is not None, 'Task resources cpus are required'
    assert task.resources.memory is not None, 'Task resources memory are required'
    assert task.resources.image_id is not None, 'Task resources image_id are required'

    # template the commands to run on the container for syncing files. At this point
    # task.stores is Dict[str, storage_utils.Storage] which is (dst, storage_obj_src)
    # first we iterate through storage_mounts and then file_mounts.
    sync_commands = []
    mkdir_commands = []
    storage_secrets = {}
    # first do storage_mount sync
    for dst, store in task.storage_mounts.items():
        # TODO(asaiacai) idk why but theres an extra storage mount for the
        # file mounts. Should be cleaned up eventually in
        # maybe_translate_local_file_mounts_and_sync_up
        assert store.source is not None and isinstance(
            store.source, str
        ), 'Store source is required'
        store_scheme = urlparse(store.source).scheme
        if '/tmp/konduktor-job-filemounts-files' in dst:
            continue
        # should implement a method here instead of raw dog dict access
        cloud_store = registry._REGISTRY[store_scheme]
        storage_secrets[store_scheme] = cloud_store._STORE.get_k8s_credential_name()
        exists, _ = kubernetes_utils.check_secret_exists(
            storage_secrets[store_scheme], namespace=namespace, context=context
        )
        assert exists, (
            f"secret {storage_secrets[store_scheme]} doesn't "
            f'exist in namespace {namespace}'
        )
        mkdir_commands.append(
            f'cd {constants.KONDUKTOR_REMOTE_WORKDIR};' f'mkdir -p {dst}'
        )
        assert store._bucket_sub_path is not None
        sync_commands.append(
            cloud_store.make_sync_dir_command(
                os.path.join(store.source, store._bucket_sub_path), dst
            )
        )

    # then do file_mount sync.
    assert task.file_mounts is not None
    for dst, src in task.file_mounts.items():
        # Fix: derive the scheme from this mount's own source (`src`) rather
        # than from the stale `store` loop variable left over from the
        # storage_mounts loop above (which is unbound when storage_mounts is
        # empty, and points at the wrong store otherwise). `src` is the same
        # URL handed to make_sync_file_command below.
        store_scheme = str(urlparse(src).scheme)
        cloud_store = registry._REGISTRY[store_scheme]
        mkdir_commands.append(
            f'cd {constants.KONDUKTOR_REMOTE_WORKDIR};'
            f'mkdir -p {os.path.dirname(dst)}'
        )
        storage_secrets[store_scheme] = cloud_store._STORE.get_k8s_credential_name()
        exists, reason = kubernetes_utils.check_secret_exists(
            storage_secrets[store_scheme], namespace=namespace, context=context
        )
        assert exists, (
            f'secret {storage_secrets[store_scheme]} '
            f"doesn't exist in namespace {namespace}"
        )
        sync_commands.append(cloud_store.make_sync_file_command(src, dst))

    tailscale_secret = config.get_nested(('tailscale', 'secret_name'), None)
    if tailscale_secret:
        secret_exist, err = kubernetes_utils.check_secret_exists(
            tailscale_secret, namespace, context
        )
        if not secret_exist:
            with ux_utils.print_exception_no_traceback():
                raise exceptions.MissingSecretError(
                    f'No tailscale auth-key secret `{tailscale_secret}` found even '
                    f'though specified by `tailscale.secret_name`: {err}'
                )

    # SSH is enabled either explicitly or implicitly via tailscale.
    enable_ssh = config.get_nested(('ssh', 'enable'), False) or tailscale_secret
    secret_name = None
    if enable_ssh:
        private_key_path, public_key_path = authentication.get_or_generate_keys()
        with (
            open(private_key_path, 'rb') as private_key_file,
            open(public_key_path, 'rb') as public_key_file,
        ):
            private_key, public_key = private_key_file.read(), public_key_file.read()
        user_hash = common_utils.get_user_hash()
        secret_name = f'konduktor-ssh-keys-{user_hash}'
        ok, result = kubernetes_utils.set_secret(
            secret_name=secret_name,
            namespace=namespace,
            context=context,
            data={
                'PUBKEY': base64.b64encode(public_key).decode(),
                'PRIVKEY': base64.b64encode(private_key).decode(),
            },
        )
        if not ok:
            raise exceptions.CreateSecretError(
                f'Failed to set k8s secret {secret_name}: \n{result}'
            )

    # Mount the user's secrets
    git_ssh_secret_name = None
    env_secret_envs = []
    default_secrets = []

    user_hash = common_utils.get_user_hash()
    label_selector = f'konduktor/owner={user_hash}'
    user_secrets = kubernetes_utils.list_secrets(
        namespace, context, label_filter=label_selector
    )

    for secret in user_secrets:
        kind = kubernetes_utils.get_secret_kind(secret)
        if kind == 'git-ssh' and git_ssh_secret_name is None:
            # Only the first git-ssh secret is used.
            git_ssh_secret_name = secret.metadata.name
        elif kind == 'env':
            # Each env secret is assumed to hold a single key; expose it as a
            # container env var via secretKeyRef.
            env_secret_name = secret.metadata.name
            key = next(iter(secret.data))
            env_secret_envs.append(
                {
                    'name': key,
                    'valueFrom': {
                        'secretKeyRef': {'name': env_secret_name, 'key': key}
                    },
                }
            )
        elif kind == 'default':
            default_secret_name = secret.metadata.name
            basename = secret.metadata.labels.get(
                backend_constants.SECRET_BASENAME_LABEL, default_secret_name
            )
            default_secrets.append(
                {'k8s_name': default_secret_name, 'mount_name': basename}
            )

    # Inject --served-model-name, --host, and --port into serving run command
    if task.serving and task.run and 'vllm.entrypoints.openai.api_server' in task.run:
        # Fix: the original condition
        #   `'--served-model-name' and '--host' and '--port' not in task.run`
        # only tested `--port` (non-empty string literals are truthy), so a
        # run command that already contained `--served-model-name` or
        # `--host` but no `--port` got duplicate flags injected instead of
        # an error. Test each flag explicitly.
        if (
            '--served-model-name' not in task.run
            and '--host' not in task.run
            and '--port' not in task.run
        ):
            task.run = task.run.replace(
                '--model',
                (
                    f'--served-model-name {task.name} \\\n'
                    f" --host '0.0.0.0' \\\n"
                    f" --port '{task.serving.ports}' \\\n"
                    f' --model'
                ),
            )
        elif '--served-model-name' in task.run:
            raise ValueError(
                'Error creating vllm deployment: '
                '--served-model-name flag should be excluded from run command'
            )
        elif '--host' in task.run:
            raise ValueError(
                'Error creating vllm deployment: '
                '--host flag should be excluded from run command'
            )
        else:
            raise ValueError(
                'Error creating vllm deployment: '
                '--port flag should be excluded from run command'
            )

    # `general` marks a non-vllm workload for the pod template.
    general = True
    if task.run and 'vllm.entrypoints.openai.api_server' in task.run:
        general = False

    with tempfile.NamedTemporaryFile() as temp:
        common_utils.fill_template(
            'pod.yaml.j2',
            {
                # TODO(asaiacai) need to parse/round these numbers and sanity check
                'cpu': kubernetes_utils.parse_cpu_or_gpu_resource(
                    str(task.resources.cpus or '')
                ),
                'memory': kubernetes_utils.parse_memory_resource(
                    task.resources.memory or ''
                ),
                'image_id': task.resources.image_id,
                'num_gpus': num_gpus,
                'master_addr': master_addr,
                'num_nodes': task.num_nodes,
                'job_name': task.name,  # append timestamp and user id here?
                'setup_cmd': task.setup or '',
                'run_cmd': task.run or '',
                'node_hostnames': node_hostnames,
                'accelerator_type': accelerator_type,
                'sync_commands': sync_commands,
                'mkdir_commands': mkdir_commands,
                'mount_secrets': storage_secrets,
                'remote_workdir': constants.KONDUKTOR_REMOTE_WORKDIR,
                'user': common_utils.get_cleaned_username(),
                # Tailscale credentials
                'tailscale_secret': tailscale_secret,
                # SSH
                'enable_ssh': enable_ssh,
                'secret_name': secret_name,
                # Serving
                'serving': bool(task.serving),
                'general': general,
                'ports': task.serving.ports if task.serving else None,
                'probe': task.serving.probe if task.serving else None,
                'konduktor_ssh_port': backend_constants.KONDUKTOR_SSH_PORT,
                # Kinds of Secrets
                # --kind git-ssh
                'git_ssh': git_ssh_secret_name,
                # --kind default
                'default_secrets': default_secrets,
                # KONDUKTOR_DEBUG
                'konduktor_debug': os.getenv('KONDUKTOR_DEBUG', 0),
            },
            temp.name,
        )
        pod_config = common_utils.read_yaml(temp.name)
        # merge with `~/.konduktor/config.yaml``
        kubernetes_utils.combine_pod_config_fields(temp.name, pod_config)
        pod_config = common_utils.read_yaml(temp.name)

    # Priority order: task.envs > secret envs > existing pod_config envs
    existing_envs = pod_config['kubernetes']['pod_config']['spec']['containers'][0].get(
        'env', []
    )
    env_map = {env['name']: env for env in existing_envs}

    # Inject secret envs
    for env in env_secret_envs:
        env_map[env['name']] = env

    # Inject task.envs
    for k, v in task.envs.items():
        env_map[k] = {'name': k, 'value': v}

    # Replace the container's env section with the merged and prioritized map
    pod_config['kubernetes']['pod_config']['spec']['containers'][0]['env'] = list(
        env_map.values()
    )
    logger.debug(f'rendered pod spec: \n\t{pod_config}')

    # validate pod spec using json schema
    try:
        validator.validate_pod_spec(pod_config['kubernetes']['pod_config']['spec'])
    except ValueError as e:
        raise click.UsageError(str(e))

    return pod_config
|
|
296
|
+
|
|
297
|
+
|
|
298
|
+
def inject_deployment_pod_metadata(
    pod_spec: Dict[str, Any], task: 'konduktor.Task'
) -> None:
    """Inject deployment-specific metadata into pod spec.

    This function adds deployment-specific labels, annotations, and settings
    that are not present in the basic pod spec used for JobSets. The pod
    spec is modified in place.

    Args:
        pod_spec: The pod spec dictionary to modify
        task: The task object containing resource information

    Raises:
        ValueError: if ``maxRunDurationSeconds`` is missing from
            ``task.resources.labels`` (or the labels themselves are missing).
    """
    # Ensure metadata structure exists
    pod_spec.setdefault('metadata', {})
    pod_spec['metadata'].setdefault('labels', {})
    pod_spec['metadata'].setdefault('annotations', {})

    # Determine deployment type
    deployment_type = 'general'
    if task.run and 'vllm.entrypoints.openai.api_server' in task.run:
        deployment_type = 'vllm'

    # Add deployment-specific label for vllm deployments only
    if deployment_type == 'vllm':
        pod_spec['metadata']['labels'][backend_constants.AIBRIX_NAME_LABEL] = task.name

    # Add deployment-specific label for all deployments
    pod_spec['metadata']['labels'][backend_constants.DEPLOYMENT_NAME_LABEL] = task.name

    # Add resource labels
    if task.resources and task.resources.labels:
        pod_spec['metadata']['labels'].update(task.resources.labels)

    # Add max run duration annotation. Fix: validate with explicit raises
    # instead of `assert` — asserts are stripped under `python -O` and raised
    # AssertionError where callers expect the ValueError below.
    if task.resources is None or task.resources.labels is None:
        raise ValueError('maxRunDurationSeconds is required')
    maxRunDurationSeconds = task.resources.labels.get('maxRunDurationSeconds', None)
    if not maxRunDurationSeconds:
        raise ValueError('maxRunDurationSeconds is required')
    pod_spec['metadata']['annotations'][_RUN_DURATION_ANNOTATION_KEY] = str(
        maxRunDurationSeconds
    )

    # Set restart policy for deployments (JobSets restart differently)
    pod_spec.setdefault('spec', {})
    pod_spec['spec']['restartPolicy'] = 'Always'
|
|
343
|
+
|
|
344
|
+
|
|
345
|
+
def merge_pod_into_deployment_template(
    deployment_spec: Dict[str, Any], pod_spec: Dict[str, Any]
) -> None:
    """Install *pod_spec* as the pod template of a Deployment spec.

    Sets (or overwrites) the ``template`` key of *deployment_spec* in
    place; nothing is returned.

    Args:
        deployment_spec: The deployment spec dictionary to modify
        pod_spec: The pod spec to install under ``template``
    """
    deployment_spec.update(template=pod_spec)
|
|
355
|
+
|
|
356
|
+
|
|
357
|
+
def inject_jobset_metadata(jobset_spec: Dict[str, Any], task: 'konduktor.Task') -> None:
    """Inject JobSet-specific pod metadata.

    This function adds JobSet-specific annotations that are not present
    in the basic pod spec. The JobSet spec is modified in place.

    Args:
        jobset_spec: The JobSet spec dictionary to modify
        task: The task object containing resource information

    Raises:
        ValueError: if ``maxRunDurationSeconds`` is missing from
            ``task.resources.labels`` (or the labels themselves are missing).
    """
    # Add max run duration annotation. Fix: validate with explicit raises
    # instead of `assert` — asserts are stripped under `python -O` and raised
    # AssertionError where callers expect the ValueError below. Matches
    # inject_deployment_pod_metadata.
    if task.resources is None or task.resources.labels is None:
        raise ValueError('maxRunDurationSeconds is required')
    max_run_duration = task.resources.labels.get('maxRunDurationSeconds', None)
    if not max_run_duration:
        raise ValueError('maxRunDurationSeconds is required')
    jobset_spec['jobset']['metadata']['annotations'][_RUN_DURATION_ANNOTATION_KEY] = (
        str(max_run_duration)
    )

    # Inject resource labels into JobSet metadata. Labels are guaranteed
    # non-empty here by the check above.
    jobset_spec['jobset']['metadata']['labels'].update(task.resources.labels)
|
|
379
|
+
|
|
380
|
+
|
|
381
|
+
def merge_pod_into_jobset_template(
    jobset_spec: Dict[str, Any], pod_spec: Dict[str, Any]
) -> None:
    """Install *pod_spec* as the pod template of a JobSet spec.

    Targets the first replicated job's job template and sets its pod
    ``template`` in place; nothing is returned.

    Args:
        jobset_spec: The JobSet spec dictionary to modify
        pod_spec: The pod spec to install as the job's pod template
    """
    first_job_spec = jobset_spec['jobset']['spec']['replicatedJobs'][0]['template'][
        'spec'
    ]
    first_job_spec['template'] = pod_spec
|