konduktor-nightly 0.1.0.dev20251128104812__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- konduktor/__init__.py +49 -0
- konduktor/adaptors/__init__.py +0 -0
- konduktor/adaptors/aws.py +221 -0
- konduktor/adaptors/common.py +118 -0
- konduktor/adaptors/gcp.py +126 -0
- konduktor/authentication.py +124 -0
- konduktor/backends/__init__.py +6 -0
- konduktor/backends/backend.py +86 -0
- konduktor/backends/constants.py +21 -0
- konduktor/backends/deployment.py +204 -0
- konduktor/backends/deployment_utils.py +1351 -0
- konduktor/backends/jobset.py +225 -0
- konduktor/backends/jobset_utils.py +726 -0
- konduktor/backends/pod_utils.py +501 -0
- konduktor/check.py +184 -0
- konduktor/cli.py +1945 -0
- konduktor/config.py +420 -0
- konduktor/constants.py +36 -0
- konduktor/controller/__init__.py +0 -0
- konduktor/controller/constants.py +56 -0
- konduktor/controller/launch.py +44 -0
- konduktor/controller/node.py +116 -0
- konduktor/controller/parse.py +111 -0
- konduktor/dashboard/README.md +30 -0
- konduktor/dashboard/backend/main.py +169 -0
- konduktor/dashboard/backend/sockets.py +154 -0
- konduktor/dashboard/frontend/.eslintrc.json +3 -0
- konduktor/dashboard/frontend/.gitignore +36 -0
- konduktor/dashboard/frontend/app/api/jobs/route.js +71 -0
- konduktor/dashboard/frontend/app/api/namespaces/route.js +69 -0
- konduktor/dashboard/frontend/app/components/Grafana.jsx +66 -0
- konduktor/dashboard/frontend/app/components/JobsData.jsx +197 -0
- konduktor/dashboard/frontend/app/components/LogsData.jsx +139 -0
- konduktor/dashboard/frontend/app/components/NavMenu.jsx +39 -0
- konduktor/dashboard/frontend/app/components/NavTabs.jsx +73 -0
- konduktor/dashboard/frontend/app/components/NavTabs2.jsx +30 -0
- konduktor/dashboard/frontend/app/components/SelectBtn.jsx +27 -0
- konduktor/dashboard/frontend/app/components/lib/utils.js +6 -0
- konduktor/dashboard/frontend/app/components/ui/chip-select.jsx +78 -0
- konduktor/dashboard/frontend/app/components/ui/input.jsx +19 -0
- konduktor/dashboard/frontend/app/components/ui/navigation-menu.jsx +104 -0
- konduktor/dashboard/frontend/app/components/ui/select.jsx +120 -0
- konduktor/dashboard/frontend/app/favicon.ico +0 -0
- konduktor/dashboard/frontend/app/globals.css +120 -0
- konduktor/dashboard/frontend/app/jobs/page.js +10 -0
- konduktor/dashboard/frontend/app/layout.js +22 -0
- konduktor/dashboard/frontend/app/logs/page.js +11 -0
- konduktor/dashboard/frontend/app/page.js +12 -0
- konduktor/dashboard/frontend/jsconfig.json +7 -0
- konduktor/dashboard/frontend/next.config.mjs +4 -0
- konduktor/dashboard/frontend/package-lock.json +6687 -0
- konduktor/dashboard/frontend/package.json +37 -0
- konduktor/dashboard/frontend/postcss.config.mjs +8 -0
- konduktor/dashboard/frontend/server.js +64 -0
- konduktor/dashboard/frontend/tailwind.config.js +17 -0
- konduktor/data/__init__.py +9 -0
- konduktor/data/aws/__init__.py +15 -0
- konduktor/data/aws/s3.py +1138 -0
- konduktor/data/constants.py +7 -0
- konduktor/data/data_utils.py +268 -0
- konduktor/data/gcp/__init__.py +19 -0
- konduktor/data/gcp/constants.py +42 -0
- konduktor/data/gcp/gcs.py +994 -0
- konduktor/data/gcp/utils.py +9 -0
- konduktor/data/registry.py +19 -0
- konduktor/data/storage.py +812 -0
- konduktor/data/storage_utils.py +535 -0
- konduktor/execution.py +447 -0
- konduktor/kube_client.py +237 -0
- konduktor/logging.py +111 -0
- konduktor/manifests/aibrix-setup.yaml +430 -0
- konduktor/manifests/apoxy-setup.yaml +184 -0
- konduktor/manifests/apoxy-setup2.yaml +98 -0
- konduktor/manifests/controller_deployment.yaml +69 -0
- konduktor/manifests/dashboard_deployment.yaml +131 -0
- konduktor/manifests/dmesg_daemonset.yaml +57 -0
- konduktor/manifests/pod_cleanup_controller.yaml +129 -0
- konduktor/resource.py +546 -0
- konduktor/serving.py +153 -0
- konduktor/task.py +949 -0
- konduktor/templates/deployment.yaml.j2 +191 -0
- konduktor/templates/jobset.yaml.j2 +43 -0
- konduktor/templates/pod.yaml.j2 +563 -0
- konduktor/usage/__init__.py +0 -0
- konduktor/usage/constants.py +21 -0
- konduktor/utils/__init__.py +0 -0
- konduktor/utils/accelerator_registry.py +17 -0
- konduktor/utils/annotations.py +62 -0
- konduktor/utils/base64_utils.py +95 -0
- konduktor/utils/common_utils.py +426 -0
- konduktor/utils/constants.py +5 -0
- konduktor/utils/env_options.py +55 -0
- konduktor/utils/exceptions.py +234 -0
- konduktor/utils/kubernetes_enums.py +8 -0
- konduktor/utils/kubernetes_utils.py +763 -0
- konduktor/utils/log_utils.py +467 -0
- konduktor/utils/loki_utils.py +102 -0
- konduktor/utils/rich_utils.py +123 -0
- konduktor/utils/schemas.py +625 -0
- konduktor/utils/subprocess_utils.py +273 -0
- konduktor/utils/ux_utils.py +247 -0
- konduktor/utils/validator.py +461 -0
- konduktor_nightly-0.1.0.dev20251128104812.dist-info/LICENSE +91 -0
- konduktor_nightly-0.1.0.dev20251128104812.dist-info/METADATA +98 -0
- konduktor_nightly-0.1.0.dev20251128104812.dist-info/RECORD +107 -0
- konduktor_nightly-0.1.0.dev20251128104812.dist-info/WHEEL +4 -0
- konduktor_nightly-0.1.0.dev20251128104812.dist-info/entry_points.txt +3 -0
konduktor/backends/pod_utils.py
ADDED
@@ -0,0 +1,501 @@
"""Pod utils: handles pod spec creation and manipulation"""

import base64
import json
import os
import tempfile
import typing
from typing import Any, Dict
from urllib.parse import urlparse

import click

import konduktor
from konduktor import authentication, config, constants, logging
from konduktor.backends import constants as backend_constants
from konduktor.data import registry
from konduktor.utils import (
    common_utils,
    exceptions,
    kubernetes_utils,
    ux_utils,
    validator,
)

if typing.TYPE_CHECKING:
    pass

logger = logging.get_logger(__name__)

_RUN_DURATION_ANNOTATION_KEY = 'kueue.x-k8s.io/maxRunDurationSeconds'


def create_pod_spec(task: 'konduktor.Task') -> Dict[str, Any]:
    """Merges the task definition with config to create a final pod spec dict.

    This function is shared between JobSets and Deployments.

    Returns:
        Dict[str, Any]: k8s pod spec
    """
    context = kubernetes_utils.get_current_kube_config_context_name()
    namespace = kubernetes_utils.get_kube_config_context_namespace(context)

    # fill out the templating variables
    assert task.resources is not None, 'Task resources are required'
    num_gpus = task.resources.get_accelerator_count() or 0
    task.name = f'{task.name}-{common_utils.get_usage_run_id()[:4]}'
    node_hostnames = ','.join(
        [f'{task.name}-workers-0-{idx}.{task.name}' for idx in range(task.num_nodes)]
    )
    master_addr = f'{task.name}-workers-0-0.{task.name}'

    accelerator_type = task.resources.get_accelerator_type()

    assert task.resources.cpus is not None, 'Task resources cpus are required'
    assert task.resources.memory is not None, 'Task resources memory are required'
    assert task.resources.image_id is not None, 'Task resources image_id are required'

    # template the commands to run on the container for syncing files. At this point
    # task.stores is Dict[str, storage_utils.Storage] which is (dst, storage_obj_src)
    # first we iterate through storage_mounts and then file_mounts.
    sync_commands = []
    mkdir_commands = []
    storage_secrets = {}
    # first do storage_mount sync
    for dst, store in task.storage_mounts.items():
        # TODO(asaiacai) idk why but theres an extra storage mount for the
        # file mounts. Should be cleaned up eventually in
        # maybe_translate_local_file_mounts_and_sync_up
        assert store.source is not None and isinstance(
            store.source, str
        ), 'Store source is required'
        store_scheme = urlparse(store.source).scheme
        if '/tmp/konduktor-job-filemounts-files' in dst:
            continue
        # should impelement a method here instead of raw dog dict access
        cloud_store = registry._REGISTRY[store_scheme]
        storage_secrets[store_scheme] = cloud_store._STORE.get_k8s_credential_name()
        exists, _ = kubernetes_utils.check_secret_exists(
            storage_secrets[store_scheme], namespace=namespace, context=context
        )
        assert exists, (
            f"secret {storage_secrets[store_scheme]} doesn't "
            f'exist in namespace {namespace}'
        )
        mkdir_commands.append(
            f'cd {constants.KONDUKTOR_REMOTE_WORKDIR};' f'mkdir -p {dst}'
        )
        assert store._bucket_sub_path is not None
        sync_commands.append(
            cloud_store.make_sync_dir_command(
                os.path.join(store.source, store._bucket_sub_path), dst
            )
        )

    # then do file_mount sync.
    assert task.file_mounts is not None
    for dst, src in task.file_mounts.items():
        store_scheme = str(urlparse(store.source).scheme)
        cloud_store = registry._REGISTRY[store_scheme]
        mkdir_commands.append(
            f'cd {constants.KONDUKTOR_REMOTE_WORKDIR};'
            f'mkdir -p {os.path.dirname(dst)}'
        )
        storage_secrets[store_scheme] = cloud_store._STORE.get_k8s_credential_name()
        exists, reason = kubernetes_utils.check_secret_exists(
            storage_secrets[store_scheme], namespace=namespace, context=context
        )
        assert exists, (
            f'secret {storage_secrets[store_scheme]} '
            f"doesn't exist in namespace {namespace}"
        )
        sync_commands.append(cloud_store.make_sync_file_command(src, dst))

    tailscale_secret = config.get_nested(('tailscale', 'secret_name'), None)
    if tailscale_secret:
        secret_exist, err = kubernetes_utils.check_secret_exists(
            tailscale_secret, namespace, context
        )
        if not secret_exist:
            with ux_utils.print_exception_no_traceback():
                raise exceptions.MissingSecretError(
                    f'No tailscale auth-key secret `{tailscale_secret}` found even '
                    f'though specified by `tailscale.secret_name`: {err}'
                )

    enable_ssh = config.get_nested(('ssh', 'enable'), False) or tailscale_secret
    secret_name = None
    if enable_ssh:
        private_key_path, public_key_path = authentication.get_or_generate_keys()
        with (
            open(private_key_path, 'rb') as private_key_file,
            open(public_key_path, 'rb') as public_key_file,
        ):
            private_key, public_key = private_key_file.read(), public_key_file.read()
            user_hash = common_utils.get_user_hash()
            secret_name = f'konduktor-ssh-keys-{user_hash}'
            ok, result = kubernetes_utils.set_secret(
                secret_name=secret_name,
                namespace=namespace,
                context=context,
                data={
                    'PUBKEY': base64.b64encode(public_key).decode(),
                    'PRIVKEY': base64.b64encode(private_key).decode(),
                },
            )
            if not ok:
                raise exceptions.CreateSecretError(
                    f'Failed to set k8s secret {secret_name}: \n{result}'
                )

    # Mount the user's secrets
    git_ssh_secret_name = None
    env_secret_envs = []
    default_secrets = []
    basename_by_k8s: Dict[str, str] = {}

    # only get own secrets
    user_hash = common_utils.get_user_hash()
    label_selector = f'{backend_constants.SECRET_OWNER_LABEL}={user_hash}'
    user_secrets = kubernetes_utils.list_secrets(
        namespace, context, label_filter=label_selector
    )

    for secret in user_secrets:
        kind = kubernetes_utils.get_secret_kind(secret)

        # incase the user modified their secret to have no key:value data
        if secret.data is None:
            secret.data = {}

        # fill the map for *all* secrets we see
        k8s_name = secret.metadata.name
        lbls = secret.metadata.labels or {}
        base = lbls.get(
            backend_constants.SECRET_BASENAME_LABEL,
            # fallback: strip trailing "-<something>" once if present
            k8s_name.rsplit('-', 1)[0] if '-' in k8s_name else k8s_name,
        )
        basename_by_k8s[k8s_name] = base

        if kind == 'git-ssh' and git_ssh_secret_name is None:
            git_ssh_secret_name = secret.metadata.name
        elif kind == 'env':
            env_secret_name = secret.metadata.name
            # iterate ALL keys, not just one (ex. if user made a multi-key env secret)
            for key, _ in secret.data.items():
                # wire the env var to read its value from a k8s secret
                env_secret_envs.append(
                    {
                        'name': key,
                        'valueFrom': {
                            'secretKeyRef': {'name': env_secret_name, 'key': key}
                        },
                    }
                )
        elif kind == 'default':
            default_secret_name = secret.metadata.name
            basename = secret.metadata.labels.get(
                backend_constants.SECRET_BASENAME_LABEL, default_secret_name
            )
            default_secrets.append(
                {'k8s_name': default_secret_name, 'mount_name': basename}
            )

    # Check if the task references KONDUKTOR_DEFAULT_SECRETS and that it exists
    uses_default_secret_var = (
        'KONDUKTOR_DEFAULT_SECRETS' in (task.run or '')
        or 'KONDUKTOR_DEFAULT_SECRETS' in (task.setup or '')
        or '/konduktor/default-secrets/' in (task.run or '')
        or '/konduktor/default-secrets/' in (task.setup or '')
    )
    if uses_default_secret_var and not default_secrets:
        raise exceptions.MissingSecretError(
            f'Task references KONDUKTOR_DEFAULT_SECRETS or '
            f'/konduktor/default-secrets but '
            f'user {common_utils.get_cleaned_username()} '
            f'has no default secrets. Paths like '
            f'$KONDUKTOR_DEFAULT_SECRETS/<secret_name>/... will not exist.'
        )

    # Inject --served-model-name, --host, and --port into serving run command
    if task.serving and task.run and 'vllm.entrypoints.openai.api_server' in task.run:
        if (
            '--served-model-name' not in task.run
            and '--host' not in task.run
            and '--port' not in task.run
        ):
            task.run = task.run.replace(
                '--model',
                (
                    f'--served-model-name {task.name} \\\n'
                    f" --host '0.0.0.0' \\\n"
                    f" --port '{task.serving.ports}' \\\n"
                    f' --model'
                ),
            )
        elif '--served-model-name' in task.run:
            raise ValueError(
                'Error creating vllm deployment: '
                '--served-model-name flag should be excluded from run command'
            )
        elif '--host' in task.run:
            raise ValueError(
                'Error creating vllm deployment: '
                '--host flag should be excluded from run command'
            )
        else:
            raise ValueError(
                'Error creating vllm deployment: '
                '--port flag should be excluded from run command'
            )

    general = True
    if task.run and 'vllm.entrypoints.openai.api_server' in task.run:
        general = False

    with tempfile.NamedTemporaryFile() as temp:
        common_utils.fill_template(
            'pod.yaml.j2',
            {
                # TODO(asaiacai) need to parse/round these numbers and sanity check
                'cpu': kubernetes_utils.parse_cpu_or_gpu_resource(
                    str(task.resources.cpus or '')
                ),
                'memory': kubernetes_utils.parse_memory_resource(
                    task.resources.memory or ''
                ),
                'image_id': task.resources.image_id,
                'num_gpus': num_gpus,
                'master_addr': master_addr,
                'num_nodes': task.num_nodes,
                'job_name': task.name,  # append timestamp and user id here?
                'setup_cmd': task.setup or '',
                'run_cmd': task.run or '',
                'node_hostnames': node_hostnames,
                'accelerator_type': accelerator_type,
                'sync_commands': sync_commands,
                'mkdir_commands': mkdir_commands,
                'mount_secrets': storage_secrets,
                'remote_workdir': constants.KONDUKTOR_REMOTE_WORKDIR,
                'user': common_utils.get_cleaned_username(),
                # Tailscale credentials
                'tailscale_secret': tailscale_secret,
                # SSH
                'enable_ssh': enable_ssh,
                'secret_name': secret_name,
                # Serving
                'serving': bool(task.serving),
                'general': general,
                'ports': task.serving.ports if task.serving else None,
                'probe': task.serving.probe if task.serving else None,
                'konduktor_ssh_port': backend_constants.KONDUKTOR_SSH_PORT,
                # Kinds of Secrets
                # --kind git-ssh
                'git_ssh': git_ssh_secret_name,
                # --kind default
                'default_secrets': default_secrets,
                # KONDUKTOR_DEBUG
                'konduktor_debug': os.getenv('KONDUKTOR_DEBUG', 0),
            },
            temp.name,
        )

        # Capture the template env names BEFORE user config is merged
        pod_config_template = common_utils.read_yaml(temp.name)
        tmpl_envs = pod_config_template['kubernetes']['pod_config']['spec'][
            'containers'
        ][0].get('env', [])
        tmpl_env_names = {e['name'] for e in tmpl_envs}

        pod_config = common_utils.read_yaml(temp.name)
        # merge with `~/.konduktor/config.yaml` (config.yaml overrides template)
        kubernetes_utils.combine_pod_config_fields(temp.name, pod_config)
        pod_config = common_utils.read_yaml(temp.name)

    # Find what came from user config (appeared after combine, not in template)
    premerge_envs = pod_config['kubernetes']['pod_config']['spec']['containers'][0].get(
        'env', []
    )
    premerge_names = {e['name'] for e in premerge_envs}
    config_env_names0 = premerge_names - tmpl_env_names

    # Build final env list
    env_map = {env['name']: env for env in premerge_envs}

    # Inject secret envs (env secrets override config.yaml)
    for env in env_secret_envs:
        env_map[env['name']] = env

    # Inject task envs
    # CLI+task.yaml overrides everything else
    # CLI already overrode task.yaml in other code
    for k, v in task.envs.items():
        env_map[k] = {'name': k, 'value': v}

    final_envs_list = list(env_map.values())
    pod_config['kubernetes']['pod_config']['spec']['containers'][0]['env'] = (
        final_envs_list
    )
    container = pod_config['kubernetes']['pod_config']['spec']['containers'][0]
    final_envs = container['env']
    final_names = {e['name'] for e in final_envs}

    logger.debug(f'rendered pod spec: \n\t{json.dumps(pod_config, indent=2)}')

    # 1) Get secret envs actually used in the final env list
    secret_details = sorted(
        (e['name'], e['valueFrom']['secretKeyRef']['name'])
        for e in final_envs
        if isinstance(e, dict)
        and e.get('valueFrom', {})
        and e['valueFrom'].get('secretKeyRef')
    )
    secret_names = [n for n, _ in secret_details]

    # 2) Get task-sourced (CLI+task.yaml) envs actually used in the final env list
    task_all_names = sorted(
        n
        for n in (task.envs or {}).keys()
        if n in final_names and n not in secret_names
    )

    # 3) Get Config.yaml envs actually used in the final env list
    config_names = sorted(
        n
        for n in config_env_names0
        if n in final_names and n not in secret_names and n not in task_all_names
    )

    # 4) Get other envs (template/system) actually used in the final env list
    other_names = sorted(
        final_names - set(secret_names) - set(task_all_names) - set(config_names)
    )

    # Export helper envs for the startup script (names only)
    def _append_helper(name: str, values):
        container['env'].append({'name': name, 'value': ','.join(values)})

    # to show user basenames of k8s secrets instead of actual
    # k8s secret names (which have added suffixes)
    secret_map_pairs = [
        f'{var}={basename_by_k8s.get(secret_k8s, secret_k8s)}'
        for (var, secret_k8s) in secret_details
    ]

    # Priority order: CLI > task.yaml > env secret > config > template/system
    _append_helper(
        'KONDUKTOR_ENV_SECRETS_HOPEFULLY_NO_NAME_COLLISION',
        secret_names,
    )
    _append_helper(
        'KONDUKTOR_ENV_SECRETS_MAP_HOPEFULLY_NO_NAME_COLLISION',
        secret_map_pairs,
    )
    _append_helper(
        'KONDUKTOR_ENV_TASK_ALL_HOPEFULLY_NO_NAME_COLLISION',
        task_all_names,
    )
    _append_helper(
        'KONDUKTOR_ENV_CONFIG_HOPEFULLY_NO_NAME_COLLISION',
        config_names,
    )
    _append_helper(
        'KONDUKTOR_ENV_OTHER_HOPEFULLY_NO_NAME_COLLISION',
        other_names,
    )

    # validate pod spec using json schema
    try:
        validator.validate_pod_spec(pod_config['kubernetes']['pod_config']['spec'])
    except ValueError as e:
        raise click.UsageError(str(e))

    return pod_config


def inject_deployment_pod_metadata(
    pod_spec: Dict[str, Any], task: 'konduktor.Task'
) -> None:
    """Inject deployment-specific metadata into pod spec.

    This function adds deployment-specific labels, annotations, and settings
    that are not present in the basic pod spec used for JobSets.

    Args:
        pod_spec: The pod spec dictionary to modify
        task: The task object containing resource information
    """
    # Ensure metadata structure exists
    pod_spec.setdefault('metadata', {})
    pod_spec['metadata'].setdefault('labels', {})
    pod_spec['metadata'].setdefault('annotations', {})

    # Determine deployment type
    deployment_type = 'general'
    if task.run and 'vllm.entrypoints.openai.api_server' in task.run:
        deployment_type = 'vllm'

    # Add deployment-specific label for vllm deployments only
    if deployment_type == 'vllm':
        pod_spec['metadata']['labels'][backend_constants.AIBRIX_NAME_LABEL] = task.name

    # Add deployment-specific label for all deployments
    pod_spec['metadata']['labels'][backend_constants.DEPLOYMENT_NAME_LABEL] = task.name

    # Add resource labels
    if task.resources and task.resources.labels:
        pod_spec['metadata']['labels'].update(task.resources.labels)

    # Set restart policy for deployments
    pod_spec.setdefault('spec', {})
    pod_spec['spec']['restartPolicy'] = 'Always'


def merge_pod_into_deployment_template(
    deployment_spec: Dict[str, Any], pod_spec: Dict[str, Any]
) -> None:
    """Merge a pod spec into a deployment template.

    Args:
        deployment_spec: The deployment spec dictionary to modify
        pod_spec: The pod spec to merge into the deployment template
    """
    deployment_spec['template'] = pod_spec


def inject_jobset_metadata(jobset_spec: Dict[str, Any], task: 'konduktor.Task') -> None:
    """Inject JobSet-specific pod metadata.

    This function adds JobSet-specific annotations that are not present
    in the basic pod spec.

    Args:
        jobset_spec: The JobSet spec dictionary to modify
        task: The task object containing resource information
    """
    # Add max run duration annotation
    assert (
        task.resources is not None and task.resources.labels is not None
    ), 'Task resources and task.resources.labels are required'
    maxRunDurationSeconds = task.resources.labels.get('maxRunDurationSeconds', None)
    if not maxRunDurationSeconds:
        raise ValueError('maxRunDurationSeconds is required')
    jobset_spec['jobset']['metadata']['annotations'][_RUN_DURATION_ANNOTATION_KEY] = (
        str(maxRunDurationSeconds)
    )

    # Inject resource labels into JobSet metadata.
    if task.resources and task.resources.labels:
        jobset_spec['jobset']['metadata']['labels'].update(task.resources.labels)


def merge_pod_into_jobset_template(
    jobset_spec: Dict[str, Any], pod_spec: Dict[str, Any]
) -> None:
    """Merge a pod spec into a JobSet template.

    Args:
        jobset_spec: The JobSet spec dictionary to modify
        pod_spec: The pod spec to merge into the JobSet template
    """
    jobset_spec['jobset']['spec']['replicatedJobs'][0]['template']['spec'][
        'template'
    ] = pod_spec
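The helpers above split pod construction into a shared step (create_pod_spec) and backend-specific steps (inject_*_metadata and merge_pod_into_*_template). A minimal sketch of the JobSet path follows; it is illustrative only and not part of the wheel: the build_jobset_spec wrapper and the pod_config['kubernetes']['pod_config'] extraction are assumptions, since the real call sites live in jobset_utils.py and deployment_utils.py, which are not shown here.

# Illustrative sketch (not part of the package): how the helpers above
# appear to compose for the JobSet path. `jobset_spec` is assumed to be the
# dict rendered from templates/jobset.yaml.j2.
from konduktor.backends import pod_utils


def build_jobset_spec(task, jobset_spec):
    # Shared step: sync commands, secrets, env merging, pod-spec validation.
    pod_config = pod_utils.create_pod_spec(task)
    # Assumed extraction of the pod template from the rendered config.
    pod_template = pod_config['kubernetes']['pod_config']

    # JobSet-only metadata: maxRunDurationSeconds annotation + resource labels.
    pod_utils.inject_jobset_metadata(jobset_spec, task)
    # Place the pod template at replicatedJobs[0].template.spec.template.
    pod_utils.merge_pod_into_jobset_template(jobset_spec, pod_template)
    return jobset_spec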
konduktor/check.py
ADDED
@@ -0,0 +1,184 @@
# Proprietary Changes made for Trainy under the Trainy Software License
# Original source: skypilot: https://github.com/skypilot-org/skypilot
# which is Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Credential checks: check cloud credentials and enable clouds.

Our architecture is client-server and requires that credentials are stored
as a secret in the cluster. This makes it so that cluster admins can just
deploy credentials (s3, gcs, r2) once to the namespace. Users then during job
use the secret stored for mounting credentials to pods. Users running must also
have the credentials present on their local machine, otherwise they won't be able to
upload files to object storage.

We have to check that the credentials are valid on the client side.
If the check fails, then we will attempt to check the credentials present on the client.
If these credentials are valid, we update the secret on the cluster, and
run the job as usual.
If these credentials are not valid, we fail the job and alert the user.

"""

import traceback
import typing
from typing import Iterable, List, Optional, Tuple

import click
import colorama

from konduktor import config as konduktor_config
from konduktor import logging
from konduktor.data import registry
from konduktor.utils import rich_utils

if typing.TYPE_CHECKING:
    from konduktor.data import storage_utils

logger = logging.get_logger(__name__)


def check(
    quiet: bool = False,
    clouds: Optional[Iterable[str]] = None,
) -> List[str]:
    echo = (
        (lambda *_args, **_kwargs: None)
        if quiet
        else lambda *args, **kwargs: click.echo(*args, **kwargs, color=True)
    )
    echo('Checking credentials to enable clouds storage for Konduktor.')
    enabled_clouds = []
    disabled_clouds = []

    def check_one_cloud(
        cloud_tuple: Tuple[str, 'storage_utils.CloudStorage'],
    ) -> None:
        cloud_repr, cloud = cloud_tuple
        with rich_utils.safe_status(f'Checking {cloud_repr}...'):
            try:
                logger.info(f'Checking {cloud_repr} local client credentials...')
                ok, reason = cloud.check_credentials()
            except Exception:  # pylint: disable=broad-except
                # Catch all exceptions to prevent a single cloud from blocking
                # the check for other clouds.
                ok, reason = False, traceback.format_exc()
        status_msg = 'enabled' if ok else 'disabled'
        styles = {'fg': 'green', 'bold': False} if ok else {'dim': True}
        echo(' ' + click.style(f'{cloud_repr}: {status_msg}', **styles) + ' ' * 30)
        if ok:
            enabled_clouds.append(cloud_repr)
            if reason is not None:
                echo(f' Hint: {reason}')
        else:
            disabled_clouds.append(cloud_repr)
            echo(f' Reason: {reason}')

    def get_cloud_tuple(cloud_name: str) -> Tuple[str, 'storage_utils.CloudStorage']:
        # Validates cloud_name and returns a tuple of the cloud's name and
        # the cloud object. Includes special handling for Cloudflare.
        cloud_obj = registry._REGISTRY.get(cloud_name, None)
        assert cloud_obj is not None, f'Cloud {cloud_name!r} not found'
        return cloud_name, cloud_obj

    def get_all_clouds():
        return tuple([c for c in registry._REGISTRY.keys()])

    if clouds is not None:
        cloud_list = clouds
    else:
        cloud_list = get_all_clouds()
    clouds_to_check = [get_cloud_tuple(c) for c in cloud_list]

    # Use allowed_clouds from config if it exists, otherwise check all clouds.
    # Also validate names with get_cloud_tuple.
    config_allowed_cloud_names = [
        c for c in konduktor_config.get_nested(('allowed_clouds',), get_all_clouds())
    ]
    # Use disallowed_cloud_names for logging the clouds that will be disabled
    # because they are not included in allowed_clouds in config.yaml.
    disallowed_cloud_names = [
        c for c in get_all_clouds() if c not in config_allowed_cloud_names
    ]
    # Check only the clouds which are allowed in the config.
    clouds_to_check = [c for c in clouds_to_check if c[0] in config_allowed_cloud_names]

    for cloud_tuple in sorted(clouds_to_check):
        check_one_cloud(cloud_tuple)

    # Cloudflare is not a real cloud in registry.CLOUD_REGISTRY, and should
    # not be inserted into the DB (otherwise `sky launch` and other code would
    # error out when it's trying to look it up in the registry).
    enabled_clouds_set = {
        cloud for cloud in enabled_clouds if not cloud.startswith('Cloudflare')
    }
    disabled_clouds_set = {
        cloud for cloud in disabled_clouds if not cloud.startswith('Cloudflare')
    }

    # Determine the set of enabled clouds: (previously enabled clouds + newly
    # enabled clouds - newly disabled clouds) intersected with
    # config_allowed_clouds, if specified in config.yaml.
    # This means that if a cloud is already enabled and is not included in
    # allowed_clouds in config.yaml, it will be disabled.
    all_enabled_clouds = enabled_clouds_set - disabled_clouds_set

    disallowed_clouds_hint = None
    if disallowed_cloud_names:
        disallowed_clouds_hint = (
            '\nNote: The following clouds were disabled because they were not '
            'included in allowed_clouds in ~/.konduktor/config.yaml: '
            f'{", ".join([c for c in disallowed_cloud_names])}'
        )
    if not all_enabled_clouds:
        echo(
            click.style(
                'No cloud is enabled. Konduktor will not be able to run any '
                'task. Run `konduktor check` for more info.',
                fg='red',
                bold=True,
            )
        )
        if disallowed_clouds_hint:
            echo(click.style(disallowed_clouds_hint, dim=True))
        raise SystemExit()
    else:
        clouds_arg = ' ' + ' '.join(disabled_clouds) if clouds is not None else ''
        echo(
            click.style(
                '\nTo enable a cloud, follow the hints above and rerun: ', dim=True
            )
            + click.style(f'konduktor check {clouds_arg}', bold=True)
            + '\n'
            + click.style(
                'If any problems remain, refer to detailed docs at: '
                'https://trainy.mintlify.app',  # pylint: disable=line-too-long
                dim=True,
            )
        )

        if disallowed_clouds_hint:
            echo(click.style(disallowed_clouds_hint, dim=True))

    # Pretty print for UX.
    if not quiet:
        enabled_clouds_str = '\n ' + '\n '.join(
            [_format_enabled_storage(cloud) for cloud in sorted(all_enabled_clouds)]
        )
        echo(
            f'\n{colorama.Fore.GREEN}{logging.PARTY_POPPER_EMOJI} '
            f'Enabled clouds {logging.PARTY_POPPER_EMOJI}'
            f'{colorama.Style.RESET_ALL}{enabled_clouds_str}'
        )
    return enabled_clouds


def _format_enabled_storage(cloud_name: str) -> str:
    return f'{colorama.Fore.GREEN}{cloud_name}{colorama.Style.RESET_ALL}'
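check() is the client-side credential check invoked by the `konduktor check` command (cli.py and entry_points.txt in the listing above). A minimal usage sketch follows; it is illustrative only, and the 's3'/'gs' registry keys are an assumption based on the bundled AWS and GCP storage backends.

# Illustrative sketch (not part of the package): calling check() directly.
from konduktor import check as checks

# Check every registered object store and print the enabled/disabled summary.
enabled = checks.check()

# Restrict the check to specific backends; 's3'/'gs' are assumed registry keys.
enabled = checks.check(quiet=True, clouds=['s3', 'gs'])
print(enabled)  # e.g. ['s3'] if only AWS credentials are valid locally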