konduktor-nightly 0.1.0.dev20251022104926__py3-none-any.whl → 0.1.0.dev20251107104752__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- konduktor/__init__.py +2 -2
- konduktor/backends/constants.py +1 -0
- konduktor/backends/deployment.py +13 -2
- konduktor/backends/deployment_utils.py +3 -3
- konduktor/backends/jobset_utils.py +6 -6
- konduktor/backends/pod_utils.py +133 -18
- konduktor/cli.py +57 -27
- konduktor/manifests/aibrix-setup.yaml +157 -1
- konduktor/manifests/apoxy-setup2.yaml +1 -1
- konduktor/resource.py +9 -2
- konduktor/task.py +1 -5
- konduktor/templates/deployment.yaml.j2 +5 -3
- konduktor/templates/pod.yaml.j2 +123 -9
- konduktor/utils/base64_utils.py +2 -0
- konduktor/utils/validator.py +12 -0
- {konduktor_nightly-0.1.0.dev20251022104926.dist-info → konduktor_nightly-0.1.0.dev20251107104752.dist-info}/METADATA +1 -1
- {konduktor_nightly-0.1.0.dev20251022104926.dist-info → konduktor_nightly-0.1.0.dev20251107104752.dist-info}/RECORD +20 -20
- {konduktor_nightly-0.1.0.dev20251022104926.dist-info → konduktor_nightly-0.1.0.dev20251107104752.dist-info}/LICENSE +0 -0
- {konduktor_nightly-0.1.0.dev20251022104926.dist-info → konduktor_nightly-0.1.0.dev20251107104752.dist-info}/WHEEL +0 -0
- {konduktor_nightly-0.1.0.dev20251022104926.dist-info → konduktor_nightly-0.1.0.dev20251107104752.dist-info}/entry_points.txt +0 -0
konduktor/__init__.py
CHANGED
@@ -11,7 +11,7 @@ from konduktor.task import Task
 __all__ = ['launch', 'Resources', 'Task', 'Serving']
 
 # Replaced with the current commit when building the wheels.
-_KONDUKTOR_COMMIT_SHA = '
+_KONDUKTOR_COMMIT_SHA = '5ceef9b8f579ac23f7a2bd863820aaa2341055e3'
 os.makedirs(os.path.expanduser('~/.konduktor'), exist_ok=True)
 
 
@@ -45,5 +45,5 @@ def _get_git_commit():
 
 
 __commit__ = _get_git_commit()
-__version__ = '1.0.0.dev0.1.0.
+__version__ = '1.0.0.dev0.1.0.dev20251107104752'
 __root_dir__ = os.path.dirname(os.path.abspath(__file__))
konduktor/backends/constants.py
CHANGED
konduktor/backends/deployment.py
CHANGED
@@ -54,8 +54,19 @@ def _wait_for_all_ready(namespace: str, name: str):
     except ApiException:
         services_map = {}
 
-
-
+    autoscalers_map = {}
+    try:
+        autoscaler_obj = deployment_utils.get_autoscaler(namespace, name)
+        if autoscaler_obj:
+            # detect aibrix vs general from deployment labels
+            labels = (deployment.metadata.labels or {}) if deployment else {}
+            is_aibrix = deployment_utils.AIBRIX_NAME_LABEL in labels
+            if is_aibrix:
+                autoscalers_map[name] = {'kpa': autoscaler_obj}
+            else:
+                autoscalers_map[name] = {'hpa': autoscaler_obj}
+    except ApiException:
+        pass
 
     status = deployment_utils.get_model_status(
         name, deployments_map, services_map, autoscalers_map
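Note on the hunk above: the new block classifies a deployment's autoscaler by label before building the status map. A minimal sketch of that branching, assuming deployment_utils.AIBRIX_NAME_LABEL is the model.aibrix.ai/name label that the activator manifest further down selects on (the constant's real value is not shown in this diff):

    # Assumption: this label value mirrors the selector used in aibrix-setup.yaml.
    AIBRIX_NAME_LABEL = 'model.aibrix.ai/name'

    def classify_autoscaler(name: str, labels: dict, autoscaler_obj: object) -> dict:
        # aibrix deployments are tracked under 'kpa', general ones under 'hpa'
        if AIBRIX_NAME_LABEL in labels:
            return {name: {'kpa': autoscaler_obj}}
        return {name: {'hpa': autoscaler_obj}}

    print(classify_autoscaler('llm', {'model.aibrix.ai/name': 'llm'}, 'kpa-object'))
    # {'llm': {'kpa': 'kpa-object'}}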
konduktor/backends/deployment_utils.py
CHANGED
@@ -998,13 +998,13 @@ def get_envoy_external_ip() -> Optional[str]:
 
 
 def get_ingress_nginx_external_ip() -> Optional[str]:
-    """Get the external IP of the ingress-nginx-controller LoadBalancer."""
+    """Get the external IP of the keda-ingress-nginx-controller LoadBalancer."""
     context = kubernetes_utils.get_current_kube_config_context_name()
     core_api = kube_client.core_api(context=context)
     try:
-        # Look for ingress-nginx-controller service in keda namespace
+        # Look for keda-ingress-nginx-controller service in keda namespace
         service = core_api.read_namespaced_service(
-            name='ingress-nginx-controller', namespace='keda'
+            name='keda-ingress-nginx-controller', namespace='keda'
         )
         if service.spec.type == 'LoadBalancer':
             ingress = service.status.load_balancer.ingress
konduktor/backends/jobset_utils.py
CHANGED
@@ -5,15 +5,12 @@ import json
 import tempfile
 import time
 import typing
-from datetime import datetime, timezone
+from datetime import datetime, timedelta, timezone
 from typing import Any, Dict, Optional, Tuple
 
 import click
 import colorama
 
-if typing.TYPE_CHECKING:
-    from datetime import timedelta
-
 import konduktor
 from konduktor import kube_client, logging
 from konduktor.backends import constants as backend_constants
@@ -428,7 +425,9 @@ def _parse_timestamp_filter(timestamp_str: str) -> datetime:
                     seconds=abs(local_offset)
                 )
             else:
-
+                # Handle date-only format (local midnight --> UTC)
+                local_tz = datetime.now().astimezone().tzinfo
+                return dt.replace(tzinfo=local_tz).astimezone(timezone.utc)
             return dt
         except ValueError:
             continue
@@ -450,7 +449,8 @@ def _format_timestamp(timestamp: str) -> str:
 
 
 def _get_job_start_time(job: Dict[str, Any]) -> str:
-
+    status = job.get('status', {})
+    for condition in status.get('conditions', []):
         if condition['reason'] == 'ResumeJobs':
            return condition.get('lastTransitionTime', '')
     return '-'
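Note on the date-only branch above: a bare date like "08/06/25" parses to naive local midnight, which the new code tags with the local timezone and converts to UTC before filtering. A small standalone sketch of that conversion:

    from datetime import datetime, timezone

    # '08/06/25' parses to naive local midnight; tagging it with the local
    # timezone and converting to UTC mirrors the new else-branch above.
    dt = datetime.strptime('08/06/25', '%m/%d/%y')
    local_tz = datetime.now().astimezone().tzinfo
    print(dt.replace(tzinfo=local_tz).astimezone(timezone.utc))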
konduktor/backends/pod_utils.py
CHANGED
@@ -153,7 +153,9 @@ def create_pod_spec(task: 'konduktor.Task') -> Dict[str, Any]:
     git_ssh_secret_name = None
     env_secret_envs = []
     default_secrets = []
+    basename_by_k8s: Dict[str, str] = {}
 
+    # only get own secrets
     user_hash = common_utils.get_user_hash()
     label_selector = f'{backend_constants.SECRET_OWNER_LABEL}={user_hash}'
     user_secrets = kubernetes_utils.list_secrets(
@@ -162,19 +164,36 @@ def create_pod_spec(task: 'konduktor.Task') -> Dict[str, Any]:
 
     for secret in user_secrets:
         kind = kubernetes_utils.get_secret_kind(secret)
+
+        # incase the user modified their secret to have no key:value data
+        if secret.data is None:
+            secret.data = {}
+
+        # fill the map for *all* secrets we see
+        k8s_name = secret.metadata.name
+        lbls = secret.metadata.labels or {}
+        base = lbls.get(
+            backend_constants.SECRET_BASENAME_LABEL,
+            # fallback: strip trailing "-<something>" once if present
+            k8s_name.rsplit('-', 1)[0] if '-' in k8s_name else k8s_name,
+        )
+        basename_by_k8s[k8s_name] = base
+
         if kind == 'git-ssh' and git_ssh_secret_name is None:
             git_ssh_secret_name = secret.metadata.name
         elif kind == 'env':
             env_secret_name = secret.metadata.name
-            key
-
-
-
-            '
-
-
-
+            # iterate ALL keys, not just one (ex. if user made a multi-key env secret)
+            for key, _ in secret.data.items():
+                # wire the env var to read its value from a k8s secret
+                env_secret_envs.append(
+                    {
+                        'name': key,
+                        'valueFrom': {
+                            'secretKeyRef': {'name': env_secret_name, 'key': key}
+                        },
+                    }
+                )
         elif kind == 'default':
             default_secret_name = secret.metadata.name
             basename = secret.metadata.labels.get(
@@ -184,6 +203,22 @@ def create_pod_spec(task: 'konduktor.Task') -> Dict[str, Any]:
                 {'k8s_name': default_secret_name, 'mount_name': basename}
             )
 
+    # Check if the task references KONDUKTOR_DEFAULT_SECRETS and that it exists
+    uses_default_secret_var = (
+        'KONDUKTOR_DEFAULT_SECRETS' in (task.run or '')
+        or 'KONDUKTOR_DEFAULT_SECRETS' in (task.setup or '')
+        or '/konduktor/default-secrets/' in (task.run or '')
+        or '/konduktor/default-secrets/' in (task.setup or '')
+    )
+    if uses_default_secret_var and not default_secrets:
+        raise exceptions.MissingSecretError(
+            f'Task references KONDUKTOR_DEFAULT_SECRETS or '
+            f'/konduktor/default-secrets but '
+            f'user {common_utils.get_cleaned_username()} '
+            f'has no default secrets. Paths like '
+            f'$KONDUKTOR_DEFAULT_SECRETS/<secret_name>/... will not exist.'
+        )
+
     # Inject --served-model-name, --host, and --port into serving run command
     if task.serving and task.run and 'vllm.entrypoints.openai.api_server' in task.run:
         if '--served-model-name' and '--host' and '--port' not in task.run:
@@ -262,31 +297,111 @@ def create_pod_spec(task: 'konduktor.Task') -> Dict[str, Any]:
         },
         temp.name,
     )
+
+    # Capture the template env names BEFORE user config is merged
+    pod_config_template = common_utils.read_yaml(temp.name)
+    tmpl_envs = pod_config_template['kubernetes']['pod_config']['spec'][
+        'containers'
+    ][0].get('env', [])
+    tmpl_env_names = {e['name'] for e in tmpl_envs}
+
     pod_config = common_utils.read_yaml(temp.name)
-    # merge with `~/.konduktor/config.yaml``
+    # merge with `~/.konduktor/config.yaml`` (config.yaml overrides template)
     kubernetes_utils.combine_pod_config_fields(temp.name, pod_config)
     pod_config = common_utils.read_yaml(temp.name)
 
-    #
-
+    # Find what came from user config (appeared after combine, not in template)
+    premerge_envs = pod_config['kubernetes']['pod_config']['spec']['containers'][0].get(
         'env', []
     )
-
+    premerge_names = {e['name'] for e in premerge_envs}
+    config_env_names0 = premerge_names - tmpl_env_names
 
-    #
+    # Build final env list
+    env_map = {env['name']: env for env in premerge_envs}
+
+    # Inject secret envs (env secrets override config.yaml)
     for env in env_secret_envs:
         env_map[env['name']] = env
 
-    # Inject task
+    # Inject task envs
+    # CLI+task.yaml overrides everything else
+    # CLI already overrode task.yaml in other code
     for k, v in task.envs.items():
         env_map[k] = {'name': k, 'value': v}
 
-
-    pod_config['kubernetes']['pod_config']['spec']['containers'][0]['env'] =
-
+    final_envs_list = list(env_map.values())
+    pod_config['kubernetes']['pod_config']['spec']['containers'][0]['env'] = (
+        final_envs_list
     )
+    container = pod_config['kubernetes']['pod_config']['spec']['containers'][0]
+    final_envs = container['env']
+    final_names = {e['name'] for e in final_envs}
 
     logger.debug(f'rendered pod spec: \n\t{json.dumps(pod_config, indent=2)}')
 
+    # 1) Get secret envs actually used in the final env list
+    secret_details = sorted(
+        (e['name'], e['valueFrom']['secretKeyRef']['name'])
+        for e in final_envs
+        if isinstance(e, dict)
+        and e.get('valueFrom', {})
+        and e['valueFrom'].get('secretKeyRef')
+    )
+    secret_names = [n for n, _ in secret_details]
+
+    # 2) Get task-sourced (CLI+task.yaml) envs actually used in the final env list
+    task_all_names = sorted(
+        n
+        for n in (task.envs or {}).keys()
+        if n in final_names and n not in secret_names
+    )
+
+    # 3) Get Config.yaml envs actually used in the final env list
+    config_names = sorted(
+        n
+        for n in config_env_names0
+        if n in final_names and n not in secret_names and n not in task_all_names
+    )
+
+    # 4) Get other envs (template/system) actually used in the final env list
+    other_names = sorted(
+        final_names - set(secret_names) - set(task_all_names) - set(config_names)
+    )
+
+    # Export helper envs for the startup script (names only)
+    def _append_helper(name: str, values):
+        container['env'].append({'name': name, 'value': ','.join(values)})
+
+    # to show user basenames of k8s secrets instead of actual
+    # k8s secret names (which have added suffixes)
+    secret_map_pairs = [
+        f'{var}={basename_by_k8s.get(secret_k8s, secret_k8s)}'
+        for (var, secret_k8s) in secret_details
+    ]
+
+    # Priority order: CLI > task.yaml > env secret > config > template/system
+    _append_helper(
+        'KONDUKTOR_ENV_SECRETS_HOPEFULLY_NO_NAME_COLLISION',
+        secret_names,
+    )
+    _append_helper(
+        'KONDUKTOR_ENV_SECRETS_MAP_HOPEFULLY_NO_NAME_COLLISION',
+        secret_map_pairs,
+    )
+    _append_helper(
+        'KONDUKTOR_ENV_TASK_ALL_HOPEFULLY_NO_NAME_COLLISION',
+        task_all_names,
+    )
+    _append_helper(
+        'KONDUKTOR_ENV_CONFIG_HOPEFULLY_NO_NAME_COLLISION',
+        config_names,
+    )
+    _append_helper(
+        'KONDUKTOR_ENV_OTHER_HOPEFULLY_NO_NAME_COLLISION',
+        other_names,
+    )
+
     # validate pod spec using json schema
     try:
         validator.validate_pod_spec(pod_config['kubernetes']['pod_config']['spec'])
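Note on the env merge above: it is last-writer-wins over a dict keyed by env name, applied in the order template, then config.yaml, then env secrets, then CLI/task.yaml. A toy reproduction with simplified stand-in entries:

    # Toy reproduction of the merge order; entries are simplified stand-ins.
    tmpl = [{'name': 'PYTHONUNBUFFERED', 'value': '0'}]
    config_yaml = [{'name': 'FOO', 'value': 'from-config'}]
    secret_envs = [{'name': 'FOO',
                    'valueFrom': {'secretKeyRef': {'name': 'env-abc', 'key': 'FOO'}}}]
    task_envs = {'FOO': 'from-cli'}

    env_map = {e['name']: e for e in tmpl + config_yaml}  # template, then config.yaml
    for e in secret_envs:                                 # env secrets override config
        env_map[e['name']] = e
    for k, v in task_envs.items():                        # CLI/task.yaml wins overall
        env_map[k] = {'name': k, 'value': v}

    print(env_map['FOO'])  # {'name': 'FOO', 'value': 'from-cli'}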
konduktor/cli.py
CHANGED
@@ -54,6 +54,7 @@ from konduktor import logging
 from konduktor.backends import constants as backend_constants
 from konduktor.backends import deployment_utils, jobset_utils
 from konduktor.utils import (
+    base64_utils,
     common_utils,
     kubernetes_utils,
     log_utils,
@@ -161,7 +162,9 @@ def _make_task_with_overrides(
     if workdir is not None:
         task.workdir = workdir
 
-
+    # perform overrides from CLI
+    if override_params:
+        task.set_resources_override(override_params)
     if task.serving:
         task.set_serving_override(serving_override_params)
 
@@ -653,28 +656,23 @@ def status(
     all_users: bool, limit: Optional[int], after: Optional[str], before: Optional[str]
 ):
     # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
-    """Shows list of all the jobs with optional filtering and pagination
-
-    Args:
-        all_users (bool): whether to show all jobs for all users
-        limit (Optional[int]): maximum number of jobs to display
-        after (Optional[str]): show jobs created after this timestamp
-        before (Optional[str]): show jobs created before this timestamp
+    """Shows list of all the jobs with optional filtering and pagination.
 
+    \b
     Examples:
-
-
-
-
-
-
-
-
-
-
-
-
-
+        konduktor status --limit 10
+        konduktor status --before "08/06/25 03:53PM"
+        konduktor status --all-users --limit 10 --after "08/06/25 03:53PM"
+
+    \b
+    Notes:
+        • When using --before or --after timestamps, "08/06/25"
+          is equivalent to "08/06/25 00:00".
+        • "03:53PM" is equivalent to "03:53:00PM".
+        • Timestamps shown in "konduktor status" are truncated
+          and are in the local timezone.
+          Example: "03:53:55PM" → "03:53PM" — would show up in
+          --after "03:53PM" but not in --before "03:53PM".
     """
     context = kubernetes_utils.get_current_kube_config_context_name()
     namespace = kubernetes_utils.get_kube_config_context_namespace(context)
@@ -802,6 +800,13 @@ def logs(
     # pylint: disable=bad-docstring-quotes
     help='Skip confirmation prompt.',
 )
+@click.option(
+    '--skip-image-check',
+    '-s',
+    is_flag=True,
+    default=False,
+    help='Skip Docker image validation checks for faster startup.',
+)
 def launch(
     entrypoint: Tuple[str, ...],
     dryrun: bool,
@@ -820,6 +825,7 @@ def launch(
     env: List[Tuple[str, str]],
     disk_size: Optional[int],
     yes: bool,
+    skip_image_check: bool,
 ):
     """Launch a task.
 
@@ -829,6 +835,9 @@ def launch(
     # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
     env = _merge_env_vars(env_file, env)
 
+    if skip_image_check:
+        os.environ['KONDUKTOR_SKIP_IMAGE_CHECK'] = '1'
+
     task = _make_task_with_overrides(
         entrypoint=entrypoint,
         name=name,
@@ -1483,12 +1492,21 @@ def create(kind, from_file, from_directory, inline, name):
     data = {}
     if from_directory:
         click.echo(f'Creating secret from directory: {from_directory}')
-
-
-
-
-
-
+        # Use ABSOLUTE directory path so the top-level folder name is preserved
+        base_dir_abs = os.path.abspath(os.path.expanduser(from_directory))
+        if not os.path.isdir(base_dir_abs):
+            raise click.BadParameter(
+                f"--from-directory {from_directory} doesn't exist or is not a directory"
+            )
+        # Ensure there is at least one file inside
+        if not any(p.is_file() for p in pathlib.Path(base_dir_abs).rglob('*')):
+            raise click.BadParameter(f'--from-directory {from_directory} is empty.')
+
+        # Zip + base64 the WHOLE directory (this preserves the inner structure)
+        archive_b64 = base64_utils.zip_base64encode([base_dir_abs])
+
+        # Store as a single key; pod will unzip to the expanded path
+        data = {'payload.zip': archive_b64}
     elif from_file:
         click.echo(f'Creating secret from file: {from_file}')
         key = os.path.basename(from_file)
@@ -1694,6 +1712,13 @@ def serve():
     # pylint: disable=bad-docstring-quotes
     help='Skip confirmation prompt.',
 )
+@click.option(
+    '--skip-image-check',
+    '-s',
+    is_flag=True,
+    default=False,
+    help='Skip Docker image validation checks for faster startup.',
+)
 def serve_launch(
     entrypoint: Tuple[str, ...],
     dryrun: bool,
@@ -1716,6 +1741,7 @@ def serve_launch(
     ports: Optional[int],
     probe: Optional[str],
     yes: bool,
+    skip_image_check: bool = False,
 ):
     """Launch a deployment to serve.
 
@@ -1725,6 +1751,9 @@ def serve_launch(
     # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
     env = _merge_env_vars(env_file, env)
 
+    if skip_image_check:
+        os.environ['KONDUKTOR_SKIP_IMAGE_CHECK'] = '1'
+
     task = _make_task_with_overrides(
         entrypoint=entrypoint,
         name=name,
@@ -1739,6 +1768,7 @@ def serve_launch(
         image_id=image_id,
         env=env,
         disk_size=disk_size,
+        # serving stuff
         min_replicas=min_replicas,
         max_replicas=max_replicas,
         ports=ports,
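Note on the secret-creation hunk above: --from-directory now ships the whole directory as one base64-encoded zip under a single payload.zip key, which the pod startup script later unzips (see pod.yaml.j2 below). A standalone sketch of the round trip; konduktor's real helper is base64_utils.zip_base64encode, and the arcname layout here is an approximation:

    import base64, io, os, zipfile

    def zip_b64(dir_path: str) -> str:
        # Zip the directory tree (skipping .DS_Store, as base64_utils now does),
        # then base64-encode the archive for storage as a k8s secret value.
        buf = io.BytesIO()
        parent = os.path.dirname(os.path.abspath(dir_path))
        with zipfile.ZipFile(buf, 'w') as zf:
            for root, _, files in os.walk(dir_path):
                for f in files:
                    if f == '.DS_Store':
                        continue
                    fp = os.path.join(root, f)
                    zf.write(fp, os.path.relpath(fp, parent))
        return base64.b64encode(buf.getvalue()).decode()

    # data = {'payload.zip': zip_b64('/abs/path/to/secret-dir')}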
konduktor/manifests/aibrix-setup.yaml
CHANGED
@@ -34,6 +34,34 @@ metadata:
   name: aibrix-activator
 ---
 apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: aibrix-activator
+  namespace: aibrix-activator
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  name: aibrix-activator
+rules:
+- apiGroups: ["apps"]
+  resources: ["deployments"]
+  verbs: ["get", "list", "watch"]
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  name: aibrix-activator
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: aibrix-activator
+subjects:
+- kind: ServiceAccount
+  name: aibrix-activator
+  namespace: aibrix-activator
+---
+apiVersion: v1
 kind: ConfigMap
 metadata:
   name: activator-code
@@ -44,16 +72,26 @@ data:
     from collections import defaultdict, deque
     from fastapi import FastAPI, Request
    from fastapi.responses import PlainTextResponse, JSONResponse
+    import asyncio
+    from kubernetes import client, config
 
     NAMESPACE = os.getenv("NAMESPACE", "default")
     WINDOW_SEC = int(os.getenv("WINDOW_SEC", "30"))         # demand lookback
     CAPACITY_RPS = float(os.getenv("CAPACITY_RPS", "1.0"))  # per-replica capacity
     MIN_WAKE = int(os.getenv("MIN_REPLICA_ON_WAKE", "1"))
     MAX_REPLICAS = int(os.getenv("MAX_REPLICAS", "8"))
+    CLEANUP_INTERVAL = int(os.getenv("CLEANUP_INTERVAL", "300"))  # 5 minutes
 
     app = FastAPI()
     events = defaultdict(deque)  # key=(ns,model) -> deque[timestamps]
 
+    # Initialize Kubernetes client
+    try:
+        config.load_incluster_config()
+        k8s_apps_v1 = client.AppsV1Api()
+    except:
+        k8s_apps_v1 = None
+
     def _prune(q, now):
         while q and now - q[0] > WINDOW_SEC: q.popleft()
 
@@ -89,6 +127,48 @@ data:
             pass
         return None
 
+    def _get_existing_deployments():
+        """Get list of existing Aibrix deployments from Kubernetes"""
+        if not k8s_apps_v1:
+            return set()
+        try:
+            deployments = k8s_apps_v1.list_namespaced_deployment(
+                namespace=NAMESPACE,
+                label_selector="model.aibrix.ai/name"
+            )
+            return {d.metadata.name for d in deployments.items}
+        except Exception:
+            return set()
+
+    def _cleanup_stale_entries():
+        """Remove entries for deployments that no longer exist"""
+        if not k8s_apps_v1:
+            return
+        try:
+            existing_deployments = _get_existing_deployments()
+            # Remove entries for deployments that no longer exist
+            keys_to_remove = []
+            for (ns, model) in list(events.keys()):
+                if ns == NAMESPACE and model not in existing_deployments:
+                    keys_to_remove.append((ns, model))
+
+            for key in keys_to_remove:
+                del events[key]
+                print(f"Cleaned up stale entry for deployment: {key[1]}")
+        except Exception as e:
+            print(f"Error during cleanup: {e}")
+
+    async def _cleanup_task():
+        """Background task to periodically clean up stale entries"""
+        while True:
+            await asyncio.sleep(CLEANUP_INTERVAL)
+            _cleanup_stale_entries()
+
+    @app.on_event("startup")
+    async def startup_event():
+        """Start background cleanup task"""
+        asyncio.create_task(_cleanup_task())
+
     # Mirror endpoints (same as your API paths); quick 204 response
     @app.post("/v1/completions")
     @app.post("/v1/chat/completions")
@@ -108,6 +188,37 @@ data:
         _bump(NAMESPACE, model)
         return JSONResponse({"ok": True}, status_code=204)
 
+    # Prometheus-friendly aggregate endpoint: export ALL (ns, model)
+    @app.get("/metrics", response_class=PlainTextResponse)
+    async def metrics_all():
+        lines = []
+        # Idiomatic names
+        lines.append("# HELP vllm_deployment_replicas Number of suggested replicas.")
+        lines.append("# TYPE vllm_deployment_replicas gauge")
+        lines.append("# HELP vllm_observed_rps Incoming requests per second.")
+        lines.append("# TYPE vllm_observed_rps gauge")
+        now = time.time()
+        for (ns, model), q in list(events.items()):
+            _prune(q, now)
+            rps = len(q) / max(WINDOW_SEC, 1)
+            d = _desired(ns, model)
+            lines.append(f'vllm_deployment_replicas{{namespace="{ns}",model_name="{model}"}} {d}')
+            lines.append(f'vllm_observed_rps{{namespace="{ns}",model_name="{model}"}} {rps:.6f}')
+        # (Optional) keep legacy names with colons for back-compat
+        lines.append("# HELP vllm:deployment_replicas Number of suggested replicas.")
+        lines.append("# TYPE vllm:deployment_replicas gauge")
+        lines.append("# HELP vllm:observed_rps Incoming requests per second.")
+        lines.append("# TYPE vllm:observed_rps gauge")
+        now = time.time()
+        for (ns, model), q in list(events.items()):
+            _prune(q, now)
+            rps = len(q) / max(WINDOW_SEC, 1)
+            d = _desired(ns, model)
+            lines.append(f'vllm:deployment_replicas{{namespace="{ns}",model_name="{model}"}} {d}')
+            lines.append(f'vllm:observed_rps{{namespace="{ns}",model_name="{model}"}} {rps:.6f}')
+        return "\n".join(lines) + "\n"
+
+
     # Metrics for KPA and Debugging
     @app.get("/metrics/{ns}/{model}", response_class=PlainTextResponse)
     async def metrics(ns: str, model: str):
@@ -142,7 +253,7 @@ spec:
         command: ["bash","-lc"]
         args:
         - |
-          pip install fastapi uvicorn >/dev/null && \
+          pip install fastapi uvicorn kubernetes >/dev/null && \
          uvicorn activator:app --host 0.0.0.0 --port 8080
         env:
         - { name: NAMESPACE, value: "default" }
@@ -150,10 +261,12 @@ spec:
        - { name: CAPACITY_RPS, value: "1.0" }
        - { name: MIN_REPLICA_ON_WAKE, value: "1" }
        - { name: MAX_REPLICAS, value: "8" }
+       - { name: CLEANUP_INTERVAL, value: "300" }
        ports: [{containerPort: 8080}]
        volumeMounts:
        - { name: code, mountPath: /app/activator.py, subPath: activator.py }
        workingDir: /app
+      serviceAccountName: aibrix-activator
       volumes:
       - name: code
         configMap: { name: activator-code }
@@ -163,6 +276,13 @@ kind: Service
 metadata:
   name: aibrix-activator
   namespace: aibrix-activator
+  annotations:
+    prometheus.io/scrape: "true"
+    prometheus.io/port: "8080"
+    prometheus.io/path: "/metrics"
+  labels:
+    app: aibrix-activator
+    prometheus-discovery: "true"
 spec:
   selector: { app: aibrix-activator }
   ports:
@@ -172,6 +292,42 @@ spec:
     protocol: TCP
   type: ClusterIP
 ---
+apiVersion: monitoring.coreos.com/v1
+kind: ServiceMonitor
+metadata:
+  name: aibrix-activator
+  namespace: prometheus
+  labels:
+    app: aibrix-activator
+spec:
+  selector:
+    matchLabels:
+      app: aibrix-activator
+  namespaceSelector:
+    matchNames:
+    - aibrix-activator
+  endpoints:
+  - port: http
+    path: /metrics
+---
+apiVersion: monitoring.coreos.com/v1
+kind: ServiceMonitor
+metadata:
+  name: vllm-deployments
+  namespace: prometheus
+  labels:
+    app: vllm-deployments
+spec:
+  selector:
+    matchLabels:
+      prometheus-discovery: "true"
+  namespaceSelector:
+    matchNames:
+    - default
+  endpoints:
+  - port: serve
+    path: /metrics
+---
 apiVersion: gateway.networking.k8s.io/v1beta1
 kind: ReferenceGrant
 metadata:
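Note on the activator code above: _desired() is referenced by the new /metrics endpoint but is not shown in this diff. Assuming the usual demand-over-capacity formula implied by the surrounding env vars, it would look roughly like:

    import math

    # Assumed shape of _desired() (not shown in this hunk): requests per second
    # over the lookback window, divided by per-replica capacity, clamped to
    # [MIN_REPLICA_ON_WAKE, MAX_REPLICAS].
    WINDOW_SEC, CAPACITY_RPS, MIN_WAKE, MAX_REPLICAS = 30, 1.0, 1, 8

    def desired(event_count: int) -> int:
        rps = event_count / max(WINDOW_SEC, 1)
        return min(MAX_REPLICAS, max(MIN_WAKE, math.ceil(rps / CAPACITY_RPS)))

    print(desired(45))  # 45 requests in 30s -> 1.5 rps -> 2 replicas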
konduktor/manifests/apoxy-setup2.yaml
CHANGED
@@ -59,7 +59,7 @@ metadata:
   name: UNIQUE-TEMPNAME-backend2
 spec:
   endpoints:
-  - fqdn: ingress-nginx-controller.keda.UNIQUE-TEMPNAME.tun.apoxy.net
+  - fqdn: keda-ingress-nginx-controller.keda.UNIQUE-TEMPNAME.tun.apoxy.net
 ---
 # HTTPRoute for general deployments
 apiVersion: gateway.apoxy.dev/v1
konduktor/resource.py
CHANGED
@@ -59,6 +59,8 @@ class Resources:
         # Internal use only.
         # pylint: disable=invalid-name
         _cluster_config_overrides: Optional[Dict[str, Any]] = None,
+        # used to prevent double validation of image (would happen from overrides)
+        _validate_image: bool = True,
     ):
         """Initialize a Resources object.
 
@@ -124,7 +126,8 @@ class Resources:
         if isinstance(image_id, str):
             self._image_id = image_id.strip()
             # Validate Docker image format and existence
-
+            if _validate_image:
+                validator.validate_and_warn_image(self._image_id, 'task')
 
         self._labels = labels
         self._cluster_config_overrides = _cluster_config_overrides
@@ -435,15 +438,19 @@ class Resources:
 
     def copy(self, **override) -> 'Resources':
         """Returns a copy of the given Resources."""
+        # used to prevent double validation of image (would happen from overrides)
+        new_image_id = override.pop('image_id', self.image_id)
         resources = Resources(
             cloud=override.pop('cloud', self.cloud),
             cpus=override.pop('cpus', self._cpus),
             memory=override.pop('memory', self.memory),
             accelerators=override.pop('accelerators', self.accelerators),
             disk_size=override.pop('disk_size', self.disk_size),
-            image_id=
+            image_id=new_image_id,
             labels=override.pop('labels', self.labels),
             job_config=override.pop('job_config', self.job_config),
+            # used to prevent double validation of image (would happen from overrides)
+            _validate_image=(new_image_id != self.image_id),
         )
         assert len(override) == 0
         return resources
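Note on the copy() hunk above: the image is re-validated only when the override actually changes it. A toy illustration of the guard, with a print standing in for the validator call:

    # Toy illustration of the double-validation guard in Resources.copy().
    class Res:
        def __init__(self, image_id: str, _validate_image: bool = True):
            self.image_id = image_id
            if _validate_image:
                print(f'validating {image_id}')  # stands in for the validator call

        def copy(self, **override) -> 'Res':
            new_image_id = override.pop('image_id', self.image_id)
            return Res(new_image_id,
                       _validate_image=(new_image_id != self.image_id))

    r = Res('pytorch/pytorch:2.4.0-cuda12.4-cudnn9-runtime')  # validates once
    r.copy()  # same image -> no second validation
    r.copy(image_id='other:latest')  # changed image -> validated again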
konduktor/task.py
CHANGED
@@ -29,7 +29,7 @@ import konduktor
 from konduktor import constants, logging
 from konduktor.data import data_utils
 from konduktor.data import storage as storage_lib
-from konduktor.utils import common_utils, exceptions, schemas, ux_utils
+from konduktor.utils import common_utils, exceptions, schemas, ux_utils
 
 logger = logging.get_logger(__name__)
 
@@ -388,10 +388,6 @@ class Task:
         )
         resources_config['_cluster_config_overrides'] = cluster_config_override
 
-        # Validate Docker image if specified in resources
-        if 'image_id' in resources_config and resources_config['image_id']:
-            validator.validate_and_warn_image(resources_config['image_id'], 'task')
-
         task.set_resources(konduktor.Resources.from_yaml_config(resources_config))
 
         # Parse serving field.
konduktor/templates/deployment.yaml.j2
CHANGED
@@ -41,11 +41,9 @@ metadata:
     {{ deployment_name_label }}: "{{ name }}"
     {{ deployment_user_label }}: "{{ user }}"
     trainy.ai/has-autoscaler: "{{ autoscaler }}"
-  {% if not general %}
   annotations:
     prometheus.io/scrape: "true"
-    prometheus.io/port: "
-  {% endif %}
+    prometheus.io/port: "9000"
   name: {{ name }}
   namespace: default
 spec:
@@ -142,6 +140,10 @@ apiVersion: networking.k8s.io/v1
 kind: Ingress
 metadata:
   name: {{ name }}-ingress
+  labels:
+    {{ deployment_name_label }}: "{{ name }}"
+    {{ deployment_user_label }}: "{{ user }}"
+    trainy.ai/konduktor-managed: "true"
   annotations:
     nginx.ingress.kubernetes.io/use-regex: "true"
     nginx.ingress.kubernetes.io/rewrite-target: /$1
konduktor/templates/pod.yaml.j2
CHANGED
@@ -28,16 +28,21 @@ kubernetes:
         containers:
           # TODO(asaiacai): should decide here whether we add the fabric interfaces/containers init etc.
           - name: konduktor-container
-            {% if enable_ssh %}
+            {% if enable_ssh or serving %}
             ports:
+            {% if enable_ssh %}
              - name: ssh
                containerPort: {{ konduktor_ssh_port }}
+            {% endif %}
+
+            {% if serving %}
+              - name: serving
+                containerPort: {{ ports }}
+            {% endif %}
             {% endif %}
-
-
-              - containerPort: {{ ports }}
+
+            {% if serving and probe %}
             # TODO (ryan): allow modification of thresholds and timings
-            {% if probe %}
             livenessProbe:
               httpGet:
                 path: {{ probe }}
@@ -68,7 +73,6 @@ kubernetes:
               successThreshold: 1
               timeoutSeconds: 1
             {% endif %}
-            {% endif %}
             image: {{ image_id }}
             # this is set during jobset definition since we need to know the jobset
             # name and number of nodes to set all the environment variables correctly here
@@ -77,6 +81,10 @@ kubernetes:
             # flush logs immediately to stdout for more reactive log streaming
             - name: PYTHONUNBUFFERED
               value: "0"
+            - name: KONDUKTOR_NODENAME
+              valueFrom:
+                fieldRef:
+                  fieldPath: spec.nodeName
             - name: KONDUKTOR_JOB_NAME
               value: "{{ job_name }}"
             - name: NODE_HOST_IPS
@@ -134,6 +142,8 @@ kubernetes:
             {% if default_secrets %}
             - name: KONDUKTOR_DEFAULT_SECRETS
               value: "/konduktor/default-secrets"
+            - name: KONDUKTOR_DEFAULT_SECRETS_EXPANDED
+              value: "/run/konduktor/expanded-default-secrets"
             {% endif %}
             # these are for compatibility with skypilot
             - name: SKYPILOT_NODE_IPS
@@ -146,6 +156,10 @@ kubernetes:
               value: "{{ num_nodes }}"
             - name: SKYPILOT_NUM_GPUS_PER_NODE
               value: "{{ num_gpus }}"
+            - name: RESTART_ATTEMPT
+              valueFrom:
+                fieldRef:
+                  fieldPath: metadata.labels['jobset.sigs.k8s.io/restart-attempt']
             volumeMounts:
             - name: shared-memory
               mountPath: /dev/shm
@@ -159,6 +173,10 @@ kubernetes:
            - name: default-secret-{{ secret.mount_name }}
              mountPath: /konduktor/default-secrets/{{ secret.mount_name }}
            {% endfor %}
+           {% if default_secrets %}
+           - name: default-secrets-expanded
+             mountPath: /run/konduktor/expanded-default-secrets
+           {% endif %}
            {% if git_ssh %}
            - name: git-ssh-secret
              mountPath: /run/konduktor/git-ssh-secret
@@ -192,7 +210,7 @@ kubernetes:
              {% if 'curl' in run_cmd or 'curl' in setup_cmd or tailscale_secret %}
              PACKAGES="$PACKAGES curl";
              {% endif %}
-             {% if 'gs' in mount_secrets or 's3' in mount_secrets %}
+             {% if 'gs' in mount_secrets or 's3' in mount_secrets or default_secrets %}
              PACKAGES="$PACKAGES unzip wget";
              {% endif %}
              {% if 'git' in run_cmd or 'git' in setup_cmd %}
@@ -231,7 +249,7 @@ kubernetes:
              fi;
              end_epoch=$(date +%s);
 
-             echo "Exposing ENV variables"
+             echo "===== KONDUKTOR: Exposing ENV variables ====="
              $(prefix_cmd) env -0 | awk -v RS='\0' '
              {
                gsub(/\\/,"\\\\"); # escape existing backslashes first
@@ -346,8 +364,41 @@ kubernetes:
 
              $(prefix_cmd) echo "===== KONDUKTOR: Installing packages took $((end_epoch - start_epoch)) seconds ====="
 
+             $(prefix_cmd) echo "===== KONDUKTOR: Environment variable summary ====="
+             start_epoch=$(date +%s);
+
+             print_bucket () {
+               title="$1"; list="${2:-}"
+               echo "--- $title ---"
+               if [ -n "$list" ]; then
+                 echo "$list" | tr ',' '\n' | sed "s/^/[$title] /"
+               else
+                 echo "[none]"
+               fi
+             }
+
+             # Secrets: prefer detailed mapping if available
+             echo "--- env secret ---"
+             if [ -n "${KONDUKTOR_ENV_SECRETS_MAP_HOPEFULLY_NO_NAME_COLLISION:-}" ]; then
+               echo "${KONDUKTOR_ENV_SECRETS_MAP_HOPEFULLY_NO_NAME_COLLISION}" \
+                 | tr ',' '\n' \
+                 | awk -F'=' '{ printf("[secret: %s] %s\n", $2, $1) }'
+             elif [ -n "${KONDUKTOR_ENV_SECRETS_HOPEFULLY_NO_NAME_COLLISION:-}" ]; then
+               echo "${KONDUKTOR_ENV_SECRETS_HOPEFULLY_NO_NAME_COLLISION}" \
+                 | tr ',' '\n' | sed 's/^/[secret] /'
+             else
+               echo "[none]"
+             fi
+
+             print_bucket "CLI + task.yaml" "${KONDUKTOR_ENV_TASK_ALL_HOPEFULLY_NO_NAME_COLLISION}"
+             print_bucket "config.yaml" "${KONDUKTOR_ENV_CONFIG_HOPEFULLY_NO_NAME_COLLISION}"
+             print_bucket "other" "${KONDUKTOR_ENV_OTHER_HOPEFULLY_NO_NAME_COLLISION}"
+
+             end_epoch=$(date +%s);
+             $(prefix_cmd) echo "===== KONDUKTOR: Environment variable summary took $((end_epoch - start_epoch)) seconds ====="
+
              # unpack secrets credentials
-             $(prefix_cmd) echo "===== KONDUKTOR: Unpacking
+             $(prefix_cmd) echo "===== KONDUKTOR: Unpacking cloud storage secret credentials ====="
             start_epoch=$(date +%s);
             mkdir -p ~/.konduktor
             mkdir -p {{ remote_workdir }}
@@ -362,12 +413,71 @@ kubernetes:
             $(prefix_cmd) unzip /run/konduktor/s3-secret/awscredentials -d ~/.aws
             {% endif %}
             {% endfor %}
+
+            {% if default_secrets %}
+            $(prefix_cmd) echo "===== KONDUKTOR: Unpacking default secrets ====="
+            $(prefix_cmd) mkdir -p "${KONDUKTOR_DEFAULT_SECRETS_EXPANDED}"
+
+            # For each mounted default secret folder:
+            #  - if payload.zip exists, unzip it into the expanded dir
+            #  - otherwise, copy the files as-is
+            for src in "${KONDUKTOR_DEFAULT_SECRETS}"/*; do
+              [ -d "$src" ] || continue
+              name="$(basename "$src")"
+              dst="${KONDUKTOR_DEFAULT_SECRETS_EXPANDED}/${name}"
+              $(prefix_cmd) mkdir -p "$dst"
+
+              if [ -f "${src}/payload.zip" ]; then
+                $(prefix_cmd) unzip -oq "${src}/payload.zip" -d "$dst"
+              else
+                $(prefix_cmd) cp -a "${src}/." "$dst/"
+              fi
+            done
+
+            # Point callers to the expanded (writable) path going forward
+            export KONDUKTOR_DEFAULT_SECRETS="${KONDUKTOR_DEFAULT_SECRETS_EXPANDED}"
+            $(prefix_cmd) echo "KONDUKTOR_DEFAULT_SECRETS=${KONDUKTOR_DEFAULT_SECRETS_EXPANDED}" >> /etc/environment
+            {% endif %}
+
             {% if git_ssh %}
             $(prefix_cmd) echo "Unpacking GIT-SSH secret"
             {% endif %}
             end_epoch=$(date +%s);
             $(prefix_cmd) echo "===== KONDUKTOR: Unpacking secrets credentials took $((end_epoch - start_epoch)) seconds ====="
 
+            $(prefix_cmd) echo "===== KONDUKTOR: Default secret summary ====="
+            start_epoch=$(date +%s)
+
+            root="${KONDUKTOR_DEFAULT_SECRETS:-}"
+            if [[ -z "$root" || ! -d "$root" ]]; then
+              $(prefix_cmd) echo "NO DEFAULT SECRETS FOUND."
+            else
+              for dir in "$root"/*; do
+                [ -d "$dir" ] || continue
+                name="$(basename "$dir")"
+
+                # Pretty header that mirrors the logical mount base:
+                $(prefix_cmd) echo "/konduktor/default-secrets/${name}:"
+
+                # Print relative paths only; skip macOS junk and k8s secret internals
+                (
+                  cd "$dir"
+                  out="$(find . \
+                    \( -name '.DS_Store' -o -name '__MACOSX' -o -name '..data' -o -name '..*' \) -prune -o \
+                    \( -type f -o -type l \) -print \
+                    | sed 's|^\./||' \
+                    | sort)"
+                  if [ -n "$out" ]; then
+                    printf "%s\n" "$out"
+                  fi
+                )
+              done
+            fi
+
+            end_epoch=$(date +%s)
+            $(prefix_cmd) echo "===== KONDUKTOR: Default secret summary took $((end_epoch - start_epoch)) seconds ====="
+
+
             # sync file mounts
             {% for mkdir_command in mkdir_commands %}
             $(prefix_cmd) {{ mkdir_command }}
@@ -436,6 +546,10 @@ kubernetes:
           secret:
             secretName: {{ secret.k8s_name }}
         {% endfor %}
+        {% if default_secrets %}
+        - name: default-secrets-expanded
+          emptyDir: {}
+        {% endif %}
        {% if git_ssh %}
        - name: git-ssh-secret
          secret:
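Note on the startup script above: each mounted default secret is expanded into a writable emptyDir, unzipping payload.zip when present and copying files verbatim otherwise. A Python rendering of that shell loop for readability (the pod actually runs the shell shown above, not this):

    import os, shutil, zipfile

    def expand_default_secrets(mounted: str, expanded: str) -> None:
        # Mirror of the shell loop: unzip payload.zip if present,
        # otherwise copy the mounted files as-is.
        os.makedirs(expanded, exist_ok=True)
        for name in os.listdir(mounted):
            src = os.path.join(mounted, name)
            if not os.path.isdir(src):
                continue
            dst = os.path.join(expanded, name)
            os.makedirs(dst, exist_ok=True)
            payload = os.path.join(src, 'payload.zip')
            if os.path.isfile(payload):
                with zipfile.ZipFile(payload) as zf:
                    zf.extractall(dst)
            else:
                shutil.copytree(src, dst, dirs_exist_ok=True)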
konduktor/utils/base64_utils.py
CHANGED
@@ -44,6 +44,8 @@ def zip_base64encode(files: List[str]) -> str:
         else:
             for root, _, files in os.walk(item_path):
                 for file in files:
+                    if file == '.DS_Store':
+                        continue
                     file_path = os.path.join(root, file)
                     arcname = os.path.relpath(file_path, temp_dir)
                     zipf.write(file_path, arcname)
konduktor/utils/validator.py
CHANGED
@@ -37,6 +37,11 @@ SCHEMA_URLS = {
 logger = logging.get_logger(__name__)
 
 
+def _skip_image_checks() -> bool:
+    val = os.getenv('KONDUKTOR_SKIP_IMAGE_CHECK', '')
+    return val.lower() in ('1', 'true', 'yes', 'y')
+
+
 def case_insensitive_enum(validator, enums, instance, schema):
     del validator, schema  # Unused.
     if instance.lower() not in [enum.lower() for enum in enums]:
@@ -419,6 +424,13 @@ def validate_and_warn_image(image_id: str, context: str = 'task') -> None:
     if not image_id:
         return
 
+    if _skip_image_checks():
+        logger.info(
+            'Skipping Docker image validation for %s',
+            image_id,
+        )
+        return
+
     status, message = validate_docker_image(image_id)
 
     if status == 'invalid':
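Note on the validator hunk above: image validation can now be bypassed via KONDUKTOR_SKIP_IMAGE_CHECK, which the new --skip-image-check CLI flag sets to '1'. A quick usage sketch of the gate:

    import os

    def _skip_image_checks() -> bool:
        # same truthy spellings the new helper accepts
        return os.getenv('KONDUKTOR_SKIP_IMAGE_CHECK', '').lower() in (
            '1', 'true', 'yes', 'y'
        )

    os.environ['KONDUKTOR_SKIP_IMAGE_CHECK'] = 'True'
    print(_skip_image_checks())  # True -> validate_and_warn_image returns early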
{konduktor_nightly-0.1.0.dev20251022104926.dist-info → konduktor_nightly-0.1.0.dev20251107104752.dist-info}/RECORD
CHANGED
@@ -1,4 +1,4 @@
-konduktor/__init__.py,sha256=
+konduktor/__init__.py,sha256=mHmTi0owXeaxTt6NwGboUKlwfKWw6xwzbdcUjq9-1DM,1574
 konduktor/adaptors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 konduktor/adaptors/aws.py,sha256=s47Ra-GaqCQibzVfmD0pmwEWHif1EGO5opMbwkLxTCU,8244
 konduktor/adaptors/common.py,sha256=ZIqzjx77PIHUwpjfAQ1uX8B2aX78YMuGj4Bppd-MdyM,4183
@@ -6,14 +6,14 @@ konduktor/adaptors/gcp.py,sha256=ierTF4z7vwpJ9BsC7LSiwv4uLcjGXscwZOwQrddr2vM,410
 konduktor/authentication.py,sha256=_mVy3eqoKohicHostFiGwG1-2ybxP-l7ouofQ0LRlCY,4570
 konduktor/backends/__init__.py,sha256=usWJ8HdZJEyg7MIsN8Zcz9rk9e2Lq5dWJ8dv6hCN3ys,199
 konduktor/backends/backend.py,sha256=qh0bp94lzoTYZkzyQv2-CVrB5l91FkG2vclXg24UFC0,2910
-konduktor/backends/constants.py,sha256=
-konduktor/backends/deployment.py,sha256=
-konduktor/backends/deployment_utils.py,sha256=
+konduktor/backends/constants.py,sha256=uAU-edQ_9DNYnu6x7fwNYXIEM7KMfJMOrnG74rlZ8mY,766
+konduktor/backends/deployment.py,sha256=d0a3F7dxDbnRKIt4ZO_kQ0_vet0pZvg4bWYzVZ8DZIQ,6640
+konduktor/backends/deployment_utils.py,sha256=9CmB9CYC_3wxIfIOmTSCN2hbURZ5MpEMTvPwYMUXBRM,49272
 konduktor/backends/jobset.py,sha256=drt8Gc0iYQx18JWXBU6XfhUvC2xCKd8szSJ2JC4O20Q,8640
-konduktor/backends/jobset_utils.py,sha256=
-konduktor/backends/pod_utils.py,sha256=
+konduktor/backends/jobset_utils.py,sha256=g49NY8RFhL_NNd4c1adRLG_Bq3UTFtRURxcAzxnMEYw,26524
+konduktor/backends/pod_utils.py,sha256=kOi3cLbTI3abZFCNQswWrkrOiBBm3gW_9N4INjxeS-w,19276
 konduktor/check.py,sha256=JennyWoaqSKhdyfUldd266KwVXTPJpcYQa4EED4a_BA,7569
-konduktor/cli.py,sha256=
+konduktor/cli.py,sha256=B3Pp3RCwkGj8r9YgH-TgC85XU4zcc3eema1kpcDTQ3I,58452
 konduktor/config.py,sha256=9upqgCCYvcu6fKw7tovEYC1MWTkAAir0_WHPdayylbI,15536
 konduktor/constants.py,sha256=T3AeXXxuQHINW_bAWyztvDeS8r4g8kXBGIwIq13cys0,1814
 konduktor/controller/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -68,25 +68,25 @@ konduktor/data/storage_utils.py,sha256=n4GivkN0KMqmyOTDznF0Z-hzsJvm7KCEh5i5HgFAT
 konduktor/execution.py,sha256=d0EP79iSrW2uFsoqn0YV_4kgIupPIqpMOParXx0y3kg,18519
 konduktor/kube_client.py,sha256=HtM3d-_GigHnfGINRANchApR9_OigqczBgeYJ6Dj4j0,8504
 konduktor/logging.py,sha256=xtcCdnecmC3rqMTyunK-klQRINojI7NI4Apag78i9jM,3221
-konduktor/manifests/aibrix-setup.yaml,sha256=
+konduktor/manifests/aibrix-setup.yaml,sha256=Foe3M1C0zVt-CVSJGr5SlQkMaNGs9kU2CvIZoANm3f8,14133
 konduktor/manifests/apoxy-setup.yaml,sha256=EipknCq33aBdxu9BIo6y5novjO0B_d_DCWqY44zYNuU,4262
-konduktor/manifests/apoxy-setup2.yaml,sha256=
+konduktor/manifests/apoxy-setup2.yaml,sha256=fc1tDwVopPVFzvUygkqxDGVqHHeo1cF9ERTnzUdgaGs,2517
 konduktor/manifests/controller_deployment.yaml,sha256=6p3oSLkEVONZsvKZGqVop0Dhn4bo3lrigRmhf8NXBHE,1730
 konduktor/manifests/dashboard_deployment.yaml,sha256=xJLd4FbPMAosI0fIv5_8y7dV9bw0Vsf81l-w4MB_aU8,2837
 konduktor/manifests/dmesg_daemonset.yaml,sha256=pSWt7YOeTYjS0l0iki1fvHOs7MhY-sH-RQfVW6JJyno,1391
 konduktor/manifests/pod_cleanup_controller.yaml,sha256=hziL1Ka1kCAEL9R7Tjvpb80iw1vcq9_3gwHCu75Bi0A,3939
-konduktor/resource.py,sha256=
+konduktor/resource.py,sha256=JqEE3LZiBBd5vqAiHDk-nlLve_VUQHhgdo6BIgx2Xfk,21215
 konduktor/serving.py,sha256=4s8cQhsVjf-HByZF65pbMxuqaV319hUSQE9pC8gP4Sg,5405
-konduktor/task.py,sha256=
-konduktor/templates/deployment.yaml.j2,sha256=
+konduktor/task.py,sha256=FIWm_rC_63GPBoe-Hi8a_eJ0H8Szw747SwXYPrNtOWE,37820
+konduktor/templates/deployment.yaml.j2,sha256=0Cer53I8YHtYgUeEBQ_NVgC36FdOcjMNejgfP8teJC4,4964
 konduktor/templates/jobset.yaml.j2,sha256=NQcVeRNsTLLmTnJRnkL1vr45mSeth-b11YShXn_RoSg,1323
-konduktor/templates/pod.yaml.j2,sha256=
+konduktor/templates/pod.yaml.j2,sha256=gGYwdXsPxStiua9Mm-OF7byVfuKjcH-TYNjoQmdpX_Q,24107
 konduktor/usage/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 konduktor/usage/constants.py,sha256=gCL8afIHZhO0dcxbJGpESE9sCC1cBSbeRnQ8GwNOY4M,612
 konduktor/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 konduktor/utils/accelerator_registry.py,sha256=ythz3ynulP1DSSU7Jj5VUsQeBzSYRkxCVDZ5oOg0xtc,560
 konduktor/utils/annotations.py,sha256=oy2-BLydkFt3KWkXDuaGY84d6b7iISuy4eAT9uXk0Fc,2225
-konduktor/utils/base64_utils.py,sha256=
+konduktor/utils/base64_utils.py,sha256=TzKxe_SPHrurJ1lTOwOi4OmGUNkGxomstFcTWcWLQhw,3223
 konduktor/utils/common_utils.py,sha256=8gBpzYiC1bQ8sbgHIFLkKCGT5nLs1afpejod60kVSos,15076
 konduktor/utils/constants.py,sha256=1DneiTR21lvKUcWdBGwC4I4fD4uPjbjLUilEnJS7rzA,216
 konduktor/utils/env_options.py,sha256=T41Slzf4Mzl-n45CGXXqdy2fCrYhPNZQ7RP5vmnN4xc,2258
@@ -99,9 +99,9 @@ konduktor/utils/rich_utils.py,sha256=ycADW6Ij3wX3uT8ou7T8qxX519RxlkJivsLvUahQaJo
 konduktor/utils/schemas.py,sha256=cr39nEAgjluhXoUYnvIwCwLBH8rLds37MBsF1uQv1rw,19067
 konduktor/utils/subprocess_utils.py,sha256=WoFkoFhGecPR8-rF8WJxbIe-YtV94LXz9UG64SDhCY4,9448
 konduktor/utils/ux_utils.py,sha256=LSH4b5lckD157qDF4keThxtkGdxNrAfGKmH1ewhZkm4,8646
-konduktor/utils/validator.py,sha256=
-konduktor_nightly-0.1.0.
-konduktor_nightly-0.1.0.
-konduktor_nightly-0.1.0.
-konduktor_nightly-0.1.0.
-konduktor_nightly-0.1.0.
+konduktor/utils/validator.py,sha256=UcLvZCk9Cpbbhw8r_ZJtTpMSTfY1NKqcyciKsPzRPZM,17222
+konduktor_nightly-0.1.0.dev20251107104752.dist-info/LICENSE,sha256=MuuqTZbHvmqXR_aNKAXzggdV45ANd3wQ5YI7tnpZhm0,6586
+konduktor_nightly-0.1.0.dev20251107104752.dist-info/METADATA,sha256=EEA9KjVBKhzBk4hO1-mWEacCmBul0d5GqMbB_VUKWbQ,4247
+konduktor_nightly-0.1.0.dev20251107104752.dist-info/WHEEL,sha256=XbeZDeTWKc1w7CSIyre5aMDU_-PohRwTQceYnisIYYY,88
+konduktor_nightly-0.1.0.dev20251107104752.dist-info/entry_points.txt,sha256=k3nG5wDFIJhNqsZWrHk4d0irIB2Ns9s47cjRWYsTCT8,48
+konduktor_nightly-0.1.0.dev20251107104752.dist-info/RECORD,,
File without changes
File without changes