konduktor-nightly 0.1.0.dev20251030104830__tar.gz → 0.1.0.dev20251211105235__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/PKG-INFO +2 -1
- {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/__init__.py +2 -2
- {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/backends/constants.py +1 -0
- {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/backends/deployment.py +13 -2
- {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/backends/deployment_utils.py +3 -3
- {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/backends/jobset_utils.py +2 -1
- {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/backends/pod_utils.py +147 -27
- {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/cli.py +303 -301
- {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/manifests/aibrix-setup.yaml +157 -1
- {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/manifests/apoxy-setup2.yaml +1 -1
- {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/templates/deployment.yaml.j2 +5 -3
- {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/templates/pod.yaml.j2 +123 -9
- {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/utils/base64_utils.py +2 -0
- {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/utils/log_utils.py +8 -5
- {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/pyproject.toml +2 -1
- {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/LICENSE +0 -0
- {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/README.md +0 -0
- {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/adaptors/__init__.py +0 -0
- {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/adaptors/aws.py +0 -0
- {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/adaptors/common.py +0 -0
- {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/adaptors/gcp.py +0 -0
- {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/authentication.py +0 -0
- {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/backends/__init__.py +0 -0
- {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/backends/backend.py +0 -0
- {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/backends/jobset.py +0 -0
- {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/check.py +0 -0
- {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/config.py +0 -0
- {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/constants.py +0 -0
- {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/controller/__init__.py +0 -0
- {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/controller/constants.py +0 -0
- {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/controller/launch.py +0 -0
- {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/controller/node.py +0 -0
- {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/controller/parse.py +0 -0
- {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/dashboard/README.md +0 -0
- {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/dashboard/backend/main.py +0 -0
- {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/dashboard/backend/sockets.py +0 -0
- {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/dashboard/frontend/.eslintrc.json +0 -0
- {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/dashboard/frontend/.gitignore +0 -0
- {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/dashboard/frontend/app/api/jobs/route.js +0 -0
- {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/dashboard/frontend/app/api/namespaces/route.js +0 -0
- {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/dashboard/frontend/app/components/Grafana.jsx +0 -0
- {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/dashboard/frontend/app/components/JobsData.jsx +0 -0
- {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/dashboard/frontend/app/components/LogsData.jsx +0 -0
- {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/dashboard/frontend/app/components/NavMenu.jsx +0 -0
- {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/dashboard/frontend/app/components/NavTabs.jsx +0 -0
- {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/dashboard/frontend/app/components/NavTabs2.jsx +0 -0
- {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/dashboard/frontend/app/components/SelectBtn.jsx +0 -0
- {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/dashboard/frontend/app/components/lib/utils.js +0 -0
- {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/dashboard/frontend/app/components/ui/chip-select.jsx +0 -0
- {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/dashboard/frontend/app/components/ui/input.jsx +0 -0
- {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/dashboard/frontend/app/components/ui/navigation-menu.jsx +0 -0
- {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/dashboard/frontend/app/components/ui/select.jsx +0 -0
- {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/dashboard/frontend/app/favicon.ico +0 -0
- {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/dashboard/frontend/app/globals.css +0 -0
- {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/dashboard/frontend/app/jobs/page.js +0 -0
- {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/dashboard/frontend/app/layout.js +0 -0
- {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/dashboard/frontend/app/logs/page.js +0 -0
- {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/dashboard/frontend/app/page.js +0 -0
- {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/dashboard/frontend/jsconfig.json +0 -0
- {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/dashboard/frontend/next.config.mjs +0 -0
- {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/dashboard/frontend/package-lock.json +0 -0
- {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/dashboard/frontend/package.json +0 -0
- {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/dashboard/frontend/postcss.config.mjs +0 -0
- {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/dashboard/frontend/server.js +0 -0
- {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/dashboard/frontend/tailwind.config.js +0 -0
- {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/data/__init__.py +0 -0
- {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/data/aws/__init__.py +0 -0
- {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/data/aws/s3.py +0 -0
- {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/data/constants.py +0 -0
- {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/data/data_utils.py +0 -0
- {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/data/gcp/__init__.py +0 -0
- {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/data/gcp/constants.py +0 -0
- {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/data/gcp/gcs.py +0 -0
- {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/data/gcp/utils.py +0 -0
- {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/data/registry.py +0 -0
- {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/data/storage.py +0 -0
- {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/data/storage_utils.py +0 -0
- {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/execution.py +0 -0
- {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/kube_client.py +0 -0
- {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/logging.py +0 -0
- {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/manifests/apoxy-setup.yaml +0 -0
- {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/manifests/controller_deployment.yaml +0 -0
- {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/manifests/dashboard_deployment.yaml +0 -0
- {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/manifests/dmesg_daemonset.yaml +0 -0
- {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/manifests/pod_cleanup_controller.yaml +0 -0
- {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/resource.py +0 -0
- {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/serving.py +0 -0
- {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/task.py +0 -0
- {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/templates/jobset.yaml.j2 +0 -0
- {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/usage/__init__.py +0 -0
- {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/usage/constants.py +0 -0
- {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/utils/__init__.py +0 -0
- {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/utils/accelerator_registry.py +0 -0
- {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/utils/annotations.py +0 -0
- {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/utils/common_utils.py +0 -0
- {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/utils/constants.py +0 -0
- {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/utils/env_options.py +0 -0
- {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/utils/exceptions.py +0 -0
- {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/utils/kubernetes_enums.py +0 -0
- {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/utils/kubernetes_utils.py +0 -0
- {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/utils/loki_utils.py +0 -0
- {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/utils/rich_utils.py +0 -0
- {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/utils/schemas.py +0 -0
- {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/utils/subprocess_utils.py +0 -0
- {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/utils/ux_utils.py +0 -0
- {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/utils/validator.py +0 -0
{konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/PKG-INFO
RENAMED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: konduktor-nightly
|
|
3
|
-
Version: 0.1.0.dev20251030104830
|
|
3
|
+
Version: 0.1.0.dev20251211105235
|
|
4
4
|
Summary: GPU Cluster Health Management
|
|
5
5
|
Author: Andrew Aikawa
|
|
6
6
|
Author-email: asai@berkeley.edu
|
|
@@ -29,6 +29,7 @@ Requires-Dist: prettytable (>=3.12.0,<4.0.0)
|
|
|
29
29
|
Requires-Dist: psutil (>=7.0.0,<8.0.0)
|
|
30
30
|
Requires-Dist: python-dotenv (>=1.0.1,<2.0.0)
|
|
31
31
|
Requires-Dist: rich (>=13.9.4,<14.0.0)
|
|
32
|
+
Requires-Dist: sniffio (>=1.3,<2.0)
|
|
32
33
|
Requires-Dist: websockets (>=15.0.1,<16.0.0)
|
|
33
34
|
Description-Content-Type: text/markdown
|
|
34
35
|
|
|
@@ -11,7 +11,7 @@ from konduktor.task import Task
|
|
|
11
11
|
__all__ = ['launch', 'Resources', 'Task', 'Serving']
|
|
12
12
|
|
|
13
13
|
# Replaced with the current commit when building the wheels.
|
|
14
|
-
_KONDUKTOR_COMMIT_SHA = '
|
|
14
|
+
_KONDUKTOR_COMMIT_SHA = '421390595e3a1b9f263e790323deae61d94da231'
|
|
15
15
|
os.makedirs(os.path.expanduser('~/.konduktor'), exist_ok=True)
|
|
16
16
|
|
|
17
17
|
|
|
@@ -45,5 +45,5 @@ def _get_git_commit():
|
|
|
45
45
|
|
|
46
46
|
|
|
47
47
|
__commit__ = _get_git_commit()
|
|
48
|
-
__version__ = '1.0.0.dev0.1.0.dev20251030104830'
|
|
48
|
+
__version__ = '1.0.0.dev0.1.0.dev20251211105235'
|
|
49
49
|
__root_dir__ = os.path.dirname(os.path.abspath(__file__))
|
|
@@ -54,8 +54,19 @@ def _wait_for_all_ready(namespace: str, name: str):
|
|
|
54
54
|
except ApiException:
|
|
55
55
|
services_map = {}
|
|
56
56
|
|
|
57
|
-
|
|
58
|
-
|
|
57
|
+
autoscalers_map = {}
|
|
58
|
+
try:
|
|
59
|
+
autoscaler_obj = deployment_utils.get_autoscaler(namespace, name)
|
|
60
|
+
if autoscaler_obj:
|
|
61
|
+
# detect aibrix vs general from deployment labels
|
|
62
|
+
labels = (deployment.metadata.labels or {}) if deployment else {}
|
|
63
|
+
is_aibrix = deployment_utils.AIBRIX_NAME_LABEL in labels
|
|
64
|
+
if is_aibrix:
|
|
65
|
+
autoscalers_map[name] = {'kpa': autoscaler_obj}
|
|
66
|
+
else:
|
|
67
|
+
autoscalers_map[name] = {'hpa': autoscaler_obj}
|
|
68
|
+
except ApiException:
|
|
69
|
+
pass
|
|
59
70
|
|
|
60
71
|
status = deployment_utils.get_model_status(
|
|
61
72
|
name, deployments_map, services_map, autoscalers_map
|
|
@@ -998,13 +998,13 @@ def get_envoy_external_ip() -> Optional[str]:
|
|
|
998
998
|
|
|
999
999
|
|
|
1000
1000
|
def get_ingress_nginx_external_ip() -> Optional[str]:
|
|
1001
|
-
"""Get the external IP of the ingress-nginx-controller LoadBalancer."""
|
|
1001
|
+
"""Get the external IP of the keda-ingress-nginx-controller LoadBalancer."""
|
|
1002
1002
|
context = kubernetes_utils.get_current_kube_config_context_name()
|
|
1003
1003
|
core_api = kube_client.core_api(context=context)
|
|
1004
1004
|
try:
|
|
1005
|
-
# Look for ingress-nginx-controller service in keda namespace
|
|
1005
|
+
# Look for keda-ingress-nginx-controller service in keda namespace
|
|
1006
1006
|
service = core_api.read_namespaced_service(
|
|
1007
|
-
name='ingress-nginx-controller', namespace='keda'
|
|
1007
|
+
name='keda-ingress-nginx-controller', namespace='keda'
|
|
1008
1008
|
)
|
|
1009
1009
|
if service.spec.type == 'LoadBalancer':
|
|
1010
1010
|
ingress = service.status.load_balancer.ingress
|
|
@@ -449,7 +449,8 @@ def _format_timestamp(timestamp: str) -> str:
|
|
|
449
449
|
|
|
450
450
|
|
|
451
451
|
def _get_job_start_time(job: Dict[str, Any]) -> str:
|
|
452
|
-
|
|
452
|
+
status = job.get('status', {})
|
|
453
|
+
for condition in status.get('conditions', []):
|
|
453
454
|
if condition['reason'] == 'ResumeJobs':
|
|
454
455
|
return condition.get('lastTransitionTime', '')
|
|
455
456
|
return '-'
|
|
@@ -28,6 +28,8 @@ if typing.TYPE_CHECKING:
|
|
|
28
28
|
logger = logging.get_logger(__name__)
|
|
29
29
|
|
|
30
30
|
_RUN_DURATION_ANNOTATION_KEY = 'kueue.x-k8s.io/maxRunDurationSeconds'
|
|
31
|
+
# Use a large default (7 days) to mimic "infinite" runtime.
|
|
32
|
+
_DEFAULT_MAX_RUN_DURATION_SECONDS = 604800
|
|
31
33
|
|
|
32
34
|
|
|
33
35
|
def create_pod_spec(task: 'konduktor.Task') -> Dict[str, Any]:
|
|
@@ -153,7 +155,9 @@ def create_pod_spec(task: 'konduktor.Task') -> Dict[str, Any]:
|
|
|
153
155
|
git_ssh_secret_name = None
|
|
154
156
|
env_secret_envs = []
|
|
155
157
|
default_secrets = []
|
|
158
|
+
basename_by_k8s: Dict[str, str] = {}
|
|
156
159
|
|
|
160
|
+
# only get own secrets
|
|
157
161
|
user_hash = common_utils.get_user_hash()
|
|
158
162
|
label_selector = f'{backend_constants.SECRET_OWNER_LABEL}={user_hash}'
|
|
159
163
|
user_secrets = kubernetes_utils.list_secrets(
|
|
@@ -162,19 +166,36 @@ def create_pod_spec(task: 'konduktor.Task') -> Dict[str, Any]:
|
|
|
162
166
|
|
|
163
167
|
for secret in user_secrets:
|
|
164
168
|
kind = kubernetes_utils.get_secret_kind(secret)
|
|
169
|
+
|
|
170
|
+
# incase the user modified their secret to have no key:value data
|
|
171
|
+
if secret.data is None:
|
|
172
|
+
secret.data = {}
|
|
173
|
+
|
|
174
|
+
# fill the map for *all* secrets we see
|
|
175
|
+
k8s_name = secret.metadata.name
|
|
176
|
+
lbls = secret.metadata.labels or {}
|
|
177
|
+
base = lbls.get(
|
|
178
|
+
backend_constants.SECRET_BASENAME_LABEL,
|
|
179
|
+
# fallback: strip trailing "-<something>" once if present
|
|
180
|
+
k8s_name.rsplit('-', 1)[0] if '-' in k8s_name else k8s_name,
|
|
181
|
+
)
|
|
182
|
+
basename_by_k8s[k8s_name] = base
|
|
183
|
+
|
|
165
184
|
if kind == 'git-ssh' and git_ssh_secret_name is None:
|
|
166
185
|
git_ssh_secret_name = secret.metadata.name
|
|
167
186
|
elif kind == 'env':
|
|
168
187
|
env_secret_name = secret.metadata.name
|
|
169
|
-
key
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
'
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
188
|
+
# iterate ALL keys, not just one (ex. if user made a multi-key env secret)
|
|
189
|
+
for key, _ in secret.data.items():
|
|
190
|
+
# wire the env var to read its value from a k8s secret
|
|
191
|
+
env_secret_envs.append(
|
|
192
|
+
{
|
|
193
|
+
'name': key,
|
|
194
|
+
'valueFrom': {
|
|
195
|
+
'secretKeyRef': {'name': env_secret_name, 'key': key}
|
|
196
|
+
},
|
|
197
|
+
}
|
|
198
|
+
)
|
|
178
199
|
elif kind == 'default':
|
|
179
200
|
default_secret_name = secret.metadata.name
|
|
180
201
|
basename = secret.metadata.labels.get(
|
|
@@ -184,6 +205,22 @@ def create_pod_spec(task: 'konduktor.Task') -> Dict[str, Any]:
|
|
|
184
205
|
{'k8s_name': default_secret_name, 'mount_name': basename}
|
|
185
206
|
)
|
|
186
207
|
|
|
208
|
+
# Check if the task references KONDUKTOR_DEFAULT_SECRETS and that it exists
|
|
209
|
+
uses_default_secret_var = (
|
|
210
|
+
'KONDUKTOR_DEFAULT_SECRETS' in (task.run or '')
|
|
211
|
+
or 'KONDUKTOR_DEFAULT_SECRETS' in (task.setup or '')
|
|
212
|
+
or '/konduktor/default-secrets/' in (task.run or '')
|
|
213
|
+
or '/konduktor/default-secrets/' in (task.setup or '')
|
|
214
|
+
)
|
|
215
|
+
if uses_default_secret_var and not default_secrets:
|
|
216
|
+
raise exceptions.MissingSecretError(
|
|
217
|
+
f'Task references KONDUKTOR_DEFAULT_SECRETS or '
|
|
218
|
+
f'/konduktor/default-secrets but '
|
|
219
|
+
f'user {common_utils.get_cleaned_username()} '
|
|
220
|
+
f'has no default secrets. Paths like '
|
|
221
|
+
f'$KONDUKTOR_DEFAULT_SECRETS/<secret_name>/... will not exist.'
|
|
222
|
+
)
|
|
223
|
+
|
|
187
224
|
# Inject --served-model-name, --host, and --port into serving run command
|
|
188
225
|
if task.serving and task.run and 'vllm.entrypoints.openai.api_server' in task.run:
|
|
189
226
|
if '--served-model-name' and '--host' and '--port' not in task.run:
|
|
@@ -262,31 +299,111 @@ def create_pod_spec(task: 'konduktor.Task') -> Dict[str, Any]:
|
|
|
262
299
|
},
|
|
263
300
|
temp.name,
|
|
264
301
|
)
|
|
302
|
+
|
|
303
|
+
# Capture the template env names BEFORE user config is merged
|
|
304
|
+
pod_config_template = common_utils.read_yaml(temp.name)
|
|
305
|
+
tmpl_envs = pod_config_template['kubernetes']['pod_config']['spec'][
|
|
306
|
+
'containers'
|
|
307
|
+
][0].get('env', [])
|
|
308
|
+
tmpl_env_names = {e['name'] for e in tmpl_envs}
|
|
309
|
+
|
|
265
310
|
pod_config = common_utils.read_yaml(temp.name)
|
|
266
|
-
# merge with `~/.konduktor/config.yaml``
|
|
311
|
+
# merge with `~/.konduktor/config.yaml`` (config.yaml overrides template)
|
|
267
312
|
kubernetes_utils.combine_pod_config_fields(temp.name, pod_config)
|
|
268
313
|
pod_config = common_utils.read_yaml(temp.name)
|
|
269
314
|
|
|
270
|
-
#
|
|
271
|
-
|
|
315
|
+
# Find what came from user config (appeared after combine, not in template)
|
|
316
|
+
premerge_envs = pod_config['kubernetes']['pod_config']['spec']['containers'][0].get(
|
|
272
317
|
'env', []
|
|
273
318
|
)
|
|
274
|
-
|
|
319
|
+
premerge_names = {e['name'] for e in premerge_envs}
|
|
320
|
+
config_env_names0 = premerge_names - tmpl_env_names
|
|
321
|
+
|
|
322
|
+
# Build final env list
|
|
323
|
+
env_map = {env['name']: env for env in premerge_envs}
|
|
275
324
|
|
|
276
|
-
# Inject secret envs
|
|
325
|
+
# Inject secret envs (env secrets override config.yaml)
|
|
277
326
|
for env in env_secret_envs:
|
|
278
327
|
env_map[env['name']] = env
|
|
279
328
|
|
|
280
|
-
# Inject task
|
|
329
|
+
# Inject task envs
|
|
330
|
+
# CLI+task.yaml overrides everything else
|
|
331
|
+
# CLI already overrode task.yaml in other code
|
|
281
332
|
for k, v in task.envs.items():
|
|
282
333
|
env_map[k] = {'name': k, 'value': v}
|
|
283
334
|
|
|
284
|
-
|
|
285
|
-
pod_config['kubernetes']['pod_config']['spec']['containers'][0]['env'] =
|
|
286
|
-
|
|
335
|
+
final_envs_list = list(env_map.values())
|
|
336
|
+
pod_config['kubernetes']['pod_config']['spec']['containers'][0]['env'] = (
|
|
337
|
+
final_envs_list
|
|
287
338
|
)
|
|
339
|
+
container = pod_config['kubernetes']['pod_config']['spec']['containers'][0]
|
|
340
|
+
final_envs = container['env']
|
|
341
|
+
final_names = {e['name'] for e in final_envs}
|
|
342
|
+
|
|
288
343
|
logger.debug(f'rendered pod spec: \n\t{json.dumps(pod_config, indent=2)}')
|
|
289
344
|
|
|
345
|
+
# 1) Get secret envs actually used in the final env list
|
|
346
|
+
secret_details = sorted(
|
|
347
|
+
(e['name'], e['valueFrom']['secretKeyRef']['name'])
|
|
348
|
+
for e in final_envs
|
|
349
|
+
if isinstance(e, dict)
|
|
350
|
+
and e.get('valueFrom', {})
|
|
351
|
+
and e['valueFrom'].get('secretKeyRef')
|
|
352
|
+
)
|
|
353
|
+
secret_names = [n for n, _ in secret_details]
|
|
354
|
+
|
|
355
|
+
# 2) Get task-sourced (CLI+task.yaml) envs actually used in the final env list
|
|
356
|
+
task_all_names = sorted(
|
|
357
|
+
n
|
|
358
|
+
for n in (task.envs or {}).keys()
|
|
359
|
+
if n in final_names and n not in secret_names
|
|
360
|
+
)
|
|
361
|
+
|
|
362
|
+
# 3) Get Config.yaml envs actually used in the final env list
|
|
363
|
+
config_names = sorted(
|
|
364
|
+
n
|
|
365
|
+
for n in config_env_names0
|
|
366
|
+
if n in final_names and n not in secret_names and n not in task_all_names
|
|
367
|
+
)
|
|
368
|
+
|
|
369
|
+
# 4) Get other envs (template/system) actually used in the final env list
|
|
370
|
+
other_names = sorted(
|
|
371
|
+
final_names - set(secret_names) - set(task_all_names) - set(config_names)
|
|
372
|
+
)
|
|
373
|
+
|
|
374
|
+
# Export helper envs for the startup script (names only)
|
|
375
|
+
def _append_helper(name: str, values):
|
|
376
|
+
container['env'].append({'name': name, 'value': ','.join(values)})
|
|
377
|
+
|
|
378
|
+
# to show user basenames of k8s secrets instead of actual
|
|
379
|
+
# k8s secret names (which have added suffixes)
|
|
380
|
+
secret_map_pairs = [
|
|
381
|
+
f'{var}={basename_by_k8s.get(secret_k8s, secret_k8s)}'
|
|
382
|
+
for (var, secret_k8s) in secret_details
|
|
383
|
+
]
|
|
384
|
+
|
|
385
|
+
# Priority order: CLI > task.yaml > env secret > config > template/system
|
|
386
|
+
_append_helper(
|
|
387
|
+
'KONDUKTOR_ENV_SECRETS_HOPEFULLY_NO_NAME_COLLISION',
|
|
388
|
+
secret_names,
|
|
389
|
+
)
|
|
390
|
+
_append_helper(
|
|
391
|
+
'KONDUKTOR_ENV_SECRETS_MAP_HOPEFULLY_NO_NAME_COLLISION',
|
|
392
|
+
secret_map_pairs,
|
|
393
|
+
)
|
|
394
|
+
_append_helper(
|
|
395
|
+
'KONDUKTOR_ENV_TASK_ALL_HOPEFULLY_NO_NAME_COLLISION',
|
|
396
|
+
task_all_names,
|
|
397
|
+
)
|
|
398
|
+
_append_helper(
|
|
399
|
+
'KONDUKTOR_ENV_CONFIG_HOPEFULLY_NO_NAME_COLLISION',
|
|
400
|
+
config_names,
|
|
401
|
+
)
|
|
402
|
+
_append_helper(
|
|
403
|
+
'KONDUKTOR_ENV_OTHER_HOPEFULLY_NO_NAME_COLLISION',
|
|
404
|
+
other_names,
|
|
405
|
+
)
|
|
406
|
+
|
|
290
407
|
# validate pod spec using json schema
|
|
291
408
|
try:
|
|
292
409
|
validator.validate_pod_spec(pod_config['kubernetes']['pod_config']['spec'])
|
|
@@ -356,18 +473,21 @@ def inject_jobset_metadata(jobset_spec: Dict[str, Any], task: 'konduktor.Task')
|
|
|
356
473
|
jobset_spec: The JobSet spec dictionary to modify
|
|
357
474
|
task: The task object containing resource information
|
|
358
475
|
"""
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
jobset_spec['jobset']['metadata']
|
|
365
|
-
|
|
476
|
+
assert task.resources is not None, 'Task resources are required'
|
|
477
|
+
labels = task.resources.labels or {}
|
|
478
|
+
|
|
479
|
+
# Add max run duration annotation, defaulting to a practically infinite value.
|
|
480
|
+
maxRunDurationSeconds = labels.get('maxRunDurationSeconds')
|
|
481
|
+
metadata = jobset_spec['jobset']['metadata']
|
|
482
|
+
metadata.setdefault('annotations', {})[_RUN_DURATION_ANNOTATION_KEY] = str(
|
|
483
|
+
maxRunDurationSeconds
|
|
484
|
+
if maxRunDurationSeconds is not None
|
|
485
|
+
else _DEFAULT_MAX_RUN_DURATION_SECONDS
|
|
366
486
|
)
|
|
367
487
|
|
|
368
488
|
# Inject resource labels into JobSet metadata.
|
|
369
|
-
if
|
|
370
|
-
jobset_spec['jobset']['metadata']['labels'].update(
|
|
489
|
+
if labels:
|
|
490
|
+
jobset_spec['jobset']['metadata']['labels'].update(labels)
|
|
371
491
|
|
|
372
492
|
|
|
373
493
|
def merge_pod_into_jobset_template(
|