konduktor-nightly 0.1.0.dev20251128104812__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- konduktor/__init__.py +49 -0
- konduktor/adaptors/__init__.py +0 -0
- konduktor/adaptors/aws.py +221 -0
- konduktor/adaptors/common.py +118 -0
- konduktor/adaptors/gcp.py +126 -0
- konduktor/authentication.py +124 -0
- konduktor/backends/__init__.py +6 -0
- konduktor/backends/backend.py +86 -0
- konduktor/backends/constants.py +21 -0
- konduktor/backends/deployment.py +204 -0
- konduktor/backends/deployment_utils.py +1351 -0
- konduktor/backends/jobset.py +225 -0
- konduktor/backends/jobset_utils.py +726 -0
- konduktor/backends/pod_utils.py +501 -0
- konduktor/check.py +184 -0
- konduktor/cli.py +1945 -0
- konduktor/config.py +420 -0
- konduktor/constants.py +36 -0
- konduktor/controller/__init__.py +0 -0
- konduktor/controller/constants.py +56 -0
- konduktor/controller/launch.py +44 -0
- konduktor/controller/node.py +116 -0
- konduktor/controller/parse.py +111 -0
- konduktor/dashboard/README.md +30 -0
- konduktor/dashboard/backend/main.py +169 -0
- konduktor/dashboard/backend/sockets.py +154 -0
- konduktor/dashboard/frontend/.eslintrc.json +3 -0
- konduktor/dashboard/frontend/.gitignore +36 -0
- konduktor/dashboard/frontend/app/api/jobs/route.js +71 -0
- konduktor/dashboard/frontend/app/api/namespaces/route.js +69 -0
- konduktor/dashboard/frontend/app/components/Grafana.jsx +66 -0
- konduktor/dashboard/frontend/app/components/JobsData.jsx +197 -0
- konduktor/dashboard/frontend/app/components/LogsData.jsx +139 -0
- konduktor/dashboard/frontend/app/components/NavMenu.jsx +39 -0
- konduktor/dashboard/frontend/app/components/NavTabs.jsx +73 -0
- konduktor/dashboard/frontend/app/components/NavTabs2.jsx +30 -0
- konduktor/dashboard/frontend/app/components/SelectBtn.jsx +27 -0
- konduktor/dashboard/frontend/app/components/lib/utils.js +6 -0
- konduktor/dashboard/frontend/app/components/ui/chip-select.jsx +78 -0
- konduktor/dashboard/frontend/app/components/ui/input.jsx +19 -0
- konduktor/dashboard/frontend/app/components/ui/navigation-menu.jsx +104 -0
- konduktor/dashboard/frontend/app/components/ui/select.jsx +120 -0
- konduktor/dashboard/frontend/app/favicon.ico +0 -0
- konduktor/dashboard/frontend/app/globals.css +120 -0
- konduktor/dashboard/frontend/app/jobs/page.js +10 -0
- konduktor/dashboard/frontend/app/layout.js +22 -0
- konduktor/dashboard/frontend/app/logs/page.js +11 -0
- konduktor/dashboard/frontend/app/page.js +12 -0
- konduktor/dashboard/frontend/jsconfig.json +7 -0
- konduktor/dashboard/frontend/next.config.mjs +4 -0
- konduktor/dashboard/frontend/package-lock.json +6687 -0
- konduktor/dashboard/frontend/package.json +37 -0
- konduktor/dashboard/frontend/postcss.config.mjs +8 -0
- konduktor/dashboard/frontend/server.js +64 -0
- konduktor/dashboard/frontend/tailwind.config.js +17 -0
- konduktor/data/__init__.py +9 -0
- konduktor/data/aws/__init__.py +15 -0
- konduktor/data/aws/s3.py +1138 -0
- konduktor/data/constants.py +7 -0
- konduktor/data/data_utils.py +268 -0
- konduktor/data/gcp/__init__.py +19 -0
- konduktor/data/gcp/constants.py +42 -0
- konduktor/data/gcp/gcs.py +994 -0
- konduktor/data/gcp/utils.py +9 -0
- konduktor/data/registry.py +19 -0
- konduktor/data/storage.py +812 -0
- konduktor/data/storage_utils.py +535 -0
- konduktor/execution.py +447 -0
- konduktor/kube_client.py +237 -0
- konduktor/logging.py +111 -0
- konduktor/manifests/aibrix-setup.yaml +430 -0
- konduktor/manifests/apoxy-setup.yaml +184 -0
- konduktor/manifests/apoxy-setup2.yaml +98 -0
- konduktor/manifests/controller_deployment.yaml +69 -0
- konduktor/manifests/dashboard_deployment.yaml +131 -0
- konduktor/manifests/dmesg_daemonset.yaml +57 -0
- konduktor/manifests/pod_cleanup_controller.yaml +129 -0
- konduktor/resource.py +546 -0
- konduktor/serving.py +153 -0
- konduktor/task.py +949 -0
- konduktor/templates/deployment.yaml.j2 +191 -0
- konduktor/templates/jobset.yaml.j2 +43 -0
- konduktor/templates/pod.yaml.j2 +563 -0
- konduktor/usage/__init__.py +0 -0
- konduktor/usage/constants.py +21 -0
- konduktor/utils/__init__.py +0 -0
- konduktor/utils/accelerator_registry.py +17 -0
- konduktor/utils/annotations.py +62 -0
- konduktor/utils/base64_utils.py +95 -0
- konduktor/utils/common_utils.py +426 -0
- konduktor/utils/constants.py +5 -0
- konduktor/utils/env_options.py +55 -0
- konduktor/utils/exceptions.py +234 -0
- konduktor/utils/kubernetes_enums.py +8 -0
- konduktor/utils/kubernetes_utils.py +763 -0
- konduktor/utils/log_utils.py +467 -0
- konduktor/utils/loki_utils.py +102 -0
- konduktor/utils/rich_utils.py +123 -0
- konduktor/utils/schemas.py +625 -0
- konduktor/utils/subprocess_utils.py +273 -0
- konduktor/utils/ux_utils.py +247 -0
- konduktor/utils/validator.py +461 -0
- konduktor_nightly-0.1.0.dev20251128104812.dist-info/LICENSE +91 -0
- konduktor_nightly-0.1.0.dev20251128104812.dist-info/METADATA +98 -0
- konduktor_nightly-0.1.0.dev20251128104812.dist-info/RECORD +107 -0
- konduktor_nightly-0.1.0.dev20251128104812.dist-info/WHEEL +4 -0
- konduktor_nightly-0.1.0.dev20251128104812.dist-info/entry_points.txt +3 -0
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
# Proprietary Changes made for Trainy under the Trainy Software License
|
|
2
|
+
# Original source: skypilot: https://github.com/skypilot-org/skypilot
|
|
3
|
+
# which is Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
7
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
8
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
9
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
10
|
+
# See the License for the specific language governing permissions and
|
|
11
|
+
# limitations under the License.
|
|
12
|
+
|
|
13
|
+
"""Konduktor backend interface."""
|
|
14
|
+
|
|
15
|
+
import typing
|
|
16
|
+
from typing import Dict, Optional
|
|
17
|
+
|
|
18
|
+
if typing.TYPE_CHECKING:
|
|
19
|
+
from konduktor.data import storage as storage_lib
|
|
20
|
+
|
|
21
|
+
import konduktor
|
|
22
|
+
from konduktor.utils import ux_utils
|
|
23
|
+
|
|
24
|
+
# Paths are handled as plain strings in this module; `Path` is only an alias
# used to make the type annotations below more descriptive.
Path = str
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class Backend:
    """Base interface for backends: provisioning, setup, and scheduling.

    The public methods (``sync_workdir``, ``sync_file_mounts``, ``execute``,
    ``post_execute``) forward to underscore-prefixed hooks. Every hook raises
    ``NotImplementedError`` here, so concrete backends are expected to
    override them.
    """

    # NAME is used to identify the backend class from cli/yaml.
    NAME = 'backend'

    # --- APIs ---
    def check_resources_fit_cluster(self, task: 'konduktor.Task') -> bool:
        """Tell whether the cluster can satisfy the task's resources.

        Raises:
            NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError

    def sync_workdir(self, workdir: Path) -> None:
        """Sync ``workdir`` by delegating to the ``_sync_workdir`` hook."""
        return self._sync_workdir(workdir)

    def sync_file_mounts(
        self,
        all_file_mounts: Optional[Dict[Path, Path]],
        storage_mounts: Optional[Dict[Path, 'storage_lib.Storage']],
    ) -> None:
        """Sync file and storage mounts via the ``_sync_file_mounts`` hook.

        Args:
            all_file_mounts: Path-to-path mapping of file mounts, or None.
            storage_mounts: Path-to-Storage mapping of storage mounts, or None.
        """
        return self._sync_file_mounts(all_file_mounts, storage_mounts)

    def add_storage_objects(self, task: 'konduktor.Task') -> None:
        """Attach storage objects for ``task``.

        Raises:
            NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError

    def execute(
        self, task: 'konduktor.Task', detach_run: bool, dryrun: bool = False
    ) -> Optional[str]:
        """Execute the task on the cluster.

        Args:
            task: The task to run.
            detach_run: Forwarded unchanged to ``_execute``.
            dryrun: Forwarded unchanged to ``_execute``.

        Returns:
            Job id if the task is submitted to the cluster, None otherwise.
        """
        # NOTE(review): the value returned by spinner_message() is discarded
        # here — confirm whether it was meant to be handed to a status display.
        ux_utils.spinner_message('Submitting job')
        return self._execute(task, detach_run, dryrun)

    def post_execute(self) -> None:
        """Run after execute(): e.g., print helpful inspection messages."""
        return self._post_execute()

    def register_info(self, **kwargs) -> None:
        """Register backend-specific information (no-op by default)."""

    # --- Hooks implemented by concrete backends ---
    def _sync_workdir(self, workdir: Path) -> None:
        """Hook behind ``sync_workdir``; must be overridden."""
        raise NotImplementedError

    def _sync_file_mounts(
        self,
        all_file_mounts: Optional[Dict[Path, Path]],
        storage_mounts: Optional[Dict[Path, 'storage_lib.Storage']],
    ) -> None:
        """Hook behind ``sync_file_mounts``; must be overridden."""
        raise NotImplementedError

    def _execute(
        self, task: 'konduktor.Task', detach_run: bool, dryrun: bool = False
    ) -> Optional[str]:
        """Hook behind ``execute``; must be overridden."""
        raise NotImplementedError

    def _post_execute(self) -> None:
        """Hook behind ``post_execute``; must be overridden."""
        raise NotImplementedError
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
# Shared constants for the konduktor backends: an SSH port number and the
# Kubernetes label keys attached to the objects the backends manage.

# NOTE(review): presumably the port the in-pod SSH server listens on — confirm
# against the pod template.
KONDUKTOR_SSH_PORT = 2222

# Common labels used across JobSets and Deployments
JOB_NAME_LABEL = 'trainy.ai/job-name'
DEPLOYMENT_NAME_LABEL = 'trainy.ai/deployment-name'
# Key under the model.aibrix.ai prefix rather than trainy.ai.
AIBRIX_NAME_LABEL = 'model.aibrix.ai/name'
USERID_LABEL = 'trainy.ai/user-id'
USER_LABEL = 'trainy.ai/username'
ACCELERATOR_LABEL = 'trainy.ai/accelerator'
NUM_ACCELERATORS_LABEL = 'trainy.ai/num-accelerators'
# Key under the kueue.x-k8s.io prefix; per its name the value is in seconds.
MAX_EXECUTION_TIME_LABEL = 'kueue.x-k8s.io/max-exec-time-seconds'

# Start/stop/status related labels
STOP_USERID_LABEL = 'trainy.ai/stop-userid'
STOP_USERNAME_LABEL = 'trainy.ai/stop-username'

# Secret labels
SECRET_BASENAME_LABEL = 'trainy.ai/secret-basename'
SECRET_KIND_LABEL = 'trainy.ai/secret-kind'
SECRET_OWNER_LABEL = 'trainy.ai/secret-owner'
# NOTE(review): same trainy.ai/ key format as the labels above, but the
# constant has no _LABEL suffix — confirm the naming is intentional.
ROOT_NAME = 'trainy.ai/root-name'
|
|
@@ -0,0 +1,204 @@
|
|
|
1
|
+
import time
|
|
2
|
+
import typing
|
|
3
|
+
from typing import Dict, Optional
|
|
4
|
+
|
|
5
|
+
import colorama
|
|
6
|
+
|
|
7
|
+
if typing.TYPE_CHECKING:
|
|
8
|
+
import konduktor
|
|
9
|
+
from konduktor.data import storage as storage_lib
|
|
10
|
+
|
|
11
|
+
from kubernetes.client.exceptions import ApiException
|
|
12
|
+
|
|
13
|
+
from konduktor import config, kube_client, logging
|
|
14
|
+
from konduktor.backends import backend, deployment_utils, pod_utils
|
|
15
|
+
from konduktor.utils import kubernetes_utils, rich_utils, ux_utils
|
|
16
|
+
|
|
17
|
+
# Paths are handled as plain strings; `Path` is an alias used in annotations.
Path = str
logger = logging.get_logger(__file__)

# Seconds slept between consecutive readiness polls in _wait_for_all_ready.
POLL_INTERVAL = 5
# Passed as the default when reading the ('kubernetes', 'provision_timeout')
# config value; _wait_for_all_ready compares it against elapsed seconds.
DEFAULT_ATTACH_TIMEOUT = 300
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class DeploymentError(Exception):
    """Raised when serving components are missing or never become ready."""
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def _wait_for_all_ready(namespace: str, name: str) -> None:
    """Wait for Deployment, Service, and Autoscaler readiness.

    Polls the cluster every ``POLL_INTERVAL`` seconds and returns as soon as
    the Deployment and Service report ``'ready'`` and the autoscaler reports
    either ``'ready'`` or ``None``.

    Args:
        namespace: Namespace the serving objects are read from.
        name: Name used to read the Deployment and the Service and to look up
            the autoscaler.

    Raises:
        DeploymentError: If any component is reported as ``'missing'``, or if
            readiness is not reached within the configured timeout (a timeout
            of ``-1`` disables the limit). In both cases the serving specs are
            deleted before the error is raised.
    """
    # Short pause before the first poll (presumably to let freshly created
    # objects show up in the API — TODO confirm).
    time.sleep(2)
    # Monotonic clock so the timeout is unaffected by wall-clock adjustments.
    start = time.monotonic()
    timeout = config.get_nested(
        ('kubernetes', 'provision_timeout'),
        default_value=DEFAULT_ATTACH_TIMEOUT,
    )

    while True:
        context = kubernetes_utils.get_current_kube_config_context_name()

        # Directly read objects instead of listing everything
        # NOTE(review): any ApiException (not only "not found") is treated as
        # "object absent"; confirm that transient API errors should be handled
        # the same way.
        try:
            deployment = kube_client.apps_api(context).read_namespaced_deployment(
                name=name, namespace=namespace
            )
            deployments_map = {name: deployment}
        except ApiException:
            # Reset explicitly: otherwise `deployment` is unbound on the first
            # iteration (UnboundLocalError below) or stale on later ones.
            deployment = None
            deployments_map = {}

        try:
            service = kube_client.core_api(context).read_namespaced_service(
                name=name, namespace=namespace
            )
            services_map = {name: service}
        except ApiException:
            services_map = {}

        autoscalers_map = {}
        try:
            autoscaler_obj = deployment_utils.get_autoscaler(namespace, name)
            if autoscaler_obj:
                # detect aibrix vs general from deployment labels
                labels = (deployment.metadata.labels or {}) if deployment else {}
                is_aibrix = deployment_utils.AIBRIX_NAME_LABEL in labels
                autoscaler_kind = 'kpa' if is_aibrix else 'hpa'
                autoscalers_map[name] = {autoscaler_kind: autoscaler_obj}
        except ApiException:
            # Best effort: an autoscaler status of None is accepted below.
            pass

        status = deployment_utils.get_model_status(
            name, deployments_map, services_map, autoscalers_map
        )

        is_ready = (
            status['deployment'] == 'ready'
            and status['service'] == 'ready'
            and (status['autoscaler'] == 'ready' or status['autoscaler'] is None)
        )

        states = {
            'Deployment': status['deployment'],
            'Service': status['service'],
            'Autoscaler': status['autoscaler'],
        }

        # Figure out which components are missing
        missing_parts = [
            component for component, state in states.items() if state == 'missing'
        ]

        if missing_parts:
            # Clean up whatever was created before reporting the failure.
            deployment_utils.delete_serving_specs(name, namespace)
            missing_str = ', '.join(missing_parts)
            raise DeploymentError(
                f'Deployment failed. '
                f'The following components are missing: {missing_str}.'
            )

        if is_ready:
            logger.info(
                f'task {colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
                f'{name}{colorama.Style.RESET_ALL} ready'
            )
            return

        # A timeout of -1 means "wait forever".
        if timeout != -1 and time.monotonic() - start > timeout:
            logger.error(
                f'{colorama.Style.BRIGHT}{colorama.Fore.RED}'
                f'Model timed out waiting for readiness.'
                f'{colorama.Style.RESET_ALL} '
                f'Final status:\n{status}'
            )
            deployment_utils.delete_serving_specs(name, namespace)
            raise DeploymentError(
                f'Model failed to become ready within {timeout} seconds.\n'
            )

        time.sleep(POLL_INTERVAL)
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
class DeploymentBackend(backend.Backend):
    """Backend that launches a task as a long-running Deployment.

    Only ``_execute`` does real work; the remaining hooks are no-ops and
    ``check_resources_fit_cluster`` always answers True.
    """

    NAME = 'deployment'

    def check_resources_fit_cluster(self, task: 'konduktor.Task') -> bool:
        """Report that the task fits; no actual check is performed."""
        return True

    def add_storage_objects(self, task: 'konduktor.Task') -> None:
        """No-op for this backend."""

    def register_info(self, **kwargs) -> None:
        """No-op for this backend."""

    def _sync_file_mounts(
        self,
        all_file_mounts: Optional[Dict[Path, Path]],
        storage_mounts: Optional[Dict[Path, 'storage_lib.Storage']],
    ) -> None:
        """No-op for this backend."""

    def _sync_workdir(self, workdir: str) -> None:
        """No-op for this backend."""

    def _post_execute(self) -> None:
        """No-op for this backend."""

    def _execute(
        self,
        task: 'konduktor.Task',
        detach_run: bool = False,
        dryrun: bool = False,
    ) -> Optional[str]:
        """Execute a task by launching a long-running Deployment.

        Unless ``dryrun`` is set, and only when ``task.serving`` is truthy,
        this creates the Deployment, the Service, the pod autoscaler, and the
        HTTP Add-on resources. When neither ``dryrun`` nor ``detach_run`` is
        set it then blocks in ``_wait_for_all_ready``.

        Args:
            task: The task to launch.
            detach_run: If true, return without waiting for readiness.
            dryrun: If true, create nothing and do not wait.

        Returns:
            ``task.name`` in every case.
        """
        pod_spec = pod_utils.create_pod_spec(task)
        context = kubernetes_utils.get_current_kube_config_context_name()
        namespace = kubernetes_utils.get_kube_config_context_namespace(context)

        if task.serving if not dryrun else False:
            # Keyword arguments shared by every create_* helper below.
            shared_kwargs = dict(namespace=namespace, task=task, dryrun=dryrun)

            logger.debug(f'[DEBUG] Creating deployment for task: {task.name}')
            deployment_utils.create_deployment(
                pod_spec=pod_spec['kubernetes']['pod_config'],
                **shared_kwargs,
            )

            logger.debug(f'[DEBUG] Creating service for task: {task.name}')
            deployment_utils.create_service(**shared_kwargs)

            # Create podautoscaler for non-general deployments
            logger.debug(f'[DEBUG] Creating podautoscaler for task: {task.name}')
            deployment_utils.create_pod_autoscaler(**shared_kwargs)

            # HTTP Add-on resources for general deployments
            logger.debug(
                f'[DEBUG] Creating HTTP Add-on resources for task: {task.name}'
            )
            deployment_utils.create_http_addon_resources(**shared_kwargs)

        # Nothing to wait for on a dry run or when the caller asked to detach.
        if dryrun or detach_run:
            logger.info('detaching from run.')
            return task.name

        with ux_utils.print_exception_no_traceback():
            with rich_utils.safe_status(
                ux_utils.spinner_message('waiting for resources to be ready.\n')
            ):
                _wait_for_all_ready(namespace, task.name)
            logger.info(
                f"Model '{task.name}' is ready. "
                f'Run `konduktor serve status` for details.'
            )

        return task.name
|