konduktor-nightly 0.1.0.dev20250804105449__py3-none-any.whl → 0.1.0.dev20250806105405__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of konduktor-nightly might be problematic. Click here for more details.
- konduktor/__init__.py +4 -7
- konduktor/backends/__init__.py +2 -4
- konduktor/backends/constants.py +12 -0
- konduktor/backends/deployment.py +179 -0
- konduktor/backends/deployment_utils.py +835 -0
- konduktor/backends/jobset.py +2 -2
- konduktor/backends/jobset_utils.py +16 -266
- konduktor/backends/pod_utils.py +392 -0
- konduktor/cli.py +343 -8
- konduktor/controller/launch.py +1 -1
- konduktor/execution.py +5 -2
- konduktor/kube_client.py +8 -0
- konduktor/resource.py +20 -0
- konduktor/serving.py +149 -0
- konduktor/task.py +61 -0
- konduktor/templates/deployment.yaml.j2 +142 -0
- konduktor/templates/pod.yaml.j2 +36 -0
- konduktor/utils/accelerator_registry.py +1 -1
- konduktor/utils/log_utils.py +1 -1
- konduktor/utils/schemas.py +42 -0
- konduktor/utils/validator.py +51 -16
- {konduktor_nightly-0.1.0.dev20250804105449.dist-info → konduktor_nightly-0.1.0.dev20250806105405.dist-info}/METADATA +1 -1
- {konduktor_nightly-0.1.0.dev20250804105449.dist-info → konduktor_nightly-0.1.0.dev20250806105405.dist-info}/RECORD +26 -21
- {konduktor_nightly-0.1.0.dev20250804105449.dist-info → konduktor_nightly-0.1.0.dev20250806105405.dist-info}/LICENSE +0 -0
- {konduktor_nightly-0.1.0.dev20250804105449.dist-info → konduktor_nightly-0.1.0.dev20250806105405.dist-info}/WHEEL +0 -0
- {konduktor_nightly-0.1.0.dev20250804105449.dist-info → konduktor_nightly-0.1.0.dev20250806105405.dist-info}/entry_points.txt +0 -0
konduktor/__init__.py
CHANGED
|
@@ -5,16 +5,13 @@ import subprocess
|
|
|
5
5
|
|
|
6
6
|
from konduktor.execution import launch
|
|
7
7
|
from konduktor.resource import Resources
|
|
8
|
+
from konduktor.serving import Serving
|
|
8
9
|
from konduktor.task import Task
|
|
9
10
|
|
|
10
|
-
__all__ = [
|
|
11
|
-
'launch',
|
|
12
|
-
'Resources',
|
|
13
|
-
'Task',
|
|
14
|
-
]
|
|
11
|
+
__all__ = ['launch', 'Resources', 'Task', 'Serving']
|
|
15
12
|
|
|
16
13
|
# Replaced with the current commit when building the wheels.
|
|
17
|
-
_KONDUKTOR_COMMIT_SHA = '
|
|
14
|
+
_KONDUKTOR_COMMIT_SHA = '75e0a60af7c427a1419fda58e1ccc6dff519b4ba'
|
|
18
15
|
os.makedirs(os.path.expanduser('~/.konduktor'), exist_ok=True)
|
|
19
16
|
|
|
20
17
|
|
|
@@ -48,5 +45,5 @@ def _get_git_commit():
|
|
|
48
45
|
|
|
49
46
|
|
|
50
47
|
__commit__ = _get_git_commit()
|
|
51
|
-
__version__ = '1.0.0.dev0.1.0.
|
|
48
|
+
__version__ = '1.0.0.dev0.1.0.dev20250806105405'
|
|
52
49
|
__root_dir__ = os.path.dirname(os.path.abspath(__file__))
|
konduktor/backends/__init__.py
CHANGED
konduktor/backends/constants.py
CHANGED
|
@@ -1 +1,13 @@
|
|
|
1
1
|
KONDUKTOR_SSH_PORT = 2222
|
|
2
|
+
|
|
3
|
+
# Common labels used across JobSets and Deployments
|
|
4
|
+
JOB_NAME_LABEL = 'trainy.ai/job-name'
|
|
5
|
+
DEPLOYMENT_NAME_LABEL = 'trainy.ai/deployment-name'
|
|
6
|
+
AIBRIX_NAME_LABEL = 'model.aibrix.ai/name'
|
|
7
|
+
USERID_LABEL = 'trainy.ai/user-id'
|
|
8
|
+
USER_LABEL = 'trainy.ai/username'
|
|
9
|
+
ACCELERATOR_LABEL = 'trainy.ai/accelerator'
|
|
10
|
+
NUM_ACCELERATORS_LABEL = 'trainy.ai/num-accelerators'
|
|
11
|
+
|
|
12
|
+
# Secret labels
|
|
13
|
+
SECRET_BASENAME_LABEL = 'konduktor/basename'
|
|
@@ -0,0 +1,179 @@
|
|
|
1
|
+
import time
|
|
2
|
+
import typing
|
|
3
|
+
from typing import Dict, Optional
|
|
4
|
+
|
|
5
|
+
import colorama
|
|
6
|
+
|
|
7
|
+
if typing.TYPE_CHECKING:
|
|
8
|
+
import konduktor
|
|
9
|
+
from konduktor.data import storage as storage_lib
|
|
10
|
+
|
|
11
|
+
from kubernetes.client.exceptions import ApiException
|
|
12
|
+
|
|
13
|
+
from konduktor import config, kube_client, logging
|
|
14
|
+
from konduktor.backends import backend, deployment_utils, pod_utils
|
|
15
|
+
from konduktor.utils import kubernetes_utils, rich_utils, ux_utils
|
|
16
|
+
|
|
17
|
+
Path = str
|
|
18
|
+
logger = logging.get_logger(__file__)
|
|
19
|
+
|
|
20
|
+
POLL_INTERVAL = 5
|
|
21
|
+
DEFAULT_ATTACH_TIMEOUT = 300
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class DeploymentError(Exception):
    """Raised when a serving deployment fails to come up.

    Used by the readiness wait in this module when a component is reported
    missing or the resources do not become ready before the timeout.
    """
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def _warn_on_unexpected_api_error(kind: str, name: str, err: ApiException) -> None:
    """Log API read failures that are not a plain 404 (resource absent)."""
    status = getattr(err, 'status', None)
    if status != 404:
        logger.warning(
            f'Failed to read {kind} {name}: '
            f'{status} {getattr(err, "reason", "")}'
        )


def _wait_for_all_ready(namespace: str, name: str):
    """Wait for Deployment, Service, and Autoscaler readiness.

    Polls every ``POLL_INTERVAL`` seconds. The overall timeout is read from
    the ``kubernetes.provision_timeout`` config key and defaults to
    ``DEFAULT_ATTACH_TIMEOUT`` seconds; a value of ``-1`` disables it.

    Args:
        namespace: Kubernetes namespace the serving resources are read from.
        name: Name used to read the Deployment and Service and to look up
            the autoscaler.

    Raises:
        DeploymentError: If any component is reported as ``'missing'``, or
            if the resources are not ready before the timeout. In both cases
            the serving specs are deleted before raising.
    """
    # NOTE(review): presumably gives the API server a moment to register the
    # freshly created objects before the first poll - confirm.
    time.sleep(2)
    start = time.time()
    timeout = config.get_nested(
        ('kubernetes', 'provision_timeout'),
        default_value=DEFAULT_ATTACH_TIMEOUT,
    )
    # Resolved once so every poll reads from the same kube context.
    context = kubernetes_utils.get_current_kube_config_context_name()

    while True:
        # Directly read objects instead of listing everything
        try:
            deployment = kube_client.apps_api(context).read_namespaced_deployment(
                name=name, namespace=namespace
            )
            deployments_map = {name: deployment}
        except ApiException as e:
            # Still treated as "not there", but non-404 failures are surfaced
            # in the log instead of being silently swallowed.
            _warn_on_unexpected_api_error('Deployment', name, e)
            deployments_map = {}

        try:
            service = kube_client.core_api(context).read_namespaced_service(
                name=name, namespace=namespace
            )
            services_map = {name: service}
        except ApiException as e:
            _warn_on_unexpected_api_error('Service', name, e)
            services_map = {}

        autoscaler = deployment_utils.get_autoscaler(namespace, name)
        autoscalers_map = {name: autoscaler} if autoscaler else {}

        status = deployment_utils.get_model_status(
            name, deployments_map, services_map, autoscalers_map
        )

        # An autoscaler status of None is accepted as ready.
        is_ready = (
            status['deployment'] == 'ready'
            and status['service'] == 'ready'
            and (status['autoscaler'] == 'ready' or status['autoscaler'] is None)
        )

        states = {
            'Deployment': status['deployment'],
            'Service': status['service'],
            'Autoscaler': status['autoscaler'],
        }

        # Figure out which components are missing
        missing_parts = [
            component for component, state in states.items() if state == 'missing'
        ]

        if missing_parts:
            # Tear down whatever was created before reporting the failure.
            deployment_utils.delete_serving_specs(name, namespace)
            missing_str = ', '.join(missing_parts)
            raise DeploymentError(
                f'Deployment failed. '
                f'The following components are missing: {missing_str}.'
            )

        if is_ready:
            logger.info(
                f'task {colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
                f'{name}{colorama.Style.RESET_ALL} ready'
            )
            return

        if timeout != -1 and time.time() - start > timeout:
            logger.error(
                f'{colorama.Style.BRIGHT}{colorama.Fore.RED}'
                f'Model timed out waiting for readiness.'
                f'{colorama.Style.RESET_ALL}'
                f' Final status:\n{status}'
            )
            deployment_utils.delete_serving_specs(name, namespace)
            raise DeploymentError(
                f'Model failed to become ready within {timeout} seconds.\n'
            )

        time.sleep(POLL_INTERVAL)
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
class DeploymentBackend(backend.Backend):
    """Backend that launches a task as a long-running Deployment.

    Only ``_execute`` does real work; the remaining hooks are no-ops.
    """

    NAME = 'deployment'

    def check_resources_fit_cluster(self, task: 'konduktor.Task') -> bool:
        """Return True unconditionally; no capacity check is performed."""
        return True

    def add_storage_objects(self, task: 'konduktor.Task') -> None:
        """No-op for this backend."""

    def register_info(self, **kwargs) -> None:
        """No-op for this backend."""

    def _sync_file_mounts(
        self,
        all_file_mounts: Optional[Dict[Path, Path]],
        storage_mounts: Optional[Dict[Path, 'storage_lib.Storage']],
    ) -> None:
        """No-op for this backend."""

    def _sync_workdir(self, workdir: str) -> None:
        """No-op for this backend."""

    def _post_execute(self) -> None:
        """No-op for this backend."""

    def _execute(
        self,
        task: 'konduktor.Task',
        detach_run: bool = False,
        dryrun: bool = False,
    ) -> Optional[str]:
        """Execute a task by launching a long-running Deployment.

        Args:
            task: Task to launch. The Deployment, Service and Autoscaler are
                only created when ``task.serving`` is truthy.
            detach_run: When True, return without waiting for readiness.
            dryrun: When True, create nothing and do not wait.

        Returns:
            ``task.name``.
        """
        spec = pod_utils.create_pod_spec(task)
        kube_context = kubernetes_utils.get_current_kube_config_context_name()
        namespace = kubernetes_utils.get_kube_config_context_namespace(kube_context)

        should_create = (not dryrun) and task.serving
        if should_create:
            pod_config = spec['kubernetes']['pod_config']
            deployment_utils.create_deployment(
                namespace=namespace,
                task=task,
                pod_spec=pod_config,
                dryrun=dryrun,
            )
            deployment_utils.create_service(
                namespace=namespace, task=task, dryrun=dryrun
            )
            deployment_utils.create_autoscaler(
                namespace=namespace, task=task, dryrun=dryrun
            )

        # Dry runs and detached runs skip the readiness wait entirely.
        if dryrun or detach_run:
            logger.info('detaching from run.')
            return task.name

        with ux_utils.print_exception_no_traceback():
            spinner_text = ux_utils.spinner_message(
                'waiting for resources to be ready.\n'
            )
            with rich_utils.safe_status(spinner_text):
                _wait_for_all_ready(namespace, task.name)
        logger.info(
            f"Model '{task.name}' is ready. "
            f'Run `konduktor serve status` for details.'
        )
        return task.name
|