konduktor-nightly 0.1.0.dev20251128104812__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- konduktor/__init__.py +49 -0
- konduktor/adaptors/__init__.py +0 -0
- konduktor/adaptors/aws.py +221 -0
- konduktor/adaptors/common.py +118 -0
- konduktor/adaptors/gcp.py +126 -0
- konduktor/authentication.py +124 -0
- konduktor/backends/__init__.py +6 -0
- konduktor/backends/backend.py +86 -0
- konduktor/backends/constants.py +21 -0
- konduktor/backends/deployment.py +204 -0
- konduktor/backends/deployment_utils.py +1351 -0
- konduktor/backends/jobset.py +225 -0
- konduktor/backends/jobset_utils.py +726 -0
- konduktor/backends/pod_utils.py +501 -0
- konduktor/check.py +184 -0
- konduktor/cli.py +1945 -0
- konduktor/config.py +420 -0
- konduktor/constants.py +36 -0
- konduktor/controller/__init__.py +0 -0
- konduktor/controller/constants.py +56 -0
- konduktor/controller/launch.py +44 -0
- konduktor/controller/node.py +116 -0
- konduktor/controller/parse.py +111 -0
- konduktor/dashboard/README.md +30 -0
- konduktor/dashboard/backend/main.py +169 -0
- konduktor/dashboard/backend/sockets.py +154 -0
- konduktor/dashboard/frontend/.eslintrc.json +3 -0
- konduktor/dashboard/frontend/.gitignore +36 -0
- konduktor/dashboard/frontend/app/api/jobs/route.js +71 -0
- konduktor/dashboard/frontend/app/api/namespaces/route.js +69 -0
- konduktor/dashboard/frontend/app/components/Grafana.jsx +66 -0
- konduktor/dashboard/frontend/app/components/JobsData.jsx +197 -0
- konduktor/dashboard/frontend/app/components/LogsData.jsx +139 -0
- konduktor/dashboard/frontend/app/components/NavMenu.jsx +39 -0
- konduktor/dashboard/frontend/app/components/NavTabs.jsx +73 -0
- konduktor/dashboard/frontend/app/components/NavTabs2.jsx +30 -0
- konduktor/dashboard/frontend/app/components/SelectBtn.jsx +27 -0
- konduktor/dashboard/frontend/app/components/lib/utils.js +6 -0
- konduktor/dashboard/frontend/app/components/ui/chip-select.jsx +78 -0
- konduktor/dashboard/frontend/app/components/ui/input.jsx +19 -0
- konduktor/dashboard/frontend/app/components/ui/navigation-menu.jsx +104 -0
- konduktor/dashboard/frontend/app/components/ui/select.jsx +120 -0
- konduktor/dashboard/frontend/app/favicon.ico +0 -0
- konduktor/dashboard/frontend/app/globals.css +120 -0
- konduktor/dashboard/frontend/app/jobs/page.js +10 -0
- konduktor/dashboard/frontend/app/layout.js +22 -0
- konduktor/dashboard/frontend/app/logs/page.js +11 -0
- konduktor/dashboard/frontend/app/page.js +12 -0
- konduktor/dashboard/frontend/jsconfig.json +7 -0
- konduktor/dashboard/frontend/next.config.mjs +4 -0
- konduktor/dashboard/frontend/package-lock.json +6687 -0
- konduktor/dashboard/frontend/package.json +37 -0
- konduktor/dashboard/frontend/postcss.config.mjs +8 -0
- konduktor/dashboard/frontend/server.js +64 -0
- konduktor/dashboard/frontend/tailwind.config.js +17 -0
- konduktor/data/__init__.py +9 -0
- konduktor/data/aws/__init__.py +15 -0
- konduktor/data/aws/s3.py +1138 -0
- konduktor/data/constants.py +7 -0
- konduktor/data/data_utils.py +268 -0
- konduktor/data/gcp/__init__.py +19 -0
- konduktor/data/gcp/constants.py +42 -0
- konduktor/data/gcp/gcs.py +994 -0
- konduktor/data/gcp/utils.py +9 -0
- konduktor/data/registry.py +19 -0
- konduktor/data/storage.py +812 -0
- konduktor/data/storage_utils.py +535 -0
- konduktor/execution.py +447 -0
- konduktor/kube_client.py +237 -0
- konduktor/logging.py +111 -0
- konduktor/manifests/aibrix-setup.yaml +430 -0
- konduktor/manifests/apoxy-setup.yaml +184 -0
- konduktor/manifests/apoxy-setup2.yaml +98 -0
- konduktor/manifests/controller_deployment.yaml +69 -0
- konduktor/manifests/dashboard_deployment.yaml +131 -0
- konduktor/manifests/dmesg_daemonset.yaml +57 -0
- konduktor/manifests/pod_cleanup_controller.yaml +129 -0
- konduktor/resource.py +546 -0
- konduktor/serving.py +153 -0
- konduktor/task.py +949 -0
- konduktor/templates/deployment.yaml.j2 +191 -0
- konduktor/templates/jobset.yaml.j2 +43 -0
- konduktor/templates/pod.yaml.j2 +563 -0
- konduktor/usage/__init__.py +0 -0
- konduktor/usage/constants.py +21 -0
- konduktor/utils/__init__.py +0 -0
- konduktor/utils/accelerator_registry.py +17 -0
- konduktor/utils/annotations.py +62 -0
- konduktor/utils/base64_utils.py +95 -0
- konduktor/utils/common_utils.py +426 -0
- konduktor/utils/constants.py +5 -0
- konduktor/utils/env_options.py +55 -0
- konduktor/utils/exceptions.py +234 -0
- konduktor/utils/kubernetes_enums.py +8 -0
- konduktor/utils/kubernetes_utils.py +763 -0
- konduktor/utils/log_utils.py +467 -0
- konduktor/utils/loki_utils.py +102 -0
- konduktor/utils/rich_utils.py +123 -0
- konduktor/utils/schemas.py +625 -0
- konduktor/utils/subprocess_utils.py +273 -0
- konduktor/utils/ux_utils.py +247 -0
- konduktor/utils/validator.py +461 -0
- konduktor_nightly-0.1.0.dev20251128104812.dist-info/LICENSE +91 -0
- konduktor_nightly-0.1.0.dev20251128104812.dist-info/METADATA +98 -0
- konduktor_nightly-0.1.0.dev20251128104812.dist-info/RECORD +107 -0
- konduktor_nightly-0.1.0.dev20251128104812.dist-info/WHEEL +4 -0
- konduktor_nightly-0.1.0.dev20251128104812.dist-info/entry_points.txt +3 -0
konduktor/resource.py
ADDED
@@ -0,0 +1,546 @@
# Proprietary Changes made for Trainy under the Trainy Software License
# Original source: skypilot: https://github.com/skypilot-org/skypilot
# which is Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Resources: compute requirements of Tasks."""

import functools
from typing import Any, Dict, List, Optional, Union

from konduktor import logging
from konduktor.utils import (
    accelerator_registry,
    common_utils,
    schemas,
    ux_utils,
    validator,
)

logger = logging.get_logger(__name__)

_DEFAULT_DISK_SIZE_GB = 256


class Resources:
    """Resources: compute requirements of Tasks.

    This class is immutable once created (to ensure some validations are done
    whenever properties change). To update the property of an instance of
    Resources, use `resources.copy(**new_properties)`.

    Used:

    * for representing resource requests for task

    """

    # If any fields changed, increment the version. For backward compatibility,
    # modify the __setstate__ method to handle the old version.
    _VERSION = 1

    def __init__(
        self,
        cloud: Optional[Any] = None,
        cpus: Union[None, int, float, str] = None,
        memory: Union[None, int, float, str] = None,
        accelerators: Optional[str] = None,
        image_id: Union[str, None] = None,
        disk_size: Optional[int] = None,
        labels: Optional[Dict[str, str]] = None,
        job_config: Optional[Dict[str, Union[int, str]]] = None,
        # Internal use only.
        # pylint: disable=invalid-name
        _cluster_config_overrides: Optional[Dict[str, Any]] = None,
        # used to prevent double validation of image (would happen from overrides)
        _validate_image: bool = True,
    ):
        """Initialize a Resources object.

        All fields are optional. ``Resources.is_launchable`` decides whether
        the Resources is fully specified to launch an instance.

        Examples:
        .. code-block:: python

          # Specifying required resources; the system decides the
          # cloud/instance type. The below are equivalent:
          konduktor.Resources(accelerators='V100')
          konduktor.Resources(accelerators='V100:1')
          konduktor.Resources(accelerators={'V100': 1})
          konduktor.Resources(cpus='2+', memory='16+', accelerators='V100')

        Args:
          cloud: the cloud to use. (deprecated) all jobs are submitted to k8s
          instance_type: the instance type to use.
          cpus: the number of CPUs required for the task.
            If a str, must be a string of the form ``'2'`` or ``'2+'``, where
            the ``+`` indicates that the task requires at least 2 CPUs.
          memory: the amount of memory in GiB required. If a
            str, must be a string of the form ``'16'`` or ``'16+'``, where
            the ``+`` indicates that the task requires at least 16 GB of memory.
          accelerators: the accelerators required. If a str, must be
            a string of the form ``'V100'`` or ``'V100:2'``, where the ``:2``
            indicates that the task requires 2 V100 GPUs. If a dict, must be a
            dict of the form ``{'V100': 2}`` or ``{'tpu-v2-8': 1}``.

          image_id: docker image to use

          disk_size: the size of the OS disk in GiB.
          labels: the labels to apply to the instance. These are useful for
            assigning metadata that may be used by external tools.
            Implementation depends on the chosen cloud - On AWS, labels map to
            instance tags. On GCP, labels map to instance labels. On
            Kubernetes, labels map to pod labels. On other clouds, labels are
            not supported and will be ignored.
          job_config: the configuration of the job spec
        Raises:
          ValueError: if some attributes are invalid.
          exceptions.NoCloudAccessError: if no public cloud is enabled.
        """
        self._version = self._VERSION
        if cloud is not None:
            raise ValueError('cloud specified, but all jobs are submitted to k8s')
        self._cloud = cloud

        if disk_size is not None:
            if round(disk_size) != disk_size:
                with ux_utils.print_exception_no_traceback():
                    raise ValueError(
                        f'OS disk size must be an integer. Got: {disk_size}.'
                    )
            self._disk_size = int(disk_size)
        else:
            self._disk_size = _DEFAULT_DISK_SIZE_GB

        # self._image_id is a dict of {region: image_id}.
        # The key is None if the same image_id applies for all regions.
        self._image_id = image_id
        if isinstance(image_id, str):
            self._image_id = image_id.strip()
        # Validate Docker image format and existence
        if _validate_image:
            validator.validate_and_warn_image(self._image_id, 'task')

        self._labels = labels
        self._cluster_config_overrides = _cluster_config_overrides

        self._set_cpus(cpus)
        self._set_memory(memory)
        self._set_accelerators(accelerators)
        self.job_config = job_config or {}

        # TODO: move these out of init to prevent repeated calls.
        self._try_validate_cpus_mem()
        self._try_validate_image_id()

    def __repr__(self) -> str:
        """Returns a string representation for display.

        Examples:

        >>> konduktor.Resources(accelerators='V100')
        <Kubernetes>({'V100': 1})

        """
        accelerators = ''
        if self.accelerators is not None:
            accelerators = f', {self.accelerators}'

        cpus = ''
        if self._cpus is not None:
            cpus = f', cpus={self._cpus}'

        memory = ''
        if self.memory is not None:
            memory = f', mem={self.memory}'

        image_id = ''
        if self.image_id is not None:
            image_id = f', image_id={self.image_id}'
        else:
            with ux_utils.print_exception_no_traceback():
                raise ValueError(
                    'no image id for the task was specified. You must '
                    'specify an image id for this task (e.g. '
                    '`nvcr.io/nvidia/pytorch:xx.xx-py3`'
                )

        disk_size = ''
        if self.disk_size != _DEFAULT_DISK_SIZE_GB:
            disk_size = f', disk_size={self.disk_size}'

        # Do not show region/zone here as `konduktor status -a` would show them as
        # separate columns. Also, Resources repr will be printed during
        # failover, and the region may be dynamically determined.
        hardware_str = f'{cpus}{memory}{accelerators}{image_id}' f'{disk_size}'
        # It may have leading ',' (for example, instance_type not set) or empty
        # spaces. Remove them.
        while hardware_str and hardware_str[0] in (',', ' '):
            hardware_str = hardware_str[1:]

        return f'({hardware_str})'

    @property
    def cloud(self):
        return self._cloud

    @property
    @functools.lru_cache(maxsize=1)
    def cpus(self) -> Optional[str]:
        """Returns the number of vCPUs that each instance must have.

        For example, cpus='4' means each instance must have exactly 4 vCPUs,
        and cpus='4+' means each instance must have at least 4 vCPUs.

        (Developer note: The cpus field is only used to select the instance type
        at launch time. Thus, Resources in the backend's ResourceHandle will
        always have the cpus field set to None.)
        """
        if self._cpus is not None:
            return self._cpus
        return None

    @property
    def memory(self) -> Optional[str]:
        """Returns the memory that each instance must have in GB.

        For example, memory='16' means each instance must have exactly 16GB
        memory; memory='16+' means each instance must have at least 16GB
        memory.

        (Developer note: The memory field is only used to select the instance
        type at launch time. Thus, Resources in the backend's ResourceHandle
        will always have the memory field set to None.)
        """
        return self._memory

    @property
    @functools.lru_cache(maxsize=1)
    def accelerators(self) -> Optional[Dict[str, int]]:
        """Returns the accelerators field directly or by inferring.

        For example, Resources(AWS, 'p3.2xlarge') has its accelerators field
        set to None, but this function will infer {'V100': 1} from the instance
        type.
        """
        if self._accelerators is not None:
            return self._accelerators
        return None

    @property
    def disk_size(self) -> int:
        return self._disk_size

    @property
    def image_id(self) -> Optional[str]:
        return self._image_id

    @property
    def labels(self) -> Optional[Dict[str, str]]:
        return self._labels

    @property
    def cluster_config_overrides(self) -> Dict[str, Any]:
        if self._cluster_config_overrides is None:
            return {}
        return self._cluster_config_overrides

    def _set_cpus(
        self,
        cpus: Union[None, int, float, str],
    ) -> None:
        if cpus is None:
            self._cpus = None
            return

        self._cpus = str(cpus)
        if isinstance(cpus, str):
            if cpus.endswith('+'):
                num_cpus_str = cpus[:-1]
            else:
                num_cpus_str = cpus

            try:
                num_cpus = float(num_cpus_str)
            except ValueError:
                with ux_utils.print_exception_no_traceback():
                    raise ValueError(
                        f'The "cpus" field should be either a number or '
                        f'a string "<number>+". Found: {cpus!r}'
                    ) from None
        else:
            num_cpus = float(cpus)

        if num_cpus <= 0:
            with ux_utils.print_exception_no_traceback():
                raise ValueError(
                    f'The "cpus" field should be positive. Found: {cpus!r}'
                )

    def _set_memory(
        self,
        memory: Union[None, int, float, str],
    ) -> None:
        if memory is None:
            self._memory = None
            return

        self._memory = str(memory)
        if isinstance(memory, str):
            if memory.endswith(('+', 'x')):
                # 'x' is used internally for make sure our resources used by
                # jobs controller (memory: 3x) to have enough memory based on
                # the vCPUs.
                num_memory_gb = memory[:-1]
            else:
                num_memory_gb = memory

            try:
                memory_gb = float(num_memory_gb)
            except ValueError:
                with ux_utils.print_exception_no_traceback():
                    raise ValueError(
                        f'The "memory" field should be either a number or '
                        f'a string "<number>+". Found: {memory!r}'
                    ) from None
        else:
            memory_gb = float(memory)

        if memory_gb <= 0:
            with ux_utils.print_exception_no_traceback():
                raise ValueError(
                    f'The "cpus" field should be positive. Found: {memory!r}'
                )

    def _set_accelerators(
        self,
        accelerators: Union[None, str, Dict[str, int]],
        accelerator_args: Optional[Dict[str, str]] = None,
    ) -> None:
        """Sets accelerators.

        Args:
            accelerators: A string or a dict of accelerator types to counts.
            accelerator_args: (deprecated) A dict of accelerator types to args.
        """
        if accelerators is not None:
            if isinstance(accelerators, str):  # Convert to Dict[str, int].
                if ':' not in accelerators:
                    accelerators = {accelerators: 1}
                else:
                    splits = accelerators.split(':')
                    parse_error = (
                        'The "accelerators" field as a str '
                        'should be <name> or <name>:<cnt>. '
                        f'Found: {accelerators!r}'
                    )
                    if len(splits) != 2:
                        with ux_utils.print_exception_no_traceback():
                            raise ValueError(parse_error)
                    try:
                        num = float(splits[1])
                        num = int(num)
                        accelerators = {splits[0]: num}
                    except ValueError:
                        with ux_utils.print_exception_no_traceback():
                            raise ValueError(parse_error) from None

            # Canonicalize the accelerator names.
            accelerators = {
                accelerator_registry.canonicalize_accelerator_name(acc): acc_count
                for acc, acc_count in accelerators.items()
            }

            acc, _ = list(accelerators.items())[0]

        self._accelerators = accelerators

    def _try_validate_cpus_mem(self) -> None:
        """Try to validate the cpus and memory attributes.

        Raises:
            ValueError: if the attributes are invalid.
        """
        if self._cpus is None and self._memory is None:
            return

    def _try_validate_image_id(self) -> None:
        """Try to validate the image_id attribute.

        Raises:
            ValueError: if the attribute is invalid.
        """
        if self._image_id is None:
            with ux_utils.print_exception_no_traceback():
                raise ValueError(
                    'no image id for the task was specified. You must '
                    'specify an image id for this task (e.g. '
                    '`nvcr.io/nvidia/pytorch:xx.xx-py3`'
                )

    def get_accelerators_str(self) -> str:
        accelerators = self.accelerators
        accel_str = ''
        if accelerators is None:
            accel_str = '-'
        elif isinstance(accelerators, dict) and len(accelerators) == 1:
            accel_name, accel_count = list(accelerators.items())[0]
            accel_str = f'{accel_name}:{accel_count}'
        return accel_str

    def get_completions(self) -> Optional[int]:
        value = self.job_config.get('completions')
        if value is not None:
            value = int(value)
            if value <= 0:
                with ux_utils.print_exception_no_traceback():
                    raise ValueError('completions must be a positive integer')
            return value
        return None

    def get_max_restarts(self) -> Optional[int]:
        value = self.job_config.get('max_restarts')
        if value is not None:
            value = int(value)
            if value < 0:
                with ux_utils.print_exception_no_traceback():
                    raise ValueError('max_restarts must be a non-negative integer')
            return value
        return None

    def get_accelerator_type(self) -> Optional[str]:
        """Returns the first accelerator type from the accelerators dict.

        Returns:
            The accelerator type (e.g., 'V100', 'A100') or None if no accelerators
        """
        if self.accelerators is None or not self.accelerators:
            return None
        return next(iter(self.accelerators.keys()))  # type: ignore

    def get_accelerator_count(self) -> Optional[int]:
        """Returns the count of the first accelerator type from the accelerators dict.

        Returns:
            The accelerator count (e.g., 1, 2) or None if no accelerators
        """
        if self.accelerators is None or not self.accelerators:
            return None
        return next(iter(self.accelerators.values()))  # type: ignore

    def copy(self, **override) -> 'Resources':
        """Returns a copy of the given Resources."""
        # used to prevent double validation of image (would happen from overrides)
        new_image_id = override.pop('image_id', self.image_id)
        resources = Resources(
            cloud=override.pop('cloud', self.cloud),
            cpus=override.pop('cpus', self._cpus),
            memory=override.pop('memory', self.memory),
            accelerators=override.pop('accelerators', self.accelerators),
            disk_size=override.pop('disk_size', self.disk_size),
            image_id=new_image_id,
            labels=override.pop('labels', self.labels),
            job_config=override.pop('job_config', self.job_config),
            # used to prevent double validation of image (would happen from overrides)
            _validate_image=(new_image_id != self.image_id),
        )
        assert len(override) == 0
        return resources

    @classmethod
    def from_yaml_config(cls, config: Optional[Dict[str, Any]]) -> 'Resources':
        if config is None:
            return Resources()
        common_utils.validate_schema(
            config, schemas.get_resources_schema(), 'Invalid resources YAML: '
        )

        if config.get('job_config', None):
            common_utils.validate_schema(
                config['job_config'],
                schemas.get_job_schema(),
                'Invalid job config YAML: ',
            )

        def _override_resources(
            base_resource_config: Dict[str, Any], override_configs: List[Dict[str, Any]]
        ) -> List[Resources]:
            resources_list = []
            for override_config in override_configs:
                new_resource_config = base_resource_config.copy()
                # Labels are handled separately.
                override_labels = override_config.pop('labels', None)
                new_resource_config.update(override_config)

                # Update the labels with the override labels.
                labels = new_resource_config.get('labels', None)
                if labels is not None and override_labels is not None:
                    labels.update(override_labels)
                elif override_labels is not None:
                    labels = override_labels
                new_resource_config['labels'] = labels

                # Call from_yaml_config again instead of
                # _from_yaml_config_single to handle the case, where both
                # multiple accelerators and `any_of` is specified.
                # This will not cause infinite recursion because we have made
                # sure that `any_of` and `ordered` cannot be specified in the
                # resource candidates in `any_of` or `ordered`, by the schema
                # validation above.
                resources_list.extend([Resources.from_yaml_config(new_resource_config)])

            return resources_list

        config = config.copy()

        return Resources._from_yaml_config_single(config)

    @classmethod
    def _from_yaml_config_single(cls, config: Dict[str, str]) -> 'Resources':
        resources_fields: Dict[str, Any] = {}
        resources_fields['cpus'] = config.pop('cpus', None)
        resources_fields['memory'] = config.pop('memory', None)
        resources_fields['accelerators'] = config.pop('accelerators', None)
        resources_fields['disk_size'] = config.pop('disk_size', None)
        resources_fields['image_id'] = config.pop('image_id', None)
        resources_fields['labels'] = config.pop('labels', None)
        resources_fields['job_config'] = config.pop('job_config', None)

        if resources_fields['cpus'] is not None:
            resources_fields['cpus'] = str(resources_fields['cpus'])
        if resources_fields['memory'] is not None:
            resources_fields['memory'] = str(resources_fields['memory'])
        # TODO(asaiacai): should we remove disk size
        # since we aren't letting users set this at the host level?
        if resources_fields['disk_size'] is not None:
            resources_fields['disk_size'] = int(resources_fields['disk_size'])

        assert not config, f'Invalid resource args: {config.keys()}'
        return Resources(**resources_fields)

    def to_yaml_config(self) -> Dict[str, Union[str, int]]:
        """Returns a yaml-style dict of config for this resource bundle."""
        config = {}

        def add_if_not_none(key, value):
            if value is not None and value != 'None':
                config[key] = value

        add_if_not_none('cloud', str(self.cloud))
        add_if_not_none('cpus', self._cpus)
        add_if_not_none('memory', self.memory)
        add_if_not_none('accelerators', self.accelerators)

        add_if_not_none('disk_size', self.disk_size)
        add_if_not_none('image_id', self.image_id)
        add_if_not_none('labels', self.labels)
        add_if_not_none('job_config', self.job_config)
        return config
konduktor/serving.py
ADDED
@@ -0,0 +1,153 @@
# Proprietary Changes made for Trainy under the Trainy Software License
# Original source: skypilot: https://github.com/skypilot-org/skypilot
# which is Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Serving: configuration for long-running serving deployments."""

from typing import Any, Dict, Optional, Union

from konduktor import logging
from konduktor.utils import common_utils, schemas, ux_utils

logger = logging.get_logger(__name__)


class Serving:
    """Serving: configuration for deployments.

    Immutable once created. Use `copy()` to create a modified copy.

    Used:
    * to represent serving config in tasks
    """

    _VERSION = 1

    def __init__(
        self,
        min_replicas: Optional[int] = None,
        max_replicas: Optional[int] = None,
        ports: Optional[int] = 8000,
        probe: Optional[str] = '/health',
    ):
        self._version = self._VERSION

        if min_replicas is None and max_replicas is None:
            with ux_utils.print_exception_no_traceback():
                raise ValueError(
                    'At least one of min_replicas or ' 'max_replicas must be specified.'
                )

        if min_replicas is None:
            min_replicas = max_replicas
        if max_replicas is None:
            # Edge case: if min_replicas is 0, set max_replicas to 1
            if min_replicas == 0:
                max_replicas = 1
            else:
                max_replicas = min_replicas

        if min_replicas is not None and min_replicas < 0:
            with ux_utils.print_exception_no_traceback():
                raise ValueError('min_replicas must be >= 0')

        if (
            max_replicas is not None
            and min_replicas is not None
            and max_replicas < min_replicas
        ):
            with ux_utils.print_exception_no_traceback():
                raise ValueError(
                    f'max_replicas ({max_replicas}) must '
                    f'be >= min_replicas ({min_replicas})'
                )

        self._min_replicas = min_replicas
        self._max_replicas = max_replicas
        self._ports = ports
        self._probe = probe

    @property
    def min_replicas(self) -> int:
        assert self._min_replicas is not None
        return self._min_replicas

    @property
    def max_replicas(self) -> int:
        assert self._max_replicas is not None
        return self._max_replicas

    @property
    def ports(self) -> int:
        assert self._ports is not None
        return self._ports

    @property
    def probe(self) -> Optional[str]:
        return self._probe

    def get(self, key: str, default=None):
        return {
            'min_replicas': self._min_replicas,
            'max_replicas': self._max_replicas,
            'ports': self._ports,
            'probe': self._probe,
        }.get(key, default)

    def copy(self, **override) -> 'Serving':
        """Returns a copy of this Serving with fields overridden."""
        return Serving(
            min_replicas=override.pop('min_replicas', self._min_replicas),
            max_replicas=override.pop('max_replicas', self._max_replicas),
            ports=override.pop('ports', self._ports),
            probe=override.pop('probe', self._probe),
        )

    @classmethod
    def from_yaml_config(
        cls, config: Optional[Dict[str, Any]], task_run: Optional[str] = None
    ) -> Optional['Serving']:
        if config is None:
            return None
        common_utils.validate_schema(
            config,
            schemas.get_serving_schema(),
            'Invalid serving config YAML: ',
        )

        if 'min_replicas' not in config and 'max_replicas' not in config:
            raise ValueError(
                'At least one of min_replicas or '
                'max_replicas must be specified in serving'
            )

        # Determine default probe based on deployment type
        default_probe = None  # No probing by default for general deployments
        if task_run and 'vllm.entrypoints.openai.api_server' in task_run:
            default_probe = '/health'  # Aibrix deployments get /health by default

        return cls(
            min_replicas=config.get('min_replicas', None),
            max_replicas=config.get('max_replicas', None),
            ports=config.get('ports', 8000),
            probe=config.get('probe', default_probe),
        )

    def to_yaml_config(self) -> Dict[str, Union[int, str]]:
        config: Dict[str, Union[int, str]] = {
            'min_replicas': self._min_replicas if self._min_replicas is not None else 1,
            'max_replicas': self._max_replicas if self._max_replicas is not None else 1,
            'ports': self._ports if self._ports is not None else 8000,
        }
        # Only include probe if it's not None
        if self._probe is not None:
            config['probe'] = self._probe
        return config