konduktor-nightly 0.1.0.dev20251128104812__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (107)
  1. konduktor/__init__.py +49 -0
  2. konduktor/adaptors/__init__.py +0 -0
  3. konduktor/adaptors/aws.py +221 -0
  4. konduktor/adaptors/common.py +118 -0
  5. konduktor/adaptors/gcp.py +126 -0
  6. konduktor/authentication.py +124 -0
  7. konduktor/backends/__init__.py +6 -0
  8. konduktor/backends/backend.py +86 -0
  9. konduktor/backends/constants.py +21 -0
  10. konduktor/backends/deployment.py +204 -0
  11. konduktor/backends/deployment_utils.py +1351 -0
  12. konduktor/backends/jobset.py +225 -0
  13. konduktor/backends/jobset_utils.py +726 -0
  14. konduktor/backends/pod_utils.py +501 -0
  15. konduktor/check.py +184 -0
  16. konduktor/cli.py +1945 -0
  17. konduktor/config.py +420 -0
  18. konduktor/constants.py +36 -0
  19. konduktor/controller/__init__.py +0 -0
  20. konduktor/controller/constants.py +56 -0
  21. konduktor/controller/launch.py +44 -0
  22. konduktor/controller/node.py +116 -0
  23. konduktor/controller/parse.py +111 -0
  24. konduktor/dashboard/README.md +30 -0
  25. konduktor/dashboard/backend/main.py +169 -0
  26. konduktor/dashboard/backend/sockets.py +154 -0
  27. konduktor/dashboard/frontend/.eslintrc.json +3 -0
  28. konduktor/dashboard/frontend/.gitignore +36 -0
  29. konduktor/dashboard/frontend/app/api/jobs/route.js +71 -0
  30. konduktor/dashboard/frontend/app/api/namespaces/route.js +69 -0
  31. konduktor/dashboard/frontend/app/components/Grafana.jsx +66 -0
  32. konduktor/dashboard/frontend/app/components/JobsData.jsx +197 -0
  33. konduktor/dashboard/frontend/app/components/LogsData.jsx +139 -0
  34. konduktor/dashboard/frontend/app/components/NavMenu.jsx +39 -0
  35. konduktor/dashboard/frontend/app/components/NavTabs.jsx +73 -0
  36. konduktor/dashboard/frontend/app/components/NavTabs2.jsx +30 -0
  37. konduktor/dashboard/frontend/app/components/SelectBtn.jsx +27 -0
  38. konduktor/dashboard/frontend/app/components/lib/utils.js +6 -0
  39. konduktor/dashboard/frontend/app/components/ui/chip-select.jsx +78 -0
  40. konduktor/dashboard/frontend/app/components/ui/input.jsx +19 -0
  41. konduktor/dashboard/frontend/app/components/ui/navigation-menu.jsx +104 -0
  42. konduktor/dashboard/frontend/app/components/ui/select.jsx +120 -0
  43. konduktor/dashboard/frontend/app/favicon.ico +0 -0
  44. konduktor/dashboard/frontend/app/globals.css +120 -0
  45. konduktor/dashboard/frontend/app/jobs/page.js +10 -0
  46. konduktor/dashboard/frontend/app/layout.js +22 -0
  47. konduktor/dashboard/frontend/app/logs/page.js +11 -0
  48. konduktor/dashboard/frontend/app/page.js +12 -0
  49. konduktor/dashboard/frontend/jsconfig.json +7 -0
  50. konduktor/dashboard/frontend/next.config.mjs +4 -0
  51. konduktor/dashboard/frontend/package-lock.json +6687 -0
  52. konduktor/dashboard/frontend/package.json +37 -0
  53. konduktor/dashboard/frontend/postcss.config.mjs +8 -0
  54. konduktor/dashboard/frontend/server.js +64 -0
  55. konduktor/dashboard/frontend/tailwind.config.js +17 -0
  56. konduktor/data/__init__.py +9 -0
  57. konduktor/data/aws/__init__.py +15 -0
  58. konduktor/data/aws/s3.py +1138 -0
  59. konduktor/data/constants.py +7 -0
  60. konduktor/data/data_utils.py +268 -0
  61. konduktor/data/gcp/__init__.py +19 -0
  62. konduktor/data/gcp/constants.py +42 -0
  63. konduktor/data/gcp/gcs.py +994 -0
  64. konduktor/data/gcp/utils.py +9 -0
  65. konduktor/data/registry.py +19 -0
  66. konduktor/data/storage.py +812 -0
  67. konduktor/data/storage_utils.py +535 -0
  68. konduktor/execution.py +447 -0
  69. konduktor/kube_client.py +237 -0
  70. konduktor/logging.py +111 -0
  71. konduktor/manifests/aibrix-setup.yaml +430 -0
  72. konduktor/manifests/apoxy-setup.yaml +184 -0
  73. konduktor/manifests/apoxy-setup2.yaml +98 -0
  74. konduktor/manifests/controller_deployment.yaml +69 -0
  75. konduktor/manifests/dashboard_deployment.yaml +131 -0
  76. konduktor/manifests/dmesg_daemonset.yaml +57 -0
  77. konduktor/manifests/pod_cleanup_controller.yaml +129 -0
  78. konduktor/resource.py +546 -0
  79. konduktor/serving.py +153 -0
  80. konduktor/task.py +949 -0
  81. konduktor/templates/deployment.yaml.j2 +191 -0
  82. konduktor/templates/jobset.yaml.j2 +43 -0
  83. konduktor/templates/pod.yaml.j2 +563 -0
  84. konduktor/usage/__init__.py +0 -0
  85. konduktor/usage/constants.py +21 -0
  86. konduktor/utils/__init__.py +0 -0
  87. konduktor/utils/accelerator_registry.py +17 -0
  88. konduktor/utils/annotations.py +62 -0
  89. konduktor/utils/base64_utils.py +95 -0
  90. konduktor/utils/common_utils.py +426 -0
  91. konduktor/utils/constants.py +5 -0
  92. konduktor/utils/env_options.py +55 -0
  93. konduktor/utils/exceptions.py +234 -0
  94. konduktor/utils/kubernetes_enums.py +8 -0
  95. konduktor/utils/kubernetes_utils.py +763 -0
  96. konduktor/utils/log_utils.py +467 -0
  97. konduktor/utils/loki_utils.py +102 -0
  98. konduktor/utils/rich_utils.py +123 -0
  99. konduktor/utils/schemas.py +625 -0
  100. konduktor/utils/subprocess_utils.py +273 -0
  101. konduktor/utils/ux_utils.py +247 -0
  102. konduktor/utils/validator.py +461 -0
  103. konduktor_nightly-0.1.0.dev20251128104812.dist-info/LICENSE +91 -0
  104. konduktor_nightly-0.1.0.dev20251128104812.dist-info/METADATA +98 -0
  105. konduktor_nightly-0.1.0.dev20251128104812.dist-info/RECORD +107 -0
  106. konduktor_nightly-0.1.0.dev20251128104812.dist-info/WHEEL +4 -0
  107. konduktor_nightly-0.1.0.dev20251128104812.dist-info/entry_points.txt +3 -0
konduktor/resource.py ADDED
@@ -0,0 +1,546 @@
+ # Proprietary Changes made for Trainy under the Trainy Software License
+ # Original source: skypilot: https://github.com/skypilot-org/skypilot
+ # which is Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ # http://www.apache.org/licenses/LICENSE-2.0
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ """Resources: compute requirements of Tasks."""
+
+ import functools
+ from typing import Any, Dict, List, Optional, Union
+
+ from konduktor import logging
+ from konduktor.utils import (
+     accelerator_registry,
+     common_utils,
+     schemas,
+     ux_utils,
+     validator,
+ )
+
+ logger = logging.get_logger(__name__)
+
+ _DEFAULT_DISK_SIZE_GB = 256
+
+
+ class Resources:
+     """Resources: compute requirements of Tasks.
+
+     This class is immutable once created (to ensure some validations are done
+     whenever properties change). To update the property of an instance of
+     Resources, use `resources.copy(**new_properties)`.
+
+     Used:
+
+     * for representing resource requests for tasks
+
+     """
+
+     # If any fields changed, increment the version. For backward compatibility,
+     # modify the __setstate__ method to handle the old version.
+     _VERSION = 1
+
+     def __init__(
+         self,
+         cloud: Optional[Any] = None,
+         cpus: Union[None, int, float, str] = None,
+         memory: Union[None, int, float, str] = None,
+         accelerators: Optional[str] = None,
+         image_id: Union[str, None] = None,
+         disk_size: Optional[int] = None,
+         labels: Optional[Dict[str, str]] = None,
+         job_config: Optional[Dict[str, Union[int, str]]] = None,
+         # Internal use only.
+         # pylint: disable=invalid-name
+         _cluster_config_overrides: Optional[Dict[str, Any]] = None,
+         # used to prevent double validation of image (would happen from overrides)
+         _validate_image: bool = True,
+     ):
+         """Initialize a Resources object.
+
+         All fields are optional. ``Resources.is_launchable`` decides whether
+         the Resources is fully specified to launch an instance.
+
+         Examples:
+           .. code-block:: python
+
+             # Specifying required resources; the system decides the
+             # cloud/instance type. The below are equivalent:
+             konduktor.Resources(accelerators='V100')
+             konduktor.Resources(accelerators='V100:1')
+             konduktor.Resources(accelerators={'V100': 1})
+             konduktor.Resources(cpus='2+', memory='16+', accelerators='V100')
+
+         Args:
+           cloud: the cloud to use. (deprecated) all jobs are submitted to k8s
+           instance_type: the instance type to use.
+           cpus: the number of CPUs required for the task.
+             If a str, must be a string of the form ``'2'`` or ``'2+'``, where
+             the ``+`` indicates that the task requires at least 2 CPUs.
+           memory: the amount of memory in GiB required. If a
+             str, must be a string of the form ``'16'`` or ``'16+'``, where
+             the ``+`` indicates that the task requires at least 16 GB of memory.
+           accelerators: the accelerators required. If a str, must be
+             a string of the form ``'V100'`` or ``'V100:2'``, where the ``:2``
+             indicates that the task requires 2 V100 GPUs. If a dict, must be a
+             dict of the form ``{'V100': 2}`` or ``{'tpu-v2-8': 1}``.
+
+           image_id: docker image to use
+
+           disk_size: the size of the OS disk in GiB.
+           labels: the labels to apply to the instance. These are useful for
+             assigning metadata that may be used by external tools.
+             Implementation depends on the chosen cloud - On AWS, labels map to
+             instance tags. On GCP, labels map to instance labels. On
+             Kubernetes, labels map to pod labels. On other clouds, labels are
+             not supported and will be ignored.
+           job_config: the configuration of the job spec
+         Raises:
+           ValueError: if some attributes are invalid.
+           exceptions.NoCloudAccessError: if no public cloud is enabled.
+         """
+         self._version = self._VERSION
+         if cloud is not None:
+             raise ValueError('cloud specified, but all jobs are submitted to k8s')
+         self._cloud = cloud
+
+         if disk_size is not None:
+             if round(disk_size) != disk_size:
+                 with ux_utils.print_exception_no_traceback():
+                     raise ValueError(
+                         f'OS disk size must be an integer. Got: {disk_size}.'
+                     )
+             self._disk_size = int(disk_size)
+         else:
+             self._disk_size = _DEFAULT_DISK_SIZE_GB
+
+         # self._image_id is a dict of {region: image_id}.
+         # The key is None if the same image_id applies for all regions.
+         self._image_id = image_id
+         if isinstance(image_id, str):
+             self._image_id = image_id.strip()
+         # Validate Docker image format and existence
+         if _validate_image:
+             validator.validate_and_warn_image(self._image_id, 'task')
+
+         self._labels = labels
+         self._cluster_config_overrides = _cluster_config_overrides
+
+         self._set_cpus(cpus)
+         self._set_memory(memory)
+         self._set_accelerators(accelerators)
+         self.job_config = job_config or {}
+
+         # TODO: move these out of init to prevent repeated calls.
+         self._try_validate_cpus_mem()
+         self._try_validate_image_id()
+
+     def __repr__(self) -> str:
+         """Returns a string representation for display.
+
+         Examples:
+
+             >>> konduktor.Resources(accelerators='V100')
+             <Kubernetes>({'V100': 1})
+
+         """
+         accelerators = ''
+         if self.accelerators is not None:
+             accelerators = f', {self.accelerators}'
+
+         cpus = ''
+         if self._cpus is not None:
+             cpus = f', cpus={self._cpus}'
+
+         memory = ''
+         if self.memory is not None:
+             memory = f', mem={self.memory}'
+
+         image_id = ''
+         if self.image_id is not None:
+             image_id = f', image_id={self.image_id}'
+         else:
+             with ux_utils.print_exception_no_traceback():
+                 raise ValueError(
+                     'no image id for the task was specified. You must '
+                     'specify an image id for this task (e.g. '
+                     '`nvcr.io/nvidia/pytorch:xx.xx-py3`)'
+                 )
+
+         disk_size = ''
+         if self.disk_size != _DEFAULT_DISK_SIZE_GB:
+             disk_size = f', disk_size={self.disk_size}'
+
+         # Do not show region/zone here as `konduktor status -a` would show them as
+         # separate columns. Also, Resources repr will be printed during
+         # failover, and the region may be dynamically determined.
+         hardware_str = f'{cpus}{memory}{accelerators}{image_id}' f'{disk_size}'
+         # It may have leading ',' (for example, instance_type not set) or empty
+         # spaces. Remove them.
+         while hardware_str and hardware_str[0] in (',', ' '):
+             hardware_str = hardware_str[1:]
+
+         return f'({hardware_str})'
+
+     @property
+     def cloud(self):
+         return self._cloud
+
+     @property
+     @functools.lru_cache(maxsize=1)
+     def cpus(self) -> Optional[str]:
+         """Returns the number of vCPUs that each instance must have.
+
+         For example, cpus='4' means each instance must have exactly 4 vCPUs,
+         and cpus='4+' means each instance must have at least 4 vCPUs.
+
+         (Developer note: The cpus field is only used to select the instance type
+         at launch time. Thus, Resources in the backend's ResourceHandle will
+         always have the cpus field set to None.)
+         """
+         if self._cpus is not None:
+             return self._cpus
+         return None
+
+     @property
+     def memory(self) -> Optional[str]:
+         """Returns the memory that each instance must have in GB.
+
+         For example, memory='16' means each instance must have exactly 16GB
+         memory; memory='16+' means each instance must have at least 16GB
+         memory.
+
+         (Developer note: The memory field is only used to select the instance
+         type at launch time. Thus, Resources in the backend's ResourceHandle
+         will always have the memory field set to None.)
+         """
+         return self._memory
+
+     @property
+     @functools.lru_cache(maxsize=1)
+     def accelerators(self) -> Optional[Dict[str, int]]:
+         """Returns the accelerators field directly or by inferring.
+
+         For example, Resources(AWS, 'p3.2xlarge') has its accelerators field
+         set to None, but this function will infer {'V100': 1} from the instance
+         type.
+         """
+         if self._accelerators is not None:
+             return self._accelerators
+         return None
+
+     @property
+     def disk_size(self) -> int:
+         return self._disk_size
+
+     @property
+     def image_id(self) -> Optional[str]:
+         return self._image_id
+
+     @property
+     def labels(self) -> Optional[Dict[str, str]]:
+         return self._labels
+
+     @property
+     def cluster_config_overrides(self) -> Dict[str, Any]:
+         if self._cluster_config_overrides is None:
+             return {}
+         return self._cluster_config_overrides
+
+     def _set_cpus(
+         self,
+         cpus: Union[None, int, float, str],
+     ) -> None:
+         if cpus is None:
+             self._cpus = None
+             return
+
+         self._cpus = str(cpus)
+         if isinstance(cpus, str):
+             if cpus.endswith('+'):
+                 num_cpus_str = cpus[:-1]
+             else:
+                 num_cpus_str = cpus
+
+             try:
+                 num_cpus = float(num_cpus_str)
+             except ValueError:
+                 with ux_utils.print_exception_no_traceback():
+                     raise ValueError(
+                         f'The "cpus" field should be either a number or '
+                         f'a string "<number>+". Found: {cpus!r}'
+                     ) from None
+         else:
+             num_cpus = float(cpus)
+
+         if num_cpus <= 0:
+             with ux_utils.print_exception_no_traceback():
+                 raise ValueError(
+                     f'The "cpus" field should be positive. Found: {cpus!r}'
+                 )
+
+     def _set_memory(
+         self,
+         memory: Union[None, int, float, str],
+     ) -> None:
+         if memory is None:
+             self._memory = None
+             return
+
+         self._memory = str(memory)
+         if isinstance(memory, str):
+             if memory.endswith(('+', 'x')):
+                 # 'x' is used internally to make sure the resources used by the
+                 # jobs controller (memory: 3x) have enough memory based on
+                 # the vCPUs.
+                 num_memory_gb = memory[:-1]
+             else:
+                 num_memory_gb = memory
+
+             try:
+                 memory_gb = float(num_memory_gb)
+             except ValueError:
+                 with ux_utils.print_exception_no_traceback():
+                     raise ValueError(
+                         f'The "memory" field should be either a number or '
+                         f'a string "<number>+". Found: {memory!r}'
+                     ) from None
+         else:
+             memory_gb = float(memory)
+
+         if memory_gb <= 0:
+             with ux_utils.print_exception_no_traceback():
+                 raise ValueError(
+                     f'The "memory" field should be positive. Found: {memory!r}'
+                 )
+
+     def _set_accelerators(
+         self,
+         accelerators: Union[None, str, Dict[str, int]],
+         accelerator_args: Optional[Dict[str, str]] = None,
+     ) -> None:
+         """Sets accelerators.
+
+         Args:
+             accelerators: A string or a dict of accelerator types to counts.
+             accelerator_args: (deprecated) A dict of accelerator types to args.
+         """
+         if accelerators is not None:
+             if isinstance(accelerators, str):  # Convert to Dict[str, int].
+                 if ':' not in accelerators:
+                     accelerators = {accelerators: 1}
+                 else:
+                     splits = accelerators.split(':')
+                     parse_error = (
+                         'The "accelerators" field as a str '
+                         'should be <name> or <name>:<cnt>. '
+                         f'Found: {accelerators!r}'
+                     )
+                     if len(splits) != 2:
+                         with ux_utils.print_exception_no_traceback():
+                             raise ValueError(parse_error)
+                     try:
+                         num = float(splits[1])
+                         num = int(num)
+                         accelerators = {splits[0]: num}
+                     except ValueError:
+                         with ux_utils.print_exception_no_traceback():
+                             raise ValueError(parse_error) from None
+
+             # Canonicalize the accelerator names.
+             accelerators = {
+                 accelerator_registry.canonicalize_accelerator_name(acc): acc_count
+                 for acc, acc_count in accelerators.items()
+             }
+
+             acc, _ = list(accelerators.items())[0]
+
+         self._accelerators = accelerators
+
+     def _try_validate_cpus_mem(self) -> None:
+         """Try to validate the cpus and memory attributes.
+
+         Raises:
+             ValueError: if the attributes are invalid.
+         """
+         if self._cpus is None and self._memory is None:
+             return
+
+     def _try_validate_image_id(self) -> None:
+         """Try to validate the image_id attribute.
+
+         Raises:
+             ValueError: if the attribute is invalid.
+         """
+         if self._image_id is None:
+             with ux_utils.print_exception_no_traceback():
+                 raise ValueError(
+                     'no image id for the task was specified. You must '
+                     'specify an image id for this task (e.g. '
+                     '`nvcr.io/nvidia/pytorch:xx.xx-py3`)'
+                 )
+
+     def get_accelerators_str(self) -> str:
+         accelerators = self.accelerators
+         accel_str = ''
+         if accelerators is None:
+             accel_str = '-'
+         elif isinstance(accelerators, dict) and len(accelerators) == 1:
+             accel_name, accel_count = list(accelerators.items())[0]
+             accel_str = f'{accel_name}:{accel_count}'
+         return accel_str
+
+     def get_completions(self) -> Optional[int]:
+         value = self.job_config.get('completions')
+         if value is not None:
+             value = int(value)
+             if value <= 0:
+                 with ux_utils.print_exception_no_traceback():
+                     raise ValueError('completions must be a positive integer')
+             return value
+         return None
+
+     def get_max_restarts(self) -> Optional[int]:
+         value = self.job_config.get('max_restarts')
+         if value is not None:
+             value = int(value)
+             if value < 0:
+                 with ux_utils.print_exception_no_traceback():
+                     raise ValueError('max_restarts must be a non-negative integer')
+             return value
+         return None
+
+     def get_accelerator_type(self) -> Optional[str]:
+         """Returns the first accelerator type from the accelerators dict.
+
+         Returns:
+             The accelerator type (e.g., 'V100', 'A100') or None if no accelerators
+         """
+         if self.accelerators is None or not self.accelerators:
+             return None
+         return next(iter(self.accelerators.keys()))  # type: ignore
+
+     def get_accelerator_count(self) -> Optional[int]:
+         """Returns the count of the first accelerator type from the accelerators dict.
+
+         Returns:
+             The accelerator count (e.g., 1, 2) or None if no accelerators
+         """
+         if self.accelerators is None or not self.accelerators:
+             return None
+         return next(iter(self.accelerators.values()))  # type: ignore
+
+     def copy(self, **override) -> 'Resources':
+         """Returns a copy of the given Resources."""
+         # used to prevent double validation of image (would happen from overrides)
+         new_image_id = override.pop('image_id', self.image_id)
+         resources = Resources(
+             cloud=override.pop('cloud', self.cloud),
+             cpus=override.pop('cpus', self._cpus),
+             memory=override.pop('memory', self.memory),
+             accelerators=override.pop('accelerators', self.accelerators),
+             disk_size=override.pop('disk_size', self.disk_size),
+             image_id=new_image_id,
+             labels=override.pop('labels', self.labels),
+             job_config=override.pop('job_config', self.job_config),
+             # used to prevent double validation of image (would happen from overrides)
+             _validate_image=(new_image_id != self.image_id),
+         )
+         assert len(override) == 0
+         return resources
+
+     @classmethod
+     def from_yaml_config(cls, config: Optional[Dict[str, Any]]) -> 'Resources':
+         if config is None:
+             return Resources()
+         common_utils.validate_schema(
+             config, schemas.get_resources_schema(), 'Invalid resources YAML: '
+         )
+
+         if config.get('job_config', None):
+             common_utils.validate_schema(
+                 config['job_config'],
+                 schemas.get_job_schema(),
+                 'Invalid job config YAML: ',
+             )
+
+         def _override_resources(
+             base_resource_config: Dict[str, Any], override_configs: List[Dict[str, Any]]
+         ) -> List[Resources]:
+             resources_list = []
+             for override_config in override_configs:
+                 new_resource_config = base_resource_config.copy()
+                 # Labels are handled separately.
+                 override_labels = override_config.pop('labels', None)
+                 new_resource_config.update(override_config)
+
+                 # Update the labels with the override labels.
+                 labels = new_resource_config.get('labels', None)
+                 if labels is not None and override_labels is not None:
+                     labels.update(override_labels)
+                 elif override_labels is not None:
+                     labels = override_labels
+                 new_resource_config['labels'] = labels
+
+                 # Call from_yaml_config again instead of
+                 # _from_yaml_config_single to handle the case, where both
+                 # multiple accelerators and `any_of` is specified.
+                 # This will not cause infinite recursion because we have made
+                 # sure that `any_of` and `ordered` cannot be specified in the
+                 # resource candidates in `any_of` or `ordered`, by the schema
+                 # validation above.
+                 resources_list.extend([Resources.from_yaml_config(new_resource_config)])
+
+             return resources_list
+
+         config = config.copy()
+
+         return Resources._from_yaml_config_single(config)
+
+     @classmethod
+     def _from_yaml_config_single(cls, config: Dict[str, str]) -> 'Resources':
+         resources_fields: Dict[str, Any] = {}
+         resources_fields['cpus'] = config.pop('cpus', None)
+         resources_fields['memory'] = config.pop('memory', None)
+         resources_fields['accelerators'] = config.pop('accelerators', None)
+         resources_fields['disk_size'] = config.pop('disk_size', None)
+         resources_fields['image_id'] = config.pop('image_id', None)
+         resources_fields['labels'] = config.pop('labels', None)
+         resources_fields['job_config'] = config.pop('job_config', None)
+
+         if resources_fields['cpus'] is not None:
+             resources_fields['cpus'] = str(resources_fields['cpus'])
+         if resources_fields['memory'] is not None:
+             resources_fields['memory'] = str(resources_fields['memory'])
+         # TODO(asaiacai): should we remove disk size
+         # since we aren't letting users set this at the host level?
+         if resources_fields['disk_size'] is not None:
+             resources_fields['disk_size'] = int(resources_fields['disk_size'])
+
+         assert not config, f'Invalid resource args: {config.keys()}'
+         return Resources(**resources_fields)
+
+     def to_yaml_config(self) -> Dict[str, Union[str, int]]:
+         """Returns a yaml-style dict of config for this resource bundle."""
+         config = {}
+
+         def add_if_not_none(key, value):
+             if value is not None and value != 'None':
+                 config[key] = value
+
+         add_if_not_none('cloud', str(self.cloud))
+         add_if_not_none('cpus', self._cpus)
+         add_if_not_none('memory', self.memory)
+         add_if_not_none('accelerators', self.accelerators)
+
+         add_if_not_none('disk_size', self.disk_size)
+         add_if_not_none('image_id', self.image_id)
+         add_if_not_none('labels', self.labels)
+         add_if_not_none('job_config', self.job_config)
+         return config
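
For orientation, a minimal usage sketch of the Resources class added above. It is illustrative only and not part of the package: the accelerator name, image tag, and job_config values are placeholders, and it assumes the top-level package re-exports Resources, as the class docstring's konduktor.Resources(...) examples suggest (importing from konduktor.resource also matches this diff).

    import konduktor  # assumed re-export; `from konduktor.resource import Resources` also fits this diff

    # Accelerator name, image tag, and job_config values are illustrative placeholders.
    res = konduktor.Resources(
        cpus='2+',
        memory='16+',
        accelerators='H100:8',
        image_id='nvcr.io/nvidia/pytorch:24.01-py3',
        job_config={'completions': 1, 'max_restarts': 3},
    )
    print(res.get_accelerator_type())   # e.g. 'H100' (after canonicalization)
    print(res.get_accelerator_count())  # 8
    print(res.get_max_restarts())       # 3

    # Resources is immutable; derive variants with copy().
    smaller = res.copy(accelerators='H100:4')
    print(smaller.to_yaml_config())

Note that __init__ validates the image via konduktor.utils.validator, so constructing a Resources object may emit a warning if the image reference cannot be verified.
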
konduktor/serving.py ADDED
@@ -0,0 +1,153 @@
+ # Proprietary Changes made for Trainy under the Trainy Software License
+ # Original source: skypilot: https://github.com/skypilot-org/skypilot
+ # which is Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ # http://www.apache.org/licenses/LICENSE-2.0
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ """Serving: configuration for long-running serving deployments."""
+
+ from typing import Any, Dict, Optional, Union
+
+ from konduktor import logging
+ from konduktor.utils import common_utils, schemas, ux_utils
+
+ logger = logging.get_logger(__name__)
+
+
+ class Serving:
+     """Serving: configuration for deployments.
+
+     Immutable once created. Use `copy()` to create a modified copy.
+
+     Used:
+     * to represent serving config in tasks
+     """
+
+     _VERSION = 1
+
+     def __init__(
+         self,
+         min_replicas: Optional[int] = None,
+         max_replicas: Optional[int] = None,
+         ports: Optional[int] = 8000,
+         probe: Optional[str] = '/health',
+     ):
+         self._version = self._VERSION
+
+         if min_replicas is None and max_replicas is None:
+             with ux_utils.print_exception_no_traceback():
+                 raise ValueError(
+                     'At least one of min_replicas or ' 'max_replicas must be specified.'
+                 )
+
+         if min_replicas is None:
+             min_replicas = max_replicas
+         if max_replicas is None:
+             # Edge case: if min_replicas is 0, set max_replicas to 1
+             if min_replicas == 0:
+                 max_replicas = 1
+             else:
+                 max_replicas = min_replicas
+
+         if min_replicas is not None and min_replicas < 0:
+             with ux_utils.print_exception_no_traceback():
+                 raise ValueError('min_replicas must be >= 0')
+
+         if (
+             max_replicas is not None
+             and min_replicas is not None
+             and max_replicas < min_replicas
+         ):
+             with ux_utils.print_exception_no_traceback():
+                 raise ValueError(
+                     f'max_replicas ({max_replicas}) must '
+                     f'be >= min_replicas ({min_replicas})'
+                 )
+
+         self._min_replicas = min_replicas
+         self._max_replicas = max_replicas
+         self._ports = ports
+         self._probe = probe
+
+     @property
+     def min_replicas(self) -> int:
+         assert self._min_replicas is not None
+         return self._min_replicas
+
+     @property
+     def max_replicas(self) -> int:
+         assert self._max_replicas is not None
+         return self._max_replicas
+
+     @property
+     def ports(self) -> int:
+         assert self._ports is not None
+         return self._ports
+
+     @property
+     def probe(self) -> Optional[str]:
+         return self._probe
+
+     def get(self, key: str, default=None):
+         return {
+             'min_replicas': self._min_replicas,
+             'max_replicas': self._max_replicas,
+             'ports': self._ports,
+             'probe': self._probe,
+         }.get(key, default)
+
+     def copy(self, **override) -> 'Serving':
+         """Returns a copy of this Serving with fields overridden."""
+         return Serving(
+             min_replicas=override.pop('min_replicas', self._min_replicas),
+             max_replicas=override.pop('max_replicas', self._max_replicas),
+             ports=override.pop('ports', self._ports),
+             probe=override.pop('probe', self._probe),
+         )
+
+     @classmethod
+     def from_yaml_config(
+         cls, config: Optional[Dict[str, Any]], task_run: Optional[str] = None
+     ) -> Optional['Serving']:
+         if config is None:
+             return None
+         common_utils.validate_schema(
+             config,
+             schemas.get_serving_schema(),
+             'Invalid serving config YAML: ',
+         )
+
+         if 'min_replicas' not in config and 'max_replicas' not in config:
+             raise ValueError(
+                 'At least one of min_replicas or '
+                 'max_replicas must be specified in serving'
+             )
+
+         # Determine default probe based on deployment type
+         default_probe = None  # No probing by default for general deployments
+         if task_run and 'vllm.entrypoints.openai.api_server' in task_run:
+             default_probe = '/health'  # Aibrix deployments get /health by default
+
+         return cls(
+             min_replicas=config.get('min_replicas', None),
+             max_replicas=config.get('max_replicas', None),
+             ports=config.get('ports', 8000),
+             probe=config.get('probe', default_probe),
+         )
+
+     def to_yaml_config(self) -> Dict[str, Union[int, str]]:
+         config: Dict[str, Union[int, str]] = {
+             'min_replicas': self._min_replicas if self._min_replicas is not None else 1,
+             'max_replicas': self._max_replicas if self._max_replicas is not None else 1,
+             'ports': self._ports if self._ports is not None else 8000,
+         }
+         # Only include probe if it's not None
+         if self._probe is not None:
+             config['probe'] = self._probe
+         return config
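
For orientation, a similar minimal usage sketch for the Serving class added above. It is illustrative only: the replica counts and the vLLM-style task_run command are placeholders, and the import path simply follows the file location in this diff.

    from konduktor.serving import Serving

    serving = Serving.from_yaml_config(
        {'min_replicas': 1, 'max_replicas': 3},
        task_run='python -m vllm.entrypoints.openai.api_server --model my-model',
    )
    print(serving.probe)             # '/health', defaulted because task_run looks like a vLLM/AIBrix server
    print(serving.ports)             # 8000 (default)
    print(serving.to_yaml_config())  # {'min_replicas': 1, 'max_replicas': 3, 'ports': 8000, 'probe': '/health'}
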