konduktor-nightly 0.1.0.dev20250209104336__py3-none-any.whl → 0.1.0.dev20250313070642__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62)
  1. konduktor/__init__.py +16 -6
  2. konduktor/adaptors/__init__.py +0 -0
  3. konduktor/adaptors/common.py +88 -0
  4. konduktor/adaptors/gcp.py +112 -0
  5. konduktor/backends/__init__.py +8 -0
  6. konduktor/backends/backend.py +86 -0
  7. konduktor/backends/jobset.py +218 -0
  8. konduktor/backends/jobset_utils.py +447 -0
  9. konduktor/check.py +192 -0
  10. konduktor/cli.py +790 -0
  11. konduktor/cloud_stores.py +158 -0
  12. konduktor/config.py +420 -0
  13. konduktor/constants.py +36 -0
  14. konduktor/controller/constants.py +6 -6
  15. konduktor/controller/launch.py +3 -3
  16. konduktor/controller/node.py +5 -5
  17. konduktor/controller/parse.py +23 -23
  18. konduktor/dashboard/backend/main.py +57 -57
  19. konduktor/dashboard/backend/sockets.py +19 -19
  20. konduktor/data/__init__.py +9 -0
  21. konduktor/data/constants.py +12 -0
  22. konduktor/data/data_utils.py +223 -0
  23. konduktor/data/gcp/__init__.py +19 -0
  24. konduktor/data/gcp/constants.py +42 -0
  25. konduktor/data/gcp/gcs.py +906 -0
  26. konduktor/data/gcp/utils.py +9 -0
  27. konduktor/data/storage.py +799 -0
  28. konduktor/data/storage_utils.py +500 -0
  29. konduktor/execution.py +444 -0
  30. konduktor/kube_client.py +153 -48
  31. konduktor/logging.py +49 -5
  32. konduktor/manifests/dmesg_daemonset.yaml +8 -0
  33. konduktor/manifests/pod_cleanup_controller.yaml +129 -0
  34. konduktor/resource.py +478 -0
  35. konduktor/task.py +867 -0
  36. konduktor/templates/jobset.yaml.j2 +31 -0
  37. konduktor/templates/pod.yaml.j2 +185 -0
  38. konduktor/usage/__init__.py +0 -0
  39. konduktor/usage/constants.py +21 -0
  40. konduktor/utils/__init__.py +0 -0
  41. konduktor/utils/accelerator_registry.py +21 -0
  42. konduktor/utils/annotations.py +62 -0
  43. konduktor/utils/base64_utils.py +93 -0
  44. konduktor/utils/common_utils.py +393 -0
  45. konduktor/utils/constants.py +5 -0
  46. konduktor/utils/env_options.py +55 -0
  47. konduktor/utils/exceptions.py +226 -0
  48. konduktor/utils/kubernetes_enums.py +8 -0
  49. konduktor/utils/kubernetes_utils.py +652 -0
  50. konduktor/utils/log_utils.py +251 -0
  51. konduktor/utils/loki_utils.py +85 -0
  52. konduktor/utils/rich_utils.py +123 -0
  53. konduktor/utils/schemas.py +581 -0
  54. konduktor/utils/subprocess_utils.py +273 -0
  55. konduktor/utils/ux_utils.py +216 -0
  56. konduktor/utils/validator.py +20 -0
  57. {konduktor_nightly-0.1.0.dev20250209104336.dist-info → konduktor_nightly-0.1.0.dev20250313070642.dist-info}/LICENSE +0 -1
  58. {konduktor_nightly-0.1.0.dev20250209104336.dist-info → konduktor_nightly-0.1.0.dev20250313070642.dist-info}/METADATA +13 -2
  59. konduktor_nightly-0.1.0.dev20250313070642.dist-info/RECORD +94 -0
  60. konduktor_nightly-0.1.0.dev20250209104336.dist-info/RECORD +0 -48
  61. {konduktor_nightly-0.1.0.dev20250209104336.dist-info → konduktor_nightly-0.1.0.dev20250313070642.dist-info}/WHEEL +0 -0
  62. {konduktor_nightly-0.1.0.dev20250209104336.dist-info → konduktor_nightly-0.1.0.dev20250313070642.dist-info}/entry_points.txt +0 -0
konduktor/resource.py ADDED
@@ -0,0 +1,478 @@
1
+ # Proprietary Changes made for Trainy under the Trainy Software License
2
+ # Original source: skypilot: https://github.com/skypilot-org/skypilot
3
+ # which is Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ # http://www.apache.org/licenses/LICENSE-2.0
7
+ # Unless required by applicable law or agreed to in writing, software
8
+ # distributed under the License is distributed on an "AS IS" BASIS,
9
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10
+ # See the License for the specific language governing permissions and
11
+ # limitations under the License.
12
+
13
+ """Resources: compute requirements of Tasks."""
14
+
15
+ import functools
16
+ from typing import Any, Dict, List, Optional, Union
17
+
18
+ from konduktor import logging
19
+ from konduktor.utils import accelerator_registry, common_utils, schemas, ux_utils
20
+
21
+ logger = logging.get_logger(__name__)
22
+
23
+ _DEFAULT_DISK_SIZE_GB = 256
24
+
25
+
26
+ class Resources:
27
+ """Resources: compute requirements of Tasks.
28
+
29
+ This class is immutable once created (to ensure some validations are done
30
+ whenever properties change). To update the property of an instance of
31
+ Resources, use `resources.copy(**new_properties)`.
32
+
33
+ Used:
34
+
35
+ * for representing resource requests for task
36
+
37
+ """
38
+
39
+ # If any fields changed, increment the version. For backward compatibility,
40
+ # modify the __setstate__ method to handle the old version.
41
+ _VERSION = 1
42
+
43
def __init__(
    self,
    cloud: Optional[Any] = None,
    cpus: Union[None, int, float, str] = None,
    memory: Union[None, int, float, str] = None,
    accelerators: Union[None, str, Dict[str, int]] = None,
    image_id: Union[str, None] = None,
    disk_size: Optional[int] = None,
    labels: Optional[Dict[str, str]] = None,
    # Internal use only.
    # pylint: disable=invalid-name
    _cluster_config_overrides: Optional[Dict[str, Any]] = None,
):
    """Initialize a Resources object.

    Every argument has a default, but ``image_id`` is effectively
    required: construction ends with ``_try_validate_image_id()``, which
    rejects a missing image.

    Examples:
        .. code-block:: python

            # The accelerator specs below are equivalent:
            konduktor.Resources(accelerators='V100', image_id=img)
            konduktor.Resources(accelerators='V100:1', image_id=img)
            konduktor.Resources(accelerators={'V100': 1}, image_id=img)
            konduktor.Resources(
                cpus='2+', memory='16+', accelerators='V100', image_id=img
            )

    Args:
        cloud: (deprecated) must be None; all jobs are submitted to k8s.
        cpus: the number of CPUs required for the task. If a str, must be
            of the form ``'2'`` or ``'2+'``, where the ``+`` indicates
            that the task requires at least 2 CPUs.
        memory: the amount of memory required (documented as GiB). If a
            str, must be of the form ``'16'`` or ``'16+'``, where the
            ``+`` indicates "at least".
        accelerators: the accelerators required. If a str, must be of the
            form ``'V100'`` or ``'V100:2'``, where the ``:2`` is the
            count. If a dict, must map accelerator name to count, e.g.
            ``{'V100': 2}``.
        image_id: docker image to use. Surrounding whitespace is
            stripped.
        disk_size: the size of the OS disk in GiB. Must be a whole
            number; defaults to ``_DEFAULT_DISK_SIZE_GB``.
        labels: labels to attach to the task; stored as given.
            (Presumably applied as pod labels by the backend -- confirm
            against the caller.)
        _cluster_config_overrides: internal use only; stored as given.

    Raises:
        ValueError: if ``cloud`` is given, or if any other attribute is
            invalid (including a missing ``image_id``).
    """
    self._version = self._VERSION
    if cloud is not None:
        raise ValueError('cloud specified, but all jobs are submitted to k8s')
    self._cloud = cloud

    if disk_size is not None:
        if round(disk_size) != disk_size:
            with ux_utils.print_exception_no_traceback():
                raise ValueError(
                    f'OS disk size must be an integer. Got: {disk_size}.'
                )
        self._disk_size = int(disk_size)
    else:
        self._disk_size = _DEFAULT_DISK_SIZE_GB

    # self._image_id is a plain string (or None until validation below
    # rejects it); surrounding whitespace is removed.
    self._image_id = image_id
    if isinstance(image_id, str):
        self._image_id = image_id.strip()

    self._labels = labels
    self._cluster_config_overrides = _cluster_config_overrides

    self._set_cpus(cpus)
    self._set_memory(memory)
    self._set_accelerators(accelerators)

    # TODO: move these out of init to prevent repeated calls.
    self._try_validate_cpus_mem()
    self._try_validate_image_id()
129
+
130
def __repr__(self) -> str:
    """Returns a compact string representation for display.

    Only fields that are set are shown, in the order cpus, mem,
    accelerators, image_id; disk_size is appended only when it differs
    from the default. A missing image id is simply omitted: ``__repr__``
    must never raise, otherwise logging or debugging a partially
    initialised object would itself fail.

    Examples:

        >>> konduktor.Resources(accelerators='V100', image_id='ubuntu')
        ({'V100': 1}, image_id=ubuntu)

    """
    parts: List[str] = []
    if self._cpus is not None:
        parts.append(f'cpus={self._cpus}')
    if self.memory is not None:
        parts.append(f'mem={self.memory}')
    if self.accelerators is not None:
        parts.append(f'{self.accelerators}')
    if self.image_id is not None:
        parts.append(f'image_id={self.image_id}')
    if self.disk_size != _DEFAULT_DISK_SIZE_GB:
        parts.append(f'disk_size={self.disk_size}')

    # Joining the present parts replaces the old "concatenate with leading
    # ', ' then strip" dance and yields the same text.
    hardware_str = ', '.join(parts)
    return f'({hardware_str})'
176
+
177
@property
def cloud(self):
    """Returns the stored cloud.

    The constructor rejects any non-None ``cloud``, so for instances built
    through ``__init__`` this is None.
    """
    stored_cloud = self._cloud
    return stored_cloud
180
+
181
@property
def cpus(self) -> Optional[str]:
    """Returns the requested CPU count as a string, or None if unset.

    The value is the stringified ``cpus`` argument, e.g. ``'4'`` or
    ``'4+'`` (the ``+`` suffix meaning "at least").

    No caching is applied: this is a plain attribute read, and an
    ``lru_cache`` on a method keeps a strong reference to the instance
    and can serve a stale value.
    """
    return self._cpus
196
+
197
@property
def memory(self) -> Optional[str]:
    """Returns the requested memory as a string, or None if unset.

    The value is the stringified ``memory`` argument, e.g. ``'16'`` or
    ``'16+'`` (the ``+`` suffix meaning "at least"). The unit is
    documented as GB; nothing in this accessor enforces it.
    """
    requested_memory = self._memory
    return requested_memory
210
+
211
@property
def accelerators(self) -> Optional[Dict[str, int]]:
    """Returns the requested accelerators as ``{name: count}``, or None.

    This is exactly what ``_set_accelerators`` stored; nothing is
    inferred here.

    No caching is applied: this is a plain attribute read, and an
    ``lru_cache`` on a method keeps a strong reference to the instance
    and can serve a stale value.
    """
    return self._accelerators
223
+
224
@property
def disk_size(self) -> int:
    """Returns the OS disk size stored by the constructor."""
    stored_size = self._disk_size
    return stored_size
227
+
228
@property
def image_id(self) -> Optional[str]:
    """Returns the stored image id (whitespace-stripped by the constructor)."""
    stored_image = self._image_id
    return stored_image
231
+
232
@property
def labels(self) -> Optional[Dict[str, str]]:
    """Returns the labels dict exactly as stored (not a copy), or None."""
    stored_labels = self._labels
    return stored_labels
235
+
236
@property
def cluster_config_overrides(self) -> Dict[str, Any]:
    """Returns the stored config overrides, or a fresh empty dict if unset."""
    overrides = self._cluster_config_overrides
    return {} if overrides is None else overrides
241
+
242
def _set_cpus(
    self,
    cpus: Union[None, int, float, str],
) -> None:
    """Stores ``cpus`` as a string on ``self._cpus`` and validates it.

    Args:
        cpus: None, a number, or a string of the form ``'<number>'`` or
            ``'<number>+'``.

    Raises:
        ValueError: if a string value is not numeric (after removing a
            trailing ``+``) or if the number is not positive.
    """
    if cpus is None:
        self._cpus = None
        return

    # The stringified value is stored before validation runs.
    self._cpus = str(cpus)

    if not isinstance(cpus, str):
        requested = float(cpus)
    else:
        # Drop one trailing '+' ("at least") before parsing the number.
        numeric_part = cpus[:-1] if cpus.endswith('+') else cpus
        try:
            requested = float(numeric_part)
        except ValueError:
            with ux_utils.print_exception_no_traceback():
                raise ValueError(
                    f'The "cpus" field should be either a number or '
                    f'a string "<number>+". Found: {cpus!r}'
                ) from None

    if requested <= 0:
        with ux_utils.print_exception_no_traceback():
            raise ValueError(
                f'The "cpus" field should be positive. Found: {cpus!r}'
            )
273
+
274
def _set_memory(
    self,
    memory: Union[None, int, float, str],
) -> None:
    """Stores ``memory`` as a string on ``self._memory`` and validates it.

    Args:
        memory: None, a number, or a string of the form ``'<number>'``,
            ``'<number>+'`` or ``'<number>x'``.

    Raises:
        ValueError: if a string value is not numeric (after removing a
            trailing ``+``/``x``) or if the number is not positive.
    """
    if memory is None:
        self._memory = None
        return

    # The stringified value is stored before validation runs.
    self._memory = str(memory)
    if isinstance(memory, str):
        if memory.endswith(('+', 'x')):
            # 'x' is used internally for make sure our resources used by
            # jobs controller (memory: 3x) to have enough memory based on
            # the vCPUs.
            num_memory_gb = memory[:-1]
        else:
            num_memory_gb = memory

        try:
            memory_gb = float(num_memory_gb)
        except ValueError:
            with ux_utils.print_exception_no_traceback():
                raise ValueError(
                    f'The "memory" field should be either a number or '
                    f'a string "<number>+". Found: {memory!r}'
                ) from None
    else:
        memory_gb = float(memory)

    if memory_gb <= 0:
        with ux_utils.print_exception_no_traceback():
            # Previously this message named the "cpus" field by mistake.
            raise ValueError(
                f'The "memory" field should be positive. Found: {memory!r}'
            )
308
+
309
def _set_accelerators(
    self,
    accelerators: Union[None, str, Dict[str, int]],
    accelerator_args: Optional[Dict[str, str]] = None,
) -> None:
    """Normalizes ``accelerators`` and stores it on ``self._accelerators``.

    A string spec (``'<name>'`` or ``'<name>:<cnt>'``) is converted to a
    ``{name: count}`` dict; names are then canonicalized through
    ``accelerator_registry.canonicalize_accelerator_name``.

    Args:
        accelerators: A string or a dict of accelerator types to counts.
        accelerator_args: (deprecated) A dict of accelerator types to args.
            Accepted for compatibility and ignored.

    Raises:
        ValueError: if a string spec is malformed, if its count is not a
            whole number, or if an empty dict is given.
    """
    if accelerators is not None:
        if isinstance(accelerators, str):  # Convert to Dict[str, int].
            if ':' not in accelerators:
                accelerators = {accelerators: 1}
            else:
                splits = accelerators.split(':')
                parse_error = (
                    'The "accelerators" field as a str '
                    'should be <name> or <name>:<cnt>. '
                    f'Found: {accelerators!r}'
                )
                if len(splits) != 2:
                    with ux_utils.print_exception_no_traceback():
                        raise ValueError(parse_error)
                try:
                    num = float(splits[1])
                except ValueError:
                    with ux_utils.print_exception_no_traceback():
                        raise ValueError(parse_error) from None
                # '2.0' is tolerated, but '1.5', 'nan' and 'inf' are not:
                # silently truncating would request fewer devices than
                # the user asked for.
                if not num.is_integer():
                    with ux_utils.print_exception_no_traceback():
                        raise ValueError(parse_error)
                accelerators = {splits[0]: int(num)}

        # An empty mapping names no accelerator at all; fail clearly
        # instead of surfacing an IndexError.
        if not accelerators:
            with ux_utils.print_exception_no_traceback():
                raise ValueError(
                    'The "accelerators" field must name at least one '
                    f'accelerator. Found: {accelerators!r}'
                )

        # Canonicalize the accelerator names.
        accelerators = {
            accelerator_registry.canonicalize_accelerator_name(acc): acc_count
            for acc, acc_count in accelerators.items()
        }

    self._accelerators = accelerators
351
+
352
def _try_validate_cpus_mem(self) -> None:
    """Placeholder hook for cross-checking cpus and memory.

    Currently performs no validation and raises nothing: it only reads
    the two attributes and returns.
    """
    nothing_requested = self._cpus is None and self._memory is None
    if nothing_requested:
        return None
    return None
360
+
361
def _try_validate_image_id(self) -> None:
    """Checks that an image id was provided.

    Raises:
        ValueError: if ``self._image_id`` is None or an empty string (an
            all-whitespace image id is stripped to empty by the
            constructor).
    """
    if self._image_id is None or self._image_id == '':
        with ux_utils.print_exception_no_traceback():
            raise ValueError(
                'no image id for the task was specified. You must '
                'specify an image id for this task (e.g. '
                '`nvcr.io/nvidia/pytorch:xx.xx-py3`)'
            )
374
+
375
def get_accelerators_str(self) -> str:
    """Formats the requested accelerators for display.

    Returns:
        ``'-'`` when no accelerators are set, ``'<name>:<count>'`` when
        exactly one accelerator type is set, and an empty string
        otherwise.
    """
    requested = self.accelerators
    if requested is None:
        return '-'
    if isinstance(requested, dict) and len(requested) == 1:
        ((name, count),) = requested.items()
        return f'{name}:{count}'
    return ''
384
+
385
def copy(self, **override) -> 'Resources':
    """Returns a new Resources with selected fields replaced.

    Args:
        **override: constructor keyword arguments whose values replace the
            corresponding fields of this instance. Fields not named keep
            this instance's value, including the internal
            ``_cluster_config_overrides``, which must not be lost on copy.

    Raises:
        ValueError: if the resulting field combination fails the
            constructor's validation.
        AssertionError: if ``override`` contains an unknown field name.
    """
    resources = Resources(
        cloud=override.pop('cloud', self.cloud),
        cpus=override.pop('cpus', self._cpus),
        memory=override.pop('memory', self.memory),
        accelerators=override.pop('accelerators', self.accelerators),
        disk_size=override.pop('disk_size', self.disk_size),
        image_id=override.pop('image_id', self.image_id),
        labels=override.pop('labels', self.labels),
        _cluster_config_overrides=override.pop(
            '_cluster_config_overrides', self._cluster_config_overrides
        ),
    )
    assert len(override) == 0, f'Unknown Resources fields: {sorted(override)}'
    return resources
398
+
399
@classmethod
def from_yaml_config(cls, config: Optional[Dict[str, Any]]) -> 'Resources':
    """Builds a Resources from a YAML-style ``resources`` mapping.

    Args:
        config: the mapping to read, or None. With None the class is
            constructed with no arguments; since the constructor rejects a
            missing image id, that path currently raises ValueError.

    Returns:
        The Resources described by ``config``.

    Raises:
        ValueError: if the constructor rejects the resulting fields.
        Whatever ``common_utils.validate_schema`` raises for a config
        that does not match ``schemas.get_resources_schema()``.
    """
    if config is None:
        return cls()
    common_utils.validate_schema(
        config, schemas.get_resources_schema(), 'Invalid resources YAML: '
    )

    # Work on a shallow copy: _from_yaml_config_single pops keys and must
    # not mutate the caller's mapping.
    return cls._from_yaml_config_single(config.copy())
439
+
440
@classmethod
def _from_yaml_config_single(cls, config: Dict[str, Any]) -> 'Resources':
    """Builds a Resources from one already-validated config mapping.

    The recognised keys are popped from ``config`` (the mapping is
    mutated), so callers should pass a copy.

    Args:
        config: mapping that may contain ``cpus``, ``memory``,
            ``accelerators``, ``disk_size``, ``image_id`` and ``labels``.

    Raises:
        AssertionError: if ``config`` contains any other key.
        ValueError: if the constructor rejects a field.
    """
    resources_fields: Dict[str, Any] = {
        key: config.pop(key, None)
        for key in (
            'cpus',
            'memory',
            'accelerators',
            'disk_size',
            'image_id',
            'labels',
        )
    }

    if resources_fields['cpus'] is not None:
        resources_fields['cpus'] = str(resources_fields['cpus'])
    if resources_fields['memory'] is not None:
        resources_fields['memory'] = str(resources_fields['memory'])
    # TODO(asaiacai): should we remove disk size
    # since we aren't letting users set this at the host level?
    # Only string values are converted here. Numeric values go to the
    # constructor untouched so that its "must be an integer" check still
    # sees e.g. 10.5 instead of a silently truncated 10.
    if isinstance(resources_fields['disk_size'], str):
        resources_fields['disk_size'] = int(resources_fields['disk_size'])

    assert not config, f'Invalid resource args: {config.keys()}'
    return cls(**resources_fields)
461
+
462
def to_yaml_config(self) -> Dict[str, Any]:
    """Returns a yaml-style dict of config for this resource bundle.

    Only fields whose value is not None are included. ``cloud`` is
    emitted as its string form; ``accelerators`` and ``labels`` are the
    stored dicts themselves (not copies).
    """
    # Stringify the cloud only when there is one. Comparing every value
    # against the literal 'None' would drop a real 'None' string.
    cloud = self.cloud
    candidates = {
        'cloud': None if cloud is None else str(cloud),
        'cpus': self._cpus,
        'memory': self.memory,
        'accelerators': self.accelerators,
        'disk_size': self.disk_size,
        'image_id': self.image_id,
        'labels': self.labels,
    }
    return {key: value for key, value in candidates.items() if value is not None}