konduktor-nightly 0.1.0.dev20250209104336__py3-none-any.whl → 0.1.0.dev20250313070642__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62)
  1. konduktor/__init__.py +16 -6
  2. konduktor/adaptors/__init__.py +0 -0
  3. konduktor/adaptors/common.py +88 -0
  4. konduktor/adaptors/gcp.py +112 -0
  5. konduktor/backends/__init__.py +8 -0
  6. konduktor/backends/backend.py +86 -0
  7. konduktor/backends/jobset.py +218 -0
  8. konduktor/backends/jobset_utils.py +447 -0
  9. konduktor/check.py +192 -0
  10. konduktor/cli.py +790 -0
  11. konduktor/cloud_stores.py +158 -0
  12. konduktor/config.py +420 -0
  13. konduktor/constants.py +36 -0
  14. konduktor/controller/constants.py +6 -6
  15. konduktor/controller/launch.py +3 -3
  16. konduktor/controller/node.py +5 -5
  17. konduktor/controller/parse.py +23 -23
  18. konduktor/dashboard/backend/main.py +57 -57
  19. konduktor/dashboard/backend/sockets.py +19 -19
  20. konduktor/data/__init__.py +9 -0
  21. konduktor/data/constants.py +12 -0
  22. konduktor/data/data_utils.py +223 -0
  23. konduktor/data/gcp/__init__.py +19 -0
  24. konduktor/data/gcp/constants.py +42 -0
  25. konduktor/data/gcp/gcs.py +906 -0
  26. konduktor/data/gcp/utils.py +9 -0
  27. konduktor/data/storage.py +799 -0
  28. konduktor/data/storage_utils.py +500 -0
  29. konduktor/execution.py +444 -0
  30. konduktor/kube_client.py +153 -48
  31. konduktor/logging.py +49 -5
  32. konduktor/manifests/dmesg_daemonset.yaml +8 -0
  33. konduktor/manifests/pod_cleanup_controller.yaml +129 -0
  34. konduktor/resource.py +478 -0
  35. konduktor/task.py +867 -0
  36. konduktor/templates/jobset.yaml.j2 +31 -0
  37. konduktor/templates/pod.yaml.j2 +185 -0
  38. konduktor/usage/__init__.py +0 -0
  39. konduktor/usage/constants.py +21 -0
  40. konduktor/utils/__init__.py +0 -0
  41. konduktor/utils/accelerator_registry.py +21 -0
  42. konduktor/utils/annotations.py +62 -0
  43. konduktor/utils/base64_utils.py +93 -0
  44. konduktor/utils/common_utils.py +393 -0
  45. konduktor/utils/constants.py +5 -0
  46. konduktor/utils/env_options.py +55 -0
  47. konduktor/utils/exceptions.py +226 -0
  48. konduktor/utils/kubernetes_enums.py +8 -0
  49. konduktor/utils/kubernetes_utils.py +652 -0
  50. konduktor/utils/log_utils.py +251 -0
  51. konduktor/utils/loki_utils.py +85 -0
  52. konduktor/utils/rich_utils.py +123 -0
  53. konduktor/utils/schemas.py +581 -0
  54. konduktor/utils/subprocess_utils.py +273 -0
  55. konduktor/utils/ux_utils.py +216 -0
  56. konduktor/utils/validator.py +20 -0
  57. {konduktor_nightly-0.1.0.dev20250209104336.dist-info → konduktor_nightly-0.1.0.dev20250313070642.dist-info}/LICENSE +0 -1
  58. {konduktor_nightly-0.1.0.dev20250209104336.dist-info → konduktor_nightly-0.1.0.dev20250313070642.dist-info}/METADATA +13 -2
  59. konduktor_nightly-0.1.0.dev20250313070642.dist-info/RECORD +94 -0
  60. konduktor_nightly-0.1.0.dev20250209104336.dist-info/RECORD +0 -48
  61. {konduktor_nightly-0.1.0.dev20250209104336.dist-info → konduktor_nightly-0.1.0.dev20250313070642.dist-info}/WHEEL +0 -0
  62. {konduktor_nightly-0.1.0.dev20250209104336.dist-info → konduktor_nightly-0.1.0.dev20250313070642.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,393 @@
1
+ # Proprietary Changes made for Trainy under the Trainy Software License
2
+ # Original source: skypilot: https://github.com/skypilot-org/skypilot
3
+ # which is Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ # http://www.apache.org/licenses/LICENSE-2.0
7
+ # Unless required by applicable law or agreed to in writing, software
8
+ # distributed under the License is distributed on an "AS IS" BASIS,
9
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10
+ # See the License for the specific language governing permissions and
11
+ # limitations under the License.
12
+
13
+ import datetime
14
+ import difflib
15
+ import functools
16
+ import getpass
17
+ import hashlib
18
+ import inspect
19
+ import os
20
+ import re
21
+ import socket
22
+ import sys
23
+ import uuid
24
+ from typing import Any, Callable, Dict, List, Optional, Union
25
+
26
+ import jinja2
27
+ import jsonschema
28
+ import yaml
29
+
30
+ from konduktor.utils import annotations, constants, ux_utils, validator
31
+
32
# Path of the per-user cache file holding the generated user hash.
_USER_HASH_FILE = os.path.expanduser('~/.konduktor/user_hash')
# Lazily-initialized, process-wide run id (see get_usage_run_id()).
_usage_run_id = None
# Valid env var names: letter/underscore first, then alphanumerics/underscores.
_VALID_ENV_VAR_REGEX = '[a-zA-Z_][a-zA-Z0-9_]*'
USER_HASH_LENGTH = 8
USER_HASH_LENGTH_IN_CLUSTER_NAME = 4
37
+
38
+
39
def get_timestamp() -> str:
    """Return the current local time formatted as ``YYYYMMDD-HHMMSS``."""
    now = datetime.datetime.now()
    return f'{now:%Y%m%d-%H%M%S}'
41
+
42
+
43
def user_and_hostname_hash() -> str:
    """Returns a string containing <user>-<hostname hash last 4 chars>.

    For uniquefying user workloads on a shared-k8s cluster.

    Using uuid.getnode() instead of gethostname() is incorrect; observed to
    collide on Macs.
    """
    digest = hashlib.md5(socket.gethostname().encode()).hexdigest()
    return '-'.join((getpass.getuser(), digest[-4:]))
53
+
54
+
55
def base36_encode(hex_str: str) -> str:
    """Converts a hex string to a base36 string."""
    alphabet = '0123456789abcdefghijklmnopqrstuvwxyz'
    value = int(hex_str, 16)
    if value == 0:
        return '0'
    # Collect base-36 digits least-significant first, then reverse.
    digits = []
    while value != 0:
        value, rem = divmod(value, 36)
        digits.append(alphabet[rem])
    return ''.join(reversed(digits))
70
+
71
+
72
def get_cleaned_username(username: str = '') -> str:
    """Cleans the username. Underscores are allowed, as we will
    handle it when mapping to the cluster_name_on_cloud in
    common_utils.make_cluster_name_on_cloud.

    Clean up includes:
    1. Making all characters lowercase
    2. Removing any non-alphanumeric characters (excluding hyphens and
       underscores)
    3. Removing any numbers and/or hyphens at the start of the username.
    4. Removing any hyphens at the end of the username
    5. Truncate the username to 63 characters, as requested by GCP labels

    Dots are removed due to: https://cloud.google.com/compute/docs/labeling-resources#requirements

    e.g. 1SkY-PiLot2- becomes sky-pilot2

    Returns:
        A cleaned username.
    """  # noqa: E501
    username = username or getpass.getuser()
    username = username.lower()
    # Drop anything outside [a-z0-9-_].
    username = re.sub(r'[^a-z0-9-_]', '', username)
    # Strip leading digits and hyphens.
    username = re.sub(r'^[0-9-]+', '', username)
    # Truncate before stripping the tail, so the cut cannot reintroduce a
    # trailing hyphen (the original truncated last).
    username = username[:63]
    # Strip ALL trailing hyphens. The original used r'-$', which removed only
    # one hyphen and so contradicted the documented behavior for 'user--'.
    username = re.sub(r'-+$', '', username)
    return username
99
+
100
+
101
def is_valid_env_var(name: str) -> bool:
    """Checks if the task environment variable name is valid."""
    # fullmatch returns None on no match; anchoring is implicit.
    return re.fullmatch(_VALID_ENV_VAR_REGEX, name) is not None
104
+
105
+
106
def get_pretty_entry_point() -> str:
    """Returns the prettified entry point of this process (sys.argv).

    Example return values:
        $ konduktor launch app.yaml  # 'konduktor launch app.yaml'
        $ python examples/app.py  # 'app.py'
    """
    # Work on a copy: the original assigned into sys.argv[0] directly,
    # mutating the interpreter-wide argv as a hidden side effect.
    argv = list(sys.argv)
    basename = os.path.basename(argv[0])
    if basename == 'konduktor':
        # Turn '/.../envs/py36/bin/konduktor' into 'konduktor', but keep
        # other things like 'examples/app.py'.
        argv[0] = basename
    return ' '.join(argv)
120
+
121
+
122
@annotations.lru_cache(scope='request')
def get_usage_run_id() -> str:
    """Returns a unique run id for each 'run'.

    A run is defined as the lifetime of a process that has imported
    `konduktor` and has called its CLI or programmatic APIs. For example,
    two successive `konduktor launch` are two runs.
    """
    # Besides the request-scoped lru_cache, the module-level global keeps the
    # id stable for the whole process even if the cache is cleared.
    global _usage_run_id
    if _usage_run_id is None:
        _usage_run_id = str(uuid.uuid4())
    return _usage_run_id
134
+
135
+
136
def make_decorator(cls, name_or_fn: Union[str, Callable], **ctx_kwargs) -> Callable:
    """Turn the context-manager class ``cls`` into a decorator.

    ``cls`` is expected to support the context-manager protocol
    (``__init__(name, **kwargs)``, ``__enter__``, ``__exit__``); each call of
    the decorated function runs inside one ``cls`` instance.

    Args:
        name_or_fn: The name of the event, or the function to be wrapped
            directly (when the decorator is applied without arguments).
        **ctx_kwargs: Extra keyword arguments forwarded to ``cls``.
    """
    if isinstance(name_or_fn, str):
        # Used as `@make_decorator(cls, 'name', ...)`: return a decorator
        # factory bound to the given event name.
        def decorator(func):
            @functools.wraps(func)
            def wrapped(*args, **kwargs):
                with cls(name_or_fn, **ctx_kwargs):
                    return func(*args, **kwargs)

            return wrapped

        return decorator

    # Used as a bare decorator: name_or_fn is the function itself.
    if not inspect.isfunction(name_or_fn):
        raise ValueError('Should directly apply the decorator to a function.')
    func = name_or_fn

    @functools.wraps(func)
    def wrapped(*args, **kwargs):
        qualname = getattr(func, '__qualname__', func.__name__)
        module = getattr(func, '__module__', '')
        full_name = f'{module}.{qualname}' if module else qualname
        with cls(full_name, **ctx_kwargs):
            return func(*args, **kwargs)

    return wrapped
179
+
180
+
181
def get_user_hash(force_fresh_hash: bool = False) -> str:
    """Returns a unique user-machine specific hash as a user id.

    We cache the user hash in a file to avoid potential user_name or
    hostname changes causing a new user hash to be generated.

    Args:
        force_fresh_hash: Bypasses the cached hash in _USER_HASH_FILE and the
            hash in the USER_ID_ENV_VAR and forces a fresh user-machine hash
            to be generated. Used by `kubernetes.ssh_key_secret_field_name` to
            avoid controllers sharing the same ssh key field name as the
            local client.
    """

    def _is_valid_user_hash(user_hash: Optional[str]) -> bool:
        # Valid hashes are hex strings of exactly USER_HASH_LENGTH chars.
        if user_hash is None:
            return False
        try:
            int(user_hash, 16)
        except (TypeError, ValueError):
            return False
        return len(user_hash) == USER_HASH_LENGTH

    # Precedence 1: environment variable override (skipped when forced).
    if not force_fresh_hash:
        user_hash = os.getenv(constants.USER_ID_ENV_VAR)
        if _is_valid_user_hash(user_hash):
            assert user_hash is not None
            return user_hash

    # Precedence 2: on-disk cached hash (skipped when forced).
    if not force_fresh_hash and os.path.exists(_USER_HASH_FILE):
        # Read from cached user hash file.
        with open(_USER_HASH_FILE, 'r', encoding='utf-8') as f:
            # Remove invalid characters.
            user_hash = f.read().strip()
        if _is_valid_user_hash(user_hash):
            return user_hash

    # Precedence 3: derive a fresh hash from <user>-<hostname>.
    hash_str = user_and_hostname_hash()
    user_hash = hashlib.md5(hash_str.encode()).hexdigest()[:USER_HASH_LENGTH]
    if not _is_valid_user_hash(user_hash):
        # A fallback in case the hash is invalid.
        user_hash = uuid.uuid4().hex[:USER_HASH_LENGTH]
    os.makedirs(os.path.dirname(_USER_HASH_FILE), exist_ok=True)
    if not force_fresh_hash:
        # Do not cache to file if force_fresh_hash is True since the file may
        # be intentionally using a different hash, e.g. we want to keep the
        # user_hash for usage collection the same on the jobs/serve controller
        # as users' local client.
        with open(_USER_HASH_FILE, 'w', encoding='utf-8') as f:
            f.write(user_hash)
    return user_hash
232
+
233
+
234
def read_yaml(path: str) -> Dict[str, Any]:
    """Load a single-document YAML file at ``path``."""
    with open(path, 'r', encoding='utf-8') as stream:
        return yaml.safe_load(stream)
238
+
239
+
240
def read_yaml_all(path: str) -> List[Dict[str, Any]]:
    """Load every YAML document from the file at ``path``.

    Returns ``[{}]`` for an empty file, so callers always receive at least
    one (possibly empty) config dict.
    """
    with open(path, 'r', encoding='utf-8') as stream:
        # Materialize inside the `with`: safe_load_all is lazy and needs the
        # file to stay open while it is consumed.
        documents = list(yaml.safe_load_all(stream))
    return documents if documents else [{}]
248
+
249
+
250
def validate_schema(obj, schema, err_msg_prefix='', skip_none=True):
    """Validates an object against a given JSON schema.

    Args:
        obj: The object to validate.
        schema: The JSON schema against which to validate the object.
        err_msg_prefix: The string to prepend to the error message if
            validation fails.
        skip_none: If True, removes fields with value None from the object
            before validation. This is useful for objects that will never
            contain None because yaml.safe_load() loads empty fields as None.

    Raises:
        ValueError: if the object does not match the schema.
    """
    if skip_none:
        obj = {k: v for k, v in obj.items() if v is not None}
    err_msg = None
    try:
        validator.SchemaValidator(schema).validate(obj)
    except jsonschema.ValidationError as e:
        # Rewrite jsonschema's raw error into a friendlier message.
        if e.validator == 'additionalProperties':
            if tuple(e.schema_path) == ('properties', 'envs', 'additionalProperties'):
                # Hack. Here the error is Task.envs having some invalid keys. So
                # we should not print "unsupported field".
                #
                # This will print something like:
                # 'hello world' does not match any of the regexes: <regex>
                err_msg = (
                    err_msg_prefix
                    + 'The `envs` field contains invalid keys:\n'
                    + e.message
                )
            else:
                # Unknown top-level field: suggest the closest known field.
                err_msg = err_msg_prefix
                assert isinstance(e.schema, dict), 'Schema must be a dictionary'
                known_fields = set(e.schema.get('properties', {}).keys())
                assert isinstance(e.instance, dict), 'Instance must be a dictionary'
                for field in e.instance:
                    if field not in known_fields:
                        most_similar_field = difflib.get_close_matches(
                            field, known_fields, 1
                        )
                        if most_similar_field:
                            err_msg += (
                                f'Instead of {field!r}, did you mean '
                                f'{most_similar_field[0]!r}?'
                            )
                        else:
                            err_msg += f'Found unsupported field {field!r}.'
        else:
            message = e.message
            # Object in jsonschema is represented as dict in Python. Replace
            # 'object' with 'dict' for better readability.
            message = message.replace("type 'object'", "type 'dict'")
            # Example e.json_path value: '$.resources'
            err_msg = (
                err_msg_prefix
                + message
                + f'. Check problematic field(s): {e.json_path}'
            )

    if err_msg:
        with ux_utils.print_exception_no_traceback():
            raise ValueError(err_msg)
315
+
316
+
317
def dump_yaml(path: str, config: Union[List[Dict[str, Any]], Dict[str, Any]]) -> None:
    """Serialize ``config`` via dump_yaml_str and write it to ``path``."""
    text = dump_yaml_str(config)
    with open(path, 'w', encoding='utf-8') as out:
        out.write(text)
320
+
321
+
322
def dump_yaml_str(config: Union[List[Dict[str, Any]], Dict[str, Any]]) -> str:
    """Render ``config`` (one dict or a list of dicts) as a YAML string."""

    # https://github.com/yaml/pyyaml/issues/127
    class LineBreakDumper(yaml.SafeDumper):
        # Emit an extra blank line between top-level entries for readability.
        def write_line_break(self, data=None):
            super().write_line_break(data)
            if len(self.indents) == 1:
                super().write_line_break()

    dumper = yaml.dump_all if isinstance(config, list) else yaml.dump
    return dumper(  # type: ignore
        config,
        Dumper=LineBreakDumper,
        sort_keys=False,
        default_flow_style=False,
    )
337
+
338
+
339
def fill_template(
    template_name: str, variables: Dict[str, Any], output_path: str
) -> None:
    """Create a file from a Jinja template and return the filename."""
    assert template_name.endswith('.j2'), template_name
    # Templates live in <package root>/templates, one level above this module.
    root_dir = os.path.dirname(os.path.dirname(__file__))
    template_path = os.path.join(root_dir, 'templates', template_name)
    if not os.path.exists(template_path):
        raise FileNotFoundError(f'Template "{template_name}" does not exist.')
    with open(template_path, 'r', encoding='utf-8') as fin:
        template_text = fin.read()

    output_path = os.path.abspath(os.path.expanduser(output_path))
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    # Render and write out the yaml config.
    rendered = jinja2.Template(template_text).render(**variables)
    with open(output_path, 'w', encoding='utf-8') as fout:
        fout.write(rendered)
358
+
359
+
360
def class_fullname(cls, skip_builtins: bool = True):
    """Get the full (module-qualified) name of a class.

    Example:
        >>> class_fullname(collections.OrderedDict)
        'collections.OrderedDict'

    Args:
        cls: The class to get the full name.
        skip_builtins: If True, builtin classes are returned without the
            'builtins.' module prefix.

    Returns:
        The full name of the class.
    """
    module_name = getattr(cls, '__module__', '')
    bare = not module_name or (skip_builtins and module_name == 'builtins')
    return cls.__name__ if bare else f'{cls.__module__}.{cls.__name__}'
378
+
379
+
380
def format_exception(
    e: Union[Exception, SystemExit, KeyboardInterrupt], use_bracket: bool = False
) -> str:
    """Format an exception to a string.

    Args:
        e: The exception to format.
        use_bracket: If True, the class name is wrapped in square brackets
            instead of being followed by a colon.

    Returns:
        A string that represents the exception.
    """
    name = class_fullname(e.__class__)
    template = '[{0}] {1}' if use_bracket else '{0}: {1}'
    return template.format(name, e)
@@ -0,0 +1,5 @@
1
# The name of the environment variable that stores the KONDUKTOR user hash.
USER_ID_ENV_VAR = 'KONDUKTOR_USER_ID'

# The name of the environment variable that stores the KONDUKTOR user name.
USER_ENV_VAR = 'KONDUKTOR_USER'
@@ -0,0 +1,55 @@
1
+ # Proprietary Changes made for Trainy under the Trainy Software License
2
+ # Original source: skypilot: https://github.com/skypilot-org/skypilot
3
+ # which is Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ # http://www.apache.org/licenses/LICENSE-2.0
7
+ # Unless required by applicable law or agreed to in writing, software
8
+ # distributed under the License is distributed on an "AS IS" BASIS,
9
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10
+ # See the License for the specific language governing permissions and
11
+ # limitations under the License.
12
+
13
+ """Global environment options for konduktor."""
14
+
15
+ import enum
16
+ import os
17
+ from typing import Dict
18
+
19
+
20
class Options(enum.Enum):
    """Boolean environment-variable options for konduktor.

    (The previous docstring said "SkyPilot" — a leftover from the upstream
    port.) Each member's value is a ``(env var name, default value)`` tuple;
    Enum passes the tuple elements to the custom ``__init__`` below.
    """

    # (env var name, default value)
    IS_DEVELOPER = ('KONDUKTOR_DEV', False)
    SHOW_DEBUG_INFO = ('KONDUKTOR_DEBUG', True)
    DISABLE_LOGGING = ('KONDUKTOR_DISABLE_USAGE_COLLECTION', False)
    MINIMIZE_LOGGING = ('KONDUKTOR_MINIMIZE_LOGGING', False)
    SUPPRESS_SENSITIVE_LOG = ('KONDUKTOR_SUPPRESS_SENSITIVE_LOG', False)
    # Internal: this is used to skip the cloud user identity check, which is
    # used to protect cluster operations in a multi-identity scenario.
    # Currently, this is only used in the job and serve controller, as there
    # will not be multiple identities, and skipping the check can increase
    # robustness.
    SKIP_CLOUD_IDENTITY_CHECK = ('KONDUKTOR_SKIP_CLOUD_IDENTITY_CHECK', False)

    def __init__(self, env_var: str, default: bool) -> None:
        self.env_var = env_var
        self.default = default

    def __repr__(self) -> str:
        return self.env_var

    def get(self) -> bool:
        """Check if an environment variable is set to True."""
        # When unset, fall back to str(default) ('True'/'False'); only the
        # strings 'true' and '1' (case-insensitive) count as enabled.
        return os.getenv(self.env_var, str(self.default)).lower() in ('true', '1')

    @property
    def env_key(self) -> str:
        """The environment variable key name."""
        return self.value[0]

    @classmethod
    def all_options(cls) -> Dict[str, bool]:
        """Returns all options as a dictionary."""
        return {option.env_key: option.get() for option in list(Options)}
@@ -0,0 +1,226 @@
1
+ # Proprietary Changes made for Trainy under the Trainy Software License
2
+ # Original source: skypilot: https://github.com/skypilot-org/skypilot
3
+ # which is Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ # http://www.apache.org/licenses/LICENSE-2.0
7
+ # Unless required by applicable law or agreed to in writing, software
8
+ # distributed under the License is distributed on an "AS IS" BASIS,
9
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10
+ # See the License for the specific language governing permissions and
11
+ # limitations under the License.
12
+
13
+ """Exceptions."""
14
+
15
+ import builtins
16
+ import traceback
17
+ import types
18
+ from typing import Any, Dict
19
+
20
# Return code for keyboard interruption (SIGINT) and SIGTSTP.
KEYBOARD_INTERRUPT_CODE = 130
SIGTSTP_CODE = 146
# rsync exit code for a partial transfer (e.g. source file not found).
RSYNC_FILE_NOT_FOUND_CODE = 23
# Arbitrarily chosen value. Used in SkyPilot's storage mounting scripts
MOUNT_PATH_NON_EMPTY_CODE = 42
# Arbitrarily chosen value. Used to provision Kubernetes instance in Skypilot
INSUFFICIENT_PRIVILEGES_CODE = 52
# Return code when a git command is run in a dir that is not a git repo
GIT_FATAL_EXIT_CODE = 128
30
+
31
+
32
def is_safe_exception(exc: Exception) -> bool:
    """Returns True if the exception is safe to send to clients.

    Safe exceptions are:
    1. Built-in exceptions
    2. Konduktor's own exceptions
    """
    module = type(exc).__module__

    # Builtin exceptions (e.g., ValueError, RuntimeError)
    if module == 'builtins':
        return True

    # Konduktor's own exceptions. The original check only matched 'sky.'
    # (a leftover from the SkyPilot port), which made every konduktor
    # exception look unsafe and get wrapped into CloudError. 'sky.' is kept
    # for backward compatibility.
    if module.startswith(('konduktor.', 'sky.')):
        return True

    return False
50
+
51
+
52
def wrap_exception(exc: Exception) -> Exception:
    """Wraps non-safe exceptions into Konduktor exceptions

    This is used to wrap exceptions that are not safe to deserialize at clients.

    Examples include exceptions from cloud providers whose packages are not
    available at clients.
    """
    if is_safe_exception(exc):
        return exc

    exc_type = type(exc)
    # The top-level package of the exception's module identifies the provider.
    provider = exc_type.__module__.split('.')[0]
    return CloudError(
        message=str(exc),
        cloud_provider=provider,
        error_type=exc_type.__name__,
    )
68
+
69
+
70
def serialize_exception(e: Exception) -> Dict[str, Any]:
    """Serialize the exception.

    This function also wraps any unsafe exceptions (e.g., cloud exceptions)
    into Konduktor's CloudError before serialization to ensure clients can
    deserialize them without needing cloud provider packages installed.
    """
    # Wrap unsafe exceptions before serialization.
    e = wrap_exception(e)

    stacktrace = getattr(e, 'stacktrace', None)
    # Copy instance attributes, dropping the stacktrace (kept separately).
    attributes = {k: v for k, v in e.__dict__.items() if k != 'stacktrace'}
    for key, value in attributes.items():
        # Traceback objects are not serializable; render them to strings.
        if isinstance(value, types.TracebackType):
            attributes[key] = traceback.format_tb(value)

    return {
        'type': e.__class__.__name__,
        'message': str(e),
        'args': e.args,
        'attributes': attributes,
        'stacktrace': stacktrace,
    }
97
+
98
+
99
def deserialize_exception(serialized: Dict[str, Any]) -> Exception:
    """Deserialize the exception."""
    exc_name = serialized['type']
    if hasattr(builtins, exc_name):
        # Builtin exception type (e.g. ValueError).
        exc_cls = getattr(builtins, exc_name)
    else:
        # Otherwise look it up among this module's exception classes.
        exc_cls = globals().get(exc_name)
        if exc_cls is None:
            # Unknown exception type: fall back to a generic Exception.
            return Exception(f'{exc_name}: {serialized["message"]}')
    exc = exc_cls(*serialized['args'], **serialized['attributes'])
    if serialized['stacktrace'] is not None:
        exc.stacktrace = serialized['stacktrace']
    return exc
113
+
114
+
115
class CloudError(Exception):
    """Wraps a cloud-provider-specific error into a Konduktor exception.

    Used by wrap_exception() so clients can deserialize provider errors
    without the provider's packages installed.
    """

    def __init__(self, message: str, cloud_provider: str, error_type: str):
        super().__init__(message)
        # Top-level package name of the originating provider.
        self.cloud_provider = cloud_provider
        # Class name of the original exception.
        self.error_type = error_type

    def __str__(self):
        base = super().__str__()
        return f'{self.cloud_provider} error ({self.error_type}): {base}'
127
+
128
+
129
class CommandError(Exception):
    # NOTE(review): no docstring upstream and no call sites visible in this
    # chunk; presumably raised when an external command fails — confirm
    # against callers before relying on that.
    pass


class NotSupportedError(Exception):
    """Raised when a feature is not supported."""

    pass
137
+
138
+
139
# Storage exception hierarchy. Operational failures derive from StorageError;
# specification errors derive from StorageSpecError (a ValueError).
class StorageError(Exception):
    # Base class for storage operation failures (init/upload/delete below).
    pass


class StorageSpecError(ValueError):
    # Errors raised due to invalid specification of the Storage object
    pass


class StorageInitError(StorageError):
    # Error raised when Initialization fails - either due to permissions,
    # unavailable name, or other reasons.
    pass


class StorageBucketCreateError(StorageInitError):
    # Error raised when bucket creation fails.
    pass


class StorageBucketGetError(StorageInitError):
    # Error raised if attempt to fetch an existing bucket fails.
    pass


class StorageBucketDeleteError(StorageError):
    # Error raised if attempt to delete an existing bucket fails.
    pass


class StorageUploadError(StorageError):
    # Error raised when bucket is successfully initialized, but upload fails,
    # either due to permissions, ctrl-c, or other reasons.
    pass


class StorageSourceError(StorageSpecError):
    # Error raised when the source of the storage is invalid. E.g., does not
    # exist, malformed path, or other reasons.
    pass


class StorageNameError(StorageSpecError):
    # Error raised when the name of the storage is invalid. (The original
    # comment here was a copy-paste of StorageSourceError's.)
    pass


class StorageModeError(StorageSpecError):
    # Error raised when the storage mode is invalid or does not support the
    # requested operation (e.g., passing a file as source to MOUNT mode)
    pass


class StorageExternalDeletionError(StorageBucketGetError):
    # Error raised when the bucket is attempted to be fetched while it has been
    # deleted externally.
    pass


class NonExistentStorageAccountError(StorageExternalDeletionError):
    # Error raised when a storage account provided through config.yaml or read
    # from the store handle (local db) does not exist.
    pass
203
+
204
+
205
# Network, identity and cloud-access errors.
class NetworkError(Exception):
    """Raised when network fails."""

    pass


class CloudUserIdentityError(Exception):
    """Raised when the cloud identity is invalid."""

    pass


class ClusterOwnerIdentityMismatchError(Exception):
    """The cluster's owner identity does not match the current user identity."""

    pass


class NoCloudAccessError(Exception):
    """Raised when all clouds are disabled."""

    pass
@@ -0,0 +1,8 @@
1
+ import enum
2
+
3
+
4
class KubernetesAutoscalerType(enum.Enum):
    """Enum for the different types of cluster autoscalers for Kubernetes."""

    # Google Kubernetes Engine.
    GKE = 'gke'
    # Any autoscaler not specifically recognized.
    GENERIC = 'generic'