konduktor-nightly 0.1.0.dev20250209104336__py3-none-any.whl → 0.1.0.dev20250313070642__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- konduktor/__init__.py +16 -6
- konduktor/adaptors/__init__.py +0 -0
- konduktor/adaptors/common.py +88 -0
- konduktor/adaptors/gcp.py +112 -0
- konduktor/backends/__init__.py +8 -0
- konduktor/backends/backend.py +86 -0
- konduktor/backends/jobset.py +218 -0
- konduktor/backends/jobset_utils.py +447 -0
- konduktor/check.py +192 -0
- konduktor/cli.py +790 -0
- konduktor/cloud_stores.py +158 -0
- konduktor/config.py +420 -0
- konduktor/constants.py +36 -0
- konduktor/controller/constants.py +6 -6
- konduktor/controller/launch.py +3 -3
- konduktor/controller/node.py +5 -5
- konduktor/controller/parse.py +23 -23
- konduktor/dashboard/backend/main.py +57 -57
- konduktor/dashboard/backend/sockets.py +19 -19
- konduktor/data/__init__.py +9 -0
- konduktor/data/constants.py +12 -0
- konduktor/data/data_utils.py +223 -0
- konduktor/data/gcp/__init__.py +19 -0
- konduktor/data/gcp/constants.py +42 -0
- konduktor/data/gcp/gcs.py +906 -0
- konduktor/data/gcp/utils.py +9 -0
- konduktor/data/storage.py +799 -0
- konduktor/data/storage_utils.py +500 -0
- konduktor/execution.py +444 -0
- konduktor/kube_client.py +153 -48
- konduktor/logging.py +49 -5
- konduktor/manifests/dmesg_daemonset.yaml +8 -0
- konduktor/manifests/pod_cleanup_controller.yaml +129 -0
- konduktor/resource.py +478 -0
- konduktor/task.py +867 -0
- konduktor/templates/jobset.yaml.j2 +31 -0
- konduktor/templates/pod.yaml.j2 +185 -0
- konduktor/usage/__init__.py +0 -0
- konduktor/usage/constants.py +21 -0
- konduktor/utils/__init__.py +0 -0
- konduktor/utils/accelerator_registry.py +21 -0
- konduktor/utils/annotations.py +62 -0
- konduktor/utils/base64_utils.py +93 -0
- konduktor/utils/common_utils.py +393 -0
- konduktor/utils/constants.py +5 -0
- konduktor/utils/env_options.py +55 -0
- konduktor/utils/exceptions.py +226 -0
- konduktor/utils/kubernetes_enums.py +8 -0
- konduktor/utils/kubernetes_utils.py +652 -0
- konduktor/utils/log_utils.py +251 -0
- konduktor/utils/loki_utils.py +85 -0
- konduktor/utils/rich_utils.py +123 -0
- konduktor/utils/schemas.py +581 -0
- konduktor/utils/subprocess_utils.py +273 -0
- konduktor/utils/ux_utils.py +216 -0
- konduktor/utils/validator.py +20 -0
- {konduktor_nightly-0.1.0.dev20250209104336.dist-info → konduktor_nightly-0.1.0.dev20250313070642.dist-info}/LICENSE +0 -1
- {konduktor_nightly-0.1.0.dev20250209104336.dist-info → konduktor_nightly-0.1.0.dev20250313070642.dist-info}/METADATA +13 -2
- konduktor_nightly-0.1.0.dev20250313070642.dist-info/RECORD +94 -0
- konduktor_nightly-0.1.0.dev20250209104336.dist-info/RECORD +0 -48
- {konduktor_nightly-0.1.0.dev20250209104336.dist-info → konduktor_nightly-0.1.0.dev20250313070642.dist-info}/WHEEL +0 -0
- {konduktor_nightly-0.1.0.dev20250209104336.dist-info → konduktor_nightly-0.1.0.dev20250313070642.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,393 @@
|
|
1
|
+
# Proprietary Changes made for Trainy under the Trainy Software License
|
2
|
+
# Original source: skypilot: https://github.com/skypilot-org/skypilot
|
3
|
+
# which is Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
# you may not use this file except in compliance with the License.
|
5
|
+
# You may obtain a copy of the License at
|
6
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
7
|
+
# Unless required by applicable law or agreed to in writing, software
|
8
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
9
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
10
|
+
# See the License for the specific language governing permissions and
|
11
|
+
# limitations under the License.
|
12
|
+
|
13
|
+
import datetime
|
14
|
+
import difflib
|
15
|
+
import functools
|
16
|
+
import getpass
|
17
|
+
import hashlib
|
18
|
+
import inspect
|
19
|
+
import os
|
20
|
+
import re
|
21
|
+
import socket
|
22
|
+
import sys
|
23
|
+
import uuid
|
24
|
+
from typing import Any, Callable, Dict, List, Optional, Union
|
25
|
+
|
26
|
+
import jinja2
|
27
|
+
import jsonschema
|
28
|
+
import yaml
|
29
|
+
|
30
|
+
from konduktor.utils import annotations, constants, ux_utils, validator
|
31
|
+
|
32
|
+
# Path of the on-disk cache for the per-user hash (see get_user_hash()).
_USER_HASH_FILE = os.path.expanduser('~/.konduktor/user_hash')
# Lazily-initialized unique id for this process "run" (see get_usage_run_id()).
_usage_run_id = None
# Valid task env var names: letters/digits/underscore, not starting with a
# digit (see is_valid_env_var()).
_VALID_ENV_VAR_REGEX = '[a-zA-Z_][a-zA-Z0-9_]*'
# Number of hex chars kept from the md5 digest when deriving a user hash.
USER_HASH_LENGTH = 8
USER_HASH_LENGTH_IN_CLUSTER_NAME = 4
|
38
|
+
|
39
|
+
def get_timestamp() -> str:
    """Return the current local time formatted as 'YYYYMMDD-HHMMSS'."""
    now = datetime.datetime.now()
    return now.strftime('%Y%m%d-%H%M%S')
|
41
|
+
|
42
|
+
|
43
|
+
def user_and_hostname_hash() -> str:
    """Return '<user>-<last 4 hex chars of md5(hostname)>'.

    For uniquefying user workloads on a shared-k8s cluster. The hostname is
    hashed via gethostname(); uuid.getnode() was observed to collide on Macs.
    """
    digest = hashlib.md5(socket.gethostname().encode()).hexdigest()
    return '-'.join((getpass.getuser(), digest[-4:]))
|
53
|
+
|
54
|
+
|
55
|
+
def base36_encode(hex_str: str) -> str:
    """Converts a hex string to a base36 string."""
    digits = '0123456789abcdefghijklmnopqrstuvwxyz'
    remaining = int(hex_str, 16)
    if remaining == 0:
        return '0'
    # Collect base36 digits least-significant first, then reverse.
    out = []
    while remaining:
        remaining, rem = divmod(remaining, 36)
        out.append(digits[rem])
    return ''.join(reversed(out))
|
70
|
+
|
71
|
+
|
72
|
+
def get_cleaned_username(username: str = '') -> str:
    """Cleans the username. Underscores are allowed, as we will
    handle it when mapping to the cluster_name_on_cloud in
    common_utils.make_cluster_name_on_cloud.

    Clean up includes:
     1. Making all characters lowercase
     2. Removing any non-alphanumeric characters (excluding hyphens and
        underscores)
     3. Removing any numbers and/or hyphens at the start of the username.
     4. Removing any hyphens at the end of the username
     5. Truncate the username to 63 characters, as requested by GCP labels

    Dots are removed due to: https://cloud.google.com/compute/docs/labeling-resources#requirements

    e.g. 1SkY-PiLot2- becomes sky-pilot2

    Returns:
        A cleaned username.
    """  # noqa: E501
    username = username or getpass.getuser()
    username = username.lower()
    username = re.sub(r'[^a-z0-9-_]', '', username)
    username = re.sub(r'^[0-9-]+', '', username)
    # Bug fix: the original pattern '-$' stripped only a single trailing
    # hyphen, contradicting step 4 above; '-+$' removes them all.
    username = re.sub(r'-+$', '', username)
    username = username[:63]
    return username
|
99
|
+
|
100
|
+
|
101
|
+
def is_valid_env_var(name: str) -> bool:
    """Checks if the task environment variable name is valid."""
    match = re.fullmatch(_VALID_ENV_VAR_REGEX, name)
    return match is not None
|
104
|
+
|
105
|
+
|
106
|
+
def get_pretty_entry_point() -> str:
    """Returns the prettified entry point of this process (sys.argv).

    Example return values:
        $ konduktor launch app.yaml  # 'konduktor launch app.yaml'
        $ python examples/app.py     # 'app.py'
    """
    # Bug fix: the original assigned into sys.argv[0] directly, mutating the
    # process-global argv as a side effect. Work on a copy instead.
    argv = list(sys.argv)
    basename = os.path.basename(argv[0])
    if basename == 'konduktor':
        # Turn '/.../anaconda/envs/py36/bin/konduktor' into 'konduktor', but
        # keep other things like 'examples/app.py'.
        argv[0] = basename
    return ' '.join(argv)
|
120
|
+
|
121
|
+
|
122
|
+
@annotations.lru_cache(scope='request')
def get_usage_run_id() -> str:
    """Returns a unique run id for each 'run'.

    A run is defined as the lifetime of a process that has imported `sky`
    and has called its CLI or programmatic APIs. For example, two successive
    `sky launch` are two runs.
    """
    # Cached twice: by the request-scoped lru_cache above and by this
    # module-level global, so every caller in the same run sees one id.
    global _usage_run_id
    if _usage_run_id is None:
        _usage_run_id = str(uuid.uuid4())
    return _usage_run_id
|
134
|
+
|
135
|
+
|
136
|
+
def make_decorator(cls, name_or_fn: Union[str, Callable], **ctx_kwargs) -> Callable:
    """Make the cls a decorator.

    class cls:
        def __init__(self, name, **kwargs):
            pass
        def __enter__(self):
            pass
        def __exit__(self, exc_type, exc_value, traceback):
            pass

    Args:
        name_or_fn: The name of the event or the function to be wrapped.
        message: The message attached to the event.
    """
    if isinstance(name_or_fn, str):
        # Called with a name, e.g. `@make_decorator(cls, 'event-name')`:
        # return a decorator that runs f inside `cls('event-name', ...)`.
        def _wrapper(f):
            @functools.wraps(f)
            def _record(*args, **kwargs):
                with cls(name_or_fn, **ctx_kwargs):
                    return f(*args, **kwargs)

            return _record

        return _wrapper
    else:
        # Applied directly to a function, e.g. `make_decorator(cls, fn)`:
        # derive the context name from the function's qualified name.
        if not inspect.isfunction(name_or_fn):
            raise ValueError('Should directly apply the decorator to a function.')

        @functools.wraps(name_or_fn)
        def _record(*args, **kwargs):
            f = name_or_fn
            func_name = getattr(f, '__qualname__', f.__name__)
            module_name = getattr(f, '__module__', '')
            if module_name:
                full_name = f'{module_name}.{func_name}'
            else:
                full_name = func_name
            with cls(full_name, **ctx_kwargs):
                return f(*args, **kwargs)

        return _record
|
179
|
+
|
180
|
+
|
181
|
+
def get_user_hash(force_fresh_hash: bool = False) -> str:
    """Returns a unique user-machine specific hash as a user id.

    We cache the user hash in a file to avoid potential user_name or
    hostname changes causing a new user hash to be generated.

    Args:
        force_fresh_hash: Bypasses the cached hash in USER_HASH_FILE and the
            hash in the USER_ID_ENV_VAR and forces a fresh user-machine hash
            to be generated. Used by `kubernetes.ssh_key_secret_field_name` to
            avoid controllers sharing the same ssh key field name as the
            local client.
    """

    def _is_valid_user_hash(user_hash: Optional[str]) -> bool:
        # A valid hash is exactly USER_HASH_LENGTH hexadecimal characters.
        if user_hash is None:
            return False
        try:
            int(user_hash, 16)
        except (TypeError, ValueError):
            return False
        return len(user_hash) == USER_HASH_LENGTH

    # Precedence 1: explicit override via the USER_ID_ENV_VAR env var.
    if not force_fresh_hash:
        user_hash = os.getenv(constants.USER_ID_ENV_VAR)
        if _is_valid_user_hash(user_hash):
            assert user_hash is not None
            return user_hash

    # Precedence 2: the hash cached on disk by a previous run.
    if not force_fresh_hash and os.path.exists(_USER_HASH_FILE):
        # Read from cached user hash file.
        with open(_USER_HASH_FILE, 'r', encoding='utf-8') as f:
            # Remove invalid characters.
            user_hash = f.read().strip()
            if _is_valid_user_hash(user_hash):
                return user_hash

    # Precedence 3: derive a fresh hash from '<user>-<hostname hash>'.
    hash_str = user_and_hostname_hash()
    user_hash = hashlib.md5(hash_str.encode()).hexdigest()[:USER_HASH_LENGTH]
    if not _is_valid_user_hash(user_hash):
        # A fallback in case the hash is invalid.
        user_hash = uuid.uuid4().hex[:USER_HASH_LENGTH]
    os.makedirs(os.path.dirname(_USER_HASH_FILE), exist_ok=True)
    if not force_fresh_hash:
        # Do not cache to file if force_fresh_hash is True since the file may
        # be intentionally using a different hash, e.g. we want to keep the
        # user_hash for usage collection the same on the jobs/serve controller
        # as users' local client.
        with open(_USER_HASH_FILE, 'w', encoding='utf-8') as f:
            f.write(user_hash)
    return user_hash
|
232
|
+
|
233
|
+
|
234
|
+
def read_yaml(path: str) -> Dict[str, Any]:
    """Load a single YAML document from the file at `path`."""
    with open(path, 'r', encoding='utf-8') as f:
        return yaml.safe_load(f)
|
238
|
+
|
239
|
+
|
240
|
+
def read_yaml_all(path: str) -> List[Dict[str, Any]]:
    """Load all YAML documents from `path`; an empty file yields [{}]."""
    with open(path, 'r', encoding='utf-8') as f:
        documents = list(yaml.safe_load_all(f))
    # Empty YAML file: keep the original contract of returning one empty doc.
    return documents if documents else [{}]
|
248
|
+
|
249
|
+
|
250
|
+
def validate_schema(obj, schema, err_msg_prefix='', skip_none=True):
    """Validates an object against a given JSON schema.

    Args:
        obj: The object to validate.
        schema: The JSON schema against which to validate the object.
        err_msg_prefix: The string to prepend to the error message if
            validation fails.
        skip_none: If True, removes fields with value None from the object
            before validation. This is useful for objects that will never
            contain None because yaml.safe_load() loads empty fields as None.

    Raises:
        ValueError: if the object does not match the schema.
    """
    if skip_none:
        obj = {k: v for k, v in obj.items() if v is not None}
    err_msg = None
    try:
        validator.SchemaValidator(schema).validate(obj)
    except jsonschema.ValidationError as e:
        if e.validator == 'additionalProperties':
            if tuple(e.schema_path) == ('properties', 'envs', 'additionalProperties'):
                # Hack. Here the error is Task.envs having some invalid keys. So
                # we should not print "unsupported field".
                #
                # This will print something like:
                # 'hello world' does not match any of the regexes: <regex>
                err_msg = (
                    err_msg_prefix
                    + 'The `envs` field contains invalid keys:\n'
                    + e.message
                )
            else:
                # Unknown field: suggest the closest known field name, if any.
                err_msg = err_msg_prefix
                assert isinstance(e.schema, dict), 'Schema must be a dictionary'
                known_fields = set(e.schema.get('properties', {}).keys())
                assert isinstance(e.instance, dict), 'Instance must be a dictionary'
                for field in e.instance:
                    if field not in known_fields:
                        most_similar_field = difflib.get_close_matches(
                            field, known_fields, 1
                        )
                        if most_similar_field:
                            err_msg += (
                                f'Instead of {field!r}, did you mean '
                                f'{most_similar_field[0]!r}?'
                            )
                        else:
                            err_msg += f'Found unsupported field {field!r}.'
        else:
            message = e.message
            # Object in jsonschema is represented as dict in Python. Replace
            # 'object' with 'dict' for better readability.
            message = message.replace("type 'object'", "type 'dict'")
            # Example e.json_path value: '$.resources'
            err_msg = (
                err_msg_prefix
                + message
                + f'. Check problematic field(s): {e.json_path}'
            )

    if err_msg:
        with ux_utils.print_exception_no_traceback():
            raise ValueError(err_msg)
|
315
|
+
|
316
|
+
|
317
|
+
def dump_yaml(path: str, config: Union[List[Dict[str, Any]], Dict[str, Any]]) -> None:
    """Serialize `config` as YAML and write it to `path`."""
    text = dump_yaml_str(config)
    with open(path, 'w', encoding='utf-8') as f:
        f.write(text)
|
320
|
+
|
321
|
+
|
322
|
+
def dump_yaml_str(config: Union[List[Dict[str, Any]], Dict[str, Any]]) -> str:
    """Serialize `config` (one document, or a list of documents) to YAML."""

    # https://github.com/yaml/pyyaml/issues/127
    class LineBreakDumper(yaml.SafeDumper):
        # Insert a blank line between top-level entries for readability.
        def write_line_break(self, data=None):
            super().write_line_break(data)
            if len(self.indents) == 1:
                super().write_line_break()

    dump = yaml.dump_all if isinstance(config, list) else yaml.dump  # type: ignore
    return dump(
        config, Dumper=LineBreakDumper, sort_keys=False, default_flow_style=False
    )
|
337
|
+
|
338
|
+
|
339
|
+
def fill_template(
    template_name: str, variables: Dict[str, Any], output_path: str
) -> None:
    """Create a file from a Jinja template and return the filename."""
    assert template_name.endswith('.j2'), template_name
    # Templates live in '<package root>/templates' (one level above this
    # module's directory).
    root_dir = os.path.dirname(os.path.dirname(__file__))
    template_path = os.path.join(root_dir, 'templates', template_name)
    if not os.path.exists(template_path):
        raise FileNotFoundError(f'Template "{template_name}" does not exist.')
    with open(template_path, 'r', encoding='utf-8') as fin:
        template = fin.read()
    output_path = os.path.abspath(os.path.expanduser(output_path))
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    # Write out yaml config.
    j2_template = jinja2.Template(template)
    content = j2_template.render(**variables)
    with open(output_path, 'w', encoding='utf-8') as fout:
        fout.write(content)
|
358
|
+
|
359
|
+
|
360
|
+
def class_fullname(cls, skip_builtins: bool = True):
    """Get the full name of a class.

    Example:
        >>> e = konduktor.exceptions.FetchClusterInfoError()
        >>> class_fullname(e.__class__)
        'konduktor.exceptions.FetchClusterInfoError'

    Args:
        cls: The class to get the full name.

    Returns:
        The full name of the class.
    """
    module_name = getattr(cls, '__module__', '')
    is_builtin = module_name == 'builtins'
    # Builtins are commonly referred to without their module prefix.
    if not module_name or (is_builtin and skip_builtins):
        return cls.__name__
    return f'{cls.__module__}.{cls.__name__}'
|
378
|
+
|
379
|
+
|
380
|
+
def format_exception(
    e: Union[Exception, SystemExit, KeyboardInterrupt], use_bracket: bool = False
) -> str:
    """Format an exception to a string.

    Args:
        e: The exception to format.

    Returns:
        A string that represents the exception.
    """
    name = class_fullname(e.__class__)
    if use_bracket:
        return f'[{name}] {e}'
    return f'{name}: {e}'
|
@@ -0,0 +1,55 @@
|
|
1
|
+
# Proprietary Changes made for Trainy under the Trainy Software License
|
2
|
+
# Original source: skypilot: https://github.com/skypilot-org/skypilot
|
3
|
+
# which is Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
# you may not use this file except in compliance with the License.
|
5
|
+
# You may obtain a copy of the License at
|
6
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
7
|
+
# Unless required by applicable law or agreed to in writing, software
|
8
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
9
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
10
|
+
# See the License for the specific language governing permissions and
|
11
|
+
# limitations under the License.
|
12
|
+
|
13
|
+
"""Global environment options for konduktor."""
|
14
|
+
|
15
|
+
import enum
|
16
|
+
import os
|
17
|
+
from typing import Dict
|
18
|
+
|
19
|
+
|
20
|
+
class Options(enum.Enum):
    """Boolean environment-variable options for Konduktor.

    Each member's value is an (env var name, default value) tuple.
    """

    IS_DEVELOPER = ('KONDUKTOR_DEV', False)
    SHOW_DEBUG_INFO = ('KONDUKTOR_DEBUG', True)
    DISABLE_LOGGING = ('KONDUKTOR_DISABLE_USAGE_COLLECTION', False)
    MINIMIZE_LOGGING = ('KONDUKTOR_MINIMIZE_LOGGING', False)
    SUPPRESS_SENSITIVE_LOG = ('KONDUKTOR_SUPPRESS_SENSITIVE_LOG', False)
    # Internal: this is used to skip the cloud user identity check, which is
    # used to protect cluster operations in a multi-identity scenario.
    # Currently, this is only used in the job and serve controller, as there
    # will not be multiple identities, and skipping the check can increase
    # robustness.
    SKIP_CLOUD_IDENTITY_CHECK = ('KONDUKTOR_SKIP_CLOUD_IDENTITY_CHECK', False)

    def __init__(self, env_var: str, default: bool) -> None:
        # Enum.__init__ receives the member's value tuple unpacked.
        self.env_var = env_var
        self.default = default

    def __repr__(self) -> str:
        return self.env_var

    def get(self) -> bool:
        """Check if an environment variable is set to True."""
        raw = os.getenv(self.env_var, str(self.default))
        return raw.lower() in ('true', '1')

    @property
    def env_key(self) -> str:
        """The environment variable key name."""
        return self.value[0]

    @classmethod
    def all_options(cls) -> Dict[str, bool]:
        """Returns all options as a dictionary."""
        return {option.env_key: option.get() for option in cls}
|
@@ -0,0 +1,226 @@
|
|
1
|
+
# Proprietary Changes made for Trainy under the Trainy Software License
|
2
|
+
# Original source: skypilot: https://github.com/skypilot-org/skypilot
|
3
|
+
# which is Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
# you may not use this file except in compliance with the License.
|
5
|
+
# You may obtain a copy of the License at
|
6
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
7
|
+
# Unless required by applicable law or agreed to in writing, software
|
8
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
9
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
10
|
+
# See the License for the specific language governing permissions and
|
11
|
+
# limitations under the License.
|
12
|
+
|
13
|
+
"""Exceptions."""
|
14
|
+
|
15
|
+
import builtins
|
16
|
+
import traceback
|
17
|
+
import types
|
18
|
+
from typing import Any, Dict
|
19
|
+
|
20
|
+
# Return code for keyboard interruption and SIGTSTP
KEYBOARD_INTERRUPT_CODE = 130
SIGTSTP_CODE = 146
# rsync exit code for "partial transfer due to error" (e.g. file not found).
RSYNC_FILE_NOT_FOUND_CODE = 23
# Arbitrarily chosen value. Used in SkyPilot's storage mounting scripts
MOUNT_PATH_NON_EMPTY_CODE = 42
# Arbitrarily chosen value. Used to provision Kubernetes instance in Skypilot
INSUFFICIENT_PRIVILEGES_CODE = 52
# Return code when a git command is run in a dir that is not a git repo
GIT_FATAL_EXIT_CODE = 128
|
30
|
+
|
31
|
+
|
32
|
+
def is_safe_exception(exc: Exception) -> bool:
    """Returns True if the exception is safe to send to clients.

    Safe exceptions are:
    1. Built-in exceptions
    2. Konduktor's own exceptions
    """
    module = type(exc).__module__

    # Builtin exceptions (e.g., ValueError, RuntimeError)
    if module == 'builtins':
        return True

    # Konduktor's own exceptions.
    # Bug fix: the original checked the upstream 'sky.' prefix (leftover from
    # the skypilot fork), so konduktor.* exceptions were never treated as
    # safe and were needlessly wrapped by wrap_exception().
    if module.startswith('konduktor.'):
        return True

    return False
|
50
|
+
|
51
|
+
|
52
|
+
def wrap_exception(exc: Exception) -> Exception:
    """Wraps non-safe exceptions into Konduktor exceptions

    This is used to wrap exceptions that are not safe to deserialize at clients.

    Examples include exceptions from cloud providers whose packages are not
    available at clients.
    """
    if is_safe_exception(exc):
        return exc

    exc_type = type(exc)
    # The top-level package of the exception's module identifies the provider.
    provider = exc_type.__module__.split('.')[0]
    return CloudError(
        message=str(exc),
        cloud_provider=provider,
        error_type=exc_type.__name__,
    )
|
68
|
+
|
69
|
+
|
70
|
+
def serialize_exception(e: Exception) -> Dict[str, Any]:
    """Serialize the exception.

    This function also wraps any unsafe exceptions (e.g., cloud exceptions)
    into Konduktor's CloudError before serialization to ensure clients can
    deserialize them without needing cloud provider packages installed.
    """
    # Wrap unsafe exceptions before serialization
    e = wrap_exception(e)

    stacktrace = getattr(e, 'stacktrace', None)
    attributes = e.__dict__.copy()
    if 'stacktrace' in attributes:
        del attributes['stacktrace']
    # Traceback objects are not serializable; replace them with their
    # formatted string form.
    for attr_k in list(attributes.keys()):
        attr_v = attributes[attr_k]
        if isinstance(attr_v, types.TracebackType):
            attributes[attr_k] = traceback.format_tb(attr_v)

    data = {
        'type': e.__class__.__name__,
        'message': str(e),
        'args': e.args,
        'attributes': attributes,
        'stacktrace': stacktrace,
    }
    return data
|
97
|
+
|
98
|
+
|
99
|
+
def deserialize_exception(serialized: Dict[str, Any]) -> Exception:
    """Deserialize the exception."""
    exception_type = serialized['type']
    # Prefer builtin exception types, then fall back to classes defined in
    # this module.
    exception_class = getattr(builtins, exception_type, None)
    if exception_class is None:
        exception_class = globals().get(exception_type, None)
    if exception_class is None:
        # Unknown exception type.
        return Exception(f'{exception_type}: {serialized["message"]}')
    e = exception_class(*serialized['args'], **serialized['attributes'])
    if serialized['stacktrace'] is not None:
        setattr(e, 'stacktrace', serialized['stacktrace'])
    return e
|
113
|
+
|
114
|
+
|
115
|
+
class CloudError(Exception):
    """Wraps a cloud-provider-specific error into a Konduktor exception.

    Keeps the provider name and original error type so clients without the
    provider's package installed can still report the error.
    """

    def __init__(self, message: str, cloud_provider: str, error_type: str):
        super().__init__(message)
        self.cloud_provider = cloud_provider
        self.error_type = error_type

    def __str__(self):
        base = super().__str__()
        return f'{self.cloud_provider} error ({self.error_type}): {base}'
|
127
|
+
|
128
|
+
|
129
|
+
class CommandError(Exception):
    pass


class NotSupportedError(Exception):
    """Raised when a feature is not supported."""

    pass


class StorageError(Exception):
    # Base class for the storage error hierarchy below.
    pass


class StorageSpecError(ValueError):
    # Errors raised due to invalid specification of the Storage object
    pass


class StorageInitError(StorageError):
    # Error raised when Initialization fails - either due to permissions,
    # unavailable name, or other reasons.
    pass


class StorageBucketCreateError(StorageInitError):
    # Error raised when bucket creation fails.
    pass


class StorageBucketGetError(StorageInitError):
    # Error raised if attempt to fetch an existing bucket fails.
    pass


class StorageBucketDeleteError(StorageError):
    # Error raised if attempt to delete an existing bucket fails.
    pass


class StorageUploadError(StorageError):
    # Error raised when bucket is successfully initialized, but upload fails,
    # either due to permissions, ctrl-c, or other reasons.
    pass


class StorageSourceError(StorageSpecError):
    # Error raised when the source of the storage is invalid. E.g., does not
    # exist, malformed path, or other reasons.
    pass


class StorageNameError(StorageSpecError):
    # Error raised when the name of the storage is invalid. E.g., malformed,
    # or other reasons. (Comment fixed: was a copy-paste of
    # StorageSourceError's comment about the *source*.)
    pass


class StorageModeError(StorageSpecError):
    # Error raised when the storage mode is invalid or does not support the
    # requested operation (e.g., passing a file as source to MOUNT mode)
    pass


class StorageExternalDeletionError(StorageBucketGetError):
    # Error raised when the bucket is attempted to be fetched while it has been
    # deleted externally.
    pass


class NonExistentStorageAccountError(StorageExternalDeletionError):
    # Error raised when a storage account provided through config.yaml or read
    # from a store handle (local db) does not exist.
    pass


class NetworkError(Exception):
    """Raised when network fails."""

    pass


class CloudUserIdentityError(Exception):
    """Raised when the cloud identity is invalid."""

    pass


class ClusterOwnerIdentityMismatchError(Exception):
    """The cluster's owner identity does not match the current user identity."""

    pass


class NoCloudAccessError(Exception):
    """Raised when all clouds are disabled."""

    pass
|