konduktor-nightly 0.1.0.dev20251128104812__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- konduktor/__init__.py +49 -0
- konduktor/adaptors/__init__.py +0 -0
- konduktor/adaptors/aws.py +221 -0
- konduktor/adaptors/common.py +118 -0
- konduktor/adaptors/gcp.py +126 -0
- konduktor/authentication.py +124 -0
- konduktor/backends/__init__.py +6 -0
- konduktor/backends/backend.py +86 -0
- konduktor/backends/constants.py +21 -0
- konduktor/backends/deployment.py +204 -0
- konduktor/backends/deployment_utils.py +1351 -0
- konduktor/backends/jobset.py +225 -0
- konduktor/backends/jobset_utils.py +726 -0
- konduktor/backends/pod_utils.py +501 -0
- konduktor/check.py +184 -0
- konduktor/cli.py +1945 -0
- konduktor/config.py +420 -0
- konduktor/constants.py +36 -0
- konduktor/controller/__init__.py +0 -0
- konduktor/controller/constants.py +56 -0
- konduktor/controller/launch.py +44 -0
- konduktor/controller/node.py +116 -0
- konduktor/controller/parse.py +111 -0
- konduktor/dashboard/README.md +30 -0
- konduktor/dashboard/backend/main.py +169 -0
- konduktor/dashboard/backend/sockets.py +154 -0
- konduktor/dashboard/frontend/.eslintrc.json +3 -0
- konduktor/dashboard/frontend/.gitignore +36 -0
- konduktor/dashboard/frontend/app/api/jobs/route.js +71 -0
- konduktor/dashboard/frontend/app/api/namespaces/route.js +69 -0
- konduktor/dashboard/frontend/app/components/Grafana.jsx +66 -0
- konduktor/dashboard/frontend/app/components/JobsData.jsx +197 -0
- konduktor/dashboard/frontend/app/components/LogsData.jsx +139 -0
- konduktor/dashboard/frontend/app/components/NavMenu.jsx +39 -0
- konduktor/dashboard/frontend/app/components/NavTabs.jsx +73 -0
- konduktor/dashboard/frontend/app/components/NavTabs2.jsx +30 -0
- konduktor/dashboard/frontend/app/components/SelectBtn.jsx +27 -0
- konduktor/dashboard/frontend/app/components/lib/utils.js +6 -0
- konduktor/dashboard/frontend/app/components/ui/chip-select.jsx +78 -0
- konduktor/dashboard/frontend/app/components/ui/input.jsx +19 -0
- konduktor/dashboard/frontend/app/components/ui/navigation-menu.jsx +104 -0
- konduktor/dashboard/frontend/app/components/ui/select.jsx +120 -0
- konduktor/dashboard/frontend/app/favicon.ico +0 -0
- konduktor/dashboard/frontend/app/globals.css +120 -0
- konduktor/dashboard/frontend/app/jobs/page.js +10 -0
- konduktor/dashboard/frontend/app/layout.js +22 -0
- konduktor/dashboard/frontend/app/logs/page.js +11 -0
- konduktor/dashboard/frontend/app/page.js +12 -0
- konduktor/dashboard/frontend/jsconfig.json +7 -0
- konduktor/dashboard/frontend/next.config.mjs +4 -0
- konduktor/dashboard/frontend/package-lock.json +6687 -0
- konduktor/dashboard/frontend/package.json +37 -0
- konduktor/dashboard/frontend/postcss.config.mjs +8 -0
- konduktor/dashboard/frontend/server.js +64 -0
- konduktor/dashboard/frontend/tailwind.config.js +17 -0
- konduktor/data/__init__.py +9 -0
- konduktor/data/aws/__init__.py +15 -0
- konduktor/data/aws/s3.py +1138 -0
- konduktor/data/constants.py +7 -0
- konduktor/data/data_utils.py +268 -0
- konduktor/data/gcp/__init__.py +19 -0
- konduktor/data/gcp/constants.py +42 -0
- konduktor/data/gcp/gcs.py +994 -0
- konduktor/data/gcp/utils.py +9 -0
- konduktor/data/registry.py +19 -0
- konduktor/data/storage.py +812 -0
- konduktor/data/storage_utils.py +535 -0
- konduktor/execution.py +447 -0
- konduktor/kube_client.py +237 -0
- konduktor/logging.py +111 -0
- konduktor/manifests/aibrix-setup.yaml +430 -0
- konduktor/manifests/apoxy-setup.yaml +184 -0
- konduktor/manifests/apoxy-setup2.yaml +98 -0
- konduktor/manifests/controller_deployment.yaml +69 -0
- konduktor/manifests/dashboard_deployment.yaml +131 -0
- konduktor/manifests/dmesg_daemonset.yaml +57 -0
- konduktor/manifests/pod_cleanup_controller.yaml +129 -0
- konduktor/resource.py +546 -0
- konduktor/serving.py +153 -0
- konduktor/task.py +949 -0
- konduktor/templates/deployment.yaml.j2 +191 -0
- konduktor/templates/jobset.yaml.j2 +43 -0
- konduktor/templates/pod.yaml.j2 +563 -0
- konduktor/usage/__init__.py +0 -0
- konduktor/usage/constants.py +21 -0
- konduktor/utils/__init__.py +0 -0
- konduktor/utils/accelerator_registry.py +17 -0
- konduktor/utils/annotations.py +62 -0
- konduktor/utils/base64_utils.py +95 -0
- konduktor/utils/common_utils.py +426 -0
- konduktor/utils/constants.py +5 -0
- konduktor/utils/env_options.py +55 -0
- konduktor/utils/exceptions.py +234 -0
- konduktor/utils/kubernetes_enums.py +8 -0
- konduktor/utils/kubernetes_utils.py +763 -0
- konduktor/utils/log_utils.py +467 -0
- konduktor/utils/loki_utils.py +102 -0
- konduktor/utils/rich_utils.py +123 -0
- konduktor/utils/schemas.py +625 -0
- konduktor/utils/subprocess_utils.py +273 -0
- konduktor/utils/ux_utils.py +247 -0
- konduktor/utils/validator.py +461 -0
- konduktor_nightly-0.1.0.dev20251128104812.dist-info/LICENSE +91 -0
- konduktor_nightly-0.1.0.dev20251128104812.dist-info/METADATA +98 -0
- konduktor_nightly-0.1.0.dev20251128104812.dist-info/RECORD +107 -0
- konduktor_nightly-0.1.0.dev20251128104812.dist-info/WHEEL +4 -0
- konduktor_nightly-0.1.0.dev20251128104812.dist-info/entry_points.txt +3 -0
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Utility for (un)zip and encode/decoding k8s secrets in base64
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import base64
|
|
6
|
+
import os
|
|
7
|
+
import shutil
|
|
8
|
+
import tempfile
|
|
9
|
+
import zipfile
|
|
10
|
+
from typing import List
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def zip_base64encode(files: List[str]) -> str:
    """Zips files and encodes them in base64.

    Args:
        files: List of file paths to zip. Can include files and directories.
            Paths that do not exist are silently skipped. Each entry is
            archived under its basename (so '/a/b/c' becomes 'c').

    Returns:
        Base64 encoded string of the zipped files.
    """
    with tempfile.TemporaryDirectory() as temp_dir:
        # Copy all files/directories to temp dir preserving structure.
        for src in files:
            src_path = os.path.expanduser(src)
            if not os.path.exists(src_path):
                continue
            dst_path = os.path.join(temp_dir, os.path.basename(src))

            if os.path.isdir(src_path):
                shutil.copytree(src_path, dst_path)
            else:
                shutil.copy2(src_path, dst_path)

        # Create the zip archive next to the staged copies.
        zip_path = os.path.join(temp_dir, 'archive.zip')
        with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
            for item in os.listdir(temp_dir):
                if item == 'archive.zip':
                    # Never include the archive inside itself.
                    continue
                item_path = os.path.join(temp_dir, item)
                if os.path.isfile(item_path):
                    zipf.write(item_path, item)
                else:
                    # Walk directories, preserving internal structure. Use
                    # names distinct from the `files` parameter so we do not
                    # shadow it.
                    for root, _, walked_files in os.walk(item_path):
                        for fname in walked_files:
                            if fname == '.DS_Store':
                                # Skip macOS filesystem metadata.
                                continue
                            abs_path = os.path.join(root, fname)
                            arcname = os.path.relpath(abs_path, temp_dir)
                            zipf.write(abs_path, arcname)

        # Read and base64-encode the archive bytes.
        with open(zip_path, 'rb') as f:
            zip_bytes = f.read()
        return base64.b64encode(zip_bytes).decode('utf-8')
def base64decode_unzip(secret_value: str, output_path: str) -> str:
    """Decodes a base64 encoded string and unzips the files.

    The decoded bytes are themselves a zip archive (as produced by
    `zip_base64encode`), so we write them to a temporary file and extract
    directly. This avoids the previous roundabout approach of wrapping the
    archive inside a second zip, which also left a stray 'data.zip'
    artifact in `output_path`.

    Args:
        secret_value: Base64 encoded string of the zipped files.
        output_path: Path where to extract the unzipped files.

    Returns:
        Path to the unzipped files (same as `output_path`).
    """
    decoded_data = base64.b64decode(secret_value)

    with tempfile.TemporaryDirectory() as temp_dir:
        zip_path = os.path.join(temp_dir, 'archive.zip')
        with open(zip_path, 'wb') as f:
            f.write(decoded_data)
        with zipfile.ZipFile(zip_path, 'r') as zipf:
            zipf.extractall(path=output_path)

    return output_path
|
@@ -0,0 +1,426 @@
|
|
|
1
|
+
# Proprietary Changes made for Trainy under the Trainy Software License
|
|
2
|
+
# Original source: skypilot: https://github.com/skypilot-org/skypilot
|
|
3
|
+
# which is Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
7
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
8
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
9
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
10
|
+
# See the License for the specific language governing permissions and
|
|
11
|
+
# limitations under the License.
|
|
12
|
+
|
|
13
|
+
import datetime
|
|
14
|
+
import difflib
|
|
15
|
+
import functools
|
|
16
|
+
import getpass
|
|
17
|
+
import hashlib
|
|
18
|
+
import inspect
|
|
19
|
+
import os
|
|
20
|
+
import random
|
|
21
|
+
import re
|
|
22
|
+
import socket
|
|
23
|
+
import sys
|
|
24
|
+
import uuid
|
|
25
|
+
from typing import Any, Callable, Dict, List, Optional, Union
|
|
26
|
+
|
|
27
|
+
import jinja2
|
|
28
|
+
import jsonschema
|
|
29
|
+
import yaml # type: ignore
|
|
30
|
+
|
|
31
|
+
from konduktor.utils import annotations, constants, ux_utils, validator
|
|
32
|
+
|
|
33
|
+
_USER_HASH_FILE = os.path.expanduser('~/.konduktor/user_hash')
|
|
34
|
+
_usage_run_id = None
|
|
35
|
+
_VALID_ENV_VAR_REGEX = '[a-zA-Z_][a-zA-Z0-9_]*'
|
|
36
|
+
USER_HASH_LENGTH = 8
|
|
37
|
+
USER_HASH_LENGTH_IN_CLUSTER_NAME = 4
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def get_timestamp() -> str:
    """Return the current local time formatted as 'YYYYMMDD-HHMMSS'."""
    now = datetime.datetime.now()
    return now.strftime('%Y%m%d-%H%M%S')
def user_and_hostname_hash() -> str:
    """Returns a string containing <user>-<hostname hash last 4 chars>.

    For uniquifying user workloads on a shared-k8s cluster.

    Using uuid.getnode() instead of gethostname() is incorrect; observed to
    collide on Macs.
    """
    host_digest = hashlib.md5(socket.gethostname().encode()).hexdigest()
    username = getpass.getuser()
    return '-'.join([username, host_digest[-4:]])
def base36_encode(hex_str: str) -> str:
    """Converts a hex string to a base36 string."""
    alphabet = '0123456789abcdefghijklmnopqrstuvwxyz'
    num = int(hex_str, 16)
    if num == 0:
        return '0'
    # Collect base-36 digits least-significant first, then reverse.
    digits = []
    while num:
        num, rem = divmod(num, 36)
        digits.append(alphabet[rem])
    return ''.join(reversed(digits))
def get_cleaned_username(username: str = '') -> str:
    """Cleans the username. Underscores are allowed, as we will
    handle it when mapping to the cluster_name_on_cloud in
    common_utils.make_cluster_name_on_cloud.

    Clean up includes:
     1. Making all characters lowercase
     2. Removing any non-alphanumeric characters (excluding hyphens and
        underscores)
     3. Removing any numbers and/or hyphens at the start of the username.
     4. Removing any hyphens at the end of the username
     5. Truncate the username to 63 characters, as requested by GCP labels

    Dots are removed due to: https://cloud.google.com/compute/docs/labeling-resources#requirements

    e.g. 1SkY-PiLot2- becomes sky-pilot2

    Returns:
        A cleaned username.
    """  # noqa: E501
    username = username or getpass.getuser()
    username = username.lower()
    username = re.sub(r'[^a-z0-9-_]', '', username)
    username = re.sub(r'^[0-9-]+', '', username)
    # Fix: strip ALL trailing hyphens (the docstring promises "any hyphens at
    # the end"); the previous `-$` pattern removed only one.
    username = re.sub(r'-+$', '', username)
    username = username[:63]
    return username
def is_valid_env_var(name: str) -> bool:
    """Checks if the task environment variable name is valid."""
    match = re.fullmatch(_VALID_ENV_VAR_REGEX, name)
    return match is not None
def get_pretty_entry_point() -> str:
    """Returns the prettified entry point of this process (sys.argv).

    Example return values:
        $ konduktor launch app.yaml  # 'konduktor launch app.yaml'
        $ python examples/app.py     # 'app.py'
    """
    argv = sys.argv
    first = os.path.basename(argv[0])
    if first == 'konduktor':
        # Shorten an absolute path like '/.../envs/py36/bin/konduktor' down
        # to just 'konduktor', but keep other entry points such as
        # 'examples/app.py' untouched.
        argv[0] = first
    return ' '.join(argv)
@annotations.lru_cache(scope='request')
def get_usage_run_id() -> str:
    """Returns a unique run id for each 'run'.

    A run is defined as the lifetime of a process that has imported the
    package and has called its CLI or programmatic APIs. For example, two
    successive `konduktor launch` invocations are two runs.
    """
    global _usage_run_id
    # Lazily generate the id once per process; the cached value is reused on
    # subsequent calls.
    _usage_run_id = _usage_run_id or str(uuid.uuid4())
    return _usage_run_id
def make_decorator(cls, name_or_fn: Union[str, Callable], **ctx_kwargs) -> Callable:
    """Make the cls a decorator.

    class cls:
        def __init__(self, name, **kwargs):
            pass
        def __enter__(self):
            pass
        def __exit__(self, exc_type, exc_value, traceback):
            pass

    Args:
        name_or_fn: The name of the event, or the function to wrap directly
            (bare-decorator usage).
        ctx_kwargs: Extra keyword arguments forwarded to `cls`.
    """
    if isinstance(name_or_fn, str):
        # Used as `@make_decorator(cls, 'event-name', ...)`: return a
        # decorator that runs the target inside a `cls` context.
        def decorator(func):
            @functools.wraps(func)
            def wrapped(*args, **kwargs):
                with cls(name_or_fn, **ctx_kwargs):
                    return func(*args, **kwargs)

            return wrapped

        return decorator

    if not inspect.isfunction(name_or_fn):
        raise ValueError('Should directly apply the decorator to a function.')

    # Bare-decorator usage: derive the event name from the function's
    # module-qualified name at call time.
    func = name_or_fn

    @functools.wraps(func)
    def wrapped(*args, **kwargs):
        func_name = getattr(func, '__qualname__', func.__name__)
        module_name = getattr(func, '__module__', '')
        full_name = f'{module_name}.{func_name}' if module_name else func_name
        with cls(full_name, **ctx_kwargs):
            return func(*args, **kwargs)

    return wrapped
def get_user_hash(force_fresh_hash: bool = False) -> str:
    """Returns a unique user-machine specific hash as a user id.

    We cache the user hash in a file to avoid potential user_name or
    hostname changes causing a new user hash to be generated.

    Args:
        force_fresh_hash: Bypasses the cached hash in USER_HASH_FILE and the
            hash in the USER_ID_ENV_VAR and forces a fresh user-machine hash
            to be generated. Used by `kubernetes.ssh_key_secret_field_name` to
            avoid controllers sharing the same ssh key field name as the
            local client.
    """

    # Test hook: an explicit override always wins.
    override = os.environ.get('KONDUKTOR_TEST_USER_HASH')
    if override:
        return override

    def _valid(candidate: Optional[str]) -> bool:
        # A valid hash is a hex string of exactly USER_HASH_LENGTH chars.
        if candidate is None:
            return False
        try:
            int(candidate, 16)
        except (TypeError, ValueError):
            return False
        return len(candidate) == USER_HASH_LENGTH

    if not force_fresh_hash:
        # Prefer the hash supplied via the environment, then the on-disk
        # cache.
        env_hash = os.getenv(constants.USER_ID_ENV_VAR)
        if _valid(env_hash):
            assert env_hash is not None
            return env_hash
        if os.path.exists(_USER_HASH_FILE):
            with open(_USER_HASH_FILE, 'r', encoding='utf-8') as f:
                cached = f.read().strip()
            if _valid(cached):
                return cached

    # Derive a fresh hash from <user>-<hostname>.
    digest = hashlib.md5(user_and_hostname_hash().encode()).hexdigest()
    user_hash = digest[:USER_HASH_LENGTH]
    if not _valid(user_hash):
        # A fallback in case the hash is invalid.
        user_hash = uuid.uuid4().hex[:USER_HASH_LENGTH]
    os.makedirs(os.path.dirname(_USER_HASH_FILE), exist_ok=True)
    if not force_fresh_hash:
        # Do not cache to file if force_fresh_hash is True since the file may
        # be intentionally using a different hash, e.g. we want to keep the
        # user_hash for usage collection the same on the jobs/serve controller
        # as users' local client.
        with open(_USER_HASH_FILE, 'w', encoding='utf-8') as f:
            f.write(user_hash)
    return user_hash
def read_yaml(path: str) -> Dict[str, Any]:
    """Load a single-document YAML file and return its contents."""
    with open(path, 'r', encoding='utf-8') as f:
        return yaml.safe_load(f)
def read_yaml_all(path: str) -> List[Dict[str, Any]]:
    """Load a multi-document YAML file.

    Returns:
        One dict per YAML document; `[{}]` if the file is empty.
    """
    with open(path, 'r', encoding='utf-8') as f:
        documents = list(yaml.safe_load_all(f))
    # An empty YAML file yields no documents at all.
    return documents if documents else [{}]
def validate_schema(obj, schema, err_msg_prefix='', skip_none=True):
    """Validates an object against a given JSON schema.

    Args:
        obj: The object to validate.
        schema: The JSON schema against which to validate the object.
        err_msg_prefix: The string to prepend to the error message if
            validation fails.
        skip_none: If True, removes fields with value None from the object
            before validation. This is useful for objects that will never contain
            None because yaml.safe_load() loads empty fields as None.

    Raises:
        ValueError: if the object does not match the schema.
    """
    if skip_none:
        # Drop None-valued fields so that empty YAML fields (loaded as None)
        # do not trip type checks in the schema.
        obj = {k: v for k, v in obj.items() if v is not None}
    err_msg = None
    try:
        validator.SchemaValidator(schema).validate(obj)
    except jsonschema.ValidationError as e:
        # 'additionalProperties' failures mean the object carries keys the
        # schema does not allow; craft a friendlier message than jsonschema's.
        if e.validator == 'additionalProperties':
            if tuple(e.schema_path) == ('properties', 'envs', 'additionalProperties'):
                # Hack. Here the error is Task.envs having some invalid keys. So
                # we should not print "unsupported field".
                #
                # This will print something like:
                # 'hello world' does not match any of the regexes: <regex>
                err_msg = (
                    err_msg_prefix
                    + 'The `envs` field contains invalid keys:\n'
                    + e.message
                )
            else:
                # Generic unknown-field case: suggest the closest known field
                # name (typo correction) when one exists.
                err_msg = err_msg_prefix
                assert isinstance(e.schema, dict), 'Schema must be a dictionary'
                known_fields = set(e.schema.get('properties', {}).keys())
                assert isinstance(e.instance, dict), 'Instance must be a dictionary'
                for field in e.instance:
                    if field not in known_fields:
                        most_similar_field = difflib.get_close_matches(
                            field, known_fields, 1
                        )
                        if most_similar_field:
                            err_msg += (
                                f'Instead of {field!r}, did you mean '
                                f'{most_similar_field[0]!r}?'
                            )
                        else:
                            err_msg += f'Found unsupported field {field!r}.'
        else:
            message = e.message
            # Object in jsonschema is represented as dict in Python. Replace
            # 'object' with 'dict' for better readability.
            message = message.replace("type 'object'", "type 'dict'")
            # Example e.json_path value: '$.resources'
            err_msg = (
                err_msg_prefix
                + message
                + f'. Check problematic field(s): {e.json_path}'
            )

    if err_msg:
        # Raise without the (noisy) jsonschema traceback attached.
        with ux_utils.print_exception_no_traceback():
            raise ValueError(err_msg)
def dump_yaml(path: str, config: Union[List[Dict[str, Any]], Dict[str, Any]]) -> None:
    """Serialize `config` to YAML and write it to `path`."""
    serialized = dump_yaml_str(config)
    with open(path, 'w', encoding='utf-8') as f:
        f.write(serialized)
def dump_yaml_str(config: Union[List[Dict[str, Any]], Dict[str, Any]]) -> str:
    """Serialize one YAML document (dict) or several (list of dicts) to a string."""

    # Emit an extra blank line between top-level entries for readability.
    # https://github.com/yaml/pyyaml/issues/127
    class BlankLineDumper(yaml.SafeDumper):
        def write_line_break(self, data=None):
            super().write_line_break(data)
            if len(self.indents) == 1:
                super().write_line_break()

    dump = yaml.dump_all if isinstance(config, list) else yaml.dump  # type: ignore
    return dump(
        config, Dumper=BlankLineDumper, sort_keys=False, default_flow_style=False
    )
def fill_template(
    template_name: str, variables: Dict[str, Any], output_path: str
) -> None:
    """Create a file from a Jinja template and return the filename."""
    assert template_name.endswith('.j2'), template_name
    # Templates live in <package root>/templates/.
    package_root = os.path.dirname(os.path.dirname(__file__))
    template_path = os.path.join(package_root, 'templates', template_name)
    if not os.path.exists(template_path):
        raise FileNotFoundError(f'Template "{template_name}" does not exist.')
    with open(template_path, 'r', encoding='utf-8') as fin:
        template_text = fin.read()
    output_path = os.path.abspath(os.path.expanduser(output_path))
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    # Render and write out the yaml config.
    rendered = jinja2.Template(template_text).render(**variables)
    with open(output_path, 'w', encoding='utf-8') as fout:
        fout.write(rendered)
def class_fullname(cls, skip_builtins: bool = True):
    """Get the full name of a class.

    Example:
        >>> e = konduktor.exceptions.FetchClusterInfoError()
        >>> class_fullname(e.__class__)
        'konduktor.exceptions.FetchClusterInfoError'

    Args:
        cls: The class to get the full name.
        skip_builtins: If True, builtin classes are returned without the
            'builtins.' module prefix.

    Returns:
        The full name of the class.
    """
    module_name = getattr(cls, '__module__', '')
    if not module_name:
        return cls.__name__
    if skip_builtins and module_name == 'builtins':
        return cls.__name__
    return f'{cls.__module__}.{cls.__name__}'
def format_exception(
    e: Union[Exception, SystemExit, KeyboardInterrupt], use_bracket: bool = False
) -> str:
    """Format an exception to a string.

    Args:
        e: The exception to format.
        use_bracket: If True, wrap the class name in square brackets instead
            of suffixing it with a colon.

    Returns:
        A string that represents the exception.
    """
    name = class_fullname(e.__class__)
    if use_bracket:
        return f'[{name}] {e}'
    return f'{name}: {e}'
class Backoff:
    """Exponential backoff with jittering."""

    MULTIPLIER = 1.6
    JITTER = 0.4

    # https://github.com/grpc/grpc/blob/2d4f3c56001cd1e1f85734b2f7c5ce5f2797c38a/doc/connection-backoff.md
    # https://github.com/grpc/grpc/blob/5fc3ff82032d0ebc4bf252a170ebe66aacf9ed9d/src/core/lib/backoff/backoff.cc

    def __init__(self, initial_backoff: float = 5, max_backoff_factor: int = 5):
        # True until the first call to current_backoff().
        self._initial = True
        self._backoff = 0.0
        self._initial_backoff = initial_backoff
        self._max_backoff = max_backoff_factor * self._initial_backoff

    def current_backoff(self) -> float:
        """Backs off once and returns the current backoff in seconds."""
        if self._initial:
            # First call: start from the configured initial backoff.
            self._initial = False
            next_backoff = self._initial_backoff
        else:
            # Subsequent calls: grow geometrically.
            next_backoff = self._backoff * self.MULTIPLIER
        self._backoff = min(next_backoff, self._max_backoff)
        # Jitter by up to +/- JITTER fraction of the current value.
        self._backoff += random.uniform(
            -self.JITTER * self._backoff, self.JITTER * self._backoff
        )
        return self._backoff
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
# Proprietary Changes made for Trainy under the Trainy Software License
|
|
2
|
+
# Original source: skypilot: https://github.com/skypilot-org/skypilot
|
|
3
|
+
# which is Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
7
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
8
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
9
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
10
|
+
# See the License for the specific language governing permissions and
|
|
11
|
+
# limitations under the License.
|
|
12
|
+
|
|
13
|
+
"""Global environment options for konduktor."""
|
|
14
|
+
|
|
15
|
+
import enum
|
|
16
|
+
import os
|
|
17
|
+
from typing import Dict
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class Options(enum.Enum):
    """Boolean environment-variable options for konduktor.

    Each member's value is an (env var name, default value) tuple; the
    custom `__init__` unpacks the tuple into the `env_var` and `default`
    attributes.
    """

    # (env var name, default value)
    IS_DEVELOPER = ('KONDUKTOR_DEV', False)
    SHOW_DEBUG_INFO = ('KONDUKTOR_DEBUG', True)
    DISABLE_LOGGING = ('KONDUKTOR_DISABLE_USAGE_COLLECTION', False)
    MINIMIZE_LOGGING = ('KONDUKTOR_MINIMIZE_LOGGING', False)
    SUPPRESS_SENSITIVE_LOG = ('KONDUKTOR_SUPPRESS_SENSITIVE_LOG', False)
    # Internal: this is used to skip the cloud user identity check, which is
    # used to protect cluster operations in a multi-identity scenario.
    # Currently, this is only used in the job and serve controller, as there
    # will not be multiple identities, and skipping the check can increase
    # robustness.
    SKIP_CLOUD_IDENTITY_CHECK = ('KONDUKTOR_SKIP_CLOUD_IDENTITY_CHECK', False)

    def __init__(self, env_var: str, default: bool) -> None:
        self.env_var = env_var
        self.default = default

    def __repr__(self) -> str:
        return self.env_var

    def get(self) -> bool:
        """Check if an environment variable is set to True.

        Unset variables fall back to the member's default; the strings
        'true' and '1' (case-insensitive) count as True.
        """
        return os.getenv(self.env_var, str(self.default)).lower() in ('true', '1')

    @property
    def env_key(self) -> str:
        """The environment variable key name (same as `env_var`)."""
        return self.value[0]

    @classmethod
    def all_options(cls) -> Dict[str, bool]:
        """Returns all options as a dictionary keyed by env var name."""
        # Iterate the enum class directly; wrapping it in list() was
        # redundant.
        return {option.env_key: option.get() for option in cls}
|