konduktor-nightly 0.1.0.dev20251128104812__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- konduktor/__init__.py +49 -0
- konduktor/adaptors/__init__.py +0 -0
- konduktor/adaptors/aws.py +221 -0
- konduktor/adaptors/common.py +118 -0
- konduktor/adaptors/gcp.py +126 -0
- konduktor/authentication.py +124 -0
- konduktor/backends/__init__.py +6 -0
- konduktor/backends/backend.py +86 -0
- konduktor/backends/constants.py +21 -0
- konduktor/backends/deployment.py +204 -0
- konduktor/backends/deployment_utils.py +1351 -0
- konduktor/backends/jobset.py +225 -0
- konduktor/backends/jobset_utils.py +726 -0
- konduktor/backends/pod_utils.py +501 -0
- konduktor/check.py +184 -0
- konduktor/cli.py +1945 -0
- konduktor/config.py +420 -0
- konduktor/constants.py +36 -0
- konduktor/controller/__init__.py +0 -0
- konduktor/controller/constants.py +56 -0
- konduktor/controller/launch.py +44 -0
- konduktor/controller/node.py +116 -0
- konduktor/controller/parse.py +111 -0
- konduktor/dashboard/README.md +30 -0
- konduktor/dashboard/backend/main.py +169 -0
- konduktor/dashboard/backend/sockets.py +154 -0
- konduktor/dashboard/frontend/.eslintrc.json +3 -0
- konduktor/dashboard/frontend/.gitignore +36 -0
- konduktor/dashboard/frontend/app/api/jobs/route.js +71 -0
- konduktor/dashboard/frontend/app/api/namespaces/route.js +69 -0
- konduktor/dashboard/frontend/app/components/Grafana.jsx +66 -0
- konduktor/dashboard/frontend/app/components/JobsData.jsx +197 -0
- konduktor/dashboard/frontend/app/components/LogsData.jsx +139 -0
- konduktor/dashboard/frontend/app/components/NavMenu.jsx +39 -0
- konduktor/dashboard/frontend/app/components/NavTabs.jsx +73 -0
- konduktor/dashboard/frontend/app/components/NavTabs2.jsx +30 -0
- konduktor/dashboard/frontend/app/components/SelectBtn.jsx +27 -0
- konduktor/dashboard/frontend/app/components/lib/utils.js +6 -0
- konduktor/dashboard/frontend/app/components/ui/chip-select.jsx +78 -0
- konduktor/dashboard/frontend/app/components/ui/input.jsx +19 -0
- konduktor/dashboard/frontend/app/components/ui/navigation-menu.jsx +104 -0
- konduktor/dashboard/frontend/app/components/ui/select.jsx +120 -0
- konduktor/dashboard/frontend/app/favicon.ico +0 -0
- konduktor/dashboard/frontend/app/globals.css +120 -0
- konduktor/dashboard/frontend/app/jobs/page.js +10 -0
- konduktor/dashboard/frontend/app/layout.js +22 -0
- konduktor/dashboard/frontend/app/logs/page.js +11 -0
- konduktor/dashboard/frontend/app/page.js +12 -0
- konduktor/dashboard/frontend/jsconfig.json +7 -0
- konduktor/dashboard/frontend/next.config.mjs +4 -0
- konduktor/dashboard/frontend/package-lock.json +6687 -0
- konduktor/dashboard/frontend/package.json +37 -0
- konduktor/dashboard/frontend/postcss.config.mjs +8 -0
- konduktor/dashboard/frontend/server.js +64 -0
- konduktor/dashboard/frontend/tailwind.config.js +17 -0
- konduktor/data/__init__.py +9 -0
- konduktor/data/aws/__init__.py +15 -0
- konduktor/data/aws/s3.py +1138 -0
- konduktor/data/constants.py +7 -0
- konduktor/data/data_utils.py +268 -0
- konduktor/data/gcp/__init__.py +19 -0
- konduktor/data/gcp/constants.py +42 -0
- konduktor/data/gcp/gcs.py +994 -0
- konduktor/data/gcp/utils.py +9 -0
- konduktor/data/registry.py +19 -0
- konduktor/data/storage.py +812 -0
- konduktor/data/storage_utils.py +535 -0
- konduktor/execution.py +447 -0
- konduktor/kube_client.py +237 -0
- konduktor/logging.py +111 -0
- konduktor/manifests/aibrix-setup.yaml +430 -0
- konduktor/manifests/apoxy-setup.yaml +184 -0
- konduktor/manifests/apoxy-setup2.yaml +98 -0
- konduktor/manifests/controller_deployment.yaml +69 -0
- konduktor/manifests/dashboard_deployment.yaml +131 -0
- konduktor/manifests/dmesg_daemonset.yaml +57 -0
- konduktor/manifests/pod_cleanup_controller.yaml +129 -0
- konduktor/resource.py +546 -0
- konduktor/serving.py +153 -0
- konduktor/task.py +949 -0
- konduktor/templates/deployment.yaml.j2 +191 -0
- konduktor/templates/jobset.yaml.j2 +43 -0
- konduktor/templates/pod.yaml.j2 +563 -0
- konduktor/usage/__init__.py +0 -0
- konduktor/usage/constants.py +21 -0
- konduktor/utils/__init__.py +0 -0
- konduktor/utils/accelerator_registry.py +17 -0
- konduktor/utils/annotations.py +62 -0
- konduktor/utils/base64_utils.py +95 -0
- konduktor/utils/common_utils.py +426 -0
- konduktor/utils/constants.py +5 -0
- konduktor/utils/env_options.py +55 -0
- konduktor/utils/exceptions.py +234 -0
- konduktor/utils/kubernetes_enums.py +8 -0
- konduktor/utils/kubernetes_utils.py +763 -0
- konduktor/utils/log_utils.py +467 -0
- konduktor/utils/loki_utils.py +102 -0
- konduktor/utils/rich_utils.py +123 -0
- konduktor/utils/schemas.py +625 -0
- konduktor/utils/subprocess_utils.py +273 -0
- konduktor/utils/ux_utils.py +247 -0
- konduktor/utils/validator.py +461 -0
- konduktor_nightly-0.1.0.dev20251128104812.dist-info/LICENSE +91 -0
- konduktor_nightly-0.1.0.dev20251128104812.dist-info/METADATA +98 -0
- konduktor_nightly-0.1.0.dev20251128104812.dist-info/RECORD +107 -0
- konduktor_nightly-0.1.0.dev20251128104812.dist-info/WHEEL +4 -0
- konduktor_nightly-0.1.0.dev20251128104812.dist-info/entry_points.txt +3 -0
konduktor/__init__.py
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
"""The Konduktor package."""
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
import subprocess
|
|
5
|
+
|
|
6
|
+
from konduktor.execution import launch
|
|
7
|
+
from konduktor.resource import Resources
|
|
8
|
+
from konduktor.serving import Serving
|
|
9
|
+
from konduktor.task import Task
|
|
10
|
+
|
|
11
|
+
__all__ = ['launch', 'Resources', 'Task', 'Serving']
|
|
12
|
+
|
|
13
|
+
# Replaced with the current commit when building the wheels.
|
|
14
|
+
_KONDUKTOR_COMMIT_SHA = '2c7cb302ac6c8f558befdf1ad323e30c47f1ba71'
|
|
15
|
+
os.makedirs(os.path.expanduser('~/.konduktor'), exist_ok=True)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def _get_git_commit():
    """Return the commit identifier for this installation of the package.

    When ``_KONDUKTOR_COMMIT_SHA`` no longer contains the placeholder text
    ``KONDUKTOR_COMMIT_SHA`` (i.e. it was replaced at wheel-build time), that
    value is returned as-is. Otherwise git is queried from the package
    directory; ``-dirty`` is appended when ``git status --porcelain`` reports
    any change. If git cannot be queried for any reason, the (placeholder)
    constant is returned instead.
    """
    if 'KONDUKTOR_COMMIT_SHA' not in _KONDUKTOR_COMMIT_SHA:
        # Release build: the SHA was already baked in.
        return _KONDUKTOR_COMMIT_SHA

    def _run_git(repo_dir, *git_args):
        # Run one git sub-command quietly and return its trimmed stdout.
        return subprocess.check_output(
            ['git', *git_args],
            cwd=repo_dir,
            universal_newlines=True,
            stderr=subprocess.DEVNULL,
        ).strip()

    # Development build (pip install -e .): ask git for the current commit.
    try:
        repo_dir = os.path.dirname(__file__)
        head_sha = _run_git(repo_dir, 'rev-parse', 'HEAD')
        pending_changes = _run_git(repo_dir, 'status', '--porcelain')
    except Exception:  # pylint: disable=broad-except
        return _KONDUKTOR_COMMIT_SHA

    if pending_changes:
        return head_sha + '-dirty'
    return head_sha
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
__commit__ = _get_git_commit()
|
|
48
|
+
__version__ = '1.0.0.dev0.1.0.dev20251128104812'
|
|
49
|
+
__root_dir__ = os.path.dirname(os.path.abspath(__file__))
|
|
File without changes
|
|
@@ -0,0 +1,221 @@
|
|
|
1
|
+
# Proprietary Changes made for Trainy under the Trainy Software License
|
|
2
|
+
# Original source: skypilot: https://github.com/skypilot-org/skypilot
|
|
3
|
+
# which is Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
7
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
8
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
9
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
10
|
+
# See the License for the specific language governing permissions and
|
|
11
|
+
# limitations under the License.
|
|
12
|
+
|
|
13
|
+
"""AWS cloud adaptors
|
|
14
|
+
|
|
15
|
+
Thread safety notes:
|
|
16
|
+
|
|
17
|
+
The result of session() is cached per thread in threading.local() storage.
|
|
18
|
+
This means using their results is completely thread-safe.
|
|
19
|
+
|
|
20
|
+
We do not cache the resource/client objects, because some credentials may be
|
|
21
|
+
automatically rotated, but the cached resource/client object may not refresh the
|
|
22
|
+
credentials quickly enough, which can cause unexpected NoCredentialsError. By
|
|
23
|
+
creating the resource/client object from the thread-local session() object every
|
|
24
|
+
time, the credentials will be explicitly refreshed.
|
|
25
|
+
|
|
26
|
+
Calling session(), resource(), and client() is thread-safe, since they use a
|
|
27
|
+
lock to protect each object's creation.
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
This is informed by the following boto3 docs:
|
|
31
|
+
- Unlike Resources and Sessions, clients are generally thread-safe.
|
|
32
|
+
https://boto3.amazonaws.com/v1/documentation/api/latest/guide/clients.html
|
|
33
|
+
- Resource instances are not thread safe and should not be shared across
|
|
34
|
+
threads or processes
|
|
35
|
+
https://boto3.amazonaws.com/v1/documentation/api/latest/guide/resources.html
|
|
36
|
+
- Similar to Resource objects, Session objects are not thread safe and
|
|
37
|
+
should not be shared across threads and processes.
|
|
38
|
+
https://boto3.amazonaws.com/v1/documentation/api/latest/guide/session.html
|
|
39
|
+
"""
|
|
40
|
+
|
|
41
|
+
# pylint: disable=import-outside-toplevel
|
|
42
|
+
|
|
43
|
+
import functools
|
|
44
|
+
import logging
|
|
45
|
+
import threading
|
|
46
|
+
import time
|
|
47
|
+
from typing import Any, Callable
|
|
48
|
+
|
|
49
|
+
from konduktor.adaptors import common
|
|
50
|
+
from konduktor.utils import annotations, common_utils
|
|
51
|
+
|
|
52
|
+
_IMPORT_ERROR_MESSAGE = (
|
|
53
|
+
'Failed to import dependencies for AWS. ' 'Try pip install konduktor-nightly[s3]'
|
|
54
|
+
)
|
|
55
|
+
boto3 = common.LazyImport('boto3', import_error_message=_IMPORT_ERROR_MESSAGE)
|
|
56
|
+
botocore = common.LazyImport('botocore', import_error_message=_IMPORT_ERROR_MESSAGE)
|
|
57
|
+
_LAZY_MODULES = (boto3, botocore)
|
|
58
|
+
|
|
59
|
+
logger = logging.getLogger(__name__)
|
|
60
|
+
_session_creation_lock = threading.RLock()
|
|
61
|
+
|
|
62
|
+
version = 1
|
|
63
|
+
|
|
64
|
+
# Retry 5 times by default for potential credential errors,
|
|
65
|
+
_MAX_ATTEMPT_FOR_CREATION = 5
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
class _ThreadLocalLRUCache(threading.local):
    """Per-thread storage used by ``_thread_local_lru_cache``.

    ``threading.local`` runs ``__init__`` again, with the same arguments, the
    first time each thread touches the object, so every attribute set here is
    private to the accessing thread.
    """

    def __init__(self, maxsize=32):
        super().__init__()
        # Decorator that wraps a function in an LRU cache of size `maxsize`.
        self.cache = annotations.lru_cache(scope='global', maxsize=maxsize)
        # This thread's already-wrapped functions, keyed by the original
        # function, so each function is wrapped once per thread.
        self.cached_funcs = {}


def _thread_local_lru_cache(maxsize=32):
    """Return a decorator that memoizes a function separately in each thread.

    Args:
        maxsize: Maximum number of entries kept in each per-thread cache.

    Returns:
        A decorator. The decorated function keeps its metadata (via
        ``functools.wraps``) and looks results up in a cache owned by the
        calling thread.
    """
    # Create thread-local storage for the LRU cache
    local_cache = _ThreadLocalLRUCache(maxsize)

    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            # Reuse this thread's cached wrapper. Re-applying
            # `local_cache.cache` to `func` on every call would build a new
            # wrapper each time (assuming it behaves like
            # functools.lru_cache), so nothing would ever be served from the
            # cache.
            cached_funcs = local_cache.cached_funcs
            cached_func = cached_funcs.get(func)
            if cached_func is None:
                cached_func = cached_funcs[func] = local_cache.cache(func)
            return cached_func(*args, **kwargs)

        return wrapper

    return decorator
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def _assert_kwargs_builtin_type(kwargs):
|
|
90
|
+
assert all(
|
|
91
|
+
isinstance(v, (int, float, str)) for v in kwargs.values()
|
|
92
|
+
), f'kwargs should not contain none built-in types: {kwargs}'
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def _create_aws_object(creation_fn_or_cls: Callable[[], Any], object_name: str) -> Any:
    """Create an AWS object, retrying on credential errors.

    Args:
        creation_fn_or_cls: Zero-argument callable (a function or a class)
            whose return value is the AWS object.
        object_name: Human-readable kind of object being created (e.g.
            'session'); only used in the retry log message.

    Returns:
        The created AWS object.

    Raises:
        botocore CredentialRetrievalError / NoCredentialsError: re-raised once
            ``_MAX_ATTEMPT_FOR_CREATION`` attempts have failed.
    """
    attempt = 0
    backoff = common_utils.Backoff()
    while True:
        try:
            # Creating boto3 objects is not thread-safe, so a reentrant lock
            # synchronizes creation.
            # Reference: https://github.com/boto/boto3/issues/1592

            # NOTE: we need the lock here to avoid thread-safety issues when
            # creating the resource, because Python module is a shared object,
            # and we are not sure if the code inside 'session()' or
            # 'session().xx()' is thread-safe.
            with _session_creation_lock:
                return creation_fn_or_cls()
        except (
            botocore_exceptions().CredentialRetrievalError,
            botocore_exceptions().NoCredentialsError,
        ) as e:
            attempt += 1
            if attempt >= _MAX_ATTEMPT_FOR_CREATION:
                raise
            # Log before sleeping so the message appears when the retry is
            # decided, not after the backoff delay has already elapsed.
            logger.info(
                f'Retry creating AWS {object_name} due to '
                f'{common_utils.format_exception(e)}.'
            )
            time.sleep(backoff.current_backoff())
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
# The LRU cache needs to be thread-local to avoid multiple threads sharing the
|
|
133
|
+
# same session object, which is not guaranteed to be thread-safe.
|
|
134
|
+
@_thread_local_lru_cache()
def session(check_credentials: bool = True):
    """Create an AWS session.

    Args:
        check_credentials: When true, fail if the new session has no
            credentials.

    Returns:
        The ``boto3.session.Session`` object.

    Raises:
        botocore NoCredentialsError: If ``check_credentials`` is true and
            ``get_credentials()`` returns None.
    """
    aws_session = _create_aws_object(boto3.session.Session, 'session')
    if not check_credentials:
        return aws_session
    if aws_session.get_credentials() is None:
        # get_credentials() can be None if there are actually no credentials,
        # or if fetching them from IMDS failed (e.g. due to throttling).
        # Some AWS APIs work without credentials, but afaik everything we use
        # AWS for needs them.
        raise botocore_exceptions().NoCredentialsError()
    return aws_session
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
# Avoid caching the resource/client objects. If we are using the assumed role,
|
|
149
|
+
# the credentials will be automatically rotated, but the cached resource/client
|
|
150
|
+
# object will only refresh the credentials with a fixed 15 minutes interval,
|
|
151
|
+
# which can cause unexpected NoCredentialsError. By creating the resource/client
|
|
152
|
+
# object every time, the credentials will be explicitly refreshed.
|
|
153
|
+
# The creation of the resource/client is relatively fast (around 0.3s), so the
|
|
154
|
+
# performance impact is negligible.
|
|
155
|
+
# Reference: https://github.com/skypilot-org/skypilot/issues/2697
|
|
156
|
+
def resource(service_name: str, **kwargs):
    """Create an AWS resource of a certain service.

    Args:
        service_name: AWS resource name (e.g., 's3').
        kwargs: Other options forwarded to ``Session.resource``. Two keys are
            consumed here instead of being forwarded: ``max_attempts`` (turned
            into a botocore ``Config`` with that retry limit; passed as a plain
            value rather than a ``Config`` object because the latter will
            generate different keys even if the config is the same) and
            ``check_credentials`` (forwarded to ``session()``, default True).
    """
    _assert_kwargs_builtin_type(kwargs)

    check_credentials = kwargs.pop('check_credentials', True)
    retry_limit = kwargs.pop('max_attempts', None)
    if retry_limit is not None:
        kwargs['config'] = botocore_config().Config(
            retries={'max_attempts': retry_limit}
        )

    def _make_resource():
        # Use the per-thread session to avoid thread-safety issues (directly
        # creating the resource with boto3.resource() is not thread-safe).
        # Reference: https://stackoverflow.com/a/59635814
        per_thread_session = session(check_credentials=check_credentials)
        return per_thread_session.resource(service_name, **kwargs)

    return _create_aws_object(_make_resource, 'resource')
|
|
183
|
+
|
|
184
|
+
|
|
185
|
+
def client(service_name: str, **kwargs):
    """Create an AWS client of a certain service.

    Args:
        service_name: AWS service name (e.g., 's3', 'ec2').
        kwargs: Other options forwarded to ``Session.client``, except
            ``check_credentials`` (default True), which is forwarded to
            ``session()`` instead.
    """
    _assert_kwargs_builtin_type(kwargs)

    check_credentials = kwargs.pop('check_credentials', True)

    def _make_client():
        # Use the per-thread session to avoid thread-safety issues (directly
        # creating the client with boto3.client() is not thread-safe).
        # Reference: https://stackoverflow.com/a/59635814
        per_thread_session = session(check_credentials=check_credentials)
        return per_thread_session.client(service_name, **kwargs)

    return _create_aws_object(_make_client, 'client')
|
|
206
|
+
|
|
207
|
+
|
|
208
|
+
@common.load_lazy_modules(modules=_LAZY_MODULES)
def botocore_exceptions():
    """Return the ``botocore.exceptions`` module, imported on demand."""
    from botocore import exceptions as exceptions_module

    return exceptions_module
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
@common.load_lazy_modules(modules=_LAZY_MODULES)
def botocore_config():
    """Return the ``botocore.config`` module, imported on demand."""
    from botocore import config

    return config
|
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
# Proprietary Changes made for Trainy under the Trainy Software License
|
|
2
|
+
# Original source: skypilot: https://github.com/skypilot-org/skypilot
|
|
3
|
+
# which is Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
7
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
8
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
9
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
10
|
+
# See the License for the specific language governing permissions and
|
|
11
|
+
# limitations under the License.
|
|
12
|
+
|
|
13
|
+
"""Lazy import for modules to avoid import error when not used."""
|
|
14
|
+
|
|
15
|
+
import functools
|
|
16
|
+
import importlib
|
|
17
|
+
import os
|
|
18
|
+
import threading
|
|
19
|
+
from typing import Any, Callable, Optional, Tuple
|
|
20
|
+
|
|
21
|
+
import filelock
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class LazyImport:
    """Proxy that imports a module only when it is first needed.

    Used for modules that are slow to import (the original note cites pandas
    and networkx, 0.1-0.2 seconds) and for cloud adaptors, whose dependencies
    should not be imported unless the cloud is actually used.
    """

    def __init__(
        self,
        module_name: str,
        import_error_message: Optional[str] = None,
        set_loggers: Optional[Callable] = None,
    ):
        """Record what to import; nothing is imported yet.

        Args:
            module_name: Dotted name passed to ``importlib.import_module``.
            import_error_message: If given, replaces the message of an
                ``ImportError`` raised while loading.
            set_loggers: Optional zero-argument callback run right after the
                module is imported.
        """
        self._module_name = module_name
        self._module = None
        self._import_error_message = import_error_message
        self._set_loggers = set_loggers
        self._lock = threading.RLock()

    def load_module(self):
        """Import the module on first use and return it."""
        # The lock avoids extra imports when multiple threads try to import
        # the same module.
        with self._lock:
            if self._module is not None:
                return self._module
            try:
                self._module = importlib.import_module(self._module_name)
                if self._set_loggers is not None:
                    self._set_loggers()
            except ImportError as e:
                if self._import_error_message is None:
                    raise
                raise ImportError(self._import_error_message) from e
            return self._module

    def __getattr__(self, name: str) -> Any:
        """Resolve ``name`` on the loaded module.

        If the lookup raises ``AttributeError``, ``name`` is assumed to be a
        submodule: a new ``LazyImport`` for ``<module>.<name>`` is created,
        stored on this instance and returned.
        """
        try:
            own_attrs = self.__dict__
            if name in own_attrs:
                return own_attrs[name]
            return getattr(self.load_module(), name)
        except AttributeError:
            child = LazyImport(
                f'{self._module_name}.{name}', self._import_error_message
            )
            # Store it so later lookups find it by normal attribute access.
            setattr(self, name, child)
            return child
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def load_lazy_modules(modules: Tuple[LazyImport, ...]):
    """Build a decorator that loads ``modules`` before each call.

    Loading first means a missing dependency raises before the decorated
    function's body runs.

    Args:
        modules: Lazy modules whose ``load_module()`` is called, in order,
            before every call of the decorated function.
    """

    def decorator(func):
        def _preload():
            for lazy_module in modules:
                lazy_module.load_module()

        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            _preload()
            return func(*args, **kwargs)

        return wrapper

    return decorator
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
class LockedClientProxy:
    """Proxy for a GCP client that runs every method call under a file lock."""

    def __init__(
        self,
        client,
        lock_path=os.path.expanduser('~/.konduktor/gcs_storage.lock'),
        timeout=10,
    ):
        """Wrap ``client``.

        Args:
            client: The object whose attributes are proxied.
            lock_path: Path of the lock file handed to ``filelock.FileLock``.
            timeout: Timeout handed to ``filelock.FileLock``.
        """
        self._client = client
        self._lock = filelock.FileLock(lock_path, timeout=timeout)

    def __getattr__(self, attr):
        """Forward ``attr`` to the wrapped client.

        Callable attributes come back wrapped so the call runs while the lock
        is held; any other attribute is returned unchanged.
        """
        member = getattr(self._client, attr)
        if not callable(member):
            # Attribute (not method) access just passes through
            return member

        @functools.wraps(member)
        def call_with_lock(*args, **kwargs):
            with self._lock:
                return member(*args, **kwargs)

        return call_with_lock
|
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
# Proprietary Changes made for Trainy under the Trainy Software License
|
|
2
|
+
# Original source: skypilot: https://github.com/skypilot-org/skypilot
|
|
3
|
+
# which is Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
7
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
8
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
9
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
10
|
+
# See the License for the specific language governing permissions and
|
|
11
|
+
# limitations under the License.
|
|
12
|
+
|
|
13
|
+
"""GCP cloud adaptors"""
|
|
14
|
+
|
|
15
|
+
import json
|
|
16
|
+
import os
|
|
17
|
+
|
|
18
|
+
from konduktor.adaptors import common
|
|
19
|
+
|
|
20
|
+
_IMPORT_ERROR_MESSAGE = (
|
|
21
|
+
'Failed to import dependencies for GCP. ' 'Try pip install "konduktor[gcp]"'
|
|
22
|
+
)
|
|
23
|
+
googleapiclient = common.LazyImport(
|
|
24
|
+
'googleapiclient', import_error_message=_IMPORT_ERROR_MESSAGE
|
|
25
|
+
)
|
|
26
|
+
google = common.LazyImport('google', import_error_message=_IMPORT_ERROR_MESSAGE)
|
|
27
|
+
_LAZY_MODULES = (google, googleapiclient)
|
|
28
|
+
|
|
29
|
+
_LOCK_PATH = '~/.konduktor/gcs_storage.lock'
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
@common.load_lazy_modules(_LAZY_MODULES)
def build(service_name: str, version: str, *args, **kwargs):
    """Build a GCP service via ``googleapiclient.discovery.build``.

    Args:
        service_name: GCP service name (e.g., 'compute', 'storagetransfer').
        version: Service version (e.g., 'v1').
        *args: Extra positional arguments forwarded unchanged.
        **kwargs: Extra keyword arguments forwarded unchanged.
    """
    discovery = googleapiclient.discovery
    return discovery.build(service_name, version, *args, **kwargs)
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
@common.load_lazy_modules(_LAZY_MODULES)
def storage_client():
    """Return a GCS storage client for GCS buckets, wrapped in a locking proxy.

    The proxy's lock file is ``_LOCK_PATH`` with ``~`` expanded.
    """
    from google.cloud import storage

    gcs_client = storage.Client()
    resolved_lock_path = os.path.expanduser(_LOCK_PATH)
    return common.LockedClientProxy(gcs_client, lock_path=resolved_lock_path)
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
@common.load_lazy_modules(_LAZY_MODULES)
def anonymous_storage_client():
    """Return a credential-free GCS storage client for public GCS buckets.

    The client is built with ``storage.Client.create_anonymous_client()`` so
    no GCP credentials are needed; a plain ``storage.Client()`` would try to
    resolve default credentials, exactly like ``storage_client()``. The result
    is wrapped in the same locking proxy, using ``_LOCK_PATH`` with ``~``
    expanded as the lock file.
    """
    from google.cloud import storage

    return common.LockedClientProxy(
        storage.Client.create_anonymous_client(),
        lock_path=os.path.expanduser(_LOCK_PATH),
    )
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
@common.load_lazy_modules(_LAZY_MODULES)
def not_found_exception():
    """Return the ``google.api_core.exceptions.NotFound`` exception class."""
    from google.api_core import exceptions as api_core_exceptions

    not_found_cls = api_core_exceptions.NotFound
    return not_found_cls
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
@common.load_lazy_modules(_LAZY_MODULES)
def forbidden_exception():
    """Return the ``google.api_core.exceptions.Forbidden`` exception class."""
    from google.api_core import exceptions as api_core_exceptions

    forbidden_cls = api_core_exceptions.Forbidden
    return forbidden_cls
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
@common.load_lazy_modules(_LAZY_MODULES)
def conflict_exception():
    """Return the ``google.api_core.exceptions.Conflict`` exception class."""
    from google.api_core import exceptions as api_core_exceptions

    conflict_cls = api_core_exceptions.Conflict
    return conflict_cls
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
@common.load_lazy_modules(_LAZY_MODULES)
def http_error_exception():
    """Return the ``googleapiclient.errors.HttpError`` exception class."""
    from googleapiclient import errors as api_client_errors

    http_error_cls = api_client_errors.HttpError
    return http_error_cls
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
@common.load_lazy_modules(_LAZY_MODULES)
def credential_error_exception():
    """Return the ``google.auth.exceptions.DefaultCredentialsError`` class."""
    from google.auth import exceptions as auth_exceptions

    credentials_error_cls = auth_exceptions.DefaultCredentialsError
    return credentials_error_cls
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
@common.load_lazy_modules(_LAZY_MODULES)
def get_credentials(cred_type: str, credentials_field: str):
    """Get GCP credentials.

    Args:
        cred_type: Either 'service_account' or 'credentials_token'.
        credentials_field: For 'service_account', a JSON string holding the
            service-account info; for 'credentials_token', the value passed
            to ``google.oauth2.credentials.Credentials``.

    Returns:
        The credentials object built for ``cred_type``.

    Raises:
        RuntimeError: If ``cred_type`` is 'service_account' and
            ``credentials_field`` is not valid JSON.
        ValueError: If ``cred_type`` is not one of the two supported values
            (previously this surfaced as an UnboundLocalError).
    """
    from google.oauth2 import service_account
    from google.oauth2.credentials import Credentials as OAuthCredentials

    if cred_type == 'service_account':
        # If parsing the gcp_credentials failed, then the user likely made a
        # mistake in copying the credentials into the config yaml.
        try:
            service_account_info = json.loads(credentials_field)
        except json.decoder.JSONDecodeError as e:
            raise RuntimeError(
                'gcp_credentials found in cluster yaml file but '
                'formatted improperly.'
            ) from e
        return service_account.Credentials.from_service_account_info(
            service_account_info
        )

    if cred_type == 'credentials_token':
        return OAuthCredentials(credentials_field)

    # Fail with a clear message rather than reaching a `return` of a variable
    # that was never assigned.
    raise ValueError(
        f'Unsupported GCP credential type: {cred_type!r}. Expected '
        "'service_account' or 'credentials_token'."
    )
|
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
# Proprietary Changes made for Trainy under the Trainy Software License
|
|
2
|
+
# Original source: skypilot: https://github.com/skypilot-org/skypilot
|
|
3
|
+
# which is Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
7
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
8
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
9
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
10
|
+
# See the License for the specific language governing permissions and
|
|
11
|
+
# limitations under the License.
|
|
12
|
+
|
|
13
|
+
"""
|
|
14
|
+
The local machine's public key should not be uploaded to the remote VM, because
|
|
15
|
+
it will cause private/public key pair mismatch when the user tries to launch new
|
|
16
|
+
VM from that remote VM using SkyPilot, e.g., the node is used as a jobs
|
|
17
|
+
controller. (Lambda cloud is an exception, due to the limitation of the cloud
|
|
18
|
+
provider. See the comments in setup_lambda_authentication)
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
import functools
|
|
22
|
+
import os
|
|
23
|
+
from typing import Tuple
|
|
24
|
+
|
|
25
|
+
import filelock
|
|
26
|
+
|
|
27
|
+
from konduktor import logging
|
|
28
|
+
from konduktor.utils import common_utils
|
|
29
|
+
|
|
30
|
+
logger = logging.get_logger(__name__)
|
|
31
|
+
|
|
32
|
+
_SSH_KEY_PATH_PREFIX = '~/.konduktor/clients/{user_hash}/ssh'
|
|
33
|
+
|
|
34
|
+
MAX_TRIALS = 64
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def get_ssh_key_and_lock_path() -> Tuple[str, str, str]:
    """Return the private key, public key and lock file paths for this user.

    The directory comes from ``_SSH_KEY_PATH_PREFIX`` formatted with the user
    hash; it is created (after ``~`` expansion) if missing. The returned paths
    are built from the unexpanded prefix, so they still start with ``~``.
    """
    key_dir = _SSH_KEY_PATH_PREFIX.format(user_hash=common_utils.get_user_hash())
    os.makedirs(os.path.expanduser(key_dir), exist_ok=True, mode=0o700)
    private_key_path, public_key_path, lock_path = (
        os.path.join(key_dir, filename)
        for filename in (
            'konduktor-key',
            'konduktor-key.pub',
            '.__internal-konduktor-key.lock',
        )
    )
    return private_key_path, public_key_path, lock_path
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def _generate_rsa_key_pair() -> Tuple[str, str]:
    """Generate a 2048-bit RSA key pair.

    Returns:
        ``(public_key, private_key)`` -- note the order. The public key is in
        OpenSSH format and the private key is unencrypted PEM
        (TraditionalOpenSSL format); both are UTF-8 text with surrounding
        whitespace stripped.
    """
    # Keep the import of the cryptography local to avoid expensive
    # third-party imports when not needed.
    # pylint: disable=import-outside-toplevel
    from cryptography.hazmat.backends import default_backend
    from cryptography.hazmat.primitives import serialization
    from cryptography.hazmat.primitives.asymmetric import rsa

    rsa_key = rsa.generate_private_key(
        backend=default_backend(), public_exponent=65537, key_size=2048
    )

    private_pem = rsa_key.private_bytes(
        encoding=serialization.Encoding.PEM,
        format=serialization.PrivateFormat.TraditionalOpenSSL,
        encryption_algorithm=serialization.NoEncryption(),
    )
    private_key = private_pem.decode('utf-8').strip()

    public_openssh = rsa_key.public_key().public_bytes(
        serialization.Encoding.OpenSSH, serialization.PublicFormat.OpenSSH
    )
    public_key = public_openssh.decode('utf-8').strip()

    return public_key, private_key
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def _save_key_pair(
|
|
82
|
+
private_key_path: str, public_key_path: str, private_key: str, public_key: str
|
|
83
|
+
) -> None:
|
|
84
|
+
key_dir = os.path.dirname(private_key_path)
|
|
85
|
+
os.makedirs(key_dir, exist_ok=True, mode=0o700)
|
|
86
|
+
|
|
87
|
+
with open(
|
|
88
|
+
private_key_path,
|
|
89
|
+
'w',
|
|
90
|
+
encoding='utf-8',
|
|
91
|
+
opener=functools.partial(os.open, mode=0o600),
|
|
92
|
+
) as f:
|
|
93
|
+
f.write(private_key)
|
|
94
|
+
|
|
95
|
+
with open(
|
|
96
|
+
public_key_path,
|
|
97
|
+
'w',
|
|
98
|
+
encoding='utf-8',
|
|
99
|
+
opener=functools.partial(os.open, mode=0o644),
|
|
100
|
+
) as f:
|
|
101
|
+
f.write(public_key)
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def get_or_generate_keys() -> Tuple[str, str]:
    """Return the private and public key paths, generating the pair if needed.

    The paths come from ``get_ssh_key_and_lock_path()`` with ``~`` expanded.
    If the private key file does not exist, a new RSA pair is generated and
    saved while holding the file lock.

    Returns:
        ``(private_key_path, public_key_path)``.

    Raises:
        AssertionError: If the public key file does not exist afterwards.
    """
    private_key_path, public_key_path, lock_path = (
        os.path.expanduser(path) for path in get_ssh_key_and_lock_path()
    )

    # The directory holding the lock file (and the keys) is created with
    # 0o700 permission.
    os.makedirs(os.path.dirname(lock_path), exist_ok=True, mode=0o700)
    with filelock.FileLock(lock_path, timeout=10):
        key_missing = not os.path.exists(private_key_path)
        if key_missing:
            public_key, private_key = _generate_rsa_key_pair()
            _save_key_pair(private_key_path, public_key_path, private_key, public_key)
    assert os.path.exists(public_key_path), (
        'Private key found, but associated public key '
        f'{public_key_path} does not exist.'
    )
    return private_key_path, public_key_path
|