konduktor-nightly 0.1.0.dev20250513105010__tar.gz → 0.1.0.dev20250515104942__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/PKG-INFO +1 -1
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/__init__.py +2 -2
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/adaptors/common.py +1 -2
- konduktor_nightly-0.1.0.dev20250515104942/konduktor/authentication.py +124 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/backends/jobset.py +20 -19
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/backends/jobset_utils.py +47 -5
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/cli.py +2 -3
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/data/aws/s3.py +5 -2
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/data/data_utils.py +2 -2
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/data/gcp/gcs.py +5 -2
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/data/storage.py +6 -7
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/execution.py +2 -2
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/kube_client.py +0 -3
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/resource.py +18 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/task.py +3 -3
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/templates/pod.yaml.j2 +182 -16
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/utils/exceptions.py +5 -1
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/utils/kubernetes_utils.py +7 -4
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/utils/log_utils.py +0 -2
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/utils/loki_utils.py +2 -1
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/utils/rich_utils.py +1 -1
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/utils/schemas.py +29 -82
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/pyproject.toml +1 -1
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/LICENSE +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/README.md +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/adaptors/__init__.py +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/adaptors/aws.py +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/adaptors/gcp.py +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/backends/__init__.py +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/backends/backend.py +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/check.py +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/config.py +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/constants.py +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/controller/__init__.py +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/controller/constants.py +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/controller/launch.py +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/controller/node.py +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/controller/parse.py +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/dashboard/README.md +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/dashboard/backend/main.py +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/dashboard/backend/sockets.py +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/dashboard/frontend/.eslintrc.json +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/dashboard/frontend/.gitignore +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/dashboard/frontend/app/api/jobs/route.js +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/dashboard/frontend/app/api/namespaces/route.js +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/dashboard/frontend/app/components/Grafana.jsx +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/dashboard/frontend/app/components/JobsData.jsx +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/dashboard/frontend/app/components/LogsData.jsx +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/dashboard/frontend/app/components/NavMenu.jsx +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/dashboard/frontend/app/components/NavTabs.jsx +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/dashboard/frontend/app/components/NavTabs2.jsx +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/dashboard/frontend/app/components/SelectBtn.jsx +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/dashboard/frontend/app/components/lib/utils.js +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/dashboard/frontend/app/components/ui/chip-select.jsx +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/dashboard/frontend/app/components/ui/input.jsx +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/dashboard/frontend/app/components/ui/navigation-menu.jsx +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/dashboard/frontend/app/components/ui/select.jsx +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/dashboard/frontend/app/favicon.ico +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/dashboard/frontend/app/globals.css +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/dashboard/frontend/app/jobs/page.js +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/dashboard/frontend/app/layout.js +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/dashboard/frontend/app/logs/page.js +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/dashboard/frontend/app/page.js +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/dashboard/frontend/jsconfig.json +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/dashboard/frontend/next.config.mjs +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/dashboard/frontend/package-lock.json +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/dashboard/frontend/package.json +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/dashboard/frontend/postcss.config.mjs +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/dashboard/frontend/server.js +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/dashboard/frontend/tailwind.config.js +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/data/__init__.py +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/data/aws/__init__.py +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/data/constants.py +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/data/gcp/__init__.py +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/data/gcp/constants.py +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/data/gcp/utils.py +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/data/registry.py +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/data/storage_utils.py +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/logging.py +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/manifests/controller_deployment.yaml +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/manifests/dashboard_deployment.yaml +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/manifests/dmesg_daemonset.yaml +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/manifests/pod_cleanup_controller.yaml +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/templates/jobset.yaml.j2 +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/usage/__init__.py +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/usage/constants.py +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/utils/__init__.py +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/utils/accelerator_registry.py +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/utils/annotations.py +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/utils/base64_utils.py +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/utils/common_utils.py +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/utils/constants.py +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/utils/env_options.py +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/utils/kubernetes_enums.py +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/utils/subprocess_utils.py +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/utils/ux_utils.py +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/utils/validator.py +0 -0
@@ -14,7 +14,7 @@ __all__ = [
|
|
14
14
|
]
|
15
15
|
|
16
16
|
# Replaced with the current commit when building the wheels.
|
17
|
-
_KONDUKTOR_COMMIT_SHA = '
|
17
|
+
_KONDUKTOR_COMMIT_SHA = 'c0bd8e8774fab8042721b43a8cb8c35a624f8299'
|
18
18
|
os.makedirs(os.path.expanduser('~/.konduktor'), exist_ok=True)
|
19
19
|
|
20
20
|
|
@@ -48,5 +48,5 @@ def _get_git_commit():
|
|
48
48
|
|
49
49
|
|
50
50
|
__commit__ = _get_git_commit()
|
51
|
-
__version__ = '1.0.0.dev0.1.0.
|
51
|
+
__version__ = '1.0.0.dev0.1.0.dev20250515104942'
|
52
52
|
__root_dir__ = os.path.dirname(os.path.abspath(__file__))
|
@@ -26,8 +26,7 @@ class LazyImport:
|
|
26
26
|
|
27
27
|
We use this for pandas and networkx, as they can be time-consuming to import
|
28
28
|
(0.1-0.2 seconds). With this class, we can avoid the unnecessary import time
|
29
|
-
when the module is not used
|
30
|
-
`sky status and `pandas` should not be imported for `sky exec`).
|
29
|
+
when the module is not used.
|
31
30
|
|
32
31
|
We also use this for cloud adaptors, because we do not want to import the
|
33
32
|
cloud dependencies when it is not enabled.
|
@@ -0,0 +1,124 @@
|
|
1
|
+
# Proprietary Changes made for Trainy under the Trainy Software License
|
2
|
+
# Original source: skypilot: https://github.com/skypilot-org/skypilot
|
3
|
+
# which is Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
# you may not use this file except in compliance with the License.
|
5
|
+
# You may obtain a copy of the License at
|
6
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
7
|
+
# Unless required by applicable law or agreed to in writing, software
|
8
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
9
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
10
|
+
# See the License for the specific language governing permissions and
|
11
|
+
# limitations under the License.
|
12
|
+
|
13
|
+
"""
|
14
|
+
The local machine's public key should not be uploaded to the remote VM, because
|
15
|
+
it will cause private/public key pair mismatch when the user tries to launch new
|
16
|
+
VM from that remote VM using SkyPilot, e.g., the node is used as a jobs
|
17
|
+
controller. (Lambda cloud is an exception, due to the limitation of the cloud
|
18
|
+
provider. See the comments in setup_lambda_authentication)
|
19
|
+
"""
|
20
|
+
|
21
|
+
import functools
|
22
|
+
import os
|
23
|
+
from typing import Tuple
|
24
|
+
|
25
|
+
import filelock
|
26
|
+
|
27
|
+
from konduktor import logging
|
28
|
+
from konduktor.utils import common_utils
|
29
|
+
|
30
|
+
logger = logging.get_logger(__name__)
|
31
|
+
|
32
|
+
_SSH_KEY_PATH_PREFIX = '~/.konduktor/clients/{user_hash}/ssh'
|
33
|
+
|
34
|
+
MAX_TRIALS = 64
|
35
|
+
|
36
|
+
|
37
|
+
def get_ssh_key_and_lock_path() -> Tuple[str, str, str]:
    """Return the per-user SSH key paths and the lock file that guards them.

    The key directory is ``_SSH_KEY_PATH_PREFIX`` formatted with the current
    user hash; it is created with mode 0o700 if it does not exist yet.

    Returns:
        A ``(private_key_path, public_key_path, lock_path)`` tuple. The paths
        are joined onto the unexpanded prefix, so they still start with ``~``
        and callers must ``os.path.expanduser`` them before use.
    """
    key_dir = _SSH_KEY_PATH_PREFIX.format(user_hash=common_utils.get_user_hash())
    os.makedirs(os.path.expanduser(key_dir), exist_ok=True, mode=0o700)

    file_names = (
        'konduktor-key',
        'konduktor-key.pub',
        '.__internal-konduktor-key.lock',
    )
    private_key_path, public_key_path, lock_path = (
        os.path.join(key_dir, file_name) for file_name in file_names
    )
    return private_key_path, public_key_path, lock_path
|
45
|
+
|
46
|
+
|
47
|
+
def _generate_rsa_key_pair() -> Tuple[str, str]:
    """Create a fresh 2048-bit RSA key pair (public exponent 65537).

    Returns:
        A ``(public_key, private_key)`` tuple of whitespace-stripped text:
        the public key in OpenSSH format and the unencrypted private key as
        TraditionalOpenSSL PEM. Note the order: public key first.
    """
    # cryptography is imported lazily so that the cost of this third-party
    # import is only paid when a key actually has to be generated.
    # pylint: disable=import-outside-toplevel
    from cryptography.hazmat.backends import default_backend
    from cryptography.hazmat.primitives import serialization
    from cryptography.hazmat.primitives.asymmetric import rsa

    rsa_key = rsa.generate_private_key(
        backend=default_backend(), public_exponent=65537, key_size=2048
    )

    private_pem = rsa_key.private_bytes(
        encoding=serialization.Encoding.PEM,
        format=serialization.PrivateFormat.TraditionalOpenSSL,
        encryption_algorithm=serialization.NoEncryption(),
    )
    private_text = private_pem.decode('utf-8').strip()

    public_openssh = rsa_key.public_key().public_bytes(
        serialization.Encoding.OpenSSH, serialization.PublicFormat.OpenSSH
    )
    public_text = public_openssh.decode('utf-8').strip()

    return public_text, private_text
|
79
|
+
|
80
|
+
|
81
|
+
def _save_key_pair(
    private_key_path: str, public_key_path: str, private_key: str, public_key: str
) -> None:
    """Write an SSH key pair to disk.

    The parent directory of ``private_key_path`` is created with mode 0o700
    when missing. The private key is written with mode 0o600 and the public
    key is opened with mode 0o644.

    Args:
        private_key_path: Destination file for the private key text.
        public_key_path: Destination file for the public key text.
        private_key: Private key contents.
        public_key: Public key contents.
    """
    key_dir = os.path.dirname(private_key_path)
    # A bare file name has no directory component and os.makedirs('') raises,
    # so only create the directory when there is one to create.
    if key_dir:
        os.makedirs(key_dir, exist_ok=True, mode=0o700)

    with open(
        private_key_path,
        'w',
        encoding='utf-8',
        opener=functools.partial(os.open, mode=0o600),
    ) as f:
        # The opener's mode is only honoured when the file is created. If the
        # file already existed with looser permissions they would be kept, so
        # tighten them explicitly before any key material is written.
        os.chmod(private_key_path, 0o600)
        f.write(private_key)

    with open(
        public_key_path,
        'w',
        encoding='utf-8',
        opener=functools.partial(os.open, mode=0o644),
    ) as f:
        f.write(public_key)
|
102
|
+
|
103
|
+
|
104
|
+
def get_or_generate_keys() -> Tuple[str, str]:
    """Returns the absolute private and public key paths.

    When no private key file exists yet, a new RSA key pair is generated and
    saved while holding the key file lock.

    Returns:
        A ``(private_key_path, public_key_path)`` tuple with ``~`` expanded.

    Raises:
        AssertionError: if the public key file does not exist afterwards.
    """
    private_key_path, public_key_path, lock_path = (
        os.path.expanduser(path) for path in get_ssh_key_and_lock_path()
    )

    # Keep the directory holding the keys and the lock file at 0o700.
    # NOTE(review): the earlier comment here referred to
    # ~/.konduktor/generated/ssh and to SSH configs written by
    # backend_utils.SSHConfigHelper; the directory actually used comes from
    # get_ssh_key_and_lock_path() -- confirm whether that note still applies.
    os.makedirs(os.path.dirname(lock_path), exist_ok=True, mode=0o700)

    # The lock is acquired with a 10 second timeout before checking for or
    # writing the key files.
    with filelock.FileLock(lock_path, timeout=10):
        key_missing = not os.path.exists(private_key_path)
        if key_missing:
            public_key, private_key = _generate_rsa_key_pair()
            _save_key_pair(private_key_path, public_key_path, private_key, public_key)

    assert os.path.exists(public_key_path), (
        'Private key found, but associated public key '
        f'{public_key_path} does not exist.'
    )
    return private_key_path, public_key_path
|
@@ -70,25 +70,26 @@ def _wait_for_jobset_start(namespace: str, job_name: str):
|
|
70
70
|
assert jobsets is not None, (
|
71
71
|
f'Jobset {job_name} ' f'not found in namespace {namespace}'
|
72
72
|
)
|
73
|
-
if
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
73
|
+
if 'status' in jobsets:
|
74
|
+
if jobsets['status']['replicatedJobsStatus'][0]['ready']:
|
75
|
+
logger.info(
|
76
|
+
f'task '
|
77
|
+
f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}{job_name}'
|
78
|
+
f'{colorama.Style.RESET_ALL} ready'
|
79
|
+
)
|
80
|
+
break
|
81
|
+
elif jobsets['status']['replicatedJobsStatus'][0]['succeeded']:
|
82
|
+
return
|
83
|
+
elif jobsets['status']['replicatedJobsStatus'][0]['failed']:
|
84
|
+
logger.info(
|
85
|
+
f'job '
|
86
|
+
f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}{job_name}'
|
87
|
+
f'{colorama.Style.RESET_ALL} '
|
88
|
+
f'{colorama.Fore.RED}{colorama.Style.BRIGHT}failed{colorama.Style.RESET_ALL}'
|
89
|
+
)
|
90
|
+
job = jobset_utils.get_job(namespace, job_name)
|
91
|
+
_raise_job_error(job)
|
92
|
+
return
|
92
93
|
if timeout != -1 and time.time() - start > timeout:
|
93
94
|
logger.error(
|
94
95
|
f'{colorama.Style.BRIGHT}'
|
@@ -1,5 +1,6 @@
|
|
1
1
|
"""Jobset utils: wraps CRUD operations for jobsets"""
|
2
2
|
|
3
|
+
import base64
|
3
4
|
import enum
|
4
5
|
import json
|
5
6
|
import os
|
@@ -15,7 +16,7 @@ if typing.TYPE_CHECKING:
|
|
15
16
|
from datetime import timedelta
|
16
17
|
|
17
18
|
import konduktor
|
18
|
-
from konduktor import config, constants, kube_client, logging
|
19
|
+
from konduktor import authentication, config, constants, kube_client, logging
|
19
20
|
from konduktor.data import registry
|
20
21
|
from konduktor.utils import (
|
21
22
|
common_utils,
|
@@ -93,6 +94,10 @@ def create_pod_spec(task: 'konduktor.Task') -> Dict[str, Any]:
|
|
93
94
|
else:
|
94
95
|
accelerator_type = None
|
95
96
|
|
97
|
+
assert task.resources.cpus is not None, 'Task resources cpus are required'
|
98
|
+
assert task.resources.memory is not None, 'Task resources memory are required'
|
99
|
+
assert task.resources.image_id is not None, 'Task resources image_id are required'
|
100
|
+
|
96
101
|
# template the commands to run on the container for syncing files. At this point
|
97
102
|
# task.stores is Dict[str, storage_utils.Storage] which is (dst, storage_obj_src)
|
98
103
|
# first we iterate through storage_mounts and then file_mounts.
|
@@ -150,10 +155,35 @@ def create_pod_spec(task: 'konduktor.Task') -> Dict[str, Any]:
|
|
150
155
|
f'though specified by `tailscale.secret_name`: {err}'
|
151
156
|
)
|
152
157
|
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
158
|
+
enable_ssh = config.get_nested(('ssh', 'enable'), False)
|
159
|
+
secret_name = None
|
160
|
+
if enable_ssh:
|
161
|
+
private_key_path, public_key_path = authentication.get_or_generate_keys()
|
162
|
+
with (
|
163
|
+
open(private_key_path, 'rb') as private_key_file,
|
164
|
+
open(public_key_path, 'rb') as public_key_file,
|
165
|
+
):
|
166
|
+
private_key, public_key = private_key_file.read(), public_key_file.read()
|
167
|
+
user_hash = common_utils.get_user_hash()
|
168
|
+
context = kubernetes_utils.get_current_kube_config_context_name()
|
169
|
+
namespace = kubernetes_utils.get_kube_config_context_namespace(
|
170
|
+
context_name=context
|
171
|
+
)
|
172
|
+
secret_name = f'konduktor-ssh-keys-{user_hash}'
|
173
|
+
ok, result = kubernetes_utils.set_secret(
|
174
|
+
secret_name=secret_name,
|
175
|
+
namespace=namespace,
|
176
|
+
context=context,
|
177
|
+
data={
|
178
|
+
'PUBKEY': base64.b64encode(public_key).decode(),
|
179
|
+
'PRIVKEY': base64.b64encode(private_key).decode(),
|
180
|
+
},
|
181
|
+
)
|
182
|
+
if not ok:
|
183
|
+
raise exceptions.CreateSecretError(
|
184
|
+
f'Failed to set k8s secret {secret_name}: \n{result}'
|
185
|
+
)
|
186
|
+
|
157
187
|
with tempfile.NamedTemporaryFile() as temp:
|
158
188
|
common_utils.fill_template(
|
159
189
|
'pod.yaml.j2',
|
@@ -166,6 +196,7 @@ def create_pod_spec(task: 'konduktor.Task') -> Dict[str, Any]:
|
|
166
196
|
'master_addr': master_addr,
|
167
197
|
'num_nodes': task.num_nodes,
|
168
198
|
'job_name': task.name, # append timestamp and user id here?
|
199
|
+
'setup_cmd': task.setup or '',
|
169
200
|
'run_cmd': task.run,
|
170
201
|
'node_hostnames': node_hostnames,
|
171
202
|
'accelerator_type': accelerator_type,
|
@@ -176,6 +207,9 @@ def create_pod_spec(task: 'konduktor.Task') -> Dict[str, Any]:
|
|
176
207
|
'user': common_utils.get_cleaned_username(),
|
177
208
|
# Tailscale credentials
|
178
209
|
'tailscale_secret': tailscale_secret,
|
210
|
+
# SSH
|
211
|
+
'enable_ssh': enable_ssh,
|
212
|
+
'secret_name': secret_name,
|
179
213
|
},
|
180
214
|
temp.name,
|
181
215
|
)
|
@@ -183,6 +217,13 @@ def create_pod_spec(task: 'konduktor.Task') -> Dict[str, Any]:
|
|
183
217
|
# merge with `~/.konduktor/config.yaml``
|
184
218
|
kubernetes_utils.combine_pod_config_fields(temp.name, pod_config)
|
185
219
|
pod_config = common_utils.read_yaml(temp.name)
|
220
|
+
|
221
|
+
for env_var in pod_config['kubernetes']['pod_config']['spec']['containers'][0][
|
222
|
+
'env'
|
223
|
+
]:
|
224
|
+
if env_var['name'] in task.envs:
|
225
|
+
env_var['value'] = task.envs.pop(env_var['name'])
|
226
|
+
|
186
227
|
for k, v in task.envs.items():
|
187
228
|
pod_config['kubernetes']['pod_config']['spec']['containers'][0][
|
188
229
|
'env'
|
@@ -221,6 +262,7 @@ def create_jobset(
|
|
221
262
|
'user': common_utils.get_cleaned_username(),
|
222
263
|
'accelerator_type': accelerator_type,
|
223
264
|
'num_accelerators': num_accelerators,
|
265
|
+
'completions': task.resources.get_completions(),
|
224
266
|
**_JOBSET_METADATA_LABELS,
|
225
267
|
},
|
226
268
|
temp.name,
|
@@ -105,7 +105,7 @@ def _make_task_with_overrides(
|
|
105
105
|
env: Optional[List[Tuple[str, str]]] = None,
|
106
106
|
field_to_ignore: Optional[List[str]] = None,
|
107
107
|
) -> konduktor.Task:
|
108
|
-
"""Creates a task
|
108
|
+
"""Creates a task from an entrypoint with overrides.
|
109
109
|
|
110
110
|
Returns:
|
111
111
|
konduktor.Task
|
@@ -271,8 +271,7 @@ _EXTRA_RESOURCES_OPTIONS = [
|
|
271
271
|
type=str,
|
272
272
|
help=(
|
273
273
|
'Type and number of GPUs to use. Example values: '
|
274
|
-
'"V100:8", "V100" (short for a count of 1)
|
275
|
-
'(fractional counts are supported by the scheduling framework). '
|
274
|
+
'"V100:8", "V100" (short for a count of 1)'
|
276
275
|
'If a new cluster is being launched by this command, this is the '
|
277
276
|
'resources to provision. If an existing cluster is being reused, this'
|
278
277
|
" is seen as the task demand, which must fit the cluster's total "
|
@@ -1037,8 +1037,11 @@ class S3Store(storage_utils.AbstractStore):
|
|
1037
1037
|
secret_name=cls._AWS_SECRET_NAME,
|
1038
1038
|
namespace=namespace,
|
1039
1039
|
context=context,
|
1040
|
-
|
1041
|
-
|
1040
|
+
data={
|
1041
|
+
cls._AWS_CREDENTIALS_KEY: base64_utils.zip_base64encode(
|
1042
|
+
credentials_files
|
1043
|
+
)
|
1044
|
+
},
|
1042
1045
|
)
|
1043
1046
|
if not ok:
|
1044
1047
|
logger.error(f'Failed to set AWS credentials in k8s secret: \n{result}')
|
@@ -219,10 +219,10 @@ def get_gsutil_command() -> Tuple[str, str]:
|
|
219
219
|
cmd_to_run = f'{alias_gen}; {gsutil_alias} cp ...'
|
220
220
|
```
|
221
221
|
"""
|
222
|
-
gsutil_alias = '
|
222
|
+
gsutil_alias = 'konduktor_gsutil'
|
223
223
|
disable_multiprocessing_flag = '-o "GSUtil:parallel_process_count=1"'
|
224
224
|
|
225
|
-
# Define
|
225
|
+
# Define konduktor_gsutil as a shell function instead of an alias.
|
226
226
|
# This function will behave just like alias, but can be called immediately
|
227
227
|
# after its definition on the same line
|
228
228
|
alias_gen = (
|
@@ -891,8 +891,11 @@ class GcsStore(storage_utils.AbstractStore):
|
|
891
891
|
secret_name=cls._GCP_SECRET_NAME,
|
892
892
|
namespace=namespace,
|
893
893
|
context=context,
|
894
|
-
|
895
|
-
|
894
|
+
data={
|
895
|
+
cls._GCP_CREDENTIALS_KEY: base64_utils.zip_base64encode(
|
896
|
+
credentials_files
|
897
|
+
)
|
898
|
+
},
|
896
899
|
)
|
897
900
|
if not ok:
|
898
901
|
logger.error(f'Failed to set GCP credentials in k8s secret: \n{result}')
|
@@ -271,15 +271,14 @@ class Storage(object):
|
|
271
271
|
Can be a single local path, a list of local paths, or a cloud URI
|
272
272
|
(s3://, gs://, etc.). Local paths do not need to be absolute.
|
273
273
|
stores: Optional; Specify pre-initialized stores (S3Store, GcsStore).
|
274
|
-
persistent: bool; Whether to persist across
|
274
|
+
persistent: bool; Whether to persist across konduktor launches.
|
275
275
|
mode: StorageMode; Specify how the storage object is manifested on
|
276
276
|
the remote VM. Can be either MOUNT or COPY. Defaults to MOUNT.
|
277
|
-
sync_on_reconstruction: bool; Whether to sync the
|
278
|
-
object is found in the global_user_state
|
279
|
-
there. This is set to
|
280
|
-
|
281
|
-
|
282
|
-
_is_sky_managed: Optional[bool]; Indicates if the storage is managed
|
277
|
+
sync_on_reconstruction: bool; [defunct] Whether to sync the
|
278
|
+
data if the storage object is found in the global_user_state
|
279
|
+
and reconstructed from there. This is set to
|
280
|
+
false when the Storage object is created not for direct use
|
281
|
+
_is_sky_managed: Optional[bool]; [defunct] Indicates if the storage is managed
|
283
282
|
by Sky. Without this argument, the controller's behavior differs
|
284
283
|
from the local machine. For example, if a bucket does not exist:
|
285
284
|
Local Machine (is_sky_managed=True) →
|
@@ -149,10 +149,10 @@ def maybe_translate_local_file_mounts_and_sync_up(
|
|
149
149
|
msg = 'workdir'
|
150
150
|
if msg:
|
151
151
|
logger.info(
|
152
|
-
ux_utils.starting_message(f'Translating {msg} to ' '
|
152
|
+
ux_utils.starting_message(f'Translating {msg} to ' 'cloud Storage...')
|
153
153
|
)
|
154
154
|
rich_utils.force_update_status(
|
155
|
-
ux_utils.spinner_message(f'Translating {msg} to
|
155
|
+
ux_utils.spinner_message(f'Translating {msg} to cloud Storage...')
|
156
156
|
)
|
157
157
|
|
158
158
|
# Get the bucket name for the workdir and file mounts,
|
@@ -63,8 +63,6 @@ def _load_config(context: Optional[str] = None):
|
|
63
63
|
err_str = (
|
64
64
|
f'Failed to load Kubernetes configuration for {context!r}. '
|
65
65
|
'Kubeconfig does not contain any valid context(s).\n'
|
66
|
-
' If you were running a local Kubernetes '
|
67
|
-
'cluster, run `sky local up` to start the cluster.'
|
68
66
|
)
|
69
67
|
else:
|
70
68
|
err_str = (
|
@@ -72,7 +70,6 @@ def _load_config(context: Optional[str] = None):
|
|
72
70
|
'Please check if your kubeconfig file exists at '
|
73
71
|
f'~/.kube/config and is valid.'
|
74
72
|
)
|
75
|
-
err_str += '\nTo disable Kubernetes for SkyPilot: run `sky check`.'
|
76
73
|
with ux_utils.print_exception_no_traceback():
|
77
74
|
raise ValueError(err_str) from None
|
78
75
|
|
@@ -49,6 +49,7 @@ class Resources:
|
|
49
49
|
image_id: Union[str, None] = None,
|
50
50
|
disk_size: Optional[int] = None,
|
51
51
|
labels: Optional[Dict[str, str]] = None,
|
52
|
+
job_config: Optional[Dict[str, Union[int, str]]] = None,
|
52
53
|
# Internal use only.
|
53
54
|
# pylint: disable=invalid-name
|
54
55
|
_cluster_config_overrides: Optional[Dict[str, Any]] = None,
|
@@ -91,6 +92,7 @@ class Resources:
|
|
91
92
|
instance tags. On GCP, labels map to instance labels. On
|
92
93
|
Kubernetes, labels map to pod labels. On other clouds, labels are
|
93
94
|
not supported and will be ignored.
|
95
|
+
job_config: the configuration of the job spec
|
94
96
|
Raises:
|
95
97
|
ValueError: if some attributes are invalid.
|
96
98
|
exceptions.NoCloudAccessError: if no public cloud is enabled.
|
@@ -122,6 +124,7 @@ class Resources:
|
|
122
124
|
self._set_cpus(cpus)
|
123
125
|
self._set_memory(memory)
|
124
126
|
self._set_accelerators(accelerators)
|
127
|
+
self.job_config = job_config
|
125
128
|
|
126
129
|
# TODO: move these out of init to prevent repeated calls.
|
127
130
|
self._try_validate_cpus_mem()
|
@@ -382,6 +385,11 @@ class Resources:
|
|
382
385
|
accel_str = f'{accel_name}:{accel_count}'
|
383
386
|
return accel_str
|
384
387
|
|
388
|
+
def get_completions(self) -> Optional[int]:
    """Return the ``completions`` entry of ``job_config`` as an int.

    Returns:
        ``int(job_config['completions'])`` when ``job_config`` is set and
        holds a truthy ``completions`` value; ``None`` otherwise.
    """
    # Look the key up with .get(): a job_config that is set but has no
    # 'completions' entry should yield None rather than raise KeyError.
    completions = (self.job_config or {}).get('completions')
    return int(completions) if completions else None
|
392
|
+
|
385
393
|
def copy(self, **override) -> 'Resources':
|
386
394
|
"""Returns a copy of the given Resources."""
|
387
395
|
resources = Resources(
|
@@ -392,6 +400,7 @@ class Resources:
|
|
392
400
|
disk_size=override.pop('disk_size', self.disk_size),
|
393
401
|
image_id=override.pop('image_id', self.image_id),
|
394
402
|
labels=override.pop('labels', self.labels),
|
403
|
+
job_config=override.pop('job_config', self.job_config),
|
395
404
|
)
|
396
405
|
assert len(override) == 0
|
397
406
|
return resources
|
@@ -404,6 +413,13 @@ class Resources:
|
|
404
413
|
config, schemas.get_resources_schema(), 'Invalid resources YAML: '
|
405
414
|
)
|
406
415
|
|
416
|
+
if config.get('job_config', None):
|
417
|
+
common_utils.validate_schema(
|
418
|
+
config['job_config'],
|
419
|
+
schemas.get_job_schema(),
|
420
|
+
'Invalid job config YAML',
|
421
|
+
)
|
422
|
+
|
407
423
|
def _override_resources(
|
408
424
|
base_resource_config: Dict[str, Any], override_configs: List[Dict[str, Any]]
|
409
425
|
) -> List[Resources]:
|
@@ -446,6 +462,7 @@ class Resources:
|
|
446
462
|
resources_fields['disk_size'] = config.pop('disk_size', None)
|
447
463
|
resources_fields['image_id'] = config.pop('image_id', None)
|
448
464
|
resources_fields['labels'] = config.pop('labels', None)
|
465
|
+
resources_fields['job_config'] = config.pop('job_config', None)
|
449
466
|
|
450
467
|
if resources_fields['cpus'] is not None:
|
451
468
|
resources_fields['cpus'] = str(resources_fields['cpus'])
|
@@ -475,4 +492,5 @@ class Resources:
|
|
475
492
|
add_if_not_none('disk_size', self.disk_size)
|
476
493
|
add_if_not_none('image_id', self.image_id)
|
477
494
|
add_if_not_none('labels', self.labels)
|
495
|
+
add_if_not_none('job_config', self.job_config)
|
478
496
|
return config
|
@@ -181,8 +181,7 @@ class Task:
|
|
181
181
|
"""
|
182
182
|
assert name is not None, 'Task name is required'
|
183
183
|
self.name = name
|
184
|
-
|
185
|
-
raise ValueError('`setup` is being deprecated and not supported')
|
184
|
+
self.setup = setup
|
186
185
|
self.run = run
|
187
186
|
self.storage_mounts: Dict[str, storage_lib.Storage] = {}
|
188
187
|
self.storage_plans: Dict[storage_lib.Storage, storage_lib.StoreType] = {}
|
@@ -320,6 +319,7 @@ class Task:
|
|
320
319
|
|
321
320
|
task = Task(
|
322
321
|
config.pop('name', None),
|
322
|
+
setup=config.pop('setup', None),
|
323
323
|
run=config.pop('run', None),
|
324
324
|
workdir=config.pop('workdir', None),
|
325
325
|
num_nodes=config.pop('num_nodes', None),
|
@@ -695,7 +695,7 @@ class Task:
|
|
695
695
|
This should be called before provisioning in order to take effect.
|
696
696
|
|
697
697
|
Args:
|
698
|
-
storage_mounts: an optional dict of ``{mount_path:
|
698
|
+
storage_mounts: an optional dict of ``{mount_path: konduktor.data.Storage
|
699
699
|
object}``, where mount_path is the path inside the remote VM(s)
|
700
700
|
where the Storage object will be mounted on.
|
701
701
|
|