konduktor-nightly 0.1.0.dev20250513105010__tar.gz → 0.1.0.dev20250514104854__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/PKG-INFO +1 -1
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/__init__.py +2 -2
- konduktor_nightly-0.1.0.dev20250514104854/konduktor/authentication.py +124 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/backends/jobset.py +20 -19
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/backends/jobset_utils.py +47 -5
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/data/aws/s3.py +5 -2
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/data/gcp/gcs.py +5 -2
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/resource.py +18 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/task.py +2 -2
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/templates/pod.yaml.j2 +180 -16
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/utils/exceptions.py +4 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/utils/kubernetes_utils.py +7 -4
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/utils/loki_utils.py +2 -1
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/utils/schemas.py +29 -82
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/pyproject.toml +1 -1
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/LICENSE +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/README.md +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/adaptors/__init__.py +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/adaptors/aws.py +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/adaptors/common.py +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/adaptors/gcp.py +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/backends/__init__.py +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/backends/backend.py +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/check.py +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/cli.py +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/config.py +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/constants.py +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/controller/__init__.py +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/controller/constants.py +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/controller/launch.py +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/controller/node.py +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/controller/parse.py +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/dashboard/README.md +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/dashboard/backend/main.py +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/dashboard/backend/sockets.py +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/dashboard/frontend/.eslintrc.json +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/dashboard/frontend/.gitignore +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/dashboard/frontend/app/api/jobs/route.js +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/dashboard/frontend/app/api/namespaces/route.js +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/dashboard/frontend/app/components/Grafana.jsx +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/dashboard/frontend/app/components/JobsData.jsx +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/dashboard/frontend/app/components/LogsData.jsx +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/dashboard/frontend/app/components/NavMenu.jsx +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/dashboard/frontend/app/components/NavTabs.jsx +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/dashboard/frontend/app/components/NavTabs2.jsx +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/dashboard/frontend/app/components/SelectBtn.jsx +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/dashboard/frontend/app/components/lib/utils.js +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/dashboard/frontend/app/components/ui/chip-select.jsx +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/dashboard/frontend/app/components/ui/input.jsx +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/dashboard/frontend/app/components/ui/navigation-menu.jsx +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/dashboard/frontend/app/components/ui/select.jsx +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/dashboard/frontend/app/favicon.ico +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/dashboard/frontend/app/globals.css +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/dashboard/frontend/app/jobs/page.js +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/dashboard/frontend/app/layout.js +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/dashboard/frontend/app/logs/page.js +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/dashboard/frontend/app/page.js +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/dashboard/frontend/jsconfig.json +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/dashboard/frontend/next.config.mjs +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/dashboard/frontend/package-lock.json +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/dashboard/frontend/package.json +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/dashboard/frontend/postcss.config.mjs +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/dashboard/frontend/server.js +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/dashboard/frontend/tailwind.config.js +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/data/__init__.py +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/data/aws/__init__.py +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/data/constants.py +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/data/data_utils.py +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/data/gcp/__init__.py +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/data/gcp/constants.py +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/data/gcp/utils.py +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/data/registry.py +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/data/storage.py +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/data/storage_utils.py +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/execution.py +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/kube_client.py +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/logging.py +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/manifests/controller_deployment.yaml +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/manifests/dashboard_deployment.yaml +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/manifests/dmesg_daemonset.yaml +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/manifests/pod_cleanup_controller.yaml +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/templates/jobset.yaml.j2 +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/usage/__init__.py +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/usage/constants.py +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/utils/__init__.py +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/utils/accelerator_registry.py +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/utils/annotations.py +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/utils/base64_utils.py +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/utils/common_utils.py +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/utils/constants.py +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/utils/env_options.py +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/utils/kubernetes_enums.py +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/utils/log_utils.py +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/utils/rich_utils.py +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/utils/subprocess_utils.py +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/utils/ux_utils.py +0 -0
- {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/utils/validator.py +0 -0
@@ -14,7 +14,7 @@ __all__ = [
|
|
14
14
|
]
|
15
15
|
|
16
16
|
# Replaced with the current commit when building the wheels.
|
17
|
-
_KONDUKTOR_COMMIT_SHA = '
|
17
|
+
_KONDUKTOR_COMMIT_SHA = '05c7d9e243ae23c6e9abb0a4a034bfc0815fd587'
|
18
18
|
os.makedirs(os.path.expanduser('~/.konduktor'), exist_ok=True)
|
19
19
|
|
20
20
|
|
@@ -48,5 +48,5 @@ def _get_git_commit():
|
|
48
48
|
|
49
49
|
|
50
50
|
__commit__ = _get_git_commit()
|
51
|
-
__version__ = '1.0.0.dev0.1.0.
|
51
|
+
__version__ = '1.0.0.dev0.1.0.dev20250514104854'
|
52
52
|
__root_dir__ = os.path.dirname(os.path.abspath(__file__))
|
@@ -0,0 +1,124 @@
|
|
1
|
+
# Proprietary Changes made for Trainy under the Trainy Software License
|
2
|
+
# Original source: skypilot: https://github.com/skypilot-org/skypilot
|
3
|
+
# which is Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
# you may not use this file except in compliance with the License.
|
5
|
+
# You may obtain a copy of the License at
|
6
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
7
|
+
# Unless required by applicable law or agreed to in writing, software
|
8
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
9
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
10
|
+
# See the License for the specific language governing permissions and
|
11
|
+
# limitations under the License.
|
12
|
+
|
13
|
+
"""
|
14
|
+
The local machine's public key should not be uploaded to the remote VM, because
|
15
|
+
it will cause private/public key pair mismatch when the user tries to launch new
|
16
|
+
VM from that remote VM using SkyPilot, e.g., the node is used as a jobs
|
17
|
+
controller. (Lambda cloud is an exception, due to the limitation of the cloud
|
18
|
+
provider. See the comments in setup_lambda_authentication)
|
19
|
+
"""
|
20
|
+
|
21
|
+
import functools
|
22
|
+
import os
|
23
|
+
from typing import Tuple
|
24
|
+
|
25
|
+
import filelock
|
26
|
+
|
27
|
+
from konduktor import logging
|
28
|
+
from konduktor.utils import common_utils
|
29
|
+
|
30
|
+
logger = logging.get_logger(__name__)
|
31
|
+
|
32
|
+
_SSH_KEY_PATH_PREFIX = '~/.konduktor/clients/{user_hash}/ssh'
|
33
|
+
|
34
|
+
MAX_TRIALS = 64
|
35
|
+
|
36
|
+
|
37
|
+
def get_ssh_key_and_lock_path() -> Tuple[str, str, str]:
    """Return the per-user SSH private key, public key and lock file paths.

    The key directory is derived from ``_SSH_KEY_PATH_PREFIX`` and the
    current user hash, and is created with mode 0o700 if it is missing.

    Returns:
        ``(private_key_path, public_key_path, lock_path)``. The paths are
        built from the *unexpanded* prefix, so they still begin with ``~``;
        callers must run them through ``os.path.expanduser`` before use.
    """
    key_dir = _SSH_KEY_PATH_PREFIX.format(user_hash=common_utils.get_user_hash())
    # Only the directory creation needs the expanded form of the prefix.
    os.makedirs(os.path.expanduser(key_dir), mode=0o700, exist_ok=True)

    file_names = (
        'konduktor-key',
        'konduktor-key.pub',
        '.__internal-konduktor-key.lock',
    )
    private_key_path, public_key_path, lock_path = (
        os.path.join(key_dir, file_name) for file_name in file_names
    )
    return private_key_path, public_key_path, lock_path
|
45
|
+
|
46
|
+
|
47
|
+
def _generate_rsa_key_pair() -> Tuple[str, str]:
    """Generate a new 2048-bit RSA key pair (public exponent 65537).

    Returns:
        ``(public_key, private_key)`` -- note that the public key comes
        first. The public key is serialized in OpenSSH format and the
        private key as unencrypted PEM (``TraditionalOpenSSL``). Both are
        UTF-8 decoded strings with surrounding whitespace stripped.
    """
    # Imported lazily so that the expensive third-party ``cryptography``
    # import is only paid when a key actually has to be generated.
    # pylint: disable=import-outside-toplevel
    from cryptography.hazmat.backends import default_backend
    from cryptography.hazmat.primitives import serialization
    from cryptography.hazmat.primitives.asymmetric import rsa

    key = rsa.generate_private_key(
        public_exponent=65537, key_size=2048, backend=default_backend()
    )

    private_pem = key.private_bytes(
        encoding=serialization.Encoding.PEM,
        format=serialization.PrivateFormat.TraditionalOpenSSL,
        encryption_algorithm=serialization.NoEncryption(),
    )
    public_openssh = key.public_key().public_bytes(
        serialization.Encoding.OpenSSH, serialization.PublicFormat.OpenSSH
    )

    private_key = private_pem.decode('utf-8').strip()
    public_key = public_openssh.decode('utf-8').strip()
    return public_key, private_key
|
79
|
+
|
80
|
+
|
81
|
+
def _save_key_pair(
|
82
|
+
private_key_path: str, public_key_path: str, private_key: str, public_key: str
|
83
|
+
) -> None:
|
84
|
+
key_dir = os.path.dirname(private_key_path)
|
85
|
+
os.makedirs(key_dir, exist_ok=True, mode=0o700)
|
86
|
+
|
87
|
+
with open(
|
88
|
+
private_key_path,
|
89
|
+
'w',
|
90
|
+
encoding='utf-8',
|
91
|
+
opener=functools.partial(os.open, mode=0o600),
|
92
|
+
) as f:
|
93
|
+
f.write(private_key)
|
94
|
+
|
95
|
+
with open(
|
96
|
+
public_key_path,
|
97
|
+
'w',
|
98
|
+
encoding='utf-8',
|
99
|
+
opener=functools.partial(os.open, mode=0o644),
|
100
|
+
) as f:
|
101
|
+
f.write(public_key)
|
102
|
+
|
103
|
+
|
104
|
+
def get_or_generate_keys() -> Tuple[str, str]:
    """Return the private and public key paths, generating the keys if needed.

    The paths from ``get_ssh_key_and_lock_path`` are user-expanded. If the
    private key file does not exist, a new RSA key pair is generated and
    saved while holding a file lock (10 second acquisition timeout).

    Returns:
        ``(private_key_path, public_key_path)``, both user-expanded.

    Raises:
        AssertionError: if the public key file does not exist afterwards,
            i.e. a private key is present without its public counterpart.
    """
    private_key_path, public_key_path, lock_path = (
        os.path.expanduser(path) for path in get_ssh_key_and_lock_path()
    )

    # The lock file lives next to the keys; make sure that directory exists
    # and is created private to the user (0o700).
    os.makedirs(os.path.dirname(lock_path), mode=0o700, exist_ok=True)

    # The lock serializes the check-then-create step across processes.
    with filelock.FileLock(lock_path, timeout=10):
        if not os.path.exists(private_key_path):
            public_key, private_key = _generate_rsa_key_pair()
            _save_key_pair(private_key_path, public_key_path, private_key, public_key)

    assert os.path.exists(public_key_path), (
        'Private key found, but associated public key '
        f'{public_key_path} does not exist.'
    )
    return private_key_path, public_key_path
|
@@ -70,25 +70,26 @@ def _wait_for_jobset_start(namespace: str, job_name: str):
|
|
70
70
|
assert jobsets is not None, (
|
71
71
|
f'Jobset {job_name} ' f'not found in namespace {namespace}'
|
72
72
|
)
|
73
|
-
if
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
73
|
+
if 'status' in jobsets:
|
74
|
+
if jobsets['status']['replicatedJobsStatus'][0]['ready']:
|
75
|
+
logger.info(
|
76
|
+
f'task '
|
77
|
+
f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}{job_name}'
|
78
|
+
f'{colorama.Style.RESET_ALL} ready'
|
79
|
+
)
|
80
|
+
break
|
81
|
+
elif jobsets['status']['replicatedJobsStatus'][0]['succeeded']:
|
82
|
+
return
|
83
|
+
elif jobsets['status']['replicatedJobsStatus'][0]['failed']:
|
84
|
+
logger.info(
|
85
|
+
f'job '
|
86
|
+
f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}{job_name}'
|
87
|
+
f'{colorama.Style.RESET_ALL} '
|
88
|
+
f'{colorama.Fore.RED}{colorama.Style.BRIGHT}failed{colorama.Style.RESET_ALL}'
|
89
|
+
)
|
90
|
+
job = jobset_utils.get_job(namespace, job_name)
|
91
|
+
_raise_job_error(job)
|
92
|
+
return
|
92
93
|
if timeout != -1 and time.time() - start > timeout:
|
93
94
|
logger.error(
|
94
95
|
f'{colorama.Style.BRIGHT}'
|
@@ -1,5 +1,6 @@
|
|
1
1
|
"""Jobset utils: wraps CRUD operations for jobsets"""
|
2
2
|
|
3
|
+
import base64
|
3
4
|
import enum
|
4
5
|
import json
|
5
6
|
import os
|
@@ -15,7 +16,7 @@ if typing.TYPE_CHECKING:
|
|
15
16
|
from datetime import timedelta
|
16
17
|
|
17
18
|
import konduktor
|
18
|
-
from konduktor import config, constants, kube_client, logging
|
19
|
+
from konduktor import authentication, config, constants, kube_client, logging
|
19
20
|
from konduktor.data import registry
|
20
21
|
from konduktor.utils import (
|
21
22
|
common_utils,
|
@@ -93,6 +94,10 @@ def create_pod_spec(task: 'konduktor.Task') -> Dict[str, Any]:
|
|
93
94
|
else:
|
94
95
|
accelerator_type = None
|
95
96
|
|
97
|
+
assert task.resources.cpus is not None, 'Task resources cpus are required'
|
98
|
+
assert task.resources.memory is not None, 'Task resources memory are required'
|
99
|
+
assert task.resources.image_id is not None, 'Task resources image_id are required'
|
100
|
+
|
96
101
|
# template the commands to run on the container for syncing files. At this point
|
97
102
|
# task.stores is Dict[str, storage_utils.Storage] which is (dst, storage_obj_src)
|
98
103
|
# first we iterate through storage_mounts and then file_mounts.
|
@@ -150,10 +155,35 @@ def create_pod_spec(task: 'konduktor.Task') -> Dict[str, Any]:
|
|
150
155
|
f'though specified by `tailscale.secret_name`: {err}'
|
151
156
|
)
|
152
157
|
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
158
|
+
enable_ssh = config.get_nested(('ssh', 'enable'), False)
|
159
|
+
secret_name = None
|
160
|
+
if enable_ssh:
|
161
|
+
private_key_path, public_key_path = authentication.get_or_generate_keys()
|
162
|
+
with (
|
163
|
+
open(private_key_path, 'rb') as private_key_file,
|
164
|
+
open(public_key_path, 'rb') as public_key_file,
|
165
|
+
):
|
166
|
+
private_key, public_key = private_key_file.read(), public_key_file.read()
|
167
|
+
user_hash = common_utils.get_user_hash()
|
168
|
+
context = kubernetes_utils.get_current_kube_config_context_name()
|
169
|
+
namespace = kubernetes_utils.get_kube_config_context_namespace(
|
170
|
+
context_name=context
|
171
|
+
)
|
172
|
+
secret_name = f'konduktor-ssh-keys-{user_hash}'
|
173
|
+
ok, result = kubernetes_utils.set_secret(
|
174
|
+
secret_name=secret_name,
|
175
|
+
namespace=namespace,
|
176
|
+
context=context,
|
177
|
+
data={
|
178
|
+
'PUBKEY': base64.b64encode(public_key).decode(),
|
179
|
+
'PRIVKEY': base64.b64encode(private_key).decode(),
|
180
|
+
},
|
181
|
+
)
|
182
|
+
if not ok:
|
183
|
+
raise exceptions.CreateSecretError(
|
184
|
+
f'Failed to set k8s secret {secret_name}: \n{result}'
|
185
|
+
)
|
186
|
+
|
157
187
|
with tempfile.NamedTemporaryFile() as temp:
|
158
188
|
common_utils.fill_template(
|
159
189
|
'pod.yaml.j2',
|
@@ -166,6 +196,7 @@ def create_pod_spec(task: 'konduktor.Task') -> Dict[str, Any]:
|
|
166
196
|
'master_addr': master_addr,
|
167
197
|
'num_nodes': task.num_nodes,
|
168
198
|
'job_name': task.name, # append timestamp and user id here?
|
199
|
+
'setup_cmd': task.setup or '',
|
169
200
|
'run_cmd': task.run,
|
170
201
|
'node_hostnames': node_hostnames,
|
171
202
|
'accelerator_type': accelerator_type,
|
@@ -176,6 +207,9 @@ def create_pod_spec(task: 'konduktor.Task') -> Dict[str, Any]:
|
|
176
207
|
'user': common_utils.get_cleaned_username(),
|
177
208
|
# Tailscale credentials
|
178
209
|
'tailscale_secret': tailscale_secret,
|
210
|
+
# SSH
|
211
|
+
'enable_ssh': enable_ssh,
|
212
|
+
'secret_name': secret_name,
|
179
213
|
},
|
180
214
|
temp.name,
|
181
215
|
)
|
@@ -183,6 +217,13 @@ def create_pod_spec(task: 'konduktor.Task') -> Dict[str, Any]:
|
|
183
217
|
# merge with `~/.konduktor/config.yaml``
|
184
218
|
kubernetes_utils.combine_pod_config_fields(temp.name, pod_config)
|
185
219
|
pod_config = common_utils.read_yaml(temp.name)
|
220
|
+
|
221
|
+
for env_var in pod_config['kubernetes']['pod_config']['spec']['containers'][0][
|
222
|
+
'env'
|
223
|
+
]:
|
224
|
+
if env_var['name'] in task.envs:
|
225
|
+
env_var['value'] = task.envs.pop(env_var['name'])
|
226
|
+
|
186
227
|
for k, v in task.envs.items():
|
187
228
|
pod_config['kubernetes']['pod_config']['spec']['containers'][0][
|
188
229
|
'env'
|
@@ -221,6 +262,7 @@ def create_jobset(
|
|
221
262
|
'user': common_utils.get_cleaned_username(),
|
222
263
|
'accelerator_type': accelerator_type,
|
223
264
|
'num_accelerators': num_accelerators,
|
265
|
+
'completions': task.resources.get_completions(),
|
224
266
|
**_JOBSET_METADATA_LABELS,
|
225
267
|
},
|
226
268
|
temp.name,
|
@@ -1037,8 +1037,11 @@ class S3Store(storage_utils.AbstractStore):
|
|
1037
1037
|
secret_name=cls._AWS_SECRET_NAME,
|
1038
1038
|
namespace=namespace,
|
1039
1039
|
context=context,
|
1040
|
-
|
1041
|
-
|
1040
|
+
data={
|
1041
|
+
cls._AWS_CREDENTIALS_KEY: base64_utils.zip_base64encode(
|
1042
|
+
credentials_files
|
1043
|
+
)
|
1044
|
+
},
|
1042
1045
|
)
|
1043
1046
|
if not ok:
|
1044
1047
|
logger.error(f'Failed to set AWS credentials in k8s secret: \n{result}')
|
@@ -891,8 +891,11 @@ class GcsStore(storage_utils.AbstractStore):
|
|
891
891
|
secret_name=cls._GCP_SECRET_NAME,
|
892
892
|
namespace=namespace,
|
893
893
|
context=context,
|
894
|
-
|
895
|
-
|
894
|
+
data={
|
895
|
+
cls._GCP_CREDENTIALS_KEY: base64_utils.zip_base64encode(
|
896
|
+
credentials_files
|
897
|
+
)
|
898
|
+
},
|
896
899
|
)
|
897
900
|
if not ok:
|
898
901
|
logger.error(f'Failed to set GCP credentials in k8s secret: \n{result}')
|
@@ -49,6 +49,7 @@ class Resources:
|
|
49
49
|
image_id: Union[str, None] = None,
|
50
50
|
disk_size: Optional[int] = None,
|
51
51
|
labels: Optional[Dict[str, str]] = None,
|
52
|
+
job_config: Optional[Dict[str, Union[int, str]]] = None,
|
52
53
|
# Internal use only.
|
53
54
|
# pylint: disable=invalid-name
|
54
55
|
_cluster_config_overrides: Optional[Dict[str, Any]] = None,
|
@@ -91,6 +92,7 @@ class Resources:
|
|
91
92
|
instance tags. On GCP, labels map to instance labels. On
|
92
93
|
Kubernetes, labels map to pod labels. On other clouds, labels are
|
93
94
|
not supported and will be ignored.
|
95
|
+
job_config: the configuration of the job spec
|
94
96
|
Raises:
|
95
97
|
ValueError: if some attributes are invalid.
|
96
98
|
exceptions.NoCloudAccessError: if no public cloud is enabled.
|
@@ -122,6 +124,7 @@ class Resources:
|
|
122
124
|
self._set_cpus(cpus)
|
123
125
|
self._set_memory(memory)
|
124
126
|
self._set_accelerators(accelerators)
|
127
|
+
self.job_config = job_config
|
125
128
|
|
126
129
|
# TODO: move these out of init to prevent repeated calls.
|
127
130
|
self._try_validate_cpus_mem()
|
@@ -382,6 +385,11 @@ class Resources:
|
|
382
385
|
accel_str = f'{accel_name}:{accel_count}'
|
383
386
|
return accel_str
|
384
387
|
|
388
|
+
def get_completions(self) -> Optional[int]:
    """Return the ``completions`` value from ``job_config`` as an int.

    Returns:
        ``int(job_config['completions'])`` when ``job_config`` is set and
        holds a truthy ``completions`` value. ``None`` when ``job_config``
        is unset or empty, when the ``completions`` key is absent, or when
        its value is falsy (e.g. 0 or an empty string).
    """
    # Use ``.get`` rather than indexing: a job_config that only sets other
    # options must yield None here instead of raising KeyError.
    completions = (self.job_config or {}).get('completions')
    return int(completions) if completions else None
|
392
|
+
|
385
393
|
def copy(self, **override) -> 'Resources':
|
386
394
|
"""Returns a copy of the given Resources."""
|
387
395
|
resources = Resources(
|
@@ -392,6 +400,7 @@ class Resources:
|
|
392
400
|
disk_size=override.pop('disk_size', self.disk_size),
|
393
401
|
image_id=override.pop('image_id', self.image_id),
|
394
402
|
labels=override.pop('labels', self.labels),
|
403
|
+
job_config=override.pop('job_config', self.job_config),
|
395
404
|
)
|
396
405
|
assert len(override) == 0
|
397
406
|
return resources
|
@@ -404,6 +413,13 @@ class Resources:
|
|
404
413
|
config, schemas.get_resources_schema(), 'Invalid resources YAML: '
|
405
414
|
)
|
406
415
|
|
416
|
+
if config.get('job_config', None):
|
417
|
+
common_utils.validate_schema(
|
418
|
+
config['job_config'],
|
419
|
+
schemas.get_job_schema(),
|
420
|
+
'Invalid job config YAML',
|
421
|
+
)
|
422
|
+
|
407
423
|
def _override_resources(
|
408
424
|
base_resource_config: Dict[str, Any], override_configs: List[Dict[str, Any]]
|
409
425
|
) -> List[Resources]:
|
@@ -446,6 +462,7 @@ class Resources:
|
|
446
462
|
resources_fields['disk_size'] = config.pop('disk_size', None)
|
447
463
|
resources_fields['image_id'] = config.pop('image_id', None)
|
448
464
|
resources_fields['labels'] = config.pop('labels', None)
|
465
|
+
resources_fields['job_config'] = config.pop('job_config', None)
|
449
466
|
|
450
467
|
if resources_fields['cpus'] is not None:
|
451
468
|
resources_fields['cpus'] = str(resources_fields['cpus'])
|
@@ -475,4 +492,5 @@ class Resources:
|
|
475
492
|
add_if_not_none('disk_size', self.disk_size)
|
476
493
|
add_if_not_none('image_id', self.image_id)
|
477
494
|
add_if_not_none('labels', self.labels)
|
495
|
+
add_if_not_none('job_config', self.job_config)
|
478
496
|
return config
|
@@ -181,8 +181,7 @@ class Task:
|
|
181
181
|
"""
|
182
182
|
assert name is not None, 'Task name is required'
|
183
183
|
self.name = name
|
184
|
-
|
185
|
-
raise ValueError('`setup` is being deprecated and not supported')
|
184
|
+
self.setup = setup
|
186
185
|
self.run = run
|
187
186
|
self.storage_mounts: Dict[str, storage_lib.Storage] = {}
|
188
187
|
self.storage_plans: Dict[storage_lib.Storage, storage_lib.StoreType] = {}
|
@@ -320,6 +319,7 @@ class Task:
|
|
320
319
|
|
321
320
|
task = Task(
|
322
321
|
config.pop('name', None),
|
322
|
+
setup=config.pop('setup', None),
|
323
323
|
run=config.pop('run', None),
|
324
324
|
workdir=config.pop('workdir', None),
|
325
325
|
num_nodes=config.pop('num_nodes', None),
|