konduktor-nightly 0.1.0.dev20250512104920__py3-none-any.whl → 0.1.0.dev20250514104854__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- konduktor/__init__.py +2 -2
- konduktor/authentication.py +124 -0
- konduktor/backends/jobset.py +20 -19
- konduktor/backends/jobset_utils.py +47 -5
- konduktor/data/aws/s3.py +5 -2
- konduktor/data/gcp/gcs.py +5 -2
- konduktor/resource.py +18 -0
- konduktor/task.py +2 -2
- konduktor/templates/pod.yaml.j2 +180 -16
- konduktor/utils/exceptions.py +4 -0
- konduktor/utils/kubernetes_utils.py +7 -4
- konduktor/utils/loki_utils.py +2 -1
- konduktor/utils/schemas.py +29 -82
- {konduktor_nightly-0.1.0.dev20250512104920.dist-info → konduktor_nightly-0.1.0.dev20250514104854.dist-info}/METADATA +1 -1
- {konduktor_nightly-0.1.0.dev20250512104920.dist-info → konduktor_nightly-0.1.0.dev20250514104854.dist-info}/RECORD +18 -17
- {konduktor_nightly-0.1.0.dev20250512104920.dist-info → konduktor_nightly-0.1.0.dev20250514104854.dist-info}/LICENSE +0 -0
- {konduktor_nightly-0.1.0.dev20250512104920.dist-info → konduktor_nightly-0.1.0.dev20250514104854.dist-info}/WHEEL +0 -0
- {konduktor_nightly-0.1.0.dev20250512104920.dist-info → konduktor_nightly-0.1.0.dev20250514104854.dist-info}/entry_points.txt +0 -0
konduktor/__init__.py
CHANGED
@@ -14,7 +14,7 @@ __all__ = [
|
|
14
14
|
]
|
15
15
|
|
16
16
|
# Replaced with the current commit when building the wheels.
|
17
|
-
_KONDUKTOR_COMMIT_SHA = '
|
17
|
+
_KONDUKTOR_COMMIT_SHA = '05c7d9e243ae23c6e9abb0a4a034bfc0815fd587'
|
18
18
|
os.makedirs(os.path.expanduser('~/.konduktor'), exist_ok=True)
|
19
19
|
|
20
20
|
|
@@ -48,5 +48,5 @@ def _get_git_commit():
|
|
48
48
|
|
49
49
|
|
50
50
|
__commit__ = _get_git_commit()
|
51
|
-
__version__ = '1.0.0.dev0.1.0.
|
51
|
+
__version__ = '1.0.0.dev0.1.0.dev20250514104854'
|
52
52
|
__root_dir__ = os.path.dirname(os.path.abspath(__file__))
|
@@ -0,0 +1,124 @@
|
|
1
|
+
# Proprietary Changes made for Trainy under the Trainy Software License
|
2
|
+
# Original source: skypilot: https://github.com/skypilot-org/skypilot
|
3
|
+
# which is Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
# you may not use this file except in compliance with the License.
|
5
|
+
# You may obtain a copy of the License at
|
6
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
7
|
+
# Unless required by applicable law or agreed to in writing, software
|
8
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
9
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
10
|
+
# See the License for the specific language governing permissions and
|
11
|
+
# limitations under the License.
|
12
|
+
|
13
|
+
"""
|
14
|
+
The local machine's public key should not be uploaded to the remote VM, because
|
15
|
+
it will cause private/public key pair mismatch when the user tries to launch new
|
16
|
+
VM from that remote VM using SkyPilot, e.g., the node is used as a jobs
|
17
|
+
controller. (Lambda cloud is an exception, due to the limitation of the cloud
|
18
|
+
provider. See the comments in setup_lambda_authentication)
|
19
|
+
"""
|
20
|
+
|
21
|
+
import functools
|
22
|
+
import os
|
23
|
+
from typing import Tuple
|
24
|
+
|
25
|
+
import filelock
|
26
|
+
|
27
|
+
from konduktor import logging
|
28
|
+
from konduktor.utils import common_utils
|
29
|
+
|
30
|
+
logger = logging.get_logger(__name__)
|
31
|
+
|
32
|
+
_SSH_KEY_PATH_PREFIX = '~/.konduktor/clients/{user_hash}/ssh'
|
33
|
+
|
34
|
+
MAX_TRIALS = 64
|
35
|
+
|
36
|
+
|
37
|
+
def get_ssh_key_and_lock_path() -> Tuple[str, str, str]:
    """Return the per-user SSH key and lock file paths, creating their folder.

    The folder name is derived from ``_SSH_KEY_PATH_PREFIX`` and the current
    user hash. The folder is created (with ``~`` expanded) using mode 0o700,
    but the returned paths are NOT expanded; callers that need absolute
    paths must call ``os.path.expanduser`` themselves.

    Returns:
        A ``(private_key_path, public_key_path, lock_path)`` tuple.
    """
    key_dir = _SSH_KEY_PATH_PREFIX.format(user_hash=common_utils.get_user_hash())
    os.makedirs(os.path.expanduser(key_dir), mode=0o700, exist_ok=True)

    private_key_path, public_key_path, lock_path = (
        os.path.join(key_dir, filename)
        for filename in (
            'konduktor-key',
            'konduktor-key.pub',
            '.__internal-konduktor-key.lock',
        )
    )
    return private_key_path, public_key_path, lock_path
|
45
|
+
|
46
|
+
|
47
|
+
def _generate_rsa_key_pair() -> Tuple[str, str]:
    """Create a fresh 2048-bit RSA key pair (public exponent 65537).

    Returns:
        A ``(public_key, private_key)`` tuple of stripped text. The public
        key is serialized in OpenSSH format; the private key is an
        unencrypted PEM in the TraditionalOpenSSL format.
    """
    # Imported lazily so the third-party cryptography package is only
    # loaded when a key actually has to be generated.
    # pylint: disable=import-outside-toplevel
    from cryptography.hazmat.backends import default_backend
    from cryptography.hazmat.primitives import serialization
    from cryptography.hazmat.primitives.asymmetric import rsa

    rsa_key = rsa.generate_private_key(
        public_exponent=65537, key_size=2048, backend=default_backend()
    )

    private_pem = rsa_key.private_bytes(
        encoding=serialization.Encoding.PEM,
        format=serialization.PrivateFormat.TraditionalOpenSSL,
        encryption_algorithm=serialization.NoEncryption(),
    )
    public_openssh = rsa_key.public_key().public_bytes(
        serialization.Encoding.OpenSSH, serialization.PublicFormat.OpenSSH
    )

    private_key = private_pem.decode('utf-8').strip()
    public_key = public_openssh.decode('utf-8').strip()
    # Note the order: public key first, private key second.
    return public_key, private_key
|
79
|
+
|
80
|
+
|
81
|
+
def _save_key_pair(
    private_key_path: str, public_key_path: str, private_key: str, public_key: str
) -> None:
    """Write both halves of a key pair to disk as UTF-8 text.

    The directory containing ``private_key_path`` is created with mode 0o700
    if it does not exist yet. The private key file is opened with mode 0o600
    and the public key file with mode 0o644.

    Args:
        private_key_path: Destination file for the private key.
        public_key_path: Destination file for the public key.
        private_key: Private key text to write.
        public_key: Public key text to write.
    """
    os.makedirs(os.path.dirname(private_key_path), exist_ok=True, mode=0o700)

    # (destination, permission bits passed to os.open, content).
    # The private key is written first, then the public key.
    outputs = (
        (private_key_path, 0o600, private_key),
        (public_key_path, 0o644, public_key),
    )
    for destination, permissions, content in outputs:
        with open(
            destination,
            'w',
            encoding='utf-8',
            opener=functools.partial(os.open, mode=permissions),
        ) as key_file:
            key_file.write(content)
|
102
|
+
|
103
|
+
|
104
|
+
def get_or_generate_keys() -> Tuple[str, str]:
    """Return the absolute private and public key paths, creating keys if needed.

    The key locations come from ``get_ssh_key_and_lock_path`` and are
    expanded with ``os.path.expanduser``. If no private key file exists yet,
    a new RSA key pair is generated and saved while holding a file lock
    (acquired with a 10 second timeout).

    Returns:
        A ``(private_key_path, public_key_path)`` tuple of expanded paths.

    Raises:
        AssertionError: if the public key file does not exist after the
            private key has been found or generated.
    """
    private_key_path, public_key_path, lock_path = (
        os.path.expanduser(path) for path in get_ssh_key_and_lock_path()
    )

    # The key folder holds the private key, so it is created with
    # owner-only (0o700) permissions before the lock file is placed in it.
    os.makedirs(os.path.dirname(lock_path), exist_ok=True, mode=0o700)

    with filelock.FileLock(lock_path, timeout=10):
        if not os.path.exists(private_key_path):
            public_key, private_key = _generate_rsa_key_pair()
            _save_key_pair(private_key_path, public_key_path, private_key, public_key)

    # Raised explicitly (rather than via `assert`) so the check still runs
    # under `python -O`; the exception type is kept for existing callers.
    if not os.path.exists(public_key_path):
        raise AssertionError(
            'Private key found, but associated public key '
            f'{public_key_path} does not exist.'
        )
    return private_key_path, public_key_path
|
konduktor/backends/jobset.py
CHANGED
@@ -70,25 +70,26 @@ def _wait_for_jobset_start(namespace: str, job_name: str):
|
|
70
70
|
assert jobsets is not None, (
|
71
71
|
f'Jobset {job_name} ' f'not found in namespace {namespace}'
|
72
72
|
)
|
73
|
-
if
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
73
|
+
if 'status' in jobsets:
|
74
|
+
if jobsets['status']['replicatedJobsStatus'][0]['ready']:
|
75
|
+
logger.info(
|
76
|
+
f'task '
|
77
|
+
f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}{job_name}'
|
78
|
+
f'{colorama.Style.RESET_ALL} ready'
|
79
|
+
)
|
80
|
+
break
|
81
|
+
elif jobsets['status']['replicatedJobsStatus'][0]['succeeded']:
|
82
|
+
return
|
83
|
+
elif jobsets['status']['replicatedJobsStatus'][0]['failed']:
|
84
|
+
logger.info(
|
85
|
+
f'job '
|
86
|
+
f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}{job_name}'
|
87
|
+
f'{colorama.Style.RESET_ALL} '
|
88
|
+
f'{colorama.Fore.RED}{colorama.Style.BRIGHT}failed{colorama.Style.RESET_ALL}'
|
89
|
+
)
|
90
|
+
job = jobset_utils.get_job(namespace, job_name)
|
91
|
+
_raise_job_error(job)
|
92
|
+
return
|
92
93
|
if timeout != -1 and time.time() - start > timeout:
|
93
94
|
logger.error(
|
94
95
|
f'{colorama.Style.BRIGHT}'
|
@@ -1,5 +1,6 @@
|
|
1
1
|
"""Jobset utils: wraps CRUD operations for jobsets"""
|
2
2
|
|
3
|
+
import base64
|
3
4
|
import enum
|
4
5
|
import json
|
5
6
|
import os
|
@@ -15,7 +16,7 @@ if typing.TYPE_CHECKING:
|
|
15
16
|
from datetime import timedelta
|
16
17
|
|
17
18
|
import konduktor
|
18
|
-
from konduktor import config, constants, kube_client, logging
|
19
|
+
from konduktor import authentication, config, constants, kube_client, logging
|
19
20
|
from konduktor.data import registry
|
20
21
|
from konduktor.utils import (
|
21
22
|
common_utils,
|
@@ -93,6 +94,10 @@ def create_pod_spec(task: 'konduktor.Task') -> Dict[str, Any]:
|
|
93
94
|
else:
|
94
95
|
accelerator_type = None
|
95
96
|
|
97
|
+
assert task.resources.cpus is not None, 'Task resources cpus are required'
|
98
|
+
assert task.resources.memory is not None, 'Task resources memory are required'
|
99
|
+
assert task.resources.image_id is not None, 'Task resources image_id are required'
|
100
|
+
|
96
101
|
# template the commands to run on the container for syncing files. At this point
|
97
102
|
# task.stores is Dict[str, storage_utils.Storage] which is (dst, storage_obj_src)
|
98
103
|
# first we iterate through storage_mounts and then file_mounts.
|
@@ -150,10 +155,35 @@ def create_pod_spec(task: 'konduktor.Task') -> Dict[str, Any]:
|
|
150
155
|
f'though specified by `tailscale.secret_name`: {err}'
|
151
156
|
)
|
152
157
|
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
158
|
+
enable_ssh = config.get_nested(('ssh', 'enable'), False)
|
159
|
+
secret_name = None
|
160
|
+
if enable_ssh:
|
161
|
+
private_key_path, public_key_path = authentication.get_or_generate_keys()
|
162
|
+
with (
|
163
|
+
open(private_key_path, 'rb') as private_key_file,
|
164
|
+
open(public_key_path, 'rb') as public_key_file,
|
165
|
+
):
|
166
|
+
private_key, public_key = private_key_file.read(), public_key_file.read()
|
167
|
+
user_hash = common_utils.get_user_hash()
|
168
|
+
context = kubernetes_utils.get_current_kube_config_context_name()
|
169
|
+
namespace = kubernetes_utils.get_kube_config_context_namespace(
|
170
|
+
context_name=context
|
171
|
+
)
|
172
|
+
secret_name = f'konduktor-ssh-keys-{user_hash}'
|
173
|
+
ok, result = kubernetes_utils.set_secret(
|
174
|
+
secret_name=secret_name,
|
175
|
+
namespace=namespace,
|
176
|
+
context=context,
|
177
|
+
data={
|
178
|
+
'PUBKEY': base64.b64encode(public_key).decode(),
|
179
|
+
'PRIVKEY': base64.b64encode(private_key).decode(),
|
180
|
+
},
|
181
|
+
)
|
182
|
+
if not ok:
|
183
|
+
raise exceptions.CreateSecretError(
|
184
|
+
f'Failed to set k8s secret {secret_name}: \n{result}'
|
185
|
+
)
|
186
|
+
|
157
187
|
with tempfile.NamedTemporaryFile() as temp:
|
158
188
|
common_utils.fill_template(
|
159
189
|
'pod.yaml.j2',
|
@@ -166,6 +196,7 @@ def create_pod_spec(task: 'konduktor.Task') -> Dict[str, Any]:
|
|
166
196
|
'master_addr': master_addr,
|
167
197
|
'num_nodes': task.num_nodes,
|
168
198
|
'job_name': task.name, # append timestamp and user id here?
|
199
|
+
'setup_cmd': task.setup or '',
|
169
200
|
'run_cmd': task.run,
|
170
201
|
'node_hostnames': node_hostnames,
|
171
202
|
'accelerator_type': accelerator_type,
|
@@ -176,6 +207,9 @@ def create_pod_spec(task: 'konduktor.Task') -> Dict[str, Any]:
|
|
176
207
|
'user': common_utils.get_cleaned_username(),
|
177
208
|
# Tailscale credentials
|
178
209
|
'tailscale_secret': tailscale_secret,
|
210
|
+
# SSH
|
211
|
+
'enable_ssh': enable_ssh,
|
212
|
+
'secret_name': secret_name,
|
179
213
|
},
|
180
214
|
temp.name,
|
181
215
|
)
|
@@ -183,6 +217,13 @@ def create_pod_spec(task: 'konduktor.Task') -> Dict[str, Any]:
|
|
183
217
|
# merge with `~/.konduktor/config.yaml``
|
184
218
|
kubernetes_utils.combine_pod_config_fields(temp.name, pod_config)
|
185
219
|
pod_config = common_utils.read_yaml(temp.name)
|
220
|
+
|
221
|
+
for env_var in pod_config['kubernetes']['pod_config']['spec']['containers'][0][
|
222
|
+
'env'
|
223
|
+
]:
|
224
|
+
if env_var['name'] in task.envs:
|
225
|
+
env_var['value'] = task.envs.pop(env_var['name'])
|
226
|
+
|
186
227
|
for k, v in task.envs.items():
|
187
228
|
pod_config['kubernetes']['pod_config']['spec']['containers'][0][
|
188
229
|
'env'
|
@@ -221,6 +262,7 @@ def create_jobset(
|
|
221
262
|
'user': common_utils.get_cleaned_username(),
|
222
263
|
'accelerator_type': accelerator_type,
|
223
264
|
'num_accelerators': num_accelerators,
|
265
|
+
'completions': task.resources.get_completions(),
|
224
266
|
**_JOBSET_METADATA_LABELS,
|
225
267
|
},
|
226
268
|
temp.name,
|
konduktor/data/aws/s3.py
CHANGED
@@ -1037,8 +1037,11 @@ class S3Store(storage_utils.AbstractStore):
|
|
1037
1037
|
secret_name=cls._AWS_SECRET_NAME,
|
1038
1038
|
namespace=namespace,
|
1039
1039
|
context=context,
|
1040
|
-
|
1041
|
-
|
1040
|
+
data={
|
1041
|
+
cls._AWS_CREDENTIALS_KEY: base64_utils.zip_base64encode(
|
1042
|
+
credentials_files
|
1043
|
+
)
|
1044
|
+
},
|
1042
1045
|
)
|
1043
1046
|
if not ok:
|
1044
1047
|
logger.error(f'Failed to set AWS credentials in k8s secret: \n{result}')
|
konduktor/data/gcp/gcs.py
CHANGED
@@ -891,8 +891,11 @@ class GcsStore(storage_utils.AbstractStore):
|
|
891
891
|
secret_name=cls._GCP_SECRET_NAME,
|
892
892
|
namespace=namespace,
|
893
893
|
context=context,
|
894
|
-
|
895
|
-
|
894
|
+
data={
|
895
|
+
cls._GCP_CREDENTIALS_KEY: base64_utils.zip_base64encode(
|
896
|
+
credentials_files
|
897
|
+
)
|
898
|
+
},
|
896
899
|
)
|
897
900
|
if not ok:
|
898
901
|
logger.error(f'Failed to set GCP credentials in k8s secret: \n{result}')
|
konduktor/resource.py
CHANGED
@@ -49,6 +49,7 @@ class Resources:
|
|
49
49
|
image_id: Union[str, None] = None,
|
50
50
|
disk_size: Optional[int] = None,
|
51
51
|
labels: Optional[Dict[str, str]] = None,
|
52
|
+
job_config: Optional[Dict[str, Union[int, str]]] = None,
|
52
53
|
# Internal use only.
|
53
54
|
# pylint: disable=invalid-name
|
54
55
|
_cluster_config_overrides: Optional[Dict[str, Any]] = None,
|
@@ -91,6 +92,7 @@ class Resources:
|
|
91
92
|
instance tags. On GCP, labels map to instance labels. On
|
92
93
|
Kubernetes, labels map to pod labels. On other clouds, labels are
|
93
94
|
not supported and will be ignored.
|
95
|
+
job_config: the configuration of the job spec
|
94
96
|
Raises:
|
95
97
|
ValueError: if some attributes are invalid.
|
96
98
|
exceptions.NoCloudAccessError: if no public cloud is enabled.
|
@@ -122,6 +124,7 @@ class Resources:
|
|
122
124
|
self._set_cpus(cpus)
|
123
125
|
self._set_memory(memory)
|
124
126
|
self._set_accelerators(accelerators)
|
127
|
+
self.job_config = job_config
|
125
128
|
|
126
129
|
# TODO: move these out of init to prevent repeated calls.
|
127
130
|
self._try_validate_cpus_mem()
|
@@ -382,6 +385,11 @@ class Resources:
|
|
382
385
|
accel_str = f'{accel_name}:{accel_count}'
|
383
386
|
return accel_str
|
384
387
|
|
388
|
+
def get_completions(self) -> Optional[int]:
    """Return the ``completions`` entry of ``job_config`` as an int, if set.

    Returns:
        ``int(job_config['completions'])`` when ``job_config`` has a truthy
        ``completions`` entry; ``None`` when ``job_config`` is unset/empty
        or the entry is missing or falsy.
    """
    # Use .get() so a job_config that carries other keys but no
    # 'completions' yields None instead of raising KeyError.
    completions = self.job_config.get('completions') if self.job_config else None
    return int(completions) if completions else None
|
392
|
+
|
385
393
|
def copy(self, **override) -> 'Resources':
|
386
394
|
"""Returns a copy of the given Resources."""
|
387
395
|
resources = Resources(
|
@@ -392,6 +400,7 @@ class Resources:
|
|
392
400
|
disk_size=override.pop('disk_size', self.disk_size),
|
393
401
|
image_id=override.pop('image_id', self.image_id),
|
394
402
|
labels=override.pop('labels', self.labels),
|
403
|
+
job_config=override.pop('job_config', self.job_config),
|
395
404
|
)
|
396
405
|
assert len(override) == 0
|
397
406
|
return resources
|
@@ -404,6 +413,13 @@ class Resources:
|
|
404
413
|
config, schemas.get_resources_schema(), 'Invalid resources YAML: '
|
405
414
|
)
|
406
415
|
|
416
|
+
if config.get('job_config', None):
|
417
|
+
common_utils.validate_schema(
|
418
|
+
config['job_config'],
|
419
|
+
schemas.get_job_schema(),
|
420
|
+
'Invalid job config YAML',
|
421
|
+
)
|
422
|
+
|
407
423
|
def _override_resources(
|
408
424
|
base_resource_config: Dict[str, Any], override_configs: List[Dict[str, Any]]
|
409
425
|
) -> List[Resources]:
|
@@ -446,6 +462,7 @@ class Resources:
|
|
446
462
|
resources_fields['disk_size'] = config.pop('disk_size', None)
|
447
463
|
resources_fields['image_id'] = config.pop('image_id', None)
|
448
464
|
resources_fields['labels'] = config.pop('labels', None)
|
465
|
+
resources_fields['job_config'] = config.pop('job_config', None)
|
449
466
|
|
450
467
|
if resources_fields['cpus'] is not None:
|
451
468
|
resources_fields['cpus'] = str(resources_fields['cpus'])
|
@@ -475,4 +492,5 @@ class Resources:
|
|
475
492
|
add_if_not_none('disk_size', self.disk_size)
|
476
493
|
add_if_not_none('image_id', self.image_id)
|
477
494
|
add_if_not_none('labels', self.labels)
|
495
|
+
add_if_not_none('job_config', self.job_config)
|
478
496
|
return config
|
konduktor/task.py
CHANGED
@@ -181,8 +181,7 @@ class Task:
|
|
181
181
|
"""
|
182
182
|
assert name is not None, 'Task name is required'
|
183
183
|
self.name = name
|
184
|
-
|
185
|
-
raise ValueError('`setup` is being deprecated and not supported')
|
184
|
+
self.setup = setup
|
186
185
|
self.run = run
|
187
186
|
self.storage_mounts: Dict[str, storage_lib.Storage] = {}
|
188
187
|
self.storage_plans: Dict[storage_lib.Storage, storage_lib.StoreType] = {}
|
@@ -320,6 +319,7 @@ class Task:
|
|
320
319
|
|
321
320
|
task = Task(
|
322
321
|
config.pop('name', None),
|
322
|
+
setup=config.pop('setup', None),
|
323
323
|
run=config.pop('run', None),
|
324
324
|
workdir=config.pop('workdir', None),
|
325
325
|
num_nodes=config.pop('num_nodes', None),
|
konduktor/templates/pod.yaml.j2
CHANGED
@@ -1,12 +1,12 @@
|
|
1
1
|
kubernetes:
|
2
2
|
pod_config:
|
3
3
|
metadata:
|
4
|
-
{% if accelerator_type %}
|
5
4
|
labels:
|
6
5
|
parent: trainy
|
7
|
-
trainy.ai/accelerator: {{ accelerator_type }}
|
8
6
|
trainy.ai/username: {{ user }}
|
9
|
-
|
7
|
+
{% if accelerator_type %}
|
8
|
+
trainy.ai/accelerator: {{ accelerator_type }}
|
9
|
+
{% endif %}
|
10
10
|
spec:
|
11
11
|
restartPolicy: "Never"
|
12
12
|
# trigger this on GPU request
|
@@ -15,9 +15,39 @@ kubernetes:
|
|
15
15
|
- key: "nvidia.com/gpu"
|
16
16
|
operator: "Exists"
|
17
17
|
{% endif %}
|
18
|
+
initContainers:
|
19
|
+
- name: setup-synchronizer
|
20
|
+
image: "alpine:3.19"
|
21
|
+
restartPolicy: Always
|
22
|
+
command: ["/bin/sh", "-c"]
|
23
|
+
args:
|
24
|
+
- |
|
25
|
+
apk add --no-cache socat
|
26
|
+
wget https://raw.githubusercontent.com/asaiacai/dumb_barrier/refs/heads/main/dumb_barrier.sh
|
27
|
+
sh -x dumb_barrier.sh
|
28
|
+
volumeMounts:
|
29
|
+
- name: sync
|
30
|
+
mountPath: /tmp/konduktor
|
31
|
+
env:
|
32
|
+
- name: MASTER_ADDR
|
33
|
+
value: "{{ master_addr }}"
|
34
|
+
- name: RANK
|
35
|
+
valueFrom:
|
36
|
+
fieldRef:
|
37
|
+
fieldPath: metadata.annotations['batch.kubernetes.io/job-completion-index']
|
38
|
+
- name: WORLD_SIZE
|
39
|
+
value: "{{ num_nodes }}"
|
40
|
+
- name: MASTER_PORT
|
41
|
+
value: "11111"
|
42
|
+
- name: GO_PORT
|
43
|
+
value: "11112"
|
18
44
|
containers:
|
19
45
|
# TODO(asaiacai): should decide here whether we add the fabric interfaces/containers init etc.
|
20
46
|
- name: konduktor-container
|
47
|
+
{% if enable_ssh %}
|
48
|
+
ports:
|
49
|
+
- containerPort: 2222
|
50
|
+
{% endif %}
|
21
51
|
image: {{ image_id }}
|
22
52
|
# this is set during jobset definition since we need to know the jobset
|
23
53
|
# name and number of nodes to set all the environment variables correctly here
|
@@ -56,6 +86,18 @@ kubernetes:
|
|
56
86
|
fieldRef:
|
57
87
|
fieldPath: metadata.uid
|
58
88
|
{% endif %}
|
89
|
+
{% if enable_ssh %}
|
90
|
+
- name: KONDUKTOR_SSHPUB
|
91
|
+
valueFrom:
|
92
|
+
secretKeyRef:
|
93
|
+
name: {{ secret_name }}
|
94
|
+
key: PUBKEY
|
95
|
+
- name: KONDUKTOR_SSHPRIV
|
96
|
+
valueFrom:
|
97
|
+
secretKeyRef:
|
98
|
+
name: {{ secret_name }}
|
99
|
+
key: PRIVKEY
|
100
|
+
{% endif %}
|
59
101
|
# these are for compatibility with skypilot
|
60
102
|
- name: SKYPILOT_NODE_IPS
|
61
103
|
value: "{{ node_hostnames }}"
|
@@ -70,6 +112,8 @@ kubernetes:
|
|
70
112
|
volumeMounts:
|
71
113
|
- name: shared-memory
|
72
114
|
mountPath: /dev/shm
|
115
|
+
- name: sync
|
116
|
+
mountPath: /tmp/konduktor
|
73
117
|
{% for secret_type, secret_name in mount_secrets.items() %}
|
74
118
|
- name: {{ secret_type }}-secret
|
75
119
|
mountPath: /run/konduktor/{{ secret_type }}-secret
|
@@ -89,22 +133,22 @@ kubernetes:
|
|
89
133
|
|
90
134
|
|
91
135
|
PACKAGES="";
|
92
|
-
{% if 'rsync' in run_cmd %}
|
136
|
+
{% if 'rsync' in run_cmd or 'rsync' in setup_cmd %}
|
93
137
|
PACKAGES="$PACKAGES rsync";
|
94
138
|
{% endif %}
|
95
|
-
{% if 'curl' in run_cmd or tailscale_secret %}
|
139
|
+
{% if 'curl' in run_cmd or 'curl' in setup_cmd or tailscale_secret %}
|
96
140
|
PACKAGES="$PACKAGES curl";
|
97
141
|
{% endif %}
|
98
142
|
{% if 'gs' in mount_secrets or 's3' in mount_secrets %}
|
99
143
|
PACKAGES="$PACKAGES unzip wget";
|
100
144
|
{% endif %}
|
101
|
-
{% if 'git' in run_cmd %}
|
145
|
+
{% if 'git' in run_cmd or 'git' in setup_cmd %}
|
102
146
|
PACKAGES="$PACKAGES git";
|
103
147
|
{% endif %}
|
104
148
|
|
105
149
|
if [ ! -z "${PACKAGES}" ]; then
|
106
150
|
# Run apt update, install missing packages
|
107
|
-
DEBIAN_FRONTEND=noninteractive $(prefix_cmd) apt-get update
|
151
|
+
DEBIAN_FRONTEND=noninteractive $(prefix_cmd) apt-get update 2>&1 | tee -a ~/.konduktor/tmp/apt-update.log 2>&1 || \
|
108
152
|
$(prefix_cmd) echo "Warning: apt-get update failed. Continuing anyway..." >> ~/.konduktor/tmp/apt-update.log
|
109
153
|
fi
|
110
154
|
|
@@ -125,22 +169,112 @@ kubernetes:
|
|
125
169
|
done;
|
126
170
|
if [ ! -z "$INSTALL_FIRST" ]; then
|
127
171
|
$(prefix_cmd) echo "Installing core packages: $INSTALL_FIRST";
|
128
|
-
DEBIAN_FRONTEND=noninteractive $(prefix_cmd) apt-get install -y $INSTALL_FIRST
|
172
|
+
DEBIAN_FRONTEND=noninteractive $(prefix_cmd) apt-get install -y $INSTALL_FIRST 2>&1 | tee -a ~/.konduktor/tmp/apt-install.log;
|
129
173
|
fi;
|
130
174
|
|
131
175
|
if [ ! -z "$MISSING_PACKAGES" ]; then
|
132
176
|
$(prefix_cmd) echo "Installing missing packages: $MISSING_PACKAGES";
|
133
|
-
DEBIAN_FRONTEND=noninteractive $(prefix_cmd) apt-get install -y $MISSING_PACKAGES
|
177
|
+
DEBIAN_FRONTEND=noninteractive $(prefix_cmd) apt-get install -y $MISSING_PACKAGES 2>&1 | tee -a ~/.konduktor/tmp/apt-install.log;
|
134
178
|
fi;
|
135
179
|
end_epoch=$(date +%s);
|
136
180
|
|
181
|
+
{% if enable_ssh %}
|
182
|
+
|
183
|
+
function InstallSSH {
|
184
|
+
export DEBIAN_FRONTEND=noninteractive
|
185
|
+
export TZ=Etc/UTC
|
186
|
+
if service sshd status > /dev/null 2>&1; then
|
187
|
+
$(prefix_cmd) echo "OpenSSH server is already started."
|
188
|
+
return
|
189
|
+
fi
|
190
|
+
# Check if OpenSSH server is already installed
|
191
|
+
if ! command -v sshd &> /dev/null; then
|
192
|
+
$(prefix_cmd) echo "OpenSSH server is not installed. Installing..."
|
193
|
+
|
194
|
+
DEBIAN_FRONTEND=noninteractive $(prefix_cmd) apt update
|
195
|
+
DEBIAN_FRONTEND=noninteractive $(prefix_cmd) apt install -y openssh-server
|
196
|
+
|
197
|
+
$(prefix_cmd) echo "OpenSSH server installation complete."
|
198
|
+
else
|
199
|
+
$(prefix_cmd) echo "OpenSSH server is already installed."
|
200
|
+
fi
|
201
|
+
|
202
|
+
# Set root password if SSHKEY is provided
|
203
|
+
# Enable root login in SSH configuration
|
204
|
+
$(prefix_cmd) sed -i '/^#PermitRootLogin/c\PermitRootLogin yes' /etc/ssh/sshd_config
|
205
|
+
$(prefix_cmd) sed -i '/^PermitRootLogin/c\PermitRootLogin yes' /etc/ssh/sshd_config
|
206
|
+
$(prefix_cmd) echo "Root login is enabled."
|
207
|
+
|
208
|
+
# Create the .ssh directory and authorized_keys file if they don't exist
|
209
|
+
if [ ! -d "$HOME/.ssh" ]; then
|
210
|
+
$(prefix_cmd) mkdir -p "$HOME/.ssh"
|
211
|
+
$(prefix_cmd) chmod 0700 "$HOME/.ssh"
|
212
|
+
$(prefix_cmd) echo "Directory $HOME/.ssh created."
|
213
|
+
fi
|
214
|
+
if [ ! -f "$HOME/.ssh/authorized_keys" ]; then
|
215
|
+
$(prefix_cmd) touch "$HOME/.ssh/authorized_keys"
|
216
|
+
$(prefix_cmd) chmod 0600 "$HOME/.ssh/authorized_keys"
|
217
|
+
$(prefix_cmd) echo "File $HOME/.ssh/authorized_keys created."
|
218
|
+
fi
|
219
|
+
# Check if the public key is not already present in authorized_keys
|
220
|
+
if ! grep -q "${KONDUKTOR_SSHPUB}" "$HOME/.ssh/authorized_keys"; then
|
221
|
+
# Append the public key to authorized_keys
|
222
|
+
$(prefix_cmd) echo "${KONDUKTOR_SSHPUB}" >> "$HOME/.ssh/authorized_keys"
|
223
|
+
$(prefix_cmd) echo "Public key added."
|
224
|
+
fi
|
225
|
+
if [ ! -f "$HOME/.ssh/konduktor-key" ]; then
|
226
|
+
# create the private key to authorized_keys
|
227
|
+
$(prefix_cmd) touch "$HOME/.ssh/konduktor-key"
|
228
|
+
$(prefix_cmd) chmod 0600 "$HOME/.ssh/konduktor-key"
|
229
|
+
$(prefix_cmd) echo "${KONDUKTOR_SSHPRIV}" >> "$HOME/.ssh/konduktor-key"
|
230
|
+
$(prefix_cmd) echo "private key added."
|
231
|
+
fi
|
232
|
+
if [ ! -f "$HOME/.ssh/config" ]; then
|
233
|
+
# create the private key to authorized_keys
|
234
|
+
$(prefix_cmd) touch "$HOME/.ssh/config"
|
235
|
+
$(prefix_cmd) chmod 0600 "$HOME/.ssh/config"
|
236
|
+
$(prefix_cmd) printf '\nHost *\n StrictHostKeyChecking no\n' >> "$HOME/.ssh/config"
|
237
|
+
$(prefix_cmd) echo "ssh config set"
|
238
|
+
fi
|
239
|
+
|
240
|
+
# turn off PAM to fix sshd login issue
|
241
|
+
$(prefix_cmd) sed -i 's/UsePAM yes/UsePAM no/' /etc/ssh/sshd_config
|
242
|
+
|
243
|
+
# set default port to 2222
|
244
|
+
$(prefix_cmd) sed -i 's/#Port 22/Port 2222/' /etc/ssh/sshd_config
|
245
|
+
|
246
|
+
echo "Exposing ENV variables"
|
247
|
+
env -0 | awk -v RS='\0' '
|
248
|
+
{
|
249
|
+
gsub(/\\/,"\\\\"); # escape existing backslashes first
|
250
|
+
gsub(/"/,"\\\""); # escape any double quotes
|
251
|
+
gsub(/\n/,"\\n"); # turn real newlines into the two characters \n
|
252
|
+
sub(/=/,"=\""); # open the value-quoting
|
253
|
+
print $0 "\""; # close the quote and add a newline record separator
|
254
|
+
}
|
255
|
+
' > /etc/environment
|
256
|
+
echo "set -a; source /etc/environment; set +a;" >> /root/.bashrc
|
257
|
+
|
258
|
+
$(prefix_cmd) mkdir /run/sshd
|
259
|
+
$(prefix_cmd) chmod 0755 /run/sshd
|
260
|
+
|
261
|
+
$(prefix_cmd) service ssh start
|
262
|
+
$(prefix_cmd) echo "sshd service started"
|
263
|
+
}
|
264
|
+
|
265
|
+
InstallSSH
|
266
|
+
{% endif %}
|
267
|
+
|
137
268
|
{% if tailscale_secret %}
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
269
|
+
function InstallTailscale {
|
270
|
+
if ! command -v tailscale >/dev/null 2>&1; then
|
271
|
+
export TS_HOSTNAME=$(echo "$POD_NAME" | sed 's/-[^-]*$//')
|
272
|
+
$(prefix_cmd) curl -fsSL https://tailscale.com/install.sh | DEBIAN_FRONTEND=noninteractive $(prefix_cmd) sh 2>&1 | tee -a ~/.konduktor/tmp/tailscale-install.log
|
273
|
+
$(prefix_cmd) tailscaled --tun=userspace-networking >/dev/null 2>&1 &
|
274
|
+
$(prefix_cmd) tailscale up --auth-key=${TS_AUTHKEY} --ssh --hostname=${TS_HOSTNAME} >/dev/null 2>&1
|
275
|
+
fi
|
276
|
+
}
|
277
|
+
InstallTailscale &
|
144
278
|
{% endif %}
|
145
279
|
end_epoch=$(date +%s);
|
146
280
|
|
@@ -182,14 +316,42 @@ kubernetes:
|
|
182
316
|
end_setup_time=$((end_epoch - start_setup));
|
183
317
|
ulimit -Sc 0 && ulimit -Hc 0
|
184
318
|
$(prefix_cmd) echo "===== KONDUKTOR: Initialization took $end_setup_time seconds ====="
|
185
|
-
|
319
|
+
set +eo pipefail
|
186
320
|
$(prefix_cmd) cd {{ remote_workdir }}
|
321
|
+
{% if setup_cmd %}
|
322
|
+
# setup task
|
323
|
+
$(prefix_cmd) echo "===== KONDUKTOR: Running setup ======="
|
324
|
+
{{ setup_cmd | indent( width=14 ) }}
|
325
|
+
{% endif %}
|
326
|
+
|
327
|
+
# synchronize workers before executing `run`
|
328
|
+
set -e
|
329
|
+
touch "/tmp/konduktor/SETUP"
|
330
|
+
# TODO(asaiacai): should we make this value tuneable for users?
|
331
|
+
TIMEOUT=3600
|
332
|
+
start_sync=$(date +%s);
|
333
|
+
DEADLINE=$(( $(date +%s) + TIMEOUT ))
|
334
|
+
|
335
|
+
echo "[KONDUKTOR: main] Waiting for workers to synchronize"
|
336
|
+
while [ ! -f "/tmp/konduktor/READY" ]; do
|
337
|
+
if [ "$(date +%s)" -ge "$DEADLINE" ]; then
|
338
|
+
echo "[KONDUKTOR: main] ERROR: Timed out after 2 minutes of waiting for worker synchronization"
|
339
|
+
exit 1
|
340
|
+
fi
|
341
|
+
sleep 0.5
|
342
|
+
done
|
343
|
+
echo "[KONDUKTOR: main] All workers have joined"
|
344
|
+
end_sync=$(date +%s);
|
345
|
+
echo "[KONDUKTOR: main] Synchronization took $((end_sync - start_sync)) seconds"
|
187
346
|
set +eo pipefail
|
347
|
+
# run task
|
348
|
+
$(prefix_cmd) cd {{ remote_workdir }}
|
188
349
|
$(prefix_cmd) echo "===== KONDUKTOR: Running task ====="
|
189
350
|
start_epoch=$(date +%s);
|
190
351
|
{{ run_cmd | indent( width=14 ) }}
|
191
352
|
end_epoch=$(date +%s);
|
192
353
|
exit_code=$?
|
354
|
+
set +ex
|
193
355
|
$(prefix_cmd) echo "===== KONDUKTOR: Running task took $((end_epoch - start_epoch)) seconds and finished with exit code: $exit_code ====="
|
194
356
|
exit $exit_code
|
195
357
|
resources:
|
@@ -216,6 +378,8 @@ kubernetes:
|
|
216
378
|
emptyDir:
|
217
379
|
medium: "Memory"
|
218
380
|
sizeLimit: 4Gi
|
381
|
+
- name: sync
|
382
|
+
emptyDir: {}
|
219
383
|
{% for secret_type, secret_name in mount_secrets.items() %}
|
220
384
|
- name: {{ secret_type }}-secret
|
221
385
|
secret:
|
konduktor/utils/exceptions.py
CHANGED
@@ -54,7 +54,7 @@ NO_ACCELERATOR_HELP_MESSAGE = (
|
|
54
54
|
'(e.g. `nvidia.com/gpu` are setup correctly. '
|
55
55
|
)
|
56
56
|
|
57
|
-
_K8S_CLIENT_LOCK_PATH = '~/.konduktor/k8s_client.lock'
|
57
|
+
_K8S_CLIENT_LOCK_PATH = os.path.expanduser('~/.konduktor/k8s_client.lock')
|
58
58
|
_K8s_CLIENT_LOCK = filelock.FileLock(_K8S_CLIENT_LOCK_PATH)
|
59
59
|
|
60
60
|
logger = logging.get_logger(__name__)
|
@@ -578,11 +578,14 @@ def set_secret(
|
|
578
578
|
secret_name: str,
|
579
579
|
namespace: str,
|
580
580
|
context: Optional[str],
|
581
|
-
|
582
|
-
secret_value: str,
|
581
|
+
data: Dict[str, str],
|
583
582
|
) -> Tuple[bool, Optional[str]]:
|
584
583
|
"""
|
585
584
|
Create/update a secret in a namespace. Values are encoded to base64.
|
585
|
+
`secret` must be base64 encoded ie
|
586
|
+
```
|
587
|
+
base64.b64encode(secret).decode()
|
588
|
+
```
|
586
589
|
"""
|
587
590
|
with _K8s_CLIENT_LOCK:
|
588
591
|
secret_exists, response = check_secret_exists(
|
@@ -598,7 +601,7 @@ def set_secret(
|
|
598
601
|
secret = kubernetes.client.V1Secret(
|
599
602
|
metadata=kubernetes.client.V1ObjectMeta(**secret_metadata),
|
600
603
|
type='Opaque',
|
601
|
-
data=
|
604
|
+
data=data,
|
602
605
|
)
|
603
606
|
|
604
607
|
try:
|
konduktor/utils/loki_utils.py
CHANGED
@@ -67,7 +67,8 @@ def tail_loki_logs_ws(
|
|
67
67
|
logger.debug(f'Loki URL: {loki_url}')
|
68
68
|
params = {
|
69
69
|
'query': urllib.parse.quote(
|
70
|
-
f'
|
70
|
+
r'{' + f'k8s_job_name="{job_name}-workers-0",'
|
71
|
+
r' k8s_container_name="konduktor-container"} '
|
71
72
|
f' | batch_kubernetes_io_job_completion_index = `{worker_id}`'
|
72
73
|
),
|
73
74
|
'limit': num_logs,
|
konduktor/utils/schemas.py
CHANGED
@@ -87,6 +87,7 @@ def _get_single_resources_schema():
|
|
87
87
|
'_cluster_config_overrides': {
|
88
88
|
'type': 'object',
|
89
89
|
},
|
90
|
+
'job_config': {'type': 'object'},
|
90
91
|
},
|
91
92
|
}
|
92
93
|
|
@@ -153,8 +154,6 @@ def get_resources_schema():
|
|
153
154
|
'items': multi_resources_schema,
|
154
155
|
},
|
155
156
|
},
|
156
|
-
# Avoid job_recovery and spot_recovery being present at the same time.
|
157
|
-
**_check_not_both_fields_present('job_recovery', 'spot_recovery'),
|
158
157
|
}
|
159
158
|
|
160
159
|
|
@@ -337,84 +336,6 @@ def get_cluster_schema():
|
|
337
336
|
}
|
338
337
|
|
339
338
|
|
340
|
-
_NETWORK_CONFIG_SCHEMA = {
|
341
|
-
'vpc_name': {
|
342
|
-
'oneOf': [
|
343
|
-
{
|
344
|
-
'type': 'string',
|
345
|
-
},
|
346
|
-
{
|
347
|
-
'type': 'null',
|
348
|
-
},
|
349
|
-
],
|
350
|
-
},
|
351
|
-
'use_internal_ips': {
|
352
|
-
'type': 'boolean',
|
353
|
-
},
|
354
|
-
'ssh_proxy_command': {
|
355
|
-
'oneOf': [
|
356
|
-
{
|
357
|
-
'type': 'string',
|
358
|
-
},
|
359
|
-
{
|
360
|
-
'type': 'null',
|
361
|
-
},
|
362
|
-
{
|
363
|
-
'type': 'object',
|
364
|
-
'required': [],
|
365
|
-
'additionalProperties': {
|
366
|
-
'anyOf': [
|
367
|
-
{'type': 'string'},
|
368
|
-
{'type': 'null'},
|
369
|
-
]
|
370
|
-
},
|
371
|
-
},
|
372
|
-
]
|
373
|
-
},
|
374
|
-
}
|
375
|
-
|
376
|
-
_LABELS_SCHEMA = {
|
377
|
-
# Deprecated: 'instance_tags' is replaced by 'labels'. Keeping for backward
|
378
|
-
# compatibility. Will be removed after 0.8.0.
|
379
|
-
'instance_tags': {
|
380
|
-
'type': 'object',
|
381
|
-
'required': [],
|
382
|
-
'additionalProperties': {
|
383
|
-
'type': 'string',
|
384
|
-
},
|
385
|
-
},
|
386
|
-
'labels': {
|
387
|
-
'type': 'object',
|
388
|
-
'required': [],
|
389
|
-
'additionalProperties': {
|
390
|
-
'type': 'string',
|
391
|
-
},
|
392
|
-
},
|
393
|
-
}
|
394
|
-
|
395
|
-
_PRORPERTY_NAME_OR_CLUSTER_NAME_TO_PROPERTY = {
|
396
|
-
'oneOf': [
|
397
|
-
{'type': 'string'},
|
398
|
-
{
|
399
|
-
# A list of single-element dict to pretain the
|
400
|
-
# order.
|
401
|
-
# Example:
|
402
|
-
# property_name:
|
403
|
-
# - my-cluster1-*: my-property-1
|
404
|
-
# - my-cluster2-*: my-property-2
|
405
|
-
# - "*"": my-property-3
|
406
|
-
'type': 'array',
|
407
|
-
'items': {
|
408
|
-
'type': 'object',
|
409
|
-
'additionalProperties': {'type': 'string'},
|
410
|
-
'maxProperties': 1,
|
411
|
-
'minProperties': 1,
|
412
|
-
},
|
413
|
-
},
|
414
|
-
]
|
415
|
-
}
|
416
|
-
|
417
|
-
|
418
339
|
class RemoteIdentityOptions(enum.Enum):
|
419
340
|
"""Enum for remote identity types.
|
420
341
|
|
@@ -454,9 +375,8 @@ _REMOTE_IDENTITY_SCHEMA_KUBERNETES = {
|
|
454
375
|
|
455
376
|
def get_storage_schema():
|
456
377
|
# pylint: disable=import-outside-toplevel
|
457
|
-
from knoduktor.registry import registry
|
458
|
-
|
459
378
|
from konduktor.data import storage
|
379
|
+
from konduktor.registry import registry
|
460
380
|
|
461
381
|
return {
|
462
382
|
'$schema': 'https://json-schema.org/draft/2020-12/schema',
|
@@ -496,6 +416,21 @@ def get_storage_schema():
|
|
496
416
|
}
|
497
417
|
|
498
418
|
|
419
|
+
def get_job_schema():
|
420
|
+
"""Schema for a job spec, which is defined under resources."""
|
421
|
+
return {
|
422
|
+
'$schema': 'https://json-schema.org/draft/2020-12/schema',
|
423
|
+
'type': 'object',
|
424
|
+
'required': [],
|
425
|
+
'additionalProperties': False,
|
426
|
+
'properties': {
|
427
|
+
'completions': {
|
428
|
+
'type': 'number',
|
429
|
+
},
|
430
|
+
},
|
431
|
+
}
|
432
|
+
|
433
|
+
|
499
434
|
def get_config_schema():
|
500
435
|
# pylint: disable=import-outside-toplevel
|
501
436
|
from konduktor.data import registry
|
@@ -574,6 +509,17 @@ def get_config_schema():
|
|
574
509
|
},
|
575
510
|
}
|
576
511
|
|
512
|
+
ssh_configs = {
|
513
|
+
'type': 'object',
|
514
|
+
'required': [],
|
515
|
+
'additionalProperties': False,
|
516
|
+
'properties': {
|
517
|
+
'enable': {
|
518
|
+
'type': 'boolean',
|
519
|
+
},
|
520
|
+
},
|
521
|
+
}
|
522
|
+
|
577
523
|
for cloud, config in cloud_configs.items():
|
578
524
|
if cloud == 'kubernetes':
|
579
525
|
config['properties'].update(_REMOTE_IDENTITY_SCHEMA_KUBERNETES)
|
@@ -589,6 +535,7 @@ def get_config_schema():
|
|
589
535
|
'nvidia_gpus': gpu_configs,
|
590
536
|
'allowed_clouds': allowed_clouds,
|
591
537
|
'tailscale': tailscale_configs,
|
538
|
+
'ssh': ssh_configs,
|
592
539
|
**cloud_configs,
|
593
540
|
},
|
594
541
|
}
|
@@ -1,12 +1,13 @@
|
|
1
|
-
konduktor/__init__.py,sha256=
|
1
|
+
konduktor/__init__.py,sha256=ODIjRocI7dlxyMFYh5S2VYJai-4MWm98MSyiSzaGDbA,1540
|
2
2
|
konduktor/adaptors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
3
3
|
konduktor/adaptors/aws.py,sha256=s47Ra-GaqCQibzVfmD0pmwEWHif1EGO5opMbwkLxTCU,8244
|
4
4
|
konduktor/adaptors/common.py,sha256=uTdpKvgBSwYMmynx9wR5kiZQyTrdaw9ZI4KH6Z2E5Hw,4296
|
5
5
|
konduktor/adaptors/gcp.py,sha256=ierTF4z7vwpJ9BsC7LSiwv4uLcjGXscwZOwQrddr2vM,4102
|
6
|
+
konduktor/authentication.py,sha256=jhw_virbyvrY_9WZqOXX3LyOP_HkpfmTssWUMxJVyBg,4564
|
6
7
|
konduktor/backends/__init__.py,sha256=1Q6sqqdeMYarpTX_U-QVywJYf7idiUTRsyP-E4BQSOw,129
|
7
8
|
konduktor/backends/backend.py,sha256=qh0bp94lzoTYZkzyQv2-CVrB5l91FkG2vclXg24UFC0,2910
|
8
|
-
konduktor/backends/jobset.py,sha256=
|
9
|
-
konduktor/backends/jobset_utils.py,sha256=
|
9
|
+
konduktor/backends/jobset.py,sha256=UdhwAuZODLMbLY51Y2zOBsh6wg4Pb84oHVvUKzx3Z2w,8434
|
10
|
+
konduktor/backends/jobset_utils.py,sha256=4vMYOhTENfBL9khzFuj69-Vy4g0sBkUpXX-1bfPnVys,20054
|
10
11
|
konduktor/check.py,sha256=JennyWoaqSKhdyfUldd266KwVXTPJpcYQa4EED4a_BA,7569
|
11
12
|
konduktor/cli.py,sha256=Ii9-2mrc-1f2ksLasA-xRb-JnEi_9ZeCXZ3lJ1GG8H8,23515
|
12
13
|
konduktor/config.py,sha256=J50JxC6MsXMnlrJPXdDUMr38C89xvOO7mR8KJ6fyils,15520
|
@@ -50,12 +51,12 @@ konduktor/dashboard/frontend/server.js,sha256=jcp6_Ww9YJD3uKY07jR3KMlAM6n1QZdxZn
|
|
50
51
|
konduktor/dashboard/frontend/tailwind.config.js,sha256=fCnc48wvioIDOe5ldQ_6RE7F76cP7aU7pDrxBPJx-Fk,366
|
51
52
|
konduktor/data/__init__.py,sha256=KMR2i3E9YcIpiIuCxtRdS7BQ1w2vUAbbve7agziJrLo,213
|
52
53
|
konduktor/data/aws/__init__.py,sha256=_6zWfNNAK1QGgyKqg_yPYWcXlnffchyvIMErYa6tw_U,331
|
53
|
-
konduktor/data/aws/s3.py,sha256=
|
54
|
+
konduktor/data/aws/s3.py,sha256=T4FnCxilNp35bsgmE7j5O3j15FVbgWRdUH8YFXCiwSw,48335
|
54
55
|
konduktor/data/constants.py,sha256=yXVEoTI2we1xOjVSU-bjRCQCLpVvpEvJ0GedXvSwEfw,127
|
55
56
|
konduktor/data/data_utils.py,sha256=yrnu8_cY63TXqfWfFG3yqY2w_tE9UQK9jIQAFQCDVg0,9668
|
56
57
|
konduktor/data/gcp/__init__.py,sha256=rlQxACBC_Vu36mdgPyJgUy4mGc_6Nt_a96JAuaPz2pQ,489
|
57
58
|
konduktor/data/gcp/constants.py,sha256=dMfOiFccM8O6rUi9kClJcbvw1K1VnS1JzzQk3apq8ho,1483
|
58
|
-
konduktor/data/gcp/gcs.py,sha256=
|
59
|
+
konduktor/data/gcp/gcs.py,sha256=nqhCvQuGpHFPoxT5SKgxL25KtZuSg377Nh1bICiQwlc,42057
|
59
60
|
konduktor/data/gcp/utils.py,sha256=FJQcMXZqtMIzjZ98b3lTTc0UbdPUKTDLsOsfJaaH5-s,214
|
60
61
|
konduktor/data/registry.py,sha256=CUbMsN_Q17Pf4wRHkqZrycErEjTP7cLEdgcfwVGcEpc,696
|
61
62
|
konduktor/data/storage.py,sha256=SDKRWDd7PCT9ytuz4cH0CejZj5QmWG_EZhUMVoTzWsc,35308
|
@@ -67,10 +68,10 @@ konduktor/manifests/controller_deployment.yaml,sha256=6p3oSLkEVONZsvKZGqVop0Dhn4
|
|
67
68
|
konduktor/manifests/dashboard_deployment.yaml,sha256=xJLd4FbPMAosI0fIv5_8y7dV9bw0Vsf81l-w4MB_aU8,2837
|
68
69
|
konduktor/manifests/dmesg_daemonset.yaml,sha256=pSWt7YOeTYjS0l0iki1fvHOs7MhY-sH-RQfVW6JJyno,1391
|
69
70
|
konduktor/manifests/pod_cleanup_controller.yaml,sha256=hziL1Ka1kCAEL9R7Tjvpb80iw1vcq9_3gwHCu75Bi0A,3939
|
70
|
-
konduktor/resource.py,sha256=
|
71
|
-
konduktor/task.py,sha256=
|
71
|
+
konduktor/resource.py,sha256=w2PdIrmQaJWA-GLSmVBcg4lxwuxvPulz35_YSKa5o24,19254
|
72
|
+
konduktor/task.py,sha256=2JOHRS4JE2FdN-M3qZKhII1hkUvWHbreNtkf30Mo2lo,35196
|
72
73
|
konduktor/templates/jobset.yaml.j2,sha256=onYiHtXAgk-XBtji994hPu_g0hxnLzvmfxwjbdKdeZc,960
|
73
|
-
konduktor/templates/pod.yaml.j2,sha256=
|
74
|
+
konduktor/templates/pod.yaml.j2,sha256=AobmCpvXRnZuQjfT000vN72Nuk380CCmWPHC_BVrUhM,17161
|
74
75
|
konduktor/usage/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
75
76
|
konduktor/usage/constants.py,sha256=gCL8afIHZhO0dcxbJGpESE9sCC1cBSbeRnQ8GwNOY4M,612
|
76
77
|
konduktor/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
@@ -80,18 +81,18 @@ konduktor/utils/base64_utils.py,sha256=mF-Tw98mFRG70YE4w6s9feuQSCYZHOb8YatBZwMug
|
|
80
81
|
konduktor/utils/common_utils.py,sha256=F5x7k4AdBB44u8PYRkaugORnZKnK3JLqGn1jHOKgUYo,14960
|
81
82
|
konduktor/utils/constants.py,sha256=1DneiTR21lvKUcWdBGwC4I4fD4uPjbjLUilEnJS7rzA,216
|
82
83
|
konduktor/utils/env_options.py,sha256=T41Slzf4Mzl-n45CGXXqdy2fCrYhPNZQ7RP5vmnN4xc,2258
|
83
|
-
konduktor/utils/exceptions.py,sha256=
|
84
|
+
konduktor/utils/exceptions.py,sha256=bOYHk3SHR3XO__p9bPwVPz8g9k6weIRxGRFNkyzgZOA,6676
|
84
85
|
konduktor/utils/kubernetes_enums.py,sha256=SabUueF6Bpzbpa57gyH5VB65xla2N9l8CZmAeYTfGmM,176
|
85
|
-
konduktor/utils/kubernetes_utils.py,sha256=
|
86
|
+
konduktor/utils/kubernetes_utils.py,sha256=1MZHwU4vy-exA4TA5_oTiV-zm1A2ayfeA0T_75DMFM8,23937
|
86
87
|
konduktor/utils/log_utils.py,sha256=lgHCq4OdtJNfbpso-uYGONUCVNsUrUkUWjROarsHt6s,9897
|
87
|
-
konduktor/utils/loki_utils.py,sha256=
|
88
|
+
konduktor/utils/loki_utils.py,sha256=h2ZvZQr1nE_wXXsKsGMjhG2s2MXknNd4icydTR_ruKU,3539
|
88
89
|
konduktor/utils/rich_utils.py,sha256=kdjNe6S2LlpOxyzhFHqMzCz7g4ROC4e7TPWgcbRsrQE,3577
|
89
|
-
konduktor/utils/schemas.py,sha256=
|
90
|
+
konduktor/utils/schemas.py,sha256=2fHsTi3t9q3LXqOPrcpkmPsMbaoJBnuJstd6ULmDiUo,16455
|
90
91
|
konduktor/utils/subprocess_utils.py,sha256=WoFkoFhGecPR8-rF8WJxbIe-YtV94LXz9UG64SDhCY4,9448
|
91
92
|
konduktor/utils/ux_utils.py,sha256=czCwiS1bDqgeKtzAJctczpLwFZzAse7WuozdvzEFYJ4,7437
|
92
93
|
konduktor/utils/validator.py,sha256=tgBghVyedyzGx84-U2Qfoh_cJBE3oUk9gclMW90ORks,691
|
93
|
-
konduktor_nightly-0.1.0.
|
94
|
-
konduktor_nightly-0.1.0.
|
95
|
-
konduktor_nightly-0.1.0.
|
96
|
-
konduktor_nightly-0.1.0.
|
97
|
-
konduktor_nightly-0.1.0.
|
94
|
+
konduktor_nightly-0.1.0.dev20250514104854.dist-info/LICENSE,sha256=MuuqTZbHvmqXR_aNKAXzggdV45ANd3wQ5YI7tnpZhm0,6586
|
95
|
+
konduktor_nightly-0.1.0.dev20250514104854.dist-info/METADATA,sha256=ErMUfOWxJPkbM0by718uNtBgUv-2w7m5sqFzJ_cHc64,4366
|
96
|
+
konduktor_nightly-0.1.0.dev20250514104854.dist-info/WHEEL,sha256=XbeZDeTWKc1w7CSIyre5aMDU_-PohRwTQceYnisIYYY,88
|
97
|
+
konduktor_nightly-0.1.0.dev20250514104854.dist-info/entry_points.txt,sha256=k3nG5wDFIJhNqsZWrHk4d0irIB2Ns9s47cjRWYsTCT8,48
|
98
|
+
konduktor_nightly-0.1.0.dev20250514104854.dist-info/RECORD,,
|
File without changes
|
File without changes
|