konduktor-nightly 0.1.0.dev20250209104336__py3-none-any.whl → 0.1.0.dev20250313070642__py3-none-any.whl
- konduktor/__init__.py +16 -6
- konduktor/adaptors/__init__.py +0 -0
- konduktor/adaptors/common.py +88 -0
- konduktor/adaptors/gcp.py +112 -0
- konduktor/backends/__init__.py +8 -0
- konduktor/backends/backend.py +86 -0
- konduktor/backends/jobset.py +218 -0
- konduktor/backends/jobset_utils.py +447 -0
- konduktor/check.py +192 -0
- konduktor/cli.py +790 -0
- konduktor/cloud_stores.py +158 -0
- konduktor/config.py +420 -0
- konduktor/constants.py +36 -0
- konduktor/controller/constants.py +6 -6
- konduktor/controller/launch.py +3 -3
- konduktor/controller/node.py +5 -5
- konduktor/controller/parse.py +23 -23
- konduktor/dashboard/backend/main.py +57 -57
- konduktor/dashboard/backend/sockets.py +19 -19
- konduktor/data/__init__.py +9 -0
- konduktor/data/constants.py +12 -0
- konduktor/data/data_utils.py +223 -0
- konduktor/data/gcp/__init__.py +19 -0
- konduktor/data/gcp/constants.py +42 -0
- konduktor/data/gcp/gcs.py +906 -0
- konduktor/data/gcp/utils.py +9 -0
- konduktor/data/storage.py +799 -0
- konduktor/data/storage_utils.py +500 -0
- konduktor/execution.py +444 -0
- konduktor/kube_client.py +153 -48
- konduktor/logging.py +49 -5
- konduktor/manifests/dmesg_daemonset.yaml +8 -0
- konduktor/manifests/pod_cleanup_controller.yaml +129 -0
- konduktor/resource.py +478 -0
- konduktor/task.py +867 -0
- konduktor/templates/jobset.yaml.j2 +31 -0
- konduktor/templates/pod.yaml.j2 +185 -0
- konduktor/usage/__init__.py +0 -0
- konduktor/usage/constants.py +21 -0
- konduktor/utils/__init__.py +0 -0
- konduktor/utils/accelerator_registry.py +21 -0
- konduktor/utils/annotations.py +62 -0
- konduktor/utils/base64_utils.py +93 -0
- konduktor/utils/common_utils.py +393 -0
- konduktor/utils/constants.py +5 -0
- konduktor/utils/env_options.py +55 -0
- konduktor/utils/exceptions.py +226 -0
- konduktor/utils/kubernetes_enums.py +8 -0
- konduktor/utils/kubernetes_utils.py +652 -0
- konduktor/utils/log_utils.py +251 -0
- konduktor/utils/loki_utils.py +85 -0
- konduktor/utils/rich_utils.py +123 -0
- konduktor/utils/schemas.py +581 -0
- konduktor/utils/subprocess_utils.py +273 -0
- konduktor/utils/ux_utils.py +216 -0
- konduktor/utils/validator.py +20 -0
- {konduktor_nightly-0.1.0.dev20250209104336.dist-info → konduktor_nightly-0.1.0.dev20250313070642.dist-info}/LICENSE +0 -1
- {konduktor_nightly-0.1.0.dev20250209104336.dist-info → konduktor_nightly-0.1.0.dev20250313070642.dist-info}/METADATA +13 -2
- konduktor_nightly-0.1.0.dev20250313070642.dist-info/RECORD +94 -0
- konduktor_nightly-0.1.0.dev20250209104336.dist-info/RECORD +0 -48
- {konduktor_nightly-0.1.0.dev20250209104336.dist-info → konduktor_nightly-0.1.0.dev20250313070642.dist-info}/WHEEL +0 -0
- {konduktor_nightly-0.1.0.dev20250209104336.dist-info → konduktor_nightly-0.1.0.dev20250313070642.dist-info}/entry_points.txt +0 -0
konduktor/cloud_stores.py
ADDED
@@ -0,0 +1,158 @@
# Proprietary Changes made for Trainy under the Trainy Software License
# Original source: skypilot: https://github.com/skypilot-org/skypilot
# which is Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Cloud object stores.

Currently, used for transferring data in bulk. Thus, this module does not
offer file-level calls (e.g., open, reading, writing).

TODO:
* Better interface.
* Better implementation (e.g., fsspec, smart_open, using each cloud's SDK).
"""

import subprocess
import typing

from konduktor import logging
from konduktor.data import data_utils, gcp, storage_utils

logger = logging.get_logger(__name__)

# TODO(asaiacai): this internal API is shit and should just be unified with
# the storage_utils.AbstractStore class. Shit Berkeley EECS as usual.


class CloudStorage:
    """Interface for a cloud object store."""

    # this needs to be overridden by the subclass
    _STORE: typing.Type[storage_utils.AbstractStore]

    def is_directory(self, url: str) -> bool:
        """Returns whether 'url' is a directory.

        In cloud object stores, a "directory" refers to a regular object whose
        name is a prefix of other objects.
        """
        raise NotImplementedError

    def make_sync_dir_command(self, source: str, destination: str) -> str:
        """Makes a runnable bash command to sync a 'directory'."""
        raise NotImplementedError

    def make_sync_file_command(self, source: str, destination: str) -> str:
        """Makes a runnable bash command to sync a file."""
        raise NotImplementedError

    def check_credentials(self):
        """Checks if the user has access credentials to this cloud."""
        return self._STORE.check_credentials()

    def check_credentials_from_secret(self):
        """Checks if the user has access credentials to this cloud."""
        return self._STORE.check_credentials_from_secret()

    def set_secret_credentials(self):
        """Set the credentials from the secret"""
        return self._STORE.set_secret_credentials()


class GcsCloudStorage(CloudStorage):
    """Google Cloud Storage."""

    # We use gsutil as a basic implementation. One pro is that its -m
    # multi-threaded download is nice, which frees us from implementing
    # parallel workers on our end.
    # The gsutil command is part of the Google Cloud SDK, and we reuse
    # the installation logic here.
    _INSTALL_GSUTIL = gcp.GOOGLE_SDK_INSTALLATION_COMMAND
    _STORE: typing.Type[storage_utils.AbstractStore] = gcp.GcsStore

    @property
    def _gsutil_command(self):
        gsutil_alias, alias_gen = data_utils.get_gsutil_command()
        return (
            f'{alias_gen}; GOOGLE_APPLICATION_CREDENTIALS='
            f'{gcp.DEFAULT_GCP_APPLICATION_CREDENTIAL_PATH}; '
            # Explicitly activate service account. Unlike the gcp packages
            # and other GCP commands, gsutil does not automatically pick up
            # the default credential keys when it is a service account.
            'gcloud auth activate-service-account '
            '--key-file=$GOOGLE_APPLICATION_CREDENTIALS '
            '2> /dev/null || true; '
            f'{gsutil_alias}'
        )

    def is_directory(self, url: str) -> bool:
        """Returns whether 'url' is a directory.
        In cloud object stores, a "directory" refers to a regular object whose
        name is a prefix of other objects.
        """
        commands = [self._INSTALL_GSUTIL]
        commands.append(f'{self._gsutil_command} ls -d {url}')
        command = ' && '.join(commands)
        p = subprocess.run(
            command,
            stdout=subprocess.PIPE,
            shell=True,
            check=True,
            executable='/bin/bash',
        )
        out = p.stdout.decode().strip()
        # Edge Case: Gcloud command is run for first time #437
        out = out.split('\n')[-1]
        # If <url> is a bucket root, then we only need `gsutil` to succeed
        # to make sure the bucket exists. It is already a directory.
        _, key = data_utils.split_gcs_path(url)
        if not key:
            return True
        # Otherwise, gsutil ls -d url will return:
        #   --> url.rstrip('/') if url is not a directory
        #   --> url with an ending '/' if url is a directory
        if not out.endswith('/'):
            assert out == url.rstrip('/'), (out, url)
            return False
        url = url if url.endswith('/') else (url + '/')
        assert out == url, (out, url)
        return True

    def make_sync_dir_command(self, source: str, destination: str) -> str:
        """Downloads a directory using gsutil."""
        download_via_gsutil = (
            f'{self._gsutil_command} rsync -e -r {source} {destination}'
        )
        all_commands = [self._INSTALL_GSUTIL]
        all_commands.append(download_via_gsutil)
        return ' && '.join(all_commands)

    def make_sync_file_command(self, source: str, destination: str) -> str:
        """Downloads a file using gsutil."""
        download_via_gsutil = f'{self._gsutil_command} cp {source} {destination}'
        all_commands = [self._INSTALL_GSUTIL]
        all_commands.append(download_via_gsutil)
        return ' && '.join(all_commands)


# Maps a bucket URI's prefix (scheme) to its corresponding storage class
_REGISTRY = {
    'gs': GcsCloudStorage(),
    # TODO(asaiacai): Add other cloud stores here
    # 's3': S3CloudStorage(),
    # 'r2': R2CloudStorage(),
    # 'cos': IBMCosCloudStorage(),
    # 'oci': OciCloudStorage(),
    # # TODO: This is a hack, as Azure URL starts with https://, we should
    # # refactor the registry to be able to take regex, so that Azure blob can
    # # be identified with `https://(.*?)\.blob\.core\.windows\.net`
    # 'https': AzureBlobCloudStorage()
}
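For orientation, a minimal usage sketch of the registry above (the helper name and URLs are hypothetical, not from the package):

    import urllib.parse

    def _download_command(url: str, destination: str) -> str:
        # Pick the store by URL scheme, e.g. 'gs' for gs://bucket/prefix.
        store = _REGISTRY[urllib.parse.urlsplit(url).scheme]
        if store.is_directory(url):
            return store.make_sync_dir_command(url, destination)
        return store.make_sync_file_command(url, destination)

    # The returned string is a bash command, e.g. run with:
    # subprocess.run(_download_command('gs://my-bucket/ckpts', '/tmp/ckpts'),
    #                shell=True, check=True, executable='/bin/bash')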
konduktor/config.py
ADDED
@@ -0,0 +1,420 @@
# Proprietary Changes made for Trainy under the Trainy Software License
# Original source: skypilot: https://github.com/skypilot-org/skypilot
# which is Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
On module import, we attempt to parse the config located at KONDUKTOR_CONFIG
(default: ~/.konduktor/config.yaml). Caller can then use

>> konduktor_config.loaded()

to check if the config is successfully loaded.

To read a nested-key config:

>> konduktor_config.get_nested(('auth', 'some_auth_config'), default_value)

The config can be overridden by the configs in task YAMLs. Callers are
responsible to provide the override_configs. If the nested key is part of
OVERRIDEABLE_CONFIG_KEYS, override_configs must be provided (can be empty):

>> konduktor_config.get_nested(('docker', 'run_options'), default_value,
                               override_configs={'docker': {'run_options': 'value'}})

To set a value in the nested-key config:

>> config_dict = konduktor_config.set_nested(('auth', 'some_key'), value)

This operation returns a deep-copy dict, and is safe in that any key not found
will not raise an error.

Example usage:

Consider the following config contents:

    a:
        nested: 1
    b: 2

then:

    # Assuming ~/.konduktor/config.yaml exists and can be loaded:
    konduktor_config.loaded()  # ==> True

    konduktor_config.get_nested(('a', 'nested'), None)    # ==> 1
    konduktor_config.get_nested(('a', 'nonexist'), None)  # ==> None
    konduktor_config.get_nested(('a',), None)             # ==> {'nested': 1}

    # If ~/.konduktor/config.yaml doesn't exist or failed to be loaded:
    konduktor_config.loaded()  # ==> False
    konduktor_config.get_nested(('a', 'nested'), None)    # ==> None
    konduktor_config.get_nested(('a', 'nonexist'), None)  # ==> None
    konduktor_config.get_nested(('a',), None)             # ==> None
"""

import copy
import os
import pprint
from typing import Any, Dict, List, Optional, Tuple

import yaml

from konduktor import logging
from konduktor.utils import common_utils, schemas, ux_utils

logger = logging.get_logger(__name__)

# overrides are specified in task YAMLs.
OVERRIDEABLE_CONFIG_KEYS: List[Tuple[str, ...]] = [
    ('kubernetes', 'pod_config'),
]

# The config path is discovered in this order:
#
# (1) (Used internally) If env var {ENV_VAR_CONFIG} exists, use its path;
# (2) If file {CONFIG_PATH} exists, use this file.
#
# If the path discovered by (1) fails to load, we do not attempt to go to step
# 2 in the list.

# (Used internally) An env var holding the path to the local config file. This
# is only used by jobs controller tasks to ensure recoveries of the same job
# use the same config file.
ENV_VAR_CONFIG = 'KONDUKTOR_CONFIG'

# Path to the local config file.
CONFIG_PATH = '~/.konduktor/config.yaml'


class Config(Dict[str, Any]):
    """Konduktor config that supports setting/getting values with nested keys."""

    def get_nested(
        self,
        keys: Tuple[str, ...],
        default_value: Any,
        override_configs: Optional[Dict[str, Any]] = None,
    ) -> Any:
        """Gets a nested key.

        If any key is not found, or any intermediate key does not point to a
        dict value, returns 'default_value'.

        Args:
            keys: A tuple of strings representing the nested keys.
            default_value: The default value to return if the key is not found.
            override_configs: A dict of override configs with the same schema
                as the config file, but only containing the keys to override.

        Returns:
            The value of the nested key, or 'default_value' if not found.
        """
        config = copy.deepcopy(self)
        if override_configs is not None:
            config = _recursive_update(config, override_configs)
        return _get_nested(config, keys, default_value)

    def set_nested(self, keys: Tuple[str, ...], value: Any) -> None:
        """In-place sets a nested key to value.

        Like get_nested(), if any key is not found, this will not raise an
        error.
        """
        override = {}
        for i, key in enumerate(reversed(keys)):
            if i == 0:
                override = {key: value}
            else:
                override = {key: override}
        _recursive_update(self, override)

    @classmethod
    def from_dict(cls, config: Optional[Dict[str, Any]]) -> 'Config':
        if config is None:
            return cls()
        return cls(**config)

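# ----------------------------------------------------------------------------
# Illustrative sketch (hypothetical values, not part of the package source):
# set_nested() builds its override dict inside-out before merging, e.g.
#
#   cfg = Config({'a': {'nested': 1}, 'b': 2})
#   cfg.set_nested(('a', 'new', 'leaf'), 3)
#   # override built internally: {'a': {'new': {'leaf': 3}}}
#   # cfg after merge: {'a': {'nested': 1, 'new': {'leaf': 3}}, 'b': 2}
# ----------------------------------------------------------------------------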

# The loaded config.
_dict = Config()
_loaded_config_path: Optional[str] = None


def get_nested(
    keys: Tuple[str, ...],
    default_value: Any,
    override_configs: Optional[Dict[str, Any]] = None,
) -> Any:
    """Gets a nested key.

    If any key is not found, or any intermediate key does not point to a dict
    value, returns 'default_value'.

    When 'keys' is within OVERRIDEABLE_CONFIG_KEYS, 'override_configs' must be
    provided (can be empty). Otherwise, 'override_configs' must not be provided.

    Args:
        keys: A tuple of strings representing the nested keys.
        default_value: The default value to return if the key is not found.
        override_configs: A dict of override configs with the same schema as
            the config file, but only containing the keys to override.

    Returns:
        The value of the nested key, or 'default_value' if not found.
    """
    assert not (keys in OVERRIDEABLE_CONFIG_KEYS and override_configs is None), (
        f'Override configs must be provided when keys {keys} is within '
        f'OVERRIDEABLE_CONFIG_KEYS: {OVERRIDEABLE_CONFIG_KEYS}'
    )
    assert not (
        keys not in OVERRIDEABLE_CONFIG_KEYS and override_configs is not None
    ), (
        f'Override configs must not be provided when keys {keys} is not within '
        f'OVERRIDEABLE_CONFIG_KEYS: {OVERRIDEABLE_CONFIG_KEYS}'
    )
    return _dict.get_nested(keys, default_value, override_configs)


def set_nested(keys: Tuple[str, ...], value: Any) -> Dict[str, Any]:
    """Returns a deep-copied config with the nested key set to value.

    Like get_nested(), if any key is not found, this will not raise an error.
    """
    copied_dict = copy.deepcopy(_dict)
    copied_dict.set_nested(keys, value)
    return dict(**copied_dict)


def to_dict() -> Config:
    """Returns a deep-copied version of the current config."""
    return copy.deepcopy(_dict)


def _try_load_config() -> None:
    global _dict, _loaded_config_path
    config_path_via_env_var = os.environ.get(ENV_VAR_CONFIG)
    if config_path_via_env_var is not None:
        config_path = os.path.expanduser(config_path_via_env_var)
        if not os.path.exists(config_path):
            with ux_utils.print_exception_no_traceback():
                raise FileNotFoundError(
                    'Config file specified by env var '
                    f'{ENV_VAR_CONFIG} ({config_path!r}) does not '
                    'exist. Please double check the path or unset the env var: '
                    f'unset {ENV_VAR_CONFIG}'
                )
    else:
        config_path = CONFIG_PATH
    config_path = os.path.expanduser(config_path)
    if os.path.exists(config_path):
        logger.debug(f'Using config path: {config_path}')
        try:
            config = common_utils.read_yaml(config_path)
            _dict = Config.from_dict(config)
            _loaded_config_path = config_path
            logger.debug(f'Config loaded:\n{pprint.pformat(_dict)}')
        except yaml.YAMLError as e:
            logger.error(f'Error in loading config file ({config_path}):', e)
        if _dict:
            common_utils.validate_schema(
                _dict,
                schemas.get_config_schema(),
                f'Invalid config YAML ({config_path}). See: '
                'https://konduktor.readthedocs.io/en/latest/reference/config.html. '  # pylint: disable=line-too-long
                'Error: ',
                skip_none=False,
            )

        logger.debug('Config syntax check passed.')


def _check_allowed_and_disallowed_override_keys(
    key: str,
    allowed_override_keys: Optional[List[Tuple[str, ...]]] = None,
    disallowed_override_keys: Optional[List[Tuple[str, ...]]] = None,
) -> Tuple[Optional[List[Tuple[str, ...]]], Optional[List[Tuple[str, ...]]]]:
    allowed_keys_with_matched_prefix: Optional[List[Tuple[str, ...]]] = []
    disallowed_keys_with_matched_prefix: Optional[List[Tuple[str, ...]]] = []
    if allowed_override_keys is not None:
        for nested_key in allowed_override_keys:
            if key == nested_key[0]:
                if len(nested_key) == 1:
                    # Allowed key is fully matched, no need to check further.
                    allowed_keys_with_matched_prefix = None
                    break
                assert allowed_keys_with_matched_prefix is not None
                allowed_keys_with_matched_prefix.append(nested_key[1:])
        if (
            allowed_keys_with_matched_prefix is not None
            and not allowed_keys_with_matched_prefix
        ):
            raise ValueError(
                f'Key {key} is not in allowed override keys: '
                f'{allowed_override_keys}'
            )
    else:
        allowed_keys_with_matched_prefix = None

    if disallowed_override_keys is not None:
        for nested_key in disallowed_override_keys:
            if key == nested_key[0]:
                if len(nested_key) == 1:
                    raise ValueError(
                        f'Key {key} is in disallowed override keys: '
                        f'{disallowed_override_keys}'
                    )
                assert disallowed_keys_with_matched_prefix is not None
                disallowed_keys_with_matched_prefix.append(nested_key[1:])
    else:
        disallowed_keys_with_matched_prefix = None
    return allowed_keys_with_matched_prefix, disallowed_keys_with_matched_prefix


def _recursive_update(
    base_config: Config,
    override_config: Dict[str, Any],
    allowed_override_keys: Optional[List[Tuple[str, ...]]] = None,
    disallowed_override_keys: Optional[List[Tuple[str, ...]]] = None,
) -> Config:
    """Recursively updates base configuration with override configuration"""
    for key, value in override_config.items():
        (next_allowed_override_keys, next_disallowed_override_keys) = (
            _check_allowed_and_disallowed_override_keys(
                key, allowed_override_keys, disallowed_override_keys
            )
        )
        if key == 'kubernetes' and key in base_config:
            merge_k8s_configs(
                base_config[key],
                value,
                next_allowed_override_keys,
                next_disallowed_override_keys,
            )
        elif (
            isinstance(value, dict)
            and key in base_config
            and isinstance(base_config[key], dict)
        ):
            _recursive_update(
                base_config[key],
                value,
                next_allowed_override_keys,
                next_disallowed_override_keys,
            )
        else:
            base_config[key] = value
    return base_config


def _get_nested(
    configs: Optional[Dict[str, Any]],
    keys: Tuple[str, ...],
    default_value: Any,
    pop: bool = False,
) -> Any:
    if configs is None:
        return default_value
    curr = configs
    for i, key in enumerate(keys):
        if isinstance(curr, dict) and key in curr:
            value = curr[key]
            if i == len(keys) - 1:
                if pop:
                    curr.pop(key, default_value)
            curr = value
        else:
            return default_value
    logger.debug(f'User config: {".".join(keys)} -> {curr}')
    return curr


def merge_k8s_configs(
    base_config: Dict[Any, Any],
    override_config: Dict[Any, Any],
    allowed_override_keys: Optional[List[Tuple[str, ...]]] = None,
    disallowed_override_keys: Optional[List[Tuple[str, ...]]] = None,
) -> None:
    """Merge two configs into the base_config.

    Updates nested dictionaries instead of replacing them.
    If a list is encountered, it will be appended to the base_config list.

    An exception is when the key is 'containers', in which case the
    first container in the list will be fetched and merge_dict will be
    called on it with the first container in the base_config list.
    """
    for key, value in override_config.items():
        (next_allowed_override_keys, next_disallowed_override_keys) = (
            _check_allowed_and_disallowed_override_keys(
                key, allowed_override_keys, disallowed_override_keys
            )
        )
        if isinstance(value, dict) and key in base_config:
            merge_k8s_configs(
                base_config[key],
                value,
                next_allowed_override_keys,
                next_disallowed_override_keys,
            )
        elif isinstance(value, list) and key in base_config:
            assert isinstance(
                base_config[key], list
            ), f'Expected {key} to be a list, found {base_config[key]}'
            if key in ['containers', 'imagePullSecrets']:
                # If the key is 'containers' or 'imagePullSecrets', we take the
                # first and only container/secret in the list and merge it, as
                # we only support one container per pod.
                assert len(value) == 1, f'Expected only one container, found {value}'
                merge_k8s_configs(
                    base_config[key][0],
                    value[0],
                    next_allowed_override_keys,
                    next_disallowed_override_keys,
                )
            elif key in ['volumes', 'volumeMounts']:
                # If the key is 'volumes' or 'volumeMounts', we search for an
                # item with the same name and merge it.
                for new_volume in value:
                    new_volume_name = new_volume.get('name')
                    if new_volume_name is not None:
                        destination_volume = next(
                            (
                                v
                                for v in base_config[key]
                                if v.get('name') == new_volume_name
                            ),
                            None,
                        )
                        if destination_volume is not None:
                            merge_k8s_configs(destination_volume, new_volume)
                        else:
                            base_config[key].append(new_volume)
            else:
                base_config[key].extend(value)
        else:
            base_config[key] = value

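# ----------------------------------------------------------------------------
# Illustrative sketch of the list-merge rules above (hypothetical snippets,
# not part of the package source):
#
#   base = {'containers': [{'image': 'img', 'env': [{'name': 'A', 'value': '1'}]}],
#           'volumes': [{'name': 'data', 'emptyDir': {}}]}
#   override = {'containers': [{'env': [{'name': 'B', 'value': '2'}]}],
#               'volumes': [{'name': 'data', 'emptyDir': {'medium': 'Memory'}}]}
#   merge_k8s_configs(base, override)
#   # containers: the single container is merged; its 'env' lists extend -> A, B
#   # volumes: matched by name 'data'; nested dicts merge -> {'medium': 'Memory'}
# ----------------------------------------------------------------------------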

def loaded_config_path() -> Optional[str]:
    """Returns the path to the loaded config file."""
    return _loaded_config_path


# Load on import.
_try_load_config()


def loaded() -> bool:
    """Returns if the user configurations are loaded."""
    return bool(_dict)
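End to end, a hypothetical ~/.konduktor/config.yaml and a read through this module (values are illustrative; ('kubernetes', 'pod_config') is in OVERRIDEABLE_CONFIG_KEYS, so an override dict, possibly empty, must be passed):

    # ~/.konduktor/config.yaml (hypothetical):
    #   kubernetes:
    #     pod_config:
    #       spec:
    #         imagePullSecrets:
    #           - name: my-registry-secret
    from konduktor import config as konduktor_config

    if konduktor_config.loaded():
        pod_config = konduktor_config.get_nested(
            ('kubernetes', 'pod_config'), default_value={}, override_configs={}
        )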
konduktor/constants.py
ADDED
@@ -0,0 +1,36 @@
KONDUKTOR_IGNORE_FILE = '.konduktorignore'
GIT_IGNORE_FILE = '.gitignore'
KONDUKTOR_REMOTE_WORKDIR = '~/konduktor_workdir'
KONDUKTOR_LOGS_DIRECTORY = '~/konduktor_logs'

# Used to translate local file mounts to cloud storage. Please refer to
# konduktor/utils/controller_utils.py::maybe_translate_local_file_mounts_and_sync_up
# for more details.
# TODO(asaiacai): Unlike skypilot, we don't delete buckets after a job completes
# because we want to persist code, logs, and artifacts for debugging.
# Yes, it's a resource leak, but object storage is
# so cheap and code/data is small in comparison.
FILE_MOUNTS_BUCKET_NAME = 'konduktor-filemounts-{username}-{user_hash}'
FILE_MOUNTS_LOCAL_TMP_DIR = 'konduktor-filemounts-files-{id}'
FILE_MOUNTS_REMOTE_TMP_DIR = '/tmp/konduktor-{}-filemounts-files'

# For the API server, use a temporary directory in the same path as the upload
# directory to avoid using a different block device, which may not allow hard
# linking. E.g., in our API server deployment on k8s, ~/.konduktor/ is mounted from a
# persistent volume, so any contents in ~/.konduktor/ cannot be hard linked elsewhere.
FILE_MOUNTS_LOCAL_TMP_BASE_PATH = '~/.konduktor/tmp/'
# Base path for two-hop file mounts translation. See
# controller_utils.translate_local_file_mounts_to_two_hop().
FILE_MOUNTS_CONTROLLER_TMP_BASE_PATH = '~/.konduktor/tmp/controller'


# Used when managed jobs are created and
# files are synced up to the cloud.
FILE_MOUNTS_WORKDIR_SUBPATH = '{task_name}-{run_id}/workdir'
FILE_MOUNTS_SUBPATH = '{task_name}-{run_id}/local-file-mounts/{i}'
FILE_MOUNTS_TMP_SUBPATH = '{task_name}-{run_id}/tmp-files'

# Command that prints the python path.
GET_PYTHON_PATH_CMD = 'which python3'
# Python executable, e.g., /opt/conda/bin/python3
PYTHON_CMD = f'$({GET_PYTHON_PATH_CMD})'
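The FILE_MOUNTS_* templates above are plain str.format() patterns; a short sketch of how they might be filled in (names and values are hypothetical):

    bucket = FILE_MOUNTS_BUCKET_NAME.format(username='alice', user_hash='abcd1234')
    # -> 'konduktor-filemounts-alice-abcd1234'
    subpath = FILE_MOUNTS_WORKDIR_SUBPATH.format(task_name='train', run_id='run-42')
    # -> 'train-run-42/workdir'
    tmp_dir = FILE_MOUNTS_REMOTE_TMP_DIR.format('run-42')
    # -> '/tmp/konduktor-run-42-filemounts-files'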
konduktor/controller/constants.py
CHANGED
@@ -1,4 +1,4 @@
-KONDUKTOR_CONTROLLER_VERSION =
+KONDUKTOR_CONTROLLER_VERSION = '0.1.0'
 
 HARDWARE_XID_ERRORS = set(
     (
@@ -45,12 +45,12 @@ ALLOWLISTED_NVSWITCH_SXID_ERRORS = set(
 
 POD_LOG_ERROR_REGEXES = [
     # possibly indicates degraded nvidia-FM in bad state
-    r
+    r'invalid device ordinal',
 ]
 
 DMESG_ERROR_REGEXES = [
-    r
-    r
-    r
-    r
+    r'(?i)nvidia-peermem nv_get_p2p_free_callback:\d+ '
+    r'ERROR detected invalid context, skipping further processing',
+    r'(?i)NVRM: xid',
+    r'(?i)SXid',
 ]
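A sketch of how these patterns might be applied (the sample dmesg line is hypothetical):

    import re

    line = 'NVRM: Xid (PCI:0000:3b:00): 79, GPU has fallen off the bus.'
    if any(re.search(pattern, line) for pattern in DMESG_ERROR_REGEXES):
        print('hardware error detected')  # '(?i)NVRM: xid' matches case-insensitively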
konduktor/controller/launch.py
CHANGED
@@ -22,12 +22,12 @@ from konduktor.controller import node as node_control
 KONDUKTOR_CONTROLLER_LOG_POLL_SECONDS = 5
 KONDUKTOR_CONTROLLER_HEALTH_CHECK_FREQ = 5
 
-logger = logging.get_logger(
+logger = logging.get_logger('konduktor.controller')
 
 
 def main():
     logger.info(
-        f
+        f'starting konduktor.controller ver. {constants.KONDUKTOR_CONTROLLER_VERSION}'
     )
     while True:
         for _ in range(KONDUKTOR_CONTROLLER_HEALTH_CHECK_FREQ):
@@ -40,5 +40,5 @@ def main():
         node_control.health_check()
 
 
-if __name__ ==
+if __name__ == '__main__':
     main()
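Reading the loop: the inner for-loop makes KONDUKTOR_CONTROLLER_HEALTH_CHECK_FREQ passes over the elided log-polling body (lines 34-39 are not shown in this hunk) before each node_control.health_check(); if each pass sleeps KONDUKTOR_CONTROLLER_LOG_POLL_SECONDS (an assumption about the elided body), health checks run roughly every 5 * 5 = 25 seconds.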