konduktor-nightly 0.1.0.dev20250209104336__py3-none-any.whl → 0.1.0.dev20250313070642__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62)
  1. konduktor/__init__.py +16 -6
  2. konduktor/adaptors/__init__.py +0 -0
  3. konduktor/adaptors/common.py +88 -0
  4. konduktor/adaptors/gcp.py +112 -0
  5. konduktor/backends/__init__.py +8 -0
  6. konduktor/backends/backend.py +86 -0
  7. konduktor/backends/jobset.py +218 -0
  8. konduktor/backends/jobset_utils.py +447 -0
  9. konduktor/check.py +192 -0
  10. konduktor/cli.py +790 -0
  11. konduktor/cloud_stores.py +158 -0
  12. konduktor/config.py +420 -0
  13. konduktor/constants.py +36 -0
  14. konduktor/controller/constants.py +6 -6
  15. konduktor/controller/launch.py +3 -3
  16. konduktor/controller/node.py +5 -5
  17. konduktor/controller/parse.py +23 -23
  18. konduktor/dashboard/backend/main.py +57 -57
  19. konduktor/dashboard/backend/sockets.py +19 -19
  20. konduktor/data/__init__.py +9 -0
  21. konduktor/data/constants.py +12 -0
  22. konduktor/data/data_utils.py +223 -0
  23. konduktor/data/gcp/__init__.py +19 -0
  24. konduktor/data/gcp/constants.py +42 -0
  25. konduktor/data/gcp/gcs.py +906 -0
  26. konduktor/data/gcp/utils.py +9 -0
  27. konduktor/data/storage.py +799 -0
  28. konduktor/data/storage_utils.py +500 -0
  29. konduktor/execution.py +444 -0
  30. konduktor/kube_client.py +153 -48
  31. konduktor/logging.py +49 -5
  32. konduktor/manifests/dmesg_daemonset.yaml +8 -0
  33. konduktor/manifests/pod_cleanup_controller.yaml +129 -0
  34. konduktor/resource.py +478 -0
  35. konduktor/task.py +867 -0
  36. konduktor/templates/jobset.yaml.j2 +31 -0
  37. konduktor/templates/pod.yaml.j2 +185 -0
  38. konduktor/usage/__init__.py +0 -0
  39. konduktor/usage/constants.py +21 -0
  40. konduktor/utils/__init__.py +0 -0
  41. konduktor/utils/accelerator_registry.py +21 -0
  42. konduktor/utils/annotations.py +62 -0
  43. konduktor/utils/base64_utils.py +93 -0
  44. konduktor/utils/common_utils.py +393 -0
  45. konduktor/utils/constants.py +5 -0
  46. konduktor/utils/env_options.py +55 -0
  47. konduktor/utils/exceptions.py +226 -0
  48. konduktor/utils/kubernetes_enums.py +8 -0
  49. konduktor/utils/kubernetes_utils.py +652 -0
  50. konduktor/utils/log_utils.py +251 -0
  51. konduktor/utils/loki_utils.py +85 -0
  52. konduktor/utils/rich_utils.py +123 -0
  53. konduktor/utils/schemas.py +581 -0
  54. konduktor/utils/subprocess_utils.py +273 -0
  55. konduktor/utils/ux_utils.py +216 -0
  56. konduktor/utils/validator.py +20 -0
  57. {konduktor_nightly-0.1.0.dev20250209104336.dist-info → konduktor_nightly-0.1.0.dev20250313070642.dist-info}/LICENSE +0 -1
  58. {konduktor_nightly-0.1.0.dev20250209104336.dist-info → konduktor_nightly-0.1.0.dev20250313070642.dist-info}/METADATA +13 -2
  59. konduktor_nightly-0.1.0.dev20250313070642.dist-info/RECORD +94 -0
  60. konduktor_nightly-0.1.0.dev20250209104336.dist-info/RECORD +0 -48
  61. {konduktor_nightly-0.1.0.dev20250209104336.dist-info → konduktor_nightly-0.1.0.dev20250313070642.dist-info}/WHEEL +0 -0
  62. {konduktor_nightly-0.1.0.dev20250209104336.dist-info → konduktor_nightly-0.1.0.dev20250313070642.dist-info}/entry_points.txt +0 -0
konduktor/cloud_stores.py ADDED
@@ -0,0 +1,158 @@
+ # Proprietary Changes made for Trainy under the Trainy Software License
+ # Original source: skypilot: https://github.com/skypilot-org/skypilot
+ # which is Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ # http://www.apache.org/licenses/LICENSE-2.0
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ """Cloud object stores.
+
+ Currently, used for transferring data in bulk. Thus, this module does not
+ offer file-level calls (e.g., open, reading, writing).
+
+ TODO:
+ * Better interface.
+ * Better implementation (e.g., fsspec, smart_open, using each cloud's SDK).
+ """
+
+ import subprocess
+ import typing
+
+ from konduktor import logging
+ from konduktor.data import data_utils, gcp, storage_utils
+
+ logger = logging.get_logger(__name__)
+
+ # TODO(asaiacai): this internal API is shit and should just be unified with
+ # the storage_utils.AbstractStore class. Shit Berkeley EECS as usual.
+
+
+ class CloudStorage:
+     """Interface for a cloud object store."""
+
+     # this needs to be overridden by the subclass
+     _STORE: typing.Type[storage_utils.AbstractStore]
+
+     def is_directory(self, url: str) -> bool:
+         """Returns whether 'url' is a directory.
+
+         In cloud object stores, a "directory" refers to a regular object whose
+         name is a prefix of other objects.
+         """
+         raise NotImplementedError
+
+     def make_sync_dir_command(self, source: str, destination: str) -> str:
+         """Makes a runnable bash command to sync a 'directory'."""
+         raise NotImplementedError
+
+     def make_sync_file_command(self, source: str, destination: str) -> str:
+         """Makes a runnable bash command to sync a file."""
+         raise NotImplementedError
+
+     def check_credentials(self):
+         """Checks if the user has access credentials to this cloud."""
+         return self._STORE.check_credentials()
+
+     def check_credentials_from_secret(self):
+         """Checks if the user has access credentials to this cloud."""
+         return self._STORE.check_credentials_from_secret()
+
+     def set_secret_credentials(self):
+         """Set the credentials from the secret"""
+         return self._STORE.set_secret_credentials()
+
+
+ class GcsCloudStorage(CloudStorage):
+     """Google Cloud Storage."""
+
+     # We use gsutil as a basic implementation. One pro is that its -m
+     # multi-threaded download is nice, which frees us from implementing
+     # parallel workers on our end.
+     # The gsutil command is part of the Google Cloud SDK, and we reuse
+     # the installation logic here.
+     _INSTALL_GSUTIL = gcp.GOOGLE_SDK_INSTALLATION_COMMAND
+     _STORE: typing.Type[storage_utils.AbstractStore] = gcp.GcsStore
+
+     @property
+     def _gsutil_command(self):
+         gsutil_alias, alias_gen = data_utils.get_gsutil_command()
+         return (
+             f'{alias_gen}; GOOGLE_APPLICATION_CREDENTIALS='
+             f'{gcp.DEFAULT_GCP_APPLICATION_CREDENTIAL_PATH}; '
+             # Explicitly activate service account. Unlike the gcp packages
+             # and other GCP commands, gsutil does not automatically pick up
+             # the default credential keys when it is a service account.
+             'gcloud auth activate-service-account '
+             '--key-file=$GOOGLE_APPLICATION_CREDENTIALS '
+             '2> /dev/null || true; '
+             f'{gsutil_alias}'
+         )
+
+     def is_directory(self, url: str) -> bool:
+         """Returns whether 'url' is a directory.
+         In cloud object stores, a "directory" refers to a regular object whose
+         name is a prefix of other objects.
+         """
+         commands = [self._INSTALL_GSUTIL]
+         commands.append(f'{self._gsutil_command} ls -d {url}')
+         command = ' && '.join(commands)
+         p = subprocess.run(
+             command,
+             stdout=subprocess.PIPE,
+             shell=True,
+             check=True,
+             executable='/bin/bash',
+         )
+         out = p.stdout.decode().strip()
+         # Edge Case: Gcloud command is run for first time #437
+         out = out.split('\n')[-1]
+         # If <url> is a bucket root, then we only need `gsutil` to succeed
+         # to make sure the bucket exists. It is already a directory.
+         _, key = data_utils.split_gcs_path(url)
+         if not key:
+             return True
+         # Otherwise, gsutil ls -d url will return:
+         # --> url.rstrip('/') if url is not a directory
+         # --> url with an ending '/' if url is a directory
+         if not out.endswith('/'):
+             assert out == url.rstrip('/'), (out, url)
+             return False
+         url = url if url.endswith('/') else (url + '/')
+         assert out == url, (out, url)
+         return True
+
+     def make_sync_dir_command(self, source: str, destination: str) -> str:
+         """Downloads a directory using gsutil."""
+         download_via_gsutil = (
+             f'{self._gsutil_command} ' f'rsync -e -r {source} {destination}'
+         )
+         all_commands = [self._INSTALL_GSUTIL]
+         all_commands.append(download_via_gsutil)
+         return ' && '.join(all_commands)
+
+     def make_sync_file_command(self, source: str, destination: str) -> str:
+         """Downloads a file using gsutil."""
+         download_via_gsutil = f'{self._gsutil_command} ' f'cp {source} {destination}'
+         all_commands = [self._INSTALL_GSUTIL]
+         all_commands.append(download_via_gsutil)
+         return ' && '.join(all_commands)
+
+
+ # Maps a bucket URI's prefix (scheme) to its corresponding storage class
+ _REGISTRY = {
+     'gs': GcsCloudStorage(),
+     # TODO(asaiacai): Add other cloud stores here
+     # 's3': S3CloudStorage(),
+     # 'r2': R2CloudStorage(),
+     # 'cos': IBMCosCloudStorage(),
+     # 'oci': OciCloudStorage(),
+     # # TODO: This is a hack, as Azure URL starts with https://, we should
+     # # refactor the registry to be able to take regex, so that Azure blob can
+     # # be identified with `https://(.*?)\.blob\.core\.windows\.net`
+     # 'https': AzureBlobCloudStorage()
+ }
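For orientation (not part of the diff itself): a minimal sketch of how a scheme-keyed registry like `_REGISTRY` above is typically consumed. `get_store_for_url` is a hypothetical helper written for illustration, not an API of this package.

```python
import urllib.parse

# Hypothetical stand-in for the registry above; the real values are
# CloudStorage instances such as GcsCloudStorage().
_REGISTRY = {'gs': object()}


def get_store_for_url(url: str):
    """Pick the storage backend whose scheme matches the URL prefix."""
    scheme = urllib.parse.urlsplit(url).scheme  # 'gs://bucket/key' -> 'gs'
    if scheme not in _REGISTRY:
        raise ValueError(f'Unsupported storage scheme: {scheme!r}')
    return _REGISTRY[scheme]


store = get_store_for_url('gs://my-bucket/data/')  # returns the 'gs' entry
```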
konduktor/config.py ADDED
@@ -0,0 +1,420 @@
+ # Proprietary Changes made for Trainy under the Trainy Software License
+ # Original source: skypilot: https://github.com/skypilot-org/skypilot
+ # which is Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ # http://www.apache.org/licenses/LICENSE-2.0
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ """
+ On module import, we attempt to parse the config located at KONDUKTOR_CONFIG
+ (default: ~/.konduktor/config.yaml). Caller can then use
+
+ >> konduktor_config.loaded()
+
+ to check if the config is successfully loaded.
+
+ To read a nested-key config:
+
+ >> konduktor_config.get_nested(('auth', 'some_auth_config'), default_value)
+
+ The config can be overridden by the configs in task YAMLs. Callers are
+ responsible for providing the override_configs. If the nested key is part of
+ OVERRIDEABLE_CONFIG_KEYS, override_configs must be provided (can be empty):
+
+ >> konduktor_config.get_nested(('docker', 'run_options'), default_value,
+        override_configs={'docker': {'run_options': 'value'}})
+
+ To set a value in the nested-key config:
+
+ >> config_dict = konduktor_config.set_nested(('auth', 'some_key'), value)
+
+ This operation returns a deep-copy dict, and is safe in that any key not found
+ will not raise an error.
+
+ Example usage:
+
+ Consider the following config contents:
+
+     a:
+         nested: 1
+     b: 2
+
+ then:
+
+     # Assuming ~/.konduktor/config.yaml exists and can be loaded:
+     konduktor_config.loaded()  # ==> True
+
+     konduktor_config.get_nested(('a', 'nested'), None)    # ==> 1
+     konduktor_config.get_nested(('a', 'nonexist'), None)  # ==> None
+     konduktor_config.get_nested(('a',), None)             # ==> {'nested': 1}
+
+     # If ~/.konduktor/config.yaml doesn't exist or failed to be loaded:
+     konduktor_config.loaded()  # ==> False
+     konduktor_config.get_nested(('a', 'nested'), None)    # ==> None
+     konduktor_config.get_nested(('a', 'nonexist'), None)  # ==> None
+     konduktor_config.get_nested(('a',), None)             # ==> None
+ """
+
+ import copy
+ import os
+ import pprint
+ from typing import Any, Dict, List, Optional, Tuple
+
+ import yaml
+
+ from konduktor import logging
+ from konduktor.utils import common_utils, schemas, ux_utils
+
+ logger = logging.get_logger(__name__)
+
+ # overrides are specified in task YAMLs.
+ OVERRIDEABLE_CONFIG_KEYS: List[Tuple[str, ...]] = [
+     ('kubernetes', 'pod_config'),
+ ]
+
+ # The config path is discovered in this order:
+ #
+ # (1) (Used internally) If env var {ENV_VAR_CONFIG} exists, use its
+ #     path;
+ # (2) If file {CONFIG_PATH} exists, use this file.
+ #
+ # If the path discovered by (1) fails to load, we do not attempt to go to step
+ # 2 in the list.
+
+ # (Used internally) An env var holding the path to the local config file. This
+ # is only used by jobs controller tasks to ensure recoveries of the same job
+ # use the same config file.
+ ENV_VAR_CONFIG = 'KONDUKTOR_CONFIG'
+
+ # Path to the local config file.
+ CONFIG_PATH = '~/.konduktor/config.yaml'
+
+
+ class Config(Dict[str, Any]):
+     """Konduktor config that supports setting/getting values with nested keys."""
+
+     def get_nested(
+         self,
+         keys: Tuple[str, ...],
+         default_value: Any,
+         override_configs: Optional[Dict[str, Any]] = None,
+     ) -> Any:
+         """Gets a nested key.
+
+         If any key is not found, or any intermediate key does not point to a
+         dict value, returns 'default_value'.
+
+         Args:
+             keys: A tuple of strings representing the nested keys.
+             default_value: The default value to return if the key is not found.
+             override_configs: A dict of override configs with the same schema as
+                 the config file, but only containing the keys to override.
+
+         Returns:
+             The value of the nested key, or 'default_value' if not found.
+         """
+         config = copy.deepcopy(self)
+         if override_configs is not None:
+             config = _recursive_update(config, override_configs)
+         return _get_nested(config, keys, default_value)
+
+     def set_nested(self, keys: Tuple[str, ...], value: Any) -> None:
+         """In-place sets a nested key to value.
+
+         Like get_nested(), if any key is not found, this will not raise an
+         error.
+         """
+         override = {}
+         for i, key in enumerate(reversed(keys)):
+             if i == 0:
+                 override = {key: value}
+             else:
+                 override = {key: override}
+         _recursive_update(self, override)
+
+     @classmethod
+     def from_dict(cls, config: Optional[Dict[str, Any]]) -> 'Config':
+         if config is None:
+             return cls()
+         return cls(**config)
+
+
+ # The loaded config.
+ _dict = Config()
+ _loaded_config_path: Optional[str] = None
+
+
+ def get_nested(
+     keys: Tuple[str, ...],
+     default_value: Any,
+     override_configs: Optional[Dict[str, Any]] = None,
+ ) -> Any:
+     """Gets a nested key.
+
+     If any key is not found, or any intermediate key does not point to a dict
+     value, returns 'default_value'.
+
+     When 'keys' is within OVERRIDEABLE_CONFIG_KEYS, 'override_configs' must be
+     provided (can be empty). Otherwise, 'override_configs' must not be provided.
+
+     Args:
+         keys: A tuple of strings representing the nested keys.
+         default_value: The default value to return if the key is not found.
+         override_configs: A dict of override configs with the same schema as
+             the config file, but only containing the keys to override.
+
+     Returns:
+         The value of the nested key, or 'default_value' if not found.
+     """
+     assert not (keys in OVERRIDEABLE_CONFIG_KEYS and override_configs is None), (
+         f'Override configs must be provided when keys {keys} is within '
+         'OVERRIDEABLE_CONFIG_KEYS: '
+         f'{OVERRIDEABLE_CONFIG_KEYS}'
+     )
+     assert not (
+         keys not in OVERRIDEABLE_CONFIG_KEYS and override_configs is not None
+     ), (
+         f'Override configs must not be provided when keys {keys} is not within '
+         'OVERRIDEABLE_CONFIG_KEYS: '
+         f'{OVERRIDEABLE_CONFIG_KEYS}'
+     )
+     return _dict.get_nested(keys, default_value, override_configs)
+
+
+ def set_nested(keys: Tuple[str, ...], value: Any) -> Dict[str, Any]:
+     """Returns a deep-copied config with the nested key set to value.
+
+     Like get_nested(), if any key is not found, this will not raise an error.
+     """
+     copied_dict = copy.deepcopy(_dict)
+     copied_dict.set_nested(keys, value)
+     return dict(**copied_dict)
+
+
+ def to_dict() -> Config:
+     """Returns a deep-copied version of the current config."""
+     return copy.deepcopy(_dict)
+
+
+ def _try_load_config() -> None:
+     global _dict, _loaded_config_path
+     config_path_via_env_var = os.environ.get(ENV_VAR_CONFIG)
+     if config_path_via_env_var is not None:
+         config_path = os.path.expanduser(config_path_via_env_var)
+         if not os.path.exists(config_path):
+             with ux_utils.print_exception_no_traceback():
+                 raise FileNotFoundError(
+                     'Config file specified by env var '
+                     f'{ENV_VAR_CONFIG} ({config_path!r}) does not '
+                     'exist. Please double check the path or unset the env var: '
+                     f'unset {ENV_VAR_CONFIG}'
+                 )
+     else:
+         config_path = CONFIG_PATH
+     config_path = os.path.expanduser(config_path)
+     if os.path.exists(config_path):
+         logger.debug(f'Using config path: {config_path}')
+         try:
+             config = common_utils.read_yaml(config_path)
+             _dict = Config.from_dict(config)
+             _loaded_config_path = config_path
+             logger.debug(f'Config loaded:\n{pprint.pformat(_dict)}')
+         except yaml.YAMLError as e:
+             logger.error(f'Error in loading config file ({config_path}):', e)
+         if _dict:
+             common_utils.validate_schema(
+                 _dict,
+                 schemas.get_config_schema(),
+                 f'Invalid config YAML ({config_path}). See: '
+                 'https://konduktor.readthedocs.io/en/latest/reference/config.html. '  # pylint: disable=line-too-long
+                 'Error: ',
+                 skip_none=False,
+             )
+
+         logger.debug('Config syntax check passed.')
+
+
+ def _check_allowed_and_disallowed_override_keys(
+     key: str,
+     allowed_override_keys: Optional[List[Tuple[str, ...]]] = None,
+     disallowed_override_keys: Optional[List[Tuple[str, ...]]] = None,
+ ) -> Tuple[Optional[List[Tuple[str, ...]]], Optional[List[Tuple[str, ...]]]]:
+     allowed_keys_with_matched_prefix: Optional[List[Tuple[str, ...]]] = []
+     disallowed_keys_with_matched_prefix: Optional[List[Tuple[str, ...]]] = []
+     if allowed_override_keys is not None:
+         for nested_key in allowed_override_keys:
+             if key == nested_key[0]:
+                 if len(nested_key) == 1:
+                     # Allowed key is fully matched, no need to check further.
+                     allowed_keys_with_matched_prefix = None
+                     break
+                 assert allowed_keys_with_matched_prefix is not None
+                 allowed_keys_with_matched_prefix.append(nested_key[1:])
+         if (
+             allowed_keys_with_matched_prefix is not None
+             and not allowed_keys_with_matched_prefix
+         ):
+             raise ValueError(
+                 f'Key {key} is not in allowed override keys: '
+                 f'{allowed_override_keys}'
+             )
+     else:
+         allowed_keys_with_matched_prefix = None
+
+     if disallowed_override_keys is not None:
+         for nested_key in disallowed_override_keys:
+             if key == nested_key[0]:
+                 if len(nested_key) == 1:
+                     raise ValueError(
+                         f'Key {key} is in disallowed override keys: '
+                         f'{disallowed_override_keys}'
+                     )
+                 assert disallowed_keys_with_matched_prefix is not None
+                 disallowed_keys_with_matched_prefix.append(nested_key[1:])
+     else:
+         disallowed_keys_with_matched_prefix = None
+     return allowed_keys_with_matched_prefix, disallowed_keys_with_matched_prefix
+
+
+ def _recursive_update(
+     base_config: Config,
+     override_config: Dict[str, Any],
+     allowed_override_keys: Optional[List[Tuple[str, ...]]] = None,
+     disallowed_override_keys: Optional[List[Tuple[str, ...]]] = None,
+ ) -> Config:
+     """Recursively updates base configuration with override configuration"""
+     for key, value in override_config.items():
+         (next_allowed_override_keys, next_disallowed_override_keys) = (
+             _check_allowed_and_disallowed_override_keys(
+                 key, allowed_override_keys, disallowed_override_keys
+             )
+         )
+         if key == 'kubernetes' and key in base_config:
+             merge_k8s_configs(
+                 base_config[key],
+                 value,
+                 next_allowed_override_keys,
+                 next_disallowed_override_keys,
+             )
+         elif (
+             isinstance(value, dict)
+             and key in base_config
+             and isinstance(base_config[key], dict)
+         ):
+             _recursive_update(
+                 base_config[key],
+                 value,
+                 next_allowed_override_keys,
+                 next_disallowed_override_keys,
+             )
+         else:
+             base_config[key] = value
+     return base_config
+
+
+ def _get_nested(
+     configs: Optional[Dict[str, Any]],
+     keys: Tuple[str, ...],
+     default_value: Any,
+     pop: bool = False,
+ ) -> Any:
+     if configs is None:
+         return default_value
+     curr = configs
+     for i, key in enumerate(keys):
+         if isinstance(curr, dict) and key in curr:
+             value = curr[key]
+             if i == len(keys) - 1:
+                 if pop:
+                     curr.pop(key, default_value)
+             curr = value
+         else:
+             return default_value
+     logger.debug(f'User config: {".".join(keys)} -> {curr}')
+     return curr
+
+
+ def merge_k8s_configs(
+     base_config: Dict[Any, Any],
+     override_config: Dict[Any, Any],
+     allowed_override_keys: Optional[List[Tuple[str, ...]]] = None,
+     disallowed_override_keys: Optional[List[Tuple[str, ...]]] = None,
+ ) -> None:
+     """Merge two configs into the base_config.
+
+     Updates nested dictionaries instead of replacing them.
+     If a list is encountered, it will be appended to the base_config list.
+
+     An exception is when the key is 'containers', in which case the
+     first container in the list will be fetched and merge_dict will be
+     called on it with the first container in the base_config list.
+     """
+     for key, value in override_config.items():
+         (next_allowed_override_keys, next_disallowed_override_keys) = (
+             _check_allowed_and_disallowed_override_keys(
+                 key, allowed_override_keys, disallowed_override_keys
+             )
+         )
+         if isinstance(value, dict) and key in base_config:
+             merge_k8s_configs(
+                 base_config[key],
+                 value,
+                 next_allowed_override_keys,
+                 next_disallowed_override_keys,
+             )
+         elif isinstance(value, list) and key in base_config:
+             assert isinstance(
+                 base_config[key], list
+             ), f'Expected {key} to be a list, found {base_config[key]}'
+             if key in ['containers', 'imagePullSecrets']:
+                 # If the key is 'containers' or 'imagePullSecrets', we take the
+                 # first and only container/secret in the list and merge it, as
+                 # we only support one container per pod.
+                 assert len(value) == 1, f'Expected only one container, found {value}'
+                 merge_k8s_configs(
+                     base_config[key][0],
+                     value[0],
+                     next_allowed_override_keys,
+                     next_disallowed_override_keys,
+                 )
+             elif key in ['volumes', 'volumeMounts']:
+                 # If the key is 'volumes' or 'volumeMounts', we search for
+                 # item with the same name and merge it.
+                 for new_volume in value:
+                     new_volume_name = new_volume.get('name')
+                     if new_volume_name is not None:
+                         destination_volume = next(
+                             (
+                                 v
+                                 for v in base_config[key]
+                                 if v.get('name') == new_volume_name
+                             ),
+                             None,
+                         )
+                         if destination_volume is not None:
+                             merge_k8s_configs(destination_volume, new_volume)
+                         else:
+                             base_config[key].append(new_volume)
+             else:
+                 base_config[key].extend(value)
+         else:
+             base_config[key] = value
+
+
+ def loaded_config_path() -> Optional[str]:
+     """Returns the path to the loaded config file."""
+     return _loaded_config_path
+
+
+ # Load on import.
+ _try_load_config()
+
+
+ def loaded() -> bool:
+     """Returns if the user configurations are loaded."""
+     return bool(_dict)
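As a quick illustration of the lookup semantics documented in the module docstring above, here is a self-contained sketch (not the package code) of nested gets over a config such as `{'a': {'nested': 1}, 'b': 2}`:

```python
from typing import Any, Dict, Tuple


def get_nested(config: Dict[str, Any], keys: Tuple[str, ...], default: Any) -> Any:
    """Walk nested dict keys; return `default` on any miss or non-dict hop."""
    curr: Any = config
    for key in keys:
        if isinstance(curr, dict) and key in curr:
            curr = curr[key]
        else:
            return default
    return curr


config = {'a': {'nested': 1}, 'b': 2}
assert get_nested(config, ('a', 'nested'), None) == 1
assert get_nested(config, ('a', 'nonexist'), None) is None
assert get_nested(config, ('a',), None) == {'nested': 1}
```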
konduktor/constants.py ADDED
@@ -0,0 +1,36 @@
+ KONDUKTOR_IGNORE_FILE = '.konduktorignore'
+ GIT_IGNORE_FILE = '.gitignore'
+ KONDUKTOR_REMOTE_WORKDIR = '~/konduktor_workdir'
+ KONDUKTOR_LOGS_DIRECTORY = '~/konduktor_logs'
+
+ # Used for translating local file mounts to cloud storage. Please refer to
+ # konduktor/utils/controller_utils.py::maybe_translate_local_file_mounts_and_sync_up
+ # for more details.
+ # TODO(asaiacai): Unlike skypilot, we don't delete buckets after a job completes
+ # because we want to persist code, logs, and artifacts for debugging.
+ # Yes, it's a resource leak, but object storage is
+ # so cheap and code/data is small in comparison.
+ FILE_MOUNTS_BUCKET_NAME = 'konduktor-filemounts-{username}-{user_hash}'
+ FILE_MOUNTS_LOCAL_TMP_DIR = 'konduktor-filemounts-files-{id}'
+ FILE_MOUNTS_REMOTE_TMP_DIR = '/tmp/konduktor-{}-filemounts-files'
+
+ # For the API server, use a temporary directory in the same path as the upload
+ # directory to avoid using a different block device, which may not allow hard
+ # linking. E.g., in our API server deployment on k8s, ~/.konduktor/ is mounted from a
+ # persistent volume, so any contents in ~/.konduktor/ cannot be hard linked elsewhere.
+ FILE_MOUNTS_LOCAL_TMP_BASE_PATH = '~/.konduktor/tmp/'
+ # Base path for two-hop file mounts translation. See
+ # controller_utils.translate_local_file_mounts_to_two_hop().
+ FILE_MOUNTS_CONTROLLER_TMP_BASE_PATH = '~/.konduktor/tmp/controller'
+
+
+ # Used when managed jobs are created and
+ # files are synced up to the cloud.
+ FILE_MOUNTS_WORKDIR_SUBPATH = '{task_name}-{run_id}/workdir'
+ FILE_MOUNTS_SUBPATH = '{task_name}-{run_id}/local-file-mounts/{i}'
+ FILE_MOUNTS_TMP_SUBPATH = '{task_name}-{run_id}/tmp-files'
+
+ # Command that prints the path to the python executable.
+ GET_PYTHON_PATH_CMD = 'which python3'
+ # Python executable, e.g., /opt/conda/bin/python3
+ PYTHON_CMD = f'$({GET_PYTHON_PATH_CMD})'
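To make the template constants above concrete, here is how they expand with `str.format` (the username, hash, task name, and run ID below are made-up values):

```python
FILE_MOUNTS_BUCKET_NAME = 'konduktor-filemounts-{username}-{user_hash}'
FILE_MOUNTS_WORKDIR_SUBPATH = '{task_name}-{run_id}/workdir'

# Example values; real ones come from the user's environment and the run ID.
bucket = FILE_MOUNTS_BUCKET_NAME.format(username='alice', user_hash='ab12')
subpath = FILE_MOUNTS_WORKDIR_SUBPATH.format(task_name='train', run_id='42')
assert bucket == 'konduktor-filemounts-alice-ab12'
assert subpath == 'train-42/workdir'
```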
konduktor/controller/constants.py CHANGED
@@ -1,4 +1,4 @@
- KONDUKTOR_CONTROLLER_VERSION = "0.1.0"
+ KONDUKTOR_CONTROLLER_VERSION = '0.1.0'
 
  HARDWARE_XID_ERRORS = set(
      (
@@ -45,12 +45,12 @@ ALLOWLISTED_NVSWITCH_SXID_ERRORS = set(
 
  POD_LOG_ERROR_REGEXES = [
      # possibly indicates degraded nvidia-FM in bad state
-     r"`invalid device ordinal`",
+     r'`invalid device ordinal`',
  ]
 
  DMESG_ERROR_REGEXES = [
-     r"`(?i)nvidia-peermem nv_get_p2p_free_callback:\d+ "
-     r"ERROR detected invalid context, skipping further processing`",
-     r"`(?i)NVRM: xid`",
-     r"`(?i)SXid`",
+     r'`(?i)nvidia-peermem nv_get_p2p_free_callback:\d+ '
+     r'ERROR detected invalid context, skipping further processing`',
+     r'`(?i)NVRM: xid`',
+     r'`(?i)SXid`',
  ]
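For context, regex lists like `DMESG_ERROR_REGEXES` are matched against log lines. A hedged sketch of that pattern follows; the regexes are simplified here (backtick delimiters dropped, so the `(?i)` flag sits at the start of each pattern, as Python 3.11+ requires), and the log line is an invented example:

```python
import re

# Simplified patterns; the real constants wrap these in backticks.
DMESG_ERROR_REGEXES = [
    r'(?i)NVRM: xid',
    r'(?i)SXid',
]


def matches_known_error(line: str) -> bool:
    """Return True if any known GPU/NVSwitch error pattern matches."""
    return any(re.search(p, line) for p in DMESG_ERROR_REGEXES)


assert matches_known_error('NVRM: Xid (PCI:0000:3b:00): 79, pid=1234')
```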
konduktor/controller/launch.py CHANGED
@@ -22,12 +22,12 @@ from konduktor.controller import node as node_control
  KONDUKTOR_CONTROLLER_LOG_POLL_SECONDS = 5
  KONDUKTOR_CONTROLLER_HEALTH_CHECK_FREQ = 5
 
- logger = logging.get_logger("konduktor.controller")
+ logger = logging.get_logger('konduktor.controller')
 
 
  def main():
      logger.info(
-         f"starting konduktor.controller ver. {constants.KONDUKTOR_CONTROLLER_VERSION}"
+         f'starting konduktor.controller ver. {constants.KONDUKTOR_CONTROLLER_VERSION}'
      )
      while True:
          for _ in range(KONDUKTOR_CONTROLLER_HEALTH_CHECK_FREQ):
@@ -40,5 +40,5 @@ def main():
          node_control.health_check()
 
 
- if __name__ == "__main__":
+ if __name__ == '__main__':
      main()