konduktor-nightly 0.1.0.dev20251128104812__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (107) hide show
  1. konduktor/__init__.py +49 -0
  2. konduktor/adaptors/__init__.py +0 -0
  3. konduktor/adaptors/aws.py +221 -0
  4. konduktor/adaptors/common.py +118 -0
  5. konduktor/adaptors/gcp.py +126 -0
  6. konduktor/authentication.py +124 -0
  7. konduktor/backends/__init__.py +6 -0
  8. konduktor/backends/backend.py +86 -0
  9. konduktor/backends/constants.py +21 -0
  10. konduktor/backends/deployment.py +204 -0
  11. konduktor/backends/deployment_utils.py +1351 -0
  12. konduktor/backends/jobset.py +225 -0
  13. konduktor/backends/jobset_utils.py +726 -0
  14. konduktor/backends/pod_utils.py +501 -0
  15. konduktor/check.py +184 -0
  16. konduktor/cli.py +1945 -0
  17. konduktor/config.py +420 -0
  18. konduktor/constants.py +36 -0
  19. konduktor/controller/__init__.py +0 -0
  20. konduktor/controller/constants.py +56 -0
  21. konduktor/controller/launch.py +44 -0
  22. konduktor/controller/node.py +116 -0
  23. konduktor/controller/parse.py +111 -0
  24. konduktor/dashboard/README.md +30 -0
  25. konduktor/dashboard/backend/main.py +169 -0
  26. konduktor/dashboard/backend/sockets.py +154 -0
  27. konduktor/dashboard/frontend/.eslintrc.json +3 -0
  28. konduktor/dashboard/frontend/.gitignore +36 -0
  29. konduktor/dashboard/frontend/app/api/jobs/route.js +71 -0
  30. konduktor/dashboard/frontend/app/api/namespaces/route.js +69 -0
  31. konduktor/dashboard/frontend/app/components/Grafana.jsx +66 -0
  32. konduktor/dashboard/frontend/app/components/JobsData.jsx +197 -0
  33. konduktor/dashboard/frontend/app/components/LogsData.jsx +139 -0
  34. konduktor/dashboard/frontend/app/components/NavMenu.jsx +39 -0
  35. konduktor/dashboard/frontend/app/components/NavTabs.jsx +73 -0
  36. konduktor/dashboard/frontend/app/components/NavTabs2.jsx +30 -0
  37. konduktor/dashboard/frontend/app/components/SelectBtn.jsx +27 -0
  38. konduktor/dashboard/frontend/app/components/lib/utils.js +6 -0
  39. konduktor/dashboard/frontend/app/components/ui/chip-select.jsx +78 -0
  40. konduktor/dashboard/frontend/app/components/ui/input.jsx +19 -0
  41. konduktor/dashboard/frontend/app/components/ui/navigation-menu.jsx +104 -0
  42. konduktor/dashboard/frontend/app/components/ui/select.jsx +120 -0
  43. konduktor/dashboard/frontend/app/favicon.ico +0 -0
  44. konduktor/dashboard/frontend/app/globals.css +120 -0
  45. konduktor/dashboard/frontend/app/jobs/page.js +10 -0
  46. konduktor/dashboard/frontend/app/layout.js +22 -0
  47. konduktor/dashboard/frontend/app/logs/page.js +11 -0
  48. konduktor/dashboard/frontend/app/page.js +12 -0
  49. konduktor/dashboard/frontend/jsconfig.json +7 -0
  50. konduktor/dashboard/frontend/next.config.mjs +4 -0
  51. konduktor/dashboard/frontend/package-lock.json +6687 -0
  52. konduktor/dashboard/frontend/package.json +37 -0
  53. konduktor/dashboard/frontend/postcss.config.mjs +8 -0
  54. konduktor/dashboard/frontend/server.js +64 -0
  55. konduktor/dashboard/frontend/tailwind.config.js +17 -0
  56. konduktor/data/__init__.py +9 -0
  57. konduktor/data/aws/__init__.py +15 -0
  58. konduktor/data/aws/s3.py +1138 -0
  59. konduktor/data/constants.py +7 -0
  60. konduktor/data/data_utils.py +268 -0
  61. konduktor/data/gcp/__init__.py +19 -0
  62. konduktor/data/gcp/constants.py +42 -0
  63. konduktor/data/gcp/gcs.py +994 -0
  64. konduktor/data/gcp/utils.py +9 -0
  65. konduktor/data/registry.py +19 -0
  66. konduktor/data/storage.py +812 -0
  67. konduktor/data/storage_utils.py +535 -0
  68. konduktor/execution.py +447 -0
  69. konduktor/kube_client.py +237 -0
  70. konduktor/logging.py +111 -0
  71. konduktor/manifests/aibrix-setup.yaml +430 -0
  72. konduktor/manifests/apoxy-setup.yaml +184 -0
  73. konduktor/manifests/apoxy-setup2.yaml +98 -0
  74. konduktor/manifests/controller_deployment.yaml +69 -0
  75. konduktor/manifests/dashboard_deployment.yaml +131 -0
  76. konduktor/manifests/dmesg_daemonset.yaml +57 -0
  77. konduktor/manifests/pod_cleanup_controller.yaml +129 -0
  78. konduktor/resource.py +546 -0
  79. konduktor/serving.py +153 -0
  80. konduktor/task.py +949 -0
  81. konduktor/templates/deployment.yaml.j2 +191 -0
  82. konduktor/templates/jobset.yaml.j2 +43 -0
  83. konduktor/templates/pod.yaml.j2 +563 -0
  84. konduktor/usage/__init__.py +0 -0
  85. konduktor/usage/constants.py +21 -0
  86. konduktor/utils/__init__.py +0 -0
  87. konduktor/utils/accelerator_registry.py +17 -0
  88. konduktor/utils/annotations.py +62 -0
  89. konduktor/utils/base64_utils.py +95 -0
  90. konduktor/utils/common_utils.py +426 -0
  91. konduktor/utils/constants.py +5 -0
  92. konduktor/utils/env_options.py +55 -0
  93. konduktor/utils/exceptions.py +234 -0
  94. konduktor/utils/kubernetes_enums.py +8 -0
  95. konduktor/utils/kubernetes_utils.py +763 -0
  96. konduktor/utils/log_utils.py +467 -0
  97. konduktor/utils/loki_utils.py +102 -0
  98. konduktor/utils/rich_utils.py +123 -0
  99. konduktor/utils/schemas.py +625 -0
  100. konduktor/utils/subprocess_utils.py +273 -0
  101. konduktor/utils/ux_utils.py +247 -0
  102. konduktor/utils/validator.py +461 -0
  103. konduktor_nightly-0.1.0.dev20251128104812.dist-info/LICENSE +91 -0
  104. konduktor_nightly-0.1.0.dev20251128104812.dist-info/METADATA +98 -0
  105. konduktor_nightly-0.1.0.dev20251128104812.dist-info/RECORD +107 -0
  106. konduktor_nightly-0.1.0.dev20251128104812.dist-info/WHEEL +4 -0
  107. konduktor_nightly-0.1.0.dev20251128104812.dist-info/entry_points.txt +3 -0
konduktor/config.py ADDED
@@ -0,0 +1,420 @@
1
+ # Proprietary Changes made for Trainy under the Trainy Software License
2
+ # Original source: skypilot: https://github.com/skypilot-org/skypilot
3
+ # which is Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ # http://www.apache.org/licenses/LICENSE-2.0
7
+ # Unless required by applicable law or agreed to in writing, software
8
+ # distributed under the License is distributed on an "AS IS" BASIS,
9
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10
+ # See the License for the specific language governing permissions and
11
+ # limitations under the License.
12
+
13
+ """
14
+ On module import, we attempt to parse the config located at KONDUKTOR_CONFIG
15
+ (default: ~/.konduktor/config.yaml). Caller can then use
16
+
17
+ >> konduktor_config.loaded()
18
+
19
+ to check if the config is successfully loaded.
20
+
21
+ To read a nested-key config:
22
+
23
+ >> konduktor_config.get_nested(('auth', 'some_auth_config'), default_value)
24
+
25
+ The config can be overridden by the configs in task YAMLs. Callers are
26
+ responsible to provide the override_configs. If the nested key is part of
27
+ OVERRIDEABLE_CONFIG_KEYS, override_configs must be provided (can be empty):
28
+
29
+ >> konduktor_config.get_nested(('docker', 'run_options'), default_value
30
+ override_configs={'docker': {'run_options': 'value'}})
31
+
32
+ To set a value in the nested-key config:
33
+
34
+ >> config_dict = konduktor_config.set_nested(('auth', 'some_key'), value)
35
+
36
+ This operation returns a deep-copy dict, and is safe in that any key not found
37
+ will not raise an error.
38
+
39
+ Example usage:
40
+
41
+ Consider the following config contents:
42
+
43
+ a:
44
+ nested: 1
45
+ b: 2
46
+
47
+ then:
48
+
49
+ # Assuming ~/.konduktor/config.yaml exists and can be loaded:
50
+ konduktor_config.loaded() # ==> True
51
+
52
+ konduktor_config.get_nested(('a', 'nested'), None) # ==> 1
53
+ konduktor_config.get_nested(('a', 'nonexist'), None) # ==> None
54
+ konduktor_config.get_nested(('a',), None) # ==> {'nested': 1}
55
+
56
+ # If ~/.konduktor/config.yaml doesn't exist or failed to be loaded:
57
+ konduktor_config.loaded() # ==> False
58
+ konduktor_config.get_nested(('a', 'nested'), None) # ==> None
59
+ konduktor_config.get_nested(('a', 'nonexist'), None) # ==> None
60
+ konduktor_config.get_nested(('a',), None) # ==> None
61
+ """
62
+
63
+ import copy
64
+ import os
65
+ import pprint
66
+ from typing import Any, Dict, List, Optional, Tuple
67
+
68
+ import yaml # type: ignore
69
+
70
+ from konduktor import logging
71
+ from konduktor.utils import common_utils, schemas, ux_utils
72
+
73
+ logger = logging.get_logger(__name__)
74
+
75
# overrides are specified in task YAMLs.
# Only these nested key paths may be overridden per-task; get_nested() asserts
# that override_configs is provided iff the requested key path is listed here.
OVERRIDEABLE_CONFIG_KEYS: List[Tuple[str, ...]] = [
    ('kubernetes', 'pod_config'),
]

# The config path is discovered in this order:
#
# (1) (Used internally) If env var {ENV_VAR_CONFIG} exists, use its
# path;
# (2) If file {CONFIG_PATH} exists, use this file.
#
# If the path discovered by (1) fails to load, we do not attempt to go to step
# 2 in the list.

# (Used internally) An env var holding the path to the local config file. This
# is only used by jobs controller tasks to ensure recoveries of the same job
# use the same config file.
ENV_VAR_CONFIG = 'KONDUKTOR_CONFIG'

# Path to the local config file.
CONFIG_PATH = '~/.konduktor/config.yaml'
96
+
97
+
98
class Config(Dict[str, Any]):
    """Konduktor config that supports setting/getting values with nested keys."""

    def get_nested(
        self,
        keys: Tuple[str, ...],
        default_value: Any,
        override_configs: Optional[Dict[str, Any]] = None,
    ) -> Any:
        """Gets a nested key.

        If any key is not found, or any intermediate key does not point to a
        dict value, returns 'default_value'.

        Args:
            keys: A tuple of strings representing the nested keys.
            default_value: The default value to return if the key is not found.
            override_configs: A dict of override configs with the same schema
                as the config file, but only containing the keys to override.

        Returns:
            The value of the nested key, or 'default_value' if not found.
        """
        # Operate on a deep copy so neither self nor the overrides mutate.
        merged = copy.deepcopy(self)
        if override_configs is not None:
            merged = _recursive_update(merged, override_configs)
        return _get_nested(merged, keys, default_value)

    def set_nested(self, keys: Tuple[str, ...], value: Any) -> None:
        """In-place sets a nested key to value.

        Like get_nested(), if any key is not found, this will not raise an
        error.
        """
        if not keys:
            # Nothing to wrap; merging an empty override is a no-op.
            return
        # Wrap 'value' in single-key dicts, innermost key first, producing
        # {k0: {k1: ... {kN: value}}}, then merge that tree into self.
        nested: Any = value
        for key in reversed(keys):
            nested = {key: nested}
        _recursive_update(self, nested)

    @classmethod
    def from_dict(cls, config: Optional[Dict[str, Any]]) -> 'Config':
        """Builds a Config from a plain dict; None yields an empty Config."""
        return cls() if config is None else cls(**config)
145
+
146
+
147
# The loaded config.
_dict = Config()
# Path of the config file that was actually loaded, if any (see
# loaded_config_path()).
_loaded_config_path: Optional[str] = None
150
+
151
+
152
def get_nested(
    keys: Tuple[str, ...],
    default_value: Any,
    override_configs: Optional[Dict[str, Any]] = None,
) -> Any:
    """Gets a nested key from the module-level loaded config.

    If any key is not found, or any intermediate key does not point to a dict
    value, returns 'default_value'.

    When 'keys' is within OVERRIDEABLE_CONFIG_KEYS, 'override_configs' must be
    provided (can be empty). Otherwise, 'override_configs' must not be provided.

    Args:
        keys: A tuple of strings representing the nested keys.
        default_value: The default value to return if the key is not found.
        override_configs: A dict of override configs with the same schema as
            the config file, but only containing the keys to override.

    Returns:
        The value of the nested key, or 'default_value' if not found.
    """
    # Enforce the override contract: overrides are mandatory exactly for the
    # key paths declared overrideable, and forbidden for everything else.
    if keys in OVERRIDEABLE_CONFIG_KEYS:
        assert override_configs is not None, (
            f'Override configs must be provided when keys {keys} is within '
            f'OVERRIDEABLE_CONFIG_KEYS: {OVERRIDEABLE_CONFIG_KEYS}'
        )
    else:
        assert override_configs is None, (
            f'Override configs must not be provided when keys {keys} is not '
            f'within OVERRIDEABLE_CONFIG_KEYS: {OVERRIDEABLE_CONFIG_KEYS}'
        )
    return _dict.get_nested(keys, default_value, override_configs)
187
+
188
+
189
def set_nested(keys: Tuple[str, ...], value: Any) -> Dict[str, Any]:
    """Returns a deep-copied config with the nested key set to value.

    Like get_nested(), if any key is not found, this will not raise an error.
    """
    # Never mutate the module-level config; callers get a fresh plain dict.
    updated = copy.deepcopy(_dict)
    updated.set_nested(keys, value)
    return dict(**updated)
197
+
198
+
199
def to_dict() -> Config:
    """Returns a deep-copied version of the current config.

    Deep copy ensures callers cannot mutate the module-level `_dict`.
    """
    return copy.deepcopy(_dict)
202
+
203
+
204
def _try_load_config() -> None:
    """Attempts to load the user config; called once on module import.

    Resolution order:
      (1) the path in env var ENV_VAR_CONFIG, if set — a missing file is a
          hard error (FileNotFoundError);
      (2) otherwise CONFIG_PATH — a missing file is silently ignored.

    On success, populates the module-level `_dict` and `_loaded_config_path`
    and validates the loaded config against the schema.
    """
    global _dict, _loaded_config_path
    config_path_via_env_var = os.environ.get(ENV_VAR_CONFIG)
    if config_path_via_env_var is not None:
        config_path = os.path.expanduser(config_path_via_env_var)
        if not os.path.exists(config_path):
            with ux_utils.print_exception_no_traceback():
                raise FileNotFoundError(
                    'Config file specified by env var '
                    f'{ENV_VAR_CONFIG} ({config_path!r}) does not '
                    'exist. Please double check the path or unset the env var: '
                    f'unset {ENV_VAR_CONFIG}'
                )
    else:
        config_path = CONFIG_PATH
    config_path = os.path.expanduser(config_path)
    if os.path.exists(config_path):
        logger.debug(f'Using config path: {config_path}')
        try:
            config = common_utils.read_yaml(config_path)
            _dict = Config.from_dict(config)
            _loaded_config_path = config_path
            logger.debug(f'Config loaded:\n{pprint.pformat(_dict)}')
        except yaml.YAMLError as e:
            # Fix: the exception was previously passed as a stray %-format
            # argument with no placeholder in the message, which made the
            # logging module report a string-formatting error instead of
            # the actual YAML parse error.
            logger.error('Error in loading config file (%s): %s',
                         config_path, e)
        if _dict:
            # Schema validation only runs when something was actually loaded.
            common_utils.validate_schema(
                _dict,
                schemas.get_config_schema(),
                f'Invalid config YAML ({config_path}). See: '
                'https://konduktor.readthedocs.io/en/latest/reference/config.html. '  # pylint: disable=line-too-long
                'Error: ',
                skip_none=False,
            )

        logger.debug('Config syntax check passed.')
240
+
241
+
242
+ def _check_allowed_and_disallowed_override_keys(
243
+ key: str,
244
+ allowed_override_keys: Optional[List[Tuple[str, ...]]] = None,
245
+ disallowed_override_keys: Optional[List[Tuple[str, ...]]] = None,
246
+ ) -> Tuple[Optional[List[Tuple[str, ...]]], Optional[List[Tuple[str, ...]]]]:
247
+ allowed_keys_with_matched_prefix: Optional[List[Tuple[str, ...]]] = []
248
+ disallowed_keys_with_matched_prefix: Optional[List[Tuple[str, ...]]] = []
249
+ if allowed_override_keys is not None:
250
+ for nested_key in allowed_override_keys:
251
+ if key == nested_key[0]:
252
+ if len(nested_key) == 1:
253
+ # Allowed key is fully matched, no need to check further.
254
+ allowed_keys_with_matched_prefix = None
255
+ break
256
+ assert allowed_keys_with_matched_prefix is not None
257
+ allowed_keys_with_matched_prefix.append(nested_key[1:])
258
+ if (
259
+ allowed_keys_with_matched_prefix is not None
260
+ and not allowed_keys_with_matched_prefix
261
+ ):
262
+ raise ValueError(
263
+ f'Key {key} is not in allowed override keys: '
264
+ f'{allowed_override_keys}'
265
+ )
266
+ else:
267
+ allowed_keys_with_matched_prefix = None
268
+
269
+ if disallowed_override_keys is not None:
270
+ for nested_key in disallowed_override_keys:
271
+ if key == nested_key[0]:
272
+ if len(nested_key) == 1:
273
+ raise ValueError(
274
+ f'Key {key} is in disallowed override keys: '
275
+ f'{disallowed_override_keys}'
276
+ )
277
+ assert disallowed_keys_with_matched_prefix is not None
278
+ disallowed_keys_with_matched_prefix.append(nested_key[1:])
279
+ else:
280
+ disallowed_keys_with_matched_prefix = None
281
+ return allowed_keys_with_matched_prefix, disallowed_keys_with_matched_prefix
282
+
283
+
284
def _recursive_update(
    base_config: Config,
    override_config: Dict[str, Any],
    allowed_override_keys: Optional[List[Tuple[str, ...]]] = None,
    disallowed_override_keys: Optional[List[Tuple[str, ...]]] = None,
) -> Config:
    """Recursively updates base configuration with override configuration"""
    for key, override_value in override_config.items():
        # Narrow the allow/deny lists to sub-keys of 'key'; raises if the
        # key itself may not be overridden at this level.
        next_allowed, next_disallowed = (
            _check_allowed_and_disallowed_override_keys(
                key, allowed_override_keys, disallowed_override_keys
            )
        )
        present = key in base_config
        if present and key == 'kubernetes':
            # Kubernetes sections use the specialised pod-spec merge that
            # understands containers/volumes list semantics.
            merge_k8s_configs(
                base_config[key],
                override_value,
                next_allowed,
                next_disallowed,
            )
        elif (
            present
            and isinstance(override_value, dict)
            and isinstance(base_config[key], dict)
        ):
            _recursive_update(
                base_config[key],
                override_value,
                next_allowed,
                next_disallowed,
            )
        else:
            # Scalars, lists, or brand-new keys simply replace/insert.
            base_config[key] = override_value
    return base_config
318
+
319
+
320
def _get_nested(
    configs: Optional[Dict[str, Any]],
    keys: Tuple[str, ...],
    default_value: Any,
    pop: bool = False,
) -> Any:
    """Walks 'keys' down into 'configs'; returns 'default_value' on any miss.

    With pop=True, additionally removes the leaf entry from its parent dict.
    """
    if configs is None:
        return default_value
    node: Any = configs
    last_index = len(keys) - 1
    for depth, part in enumerate(keys):
        if not (isinstance(node, dict) and part in node):
            return default_value
        child = node[part]
        if depth == last_index and pop:
            # Remove the leaf from its parent; the value is still returned.
            node.pop(part, default_value)
        node = child
    logger.debug(f'User config: {".".join(keys)} -> {node}')
    return node
340
+
341
+
342
def merge_k8s_configs(
    base_config: Dict[Any, Any],
    override_config: Dict[Any, Any],
    allowed_override_keys: Optional[List[Tuple[str, ...]]] = None,
    disallowed_override_keys: Optional[List[Tuple[str, ...]]] = None,
) -> None:
    """Merge two configs into the base_config.

    Updates nested dictionaries instead of replacing them.
    If a list is encountered, it will be appended to the base_config list.

    An exception is when the key is 'containers', in which case the
    first container in the list will be fetched and merge_dict will be
    called on it with the first container in the base_config list.

    Args:
        base_config: Dict mutated in place with the merged result.
        override_config: Dict whose entries are merged into base_config.
        allowed_override_keys: Optional allow-list of nested key paths.
        disallowed_override_keys: Optional deny-list of nested key paths.

    Raises:
        ValueError: If a key violates the allow/deny lists.
    """
    for key, value in override_config.items():
        # Narrow the allow/deny lists to sub-keys below 'key'; raises
        # ValueError when 'key' itself may not be overridden.
        (next_allowed_override_keys, next_disallowed_override_keys) = (
            _check_allowed_and_disallowed_override_keys(
                key, allowed_override_keys, disallowed_override_keys
            )
        )
        if isinstance(value, dict) and key in base_config:
            # Nested dicts merge recursively instead of replacing.
            merge_k8s_configs(
                base_config[key],
                value,
                next_allowed_override_keys,
                next_disallowed_override_keys,
            )
        elif isinstance(value, list) and key in base_config:
            assert isinstance(
                base_config[key], list
            ), f'Expected {key} to be a list, found {base_config[key]}'
            if key in ['containers', 'imagePullSecrets']:
                # If the key is 'containers' or 'imagePullSecrets', we take
                # the first and only container/secret in the list and merge
                # it, as we only support one container per pod.
                assert len(value) == 1, f'Expected only one container, found {value}'
                merge_k8s_configs(
                    base_config[key][0],
                    value[0],
                    next_allowed_override_keys,
                    next_disallowed_override_keys,
                )
            elif key in ['volumes', 'volumeMounts']:
                # If the key is 'volumes' or 'volumeMounts', we search for
                # item with the same name and merge it.
                # NOTE(review): items without a 'name' field are skipped
                # entirely (neither merged nor appended) — confirm intended.
                for new_volume in value:
                    new_volume_name = new_volume.get('name')
                    if new_volume_name is not None:
                        destination_volume = next(
                            (
                                v
                                for v in base_config[key]
                                if v.get('name') == new_volume_name
                            ),
                            None,
                        )
                        if destination_volume is not None:
                            merge_k8s_configs(destination_volume, new_volume)
                        else:
                            base_config[key].append(new_volume)
            else:
                # Any other list (e.g. env, tolerations) is appended to the
                # base list rather than replaced.
                base_config[key].extend(value)
        else:
            # Scalars and brand-new keys replace/insert directly.
            base_config[key] = value
407
+
408
+
409
def loaded_config_path() -> Optional[str]:
    """Returns the path to the loaded config file.

    None when no config file was found or loading failed.
    """
    return _loaded_config_path
412
+
413
+
414
# Load on import.
# Module-level side effect: importing this module reads and validates the
# user config so all callers see a consistent `_dict`.
_try_load_config()
416
+
417
+
418
def loaded() -> bool:
    """Returns if the user configurations are loaded.

    An empty Config is falsy, so this is True only when the file existed
    and parsed to a non-empty mapping.
    """
    return bool(_dict)
konduktor/constants.py ADDED
@@ -0,0 +1,36 @@
1
# Per-project ignore files (gitignore-style) honored when syncing workdirs.
KONDUKTOR_IGNORE_FILE = '.konduktorignore'
GIT_IGNORE_FILE = '.gitignore'
# Remote (in-pod) locations for the synced workdir and for job logs.
KONDUKTOR_REMOTE_WORKDIR = '~/konduktor_workdir'
KONDUKTOR_LOGS_DIRECTORY = '~/konduktor_logs'

# Used for translate local file mounts to cloud storage. Please refer to
# konduktor/utils/controller_utils.py::maybe_translate_local_file_mounts_and_sync_up for
# more details.
# TODO(asaiacai): Unlike skypilot, we don't delete buckets after a job completes
# because we want to persist code, logs, and artifacts for debugging.
# yes it's a resource leak, but object store is
# so cheap and code/data is small in comparison.
FILE_MOUNTS_BUCKET_NAME = 'konduktor-filemounts-{username}-{user_hash}'
FILE_MOUNTS_LOCAL_TMP_DIR = 'konduktor-filemounts-files-{id}'
FILE_MOUNTS_REMOTE_TMP_DIR = '/tmp/konduktor-{}-filemounts-files'

# For the API server, use a temporary directory in the same path as the upload
# directory to avoid using a different block device, which may not allow hard
# linking. E.g., in our API server deployment on k8s, ~/.konduktor/ is mounted from a
# persistent volume, so any contents in ~/.konduktor/ cannot be hard linked elsewhere.
FILE_MOUNTS_LOCAL_TMP_BASE_PATH = '~/.konduktor/tmp/'
# Base path for two-hop file mounts translation. See
# controller_utils.translate_local_file_mounts_to_two_hop().
FILE_MOUNTS_CONTROLLER_TMP_BASE_PATH = '~/.konduktor/tmp/controller'


# Used when managed jobs are created and
# files are synced up to the cloud.
FILE_MOUNTS_WORKDIR_SUBPATH = '{task_name}-{run_id}/workdir'
FILE_MOUNTS_SUBPATH = '{task_name}-{run_id}/local-file-mounts/{i}'
FILE_MOUNTS_TMP_SUBPATH = '{task_name}-{run_id}/tmp-files'

# Shell command that resolves the remote python interpreter's path.
GET_PYTHON_PATH_CMD = 'which python3'
# Python executable, e.g., /opt/conda/bin/python3
PYTHON_CMD = f'$({GET_PYTHON_PATH_CMD})'
File without changes
@@ -0,0 +1,56 @@
1
# Version string reported by the controller on startup (see launch.main()).
KONDUKTOR_CONTROLLER_VERSION = '0.1.0'

# NVIDIA XID error codes treated as hardware (irrecoverable) faults.
# NOTE(review): range() upper bounds are exclusive — e.g. range(56, 59)
# covers 56-58 only; confirm each interval against the intended XID table.
HARDWARE_XID_ERRORS = set(
    (
        48,
        *range(56, 59),
        *range(62, 65),
        *range(68, 78),
        *range(79, 87),
        *range(88, 90),
        92,
        *range(94, 106),
        *range(110, 121),
        *range(122, 126),
    )
)

# The set of all SXid error ids that are known to be harmless.
# See D.4 of https://docs.nvidia.com/datacenter/tesla/pdf/fabric-manager-user-guide.pdf
ALLOWLISTED_NVSWITCH_SXID_ERRORS = set(
    (
        11012,
        11021,
        11022,
        11023,
        12021,
        12023,
        15008,
        15011,
        19049,
        19055,
        19057,
        19059,
        19062,
        19065,
        19068,
        19071,
        24001,
        24002,
        24003,
        22013,
    )
)


# Patterns matched against pod logs to detect GPU failures.
# NOTE(review): the patterns contain literal backticks — presumably they are
# embedded in a LogQL/Loki query string rather than passed to `re` directly;
# confirm against konduktor/controller/parse.py.
POD_LOG_ERROR_REGEXES = [
    # possibly indicates degraded nvidia-FM in bad state
    r'`invalid device ordinal`',
]

# Patterns matched against node dmesg output (peermem / XID / SXid errors).
DMESG_ERROR_REGEXES = [
    r'`(?i)nvidia-peermem nv_get_p2p_free_callback:\d+ '
    r'ERROR detected invalid context, skipping further processing`',
    r'`(?i)NVRM: xid`',
    r'`(?i)SXid`',
]
@@ -0,0 +1,44 @@
1
+ """
2
+ Controller loop
3
+ The controller is run as a deployment and repeatedly polls the logging backend
4
+ for GPU related error logs. When a GPU, CUDA, NCCL error is detected,
5
+ we check against a set of known patterns(regexes) to see if the error is
6
+ irrecoverable making the node unfit for more work. We can then label/taint
7
+ the node to prevent more pods from being scheduled onto the node.
8
+
9
+ Sometimes an NCCL error can be raised due to all-reduce style workloads causing
10
+ all workers to fail even though only one is actually faulty. To place workers,
11
+ back into the working pool, we run a health check which just consists of doing NCCL
12
+ test on the tainted nodes
13
+ """
14
+
15
+ import time
16
+ from typing import Set
17
+
18
+ from konduktor import logging
19
+ from konduktor.controller import constants, parse
20
+ from konduktor.controller import node as node_control
21
+
22
# Seconds slept between successive polls of the logging backend.
KONDUKTOR_CONTROLLER_LOG_POLL_SECONDS = 5
# Number of log-poll iterations between successive node health checks.
KONDUKTOR_CONTROLLER_HEALTH_CHECK_FREQ = 5

logger = logging.get_logger('konduktor.controller')
26
+
27
+
28
def main() -> None:
    """Controller loop entry point.

    Repeatedly polls the logging backend for GPU-related errors, taints any
    node implicated by pod logs or dmesg, and periodically runs a health
    check to return recovered nodes to the pool. Never returns.
    """
    logger.info(
        f'starting konduktor.controller ver. {constants.KONDUKTOR_CONTROLLER_VERSION}'
    )
    while True:
        # Poll the logs several times between health checks.
        for _ in range(KONDUKTOR_CONTROLLER_HEALTH_CHECK_FREQ):
            time.sleep(KONDUKTOR_CONTROLLER_LOG_POLL_SECONDS)
            faulty_nodes: Set[str] = parse.pod_errors() | parse.dmesg_errors()
            for faulty_node in faulty_nodes:
                node_control.taint(faulty_node)

        node_control.health_check()


if __name__ == '__main__':
    main()
@@ -0,0 +1,116 @@
1
+ from typing import List
2
+
3
+ import kubernetes
4
+
5
+ from konduktor import kube_client
6
+ from konduktor import logging as konduktor_logging
7
+
8
# node taint/label
# Key applied both as a label and a NoSchedule taint to nodes that failed a
# GPU health check; removed by untaint() once the node passes again.
NODE_HEALTH_LABEL = 'trainy.konduktor.ai/faulty'

logger = konduktor_logging.get_logger(__name__)
12
+
13
+
14
def nccl_single_test(node: str, thresh: int = 400):
    """Runs NCCL test within a node. Tests NVLINK BW

    Args:
        node (str): k8s node name
        thresh (int, optional): minimum busbw to be considered healthy
            H100SXM should report 450GB/s max theoretically. Default 400

    NOTE: not yet implemented — currently a no-op stub.
    """
23
+
24
+
25
def nccl_pair_test(nodeA: str, nodeB: str, thresh: int = 350):
    """Runs NCCL test between a pair of nodes. Tests
    internode bandwidth

    Args:
        nodeA (str): k8s node name
        nodeB (str): k8s node name
        thresh (int, optional): minimum busbw to be considered healthy
            ConnectX-7 cards have a theoretical max BW of 400GB/s
            Defaults to 350GB/s.

    NOTE: not yet implemented — currently a no-op stub.
    """
    pass
37
+
38
+
39
def health_check():
    """Gathers nodes with label/taint `trainy.konduktor.ai/faulty=true:NoSchedule`
    and attempts to run NCCL test on them. Nodes that pass
    have their label/taint removed.

    NOTE: not yet implemented — currently a no-op stub; see nccl_single_test
    and nccl_pair_test for the intended checks.
    """
    pass
45
+
46
+
47
def untaint(node_name: str):
    """Removes label/taint of `trainy.konduktor.ai/faulty=true:NoSchedule`

    Args:
        node_name (str): k8s node name
    """
    core_api = kube_client.core_api()
    # Read the current node object so we can patch back a filtered taint list.
    node = core_api.read_node(
        name=node_name,
        _request_timeout=kube_client.API_TIMEOUT,
    )

    if node.spec.taints is not None:
        # Keep every taint except the health taint.
        node.spec.taints = [
            taint for taint in node.spec.taints if taint.key != NODE_HEALTH_LABEL
        ]

    # Patch the node with the new taints
    # NOTE(review): the patch (and the log line below) runs even when the
    # node had no taints to begin with — confirm this no-op patch is intended.
    core_api.patch_node(
        name=node_name,
        body=node,
        _request_timeout=kube_client.API_TIMEOUT,
    )

    logger.info(f'Node {node_name} taint removed.')
72
+
73
+
74
def taint(node_name: str):
    """Labels/Taints node with `trainy.konduktor.ai/faulty=true:NoSchedule`

    Args:
        node_name (str): k8s node name
    """
    core_api = kube_client.core_api()
    # NoSchedule keeps new pods off the node; running pods are not evicted.
    taint = kubernetes.client.V1Taint(
        key=NODE_HEALTH_LABEL,
        value='true',
        effect='NoSchedule',
    )
    node = core_api.read_node(
        name=node_name,
        _request_timeout=kube_client.API_TIMEOUT,
    )

    if node.spec.taints is None:
        node.spec.taints = []

    # duplicate taints are disallowed
    tainted = any(taint.key == NODE_HEALTH_LABEL for taint in node.spec.taints)
    if not tainted:
        node.spec.taints.append(taint)

    # Patch the node with the new taints
    core_api.patch_node(
        name=node_name,
        body=node,
        _request_timeout=kube_client.API_TIMEOUT,
    )

    logger.info(f'Node {node_name} tainted.')
107
+
108
+
109
def list_nodes() -> List[str]:
    """returns a list of k8s node names

    Returns:
        List[str]: List of k8s node names
    """
    # Unpaginated list of every node in the cluster.
    nodes = kube_client.core_api().list_node()
    return [node.metadata.name for node in nodes.items]