konduktor-nightly 0.1.0.dev20250209104336__py3-none-any.whl → 0.1.0.dev20250313070642__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62) hide show
  1. konduktor/__init__.py +16 -6
  2. konduktor/adaptors/__init__.py +0 -0
  3. konduktor/adaptors/common.py +88 -0
  4. konduktor/adaptors/gcp.py +112 -0
  5. konduktor/backends/__init__.py +8 -0
  6. konduktor/backends/backend.py +86 -0
  7. konduktor/backends/jobset.py +218 -0
  8. konduktor/backends/jobset_utils.py +447 -0
  9. konduktor/check.py +192 -0
  10. konduktor/cli.py +790 -0
  11. konduktor/cloud_stores.py +158 -0
  12. konduktor/config.py +420 -0
  13. konduktor/constants.py +36 -0
  14. konduktor/controller/constants.py +6 -6
  15. konduktor/controller/launch.py +3 -3
  16. konduktor/controller/node.py +5 -5
  17. konduktor/controller/parse.py +23 -23
  18. konduktor/dashboard/backend/main.py +57 -57
  19. konduktor/dashboard/backend/sockets.py +19 -19
  20. konduktor/data/__init__.py +9 -0
  21. konduktor/data/constants.py +12 -0
  22. konduktor/data/data_utils.py +223 -0
  23. konduktor/data/gcp/__init__.py +19 -0
  24. konduktor/data/gcp/constants.py +42 -0
  25. konduktor/data/gcp/gcs.py +906 -0
  26. konduktor/data/gcp/utils.py +9 -0
  27. konduktor/data/storage.py +799 -0
  28. konduktor/data/storage_utils.py +500 -0
  29. konduktor/execution.py +444 -0
  30. konduktor/kube_client.py +153 -48
  31. konduktor/logging.py +49 -5
  32. konduktor/manifests/dmesg_daemonset.yaml +8 -0
  33. konduktor/manifests/pod_cleanup_controller.yaml +129 -0
  34. konduktor/resource.py +478 -0
  35. konduktor/task.py +867 -0
  36. konduktor/templates/jobset.yaml.j2 +31 -0
  37. konduktor/templates/pod.yaml.j2 +185 -0
  38. konduktor/usage/__init__.py +0 -0
  39. konduktor/usage/constants.py +21 -0
  40. konduktor/utils/__init__.py +0 -0
  41. konduktor/utils/accelerator_registry.py +21 -0
  42. konduktor/utils/annotations.py +62 -0
  43. konduktor/utils/base64_utils.py +93 -0
  44. konduktor/utils/common_utils.py +393 -0
  45. konduktor/utils/constants.py +5 -0
  46. konduktor/utils/env_options.py +55 -0
  47. konduktor/utils/exceptions.py +226 -0
  48. konduktor/utils/kubernetes_enums.py +8 -0
  49. konduktor/utils/kubernetes_utils.py +652 -0
  50. konduktor/utils/log_utils.py +251 -0
  51. konduktor/utils/loki_utils.py +85 -0
  52. konduktor/utils/rich_utils.py +123 -0
  53. konduktor/utils/schemas.py +581 -0
  54. konduktor/utils/subprocess_utils.py +273 -0
  55. konduktor/utils/ux_utils.py +216 -0
  56. konduktor/utils/validator.py +20 -0
  57. {konduktor_nightly-0.1.0.dev20250209104336.dist-info → konduktor_nightly-0.1.0.dev20250313070642.dist-info}/LICENSE +0 -1
  58. {konduktor_nightly-0.1.0.dev20250209104336.dist-info → konduktor_nightly-0.1.0.dev20250313070642.dist-info}/METADATA +13 -2
  59. konduktor_nightly-0.1.0.dev20250313070642.dist-info/RECORD +94 -0
  60. konduktor_nightly-0.1.0.dev20250209104336.dist-info/RECORD +0 -48
  61. {konduktor_nightly-0.1.0.dev20250209104336.dist-info → konduktor_nightly-0.1.0.dev20250313070642.dist-info}/WHEEL +0 -0
  62. {konduktor_nightly-0.1.0.dev20250209104336.dist-info → konduktor_nightly-0.1.0.dev20250313070642.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,273 @@
1
+ # Proprietary Changes made for Trainy under the Trainy Software License
2
+ # Original source: skypilot: https://github.com/skypilot-org/skypilot
3
+ # which is Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ # http://www.apache.org/licenses/LICENSE-2.0
7
+ # Unless required by applicable law or agreed to in writing, software
8
+ # distributed under the License is distributed on an "AS IS" BASIS,
9
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10
+ # See the License for the specific language governing permissions and
11
+ # limitations under the License.
12
+
13
+ """Utility functions for subprocesses."""
14
+
15
+ import os
16
+ import resource
17
+ import subprocess
18
+ from multiprocessing import pool
19
+ from typing import Any, Callable, Dict, List, Optional, Union
20
+
21
+ import colorama
22
+ import psutil
23
+
24
+ from konduktor import constants, logging
25
+ from konduktor.utils import exceptions, log_utils, ux_utils
26
+
27
+ logger = logging.get_logger(__name__)
28
+
29
+ _fd_limit_warning_shown = False
30
+
31
+
32
def run(cmd, **kwargs):
    """Run ``cmd`` through ``subprocess.run`` with shell-oriented defaults.

    Defaults are ``shell=True``, ``check=True`` and ``/bin/bash`` as the shell
    executable; each can be overridden by keyword. When ``shell`` is falsy the
    executable is always passed as ``None``. Every other keyword argument is
    forwarded to ``subprocess.run`` unchanged.

    Caution: the spawned child may keep running in the background after the
    current program is killed. Use `log_utils.run_with_log` to avoid that.

    Returns:
        The ``subprocess.CompletedProcess`` produced by ``subprocess.run``.
    """
    use_shell = kwargs.pop('shell', True)
    must_succeed = kwargs.pop('check', True)
    shell_binary = kwargs.pop('executable', '/bin/bash')
    return subprocess.run(
        cmd,
        shell=use_shell,
        check=must_succeed,
        executable=shell_binary if use_shell else None,
        **kwargs,
    )
44
+
45
+
46
def run_no_outputs(cmd, **kwargs):
    """Run ``cmd`` through `run`, discarding both stdout and stderr."""
    silenced = {'stdout': subprocess.DEVNULL, 'stderr': subprocess.DEVNULL}
    return run(cmd, **silenced, **kwargs)
48
+
49
+
50
+ def _get_thread_multiplier(cloud_str: Optional[str] = None) -> int:
51
+ # If using Kubernetes, we use 4x the number of cores.
52
+ if cloud_str and cloud_str.lower() == 'kubernetes':
53
+ return 4
54
+ return 1
55
+
56
+
57
def get_max_workers_for_file_mounts(
    common_file_mounts: Dict[str, str], cloud_str: Optional[str] = None
) -> int:
    """Pick how many parallel workers to use for syncing file mounts.

    The count is bounded by the process's soft open-file-descriptor limit
    (minus a reserve, divided by an estimated per-rsync descriptor cost) and
    by `get_parallel_threads(cloud_str)`. At least 1 is always returned.

    Args:
        common_file_mounts: Mapping whose values are source paths. Only the
            values are inspected; directories raise the per-rsync estimate.
        cloud_str: Cloud name forwarded to `get_parallel_threads`.

    Returns:
        The number of workers to use (>= 1).
    """
    global _fd_limit_warning_shown
    fd_limit, _ = resource.getrlimit(resource.RLIMIT_NOFILE)
    # An unlimited soft limit is reported as the RLIM_INFINITY sentinel, which
    # can be a negative number (e.g. -1) on some platforms. Comparing it as a
    # plain integer would misreport it as "low" and clamp the result to one
    # worker, so handle it explicitly.
    fd_unlimited = fd_limit == resource.RLIM_INFINITY

    # Raise warning for low fd_limit (only once)
    if not fd_unlimited and fd_limit < 1024 and not _fd_limit_warning_shown:
        logger.warning(
            f'Open file descriptor limit ({fd_limit}) is low. File sync to '
            'remote clusters may be slow. Consider increasing the limit using '
            '`ulimit -n <number>` or modifying system limits.'
        )
        _fd_limit_warning_shown = True

    fd_per_rsync = 5
    for src in common_file_mounts.values():
        if os.path.isdir(src):
            # Assume that each file/folder under src takes 5 file descriptors
            # on average.
            fd_per_rsync = max(fd_per_rsync, len(os.listdir(src)) * 5)

    # Reserve some file descriptors for the system and other processes
    fd_reserve = 100

    # At least 1 worker, and avoid too many workers overloading the system.
    num_threads = get_parallel_threads(cloud_str)
    if fd_unlimited:
        # No descriptor budget to respect; only the thread cap applies.
        max_workers = num_threads
    else:
        max_workers = (fd_limit - fd_reserve) // fd_per_rsync
        max_workers = min(max(max_workers, 1), num_threads)
    logger.debug(f'Using {max_workers} workers for file mounts.')
    return max_workers
88
+
89
+
90
def get_parallel_threads(cloud_str: Optional[str] = None) -> int:
    """Return the number of threads to use for parallel execution.

    The base is ``max(4, cpu_count - 1)`` (with an unknown CPU count treated
    as 1), scaled by `_get_thread_multiplier(cloud_str)`.

    Args:
        cloud_str: The cloud name used to select the multiplier.
    """
    cores = os.cpu_count() or 1
    base_threads = max(4, cores - 1)
    return base_threads * _get_thread_multiplier(cloud_str)
100
+
101
+
102
def run_in_parallel(
    func: Callable, args: List[Any], num_threads: Optional[int] = None
) -> List[Any]:
    """Run a function in parallel on a list of arguments.

    Args:
        func: The function to run in parallel
        args: Iterable of arguments to pass to func. It is materialized into
            a list once, so generators and other one-shot iterables work.
        num_threads: Number of threads to use. If None, uses
            get_parallel_threads()

    Returns:
        A list of the return values of the function func, in the same order as the
        arguments.

    Raises:
        Exception: The first exception encountered.
    """
    # Materialize once: len() and indexing below require a sequence, while the
    # documented contract accepts any iterable.
    args = list(args)

    # Short-circuit for short lists
    if not args:
        return []
    if len(args) == 1:
        return [func(args[0])]

    processes = num_threads if num_threads is not None else get_parallel_threads()

    with pool.ThreadPool(processes=processes) as p:
        # imap yields results in input order; consume it fully before the
        # pool is torn down on leaving the with-block.
        return list(p.imap(func, args))
131
+
132
+
133
def handle_returncode(
    returncode: int,
    command: str,
    error_msg: Union[str, Callable[[], str]],
    stderr: Optional[str] = None,
    stream_logs: bool = True,
) -> None:
    """Raise a `CommandError` if a command exited with a non-zero code.

    Args:
        returncode: The returncode of the command. Zero means nothing is done.
        command: The command that was run.
        error_msg: The error message, or a zero-argument callable producing
            it (only invoked on failure).
        stderr: The stderr of the command; logged on failure when not None.
        stream_logs: If True, stderr is logged at error level, otherwise at
            debug level.

    Raises:
        exceptions.CommandError: If ``returncode`` is non-zero.
    """
    if returncode == 0:
        return

    if stderr is not None:
        emit = logger.error if stream_logs else logger.debug
        emit(stderr)

    message = error_msg() if callable(error_msg) else error_msg
    colored_message = f'{colorama.Fore.RED}{message}{colorama.Style.RESET_ALL}'
    with ux_utils.print_exception_no_traceback():
        raise exceptions.CommandError(returncode, command, colored_message, stderr)
159
+
160
+
161
def kill_children_processes(
    parent_pids: Optional[Union[int, List[Optional[int]]]] = None, force: bool = False
) -> None:
    """Kill children processes recursively.

    We need to kill the children, so that
    1. The underlying subprocess will not print the logs to the terminal,
       after this program exits.
    2. The underlying subprocess will not continue with starting a cluster
       etc. while we are cleaning up the clusters.

    Killing is best-effort: a process that has already exited, or that does
    not exit within the wait timeout, is logged/skipped so the remaining
    processes are still cleaned up.

    Args:
        parent_pids: Optional PIDs of a series of processes. The processes and
            their children will be killed. If a list of PID is specified, it is
            killed by the order in the list. This is for guaranteeing the order
            of cleaning up and suppress flaky errors. If None, only the
            children of the current process are killed.
        force: bool, send SIGKILL if force, otherwise, use SIGTERM for
            gracefully kill the process.
    """
    if isinstance(parent_pids, int):
        parent_pids = [parent_pids]

    def kill(proc: psutil.Process):
        if not proc.is_running():
            # Skip if the process is not running.
            return
        logger.debug(f'Killing process {proc.pid}')
        try:
            if force:
                proc.kill()
            else:
                proc.terminate()
            proc.wait(timeout=10)
        except psutil.NoSuchProcess:
            # The child process may have already been terminated.
            pass
        except psutil.TimeoutExpired:
            logger.debug(f'Process {proc.pid} did not terminate after 10 seconds')
            # Attempt to force kill if the normal termination fails
            if not force:
                logger.debug(f'Force killing process {proc.pid}')
                try:
                    proc.kill()
                    proc.wait(timeout=5)  # Shorter timeout after force kill
                except psutil.NoSuchProcess:
                    # The process exited between the timeout and the kill.
                    pass
                except psutil.TimeoutExpired:
                    # Do not let one stuck process abort cleanup of the rest.
                    logger.debug(
                        f'Process {proc.pid} did not terminate after force kill'
                    )

    parent_processes = []
    if parent_pids is None:
        parent_processes = [psutil.Process()]
    else:
        for pid in parent_pids:
            # NOTE(review): a None entry makes psutil.Process(None) resolve to
            # the current process, which is then killed below -- confirm that
            # callers never pass None entries unintentionally.
            try:
                process = psutil.Process(pid)
            except psutil.NoSuchProcess:
                continue
            parent_processes.append(process)

    for parent_process in parent_processes:
        try:
            # Snapshot the descendants before the parent is killed.
            child_processes = parent_process.children(recursive=True)
        except psutil.NoSuchProcess:
            # The parent exited after it was looked up; its descendants can no
            # longer be enumerated through it.
            child_processes = []
        if parent_pids is not None:
            kill(parent_process)
        logger.debug(f'Killing child processes: {child_processes}')
        for child in child_processes:
            kill(child)
223
+
224
+
225
def kill_process_daemon(process_pid: int) -> None:
    """Start a daemon as a safety net to kill the process.

    Launches `subprocess_daemon.py` (resolved next to the `log_utils` module)
    as a separate Python process, passing it the current PID, the target PID
    and a snapshot of the target's current descendants. The launched process
    is not waited on and all of its standard streams are discarded.

    Args:
        process_pid: The PID of the process to kill.
    """
    # Get initial children list. If the target is already gone, fall back to
    # an empty snapshot rather than failing.
    try:
        process = psutil.Process(process_pid)
        initial_children = [p.pid for p in process.children(recursive=True)]
    except psutil.NoSuchProcess:
        initial_children = []

    parent_pid = os.getpid()
    # NOTE(review): the script is looked up in the directory of `log_utils`;
    # the wheel's RECORD listing does not appear to include a
    # `konduktor/utils/subprocess_daemon.py` -- confirm the script is shipped.
    # Because stdout/stderr are discarded below, a missing script would fail
    # silently.
    daemon_script = os.path.join(
        os.path.dirname(os.path.abspath(log_utils.__file__)), 'subprocess_daemon.py'
    )
    # Resolve the interpreter by running the project-defined shell command.
    python_path = subprocess.check_output(
        constants.GET_PYTHON_PATH_CMD,
        shell=True,
        stderr=subprocess.DEVNULL,
        encoding='utf-8',
    ).strip()
    daemon_cmd = [
        python_path,
        daemon_script,
        '--parent-pid',
        str(parent_pid),
        '--proc-pid',
        str(process_pid),
        # We pass the initial children list to avoid the race condition where
        # the process_pid is terminated before the daemon starts and gets the
        # children list.
        '--initial-children',
        ','.join(map(str, initial_children)),
    ]

    # We do not need to set `start_new_session=True` here, as the
    # daemon script will detach itself from the parent process with
    # fork to avoid being killed by parent process. See the reason we
    # daemonize the process in `sky/skylet/subprocess_daemon.py`.
    # NOTE(review): the path above refers to the upstream SkyPilot layout;
    # presumably the local equivalent is the `daemon_script` resolved above.
    subprocess.Popen(
        daemon_cmd,
        # Suppress output
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
        # Disable input
        stdin=subprocess.DEVNULL,
    )
@@ -0,0 +1,216 @@
1
+ """Utility functions for UX."""
2
+
3
+ import contextlib
4
+ import enum
5
+ import os
6
+ import sys
7
+ import traceback
8
+ import typing
9
+ from typing import Callable, Optional, Union
10
+
11
+ import colorama
12
+ import rich.console as rich_console
13
+
14
+ from konduktor import logging as konduktor_logging
15
+
16
+ if typing.TYPE_CHECKING:
17
+ import pathlib
18
+
19
+ console = rich_console.Console()
20
+
21
+ INDENT_SYMBOL = f'{colorama.Style.DIM}├── {colorama.Style.RESET_ALL}'
22
+ INDENT_LAST_SYMBOL = f'{colorama.Style.DIM}└── {colorama.Style.RESET_ALL}'
23
+
24
+ # Console formatting constants
25
+ BOLD = '\033[1m'
26
+ RESET_BOLD = '\033[0m'
27
+
28
+ # Log path hint in the spinner during launching
29
+ _LOG_PATH_HINT = (
30
+ f'{colorama.Style.DIM}View logs at: {{log_path}}' f'{colorama.Style.RESET_ALL}'
31
+ )
32
+
33
+
34
def console_newline():
    """Prints a newline to the console using rich.

    Uses the module-level rich ``console``. Useful when catching exceptions
    inside console.status().
    """
    console.print()
40
+
41
+
42
@contextlib.contextmanager
def print_exception_no_traceback():
    """A context manager that prints out an exception without traceback.

    Mainly for UX: user-facing errors, e.g., ValueError, should suppress long
    tracebacks.

    If KONDUKTOR_DEBUG environment variable is set (to any non-empty value),
    this context manager is a no-op and the full traceback will be shown.

    Example usage:

        with print_exception_no_traceback():
            if error():
                raise ValueError('...')
    """
    if os.environ.get('KONDUKTOR_DEBUG'):
        # When KONDUKTOR_DEBUG is set, show the full traceback
        yield
    else:
        # Fall back to 1000 when `sys.tracebacklimit` has never been set.
        original_tracelimit = getattr(sys, 'tracebacklimit', 1000)
        sys.tracebacklimit = 0
        yield
        # Only reached when the with-body finishes without raising. If the
        # body raises, the exception propagates out of the `yield` above and
        # the limit is left at 0.
        # NOTE(review): presumably this is deliberate, so that the exception
        # is still printed without a traceback once it reaches the top level;
        # confirm before wrapping the yield in try/finally.
        sys.tracebacklimit = original_tracelimit
66
+
67
+
68
@contextlib.contextmanager
def enable_traceback():
    """Reverts the effect of print_exception_no_traceback().

    Within the with-body ``sys.tracebacklimit`` is set to 1000 so that full
    tracebacks can be formatted. On exit the previous limit is restored (1000
    if it was unset), even when the body raises, so a failure inside the
    block cannot leave tracebacks permanently enabled.

    This is used for usage_lib to collect the full traceback.
    """
    original_tracelimit = getattr(sys, 'tracebacklimit', 1000)
    sys.tracebacklimit = 1000
    try:
        yield
    finally:
        sys.tracebacklimit = original_tracelimit
78
+
79
+
80
class RedirectOutputForProcess:
    """Redirects stdout and stderr to a file.

    This class enables output redirection for multiprocessing.Process.
    Example usage:

        p = multiprocessing.Process(
            target=RedirectOutputForProcess(func, file_name).run, args=...)

    This is equal to:

        p = multiprocessing.Process(target=func, args=...)

    Plus redirect all stdout/stderr to file_name.
    """

    def __init__(self, func: Callable, file: str, mode: str = 'w') -> None:
        # func: the callable to execute with redirected output.
        # file: path of the file that receives stdout/stderr.
        # mode: mode passed to open() for that file ('w' truncates by default).
        self.func = func
        self.file = file
        self.mode = mode

    def run(self, *args, **kwargs) -> None:
        """Call ``self.func(*args, **kwargs)`` with stdout/stderr sent to the file.

        Any exception raised by ``self.func`` is logged (message, then full
        traceback) and re-raised.
        """
        with open(self.file, self.mode, encoding='utf-8') as f:
            # sys.stdout/sys.stderr are replaced and not restored afterwards.
            # NOTE(review): presumably fine because this is meant to run as
            # the target of a dedicated process -- confirm.
            sys.stdout = f
            sys.stderr = f
            # reconfigure logger since the logger is initialized before
            # with previous stdout/stderr
            konduktor_logging.reload_logger()
            # NOTE(review): other modules obtain loggers via
            # `logging.get_logger(__name__)`; verify that `init_logger`
            # exists in konduktor.logging.
            logger = konduktor_logging.init_logger(__name__)
            # The subprocess_util.run('konduktor status') inside
            # konduktor.execution::_execute cannot be redirect, since we cannot
            # directly operate on the stdout/stderr of the subprocess. This
            # is because some code in konduktor will specify the stdout/stderr
            # of the subprocess.
            try:
                self.func(*args, **kwargs)
            except Exception as e:  # pylint: disable=broad-except
                logger.error(f'Failed to run {self.func.__name__}. ' f'Details: {e}')
                # Temporarily lift any traceback suppression so the formatted
                # traceback written to the log is complete.
                with enable_traceback():
                    logger.error(f' Traceback:\n{traceback.format_exc()}')
                raise
121
+
122
+
123
def log_path_hint(log_path: Union[str, 'pathlib.Path']) -> str:
    """Gets the log path hint for the given log path.

    A path located at or under the user's home directory is abbreviated with
    a leading ``~``. The match is made on whole path components, so a sibling
    directory that merely shares the home directory's prefix (e.g.
    ``/home/alice`` when the home is ``/home/al``) is left untouched.

    Args:
        log_path: The log file path, as a string or path object.

    Returns:
        The formatted (dimmed) "View logs at: ..." hint.
    """
    log_path = str(log_path)
    # Drop any trailing separator so that homes such as '/home/al/' or '/'
    # still produce '~/<rest>' instead of '~<rest>'.
    home = os.path.expanduser('~').rstrip(os.sep)
    if log_path == home or log_path.startswith(home + os.sep):
        log_path = '~' + log_path[len(home) :]
    return _LOG_PATH_HINT.format(log_path=log_path)
130
+
131
+
132
def starting_message(message: str) -> str:
    """Format ``message`` as a "starting" line prefixed with a gear symbol."""
    # The color is reset up front: a previous dimmed spinner that overflowed
    # in a narrow terminal could otherwise leak its styling into this line.
    reset = colorama.Style.RESET_ALL
    return f'{reset}⚙︎ {message}'
138
+
139
+
140
def finishing_message(
    message: str, log_path: Optional[Union[str, 'pathlib.Path']] = None
) -> str:
    """Format ``message`` as a green success line with a check mark.

    When ``log_path`` is given, the log path hint is appended after a space.
    """
    # The color is reset up front: a previous dimmed spinner that overflowed
    # in a narrow terminal could otherwise leak its styling into this line.
    reset = colorama.Style.RESET_ALL
    headline = f'{reset}{colorama.Fore.GREEN}✓ {message}{reset}'
    if log_path is not None:
        return f'{headline} {log_path_hint(log_path)}'
    return headline
155
+
156
+
157
def error_message(
    message: str, log_path: Optional[Union[str, 'pathlib.Path']] = None
) -> str:
    """Format ``message`` as an error line prefixed with a red cross.

    When ``log_path`` is given, the log path hint is appended after a space.
    """
    # The color is reset up front: a previous dimmed spinner that overflowed
    # in a narrow terminal could otherwise leak its styling into this line.
    reset = colorama.Style.RESET_ALL
    headline = f'{reset}{colorama.Fore.RED}⨯{reset} {message}'
    if log_path is not None:
        return f'{headline} {log_path_hint(log_path)}'
    return headline
172
+
173
+
174
def retry_message(message: str) -> str:
    """Format ``message`` as a retry line prefixed with a yellow arrow."""
    # The color is reset up front: a previous dimmed spinner that overflowed
    # in a narrow terminal could otherwise leak its styling into this line.
    reset = colorama.Style.RESET_ALL
    return f'{reset}{colorama.Fore.YELLOW}↺{reset} {message}'
183
+
184
+
185
def spinner_message(
    message: str, log_path: Optional[Union[str, 'pathlib.Path']] = None
) -> str:
    """Build the spinner text for ``message``, optionally with a log hint.

    The message is wrapped in ``[bold cyan]...[/]`` markup; when ``log_path``
    is given, the log path hint is appended after a space.
    """
    styled = f'[bold cyan]{message}[/]'
    if log_path is not None:
        return f'{styled} {log_path_hint(log_path)}'
    return styled
194
+
195
+
196
class CommandHintType(enum.Enum):
    """Kinds of command hints that `command_hint_messages` can render."""

    # Hints for managing a submitted job.
    JOB = 'JOB'
198
+
199
+
200
def command_hint_messages(hint_type: CommandHintType, job_id: str) -> str:
    """Build the "Useful Commands" hint block for the given job id.

    Args:
        hint_type: Which kind of hint to render; only ``JOB`` is supported.
        job_id: The job identifier interpolated into the suggested commands.

    Raises:
        ValueError: If ``hint_type`` is not a supported hint type.
    """
    if hint_type != CommandHintType.JOB:
        raise ValueError(f'Invalid hint type: {hint_type}')

    job_lines = (
        f'\nJob ID: {job_id}'
        f'\n{INDENT_SYMBOL}To cancel the job:\t\t'
        f'{BOLD}konduktor down {job_id} {RESET_BOLD}'
        f'\n{INDENT_SYMBOL}To stream job logs:\t\t'
        f'{BOLD}konduktor logs {job_id} {RESET_BOLD}'
        f'\n{INDENT_SYMBOL}To list all jobs:\t\t'
        f'{BOLD}konduktor status{RESET_BOLD}'
    )
    return '\n📋 Useful Commands' + f'{job_lines}'
@@ -0,0 +1,20 @@
1
+ """This module contains a custom validator for the JSON Schema specification.
2
+
3
+ The main motivation behind extending the existing JSON Schema validator is to
4
+ allow for case-insensitive enum matching since this is currently not supported
5
+ by the JSON Schema specification.
6
+ """
7
+
8
+ import jsonschema
9
+
10
+
11
def case_insensitive_enum(validator, enums, instance, schema):
    """Validate that ``instance`` matches one of ``enums``, ignoring case.

    String values are compared case-insensitively. Non-string values (on
    either side) are compared by plain equality, so a non-string instance
    produces a validation result instead of crashing on ``.lower()``.

    Args:
        validator: The active validator (unused).
        enums: The allowed values declared in the schema.
        instance: The value being validated.
        schema: The enclosing schema (unused).

    Yields:
        jsonschema.ValidationError: If ``instance`` matches none of ``enums``.
    """
    del validator, schema  # Unused.

    def _fold(value):
        # Only strings have a notion of case; leave other values untouched.
        return value.lower() if isinstance(value, str) else value

    if _fold(instance) not in [_fold(candidate) for candidate in enums]:
        yield jsonschema.ValidationError(f'{instance!r} is not one of {enums!r}')
15
+
16
+
17
# Draft 7 validator extended with the custom `case_insensitive_enum` keyword,
# which is handled by the function of the same name.
SchemaValidator = jsonschema.validators.extend(
    jsonschema.Draft7Validator,
    validators={'case_insensitive_enum': case_insensitive_enum},
)
@@ -75,7 +75,6 @@ The modifications are proprietary and subject to the terms of the Trainy Softwar
75
75
  Copyright 2024 Trainy Inc.
76
76
 
77
77
  Code is modified from https://github.com/skypilot-org/skypilot
78
- Git Revision: 9e50959e03146b599d9d1b3646573c913ec95bac
79
78
 
80
79
  The original files are licensed under the Apache License, Version 2.0 (the "License");
81
80
  you may not use this file except in compliance with the License.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: konduktor-nightly
3
- Version: 0.1.0.dev20250209104336
3
+ Version: 0.1.0.dev20250313070642
4
4
  Summary: GPU Cluster Health Management
5
5
  Author: Andrew Aikawa
6
6
  Author-email: asai@berkeley.edu
@@ -12,7 +12,18 @@ Classifier: Programming Language :: Python :: 3.11
12
12
  Classifier: Programming Language :: Python :: 3.12
13
13
  Requires-Dist: click (>=8.1.7,<9.0.0)
14
14
  Requires-Dist: colorama (>=0.4.6,<0.5.0)
15
+ Requires-Dist: google-api-python-client[gcp] (>=2.161.0,<3.0.0)
16
+ Requires-Dist: google-cloud-storage[gcp] (>=3.0.0,<4.0.0)
17
+ Requires-Dist: jinja2 (>=3.1.5,<4.0.0)
18
+ Requires-Dist: jsonschema (>=4.23.0,<5.0.0)
19
+ Requires-Dist: kr8s (>=0.20.1,<0.21.0)
15
20
  Requires-Dist: kubernetes (>=30.1.0,<31.0.0)
21
+ Requires-Dist: posthog (>=3.7.4,<4.0.0)
22
+ Requires-Dist: prettytable (>=3.12.0,<4.0.0)
23
+ Requires-Dist: psutil (>=7.0.0,<8.0.0)
24
+ Requires-Dist: python-dotenv (>=1.0.1,<2.0.0)
25
+ Requires-Dist: rich (>=13.9.4,<14.0.0)
26
+ Requires-Dist: websockets (>=15.0.1,<16.0.0)
16
27
  Description-Content-Type: text/markdown
17
28
 
18
29
 
@@ -39,7 +50,7 @@ Konduktor can be self-hosted and run on any certified Kubernetes distribution or
39
50
  </p>
40
51
 
41
52
  For ML researchers
42
- - [Skypilot](https://skypilot.readthedocs.io/en/latest/docs/index.html) - user friendly batch job framework, where users only need to specify the resource requirements of their job and a script to launch that makes simple to scale work across multiple nodes. Works with most ML application frameworks. [Examples](https://github.com/skypilot-org/skypilot/tree/master/examples)
53
+ - Konduktor CLI & SDK - user-friendly batch job framework, where users only need to specify the resource requirements of their job and a script to launch, which makes it simple to scale work across multiple nodes. Works with most ML application frameworks out of the box.
43
54
 
44
55
  ```
45
56
  num_nodes: 100
@@ -0,0 +1,94 @@
1
+ konduktor/__init__.py,sha256=rTbCPyHL64EVG_QGzhnVHSKstM23_MXHWiLDtJZFz6k,1477
2
+ konduktor/adaptors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
+ konduktor/adaptors/common.py,sha256=mYb_6c3u5MghtiFfiW5OO-EH6t7cIR5npbkgUmz6FYE,3517
4
+ konduktor/adaptors/gcp.py,sha256=liCm4_D_qSci0DZA2t5bckLIoGDkJ8qx31EO_hSBzo0,3751
5
+ konduktor/backends/__init__.py,sha256=1Q6sqqdeMYarpTX_U-QVywJYf7idiUTRsyP-E4BQSOw,129
6
+ konduktor/backends/backend.py,sha256=qh0bp94lzoTYZkzyQv2-CVrB5l91FkG2vclXg24UFC0,2910
7
+ konduktor/backends/jobset.py,sha256=LNTBNKPpb1Z2LK50C7N6HD75TSkVSYleEIqrmxHypN8,8255
8
+ konduktor/backends/jobset_utils.py,sha256=-TBVffZDkuy4jtr7dzzbFg9iEYZGNfSTB60oOCiYm8Q,16637
9
+ konduktor/check.py,sha256=hIrxDMKaGX2eZP-Pj9TCymGUHQAp93m48Gj3XMiqadA,7833
10
+ konduktor/cli.py,sha256=90bnh3nIobfBkzqS_SXgw9Z8Zqh4ouwpLDj0kx_6kL8,23562
11
+ konduktor/cloud_stores.py,sha256=KX3u5YlXGslMCe_q8zYtFy62_KGCmmLTrYuK7Y9jFIM,6277
12
+ konduktor/config.py,sha256=J50JxC6MsXMnlrJPXdDUMr38C89xvOO7mR8KJ6fyils,15520
13
+ konduktor/constants.py,sha256=T3AeXXxuQHINW_bAWyztvDeS8r4g8kXBGIwIq13cys0,1814
14
+ konduktor/controller/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
15
+ konduktor/controller/constants.py,sha256=SGAgu9yTDWYXyVwxlaw1vfRJFOflPR549mKwgdzbI9w,1124
16
+ konduktor/controller/launch.py,sha256=F_ybOxscbsnmL1c7v5is6QrTDcaV26dq8CsSn4YdmTc,1522
17
+ konduktor/controller/node.py,sha256=9uKFtgxmonxtr-73DRAd7qJjHUjyfS1E4sgXT0agzPg,2982
18
+ konduktor/controller/parse.py,sha256=U1G747N6Hef2cEgXvoRI2V1NEU316VDDHLCY1rYtnNc,3840
19
+ konduktor/dashboard/README.md,sha256=xOeItNLb3t0k8AiiQcjqleRfrlcWGK-n6yJF-rLv4M0,718
20
+ konduktor/dashboard/backend/main.py,sha256=IOgTqOWLMuV2vAafbvyftk09d6y9O85etfPNUa4Beco,4860
21
+ konduktor/dashboard/backend/sockets.py,sha256=AdcnhCgTb6GvDi7_bVx-BhCFQ0-i972S1zWFIAT0iAM,4611
22
+ konduktor/dashboard/frontend/.eslintrc.json,sha256=FYmhXlX768ZRnJXZHY8KCQYY8g3HjkeYWLnCdVJISWE,40
23
+ konduktor/dashboard/frontend/.gitignore,sha256=835-G5A1Oha6e-yJeVhTEe5tADESCTkeWrRxYt9MqSI,391
24
+ konduktor/dashboard/frontend/app/api/jobs/route.js,sha256=DHXJtKeOB-V2jCk4UnLCjnqWmKsFtqGD6iq5zxkipE4,2310
25
+ konduktor/dashboard/frontend/app/api/namespaces/route.js,sha256=wEGWgpMYIk70pURjZiQRyr8Pb5HOCsCFWPNbdV3qbbs,2145
26
+ konduktor/dashboard/frontend/app/components/Grafana.jsx,sha256=R13dvp39vYL1sJVohQexARzV5T31ikhOx9nUVE9fm1o,2715
27
+ konduktor/dashboard/frontend/app/components/JobsData.jsx,sha256=7S9zlmswraT-gcvVSJXWe6buhlDGvsndfMRuaO9zMw8,5792
28
+ konduktor/dashboard/frontend/app/components/LogsData.jsx,sha256=VZ1Rh6WqmizDtZAh6R18-cW62USarpHmXNpFf7Q5IF0,5325
29
+ konduktor/dashboard/frontend/app/components/NavMenu.jsx,sha256=3b-s3rJjhDMql4rfEzzZV5GVuoTqKSlNAIlopP8-uEA,1374
30
+ konduktor/dashboard/frontend/app/components/NavTabs.jsx,sha256=eiITGvOOtQ4CjS1IV50ZcYy4mKetdJzczg5EYxkow44,1963
31
+ konduktor/dashboard/frontend/app/components/NavTabs2.jsx,sha256=M7sYE35oekrkTLgZR1yhJ8vP7VuQHhKsJYHOeqW3tVg,936
32
+ konduktor/dashboard/frontend/app/components/SelectBtn.jsx,sha256=TpdFzTB7xyUMZ5H9GlfLlymr8q-tEAtzWBn7f-yTqbI,692
33
+ konduktor/dashboard/frontend/app/components/lib/utils.js,sha256=uiUjJth_vpFE7EEGOEfRpQwWHGXIE0scEskJ0cwurlo,135
34
+ konduktor/dashboard/frontend/app/components/ui/chip-select.jsx,sha256=D0l8tN79PjRZzwhO82LDaEZlz7fCkDsQnE_06L-mEjk,2524
35
+ konduktor/dashboard/frontend/app/components/ui/input.jsx,sha256=lT3QQvcD1HuAraYd7AWfb8kMZhK5mMXPENLNcKhUwqA,699
36
+ konduktor/dashboard/frontend/app/components/ui/navigation-menu.jsx,sha256=8ektMrAc7jDsvix_kBA5oLKm35tUh2dYxqGlhEYcRKc,4326
37
+ konduktor/dashboard/frontend/app/components/ui/select.jsx,sha256=OCvRDP4nh3waRJRYGKE3A49Old9cthkW_mqND2rCmNo,4774
38
+ konduktor/dashboard/frontend/app/favicon.ico,sha256=K4rS0zRVqPc2_DqOv48L3qiEitTA20iigzvQ-c13WTI,25931
39
+ konduktor/dashboard/frontend/app/globals.css,sha256=BwtmaZmVRxKmxkz0zN7MXVB_nrw9ynlHhKiV9JiU4lI,2750
40
+ konduktor/dashboard/frontend/app/jobs/page.js,sha256=_dEIe1qREUR0KEsBFGqBlm5DG0J-2pMtVNM2soccrgU,188
41
+ konduktor/dashboard/frontend/app/layout.js,sha256=QCllsC6zUsS_tm95Lt0_QC1Yi9JK3Dg5QngI604rFLU,429
42
+ konduktor/dashboard/frontend/app/logs/page.js,sha256=gohgEZ5hSnqOQEzpaCnYxuh4l36M1mXRVuxfTxRA0_g,213
43
+ konduktor/dashboard/frontend/app/page.js,sha256=NTmwBsmsdJyB0yh-xBufy2a6X6BtYvAI2NuyaBp6MWE,209
44
+ konduktor/dashboard/frontend/jsconfig.json,sha256=nJtecz7VFXXo2ZENxlvZ57mUtCs8kyN7euRInr_srPw,73
45
+ konduktor/dashboard/frontend/next.config.mjs,sha256=HjIGyyAOJk_d2027Lkdy2uvIOf28lLeDFtdIkqDeb6I,92
46
+ konduktor/dashboard/frontend/package-lock.json,sha256=cMQEByVjrx1-FJgS8fC01JVHFMe7z_4OXWC0aOhKv7Y,245859
47
+ konduktor/dashboard/frontend/package.json,sha256=CDPlG00EUKJbDdqwf0QxKQ1J9Vn6MxuBYvRI5kywU-o,1016
48
+ konduktor/dashboard/frontend/postcss.config.mjs,sha256=rDHiqV72T-J860Ek4QFnUnMQeq4uM7xJCwHZGefwNkY,135
49
+ konduktor/dashboard/frontend/server.js,sha256=jcp6_Ww9YJD3uKY07jR3KMlAM6n1QZdxZnVY6Kh-J6A,1789
50
+ konduktor/dashboard/frontend/tailwind.config.js,sha256=fCnc48wvioIDOe5ldQ_6RE7F76cP7aU7pDrxBPJx-Fk,366
51
+ konduktor/data/__init__.py,sha256=KMR2i3E9YcIpiIuCxtRdS7BQ1w2vUAbbve7agziJrLo,213
52
+ konduktor/data/constants.py,sha256=tAYHrmzPCI2lwK_Fy7AfwE1RTcb38AYW99Zkw8fEtDM,299
53
+ konduktor/data/data_utils.py,sha256=aIv3q2H1GSiN2w8WNjZgVaglm-hoiHSb4KR-MAiKKXs,8383
54
+ konduktor/data/gcp/__init__.py,sha256=m40OpIx2UDGnREokqfQ52OsAfo9WXC748hF98YWyG-A,517
55
+ konduktor/data/gcp/constants.py,sha256=dMfOiFccM8O6rUi9kClJcbvw1K1VnS1JzzQk3apq8ho,1483
56
+ konduktor/data/gcp/gcs.py,sha256=wJri7wG7FJBQvEkAqWQsNHJMLWqMH2n10d8vyy0M_5o,38650
57
+ konduktor/data/gcp/utils.py,sha256=FJQcMXZqtMIzjZ98b3lTTc0UbdPUKTDLsOsfJaaH5-s,214
58
+ konduktor/data/storage.py,sha256=KppuO1bYSyMA7RSkmBbJnw8xwBPjKYQ8IbtxYv3abz4,34731
59
+ konduktor/data/storage_utils.py,sha256=v_OZyWEb9DmNeLxn_OEJLCzKa0Y3MxcKI5CmPL3jQ6I,19566
60
+ konduktor/execution.py,sha256=wwJUQJO3Rc0u0TiF78ilwsEw3a7gGa6uEwBULfeIzAQ,18403
61
+ konduktor/kube_client.py,sha256=Dhza1605wmS4TaFCrw63Y7nh3oSc2P3ekUE2-RI-Qlw,6155
62
+ konduktor/logging.py,sha256=mBCoCTNhDEkUxd4tsse4mw-aVzSGohhXYf16ViR0ch4,2722
63
+ konduktor/manifests/controller_deployment.yaml,sha256=6p3oSLkEVONZsvKZGqVop0Dhn4bo3lrigRmhf8NXBHE,1730
64
+ konduktor/manifests/dashboard_deployment.yaml,sha256=xJLd4FbPMAosI0fIv5_8y7dV9bw0Vsf81l-w4MB_aU8,2837
65
+ konduktor/manifests/dmesg_daemonset.yaml,sha256=pSWt7YOeTYjS0l0iki1fvHOs7MhY-sH-RQfVW6JJyno,1391
66
+ konduktor/manifests/pod_cleanup_controller.yaml,sha256=hziL1Ka1kCAEL9R7Tjvpb80iw1vcq9_3gwHCu75Bi0A,3939
67
+ konduktor/resource.py,sha256=68z8gC8Ivqktwv0R6ylMn9ZNocgkcRT0yIRGGKOdwcM,18491
68
+ konduktor/task.py,sha256=edHgMLYECGux6WLCilqsNZNYr3dEcw_miWvu4FYpu5U,34713
69
+ konduktor/templates/jobset.yaml.j2,sha256=NevmZYDUBQbzVHiQ6EzlWX8FzdHLcz1bcLxOvD03PKQ,940
70
+ konduktor/templates/pod.yaml.j2,sha256=zrYwxTyAFmjh6NtMmiGaOZBFwqCBZW2dRex4RpLh4iE,8142
71
+ konduktor/usage/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
72
+ konduktor/usage/constants.py,sha256=gCL8afIHZhO0dcxbJGpESE9sCC1cBSbeRnQ8GwNOY4M,612
73
+ konduktor/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
74
+ konduktor/utils/accelerator_registry.py,sha256=-qHLD_ecklhexYHgzGhPYGvWmIHohWQgeciwZ-NfJ3g,538
75
+ konduktor/utils/annotations.py,sha256=oy2-BLydkFt3KWkXDuaGY84d6b7iISuy4eAT9uXk0Fc,2225
76
+ konduktor/utils/base64_utils.py,sha256=mF-Tw98mFRG70YE4w6s9feuQSCYZHOb8YatBZwMugyI,3130
77
+ konduktor/utils/common_utils.py,sha256=1_j-nRikKmTnB8BFE0xQb7LquKVAOLaJnsy4LxZlNbI,13869
78
+ konduktor/utils/constants.py,sha256=1DneiTR21lvKUcWdBGwC4I4fD4uPjbjLUilEnJS7rzA,216
79
+ konduktor/utils/env_options.py,sha256=T41Slzf4Mzl-n45CGXXqdy2fCrYhPNZQ7RP5vmnN4xc,2258
80
+ konduktor/utils/exceptions.py,sha256=GBOFIkk9nikqWGR0FXGXOWVVImoH7nWnMl_L3Oux3fo,6581
81
+ konduktor/utils/kubernetes_enums.py,sha256=SabUueF6Bpzbpa57gyH5VB65xla2N9l8CZmAeYTfGmM,176
82
+ konduktor/utils/kubernetes_utils.py,sha256=O1DmScyGIv0goNrti0IwYu-nyRrDKubwEyn6yiiJ0Tg,23492
83
+ konduktor/utils/log_utils.py,sha256=lgHCq4OdtJNfbpso-uYGONUCVNsUrUkUWjROarsHt6s,9897
84
+ konduktor/utils/loki_utils.py,sha256=SrRwTYHWGfiqqufY2XcKk4imgdxUCFBZ5oxyhCEJI0Y,3221
85
+ konduktor/utils/rich_utils.py,sha256=kdjNe6S2LlpOxyzhFHqMzCz7g4ROC4e7TPWgcbRsrQE,3577
86
+ konduktor/utils/schemas.py,sha256=4Goihc-NpFQpiJ7RSiKirAIPNWqw_DV_TRqVwejqTDY,17479
87
+ konduktor/utils/subprocess_utils.py,sha256=WoFkoFhGecPR8-rF8WJxbIe-YtV94LXz9UG64SDhCY4,9448
88
+ konduktor/utils/ux_utils.py,sha256=NPNu3Igu2Z9Oq77ghJhy_fIxQZTXWr9BtKyxN3Wslzo,7164
89
+ konduktor/utils/validator.py,sha256=tgBghVyedyzGx84-U2Qfoh_cJBE3oUk9gclMW90ORks,691
90
+ konduktor_nightly-0.1.0.dev20250313070642.dist-info/LICENSE,sha256=MuuqTZbHvmqXR_aNKAXzggdV45ANd3wQ5YI7tnpZhm0,6586
91
+ konduktor_nightly-0.1.0.dev20250313070642.dist-info/METADATA,sha256=psJZae90v2J35ilwCIwZwZWeeEB7K5OyLFcDgA5l_j8,4070
92
+ konduktor_nightly-0.1.0.dev20250313070642.dist-info/WHEEL,sha256=FMvqSimYX_P7y0a7UY-_Mc83r5zkBZsCYPm7Lr0Bsq4,88
93
+ konduktor_nightly-0.1.0.dev20250313070642.dist-info/entry_points.txt,sha256=k3nG5wDFIJhNqsZWrHk4d0irIB2Ns9s47cjRWYsTCT8,48
94
+ konduktor_nightly-0.1.0.dev20250313070642.dist-info/RECORD,,