konduktor-nightly 0.1.0.dev20250209104336__py3-none-any.whl → 0.1.0.dev20250313070642__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- konduktor/__init__.py +16 -6
- konduktor/adaptors/__init__.py +0 -0
- konduktor/adaptors/common.py +88 -0
- konduktor/adaptors/gcp.py +112 -0
- konduktor/backends/__init__.py +8 -0
- konduktor/backends/backend.py +86 -0
- konduktor/backends/jobset.py +218 -0
- konduktor/backends/jobset_utils.py +447 -0
- konduktor/check.py +192 -0
- konduktor/cli.py +790 -0
- konduktor/cloud_stores.py +158 -0
- konduktor/config.py +420 -0
- konduktor/constants.py +36 -0
- konduktor/controller/constants.py +6 -6
- konduktor/controller/launch.py +3 -3
- konduktor/controller/node.py +5 -5
- konduktor/controller/parse.py +23 -23
- konduktor/dashboard/backend/main.py +57 -57
- konduktor/dashboard/backend/sockets.py +19 -19
- konduktor/data/__init__.py +9 -0
- konduktor/data/constants.py +12 -0
- konduktor/data/data_utils.py +223 -0
- konduktor/data/gcp/__init__.py +19 -0
- konduktor/data/gcp/constants.py +42 -0
- konduktor/data/gcp/gcs.py +906 -0
- konduktor/data/gcp/utils.py +9 -0
- konduktor/data/storage.py +799 -0
- konduktor/data/storage_utils.py +500 -0
- konduktor/execution.py +444 -0
- konduktor/kube_client.py +153 -48
- konduktor/logging.py +49 -5
- konduktor/manifests/dmesg_daemonset.yaml +8 -0
- konduktor/manifests/pod_cleanup_controller.yaml +129 -0
- konduktor/resource.py +478 -0
- konduktor/task.py +867 -0
- konduktor/templates/jobset.yaml.j2 +31 -0
- konduktor/templates/pod.yaml.j2 +185 -0
- konduktor/usage/__init__.py +0 -0
- konduktor/usage/constants.py +21 -0
- konduktor/utils/__init__.py +0 -0
- konduktor/utils/accelerator_registry.py +21 -0
- konduktor/utils/annotations.py +62 -0
- konduktor/utils/base64_utils.py +93 -0
- konduktor/utils/common_utils.py +393 -0
- konduktor/utils/constants.py +5 -0
- konduktor/utils/env_options.py +55 -0
- konduktor/utils/exceptions.py +226 -0
- konduktor/utils/kubernetes_enums.py +8 -0
- konduktor/utils/kubernetes_utils.py +652 -0
- konduktor/utils/log_utils.py +251 -0
- konduktor/utils/loki_utils.py +85 -0
- konduktor/utils/rich_utils.py +123 -0
- konduktor/utils/schemas.py +581 -0
- konduktor/utils/subprocess_utils.py +273 -0
- konduktor/utils/ux_utils.py +216 -0
- konduktor/utils/validator.py +20 -0
- {konduktor_nightly-0.1.0.dev20250209104336.dist-info → konduktor_nightly-0.1.0.dev20250313070642.dist-info}/LICENSE +0 -1
- {konduktor_nightly-0.1.0.dev20250209104336.dist-info → konduktor_nightly-0.1.0.dev20250313070642.dist-info}/METADATA +13 -2
- konduktor_nightly-0.1.0.dev20250313070642.dist-info/RECORD +94 -0
- konduktor_nightly-0.1.0.dev20250209104336.dist-info/RECORD +0 -48
- {konduktor_nightly-0.1.0.dev20250209104336.dist-info → konduktor_nightly-0.1.0.dev20250313070642.dist-info}/WHEEL +0 -0
- {konduktor_nightly-0.1.0.dev20250209104336.dist-info → konduktor_nightly-0.1.0.dev20250313070642.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,273 @@
|
|
1
|
+
# Proprietary Changes made for Trainy under the Trainy Software License
|
2
|
+
# Original source: skypilot: https://github.com/skypilot-org/skypilot
|
3
|
+
# which is Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
# you may not use this file except in compliance with the License.
|
5
|
+
# You may obtain a copy of the License at
|
6
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
7
|
+
# Unless required by applicable law or agreed to in writing, software
|
8
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
9
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
10
|
+
# See the License for the specific language governing permissions and
|
11
|
+
# limitations under the License.
|
12
|
+
|
13
|
+
"""Utility functions for subprocesses."""
|
14
|
+
|
15
|
+
import os
|
16
|
+
import resource
|
17
|
+
import subprocess
|
18
|
+
from multiprocessing import pool
|
19
|
+
from typing import Any, Callable, Dict, List, Optional, Union
|
20
|
+
|
21
|
+
import colorama
|
22
|
+
import psutil
|
23
|
+
|
24
|
+
from konduktor import constants, logging
|
25
|
+
from konduktor.utils import exceptions, log_utils, ux_utils
|
26
|
+
|
27
|
+
# Module-level logger obtained from konduktor's logging helpers.
logger = logging.get_logger(__name__)

# Flipped to True after the low file-descriptor-limit warning has been logged,
# so get_max_workers_for_file_mounts() emits that warning at most once.
_fd_limit_warning_shown = False
|
30
|
+
|
31
|
+
|
32
|
+
def run(cmd, **kwargs):
    """Run `cmd` through `subprocess.run` with shell-friendly defaults.

    Caution: the spawned child may keep running in the background after the
    current program is killed. Use `log_utils.run_with_log` to avoid that.

    Args:
        cmd: Command forwarded unchanged to `subprocess.run`.
        **kwargs: Extra `subprocess.run` options. `shell` and `check` default
            to True. `executable` defaults to '/bin/bash' and is replaced by
            None whenever `shell` is falsy.

    Returns:
        Whatever `subprocess.run` returns for the command.
    """
    use_shell = kwargs.pop('shell', True)
    must_succeed = kwargs.pop('check', True)
    # Popped unconditionally so the key is never forwarded twice via **kwargs.
    shell_binary = kwargs.pop('executable', '/bin/bash')
    return subprocess.run(
        cmd,
        shell=use_shell,
        check=must_succeed,
        executable=shell_binary if use_shell else None,
        **kwargs,
    )
|
44
|
+
|
45
|
+
|
46
|
+
def run_no_outputs(cmd, **kwargs):
    """Call `run` on `cmd` with stdout and stderr both sent to DEVNULL."""
    devnull = subprocess.DEVNULL
    return run(cmd, stdout=devnull, stderr=devnull, **kwargs)
|
48
|
+
|
49
|
+
|
50
|
+
def _get_thread_multiplier(cloud_str: Optional[str] = None) -> int:
    """Return the thread multiplier: 4 for Kubernetes, 1 for anything else.

    The comparison against 'kubernetes' is case-insensitive; None and the
    empty string yield 1.
    """
    on_kubernetes = bool(cloud_str) and cloud_str.lower() == 'kubernetes'
    return 4 if on_kubernetes else 1
|
55
|
+
|
56
|
+
|
57
|
+
def get_max_workers_for_file_mounts(
    common_file_mounts: Dict[str, str], cloud_str: Optional[str] = None
) -> int:
    """Choose how many parallel workers to use for file mounts.

    The result is bounded by this process's soft open-file limit (after
    holding back a reserve) and by `get_parallel_threads(cloud_str)`, and is
    never less than 1.

    Args:
        common_file_mounts: Mapping whose values are source paths. Values that
            are directories raise the per-worker descriptor estimate.
        cloud_str: Forwarded to `get_parallel_threads`.

    Returns:
        The number of workers to use.
    """
    global _fd_limit_warning_shown
    fd_limit = resource.getrlimit(resource.RLIMIT_NOFILE)[0]

    # Warn about a low limit, but only the first time it is seen.
    if not _fd_limit_warning_shown and fd_limit < 1024:
        logger.warning(
            f'Open file descriptor limit ({fd_limit}) is low. File sync to '
            'remote clusters may be slow. Consider increasing the limit using '
            '`ulimit -n <number>` or modifying system limits.'
        )
        _fd_limit_warning_shown = True

    # Each entry directly under a directory source is assumed to take 5 file
    # descriptors on average; 5 is also the floor for a single rsync.
    dir_costs = [
        len(os.listdir(src)) * 5
        for src in common_file_mounts.values()
        if os.path.isdir(src)
    ]
    fd_per_rsync = max([5, *dir_costs])

    # Hold some descriptors back for the system and other processes.
    fd_reserve = 100
    fd_bound = (fd_limit - fd_reserve) // fd_per_rsync

    # At least 1 worker, and never more than the parallel-thread cap.
    max_workers = min(max(fd_bound, 1), get_parallel_threads(cloud_str))
    logger.debug(f'Using {max_workers} workers for file mounts.')
    return max_workers
|
88
|
+
|
89
|
+
|
90
|
+
def get_parallel_threads(cloud_str: Optional[str] = None) -> int:
    """Return the number of threads to use for parallel execution.

    The base is `max(4, cpu_count - 1)`, with an unknown CPU count treated as
    1, scaled by `_get_thread_multiplier(cloud_str)`.

    Args:
        cloud_str: The cloud name, forwarded to `_get_thread_multiplier`.
    """
    cores = os.cpu_count() or 1
    base_threads = max(4, cores - 1)
    return base_threads * _get_thread_multiplier(cloud_str)
|
100
|
+
|
101
|
+
|
102
|
+
def run_in_parallel(
    func: Callable, args: List[Any], num_threads: Optional[int] = None
) -> List[Any]:
    """Apply `func` to every item of `args` using a thread pool.

    Args:
        func: The function to call on each argument.
        args: The arguments; each item is passed to `func` on its own.
        num_threads: Size of the thread pool. When None,
            get_parallel_threads() decides.

    Returns:
        The results of `func`, in the same order as `args`.

    Raises:
        Exception: The first exception encountered.
    """
    n_items = len(args)
    # Zero or one item: call inline and skip the pool entirely.
    if n_items < 2:
        return [func(args[0])] if n_items else []

    if num_threads is None:
        num_threads = get_parallel_threads()

    with pool.ThreadPool(processes=num_threads) as workers:
        # imap yields results in input order; drain it before the pool exits.
        return list(workers.imap(func, args))
|
131
|
+
|
132
|
+
|
133
|
+
def handle_returncode(
    returncode: int,
    command: str,
    error_msg: Union[str, Callable[[], str]],
    stderr: Optional[str] = None,
    stream_logs: bool = True,
) -> None:
    """Raise a CommandError when a command exited with a non-zero code.

    Args:
        returncode: The returncode of the command.
        command: The command that was run.
        error_msg: The error message, or a zero-argument callable producing
            it. The callable is only invoked on failure.
        stderr: The stderr of the command. Logged on failure when not None.
        stream_logs: Log stderr at error level when True, at debug level
            otherwise.

    Raises:
        exceptions.CommandError: If `returncode` is not 0.
    """
    if returncode == 0:
        return

    echo = logger.error if stream_logs else logger.debug
    if stderr is not None:
        echo(stderr)

    # Resolve a lazily-built message only now that it is needed.
    error_msg = error_msg() if callable(error_msg) else error_msg
    format_err_msg = f'{colorama.Fore.RED}{error_msg}{colorama.Style.RESET_ALL}'
    with ux_utils.print_exception_no_traceback():
        raise exceptions.CommandError(returncode, command, format_err_msg, stderr)
|
159
|
+
|
160
|
+
|
161
|
+
def kill_children_processes(
    parent_pids: Optional[Union[int, List[Optional[int]]]] = None, force: bool = False
) -> None:
    """Kill processes together with all of their descendants.

    The children have to be killed so that
    1. the underlying subprocess stops printing logs to the terminal after
       this program exits, and
    2. the underlying subprocess does not carry on (e.g. starting a cluster)
       while clean-up is in progress.

    Args:
        parent_pids: A PID, or a list of PIDs, whose processes are killed along
            with their descendants. PIDs are handled in list order, which
            guarantees the clean-up order and suppresses flaky errors. PIDs
            with no matching process are skipped. When None, only the
            descendants of the current process are killed.
        force: Send SIGKILL when True. Otherwise send SIGTERM for a graceful
            shutdown, escalating to a kill if the process has not exited
            within 10 seconds.
    """
    if isinstance(parent_pids, int):
        parent_pids = [parent_pids]

    def _stop(proc: psutil.Process):
        # Nothing to do for a process that is no longer running.
        if not proc.is_running():
            return
        logger.debug(f'Killing process {proc.pid}')
        send_signal = proc.kill if force else proc.terminate
        try:
            send_signal()
            proc.wait(timeout=10)
        except psutil.NoSuchProcess:
            # The process may have already been terminated.
            pass
        except psutil.TimeoutExpired:
            logger.debug(f'Process {proc.pid} did not terminate after 10 seconds')
            # Escalate to a force kill if the graceful attempt timed out.
            if not force:
                logger.debug(f'Force killing process {proc.pid}')
                proc.kill()
                proc.wait(timeout=5)  # Shorter timeout after force kill

    if parent_pids is None:
        roots = [psutil.Process()]
    else:
        roots = []
        for pid in parent_pids:
            try:
                roots.append(psutil.Process(pid))
            except psutil.NoSuchProcess:
                continue

    # Explicitly requested parents are killed too; the implicit current
    # process is left alone.
    kill_roots = parent_pids is not None
    for root in roots:
        # Collect descendants before the parent is signalled.
        child_processes = root.children(recursive=True)
        if kill_roots:
            _stop(root)
        logger.debug(f'Killing child processes: {child_processes}')
        for child in child_processes:
            _stop(child)
|
223
|
+
|
224
|
+
|
225
|
+
def kill_process_daemon(process_pid: int) -> None:
    """Start a daemon as a safety net to kill the process.

    The daemon is launched as a separate Python process and receives the
    current PID, `process_pid`, and the descendants of `process_pid` that
    exist at call time. This function does not wait for it.

    Args:
        process_pid: The PID of the process to kill.
    """
    # Snapshot the descendants now; an already-gone process has none.
    try:
        initial_children = [
            child.pid
            for child in psutil.Process(process_pid).children(recursive=True)
        ]
    except psutil.NoSuchProcess:
        initial_children = []

    # The daemon script is looked up in the directory holding log_utils.
    # NOTE(review): confirm subprocess_daemon.py is actually shipped in that
    # directory; output is suppressed below, so a missing script fails silently.
    script_dir = os.path.dirname(os.path.abspath(log_utils.__file__))
    daemon_script = os.path.join(script_dir, 'subprocess_daemon.py')

    python_path = subprocess.check_output(
        constants.GET_PYTHON_PATH_CMD,
        shell=True,
        stderr=subprocess.DEVNULL,
        encoding='utf-8',
    ).strip()

    daemon_cmd = [python_path, daemon_script]
    daemon_cmd += ['--parent-pid', str(os.getpid())]
    daemon_cmd += ['--proc-pid', str(process_pid)]
    # We pass the initial children list to avoid the race condition where
    # the process_pid is terminated before the daemon starts and gets the
    # children list.
    daemon_cmd += ['--initial-children', ','.join(map(str, initial_children))]

    # `start_new_session=True` is not needed here: per the upstream comment,
    # the daemon script detaches itself from the parent with fork to avoid
    # being killed by the parent process (upstream reference:
    # `sky/skylet/subprocess_daemon.py`).
    subprocess.Popen(
        daemon_cmd,
        # Disable input
        stdin=subprocess.DEVNULL,
        # Suppress output
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
    )
|
@@ -0,0 +1,216 @@
|
|
1
|
+
"""Utility functions for UX."""
|
2
|
+
|
3
|
+
import contextlib
|
4
|
+
import enum
|
5
|
+
import os
|
6
|
+
import sys
|
7
|
+
import traceback
|
8
|
+
import typing
|
9
|
+
from typing import Callable, Optional, Union
|
10
|
+
|
11
|
+
import colorama
|
12
|
+
import rich.console as rich_console
|
13
|
+
|
14
|
+
from konduktor import logging as konduktor_logging
|
15
|
+
|
16
|
+
if typing.TYPE_CHECKING:
|
17
|
+
import pathlib
|
18
|
+
|
19
|
+
# Shared rich console; console_newline() prints through it.
console = rich_console.Console()

# Dimmed tree-drawing prefixes for indented hint lines.
INDENT_SYMBOL = f'{colorama.Style.DIM}├── {colorama.Style.RESET_ALL}'
INDENT_LAST_SYMBOL = f'{colorama.Style.DIM}└── {colorama.Style.RESET_ALL}'

# Console formatting constants
# NOTE: '\033[0m' is the ANSI reset-all sequence, so RESET_BOLD clears every
# active attribute, not only bold.
BOLD = '\033[1m'
RESET_BOLD = '\033[0m'

# Log path hint in the spinner during launching. `{log_path}` is left as a
# str.format placeholder (escaped braces) and filled in by log_path_hint().
_LOG_PATH_HINT = (
    f'{colorama.Style.DIM}View logs at: {{log_path}}' f'{colorama.Style.RESET_ALL}'
)
|
32
|
+
|
33
|
+
|
34
|
+
def console_newline():
    """Emit an empty line through the module's rich console.

    Handy when an exception is caught inside console.status().
    """
    console.print()
|
40
|
+
|
41
|
+
|
42
|
+
@contextlib.contextmanager
def print_exception_no_traceback():
    """Context manager that sets `sys.tracebacklimit` to 0 for its body.

    Mainly for UX: user-facing errors, e.g., ValueError, should suppress long
    tracebacks.

    If the KONDUKTOR_DEBUG environment variable is set to a non-empty value,
    this context manager is a no-op and the full traceback will be shown.

    When the body finishes normally the previous limit is put back (1000 if it
    was unset). When the body raises, the restore step is not reached, so the
    limit stays at 0 while the exception propagates.

    Example usage:

        with print_exception_no_traceback():
            if error():
                raise ValueError('...')
    """
    if os.environ.get('KONDUKTOR_DEBUG'):
        # Debug mode: leave the traceback limit untouched.
        yield
        return

    saved_limit = getattr(sys, 'tracebacklimit', 1000)
    sys.tracebacklimit = 0
    yield
    sys.tracebacklimit = saved_limit
|
66
|
+
|
67
|
+
|
68
|
+
@contextlib.contextmanager
def enable_traceback():
    """Temporarily revert the effect of print_exception_no_traceback().

    Sets `sys.tracebacklimit` to 1000 for the duration of the body so that a
    full traceback can be collected (used by usage_lib), then puts the
    previous limit back (1000 if it was unset).

    The restore runs in a `finally` clause, so the temporary override does not
    leak into the rest of the program when the body raises.
    """
    original_tracelimit = getattr(sys, 'tracebacklimit', 1000)
    sys.tracebacklimit = 1000
    try:
        yield
    finally:
        sys.tracebacklimit = original_tracelimit
|
78
|
+
|
79
|
+
|
80
|
+
class RedirectOutputForProcess:
    """Wraps a function so that its stdout and stderr go to a file.

    Intended as the target of a multiprocessing.Process.
    Example usage:

        p = multiprocessing.Process(
            target=RedirectOutputForProcess(func, file_name).run, args=...)

    This is equal to:

        p = multiprocessing.Process(target=func, args=...)

    Plus redirect all stdout/stderr to file_name.

    Note that `run` rebinds `sys.stdout` and `sys.stderr` and does not put the
    previous streams back.
    """

    def __init__(self, func: Callable, file: str, mode: str = 'w') -> None:
        # func: callable to execute; file/mode: arguments for open() in run().
        self.func, self.file, self.mode = func, file, mode

    def run(self, *args, **kwargs):
        """Call `self.func(*args, **kwargs)` with output sent to `self.file`.

        Any exception from the function is logged, with its traceback, and
        then re-raised.
        """
        with open(self.file, self.mode, encoding='utf-8') as f:
            sys.stdout = sys.stderr = f
            # The logger was set up against the previous stdout/stderr, so it
            # has to be reloaded to pick up the new streams.
            konduktor_logging.reload_logger()
            logger = konduktor_logging.init_logger(__name__)
            # The subprocess_util.run('konduktor status') inside
            # konduktor.execution::_execute cannot be redirected, since we
            # cannot directly operate on the stdout/stderr of the subprocess:
            # some code in konduktor specifies the subprocess's stdout/stderr
            # itself.
            try:
                self.func(*args, **kwargs)
            except Exception as e:  # pylint: disable=broad-except
                logger.error(f'Failed to run {self.func.__name__}. ' f'Details: {e}')
                with enable_traceback():
                    logger.error(f' Traceback:\n{traceback.format_exc()}')
                raise
|
121
|
+
|
122
|
+
|
123
|
+
def log_path_hint(log_path: Union[str, 'pathlib.Path']) -> str:
    """Gets the log path hint for the given log path.

    A leading home directory is abbreviated to '~' before the path is inserted
    into the `_LOG_PATH_HINT` template.

    Args:
        log_path: Path of the log file.

    Returns:
        The formatted hint string.
    """
    log_path = str(log_path)
    expanded_home = os.path.expanduser('~')
    # Abbreviate only when the home directory is a whole leading path
    # component. A bare prefix test would also match sibling directories and
    # turn '/home/alice2/x' into '~2/x' for home '/home/alice'.
    home_prefixes = tuple(
        expanded_home + sep for sep in (os.sep, os.altsep) if sep
    )
    if log_path == expanded_home or log_path.startswith(home_prefixes):
        log_path = '~' + log_path[len(expanded_home) :]
    return _LOG_PATH_HINT.format(log_path=log_path)
|
130
|
+
|
131
|
+
|
132
|
+
def starting_message(message: str) -> str:
    """Return `message` formatted as a "starting" line with a gear prefix."""
    # The style reset comes first: a previous dimmed spinner that overflowed
    # in a narrow terminal can otherwise leave the color messed up.
    decorated = f'{colorama.Style.RESET_ALL}⚙︎ {message}'
    return decorated
|
138
|
+
|
139
|
+
|
140
|
+
def finishing_message(
    message: str, log_path: Optional[Union[str, 'pathlib.Path']] = None
) -> str:
    """Return `message` formatted as a green success line.

    Args:
        message: Text to show after the check mark.
        log_path: Optional log location; when given, the log path hint is
            appended after the message.
    """
    # The style reset comes first: a previous dimmed spinner that overflowed
    # in a narrow terminal can otherwise leave the color messed up.
    success_prefix = (
        f'{colorama.Style.RESET_ALL}{colorama.Fore.GREEN}✓ '
        f'{message}{colorama.Style.RESET_ALL}'
    )
    if log_path is not None:
        path_hint = log_path_hint(log_path)
        return f'{success_prefix} {path_hint}'
    return success_prefix
|
155
|
+
|
156
|
+
|
157
|
+
def error_message(
    message: str, log_path: Optional[Union[str, 'pathlib.Path']] = None
) -> str:
    """Return `message` formatted as an error line with a red cross prefix.

    Args:
        message: Text to show after the cross.
        log_path: Optional log location; when given, the log path hint is
            appended after the message.
    """
    # The style reset comes first: a previous dimmed spinner that overflowed
    # in a narrow terminal can otherwise leave the color messed up.
    error_prefix = (
        f'{colorama.Style.RESET_ALL}{colorama.Fore.RED}⨯'
        f'{colorama.Style.RESET_ALL} {message}'
    )
    if log_path is not None:
        path_hint = log_path_hint(log_path)
        return f'{error_prefix} {path_hint}'
    return error_prefix
|
172
|
+
|
173
|
+
|
174
|
+
def retry_message(message: str) -> str:
    """Return `message` formatted as a retry line with a yellow arrow prefix."""
    # The style reset comes first: a previous dimmed spinner that overflowed
    # in a narrow terminal can otherwise leave the color messed up.
    decorated = (
        f'{colorama.Style.RESET_ALL}{colorama.Fore.YELLOW}↺'
        f'{colorama.Style.RESET_ALL} {message}'
    )
    return decorated
|
183
|
+
|
184
|
+
|
185
|
+
def spinner_message(
    message: str, log_path: Optional[Union[str, 'pathlib.Path']] = None
) -> str:
    """Return the spinner text for `message`, wrapped in bold-cyan markup.

    Args:
        message: Text to show in the spinner.
        log_path: Optional log location; when given, the log path hint is
            appended after the message.
    """
    colored_spinner = f'[bold cyan]{message}[/]'
    if log_path is not None:
        path_hint = log_path_hint(log_path)
        return f'{colored_spinner} {path_hint}'
    return colored_spinner
|
194
|
+
|
195
|
+
|
196
|
+
class CommandHintType(enum.Enum):
    """Kinds of command hints that command_hint_messages() can render."""

    # Hints for working with a submitted job.
    JOB = 'JOB'
|
198
|
+
|
199
|
+
|
200
|
+
def command_hint_messages(hint_type: CommandHintType, job_id: str) -> str:
    """Build the "Useful Commands" hint text for a job.

    Args:
        hint_type: Which set of hints to render; only CommandHintType.JOB is
            handled.
        job_id: Job identifier substituted into the suggested commands.

    Returns:
        A multi-line string listing the cancel, log-streaming and status
        commands.

    Raises:
        ValueError: If `hint_type` is not CommandHintType.JOB.
    """
    if hint_type != CommandHintType.JOB:
        raise ValueError(f'Invalid hint type: {hint_type}')

    job_hint_str = (
        f'\nJob ID: {job_id}'
        f'\n{INDENT_SYMBOL}To cancel the job:\t\t'
        f'{BOLD}konduktor down {job_id} {RESET_BOLD}'
        f'\n{INDENT_SYMBOL}To stream job logs:\t\t'
        f'{BOLD}konduktor logs {job_id} {RESET_BOLD}'
        f'\n{INDENT_SYMBOL}To list all jobs:\t\t'
        f'{BOLD}konduktor status{RESET_BOLD}'
    )
    return ''.join(['\n📋 Useful Commands', f'{job_hint_str}'])
|
@@ -0,0 +1,20 @@
|
|
1
|
+
"""This module contains a custom validator for the JSON Schema specification.
|
2
|
+
|
3
|
+
The main motivation behind extending the existing JSON Schema validator is to
|
4
|
+
allow for case-insensitive enum matching since this is currently not supported
|
5
|
+
by the JSON Schema specification.
|
6
|
+
"""
|
7
|
+
|
8
|
+
import jsonschema
|
9
|
+
|
10
|
+
|
11
|
+
def case_insensitive_enum(validator, enums, instance, schema):
    """Validate that `instance` is one of `enums`, ignoring case for strings.

    Follows the jsonschema keyword-validator calling convention: it is a
    generator that yields a ValidationError on mismatch and nothing on
    success.

    Strings are compared case-insensitively. Non-string values (on either
    side) are compared as they are, so they no longer raise AttributeError
    from an unconditional `.lower()` call.

    Args:
        validator: The active validator instance (unused).
        enums: The allowed values from the schema.
        instance: The value being validated.
        schema: The enclosing schema (unused).

    Yields:
        jsonschema.ValidationError: If `instance` matches none of `enums`.
    """
    del validator, schema  # Unused.

    def _fold(value):
        # Only strings have a meaningful case-insensitive form.
        return value.lower() if isinstance(value, str) else value

    if _fold(instance) not in [_fold(candidate) for candidate in enums]:
        yield jsonschema.ValidationError(f'{instance!r} is not one of {enums!r}')
|
15
|
+
|
16
|
+
|
17
|
+
# Draft 7 validator class extended with the custom `case_insensitive_enum`
# keyword, which is handled by the function of the same name.
SchemaValidator = jsonschema.validators.extend(
    jsonschema.Draft7Validator,
    validators={'case_insensitive_enum': case_insensitive_enum},
)
|
@@ -75,7 +75,6 @@ The modifications are proprietary and subject to the terms of the Trainy Softwar
|
|
75
75
|
Copyright 2024 Trainy Inc.
|
76
76
|
|
77
77
|
Code is modified from https://github.com/skypilot-org/skypilot
|
78
|
-
Git Revision: 9e50959e03146b599d9d1b3646573c913ec95bac
|
79
78
|
|
80
79
|
The original files are licensed under the Apache License, Version 2.0 (the "License");
|
81
80
|
you may not use this file except in compliance with the License.
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: konduktor-nightly
|
3
|
-
Version: 0.1.0.
|
3
|
+
Version: 0.1.0.dev20250313070642
|
4
4
|
Summary: GPU Cluster Health Management
|
5
5
|
Author: Andrew Aikawa
|
6
6
|
Author-email: asai@berkeley.edu
|
@@ -12,7 +12,18 @@ Classifier: Programming Language :: Python :: 3.11
|
|
12
12
|
Classifier: Programming Language :: Python :: 3.12
|
13
13
|
Requires-Dist: click (>=8.1.7,<9.0.0)
|
14
14
|
Requires-Dist: colorama (>=0.4.6,<0.5.0)
|
15
|
+
Requires-Dist: google-api-python-client[gcp] (>=2.161.0,<3.0.0)
|
16
|
+
Requires-Dist: google-cloud-storage[gcp] (>=3.0.0,<4.0.0)
|
17
|
+
Requires-Dist: jinja2 (>=3.1.5,<4.0.0)
|
18
|
+
Requires-Dist: jsonschema (>=4.23.0,<5.0.0)
|
19
|
+
Requires-Dist: kr8s (>=0.20.1,<0.21.0)
|
15
20
|
Requires-Dist: kubernetes (>=30.1.0,<31.0.0)
|
21
|
+
Requires-Dist: posthog (>=3.7.4,<4.0.0)
|
22
|
+
Requires-Dist: prettytable (>=3.12.0,<4.0.0)
|
23
|
+
Requires-Dist: psutil (>=7.0.0,<8.0.0)
|
24
|
+
Requires-Dist: python-dotenv (>=1.0.1,<2.0.0)
|
25
|
+
Requires-Dist: rich (>=13.9.4,<14.0.0)
|
26
|
+
Requires-Dist: websockets (>=15.0.1,<16.0.0)
|
16
27
|
Description-Content-Type: text/markdown
|
17
28
|
|
18
29
|
|
@@ -39,7 +50,7 @@ Konduktor can be self-hosted and run on any certified Kubernetes distribution or
|
|
39
50
|
</p>
|
40
51
|
|
41
52
|
For ML researchers
|
42
|
-
-
|
53
|
+
- Konduktor CLI & SDK - user-friendly batch job framework, where users only need to specify the resource requirements of their job and a script to launch, making it simple to scale work across multiple nodes. Works with most ML application frameworks out of the box.
|
43
54
|
|
44
55
|
```
|
45
56
|
num_nodes: 100
|
@@ -0,0 +1,94 @@
|
|
1
|
+
konduktor/__init__.py,sha256=rTbCPyHL64EVG_QGzhnVHSKstM23_MXHWiLDtJZFz6k,1477
|
2
|
+
konduktor/adaptors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
3
|
+
konduktor/adaptors/common.py,sha256=mYb_6c3u5MghtiFfiW5OO-EH6t7cIR5npbkgUmz6FYE,3517
|
4
|
+
konduktor/adaptors/gcp.py,sha256=liCm4_D_qSci0DZA2t5bckLIoGDkJ8qx31EO_hSBzo0,3751
|
5
|
+
konduktor/backends/__init__.py,sha256=1Q6sqqdeMYarpTX_U-QVywJYf7idiUTRsyP-E4BQSOw,129
|
6
|
+
konduktor/backends/backend.py,sha256=qh0bp94lzoTYZkzyQv2-CVrB5l91FkG2vclXg24UFC0,2910
|
7
|
+
konduktor/backends/jobset.py,sha256=LNTBNKPpb1Z2LK50C7N6HD75TSkVSYleEIqrmxHypN8,8255
|
8
|
+
konduktor/backends/jobset_utils.py,sha256=-TBVffZDkuy4jtr7dzzbFg9iEYZGNfSTB60oOCiYm8Q,16637
|
9
|
+
konduktor/check.py,sha256=hIrxDMKaGX2eZP-Pj9TCymGUHQAp93m48Gj3XMiqadA,7833
|
10
|
+
konduktor/cli.py,sha256=90bnh3nIobfBkzqS_SXgw9Z8Zqh4ouwpLDj0kx_6kL8,23562
|
11
|
+
konduktor/cloud_stores.py,sha256=KX3u5YlXGslMCe_q8zYtFy62_KGCmmLTrYuK7Y9jFIM,6277
|
12
|
+
konduktor/config.py,sha256=J50JxC6MsXMnlrJPXdDUMr38C89xvOO7mR8KJ6fyils,15520
|
13
|
+
konduktor/constants.py,sha256=T3AeXXxuQHINW_bAWyztvDeS8r4g8kXBGIwIq13cys0,1814
|
14
|
+
konduktor/controller/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
15
|
+
konduktor/controller/constants.py,sha256=SGAgu9yTDWYXyVwxlaw1vfRJFOflPR549mKwgdzbI9w,1124
|
16
|
+
konduktor/controller/launch.py,sha256=F_ybOxscbsnmL1c7v5is6QrTDcaV26dq8CsSn4YdmTc,1522
|
17
|
+
konduktor/controller/node.py,sha256=9uKFtgxmonxtr-73DRAd7qJjHUjyfS1E4sgXT0agzPg,2982
|
18
|
+
konduktor/controller/parse.py,sha256=U1G747N6Hef2cEgXvoRI2V1NEU316VDDHLCY1rYtnNc,3840
|
19
|
+
konduktor/dashboard/README.md,sha256=xOeItNLb3t0k8AiiQcjqleRfrlcWGK-n6yJF-rLv4M0,718
|
20
|
+
konduktor/dashboard/backend/main.py,sha256=IOgTqOWLMuV2vAafbvyftk09d6y9O85etfPNUa4Beco,4860
|
21
|
+
konduktor/dashboard/backend/sockets.py,sha256=AdcnhCgTb6GvDi7_bVx-BhCFQ0-i972S1zWFIAT0iAM,4611
|
22
|
+
konduktor/dashboard/frontend/.eslintrc.json,sha256=FYmhXlX768ZRnJXZHY8KCQYY8g3HjkeYWLnCdVJISWE,40
|
23
|
+
konduktor/dashboard/frontend/.gitignore,sha256=835-G5A1Oha6e-yJeVhTEe5tADESCTkeWrRxYt9MqSI,391
|
24
|
+
konduktor/dashboard/frontend/app/api/jobs/route.js,sha256=DHXJtKeOB-V2jCk4UnLCjnqWmKsFtqGD6iq5zxkipE4,2310
|
25
|
+
konduktor/dashboard/frontend/app/api/namespaces/route.js,sha256=wEGWgpMYIk70pURjZiQRyr8Pb5HOCsCFWPNbdV3qbbs,2145
|
26
|
+
konduktor/dashboard/frontend/app/components/Grafana.jsx,sha256=R13dvp39vYL1sJVohQexARzV5T31ikhOx9nUVE9fm1o,2715
|
27
|
+
konduktor/dashboard/frontend/app/components/JobsData.jsx,sha256=7S9zlmswraT-gcvVSJXWe6buhlDGvsndfMRuaO9zMw8,5792
|
28
|
+
konduktor/dashboard/frontend/app/components/LogsData.jsx,sha256=VZ1Rh6WqmizDtZAh6R18-cW62USarpHmXNpFf7Q5IF0,5325
|
29
|
+
konduktor/dashboard/frontend/app/components/NavMenu.jsx,sha256=3b-s3rJjhDMql4rfEzzZV5GVuoTqKSlNAIlopP8-uEA,1374
|
30
|
+
konduktor/dashboard/frontend/app/components/NavTabs.jsx,sha256=eiITGvOOtQ4CjS1IV50ZcYy4mKetdJzczg5EYxkow44,1963
|
31
|
+
konduktor/dashboard/frontend/app/components/NavTabs2.jsx,sha256=M7sYE35oekrkTLgZR1yhJ8vP7VuQHhKsJYHOeqW3tVg,936
|
32
|
+
konduktor/dashboard/frontend/app/components/SelectBtn.jsx,sha256=TpdFzTB7xyUMZ5H9GlfLlymr8q-tEAtzWBn7f-yTqbI,692
|
33
|
+
konduktor/dashboard/frontend/app/components/lib/utils.js,sha256=uiUjJth_vpFE7EEGOEfRpQwWHGXIE0scEskJ0cwurlo,135
|
34
|
+
konduktor/dashboard/frontend/app/components/ui/chip-select.jsx,sha256=D0l8tN79PjRZzwhO82LDaEZlz7fCkDsQnE_06L-mEjk,2524
|
35
|
+
konduktor/dashboard/frontend/app/components/ui/input.jsx,sha256=lT3QQvcD1HuAraYd7AWfb8kMZhK5mMXPENLNcKhUwqA,699
|
36
|
+
konduktor/dashboard/frontend/app/components/ui/navigation-menu.jsx,sha256=8ektMrAc7jDsvix_kBA5oLKm35tUh2dYxqGlhEYcRKc,4326
|
37
|
+
konduktor/dashboard/frontend/app/components/ui/select.jsx,sha256=OCvRDP4nh3waRJRYGKE3A49Old9cthkW_mqND2rCmNo,4774
|
38
|
+
konduktor/dashboard/frontend/app/favicon.ico,sha256=K4rS0zRVqPc2_DqOv48L3qiEitTA20iigzvQ-c13WTI,25931
|
39
|
+
konduktor/dashboard/frontend/app/globals.css,sha256=BwtmaZmVRxKmxkz0zN7MXVB_nrw9ynlHhKiV9JiU4lI,2750
|
40
|
+
konduktor/dashboard/frontend/app/jobs/page.js,sha256=_dEIe1qREUR0KEsBFGqBlm5DG0J-2pMtVNM2soccrgU,188
|
41
|
+
konduktor/dashboard/frontend/app/layout.js,sha256=QCllsC6zUsS_tm95Lt0_QC1Yi9JK3Dg5QngI604rFLU,429
|
42
|
+
konduktor/dashboard/frontend/app/logs/page.js,sha256=gohgEZ5hSnqOQEzpaCnYxuh4l36M1mXRVuxfTxRA0_g,213
|
43
|
+
konduktor/dashboard/frontend/app/page.js,sha256=NTmwBsmsdJyB0yh-xBufy2a6X6BtYvAI2NuyaBp6MWE,209
|
44
|
+
konduktor/dashboard/frontend/jsconfig.json,sha256=nJtecz7VFXXo2ZENxlvZ57mUtCs8kyN7euRInr_srPw,73
|
45
|
+
konduktor/dashboard/frontend/next.config.mjs,sha256=HjIGyyAOJk_d2027Lkdy2uvIOf28lLeDFtdIkqDeb6I,92
|
46
|
+
konduktor/dashboard/frontend/package-lock.json,sha256=cMQEByVjrx1-FJgS8fC01JVHFMe7z_4OXWC0aOhKv7Y,245859
|
47
|
+
konduktor/dashboard/frontend/package.json,sha256=CDPlG00EUKJbDdqwf0QxKQ1J9Vn6MxuBYvRI5kywU-o,1016
|
48
|
+
konduktor/dashboard/frontend/postcss.config.mjs,sha256=rDHiqV72T-J860Ek4QFnUnMQeq4uM7xJCwHZGefwNkY,135
|
49
|
+
konduktor/dashboard/frontend/server.js,sha256=jcp6_Ww9YJD3uKY07jR3KMlAM6n1QZdxZnVY6Kh-J6A,1789
|
50
|
+
konduktor/dashboard/frontend/tailwind.config.js,sha256=fCnc48wvioIDOe5ldQ_6RE7F76cP7aU7pDrxBPJx-Fk,366
|
51
|
+
konduktor/data/__init__.py,sha256=KMR2i3E9YcIpiIuCxtRdS7BQ1w2vUAbbve7agziJrLo,213
|
52
|
+
konduktor/data/constants.py,sha256=tAYHrmzPCI2lwK_Fy7AfwE1RTcb38AYW99Zkw8fEtDM,299
|
53
|
+
konduktor/data/data_utils.py,sha256=aIv3q2H1GSiN2w8WNjZgVaglm-hoiHSb4KR-MAiKKXs,8383
|
54
|
+
konduktor/data/gcp/__init__.py,sha256=m40OpIx2UDGnREokqfQ52OsAfo9WXC748hF98YWyG-A,517
|
55
|
+
konduktor/data/gcp/constants.py,sha256=dMfOiFccM8O6rUi9kClJcbvw1K1VnS1JzzQk3apq8ho,1483
|
56
|
+
konduktor/data/gcp/gcs.py,sha256=wJri7wG7FJBQvEkAqWQsNHJMLWqMH2n10d8vyy0M_5o,38650
|
57
|
+
konduktor/data/gcp/utils.py,sha256=FJQcMXZqtMIzjZ98b3lTTc0UbdPUKTDLsOsfJaaH5-s,214
|
58
|
+
konduktor/data/storage.py,sha256=KppuO1bYSyMA7RSkmBbJnw8xwBPjKYQ8IbtxYv3abz4,34731
|
59
|
+
konduktor/data/storage_utils.py,sha256=v_OZyWEb9DmNeLxn_OEJLCzKa0Y3MxcKI5CmPL3jQ6I,19566
|
60
|
+
konduktor/execution.py,sha256=wwJUQJO3Rc0u0TiF78ilwsEw3a7gGa6uEwBULfeIzAQ,18403
|
61
|
+
konduktor/kube_client.py,sha256=Dhza1605wmS4TaFCrw63Y7nh3oSc2P3ekUE2-RI-Qlw,6155
|
62
|
+
konduktor/logging.py,sha256=mBCoCTNhDEkUxd4tsse4mw-aVzSGohhXYf16ViR0ch4,2722
|
63
|
+
konduktor/manifests/controller_deployment.yaml,sha256=6p3oSLkEVONZsvKZGqVop0Dhn4bo3lrigRmhf8NXBHE,1730
|
64
|
+
konduktor/manifests/dashboard_deployment.yaml,sha256=xJLd4FbPMAosI0fIv5_8y7dV9bw0Vsf81l-w4MB_aU8,2837
|
65
|
+
konduktor/manifests/dmesg_daemonset.yaml,sha256=pSWt7YOeTYjS0l0iki1fvHOs7MhY-sH-RQfVW6JJyno,1391
|
66
|
+
konduktor/manifests/pod_cleanup_controller.yaml,sha256=hziL1Ka1kCAEL9R7Tjvpb80iw1vcq9_3gwHCu75Bi0A,3939
|
67
|
+
konduktor/resource.py,sha256=68z8gC8Ivqktwv0R6ylMn9ZNocgkcRT0yIRGGKOdwcM,18491
|
68
|
+
konduktor/task.py,sha256=edHgMLYECGux6WLCilqsNZNYr3dEcw_miWvu4FYpu5U,34713
|
69
|
+
konduktor/templates/jobset.yaml.j2,sha256=NevmZYDUBQbzVHiQ6EzlWX8FzdHLcz1bcLxOvD03PKQ,940
|
70
|
+
konduktor/templates/pod.yaml.j2,sha256=zrYwxTyAFmjh6NtMmiGaOZBFwqCBZW2dRex4RpLh4iE,8142
|
71
|
+
konduktor/usage/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
72
|
+
konduktor/usage/constants.py,sha256=gCL8afIHZhO0dcxbJGpESE9sCC1cBSbeRnQ8GwNOY4M,612
|
73
|
+
konduktor/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
74
|
+
konduktor/utils/accelerator_registry.py,sha256=-qHLD_ecklhexYHgzGhPYGvWmIHohWQgeciwZ-NfJ3g,538
|
75
|
+
konduktor/utils/annotations.py,sha256=oy2-BLydkFt3KWkXDuaGY84d6b7iISuy4eAT9uXk0Fc,2225
|
76
|
+
konduktor/utils/base64_utils.py,sha256=mF-Tw98mFRG70YE4w6s9feuQSCYZHOb8YatBZwMugyI,3130
|
77
|
+
konduktor/utils/common_utils.py,sha256=1_j-nRikKmTnB8BFE0xQb7LquKVAOLaJnsy4LxZlNbI,13869
|
78
|
+
konduktor/utils/constants.py,sha256=1DneiTR21lvKUcWdBGwC4I4fD4uPjbjLUilEnJS7rzA,216
|
79
|
+
konduktor/utils/env_options.py,sha256=T41Slzf4Mzl-n45CGXXqdy2fCrYhPNZQ7RP5vmnN4xc,2258
|
80
|
+
konduktor/utils/exceptions.py,sha256=GBOFIkk9nikqWGR0FXGXOWVVImoH7nWnMl_L3Oux3fo,6581
|
81
|
+
konduktor/utils/kubernetes_enums.py,sha256=SabUueF6Bpzbpa57gyH5VB65xla2N9l8CZmAeYTfGmM,176
|
82
|
+
konduktor/utils/kubernetes_utils.py,sha256=O1DmScyGIv0goNrti0IwYu-nyRrDKubwEyn6yiiJ0Tg,23492
|
83
|
+
konduktor/utils/log_utils.py,sha256=lgHCq4OdtJNfbpso-uYGONUCVNsUrUkUWjROarsHt6s,9897
|
84
|
+
konduktor/utils/loki_utils.py,sha256=SrRwTYHWGfiqqufY2XcKk4imgdxUCFBZ5oxyhCEJI0Y,3221
|
85
|
+
konduktor/utils/rich_utils.py,sha256=kdjNe6S2LlpOxyzhFHqMzCz7g4ROC4e7TPWgcbRsrQE,3577
|
86
|
+
konduktor/utils/schemas.py,sha256=4Goihc-NpFQpiJ7RSiKirAIPNWqw_DV_TRqVwejqTDY,17479
|
87
|
+
konduktor/utils/subprocess_utils.py,sha256=WoFkoFhGecPR8-rF8WJxbIe-YtV94LXz9UG64SDhCY4,9448
|
88
|
+
konduktor/utils/ux_utils.py,sha256=NPNu3Igu2Z9Oq77ghJhy_fIxQZTXWr9BtKyxN3Wslzo,7164
|
89
|
+
konduktor/utils/validator.py,sha256=tgBghVyedyzGx84-U2Qfoh_cJBE3oUk9gclMW90ORks,691
|
90
|
+
konduktor_nightly-0.1.0.dev20250313070642.dist-info/LICENSE,sha256=MuuqTZbHvmqXR_aNKAXzggdV45ANd3wQ5YI7tnpZhm0,6586
|
91
|
+
konduktor_nightly-0.1.0.dev20250313070642.dist-info/METADATA,sha256=psJZae90v2J35ilwCIwZwZWeeEB7K5OyLFcDgA5l_j8,4070
|
92
|
+
konduktor_nightly-0.1.0.dev20250313070642.dist-info/WHEEL,sha256=FMvqSimYX_P7y0a7UY-_Mc83r5zkBZsCYPm7Lr0Bsq4,88
|
93
|
+
konduktor_nightly-0.1.0.dev20250313070642.dist-info/entry_points.txt,sha256=k3nG5wDFIJhNqsZWrHk4d0irIB2Ns9s47cjRWYsTCT8,48
|
94
|
+
konduktor_nightly-0.1.0.dev20250313070642.dist-info/RECORD,,
|