konduktor-nightly 0.1.0.dev20250209104336__py3-none-any.whl → 0.1.0.dev20250313070642__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62) hide show
  1. konduktor/__init__.py +16 -6
  2. konduktor/adaptors/__init__.py +0 -0
  3. konduktor/adaptors/common.py +88 -0
  4. konduktor/adaptors/gcp.py +112 -0
  5. konduktor/backends/__init__.py +8 -0
  6. konduktor/backends/backend.py +86 -0
  7. konduktor/backends/jobset.py +218 -0
  8. konduktor/backends/jobset_utils.py +447 -0
  9. konduktor/check.py +192 -0
  10. konduktor/cli.py +790 -0
  11. konduktor/cloud_stores.py +158 -0
  12. konduktor/config.py +420 -0
  13. konduktor/constants.py +36 -0
  14. konduktor/controller/constants.py +6 -6
  15. konduktor/controller/launch.py +3 -3
  16. konduktor/controller/node.py +5 -5
  17. konduktor/controller/parse.py +23 -23
  18. konduktor/dashboard/backend/main.py +57 -57
  19. konduktor/dashboard/backend/sockets.py +19 -19
  20. konduktor/data/__init__.py +9 -0
  21. konduktor/data/constants.py +12 -0
  22. konduktor/data/data_utils.py +223 -0
  23. konduktor/data/gcp/__init__.py +19 -0
  24. konduktor/data/gcp/constants.py +42 -0
  25. konduktor/data/gcp/gcs.py +906 -0
  26. konduktor/data/gcp/utils.py +9 -0
  27. konduktor/data/storage.py +799 -0
  28. konduktor/data/storage_utils.py +500 -0
  29. konduktor/execution.py +444 -0
  30. konduktor/kube_client.py +153 -48
  31. konduktor/logging.py +49 -5
  32. konduktor/manifests/dmesg_daemonset.yaml +8 -0
  33. konduktor/manifests/pod_cleanup_controller.yaml +129 -0
  34. konduktor/resource.py +478 -0
  35. konduktor/task.py +867 -0
  36. konduktor/templates/jobset.yaml.j2 +31 -0
  37. konduktor/templates/pod.yaml.j2 +185 -0
  38. konduktor/usage/__init__.py +0 -0
  39. konduktor/usage/constants.py +21 -0
  40. konduktor/utils/__init__.py +0 -0
  41. konduktor/utils/accelerator_registry.py +21 -0
  42. konduktor/utils/annotations.py +62 -0
  43. konduktor/utils/base64_utils.py +93 -0
  44. konduktor/utils/common_utils.py +393 -0
  45. konduktor/utils/constants.py +5 -0
  46. konduktor/utils/env_options.py +55 -0
  47. konduktor/utils/exceptions.py +226 -0
  48. konduktor/utils/kubernetes_enums.py +8 -0
  49. konduktor/utils/kubernetes_utils.py +652 -0
  50. konduktor/utils/log_utils.py +251 -0
  51. konduktor/utils/loki_utils.py +85 -0
  52. konduktor/utils/rich_utils.py +123 -0
  53. konduktor/utils/schemas.py +581 -0
  54. konduktor/utils/subprocess_utils.py +273 -0
  55. konduktor/utils/ux_utils.py +216 -0
  56. konduktor/utils/validator.py +20 -0
  57. {konduktor_nightly-0.1.0.dev20250209104336.dist-info → konduktor_nightly-0.1.0.dev20250313070642.dist-info}/LICENSE +0 -1
  58. {konduktor_nightly-0.1.0.dev20250209104336.dist-info → konduktor_nightly-0.1.0.dev20250313070642.dist-info}/METADATA +13 -2
  59. konduktor_nightly-0.1.0.dev20250313070642.dist-info/RECORD +94 -0
  60. konduktor_nightly-0.1.0.dev20250209104336.dist-info/RECORD +0 -48
  61. {konduktor_nightly-0.1.0.dev20250209104336.dist-info → konduktor_nightly-0.1.0.dev20250313070642.dist-info}/WHEEL +0 -0
  62. {konduktor_nightly-0.1.0.dev20250209104336.dist-info → konduktor_nightly-0.1.0.dev20250313070642.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,251 @@
1
+ # Proprietary Changes made for Trainy under the Trainy Software License
2
+ # Original source: skypilot: https://github.com/skypilot-org/skypilot
3
+ # which is Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ # http://www.apache.org/licenses/LICENSE-2.0
7
+ # Unless required by applicable law or agreed to in writing, software
8
+ # distributed under the License is distributed on an "AS IS" BASIS,
9
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10
+ # See the License for the specific language governing permissions and
11
+ # limitations under the License.
12
+
13
+ import copy
14
+ import io
15
+ import multiprocessing
16
+ import os
17
+ import subprocess
18
+ import sys
19
+ import types
20
+ from typing import List, Optional, Tuple, Type, Union
21
+
22
+ import prettytable
23
+
24
+ from konduktor.utils import subprocess_utils
25
+
26
+
27
class LineProcessor:
    """Base hook for consuming log lines one at a time.

    Instances are used as context managers wrapped around a line-reading
    loop. This base implementation does nothing with the lines it is
    given; subclasses override the hooks they care about.
    """

    def __enter__(self) -> None:
        """Enter the processing context; nothing is set up or returned."""
        return None

    def process_line(self, log_line: str) -> None:
        """Consume one log line. The base class simply discards it."""
        del log_line  # unused
        return None

    def __exit__(
        self,
        except_type: Optional[Type[BaseException]],
        except_value: Optional[BaseException],
        traceback: Optional[types.TracebackType],
    ) -> None:
        """Leave the processing context.

        Returns None, so an exception raised inside the ``with`` body is
        not suppressed.
        """
        return None
44
+
45
+
46
class _ProcessingArgs:
    """Option bundle describing how a subprocess stream is processed.

    Attributes:
        log_path: File that every kept line is appended to.
        stream_logs: Whether lines are also echoed to the output stream.
        start_streaming_at: Echoing begins once a line contains this text.
        end_streaming_at: Echoing stops once a line contains this text
            (None disables the check).
        skip_lines: Substrings; a line containing any of them is dropped.
        replace_crlf: Whether a trailing CRLF is rewritten to LF.
        line_processor: Optional per-line hook.
        streaming_prefix: Optional text prepended to each echoed line.
    """

    def __init__(
        self,
        log_path: str,
        stream_logs: bool,
        start_streaming_at: str = '',
        end_streaming_at: Optional[str] = None,
        skip_lines: Optional[List[str]] = None,
        replace_crlf: bool = False,
        line_processor: Optional[LineProcessor] = None,
        streaming_prefix: Optional[str] = None,
    ) -> None:
        # Where the output goes.
        self.log_path, self.stream_logs = log_path, stream_logs
        # Window of the output that is echoed to the console.
        self.start_streaming_at, self.end_streaming_at = (
            start_streaming_at,
            end_streaming_at,
        )
        # Per-line filtering and rewriting.
        self.skip_lines, self.replace_crlf = skip_lines, replace_crlf
        # Hooks and decoration.
        self.line_processor, self.streaming_prefix = (
            line_processor,
            streaming_prefix,
        )
68
+
69
+
70
def _handle_io_stream(io_stream, out_stream, args: _ProcessingArgs):
    """Drain one pipe of a process, line by line.

    Each line that survives filtering is appended to ``args.log_path``,
    handed to the line processor, optionally echoed to ``out_stream``, and
    collected for the return value.

    Args:
        io_stream: Binary stream to read from (wrapped as UTF-8 text).
        out_stream: Text stream that echoed lines are printed to.
        args: Processing options.

    Returns:
        All kept lines concatenated into a single string.
    """
    reader = io.TextIOWrapper(
        io_stream, encoding='utf-8', newline='', errors='replace', write_through=True
    )
    prefix = args.streaming_prefix or ''
    processor = args.line_processor
    if processor is None:
        processor = LineProcessor()

    started = False
    ended = False
    collected = []
    with open(args.log_path, 'a', encoding='utf-8') as log_file, processor:
        # readline() returns '' only at end of stream.
        for line in iter(reader.readline, ''):
            # start_streaming_at logic in processor.process_line(line)
            if args.replace_crlf and line[-2:] == '\r\n':
                # Replace CRLF with LF to avoid ray logging to the same
                # line due to separating lines with '\n'.
                line = line[:-2] + '\n'
            if args.skip_lines is not None:
                if any(pattern in line for pattern in args.skip_lines):
                    continue
            if args.start_streaming_at in line:
                started = True
            if args.end_streaming_at is not None and args.end_streaming_at in line:
                # Keep executing the loop, only stop streaming.
                # E.g., this is used for `sky bench` to hide the
                # redundant messages of `sky launch` while
                # saving them in log files.
                ended = True
            if args.stream_logs and started and not ended:
                print(prefix + line, end='', file=out_stream, flush=True)
            if args.log_path != '/dev/null':
                log_file.write(line)
                log_file.flush()
            processor.process_line(line)
            collected.append(line)
    return ''.join(collected)
115
+
116
+
117
def process_subprocess_stream(proc, args: _ProcessingArgs) -> Tuple[str, str]:
    """Redirect the process's filtered stdout/stderr to both stream and file.

    Args:
        proc: A started process object exposing ``stdout`` and ``stderr``
            pipes (``stderr`` may be None when it is merged into stdout).
        args: Processing options shared by both streams. The stderr stream
            uses a shallow copy with ``line_processor`` cleared.

    Returns:
        A ``(stdout, stderr)`` tuple of the decoded, filtered output.
        ``stderr`` is an empty string when the process has no stderr pipe.
    """
    # `import multiprocessing` alone does not load the `pool` submodule, so
    # `multiprocessing.pool` is only available if some other module happened
    # to import it first. Import it explicitly here.
    from multiprocessing import pool as mp_pool  # pylint: disable=import-outside-toplevel

    if proc.stderr is None:
        return _handle_io_stream(proc.stdout, sys.stdout, args), ''

    # Asyncio does not work as the output processing can be executed in a
    # different thread.
    # selectors is possible to handle the multiplexing of stdout/stderr,
    # but it introduces buffering making the output not streaming.
    with mp_pool.ThreadPool(processes=1) as workers:
        err_args = copy.copy(args)
        err_args.line_processor = None
        stderr_fut = workers.apply_async(
            _handle_io_stream, args=(proc.stderr, sys.stderr, err_args)
        )
        # Do not launch a thread for stdout as the rich.status does not
        # work in a thread, which is used in
        # log_utils.RayUpLineProcessor.
        stdout = _handle_io_stream(proc.stdout, sys.stdout, args)
        stderr = stderr_fut.get()
    return stdout, stderr
139
+
140
+
141
def create_table(field_names: List[str], **kwargs) -> prettytable.PrettyTable:
    """Build a PrettyTable using this project's default look.

    Unless overridden through ``kwargs``, the table has no border and is
    left-aligned. Padding is always set to 0 on the left and 2 on the right.

    Args:
        field_names: Column headers.
        **kwargs: Extra keyword arguments forwarded to ``PrettyTable``.

    Returns:
        The configured table.
    """
    # Fill in the defaults only where the caller did not choose a value.
    kwargs.setdefault('border', False)
    kwargs.setdefault('align', 'l')
    styled = prettytable.PrettyTable(field_names=field_names, **kwargs)
    styled.left_padding_width, styled.right_padding_width = 0, 2
    return styled
151
+
152
+
153
def run_with_log(
    cmd: Union[List[str], str],
    log_path: str,
    *,
    require_outputs: bool = False,
    stream_logs: bool = False,
    start_streaming_at: str = '',
    end_streaming_at: Optional[str] = None,
    skip_lines: Optional[List[str]] = None,
    shell: bool = False,
    with_ray: bool = False,
    process_stream: bool = True,
    line_processor: Optional[LineProcessor] = None,
    streaming_prefix: Optional[str] = None,
    **kwargs,
) -> Union[int, Tuple[int, str, str]]:
    """Runs a command and logs its output to a file.

    Args:
        cmd: The command to run.
        log_path: The path to the log file. ``~`` is expanded and the parent
            directory is created if needed.
        require_outputs: Whether to return the stdout/stderr of the command.
        stream_logs: Whether to stream the logs to stdout/stderr.
        start_streaming_at: Streaming starts once a line contains this text.
        end_streaming_at: Streaming stops once a line contains this text;
            lines are still written to the log file.
        skip_lines: Substrings; any line containing one of them is dropped.
            The caller's list is not modified.
        shell: Passed through to ``subprocess.Popen``.
        with_ray: When set, stderr is merged into stdout to preserve
            ordering, and trailing CRLF is rewritten to LF.
        process_stream: Whether to post-process the stdout/stderr of the
            command, such as replacing or skipping lines on the fly. If
            enabled, lines are printed only when a carriage return or a
            newline is found.
        line_processor: Optional per-line hook applied to stdout.
        streaming_prefix: Optional text prepended to each streamed line.
        **kwargs: Extra keyword arguments for ``subprocess.Popen``. ``stdin``
            defaults to ``subprocess.DEVNULL``.

    Returns:
        The returncode, or ``(returncode, stdout, stderr)`` when
        ``require_outputs`` is set. stdout and stderr are already decoded.

    Raises:
        AssertionError: If ``require_outputs`` is set while
            ``process_stream`` is disabled.
        KeyboardInterrupt: Re-raised after child processes are killed.
    """
    assert process_stream or not require_outputs, (
        process_stream,
        require_outputs,
        'require_outputs should be False when process_stream is False',
    )

    log_path = os.path.expanduser(log_path)
    dirname = os.path.dirname(log_path)
    # A bare file name has no directory part; os.makedirs('') would raise.
    if dirname:
        os.makedirs(dirname, exist_ok=True)
    # Redirect stderr to stdout when using ray, to preserve the order of
    # stdout and stderr.
    stdout_arg = stderr_arg = None
    if process_stream:
        stdout_arg = subprocess.PIPE
        stderr_arg = subprocess.PIPE if not with_ray else subprocess.STDOUT
    # Use stdin=subprocess.DEVNULL by default, as allowing inputs will mess up
    # the terminal output when typing in the terminal that starts the API
    # server.
    stdin = kwargs.pop('stdin', subprocess.DEVNULL)
    with subprocess.Popen(
        cmd,
        stdout=stdout_arg,
        stderr=stderr_arg,
        start_new_session=True,
        shell=shell,
        stdin=stdin,
        **kwargs,
    ) as proc:
        try:
            subprocess_utils.kill_process_daemon(proc.pid)
            stdout = ''
            stderr = ''

            if process_stream:
                # Build a fresh list so the caller's `skip_lines` is never
                # extended in place.
                # Skip these lines caused by `-i` option of bash. Failed to
                # find other way to turn off these two warning.
                # https://stackoverflow.com/questions/13300764/how-to-tell-bash-not-to-issue-warnings-cannot-set-terminal-process-group-and # noqa: E501
                # `ssh -T -i -tt` still cause the problem.
                effective_skip_lines = list(skip_lines or []) + [
                    'bash: cannot set terminal process group',
                    'bash: no job control in this shell',
                ]
                # We need this even if the log_path is '/dev/null' to ensure the
                # progress bar is shown.
                # NOTE: Lines are printed only when a carriage return or a
                # newline is found.
                args = _ProcessingArgs(
                    log_path=log_path,
                    stream_logs=stream_logs,
                    start_streaming_at=start_streaming_at,
                    end_streaming_at=end_streaming_at,
                    skip_lines=effective_skip_lines,
                    line_processor=line_processor,
                    # Replace CRLF when the output is logged to driver by ray.
                    replace_crlf=with_ray,
                    streaming_prefix=streaming_prefix,
                )
                stdout, stderr = process_subprocess_stream(proc, args)
            proc.wait()
            if require_outputs:
                return proc.returncode, stdout, stderr
            return proc.returncode
        except KeyboardInterrupt:
            # Kill the subprocess directly, otherwise, the underlying
            # process will only be killed after the python program exits,
            # causing the stream handling stuck at `readline`.
            subprocess_utils.kill_children_processes()
            raise
@@ -0,0 +1,85 @@
1
+ """Loki utils: query/tail logs from Loki"""
2
+ # TODO(asaiacai): eventually support querying
3
+ # centralized loki that lives outside the cluster
4
+
5
+ import asyncio
6
+ import json
7
+ import urllib.parse
8
+
9
+ import colorama
10
+ import kr8s
11
+ import websockets
12
+
13
+ from konduktor import logging
14
+
15
# Module-level logger obtained from konduktor's logging helper.
logger = logging.get_logger(__name__)

# Port used for both the local port-forward and the Loki websocket URL.
LOKI_PORT = 3100
# Timeout for opening the websocket; also used as the per-message receive
# timeout when logs are not being followed.
WEBSOCKET_TIMEOUT = 10
# Very large receive timeout used when following logs.
INFINITY = 999999
20
+
21
+
22
async def _read_loki_logs(loki_url: str, timeout: int, job_name: str, worker_id: int):
    """Tail a Loki websocket and print every received log line.

    Args:
        loki_url: Full websocket URL of the Loki tail endpoint.
        timeout: Maximum wait for each message before the tail is considered
            finished.
        job_name: Job name shown in the prefix of each printed line.
        worker_id: Worker index shown in the prefix of each printed line.

    The function returns once no message arrives within ``timeout``. The
    websocket is closed on every exit path, including errors and
    cancellation.
    """
    ws = await asyncio.wait_for(websockets.connect(loki_url), timeout=WEBSOCKET_TIMEOUT)
    logger.info(
        f'{colorama.Fore.YELLOW}Tailing logs from Loki. '
        f'Forwarding to port {LOKI_PORT}. Press Ctrl+C to stop. '
        f'{colorama.Style.RESET_ALL}'
    )
    try:
        while True:
            message = await asyncio.wait_for(ws.recv(), timeout=timeout)
            try:
                payload = json.loads(message)
            except json.JSONDecodeError:
                logger.warning(f'Failed to decode log skipping: {message}')
                logger.debug(f'Dropped log: {message}')
                continue
            # A message may carry several streams, each with several
            # [timestamp, line] entries; print all of them rather than only
            # the first entry of each stream. Missing keys are treated as
            # empty so that messages without log entries are ignored.
            for stream in payload.get('streams', []):
                for entry in stream.get('values', []):
                    log_line = entry[1]
                    if log_line is not None:
                        print(
                            f"{colorama.Fore.CYAN}{colorama.Style.BRIGHT} "
                            f"(job_name={job_name} worker_id={worker_id})"
                            f"{colorama.Style.RESET_ALL} {log_line}",
                            flush=True,
                        )
    except asyncio.exceptions.TimeoutError:
        logger.debug('Websocket timed-out, closing the connection!')
    finally:
        # Release the connection instead of leaving it to be torn down when
        # the event loop exits.
        await ws.close()
48
+
49
+
50
def tail_loki_logs_ws(
    job_name: str, worker_id: int = 0, num_logs: int = 1000, follow: bool = True
):
    """Port-forward to the in-cluster Loki service and tail a job's logs.

    Args:
        job_name: Name of the job whose logs are tailed.
        worker_id: Job completion index of the worker to select.
        num_logs: Value sent as the ``limit`` query parameter; at most 5000.
        follow: When True, wait (practically) forever between messages;
            otherwise stop after ``WEBSOCKET_TIMEOUT`` without a message.

    Raises:
        ValueError: If ``num_logs`` is greater than 5000.
    """
    if num_logs > 5000:
        # TODO(asaiacai): we should not have a limit on the number of logs, but rather
        # let the user specify any number of lines, and we can print the last N lines.
        # this can be done in chunks. Potentially, we can query range
        # until we reach the end of the log and then invoke tail again.
        # Also include checks that the job is running/ever ran.
        raise ValueError('num_logs must be less than or equal to 5000')

    # Query selecting the job's stream, filtered down to one worker.
    log_query = (
        f'{{k8s_job_name="{job_name}-workers-0"}} '
        f' | batch_kubernetes_io_job_completion_index = `{worker_id}`'
    )
    # TODO(asaiacai): need to auto-generate the start and end times.
    url_params = [
        ('query', urllib.parse.quote(log_query)),
        ('limit', num_logs),
        ('delay', 5),
    ]
    query_string = '&'.join(f'{name}={val}' for name, val in url_params)
    loki_url = f'ws://localhost:{LOKI_PORT}/loki/api/v1/tail?{query_string}'

    loki_svc = kr8s.objects.Service.get('loki', namespace='loki')
    if follow:
        timeout = INFINITY
    else:
        timeout = WEBSOCKET_TIMEOUT
    with kr8s.portforward.PortForward(loki_svc, LOKI_PORT):
        asyncio.run(_read_loki_logs(loki_url, timeout, job_name, worker_id))
78
+
79
+
80
+ # TODO(asaiacai): write a query_range function to get all the
81
+ # logs for a job for not tailing option
82
+
83
# Run the WebSocket log tailing function when this module is executed as a
# script.
# NOTE(review): the job name below is hard-coded and looks like a leftover
# debugging entry point; confirm before relying on it.
if __name__ == '__main__':
    tail_loki_logs_ws('tune-c3c8', worker_id=0, follow=False)
@@ -0,0 +1,123 @@
1
+ # Proprietary Changes made for Trainy under the Trainy Software License
2
+ # Original source: skypilot: https://github.com/skypilot-org/skypilot
3
+ # which is Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ # http://www.apache.org/licenses/LICENSE-2.0
7
+ # Unless required by applicable law or agreed to in writing, software
8
+ # distributed under the License is distributed on an "AS IS" BASIS,
9
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10
+ # See the License for the specific language governing permissions and
11
+ # limitations under the License.
12
+
13
+ """Rich status spinner utils."""
14
+
15
+ import contextlib
16
+ import logging
17
+ import threading
18
+ from typing import Union
19
+
20
+ import rich.console as rich_console
21
+
22
# Shared rich console used to create the status spinner.
console = rich_console.Console(soft_wrap=True)
# The currently active status object, or None when no spinner exists.
_status = None
# Number of _RevertibleStatus contexts currently entered.
_status_nesting_level = 0

# Re-entrant lock held by safe_logger() while it pauses/resumes the spinner.
_logging_lock = threading.RLock()
27
+
28
+
29
class _NoOpConsoleStatus:
    """Do-nothing stand-in for a console status object.

    It offers the same methods as the real status wrapper (context manager,
    ``update``, ``stop``, ``start``) but every call is a no-op.
    """

    def __enter__(self):
        """Return this object so it can be bound by ``with ... as``."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Do nothing; exceptions from the ``with`` body propagate."""
        return None

    def update(self, text):
        """Ignore the new status text."""
        return None

    def stop(self):
        """Nothing to stop."""
        return None

    def start(self):
        """Nothing to start."""
        return None
46
+
47
+
48
class _RevertibleStatus:
    """Status wrapper that restores the previous message when it exits.

    All instances drive the single module-level ``_status`` object. Nesting
    is tracked in ``_status_nesting_level``: leaving an inner context puts
    the earlier message back, and leaving the outermost one exits the
    shared status and clears it.
    """

    def __init__(self, message: str):
        # Capture what the shared status shows right now (if it exists) so
        # it can be restored on exit.
        self.previous_message = None if _status is None else _status.status
        self.message = message

    def __enter__(self):
        """Show this message, go one nesting level deeper, return the status."""
        global _status_nesting_level
        _status.update(self.message)
        _status_nesting_level += 1
        _status.__enter__()
        return _status

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Go one nesting level up, restoring or tearing down the status."""
        global _status_nesting_level, _status
        _status_nesting_level -= 1
        if _status_nesting_level > 0:
            # Still nested: show the message that was active before us.
            _status.update(self.previous_message)
            return
        # Outermost context: clamp the counter and dispose of the status.
        _status_nesting_level = 0
        if _status is not None:
            _status.__exit__(exc_type, exc_val, exc_tb)
            _status = None

    def update(self, *args, **kwargs):
        """Forward to the shared status object's ``update``."""
        _status.update(*args, **kwargs)

    def stop(self):
        """Forward to the shared status object's ``stop``."""
        _status.stop()

    def start(self):
        """Forward to the shared status object's ``start``."""
        _status.start()
84
+
85
+
86
@contextlib.contextmanager
def safe_logger():
    """Pause the status spinner, if it is running, around the wrapped block.

    If a status object exists and its live display is started, the spinner
    is stopped while ``_logging_lock`` is held, the wrapped block runs, and
    the spinner is started again afterwards. The restart also happens when
    the wrapped block raises. If no spinner is running, the wrapped block
    runs without the lock held.
    """
    with _logging_lock:
        if _status is not None and _status._live.is_started:  # pylint: disable=protected-access
            _status.stop()
            try:
                yield
            finally:
                # Resume the spinner even if the wrapped block failed.
                # Re-read the global in case the status was torn down while
                # it was paused.
                current = _status
                if current is not None:
                    current.start()
            return
    # No running spinner: nothing to pause.
    yield
97
+
98
+
99
class RichSafeStreamHandler(logging.StreamHandler):
    """Stream handler that emits each record inside ``safe_logger()``."""

    def emit(self, record: logging.LogRecord) -> None:
        """Emit ``record`` via the parent handler within ``safe_logger()``."""
        with safe_logger():
            outcome = super().emit(record)
        return outcome
103
+
104
+
105
def force_update_status(msg: str):
    """Update the status message even if sky_logging.is_silent() is true.

    Nothing happens when called from a thread other than the main thread,
    or when no status object currently exists.
    """
    on_main_thread = threading.current_thread() is threading.main_thread()
    if not on_main_thread:
        return
    if _status is None:
        return
    _status.update(msg)
109
+
110
+
111
def safe_status(msg: str) -> Union['rich_console.Status', _NoOpConsoleStatus]:
    """Return a status context that is safe to use from any thread.

    On the main thread with logging not silenced, the shared status object
    is created if needed and a revertible wrapper around it is returned.
    In every other case a no-op status is returned.

    Args:
        msg: Message to show while the returned context is active.
    """
    from konduktor import logging  # pylint: disable=import-outside-toplevel

    global _status
    on_main_thread = threading.current_thread() is threading.main_thread()
    if not on_main_thread or logging.is_silent():
        return _NoOpConsoleStatus()
    if _status is None:
        _status = console.status(msg, refresh_per_second=8)
    return _RevertibleStatus(msg)