konduktor-nightly 0.1.0.dev20251128104812__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (107)
  1. konduktor/__init__.py +49 -0
  2. konduktor/adaptors/__init__.py +0 -0
  3. konduktor/adaptors/aws.py +221 -0
  4. konduktor/adaptors/common.py +118 -0
  5. konduktor/adaptors/gcp.py +126 -0
  6. konduktor/authentication.py +124 -0
  7. konduktor/backends/__init__.py +6 -0
  8. konduktor/backends/backend.py +86 -0
  9. konduktor/backends/constants.py +21 -0
  10. konduktor/backends/deployment.py +204 -0
  11. konduktor/backends/deployment_utils.py +1351 -0
  12. konduktor/backends/jobset.py +225 -0
  13. konduktor/backends/jobset_utils.py +726 -0
  14. konduktor/backends/pod_utils.py +501 -0
  15. konduktor/check.py +184 -0
  16. konduktor/cli.py +1945 -0
  17. konduktor/config.py +420 -0
  18. konduktor/constants.py +36 -0
  19. konduktor/controller/__init__.py +0 -0
  20. konduktor/controller/constants.py +56 -0
  21. konduktor/controller/launch.py +44 -0
  22. konduktor/controller/node.py +116 -0
  23. konduktor/controller/parse.py +111 -0
  24. konduktor/dashboard/README.md +30 -0
  25. konduktor/dashboard/backend/main.py +169 -0
  26. konduktor/dashboard/backend/sockets.py +154 -0
  27. konduktor/dashboard/frontend/.eslintrc.json +3 -0
  28. konduktor/dashboard/frontend/.gitignore +36 -0
  29. konduktor/dashboard/frontend/app/api/jobs/route.js +71 -0
  30. konduktor/dashboard/frontend/app/api/namespaces/route.js +69 -0
  31. konduktor/dashboard/frontend/app/components/Grafana.jsx +66 -0
  32. konduktor/dashboard/frontend/app/components/JobsData.jsx +197 -0
  33. konduktor/dashboard/frontend/app/components/LogsData.jsx +139 -0
  34. konduktor/dashboard/frontend/app/components/NavMenu.jsx +39 -0
  35. konduktor/dashboard/frontend/app/components/NavTabs.jsx +73 -0
  36. konduktor/dashboard/frontend/app/components/NavTabs2.jsx +30 -0
  37. konduktor/dashboard/frontend/app/components/SelectBtn.jsx +27 -0
  38. konduktor/dashboard/frontend/app/components/lib/utils.js +6 -0
  39. konduktor/dashboard/frontend/app/components/ui/chip-select.jsx +78 -0
  40. konduktor/dashboard/frontend/app/components/ui/input.jsx +19 -0
  41. konduktor/dashboard/frontend/app/components/ui/navigation-menu.jsx +104 -0
  42. konduktor/dashboard/frontend/app/components/ui/select.jsx +120 -0
  43. konduktor/dashboard/frontend/app/favicon.ico +0 -0
  44. konduktor/dashboard/frontend/app/globals.css +120 -0
  45. konduktor/dashboard/frontend/app/jobs/page.js +10 -0
  46. konduktor/dashboard/frontend/app/layout.js +22 -0
  47. konduktor/dashboard/frontend/app/logs/page.js +11 -0
  48. konduktor/dashboard/frontend/app/page.js +12 -0
  49. konduktor/dashboard/frontend/jsconfig.json +7 -0
  50. konduktor/dashboard/frontend/next.config.mjs +4 -0
  51. konduktor/dashboard/frontend/package-lock.json +6687 -0
  52. konduktor/dashboard/frontend/package.json +37 -0
  53. konduktor/dashboard/frontend/postcss.config.mjs +8 -0
  54. konduktor/dashboard/frontend/server.js +64 -0
  55. konduktor/dashboard/frontend/tailwind.config.js +17 -0
  56. konduktor/data/__init__.py +9 -0
  57. konduktor/data/aws/__init__.py +15 -0
  58. konduktor/data/aws/s3.py +1138 -0
  59. konduktor/data/constants.py +7 -0
  60. konduktor/data/data_utils.py +268 -0
  61. konduktor/data/gcp/__init__.py +19 -0
  62. konduktor/data/gcp/constants.py +42 -0
  63. konduktor/data/gcp/gcs.py +994 -0
  64. konduktor/data/gcp/utils.py +9 -0
  65. konduktor/data/registry.py +19 -0
  66. konduktor/data/storage.py +812 -0
  67. konduktor/data/storage_utils.py +535 -0
  68. konduktor/execution.py +447 -0
  69. konduktor/kube_client.py +237 -0
  70. konduktor/logging.py +111 -0
  71. konduktor/manifests/aibrix-setup.yaml +430 -0
  72. konduktor/manifests/apoxy-setup.yaml +184 -0
  73. konduktor/manifests/apoxy-setup2.yaml +98 -0
  74. konduktor/manifests/controller_deployment.yaml +69 -0
  75. konduktor/manifests/dashboard_deployment.yaml +131 -0
  76. konduktor/manifests/dmesg_daemonset.yaml +57 -0
  77. konduktor/manifests/pod_cleanup_controller.yaml +129 -0
  78. konduktor/resource.py +546 -0
  79. konduktor/serving.py +153 -0
  80. konduktor/task.py +949 -0
  81. konduktor/templates/deployment.yaml.j2 +191 -0
  82. konduktor/templates/jobset.yaml.j2 +43 -0
  83. konduktor/templates/pod.yaml.j2 +563 -0
  84. konduktor/usage/__init__.py +0 -0
  85. konduktor/usage/constants.py +21 -0
  86. konduktor/utils/__init__.py +0 -0
  87. konduktor/utils/accelerator_registry.py +17 -0
  88. konduktor/utils/annotations.py +62 -0
  89. konduktor/utils/base64_utils.py +95 -0
  90. konduktor/utils/common_utils.py +426 -0
  91. konduktor/utils/constants.py +5 -0
  92. konduktor/utils/env_options.py +55 -0
  93. konduktor/utils/exceptions.py +234 -0
  94. konduktor/utils/kubernetes_enums.py +8 -0
  95. konduktor/utils/kubernetes_utils.py +763 -0
  96. konduktor/utils/log_utils.py +467 -0
  97. konduktor/utils/loki_utils.py +102 -0
  98. konduktor/utils/rich_utils.py +123 -0
  99. konduktor/utils/schemas.py +625 -0
  100. konduktor/utils/subprocess_utils.py +273 -0
  101. konduktor/utils/ux_utils.py +247 -0
  102. konduktor/utils/validator.py +461 -0
  103. konduktor_nightly-0.1.0.dev20251128104812.dist-info/LICENSE +91 -0
  104. konduktor_nightly-0.1.0.dev20251128104812.dist-info/METADATA +98 -0
  105. konduktor_nightly-0.1.0.dev20251128104812.dist-info/RECORD +107 -0
  106. konduktor_nightly-0.1.0.dev20251128104812.dist-info/WHEEL +4 -0
  107. konduktor_nightly-0.1.0.dev20251128104812.dist-info/entry_points.txt +3 -0
konduktor/utils/log_utils.py
@@ -0,0 +1,467 @@
+# Proprietary Changes made for Trainy under the Trainy Software License
+# Original source: skypilot: https://github.com/skypilot-org/skypilot
+# which is Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import asyncio
+import copy
+import enum
+import io
+import json
+import multiprocessing
+import os
+import subprocess
+import sys
+import types
+import urllib.parse
+from typing import Any, Dict, List, Optional, Tuple, Type, Union
+
+import colorama
+import kr8s
+import prettytable
+import requests
+import websockets
+
+from konduktor import config, logging
+from konduktor.utils import kubernetes_utils, subprocess_utils
+
+logger = logging.get_logger(__name__)
+
+
+LOKI_REMOTE_PORT = 3100
+WEBSOCKET_TIMEOUT = 10
+INFINITY = 999999
+VICKY_REMOTE_PORT = 9428
+
+
+class LogBackend(enum.Enum):
+    VICTORIA = 'victoria'
+    LOKI = 'loki'
+
+
+class LineProcessor(object):
+    """A processor for log lines."""
+
+    def __enter__(self) -> None:
+        pass
+
+    def process_line(self, log_line: str) -> None:
+        pass
+
+    def __exit__(
+        self,
+        except_type: Optional[Type[BaseException]],
+        except_value: Optional[BaseException],
+        traceback: Optional[types.TracebackType],
+    ) -> None:
+        del except_type, except_value, traceback  # unused
+        pass
+
+
+class _ProcessingArgs:
+    """Arguments for processing logs."""
+
+    def __init__(
+        self,
+        log_path: str,
+        stream_logs: bool,
+        start_streaming_at: str = '',
+        end_streaming_at: Optional[str] = None,
+        skip_lines: Optional[List[str]] = None,
+        replace_crlf: bool = False,
+        line_processor: Optional[LineProcessor] = None,
+        streaming_prefix: Optional[str] = None,
+    ) -> None:
+        self.log_path = log_path
+        self.stream_logs = stream_logs
+        self.start_streaming_at = start_streaming_at
+        self.end_streaming_at = end_streaming_at
+        self.skip_lines = skip_lines
+        self.replace_crlf = replace_crlf
+        self.line_processor = line_processor
+        self.streaming_prefix = streaming_prefix
+
+
+def _handle_io_stream(io_stream, out_stream, args: _ProcessingArgs):
+    """Process the stream of a process."""
+    out_io = io.TextIOWrapper(
+        io_stream, encoding='utf-8', newline='', errors='replace', write_through=True
+    )
+
+    start_streaming_flag = False
+    end_streaming_flag = False
+    streaming_prefix = args.streaming_prefix if args.streaming_prefix else ''
+    line_processor = (
+        LineProcessor() if args.line_processor is None else args.line_processor
+    )
+
+    out = []
+    with open(args.log_path, 'a', encoding='utf-8') as fout:
+        with line_processor:
+            while True:
+                line = out_io.readline()
+                if not line:
+                    break
+                # start_streaming_at logic in processor.process_line(line)
+                if args.replace_crlf and line.endswith('\r\n'):
+                    # Replace CRLF with LF to avoid ray logging to the same
+                    # line due to separating lines with '\n'.
+                    line = line[:-2] + '\n'
+                if args.skip_lines is not None and any(
+                    skip in line for skip in args.skip_lines
+                ):
+                    continue
+                if args.start_streaming_at in line:
+                    start_streaming_flag = True
+                if args.end_streaming_at is not None and args.end_streaming_at in line:
+                    # Stop streaming from here on, but keep executing the
+                    # loop so the remaining lines are still saved to the file.
+                    end_streaming_flag = True
+                if args.stream_logs and start_streaming_flag and not end_streaming_flag:
+                    print(streaming_prefix + line, end='', file=out_stream, flush=True)
+                if args.log_path != '/dev/null':
+                    fout.write(line)
+                    fout.flush()
+                line_processor.process_line(line)
+                out.append(line)
+    return ''.join(out)
+
+
+def process_subprocess_stream(proc, args: _ProcessingArgs) -> Tuple[str, str]:
+    """Redirect the process's filtered stdout/stderr to both stream and file."""
+    if proc.stderr is not None:
+        # Asyncio does not work here as the output processing can be
+        # executed in a different thread.
+        # selectors could handle the multiplexing of stdout/stderr,
+        # but it introduces buffering that breaks streaming output.
+        with multiprocessing.pool.ThreadPool(processes=1) as pool:
+            err_args = copy.copy(args)
+            err_args.line_processor = None
+            stderr_fut = pool.apply_async(
+                _handle_io_stream, args=(proc.stderr, sys.stderr, err_args)
+            )
+            # Do not launch a thread for stdout: rich.status (used by
+            # log_utils.RayUpLineProcessor) does not work when run
+            # from a thread.
+            stdout = _handle_io_stream(proc.stdout, sys.stdout, args)
+            stderr = stderr_fut.get()
+    else:
+        stdout = _handle_io_stream(proc.stdout, sys.stdout, args)
+        stderr = ''
+    return stdout, stderr
+
+
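LineProcessor above is deliberately a no-op context manager; subclasses override process_line to react to output as it streams through _handle_io_stream. A minimal sketch of a custom processor (the ErrorCounter name is hypothetical, not part of the package), which could be handed to _ProcessingArgs(line_processor=...) or to the run_with_log helper later in this file:

class ErrorCounter(LineProcessor):
    """Hypothetical processor that counts lines containing 'ERROR'."""

    def __enter__(self) -> None:
        self.error_count = 0  # reset when stream processing starts

    def process_line(self, log_line: str) -> None:
        if 'ERROR' in log_line:
            self.error_count += 1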
+def create_table(field_names: List[str], **kwargs) -> prettytable.PrettyTable:
+    """Creates a table with default style."""
+    border = kwargs.pop('border', False)
+    align = kwargs.pop('align', 'l')
+    table = prettytable.PrettyTable(
+        align=align, border=border, field_names=field_names, **kwargs
+    )
+    table.left_padding_width = 0
+    table.right_padding_width = 2
+    return table
+
+
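A quick illustration of the resulting style (row values hypothetical): with border=False, left alignment, and the padding set above, the table renders roughly as:

table = create_table(['NAME', 'STATUS'])
table.add_row(['tune-bc43', 'RUNNING'])
print(table)
# NAME       STATUS
# tune-bc43  RUNNING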
+def _get_kr8s_api():
+    """Return a kr8s API configured for the selected context.
+
+    Honors kubernetes.allowed_contexts via
+    get_current_kube_config_context_name(). Fails with a clear error if a
+    configured context cannot be initialized.
+    """
+    ctx = kubernetes_utils.get_current_kube_config_context_name()
+    try:
+        # kr8s exposes a factory function `api(...)` that accepts a context.
+        api = kr8s.api(context=ctx) if ctx else kr8s.api()
+        if ctx:
+            logger.debug('Initialized kr8s API for context: %s', ctx)
+        return api
+    except Exception as e:  # defensive: surface a clear error if context fails
+        if ctx:
+            raise ValueError(
+                'Failed to initialize kr8s client for context '
+                f'{ctx!r}. Ensure the context exists and your kubeconfig is valid.'
+            ) from e
+        raise
+
+
+def run_with_log(
+    cmd: Union[List[str], str],
+    log_path: str,
+    *,
+    require_outputs: bool = False,
+    stream_logs: bool = False,
+    start_streaming_at: str = '',
+    end_streaming_at: Optional[str] = None,
+    skip_lines: Optional[List[str]] = None,
+    shell: bool = False,
+    with_ray: bool = False,
+    process_stream: bool = True,
+    line_processor: Optional[LineProcessor] = None,
+    streaming_prefix: Optional[str] = None,
+    **kwargs,
+) -> Union[int, Tuple[int, str, str]]:
+    """Runs a command and logs its output to a file.
+
+    Args:
+        cmd: The command to run.
+        log_path: The path to the log file.
+        stream_logs: Whether to stream the logs to stdout/stderr.
+        require_outputs: Whether to return the stdout/stderr of the command.
+        process_stream: Whether to post-process the stdout/stderr of the
+            command, such as replacing or skipping lines on the fly. If
+            enabled, lines are printed only when '\r' or '\n' is found.
+
+    Returns the returncode, or a (returncode, stdout, stderr) tuple.
+    Note that the stdout and stderr are already decoded.
+    """
+    assert process_stream or not require_outputs, (
+        process_stream,
+        require_outputs,
+        'require_outputs should be False when process_stream is False',
+    )
+
+    log_path = os.path.expanduser(log_path)
+    dirname = os.path.dirname(log_path)
+    os.makedirs(dirname, exist_ok=True)
+    # Redirect stderr to stdout when using ray, to preserve the order of
+    # stdout and stderr.
+    stdout_arg = stderr_arg = None
+    if process_stream:
+        stdout_arg = subprocess.PIPE
+        stderr_arg = subprocess.PIPE if not with_ray else subprocess.STDOUT
+    # Use stdin=subprocess.DEVNULL by default, as allowing inputs will mess up
+    # the terminal output when typing in the terminal that starts the API
+    # server.
+    stdin = kwargs.pop('stdin', subprocess.DEVNULL)
+    with subprocess.Popen(
+        cmd,
+        stdout=stdout_arg,
+        stderr=stderr_arg,
+        start_new_session=True,
+        shell=shell,
+        stdin=stdin,
+        **kwargs,
+    ) as proc:
+        try:
+            subprocess_utils.kill_process_daemon(proc.pid)
+            stdout = ''
+            stderr = ''
+
+            if process_stream:
+                if skip_lines is None:
+                    skip_lines = []
+                # Skip these lines caused by the `-i` option of bash; no other
+                # way was found to turn off these two warnings.
+                # https://stackoverflow.com/questions/13300764/how-to-tell-bash-not-to-issue-warnings-cannot-set-terminal-process-group-and # noqa: E501
+                # `ssh -T -i -tt` still causes the problem.
+                skip_lines += [
+                    'bash: cannot set terminal process group',
+                    'bash: no job control in this shell',
+                ]
+                # We need this even if the log_path is '/dev/null' to ensure the
+                # progress bar is shown.
+                # NOTE: Lines are printed only when '\r' or '\n' is found.
+                args = _ProcessingArgs(
+                    log_path=log_path,
+                    stream_logs=stream_logs,
+                    start_streaming_at=start_streaming_at,
+                    end_streaming_at=end_streaming_at,
+                    skip_lines=skip_lines,
+                    line_processor=line_processor,
+                    # Replace CRLF when the output is logged to the driver by ray.
+                    replace_crlf=with_ray,
+                    streaming_prefix=streaming_prefix,
+                )
+                stdout, stderr = process_subprocess_stream(proc, args)
+            proc.wait()
+            if require_outputs:
+                return proc.returncode, stdout, stderr
+            return proc.returncode
+        except KeyboardInterrupt:
+            # Kill the subprocess directly; otherwise the underlying process
+            # is only killed after the python program exits, leaving the
+            # stream handling stuck at `readline`.
+            subprocess_utils.kill_children_processes()
+            raise
+
+
+ async def _read_logs(url: str, timeout: int, job_name: str, worker_id: int, port: int):
298
+ ws = await asyncio.wait_for(websockets.connect(url), timeout=WEBSOCKET_TIMEOUT)
299
+ logger.info(
300
+ f'{colorama.Fore.YELLOW}Tailing logs. '
301
+ f'Forwarding from remote port {port}. Press Ctrl+C to stop. '
302
+ f'{colorama.Style.RESET_ALL}'
303
+ )
304
+ try:
305
+ while True:
306
+ message = await asyncio.wait_for(ws.recv(), timeout=timeout)
307
+ try:
308
+ payload = json.loads(message)
309
+ for stream in payload['streams']:
310
+ if stream['values'][0][1] is not None:
311
+ print(
312
+ f"{colorama.Fore.CYAN}{colorama.Style.BRIGHT} "
313
+ f"(job_name={job_name} worker_id={worker_id})"
314
+ f"{colorama.Style.RESET_ALL} {stream['values'][0][1]}",
315
+ flush=True,
316
+ )
317
+ except json.JSONDecodeError:
318
+ logger.warning(f'Failed to decode log skipping: {message}')
319
+ logger.debug(f'Dropped log: {message}')
320
+ continue
321
+ except asyncio.exceptions.TimeoutError:
322
+ logger.debug('Websocket timed-out, closing log stream!')
323
+ except KeyboardInterrupt:
324
+ logger.debug('Keyboard interrupt, closing log stream!')
325
+
326
+
327
+ def tail_loki_logs_ws(
328
+ job_name: str, worker_id: int = 0, num_logs: int = 1000, follow: bool = True
329
+ ):
330
+ if num_logs > 5000:
331
+ # TODO(asaiacai): we should not have a limit on the number of logs, but rather
332
+ # let the user specify any number of lines, and we can print the last N lines.
333
+ # this can be done in chunks. Potentially, we can query range
334
+ # until we reach the end of the log and then invoke tail again.
335
+ # Also include checks that the job is running/ever ran.
336
+ raise ValueError('num_logs must be less than or equal to 5000')
337
+ api = _get_kr8s_api()
338
+ loki_svc = kr8s.objects.Service.get('loki', namespace='loki', api=api)
339
+ with kr8s.portforward.PortForward(
340
+ loki_svc, LOKI_REMOTE_PORT, local_port='auto'
341
+ ) as port:
342
+ loki_url = f'ws://localhost:{port}/loki/api/v1/tail'
343
+ logger.debug(f'Loki URL: {loki_url}')
344
+ params = {
345
+ 'query': urllib.parse.quote(
346
+ r'{' + f'k8s_job_name="{job_name}-workers-0",'
347
+ r' k8s_container_name="konduktor-container"} '
348
+ f' | batch_kubernetes_io_job_completion_index = `{worker_id}`'
349
+ ),
350
+ 'limit': num_logs,
351
+ 'delay': 5,
352
+ # TODO(asaiacai): need to auto-generate the start and end times.
353
+ }
354
+
355
+ query_string = '&'.join(f'{key}={value}' for key, value in params.items())
356
+ loki_url += f'?{query_string}'
357
+ timeout = INFINITY if follow else WEBSOCKET_TIMEOUT
358
+ asyncio.run(
359
+ _read_logs(loki_url, timeout, job_name, worker_id, LOKI_REMOTE_PORT)
360
+ )
361
+
362
+
363
+ def tail_vicky_logs(
364
+ job_name: str,
365
+ worker_id: int = 0,
366
+ num_logs: int = -1,
367
+ follow: bool = True,
368
+ start_offset: str = '1h',
369
+ ):
370
+ context = kubernetes_utils.get_current_kube_config_context_name()
371
+ namespace = kubernetes_utils.get_kube_config_context_namespace(context)
372
+ query: Dict[str, Any] = {}
373
+ api = _get_kr8s_api()
374
+ vicky_svc = kr8s.objects.Service.get(
375
+ 'vls-victoria-logs-single-server', namespace='victoria-logs', api=api
376
+ )
377
+
378
+ if num_logs == -1:
379
+ query = {}
380
+ else:
381
+ assert num_logs > 0, f'num_logs must be greater than 0, got {num_logs}'
382
+ query = {'limit': num_logs}
383
+ if follow:
384
+ effective_offset = start_offset or '1h'
385
+ logger.info(
386
+ f'Tailing logs from {effective_offset} ago. '
387
+ 'If logs come up empty, there might be logs just earlier '
388
+ 'than that window, check Grafana or use:\n'
389
+ f'{colorama.Style.BRIGHT}{colorama.Fore.YELLOW}'
390
+ f'`konduktor logs --no-follow {job_name}`'
391
+ f'{colorama.Style.RESET_ALL}'
392
+ )
393
+ query['start_offset'] = effective_offset
394
+ query['query'] = (
395
+ f'k8s.namespace.name: "{namespace}" AND '
396
+ f'batch.kubernetes.io/job-name: "{job_name}-workers-0" AND '
397
+ f'batch.kubernetes.io/job-completion-index: "{worker_id}"'
398
+ )
399
+
400
+ with kr8s.portforward.PortForward(
401
+ vicky_svc, VICKY_REMOTE_PORT, local_port='auto'
402
+ ) as port:
403
+ if follow:
404
+ timeout = INFINITY
405
+ vicky_url = f'http://localhost:{port}/select/logsql/tail'
406
+ else:
407
+ vicky_url = f'http://localhost:{port}/select/logsql/query'
408
+ timeout = config.get_nested(('logs', 'timeout'), 60)
409
+ logger.debug(f'Vicky URL: {vicky_url}')
410
+
411
+ try:
412
+ logger.debug(f'Making request to {vicky_url} with query: {query}')
413
+ with requests.post(
414
+ vicky_url, data=query, stream=True, timeout=timeout
415
+ ) as response: # type: requests.Response
416
+ logger.debug(f'Response status: {response.status_code}')
417
+ if response.status_code != 200:
418
+ logger.error(
419
+ f'VictoriaLogs API returned status {response.status_code}: '
420
+ f'{response.text}'
421
+ )
422
+ return
423
+
424
+ for line in response.iter_lines(decode_unicode=True):
425
+ if line:
426
+ payload = json.loads(line)
427
+ if 'missing _msg field' in payload['_msg']:
428
+ payload['_msg'] = ''
429
+ print(
430
+ f"{colorama.Fore.CYAN}{colorama.Style.BRIGHT} "
431
+ f"(job_name={job_name} worker_id={worker_id})"
432
+ f"{colorama.Style.RESET_ALL} {payload['_msg']}",
433
+ flush=True,
434
+ )
435
+
436
+ except KeyboardInterrupt:
437
+ logger.info('\nStopping log stream...')
438
+ except requests.exceptions.Timeout:
439
+ logger.error(
440
+ f'Request to VictoriaLogs timed out after {timeout} seconds. '
441
+ 'Try increasing the timeout in the config '
442
+ 'file under `logs.timeout`. If '
443
+ 'you are still seeing issues, please contact support.'
444
+ )
445
+ except requests.exceptions.ConnectionError as e:
446
+ logger.error(f'Failed to connect to VictoriaLogs at {vicky_url}: {e}')
447
+ except requests.exceptions.RequestException as e:
448
+ logger.error(f'Request to VictoriaLogs failed: {e}')
449
+ except Exception as e:
450
+ logger.error(f'Unexpected error while tailing VictoriaLogs: {e}')
451
+
452
+
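For illustration, a job named tune-bc43 (hypothetical) in the default namespace, tailed with worker_id=0 and follow=True, would produce a form payload along these lines, POSTed to the forwarded /select/logsql/tail endpoint:

query = {
    'start_offset': '1h',  # only set when follow=True
    'query': (
        'k8s.namespace.name: "default" AND '
        'batch.kubernetes.io/job-name: "tune-bc43-workers-0" AND '
        'batch.kubernetes.io/job-completion-index: "0"'
    ),
}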
+def tail_logs(
+    job_name: str,
+    worker_id: int = 0,
+    num_logs: int = 1000,
+    follow: bool = True,
+    start_offset: str = '1h',
+):
+    logs_backend = config.get_nested(('logs', 'backend'), None)
+    if logs_backend == LogBackend.VICTORIA.value:  # config stores a plain string
+        tail_vicky_logs(job_name, worker_id, num_logs, follow, start_offset)
+    elif logs_backend == LogBackend.LOKI.value:
+        tail_loki_logs_ws(job_name, worker_id, num_logs, follow)
+    else:
+        logger.info('Defaulting to VictoriaLogs')
+        tail_vicky_logs(job_name, worker_id, num_logs, follow, start_offset)
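tail_logs dispatches on the logs.backend config key and falls back to VictoriaLogs when the key is unset or unrecognized. A usage sketch (job name illustrative, assuming `logs.backend: loki` is set in the konduktor config):

from konduktor.utils import log_utils

# Tails Loki because of the config above; with 'victoria' or no value,
# the call falls through to VictoriaLogs instead.
log_utils.tail_logs('tune-bc43', worker_id=0, num_logs=500, follow=False)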
konduktor/utils/loki_utils.py
@@ -0,0 +1,102 @@
+"""Loki utils: query/tail logs from Loki"""
+# TODO(asaiacai): eventually support querying a
+# centralized Loki that lives outside the cluster
+
+import asyncio
+import json
+import urllib.parse
+
+import colorama
+import kr8s
+import websockets
+
+from konduktor import logging
+from konduktor.utils import kubernetes_utils
+
+logger = logging.get_logger(__name__)
+
+LOKI_REMOTE_PORT = 3100
+WEBSOCKET_TIMEOUT = 10
+INFINITY = 999999
+
+
+async def _read_loki_logs(loki_url: str, timeout: int, job_name: str, worker_id: int):
+    ws = await asyncio.wait_for(websockets.connect(loki_url), timeout=WEBSOCKET_TIMEOUT)
+    logger.info(
+        f'{colorama.Fore.YELLOW}Tailing logs from Loki. '
+        f'Forwarding from remote port {LOKI_REMOTE_PORT}. Press Ctrl+C to stop. '
+        f'{colorama.Style.RESET_ALL}'
+    )
+    try:
+        while True:
+            message = await asyncio.wait_for(ws.recv(), timeout=timeout)
+            try:
+                payload = json.loads(message)
+                for stream in payload['streams']:
+                    if stream['values'][0][1] is not None:
+                        print(
+                            f"{colorama.Fore.CYAN}{colorama.Style.BRIGHT} "
+                            f"(job_name={job_name} worker_id={worker_id})"
+                            f"{colorama.Style.RESET_ALL} {stream['values'][0][1]}",
+                            flush=True,
+                        )
+            except json.JSONDecodeError:
+                logger.warning(f'Failed to decode log line, skipping: {message}')
+                logger.debug(f'Dropped log: {message}')
+                continue
+    except asyncio.exceptions.TimeoutError:
+        logger.debug('Websocket timed out, closing log stream!')
+    except KeyboardInterrupt:
+        logger.debug('Keyboard interrupt, closing log stream!')
+
+
+def tail_loki_logs_ws(
+    job_name: str, worker_id: int = 0, num_logs: int = 1000, follow: bool = True
+):
+    if num_logs > 5000:
+        # TODO(asaiacai): we should not have a limit on the number of logs;
+        # rather, let the user specify any number of lines and print the last
+        # N lines in chunks. Potentially, we can query a range until we
+        # reach the end of the log and then invoke tail again.
+        # Also include checks that the job is running/ever ran.
+        raise ValueError('num_logs must be less than or equal to 5000')
+    # Initialize the kr8s API, honoring allowed_contexts if configured.
+    ctx = kubernetes_utils.get_current_kube_config_context_name()
+    try:
+        api = kr8s.api(context=ctx) if ctx else kr8s.api()
+    except Exception as e:
+        if ctx:
+            raise ValueError(
+                'Failed to initialize kr8s client for context '
+                f'{ctx!r}. Ensure the context exists and your kubeconfig is valid.'
+            ) from e
+        raise
+    loki_svc = kr8s.objects.Service.get('loki', namespace='loki', api=api)
+    with kr8s.portforward.PortForward(
+        loki_svc, LOKI_REMOTE_PORT, local_port='auto'
+    ) as port:
+        loki_url = f'ws://localhost:{port}/loki/api/v1/tail'
+        logger.debug(f'Loki URL: {loki_url}')
+        params = {
+            'query': urllib.parse.quote(
+                r'{' + f'k8s_job_name="{job_name}-workers-0",'
+                r' k8s_container_name="konduktor-container"} '
+                f' | batch_kubernetes_io_job_completion_index = `{worker_id}`'
+            ),
+            'limit': num_logs,
+            'delay': 5,
+            # TODO(asaiacai): need to auto-generate the start and end times.
+        }
+
+        query_string = '&'.join(f'{key}={value}' for key, value in params.items())
+        loki_url += f'?{query_string}'
+        timeout = INFINITY if follow else WEBSOCKET_TIMEOUT
+        asyncio.run(_read_loki_logs(loki_url, timeout, job_name, worker_id))
+
+
+# TODO(asaiacai): write a query_range function to get all the
+# logs for a job, for the non-tailing option
+
+# Run the WebSocket log tailing function
+if __name__ == '__main__':
+    tail_loki_logs_ws('tune-bc43', worker_id=0, follow=True)
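The parsing in _read_loki_logs assumes Loki's streaming tail format: each WebSocket message is a JSON object whose `streams` entries carry `values` as [timestamp, line] pairs. A sketch of one decoded message (label and line contents illustrative):

message = {
    'streams': [
        {
            'stream': {'k8s_job_name': 'tune-bc43-workers-0'},
            'values': [['1732790000000000000', 'step 42: loss=0.13']],
        }
    ]
}
# The reader prints message['streams'][0]['values'][0][1],
# i.e. the log line itself.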
konduktor/utils/rich_utils.py
@@ -0,0 +1,123 @@
+# Proprietary Changes made for Trainy under the Trainy Software License
+# Original source: skypilot: https://github.com/skypilot-org/skypilot
+# which is Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Rich status spinner utils."""
+
+import contextlib
+import logging
+import threading
+from typing import Union
+
+import rich.console as rich_console
+
+console = rich_console.Console(soft_wrap=True)
+_status = None
+_status_nesting_level = 0
+
+_logging_lock = threading.RLock()
+
+
+class _NoOpConsoleStatus:
+    """An empty class for multi-threaded console.status."""
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        pass
+
+    def update(self, text):
+        pass
+
+    def stop(self):
+        pass
+
+    def start(self):
+        pass
+
+
+class _RevertibleStatus:
+    """A wrapper for status that can revert to previous message after exit."""
+
+    def __init__(self, message: str):
+        if _status is not None:
+            self.previous_message = _status.status
+        else:
+            self.previous_message = None
+        self.message = message
+
+    def __enter__(self):
+        global _status_nesting_level
+        _status.update(self.message)
+        _status_nesting_level += 1
+        _status.__enter__()
+        return _status
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        global _status_nesting_level, _status
+        _status_nesting_level -= 1
+        if _status_nesting_level <= 0:
+            _status_nesting_level = 0
+            if _status is not None:
+                _status.__exit__(exc_type, exc_val, exc_tb)
+                _status = None
+        else:
+            _status.update(self.previous_message)
+
+    def update(self, *args, **kwargs):
+        _status.update(*args, **kwargs)
+
+    def stop(self):
+        _status.stop()
+
+    def start(self):
+        _status.start()
+
+
+@contextlib.contextmanager
+def safe_logger():
+    logged = False
+    with _logging_lock:
+        if _status is not None and _status._live.is_started:  # pylint: disable=protected-access
+            _status.stop()
+            yield
+            logged = True
+            _status.start()
+    if not logged:
+        yield
+
+
+class RichSafeStreamHandler(logging.StreamHandler):
+    def emit(self, record: logging.LogRecord) -> None:
+        with safe_logger():
+            return super().emit(record)
+
+
+def force_update_status(msg: str):
+    """Update the status message even if konduktor_logging.is_silent() is true."""
+    if threading.current_thread() is threading.main_thread() and _status is not None:
+        _status.update(msg)
+
+
+def safe_status(msg: str) -> Union['rich_console.Status', _NoOpConsoleStatus]:
+    """A wrapper for multi-threaded console.status."""
+    from konduktor import logging  # pylint: disable=import-outside-toplevel
+
+    global _status
+    if (
+        threading.current_thread() is threading.main_thread()
+        and not logging.is_silent()
+    ):
+        if _status is None:
+            _status = console.status(msg, refresh_per_second=8)
+        return _RevertibleStatus(msg)
+    return _NoOpConsoleStatus()
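The nesting behavior is easiest to see in use: an inner safe_status temporarily replaces the spinner text, and _RevertibleStatus restores the enclosing message on exit. A minimal sketch (messages illustrative):

from konduktor.utils import rich_utils

with rich_utils.safe_status('Launching job...'):
    # spinner shows 'Launching job...'
    with rich_utils.safe_status('Syncing files...'):
        pass  # spinner shows 'Syncing files...'
    # spinner reverts to 'Launching job...' here
# Off the main thread, or when logging.is_silent() is true,
# safe_status returns a no-op status object instead.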