konduktor-nightly 0.1.0.dev20251128104812__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (107)
  1. konduktor/__init__.py +49 -0
  2. konduktor/adaptors/__init__.py +0 -0
  3. konduktor/adaptors/aws.py +221 -0
  4. konduktor/adaptors/common.py +118 -0
  5. konduktor/adaptors/gcp.py +126 -0
  6. konduktor/authentication.py +124 -0
  7. konduktor/backends/__init__.py +6 -0
  8. konduktor/backends/backend.py +86 -0
  9. konduktor/backends/constants.py +21 -0
  10. konduktor/backends/deployment.py +204 -0
  11. konduktor/backends/deployment_utils.py +1351 -0
  12. konduktor/backends/jobset.py +225 -0
  13. konduktor/backends/jobset_utils.py +726 -0
  14. konduktor/backends/pod_utils.py +501 -0
  15. konduktor/check.py +184 -0
  16. konduktor/cli.py +1945 -0
  17. konduktor/config.py +420 -0
  18. konduktor/constants.py +36 -0
  19. konduktor/controller/__init__.py +0 -0
  20. konduktor/controller/constants.py +56 -0
  21. konduktor/controller/launch.py +44 -0
  22. konduktor/controller/node.py +116 -0
  23. konduktor/controller/parse.py +111 -0
  24. konduktor/dashboard/README.md +30 -0
  25. konduktor/dashboard/backend/main.py +169 -0
  26. konduktor/dashboard/backend/sockets.py +154 -0
  27. konduktor/dashboard/frontend/.eslintrc.json +3 -0
  28. konduktor/dashboard/frontend/.gitignore +36 -0
  29. konduktor/dashboard/frontend/app/api/jobs/route.js +71 -0
  30. konduktor/dashboard/frontend/app/api/namespaces/route.js +69 -0
  31. konduktor/dashboard/frontend/app/components/Grafana.jsx +66 -0
  32. konduktor/dashboard/frontend/app/components/JobsData.jsx +197 -0
  33. konduktor/dashboard/frontend/app/components/LogsData.jsx +139 -0
  34. konduktor/dashboard/frontend/app/components/NavMenu.jsx +39 -0
  35. konduktor/dashboard/frontend/app/components/NavTabs.jsx +73 -0
  36. konduktor/dashboard/frontend/app/components/NavTabs2.jsx +30 -0
  37. konduktor/dashboard/frontend/app/components/SelectBtn.jsx +27 -0
  38. konduktor/dashboard/frontend/app/components/lib/utils.js +6 -0
  39. konduktor/dashboard/frontend/app/components/ui/chip-select.jsx +78 -0
  40. konduktor/dashboard/frontend/app/components/ui/input.jsx +19 -0
  41. konduktor/dashboard/frontend/app/components/ui/navigation-menu.jsx +104 -0
  42. konduktor/dashboard/frontend/app/components/ui/select.jsx +120 -0
  43. konduktor/dashboard/frontend/app/favicon.ico +0 -0
  44. konduktor/dashboard/frontend/app/globals.css +120 -0
  45. konduktor/dashboard/frontend/app/jobs/page.js +10 -0
  46. konduktor/dashboard/frontend/app/layout.js +22 -0
  47. konduktor/dashboard/frontend/app/logs/page.js +11 -0
  48. konduktor/dashboard/frontend/app/page.js +12 -0
  49. konduktor/dashboard/frontend/jsconfig.json +7 -0
  50. konduktor/dashboard/frontend/next.config.mjs +4 -0
  51. konduktor/dashboard/frontend/package-lock.json +6687 -0
  52. konduktor/dashboard/frontend/package.json +37 -0
  53. konduktor/dashboard/frontend/postcss.config.mjs +8 -0
  54. konduktor/dashboard/frontend/server.js +64 -0
  55. konduktor/dashboard/frontend/tailwind.config.js +17 -0
  56. konduktor/data/__init__.py +9 -0
  57. konduktor/data/aws/__init__.py +15 -0
  58. konduktor/data/aws/s3.py +1138 -0
  59. konduktor/data/constants.py +7 -0
  60. konduktor/data/data_utils.py +268 -0
  61. konduktor/data/gcp/__init__.py +19 -0
  62. konduktor/data/gcp/constants.py +42 -0
  63. konduktor/data/gcp/gcs.py +994 -0
  64. konduktor/data/gcp/utils.py +9 -0
  65. konduktor/data/registry.py +19 -0
  66. konduktor/data/storage.py +812 -0
  67. konduktor/data/storage_utils.py +535 -0
  68. konduktor/execution.py +447 -0
  69. konduktor/kube_client.py +237 -0
  70. konduktor/logging.py +111 -0
  71. konduktor/manifests/aibrix-setup.yaml +430 -0
  72. konduktor/manifests/apoxy-setup.yaml +184 -0
  73. konduktor/manifests/apoxy-setup2.yaml +98 -0
  74. konduktor/manifests/controller_deployment.yaml +69 -0
  75. konduktor/manifests/dashboard_deployment.yaml +131 -0
  76. konduktor/manifests/dmesg_daemonset.yaml +57 -0
  77. konduktor/manifests/pod_cleanup_controller.yaml +129 -0
  78. konduktor/resource.py +546 -0
  79. konduktor/serving.py +153 -0
  80. konduktor/task.py +949 -0
  81. konduktor/templates/deployment.yaml.j2 +191 -0
  82. konduktor/templates/jobset.yaml.j2 +43 -0
  83. konduktor/templates/pod.yaml.j2 +563 -0
  84. konduktor/usage/__init__.py +0 -0
  85. konduktor/usage/constants.py +21 -0
  86. konduktor/utils/__init__.py +0 -0
  87. konduktor/utils/accelerator_registry.py +17 -0
  88. konduktor/utils/annotations.py +62 -0
  89. konduktor/utils/base64_utils.py +95 -0
  90. konduktor/utils/common_utils.py +426 -0
  91. konduktor/utils/constants.py +5 -0
  92. konduktor/utils/env_options.py +55 -0
  93. konduktor/utils/exceptions.py +234 -0
  94. konduktor/utils/kubernetes_enums.py +8 -0
  95. konduktor/utils/kubernetes_utils.py +763 -0
  96. konduktor/utils/log_utils.py +467 -0
  97. konduktor/utils/loki_utils.py +102 -0
  98. konduktor/utils/rich_utils.py +123 -0
  99. konduktor/utils/schemas.py +625 -0
  100. konduktor/utils/subprocess_utils.py +273 -0
  101. konduktor/utils/ux_utils.py +247 -0
  102. konduktor/utils/validator.py +461 -0
  103. konduktor_nightly-0.1.0.dev20251128104812.dist-info/LICENSE +91 -0
  104. konduktor_nightly-0.1.0.dev20251128104812.dist-info/METADATA +98 -0
  105. konduktor_nightly-0.1.0.dev20251128104812.dist-info/RECORD +107 -0
  106. konduktor_nightly-0.1.0.dev20251128104812.dist-info/WHEEL +4 -0
  107. konduktor_nightly-0.1.0.dev20251128104812.dist-info/entry_points.txt +3 -0
konduktor/utils/log_utils.py
@@ -0,0 +1,467 @@
+# Proprietary Changes made for Trainy under the Trainy Software License
+# Original source: skypilot: https://github.com/skypilot-org/skypilot
+# which is Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import asyncio
+import copy
+import enum
+import io
+import json
+import multiprocessing
+import os
+import subprocess
+import sys
+import types
+import urllib.parse
+from typing import Any, Dict, List, Optional, Tuple, Type, Union
+
+import colorama
+import kr8s
+import prettytable
+import requests
+import websockets
+
+from konduktor import config, logging
+from konduktor.utils import kubernetes_utils, subprocess_utils
+
+logger = logging.get_logger(__name__)
+
+
+LOKI_REMOTE_PORT = 3100
+WEBSOCKET_TIMEOUT = 10
+INFINITY = 999999
+VICKY_REMOTE_PORT = 9428
+
+
+class LogBackend(enum.Enum):
+    VICTORIA = 'victoria'
+    LOKI = 'loki'
+
+
+class LineProcessor(object):
+    """A processor for log lines."""
+
+    def __enter__(self) -> None:
+        pass
+
+    def process_line(self, log_line: str) -> None:
+        pass
+
+    def __exit__(
+        self,
+        except_type: Optional[Type[BaseException]],
+        except_value: Optional[BaseException],
+        traceback: Optional[types.TracebackType],
+    ) -> None:
+        del except_type, except_value, traceback  # unused
+        pass
+
+
+class _ProcessingArgs:
+    """Arguments for processing logs."""
+
+    def __init__(
+        self,
+        log_path: str,
+        stream_logs: bool,
+        start_streaming_at: str = '',
+        end_streaming_at: Optional[str] = None,
+        skip_lines: Optional[List[str]] = None,
+        replace_crlf: bool = False,
+        line_processor: Optional[LineProcessor] = None,
+        streaming_prefix: Optional[str] = None,
+    ) -> None:
+        self.log_path = log_path
+        self.stream_logs = stream_logs
+        self.start_streaming_at = start_streaming_at
+        self.end_streaming_at = end_streaming_at
+        self.skip_lines = skip_lines
+        self.replace_crlf = replace_crlf
+        self.line_processor = line_processor
+        self.streaming_prefix = streaming_prefix
+
+
+def _handle_io_stream(io_stream, out_stream, args: _ProcessingArgs):
+    """Process the stream of a process."""
+    out_io = io.TextIOWrapper(
+        io_stream, encoding='utf-8', newline='', errors='replace', write_through=True
+    )
+
+    start_streaming_flag = False
+    end_streaming_flag = False
+    streaming_prefix = args.streaming_prefix if args.streaming_prefix else ''
+    line_processor = (
+        LineProcessor() if args.line_processor is None else args.line_processor
+    )
+
+    out = []
+    with open(args.log_path, 'a', encoding='utf-8') as fout:
+        with line_processor:
+            while True:
+                line = out_io.readline()
+                if not line:
+                    break
+                # start_streaming_at logic in processor.process_line(line)
+                if args.replace_crlf and line.endswith('\r\n'):
+                    # Replace CRLF with LF to avoid ray logging to the same
+                    # line due to separating lines with '\n'.
+                    line = line[:-2] + '\n'
+                if args.skip_lines is not None and any(
+                    skip in line for skip in args.skip_lines
+                ):
+                    continue
+                if args.start_streaming_at in line:
+                    start_streaming_flag = True
+                if args.end_streaming_at is not None and args.end_streaming_at in line:
+                    # Stop streaming from here on, but keep executing the
+                    # loop so the remaining lines are still saved to the file.
+                    end_streaming_flag = True
+                if args.stream_logs and start_streaming_flag and not end_streaming_flag:
+                    print(streaming_prefix + line, end='', file=out_stream, flush=True)
+                if args.log_path != '/dev/null':
+                    fout.write(line)
+                    fout.flush()
+                line_processor.process_line(line)
+                out.append(line)
+    return ''.join(out)
+
+
+def process_subprocess_stream(proc, args: _ProcessingArgs) -> Tuple[str, str]:
+    """Redirect the process's filtered stdout/stderr to both stream and file."""
+    if proc.stderr is not None:
+        # Asyncio does not work here as the output processing can be
+        # executed in a different thread.
+        # selectors could handle the multiplexing of stdout/stderr,
+        # but it introduces buffering that breaks streaming output.
+        with multiprocessing.pool.ThreadPool(processes=1) as pool:
+            err_args = copy.copy(args)
+            err_args.line_processor = None
+            stderr_fut = pool.apply_async(
+                _handle_io_stream, args=(proc.stderr, sys.stderr, err_args)
+            )
+            # Do not launch a thread for stdout: rich.status (used by
+            # log_utils.RayUpLineProcessor) does not work when run
+            # from a thread.
+            stdout = _handle_io_stream(proc.stdout, sys.stdout, args)
+            stderr = stderr_fut.get()
+    else:
+        stdout = _handle_io_stream(proc.stdout, sys.stdout, args)
+        stderr = ''
+    return stdout, stderr
+
+
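LineProcessor above is deliberately a no-op context manager; subclasses override process_line to react to output as it streams through _handle_io_stream. A minimal sketch of a custom processor (the ErrorCounter name is hypothetical, not part of the package), which could be handed to _ProcessingArgs(line_processor=...) or to the run_with_log helper later in this file:

class ErrorCounter(LineProcessor):
    """Hypothetical processor that counts lines containing 'ERROR'."""

    def __enter__(self) -> None:
        self.error_count = 0  # reset when stream processing starts

    def process_line(self, log_line: str) -> None:
        if 'ERROR' in log_line:
            self.error_count += 1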
+def create_table(field_names: List[str], **kwargs) -> prettytable.PrettyTable:
+    """Creates a table with default style."""
+    border = kwargs.pop('border', False)
+    align = kwargs.pop('align', 'l')
+    table = prettytable.PrettyTable(
+        align=align, border=border, field_names=field_names, **kwargs
+    )
+    table.left_padding_width = 0
+    table.right_padding_width = 2
+    return table
+
+
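A quick illustration of the resulting style (row values hypothetical): with border=False, left alignment, and the padding set above, the table renders roughly as:

table = create_table(['NAME', 'STATUS'])
table.add_row(['tune-bc43', 'RUNNING'])
print(table)
# NAME       STATUS
# tune-bc43  RUNNING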
+def _get_kr8s_api():
+    """Return a kr8s API configured for the selected context.
+
+    Honors kubernetes.allowed_contexts via
+    get_current_kube_config_context_name(). Fails with a clear error if a
+    configured context cannot be initialized.
+    """
+    ctx = kubernetes_utils.get_current_kube_config_context_name()
+    try:
+        # kr8s exposes a factory function `api(...)` that accepts a context.
+        api = kr8s.api(context=ctx) if ctx else kr8s.api()
+        if ctx:
+            logger.debug('Initialized kr8s API for context: %s', ctx)
+        return api
+    except Exception as e:  # defensive: surface a clear error if context fails
+        if ctx:
+            raise ValueError(
+                'Failed to initialize kr8s client for context '
+                f'{ctx!r}. Ensure the context exists and your kubeconfig is valid.'
+            ) from e
+        raise
+
+
+def run_with_log(
+    cmd: Union[List[str], str],
+    log_path: str,
+    *,
+    require_outputs: bool = False,
+    stream_logs: bool = False,
+    start_streaming_at: str = '',
+    end_streaming_at: Optional[str] = None,
+    skip_lines: Optional[List[str]] = None,
+    shell: bool = False,
+    with_ray: bool = False,
+    process_stream: bool = True,
+    line_processor: Optional[LineProcessor] = None,
+    streaming_prefix: Optional[str] = None,
+    **kwargs,
+) -> Union[int, Tuple[int, str, str]]:
+    """Runs a command and logs its output to a file.
+
+    Args:
+        cmd: The command to run.
+        log_path: The path to the log file.
+        stream_logs: Whether to stream the logs to stdout/stderr.
+        require_outputs: Whether to return the stdout/stderr of the command.
+        process_stream: Whether to post-process the stdout/stderr of the
+            command, such as replacing or skipping lines on the fly. If
+            enabled, lines are printed only when '\r' or '\n' is found.
+
+    Returns the returncode, or a (returncode, stdout, stderr) tuple.
+    Note that the stdout and stderr are already decoded.
+    """
+    assert process_stream or not require_outputs, (
+        process_stream,
+        require_outputs,
+        'require_outputs should be False when process_stream is False',
+    )
+
+    log_path = os.path.expanduser(log_path)
+    dirname = os.path.dirname(log_path)
+    os.makedirs(dirname, exist_ok=True)
+    # Redirect stderr to stdout when using ray, to preserve the order of
+    # stdout and stderr.
+    stdout_arg = stderr_arg = None
+    if process_stream:
+        stdout_arg = subprocess.PIPE
+        stderr_arg = subprocess.PIPE if not with_ray else subprocess.STDOUT
+    # Use stdin=subprocess.DEVNULL by default, as allowing inputs will mess up
+    # the terminal output when typing in the terminal that starts the API
+    # server.
+    stdin = kwargs.pop('stdin', subprocess.DEVNULL)
+    with subprocess.Popen(
+        cmd,
+        stdout=stdout_arg,
+        stderr=stderr_arg,
+        start_new_session=True,
+        shell=shell,
+        stdin=stdin,
+        **kwargs,
+    ) as proc:
+        try:
+            subprocess_utils.kill_process_daemon(proc.pid)
+            stdout = ''
+            stderr = ''
+
+            if process_stream:
+                if skip_lines is None:
+                    skip_lines = []
+                # Skip these lines caused by the `-i` option of bash; no other
+                # way was found to turn off these two warnings.
+                # https://stackoverflow.com/questions/13300764/how-to-tell-bash-not-to-issue-warnings-cannot-set-terminal-process-group-and # noqa: E501
+                # `ssh -T -i -tt` still causes the problem.
+                skip_lines += [
+                    'bash: cannot set terminal process group',
+                    'bash: no job control in this shell',
+                ]
+                # We need this even if the log_path is '/dev/null' to ensure the
+                # progress bar is shown.
+                # NOTE: Lines are printed only when '\r' or '\n' is found.
+                args = _ProcessingArgs(
+                    log_path=log_path,
+                    stream_logs=stream_logs,
+                    start_streaming_at=start_streaming_at,
+                    end_streaming_at=end_streaming_at,
+                    skip_lines=skip_lines,
+                    line_processor=line_processor,
+                    # Replace CRLF when the output is logged to the driver by ray.
+                    replace_crlf=with_ray,
+                    streaming_prefix=streaming_prefix,
+                )
+                stdout, stderr = process_subprocess_stream(proc, args)
+            proc.wait()
+            if require_outputs:
+                return proc.returncode, stdout, stderr
+            return proc.returncode
+        except KeyboardInterrupt:
+            # Kill the subprocess directly; otherwise the underlying process
+            # is only killed after the python program exits, leaving the
+            # stream handling stuck at `readline`.
+            subprocess_utils.kill_children_processes()
+            raise
+
+
+ async def _read_logs(url: str, timeout: int, job_name: str, worker_id: int, port: int):
298
+ ws = await asyncio.wait_for(websockets.connect(url), timeout=WEBSOCKET_TIMEOUT)
299
+ logger.info(
300
+ f'{colorama.Fore.YELLOW}Tailing logs. '
301
+ f'Forwarding from remote port {port}. Press Ctrl+C to stop. '
302
+ f'{colorama.Style.RESET_ALL}'
303
+ )
304
+ try:
305
+ while True:
306
+ message = await asyncio.wait_for(ws.recv(), timeout=timeout)
307
+ try:
308
+ payload = json.loads(message)
309
+ for stream in payload['streams']:
310
+ if stream['values'][0][1] is not None:
311
+ print(
312
+ f"{colorama.Fore.CYAN}{colorama.Style.BRIGHT} "
313
+ f"(job_name={job_name} worker_id={worker_id})"
314
+ f"{colorama.Style.RESET_ALL} {stream['values'][0][1]}",
315
+ flush=True,
316
+ )
317
+ except json.JSONDecodeError:
318
+ logger.warning(f'Failed to decode log skipping: {message}')
319
+ logger.debug(f'Dropped log: {message}')
320
+ continue
321
+ except asyncio.exceptions.TimeoutError:
322
+ logger.debug('Websocket timed-out, closing log stream!')
323
+ except KeyboardInterrupt:
324
+ logger.debug('Keyboard interrupt, closing log stream!')
325
+
326
+
327
+ def tail_loki_logs_ws(
328
+ job_name: str, worker_id: int = 0, num_logs: int = 1000, follow: bool = True
329
+ ):
330
+ if num_logs > 5000:
331
+ # TODO(asaiacai): we should not have a limit on the number of logs, but rather
332
+ # let the user specify any number of lines, and we can print the last N lines.
333
+ # this can be done in chunks. Potentially, we can query range
334
+ # until we reach the end of the log and then invoke tail again.
335
+ # Also include checks that the job is running/ever ran.
336
+ raise ValueError('num_logs must be less than or equal to 5000')
337
+ api = _get_kr8s_api()
338
+ loki_svc = kr8s.objects.Service.get('loki', namespace='loki', api=api)
339
+ with kr8s.portforward.PortForward(
340
+ loki_svc, LOKI_REMOTE_PORT, local_port='auto'
341
+ ) as port:
342
+ loki_url = f'ws://localhost:{port}/loki/api/v1/tail'
343
+ logger.debug(f'Loki URL: {loki_url}')
344
+ params = {
345
+ 'query': urllib.parse.quote(
346
+ r'{' + f'k8s_job_name="{job_name}-workers-0",'
347
+ r' k8s_container_name="konduktor-container"} '
348
+ f' | batch_kubernetes_io_job_completion_index = `{worker_id}`'
349
+ ),
350
+ 'limit': num_logs,
351
+ 'delay': 5,
352
+ # TODO(asaiacai): need to auto-generate the start and end times.
353
+ }
354
+
355
+ query_string = '&'.join(f'{key}={value}' for key, value in params.items())
356
+ loki_url += f'?{query_string}'
357
+ timeout = INFINITY if follow else WEBSOCKET_TIMEOUT
358
+ asyncio.run(
359
+ _read_logs(loki_url, timeout, job_name, worker_id, LOKI_REMOTE_PORT)
360
+ )
361
+
362
+
363
+ def tail_vicky_logs(
364
+ job_name: str,
365
+ worker_id: int = 0,
366
+ num_logs: int = -1,
367
+ follow: bool = True,
368
+ start_offset: str = '1h',
369
+ ):
370
+ context = kubernetes_utils.get_current_kube_config_context_name()
371
+ namespace = kubernetes_utils.get_kube_config_context_namespace(context)
372
+ query: Dict[str, Any] = {}
373
+ api = _get_kr8s_api()
374
+ vicky_svc = kr8s.objects.Service.get(
375
+ 'vls-victoria-logs-single-server', namespace='victoria-logs', api=api
376
+ )
377
+
378
+ if num_logs == -1:
379
+ query = {}
380
+ else:
381
+ assert num_logs > 0, f'num_logs must be greater than 0, got {num_logs}'
382
+ query = {'limit': num_logs}
383
+ if follow:
384
+ effective_offset = start_offset or '1h'
385
+ logger.info(
386
+ f'Tailing logs from {effective_offset} ago. '
387
+ 'If logs come up empty, there might be logs just earlier '
388
+ 'than that window, check Grafana or use:\n'
389
+ f'{colorama.Style.BRIGHT}{colorama.Fore.YELLOW}'
390
+ f'`konduktor logs --no-follow {job_name}`'
391
+ f'{colorama.Style.RESET_ALL}'
392
+ )
393
+ query['start_offset'] = effective_offset
394
+ query['query'] = (
395
+ f'k8s.namespace.name: "{namespace}" AND '
396
+ f'batch.kubernetes.io/job-name: "{job_name}-workers-0" AND '
397
+ f'batch.kubernetes.io/job-completion-index: "{worker_id}"'
398
+ )
399
+
400
+ with kr8s.portforward.PortForward(
401
+ vicky_svc, VICKY_REMOTE_PORT, local_port='auto'
402
+ ) as port:
403
+ if follow:
404
+ timeout = INFINITY
405
+ vicky_url = f'http://localhost:{port}/select/logsql/tail'
406
+ else:
407
+ vicky_url = f'http://localhost:{port}/select/logsql/query'
408
+ timeout = config.get_nested(('logs', 'timeout'), 60)
409
+ logger.debug(f'Vicky URL: {vicky_url}')
410
+
411
+ try:
412
+ logger.debug(f'Making request to {vicky_url} with query: {query}')
413
+ with requests.post(
414
+ vicky_url, data=query, stream=True, timeout=timeout
415
+ ) as response: # type: requests.Response
416
+ logger.debug(f'Response status: {response.status_code}')
417
+ if response.status_code != 200:
418
+ logger.error(
419
+ f'VictoriaLogs API returned status {response.status_code}: '
420
+ f'{response.text}'
421
+ )
422
+ return
423
+
424
+ for line in response.iter_lines(decode_unicode=True):
425
+ if line:
426
+ payload = json.loads(line)
427
+ if 'missing _msg field' in payload['_msg']:
428
+ payload['_msg'] = ''
429
+ print(
430
+ f"{colorama.Fore.CYAN}{colorama.Style.BRIGHT} "
431
+ f"(job_name={job_name} worker_id={worker_id})"
432
+ f"{colorama.Style.RESET_ALL} {payload['_msg']}",
433
+ flush=True,
434
+ )
435
+
436
+ except KeyboardInterrupt:
437
+ logger.info('\nStopping log stream...')
438
+ except requests.exceptions.Timeout:
439
+ logger.error(
440
+ f'Request to VictoriaLogs timed out after {timeout} seconds. '
441
+ 'Try increasing the timeout in the config '
442
+ 'file under `logs.timeout`. If '
443
+ 'you are still seeing issues, please contact support.'
444
+ )
445
+ except requests.exceptions.ConnectionError as e:
446
+ logger.error(f'Failed to connect to VictoriaLogs at {vicky_url}: {e}')
447
+ except requests.exceptions.RequestException as e:
448
+ logger.error(f'Request to VictoriaLogs failed: {e}')
449
+ except Exception as e:
450
+ logger.error(f'Unexpected error while tailing VictoriaLogs: {e}')
451
+
452
+
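For illustration, a job named tune-bc43 (hypothetical) in the default namespace, tailed with worker_id=0 and follow=True, would produce a form payload along these lines, POSTed to the forwarded /select/logsql/tail endpoint:

query = {
    'start_offset': '1h',  # only set when follow=True
    'query': (
        'k8s.namespace.name: "default" AND '
        'batch.kubernetes.io/job-name: "tune-bc43-workers-0" AND '
        'batch.kubernetes.io/job-completion-index: "0"'
    ),
}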
+def tail_logs(
+    job_name: str,
+    worker_id: int = 0,
+    num_logs: int = 1000,
+    follow: bool = True,
+    start_offset: str = '1h',
+):
+    logs_backend = config.get_nested(('logs', 'backend'), None)
+    if logs_backend == LogBackend.VICTORIA.value:  # config stores a plain string
+        tail_vicky_logs(job_name, worker_id, num_logs, follow, start_offset)
+    elif logs_backend == LogBackend.LOKI.value:
+        tail_loki_logs_ws(job_name, worker_id, num_logs, follow)
+    else:
+        logger.info('Defaulting to VictoriaLogs')
+        tail_vicky_logs(job_name, worker_id, num_logs, follow, start_offset)
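tail_logs dispatches on the logs.backend config key and falls back to VictoriaLogs when the key is unset or unrecognized. A usage sketch (job name illustrative, assuming `logs.backend: loki` is set in the konduktor config):

from konduktor.utils import log_utils

# Tails Loki because of the config above; with 'victoria' or no value,
# the call falls through to VictoriaLogs instead.
log_utils.tail_logs('tune-bc43', worker_id=0, num_logs=500, follow=False)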
konduktor/utils/loki_utils.py
@@ -0,0 +1,102 @@
+"""Loki utils: query/tail logs from Loki"""
+# TODO(asaiacai): eventually support querying a
+# centralized Loki that lives outside the cluster
+
+import asyncio
+import json
+import urllib.parse
+
+import colorama
+import kr8s
+import websockets
+
+from konduktor import logging
+from konduktor.utils import kubernetes_utils
+
+logger = logging.get_logger(__name__)
+
+LOKI_REMOTE_PORT = 3100
+WEBSOCKET_TIMEOUT = 10
+INFINITY = 999999
+
+
+async def _read_loki_logs(loki_url: str, timeout: int, job_name: str, worker_id: int):
+    ws = await asyncio.wait_for(websockets.connect(loki_url), timeout=WEBSOCKET_TIMEOUT)
+    logger.info(
+        f'{colorama.Fore.YELLOW}Tailing logs from Loki. '
+        f'Forwarding from remote port {LOKI_REMOTE_PORT}. Press Ctrl+C to stop. '
+        f'{colorama.Style.RESET_ALL}'
+    )
+    try:
+        while True:
+            message = await asyncio.wait_for(ws.recv(), timeout=timeout)
+            try:
+                payload = json.loads(message)
+                for stream in payload['streams']:
+                    if stream['values'][0][1] is not None:
+                        print(
+                            f"{colorama.Fore.CYAN}{colorama.Style.BRIGHT} "
+                            f"(job_name={job_name} worker_id={worker_id})"
+                            f"{colorama.Style.RESET_ALL} {stream['values'][0][1]}",
+                            flush=True,
+                        )
+            except json.JSONDecodeError:
+                logger.warning(f'Failed to decode log line, skipping: {message}')
+                logger.debug(f'Dropped log: {message}')
+                continue
+    except asyncio.exceptions.TimeoutError:
+        logger.debug('Websocket timed out, closing log stream!')
+    except KeyboardInterrupt:
+        logger.debug('Keyboard interrupt, closing log stream!')
+
+
+def tail_loki_logs_ws(
+    job_name: str, worker_id: int = 0, num_logs: int = 1000, follow: bool = True
+):
+    if num_logs > 5000:
+        # TODO(asaiacai): we should not have a limit on the number of logs;
+        # rather, let the user specify any number of lines and print the last
+        # N lines in chunks. Potentially, we can query a range until we
+        # reach the end of the log and then invoke tail again.
+        # Also include checks that the job is running/ever ran.
+        raise ValueError('num_logs must be less than or equal to 5000')
+    # Initialize the kr8s API, honoring allowed_contexts if configured.
+    ctx = kubernetes_utils.get_current_kube_config_context_name()
+    try:
+        api = kr8s.api(context=ctx) if ctx else kr8s.api()
+    except Exception as e:
+        if ctx:
+            raise ValueError(
+                'Failed to initialize kr8s client for context '
+                f'{ctx!r}. Ensure the context exists and your kubeconfig is valid.'
+            ) from e
+        raise
+    loki_svc = kr8s.objects.Service.get('loki', namespace='loki', api=api)
+    with kr8s.portforward.PortForward(
+        loki_svc, LOKI_REMOTE_PORT, local_port='auto'
+    ) as port:
+        loki_url = f'ws://localhost:{port}/loki/api/v1/tail'
+        logger.debug(f'Loki URL: {loki_url}')
+        params = {
+            'query': urllib.parse.quote(
+                r'{' + f'k8s_job_name="{job_name}-workers-0",'
+                r' k8s_container_name="konduktor-container"} '
+                f' | batch_kubernetes_io_job_completion_index = `{worker_id}`'
+            ),
+            'limit': num_logs,
+            'delay': 5,
+            # TODO(asaiacai): need to auto-generate the start and end times.
+        }
+
+        query_string = '&'.join(f'{key}={value}' for key, value in params.items())
+        loki_url += f'?{query_string}'
+        timeout = INFINITY if follow else WEBSOCKET_TIMEOUT
+        asyncio.run(_read_loki_logs(loki_url, timeout, job_name, worker_id))
+
+
+# TODO(asaiacai): write a query_range function to get all the
+# logs for a job, for the non-tailing option
+
+# Run the WebSocket log tailing function
+if __name__ == '__main__':
+    tail_loki_logs_ws('tune-bc43', worker_id=0, follow=True)
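The parsing in _read_loki_logs assumes Loki's streaming tail format: each WebSocket message is a JSON object whose `streams` entries carry `values` as [timestamp, line] pairs. A sketch of one decoded message (label and line contents illustrative):

message = {
    'streams': [
        {
            'stream': {'k8s_job_name': 'tune-bc43-workers-0'},
            'values': [['1732790000000000000', 'step 42: loss=0.13']],
        }
    ]
}
# The reader prints message['streams'][0]['values'][0][1],
# i.e. the log line itself.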
konduktor/utils/rich_utils.py
@@ -0,0 +1,123 @@
+# Proprietary Changes made for Trainy under the Trainy Software License
+# Original source: skypilot: https://github.com/skypilot-org/skypilot
+# which is Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Rich status spinner utils."""
+
+import contextlib
+import logging
+import threading
+from typing import Union
+
+import rich.console as rich_console
+
+console = rich_console.Console(soft_wrap=True)
+_status = None
+_status_nesting_level = 0
+
+_logging_lock = threading.RLock()
+
+
+class _NoOpConsoleStatus:
+    """An empty class for multi-threaded console.status."""
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        pass
+
+    def update(self, text):
+        pass
+
+    def stop(self):
+        pass
+
+    def start(self):
+        pass
+
+
+class _RevertibleStatus:
+    """A wrapper for status that can revert to previous message after exit."""
+
+    def __init__(self, message: str):
+        if _status is not None:
+            self.previous_message = _status.status
+        else:
+            self.previous_message = None
+        self.message = message
+
+    def __enter__(self):
+        global _status_nesting_level
+        _status.update(self.message)
+        _status_nesting_level += 1
+        _status.__enter__()
+        return _status
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        global _status_nesting_level, _status
+        _status_nesting_level -= 1
+        if _status_nesting_level <= 0:
+            _status_nesting_level = 0
+            if _status is not None:
+                _status.__exit__(exc_type, exc_val, exc_tb)
+                _status = None
+        else:
+            _status.update(self.previous_message)
+
+    def update(self, *args, **kwargs):
+        _status.update(*args, **kwargs)
+
+    def stop(self):
+        _status.stop()
+
+    def start(self):
+        _status.start()
+
+
+@contextlib.contextmanager
+def safe_logger():
+    logged = False
+    with _logging_lock:
+        if _status is not None and _status._live.is_started:  # pylint: disable=protected-access
+            _status.stop()
+            yield
+            logged = True
+            _status.start()
+    if not logged:
+        yield
+
+
+class RichSafeStreamHandler(logging.StreamHandler):
+    def emit(self, record: logging.LogRecord) -> None:
+        with safe_logger():
+            return super().emit(record)
+
+
+def force_update_status(msg: str):
+    """Update the status message even if konduktor_logging.is_silent() is true."""
+    if threading.current_thread() is threading.main_thread() and _status is not None:
+        _status.update(msg)
+
+
+def safe_status(msg: str) -> Union['rich_console.Status', _NoOpConsoleStatus]:
+    """A wrapper for multi-threaded console.status."""
+    from konduktor import logging  # pylint: disable=import-outside-toplevel
+
+    global _status
+    if (
+        threading.current_thread() is threading.main_thread()
+        and not logging.is_silent()
+    ):
+        if _status is None:
+            _status = console.status(msg, refresh_per_second=8)
+        return _RevertibleStatus(msg)
+    return _NoOpConsoleStatus()
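The nesting behavior is easiest to see in use: an inner safe_status temporarily replaces the spinner text, and _RevertibleStatus restores the enclosing message on exit. A minimal sketch (messages illustrative):

from konduktor.utils import rich_utils

with rich_utils.safe_status('Launching job...'):
    # spinner shows 'Launching job...'
    with rich_utils.safe_status('Syncing files...'):
        pass  # spinner shows 'Syncing files...'
    # spinner reverts to 'Launching job...' here
# Off the main thread, or when logging.is_silent() is true,
# safe_status returns a no-op status object instead.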