skypilot-nightly 1.0.0.dev20250925__py3-none-any.whl → 1.0.0.dev20250927__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic.
- sky/__init__.py +2 -2
- sky/backends/backend_utils.py +38 -14
- sky/backends/cloud_vm_ray_backend.py +151 -36
- sky/client/cli/command.py +18 -9
- sky/client/cli/table_utils.py +34 -0
- sky/client/common.py +4 -2
- sky/client/sdk.py +11 -7
- sky/client/sdk_async.py +5 -5
- sky/core.py +6 -6
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/{bn-NHt5qTzeTN2PefXuDA → UDSEoDB67vwFMZyCJ4HWU}/_buildManifest.js +1 -1
- sky/dashboard/out/_next/static/chunks/1121-d0782b9251f0fcd3.js +1 -0
- sky/dashboard/out/_next/static/chunks/{3294.03e02ae73455f48e.js → 3294.93d9336bdc032b3a.js} +1 -1
- sky/dashboard/out/_next/static/chunks/6856-5fdc9b851a18acdb.js +1 -0
- sky/dashboard/out/_next/static/chunks/9037-d0c00018a5ba198c.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-ad77b12fc736dca3.js +16 -0
- sky/dashboard/out/_next/static/chunks/{webpack-16ba1d7187d2e3b1.js → webpack-7340bc0f0dd8ae74.js} +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/execution.py +0 -1
- sky/global_user_state.py +57 -34
- sky/jobs/constants.py +2 -0
- sky/jobs/controller.py +4 -0
- sky/jobs/server/core.py +98 -26
- sky/jobs/server/utils.py +65 -32
- sky/jobs/state.py +145 -3
- sky/jobs/utils.py +85 -7
- sky/provision/runpod/__init__.py +2 -0
- sky/schemas/api/responses.py +18 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +70 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +262 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/serve/serve_utils.py +16 -0
- sky/serve/server/core.py +1 -1
- sky/serve/server/impl.py +6 -6
- sky/server/requests/payloads.py +2 -1
- sky/server/requests/serializers/decoders.py +2 -2
- sky/server/requests/serializers/encoders.py +7 -3
- sky/setup_files/dependencies.py +1 -1
- sky/skylet/constants.py +4 -1
- sky/skylet/events.py +42 -0
- sky/skylet/job_lib.py +2 -32
- sky/skylet/log_lib.py +211 -0
- sky/skylet/log_lib.pyi +30 -1
- sky/skylet/services.py +208 -2
- sky/skylet/skylet.py +3 -0
- sky/templates/jobs-controller.yaml.j2 +3 -0
- sky/templates/kubernetes-ray.yml.j2 +8 -3
- sky/utils/db/db_utils.py +5 -1
- sky/utils/db/migration_utils.py +1 -1
- sky/utils/kubernetes/kubernetes_deploy_utils.py +35 -12
- sky/volumes/server/core.py +1 -0
- sky/volumes/volume.py +16 -17
- {skypilot_nightly-1.0.0.dev20250925.dist-info → skypilot_nightly-1.0.0.dev20250927.dist-info}/METADATA +36 -36
- {skypilot_nightly-1.0.0.dev20250925.dist-info → skypilot_nightly-1.0.0.dev20250927.dist-info}/RECORD +74 -69
- sky/dashboard/out/_next/static/chunks/1121-b911fc0a0b4742f0.js +0 -1
- sky/dashboard/out/_next/static/chunks/6856-2b3600ff2854d066.js +0 -1
- sky/dashboard/out/_next/static/chunks/9037-472ee1222cb1e158.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-2cb9b15e09cda628.js +0 -16
- /sky/dashboard/out/_next/static/{bn-NHt5qTzeTN2PefXuDA → UDSEoDB67vwFMZyCJ4HWU}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250925.dist-info → skypilot_nightly-1.0.0.dev20250927.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250925.dist-info → skypilot_nightly-1.0.0.dev20250927.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250925.dist-info → skypilot_nightly-1.0.0.dev20250927.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250925.dist-info → skypilot_nightly-1.0.0.dev20250927.dist-info}/top_level.txt +0 -0
sky/skylet/log_lib.py
CHANGED
@@ -8,11 +8,13 @@ import functools
 import io
 import multiprocessing.pool
 import os
+import queue as queue_lib
 import shlex
 import subprocess
 import sys
 import tempfile
 import textwrap
+import threading
 import time
 from typing import (Deque, Dict, Iterable, Iterator, List, Optional, TextIO,
                     Tuple, Union)
@@ -39,6 +41,11 @@ logger = sky_logging.init_logger(__name__)
 
 LOG_FILE_START_STREAMING_AT = 'Waiting for task resources on '
 
+# 16-64KiB seems to be the sweet spot:
+# https://github.com/grpc/grpc.github.io/issues/371
+# TODO(kevin): Benchmark this ourselves and verify.
+DEFAULT_LOG_CHUNK_SIZE = 16 * 1024  # 16KiB
+
 
 class _ProcessingArgs:
     """Arguments for processing logs."""
@@ -563,3 +570,207 @@ def tail_logs(job_id: Optional[int],
     except FileNotFoundError:
         print(f'{colorama.Fore.RED}ERROR: Logs for job {job_id} (status:'
               f' {status.value}) does not exist.{colorama.Style.RESET_ALL}')
+
+
+def tail_logs_iter(job_id: Optional[int],
+                   log_dir: Optional[str],
+                   managed_job_id: Optional[int] = None,
+                   follow: bool = True,
+                   tail: int = 0) -> Iterator[str]:
+    """Tail the logs of a job. This is mostly the same as tail_logs, but
+    returns an iterator instead of printing to stdout/stderr."""
+    if job_id is None:
+        # This only happens when job_lib.get_latest_job_id() returns None,
+        # which means no job has been submitted to this cluster. See
+        # sky.skylet.job_lib.JobLibCodeGen.tail_logs for more details.
+        logger.info('Skip streaming logs as no job has been submitted.')
+        return
+    job_str = f'job {job_id}'
+    if managed_job_id is not None:
+        job_str = f'managed job {managed_job_id}'
+    if log_dir is None:
+        msg = f'{job_str.capitalize()} not found (see `sky queue`).'
+        yield msg + '\n'
+        return
+    logger.debug(f'Tailing logs for job, real job_id {job_id}, managed_job_id '
+                 f'{managed_job_id}.')
+    log_path = os.path.join(log_dir, 'run.log')
+    log_path = os.path.expanduser(log_path)
+
+    status = job_lib.update_job_status([job_id], silent=True)[0]
+
+    # Wait for the log to be written. This is needed due to the `ray submit`
+    # will take some time to start the job and write the log.
+    retry_cnt = 0
+    while status is not None and not status.is_terminal():
+        retry_cnt += 1
+        if os.path.exists(log_path) and status != job_lib.JobStatus.INIT:
+            break
+        if retry_cnt >= SKY_LOG_WAITING_MAX_RETRY:
+            err = (f'{colorama.Fore.RED}ERROR: Logs for '
+                   f'{job_str} (status: {status.value}) does not exist '
+                   f'after retrying {retry_cnt} times.'
+                   f'{colorama.Style.RESET_ALL}')
+            yield err + '\n'
+            return
+        waiting = (f'INFO: Waiting {SKY_LOG_WAITING_GAP_SECONDS}s for the logs '
+                   'to be written...')
+        yield waiting + '\n'
+        time.sleep(SKY_LOG_WAITING_GAP_SECONDS)
+        status = job_lib.update_job_status([job_id], silent=True)[0]
+
+    start_stream_at = LOG_FILE_START_STREAMING_AT
+    # Explicitly declare the type to avoid mypy warning.
+    lines: Iterable[str] = []
+    if follow and status in [
+            job_lib.JobStatus.SETTING_UP,
+            job_lib.JobStatus.PENDING,
+            job_lib.JobStatus.RUNNING,
+    ]:
+        # Not using `ray job logs` because it will put progress bar in
+        # multiple lines.
+        with open(log_path, 'r', newline='', encoding='utf-8') as log_file:
+            # Using `_follow` instead of `tail -f` to streaming the whole
+            # log and creating a new process for tail.
+            start_streaming = False
+            if tail > 0:
+                head_lines_of_log_file = _peek_head_lines(log_file)
+                lines = collections.deque(log_file, maxlen=tail)
+                start_streaming = _should_stream_the_whole_tail_lines(
+                    head_lines_of_log_file, lines, start_stream_at)
+            for line in lines:
+                if start_stream_at in line:
+                    start_streaming = True
+                if start_streaming:
+                    yield line
+            # Now, the cursor is at the end of the last lines
+            # if tail > 0
+            for line in _follow_job_logs(log_file,
+                                         job_id=job_id,
+                                         start_streaming=start_streaming,
+                                         start_streaming_at=start_stream_at):
+                yield line
+    else:
+        try:
+            start_streaming = False
+            with open(log_path, 'r', encoding='utf-8') as log_file:
+                if tail > 0:
+                    # If tail > 0, we need to read the last n lines.
+                    # We use double ended queue to rotate the last n lines.
+                    head_lines_of_log_file = _peek_head_lines(log_file)
+                    lines = collections.deque(log_file, maxlen=tail)
+                    start_streaming = _should_stream_the_whole_tail_lines(
+                        head_lines_of_log_file, lines, start_stream_at)
+                else:
+                    lines = log_file
+                for line in lines:
+                    if start_stream_at in line:
+                        start_streaming = True
+                    if start_streaming:
+                        yield line
+            status_str = status.value if status is not None else 'None'
+            # Only show "Job finished" for actually terminal states
+            if status is not None and status.is_terminal():
+                finish = ux_utils.finishing_message(
+                    f'Job finished (status: {status_str}).')
+                yield finish + '\n'
+            return
+        except FileNotFoundError:
+            err = (
+                f'{colorama.Fore.RED}ERROR: Logs for job {job_id} (status:'
+                f' {status.value}) does not exist.{colorama.Style.RESET_ALL}')
+            yield err + '\n'
+
+
+class LogBuffer:
+    """In-memory buffer for chunking log lines for streaming."""
+
+    def __init__(self, max_chars: int = DEFAULT_LOG_CHUNK_SIZE):
+        """Initialize the log buffer.
+
+        Args:
+            max_chars: Maximum buffer size (in characters, not bytes) before
+                flushing. The actual amount of bytes (UTF-8 encoding)
+                could be more than this, depending on the characters,
+                i.e. ASCII characters take 1 byte, while others
+                may take 2-4 bytes. But this is fine as our default
+                chunk size is well below the default value of
+                grpc.max_receive_message_length which is 4MB.
+        """
+        self.max_chars = max_chars
+        self._buffer = io.StringIO()
+
+    def _should_flush(self) -> bool:
+        return self._buffer.tell() >= self.max_chars
+
+    def flush(self) -> str:
+        """Get the current buffered content and clear the buffer.
+
+        Returns:
+            The buffered log lines as a single string
+        """
+        if not self._buffer.tell():
+            return ''
+        chunk = self._buffer.getvalue()
+        self._buffer.truncate(0)
+        self._buffer.seek(0)
+        return chunk
+
+    def write(self, line: str) -> bool:
+        """Add a line to the buffer.
+
+        Args:
+            line: The log line to add
+
+        Returns:
+            True if buffer should be flushed after adding the line
+        """
+        self._buffer.write(line)
+        return self._should_flush()
+
+    def close(self):
+        self._buffer.close()
+
+
+def buffered_iter_with_timeout(buffer: LogBuffer, iterable: Iterable[str],
+                               timeout: float) -> Iterable[str]:
+    """Iterates over an iterable, writing each item to a buffer,
+    and flushing the buffer when it is full or no item is
+    yielded within the timeout duration."""
+    # TODO(kevin): Simplify this using asyncio.timeout, once we move
+    # the skylet event loop and gRPC server to asyncio.
+    # https://docs.python.org/3/library/asyncio-task.html#timeouts
+
+    queue: queue_lib.Queue = queue_lib.Queue()
+    sentinel = object()
+
+    def producer():
+        try:
+            for item in iterable:
+                queue.put(item)
+        finally:
+            queue.put(sentinel)
+
+    thread = threading.Thread(target=producer, daemon=True)
+    thread.start()
+
+    while True:
+        try:
+            item = queue.get(timeout=timeout)
+        except queue_lib.Empty:
+            out = buffer.flush()
+            if out:
+                yield out
+            continue
+
+        if item is sentinel:
+            thread.join()
+            out = buffer.flush()
+            if out:
+                yield out
+            return
+
+        if buffer.write(item):
+            out = buffer.flush()
+            if out:
+                yield out
sky/skylet/log_lib.pyi
CHANGED
@@ -4,7 +4,7 @@ overloaded type hints for run_with_log(), as we need to determine
 the return type based on the value of require_outputs.
 """
 import typing
-from typing import Dict, List, Optional, Tuple, Union
+from typing import Dict, Iterable, Iterator, List, Optional, Tuple, Union
 
 from typing_extensions import Literal
 
@@ -143,3 +143,32 @@ def tail_logs(job_id: int,
               managed_job_id: Optional[int] = ...,
               follow: bool = ...) -> None:
     ...
+
+
+def tail_logs_iter(job_id: Optional[int],
+                   log_dir: Optional[str],
+                   managed_job_id: Optional[int] = ...,
+                   follow: bool = ...,
+                   tail: int = ...) -> Iterator[str]:
+    ...
+
+
+class LogBuffer:
+    max_chars: int
+
+    def __init__(self, max_chars: int = ...):
+        ...
+
+    def flush(self) -> str:
+        ...
+
+    def write(self, line: str) -> bool:
+        ...
+
+    def close(self):
+        ...
+
+
+def buffered_iter_with_timeout(buffer: LogBuffer, iterable: Iterable[str],
+                               timeout: float) -> Iterable[str]:
+    ...
sky/skylet/services.py
CHANGED
@@ -1,15 +1,20 @@
 """gRPC service implementations for skylet."""
 
 import os
+from typing import List, Optional
 
 import grpc
 
+from sky import exceptions
 from sky import sky_logging
 from sky.jobs import state as managed_job_state
+from sky.jobs import utils as managed_job_utils
 from sky.schemas.generated import autostopv1_pb2
 from sky.schemas.generated import autostopv1_pb2_grpc
 from sky.schemas.generated import jobsv1_pb2
 from sky.schemas.generated import jobsv1_pb2_grpc
+from sky.schemas.generated import managed_jobsv1_pb2
+from sky.schemas.generated import managed_jobsv1_pb2_grpc
 from sky.schemas.generated import servev1_pb2
 from sky.schemas.generated import servev1_pb2_grpc
 from sky.serve import serve_rpc_utils
@@ -18,9 +23,14 @@ from sky.serve import serve_utils
 from sky.skylet import autostop_lib
 from sky.skylet import constants
 from sky.skylet import job_lib
+from sky.skylet import log_lib
 
 logger = sky_logging.init_logger(__name__)
 
+# In the worst case, flush the log buffer every 50ms,
+# to ensure responsiveness.
+DEFAULT_LOG_CHUNK_FLUSH_INTERVAL = 0.05
+
 
 class AutostopServiceImpl(autostopv1_pb2_grpc.AutostopServiceServicer):
     """Implementation of the AutostopService gRPC service."""
@@ -275,8 +285,39 @@ class JobsServiceImpl(jobsv1_pb2_grpc.JobsServiceServicer):
             self,
             request: jobsv1_pb2.TailLogsRequest,  # type: ignore[return]
             context: grpc.ServicerContext):
-
-
+        buffer = log_lib.LogBuffer()
+        try:
+            job_id = request.job_id if request.HasField(
+                'job_id') else job_lib.get_latest_job_id()
+            managed_job_id = request.managed_job_id if request.HasField(
+                'managed_job_id') else None
+            log_dir = job_lib.get_log_dir_for_job(job_id)
+            if log_dir is None:
+                run_timestamp = job_lib.get_run_timestamp(job_id)
+                log_dir = None if run_timestamp is None else os.path.join(
+                    constants.SKY_LOGS_DIRECTORY, run_timestamp)
+
+            for line in log_lib.buffered_iter_with_timeout(
+                    buffer,
+                    log_lib.tail_logs_iter(job_id, log_dir, managed_job_id,
+                                           request.follow, request.tail),
+                    DEFAULT_LOG_CHUNK_FLUSH_INTERVAL):
+                yield jobsv1_pb2.TailLogsResponse(log_line=line)
+
+            job_status = job_lib.get_status(job_id)
+            exit_code = exceptions.JobExitCode.from_job_status(job_status)
+            # Fix for dashboard: When follow=False and job is still running
+            # (NOT_FINISHED=101), exit with success (0) since fetching current
+            # logs is a successful operation.
+            # This prevents shell wrappers from printing "command terminated
+            # with exit code 101".
+            exit_code_int = 0 if not request.follow and int(
+                exit_code) == 101 else int(exit_code)
+            yield jobsv1_pb2.TailLogsResponse(exit_code=exit_code_int)
+        except Exception as e:  # pylint: disable=broad-except
+            context.abort(grpc.StatusCode.INTERNAL, str(e))
+        finally:
+            buffer.close()
 
     def GetJobStatus(  # type: ignore[return]
             self, request: jobsv1_pb2.GetJobStatusRequest,
@@ -343,3 +384,168 @@ class JobsServiceImpl(jobsv1_pb2_grpc.JobsServiceServicer):
                 job_log_dirs=job_log_dirs)
         except Exception as e:  # pylint: disable=broad-except
             context.abort(grpc.StatusCode.INTERNAL, str(e))
+
+
+class ManagedJobsServiceImpl(managed_jobsv1_pb2_grpc.ManagedJobsServiceServicer
+                            ):
+    """Implementation of the ManagedJobsService gRPC service."""
+
+    def GetVersion(  # type: ignore[return]
+            self, request: managed_jobsv1_pb2.GetVersionRequest,
+            context: grpc.ServicerContext
+    ) -> managed_jobsv1_pb2.GetVersionResponse:
+        try:
+            return managed_jobsv1_pb2.GetVersionResponse(
+                controller_version=constants.SKYLET_VERSION)
+        except Exception as e:  # pylint: disable=broad-except
+            context.abort(grpc.StatusCode.INTERNAL, str(e))
+
+    def GetJobTable(  # type: ignore[return]
+            self, request: managed_jobsv1_pb2.GetJobTableRequest,
+            context: grpc.ServicerContext
+    ) -> managed_jobsv1_pb2.GetJobTableResponse:
+        try:
+            accessible_workspaces = list(request.accessible_workspaces)
+            job_ids = list(request.job_ids.ids) if request.job_ids else None
+            user_hashes: Optional[List[Optional[str]]] = None
+            if request.user_hashes:
+                user_hashes = list(request.user_hashes.hashes)
+                # For backwards compatibility, we show jobs that do not have a
+                # user_hash. TODO: Remove before 0.12.0.
+                if request.show_jobs_without_user_hash:
+                    user_hashes.append(None)
+            statuses = list(
+                request.statuses.statuses) if request.statuses else None
+
+            job_queue = managed_job_utils.get_managed_job_queue(
+                skip_finished=request.skip_finished,
+                accessible_workspaces=accessible_workspaces,
+                job_ids=job_ids,
+                workspace_match=request.workspace_match
+                if request.HasField('workspace_match') else None,
+                name_match=request.name_match
+                if request.HasField('name_match') else None,
+                pool_match=request.pool_match
+                if request.HasField('pool_match') else None,
+                page=request.page if request.HasField('page') else None,
+                limit=request.limit if request.HasField('limit') else None,
+                user_hashes=user_hashes,
+                statuses=statuses)
+            jobs = job_queue['jobs']
+            total = job_queue['total']
+            total_no_filter = job_queue['total_no_filter']
+            status_counts = job_queue['status_counts']
+
+            jobs_info = []
+            for job in jobs:
+                job_info = managed_jobsv1_pb2.ManagedJobInfo(
+                    job_id=job.get('job_id'),
+                    task_id=job.get('task_id'),
+                    job_name=job.get('job_name'),
+                    task_name=job.get('task_name'),
+                    job_duration=job.get('job_duration'),
+                    workspace=job.get('workspace'),
+                    status=managed_job_state.ManagedJobStatus(
+                        job.get('status')).to_protobuf(),
+                    schedule_state=managed_job_state.ManagedJobScheduleState(
+                        job.get('schedule_state')).to_protobuf(),
+                    resources=job.get('resources'),
+                    cluster_resources=job.get('cluster_resources'),
+                    cluster_resources_full=job.get('cluster_resources_full'),
+                    cloud=job.get('cloud'),
+                    region=job.get('region'),
+                    infra=job.get('infra'),
+                    accelerators=job.get('accelerators'),
+                    recovery_count=job.get('recovery_count'),
+                    details=job.get('details'),
+                    failure_reason=job.get('failure_reason'),
+                    user_name=job.get('user_name'),
+                    user_hash=job.get('user_hash'),
+                    submitted_at=job.get('submitted_at'),
+                    start_at=job.get('start_at'),
+                    end_at=job.get('end_at'),
+                    user_yaml=job.get('user_yaml'),
+                    entrypoint=job.get('entrypoint'),
+                    metadata={
+                        k: v
+                        for k, v in job.get('metadata', {}).items()
+                        if v is not None
+                    },
+                    pool=job.get('pool'),
+                    pool_hash=job.get('pool_hash'))
+                jobs_info.append(job_info)
+
+            return managed_jobsv1_pb2.GetJobTableResponse(
+                jobs=jobs_info,
+                total=total,
+                total_no_filter=total_no_filter,
+                status_counts=status_counts)
+        except Exception as e:  # pylint: disable=broad-except
+            context.abort(grpc.StatusCode.INTERNAL, str(e))
+
+    def GetAllJobIdsByName(  # type: ignore[return]
+            self, request: managed_jobsv1_pb2.GetAllJobIdsByNameRequest,
+            context: grpc.ServicerContext
+    ) -> managed_jobsv1_pb2.GetAllJobIdsByNameResponse:
+        try:
+            job_name = request.job_name if request.HasField(
+                'job_name') else None
+            job_ids = managed_job_state.get_all_job_ids_by_name(job_name)
+            return managed_jobsv1_pb2.GetAllJobIdsByNameResponse(
+                job_ids=job_ids)
+        except Exception as e:  # pylint: disable=broad-except
+            context.abort(grpc.StatusCode.INTERNAL, str(e))
+
+    def CancelJobs(  # type: ignore[return]
+            self, request: managed_jobsv1_pb2.CancelJobsRequest,
+            context: grpc.ServicerContext
+    ) -> managed_jobsv1_pb2.CancelJobsResponse:
+        try:
+            cancellation_criteria = request.WhichOneof('cancellation_criteria')
+            if cancellation_criteria is None:
+                context.abort(
+                    grpc.StatusCode.INVALID_ARGUMENT,
+                    'exactly one cancellation criteria must be specified.')
+
+            if cancellation_criteria == 'all_users':
+                user_hash = request.user_hash if request.HasField(
+                    'user_hash') else None
+                all_users = request.all_users
+                if not all_users and user_hash is None:
+                    context.abort(
+                        grpc.StatusCode.INVALID_ARGUMENT,
+                        'user_hash is required when all_users is False')
+                message = managed_job_utils.cancel_jobs_by_id(
+                    job_ids=None,
+                    all_users=all_users,
+                    current_workspace=request.current_workspace,
+                    user_hash=user_hash)
+            elif cancellation_criteria == 'job_ids':
+                job_ids = list(request.job_ids.ids)
+                message = managed_job_utils.cancel_jobs_by_id(
+                    job_ids=job_ids,
+                    current_workspace=request.current_workspace)
+            elif cancellation_criteria == 'job_name':
+                message = managed_job_utils.cancel_job_by_name(
+                    job_name=request.job_name,
+                    current_workspace=request.current_workspace)
+            elif cancellation_criteria == 'pool_name':
+                message = managed_job_utils.cancel_jobs_by_pool(
+                    pool_name=request.pool_name,
+                    current_workspace=request.current_workspace)
+            else:
+                context.abort(
+                    grpc.StatusCode.INVALID_ARGUMENT,
+                    f'invalid cancellation criteria: {cancellation_criteria}')
+            return managed_jobsv1_pb2.CancelJobsResponse(message=message)
+        except Exception as e:  # pylint: disable=broad-except
+            context.abort(grpc.StatusCode.INTERNAL, str(e))
+
+    def StreamLogs(
+            self,
+            request: managed_jobsv1_pb2.
+            StreamLogsRequest,  # type: ignore[return]
+            context: grpc.ServicerContext):
+        # TODO(kevin): implement this
+        context.abort(grpc.StatusCode.UNIMPLEMENTED,
+                      'StreamLogs is not implemented')
sky/skylet/skylet.py
CHANGED
@@ -10,6 +10,7 @@ import sky
 from sky import sky_logging
 from sky.schemas.generated import autostopv1_pb2_grpc
 from sky.schemas.generated import jobsv1_pb2_grpc
+from sky.schemas.generated import managed_jobsv1_pb2_grpc
 from sky.schemas.generated import servev1_pb2_grpc
 from sky.skylet import constants
 from sky.skylet import events
@@ -55,6 +56,8 @@ def start_grpc_server(port: int = constants.SKYLET_GRPC_PORT) -> grpc.Server:
         services.JobsServiceImpl(), server)
     servev1_pb2_grpc.add_ServeServiceServicer_to_server(
         services.ServeServiceImpl(), server)
+    managed_jobsv1_pb2_grpc.add_ManagedJobsServiceServicer_to_server(
+        services.ManagedJobsServiceImpl(), server)
 
     listen_addr = f'127.0.0.1:{port}'
     server.add_insecure_port(listen_addr)
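
With the servicer registered above, the reworked JobsService.TailLogs can be consumed as a server-streaming RPC. A hypothetical consumer sketch (not part of the diff), assuming the conventionally generated JobsServiceStub; per the servicer code earlier, each streamed response carries either a chunk of log text or, as the final message, an exit code:

import grpc

from sky.schemas.generated import jobsv1_pb2
from sky.schemas.generated import jobsv1_pb2_grpc
from sky.skylet import constants

channel = grpc.insecure_channel(f'127.0.0.1:{constants.SKYLET_GRPC_PORT}')
stub = jobsv1_pb2_grpc.JobsServiceStub(channel)

# Fetch the last 100 lines of job 1 without following; job_id, follow and
# tail mirror the request fields read by the servicer.
request = jobsv1_pb2.TailLogsRequest(job_id=1, follow=False, tail=100)
exit_code = 0
for response in stub.TailLogs(request):
    if response.log_line:
        # A chunk of buffered log text (up to ~16KiB per message).
        print(response.log_line, end='')
    else:
        # The final message carries the exit code (0 here even for a
        # still-running job, per the dashboard fix noted above).
        exit_code = response.exit_code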
sky/templates/jobs-controller.yaml.j2
CHANGED
@@ -36,6 +36,9 @@ setup: |
   grep -q 'alias sky-env=' ~/.bashrc || echo 'alias sky-env="{{ sky_activate_python_env }}"' >> ~/.bashrc
   {% endif %}
 
+  # This is used by the skylet events to check if we are a jobs controller.
+  touch {{job_controller_indicator_file}}
+
 run: |
   {%- if consolidation_mode_job_id is none %}
   {{ sky_activate_python_env }}
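
The indicator file touched above is what the template comment refers to: skylet-side code (for example the new logic in sky/skylet/events.py, which is not shown in this diff) can decide whether it is running on a jobs controller by checking for the file. A purely hypothetical sketch of such a check; the path and helper name are assumptions, not the actual implementation:

import os

# Assumed value of {{job_controller_indicator_file}}; the real path is set
# by the template variable and is not shown in this diff.
JOB_CONTROLLER_INDICATOR_FILE = os.path.expanduser(
    '~/.sky/is_jobs_controller')


def is_jobs_controller() -> bool:
    # The setup step above touches the indicator file only on jobs
    # controllers, so its existence identifies the node role.
    return os.path.exists(JOB_CONTROLLER_INDICATOR_FILE)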
sky/templates/kubernetes-ray.yml.j2
CHANGED
@@ -901,15 +901,20 @@ available_node_types:
 {{ conda_installation_commands }}
 {{ ray_installation_commands }}
 
-
+# set UV_SYSTEM_PYTHON to false in case the user provided docker image set it to true.
+# unset PYTHONPATH in case the user provided docker image set it.
+VIRTUAL_ENV=~/skypilot-runtime UV_SYSTEM_PYTHON=false env -u PYTHONPATH ~/.local/bin/uv pip install skypilot[kubernetes,remote]
 # Wait for `patch` package to be installed before applying ray patches
 until dpkg -l | grep -q "^ii patch "; do
   sleep 0.1
   echo "Waiting for patch package to be installed..."
 done
 # Apply Ray patches for progress bar fix
-
-
+# set UV_SYSTEM_PYTHON to false in case the user provided docker image set it to true.
+# unset PYTHONPATH in case the user provided docker image set it.
+# ~/.sky/python_path is seeded by conda_installation_commands
+VIRTUAL_ENV=~/skypilot-runtime UV_SYSTEM_PYTHON=false env -u PYTHONPATH ~/.local/bin/uv pip list | grep "ray " | grep 2.9.3 2>&1 > /dev/null && {
+  $(cat ~/.sky/python_path) -c "from sky.skylet.ray_patches import patch; patch()" || exit 1;
 }
 touch /tmp/ray_skypilot_installation_complete
 echo "=== Ray and skypilot installation completed ==="
sky/utils/db/db_utils.py
CHANGED
@@ -201,6 +201,7 @@ def add_column_to_table_alembic(
     server_default: Optional[str] = None,
     copy_from: Optional[str] = None,
     value_to_replace_existing_entries: Optional[Any] = None,
+    index: Optional[bool] = None,
 ):
     """Add a column to a table using Alembic operations.
 
@@ -215,6 +216,8 @@ def add_column_to_table_alembic(
         copy_from: Column name to copy values from (for existing rows)
         value_to_replace_existing_entries: Default value for existing NULL
             entries
+        index: If True, create an index on this column. If None, no index
+            is created.
     """
     from alembic import op  # pylint: disable=import-outside-toplevel
 
@@ -222,7 +225,8 @@ def add_column_to_table_alembic(
     # Create the column with server_default if provided
     column = sqlalchemy.Column(column_name,
                                column_type,
-                               server_default=server_default)
+                               server_default=server_default,
+                               index=index)
     op.add_column(table_name, column)
 
     # Handle data migration
sky/utils/db/migration_utils.py
CHANGED
@@ -17,7 +17,7 @@ logger = sky_logging.init_logger(__name__)
 DB_INIT_LOCK_TIMEOUT_SECONDS = 10
 
 GLOBAL_USER_STATE_DB_NAME = 'state_db'
-GLOBAL_USER_STATE_VERSION = '
+GLOBAL_USER_STATE_VERSION = '009'
 GLOBAL_USER_STATE_LOCK_PATH = '~/.sky/locks/.state_db.lock'
 
 SPOT_JOBS_DB_NAME = 'spot_jobs_db'