skypilot-nightly 1.0.0.dev20250926__py3-none-any.whl → 1.0.0.dev20250927__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic. Click here for more details.
- sky/__init__.py +2 -2
- sky/backends/backend_utils.py +25 -4
- sky/backends/cloud_vm_ray_backend.py +151 -36
- sky/client/cli/command.py +2 -1
- sky/client/cli/table_utils.py +34 -0
- sky/client/sdk.py +7 -5
- sky/client/sdk_async.py +5 -5
- sky/core.py +3 -4
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/{VXU6_xE28M55BOdwmUUJS → UDSEoDB67vwFMZyCJ4HWU}/_buildManifest.js +1 -1
- sky/dashboard/out/_next/static/chunks/{3294.03e02ae73455f48e.js → 3294.93d9336bdc032b3a.js} +1 -1
- sky/dashboard/out/_next/static/chunks/6856-5fdc9b851a18acdb.js +1 -0
- sky/dashboard/out/_next/static/chunks/{webpack-8e64d11e58eab5cb.js → webpack-7340bc0f0dd8ae74.js} +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/execution.py +0 -1
- sky/global_user_state.py +3 -3
- sky/jobs/server/core.py +96 -26
- sky/jobs/server/utils.py +65 -32
- sky/jobs/state.py +145 -3
- sky/jobs/utils.py +85 -7
- sky/schemas/api/responses.py +18 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +70 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +262 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/serve/serve_utils.py +16 -0
- sky/serve/server/core.py +1 -1
- sky/serve/server/impl.py +6 -6
- sky/server/requests/serializers/decoders.py +2 -2
- sky/server/requests/serializers/encoders.py +7 -3
- sky/skylet/constants.py +1 -1
- sky/skylet/job_lib.py +2 -32
- sky/skylet/log_lib.py +211 -0
- sky/skylet/log_lib.pyi +30 -1
- sky/skylet/services.py +208 -2
- sky/skylet/skylet.py +3 -0
- {skypilot_nightly-1.0.0.dev20250926.dist-info → skypilot_nightly-1.0.0.dev20250927.dist-info}/METADATA +32 -32
- {skypilot_nightly-1.0.0.dev20250926.dist-info → skypilot_nightly-1.0.0.dev20250927.dist-info}/RECORD +56 -52
- sky/dashboard/out/_next/static/chunks/6856-2b3600ff2854d066.js +0 -1
- /sky/dashboard/out/_next/static/{VXU6_xE28M55BOdwmUUJS → UDSEoDB67vwFMZyCJ4HWU}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250926.dist-info → skypilot_nightly-1.0.0.dev20250927.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250926.dist-info → skypilot_nightly-1.0.0.dev20250927.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250926.dist-info → skypilot_nightly-1.0.0.dev20250927.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250926.dist-info → skypilot_nightly-1.0.0.dev20250927.dist-info}/top_level.txt +0 -0
sky/skylet/log_lib.py
CHANGED
|
@@ -8,11 +8,13 @@ import functools
|
|
|
8
8
|
import io
|
|
9
9
|
import multiprocessing.pool
|
|
10
10
|
import os
|
|
11
|
+
import queue as queue_lib
|
|
11
12
|
import shlex
|
|
12
13
|
import subprocess
|
|
13
14
|
import sys
|
|
14
15
|
import tempfile
|
|
15
16
|
import textwrap
|
|
17
|
+
import threading
|
|
16
18
|
import time
|
|
17
19
|
from typing import (Deque, Dict, Iterable, Iterator, List, Optional, TextIO,
|
|
18
20
|
Tuple, Union)
|
|
@@ -39,6 +41,11 @@ logger = sky_logging.init_logger(__name__)
|
|
|
39
41
|
|
|
40
42
|
LOG_FILE_START_STREAMING_AT = 'Waiting for task resources on '
|
|
41
43
|
|
|
44
|
+
# 16-64KiB seems to be the sweet spot:
|
|
45
|
+
# https://github.com/grpc/grpc.github.io/issues/371
|
|
46
|
+
# TODO(kevin): Benchmark this ourselves and verify.
|
|
47
|
+
DEFAULT_LOG_CHUNK_SIZE = 16 * 1024 # 16KiB
|
|
48
|
+
|
|
42
49
|
|
|
43
50
|
class _ProcessingArgs:
|
|
44
51
|
"""Arguments for processing logs."""
|
|
@@ -563,3 +570,207 @@ def tail_logs(job_id: Optional[int],
|
|
|
563
570
|
except FileNotFoundError:
|
|
564
571
|
print(f'{colorama.Fore.RED}ERROR: Logs for job {job_id} (status:'
|
|
565
572
|
f' {status.value}) does not exist.{colorama.Style.RESET_ALL}')
|
|
573
|
+
|
|
574
|
+
|
|
575
|
+
def tail_logs_iter(job_id: Optional[int],
                   log_dir: Optional[str],
                   managed_job_id: Optional[int] = None,
                   follow: bool = True,
                   tail: int = 0) -> Iterator[str]:
    """Tail the logs of a job, yielding the output instead of printing it.

    This is mostly the same as tail_logs, but returns an iterator instead of
    printing to stdout/stderr.

    Args:
        job_id: The job whose `run.log` is read. If None, nothing is yielded.
        log_dir: Directory containing `run.log`. If None, a single
            "not found" message is yielded.
        managed_job_id: If set, used instead of job_id when naming the job in
            the "not found" and "retries exhausted" messages.
        follow: Whether to keep streaming new output while the job is in the
            SETTING_UP, PENDING or RUNNING state.
        tail: If positive, only the last `tail` lines of the existing log are
            considered before (optionally) following the file.

    Yields:
        Log lines as read from the file, and informational/error messages
        terminated with a newline.
    """
    if job_id is None:
        # This only happens when job_lib.get_latest_job_id() returns None,
        # which means no job has been submitted to this cluster. See
        # sky.skylet.job_lib.JobLibCodeGen.tail_logs for more details.
        logger.info('Skip streaming logs as no job has been submitted.')
        return
    job_str = f'job {job_id}'
    if managed_job_id is not None:
        job_str = f'managed job {managed_job_id}'
    if log_dir is None:
        msg = f'{job_str.capitalize()} not found (see `sky queue`).'
        yield msg + '\n'
        return
    logger.debug(f'Tailing logs for job, real job_id {job_id}, managed_job_id '
                 f'{managed_job_id}.')
    log_path = os.path.join(log_dir, 'run.log')
    log_path = os.path.expanduser(log_path)

    status = job_lib.update_job_status([job_id], silent=True)[0]

    # Wait for the log to be written. This is needed due to the `ray submit`
    # will take some time to start the job and write the log.
    retry_cnt = 0
    while status is not None and not status.is_terminal():
        retry_cnt += 1
        if os.path.exists(log_path) and status != job_lib.JobStatus.INIT:
            break
        if retry_cnt >= SKY_LOG_WAITING_MAX_RETRY:
            err = (f'{colorama.Fore.RED}ERROR: Logs for '
                   f'{job_str} (status: {status.value}) does not exist '
                   f'after retrying {retry_cnt} times.'
                   f'{colorama.Style.RESET_ALL}')
            yield err + '\n'
            return
        waiting = (f'INFO: Waiting {SKY_LOG_WAITING_GAP_SECONDS}s for the logs '
                   'to be written...')
        yield waiting + '\n'
        time.sleep(SKY_LOG_WAITING_GAP_SECONDS)
        status = job_lib.update_job_status([job_id], silent=True)[0]

    # `status` may be None at this point (the loop above exits on None), so
    # compute a printable form once and use it in every message below rather
    # than dereferencing `status.value` directly.
    status_str = status.value if status is not None else 'None'

    start_stream_at = LOG_FILE_START_STREAMING_AT
    # Explicitly declare the type to avoid mypy warning.
    lines: Iterable[str] = []
    if follow and status in [
            job_lib.JobStatus.SETTING_UP,
            job_lib.JobStatus.PENDING,
            job_lib.JobStatus.RUNNING,
    ]:
        # Not using `ray job logs` because it will put progress bar in
        # multiple lines.
        with open(log_path, 'r', newline='', encoding='utf-8') as log_file:
            # Using `_follow` instead of `tail -f` to streaming the whole
            # log and creating a new process for tail.
            start_streaming = False
            if tail > 0:
                head_lines_of_log_file = _peek_head_lines(log_file)
                lines = collections.deque(log_file, maxlen=tail)
                start_streaming = _should_stream_the_whole_tail_lines(
                    head_lines_of_log_file, lines, start_stream_at)
            for line in lines:
                if start_stream_at in line:
                    start_streaming = True
                if start_streaming:
                    yield line
            # Now, the cursor is at the end of the last lines
            # if tail > 0
            yield from _follow_job_logs(log_file,
                                        job_id=job_id,
                                        start_streaming=start_streaming,
                                        start_streaming_at=start_stream_at)
    else:
        try:
            start_streaming = False
            with open(log_path, 'r', encoding='utf-8') as log_file:
                if tail > 0:
                    # If tail > 0, we need to read the last n lines.
                    # We use double ended queue to rotate the last n lines.
                    head_lines_of_log_file = _peek_head_lines(log_file)
                    lines = collections.deque(log_file, maxlen=tail)
                    start_streaming = _should_stream_the_whole_tail_lines(
                        head_lines_of_log_file, lines, start_stream_at)
                else:
                    lines = log_file
                for line in lines:
                    if start_stream_at in line:
                        start_streaming = True
                    if start_streaming:
                        yield line
            # Only show "Job finished" for actually terminal states
            if status is not None and status.is_terminal():
                finish = ux_utils.finishing_message(
                    f'Job finished (status: {status_str}).')
                yield finish + '\n'
            return
        except FileNotFoundError:
            # Use status_str (not status.value): status can be None here, and
            # the error report itself must not fail with AttributeError.
            err = (
                f'{colorama.Fore.RED}ERROR: Logs for job {job_id} (status:'
                f' {status_str}) does not exist.{colorama.Style.RESET_ALL}')
            yield err + '\n'
|
|
683
|
+
|
|
684
|
+
|
|
685
|
+
class LogBuffer:
    """Accumulates log lines in memory so they can be streamed in chunks."""

    def __init__(self, max_chars: int = DEFAULT_LOG_CHUNK_SIZE):
        """Create an empty buffer.

        Args:
            max_chars: Flush threshold, counted in characters (not bytes).
                Once UTF-8 encoded, a chunk may be larger than this, since
                ASCII characters take 1 byte while others may take 2-4
                bytes. That is acceptable because the default chunk size is
                well below the default value of
                grpc.max_receive_message_length, which is 4MB.
        """
        self._buffer = io.StringIO()
        self.max_chars = max_chars

    def _should_flush(self) -> bool:
        # The stream position equals the number of buffered characters,
        # because the buffer is only ever appended to or fully reset.
        buffered_chars = self._buffer.tell()
        return buffered_chars >= self.max_chars

    def flush(self) -> str:
        """Return everything buffered so far and reset the buffer.

        Returns:
            The buffered log lines joined into one string; '' if nothing is
            buffered.
        """
        chunk = self._buffer.getvalue()
        if chunk:
            # Rewind and drop the contents so the next write starts at 0.
            self._buffer.seek(0)
            self._buffer.truncate(0)
        return chunk

    def write(self, line: str) -> bool:
        """Append a log line to the buffer.

        Args:
            line: The log line to append.

        Returns:
            True if the buffer has reached max_chars and should be flushed.
        """
        self._buffer.write(line)
        return self._should_flush()

    def close(self):
        """Close the underlying in-memory stream."""
        self._buffer.close()
|
|
733
|
+
|
|
734
|
+
|
|
735
|
+
def buffered_iter_with_timeout(buffer: 'LogBuffer', iterable: Iterable[str],
                               timeout: float) -> Iterable[str]:
    """Iterates over an iterable, writing each item to a buffer,
    and flushing the buffer when it is full or no item is
    yielded within the timeout duration.

    Args:
        buffer: Object providing `write(item) -> bool` (True when a flush is
            due) and `flush() -> str`.
        iterable: Source of items; it is consumed in a background thread.
        timeout: Seconds to wait for the next item before flushing whatever
            has been buffered so far.

    Yields:
        Non-empty chunks returned by `buffer.flush()`.

    Raises:
        Exception: Any exception raised while iterating `iterable` is
            re-raised here after the remaining buffered content has been
            yielded, so that a failed stream is not mistaken for a
            successful end of stream.
    """
    # TODO(kevin): Simplify this using asyncio.timeout, once we move
    # the skylet event loop and gRPC server to asyncio.
    # https://docs.python.org/3/library/asyncio-task.html#timeouts

    queue: queue_lib.Queue = queue_lib.Queue()
    sentinel = object()
    # Set when the consumer stops iterating, so the producer stops filling
    # the (unbounded) queue with items nobody will read.
    consumer_done = threading.Event()
    # Holds at most one exception raised by the wrapped iterable.
    producer_errors: List[Exception] = []

    def producer():
        try:
            for item in iterable:
                if consumer_done.is_set():
                    break
                queue.put(item)
        except Exception as e:  # pylint: disable=broad-except
            # Hand the failure to the consumer instead of letting it die
            # with this thread.
            producer_errors.append(e)
        finally:
            queue.put(sentinel)

    thread = threading.Thread(target=producer, daemon=True)
    thread.start()

    try:
        while True:
            try:
                item = queue.get(timeout=timeout)
            except queue_lib.Empty:
                # Nothing arrived within the timeout: emit what we have so
                # the reader stays responsive.
                out = buffer.flush()
                if out:
                    yield out
                continue

            if item is sentinel:
                thread.join()
                out = buffer.flush()
                if out:
                    yield out
                if producer_errors:
                    raise producer_errors[0]
                return

            if buffer.write(item):
                out = buffer.flush()
                if out:
                    yield out
    finally:
        consumer_done.set()
|
sky/skylet/log_lib.pyi
CHANGED
|
@@ -4,7 +4,7 @@ overloaded type hints for run_with_log(), as we need to determine
|
|
|
4
4
|
the return type based on the value of require_outputs.
|
|
5
5
|
"""
|
|
6
6
|
import typing
|
|
7
|
-
from typing import Dict, List, Optional, Tuple, Union
|
|
7
|
+
from typing import Dict, Iterable, Iterator, List, Optional, Tuple, Union
|
|
8
8
|
|
|
9
9
|
from typing_extensions import Literal
|
|
10
10
|
|
|
@@ -143,3 +143,32 @@ def tail_logs(job_id: int,
|
|
|
143
143
|
managed_job_id: Optional[int] = ...,
|
|
144
144
|
follow: bool = ...) -> None:
|
|
145
145
|
...
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
# Generator counterpart of tail_logs(): yields the log output as strings.
def tail_logs_iter(job_id: Optional[int],
                   log_dir: Optional[str],
                   managed_job_id: Optional[int] = ...,
                   follow: bool = ...,
                   tail: int = ...) -> Iterator[str]:
    ...
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
# In-memory buffer used to chunk log lines for streaming.
class LogBuffer:
    # Flush threshold, in characters.
    max_chars: int

    def __init__(self, max_chars: int = ...):
        ...

    # Returns the buffered content and clears the buffer.
    def flush(self) -> str:
        ...

    # Returns True when the buffer should be flushed after this write.
    def write(self, line: str) -> bool:
        ...

    def close(self):
        ...
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
# Streams `iterable` through `buffer`, yielding flushed chunks; a flush is
# also triggered when no item arrives within `timeout`.
def buffered_iter_with_timeout(buffer: LogBuffer, iterable: Iterable[str],
                               timeout: float) -> Iterable[str]:
    ...
|
sky/skylet/services.py
CHANGED
|
@@ -1,15 +1,20 @@
|
|
|
1
1
|
"""gRPC service implementations for skylet."""
|
|
2
2
|
|
|
3
3
|
import os
|
|
4
|
+
from typing import List, Optional
|
|
4
5
|
|
|
5
6
|
import grpc
|
|
6
7
|
|
|
8
|
+
from sky import exceptions
|
|
7
9
|
from sky import sky_logging
|
|
8
10
|
from sky.jobs import state as managed_job_state
|
|
11
|
+
from sky.jobs import utils as managed_job_utils
|
|
9
12
|
from sky.schemas.generated import autostopv1_pb2
|
|
10
13
|
from sky.schemas.generated import autostopv1_pb2_grpc
|
|
11
14
|
from sky.schemas.generated import jobsv1_pb2
|
|
12
15
|
from sky.schemas.generated import jobsv1_pb2_grpc
|
|
16
|
+
from sky.schemas.generated import managed_jobsv1_pb2
|
|
17
|
+
from sky.schemas.generated import managed_jobsv1_pb2_grpc
|
|
13
18
|
from sky.schemas.generated import servev1_pb2
|
|
14
19
|
from sky.schemas.generated import servev1_pb2_grpc
|
|
15
20
|
from sky.serve import serve_rpc_utils
|
|
@@ -18,9 +23,14 @@ from sky.serve import serve_utils
|
|
|
18
23
|
from sky.skylet import autostop_lib
|
|
19
24
|
from sky.skylet import constants
|
|
20
25
|
from sky.skylet import job_lib
|
|
26
|
+
from sky.skylet import log_lib
|
|
21
27
|
|
|
22
28
|
logger = sky_logging.init_logger(__name__)
|
|
23
29
|
|
|
30
|
+
# In the worst case, flush the log buffer every 50ms,
|
|
31
|
+
# to ensure responsiveness.
|
|
32
|
+
DEFAULT_LOG_CHUNK_FLUSH_INTERVAL = 0.05
|
|
33
|
+
|
|
24
34
|
|
|
25
35
|
class AutostopServiceImpl(autostopv1_pb2_grpc.AutostopServiceServicer):
|
|
26
36
|
"""Implementation of the AutostopService gRPC service."""
|
|
@@ -275,8 +285,39 @@ class JobsServiceImpl(jobsv1_pb2_grpc.JobsServiceServicer):
|
|
|
275
285
|
self,
|
|
276
286
|
request: jobsv1_pb2.TailLogsRequest, # type: ignore[return]
|
|
277
287
|
context: grpc.ServicerContext):
|
|
278
|
-
|
|
279
|
-
|
|
288
|
+
buffer = log_lib.LogBuffer()
|
|
289
|
+
try:
|
|
290
|
+
job_id = request.job_id if request.HasField(
|
|
291
|
+
'job_id') else job_lib.get_latest_job_id()
|
|
292
|
+
managed_job_id = request.managed_job_id if request.HasField(
|
|
293
|
+
'managed_job_id') else None
|
|
294
|
+
log_dir = job_lib.get_log_dir_for_job(job_id)
|
|
295
|
+
if log_dir is None:
|
|
296
|
+
run_timestamp = job_lib.get_run_timestamp(job_id)
|
|
297
|
+
log_dir = None if run_timestamp is None else os.path.join(
|
|
298
|
+
constants.SKY_LOGS_DIRECTORY, run_timestamp)
|
|
299
|
+
|
|
300
|
+
for line in log_lib.buffered_iter_with_timeout(
|
|
301
|
+
buffer,
|
|
302
|
+
log_lib.tail_logs_iter(job_id, log_dir, managed_job_id,
|
|
303
|
+
request.follow, request.tail),
|
|
304
|
+
DEFAULT_LOG_CHUNK_FLUSH_INTERVAL):
|
|
305
|
+
yield jobsv1_pb2.TailLogsResponse(log_line=line)
|
|
306
|
+
|
|
307
|
+
job_status = job_lib.get_status(job_id)
|
|
308
|
+
exit_code = exceptions.JobExitCode.from_job_status(job_status)
|
|
309
|
+
# Fix for dashboard: When follow=False and job is still running
|
|
310
|
+
# (NOT_FINISHED=101), exit with success (0) since fetching current
|
|
311
|
+
# logs is a successful operation.
|
|
312
|
+
# This prevents shell wrappers from printing "command terminated
|
|
313
|
+
# with exit code 101".
|
|
314
|
+
exit_code_int = 0 if not request.follow and int(
|
|
315
|
+
exit_code) == 101 else int(exit_code)
|
|
316
|
+
yield jobsv1_pb2.TailLogsResponse(exit_code=exit_code_int)
|
|
317
|
+
except Exception as e: # pylint: disable=broad-except
|
|
318
|
+
context.abort(grpc.StatusCode.INTERNAL, str(e))
|
|
319
|
+
finally:
|
|
320
|
+
buffer.close()
|
|
280
321
|
|
|
281
322
|
def GetJobStatus( # type: ignore[return]
|
|
282
323
|
self, request: jobsv1_pb2.GetJobStatusRequest,
|
|
@@ -343,3 +384,168 @@ class JobsServiceImpl(jobsv1_pb2_grpc.JobsServiceServicer):
|
|
|
343
384
|
job_log_dirs=job_log_dirs)
|
|
344
385
|
except Exception as e: # pylint: disable=broad-except
|
|
345
386
|
context.abort(grpc.StatusCode.INTERNAL, str(e))
|
|
387
|
+
|
|
388
|
+
|
|
389
|
+
class ManagedJobsServiceImpl(managed_jobsv1_pb2_grpc.ManagedJobsServiceServicer
                            ):
    """Implementation of the ManagedJobsService gRPC service.

    Every handler reports unexpected failures by aborting the RPC with
    grpc.StatusCode.INTERNAL and the stringified exception as details.
    """

    def GetVersion(  # type: ignore[return]
            self, request: managed_jobsv1_pb2.GetVersionRequest,
            context: grpc.ServicerContext
    ) -> managed_jobsv1_pb2.GetVersionResponse:
        """Returns constants.SKYLET_VERSION as the controller version."""
        try:
            return managed_jobsv1_pb2.GetVersionResponse(
                controller_version=constants.SKYLET_VERSION)
        except Exception as e:  # pylint: disable=broad-except
            context.abort(grpc.StatusCode.INTERNAL, str(e))

    def GetJobTable(  # type: ignore[return]
            self, request: managed_jobsv1_pb2.GetJobTableRequest,
            context: grpc.ServicerContext
    ) -> managed_jobsv1_pb2.GetJobTableResponse:
        """Queries the managed job queue and converts it to protobuf.

        Optional request fields that are not set are forwarded to
        managed_job_utils.get_managed_job_queue as None.
        """
        try:
            accessible_workspaces = list(request.accessible_workspaces)
            # job_ids / user_hashes / statuses are sub-messages (they wrap a
            # repeated field). Presence is checked with HasField, like the
            # other optional fields below, so that an unset field is passed
            # on as None rather than as an empty list.
            # NOTE(review): assumes an unset filter should mean "no filter";
            # confirm against get_managed_job_queue.
            job_ids = (list(request.job_ids.ids)
                       if request.HasField('job_ids') else None)
            user_hashes: Optional[List[Optional[str]]] = None
            if request.HasField('user_hashes'):
                user_hashes = list(request.user_hashes.hashes)
                # For backwards compatibility, we show jobs that do not have a
                # user_hash. TODO: Remove before 0.12.0.
                if request.show_jobs_without_user_hash:
                    user_hashes.append(None)
            statuses = (list(request.statuses.statuses)
                        if request.HasField('statuses') else None)

            job_queue = managed_job_utils.get_managed_job_queue(
                skip_finished=request.skip_finished,
                accessible_workspaces=accessible_workspaces,
                job_ids=job_ids,
                workspace_match=request.workspace_match
                if request.HasField('workspace_match') else None,
                name_match=request.name_match
                if request.HasField('name_match') else None,
                pool_match=request.pool_match
                if request.HasField('pool_match') else None,
                page=request.page if request.HasField('page') else None,
                limit=request.limit if request.HasField('limit') else None,
                user_hashes=user_hashes,
                statuses=statuses)
            jobs = job_queue['jobs']
            total = job_queue['total']
            total_no_filter = job_queue['total_no_filter']
            status_counts = job_queue['status_counts']

            jobs_info = []
            for job in jobs:
                # Tolerate a job record whose 'metadata' is present but None.
                metadata = job.get('metadata') or {}
                job_info = managed_jobsv1_pb2.ManagedJobInfo(
                    job_id=job.get('job_id'),
                    task_id=job.get('task_id'),
                    job_name=job.get('job_name'),
                    task_name=job.get('task_name'),
                    job_duration=job.get('job_duration'),
                    workspace=job.get('workspace'),
                    status=managed_job_state.ManagedJobStatus(
                        job.get('status')).to_protobuf(),
                    schedule_state=managed_job_state.ManagedJobScheduleState(
                        job.get('schedule_state')).to_protobuf(),
                    resources=job.get('resources'),
                    cluster_resources=job.get('cluster_resources'),
                    cluster_resources_full=job.get('cluster_resources_full'),
                    cloud=job.get('cloud'),
                    region=job.get('region'),
                    infra=job.get('infra'),
                    accelerators=job.get('accelerators'),
                    recovery_count=job.get('recovery_count'),
                    details=job.get('details'),
                    failure_reason=job.get('failure_reason'),
                    user_name=job.get('user_name'),
                    user_hash=job.get('user_hash'),
                    submitted_at=job.get('submitted_at'),
                    start_at=job.get('start_at'),
                    end_at=job.get('end_at'),
                    user_yaml=job.get('user_yaml'),
                    entrypoint=job.get('entrypoint'),
                    # Entries whose value is None are dropped.
                    metadata={
                        k: v for k, v in metadata.items() if v is not None
                    },
                    pool=job.get('pool'),
                    pool_hash=job.get('pool_hash'))
                jobs_info.append(job_info)

            return managed_jobsv1_pb2.GetJobTableResponse(
                jobs=jobs_info,
                total=total,
                total_no_filter=total_no_filter,
                status_counts=status_counts)
        except Exception as e:  # pylint: disable=broad-except
            context.abort(grpc.StatusCode.INTERNAL, str(e))

    def GetAllJobIdsByName(  # type: ignore[return]
            self, request: managed_jobsv1_pb2.GetAllJobIdsByNameRequest,
            context: grpc.ServicerContext
    ) -> managed_jobsv1_pb2.GetAllJobIdsByNameResponse:
        """Returns the job ids from get_all_job_ids_by_name.

        The lookup receives None when the request has no job_name.
        """
        try:
            job_name = request.job_name if request.HasField(
                'job_name') else None
            job_ids = managed_job_state.get_all_job_ids_by_name(job_name)
            return managed_jobsv1_pb2.GetAllJobIdsByNameResponse(
                job_ids=job_ids)
        except Exception as e:  # pylint: disable=broad-except
            context.abort(grpc.StatusCode.INTERNAL, str(e))

    def CancelJobs(  # type: ignore[return]
            self, request: managed_jobsv1_pb2.CancelJobsRequest,
            context: grpc.ServicerContext
    ) -> managed_jobsv1_pb2.CancelJobsResponse:
        """Cancels managed jobs selected by the request's oneof criteria.

        Malformed requests abort with INVALID_ARGUMENT; failures while
        cancelling abort with INTERNAL.
        """
        # context.abort() terminates the RPC by raising an exception. If it
        # were called inside the `try` below, the broad `except` would catch
        # that exception and abort again as INTERNAL, hiding the
        # INVALID_ARGUMENT status from the client. So validation problems
        # are only recorded inside the `try` and reported after it.
        invalid_argument: Optional[str] = None
        try:
            cancellation_criteria = request.WhichOneof('cancellation_criteria')
            if cancellation_criteria is None:
                invalid_argument = (
                    'exactly one cancellation criteria must be specified.')
            elif cancellation_criteria == 'all_users':
                user_hash = request.user_hash if request.HasField(
                    'user_hash') else None
                all_users = request.all_users
                if not all_users and user_hash is None:
                    invalid_argument = (
                        'user_hash is required when all_users is False')
                else:
                    message = managed_job_utils.cancel_jobs_by_id(
                        job_ids=None,
                        all_users=all_users,
                        current_workspace=request.current_workspace,
                        user_hash=user_hash)
            elif cancellation_criteria == 'job_ids':
                job_ids = list(request.job_ids.ids)
                message = managed_job_utils.cancel_jobs_by_id(
                    job_ids=job_ids,
                    current_workspace=request.current_workspace)
            elif cancellation_criteria == 'job_name':
                message = managed_job_utils.cancel_job_by_name(
                    job_name=request.job_name,
                    current_workspace=request.current_workspace)
            elif cancellation_criteria == 'pool_name':
                message = managed_job_utils.cancel_jobs_by_pool(
                    pool_name=request.pool_name,
                    current_workspace=request.current_workspace)
            else:
                invalid_argument = (
                    f'invalid cancellation criteria: {cancellation_criteria}')
            if invalid_argument is None:
                return managed_jobsv1_pb2.CancelJobsResponse(message=message)
        except Exception as e:  # pylint: disable=broad-except
            context.abort(grpc.StatusCode.INTERNAL, str(e))
        # Only reached when the request failed validation above.
        context.abort(grpc.StatusCode.INVALID_ARGUMENT, invalid_argument)

    def StreamLogs(
            self,
            request: managed_jobsv1_pb2.
        StreamLogsRequest,  # type: ignore[return]
            context: grpc.ServicerContext):
        """Not implemented yet; always aborts with UNIMPLEMENTED."""
        # TODO(kevin): implement this
        context.abort(grpc.StatusCode.UNIMPLEMENTED,
                      'StreamLogs is not implemented')
|
sky/skylet/skylet.py
CHANGED
|
@@ -10,6 +10,7 @@ import sky
|
|
|
10
10
|
from sky import sky_logging
|
|
11
11
|
from sky.schemas.generated import autostopv1_pb2_grpc
|
|
12
12
|
from sky.schemas.generated import jobsv1_pb2_grpc
|
|
13
|
+
from sky.schemas.generated import managed_jobsv1_pb2_grpc
|
|
13
14
|
from sky.schemas.generated import servev1_pb2_grpc
|
|
14
15
|
from sky.skylet import constants
|
|
15
16
|
from sky.skylet import events
|
|
@@ -55,6 +56,8 @@ def start_grpc_server(port: int = constants.SKYLET_GRPC_PORT) -> grpc.Server:
|
|
|
55
56
|
services.JobsServiceImpl(), server)
|
|
56
57
|
servev1_pb2_grpc.add_ServeServiceServicer_to_server(
|
|
57
58
|
services.ServeServiceImpl(), server)
|
|
59
|
+
managed_jobsv1_pb2_grpc.add_ManagedJobsServiceServicer_to_server(
|
|
60
|
+
services.ManagedJobsServiceImpl(), server)
|
|
58
61
|
|
|
59
62
|
listen_addr = f'127.0.0.1:{port}'
|
|
60
63
|
server.add_insecure_port(listen_addr)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: skypilot-nightly
|
|
3
|
-
Version: 1.0.0.dev20250926
|
|
3
|
+
Version: 1.0.0.dev20250927
|
|
4
4
|
Summary: SkyPilot: Run AI on Any Infra — Unified, Faster, Cheaper.
|
|
5
5
|
Author: SkyPilot Team
|
|
6
6
|
License: Apache 2.0
|
|
@@ -154,51 +154,51 @@ Requires-Dist: protobuf<7.0.0,>=5.26.1; extra == "server"
|
|
|
154
154
|
Requires-Dist: aiosqlite; extra == "server"
|
|
155
155
|
Requires-Dist: greenlet; extra == "server"
|
|
156
156
|
Provides-Extra: all
|
|
157
|
-
Requires-Dist: pyopenssl<24.3.0,>=23.2.0; extra == "all"
|
|
158
157
|
Requires-Dist: azure-core>=1.31.0; extra == "all"
|
|
159
|
-
Requires-Dist:
|
|
158
|
+
Requires-Dist: runpod>=1.6.1; extra == "all"
|
|
160
159
|
Requires-Dist: ibm-cos-sdk; extra == "all"
|
|
161
|
-
Requires-Dist:
|
|
162
|
-
Requires-Dist:
|
|
163
|
-
Requires-Dist:
|
|
164
|
-
Requires-Dist: azure-cli>=2.65.0; extra == "all"
|
|
160
|
+
Requires-Dist: botocore>=1.29.10; extra == "all"
|
|
161
|
+
Requires-Dist: msgraph-sdk; extra == "all"
|
|
162
|
+
Requires-Dist: vastai-sdk>=0.1.12; extra == "all"
|
|
165
163
|
Requires-Dist: google-api-python-client>=2.69.0; extra == "all"
|
|
164
|
+
Requires-Dist: websockets; extra == "all"
|
|
165
|
+
Requires-Dist: azure-identity>=1.19.0; extra == "all"
|
|
166
|
+
Requires-Dist: sqlalchemy_adapter; extra == "all"
|
|
166
167
|
Requires-Dist: boto3>=1.26.1; extra == "all"
|
|
167
|
-
Requires-Dist:
|
|
168
|
+
Requires-Dist: ecsapi>=0.2.0; extra == "all"
|
|
168
169
|
Requires-Dist: ibm-platform-services>=0.48.0; extra == "all"
|
|
169
|
-
Requires-Dist:
|
|
170
|
+
Requires-Dist: azure-mgmt-compute>=33.0.0; extra == "all"
|
|
171
|
+
Requires-Dist: google-cloud-storage; extra == "all"
|
|
172
|
+
Requires-Dist: ibm-cloud-sdk-core; extra == "all"
|
|
170
173
|
Requires-Dist: colorama<0.4.5; extra == "all"
|
|
171
|
-
Requires-Dist:
|
|
172
|
-
Requires-Dist: ecsapi>=0.2.0; extra == "all"
|
|
173
|
-
Requires-Dist: kubernetes!=32.0.0,>=20.0.0; extra == "all"
|
|
174
|
+
Requires-Dist: docker; extra == "all"
|
|
174
175
|
Requires-Dist: tomli; python_version < "3.11" and extra == "all"
|
|
175
|
-
Requires-Dist: pydo>=0.3.0; extra == "all"
|
|
176
|
-
Requires-Dist: casbin; extra == "all"
|
|
177
176
|
Requires-Dist: ray[default]>=2.6.1; extra == "all"
|
|
178
|
-
Requires-Dist:
|
|
179
|
-
Requires-Dist:
|
|
180
|
-
Requires-Dist: sqlalchemy_adapter; extra == "all"
|
|
181
|
-
Requires-Dist: ibm-vpc; extra == "all"
|
|
182
|
-
Requires-Dist: runpod>=1.6.1; extra == "all"
|
|
183
|
-
Requires-Dist: passlib; extra == "all"
|
|
184
|
-
Requires-Dist: anyio; extra == "all"
|
|
185
|
-
Requires-Dist: grpcio>=1.63.0; extra == "all"
|
|
186
|
-
Requires-Dist: python-dateutil; extra == "all"
|
|
187
|
-
Requires-Dist: nebius>=0.2.47; extra == "all"
|
|
177
|
+
Requires-Dist: pyjwt; extra == "all"
|
|
178
|
+
Requires-Dist: kubernetes!=32.0.0,>=20.0.0; extra == "all"
|
|
188
179
|
Requires-Dist: pyvmomi==8.0.1.0.2; extra == "all"
|
|
180
|
+
Requires-Dist: grpcio>=1.63.0; extra == "all"
|
|
181
|
+
Requires-Dist: msrestazure; extra == "all"
|
|
189
182
|
Requires-Dist: oci; extra == "all"
|
|
190
|
-
Requires-Dist:
|
|
191
|
-
Requires-Dist:
|
|
192
|
-
Requires-Dist:
|
|
183
|
+
Requires-Dist: protobuf<7.0.0,>=5.26.1; extra == "all"
|
|
184
|
+
Requires-Dist: pydo>=0.3.0; extra == "all"
|
|
185
|
+
Requires-Dist: casbin; extra == "all"
|
|
186
|
+
Requires-Dist: pyopenssl<24.3.0,>=23.2.0; extra == "all"
|
|
187
|
+
Requires-Dist: cudo-compute>=0.1.10; extra == "all"
|
|
193
188
|
Requires-Dist: azure-mgmt-network>=27.0.0; extra == "all"
|
|
194
|
-
Requires-Dist:
|
|
189
|
+
Requires-Dist: azure-cli>=2.65.0; extra == "all"
|
|
195
190
|
Requires-Dist: azure-storage-blob>=12.23.1; extra == "all"
|
|
196
|
-
Requires-Dist:
|
|
191
|
+
Requires-Dist: greenlet; extra == "all"
|
|
192
|
+
Requires-Dist: azure-common; extra == "all"
|
|
197
193
|
Requires-Dist: aiosqlite; extra == "all"
|
|
194
|
+
Requires-Dist: anyio; extra == "all"
|
|
195
|
+
Requires-Dist: python-dateutil; extra == "all"
|
|
198
196
|
Requires-Dist: awscli>=1.27.10; extra == "all"
|
|
199
|
-
Requires-Dist:
|
|
200
|
-
Requires-Dist:
|
|
201
|
-
Requires-Dist:
|
|
197
|
+
Requires-Dist: azure-core>=1.24.0; extra == "all"
|
|
198
|
+
Requires-Dist: ibm-vpc; extra == "all"
|
|
199
|
+
Requires-Dist: nebius>=0.2.47; extra == "all"
|
|
200
|
+
Requires-Dist: passlib; extra == "all"
|
|
201
|
+
Requires-Dist: aiohttp; extra == "all"
|
|
202
202
|
Dynamic: author
|
|
203
203
|
Dynamic: classifier
|
|
204
204
|
Dynamic: description
|