skypilot-nightly 1.0.0.dev20251015__py3-none-any.whl → 1.0.0.dev20251017__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic. Click here for more details.
- sky/__init__.py +2 -2
- sky/authentication.py +17 -157
- sky/backends/backend_utils.py +6 -5
- sky/backends/cloud_vm_ray_backend.py +25 -12
- sky/catalog/kubernetes_catalog.py +5 -3
- sky/client/cli/command.py +0 -1
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/{webpack-ac3a34c8f9fef041.js → webpack-3c431f6c9086e487.js} +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/exceptions.py +13 -1
- sky/jobs/constants.py +1 -1
- sky/jobs/scheduler.py +2 -4
- sky/jobs/server/core.py +2 -1
- sky/jobs/server/server.py +5 -3
- sky/jobs/state.py +12 -6
- sky/jobs/utils.py +8 -2
- sky/provision/fluidstack/instance.py +2 -2
- sky/provision/seeweb/instance.py +3 -3
- sky/schemas/generated/jobsv1_pb2.py +52 -52
- sky/schemas/generated/jobsv1_pb2.pyi +4 -2
- sky/serve/server/server.py +1 -0
- sky/serve/service.py +2 -2
- sky/server/requests/executor.py +51 -15
- sky/server/requests/preconditions.py +2 -2
- sky/server/requests/requests.py +33 -24
- sky/server/requests/threads.py +106 -0
- sky/server/rest.py +36 -18
- sky/server/server.py +26 -4
- sky/server/stream_utils.py +10 -3
- sky/setup_files/dependencies.py +19 -8
- sky/skylet/constants.py +1 -1
- sky/skylet/services.py +3 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/command_runner.py +3 -0
- sky/utils/context_utils.py +2 -0
- sky/utils/locks.py +5 -2
- {skypilot_nightly-1.0.0.dev20251015.dist-info → skypilot_nightly-1.0.0.dev20251017.dist-info}/METADATA +281 -52
- {skypilot_nightly-1.0.0.dev20251015.dist-info → skypilot_nightly-1.0.0.dev20251017.dist-info}/RECORD +58 -56
- /sky/dashboard/out/_next/static/{-bih7JVStsXyeasac-dvQ → 3xvBA5BSGbiQ87tVmfbpY}/_buildManifest.js +0 -0
- /sky/dashboard/out/_next/static/{-bih7JVStsXyeasac-dvQ → 3xvBA5BSGbiQ87tVmfbpY}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20251015.dist-info → skypilot_nightly-1.0.0.dev20251017.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20251015.dist-info → skypilot_nightly-1.0.0.dev20251017.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20251015.dist-info → skypilot_nightly-1.0.0.dev20251017.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20251015.dist-info → skypilot_nightly-1.0.0.dev20251017.dist-info}/top_level.txt +0 -0
sky/server/requests/executor.py
CHANGED
|
@@ -48,6 +48,7 @@ from sky.server.requests import payloads
|
|
|
48
48
|
from sky.server.requests import preconditions
|
|
49
49
|
from sky.server.requests import process
|
|
50
50
|
from sky.server.requests import requests as api_requests
|
|
51
|
+
from sky.server.requests import threads
|
|
51
52
|
from sky.server.requests.queues import local_queue
|
|
52
53
|
from sky.server.requests.queues import mp_queue
|
|
53
54
|
from sky.skylet import constants
|
|
@@ -81,23 +82,28 @@ logger = sky_logging.init_logger(__name__)
|
|
|
81
82
|
# platforms, including macOS.
|
|
82
83
|
multiprocessing.set_start_method('spawn', force=True)
|
|
83
84
|
|
|
84
|
-
#
|
|
85
|
-
#
|
|
86
|
-
|
|
85
|
+
# An upper limit on the number of request-execution threads per server process.
|
|
86
|
+
# It is set high enough that it is unlikely to be reached, which allows higher
|
|
87
|
+
# concurrency while still preventing the server process from being overloaded.
|
|
88
|
+
_REQUEST_THREADS_LIMIT = 128
|
|
87
89
|
|
|
88
90
|
_REQUEST_THREAD_EXECUTOR_LOCK = threading.Lock()
|
|
89
|
-
# A dedicated thread pool executor for synced requests execution in coroutine
|
|
90
|
-
|
|
91
|
+
# A dedicated thread pool executor for synced requests execution in coroutine to
|
|
92
|
+
# avoid:
|
|
93
|
+
# 1. blocking the event loop;
|
|
94
|
+
# 2. exhausting the default thread pool executor of event loop;
|
|
95
|
+
_REQUEST_THREAD_EXECUTOR: Optional[threads.OnDemandThreadExecutor] = None
|
|
91
96
|
|
|
92
97
|
|
|
93
|
-
def get_request_thread_executor() ->
|
|
98
|
+
def get_request_thread_executor() -> threads.OnDemandThreadExecutor:
    """Return this process's request thread executor, creating it on first use.

    The module-level executor is read once without the lock and, if still
    unset, re-read while holding the lock before it is constructed, so that
    concurrent first callers end up sharing a single instance.
    """
    global _REQUEST_THREAD_EXECUTOR
    executor = _REQUEST_THREAD_EXECUTOR
    if executor is None:
        with _REQUEST_THREAD_EXECUTOR_LOCK:
            # Re-read under the lock: another thread may have created the
            # executor while we were waiting.
            executor = _REQUEST_THREAD_EXECUTOR
            if executor is None:
                executor = threads.OnDemandThreadExecutor(
                    name='request_thread_executor',
                    max_workers=_REQUEST_THREADS_LIMIT)
                _REQUEST_THREAD_EXECUTOR = executor
    return executor
|
|
103
109
|
|
|
@@ -561,6 +567,21 @@ class CoroutineTask:
|
|
|
561
567
|
pass
|
|
562
568
|
|
|
563
569
|
|
|
570
|
+
def check_request_thread_executor_available() -> None:
    """Fail fast when the request thread executor has no free worker.

    This is a best-effort check that hints the client to retry against another
    server process when the current one has no available thread worker. The
    check is racy: a request may pass it and still fail to get a worker at
    execution time, in which case the client sees a failed request instead of
    a retry.

    TODO(aylei): this can be refined with a refactor of our coroutine
    execution flow.
    """
    executor = get_request_thread_executor()
    executor.check_available()
|
|
583
|
+
|
|
584
|
+
|
|
564
585
|
def execute_request_in_coroutine(
|
|
565
586
|
request: api_requests.Request) -> CoroutineTask:
|
|
566
587
|
"""Execute a request in current event loop.
|
|
@@ -575,6 +596,18 @@ def execute_request_in_coroutine(
|
|
|
575
596
|
return CoroutineTask(task)
|
|
576
597
|
|
|
577
598
|
|
|
599
|
+
def _execute_with_config_override(func: Callable,
                                  request_body: payloads.RequestBody,
                                  request_id: str, request_name: str,
                                  **kwargs) -> Any:
    """Call ``func(**kwargs)`` in a thread with the request's env and config.

    Args:
        func: The callable to run.
        request_body: Request payload used to build the override.
        request_id: ID of the request being executed.
        request_name: Name of the request being executed.
        **kwargs: Keyword arguments forwarded to ``func``.

    Returns:
        Whatever ``func`` returns.
    """
    # The env/config override is applied within this thread's context, which
    # gets copied when we call to_thread.
    override_ctx = override_request_env_and_config(request_body, request_id,
                                                   request_name)
    with override_ctx:
        return func(**kwargs)
|
|
609
|
+
|
|
610
|
+
|
|
578
611
|
async def _execute_request_coroutine(request: api_requests.Request):
|
|
579
612
|
"""Execute a request in current event loop.
|
|
580
613
|
|
|
@@ -592,14 +625,17 @@ async def _execute_request_coroutine(request: api_requests.Request):
|
|
|
592
625
|
request_task.status = api_requests.RequestStatus.RUNNING
|
|
593
626
|
# Redirect stdout and stderr to the request log path.
|
|
594
627
|
original_output = ctx.redirect_log(request.log_path)
|
|
595
|
-
|
|
596
|
-
|
|
597
|
-
|
|
598
|
-
|
|
599
|
-
|
|
600
|
-
|
|
601
|
-
|
|
602
|
-
|
|
628
|
+
try:
|
|
629
|
+
fut: asyncio.Future = context_utils.to_thread_with_executor(
|
|
630
|
+
get_request_thread_executor(), _execute_with_config_override, func,
|
|
631
|
+
request_body, request.request_id, request.name,
|
|
632
|
+
**request_body.to_kwargs())
|
|
633
|
+
except Exception as e: # pylint: disable=broad-except
|
|
634
|
+
ctx.redirect_log(original_output)
|
|
635
|
+
api_requests.set_request_failed(request.request_id, e)
|
|
636
|
+
logger.error(f'Failed to run request {request.request_id} due to '
|
|
637
|
+
f'{common_utils.format_exception(e)}')
|
|
638
|
+
return
|
|
603
639
|
|
|
604
640
|
async def poll_task(request_id: str) -> bool:
|
|
605
641
|
req_status = await api_requests.get_request_status_async(request_id)
|
|
@@ -162,8 +162,8 @@ class ClusterStartCompletePrecondition(Precondition):
|
|
|
162
162
|
requests = await api_requests.get_request_tasks_async(
|
|
163
163
|
req_filter=api_requests.RequestTaskFilter(
|
|
164
164
|
status=[
|
|
165
|
-
api_requests.RequestStatus.
|
|
166
|
-
api_requests.RequestStatus.
|
|
165
|
+
api_requests.RequestStatus.PENDING,
|
|
166
|
+
api_requests.RequestStatus.RUNNING
|
|
167
167
|
],
|
|
168
168
|
include_request_names=['sky.launch', 'sky.start'],
|
|
169
169
|
cluster_names=[self.cluster_name]))
|
sky/server/requests/requests.py
CHANGED
|
@@ -398,9 +398,9 @@ def kill_cluster_requests(cluster_name: str, exclude_request_name: str):
|
|
|
398
398
|
request_ids = [
|
|
399
399
|
request_task.request_id
|
|
400
400
|
for request_task in get_request_tasks(req_filter=RequestTaskFilter(
|
|
401
|
-
cluster_names=[cluster_name],
|
|
402
401
|
status=[RequestStatus.PENDING, RequestStatus.RUNNING],
|
|
403
|
-
exclude_request_names=[exclude_request_name]
|
|
402
|
+
exclude_request_names=[exclude_request_name],
|
|
403
|
+
cluster_names=[cluster_name]))
|
|
404
404
|
]
|
|
405
405
|
kill_requests(request_ids)
|
|
406
406
|
|
|
@@ -422,10 +422,10 @@ def kill_requests(request_ids: Optional[List[str]] = None,
|
|
|
422
422
|
request_ids = [
|
|
423
423
|
request_task.request_id
|
|
424
424
|
for request_task in get_request_tasks(req_filter=RequestTaskFilter(
|
|
425
|
-
|
|
426
|
-
status=[RequestStatus.RUNNING, RequestStatus.PENDING],
|
|
425
|
+
status=[RequestStatus.PENDING, RequestStatus.RUNNING],
|
|
427
426
|
# Avoid cancelling the cancel request itself.
|
|
428
|
-
exclude_request_names=['sky.api_cancel']
|
|
427
|
+
exclude_request_names=['sky.api_cancel'],
|
|
428
|
+
user_id=user_id))
|
|
429
429
|
]
|
|
430
430
|
cancelled_request_ids = []
|
|
431
431
|
for request_id in request_ids:
|
|
@@ -497,6 +497,21 @@ def create_table(cursor, conn):
|
|
|
497
497
|
db_utils.add_column_to_table(cursor, conn, REQUEST_TABLE, COL_FINISHED_AT,
|
|
498
498
|
'REAL')
|
|
499
499
|
|
|
500
|
+
# Add an index on (status, name) to speed up queries
|
|
501
|
+
# that filter on these columns.
|
|
502
|
+
cursor.execute(f"""\
|
|
503
|
+
CREATE INDEX IF NOT EXISTS status_name_idx ON {REQUEST_TABLE} (status, name) WHERE status IN ('PENDING', 'RUNNING');
|
|
504
|
+
""")
|
|
505
|
+
# Add an index on cluster_name to speed up queries
|
|
506
|
+
# that filter on this column.
|
|
507
|
+
cursor.execute(f"""\
|
|
508
|
+
CREATE INDEX IF NOT EXISTS cluster_name_idx ON {REQUEST_TABLE} ({COL_CLUSTER_NAME}) WHERE status IN ('PENDING', 'RUNNING');
|
|
509
|
+
""")
|
|
510
|
+
# Add an index on created_at to speed up queries that sort on this column.
|
|
511
|
+
cursor.execute(f"""\
|
|
512
|
+
CREATE INDEX IF NOT EXISTS created_at_idx ON {REQUEST_TABLE} (created_at);
|
|
513
|
+
""")
|
|
514
|
+
|
|
500
515
|
|
|
501
516
|
_DB = None
|
|
502
517
|
_init_db_lock = threading.Lock()
|
|
@@ -642,6 +657,7 @@ def get_request(request_id: str) -> Optional[Request]:
|
|
|
642
657
|
@asyncio_utils.shield
|
|
643
658
|
async def get_request_async(request_id: str) -> Optional[Request]:
|
|
644
659
|
"""Async version of get_request."""
|
|
660
|
+
# TODO(aylei): figure out how to remove FileLock here to avoid the overhead
|
|
645
661
|
async with filelock.AsyncFileLock(request_lock_path(request_id)):
|
|
646
662
|
return await _get_request_no_lock_async(request_id)
|
|
647
663
|
|
|
@@ -752,6 +768,10 @@ class RequestTaskFilter:
|
|
|
752
768
|
status_list_str = ','.join(
|
|
753
769
|
repr(status.value) for status in self.status)
|
|
754
770
|
filters.append(f'status IN ({status_list_str})')
|
|
771
|
+
if self.include_request_names is not None:
|
|
772
|
+
request_names_str = ','.join(
|
|
773
|
+
repr(name) for name in self.include_request_names)
|
|
774
|
+
filters.append(f'name IN ({request_names_str})')
|
|
755
775
|
if self.exclude_request_names is not None:
|
|
756
776
|
exclude_request_names_str = ','.join(
|
|
757
777
|
repr(name) for name in self.exclude_request_names)
|
|
@@ -763,10 +783,6 @@ class RequestTaskFilter:
|
|
|
763
783
|
if self.user_id is not None:
|
|
764
784
|
filters.append(f'{COL_USER_ID} = ?')
|
|
765
785
|
filter_params.append(self.user_id)
|
|
766
|
-
if self.include_request_names is not None:
|
|
767
|
-
request_names_str = ','.join(
|
|
768
|
-
repr(name) for name in self.include_request_names)
|
|
769
|
-
filters.append(f'name IN ({request_names_str})')
|
|
770
786
|
if self.finished_before is not None:
|
|
771
787
|
filters.append('finished_at < ?')
|
|
772
788
|
filter_params.append(self.finished_before)
|
|
@@ -799,6 +815,10 @@ def get_request_tasks(req_filter: RequestTaskFilter) -> List[Request]:
|
|
|
799
815
|
rows = cursor.fetchall()
|
|
800
816
|
if rows is None:
|
|
801
817
|
return []
|
|
818
|
+
if req_filter.fields:
|
|
819
|
+
rows = [
|
|
820
|
+
_update_request_row_fields(row, req_filter.fields) for row in rows
|
|
821
|
+
]
|
|
802
822
|
return [Request.from_row(row) for row in rows]
|
|
803
823
|
|
|
804
824
|
|
|
@@ -811,21 +831,10 @@ async def get_request_tasks_async(
|
|
|
811
831
|
async with _DB.execute_fetchall_async(*req_filter.build_query()) as rows:
|
|
812
832
|
if not rows:
|
|
813
833
|
return []
|
|
814
|
-
|
|
815
|
-
|
|
816
|
-
|
|
817
|
-
|
|
818
|
-
@metrics_lib.time_me_async
|
|
819
|
-
async def get_request_tasks_with_fields_async(
|
|
820
|
-
req_filter: RequestTaskFilter,
|
|
821
|
-
fields: Optional[List[str]] = None,
|
|
822
|
-
) -> List[Request]:
|
|
823
|
-
"""Async version of get_request_tasks."""
|
|
824
|
-
assert _DB is not None
|
|
825
|
-
async with _DB.execute_fetchall_async(*req_filter.build_query()) as rows:
|
|
826
|
-
if not rows:
|
|
827
|
-
return []
|
|
828
|
-
rows = [_update_request_row_fields(row, fields) for row in rows]
|
|
834
|
+
if req_filter.fields:
|
|
835
|
+
rows = [
|
|
836
|
+
_update_request_row_fields(row, req_filter.fields) for row in rows
|
|
837
|
+
]
|
|
829
838
|
return [Request.from_row(row) for row in rows]
|
|
830
839
|
|
|
831
840
|
|
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
"""Request execution threads management."""
|
|
2
|
+
|
|
3
|
+
import concurrent.futures
|
|
4
|
+
import threading
|
|
5
|
+
from typing import Callable, Set
|
|
6
|
+
|
|
7
|
+
from sky import exceptions
|
|
8
|
+
from sky import sky_logging
|
|
9
|
+
from sky.utils import atomic
|
|
10
|
+
|
|
11
|
+
logger = sky_logging.init_logger(__name__)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class OnDemandThreadExecutor(concurrent.futures.Executor):
    """An executor that creates a new thread for each task and destroys it
    after the task is completed.

    Note(dev):
    We raise an error instead of queuing the request if the limit is reached,
    so that:
    1. the request might be handled by other processes that have idle workers
       upon retry;
    2. if not, then users can be clearly hinted that they need to scale the API
       server to support higher concurrency.
    So this executor is only suitable for carefully selected cases where the
    error can be properly handled by the caller. To make this executor general,
    we need to support configuring the queuing behavior (exception or
    queueing).
    """

    def __init__(self, name: str, max_workers: int):
        """Create an executor.

        Args:
            name: Used in thread names, log messages and error messages.
            max_workers: Maximum number of concurrently borrowed workers.
        """
        self.name: str = name
        self.max_workers: int = max_workers
        # Number of workers currently borrowed (running or about to run).
        self.running: atomic.AtomicInt = atomic.AtomicInt(0)
        self._shutdown: bool = False
        self._shutdown_lock: threading.Lock = threading.Lock()
        # Threads that have been started and have not finished cleanup yet.
        self._threads: Set[threading.Thread] = set()
        self._threads_lock: threading.Lock = threading.Lock()

    def _cleanup_thread(self, thread: threading.Thread):
        """Forget a thread so that shutdown() no longer waits for it."""
        with self._threads_lock:
            self._threads.discard(thread)

    def _task_wrapper(self, fn: Callable, fut: concurrent.futures.Future, /,
                      *args, **kwargs):
        """Thread target: run ``fn`` and publish its outcome on ``fut``.

        The borrowed worker is always returned and the thread always
        unregistered, whatever the outcome.
        """
        try:
            # Follow the Executor protocol: move the future to RUNNING before
            # doing any work. If the future was cancelled in the meantime the
            # task must not run, and setting a result on it would raise
            # InvalidStateError inside this thread.
            if not fut.set_running_or_notify_cancel():
                return
            try:
                result = fn(*args, **kwargs)
            except Exception as e:  # pylint: disable=broad-except
                logger.debug(
                    f'Executor [{self.name}] error executing {fn}: {e}')
                fut.set_exception(e)
            else:
                fut.set_result(result)
        finally:
            self.running.decrement()
            self._cleanup_thread(threading.current_thread())

    def check_available(self, borrow: bool = False) -> int:
        """Check if there are available workers.

        Args:
            borrow: If True, the caller borrows a worker from the executor.
                The caller is responsible for returning the worker to the
                executor after the task is completed.

        Returns:
            The value returned by ``running.increment()`` for this check.

        Raises:
            exceptions.ConcurrentWorkerExhaustedError: if the incremented
                count exceeds ``max_workers``.
        """
        count = self.running.increment()
        if count > self.max_workers:
            # Undo the tentative borrow before reporting exhaustion.
            self.running.decrement()
            raise exceptions.ConcurrentWorkerExhaustedError(
                f'Maximum concurrent workers {self.max_workers} of threads '
                f'executor [{self.name}] reached')
        if not borrow:
            self.running.decrement()
        return count

    def submit(self, fn, /, *args, **kwargs):
        """Run ``fn(*args, **kwargs)`` in a new daemon thread.

        Returns:
            A ``concurrent.futures.Future`` for the call.

        Raises:
            RuntimeError: if the executor has been shut down.
            exceptions.ConcurrentWorkerExhaustedError: if no worker is
                available; the task is not queued.
        """
        with self._shutdown_lock:
            if self._shutdown:
                raise RuntimeError(
                    'Cannot submit task after executor is shutdown')
        count = self.check_available(borrow=True)
        fut: concurrent.futures.Future = concurrent.futures.Future()
        # Name is assigned for debugging purpose, duplication is fine
        thread = threading.Thread(target=self._task_wrapper,
                                  name=f'{self.name}-{count}',
                                  args=(fn, fut, *args),
                                  kwargs=kwargs,
                                  daemon=True)
        # Register and start under the same lock so that shutdown() never
        # snapshots (and tries to join) a thread that has not been started.
        with self._threads_lock:
            self._threads.add(thread)
            try:
                thread.start()
            except Exception as e:  # pylint: disable=broad-except
                # _threads_lock is already held, so discard directly instead
                # of calling _cleanup_thread().
                self._threads.discard(thread)
                self.running.decrement()
                fut.set_exception(e)
                raise
        assert thread.ident is not None, 'Thread should be started'
        return fut

    def shutdown(self, wait=True, *, cancel_futures=False):
        """Reject new submissions and optionally wait for running threads.

        Args:
            wait: If True, join every thread that is still registered.
            cancel_futures: Accepted for compatibility with
                ``concurrent.futures.Executor.shutdown``. Tasks are started
                immediately on submit and never queued, so there is nothing
                pending to cancel.
        """
        del cancel_futures  # No queued work exists to cancel.
        with self._shutdown_lock:
            self._shutdown = True
        if not wait:
            return
        with self._threads_lock:
            threads = list(self._threads)
        for t in threads:
            t.join()
|
sky/server/rest.py
CHANGED
|
@@ -178,14 +178,16 @@ def _retry_on_server_unavailable(max_wait_seconds: int = 600,
|
|
|
178
178
|
Notes(dev):
|
|
179
179
|
"""
|
|
180
180
|
|
|
181
|
+
def _readable_error_msg(message: str) -> str:
    """Build the yellow "temporarily unavailable, retrying" status text."""
    text = f'API server is temporarily unavailable: {message}.\nRetrying...'
    return f'{colorama.Fore.YELLOW}{text}{colorama.Style.RESET_ALL}'
|
|
185
|
+
|
|
181
186
|
def decorator(func: F) -> F:
|
|
182
187
|
|
|
183
188
|
@functools.wraps(func)
|
|
184
189
|
def wrapper(*args, **kwargs) -> Any:
|
|
185
|
-
|
|
186
|
-
f'{colorama.Fore.YELLOW}API server is temporarily unavailable: '
|
|
187
|
-
'upgrade in progress. Waiting to resume...'
|
|
188
|
-
f'{colorama.Style.RESET_ALL}')
|
|
190
|
+
|
|
189
191
|
backoff = common_utils.Backoff(
|
|
190
192
|
initial_backoff=initial_backoff,
|
|
191
193
|
max_backoff_factor=max_backoff_factor)
|
|
@@ -203,7 +205,8 @@ def _retry_on_server_unavailable(max_wait_seconds: int = 600,
|
|
|
203
205
|
# stop the status spinner before retrying func() to
|
|
204
206
|
# avoid the status spinner get stuck if the func() runs
|
|
205
207
|
# for a long time without update status, e.g. sky logs.
|
|
206
|
-
with rich_utils.client_status(
|
|
208
|
+
with rich_utils.client_status(
|
|
209
|
+
_readable_error_msg(e.message)):
|
|
207
210
|
if time.time() - start_time > max_wait_seconds:
|
|
208
211
|
# pylint: disable=line-too-long
|
|
209
212
|
raise exceptions.ServerTemporarilyUnavailableError(
|
|
@@ -224,14 +227,33 @@ def _retry_on_server_unavailable(max_wait_seconds: int = 600,
|
|
|
224
227
|
|
|
225
228
|
|
|
226
229
|
def handle_server_unavailable(response: 'requests.Response') -> None:
    """Raise a retryable error if the response is 503 (Service Unavailable).

    The client gets a 503 error in the following cases:
    1. The reverse proxy cannot find any ready backend endpoints to serve the
       request, e.g. when there is a rolling update.
    2. The SkyPilot API server has a temporary resource issue, e.g. when the
       concurrency of the handling process is exhausted.

    We expect the caller (CLI or SDK) to retry in these cases and show a clear
    wait message, so that the user can decide whether to keep waiting or abort
    the request.

    Args:
        response: The HTTP response to inspect. Both a ``requests`` response
            (``status_code``) and an ``aiohttp`` response (``status``) are
            accepted, because the async request path passes the latter.

    Raises:
        exceptions.ServerTemporarilyUnavailableError: if the status is 503.
            The message is the ``detail`` field of the JSON body when
            available, else the body text, else an empty string.
    """
    # requests exposes `status_code`; aiohttp exposes `status`.
    status = getattr(response, 'status_code', None)
    if status is None:
        status = getattr(response, 'status', None)
    if status != 503:
        return

    error_msg = ''
    try:
        response_data = response.json()
        if hasattr(response_data, '__await__'):
            # aiohttp's json() is awaitable and cannot be awaited from this
            # synchronous helper; close it to avoid a "never awaited" warning
            # and fall back to an empty message.
            close = getattr(response_data, 'close', None)
            if close is not None:
                close()
        elif 'detail' in response_data:
            error_msg = response_data['detail']
    except Exception:  # pylint: disable=broad-except
        # Body is not JSON (or not a mapping): use the raw text if it is an
        # actual string (on aiohttp `text` is a method, not the body).
        body_text = getattr(response, 'text', None)
        if isinstance(body_text, str) and body_text:
            error_msg = body_text

    with ux_utils.print_exception_no_traceback():
        raise exceptions.ServerTemporarilyUnavailableError(error_msg)
|
|
235
257
|
|
|
236
258
|
|
|
237
259
|
@_retry_on_server_unavailable()
|
|
@@ -310,11 +332,7 @@ async def request_without_retry_async(session: 'aiohttp.ClientSession',
|
|
|
310
332
|
response = await session.request(method, url, **kwargs)
|
|
311
333
|
|
|
312
334
|
# Handle server unavailability (503 status) - same as sync version
|
|
313
|
-
|
|
314
|
-
with ux_utils.print_exception_no_traceback():
|
|
315
|
-
raise exceptions.ServerTemporarilyUnavailableError(
|
|
316
|
-
'SkyPilot API server is temporarily unavailable. '
|
|
317
|
-
'Please try again later.')
|
|
335
|
+
handle_server_unavailable(response)
|
|
318
336
|
|
|
319
337
|
# Set remote API version and version from headers - same as sync version
|
|
320
338
|
remote_api_version = response.headers.get(constants.API_VERSION_HEADER)
|
sky/server/server.py
CHANGED
|
@@ -17,6 +17,7 @@ import resource
|
|
|
17
17
|
import shutil
|
|
18
18
|
import sys
|
|
19
19
|
import threading
|
|
20
|
+
import traceback
|
|
20
21
|
from typing import Dict, List, Literal, Optional, Set, Tuple
|
|
21
22
|
import uuid
|
|
22
23
|
import zipfile
|
|
@@ -74,6 +75,7 @@ from sky.utils import dag_utils
|
|
|
74
75
|
from sky.utils import perf_utils
|
|
75
76
|
from sky.utils import status_lib
|
|
76
77
|
from sky.utils import subprocess_utils
|
|
78
|
+
from sky.utils import ux_utils
|
|
77
79
|
from sky.utils.db import db_utils
|
|
78
80
|
from sky.volumes.server import server as volumes_rest
|
|
79
81
|
from sky.workspaces import server as workspaces_rest
|
|
@@ -664,6 +666,25 @@ except Exception: # pylint: disable=broad-except
|
|
|
664
666
|
pass # no issue, we will warn the user later if its too low
|
|
665
667
|
|
|
666
668
|
|
|
669
|
+
@app.exception_handler(exceptions.ConcurrentWorkerExhaustedError)
def handle_concurrent_worker_exhausted_error(
        request: fastapi.Request, e: exceptions.ConcurrentWorkerExhaustedError):
    """Turn a worker-exhausted error into a 503 JSON response.

    The detailed error and its traceback go to the server log; the client only
    receives a short, human readable message in the ``detail`` field.
    """
    del request  # request is not used
    # Print detailed error message to server log
    logger.error('Concurrent worker exhausted: '
                 f'{common_utils.format_exception(e)}')
    with ux_utils.enable_traceback():
        # Format from the exception object itself rather than
        # traceback.format_exc(): the latter depends on an exception being
        # actively handled in the current thread, which is not guaranteed for
        # a synchronous handler and would then log only 'NoneType: None'.
        tb_text = ''.join(
            traceback.format_exception(type(e), e, e.__traceback__))
        logger.error(f' Traceback: {tb_text}')
    # Return human readable error message to client
    return fastapi.responses.JSONResponse(
        status_code=503,
        content={
            'detail':
                ('The server has exhausted its concurrent worker limit. '
                 'Please try again or scale the server if the load persists.')
        })
|
|
686
|
+
|
|
687
|
+
|
|
667
688
|
@app.get('/token')
|
|
668
689
|
async def token(request: fastapi.Request,
|
|
669
690
|
local_port: Optional[int] = None) -> fastapi.responses.Response:
|
|
@@ -1232,6 +1253,7 @@ async def logs(
|
|
|
1232
1253
|
# TODO(zhwu): This should wait for the request on the cluster, e.g., async
|
|
1233
1254
|
# launch, to finish, so that a user does not need to manually pull the
|
|
1234
1255
|
# request status.
|
|
1256
|
+
executor.check_request_thread_executor_available()
|
|
1235
1257
|
request_task = executor.prepare_request(
|
|
1236
1258
|
request_id=request.state.request_id,
|
|
1237
1259
|
request_name='logs',
|
|
@@ -1466,6 +1488,8 @@ async def api_get(request_id: str) -> payloads.RequestPayload:
|
|
|
1466
1488
|
# to avoid storming the DB and CPU in the meantime
|
|
1467
1489
|
await asyncio.sleep(0.1)
|
|
1468
1490
|
request_task = await requests_lib.get_request_async(request_id)
|
|
1491
|
+
# TODO(aylei): refine this, /api/get will not be retried and this is
|
|
1492
|
+
# meaningless to retry. It is the original request that should be retried.
|
|
1469
1493
|
if request_task.should_retry:
|
|
1470
1494
|
raise fastapi.HTTPException(
|
|
1471
1495
|
status_code=503, detail=f'Request {request_id!r} should be retried')
|
|
@@ -1643,14 +1667,12 @@ async def api_status(
|
|
|
1643
1667
|
requests_lib.RequestStatus.PENDING,
|
|
1644
1668
|
requests_lib.RequestStatus.RUNNING,
|
|
1645
1669
|
]
|
|
1646
|
-
request_tasks = await requests_lib.
|
|
1670
|
+
request_tasks = await requests_lib.get_request_tasks_async(
|
|
1647
1671
|
req_filter=requests_lib.RequestTaskFilter(
|
|
1648
1672
|
status=statuses,
|
|
1649
1673
|
limit=limit,
|
|
1650
1674
|
fields=fields,
|
|
1651
|
-
)
|
|
1652
|
-
fields=fields,
|
|
1653
|
-
)
|
|
1675
|
+
))
|
|
1654
1676
|
return requests_lib.encode_requests(request_tasks)
|
|
1655
1677
|
else:
|
|
1656
1678
|
encoded_request_tasks = []
|
sky/server/stream_utils.py
CHANGED
|
@@ -215,11 +215,18 @@ async def _tail_log_file(
|
|
|
215
215
|
# periodically to see if provisioning is done.
|
|
216
216
|
if cluster_name is not None and should_check_status:
|
|
217
217
|
last_status_check_time = current_time
|
|
218
|
-
|
|
218
|
+
cluster_status = await (
|
|
219
219
|
global_user_state.get_status_from_cluster_name_async(
|
|
220
220
|
cluster_name))
|
|
221
|
-
if
|
|
222
|
-
|
|
221
|
+
if cluster_status is None:
|
|
222
|
+
logger.debug(
|
|
223
|
+
'Stop tailing provision logs for cluster'
|
|
224
|
+
f' status for cluster {cluster_name} not found')
|
|
225
|
+
break
|
|
226
|
+
if cluster_status != status_lib.ClusterStatus.INIT:
|
|
227
|
+
logger.debug(f'Stop tailing provision logs for cluster'
|
|
228
|
+
f' {cluster_name} has status {cluster_status} '
|
|
229
|
+
'(not in INIT state)')
|
|
223
230
|
break
|
|
224
231
|
if current_time - last_heartbeat_time >= _HEARTBEAT_INTERVAL:
|
|
225
232
|
# Currently just used to keep the connection busy, refer to
|
sky/setup_files/dependencies.py
CHANGED
|
@@ -86,7 +86,6 @@ install_requires = [
|
|
|
86
86
|
'types-paramiko',
|
|
87
87
|
'alembic',
|
|
88
88
|
'aiohttp',
|
|
89
|
-
'aiosqlite',
|
|
90
89
|
'anyio',
|
|
91
90
|
]
|
|
92
91
|
|
|
@@ -104,6 +103,10 @@ GRPC = 'grpcio>=1.63.0'
|
|
|
104
103
|
PROTOBUF = 'protobuf>=5.26.1, < 7.0.0'
|
|
105
104
|
|
|
106
105
|
server_dependencies = [
|
|
106
|
+
# TODO: Some of these dependencies are also specified in install_requires,
|
|
107
|
+
# so they are redundant here. We should figure out if they are only needed
|
|
108
|
+
# on the server (should remove from install_requires), or if they are needed
|
|
109
|
+
# on the client (should remove from here).
|
|
107
110
|
'casbin',
|
|
108
111
|
'sqlalchemy_adapter',
|
|
109
112
|
'passlib',
|
|
@@ -148,7 +151,7 @@ aws_dependencies = [
|
|
|
148
151
|
# a few places.
|
|
149
152
|
AZURE_CLI = 'azure-cli>=2.65.0'
|
|
150
153
|
|
|
151
|
-
|
|
154
|
+
cloud_dependencies: Dict[str, List[str]] = {
|
|
152
155
|
'aws': aws_dependencies,
|
|
153
156
|
# TODO(zongheng): azure-cli is huge and takes a long time to install.
|
|
154
157
|
# Tracked in: https://github.com/Azure/azure-cli/issues/7387
|
|
@@ -191,7 +194,6 @@ extras_require: Dict[str, List[str]] = {
|
|
|
191
194
|
'kubernetes>=20.0.0,!=32.0.0', 'websockets', 'python-dateutil'
|
|
192
195
|
],
|
|
193
196
|
'ssh': ['kubernetes>=20.0.0,!=32.0.0', 'websockets', 'python-dateutil'],
|
|
194
|
-
'remote': remote,
|
|
195
197
|
# For the container registry auth api. Reference:
|
|
196
198
|
# https://github.com/runpod/runpod-python/releases/tag/1.6.1
|
|
197
199
|
# RunPod needs a TOML parser to read ~/.runpod/config.toml. On Python 3.11+
|
|
@@ -221,13 +223,11 @@ extras_require: Dict[str, List[str]] = {
|
|
|
221
223
|
] + aws_dependencies,
|
|
222
224
|
'hyperbolic': [], # No dependencies needed for hyperbolic
|
|
223
225
|
'seeweb': ['ecsapi>=0.2.0'],
|
|
224
|
-
'server': server_dependencies,
|
|
225
226
|
'shadeform': [], # No dependencies needed for shadeform
|
|
226
227
|
}
|
|
227
228
|
|
|
228
229
|
# Calculate which clouds should be included in the [all] installation.
|
|
229
|
-
clouds_for_all = set(
|
|
230
|
-
clouds_for_all.remove('remote')
|
|
230
|
+
clouds_for_all = set(cloud_dependencies)
|
|
231
231
|
|
|
232
232
|
if sys.version_info < (3, 10):
|
|
233
233
|
# Nebius needs python3.10. If python 3.9 [all] will not install nebius
|
|
@@ -242,5 +242,16 @@ if sys.version_info >= (3, 12):
|
|
|
242
242
|
# TODO: Remove once https://github.com/vast-ai/vast-sdk/pull/6 is released
|
|
243
243
|
clouds_for_all.remove('vast')
|
|
244
244
|
|
|
245
|
-
|
|
246
|
-
|
|
245
|
+
cloud_extras = {
|
|
246
|
+
cloud: dependencies + server_dependencies
|
|
247
|
+
for cloud, dependencies in cloud_dependencies.items()
|
|
248
|
+
}
|
|
249
|
+
|
|
250
|
+
extras_require: Dict[str, List[str]] = {
|
|
251
|
+
# Include server_dependencies with each cloud.
|
|
252
|
+
**cloud_extras,
|
|
253
|
+
'all': list(set().union(*[cloud_extras[cloud] for cloud in clouds_for_all])
|
|
254
|
+
),
|
|
255
|
+
'remote': remote,
|
|
256
|
+
'server': server_dependencies,
|
|
257
|
+
}
|
sky/skylet/constants.py
CHANGED
|
@@ -100,7 +100,7 @@ TASK_ID_LIST_ENV_VAR = f'{SKYPILOT_ENV_VAR_PREFIX}TASK_IDS'
|
|
|
100
100
|
# cluster yaml is updated.
|
|
101
101
|
#
|
|
102
102
|
# TODO(zongheng,zhanghao): make the upgrading of skylet automatic?
|
|
103
|
-
SKYLET_VERSION = '
|
|
103
|
+
SKYLET_VERSION = '23'
|
|
104
104
|
# The version of the lib files that skylet/jobs use. Whenever there is an API
|
|
105
105
|
# change for the job_lib or log_lib, we need to bump this version, so that the
|
|
106
106
|
# user can be notified to update their SkyPilot version on the remote cluster.
|
sky/skylet/services.py
CHANGED
|
@@ -216,10 +216,12 @@ class JobsServiceImpl(jobsv1_pb2_grpc.JobsServiceServicer):
|
|
|
216
216
|
if pool is not None:
|
|
217
217
|
pool_hash = serve_state.get_service_hash(pool)
|
|
218
218
|
# Add the managed job to job queue database.
|
|
219
|
+
user_id = managed_job.user_id if managed_job.HasField(
|
|
220
|
+
'user_id') else None
|
|
219
221
|
managed_job_state.set_job_info(job_id, managed_job.name,
|
|
220
222
|
managed_job.workspace,
|
|
221
223
|
managed_job.entrypoint, pool,
|
|
222
|
-
pool_hash)
|
|
224
|
+
pool_hash, user_id)
|
|
223
225
|
# Set the managed job to PENDING state to make sure that
|
|
224
226
|
# this managed job appears in the `sky jobs queue`, even
|
|
225
227
|
# if it needs to wait to be submitted.
|