skypilot-nightly 1.0.0.dev20251015__py3-none-any.whl → 1.0.0.dev20251017__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (58) hide show
  1. sky/__init__.py +2 -2
  2. sky/authentication.py +17 -157
  3. sky/backends/backend_utils.py +6 -5
  4. sky/backends/cloud_vm_ray_backend.py +25 -12
  5. sky/catalog/kubernetes_catalog.py +5 -3
  6. sky/client/cli/command.py +0 -1
  7. sky/dashboard/out/404.html +1 -1
  8. sky/dashboard/out/_next/static/chunks/{webpack-ac3a34c8f9fef041.js → webpack-3c431f6c9086e487.js} +1 -1
  9. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  10. sky/dashboard/out/clusters/[cluster].html +1 -1
  11. sky/dashboard/out/clusters.html +1 -1
  12. sky/dashboard/out/config.html +1 -1
  13. sky/dashboard/out/index.html +1 -1
  14. sky/dashboard/out/infra/[context].html +1 -1
  15. sky/dashboard/out/infra.html +1 -1
  16. sky/dashboard/out/jobs/[job].html +1 -1
  17. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  18. sky/dashboard/out/jobs.html +1 -1
  19. sky/dashboard/out/users.html +1 -1
  20. sky/dashboard/out/volumes.html +1 -1
  21. sky/dashboard/out/workspace/new.html +1 -1
  22. sky/dashboard/out/workspaces/[name].html +1 -1
  23. sky/dashboard/out/workspaces.html +1 -1
  24. sky/exceptions.py +13 -1
  25. sky/jobs/constants.py +1 -1
  26. sky/jobs/scheduler.py +2 -4
  27. sky/jobs/server/core.py +2 -1
  28. sky/jobs/server/server.py +5 -3
  29. sky/jobs/state.py +12 -6
  30. sky/jobs/utils.py +8 -2
  31. sky/provision/fluidstack/instance.py +2 -2
  32. sky/provision/seeweb/instance.py +3 -3
  33. sky/schemas/generated/jobsv1_pb2.py +52 -52
  34. sky/schemas/generated/jobsv1_pb2.pyi +4 -2
  35. sky/serve/server/server.py +1 -0
  36. sky/serve/service.py +2 -2
  37. sky/server/requests/executor.py +51 -15
  38. sky/server/requests/preconditions.py +2 -2
  39. sky/server/requests/requests.py +33 -24
  40. sky/server/requests/threads.py +106 -0
  41. sky/server/rest.py +36 -18
  42. sky/server/server.py +26 -4
  43. sky/server/stream_utils.py +10 -3
  44. sky/setup_files/dependencies.py +19 -8
  45. sky/skylet/constants.py +1 -1
  46. sky/skylet/services.py +3 -1
  47. sky/utils/auth_utils.py +153 -0
  48. sky/utils/command_runner.py +3 -0
  49. sky/utils/context_utils.py +2 -0
  50. sky/utils/locks.py +5 -2
  51. {skypilot_nightly-1.0.0.dev20251015.dist-info → skypilot_nightly-1.0.0.dev20251017.dist-info}/METADATA +281 -52
  52. {skypilot_nightly-1.0.0.dev20251015.dist-info → skypilot_nightly-1.0.0.dev20251017.dist-info}/RECORD +58 -56
  53. /sky/dashboard/out/_next/static/{-bih7JVStsXyeasac-dvQ → 3xvBA5BSGbiQ87tVmfbpY}/_buildManifest.js +0 -0
  54. /sky/dashboard/out/_next/static/{-bih7JVStsXyeasac-dvQ → 3xvBA5BSGbiQ87tVmfbpY}/_ssgManifest.js +0 -0
  55. {skypilot_nightly-1.0.0.dev20251015.dist-info → skypilot_nightly-1.0.0.dev20251017.dist-info}/WHEEL +0 -0
  56. {skypilot_nightly-1.0.0.dev20251015.dist-info → skypilot_nightly-1.0.0.dev20251017.dist-info}/entry_points.txt +0 -0
  57. {skypilot_nightly-1.0.0.dev20251015.dist-info → skypilot_nightly-1.0.0.dev20251017.dist-info}/licenses/LICENSE +0 -0
  58. {skypilot_nightly-1.0.0.dev20251015.dist-info → skypilot_nightly-1.0.0.dev20251017.dist-info}/top_level.txt +0 -0
@@ -48,6 +48,7 @@ from sky.server.requests import payloads
48
48
  from sky.server.requests import preconditions
49
49
  from sky.server.requests import process
50
50
  from sky.server.requests import requests as api_requests
51
+ from sky.server.requests import threads
51
52
  from sky.server.requests.queues import local_queue
52
53
  from sky.server.requests.queues import mp_queue
53
54
  from sky.skylet import constants
@@ -81,23 +82,28 @@ logger = sky_logging.init_logger(__name__)
81
82
  # platforms, including macOS.
82
83
  multiprocessing.set_start_method('spawn', force=True)
83
84
 
84
- # Max threads that is equivalent to the number of thread workers in the
85
- # default thread pool executor of event loop.
86
- _REQUEST_THREADS_LIMIT = min(32, (os.cpu_count() or 0) + 4)
85
+ # An upper limit of max threads for request execution per server process that
86
+ # is unlikely to be reached, allowing higher concurrency while still
87
+ # preventing the server process from becoming overloaded.
88
+ _REQUEST_THREADS_LIMIT = 128
87
89
 
88
90
  _REQUEST_THREAD_EXECUTOR_LOCK = threading.Lock()
89
- # A dedicated thread pool executor for synced requests execution in coroutine
90
- _REQUEST_THREAD_EXECUTOR: Optional[concurrent.futures.ThreadPoolExecutor] = None
91
+ # A dedicated thread pool executor for synced requests execution in coroutine to
92
+ # avoid:
93
+ # 1. blocking the event loop;
94
+ # 2. exhausting the default thread pool executor of event loop;
95
+ _REQUEST_THREAD_EXECUTOR: Optional[threads.OnDemandThreadExecutor] = None
91
96
 
92
97
 
93
- def get_request_thread_executor() -> concurrent.futures.ThreadPoolExecutor:
98
+ def get_request_thread_executor() -> threads.OnDemandThreadExecutor:
94
99
  """Lazy init and return the request thread executor for current process."""
95
100
  global _REQUEST_THREAD_EXECUTOR
96
101
  if _REQUEST_THREAD_EXECUTOR is not None:
97
102
  return _REQUEST_THREAD_EXECUTOR
98
103
  with _REQUEST_THREAD_EXECUTOR_LOCK:
99
104
  if _REQUEST_THREAD_EXECUTOR is None:
100
- _REQUEST_THREAD_EXECUTOR = concurrent.futures.ThreadPoolExecutor(
105
+ _REQUEST_THREAD_EXECUTOR = threads.OnDemandThreadExecutor(
106
+ name='request_thread_executor',
101
107
  max_workers=_REQUEST_THREADS_LIMIT)
102
108
  return _REQUEST_THREAD_EXECUTOR
103
109
 
@@ -561,6 +567,21 @@ class CoroutineTask:
561
567
  pass
562
568
 
563
569
 
570
+ def check_request_thread_executor_available() -> None:
571
+ """Check if the request thread executor is available.
572
+
573
+ This is a best-effort check to hint the client to retry on other server
574
+ processes when there is no available thread worker in the current one. But
575
+ a request may pass this check and still fail to get a worker at execution
576
+ time due to a race condition. In that case, the client will see a failed
577
+ request instead of a retry.
578
+
579
+ TODO(aylei): this can be refined with a refactor of our coroutine
580
+ execution flow.
581
+ """
582
+ get_request_thread_executor().check_available()
583
+
584
+
564
585
  def execute_request_in_coroutine(
565
586
  request: api_requests.Request) -> CoroutineTask:
566
587
  """Execute a request in current event loop.
@@ -575,6 +596,18 @@ def execute_request_in_coroutine(
575
596
  return CoroutineTask(task)
576
597
 
577
598
 
599
+ def _execute_with_config_override(func: Callable,
600
+ request_body: payloads.RequestBody,
601
+ request_id: str, request_name: str,
602
+ **kwargs) -> Any:
603
+ """Execute a function with env and config override inside a thread."""
604
+ # Override the environment and config within this thread's context,
605
+ # which gets copied when we call to_thread.
606
+ with override_request_env_and_config(request_body, request_id,
607
+ request_name):
608
+ return func(**kwargs)
609
+
610
+
578
611
  async def _execute_request_coroutine(request: api_requests.Request):
579
612
  """Execute a request in current event loop.
580
613
 
@@ -592,14 +625,17 @@ async def _execute_request_coroutine(request: api_requests.Request):
592
625
  request_task.status = api_requests.RequestStatus.RUNNING
593
626
  # Redirect stdout and stderr to the request log path.
594
627
  original_output = ctx.redirect_log(request.log_path)
595
- # Override environment variables that backs env_options.Options
596
- # TODO(aylei): compared to process executor, running task in coroutine has
597
- # two issues to fix:
598
- # 1. skypilot config is not contextual
599
- # 2. envs that read directly from os.environ are not contextual
600
- ctx.override_envs(request_body.env_vars)
601
- fut: asyncio.Future = context_utils.to_thread_with_executor(
602
- get_request_thread_executor(), func, **request_body.to_kwargs())
628
+ try:
629
+ fut: asyncio.Future = context_utils.to_thread_with_executor(
630
+ get_request_thread_executor(), _execute_with_config_override, func,
631
+ request_body, request.request_id, request.name,
632
+ **request_body.to_kwargs())
633
+ except Exception as e: # pylint: disable=broad-except
634
+ ctx.redirect_log(original_output)
635
+ api_requests.set_request_failed(request.request_id, e)
636
+ logger.error(f'Failed to run request {request.request_id} due to '
637
+ f'{common_utils.format_exception(e)}')
638
+ return
603
639
 
604
640
  async def poll_task(request_id: str) -> bool:
605
641
  req_status = await api_requests.get_request_status_async(request_id)
@@ -162,8 +162,8 @@ class ClusterStartCompletePrecondition(Precondition):
162
162
  requests = await api_requests.get_request_tasks_async(
163
163
  req_filter=api_requests.RequestTaskFilter(
164
164
  status=[
165
- api_requests.RequestStatus.RUNNING,
166
- api_requests.RequestStatus.PENDING
165
+ api_requests.RequestStatus.PENDING,
166
+ api_requests.RequestStatus.RUNNING
167
167
  ],
168
168
  include_request_names=['sky.launch', 'sky.start'],
169
169
  cluster_names=[self.cluster_name]))
@@ -398,9 +398,9 @@ def kill_cluster_requests(cluster_name: str, exclude_request_name: str):
398
398
  request_ids = [
399
399
  request_task.request_id
400
400
  for request_task in get_request_tasks(req_filter=RequestTaskFilter(
401
- cluster_names=[cluster_name],
402
401
  status=[RequestStatus.PENDING, RequestStatus.RUNNING],
403
- exclude_request_names=[exclude_request_name]))
402
+ exclude_request_names=[exclude_request_name],
403
+ cluster_names=[cluster_name]))
404
404
  ]
405
405
  kill_requests(request_ids)
406
406
 
@@ -422,10 +422,10 @@ def kill_requests(request_ids: Optional[List[str]] = None,
422
422
  request_ids = [
423
423
  request_task.request_id
424
424
  for request_task in get_request_tasks(req_filter=RequestTaskFilter(
425
- user_id=user_id,
426
- status=[RequestStatus.RUNNING, RequestStatus.PENDING],
425
+ status=[RequestStatus.PENDING, RequestStatus.RUNNING],
427
426
  # Avoid cancelling the cancel request itself.
428
- exclude_request_names=['sky.api_cancel']))
427
+ exclude_request_names=['sky.api_cancel'],
428
+ user_id=user_id))
429
429
  ]
430
430
  cancelled_request_ids = []
431
431
  for request_id in request_ids:
@@ -497,6 +497,21 @@ def create_table(cursor, conn):
497
497
  db_utils.add_column_to_table(cursor, conn, REQUEST_TABLE, COL_FINISHED_AT,
498
498
  'REAL')
499
499
 
500
+ # Add an index on (status, name) to speed up queries
501
+ # that filter on these columns.
502
+ cursor.execute(f"""\
503
+ CREATE INDEX IF NOT EXISTS status_name_idx ON {REQUEST_TABLE} (status, name) WHERE status IN ('PENDING', 'RUNNING');
504
+ """)
505
+ # Add an index on cluster_name to speed up queries
506
+ # that filter on this column.
507
+ cursor.execute(f"""\
508
+ CREATE INDEX IF NOT EXISTS cluster_name_idx ON {REQUEST_TABLE} ({COL_CLUSTER_NAME}) WHERE status IN ('PENDING', 'RUNNING');
509
+ """)
510
+ # Add an index on created_at to speed up queries that sort on this column.
511
+ cursor.execute(f"""\
512
+ CREATE INDEX IF NOT EXISTS created_at_idx ON {REQUEST_TABLE} (created_at);
513
+ """)
514
+
500
515
 
501
516
  _DB = None
502
517
  _init_db_lock = threading.Lock()
@@ -642,6 +657,7 @@ def get_request(request_id: str) -> Optional[Request]:
642
657
  @asyncio_utils.shield
643
658
  async def get_request_async(request_id: str) -> Optional[Request]:
644
659
  """Async version of get_request."""
660
+ # TODO(aylei): figure out how to remove FileLock here to avoid the overhead
645
661
  async with filelock.AsyncFileLock(request_lock_path(request_id)):
646
662
  return await _get_request_no_lock_async(request_id)
647
663
 
@@ -752,6 +768,10 @@ class RequestTaskFilter:
752
768
  status_list_str = ','.join(
753
769
  repr(status.value) for status in self.status)
754
770
  filters.append(f'status IN ({status_list_str})')
771
+ if self.include_request_names is not None:
772
+ request_names_str = ','.join(
773
+ repr(name) for name in self.include_request_names)
774
+ filters.append(f'name IN ({request_names_str})')
755
775
  if self.exclude_request_names is not None:
756
776
  exclude_request_names_str = ','.join(
757
777
  repr(name) for name in self.exclude_request_names)
@@ -763,10 +783,6 @@ class RequestTaskFilter:
763
783
  if self.user_id is not None:
764
784
  filters.append(f'{COL_USER_ID} = ?')
765
785
  filter_params.append(self.user_id)
766
- if self.include_request_names is not None:
767
- request_names_str = ','.join(
768
- repr(name) for name in self.include_request_names)
769
- filters.append(f'name IN ({request_names_str})')
770
786
  if self.finished_before is not None:
771
787
  filters.append('finished_at < ?')
772
788
  filter_params.append(self.finished_before)
@@ -799,6 +815,10 @@ def get_request_tasks(req_filter: RequestTaskFilter) -> List[Request]:
799
815
  rows = cursor.fetchall()
800
816
  if rows is None:
801
817
  return []
818
+ if req_filter.fields:
819
+ rows = [
820
+ _update_request_row_fields(row, req_filter.fields) for row in rows
821
+ ]
802
822
  return [Request.from_row(row) for row in rows]
803
823
 
804
824
 
@@ -811,21 +831,10 @@ async def get_request_tasks_async(
811
831
  async with _DB.execute_fetchall_async(*req_filter.build_query()) as rows:
812
832
  if not rows:
813
833
  return []
814
- return [Request.from_row(row) for row in rows]
815
-
816
-
817
- @init_db_async
818
- @metrics_lib.time_me_async
819
- async def get_request_tasks_with_fields_async(
820
- req_filter: RequestTaskFilter,
821
- fields: Optional[List[str]] = None,
822
- ) -> List[Request]:
823
- """Async version of get_request_tasks."""
824
- assert _DB is not None
825
- async with _DB.execute_fetchall_async(*req_filter.build_query()) as rows:
826
- if not rows:
827
- return []
828
- rows = [_update_request_row_fields(row, fields) for row in rows]
834
+ if req_filter.fields:
835
+ rows = [
836
+ _update_request_row_fields(row, req_filter.fields) for row in rows
837
+ ]
829
838
  return [Request.from_row(row) for row in rows]
830
839
 
831
840
 
@@ -0,0 +1,106 @@
1
+ """Request execution threads management."""
2
+
3
+ import concurrent.futures
4
+ import threading
5
+ from typing import Callable, Set
6
+
7
+ from sky import exceptions
8
+ from sky import sky_logging
9
+ from sky.utils import atomic
10
+
11
+ logger = sky_logging.init_logger(__name__)
12
+
13
+
14
+ class OnDemandThreadExecutor(concurrent.futures.Executor):
15
+ """An executor that creates a new thread for each task and destroys it
16
+ after the task is completed.
17
+
18
+ Note(dev):
19
+ We raise an error instead of queuing the request if the limit is reached, so
20
+ that:
21
+ 1. the request might be handled by other processes that have idle workers
22
+ upon retry;
23
+ 2. if not, then users can be clearly hinted that they need to scale the API
24
+ server to support higher concurrency.
25
+ So this executor is only suitable for carefully selected cases where the
26
+ error can be properly handled by caller. To make this executor general, we
27
+ need to support configuring the queuing behavior (exception or queueing).
28
+ """
29
+
30
+ def __init__(self, name: str, max_workers: int):
31
+ self.name: str = name
32
+ self.max_workers: int = max_workers
33
+ self.running: atomic.AtomicInt = atomic.AtomicInt(0)
34
+ self._shutdown: bool = False
35
+ self._shutdown_lock: threading.Lock = threading.Lock()
36
+ self._threads: Set[threading.Thread] = set()
37
+ self._threads_lock: threading.Lock = threading.Lock()
38
+
39
+ def _cleanup_thread(self, thread: threading.Thread):
40
+ with self._threads_lock:
41
+ self._threads.discard(thread)
42
+
43
+ def _task_wrapper(self, fn: Callable, fut: concurrent.futures.Future, /,
44
+ *args, **kwargs):
45
+ try:
46
+ result = fn(*args, **kwargs)
47
+ fut.set_result(result)
48
+ except Exception as e: # pylint: disable=broad-except
49
+ logger.debug(f'Executor [{self.name}] error executing {fn}: {e}')
50
+ fut.set_exception(e)
51
+ finally:
52
+ self.running.decrement()
53
+ self._cleanup_thread(threading.current_thread())
54
+
55
+ def check_available(self, borrow: bool = False) -> int:
56
+ """Check if there are available workers.
57
+
58
+ Args:
59
+ borrow: If True, the caller borrows a worker from the executor.
60
+ The caller is responsible for returning the worker to the
61
+ executor after the task is completed.
62
+ """
63
+ count = self.running.increment()
64
+ if count > self.max_workers:
65
+ self.running.decrement()
66
+ raise exceptions.ConcurrentWorkerExhaustedError(
67
+ f'Maximum concurrent workers {self.max_workers} of threads '
68
+ f'executor [{self.name}] reached')
69
+ if not borrow:
70
+ self.running.decrement()
71
+ return count
72
+
73
+ def submit(self, fn, /, *args, **kwargs):
74
+ with self._shutdown_lock:
75
+ if self._shutdown:
76
+ raise RuntimeError(
77
+ 'Cannot submit task after executor is shutdown')
78
+ count = self.check_available(borrow=True)
79
+ fut: concurrent.futures.Future = concurrent.futures.Future()
80
+ # Name is assigned for debugging purpose, duplication is fine
81
+ thread = threading.Thread(target=self._task_wrapper,
82
+ name=f'{self.name}-{count}',
83
+ args=(fn, fut, *args),
84
+ kwargs=kwargs,
85
+ daemon=True)
86
+ with self._threads_lock:
87
+ self._threads.add(thread)
88
+ try:
89
+ thread.start()
90
+ except Exception as e:
91
+ self.running.decrement()
92
+ self._cleanup_thread(thread)
93
+ fut.set_exception(e)
94
+ raise
95
+ assert thread.ident is not None, 'Thread should be started'
96
+ return fut
97
+
98
+ def shutdown(self, wait=True):
99
+ with self._shutdown_lock:
100
+ self._shutdown = True
101
+ if not wait:
102
+ return
103
+ with self._threads_lock:
104
+ threads = list(self._threads)
105
+ for t in threads:
106
+ t.join()
sky/server/rest.py CHANGED
@@ -178,14 +178,16 @@ def _retry_on_server_unavailable(max_wait_seconds: int = 600,
178
178
  Notes(dev):
179
179
  """
180
180
 
181
+ def _readable_error_msg(message: str) -> str:
182
+ return (f'{colorama.Fore.YELLOW}API server is temporarily '
183
+ f'unavailable: {message}.\nRetrying...'
184
+ f'{colorama.Style.RESET_ALL}')
185
+
181
186
  def decorator(func: F) -> F:
182
187
 
183
188
  @functools.wraps(func)
184
189
  def wrapper(*args, **kwargs) -> Any:
185
- msg = (
186
- f'{colorama.Fore.YELLOW}API server is temporarily unavailable: '
187
- 'upgrade in progress. Waiting to resume...'
188
- f'{colorama.Style.RESET_ALL}')
190
+
189
191
  backoff = common_utils.Backoff(
190
192
  initial_backoff=initial_backoff,
191
193
  max_backoff_factor=max_backoff_factor)
@@ -203,7 +205,8 @@ def _retry_on_server_unavailable(max_wait_seconds: int = 600,
203
205
  # stop the status spinner before retrying func() to
204
206
  # avoid the status spinner get stuck if the func() runs
205
207
  # for a long time without update status, e.g. sky logs.
206
- with rich_utils.client_status(msg):
208
+ with rich_utils.client_status(
209
+ _readable_error_msg(e.message)):
207
210
  if time.time() - start_time > max_wait_seconds:
208
211
  # pylint: disable=line-too-long
209
212
  raise exceptions.ServerTemporarilyUnavailableError(
@@ -224,14 +227,33 @@ def _retry_on_server_unavailable(max_wait_seconds: int = 600,
224
227
 
225
228
 
226
229
  def handle_server_unavailable(response: 'requests.Response') -> None:
227
- if response.status_code == 503:
228
- # TODO(aylei): Hacky, depends on how nginx controller handles backends
229
- # with no ready endpoints. Should use self-defined status code or header
230
- # to distinguish retryable server error from general 503 errors.
231
- with ux_utils.print_exception_no_traceback():
232
- raise exceptions.ServerTemporarilyUnavailableError(
233
- 'SkyPilot API server is temporarily unavailable. '
234
- 'Please try again later.')
230
+ """Handle 503 (Service Unavailable) error
231
+
232
+ The client gets a 503 error in the following cases:
233
+ 1. The reverse proxy cannot find any ready backend endpoints to serve the
234
+ request, e.g. when there is a rolling update.
235
+ 2. The skypilot API server has a temporary resource issue, e.g. when the
236
+ concurrency of the handling process is exhausted.
237
+
238
+ We expect the caller (CLI or SDK) to retry in these cases and show a clear
239
+ wait message so that the user can decide whether to keep waiting or abort
240
+ the request.
241
+ """
242
+ if response.status_code != 503:
243
+ return
244
+
245
+ # error_msg = 'SkyPilot API server is temporarily unavailable. '
246
+ error_msg = ''
247
+ try:
248
+ response_data = response.json()
249
+ if 'detail' in response_data:
250
+ error_msg = response_data['detail']
251
+ except Exception: # pylint: disable=broad-except
252
+ if response.text:
253
+ error_msg = response.text
254
+
255
+ with ux_utils.print_exception_no_traceback():
256
+ raise exceptions.ServerTemporarilyUnavailableError(error_msg)
235
257
 
236
258
 
237
259
  @_retry_on_server_unavailable()
@@ -310,11 +332,7 @@ async def request_without_retry_async(session: 'aiohttp.ClientSession',
310
332
  response = await session.request(method, url, **kwargs)
311
333
 
312
334
  # Handle server unavailability (503 status) - same as sync version
313
- if response.status == 503:
314
- with ux_utils.print_exception_no_traceback():
315
- raise exceptions.ServerTemporarilyUnavailableError(
316
- 'SkyPilot API server is temporarily unavailable. '
317
- 'Please try again later.')
335
+ handle_server_unavailable(response)
318
336
 
319
337
  # Set remote API version and version from headers - same as sync version
320
338
  remote_api_version = response.headers.get(constants.API_VERSION_HEADER)
sky/server/server.py CHANGED
@@ -17,6 +17,7 @@ import resource
17
17
  import shutil
18
18
  import sys
19
19
  import threading
20
+ import traceback
20
21
  from typing import Dict, List, Literal, Optional, Set, Tuple
21
22
  import uuid
22
23
  import zipfile
@@ -74,6 +75,7 @@ from sky.utils import dag_utils
74
75
  from sky.utils import perf_utils
75
76
  from sky.utils import status_lib
76
77
  from sky.utils import subprocess_utils
78
+ from sky.utils import ux_utils
77
79
  from sky.utils.db import db_utils
78
80
  from sky.volumes.server import server as volumes_rest
79
81
  from sky.workspaces import server as workspaces_rest
@@ -664,6 +666,25 @@ except Exception: # pylint: disable=broad-except
664
666
  pass # no issue, we will warn the user later if its too low
665
667
 
666
668
 
669
+ @app.exception_handler(exceptions.ConcurrentWorkerExhaustedError)
670
+ def handle_concurrent_worker_exhausted_error(
671
+ request: fastapi.Request, e: exceptions.ConcurrentWorkerExhaustedError):
672
+ del request # request is not used
673
+ # Print detailed error message to server log
674
+ logger.error('Concurrent worker exhausted: '
675
+ f'{common_utils.format_exception(e)}')
676
+ with ux_utils.enable_traceback():
677
+ logger.error(f' Traceback: {traceback.format_exc()}')
678
+ # Return human readable error message to client
679
+ return fastapi.responses.JSONResponse(
680
+ status_code=503,
681
+ content={
682
+ 'detail':
683
+ ('The server has exhausted its concurrent worker limit. '
684
+ 'Please try again or scale the server if the load persists.')
685
+ })
686
+
687
+
667
688
  @app.get('/token')
668
689
  async def token(request: fastapi.Request,
669
690
  local_port: Optional[int] = None) -> fastapi.responses.Response:
@@ -1232,6 +1253,7 @@ async def logs(
1232
1253
  # TODO(zhwu): This should wait for the request on the cluster, e.g., async
1233
1254
  # launch, to finish, so that a user does not need to manually pull the
1234
1255
  # request status.
1256
+ executor.check_request_thread_executor_available()
1235
1257
  request_task = executor.prepare_request(
1236
1258
  request_id=request.state.request_id,
1237
1259
  request_name='logs',
@@ -1466,6 +1488,8 @@ async def api_get(request_id: str) -> payloads.RequestPayload:
1466
1488
  # to avoid storming the DB and CPU in the meantime
1467
1489
  await asyncio.sleep(0.1)
1468
1490
  request_task = await requests_lib.get_request_async(request_id)
1491
+ # TODO(aylei): refine this, /api/get will not be retried and this is
1492
+ # meaningless to retry. It is the original request that should be retried.
1469
1493
  if request_task.should_retry:
1470
1494
  raise fastapi.HTTPException(
1471
1495
  status_code=503, detail=f'Request {request_id!r} should be retried')
@@ -1643,14 +1667,12 @@ async def api_status(
1643
1667
  requests_lib.RequestStatus.PENDING,
1644
1668
  requests_lib.RequestStatus.RUNNING,
1645
1669
  ]
1646
- request_tasks = await requests_lib.get_request_tasks_with_fields_async(
1670
+ request_tasks = await requests_lib.get_request_tasks_async(
1647
1671
  req_filter=requests_lib.RequestTaskFilter(
1648
1672
  status=statuses,
1649
1673
  limit=limit,
1650
1674
  fields=fields,
1651
- ),
1652
- fields=fields,
1653
- )
1675
+ ))
1654
1676
  return requests_lib.encode_requests(request_tasks)
1655
1677
  else:
1656
1678
  encoded_request_tasks = []
@@ -215,11 +215,18 @@ async def _tail_log_file(
215
215
  # periodically to see if provisioning is done.
216
216
  if cluster_name is not None and should_check_status:
217
217
  last_status_check_time = current_time
218
- cluster_record = await (
218
+ cluster_status = await (
219
219
  global_user_state.get_status_from_cluster_name_async(
220
220
  cluster_name))
221
- if (cluster_record is None or
222
- cluster_record != status_lib.ClusterStatus.INIT):
221
+ if cluster_status is None:
222
+ logger.debug(
223
+ 'Stop tailing provision logs for cluster'
224
+ f' status for cluster {cluster_name} not found')
225
+ break
226
+ if cluster_status != status_lib.ClusterStatus.INIT:
227
+ logger.debug(f'Stop tailing provision logs for cluster'
228
+ f' {cluster_name} has status {cluster_status} '
229
+ '(not in INIT state)')
223
230
  break
224
231
  if current_time - last_heartbeat_time >= _HEARTBEAT_INTERVAL:
225
232
  # Currently just used to keep the connection busy, refer to
@@ -86,7 +86,6 @@ install_requires = [
86
86
  'types-paramiko',
87
87
  'alembic',
88
88
  'aiohttp',
89
- 'aiosqlite',
90
89
  'anyio',
91
90
  ]
92
91
 
@@ -104,6 +103,10 @@ GRPC = 'grpcio>=1.63.0'
104
103
  PROTOBUF = 'protobuf>=5.26.1, < 7.0.0'
105
104
 
106
105
  server_dependencies = [
106
+ # TODO: Some of these dependencies are also specified in install_requires,
107
+ # so they are redundant here. We should figure out if they are only needed
108
+ # on the server (should remove from install_requires), or if they are needed
109
+ # on the client (should remove from here).
107
110
  'casbin',
108
111
  'sqlalchemy_adapter',
109
112
  'passlib',
@@ -148,7 +151,7 @@ aws_dependencies = [
148
151
  # a few places.
149
152
  AZURE_CLI = 'azure-cli>=2.65.0'
150
153
 
151
- extras_require: Dict[str, List[str]] = {
154
+ cloud_dependencies: Dict[str, List[str]] = {
152
155
  'aws': aws_dependencies,
153
156
  # TODO(zongheng): azure-cli is huge and takes a long time to install.
154
157
  # Tracked in: https://github.com/Azure/azure-cli/issues/7387
@@ -191,7 +194,6 @@ extras_require: Dict[str, List[str]] = {
191
194
  'kubernetes>=20.0.0,!=32.0.0', 'websockets', 'python-dateutil'
192
195
  ],
193
196
  'ssh': ['kubernetes>=20.0.0,!=32.0.0', 'websockets', 'python-dateutil'],
194
- 'remote': remote,
195
197
  # For the container registry auth api. Reference:
196
198
  # https://github.com/runpod/runpod-python/releases/tag/1.6.1
197
199
  # RunPod needs a TOML parser to read ~/.runpod/config.toml. On Python 3.11+
@@ -221,13 +223,11 @@ extras_require: Dict[str, List[str]] = {
221
223
  ] + aws_dependencies,
222
224
  'hyperbolic': [], # No dependencies needed for hyperbolic
223
225
  'seeweb': ['ecsapi>=0.2.0'],
224
- 'server': server_dependencies,
225
226
  'shadeform': [], # No dependencies needed for shadeform
226
227
  }
227
228
 
228
229
  # Calculate which clouds should be included in the [all] installation.
229
- clouds_for_all = set(extras_require)
230
- clouds_for_all.remove('remote')
230
+ clouds_for_all = set(cloud_dependencies)
231
231
 
232
232
  if sys.version_info < (3, 10):
233
233
  # Nebius needs python3.10. If python 3.9 [all] will not install nebius
@@ -242,5 +242,16 @@ if sys.version_info >= (3, 12):
242
242
  # TODO: Remove once https://github.com/vast-ai/vast-sdk/pull/6 is released
243
243
  clouds_for_all.remove('vast')
244
244
 
245
- extras_require['all'] = list(
246
- set().union(*[extras_require[cloud] for cloud in clouds_for_all]))
245
+ cloud_extras = {
246
+ cloud: dependencies + server_dependencies
247
+ for cloud, dependencies in cloud_dependencies.items()
248
+ }
249
+
250
+ extras_require: Dict[str, List[str]] = {
251
+ # Include server_dependencies with each cloud.
252
+ **cloud_extras,
253
+ 'all': list(set().union(*[cloud_extras[cloud] for cloud in clouds_for_all])
254
+ ),
255
+ 'remote': remote,
256
+ 'server': server_dependencies,
257
+ }
sky/skylet/constants.py CHANGED
@@ -100,7 +100,7 @@ TASK_ID_LIST_ENV_VAR = f'{SKYPILOT_ENV_VAR_PREFIX}TASK_IDS'
100
100
  # cluster yaml is updated.
101
101
  #
102
102
  # TODO(zongheng,zhanghao): make the upgrading of skylet automatic?
103
- SKYLET_VERSION = '22'
103
+ SKYLET_VERSION = '23'
104
104
  # The version of the lib files that skylet/jobs use. Whenever there is an API
105
105
  # change for the job_lib or log_lib, we need to bump this version, so that the
106
106
  # user can be notified to update their SkyPilot version on the remote cluster.
sky/skylet/services.py CHANGED
@@ -216,10 +216,12 @@ class JobsServiceImpl(jobsv1_pb2_grpc.JobsServiceServicer):
216
216
  if pool is not None:
217
217
  pool_hash = serve_state.get_service_hash(pool)
218
218
  # Add the managed job to job queue database.
219
+ user_id = managed_job.user_id if managed_job.HasField(
220
+ 'user_id') else None
219
221
  managed_job_state.set_job_info(job_id, managed_job.name,
220
222
  managed_job.workspace,
221
223
  managed_job.entrypoint, pool,
222
- pool_hash)
224
+ pool_hash, user_id)
223
225
  # Set the managed job to PENDING state to make sure that
224
226
  # this managed job appears in the `sky jobs queue`, even
225
227
  # if it needs to wait to be submitted.