skypilot-nightly 1.0.0.dev20250311__py3-none-any.whl → 1.0.0.dev20250312__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sky/__init__.py CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
5
5
  import urllib.request
6
6
 
7
7
  # Replaced with the current commit when building the wheels.
8
- _SKYPILOT_COMMIT_SHA = '439de1a24a6f0a9601051ecdc3e565308bac442a'
8
+ _SKYPILOT_COMMIT_SHA = '78a42b6e733bbc29b68efe0e9c79191eaaca9fcd'
9
9
 
10
10
 
11
11
  def _get_git_commit():
@@ -35,7 +35,7 @@ def _get_git_commit():
35
35
 
36
36
 
37
37
  __commit__ = _get_git_commit()
38
- __version__ = '1.0.0.dev20250311'
38
+ __version__ = '1.0.0.dev20250312'
39
39
  __root_dir__ = os.path.dirname(os.path.abspath(__file__))
40
40
 
41
41
 
sky/adaptors/nebius.py CHANGED
@@ -6,9 +6,11 @@ from sky.adaptors import common
6
6
  NEBIUS_TENANT_ID_FILENAME = 'NEBIUS_TENANT_ID.txt'
7
7
  NEBIUS_IAM_TOKEN_FILENAME = 'NEBIUS_IAM_TOKEN.txt'
8
8
  NEBIUS_PROJECT_ID_FILENAME = 'NEBIUS_PROJECT_ID.txt'
9
+ NEBIUS_CREDENTIALS_FILENAME = 'credentials.json'
9
10
  NEBIUS_TENANT_ID_PATH = '~/.nebius/' + NEBIUS_TENANT_ID_FILENAME
10
11
  NEBIUS_IAM_TOKEN_PATH = '~/.nebius/' + NEBIUS_IAM_TOKEN_FILENAME
11
12
  NEBIUS_PROJECT_ID_PATH = '~/.nebius/' + NEBIUS_PROJECT_ID_FILENAME
13
+ NEBIUS_CREDENTIALS_PATH = '~/.nebius/' + NEBIUS_CREDENTIALS_FILENAME
12
14
 
13
15
  MAX_RETRIES_TO_DISK_CREATE = 120
14
16
  MAX_RETRIES_TO_INSTANCE_STOP = 120
@@ -72,6 +74,11 @@ def get_iam_token():
72
74
  return _iam_token
73
75
 
74
76
 
77
+ def is_token_or_cred_file_exist():
78
+ return (os.path.exists(os.path.expanduser(NEBIUS_IAM_TOKEN_PATH)) or
79
+ os.path.exists(os.path.expanduser(NEBIUS_CREDENTIALS_PATH)))
80
+
81
+
75
82
  def get_project_id():
76
83
  global _project_id
77
84
  if _project_id is None:
@@ -97,4 +104,7 @@ def get_tenant_id():
97
104
 
98
105
 
99
106
  def sdk():
100
- return nebius.sdk.SDK(credentials=get_iam_token())
107
+ if get_iam_token() is not None:
108
+ return nebius.sdk.SDK(credentials=get_iam_token())
109
+ return nebius.sdk.SDK(
110
+ credentials_file_name=os.path.expanduser(NEBIUS_CREDENTIALS_PATH))
@@ -1802,6 +1802,21 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
1802
1802
  status == status_lib.ClusterStatus.UP for status in node_statuses) and
1803
1803
  len(node_statuses) == handle.launched_nodes)
1804
1804
 
1805
+ def get_node_counts_from_ray_status(
1806
+ runner: command_runner.CommandRunner) -> Tuple[int, int, str, str]:
1807
+ rc, output, stderr = runner.run(
1808
+ instance_setup.RAY_STATUS_WITH_SKY_RAY_PORT_COMMAND,
1809
+ stream_logs=False,
1810
+ require_outputs=True,
1811
+ separate_stderr=True)
1812
+ if rc:
1813
+ raise RuntimeError(
1814
+ f'Refreshing status ({cluster_name!r}): Failed to check '
1815
+ f'ray cluster\'s healthiness with '
1816
+ f'{instance_setup.RAY_STATUS_WITH_SKY_RAY_PORT_COMMAND}.\n'
1817
+ f'-- stdout --\n{output}\n-- stderr --\n{stderr}')
1818
+ return (*_count_healthy_nodes_from_ray(output), output, stderr)
1819
+
1805
1820
  def run_ray_status_to_check_ray_cluster_healthy() -> bool:
1806
1821
  try:
1807
1822
  # NOTE: fetching the IPs is very slow as it calls into
@@ -1822,26 +1837,34 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
1822
1837
  raise exceptions.FetchClusterInfoError(
1823
1838
  reason=exceptions.FetchClusterInfoError.Reason.HEAD)
1824
1839
  head_runner = runners[0]
1825
- rc, output, stderr = head_runner.run(
1826
- instance_setup.RAY_STATUS_WITH_SKY_RAY_PORT_COMMAND,
1827
- stream_logs=False,
1828
- require_outputs=True,
1829
- separate_stderr=True)
1830
- if rc:
1831
- raise RuntimeError(
1832
- f'Refreshing status ({cluster_name!r}): Failed to check '
1833
- f'ray cluster\'s healthiness with '
1834
- f'{instance_setup.RAY_STATUS_WITH_SKY_RAY_PORT_COMMAND}.\n'
1835
- f'-- stdout --\n{output}\n-- stderr --\n{stderr}')
1836
1840
 
1837
- ready_head, ready_workers = _count_healthy_nodes_from_ray(output)
1838
1841
  total_nodes = handle.launched_nodes * handle.num_ips_per_node
1839
- if ready_head + ready_workers == total_nodes:
1840
- return True
1842
+
1843
+ for i in range(5):
1844
+ ready_head, ready_workers, output, stderr = (
1845
+ get_node_counts_from_ray_status(head_runner))
1846
+ if ready_head + ready_workers == total_nodes:
1847
+ return True
1848
+ logger.debug(f'Refreshing status ({cluster_name!r}) attempt '
1849
+ f'{i}: ray status not showing all nodes '
1850
+ f'({ready_head + ready_workers}/{total_nodes});\n'
1851
+ f'output:\n{output}\nstderr:\n{stderr}')
1852
+
1853
+ # If cluster JUST started, maybe not all the nodes have shown
1854
+ # up. Try again for a few seconds.
1855
+ # Note: We are okay with this performance hit because it's very
1856
+ # rare to normally hit this case. It requires:
1857
+ # - All the instances in the cluster are up on the cloud side
1858
+ # (not preempted), but
1859
+ # - The ray cluster is somehow degraded so not all instances are
1860
+ # showing up
1861
+ time.sleep(1)
1862
+
1841
1863
  raise RuntimeError(
1842
1864
  f'Refreshing status ({cluster_name!r}): ray status not showing '
1843
1865
  f'all nodes ({ready_head + ready_workers}/'
1844
- f'{total_nodes}); output: {output}; stderr: {stderr}')
1866
+ f'{total_nodes});\noutput:\n{output}\nstderr:\n{stderr}')
1867
+
1845
1868
  except exceptions.FetchClusterInfoError:
1846
1869
  logger.debug(
1847
1870
  f'Refreshing status ({cluster_name!r}) failed to get IPs.')
@@ -772,32 +772,6 @@ class FailoverCloudErrorHandlerV1:
772
772
  setattr(e, 'detailed_reason', detailed_reason)
773
773
  raise e
774
774
 
775
- @staticmethod
776
- def _lambda_handler(blocked_resources: Set['resources_lib.Resources'],
777
- launchable_resources: 'resources_lib.Resources',
778
- region: 'clouds.Region',
779
- zones: Optional[List['clouds.Zone']], stdout: str,
780
- stderr: str):
781
- del region, zones # Unused.
782
- errors = FailoverCloudErrorHandlerV1._handle_errors(
783
- stdout,
784
- stderr,
785
- is_error_str_known=lambda x: 'LambdaCloudError:' in x.strip())
786
- messages = '\n '.join(errors)
787
- style = colorama.Style
788
- logger.warning(f' {style.DIM}{messages}{style.RESET_ALL}')
789
- _add_to_blocked_resources(blocked_resources,
790
- launchable_resources.copy(zone=None))
791
-
792
- # Sometimes, LambdaCloudError will list available regions.
793
- for e in errors:
794
- if e.find('Regions with capacity available:') != -1:
795
- for r in service_catalog.regions('lambda'):
796
- if e.find(r.name) == -1:
797
- _add_to_blocked_resources(
798
- blocked_resources,
799
- launchable_resources.copy(region=r.name, zone=None))
800
-
801
775
  @staticmethod
802
776
  def _scp_handler(blocked_resources: Set['resources_lib.Resources'],
803
777
  launchable_resources: 'resources_lib.Resources',
@@ -846,32 +820,6 @@ class FailoverCloudErrorHandlerV1:
846
820
  _add_to_blocked_resources(blocked_resources,
847
821
  launchable_resources.copy(zone=zone.name))
848
822
 
849
- # Apr, 2023 by Hysun(hysun.he@oracle.com): Added support for OCI
850
- @staticmethod
851
- def _oci_handler(blocked_resources: Set['resources_lib.Resources'],
852
- launchable_resources: 'resources_lib.Resources',
853
- region: 'clouds.Region',
854
- zones: Optional[List['clouds.Zone']], stdout: str,
855
- stderr: str):
856
- known_service_errors = [
857
- 'NotAuthorizedOrNotFound', 'CannotParseRequest', 'InternalError',
858
- 'LimitExceeded', 'NotAuthenticated'
859
- ]
860
- errors = FailoverCloudErrorHandlerV1._handle_errors(
861
- stdout, stderr, lambda x: 'VcnSubnetNotFound' in x.strip() or
862
- ('oci.exceptions.ServiceError' in x.strip() and any(
863
- known_err in x.strip() for known_err in known_service_errors)))
864
- logger.warning(f'Got error(s) in {region.name}:')
865
- messages = '\n\t'.join(errors)
866
- style = colorama.Style
867
- logger.warning(f'{style.DIM}\t{messages}{style.RESET_ALL}')
868
-
869
- if zones is not None:
870
- for zone in zones:
871
- _add_to_blocked_resources(
872
- blocked_resources,
873
- launchable_resources.copy(zone=zone.name))
874
-
875
823
  @staticmethod
876
824
  def update_blocklist_on_error(
877
825
  blocked_resources: Set['resources_lib.Resources'],
@@ -1123,6 +1071,23 @@ class FailoverCloudErrorHandlerV2:
1123
1071
  blocked_resources,
1124
1072
  launchable_resources.copy(zone=zone.name))
1125
1073
 
1074
+ @staticmethod
1075
+ def _lambda_handler(blocked_resources: Set['resources_lib.Resources'],
1076
+ launchable_resources: 'resources_lib.Resources',
1077
+ region: 'clouds.Region',
1078
+ zones: Optional[List['clouds.Zone']], error: Exception):
1079
+ output = str(error)
1080
+ # Sometimes, lambda cloud error will list available regions.
1081
+ if output.find('Regions with capacity available:') != -1:
1082
+ for r in service_catalog.regions('lambda'):
1083
+ if output.find(r.name) == -1:
1084
+ _add_to_blocked_resources(
1085
+ blocked_resources,
1086
+ launchable_resources.copy(region=r.name, zone=None))
1087
+ else:
1088
+ FailoverCloudErrorHandlerV2._default_handler(
1089
+ blocked_resources, launchable_resources, region, zones, error)
1090
+
1126
1091
  @staticmethod
1127
1092
  def _default_handler(blocked_resources: Set['resources_lib.Resources'],
1128
1093
  launchable_resources: 'resources_lib.Resources',
sky/clouds/nebius.py CHANGED
@@ -17,6 +17,7 @@ _CREDENTIAL_FILES = [
17
17
  nebius.NEBIUS_TENANT_ID_FILENAME,
18
18
  nebius.NEBIUS_IAM_TOKEN_FILENAME,
19
19
  nebius.NEBIUS_PROJECT_ID_FILENAME,
20
+ nebius.NEBIUS_CREDENTIALS_FILENAME
20
21
  ]
21
22
 
22
23
 
@@ -252,15 +253,16 @@ class Nebius(clouds.Cloud):
252
253
  def check_credentials(cls) -> Tuple[bool, Optional[str]]:
253
254
  """ Verify that the user has valid credentials for Nebius. """
254
255
  logging.debug('Nebius cloud check credentials')
255
- token = nebius.get_iam_token()
256
- token_msg = (' Credentials can be set up by running: \n'\
257
- f' $ nebius iam get-access-token > {nebius.NEBIUS_IAM_TOKEN_PATH} \n') # pylint: disable=line-too-long
256
+ token_cred_msg = (' Credentials can be set up by running: \n'\
257
+ f' $ nebius iam get-access-token > {nebius.NEBIUS_IAM_TOKEN_PATH} \n'\
258
+ ' or generate ~/.nebius/credentials.json') # pylint: disable=line-too-long
259
+
258
260
  tenant_msg = (' Copy your tenat ID from the web console and save it to file \n' # pylint: disable=line-too-long
259
261
  f' $ echo $NEBIUS_TENANT_ID_PATH > {nebius.NEBIUS_TENANT_ID_PATH} \n' # pylint: disable=line-too-long
260
262
  ' Or if you have 1 tenant you can run:\n' # pylint: disable=line-too-long
261
263
  f' $ nebius --format json iam whoami|jq -r \'.user_profile.tenants[0].tenant_id\' > {nebius.NEBIUS_TENANT_ID_PATH} \n') # pylint: disable=line-too-long
262
- if token is None:
263
- return False, f'{token_msg}'
264
+ if not nebius.is_token_or_cred_file_exist():
265
+ return False, f'{token_cred_msg}'
264
266
  sdk = nebius.sdk()
265
267
  tenant_id = nebius.get_tenant_id()
266
268
  if tenant_id is None:
@@ -272,7 +274,7 @@ class Nebius(clouds.Cloud):
272
274
  except nebius.request_error() as e:
273
275
  return False, (
274
276
  f'{e.status} \n' # First line is indented by 4 spaces
275
- f'{token_msg}'
277
+ f'{token_cred_msg}'
276
278
  f'{tenant_msg}')
277
279
  return True, None
278
280
 
sky/exceptions.py CHANGED
@@ -28,12 +28,19 @@ GIT_FATAL_EXIT_CODE = 128
28
28
  ARCH_NOT_SUPPORTED_EXIT_CODE = 133
29
29
 
30
30
 
31
- def is_safe_exception(exc: Exception) -> bool:
31
+ def is_safe_exception(exc: BaseException) -> bool:
32
32
  """Returns True if the exception is safe to send to clients.
33
33
 
34
34
  Safe exceptions are:
35
35
  1. Built-in exceptions
36
36
  2. SkyPilot's own exceptions
37
+
38
+ Args:
39
+ exc: The exception to check, accept BaseException to handle SystemExit
40
+ and KeyboardInterrupt.
41
+
42
+ Returns:
43
+ True if the exception is safe to send to clients, False otherwise.
37
44
  """
38
45
  module = type(exc).__module__
39
46
 
@@ -48,7 +55,7 @@ def is_safe_exception(exc: Exception) -> bool:
48
55
  return False
49
56
 
50
57
 
51
- def wrap_exception(exc: Exception) -> Exception:
58
+ def wrap_exception(exc: BaseException) -> BaseException:
52
59
  """Wraps non-safe exceptions into SkyPilot exceptions
53
60
 
54
61
  This is used to wrap exceptions that are not safe to deserialize at clients.
@@ -64,7 +71,8 @@ def wrap_exception(exc: Exception) -> Exception:
64
71
  error_type=type(exc).__name__)
65
72
 
66
73
 
67
- def serialize_exception(e: Exception) -> Dict[str, Any]:
74
+ # Accept BaseException to handle SystemExit and KeyboardInterrupt
75
+ def serialize_exception(e: BaseException) -> Dict[str, Any]:
68
76
  """Serialize the exception.
69
77
 
70
78
  This function also wraps any unsafe exceptions (e.g., cloud exceptions)
@@ -853,7 +853,7 @@ def get_accelerator_label_key_value(
853
853
  for label, value in label_list:
854
854
  if (label_formatter.match_label_key(label) and
855
855
  label_formatter.get_accelerator_from_label_value(
856
- value) == acc_type):
856
+ value).lower() == acc_type.lower()):
857
857
  if is_tpu_on_gke(acc_type):
858
858
  assert isinstance(label_formatter,
859
859
  GKELabelFormatter)
@@ -0,0 +1,31 @@
1
+ """Executor event loop to process tasks in coroutines."""
2
+ import asyncio
3
+ import concurrent.futures
4
+ import threading
5
+ from typing import Coroutine, Optional
6
+
7
+ # Dedicated event loop for requests, isolated with the event loop managed
8
+ # by uvicorn. This is responsible for light-weight async tasks or sub-tasks,
9
+ # refer to `executor.py` for more details about cooperation between the event
10
+ # loop and executor process pool.
11
+ _EVENT_LOOP: Optional[asyncio.AbstractEventLoop] = None
12
+ _LOCK = threading.Lock()
13
+
14
+
15
+ def run(coro: Coroutine) -> concurrent.futures.Future:
16
+ """Run a coroutine asynchronously in the request event loop."""
17
+ return asyncio.run_coroutine_threadsafe(coro, get_event_loop())
18
+
19
+
20
+ def get_event_loop() -> asyncio.AbstractEventLoop:
21
+ """Open and get the event loop."""
22
+ global _EVENT_LOOP
23
+ if _EVENT_LOOP is not None and not _EVENT_LOOP.is_closed():
24
+ return _EVENT_LOOP
25
+ with _LOCK:
26
+ if _EVENT_LOOP is None or _EVENT_LOOP.is_closed():
27
+ _EVENT_LOOP = asyncio.new_event_loop()
28
+ loop_thread = threading.Thread(target=_EVENT_LOOP.run_forever,
29
+ daemon=True)
30
+ loop_thread.start()
31
+ return _EVENT_LOOP
@@ -27,8 +27,8 @@ import os
27
27
  import queue as queue_lib
28
28
  import signal
29
29
  import sys
30
+ import threading
30
31
  import time
31
- import traceback
32
32
  import typing
33
33
  from typing import Any, Callable, Generator, List, Optional, TextIO, Tuple
34
34
 
@@ -41,11 +41,13 @@ from sky import skypilot_config
41
41
  from sky.server import common as server_common
42
42
  from sky.server import constants as server_constants
43
43
  from sky.server.requests import payloads
44
+ from sky.server.requests import preconditions
44
45
  from sky.server.requests import requests as api_requests
45
46
  from sky.server.requests.queues import mp_queue
46
47
  from sky.skylet import constants
47
48
  from sky.utils import annotations
48
49
  from sky.utils import common_utils
50
+ from sky.utils import subprocess_utils
49
51
  from sky.utils import timeline
50
52
  from sky.utils import ux_utils
51
53
 
@@ -262,13 +264,7 @@ def _request_execution_wrapper(request_id: str,
262
264
  _restore_output(original_stdout, original_stderr)
263
265
  return
264
266
  except (Exception, SystemExit) as e: # pylint: disable=broad-except
265
- with ux_utils.enable_traceback():
266
- stacktrace = traceback.format_exc()
267
- setattr(e, 'stacktrace', stacktrace)
268
- with api_requests.update_request(request_id) as request_task:
269
- assert request_task is not None, request_id
270
- request_task.status = api_requests.RequestStatus.FAILED
271
- request_task.set_error(e)
267
+ api_requests.set_request_failed(request_id, e)
272
268
  _restore_output(original_stdout, original_stderr)
273
269
  logger.info(f'Request {request_id} failed due to '
274
270
  f'{common_utils.format_exception(e)}')
@@ -283,16 +279,37 @@ def _request_execution_wrapper(request_id: str,
283
279
  logger.info(f'Request {request_id} finished')
284
280
 
285
281
 
286
- def schedule_request(request_id: str,
287
- request_name: str,
288
- request_body: payloads.RequestBody,
289
- func: Callable[P, Any],
290
- request_cluster_name: Optional[str] = None,
291
- ignore_return_value: bool = False,
292
- schedule_type: api_requests.ScheduleType = api_requests.
293
- ScheduleType.LONG,
294
- is_skypilot_system: bool = False) -> None:
295
- """Enqueue a request to the request queue."""
282
+ def schedule_request(
283
+ request_id: str,
284
+ request_name: str,
285
+ request_body: payloads.RequestBody,
286
+ func: Callable[P, Any],
287
+ request_cluster_name: Optional[str] = None,
288
+ ignore_return_value: bool = False,
289
+ schedule_type: api_requests.ScheduleType = (
290
+ api_requests.ScheduleType.LONG),
291
+ is_skypilot_system: bool = False,
292
+ precondition: Optional[preconditions.Precondition] = None) -> None:
293
+ """Enqueue a request to the request queue.
294
+
295
+ Args:
296
+ request_id: ID of the request.
297
+ request_name: Name of the request type, e.g. "sky.launch".
298
+ request_body: The request body containing parameters and environment
299
+ variables.
300
+ func: The function to execute when the request is processed.
301
+ request_cluster_name: The name of the cluster associated with this
302
+ request, if any.
303
+ ignore_return_value: If True, the return value of the function will be
304
+ ignored.
305
+ schedule_type: The type of scheduling to use for this request, refer to
306
+ `api_requests.ScheduleType` for more details.
307
+ is_skypilot_system: Denote whether the request is from SkyPilot system.
308
+ precondition: If a precondition is provided, the request will only be
309
+ scheduled for execution when the precondition is met (returns True).
310
+ The precondition is waited asynchronously and does not block the
311
+ caller.
312
+ """
296
313
  user_id = request_body.env_vars[constants.USER_ID_ENV_VAR]
297
314
  if is_skypilot_system:
298
315
  user_id = server_constants.SKYPILOT_SYSTEM_USER_ID
@@ -314,10 +331,17 @@ def schedule_request(request_id: str,
314
331
  return
315
332
 
316
333
  request.log_path.touch()
317
- input_tuple = (request_id, ignore_return_value)
318
334
 
319
- logger.info(f'Queuing request: {request_id}')
320
- _get_queue(schedule_type).put(input_tuple)
335
+ def enqueue():
336
+ input_tuple = (request_id, ignore_return_value)
337
+ logger.info(f'Queuing request: {request_id}')
338
+ _get_queue(schedule_type).put(input_tuple)
339
+
340
+ if precondition is not None:
341
+ # Wait async to avoid blocking caller.
342
+ precondition.wait_async(on_condition_met=enqueue)
343
+ else:
344
+ enqueue()
321
345
 
322
346
 
323
347
  def executor_initializer(proc_group: str):
@@ -431,13 +455,17 @@ def start(deploy: bool) -> List[multiprocessing.Process]:
431
455
 
432
456
  logger.info('Request queues created')
433
457
 
458
+ long_workers = []
434
459
  for worker_id in range(max_parallel_for_long):
435
460
  worker = RequestWorker(id=worker_id,
436
461
  schedule_type=api_requests.ScheduleType.LONG)
437
462
  worker_proc = multiprocessing.Process(target=request_worker,
438
463
  args=(worker, 1))
439
- worker_proc.start()
464
+ long_workers.append(worker_proc)
440
465
  sub_procs.append(worker_proc)
466
+ threading.Thread(target=subprocess_utils.slow_start_processes,
467
+ args=(long_workers,),
468
+ daemon=True).start()
441
469
 
442
470
  # Start a worker for short requests.
443
471
  worker = RequestWorker(id=1, schedule_type=api_requests.ScheduleType.SHORT)
@@ -0,0 +1,174 @@
1
+ """Precondition for a request to be executed.
2
+
3
+ Preconditions are introduced so that:
4
+ - Wait for precondition does not block executor process, which is expensive;
5
+ - Cross requests knowledge (e.g. waiting for other requests to be completed)
6
+ can be handled at precondition level, instead of invading the execution
7
+ logic of specific requests.
8
+ """
9
+ import abc
10
+ import asyncio
11
+ import time
12
+ from typing import Callable, Optional, Tuple
13
+
14
+ from sky import exceptions
15
+ from sky import global_user_state
16
+ from sky import sky_logging
17
+ from sky.server.requests import event_loop
18
+ from sky.server.requests import requests as api_requests
19
+ from sky.utils import common_utils
20
+ from sky.utils import status_lib
21
+
22
+ # The default interval seconds to check the precondition.
23
+ _PRECONDITION_CHECK_INTERVAL = 1
24
+ # The default timeout seconds to wait for the precondition to be met.
25
+ _PRECONDITION_TIMEOUT = 60 * 60
26
+
27
+ logger = sky_logging.init_logger(__name__)
28
+
29
+
30
+ class Precondition(abc.ABC):
31
+ """Abstract base class for a precondition for a request to be executed.
32
+
33
+ A Precondition can be waited in either of the following ways:
34
+ - await Precondition: wait for the precondition to be met.
35
+ - Precondition.wait_async: wait for the precondition to be met in background
36
+ and execute the given callback on met.
37
+ """
38
+
39
+ def __init__(self,
40
+ request_id: str,
41
+ check_interval: float = _PRECONDITION_CHECK_INTERVAL,
42
+ timeout: float = _PRECONDITION_TIMEOUT):
43
+ self.request_id = request_id
44
+ self.check_interval = check_interval
45
+ self.timeout = timeout
46
+
47
+ def __await__(self):
48
+ """Make Precondition awaitable."""
49
+ return self._wait().__await__()
50
+
51
+ def wait_async(
52
+ self,
53
+ on_condition_met: Optional[Callable[[], None]] = None) -> None:
54
+ """Wait precondition asynchronously and execute the callback on met."""
55
+
56
+ async def wait_with_callback():
57
+ met = await self
58
+ if met and on_condition_met is not None:
59
+ on_condition_met()
60
+
61
+ event_loop.run(wait_with_callback())
62
+
63
+ @abc.abstractmethod
64
+ async def check(self) -> Tuple[bool, Optional[str]]:
65
+ """Check if the precondition is met.
66
+
67
+ Note that compared to _request_execution_wrapper, the env vars and
68
+ skypilot config here are not overridden since the lack of process
69
+ isolation, which may cause issues if the check accidentally depends on
70
+ these. Make sure the check function is independent of the request
71
+ environment.
72
+ TODO(aylei): a new request context isolation mechanism is needed to
73
+ enable more tasks/sub-tasks to be processed in coroutines or threads.
74
+
75
+ Returns:
76
+ A tuple of (bool, Optional[str]).
77
+ The bool indicates if the precondition is met.
78
+ The str is the current status of the precondition if any.
79
+ """
80
+ raise NotImplementedError
81
+
82
+ async def _wait(self) -> bool:
83
+ """Wait for the precondition to be met.
84
+
85
+ Args:
86
+ on_condition_met: Callback to execute when the precondition is met.
87
+ """
88
+ start_time = time.time()
89
+ last_status_msg = ''
90
+ while True:
91
+ if self.timeout > 0 and time.time() - start_time > self.timeout:
92
+ # Cancel the request on timeout.
93
+ api_requests.set_request_failed(
94
+ self.request_id,
95
+ exceptions.RequestCancelled(
96
+ f'Request {self.request_id} precondition wait timed '
97
+ f'out after {self.timeout}s'))
98
+ return False
99
+
100
+ # Check if the request has been cancelled
101
+ request = api_requests.get_request(self.request_id)
102
+ if request is None:
103
+ logger.error(f'Request {self.request_id} not found')
104
+ return False
105
+ if request.status == api_requests.RequestStatus.CANCELLED:
106
+ logger.debug(f'Request {self.request_id} cancelled')
107
+ return False
108
+
109
+ try:
110
+ met, status_msg = await self.check()
111
+ if met:
112
+ return True
113
+ if status_msg is not None and status_msg != last_status_msg:
114
+ # Update the status message if it has changed.
115
+ with api_requests.update_request(self.request_id) as req:
116
+ assert req is not None, self.request_id
117
+ req.status_msg = status_msg
118
+ last_status_msg = status_msg
119
+ except (Exception, SystemExit, KeyboardInterrupt) as e: # pylint: disable=broad-except
120
+ api_requests.set_request_failed(self.request_id, e)
121
+ logger.info(f'Request {self.request_id} failed due to '
122
+ f'{common_utils.format_exception(e)}')
123
+ return False
124
+
125
+ await asyncio.sleep(self.check_interval)
126
+
127
+
128
+ class ClusterStartCompletePrecondition(Precondition):
129
+ """Whether the start process of a cluster is complete.
130
+
131
+ This condition only waits the start process of a cluster to complete, e.g.
132
+ `sky launch` or `sky start`.
133
+ For cluster that has been started but not in UP status, bypass the waiting
134
+ in favor of:
135
+ - allowing the task to refresh cluster status from cloud vendor;
136
+ - unified error message in task handlers.
137
+
138
+ Args:
139
+ request_id: The request ID of the task.
140
+ cluster_name: The name of the cluster to wait for.
141
+ """
142
+
143
+ def __init__(self, request_id: str, cluster_name: str, **kwargs):
144
+ super().__init__(request_id=request_id, **kwargs)
145
+ self.cluster_name = cluster_name
146
+
147
+ async def check(self) -> Tuple[bool, Optional[str]]:
148
+ cluster_record = global_user_state.get_cluster_from_name(
149
+ self.cluster_name)
150
+ if (cluster_record and
151
+ cluster_record['status'] is status_lib.ClusterStatus.UP):
152
+ # Shortcut for started clusters, ignore cluster not found
153
+ # since the cluster record might not yet be created by the
154
+ # launch task.
155
+ return True, None
156
+ # Check if there is a task starting the cluster, we do not check
157
+ # SUCCEEDED requests since successfully launched cluster can be
158
+ # restarted later on.
159
+ # Note that since the requests are not persistent yet between restarts,
160
+ # a cluster might be started in halfway and requests are lost.
161
+ # We unify these situations into a single state: the process of starting
162
+ # the cluster is done (either normally or abnormally) but cluster is not
163
+ # in UP status.
164
+ requests = api_requests.get_request_tasks(
165
+ status=[
166
+ api_requests.RequestStatus.RUNNING,
167
+ api_requests.RequestStatus.PENDING
168
+ ],
169
+ include_request_names=['sky.launch', 'sky.start'],
170
+ cluster_names=[self.cluster_name])
171
+ if len(requests) == 0:
172
+ # No runnning or pending tasks, the start process is done.
173
+ return True, None
174
+ return False, f'Waiting for cluster {self.cluster_name} to be UP.'
@@ -10,6 +10,7 @@ import shutil
10
10
  import signal
11
11
  import sqlite3
12
12
  import time
13
+ import traceback
13
14
  from typing import Any, Callable, Dict, List, Optional, Tuple
14
15
 
15
16
  import colorama
@@ -27,6 +28,7 @@ from sky.utils import common
27
28
  from sky.utils import common_utils
28
29
  from sky.utils import db_utils
29
30
  from sky.utils import env_options
31
+ from sky.utils import ux_utils
30
32
 
31
33
  logger = sky_logging.init_logger(__name__)
32
34
 
@@ -34,6 +36,7 @@ logger = sky_logging.init_logger(__name__)
34
36
  REQUEST_TABLE = 'requests'
35
37
  COL_CLUSTER_NAME = 'cluster_name'
36
38
  COL_USER_ID = 'user_id'
39
+ COL_STATUS_MSG = 'status_msg'
37
40
  REQUEST_LOG_PATH_PREFIX = '~/sky_logs/api_server/requests'
38
41
 
39
42
  # TODO(zhwu): For scalability, there are several TODOs:
@@ -81,6 +84,7 @@ REQUEST_COLUMNS = [
81
84
  COL_CLUSTER_NAME,
82
85
  'schedule_type',
83
86
  COL_USER_ID,
87
+ COL_STATUS_MSG,
84
88
  ]
85
89
 
86
90
 
@@ -109,6 +113,7 @@ class RequestPayload:
109
113
  user_name: Optional[str] = None
110
114
  # Resources the request operates on.
111
115
  cluster_name: Optional[str] = None
116
+ status_msg: Optional[str] = None
112
117
 
113
118
 
114
119
  @dataclasses.dataclass
@@ -129,6 +134,8 @@ class Request:
129
134
  schedule_type: ScheduleType = ScheduleType.LONG
130
135
  # Resources the request operates on.
131
136
  cluster_name: Optional[str] = None
137
+ # Status message of the request, indicates the reason of current status.
138
+ status_msg: Optional[str] = None
132
139
 
133
140
  @property
134
141
  def log_path(self) -> pathlib.Path:
@@ -138,7 +145,7 @@ class Request:
138
145
  log_path = (log_path_prefix / self.request_id).with_suffix('.log')
139
146
  return log_path
140
147
 
141
- def set_error(self, error: Exception) -> None:
148
+ def set_error(self, error: BaseException) -> None:
142
149
  """Set the error."""
143
150
  # TODO(zhwu): pickle.dump does not work well with custom exceptions if
144
151
  # it has more than 1 arguments.
@@ -212,6 +219,7 @@ class Request:
212
219
  user_id=self.user_id,
213
220
  user_name=user_name,
214
221
  cluster_name=self.cluster_name,
222
+ status_msg=self.status_msg,
215
223
  )
216
224
 
217
225
  def encode(self) -> RequestPayload:
@@ -232,6 +240,7 @@ class Request:
232
240
  schedule_type=self.schedule_type.value,
233
241
  user_id=self.user_id,
234
242
  cluster_name=self.cluster_name,
243
+ status_msg=self.status_msg,
235
244
  )
236
245
  except (TypeError, ValueError) as e:
237
246
  # The error is unexpected, so we don't suppress the stack trace.
@@ -262,6 +271,7 @@ class Request:
262
271
  schedule_type=ScheduleType(payload.schedule_type),
263
272
  user_id=payload.user_id,
264
273
  cluster_name=payload.cluster_name,
274
+ status_msg=payload.status_msg,
265
275
  )
266
276
  except (TypeError, ValueError) as e:
267
277
  logger.error(
@@ -415,7 +425,8 @@ def create_table(cursor, conn):
415
425
  pid INTEGER,
416
426
  {COL_CLUSTER_NAME} TEXT,
417
427
  schedule_type TEXT,
418
- {COL_USER_ID} TEXT)""")
428
+ {COL_USER_ID} TEXT,
429
+ {COL_STATUS_MSG} TEXT)""")
419
430
 
420
431
 
421
432
  _DB = None
@@ -507,8 +518,9 @@ def create_if_not_exists(request: Request) -> bool:
507
518
  def get_request_tasks(
508
519
  status: Optional[List[RequestStatus]] = None,
509
520
  cluster_names: Optional[List[str]] = None,
510
- exclude_request_names: Optional[List[str]] = None,
511
521
  user_id: Optional[str] = None,
522
+ exclude_request_names: Optional[List[str]] = None,
523
+ include_request_names: Optional[List[str]] = None,
512
524
  ) -> List[Request]:
513
525
  """Get a list of requests that match the given filters.
514
526
 
@@ -516,9 +528,21 @@ def get_request_tasks(
516
528
  status: a list of statuses of the requests to filter on.
517
529
  cluster_names: a list of cluster names to filter requests on.
518
530
  exclude_request_names: a list of request names to exclude from results.
531
+ Mutually exclusive with include_request_names.
519
532
  user_id: the user ID to filter requests on.
520
533
  If None, all users are included.
534
+ include_request_names: a list of request names to filter on.
535
+ Mutually exclusive with exclude_request_names.
536
+
537
+ Raises:
538
+ ValueError: If both exclude_request_names and include_request_names are
539
+ provided.
521
540
  """
541
+ if exclude_request_names is not None and include_request_names is not None:
542
+ raise ValueError(
543
+ 'Only one of exclude_request_names or include_request_names can be '
544
+ 'provided, not both.')
545
+
522
546
  filters = []
523
547
  filter_params = []
524
548
  if status is not None:
@@ -534,6 +558,10 @@ def get_request_tasks(
534
558
  if user_id is not None:
535
559
  filters.append(f'{COL_USER_ID} = ?')
536
560
  filter_params.append(user_id)
561
+ if include_request_names is not None:
562
+ request_names_str = ','.join(
563
+ repr(name) for name in include_request_names)
564
+ filters.append(f'name IN ({request_names_str})')
537
565
  assert _DB is not None
538
566
  with _DB.conn:
539
567
  cursor = _DB.conn.cursor()
@@ -565,3 +593,14 @@ def _add_or_update_request_no_lock(request: Request):
565
593
  cursor.execute(
566
594
  f'INSERT OR REPLACE INTO {REQUEST_TABLE} ({key_str}) '
567
595
  f'VALUES ({fill_str})', row)
596
+
597
+
598
+ def set_request_failed(request_id: str, e: BaseException) -> None:
599
+ """Set a request to failed and populate the error message."""
600
+ with ux_utils.enable_traceback():
601
+ stacktrace = traceback.format_exc()
602
+ setattr(e, 'stacktrace', stacktrace)
603
+ with update_request(request_id) as request_task:
604
+ assert request_task is not None, request_id
605
+ request_task.status = RequestStatus.FAILED
606
+ request_task.set_error(e)
sky/server/server.py CHANGED
@@ -6,6 +6,7 @@ import contextlib
6
6
  import dataclasses
7
7
  import datetime
8
8
  import logging
9
+ import multiprocessing
9
10
  import os
10
11
  import pathlib
11
12
  import re
@@ -38,6 +39,7 @@ from sky.server import constants as server_constants
38
39
  from sky.server import stream_utils
39
40
  from sky.server.requests import executor
40
41
  from sky.server.requests import payloads
42
+ from sky.server.requests import preconditions
41
43
  from sky.server.requests import requests as requests_lib
42
44
  from sky.skylet import constants
43
45
  from sky.usage import usage_lib
@@ -47,6 +49,7 @@ from sky.utils import common_utils
47
49
  from sky.utils import dag_utils
48
50
  from sky.utils import env_options
49
51
  from sky.utils import status_lib
52
+ from sky.utils import subprocess_utils
50
53
 
51
54
  # pylint: disable=ungrouped-imports
52
55
  if sys.version_info >= (3, 10):
@@ -496,13 +499,18 @@ async def launch(launch_body: payloads.LaunchBody,
496
499
  # pylint: disable=redefined-builtin
497
500
  async def exec(request: fastapi.Request, exec_body: payloads.ExecBody) -> None:
498
501
  """Executes a task on an existing cluster."""
502
+ cluster_name = exec_body.cluster_name
499
503
  executor.schedule_request(
500
504
  request_id=request.state.request_id,
501
505
  request_name='exec',
502
506
  request_body=exec_body,
503
507
  func=execution.exec,
508
+ precondition=preconditions.ClusterStartCompletePrecondition(
509
+ request_id=request.state.request_id,
510
+ cluster_name=cluster_name,
511
+ ),
504
512
  schedule_type=requests_lib.ScheduleType.LONG,
505
- request_cluster_name=exec_body.cluster_name,
513
+ request_cluster_name=cluster_name,
506
514
  )
507
515
 
508
516
 
@@ -1088,6 +1096,9 @@ async def complete_storage_name(incomplete: str,) -> List[str]:
1088
1096
 
1089
1097
  if __name__ == '__main__':
1090
1098
  import uvicorn
1099
+
1100
+ from sky.server import uvicorn as skyuvicorn
1101
+
1091
1102
  requests_lib.reset_db_and_logs()
1092
1103
 
1093
1104
  parser = argparse.ArgumentParser()
@@ -1109,16 +1120,26 @@ if __name__ == '__main__':
1109
1120
  logger.info(f'Starting SkyPilot API server, workers={num_workers}')
1110
1121
  # We don't support reload for now, since it may cause leakage of request
1111
1122
  # workers or interrupt running requests.
1112
- uvicorn.run('sky.server.server:app',
1113
- host=cmd_args.host,
1114
- port=cmd_args.port,
1115
- workers=num_workers)
1123
+ config = uvicorn.Config('sky.server.server:app',
1124
+ host=cmd_args.host,
1125
+ port=cmd_args.port,
1126
+ workers=num_workers)
1127
+ skyuvicorn.run(config)
1116
1128
  except Exception as exc: # pylint: disable=broad-except
1117
1129
  logger.error(f'Failed to start SkyPilot API server: '
1118
1130
  f'{common_utils.format_exception(exc, use_bracket=True)}')
1119
1131
  raise
1120
1132
  finally:
1121
1133
  logger.info('Shutting down SkyPilot API server...')
1122
- for sub_proc in sub_procs:
1123
- sub_proc.terminate()
1124
- sub_proc.join()
1134
+
1135
+ def cleanup(proc: multiprocessing.Process) -> None:
1136
+ try:
1137
+ proc.terminate()
1138
+ proc.join()
1139
+ finally:
1140
+ # The process may not be started yet, close it anyway.
1141
+ proc.close()
1142
+
1143
+ subprocess_utils.run_in_parallel(cleanup,
1144
+ sub_procs,
1145
+ num_threads=len(sub_procs))
@@ -55,19 +55,22 @@ async def log_streamer(request_id: Optional[str],
55
55
  if show_request_waiting_spinner:
56
56
  yield status_msg.init()
57
57
  yield status_msg.start()
58
- is_waiting_msg_logged = False
58
+ last_waiting_msg = ''
59
59
  waiting_msg = (f'Waiting for {request_task.name!r} request to be '
60
60
  f'scheduled: {request_id}')
61
61
  while request_task.status < requests_lib.RequestStatus.RUNNING:
62
+ if request_task.status_msg is not None:
63
+ waiting_msg = request_task.status_msg
62
64
  if show_request_waiting_spinner:
63
65
  yield status_msg.update(f'[dim]{waiting_msg}[/dim]')
64
- elif plain_logs and not is_waiting_msg_logged:
65
- is_waiting_msg_logged = True
66
+ elif plain_logs and waiting_msg != last_waiting_msg:
67
+ # Only log when waiting message changes.
68
+ last_waiting_msg = waiting_msg
66
69
  # Use smaller padding (1024 bytes) to force browser rendering
67
70
  yield f'{waiting_msg}' + ' ' * 4096 + '\n'
68
- # Sleep 0 to yield, so other coroutines can run. This busy waiting
69
- # loop is performance critical for short-running requests, so we do
70
- # not want to yield too long.
71
+ # Sleep shortly to avoid storming the DB and CPU and allow other
72
+ # coroutines to run. This busy waiting loop is performance critical
73
+ # for short-running requests, so we do not want to yield too long.
71
74
  await asyncio.sleep(0.1)
72
75
  request_task = requests_lib.get_request(request_id)
73
76
  if not follow:
sky/server/uvicorn.py ADDED
@@ -0,0 +1,81 @@
1
+ """Uvicorn wrapper for SkyPilot API server.
2
+
3
+ This module is a wrapper around uvicorn to customize the behavior of the
4
+ server.
5
+ """
6
+ import os
7
+ import threading
8
+ from typing import Optional
9
+
10
+ import uvicorn
11
+ from uvicorn.supervisors import multiprocess
12
+
13
+ from sky.utils import subprocess_utils
14
+
15
+
16
+ def run(config: uvicorn.Config):
17
+ """Run uvicorn server."""
18
+ if config.reload:
19
+ # Reload and multi-workers are mutually exclusive
20
+ # in uvicorn. Since we do not use reload now, simply
21
+ # guard by an exception.
22
+ raise ValueError('Reload is not supported yet.')
23
+ server = uvicorn.Server(config=config)
24
+ try:
25
+ if config.workers is not None and config.workers > 1:
26
+ sock = config.bind_socket()
27
+ SlowStartMultiprocess(config, target=server.run,
28
+ sockets=[sock]).run()
29
+ else:
30
+ server.run()
31
+ finally:
32
+ # Copied from uvicorn.run()
33
+ if config.uds and os.path.exists(config.uds):
34
+ os.remove(config.uds)
35
+
36
+
37
+ class SlowStartMultiprocess(multiprocess.Multiprocess):
38
+ """Uvicorn Multiprocess wrapper with slow start.
39
+
40
+ Slow start offers faster and more stable start time.
41
+ Profile shows the start time is more stable and accelerated from
42
+ ~7s to ~3.3s on a 12-core machine after switching LONG workers and
43
+ Uvicorn workers to slow start.
44
+ Refer to subprocess_utils.slow_start_processes() for more details.
45
+ """
46
+
47
+ def __init__(self, config: uvicorn.Config, **kwargs):
48
+ """Initialize the multiprocess wrapper.
49
+
50
+ Args:
51
+ config: The uvicorn config.
52
+ """
53
+ super().__init__(config, **kwargs)
54
+ self._init_thread: Optional[threading.Thread] = None
55
+
56
+ def init_processes(self) -> None:
57
+ # Slow start worker processes asynchronously to avoid blocking signal
58
+ # handling of uvicorn.
59
+ self._init_thread = threading.Thread(target=self.slow_start_processes,
60
+ daemon=True)
61
+ self._init_thread.start()
62
+
63
+ def slow_start_processes(self) -> None:
64
+ """Initialize processes with slow start."""
65
+ to_start = []
66
+ # Init N worker processes
67
+ for _ in range(self.processes_num):
68
+ to_start.append(
69
+ multiprocess.Process(self.config, self.target, self.sockets))
70
+ # Start the processes with slow start, we only append start to
71
+ # self.processes because Uvicorn periodically restarts unstarted
72
+ # workers.
73
+ subprocess_utils.slow_start_processes(to_start,
74
+ on_start=self.processes.append,
75
+ should_exit=self.should_exit)
76
+
77
+ def terminate_all(self) -> None:
78
+ """Wait for the init thread to finish before terminating all processes."""
79
+ if self._init_thread is not None:
80
+ self._init_thread.join()
81
+ super().terminate_all()
@@ -77,7 +77,7 @@ def canonicalize_accelerator_name(accelerator: str,
77
77
  # Look for Kubernetes accelerators online if the accelerator is not found
78
78
  # in the public cloud catalog. This is to make sure custom accelerators
79
79
  # on Kubernetes can be correctly canonicalized.
80
- if not names and cloud_str in ['kubernetes', None]:
80
+ if not names and cloud_str in ['Kubernetes', None]:
81
81
  with rich_utils.safe_status(
82
82
  ux_utils.spinner_message('Listing accelerators on Kubernetes')):
83
83
  searched = service_catalog.list_accelerators(
@@ -5,8 +5,9 @@ import random
5
5
  import resource
6
6
  import shlex
7
7
  import subprocess
8
+ import threading
8
9
  import time
9
- from typing import Any, Callable, Dict, List, Optional, Tuple, Union
10
+ from typing import Any, Callable, Dict, List, Optional, Protocol, Tuple, Union
10
11
 
11
12
  import colorama
12
13
  import psutil
@@ -15,6 +16,7 @@ from sky import exceptions
15
16
  from sky import sky_logging
16
17
  from sky.skylet import constants
17
18
  from sky.skylet import log_lib
19
+ from sky.utils import common_utils
18
20
  from sky.utils import timeline
19
21
  from sky.utils import ux_utils
20
22
 
@@ -353,3 +355,56 @@ def launch_new_process_tree(cmd: str, log_output: str = '/dev/null') -> int:
353
355
  text=True)
354
356
  # Get the PID of the detached process
355
357
  return int(proc.stdout.strip())
358
+
359
+
360
+ # A protocol for objects that can be started, designed to be used with
361
+ # slow_start_processes() so that we can handle different wrappers of
362
+ # multiprocessing.Process in a uniform way.
363
+ class Startable(Protocol):
364
+
365
+ def start(self) -> None:
366
+ ...
367
+
368
+
369
+ OnStartFn = Callable[[Startable], None]
370
+
371
+
372
+ def slow_start_processes(processes: List[Startable],
373
+ delay: float = 2.0,
374
+ on_start: Optional[OnStartFn] = None,
375
+ should_exit: Optional[threading.Event] = None) -> None:
376
+ """Start processes with slow start.
377
+
378
+ Profile shows that it takes 1~2 seconds to start a worker process when
379
+ CPU is relatively idle. However, starting all workers simultaneously will
380
+ overwhelm the CPU and cause the time for the first worker to be ready to
381
+ be delayed. Slow start starts a group of workers gradually to accelerate the
382
+ start time (i.e. the time for the first worker to be ready), while
383
+ gradually increasing the batch size in exponential manner to make the
384
+ time of achieving full parallelism as short as possible.
385
+
386
+ Args:
387
+ processes: The list of processes to start.
388
+ delay: The delay between starting each process, default to 2.0 seconds,
389
+ based on profile.
390
+ on_start: An optional function to callback when a process starts.
391
+ should_exit: An optional event to check if the function should exit
392
+ before starting all the processes.
393
+ """
394
+ max_batch_size = max(1, int(common_utils.get_cpu_count() / 2))
395
+ batch_size = 1
396
+ left = len(processes)
397
+ while left > 0:
398
+ if should_exit and should_exit.is_set():
399
+ break
400
+ current_batch = min(batch_size, left)
401
+ for i in range(current_batch):
402
+ worker_idx = len(processes) - left + i
403
+ processes[worker_idx].start()
404
+ if on_start:
405
+ on_start(processes[worker_idx])
406
+ left -= current_batch
407
+ if left <= 0:
408
+ break
409
+ batch_size = min(batch_size * 2, max_batch_size)
410
+ time.sleep(delay)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: skypilot-nightly
3
- Version: 1.0.0.dev20250311
3
+ Version: 1.0.0.dev20250312
4
4
  Summary: SkyPilot: An intercloud broker for the clouds
5
5
  Author: SkyPilot Team
6
6
  License: Apache 2.0
@@ -1,4 +1,4 @@
1
- sky/__init__.py,sha256=BsWzCznVm1cQKZDKOkladqi6DUUPJs0mc7iSx5QIw_E,6428
1
+ sky/__init__.py,sha256=15ZYL6HUs43go7VjSHq7_BlZEptubkQ6aeBIx534zkU,6428
2
2
  sky/admin_policy.py,sha256=hPo02f_A32gCqhUueF0QYy1fMSSKqRwYEg_9FxScN_s,3248
3
3
  sky/authentication.py,sha256=hCEqi77nprQEg3ktfRL51xiiw16zwZOmFEDB_Z7fWVU,22384
4
4
  sky/check.py,sha256=NDKx_Zm7YRxPjMv82wz3ESLnGIPljaACyqVdVNM0PzY,11258
@@ -6,7 +6,7 @@ sky/cli.py,sha256=qBRqtKVV_GurbCFZBHkF2UIahy3A7bsOsmfCNm6mZ54,221503
6
6
  sky/cloud_stores.py,sha256=kEHXd2divyra-1c3EusHxKyM5yTQlTXc6cKVXofsefA,23978
7
7
  sky/core.py,sha256=MU9hcTdh8baMGrr2ZXmbxx12vNlhajrkeyg5QtV717c,47609
8
8
  sky/dag.py,sha256=Yl7Ry26Vql5cv4YMz8g9kOUgtoCihJnw7c8NgZYakMY,3242
9
- sky/exceptions.py,sha256=KvKQDPmlO7Qk90_NyRRYO9yNYBifbDGfxsRIe_L_fWw,16345
9
+ sky/exceptions.py,sha256=cEZ5nm7RhTW22Npw-oYS5Wp9rtxoHxdPQHfkNa92wOo,16641
10
10
  sky/execution.py,sha256=0M4RTEzWn-B9oz221XdZOIGH12XOACmNq0j-WGUT_No,28023
11
11
  sky/global_user_state.py,sha256=sUDdSsJeiJkbgmZNwy8YGFK0XeNh-RBr1VDUvbmjf0g,33246
12
12
  sky/models.py,sha256=4xSW05BdDPEjW8Ubvj3VlVOVnzv0TbrolsFvR5R5v1U,638
@@ -26,15 +26,15 @@ sky/adaptors/docker.py,sha256=_kzpZ0fkWHqqQAVVl0llTsCE31KYz3Sjn8psTBQHVkA,468
26
26
  sky/adaptors/gcp.py,sha256=OQ9RaqjR0r0iaWYpjvEtIx5vnEhyB4LhUCwbtdxsmVk,3115
27
27
  sky/adaptors/ibm.py,sha256=H87vD6izq_wQI8oQC7cx9iVtRgPi_QkAcrfa1Z3PNqU,4906
28
28
  sky/adaptors/kubernetes.py,sha256=UIUc3zI0MgWcv1GTBu-pZUSx_NTLf0zRI20JUdtA1HI,6594
29
- sky/adaptors/nebius.py,sha256=JOvwniQT-Pkp9-af6IdL_FUkjIbsEAUXVNUkwdaEeb0,2732
29
+ sky/adaptors/nebius.py,sha256=QAqU_reFk7MKQ39TE1FiNgNnDPH5L5-HT19j6CtJcJE,3175
30
30
  sky/adaptors/oci.py,sha256=LfMSFUmkkNT6Yoz9FZHNl6UFSg4X1lJO4-x4ZbDdXTs,2831
31
31
  sky/adaptors/runpod.py,sha256=4Nt_BfZhJAKQNA3wO8cxvvNI8x4NsDGHu_4EhRDlGYQ,225
32
32
  sky/adaptors/vast.py,sha256=tpvmHi7IkQNzbbHVkeo04kUSajoEpSzXr2XgeO_I1LU,695
33
33
  sky/adaptors/vsphere.py,sha256=zJP9SeObEoLrpgHW2VHvZE48EhgVf8GfAEIwBeaDMfM,2129
34
34
  sky/backends/__init__.py,sha256=UDjwbUgpTRApbPJnNfR786GadUuwgRk3vsWoVu5RB_c,536
35
35
  sky/backends/backend.py,sha256=4BOqKZ-bwBTpjNnZF4JAHX2m2Iga7EmEn8Ao3tEivaM,7527
36
- sky/backends/backend_utils.py,sha256=B_46tG9PyrppxLWdg4mWGuuIr3TEcWTz6qhYXjAY2bw,133452
37
- sky/backends/cloud_vm_ray_backend.py,sha256=KIU4IkUTBGE__7MC3ayjYMwE14mSxeiHjrGnK7wAQXw,247773
36
+ sky/backends/backend_utils.py,sha256=lOkufcDQiBFHKf5TYppaQ1SKCRmUxAM-71q3EmXM_QY,134525
37
+ sky/backends/cloud_vm_ray_backend.py,sha256=aNRjxeVe_1GmYYbU3KUCCr2_-PW9KWUeCO-atAg9RKU,246171
38
38
  sky/backends/docker_utils.py,sha256=Hyw1YY20EyghhEbYx6O2FIMDcGkNzBzV9TM7LFynei8,8358
39
39
  sky/backends/local_docker_backend.py,sha256=nSYCjms3HOPjPNOrcCqsUKm1WV3AAovRFjEQ7hcEXW4,17021
40
40
  sky/backends/wheel_utils.py,sha256=meypuMaygSXXjGdXfq6dhWl-OrpAybg9KVRoup4D0wU,9098
@@ -57,7 +57,7 @@ sky/clouds/gcp.py,sha256=FUCUq94yGUZ_yyKxA3prRKTqetObbIMkfjAPTPbhXyA,55824
57
57
  sky/clouds/ibm.py,sha256=R4JR96YfXstZ2B_IgFNVEX2SBAq3q0lSWz4y7FoFoeE,21474
58
58
  sky/clouds/kubernetes.py,sha256=xsYX8HhdcRzsdx6Gd_3kumNqjMjpo_l4cinhs3ZMwZM,35067
59
59
  sky/clouds/lambda_cloud.py,sha256=ejqA_Wj5-325Y_QjQ__FY4HMO8sv_2tSRsufmaldcmI,12699
60
- sky/clouds/nebius.py,sha256=4180IruRMib7L9o60lrxrUDJtYhpX4lWFfAznbZoY6Q,12560
60
+ sky/clouds/nebius.py,sha256=G3v73NZjLzGoCi0ZfHj6VkOt-fs1i6DDxCpNiE88BdA,12676
61
61
  sky/clouds/oci.py,sha256=irINbQsQ6YxRxGTMaCNsms3mZkIun2oJMMA1fMCRJyA,27072
62
62
  sky/clouds/paperspace.py,sha256=O7bH8YaHBLFuyj6rDz2bPDz_6OYWmNB9OLqnZH70yfY,10922
63
63
  sky/clouds/runpod.py,sha256=hzYB4td6qaged83xMAVKZ96bH40oZnrHXL7a_CKxXIw,11926
@@ -165,7 +165,7 @@ sky/provision/kubernetes/constants.py,sha256=dZCUV8FOO9Gct80sdqeubKnxeW3CGl-u5mx
165
165
  sky/provision/kubernetes/instance.py,sha256=oag17OtuiqU-1RjkgW9NvEpxSGUFIYdI7M61S-YmPu8,50503
166
166
  sky/provision/kubernetes/network.py,sha256=AtcOM8wPs_-UlQJhGEQGP6Lh4HIgdx63Y0iWEhP5jyc,12673
167
167
  sky/provision/kubernetes/network_utils.py,sha256=Bwy5ZQb62ejC7ZHM4htjzhs86UNACK7AXN-NfQ9IJrE,11454
168
- sky/provision/kubernetes/utils.py,sha256=pmtjphlon6ANdMFy7aqGFhh4bSUYAEdMQ5ARSUD2s4w,109746
168
+ sky/provision/kubernetes/utils.py,sha256=aGIYhGFnvInMqd8INwR7RirKrstSsMQxM0RvZUFia4Q,109762
169
169
  sky/provision/kubernetes/manifests/smarter-device-manager-configmap.yaml,sha256=AMzYzlY0JIlfBWj5eX054Rc1XDW2thUcLSOGMJVhIdA,229
170
170
  sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml,sha256=RtTq4F1QUmR2Uunb6zuuRaPhV7hpesz4saHjn3Ncsb4,2010
171
171
  sky/provision/lambda_cloud/__init__.py,sha256=6EEvSgtUeEiup9ivIFevHmgv0GqleroO2X0K7TRa2nE,612
@@ -230,13 +230,16 @@ sky/serve/server/server.py,sha256=gQGVU9nHYdGbaLhGjIUNIYn4xwKjRASRJkiiTL5AI1Y,32
230
230
  sky/server/__init__.py,sha256=MPPBqFzXz6Jv5QSk6td_IcvnfXfNErDZVcizu4MLRow,27
231
231
  sky/server/common.py,sha256=pEa-q3P5aOm6RMlit0pVzlDoJnZU_6zViO7aK_7htn0,17843
232
232
  sky/server/constants.py,sha256=_ZNrxYh8vmgbf3DmkGDduxjvO2y43ZSPTkH5rCNsVjU,770
233
- sky/server/server.py,sha256=ag2vXO3ESU2BYOMLRkgZhpYR_WrfDB0Zo6wMTnRuy5k,43458
234
- sky/server/stream_utils.py,sha256=-3IX1YCgxAFfcvQIV0TCvOn1wbRLWovAx3ckCrsExWU,5651
233
+ sky/server/server.py,sha256=kEjwRjA7PJDZzx6KqD_NAFxryVLkzwCnuPfbmY_p30A,44232
234
+ sky/server/stream_utils.py,sha256=4JMHgtoXPpCT8JwtqyUcDQ9IdZFir9om0JaCRr8rvbQ,5849
235
+ sky/server/uvicorn.py,sha256=wajwPHJ3IEEP3GMNOCc0S81-1v2qT5F-ejUkLFVhUzk,2953
235
236
  sky/server/html/log.html,sha256=TSGZktua9Ysl_ysg3w60rjxAxhH61AJnsYDHdtqrjmI,6929
236
237
  sky/server/requests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
237
- sky/server/requests/executor.py,sha256=Jk8RJoQlicDqaHhgVWMH3UiL-dJS7lGSGd05GPv-Lrc,19781
238
+ sky/server/requests/event_loop.py,sha256=OhpPbuce65bbjpGRlcJa78AVnYSm08SzFKt70ypCUuQ,1211
239
+ sky/server/requests/executor.py,sha256=SuSr-cVrRnMzf-1SEz6O8HpcLzGM3mrbNc8re7QduYk,20862
238
240
  sky/server/requests/payloads.py,sha256=nVb7vr1SNAq6ay2dNe9301zLHp7NrM79M7nsWAECBms,16340
239
- sky/server/requests/requests.py,sha256=aMdjiK5kjSYP36pxdXFU6qgKOXcOmtViHbFm3V8Dvf8,19590
241
+ sky/server/requests/preconditions.py,sha256=ipxIb_3JXG6S3-ymcOdqQNb7VDvoPqADxu9ZK7-nQWc,7179
242
+ sky/server/requests/requests.py,sha256=HrBDrJyWPaKk52ykHp34A6UjipXPH-f2Eh2sNvhWt4g,21228
240
243
  sky/server/requests/queues/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
241
244
  sky/server/requests/queues/mp_queue.py,sha256=_7AFas__0b1L8e7Bwy4lu0VYU18R85YwMlDHPhQCfh0,2998
242
245
  sky/server/requests/serializers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -303,7 +306,7 @@ sky/usage/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
303
306
  sky/usage/constants.py,sha256=mFrTgrFIfFf4kpcl-M1VDU7_moD5_mJazUJTUDrybms,1102
304
307
  sky/usage/usage_lib.py,sha256=rInJW2kj2O1wwXUZAbeVVLhnoa7T_xBHqDhbBBrUqfI,21400
305
308
  sky/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
306
- sky/utils/accelerator_registry.py,sha256=GjOgqT0s0n5hT-wcpCcTRu74rnKb8LwQ6MJl6dKL-1I,3905
309
+ sky/utils/accelerator_registry.py,sha256=rZniDbqqPAF-vjkrwxGwEErFSAp6puOimkRj3ppOSRY,3905
307
310
  sky/utils/admin_policy_utils.py,sha256=y_do0VH6qh163EqSuRW1uGeKvTnJhiYNrHUs77uoOcA,6013
308
311
  sky/utils/annotations.py,sha256=-rfacB30Sl0xkFriejGvxma3oKctGfXXLZkQPHG33eo,1626
309
312
  sky/utils/cluster_utils.py,sha256=s6DFRXktv6_gF_DnwDEXJ7CniifHp8CAPeGciRCbXgI,14432
@@ -325,7 +328,7 @@ sky/utils/resources_utils.py,sha256=URp6OS9B9nc9tIB5ibZCgGK4XSABmI4kRG0wOM6qgvs,
325
328
  sky/utils/rich_utils.py,sha256=3xdDzmn-TQXAE83EevAtOf9N4aak3Bl4ZeD33xIxjOo,11931
326
329
  sky/utils/schemas.py,sha256=KJCHrn1nMZ3XqzddWuu_nFQoRQw01cZh9qh19OrRtps,30145
327
330
  sky/utils/status_lib.py,sha256=zn_MSuRYQdNKF8pnFOGQ54X_s_R7dyqWS6Q3a9zENw8,1512
328
- sky/utils/subprocess_utils.py,sha256=lqhSHoy93GsVeQgQ48C6f77bixD6yfsGQP40rbXofts,12779
331
+ sky/utils/subprocess_utils.py,sha256=Q42CyjDNICXze2WCGuGxgpEjtjlka43_2ihRqKhSnQw,14916
329
332
  sky/utils/timeline.py,sha256=ob6s3bc7nwAuSI76yLKBrSR5bzOHnOhbozz1avwoet4,4070
330
333
  sky/utils/ux_utils.py,sha256=ngcOCg1K44p-SOk6XfwxJGXwjoP__PRvNuEzj7t05Yc,10185
331
334
  sky/utils/validator.py,sha256=cAFERCoC7jH0DFKepcU4x9SYmdrYL1iVmW9tXA18hvo,701
@@ -344,9 +347,9 @@ sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml,sha256=VLKT2KKimZu1GDg_4AIlIt488
344
347
  sky/utils/kubernetes/kubernetes_deploy_utils.py,sha256=otzHzpliHDCpzYT-nU9Q0ZExbiFpDPWvhxwkvchZj7k,10073
345
348
  sky/utils/kubernetes/rsync_helper.sh,sha256=h4YwrPFf9727CACnMJvF3EyK_0OeOYKKt4su_daKekw,1256
346
349
  sky/utils/kubernetes/ssh_jump_lifecycle_manager.py,sha256=Kq1MDygF2IxFmu9FXpCxqucXLmeUrvs6OtRij6XTQbo,6554
347
- skypilot_nightly-1.0.0.dev20250311.dist-info/LICENSE,sha256=emRJAvE7ngL6x0RhQvlns5wJzGI3NEQ_WMjNmd9TZc4,12170
348
- skypilot_nightly-1.0.0.dev20250311.dist-info/METADATA,sha256=sSJcOjrZzxkaeM8U9koQpUk4DNlQa4RfH21iDGPCbXo,18051
349
- skypilot_nightly-1.0.0.dev20250311.dist-info/WHEEL,sha256=52BFRY2Up02UkjOa29eZOS2VxUrpPORXg1pkohGGUS8,91
350
- skypilot_nightly-1.0.0.dev20250311.dist-info/entry_points.txt,sha256=StA6HYpuHj-Y61L2Ze-hK2IcLWgLZcML5gJu8cs6nU4,36
351
- skypilot_nightly-1.0.0.dev20250311.dist-info/top_level.txt,sha256=qA8QuiNNb6Y1OF-pCUtPEr6sLEwy2xJX06Bd_CrtrHY,4
352
- skypilot_nightly-1.0.0.dev20250311.dist-info/RECORD,,
350
+ skypilot_nightly-1.0.0.dev20250312.dist-info/LICENSE,sha256=emRJAvE7ngL6x0RhQvlns5wJzGI3NEQ_WMjNmd9TZc4,12170
351
+ skypilot_nightly-1.0.0.dev20250312.dist-info/METADATA,sha256=q1Bn6vuOOsagTfsfIAPxoyhpt2hWE2H6hzCmLvH65jM,18051
352
+ skypilot_nightly-1.0.0.dev20250312.dist-info/WHEEL,sha256=52BFRY2Up02UkjOa29eZOS2VxUrpPORXg1pkohGGUS8,91
353
+ skypilot_nightly-1.0.0.dev20250312.dist-info/entry_points.txt,sha256=StA6HYpuHj-Y61L2Ze-hK2IcLWgLZcML5gJu8cs6nU4,36
354
+ skypilot_nightly-1.0.0.dev20250312.dist-info/top_level.txt,sha256=qA8QuiNNb6Y1OF-pCUtPEr6sLEwy2xJX06Bd_CrtrHY,4
355
+ skypilot_nightly-1.0.0.dev20250312.dist-info/RECORD,,