skypilot-nightly 1.0.0.dev20250218__py3-none-any.whl → 1.0.0.dev20250220__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sky/__init__.py CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
  import urllib.request
 
  # Replaced with the current commit when building the wheels.
- _SKYPILOT_COMMIT_SHA = '912b8293b3ebeba84941c108dbede1e6dcbc9b6f'
+ _SKYPILOT_COMMIT_SHA = '6b2b31d8358f3ff8394a7a33ec49e9985ada230f'
 
 
  def _get_git_commit():
@@ -35,7 +35,7 @@ def _get_git_commit():
 
 
  __commit__ = _get_git_commit()
- __version__ = '1.0.0.dev20250218'
+ __version__ = '1.0.0.dev20250220'
  __root_dir__ = os.path.dirname(os.path.abspath(__file__))
 
 
sky/cli.py CHANGED
@@ -1419,16 +1419,16 @@ def _handle_jobs_queue_request(
  try:
  # Check the controller status again, as the RuntimeError is likely
  # due to the controller being autostopped when querying the jobs.
- controller_type = controller_utils.Controllers.JOBS_CONTROLLER
- # Query status of the controller cluster. We add a wildcard because
- # the controller cluster name can have a suffix like
- # '-remote-<hash>' when using remote API server.
+ # Since we are client-side, we may not know the exact name of the
+ # controller, so use the prefix with a wildcard.
+ # Query status of the controller cluster.
  records = sdk.get(
- sdk.status(
- cluster_names=[controller_type.value.cluster_name + '*']))
+ sdk.status(cluster_names=[common.JOB_CONTROLLER_PREFIX + '*'],
+ all_users=True))
  if (not records or
  records[0]['status'] == status_lib.ClusterStatus.STOPPED):
- msg = controller_type.value.default_hint_if_non_existent
+ controller = controller_utils.Controllers.JOBS_CONTROLLER.value
+ msg = controller.default_hint_if_non_existent
  except Exception: # pylint: disable=broad-except
  # This is to an best effort to find the latest controller status to
  # print more helpful message, so we can ignore any exception to
@@ -1494,16 +1494,18 @@ def _handle_services_request(
  # Check the controller status again, as the RuntimeError is likely
  # due to the controller being autostopped when querying the
  # services.
- controller_type = controller_utils.Controllers.SKY_SERVE_CONTROLLER
- # Query status of the controller cluster. We add a wildcard because
- # the controller cluster name can have a suffix like
- # '-remote-<hash>' when using remote API server.
+ # Since we are client-side, we may not know the exact name of the
+ # controller, so use the prefix with a wildcard.
+ # Query status of the controller cluster.
  records = sdk.get(
  sdk.status(
- cluster_names=[controller_type.value.cluster_name + '*']))
+ cluster_names=[common.SKY_SERVE_CONTROLLER_PREFIX + '*'],
+ all_users=True))
  if (not records or
  records[0]['status'] == status_lib.ClusterStatus.STOPPED):
- msg = controller_type.value.default_hint_if_non_existent
+ controller = (
+ controller_utils.Controllers.SKY_SERVE_CONTROLLER.value)
+ msg = controller.default_hint_if_non_existent
  except Exception: # pylint: disable=broad-except
  # This is to an best effort to find the latest controller status to
  # print more helpful message, so we can ignore any exception to
@@ -2804,11 +2806,6 @@ def _hint_or_raise_for_down_jobs_controller(controller_name: str,
  to be torn down (e.g., because it has jobs running or
  it is in init state)
  """
- if not common.is_current_user_controller(controller_name):
- with ux_utils.print_exception_no_traceback():
- raise exceptions.NotSupportedError(
- f'Tearing down other user\'s managed job controller '
- f'{controller_name!r} is not allowed.')
  controller = controller_utils.Controllers.from_name(controller_name)
  assert controller is not None, controller_name
 
@@ -2868,12 +2865,6 @@ def _hint_or_raise_for_down_sky_serve_controller(controller_name: str,
  to be torn down (e.g., because it has services running or
  it is in init state)
  """
- # TODO(zhwu): Move this check to the sdk or even API server side.
- if not common.is_current_user_controller(controller_name):
- with ux_utils.print_exception_no_traceback():
- raise exceptions.NotSupportedError(
- f'Tearing down other user\'s sky serve controller '
- f'{controller_name!r} is not allowed.')
  controller = controller_utils.Controllers.from_name(controller_name)
  assert controller is not None, controller_name
  with rich_utils.client_status('[bold cyan]Checking for live services[/]'):
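Note on the client-side change above: the CLI no longer reconstructs a per-user controller name; it matches any cluster whose name starts with the controller prefix and queries across all users. A minimal sketch of that prefix matching, with a made-up server hash (the real suffix is the API server's user hash, which the client does not know):

    import fnmatch

    JOB_CONTROLLER_PREFIX = 'sky-jobs-controller-'
    # Hypothetical name created by the API server.
    server_side_name = JOB_CONTROLLER_PREFIX + '2ea485ea'

    # Glob-style equivalent of the wildcard passed to
    # sdk.status(cluster_names=[JOB_CONTROLLER_PREFIX + '*'], all_users=True).
    assert fnmatch.fnmatch(server_side_name, JOB_CONTROLLER_PREFIX + '*')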
sky/client/cli.py CHANGED
@@ -1419,16 +1419,16 @@ def _handle_jobs_queue_request(
  try:
  # Check the controller status again, as the RuntimeError is likely
  # due to the controller being autostopped when querying the jobs.
- controller_type = controller_utils.Controllers.JOBS_CONTROLLER
- # Query status of the controller cluster. We add a wildcard because
- # the controller cluster name can have a suffix like
- # '-remote-<hash>' when using remote API server.
+ # Since we are client-side, we may not know the exact name of the
+ # controller, so use the prefix with a wildcard.
+ # Query status of the controller cluster.
  records = sdk.get(
- sdk.status(
- cluster_names=[controller_type.value.cluster_name + '*']))
+ sdk.status(cluster_names=[common.JOB_CONTROLLER_PREFIX + '*'],
+ all_users=True))
  if (not records or
  records[0]['status'] == status_lib.ClusterStatus.STOPPED):
- msg = controller_type.value.default_hint_if_non_existent
+ controller = controller_utils.Controllers.JOBS_CONTROLLER.value
+ msg = controller.default_hint_if_non_existent
  except Exception: # pylint: disable=broad-except
  # This is to an best effort to find the latest controller status to
  # print more helpful message, so we can ignore any exception to
@@ -1494,16 +1494,18 @@ def _handle_services_request(
  # Check the controller status again, as the RuntimeError is likely
  # due to the controller being autostopped when querying the
  # services.
- controller_type = controller_utils.Controllers.SKY_SERVE_CONTROLLER
- # Query status of the controller cluster. We add a wildcard because
- # the controller cluster name can have a suffix like
- # '-remote-<hash>' when using remote API server.
+ # Since we are client-side, we may not know the exact name of the
+ # controller, so use the prefix with a wildcard.
+ # Query status of the controller cluster.
  records = sdk.get(
  sdk.status(
- cluster_names=[controller_type.value.cluster_name + '*']))
+ cluster_names=[common.SKY_SERVE_CONTROLLER_PREFIX + '*'],
+ all_users=True))
  if (not records or
  records[0]['status'] == status_lib.ClusterStatus.STOPPED):
- msg = controller_type.value.default_hint_if_non_existent
+ controller = (
+ controller_utils.Controllers.SKY_SERVE_CONTROLLER.value)
+ msg = controller.default_hint_if_non_existent
  except Exception: # pylint: disable=broad-except
  # This is to an best effort to find the latest controller status to
  # print more helpful message, so we can ignore any exception to
@@ -2804,11 +2806,6 @@ def _hint_or_raise_for_down_jobs_controller(controller_name: str,
  to be torn down (e.g., because it has jobs running or
  it is in init state)
  """
- if not common.is_current_user_controller(controller_name):
- with ux_utils.print_exception_no_traceback():
- raise exceptions.NotSupportedError(
- f'Tearing down other user\'s managed job controller '
- f'{controller_name!r} is not allowed.')
  controller = controller_utils.Controllers.from_name(controller_name)
  assert controller is not None, controller_name
 
@@ -2868,12 +2865,6 @@ def _hint_or_raise_for_down_sky_serve_controller(controller_name: str,
  to be torn down (e.g., because it has services running or
  it is in init state)
  """
- # TODO(zhwu): Move this check to the sdk or even API server side.
- if not common.is_current_user_controller(controller_name):
- with ux_utils.print_exception_no_traceback():
- raise exceptions.NotSupportedError(
- f'Tearing down other user\'s sky serve controller '
- f'{controller_name!r} is not allowed.')
  controller = controller_utils.Controllers.from_name(controller_name)
  assert controller is not None, controller_name
  with rich_utils.client_status('[bold cyan]Checking for live services[/]'):
@@ -60,8 +60,8 @@ HIDDEN_TPU_DF = pd.read_csv(
  ,tpu-v3-2048,1,,,tpu-v3-2048,2048.0,614.4,us-east1,us-east1-d
  """)))
 
- # TPU V6e price for us-central2 is missing in the SKUs.
- TPU_V6E_MISSING_REGIONS = ['us-central2']
+ # TPU V6e price for the following regions is missing in the SKUs.
+ TPU_V6E_MISSING_REGIONS = ['us-central2', 'southamerica-west1']
 
  # TPU V5 is not visible in specific zones. We hardcode the missing zones here.
  # NOTE(dev): Keep the zones and the df in sync.
sky/jobs/server/core.py CHANGED
@@ -21,10 +21,11 @@ from sky.backends import backend_utils
  from sky.clouds.service_catalog import common as service_catalog_common
  from sky.jobs import constants as managed_job_constants
  from sky.jobs import utils as managed_job_utils
- from sky.provision import common
+ from sky.provision import common as provision_common
  from sky.skylet import constants as skylet_constants
  from sky.usage import usage_lib
  from sky.utils import admin_policy_utils
+ from sky.utils import common
  from sky.utils import common_utils
  from sky.utils import controller_utils
  from sky.utils import dag_utils
@@ -149,14 +150,18 @@ def launch(
  f'{colorama.Fore.YELLOW}'
  f'Launching managed job {dag.name!r} from jobs controller...'
  f'{colorama.Style.RESET_ALL}')
- return execution.launch(task=controller_task,
- cluster_name=controller_name,
- stream_logs=stream_logs,
- idle_minutes_to_autostop=skylet_constants.
- CONTROLLER_IDLE_MINUTES_TO_AUTOSTOP,
- retry_until_up=True,
- fast=True,
- _disable_controller_check=True)
+
+ # Launch with the api server's user hash, so that sky status does not
+ # show the owner of the controller as whatever user launched it first.
+ with common.with_server_user_hash():
+ return execution.launch(task=controller_task,
+ cluster_name=controller_name,
+ stream_logs=stream_logs,
+ idle_minutes_to_autostop=skylet_constants.
+ CONTROLLER_IDLE_MINUTES_TO_AUTOSTOP,
+ retry_until_up=True,
+ fast=True,
+ _disable_controller_check=True)
 
 
  def queue_from_kubernetes_pod(
@@ -194,16 +199,16 @@ def queue_from_kubernetes_pod(
  provider_config = {'context': context}
  instances = {
  pod_name: [
- common.InstanceInfo(instance_id=pod_name,
- internal_ip='',
- external_ip='',
- tags={})
+ provision_common.InstanceInfo(instance_id=pod_name,
+ internal_ip='',
+ external_ip='',
+ tags={})
  ]
  } # Internal IP is not required for Kubernetes
- cluster_info = common.ClusterInfo(provider_name='kubernetes',
- head_instance_id=pod_name,
- provider_config=provider_config,
- instances=instances)
+ cluster_info = provision_common.ClusterInfo(provider_name='kubernetes',
+ head_instance_id=pod_name,
+ provider_config=provider_config,
+ instances=instances)
  managed_jobs_runner = provision_lib.get_command_runners(
  'kubernetes', cluster_info)[0]
 
@@ -1,4 +1,9 @@
- """Persistent dashboard sessions."""
+ """Persistent dashboard sessions.
+
+ Note: before #4717, this was useful because we needed to tunnel to multiple
+ controllers - one per user. Now, there is only one controller for the whole API
+ server, so this is not very useful. TODO(cooperc): Remove or fix this.
+ """
  import pathlib
  from typing import Tuple
 
sky/jobs/server/server.py CHANGED
@@ -21,11 +21,6 @@ logger = sky_logging.init_logger(__name__)
  router = fastapi.APIRouter()
 
 
- def _get_controller_name(request_body: payloads.RequestBody) -> str:
- user_hash = request_body.user_hash
- return common.get_controller_name(common.ControllerType.JOBS, user_hash)
-
-
  @router.post('/launch')
  async def launch(request: fastapi.Request,
  jobs_launch_body: payloads.JobsLaunchBody) -> None:
@@ -35,7 +30,7 @@ async def launch(request: fastapi.Request,
  request_body=jobs_launch_body,
  func=core.launch,
  schedule_type=api_requests.ScheduleType.LONG,
- request_cluster_name=_get_controller_name(jobs_launch_body),
+ request_cluster_name=common.JOB_CONTROLLER_NAME,
  )
 
 
@@ -49,7 +44,7 @@ async def queue(request: fastapi.Request,
  func=core.queue,
  schedule_type=(api_requests.ScheduleType.LONG if jobs_queue_body.refresh
  else api_requests.ScheduleType.SHORT),
- request_cluster_name=_get_controller_name(jobs_queue_body),
+ request_cluster_name=common.JOB_CONTROLLER_NAME,
  )
 
 
@@ -62,7 +57,7 @@ async def cancel(request: fastapi.Request,
  request_body=jobs_cancel_body,
  func=core.cancel,
  schedule_type=api_requests.ScheduleType.SHORT,
- request_cluster_name=_get_controller_name(jobs_cancel_body),
+ request_cluster_name=common.JOB_CONTROLLER_NAME,
  )
 
 
@@ -78,7 +73,7 @@ async def logs(
  func=core.tail_logs,
  schedule_type=api_requests.ScheduleType.SHORT
  if jobs_logs_body.refresh else api_requests.ScheduleType.LONG,
- request_cluster_name=_get_controller_name(jobs_logs_body),
+ request_cluster_name=common.JOB_CONTROLLER_NAME,
  )
  request_task = api_requests.get_request(request.state.request_id)
 
@@ -107,13 +102,16 @@ async def download_logs(
  func=core.download_logs,
  schedule_type=api_requests.ScheduleType.LONG
  if jobs_download_logs_body.refresh else api_requests.ScheduleType.SHORT,
- request_cluster_name=_get_controller_name(jobs_download_logs_body),
+ request_cluster_name=common.JOB_CONTROLLER_NAME,
  )
 
 
  @router.get('/dashboard')
  async def dashboard(request: fastapi.Request,
  user_hash: str) -> fastapi.Response:
+ # Note: before #4717, each user had their own controller, and thus their own
+ # dashboard. Now, all users share the same controller, so this isn't really
+ # necessary. TODO(cooperc): clean up.
  # Find the port for the dashboard of the user
  os.environ[constants.USER_ID_ENV_VAR] = user_hash
  server_common.reload_for_new_request(client_entrypoint=None,
sky/serve/server/core.py CHANGED
@@ -249,13 +249,16 @@ def up(
  # with the current job id, we know the service is up and running
  # for the first time; otherwise it is a name conflict.
  idle_minutes_to_autostop = constants.CONTROLLER_IDLE_MINUTES_TO_AUTOSTOP
- controller_job_id, controller_handle = execution.launch(
- task=controller_task,
- cluster_name=controller_name,
- idle_minutes_to_autostop=idle_minutes_to_autostop,
- retry_until_up=True,
- _disable_controller_check=True,
- )
+ # Since the controller may be shared among multiple users, launch the
+ # controller with the API server's user hash.
+ with common.with_server_user_hash():
+ controller_job_id, controller_handle = execution.launch(
+ task=controller_task,
+ cluster_name=controller_name,
+ idle_minutes_to_autostop=idle_minutes_to_autostop,
+ retry_until_up=True,
+ _disable_controller_check=True,
+ )
 
  style = colorama.Style
  fore = colorama.Fore
@@ -14,11 +14,6 @@ logger = sky_logging.init_logger(__name__)
  router = fastapi.APIRouter()
 
 
- def _get_controller_name(request_body: payloads.RequestBody) -> str:
- user_hash = request_body.user_hash
- return common.get_controller_name(common.ControllerType.SERVE, user_hash)
-
-
  @router.post('/up')
  async def up(
  request: fastapi.Request,
@@ -30,7 +25,7 @@ async def up(
  request_body=up_body,
  func=core.up,
  schedule_type=api_requests.ScheduleType.LONG,
- request_cluster_name=_get_controller_name(up_body),
+ request_cluster_name=common.SKY_SERVE_CONTROLLER_NAME,
  )
 
 
@@ -45,7 +40,7 @@ async def update(
  request_body=update_body,
  func=core.update,
  schedule_type=api_requests.ScheduleType.SHORT,
- request_cluster_name=_get_controller_name(update_body),
+ request_cluster_name=common.SKY_SERVE_CONTROLLER_NAME,
  )
 
 
@@ -60,7 +55,7 @@ async def down(
  request_body=down_body,
  func=core.down,
  schedule_type=api_requests.ScheduleType.SHORT,
- request_cluster_name=_get_controller_name(down_body),
+ request_cluster_name=common.SKY_SERVE_CONTROLLER_NAME,
  )
 
 
@@ -75,7 +70,7 @@ async def terminate_replica(
  request_body=terminate_replica_body,
  func=core.terminate_replica,
  schedule_type=api_requests.ScheduleType.SHORT,
- request_cluster_name=_get_controller_name(terminate_replica_body),
+ request_cluster_name=common.SKY_SERVE_CONTROLLER_NAME,
  )
 
 
@@ -90,7 +85,7 @@ async def status(
  request_body=status_body,
  func=core.status,
  schedule_type=api_requests.ScheduleType.SHORT,
- request_cluster_name=_get_controller_name(status_body),
+ request_cluster_name=common.SKY_SERVE_CONTROLLER_NAME,
  )
 
 
@@ -105,7 +100,7 @@ async def tail_logs(
  request_body=log_body,
  func=core.tail_logs,
  schedule_type=api_requests.ScheduleType.SHORT,
- request_cluster_name=_get_controller_name(log_body),
+ request_cluster_name=common.SKY_SERVE_CONTROLLER_NAME,
  )
 
  request_task = api_requests.get_request(request.state.request_id)
sky/server/common.py CHANGED
@@ -3,7 +3,6 @@
  import dataclasses
  import enum
  import functools
- import importlib
  import json
  import os
  import pathlib
@@ -16,7 +15,6 @@ import uuid
 
  import colorama
  import filelock
- import psutil
  import pydantic
  import requests
 
@@ -28,14 +26,12 @@ from sky.server import constants as server_constants
  from sky.skylet import constants
  from sky.usage import usage_lib
  from sky.utils import annotations
- from sky.utils import common
  from sky.utils import common_utils
  from sky.utils import rich_utils
  from sky.utils import ux_utils
 
  if typing.TYPE_CHECKING:
  from sky import dag as dag_lib
- from sky.server.requests import payloads
 
  DEFAULT_SERVER_URL = 'http://127.0.0.1:46580'
  AVAILBLE_LOCAL_API_SERVER_HOSTS = ['0.0.0.0', 'localhost', '127.0.0.1']
@@ -149,13 +145,14 @@ def get_api_server_status(endpoint: Optional[str] = None) -> ApiServerInfo:
  return ApiServerInfo(status=ApiServerStatus.UNHEALTHY, api_version=None)
 
 
- def start_uvicorn_in_background(deploy: bool = False, host: str = '127.0.0.1'):
+ def start_api_server_in_background(deploy: bool = False,
+ host: str = '127.0.0.1'):
  if not is_api_server_local():
  raise RuntimeError(
  f'Cannot start API server: {get_server_url()} is not a local URL')
 
  # Check available memory before starting the server.
- avail_mem_size_gb: float = psutil.virtual_memory().available / (1024**3)
+ avail_mem_size_gb: float = common_utils.get_mem_size_gb()
  if avail_mem_size_gb <= server_constants.MIN_AVAIL_MEM_GB:
  logger.warning(
  f'{colorama.Fore.YELLOW}Your SkyPilot API server machine only has '
@@ -166,8 +163,6 @@ def start_uvicorn_in_background(deploy: bool = False, host: str = '127.0.0.1'):
  log_path = os.path.expanduser(constants.API_SERVER_LOGS)
  os.makedirs(os.path.dirname(log_path), exist_ok=True)
 
- # The command to run uvicorn. Adjust the app:app to your application's
- # location.
  api_server_cmd = API_SERVER_CMD
  if deploy:
  api_server_cmd += ' --deploy'
@@ -175,7 +170,7 @@ def start_uvicorn_in_background(deploy: bool = False, host: str = '127.0.0.1'):
  api_server_cmd += f' --host {host}'
  cmd = f'{sys.executable} {api_server_cmd} > {log_path} 2>&1'
 
- # Start the uvicorn process in the background and don't wait for it.
+ # Start the API server process in the background and don't wait for it.
  # If this is called from a CLI invocation, we need start_new_session=True so
  # that SIGINT on the CLI will not also kill the API server.
  subprocess.Popen(cmd, shell=True, start_new_session=True)
@@ -235,7 +230,7 @@ def _start_api_server(deploy: bool = False, host: str = '127.0.0.1'):
  f'SkyPilot API server at {server_url}. '
  'Starting a local server.'
  f'{colorama.Style.RESET_ALL}')
- start_uvicorn_in_background(deploy=deploy, host=host)
+ start_api_server_in_background(deploy=deploy, host=host)
  logger.info(ux_utils.finishing_message('SkyPilot API server started.'))
 
 
@@ -407,23 +402,6 @@ def request_body_to_params(body: pydantic.BaseModel) -> Dict[str, Any]:
  def reload_for_new_request(client_entrypoint: Optional[str],
  client_command: Optional[str]):
  """Reload modules, global variables, and usage message for a new request."""
- # When a user request is sent to api server, it changes the user hash in the
- # env vars, but since controller_utils is imported before the env vars are
- # set, it doesn't get updated. So we need to reload it here.
- # pylint: disable=import-outside-toplevel
- from sky.utils import controller_utils
- common.SKY_SERVE_CONTROLLER_NAME = common.get_controller_name(
- common.ControllerType.SERVE)
- common.JOB_CONTROLLER_NAME = common.get_controller_name(
- common.ControllerType.JOBS)
- # TODO(zhwu): We should avoid reloading the controller_utils module.
- # Instead, we should reload required cache or global variables.
- # TODO(zhwu): Reloading the controller_utils module may cause the global
- # variables in other modules referring the `controller_utils.Controllers`
- # dangling, as they will be pointing to the old object. We should not use
- # it in global variables.
- importlib.reload(controller_utils)
-
  # Reset the client entrypoint and command for the usage message.
  common_utils.set_client_entrypoint_and_command(
  client_entrypoint=client_entrypoint,
@@ -32,7 +32,6 @@ import traceback
  import typing
  from typing import Any, Callable, Generator, List, Optional, TextIO, Tuple
 
- import psutil
  import setproctitle
 
  from sky import global_user_state
@@ -70,18 +69,36 @@ logger = sky_logging.init_logger(__name__)
  # platforms, including macOS.
  multiprocessing.set_start_method('spawn', force=True)
 
- # Constants based on profiling the peak memory usage of
- # various sky commands. See `tests/load_test/` for details.
- # Max memory consumption for each request.
- _PER_BLOCKING_REQUEST_MEM_GB = 0.25
- _PER_NON_BLOCKING_REQUEST_MEM_GB = 0.15
- # To control the number of blocking workers.
- _CPU_MULTIPLIER_FOR_BLOCKING_WORKERS = 2
- _MAX_BLOCKING_WORKERS_LOCAL = 4
- # Percentage of memory for blocking requests
+ # Constants based on profiling the peak memory usage while serving various
+ # sky commands. These estimation are highly related to usage patterns
+ # (clouds enabled, type of requests, etc. see `tests/load_tests` for details.),
+ # the profiling covers major clouds and common usage patterns. For user has
+ # deviated usage pattern, they can override the default estimation by
+ # environment variables.
+ # NOTE(dev): update these constants for each release according to the load
+ # test results.
+ # TODO(aylei): maintaining these constants is error-prone, we may need to
+ # automatically tune parallelism at runtime according to system usage stats
+ # in the future.
+ _LONG_WORKER_MEM_GB = 0.4
+ _SHORT_WORKER_MEM_GB = 0.25
+ # To control the number of long workers.
+ _CPU_MULTIPLIER_FOR_LONG_WORKERS = 2
+ # Limit the number of long workers of local API server, since local server is
+ # typically:
+ # 1. launched automatically in an environment with high resource contention
+ # (e.g. Laptop)
+ # 2. used by a single user
+ _MAX_LONG_WORKERS_LOCAL = 4
+ # Percentage of memory for long requests
  # from the memory reserved for SkyPilot.
- # This is to reserve some memory for non-blocking requests.
+ # This is to reserve some memory for short requests.
  _MAX_MEM_PERCENT_FOR_BLOCKING = 0.6
+ # Minimal number of long workers to ensure responsiveness.
+ _MIN_LONG_WORKERS = 1
+ # Minimal number of short workers, there is a daemon task running on short
+ # workers so at least 2 workers are needed to ensure responsiveness.
+ _MIN_SHORT_WORKERS = 2
 
 
  class QueueBackend(enum.Enum):
@@ -301,34 +318,32 @@ def schedule_request(request_id: str,
  _get_queue(schedule_type).put(input_tuple)
 
 
+ def executor_initializer(proc_group: str):
+ setproctitle.setproctitle(f'SkyPilot:executor:{proc_group}:'
+ f'{multiprocessing.current_process().pid}')
+
+
  def request_worker(worker: RequestWorker, max_parallel_size: int) -> None:
  """Worker for the requests.
 
  Args:
  max_parallel_size: Maximum number of parallel jobs this worker can run.
  """
- logger.info(f'Starting {worker} with pid '
- f'{multiprocessing.current_process().pid}')
- setproctitle.setproctitle(
- f'SkyPilot:worker:{worker.schedule_type.value}-{worker.id}')
+ proc_group = f'{worker.schedule_type.value}-{worker.id}'
+ setproctitle.setproctitle(f'SkyPilot:worker:{proc_group}')
  queue = _get_queue(worker.schedule_type)
- # Use concurrent.futures.ProcessPoolExecutor instead of multiprocessing.Pool
- # because the former is more efficient with the support of lazy creation of
- # worker processes.
- # We use executor instead of individual multiprocessing.Process to avoid
- # the overhead of forking a new process for each request, which can be about
- # 1s delay.
- with concurrent.futures.ProcessPoolExecutor(
- max_workers=max_parallel_size) as executor:
- while True:
+
+ def process_request(executor: concurrent.futures.ProcessPoolExecutor):
+ try:
  request_element = queue.get()
  if request_element is None:
  time.sleep(0.1)
- continue
+ return
  request_id, ignore_return_value = request_element
  request = api_requests.get_request(request_id)
+ assert request is not None, f'Request with ID {request_id} is None'
  if request.status == api_requests.RequestStatus.CANCELLED:
- continue
+ return
  logger.info(f'[{worker}] Submitting request: {request_id}')
  # Start additional process to run the request, so that it can be
  # cancelled when requested by a user.
@@ -347,60 +362,49 @@ def request_worker(worker: RequestWorker, max_parallel_size: int) -> None:
  logger.info(f'[{worker}] Finished request: {request_id}')
  else:
  logger.info(f'[{worker}] Submitted request: {request_id}')
+ except KeyboardInterrupt:
+ # Interrupt the worker process will stop request execution, but
+ # the SIGTERM request should be respected anyway since it might
+ # be explicitly sent by user.
+ # TODO(aylei): crash the API server or recreate the worker process
+ # to avoid broken state.
+ logger.error(f'[{worker}] Worker process interrupted')
+ raise
+ except (Exception, SystemExit) as e: # pylint: disable=broad-except
+ # Catch any other exceptions to avoid crashing the worker process.
+ logger.error(
+ f'[{worker}] Error processing request {request_id}: '
+ f'{common_utils.format_exception(e, use_bracket=True)}')
 
-
- def _get_cpu_count() -> int:
- """Get the number of CPUs.
-
- If the API server is deployed as a pod in k8s cluster, we assume the
- number of CPUs is provided by the downward API.
- """
- cpu_count = os.getenv('SKYPILOT_POD_CPU_CORE_LIMIT')
- if cpu_count is not None:
- try:
- return int(float(cpu_count))
- except ValueError as e:
- with ux_utils.print_exception_no_traceback():
- raise ValueError(
- f'Failed to parse the number of CPUs from {cpu_count}'
- ) from e
- return psutil.cpu_count()
-
-
- def _get_mem_size_gb() -> float:
- """Get the memory size in GB.
-
- If the API server is deployed as a pod in k8s cluster, we assume the
- memory size is provided by the downward API.
- """
- mem_size = os.getenv('SKYPILOT_POD_MEMORY_GB_LIMIT')
- if mem_size is not None:
- try:
- return float(mem_size)
- except ValueError as e:
- with ux_utils.print_exception_no_traceback():
- raise ValueError(
- f'Failed to parse the memory size from {mem_size}') from e
- return psutil.virtual_memory().total / (1024**3)
+ # Use concurrent.futures.ProcessPoolExecutor instead of multiprocessing.Pool
+ # because the former is more efficient with the support of lazy creation of
+ # worker processes.
+ # We use executor instead of individual multiprocessing.Process to avoid
+ # the overhead of forking a new process for each request, which can be about
+ # 1s delay.
+ with concurrent.futures.ProcessPoolExecutor(
+ max_workers=max_parallel_size,
+ initializer=executor_initializer,
+ initargs=(proc_group,)) as executor:
+ while True:
+ process_request(executor)
 
 
  def start(deploy: bool) -> List[multiprocessing.Process]:
  """Start the request workers."""
  # Determine the job capacity of the workers based on the system resources.
- cpu_count = _get_cpu_count()
- mem_size_gb = _get_mem_size_gb()
+ cpu_count = common_utils.get_cpu_count()
+ mem_size_gb = common_utils.get_mem_size_gb()
  mem_size_gb = max(0, mem_size_gb - server_constants.MIN_AVAIL_MEM_GB)
- parallel_for_blocking = _max_parallel_size_for_blocking(
- cpu_count, mem_size_gb)
- if not deploy:
- parallel_for_blocking = min(parallel_for_blocking,
- _MAX_BLOCKING_WORKERS_LOCAL)
- max_parallel_for_non_blocking = _max_parallel_size_for_non_blocking(
- mem_size_gb, parallel_for_blocking)
+ max_parallel_for_long = _max_long_worker_parallism(cpu_count,
+ mem_size_gb,
+ local=not deploy)
+ max_parallel_for_short = _max_short_worker_parallism(
+ mem_size_gb, max_parallel_for_long)
  logger.info(
- f'SkyPilot API server will start {parallel_for_blocking} workers for '
- f'blocking requests and will allow at max '
- f'{max_parallel_for_non_blocking} non-blocking requests in parallel.')
+ f'SkyPilot API server will start {max_parallel_for_long} workers for '
+ f'long requests and will allow at max '
+ f'{max_parallel_for_short} short requests in parallel.')
 
  # Setup the queues.
  if queue_backend == QueueBackend.MULTIPROCESSING:
@@ -424,7 +428,7 @@ def start(deploy: bool) -> List[multiprocessing.Process]:
  logger.info('Request queues created')
 
  worker_procs = []
- for worker_id in range(parallel_for_blocking):
+ for worker_id in range(max_parallel_for_long):
  worker = RequestWorker(id=worker_id,
  schedule_type=api_requests.ScheduleType.LONG)
  worker_proc = multiprocessing.Process(target=request_worker,
@@ -432,31 +436,34 @@ def start(deploy: bool) -> List[multiprocessing.Process]:
  worker_proc.start()
  worker_procs.append(worker_proc)
 
- # Start a non-blocking worker.
+ # Start a worker for short requests.
  worker = RequestWorker(id=1, schedule_type=api_requests.ScheduleType.SHORT)
  worker_proc = multiprocessing.Process(target=request_worker,
- args=(worker,
- max_parallel_for_non_blocking))
+ args=(worker, max_parallel_for_short))
  worker_proc.start()
  worker_procs.append(worker_proc)
  return worker_procs
 
 
  @annotations.lru_cache(scope='global', maxsize=1)
- def _max_parallel_size_for_blocking(cpu_count: int, mem_size_gb: float) -> int:
- """Max parallelism for blocking requests."""
- cpu_based_max_parallel = cpu_count * _CPU_MULTIPLIER_FOR_BLOCKING_WORKERS
+ def _max_long_worker_parallism(cpu_count: int,
+ mem_size_gb: float,
+ local=False) -> int:
+ """Max parallelism for long workers."""
+ cpu_based_max_parallel = cpu_count * _CPU_MULTIPLIER_FOR_LONG_WORKERS
  mem_based_max_parallel = int(mem_size_gb * _MAX_MEM_PERCENT_FOR_BLOCKING /
- _PER_BLOCKING_REQUEST_MEM_GB)
- n = max(1, min(cpu_based_max_parallel, mem_based_max_parallel))
+ _LONG_WORKER_MEM_GB)
+ n = max(_MIN_LONG_WORKERS,
+ min(cpu_based_max_parallel, mem_based_max_parallel))
+ if local:
+ return min(n, _MAX_LONG_WORKERS_LOCAL)
  return n
 
 
  @annotations.lru_cache(scope='global', maxsize=1)
- def _max_parallel_size_for_non_blocking(mem_size_gb: float,
- parallel_size_for_blocking: int) -> int:
- """Max parallelism for non-blocking requests."""
- available_mem = mem_size_gb - (parallel_size_for_blocking *
- _PER_BLOCKING_REQUEST_MEM_GB)
- n = max(1, int(available_mem / _PER_NON_BLOCKING_REQUEST_MEM_GB))
+ def _max_short_worker_parallism(mem_size_gb: float,
+ long_worker_parallism: int) -> int:
+ """Max parallelism for short workers."""
+ available_mem = mem_size_gb - (long_worker_parallism * _LONG_WORKER_MEM_GB)
+ n = max(_MIN_SHORT_WORKERS, int(available_mem / _SHORT_WORKER_MEM_GB))
  return n
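Taken together, the constants and helpers above size the two worker pools from the machine's CPU and memory. A worked example under assumed hardware (8 CPUs, 16 GB RAM); the 2 GB reserve stands in for server_constants.MIN_AVAIL_MEM_GB, whose value is not shown in this diff:

    # Constants copied from the hunk above; size_workers mirrors start(),
    # _max_long_worker_parallism and _max_short_worker_parallism.
    _LONG_WORKER_MEM_GB = 0.4
    _SHORT_WORKER_MEM_GB = 0.25
    _CPU_MULTIPLIER_FOR_LONG_WORKERS = 2
    _MAX_LONG_WORKERS_LOCAL = 4
    _MAX_MEM_PERCENT_FOR_BLOCKING = 0.6
    _MIN_LONG_WORKERS = 1
    _MIN_SHORT_WORKERS = 2

    def size_workers(cpu_count: int, total_mem_gb: float, deploy: bool,
                     reserve_gb: float = 2.0):
        mem_gb = max(0, total_mem_gb - reserve_gb)
        long_n = max(_MIN_LONG_WORKERS,
                     min(cpu_count * _CPU_MULTIPLIER_FOR_LONG_WORKERS,
                         int(mem_gb * _MAX_MEM_PERCENT_FOR_BLOCKING /
                             _LONG_WORKER_MEM_GB)))
        if not deploy:
            # Local API server: cap the number of long workers.
            long_n = min(long_n, _MAX_LONG_WORKERS_LOCAL)
        short_n = max(_MIN_SHORT_WORKERS,
                      int((mem_gb - long_n * _LONG_WORKER_MEM_GB) /
                          _SHORT_WORKER_MEM_GB))
        return long_n, short_n

    print(size_workers(8, 16, deploy=True))   # (16, 30): deployed server
    print(size_workers(8, 16, deploy=False))  # (4, 49): local server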
sky/server/server.py CHANGED
@@ -57,7 +57,9 @@ P = ParamSpec('P')
 
  def _add_timestamp_prefix_for_server_logs() -> None:
  server_logger = sky_logging.init_logger('sky.server')
- # Disable propagation to avoid the root logger of SkyPilot being affected.
+ # Clear existing handlers first to prevent duplicates
+ server_logger.handlers.clear()
+ # Disable propagation to avoid the root logger of SkyPilot being affected
  server_logger.propagate = False
  # Add date prefix to the log message printed by loggers under
  # server.
@@ -460,6 +462,7 @@ async def launch(launch_body: payloads.LaunchBody,
  request: fastapi.Request) -> None:
  """Launches a cluster or task."""
  request_id = request.state.request_id
+ logger.info(f'Launching request: {request_id}')
  executor.schedule_request(
  request_id,
  request_name='launch',
@@ -627,6 +630,9 @@ async def logs(
  request_name='logs',
  request_body=cluster_job_body,
  func=core.tail_logs,
+ # TODO(aylei): We have tail logs scheduled as SHORT request, because it
+ # should be responsive. However, it can be long running if the user's
+ # job keeps running, and we should avoid it taking the SHORT worker.
  schedule_type=requests_lib.ScheduleType.SHORT,
  request_cluster_name=cluster_job_body.cluster_name,
  )
@@ -794,10 +800,9 @@ async def api_get(request_id: str) -> requests_lib.RequestPayload:
  detail=dataclasses.asdict(
  request_task.encode()))
  return request_task.encode()
- # Sleep 0 to yield, so other coroutines can run. This busy waiting
- # loop is performance critical for short-running requests, so we do
- # not want to yield too long.
- await asyncio.sleep(0)
+ # yield control to allow other coroutines to run, sleep shortly
+ # to avoid storming the DB and CPU in the meantime
+ await asyncio.sleep(0.1)
 
 
  @app.get('/api/stream')
@@ -68,7 +68,7 @@ async def log_streamer(request_id: Optional[str],
  # Sleep 0 to yield, so other coroutines can run. This busy waiting
  # loop is performance critical for short-running requests, so we do
  # not want to yield too long.
- await asyncio.sleep(0)
+ await asyncio.sleep(0.1)
  request_task = requests_lib.get_request(request_id)
  if not follow:
  break
@@ -88,6 +88,9 @@ async def log_streamer(request_id: Optional[str],
  yield line_str
 
  while True:
+ # Sleep 0 to yield control to allow other coroutines to run,
+ # while keeps the loop tight to make log stream responsive.
+ await asyncio.sleep(0)
  line: Optional[bytes] = await f.readline()
  if not line:
  if request_id is not None:
@@ -100,24 +103,18 @@ async def log_streamer(request_id: Optional[str],
  break
  if not follow:
  break
-
- # Sleep 0 to yield, so other coroutines can run. This busy
- # waiting loop is performance critical for short-running
- # requests, so we do not want to yield too long.
- await asyncio.sleep(0)
+ # Sleep shortly to avoid storming the DB and CPU, this has
+ # little impact on the responsivness here since we are waiting
+ # for a new line to come in.
+ await asyncio.sleep(0.1)
  continue
  line_str = line.decode('utf-8')
  if plain_logs:
  is_payload, line_str = message_utils.decode_payload(
  line_str, raise_for_mismatch=False)
  if is_payload:
- # Sleep 0 to yield, so other coroutines can run. This busy
- # waiting loop is performance critical for short-running
- # requests, so we do not want to yield too long.
- await asyncio.sleep(0)
  continue
  yield line_str
- await asyncio.sleep(0) # Allow other tasks to run
 
 
  def stream_response(
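The sleep changes in the two files above trade a little latency for much less busy-waiting: asyncio.sleep(0) merely yields to the event loop and resumes immediately, so a wait loop built on it hammers the requests DB, while asyncio.sleep(0.1) parks the coroutine between polls. A rough illustration (timings approximate, for intuition only):

    import asyncio
    import time

    async def poll(interval: float, iterations: int) -> float:
        start = time.monotonic()
        for _ in range(iterations):
            await asyncio.sleep(interval)  # 0 -> immediate resume, 0.1 -> real pause
        return time.monotonic() - start

    # poll(0, 1000) returns almost instantly, i.e. thousands of DB checks per
    # second in a real wait loop; poll(0.1, 10) takes about a second and keeps
    # the loop quiet. The tight await asyncio.sleep(0) is kept only where a new
    # log line is usually ready, so responsiveness matters more than CPU.
    print(asyncio.run(poll(0, 1000)), asyncio.run(poll(0.1, 10)))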
sky/utils/common.py CHANGED
@@ -1,53 +1,41 @@
  """Common enumerators and classes."""
 
+ import contextlib
  import enum
- from typing import Optional
+ import os
+ from typing import Generator
 
+ from sky.skylet import constants
  from sky.utils import common_utils
 
  SKY_SERVE_CONTROLLER_PREFIX: str = 'sky-serve-controller-'
  JOB_CONTROLLER_PREFIX: str = 'sky-jobs-controller-'
- SERVER_ID_CONNECTOR: str = '-remote-'
- # We use the user hash (machine-specific) hash of the server to determine if a
- # SkyPilot API server is started by the same user. It will be the same across
- # the whole lifecycle of the server, including:
+ # We use the user hash (machine-specific) for the controller name. It will be
+ # the same across the whole lifecycle of the server, including:
  # 1. all requests, because this global variable is set once during server
  # starts.
  # 2. SkyPilot API server restarts, as long as the `~/.sky` folder is persisted
  # and the env var set during starting the server is the same.
+ # This behavior is the same for the local API server (where SERVER_ID is the
+ # same as the normal user hash). This ensures backwards-compatibility with jobs
+ # controllers from before #4660.
  SERVER_ID = common_utils.get_user_hash()
+ SKY_SERVE_CONTROLLER_NAME: str = f'{SKY_SERVE_CONTROLLER_PREFIX}{SERVER_ID}'
+ JOB_CONTROLLER_NAME: str = f'{JOB_CONTROLLER_PREFIX}{SERVER_ID}'
 
 
- class ControllerType(enum.Enum):
- SERVE = 'SERVE'
- JOBS = 'JOBS'
-
-
- def get_controller_name(controller_type: ControllerType,
- user_hash: Optional[str] = None) -> str:
- prefix = JOB_CONTROLLER_PREFIX
- if controller_type == ControllerType.SERVE:
- prefix = SKY_SERVE_CONTROLLER_PREFIX
- if user_hash is None:
- user_hash = common_utils.get_user_hash()
- # Comparing the two IDs can determine if the caller is trying to get the
- # controller created by their local API server or a remote API server.
- if user_hash == SERVER_ID:
- # Not adding server ID for locally created controller because
- # of backward compatibility.
- return f'{prefix}{user_hash}'
- return f'{prefix}{user_hash}{SERVER_ID_CONNECTOR}{SERVER_ID}'
-
-
- # Controller names differ per user and per SkyPilot API server.
- # If local: <prefix>-<user_id>
- # If remote: <prefix>-<user_id>-remote-<api_server_user_id>
- # DO NOT use these variables on the client side because client side doesn't know
- # the remote server's user id, so client side will get local-version controller
- # name.
- # TODO(SKY-1106): remove dynamic constants like this.
- SKY_SERVE_CONTROLLER_NAME: str = get_controller_name(ControllerType.SERVE)
- JOB_CONTROLLER_NAME: str = get_controller_name(ControllerType.JOBS)
+ @contextlib.contextmanager
+ def with_server_user_hash() -> Generator[None, None, None]:
+ """Temporarily set the user hash to common.SERVER_ID."""
+ old_env_user_hash = os.getenv(constants.USER_ID_ENV_VAR)
+ os.environ[constants.USER_ID_ENV_VAR] = SERVER_ID
+ try:
+ yield
+ finally:
+ if old_env_user_hash is not None:
+ os.environ[constants.USER_ID_ENV_VAR] = old_env_user_hash
+ else:
+ os.environ.pop(constants.USER_ID_ENV_VAR)
 
 
  class StatusRefreshMode(enum.Enum):
@@ -64,11 +52,3 @@ class StatusRefreshMode(enum.Enum):
  class OptimizeTarget(enum.Enum):
  COST = 0
  TIME = 1
-
-
- def is_current_user_controller(controller_name: str) -> bool:
- """If the controller name belongs to the current user."""
- if SERVER_ID_CONNECTOR in controller_name:
- controller_name = controller_name.split(SERVER_ID_CONNECTOR)[0]
- controller_user_id = controller_name.split('-')[-1]
- return controller_user_id == common_utils.get_user_hash()
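with_server_user_hash() above is what lets the jobs and serve code paths (see the sky/jobs/server/core.py and sky/serve/server/core.py hunks earlier) create the shared controller under the API server's identity and then restore the requesting user's hash. A small usage sketch with stand-in values; the real variable name comes from sky.skylet.constants.USER_ID_ENV_VAR, which is not shown in this diff:

    import contextlib
    import os

    USER_ID_ENV_VAR = 'SKYPILOT_USER_ID'  # assumed name, for illustration only
    SERVER_ID = 'serverhash'              # stand-in for the server's user hash

    @contextlib.contextmanager
    def with_server_user_hash():
        old = os.getenv(USER_ID_ENV_VAR)
        os.environ[USER_ID_ENV_VAR] = SERVER_ID
        try:
            yield
        finally:
            if old is not None:
                os.environ[USER_ID_ENV_VAR] = old
            else:
                os.environ.pop(USER_ID_ENV_VAR)

    os.environ[USER_ID_ENV_VAR] = 'alice-hash'
    with with_server_user_hash():
        # execution.launch(...) runs here with the server's hash, so the
        # controller cluster is owned by the API server, not by 'alice-hash'.
        assert os.environ[USER_ID_ENV_VAR] == SERVER_ID
    assert os.environ[USER_ID_ENV_VAR] == 'alice-hash'  # caller's hash restored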
sky/utils/common_utils.py CHANGED
@@ -18,6 +18,7 @@ import uuid
 
  import jinja2
  import jsonschema
+ import psutil
  import yaml
 
  from sky import exceptions
@@ -755,3 +756,40 @@ def is_port_available(port: int, reuse_addr: bool = True) -> bool:
  return True
  except OSError:
  return False
+
+
+ # TODO(aylei): should be aware of cgroups
+ def get_cpu_count() -> int:
+ """Get the number of CPUs.
+
+ If the API server is deployed as a pod in k8s cluster, we assume the
+ number of CPUs is provided by the downward API.
+ """
+ cpu_count = os.getenv('SKYPILOT_POD_CPU_CORE_LIMIT')
+ if cpu_count is not None:
+ try:
+ return int(float(cpu_count))
+ except ValueError as e:
+ with ux_utils.print_exception_no_traceback():
+ raise ValueError(
+ f'Failed to parse the number of CPUs from {cpu_count}'
+ ) from e
+ return psutil.cpu_count()
+
+
+ # TODO(aylei): should be aware of cgroups
+ def get_mem_size_gb() -> float:
+ """Get the memory size in GB.
+
+ If the API server is deployed as a pod in k8s cluster, we assume the
+ memory size is provided by the downward API.
+ """
+ mem_size = os.getenv('SKYPILOT_POD_MEMORY_GB_LIMIT')
+ if mem_size is not None:
+ try:
+ return float(mem_size)
+ except ValueError as e:
+ with ux_utils.print_exception_no_traceback():
+ raise ValueError(
+ f'Failed to parse the memory size from {mem_size}') from e
+ return psutil.virtual_memory().total / (1024**3)
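The two helpers above were moved here from sky/server/requests/executor.py so that sky/server/common.py can reuse them. A brief usage sketch (values illustrative; in a Kubernetes deployment the two env vars would be populated via the downward API, otherwise both helpers fall back to psutil on the host):

    import os
    from sky.utils import common_utils

    os.environ['SKYPILOT_POD_CPU_CORE_LIMIT'] = '4'
    os.environ['SKYPILOT_POD_MEMORY_GB_LIMIT'] = '8'
    assert common_utils.get_cpu_count() == 4
    assert common_utils.get_mem_size_gb() == 8.0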
@@ -91,10 +91,6 @@ class Controllers(enum.Enum):
  JOBS_CONTROLLER = _ControllerSpec(
  controller_type='jobs',
  name='managed jobs controller',
- # Default cluster name is the current user's controller cluster unless
- # caller initiate with a different controller name.
- # TODO(zhwu): by having the controller name loaded in common, it
- # will not respect the latest updated user hash.
  cluster_name=common.JOB_CONTROLLER_NAME,
  in_progress_hint=(
  '* {job_info}To see all managed jobs: '
@@ -164,13 +160,18 @@ class Controllers(enum.Enum):
  if name is None:
  return None
  controller = None
+ # The controller name is always the same. However, on the client-side,
+ # we may not know the exact name, because we are missing the server-side
+ # common.SERVER_ID. So, we will assume anything that matches the prefix
+ # is a controller.
  if name.startswith(common.SKY_SERVE_CONTROLLER_PREFIX):
  controller = cls.SKY_SERVE_CONTROLLER
  elif name.startswith(common.JOB_CONTROLLER_PREFIX):
  controller = cls.JOBS_CONTROLLER
  if controller is not None and name != controller.value.cluster_name:
- # Input name is not the current user's controller name,
- # so need to set the controller's cluster name to the input name.
+ # The client-side cluster_name is not accurate. Assume that `name`
+ # is the actual cluster name, so need to set the controller's
+ # cluster name to the input name.
  controller.value.cluster_name = name
  return controller
 
@@ -1,6 +1,6 @@
  Metadata-Version: 2.2
  Name: skypilot-nightly
- Version: 1.0.0.dev20250218
+ Version: 1.0.0.dev20250220
  Summary: SkyPilot: An intercloud broker for the clouds
  Author: SkyPilot Team
  License: Apache 2.0
@@ -1,8 +1,8 @@
- sky/__init__.py,sha256=Pna6srpHox46eHU0RFPPTH60skVAjqhun9lDfN7QwHM,6391
+ sky/__init__.py,sha256=2WOLIr_y7h-Dzd_2cUqq56HiHaF6TBVULtoUaAeb-5c,6391
  sky/admin_policy.py,sha256=hPo02f_A32gCqhUueF0QYy1fMSSKqRwYEg_9FxScN_s,3248
  sky/authentication.py,sha256=hCEqi77nprQEg3ktfRL51xiiw16zwZOmFEDB_Z7fWVU,22384
  sky/check.py,sha256=NDKx_Zm7YRxPjMv82wz3ESLnGIPljaACyqVdVNM0PzY,11258
- sky/cli.py,sha256=Z_w8p_qGWdg2-7yWlimrqmaB1Yqy1hXWQkGy08jEee4,218814
+ sky/cli.py,sha256=iwYBgEt3tgsYmOIp-ivPmL2FHoalvhH4Ng--C31ubws,218201
  sky/cloud_stores.py,sha256=-95XIqi_ouo7hvoN5mQNP6bGm07MyF6Yk-YP4Txb5wg,24034
  sky/core.py,sha256=gw_TrQOxz28sLAJJq6ajPnlRlrKQ2G1DtqLuntMejFU,45508
  sky/dag.py,sha256=Yl7Ry26Vql5cv4YMz8g9kOUgtoCihJnw7c8NgZYakMY,3242
@@ -42,7 +42,7 @@ sky/benchmark/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  sky/benchmark/benchmark_state.py,sha256=X8CXmuU9KgsDRhKedhFgjeRMUFWtQsjFs1qECvPG2yg,8723
  sky/benchmark/benchmark_utils.py,sha256=o4RymqSceq5mLEZL0upQM6NVEzJJQzj9s9tTm49uUTc,26365
  sky/client/__init__.py,sha256=pz6xvVSd9X-gwqbsDL0E9QOojYqM0KAD0j-NCyCIF1k,38
- sky/client/cli.py,sha256=Z_w8p_qGWdg2-7yWlimrqmaB1Yqy1hXWQkGy08jEee4,218814
+ sky/client/cli.py,sha256=iwYBgEt3tgsYmOIp-ivPmL2FHoalvhH4Ng--C31ubws,218201
  sky/client/common.py,sha256=axDic7WOG1e78SdFm5XIwdhX7YNvf3g4k7INrsW3X4s,14611
  sky/client/sdk.py,sha256=q5R0_AquHAiLSLXpha8fIecQ9cgqqFba436xVzJ48oI,66943
  sky/clouds/__init__.py,sha256=taKUCz6gWoKZhqHLYJXX-d0Ux6ZSQZEwxcNFdniupL0,1365
@@ -87,7 +87,7 @@ sky/clouds/service_catalog/data_fetchers/fetch_aws.py,sha256=Zj4bqWPiDcT_ZFyHxQw
  sky/clouds/service_catalog/data_fetchers/fetch_azure.py,sha256=7YVnoGDGGZI2TK02bj_LOoD4E5J5CFl6eqz2XlR4Vy8,12790
  sky/clouds/service_catalog/data_fetchers/fetch_cudo.py,sha256=52P48lvWN0s1ArjeLPeLemPRpxjSRcHincRle0nqdm4,3440
  sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py,sha256=yKuAFbjBRNz_e2RNNDT_aHHAuKQ86Ac7GKgIie5O6Pg,7273
- sky/clouds/service_catalog/data_fetchers/fetch_gcp.py,sha256=4bU0j-mWZCymzUq7uyJfoIDaXXeJg49gUlM9oybBFI0,30903
+ sky/clouds/service_catalog/data_fetchers/fetch_gcp.py,sha256=JnugFifzHPQITlbDKoKexE8NqgagOEfQWTxon7P6vJ0,30935
  sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py,sha256=MUzogyLruLQmIt-To6TsfnGPgv_nnlp49XYbeshsd7I,5003
  sky/clouds/service_catalog/data_fetchers/fetch_vast.py,sha256=zR9icM3ty5C8tGw13pQbsBtQQMgG4kl1j_jSGqqrgOA,4741
  sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py,sha256=Opp2r3KSzXPtwk3lKNbO8IX9QzjoRSwy1kW3jPjtS1c,21453
@@ -116,9 +116,9 @@ sky/jobs/dashboard/dashboard.py,sha256=kUKSXMAWAvPwJ_W_JK3wyz65Uope90_rNvhl8rZ1I
  sky/jobs/dashboard/static/favicon.ico,sha256=uYlvgxSM7gjBmXpZ8wydvZUPAbJiiix-rc2Xe5mma9s,15086
  sky/jobs/dashboard/templates/index.html,sha256=tz95q8O2pF7IvfY6yv0rnPyhj4DX8WX4RIVVxqFKV1Y,28519
  sky/jobs/server/__init__.py,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
- sky/jobs/server/core.py,sha256=KUMmObt0rWhuCR50lQYmF6bFSKAjHbb8sw53WBnJzv0,22251
- sky/jobs/server/dashboard_utils.py,sha256=BKafOhnwU_e6LtKLqqmf_CyUtkbFWRwSbdjMwhSBQrM,2086
- sky/jobs/server/server.py,sha256=6W9FUPT-QFfX50Qwu6MBdJ2ScSW994w5jsyM-bHW8lE,7459
+ sky/jobs/server/core.py,sha256=zMLSSdNFQkP-RsfzCZ9jIcHNCL0lSvRd7PH3Sie0yPA,22615
+ sky/jobs/server/dashboard_utils.py,sha256=2Mbx40W1pQqPEPHsSDbHeaF0j5cgyKy-_A9Owdwp_AQ,2315
+ sky/jobs/server/server.py,sha256=s3wULAh4u4drdIz2VA8l0HiXxHWdUzsBDYCstzU0Vxs,7411
  sky/provision/__init__.py,sha256=jiTOawg_wpy0s3Z-SEoOf7r280arLHUZzj-KPh-w7ek,6424
  sky/provision/common.py,sha256=E8AlSUFcn0FYQq1erNmoVfMAdsF9tP2yxfyk-9PLvQU,10286
  sky/provision/constants.py,sha256=oc_XDUkcoLQ_lwDy5yMeMSWviKS0j0s1c0pjlvpNeWY,800
@@ -218,16 +218,16 @@ sky/serve/service_spec.py,sha256=Q0qnFRjNnfGIpksubH5VqPKIlvpWs5had_Ma_PSHyo8,169
  sky/serve/client/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  sky/serve/client/sdk.py,sha256=fVYQfvNuJxa8aZiS7LJoXFeGcjRidko0Tph5b6m0yMQ,11539
  sky/serve/server/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- sky/serve/server/core.py,sha256=cide83JrRMl45WvA0KdPtj36_g75nSiblsFtPbJ4Qyc,36660
- sky/serve/server/server.py,sha256=IVEjseLX4h1EZGSpJofzEJl6lkGaBKlEY4IBlngQWD8,3479
+ sky/serve/server/core.py,sha256=pRvFadEIH_WTUkTtSmuFoPBP4JFq8Obt68ifi9DWuog,36865
+ sky/serve/server/server.py,sha256=gQGVU9nHYdGbaLhGjIUNIYn4xwKjRASRJkiiTL5AI1Y,3283
  sky/server/__init__.py,sha256=MPPBqFzXz6Jv5QSk6td_IcvnfXfNErDZVcizu4MLRow,27
- sky/server/common.py,sha256=8J1RZ2IGJtySw-gbLE_JEb9Hm24os5qwadmQDhQMqf4,18447
+ sky/server/common.py,sha256=uBshF4a-U8NGgm8XOHTW2YNSq0CsByfdIFgiybU5PEg,17321
  sky/server/constants.py,sha256=SqhWJMassFyvWAJn2UJHvuA_0_C6f5vngMzZ2KYLsKw,770
- sky/server/server.py,sha256=TZplXKA0KMs4UHLV3K5NSyhUPD0l2cmsiYgAZohn_Gs,41902
- sky/server/stream_utils.py,sha256=6jo1Dq8EtD0AHmJ3e3zCUNAiSYQlUKbPil4h8pA-2ac,5813
+ sky/server/server.py,sha256=0gcIn3jr_4DkHpBJYdNq--uPo9Im8bn2ftxgd8mBMcU,42225
+ sky/server/stream_utils.py,sha256=-3IX1YCgxAFfcvQIV0TCvOn1wbRLWovAx3ckCrsExWU,5651
  sky/server/html/log.html,sha256=TSGZktua9Ysl_ysg3w60rjxAxhH61AJnsYDHdtqrjmI,6929
  sky/server/requests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- sky/server/requests/executor.py,sha256=4PVgEK11YqWGG4ihhVPK2MPVFlCDkE9U9D07q_TbdBA,18759
+ sky/server/requests/executor.py,sha256=NxVB0aFA05GddXDdt89wEwEYyJcIIrsQxE2wowklhUI,19597
  sky/server/requests/payloads.py,sha256=PeEkqQoTO3ellelkFX5yzPKbPkDV-NfVXkxHndYlrjE,15769
  sky/server/requests/requests.py,sha256=aMdjiK5kjSYP36pxdXFU6qgKOXcOmtViHbFm3V8Dvf8,19590
  sky/server/requests/queues/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -301,11 +301,11 @@ sky/utils/annotations.py,sha256=-rfacB30Sl0xkFriejGvxma3oKctGfXXLZkQPHG33eo,1626
  sky/utils/cluster_utils.py,sha256=s6DFRXktv6_gF_DnwDEXJ7CniifHp8CAPeGciRCbXgI,14432
  sky/utils/command_runner.py,sha256=-7vxLvwZnTvYMQ_nScmuQWY6ZvQYv69yvvIp2uOaOqU,39063
  sky/utils/command_runner.pyi,sha256=mJOzCgcYZAfHwnY_6Wf1YwlTEJGb9ihzc2f0rE0Kw98,7751
- sky/utils/common.py,sha256=zBUmQjlSD7aF6tDG8mzbf-oU6JG3oYM2EAQ9sgSWSrA,2833
- sky/utils/common_utils.py,sha256=wPECJDpeloyixalXNrdmVKXFyU1UKUtBES6D0mRd2mE,26180
+ sky/utils/common.py,sha256=P4oVXFATUYgkruHX92cN12SJBtfb8DiOOYZtbN1kvP0,1927
+ sky/utils/common_utils.py,sha256=-O0GthIockeJy8LlA4heVYYtaUdQwNA-5mFMqHajRf8,27457
  sky/utils/config_utils.py,sha256=VQ2E3DQ2XysD-kul-diSrxn_pXWsDMfKAev91OiJQ1Q,9041
  sky/utils/control_master_utils.py,sha256=iD4M0onjYOdZ2RuxjwMBl4KhafHXJzuHjvqlBUnu-VE,1450
- sky/utils/controller_utils.py,sha256=1tnRFw9ANVyACGswIsl67uSK0fYDHLOoO6BQpxmFDgA,45674
+ sky/utils/controller_utils.py,sha256=4Nck10XV6gNJKjBl7y_CIxIGqP3bbISuZSVTHbBumgs,45725
  sky/utils/dag_utils.py,sha256=sAus0aL1wtuuFZSDnpO4LY-6WK4u5iJY952oWQzHo3Y,7532
  sky/utils/db_utils.py,sha256=K2-OHPg0FeHCarevMdWe0IWzm6wWumViEeYeJuGoFUE,3747
  sky/utils/env_options.py,sha256=aaD6GoYK0LaZIqjOEZ-R7eccQuiRriW3EuLWtOI5En8,1578
@@ -336,9 +336,9 @@ sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml,sha256=VLKT2KKimZu1GDg_4AIlIt488
  sky/utils/kubernetes/kubernetes_deploy_utils.py,sha256=iAjfyPclOs8qlALACcfxLpRAO9CZ-h16leFqXZ6tNaY,10096
  sky/utils/kubernetes/rsync_helper.sh,sha256=h4YwrPFf9727CACnMJvF3EyK_0OeOYKKt4su_daKekw,1256
  sky/utils/kubernetes/ssh_jump_lifecycle_manager.py,sha256=Kq1MDygF2IxFmu9FXpCxqucXLmeUrvs6OtRij6XTQbo,6554
- skypilot_nightly-1.0.0.dev20250218.dist-info/LICENSE,sha256=emRJAvE7ngL6x0RhQvlns5wJzGI3NEQ_WMjNmd9TZc4,12170
- skypilot_nightly-1.0.0.dev20250218.dist-info/METADATA,sha256=LTMWhkCmIQwt9zptcjlq9Se2Cs9MCe7IIMcqHEE7lN0,18916
- skypilot_nightly-1.0.0.dev20250218.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
- skypilot_nightly-1.0.0.dev20250218.dist-info/entry_points.txt,sha256=StA6HYpuHj-Y61L2Ze-hK2IcLWgLZcML5gJu8cs6nU4,36
- skypilot_nightly-1.0.0.dev20250218.dist-info/top_level.txt,sha256=qA8QuiNNb6Y1OF-pCUtPEr6sLEwy2xJX06Bd_CrtrHY,4
- skypilot_nightly-1.0.0.dev20250218.dist-info/RECORD,,
+ skypilot_nightly-1.0.0.dev20250220.dist-info/LICENSE,sha256=emRJAvE7ngL6x0RhQvlns5wJzGI3NEQ_WMjNmd9TZc4,12170
+ skypilot_nightly-1.0.0.dev20250220.dist-info/METADATA,sha256=uYtMxJQSUuL9hPmfqny_uQvuqWy65W5mHUHv7HvJb-o,18916
+ skypilot_nightly-1.0.0.dev20250220.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+ skypilot_nightly-1.0.0.dev20250220.dist-info/entry_points.txt,sha256=StA6HYpuHj-Y61L2Ze-hK2IcLWgLZcML5gJu8cs6nU4,36
+ skypilot_nightly-1.0.0.dev20250220.dist-info/top_level.txt,sha256=qA8QuiNNb6Y1OF-pCUtPEr6sLEwy2xJX06Bd_CrtrHY,4
+ skypilot_nightly-1.0.0.dev20250220.dist-info/RECORD,,