skypilot-nightly 1.0.0.dev20250218__py3-none-any.whl → 1.0.0.dev20250219__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sky/__init__.py CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
5
5
  import urllib.request
6
6
 
7
7
  # Replaced with the current commit when building the wheels.
8
- _SKYPILOT_COMMIT_SHA = '912b8293b3ebeba84941c108dbede1e6dcbc9b6f'
8
+ _SKYPILOT_COMMIT_SHA = '0ec86b2b1ec8aeaae0514b5a7ce4eb99caaa0728'
9
9
 
10
10
 
11
11
  def _get_git_commit():
@@ -35,7 +35,7 @@ def _get_git_commit():
35
35
 
36
36
 
37
37
  __commit__ = _get_git_commit()
38
- __version__ = '1.0.0.dev20250218'
38
+ __version__ = '1.0.0.dev20250219'
39
39
  __root_dir__ = os.path.dirname(os.path.abspath(__file__))
40
40
 
41
41
 
sky/cli.py CHANGED
@@ -1419,16 +1419,16 @@ def _handle_jobs_queue_request(
1419
1419
  try:
1420
1420
  # Check the controller status again, as the RuntimeError is likely
1421
1421
  # due to the controller being autostopped when querying the jobs.
1422
- controller_type = controller_utils.Controllers.JOBS_CONTROLLER
1423
- # Query status of the controller cluster. We add a wildcard because
1424
- # the controller cluster name can have a suffix like
1425
- # '-remote-<hash>' when using remote API server.
1422
+ # Since we are client-side, we may not know the exact name of the
1423
+ # controller, so use the prefix with a wildcard.
1424
+ # Query status of the controller cluster.
1426
1425
  records = sdk.get(
1427
- sdk.status(
1428
- cluster_names=[controller_type.value.cluster_name + '*']))
1426
+ sdk.status(cluster_names=[common.JOB_CONTROLLER_PREFIX + '*'],
1427
+ all_users=True))
1429
1428
  if (not records or
1430
1429
  records[0]['status'] == status_lib.ClusterStatus.STOPPED):
1431
- msg = controller_type.value.default_hint_if_non_existent
1430
+ controller = controller_utils.Controllers.JOBS_CONTROLLER.value
1431
+ msg = controller.default_hint_if_non_existent
1432
1432
  except Exception: # pylint: disable=broad-except
1433
1433
  # This is a best-effort attempt to find the latest controller status to
1434
1434
  # print more helpful message, so we can ignore any exception to
@@ -1494,16 +1494,18 @@ def _handle_services_request(
1494
1494
  # Check the controller status again, as the RuntimeError is likely
1495
1495
  # due to the controller being autostopped when querying the
1496
1496
  # services.
1497
- controller_type = controller_utils.Controllers.SKY_SERVE_CONTROLLER
1498
- # Query status of the controller cluster. We add a wildcard because
1499
- # the controller cluster name can have a suffix like
1500
- # '-remote-<hash>' when using remote API server.
1497
+ # Since we are client-side, we may not know the exact name of the
1498
+ # controller, so use the prefix with a wildcard.
1499
+ # Query status of the controller cluster.
1501
1500
  records = sdk.get(
1502
1501
  sdk.status(
1503
- cluster_names=[controller_type.value.cluster_name + '*']))
1502
+ cluster_names=[common.SKY_SERVE_CONTROLLER_PREFIX + '*'],
1503
+ all_users=True))
1504
1504
  if (not records or
1505
1505
  records[0]['status'] == status_lib.ClusterStatus.STOPPED):
1506
- msg = controller_type.value.default_hint_if_non_existent
1506
+ controller = (
1507
+ controller_utils.Controllers.SKY_SERVE_CONTROLLER.value)
1508
+ msg = controller.default_hint_if_non_existent
1507
1509
  except Exception: # pylint: disable=broad-except
1508
1510
  # This is a best-effort attempt to find the latest controller status to
1509
1511
  # print more helpful message, so we can ignore any exception to
@@ -2804,11 +2806,6 @@ def _hint_or_raise_for_down_jobs_controller(controller_name: str,
2804
2806
  to be torn down (e.g., because it has jobs running or
2805
2807
  it is in init state)
2806
2808
  """
2807
- if not common.is_current_user_controller(controller_name):
2808
- with ux_utils.print_exception_no_traceback():
2809
- raise exceptions.NotSupportedError(
2810
- f'Tearing down other user\'s managed job controller '
2811
- f'{controller_name!r} is not allowed.')
2812
2809
  controller = controller_utils.Controllers.from_name(controller_name)
2813
2810
  assert controller is not None, controller_name
2814
2811
 
@@ -2868,12 +2865,6 @@ def _hint_or_raise_for_down_sky_serve_controller(controller_name: str,
2868
2865
  to be torn down (e.g., because it has services running or
2869
2866
  it is in init state)
2870
2867
  """
2871
- # TODO(zhwu): Move this check to the sdk or even API server side.
2872
- if not common.is_current_user_controller(controller_name):
2873
- with ux_utils.print_exception_no_traceback():
2874
- raise exceptions.NotSupportedError(
2875
- f'Tearing down other user\'s sky serve controller '
2876
- f'{controller_name!r} is not allowed.')
2877
2868
  controller = controller_utils.Controllers.from_name(controller_name)
2878
2869
  assert controller is not None, controller_name
2879
2870
  with rich_utils.client_status('[bold cyan]Checking for live services[/]'):
sky/client/cli.py CHANGED
@@ -1419,16 +1419,16 @@ def _handle_jobs_queue_request(
1419
1419
  try:
1420
1420
  # Check the controller status again, as the RuntimeError is likely
1421
1421
  # due to the controller being autostopped when querying the jobs.
1422
- controller_type = controller_utils.Controllers.JOBS_CONTROLLER
1423
- # Query status of the controller cluster. We add a wildcard because
1424
- # the controller cluster name can have a suffix like
1425
- # '-remote-<hash>' when using remote API server.
1422
+ # Since we are client-side, we may not know the exact name of the
1423
+ # controller, so use the prefix with a wildcard.
1424
+ # Query status of the controller cluster.
1426
1425
  records = sdk.get(
1427
- sdk.status(
1428
- cluster_names=[controller_type.value.cluster_name + '*']))
1426
+ sdk.status(cluster_names=[common.JOB_CONTROLLER_PREFIX + '*'],
1427
+ all_users=True))
1429
1428
  if (not records or
1430
1429
  records[0]['status'] == status_lib.ClusterStatus.STOPPED):
1431
- msg = controller_type.value.default_hint_if_non_existent
1430
+ controller = controller_utils.Controllers.JOBS_CONTROLLER.value
1431
+ msg = controller.default_hint_if_non_existent
1432
1432
  except Exception: # pylint: disable=broad-except
1433
1433
  # This is a best-effort attempt to find the latest controller status to
1434
1434
  # print more helpful message, so we can ignore any exception to
@@ -1494,16 +1494,18 @@ def _handle_services_request(
1494
1494
  # Check the controller status again, as the RuntimeError is likely
1495
1495
  # due to the controller being autostopped when querying the
1496
1496
  # services.
1497
- controller_type = controller_utils.Controllers.SKY_SERVE_CONTROLLER
1498
- # Query status of the controller cluster. We add a wildcard because
1499
- # the controller cluster name can have a suffix like
1500
- # '-remote-<hash>' when using remote API server.
1497
+ # Since we are client-side, we may not know the exact name of the
1498
+ # controller, so use the prefix with a wildcard.
1499
+ # Query status of the controller cluster.
1501
1500
  records = sdk.get(
1502
1501
  sdk.status(
1503
- cluster_names=[controller_type.value.cluster_name + '*']))
1502
+ cluster_names=[common.SKY_SERVE_CONTROLLER_PREFIX + '*'],
1503
+ all_users=True))
1504
1504
  if (not records or
1505
1505
  records[0]['status'] == status_lib.ClusterStatus.STOPPED):
1506
- msg = controller_type.value.default_hint_if_non_existent
1506
+ controller = (
1507
+ controller_utils.Controllers.SKY_SERVE_CONTROLLER.value)
1508
+ msg = controller.default_hint_if_non_existent
1507
1509
  except Exception: # pylint: disable=broad-except
1508
1510
  # This is a best-effort attempt to find the latest controller status to
1509
1511
  # print more helpful message, so we can ignore any exception to
@@ -2804,11 +2806,6 @@ def _hint_or_raise_for_down_jobs_controller(controller_name: str,
2804
2806
  to be torn down (e.g., because it has jobs running or
2805
2807
  it is in init state)
2806
2808
  """
2807
- if not common.is_current_user_controller(controller_name):
2808
- with ux_utils.print_exception_no_traceback():
2809
- raise exceptions.NotSupportedError(
2810
- f'Tearing down other user\'s managed job controller '
2811
- f'{controller_name!r} is not allowed.')
2812
2809
  controller = controller_utils.Controllers.from_name(controller_name)
2813
2810
  assert controller is not None, controller_name
2814
2811
 
@@ -2868,12 +2865,6 @@ def _hint_or_raise_for_down_sky_serve_controller(controller_name: str,
2868
2865
  to be torn down (e.g., because it has services running or
2869
2866
  it is in init state)
2870
2867
  """
2871
- # TODO(zhwu): Move this check to the sdk or even API server side.
2872
- if not common.is_current_user_controller(controller_name):
2873
- with ux_utils.print_exception_no_traceback():
2874
- raise exceptions.NotSupportedError(
2875
- f'Tearing down other user\'s sky serve controller '
2876
- f'{controller_name!r} is not allowed.')
2877
2868
  controller = controller_utils.Controllers.from_name(controller_name)
2878
2869
  assert controller is not None, controller_name
2879
2870
  with rich_utils.client_status('[bold cyan]Checking for live services[/]'):
@@ -60,8 +60,8 @@ HIDDEN_TPU_DF = pd.read_csv(
60
60
  ,tpu-v3-2048,1,,,tpu-v3-2048,2048.0,614.4,us-east1,us-east1-d
61
61
  """)))
62
62
 
63
- # TPU V6e price for us-central2 is missing in the SKUs.
64
- TPU_V6E_MISSING_REGIONS = ['us-central2']
63
+ # TPU V6e price for the following regions is missing in the SKUs.
64
+ TPU_V6E_MISSING_REGIONS = ['us-central2', 'southamerica-west1']
65
65
 
66
66
  # TPU V5 is not visible in specific zones. We hardcode the missing zones here.
67
67
  # NOTE(dev): Keep the zones and the df in sync.
sky/jobs/server/core.py CHANGED
@@ -21,10 +21,11 @@ from sky.backends import backend_utils
21
21
  from sky.clouds.service_catalog import common as service_catalog_common
22
22
  from sky.jobs import constants as managed_job_constants
23
23
  from sky.jobs import utils as managed_job_utils
24
- from sky.provision import common
24
+ from sky.provision import common as provision_common
25
25
  from sky.skylet import constants as skylet_constants
26
26
  from sky.usage import usage_lib
27
27
  from sky.utils import admin_policy_utils
28
+ from sky.utils import common
28
29
  from sky.utils import common_utils
29
30
  from sky.utils import controller_utils
30
31
  from sky.utils import dag_utils
@@ -149,14 +150,18 @@ def launch(
149
150
  f'{colorama.Fore.YELLOW}'
150
151
  f'Launching managed job {dag.name!r} from jobs controller...'
151
152
  f'{colorama.Style.RESET_ALL}')
152
- return execution.launch(task=controller_task,
153
- cluster_name=controller_name,
154
- stream_logs=stream_logs,
155
- idle_minutes_to_autostop=skylet_constants.
156
- CONTROLLER_IDLE_MINUTES_TO_AUTOSTOP,
157
- retry_until_up=True,
158
- fast=True,
159
- _disable_controller_check=True)
153
+
154
+ # Launch with the api server's user hash, so that sky status does not
155
+ # show the owner of the controller as whatever user launched it first.
156
+ with common.with_server_user_hash():
157
+ return execution.launch(task=controller_task,
158
+ cluster_name=controller_name,
159
+ stream_logs=stream_logs,
160
+ idle_minutes_to_autostop=skylet_constants.
161
+ CONTROLLER_IDLE_MINUTES_TO_AUTOSTOP,
162
+ retry_until_up=True,
163
+ fast=True,
164
+ _disable_controller_check=True)
160
165
 
161
166
 
162
167
  def queue_from_kubernetes_pod(
@@ -194,16 +199,16 @@ def queue_from_kubernetes_pod(
194
199
  provider_config = {'context': context}
195
200
  instances = {
196
201
  pod_name: [
197
- common.InstanceInfo(instance_id=pod_name,
198
- internal_ip='',
199
- external_ip='',
200
- tags={})
202
+ provision_common.InstanceInfo(instance_id=pod_name,
203
+ internal_ip='',
204
+ external_ip='',
205
+ tags={})
201
206
  ]
202
207
  } # Internal IP is not required for Kubernetes
203
- cluster_info = common.ClusterInfo(provider_name='kubernetes',
204
- head_instance_id=pod_name,
205
- provider_config=provider_config,
206
- instances=instances)
208
+ cluster_info = provision_common.ClusterInfo(provider_name='kubernetes',
209
+ head_instance_id=pod_name,
210
+ provider_config=provider_config,
211
+ instances=instances)
207
212
  managed_jobs_runner = provision_lib.get_command_runners(
208
213
  'kubernetes', cluster_info)[0]
209
214
 
@@ -1,4 +1,9 @@
1
- """Persistent dashboard sessions."""
1
+ """Persistent dashboard sessions.
2
+
3
+ Note: before #4717, this was useful because we needed to tunnel to multiple
4
+ controllers - one per user. Now, there is only one controller for the whole API
5
+ server, so this is not very useful. TODO(cooperc): Remove or fix this.
6
+ """
2
7
  import pathlib
3
8
  from typing import Tuple
4
9
 
sky/jobs/server/server.py CHANGED
@@ -21,11 +21,6 @@ logger = sky_logging.init_logger(__name__)
21
21
  router = fastapi.APIRouter()
22
22
 
23
23
 
24
- def _get_controller_name(request_body: payloads.RequestBody) -> str:
25
- user_hash = request_body.user_hash
26
- return common.get_controller_name(common.ControllerType.JOBS, user_hash)
27
-
28
-
29
24
  @router.post('/launch')
30
25
  async def launch(request: fastapi.Request,
31
26
  jobs_launch_body: payloads.JobsLaunchBody) -> None:
@@ -35,7 +30,7 @@ async def launch(request: fastapi.Request,
35
30
  request_body=jobs_launch_body,
36
31
  func=core.launch,
37
32
  schedule_type=api_requests.ScheduleType.LONG,
38
- request_cluster_name=_get_controller_name(jobs_launch_body),
33
+ request_cluster_name=common.JOB_CONTROLLER_NAME,
39
34
  )
40
35
 
41
36
 
@@ -49,7 +44,7 @@ async def queue(request: fastapi.Request,
49
44
  func=core.queue,
50
45
  schedule_type=(api_requests.ScheduleType.LONG if jobs_queue_body.refresh
51
46
  else api_requests.ScheduleType.SHORT),
52
- request_cluster_name=_get_controller_name(jobs_queue_body),
47
+ request_cluster_name=common.JOB_CONTROLLER_NAME,
53
48
  )
54
49
 
55
50
 
@@ -62,7 +57,7 @@ async def cancel(request: fastapi.Request,
62
57
  request_body=jobs_cancel_body,
63
58
  func=core.cancel,
64
59
  schedule_type=api_requests.ScheduleType.SHORT,
65
- request_cluster_name=_get_controller_name(jobs_cancel_body),
60
+ request_cluster_name=common.JOB_CONTROLLER_NAME,
66
61
  )
67
62
 
68
63
 
@@ -78,7 +73,7 @@ async def logs(
78
73
  func=core.tail_logs,
79
74
  schedule_type=api_requests.ScheduleType.SHORT
80
75
  if jobs_logs_body.refresh else api_requests.ScheduleType.LONG,
81
- request_cluster_name=_get_controller_name(jobs_logs_body),
76
+ request_cluster_name=common.JOB_CONTROLLER_NAME,
82
77
  )
83
78
  request_task = api_requests.get_request(request.state.request_id)
84
79
 
@@ -107,13 +102,16 @@ async def download_logs(
107
102
  func=core.download_logs,
108
103
  schedule_type=api_requests.ScheduleType.LONG
109
104
  if jobs_download_logs_body.refresh else api_requests.ScheduleType.SHORT,
110
- request_cluster_name=_get_controller_name(jobs_download_logs_body),
105
+ request_cluster_name=common.JOB_CONTROLLER_NAME,
111
106
  )
112
107
 
113
108
 
114
109
  @router.get('/dashboard')
115
110
  async def dashboard(request: fastapi.Request,
116
111
  user_hash: str) -> fastapi.Response:
112
+ # Note: before #4717, each user had their own controller, and thus their own
113
+ # dashboard. Now, all users share the same controller, so this isn't really
114
+ # necessary. TODO(cooperc): clean up.
117
115
  # Find the port for the dashboard of the user
118
116
  os.environ[constants.USER_ID_ENV_VAR] = user_hash
119
117
  server_common.reload_for_new_request(client_entrypoint=None,
sky/serve/server/core.py CHANGED
@@ -249,13 +249,16 @@ def up(
249
249
  # with the current job id, we know the service is up and running
250
250
  # for the first time; otherwise it is a name conflict.
251
251
  idle_minutes_to_autostop = constants.CONTROLLER_IDLE_MINUTES_TO_AUTOSTOP
252
- controller_job_id, controller_handle = execution.launch(
253
- task=controller_task,
254
- cluster_name=controller_name,
255
- idle_minutes_to_autostop=idle_minutes_to_autostop,
256
- retry_until_up=True,
257
- _disable_controller_check=True,
258
- )
252
+ # Since the controller may be shared among multiple users, launch the
253
+ # controller with the API server's user hash.
254
+ with common.with_server_user_hash():
255
+ controller_job_id, controller_handle = execution.launch(
256
+ task=controller_task,
257
+ cluster_name=controller_name,
258
+ idle_minutes_to_autostop=idle_minutes_to_autostop,
259
+ retry_until_up=True,
260
+ _disable_controller_check=True,
261
+ )
259
262
 
260
263
  style = colorama.Style
261
264
  fore = colorama.Fore
@@ -14,11 +14,6 @@ logger = sky_logging.init_logger(__name__)
14
14
  router = fastapi.APIRouter()
15
15
 
16
16
 
17
- def _get_controller_name(request_body: payloads.RequestBody) -> str:
18
- user_hash = request_body.user_hash
19
- return common.get_controller_name(common.ControllerType.SERVE, user_hash)
20
-
21
-
22
17
  @router.post('/up')
23
18
  async def up(
24
19
  request: fastapi.Request,
@@ -30,7 +25,7 @@ async def up(
30
25
  request_body=up_body,
31
26
  func=core.up,
32
27
  schedule_type=api_requests.ScheduleType.LONG,
33
- request_cluster_name=_get_controller_name(up_body),
28
+ request_cluster_name=common.SKY_SERVE_CONTROLLER_NAME,
34
29
  )
35
30
 
36
31
 
@@ -45,7 +40,7 @@ async def update(
45
40
  request_body=update_body,
46
41
  func=core.update,
47
42
  schedule_type=api_requests.ScheduleType.SHORT,
48
- request_cluster_name=_get_controller_name(update_body),
43
+ request_cluster_name=common.SKY_SERVE_CONTROLLER_NAME,
49
44
  )
50
45
 
51
46
 
@@ -60,7 +55,7 @@ async def down(
60
55
  request_body=down_body,
61
56
  func=core.down,
62
57
  schedule_type=api_requests.ScheduleType.SHORT,
63
- request_cluster_name=_get_controller_name(down_body),
58
+ request_cluster_name=common.SKY_SERVE_CONTROLLER_NAME,
64
59
  )
65
60
 
66
61
 
@@ -75,7 +70,7 @@ async def terminate_replica(
75
70
  request_body=terminate_replica_body,
76
71
  func=core.terminate_replica,
77
72
  schedule_type=api_requests.ScheduleType.SHORT,
78
- request_cluster_name=_get_controller_name(terminate_replica_body),
73
+ request_cluster_name=common.SKY_SERVE_CONTROLLER_NAME,
79
74
  )
80
75
 
81
76
 
@@ -90,7 +85,7 @@ async def status(
90
85
  request_body=status_body,
91
86
  func=core.status,
92
87
  schedule_type=api_requests.ScheduleType.SHORT,
93
- request_cluster_name=_get_controller_name(status_body),
88
+ request_cluster_name=common.SKY_SERVE_CONTROLLER_NAME,
94
89
  )
95
90
 
96
91
 
@@ -105,7 +100,7 @@ async def tail_logs(
105
100
  request_body=log_body,
106
101
  func=core.tail_logs,
107
102
  schedule_type=api_requests.ScheduleType.SHORT,
108
- request_cluster_name=_get_controller_name(log_body),
103
+ request_cluster_name=common.SKY_SERVE_CONTROLLER_NAME,
109
104
  )
110
105
 
111
106
  request_task = api_requests.get_request(request.state.request_id)
sky/server/common.py CHANGED
@@ -3,7 +3,6 @@
3
3
  import dataclasses
4
4
  import enum
5
5
  import functools
6
- import importlib
7
6
  import json
8
7
  import os
9
8
  import pathlib
@@ -28,14 +27,12 @@ from sky.server import constants as server_constants
28
27
  from sky.skylet import constants
29
28
  from sky.usage import usage_lib
30
29
  from sky.utils import annotations
31
- from sky.utils import common
32
30
  from sky.utils import common_utils
33
31
  from sky.utils import rich_utils
34
32
  from sky.utils import ux_utils
35
33
 
36
34
  if typing.TYPE_CHECKING:
37
35
  from sky import dag as dag_lib
38
- from sky.server.requests import payloads
39
36
 
40
37
  DEFAULT_SERVER_URL = 'http://127.0.0.1:46580'
41
38
  AVAILBLE_LOCAL_API_SERVER_HOSTS = ['0.0.0.0', 'localhost', '127.0.0.1']
@@ -407,23 +404,6 @@ def request_body_to_params(body: pydantic.BaseModel) -> Dict[str, Any]:
407
404
  def reload_for_new_request(client_entrypoint: Optional[str],
408
405
  client_command: Optional[str]):
409
406
  """Reload modules, global variables, and usage message for a new request."""
410
- # When a user request is sent to api server, it changes the user hash in the
411
- # env vars, but since controller_utils is imported before the env vars are
412
- # set, it doesn't get updated. So we need to reload it here.
413
- # pylint: disable=import-outside-toplevel
414
- from sky.utils import controller_utils
415
- common.SKY_SERVE_CONTROLLER_NAME = common.get_controller_name(
416
- common.ControllerType.SERVE)
417
- common.JOB_CONTROLLER_NAME = common.get_controller_name(
418
- common.ControllerType.JOBS)
419
- # TODO(zhwu): We should avoid reloading the controller_utils module.
420
- # Instead, we should reload required cache or global variables.
421
- # TODO(zhwu): Reloading the controller_utils module may cause the global
422
- # variables in other modules referring the `controller_utils.Controllers`
423
- # dangling, as they will be pointing to the old object. We should not use
424
- # it in global variables.
425
- importlib.reload(controller_utils)
426
-
427
407
  # Reset the client entrypoint and command for the usage message.
428
408
  common_utils.set_client_entrypoint_and_command(
429
409
  client_entrypoint=client_entrypoint,
sky/utils/common.py CHANGED
@@ -1,53 +1,41 @@
1
1
  """Common enumerators and classes."""
2
2
 
3
+ import contextlib
3
4
  import enum
4
- from typing import Optional
5
+ import os
6
+ from typing import Generator
5
7
 
8
+ from sky.skylet import constants
6
9
  from sky.utils import common_utils
7
10
 
8
11
  SKY_SERVE_CONTROLLER_PREFIX: str = 'sky-serve-controller-'
9
12
  JOB_CONTROLLER_PREFIX: str = 'sky-jobs-controller-'
10
- SERVER_ID_CONNECTOR: str = '-remote-'
11
- # We use the user hash (machine-specific) hash of the server to determine if a
12
- # SkyPilot API server is started by the same user. It will be the same across
13
- # the whole lifecycle of the server, including:
13
+ # We use the user hash (machine-specific) for the controller name. It will be
14
+ # the same across the whole lifecycle of the server, including:
14
15
  # 1. all requests, because this global variable is set once during server
15
16
  # starts.
16
17
  # 2. SkyPilot API server restarts, as long as the `~/.sky` folder is persisted
17
18
  # and the env var set during starting the server is the same.
19
+ # This behavior is the same for the local API server (where SERVER_ID is the
20
+ # same as the normal user hash). This ensures backwards-compatibility with jobs
21
+ # controllers from before #4660.
18
22
  SERVER_ID = common_utils.get_user_hash()
23
+ SKY_SERVE_CONTROLLER_NAME: str = f'{SKY_SERVE_CONTROLLER_PREFIX}{SERVER_ID}'
24
+ JOB_CONTROLLER_NAME: str = f'{JOB_CONTROLLER_PREFIX}{SERVER_ID}'
19
25
 
20
26
 
21
- class ControllerType(enum.Enum):
22
- SERVE = 'SERVE'
23
- JOBS = 'JOBS'
24
-
25
-
26
- def get_controller_name(controller_type: ControllerType,
27
- user_hash: Optional[str] = None) -> str:
28
- prefix = JOB_CONTROLLER_PREFIX
29
- if controller_type == ControllerType.SERVE:
30
- prefix = SKY_SERVE_CONTROLLER_PREFIX
31
- if user_hash is None:
32
- user_hash = common_utils.get_user_hash()
33
- # Comparing the two IDs can determine if the caller is trying to get the
34
- # controller created by their local API server or a remote API server.
35
- if user_hash == SERVER_ID:
36
- # Not adding server ID for locally created controller because
37
- # of backward compatibility.
38
- return f'{prefix}{user_hash}'
39
- return f'{prefix}{user_hash}{SERVER_ID_CONNECTOR}{SERVER_ID}'
40
-
41
-
42
- # Controller names differ per user and per SkyPilot API server.
43
- # If local: <prefix>-<user_id>
44
- # If remote: <prefix>-<user_id>-remote-<api_server_user_id>
45
- # DO NOT use these variables on the client side because client side doesn't know
46
- # the remote server's user id, so client side will get local-version controller
47
- # name.
48
- # TODO(SKY-1106): remove dynamic constants like this.
49
- SKY_SERVE_CONTROLLER_NAME: str = get_controller_name(ControllerType.SERVE)
50
- JOB_CONTROLLER_NAME: str = get_controller_name(ControllerType.JOBS)
27
+ @contextlib.contextmanager
28
+ def with_server_user_hash() -> Generator[None, None, None]:
29
+ """Temporarily set the user hash to common.SERVER_ID."""
30
+ old_env_user_hash = os.getenv(constants.USER_ID_ENV_VAR)
31
+ os.environ[constants.USER_ID_ENV_VAR] = SERVER_ID
32
+ try:
33
+ yield
34
+ finally:
35
+ if old_env_user_hash is not None:
36
+ os.environ[constants.USER_ID_ENV_VAR] = old_env_user_hash
37
+ else:
38
+ os.environ.pop(constants.USER_ID_ENV_VAR)
51
39
 
52
40
 
53
41
  class StatusRefreshMode(enum.Enum):
@@ -64,11 +52,3 @@ class StatusRefreshMode(enum.Enum):
64
52
  class OptimizeTarget(enum.Enum):
65
53
  COST = 0
66
54
  TIME = 1
67
-
68
-
69
- def is_current_user_controller(controller_name: str) -> bool:
70
- """If the controller name belongs to the current user."""
71
- if SERVER_ID_CONNECTOR in controller_name:
72
- controller_name = controller_name.split(SERVER_ID_CONNECTOR)[0]
73
- controller_user_id = controller_name.split('-')[-1]
74
- return controller_user_id == common_utils.get_user_hash()
@@ -91,10 +91,6 @@ class Controllers(enum.Enum):
91
91
  JOBS_CONTROLLER = _ControllerSpec(
92
92
  controller_type='jobs',
93
93
  name='managed jobs controller',
94
- # Default cluster name is the current user's controller cluster unless
95
- # caller initiate with a different controller name.
96
- # TODO(zhwu): by having the controller name loaded in common, it
97
- # will not respect the latest updated user hash.
98
94
  cluster_name=common.JOB_CONTROLLER_NAME,
99
95
  in_progress_hint=(
100
96
  '* {job_info}To see all managed jobs: '
@@ -164,13 +160,18 @@ class Controllers(enum.Enum):
164
160
  if name is None:
165
161
  return None
166
162
  controller = None
163
+ # The controller name is always the same. However, on the client-side,
164
+ # we may not know the exact name, because we are missing the server-side
165
+ # common.SERVER_ID. So, we will assume anything that matches the prefix
166
+ # is a controller.
167
167
  if name.startswith(common.SKY_SERVE_CONTROLLER_PREFIX):
168
168
  controller = cls.SKY_SERVE_CONTROLLER
169
169
  elif name.startswith(common.JOB_CONTROLLER_PREFIX):
170
170
  controller = cls.JOBS_CONTROLLER
171
171
  if controller is not None and name != controller.value.cluster_name:
172
- # Input name is not the current user's controller name,
173
- # so need to set the controller's cluster name to the input name.
172
+ # The client-side cluster_name is not accurate. Assume that `name`
173
+ # is the actual cluster name, so need to set the controller's
174
+ # cluster name to the input name.
174
175
  controller.value.cluster_name = name
175
176
  return controller
176
177
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: skypilot-nightly
3
- Version: 1.0.0.dev20250218
3
+ Version: 1.0.0.dev20250219
4
4
  Summary: SkyPilot: An intercloud broker for the clouds
5
5
  Author: SkyPilot Team
6
6
  License: Apache 2.0
@@ -1,8 +1,8 @@
1
- sky/__init__.py,sha256=Pna6srpHox46eHU0RFPPTH60skVAjqhun9lDfN7QwHM,6391
1
+ sky/__init__.py,sha256=eSspYNfxrf0xj8B8E1z5prY7j2xz0DdjEeOO8s5sMLU,6391
2
2
  sky/admin_policy.py,sha256=hPo02f_A32gCqhUueF0QYy1fMSSKqRwYEg_9FxScN_s,3248
3
3
  sky/authentication.py,sha256=hCEqi77nprQEg3ktfRL51xiiw16zwZOmFEDB_Z7fWVU,22384
4
4
  sky/check.py,sha256=NDKx_Zm7YRxPjMv82wz3ESLnGIPljaACyqVdVNM0PzY,11258
5
- sky/cli.py,sha256=Z_w8p_qGWdg2-7yWlimrqmaB1Yqy1hXWQkGy08jEee4,218814
5
+ sky/cli.py,sha256=iwYBgEt3tgsYmOIp-ivPmL2FHoalvhH4Ng--C31ubws,218201
6
6
  sky/cloud_stores.py,sha256=-95XIqi_ouo7hvoN5mQNP6bGm07MyF6Yk-YP4Txb5wg,24034
7
7
  sky/core.py,sha256=gw_TrQOxz28sLAJJq6ajPnlRlrKQ2G1DtqLuntMejFU,45508
8
8
  sky/dag.py,sha256=Yl7Ry26Vql5cv4YMz8g9kOUgtoCihJnw7c8NgZYakMY,3242
@@ -42,7 +42,7 @@ sky/benchmark/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
42
42
  sky/benchmark/benchmark_state.py,sha256=X8CXmuU9KgsDRhKedhFgjeRMUFWtQsjFs1qECvPG2yg,8723
43
43
  sky/benchmark/benchmark_utils.py,sha256=o4RymqSceq5mLEZL0upQM6NVEzJJQzj9s9tTm49uUTc,26365
44
44
  sky/client/__init__.py,sha256=pz6xvVSd9X-gwqbsDL0E9QOojYqM0KAD0j-NCyCIF1k,38
45
- sky/client/cli.py,sha256=Z_w8p_qGWdg2-7yWlimrqmaB1Yqy1hXWQkGy08jEee4,218814
45
+ sky/client/cli.py,sha256=iwYBgEt3tgsYmOIp-ivPmL2FHoalvhH4Ng--C31ubws,218201
46
46
  sky/client/common.py,sha256=axDic7WOG1e78SdFm5XIwdhX7YNvf3g4k7INrsW3X4s,14611
47
47
  sky/client/sdk.py,sha256=q5R0_AquHAiLSLXpha8fIecQ9cgqqFba436xVzJ48oI,66943
48
48
  sky/clouds/__init__.py,sha256=taKUCz6gWoKZhqHLYJXX-d0Ux6ZSQZEwxcNFdniupL0,1365
@@ -87,7 +87,7 @@ sky/clouds/service_catalog/data_fetchers/fetch_aws.py,sha256=Zj4bqWPiDcT_ZFyHxQw
87
87
  sky/clouds/service_catalog/data_fetchers/fetch_azure.py,sha256=7YVnoGDGGZI2TK02bj_LOoD4E5J5CFl6eqz2XlR4Vy8,12790
88
88
  sky/clouds/service_catalog/data_fetchers/fetch_cudo.py,sha256=52P48lvWN0s1ArjeLPeLemPRpxjSRcHincRle0nqdm4,3440
89
89
  sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py,sha256=yKuAFbjBRNz_e2RNNDT_aHHAuKQ86Ac7GKgIie5O6Pg,7273
90
- sky/clouds/service_catalog/data_fetchers/fetch_gcp.py,sha256=4bU0j-mWZCymzUq7uyJfoIDaXXeJg49gUlM9oybBFI0,30903
90
+ sky/clouds/service_catalog/data_fetchers/fetch_gcp.py,sha256=JnugFifzHPQITlbDKoKexE8NqgagOEfQWTxon7P6vJ0,30935
91
91
  sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py,sha256=MUzogyLruLQmIt-To6TsfnGPgv_nnlp49XYbeshsd7I,5003
92
92
  sky/clouds/service_catalog/data_fetchers/fetch_vast.py,sha256=zR9icM3ty5C8tGw13pQbsBtQQMgG4kl1j_jSGqqrgOA,4741
93
93
  sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py,sha256=Opp2r3KSzXPtwk3lKNbO8IX9QzjoRSwy1kW3jPjtS1c,21453
@@ -116,9 +116,9 @@ sky/jobs/dashboard/dashboard.py,sha256=kUKSXMAWAvPwJ_W_JK3wyz65Uope90_rNvhl8rZ1I
116
116
  sky/jobs/dashboard/static/favicon.ico,sha256=uYlvgxSM7gjBmXpZ8wydvZUPAbJiiix-rc2Xe5mma9s,15086
117
117
  sky/jobs/dashboard/templates/index.html,sha256=tz95q8O2pF7IvfY6yv0rnPyhj4DX8WX4RIVVxqFKV1Y,28519
118
118
  sky/jobs/server/__init__.py,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
119
- sky/jobs/server/core.py,sha256=KUMmObt0rWhuCR50lQYmF6bFSKAjHbb8sw53WBnJzv0,22251
120
- sky/jobs/server/dashboard_utils.py,sha256=BKafOhnwU_e6LtKLqqmf_CyUtkbFWRwSbdjMwhSBQrM,2086
121
- sky/jobs/server/server.py,sha256=6W9FUPT-QFfX50Qwu6MBdJ2ScSW994w5jsyM-bHW8lE,7459
119
+ sky/jobs/server/core.py,sha256=zMLSSdNFQkP-RsfzCZ9jIcHNCL0lSvRd7PH3Sie0yPA,22615
120
+ sky/jobs/server/dashboard_utils.py,sha256=2Mbx40W1pQqPEPHsSDbHeaF0j5cgyKy-_A9Owdwp_AQ,2315
121
+ sky/jobs/server/server.py,sha256=s3wULAh4u4drdIz2VA8l0HiXxHWdUzsBDYCstzU0Vxs,7411
122
122
  sky/provision/__init__.py,sha256=jiTOawg_wpy0s3Z-SEoOf7r280arLHUZzj-KPh-w7ek,6424
123
123
  sky/provision/common.py,sha256=E8AlSUFcn0FYQq1erNmoVfMAdsF9tP2yxfyk-9PLvQU,10286
124
124
  sky/provision/constants.py,sha256=oc_XDUkcoLQ_lwDy5yMeMSWviKS0j0s1c0pjlvpNeWY,800
@@ -218,10 +218,10 @@ sky/serve/service_spec.py,sha256=Q0qnFRjNnfGIpksubH5VqPKIlvpWs5had_Ma_PSHyo8,169
218
218
  sky/serve/client/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
219
219
  sky/serve/client/sdk.py,sha256=fVYQfvNuJxa8aZiS7LJoXFeGcjRidko0Tph5b6m0yMQ,11539
220
220
  sky/serve/server/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
221
- sky/serve/server/core.py,sha256=cide83JrRMl45WvA0KdPtj36_g75nSiblsFtPbJ4Qyc,36660
222
- sky/serve/server/server.py,sha256=IVEjseLX4h1EZGSpJofzEJl6lkGaBKlEY4IBlngQWD8,3479
221
+ sky/serve/server/core.py,sha256=pRvFadEIH_WTUkTtSmuFoPBP4JFq8Obt68ifi9DWuog,36865
222
+ sky/serve/server/server.py,sha256=gQGVU9nHYdGbaLhGjIUNIYn4xwKjRASRJkiiTL5AI1Y,3283
223
223
  sky/server/__init__.py,sha256=MPPBqFzXz6Jv5QSk6td_IcvnfXfNErDZVcizu4MLRow,27
224
- sky/server/common.py,sha256=8J1RZ2IGJtySw-gbLE_JEb9Hm24os5qwadmQDhQMqf4,18447
224
+ sky/server/common.py,sha256=64sg18ehgkGadkiG18ekqEbqNN_8S4Ca1BLgiFvQ8b8,17397
225
225
  sky/server/constants.py,sha256=SqhWJMassFyvWAJn2UJHvuA_0_C6f5vngMzZ2KYLsKw,770
226
226
  sky/server/server.py,sha256=TZplXKA0KMs4UHLV3K5NSyhUPD0l2cmsiYgAZohn_Gs,41902
227
227
  sky/server/stream_utils.py,sha256=6jo1Dq8EtD0AHmJ3e3zCUNAiSYQlUKbPil4h8pA-2ac,5813
@@ -301,11 +301,11 @@ sky/utils/annotations.py,sha256=-rfacB30Sl0xkFriejGvxma3oKctGfXXLZkQPHG33eo,1626
301
301
  sky/utils/cluster_utils.py,sha256=s6DFRXktv6_gF_DnwDEXJ7CniifHp8CAPeGciRCbXgI,14432
302
302
  sky/utils/command_runner.py,sha256=-7vxLvwZnTvYMQ_nScmuQWY6ZvQYv69yvvIp2uOaOqU,39063
303
303
  sky/utils/command_runner.pyi,sha256=mJOzCgcYZAfHwnY_6Wf1YwlTEJGb9ihzc2f0rE0Kw98,7751
304
- sky/utils/common.py,sha256=zBUmQjlSD7aF6tDG8mzbf-oU6JG3oYM2EAQ9sgSWSrA,2833
304
+ sky/utils/common.py,sha256=P4oVXFATUYgkruHX92cN12SJBtfb8DiOOYZtbN1kvP0,1927
305
305
  sky/utils/common_utils.py,sha256=wPECJDpeloyixalXNrdmVKXFyU1UKUtBES6D0mRd2mE,26180
306
306
  sky/utils/config_utils.py,sha256=VQ2E3DQ2XysD-kul-diSrxn_pXWsDMfKAev91OiJQ1Q,9041
307
307
  sky/utils/control_master_utils.py,sha256=iD4M0onjYOdZ2RuxjwMBl4KhafHXJzuHjvqlBUnu-VE,1450
308
- sky/utils/controller_utils.py,sha256=1tnRFw9ANVyACGswIsl67uSK0fYDHLOoO6BQpxmFDgA,45674
308
+ sky/utils/controller_utils.py,sha256=4Nck10XV6gNJKjBl7y_CIxIGqP3bbISuZSVTHbBumgs,45725
309
309
  sky/utils/dag_utils.py,sha256=sAus0aL1wtuuFZSDnpO4LY-6WK4u5iJY952oWQzHo3Y,7532
310
310
  sky/utils/db_utils.py,sha256=K2-OHPg0FeHCarevMdWe0IWzm6wWumViEeYeJuGoFUE,3747
311
311
  sky/utils/env_options.py,sha256=aaD6GoYK0LaZIqjOEZ-R7eccQuiRriW3EuLWtOI5En8,1578
@@ -336,9 +336,9 @@ sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml,sha256=VLKT2KKimZu1GDg_4AIlIt488
336
336
  sky/utils/kubernetes/kubernetes_deploy_utils.py,sha256=iAjfyPclOs8qlALACcfxLpRAO9CZ-h16leFqXZ6tNaY,10096
337
337
  sky/utils/kubernetes/rsync_helper.sh,sha256=h4YwrPFf9727CACnMJvF3EyK_0OeOYKKt4su_daKekw,1256
338
338
  sky/utils/kubernetes/ssh_jump_lifecycle_manager.py,sha256=Kq1MDygF2IxFmu9FXpCxqucXLmeUrvs6OtRij6XTQbo,6554
339
- skypilot_nightly-1.0.0.dev20250218.dist-info/LICENSE,sha256=emRJAvE7ngL6x0RhQvlns5wJzGI3NEQ_WMjNmd9TZc4,12170
340
- skypilot_nightly-1.0.0.dev20250218.dist-info/METADATA,sha256=LTMWhkCmIQwt9zptcjlq9Se2Cs9MCe7IIMcqHEE7lN0,18916
341
- skypilot_nightly-1.0.0.dev20250218.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
342
- skypilot_nightly-1.0.0.dev20250218.dist-info/entry_points.txt,sha256=StA6HYpuHj-Y61L2Ze-hK2IcLWgLZcML5gJu8cs6nU4,36
343
- skypilot_nightly-1.0.0.dev20250218.dist-info/top_level.txt,sha256=qA8QuiNNb6Y1OF-pCUtPEr6sLEwy2xJX06Bd_CrtrHY,4
344
- skypilot_nightly-1.0.0.dev20250218.dist-info/RECORD,,
339
+ skypilot_nightly-1.0.0.dev20250219.dist-info/LICENSE,sha256=emRJAvE7ngL6x0RhQvlns5wJzGI3NEQ_WMjNmd9TZc4,12170
340
+ skypilot_nightly-1.0.0.dev20250219.dist-info/METADATA,sha256=wO3b_7Wt5UkHrHx5QDuqB-UKy3tIumd6DsrdpHfr03c,18916
341
+ skypilot_nightly-1.0.0.dev20250219.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
342
+ skypilot_nightly-1.0.0.dev20250219.dist-info/entry_points.txt,sha256=StA6HYpuHj-Y61L2Ze-hK2IcLWgLZcML5gJu8cs6nU4,36
343
+ skypilot_nightly-1.0.0.dev20250219.dist-info/top_level.txt,sha256=qA8QuiNNb6Y1OF-pCUtPEr6sLEwy2xJX06Bd_CrtrHY,4
344
+ skypilot_nightly-1.0.0.dev20250219.dist-info/RECORD,,