skypilot-nightly 1.0.0.dev20250218__py3-none-any.whl → 1.0.0.dev20250219__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/cli.py +15 -24
- sky/client/cli.py +15 -24
- sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +2 -2
- sky/jobs/server/core.py +22 -17
- sky/jobs/server/dashboard_utils.py +6 -1
- sky/jobs/server/server.py +8 -10
- sky/serve/server/core.py +10 -7
- sky/serve/server/server.py +6 -11
- sky/server/common.py +0 -20
- sky/utils/common.py +23 -43
- sky/utils/controller_utils.py +7 -6
- {skypilot_nightly-1.0.0.dev20250218.dist-info → skypilot_nightly-1.0.0.dev20250219.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20250218.dist-info → skypilot_nightly-1.0.0.dev20250219.dist-info}/RECORD +18 -18
- {skypilot_nightly-1.0.0.dev20250218.dist-info → skypilot_nightly-1.0.0.dev20250219.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250218.dist-info → skypilot_nightly-1.0.0.dev20250219.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250218.dist-info → skypilot_nightly-1.0.0.dev20250219.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250218.dist-info → skypilot_nightly-1.0.0.dev20250219.dist-info}/top_level.txt +0 -0
sky/__init__.py
CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
|
|
5
5
|
import urllib.request
|
6
6
|
|
7
7
|
# Replaced with the current commit when building the wheels.
|
8
|
-
_SKYPILOT_COMMIT_SHA = '
|
8
|
+
_SKYPILOT_COMMIT_SHA = '0ec86b2b1ec8aeaae0514b5a7ce4eb99caaa0728'
|
9
9
|
|
10
10
|
|
11
11
|
def _get_git_commit():
|
@@ -35,7 +35,7 @@ def _get_git_commit():
|
|
35
35
|
|
36
36
|
|
37
37
|
__commit__ = _get_git_commit()
|
38
|
-
__version__ = '1.0.0.dev20250218'
|
38
|
+
__version__ = '1.0.0.dev20250219'
|
39
39
|
__root_dir__ = os.path.dirname(os.path.abspath(__file__))
|
40
40
|
|
41
41
|
|
sky/cli.py
CHANGED
@@ -1419,16 +1419,16 @@ def _handle_jobs_queue_request(
|
|
1419
1419
|
try:
|
1420
1420
|
# Check the controller status again, as the RuntimeError is likely
|
1421
1421
|
# due to the controller being autostopped when querying the jobs.
|
1422
|
-
|
1423
|
-
#
|
1424
|
-
# the controller cluster
|
1425
|
-
# '-remote-<hash>' when using remote API server.
|
1422
|
+
# Since we are client-side, we may not know the exact name of the
|
1423
|
+
# controller, so use the prefix with a wildcard.
|
1424
|
+
# Query status of the controller cluster.
|
1426
1425
|
records = sdk.get(
|
1427
|
-
sdk.status(
|
1428
|
-
|
1426
|
+
sdk.status(cluster_names=[common.JOB_CONTROLLER_PREFIX + '*'],
|
1427
|
+
all_users=True))
|
1429
1428
|
if (not records or
|
1430
1429
|
records[0]['status'] == status_lib.ClusterStatus.STOPPED):
|
1431
|
-
|
1430
|
+
controller = controller_utils.Controllers.JOBS_CONTROLLER.value
|
1431
|
+
msg = controller.default_hint_if_non_existent
|
1432
1432
|
except Exception: # pylint: disable=broad-except
|
1433
1433
|
# This is to an best effort to find the latest controller status to
|
1434
1434
|
# print more helpful message, so we can ignore any exception to
|
@@ -1494,16 +1494,18 @@ def _handle_services_request(
|
|
1494
1494
|
# Check the controller status again, as the RuntimeError is likely
|
1495
1495
|
# due to the controller being autostopped when querying the
|
1496
1496
|
# services.
|
1497
|
-
|
1498
|
-
#
|
1499
|
-
# the controller cluster
|
1500
|
-
# '-remote-<hash>' when using remote API server.
|
1497
|
+
# Since we are client-side, we may not know the exact name of the
|
1498
|
+
# controller, so use the prefix with a wildcard.
|
1499
|
+
# Query status of the controller cluster.
|
1501
1500
|
records = sdk.get(
|
1502
1501
|
sdk.status(
|
1503
|
-
cluster_names=[
|
1502
|
+
cluster_names=[common.SKY_SERVE_CONTROLLER_PREFIX + '*'],
|
1503
|
+
all_users=True))
|
1504
1504
|
if (not records or
|
1505
1505
|
records[0]['status'] == status_lib.ClusterStatus.STOPPED):
|
1506
|
-
|
1506
|
+
controller = (
|
1507
|
+
controller_utils.Controllers.SKY_SERVE_CONTROLLER.value)
|
1508
|
+
msg = controller.default_hint_if_non_existent
|
1507
1509
|
except Exception: # pylint: disable=broad-except
|
1508
1510
|
# This is to an best effort to find the latest controller status to
|
1509
1511
|
# print more helpful message, so we can ignore any exception to
|
@@ -2804,11 +2806,6 @@ def _hint_or_raise_for_down_jobs_controller(controller_name: str,
|
|
2804
2806
|
to be torn down (e.g., because it has jobs running or
|
2805
2807
|
it is in init state)
|
2806
2808
|
"""
|
2807
|
-
if not common.is_current_user_controller(controller_name):
|
2808
|
-
with ux_utils.print_exception_no_traceback():
|
2809
|
-
raise exceptions.NotSupportedError(
|
2810
|
-
f'Tearing down other user\'s managed job controller '
|
2811
|
-
f'{controller_name!r} is not allowed.')
|
2812
2809
|
controller = controller_utils.Controllers.from_name(controller_name)
|
2813
2810
|
assert controller is not None, controller_name
|
2814
2811
|
|
@@ -2868,12 +2865,6 @@ def _hint_or_raise_for_down_sky_serve_controller(controller_name: str,
|
|
2868
2865
|
to be torn down (e.g., because it has services running or
|
2869
2866
|
it is in init state)
|
2870
2867
|
"""
|
2871
|
-
# TODO(zhwu): Move this check to the sdk or even API server side.
|
2872
|
-
if not common.is_current_user_controller(controller_name):
|
2873
|
-
with ux_utils.print_exception_no_traceback():
|
2874
|
-
raise exceptions.NotSupportedError(
|
2875
|
-
f'Tearing down other user\'s sky serve controller '
|
2876
|
-
f'{controller_name!r} is not allowed.')
|
2877
2868
|
controller = controller_utils.Controllers.from_name(controller_name)
|
2878
2869
|
assert controller is not None, controller_name
|
2879
2870
|
with rich_utils.client_status('[bold cyan]Checking for live services[/]'):
|
sky/client/cli.py
CHANGED
@@ -1419,16 +1419,16 @@ def _handle_jobs_queue_request(
|
|
1419
1419
|
try:
|
1420
1420
|
# Check the controller status again, as the RuntimeError is likely
|
1421
1421
|
# due to the controller being autostopped when querying the jobs.
|
1422
|
-
|
1423
|
-
#
|
1424
|
-
# the controller cluster
|
1425
|
-
# '-remote-<hash>' when using remote API server.
|
1422
|
+
# Since we are client-side, we may not know the exact name of the
|
1423
|
+
# controller, so use the prefix with a wildcard.
|
1424
|
+
# Query status of the controller cluster.
|
1426
1425
|
records = sdk.get(
|
1427
|
-
sdk.status(
|
1428
|
-
|
1426
|
+
sdk.status(cluster_names=[common.JOB_CONTROLLER_PREFIX + '*'],
|
1427
|
+
all_users=True))
|
1429
1428
|
if (not records or
|
1430
1429
|
records[0]['status'] == status_lib.ClusterStatus.STOPPED):
|
1431
|
-
|
1430
|
+
controller = controller_utils.Controllers.JOBS_CONTROLLER.value
|
1431
|
+
msg = controller.default_hint_if_non_existent
|
1432
1432
|
except Exception: # pylint: disable=broad-except
|
1433
1433
|
# This is to an best effort to find the latest controller status to
|
1434
1434
|
# print more helpful message, so we can ignore any exception to
|
@@ -1494,16 +1494,18 @@ def _handle_services_request(
|
|
1494
1494
|
# Check the controller status again, as the RuntimeError is likely
|
1495
1495
|
# due to the controller being autostopped when querying the
|
1496
1496
|
# services.
|
1497
|
-
|
1498
|
-
#
|
1499
|
-
# the controller cluster
|
1500
|
-
# '-remote-<hash>' when using remote API server.
|
1497
|
+
# Since we are client-side, we may not know the exact name of the
|
1498
|
+
# controller, so use the prefix with a wildcard.
|
1499
|
+
# Query status of the controller cluster.
|
1501
1500
|
records = sdk.get(
|
1502
1501
|
sdk.status(
|
1503
|
-
cluster_names=[
|
1502
|
+
cluster_names=[common.SKY_SERVE_CONTROLLER_PREFIX + '*'],
|
1503
|
+
all_users=True))
|
1504
1504
|
if (not records or
|
1505
1505
|
records[0]['status'] == status_lib.ClusterStatus.STOPPED):
|
1506
|
-
|
1506
|
+
controller = (
|
1507
|
+
controller_utils.Controllers.SKY_SERVE_CONTROLLER.value)
|
1508
|
+
msg = controller.default_hint_if_non_existent
|
1507
1509
|
except Exception: # pylint: disable=broad-except
|
1508
1510
|
# This is to an best effort to find the latest controller status to
|
1509
1511
|
# print more helpful message, so we can ignore any exception to
|
@@ -2804,11 +2806,6 @@ def _hint_or_raise_for_down_jobs_controller(controller_name: str,
|
|
2804
2806
|
to be torn down (e.g., because it has jobs running or
|
2805
2807
|
it is in init state)
|
2806
2808
|
"""
|
2807
|
-
if not common.is_current_user_controller(controller_name):
|
2808
|
-
with ux_utils.print_exception_no_traceback():
|
2809
|
-
raise exceptions.NotSupportedError(
|
2810
|
-
f'Tearing down other user\'s managed job controller '
|
2811
|
-
f'{controller_name!r} is not allowed.')
|
2812
2809
|
controller = controller_utils.Controllers.from_name(controller_name)
|
2813
2810
|
assert controller is not None, controller_name
|
2814
2811
|
|
@@ -2868,12 +2865,6 @@ def _hint_or_raise_for_down_sky_serve_controller(controller_name: str,
|
|
2868
2865
|
to be torn down (e.g., because it has services running or
|
2869
2866
|
it is in init state)
|
2870
2867
|
"""
|
2871
|
-
# TODO(zhwu): Move this check to the sdk or even API server side.
|
2872
|
-
if not common.is_current_user_controller(controller_name):
|
2873
|
-
with ux_utils.print_exception_no_traceback():
|
2874
|
-
raise exceptions.NotSupportedError(
|
2875
|
-
f'Tearing down other user\'s sky serve controller '
|
2876
|
-
f'{controller_name!r} is not allowed.')
|
2877
2868
|
controller = controller_utils.Controllers.from_name(controller_name)
|
2878
2869
|
assert controller is not None, controller_name
|
2879
2870
|
with rich_utils.client_status('[bold cyan]Checking for live services[/]'):
|
sky/clouds/service_catalog/data_fetchers/fetch_gcp.py
CHANGED
@@ -60,8 +60,8 @@ HIDDEN_TPU_DF = pd.read_csv(
|
|
60
60
|
,tpu-v3-2048,1,,,tpu-v3-2048,2048.0,614.4,us-east1,us-east1-d
|
61
61
|
""")))
|
62
62
|
|
63
|
-
# TPU V6e price for
|
64
|
-
TPU_V6E_MISSING_REGIONS = ['us-central2']
|
63
|
+
# TPU V6e price for the following regions is missing in the SKUs.
|
64
|
+
TPU_V6E_MISSING_REGIONS = ['us-central2', 'southamerica-west1']
|
65
65
|
|
66
66
|
# TPU V5 is not visible in specific zones. We hardcode the missing zones here.
|
67
67
|
# NOTE(dev): Keep the zones and the df in sync.
|
sky/jobs/server/core.py
CHANGED
@@ -21,10 +21,11 @@ from sky.backends import backend_utils
|
|
21
21
|
from sky.clouds.service_catalog import common as service_catalog_common
|
22
22
|
from sky.jobs import constants as managed_job_constants
|
23
23
|
from sky.jobs import utils as managed_job_utils
|
24
|
-
from sky.provision import common
|
24
|
+
from sky.provision import common as provision_common
|
25
25
|
from sky.skylet import constants as skylet_constants
|
26
26
|
from sky.usage import usage_lib
|
27
27
|
from sky.utils import admin_policy_utils
|
28
|
+
from sky.utils import common
|
28
29
|
from sky.utils import common_utils
|
29
30
|
from sky.utils import controller_utils
|
30
31
|
from sky.utils import dag_utils
|
@@ -149,14 +150,18 @@ def launch(
|
|
149
150
|
f'{colorama.Fore.YELLOW}'
|
150
151
|
f'Launching managed job {dag.name!r} from jobs controller...'
|
151
152
|
f'{colorama.Style.RESET_ALL}')
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
153
|
+
|
154
|
+
# Launch with the api server's user hash, so that sky status does not
|
155
|
+
# show the owner of the controller as whatever user launched it first.
|
156
|
+
with common.with_server_user_hash():
|
157
|
+
return execution.launch(task=controller_task,
|
158
|
+
cluster_name=controller_name,
|
159
|
+
stream_logs=stream_logs,
|
160
|
+
idle_minutes_to_autostop=skylet_constants.
|
161
|
+
CONTROLLER_IDLE_MINUTES_TO_AUTOSTOP,
|
162
|
+
retry_until_up=True,
|
163
|
+
fast=True,
|
164
|
+
_disable_controller_check=True)
|
160
165
|
|
161
166
|
|
162
167
|
def queue_from_kubernetes_pod(
|
@@ -194,16 +199,16 @@ def queue_from_kubernetes_pod(
|
|
194
199
|
provider_config = {'context': context}
|
195
200
|
instances = {
|
196
201
|
pod_name: [
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
202
|
+
provision_common.InstanceInfo(instance_id=pod_name,
|
203
|
+
internal_ip='',
|
204
|
+
external_ip='',
|
205
|
+
tags={})
|
201
206
|
]
|
202
207
|
} # Internal IP is not required for Kubernetes
|
203
|
-
cluster_info =
|
204
|
-
|
205
|
-
|
206
|
-
|
208
|
+
cluster_info = provision_common.ClusterInfo(provider_name='kubernetes',
|
209
|
+
head_instance_id=pod_name,
|
210
|
+
provider_config=provider_config,
|
211
|
+
instances=instances)
|
207
212
|
managed_jobs_runner = provision_lib.get_command_runners(
|
208
213
|
'kubernetes', cluster_info)[0]
|
209
214
|
|
sky/jobs/server/dashboard_utils.py
CHANGED
@@ -1,4 +1,9 @@
|
|
1
|
-
"""Persistent dashboard sessions."""
|
1
|
+
"""Persistent dashboard sessions.
|
2
|
+
|
3
|
+
Note: before #4717, this was useful because we needed to tunnel to multiple
|
4
|
+
controllers - one per user. Now, there is only one controller for the whole API
|
5
|
+
server, so this is not very useful. TODO(cooperc): Remove or fix this.
|
6
|
+
"""
|
2
7
|
import pathlib
|
3
8
|
from typing import Tuple
|
4
9
|
|
sky/jobs/server/server.py
CHANGED
@@ -21,11 +21,6 @@ logger = sky_logging.init_logger(__name__)
|
|
21
21
|
router = fastapi.APIRouter()
|
22
22
|
|
23
23
|
|
24
|
-
def _get_controller_name(request_body: payloads.RequestBody) -> str:
|
25
|
-
user_hash = request_body.user_hash
|
26
|
-
return common.get_controller_name(common.ControllerType.JOBS, user_hash)
|
27
|
-
|
28
|
-
|
29
24
|
@router.post('/launch')
|
30
25
|
async def launch(request: fastapi.Request,
|
31
26
|
jobs_launch_body: payloads.JobsLaunchBody) -> None:
|
@@ -35,7 +30,7 @@ async def launch(request: fastapi.Request,
|
|
35
30
|
request_body=jobs_launch_body,
|
36
31
|
func=core.launch,
|
37
32
|
schedule_type=api_requests.ScheduleType.LONG,
|
38
|
-
request_cluster_name=_get_controller_name(jobs_launch_body),
|
33
|
+
request_cluster_name=common.JOB_CONTROLLER_NAME,
|
39
34
|
)
|
40
35
|
|
41
36
|
|
@@ -49,7 +44,7 @@ async def queue(request: fastapi.Request,
|
|
49
44
|
func=core.queue,
|
50
45
|
schedule_type=(api_requests.ScheduleType.LONG if jobs_queue_body.refresh
|
51
46
|
else api_requests.ScheduleType.SHORT),
|
52
|
-
request_cluster_name=_get_controller_name(jobs_queue_body),
|
47
|
+
request_cluster_name=common.JOB_CONTROLLER_NAME,
|
53
48
|
)
|
54
49
|
|
55
50
|
|
@@ -62,7 +57,7 @@ async def cancel(request: fastapi.Request,
|
|
62
57
|
request_body=jobs_cancel_body,
|
63
58
|
func=core.cancel,
|
64
59
|
schedule_type=api_requests.ScheduleType.SHORT,
|
65
|
-
request_cluster_name=_get_controller_name(jobs_cancel_body),
|
60
|
+
request_cluster_name=common.JOB_CONTROLLER_NAME,
|
66
61
|
)
|
67
62
|
|
68
63
|
|
@@ -78,7 +73,7 @@ async def logs(
|
|
78
73
|
func=core.tail_logs,
|
79
74
|
schedule_type=api_requests.ScheduleType.SHORT
|
80
75
|
if jobs_logs_body.refresh else api_requests.ScheduleType.LONG,
|
81
|
-
request_cluster_name=_get_controller_name(jobs_logs_body),
|
76
|
+
request_cluster_name=common.JOB_CONTROLLER_NAME,
|
82
77
|
)
|
83
78
|
request_task = api_requests.get_request(request.state.request_id)
|
84
79
|
|
@@ -107,13 +102,16 @@ async def download_logs(
|
|
107
102
|
func=core.download_logs,
|
108
103
|
schedule_type=api_requests.ScheduleType.LONG
|
109
104
|
if jobs_download_logs_body.refresh else api_requests.ScheduleType.SHORT,
|
110
|
-
request_cluster_name=_get_controller_name(jobs_download_logs_body),
|
105
|
+
request_cluster_name=common.JOB_CONTROLLER_NAME,
|
111
106
|
)
|
112
107
|
|
113
108
|
|
114
109
|
@router.get('/dashboard')
|
115
110
|
async def dashboard(request: fastapi.Request,
|
116
111
|
user_hash: str) -> fastapi.Response:
|
112
|
+
# Note: before #4717, each user had their own controller, and thus their own
|
113
|
+
# dashboard. Now, all users share the same controller, so this isn't really
|
114
|
+
# necessary. TODO(cooperc): clean up.
|
117
115
|
# Find the port for the dashboard of the user
|
118
116
|
os.environ[constants.USER_ID_ENV_VAR] = user_hash
|
119
117
|
server_common.reload_for_new_request(client_entrypoint=None,
|
sky/serve/server/core.py
CHANGED
@@ -249,13 +249,16 @@ def up(
|
|
249
249
|
# with the current job id, we know the service is up and running
|
250
250
|
# for the first time; otherwise it is a name conflict.
|
251
251
|
idle_minutes_to_autostop = constants.CONTROLLER_IDLE_MINUTES_TO_AUTOSTOP
|
252
|
-
|
253
|
-
|
254
|
-
|
255
|
-
|
256
|
-
|
257
|
-
|
258
|
-
|
252
|
+
# Since the controller may be shared among multiple users, launch the
|
253
|
+
# controller with the API server's user hash.
|
254
|
+
with common.with_server_user_hash():
|
255
|
+
controller_job_id, controller_handle = execution.launch(
|
256
|
+
task=controller_task,
|
257
|
+
cluster_name=controller_name,
|
258
|
+
idle_minutes_to_autostop=idle_minutes_to_autostop,
|
259
|
+
retry_until_up=True,
|
260
|
+
_disable_controller_check=True,
|
261
|
+
)
|
259
262
|
|
260
263
|
style = colorama.Style
|
261
264
|
fore = colorama.Fore
|
sky/serve/server/server.py
CHANGED
@@ -14,11 +14,6 @@ logger = sky_logging.init_logger(__name__)
|
|
14
14
|
router = fastapi.APIRouter()
|
15
15
|
|
16
16
|
|
17
|
-
def _get_controller_name(request_body: payloads.RequestBody) -> str:
|
18
|
-
user_hash = request_body.user_hash
|
19
|
-
return common.get_controller_name(common.ControllerType.SERVE, user_hash)
|
20
|
-
|
21
|
-
|
22
17
|
@router.post('/up')
|
23
18
|
async def up(
|
24
19
|
request: fastapi.Request,
|
@@ -30,7 +25,7 @@ async def up(
|
|
30
25
|
request_body=up_body,
|
31
26
|
func=core.up,
|
32
27
|
schedule_type=api_requests.ScheduleType.LONG,
|
33
|
-
request_cluster_name=_get_controller_name(up_body),
|
28
|
+
request_cluster_name=common.SKY_SERVE_CONTROLLER_NAME,
|
34
29
|
)
|
35
30
|
|
36
31
|
|
@@ -45,7 +40,7 @@ async def update(
|
|
45
40
|
request_body=update_body,
|
46
41
|
func=core.update,
|
47
42
|
schedule_type=api_requests.ScheduleType.SHORT,
|
48
|
-
request_cluster_name=_get_controller_name(update_body),
|
43
|
+
request_cluster_name=common.SKY_SERVE_CONTROLLER_NAME,
|
49
44
|
)
|
50
45
|
|
51
46
|
|
@@ -60,7 +55,7 @@ async def down(
|
|
60
55
|
request_body=down_body,
|
61
56
|
func=core.down,
|
62
57
|
schedule_type=api_requests.ScheduleType.SHORT,
|
63
|
-
request_cluster_name=_get_controller_name(down_body),
|
58
|
+
request_cluster_name=common.SKY_SERVE_CONTROLLER_NAME,
|
64
59
|
)
|
65
60
|
|
66
61
|
|
@@ -75,7 +70,7 @@ async def terminate_replica(
|
|
75
70
|
request_body=terminate_replica_body,
|
76
71
|
func=core.terminate_replica,
|
77
72
|
schedule_type=api_requests.ScheduleType.SHORT,
|
78
|
-
request_cluster_name=_get_controller_name(terminate_replica_body),
|
73
|
+
request_cluster_name=common.SKY_SERVE_CONTROLLER_NAME,
|
79
74
|
)
|
80
75
|
|
81
76
|
|
@@ -90,7 +85,7 @@ async def status(
|
|
90
85
|
request_body=status_body,
|
91
86
|
func=core.status,
|
92
87
|
schedule_type=api_requests.ScheduleType.SHORT,
|
93
|
-
request_cluster_name=_get_controller_name(status_body),
|
88
|
+
request_cluster_name=common.SKY_SERVE_CONTROLLER_NAME,
|
94
89
|
)
|
95
90
|
|
96
91
|
|
@@ -105,7 +100,7 @@ async def tail_logs(
|
|
105
100
|
request_body=log_body,
|
106
101
|
func=core.tail_logs,
|
107
102
|
schedule_type=api_requests.ScheduleType.SHORT,
|
108
|
-
request_cluster_name=_get_controller_name(log_body),
|
103
|
+
request_cluster_name=common.SKY_SERVE_CONTROLLER_NAME,
|
109
104
|
)
|
110
105
|
|
111
106
|
request_task = api_requests.get_request(request.state.request_id)
|
sky/server/common.py
CHANGED
@@ -3,7 +3,6 @@
|
|
3
3
|
import dataclasses
|
4
4
|
import enum
|
5
5
|
import functools
|
6
|
-
import importlib
|
7
6
|
import json
|
8
7
|
import os
|
9
8
|
import pathlib
|
@@ -28,14 +27,12 @@ from sky.server import constants as server_constants
|
|
28
27
|
from sky.skylet import constants
|
29
28
|
from sky.usage import usage_lib
|
30
29
|
from sky.utils import annotations
|
31
|
-
from sky.utils import common
|
32
30
|
from sky.utils import common_utils
|
33
31
|
from sky.utils import rich_utils
|
34
32
|
from sky.utils import ux_utils
|
35
33
|
|
36
34
|
if typing.TYPE_CHECKING:
|
37
35
|
from sky import dag as dag_lib
|
38
|
-
from sky.server.requests import payloads
|
39
36
|
|
40
37
|
DEFAULT_SERVER_URL = 'http://127.0.0.1:46580'
|
41
38
|
AVAILBLE_LOCAL_API_SERVER_HOSTS = ['0.0.0.0', 'localhost', '127.0.0.1']
|
@@ -407,23 +404,6 @@ def request_body_to_params(body: pydantic.BaseModel) -> Dict[str, Any]:
|
|
407
404
|
def reload_for_new_request(client_entrypoint: Optional[str],
|
408
405
|
client_command: Optional[str]):
|
409
406
|
"""Reload modules, global variables, and usage message for a new request."""
|
410
|
-
# When a user request is sent to api server, it changes the user hash in the
|
411
|
-
# env vars, but since controller_utils is imported before the env vars are
|
412
|
-
# set, it doesn't get updated. So we need to reload it here.
|
413
|
-
# pylint: disable=import-outside-toplevel
|
414
|
-
from sky.utils import controller_utils
|
415
|
-
common.SKY_SERVE_CONTROLLER_NAME = common.get_controller_name(
|
416
|
-
common.ControllerType.SERVE)
|
417
|
-
common.JOB_CONTROLLER_NAME = common.get_controller_name(
|
418
|
-
common.ControllerType.JOBS)
|
419
|
-
# TODO(zhwu): We should avoid reloading the controller_utils module.
|
420
|
-
# Instead, we should reload required cache or global variables.
|
421
|
-
# TODO(zhwu): Reloading the controller_utils module may cause the global
|
422
|
-
# variables in other modules referring the `controller_utils.Controllers`
|
423
|
-
# dangling, as they will be pointing to the old object. We should not use
|
424
|
-
# it in global variables.
|
425
|
-
importlib.reload(controller_utils)
|
426
|
-
|
427
407
|
# Reset the client entrypoint and command for the usage message.
|
428
408
|
common_utils.set_client_entrypoint_and_command(
|
429
409
|
client_entrypoint=client_entrypoint,
|
sky/utils/common.py
CHANGED
@@ -1,53 +1,41 @@
|
|
1
1
|
"""Common enumerators and classes."""
|
2
2
|
|
3
|
+
import contextlib
|
3
4
|
import enum
|
4
|
-
|
5
|
+
import os
|
6
|
+
from typing import Generator
|
5
7
|
|
8
|
+
from sky.skylet import constants
|
6
9
|
from sky.utils import common_utils
|
7
10
|
|
8
11
|
SKY_SERVE_CONTROLLER_PREFIX: str = 'sky-serve-controller-'
|
9
12
|
JOB_CONTROLLER_PREFIX: str = 'sky-jobs-controller-'
|
10
|
-
|
11
|
-
#
|
12
|
-
# SkyPilot API server is started by the same user. It will be the same across
|
13
|
-
# the whole lifecycle of the server, including:
|
13
|
+
# We use the user hash (machine-specific) for the controller name. It will be
|
14
|
+
# the same across the whole lifecycle of the server, including:
|
14
15
|
# 1. all requests, because this global variable is set once during server
|
15
16
|
# starts.
|
16
17
|
# 2. SkyPilot API server restarts, as long as the `~/.sky` folder is persisted
|
17
18
|
# and the env var set during starting the server is the same.
|
19
|
+
# This behavior is the same for the local API server (where SERVER_ID is the
|
20
|
+
# same as the normal user hash). This ensures backwards-compatibility with jobs
|
21
|
+
# controllers from before #4660.
|
18
22
|
SERVER_ID = common_utils.get_user_hash()
|
23
|
+
SKY_SERVE_CONTROLLER_NAME: str = f'{SKY_SERVE_CONTROLLER_PREFIX}{SERVER_ID}'
|
24
|
+
JOB_CONTROLLER_NAME: str = f'{JOB_CONTROLLER_PREFIX}{SERVER_ID}'
|
19
25
|
|
20
26
|
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
# Comparing the two IDs can determine if the caller is trying to get the
|
34
|
-
# controller created by their local API server or a remote API server.
|
35
|
-
if user_hash == SERVER_ID:
|
36
|
-
# Not adding server ID for locally created controller because
|
37
|
-
# of backward compatibility.
|
38
|
-
return f'{prefix}{user_hash}'
|
39
|
-
return f'{prefix}{user_hash}{SERVER_ID_CONNECTOR}{SERVER_ID}'
|
40
|
-
|
41
|
-
|
42
|
-
# Controller names differ per user and per SkyPilot API server.
|
43
|
-
# If local: <prefix>-<user_id>
|
44
|
-
# If remote: <prefix>-<user_id>-remote-<api_server_user_id>
|
45
|
-
# DO NOT use these variables on the client side because client side doesn't know
|
46
|
-
# the remote server's user id, so client side will get local-version controller
|
47
|
-
# name.
|
48
|
-
# TODO(SKY-1106): remove dynamic constants like this.
|
49
|
-
SKY_SERVE_CONTROLLER_NAME: str = get_controller_name(ControllerType.SERVE)
|
50
|
-
JOB_CONTROLLER_NAME: str = get_controller_name(ControllerType.JOBS)
|
27
|
+
@contextlib.contextmanager
|
28
|
+
def with_server_user_hash() -> Generator[None, None, None]:
|
29
|
+
"""Temporarily set the user hash to common.SERVER_ID."""
|
30
|
+
old_env_user_hash = os.getenv(constants.USER_ID_ENV_VAR)
|
31
|
+
os.environ[constants.USER_ID_ENV_VAR] = SERVER_ID
|
32
|
+
try:
|
33
|
+
yield
|
34
|
+
finally:
|
35
|
+
if old_env_user_hash is not None:
|
36
|
+
os.environ[constants.USER_ID_ENV_VAR] = old_env_user_hash
|
37
|
+
else:
|
38
|
+
os.environ.pop(constants.USER_ID_ENV_VAR)
|
51
39
|
|
52
40
|
|
53
41
|
class StatusRefreshMode(enum.Enum):
|
@@ -64,11 +52,3 @@ class StatusRefreshMode(enum.Enum):
|
|
64
52
|
class OptimizeTarget(enum.Enum):
|
65
53
|
COST = 0
|
66
54
|
TIME = 1
|
67
|
-
|
68
|
-
|
69
|
-
def is_current_user_controller(controller_name: str) -> bool:
|
70
|
-
"""If the controller name belongs to the current user."""
|
71
|
-
if SERVER_ID_CONNECTOR in controller_name:
|
72
|
-
controller_name = controller_name.split(SERVER_ID_CONNECTOR)[0]
|
73
|
-
controller_user_id = controller_name.split('-')[-1]
|
74
|
-
return controller_user_id == common_utils.get_user_hash()
|
sky/utils/controller_utils.py
CHANGED
@@ -91,10 +91,6 @@ class Controllers(enum.Enum):
|
|
91
91
|
JOBS_CONTROLLER = _ControllerSpec(
|
92
92
|
controller_type='jobs',
|
93
93
|
name='managed jobs controller',
|
94
|
-
# Default cluster name is the current user's controller cluster unless
|
95
|
-
# caller initiate with a different controller name.
|
96
|
-
# TODO(zhwu): by having the controller name loaded in common, it
|
97
|
-
# will not respect the latest updated user hash.
|
98
94
|
cluster_name=common.JOB_CONTROLLER_NAME,
|
99
95
|
in_progress_hint=(
|
100
96
|
'* {job_info}To see all managed jobs: '
|
@@ -164,13 +160,18 @@ class Controllers(enum.Enum):
|
|
164
160
|
if name is None:
|
165
161
|
return None
|
166
162
|
controller = None
|
163
|
+
# The controller name is always the same. However, on the client-side,
|
164
|
+
# we may not know the exact name, because we are missing the server-side
|
165
|
+
# common.SERVER_ID. So, we will assume anything that matches the prefix
|
166
|
+
# is a controller.
|
167
167
|
if name.startswith(common.SKY_SERVE_CONTROLLER_PREFIX):
|
168
168
|
controller = cls.SKY_SERVE_CONTROLLER
|
169
169
|
elif name.startswith(common.JOB_CONTROLLER_PREFIX):
|
170
170
|
controller = cls.JOBS_CONTROLLER
|
171
171
|
if controller is not None and name != controller.value.cluster_name:
|
172
|
-
#
|
173
|
-
# so need to set the controller's
|
172
|
+
# The client-side cluster_name is not accurate. Assume that `name`
|
173
|
+
# is the actual cluster name, so need to set the controller's
|
174
|
+
# cluster name to the input name.
|
174
175
|
controller.value.cluster_name = name
|
175
176
|
return controller
|
176
177
|
|
{skypilot_nightly-1.0.0.dev20250218.dist-info → skypilot_nightly-1.0.0.dev20250219.dist-info}/RECORD
RENAMED
@@ -1,8 +1,8 @@
|
|
1
|
-
sky/__init__.py,sha256=
|
1
|
+
sky/__init__.py,sha256=eSspYNfxrf0xj8B8E1z5prY7j2xz0DdjEeOO8s5sMLU,6391
|
2
2
|
sky/admin_policy.py,sha256=hPo02f_A32gCqhUueF0QYy1fMSSKqRwYEg_9FxScN_s,3248
|
3
3
|
sky/authentication.py,sha256=hCEqi77nprQEg3ktfRL51xiiw16zwZOmFEDB_Z7fWVU,22384
|
4
4
|
sky/check.py,sha256=NDKx_Zm7YRxPjMv82wz3ESLnGIPljaACyqVdVNM0PzY,11258
|
5
|
-
sky/cli.py,sha256=
|
5
|
+
sky/cli.py,sha256=iwYBgEt3tgsYmOIp-ivPmL2FHoalvhH4Ng--C31ubws,218201
|
6
6
|
sky/cloud_stores.py,sha256=-95XIqi_ouo7hvoN5mQNP6bGm07MyF6Yk-YP4Txb5wg,24034
|
7
7
|
sky/core.py,sha256=gw_TrQOxz28sLAJJq6ajPnlRlrKQ2G1DtqLuntMejFU,45508
|
8
8
|
sky/dag.py,sha256=Yl7Ry26Vql5cv4YMz8g9kOUgtoCihJnw7c8NgZYakMY,3242
|
@@ -42,7 +42,7 @@ sky/benchmark/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
42
42
|
sky/benchmark/benchmark_state.py,sha256=X8CXmuU9KgsDRhKedhFgjeRMUFWtQsjFs1qECvPG2yg,8723
|
43
43
|
sky/benchmark/benchmark_utils.py,sha256=o4RymqSceq5mLEZL0upQM6NVEzJJQzj9s9tTm49uUTc,26365
|
44
44
|
sky/client/__init__.py,sha256=pz6xvVSd9X-gwqbsDL0E9QOojYqM0KAD0j-NCyCIF1k,38
|
45
|
-
sky/client/cli.py,sha256=
|
45
|
+
sky/client/cli.py,sha256=iwYBgEt3tgsYmOIp-ivPmL2FHoalvhH4Ng--C31ubws,218201
|
46
46
|
sky/client/common.py,sha256=axDic7WOG1e78SdFm5XIwdhX7YNvf3g4k7INrsW3X4s,14611
|
47
47
|
sky/client/sdk.py,sha256=q5R0_AquHAiLSLXpha8fIecQ9cgqqFba436xVzJ48oI,66943
|
48
48
|
sky/clouds/__init__.py,sha256=taKUCz6gWoKZhqHLYJXX-d0Ux6ZSQZEwxcNFdniupL0,1365
|
@@ -87,7 +87,7 @@ sky/clouds/service_catalog/data_fetchers/fetch_aws.py,sha256=Zj4bqWPiDcT_ZFyHxQw
|
|
87
87
|
sky/clouds/service_catalog/data_fetchers/fetch_azure.py,sha256=7YVnoGDGGZI2TK02bj_LOoD4E5J5CFl6eqz2XlR4Vy8,12790
|
88
88
|
sky/clouds/service_catalog/data_fetchers/fetch_cudo.py,sha256=52P48lvWN0s1ArjeLPeLemPRpxjSRcHincRle0nqdm4,3440
|
89
89
|
sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py,sha256=yKuAFbjBRNz_e2RNNDT_aHHAuKQ86Ac7GKgIie5O6Pg,7273
|
90
|
-
sky/clouds/service_catalog/data_fetchers/fetch_gcp.py,sha256=
|
90
|
+
sky/clouds/service_catalog/data_fetchers/fetch_gcp.py,sha256=JnugFifzHPQITlbDKoKexE8NqgagOEfQWTxon7P6vJ0,30935
|
91
91
|
sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py,sha256=MUzogyLruLQmIt-To6TsfnGPgv_nnlp49XYbeshsd7I,5003
|
92
92
|
sky/clouds/service_catalog/data_fetchers/fetch_vast.py,sha256=zR9icM3ty5C8tGw13pQbsBtQQMgG4kl1j_jSGqqrgOA,4741
|
93
93
|
sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py,sha256=Opp2r3KSzXPtwk3lKNbO8IX9QzjoRSwy1kW3jPjtS1c,21453
|
@@ -116,9 +116,9 @@ sky/jobs/dashboard/dashboard.py,sha256=kUKSXMAWAvPwJ_W_JK3wyz65Uope90_rNvhl8rZ1I
|
|
116
116
|
sky/jobs/dashboard/static/favicon.ico,sha256=uYlvgxSM7gjBmXpZ8wydvZUPAbJiiix-rc2Xe5mma9s,15086
|
117
117
|
sky/jobs/dashboard/templates/index.html,sha256=tz95q8O2pF7IvfY6yv0rnPyhj4DX8WX4RIVVxqFKV1Y,28519
|
118
118
|
sky/jobs/server/__init__.py,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
|
119
|
-
sky/jobs/server/core.py,sha256=
|
120
|
-
sky/jobs/server/dashboard_utils.py,sha256=
|
121
|
-
sky/jobs/server/server.py,sha256=
|
119
|
+
sky/jobs/server/core.py,sha256=zMLSSdNFQkP-RsfzCZ9jIcHNCL0lSvRd7PH3Sie0yPA,22615
|
120
|
+
sky/jobs/server/dashboard_utils.py,sha256=2Mbx40W1pQqPEPHsSDbHeaF0j5cgyKy-_A9Owdwp_AQ,2315
|
121
|
+
sky/jobs/server/server.py,sha256=s3wULAh4u4drdIz2VA8l0HiXxHWdUzsBDYCstzU0Vxs,7411
|
122
122
|
sky/provision/__init__.py,sha256=jiTOawg_wpy0s3Z-SEoOf7r280arLHUZzj-KPh-w7ek,6424
|
123
123
|
sky/provision/common.py,sha256=E8AlSUFcn0FYQq1erNmoVfMAdsF9tP2yxfyk-9PLvQU,10286
|
124
124
|
sky/provision/constants.py,sha256=oc_XDUkcoLQ_lwDy5yMeMSWviKS0j0s1c0pjlvpNeWY,800
|
@@ -218,10 +218,10 @@ sky/serve/service_spec.py,sha256=Q0qnFRjNnfGIpksubH5VqPKIlvpWs5had_Ma_PSHyo8,169
|
|
218
218
|
sky/serve/client/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
219
219
|
sky/serve/client/sdk.py,sha256=fVYQfvNuJxa8aZiS7LJoXFeGcjRidko0Tph5b6m0yMQ,11539
|
220
220
|
sky/serve/server/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
221
|
-
sky/serve/server/core.py,sha256=
|
222
|
-
sky/serve/server/server.py,sha256=
|
221
|
+
sky/serve/server/core.py,sha256=pRvFadEIH_WTUkTtSmuFoPBP4JFq8Obt68ifi9DWuog,36865
|
222
|
+
sky/serve/server/server.py,sha256=gQGVU9nHYdGbaLhGjIUNIYn4xwKjRASRJkiiTL5AI1Y,3283
|
223
223
|
sky/server/__init__.py,sha256=MPPBqFzXz6Jv5QSk6td_IcvnfXfNErDZVcizu4MLRow,27
|
224
|
-
sky/server/common.py,sha256=
|
224
|
+
sky/server/common.py,sha256=64sg18ehgkGadkiG18ekqEbqNN_8S4Ca1BLgiFvQ8b8,17397
|
225
225
|
sky/server/constants.py,sha256=SqhWJMassFyvWAJn2UJHvuA_0_C6f5vngMzZ2KYLsKw,770
|
226
226
|
sky/server/server.py,sha256=TZplXKA0KMs4UHLV3K5NSyhUPD0l2cmsiYgAZohn_Gs,41902
|
227
227
|
sky/server/stream_utils.py,sha256=6jo1Dq8EtD0AHmJ3e3zCUNAiSYQlUKbPil4h8pA-2ac,5813
|
@@ -301,11 +301,11 @@ sky/utils/annotations.py,sha256=-rfacB30Sl0xkFriejGvxma3oKctGfXXLZkQPHG33eo,1626
|
|
301
301
|
sky/utils/cluster_utils.py,sha256=s6DFRXktv6_gF_DnwDEXJ7CniifHp8CAPeGciRCbXgI,14432
|
302
302
|
sky/utils/command_runner.py,sha256=-7vxLvwZnTvYMQ_nScmuQWY6ZvQYv69yvvIp2uOaOqU,39063
|
303
303
|
sky/utils/command_runner.pyi,sha256=mJOzCgcYZAfHwnY_6Wf1YwlTEJGb9ihzc2f0rE0Kw98,7751
|
304
|
-
sky/utils/common.py,sha256=
|
304
|
+
sky/utils/common.py,sha256=P4oVXFATUYgkruHX92cN12SJBtfb8DiOOYZtbN1kvP0,1927
|
305
305
|
sky/utils/common_utils.py,sha256=wPECJDpeloyixalXNrdmVKXFyU1UKUtBES6D0mRd2mE,26180
|
306
306
|
sky/utils/config_utils.py,sha256=VQ2E3DQ2XysD-kul-diSrxn_pXWsDMfKAev91OiJQ1Q,9041
|
307
307
|
sky/utils/control_master_utils.py,sha256=iD4M0onjYOdZ2RuxjwMBl4KhafHXJzuHjvqlBUnu-VE,1450
|
308
|
-
sky/utils/controller_utils.py,sha256=
|
308
|
+
sky/utils/controller_utils.py,sha256=4Nck10XV6gNJKjBl7y_CIxIGqP3bbISuZSVTHbBumgs,45725
|
309
309
|
sky/utils/dag_utils.py,sha256=sAus0aL1wtuuFZSDnpO4LY-6WK4u5iJY952oWQzHo3Y,7532
|
310
310
|
sky/utils/db_utils.py,sha256=K2-OHPg0FeHCarevMdWe0IWzm6wWumViEeYeJuGoFUE,3747
|
311
311
|
sky/utils/env_options.py,sha256=aaD6GoYK0LaZIqjOEZ-R7eccQuiRriW3EuLWtOI5En8,1578
|
@@ -336,9 +336,9 @@ sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml,sha256=VLKT2KKimZu1GDg_4AIlIt488
|
|
336
336
|
sky/utils/kubernetes/kubernetes_deploy_utils.py,sha256=iAjfyPclOs8qlALACcfxLpRAO9CZ-h16leFqXZ6tNaY,10096
|
337
337
|
sky/utils/kubernetes/rsync_helper.sh,sha256=h4YwrPFf9727CACnMJvF3EyK_0OeOYKKt4su_daKekw,1256
|
338
338
|
sky/utils/kubernetes/ssh_jump_lifecycle_manager.py,sha256=Kq1MDygF2IxFmu9FXpCxqucXLmeUrvs6OtRij6XTQbo,6554
|
339
|
-
skypilot_nightly-1.0.0.
|
340
|
-
skypilot_nightly-1.0.0.
|
341
|
-
skypilot_nightly-1.0.0.
|
342
|
-
skypilot_nightly-1.0.0.
|
343
|
-
skypilot_nightly-1.0.0.
|
344
|
-
skypilot_nightly-1.0.0.
|
339
|
+
skypilot_nightly-1.0.0.dev20250219.dist-info/LICENSE,sha256=emRJAvE7ngL6x0RhQvlns5wJzGI3NEQ_WMjNmd9TZc4,12170
|
340
|
+
skypilot_nightly-1.0.0.dev20250219.dist-info/METADATA,sha256=wO3b_7Wt5UkHrHx5QDuqB-UKy3tIumd6DsrdpHfr03c,18916
|
341
|
+
skypilot_nightly-1.0.0.dev20250219.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
342
|
+
skypilot_nightly-1.0.0.dev20250219.dist-info/entry_points.txt,sha256=StA6HYpuHj-Y61L2Ze-hK2IcLWgLZcML5gJu8cs6nU4,36
|
343
|
+
skypilot_nightly-1.0.0.dev20250219.dist-info/top_level.txt,sha256=qA8QuiNNb6Y1OF-pCUtPEr6sLEwy2xJX06Bd_CrtrHY,4
|
344
|
+
skypilot_nightly-1.0.0.dev20250219.dist-info/RECORD,,
|
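{skypilot_nightly-1.0.0.dev20250218.dist-info → skypilot_nightly-1.0.0.dev20250219.dist-info}/LICENSE
RENAMED
File without changes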
|
{skypilot_nightly-1.0.0.dev20250218.dist-info → skypilot_nightly-1.0.0.dev20250219.dist-info}/WHEEL
RENAMED
File without changes
|
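{skypilot_nightly-1.0.0.dev20250218.dist-info → skypilot_nightly-1.0.0.dev20250219.dist-info}/entry_points.txt
RENAMED
File without changes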
|
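{skypilot_nightly-1.0.0.dev20250218.dist-info → skypilot_nightly-1.0.0.dev20250219.dist-info}/top_level.txt
RENAMED
File without changes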
|