skypilot-nightly 1.0.0.dev20250218__py3-none-any.whl → 1.0.0.dev20250220__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/cli.py +15 -24
- sky/client/cli.py +15 -24
- sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +2 -2
- sky/jobs/server/core.py +22 -17
- sky/jobs/server/dashboard_utils.py +6 -1
- sky/jobs/server/server.py +8 -10
- sky/serve/server/core.py +10 -7
- sky/serve/server/server.py +6 -11
- sky/server/common.py +5 -27
- sky/server/requests/executor.py +94 -87
- sky/server/server.py +10 -5
- sky/server/stream_utils.py +8 -11
- sky/utils/common.py +23 -43
- sky/utils/common_utils.py +38 -0
- sky/utils/controller_utils.py +7 -6
- {skypilot_nightly-1.0.0.dev20250218.dist-info → skypilot_nightly-1.0.0.dev20250220.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20250218.dist-info → skypilot_nightly-1.0.0.dev20250220.dist-info}/RECORD +22 -22
- {skypilot_nightly-1.0.0.dev20250218.dist-info → skypilot_nightly-1.0.0.dev20250220.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250218.dist-info → skypilot_nightly-1.0.0.dev20250220.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250218.dist-info → skypilot_nightly-1.0.0.dev20250220.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250218.dist-info → skypilot_nightly-1.0.0.dev20250220.dist-info}/top_level.txt +0 -0
sky/__init__.py
CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
|
|
5
5
|
import urllib.request
|
6
6
|
|
7
7
|
# Replaced with the current commit when building the wheels.
|
8
|
-
_SKYPILOT_COMMIT_SHA = '
|
8
|
+
_SKYPILOT_COMMIT_SHA = '6b2b31d8358f3ff8394a7a33ec49e9985ada230f'
|
9
9
|
|
10
10
|
|
11
11
|
def _get_git_commit():
|
@@ -35,7 +35,7 @@ def _get_git_commit():
|
|
35
35
|
|
36
36
|
|
37
37
|
__commit__ = _get_git_commit()
|
38
|
-
__version__ = '1.0.0.
|
38
|
+
__version__ = '1.0.0.dev20250220'
|
39
39
|
__root_dir__ = os.path.dirname(os.path.abspath(__file__))
|
40
40
|
|
41
41
|
|
sky/cli.py
CHANGED
@@ -1419,16 +1419,16 @@ def _handle_jobs_queue_request(
|
|
1419
1419
|
try:
|
1420
1420
|
# Check the controller status again, as the RuntimeError is likely
|
1421
1421
|
# due to the controller being autostopped when querying the jobs.
|
1422
|
-
|
1423
|
-
#
|
1424
|
-
# the controller cluster
|
1425
|
-
# '-remote-<hash>' when using remote API server.
|
1422
|
+
# Since we are client-side, we may not know the exact name of the
|
1423
|
+
# controller, so use the prefix with a wildcard.
|
1424
|
+
# Query status of the controller cluster.
|
1426
1425
|
records = sdk.get(
|
1427
|
-
sdk.status(
|
1428
|
-
|
1426
|
+
sdk.status(cluster_names=[common.JOB_CONTROLLER_PREFIX + '*'],
|
1427
|
+
all_users=True))
|
1429
1428
|
if (not records or
|
1430
1429
|
records[0]['status'] == status_lib.ClusterStatus.STOPPED):
|
1431
|
-
|
1430
|
+
controller = controller_utils.Controllers.JOBS_CONTROLLER.value
|
1431
|
+
msg = controller.default_hint_if_non_existent
|
1432
1432
|
except Exception: # pylint: disable=broad-except
|
1433
1433
|
# This is a best-effort attempt to find the latest controller status to
|
1434
1434
|
# print more helpful message, so we can ignore any exception to
|
@@ -1494,16 +1494,18 @@ def _handle_services_request(
|
|
1494
1494
|
# Check the controller status again, as the RuntimeError is likely
|
1495
1495
|
# due to the controller being autostopped when querying the
|
1496
1496
|
# services.
|
1497
|
-
|
1498
|
-
#
|
1499
|
-
# the controller cluster
|
1500
|
-
# '-remote-<hash>' when using remote API server.
|
1497
|
+
# Since we are client-side, we may not know the exact name of the
|
1498
|
+
# controller, so use the prefix with a wildcard.
|
1499
|
+
# Query status of the controller cluster.
|
1501
1500
|
records = sdk.get(
|
1502
1501
|
sdk.status(
|
1503
|
-
cluster_names=[
|
1502
|
+
cluster_names=[common.SKY_SERVE_CONTROLLER_PREFIX + '*'],
|
1503
|
+
all_users=True))
|
1504
1504
|
if (not records or
|
1505
1505
|
records[0]['status'] == status_lib.ClusterStatus.STOPPED):
|
1506
|
-
|
1506
|
+
controller = (
|
1507
|
+
controller_utils.Controllers.SKY_SERVE_CONTROLLER.value)
|
1508
|
+
msg = controller.default_hint_if_non_existent
|
1507
1509
|
except Exception: # pylint: disable=broad-except
|
1508
1510
|
# This is a best-effort attempt to find the latest controller status to
|
1509
1511
|
# print more helpful message, so we can ignore any exception to
|
@@ -2804,11 +2806,6 @@ def _hint_or_raise_for_down_jobs_controller(controller_name: str,
|
|
2804
2806
|
to be torn down (e.g., because it has jobs running or
|
2805
2807
|
it is in init state)
|
2806
2808
|
"""
|
2807
|
-
if not common.is_current_user_controller(controller_name):
|
2808
|
-
with ux_utils.print_exception_no_traceback():
|
2809
|
-
raise exceptions.NotSupportedError(
|
2810
|
-
f'Tearing down other user\'s managed job controller '
|
2811
|
-
f'{controller_name!r} is not allowed.')
|
2812
2809
|
controller = controller_utils.Controllers.from_name(controller_name)
|
2813
2810
|
assert controller is not None, controller_name
|
2814
2811
|
|
@@ -2868,12 +2865,6 @@ def _hint_or_raise_for_down_sky_serve_controller(controller_name: str,
|
|
2868
2865
|
to be torn down (e.g., because it has services running or
|
2869
2866
|
it is in init state)
|
2870
2867
|
"""
|
2871
|
-
# TODO(zhwu): Move this check to the sdk or even API server side.
|
2872
|
-
if not common.is_current_user_controller(controller_name):
|
2873
|
-
with ux_utils.print_exception_no_traceback():
|
2874
|
-
raise exceptions.NotSupportedError(
|
2875
|
-
f'Tearing down other user\'s sky serve controller '
|
2876
|
-
f'{controller_name!r} is not allowed.')
|
2877
2868
|
controller = controller_utils.Controllers.from_name(controller_name)
|
2878
2869
|
assert controller is not None, controller_name
|
2879
2870
|
with rich_utils.client_status('[bold cyan]Checking for live services[/]'):
|
sky/client/cli.py
CHANGED
@@ -1419,16 +1419,16 @@ def _handle_jobs_queue_request(
|
|
1419
1419
|
try:
|
1420
1420
|
# Check the controller status again, as the RuntimeError is likely
|
1421
1421
|
# due to the controller being autostopped when querying the jobs.
|
1422
|
-
|
1423
|
-
#
|
1424
|
-
# the controller cluster
|
1425
|
-
# '-remote-<hash>' when using remote API server.
|
1422
|
+
# Since we are client-side, we may not know the exact name of the
|
1423
|
+
# controller, so use the prefix with a wildcard.
|
1424
|
+
# Query status of the controller cluster.
|
1426
1425
|
records = sdk.get(
|
1427
|
-
sdk.status(
|
1428
|
-
|
1426
|
+
sdk.status(cluster_names=[common.JOB_CONTROLLER_PREFIX + '*'],
|
1427
|
+
all_users=True))
|
1429
1428
|
if (not records or
|
1430
1429
|
records[0]['status'] == status_lib.ClusterStatus.STOPPED):
|
1431
|
-
|
1430
|
+
controller = controller_utils.Controllers.JOBS_CONTROLLER.value
|
1431
|
+
msg = controller.default_hint_if_non_existent
|
1432
1432
|
except Exception: # pylint: disable=broad-except
|
1433
1433
|
# This is a best-effort attempt to find the latest controller status to
|
1434
1434
|
# print more helpful message, so we can ignore any exception to
|
@@ -1494,16 +1494,18 @@ def _handle_services_request(
|
|
1494
1494
|
# Check the controller status again, as the RuntimeError is likely
|
1495
1495
|
# due to the controller being autostopped when querying the
|
1496
1496
|
# services.
|
1497
|
-
|
1498
|
-
#
|
1499
|
-
# the controller cluster
|
1500
|
-
# '-remote-<hash>' when using remote API server.
|
1497
|
+
# Since we are client-side, we may not know the exact name of the
|
1498
|
+
# controller, so use the prefix with a wildcard.
|
1499
|
+
# Query status of the controller cluster.
|
1501
1500
|
records = sdk.get(
|
1502
1501
|
sdk.status(
|
1503
|
-
cluster_names=[
|
1502
|
+
cluster_names=[common.SKY_SERVE_CONTROLLER_PREFIX + '*'],
|
1503
|
+
all_users=True))
|
1504
1504
|
if (not records or
|
1505
1505
|
records[0]['status'] == status_lib.ClusterStatus.STOPPED):
|
1506
|
-
|
1506
|
+
controller = (
|
1507
|
+
controller_utils.Controllers.SKY_SERVE_CONTROLLER.value)
|
1508
|
+
msg = controller.default_hint_if_non_existent
|
1507
1509
|
except Exception: # pylint: disable=broad-except
|
1508
1510
|
# This is a best-effort attempt to find the latest controller status to
|
1509
1511
|
# print more helpful message, so we can ignore any exception to
|
@@ -2804,11 +2806,6 @@ def _hint_or_raise_for_down_jobs_controller(controller_name: str,
|
|
2804
2806
|
to be torn down (e.g., because it has jobs running or
|
2805
2807
|
it is in init state)
|
2806
2808
|
"""
|
2807
|
-
if not common.is_current_user_controller(controller_name):
|
2808
|
-
with ux_utils.print_exception_no_traceback():
|
2809
|
-
raise exceptions.NotSupportedError(
|
2810
|
-
f'Tearing down other user\'s managed job controller '
|
2811
|
-
f'{controller_name!r} is not allowed.')
|
2812
2809
|
controller = controller_utils.Controllers.from_name(controller_name)
|
2813
2810
|
assert controller is not None, controller_name
|
2814
2811
|
|
@@ -2868,12 +2865,6 @@ def _hint_or_raise_for_down_sky_serve_controller(controller_name: str,
|
|
2868
2865
|
to be torn down (e.g., because it has services running or
|
2869
2866
|
it is in init state)
|
2870
2867
|
"""
|
2871
|
-
# TODO(zhwu): Move this check to the sdk or even API server side.
|
2872
|
-
if not common.is_current_user_controller(controller_name):
|
2873
|
-
with ux_utils.print_exception_no_traceback():
|
2874
|
-
raise exceptions.NotSupportedError(
|
2875
|
-
f'Tearing down other user\'s sky serve controller '
|
2876
|
-
f'{controller_name!r} is not allowed.')
|
2877
2868
|
controller = controller_utils.Controllers.from_name(controller_name)
|
2878
2869
|
assert controller is not None, controller_name
|
2879
2870
|
with rich_utils.client_status('[bold cyan]Checking for live services[/]'):
|
@@ -60,8 +60,8 @@ HIDDEN_TPU_DF = pd.read_csv(
|
|
60
60
|
,tpu-v3-2048,1,,,tpu-v3-2048,2048.0,614.4,us-east1,us-east1-d
|
61
61
|
""")))
|
62
62
|
|
63
|
-
# TPU V6e price for
|
64
|
-
TPU_V6E_MISSING_REGIONS = ['us-central2']
|
63
|
+
# TPU V6e price for the following regions is missing in the SKUs.
|
64
|
+
TPU_V6E_MISSING_REGIONS = ['us-central2', 'southamerica-west1']
|
65
65
|
|
66
66
|
# TPU V5 is not visible in specific zones. We hardcode the missing zones here.
|
67
67
|
# NOTE(dev): Keep the zones and the df in sync.
|
sky/jobs/server/core.py
CHANGED
@@ -21,10 +21,11 @@ from sky.backends import backend_utils
|
|
21
21
|
from sky.clouds.service_catalog import common as service_catalog_common
|
22
22
|
from sky.jobs import constants as managed_job_constants
|
23
23
|
from sky.jobs import utils as managed_job_utils
|
24
|
-
from sky.provision import common
|
24
|
+
from sky.provision import common as provision_common
|
25
25
|
from sky.skylet import constants as skylet_constants
|
26
26
|
from sky.usage import usage_lib
|
27
27
|
from sky.utils import admin_policy_utils
|
28
|
+
from sky.utils import common
|
28
29
|
from sky.utils import common_utils
|
29
30
|
from sky.utils import controller_utils
|
30
31
|
from sky.utils import dag_utils
|
@@ -149,14 +150,18 @@ def launch(
|
|
149
150
|
f'{colorama.Fore.YELLOW}'
|
150
151
|
f'Launching managed job {dag.name!r} from jobs controller...'
|
151
152
|
f'{colorama.Style.RESET_ALL}')
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
153
|
+
|
154
|
+
# Launch with the api server's user hash, so that sky status does not
|
155
|
+
# show the owner of the controller as whatever user launched it first.
|
156
|
+
with common.with_server_user_hash():
|
157
|
+
return execution.launch(task=controller_task,
|
158
|
+
cluster_name=controller_name,
|
159
|
+
stream_logs=stream_logs,
|
160
|
+
idle_minutes_to_autostop=skylet_constants.
|
161
|
+
CONTROLLER_IDLE_MINUTES_TO_AUTOSTOP,
|
162
|
+
retry_until_up=True,
|
163
|
+
fast=True,
|
164
|
+
_disable_controller_check=True)
|
160
165
|
|
161
166
|
|
162
167
|
def queue_from_kubernetes_pod(
|
@@ -194,16 +199,16 @@ def queue_from_kubernetes_pod(
|
|
194
199
|
provider_config = {'context': context}
|
195
200
|
instances = {
|
196
201
|
pod_name: [
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
202
|
+
provision_common.InstanceInfo(instance_id=pod_name,
|
203
|
+
internal_ip='',
|
204
|
+
external_ip='',
|
205
|
+
tags={})
|
201
206
|
]
|
202
207
|
} # Internal IP is not required for Kubernetes
|
203
|
-
cluster_info =
|
204
|
-
|
205
|
-
|
206
|
-
|
208
|
+
cluster_info = provision_common.ClusterInfo(provider_name='kubernetes',
|
209
|
+
head_instance_id=pod_name,
|
210
|
+
provider_config=provider_config,
|
211
|
+
instances=instances)
|
207
212
|
managed_jobs_runner = provision_lib.get_command_runners(
|
208
213
|
'kubernetes', cluster_info)[0]
|
209
214
|
|
@@ -1,4 +1,9 @@
|
|
1
|
-
"""Persistent dashboard sessions.
|
1
|
+
"""Persistent dashboard sessions.
|
2
|
+
|
3
|
+
Note: before #4717, this was useful because we needed to tunnel to multiple
|
4
|
+
controllers - one per user. Now, there is only one controller for the whole API
|
5
|
+
server, so this is not very useful. TODO(cooperc): Remove or fix this.
|
6
|
+
"""
|
2
7
|
import pathlib
|
3
8
|
from typing import Tuple
|
4
9
|
|
sky/jobs/server/server.py
CHANGED
@@ -21,11 +21,6 @@ logger = sky_logging.init_logger(__name__)
|
|
21
21
|
router = fastapi.APIRouter()
|
22
22
|
|
23
23
|
|
24
|
-
def _get_controller_name(request_body: payloads.RequestBody) -> str:
|
25
|
-
user_hash = request_body.user_hash
|
26
|
-
return common.get_controller_name(common.ControllerType.JOBS, user_hash)
|
27
|
-
|
28
|
-
|
29
24
|
@router.post('/launch')
|
30
25
|
async def launch(request: fastapi.Request,
|
31
26
|
jobs_launch_body: payloads.JobsLaunchBody) -> None:
|
@@ -35,7 +30,7 @@ async def launch(request: fastapi.Request,
|
|
35
30
|
request_body=jobs_launch_body,
|
36
31
|
func=core.launch,
|
37
32
|
schedule_type=api_requests.ScheduleType.LONG,
|
38
|
-
request_cluster_name=
|
33
|
+
request_cluster_name=common.JOB_CONTROLLER_NAME,
|
39
34
|
)
|
40
35
|
|
41
36
|
|
@@ -49,7 +44,7 @@ async def queue(request: fastapi.Request,
|
|
49
44
|
func=core.queue,
|
50
45
|
schedule_type=(api_requests.ScheduleType.LONG if jobs_queue_body.refresh
|
51
46
|
else api_requests.ScheduleType.SHORT),
|
52
|
-
request_cluster_name=
|
47
|
+
request_cluster_name=common.JOB_CONTROLLER_NAME,
|
53
48
|
)
|
54
49
|
|
55
50
|
|
@@ -62,7 +57,7 @@ async def cancel(request: fastapi.Request,
|
|
62
57
|
request_body=jobs_cancel_body,
|
63
58
|
func=core.cancel,
|
64
59
|
schedule_type=api_requests.ScheduleType.SHORT,
|
65
|
-
request_cluster_name=
|
60
|
+
request_cluster_name=common.JOB_CONTROLLER_NAME,
|
66
61
|
)
|
67
62
|
|
68
63
|
|
@@ -78,7 +73,7 @@ async def logs(
|
|
78
73
|
func=core.tail_logs,
|
79
74
|
schedule_type=api_requests.ScheduleType.SHORT
|
80
75
|
if jobs_logs_body.refresh else api_requests.ScheduleType.LONG,
|
81
|
-
request_cluster_name=
|
76
|
+
request_cluster_name=common.JOB_CONTROLLER_NAME,
|
82
77
|
)
|
83
78
|
request_task = api_requests.get_request(request.state.request_id)
|
84
79
|
|
@@ -107,13 +102,16 @@ async def download_logs(
|
|
107
102
|
func=core.download_logs,
|
108
103
|
schedule_type=api_requests.ScheduleType.LONG
|
109
104
|
if jobs_download_logs_body.refresh else api_requests.ScheduleType.SHORT,
|
110
|
-
request_cluster_name=
|
105
|
+
request_cluster_name=common.JOB_CONTROLLER_NAME,
|
111
106
|
)
|
112
107
|
|
113
108
|
|
114
109
|
@router.get('/dashboard')
|
115
110
|
async def dashboard(request: fastapi.Request,
|
116
111
|
user_hash: str) -> fastapi.Response:
|
112
|
+
# Note: before #4717, each user had their own controller, and thus their own
|
113
|
+
# dashboard. Now, all users share the same controller, so this isn't really
|
114
|
+
# necessary. TODO(cooperc): clean up.
|
117
115
|
# Find the port for the dashboard of the user
|
118
116
|
os.environ[constants.USER_ID_ENV_VAR] = user_hash
|
119
117
|
server_common.reload_for_new_request(client_entrypoint=None,
|
sky/serve/server/core.py
CHANGED
@@ -249,13 +249,16 @@ def up(
|
|
249
249
|
# with the current job id, we know the service is up and running
|
250
250
|
# for the first time; otherwise it is a name conflict.
|
251
251
|
idle_minutes_to_autostop = constants.CONTROLLER_IDLE_MINUTES_TO_AUTOSTOP
|
252
|
-
|
253
|
-
|
254
|
-
|
255
|
-
|
256
|
-
|
257
|
-
|
258
|
-
|
252
|
+
# Since the controller may be shared among multiple users, launch the
|
253
|
+
# controller with the API server's user hash.
|
254
|
+
with common.with_server_user_hash():
|
255
|
+
controller_job_id, controller_handle = execution.launch(
|
256
|
+
task=controller_task,
|
257
|
+
cluster_name=controller_name,
|
258
|
+
idle_minutes_to_autostop=idle_minutes_to_autostop,
|
259
|
+
retry_until_up=True,
|
260
|
+
_disable_controller_check=True,
|
261
|
+
)
|
259
262
|
|
260
263
|
style = colorama.Style
|
261
264
|
fore = colorama.Fore
|
sky/serve/server/server.py
CHANGED
@@ -14,11 +14,6 @@ logger = sky_logging.init_logger(__name__)
|
|
14
14
|
router = fastapi.APIRouter()
|
15
15
|
|
16
16
|
|
17
|
-
def _get_controller_name(request_body: payloads.RequestBody) -> str:
|
18
|
-
user_hash = request_body.user_hash
|
19
|
-
return common.get_controller_name(common.ControllerType.SERVE, user_hash)
|
20
|
-
|
21
|
-
|
22
17
|
@router.post('/up')
|
23
18
|
async def up(
|
24
19
|
request: fastapi.Request,
|
@@ -30,7 +25,7 @@ async def up(
|
|
30
25
|
request_body=up_body,
|
31
26
|
func=core.up,
|
32
27
|
schedule_type=api_requests.ScheduleType.LONG,
|
33
|
-
request_cluster_name=
|
28
|
+
request_cluster_name=common.SKY_SERVE_CONTROLLER_NAME,
|
34
29
|
)
|
35
30
|
|
36
31
|
|
@@ -45,7 +40,7 @@ async def update(
|
|
45
40
|
request_body=update_body,
|
46
41
|
func=core.update,
|
47
42
|
schedule_type=api_requests.ScheduleType.SHORT,
|
48
|
-
request_cluster_name=
|
43
|
+
request_cluster_name=common.SKY_SERVE_CONTROLLER_NAME,
|
49
44
|
)
|
50
45
|
|
51
46
|
|
@@ -60,7 +55,7 @@ async def down(
|
|
60
55
|
request_body=down_body,
|
61
56
|
func=core.down,
|
62
57
|
schedule_type=api_requests.ScheduleType.SHORT,
|
63
|
-
request_cluster_name=
|
58
|
+
request_cluster_name=common.SKY_SERVE_CONTROLLER_NAME,
|
64
59
|
)
|
65
60
|
|
66
61
|
|
@@ -75,7 +70,7 @@ async def terminate_replica(
|
|
75
70
|
request_body=terminate_replica_body,
|
76
71
|
func=core.terminate_replica,
|
77
72
|
schedule_type=api_requests.ScheduleType.SHORT,
|
78
|
-
request_cluster_name=
|
73
|
+
request_cluster_name=common.SKY_SERVE_CONTROLLER_NAME,
|
79
74
|
)
|
80
75
|
|
81
76
|
|
@@ -90,7 +85,7 @@ async def status(
|
|
90
85
|
request_body=status_body,
|
91
86
|
func=core.status,
|
92
87
|
schedule_type=api_requests.ScheduleType.SHORT,
|
93
|
-
request_cluster_name=
|
88
|
+
request_cluster_name=common.SKY_SERVE_CONTROLLER_NAME,
|
94
89
|
)
|
95
90
|
|
96
91
|
|
@@ -105,7 +100,7 @@ async def tail_logs(
|
|
105
100
|
request_body=log_body,
|
106
101
|
func=core.tail_logs,
|
107
102
|
schedule_type=api_requests.ScheduleType.SHORT,
|
108
|
-
request_cluster_name=
|
103
|
+
request_cluster_name=common.SKY_SERVE_CONTROLLER_NAME,
|
109
104
|
)
|
110
105
|
|
111
106
|
request_task = api_requests.get_request(request.state.request_id)
|
sky/server/common.py
CHANGED
@@ -3,7 +3,6 @@
|
|
3
3
|
import dataclasses
|
4
4
|
import enum
|
5
5
|
import functools
|
6
|
-
import importlib
|
7
6
|
import json
|
8
7
|
import os
|
9
8
|
import pathlib
|
@@ -16,7 +15,6 @@ import uuid
|
|
16
15
|
|
17
16
|
import colorama
|
18
17
|
import filelock
|
19
|
-
import psutil
|
20
18
|
import pydantic
|
21
19
|
import requests
|
22
20
|
|
@@ -28,14 +26,12 @@ from sky.server import constants as server_constants
|
|
28
26
|
from sky.skylet import constants
|
29
27
|
from sky.usage import usage_lib
|
30
28
|
from sky.utils import annotations
|
31
|
-
from sky.utils import common
|
32
29
|
from sky.utils import common_utils
|
33
30
|
from sky.utils import rich_utils
|
34
31
|
from sky.utils import ux_utils
|
35
32
|
|
36
33
|
if typing.TYPE_CHECKING:
|
37
34
|
from sky import dag as dag_lib
|
38
|
-
from sky.server.requests import payloads
|
39
35
|
|
40
36
|
DEFAULT_SERVER_URL = 'http://127.0.0.1:46580'
|
41
37
|
AVAILBLE_LOCAL_API_SERVER_HOSTS = ['0.0.0.0', 'localhost', '127.0.0.1']
|
@@ -149,13 +145,14 @@ def get_api_server_status(endpoint: Optional[str] = None) -> ApiServerInfo:
|
|
149
145
|
return ApiServerInfo(status=ApiServerStatus.UNHEALTHY, api_version=None)
|
150
146
|
|
151
147
|
|
152
|
-
def
|
148
|
+
def start_api_server_in_background(deploy: bool = False,
|
149
|
+
host: str = '127.0.0.1'):
|
153
150
|
if not is_api_server_local():
|
154
151
|
raise RuntimeError(
|
155
152
|
f'Cannot start API server: {get_server_url()} is not a local URL')
|
156
153
|
|
157
154
|
# Check available memory before starting the server.
|
158
|
-
avail_mem_size_gb: float =
|
155
|
+
avail_mem_size_gb: float = common_utils.get_mem_size_gb()
|
159
156
|
if avail_mem_size_gb <= server_constants.MIN_AVAIL_MEM_GB:
|
160
157
|
logger.warning(
|
161
158
|
f'{colorama.Fore.YELLOW}Your SkyPilot API server machine only has '
|
@@ -166,8 +163,6 @@ def start_uvicorn_in_background(deploy: bool = False, host: str = '127.0.0.1'):
|
|
166
163
|
log_path = os.path.expanduser(constants.API_SERVER_LOGS)
|
167
164
|
os.makedirs(os.path.dirname(log_path), exist_ok=True)
|
168
165
|
|
169
|
-
# The command to run uvicorn. Adjust the app:app to your application's
|
170
|
-
# location.
|
171
166
|
api_server_cmd = API_SERVER_CMD
|
172
167
|
if deploy:
|
173
168
|
api_server_cmd += ' --deploy'
|
@@ -175,7 +170,7 @@ def start_uvicorn_in_background(deploy: bool = False, host: str = '127.0.0.1'):
|
|
175
170
|
api_server_cmd += f' --host {host}'
|
176
171
|
cmd = f'{sys.executable} {api_server_cmd} > {log_path} 2>&1'
|
177
172
|
|
178
|
-
# Start the
|
173
|
+
# Start the API server process in the background and don't wait for it.
|
179
174
|
# If this is called from a CLI invocation, we need start_new_session=True so
|
180
175
|
# that SIGINT on the CLI will not also kill the API server.
|
181
176
|
subprocess.Popen(cmd, shell=True, start_new_session=True)
|
@@ -235,7 +230,7 @@ def _start_api_server(deploy: bool = False, host: str = '127.0.0.1'):
|
|
235
230
|
f'SkyPilot API server at {server_url}. '
|
236
231
|
'Starting a local server.'
|
237
232
|
f'{colorama.Style.RESET_ALL}')
|
238
|
-
|
233
|
+
start_api_server_in_background(deploy=deploy, host=host)
|
239
234
|
logger.info(ux_utils.finishing_message('SkyPilot API server started.'))
|
240
235
|
|
241
236
|
|
@@ -407,23 +402,6 @@ def request_body_to_params(body: pydantic.BaseModel) -> Dict[str, Any]:
|
|
407
402
|
def reload_for_new_request(client_entrypoint: Optional[str],
|
408
403
|
client_command: Optional[str]):
|
409
404
|
"""Reload modules, global variables, and usage message for a new request."""
|
410
|
-
# When a user request is sent to api server, it changes the user hash in the
|
411
|
-
# env vars, but since controller_utils is imported before the env vars are
|
412
|
-
# set, it doesn't get updated. So we need to reload it here.
|
413
|
-
# pylint: disable=import-outside-toplevel
|
414
|
-
from sky.utils import controller_utils
|
415
|
-
common.SKY_SERVE_CONTROLLER_NAME = common.get_controller_name(
|
416
|
-
common.ControllerType.SERVE)
|
417
|
-
common.JOB_CONTROLLER_NAME = common.get_controller_name(
|
418
|
-
common.ControllerType.JOBS)
|
419
|
-
# TODO(zhwu): We should avoid reloading the controller_utils module.
|
420
|
-
# Instead, we should reload required cache or global variables.
|
421
|
-
# TODO(zhwu): Reloading the controller_utils module may cause the global
|
422
|
-
# variables in other modules referring the `controller_utils.Controllers`
|
423
|
-
# dangling, as they will be pointing to the old object. We should not use
|
424
|
-
# it in global variables.
|
425
|
-
importlib.reload(controller_utils)
|
426
|
-
|
427
405
|
# Reset the client entrypoint and command for the usage message.
|
428
406
|
common_utils.set_client_entrypoint_and_command(
|
429
407
|
client_entrypoint=client_entrypoint,
|
sky/server/requests/executor.py
CHANGED
@@ -32,7 +32,6 @@ import traceback
|
|
32
32
|
import typing
|
33
33
|
from typing import Any, Callable, Generator, List, Optional, TextIO, Tuple
|
34
34
|
|
35
|
-
import psutil
|
36
35
|
import setproctitle
|
37
36
|
|
38
37
|
from sky import global_user_state
|
@@ -70,18 +69,36 @@ logger = sky_logging.init_logger(__name__)
|
|
70
69
|
# platforms, including macOS.
|
71
70
|
multiprocessing.set_start_method('spawn', force=True)
|
72
71
|
|
73
|
-
# Constants based on profiling the peak memory usage
|
74
|
-
#
|
75
|
-
#
|
76
|
-
|
77
|
-
|
78
|
-
#
|
79
|
-
|
80
|
-
|
81
|
-
#
|
72
|
+
# Constants based on profiling the peak memory usage while serving various
|
73
|
+
# sky commands. These estimates are highly dependent on usage patterns
|
74
|
+
# (clouds enabled, type of requests, etc. see `tests/load_tests` for details.),
|
75
|
+
# the profiling covers major clouds and common usage patterns. Users whose
|
76
|
+
# usage pattern deviates from this can override the default estimates via
|
77
|
+
# environment variables.
|
78
|
+
# NOTE(dev): update these constants for each release according to the load
|
79
|
+
# test results.
|
80
|
+
# TODO(aylei): maintaining these constants is error-prone, we may need to
|
81
|
+
# automatically tune parallelism at runtime according to system usage stats
|
82
|
+
# in the future.
|
83
|
+
_LONG_WORKER_MEM_GB = 0.4
|
84
|
+
_SHORT_WORKER_MEM_GB = 0.25
|
85
|
+
# To control the number of long workers.
|
86
|
+
_CPU_MULTIPLIER_FOR_LONG_WORKERS = 2
|
87
|
+
# Limit the number of long workers of local API server, since local server is
|
88
|
+
# typically:
|
89
|
+
# 1. launched automatically in an environment with high resource contention
|
90
|
+
# (e.g. Laptop)
|
91
|
+
# 2. used by a single user
|
92
|
+
_MAX_LONG_WORKERS_LOCAL = 4
|
93
|
+
# Percentage of memory for long requests
|
82
94
|
# from the memory reserved for SkyPilot.
|
83
|
-
# This is to reserve some memory for
|
95
|
+
# This is to reserve some memory for short requests.
|
84
96
|
_MAX_MEM_PERCENT_FOR_BLOCKING = 0.6
|
97
|
+
# Minimal number of long workers to ensure responsiveness.
|
98
|
+
_MIN_LONG_WORKERS = 1
|
99
|
+
# Minimal number of short workers, there is a daemon task running on short
|
100
|
+
# workers so at least 2 workers are needed to ensure responsiveness.
|
101
|
+
_MIN_SHORT_WORKERS = 2
|
85
102
|
|
86
103
|
|
87
104
|
class QueueBackend(enum.Enum):
|
@@ -301,34 +318,32 @@ def schedule_request(request_id: str,
|
|
301
318
|
_get_queue(schedule_type).put(input_tuple)
|
302
319
|
|
303
320
|
|
321
|
+
def executor_initializer(proc_group: str):
|
322
|
+
setproctitle.setproctitle(f'SkyPilot:executor:{proc_group}:'
|
323
|
+
f'{multiprocessing.current_process().pid}')
|
324
|
+
|
325
|
+
|
304
326
|
def request_worker(worker: RequestWorker, max_parallel_size: int) -> None:
|
305
327
|
"""Worker for the requests.
|
306
328
|
|
307
329
|
Args:
|
308
330
|
max_parallel_size: Maximum number of parallel jobs this worker can run.
|
309
331
|
"""
|
310
|
-
|
311
|
-
|
312
|
-
setproctitle.setproctitle(
|
313
|
-
f'SkyPilot:worker:{worker.schedule_type.value}-{worker.id}')
|
332
|
+
proc_group = f'{worker.schedule_type.value}-{worker.id}'
|
333
|
+
setproctitle.setproctitle(f'SkyPilot:worker:{proc_group}')
|
314
334
|
queue = _get_queue(worker.schedule_type)
|
315
|
-
|
316
|
-
|
317
|
-
|
318
|
-
# We use executor instead of individual multiprocessing.Process to avoid
|
319
|
-
# the overhead of forking a new process for each request, which can be about
|
320
|
-
# 1s delay.
|
321
|
-
with concurrent.futures.ProcessPoolExecutor(
|
322
|
-
max_workers=max_parallel_size) as executor:
|
323
|
-
while True:
|
335
|
+
|
336
|
+
def process_request(executor: concurrent.futures.ProcessPoolExecutor):
|
337
|
+
try:
|
324
338
|
request_element = queue.get()
|
325
339
|
if request_element is None:
|
326
340
|
time.sleep(0.1)
|
327
|
-
|
341
|
+
return
|
328
342
|
request_id, ignore_return_value = request_element
|
329
343
|
request = api_requests.get_request(request_id)
|
344
|
+
assert request is not None, f'Request with ID {request_id} is None'
|
330
345
|
if request.status == api_requests.RequestStatus.CANCELLED:
|
331
|
-
|
346
|
+
return
|
332
347
|
logger.info(f'[{worker}] Submitting request: {request_id}')
|
333
348
|
# Start additional process to run the request, so that it can be
|
334
349
|
# cancelled when requested by a user.
|
@@ -347,60 +362,49 @@ def request_worker(worker: RequestWorker, max_parallel_size: int) -> None:
|
|
347
362
|
logger.info(f'[{worker}] Finished request: {request_id}')
|
348
363
|
else:
|
349
364
|
logger.info(f'[{worker}] Submitted request: {request_id}')
|
365
|
+
except KeyboardInterrupt:
|
366
|
+
# Interrupting the worker process will stop request execution, but
|
367
|
+
# the SIGTERM request should be respected anyway since it might
|
368
|
+
# be explicitly sent by user.
|
369
|
+
# TODO(aylei): crash the API server or recreate the worker process
|
370
|
+
# to avoid broken state.
|
371
|
+
logger.error(f'[{worker}] Worker process interrupted')
|
372
|
+
raise
|
373
|
+
except (Exception, SystemExit) as e: # pylint: disable=broad-except
|
374
|
+
# Catch any other exceptions to avoid crashing the worker process.
|
375
|
+
logger.error(
|
376
|
+
f'[{worker}] Error processing request {request_id}: '
|
377
|
+
f'{common_utils.format_exception(e, use_bracket=True)}')
|
350
378
|
|
351
|
-
|
352
|
-
|
353
|
-
|
354
|
-
|
355
|
-
|
356
|
-
|
357
|
-
|
358
|
-
|
359
|
-
|
360
|
-
|
361
|
-
|
362
|
-
|
363
|
-
with ux_utils.print_exception_no_traceback():
|
364
|
-
raise ValueError(
|
365
|
-
f'Failed to parse the number of CPUs from {cpu_count}'
|
366
|
-
) from e
|
367
|
-
return psutil.cpu_count()
|
368
|
-
|
369
|
-
|
370
|
-
def _get_mem_size_gb() -> float:
|
371
|
-
"""Get the memory size in GB.
|
372
|
-
|
373
|
-
If the API server is deployed as a pod in k8s cluster, we assume the
|
374
|
-
memory size is provided by the downward API.
|
375
|
-
"""
|
376
|
-
mem_size = os.getenv('SKYPILOT_POD_MEMORY_GB_LIMIT')
|
377
|
-
if mem_size is not None:
|
378
|
-
try:
|
379
|
-
return float(mem_size)
|
380
|
-
except ValueError as e:
|
381
|
-
with ux_utils.print_exception_no_traceback():
|
382
|
-
raise ValueError(
|
383
|
-
f'Failed to parse the memory size from {mem_size}') from e
|
384
|
-
return psutil.virtual_memory().total / (1024**3)
|
379
|
+
# Use concurrent.futures.ProcessPoolExecutor instead of multiprocessing.Pool
|
380
|
+
# because the former is more efficient with the support of lazy creation of
|
381
|
+
# worker processes.
|
382
|
+
# We use executor instead of individual multiprocessing.Process to avoid
|
383
|
+
# the overhead of forking a new process for each request, which can be about
|
384
|
+
# 1s delay.
|
385
|
+
with concurrent.futures.ProcessPoolExecutor(
|
386
|
+
max_workers=max_parallel_size,
|
387
|
+
initializer=executor_initializer,
|
388
|
+
initargs=(proc_group,)) as executor:
|
389
|
+
while True:
|
390
|
+
process_request(executor)
|
385
391
|
|
386
392
|
|
387
393
|
def start(deploy: bool) -> List[multiprocessing.Process]:
|
388
394
|
"""Start the request workers."""
|
389
395
|
# Determine the job capacity of the workers based on the system resources.
|
390
|
-
cpu_count =
|
391
|
-
mem_size_gb =
|
396
|
+
cpu_count = common_utils.get_cpu_count()
|
397
|
+
mem_size_gb = common_utils.get_mem_size_gb()
|
392
398
|
mem_size_gb = max(0, mem_size_gb - server_constants.MIN_AVAIL_MEM_GB)
|
393
|
-
|
394
|
-
|
395
|
-
|
396
|
-
|
397
|
-
|
398
|
-
max_parallel_for_non_blocking = _max_parallel_size_for_non_blocking(
|
399
|
-
mem_size_gb, parallel_for_blocking)
|
399
|
+
max_parallel_for_long = _max_long_worker_parallism(cpu_count,
|
400
|
+
mem_size_gb,
|
401
|
+
local=not deploy)
|
402
|
+
max_parallel_for_short = _max_short_worker_parallism(
|
403
|
+
mem_size_gb, max_parallel_for_long)
|
400
404
|
logger.info(
|
401
|
-
f'SkyPilot API server will start {
|
402
|
-
f'
|
403
|
-
f'{
|
405
|
+
f'SkyPilot API server will start {max_parallel_for_long} workers for '
|
406
|
+
f'long requests and will allow at max '
|
407
|
+
f'{max_parallel_for_short} short requests in parallel.')
|
404
408
|
|
405
409
|
# Setup the queues.
|
406
410
|
if queue_backend == QueueBackend.MULTIPROCESSING:
|
@@ -424,7 +428,7 @@ def start(deploy: bool) -> List[multiprocessing.Process]:
|
|
424
428
|
logger.info('Request queues created')
|
425
429
|
|
426
430
|
worker_procs = []
|
427
|
-
for worker_id in range(
|
431
|
+
for worker_id in range(max_parallel_for_long):
|
428
432
|
worker = RequestWorker(id=worker_id,
|
429
433
|
schedule_type=api_requests.ScheduleType.LONG)
|
430
434
|
worker_proc = multiprocessing.Process(target=request_worker,
|
@@ -432,31 +436,34 @@ def start(deploy: bool) -> List[multiprocessing.Process]:
|
|
432
436
|
worker_proc.start()
|
433
437
|
worker_procs.append(worker_proc)
|
434
438
|
|
435
|
-
# Start a
|
439
|
+
# Start a worker for short requests.
|
436
440
|
worker = RequestWorker(id=1, schedule_type=api_requests.ScheduleType.SHORT)
|
437
441
|
worker_proc = multiprocessing.Process(target=request_worker,
|
438
|
-
args=(worker,
|
439
|
-
max_parallel_for_non_blocking))
|
442
|
+
args=(worker, max_parallel_for_short))
|
440
443
|
worker_proc.start()
|
441
444
|
worker_procs.append(worker_proc)
|
442
445
|
return worker_procs
|
443
446
|
|
444
447
|
|
445
448
|
@annotations.lru_cache(scope='global', maxsize=1)
|
446
|
-
def
|
447
|
-
|
448
|
-
|
449
|
+
def _max_long_worker_parallism(cpu_count: int,
|
450
|
+
mem_size_gb: float,
|
451
|
+
local=False) -> int:
|
452
|
+
"""Max parallelism for long workers."""
|
453
|
+
cpu_based_max_parallel = cpu_count * _CPU_MULTIPLIER_FOR_LONG_WORKERS
|
449
454
|
mem_based_max_parallel = int(mem_size_gb * _MAX_MEM_PERCENT_FOR_BLOCKING /
|
450
|
-
|
451
|
-
n = max(
|
455
|
+
_LONG_WORKER_MEM_GB)
|
456
|
+
n = max(_MIN_LONG_WORKERS,
|
457
|
+
min(cpu_based_max_parallel, mem_based_max_parallel))
|
458
|
+
if local:
|
459
|
+
return min(n, _MAX_LONG_WORKERS_LOCAL)
|
452
460
|
return n
|
453
461
|
|
454
462
|
|
455
463
|
@annotations.lru_cache(scope='global', maxsize=1)
|
456
|
-
def
|
457
|
-
|
458
|
-
"""Max parallelism for
|
459
|
-
available_mem = mem_size_gb - (
|
460
|
-
|
461
|
-
n = max(1, int(available_mem / _PER_NON_BLOCKING_REQUEST_MEM_GB))
|
464
|
+
def _max_short_worker_parallism(mem_size_gb: float,
|
465
|
+
long_worker_parallism: int) -> int:
|
466
|
+
"""Max parallelism for short workers."""
|
467
|
+
available_mem = mem_size_gb - (long_worker_parallism * _LONG_WORKER_MEM_GB)
|
468
|
+
n = max(_MIN_SHORT_WORKERS, int(available_mem / _SHORT_WORKER_MEM_GB))
|
462
469
|
return n
|
sky/server/server.py
CHANGED
@@ -57,7 +57,9 @@ P = ParamSpec('P')
|
|
57
57
|
|
58
58
|
def _add_timestamp_prefix_for_server_logs() -> None:
|
59
59
|
server_logger = sky_logging.init_logger('sky.server')
|
60
|
-
#
|
60
|
+
# Clear existing handlers first to prevent duplicates
|
61
|
+
server_logger.handlers.clear()
|
62
|
+
# Disable propagation to avoid the root logger of SkyPilot being affected
|
61
63
|
server_logger.propagate = False
|
62
64
|
# Add date prefix to the log message printed by loggers under
|
63
65
|
# server.
|
@@ -460,6 +462,7 @@ async def launch(launch_body: payloads.LaunchBody,
|
|
460
462
|
request: fastapi.Request) -> None:
|
461
463
|
"""Launches a cluster or task."""
|
462
464
|
request_id = request.state.request_id
|
465
|
+
logger.info(f'Launching request: {request_id}')
|
463
466
|
executor.schedule_request(
|
464
467
|
request_id,
|
465
468
|
request_name='launch',
|
@@ -627,6 +630,9 @@ async def logs(
|
|
627
630
|
request_name='logs',
|
628
631
|
request_body=cluster_job_body,
|
629
632
|
func=core.tail_logs,
|
633
|
+
# TODO(aylei): We have tail logs scheduled as SHORT request, because it
|
634
|
+
# should be responsive. However, it can be long running if the user's
|
635
|
+
# job keeps running, and we should avoid it taking the SHORT worker.
|
630
636
|
schedule_type=requests_lib.ScheduleType.SHORT,
|
631
637
|
request_cluster_name=cluster_job_body.cluster_name,
|
632
638
|
)
|
@@ -794,10 +800,9 @@ async def api_get(request_id: str) -> requests_lib.RequestPayload:
|
|
794
800
|
detail=dataclasses.asdict(
|
795
801
|
request_task.encode()))
|
796
802
|
return request_task.encode()
|
797
|
-
#
|
798
|
-
#
|
799
|
-
|
800
|
-
await asyncio.sleep(0)
|
803
|
+
# yield control to allow other coroutines to run, sleep shortly
|
804
|
+
# to avoid storming the DB and CPU in the meantime
|
805
|
+
await asyncio.sleep(0.1)
|
801
806
|
|
802
807
|
|
803
808
|
@app.get('/api/stream')
|
sky/server/stream_utils.py
CHANGED
@@ -68,7 +68,7 @@ async def log_streamer(request_id: Optional[str],
|
|
68
68
|
# Sleep 0 to yield, so other coroutines can run. This busy waiting
|
69
69
|
# loop is performance critical for short-running requests, so we do
|
70
70
|
# not want to yield too long.
|
71
|
-
await asyncio.sleep(0)
|
71
|
+
await asyncio.sleep(0.1)
|
72
72
|
request_task = requests_lib.get_request(request_id)
|
73
73
|
if not follow:
|
74
74
|
break
|
@@ -88,6 +88,9 @@ async def log_streamer(request_id: Optional[str],
|
|
88
88
|
yield line_str
|
89
89
|
|
90
90
|
while True:
|
91
|
+
# Sleep 0 to yield control to allow other coroutines to run,
|
92
|
+
# while keeping the loop tight to make the log stream responsive.
|
93
|
+
await asyncio.sleep(0)
|
91
94
|
line: Optional[bytes] = await f.readline()
|
92
95
|
if not line:
|
93
96
|
if request_id is not None:
|
@@ -100,24 +103,18 @@ async def log_streamer(request_id: Optional[str],
|
|
100
103
|
break
|
101
104
|
if not follow:
|
102
105
|
break
|
103
|
-
|
104
|
-
#
|
105
|
-
#
|
106
|
-
|
107
|
-
await asyncio.sleep(0)
|
106
|
+
# Sleep shortly to avoid storming the DB and CPU, this has
|
107
|
+
# little impact on the responsiveness here since we are waiting
|
108
|
+
# for a new line to come in.
|
109
|
+
await asyncio.sleep(0.1)
|
108
110
|
continue
|
109
111
|
line_str = line.decode('utf-8')
|
110
112
|
if plain_logs:
|
111
113
|
is_payload, line_str = message_utils.decode_payload(
|
112
114
|
line_str, raise_for_mismatch=False)
|
113
115
|
if is_payload:
|
114
|
-
# Sleep 0 to yield, so other coroutines can run. This busy
|
115
|
-
# waiting loop is performance critical for short-running
|
116
|
-
# requests, so we do not want to yield too long.
|
117
|
-
await asyncio.sleep(0)
|
118
116
|
continue
|
119
117
|
yield line_str
|
120
|
-
await asyncio.sleep(0) # Allow other tasks to run
|
121
118
|
|
122
119
|
|
123
120
|
def stream_response(
|
sky/utils/common.py
CHANGED
@@ -1,53 +1,41 @@
|
|
1
1
|
"""Common enumerators and classes."""
|
2
2
|
|
3
|
+
import contextlib
|
3
4
|
import enum
|
4
|
-
|
5
|
+
import os
|
6
|
+
from typing import Generator
|
5
7
|
|
8
|
+
from sky.skylet import constants
|
6
9
|
from sky.utils import common_utils
|
7
10
|
|
8
11
|
SKY_SERVE_CONTROLLER_PREFIX: str = 'sky-serve-controller-'
|
9
12
|
JOB_CONTROLLER_PREFIX: str = 'sky-jobs-controller-'
|
10
|
-
|
11
|
-
#
|
12
|
-
# SkyPilot API server is started by the same user. It will be the same across
|
13
|
-
# the whole lifecycle of the server, including:
|
13
|
+
# We use the user hash (machine-specific) for the controller name. It will be
|
14
|
+
# the same across the whole lifecycle of the server, including:
|
14
15
|
# 1. all requests, because this global variable is set once during server
|
15
16
|
# starts.
|
16
17
|
# 2. SkyPilot API server restarts, as long as the `~/.sky` folder is persisted
|
17
18
|
# and the env var set during starting the server is the same.
|
19
|
+
# This behavior is the same for the local API server (where SERVER_ID is the
|
20
|
+
# same as the normal user hash). This ensures backwards-compatibility with jobs
|
21
|
+
# controllers from before #4660.
|
18
22
|
SERVER_ID = common_utils.get_user_hash()
|
23
|
+
SKY_SERVE_CONTROLLER_NAME: str = f'{SKY_SERVE_CONTROLLER_PREFIX}{SERVER_ID}'
|
24
|
+
JOB_CONTROLLER_NAME: str = f'{JOB_CONTROLLER_PREFIX}{SERVER_ID}'
|
19
25
|
|
20
26
|
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
# Comparing the two IDs can determine if the caller is trying to get the
|
34
|
-
# controller created by their local API server or a remote API server.
|
35
|
-
if user_hash == SERVER_ID:
|
36
|
-
# Not adding server ID for locally created controller because
|
37
|
-
# of backward compatibility.
|
38
|
-
return f'{prefix}{user_hash}'
|
39
|
-
return f'{prefix}{user_hash}{SERVER_ID_CONNECTOR}{SERVER_ID}'
|
40
|
-
|
41
|
-
|
42
|
-
# Controller names differ per user and per SkyPilot API server.
|
43
|
-
# If local: <prefix>-<user_id>
|
44
|
-
# If remote: <prefix>-<user_id>-remote-<api_server_user_id>
|
45
|
-
# DO NOT use these variables on the client side because client side doesn't know
|
46
|
-
# the remote server's user id, so client side will get local-version controller
|
47
|
-
# name.
|
48
|
-
# TODO(SKY-1106): remove dynamic constants like this.
|
49
|
-
SKY_SERVE_CONTROLLER_NAME: str = get_controller_name(ControllerType.SERVE)
|
50
|
-
JOB_CONTROLLER_NAME: str = get_controller_name(ControllerType.JOBS)
|
27
|
+
@contextlib.contextmanager
|
28
|
+
def with_server_user_hash() -> Generator[None, None, None]:
|
29
|
+
"""Temporarily set the user hash to common.SERVER_ID."""
|
30
|
+
old_env_user_hash = os.getenv(constants.USER_ID_ENV_VAR)
|
31
|
+
os.environ[constants.USER_ID_ENV_VAR] = SERVER_ID
|
32
|
+
try:
|
33
|
+
yield
|
34
|
+
finally:
|
35
|
+
if old_env_user_hash is not None:
|
36
|
+
os.environ[constants.USER_ID_ENV_VAR] = old_env_user_hash
|
37
|
+
else:
|
38
|
+
os.environ.pop(constants.USER_ID_ENV_VAR)
|
51
39
|
|
52
40
|
|
53
41
|
class StatusRefreshMode(enum.Enum):
|
@@ -64,11 +52,3 @@ class StatusRefreshMode(enum.Enum):
|
|
64
52
|
class OptimizeTarget(enum.Enum):
|
65
53
|
COST = 0
|
66
54
|
TIME = 1
|
67
|
-
|
68
|
-
|
69
|
-
def is_current_user_controller(controller_name: str) -> bool:
|
70
|
-
"""If the controller name belongs to the current user."""
|
71
|
-
if SERVER_ID_CONNECTOR in controller_name:
|
72
|
-
controller_name = controller_name.split(SERVER_ID_CONNECTOR)[0]
|
73
|
-
controller_user_id = controller_name.split('-')[-1]
|
74
|
-
return controller_user_id == common_utils.get_user_hash()
|
sky/utils/common_utils.py
CHANGED
@@ -18,6 +18,7 @@ import uuid
|
|
18
18
|
|
19
19
|
import jinja2
|
20
20
|
import jsonschema
|
21
|
+
import psutil
|
21
22
|
import yaml
|
22
23
|
|
23
24
|
from sky import exceptions
|
@@ -755,3 +756,40 @@ def is_port_available(port: int, reuse_addr: bool = True) -> bool:
|
|
755
756
|
return True
|
756
757
|
except OSError:
|
757
758
|
return False
|
759
|
+
|
760
|
+
|
761
|
+
# TODO(aylei): should be aware of cgroups
|
762
|
+
def get_cpu_count() -> int:
|
763
|
+
"""Get the number of CPUs.
|
764
|
+
|
765
|
+
If the API server is deployed as a pod in k8s cluster, we assume the
|
766
|
+
number of CPUs is provided by the downward API.
|
767
|
+
"""
|
768
|
+
cpu_count = os.getenv('SKYPILOT_POD_CPU_CORE_LIMIT')
|
769
|
+
if cpu_count is not None:
|
770
|
+
try:
|
771
|
+
return int(float(cpu_count))
|
772
|
+
except ValueError as e:
|
773
|
+
with ux_utils.print_exception_no_traceback():
|
774
|
+
raise ValueError(
|
775
|
+
f'Failed to parse the number of CPUs from {cpu_count}'
|
776
|
+
) from e
|
777
|
+
return psutil.cpu_count()
|
778
|
+
|
779
|
+
|
780
|
+
# TODO(aylei): should be aware of cgroups
|
781
|
+
def get_mem_size_gb() -> float:
|
782
|
+
"""Get the memory size in GB.
|
783
|
+
|
784
|
+
If the API server is deployed as a pod in k8s cluster, we assume the
|
785
|
+
memory size is provided by the downward API.
|
786
|
+
"""
|
787
|
+
mem_size = os.getenv('SKYPILOT_POD_MEMORY_GB_LIMIT')
|
788
|
+
if mem_size is not None:
|
789
|
+
try:
|
790
|
+
return float(mem_size)
|
791
|
+
except ValueError as e:
|
792
|
+
with ux_utils.print_exception_no_traceback():
|
793
|
+
raise ValueError(
|
794
|
+
f'Failed to parse the memory size from {mem_size}') from e
|
795
|
+
return psutil.virtual_memory().total / (1024**3)
|
sky/utils/controller_utils.py
CHANGED
@@ -91,10 +91,6 @@ class Controllers(enum.Enum):
|
|
91
91
|
JOBS_CONTROLLER = _ControllerSpec(
|
92
92
|
controller_type='jobs',
|
93
93
|
name='managed jobs controller',
|
94
|
-
# Default cluster name is the current user's controller cluster unless
|
95
|
-
# caller initiate with a different controller name.
|
96
|
-
# TODO(zhwu): by having the controller name loaded in common, it
|
97
|
-
# will not respect the latest updated user hash.
|
98
94
|
cluster_name=common.JOB_CONTROLLER_NAME,
|
99
95
|
in_progress_hint=(
|
100
96
|
'* {job_info}To see all managed jobs: '
|
@@ -164,13 +160,18 @@ class Controllers(enum.Enum):
|
|
164
160
|
if name is None:
|
165
161
|
return None
|
166
162
|
controller = None
|
163
|
+
# The controller name is always the same. However, on the client-side,
|
164
|
+
# we may not know the exact name, because we are missing the server-side
|
165
|
+
# common.SERVER_ID. So, we will assume anything that matches the prefix
|
166
|
+
# is a controller.
|
167
167
|
if name.startswith(common.SKY_SERVE_CONTROLLER_PREFIX):
|
168
168
|
controller = cls.SKY_SERVE_CONTROLLER
|
169
169
|
elif name.startswith(common.JOB_CONTROLLER_PREFIX):
|
170
170
|
controller = cls.JOBS_CONTROLLER
|
171
171
|
if controller is not None and name != controller.value.cluster_name:
|
172
|
-
#
|
173
|
-
# so need to set the controller's
|
172
|
+
# The client-side cluster_name is not accurate. Assume that `name`
|
173
|
+
# is the actual cluster name, so need to set the controller's
|
174
|
+
# cluster name to the input name.
|
174
175
|
controller.value.cluster_name = name
|
175
176
|
return controller
|
176
177
|
|
{skypilot_nightly-1.0.0.dev20250218.dist-info → skypilot_nightly-1.0.0.dev20250220.dist-info}/RECORD
RENAMED
@@ -1,8 +1,8 @@
|
|
1
|
-
sky/__init__.py,sha256=
|
1
|
+
sky/__init__.py,sha256=2WOLIr_y7h-Dzd_2cUqq56HiHaF6TBVULtoUaAeb-5c,6391
|
2
2
|
sky/admin_policy.py,sha256=hPo02f_A32gCqhUueF0QYy1fMSSKqRwYEg_9FxScN_s,3248
|
3
3
|
sky/authentication.py,sha256=hCEqi77nprQEg3ktfRL51xiiw16zwZOmFEDB_Z7fWVU,22384
|
4
4
|
sky/check.py,sha256=NDKx_Zm7YRxPjMv82wz3ESLnGIPljaACyqVdVNM0PzY,11258
|
5
|
-
sky/cli.py,sha256=
|
5
|
+
sky/cli.py,sha256=iwYBgEt3tgsYmOIp-ivPmL2FHoalvhH4Ng--C31ubws,218201
|
6
6
|
sky/cloud_stores.py,sha256=-95XIqi_ouo7hvoN5mQNP6bGm07MyF6Yk-YP4Txb5wg,24034
|
7
7
|
sky/core.py,sha256=gw_TrQOxz28sLAJJq6ajPnlRlrKQ2G1DtqLuntMejFU,45508
|
8
8
|
sky/dag.py,sha256=Yl7Ry26Vql5cv4YMz8g9kOUgtoCihJnw7c8NgZYakMY,3242
|
@@ -42,7 +42,7 @@ sky/benchmark/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
42
42
|
sky/benchmark/benchmark_state.py,sha256=X8CXmuU9KgsDRhKedhFgjeRMUFWtQsjFs1qECvPG2yg,8723
|
43
43
|
sky/benchmark/benchmark_utils.py,sha256=o4RymqSceq5mLEZL0upQM6NVEzJJQzj9s9tTm49uUTc,26365
|
44
44
|
sky/client/__init__.py,sha256=pz6xvVSd9X-gwqbsDL0E9QOojYqM0KAD0j-NCyCIF1k,38
|
45
|
-
sky/client/cli.py,sha256=
|
45
|
+
sky/client/cli.py,sha256=iwYBgEt3tgsYmOIp-ivPmL2FHoalvhH4Ng--C31ubws,218201
|
46
46
|
sky/client/common.py,sha256=axDic7WOG1e78SdFm5XIwdhX7YNvf3g4k7INrsW3X4s,14611
|
47
47
|
sky/client/sdk.py,sha256=q5R0_AquHAiLSLXpha8fIecQ9cgqqFba436xVzJ48oI,66943
|
48
48
|
sky/clouds/__init__.py,sha256=taKUCz6gWoKZhqHLYJXX-d0Ux6ZSQZEwxcNFdniupL0,1365
|
@@ -87,7 +87,7 @@ sky/clouds/service_catalog/data_fetchers/fetch_aws.py,sha256=Zj4bqWPiDcT_ZFyHxQw
|
|
87
87
|
sky/clouds/service_catalog/data_fetchers/fetch_azure.py,sha256=7YVnoGDGGZI2TK02bj_LOoD4E5J5CFl6eqz2XlR4Vy8,12790
|
88
88
|
sky/clouds/service_catalog/data_fetchers/fetch_cudo.py,sha256=52P48lvWN0s1ArjeLPeLemPRpxjSRcHincRle0nqdm4,3440
|
89
89
|
sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py,sha256=yKuAFbjBRNz_e2RNNDT_aHHAuKQ86Ac7GKgIie5O6Pg,7273
|
90
|
-
sky/clouds/service_catalog/data_fetchers/fetch_gcp.py,sha256=
|
90
|
+
sky/clouds/service_catalog/data_fetchers/fetch_gcp.py,sha256=JnugFifzHPQITlbDKoKexE8NqgagOEfQWTxon7P6vJ0,30935
|
91
91
|
sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py,sha256=MUzogyLruLQmIt-To6TsfnGPgv_nnlp49XYbeshsd7I,5003
|
92
92
|
sky/clouds/service_catalog/data_fetchers/fetch_vast.py,sha256=zR9icM3ty5C8tGw13pQbsBtQQMgG4kl1j_jSGqqrgOA,4741
|
93
93
|
sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py,sha256=Opp2r3KSzXPtwk3lKNbO8IX9QzjoRSwy1kW3jPjtS1c,21453
|
@@ -116,9 +116,9 @@ sky/jobs/dashboard/dashboard.py,sha256=kUKSXMAWAvPwJ_W_JK3wyz65Uope90_rNvhl8rZ1I
|
|
116
116
|
sky/jobs/dashboard/static/favicon.ico,sha256=uYlvgxSM7gjBmXpZ8wydvZUPAbJiiix-rc2Xe5mma9s,15086
|
117
117
|
sky/jobs/dashboard/templates/index.html,sha256=tz95q8O2pF7IvfY6yv0rnPyhj4DX8WX4RIVVxqFKV1Y,28519
|
118
118
|
sky/jobs/server/__init__.py,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
|
119
|
-
sky/jobs/server/core.py,sha256=
|
120
|
-
sky/jobs/server/dashboard_utils.py,sha256=
|
121
|
-
sky/jobs/server/server.py,sha256=
|
119
|
+
sky/jobs/server/core.py,sha256=zMLSSdNFQkP-RsfzCZ9jIcHNCL0lSvRd7PH3Sie0yPA,22615
|
120
|
+
sky/jobs/server/dashboard_utils.py,sha256=2Mbx40W1pQqPEPHsSDbHeaF0j5cgyKy-_A9Owdwp_AQ,2315
|
121
|
+
sky/jobs/server/server.py,sha256=s3wULAh4u4drdIz2VA8l0HiXxHWdUzsBDYCstzU0Vxs,7411
|
122
122
|
sky/provision/__init__.py,sha256=jiTOawg_wpy0s3Z-SEoOf7r280arLHUZzj-KPh-w7ek,6424
|
123
123
|
sky/provision/common.py,sha256=E8AlSUFcn0FYQq1erNmoVfMAdsF9tP2yxfyk-9PLvQU,10286
|
124
124
|
sky/provision/constants.py,sha256=oc_XDUkcoLQ_lwDy5yMeMSWviKS0j0s1c0pjlvpNeWY,800
|
@@ -218,16 +218,16 @@ sky/serve/service_spec.py,sha256=Q0qnFRjNnfGIpksubH5VqPKIlvpWs5had_Ma_PSHyo8,169
|
|
218
218
|
sky/serve/client/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
219
219
|
sky/serve/client/sdk.py,sha256=fVYQfvNuJxa8aZiS7LJoXFeGcjRidko0Tph5b6m0yMQ,11539
|
220
220
|
sky/serve/server/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
221
|
-
sky/serve/server/core.py,sha256=
|
222
|
-
sky/serve/server/server.py,sha256=
|
221
|
+
sky/serve/server/core.py,sha256=pRvFadEIH_WTUkTtSmuFoPBP4JFq8Obt68ifi9DWuog,36865
|
222
|
+
sky/serve/server/server.py,sha256=gQGVU9nHYdGbaLhGjIUNIYn4xwKjRASRJkiiTL5AI1Y,3283
|
223
223
|
sky/server/__init__.py,sha256=MPPBqFzXz6Jv5QSk6td_IcvnfXfNErDZVcizu4MLRow,27
|
224
|
-
sky/server/common.py,sha256=
|
224
|
+
sky/server/common.py,sha256=uBshF4a-U8NGgm8XOHTW2YNSq0CsByfdIFgiybU5PEg,17321
|
225
225
|
sky/server/constants.py,sha256=SqhWJMassFyvWAJn2UJHvuA_0_C6f5vngMzZ2KYLsKw,770
|
226
|
-
sky/server/server.py,sha256=
|
227
|
-
sky/server/stream_utils.py,sha256
|
226
|
+
sky/server/server.py,sha256=0gcIn3jr_4DkHpBJYdNq--uPo9Im8bn2ftxgd8mBMcU,42225
|
227
|
+
sky/server/stream_utils.py,sha256=-3IX1YCgxAFfcvQIV0TCvOn1wbRLWovAx3ckCrsExWU,5651
|
228
228
|
sky/server/html/log.html,sha256=TSGZktua9Ysl_ysg3w60rjxAxhH61AJnsYDHdtqrjmI,6929
|
229
229
|
sky/server/requests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
230
|
-
sky/server/requests/executor.py,sha256=
|
230
|
+
sky/server/requests/executor.py,sha256=NxVB0aFA05GddXDdt89wEwEYyJcIIrsQxE2wowklhUI,19597
|
231
231
|
sky/server/requests/payloads.py,sha256=PeEkqQoTO3ellelkFX5yzPKbPkDV-NfVXkxHndYlrjE,15769
|
232
232
|
sky/server/requests/requests.py,sha256=aMdjiK5kjSYP36pxdXFU6qgKOXcOmtViHbFm3V8Dvf8,19590
|
233
233
|
sky/server/requests/queues/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
@@ -301,11 +301,11 @@ sky/utils/annotations.py,sha256=-rfacB30Sl0xkFriejGvxma3oKctGfXXLZkQPHG33eo,1626
|
|
301
301
|
sky/utils/cluster_utils.py,sha256=s6DFRXktv6_gF_DnwDEXJ7CniifHp8CAPeGciRCbXgI,14432
|
302
302
|
sky/utils/command_runner.py,sha256=-7vxLvwZnTvYMQ_nScmuQWY6ZvQYv69yvvIp2uOaOqU,39063
|
303
303
|
sky/utils/command_runner.pyi,sha256=mJOzCgcYZAfHwnY_6Wf1YwlTEJGb9ihzc2f0rE0Kw98,7751
|
304
|
-
sky/utils/common.py,sha256=
|
305
|
-
sky/utils/common_utils.py,sha256
|
304
|
+
sky/utils/common.py,sha256=P4oVXFATUYgkruHX92cN12SJBtfb8DiOOYZtbN1kvP0,1927
|
305
|
+
sky/utils/common_utils.py,sha256=-O0GthIockeJy8LlA4heVYYtaUdQwNA-5mFMqHajRf8,27457
|
306
306
|
sky/utils/config_utils.py,sha256=VQ2E3DQ2XysD-kul-diSrxn_pXWsDMfKAev91OiJQ1Q,9041
|
307
307
|
sky/utils/control_master_utils.py,sha256=iD4M0onjYOdZ2RuxjwMBl4KhafHXJzuHjvqlBUnu-VE,1450
|
308
|
-
sky/utils/controller_utils.py,sha256=
|
308
|
+
sky/utils/controller_utils.py,sha256=4Nck10XV6gNJKjBl7y_CIxIGqP3bbISuZSVTHbBumgs,45725
|
309
309
|
sky/utils/dag_utils.py,sha256=sAus0aL1wtuuFZSDnpO4LY-6WK4u5iJY952oWQzHo3Y,7532
|
310
310
|
sky/utils/db_utils.py,sha256=K2-OHPg0FeHCarevMdWe0IWzm6wWumViEeYeJuGoFUE,3747
|
311
311
|
sky/utils/env_options.py,sha256=aaD6GoYK0LaZIqjOEZ-R7eccQuiRriW3EuLWtOI5En8,1578
|
@@ -336,9 +336,9 @@ sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml,sha256=VLKT2KKimZu1GDg_4AIlIt488
|
|
336
336
|
sky/utils/kubernetes/kubernetes_deploy_utils.py,sha256=iAjfyPclOs8qlALACcfxLpRAO9CZ-h16leFqXZ6tNaY,10096
|
337
337
|
sky/utils/kubernetes/rsync_helper.sh,sha256=h4YwrPFf9727CACnMJvF3EyK_0OeOYKKt4su_daKekw,1256
|
338
338
|
sky/utils/kubernetes/ssh_jump_lifecycle_manager.py,sha256=Kq1MDygF2IxFmu9FXpCxqucXLmeUrvs6OtRij6XTQbo,6554
|
339
|
-
skypilot_nightly-1.0.0.
|
340
|
-
skypilot_nightly-1.0.0.
|
341
|
-
skypilot_nightly-1.0.0.
|
342
|
-
skypilot_nightly-1.0.0.
|
343
|
-
skypilot_nightly-1.0.0.
|
344
|
-
skypilot_nightly-1.0.0.
|
339
|
+
skypilot_nightly-1.0.0.dev20250220.dist-info/LICENSE,sha256=emRJAvE7ngL6x0RhQvlns5wJzGI3NEQ_WMjNmd9TZc4,12170
|
340
|
+
skypilot_nightly-1.0.0.dev20250220.dist-info/METADATA,sha256=uYtMxJQSUuL9hPmfqny_uQvuqWy65W5mHUHv7HvJb-o,18916
|
341
|
+
skypilot_nightly-1.0.0.dev20250220.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
342
|
+
skypilot_nightly-1.0.0.dev20250220.dist-info/entry_points.txt,sha256=StA6HYpuHj-Y61L2Ze-hK2IcLWgLZcML5gJu8cs6nU4,36
|
343
|
+
skypilot_nightly-1.0.0.dev20250220.dist-info/top_level.txt,sha256=qA8QuiNNb6Y1OF-pCUtPEr6sLEwy2xJX06Bd_CrtrHY,4
|
344
|
+
skypilot_nightly-1.0.0.dev20250220.dist-info/RECORD,,
|
File without changes
|
{skypilot_nightly-1.0.0.dev20250218.dist-info → skypilot_nightly-1.0.0.dev20250220.dist-info}/WHEEL
RENAMED
File without changes
|
File without changes
|
File without changes
|