skypilot-nightly 1.0.0.dev20250328__py3-none-any.whl → 1.0.0.dev20250330__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/cli.py +3 -0
- sky/client/cli.py +3 -0
- sky/clouds/do.py +2 -0
- sky/data/storage_utils.py +2 -8
- sky/optimizer.py +2 -45
- sky/serve/__init__.py +1 -0
- sky/serve/autoscalers.py +26 -11
- sky/serve/replica_managers.py +77 -6
- sky/serve/serve_utils.py +80 -0
- sky/serve/server/core.py +4 -73
- sky/serve/service_spec.py +35 -3
- sky/serve/spot_placer.py +278 -0
- sky/server/common.py +15 -7
- sky/server/requests/executor.py +1 -1
- sky/server/requests/queues/mp_queue.py +8 -1
- sky/server/server.py +1 -1
- sky/utils/registry.py +2 -0
- sky/utils/resources_utils.py +50 -0
- sky/utils/schemas.py +10 -0
- {skypilot_nightly-1.0.0.dev20250328.dist-info → skypilot_nightly-1.0.0.dev20250330.dist-info}/METADATA +3 -2
- {skypilot_nightly-1.0.0.dev20250328.dist-info → skypilot_nightly-1.0.0.dev20250330.dist-info}/RECORD +26 -25
- {skypilot_nightly-1.0.0.dev20250328.dist-info → skypilot_nightly-1.0.0.dev20250330.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250328.dist-info → skypilot_nightly-1.0.0.dev20250330.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250328.dist-info → skypilot_nightly-1.0.0.dev20250330.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250328.dist-info → skypilot_nightly-1.0.0.dev20250330.dist-info}/top_level.txt +0 -0
sky/__init__.py
CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
 import urllib.request
 
 # Replaced with the current commit when building the wheels.
-_SKYPILOT_COMMIT_SHA = '
+_SKYPILOT_COMMIT_SHA = 'f90ccc1757680ccbff2fb8d86fc6dfd4242bd182'
 
 
 def _get_git_commit():
@@ -35,7 +35,7 @@ def _get_git_commit():
 
 
 __commit__ = _get_git_commit()
-__version__ = '1.0.0.
+__version__ = '1.0.0.dev20250330'
 __root_dir__ = os.path.dirname(os.path.abspath(__file__))
 
 
sky/cli.py
CHANGED
@@ -3943,6 +3943,7 @@ def jobs_launch(
               required=False,
               help='Show jobs from all users.')
 @click.option('--all',
+              '-a',
               default=False,
               is_flag=True,
               required=False,
@@ -4394,6 +4395,7 @@ def serve_up(
     )
     click.secho('Service spec:', fg='cyan')
     click.echo(task.service)
+    serve_lib.validate_service_task(task)
 
     click.secho('Each replica will use the following resources (estimated):',
                 fg='cyan')
@@ -4493,6 +4495,7 @@ def serve_update(service_name: str, service_yaml: Tuple[str, ...],
     )
     click.secho('Service spec:', fg='cyan')
     click.echo(task.service)
+    serve_lib.validate_service_task(task)
 
     click.secho('New replica will use the following resources (estimated):',
                 fg='cyan')
sky/client/cli.py
CHANGED
@@ -3943,6 +3943,7 @@ def jobs_launch(
               required=False,
               help='Show jobs from all users.')
 @click.option('--all',
+              '-a',
               default=False,
               is_flag=True,
               required=False,
@@ -4394,6 +4395,7 @@ def serve_up(
     )
     click.secho('Service spec:', fg='cyan')
     click.echo(task.service)
+    serve_lib.validate_service_task(task)
 
     click.secho('Each replica will use the following resources (estimated):',
                 fg='cyan')
@@ -4493,6 +4495,7 @@ def serve_update(service_name: str, service_yaml: Tuple[str, ...],
     )
     click.secho('Service spec:', fg='cyan')
     click.echo(task.service)
+    serve_lib.validate_service_task(task)
 
     click.secho('New replica will use the following resources (estimated):',
                 fg='cyan')
sky/clouds/do.py
CHANGED
@@ -280,6 +280,8 @@ class DO(clouds.Cloud):
         return True, None
 
     def get_credential_file_mounts(self) -> Dict[str, str]:
+        if do_utils.CREDENTIALS_PATH is None:
+            return {}
         if not os.path.exists(os.path.expanduser(do_utils.CREDENTIALS_PATH)):
             return {}
         return {
sky/data/storage_utils.py
CHANGED
@@ -18,12 +18,6 @@ from sky.utils import log_utils
 
 logger = sky_logging.init_logger(__name__)
 
-_FILE_EXCLUSION_FROM_GITIGNORE_FAILURE_MSG = (
-    f'{colorama.Fore.YELLOW}Warning: Files/dirs '
-    'specified in .gitignore will be uploaded '
-    'to the cloud storage for {path!r}'
-    'due to the following error: {error_msg!r}')
-
 _USE_SKYIGNORE_HINT = (
     'To avoid using .gitignore, you can create a .skyignore file instead.')
 
@@ -172,7 +166,7 @@ def get_excluded_files_from_gitignore(src_dir_path: str) -> List[str]:
     submodules = submodules_output.stdout.split('\0')[:-1]
 
     # The empty string is the relative reference to the src_dir_path.
-    all_git_repos = ['
+    all_git_repos = [''] + [
         # We only care about submodules that are a subdirectory of src_dir_path.
         submodule for submodule in submodules if not submodule.startswith('../')
     ]
@@ -208,7 +202,7 @@ def get_excluded_files_from_gitignore(src_dir_path: str) -> List[str]:
 
     for item in output_list:
 
-        if repo == '
+        if repo == '' and item == './':
            logger.warning(f'{src_dir_path} is within a git repo, but the '
                           'entire directory is ignored by git. We will '
                           'ignore all git exclusions. '
sky/optimizer.py
CHANGED
@@ -1140,50 +1140,6 @@ class DummyCloud(clouds.Cloud):
         pass
 
 
-def _make_launchables_for_valid_region_zones(
-        launchable_resources: resources_lib.Resources
-) -> List[resources_lib.Resources]:
-    assert launchable_resources.is_launchable()
-    # In principle, all provisioning requests should be made at the granularity
-    # of a single zone. However, for on-demand instances, we batch the requests
-    # to the zones in the same region in order to leverage the region-level
-    # provisioning APIs of AWS and Azure. This way, we can reduce the number of
-    # API calls, and thus the overall failover time. Note that this optimization
-    # does not affect the user cost since the clouds charge the same prices for
-    # on-demand instances in the same region regardless of the zones. On the
-    # other hand, for spot instances, we do not batch the requests because the
-    # "AWS" spot prices may vary across zones.
-    # For GCP, we do not batch the requests because GCP reservation system is
-    # zone based. Therefore, price estimation is potentially different across
-    # zones.
-
-    # NOTE(woosuk): GCP does not support region-level provisioning APIs. Thus,
-    # while we return per-region resources here, the provisioner will still
-    # issue the request for one zone at a time.
-    # NOTE(woosuk): If we support Azure spot instances, we should batch the
-    # requests since Azure spot prices are region-level.
-    # TODO(woosuk): Batch the per-zone AWS spot instance requests if they are
-    # in the same region and have the same price.
-    # TODO(woosuk): A better design is to implement batching at a higher level
-    # (e.g., in provisioner or optimizer), not here.
-    launchables = []
-    regions = launchable_resources.get_valid_regions_for_launchable()
-    for region in regions:
-        if (launchable_resources.use_spot and region.zones is not None or
-                launchable_resources.cloud.optimize_by_zone()):
-            # Spot instances.
-            # Do not batch the per-zone requests.
-            for zone in region.zones:
-                launchables.append(
-                    launchable_resources.copy(region=region.name,
-                                              zone=zone.name))
-        else:
-            # On-demand instances.
-            # Batch the requests at the granularity of a single region.
-            launchables.append(launchable_resources.copy(region=region.name))
-    return launchables
-
-
 def _filter_out_blocked_launchable_resources(
         launchable_resources: Iterable[resources_lib.Resources],
         blocked_resources: Iterable[resources_lib.Resources]):
@@ -1313,7 +1269,8 @@ def _fill_in_launchable_resources(
             cheapest = feasible_resources.resources_list[0]
             # Generate region/zone-specified resources.
             launchable[resources].extend(
-
+                resources_utils.make_launchables_for_valid_region_zones(
+                    cheapest))
             cloud_candidates[cloud] = feasible_resources.resources_list
         else:
             all_fuzzy_candidates.update(
sky/serve/__init__.py
CHANGED
@@ -21,6 +21,7 @@ from sky.serve.serve_utils import generate_service_name
 from sky.serve.serve_utils import ServeCodeGen
 from sky.serve.serve_utils import ServiceComponent
 from sky.serve.serve_utils import UpdateMode
+from sky.serve.serve_utils import validate_service_task
 from sky.serve.service_spec import SkyServiceSpec
 
 os.makedirs(os.path.expanduser(SKYSERVE_METADATA_DIR), exist_ok=True)
sky/serve/autoscalers.py
CHANGED
@@ -1,5 +1,6 @@
 """Autoscalers: perform autoscaling by monitoring metrics."""
 import bisect
+import copy
 import dataclasses
 import enum
 import math
@@ -56,8 +57,8 @@ class AutoscalerDecision:
 def _generate_scale_up_decisions(
         num: int, target: Optional[Dict[str, Any]]) -> List[AutoscalerDecision]:
     return [
-        AutoscalerDecision(AutoscalerDecisionOperator.SCALE_UP,
-
+        AutoscalerDecision(AutoscalerDecisionOperator.SCALE_UP,
+                           copy.copy(target)) for _ in range(num)
     ]
 
 
@@ -134,6 +135,7 @@ class Autoscaler:
         self.min_replicas: int = spec.min_replicas
         self.max_replicas: int = (spec.max_replicas if spec.max_replicas
                                   is not None else spec.min_replicas)
+        self.num_overprovision: Optional[int] = spec.num_overprovision
         # Target number of replicas is initialized to min replicas
        self.target_num_replicas: int = spec.min_replicas
        self.latest_version: int = constants.INITIAL_VERSION
@@ -143,6 +145,12 @@ class Autoscaler:
         self.latest_version_ever_ready: int = self.latest_version - 1
         self.update_mode = serve_utils.DEFAULT_UPDATE_MODE
 
+    def get_final_target_num_replicas(self) -> int:
+        """Get the final target number of replicas."""
+        if self.num_overprovision is None:
+            return self.target_num_replicas
+        return self.target_num_replicas + self.num_overprovision
+
     def _calculate_target_num_replicas(self) -> int:
         """Calculate target number of replicas."""
         raise NotImplementedError
@@ -207,7 +215,7 @@ class Autoscaler:
         0, to make the service scale faster when the service is not running.
         This will happen when min_replicas = 0 and no traffic.
         """
-        if self.
+        if self.get_final_target_num_replicas() == 0:
             return constants.AUTOSCALER_NO_REPLICA_DECISION_INTERVAL_SECONDS
         else:
             return constants.AUTOSCALER_DEFAULT_DECISION_INTERVAL_SECONDS
@@ -236,13 +244,14 @@ class Autoscaler:
         # old and latest versions are allowed in rolling update, this will
         # not affect the time it takes for the service to updated to the
         # latest version.
-        if num_latest_ready_replicas >=
+        if (num_latest_ready_replicas >=
+                self.get_final_target_num_replicas()):
             # Once the number of ready new replicas is greater than or equal
             # to the target, we can scale down all old replicas.
             return [info.replica_id for info in old_nonterminal_replicas]
         # If rolling update is in progress, we scale down old replicas
         # based on the number of ready new replicas.
-        num_old_replicas_to_keep = (self.
+        num_old_replicas_to_keep = (self.get_final_target_num_replicas() -
                                     num_latest_ready_replicas)
         # Remove old replicas (especially old launching replicas) and only
         # keep the required number of replicas, as we want to let the new
@@ -422,6 +431,7 @@ class _AutoscalerWithHysteresis(Autoscaler):
             f'Old target number of replicas: {old_target_num_replicas}. '
             f'Current target number of replicas: {target_num_replicas}. '
             f'Final target number of replicas: {self.target_num_replicas}. '
+            f'Num overprovision: {self.num_overprovision}. '
             f'Upscale counter: {self.upscale_counter}/'
             f'{self.scale_up_threshold}. '
             f'Downscale counter: {self.downscale_counter}/'
@@ -505,8 +515,9 @@ class RequestRateAutoscaler(_AutoscalerWithHysteresis):
 
         # Case 1. when latest_nonterminal_replicas is less
         # than num_to_provision, we always scale up new replicas.
-
-
+        target_num_replicas = self.get_final_target_num_replicas()
+        if len(latest_nonterminal_replicas) < target_num_replicas:
+            num_replicas_to_scale_up = (target_num_replicas -
                                         len(latest_nonterminal_replicas))
             logger.info('Number of replicas to scale up: '
                         f'{num_replicas_to_scale_up}')
@@ -514,11 +525,11 @@ class RequestRateAutoscaler(_AutoscalerWithHysteresis):
                 _generate_scale_up_decisions(num_replicas_to_scale_up, None))
 
         # Case 2: when latest_nonterminal_replicas is more
-        # than
+        # than target_num_replicas, we scale down new replicas.
         replicas_to_scale_down = []
-        if len(latest_nonterminal_replicas) >
+        if len(latest_nonterminal_replicas) > target_num_replicas:
             num_replicas_to_scale_down = (len(latest_nonterminal_replicas) -
-
+                                          target_num_replicas)
             replicas_to_scale_down = (
                 _select_nonterminal_replicas_to_scale_down(
                     num_replicas_to_scale_down, latest_nonterminal_replicas))
@@ -633,7 +644,7 @@ class FallbackRequestRateAutoscaler(RequestRateAutoscaler):
         all_replica_ids_to_scale_down: List[int] = []
 
         # Decide how many spot instances to launch.
-        num_spot_to_provision = (self.
+        num_spot_to_provision = (self.get_final_target_num_replicas() -
                                  self.base_ondemand_fallback_replicas)
         if num_nonterminal_spot < num_spot_to_provision:
             # Not enough spot instances, scale up.
@@ -668,6 +679,10 @@ class FallbackRequestRateAutoscaler(RequestRateAutoscaler):
             num_ondemand_to_provision += (num_spot_to_provision -
                                           num_ready_spot)
 
+        # Make sure we don't launch on-demand fallback for
+        # overprovisioned replicas.
+        num_ondemand_to_provision = min(num_ondemand_to_provision,
+                                        self.target_num_replicas)
         if num_ondemand_to_provision > num_nonterminal_ondemand:
             num_ondemand_to_scale_up = (num_ondemand_to_provision -
                                         num_nonterminal_ondemand)
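For illustration only (a sketch, not code from this release): the overprovision change above reduces to simple arithmetic. The snippet below uses made-up values and only the names that appear in the hunks (target_num_replicas, num_overprovision, base_ondemand_fallback_replicas); the assumption that the dynamic fallback count starts from base_ondemand_fallback_replicas is ours, since the hunk only shows the `+=` and the new `min` cap.

# Illustrative values; names follow the hunks above.
target_num_replicas = 3              # replicas intended to serve traffic
num_overprovision = 1                # new: extra headroom replicas
base_ondemand_fallback_replicas = 1

# Autoscaler.get_final_target_num_replicas(): headroom is added on top.
final_target = target_num_replicas + num_overprovision                   # 4

# FallbackRequestRateAutoscaler: spot replicas to provision.
num_spot_to_provision = final_target - base_ondemand_fallback_replicas   # 3

# With no spot replicas ready yet, dynamic fallback would ask for
# base + (spot_to_provision - ready_spot) = 1 + 3 = 4 on-demand replicas,
# but the new cap keeps fallback from covering the overprovisioned headroom.
num_ready_spot = 0
num_ondemand_to_provision = base_ondemand_fallback_replicas + (
    num_spot_to_provision - num_ready_spot)                               # 4
num_ondemand_to_provision = min(num_ondemand_to_provision,
                                target_num_replicas)                      # 3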
sky/serve/replica_managers.py
CHANGED
@@ -27,6 +27,7 @@ from sky.serve import constants as serve_constants
 from sky.serve import serve_state
 from sky.serve import serve_utils
 from sky.serve import service
+from sky.serve import spot_placer
 from sky.skylet import constants
 from sky.skylet import job_lib
 from sky.usage import usage_lib
@@ -60,6 +61,7 @@ def launch_cluster(replica_id: int,
                    task_yaml_path: str,
                    cluster_name: str,
                    resources_override: Optional[Dict[str, Any]] = None,
+                   retry_until_up: bool = True,
                    max_retry: int = 3) -> None:
     """Launch a sky serve replica cluster.
 
@@ -71,6 +73,10 @@ def launch_cluster(replica_id: int,
         or some error happened before provisioning and will happen again
         if retry.
     """
+    if resources_override is not None:
+        logger.info(f'Scaling up replica (id: {replica_id}) cluster '
+                    f'{cluster_name} with resources override: '
+                    f'{resources_override}')
    try:
        config = common_utils.read_yaml(os.path.expanduser(task_yaml_path))
        task = sky.Task.from_yaml_config(config)
@@ -98,7 +104,7 @@ def launch_cluster(replica_id: int,
            usage_lib.messages.usage.set_internal()
            execution.launch(task,
                             cluster_name,
-                             retry_until_up=
+                             retry_until_up=retry_until_up,
                             _is_launched_by_sky_serve_controller=True)
            logger.info(f'Replica cluster {cluster_name} launched.')
        except (exceptions.InvalidClusterNameError,
@@ -246,6 +252,10 @@ class ReplicaStatusProperty:
     preempted: bool = False
     # Whether the replica is purged.
     purged: bool = False
+    # Whether the replica failed to launch due to spot availability.
+    # This is only possible when spot placer is enabled, so the retry until up
+    # is set to True and it can fail immediately due to spot availability.
+    failed_spot_availability: bool = False
 
     def remove_terminated_replica(self) -> bool:
         """Whether to remove the replica record from the replica table.
@@ -385,10 +395,11 @@ class ReplicaStatusProperty:
 class ReplicaInfo:
     """Replica info for each replica."""
 
-    _VERSION =
+    _VERSION = 1
 
     def __init__(self, replica_id: int, cluster_name: str, replica_port: str,
-                 is_spot: bool,
+                 is_spot: bool, location: Optional[spot_placer.Location],
+                 version: int) -> None:
         self._version = self._VERSION
         self.replica_id: int = replica_id
         self.cluster_name: str = cluster_name
@@ -398,6 +409,11 @@ class ReplicaInfo:
         self.consecutive_failure_times: List[float] = []
         self.status_property: ReplicaStatusProperty = ReplicaStatusProperty()
         self.is_spot: bool = is_spot
+        self.location: Optional[Dict[str, Optional[str]]] = (
+            location.to_pickleable() if location is not None else None)
+
+    def get_spot_location(self) -> Optional[spot_placer.Location]:
+        return spot_placer.Location.from_pickleable(self.location)
 
     def handle(
             self,
@@ -483,6 +499,7 @@ class ReplicaInfo:
                 f'version={self.version}, '
                 f'replica_port={self.replica_port}, '
                 f'is_spot={self.is_spot}, '
+                f'location={self.location}, '
                 f'status={self.status}, '
                 f'launched_at={info_dict["launched_at"]}{handle_str})')
         return info
@@ -557,6 +574,9 @@ class ReplicaInfo:
             # Treated similar to on-demand instances.
             self.is_spot = False
 
+        if version < 1:
+            self.location = None
+
         self.__dict__.update(state)
 
 
@@ -620,6 +640,9 @@ class SkyPilotReplicaManager(ReplicaManager):
                  task_yaml_path: str) -> None:
         super().__init__(service_name, spec)
         self._task_yaml_path = task_yaml_path
+        task = sky.Task.from_yaml(task_yaml_path)
+        self._spot_placer: Optional[spot_placer.SpotPlacer] = (
+            spot_placer.SpotPlacer.from_task(spec, task))
         # TODO(tian): Store launch/down pid in the replica table, to make the
         # manager more persistent. Current blocker is that we need to manually
         # poll the Process (by join or is_launch), otherwise, it will never
@@ -639,6 +662,9 @@ class SkyPilotReplicaManager(ReplicaManager):
     # Replica management functions #
     ################################
 
+    # Adding lock here to make sure spot placer's current locations are
+    # consistent with the replicas' status.
+    @with_lock
     def _launch_replica(
             self,
             replica_id: int,
@@ -653,19 +679,41 @@ class SkyPilotReplicaManager(ReplicaManager):
             self._service_name, replica_id)
         log_file_name = serve_utils.generate_replica_launch_log_file_name(
             self._service_name, replica_id)
+        use_spot = _should_use_spot(self._task_yaml_path, resources_override)
+        retry_until_up = True
+        location = None
+        if use_spot and self._spot_placer is not None:
+            # For spot placer, we don't retry until up so any launch failed
+            # due to availability issue will be handled by the placer.
+            retry_until_up = False
+            # TODO(tian): Currently, the resources_override can only be
+            # `use_spot=True/False`, which will not cause any conflict with
+            # spot placer's cloud, region & zone. When we add more resources
+            # to the resources_override, we need to make sure they won't
+            # conflict with the spot placer's selection.
+            if resources_override is None:
+                resources_override = {}
+            current_spot_locations: List[spot_placer.Location] = []
+            for info in serve_state.get_replica_infos(self._service_name):
+                if info.is_spot:
+                    spot_location = info.get_spot_location()
+                    if spot_location is not None:
+                        current_spot_locations.append(spot_location)
+            location = self._spot_placer.select_next_location(
+                current_spot_locations)
+            resources_override.update(location.to_dict())
         p = multiprocessing.Process(
             target=ux_utils.RedirectOutputForProcess(
                 launch_cluster,
                 log_file_name,
             ).run,
             args=(replica_id, self._task_yaml_path, cluster_name,
-                  resources_override),
+                  resources_override, retry_until_up),
         )
         replica_port = _get_resources_ports(self._task_yaml_path)
-        use_spot = _should_use_spot(self._task_yaml_path, resources_override)
 
         info = ReplicaInfo(replica_id, cluster_name, replica_port, use_spot,
-                           self.latest_version)
+                           location, self.latest_version)
         serve_state.add_or_update_replica(self._service_name, replica_id, info)
         # Don't start right now; we will start it later in _refresh_process_pool
         # to avoid too many sky.launch running at the same time.
@@ -814,6 +862,10 @@ class SkyPilotReplicaManager(ReplicaManager):
            logger.info(
                f'Replica {info.replica_id} is preempted{cluster_status_str}.')
            info.status_property.preempted = True
+            if self._spot_placer is not None:
+                spot_location = info.get_spot_location()
+                assert spot_location is not None
+                self._spot_placer.set_preemptive(spot_location)
            serve_state.add_or_update_replica(self._service_name, info.replica_id,
                                              info)
            self._terminate_replica(info.replica_id,
@@ -868,6 +920,23 @@ class SkyPilotReplicaManager(ReplicaManager):
                else:
                    info.status_property.sky_launch_status = (
                        ProcessStatus.SUCCEEDED)
+                if self._spot_placer is not None and info.is_spot:
+                    # TODO(tian): Currently, we set the location to
+                    # preemptive if the launch process failed. This is
+                    # because if the error is not related to the
+                    # availability of the location, then all locations
+                    # should failed for same reason. So it does not matter
+                    # which location is preemptive or not, instead, all
+                    # locations would fail. We should implement a log parser
+                    # to detect if the error is actually related to the
+                    # availability of the location later.
+                    location = info.get_spot_location()
+                    assert location is not None
+                    if p.exitcode != 0:
+                        self._spot_placer.set_preemptive(location)
+                        info.status_property.failed_spot_availability = True
+                    else:
+                        self._spot_placer.set_active(location)
                serve_state.add_or_update_replica(self._service_name,
                                                  replica_id, info)
                if error_in_sky_launch:
@@ -918,6 +987,8 @@ class SkyPilotReplicaManager(ReplicaManager):
            removal_reason = 'for version outdated'
        elif info.status_property.purged:
            removal_reason = 'for purge'
+        elif info.status_property.failed_spot_availability:
+            removal_reason = 'for spot availability failure'
        else:
            logger.info(f'Termination of replica {replica_id} '
                        'finished. Replica info is kept since some '
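A side note on the `_VERSION = 1` bump and the `if version < 1: self.location = None` backfill above: this is the usual versioned-unpickling pattern, so replica records written by an older controller still load after an upgrade. Below is a generic sketch of that pattern with a hypothetical `Record` class; it is not SkyPilot code.

class Record:
    """Generic version-and-backfill pattern; hypothetical, not SkyPilot code."""
    _VERSION = 1  # bump whenever a new field is added

    def __init__(self) -> None:
        self._version = self._VERSION
        self.location = None  # field introduced at version 1

    def __setstate__(self, state: dict) -> None:
        version = state.get('_version', -1)
        if version < 1:
            # Records pickled before version 1 have no `location`; default it.
            self.location = None
        self.__dict__.update(state)


# Simulate unpickling a record written before the field existed.
old_state = {'_version': 0, 'replica_id': 7}
record = Record.__new__(Record)
record.__setstate__(old_state)
assert record.location is None and record.replica_id == 7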
sky/serve/serve_utils.py
CHANGED
@@ -25,6 +25,7 @@ from sky import global_user_state
 from sky.adaptors import common as adaptors_common
 from sky.serve import constants
 from sky.serve import serve_state
+from sky.serve import spot_placer
 from sky.skylet import constants as skylet_constants
 from sky.skylet import job_lib
 from sky.utils import annotations
@@ -40,6 +41,7 @@ if typing.TYPE_CHECKING:
     import psutil
     import requests
 
+    import sky
     from sky.serve import replica_managers
 else:
     psutil = adaptors_common.LazyImport('psutil')
@@ -210,6 +212,84 @@ class RequestTimestamp(RequestsAggregator):
         return f'RequestTimestamp(timestamps={self.timestamps})'
 
 
+def validate_service_task(task: 'sky.Task') -> None:
+    """Validate the task for Sky Serve.
+
+    Args:
+        task: sky.Task to validate
+
+    Raises:
+        ValueError: if the arguments are invalid.
+        RuntimeError: if the task.serve is not found.
+    """
+    spot_resources: List['sky.Resources'] = [
+        resource for resource in task.resources if resource.use_spot
+    ]
+    # TODO(MaoZiming): Allow mixed on-demand and spot specification in resources
+    # On-demand fallback should go to the resources specified as on-demand.
+    if len(spot_resources) not in [0, len(task.resources)]:
+        with ux_utils.print_exception_no_traceback():
+            raise ValueError(
+                'Resources must either all use spot or none use spot. '
+                'To use on-demand and spot instances together, '
+                'use `dynamic_ondemand_fallback` or set '
+                'base_ondemand_fallback_replicas.')
+
+    if task.service is None:
+        with ux_utils.print_exception_no_traceback():
+            raise RuntimeError('Service section not found.')
+
+    policy_description = ('on-demand'
+                          if task.service.dynamic_ondemand_fallback else 'spot')
+    for resource in list(task.resources):
+        if resource.job_recovery is not None:
+            with ux_utils.print_exception_no_traceback():
+                raise ValueError('job_recovery is disabled for SkyServe. '
+                                 'SkyServe will replenish preempted spot '
+                                 f'with {policy_description} instances.')
+
+    # Try to create a spot placer from the task yaml. Check if the task yaml
+    # is valid for spot placer.
+    spot_placer.SpotPlacer.from_task(task.service, task)
+
+    replica_ingress_port: Optional[int] = int(
+        task.service.ports) if (task.service.ports is not None) else None
+    for requested_resources in task.resources:
+        if (task.service.use_ondemand_fallback and
+                not requested_resources.use_spot):
+            with ux_utils.print_exception_no_traceback():
+                raise ValueError(
+                    '`use_ondemand_fallback` is only supported '
+                    'for spot resources. Please explicitly specify '
+                    '`use_spot: true` in resources for on-demand fallback.')
+        if (task.service.spot_placer is not None and
+                not requested_resources.use_spot):
+            with ux_utils.print_exception_no_traceback():
+                raise ValueError(
+                    '`spot_placer` is only supported for spot resources. '
+                    'Please explicitly specify `use_spot: true` in resources.')
+        if task.service.ports is None:
+            requested_ports = list(
+                resources_utils.port_ranges_to_set(requested_resources.ports))
+            if len(requested_ports) != 1:
+                with ux_utils.print_exception_no_traceback():
+                    raise ValueError(
+                        'To open multiple ports on the replica, please set the '
+                        '`service.ports` field to specify a main service port. '
+                        'Must only specify one port in resources otherwise. '
+                        'Each replica will use the port specified as '
+                        'application ingress port.')
+            service_port = requested_ports[0]
+            if replica_ingress_port is None:
+                replica_ingress_port = service_port
+            elif service_port != replica_ingress_port:
+                with ux_utils.print_exception_no_traceback():
+                    raise ValueError(
+                        f'Got multiple ports: {service_port} and '
+                        f'{replica_ingress_port} in different resources. '
+                        'Please specify the same port instead.')
+
+
 def generate_service_name():
     return f'sky-service-{uuid.uuid4().hex[:4]}'
 
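For context (an illustrative sketch, not part of the diff): `validate_service_task` is re-exported from `sky.serve` and, per the `sky/cli.py` hunks above, runs right after the service spec is printed in `sky serve up` and `sky serve update`. The snippet below shows one kind of misconfiguration it rejects, assuming the public `sky.Task` / `sky.Resources` Python API; the error text comes from the function body above.

import sky
from sky import serve as serve_lib

# A service task whose resources mix spot and on-demand instances.
task = sky.Task(run='python -m http.server 8080')
task.set_resources({
    sky.Resources(cpus='2+', use_spot=True),
    sky.Resources(cpus='2+', use_spot=False),
})

# Raises ValueError: 'Resources must either all use spot or none use spot. ...'
# (this check runs before the service section is inspected).
serve_lib.validate_service_task(task)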