skypilot-nightly 1.0.0.dev20250329__py3-none-any.whl → 1.0.0.dev20250331__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/cli.py +3 -0
- sky/client/cli.py +3 -0
- sky/data/storage_utils.py +2 -8
- sky/optimizer.py +2 -45
- sky/serve/__init__.py +1 -0
- sky/serve/autoscalers.py +26 -11
- sky/serve/replica_managers.py +77 -6
- sky/serve/serve_utils.py +80 -0
- sky/serve/server/core.py +4 -73
- sky/serve/service_spec.py +35 -3
- sky/serve/spot_placer.py +278 -0
- sky/utils/registry.py +2 -0
- sky/utils/resources_utils.py +50 -0
- sky/utils/schemas.py +10 -0
- {skypilot_nightly-1.0.0.dev20250329.dist-info → skypilot_nightly-1.0.0.dev20250331.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20250329.dist-info → skypilot_nightly-1.0.0.dev20250331.dist-info}/RECORD +21 -20
- {skypilot_nightly-1.0.0.dev20250329.dist-info → skypilot_nightly-1.0.0.dev20250331.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250329.dist-info → skypilot_nightly-1.0.0.dev20250331.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250329.dist-info → skypilot_nightly-1.0.0.dev20250331.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250329.dist-info → skypilot_nightly-1.0.0.dev20250331.dist-info}/top_level.txt +0 -0
sky/serve/server/core.py
CHANGED
@@ -25,7 +25,6 @@ from sky.utils import command_runner
 from sky.utils import common
 from sky.utils import common_utils
 from sky.utils import controller_utils
-from sky.utils import resources_utils
 from sky.utils import rich_utils
 from sky.utils import subprocess_utils
 from sky.utils import ux_utils
@@ -33,74 +32,6 @@ from sky.utils import ux_utils
 logger = sky_logging.init_logger(__name__)
 
 
-def _validate_service_task(task: 'sky.Task') -> None:
-    """Validate the task for Sky Serve.
-
-    Args:
-        task: sky.Task to validate
-
-    Raises:
-        ValueError: if the arguments are invalid.
-        RuntimeError: if the task.serve is not found.
-    """
-    spot_resources: List['sky.Resources'] = [
-        resource for resource in task.resources if resource.use_spot
-    ]
-    # TODO(MaoZiming): Allow mixed on-demand and spot specification in resources
-    # On-demand fallback should go to the resources specified as on-demand.
-    if len(spot_resources) not in [0, len(task.resources)]:
-        with ux_utils.print_exception_no_traceback():
-            raise ValueError(
-                'Resources must either all use spot or none use spot. '
-                'To use on-demand and spot instances together, '
-                'use `dynamic_ondemand_fallback` or set '
-                'base_ondemand_fallback_replicas.')
-
-    if task.service is None:
-        with ux_utils.print_exception_no_traceback():
-            raise RuntimeError('Service section not found.')
-
-    policy_description = ('on-demand'
-                          if task.service.dynamic_ondemand_fallback else 'spot')
-    for resource in list(task.resources):
-        if resource.job_recovery is not None:
-            with ux_utils.print_exception_no_traceback():
-                raise ValueError('job_recovery is disabled for SkyServe. '
-                                 'SkyServe will replenish preempted spot '
-                                 f'with {policy_description} instances.')
-
-    replica_ingress_port: Optional[int] = int(
-        task.service.ports) if (task.service.ports is not None) else None
-    for requested_resources in task.resources:
-        if (task.service.use_ondemand_fallback and
-                not requested_resources.use_spot):
-            with ux_utils.print_exception_no_traceback():
-                raise ValueError(
-                    '`use_ondemand_fallback` is only supported '
-                    'for spot resources. Please explicitly specify '
-                    '`use_spot: true` in resources for on-demand fallback.')
-        if task.service.ports is None:
-            requested_ports = list(
-                resources_utils.port_ranges_to_set(requested_resources.ports))
-            if len(requested_ports) != 1:
-                with ux_utils.print_exception_no_traceback():
-                    raise ValueError(
-                        'To open multiple ports on the replica, please set the '
-                        '`service.ports` field to specify a main service port. '
-                        'Must only specify one port in resources otherwise. '
-                        'Each replica will use the port specified as '
-                        'application ingress port.')
-            service_port = requested_ports[0]
-            if replica_ingress_port is None:
-                replica_ingress_port = service_port
-            elif service_port != replica_ingress_port:
-                with ux_utils.print_exception_no_traceback():
-                    raise ValueError(
-                        f'Got multiple ports: {service_port} and '
-                        f'{replica_ingress_port} in different resources. '
-                        'Please specify the same port instead.')
-
-
 def _rewrite_tls_credential_paths_and_get_tls_env_vars(
         service_name: str, task: 'sky.Task') -> Dict[str, Any]:
     """Rewrite the paths of TLS credentials in the task.
@@ -113,7 +44,7 @@ def _rewrite_tls_credential_paths_and_get_tls_env_vars(
     The generated template variables for TLS.
     """
     service_spec = task.service
-    # Already checked by _validate_service_task
+    # Already checked by validate_service_task
     assert service_spec is not None
     if service_spec.tls_credential is None:
         return {'use_tls': False}
@@ -166,7 +97,7 @@ def up(
                              'only contains lower letters, numbers and dash): '
                              f'{constants.CLUSTER_NAME_VALID_REGEX}')
 
-    _validate_service_task(task)
+    serve_utils.validate_service_task(task)
     # Always apply the policy again here, even though it might have been applied
     # in the CLI. This is to ensure that we apply the policy to the final DAG
     # and get the mutated config.
@@ -322,7 +253,7 @@ def up(
             skip_status_check=True).get(lb_port)
         assert socket_endpoint is not None, (
             'Did not get endpoint for controller.')
-        # Already checked by _validate_service_task
+        # Already checked by validate_service_task
         assert task.service is not None
         protocol = ('http'
                     if task.service.tls_credential is None else 'https')
@@ -376,7 +307,7 @@ def update(
         mode: Update mode.
     """
     task.validate()
-    _validate_service_task(task)
+    serve_utils.validate_service_task(task)
 
     # Always apply the policy again here, even though it might have been applied
     # in the CLI. This is to ensure that we apply the policy to the final DAG
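
Note: the net effect of this change is that service-spec validation moves out of the server core into a shared helper, so `up()` and `update()` run the same checks. A minimal usage sketch, assuming `serve_utils.validate_service_task` keeps the removed `_validate_service_task` contract (raising ValueError/RuntimeError on an invalid spec); the YAML path is hypothetical:

    import sky
    from sky.serve import serve_utils

    task = sky.Task.from_yaml('service.yaml')  # hypothetical path
    task.validate()
    # Shared validation path now used by both sky.serve up() and update().
    serve_utils.validate_service_task(task)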
sky/serve/service_spec.py
CHANGED
@@ -10,6 +10,7 @@ from sky.adaptors import common as adaptors_common
 from sky.serve import constants
 from sky.serve import load_balancing_policies as lb_policies
 from sky.serve import serve_utils
+from sky.serve import spot_placer as spot_placer_lib
 from sky.utils import common_utils
 from sky.utils import schemas
 from sky.utils import ux_utils
@@ -30,6 +31,7 @@ class SkyServiceSpec:
         readiness_timeout_seconds: int,
         min_replicas: int,
         max_replicas: Optional[int] = None,
+        num_overprovision: Optional[int] = None,
         ports: Optional[str] = None,
         target_qps_per_replica: Optional[float] = None,
         post_data: Optional[Dict[str, Any]] = None,
@@ -37,6 +39,7 @@ class SkyServiceSpec:
         readiness_headers: Optional[Dict[str, str]] = None,
         dynamic_ondemand_fallback: Optional[bool] = None,
         base_ondemand_fallback_replicas: Optional[int] = None,
+        spot_placer: Optional[str] = None,
         upscale_delay_seconds: Optional[int] = None,
         downscale_delay_seconds: Optional[int] = None,
         load_balancing_policy: Optional[str] = None,
@@ -78,6 +81,7 @@ class SkyServiceSpec:
         self._readiness_timeout_seconds: int = readiness_timeout_seconds
         self._min_replicas: int = min_replicas
         self._max_replicas: Optional[int] = max_replicas
+        self._num_overprovision: Optional[int] = num_overprovision
         self._ports: Optional[str] = ports
         self._target_qps_per_replica: Optional[float] = target_qps_per_replica
         self._post_data: Optional[Dict[str, Any]] = post_data
@@ -88,6 +92,7 @@ class SkyServiceSpec:
             bool] = dynamic_ondemand_fallback
         self._base_ondemand_fallback_replicas: Optional[
             int] = base_ondemand_fallback_replicas
+        self._spot_placer: Optional[str] = spot_placer
         self._upscale_delay_seconds: Optional[int] = upscale_delay_seconds
         self._downscale_delay_seconds: Optional[int] = downscale_delay_seconds
         self._load_balancing_policy: Optional[str] = load_balancing_policy
@@ -161,6 +166,7 @@ class SkyServiceSpec:
             min_replicas = constants.DEFAULT_MIN_REPLICAS
             service_config['min_replicas'] = min_replicas
             service_config['max_replicas'] = None
+            service_config['num_overprovision'] = None
             service_config['target_qps_per_replica'] = None
             service_config['upscale_delay_seconds'] = None
             service_config['downscale_delay_seconds'] = None
@@ -168,6 +174,8 @@ class SkyServiceSpec:
             service_config['min_replicas'] = policy_section['min_replicas']
             service_config['max_replicas'] = policy_section.get(
                 'max_replicas', None)
+            service_config['num_overprovision'] = policy_section.get(
+                'num_overprovision', None)
             service_config['target_qps_per_replica'] = policy_section.get(
                 'target_qps_per_replica', None)
             service_config['upscale_delay_seconds'] = policy_section.get(
@@ -179,6 +187,8 @@ class SkyServiceSpec:
                 'base_ondemand_fallback_replicas', None)
             service_config['dynamic_ondemand_fallback'] = policy_section.get(
                 'dynamic_ondemand_fallback', None)
+            service_config['spot_placer'] = policy_section.get(
+                'spot_placer', None)
 
         service_config['load_balancing_policy'] = config.get(
             'load_balancing_policy', None)
@@ -238,12 +248,15 @@ class SkyServiceSpec:
         add_if_not_none('readiness_probe', 'headers', self._readiness_headers)
         add_if_not_none('replica_policy', 'min_replicas', self.min_replicas)
         add_if_not_none('replica_policy', 'max_replicas', self.max_replicas)
+        add_if_not_none('replica_policy', 'num_overprovision',
+                        self.num_overprovision)
         add_if_not_none('replica_policy', 'target_qps_per_replica',
                         self.target_qps_per_replica)
         add_if_not_none('replica_policy', 'dynamic_ondemand_fallback',
                         self.dynamic_ondemand_fallback)
         add_if_not_none('replica_policy', 'base_ondemand_fallback_replicas',
                         self.base_ondemand_fallback_replicas)
+        add_if_not_none('replica_policy', 'spot_placer', self.spot_placer)
         add_if_not_none('replica_policy', 'upscale_delay_seconds',
                         self.upscale_delay_seconds)
         add_if_not_none('replica_policy', 'downscale_delay_seconds',
@@ -269,6 +282,9 @@ class SkyServiceSpec:
         policy_strs: List[str] = []
         if (self.dynamic_ondemand_fallback is not None and
                 self.dynamic_ondemand_fallback):
+            if self.spot_placer is not None:
+                if self.spot_placer == spot_placer_lib.SPOT_HEDGE_PLACER:
+                    return 'SpotHedge'
             policy_strs.append('Dynamic on-demand fallback')
         if self.base_ondemand_fallback_replicas is not None:
             policy_strs.append(
@@ -281,8 +297,12 @@ class SkyServiceSpec:
             policy_strs.append('Static spot mixture with '
                                f'{self.base_ondemand_fallback_replicas} '
                                f'base on-demand replica{plural}')
+        if self.spot_placer is not None:
+            if not policy_strs:
+                policy_strs.append('Spot placement')
+            policy_strs.append(f'with {self.spot_placer} placer')
         if not policy_strs:
-            return 'No spot fallback policy'
+            return 'No spot policy'
         return ' '.join(policy_strs)
 
     def autoscaling_policy_str(self):
@@ -294,9 +314,13 @@ class SkyServiceSpec:
         assert self.target_qps_per_replica is not None
         # TODO(tian): Refactor to contain more information
         max_plural = '' if self.max_replicas == 1 else 's'
+        overprovision_str = ''
+        if self.num_overprovision is not None:
+            overprovision_str = (
+                f' with {self.num_overprovision} overprovisioned replicas')
         return (f'Autoscaling from {self.min_replicas} to {self.max_replicas} '
-                f'replica{max_plural} (target QPS per replica: '
-                f'{self.target_qps_per_replica})')
+                f'replica{max_plural}{overprovision_str} (target QPS per '
+                f'replica: {self.target_qps_per_replica})')
 
     def set_ports(self, ports: str) -> None:
         self._ports = ports
@@ -339,6 +363,10 @@ class SkyServiceSpec:
         # If None, treated as having the same value of min_replicas.
         return self._max_replicas
 
+    @property
+    def num_overprovision(self) -> Optional[int]:
+        return self._num_overprovision
+
     @property
     def ports(self) -> Optional[str]:
         return self._ports
@@ -372,6 +400,10 @@ class SkyServiceSpec:
     def dynamic_ondemand_fallback(self) -> Optional[bool]:
         return self._dynamic_ondemand_fallback
 
+    @property
+    def spot_placer(self) -> Optional[str]:
+        return self._spot_placer
+
     @property
     def upscale_delay_seconds(self) -> Optional[int]:
         return self._upscale_delay_seconds
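
Note: both new fields are read from the `replica_policy` section of a service config. A hedged sketch of the dict shape the config-parsing logic above consumes; the field names come from this diff, while the surrounding values are illustrative only:

    config = {
        'replica_policy': {
            'min_replicas': 2,
            'max_replicas': 4,
            # New: extra replicas launched beyond the autoscaler's target.
            'num_overprovision': 1,
            # New: matches spot_placer_lib.SPOT_HEDGE_PLACER ('dynamic_fallback').
            'spot_placer': 'dynamic_fallback',
            'target_qps_per_replica': 10,
        },
    }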
sky/serve/spot_placer.py
ADDED
@@ -0,0 +1,278 @@
+"""Spot Placer for SpotHedge."""
+
+import collections
+import dataclasses
+import enum
+import typing
+from typing import Any, Dict, List, Optional, Set
+
+from sky import check as sky_check
+from sky import clouds as sky_clouds
+from sky import sky_logging
+from sky.clouds import cloud as sky_cloud
+from sky.utils import registry
+from sky.utils import resources_utils
+from sky.utils import ux_utils
+
+if typing.TYPE_CHECKING:
+    from sky import resources as resources_lib
+    from sky import task as task_lib
+    from sky.serve import service_spec
+
+logger = sky_logging.init_logger(__name__)
+
+SPOT_PLACERS = {}
+DEFAULT_SPOT_PLACER = None
+SPOT_HEDGE_PLACER = 'dynamic_fallback'
+
+
+@dataclasses.dataclass
+class Location:
+    """Location class of a spot instance."""
+    cloud: 'sky_clouds.Cloud'
+    region: str
+    zone: Optional[str]
+
+    def __eq__(self, other) -> bool:
+        if isinstance(other, Location):
+            return (self.cloud.is_same_cloud(other.cloud) and
+                    self.region == other.region and self.zone == other.zone)
+        return False
+
+    def __hash__(self) -> int:
+        return hash(
+            str(self.cloud) + self.region +
+            (self.zone if self.zone is not None else ''))
+
+    @classmethod
+    def from_resources(cls, resources: 'resources_lib.Resources') -> 'Location':
+        return cls(resources.cloud, resources.region, resources.zone)
+
+    def to_dict(self) -> Dict[str, Any]:
+        return {'cloud': self.cloud, 'region': self.region, 'zone': self.zone}
+
+    @classmethod
+    def from_pickleable(
+        cls,
+        data: Optional[Dict[str, Optional[str]]],
+    ) -> Optional['Location']:
+        if data is None:
+            return None
+        cloud = registry.CLOUD_REGISTRY.from_str(data['cloud'])
+        assert cloud is not None
+        assert data['region'] is not None
+        return cls(
+            cloud=cloud,
+            region=data['region'],
+            zone=data['zone'],
+        )
+
+    def to_pickleable(self) -> Dict[str, Optional[str]]:
+        return {
+            'cloud': str(self.cloud),
+            'region': self.region,
+            'zone': self.zone,
+        }
+
+
+class LocationStatus(enum.Enum):
+    """Location Spot Status."""
+    ACTIVE = 'ACTIVE'
+    PREEMPTED = 'PREEMPTED'
+
+
+def _get_possible_location_from_task(task: 'task_lib.Task') -> List[Location]:
+
+    def _without_location(
+            resources: 'resources_lib.Resources') -> 'resources_lib.Resources':
+        return resources.copy(cloud=None, region=None, zone=None)
+
+    assert task.resources  # Guaranteed in task constructor
+    empty_location_resources = _without_location(list(task.resources)[0])
+    empty_location_resources_config = empty_location_resources.to_yaml_config()
+
+    location_requirements: Dict[str, Dict[str, Set[str]]] = (
+        collections.defaultdict(lambda: collections.defaultdict(set)))
+
+    for r in task.resources:
+        if (_without_location(r).to_yaml_config() !=
+                empty_location_resources_config):
+            with ux_utils.print_exception_no_traceback():
+                raise ValueError(
+                    'Different resource configurations are not supported '
+                    'for spot placement. All resources must have the same '
+                    'configuration except for cloud/region/zone.')
+        if r.cloud is None:
+            continue
+        cloud_str = str(r.cloud)
+        if r.region is None:
+            # Access defaultdict to create empty entry if it doesn't exist.
+            _ = location_requirements[cloud_str]
+            continue
+        if r.zone is None:
+            # Same as above.
+            _ = location_requirements[cloud_str][r.region]
+            continue
+        location_requirements[cloud_str][r.region].add(r.zone)
+
+    clouds_list: List[sky_clouds.Cloud] = []
+    for c in location_requirements.keys():
+        cloud_obj = registry.CLOUD_REGISTRY.from_str(c)
+        assert cloud_obj is not None
+        clouds_list.append(cloud_obj)
+    if not clouds_list:
+        # If the cloud list is empty, that means the user has no location
+        # related requirements. Then we start with all enabled clouds and
+        # all possible regions and zones.
+        clouds_list = sky_check.get_cached_enabled_clouds_or_refresh(
+            capability=sky_cloud.CloudCapability.COMPUTE,
+            raise_if_no_cloud_access=False)
+        for cloud in clouds_list:
+            # Create empty entry for each cloud.
+            _ = location_requirements[str(cloud)]
+
+    possible_locations = set()
+    for cloud in clouds_list:
+        feasible_resources: resources_utils.FeasibleResources = (
+            cloud.get_feasible_launchable_resources(empty_location_resources,
+                                                    num_nodes=task.num_nodes))
+        for feasible in feasible_resources.resources_list:
+            # We set override_optimize_by_zone=True to force the provisioner
+            # to use zone-level provisioning. This is to get accurate location
+            # information.
+            launchables: List['resources_lib.Resources'] = (
+                resources_utils.make_launchables_for_valid_region_zones(
+                    feasible, override_optimize_by_zone=True))
+            for launchable in launchables:
+                cloud_str = str(launchable.cloud)
+                region = launchable.region
+                zone = launchable.zone
+                if (cloud_str not in location_requirements and
+                        location_requirements):
+                    continue
+                # We need to use .get() here to avoid creating extra entries in
+                # location_requirements, and being treated as user's requirement
+                # in the following regions.
+                cloud_reqs = location_requirements.get(cloud_str, {})
+                if region not in cloud_reqs and cloud_reqs:
+                    continue
+                region_reqs = cloud_reqs.get(region, set())
+                if zone not in region_reqs and region_reqs:
+                    continue
+                possible_locations.add(Location.from_resources(launchable))
+
+    return list(possible_locations)
+
+
+class SpotPlacer:
+    """Spot Placement specification."""
+
+    def __init__(self, task: 'task_lib.Task') -> None:
+        possible_locations = _get_possible_location_from_task(task)
+        logger.info(f'{len(possible_locations)} possible location candidates '
+                    'are enabled for spot placement.')
+        logger.debug(f'All possible locations: {possible_locations}')
+        self.location2status: Dict[Location, LocationStatus] = {
+            location: LocationStatus.ACTIVE for location in possible_locations
+        }
+        self.location2cost: Dict[Location, float] = {}
+        # Already checked there is only one resource in the task.
+        self.resources = list(task.resources)[0]
+        self.num_nodes = task.num_nodes
+
+    def __init_subclass__(cls, name: str, default: bool = False):
+        SPOT_PLACERS[name] = cls
+        if default:
+            global DEFAULT_SPOT_PLACER
+            assert DEFAULT_SPOT_PLACER is None, (
+                'Only one policy can be default.')
+            DEFAULT_SPOT_PLACER = name
+
+    def select_next_location(self,
+                             current_locations: List[Location]) -> Location:
+        """Select next location to place spot instance."""
+        raise NotImplementedError
+
+    def set_active(self, location: Location) -> None:
+        assert location in self.location2status, location
+        self.location2status[location] = LocationStatus.ACTIVE
+
+    def set_preemptive(self, location: Location) -> None:
+        assert location in self.location2status, location
+        self.location2status[location] = LocationStatus.PREEMPTED
+
+    def clear_preemptive_locations(self) -> None:
+        for location in self.location2status:
+            self.location2status[location] = LocationStatus.ACTIVE
+
+    def _min_cost_location(self, locations: List[Location]) -> Location:
+
+        def _get_cost_per_hour(location: Location) -> float:
+            if location in self.location2cost:
+                return self.location2cost[location]
+            # TODO(tian): Is there a better way to do this? This is for filling
+            # instance type so the get_cost() can operate normally.
+            r: 'resources_lib.Resources' = self.resources.copy(
+                **location.to_dict())
+            assert r.cloud is not None
+            rs = r.cloud.get_feasible_launchable_resources(
+                r, num_nodes=self.num_nodes).resources_list
+            # For some clouds, there might have multiple instance types
+            # satisfying the resource request. In such case we choose the
+            # cheapest one, as the optimizer does. Reference:
+            # sky/optimizer.py::Optimizer::_print_candidates
+            cost = min(r.get_cost(seconds=3600) for r in rs)
+            self.location2cost[location] = cost
+            return cost
+
+        return min(locations, key=_get_cost_per_hour)
+
+    def _location_with_status(self, status: LocationStatus) -> List[Location]:
+        return [
+            location
+            for location, location_type in self.location2status.items()
+            if location_type == status
+        ]
+
+    def active_locations(self) -> List[Location]:
+        return self._location_with_status(LocationStatus.ACTIVE)
+
+    def preemptive_locations(self) -> List[Location]:
+        return self._location_with_status(LocationStatus.PREEMPTED)
+
+    @classmethod
+    def from_task(cls, spec: 'service_spec.SkyServiceSpec',
+                  task: 'task_lib.Task') -> Optional['SpotPlacer']:
+        if spec.spot_placer is None:
+            return None
+        return SPOT_PLACERS[spec.spot_placer](task)
+
+
+class DynamicFallbackSpotPlacer(SpotPlacer,
+                                name=SPOT_HEDGE_PLACER,
+                                default=True):
+    """Dynamic Fallback Placer."""
+
+    def select_next_location(self,
+                             current_locations: List[Location]) -> Location:
+        active_locations = self.active_locations()
+        # Prioritize locations that are not currently used.
+        candidate_locations: List[Location] = [
+            location for location in active_locations
+            if location not in current_locations
+        ]
+        # If no candidate locations, use all active locations.
+        if not candidate_locations:
+            candidate_locations = active_locations
+        res = self._min_cost_location(candidate_locations)
+        logger.info(f'Active locations: {active_locations}\n'
+                    f'Current locations: {current_locations}\n'
+                    f'Candidate locations: {candidate_locations}\n'
+                    f'Selected location: {res}\n')
+        return res
+
+    def set_preemptive(self, location: Location) -> None:
+        super().set_preemptive(location)
+        # Prevent the case with only one active location.
+        if len(self.active_locations()) < 2:
+            self.clear_preemptive_locations()
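
Note: the `SPOT_PLACERS` registry is populated through `__init_subclass__` keyword arguments, so a placer registers itself at class-creation time simply by being defined (or imported). A self-contained sketch of the same idiom, with illustrative names rather than the module's:

    from typing import Dict, Optional, Type

    PLACERS: Dict[str, Type['BasePlacer']] = {}
    DEFAULT_PLACER: Optional[str] = None

    class BasePlacer:
        # Python passes the class-statement keywords (name=..., default=...)
        # to __init_subclass__ when each subclass is created.
        def __init_subclass__(cls, name: str, default: bool = False):
            PLACERS[name] = cls
            if default:
                global DEFAULT_PLACER
                assert DEFAULT_PLACER is None, 'Only one policy can be default.'
                DEFAULT_PLACER = name

    class DynamicFallbackPlacer(BasePlacer, name='dynamic_fallback',
                                default=True):
        pass

    assert PLACERS['dynamic_fallback'] is DynamicFallbackPlacer
    assert DEFAULT_PLACER == 'dynamic_fallback'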
sky/utils/registry.py
CHANGED
sky/utils/resources_utils.py
CHANGED
@@ -216,3 +216,53 @@ def need_to_query_reservations() -> bool:
                 cloud_prioritize_reservations):
             return True
     return False
+
+
+def make_launchables_for_valid_region_zones(
+    launchable_resources: 'resources_lib.Resources',
+    override_optimize_by_zone: bool = False,
+) -> List['resources_lib.Resources']:
+    assert launchable_resources.is_launchable()
+    # In principle, all provisioning requests should be made at the granularity
+    # of a single zone. However, for on-demand instances, we batch the requests
+    # to the zones in the same region in order to leverage the region-level
+    # provisioning APIs of AWS and Azure. This way, we can reduce the number of
+    # API calls, and thus the overall failover time. Note that this optimization
+    # does not affect the user cost since the clouds charge the same prices for
+    # on-demand instances in the same region regardless of the zones. On the
+    # other hand, for spot instances, we do not batch the requests because the
+    # AWS spot prices may vary across zones.
+    # For GCP, we do not batch the requests because GCP reservation system is
+    # zone based. Therefore, price estimation is potentially different across
+    # zones.
+
+    # NOTE(woosuk): GCP does not support region-level provisioning APIs. Thus,
+    # while we return per-region resources here, the provisioner will still
+    # issue the request for one zone at a time.
+    # NOTE(woosuk): If we support Azure spot instances, we should batch the
+    # requests since Azure spot prices are region-level.
+    # TODO(woosuk): Batch the per-zone AWS spot instance requests if they are
+    # in the same region and have the same price.
+    # TODO(woosuk): A better design is to implement batching at a higher level
+    # (e.g., in provisioner or optimizer), not here.
+    launchables = []
+    regions = launchable_resources.get_valid_regions_for_launchable()
+    for region in regions:
+        optimize_by_zone = (override_optimize_by_zone or
+                            launchable_resources.cloud.optimize_by_zone())
+        # It is possible that we force the optimize_by_zone but some clouds
+        # do not support zone-level provisioning (i.e. Azure). So we check
+        # if there is zone-level information in the region first.
+        if (region.zones is not None and
+                (launchable_resources.use_spot or optimize_by_zone)):
+            # Spot instances.
+            # Do not batch the per-zone requests.
+            for zone in region.zones:
+                launchables.append(
+                    launchable_resources.copy(region=region.name,
+                                              zone=zone.name))
+        else:
+            # On-demand instances.
+            # Batch the requests at the granularity of a single region.
+            launchables.append(launchable_resources.copy(region=region.name))
+    return launchables
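
Note: the batching rule above boils down to one launchable per zone for spot (or forced zone-level) requests, and one per region otherwise. A toy sketch of that expansion with stand-in types, not the real `Resources`/`Region` classes:

    from dataclasses import dataclass
    from typing import List, Optional

    @dataclass
    class Region:
        name: str
        zones: Optional[List[str]]  # None when no zone info is available

    def expand(regions: List[Region], use_spot: bool,
               optimize_by_zone: bool) -> List[str]:
        targets: List[str] = []
        for region in regions:
            if region.zones is not None and (use_spot or optimize_by_zone):
                # Per-zone requests: spot prices can differ across zones.
                targets.extend(f'{region.name}/{z}' for z in region.zones)
            else:
                # One region-level request: on-demand prices are uniform
                # within an AWS/Azure region.
                targets.append(region.name)
        return targets

    regions = [Region('us-east-1', ['us-east-1a', 'us-east-1b'])]
    assert expand(regions, use_spot=True, optimize_by_zone=False) == [
        'us-east-1/us-east-1a', 'us-east-1/us-east-1b']
    assert expand(regions, use_spot=False, optimize_by_zone=False) == [
        'us-east-1']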
sky/utils/schemas.py
CHANGED
@@ -310,6 +310,7 @@ def get_service_schema():
     # To avoid circular imports, only import when needed.
     # pylint: disable=import-outside-toplevel
     from sky.serve import load_balancing_policies
+    from sky.serve import spot_placer
     return {
         '$schema': 'https://json-schema.org/draft/2020-12/schema',
         'type': 'object',
@@ -362,6 +363,10 @@ def get_service_schema():
                         'type': 'integer',
                         'minimum': 0,
                     },
+                    'num_overprovision': {
+                        'type': 'integer',
+                        'minimum': 0,
+                    },
                     'target_qps_per_replica': {
                         'type': 'number',
                         'minimum': 0,
@@ -373,6 +378,11 @@ def get_service_schema():
                         'type': 'integer',
                         'minimum': 0,
                     },
+                    'spot_placer': {
+                        'type': 'string',
+                        'case_insensitive_enum': list(
+                            spot_placer.SPOT_PLACERS.keys())
+                    },
                     'upscale_delay_seconds': {
                         'type': 'number',
                     },