skypilot-nightly 1.0.0.dev20250328__py3-none-any.whl → 1.0.0.dev20250330__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/cli.py +3 -0
- sky/client/cli.py +3 -0
- sky/clouds/do.py +2 -0
- sky/data/storage_utils.py +2 -8
- sky/optimizer.py +2 -45
- sky/serve/__init__.py +1 -0
- sky/serve/autoscalers.py +26 -11
- sky/serve/replica_managers.py +77 -6
- sky/serve/serve_utils.py +80 -0
- sky/serve/server/core.py +4 -73
- sky/serve/service_spec.py +35 -3
- sky/serve/spot_placer.py +278 -0
- sky/server/common.py +15 -7
- sky/server/requests/executor.py +1 -1
- sky/server/requests/queues/mp_queue.py +8 -1
- sky/server/server.py +1 -1
- sky/utils/registry.py +2 -0
- sky/utils/resources_utils.py +50 -0
- sky/utils/schemas.py +10 -0
- {skypilot_nightly-1.0.0.dev20250328.dist-info → skypilot_nightly-1.0.0.dev20250330.dist-info}/METADATA +3 -2
- {skypilot_nightly-1.0.0.dev20250328.dist-info → skypilot_nightly-1.0.0.dev20250330.dist-info}/RECORD +26 -25
- {skypilot_nightly-1.0.0.dev20250328.dist-info → skypilot_nightly-1.0.0.dev20250330.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250328.dist-info → skypilot_nightly-1.0.0.dev20250330.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250328.dist-info → skypilot_nightly-1.0.0.dev20250330.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250328.dist-info → skypilot_nightly-1.0.0.dev20250330.dist-info}/top_level.txt +0 -0
sky/serve/server/core.py
CHANGED
@@ -25,7 +25,6 @@ from sky.utils import command_runner
 from sky.utils import common
 from sky.utils import common_utils
 from sky.utils import controller_utils
-from sky.utils import resources_utils
 from sky.utils import rich_utils
 from sky.utils import subprocess_utils
 from sky.utils import ux_utils
@@ -33,74 +32,6 @@ from sky.utils import ux_utils
 logger = sky_logging.init_logger(__name__)
 
 
-def _validate_service_task(task: 'sky.Task') -> None:
-    """Validate the task for Sky Serve.
-
-    Args:
-        task: sky.Task to validate
-
-    Raises:
-        ValueError: if the arguments are invalid.
-        RuntimeError: if the task.serve is not found.
-    """
-    spot_resources: List['sky.Resources'] = [
-        resource for resource in task.resources if resource.use_spot
-    ]
-    # TODO(MaoZiming): Allow mixed on-demand and spot specification in resources
-    # On-demand fallback should go to the resources specified as on-demand.
-    if len(spot_resources) not in [0, len(task.resources)]:
-        with ux_utils.print_exception_no_traceback():
-            raise ValueError(
-                'Resources must either all use spot or none use spot. '
-                'To use on-demand and spot instances together, '
-                'use `dynamic_ondemand_fallback` or set '
-                'base_ondemand_fallback_replicas.')
-
-    if task.service is None:
-        with ux_utils.print_exception_no_traceback():
-            raise RuntimeError('Service section not found.')
-
-    policy_description = ('on-demand'
-                          if task.service.dynamic_ondemand_fallback else 'spot')
-    for resource in list(task.resources):
-        if resource.job_recovery is not None:
-            with ux_utils.print_exception_no_traceback():
-                raise ValueError('job_recovery is disabled for SkyServe. '
-                                 'SkyServe will replenish preempted spot '
-                                 f'with {policy_description} instances.')
-
-    replica_ingress_port: Optional[int] = int(
-        task.service.ports) if (task.service.ports is not None) else None
-    for requested_resources in task.resources:
-        if (task.service.use_ondemand_fallback and
-                not requested_resources.use_spot):
-            with ux_utils.print_exception_no_traceback():
-                raise ValueError(
-                    '`use_ondemand_fallback` is only supported '
-                    'for spot resources. Please explicitly specify '
-                    '`use_spot: true` in resources for on-demand fallback.')
-        if task.service.ports is None:
-            requested_ports = list(
-                resources_utils.port_ranges_to_set(requested_resources.ports))
-            if len(requested_ports) != 1:
-                with ux_utils.print_exception_no_traceback():
-                    raise ValueError(
-                        'To open multiple ports on the replica, please set the '
-                        '`service.ports` field to specify a main service port. '
-                        'Must only specify one port in resources otherwise. '
-                        'Each replica will use the port specified as '
-                        'application ingress port.')
-            service_port = requested_ports[0]
-            if replica_ingress_port is None:
-                replica_ingress_port = service_port
-            elif service_port != replica_ingress_port:
-                with ux_utils.print_exception_no_traceback():
-                    raise ValueError(
-                        f'Got multiple ports: {service_port} and '
-                        f'{replica_ingress_port} in different resources. '
-                        'Please specify the same port instead.')
-
-
 def _rewrite_tls_credential_paths_and_get_tls_env_vars(
         service_name: str, task: 'sky.Task') -> Dict[str, Any]:
     """Rewrite the paths of TLS credentials in the task.
@@ -113,7 +44,7 @@ def _rewrite_tls_credential_paths_and_get_tls_env_vars(
         The generated template variables for TLS.
     """
     service_spec = task.service
-    # Already checked by _validate_service_task
+    # Already checked by validate_service_task
     assert service_spec is not None
     if service_spec.tls_credential is None:
         return {'use_tls': False}
@@ -166,7 +97,7 @@ def up(
             'only contains lower letters, numbers and dash): '
             f'{constants.CLUSTER_NAME_VALID_REGEX}')
 
-
+    serve_utils.validate_service_task(task)
    # Always apply the policy again here, even though it might have been applied
     # in the CLI. This is to ensure that we apply the policy to the final DAG
     # and get the mutated config.
@@ -322,7 +253,7 @@ def up(
             skip_status_check=True).get(lb_port)
         assert socket_endpoint is not None, (
             'Did not get endpoint for controller.')
-        # Already checked by _validate_service_task
+        # Already checked by validate_service_task
         assert task.service is not None
         protocol = ('http'
                     if task.service.tls_credential is None else 'https')
@@ -376,7 +307,7 @@ def update(
         mode: Update mode.
     """
     task.validate()
-
+    serve_utils.validate_service_task(task)
 
    # Always apply the policy again here, even though it might have been applied
     # in the CLI. This is to ensure that we apply the policy to the final DAG
sky/serve/service_spec.py
CHANGED
@@ -10,6 +10,7 @@ from sky.adaptors import common as adaptors_common
 from sky.serve import constants
 from sky.serve import load_balancing_policies as lb_policies
 from sky.serve import serve_utils
+from sky.serve import spot_placer as spot_placer_lib
 from sky.utils import common_utils
 from sky.utils import schemas
 from sky.utils import ux_utils
@@ -30,6 +31,7 @@ class SkyServiceSpec:
             readiness_timeout_seconds: int,
             min_replicas: int,
             max_replicas: Optional[int] = None,
+            num_overprovision: Optional[int] = None,
             ports: Optional[str] = None,
             target_qps_per_replica: Optional[float] = None,
             post_data: Optional[Dict[str, Any]] = None,
@@ -37,6 +39,7 @@ class SkyServiceSpec:
             readiness_headers: Optional[Dict[str, str]] = None,
             dynamic_ondemand_fallback: Optional[bool] = None,
             base_ondemand_fallback_replicas: Optional[int] = None,
+            spot_placer: Optional[str] = None,
             upscale_delay_seconds: Optional[int] = None,
             downscale_delay_seconds: Optional[int] = None,
             load_balancing_policy: Optional[str] = None,
@@ -78,6 +81,7 @@ class SkyServiceSpec:
         self._readiness_timeout_seconds: int = readiness_timeout_seconds
         self._min_replicas: int = min_replicas
         self._max_replicas: Optional[int] = max_replicas
+        self._num_overprovision: Optional[int] = num_overprovision
         self._ports: Optional[str] = ports
         self._target_qps_per_replica: Optional[float] = target_qps_per_replica
         self._post_data: Optional[Dict[str, Any]] = post_data
@@ -88,6 +92,7 @@ class SkyServiceSpec:
             bool] = dynamic_ondemand_fallback
         self._base_ondemand_fallback_replicas: Optional[
             int] = base_ondemand_fallback_replicas
+        self._spot_placer: Optional[str] = spot_placer
         self._upscale_delay_seconds: Optional[int] = upscale_delay_seconds
         self._downscale_delay_seconds: Optional[int] = downscale_delay_seconds
         self._load_balancing_policy: Optional[str] = load_balancing_policy
@@ -161,6 +166,7 @@ class SkyServiceSpec:
             min_replicas = constants.DEFAULT_MIN_REPLICAS
             service_config['min_replicas'] = min_replicas
             service_config['max_replicas'] = None
+            service_config['num_overprovision'] = None
             service_config['target_qps_per_replica'] = None
             service_config['upscale_delay_seconds'] = None
             service_config['downscale_delay_seconds'] = None
@@ -168,6 +174,8 @@ class SkyServiceSpec:
             service_config['min_replicas'] = policy_section['min_replicas']
             service_config['max_replicas'] = policy_section.get(
                 'max_replicas', None)
+            service_config['num_overprovision'] = policy_section.get(
+                'num_overprovision', None)
             service_config['target_qps_per_replica'] = policy_section.get(
                 'target_qps_per_replica', None)
             service_config['upscale_delay_seconds'] = policy_section.get(
@@ -179,6 +187,8 @@ class SkyServiceSpec:
                 'base_ondemand_fallback_replicas', None)
             service_config['dynamic_ondemand_fallback'] = policy_section.get(
                 'dynamic_ondemand_fallback', None)
+            service_config['spot_placer'] = policy_section.get(
+                'spot_placer', None)
 
         service_config['load_balancing_policy'] = config.get(
             'load_balancing_policy', None)
@@ -238,12 +248,15 @@ class SkyServiceSpec:
         add_if_not_none('readiness_probe', 'headers', self._readiness_headers)
         add_if_not_none('replica_policy', 'min_replicas', self.min_replicas)
         add_if_not_none('replica_policy', 'max_replicas', self.max_replicas)
+        add_if_not_none('replica_policy', 'num_overprovision',
+                        self.num_overprovision)
         add_if_not_none('replica_policy', 'target_qps_per_replica',
                         self.target_qps_per_replica)
         add_if_not_none('replica_policy', 'dynamic_ondemand_fallback',
                         self.dynamic_ondemand_fallback)
         add_if_not_none('replica_policy', 'base_ondemand_fallback_replicas',
                         self.base_ondemand_fallback_replicas)
+        add_if_not_none('replica_policy', 'spot_placer', self.spot_placer)
         add_if_not_none('replica_policy', 'upscale_delay_seconds',
                         self.upscale_delay_seconds)
         add_if_not_none('replica_policy', 'downscale_delay_seconds',
@@ -269,6 +282,9 @@ class SkyServiceSpec:
         policy_strs: List[str] = []
         if (self.dynamic_ondemand_fallback is not None and
                 self.dynamic_ondemand_fallback):
+            if self.spot_placer is not None:
+                if self.spot_placer == spot_placer_lib.SPOT_HEDGE_PLACER:
+                    return 'SpotHedge'
             policy_strs.append('Dynamic on-demand fallback')
         if self.base_ondemand_fallback_replicas is not None:
             policy_strs.append(
@@ -281,8 +297,12 @@ class SkyServiceSpec:
             policy_strs.append('Static spot mixture with '
                                f'{self.base_ondemand_fallback_replicas} '
                                f'base on-demand replica{plural}')
+        if self.spot_placer is not None:
+            if not policy_strs:
+                policy_strs.append('Spot placement')
+            policy_strs.append(f'with {self.spot_placer} placer')
         if not policy_strs:
-            return 'No spot fallback policy'
+            return 'No spot policy'
         return ' '.join(policy_strs)
 
     def autoscaling_policy_str(self):
@@ -294,9 +314,13 @@ class SkyServiceSpec:
         assert self.target_qps_per_replica is not None
         # TODO(tian): Refactor to contain more information
         max_plural = '' if self.max_replicas == 1 else 's'
+        overprovision_str = ''
+        if self.num_overprovision is not None:
+            overprovision_str = (
+                f' with {self.num_overprovision} overprovisioned replicas')
         return (f'Autoscaling from {self.min_replicas} to {self.max_replicas} '
-                f'replica{max_plural} (target QPS per replica: '
-                f'{self.target_qps_per_replica})')
+                f'replica{max_plural}{overprovision_str} (target QPS per '
+                f'replica: {self.target_qps_per_replica})')
 
     def set_ports(self, ports: str) -> None:
         self._ports = ports
@@ -339,6 +363,10 @@ class SkyServiceSpec:
         # If None, treated as having the same value of min_replicas.
         return self._max_replicas
 
+    @property
+    def num_overprovision(self) -> Optional[int]:
+        return self._num_overprovision
+
     @property
     def ports(self) -> Optional[str]:
         return self._ports
@@ -372,6 +400,10 @@ class SkyServiceSpec:
     def dynamic_ondemand_fallback(self) -> Optional[bool]:
         return self._dynamic_ondemand_fallback
 
+    @property
+    def spot_placer(self) -> Optional[str]:
+        return self._spot_placer
+
     @property
     def upscale_delay_seconds(self) -> Optional[int]:
         return self._upscale_delay_seconds
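
For a quick sense of how the two new fields surface to users: both are read from the `replica_policy` section in `from_yaml_config` above. Below is a minimal sketch with hypothetical values; the surrounding keys are assumed to follow the usual SkyServe service schema (see the `sky/utils/schemas.py` change in this release).

# Hypothetical config dict mirroring a service YAML. `num_overprovision` and
# `spot_placer` are the keys added in this release; 'dynamic_fallback' is
# SPOT_HEDGE_PLACER from sky/serve/spot_placer.py.
from sky.serve import service_spec

config = {
    'readiness_probe': '/health',
    'replica_policy': {
        'min_replicas': 2,
        'max_replicas': 5,
        'num_overprovision': 1,  # launch one replica beyond the target
        'spot_placer': 'dynamic_fallback',
    },
}
spec = service_spec.SkyServiceSpec.from_yaml_config(config)
print(spec.num_overprovision)  # 1
print(spec.spot_placer)        # 'dynamic_fallback'

With `dynamic_ondemand_fallback` also enabled, `spot_policy_str()` short-circuits to 'SpotHedge', per the branch added above.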
sky/serve/spot_placer.py
ADDED
@@ -0,0 +1,278 @@
+"""Spot Placer for SpotHedge."""
+
+import collections
+import dataclasses
+import enum
+import typing
+from typing import Any, Dict, List, Optional, Set
+
+from sky import check as sky_check
+from sky import clouds as sky_clouds
+from sky import sky_logging
+from sky.clouds import cloud as sky_cloud
+from sky.utils import registry
+from sky.utils import resources_utils
+from sky.utils import ux_utils
+
+if typing.TYPE_CHECKING:
+    from sky import resources as resources_lib
+    from sky import task as task_lib
+    from sky.serve import service_spec
+
+logger = sky_logging.init_logger(__name__)
+
+SPOT_PLACERS = {}
+DEFAULT_SPOT_PLACER = None
+SPOT_HEDGE_PLACER = 'dynamic_fallback'
+
+
+@dataclasses.dataclass
+class Location:
+    """Location class of a spot instance."""
+    cloud: 'sky_clouds.Cloud'
+    region: str
+    zone: Optional[str]
+
+    def __eq__(self, other) -> bool:
+        if isinstance(other, Location):
+            return (self.cloud.is_same_cloud(other.cloud) and
+                    self.region == other.region and self.zone == other.zone)
+        return False
+
+    def __hash__(self) -> int:
+        return hash(
+            str(self.cloud) + self.region +
+            (self.zone if self.zone is not None else ''))
+
+    @classmethod
+    def from_resources(cls, resources: 'resources_lib.Resources') -> 'Location':
+        return cls(resources.cloud, resources.region, resources.zone)
+
+    def to_dict(self) -> Dict[str, Any]:
+        return {'cloud': self.cloud, 'region': self.region, 'zone': self.zone}
+
+    @classmethod
+    def from_pickleable(
+        cls,
+        data: Optional[Dict[str, Optional[str]]],
+    ) -> Optional['Location']:
+        if data is None:
+            return None
+        cloud = registry.CLOUD_REGISTRY.from_str(data['cloud'])
+        assert cloud is not None
+        assert data['region'] is not None
+        return cls(
+            cloud=cloud,
+            region=data['region'],
+            zone=data['zone'],
+        )
+
+    def to_pickleable(self) -> Dict[str, Optional[str]]:
+        return {
+            'cloud': str(self.cloud),
+            'region': self.region,
+            'zone': self.zone,
+        }
+
+
+class LocationStatus(enum.Enum):
+    """Location Spot Status."""
+    ACTIVE = 'ACTIVE'
+    PREEMPTED = 'PREEMPTED'
+
+
+def _get_possible_location_from_task(task: 'task_lib.Task') -> List[Location]:
+
+    def _without_location(
+            resources: 'resources_lib.Resources') -> 'resources_lib.Resources':
+        return resources.copy(cloud=None, region=None, zone=None)
+
+    assert task.resources  # Guaranteed in task constructor
+    empty_location_resources = _without_location(list(task.resources)[0])
+    empty_location_resources_config = empty_location_resources.to_yaml_config()
+
+    location_requirements: Dict[str, Dict[str, Set[str]]] = (
+        collections.defaultdict(lambda: collections.defaultdict(set)))
+
+    for r in task.resources:
+        if (_without_location(r).to_yaml_config() !=
+                empty_location_resources_config):
+            with ux_utils.print_exception_no_traceback():
+                raise ValueError(
+                    'Different resource configurations are not supported '
+                    'for spot placement. All resources must have the same '
+                    'configuration except for cloud/region/zone.')
+        if r.cloud is None:
+            continue
+        cloud_str = str(r.cloud)
+        if r.region is None:
+            # Access defaultdict to create empty entry if it doesn't exist.
+            _ = location_requirements[cloud_str]
+            continue
+        if r.zone is None:
+            # Same as above.
+            _ = location_requirements[cloud_str][r.region]
+            continue
+        location_requirements[cloud_str][r.region].add(r.zone)
+
+    clouds_list: List[sky_clouds.Cloud] = []
+    for c in location_requirements.keys():
+        cloud_obj = registry.CLOUD_REGISTRY.from_str(c)
+        assert cloud_obj is not None
+        clouds_list.append(cloud_obj)
+    if not clouds_list:
+        # If the cloud list is empty, that means the user has no location
+        # related requirements. Then we start with all enabled clouds and
+        # all possible regions and zones.
+        clouds_list = sky_check.get_cached_enabled_clouds_or_refresh(
+            capability=sky_cloud.CloudCapability.COMPUTE,
+            raise_if_no_cloud_access=False)
+        for cloud in clouds_list:
+            # Create empty entry for each cloud.
+            _ = location_requirements[str(cloud)]
+
+    possible_locations = set()
+    for cloud in clouds_list:
+        feasible_resources: resources_utils.FeasibleResources = (
+            cloud.get_feasible_launchable_resources(empty_location_resources,
+                                                    num_nodes=task.num_nodes))
+        for feasible in feasible_resources.resources_list:
+            # We set override_optimize_by_zone=True to force the provisioner
+            # to use zone-level provisioning. This is to get accurate location
+            # information.
+            launchables: List['resources_lib.Resources'] = (
+                resources_utils.make_launchables_for_valid_region_zones(
+                    feasible, override_optimize_by_zone=True))
+            for launchable in launchables:
+                cloud_str = str(launchable.cloud)
+                region = launchable.region
+                zone = launchable.zone
+                if (cloud_str not in location_requirements and
+                        location_requirements):
+                    continue
+                # We need to use .get() here to avoid creating extra entries in
+                # location_requirements, and being treated as user's requirement
+                # in the following regions.
+                cloud_reqs = location_requirements.get(cloud_str, {})
+                if region not in cloud_reqs and cloud_reqs:
+                    continue
+                region_reqs = cloud_reqs.get(region, set())
+                if zone not in region_reqs and region_reqs:
+                    continue
+                possible_locations.add(Location.from_resources(launchable))
+
+    return list(possible_locations)
+
+
+class SpotPlacer:
+    """Spot Placement specification."""
+
+    def __init__(self, task: 'task_lib.Task') -> None:
+        possible_locations = _get_possible_location_from_task(task)
+        logger.info(f'{len(possible_locations)} possible location candidates '
+                    'are enabled for spot placement.')
+        logger.debug(f'All possible locations: {possible_locations}')
+        self.location2status: Dict[Location, LocationStatus] = {
+            location: LocationStatus.ACTIVE for location in possible_locations
+        }
+        self.location2cost: Dict[Location, float] = {}
+        # Already checked there is only one resource in the task.
+        self.resources = list(task.resources)[0]
+        self.num_nodes = task.num_nodes
+
+    def __init_subclass__(cls, name: str, default: bool = False):
+        SPOT_PLACERS[name] = cls
+        if default:
+            global DEFAULT_SPOT_PLACER
+            assert DEFAULT_SPOT_PLACER is None, (
+                'Only one policy can be default.')
+            DEFAULT_SPOT_PLACER = name
+
+    def select_next_location(self,
+                             current_locations: List[Location]) -> Location:
+        """Select next location to place spot instance."""
+        raise NotImplementedError
+
+    def set_active(self, location: Location) -> None:
+        assert location in self.location2status, location
+        self.location2status[location] = LocationStatus.ACTIVE
+
+    def set_preemptive(self, location: Location) -> None:
+        assert location in self.location2status, location
+        self.location2status[location] = LocationStatus.PREEMPTED
+
+    def clear_preemptive_locations(self) -> None:
+        for location in self.location2status:
+            self.location2status[location] = LocationStatus.ACTIVE
+
+    def _min_cost_location(self, locations: List[Location]) -> Location:
+
+        def _get_cost_per_hour(location: Location) -> float:
+            if location in self.location2cost:
+                return self.location2cost[location]
+            # TODO(tian): Is there a better way to do this? This is for filling
+            # instance type so the get_cost() can operate normally.
+            r: 'resources_lib.Resources' = self.resources.copy(
+                **location.to_dict())
+            assert r.cloud is not None
+            rs = r.cloud.get_feasible_launchable_resources(
+                r, num_nodes=self.num_nodes).resources_list
+            # For some clouds, there might have multiple instance types
+            # satisfying the resource request. In such case we choose the
+            # cheapest one, as the optimizer does. Reference:
+            # sky/optimizer.py::Optimizer::_print_candidates
+            cost = min(r.get_cost(seconds=3600) for r in rs)
+            self.location2cost[location] = cost
+            return cost
+
+        return min(locations, key=_get_cost_per_hour)
+
+    def _location_with_status(self, status: LocationStatus) -> List[Location]:
+        return [
+            location
+            for location, location_type in self.location2status.items()
+            if location_type == status
+        ]
+
+    def active_locations(self) -> List[Location]:
+        return self._location_with_status(LocationStatus.ACTIVE)
+
+    def preemptive_locations(self) -> List[Location]:
+        return self._location_with_status(LocationStatus.PREEMPTED)
+
+    @classmethod
+    def from_task(cls, spec: 'service_spec.SkyServiceSpec',
+                  task: 'task_lib.Task') -> Optional['SpotPlacer']:
+        if spec.spot_placer is None:
+            return None
+        return SPOT_PLACERS[spec.spot_placer](task)
+
+
+class DynamicFallbackSpotPlacer(SpotPlacer,
+                                name=SPOT_HEDGE_PLACER,
+                                default=True):
+    """Dynamic Fallback Placer."""
+
+    def select_next_location(self,
+                             current_locations: List[Location]) -> Location:
+        active_locations = self.active_locations()
+        # Prioritize locations that are not currently used.
+        candidate_locations: List[Location] = [
+            location for location in active_locations
+            if location not in current_locations
+        ]
+        # If no candidate locations, use all active locations.
+        if not candidate_locations:
+            candidate_locations = active_locations
+        res = self._min_cost_location(candidate_locations)
+        logger.info(f'Active locations: {active_locations}\n'
+                    f'Current locations: {current_locations}\n'
+                    f'Candidate locations: {candidate_locations}\n'
+                    f'Selected location: {res}\n')
+        return res
+
+    def set_preemptive(self, location: Location) -> None:
+        super().set_preemptive(location)
+        # Prevent the case with only one active location.
+        if len(self.active_locations()) < 2:
+            self.clear_preemptive_locations()
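
Note the registration mechanism in the new module: `SpotPlacer.__init_subclass__` takes `name` (and optionally `default`) as class keywords, so each placer registers itself in `SPOT_PLACERS` at class-definition time, and `from_task` resolves `spec.spot_placer` through that dict. A self-contained sketch of the same pattern, with toy class names rather than SkyPilot's:

# Toy mirror of the registration pattern in spot_placer.py: subclasses pass
# `name=` (and optionally `default=True`) as class keywords, and the base
# class records them in a module-level registry when the class is defined.
from typing import Dict, Optional, Type

PLACERS: Dict[str, Type['Placer']] = {}
DEFAULT_PLACER: Optional[str] = None


class Placer:

    def __init_subclass__(cls, name: str, default: bool = False):
        PLACERS[name] = cls
        if default:
            global DEFAULT_PLACER
            assert DEFAULT_PLACER is None, 'Only one policy can be default.'
            DEFAULT_PLACER = name


class DynamicFallback(Placer, name='dynamic_fallback', default=True):
    pass


assert PLACERS['dynamic_fallback'] is DynamicFallback
assert DEFAULT_PLACER == 'dynamic_fallback'

This keeps the string-to-class mapping next to the class definitions, so adding a new placer is just defining a subclass with a new `name=`.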
sky/server/common.py
CHANGED
@@ -51,6 +51,11 @@ API_SERVER_CMD = '-m sky.server.server'
 API_SERVER_CLIENT_DIR = pathlib.Path('~/.sky/api_server/clients')
 RETRY_COUNT_ON_TIMEOUT = 3
 
+# The maximum time to wait for the API server to start, set to a conservative
+# value that unlikely to reach since the server might be just starting slowly
+# (e.g. in high contention env) and we will exit eagerly if server exit.
+WAIT_APISERVER_START_TIMEOUT_SEC = 60
+
 SKY_API_VERSION_WARNING = (
     f'{colorama.Fore.YELLOW}SkyPilot API server is too old: '
     f'v{{server_version}} (client version is v{{client_version}}). '
@@ -179,7 +184,8 @@ def _start_api_server(deploy: bool = False,
     server_url = get_server_url(host)
     assert server_url in AVAILABLE_LOCAL_API_SERVER_URLS, (
         f'server url {server_url} is not a local url')
-    with rich_utils.client_status('Starting SkyPilot API server'):
+    with rich_utils.client_status('Starting SkyPilot API server, '
+                                  f'view logs at {constants.API_SERVER_LOGS}'):
         logger.info(f'{colorama.Style.DIM}Failed to connect to '
                     f'SkyPilot API server at {server_url}. '
                     'Starting a local server.'
@@ -216,14 +222,16 @@ def _start_api_server(deploy: bool = False,
         # If this is called from a CLI invocation, we need
         # start_new_session=True so that SIGINT on the CLI will not also kill
         # the API server.
-        subprocess.Popen(cmd, shell=True, start_new_session=True)
+        proc = subprocess.Popen(cmd, shell=True, start_new_session=True)
 
-        # Wait for the server to start until timeout.
-        # Conservative upper time bound for starting the server based on
-        # profiling.
-        timeout_sec = 12
         start_time = time.time()
         while True:
+            # Check if process has exited
+            if proc.poll() is not None:
+                with ux_utils.print_exception_no_traceback():
+                    raise RuntimeError(
+                        'SkyPilot API server process exited unexpectedly.\n'
+                        f'View logs at: {constants.API_SERVER_LOGS}')
             api_server_info = get_api_server_status()
             assert api_server_info.status != ApiServerStatus.VERSION_MISMATCH, (
                 f'API server version mismatch when starting the server. '
@@ -231,7 +239,7 @@ def _start_api_server(deploy: bool = False,
                 f'Client version: {server_constants.API_VERSION}')
             if api_server_info.status == ApiServerStatus.HEALTHY:
                 break
-            elif time.time() - start_time >= timeout_sec:
+            elif time.time() - start_time >= WAIT_APISERVER_START_TIMEOUT_SEC:
                 with ux_utils.print_exception_no_traceback():
                     raise RuntimeError(
                         'Failed to start SkyPilot API server at '
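
The change above keeps a handle to the spawned `Popen` and polls it inside the readiness loop, so a crashed server fails fast instead of burning the full 60-second timeout; the `mp_queue` change below applies the same idea via `Process.is_alive()`. A generic sketch of the pattern, with a hypothetical `is_healthy` callback standing in for SkyPilot's status check:

# Generic fail-fast wait loop: poll readiness, but bail out immediately if the
# child process has already exited. `is_healthy` is a stand-in health check.
import subprocess
import time
from typing import Callable


def wait_until_ready(cmd: str,
                     is_healthy: Callable[[], bool],
                     timeout_sec: float = 60.0) -> None:
    proc = subprocess.Popen(cmd, shell=True, start_new_session=True)
    start = time.time()
    while True:
        if proc.poll() is not None:  # child exited: fail fast
            raise RuntimeError('Server process exited unexpectedly.')
        if is_healthy():
            return
        if time.time() - start >= timeout_sec:
            raise RuntimeError('Timed out waiting for server to start.')
        time.sleep(0.2)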
sky/server/requests/executor.py
CHANGED
@@ -465,7 +465,7 @@ def start(deploy: bool) -> List[multiprocessing.Process]:
         target=mp_queue.start_queue_manager, args=(queue_names, port))
     queue_server.start()
     sub_procs.append(queue_server)
-    mp_queue.wait_for_queues_to_be_ready(queue_names, port)
+    mp_queue.wait_for_queues_to_be_ready(queue_names, queue_server, port)
 
     logger.info('Request queues created')
 
sky/server/requests/queues/mp_queue.py
CHANGED
@@ -1,4 +1,5 @@
 """Shared queues for multiprocessing."""
+import multiprocessing
 from multiprocessing import managers
 import queue
 import time
@@ -57,10 +58,13 @@ def get_queue(queue_name: str,
 
 
 def wait_for_queues_to_be_ready(queue_names: List[str],
+                                queue_server: multiprocessing.Process,
                                 port: int = DEFAULT_QUEUE_MANAGER_PORT) -> None:
     """Wait for the queues to be ready after queue manager is just started."""
     initial_time = time.time()
-
+    # Wait for queue manager to be ready. Exit eagerly if the manager process
+    # exits, just wait for a long timeout that is unlikely to reach otherwise.
+    max_wait_time = 60
     while queue_names:
         try:
             get_queue(queue_names[0], port)
@@ -70,6 +74,9 @@ def wait_for_queues_to_be_ready(queue_names: List[str],
             logger.info(f'Waiting for request queue, named {queue_names[0]!r}, '
                         f'to be ready...')
             time.sleep(0.2)
+            if not queue_server.is_alive():
+                raise RuntimeError(
+                    'Queue manager process exited unexpectedly.') from e
         if time.time() - initial_time > max_wait_time:
             raise RuntimeError(
                 f'Request queue, named {queue_names[0]!r}, '
sky/server/server.py
CHANGED
@@ -1116,7 +1116,7 @@ if __name__ == '__main__':
 
     sub_procs = []
     try:
-        sub_procs = executor.start(cmd_args.deploy)
+        sub_procs = executor.start(deploy=cmd_args.deploy)
         logger.info(f'Starting SkyPilot API server, workers={num_workers}')
        # We don't support reload for now, since it may cause leakage of request
         # workers or interrupt running requests.