skypilot-nightly 1.0.0.dev20250328__py3-none-any.whl → 1.0.0.dev20250330__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sky/serve/server/core.py CHANGED
@@ -25,7 +25,6 @@ from sky.utils import command_runner
25
25
  from sky.utils import common
26
26
  from sky.utils import common_utils
27
27
  from sky.utils import controller_utils
28
- from sky.utils import resources_utils
29
28
  from sky.utils import rich_utils
30
29
  from sky.utils import subprocess_utils
31
30
  from sky.utils import ux_utils
@@ -33,74 +32,6 @@ from sky.utils import ux_utils
33
32
  logger = sky_logging.init_logger(__name__)
34
33
 
35
34
 
36
- def _validate_service_task(task: 'sky.Task') -> None:
37
- """Validate the task for Sky Serve.
38
-
39
- Args:
40
- task: sky.Task to validate
41
-
42
- Raises:
43
- ValueError: if the arguments are invalid.
44
- RuntimeError: if the task.serve is not found.
45
- """
46
- spot_resources: List['sky.Resources'] = [
47
- resource for resource in task.resources if resource.use_spot
48
- ]
49
- # TODO(MaoZiming): Allow mixed on-demand and spot specification in resources
50
- # On-demand fallback should go to the resources specified as on-demand.
51
- if len(spot_resources) not in [0, len(task.resources)]:
52
- with ux_utils.print_exception_no_traceback():
53
- raise ValueError(
54
- 'Resources must either all use spot or none use spot. '
55
- 'To use on-demand and spot instances together, '
56
- 'use `dynamic_ondemand_fallback` or set '
57
- 'base_ondemand_fallback_replicas.')
58
-
59
- if task.service is None:
60
- with ux_utils.print_exception_no_traceback():
61
- raise RuntimeError('Service section not found.')
62
-
63
- policy_description = ('on-demand'
64
- if task.service.dynamic_ondemand_fallback else 'spot')
65
- for resource in list(task.resources):
66
- if resource.job_recovery is not None:
67
- with ux_utils.print_exception_no_traceback():
68
- raise ValueError('job_recovery is disabled for SkyServe. '
69
- 'SkyServe will replenish preempted spot '
70
- f'with {policy_description} instances.')
71
-
72
- replica_ingress_port: Optional[int] = int(
73
- task.service.ports) if (task.service.ports is not None) else None
74
- for requested_resources in task.resources:
75
- if (task.service.use_ondemand_fallback and
76
- not requested_resources.use_spot):
77
- with ux_utils.print_exception_no_traceback():
78
- raise ValueError(
79
- '`use_ondemand_fallback` is only supported '
80
- 'for spot resources. Please explicitly specify '
81
- '`use_spot: true` in resources for on-demand fallback.')
82
- if task.service.ports is None:
83
- requested_ports = list(
84
- resources_utils.port_ranges_to_set(requested_resources.ports))
85
- if len(requested_ports) != 1:
86
- with ux_utils.print_exception_no_traceback():
87
- raise ValueError(
88
- 'To open multiple ports on the replica, please set the '
89
- '`service.ports` field to specify a main service port. '
90
- 'Must only specify one port in resources otherwise. '
91
- 'Each replica will use the port specified as '
92
- 'application ingress port.')
93
- service_port = requested_ports[0]
94
- if replica_ingress_port is None:
95
- replica_ingress_port = service_port
96
- elif service_port != replica_ingress_port:
97
- with ux_utils.print_exception_no_traceback():
98
- raise ValueError(
99
- f'Got multiple ports: {service_port} and '
100
- f'{replica_ingress_port} in different resources. '
101
- 'Please specify the same port instead.')
102
-
103
-
104
35
  def _rewrite_tls_credential_paths_and_get_tls_env_vars(
105
36
  service_name: str, task: 'sky.Task') -> Dict[str, Any]:
106
37
  """Rewrite the paths of TLS credentials in the task.
@@ -113,7 +44,7 @@ def _rewrite_tls_credential_paths_and_get_tls_env_vars(
113
44
  The generated template variables for TLS.
114
45
  """
115
46
  service_spec = task.service
116
- # Already checked by _validate_service_task
47
+ # Already checked by validate_service_task
117
48
  assert service_spec is not None
118
49
  if service_spec.tls_credential is None:
119
50
  return {'use_tls': False}
@@ -166,7 +97,7 @@ def up(
166
97
  'only contains lower letters, numbers and dash): '
167
98
  f'{constants.CLUSTER_NAME_VALID_REGEX}')
168
99
 
169
- _validate_service_task(task)
100
+ serve_utils.validate_service_task(task)
170
101
  # Always apply the policy again here, even though it might have been applied
171
102
  # in the CLI. This is to ensure that we apply the policy to the final DAG
172
103
  # and get the mutated config.
@@ -322,7 +253,7 @@ def up(
322
253
  skip_status_check=True).get(lb_port)
323
254
  assert socket_endpoint is not None, (
324
255
  'Did not get endpoint for controller.')
325
- # Already checked by _validate_service_task
256
+ # Already checked by validate_service_task
326
257
  assert task.service is not None
327
258
  protocol = ('http'
328
259
  if task.service.tls_credential is None else 'https')
@@ -376,7 +307,7 @@ def update(
376
307
  mode: Update mode.
377
308
  """
378
309
  task.validate()
379
- _validate_service_task(task)
310
+ serve_utils.validate_service_task(task)
380
311
 
381
312
  # Always apply the policy again here, even though it might have been applied
382
313
  # in the CLI. This is to ensure that we apply the policy to the final DAG
sky/serve/service_spec.py CHANGED
@@ -10,6 +10,7 @@ from sky.adaptors import common as adaptors_common
10
10
  from sky.serve import constants
11
11
  from sky.serve import load_balancing_policies as lb_policies
12
12
  from sky.serve import serve_utils
13
+ from sky.serve import spot_placer as spot_placer_lib
13
14
  from sky.utils import common_utils
14
15
  from sky.utils import schemas
15
16
  from sky.utils import ux_utils
@@ -30,6 +31,7 @@ class SkyServiceSpec:
30
31
  readiness_timeout_seconds: int,
31
32
  min_replicas: int,
32
33
  max_replicas: Optional[int] = None,
34
+ num_overprovision: Optional[int] = None,
33
35
  ports: Optional[str] = None,
34
36
  target_qps_per_replica: Optional[float] = None,
35
37
  post_data: Optional[Dict[str, Any]] = None,
@@ -37,6 +39,7 @@ class SkyServiceSpec:
37
39
  readiness_headers: Optional[Dict[str, str]] = None,
38
40
  dynamic_ondemand_fallback: Optional[bool] = None,
39
41
  base_ondemand_fallback_replicas: Optional[int] = None,
42
+ spot_placer: Optional[str] = None,
40
43
  upscale_delay_seconds: Optional[int] = None,
41
44
  downscale_delay_seconds: Optional[int] = None,
42
45
  load_balancing_policy: Optional[str] = None,
@@ -78,6 +81,7 @@ class SkyServiceSpec:
78
81
  self._readiness_timeout_seconds: int = readiness_timeout_seconds
79
82
  self._min_replicas: int = min_replicas
80
83
  self._max_replicas: Optional[int] = max_replicas
84
+ self._num_overprovision: Optional[int] = num_overprovision
81
85
  self._ports: Optional[str] = ports
82
86
  self._target_qps_per_replica: Optional[float] = target_qps_per_replica
83
87
  self._post_data: Optional[Dict[str, Any]] = post_data
@@ -88,6 +92,7 @@ class SkyServiceSpec:
88
92
  bool] = dynamic_ondemand_fallback
89
93
  self._base_ondemand_fallback_replicas: Optional[
90
94
  int] = base_ondemand_fallback_replicas
95
+ self._spot_placer: Optional[str] = spot_placer
91
96
  self._upscale_delay_seconds: Optional[int] = upscale_delay_seconds
92
97
  self._downscale_delay_seconds: Optional[int] = downscale_delay_seconds
93
98
  self._load_balancing_policy: Optional[str] = load_balancing_policy
@@ -161,6 +166,7 @@ class SkyServiceSpec:
161
166
  min_replicas = constants.DEFAULT_MIN_REPLICAS
162
167
  service_config['min_replicas'] = min_replicas
163
168
  service_config['max_replicas'] = None
169
+ service_config['num_overprovision'] = None
164
170
  service_config['target_qps_per_replica'] = None
165
171
  service_config['upscale_delay_seconds'] = None
166
172
  service_config['downscale_delay_seconds'] = None
@@ -168,6 +174,8 @@ class SkyServiceSpec:
168
174
  service_config['min_replicas'] = policy_section['min_replicas']
169
175
  service_config['max_replicas'] = policy_section.get(
170
176
  'max_replicas', None)
177
+ service_config['num_overprovision'] = policy_section.get(
178
+ 'num_overprovision', None)
171
179
  service_config['target_qps_per_replica'] = policy_section.get(
172
180
  'target_qps_per_replica', None)
173
181
  service_config['upscale_delay_seconds'] = policy_section.get(
@@ -179,6 +187,8 @@ class SkyServiceSpec:
179
187
  'base_ondemand_fallback_replicas', None)
180
188
  service_config['dynamic_ondemand_fallback'] = policy_section.get(
181
189
  'dynamic_ondemand_fallback', None)
190
+ service_config['spot_placer'] = policy_section.get(
191
+ 'spot_placer', None)
182
192
 
183
193
  service_config['load_balancing_policy'] = config.get(
184
194
  'load_balancing_policy', None)
@@ -238,12 +248,15 @@ class SkyServiceSpec:
238
248
  add_if_not_none('readiness_probe', 'headers', self._readiness_headers)
239
249
  add_if_not_none('replica_policy', 'min_replicas', self.min_replicas)
240
250
  add_if_not_none('replica_policy', 'max_replicas', self.max_replicas)
251
+ add_if_not_none('replica_policy', 'num_overprovision',
252
+ self.num_overprovision)
241
253
  add_if_not_none('replica_policy', 'target_qps_per_replica',
242
254
  self.target_qps_per_replica)
243
255
  add_if_not_none('replica_policy', 'dynamic_ondemand_fallback',
244
256
  self.dynamic_ondemand_fallback)
245
257
  add_if_not_none('replica_policy', 'base_ondemand_fallback_replicas',
246
258
  self.base_ondemand_fallback_replicas)
259
+ add_if_not_none('replica_policy', 'spot_placer', self.spot_placer)
247
260
  add_if_not_none('replica_policy', 'upscale_delay_seconds',
248
261
  self.upscale_delay_seconds)
249
262
  add_if_not_none('replica_policy', 'downscale_delay_seconds',
@@ -269,6 +282,9 @@ class SkyServiceSpec:
269
282
  policy_strs: List[str] = []
270
283
  if (self.dynamic_ondemand_fallback is not None and
271
284
  self.dynamic_ondemand_fallback):
285
+ if self.spot_placer is not None:
286
+ if self.spot_placer == spot_placer_lib.SPOT_HEDGE_PLACER:
287
+ return 'SpotHedge'
272
288
  policy_strs.append('Dynamic on-demand fallback')
273
289
  if self.base_ondemand_fallback_replicas is not None:
274
290
  policy_strs.append(
@@ -281,8 +297,12 @@ class SkyServiceSpec:
281
297
  policy_strs.append('Static spot mixture with '
282
298
  f'{self.base_ondemand_fallback_replicas} '
283
299
  f'base on-demand replica{plural}')
300
+ if self.spot_placer is not None:
301
+ if not policy_strs:
302
+ policy_strs.append('Spot placement')
303
+ policy_strs.append(f'with {self.spot_placer} placer')
284
304
  if not policy_strs:
285
- return 'No spot fallback policy'
305
+ return 'No spot policy'
286
306
  return ' '.join(policy_strs)
287
307
 
288
308
  def autoscaling_policy_str(self):
@@ -294,9 +314,13 @@ class SkyServiceSpec:
294
314
  assert self.target_qps_per_replica is not None
295
315
  # TODO(tian): Refactor to contain more information
296
316
  max_plural = '' if self.max_replicas == 1 else 's'
317
+ overprovision_str = ''
318
+ if self.num_overprovision is not None:
319
+ overprovision_str = (
320
+ f' with {self.num_overprovision} overprovisioned replicas')
297
321
  return (f'Autoscaling from {self.min_replicas} to {self.max_replicas} '
298
- f'replica{max_plural} (target QPS per replica: '
299
- f'{self.target_qps_per_replica})')
322
+ f'replica{max_plural}{overprovision_str} (target QPS per '
323
+ f'replica: {self.target_qps_per_replica})')
300
324
 
301
325
  def set_ports(self, ports: str) -> None:
302
326
  self._ports = ports
@@ -339,6 +363,10 @@ class SkyServiceSpec:
339
363
  # If None, treated as having the same value of min_replicas.
340
364
  return self._max_replicas
341
365
 
366
+ @property
367
+ def num_overprovision(self) -> Optional[int]:
368
+ return self._num_overprovision
369
+
342
370
  @property
343
371
  def ports(self) -> Optional[str]:
344
372
  return self._ports
@@ -372,6 +400,10 @@ class SkyServiceSpec:
372
400
  def dynamic_ondemand_fallback(self) -> Optional[bool]:
373
401
  return self._dynamic_ondemand_fallback
374
402
 
403
+ @property
404
+ def spot_placer(self) -> Optional[str]:
405
+ return self._spot_placer
406
+
375
407
  @property
376
408
  def upscale_delay_seconds(self) -> Optional[int]:
377
409
  return self._upscale_delay_seconds
@@ -0,0 +1,278 @@
1
+ """Spot Placer for SpotHedge."""
2
+
3
+ import collections
4
+ import dataclasses
5
+ import enum
6
+ import typing
7
+ from typing import Any, Dict, List, Optional, Set
8
+
9
+ from sky import check as sky_check
10
+ from sky import clouds as sky_clouds
11
+ from sky import sky_logging
12
+ from sky.clouds import cloud as sky_cloud
13
+ from sky.utils import registry
14
+ from sky.utils import resources_utils
15
+ from sky.utils import ux_utils
16
+
17
+ if typing.TYPE_CHECKING:
18
+ from sky import resources as resources_lib
19
+ from sky import task as task_lib
20
+ from sky.serve import service_spec
21
+
22
+ logger = sky_logging.init_logger(__name__)
23
+
24
+ SPOT_PLACERS = {}
25
+ DEFAULT_SPOT_PLACER = None
26
+ SPOT_HEDGE_PLACER = 'dynamic_fallback'
27
+
28
+
29
+ @dataclasses.dataclass
30
+ class Location:
31
+ """Location class of a spot instance."""
32
+ cloud: 'sky_clouds.Cloud'
33
+ region: str
34
+ zone: Optional[str]
35
+
36
+ def __eq__(self, other) -> bool:
37
+ if isinstance(other, Location):
38
+ return (self.cloud.is_same_cloud(other.cloud) and
39
+ self.region == other.region and self.zone == other.zone)
40
+ return False
41
+
42
+ def __hash__(self) -> int:
43
+ return hash(
44
+ str(self.cloud) + self.region +
45
+ (self.zone if self.zone is not None else ''))
46
+
47
+ @classmethod
48
+ def from_resources(cls, resources: 'resources_lib.Resources') -> 'Location':
49
+ return cls(resources.cloud, resources.region, resources.zone)
50
+
51
+ def to_dict(self) -> Dict[str, Any]:
52
+ return {'cloud': self.cloud, 'region': self.region, 'zone': self.zone}
53
+
54
+ @classmethod
55
+ def from_pickleable(
56
+ cls,
57
+ data: Optional[Dict[str, Optional[str]]],
58
+ ) -> Optional['Location']:
59
+ if data is None:
60
+ return None
61
+ cloud = registry.CLOUD_REGISTRY.from_str(data['cloud'])
62
+ assert cloud is not None
63
+ assert data['region'] is not None
64
+ return cls(
65
+ cloud=cloud,
66
+ region=data['region'],
67
+ zone=data['zone'],
68
+ )
69
+
70
+ def to_pickleable(self) -> Dict[str, Optional[str]]:
71
+ return {
72
+ 'cloud': str(self.cloud),
73
+ 'region': self.region,
74
+ 'zone': self.zone,
75
+ }
76
+
77
+
78
+ class LocationStatus(enum.Enum):
79
+ """Location Spot Status."""
80
+ ACTIVE = 'ACTIVE'
81
+ PREEMPTED = 'PREEMPTED'
82
+
83
+
84
+ def _get_possible_location_from_task(task: 'task_lib.Task') -> List[Location]:
85
+
86
+ def _without_location(
87
+ resources: 'resources_lib.Resources') -> 'resources_lib.Resources':
88
+ return resources.copy(cloud=None, region=None, zone=None)
89
+
90
+ assert task.resources # Guaranteed in task constructor
91
+ empty_location_resources = _without_location(list(task.resources)[0])
92
+ empty_location_resources_config = empty_location_resources.to_yaml_config()
93
+
94
+ location_requirements: Dict[str, Dict[str, Set[str]]] = (
95
+ collections.defaultdict(lambda: collections.defaultdict(set)))
96
+
97
+ for r in task.resources:
98
+ if (_without_location(r).to_yaml_config() !=
99
+ empty_location_resources_config):
100
+ with ux_utils.print_exception_no_traceback():
101
+ raise ValueError(
102
+ 'Different resource configurations are not supported '
103
+ 'for spot placement. All resources must have the same '
104
+ 'configuration except for cloud/region/zone.')
105
+ if r.cloud is None:
106
+ continue
107
+ cloud_str = str(r.cloud)
108
+ if r.region is None:
109
+ # Access defaultdict to create empty entry if it doesn't exist.
110
+ _ = location_requirements[cloud_str]
111
+ continue
112
+ if r.zone is None:
113
+ # Same as above.
114
+ _ = location_requirements[cloud_str][r.region]
115
+ continue
116
+ location_requirements[cloud_str][r.region].add(r.zone)
117
+
118
+ clouds_list: List[sky_clouds.Cloud] = []
119
+ for c in location_requirements.keys():
120
+ cloud_obj = registry.CLOUD_REGISTRY.from_str(c)
121
+ assert cloud_obj is not None
122
+ clouds_list.append(cloud_obj)
123
+ if not clouds_list:
124
+ # If the cloud list is empty, that means the user has no location
125
+ # related requirements. Then we start with all enabled clouds and
126
+ # all possible regions and zones.
127
+ clouds_list = sky_check.get_cached_enabled_clouds_or_refresh(
128
+ capability=sky_cloud.CloudCapability.COMPUTE,
129
+ raise_if_no_cloud_access=False)
130
+ for cloud in clouds_list:
131
+ # Create empty entry for each cloud.
132
+ _ = location_requirements[str(cloud)]
133
+
134
+ possible_locations = set()
135
+ for cloud in clouds_list:
136
+ feasible_resources: resources_utils.FeasibleResources = (
137
+ cloud.get_feasible_launchable_resources(empty_location_resources,
138
+ num_nodes=task.num_nodes))
139
+ for feasible in feasible_resources.resources_list:
140
+ # We set override_optimize_by_zone=True to force the provisioner
141
+ # to use zone-level provisioning. This is to get accurate location
142
+ # information.
143
+ launchables: List['resources_lib.Resources'] = (
144
+ resources_utils.make_launchables_for_valid_region_zones(
145
+ feasible, override_optimize_by_zone=True))
146
+ for launchable in launchables:
147
+ cloud_str = str(launchable.cloud)
148
+ region = launchable.region
149
+ zone = launchable.zone
150
+ if (cloud_str not in location_requirements and
151
+ location_requirements):
152
+ continue
153
+ # We need to use .get() here to avoid creating extra entries in
154
+ # location_requirements, and being treated as user's requirement
155
+ # in the following regions.
156
+ cloud_reqs = location_requirements.get(cloud_str, {})
157
+ if region not in cloud_reqs and cloud_reqs:
158
+ continue
159
+ region_reqs = cloud_reqs.get(region, set())
160
+ if zone not in region_reqs and region_reqs:
161
+ continue
162
+ possible_locations.add(Location.from_resources(launchable))
163
+
164
+ return list(possible_locations)
165
+
166
+
167
+ class SpotPlacer:
168
+ """Spot Placement specification."""
169
+
170
+ def __init__(self, task: 'task_lib.Task') -> None:
171
+ possible_locations = _get_possible_location_from_task(task)
172
+ logger.info(f'{len(possible_locations)} possible location candidates '
173
+ 'are enabled for spot placement.')
174
+ logger.debug(f'All possible locations: {possible_locations}')
175
+ self.location2status: Dict[Location, LocationStatus] = {
176
+ location: LocationStatus.ACTIVE for location in possible_locations
177
+ }
178
+ self.location2cost: Dict[Location, float] = {}
179
+ # Already checked there is only one resource in the task.
180
+ self.resources = list(task.resources)[0]
181
+ self.num_nodes = task.num_nodes
182
+
183
+ def __init_subclass__(cls, name: str, default: bool = False):
184
+ SPOT_PLACERS[name] = cls
185
+ if default:
186
+ global DEFAULT_SPOT_PLACER
187
+ assert DEFAULT_SPOT_PLACER is None, (
188
+ 'Only one policy can be default.')
189
+ DEFAULT_SPOT_PLACER = name
190
+
191
+ def select_next_location(self,
192
+ current_locations: List[Location]) -> Location:
193
+ """Select next location to place spot instance."""
194
+ raise NotImplementedError
195
+
196
+ def set_active(self, location: Location) -> None:
197
+ assert location in self.location2status, location
198
+ self.location2status[location] = LocationStatus.ACTIVE
199
+
200
+ def set_preemptive(self, location: Location) -> None:
201
+ assert location in self.location2status, location
202
+ self.location2status[location] = LocationStatus.PREEMPTED
203
+
204
+ def clear_preemptive_locations(self) -> None:
205
+ for location in self.location2status:
206
+ self.location2status[location] = LocationStatus.ACTIVE
207
+
208
+ def _min_cost_location(self, locations: List[Location]) -> Location:
209
+
210
+ def _get_cost_per_hour(location: Location) -> float:
211
+ if location in self.location2cost:
212
+ return self.location2cost[location]
213
+ # TODO(tian): Is there a better way to do this? This is for filling
214
+ # instance type so the get_cost() can operate normally.
215
+ r: 'resources_lib.Resources' = self.resources.copy(
216
+ **location.to_dict())
217
+ assert r.cloud is not None
218
+ rs = r.cloud.get_feasible_launchable_resources(
219
+ r, num_nodes=self.num_nodes).resources_list
220
+ # For some clouds, there might be multiple instance types
221
+ # satisfying the resource request. In such a case we choose the
222
+ # cheapest one, as the optimizer does. Reference:
223
+ # sky/optimizer.py::Optimizer::_print_candidates
224
+ cost = min(r.get_cost(seconds=3600) for r in rs)
225
+ self.location2cost[location] = cost
226
+ return cost
227
+
228
+ return min(locations, key=_get_cost_per_hour)
229
+
230
+ def _location_with_status(self, status: LocationStatus) -> List[Location]:
231
+ return [
232
+ location
233
+ for location, location_type in self.location2status.items()
234
+ if location_type == status
235
+ ]
236
+
237
+ def active_locations(self) -> List[Location]:
238
+ return self._location_with_status(LocationStatus.ACTIVE)
239
+
240
+ def preemptive_locations(self) -> List[Location]:
241
+ return self._location_with_status(LocationStatus.PREEMPTED)
242
+
243
+ @classmethod
244
+ def from_task(cls, spec: 'service_spec.SkyServiceSpec',
245
+ task: 'task_lib.Task') -> Optional['SpotPlacer']:
246
+ if spec.spot_placer is None:
247
+ return None
248
+ return SPOT_PLACERS[spec.spot_placer](task)
249
+
250
+
251
+ class DynamicFallbackSpotPlacer(SpotPlacer,
252
+ name=SPOT_HEDGE_PLACER,
253
+ default=True):
254
+ """Dynamic Fallback Placer."""
255
+
256
+ def select_next_location(self,
257
+ current_locations: List[Location]) -> Location:
258
+ active_locations = self.active_locations()
259
+ # Prioritize locations that are not currently used.
260
+ candidate_locations: List[Location] = [
261
+ location for location in active_locations
262
+ if location not in current_locations
263
+ ]
264
+ # If no candidate locations, use all active locations.
265
+ if not candidate_locations:
266
+ candidate_locations = active_locations
267
+ res = self._min_cost_location(candidate_locations)
268
+ logger.info(f'Active locations: {active_locations}\n'
269
+ f'Current locations: {current_locations}\n'
270
+ f'Candidate locations: {candidate_locations}\n'
271
+ f'Selected location: {res}\n')
272
+ return res
273
+
274
+ def set_preemptive(self, location: Location) -> None:
275
+ super().set_preemptive(location)
276
+ # Prevent the case with only one active location.
277
+ if len(self.active_locations()) < 2:
278
+ self.clear_preemptive_locations()
sky/server/common.py CHANGED
@@ -51,6 +51,11 @@ API_SERVER_CMD = '-m sky.server.server'
51
51
  API_SERVER_CLIENT_DIR = pathlib.Path('~/.sky/api_server/clients')
52
52
  RETRY_COUNT_ON_TIMEOUT = 3
53
53
 
54
+ # The maximum time to wait for the API server to start, set to a conservative
55
+ # value that is unlikely to be reached, since the server might just be starting slowly
56
+ # (e.g. in a high-contention environment) and we will exit eagerly if the server exits.
57
+ WAIT_APISERVER_START_TIMEOUT_SEC = 60
58
+
54
59
  SKY_API_VERSION_WARNING = (
55
60
  f'{colorama.Fore.YELLOW}SkyPilot API server is too old: '
56
61
  f'v{{server_version}} (client version is v{{client_version}}). '
@@ -179,7 +184,8 @@ def _start_api_server(deploy: bool = False,
179
184
  server_url = get_server_url(host)
180
185
  assert server_url in AVAILABLE_LOCAL_API_SERVER_URLS, (
181
186
  f'server url {server_url} is not a local url')
182
- with rich_utils.client_status('Starting SkyPilot API server'):
187
+ with rich_utils.client_status('Starting SkyPilot API server, '
188
+ f'view logs at {constants.API_SERVER_LOGS}'):
183
189
  logger.info(f'{colorama.Style.DIM}Failed to connect to '
184
190
  f'SkyPilot API server at {server_url}. '
185
191
  'Starting a local server.'
@@ -216,14 +222,16 @@ def _start_api_server(deploy: bool = False,
216
222
  # If this is called from a CLI invocation, we need
217
223
  # start_new_session=True so that SIGINT on the CLI will not also kill
218
224
  # the API server.
219
- subprocess.Popen(cmd, shell=True, start_new_session=True)
225
+ proc = subprocess.Popen(cmd, shell=True, start_new_session=True)
220
226
 
221
- # Wait for the server to start until timeout.
222
- # Conservative upper time bound for starting the server based on
223
- # profiling.
224
- timeout_sec = 12
225
227
  start_time = time.time()
226
228
  while True:
229
+ # Check if process has exited
230
+ if proc.poll() is not None:
231
+ with ux_utils.print_exception_no_traceback():
232
+ raise RuntimeError(
233
+ 'SkyPilot API server process exited unexpectedly.\n'
234
+ f'View logs at: {constants.API_SERVER_LOGS}')
227
235
  api_server_info = get_api_server_status()
228
236
  assert api_server_info.status != ApiServerStatus.VERSION_MISMATCH, (
229
237
  f'API server version mismatch when starting the server. '
@@ -231,7 +239,7 @@ def _start_api_server(deploy: bool = False,
231
239
  f'Client version: {server_constants.API_VERSION}')
232
240
  if api_server_info.status == ApiServerStatus.HEALTHY:
233
241
  break
234
- elif time.time() - start_time >= timeout_sec:
242
+ elif time.time() - start_time >= WAIT_APISERVER_START_TIMEOUT_SEC:
235
243
  with ux_utils.print_exception_no_traceback():
236
244
  raise RuntimeError(
237
245
  'Failed to start SkyPilot API server at '
@@ -465,7 +465,7 @@ def start(deploy: bool) -> List[multiprocessing.Process]:
465
465
  target=mp_queue.start_queue_manager, args=(queue_names, port))
466
466
  queue_server.start()
467
467
  sub_procs.append(queue_server)
468
- mp_queue.wait_for_queues_to_be_ready(queue_names, port=port)
468
+ mp_queue.wait_for_queues_to_be_ready(queue_names, queue_server, port)
469
469
 
470
470
  logger.info('Request queues created')
471
471
 
@@ -1,4 +1,5 @@
1
1
  """Shared queues for multiprocessing."""
2
+ import multiprocessing
2
3
  from multiprocessing import managers
3
4
  import queue
4
5
  import time
@@ -57,10 +58,13 @@ def get_queue(queue_name: str,
57
58
 
58
59
 
59
60
  def wait_for_queues_to_be_ready(queue_names: List[str],
61
+ queue_server: multiprocessing.Process,
60
62
  port: int = DEFAULT_QUEUE_MANAGER_PORT) -> None:
61
63
  """Wait for the queues to be ready after queue manager is just started."""
62
64
  initial_time = time.time()
63
- max_wait_time = 5
65
+ # Wait for queue manager to be ready. Exit eagerly if the manager process
66
+ # exits; otherwise just wait for a long timeout that is unlikely to be reached.
67
+ max_wait_time = 60
64
68
  while queue_names:
65
69
  try:
66
70
  get_queue(queue_names[0], port)
@@ -70,6 +74,9 @@ def wait_for_queues_to_be_ready(queue_names: List[str],
70
74
  logger.info(f'Waiting for request queue, named {queue_names[0]!r}, '
71
75
  f'to be ready...')
72
76
  time.sleep(0.2)
77
+ if not queue_server.is_alive():
78
+ raise RuntimeError(
79
+ 'Queue manager process exited unexpectedly.') from e
73
80
  if time.time() - initial_time > max_wait_time:
74
81
  raise RuntimeError(
75
82
  f'Request queue, named {queue_names[0]!r}, '
sky/server/server.py CHANGED
@@ -1116,7 +1116,7 @@ if __name__ == '__main__':
1116
1116
 
1117
1117
  sub_procs = []
1118
1118
  try:
1119
- sub_procs = executor.start(cmd_args.deploy)
1119
+ sub_procs = executor.start(deploy=cmd_args.deploy)
1120
1120
  logger.info(f'Starting SkyPilot API server, workers={num_workers}')
1121
1121
  # We don't support reload for now, since it may cause leakage of request
1122
1122
  # workers or interrupt running requests.
sky/utils/registry.py CHANGED
@@ -125,3 +125,5 @@ JOBS_RECOVERY_STRATEGY_REGISTRY: _Registry = (
125
125
  registry_name='jobs recovery strategy',
126
126
  exclude=None,
127
127
  type_register=True))
128
+
129
+ # TODO(tian): Add a registry for spot placer.