skypilot-nightly 1.0.0.dev20250329__py3-none-any.whl → 1.0.0.dev20250331__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sky/serve/server/core.py CHANGED
@@ -25,7 +25,6 @@ from sky.utils import command_runner
25
25
  from sky.utils import common
26
26
  from sky.utils import common_utils
27
27
  from sky.utils import controller_utils
28
- from sky.utils import resources_utils
29
28
  from sky.utils import rich_utils
30
29
  from sky.utils import subprocess_utils
31
30
  from sky.utils import ux_utils
@@ -33,74 +32,6 @@ from sky.utils import ux_utils
33
32
  logger = sky_logging.init_logger(__name__)
34
33
 
35
34
 
36
def _validate_service_task(task: 'sky.Task') -> None:
    """Check that a task is a well-formed Sky Serve service definition.

    The checks run in this order: spot usage must be uniform across all
    resources, a service section must be present, no resource may set
    ``job_recovery``, ``use_ondemand_fallback`` requires spot resources, and
    every resource must resolve to the same single replica ingress port.

    Args:
        task: sky.Task to validate

    Raises:
        ValueError: if the arguments are invalid.
        RuntimeError: if the task.service is not found.
    """
    # TODO(MaoZiming): Allow mixed on-demand and spot specification in resources
    # On-demand fallback should go to the resources specified as on-demand.
    num_spot = sum(1 for candidate in task.resources if candidate.use_spot)
    if num_spot != 0 and num_spot != len(task.resources):
        with ux_utils.print_exception_no_traceback():
            raise ValueError(
                'Resources must either all use spot or none use spot. '
                'To use on-demand and spot instances together, '
                'use `dynamic_ondemand_fallback` or set '
                'base_ondemand_fallback_replicas.')

    service = task.service
    if service is None:
        with ux_utils.print_exception_no_traceback():
            raise RuntimeError('Service section not found.')

    # Only used to tell the user what replaces a preempted spot replica.
    if service.dynamic_ondemand_fallback:
        policy_description = 'on-demand'
    else:
        policy_description = 'spot'
    if any(candidate.job_recovery is not None for candidate in task.resources):
        with ux_utils.print_exception_no_traceback():
            raise ValueError('job_recovery is disabled for SkyServe. '
                             'SkyServe will replenish preempted spot '
                             f'with {policy_description} instances.')

    # An explicit `service.ports` wins; otherwise the port is derived from
    # the resources below and must be the same for all of them.
    replica_ingress_port: Optional[int] = None
    if service.ports is not None:
        replica_ingress_port = int(service.ports)

    for requested_resources in task.resources:
        if service.use_ondemand_fallback and not requested_resources.use_spot:
            with ux_utils.print_exception_no_traceback():
                raise ValueError(
                    '`use_ondemand_fallback` is only supported '
                    'for spot resources. Please explicitly specify '
                    '`use_spot: true` in resources for on-demand fallback.')
        if service.ports is not None:
            continue

        requested_ports = list(
            resources_utils.port_ranges_to_set(requested_resources.ports))
        if len(requested_ports) != 1:
            with ux_utils.print_exception_no_traceback():
                raise ValueError(
                    'To open multiple ports on the replica, please set the '
                    '`service.ports` field to specify a main service port. '
                    'Must only specify one port in resources otherwise. '
                    'Each replica will use the port specified as '
                    'application ingress port.')
        service_port = requested_ports[0]
        if replica_ingress_port is None:
            replica_ingress_port = service_port
            continue
        if service_port != replica_ingress_port:
            with ux_utils.print_exception_no_traceback():
                raise ValueError(
                    f'Got multiple ports: {service_port} and '
                    f'{replica_ingress_port} in different resources. '
                    'Please specify the same port instead.')
-
103
-
104
35
  def _rewrite_tls_credential_paths_and_get_tls_env_vars(
105
36
  service_name: str, task: 'sky.Task') -> Dict[str, Any]:
106
37
  """Rewrite the paths of TLS credentials in the task.
@@ -113,7 +44,7 @@ def _rewrite_tls_credential_paths_and_get_tls_env_vars(
113
44
  The generated template variables for TLS.
114
45
  """
115
46
  service_spec = task.service
116
- # Already checked by _validate_service_task
47
+ # Already checked by validate_service_task
117
48
  assert service_spec is not None
118
49
  if service_spec.tls_credential is None:
119
50
  return {'use_tls': False}
@@ -166,7 +97,7 @@ def up(
166
97
  'only contains lower letters, numbers and dash): '
167
98
  f'{constants.CLUSTER_NAME_VALID_REGEX}')
168
99
 
169
- _validate_service_task(task)
100
+ serve_utils.validate_service_task(task)
170
101
  # Always apply the policy again here, even though it might have been applied
171
102
  # in the CLI. This is to ensure that we apply the policy to the final DAG
172
103
  # and get the mutated config.
@@ -322,7 +253,7 @@ def up(
322
253
  skip_status_check=True).get(lb_port)
323
254
  assert socket_endpoint is not None, (
324
255
  'Did not get endpoint for controller.')
325
- # Already checked by _validate_service_task
256
+ # Already checked by validate_service_task
326
257
  assert task.service is not None
327
258
  protocol = ('http'
328
259
  if task.service.tls_credential is None else 'https')
@@ -376,7 +307,7 @@ def update(
376
307
  mode: Update mode.
377
308
  """
378
309
  task.validate()
379
- _validate_service_task(task)
310
+ serve_utils.validate_service_task(task)
380
311
 
381
312
  # Always apply the policy again here, even though it might have been applied
382
313
  # in the CLI. This is to ensure that we apply the policy to the final DAG
sky/serve/service_spec.py CHANGED
@@ -10,6 +10,7 @@ from sky.adaptors import common as adaptors_common
10
10
  from sky.serve import constants
11
11
  from sky.serve import load_balancing_policies as lb_policies
12
12
  from sky.serve import serve_utils
13
+ from sky.serve import spot_placer as spot_placer_lib
13
14
  from sky.utils import common_utils
14
15
  from sky.utils import schemas
15
16
  from sky.utils import ux_utils
@@ -30,6 +31,7 @@ class SkyServiceSpec:
30
31
  readiness_timeout_seconds: int,
31
32
  min_replicas: int,
32
33
  max_replicas: Optional[int] = None,
34
+ num_overprovision: Optional[int] = None,
33
35
  ports: Optional[str] = None,
34
36
  target_qps_per_replica: Optional[float] = None,
35
37
  post_data: Optional[Dict[str, Any]] = None,
@@ -37,6 +39,7 @@ class SkyServiceSpec:
37
39
  readiness_headers: Optional[Dict[str, str]] = None,
38
40
  dynamic_ondemand_fallback: Optional[bool] = None,
39
41
  base_ondemand_fallback_replicas: Optional[int] = None,
42
+ spot_placer: Optional[str] = None,
40
43
  upscale_delay_seconds: Optional[int] = None,
41
44
  downscale_delay_seconds: Optional[int] = None,
42
45
  load_balancing_policy: Optional[str] = None,
@@ -78,6 +81,7 @@ class SkyServiceSpec:
78
81
  self._readiness_timeout_seconds: int = readiness_timeout_seconds
79
82
  self._min_replicas: int = min_replicas
80
83
  self._max_replicas: Optional[int] = max_replicas
84
+ self._num_overprovision: Optional[int] = num_overprovision
81
85
  self._ports: Optional[str] = ports
82
86
  self._target_qps_per_replica: Optional[float] = target_qps_per_replica
83
87
  self._post_data: Optional[Dict[str, Any]] = post_data
@@ -88,6 +92,7 @@ class SkyServiceSpec:
88
92
  bool] = dynamic_ondemand_fallback
89
93
  self._base_ondemand_fallback_replicas: Optional[
90
94
  int] = base_ondemand_fallback_replicas
95
+ self._spot_placer: Optional[str] = spot_placer
91
96
  self._upscale_delay_seconds: Optional[int] = upscale_delay_seconds
92
97
  self._downscale_delay_seconds: Optional[int] = downscale_delay_seconds
93
98
  self._load_balancing_policy: Optional[str] = load_balancing_policy
@@ -161,6 +166,7 @@ class SkyServiceSpec:
161
166
  min_replicas = constants.DEFAULT_MIN_REPLICAS
162
167
  service_config['min_replicas'] = min_replicas
163
168
  service_config['max_replicas'] = None
169
+ service_config['num_overprovision'] = None
164
170
  service_config['target_qps_per_replica'] = None
165
171
  service_config['upscale_delay_seconds'] = None
166
172
  service_config['downscale_delay_seconds'] = None
@@ -168,6 +174,8 @@ class SkyServiceSpec:
168
174
  service_config['min_replicas'] = policy_section['min_replicas']
169
175
  service_config['max_replicas'] = policy_section.get(
170
176
  'max_replicas', None)
177
+ service_config['num_overprovision'] = policy_section.get(
178
+ 'num_overprovision', None)
171
179
  service_config['target_qps_per_replica'] = policy_section.get(
172
180
  'target_qps_per_replica', None)
173
181
  service_config['upscale_delay_seconds'] = policy_section.get(
@@ -179,6 +187,8 @@ class SkyServiceSpec:
179
187
  'base_ondemand_fallback_replicas', None)
180
188
  service_config['dynamic_ondemand_fallback'] = policy_section.get(
181
189
  'dynamic_ondemand_fallback', None)
190
+ service_config['spot_placer'] = policy_section.get(
191
+ 'spot_placer', None)
182
192
 
183
193
  service_config['load_balancing_policy'] = config.get(
184
194
  'load_balancing_policy', None)
@@ -238,12 +248,15 @@ class SkyServiceSpec:
238
248
  add_if_not_none('readiness_probe', 'headers', self._readiness_headers)
239
249
  add_if_not_none('replica_policy', 'min_replicas', self.min_replicas)
240
250
  add_if_not_none('replica_policy', 'max_replicas', self.max_replicas)
251
+ add_if_not_none('replica_policy', 'num_overprovision',
252
+ self.num_overprovision)
241
253
  add_if_not_none('replica_policy', 'target_qps_per_replica',
242
254
  self.target_qps_per_replica)
243
255
  add_if_not_none('replica_policy', 'dynamic_ondemand_fallback',
244
256
  self.dynamic_ondemand_fallback)
245
257
  add_if_not_none('replica_policy', 'base_ondemand_fallback_replicas',
246
258
  self.base_ondemand_fallback_replicas)
259
+ add_if_not_none('replica_policy', 'spot_placer', self.spot_placer)
247
260
  add_if_not_none('replica_policy', 'upscale_delay_seconds',
248
261
  self.upscale_delay_seconds)
249
262
  add_if_not_none('replica_policy', 'downscale_delay_seconds',
@@ -269,6 +282,9 @@ class SkyServiceSpec:
269
282
  policy_strs: List[str] = []
270
283
  if (self.dynamic_ondemand_fallback is not None and
271
284
  self.dynamic_ondemand_fallback):
285
+ if self.spot_placer is not None:
286
+ if self.spot_placer == spot_placer_lib.SPOT_HEDGE_PLACER:
287
+ return 'SpotHedge'
272
288
  policy_strs.append('Dynamic on-demand fallback')
273
289
  if self.base_ondemand_fallback_replicas is not None:
274
290
  policy_strs.append(
@@ -281,8 +297,12 @@ class SkyServiceSpec:
281
297
  policy_strs.append('Static spot mixture with '
282
298
  f'{self.base_ondemand_fallback_replicas} '
283
299
  f'base on-demand replica{plural}')
300
+ if self.spot_placer is not None:
301
+ if not policy_strs:
302
+ policy_strs.append('Spot placement')
303
+ policy_strs.append(f'with {self.spot_placer} placer')
284
304
  if not policy_strs:
285
- return 'No spot fallback policy'
305
+ return 'No spot policy'
286
306
  return ' '.join(policy_strs)
287
307
 
288
308
  def autoscaling_policy_str(self):
@@ -294,9 +314,13 @@ class SkyServiceSpec:
294
314
  assert self.target_qps_per_replica is not None
295
315
  # TODO(tian): Refactor to contain more information
296
316
  max_plural = '' if self.max_replicas == 1 else 's'
317
+ overprovision_str = ''
318
+ if self.num_overprovision is not None:
319
+ overprovision_str = (
320
+ f' with {self.num_overprovision} overprovisioned replicas')
297
321
  return (f'Autoscaling from {self.min_replicas} to {self.max_replicas} '
298
- f'replica{max_plural} (target QPS per replica: '
299
- f'{self.target_qps_per_replica})')
322
+ f'replica{max_plural}{overprovision_str} (target QPS per '
323
+ f'replica: {self.target_qps_per_replica})')
300
324
 
301
325
  def set_ports(self, ports: str) -> None:
302
326
  self._ports = ports
@@ -339,6 +363,10 @@ class SkyServiceSpec:
339
363
  # If None, treated as having the same value of min_replicas.
340
364
  return self._max_replicas
341
365
 
366
+ @property
367
+ def num_overprovision(self) -> Optional[int]:
368
+ return self._num_overprovision
369
+
342
370
  @property
343
371
  def ports(self) -> Optional[str]:
344
372
  return self._ports
@@ -372,6 +400,10 @@ class SkyServiceSpec:
372
400
  def dynamic_ondemand_fallback(self) -> Optional[bool]:
373
401
  return self._dynamic_ondemand_fallback
374
402
 
403
+ @property
404
+ def spot_placer(self) -> Optional[str]:
405
+ return self._spot_placer
406
+
375
407
  @property
376
408
  def upscale_delay_seconds(self) -> Optional[int]:
377
409
  return self._upscale_delay_seconds
@@ -0,0 +1,278 @@
1
+ """Spot Placer for SpotHedge."""
2
+
3
+ import collections
4
+ import dataclasses
5
+ import enum
6
+ import typing
7
+ from typing import Any, Dict, List, Optional, Set
8
+
9
+ from sky import check as sky_check
10
+ from sky import clouds as sky_clouds
11
+ from sky import sky_logging
12
+ from sky.clouds import cloud as sky_cloud
13
+ from sky.utils import registry
14
+ from sky.utils import resources_utils
15
+ from sky.utils import ux_utils
16
+
17
+ if typing.TYPE_CHECKING:
18
+ from sky import resources as resources_lib
19
+ from sky import task as task_lib
20
+ from sky.serve import service_spec
21
+
22
+ logger = sky_logging.init_logger(__name__)
23
+
24
+ SPOT_PLACERS = {}
25
+ DEFAULT_SPOT_PLACER = None
26
+ SPOT_HEDGE_PLACER = 'dynamic_fallback'
27
+
28
+
29
@dataclasses.dataclass
class Location:
    """A cloud/region/zone triple describing where a spot instance lives.

    Equality delegates the cloud comparison to ``Cloud.is_same_cloud`` and
    the hash is derived from the string form of the cloud, so instances can
    be used as dictionary keys and set members.
    """
    cloud: 'sky_clouds.Cloud'
    region: str
    zone: Optional[str]

    def __eq__(self, other) -> bool:
        # Anything that is not a Location never compares equal.
        if not isinstance(other, Location):
            return False
        return (self.cloud.is_same_cloud(other.cloud) and
                self.region == other.region and self.zone == other.zone)

    def __hash__(self) -> int:
        # A missing zone contributes an empty string to the hashed key.
        zone_part = '' if self.zone is None else self.zone
        return hash(str(self.cloud) + self.region + zone_part)

    @classmethod
    def from_resources(cls, resources: 'resources_lib.Resources') -> 'Location':
        """Build a Location from the cloud/region/zone of ``resources``."""
        return cls(resources.cloud, resources.region, resources.zone)

    def to_dict(self) -> Dict[str, Any]:
        """Return the three fields as a dict; ``cloud`` stays an object."""
        return {'cloud': self.cloud, 'region': self.region, 'zone': self.zone}

    @classmethod
    def from_pickleable(
        cls,
        data: Optional[Dict[str, Optional[str]]],
    ) -> Optional['Location']:
        """Inverse of ``to_pickleable``; ``None`` is passed through as-is."""
        if data is None:
            return None
        # The cloud is stored by name and resolved through the registry.
        resolved_cloud = registry.CLOUD_REGISTRY.from_str(data['cloud'])
        assert resolved_cloud is not None
        region_name = data['region']
        assert region_name is not None
        return cls(cloud=resolved_cloud, region=region_name, zone=data['zone'])

    def to_pickleable(self) -> Dict[str, Optional[str]]:
        """Return a plain-string dict, with the cloud rendered via ``str``."""
        return {
            'cloud': str(self.cloud),
            'region': self.region,
            'zone': self.zone,
        }
76
+
77
+
78
class LocationStatus(enum.Enum):
    """Spot status recorded for each candidate location."""
    # Status assigned by SpotPlacer.set_active and on (re)initialization.
    ACTIVE = 'ACTIVE'
    # Status assigned by SpotPlacer.set_preemptive.
    PREEMPTED = 'PREEMPTED'
82
+
83
+
84
def _get_possible_location_from_task(task: 'task_lib.Task') -> List[Location]:
    """Enumerate the locations where the task's resources could be launched.

    The cloud/region/zone values found on ``task.resources`` are collected as
    location requirements. Every launchable produced for the location-less
    form of the resources is then kept only if it satisfies them. When no
    resource names a cloud, all cached enabled compute clouds are considered.

    Args:
        task: The task whose resources define the candidate locations.

    Returns:
        A de-duplicated list of candidate locations (order not guaranteed,
        since it is built from a set).

    Raises:
        ValueError: If the resources differ in anything other than
            cloud/region/zone.
    """

    def _without_location(
            resources: 'resources_lib.Resources') -> 'resources_lib.Resources':
        # Strip location fields so configurations can be compared regardless
        # of where they are pinned.
        return resources.copy(cloud=None, region=None, zone=None)

    assert task.resources  # Guaranteed in task constructor
    empty_location_resources = _without_location(list(task.resources)[0])
    empty_location_resources_config = empty_location_resources.to_yaml_config()

    # cloud name -> region name -> set of zone names. An empty inner
    # container means "no restriction at that level".
    location_requirements: Dict[str, Dict[str, Set[str]]] = (
        collections.defaultdict(lambda: collections.defaultdict(set)))

    for r in task.resources:
        if (_without_location(r).to_yaml_config() !=
                empty_location_resources_config):
            with ux_utils.print_exception_no_traceback():
                raise ValueError(
                    'Different resource configurations are not supported '
                    'for spot placement. All resources must have the same '
                    'configuration except for cloud/region/zone.')
        if r.cloud is None:
            continue
        cloud_str = str(r.cloud)
        if r.region is None:
            # Access defaultdict to create empty entry if it doesn't exist.
            _ = location_requirements[cloud_str]
            continue
        if r.zone is None:
            # Same as above.
            _ = location_requirements[cloud_str][r.region]
            continue
        location_requirements[cloud_str][r.region].add(r.zone)

    # Resolve the required cloud names back into cloud objects.
    clouds_list: List[sky_clouds.Cloud] = []
    for c in location_requirements.keys():
        cloud_obj = registry.CLOUD_REGISTRY.from_str(c)
        assert cloud_obj is not None
        clouds_list.append(cloud_obj)
    if not clouds_list:
        # If the cloud list is empty, that means the user has no location
        # related requirements. Then we start with all enabled clouds and
        # all possible regions and zones.
        clouds_list = sky_check.get_cached_enabled_clouds_or_refresh(
            capability=sky_cloud.CloudCapability.COMPUTE,
            raise_if_no_cloud_access=False)
        for cloud in clouds_list:
            # Create empty entry for each cloud.
            _ = location_requirements[str(cloud)]

    # A set, so the same location reached through different launchables is
    # only reported once.
    possible_locations = set()
    for cloud in clouds_list:
        feasible_resources: resources_utils.FeasibleResources = (
            cloud.get_feasible_launchable_resources(empty_location_resources,
                                                    num_nodes=task.num_nodes))
        for feasible in feasible_resources.resources_list:
            # We set override_optimize_by_zone=True to force the provisioner
            # to use zone-level provisioning. This is to get accurate location
            # information.
            launchables: List['resources_lib.Resources'] = (
                resources_utils.make_launchables_for_valid_region_zones(
                    feasible, override_optimize_by_zone=True))
            for launchable in launchables:
                cloud_str = str(launchable.cloud)
                region = launchable.region
                zone = launchable.zone
                # Skip launchables on a cloud that has no requirement entry.
                if (cloud_str not in location_requirements and
                        location_requirements):
                    continue
                # We need to use .get() here to avoid creating extra entries in
                # location_requirements, and being treated as user's requirement
                # in the following regions.
                cloud_reqs = location_requirements.get(cloud_str, {})
                # An empty cloud_reqs places no restriction on the region.
                if region not in cloud_reqs and cloud_reqs:
                    continue
                region_reqs = cloud_reqs.get(region, set())
                # An empty region_reqs places no restriction on the zone.
                if zone not in region_reqs and region_reqs:
                    continue
                possible_locations.add(Location.from_resources(launchable))

    return list(possible_locations)
165
+
166
+
167
class SpotPlacer:
    """Spot Placement specification.

    Keeps an ACTIVE/PREEMPTED status for every candidate location of a task
    and a per-location cache of hourly cost. Concrete placement policies
    subclass this class and register themselves in ``SPOT_PLACERS`` through
    the ``name`` class keyword (see ``__init_subclass__``).
    """

    def __init__(self, task: 'task_lib.Task') -> None:
        """Collect the candidate locations of ``task``; all start ACTIVE."""
        possible_locations = _get_possible_location_from_task(task)
        logger.info(f'{len(possible_locations)} possible location candidates '
                    'are enabled for spot placement.')
        logger.debug(f'All possible locations: {possible_locations}')
        self.location2status: Dict[Location, LocationStatus] = {
            location: LocationStatus.ACTIVE for location in possible_locations
        }
        # Hourly cost per location, filled lazily by _min_cost_location.
        self.location2cost: Dict[Location, float] = {}
        # _get_possible_location_from_task already verified that all
        # resources are identical apart from cloud/region/zone, so the first
        # one is representative.
        self.resources = list(task.resources)[0]
        self.num_nodes = task.num_nodes

    def __init_subclass__(cls, name: str, default: bool = False, **kwargs):
        """Register the subclass under ``name``; optionally as the default.

        Args:
            name: Key under which the subclass is stored in SPOT_PLACERS.
            default: Whether this subclass is the default placer. At most
                one subclass may set this.
            **kwargs: Forwarded to the parent ``__init_subclass__`` so that
                cooperative class keywords keep working.
        """
        super().__init_subclass__(**kwargs)
        SPOT_PLACERS[name] = cls
        if default:
            global DEFAULT_SPOT_PLACER
            assert DEFAULT_SPOT_PLACER is None, (
                'Only one policy can be default.')
            DEFAULT_SPOT_PLACER = name

    def select_next_location(self,
                             current_locations: List[Location]) -> Location:
        """Select next location to place spot instance."""
        raise NotImplementedError

    def set_active(self, location: Location) -> None:
        """Mark a known location as ACTIVE."""
        assert location in self.location2status, location
        self.location2status[location] = LocationStatus.ACTIVE

    def set_preemptive(self, location: Location) -> None:
        """Mark a known location as PREEMPTED."""
        assert location in self.location2status, location
        self.location2status[location] = LocationStatus.PREEMPTED

    def clear_preemptive_locations(self) -> None:
        """Reset every tracked location back to ACTIVE."""
        for location in self.location2status:
            self.location2status[location] = LocationStatus.ACTIVE

    def _min_cost_location(self, locations: List[Location]) -> Location:
        """Return the location in ``locations`` with the lowest hourly cost.

        Raises:
            ValueError: If ``locations`` is empty, or if no feasible
                launchable resources exist for one of the locations.
        """

        def _get_cost_per_hour(location: Location) -> float:
            if location in self.location2cost:
                return self.location2cost[location]
            # TODO(tian): Is there a better way to do this? This is for filling
            # instance type so the get_cost() can operate normally.
            r: 'resources_lib.Resources' = self.resources.copy(
                **location.to_dict())
            assert r.cloud is not None
            rs = r.cloud.get_feasible_launchable_resources(
                r, num_nodes=self.num_nodes).resources_list
            if not rs:
                # Same exception type that min() would raise on an empty
                # sequence, but with an actionable message.
                raise ValueError(
                    'No feasible launchable resources found for location '
                    f'{location}; cannot estimate its cost.')
            # For some clouds, there might be multiple instance types
            # satisfying the resource request. In such case we choose the
            # cheapest one, as the optimizer does. Reference:
            # sky/optimizer.py::Optimizer::_print_candidates
            cost = min(feasible.get_cost(seconds=3600) for feasible in rs)
            self.location2cost[location] = cost
            return cost

        return min(locations, key=_get_cost_per_hour)

    def _location_with_status(self, status: LocationStatus) -> List[Location]:
        """Return all tracked locations whose status equals ``status``."""
        return [
            location
            for location, location_type in self.location2status.items()
            if location_type == status
        ]

    def active_locations(self) -> List[Location]:
        """Return the locations currently marked ACTIVE."""
        return self._location_with_status(LocationStatus.ACTIVE)

    def preemptive_locations(self) -> List[Location]:
        """Return the locations currently marked PREEMPTED."""
        return self._location_with_status(LocationStatus.PREEMPTED)

    @classmethod
    def from_task(cls, spec: 'service_spec.SkyServiceSpec',
                  task: 'task_lib.Task') -> Optional['SpotPlacer']:
        """Instantiate the placer named by ``spec.spot_placer``.

        Args:
            spec: Service spec whose ``spot_placer`` names the policy.
            task: Task handed to the placer constructor.

        Returns:
            The placer instance, or None when the spec names no placer.

        Raises:
            KeyError: If the name matches no registered placer.
        """
        if spec.spot_placer is None:
            return None
        placer_name = spec.spot_placer
        if placer_name not in SPOT_PLACERS:
            # The service schema declares `spot_placer` as a case-insensitive
            # enum, so a differently-cased name can reach this point. Fall
            # back to a case-insensitive match before giving up.
            lowered = placer_name.lower()
            placer_name = next(
                (registered for registered in SPOT_PLACERS
                 if registered.lower() == lowered), placer_name)
        return SPOT_PLACERS[placer_name](task)
249
+
250
+
251
class DynamicFallbackSpotPlacer(SpotPlacer,
                                name=SPOT_HEDGE_PLACER,
                                default=True):
    """Placer that spreads replicas over the cheapest active locations."""

    def select_next_location(self,
                             current_locations: List[Location]) -> Location:
        """Pick the cheapest active location, preferring unused ones.

        Args:
            current_locations: Locations that are already in use.

        Returns:
            The minimum-cost location among the active locations not in
            ``current_locations``, or among all active locations when every
            active location is already in use.
        """
        active = self.active_locations()
        # Locations not yet in use get priority; otherwise fall back to
        # every active location.
        unused: List[Location] = [
            loc for loc in active if loc not in current_locations
        ]
        candidates = unused if unused else active
        chosen = self._min_cost_location(candidates)
        logger.info(f'Active locations: {active}\n'
                    f'Current locations: {current_locations}\n'
                    f'Candidate locations: {candidates}\n'
                    f'Selected location: {chosen}\n')
        return chosen

    def set_preemptive(self, location: Location) -> None:
        """Mark ``location`` preempted, keeping at least two active ones.

        If fewer than two locations remain active after the update, all
        locations are reset to active.
        """
        super().set_preemptive(location)
        if len(self.active_locations()) >= 2:
            return
        # Prevent the case with only one active location.
        self.clear_preemptive_locations()
sky/utils/registry.py CHANGED
@@ -125,3 +125,5 @@ JOBS_RECOVERY_STRATEGY_REGISTRY: _Registry = (
125
125
  registry_name='jobs recovery strategy',
126
126
  exclude=None,
127
127
  type_register=True))
128
+
129
+ # TODO(tian): Add a registry for spot placer.
@@ -216,3 +216,53 @@ def need_to_query_reservations() -> bool:
216
216
  cloud_prioritize_reservations):
217
217
  return True
218
218
  return False
219
+
220
+
221
def make_launchables_for_valid_region_zones(
    launchable_resources: 'resources_lib.Resources',
    override_optimize_by_zone: bool = False,
) -> List['resources_lib.Resources']:
    """Expand launchable resources into per-region or per-zone copies.

    Args:
        launchable_resources: Resources that must already be launchable.
        override_optimize_by_zone: Force zone-level expansion wherever the
            region exposes zone information, regardless of what the cloud's
            ``optimize_by_zone()`` reports.

    Returns:
        One copy per zone for regions that are expanded at zone level, and
        one copy per region otherwise, in region order.
    """
    assert launchable_resources.is_launchable()
    # In principle, all provisioning requests should be made at the granularity
    # of a single zone. However, for on-demand instances, we batch the requests
    # to the zones in the same region in order to leverage the region-level
    # provisioning APIs of AWS and Azure. This way, we can reduce the number of
    # API calls, and thus the overall failover time. Note that this optimization
    # does not affect the user cost since the clouds charge the same prices for
    # on-demand instances in the same region regardless of the zones. On the
    # other hand, for spot instances, we do not batch the requests because the
    # "AWS" spot prices may vary across zones.
    # For GCP, we do not batch the requests because GCP reservation system is
    # zone based. Therefore, price estimation is potentially different across
    # zones.

    # NOTE(woosuk): GCP does not support region-level provisioning APIs. Thus,
    # while we return per-region resources here, the provisioner will still
    # issue the request for one zone at a time.
    # NOTE(woosuk): If we support Azure spot instances, we should batch the
    # requests since Azure spot prices are region-level.
    # TODO(woosuk): Batch the per-zone AWS spot instance requests if they are
    # in the same region and have the same price.
    # TODO(woosuk): A better design is to implement batching at a higher level
    # (e.g., in provisioner or optimizer), not here.
    expanded: List['resources_lib.Resources'] = []
    for candidate in launchable_resources.get_valid_regions_for_launchable():
        per_zone = (override_optimize_by_zone or
                    launchable_resources.cloud.optimize_by_zone())
        # It is possible that we force the optimize_by_zone but some clouds
        # do not support zone-level provisioning (i.e. Azure). So we check
        # if there is zone-level information in the region first.
        if candidate.zones is None or not (launchable_resources.use_spot or
                                           per_zone):
            # On-demand instances.
            # Batch the requests at the granularity of a single region.
            expanded.append(launchable_resources.copy(region=candidate.name))
            continue
        # Spot instances.
        # Do not batch the per-zone requests.
        expanded.extend(
            launchable_resources.copy(region=candidate.name, zone=zone.name)
            for zone in candidate.zones)
    return expanded
sky/utils/schemas.py CHANGED
@@ -310,6 +310,7 @@ def get_service_schema():
310
310
  # To avoid circular imports, only import when needed.
311
311
  # pylint: disable=import-outside-toplevel
312
312
  from sky.serve import load_balancing_policies
313
+ from sky.serve import spot_placer
313
314
  return {
314
315
  '$schema': 'https://json-schema.org/draft/2020-12/schema',
315
316
  'type': 'object',
@@ -362,6 +363,10 @@ def get_service_schema():
362
363
  'type': 'integer',
363
364
  'minimum': 0,
364
365
  },
366
+ 'num_overprovision': {
367
+ 'type': 'integer',
368
+ 'minimum': 0,
369
+ },
365
370
  'target_qps_per_replica': {
366
371
  'type': 'number',
367
372
  'minimum': 0,
@@ -373,6 +378,11 @@ def get_service_schema():
373
378
  'type': 'integer',
374
379
  'minimum': 0,
375
380
  },
381
+ 'spot_placer': {
382
+ 'type': 'string',
383
+ 'case_insensitive_enum': list(
384
+ spot_placer.SPOT_PLACERS.keys())
385
+ },
376
386
  'upscale_delay_seconds': {
377
387
  'type': 'number',
378
388
  },
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: skypilot-nightly
3
- Version: 1.0.0.dev20250329
3
+ Version: 1.0.0.dev20250331
4
4
  Summary: SkyPilot: An intercloud broker for the clouds
5
5
  Author: SkyPilot Team
6
6
  License: Apache 2.0