skypilot-nightly 1.0.0.dev20250329__py3-none-any.whl → 1.0.0.dev20250331__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sky/__init__.py CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
 import urllib.request
 
 # Replaced with the current commit when building the wheels.
-_SKYPILOT_COMMIT_SHA = '9e6cef22ae679a1fef4539cbfc02a6d5cdc405ed'
+_SKYPILOT_COMMIT_SHA = 'f90ccc1757680ccbff2fb8d86fc6dfd4242bd182'
 
 
 def _get_git_commit():
@@ -35,7 +35,7 @@ def _get_git_commit():
 
 
 __commit__ = _get_git_commit()
-__version__ = '1.0.0.dev20250329'
+__version__ = '1.0.0.dev20250331'
 __root_dir__ = os.path.dirname(os.path.abspath(__file__))
 
 
sky/cli.py CHANGED
@@ -3943,6 +3943,7 @@ def jobs_launch(
               required=False,
               help='Show jobs from all users.')
 @click.option('--all',
+              '-a',
               default=False,
               is_flag=True,
               required=False,
@@ -4394,6 +4395,7 @@ def serve_up(
     )
     click.secho('Service spec:', fg='cyan')
     click.echo(task.service)
+    serve_lib.validate_service_task(task)
 
     click.secho('Each replica will use the following resources (estimated):',
                 fg='cyan')
@@ -4493,6 +4495,7 @@ def serve_update(service_name: str, service_yaml: Tuple[str, ...],
     )
     click.secho('Service spec:', fg='cyan')
     click.echo(task.service)
+    serve_lib.validate_service_task(task)
 
     click.secho('New replica will use the following resources (estimated):',
                 fg='cyan')
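
The `'-a'` addition above registers a short alias for the existing `--all` flag. A minimal standalone sketch of the same click pattern (the command and parameter names here are illustrative, not taken from this diff):

    import click

    @click.command()
    @click.option('--all', '-a', 'show_all', is_flag=True, default=False,
                  required=False, help='Show jobs from all users.')
    def queue(show_all: bool):
        # After this change, both `-a` and `--all` set show_all=True.
        click.echo(f'show_all={show_all}')

    if __name__ == '__main__':
        queue()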
sky/client/cli.py CHANGED
@@ -3943,6 +3943,7 @@ def jobs_launch(
               required=False,
               help='Show jobs from all users.')
 @click.option('--all',
+              '-a',
               default=False,
               is_flag=True,
               required=False,
@@ -4394,6 +4395,7 @@ def serve_up(
     )
     click.secho('Service spec:', fg='cyan')
     click.echo(task.service)
+    serve_lib.validate_service_task(task)
 
     click.secho('Each replica will use the following resources (estimated):',
                 fg='cyan')
@@ -4493,6 +4495,7 @@ def serve_update(service_name: str, service_yaml: Tuple[str, ...],
     )
     click.secho('Service spec:', fg='cyan')
     click.echo(task.service)
+    serve_lib.validate_service_task(task)
 
     click.secho('New replica will use the following resources (estimated):',
                 fg='cyan')
sky/data/storage_utils.py CHANGED
@@ -18,12 +18,6 @@ from sky.utils import log_utils
 
 logger = sky_logging.init_logger(__name__)
 
-_FILE_EXCLUSION_FROM_GITIGNORE_FAILURE_MSG = (
-    f'{colorama.Fore.YELLOW}Warning: Files/dirs '
-    'specified in .gitignore will be uploaded '
-    'to the cloud storage for {path!r}'
-    'due to the following error: {error_msg!r}')
-
 _USE_SKYIGNORE_HINT = (
     'To avoid using .gitignore, you can create a .skyignore file instead.')
 
@@ -172,7 +166,7 @@ def get_excluded_files_from_gitignore(src_dir_path: str) -> List[str]:
     submodules = submodules_output.stdout.split('\0')[:-1]
 
     # The empty string is the relative reference to the src_dir_path.
-    all_git_repos = ['.'] + [
+    all_git_repos = [''] + [
         # We only care about submodules that are a subdirectory of src_dir_path.
         submodule for submodule in submodules if not submodule.startswith('../')
     ]
@@ -208,7 +202,7 @@ def get_excluded_files_from_gitignore(src_dir_path: str) -> List[str]:
 
     for item in output_list:
 
-        if repo == '.' and item == './':
+        if repo == '' and item == './':
             logger.warning(f'{src_dir_path} is within a git repo, but the '
                            'entire directory is ignored by git. We will '
                            'ignore all git exclusions. '
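
The `'.'` to `''` switch matters because these entries are relative references that are later joined against `src_dir_path`; the empty string resolves to the directory itself rather than adding a trailing `.` component. A quick stdlib illustration (assuming `os.path.join`-style joining, which is not shown in this hunk):

    import os

    src_dir_path = '/home/user/project'
    print(os.path.join(src_dir_path, '.'))  # /home/user/project/.
    print(os.path.join(src_dir_path, ''))   # /home/user/project/

The `repo == ''` comparison in the second hunk is updated to match the new sentinel value.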
sky/optimizer.py CHANGED
@@ -1140,50 +1140,6 @@ class DummyCloud(clouds.Cloud):
     pass
 
 
-def _make_launchables_for_valid_region_zones(
-    launchable_resources: resources_lib.Resources
-) -> List[resources_lib.Resources]:
-    assert launchable_resources.is_launchable()
-    # In principle, all provisioning requests should be made at the granularity
-    # of a single zone. However, for on-demand instances, we batch the requests
-    # to the zones in the same region in order to leverage the region-level
-    # provisioning APIs of AWS and Azure. This way, we can reduce the number of
-    # API calls, and thus the overall failover time. Note that this optimization
-    # does not affect the user cost since the clouds charge the same prices for
-    # on-demand instances in the same region regardless of the zones. On the
-    # other hand, for spot instances, we do not batch the requests because the
-    # AWS spot prices may vary across zones.
-    # For GCP, we do not batch the requests because GCP reservation system is
-    # zone based. Therefore, price estimation is potentially different across
-    # zones.
-
-    # NOTE(woosuk): GCP does not support region-level provisioning APIs. Thus,
-    # while we return per-region resources here, the provisioner will still
-    # issue the request for one zone at a time.
-    # NOTE(woosuk): If we support Azure spot instances, we should batch the
-    # requests since Azure spot prices are region-level.
-    # TODO(woosuk): Batch the per-zone AWS spot instance requests if they are
-    # in the same region and have the same price.
-    # TODO(woosuk): A better design is to implement batching at a higher level
-    # (e.g., in provisioner or optimizer), not here.
-    launchables = []
-    regions = launchable_resources.get_valid_regions_for_launchable()
-    for region in regions:
-        if (launchable_resources.use_spot and region.zones is not None or
-                launchable_resources.cloud.optimize_by_zone()):
-            # Spot instances.
-            # Do not batch the per-zone requests.
-            for zone in region.zones:
-                launchables.append(
-                    launchable_resources.copy(region=region.name,
-                                              zone=zone.name))
-        else:
-            # On-demand instances.
-            # Batch the requests at the granularity of a single region.
-            launchables.append(launchable_resources.copy(region=region.name))
-    return launchables
-
-
 def _filter_out_blocked_launchable_resources(
         launchable_resources: Iterable[resources_lib.Resources],
         blocked_resources: Iterable[resources_lib.Resources]):
@@ -1313,7 +1269,8 @@ def _fill_in_launchable_resources(
             cheapest = feasible_resources.resources_list[0]
             # Generate region/zone-specified resources.
             launchable[resources].extend(
-                _make_launchables_for_valid_region_zones(cheapest))
+                resources_utils.make_launchables_for_valid_region_zones(
+                    cheapest))
             cloud_candidates[cloud] = feasible_resources.resources_list
         else:
             all_fuzzy_candidates.update(
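
The deleted helper now lives in `resources_utils` with the same policy: spot resources (or zone-optimized clouds) expand to one launchable per zone, while on-demand batches to one launchable per region. A toy sketch of that expansion, using stand-in types rather than skypilot's real Resources/Region classes:

    from dataclasses import dataclass
    from typing import List, Optional

    @dataclass
    class Zone:
        name: str

    @dataclass
    class Region:
        name: str
        zones: Optional[List[Zone]]

    def expand(regions: List[Region], use_spot: bool) -> List[str]:
        launchables = []
        for region in regions:
            if use_spot and region.zones is not None:
                # Spot: per-zone requests, since spot prices vary by zone.
                launchables.extend(
                    f'{region.name}/{zone.name}' for zone in region.zones)
            else:
                # On-demand: one region-level request (same price in all zones).
                launchables.append(region.name)
        return launchables

    regions = [Region('us-east-1', [Zone('a'), Zone('b')])]
    print(expand(regions, use_spot=True))   # ['us-east-1/a', 'us-east-1/b']
    print(expand(regions, use_spot=False))  # ['us-east-1']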
sky/serve/__init__.py CHANGED
@@ -21,6 +21,7 @@ from sky.serve.serve_utils import generate_service_name
 from sky.serve.serve_utils import ServeCodeGen
 from sky.serve.serve_utils import ServiceComponent
 from sky.serve.serve_utils import UpdateMode
+from sky.serve.serve_utils import validate_service_task
 from sky.serve.service_spec import SkyServiceSpec
 
 os.makedirs(os.path.expanduser(SKYSERVE_METADATA_DIR), exist_ok=True)
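
With this re-export, callers can import the validator directly from `sky.serve`:

    from sky.serve import validate_service_task

The CLI changes above use it as `serve_lib.validate_service_task(task)` before launching or updating a service.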
sky/serve/autoscalers.py CHANGED
@@ -1,5 +1,6 @@
 """Autoscalers: perform autoscaling by monitoring metrics."""
 import bisect
+import copy
 import dataclasses
 import enum
 import math
@@ -56,8 +57,8 @@ class AutoscalerDecision:
 def _generate_scale_up_decisions(
         num: int, target: Optional[Dict[str, Any]]) -> List[AutoscalerDecision]:
     return [
-        AutoscalerDecision(AutoscalerDecisionOperator.SCALE_UP, target)
-        for _ in range(num)
+        AutoscalerDecision(AutoscalerDecisionOperator.SCALE_UP,
+                           copy.copy(target)) for _ in range(num)
     ]
 
 
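Wrapping `target` in `copy.copy` gives each scale-up decision its own dict, so mutating one decision's target can no longer leak into its siblings. A standalone illustration of the aliasing hazard being fixed (plain Python, not the real AutoscalerDecision):

    import copy

    target = {'use_spot': True}
    shared = [target for _ in range(2)]              # both entries alias one dict
    isolated = [copy.copy(target) for _ in range(2)]

    shared[0]['zone'] = 'us-east-1a'
    print(shared[1])    # {'use_spot': True, 'zone': 'us-east-1a'}  <- leaked
    print(isolated[1])  # {'use_spot': True}                        <- safe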
@@ -134,6 +135,7 @@ class Autoscaler:
         self.min_replicas: int = spec.min_replicas
         self.max_replicas: int = (spec.max_replicas if spec.max_replicas
                                   is not None else spec.min_replicas)
+        self.num_overprovision: Optional[int] = spec.num_overprovision
         # Target number of replicas is initialized to min replicas
         self.target_num_replicas: int = spec.min_replicas
         self.latest_version: int = constants.INITIAL_VERSION
@@ -143,6 +145,12 @@ class Autoscaler:
         self.latest_version_ever_ready: int = self.latest_version - 1
         self.update_mode = serve_utils.DEFAULT_UPDATE_MODE
 
+    def get_final_target_num_replicas(self) -> int:
+        """Get the final target number of replicas."""
+        if self.num_overprovision is None:
+            return self.target_num_replicas
+        return self.target_num_replicas + self.num_overprovision
+
     def _calculate_target_num_replicas(self) -> int:
         """Calculate target number of replicas."""
         raise NotImplementedError
@@ -207,7 +215,7 @@ class Autoscaler:
         0, to make the service scale faster when the service is not running.
         This will happen when min_replicas = 0 and no traffic.
         """
-        if self.target_num_replicas == 0:
+        if self.get_final_target_num_replicas() == 0:
             return constants.AUTOSCALER_NO_REPLICA_DECISION_INTERVAL_SECONDS
         else:
             return constants.AUTOSCALER_DEFAULT_DECISION_INTERVAL_SECONDS
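
Overprovisioning shifts every launch target up by a constant: with `target_num_replicas = 3` and `num_overprovision = 1`, the autoscaler provisions 4 replicas. A toy check of the arithmetic (standalone function mirroring `get_final_target_num_replicas` above):

    from typing import Optional

    def final_target(target: int, overprovision: Optional[int]) -> int:
        return target if overprovision is None else target + overprovision

    assert final_target(3, None) == 3  # overprovisioning disabled
    assert final_target(3, 1) == 4     # one extra replica as headroom
    assert final_target(0, None) == 0  # still hits the fast no-replica interval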
@@ -236,13 +244,14 @@
         # old and latest versions are allowed in rolling update, this will
         # not affect the time it takes for the service to updated to the
         # latest version.
-        if num_latest_ready_replicas >= self.target_num_replicas:
+        if (num_latest_ready_replicas >=
+                self.get_final_target_num_replicas()):
             # Once the number of ready new replicas is greater than or equal
             # to the target, we can scale down all old replicas.
             return [info.replica_id for info in old_nonterminal_replicas]
         # If rolling update is in progress, we scale down old replicas
         # based on the number of ready new replicas.
-        num_old_replicas_to_keep = (self.target_num_replicas -
+        num_old_replicas_to_keep = (self.get_final_target_num_replicas() -
                                     num_latest_ready_replicas)
         # Remove old replicas (especially old launching replicas) and only
         # keep the required number of replicas, as we want to let the new
@@ -422,6 +431,7 @@ class _AutoscalerWithHysteresis(Autoscaler):
             f'Old target number of replicas: {old_target_num_replicas}. '
             f'Current target number of replicas: {target_num_replicas}. '
             f'Final target number of replicas: {self.target_num_replicas}. '
+            f'Num overprovision: {self.num_overprovision}. '
             f'Upscale counter: {self.upscale_counter}/'
             f'{self.scale_up_threshold}. '
             f'Downscale counter: {self.downscale_counter}/'
@@ -505,8 +515,9 @@ class RequestRateAutoscaler(_AutoscalerWithHysteresis):
 
         # Case 1. when latest_nonterminal_replicas is less
         # than num_to_provision, we always scale up new replicas.
-        if len(latest_nonterminal_replicas) < self.target_num_replicas:
-            num_replicas_to_scale_up = (self.target_num_replicas -
+        target_num_replicas = self.get_final_target_num_replicas()
+        if len(latest_nonterminal_replicas) < target_num_replicas:
+            num_replicas_to_scale_up = (target_num_replicas -
                                         len(latest_nonterminal_replicas))
             logger.info('Number of replicas to scale up: '
                         f'{num_replicas_to_scale_up}')
@@ -514,11 +525,11 @@ class RequestRateAutoscaler(_AutoscalerWithHysteresis):
             _generate_scale_up_decisions(num_replicas_to_scale_up, None))
 
         # Case 2: when latest_nonterminal_replicas is more
-        # than self.target_num_replicas, we scale down new replicas.
+        # than target_num_replicas, we scale down new replicas.
         replicas_to_scale_down = []
-        if len(latest_nonterminal_replicas) > self.target_num_replicas:
+        if len(latest_nonterminal_replicas) > target_num_replicas:
             num_replicas_to_scale_down = (len(latest_nonterminal_replicas) -
-                                          self.target_num_replicas)
+                                          target_num_replicas)
             replicas_to_scale_down = (
                 _select_nonterminal_replicas_to_scale_down(
                     num_replicas_to_scale_down, latest_nonterminal_replicas))
@@ -633,7 +644,7 @@ class FallbackRequestRateAutoscaler(RequestRateAutoscaler):
         all_replica_ids_to_scale_down: List[int] = []
 
         # Decide how many spot instances to launch.
-        num_spot_to_provision = (self.target_num_replicas -
+        num_spot_to_provision = (self.get_final_target_num_replicas() -
                                  self.base_ondemand_fallback_replicas)
         if num_nonterminal_spot < num_spot_to_provision:
             # Not enough spot instances, scale up.
@@ -668,6 +679,10 @@ class FallbackRequestRateAutoscaler(RequestRateAutoscaler):
             num_ondemand_to_provision += (num_spot_to_provision -
                                           num_ready_spot)
 
+        # Make sure we don't launch on-demand fallback for
+        # overprovisioned replicas.
+        num_ondemand_to_provision = min(num_ondemand_to_provision,
+                                        self.target_num_replicas)
         if num_ondemand_to_provision > num_nonterminal_ondemand:
            num_ondemand_to_scale_up = (num_ondemand_to_provision -
                                        num_nonterminal_ondemand)
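
The new `min()` cap keeps dynamic on-demand fallback from also covering the overprovisioned headroom: fallback is bounded by the real target, not the inflated one. A worked example with assumed numbers (illustrative only; the real accounting involves more state than shown here):

    target_num_replicas = 3      # what the service actually needs
    num_overprovision = 2        # extra spot headroom
    num_ready_spot = 1

    num_spot_to_provision = target_num_replicas + num_overprovision     # 5
    num_ondemand_to_provision = num_spot_to_provision - num_ready_spot  # 4
    # Without the cap we would pay on-demand prices for the headroom too.
    # The cap bounds fallback at the real target:
    num_ondemand_to_provision = min(num_ondemand_to_provision,
                                    target_num_replicas)                # 3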
sky/serve/replica_managers.py CHANGED
@@ -27,6 +27,7 @@ from sky.serve import constants as serve_constants
 from sky.serve import serve_state
 from sky.serve import serve_utils
 from sky.serve import service
+from sky.serve import spot_placer
 from sky.skylet import constants
 from sky.skylet import job_lib
 from sky.usage import usage_lib
@@ -60,6 +61,7 @@ def launch_cluster(replica_id: int,
                    task_yaml_path: str,
                    cluster_name: str,
                    resources_override: Optional[Dict[str, Any]] = None,
+                   retry_until_up: bool = True,
                    max_retry: int = 3) -> None:
     """Launch a sky serve replica cluster.
 
@@ -71,6 +73,10 @@
         or some error happened before provisioning and will happen again
         if retry.
     """
+    if resources_override is not None:
+        logger.info(f'Scaling up replica (id: {replica_id}) cluster '
+                    f'{cluster_name} with resources override: '
+                    f'{resources_override}')
     try:
         config = common_utils.read_yaml(os.path.expanduser(task_yaml_path))
         task = sky.Task.from_yaml_config(config)
@@ -98,7 +104,7 @@
         usage_lib.messages.usage.set_internal()
         execution.launch(task,
                          cluster_name,
-                         retry_until_up=True,
+                         retry_until_up=retry_until_up,
                          _is_launched_by_sky_serve_controller=True)
         logger.info(f'Replica cluster {cluster_name} launched.')
     except (exceptions.InvalidClusterNameError,
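
The new `retry_until_up` parameter simply threads through to `execution.launch`, which was previously hard-coded to retry forever. A hedged call sketch (the argument values below are illustrative, not from this diff):

    # Default behavior is unchanged: retry until the cluster is up.
    launch_cluster(replica_id=1,
                   task_yaml_path='~/.sky/serve/svc/task.yaml',  # hypothetical
                   cluster_name='svc-replica-1',
                   retry_until_up=False)  # fail fast; the spot placer retries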
@@ -246,6 +252,10 @@ class ReplicaStatusProperty:
     preempted: bool = False
     # Whether the replica is purged.
     purged: bool = False
+    # Whether the replica failed to launch due to spot availability.
+    # This is only possible when spot placer is enabled, so the retry until up
+    # is set to True and it can fail immediately due to spot availability.
+    failed_spot_availability: bool = False
 
     def remove_terminated_replica(self) -> bool:
         """Whether to remove the replica record from the replica table.
@@ -385,10 +395,11 @@ class ReplicaInfo:
 class ReplicaInfo:
     """Replica info for each replica."""
 
-    _VERSION = 0
+    _VERSION = 1
 
     def __init__(self, replica_id: int, cluster_name: str, replica_port: str,
-                 is_spot: bool, version: int) -> None:
+                 is_spot: bool, location: Optional[spot_placer.Location],
+                 version: int) -> None:
         self._version = self._VERSION
         self.replica_id: int = replica_id
         self.cluster_name: str = cluster_name
@@ -398,6 +409,11 @@ class ReplicaInfo:
         self.consecutive_failure_times: List[float] = []
         self.status_property: ReplicaStatusProperty = ReplicaStatusProperty()
         self.is_spot: bool = is_spot
+        self.location: Optional[Dict[str, Optional[str]]] = (
+            location.to_pickleable() if location is not None else None)
+
+    def get_spot_location(self) -> Optional[spot_placer.Location]:
+        return spot_placer.Location.from_pickleable(self.location)
 
     def handle(
         self,
@@ -483,6 +499,7 @@ class ReplicaInfo:
                 f'version={self.version}, '
                 f'replica_port={self.replica_port}, '
                 f'is_spot={self.is_spot}, '
+                f'location={self.location}, '
                 f'status={self.status}, '
                 f'launched_at={info_dict["launched_at"]}{handle_str})')
         return info
@@ -557,6 +574,9 @@ class ReplicaInfo:
             # Treated similar to on-demand instances.
            self.is_spot = False
 
+        if version < 1:
+            self.location = None
+
         self.__dict__.update(state)
 
 
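ReplicaInfo persists the placer location as a plain dict so that old pickled records (version < 1) can be upgraded with `location = None`. A sketch of what such a pickle-friendly round-trip could look like; the real `spot_placer.Location` fields are not shown in this diff, so the cloud/region/zone shape below is an assumption:

    import dataclasses
    from typing import Dict, Optional

    @dataclasses.dataclass
    class Location:
        # Assumed fields, for illustration only.
        cloud: str
        region: str
        zone: Optional[str]

        def to_pickleable(self) -> Dict[str, Optional[str]]:
            return dataclasses.asdict(self)

        @classmethod
        def from_pickleable(
                cls, d: Optional[Dict[str, Optional[str]]]
        ) -> Optional['Location']:
            return cls(**d) if d is not None else None

    loc = Location('aws', 'us-east-1', 'us-east-1a')
    assert Location.from_pickleable(loc.to_pickleable()) == loc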
@@ -620,6 +640,9 @@ class SkyPilotReplicaManager(ReplicaManager):
                  task_yaml_path: str) -> None:
         super().__init__(service_name, spec)
         self._task_yaml_path = task_yaml_path
+        task = sky.Task.from_yaml(task_yaml_path)
+        self._spot_placer: Optional[spot_placer.SpotPlacer] = (
+            spot_placer.SpotPlacer.from_task(spec, task))
         # TODO(tian): Store launch/down pid in the replica table, to make the
         # manager more persistent. Current blocker is that we need to manually
         # poll the Process (by join or is_launch), otherwise, it will never
@@ -639,6 +662,9 @@ class SkyPilotReplicaManager(ReplicaManager):
     # Replica management functions #
     ################################
 
+    # Adding lock here to make sure spot placer's current locations are
+    # consistent with the replicas' status.
+    @with_lock
     def _launch_replica(
         self,
         replica_id: int,
@@ -653,19 +679,41 @@ class SkyPilotReplicaManager(ReplicaManager):
             self._service_name, replica_id)
         log_file_name = serve_utils.generate_replica_launch_log_file_name(
             self._service_name, replica_id)
+        use_spot = _should_use_spot(self._task_yaml_path, resources_override)
+        retry_until_up = True
+        location = None
+        if use_spot and self._spot_placer is not None:
+            # For spot placer, we don't retry until up so any launch failed
+            # due to availability issue will be handled by the placer.
+            retry_until_up = False
+            # TODO(tian): Currently, the resources_override can only be
+            # `use_spot=True/False`, which will not cause any conflict with
+            # spot placer's cloud, region & zone. When we add more resources
+            # to the resources_override, we need to make sure they won't
+            # conflict with the spot placer's selection.
+            if resources_override is None:
+                resources_override = {}
+            current_spot_locations: List[spot_placer.Location] = []
+            for info in serve_state.get_replica_infos(self._service_name):
+                if info.is_spot:
+                    spot_location = info.get_spot_location()
+                    if spot_location is not None:
+                        current_spot_locations.append(spot_location)
+            location = self._spot_placer.select_next_location(
+                current_spot_locations)
+            resources_override.update(location.to_dict())
         p = multiprocessing.Process(
             target=ux_utils.RedirectOutputForProcess(
                 launch_cluster,
                 log_file_name,
             ).run,
             args=(replica_id, self._task_yaml_path, cluster_name,
-                  resources_override),
+                  resources_override, retry_until_up),
         )
         replica_port = _get_resources_ports(self._task_yaml_path)
-        use_spot = _should_use_spot(self._task_yaml_path, resources_override)
 
         info = ReplicaInfo(replica_id, cluster_name, replica_port, use_spot,
-                           self.latest_version)
+                           location, self.latest_version)
         serve_state.add_or_update_replica(self._service_name, replica_id, info)
         # Don't start right now; we will start it later in _refresh_process_pool
         # to avoid too many sky.launch running at the same time.
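
Before each spot launch, the manager collects the locations of all live spot replicas and asks the placer for the next one to try. The real `SpotPlacer` policy is not shown in this diff; a toy placer under the same interface, purely for illustration:

    from typing import List

    class RoundRobinPlacerSketch:
        """Toy placer: prefer locations not currently in use."""

        def __init__(self, candidates: List[str]) -> None:
            self._candidates = candidates

        def select_next_location(self, current: List[str]) -> str:
            for loc in self._candidates:
                if loc not in current:
                    return loc
            # All candidates in use: pick the least-loaded location.
            return min(self._candidates, key=current.count)

    placer = RoundRobinPlacerSketch(['us-east-1a', 'us-east-1b'])
    print(placer.select_next_location(['us-east-1a']))  # us-east-1b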
@@ -814,6 +862,10 @@ class SkyPilotReplicaManager(ReplicaManager):
         logger.info(
             f'Replica {info.replica_id} is preempted{cluster_status_str}.')
         info.status_property.preempted = True
+        if self._spot_placer is not None:
+            spot_location = info.get_spot_location()
+            assert spot_location is not None
+            self._spot_placer.set_preemptive(spot_location)
         serve_state.add_or_update_replica(self._service_name, info.replica_id,
                                           info)
         self._terminate_replica(info.replica_id,
@@ -868,6 +920,23 @@ class SkyPilotReplicaManager(ReplicaManager):
                 else:
                     info.status_property.sky_launch_status = (
                         ProcessStatus.SUCCEEDED)
+                if self._spot_placer is not None and info.is_spot:
+                    # TODO(tian): Currently, we set the location to
+                    # preemptive if the launch process failed. This is
+                    # because if the error is not related to the
+                    # availability of the location, then all locations
+                    # should failed for same reason. So it does not matter
+                    # which location is preemptive or not, instead, all
+                    # locations would fail. We should implement a log parser
+                    # to detect if the error is actually related to the
+                    # availability of the location later.
+                    location = info.get_spot_location()
+                    assert location is not None
+                    if p.exitcode != 0:
+                        self._spot_placer.set_preemptive(location)
+                        info.status_property.failed_spot_availability = True
+                    else:
+                        self._spot_placer.set_active(location)
                 serve_state.add_or_update_replica(self._service_name,
                                                   replica_id, info)
                 if error_in_sky_launch:
@@ -918,6 +987,8 @@
             removal_reason = 'for version outdated'
         elif info.status_property.purged:
             removal_reason = 'for purge'
+        elif info.status_property.failed_spot_availability:
+            removal_reason = 'for spot availability failure'
         else:
             logger.info(f'Termination of replica {replica_id} '
                         'finished. Replica info is kept since some '
sky/serve/serve_utils.py CHANGED
@@ -25,6 +25,7 @@ from sky import global_user_state
 from sky.adaptors import common as adaptors_common
 from sky.serve import constants
 from sky.serve import serve_state
+from sky.serve import spot_placer
 from sky.skylet import constants as skylet_constants
 from sky.skylet import job_lib
 from sky.utils import annotations
@@ -40,6 +41,7 @@ if typing.TYPE_CHECKING:
     import psutil
     import requests
 
+    import sky
     from sky.serve import replica_managers
 else:
     psutil = adaptors_common.LazyImport('psutil')
@@ -210,6 +212,84 @@ class RequestTimestamp(RequestsAggregator):
         return f'RequestTimestamp(timestamps={self.timestamps})'
 
 
+def validate_service_task(task: 'sky.Task') -> None:
+    """Validate the task for Sky Serve.
+
+    Args:
+        task: sky.Task to validate
+
+    Raises:
+        ValueError: if the arguments are invalid.
+        RuntimeError: if the task.serve is not found.
+    """
+    spot_resources: List['sky.Resources'] = [
+        resource for resource in task.resources if resource.use_spot
+    ]
+    # TODO(MaoZiming): Allow mixed on-demand and spot specification in resources
+    # On-demand fallback should go to the resources specified as on-demand.
+    if len(spot_resources) not in [0, len(task.resources)]:
+        with ux_utils.print_exception_no_traceback():
+            raise ValueError(
+                'Resources must either all use spot or none use spot. '
+                'To use on-demand and spot instances together, '
+                'use `dynamic_ondemand_fallback` or set '
+                'base_ondemand_fallback_replicas.')
+
+    if task.service is None:
+        with ux_utils.print_exception_no_traceback():
+            raise RuntimeError('Service section not found.')
+
+    policy_description = ('on-demand'
+                          if task.service.dynamic_ondemand_fallback else 'spot')
+    for resource in list(task.resources):
+        if resource.job_recovery is not None:
+            with ux_utils.print_exception_no_traceback():
+                raise ValueError('job_recovery is disabled for SkyServe. '
+                                 'SkyServe will replenish preempted spot '
+                                 f'with {policy_description} instances.')
+
+    # Try to create a spot placer from the task yaml. Check if the task yaml
+    # is valid for spot placer.
+    spot_placer.SpotPlacer.from_task(task.service, task)
+
+    replica_ingress_port: Optional[int] = int(
+        task.service.ports) if (task.service.ports is not None) else None
+    for requested_resources in task.resources:
+        if (task.service.use_ondemand_fallback and
+                not requested_resources.use_spot):
+            with ux_utils.print_exception_no_traceback():
+                raise ValueError(
+                    '`use_ondemand_fallback` is only supported '
+                    'for spot resources. Please explicitly specify '
+                    '`use_spot: true` in resources for on-demand fallback.')
+        if (task.service.spot_placer is not None and
+                not requested_resources.use_spot):
+            with ux_utils.print_exception_no_traceback():
+                raise ValueError(
+                    '`spot_placer` is only supported for spot resources. '
+                    'Please explicitly specify `use_spot: true` in resources.')
+        if task.service.ports is None:
+            requested_ports = list(
+                resources_utils.port_ranges_to_set(requested_resources.ports))
+            if len(requested_ports) != 1:
+                with ux_utils.print_exception_no_traceback():
+                    raise ValueError(
+                        'To open multiple ports on the replica, please set the '
+                        '`service.ports` field to specify a main service port. '
+                        'Must only specify one port in resources otherwise. '
+                        'Each replica will use the port specified as '
+                        'application ingress port.')
+            service_port = requested_ports[0]
+            if replica_ingress_port is None:
+                replica_ingress_port = service_port
+            elif service_port != replica_ingress_port:
+                with ux_utils.print_exception_no_traceback():
+                    raise ValueError(
+                        f'Got multiple ports: {service_port} and '
+                        f'{replica_ingress_port} in different resources. '
+                        'Please specify the same port instead.')
+
+
 def generate_service_name():
     return f'sky-service-{uuid.uuid4().hex[:4]}'
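
For context, a hedged usage sketch of the new validator; the YAML path below is hypothetical, and a spec mixing spot and on-demand resources would trip the first check above:

    import sky
    from sky.serve import serve_utils

    task = sky.Task.from_yaml('service.yaml')  # hypothetical service YAML
    try:
        serve_utils.validate_service_task(task)
    except ValueError as e:
        print(f'Invalid service spec: {e}')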