skypilot-nightly 1.0.0.dev20241028__py3-none-any.whl → 1.0.0.dev20241030__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33) hide show
  1. sky/__init__.py +2 -2
  2. sky/adaptors/azure.py +3 -0
  3. sky/backends/backend_utils.py +10 -133
  4. sky/backends/cloud_vm_ray_backend.py +17 -105
  5. sky/clouds/azure.py +10 -1
  6. sky/execution.py +5 -4
  7. sky/jobs/controller.py +38 -22
  8. sky/jobs/recovery_strategy.py +30 -5
  9. sky/jobs/state.py +33 -5
  10. sky/jobs/utils.py +28 -4
  11. sky/optimizer.py +11 -7
  12. sky/provision/azure/azure-config-template.json +7 -1
  13. sky/provision/azure/config.py +65 -45
  14. sky/provision/azure/instance.py +275 -70
  15. sky/provision/constants.py +7 -0
  16. sky/provision/gcp/instance.py +0 -7
  17. sky/resources.py +25 -8
  18. sky/serve/core.py +0 -2
  19. sky/serve/serve_state.py +3 -7
  20. sky/serve/serve_utils.py +2 -14
  21. sky/serve/service_spec.py +0 -28
  22. sky/setup_files/setup.py +4 -3
  23. sky/skylet/job_lib.py +37 -53
  24. sky/skylet/log_lib.py +5 -14
  25. sky/templates/azure-ray.yml.j2 +1 -0
  26. sky/utils/dag_utils.py +14 -4
  27. sky/utils/schemas.py +25 -15
  28. {skypilot_nightly-1.0.0.dev20241028.dist-info → skypilot_nightly-1.0.0.dev20241030.dist-info}/METADATA +13 -11
  29. {skypilot_nightly-1.0.0.dev20241028.dist-info → skypilot_nightly-1.0.0.dev20241030.dist-info}/RECORD +33 -33
  30. {skypilot_nightly-1.0.0.dev20241028.dist-info → skypilot_nightly-1.0.0.dev20241030.dist-info}/WHEEL +1 -1
  31. {skypilot_nightly-1.0.0.dev20241028.dist-info → skypilot_nightly-1.0.0.dev20241030.dist-info}/LICENSE +0 -0
  32. {skypilot_nightly-1.0.0.dev20241028.dist-info → skypilot_nightly-1.0.0.dev20241030.dist-info}/entry_points.txt +0 -0
  33. {skypilot_nightly-1.0.0.dev20241028.dist-info → skypilot_nightly-1.0.0.dev20241030.dist-info}/top_level.txt +0 -0
@@ -41,6 +41,15 @@ UNIQUE_ID_LEN = 4
41
41
  _TAG_SKYPILOT_VM_ID = 'skypilot-vm-id'
42
42
  _WAIT_CREATION_TIMEOUT_SECONDS = 600
43
43
 
44
+ _RESOURCE_MANAGED_IDENTITY_TYPE = (
45
+ 'Microsoft.ManagedIdentity/userAssignedIdentities')
46
+ _RESOURCE_NETWORK_SECURITY_GROUP_TYPE = (
47
+ 'Microsoft.Network/networkSecurityGroups')
48
+ _RESOURCE_VIRTUAL_NETWORK_TYPE = 'Microsoft.Network/virtualNetworks'
49
+ _RESOURCE_PUBLIC_IP_ADDRESS_TYPE = 'Microsoft.Network/publicIPAddresses'
50
+ _RESOURCE_VIRTUAL_MACHINE_TYPE = 'Microsoft.Compute/virtualMachines'
51
+ _RESOURCE_NETWORK_INTERFACE_TYPE = 'Microsoft.Network/networkInterfaces'
52
+
44
53
  _RESOURCE_GROUP_NOT_FOUND_ERROR_MESSAGE = 'ResourceGroupNotFound'
45
54
  _POLL_INTERVAL = 1
46
55
  # TODO(Doyoung): _LEGACY_NSG_NAME can be removed after 0.8.0 to ignore
@@ -282,6 +291,7 @@ def _create_vm(
282
291
  image_reference=image_reference,
283
292
  os_disk=compute.OSDisk(
284
293
  create_option=compute.DiskCreateOptionTypes.FROM_IMAGE,
294
+ delete_option=compute.DiskDeleteOptionTypes.DELETE,
285
295
  managed_disk=compute.ManagedDiskParameters(
286
296
  storage_account_type=node_config['azure_arm_parameters']
287
297
  ['osDiskTier']),
@@ -697,18 +707,30 @@ def terminate_instances(
697
707
 
698
708
  assert provider_config is not None, cluster_name_on_cloud
699
709
 
700
- resource_group_client = azure.get_client('resource', subscription_id)
701
- delete_resource_group = _get_azure_sdk_function(
702
- client=resource_group_client.resource_groups, function_name='delete')
703
-
704
- try:
705
- delete_resource_group(resource_group, force_deletion_types=None)
706
- except azure.exceptions().ResourceNotFoundError as e:
707
- if 'ResourceGroupNotFound' in str(e):
708
- logger.warning(f'Resource group {resource_group} not found. Skip '
709
- 'terminating it.')
710
- return
711
- raise
710
+ use_external_resource_group = provider_config.get(
711
+ 'use_external_resource_group', False)
712
+ # When user specified resource group through config.yaml to create a VM, we
713
+ # cannot remove the entire resource group as it may contain other resources
714
+ # unrelated to this VM being removed.
715
+ if use_external_resource_group:
716
+ delete_vm_and_attached_resources(subscription_id, resource_group,
717
+ cluster_name_on_cloud)
718
+ else:
719
+ # For SkyPilot default resource groups, delete entire resource group.
720
+ # This automatically terminates all resources within, including VMs
721
+ resource_group_client = azure.get_client('resource', subscription_id)
722
+ delete_resource_group = _get_azure_sdk_function(
723
+ client=resource_group_client.resource_groups,
724
+ function_name='delete')
725
+ try:
726
+ delete_resource_group(resource_group, force_deletion_types=None)
727
+ except azure.exceptions().ResourceNotFoundError as e:
728
+ if 'ResourceGroupNotFound' in str(e):
729
+ logger.warning(
730
+ f'Resource group {resource_group} not found. Skip '
731
+ 'terminating it.')
732
+ return
733
+ raise
712
734
 
713
735
 
714
736
  def _get_instance_status(
@@ -770,6 +792,188 @@ def _filter_instances(
770
792
  return nodes
771
793
 
772
794
 
795
+ def _delete_nic_with_retries(network_client,
796
+ resource_group,
797
+ nic_name,
798
+ max_retries=15,
799
+ retry_interval=20):
800
+ """Delete a NIC with retries.
801
+
802
+ When a VM is created, its NIC is reserved for 180 seconds, preventing its
803
+ immediate deletion. If the NIC is in this reserved state, we must retry
804
+ deletion with intervals until the reservation expires. This situation
805
+ commonly arises if a VM termination is followed by a failover to another
806
+ region due to provisioning failures.
807
+ """
808
+ delete_network_interfaces = _get_azure_sdk_function(
809
+ client=network_client.network_interfaces, function_name='begin_delete')
810
+ for _ in range(max_retries):
811
+ try:
812
+ delete_network_interfaces(resource_group_name=resource_group,
813
+ network_interface_name=nic_name).result()
814
+ return
815
+ except azure.exceptions().HttpResponseError as e:
816
+ if 'NicReservedForAnotherVm' in str(e):
817
+ # Retry when deletion fails with reserved NIC.
818
+ logger.warning(f'NIC {nic_name} is reserved. '
819
+ f'Retrying in {retry_interval} seconds...')
820
+ time.sleep(retry_interval)
821
+ else:
822
+ raise e
823
+ logger.error(
824
+ f'Failed to delete NIC {nic_name} after {max_retries} attempts.')
825
+
826
+
827
+ def delete_vm_and_attached_resources(subscription_id: str, resource_group: str,
828
+ cluster_name_on_cloud: str) -> None:
829
+ """Removes VM with attached resources and Deployments.
830
+
831
+ This function deletes a virtual machine and its associated resources
832
+ (public IP addresses, virtual networks, managed identities, network
833
+ interface and network security groups) that match cluster_name_on_cloud.
834
+ There is one attached resources that is not removed within this
835
+ method: OS disk. It is configured to be deleted when VM is terminated while
836
+ setting up storage profile from _create_vm.
837
+
838
+ Args:
839
+ subscription_id: The Azure subscription ID.
840
+ resource_group: The name of the resource group.
841
+ cluster_name_on_cloud: The name of the cluster to filter resources.
842
+ """
843
+ resource_client = azure.get_client('resource', subscription_id)
844
+ try:
845
+ list_resources = _get_azure_sdk_function(
846
+ client=resource_client.resources,
847
+ function_name='list_by_resource_group')
848
+ resources = list(list_resources(resource_group))
849
+ except azure.exceptions().ResourceNotFoundError as e:
850
+ if _RESOURCE_GROUP_NOT_FOUND_ERROR_MESSAGE in str(e):
851
+ return
852
+ raise
853
+
854
+ filtered_resources: Dict[str, List[str]] = {
855
+ _RESOURCE_VIRTUAL_MACHINE_TYPE: [],
856
+ _RESOURCE_MANAGED_IDENTITY_TYPE: [],
857
+ _RESOURCE_NETWORK_SECURITY_GROUP_TYPE: [],
858
+ _RESOURCE_VIRTUAL_NETWORK_TYPE: [],
859
+ _RESOURCE_PUBLIC_IP_ADDRESS_TYPE: [],
860
+ _RESOURCE_NETWORK_INTERFACE_TYPE: []
861
+ }
862
+
863
+ for resource in resources:
864
+ if (resource.type in filtered_resources and
865
+ cluster_name_on_cloud in resource.name):
866
+ filtered_resources[resource.type].append(resource.name)
867
+
868
+ network_client = azure.get_client('network', subscription_id)
869
+ msi_client = azure.get_client('msi', subscription_id)
870
+ compute_client = azure.get_client('compute', subscription_id)
871
+ auth_client = azure.get_client('authorization', subscription_id)
872
+
873
+ delete_virtual_machine = _get_azure_sdk_function(
874
+ client=compute_client.virtual_machines, function_name='delete')
875
+ delete_public_ip_addresses = _get_azure_sdk_function(
876
+ client=network_client.public_ip_addresses, function_name='begin_delete')
877
+ delete_virtual_networks = _get_azure_sdk_function(
878
+ client=network_client.virtual_networks, function_name='begin_delete')
879
+ delete_managed_identity = _get_azure_sdk_function(
880
+ client=msi_client.user_assigned_identities, function_name='delete')
881
+ delete_network_security_group = _get_azure_sdk_function(
882
+ client=network_client.network_security_groups,
883
+ function_name='begin_delete')
884
+ delete_role_assignment = _get_azure_sdk_function(
885
+ client=auth_client.role_assignments, function_name='delete')
886
+
887
+ for vm_name in filtered_resources[_RESOURCE_VIRTUAL_MACHINE_TYPE]:
888
+ try:
889
+ # Before removing Network Interface, we need to wait for the VM to
890
+ # be completely removed with .result() so the dependency of VM on
891
+ # Network Interface is disassociated. This takes about ~30s.
892
+ delete_virtual_machine(resource_group_name=resource_group,
893
+ vm_name=vm_name).result()
894
+ except Exception as e: # pylint: disable=broad-except
895
+ logger.warning('Failed to delete VM: {}'.format(e))
896
+
897
+ for nic_name in filtered_resources[_RESOURCE_NETWORK_INTERFACE_TYPE]:
898
+ try:
899
+ # Before removing Public IP Address, we need to wait for the
900
+ # Network Interface to be completely removed with .result() so the
901
+ # dependency of Network Interface on Public IP Address is
902
+ # disassociated. This takes about ~1s.
903
+ _delete_nic_with_retries(network_client, resource_group, nic_name)
904
+ except Exception as e: # pylint: disable=broad-except
905
+ logger.warning('Failed to delete nic: {}'.format(e))
906
+
907
+ for public_ip_name in filtered_resources[_RESOURCE_PUBLIC_IP_ADDRESS_TYPE]:
908
+ try:
909
+ delete_public_ip_addresses(resource_group_name=resource_group,
910
+ public_ip_address_name=public_ip_name)
911
+ except Exception as e: # pylint: disable=broad-except
912
+ logger.warning('Failed to delete public ip: {}'.format(e))
913
+
914
+ for vnet_name in filtered_resources[_RESOURCE_VIRTUAL_NETWORK_TYPE]:
915
+ try:
916
+ delete_virtual_networks(resource_group_name=resource_group,
917
+ virtual_network_name=vnet_name)
918
+ except Exception as e: # pylint: disable=broad-except
919
+ logger.warning('Failed to delete vnet: {}'.format(e))
920
+
921
+ for msi_name in filtered_resources[_RESOURCE_MANAGED_IDENTITY_TYPE]:
922
+ user_assigned_identities = (
923
+ msi_client.user_assigned_identities.list_by_resource_group(
924
+ resource_group_name=resource_group))
925
+ for identity in user_assigned_identities:
926
+ if msi_name == identity.name:
927
+ # We use the principal_id to find the correct guid converted
928
+ # role assignment name because each managed identity has a
929
+ # unique principal_id, and role assignments are associated
930
+ # with security principals (like managed identities) via this
931
+ # principal_id.
932
+ target_principal_id = identity.principal_id
933
+ scope = (f'/subscriptions/{subscription_id}'
934
+ f'/resourceGroups/{resource_group}')
935
+ role_assignments = auth_client.role_assignments.list_for_scope(
936
+ scope)
937
+ for assignment in role_assignments:
938
+ if target_principal_id == assignment.principal_id:
939
+ guid_role_assignment_name = assignment.name
940
+ try:
941
+ delete_role_assignment(
942
+ scope=scope,
943
+ role_assignment_name=guid_role_assignment_name)
944
+ except Exception as e: # pylint: disable=broad-except
945
+ logger.warning('Failed to delete role '
946
+ 'assignment: {}'.format(e))
947
+ break
948
+ try:
949
+ delete_managed_identity(resource_group_name=resource_group,
950
+ resource_name=msi_name)
951
+ except Exception as e: # pylint: disable=broad-except
952
+ logger.warning('Failed to delete msi: {}'.format(e))
953
+
954
+ for nsg_name in filtered_resources[_RESOURCE_NETWORK_SECURITY_GROUP_TYPE]:
955
+ try:
956
+ delete_network_security_group(resource_group_name=resource_group,
957
+ network_security_group_name=nsg_name)
958
+ except Exception as e: # pylint: disable=broad-except
959
+ logger.warning('Failed to delete nsg: {}'.format(e))
960
+
961
+ delete_deployment = _get_azure_sdk_function(
962
+ client=resource_client.deployments, function_name='begin_delete')
963
+ deployment_names = [
964
+ constants.EXTERNAL_RG_BOOTSTRAP_DEPLOYMENT_NAME.format(
965
+ cluster_name_on_cloud=cluster_name_on_cloud),
966
+ constants.EXTERNAL_RG_VM_DEPLOYMENT_NAME.format(
967
+ cluster_name_on_cloud=cluster_name_on_cloud)
968
+ ]
969
+ for deployment_name in deployment_names:
970
+ try:
971
+ delete_deployment(resource_group_name=resource_group,
972
+ deployment_name=deployment_name)
973
+ except Exception as e: # pylint: disable=broad-except
974
+ logger.warning('Failed to delete deployment: {}'.format(e))
975
+
976
+
773
977
  @common_utils.retry
774
978
  def query_instances(
775
979
  cluster_name_on_cloud: str,
@@ -842,66 +1046,67 @@ def open_ports(
842
1046
  update_network_security_groups = _get_azure_sdk_function(
843
1047
  client=network_client.network_security_groups,
844
1048
  function_name='create_or_update')
1049
+ list_network_security_groups = _get_azure_sdk_function(
1050
+ client=network_client.network_security_groups, function_name='list')
845
1051
 
846
- try:
847
- # Wait for the NSG creation to be finished before opening a port. The
848
- # cluster provisioning triggers the NSG creation, but it may not be
849
- # finished yet.
850
- backoff = common_utils.Backoff(max_backoff_factor=1)
851
- start_time = time.time()
852
- while True:
853
- nsg = _get_cluster_nsg(network_client, resource_group,
854
- cluster_name_on_cloud)
855
- if nsg.provisioning_state not in ['Creating', 'Updating']:
856
- break
857
- if time.time() - start_time > _WAIT_CREATION_TIMEOUT_SECONDS:
858
- with ux_utils.print_exception_no_traceback():
859
- raise TimeoutError(
860
- f'Timed out while waiting for the Network '
861
- f'Security Group {nsg.name!r} to be ready for '
862
- f'cluster {cluster_name_on_cloud!r} in '
863
- f'resource group {resource_group!r}. The NSG '
864
- f'did not reach a stable state '
865
- '(Creating/Updating) within the allocated '
866
- f'{_WAIT_CREATION_TIMEOUT_SECONDS} seconds. '
867
- 'Consequently, the operation to open ports '
868
- f'{ports} failed.')
869
-
870
- backoff_time = backoff.current_backoff()
871
- logger.info(f'NSG {nsg.name} is not created yet. Waiting for '
1052
+ for nsg in list_network_security_groups(resource_group):
1053
+ # Given resource group can contain network security groups that are
1054
+ # irrelevant to this provisioning especially with user specified
1055
+ # resource group at ~/.sky/config. So we make sure to check for the
1056
+ # completion of nsg relevant to the VM being provisioned.
1057
+ if cluster_name_on_cloud in nsg.name:
1058
+ try:
1059
+ # Wait for the NSG creation to be finished before opening a port.
1060
+ # The cluster provisioning triggers the NSG creation, but it
1061
+ # may not be finished yet.
1062
+ backoff = common_utils.Backoff(max_backoff_factor=1)
1063
+ start_time = time.time()
1064
+ while True:
1065
+ if nsg.provisioning_state not in ['Creating', 'Updating']:
1066
+ break
1067
+ if time.time(
1068
+ ) - start_time > _WAIT_CREATION_TIMEOUT_SECONDS:
1069
+ logger.warning(
1070
+ f'Fails to wait for the creation of NSG {nsg.name}'
1071
+ f' in {resource_group} within '
1072
+ f'{_WAIT_CREATION_TIMEOUT_SECONDS} seconds. '
1073
+ 'Skip this NSG.')
1074
+ backoff_time = backoff.current_backoff()
1075
+ logger.info(
1076
+ f'NSG {nsg.name} is not created yet. Waiting for '
872
1077
  f'{backoff_time} seconds before checking again.')
873
- time.sleep(backoff_time)
874
-
875
- # Azure NSG rules have a priority field that determines the order
876
- # in which they are applied. The priority must be unique across
877
- # all inbound rules in one NSG.
878
- priority = max(rule.priority
879
- for rule in nsg.security_rules
880
- if rule.direction == 'Inbound') + 1
881
- nsg.security_rules.append(
882
- azure.create_security_rule(
883
- name=f'sky-ports-{cluster_name_on_cloud}-{priority}',
884
- priority=priority,
885
- protocol='Tcp',
886
- access='Allow',
887
- direction='Inbound',
888
- source_address_prefix='*',
889
- source_port_range='*',
890
- destination_address_prefix='*',
891
- destination_port_ranges=ports,
892
- ))
893
- poller = update_network_security_groups(resource_group, nsg.name, nsg)
894
- poller.wait()
895
- if poller.status() != 'Succeeded':
896
- with ux_utils.print_exception_no_traceback():
897
- raise ValueError(f'Failed to open ports {ports} in NSG '
898
- f'{nsg.name}: {poller.status()}')
899
-
900
- except azure.exceptions().HttpResponseError as e:
901
- with ux_utils.print_exception_no_traceback():
902
- raise ValueError(f'Failed to open ports {ports} in NSG for cluster '
903
- f'{cluster_name_on_cloud!r} within resource group '
904
- f'{resource_group!r}.') from e
1078
+ time.sleep(backoff_time)
1079
+
1080
+ # Azure NSG rules have a priority field that determines the
1081
+ # order in which they are applied. The priority must be unique
1082
+ # across all inbound rules in one NSG.
1083
+ priority = max(rule.priority
1084
+ for rule in nsg.security_rules
1085
+ if rule.direction == 'Inbound') + 1
1086
+ nsg.security_rules.append(
1087
+ azure.create_security_rule(
1088
+ name=f'sky-ports-{cluster_name_on_cloud}-{priority}',
1089
+ priority=priority,
1090
+ protocol='Tcp',
1091
+ access='Allow',
1092
+ direction='Inbound',
1093
+ source_address_prefix='*',
1094
+ source_port_range='*',
1095
+ destination_address_prefix='*',
1096
+ destination_port_ranges=ports,
1097
+ ))
1098
+ poller = update_network_security_groups(resource_group,
1099
+ nsg.name, nsg)
1100
+ poller.wait()
1101
+ if poller.status() != 'Succeeded':
1102
+ with ux_utils.print_exception_no_traceback():
1103
+ raise ValueError(f'Failed to open ports {ports} in NSG '
1104
+ f'{nsg.name}: {poller.status()}')
1105
+ except azure.exceptions().HttpResponseError as e:
1106
+ with ux_utils.print_exception_no_traceback():
1107
+ raise ValueError(
1108
+ f'Failed to open ports {ports} in NSG {nsg.name}.'
1109
+ ) from e
905
1110
 
906
1111
 
907
1112
  def cleanup_ports(
@@ -16,3 +16,10 @@ WORKER_NODE_TAGS = {
16
16
  TAG_RAY_NODE_KIND: 'worker',
17
17
  TAG_SKYPILOT_HEAD_NODE: '0',
18
18
  }
19
+
20
+ # Names for Azure Deployments.
21
+ DEPLOYMENT_NAME = 'skypilot-config'
22
+ LEGACY_DEPLOYMENT_NAME = 'ray-config'
23
+ EXTERNAL_RG_BOOTSTRAP_DEPLOYMENT_NAME = (
24
+ 'skypilot-bootstrap-{cluster_name_on_cloud}')
25
+ EXTERNAL_RG_VM_DEPLOYMENT_NAME = 'skypilot-vm-{cluster_name_on_cloud}'
@@ -632,13 +632,6 @@ def cleanup_ports(
632
632
  del ports # Unused.
633
633
  assert provider_config is not None, cluster_name_on_cloud
634
634
  project_id = provider_config['project_id']
635
- if 'ports' in provider_config:
636
- # Backward compatibility for old provider config.
637
- # TODO(tian): remove this after 2 minor releases, 0.6.0.
638
- for port in provider_config['ports']:
639
- firewall_rule_name = f'user-ports-{cluster_name_on_cloud}-{port}'
640
- instance_utils.GCPComputeInstance.delete_firewall_rule(
641
- project_id, firewall_rule_name)
642
635
  if 'firewall_rule' in provider_config:
643
636
  firewall_rule_name = provider_config['firewall_rule']
644
637
  instance_utils.GCPComputeInstance.delete_firewall_rule(
sky/resources.py CHANGED
@@ -55,7 +55,7 @@ class Resources:
55
55
  accelerators: Union[None, str, Dict[str, int]] = None,
56
56
  accelerator_args: Optional[Dict[str, str]] = None,
57
57
  use_spot: Optional[bool] = None,
58
- job_recovery: Optional[str] = None,
58
+ job_recovery: Optional[Union[Dict[str, Union[str, int]], str]] = None,
59
59
  region: Optional[str] = None,
60
60
  zone: Optional[str] = None,
61
61
  image_id: Union[Dict[str, str], str, None] = None,
@@ -111,6 +111,12 @@ class Resources:
111
111
  job to recover the cluster from preemption. Refer to
112
112
  `recovery_strategy module <https://github.com/skypilot-org/skypilot/blob/master/sky/jobs/recovery_strategy.py>`__ # pylint: disable=line-too-long
113
113
  for more details.
114
+ When a dict is provided, it can have the following fields:
115
+
116
+ - strategy: the recovery strategy to use.
117
+ - max_restarts_on_errors: the max number of restarts on user code
118
+ errors.
119
+
114
120
  region: the region to use.
115
121
  zone: the zone to use.
116
122
  image_id: the image ID to use. If a str, must be a string
@@ -161,10 +167,20 @@ class Resources:
161
167
 
162
168
  self._use_spot_specified = use_spot is not None
163
169
  self._use_spot = use_spot if use_spot is not None else False
164
- self._job_recovery = None
170
+ self._job_recovery: Optional[Dict[str, Union[str, int]]] = None
165
171
  if job_recovery is not None:
166
- if job_recovery.strip().lower() != 'none':
167
- self._job_recovery = job_recovery.upper()
172
+ if isinstance(job_recovery, str):
173
+ job_recovery = {'strategy': job_recovery}
174
+ if 'strategy' not in job_recovery:
175
+ job_recovery['strategy'] = None
176
+
177
+ strategy_name = job_recovery['strategy']
178
+ if strategy_name == 'none':
179
+ self._job_recovery = None
180
+ else:
181
+ if strategy_name is not None:
182
+ job_recovery['strategy'] = strategy_name.upper()
183
+ self._job_recovery = job_recovery
168
184
 
169
185
  if disk_size is not None:
170
186
  if round(disk_size) != disk_size:
@@ -419,7 +435,7 @@ class Resources:
419
435
  return self._use_spot_specified
420
436
 
421
437
  @property
422
- def job_recovery(self) -> Optional[str]:
438
+ def job_recovery(self) -> Optional[Dict[str, Union[str, int]]]:
423
439
  return self._job_recovery
424
440
 
425
441
  @property
@@ -814,12 +830,13 @@ class Resources:
814
830
  Raises:
815
831
  ValueError: if the attributes are invalid.
816
832
  """
817
- if self._job_recovery is None:
833
+ if self._job_recovery is None or self._job_recovery['strategy'] is None:
818
834
  return
819
- if self._job_recovery not in managed_jobs.RECOVERY_STRATEGIES:
835
+ if (self._job_recovery['strategy']
836
+ not in managed_jobs.RECOVERY_STRATEGIES):
820
837
  with ux_utils.print_exception_no_traceback():
821
838
  raise ValueError(
822
- f'Spot recovery strategy {self._job_recovery} '
839
+ f'Spot recovery strategy {self._job_recovery["strategy"]} '
823
840
  'is not supported. The strategy should be among '
824
841
  f'{list(managed_jobs.RECOVERY_STRATEGIES.keys())}')
825
842
 
sky/serve/core.py CHANGED
@@ -572,8 +572,6 @@ def status(
572
572
  'controller_port': (Optional[int]) controller port,
573
573
  'load_balancer_port': (Optional[int]) load balancer port,
574
574
  'policy': (Optional[str]) load balancer policy description,
575
- 'requested_resources': (sky.Resources) requested resources
576
- for replica (deprecated),
577
575
  'requested_resources_str': (str) str representation of
578
576
  requested resources,
579
577
  'replica_info': (List[Dict[str, Any]]) replica information,
sky/serve/serve_state.py CHANGED
@@ -34,7 +34,7 @@ _DB_PATH: str = _get_db_path()
34
34
  def create_table(cursor: 'sqlite3.Cursor', conn: 'sqlite3.Connection') -> None:
35
35
  """Creates the service and replica tables if they do not exist."""
36
36
 
37
- # auto_restart column is deprecated.
37
+ # auto_restart and requested_resources column is deprecated.
38
38
  cursor.execute("""\
39
39
  CREATE TABLE IF NOT EXISTS services (
40
40
  name TEXT PRIMARY KEY,
@@ -323,8 +323,8 @@ def set_service_load_balancer_port(service_name: str,
323
323
 
324
324
  def _get_service_from_row(row) -> Dict[str, Any]:
325
325
  (current_version, name, controller_job_id, controller_port,
326
- load_balancer_port, status, uptime, policy, _, requested_resources,
327
- requested_resources_str, _, active_versions) = row[:13]
326
+ load_balancer_port, status, uptime, policy, _, _, requested_resources_str,
327
+ _, active_versions) = row[:13]
328
328
  return {
329
329
  'name': name,
330
330
  'controller_job_id': controller_job_id,
@@ -340,10 +340,6 @@ def _get_service_from_row(row) -> Dict[str, Any]:
340
340
  # The versions that is active for the load balancer. This is a list of
341
341
  # integers in json format. This is mainly for display purpose.
342
342
  'active_versions': json.loads(active_versions),
343
- # TODO(tian): Backward compatibility.
344
- # Remove after 2 minor release, 0.6.0.
345
- 'requested_resources': pickle.loads(requested_resources)
346
- if requested_resources is not None else None,
347
343
  'requested_resources_str': requested_resources_str,
348
344
  }
349
345
 
sky/serve/serve_utils.py CHANGED
@@ -825,12 +825,7 @@ def format_service_table(service_records: List[Dict[str, Any]],
825
825
  replicas = _get_replicas(record)
826
826
  endpoint = get_endpoint(record)
827
827
  policy = record['policy']
828
- # TODO(tian): Backward compatibility.
829
- # Remove `requested_resources` field after 2 minor release, 0.6.0.
830
- if record.get('requested_resources_str') is None:
831
- requested_resources_str = str(record['requested_resources'])
832
- else:
833
- requested_resources_str = record['requested_resources_str']
828
+ requested_resources_str = record['requested_resources_str']
834
829
 
835
830
  service_values = [
836
831
  service_name,
@@ -1004,15 +999,8 @@ class ServeCodeGen:
1004
999
  @classmethod
1005
1000
  def update_service(cls, service_name: str, version: int, mode: str) -> str:
1006
1001
  code = [
1007
- # Backward compatibility for old serve version on the remote
1008
- # machine. The `mode` argument was added in #3249, and if the remote
1009
- # machine has an old SkyPilot version before that, we need to avoid
1010
- # passing the `mode` argument to the job_lib functions.
1011
- # TODO(zhwu): Remove this in 0.7.0 release.
1012
- f'mode_kwargs = {{"mode": {mode!r}}} '
1013
- 'if getattr(constants, "SERVE_VERSION", 0) >= 1 else {}',
1014
1002
  f'msg = serve_utils.update_service_encoded({service_name!r}, '
1015
- f'{version}, **mode_kwargs)',
1003
+ f'{version}, mode={mode!r})',
1016
1004
  'print(msg, end="", flush=True)',
1017
1005
  ]
1018
1006
  return cls._build(code)
sky/serve/service_spec.py CHANGED
@@ -29,13 +29,6 @@ class SkyServiceSpec:
29
29
  base_ondemand_fallback_replicas: Optional[int] = None,
30
30
  upscale_delay_seconds: Optional[int] = None,
31
31
  downscale_delay_seconds: Optional[int] = None,
32
- # The following arguments are deprecated.
33
- # TODO(ziming): remove this after 2 minor release, i.e. 0.6.0.
34
- # Deprecated: Always be True
35
- auto_restart: Optional[bool] = None,
36
- # Deprecated: replaced by the target_qps_per_replica.
37
- qps_upper_threshold: Optional[float] = None,
38
- qps_lower_threshold: Optional[float] = None,
39
32
  ) -> None:
40
33
  if max_replicas is not None and max_replicas < min_replicas:
41
34
  with ux_utils.print_exception_no_traceback():
@@ -62,21 +55,6 @@ class SkyServiceSpec:
62
55
  raise ValueError('readiness_path must start with a slash (/). '
63
56
  f'Got: {readiness_path}')
64
57
 
65
- # TODO(tian): Following field are deprecated. Remove after 2 minor
66
- # release, i.e. 0.6.0.
67
- if qps_upper_threshold is not None or qps_lower_threshold is not None:
68
- with ux_utils.print_exception_no_traceback():
69
- raise ValueError(
70
- 'Field `qps_upper_threshold` and `qps_lower_threshold`'
71
- 'under `replica_policy` are deprecated. '
72
- 'Please use target_qps_per_replica instead.')
73
- if auto_restart is not None:
74
- with ux_utils.print_exception_no_traceback():
75
- raise ValueError(
76
- 'Field `auto_restart` under `replica_policy` is deprecated.'
77
- 'Currently, SkyServe will cleanup failed replicas'
78
- 'and auto restart it to keep the service running.')
79
-
80
58
  self._readiness_path: str = readiness_path
81
59
  self._initial_delay_seconds: int = initial_delay_seconds
82
60
  self._readiness_timeout_seconds: int = readiness_timeout_seconds
@@ -160,14 +138,8 @@ class SkyServiceSpec:
160
138
  service_config['min_replicas'] = policy_section['min_replicas']
161
139
  service_config['max_replicas'] = policy_section.get(
162
140
  'max_replicas', None)
163
- service_config['qps_upper_threshold'] = policy_section.get(
164
- 'qps_upper_threshold', None)
165
- service_config['qps_lower_threshold'] = policy_section.get(
166
- 'qps_lower_threshold', None)
167
141
  service_config['target_qps_per_replica'] = policy_section.get(
168
142
  'target_qps_per_replica', None)
169
- service_config['auto_restart'] = policy_section.get(
170
- 'auto_restart', None)
171
143
  service_config['upscale_delay_seconds'] = policy_section.get(
172
144
  'upscale_delay_seconds', None)
173
145
  service_config['downscale_delay_seconds'] = policy_section.get(
sky/setup_files/setup.py CHANGED
@@ -153,7 +153,7 @@ install_requires = [
153
153
  'tabulate',
154
154
  # Light weight requirement, can be replaced with "typing" once
155
155
  # we deprecate Python 3.7 (this will take a while).
156
- "typing_extensions",
156
+ 'typing_extensions',
157
157
  'filelock >= 3.6.0',
158
158
  'packaging',
159
159
  'psutil',
@@ -216,8 +216,9 @@ extras_require: Dict[str, List[str]] = {
216
216
  # We need azure-identity>=1.13.0 to enable the customization of the
217
217
  # timeout of AzureCliCredential.
218
218
  'azure': [
219
- 'azure-cli>=2.31.0', 'azure-core', 'azure-identity>=1.13.0',
220
- 'azure-mgmt-network', 'azure-storage-blob', 'msgraph-sdk'
219
+ 'azure-cli>=2.65.0', 'azure-core>=1.31.0', 'azure-identity>=1.19.0',
220
+ 'azure-mgmt-network>=27.0.0', 'azure-mgmt-compute>=33.0.0',
221
+ 'azure-storage-blob>=12.23.1', 'msgraph-sdk'
221
222
  ] + local_ray,
222
223
  # We need google-api-python-client>=2.69.0 to enable 'discardLocalSsd'
223
224
  # parameter for stopping instances.