skypilot-nightly 1.0.0.dev20241028__py3-none-any.whl → 1.0.0.dev20241030__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/adaptors/azure.py +3 -0
- sky/backends/backend_utils.py +10 -133
- sky/backends/cloud_vm_ray_backend.py +17 -105
- sky/clouds/azure.py +10 -1
- sky/execution.py +5 -4
- sky/jobs/controller.py +38 -22
- sky/jobs/recovery_strategy.py +30 -5
- sky/jobs/state.py +33 -5
- sky/jobs/utils.py +28 -4
- sky/optimizer.py +11 -7
- sky/provision/azure/azure-config-template.json +7 -1
- sky/provision/azure/config.py +65 -45
- sky/provision/azure/instance.py +275 -70
- sky/provision/constants.py +7 -0
- sky/provision/gcp/instance.py +0 -7
- sky/resources.py +25 -8
- sky/serve/core.py +0 -2
- sky/serve/serve_state.py +3 -7
- sky/serve/serve_utils.py +2 -14
- sky/serve/service_spec.py +0 -28
- sky/setup_files/setup.py +4 -3
- sky/skylet/job_lib.py +37 -53
- sky/skylet/log_lib.py +5 -14
- sky/templates/azure-ray.yml.j2 +1 -0
- sky/utils/dag_utils.py +14 -4
- sky/utils/schemas.py +25 -15
- {skypilot_nightly-1.0.0.dev20241028.dist-info → skypilot_nightly-1.0.0.dev20241030.dist-info}/METADATA +13 -11
- {skypilot_nightly-1.0.0.dev20241028.dist-info → skypilot_nightly-1.0.0.dev20241030.dist-info}/RECORD +33 -33
- {skypilot_nightly-1.0.0.dev20241028.dist-info → skypilot_nightly-1.0.0.dev20241030.dist-info}/WHEEL +1 -1
- {skypilot_nightly-1.0.0.dev20241028.dist-info → skypilot_nightly-1.0.0.dev20241030.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20241028.dist-info → skypilot_nightly-1.0.0.dev20241030.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20241028.dist-info → skypilot_nightly-1.0.0.dev20241030.dist-info}/top_level.txt +0 -0
sky/provision/azure/instance.py
CHANGED
@@ -41,6 +41,15 @@ UNIQUE_ID_LEN = 4
 _TAG_SKYPILOT_VM_ID = 'skypilot-vm-id'
 _WAIT_CREATION_TIMEOUT_SECONDS = 600
 
+_RESOURCE_MANAGED_IDENTITY_TYPE = (
+    'Microsoft.ManagedIdentity/userAssignedIdentities')
+_RESOURCE_NETWORK_SECURITY_GROUP_TYPE = (
+    'Microsoft.Network/networkSecurityGroups')
+_RESOURCE_VIRTUAL_NETWORK_TYPE = 'Microsoft.Network/virtualNetworks'
+_RESOURCE_PUBLIC_IP_ADDRESS_TYPE = 'Microsoft.Network/publicIPAddresses'
+_RESOURCE_VIRTUAL_MACHINE_TYPE = 'Microsoft.Compute/virtualMachines'
+_RESOURCE_NETWORK_INTERFACE_TYPE = 'Microsoft.Network/networkInterfaces'
+
 _RESOURCE_GROUP_NOT_FOUND_ERROR_MESSAGE = 'ResourceGroupNotFound'
 _POLL_INTERVAL = 1
 # TODO(Doyoung): _LEGACY_NSG_NAME can be remove this after 0.8.0 to ignore
@@ -282,6 +291,7 @@ def _create_vm(
         image_reference=image_reference,
         os_disk=compute.OSDisk(
             create_option=compute.DiskCreateOptionTypes.FROM_IMAGE,
+            delete_option=compute.DiskDeleteOptionTypes.DELETE,
             managed_disk=compute.ManagedDiskParameters(
                 storage_account_type=node_config['azure_arm_parameters']
                 ['osDiskTier']),
@@ -697,18 +707,30 @@ def terminate_instances(
 
     assert provider_config is not None, cluster_name_on_cloud
 
-    [old lines 700-711 removed; their content is not rendered in this diff view]
+    use_external_resource_group = provider_config.get(
+        'use_external_resource_group', False)
+    # When user specified resource group through config.yaml to create a VM, we
+    # cannot remove the entire resource group as it may contain other resources
+    # unrelated to this VM being removed.
+    if use_external_resource_group:
+        delete_vm_and_attached_resources(subscription_id, resource_group,
+                                         cluster_name_on_cloud)
+    else:
+        # For SkyPilot default resource groups, delete entire resource group.
+        # This automatically terminates all resources within, including VMs
+        resource_group_client = azure.get_client('resource', subscription_id)
+        delete_resource_group = _get_azure_sdk_function(
+            client=resource_group_client.resource_groups,
+            function_name='delete')
+        try:
+            delete_resource_group(resource_group, force_deletion_types=None)
+        except azure.exceptions().ResourceNotFoundError as e:
+            if 'ResourceGroupNotFound' in str(e):
+                logger.warning(
+                    f'Resource group {resource_group} not found. Skip '
+                    'terminating it.')
+                return
+            raise
 
 
 def _get_instance_status(
@@ -770,6 +792,188 @@ def _filter_instances(
     return nodes
 
 
+def _delete_nic_with_retries(network_client,
+                             resource_group,
+                             nic_name,
+                             max_retries=15,
+                             retry_interval=20):
+    """Delete a NIC with retries.
+
+    When a VM is created, its NIC is reserved for 180 seconds, preventing its
+    immediate deletion. If the NIC is in this reserved state, we must retry
+    deletion with intervals until the reservation expires. This situation
+    commonly arises if a VM termination is followed by a failover to another
+    region due to provisioning failures.
+    """
+    delete_network_interfaces = _get_azure_sdk_function(
+        client=network_client.network_interfaces, function_name='begin_delete')
+    for _ in range(max_retries):
+        try:
+            delete_network_interfaces(resource_group_name=resource_group,
+                                      network_interface_name=nic_name).result()
+            return
+        except azure.exceptions().HttpResponseError as e:
+            if 'NicReservedForAnotherVm' in str(e):
+                # Retry when deletion fails with reserved NIC.
+                logger.warning(f'NIC {nic_name} is reserved. '
+                               f'Retrying in {retry_interval} seconds...')
+                time.sleep(retry_interval)
+            else:
+                raise e
+    logger.error(
+        f'Failed to delete NIC {nic_name} after {max_retries} attempts.')
+
+
+def delete_vm_and_attached_resources(subscription_id: str, resource_group: str,
+                                     cluster_name_on_cloud: str) -> None:
+    """Removes VM with attached resources and Deployments.
+
+    This function deletes a virtual machine and its associated resources
+    (public IP addresses, virtual networks, managed identities, network
+    interface and network security groups) that match cluster_name_on_cloud.
+    There is one attached resources that is not removed within this
+    method: OS disk. It is configured to be deleted when VM is terminated while
+    setting up storage profile from _create_vm.
+
+    Args:
+        subscription_id: The Azure subscription ID.
+        resource_group: The name of the resource group.
+        cluster_name_on_cloud: The name of the cluster to filter resources.
+    """
+    resource_client = azure.get_client('resource', subscription_id)
+    try:
+        list_resources = _get_azure_sdk_function(
+            client=resource_client.resources,
+            function_name='list_by_resource_group')
+        resources = list(list_resources(resource_group))
+    except azure.exceptions().ResourceNotFoundError as e:
+        if _RESOURCE_GROUP_NOT_FOUND_ERROR_MESSAGE in str(e):
+            return
+        raise
+
+    filtered_resources: Dict[str, List[str]] = {
+        _RESOURCE_VIRTUAL_MACHINE_TYPE: [],
+        _RESOURCE_MANAGED_IDENTITY_TYPE: [],
+        _RESOURCE_NETWORK_SECURITY_GROUP_TYPE: [],
+        _RESOURCE_VIRTUAL_NETWORK_TYPE: [],
+        _RESOURCE_PUBLIC_IP_ADDRESS_TYPE: [],
+        _RESOURCE_NETWORK_INTERFACE_TYPE: []
+    }
+
+    for resource in resources:
+        if (resource.type in filtered_resources and
+                cluster_name_on_cloud in resource.name):
+            filtered_resources[resource.type].append(resource.name)
+
+    network_client = azure.get_client('network', subscription_id)
+    msi_client = azure.get_client('msi', subscription_id)
+    compute_client = azure.get_client('compute', subscription_id)
+    auth_client = azure.get_client('authorization', subscription_id)
+
+    delete_virtual_machine = _get_azure_sdk_function(
+        client=compute_client.virtual_machines, function_name='delete')
+    delete_public_ip_addresses = _get_azure_sdk_function(
+        client=network_client.public_ip_addresses, function_name='begin_delete')
+    delete_virtual_networks = _get_azure_sdk_function(
+        client=network_client.virtual_networks, function_name='begin_delete')
+    delete_managed_identity = _get_azure_sdk_function(
+        client=msi_client.user_assigned_identities, function_name='delete')
+    delete_network_security_group = _get_azure_sdk_function(
+        client=network_client.network_security_groups,
+        function_name='begin_delete')
+    delete_role_assignment = _get_azure_sdk_function(
+        client=auth_client.role_assignments, function_name='delete')
+
+    for vm_name in filtered_resources[_RESOURCE_VIRTUAL_MACHINE_TYPE]:
+        try:
+            # Before removing Network Interface, we need to wait for the VM to
+            # be completely removed with .result() so the dependency of VM on
+            # Network Interface is disassociated. This takes abour ~30s.
+            delete_virtual_machine(resource_group_name=resource_group,
+                                   vm_name=vm_name).result()
+        except Exception as e:  # pylint: disable=broad-except
+            logger.warning('Failed to delete VM: {}'.format(e))
+
+    for nic_name in filtered_resources[_RESOURCE_NETWORK_INTERFACE_TYPE]:
+        try:
+            # Before removing Public IP Address, we need to wait for the
+            # Network Interface to be completely removed with .result() so the
+            # dependency of Network Interface on Public IP Address is
+            # disassociated. This takes about ~1s.
+            _delete_nic_with_retries(network_client, resource_group, nic_name)
+        except Exception as e:  # pylint: disable=broad-except
+            logger.warning('Failed to delete nic: {}'.format(e))
+
+    for public_ip_name in filtered_resources[_RESOURCE_PUBLIC_IP_ADDRESS_TYPE]:
+        try:
+            delete_public_ip_addresses(resource_group_name=resource_group,
+                                       public_ip_address_name=public_ip_name)
+        except Exception as e:  # pylint: disable=broad-except
+            logger.warning('Failed to delete public ip: {}'.format(e))
+
+    for vnet_name in filtered_resources[_RESOURCE_VIRTUAL_NETWORK_TYPE]:
+        try:
+            delete_virtual_networks(resource_group_name=resource_group,
+                                    virtual_network_name=vnet_name)
+        except Exception as e:  # pylint: disable=broad-except
+            logger.warning('Failed to delete vnet: {}'.format(e))
+
+    for msi_name in filtered_resources[_RESOURCE_MANAGED_IDENTITY_TYPE]:
+        user_assigned_identities = (
+            msi_client.user_assigned_identities.list_by_resource_group(
+                resource_group_name=resource_group))
+        for identity in user_assigned_identities:
+            if msi_name == identity.name:
+                # We use the principal_id to find the correct guid converted
+                # role assignment name because each managed identity has a
+                # unique principal_id, and role assignments are associated
+                # with security principals (like managed identities) via this
+                # principal_id.
+                target_principal_id = identity.principal_id
+                scope = (f'/subscriptions/{subscription_id}'
+                         f'/resourceGroups/{resource_group}')
+                role_assignments = auth_client.role_assignments.list_for_scope(
+                    scope)
+                for assignment in role_assignments:
+                    if target_principal_id == assignment.principal_id:
+                        guid_role_assignment_name = assignment.name
+                        try:
+                            delete_role_assignment(
+                                scope=scope,
+                                role_assignment_name=guid_role_assignment_name)
+                        except Exception as e:  # pylint: disable=broad-except
+                            logger.warning('Failed to delete role '
+                                           'assignment: {}'.format(e))
+                        break
+        try:
+            delete_managed_identity(resource_group_name=resource_group,
+                                    resource_name=msi_name)
+        except Exception as e:  # pylint: disable=broad-except
+            logger.warning('Failed to delete msi: {}'.format(e))
+
+    for nsg_name in filtered_resources[_RESOURCE_NETWORK_SECURITY_GROUP_TYPE]:
+        try:
+            delete_network_security_group(resource_group_name=resource_group,
+                                          network_security_group_name=nsg_name)
+        except Exception as e:  # pylint: disable=broad-except
+            logger.warning('Failed to delete nsg: {}'.format(e))
+
+    delete_deployment = _get_azure_sdk_function(
+        client=resource_client.deployments, function_name='begin_delete')
+    deployment_names = [
+        constants.EXTERNAL_RG_BOOTSTRAP_DEPLOYMENT_NAME.format(
+            cluster_name_on_cloud=cluster_name_on_cloud),
+        constants.EXTERNAL_RG_VM_DEPLOYMENT_NAME.format(
+            cluster_name_on_cloud=cluster_name_on_cloud)
+    ]
+    for deployment_name in deployment_names:
+        try:
+            delete_deployment(resource_group_name=resource_group,
+                              deployment_name=deployment_name)
+        except Exception as e:  # pylint: disable=broad-except
+            logger.warning('Failed to delete deployment: {}'.format(e))
+
+
 @common_utils.retry
 def query_instances(
     cluster_name_on_cloud: str,
@@ -842,66 +1046,67 @@ def open_ports(
     update_network_security_groups = _get_azure_sdk_function(
         client=network_client.network_security_groups,
         function_name='create_or_update')
+    list_network_security_groups = _get_azure_sdk_function(
+        client=network_client.network_security_groups, function_name='list')
 
-    [old lines 846-870 removed; their content is not rendered in this diff view]
-            logger.info(f'NSG {nsg.name} is not created yet. Waiting for '
+    for nsg in list_network_security_groups(resource_group):
+        # Given resource group can contain network security groups that are
+        # irrelevant to this provisioning especially with user specified
+        # resource group at ~/.sky/config. So we make sure to check for the
+        # completion of nsg relevant to the VM being provisioned.
+        if cluster_name_on_cloud in nsg.name:
+            try:
+                # Wait the NSG creation to be finished before opening a port.
+                # The cluster provisioning triggers the NSG creation, but it
+                # may not be finished yet.
+                backoff = common_utils.Backoff(max_backoff_factor=1)
+                start_time = time.time()
+                while True:
+                    if nsg.provisioning_state not in ['Creating', 'Updating']:
+                        break
+                    if time.time(
+                    ) - start_time > _WAIT_CREATION_TIMEOUT_SECONDS:
+                        logger.warning(
+                            f'Fails to wait for the creation of NSG {nsg.name}'
+                            f' in {resource_group} within '
+                            f'{_WAIT_CREATION_TIMEOUT_SECONDS} seconds. '
+                            'Skip this NSG.')
+                    backoff_time = backoff.current_backoff()
+                    logger.info(
+                        f'NSG {nsg.name} is not created yet. Waiting for '
                         f'{backoff_time} seconds before checking again.')
-    [old lines 873-904 removed; their content is not rendered in this diff view]
+                    time.sleep(backoff_time)
+
+                # Azure NSG rules have a priority field that determines the
+                # order in which they are applied. The priority must be unique
+                # across all inbound rules in one NSG.
+                priority = max(rule.priority
+                               for rule in nsg.security_rules
+                               if rule.direction == 'Inbound') + 1
+                nsg.security_rules.append(
+                    azure.create_security_rule(
+                        name=f'sky-ports-{cluster_name_on_cloud}-{priority}',
+                        priority=priority,
+                        protocol='Tcp',
+                        access='Allow',
+                        direction='Inbound',
+                        source_address_prefix='*',
+                        source_port_range='*',
+                        destination_address_prefix='*',
+                        destination_port_ranges=ports,
+                    ))
+                poller = update_network_security_groups(resource_group,
+                                                        nsg.name, nsg)
+                poller.wait()
+                if poller.status() != 'Succeeded':
+                    with ux_utils.print_exception_no_traceback():
+                        raise ValueError(f'Failed to open ports {ports} in NSG '
+                                         f'{nsg.name}: {poller.status()}')
+            except azure.exceptions().HttpResponseError as e:
+                with ux_utils.print_exception_no_traceback():
+                    raise ValueError(
+                        f'Failed to open ports {ports} in NSG {nsg.name}.'
+                    ) from e
 
 
 def cleanup_ports(
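The new open_ports path picks each added inbound rule's priority as one more than the current maximum inbound priority, so it stays unique within the NSG. A minimal standalone sketch of that computation, using SimpleNamespace objects and made-up priorities in place of the Azure SDK rule objects:

    from types import SimpleNamespace

    # Hypothetical existing rules on an NSG; only Inbound rules count.
    existing_rules = [
        SimpleNamespace(direction='Inbound', priority=1000),
        SimpleNamespace(direction='Outbound', priority=100),  # ignored
        SimpleNamespace(direction='Inbound', priority=1001),
    ]

    # Same expression as in the diff: one above the highest inbound priority.
    priority = max(r.priority
                   for r in existing_rules
                   if r.direction == 'Inbound') + 1
    print(priority)  # 1002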
sky/provision/constants.py
CHANGED
@@ -16,3 +16,10 @@ WORKER_NODE_TAGS = {
     TAG_RAY_NODE_KIND: 'worker',
     TAG_SKYPILOT_HEAD_NODE: '0',
 }
+
+# Names for Azure Deployments.
+DEPLOYMENT_NAME = 'skypilot-config'
+LEGACY_DEPLOYMENT_NAME = 'ray-config'
+EXTERNAL_RG_BOOTSTRAP_DEPLOYMENT_NAME = (
+    'skypilot-bootstrap-{cluster_name_on_cloud}')
+EXTERNAL_RG_VM_DEPLOYMENT_NAME = 'skypilot-vm-{cluster_name_on_cloud}'
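The two EXTERNAL_RG_* constants are str.format templates keyed on cluster_name_on_cloud; sky/provision/azure/instance.py formats them to locate the Deployments to delete when an external resource group is used. A small sketch of how they expand, with a made-up cluster name:

    # 'mycluster-2ea4' is a hypothetical cluster_name_on_cloud value.
    cluster_name_on_cloud = 'mycluster-2ea4'

    bootstrap_name = 'skypilot-bootstrap-{cluster_name_on_cloud}'.format(
        cluster_name_on_cloud=cluster_name_on_cloud)
    vm_name = 'skypilot-vm-{cluster_name_on_cloud}'.format(
        cluster_name_on_cloud=cluster_name_on_cloud)

    print(bootstrap_name)  # skypilot-bootstrap-mycluster-2ea4
    print(vm_name)         # skypilot-vm-mycluster-2ea4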
sky/provision/gcp/instance.py
CHANGED
@@ -632,13 +632,6 @@ def cleanup_ports(
     del ports  # Unused.
     assert provider_config is not None, cluster_name_on_cloud
     project_id = provider_config['project_id']
-    if 'ports' in provider_config:
-        # Backward compatibility for old provider config.
-        # TODO(tian): remove this after 2 minor releases, 0.6.0.
-        for port in provider_config['ports']:
-            firewall_rule_name = f'user-ports-{cluster_name_on_cloud}-{port}'
-            instance_utils.GCPComputeInstance.delete_firewall_rule(
-                project_id, firewall_rule_name)
     if 'firewall_rule' in provider_config:
         firewall_rule_name = provider_config['firewall_rule']
         instance_utils.GCPComputeInstance.delete_firewall_rule(
sky/resources.py
CHANGED
@@ -55,7 +55,7 @@ class Resources:
         accelerators: Union[None, str, Dict[str, int]] = None,
         accelerator_args: Optional[Dict[str, str]] = None,
         use_spot: Optional[bool] = None,
-        job_recovery: Optional[str] = None,
+        job_recovery: Optional[Union[Dict[str, Union[str, int]], str]] = None,
         region: Optional[str] = None,
         zone: Optional[str] = None,
         image_id: Union[Dict[str, str], str, None] = None,
@@ -111,6 +111,12 @@ class Resources:
           job to recover the cluster from preemption. Refer to
           `recovery_strategy module <https://github.com/skypilot-org/skypilot/blob/master/sky/jobs/recovery_strategy.py>`__ # pylint: disable=line-too-long
           for more details.
+          When a dict is provided, it can have the following fields:
+
+          - strategy: the recovery strategy to use.
+          - max_restarts_on_errors: the max number of restarts on user code
+            errors.
+
         region: the region to use.
         zone: the zone to use.
         image_id: the image ID to use. If a str, must be a string
@@ -161,10 +167,20 @@ class Resources:
 
         self._use_spot_specified = use_spot is not None
         self._use_spot = use_spot if use_spot is not None else False
-        self._job_recovery = None
+        self._job_recovery: Optional[Dict[str, Union[str, int]]] = None
         if job_recovery is not None:
-            if job_recovery [remainder of old lines 166-167 not rendered in this diff view]
+            if isinstance(job_recovery, str):
+                job_recovery = {'strategy': job_recovery}
+            if 'strategy' not in job_recovery:
+                job_recovery['strategy'] = None
+
+            strategy_name = job_recovery['strategy']
+            if strategy_name == 'none':
+                self._job_recovery = None
+            else:
+                if strategy_name is not None:
+                    job_recovery['strategy'] = strategy_name.upper()
+                self._job_recovery = job_recovery
 
         if disk_size is not None:
             if round(disk_size) != disk_size:
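With this change, job_recovery accepts either the old bare string or a dict with the fields documented above (strategy, max_restarts_on_errors); the strategy name is upper-cased internally and validated against managed_jobs.RECOVERY_STRATEGIES. A usage sketch; the strategy value shown is illustrative only:

    import sky

    # Old form, still accepted: a bare strategy name.
    r1 = sky.Resources(job_recovery='failover')

    # New dict form: 'strategy' plus an optional cap on restarts caused by
    # user-code errors.
    r2 = sky.Resources(job_recovery={
        'strategy': 'failover',
        'max_restarts_on_errors': 3,
    })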
@@ -419,7 +435,7 @@ class Resources:
         return self._use_spot_specified
 
     @property
-    def job_recovery(self) -> Optional[str]:
+    def job_recovery(self) -> Optional[Dict[str, Union[str, int]]]:
         return self._job_recovery
 
     @property
@@ -814,12 +830,13 @@ class Resources:
         Raises:
             ValueError: if the attributes are invalid.
         """
-        if self._job_recovery is None:
+        if self._job_recovery is None or self._job_recovery['strategy'] is None:
             return
-        if self._job_recovery [remainder of old line 819 not rendered in this diff view]
+        if (self._job_recovery['strategy']
+                not in managed_jobs.RECOVERY_STRATEGIES):
             with ux_utils.print_exception_no_traceback():
                 raise ValueError(
-                    f'Spot recovery strategy {self._job_recovery} '
+                    f'Spot recovery strategy {self._job_recovery["strategy"]} '
                     'is not supported. The strategy should be among '
                     f'{list(managed_jobs.RECOVERY_STRATEGIES.keys())}')
 
sky/serve/core.py
CHANGED
@@ -572,8 +572,6 @@ def status(
             'controller_port': (Optional[int]) controller port,
             'load_balancer_port': (Optional[int]) load balancer port,
             'policy': (Optional[str]) load balancer policy description,
-            'requested_resources': (sky.Resources) requested resources
-                for replica (deprecated),
             'requested_resources_str': (str) str representation of
                 requested resources,
             'replica_info': (List[Dict[str, Any]]) replica information,
sky/serve/serve_state.py
CHANGED
@@ -34,7 +34,7 @@ _DB_PATH: str = _get_db_path()
 def create_table(cursor: 'sqlite3.Cursor', conn: 'sqlite3.Connection') -> None:
     """Creates the service and replica tables if they do not exist."""
 
-    # auto_restart column is deprecated.
+    # auto_restart and requested_resources column is deprecated.
     cursor.execute("""\
         CREATE TABLE IF NOT EXISTS services (
         name TEXT PRIMARY KEY,
@@ -323,8 +323,8 @@ def set_service_load_balancer_port(service_name: str,
 
 def _get_service_from_row(row) -> Dict[str, Any]:
     (current_version, name, controller_job_id, controller_port,
-     load_balancer_port, status, uptime, policy, _,
-     [remainder of old line 327 not rendered in this diff view]
+     load_balancer_port, status, uptime, policy, _, _, requested_resources_str,
+     _, active_versions) = row[:13]
     return {
         'name': name,
         'controller_job_id': controller_job_id,
@@ -340,10 +340,6 @@ def _get_service_from_row(row) -> Dict[str, Any]:
         # The versions that is active for the load balancer. This is a list of
         # integers in json format. This is mainly for display purpose.
         'active_versions': json.loads(active_versions),
-        # TODO(tian): Backward compatibility.
-        # Remove after 2 minor release, 0.6.0.
-        'requested_resources': pickle.loads(requested_resources)
-                               if requested_resources is not None else None,
         'requested_resources_str': requested_resources_str,
     }
 
sky/serve/serve_utils.py
CHANGED
@@ -825,12 +825,7 @@ def format_service_table(service_records: List[Dict[str, Any]],
         replicas = _get_replicas(record)
         endpoint = get_endpoint(record)
         policy = record['policy']
-
-        # Remove `requested_resources` field after 2 minor release, 0.6.0.
-        if record.get('requested_resources_str') is None:
-            requested_resources_str = str(record['requested_resources'])
-        else:
-            requested_resources_str = record['requested_resources_str']
+        requested_resources_str = record['requested_resources_str']
 
         service_values = [
             service_name,
@@ -1004,15 +999,8 @@ class ServeCodeGen:
     @classmethod
     def update_service(cls, service_name: str, version: int, mode: str) -> str:
         code = [
-            # Backward compatibility for old serve version on the remote
-            # machine. The `mode` argument was added in #3249, and if the remote
-            # machine has an old SkyPilot version before that, we need to avoid
-            # passing the `mode` argument to the job_lib functions.
-            # TODO(zhwu): Remove this in 0.7.0 release.
-            f'mode_kwargs = {{"mode": {mode!r}}} '
-            'if getattr(constants, "SERVE_VERSION", 0) >= 1 else {}',
             f'msg = serve_utils.update_service_encoded({service_name!r}, '
-            f'{version}, [remainder of old line 1015 not rendered in this diff view]
+            f'{version}, mode={mode!r})',
             'print(msg, end="", flush=True)',
         ]
         return cls._build(code)
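With the SERVE_VERSION shim removed, update_service always passes mode through to update_service_encoded. For hypothetical arguments (service_name='svc', version=2, mode='rolling'), the generated code list reduces to roughly:

    # Sketch of what ServeCodeGen.update_service now emits for the
    # hypothetical arguments above; the adjacent f-strings in the diff are
    # concatenated into the first list element.
    code = [
        "msg = serve_utils.update_service_encoded('svc', 2, mode='rolling')",
        'print(msg, end="", flush=True)',
    ]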
sky/serve/service_spec.py
CHANGED
@@ -29,13 +29,6 @@ class SkyServiceSpec:
         base_ondemand_fallback_replicas: Optional[int] = None,
         upscale_delay_seconds: Optional[int] = None,
         downscale_delay_seconds: Optional[int] = None,
-        # The following arguments are deprecated.
-        # TODO(ziming): remove this after 2 minor release, i.e. 0.6.0.
-        # Deprecated: Always be True
-        auto_restart: Optional[bool] = None,
-        # Deprecated: replaced by the target_qps_per_replica.
-        qps_upper_threshold: Optional[float] = None,
-        qps_lower_threshold: Optional[float] = None,
     ) -> None:
         if max_replicas is not None and max_replicas < min_replicas:
             with ux_utils.print_exception_no_traceback():
@@ -62,21 +55,6 @@ class SkyServiceSpec:
                 raise ValueError('readiness_path must start with a slash (/). '
                                  f'Got: {readiness_path}')
 
-        # TODO(tian): Following field are deprecated. Remove after 2 minor
-        # release, i.e. 0.6.0.
-        if qps_upper_threshold is not None or qps_lower_threshold is not None:
-            with ux_utils.print_exception_no_traceback():
-                raise ValueError(
-                    'Field `qps_upper_threshold` and `qps_lower_threshold`'
-                    'under `replica_policy` are deprecated. '
-                    'Please use target_qps_per_replica instead.')
-        if auto_restart is not None:
-            with ux_utils.print_exception_no_traceback():
-                raise ValueError(
-                    'Field `auto_restart` under `replica_policy` is deprecated.'
-                    'Currently, SkyServe will cleanup failed replicas'
-                    'and auto restart it to keep the service running.')
-
         self._readiness_path: str = readiness_path
         self._initial_delay_seconds: int = initial_delay_seconds
         self._readiness_timeout_seconds: int = readiness_timeout_seconds
@@ -160,14 +138,8 @@ class SkyServiceSpec:
             service_config['min_replicas'] = policy_section['min_replicas']
             service_config['max_replicas'] = policy_section.get(
                 'max_replicas', None)
-            service_config['qps_upper_threshold'] = policy_section.get(
-                'qps_upper_threshold', None)
-            service_config['qps_lower_threshold'] = policy_section.get(
-                'qps_lower_threshold', None)
             service_config['target_qps_per_replica'] = policy_section.get(
                 'target_qps_per_replica', None)
-            service_config['auto_restart'] = policy_section.get(
-                'auto_restart', None)
             service_config['upscale_delay_seconds'] = policy_section.get(
                 'upscale_delay_seconds', None)
             service_config['downscale_delay_seconds'] = policy_section.get(
sky/setup_files/setup.py
CHANGED
@@ -153,7 +153,7 @@ install_requires = [
     'tabulate',
     # Light weight requirement, can be replaced with "typing" once
    # we deprecate Python 3.7 (this will take a while).
-    [old line 156 not rendered in this diff view]
+    'typing_extensions',
     'filelock >= 3.6.0',
     'packaging',
     'psutil',
@@ -216,8 +216,9 @@ extras_require: Dict[str, List[str]] = {
     # We need azure-identity>=1.13.0 to enable the customization of the
     # timeout of AzureCliCredential.
     'azure': [
-        'azure-cli>=2. [remainder of old line 219 not rendered in this diff view]
-        'azure-mgmt-network', 'azure- [remainder of old line 220 not rendered in this diff view]
+        'azure-cli>=2.65.0', 'azure-core>=1.31.0', 'azure-identity>=1.19.0',
+        'azure-mgmt-network>=27.0.0', 'azure-mgmt-compute>=33.0.0',
+        'azure-storage-blob>=12.23.1', 'msgraph-sdk'
     ] + local_ray,
     # We need google-api-python-client>=2.69.0 to enable 'discardLocalSsd'
     # parameter for stopping instances.
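A quick way to check whether an existing environment already satisfies the raised Azure pins above (a convenience sketch; it assumes the packages are installed under the distribution names shown in the extras list):

    from importlib.metadata import PackageNotFoundError, version

    # Distribution names as they appear in the 'azure' extra above.
    for pkg in ('azure-cli', 'azure-core', 'azure-identity',
                'azure-mgmt-network', 'azure-mgmt-compute',
                'azure-storage-blob'):
        try:
            print(pkg, version(pkg))
        except PackageNotFoundError:
            print(pkg, 'not installed')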