skypilot-nightly 1.0.0.dev20241028__py3-none-any.whl → 1.0.0.dev20241029__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/adaptors/azure.py +3 -0
- sky/backends/backend_utils.py +10 -133
- sky/backends/cloud_vm_ray_backend.py +4 -102
- sky/clouds/azure.py +10 -1
- sky/optimizer.py +11 -7
- sky/provision/azure/azure-config-template.json +7 -1
- sky/provision/azure/config.py +65 -45
- sky/provision/azure/instance.py +275 -70
- sky/provision/constants.py +7 -0
- sky/provision/gcp/instance.py +0 -7
- sky/serve/core.py +0 -2
- sky/serve/serve_state.py +3 -7
- sky/serve/serve_utils.py +2 -14
- sky/serve/service_spec.py +0 -28
- sky/skylet/job_lib.py +3 -11
- sky/skylet/log_lib.py +5 -14
- sky/templates/azure-ray.yml.j2 +1 -0
- sky/utils/schemas.py +4 -14
- {skypilot_nightly-1.0.0.dev20241028.dist-info → skypilot_nightly-1.0.0.dev20241029.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20241028.dist-info → skypilot_nightly-1.0.0.dev20241029.dist-info}/RECORD +25 -25
- {skypilot_nightly-1.0.0.dev20241028.dist-info → skypilot_nightly-1.0.0.dev20241029.dist-info}/WHEEL +1 -1
- {skypilot_nightly-1.0.0.dev20241028.dist-info → skypilot_nightly-1.0.0.dev20241029.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20241028.dist-info → skypilot_nightly-1.0.0.dev20241029.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20241028.dist-info → skypilot_nightly-1.0.0.dev20241029.dist-info}/top_level.txt +0 -0
sky/provision/azure/instance.py
CHANGED
@@ -41,6 +41,15 @@ UNIQUE_ID_LEN = 4
|
|
41
41
|
_TAG_SKYPILOT_VM_ID = 'skypilot-vm-id'
|
42
42
|
_WAIT_CREATION_TIMEOUT_SECONDS = 600
|
43
43
|
|
44
|
+
_RESOURCE_MANAGED_IDENTITY_TYPE = (
|
45
|
+
'Microsoft.ManagedIdentity/userAssignedIdentities')
|
46
|
+
_RESOURCE_NETWORK_SECURITY_GROUP_TYPE = (
|
47
|
+
'Microsoft.Network/networkSecurityGroups')
|
48
|
+
_RESOURCE_VIRTUAL_NETWORK_TYPE = 'Microsoft.Network/virtualNetworks'
|
49
|
+
_RESOURCE_PUBLIC_IP_ADDRESS_TYPE = 'Microsoft.Network/publicIPAddresses'
|
50
|
+
_RESOURCE_VIRTUAL_MACHINE_TYPE = 'Microsoft.Compute/virtualMachines'
|
51
|
+
_RESOURCE_NETWORK_INTERFACE_TYPE = 'Microsoft.Network/networkInterfaces'
|
52
|
+
|
44
53
|
_RESOURCE_GROUP_NOT_FOUND_ERROR_MESSAGE = 'ResourceGroupNotFound'
|
45
54
|
_POLL_INTERVAL = 1
|
46
55
|
# TODO(Doyoung): _LEGACY_NSG_NAME can be removed after 0.8.0 to ignore
|
@@ -282,6 +291,7 @@ def _create_vm(
|
|
282
291
|
image_reference=image_reference,
|
283
292
|
os_disk=compute.OSDisk(
|
284
293
|
create_option=compute.DiskCreateOptionTypes.FROM_IMAGE,
|
294
|
+
delete_option=compute.DiskDeleteOptionTypes.DELETE,
|
285
295
|
managed_disk=compute.ManagedDiskParameters(
|
286
296
|
storage_account_type=node_config['azure_arm_parameters']
|
287
297
|
['osDiskTier']),
|
@@ -697,18 +707,30 @@ def terminate_instances(
|
|
697
707
|
|
698
708
|
assert provider_config is not None, cluster_name_on_cloud
|
699
709
|
|
700
|
-
|
701
|
-
|
702
|
-
|
703
|
-
|
704
|
-
|
705
|
-
|
706
|
-
|
707
|
-
|
708
|
-
|
709
|
-
|
710
|
-
|
711
|
-
|
710
|
+
use_external_resource_group = provider_config.get(
|
711
|
+
'use_external_resource_group', False)
|
712
|
+
# When user specified resource group through config.yaml to create a VM, we
|
713
|
+
# cannot remove the entire resource group as it may contain other resources
|
714
|
+
# unrelated to this VM being removed.
|
715
|
+
if use_external_resource_group:
|
716
|
+
delete_vm_and_attached_resources(subscription_id, resource_group,
|
717
|
+
cluster_name_on_cloud)
|
718
|
+
else:
|
719
|
+
# For SkyPilot default resource groups, delete entire resource group.
|
720
|
+
# This automatically terminates all resources within, including VMs
|
721
|
+
resource_group_client = azure.get_client('resource', subscription_id)
|
722
|
+
delete_resource_group = _get_azure_sdk_function(
|
723
|
+
client=resource_group_client.resource_groups,
|
724
|
+
function_name='delete')
|
725
|
+
try:
|
726
|
+
delete_resource_group(resource_group, force_deletion_types=None)
|
727
|
+
except azure.exceptions().ResourceNotFoundError as e:
|
728
|
+
if 'ResourceGroupNotFound' in str(e):
|
729
|
+
logger.warning(
|
730
|
+
f'Resource group {resource_group} not found. Skip '
|
731
|
+
'terminating it.')
|
732
|
+
return
|
733
|
+
raise
|
712
734
|
|
713
735
|
|
714
736
|
def _get_instance_status(
|
@@ -770,6 +792,188 @@ def _filter_instances(
|
|
770
792
|
return nodes
|
771
793
|
|
772
794
|
|
795
|
+
def _delete_nic_with_retries(network_client,
|
796
|
+
resource_group,
|
797
|
+
nic_name,
|
798
|
+
max_retries=15,
|
799
|
+
retry_interval=20):
|
800
|
+
"""Delete a NIC with retries.
|
801
|
+
|
802
|
+
When a VM is created, its NIC is reserved for 180 seconds, preventing its
|
803
|
+
immediate deletion. If the NIC is in this reserved state, we must retry
|
804
|
+
deletion with intervals until the reservation expires. This situation
|
805
|
+
commonly arises if a VM termination is followed by a failover to another
|
806
|
+
region due to provisioning failures.
|
807
|
+
"""
|
808
|
+
delete_network_interfaces = _get_azure_sdk_function(
|
809
|
+
client=network_client.network_interfaces, function_name='begin_delete')
|
810
|
+
for _ in range(max_retries):
|
811
|
+
try:
|
812
|
+
delete_network_interfaces(resource_group_name=resource_group,
|
813
|
+
network_interface_name=nic_name).result()
|
814
|
+
return
|
815
|
+
except azure.exceptions().HttpResponseError as e:
|
816
|
+
if 'NicReservedForAnotherVm' in str(e):
|
817
|
+
# Retry when deletion fails with reserved NIC.
|
818
|
+
logger.warning(f'NIC {nic_name} is reserved. '
|
819
|
+
f'Retrying in {retry_interval} seconds...')
|
820
|
+
time.sleep(retry_interval)
|
821
|
+
else:
|
822
|
+
raise e
|
823
|
+
logger.error(
|
824
|
+
f'Failed to delete NIC {nic_name} after {max_retries} attempts.')
|
825
|
+
|
826
|
+
|
827
|
+
def delete_vm_and_attached_resources(subscription_id: str, resource_group: str,
|
828
|
+
cluster_name_on_cloud: str) -> None:
|
829
|
+
"""Removes VM with attached resources and Deployments.
|
830
|
+
|
831
|
+
This function deletes a virtual machine and its associated resources
|
832
|
+
(public IP addresses, virtual networks, managed identities, network
|
833
|
+
interface and network security groups) that match cluster_name_on_cloud.
|
834
|
+
There is one attached resource that is not removed within this
|
835
|
+
method: OS disk. It is configured to be deleted when VM is terminated while
|
836
|
+
setting up storage profile from _create_vm.
|
837
|
+
|
838
|
+
Args:
|
839
|
+
subscription_id: The Azure subscription ID.
|
840
|
+
resource_group: The name of the resource group.
|
841
|
+
cluster_name_on_cloud: The name of the cluster to filter resources.
|
842
|
+
"""
|
843
|
+
resource_client = azure.get_client('resource', subscription_id)
|
844
|
+
try:
|
845
|
+
list_resources = _get_azure_sdk_function(
|
846
|
+
client=resource_client.resources,
|
847
|
+
function_name='list_by_resource_group')
|
848
|
+
resources = list(list_resources(resource_group))
|
849
|
+
except azure.exceptions().ResourceNotFoundError as e:
|
850
|
+
if _RESOURCE_GROUP_NOT_FOUND_ERROR_MESSAGE in str(e):
|
851
|
+
return
|
852
|
+
raise
|
853
|
+
|
854
|
+
filtered_resources: Dict[str, List[str]] = {
|
855
|
+
_RESOURCE_VIRTUAL_MACHINE_TYPE: [],
|
856
|
+
_RESOURCE_MANAGED_IDENTITY_TYPE: [],
|
857
|
+
_RESOURCE_NETWORK_SECURITY_GROUP_TYPE: [],
|
858
|
+
_RESOURCE_VIRTUAL_NETWORK_TYPE: [],
|
859
|
+
_RESOURCE_PUBLIC_IP_ADDRESS_TYPE: [],
|
860
|
+
_RESOURCE_NETWORK_INTERFACE_TYPE: []
|
861
|
+
}
|
862
|
+
|
863
|
+
for resource in resources:
|
864
|
+
if (resource.type in filtered_resources and
|
865
|
+
cluster_name_on_cloud in resource.name):
|
866
|
+
filtered_resources[resource.type].append(resource.name)
|
867
|
+
|
868
|
+
network_client = azure.get_client('network', subscription_id)
|
869
|
+
msi_client = azure.get_client('msi', subscription_id)
|
870
|
+
compute_client = azure.get_client('compute', subscription_id)
|
871
|
+
auth_client = azure.get_client('authorization', subscription_id)
|
872
|
+
|
873
|
+
delete_virtual_machine = _get_azure_sdk_function(
|
874
|
+
client=compute_client.virtual_machines, function_name='delete')
|
875
|
+
delete_public_ip_addresses = _get_azure_sdk_function(
|
876
|
+
client=network_client.public_ip_addresses, function_name='begin_delete')
|
877
|
+
delete_virtual_networks = _get_azure_sdk_function(
|
878
|
+
client=network_client.virtual_networks, function_name='begin_delete')
|
879
|
+
delete_managed_identity = _get_azure_sdk_function(
|
880
|
+
client=msi_client.user_assigned_identities, function_name='delete')
|
881
|
+
delete_network_security_group = _get_azure_sdk_function(
|
882
|
+
client=network_client.network_security_groups,
|
883
|
+
function_name='begin_delete')
|
884
|
+
delete_role_assignment = _get_azure_sdk_function(
|
885
|
+
client=auth_client.role_assignments, function_name='delete')
|
886
|
+
|
887
|
+
for vm_name in filtered_resources[_RESOURCE_VIRTUAL_MACHINE_TYPE]:
|
888
|
+
try:
|
889
|
+
# Before removing Network Interface, we need to wait for the VM to
|
890
|
+
# be completely removed with .result() so the dependency of VM on
|
891
|
+
# Network Interface is disassociated. This takes about ~30s.
|
892
|
+
delete_virtual_machine(resource_group_name=resource_group,
|
893
|
+
vm_name=vm_name).result()
|
894
|
+
except Exception as e: # pylint: disable=broad-except
|
895
|
+
logger.warning('Failed to delete VM: {}'.format(e))
|
896
|
+
|
897
|
+
for nic_name in filtered_resources[_RESOURCE_NETWORK_INTERFACE_TYPE]:
|
898
|
+
try:
|
899
|
+
# Before removing Public IP Address, we need to wait for the
|
900
|
+
# Network Interface to be completely removed with .result() so the
|
901
|
+
# dependency of Network Interface on Public IP Address is
|
902
|
+
# disassociated. This takes about ~1s.
|
903
|
+
_delete_nic_with_retries(network_client, resource_group, nic_name)
|
904
|
+
except Exception as e: # pylint: disable=broad-except
|
905
|
+
logger.warning('Failed to delete nic: {}'.format(e))
|
906
|
+
|
907
|
+
for public_ip_name in filtered_resources[_RESOURCE_PUBLIC_IP_ADDRESS_TYPE]:
|
908
|
+
try:
|
909
|
+
delete_public_ip_addresses(resource_group_name=resource_group,
|
910
|
+
public_ip_address_name=public_ip_name)
|
911
|
+
except Exception as e: # pylint: disable=broad-except
|
912
|
+
logger.warning('Failed to delete public ip: {}'.format(e))
|
913
|
+
|
914
|
+
for vnet_name in filtered_resources[_RESOURCE_VIRTUAL_NETWORK_TYPE]:
|
915
|
+
try:
|
916
|
+
delete_virtual_networks(resource_group_name=resource_group,
|
917
|
+
virtual_network_name=vnet_name)
|
918
|
+
except Exception as e: # pylint: disable=broad-except
|
919
|
+
logger.warning('Failed to delete vnet: {}'.format(e))
|
920
|
+
|
921
|
+
for msi_name in filtered_resources[_RESOURCE_MANAGED_IDENTITY_TYPE]:
|
922
|
+
user_assigned_identities = (
|
923
|
+
msi_client.user_assigned_identities.list_by_resource_group(
|
924
|
+
resource_group_name=resource_group))
|
925
|
+
for identity in user_assigned_identities:
|
926
|
+
if msi_name == identity.name:
|
927
|
+
# We use the principal_id to find the correct guid converted
|
928
|
+
# role assignment name because each managed identity has a
|
929
|
+
# unique principal_id, and role assignments are associated
|
930
|
+
# with security principals (like managed identities) via this
|
931
|
+
# principal_id.
|
932
|
+
target_principal_id = identity.principal_id
|
933
|
+
scope = (f'/subscriptions/{subscription_id}'
|
934
|
+
f'/resourceGroups/{resource_group}')
|
935
|
+
role_assignments = auth_client.role_assignments.list_for_scope(
|
936
|
+
scope)
|
937
|
+
for assignment in role_assignments:
|
938
|
+
if target_principal_id == assignment.principal_id:
|
939
|
+
guid_role_assignment_name = assignment.name
|
940
|
+
try:
|
941
|
+
delete_role_assignment(
|
942
|
+
scope=scope,
|
943
|
+
role_assignment_name=guid_role_assignment_name)
|
944
|
+
except Exception as e: # pylint: disable=broad-except
|
945
|
+
logger.warning('Failed to delete role '
|
946
|
+
'assignment: {}'.format(e))
|
947
|
+
break
|
948
|
+
try:
|
949
|
+
delete_managed_identity(resource_group_name=resource_group,
|
950
|
+
resource_name=msi_name)
|
951
|
+
except Exception as e: # pylint: disable=broad-except
|
952
|
+
logger.warning('Failed to delete msi: {}'.format(e))
|
953
|
+
|
954
|
+
for nsg_name in filtered_resources[_RESOURCE_NETWORK_SECURITY_GROUP_TYPE]:
|
955
|
+
try:
|
956
|
+
delete_network_security_group(resource_group_name=resource_group,
|
957
|
+
network_security_group_name=nsg_name)
|
958
|
+
except Exception as e: # pylint: disable=broad-except
|
959
|
+
logger.warning('Failed to delete nsg: {}'.format(e))
|
960
|
+
|
961
|
+
delete_deployment = _get_azure_sdk_function(
|
962
|
+
client=resource_client.deployments, function_name='begin_delete')
|
963
|
+
deployment_names = [
|
964
|
+
constants.EXTERNAL_RG_BOOTSTRAP_DEPLOYMENT_NAME.format(
|
965
|
+
cluster_name_on_cloud=cluster_name_on_cloud),
|
966
|
+
constants.EXTERNAL_RG_VM_DEPLOYMENT_NAME.format(
|
967
|
+
cluster_name_on_cloud=cluster_name_on_cloud)
|
968
|
+
]
|
969
|
+
for deployment_name in deployment_names:
|
970
|
+
try:
|
971
|
+
delete_deployment(resource_group_name=resource_group,
|
972
|
+
deployment_name=deployment_name)
|
973
|
+
except Exception as e: # pylint: disable=broad-except
|
974
|
+
logger.warning('Failed to delete deployment: {}'.format(e))
|
975
|
+
|
976
|
+
|
773
977
|
@common_utils.retry
|
774
978
|
def query_instances(
|
775
979
|
cluster_name_on_cloud: str,
|
@@ -842,66 +1046,67 @@ def open_ports(
|
|
842
1046
|
update_network_security_groups = _get_azure_sdk_function(
|
843
1047
|
client=network_client.network_security_groups,
|
844
1048
|
function_name='create_or_update')
|
1049
|
+
list_network_security_groups = _get_azure_sdk_function(
|
1050
|
+
client=network_client.network_security_groups, function_name='list')
|
845
1051
|
|
846
|
-
|
847
|
-
#
|
848
|
-
#
|
849
|
-
#
|
850
|
-
|
851
|
-
|
852
|
-
|
853
|
-
|
854
|
-
|
855
|
-
|
856
|
-
|
857
|
-
|
858
|
-
|
859
|
-
|
860
|
-
|
861
|
-
|
862
|
-
|
863
|
-
|
864
|
-
|
865
|
-
|
866
|
-
|
867
|
-
|
868
|
-
|
869
|
-
|
870
|
-
|
871
|
-
logger.info(f'NSG {nsg.name} is not created yet. Waiting for '
|
1052
|
+
for nsg in list_network_security_groups(resource_group):
|
1053
|
+
# Given resource group can contain network security groups that are
|
1054
|
+
# irrelevant to this provisioning especially with user specified
|
1055
|
+
# resource group at ~/.sky/config. So we make sure to check for the
|
1056
|
+
# completion of nsg relevant to the VM being provisioned.
|
1057
|
+
if cluster_name_on_cloud in nsg.name:
|
1058
|
+
try:
|
1059
|
+
# Wait for the NSG creation to finish before opening a port.
|
1060
|
+
# The cluster provisioning triggers the NSG creation, but it
|
1061
|
+
# may not be finished yet.
|
1062
|
+
backoff = common_utils.Backoff(max_backoff_factor=1)
|
1063
|
+
start_time = time.time()
|
1064
|
+
while True:
|
1065
|
+
if nsg.provisioning_state not in ['Creating', 'Updating']:
|
1066
|
+
break
|
1067
|
+
if time.time(
|
1068
|
+
) - start_time > _WAIT_CREATION_TIMEOUT_SECONDS:
|
1069
|
+
logger.warning(
|
1070
|
+
f'Fails to wait for the creation of NSG {nsg.name}'
|
1071
|
+
f' in {resource_group} within '
|
1072
|
+
f'{_WAIT_CREATION_TIMEOUT_SECONDS} seconds. '
|
1073
|
+
'Skip this NSG.')
|
1074
|
+
backoff_time = backoff.current_backoff()
|
1075
|
+
logger.info(
|
1076
|
+
f'NSG {nsg.name} is not created yet. Waiting for '
|
872
1077
|
f'{backoff_time} seconds before checking again.')
|
873
|
-
|
874
|
-
|
875
|
-
|
876
|
-
|
877
|
-
|
878
|
-
|
879
|
-
|
880
|
-
|
881
|
-
|
882
|
-
|
883
|
-
|
884
|
-
|
885
|
-
|
886
|
-
|
887
|
-
|
888
|
-
|
889
|
-
|
890
|
-
|
891
|
-
|
892
|
-
|
893
|
-
|
894
|
-
|
895
|
-
|
896
|
-
|
897
|
-
|
898
|
-
|
899
|
-
|
900
|
-
|
901
|
-
|
902
|
-
|
903
|
-
|
904
|
-
|
1078
|
+
time.sleep(backoff_time)
|
1079
|
+
|
1080
|
+
# Azure NSG rules have a priority field that determines the
|
1081
|
+
# order in which they are applied. The priority must be unique
|
1082
|
+
# across all inbound rules in one NSG.
|
1083
|
+
priority = max(rule.priority
|
1084
|
+
for rule in nsg.security_rules
|
1085
|
+
if rule.direction == 'Inbound') + 1
|
1086
|
+
nsg.security_rules.append(
|
1087
|
+
azure.create_security_rule(
|
1088
|
+
name=f'sky-ports-{cluster_name_on_cloud}-{priority}',
|
1089
|
+
priority=priority,
|
1090
|
+
protocol='Tcp',
|
1091
|
+
access='Allow',
|
1092
|
+
direction='Inbound',
|
1093
|
+
source_address_prefix='*',
|
1094
|
+
source_port_range='*',
|
1095
|
+
destination_address_prefix='*',
|
1096
|
+
destination_port_ranges=ports,
|
1097
|
+
))
|
1098
|
+
poller = update_network_security_groups(resource_group,
|
1099
|
+
nsg.name, nsg)
|
1100
|
+
poller.wait()
|
1101
|
+
if poller.status() != 'Succeeded':
|
1102
|
+
with ux_utils.print_exception_no_traceback():
|
1103
|
+
raise ValueError(f'Failed to open ports {ports} in NSG '
|
1104
|
+
f'{nsg.name}: {poller.status()}')
|
1105
|
+
except azure.exceptions().HttpResponseError as e:
|
1106
|
+
with ux_utils.print_exception_no_traceback():
|
1107
|
+
raise ValueError(
|
1108
|
+
f'Failed to open ports {ports} in NSG {nsg.name}.'
|
1109
|
+
) from e
|
905
1110
|
|
906
1111
|
|
907
1112
|
def cleanup_ports(
|
sky/provision/constants.py
CHANGED
@@ -16,3 +16,10 @@ WORKER_NODE_TAGS = {
|
|
16
16
|
TAG_RAY_NODE_KIND: 'worker',
|
17
17
|
TAG_SKYPILOT_HEAD_NODE: '0',
|
18
18
|
}
|
19
|
+
|
20
|
+
# Names for Azure Deployments.
|
21
|
+
DEPLOYMENT_NAME = 'skypilot-config'
|
22
|
+
LEGACY_DEPLOYMENT_NAME = 'ray-config'
|
23
|
+
EXTERNAL_RG_BOOTSTRAP_DEPLOYMENT_NAME = (
|
24
|
+
'skypilot-bootstrap-{cluster_name_on_cloud}')
|
25
|
+
EXTERNAL_RG_VM_DEPLOYMENT_NAME = 'skypilot-vm-{cluster_name_on_cloud}'
|
sky/provision/gcp/instance.py
CHANGED
@@ -632,13 +632,6 @@ def cleanup_ports(
|
|
632
632
|
del ports # Unused.
|
633
633
|
assert provider_config is not None, cluster_name_on_cloud
|
634
634
|
project_id = provider_config['project_id']
|
635
|
-
if 'ports' in provider_config:
|
636
|
-
# Backward compatibility for old provider config.
|
637
|
-
# TODO(tian): remove this after 2 minor releases, 0.6.0.
|
638
|
-
for port in provider_config['ports']:
|
639
|
-
firewall_rule_name = f'user-ports-{cluster_name_on_cloud}-{port}'
|
640
|
-
instance_utils.GCPComputeInstance.delete_firewall_rule(
|
641
|
-
project_id, firewall_rule_name)
|
642
635
|
if 'firewall_rule' in provider_config:
|
643
636
|
firewall_rule_name = provider_config['firewall_rule']
|
644
637
|
instance_utils.GCPComputeInstance.delete_firewall_rule(
|
sky/serve/core.py
CHANGED
@@ -572,8 +572,6 @@ def status(
|
|
572
572
|
'controller_port': (Optional[int]) controller port,
|
573
573
|
'load_balancer_port': (Optional[int]) load balancer port,
|
574
574
|
'policy': (Optional[str]) load balancer policy description,
|
575
|
-
'requested_resources': (sky.Resources) requested resources
|
576
|
-
for replica (deprecated),
|
577
575
|
'requested_resources_str': (str) str representation of
|
578
576
|
requested resources,
|
579
577
|
'replica_info': (List[Dict[str, Any]]) replica information,
|
sky/serve/serve_state.py
CHANGED
@@ -34,7 +34,7 @@ _DB_PATH: str = _get_db_path()
|
|
34
34
|
def create_table(cursor: 'sqlite3.Cursor', conn: 'sqlite3.Connection') -> None:
|
35
35
|
"""Creates the service and replica tables if they do not exist."""
|
36
36
|
|
37
|
-
# auto_restart column is deprecated.
|
37
|
+
# auto_restart and requested_resources columns are deprecated.
|
38
38
|
cursor.execute("""\
|
39
39
|
CREATE TABLE IF NOT EXISTS services (
|
40
40
|
name TEXT PRIMARY KEY,
|
@@ -323,8 +323,8 @@ def set_service_load_balancer_port(service_name: str,
|
|
323
323
|
|
324
324
|
def _get_service_from_row(row) -> Dict[str, Any]:
|
325
325
|
(current_version, name, controller_job_id, controller_port,
|
326
|
-
load_balancer_port, status, uptime, policy, _,
|
327
|
-
|
326
|
+
load_balancer_port, status, uptime, policy, _, _, requested_resources_str,
|
327
|
+
_, active_versions) = row[:13]
|
328
328
|
return {
|
329
329
|
'name': name,
|
330
330
|
'controller_job_id': controller_job_id,
|
@@ -340,10 +340,6 @@ def _get_service_from_row(row) -> Dict[str, Any]:
|
|
340
340
|
# The versions that are active for the load balancer. This is a list of
|
341
341
|
# integers in json format. This is mainly for display purpose.
|
342
342
|
'active_versions': json.loads(active_versions),
|
343
|
-
# TODO(tian): Backward compatibility.
|
344
|
-
# Remove after 2 minor release, 0.6.0.
|
345
|
-
'requested_resources': pickle.loads(requested_resources)
|
346
|
-
if requested_resources is not None else None,
|
347
343
|
'requested_resources_str': requested_resources_str,
|
348
344
|
}
|
349
345
|
|
sky/serve/serve_utils.py
CHANGED
@@ -825,12 +825,7 @@ def format_service_table(service_records: List[Dict[str, Any]],
|
|
825
825
|
replicas = _get_replicas(record)
|
826
826
|
endpoint = get_endpoint(record)
|
827
827
|
policy = record['policy']
|
828
|
-
|
829
|
-
# Remove `requested_resources` field after 2 minor release, 0.6.0.
|
830
|
-
if record.get('requested_resources_str') is None:
|
831
|
-
requested_resources_str = str(record['requested_resources'])
|
832
|
-
else:
|
833
|
-
requested_resources_str = record['requested_resources_str']
|
828
|
+
requested_resources_str = record['requested_resources_str']
|
834
829
|
|
835
830
|
service_values = [
|
836
831
|
service_name,
|
@@ -1004,15 +999,8 @@ class ServeCodeGen:
|
|
1004
999
|
@classmethod
|
1005
1000
|
def update_service(cls, service_name: str, version: int, mode: str) -> str:
|
1006
1001
|
code = [
|
1007
|
-
# Backward compatibility for old serve version on the remote
|
1008
|
-
# machine. The `mode` argument was added in #3249, and if the remote
|
1009
|
-
# machine has an old SkyPilot version before that, we need to avoid
|
1010
|
-
# passing the `mode` argument to the job_lib functions.
|
1011
|
-
# TODO(zhwu): Remove this in 0.7.0 release.
|
1012
|
-
f'mode_kwargs = {{"mode": {mode!r}}} '
|
1013
|
-
'if getattr(constants, "SERVE_VERSION", 0) >= 1 else {}',
|
1014
1002
|
f'msg = serve_utils.update_service_encoded({service_name!r}, '
|
1015
|
-
f'{version},
|
1003
|
+
f'{version}, mode={mode!r})',
|
1016
1004
|
'print(msg, end="", flush=True)',
|
1017
1005
|
]
|
1018
1006
|
return cls._build(code)
|
sky/serve/service_spec.py
CHANGED
@@ -29,13 +29,6 @@ class SkyServiceSpec:
|
|
29
29
|
base_ondemand_fallback_replicas: Optional[int] = None,
|
30
30
|
upscale_delay_seconds: Optional[int] = None,
|
31
31
|
downscale_delay_seconds: Optional[int] = None,
|
32
|
-
# The following arguments are deprecated.
|
33
|
-
# TODO(ziming): remove this after 2 minor release, i.e. 0.6.0.
|
34
|
-
# Deprecated: Always be True
|
35
|
-
auto_restart: Optional[bool] = None,
|
36
|
-
# Deprecated: replaced by the target_qps_per_replica.
|
37
|
-
qps_upper_threshold: Optional[float] = None,
|
38
|
-
qps_lower_threshold: Optional[float] = None,
|
39
32
|
) -> None:
|
40
33
|
if max_replicas is not None and max_replicas < min_replicas:
|
41
34
|
with ux_utils.print_exception_no_traceback():
|
@@ -62,21 +55,6 @@ class SkyServiceSpec:
|
|
62
55
|
raise ValueError('readiness_path must start with a slash (/). '
|
63
56
|
f'Got: {readiness_path}')
|
64
57
|
|
65
|
-
# TODO(tian): Following field are deprecated. Remove after 2 minor
|
66
|
-
# release, i.e. 0.6.0.
|
67
|
-
if qps_upper_threshold is not None or qps_lower_threshold is not None:
|
68
|
-
with ux_utils.print_exception_no_traceback():
|
69
|
-
raise ValueError(
|
70
|
-
'Field `qps_upper_threshold` and `qps_lower_threshold`'
|
71
|
-
'under `replica_policy` are deprecated. '
|
72
|
-
'Please use target_qps_per_replica instead.')
|
73
|
-
if auto_restart is not None:
|
74
|
-
with ux_utils.print_exception_no_traceback():
|
75
|
-
raise ValueError(
|
76
|
-
'Field `auto_restart` under `replica_policy` is deprecated.'
|
77
|
-
'Currently, SkyServe will cleanup failed replicas'
|
78
|
-
'and auto restart it to keep the service running.')
|
79
|
-
|
80
58
|
self._readiness_path: str = readiness_path
|
81
59
|
self._initial_delay_seconds: int = initial_delay_seconds
|
82
60
|
self._readiness_timeout_seconds: int = readiness_timeout_seconds
|
@@ -160,14 +138,8 @@ class SkyServiceSpec:
|
|
160
138
|
service_config['min_replicas'] = policy_section['min_replicas']
|
161
139
|
service_config['max_replicas'] = policy_section.get(
|
162
140
|
'max_replicas', None)
|
163
|
-
service_config['qps_upper_threshold'] = policy_section.get(
|
164
|
-
'qps_upper_threshold', None)
|
165
|
-
service_config['qps_lower_threshold'] = policy_section.get(
|
166
|
-
'qps_lower_threshold', None)
|
167
141
|
service_config['target_qps_per_replica'] = policy_section.get(
|
168
142
|
'target_qps_per_replica', None)
|
169
|
-
service_config['auto_restart'] = policy_section.get(
|
170
|
-
'auto_restart', None)
|
171
143
|
service_config['upscale_delay_seconds'] = policy_section.get(
|
172
144
|
'upscale_delay_seconds', None)
|
173
145
|
service_config['downscale_delay_seconds'] = policy_section.get(
|
sky/skylet/job_lib.py
CHANGED
@@ -827,14 +827,6 @@ class JobLibCodeGen:
|
|
827
827
|
'import os',
|
828
828
|
'import getpass',
|
829
829
|
'from sky.skylet import job_lib, log_lib, constants',
|
830
|
-
# Backward compatibility for old skylet lib version on the remote
|
831
|
-
# machine. The `job_owner` argument was removed in #3037, and if the
|
832
|
-
# remote machine has an old SkyPilot version before that, we need to
|
833
|
-
# pass the `job_owner` argument to the job_lib functions.
|
834
|
-
# TODO(zhwu): Remove this in 0.7.0 release.
|
835
|
-
'job_owner_kwargs = {} '
|
836
|
-
'if getattr(constants, "SKYLET_LIB_VERSION", 0) >= 1 '
|
837
|
-
'else {"job_owner": getpass.getuser()}',
|
838
830
|
]
|
839
831
|
|
840
832
|
@classmethod
|
@@ -861,7 +853,7 @@ class JobLibCodeGen:
|
|
861
853
|
|
862
854
|
@classmethod
|
863
855
|
def update_status(cls) -> str:
|
864
|
-
code = ['job_lib.update_status(
|
856
|
+
code = ['job_lib.update_status()']
|
865
857
|
return cls._build(code)
|
866
858
|
|
867
859
|
@classmethod
|
@@ -879,7 +871,7 @@ class JobLibCodeGen:
|
|
879
871
|
"""See job_lib.cancel_jobs()."""
|
880
872
|
code = [
|
881
873
|
(f'cancelled = job_lib.cancel_jobs_encoded_results('
|
882
|
-
f' {job_ids!r}, {cancel_all}
|
874
|
+
f' {job_ids!r}, {cancel_all})'),
|
883
875
|
# Print cancelled IDs. Caller should parse by decoding.
|
884
876
|
'print(cancelled, flush=True)',
|
885
877
|
]
|
@@ -902,7 +894,7 @@ class JobLibCodeGen:
|
|
902
894
|
'run_timestamp = job_lib.get_run_timestamp(job_id)',
|
903
895
|
f'log_dir = None if run_timestamp is None else os.path.join({constants.SKY_LOGS_DIRECTORY!r}, run_timestamp)',
|
904
896
|
f'log_lib.tail_logs(job_id=job_id, log_dir=log_dir, '
|
905
|
-
f'managed_job_id={managed_job_id!r}, follow={follow}
|
897
|
+
f'managed_job_id={managed_job_id!r}, follow={follow})',
|
906
898
|
]
|
907
899
|
return cls._build(code)
|
908
900
|
|
sky/skylet/log_lib.py
CHANGED
@@ -186,20 +186,11 @@ def run_with_log(
|
|
186
186
|
daemon_script = os.path.join(
|
187
187
|
os.path.dirname(os.path.abspath(job_lib.__file__)),
|
188
188
|
'subprocess_daemon.py')
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
# not updated. We fallback to `python3` in this case.
|
195
|
-
# TODO(zhwu): remove this after 0.7.0.
|
196
|
-
python_path = 'python3'
|
197
|
-
else:
|
198
|
-
python_path = subprocess.check_output(
|
199
|
-
constants.SKY_GET_PYTHON_PATH_CMD,
|
200
|
-
shell=True,
|
201
|
-
stderr=subprocess.DEVNULL,
|
202
|
-
encoding='utf-8').strip()
|
189
|
+
python_path = subprocess.check_output(
|
190
|
+
constants.SKY_GET_PYTHON_PATH_CMD,
|
191
|
+
shell=True,
|
192
|
+
stderr=subprocess.DEVNULL,
|
193
|
+
encoding='utf-8').strip()
|
203
194
|
daemon_cmd = [
|
204
195
|
python_path,
|
205
196
|
daemon_script,
|
sky/templates/azure-ray.yml.j2
CHANGED
@@ -34,6 +34,7 @@ provider:
|
|
34
34
|
# instead of the cluster_name. This ensures that ray creates new instances
|
35
35
|
# for different cluster_name.
|
36
36
|
resource_group: {{resource_group}}
|
37
|
+
use_external_resource_group: {{use_external_resource_group}}
|
37
38
|
# Keep (otherwise cannot reuse when re-provisioning).
|
38
39
|
# teardown(terminate=True) will override this.
|
39
40
|
cache_stopped_nodes: True
|
sky/utils/schemas.py
CHANGED
@@ -357,19 +357,6 @@ def get_service_schema():
|
|
357
357
|
'downscale_delay_seconds': {
|
358
358
|
'type': 'number',
|
359
359
|
},
|
360
|
-
# TODO(MaoZiming): Fields `qps_upper_threshold`,
|
361
|
-
# `qps_lower_threshold` and `auto_restart` are deprecated.
|
362
|
-
# Temporarily keep these fields for backward compatibility.
|
363
|
-
# Remove after 2 minor release, i.e., 0.6.0.
|
364
|
-
'auto_restart': {
|
365
|
-
'type': 'boolean',
|
366
|
-
},
|
367
|
-
'qps_upper_threshold': {
|
368
|
-
'type': 'number',
|
369
|
-
},
|
370
|
-
'qps_lower_threshold': {
|
371
|
-
'type': 'number',
|
372
|
-
},
|
373
360
|
}
|
374
361
|
},
|
375
362
|
'replicas': {
|
@@ -595,7 +582,7 @@ _NETWORK_CONFIG_SCHEMA = {
|
|
595
582
|
|
596
583
|
_LABELS_SCHEMA = {
|
597
584
|
# Deprecated: 'instance_tags' is replaced by 'labels'. Keeping for backward
|
598
|
-
# compatibility. Will be removed after 0.
|
585
|
+
# compatibility. Will be removed after 0.8.0.
|
599
586
|
'instance_tags': {
|
600
587
|
'type': 'object',
|
601
588
|
'required': [],
|
@@ -771,6 +758,9 @@ def get_config_schema():
|
|
771
758
|
'storage_account': {
|
772
759
|
'type': 'string',
|
773
760
|
},
|
761
|
+
'resource_group_vm': {
|
762
|
+
'type': 'string',
|
763
|
+
},
|
774
764
|
}
|
775
765
|
},
|
776
766
|
'kubernetes': {
|