skypilot-nightly 1.0.0.dev20241028__py3-none-any.whl → 1.0.0.dev20241029__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/adaptors/azure.py +3 -0
- sky/backends/backend_utils.py +10 -133
- sky/backends/cloud_vm_ray_backend.py +4 -102
- sky/clouds/azure.py +10 -1
- sky/optimizer.py +11 -7
- sky/provision/azure/azure-config-template.json +7 -1
- sky/provision/azure/config.py +65 -45
- sky/provision/azure/instance.py +275 -70
- sky/provision/constants.py +7 -0
- sky/provision/gcp/instance.py +0 -7
- sky/serve/core.py +0 -2
- sky/serve/serve_state.py +3 -7
- sky/serve/serve_utils.py +2 -14
- sky/serve/service_spec.py +0 -28
- sky/skylet/job_lib.py +3 -11
- sky/skylet/log_lib.py +5 -14
- sky/templates/azure-ray.yml.j2 +1 -0
- sky/utils/schemas.py +4 -14
- {skypilot_nightly-1.0.0.dev20241028.dist-info → skypilot_nightly-1.0.0.dev20241029.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20241028.dist-info → skypilot_nightly-1.0.0.dev20241029.dist-info}/RECORD +25 -25
- {skypilot_nightly-1.0.0.dev20241028.dist-info → skypilot_nightly-1.0.0.dev20241029.dist-info}/WHEEL +1 -1
- {skypilot_nightly-1.0.0.dev20241028.dist-info → skypilot_nightly-1.0.0.dev20241029.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20241028.dist-info → skypilot_nightly-1.0.0.dev20241029.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20241028.dist-info → skypilot_nightly-1.0.0.dev20241029.dist-info}/top_level.txt +0 -0
sky/__init__.py
CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
 import urllib.request
 
 # Replaced with the current commit when building the wheels.
-_SKYPILOT_COMMIT_SHA = '
+_SKYPILOT_COMMIT_SHA = '47ebae73e972c65de6e87aa7556220e515f2fc5e'
 
 
 def _get_git_commit():
@@ -35,7 +35,7 @@ def _get_git_commit():
 
 
 __commit__ = _get_git_commit()
-__version__ = '1.0.0.
+__version__ = '1.0.0.dev20241029'
 __root_dir__ = os.path.dirname(os.path.abspath(__file__))
 
 
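These two hunks are the routine nightly stamping of build metadata. A minimal check of what an installed wheel was built from (illustrative; for a wheel install, __commit__ should resolve to the stamped SHA rather than a live git lookup):

# Quick sanity check of the stamped metadata after installing this nightly.
import sky

print(sky.__version__)  # expected: 1.0.0.dev20241029
print(sky.__commit__)   # expected: the stamped commit SHA above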
sky/adaptors/azure.py
CHANGED
@@ -131,6 +131,9 @@ def get_client(name: str,
         from azure.mgmt import authorization
         return authorization.AuthorizationManagementClient(
             credential, subscription_id)
+    elif name == 'msi':
+        from azure.mgmt import msi
+        return msi.ManagedServiceIdentityClient(credential, subscription_id)
     elif name == 'graph':
         import msgraph
         return msgraph.GraphServiceClient(credential)
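The adaptor gains an 'msi' client so the Azure provisioner can manage user-assigned managed identities. A rough sketch of what resolving that client amounts to, assuming azure-identity and azure-mgmt-msi are installed; this is illustrative and not the adaptor's actual caching or credential machinery:

# Illustrative only: build a ManagedServiceIdentityClient the way the new
# 'msi' branch does, deferring the heavy Azure SDK import until needed.
from azure.identity import DefaultAzureCredential


def get_msi_client(subscription_id: str):
    from azure.mgmt import msi  # deferred import, mirroring the adaptor style
    credential = DefaultAzureCredential()
    return msi.ManagedServiceIdentityClient(credential, subscription_id)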
sky/backends/backend_utils.py
CHANGED
@@ -401,6 +401,8 @@ class SSHConfigHelper(object):
 
     ssh_conf_path = '~/.ssh/config'
     ssh_conf_lock_path = os.path.expanduser('~/.sky/ssh_config.lock')
+    ssh_conf_per_cluster_lock_path = os.path.expanduser(
+        '~/.sky/ssh_config_{}.lock')
     ssh_cluster_path = SKY_USER_FILE_PATH + '/ssh/{}'
 
     @classmethod
@@ -486,12 +488,6 @@ class SSHConfigHelper(object):
 
         config_path = os.path.expanduser(cls.ssh_conf_path)
 
-        # For backward compatibility: before #2706, we wrote the config of SkyPilot clusters
-        # directly in ~/.ssh/config. For these clusters, we remove the config in ~/.ssh/config
-        # and write/overwrite the config in ~/.sky/ssh/<cluster_name> instead.
-        cls._remove_stale_cluster_config_for_backward_compatibility(
-            cluster_name, ip, auth_config, docker_user)
-
         if not os.path.exists(config_path):
             config = ['\n']
             with open(config_path,
@@ -560,139 +556,20 @@ class SSHConfigHelper(object):
             f.write(codegen)
 
     @classmethod
-    def _remove_stale_cluster_config_for_backward_compatibility(
-        cls,
-        cluster_name: str,
-        ip: str,
-        auth_config: Dict[str, str],
-        docker_user: Optional[str] = None,
-    ):
-        """Remove authentication information for cluster from local SSH config.
-
-        If no existing host matching the provided specification is found, then
-        nothing is removed.
-
-        Args:
-            ip: Head node's IP address.
-            auth_config: read_yaml(handle.cluster_yaml)['auth']
-            docker_user: If not None, use this user to ssh into the docker
-        """
-        username = auth_config['ssh_user']
-        config_path = os.path.expanduser(cls.ssh_conf_path)
-        cluster_config_path = os.path.expanduser(
-            cls.ssh_cluster_path.format(cluster_name))
-        if not os.path.exists(config_path):
-            return
-
-        with open(config_path, 'r', encoding='utf-8') as f:
-            config = f.readlines()
-
-        start_line_idx = None
-
-        # Scan the config for the cluster name.
-        for i, line in enumerate(config):
-            next_line = config[i + 1] if i + 1 < len(config) else ''
-            if docker_user is None:
-                found = (line.strip() == f'HostName {ip}' and
-                         next_line.strip() == f'User {username}')
-            else:
-                found = (line.strip() == 'HostName localhost' and
-                         next_line.strip() == f'User {docker_user}')
-            if found:
-                # Find the line starting with ProxyCommand and contains the ip
-                found = False
-                for idx in range(i, len(config)):
-                    # Stop if we reach an empty line, which means a new host
-                    if not config[idx].strip():
-                        break
-                    if config[idx].strip().startswith('ProxyCommand'):
-                        proxy_command_line = config[idx].strip()
-                        if proxy_command_line.endswith(f'@{ip}'):
-                            found = True
-                            break
-            if found:
-                start_line_idx = i - 1
-                break
-
-        if start_line_idx is not None:
-            # Scan for end of previous config.
-            cursor = start_line_idx
-            while cursor > 0 and len(config[cursor].strip()) > 0:
-                cursor -= 1
-            prev_end_line_idx = cursor
-
-            # Scan for end of the cluster config.
-            end_line_idx = None
-            cursor = start_line_idx + 1
-            start_line_idx -= 1  # remove auto-generated comment
-            while cursor < len(config):
-                if config[cursor].strip().startswith(
-                        '# ') or config[cursor].strip().startswith('Host '):
-                    end_line_idx = cursor
-                    break
-                cursor += 1
-
-            # Remove sky-generated config and update the file.
-            config[prev_end_line_idx:end_line_idx] = [
-                '\n'
-            ] if end_line_idx is not None else []
-            with open(config_path, 'w', encoding='utf-8') as f:
-                f.write(''.join(config).strip())
-                f.write('\n' * 2)
-
-        # Delete include statement if it exists in the config.
-        sky_autogen_comment = ('# Added by sky (use `sky stop/down '
-                               f'{cluster_name}` to remove)')
-        with open(config_path, 'r', encoding='utf-8') as f:
-            config = f.readlines()
-
-        for i, line in enumerate(config):
-            config_str = line.strip()
-            if f'Include {cluster_config_path}' in config_str:
-                with open(config_path, 'w', encoding='utf-8') as f:
-                    if i < len(config) - 1 and config[i + 1] == '\n':
-                        del config[i + 1]
-                    # Delete Include string
-                    del config[i]
-                    # Delete Sky Autogen Comment
-                    if i > 0 and sky_autogen_comment in config[i - 1].strip():
-                        del config[i - 1]
-                    f.write(''.join(config))
-                break
-            if 'Host' in config_str:
-                break
-
-    @classmethod
-    # TODO: We can remove this after 0.6.0 and have a lock only per cluster.
-    @timeline.FileLockEvent(ssh_conf_lock_path)
-    def remove_cluster(
-        cls,
-        cluster_name: str,
-        ip: str,
-        auth_config: Dict[str, str],
-        docker_user: Optional[str] = None,
-    ):
+    def remove_cluster(cls, cluster_name: str):
         """Remove authentication information for cluster from ~/.sky/ssh/<cluster_name>.
 
-        For backward compatibility also remove the config from ~/.ssh/config if it exists.
-
         If no existing host matching the provided specification is found, then
         nothing is removed.
 
         Args:
-
-            auth_config: read_yaml(handle.cluster_yaml)['auth']
-            docker_user: If not None, use this user to ssh into the docker
+            cluster_name: Cluster name.
         """
-
-
-
-
-
-        # directly in ~/.ssh/config. For these clusters, we should clean up the config.
-        # TODO: Remove this after 0.6.0
-        cls._remove_stale_cluster_config_for_backward_compatibility(
-            cluster_name, ip, auth_config, docker_user)
+        with timeline.FileLockEvent(
+                cls.ssh_conf_per_cluster_lock_path.format(cluster_name)):
+            cluster_config_path = os.path.expanduser(
+                cls.ssh_cluster_path.format(cluster_name))
+            common_utils.remove_file_if_exists(cluster_config_path)
 
 
 def _replace_yaml_dicts(
@@ -867,7 +744,7 @@ def write_cluster_config(
     labels = skypilot_config.get_nested((str(cloud).lower(), 'labels'), {})
     # Deprecated: instance_tags have been replaced by labels. For backward
    # compatibility, we support them and the schema allows them only if
-    # `labels` are not specified. This should be removed after 0.
+    # `labels` are not specified. This should be removed after 0.8.0.
     labels = skypilot_config.get_nested((str(cloud).lower(), 'instance_tags'),
                                         labels)
     # labels is a dict, which is guaranteed by the type check in
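The net effect of the SSHConfigHelper change above is that removing a cluster's SSH config no longer needs the head IP, auth config, or docker user, and no longer serializes on the single ~/.sky/ssh_config.lock. A simplified sketch of the new behavior, with illustrative paths standing in for SKY_USER_FILE_PATH and the per-cluster lock template (not the actual SkyPilot implementation):

# Simplified sketch: per-cluster SSH config removal under a per-cluster lock.
import os

import filelock

# Illustrative paths; SkyPilot derives these from SKY_USER_FILE_PATH and
# ssh_conf_per_cluster_lock_path.
SSH_CLUSTER_PATH = os.path.expanduser('~/.sky/generated/ssh/{}')
PER_CLUSTER_LOCK_PATH = os.path.expanduser('~/.sky/ssh_config_{}.lock')


def remove_cluster_ssh_config(cluster_name: str) -> None:
    # Locking per cluster means concurrent teardowns of different clusters
    # do not block each other on one global SSH config lock.
    with filelock.FileLock(PER_CLUSTER_LOCK_PATH.format(cluster_name)):
        path = SSH_CLUSTER_PATH.format(cluster_name)
        if os.path.exists(path):
            os.remove(path)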
sky/backends/cloud_vm_ray_backend.py
CHANGED
@@ -2118,13 +2118,8 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
         stable_internal_external_ips: Optional[List[Tuple[str,
                                                           str]]] = None,
         stable_ssh_ports: Optional[List[int]] = None,
-        cluster_info: Optional[provision_common.ClusterInfo] = None
-
-        # API handles the TPU node creation/deletion.
-        # Backward compatibility for TPU nodes created before #2943.
-        # TODO (zhwu): Remove this after 0.6.0.
-        tpu_create_script: Optional[str] = None,
-        tpu_delete_script: Optional[str] = None) -> None:
+        cluster_info: Optional[provision_common.ClusterInfo] = None
+    ) -> None:
         self._version = self._VERSION
         self.cluster_name = cluster_name
         self.cluster_name_on_cloud = cluster_name_on_cloud
@@ -2139,12 +2134,6 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
         self.launched_nodes = launched_nodes
         self.launched_resources = launched_resources
         self.docker_user: Optional[str] = None
-        # Deprecated. SkyPilot new provisioner API handles the TPU node
-        # creation/deletion.
-        # Backward compatibility for TPU nodes created before #2943.
-        # TODO (zhwu): Remove this after 0.6.0.
-        self.tpu_create_script = tpu_create_script
-        self.tpu_delete_script = tpu_delete_script
 
     def __repr__(self):
         return (f'ResourceHandle('
@@ -2160,10 +2149,7 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
                 f'\n\tlaunched_resources={self.launched_nodes}x '
                 f'{self.launched_resources}, '
                 f'\n\tdocker_user={self.docker_user},'
-                f'\n\tssh_user={self.ssh_user}
-                # TODO (zhwu): Remove this after 0.6.0.
-                f'\n\ttpu_create_script={self.tpu_create_script}, '
-                f'\n\ttpu_delete_script={self.tpu_delete_script})')
+                f'\n\tssh_user={self.ssh_user}')
 
     def get_cluster_name(self):
         return self.cluster_name
@@ -2176,26 +2162,6 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
         return common_utils.read_yaml(self.cluster_yaml).get(
             'provider', {}).get('use_internal_ips', False)
 
-    def _update_cluster_region(self):
-        """Update the region in handle.launched_resources.
-
-        This is for backward compatibility to handle the clusters launched
-        long before. We should remove this after 0.6.0.
-        """
-        if self.launched_resources.region is not None:
-            return
-
-        config = common_utils.read_yaml(self.cluster_yaml)
-        provider = config['provider']
-        cloud = self.launched_resources.cloud
-        if cloud.is_same_cloud(clouds.Azure()):
-            region = provider['location']
-        elif cloud.is_same_cloud(clouds.GCP()) or cloud.is_same_cloud(
-                clouds.AWS()):
-            region = provider['region']
-
-        self.launched_resources = self.launched_resources.copy(region=region)
-
     def update_ssh_ports(self, max_attempts: int = 1) -> None:
         """Fetches and sets the SSH ports for the cluster nodes.
 
@@ -2567,8 +2533,6 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
         if version < 4:
             self.update_ssh_ports()
 
-        self._update_cluster_region()
-
         if version < 8:
             try:
                 self._update_cluster_info()
@@ -2649,8 +2613,6 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         if record is not None:
             usage_lib.messages.usage.update_cluster_status(record['status'])
 
-        # Backward compatibility: the old launched_resources without region info
-        # was handled by ResourceHandle._update_cluster_region.
         assert launched_resources.region is not None, handle
 
         mismatch_str = (f'To fix: specify a new cluster name, or down the '
@@ -3585,9 +3547,6 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             backend_utils.CLUSTER_STATUS_LOCK_PATH.format(cluster_name))
 
         try:
-            # TODO(mraheja): remove pylint disabling when filelock
-            # version updated
-            # pylint: disable=abstract-class-instantiated
             with filelock.FileLock(
                     lock_path,
                     backend_utils.CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS):
@@ -4096,55 +4055,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         * Removing ssh configs for the cluster;
         * Updating the local state of the cluster;
         * Removing the terminated cluster's scripts and ray yaml files.
-
-        Raises:
-            RuntimeError: If it fails to delete the TPU.
         """
-        log_path = os.path.join(os.path.expanduser(self.log_dir),
-                                'teardown.log')
-        log_abs_path = os.path.abspath(log_path)
         cluster_name_on_cloud = handle.cluster_name_on_cloud
 
-        # Backward compatibility for TPU nodes created before #2943. Any TPU
-        # node launched before that PR have the delete script generated (and do
-        # not have the tpu_node config set in its cluster yaml), so we have to
-        # call the deletion script to clean up the TPU node.
-        # For TPU nodes launched after the PR, deletion is done in SkyPilot's
-        # new GCP provisioner API.
-        # TODO (zhwu): Remove this after 0.6.0.
-        if (handle.tpu_delete_script is not None and
-                os.path.exists(handle.tpu_delete_script)):
-            # Only call the deletion script if the cluster config does not
-            # contain TPU node config. Otherwise, the deletion should
-            # already be handled by the new provisioner.
-            config = common_utils.read_yaml(handle.cluster_yaml)
-            tpu_node_config = config['provider'].get('tpu_node')
-            if tpu_node_config is None:
-                with rich_utils.safe_status(
-                        ux_utils.spinner_message('Terminating TPU')):
-                    tpu_rc, tpu_stdout, tpu_stderr = log_lib.run_with_log(
-                        ['bash', handle.tpu_delete_script],
-                        log_abs_path,
-                        stream_logs=False,
-                        require_outputs=True)
-                if tpu_rc != 0:
-                    if _TPU_NOT_FOUND_ERROR in tpu_stderr:
-                        logger.info('TPU not found. '
-                                    'It should have been deleted already.')
-                    elif purge:
-                        logger.warning(
-                            _TEARDOWN_PURGE_WARNING.format(
-                                reason='stopping/terminating TPU',
-                                details=tpu_stderr))
-                    else:
-                        raise RuntimeError(
-                            _TEARDOWN_FAILURE_MESSAGE.format(
-                                extra_reason='It is caused by TPU failure.',
-                                cluster_name=common_utils.cluster_name_in_hint(
-                                    handle.cluster_name, cluster_name_on_cloud),
-                                stdout=tpu_stdout,
-                                stderr=tpu_stderr))
-
         if (terminate and handle.launched_resources.is_image_managed is True):
             # Delete the image when terminating a "cloned" cluster, i.e.,
             # whose image is created by SkyPilot (--clone-disk-from)
@@ -4189,11 +4102,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             # The cluster file must exist because the cluster_yaml will only
             # be removed after the cluster entry in the database is removed.
             config = common_utils.read_yaml(handle.cluster_yaml)
-
-            backend_utils.SSHConfigHelper.remove_cluster(handle.cluster_name,
-                                                         handle.head_ip,
-                                                         auth_config,
-                                                         handle.docker_user)
+            backend_utils.SSHConfigHelper.remove_cluster(handle.cluster_name)
 
         global_user_state.remove_cluster(handle.cluster_name,
                                          terminate=terminate)
@@ -4202,13 +4111,6 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         # This function could be directly called from status refresh,
         # where we need to cleanup the cluster profile.
         metadata_utils.remove_cluster_metadata(handle.cluster_name)
-        # Clean up TPU creation/deletion scripts
-        # Backward compatibility for TPU nodes created before #2943.
-        # TODO (zhwu): Remove this after 0.6.0.
-        if handle.tpu_delete_script is not None:
-            assert handle.tpu_create_script is not None
-            common_utils.remove_file_if_exists(handle.tpu_create_script)
-            common_utils.remove_file_if_exists(handle.tpu_delete_script)
 
         # Clean up generated config
         # No try-except is needed since Ray will fail to teardown the
sky/clouds/azure.py
CHANGED
@@ -12,6 +12,7 @@ import colorama
 from sky import clouds
 from sky import exceptions
 from sky import sky_logging
+from sky import skypilot_config
 from sky.adaptors import azure
 from sky.clouds import service_catalog
 from sky.clouds.utils import azure_utils
@@ -353,6 +354,13 @@ class Azure(clouds.Cloud):
         need_nvidia_driver_extension = (acc_dict is not None and
                                         'A10' in acc_dict)
 
+        # Determine resource group for deploying the instance.
+        resource_group_name = skypilot_config.get_nested(
+            ('azure', 'resource_group_vm'), None)
+        use_external_resource_group = resource_group_name is not None
+        if resource_group_name is None:
+            resource_group_name = f'{cluster_name.name_on_cloud}-{region_name}'
+
         # Setup commands to eliminate the banner and restart sshd.
         # This script will modify /etc/ssh/sshd_config and add a bash script
         # into .bashrc. The bash script will restart sshd if it has not been
@@ -409,7 +417,8 @@
             'disk_tier': Azure._get_disk_type(disk_tier),
             'cloud_init_setup_commands': cloud_init_setup_commands,
             'azure_subscription_id': self.get_project_id(dryrun),
-            'resource_group':
+            'resource_group': resource_group_name,
+            'use_external_resource_group': use_external_resource_group,
         }
 
         # Setting disk performance tier for high disk tier.
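The change above lets users pin Azure VMs to a pre-existing resource group via an `azure.resource_group_vm` entry in the SkyPilot config; otherwise a per-cluster group name is derived as before. A small sketch of that selection logic (names are illustrative, not SkyPilot's public API):

# Sketch of the resource-group selection introduced above.
from typing import Optional, Tuple


def pick_resource_group(configured_rg: Optional[str],
                        cluster_name_on_cloud: str,
                        region_name: str) -> Tuple[str, bool]:
    """Return (resource_group_name, use_external_resource_group)."""
    if configured_rg is not None:
        # User-specified group already exists; SkyPilot should not create it.
        return configured_rg, True
    # Default: derive a per-cluster, per-region group name.
    return f'{cluster_name_on_cloud}-{region_name}', False


# pick_resource_group(None, 'sky-abcd-user', 'eastus')
#   -> ('sky-abcd-user-eastus', False)
# pick_resource_group('my-shared-rg', 'sky-abcd-user', 'eastus')
#   -> ('my-shared-rg', True)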
sky/optimizer.py
CHANGED
@@ -831,13 +831,17 @@ class Optimizer:
             return row
 
         def _get_resource_group_hash(resources: 'resources_lib.Resources'):
-
-            {
-
-
-
-
+            resource_key_dict = {
+                'cloud': f'{resources.cloud}',
+                'accelerators': f'{resources.accelerators}',
+                'use_spot': resources.use_spot
+            }
+            if isinstance(resources.cloud, clouds.Kubernetes):
+                # Region for Kubernetes is the context name, i.e. different
+                # Kubernetes clusters. We add region to the key to show all the
+                # Kubernetes clusters in the optimizer table for better UX.
+                resource_key_dict['region'] = resources.region
+            return json.dumps(resource_key_dict, sort_keys=True)
 
         # Print the list of resouces that the optimizer considered.
         resource_fields = [
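The grouping key used by the optimizer table is now a sorted JSON dump that also includes the region when the cloud is Kubernetes, so different Kubernetes contexts appear as separate rows. A self-contained illustration with plain values instead of SkyPilot Resources objects:

# Standalone illustration of the grouping-key behavior described above.
import json
from typing import Optional


def resource_group_key(cloud: str,
                       accelerators: Optional[str],
                       use_spot: bool,
                       region: Optional[str] = None,
                       is_kubernetes: bool = False) -> str:
    key = {
        'cloud': cloud,
        'accelerators': f'{accelerators}',
        'use_spot': use_spot,
    }
    if is_kubernetes:
        # Kubernetes "regions" are context names, so keeping the region in
        # the key preserves one row per Kubernetes cluster.
        key['region'] = region
    return json.dumps(key, sort_keys=True)


assert (resource_group_key('Kubernetes', None, False, 'ctx-a', True) !=
        resource_group_key('Kubernetes', None, False, 'ctx-b', True))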
sky/provision/azure/azure-config-template.json
CHANGED
@@ -14,6 +14,12 @@
         "description": "Subnet parameters."
       }
     },
+    "location": {
+      "type": "string",
+      "metadata": {
+        "description": "Location of where the resources are allocated."
+      }
+    },
     "nsgName": {
       "type": "string",
       "metadata": {
@@ -23,7 +29,7 @@
   },
   "variables": {
     "contributor": "[subscriptionResourceId('Microsoft.Authorization/roleDefinitions', 'b24988ac-6180-42a0-ab88-20f7382dd24c')]",
-    "location": "[
+    "location": "[parameters('location')]",
     "msiName": "[concat('sky-', parameters('clusterId'), '-msi')]",
     "roleAssignmentName": "[concat('sky-', parameters('clusterId'), '-ra')]",
     "nsgName": "[parameters('nsgName')]",
sky/provision/azure/config.py
CHANGED
@@ -14,13 +14,12 @@ from sky import exceptions
 from sky import sky_logging
 from sky.adaptors import azure
 from sky.provision import common
+from sky.provision import constants
 from sky.utils import common_utils
 
 logger = sky_logging.init_logger(__name__)
 
 UNIQUE_ID_LEN = 4
-_DEPLOYMENT_NAME = 'skypilot-config'
-_LEGACY_DEPLOYMENT_NAME = 'ray-config'
 _RESOURCE_GROUP_WAIT_FOR_DELETION_TIMEOUT = 480  # 8 minutes
 _CLUSTER_ID = '{cluster_name_on_cloud}-{unique_id}'
 
@@ -82,46 +81,55 @@ def bootstrap_instances(
             in provider_config), 'Provider config must include location field'
     params = {'location': provider_config['location']}
 
+    assert ('use_external_resource_group'
+            in provider_config), ('Provider config must include '
+                                  'use_external_resource_group field')
+    use_external_resource_group = provider_config['use_external_resource_group']
+
     if 'tags' in provider_config:
         params['tags'] = provider_config['tags']
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    # When resource group is user specified, it already exists in certain
+    # region.
+    if not use_external_resource_group:
+        logger.info(f'Creating/Updating resource group: {resource_group}')
+        rg_create_or_update = get_azure_sdk_function(
+            client=resource_client.resource_groups,
+            function_name='create_or_update')
+        rg_creation_start = time.time()
+        retry = 0
+        while (time.time() - rg_creation_start <
+               _RESOURCE_GROUP_WAIT_FOR_DELETION_TIMEOUT):
+            try:
+                rg_create_or_update(resource_group_name=resource_group,
+                                    parameters=params)
+                break
+            except azure.exceptions().ResourceExistsError as e:
+                if 'ResourceGroupBeingDeleted' in str(e):
+                    if retry % 5 == 0:
+                        logger.info(
+                            f'Azure resource group {resource_group} of a '
+                            'recent terminated cluster '
+                            f'{cluster_name_on_cloud} is being deleted. It can'
+                            ' only be provisioned after it is fully deleted. '
+                            'Waiting...')
+                    time.sleep(1)
+                    retry += 1
+                    continue
+                raise
+            except azure.exceptions().ClientAuthenticationError as e:
+                message = (
+                    'Failed to authenticate with Azure. Please check your '
+                    'Azure credentials. Error: '
+                    f'{common_utils.format_exception(e)}').replace('\n', ' ')
+                logger.error(message)
+                raise exceptions.NoClusterLaunchedError(message) from e
+        else:
             message = (
-                '
-
-            ).replace('\n', ' ')
+                f'Timed out waiting for resource group {resource_group} to be '
+                'deleted.')
             logger.error(message)
-            raise
-    else:
-        message = (
-            f'Timed out waiting for resource group {resource_group} to be '
-            'deleted.')
-        logger.error(message)
-        raise TimeoutError(message)
+            raise TimeoutError(message)
 
     # load the template file
     current_path = Path(__file__).parent
@@ -155,6 +163,9 @@ def bootstrap_instances(
             'nsgName': {
                 'value': nsg_name
             },
+            'location': {
+                'value': params['location']
+            }
         },
     }
 }
@@ -164,11 +175,22 @@ def bootstrap_instances(
     get_deployment = get_azure_sdk_function(client=resource_client.deployments,
                                             function_name='get')
     deployment_exists = False
-
+    if use_external_resource_group:
+        deployment_name = (
+            constants.EXTERNAL_RG_BOOTSTRAP_DEPLOYMENT_NAME.format(
+                cluster_name_on_cloud=cluster_name_on_cloud))
+        deployment_list = [deployment_name]
+    else:
+        deployment_name = constants.DEPLOYMENT_NAME
+        deployment_list = [
+            constants.DEPLOYMENT_NAME, constants.LEGACY_DEPLOYMENT_NAME
+        ]
+
+    for deploy_name in deployment_list:
         try:
             deployment = get_deployment(resource_group_name=resource_group,
-                                        deployment_name=
-            logger.info(f'Deployment {
+                                        deployment_name=deploy_name)
+            logger.info(f'Deployment {deploy_name!r} already exists. '
                         'Skipping deployment creation.')
 
             outputs = deployment.properties.outputs
@@ -179,22 +201,20 @@
             deployment_exists = False
 
     if not deployment_exists:
-        logger.info(f'Creating/Updating deployment: {
+        logger.info(f'Creating/Updating deployment: {deployment_name}')
         create_or_update = get_azure_sdk_function(
             client=resource_client.deployments,
             function_name='create_or_update')
         # TODO (skypilot): this takes a long time (> 40 seconds) to run.
         outputs = create_or_update(
             resource_group_name=resource_group,
-            deployment_name=
+            deployment_name=deployment_name,
             parameters=parameters,
         ).result().properties.outputs
 
-    nsg_id = outputs['nsg']['value']
-
     # append output resource ids to be used with vm creation
     provider_config['msi'] = outputs['msi']['value']
-    provider_config['nsg'] =
+    provider_config['nsg'] = outputs['nsg']['value']
     provider_config['subnet'] = outputs['subnet']['value']
 
     return config
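The retry loop above exists because a resource group left over from a just-terminated cluster can still be in the "being deleted" state; creation is retried until Azure finishes the deletion or a timeout is hit. A generic, cloud-agnostic sketch of that pattern (simplified exception type, not the SkyPilot code):

# Generic sketch: retry creation while the old resource is still deleting.
import time


class ResourceBeingDeletedError(Exception):
    """Raised by create_fn while the previous resource is still tearing down."""


def create_with_retry(create_fn, timeout_seconds: int = 480) -> None:
    start = time.time()
    while time.time() - start < timeout_seconds:
        try:
            create_fn()
            return
        except ResourceBeingDeletedError:
            # Poll until the prior deletion completes, then create again.
            time.sleep(1)
    raise TimeoutError('Timed out waiting for the resource to be deleted.')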