skypilot-nightly 1.0.0.dev20241028__py3-none-any.whl → 1.0.0.dev20241029__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sky/__init__.py CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
 import urllib.request
 
 # Replaced with the current commit when building the wheels.
-_SKYPILOT_COMMIT_SHA = 'c0c17483d1f692ad639144050f5f6fa0966e47a5'
+_SKYPILOT_COMMIT_SHA = '47ebae73e972c65de6e87aa7556220e515f2fc5e'
 
 
 def _get_git_commit():
@@ -35,7 +35,7 @@ def _get_git_commit():
 
 
 __commit__ = _get_git_commit()
-__version__ = '1.0.0.dev20241028'
+__version__ = '1.0.0.dev20241029'
__root_dir__ = os.path.dirname(os.path.abspath(__file__))
 
 
sky/adaptors/azure.py CHANGED
@@ -131,6 +131,9 @@ def get_client(name: str,
         from azure.mgmt import authorization
         return authorization.AuthorizationManagementClient(
             credential, subscription_id)
+    elif name == 'msi':
+        from azure.mgmt import msi
+        return msi.ManagedServiceIdentityClient(credential, subscription_id)
     elif name == 'graph':
         import msgraph
         return msgraph.GraphServiceClient(credential)
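
Note: SkyPilot's cloud adaptors import heavy SDK modules lazily inside `get_client`, so `import sky` stays fast and only the clients actually requested pull in their SDKs. A minimal sketch of the dispatch pattern this hunk extends (signature simplified; the real adaptor builds the credential itself and retries on auth errors):

```python
from typing import Any


def get_client(name: str, credential: Any, subscription_id: str) -> Any:
    """Return an Azure management client, importing its SDK lazily."""
    if name == 'msi':
        # New in this release: client for Managed Service Identities.
        from azure.mgmt import msi
        return msi.ManagedServiceIdentityClient(credential, subscription_id)
    if name == 'graph':
        import msgraph
        return msgraph.GraphServiceClient(credential)
    raise ValueError(f'Unknown Azure client type: {name}')
```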
sky/backends/backend_utils.py CHANGED
@@ -401,6 +401,8 @@ class SSHConfigHelper(object):
 
     ssh_conf_path = '~/.ssh/config'
     ssh_conf_lock_path = os.path.expanduser('~/.sky/ssh_config.lock')
+    ssh_conf_per_cluster_lock_path = os.path.expanduser(
+        '~/.sky/ssh_config_{}.lock')
     ssh_cluster_path = SKY_USER_FILE_PATH + '/ssh/{}'
 
     @classmethod
@@ -486,12 +488,6 @@ class SSHConfigHelper(object):
 
         config_path = os.path.expanduser(cls.ssh_conf_path)
 
-        # For backward compatibility: before #2706, we wrote the config of SkyPilot clusters
-        # directly in ~/.ssh/config. For these clusters, we remove the config in ~/.ssh/config
-        # and write/overwrite the config in ~/.sky/ssh/<cluster_name> instead.
-        cls._remove_stale_cluster_config_for_backward_compatibility(
-            cluster_name, ip, auth_config, docker_user)
-
         if not os.path.exists(config_path):
             config = ['\n']
             with open(config_path,
@@ -560,139 +556,20 @@ class SSHConfigHelper(object):
             f.write(codegen)
 
     @classmethod
-    def _remove_stale_cluster_config_for_backward_compatibility(
-        cls,
-        cluster_name: str,
-        ip: str,
-        auth_config: Dict[str, str],
-        docker_user: Optional[str] = None,
-    ):
-        """Remove authentication information for cluster from local SSH config.
-
-        If no existing host matching the provided specification is found, then
-        nothing is removed.
-
-        Args:
-            ip: Head node's IP address.
-            auth_config: read_yaml(handle.cluster_yaml)['auth']
-            docker_user: If not None, use this user to ssh into the docker
-        """
-        username = auth_config['ssh_user']
-        config_path = os.path.expanduser(cls.ssh_conf_path)
-        cluster_config_path = os.path.expanduser(
-            cls.ssh_cluster_path.format(cluster_name))
-        if not os.path.exists(config_path):
-            return
-
-        with open(config_path, 'r', encoding='utf-8') as f:
-            config = f.readlines()
-
-        start_line_idx = None
-
-        # Scan the config for the cluster name.
-        for i, line in enumerate(config):
-            next_line = config[i + 1] if i + 1 < len(config) else ''
-            if docker_user is None:
-                found = (line.strip() == f'HostName {ip}' and
-                         next_line.strip() == f'User {username}')
-            else:
-                found = (line.strip() == 'HostName localhost' and
-                         next_line.strip() == f'User {docker_user}')
-            if found:
-                # Find the line starting with ProxyCommand and contains the ip
-                found = False
-                for idx in range(i, len(config)):
-                    # Stop if we reach an empty line, which means a new host
-                    if not config[idx].strip():
-                        break
-                    if config[idx].strip().startswith('ProxyCommand'):
-                        proxy_command_line = config[idx].strip()
-                        if proxy_command_line.endswith(f'@{ip}'):
-                            found = True
-                            break
-                if found:
-                    start_line_idx = i - 1
-                    break
-
-        if start_line_idx is not None:
-            # Scan for end of previous config.
-            cursor = start_line_idx
-            while cursor > 0 and len(config[cursor].strip()) > 0:
-                cursor -= 1
-            prev_end_line_idx = cursor
-
-            # Scan for end of the cluster config.
-            end_line_idx = None
-            cursor = start_line_idx + 1
-            start_line_idx -= 1  # remove auto-generated comment
-            while cursor < len(config):
-                if config[cursor].strip().startswith(
-                        '# ') or config[cursor].strip().startswith('Host '):
-                    end_line_idx = cursor
-                    break
-                cursor += 1
-
-            # Remove sky-generated config and update the file.
-            config[prev_end_line_idx:end_line_idx] = [
-                '\n'
-            ] if end_line_idx is not None else []
-            with open(config_path, 'w', encoding='utf-8') as f:
-                f.write(''.join(config).strip())
-                f.write('\n' * 2)
-
-        # Delete include statement if it exists in the config.
-        sky_autogen_comment = ('# Added by sky (use `sky stop/down '
-                               f'{cluster_name}` to remove)')
-        with open(config_path, 'r', encoding='utf-8') as f:
-            config = f.readlines()
-
-        for i, line in enumerate(config):
-            config_str = line.strip()
-            if f'Include {cluster_config_path}' in config_str:
-                with open(config_path, 'w', encoding='utf-8') as f:
-                    if i < len(config) - 1 and config[i + 1] == '\n':
-                        del config[i + 1]
-                    # Delete Include string
-                    del config[i]
-                    # Delete Sky Autogen Comment
-                    if i > 0 and sky_autogen_comment in config[i - 1].strip():
-                        del config[i - 1]
-                    f.write(''.join(config))
-                break
-            if 'Host' in config_str:
-                break
-
-    @classmethod
-    # TODO: We can remove this after 0.6.0 and have a lock only per cluster.
-    @timeline.FileLockEvent(ssh_conf_lock_path)
-    def remove_cluster(
-        cls,
-        cluster_name: str,
-        ip: str,
-        auth_config: Dict[str, str],
-        docker_user: Optional[str] = None,
-    ):
+    def remove_cluster(cls, cluster_name: str):
         """Remove authentication information for cluster from ~/.sky/ssh/<cluster_name>.
 
-        For backward compatibility also remove the config from ~/.ssh/config if it exists.
-
         If no existing host matching the provided specification is found, then
         nothing is removed.
 
         Args:
-            ip: Head node's IP address.
-            auth_config: read_yaml(handle.cluster_yaml)['auth']
-            docker_user: If not None, use this user to ssh into the docker
+            cluster_name: Cluster name.
         """
-        cluster_config_path = os.path.expanduser(
-            cls.ssh_cluster_path.format(cluster_name))
-        common_utils.remove_file_if_exists(cluster_config_path)
-
-        # Ensures backward compatibility: before #2706, we wrote the config of SkyPilot clusters
-        # directly in ~/.ssh/config. For these clusters, we should clean up the config.
-        # TODO: Remove this after 0.6.0
-        cls._remove_stale_cluster_config_for_backward_compatibility(
-            cluster_name, ip, auth_config, docker_user)
+        with timeline.FileLockEvent(
+                cls.ssh_conf_per_cluster_lock_path.format(cluster_name)):
+            cluster_config_path = os.path.expanduser(
+                cls.ssh_cluster_path.format(cluster_name))
+            common_utils.remove_file_if_exists(cluster_config_path)
 
 
 def _replace_yaml_dicts(
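
Note: replacing the single global `~/.sky/ssh_config.lock` with one lock file per cluster means concurrent teardowns of different clusters no longer serialize on the same lock, while operations on the same cluster still exclude each other. A minimal sketch of the pattern, using the `filelock` package directly instead of SkyPilot's `timeline.FileLockEvent` wrapper (paths illustrative):

```python
import os

import filelock

# One lock file per cluster, so unrelated clusters never contend.
_PER_CLUSTER_LOCK = os.path.expanduser('~/.sky/ssh_config_{}.lock')


def remove_cluster_ssh_config(cluster_name: str) -> None:
    """Delete ~/.sky/ssh/<cluster_name> under a cluster-scoped lock."""
    with filelock.FileLock(_PER_CLUSTER_LOCK.format(cluster_name)):
        path = os.path.expanduser(f'~/.sky/ssh/{cluster_name}')
        if os.path.exists(path):
            os.remove(path)
```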
@@ -867,7 +744,7 @@ def write_cluster_config(
     labels = skypilot_config.get_nested((str(cloud).lower(), 'labels'), {})
     # Deprecated: instance_tags have been replaced by labels. For backward
     # compatibility, we support them and the schema allows them only if
-    # `labels` are not specified. This should be removed after 0.7.0.
+    # `labels` are not specified. This should be removed after 0.8.0.
     labels = skypilot_config.get_nested((str(cloud).lower(), 'instance_tags'),
                                         labels)
     # labels is a dict, which is guaranteed by the type check in
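
Note: the `instance_tags` fallback above is the usual pattern for a renamed config key: read the new key first, then pass its value as the default when reading the deprecated key, so the old key only takes effect when the new one is absent. A self-contained sketch (`get_nested` here is a simplified stand-in for `skypilot_config.get_nested`):

```python
from typing import Any, Dict, Tuple


def get_nested(config: Dict[str, Any], keys: Tuple[str, ...],
               default: Any) -> Any:
    """Walk nested dict keys, returning `default` if any level is missing."""
    cur: Any = config
    for key in keys:
        if not isinstance(cur, dict) or key not in cur:
            return default
        cur = cur[key]
    return cur


config = {'gcp': {'instance_tags': {'team': 'infra'}}}
# The new key wins; the deprecated key is consulted only as a fallback.
labels = get_nested(config, ('gcp', 'labels'), {})
labels = get_nested(config, ('gcp', 'instance_tags'), labels)
assert labels == {'team': 'infra'}
```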
sky/backends/cloud_vm_ray_backend.py CHANGED
@@ -2118,13 +2118,8 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
         stable_internal_external_ips: Optional[List[Tuple[str,
                                                           str]]] = None,
         stable_ssh_ports: Optional[List[int]] = None,
-        cluster_info: Optional[provision_common.ClusterInfo] = None,
-        # The following 2 fields are deprecated. SkyPilot new provisioner
-        # API handles the TPU node creation/deletion.
-        # Backward compatibility for TPU nodes created before #2943.
-        # TODO (zhwu): Remove this after 0.6.0.
-        tpu_create_script: Optional[str] = None,
-        tpu_delete_script: Optional[str] = None) -> None:
+        cluster_info: Optional[provision_common.ClusterInfo] = None
+    ) -> None:
         self._version = self._VERSION
         self.cluster_name = cluster_name
         self.cluster_name_on_cloud = cluster_name_on_cloud
@@ -2139,12 +2134,6 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
         self.launched_nodes = launched_nodes
         self.launched_resources = launched_resources
         self.docker_user: Optional[str] = None
-        # Deprecated. SkyPilot new provisioner API handles the TPU node
-        # creation/deletion.
-        # Backward compatibility for TPU nodes created before #2943.
-        # TODO (zhwu): Remove this after 0.6.0.
-        self.tpu_create_script = tpu_create_script
-        self.tpu_delete_script = tpu_delete_script
 
     def __repr__(self):
         return (f'ResourceHandle('
@@ -2160,10 +2149,7 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
                 f'\n\tlaunched_resources={self.launched_nodes}x '
                 f'{self.launched_resources}, '
                 f'\n\tdocker_user={self.docker_user},'
-                f'\n\tssh_user={self.ssh_user},'
-                # TODO (zhwu): Remove this after 0.6.0.
-                f'\n\ttpu_create_script={self.tpu_create_script}, '
-                f'\n\ttpu_delete_script={self.tpu_delete_script})')
+                f'\n\tssh_user={self.ssh_user}')
 
     def get_cluster_name(self):
         return self.cluster_name
@@ -2176,26 +2162,6 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
         return common_utils.read_yaml(self.cluster_yaml).get(
             'provider', {}).get('use_internal_ips', False)
 
-    def _update_cluster_region(self):
-        """Update the region in handle.launched_resources.
-
-        This is for backward compatibility to handle the clusters launched
-        long before. We should remove this after 0.6.0.
-        """
-        if self.launched_resources.region is not None:
-            return
-
-        config = common_utils.read_yaml(self.cluster_yaml)
-        provider = config['provider']
-        cloud = self.launched_resources.cloud
-        if cloud.is_same_cloud(clouds.Azure()):
-            region = provider['location']
-        elif cloud.is_same_cloud(clouds.GCP()) or cloud.is_same_cloud(
-                clouds.AWS()):
-            region = provider['region']
-
-        self.launched_resources = self.launched_resources.copy(region=region)
-
     def update_ssh_ports(self, max_attempts: int = 1) -> None:
         """Fetches and sets the SSH ports for the cluster nodes.
 
@@ -2567,8 +2533,6 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
         if version < 4:
             self.update_ssh_ports()
 
-        self._update_cluster_region()
-
         if version < 8:
             try:
                 self._update_cluster_info()
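
Note: the `version < N` checks run when an old pickled handle is loaded: the handle stores the `_VERSION` it was written with, and upgrade steps are replayed only for versions it predates. Dropping `_update_cluster_region` removes one such step, which is safe once no supported handle predates region-aware `launched_resources`. A minimal sketch of the versioned-unpickling pattern (class and fields illustrative, not SkyPilot's actual handle):

```python
import pickle
from typing import Any, Dict


class Handle:
    _VERSION = 8  # bump whenever the persisted layout changes

    def __init__(self) -> None:
        self._version = self._VERSION
        self.ssh_ports = [22]

    def __setstate__(self, state: Dict[str, Any]) -> None:
        version = state.get('_version', 0)
        self.__dict__.update(state)
        # Replay only the upgrades the stored object predates.
        if version < 4:
            self.ssh_ports = [22]  # field introduced in version 4
        self._version = self._VERSION


handle = pickle.loads(pickle.dumps(Handle()))  # round-trips cleanly
```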
@@ -2649,8 +2613,6 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         if record is not None:
             usage_lib.messages.usage.update_cluster_status(record['status'])
 
-        # Backward compatibility: the old launched_resources without region info
-        # was handled by ResourceHandle._update_cluster_region.
         assert launched_resources.region is not None, handle
 
         mismatch_str = (f'To fix: specify a new cluster name, or down the '
@@ -3585,9 +3547,6 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             backend_utils.CLUSTER_STATUS_LOCK_PATH.format(cluster_name))
 
         try:
-            # TODO(mraheja): remove pylint disabling when filelock
-            # version updated
-            # pylint: disable=abstract-class-instantiated
             with filelock.FileLock(
                     lock_path,
                     backend_utils.CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS):
@@ -4096,55 +4055,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         * Removing ssh configs for the cluster;
         * Updating the local state of the cluster;
         * Removing the terminated cluster's scripts and ray yaml files.
-
-        Raises:
-            RuntimeError: If it fails to delete the TPU.
         """
-        log_path = os.path.join(os.path.expanduser(self.log_dir),
-                                'teardown.log')
-        log_abs_path = os.path.abspath(log_path)
         cluster_name_on_cloud = handle.cluster_name_on_cloud
 
-        # Backward compatibility for TPU nodes created before #2943. Any TPU
-        # node launched before that PR have the delete script generated (and do
-        # not have the tpu_node config set in its cluster yaml), so we have to
-        # call the deletion script to clean up the TPU node.
-        # For TPU nodes launched after the PR, deletion is done in SkyPilot's
-        # new GCP provisioner API.
-        # TODO (zhwu): Remove this after 0.6.0.
-        if (handle.tpu_delete_script is not None and
-                os.path.exists(handle.tpu_delete_script)):
-            # Only call the deletion script if the cluster config does not
-            # contain TPU node config. Otherwise, the deletion should
-            # already be handled by the new provisioner.
-            config = common_utils.read_yaml(handle.cluster_yaml)
-            tpu_node_config = config['provider'].get('tpu_node')
-            if tpu_node_config is None:
-                with rich_utils.safe_status(
-                        ux_utils.spinner_message('Terminating TPU')):
-                    tpu_rc, tpu_stdout, tpu_stderr = log_lib.run_with_log(
-                        ['bash', handle.tpu_delete_script],
-                        log_abs_path,
-                        stream_logs=False,
-                        require_outputs=True)
-                if tpu_rc != 0:
-                    if _TPU_NOT_FOUND_ERROR in tpu_stderr:
-                        logger.info('TPU not found. '
-                                    'It should have been deleted already.')
-                    elif purge:
-                        logger.warning(
-                            _TEARDOWN_PURGE_WARNING.format(
-                                reason='stopping/terminating TPU',
-                                details=tpu_stderr))
-                    else:
-                        raise RuntimeError(
-                            _TEARDOWN_FAILURE_MESSAGE.format(
-                                extra_reason='It is caused by TPU failure.',
-                                cluster_name=common_utils.cluster_name_in_hint(
-                                    handle.cluster_name, cluster_name_on_cloud),
-                                stdout=tpu_stdout,
-                                stderr=tpu_stderr))
-
         if (terminate and handle.launched_resources.is_image_managed is True):
             # Delete the image when terminating a "cloned" cluster, i.e.,
             # whose image is created by SkyPilot (--clone-disk-from)
@@ -4189,11 +4102,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         # The cluster file must exist because the cluster_yaml will only
         # be removed after the cluster entry in the database is removed.
         config = common_utils.read_yaml(handle.cluster_yaml)
-        auth_config = config['auth']
-        backend_utils.SSHConfigHelper.remove_cluster(handle.cluster_name,
-                                                     handle.head_ip,
-                                                     auth_config,
-                                                     handle.docker_user)
+        backend_utils.SSHConfigHelper.remove_cluster(handle.cluster_name)
 
         global_user_state.remove_cluster(handle.cluster_name,
                                          terminate=terminate)
@@ -4202,13 +4111,6 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         # This function could be directly called from status refresh,
         # where we need to cleanup the cluster profile.
         metadata_utils.remove_cluster_metadata(handle.cluster_name)
-        # Clean up TPU creation/deletion scripts
-        # Backward compatibility for TPU nodes created before #2943.
-        # TODO (zhwu): Remove this after 0.6.0.
-        if handle.tpu_delete_script is not None:
-            assert handle.tpu_create_script is not None
-            common_utils.remove_file_if_exists(handle.tpu_create_script)
-            common_utils.remove_file_if_exists(handle.tpu_delete_script)
 
         # Clean up generated config
         # No try-except is needed since Ray will fail to teardown the
sky/clouds/azure.py CHANGED
@@ -12,6 +12,7 @@ import colorama
 from sky import clouds
 from sky import exceptions
 from sky import sky_logging
+from sky import skypilot_config
 from sky.adaptors import azure
 from sky.clouds import service_catalog
 from sky.clouds.utils import azure_utils
@@ -353,6 +354,13 @@ class Azure(clouds.Cloud):
         need_nvidia_driver_extension = (acc_dict is not None and
                                         'A10' in acc_dict)
 
+        # Determine resource group for deploying the instance.
+        resource_group_name = skypilot_config.get_nested(
+            ('azure', 'resource_group_vm'), None)
+        use_external_resource_group = resource_group_name is not None
+        if resource_group_name is None:
+            resource_group_name = f'{cluster_name.name_on_cloud}-{region_name}'
+
         # Setup commands to eliminate the banner and restart sshd.
         # This script will modify /etc/ssh/sshd_config and add a bash script
         # into .bashrc. The bash script will restart sshd if it has not been
@@ -409,7 +417,8 @@ class Azure(clouds.Cloud):
             'disk_tier': Azure._get_disk_type(disk_tier),
             'cloud_init_setup_commands': cloud_init_setup_commands,
             'azure_subscription_id': self.get_project_id(dryrun),
-            'resource_group': f'{cluster_name.name_on_cloud}-{region_name}',
+            'resource_group': resource_group_name,
+            'use_external_resource_group': use_external_resource_group,
         }
 
         # Setting disk performance tier for high disk tier.
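
Note: with this change a user can deploy VMs into a pre-created resource group by setting `azure.resource_group_vm` in the SkyPilot config; when the key is unset, the auto-created `<cluster>-<region>` group is used as before. A sketch of the resolution logic in isolation (a plain dict stands in for `skypilot_config`):

```python
from typing import Any, Dict, Tuple


def resolve_resource_group(user_config: Dict[str, Any], cluster: str,
                           region: str) -> Tuple[str, bool]:
    """Return (resource_group_name, use_external_resource_group)."""
    name = user_config.get('azure', {}).get('resource_group_vm')
    if name is not None:
        return name, True  # user-managed group: skip create/delete
    return f'{cluster}-{region}', False  # SkyPilot-managed group


assert resolve_resource_group({}, 'sky-abc', 'eastus') == ('sky-abc-eastus',
                                                           False)
assert resolve_resource_group({'azure': {'resource_group_vm': 'my-rg'}},
                              'sky-abc', 'eastus') == ('my-rg', True)
```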
sky/optimizer.py CHANGED
@@ -831,13 +831,17 @@ class Optimizer:
             return row
 
         def _get_resource_group_hash(resources: 'resources_lib.Resources'):
-            return json.dumps(
-                {
-                    'cloud': f'{resources.cloud}',
-                    'accelerators': f'{resources.accelerators}',
-                    'use_spot': resources.use_spot
-                },
-                sort_keys=True)
+            resource_key_dict = {
+                'cloud': f'{resources.cloud}',
+                'accelerators': f'{resources.accelerators}',
+                'use_spot': resources.use_spot
+            }
+            if isinstance(resources.cloud, clouds.Kubernetes):
+                # Region for Kubernetes is the context name, i.e. different
+                # Kubernetes clusters. We add region to the key to show all the
+                # Kubernetes clusters in the optimizer table for better UX.
+                resource_key_dict['region'] = resources.region
+            return json.dumps(resource_key_dict, sort_keys=True)
 
         # Print the list of resouces that the optimizer considered.
         resource_fields = [
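
Note: dumping a small dict with `sort_keys=True` is a compact way to build an order-insensitive, hashable grouping key. Adding `region` to the key only for Kubernetes keeps one optimizer-table row per context (i.e. per cluster) instead of collapsing them. A sketch of the behavior (plain values instead of `Resources` objects):

```python
import json
from typing import Any, Dict, Optional


def group_key(cloud: str, accelerators: str, use_spot: bool,
              region: Optional[str] = None,
              is_kubernetes: bool = False) -> str:
    key: Dict[str, Any] = {
        'cloud': cloud,
        'accelerators': accelerators,
        'use_spot': use_spot,
    }
    if is_kubernetes:
        # Each Kubernetes context is a distinct cluster; keep them as
        # separate rows rather than merging them into one.
        key['region'] = region
    return json.dumps(key, sort_keys=True)


assert group_key('AWS', 'A100:1', False) == group_key('AWS', 'A100:1', False)
assert (group_key('Kubernetes', 'A100:1', False, 'ctx-a', True) !=
        group_key('Kubernetes', 'A100:1', False, 'ctx-b', True))
```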
sky/provision/azure/azure-config-template.json CHANGED
@@ -14,6 +14,12 @@
                 "description": "Subnet parameters."
             }
         },
+        "location": {
+            "type": "string",
+            "metadata": {
+                "description": "Location of where the resources are allocated."
+            }
+        },
         "nsgName": {
             "type": "string",
             "metadata": {
@@ -23,7 +29,7 @@
     },
     "variables": {
         "contributor": "[subscriptionResourceId('Microsoft.Authorization/roleDefinitions', 'b24988ac-6180-42a0-ab88-20f7382dd24c')]",
-        "location": "[resourceGroup().location]",
+        "location": "[parameters('location')]",
         "msiName": "[concat('sky-', parameters('clusterId'), '-msi')]",
         "roleAssignmentName": "[concat('sky-', parameters('clusterId'), '-ra')]",
         "nsgName": "[parameters('nsgName')]",
sky/provision/azure/config.py CHANGED
@@ -14,13 +14,12 @@ from sky import exceptions
 from sky import sky_logging
 from sky.adaptors import azure
 from sky.provision import common
+from sky.provision import constants
 from sky.utils import common_utils
 
 logger = sky_logging.init_logger(__name__)
 
 UNIQUE_ID_LEN = 4
-_DEPLOYMENT_NAME = 'skypilot-config'
-_LEGACY_DEPLOYMENT_NAME = 'ray-config'
 _RESOURCE_GROUP_WAIT_FOR_DELETION_TIMEOUT = 480  # 8 minutes
 _CLUSTER_ID = '{cluster_name_on_cloud}-{unique_id}'
@@ -82,46 +81,55 @@ def bootstrap_instances(
             in provider_config), 'Provider config must include location field'
     params = {'location': provider_config['location']}
 
+    assert ('use_external_resource_group'
+            in provider_config), ('Provider config must include '
+                                  'use_external_resource_group field')
+    use_external_resource_group = provider_config['use_external_resource_group']
+
     if 'tags' in provider_config:
         params['tags'] = provider_config['tags']
 
-    logger.info(f'Creating/Updating resource group: {resource_group}')
-    rg_create_or_update = get_azure_sdk_function(
-        client=resource_client.resource_groups,
-        function_name='create_or_update')
-    rg_creation_start = time.time()
-    retry = 0
-    while (time.time() - rg_creation_start <
-           _RESOURCE_GROUP_WAIT_FOR_DELETION_TIMEOUT):
-        try:
-            rg_create_or_update(resource_group_name=resource_group,
-                                parameters=params)
-            break
-        except azure.exceptions().ResourceExistsError as e:
-            if 'ResourceGroupBeingDeleted' in str(e):
-                if retry % 5 == 0:
-                    logger.info(
-                        f'Azure resource group {resource_group} of a recent '
-                        f'terminated cluster {cluster_name_on_cloud} is being '
-                        'deleted. It can only be provisioned after it is fully '
-                        'deleted. Waiting...')
-                time.sleep(1)
-                retry += 1
-                continue
-            raise
-        except azure.exceptions().ClientAuthenticationError as e:
+    # When resource group is user specified, it already exists in certain
+    # region.
+    if not use_external_resource_group:
+        logger.info(f'Creating/Updating resource group: {resource_group}')
+        rg_create_or_update = get_azure_sdk_function(
+            client=resource_client.resource_groups,
+            function_name='create_or_update')
+        rg_creation_start = time.time()
+        retry = 0
+        while (time.time() - rg_creation_start <
+               _RESOURCE_GROUP_WAIT_FOR_DELETION_TIMEOUT):
+            try:
+                rg_create_or_update(resource_group_name=resource_group,
+                                    parameters=params)
+                break
+            except azure.exceptions().ResourceExistsError as e:
+                if 'ResourceGroupBeingDeleted' in str(e):
+                    if retry % 5 == 0:
+                        logger.info(
+                            f'Azure resource group {resource_group} of a '
+                            'recent terminated cluster '
+                            f'{cluster_name_on_cloud} is being deleted. It can'
+                            ' only be provisioned after it is fully deleted. '
+                            'Waiting...')
+                    time.sleep(1)
+                    retry += 1
+                    continue
+                raise
+            except azure.exceptions().ClientAuthenticationError as e:
+                message = (
+                    'Failed to authenticate with Azure. Please check your '
+                    'Azure credentials. Error: '
+                    f'{common_utils.format_exception(e)}').replace('\n', ' ')
+                logger.error(message)
+                raise exceptions.NoClusterLaunchedError(message) from e
+        else:
             message = (
-                'Failed to authenticate with Azure. Please check your Azure '
-                f'credentials. Error: {common_utils.format_exception(e)}'
-            ).replace('\n', ' ')
+                f'Timed out waiting for resource group {resource_group} to be '
+                'deleted.')
             logger.error(message)
-            raise exceptions.NoClusterLaunchedError(message) from e
-    else:
-        message = (
-            f'Timed out waiting for resource group {resource_group} to be '
-            'deleted.')
-        logger.error(message)
-        raise TimeoutError(message)
+            raise TimeoutError(message)
 
     # load the template file
     current_path = Path(__file__).parent
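
Note: a subtlety in the hunk above is that the `else:` belongs to the `while` loop, not to an `if`: Python executes a loop's `else` block only when the loop finishes without hitting `break`, which is exactly the "retry window expired" case that should raise `TimeoutError`. A tiny illustration:

```python
def wait_for(attempts: int, succeed_on: int) -> str:
    for i in range(attempts):
        if i == succeed_on:
            break  # success: the else block is skipped
    else:
        return 'timed out'  # runs only if the loop never breaks
    return 'succeeded'


assert wait_for(attempts=3, succeed_on=1) == 'succeeded'
assert wait_for(attempts=3, succeed_on=99) == 'timed out'
```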
@@ -155,6 +163,9 @@ def bootstrap_instances(
                 'nsgName': {
                     'value': nsg_name
                 },
+                'location': {
+                    'value': params['location']
+                }
             },
         }
     }
@@ -164,11 +175,22 @@ def bootstrap_instances(
     get_deployment = get_azure_sdk_function(client=resource_client.deployments,
                                             function_name='get')
     deployment_exists = False
-    for deployment_name in [_DEPLOYMENT_NAME, _LEGACY_DEPLOYMENT_NAME]:
+    if use_external_resource_group:
+        deployment_name = (
+            constants.EXTERNAL_RG_BOOTSTRAP_DEPLOYMENT_NAME.format(
+                cluster_name_on_cloud=cluster_name_on_cloud))
+        deployment_list = [deployment_name]
+    else:
+        deployment_name = constants.DEPLOYMENT_NAME
+        deployment_list = [
+            constants.DEPLOYMENT_NAME, constants.LEGACY_DEPLOYMENT_NAME
+        ]
+
+    for deploy_name in deployment_list:
         try:
             deployment = get_deployment(resource_group_name=resource_group,
-                                        deployment_name=deployment_name)
-            logger.info(f'Deployment {deployment_name!r} already exists. '
+                                        deployment_name=deploy_name)
+            logger.info(f'Deployment {deploy_name!r} already exists. '
                         'Skipping deployment creation.')
 
             outputs = deployment.properties.outputs
@@ -179,22 +201,20 @@ def bootstrap_instances(
             deployment_exists = False
 
     if not deployment_exists:
-        logger.info(f'Creating/Updating deployment: {_DEPLOYMENT_NAME}')
+        logger.info(f'Creating/Updating deployment: {deployment_name}')
         create_or_update = get_azure_sdk_function(
             client=resource_client.deployments,
             function_name='create_or_update')
         # TODO (skypilot): this takes a long time (> 40 seconds) to run.
         outputs = create_or_update(
             resource_group_name=resource_group,
-            deployment_name=_DEPLOYMENT_NAME,
+            deployment_name=deployment_name,
             parameters=parameters,
         ).result().properties.outputs
 
-    nsg_id = outputs['nsg']['value']
-
     # append output resource ids to be used with vm creation
     provider_config['msi'] = outputs['msi']['value']
-    provider_config['nsg'] = nsg_id
+    provider_config['nsg'] = outputs['nsg']['value']
     provider_config['subnet'] = outputs['subnet']['value']
 
     return config
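
Note: the deployment-name logic above keys bootstrap deployments per cluster when an external resource group is shared by several clusters, while keeping the old single-name lookup (plus the legacy `ray-config` name) for SkyPilot-managed groups. A condensed sketch; `'skypilot-config'` and `'ray-config'` are the values the removed module-level constants held, while the external-RG format string is illustrative since `sky.provision.constants` is not shown in this diff:

```python
from typing import List, Tuple

DEPLOYMENT_NAME = 'skypilot-config'
LEGACY_DEPLOYMENT_NAME = 'ray-config'
# Illustrative format; the real constant lives in sky.provision.constants.
EXTERNAL_RG_BOOTSTRAP_DEPLOYMENT_NAME = ('skypilot-config-'
                                         '{cluster_name_on_cloud}')


def pick_deployments(cluster: str,
                     use_external_rg: bool) -> Tuple[str, List[str]]:
    """Return (name to create, names to probe for existing deployments)."""
    if use_external_rg:
        # A shared, user-owned resource group may host many clusters, so
        # embed the cluster name to avoid deployment-name clashes.
        name = EXTERNAL_RG_BOOTSTRAP_DEPLOYMENT_NAME.format(
            cluster_name_on_cloud=cluster)
        return name, [name]
    # SkyPilot-managed groups are per-cluster already; also probe the
    # legacy name used before the rename.
    return DEPLOYMENT_NAME, [DEPLOYMENT_NAME, LEGACY_DEPLOYMENT_NAME]
```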