skypilot-nightly 1.0.0.dev20241024__py3-none-any.whl → 1.0.0.dev20241025__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sky/__init__.py CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
5
5
  import urllib.request
6
6
 
7
7
  # Replaced with the current commit when building the wheels.
8
- _SKYPILOT_COMMIT_SHA = 'cbf5c0022ad920edb4f41cfad65a2cf4909d5930'
8
+ _SKYPILOT_COMMIT_SHA = '057bc4b44755ac1e9dadc680e022c369e8ddff52'
9
9
 
10
10
 
11
11
  def _get_git_commit():
@@ -35,7 +35,7 @@ def _get_git_commit():
35
35
 
36
36
 
37
37
  __commit__ = _get_git_commit()
38
- __version__ = '1.0.0.dev20241024'
38
+ __version__ = '1.0.0.dev20241025'
39
39
  __root_dir__ = os.path.dirname(os.path.abspath(__file__))
40
40
 
41
41
 
sky/check.py CHANGED
@@ -1,4 +1,5 @@
1
1
  """Credential checks: check cloud credentials and enable clouds."""
2
+ import os
2
3
  import traceback
3
4
  from types import ModuleType
4
5
  from typing import Dict, Iterable, List, Optional, Tuple, Union
@@ -194,19 +195,25 @@ def get_cached_enabled_clouds_or_refresh(
194
195
  def get_cloud_credential_file_mounts(
195
196
  excluded_clouds: Optional[Iterable[sky_clouds.Cloud]]
196
197
  ) -> Dict[str, str]:
197
- """Returns the files necessary to access all enabled clouds.
198
+ """Returns the files necessary to access all clouds.
198
199
 
199
200
  Returns a dictionary that will be added to a task's file mounts
200
201
  and a list of patterns that will be excluded (used as rsync_exclude).
201
202
  """
202
- enabled_clouds = get_cached_enabled_clouds_or_refresh()
203
+ # Uploading credentials for all clouds instead of only sky check
204
+ # enabled clouds because users may have partial credentials for some
205
+ # clouds to access their specific resources (e.g. cloud storage) but
206
+ # not have the complete credentials to pass sky check.
207
+ clouds = sky_clouds.CLOUD_REGISTRY.values()
203
208
  file_mounts = {}
204
- for cloud in enabled_clouds:
209
+ for cloud in clouds:
205
210
  if (excluded_clouds is not None and
206
211
  sky_clouds.cloud_in_iterable(cloud, excluded_clouds)):
207
212
  continue
208
213
  cloud_file_mounts = cloud.get_credential_file_mounts()
209
- file_mounts.update(cloud_file_mounts)
214
+ for remote_path, local_path in cloud_file_mounts.items():
215
+ if os.path.exists(os.path.expanduser(local_path)):
216
+ file_mounts[remote_path] = local_path
210
217
  # Currently, get_cached_enabled_clouds_or_refresh() does not support r2 as
211
218
  # only clouds with computing instances are marked as enabled by skypilot.
212
219
  # This will be removed when cloudflare/r2 is added as a 'cloud'.
sky/cli.py CHANGED
@@ -3085,7 +3085,7 @@ def show_gpus(
3085
3085
  qty_header = 'QTY_FILTER'
3086
3086
  free_header = 'FILTERED_FREE_GPUS'
3087
3087
  else:
3088
- qty_header = 'QTY_PER_NODE'
3088
+ qty_header = 'REQUESTABLE_QTY_PER_NODE'
3089
3089
  free_header = 'TOTAL_FREE_GPUS'
3090
3090
  realtime_gpu_table = log_utils.create_table(
3091
3091
  ['GPU', qty_header, 'TOTAL_GPUS', free_header])
sky/clouds/azure.py CHANGED
@@ -15,6 +15,7 @@ from sky import exceptions
15
15
  from sky import sky_logging
16
16
  from sky.adaptors import azure
17
17
  from sky.clouds import service_catalog
18
+ from sky.clouds.utils import azure_utils
18
19
  from sky.utils import common_utils
19
20
  from sky.utils import resources_utils
20
21
  from sky.utils import ux_utils
@@ -36,6 +37,15 @@ _MAX_IDENTITY_FETCH_RETRY = 10
36
37
 
37
38
  _DEFAULT_AZURE_UBUNTU_HPC_IMAGE_GB = 30
38
39
  _DEFAULT_AZURE_UBUNTU_2004_IMAGE_GB = 150
40
+ _DEFAULT_SKYPILOT_IMAGE_GB = 30
41
+
42
+ _DEFAULT_CPU_IMAGE_ID = 'skypilot:gpu-ubuntu-2204'
43
+ _DEFAULT_GPU_IMAGE_ID = 'skypilot:gpu-ubuntu-2204'
44
+ _DEFAULT_V1_IMAGE_ID = 'skypilot:v1-ubuntu-2004'
45
+ _DEFAULT_GPU_K80_IMAGE_ID = 'skypilot:k80-ubuntu-2004'
46
+ _FALLBACK_IMAGE_ID = 'skypilot:gpu-ubuntu-2204'
47
+
48
+ _COMMUNITY_IMAGE_PREFIX = '/CommunityGalleries'
39
49
 
40
50
 
41
51
  def _run_output(cmd):
@@ -132,29 +142,56 @@ class Azure(clouds.Cloud):
132
142
  cost += 0.0
133
143
  return cost
134
144
 
145
+ @classmethod
146
+ def get_default_instance_type(
147
+ cls,
148
+ cpus: Optional[str] = None,
149
+ memory: Optional[str] = None,
150
+ disk_tier: Optional[resources_utils.DiskTier] = None
151
+ ) -> Optional[str]:
152
+ return service_catalog.get_default_instance_type(cpus=cpus,
153
+ memory=memory,
154
+ disk_tier=disk_tier,
155
+ clouds='azure')
156
+
135
157
  @classmethod
136
158
  def get_image_size(cls, image_id: str, region: Optional[str]) -> float:
137
- if region is None:
138
- # The region used here is only for where to send the query,
139
- # not the image location. Azure's image is globally available.
140
- region = 'eastus'
141
- is_skypilot_image_tag = False
159
+ # Process skypilot images.
142
160
  if image_id.startswith('skypilot:'):
143
- is_skypilot_image_tag = True
144
161
  image_id = service_catalog.get_image_id_from_tag(image_id,
145
162
  clouds='azure')
146
- image_id_splitted = image_id.split(':')
147
- if len(image_id_splitted) != 4:
148
- with ux_utils.print_exception_no_traceback():
149
- raise ValueError(f'Invalid image id: {image_id}. Expected '
150
- 'format: <publisher>:<offer>:<sku>:<version>')
151
- publisher, offer, sku, version = image_id_splitted
152
- if is_skypilot_image_tag:
153
- if offer == 'ubuntu-hpc':
154
- return _DEFAULT_AZURE_UBUNTU_HPC_IMAGE_GB
163
+ if image_id.startswith(_COMMUNITY_IMAGE_PREFIX):
164
+ # Avoid querying the image size from Azure as
165
+ # all skypilot custom images have the same size.
166
+ return _DEFAULT_SKYPILOT_IMAGE_GB
155
167
  else:
156
- return _DEFAULT_AZURE_UBUNTU_2004_IMAGE_GB
168
+ publisher, offer, sku, version = image_id.split(':')
169
+ if offer == 'ubuntu-hpc':
170
+ return _DEFAULT_AZURE_UBUNTU_HPC_IMAGE_GB
171
+ else:
172
+ return _DEFAULT_AZURE_UBUNTU_2004_IMAGE_GB
173
+
174
+ # Process user-specified images.
175
+ azure_utils.validate_image_id(image_id)
157
176
  compute_client = azure.get_client('compute', cls.get_project_id())
177
+
178
+ # Community gallery image.
179
+ if image_id.startswith(_COMMUNITY_IMAGE_PREFIX):
180
+ if region is None:
181
+ return 0.0
182
+ _, _, gallery_name, _, image_name = image_id.split('/')
183
+ try:
184
+ return azure_utils.get_community_image_size(
185
+ compute_client, gallery_name, image_name, region)
186
+ except exceptions.ResourcesUnavailableError:
187
+ return 0.0
188
+
189
+ # Marketplace image
190
+ if region is None:
191
+ # The region used here is only for where to send the query,
192
+ # not the image location. Marketplace image is globally available.
193
+ region = 'eastus'
194
+ publisher, offer, sku, version = image_id.split(':')
158
195
  try:
159
196
  image = compute_client.virtual_machine_images.get(
160
197
  region, publisher, offer, sku, version)
@@ -176,40 +213,23 @@ class Azure(clouds.Cloud):
176
213
  size_in_gb = size_in_bytes / (1024**3)
177
214
  return size_in_gb
178
215
 
179
- @classmethod
180
- def get_default_instance_type(
181
- cls,
182
- cpus: Optional[str] = None,
183
- memory: Optional[str] = None,
184
- disk_tier: Optional[resources_utils.DiskTier] = None
185
- ) -> Optional[str]:
186
- return service_catalog.get_default_instance_type(cpus=cpus,
187
- memory=memory,
188
- disk_tier=disk_tier,
189
- clouds='azure')
190
-
191
216
  def _get_default_image_tag(self, gen_version, instance_type) -> str:
192
217
  # ubuntu-2004 v21.08.30, K80 requires image with old NVIDIA driver version
193
218
  acc = self.get_accelerators_from_instance_type(instance_type)
194
219
  if acc is not None:
195
220
  acc_name = list(acc.keys())[0]
196
221
  if acc_name == 'K80':
197
- return 'skypilot:k80-ubuntu-2004'
198
-
199
- # ubuntu-2004 v21.11.04, the previous image we used in the past for
200
- # V1 HyperV instance before we change default image to ubuntu-hpc.
222
+ return _DEFAULT_GPU_K80_IMAGE_ID
223
+ # About Gen V1 vs V2:
201
224
  # In Azure, all instances with K80 (Standard_NC series), some
202
225
  # instances with M60 (Standard_NV series) and some cpu instances
203
- # (Basic_A, Standard_D, ...) are V1 instance. For these instances,
204
- # we use the previous image.
226
+ # (Basic_A, Standard_D, ...) are V1 instance.
227
+ # All A100 instances are V2.
205
228
  if gen_version == 'V1':
206
- return 'skypilot:v1-ubuntu-2004'
207
-
208
- # nvidia-driver: 535.54.03, cuda: 12.2
209
- # see: https://github.com/Azure/azhpc-images/releases/tag/ubuntu-hpc-20230803
210
- # All A100 instances is of gen2, so it will always use
211
- # the latest ubuntu-hpc:2204 image.
212
- return 'skypilot:gpu-ubuntu-2204'
229
+ return _DEFAULT_V1_IMAGE_ID
230
+ if acc is None:
231
+ return _DEFAULT_CPU_IMAGE_ID
232
+ return _DEFAULT_GPU_IMAGE_ID
213
233
 
214
234
  @classmethod
215
235
  def regions_with_offering(cls, instance_type: str,
@@ -302,17 +322,34 @@ class Azure(clouds.Cloud):
302
322
  else:
303
323
  assert region_name in resources.image_id, resources.image_id
304
324
  image_id = resources.image_id[region_name]
325
+
326
+ # Checked basic image syntax in resources.py
305
327
  if image_id.startswith('skypilot:'):
306
328
  image_id = service_catalog.get_image_id_from_tag(image_id,
307
329
  clouds='azure')
308
- # Already checked in resources.py
309
- publisher, offer, sku, version = image_id.split(':')
310
- image_config = {
311
- 'image_publisher': publisher,
312
- 'image_offer': offer,
313
- 'image_sku': sku,
314
- 'image_version': version,
315
- }
330
+ # Fallback if image does not exist in the specified region.
331
+ # Putting fallback here instead of at image validation
332
+ # when creating the resource because community images are
333
+ # regional so we need the correct region when we check whether
334
+ # the image exists.
335
+ if image_id.startswith(
336
+ _COMMUNITY_IMAGE_PREFIX
337
+ ) and region_name not in azure_catalog.COMMUNITY_IMAGE_AVAILABLE_REGIONS:
338
+ logger.info(f'Azure image {image_id} does not exist in region '
339
+ f'{region_name} so use the fallback image instead.')
340
+ image_id = service_catalog.get_image_id_from_tag(
341
+ _FALLBACK_IMAGE_ID, clouds='azure')
342
+
343
+ if image_id.startswith(_COMMUNITY_IMAGE_PREFIX):
344
+ image_config = {'community_gallery_image_id': image_id}
345
+ else:
346
+ publisher, offer, sku, version = image_id.split(':')
347
+ image_config = {
348
+ 'image_publisher': publisher,
349
+ 'image_offer': offer,
350
+ 'image_sku': sku,
351
+ 'image_version': version,
352
+ }
316
353
 
317
354
  # Setup the A10 nvidia driver.
318
355
  need_nvidia_driver_extension = (acc_dict is not None and
@@ -380,7 +417,6 @@ class Azure(clouds.Cloud):
380
417
  # Setting disk performance tier for high disk tier.
381
418
  if disk_tier == resources_utils.DiskTier.HIGH:
382
419
  resources_vars['disk_performance_tier'] = 'P50'
383
-
384
420
  return resources_vars
385
421
 
386
422
  def _get_feasible_launchable_resources(
sky/clouds/oci.py CHANGED
@@ -468,7 +468,7 @@ class OCI(clouds.Cloud):
468
468
  api_key_file = oci_cfg[
469
469
  'key_file'] if 'key_file' in oci_cfg else 'BadConf'
470
470
  sky_cfg_file = oci_utils.oci_config.get_sky_user_config_file()
471
- except ImportError:
471
+ except (ImportError, oci_adaptor.oci.exceptions.ConfigFileNotFound):
472
472
  return {}
473
473
 
474
474
  # OCI config and API key file are mandatory
@@ -12,6 +12,21 @@ from sky.clouds.service_catalog import common
12
12
  from sky.utils import resources_utils
13
13
  from sky.utils import ux_utils
14
14
 
15
+ # This list should match the list of regions in
16
+ # skypilot image generation Packer script's replication_regions
17
+ # sky/clouds/service_catalog/images/skypilot-azure-cpu-ubuntu.pkr.hcl
18
+ COMMUNITY_IMAGE_AVAILABLE_REGIONS = {
19
+ 'centralus',
20
+ 'eastus',
21
+ 'eastus2',
22
+ 'northcentralus',
23
+ 'southcentralus',
24
+ 'westcentralus',
25
+ 'westus',
26
+ 'westus2',
27
+ 'westus3',
28
+ }
29
+
15
30
  # The frequency of pulling the latest catalog from the cloud provider.
16
31
  # Though the catalog update is manual in our skypilot-catalog repo, we
17
32
  # still want to pull the latest catalog periodically to make sure the
@@ -120,8 +120,14 @@ def list_accelerators_realtime(
120
120
 
121
121
  # Generate the GPU quantities for the accelerators
122
122
  if accelerator_name and accelerator_count > 0:
123
- for count in range(1, accelerator_count + 1):
123
+ count = 1
124
+ while count <= accelerator_count:
124
125
  accelerators_qtys.add((accelerator_name, count))
126
+ count *= 2
127
+ # Add the accelerator count if it's not already in the set
128
+ # (e.g., if there's 12 GPUs, we should have qtys 1, 2, 4, 8, 12)
129
+ if accelerator_count not in accelerators_qtys:
130
+ accelerators_qtys.add((accelerator_name, accelerator_count))
125
131
 
126
132
  for pod in pods:
127
133
  # Get all the pods running on the node
@@ -0,0 +1,91 @@
1
+ """Utilities for Azure"""
2
+
3
+ import typing
4
+
5
+ from sky import exceptions
6
+ from sky.adaptors import azure
7
+ from sky.utils import ux_utils
8
+
9
+ if typing.TYPE_CHECKING:
10
+ from azure.mgmt import compute as azure_compute
11
+ from azure.mgmt.compute import models as azure_compute_models
12
+
13
+
14
+ def validate_image_id(image_id: str):
15
+ """Check if the image ID has a valid format.
16
+
17
+ Raises:
18
+ ValueError: If the image ID is invalid.
19
+ """
20
+ image_id_colon_splitted = image_id.split(':')
21
+ image_id_slash_splitted = image_id.split('/')
22
+ if len(image_id_slash_splitted) != 5 and len(image_id_colon_splitted) != 4:
23
+ with ux_utils.print_exception_no_traceback():
24
+ raise ValueError(
25
+ f'Invalid image id for Azure: {image_id}. Expected format: \n'
26
+ '* Marketplace image ID: <publisher>:<offer>:<sku>:<version>\n'
27
+ '* Community image ID: '
28
+ '/CommunityGalleries/<gallery-name>/Images/<image-name>')
29
+ if len(image_id_slash_splitted) == 5:
30
+ _, gallery_type, _, image_type, _ = image_id.split('/')
31
+ if gallery_type != 'CommunityGalleries' or image_type != 'Images':
32
+ with ux_utils.print_exception_no_traceback():
33
+ raise ValueError(
34
+ f'Invalid community image id for Azure: {image_id}.\n'
35
+ 'Expected format: '
36
+ '/CommunityGalleries/<gallery-name>/Images/<image-name>')
37
+
38
+
39
+ def get_community_image(
40
+ compute_client: 'azure_compute.ComputeManagementClient', image_id: str,
41
+ region: str) -> 'azure_compute_models.CommunityGalleryImage':
42
+ """Get community image from cloud.
43
+
44
+ Args:
45
+ image_id: /CommunityGalleries/<gallery-name>/Images/<image-name>
46
+ Raises:
47
+ ResourcesUnavailableError
48
+ """
49
+ try:
50
+ _, _, gallery_name, _, image_name = image_id.split('/')
51
+ return compute_client.community_gallery_images.get(
52
+ location=region,
53
+ public_gallery_name=gallery_name,
54
+ gallery_image_name=image_name)
55
+ except azure.exceptions().AzureError as e:
56
+ raise exceptions.ResourcesUnavailableError(
57
+ f'Community image {image_id} does not exist in region {region}.'
58
+ ) from e
59
+
60
+
61
+ def get_community_image_size(
62
+ compute_client: 'azure_compute.ComputeManagementClient',
63
+ gallery_name: str, image_name: str, region: str) -> float:
64
+ """Get the size of the community image from cloud.
65
+
66
+ Args:
67
+ image_id: /CommunityGalleries/<gallery-name>/Images/<image-name>
68
+ Raises:
69
+ ResourcesUnavailableError
70
+ """
71
+ try:
72
+ image_versions = compute_client.community_gallery_image_versions.list(
73
+ location=region,
74
+ public_gallery_name=gallery_name,
75
+ gallery_image_name=image_name,
76
+ )
77
+ image_versions = list(image_versions)
78
+ if not image_versions:
79
+ raise exceptions.ResourcesUnavailableError(
80
+ f'No versions available for Azure community image {image_name}')
81
+ latest_version = image_versions[-1].name
82
+
83
+ image_details = compute_client.community_gallery_image_versions.get(
84
+ location=region,
85
+ public_gallery_name=gallery_name,
86
+ gallery_image_name=image_name,
87
+ gallery_image_version_name=latest_version)
88
+ return image_details.storage_profile.os_disk_image.disk_size_gb
89
+ except azure.exceptions().AzureError as e:
90
+ raise exceptions.ResourcesUnavailableError(
91
+ f'Failed to get community image size: {e}.') from e
sky/exceptions.py CHANGED
@@ -1,7 +1,7 @@
1
1
  """Exceptions."""
2
2
  import enum
3
3
  import typing
4
- from typing import List, Optional
4
+ from typing import List, Optional, Sequence
5
5
 
6
6
  if typing.TYPE_CHECKING:
7
7
  from sky import status_lib
@@ -61,12 +61,12 @@ class ProvisionPrechecksError(Exception):
61
61
  the error will be raised.
62
62
 
63
63
  Args:
64
- reasons: (List[Exception]) The reasons why the prechecks failed.
64
+ reasons: (Sequence[Exception]) The reasons why the prechecks failed.
65
65
  """
66
66
 
67
- def __init__(self, reasons: List[Exception]) -> None:
67
+ def __init__(self, reasons: Sequence[Exception]) -> None:
68
68
  super().__init__()
69
- self.reasons = list(reasons)
69
+ self.reasons = reasons
70
70
 
71
71
 
72
72
  class ManagedJobReachedMaxRetriesError(Exception):
@@ -24,6 +24,7 @@ from sky.utils import common_utils
24
24
  from sky.utils import ux_utils
25
25
 
26
26
  if typing.TYPE_CHECKING:
27
+ from sky import resources
27
28
  from sky import task as task_lib
28
29
 
29
30
  logger = sky_logging.init_logger(__name__)
@@ -327,8 +328,7 @@ class StrategyExecutor:
327
328
  'Failure happened before provisioning. Failover '
328
329
  f'reasons: {reasons_str}')
329
330
  if raise_on_failure:
330
- raise exceptions.ProvisionPrechecksError(
331
- reasons=reasons)
331
+ raise exceptions.ProvisionPrechecksError(reasons)
332
332
  return None
333
333
  logger.info('Failed to launch a cluster with error: '
334
334
  f'{common_utils.format_exception(e)})')
@@ -382,7 +382,7 @@ class FailoverStrategyExecutor(StrategyExecutor, name='FAILOVER',
382
382
  # first retry in the same cloud/region. (Inside recover() we may not
383
383
  # rely on cluster handle, as it can be None if the cluster is
384
384
  # preempted.)
385
- self._launched_resources: Optional['sky.resources.Resources'] = None
385
+ self._launched_resources: Optional['resources.Resources'] = None
386
386
 
387
387
  def _launch(self,
388
388
  max_retry: Optional[int] = 3,
@@ -13,6 +13,12 @@
13
13
  "metadata": {
14
14
  "description": "Subnet parameters."
15
15
  }
16
+ },
17
+ "nsgName": {
18
+ "type": "string",
19
+ "metadata": {
20
+ "description": "Name of the Network Security Group associated with the SkyPilot cluster."
21
+ }
16
22
  }
17
23
  },
18
24
  "variables": {
@@ -20,7 +26,7 @@
20
26
  "location": "[resourceGroup().location]",
21
27
  "msiName": "[concat('sky-', parameters('clusterId'), '-msi')]",
22
28
  "roleAssignmentName": "[concat('sky-', parameters('clusterId'), '-ra')]",
23
- "nsgName": "[concat('sky-', parameters('clusterId'), '-nsg')]",
29
+ "nsgName": "[parameters('nsgName')]",
24
30
  "nsg": "[resourceId('Microsoft.Network/networkSecurityGroups', variables('nsgName'))]",
25
31
  "vnetName": "[concat('sky-', parameters('clusterId'), '-vnet')]",
26
32
  "subnetName": "[concat('sky-', parameters('clusterId'), '-subnet')]"
@@ -8,7 +8,7 @@ import json
8
8
  from pathlib import Path
9
9
  import random
10
10
  import time
11
- from typing import Any, Callable
11
+ from typing import Any, Callable, Tuple
12
12
 
13
13
  from sky import exceptions
14
14
  from sky import sky_logging
@@ -22,6 +22,7 @@ UNIQUE_ID_LEN = 4
22
22
  _DEPLOYMENT_NAME = 'skypilot-config'
23
23
  _LEGACY_DEPLOYMENT_NAME = 'ray-config'
24
24
  _RESOURCE_GROUP_WAIT_FOR_DELETION_TIMEOUT = 480 # 8 minutes
25
+ _CLUSTER_ID = '{cluster_name_on_cloud}-{unique_id}'
25
26
 
26
27
 
27
28
  def get_azure_sdk_function(client: Any, function_name: str) -> Callable:
@@ -41,6 +42,19 @@ def get_azure_sdk_function(client: Any, function_name: str) -> Callable:
41
42
  return func
42
43
 
43
44
 
45
+ def get_cluster_id_and_nsg_name(resource_group: str,
46
+ cluster_name_on_cloud: str) -> Tuple[str, str]:
47
+ hasher = hashlib.md5(resource_group.encode('utf-8'))
48
+ unique_id = hasher.hexdigest()[:UNIQUE_ID_LEN]
49
+ # We use the cluster name + resource group hash as the
50
+ # unique ID for the cluster, as we need to make sure that
51
+ # the deployments have unique names during failover.
52
+ cluster_id = _CLUSTER_ID.format(cluster_name_on_cloud=cluster_name_on_cloud,
53
+ unique_id=unique_id)
54
+ nsg_name = f'sky-{cluster_id}-nsg'
55
+ return cluster_id, nsg_name
56
+
57
+
44
58
  @common.log_function_start_end
45
59
  def bootstrap_instances(
46
60
  region: str, cluster_name_on_cloud: str,
@@ -117,12 +131,13 @@ def bootstrap_instances(
117
131
 
118
132
  logger.info(f'Using cluster name: {cluster_name_on_cloud}')
119
133
 
120
- hasher = hashlib.md5(provider_config['resource_group'].encode('utf-8'))
121
- unique_id = hasher.hexdigest()[:UNIQUE_ID_LEN]
134
+ cluster_id, nsg_name = get_cluster_id_and_nsg_name(
135
+ resource_group=provider_config['resource_group'],
136
+ cluster_name_on_cloud=cluster_name_on_cloud)
122
137
  subnet_mask = provider_config.get('subnet_mask')
123
138
  if subnet_mask is None:
124
139
  # choose a random subnet, skipping most common value of 0
125
- random.seed(unique_id)
140
+ random.seed(cluster_id)
126
141
  subnet_mask = f'10.{random.randint(1, 254)}.0.0/16'
127
142
  logger.info(f'Using subnet mask: {subnet_mask}')
128
143
 
@@ -135,10 +150,10 @@ def bootstrap_instances(
135
150
  'value': subnet_mask
136
151
  },
137
152
  'clusterId': {
138
- # We use the cluster name + resource group hash as the
139
- # unique ID for the cluster, as we need to make sure that
140
- # the deployments have unique names during failover.
141
- 'value': f'{cluster_name_on_cloud}-{unique_id}'
153
+ 'value': cluster_id
154
+ },
155
+ 'nsgName': {
156
+ 'value': nsg_name
142
157
  },
143
158
  },
144
159
  }
@@ -15,6 +15,7 @@ from sky import status_lib
15
15
  from sky.adaptors import azure
16
16
  from sky.provision import common
17
17
  from sky.provision import constants
18
+ from sky.provision.azure import config as config_lib
18
19
  from sky.utils import common_utils
19
20
  from sky.utils import subprocess_utils
20
21
  from sky.utils import ux_utils
@@ -31,6 +32,8 @@ logger = sky_logging.init_logger(__name__)
31
32
  # https://github.com/Azure/azure-sdk-for-python/issues/9422
32
33
  azure_logger = logging.getLogger('azure')
33
34
  azure_logger.setLevel(logging.WARNING)
35
+ Client = Any
36
+ NetworkSecurityGroup = Any
34
37
 
35
38
  _RESUME_INSTANCE_TIMEOUT = 480 # 8 minutes
36
39
  _RESUME_PER_INSTANCE_TIMEOUT = 120 # 2 minutes
@@ -40,6 +43,10 @@ _WAIT_CREATION_TIMEOUT_SECONDS = 600
40
43
 
41
44
  _RESOURCE_GROUP_NOT_FOUND_ERROR_MESSAGE = 'ResourceGroupNotFound'
42
45
  _POLL_INTERVAL = 1
46
+ # TODO(Doyoung): _LEGACY_NSG_NAME can be removed after 0.8.0 to ignore
47
+ # legacy nsg names.
48
+ _LEGACY_NSG_NAME = 'ray-{cluster_name_on_cloud}-nsg'
49
+ _SECOND_LEGACY_NSG_NAME = 'sky-{cluster_name_on_cloud}-nsg'
43
50
 
44
51
 
45
52
  class AzureInstanceStatus(enum.Enum):
@@ -795,6 +802,32 @@ def query_instances(
795
802
  return statuses
796
803
 
797
804
 
805
+ # TODO(Doyoung): _get_cluster_nsg can be removed after 0.8.0 to ignore
806
+ # legacy nsg names.
807
+ def _get_cluster_nsg(network_client: Client, resource_group: str,
808
+ cluster_name_on_cloud: str) -> NetworkSecurityGroup:
809
+ """Retrieve the NSG associated with the given name of the cluster."""
810
+ list_network_security_groups = _get_azure_sdk_function(
811
+ client=network_client.network_security_groups, function_name='list')
812
+ legacy_nsg_name = _LEGACY_NSG_NAME.format(
813
+ cluster_name_on_cloud=cluster_name_on_cloud)
814
+ second_legacy_nsg_name = _SECOND_LEGACY_NSG_NAME.format(
815
+ cluster_name_on_cloud=cluster_name_on_cloud)
816
+ _, nsg_name = config_lib.get_cluster_id_and_nsg_name(
817
+ resource_group=resource_group,
818
+ cluster_name_on_cloud=cluster_name_on_cloud)
819
+ possible_nsg_names = [nsg_name, legacy_nsg_name, second_legacy_nsg_name]
820
+ for nsg in list_network_security_groups(resource_group):
821
+ if nsg.name in possible_nsg_names:
822
+ return nsg
823
+
824
+ # Raise an error if no matching NSG is found
825
+ raise ValueError('Failed to find a matching NSG for cluster '
826
+ f'{cluster_name_on_cloud!r} in resource group '
827
+ f'{resource_group!r}. Expected NSG names were: '
828
+ f'{possible_nsg_names}.')
829
+
830
+
798
831
  def open_ports(
799
832
  cluster_name_on_cloud: str,
800
833
  ports: List[str],
@@ -809,58 +842,66 @@ def open_ports(
809
842
  update_network_security_groups = _get_azure_sdk_function(
810
843
  client=network_client.network_security_groups,
811
844
  function_name='create_or_update')
812
- list_network_security_groups = _get_azure_sdk_function(
813
- client=network_client.network_security_groups, function_name='list')
814
- for nsg in list_network_security_groups(resource_group):
815
- try:
816
- # Wait the NSG creation to be finished before opening a port. The
817
- # cluster provisioning triggers the NSG creation, but it may not be
818
- # finished yet.
819
- backoff = common_utils.Backoff(max_backoff_factor=1)
820
- start_time = time.time()
821
- while True:
822
- if nsg.provisioning_state not in ['Creating', 'Updating']:
823
- break
824
- if time.time() - start_time > _WAIT_CREATION_TIMEOUT_SECONDS:
825
- logger.warning(
826
- f'Fails to wait for the creation of NSG {nsg.name} in '
827
- f'{resource_group} within '
828
- f'{_WAIT_CREATION_TIMEOUT_SECONDS} seconds. '
829
- 'Skip this NSG.')
830
- backoff_time = backoff.current_backoff()
831
- logger.info(f'NSG {nsg.name} is not created yet. Waiting for '
832
- f'{backoff_time} seconds before checking again.')
833
- time.sleep(backoff_time)
834
-
835
- # Azure NSG rules have a priority field that determines the order
836
- # in which they are applied. The priority must be unique across
837
- # all inbound rules in one NSG.
838
- priority = max(rule.priority
839
- for rule in nsg.security_rules
840
- if rule.direction == 'Inbound') + 1
841
- nsg.security_rules.append(
842
- azure.create_security_rule(
843
- name=f'sky-ports-{cluster_name_on_cloud}-{priority}',
844
- priority=priority,
845
- protocol='Tcp',
846
- access='Allow',
847
- direction='Inbound',
848
- source_address_prefix='*',
849
- source_port_range='*',
850
- destination_address_prefix='*',
851
- destination_port_ranges=ports,
852
- ))
853
- poller = update_network_security_groups(resource_group, nsg.name,
854
- nsg)
855
- poller.wait()
856
- if poller.status() != 'Succeeded':
845
+
846
+ try:
847
+ # Wait for the NSG creation to be finished before opening a port. The
848
+ # cluster provisioning triggers the NSG creation, but it may not be
849
+ # finished yet.
850
+ backoff = common_utils.Backoff(max_backoff_factor=1)
851
+ start_time = time.time()
852
+ while True:
853
+ nsg = _get_cluster_nsg(network_client, resource_group,
854
+ cluster_name_on_cloud)
855
+ if nsg.provisioning_state not in ['Creating', 'Updating']:
856
+ break
857
+ if time.time() - start_time > _WAIT_CREATION_TIMEOUT_SECONDS:
857
858
  with ux_utils.print_exception_no_traceback():
858
- raise ValueError(f'Failed to open ports {ports} in NSG '
859
- f'{nsg.name}: {poller.status()}')
860
- except azure.exceptions().HttpResponseError as e:
859
+ raise TimeoutError(
860
+ f'Timed out while waiting for the Network '
861
+ f'Security Group {nsg.name!r} to be ready for '
862
+ f'cluster {cluster_name_on_cloud!r} in '
863
+ f'resource group {resource_group!r}. The NSG '
864
+ f'did not reach a stable state '
865
+ '(Creating/Updating) within the allocated '
866
+ f'{_WAIT_CREATION_TIMEOUT_SECONDS} seconds. '
867
+ 'Consequently, the operation to open ports '
868
+ f'{ports} failed.')
869
+
870
+ backoff_time = backoff.current_backoff()
871
+ logger.info(f'NSG {nsg.name} is not created yet. Waiting for '
872
+ f'{backoff_time} seconds before checking again.')
873
+ time.sleep(backoff_time)
874
+
875
+ # Azure NSG rules have a priority field that determines the order
876
+ # in which they are applied. The priority must be unique across
877
+ # all inbound rules in one NSG.
878
+ priority = max(rule.priority
879
+ for rule in nsg.security_rules
880
+ if rule.direction == 'Inbound') + 1
881
+ nsg.security_rules.append(
882
+ azure.create_security_rule(
883
+ name=f'sky-ports-{cluster_name_on_cloud}-{priority}',
884
+ priority=priority,
885
+ protocol='Tcp',
886
+ access='Allow',
887
+ direction='Inbound',
888
+ source_address_prefix='*',
889
+ source_port_range='*',
890
+ destination_address_prefix='*',
891
+ destination_port_ranges=ports,
892
+ ))
893
+ poller = update_network_security_groups(resource_group, nsg.name, nsg)
894
+ poller.wait()
895
+ if poller.status() != 'Succeeded':
861
896
  with ux_utils.print_exception_no_traceback():
862
- raise ValueError(
863
- f'Failed to open ports {ports} in NSG {nsg.name}.') from e
897
+ raise ValueError(f'Failed to open ports {ports} in NSG '
898
+ f'{nsg.name}: {poller.status()}')
899
+
900
+ except azure.exceptions().HttpResponseError as e:
901
+ with ux_utils.print_exception_no_traceback():
902
+ raise ValueError(f'Failed to open ports {ports} in NSG for cluster '
903
+ f'{cluster_name_on_cloud!r} within resource group '
904
+ f'{resource_group!r}.') from e
864
905
 
865
906
 
866
907
  def cleanup_ports(
@@ -18,6 +18,7 @@ from sky.provision.kubernetes import utils as kubernetes_utils
18
18
  from sky.utils import command_runner
19
19
  from sky.utils import common_utils
20
20
  from sky.utils import kubernetes_enums
21
+ from sky.utils import subprocess_utils
21
22
  from sky.utils import ux_utils
22
23
 
23
24
  POLL_INTERVAL = 2
@@ -398,8 +399,7 @@ def _setup_ssh_in_pods(namespace: str, context: Optional[str],
398
399
  # See https://www.educative.io/answers/error-mesg-ttyname-failed-inappropriate-ioctl-for-device # pylint: disable=line-too-long
399
400
  '$(prefix_cmd) sed -i "s/mesg n/tty -s \\&\\& mesg n/" ~/.profile;')
400
401
 
401
- # TODO(romilb): Parallelize the setup of SSH in pods for multi-node clusters
402
- for new_node in new_nodes:
402
+ def _setup_ssh_thread(new_node):
403
403
  pod_name = new_node.metadata.name
404
404
  runner = command_runner.KubernetesCommandRunner(
405
405
  ((namespace, context), pod_name))
@@ -411,6 +411,8 @@ def _setup_ssh_in_pods(namespace: str, context: Optional[str],
411
411
  stdout)
412
412
  logger.info(f'{"-"*20}End: Set up SSH in pod {pod_name!r} {"-"*20}')
413
413
 
414
+ subprocess_utils.run_in_parallel(_setup_ssh_thread, new_nodes)
415
+
414
416
 
415
417
  def _label_pod(namespace: str, context: Optional[str], pod_name: str,
416
418
  label: Dict[str, str]) -> None:
@@ -28,6 +28,7 @@ from sky.skylet import constants
28
28
  from sky.utils import common_utils
29
29
  from sky.utils import resources_utils
30
30
  from sky.utils import rich_utils
31
+ from sky.utils import subprocess_utils
31
32
  from sky.utils import ux_utils
32
33
 
33
34
  # Do not use __name__ as we do not want to propagate logs to sky.provision,
@@ -365,14 +366,13 @@ def wait_for_ssh(cluster_info: provision_common.ClusterInfo,
365
366
  # use a queue for SSH querying
366
367
  ips = collections.deque(ip_list)
367
368
  ssh_ports = collections.deque(port_list)
368
- while ips:
369
- ip = ips.popleft()
370
- ssh_port = ssh_ports.popleft()
371
- success, stderr = waiter(ip, ssh_port, **ssh_credentials)
372
- if not success:
373
- ips.append(ip)
374
- ssh_ports.append(ssh_port)
375
- if time.time() - start > timeout:
369
+
370
+ def _retry_ssh_thread(ip_ssh_port: Tuple[str, int]):
371
+ ip, ssh_port = ip_ssh_port
372
+ success = False
373
+ while not success:
374
+ success, stderr = waiter(ip, ssh_port, **ssh_credentials)
375
+ if not success and time.time() - start > timeout:
376
376
  with ux_utils.print_exception_no_traceback():
377
377
  raise RuntimeError(
378
378
  f'Failed to SSH to {ip} after timeout {timeout}s, with '
@@ -380,6 +380,14 @@ def wait_for_ssh(cluster_info: provision_common.ClusterInfo,
380
380
  logger.debug('Retrying in 1 second...')
381
381
  time.sleep(1)
382
382
 
383
+ # try one node and multiprocess the rest
384
+ if ips:
385
+ ip = ips.popleft()
386
+ ssh_port = ssh_ports.popleft()
387
+ _retry_ssh_thread((ip, ssh_port))
388
+ subprocess_utils.run_in_parallel(_retry_ssh_thread,
389
+ list(zip(ips, ssh_ports)))
390
+
383
391
 
384
392
  def _post_provision_setup(
385
393
  cloud_name: str, cluster_name: resources_utils.ClusterName,
sky/resources.py CHANGED
@@ -225,6 +225,7 @@ class Resources:
225
225
  self._set_memory(memory)
226
226
  self._set_accelerators(accelerators, accelerator_args)
227
227
 
228
+ # TODO: move these out of init to prevent repeated calls.
228
229
  self._try_validate_instance_type()
229
230
  self._try_validate_cpus_mem()
230
231
  self._try_validate_managed_job_attributes()
sky/usage/usage_lib.py CHANGED
@@ -432,8 +432,9 @@ def entrypoint_context(name: str, fallback: bool = False):
432
432
  with ux_utils.enable_traceback():
433
433
  trace = traceback.format_exc()
434
434
  messages.usage.stacktrace = trace
435
- if hasattr(e, 'detailed_reason') and e.detailed_reason is not None:
436
- messages.usage.stacktrace += '\nDetails: ' + e.detailed_reason
435
+ detailed_reason = getattr(e, 'detailed_reason', None)
436
+ if detailed_reason is not None:
437
+ messages.usage.stacktrace += '\nDetails: ' + detailed_reason
437
438
  messages.usage.exception = common_utils.remove_color(
438
439
  common_utils.format_exception(e))
439
440
  raise
sky/utils/common_utils.py CHANGED
@@ -362,7 +362,6 @@ def make_decorator(cls, name_or_fn: Union[str, Callable],
362
362
 
363
363
  @functools.wraps(f)
364
364
  def _record(*args, **kwargs):
365
- nonlocal name_or_fn
366
365
  with cls(name_or_fn, **ctx_kwargs):
367
366
  return f(*args, **kwargs)
368
367
 
@@ -376,7 +375,6 @@ def make_decorator(cls, name_or_fn: Union[str, Callable],
376
375
 
377
376
  @functools.wraps(name_or_fn)
378
377
  def _record(*args, **kwargs):
379
- nonlocal name_or_fn
380
378
  f = name_or_fn
381
379
  func_name = getattr(f, '__qualname__', f.__name__)
382
380
  module_name = getattr(f, '__module__', '')
@@ -579,7 +577,10 @@ def validate_schema(obj, schema, err_msg_prefix='', skip_none=True):
579
577
  e.message)
580
578
  else:
581
579
  err_msg = err_msg_prefix
580
+ assert isinstance(e.schema, dict), 'Schema must be a dictionary'
582
581
  known_fields = set(e.schema.get('properties', {}).keys())
582
+ assert isinstance(e.instance,
583
+ dict), 'Instance must be a dictionary'
583
584
  for field in e.instance:
584
585
  if field not in known_fields:
585
586
  most_similar_field = difflib.get_close_matches(
@@ -505,20 +505,17 @@ def get_controller_resources(
505
505
  if handle is not None:
506
506
  controller_resources_to_use = handle.launched_resources
507
507
 
508
- if controller_resources_to_use.cloud is not None:
509
- return {controller_resources_to_use}
508
+ # If the controller and replicas are from the same cloud (and region/zone),
509
+ # it should provide better connectivity. We will let the controller choose
510
+ # from the clouds (and regions/zones) of the resources if the user does not
511
+ # specify the cloud (and region/zone) for the controller.
510
512
 
511
- # If the controller and replicas are from the same cloud, it should
512
- # provide better connectivity. We will let the controller choose from
513
- # the clouds of the resources if the controller does not exist.
514
- # TODO(tian): Consider respecting the regions/zones specified for the
515
- # resources as well.
516
- requested_clouds: Set['clouds.Cloud'] = set()
513
+ requested_clouds_with_region_zone: Dict[str, Dict[Optional[str],
514
+ Set[Optional[str]]]] = {}
517
515
  for resource in task_resources:
518
- # cloud is an object and will not be able to be distinguished by set.
519
- # Here we manually check if the cloud is in the set.
520
516
  if resource.cloud is not None:
521
- if not clouds.cloud_in_iterable(resource.cloud, requested_clouds):
517
+ cloud_name = str(resource.cloud)
518
+ if cloud_name not in requested_clouds_with_region_zone:
522
519
  try:
523
520
  resource.cloud.check_features_are_supported(
524
521
  resources.Resources(),
@@ -526,7 +523,26 @@ def get_controller_resources(
526
523
  except exceptions.NotSupportedError:
527
524
  # Skip the cloud if it does not support hosting controllers.
528
525
  continue
529
- requested_clouds.add(resource.cloud)
526
+ requested_clouds_with_region_zone[cloud_name] = {}
527
+ if resource.region is None:
528
+ # If one of the resource.region is None, this could represent
529
+ # that the user is unsure about which region the resource is
530
+ # hosted in. In this case, we allow any region for this cloud.
531
+ requested_clouds_with_region_zone[cloud_name] = {None: {None}}
532
+ elif None not in requested_clouds_with_region_zone[cloud_name]:
533
+ if resource.region not in requested_clouds_with_region_zone[
534
+ cloud_name]:
535
+ requested_clouds_with_region_zone[cloud_name][
536
+ resource.region] = set()
537
+ # If one of the resource.zone is None, allow any zone in the
538
+ # region.
539
+ if resource.zone is None:
540
+ requested_clouds_with_region_zone[cloud_name][
541
+ resource.region] = {None}
542
+ elif None not in requested_clouds_with_region_zone[cloud_name][
543
+ resource.region]:
544
+ requested_clouds_with_region_zone[cloud_name][
545
+ resource.region].add(resource.zone)
530
546
  else:
531
547
  # if one of the resource.cloud is None, this could represent user
532
548
  # does not know which cloud is best for the specified resources.
@@ -536,14 +552,49 @@ def get_controller_resources(
536
552
  # - cloud: runpod
537
553
  # accelerators: A40
538
554
  # In this case, we allow the controller to be launched on any cloud.
539
- requested_clouds.clear()
555
+ requested_clouds_with_region_zone.clear()
540
556
  break
541
- if not requested_clouds:
557
+
558
+ # Extract filtering criteria from the controller resources specified by the
559
+ # user.
560
+ controller_cloud = str(
561
+ controller_resources_to_use.cloud
562
+ ) if controller_resources_to_use.cloud is not None else None
563
+ controller_region = controller_resources_to_use.region
564
+ controller_zone = controller_resources_to_use.zone
565
+
566
+ # Filter clouds if controller_resources_to_use.cloud is specified.
567
+ filtered_clouds = ({controller_cloud} if controller_cloud is not None else
568
+ requested_clouds_with_region_zone.keys())
569
+
570
+ # Filter regions and zones and construct the result.
571
+ result: Set[resources.Resources] = set()
572
+ for cloud_name in filtered_clouds:
573
+ regions = requested_clouds_with_region_zone.get(cloud_name,
574
+ {None: {None}})
575
+
576
+ # Filter regions if controller_resources_to_use.region is specified.
577
+ filtered_regions = ({controller_region} if controller_region is not None
578
+ else regions.keys())
579
+
580
+ for region in filtered_regions:
581
+ zones = regions.get(region, {None})
582
+
583
+ # Filter zones if controller_resources_to_use.zone is specified.
584
+ filtered_zones = ({controller_zone}
585
+ if controller_zone is not None else zones)
586
+
587
+ # Create combinations of cloud, region, and zone.
588
+ for zone in filtered_zones:
589
+ resource_copy = controller_resources_to_use.copy(
590
+ cloud=clouds.CLOUD_REGISTRY.from_str(cloud_name),
591
+ region=region,
592
+ zone=zone)
593
+ result.add(resource_copy)
594
+
595
+ if not result:
542
596
  return {controller_resources_to_use}
543
- return {
544
- controller_resources_to_use.copy(cloud=controller_cloud)
545
- for controller_cloud in requested_clouds
546
- }
597
+ return result
547
598
 
548
599
 
549
600
  def _setup_proxy_command_on_controller(
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: skypilot-nightly
3
- Version: 1.0.0.dev20241024
3
+ Version: 1.0.0.dev20241025
4
4
  Summary: SkyPilot: An intercloud broker for the clouds
5
5
  Author: SkyPilot Team
6
6
  License: Apache 2.0
@@ -1,16 +1,16 @@
1
- sky/__init__.py,sha256=6pKM5lRNTckGEImvaBniSM_EuAzDT5xbSHqP8AhenhU,5882
1
+ sky/__init__.py,sha256=GSmePpdUEM88IzUu72hNEsvulpIRguwT7aXuz3ked5s,5882
2
2
  sky/admin_policy.py,sha256=hPo02f_A32gCqhUueF0QYy1fMSSKqRwYEg_9FxScN_s,3248
3
3
  sky/authentication.py,sha256=pAdCT60OxxiXI9KXDyP2lQ9u9vMc6aMtq5Xi2h_hbdw,20984
4
- sky/check.py,sha256=jLMIIJrseaZj1_o5WkbaD9XdyXIlCaT6pyAaIFdhdmA,9079
5
- sky/cli.py,sha256=PdqIKhGs6P2lHEShX2-iSwAtbpS5TTFKWNKCRl2i6kw,210345
4
+ sky/check.py,sha256=D3Y3saIFAYVvPxuBHnVgJEO0fUVDxgjwuMBaO-D778k,9472
5
+ sky/cli.py,sha256=4HOGW3LTDlPNXHqvTykcM8iMWOCdAK90l6w34DYBIsg,210357
6
6
  sky/cloud_stores.py,sha256=RjFgmRhUh1Kk__f6g3KxzLp9s7dA0pFK4W1AukEuUaw,21153
7
7
  sky/core.py,sha256=DW9OGE2kS2CmsvQ1grrpRnNFS3woMGWSHu5GE99e-I4,38190
8
8
  sky/dag.py,sha256=WLFWr5hfrwjd31uYlNvI-zWUk7tLaT_gzJn4LzbVtkE,2780
9
- sky/exceptions.py,sha256=D7WARzYRt4dGjXo6gI-gzkoodZbKF1D-qncm_DbHB28,8846
9
+ sky/exceptions.py,sha256=KBIEJHgrw6OMBL8H65o-Gk6qYQEV1SR9gBwMjnMnxxg,8858
10
10
  sky/execution.py,sha256=CbrKMgfc2JgLqZqwPvmYKxbWAQKYqHpOLpUEOb-k2m0,24679
11
11
  sky/global_user_state.py,sha256=PywEmUutF97XBgRMClR6IS5_KM8JJC0oA1LsPUZebp0,28681
12
12
  sky/optimizer.py,sha256=OzxWiA6ZC0tyJ1eNMy4e72vitjfLKfbOLF9ywZOccXU,59343
13
- sky/resources.py,sha256=Qk_CYvLO8OFsnRLqXu-nG6qXfJEZ2aBMzxFJHYaXTvE,67398
13
+ sky/resources.py,sha256=6Hhk4tdwr_pZ6m5iA4w4MZUYflAOJtJw6va3jSHsVOI,67464
14
14
  sky/sky_logging.py,sha256=oLmTmwkuucIto3LHXLJfMcyRpYSkmZAZa5XzQPA5IHk,4434
15
15
  sky/skypilot_config.py,sha256=E3g65cX3P3dT9b5N0GgFBG6yB0FXwIGpisKoozmJmWU,9094
16
16
  sky/status_lib.py,sha256=J7Jb4_Dz0v2T64ttOdyUgpokvl4S0sBJrMfH7Fvo51A,1457
@@ -41,7 +41,7 @@ sky/benchmark/benchmark_state.py,sha256=X8CXmuU9KgsDRhKedhFgjeRMUFWtQsjFs1qECvPG
41
41
  sky/benchmark/benchmark_utils.py,sha256=eb-i6zYoo-Zkod-T9qtCu1FcYLw--Yyos1SyibUPZNE,26194
42
42
  sky/clouds/__init__.py,sha256=WuNIJEnZmBO72tU5awgaaL3rdvFRSkgaYNNeuY68dXo,1356
43
43
  sky/clouds/aws.py,sha256=XJVbOSkVVUHp9HbHDp0rFdHX113JHbY-3sgokGdNJVE,49527
44
- sky/clouds/azure.py,sha256=jTgynKU5tuOyBe97n2I7_k9P0Sw0QFU-6wLDLFwQhfM,28634
44
+ sky/clouds/azure.py,sha256=FklG_CEvOXLkZVoEYSCcNtPsQpq-2w6AJovzbLKun0w,30162
45
45
  sky/clouds/cloud.py,sha256=BBu1G-gkmylffldL50cvJ2DkDJ8vjVPziOPUAsvgJ2o,34948
46
46
  sky/clouds/cloud_registry.py,sha256=oLoYFjm_SDTgdHokY7b6A5Utq80HXRQNxV0fLjDdVsQ,2361
47
47
  sky/clouds/cudo.py,sha256=H4VyMo5wWGAv2MXZ3xsbWjlZA_cZYnt4ecNlTOOao8Y,13147
@@ -50,14 +50,14 @@ sky/clouds/gcp.py,sha256=m_dH04HqgU-DdW4R9wrSr66IpPt9JMKHEvHEGGFpeRo,54655
50
50
  sky/clouds/ibm.py,sha256=M8QdjeSFlwssfoY2aOodxG4q5R3eT9K-4lTPDHYvEYI,21476
51
51
  sky/clouds/kubernetes.py,sha256=j3imm_sbtyyZXvJ6qbqZmXok2C9OQIcGpyulljbTSJ4,28696
52
52
  sky/clouds/lambda_cloud.py,sha256=11dKUSunHUgaPZ1t8O85X29_NJ-o26sCt5DjwAPFgl4,12697
53
- sky/clouds/oci.py,sha256=ecVgcbCVJwDLtaYXs-yGDzwPYRr23KvjnzFOXwaY2O0,26914
53
+ sky/clouds/oci.py,sha256=Ve3MqVHay9oHRuK6vaCd3Rxz8fD54nfM_DKA4Qzf8l4,26963
54
54
  sky/clouds/paperspace.py,sha256=lmUZPYAblaqiBmGQwCunccMiTF_dVA1o3vqY9Q_Nc28,10921
55
55
  sky/clouds/runpod.py,sha256=lstUC6f4JDhtcH9NfwkbpCJMmfmvMigoanhPXPbTYds,11540
56
56
  sky/clouds/scp.py,sha256=2KLTuNSMdBzK8CLwSesv7efOuiLidIMoyNG4AOt5Sqw,15870
57
57
  sky/clouds/vsphere.py,sha256=7eZFYIDtY5sX_ATr8h7kwwkY9t8Z-EYMJ9HCjoRBoxI,12309
58
58
  sky/clouds/service_catalog/__init__.py,sha256=e0K-c64jQV9d6zly5OnIXMsYaZXs_Ko9osAbDaRlOOw,14743
59
59
  sky/clouds/service_catalog/aws_catalog.py,sha256=1wX1-wOMw2LZ7RkV_Ah7c42RLRYm-m5_GAXzn32M5a8,13038
60
- sky/clouds/service_catalog/azure_catalog.py,sha256=VJi3yfhZy9Sc6UfcLAc8xIoTlUlUr090TODkCZyyHFw,7311
60
+ sky/clouds/service_catalog/azure_catalog.py,sha256=DOAzAhI5eHRHTzYDBrlNmfh3YByAoR-A9kBVeh6ZXvs,7689
61
61
  sky/clouds/service_catalog/common.py,sha256=PA3llB0zZh4v0DO_gDDCKGhRIBx16CAp2WJZNxhjNOA,27266
62
62
  sky/clouds/service_catalog/config.py,sha256=ylzqewdEBjDg4awvFek6ldYmFrnvD2bVGLZuLPvEVYA,1793
63
63
  sky/clouds/service_catalog/constants.py,sha256=ai2yOlsVqBnEpbxaEHXt61COsHBLwOfw6GZXntEPj7k,411
@@ -65,7 +65,7 @@ sky/clouds/service_catalog/cudo_catalog.py,sha256=QXAOpx5fJ_cGCr5LbB7wpHMfKIla7G
65
65
  sky/clouds/service_catalog/fluidstack_catalog.py,sha256=c8MMTldG-q97MJ0zJymudQiOVQC_rxS7vqrZgLrgbQA,5038
66
66
  sky/clouds/service_catalog/gcp_catalog.py,sha256=v_5fsB3dB9oD8U7lBKnCe5ii6AUWEOiQjNarMnU_qLA,24379
67
67
  sky/clouds/service_catalog/ibm_catalog.py,sha256=0dzjmXABFECzaAuIa0E6pVINhVK6-G6U52Mj-L45gK8,4472
68
- sky/clouds/service_catalog/kubernetes_catalog.py,sha256=6OocEUkgyJtBgHwzu4RPsvru6pj6RwGU-4uSFNQmsSM,8254
68
+ sky/clouds/service_catalog/kubernetes_catalog.py,sha256=Eezfl-tx3obgy3d2Kz2XR-_ezj_y8Dxk4oOW7Hy_g-o,8599
69
69
  sky/clouds/service_catalog/lambda_catalog.py,sha256=BAhUGqHj8aVe1zUhEQNO7bQUhcd9jAespGvPyQubTJY,5281
70
70
  sky/clouds/service_catalog/oci_catalog.py,sha256=AG1mOgc-iWaX4zapONWMZPNd2RCKCsaNOyFc0eq_LFU,8551
71
71
  sky/clouds/service_catalog/paperspace_catalog.py,sha256=W8GgGlPbbWViELQ8EZfmIkxSbeQcCmMRUX4ecIIYDsk,3768
@@ -83,6 +83,7 @@ sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py,sha256=B7H14so38z
83
83
  sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py,sha256=SF_gTU74qg6L-DSWneCAbqP0lwZXaaDi5otiMIJbrw0,21462
84
84
  sky/clouds/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
85
85
  sky/clouds/utils/aws_utils.py,sha256=W5BRC-2F_VY4BymRA1kS6-MufsI3V8cfY_hv--4gJBU,1986
86
+ sky/clouds/utils/azure_utils.py,sha256=NToRBnhEyuUvb-nBnsKTxjhOBRkMcrelL8LK4w6s4t8,3555
86
87
  sky/clouds/utils/gcp_utils.py,sha256=Xc_COjJfDt__oqVwrCw7ejY2B7ptHjMjDVb8obcpJ6s,6968
87
88
  sky/clouds/utils/oci_utils.py,sha256=t-5QEQEs8swN683AAp-oDD6yQJOQqVBbsVcHkNyqnbU,4968
88
89
  sky/clouds/utils/scp_utils.py,sha256=RUp7NwyhKygOoVOwvdAOGdoQNSJjryOG6WSExCf-yas,15812
@@ -96,7 +97,7 @@ sky/jobs/__init__.py,sha256=yucibSB_ZimtJMvOhMxn6ZqwBIYNfcwmc6pSXtCqmNQ,1483
96
97
  sky/jobs/constants.py,sha256=YLgcCg_RHSYr_rfsI_4UIdXk78KKKOK29Oem88t5j8I,1350
97
98
  sky/jobs/controller.py,sha256=k28bbicxtML6p1YxSetk-1nhBHPCubpvLWJsh7TtU9c,26701
98
99
  sky/jobs/core.py,sha256=RkBFaKDlovmdzqlOAgQ0xAimZFgo4pXq3qaQkAvGsGk,16908
99
- sky/jobs/recovery_strategy.py,sha256=G3iFicEajB-l9FefvcqjqPIazb1X8BJ_AgVmD5bDV2w,25556
100
+ sky/jobs/recovery_strategy.py,sha256=UOEaVGSpRbCnCzlD8cgyjhCPIBIeBeCXCutoSic5aiA,25545
100
101
  sky/jobs/state.py,sha256=C6R5Yq7ftBqGPa_71tUjflBMKAaJ1FTTdbgjAwmbJsI,23231
101
102
  sky/jobs/utils.py,sha256=lYfWkEAPVnYcj2nT6VYdM6PCaWKUH6_AD4TAV_sVCkY,36376
102
103
  sky/jobs/dashboard/dashboard.py,sha256=HFShuaxKir97QTeK2x37h6bsY6ncaFaNEg1USZqJPdc,3050
@@ -109,15 +110,15 @@ sky/provision/docker_utils.py,sha256=cKYasCwbMf6C2_0vTxg2GvbrnhFvko-xDl1frfm7wxc
109
110
  sky/provision/instance_setup.py,sha256=n1Px_KOYZl7Rf1WLXrfTTHyqxyA8_5QTN9BNLjQRkgc,22427
110
111
  sky/provision/logging.py,sha256=yZWgejrFBhhRjAtvFu5N5bRXIMK5TuwNjp1vKQqz2pw,2103
111
112
  sky/provision/metadata_utils.py,sha256=LrxeV4wD2QPzNdXV_npj8q-pr35FatxBBjF_jSbpOT0,4013
112
- sky/provision/provisioner.py,sha256=A4-yY0Q4GnkdJsHl_DLNEycq5wFKFsPwT0fwTNh1dG0,25016
113
+ sky/provision/provisioner.py,sha256=mTvtBjS-Xz64LJcyeHx_-wdM8Gin8D49YRaV_TADaz4,25334
113
114
  sky/provision/aws/__init__.py,sha256=mxq8PeWJqUtalDozTNpbtENErRZ1ktEs8uf2aG9UUgU,731
114
115
  sky/provision/aws/config.py,sha256=ApEh63RR_KyCp9nPXX35z6jBREoulJPQ5st8K9Jlclo,23385
115
116
  sky/provision/aws/instance.py,sha256=eCslJ2XfJo_pkQMnKFQqhGnUIRvwKiT12oxBY5-klss,40750
116
117
  sky/provision/aws/utils.py,sha256=m49pS-SHGW7Au3bhDeTPsL8N5iRzbwOXzyEWRCc1Vho,3238
117
118
  sky/provision/azure/__init__.py,sha256=87cgk1_Ws7n9rqaDDPv-HpfrkVeSQMdFQnhnXwyx9g4,548
118
- sky/provision/azure/azure-config-template.json,sha256=dwTO-DG70UtBifN59NzsJwPZHZ4uCs7_oLooHgXG_N8,4349
119
- sky/provision/azure/config.py,sha256=-VUYxmwLZKHzxNkRmE0q8ZkZ_0iq-gi0q8XPvRD_Y0U,7345
120
- sky/provision/azure/instance.py,sha256=pDdGScePohZHTRLEtr92k6vO9LIyd-RBCKOkZ5oDV58,37463
119
+ sky/provision/azure/azure-config-template.json,sha256=jsyzjE03FLoR8TF9phe7XMdmQyRQpbfwCMlSxjPjEig,4531
120
+ sky/provision/azure/config.py,sha256=zFgWWK-UWsbuxOo7_TTVL2rqWMqUWmkQl9bnP7w18xI,7907
121
+ sky/provision/azure/instance.py,sha256=xHHPSfPoBJxC4IkzHeaGfQZoxEpElSK7LC04z34spgc,39319
121
122
  sky/provision/cudo/__init__.py,sha256=KAEl26MVPsk7IoP9Gg-MOJJRIV6-X9B0fbyHdyJWdLo,741
122
123
  sky/provision/cudo/config.py,sha256=RYOVkV0MoUqVBJRZiKhBZhjFygeyFs7eUdVMdPg1vds,327
123
124
  sky/provision/cudo/cudo_machine_type.py,sha256=_VNXWPELmlFXbtdcnPvkuLuyE9CZ923BUCdiac-ClDY,696
@@ -136,7 +137,7 @@ sky/provision/gcp/instance_utils.py,sha256=veRBr6Oziv0KaUdC4acuWeaOremNV0gMYCCHa
136
137
  sky/provision/gcp/mig_utils.py,sha256=oFpcFZoapHMILSE4iIm8V5bxP1RhbMHRF7cciqq8qAk,7883
137
138
  sky/provision/kubernetes/__init__.py,sha256=y6yVfii81WYG3ROxv4hiIj-ydinS5-xGxLvXnARVQoI,719
138
139
  sky/provision/kubernetes/config.py,sha256=WEKcFXXhe89bLGAvoMiBvTDxdxkpTIA6ezrj2vmzldc,29072
139
- sky/provision/kubernetes/instance.py,sha256=Qth9AWc8OBGB7WeGJ4ERlopNA8y2wg1AvS5XPJEuXXQ,38421
140
+ sky/provision/kubernetes/instance.py,sha256=FOt77bFSKwi12J1_1qXhUrKiCqLfKWFgcRa1cLlNFlU,38453
140
141
  sky/provision/kubernetes/network.py,sha256=EpNjRQ131CXepqbdkoRKFu4szVrm0oKEpv1l8EgOkjU,12364
141
142
  sky/provision/kubernetes/network_utils.py,sha256=t1FS3K400fetH7cBuRgQJZl5_jEeMshsvsYmnMUcq8k,11399
142
143
  sky/provision/kubernetes/utils.py,sha256=2N5c4yA7CEn4DjvCiUO73W4XDEjgixcJRVdgs913QQE,89523
@@ -239,15 +240,15 @@ sky/templates/sky-serve-controller.yaml.j2,sha256=V1IiYhArv_D_7JzC3sVN4nKlSCCCL1
239
240
  sky/templates/vsphere-ray.yml.j2,sha256=cOQ-qdpxGA2FHajMMhTJI-SmlYzdPterX4Gsiq-nkb0,3587
240
241
  sky/usage/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
241
242
  sky/usage/constants.py,sha256=8xpg9vhDU9A3eObtpkNFjwa42oCazqGEv4yw_vJSO7U,590
242
- sky/usage/usage_lib.py,sha256=uqclBc87_9D_QVWigCjOIfWFoVB6re68C7RnwjzRYvg,17870
243
+ sky/usage/usage_lib.py,sha256=mxsbwUMEQjesUOIv4Yne-Ze7rVxSQYr3_wBXruifGRA,17898
243
244
  sky/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
244
245
  sky/utils/accelerator_registry.py,sha256=BO4iYH5bV80Xyp4EPfO0n1D3LL0FvESCy7xm59Je3_o,3798
245
246
  sky/utils/admin_policy_utils.py,sha256=zFCu1OFIrZRfQNY0JFRO1502WFfdqZhwAU_QgM4fO9U,5943
246
247
  sky/utils/cluster_yaml_utils.py,sha256=1wRRYqI1kI-eFs1pMW4r_FFjHJ0zamq6v2RRI-Gtx5E,849
247
248
  sky/utils/command_runner.py,sha256=3CDcqRXEmoe3C-t2P58McgcRg6p9m5haUWYj1rOLuqM,34858
248
249
  sky/utils/command_runner.pyi,sha256=mJOzCgcYZAfHwnY_6Wf1YwlTEJGb9ihzc2f0rE0Kw98,7751
249
- sky/utils/common_utils.py,sha256=MwFhIcvCEMBo7kbENUjN3qRNO5SoMV0fzAORc65c5x0,24525
250
- sky/utils/controller_utils.py,sha256=V05hiLJIjqqXssYzs_Gchk4-tijgpMgLJsRW8ymhS-E,40625
250
+ sky/utils/common_utils.py,sha256=Qy25LuIoTT0qg391EWyT9i5D6fwk1S4OdFwRpCTZ9Vk,24657
251
+ sky/utils/controller_utils.py,sha256=wF4_y1PCsLAWoo3XEtECwkNYTN6hO3vn_cxGxgQYcd8,43268
251
252
  sky/utils/dag_utils.py,sha256=gjGZiJj4_GYsraXX67e6ElvbmOByJcyjSfvVgYZiXvs,5588
252
253
  sky/utils/db_utils.py,sha256=AOvMmBEN9cF4I7CoXihPCtus4mU2VDGjBQSVMMgzKlA,2786
253
254
  sky/utils/env_options.py,sha256=3oAaUPxowL6vI2XmxXrH56V7Myj9IJWsL-MXFmRFVdI,1294
@@ -273,9 +274,9 @@ sky/utils/kubernetes/k8s_gpu_labeler_job.yaml,sha256=k0TBoQ4zgf79-sVkixKSGYFHQ7Z
273
274
  sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml,sha256=VLKT2KKimZu1GDg_4AIlIt488oMQvhRZWwsj9vBbPUg,3812
274
275
  sky/utils/kubernetes/rsync_helper.sh,sha256=hyYDaYSNxYaNvzUQBzC8AidB7nDeojizjkzc_CTxycY,1077
275
276
  sky/utils/kubernetes/ssh_jump_lifecycle_manager.py,sha256=RFLJ3k7MR5UN4SKHykQ0lV9SgXumoULpKYIAt1vh-HU,6560
276
- skypilot_nightly-1.0.0.dev20241024.dist-info/LICENSE,sha256=emRJAvE7ngL6x0RhQvlns5wJzGI3NEQ_WMjNmd9TZc4,12170
277
- skypilot_nightly-1.0.0.dev20241024.dist-info/METADATA,sha256=R0zBIXcaURnFidv6hVzwuO_p5xUp-L8ZB5_l9-NG0yc,19540
278
- skypilot_nightly-1.0.0.dev20241024.dist-info/WHEEL,sha256=OVMc5UfuAQiSplgO0_WdW7vXVGAt9Hdd6qtN4HotdyA,91
279
- skypilot_nightly-1.0.0.dev20241024.dist-info/entry_points.txt,sha256=StA6HYpuHj-Y61L2Ze-hK2IcLWgLZcML5gJu8cs6nU4,36
280
- skypilot_nightly-1.0.0.dev20241024.dist-info/top_level.txt,sha256=qA8QuiNNb6Y1OF-pCUtPEr6sLEwy2xJX06Bd_CrtrHY,4
281
- skypilot_nightly-1.0.0.dev20241024.dist-info/RECORD,,
277
+ skypilot_nightly-1.0.0.dev20241025.dist-info/LICENSE,sha256=emRJAvE7ngL6x0RhQvlns5wJzGI3NEQ_WMjNmd9TZc4,12170
278
+ skypilot_nightly-1.0.0.dev20241025.dist-info/METADATA,sha256=siLhZo4MgO_jZOW2C51DpjE_Uxw1MOaDZyaemct3w1g,19540
279
+ skypilot_nightly-1.0.0.dev20241025.dist-info/WHEEL,sha256=OVMc5UfuAQiSplgO0_WdW7vXVGAt9Hdd6qtN4HotdyA,91
280
+ skypilot_nightly-1.0.0.dev20241025.dist-info/entry_points.txt,sha256=StA6HYpuHj-Y61L2Ze-hK2IcLWgLZcML5gJu8cs6nU4,36
281
+ skypilot_nightly-1.0.0.dev20241025.dist-info/top_level.txt,sha256=qA8QuiNNb6Y1OF-pCUtPEr6sLEwy2xJX06Bd_CrtrHY,4
282
+ skypilot_nightly-1.0.0.dev20241025.dist-info/RECORD,,