skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (299) hide show
  1. sky/__init__.py +64 -32
  2. sky/adaptors/aws.py +23 -6
  3. sky/adaptors/azure.py +432 -15
  4. sky/adaptors/cloudflare.py +5 -5
  5. sky/adaptors/common.py +19 -9
  6. sky/adaptors/do.py +20 -0
  7. sky/adaptors/gcp.py +3 -2
  8. sky/adaptors/kubernetes.py +122 -88
  9. sky/adaptors/nebius.py +100 -0
  10. sky/adaptors/oci.py +39 -1
  11. sky/adaptors/vast.py +29 -0
  12. sky/admin_policy.py +101 -0
  13. sky/authentication.py +117 -98
  14. sky/backends/backend.py +52 -20
  15. sky/backends/backend_utils.py +669 -557
  16. sky/backends/cloud_vm_ray_backend.py +1099 -808
  17. sky/backends/local_docker_backend.py +14 -8
  18. sky/backends/wheel_utils.py +38 -20
  19. sky/benchmark/benchmark_utils.py +22 -23
  20. sky/check.py +76 -27
  21. sky/cli.py +1586 -1139
  22. sky/client/__init__.py +1 -0
  23. sky/client/cli.py +5683 -0
  24. sky/client/common.py +345 -0
  25. sky/client/sdk.py +1765 -0
  26. sky/cloud_stores.py +283 -19
  27. sky/clouds/__init__.py +7 -2
  28. sky/clouds/aws.py +303 -112
  29. sky/clouds/azure.py +185 -179
  30. sky/clouds/cloud.py +115 -37
  31. sky/clouds/cudo.py +29 -22
  32. sky/clouds/do.py +313 -0
  33. sky/clouds/fluidstack.py +44 -54
  34. sky/clouds/gcp.py +206 -65
  35. sky/clouds/ibm.py +26 -21
  36. sky/clouds/kubernetes.py +345 -91
  37. sky/clouds/lambda_cloud.py +40 -29
  38. sky/clouds/nebius.py +297 -0
  39. sky/clouds/oci.py +129 -90
  40. sky/clouds/paperspace.py +22 -18
  41. sky/clouds/runpod.py +53 -34
  42. sky/clouds/scp.py +28 -24
  43. sky/clouds/service_catalog/__init__.py +19 -13
  44. sky/clouds/service_catalog/aws_catalog.py +29 -12
  45. sky/clouds/service_catalog/azure_catalog.py +33 -6
  46. sky/clouds/service_catalog/common.py +95 -75
  47. sky/clouds/service_catalog/constants.py +3 -3
  48. sky/clouds/service_catalog/cudo_catalog.py +13 -3
  49. sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
  50. sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
  51. sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
  52. sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
  53. sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
  54. sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
  55. sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
  56. sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
  57. sky/clouds/service_catalog/do_catalog.py +111 -0
  58. sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
  59. sky/clouds/service_catalog/gcp_catalog.py +16 -2
  60. sky/clouds/service_catalog/ibm_catalog.py +2 -2
  61. sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
  62. sky/clouds/service_catalog/lambda_catalog.py +8 -3
  63. sky/clouds/service_catalog/nebius_catalog.py +116 -0
  64. sky/clouds/service_catalog/oci_catalog.py +31 -4
  65. sky/clouds/service_catalog/paperspace_catalog.py +2 -2
  66. sky/clouds/service_catalog/runpod_catalog.py +2 -2
  67. sky/clouds/service_catalog/scp_catalog.py +2 -2
  68. sky/clouds/service_catalog/vast_catalog.py +104 -0
  69. sky/clouds/service_catalog/vsphere_catalog.py +2 -2
  70. sky/clouds/utils/aws_utils.py +65 -0
  71. sky/clouds/utils/azure_utils.py +91 -0
  72. sky/clouds/utils/gcp_utils.py +5 -9
  73. sky/clouds/utils/oci_utils.py +47 -5
  74. sky/clouds/utils/scp_utils.py +4 -3
  75. sky/clouds/vast.py +280 -0
  76. sky/clouds/vsphere.py +22 -18
  77. sky/core.py +361 -107
  78. sky/dag.py +41 -28
  79. sky/data/data_transfer.py +37 -0
  80. sky/data/data_utils.py +211 -32
  81. sky/data/mounting_utils.py +182 -30
  82. sky/data/storage.py +2118 -270
  83. sky/data/storage_utils.py +126 -5
  84. sky/exceptions.py +179 -8
  85. sky/execution.py +158 -85
  86. sky/global_user_state.py +150 -34
  87. sky/jobs/__init__.py +12 -10
  88. sky/jobs/client/__init__.py +0 -0
  89. sky/jobs/client/sdk.py +302 -0
  90. sky/jobs/constants.py +49 -11
  91. sky/jobs/controller.py +161 -99
  92. sky/jobs/dashboard/dashboard.py +171 -25
  93. sky/jobs/dashboard/templates/index.html +572 -60
  94. sky/jobs/recovery_strategy.py +157 -156
  95. sky/jobs/scheduler.py +307 -0
  96. sky/jobs/server/__init__.py +1 -0
  97. sky/jobs/server/core.py +598 -0
  98. sky/jobs/server/dashboard_utils.py +69 -0
  99. sky/jobs/server/server.py +190 -0
  100. sky/jobs/state.py +627 -122
  101. sky/jobs/utils.py +615 -206
  102. sky/models.py +27 -0
  103. sky/optimizer.py +142 -83
  104. sky/provision/__init__.py +20 -5
  105. sky/provision/aws/config.py +124 -42
  106. sky/provision/aws/instance.py +130 -53
  107. sky/provision/azure/__init__.py +7 -0
  108. sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
  109. sky/provision/azure/config.py +220 -0
  110. sky/provision/azure/instance.py +1012 -37
  111. sky/provision/common.py +31 -3
  112. sky/provision/constants.py +25 -0
  113. sky/provision/cudo/__init__.py +2 -1
  114. sky/provision/cudo/cudo_utils.py +112 -0
  115. sky/provision/cudo/cudo_wrapper.py +37 -16
  116. sky/provision/cudo/instance.py +28 -12
  117. sky/provision/do/__init__.py +11 -0
  118. sky/provision/do/config.py +14 -0
  119. sky/provision/do/constants.py +10 -0
  120. sky/provision/do/instance.py +287 -0
  121. sky/provision/do/utils.py +301 -0
  122. sky/provision/docker_utils.py +82 -46
  123. sky/provision/fluidstack/fluidstack_utils.py +57 -125
  124. sky/provision/fluidstack/instance.py +15 -43
  125. sky/provision/gcp/config.py +19 -9
  126. sky/provision/gcp/constants.py +7 -1
  127. sky/provision/gcp/instance.py +55 -34
  128. sky/provision/gcp/instance_utils.py +339 -80
  129. sky/provision/gcp/mig_utils.py +210 -0
  130. sky/provision/instance_setup.py +172 -133
  131. sky/provision/kubernetes/__init__.py +1 -0
  132. sky/provision/kubernetes/config.py +104 -90
  133. sky/provision/kubernetes/constants.py +8 -0
  134. sky/provision/kubernetes/instance.py +680 -325
  135. sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
  136. sky/provision/kubernetes/network.py +54 -20
  137. sky/provision/kubernetes/network_utils.py +70 -21
  138. sky/provision/kubernetes/utils.py +1370 -251
  139. sky/provision/lambda_cloud/__init__.py +11 -0
  140. sky/provision/lambda_cloud/config.py +10 -0
  141. sky/provision/lambda_cloud/instance.py +265 -0
  142. sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
  143. sky/provision/logging.py +1 -1
  144. sky/provision/nebius/__init__.py +11 -0
  145. sky/provision/nebius/config.py +11 -0
  146. sky/provision/nebius/instance.py +285 -0
  147. sky/provision/nebius/utils.py +318 -0
  148. sky/provision/oci/__init__.py +15 -0
  149. sky/provision/oci/config.py +51 -0
  150. sky/provision/oci/instance.py +436 -0
  151. sky/provision/oci/query_utils.py +681 -0
  152. sky/provision/paperspace/constants.py +6 -0
  153. sky/provision/paperspace/instance.py +4 -3
  154. sky/provision/paperspace/utils.py +2 -0
  155. sky/provision/provisioner.py +207 -130
  156. sky/provision/runpod/__init__.py +1 -0
  157. sky/provision/runpod/api/__init__.py +3 -0
  158. sky/provision/runpod/api/commands.py +119 -0
  159. sky/provision/runpod/api/pods.py +142 -0
  160. sky/provision/runpod/instance.py +64 -8
  161. sky/provision/runpod/utils.py +239 -23
  162. sky/provision/vast/__init__.py +10 -0
  163. sky/provision/vast/config.py +11 -0
  164. sky/provision/vast/instance.py +247 -0
  165. sky/provision/vast/utils.py +162 -0
  166. sky/provision/vsphere/common/vim_utils.py +1 -1
  167. sky/provision/vsphere/instance.py +8 -18
  168. sky/provision/vsphere/vsphere_utils.py +1 -1
  169. sky/resources.py +247 -102
  170. sky/serve/__init__.py +9 -9
  171. sky/serve/autoscalers.py +361 -299
  172. sky/serve/client/__init__.py +0 -0
  173. sky/serve/client/sdk.py +366 -0
  174. sky/serve/constants.py +12 -3
  175. sky/serve/controller.py +106 -36
  176. sky/serve/load_balancer.py +63 -12
  177. sky/serve/load_balancing_policies.py +84 -2
  178. sky/serve/replica_managers.py +42 -34
  179. sky/serve/serve_state.py +62 -32
  180. sky/serve/serve_utils.py +271 -160
  181. sky/serve/server/__init__.py +0 -0
  182. sky/serve/{core.py → server/core.py} +271 -90
  183. sky/serve/server/server.py +112 -0
  184. sky/serve/service.py +52 -16
  185. sky/serve/service_spec.py +95 -32
  186. sky/server/__init__.py +1 -0
  187. sky/server/common.py +430 -0
  188. sky/server/constants.py +21 -0
  189. sky/server/html/log.html +174 -0
  190. sky/server/requests/__init__.py +0 -0
  191. sky/server/requests/executor.py +472 -0
  192. sky/server/requests/payloads.py +487 -0
  193. sky/server/requests/queues/__init__.py +0 -0
  194. sky/server/requests/queues/mp_queue.py +76 -0
  195. sky/server/requests/requests.py +567 -0
  196. sky/server/requests/serializers/__init__.py +0 -0
  197. sky/server/requests/serializers/decoders.py +192 -0
  198. sky/server/requests/serializers/encoders.py +166 -0
  199. sky/server/server.py +1106 -0
  200. sky/server/stream_utils.py +141 -0
  201. sky/setup_files/MANIFEST.in +2 -5
  202. sky/setup_files/dependencies.py +159 -0
  203. sky/setup_files/setup.py +14 -125
  204. sky/sky_logging.py +59 -14
  205. sky/skylet/autostop_lib.py +2 -2
  206. sky/skylet/constants.py +183 -50
  207. sky/skylet/events.py +22 -10
  208. sky/skylet/job_lib.py +403 -258
  209. sky/skylet/log_lib.py +111 -71
  210. sky/skylet/log_lib.pyi +6 -0
  211. sky/skylet/providers/command_runner.py +6 -8
  212. sky/skylet/providers/ibm/node_provider.py +2 -2
  213. sky/skylet/providers/scp/config.py +11 -3
  214. sky/skylet/providers/scp/node_provider.py +8 -8
  215. sky/skylet/skylet.py +3 -1
  216. sky/skylet/subprocess_daemon.py +69 -17
  217. sky/skypilot_config.py +119 -57
  218. sky/task.py +205 -64
  219. sky/templates/aws-ray.yml.j2 +37 -7
  220. sky/templates/azure-ray.yml.j2 +27 -82
  221. sky/templates/cudo-ray.yml.j2 +7 -3
  222. sky/templates/do-ray.yml.j2 +98 -0
  223. sky/templates/fluidstack-ray.yml.j2 +7 -4
  224. sky/templates/gcp-ray.yml.j2 +26 -6
  225. sky/templates/ibm-ray.yml.j2 +3 -2
  226. sky/templates/jobs-controller.yaml.j2 +46 -11
  227. sky/templates/kubernetes-ingress.yml.j2 +7 -0
  228. sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
  229. sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
  230. sky/templates/kubernetes-ray.yml.j2 +292 -25
  231. sky/templates/lambda-ray.yml.j2 +30 -40
  232. sky/templates/nebius-ray.yml.j2 +79 -0
  233. sky/templates/oci-ray.yml.j2 +18 -57
  234. sky/templates/paperspace-ray.yml.j2 +10 -6
  235. sky/templates/runpod-ray.yml.j2 +26 -4
  236. sky/templates/scp-ray.yml.j2 +3 -2
  237. sky/templates/sky-serve-controller.yaml.j2 +12 -1
  238. sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
  239. sky/templates/vast-ray.yml.j2 +70 -0
  240. sky/templates/vsphere-ray.yml.j2 +8 -3
  241. sky/templates/websocket_proxy.py +64 -0
  242. sky/usage/constants.py +10 -1
  243. sky/usage/usage_lib.py +130 -37
  244. sky/utils/accelerator_registry.py +35 -51
  245. sky/utils/admin_policy_utils.py +147 -0
  246. sky/utils/annotations.py +51 -0
  247. sky/utils/cli_utils/status_utils.py +81 -23
  248. sky/utils/cluster_utils.py +356 -0
  249. sky/utils/command_runner.py +452 -89
  250. sky/utils/command_runner.pyi +77 -3
  251. sky/utils/common.py +54 -0
  252. sky/utils/common_utils.py +319 -108
  253. sky/utils/config_utils.py +204 -0
  254. sky/utils/control_master_utils.py +48 -0
  255. sky/utils/controller_utils.py +548 -266
  256. sky/utils/dag_utils.py +93 -32
  257. sky/utils/db_utils.py +18 -4
  258. sky/utils/env_options.py +29 -7
  259. sky/utils/kubernetes/create_cluster.sh +8 -60
  260. sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
  261. sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
  262. sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
  263. sky/utils/kubernetes/gpu_labeler.py +4 -4
  264. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
  265. sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
  266. sky/utils/kubernetes/rsync_helper.sh +24 -0
  267. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
  268. sky/utils/log_utils.py +240 -33
  269. sky/utils/message_utils.py +81 -0
  270. sky/utils/registry.py +127 -0
  271. sky/utils/resources_utils.py +94 -22
  272. sky/utils/rich_utils.py +247 -18
  273. sky/utils/schemas.py +284 -64
  274. sky/{status_lib.py → utils/status_lib.py} +12 -7
  275. sky/utils/subprocess_utils.py +212 -46
  276. sky/utils/timeline.py +12 -7
  277. sky/utils/ux_utils.py +168 -15
  278. skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
  279. skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
  280. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
  281. sky/clouds/cloud_registry.py +0 -31
  282. sky/jobs/core.py +0 -330
  283. sky/skylet/providers/azure/__init__.py +0 -2
  284. sky/skylet/providers/azure/azure-vm-template.json +0 -301
  285. sky/skylet/providers/azure/config.py +0 -170
  286. sky/skylet/providers/azure/node_provider.py +0 -466
  287. sky/skylet/providers/lambda_cloud/__init__.py +0 -2
  288. sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
  289. sky/skylet/providers/oci/__init__.py +0 -2
  290. sky/skylet/providers/oci/node_provider.py +0 -488
  291. sky/skylet/providers/oci/query_helper.py +0 -383
  292. sky/skylet/providers/oci/utils.py +0 -21
  293. sky/utils/cluster_yaml_utils.py +0 -24
  294. sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
  295. skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
  296. skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
  297. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
  298. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
  299. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
sky/adaptors/azure.py CHANGED
@@ -1,25 +1,58 @@
1
1
  """Azure cli adaptor"""
2
2
 
3
3
  # pylint: disable=import-outside-toplevel
4
- import functools
4
+ import asyncio
5
+ import datetime
6
+ import logging
5
7
  import threading
8
+ import time
9
+ from typing import Any, Optional
10
+ import uuid
6
11
 
12
+ from sky import exceptions as sky_exceptions
13
+ from sky import sky_logging
7
14
  from sky.adaptors import common
15
+ from sky.skylet import constants
16
+ from sky.utils import annotations
17
+ from sky.utils import common_utils
18
+ from sky.utils import ux_utils
8
19
 
9
20
  azure = common.LazyImport(
10
21
  'azure',
11
22
  import_error_message=('Failed to import dependencies for Azure.'
12
- 'Try pip install "skypilot[azure]"'))
23
+ 'Try pip install "skypilot[azure]"'),
24
+ set_loggers=lambda: logging.getLogger('azure.identity').setLevel(logging.
25
+ ERROR))
26
+ Client = Any
27
+ sky_logger = sky_logging.init_logger(__name__)
28
+
13
29
  _LAZY_MODULES = (azure,)
14
30
 
15
31
  _session_creation_lock = threading.RLock()
32
+ _MAX_RETRY_FOR_GET_SUBSCRIPTION_ID = 5
16
33
 
17
34
 
18
35
  @common.load_lazy_modules(modules=_LAZY_MODULES)
36
+ @annotations.lru_cache(scope='global', maxsize=1)
19
37
  def get_subscription_id() -> str:
20
38
  """Get the default subscription id."""
21
39
  from azure.common import credentials
22
- return credentials.get_cli_profile().get_subscription_id()
40
+ retry = 0
41
+ backoff = common_utils.Backoff(initial_backoff=0.5, max_backoff_factor=4)
42
+ while True:
43
+ try:
44
+ return credentials.get_cli_profile().get_subscription_id()
45
+ except Exception as e:
46
+ if ('Please run \'az login\' to setup account.' in str(e) and
47
+ retry < _MAX_RETRY_FOR_GET_SUBSCRIPTION_ID):
48
+ # When there are multiple processes trying to get the
49
+ # subscription id, it may fail with the above error message.
50
+ # Retry will fix the issue.
51
+ retry += 1
52
+
53
+ time.sleep(backoff.current_backoff())
54
+ continue
55
+ raise
23
56
 
24
57
 
25
58
  @common.load_lazy_modules(modules=_LAZY_MODULES)
@@ -36,30 +69,414 @@ def exceptions():
36
69
  return azure_exceptions
37
70
 
38
71
 
39
- @functools.lru_cache()
72
+ @annotations.lru_cache(scope='global')
73
+ @common.load_lazy_modules(modules=_LAZY_MODULES)
74
+ def azure_mgmt_models(name: str):
75
+ if name == 'compute':
76
+ from azure.mgmt.compute import models
77
+ return models
78
+ elif name == 'network':
79
+ from azure.mgmt.network import models
80
+ return models
81
+
82
+
83
+ # We should keep the order of the decorators having 'lru_cache' followed
84
+ # by 'load_lazy_modules' as we need to make sure a caller can call
85
+ # 'get_client.cache_clear', which is a function provided by 'lru_cache'
86
+ @annotations.lru_cache(scope='global')
40
87
  @common.load_lazy_modules(modules=_LAZY_MODULES)
41
- def get_client(name: str, subscription_id: str):
88
+ def get_client(name: str,
89
+ subscription_id: Optional[str] = None,
90
+ **kwargs) -> Client:
91
+ """Creates and returns an Azure client for the specified service.
92
+
93
+ Args:
94
+ name: The type of Azure client to create.
95
+ subscription_id: The Azure subscription ID. Defaults to None.
96
+
97
+ Returns:
98
+ An instance of the specified Azure client.
99
+
100
+ Raises:
101
+ NonExistentStorageAccountError: When storage account provided
102
+ either through config.yaml or local db does not exist under
103
+ user's subscription ID.
104
+ StorageBucketGetError: If there is an error retrieving the container
105
+ client or if a non-existent public container is specified.
106
+ ValueError: If an unsupported client type is specified.
107
+ TimeoutError: If unable to get the container client within the
108
+ specified time.
109
+ """
42
110
  # Sky only supports Azure CLI credential for now.
43
111
  # Increase the timeout to fix the Azure get-access-token timeout issue.
44
112
  # Tracked in
45
113
  # https://github.com/Azure/azure-cli/issues/20404#issuecomment-1249575110
46
- from azure.identity import AzureCliCredential
114
+ from azure import identity
47
115
  with _session_creation_lock:
48
- credential = AzureCliCredential(process_timeout=30)
116
+ credential = identity.AzureCliCredential(process_timeout=30)
49
117
  if name == 'compute':
50
- from azure.mgmt.compute import ComputeManagementClient
51
- return ComputeManagementClient(credential, subscription_id)
118
+ from azure.mgmt import compute
119
+ return compute.ComputeManagementClient(credential, subscription_id)
52
120
  elif name == 'network':
53
- from azure.mgmt.network import NetworkManagementClient
54
- return NetworkManagementClient(credential, subscription_id)
121
+ from azure.mgmt import network
122
+ return network.NetworkManagementClient(credential, subscription_id)
55
123
  elif name == 'resource':
56
- from azure.mgmt.resource import ResourceManagementClient
57
- return ResourceManagementClient(credential, subscription_id)
124
+ from azure.mgmt import resource
125
+ return resource.ResourceManagementClient(credential,
126
+ subscription_id)
127
+ elif name == 'storage':
128
+ from azure.mgmt import storage
129
+ return storage.StorageManagementClient(credential, subscription_id)
130
+ elif name == 'authorization':
131
+ from azure.mgmt import authorization
132
+ return authorization.AuthorizationManagementClient(
133
+ credential, subscription_id)
134
+ elif name == 'msi':
135
+ from azure.mgmt import msi
136
+ return msi.ManagedServiceIdentityClient(credential, subscription_id)
137
+ elif name == 'graph':
138
+ import msgraph
139
+ return msgraph.GraphServiceClient(credential)
140
+ elif name == 'container':
141
+ # There is no direct way to check if a container URL is public or
142
+ # private. Attempting to access a private container without
143
+ # credentials or a public container with credentials throws an
144
+ # error. Therefore, we use a try-except block, first assuming the
145
+ # URL is for a public container. If an error occurs, we retry with
146
+ # credentials, assuming it's a private container.
147
+ # Reference: https://github.com/Azure/azure-sdk-for-python/issues/35770 # pylint: disable=line-too-long
148
+ # Note: Checking a private container without credentials is
149
+ # faster (~0.2s) than checking a public container with
150
+ # credentials (~90s).
151
+ from azure.mgmt import storage
152
+ from azure.storage import blob
153
+ container_url = kwargs.pop('container_url', None)
154
+ assert container_url is not None, ('Must provide container_url'
155
+ ' keyword arguments for '
156
+ 'container client.')
157
+ storage_account_name = kwargs.pop('storage_account_name', None)
158
+ assert storage_account_name is not None, ('Must provide '
159
+ 'storage_account_name '
160
+ 'keyword arguments for '
161
+ 'container client.')
162
+
163
+ # Check if the given storage account exists. This separate check
164
+ # is necessary as running container_client.exists() with container
165
+ # url on a non-existent storage account errors out after ~90s of lag.
166
+ storage_client = storage.StorageManagementClient(
167
+ credential, subscription_id)
168
+ storage_account_availability = (
169
+ storage_client.storage_accounts.check_name_availability(
170
+ {'name': storage_account_name}))
171
+ if storage_account_availability.name_available:
172
+ with ux_utils.print_exception_no_traceback():
173
+ raise sky_exceptions.NonExistentStorageAccountError(
174
+ f'The storage account {storage_account_name!r} does '
175
+ 'not exist. Please check if the name is correct.')
176
+
177
+ # First, assume the URL is from a public container.
178
+ container_client = blob.ContainerClient.from_container_url(
179
+ container_url)
180
+ try:
181
+ container_client.exists()
182
+ return container_client
183
+ except exceptions().ClientAuthenticationError:
184
+ pass
185
+
186
+ # If the URL is not for a public container, assume it's private
187
+ # and retry with credentials.
188
+ start_time = time.time()
189
+ role_assigned = False
190
+
191
+ while (time.time() - start_time <
192
+ constants.WAIT_FOR_STORAGE_ACCOUNT_ROLE_ASSIGNMENT):
193
+ container_client = blob.ContainerClient.from_container_url(
194
+ container_url, credential)
195
+ try:
196
+ # Suppress noisy logs from Azure SDK when attempting
197
+ # to run exists() on private container without access.
198
+ # Reference:
199
+ # https://github.com/Azure/azure-sdk-for-python/issues/9422
200
+ azure_logger = logging.getLogger('azure')
201
+ original_level = azure_logger.getEffectiveLevel()
202
+ azure_logger.setLevel(logging.CRITICAL)
203
+ container_client.exists()
204
+ azure_logger.setLevel(original_level)
205
+ return container_client
206
+ except exceptions().ClientAuthenticationError as e:
207
+ # Caught when user attempted to use private container
208
+ # without access rights. Raised error is handled at the
209
+ # upstream.
210
+ # Reference: https://learn.microsoft.com/en-us/troubleshoot/azure/entra/entra-id/app-integration/error-code-aadsts50020-user-account-identity-provider-does-not-exist # pylint: disable=line-too-long
211
+ if 'ERROR: AADSTS50020' in str(e):
212
+ with ux_utils.print_exception_no_traceback():
213
+ raise e
214
+ with ux_utils.print_exception_no_traceback():
215
+ raise sky_exceptions.StorageBucketGetError(
216
+ 'Failed to retreive the container client for the '
217
+ f'container {container_client.container_name!r}. '
218
+ f'Details: '
219
+ f'{common_utils.format_exception(e, use_bracket=True)}'
220
+ )
221
+ except exceptions().HttpResponseError as e:
222
+ # Handle case where user lacks sufficient IAM role for
223
+ # a private container in the same subscription. Attempt to
224
+ # assign appropriate role to current user.
225
+ if 'AuthorizationPermissionMismatch' in str(e):
226
+ if not role_assigned:
227
+ # resource_group_name is not None only for private
228
+ # containers with user access.
229
+ resource_group_name = kwargs.pop(
230
+ 'resource_group_name', None)
231
+ assert resource_group_name is not None, (
232
+ 'Must provide resource_group_name keyword '
233
+ 'arguments for container client.')
234
+ sky_logger.info(
235
+ 'Failed to check the existance of the '
236
+ f'container {container_url!r} due to '
237
+ 'insufficient IAM role for storage '
238
+ f'account {storage_account_name!r}.')
239
+ assign_storage_account_iam_role(
240
+ storage_account_name=storage_account_name,
241
+ resource_group_name=resource_group_name)
242
+ role_assigned = True
243
+ else:
244
+ sky_logger.info(
245
+ 'Waiting due to the propagation delay of IAM '
246
+ 'role assignment to the storage account '
247
+ f'{storage_account_name!r}.')
248
+ time.sleep(
249
+ constants.RETRY_INTERVAL_AFTER_ROLE_ASSIGNMENT)
250
+ continue
251
+ with ux_utils.print_exception_no_traceback():
252
+ raise sky_exceptions.StorageBucketGetError(
253
+ 'Failed to retreive the container client for the '
254
+ f'container {container_client.container_name!r}. '
255
+ f'Details: '
256
+ f'{common_utils.format_exception(e, use_bracket=True)}'
257
+ )
258
+ else:
259
+ raise TimeoutError(
260
+ 'Failed to get the container client within '
261
+ f'{constants.WAIT_FOR_STORAGE_ACCOUNT_ROLE_ASSIGNMENT}'
262
+ ' seconds.')
58
263
  else:
59
264
  raise ValueError(f'Client not supported: "{name}"')
60
265
 
61
266
 
267
+ @common.load_lazy_modules(modules=_LAZY_MODULES)
268
+ def get_az_container_sas_token(
269
+ storage_account_name: str,
270
+ storage_account_key: str,
271
+ container_name: str,
272
+ ) -> str:
273
+ """Returns SAS token used to access container.
274
+
275
+ Args:
276
+ storage_account_name: Name of the storage account
277
+ storage_account_key: Access key for the given storage account
278
+ container_name: The name of the mounting container
279
+
280
+ Returns:
281
+ A SAS token with a 1-hour lifespan to access the specified container.
282
+ """
283
+ from azure.storage import blob
284
+ sas_token = blob.generate_container_sas(
285
+ account_name=storage_account_name,
286
+ container_name=container_name,
287
+ account_key=storage_account_key,
288
+ permission=blob.ContainerSasPermissions(read=True,
289
+ write=True,
290
+ list=True,
291
+ create=True),
292
+ expiry=datetime.datetime.now(datetime.timezone.utc) +
293
+ datetime.timedelta(hours=1))
294
+ return sas_token
295
+
296
+
297
+ @common.load_lazy_modules(modules=_LAZY_MODULES)
298
+ def get_az_blob_sas_token(storage_account_name: str, storage_account_key: str,
299
+ container_name: str, blob_name: str) -> str:
300
+ """Returns SAS token used to access a blob.
301
+
302
+ Args:
303
+ storage_account_name: Name of the storage account
304
+ storage_account_key: access key for the given storage
305
+ account
306
+ container_name: name of the mounting container
307
+ blob_name: path to the blob(file)
308
+
309
+ Returns:
310
+ A SAS token with a 1-hour lifespan to access the specified blob.
311
+ """
312
+ from azure.storage import blob
313
+ sas_token = blob.generate_blob_sas(
314
+ account_name=storage_account_name,
315
+ container_name=container_name,
316
+ blob_name=blob_name,
317
+ account_key=storage_account_key,
318
+ permission=blob.BlobSasPermissions(read=True,
319
+ write=True,
320
+ list=True,
321
+ create=True),
322
+ expiry=datetime.datetime.now(datetime.timezone.utc) +
323
+ datetime.timedelta(hours=1))
324
+ return sas_token
325
+
326
+
327
+ def assign_storage_account_iam_role(
328
+ storage_account_name: str,
329
+ storage_account_id: Optional[str] = None,
330
+ resource_group_name: Optional[str] = None) -> None:
331
+ """Assigns the Storage Blob Data Owner role to a storage account.
332
+
333
+ This function retrieves the current user's object ID, then assigns the
334
+ Storage Blob Data Owner role to that user for the specified storage
335
+ account. If the role is already assigned, the function will return without
336
+ making changes.
337
+
338
+ Args:
339
+ storage_account_name: The name of the storage account.
340
+ storage_account_id: The ID of the storage account. If not provided,
341
+ it will be determined using the storage account name.
342
+ resource_group_name: Name of the resource group the
343
+ passed storage account belongs to.
344
+
345
+ Raises:
346
+ StorageBucketCreateError: If there is an error assigning the role
347
+ to the storage account.
348
+ """
349
+ subscription_id = get_subscription_id()
350
+ authorization_client = get_client('authorization', subscription_id)
351
+ graph_client = get_client('graph')
352
+
353
+ # Obtaining user's object ID to assign role.
354
+ # Reference: https://github.com/Azure/azure-sdk-for-python/issues/35573 # pylint: disable=line-too-long
355
+ async def get_object_id() -> str:
356
+ httpx_logger = logging.getLogger('httpx')
357
+ original_level = httpx_logger.getEffectiveLevel()
358
+ # silencing the INFO level response log from httpx request
359
+ httpx_logger.setLevel(logging.WARNING)
360
+ user = await graph_client.users.with_url(
361
+ 'https://graph.microsoft.com/v1.0/me').get()
362
+ httpx_logger.setLevel(original_level)
363
+ object_id = str(user.additional_data['id'])
364
+ return object_id
365
+
366
+ # Create a new event loop if none exists
367
+ try:
368
+ loop = asyncio.get_running_loop()
369
+ except RuntimeError:
370
+ loop = asyncio.new_event_loop()
371
+ asyncio.set_event_loop(loop)
372
+
373
+ object_id = loop.run_until_complete(get_object_id())
374
+
375
+ # Definition ID of Storage Blob Data Owner role.
376
+ # Reference: https://learn.microsoft.com/en-us/azure/role-based-access-control/built-in-roles/storage#storage-blob-data-owner # pylint: disable=line-too-long
377
+ storage_blob_data_owner_role_id = 'b7e6dc6d-f1e8-4753-8033-0f276bb0955b'
378
+ role_definition_id = ('/subscriptions'
379
+ f'/{subscription_id}'
380
+ '/providers/Microsoft.Authorization'
381
+ '/roleDefinitions'
382
+ f'/{storage_blob_data_owner_role_id}')
383
+
384
+ # Obtain storage account ID to assign role if not provided.
385
+ if storage_account_id is None:
386
+ assert resource_group_name is not None, ('resource_group_name should '
387
+ 'be provided if '
388
+ 'storage_account_id is not.')
389
+ storage_client = get_client('storage', subscription_id)
390
+ storage_account = storage_client.storage_accounts.get_properties(
391
+ resource_group_name, storage_account_name)
392
+ storage_account_id = storage_account.id
393
+
394
+ role_assignment_failure_error_msg = (
395
+ constants.ROLE_ASSIGNMENT_FAILURE_ERROR_MSG.format(
396
+ storage_account_name=storage_account_name))
397
+ try:
398
+ authorization_client.role_assignments.create(
399
+ scope=storage_account_id,
400
+ role_assignment_name=uuid.uuid4(),
401
+ parameters={
402
+ 'properties': {
403
+ 'principalId': object_id,
404
+ 'principalType': 'User',
405
+ 'roleDefinitionId': role_definition_id,
406
+ }
407
+ },
408
+ )
409
+ sky_logger.info('Assigned Storage Blob Data Owner role to your '
410
+ f'account on storage account {storage_account_name!r}.')
411
+ return
412
+ except exceptions().ResourceExistsError as e:
413
+ # Return if the storage account already has been assigned
414
+ # the role.
415
+ if 'RoleAssignmentExists' in str(e):
416
+ return
417
+ else:
418
+ with ux_utils.print_exception_no_traceback():
419
+ raise sky_exceptions.StorageBucketCreateError(
420
+ f'{role_assignment_failure_error_msg}'
421
+ f'Details: {common_utils.format_exception(e, use_bracket=True)}'
422
+ )
423
+ except exceptions().HttpResponseError as e:
424
+ if 'AuthorizationFailed' in str(e):
425
+ with ux_utils.print_exception_no_traceback():
426
+ raise sky_exceptions.StorageBucketCreateError(
427
+ f'{role_assignment_failure_error_msg}'
428
+ 'Please check to see if you have the authorization'
429
+ ' "Microsoft.Authorization/roleAssignments/write" '
430
+ 'to assign the role to the newly created storage '
431
+ 'account.')
432
+ else:
433
+ with ux_utils.print_exception_no_traceback():
434
+ raise sky_exceptions.StorageBucketCreateError(
435
+ f'{role_assignment_failure_error_msg}'
436
+ f'Details: {common_utils.format_exception(e, use_bracket=True)}'
437
+ )
438
+
439
+
440
+ def get_az_resource_group(
441
+ storage_account_name: str,
442
+ storage_client: Optional[Client] = None) -> Optional[str]:
443
+ """Returns the resource group name the given storage account belongs to.
444
+
445
+ Args:
446
+ storage_account_name: Name of the storage account
447
+ storage_client: Client object facing storage
448
+
449
+ Returns:
450
+ Name of the resource group the given storage account belongs to, or
451
+ None if not found.
452
+ """
453
+ if storage_client is None:
454
+ subscription_id = get_subscription_id()
455
+ storage_client = get_client('storage', subscription_id)
456
+ for account in storage_client.storage_accounts.list():
457
+ if account.name == storage_account_name:
458
+ # Extract the resource group name from the account ID
459
+ # An example of account.id would be the following:
460
+ # /subscriptions/{subscription_id}/resourceGroups/{resource_group_name}/providers/Microsoft.Storage/storageAccounts/{storage_account_name} # pylint: disable=line-too-long
461
+ split_account_id = account.id.split('/')
462
+ assert len(split_account_id) == 9
463
+ resource_group_name = split_account_id[4]
464
+ return resource_group_name
465
+ # resource group cannot be found when using container not created
466
+ # under the user's subscription id, i.e. public container, or
467
+ # private containers not belonging to the user or when the storage account
468
+ # does not exist.
469
+ return None
470
+
471
+
62
472
  @common.load_lazy_modules(modules=_LAZY_MODULES)
63
473
  def create_security_rule(**kwargs):
64
- from azure.mgmt.network.models import SecurityRule
65
- return SecurityRule(**kwargs)
474
+ from azure.mgmt.network import models
475
+ return models.SecurityRule(**kwargs)
476
+
477
+
478
+ @common.load_lazy_modules(modules=_LAZY_MODULES)
479
+ def deployment_mode():
480
+ """Azure deployment mode."""
481
+ from azure.mgmt.resource.resources.models import DeploymentMode
482
+ return DeploymentMode
@@ -2,12 +2,12 @@
2
2
  # pylint: disable=import-outside-toplevel
3
3
 
4
4
  import contextlib
5
- import functools
6
5
  import os
7
6
  import threading
8
7
  from typing import Dict, Optional, Tuple
9
8
 
10
9
  from sky.adaptors import common
10
+ from sky.utils import annotations
11
11
  from sky.utils import ux_utils
12
12
 
13
13
  _IMPORT_ERROR_MESSAGE = ('Failed to import dependencies for Cloudflare.'
@@ -62,7 +62,7 @@ def get_r2_credentials(boto3_session):
62
62
  # lru_cache() is thread-safe and it will return the same session object
63
63
  # for different threads.
64
64
  # Reference: https://docs.python.org/3/library/functools.html#functools.lru_cache # pylint: disable=line-too-long
65
- @functools.lru_cache()
65
+ @annotations.lru_cache(scope='global')
66
66
  def session():
67
67
  """Create an AWS session."""
68
68
  # Creating the session object is not thread-safe for boto3,
@@ -76,7 +76,7 @@ def session():
76
76
  return session_
77
77
 
78
78
 
79
- @functools.lru_cache()
79
+ @annotations.lru_cache(scope='global')
80
80
  def resource(resource_name: str, **kwargs):
81
81
  """Create a Cloudflare resource.
82
82
 
@@ -102,7 +102,7 @@ def resource(resource_name: str, **kwargs):
102
102
  **kwargs)
103
103
 
104
104
 
105
- @functools.lru_cache()
105
+ @annotations.lru_cache(scope='global')
106
106
  def client(service_name: str, region):
107
107
  """Create an CLOUDFLARE client of a certain service.
108
108
 
@@ -177,7 +177,7 @@ def check_credentials() -> Tuple[bool, Optional[str]]:
177
177
  hints += f'\n{_INDENT_PREFIX} $ mkdir -p ~/.cloudflare'
178
178
  hints += f'\n{_INDENT_PREFIX} $ echo <YOUR_ACCOUNT_ID_HERE> > ~/.cloudflare/accountid' # pylint: disable=line-too-long
179
179
  hints += f'\n{_INDENT_PREFIX}For more info: '
180
- hints += 'https://skypilot.readthedocs.io/en/latest/getting-started/installation.html#cloudflare-r2' # pylint: disable=line-too-long
180
+ hints += 'https://docs.skypilot.co/en/latest/getting-started/installation.html#cloudflare-r2' # pylint: disable=line-too-long
181
181
 
182
182
  return (False, hints) if hints else (True, hints)
183
183
 
sky/adaptors/common.py CHANGED
@@ -1,7 +1,8 @@
1
1
  """Lazy import for modules to avoid import error when not used."""
2
2
  import functools
3
3
  import importlib
4
- from typing import Any, Optional, Tuple
4
+ import threading
5
+ from typing import Any, Callable, Optional, Tuple
5
6
 
6
7
 
7
8
  class LazyImport:
@@ -18,19 +19,28 @@ class LazyImport:
18
19
 
19
20
  def __init__(self,
20
21
  module_name: str,
21
- import_error_message: Optional[str] = None):
22
+ import_error_message: Optional[str] = None,
23
+ set_loggers: Optional[Callable] = None):
22
24
  self._module_name = module_name
23
25
  self._module = None
24
26
  self._import_error_message = import_error_message
27
+ self._set_loggers = set_loggers
28
+ self._lock = threading.RLock()
25
29
 
26
30
  def load_module(self):
27
- if self._module is None:
28
- try:
29
- self._module = importlib.import_module(self._module_name)
30
- except ImportError as e:
31
- if self._import_error_message is not None:
32
- raise ImportError(self._import_error_message) from e
33
- raise
31
+ # Avoid extra imports when multiple threads try to import the same
32
+ # module. The overhead is minor since import can only run in serial
33
+ # due to GIL even in multi-threaded environments.
34
+ with self._lock:
35
+ if self._module is None:
36
+ try:
37
+ self._module = importlib.import_module(self._module_name)
38
+ if self._set_loggers is not None:
39
+ self._set_loggers()
40
+ except ImportError as e:
41
+ if self._import_error_message is not None:
42
+ raise ImportError(self._import_error_message) from e
43
+ raise
34
44
  return self._module
35
45
 
36
46
  def __getattr__(self, name: str) -> Any:
sky/adaptors/do.py ADDED
@@ -0,0 +1,20 @@
1
+ """Digital Ocean cloud adaptors"""
2
+
3
+ # pylint: disable=import-outside-toplevel
4
+
5
+ from sky.adaptors import common
6
+
7
# Message raised (as ImportError) when either lazy module below fails to load.
_IMPORT_ERROR_MESSAGE = (
    'Failed to import dependencies for DO. Try pip install "skypilot[do]"')

# `azure` is wrapped alongside `pydo` because the exceptions exposed by this
# adaptor come from `azure.core`.
pydo = common.LazyImport(module_name='pydo',
                         import_error_message=_IMPORT_ERROR_MESSAGE)
azure = common.LazyImport(module_name='azure',
                          import_error_message=_IMPORT_ERROR_MESSAGE)
_LAZY_MODULES = (pydo, azure)
12
+
13
+
14
# `pydo` inherits Azure exceptions. See:
# https://github.com/digitalocean/pydo/blob/7b01498d99eb0d3a772366b642e5fab3d6fc6aa2/examples/poc_droplets_volumes_sshkeys.py#L6
@common.load_lazy_modules(modules=_LAZY_MODULES)
def exceptions():
    """Azure exceptions.

    Returns:
        The `azure.core.exceptions` module.
    """
    from azure.core import exceptions as core_exceptions
    return core_exceptions
sky/adaptors/gcp.py CHANGED
@@ -21,8 +21,9 @@ def build(service_name: str, version: str, *args, **kwargs):
21
21
  service_name: GCP service name (e.g., 'compute', 'storagetransfer').
22
22
  version: Service version (e.g., 'v1').
23
23
  """
24
- from googleapiclient import discovery
25
- return discovery.build(service_name, version, *args, **kwargs)
24
+
25
+ return googleapiclient.discovery.build(service_name, version, *args,
26
+ **kwargs)
26
27
 
27
28
 
28
29
  @common.load_lazy_modules(_LAZY_MODULES)