skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (299) hide show
  1. sky/__init__.py +64 -32
  2. sky/adaptors/aws.py +23 -6
  3. sky/adaptors/azure.py +432 -15
  4. sky/adaptors/cloudflare.py +5 -5
  5. sky/adaptors/common.py +19 -9
  6. sky/adaptors/do.py +20 -0
  7. sky/adaptors/gcp.py +3 -2
  8. sky/adaptors/kubernetes.py +122 -88
  9. sky/adaptors/nebius.py +100 -0
  10. sky/adaptors/oci.py +39 -1
  11. sky/adaptors/vast.py +29 -0
  12. sky/admin_policy.py +101 -0
  13. sky/authentication.py +117 -98
  14. sky/backends/backend.py +52 -20
  15. sky/backends/backend_utils.py +669 -557
  16. sky/backends/cloud_vm_ray_backend.py +1099 -808
  17. sky/backends/local_docker_backend.py +14 -8
  18. sky/backends/wheel_utils.py +38 -20
  19. sky/benchmark/benchmark_utils.py +22 -23
  20. sky/check.py +76 -27
  21. sky/cli.py +1586 -1139
  22. sky/client/__init__.py +1 -0
  23. sky/client/cli.py +5683 -0
  24. sky/client/common.py +345 -0
  25. sky/client/sdk.py +1765 -0
  26. sky/cloud_stores.py +283 -19
  27. sky/clouds/__init__.py +7 -2
  28. sky/clouds/aws.py +303 -112
  29. sky/clouds/azure.py +185 -179
  30. sky/clouds/cloud.py +115 -37
  31. sky/clouds/cudo.py +29 -22
  32. sky/clouds/do.py +313 -0
  33. sky/clouds/fluidstack.py +44 -54
  34. sky/clouds/gcp.py +206 -65
  35. sky/clouds/ibm.py +26 -21
  36. sky/clouds/kubernetes.py +345 -91
  37. sky/clouds/lambda_cloud.py +40 -29
  38. sky/clouds/nebius.py +297 -0
  39. sky/clouds/oci.py +129 -90
  40. sky/clouds/paperspace.py +22 -18
  41. sky/clouds/runpod.py +53 -34
  42. sky/clouds/scp.py +28 -24
  43. sky/clouds/service_catalog/__init__.py +19 -13
  44. sky/clouds/service_catalog/aws_catalog.py +29 -12
  45. sky/clouds/service_catalog/azure_catalog.py +33 -6
  46. sky/clouds/service_catalog/common.py +95 -75
  47. sky/clouds/service_catalog/constants.py +3 -3
  48. sky/clouds/service_catalog/cudo_catalog.py +13 -3
  49. sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
  50. sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
  51. sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
  52. sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
  53. sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
  54. sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
  55. sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
  56. sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
  57. sky/clouds/service_catalog/do_catalog.py +111 -0
  58. sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
  59. sky/clouds/service_catalog/gcp_catalog.py +16 -2
  60. sky/clouds/service_catalog/ibm_catalog.py +2 -2
  61. sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
  62. sky/clouds/service_catalog/lambda_catalog.py +8 -3
  63. sky/clouds/service_catalog/nebius_catalog.py +116 -0
  64. sky/clouds/service_catalog/oci_catalog.py +31 -4
  65. sky/clouds/service_catalog/paperspace_catalog.py +2 -2
  66. sky/clouds/service_catalog/runpod_catalog.py +2 -2
  67. sky/clouds/service_catalog/scp_catalog.py +2 -2
  68. sky/clouds/service_catalog/vast_catalog.py +104 -0
  69. sky/clouds/service_catalog/vsphere_catalog.py +2 -2
  70. sky/clouds/utils/aws_utils.py +65 -0
  71. sky/clouds/utils/azure_utils.py +91 -0
  72. sky/clouds/utils/gcp_utils.py +5 -9
  73. sky/clouds/utils/oci_utils.py +47 -5
  74. sky/clouds/utils/scp_utils.py +4 -3
  75. sky/clouds/vast.py +280 -0
  76. sky/clouds/vsphere.py +22 -18
  77. sky/core.py +361 -107
  78. sky/dag.py +41 -28
  79. sky/data/data_transfer.py +37 -0
  80. sky/data/data_utils.py +211 -32
  81. sky/data/mounting_utils.py +182 -30
  82. sky/data/storage.py +2118 -270
  83. sky/data/storage_utils.py +126 -5
  84. sky/exceptions.py +179 -8
  85. sky/execution.py +158 -85
  86. sky/global_user_state.py +150 -34
  87. sky/jobs/__init__.py +12 -10
  88. sky/jobs/client/__init__.py +0 -0
  89. sky/jobs/client/sdk.py +302 -0
  90. sky/jobs/constants.py +49 -11
  91. sky/jobs/controller.py +161 -99
  92. sky/jobs/dashboard/dashboard.py +171 -25
  93. sky/jobs/dashboard/templates/index.html +572 -60
  94. sky/jobs/recovery_strategy.py +157 -156
  95. sky/jobs/scheduler.py +307 -0
  96. sky/jobs/server/__init__.py +1 -0
  97. sky/jobs/server/core.py +598 -0
  98. sky/jobs/server/dashboard_utils.py +69 -0
  99. sky/jobs/server/server.py +190 -0
  100. sky/jobs/state.py +627 -122
  101. sky/jobs/utils.py +615 -206
  102. sky/models.py +27 -0
  103. sky/optimizer.py +142 -83
  104. sky/provision/__init__.py +20 -5
  105. sky/provision/aws/config.py +124 -42
  106. sky/provision/aws/instance.py +130 -53
  107. sky/provision/azure/__init__.py +7 -0
  108. sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
  109. sky/provision/azure/config.py +220 -0
  110. sky/provision/azure/instance.py +1012 -37
  111. sky/provision/common.py +31 -3
  112. sky/provision/constants.py +25 -0
  113. sky/provision/cudo/__init__.py +2 -1
  114. sky/provision/cudo/cudo_utils.py +112 -0
  115. sky/provision/cudo/cudo_wrapper.py +37 -16
  116. sky/provision/cudo/instance.py +28 -12
  117. sky/provision/do/__init__.py +11 -0
  118. sky/provision/do/config.py +14 -0
  119. sky/provision/do/constants.py +10 -0
  120. sky/provision/do/instance.py +287 -0
  121. sky/provision/do/utils.py +301 -0
  122. sky/provision/docker_utils.py +82 -46
  123. sky/provision/fluidstack/fluidstack_utils.py +57 -125
  124. sky/provision/fluidstack/instance.py +15 -43
  125. sky/provision/gcp/config.py +19 -9
  126. sky/provision/gcp/constants.py +7 -1
  127. sky/provision/gcp/instance.py +55 -34
  128. sky/provision/gcp/instance_utils.py +339 -80
  129. sky/provision/gcp/mig_utils.py +210 -0
  130. sky/provision/instance_setup.py +172 -133
  131. sky/provision/kubernetes/__init__.py +1 -0
  132. sky/provision/kubernetes/config.py +104 -90
  133. sky/provision/kubernetes/constants.py +8 -0
  134. sky/provision/kubernetes/instance.py +680 -325
  135. sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
  136. sky/provision/kubernetes/network.py +54 -20
  137. sky/provision/kubernetes/network_utils.py +70 -21
  138. sky/provision/kubernetes/utils.py +1370 -251
  139. sky/provision/lambda_cloud/__init__.py +11 -0
  140. sky/provision/lambda_cloud/config.py +10 -0
  141. sky/provision/lambda_cloud/instance.py +265 -0
  142. sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
  143. sky/provision/logging.py +1 -1
  144. sky/provision/nebius/__init__.py +11 -0
  145. sky/provision/nebius/config.py +11 -0
  146. sky/provision/nebius/instance.py +285 -0
  147. sky/provision/nebius/utils.py +318 -0
  148. sky/provision/oci/__init__.py +15 -0
  149. sky/provision/oci/config.py +51 -0
  150. sky/provision/oci/instance.py +436 -0
  151. sky/provision/oci/query_utils.py +681 -0
  152. sky/provision/paperspace/constants.py +6 -0
  153. sky/provision/paperspace/instance.py +4 -3
  154. sky/provision/paperspace/utils.py +2 -0
  155. sky/provision/provisioner.py +207 -130
  156. sky/provision/runpod/__init__.py +1 -0
  157. sky/provision/runpod/api/__init__.py +3 -0
  158. sky/provision/runpod/api/commands.py +119 -0
  159. sky/provision/runpod/api/pods.py +142 -0
  160. sky/provision/runpod/instance.py +64 -8
  161. sky/provision/runpod/utils.py +239 -23
  162. sky/provision/vast/__init__.py +10 -0
  163. sky/provision/vast/config.py +11 -0
  164. sky/provision/vast/instance.py +247 -0
  165. sky/provision/vast/utils.py +162 -0
  166. sky/provision/vsphere/common/vim_utils.py +1 -1
  167. sky/provision/vsphere/instance.py +8 -18
  168. sky/provision/vsphere/vsphere_utils.py +1 -1
  169. sky/resources.py +247 -102
  170. sky/serve/__init__.py +9 -9
  171. sky/serve/autoscalers.py +361 -299
  172. sky/serve/client/__init__.py +0 -0
  173. sky/serve/client/sdk.py +366 -0
  174. sky/serve/constants.py +12 -3
  175. sky/serve/controller.py +106 -36
  176. sky/serve/load_balancer.py +63 -12
  177. sky/serve/load_balancing_policies.py +84 -2
  178. sky/serve/replica_managers.py +42 -34
  179. sky/serve/serve_state.py +62 -32
  180. sky/serve/serve_utils.py +271 -160
  181. sky/serve/server/__init__.py +0 -0
  182. sky/serve/{core.py → server/core.py} +271 -90
  183. sky/serve/server/server.py +112 -0
  184. sky/serve/service.py +52 -16
  185. sky/serve/service_spec.py +95 -32
  186. sky/server/__init__.py +1 -0
  187. sky/server/common.py +430 -0
  188. sky/server/constants.py +21 -0
  189. sky/server/html/log.html +174 -0
  190. sky/server/requests/__init__.py +0 -0
  191. sky/server/requests/executor.py +472 -0
  192. sky/server/requests/payloads.py +487 -0
  193. sky/server/requests/queues/__init__.py +0 -0
  194. sky/server/requests/queues/mp_queue.py +76 -0
  195. sky/server/requests/requests.py +567 -0
  196. sky/server/requests/serializers/__init__.py +0 -0
  197. sky/server/requests/serializers/decoders.py +192 -0
  198. sky/server/requests/serializers/encoders.py +166 -0
  199. sky/server/server.py +1106 -0
  200. sky/server/stream_utils.py +141 -0
  201. sky/setup_files/MANIFEST.in +2 -5
  202. sky/setup_files/dependencies.py +159 -0
  203. sky/setup_files/setup.py +14 -125
  204. sky/sky_logging.py +59 -14
  205. sky/skylet/autostop_lib.py +2 -2
  206. sky/skylet/constants.py +183 -50
  207. sky/skylet/events.py +22 -10
  208. sky/skylet/job_lib.py +403 -258
  209. sky/skylet/log_lib.py +111 -71
  210. sky/skylet/log_lib.pyi +6 -0
  211. sky/skylet/providers/command_runner.py +6 -8
  212. sky/skylet/providers/ibm/node_provider.py +2 -2
  213. sky/skylet/providers/scp/config.py +11 -3
  214. sky/skylet/providers/scp/node_provider.py +8 -8
  215. sky/skylet/skylet.py +3 -1
  216. sky/skylet/subprocess_daemon.py +69 -17
  217. sky/skypilot_config.py +119 -57
  218. sky/task.py +205 -64
  219. sky/templates/aws-ray.yml.j2 +37 -7
  220. sky/templates/azure-ray.yml.j2 +27 -82
  221. sky/templates/cudo-ray.yml.j2 +7 -3
  222. sky/templates/do-ray.yml.j2 +98 -0
  223. sky/templates/fluidstack-ray.yml.j2 +7 -4
  224. sky/templates/gcp-ray.yml.j2 +26 -6
  225. sky/templates/ibm-ray.yml.j2 +3 -2
  226. sky/templates/jobs-controller.yaml.j2 +46 -11
  227. sky/templates/kubernetes-ingress.yml.j2 +7 -0
  228. sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
  229. sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
  230. sky/templates/kubernetes-ray.yml.j2 +292 -25
  231. sky/templates/lambda-ray.yml.j2 +30 -40
  232. sky/templates/nebius-ray.yml.j2 +79 -0
  233. sky/templates/oci-ray.yml.j2 +18 -57
  234. sky/templates/paperspace-ray.yml.j2 +10 -6
  235. sky/templates/runpod-ray.yml.j2 +26 -4
  236. sky/templates/scp-ray.yml.j2 +3 -2
  237. sky/templates/sky-serve-controller.yaml.j2 +12 -1
  238. sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
  239. sky/templates/vast-ray.yml.j2 +70 -0
  240. sky/templates/vsphere-ray.yml.j2 +8 -3
  241. sky/templates/websocket_proxy.py +64 -0
  242. sky/usage/constants.py +10 -1
  243. sky/usage/usage_lib.py +130 -37
  244. sky/utils/accelerator_registry.py +35 -51
  245. sky/utils/admin_policy_utils.py +147 -0
  246. sky/utils/annotations.py +51 -0
  247. sky/utils/cli_utils/status_utils.py +81 -23
  248. sky/utils/cluster_utils.py +356 -0
  249. sky/utils/command_runner.py +452 -89
  250. sky/utils/command_runner.pyi +77 -3
  251. sky/utils/common.py +54 -0
  252. sky/utils/common_utils.py +319 -108
  253. sky/utils/config_utils.py +204 -0
  254. sky/utils/control_master_utils.py +48 -0
  255. sky/utils/controller_utils.py +548 -266
  256. sky/utils/dag_utils.py +93 -32
  257. sky/utils/db_utils.py +18 -4
  258. sky/utils/env_options.py +29 -7
  259. sky/utils/kubernetes/create_cluster.sh +8 -60
  260. sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
  261. sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
  262. sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
  263. sky/utils/kubernetes/gpu_labeler.py +4 -4
  264. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
  265. sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
  266. sky/utils/kubernetes/rsync_helper.sh +24 -0
  267. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
  268. sky/utils/log_utils.py +240 -33
  269. sky/utils/message_utils.py +81 -0
  270. sky/utils/registry.py +127 -0
  271. sky/utils/resources_utils.py +94 -22
  272. sky/utils/rich_utils.py +247 -18
  273. sky/utils/schemas.py +284 -64
  274. sky/{status_lib.py → utils/status_lib.py} +12 -7
  275. sky/utils/subprocess_utils.py +212 -46
  276. sky/utils/timeline.py +12 -7
  277. sky/utils/ux_utils.py +168 -15
  278. skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
  279. skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
  280. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
  281. sky/clouds/cloud_registry.py +0 -31
  282. sky/jobs/core.py +0 -330
  283. sky/skylet/providers/azure/__init__.py +0 -2
  284. sky/skylet/providers/azure/azure-vm-template.json +0 -301
  285. sky/skylet/providers/azure/config.py +0 -170
  286. sky/skylet/providers/azure/node_provider.py +0 -466
  287. sky/skylet/providers/lambda_cloud/__init__.py +0 -2
  288. sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
  289. sky/skylet/providers/oci/__init__.py +0 -2
  290. sky/skylet/providers/oci/node_provider.py +0 -488
  291. sky/skylet/providers/oci/query_helper.py +0 -383
  292. sky/skylet/providers/oci/utils.py +0 -21
  293. sky/utils/cluster_yaml_utils.py +0 -24
  294. sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
  295. skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
  296. skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
  297. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
  298. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
  299. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
sky/clouds/azure.py CHANGED
@@ -1,24 +1,24 @@
1
1
  """Azure."""
2
- import base64
3
- import functools
4
- import json
5
2
  import os
6
3
  import re
7
4
  import subprocess
8
5
  import textwrap
9
6
  import typing
10
- from typing import Dict, Iterator, List, Optional, Tuple
7
+ from typing import Any, Dict, Iterator, List, Optional, Tuple, Union
11
8
 
12
9
  import colorama
10
+ from packaging import version as pversion
13
11
 
14
12
  from sky import clouds
15
13
  from sky import exceptions
16
14
  from sky import sky_logging
17
- from sky import status_lib
15
+ from sky import skypilot_config
18
16
  from sky.adaptors import azure
19
17
  from sky.clouds import service_catalog
20
- from sky.skylet import log_lib
18
+ from sky.clouds.utils import azure_utils
19
+ from sky.utils import annotations
21
20
  from sky.utils import common_utils
21
+ from sky.utils import registry
22
22
  from sky.utils import resources_utils
23
23
  from sky.utils import ux_utils
24
24
 
@@ -39,6 +39,17 @@ _MAX_IDENTITY_FETCH_RETRY = 10
39
39
 
40
40
  _DEFAULT_AZURE_UBUNTU_HPC_IMAGE_GB = 30
41
41
  _DEFAULT_AZURE_UBUNTU_2004_IMAGE_GB = 150
42
+ _DEFAULT_SKYPILOT_IMAGE_GB = 30
43
+
44
+ _DEFAULT_CPU_IMAGE_ID = 'skypilot:custom-cpu-ubuntu-v2'
45
+ _DEFAULT_GPU_IMAGE_ID = 'skypilot:custom-gpu-ubuntu-v2'
46
+ _DEFAULT_V1_IMAGE_ID = 'skypilot:custom-gpu-ubuntu-v1'
47
+ _DEFAULT_GPU_K80_IMAGE_ID = 'skypilot:k80-ubuntu-2004'
48
+ _FALLBACK_IMAGE_ID = 'skypilot:gpu-ubuntu-2204'
49
+ # This is used by Azure GPU VMs that use grid drivers (e.g. A10).
50
+ _DEFAULT_GPU_GRID_IMAGE_ID = 'skypilot:custom-gpu-ubuntu-v2-grid'
51
+
52
+ _COMMUNITY_IMAGE_PREFIX = '/CommunityGalleries'
42
53
 
43
54
 
44
55
  def _run_output(cmd):
@@ -50,7 +61,7 @@ def _run_output(cmd):
50
61
  return proc.stdout.decode('ascii')
51
62
 
52
63
 
53
- @clouds.CLOUD_REGISTRY.register
64
+ @registry.CLOUD_REGISTRY.register
54
65
  class Azure(clouds.Cloud):
55
66
  """Azure."""
56
67
 
@@ -61,15 +72,16 @@ class Azure(clouds.Cloud):
61
72
  # names, so the limit is 64 - 4 - 7 - 10 = 43.
62
73
  # Reference: https://azure.github.io/PSRule.Rules.Azure/en/rules/Azure.ResourceGroup.Name/ # pylint: disable=line-too-long
63
74
  _MAX_CLUSTER_NAME_LEN_LIMIT = 42
64
- _BEST_DISK_TIER = resources_utils.DiskTier.MEDIUM
75
+ _BEST_DISK_TIER = resources_utils.DiskTier.HIGH
65
76
  _DEFAULT_DISK_TIER = resources_utils.DiskTier.MEDIUM
66
- # Azure does not support high disk tier.
77
+ # Azure does not support high disk and ultra disk tier.
67
78
  _SUPPORTED_DISK_TIERS = (set(resources_utils.DiskTier) -
68
- {resources_utils.DiskTier.HIGH})
79
+ {resources_utils.DiskTier.ULTRA})
69
80
 
70
81
  _INDENT_PREFIX = ' ' * 4
71
82
 
72
- PROVISIONER_VERSION = clouds.ProvisionerVersion.RAY_AUTOSCALER
83
+ PROVISIONER_VERSION = clouds.ProvisionerVersion.SKYPILOT
84
+ STATUS_VERSION = clouds.StatusVersion.SKYPILOT
73
85
 
74
86
  @classmethod
75
87
  def _unsupported_features_for_resources(
@@ -134,29 +146,72 @@ class Azure(clouds.Cloud):
134
146
  cost += 0.0
135
147
  return cost
136
148
 
149
+ @classmethod
150
+ def get_default_instance_type(
151
+ cls,
152
+ cpus: Optional[str] = None,
153
+ memory: Optional[str] = None,
154
+ disk_tier: Optional[resources_utils.DiskTier] = None
155
+ ) -> Optional[str]:
156
+ return service_catalog.get_default_instance_type(cpus=cpus,
157
+ memory=memory,
158
+ disk_tier=disk_tier,
159
+ clouds='azure')
160
+
137
161
  @classmethod
138
162
  def get_image_size(cls, image_id: str, region: Optional[str]) -> float:
139
- if region is None:
140
- # The region used here is only for where to send the query,
141
- # not the image location. Azure's image is globally available.
142
- region = 'eastus'
143
- is_skypilot_image_tag = False
163
+ # Process skypilot images.
144
164
  if image_id.startswith('skypilot:'):
145
- is_skypilot_image_tag = True
146
165
  image_id = service_catalog.get_image_id_from_tag(image_id,
147
166
  clouds='azure')
148
- image_id_splitted = image_id.split(':')
149
- if len(image_id_splitted) != 4:
150
- with ux_utils.print_exception_no_traceback():
151
- raise ValueError(f'Invalid image id: {image_id}. Expected '
152
- 'format: <publisher>:<offer>:<sku>:<version>')
153
- publisher, offer, sku, version = image_id_splitted
154
- if is_skypilot_image_tag:
155
- if offer == 'ubuntu-hpc':
156
- return _DEFAULT_AZURE_UBUNTU_HPC_IMAGE_GB
167
+ if image_id.startswith(_COMMUNITY_IMAGE_PREFIX):
168
+ # Avoid querying the image size from Azure as
169
+ # all skypilot custom images have the same size.
170
+ return _DEFAULT_SKYPILOT_IMAGE_GB
157
171
  else:
158
- return _DEFAULT_AZURE_UBUNTU_2004_IMAGE_GB
159
- compute_client = azure.get_client('compute', cls.get_project_id())
172
+ publisher, offer, sku, version = image_id.split(':')
173
+ if offer == 'ubuntu-hpc':
174
+ return _DEFAULT_AZURE_UBUNTU_HPC_IMAGE_GB
175
+ else:
176
+ return _DEFAULT_AZURE_UBUNTU_2004_IMAGE_GB
177
+
178
+ # Process user-specified images.
179
+ azure_utils.validate_image_id(image_id)
180
+ try:
181
+ compute_client = azure.get_client('compute', cls.get_project_id())
182
+ except (azure.exceptions().AzureError, RuntimeError):
183
+ # Fallback to default image size if no credentials are available.
184
+ return 0.0
185
+
186
+ # Community gallery image.
187
+ if image_id.startswith(_COMMUNITY_IMAGE_PREFIX):
188
+ if region is None:
189
+ return 0.0
190
+ _, _, gallery_name, _, image_name = image_id.split('/')
191
+ try:
192
+ return azure_utils.get_community_image_size(
193
+ compute_client, gallery_name, image_name, region)
194
+ except exceptions.ResourcesUnavailableError:
195
+ return 0.0
196
+
197
+ # Marketplace image
198
+ if region is None:
199
+ # The region used here is only for where to send the query,
200
+ # not the image location. Marketplace image is globally available.
201
+ region = 'eastus'
202
+ publisher, offer, sku, version = image_id.split(':')
203
+ # Since the Azure SDK requires explicitly specifying the image version number,
204
+ # when the version is "latest," we need to manually query the current latest version.
205
+ # The image size is therefore queried for that pinned version, while the VM itself is created with "latest",
206
+ # so the image information may differ between the two; the probability of this occurring is very small.
207
+ if version == 'latest':
208
+ versions = compute_client.virtual_machine_images.list(
209
+ location=region,
210
+ publisher_name=publisher,
211
+ offer=offer,
212
+ skus=sku)
213
+ latest_version = max(versions, key=lambda x: pversion.parse(x.name))
214
+ version = latest_version.name
160
215
  try:
161
216
  image = compute_client.virtual_machine_images.get(
162
217
  region, publisher, offer, sku, version)
@@ -178,40 +233,25 @@ class Azure(clouds.Cloud):
178
233
  size_in_gb = size_in_bytes / (1024**3)
179
234
  return size_in_gb
180
235
 
181
- @classmethod
182
- def get_default_instance_type(
183
- cls,
184
- cpus: Optional[str] = None,
185
- memory: Optional[str] = None,
186
- disk_tier: Optional[resources_utils.DiskTier] = None
187
- ) -> Optional[str]:
188
- return service_catalog.get_default_instance_type(cpus=cpus,
189
- memory=memory,
190
- disk_tier=disk_tier,
191
- clouds='azure')
192
-
193
236
  def _get_default_image_tag(self, gen_version, instance_type) -> str:
194
237
  # ubuntu-2004 v21.08.30, K80 requires image with old NVIDIA driver version
195
238
  acc = self.get_accelerators_from_instance_type(instance_type)
196
239
  if acc is not None:
197
240
  acc_name = list(acc.keys())[0]
198
241
  if acc_name == 'K80':
199
- return 'skypilot:k80-ubuntu-2004'
200
-
201
- # ubuntu-2004 v21.11.04, the previous image we used in the past for
202
- # V1 HyperV instance before we change default image to ubuntu-hpc.
242
+ return _DEFAULT_GPU_K80_IMAGE_ID
243
+ if acc_name == 'A10':
244
+ return _DEFAULT_GPU_GRID_IMAGE_ID
245
+ # About Gen V1 vs V2:
203
246
  # In Azure, all instances with K80 (Standard_NC series), some
204
247
  # instances with M60 (Standard_NV series) and some cpu instances
205
- # (Basic_A, Standard_D, ...) are V1 instance. For these instances,
206
- # we use the previous image.
248
+ # (Basic_A, Standard_D, ...) are V1 instances.
249
+ # All A100 instances are V2.
207
250
  if gen_version == 'V1':
208
- return 'skypilot:v1-ubuntu-2004'
209
-
210
- # nvidia-driver: 535.54.03, cuda: 12.2
211
- # see: https://github.com/Azure/azhpc-images/releases/tag/ubuntu-hpc-20230803
212
- # All A100 instances is of gen2, so it will always use
213
- # the latest ubuntu-hpc:2204 image.
214
- return 'skypilot:gpu-ubuntu-2204'
251
+ return _DEFAULT_V1_IMAGE_ID
252
+ if acc is None:
253
+ return _DEFAULT_CPU_IMAGE_ID
254
+ return _DEFAULT_GPU_IMAGE_ID
215
255
 
216
256
  @classmethod
217
257
  def regions_with_offering(cls, instance_type: str,
@@ -254,7 +294,7 @@ class Azure(clouds.Cloud):
254
294
  def get_accelerators_from_instance_type(
255
295
  cls,
256
296
  instance_type: str,
257
- ) -> Optional[Dict[str, int]]:
297
+ ) -> Optional[Dict[str, Union[int, float]]]:
258
298
  return service_catalog.get_accelerators_from_instance_type(
259
299
  instance_type, clouds='azure')
260
300
 
@@ -273,10 +313,11 @@ class Azure(clouds.Cloud):
273
313
  def make_deploy_resources_variables(
274
314
  self,
275
315
  resources: 'resources.Resources',
276
- cluster_name_on_cloud: str,
316
+ cluster_name: resources_utils.ClusterName,
277
317
  region: 'clouds.Region',
278
318
  zones: Optional[List['clouds.Zone']],
279
- dryrun: bool = False) -> Dict[str, Optional[str]]:
319
+ num_nodes: int,
320
+ dryrun: bool = False) -> Dict[str, Any]:
280
321
  assert zones is None, ('Azure does not support zones', zones)
281
322
 
282
323
  region_name = region.name
@@ -286,10 +327,9 @@ class Azure(clouds.Cloud):
286
327
  acc_dict = self.get_accelerators_from_instance_type(r.instance_type)
287
328
  acc_count = None
288
329
  if acc_dict is not None:
289
- custom_resources = json.dumps(acc_dict, separators=(',', ':'))
290
330
  acc_count = str(sum(acc_dict.values()))
291
- else:
292
- custom_resources = None
331
+ custom_resources = resources_utils.make_ray_custom_resources_str(
332
+ acc_dict)
293
333
 
294
334
  if (resources.image_id is None or
295
335
  resources.extract_docker_image() is not None):
@@ -304,17 +344,41 @@ class Azure(clouds.Cloud):
304
344
  else:
305
345
  assert region_name in resources.image_id, resources.image_id
306
346
  image_id = resources.image_id[region_name]
347
+
348
+ # Checked basic image syntax in resources.py
307
349
  if image_id.startswith('skypilot:'):
308
350
  image_id = service_catalog.get_image_id_from_tag(image_id,
309
351
  clouds='azure')
310
- # Already checked in resources.py
311
- publisher, offer, sku, version = image_id.split(':')
312
- image_config = {
313
- 'image_publisher': publisher,
314
- 'image_offer': offer,
315
- 'image_sku': sku,
316
- 'image_version': version,
317
- }
352
+ # Fallback if image does not exist in the specified region.
353
+ # Putting fallback here instead of at image validation
354
+ # when creating the resource because community images are
355
+ # regional so we need the correct region when we check whether
356
+ # the image exists.
357
+ if image_id.startswith(
358
+ _COMMUNITY_IMAGE_PREFIX
359
+ ) and region_name not in azure_catalog.COMMUNITY_IMAGE_AVAILABLE_REGIONS:
360
+ logger.info(f'Azure image {image_id} does not exist in region '
361
+ f'{region_name} so use the fallback image instead.')
362
+ image_id = service_catalog.get_image_id_from_tag(
363
+ _FALLBACK_IMAGE_ID, clouds='azure')
364
+
365
+ if image_id.startswith(_COMMUNITY_IMAGE_PREFIX):
366
+ image_config = {'community_gallery_image_id': image_id}
367
+ else:
368
+ publisher, offer, sku, version = image_id.split(':')
369
+ image_config = {
370
+ 'image_publisher': publisher,
371
+ 'image_offer': offer,
372
+ 'image_sku': sku,
373
+ 'image_version': version,
374
+ }
375
+
376
+ # Determine resource group for deploying the instance.
377
+ resource_group_name = skypilot_config.get_nested(
378
+ ('azure', 'resource_group_vm'), None)
379
+ use_external_resource_group = resource_group_name is not None
380
+ if resource_group_name is None:
381
+ resource_group_name = f'{cluster_name.name_on_cloud}-{region_name}'
318
382
 
319
383
  # Setup commands to eliminate the banner and restart sshd.
320
384
  # This script will modify /etc/ssh/sshd_config and add a bash script
@@ -322,13 +386,11 @@ class Azure(clouds.Cloud):
322
386
  # restarted, identified by a file /tmp/__restarted is existing.
323
387
  # Also, add default user to docker group.
324
388
  # pylint: disable=line-too-long
325
- cloud_init_setup_commands = base64.b64encode(
326
- textwrap.dedent("""\
389
+ cloud_init_setup_commands = textwrap.dedent("""\
327
390
  #cloud-config
328
391
  runcmd:
329
392
  - sed -i 's/#Banner none/Banner none/' /etc/ssh/sshd_config
330
393
  - echo '\\nif [ ! -f "/tmp/__restarted" ]; then\\n sudo systemctl restart ssh\\n sleep 2\\n touch /tmp/__restarted\\nfi' >> /home/skypilot:ssh_user/.bashrc
331
- - usermod -aG docker skypilot:ssh_user
332
394
  write_files:
333
395
  - path: /etc/apt/apt.conf.d/20auto-upgrades
334
396
  content: |
@@ -339,7 +401,7 @@ class Azure(clouds.Cloud):
339
401
  - path: /etc/apt/apt.conf.d/10cloudinit-disable
340
402
  content: |
341
403
  APT::Periodic::Enable "0";
342
- """).encode('utf-8')).decode('utf-8')
404
+ """).split('\n')
343
405
 
344
406
  def _failover_disk_tier() -> Optional[resources_utils.DiskTier]:
345
407
  if (r.disk_tier is not None and
@@ -359,7 +421,9 @@ class Azure(clouds.Cloud):
359
421
  start_index += 1
360
422
  assert False, 'Low disk tier should always be supported on Azure.'
361
423
 
362
- return {
424
+ disk_tier = _failover_disk_tier()
425
+
426
+ resources_vars = {
363
427
  'instance_type': r.instance_type,
364
428
  'custom_resources': custom_resources,
365
429
  'num_gpus': acc_count,
@@ -368,25 +432,33 @@ class Azure(clouds.Cloud):
368
432
  # Azure does not support specific zones.
369
433
  'zones': None,
370
434
  **image_config,
371
- 'disk_tier': Azure._get_disk_type(_failover_disk_tier()),
435
+ 'disk_tier': Azure._get_disk_type(disk_tier),
372
436
  'cloud_init_setup_commands': cloud_init_setup_commands,
373
437
  'azure_subscription_id': self.get_project_id(dryrun),
374
- 'resource_group': f'{cluster_name_on_cloud}-{region_name}',
438
+ 'resource_group': resource_group_name,
439
+ 'use_external_resource_group': use_external_resource_group,
375
440
  }
376
441
 
442
+ # Setting disk performance tier for high disk tier.
443
+ if disk_tier == resources_utils.DiskTier.HIGH:
444
+ resources_vars['disk_performance_tier'] = 'P50'
445
+ return resources_vars
446
+
377
447
  def _get_feasible_launchable_resources(
378
448
  self, resources: 'resources.Resources'
379
- ) -> Tuple[List['resources.Resources'], List[str]]:
449
+ ) -> 'resources_utils.FeasibleResources':
380
450
  if resources.instance_type is not None:
381
451
  assert resources.is_launchable(), resources
382
452
  ok, _ = Azure.check_disk_tier(resources.instance_type,
383
453
  resources.disk_tier)
384
454
  if not ok:
385
- return ([], [])
455
+ # TODO: Add hints to all return values in this method to help
456
+ # users understand why the resources are not launchable.
457
+ return resources_utils.FeasibleResources([], [], None)
386
458
  # Treat Resources(Azure, Standard_NC4as_T4_v3, T4) as
387
459
  # Resources(Azure, Standard_NC4as_T4_v3).
388
460
  resources = resources.copy(accelerators=None)
389
- return ([resources], [])
461
+ return resources_utils.FeasibleResources([resources], [], None)
390
462
 
391
463
  def _make(instance_list):
392
464
  resource_list = []
@@ -416,9 +488,10 @@ class Azure(clouds.Cloud):
416
488
  memory=resources.memory,
417
489
  disk_tier=resources.disk_tier)
418
490
  if default_instance_type is None:
419
- return ([], [])
491
+ return resources_utils.FeasibleResources([], [], None)
420
492
  else:
421
- return (_make([default_instance_type]), [])
493
+ return resources_utils.FeasibleResources(
494
+ _make([default_instance_type]), [], None)
422
495
 
423
496
  assert len(accelerators) == 1, resources
424
497
  acc, acc_count = list(accelerators.items())[0]
@@ -433,8 +506,10 @@ class Azure(clouds.Cloud):
433
506
  zone=resources.zone,
434
507
  clouds='azure')
435
508
  if instance_list is None:
436
- return ([], fuzzy_candidate_list)
437
- return (_make(instance_list), fuzzy_candidate_list)
509
+ return resources_utils.FeasibleResources([], fuzzy_candidate_list,
510
+ None)
511
+ return resources_utils.FeasibleResources(_make(instance_list),
512
+ fuzzy_candidate_list, None)
438
513
 
439
514
  @classmethod
440
515
  def check_credentials(cls) -> Tuple[bool, Optional[str]]:
@@ -468,11 +543,24 @@ class Azure(clouds.Cloud):
468
543
  # If Azure is properly logged in, this will return the account email
469
544
  # address + subscription ID.
470
545
  try:
471
- cls.get_current_user_identity()
546
+ cls.get_active_user_identity()
472
547
  except exceptions.CloudUserIdentityError as e:
473
548
  return False, (f'Getting user\'s Azure identity failed.{help_str}\n'
474
549
  f'{cls._INDENT_PREFIX}Details: '
475
550
  f'{common_utils.format_exception(e)}')
551
+
552
+ # Check if the azure blob storage dependencies are installed.
553
+ try:
554
+ # pylint: disable=redefined-outer-name, import-outside-toplevel, unused-import
555
+ from azure.storage import blob
556
+ import msgraph
557
+ except ImportError as e:
558
+ return False, (
559
+ f'Azure blob storage depdencies are not installed. '
560
+ 'Run the following commands:'
561
+ f'\n{cls._INDENT_PREFIX} $ pip install skypilot[azure]'
562
+ f'\n{cls._INDENT_PREFIX}Details: '
563
+ f'{common_utils.format_exception(e)}')
476
564
  return True, None
477
565
 
478
566
  def get_credential_file_mounts(self) -> Dict[str, str]:
@@ -487,8 +575,9 @@ class Azure(clouds.Cloud):
487
575
  clouds='azure')
488
576
 
489
577
  @classmethod
490
- @functools.lru_cache(maxsize=1) # Cache since getting identity is slow.
491
- def get_current_user_identity(cls) -> Optional[List[str]]:
578
+ @annotations.lru_cache(scope='global',
579
+ maxsize=1) # Cache since getting identity is slow.
580
+ def get_user_identities(cls) -> Optional[List[List[str]]]:
492
581
  """Returns the cloud user identity."""
493
582
  # This returns the user's email address + [subscription_id].
494
583
  retry_cnt = 0
@@ -530,11 +619,13 @@ class Azure(clouds.Cloud):
530
619
  with ux_utils.print_exception_no_traceback():
531
620
  raise exceptions.CloudUserIdentityError(
532
621
  'Failed to get Azure project ID.') from e
533
- return [f'{account_email} [subscription_id={project_id}]']
622
+ # TODO: Return a list of identities in the profile when we support
623
+ # automatic switching for Az. Currently we only support one identity.
624
+ return [[f'{account_email} [subscription_id={project_id}]']]
534
625
 
535
626
  @classmethod
536
- def get_current_user_identity_str(cls) -> Optional[str]:
537
- user_identity = cls.get_current_user_identity()
627
+ def get_active_user_identity_str(cls) -> Optional[str]:
628
+ user_identity = cls.get_active_user_identity()
538
629
  if user_identity is None:
539
630
  return None
540
631
  return user_identity[0]
@@ -579,9 +670,10 @@ class Azure(clouds.Cloud):
579
670
  disk_tier: Optional[resources_utils.DiskTier]) -> Tuple[bool, str]:
580
671
  if disk_tier is None or disk_tier == resources_utils.DiskTier.BEST:
581
672
  return True, ''
582
- if disk_tier == resources_utils.DiskTier.HIGH:
583
- return False, ('Azure disk_tier=high is not supported now. '
584
- 'Please use disk_tier={low, medium} instead.')
673
+ if disk_tier == resources_utils.DiskTier.ULTRA:
674
+ return False, (
675
+ 'Azure disk_tier=ultra is not supported now. '
676
+ 'Please use disk_tier={low, medium, high, best} instead.')
585
677
  # Only S-series supported premium ssd
586
678
  # see https://stackoverflow.com/questions/48590520/azure-requested-operation-cannot-be-performed-because-storage-account-type-pre # pylint: disable=line-too-long
587
679
  if cls._get_disk_type(
@@ -589,7 +681,7 @@ class Azure(clouds.Cloud):
589
681
  ) == 'Premium_LRS' and not Azure._is_s_series(instance_type):
590
682
  return False, (
591
683
  'Azure premium SSDs are only supported for S-series '
592
- 'instances. To use disk_tier=medium, please make sure '
684
+ 'instances. To use disk_tier>=medium, please make sure '
593
685
  'instance_type is specified to an S-series instance.')
594
686
  return True, ''
595
687
 
@@ -608,95 +700,9 @@ class Azure(clouds.Cloud):
608
700
  # TODO(tian): Maybe use PremiumV2_LRS/UltraSSD_LRS? Notice these two
609
701
  # cannot be used as OS disks so we might need data disk support
610
702
  tier2name = {
611
- resources_utils.DiskTier.HIGH: 'Disabled',
703
+ resources_utils.DiskTier.ULTRA: 'Disabled',
704
+ resources_utils.DiskTier.HIGH: 'Premium_LRS',
612
705
  resources_utils.DiskTier.MEDIUM: 'Premium_LRS',
613
706
  resources_utils.DiskTier.LOW: 'Standard_LRS',
614
707
  }
615
708
  return tier2name[tier]
616
-
617
- @classmethod
618
- def query_status(cls, name: str, tag_filters: Dict[str, str],
619
- region: Optional[str], zone: Optional[str],
620
- **kwargs) -> List[status_lib.ClusterStatus]:
621
- del zone # unused
622
- status_map = {
623
- 'VM starting': status_lib.ClusterStatus.INIT,
624
- 'VM running': status_lib.ClusterStatus.UP,
625
- # 'VM stopped' in Azure means Stopped (Allocated), which still bills
626
- # for the VM.
627
- 'VM stopping': status_lib.ClusterStatus.INIT,
628
- 'VM stopped': status_lib.ClusterStatus.INIT,
629
- # 'VM deallocated' in Azure means Stopped (Deallocated), which does not
630
- # bill for the VM.
631
- 'VM deallocating': status_lib.ClusterStatus.STOPPED,
632
- 'VM deallocated': status_lib.ClusterStatus.STOPPED,
633
- }
634
- tag_filter_str = ' '.join(
635
- f'tags.\\"{k}\\"==\'{v}\'' for k, v in tag_filters.items())
636
-
637
- query_node_id = (f'az vm list --query "[?{tag_filter_str}].id" -o json')
638
- returncode, stdout, stderr = log_lib.run_with_log(query_node_id,
639
- '/dev/null',
640
- require_outputs=True,
641
- shell=True)
642
- logger.debug(f'{query_node_id} returned {returncode}.\n'
643
- '**** STDOUT ****\n'
644
- f'{stdout}\n'
645
- '**** STDERR ****\n'
646
- f'{stderr}')
647
- if returncode == 0:
648
- if not stdout.strip():
649
- return []
650
- node_ids = json.loads(stdout.strip())
651
- if not node_ids:
652
- return []
653
- state_str = '[].powerState'
654
- if len(node_ids) == 1:
655
- state_str = 'powerState'
656
- node_ids_str = '\t'.join(node_ids)
657
- query_cmd = (
658
- f'az vm show -d --ids {node_ids_str} --query "{state_str}" -o json'
659
- )
660
- returncode, stdout, stderr = log_lib.run_with_log(
661
- query_cmd, '/dev/null', require_outputs=True, shell=True)
662
- logger.debug(f'{query_cmd} returned {returncode}.\n'
663
- '**** STDOUT ****\n'
664
- f'{stdout}\n'
665
- '**** STDERR ****\n'
666
- f'{stderr}')
667
-
668
- # NOTE: Azure cli should be handled carefully. The query command above
669
- # takes about 1 second to run.
670
- # An alternative is the following command, but it will take more than
671
- # 20 seconds to run.
672
- # query_cmd = (
673
- # f'az vm list --show-details --query "['
674
- # f'?tags.\\"ray-cluster-name\\" == \'{handle.cluster_name}\' '
675
- # '&& tags.\\"ray-node-type\\" == \'head\'].powerState" -o tsv'
676
- # )
677
-
678
- if returncode != 0:
679
- with ux_utils.print_exception_no_traceback():
680
- raise exceptions.ClusterStatusFetchingError(
681
- f'Failed to query Azure cluster {name!r} status: '
682
- f'{stdout + stderr}')
683
-
684
- assert stdout.strip(), f'No status returned for {name!r}'
685
-
686
- original_statuses_list = json.loads(stdout.strip())
687
- if not original_statuses_list:
688
- # No nodes found. The original_statuses_list will be empty string.
689
- # Return empty list.
690
- return []
691
- if not isinstance(original_statuses_list, list):
692
- original_statuses_list = [original_statuses_list]
693
- statuses = []
694
- for s in original_statuses_list:
695
- if s not in status_map:
696
- with ux_utils.print_exception_no_traceback():
697
- raise exceptions.ClusterStatusFetchingError(
698
- f'Failed to parse status from Azure response: {stdout}')
699
- node_status = status_map[s]
700
- if node_status is not None:
701
- statuses.append(node_status)
702
- return statuses