skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +64 -32
- sky/adaptors/aws.py +23 -6
- sky/adaptors/azure.py +432 -15
- sky/adaptors/cloudflare.py +5 -5
- sky/adaptors/common.py +19 -9
- sky/adaptors/do.py +20 -0
- sky/adaptors/gcp.py +3 -2
- sky/adaptors/kubernetes.py +122 -88
- sky/adaptors/nebius.py +100 -0
- sky/adaptors/oci.py +39 -1
- sky/adaptors/vast.py +29 -0
- sky/admin_policy.py +101 -0
- sky/authentication.py +117 -98
- sky/backends/backend.py +52 -20
- sky/backends/backend_utils.py +669 -557
- sky/backends/cloud_vm_ray_backend.py +1099 -808
- sky/backends/local_docker_backend.py +14 -8
- sky/backends/wheel_utils.py +38 -20
- sky/benchmark/benchmark_utils.py +22 -23
- sky/check.py +76 -27
- sky/cli.py +1586 -1139
- sky/client/__init__.py +1 -0
- sky/client/cli.py +5683 -0
- sky/client/common.py +345 -0
- sky/client/sdk.py +1765 -0
- sky/cloud_stores.py +283 -19
- sky/clouds/__init__.py +7 -2
- sky/clouds/aws.py +303 -112
- sky/clouds/azure.py +185 -179
- sky/clouds/cloud.py +115 -37
- sky/clouds/cudo.py +29 -22
- sky/clouds/do.py +313 -0
- sky/clouds/fluidstack.py +44 -54
- sky/clouds/gcp.py +206 -65
- sky/clouds/ibm.py +26 -21
- sky/clouds/kubernetes.py +345 -91
- sky/clouds/lambda_cloud.py +40 -29
- sky/clouds/nebius.py +297 -0
- sky/clouds/oci.py +129 -90
- sky/clouds/paperspace.py +22 -18
- sky/clouds/runpod.py +53 -34
- sky/clouds/scp.py +28 -24
- sky/clouds/service_catalog/__init__.py +19 -13
- sky/clouds/service_catalog/aws_catalog.py +29 -12
- sky/clouds/service_catalog/azure_catalog.py +33 -6
- sky/clouds/service_catalog/common.py +95 -75
- sky/clouds/service_catalog/constants.py +3 -3
- sky/clouds/service_catalog/cudo_catalog.py +13 -3
- sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
- sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
- sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
- sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
- sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
- sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
- sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
- sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
- sky/clouds/service_catalog/do_catalog.py +111 -0
- sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
- sky/clouds/service_catalog/gcp_catalog.py +16 -2
- sky/clouds/service_catalog/ibm_catalog.py +2 -2
- sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
- sky/clouds/service_catalog/lambda_catalog.py +8 -3
- sky/clouds/service_catalog/nebius_catalog.py +116 -0
- sky/clouds/service_catalog/oci_catalog.py +31 -4
- sky/clouds/service_catalog/paperspace_catalog.py +2 -2
- sky/clouds/service_catalog/runpod_catalog.py +2 -2
- sky/clouds/service_catalog/scp_catalog.py +2 -2
- sky/clouds/service_catalog/vast_catalog.py +104 -0
- sky/clouds/service_catalog/vsphere_catalog.py +2 -2
- sky/clouds/utils/aws_utils.py +65 -0
- sky/clouds/utils/azure_utils.py +91 -0
- sky/clouds/utils/gcp_utils.py +5 -9
- sky/clouds/utils/oci_utils.py +47 -5
- sky/clouds/utils/scp_utils.py +4 -3
- sky/clouds/vast.py +280 -0
- sky/clouds/vsphere.py +22 -18
- sky/core.py +361 -107
- sky/dag.py +41 -28
- sky/data/data_transfer.py +37 -0
- sky/data/data_utils.py +211 -32
- sky/data/mounting_utils.py +182 -30
- sky/data/storage.py +2118 -270
- sky/data/storage_utils.py +126 -5
- sky/exceptions.py +179 -8
- sky/execution.py +158 -85
- sky/global_user_state.py +150 -34
- sky/jobs/__init__.py +12 -10
- sky/jobs/client/__init__.py +0 -0
- sky/jobs/client/sdk.py +302 -0
- sky/jobs/constants.py +49 -11
- sky/jobs/controller.py +161 -99
- sky/jobs/dashboard/dashboard.py +171 -25
- sky/jobs/dashboard/templates/index.html +572 -60
- sky/jobs/recovery_strategy.py +157 -156
- sky/jobs/scheduler.py +307 -0
- sky/jobs/server/__init__.py +1 -0
- sky/jobs/server/core.py +598 -0
- sky/jobs/server/dashboard_utils.py +69 -0
- sky/jobs/server/server.py +190 -0
- sky/jobs/state.py +627 -122
- sky/jobs/utils.py +615 -206
- sky/models.py +27 -0
- sky/optimizer.py +142 -83
- sky/provision/__init__.py +20 -5
- sky/provision/aws/config.py +124 -42
- sky/provision/aws/instance.py +130 -53
- sky/provision/azure/__init__.py +7 -0
- sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
- sky/provision/azure/config.py +220 -0
- sky/provision/azure/instance.py +1012 -37
- sky/provision/common.py +31 -3
- sky/provision/constants.py +25 -0
- sky/provision/cudo/__init__.py +2 -1
- sky/provision/cudo/cudo_utils.py +112 -0
- sky/provision/cudo/cudo_wrapper.py +37 -16
- sky/provision/cudo/instance.py +28 -12
- sky/provision/do/__init__.py +11 -0
- sky/provision/do/config.py +14 -0
- sky/provision/do/constants.py +10 -0
- sky/provision/do/instance.py +287 -0
- sky/provision/do/utils.py +301 -0
- sky/provision/docker_utils.py +82 -46
- sky/provision/fluidstack/fluidstack_utils.py +57 -125
- sky/provision/fluidstack/instance.py +15 -43
- sky/provision/gcp/config.py +19 -9
- sky/provision/gcp/constants.py +7 -1
- sky/provision/gcp/instance.py +55 -34
- sky/provision/gcp/instance_utils.py +339 -80
- sky/provision/gcp/mig_utils.py +210 -0
- sky/provision/instance_setup.py +172 -133
- sky/provision/kubernetes/__init__.py +1 -0
- sky/provision/kubernetes/config.py +104 -90
- sky/provision/kubernetes/constants.py +8 -0
- sky/provision/kubernetes/instance.py +680 -325
- sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
- sky/provision/kubernetes/network.py +54 -20
- sky/provision/kubernetes/network_utils.py +70 -21
- sky/provision/kubernetes/utils.py +1370 -251
- sky/provision/lambda_cloud/__init__.py +11 -0
- sky/provision/lambda_cloud/config.py +10 -0
- sky/provision/lambda_cloud/instance.py +265 -0
- sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
- sky/provision/logging.py +1 -1
- sky/provision/nebius/__init__.py +11 -0
- sky/provision/nebius/config.py +11 -0
- sky/provision/nebius/instance.py +285 -0
- sky/provision/nebius/utils.py +318 -0
- sky/provision/oci/__init__.py +15 -0
- sky/provision/oci/config.py +51 -0
- sky/provision/oci/instance.py +436 -0
- sky/provision/oci/query_utils.py +681 -0
- sky/provision/paperspace/constants.py +6 -0
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/paperspace/utils.py +2 -0
- sky/provision/provisioner.py +207 -130
- sky/provision/runpod/__init__.py +1 -0
- sky/provision/runpod/api/__init__.py +3 -0
- sky/provision/runpod/api/commands.py +119 -0
- sky/provision/runpod/api/pods.py +142 -0
- sky/provision/runpod/instance.py +64 -8
- sky/provision/runpod/utils.py +239 -23
- sky/provision/vast/__init__.py +10 -0
- sky/provision/vast/config.py +11 -0
- sky/provision/vast/instance.py +247 -0
- sky/provision/vast/utils.py +162 -0
- sky/provision/vsphere/common/vim_utils.py +1 -1
- sky/provision/vsphere/instance.py +8 -18
- sky/provision/vsphere/vsphere_utils.py +1 -1
- sky/resources.py +247 -102
- sky/serve/__init__.py +9 -9
- sky/serve/autoscalers.py +361 -299
- sky/serve/client/__init__.py +0 -0
- sky/serve/client/sdk.py +366 -0
- sky/serve/constants.py +12 -3
- sky/serve/controller.py +106 -36
- sky/serve/load_balancer.py +63 -12
- sky/serve/load_balancing_policies.py +84 -2
- sky/serve/replica_managers.py +42 -34
- sky/serve/serve_state.py +62 -32
- sky/serve/serve_utils.py +271 -160
- sky/serve/server/__init__.py +0 -0
- sky/serve/{core.py → server/core.py} +271 -90
- sky/serve/server/server.py +112 -0
- sky/serve/service.py +52 -16
- sky/serve/service_spec.py +95 -32
- sky/server/__init__.py +1 -0
- sky/server/common.py +430 -0
- sky/server/constants.py +21 -0
- sky/server/html/log.html +174 -0
- sky/server/requests/__init__.py +0 -0
- sky/server/requests/executor.py +472 -0
- sky/server/requests/payloads.py +487 -0
- sky/server/requests/queues/__init__.py +0 -0
- sky/server/requests/queues/mp_queue.py +76 -0
- sky/server/requests/requests.py +567 -0
- sky/server/requests/serializers/__init__.py +0 -0
- sky/server/requests/serializers/decoders.py +192 -0
- sky/server/requests/serializers/encoders.py +166 -0
- sky/server/server.py +1106 -0
- sky/server/stream_utils.py +141 -0
- sky/setup_files/MANIFEST.in +2 -5
- sky/setup_files/dependencies.py +159 -0
- sky/setup_files/setup.py +14 -125
- sky/sky_logging.py +59 -14
- sky/skylet/autostop_lib.py +2 -2
- sky/skylet/constants.py +183 -50
- sky/skylet/events.py +22 -10
- sky/skylet/job_lib.py +403 -258
- sky/skylet/log_lib.py +111 -71
- sky/skylet/log_lib.pyi +6 -0
- sky/skylet/providers/command_runner.py +6 -8
- sky/skylet/providers/ibm/node_provider.py +2 -2
- sky/skylet/providers/scp/config.py +11 -3
- sky/skylet/providers/scp/node_provider.py +8 -8
- sky/skylet/skylet.py +3 -1
- sky/skylet/subprocess_daemon.py +69 -17
- sky/skypilot_config.py +119 -57
- sky/task.py +205 -64
- sky/templates/aws-ray.yml.j2 +37 -7
- sky/templates/azure-ray.yml.j2 +27 -82
- sky/templates/cudo-ray.yml.j2 +7 -3
- sky/templates/do-ray.yml.j2 +98 -0
- sky/templates/fluidstack-ray.yml.j2 +7 -4
- sky/templates/gcp-ray.yml.j2 +26 -6
- sky/templates/ibm-ray.yml.j2 +3 -2
- sky/templates/jobs-controller.yaml.j2 +46 -11
- sky/templates/kubernetes-ingress.yml.j2 +7 -0
- sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
- sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
- sky/templates/kubernetes-ray.yml.j2 +292 -25
- sky/templates/lambda-ray.yml.j2 +30 -40
- sky/templates/nebius-ray.yml.j2 +79 -0
- sky/templates/oci-ray.yml.j2 +18 -57
- sky/templates/paperspace-ray.yml.j2 +10 -6
- sky/templates/runpod-ray.yml.j2 +26 -4
- sky/templates/scp-ray.yml.j2 +3 -2
- sky/templates/sky-serve-controller.yaml.j2 +12 -1
- sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
- sky/templates/vast-ray.yml.j2 +70 -0
- sky/templates/vsphere-ray.yml.j2 +8 -3
- sky/templates/websocket_proxy.py +64 -0
- sky/usage/constants.py +10 -1
- sky/usage/usage_lib.py +130 -37
- sky/utils/accelerator_registry.py +35 -51
- sky/utils/admin_policy_utils.py +147 -0
- sky/utils/annotations.py +51 -0
- sky/utils/cli_utils/status_utils.py +81 -23
- sky/utils/cluster_utils.py +356 -0
- sky/utils/command_runner.py +452 -89
- sky/utils/command_runner.pyi +77 -3
- sky/utils/common.py +54 -0
- sky/utils/common_utils.py +319 -108
- sky/utils/config_utils.py +204 -0
- sky/utils/control_master_utils.py +48 -0
- sky/utils/controller_utils.py +548 -266
- sky/utils/dag_utils.py +93 -32
- sky/utils/db_utils.py +18 -4
- sky/utils/env_options.py +29 -7
- sky/utils/kubernetes/create_cluster.sh +8 -60
- sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
- sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
- sky/utils/kubernetes/gpu_labeler.py +4 -4
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
- sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
- sky/utils/kubernetes/rsync_helper.sh +24 -0
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
- sky/utils/log_utils.py +240 -33
- sky/utils/message_utils.py +81 -0
- sky/utils/registry.py +127 -0
- sky/utils/resources_utils.py +94 -22
- sky/utils/rich_utils.py +247 -18
- sky/utils/schemas.py +284 -64
- sky/{status_lib.py → utils/status_lib.py} +12 -7
- sky/utils/subprocess_utils.py +212 -46
- sky/utils/timeline.py +12 -7
- sky/utils/ux_utils.py +168 -15
- skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
- skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
- sky/clouds/cloud_registry.py +0 -31
- sky/jobs/core.py +0 -330
- sky/skylet/providers/azure/__init__.py +0 -2
- sky/skylet/providers/azure/azure-vm-template.json +0 -301
- sky/skylet/providers/azure/config.py +0 -170
- sky/skylet/providers/azure/node_provider.py +0 -466
- sky/skylet/providers/lambda_cloud/__init__.py +0 -2
- sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
- sky/skylet/providers/oci/__init__.py +0 -2
- sky/skylet/providers/oci/node_provider.py +0 -488
- sky/skylet/providers/oci/query_helper.py +0 -383
- sky/skylet/providers/oci/utils.py +0 -21
- sky/utils/cluster_yaml_utils.py +0 -24
- sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
- skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
- skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
sky/clouds/azure.py
CHANGED
@@ -1,24 +1,24 @@
|
|
1
1
|
"""Azure."""
|
2
|
-
import base64
|
3
|
-
import functools
|
4
|
-
import json
|
5
2
|
import os
|
6
3
|
import re
|
7
4
|
import subprocess
|
8
5
|
import textwrap
|
9
6
|
import typing
|
10
|
-
from typing import Dict, Iterator, List, Optional, Tuple
|
7
|
+
from typing import Any, Dict, Iterator, List, Optional, Tuple, Union
|
11
8
|
|
12
9
|
import colorama
|
10
|
+
from packaging import version as pversion
|
13
11
|
|
14
12
|
from sky import clouds
|
15
13
|
from sky import exceptions
|
16
14
|
from sky import sky_logging
|
17
|
-
from sky import
|
15
|
+
from sky import skypilot_config
|
18
16
|
from sky.adaptors import azure
|
19
17
|
from sky.clouds import service_catalog
|
20
|
-
from sky.
|
18
|
+
from sky.clouds.utils import azure_utils
|
19
|
+
from sky.utils import annotations
|
21
20
|
from sky.utils import common_utils
|
21
|
+
from sky.utils import registry
|
22
22
|
from sky.utils import resources_utils
|
23
23
|
from sky.utils import ux_utils
|
24
24
|
|
@@ -39,6 +39,17 @@ _MAX_IDENTITY_FETCH_RETRY = 10
|
|
39
39
|
|
40
40
|
_DEFAULT_AZURE_UBUNTU_HPC_IMAGE_GB = 30
|
41
41
|
_DEFAULT_AZURE_UBUNTU_2004_IMAGE_GB = 150
|
42
|
+
_DEFAULT_SKYPILOT_IMAGE_GB = 30
|
43
|
+
|
44
|
+
_DEFAULT_CPU_IMAGE_ID = 'skypilot:custom-cpu-ubuntu-v2'
|
45
|
+
_DEFAULT_GPU_IMAGE_ID = 'skypilot:custom-gpu-ubuntu-v2'
|
46
|
+
_DEFAULT_V1_IMAGE_ID = 'skypilot:custom-gpu-ubuntu-v1'
|
47
|
+
_DEFAULT_GPU_K80_IMAGE_ID = 'skypilot:k80-ubuntu-2004'
|
48
|
+
_FALLBACK_IMAGE_ID = 'skypilot:gpu-ubuntu-2204'
|
49
|
+
# This is used by Azure GPU VMs that use grid drivers (e.g. A10).
|
50
|
+
_DEFAULT_GPU_GRID_IMAGE_ID = 'skypilot:custom-gpu-ubuntu-v2-grid'
|
51
|
+
|
52
|
+
_COMMUNITY_IMAGE_PREFIX = '/CommunityGalleries'
|
42
53
|
|
43
54
|
|
44
55
|
def _run_output(cmd):
|
@@ -50,7 +61,7 @@ def _run_output(cmd):
|
|
50
61
|
return proc.stdout.decode('ascii')
|
51
62
|
|
52
63
|
|
53
|
-
@
|
64
|
+
@registry.CLOUD_REGISTRY.register
|
54
65
|
class Azure(clouds.Cloud):
|
55
66
|
"""Azure."""
|
56
67
|
|
@@ -61,15 +72,16 @@ class Azure(clouds.Cloud):
|
|
61
72
|
# names, so the limit is 64 - 4 - 7 - 10 = 43.
|
62
73
|
# Reference: https://azure.github.io/PSRule.Rules.Azure/en/rules/Azure.ResourceGroup.Name/ # pylint: disable=line-too-long
|
63
74
|
_MAX_CLUSTER_NAME_LEN_LIMIT = 42
|
64
|
-
_BEST_DISK_TIER = resources_utils.DiskTier.
|
75
|
+
_BEST_DISK_TIER = resources_utils.DiskTier.HIGH
|
65
76
|
_DEFAULT_DISK_TIER = resources_utils.DiskTier.MEDIUM
|
66
|
-
# Azure does not support high disk tier.
|
77
|
+
# Azure does not support high disk and ultra disk tier.
|
67
78
|
_SUPPORTED_DISK_TIERS = (set(resources_utils.DiskTier) -
|
68
|
-
{resources_utils.DiskTier.
|
79
|
+
{resources_utils.DiskTier.ULTRA})
|
69
80
|
|
70
81
|
_INDENT_PREFIX = ' ' * 4
|
71
82
|
|
72
|
-
PROVISIONER_VERSION = clouds.ProvisionerVersion.
|
83
|
+
PROVISIONER_VERSION = clouds.ProvisionerVersion.SKYPILOT
|
84
|
+
STATUS_VERSION = clouds.StatusVersion.SKYPILOT
|
73
85
|
|
74
86
|
@classmethod
|
75
87
|
def _unsupported_features_for_resources(
|
@@ -134,29 +146,72 @@ class Azure(clouds.Cloud):
|
|
134
146
|
cost += 0.0
|
135
147
|
return cost
|
136
148
|
|
149
|
+
@classmethod
|
150
|
+
def get_default_instance_type(
|
151
|
+
cls,
|
152
|
+
cpus: Optional[str] = None,
|
153
|
+
memory: Optional[str] = None,
|
154
|
+
disk_tier: Optional[resources_utils.DiskTier] = None
|
155
|
+
) -> Optional[str]:
|
156
|
+
return service_catalog.get_default_instance_type(cpus=cpus,
|
157
|
+
memory=memory,
|
158
|
+
disk_tier=disk_tier,
|
159
|
+
clouds='azure')
|
160
|
+
|
137
161
|
@classmethod
|
138
162
|
def get_image_size(cls, image_id: str, region: Optional[str]) -> float:
|
139
|
-
|
140
|
-
# The region used here is only for where to send the query,
|
141
|
-
# not the image location. Azure's image is globally available.
|
142
|
-
region = 'eastus'
|
143
|
-
is_skypilot_image_tag = False
|
163
|
+
# Process skypilot images.
|
144
164
|
if image_id.startswith('skypilot:'):
|
145
|
-
is_skypilot_image_tag = True
|
146
165
|
image_id = service_catalog.get_image_id_from_tag(image_id,
|
147
166
|
clouds='azure')
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
'format: <publisher>:<offer>:<sku>:<version>')
|
153
|
-
publisher, offer, sku, version = image_id_splitted
|
154
|
-
if is_skypilot_image_tag:
|
155
|
-
if offer == 'ubuntu-hpc':
|
156
|
-
return _DEFAULT_AZURE_UBUNTU_HPC_IMAGE_GB
|
167
|
+
if image_id.startswith(_COMMUNITY_IMAGE_PREFIX):
|
168
|
+
# Avoid querying the image size from Azure as
|
169
|
+
# all skypilot custom images have the same size.
|
170
|
+
return _DEFAULT_SKYPILOT_IMAGE_GB
|
157
171
|
else:
|
158
|
-
|
159
|
-
|
172
|
+
publisher, offer, sku, version = image_id.split(':')
|
173
|
+
if offer == 'ubuntu-hpc':
|
174
|
+
return _DEFAULT_AZURE_UBUNTU_HPC_IMAGE_GB
|
175
|
+
else:
|
176
|
+
return _DEFAULT_AZURE_UBUNTU_2004_IMAGE_GB
|
177
|
+
|
178
|
+
# Process user-specified images.
|
179
|
+
azure_utils.validate_image_id(image_id)
|
180
|
+
try:
|
181
|
+
compute_client = azure.get_client('compute', cls.get_project_id())
|
182
|
+
except (azure.exceptions().AzureError, RuntimeError):
|
183
|
+
# Fallback to default image size if no credentials are available.
|
184
|
+
return 0.0
|
185
|
+
|
186
|
+
# Community gallery image.
|
187
|
+
if image_id.startswith(_COMMUNITY_IMAGE_PREFIX):
|
188
|
+
if region is None:
|
189
|
+
return 0.0
|
190
|
+
_, _, gallery_name, _, image_name = image_id.split('/')
|
191
|
+
try:
|
192
|
+
return azure_utils.get_community_image_size(
|
193
|
+
compute_client, gallery_name, image_name, region)
|
194
|
+
except exceptions.ResourcesUnavailableError:
|
195
|
+
return 0.0
|
196
|
+
|
197
|
+
# Marketplace image
|
198
|
+
if region is None:
|
199
|
+
# The region used here is only for where to send the query,
|
200
|
+
# not the image location. Marketplace image is globally available.
|
201
|
+
region = 'eastus'
|
202
|
+
publisher, offer, sku, version = image_id.split(':')
|
203
|
+
# Since the Azure SDK requires explicitly specifying the image version number,
|
204
|
+
# when the version is "latest," we need to manually query the current latest version.
|
205
|
+
# By querying the image size through a precise image version, while directly using the latest image version when creating a VM,
|
206
|
+
# there might be a difference in image information, and the probability of this occurring is very small.
|
207
|
+
if version == 'latest':
|
208
|
+
versions = compute_client.virtual_machine_images.list(
|
209
|
+
location=region,
|
210
|
+
publisher_name=publisher,
|
211
|
+
offer=offer,
|
212
|
+
skus=sku)
|
213
|
+
latest_version = max(versions, key=lambda x: pversion.parse(x.name))
|
214
|
+
version = latest_version.name
|
160
215
|
try:
|
161
216
|
image = compute_client.virtual_machine_images.get(
|
162
217
|
region, publisher, offer, sku, version)
|
@@ -178,40 +233,25 @@ class Azure(clouds.Cloud):
|
|
178
233
|
size_in_gb = size_in_bytes / (1024**3)
|
179
234
|
return size_in_gb
|
180
235
|
|
181
|
-
@classmethod
|
182
|
-
def get_default_instance_type(
|
183
|
-
cls,
|
184
|
-
cpus: Optional[str] = None,
|
185
|
-
memory: Optional[str] = None,
|
186
|
-
disk_tier: Optional[resources_utils.DiskTier] = None
|
187
|
-
) -> Optional[str]:
|
188
|
-
return service_catalog.get_default_instance_type(cpus=cpus,
|
189
|
-
memory=memory,
|
190
|
-
disk_tier=disk_tier,
|
191
|
-
clouds='azure')
|
192
|
-
|
193
236
|
def _get_default_image_tag(self, gen_version, instance_type) -> str:
|
194
237
|
# ubuntu-2004 v21.08.30, K80 requires image with old NVIDIA driver version
|
195
238
|
acc = self.get_accelerators_from_instance_type(instance_type)
|
196
239
|
if acc is not None:
|
197
240
|
acc_name = list(acc.keys())[0]
|
198
241
|
if acc_name == 'K80':
|
199
|
-
return
|
200
|
-
|
201
|
-
|
202
|
-
#
|
242
|
+
return _DEFAULT_GPU_K80_IMAGE_ID
|
243
|
+
if acc_name == 'A10':
|
244
|
+
return _DEFAULT_GPU_GRID_IMAGE_ID
|
245
|
+
# About Gen V1 vs V2:
|
203
246
|
# In Azure, all instances with K80 (Standard_NC series), some
|
204
247
|
# instances with M60 (Standard_NV series) and some cpu instances
|
205
|
-
# (Basic_A, Standard_D, ...) are V1 instance.
|
206
|
-
#
|
248
|
+
# (Basic_A, Standard_D, ...) are V1 instance.
|
249
|
+
# All A100 instances are V2.
|
207
250
|
if gen_version == 'V1':
|
208
|
-
return
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
# All A100 instances is of gen2, so it will always use
|
213
|
-
# the latest ubuntu-hpc:2204 image.
|
214
|
-
return 'skypilot:gpu-ubuntu-2204'
|
251
|
+
return _DEFAULT_V1_IMAGE_ID
|
252
|
+
if acc is None:
|
253
|
+
return _DEFAULT_CPU_IMAGE_ID
|
254
|
+
return _DEFAULT_GPU_IMAGE_ID
|
215
255
|
|
216
256
|
@classmethod
|
217
257
|
def regions_with_offering(cls, instance_type: str,
|
@@ -254,7 +294,7 @@ class Azure(clouds.Cloud):
|
|
254
294
|
def get_accelerators_from_instance_type(
|
255
295
|
cls,
|
256
296
|
instance_type: str,
|
257
|
-
) -> Optional[Dict[str, int]]:
|
297
|
+
) -> Optional[Dict[str, Union[int, float]]]:
|
258
298
|
return service_catalog.get_accelerators_from_instance_type(
|
259
299
|
instance_type, clouds='azure')
|
260
300
|
|
@@ -273,10 +313,11 @@ class Azure(clouds.Cloud):
|
|
273
313
|
def make_deploy_resources_variables(
|
274
314
|
self,
|
275
315
|
resources: 'resources.Resources',
|
276
|
-
|
316
|
+
cluster_name: resources_utils.ClusterName,
|
277
317
|
region: 'clouds.Region',
|
278
318
|
zones: Optional[List['clouds.Zone']],
|
279
|
-
|
319
|
+
num_nodes: int,
|
320
|
+
dryrun: bool = False) -> Dict[str, Any]:
|
280
321
|
assert zones is None, ('Azure does not support zones', zones)
|
281
322
|
|
282
323
|
region_name = region.name
|
@@ -286,10 +327,9 @@ class Azure(clouds.Cloud):
|
|
286
327
|
acc_dict = self.get_accelerators_from_instance_type(r.instance_type)
|
287
328
|
acc_count = None
|
288
329
|
if acc_dict is not None:
|
289
|
-
custom_resources = json.dumps(acc_dict, separators=(',', ':'))
|
290
330
|
acc_count = str(sum(acc_dict.values()))
|
291
|
-
|
292
|
-
|
331
|
+
custom_resources = resources_utils.make_ray_custom_resources_str(
|
332
|
+
acc_dict)
|
293
333
|
|
294
334
|
if (resources.image_id is None or
|
295
335
|
resources.extract_docker_image() is not None):
|
@@ -304,17 +344,41 @@ class Azure(clouds.Cloud):
|
|
304
344
|
else:
|
305
345
|
assert region_name in resources.image_id, resources.image_id
|
306
346
|
image_id = resources.image_id[region_name]
|
347
|
+
|
348
|
+
# Checked basic image syntax in resources.py
|
307
349
|
if image_id.startswith('skypilot:'):
|
308
350
|
image_id = service_catalog.get_image_id_from_tag(image_id,
|
309
351
|
clouds='azure')
|
310
|
-
|
311
|
-
|
312
|
-
|
313
|
-
|
314
|
-
|
315
|
-
|
316
|
-
|
317
|
-
|
352
|
+
# Fallback if image does not exist in the specified region.
|
353
|
+
# Putting fallback here instead of at image validation
|
354
|
+
# when creating the resource because community images are
|
355
|
+
# regional so we need the correct region when we check whether
|
356
|
+
# the image exists.
|
357
|
+
if image_id.startswith(
|
358
|
+
_COMMUNITY_IMAGE_PREFIX
|
359
|
+
) and region_name not in azure_catalog.COMMUNITY_IMAGE_AVAILABLE_REGIONS:
|
360
|
+
logger.info(f'Azure image {image_id} does not exist in region '
|
361
|
+
f'{region_name} so use the fallback image instead.')
|
362
|
+
image_id = service_catalog.get_image_id_from_tag(
|
363
|
+
_FALLBACK_IMAGE_ID, clouds='azure')
|
364
|
+
|
365
|
+
if image_id.startswith(_COMMUNITY_IMAGE_PREFIX):
|
366
|
+
image_config = {'community_gallery_image_id': image_id}
|
367
|
+
else:
|
368
|
+
publisher, offer, sku, version = image_id.split(':')
|
369
|
+
image_config = {
|
370
|
+
'image_publisher': publisher,
|
371
|
+
'image_offer': offer,
|
372
|
+
'image_sku': sku,
|
373
|
+
'image_version': version,
|
374
|
+
}
|
375
|
+
|
376
|
+
# Determine resource group for deploying the instance.
|
377
|
+
resource_group_name = skypilot_config.get_nested(
|
378
|
+
('azure', 'resource_group_vm'), None)
|
379
|
+
use_external_resource_group = resource_group_name is not None
|
380
|
+
if resource_group_name is None:
|
381
|
+
resource_group_name = f'{cluster_name.name_on_cloud}-{region_name}'
|
318
382
|
|
319
383
|
# Setup commands to eliminate the banner and restart sshd.
|
320
384
|
# This script will modify /etc/ssh/sshd_config and add a bash script
|
@@ -322,13 +386,11 @@ class Azure(clouds.Cloud):
|
|
322
386
|
# restarted, identified by a file /tmp/__restarted is existing.
|
323
387
|
# Also, add default user to docker group.
|
324
388
|
# pylint: disable=line-too-long
|
325
|
-
cloud_init_setup_commands =
|
326
|
-
textwrap.dedent("""\
|
389
|
+
cloud_init_setup_commands = textwrap.dedent("""\
|
327
390
|
#cloud-config
|
328
391
|
runcmd:
|
329
392
|
- sed -i 's/#Banner none/Banner none/' /etc/ssh/sshd_config
|
330
393
|
- echo '\\nif [ ! -f "/tmp/__restarted" ]; then\\n sudo systemctl restart ssh\\n sleep 2\\n touch /tmp/__restarted\\nfi' >> /home/skypilot:ssh_user/.bashrc
|
331
|
-
- usermod -aG docker skypilot:ssh_user
|
332
394
|
write_files:
|
333
395
|
- path: /etc/apt/apt.conf.d/20auto-upgrades
|
334
396
|
content: |
|
@@ -339,7 +401,7 @@ class Azure(clouds.Cloud):
|
|
339
401
|
- path: /etc/apt/apt.conf.d/10cloudinit-disable
|
340
402
|
content: |
|
341
403
|
APT::Periodic::Enable "0";
|
342
|
-
""").
|
404
|
+
""").split('\n')
|
343
405
|
|
344
406
|
def _failover_disk_tier() -> Optional[resources_utils.DiskTier]:
|
345
407
|
if (r.disk_tier is not None and
|
@@ -359,7 +421,9 @@ class Azure(clouds.Cloud):
|
|
359
421
|
start_index += 1
|
360
422
|
assert False, 'Low disk tier should always be supported on Azure.'
|
361
423
|
|
362
|
-
|
424
|
+
disk_tier = _failover_disk_tier()
|
425
|
+
|
426
|
+
resources_vars = {
|
363
427
|
'instance_type': r.instance_type,
|
364
428
|
'custom_resources': custom_resources,
|
365
429
|
'num_gpus': acc_count,
|
@@ -368,25 +432,33 @@ class Azure(clouds.Cloud):
|
|
368
432
|
# Azure does not support specific zones.
|
369
433
|
'zones': None,
|
370
434
|
**image_config,
|
371
|
-
'disk_tier': Azure._get_disk_type(
|
435
|
+
'disk_tier': Azure._get_disk_type(disk_tier),
|
372
436
|
'cloud_init_setup_commands': cloud_init_setup_commands,
|
373
437
|
'azure_subscription_id': self.get_project_id(dryrun),
|
374
|
-
'resource_group':
|
438
|
+
'resource_group': resource_group_name,
|
439
|
+
'use_external_resource_group': use_external_resource_group,
|
375
440
|
}
|
376
441
|
|
442
|
+
# Setting disk performance tier for high disk tier.
|
443
|
+
if disk_tier == resources_utils.DiskTier.HIGH:
|
444
|
+
resources_vars['disk_performance_tier'] = 'P50'
|
445
|
+
return resources_vars
|
446
|
+
|
377
447
|
def _get_feasible_launchable_resources(
|
378
448
|
self, resources: 'resources.Resources'
|
379
|
-
) ->
|
449
|
+
) -> 'resources_utils.FeasibleResources':
|
380
450
|
if resources.instance_type is not None:
|
381
451
|
assert resources.is_launchable(), resources
|
382
452
|
ok, _ = Azure.check_disk_tier(resources.instance_type,
|
383
453
|
resources.disk_tier)
|
384
454
|
if not ok:
|
385
|
-
return
|
455
|
+
# TODO: Add hints to all return values in this method to help
|
456
|
+
# users understand why the resources are not launchable.
|
457
|
+
return resources_utils.FeasibleResources([], [], None)
|
386
458
|
# Treat Resources(Azure, Standard_NC4as_T4_v3, T4) as
|
387
459
|
# Resources(Azure, Standard_NC4as_T4_v3).
|
388
460
|
resources = resources.copy(accelerators=None)
|
389
|
-
return ([resources], [])
|
461
|
+
return resources_utils.FeasibleResources([resources], [], None)
|
390
462
|
|
391
463
|
def _make(instance_list):
|
392
464
|
resource_list = []
|
@@ -416,9 +488,10 @@ class Azure(clouds.Cloud):
|
|
416
488
|
memory=resources.memory,
|
417
489
|
disk_tier=resources.disk_tier)
|
418
490
|
if default_instance_type is None:
|
419
|
-
return ([], [])
|
491
|
+
return resources_utils.FeasibleResources([], [], None)
|
420
492
|
else:
|
421
|
-
return (
|
493
|
+
return resources_utils.FeasibleResources(
|
494
|
+
_make([default_instance_type]), [], None)
|
422
495
|
|
423
496
|
assert len(accelerators) == 1, resources
|
424
497
|
acc, acc_count = list(accelerators.items())[0]
|
@@ -433,8 +506,10 @@ class Azure(clouds.Cloud):
|
|
433
506
|
zone=resources.zone,
|
434
507
|
clouds='azure')
|
435
508
|
if instance_list is None:
|
436
|
-
return ([], fuzzy_candidate_list
|
437
|
-
|
509
|
+
return resources_utils.FeasibleResources([], fuzzy_candidate_list,
|
510
|
+
None)
|
511
|
+
return resources_utils.FeasibleResources(_make(instance_list),
|
512
|
+
fuzzy_candidate_list, None)
|
438
513
|
|
439
514
|
@classmethod
|
440
515
|
def check_credentials(cls) -> Tuple[bool, Optional[str]]:
|
@@ -468,11 +543,24 @@ class Azure(clouds.Cloud):
|
|
468
543
|
# If Azure is properly logged in, this will return the account email
|
469
544
|
# address + subscription ID.
|
470
545
|
try:
|
471
|
-
cls.
|
546
|
+
cls.get_active_user_identity()
|
472
547
|
except exceptions.CloudUserIdentityError as e:
|
473
548
|
return False, (f'Getting user\'s Azure identity failed.{help_str}\n'
|
474
549
|
f'{cls._INDENT_PREFIX}Details: '
|
475
550
|
f'{common_utils.format_exception(e)}')
|
551
|
+
|
552
|
+
# Check if the azure blob storage dependencies are installed.
|
553
|
+
try:
|
554
|
+
# pylint: disable=redefined-outer-name, import-outside-toplevel, unused-import
|
555
|
+
from azure.storage import blob
|
556
|
+
import msgraph
|
557
|
+
except ImportError as e:
|
558
|
+
return False, (
|
559
|
+
f'Azure blob storage depdencies are not installed. '
|
560
|
+
'Run the following commands:'
|
561
|
+
f'\n{cls._INDENT_PREFIX} $ pip install skypilot[azure]'
|
562
|
+
f'\n{cls._INDENT_PREFIX}Details: '
|
563
|
+
f'{common_utils.format_exception(e)}')
|
476
564
|
return True, None
|
477
565
|
|
478
566
|
def get_credential_file_mounts(self) -> Dict[str, str]:
|
@@ -487,8 +575,9 @@ class Azure(clouds.Cloud):
|
|
487
575
|
clouds='azure')
|
488
576
|
|
489
577
|
@classmethod
|
490
|
-
@
|
491
|
-
|
578
|
+
@annotations.lru_cache(scope='global',
|
579
|
+
maxsize=1) # Cache since getting identity is slow.
|
580
|
+
def get_user_identities(cls) -> Optional[List[List[str]]]:
|
492
581
|
"""Returns the cloud user identity."""
|
493
582
|
# This returns the user's email address + [subscription_id].
|
494
583
|
retry_cnt = 0
|
@@ -530,11 +619,13 @@ class Azure(clouds.Cloud):
|
|
530
619
|
with ux_utils.print_exception_no_traceback():
|
531
620
|
raise exceptions.CloudUserIdentityError(
|
532
621
|
'Failed to get Azure project ID.') from e
|
533
|
-
|
622
|
+
# TODO: Return a list of identities in the profile when we support
|
623
|
+
# automatic switching for Az. Currently we only support one identity.
|
624
|
+
return [[f'{account_email} [subscription_id={project_id}]']]
|
534
625
|
|
535
626
|
@classmethod
|
536
|
-
def
|
537
|
-
user_identity = cls.
|
627
|
+
def get_active_user_identity_str(cls) -> Optional[str]:
|
628
|
+
user_identity = cls.get_active_user_identity()
|
538
629
|
if user_identity is None:
|
539
630
|
return None
|
540
631
|
return user_identity[0]
|
@@ -579,9 +670,10 @@ class Azure(clouds.Cloud):
|
|
579
670
|
disk_tier: Optional[resources_utils.DiskTier]) -> Tuple[bool, str]:
|
580
671
|
if disk_tier is None or disk_tier == resources_utils.DiskTier.BEST:
|
581
672
|
return True, ''
|
582
|
-
if disk_tier == resources_utils.DiskTier.
|
583
|
-
return False, (
|
584
|
-
|
673
|
+
if disk_tier == resources_utils.DiskTier.ULTRA:
|
674
|
+
return False, (
|
675
|
+
'Azure disk_tier=ultra is not supported now. '
|
676
|
+
'Please use disk_tier={low, medium, high, best} instead.')
|
585
677
|
# Only S-series supported premium ssd
|
586
678
|
# see https://stackoverflow.com/questions/48590520/azure-requested-operation-cannot-be-performed-because-storage-account-type-pre # pylint: disable=line-too-long
|
587
679
|
if cls._get_disk_type(
|
@@ -589,7 +681,7 @@ class Azure(clouds.Cloud):
|
|
589
681
|
) == 'Premium_LRS' and not Azure._is_s_series(instance_type):
|
590
682
|
return False, (
|
591
683
|
'Azure premium SSDs are only supported for S-series '
|
592
|
-
'instances. To use disk_tier
|
684
|
+
'instances. To use disk_tier>=medium, please make sure '
|
593
685
|
'instance_type is specified to an S-series instance.')
|
594
686
|
return True, ''
|
595
687
|
|
@@ -608,95 +700,9 @@ class Azure(clouds.Cloud):
|
|
608
700
|
# TODO(tian): Maybe use PremiumV2_LRS/UltraSSD_LRS? Notice these two
|
609
701
|
# cannot be used as OS disks so we might need data disk support
|
610
702
|
tier2name = {
|
611
|
-
resources_utils.DiskTier.
|
703
|
+
resources_utils.DiskTier.ULTRA: 'Disabled',
|
704
|
+
resources_utils.DiskTier.HIGH: 'Premium_LRS',
|
612
705
|
resources_utils.DiskTier.MEDIUM: 'Premium_LRS',
|
613
706
|
resources_utils.DiskTier.LOW: 'Standard_LRS',
|
614
707
|
}
|
615
708
|
return tier2name[tier]
|
616
|
-
|
617
|
-
@classmethod
|
618
|
-
def query_status(cls, name: str, tag_filters: Dict[str, str],
|
619
|
-
region: Optional[str], zone: Optional[str],
|
620
|
-
**kwargs) -> List[status_lib.ClusterStatus]:
|
621
|
-
del zone # unused
|
622
|
-
status_map = {
|
623
|
-
'VM starting': status_lib.ClusterStatus.INIT,
|
624
|
-
'VM running': status_lib.ClusterStatus.UP,
|
625
|
-
# 'VM stopped' in Azure means Stopped (Allocated), which still bills
|
626
|
-
# for the VM.
|
627
|
-
'VM stopping': status_lib.ClusterStatus.INIT,
|
628
|
-
'VM stopped': status_lib.ClusterStatus.INIT,
|
629
|
-
# 'VM deallocated' in Azure means Stopped (Deallocated), which does not
|
630
|
-
# bill for the VM.
|
631
|
-
'VM deallocating': status_lib.ClusterStatus.STOPPED,
|
632
|
-
'VM deallocated': status_lib.ClusterStatus.STOPPED,
|
633
|
-
}
|
634
|
-
tag_filter_str = ' '.join(
|
635
|
-
f'tags.\\"{k}\\"==\'{v}\'' for k, v in tag_filters.items())
|
636
|
-
|
637
|
-
query_node_id = (f'az vm list --query "[?{tag_filter_str}].id" -o json')
|
638
|
-
returncode, stdout, stderr = log_lib.run_with_log(query_node_id,
|
639
|
-
'/dev/null',
|
640
|
-
require_outputs=True,
|
641
|
-
shell=True)
|
642
|
-
logger.debug(f'{query_node_id} returned {returncode}.\n'
|
643
|
-
'**** STDOUT ****\n'
|
644
|
-
f'{stdout}\n'
|
645
|
-
'**** STDERR ****\n'
|
646
|
-
f'{stderr}')
|
647
|
-
if returncode == 0:
|
648
|
-
if not stdout.strip():
|
649
|
-
return []
|
650
|
-
node_ids = json.loads(stdout.strip())
|
651
|
-
if not node_ids:
|
652
|
-
return []
|
653
|
-
state_str = '[].powerState'
|
654
|
-
if len(node_ids) == 1:
|
655
|
-
state_str = 'powerState'
|
656
|
-
node_ids_str = '\t'.join(node_ids)
|
657
|
-
query_cmd = (
|
658
|
-
f'az vm show -d --ids {node_ids_str} --query "{state_str}" -o json'
|
659
|
-
)
|
660
|
-
returncode, stdout, stderr = log_lib.run_with_log(
|
661
|
-
query_cmd, '/dev/null', require_outputs=True, shell=True)
|
662
|
-
logger.debug(f'{query_cmd} returned {returncode}.\n'
|
663
|
-
'**** STDOUT ****\n'
|
664
|
-
f'{stdout}\n'
|
665
|
-
'**** STDERR ****\n'
|
666
|
-
f'{stderr}')
|
667
|
-
|
668
|
-
# NOTE: Azure cli should be handled carefully. The query command above
|
669
|
-
# takes about 1 second to run.
|
670
|
-
# An alternative is the following command, but it will take more than
|
671
|
-
# 20 seconds to run.
|
672
|
-
# query_cmd = (
|
673
|
-
# f'az vm list --show-details --query "['
|
674
|
-
# f'?tags.\\"ray-cluster-name\\" == \'{handle.cluster_name}\' '
|
675
|
-
# '&& tags.\\"ray-node-type\\" == \'head\'].powerState" -o tsv'
|
676
|
-
# )
|
677
|
-
|
678
|
-
if returncode != 0:
|
679
|
-
with ux_utils.print_exception_no_traceback():
|
680
|
-
raise exceptions.ClusterStatusFetchingError(
|
681
|
-
f'Failed to query Azure cluster {name!r} status: '
|
682
|
-
f'{stdout + stderr}')
|
683
|
-
|
684
|
-
assert stdout.strip(), f'No status returned for {name!r}'
|
685
|
-
|
686
|
-
original_statuses_list = json.loads(stdout.strip())
|
687
|
-
if not original_statuses_list:
|
688
|
-
# No nodes found. The original_statuses_list will be empty string.
|
689
|
-
# Return empty list.
|
690
|
-
return []
|
691
|
-
if not isinstance(original_statuses_list, list):
|
692
|
-
original_statuses_list = [original_statuses_list]
|
693
|
-
statuses = []
|
694
|
-
for s in original_statuses_list:
|
695
|
-
if s not in status_map:
|
696
|
-
with ux_utils.print_exception_no_traceback():
|
697
|
-
raise exceptions.ClusterStatusFetchingError(
|
698
|
-
f'Failed to parse status from Azure response: {stdout}')
|
699
|
-
node_status = status_map[s]
|
700
|
-
if node_status is not None:
|
701
|
-
statuses.append(node_status)
|
702
|
-
return statuses
|