skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +64 -32
- sky/adaptors/aws.py +23 -6
- sky/adaptors/azure.py +432 -15
- sky/adaptors/cloudflare.py +5 -5
- sky/adaptors/common.py +19 -9
- sky/adaptors/do.py +20 -0
- sky/adaptors/gcp.py +3 -2
- sky/adaptors/kubernetes.py +122 -88
- sky/adaptors/nebius.py +100 -0
- sky/adaptors/oci.py +39 -1
- sky/adaptors/vast.py +29 -0
- sky/admin_policy.py +101 -0
- sky/authentication.py +117 -98
- sky/backends/backend.py +52 -20
- sky/backends/backend_utils.py +669 -557
- sky/backends/cloud_vm_ray_backend.py +1099 -808
- sky/backends/local_docker_backend.py +14 -8
- sky/backends/wheel_utils.py +38 -20
- sky/benchmark/benchmark_utils.py +22 -23
- sky/check.py +76 -27
- sky/cli.py +1586 -1139
- sky/client/__init__.py +1 -0
- sky/client/cli.py +5683 -0
- sky/client/common.py +345 -0
- sky/client/sdk.py +1765 -0
- sky/cloud_stores.py +283 -19
- sky/clouds/__init__.py +7 -2
- sky/clouds/aws.py +303 -112
- sky/clouds/azure.py +185 -179
- sky/clouds/cloud.py +115 -37
- sky/clouds/cudo.py +29 -22
- sky/clouds/do.py +313 -0
- sky/clouds/fluidstack.py +44 -54
- sky/clouds/gcp.py +206 -65
- sky/clouds/ibm.py +26 -21
- sky/clouds/kubernetes.py +345 -91
- sky/clouds/lambda_cloud.py +40 -29
- sky/clouds/nebius.py +297 -0
- sky/clouds/oci.py +129 -90
- sky/clouds/paperspace.py +22 -18
- sky/clouds/runpod.py +53 -34
- sky/clouds/scp.py +28 -24
- sky/clouds/service_catalog/__init__.py +19 -13
- sky/clouds/service_catalog/aws_catalog.py +29 -12
- sky/clouds/service_catalog/azure_catalog.py +33 -6
- sky/clouds/service_catalog/common.py +95 -75
- sky/clouds/service_catalog/constants.py +3 -3
- sky/clouds/service_catalog/cudo_catalog.py +13 -3
- sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
- sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
- sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
- sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
- sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
- sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
- sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
- sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
- sky/clouds/service_catalog/do_catalog.py +111 -0
- sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
- sky/clouds/service_catalog/gcp_catalog.py +16 -2
- sky/clouds/service_catalog/ibm_catalog.py +2 -2
- sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
- sky/clouds/service_catalog/lambda_catalog.py +8 -3
- sky/clouds/service_catalog/nebius_catalog.py +116 -0
- sky/clouds/service_catalog/oci_catalog.py +31 -4
- sky/clouds/service_catalog/paperspace_catalog.py +2 -2
- sky/clouds/service_catalog/runpod_catalog.py +2 -2
- sky/clouds/service_catalog/scp_catalog.py +2 -2
- sky/clouds/service_catalog/vast_catalog.py +104 -0
- sky/clouds/service_catalog/vsphere_catalog.py +2 -2
- sky/clouds/utils/aws_utils.py +65 -0
- sky/clouds/utils/azure_utils.py +91 -0
- sky/clouds/utils/gcp_utils.py +5 -9
- sky/clouds/utils/oci_utils.py +47 -5
- sky/clouds/utils/scp_utils.py +4 -3
- sky/clouds/vast.py +280 -0
- sky/clouds/vsphere.py +22 -18
- sky/core.py +361 -107
- sky/dag.py +41 -28
- sky/data/data_transfer.py +37 -0
- sky/data/data_utils.py +211 -32
- sky/data/mounting_utils.py +182 -30
- sky/data/storage.py +2118 -270
- sky/data/storage_utils.py +126 -5
- sky/exceptions.py +179 -8
- sky/execution.py +158 -85
- sky/global_user_state.py +150 -34
- sky/jobs/__init__.py +12 -10
- sky/jobs/client/__init__.py +0 -0
- sky/jobs/client/sdk.py +302 -0
- sky/jobs/constants.py +49 -11
- sky/jobs/controller.py +161 -99
- sky/jobs/dashboard/dashboard.py +171 -25
- sky/jobs/dashboard/templates/index.html +572 -60
- sky/jobs/recovery_strategy.py +157 -156
- sky/jobs/scheduler.py +307 -0
- sky/jobs/server/__init__.py +1 -0
- sky/jobs/server/core.py +598 -0
- sky/jobs/server/dashboard_utils.py +69 -0
- sky/jobs/server/server.py +190 -0
- sky/jobs/state.py +627 -122
- sky/jobs/utils.py +615 -206
- sky/models.py +27 -0
- sky/optimizer.py +142 -83
- sky/provision/__init__.py +20 -5
- sky/provision/aws/config.py +124 -42
- sky/provision/aws/instance.py +130 -53
- sky/provision/azure/__init__.py +7 -0
- sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
- sky/provision/azure/config.py +220 -0
- sky/provision/azure/instance.py +1012 -37
- sky/provision/common.py +31 -3
- sky/provision/constants.py +25 -0
- sky/provision/cudo/__init__.py +2 -1
- sky/provision/cudo/cudo_utils.py +112 -0
- sky/provision/cudo/cudo_wrapper.py +37 -16
- sky/provision/cudo/instance.py +28 -12
- sky/provision/do/__init__.py +11 -0
- sky/provision/do/config.py +14 -0
- sky/provision/do/constants.py +10 -0
- sky/provision/do/instance.py +287 -0
- sky/provision/do/utils.py +301 -0
- sky/provision/docker_utils.py +82 -46
- sky/provision/fluidstack/fluidstack_utils.py +57 -125
- sky/provision/fluidstack/instance.py +15 -43
- sky/provision/gcp/config.py +19 -9
- sky/provision/gcp/constants.py +7 -1
- sky/provision/gcp/instance.py +55 -34
- sky/provision/gcp/instance_utils.py +339 -80
- sky/provision/gcp/mig_utils.py +210 -0
- sky/provision/instance_setup.py +172 -133
- sky/provision/kubernetes/__init__.py +1 -0
- sky/provision/kubernetes/config.py +104 -90
- sky/provision/kubernetes/constants.py +8 -0
- sky/provision/kubernetes/instance.py +680 -325
- sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
- sky/provision/kubernetes/network.py +54 -20
- sky/provision/kubernetes/network_utils.py +70 -21
- sky/provision/kubernetes/utils.py +1370 -251
- sky/provision/lambda_cloud/__init__.py +11 -0
- sky/provision/lambda_cloud/config.py +10 -0
- sky/provision/lambda_cloud/instance.py +265 -0
- sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
- sky/provision/logging.py +1 -1
- sky/provision/nebius/__init__.py +11 -0
- sky/provision/nebius/config.py +11 -0
- sky/provision/nebius/instance.py +285 -0
- sky/provision/nebius/utils.py +318 -0
- sky/provision/oci/__init__.py +15 -0
- sky/provision/oci/config.py +51 -0
- sky/provision/oci/instance.py +436 -0
- sky/provision/oci/query_utils.py +681 -0
- sky/provision/paperspace/constants.py +6 -0
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/paperspace/utils.py +2 -0
- sky/provision/provisioner.py +207 -130
- sky/provision/runpod/__init__.py +1 -0
- sky/provision/runpod/api/__init__.py +3 -0
- sky/provision/runpod/api/commands.py +119 -0
- sky/provision/runpod/api/pods.py +142 -0
- sky/provision/runpod/instance.py +64 -8
- sky/provision/runpod/utils.py +239 -23
- sky/provision/vast/__init__.py +10 -0
- sky/provision/vast/config.py +11 -0
- sky/provision/vast/instance.py +247 -0
- sky/provision/vast/utils.py +162 -0
- sky/provision/vsphere/common/vim_utils.py +1 -1
- sky/provision/vsphere/instance.py +8 -18
- sky/provision/vsphere/vsphere_utils.py +1 -1
- sky/resources.py +247 -102
- sky/serve/__init__.py +9 -9
- sky/serve/autoscalers.py +361 -299
- sky/serve/client/__init__.py +0 -0
- sky/serve/client/sdk.py +366 -0
- sky/serve/constants.py +12 -3
- sky/serve/controller.py +106 -36
- sky/serve/load_balancer.py +63 -12
- sky/serve/load_balancing_policies.py +84 -2
- sky/serve/replica_managers.py +42 -34
- sky/serve/serve_state.py +62 -32
- sky/serve/serve_utils.py +271 -160
- sky/serve/server/__init__.py +0 -0
- sky/serve/{core.py → server/core.py} +271 -90
- sky/serve/server/server.py +112 -0
- sky/serve/service.py +52 -16
- sky/serve/service_spec.py +95 -32
- sky/server/__init__.py +1 -0
- sky/server/common.py +430 -0
- sky/server/constants.py +21 -0
- sky/server/html/log.html +174 -0
- sky/server/requests/__init__.py +0 -0
- sky/server/requests/executor.py +472 -0
- sky/server/requests/payloads.py +487 -0
- sky/server/requests/queues/__init__.py +0 -0
- sky/server/requests/queues/mp_queue.py +76 -0
- sky/server/requests/requests.py +567 -0
- sky/server/requests/serializers/__init__.py +0 -0
- sky/server/requests/serializers/decoders.py +192 -0
- sky/server/requests/serializers/encoders.py +166 -0
- sky/server/server.py +1106 -0
- sky/server/stream_utils.py +141 -0
- sky/setup_files/MANIFEST.in +2 -5
- sky/setup_files/dependencies.py +159 -0
- sky/setup_files/setup.py +14 -125
- sky/sky_logging.py +59 -14
- sky/skylet/autostop_lib.py +2 -2
- sky/skylet/constants.py +183 -50
- sky/skylet/events.py +22 -10
- sky/skylet/job_lib.py +403 -258
- sky/skylet/log_lib.py +111 -71
- sky/skylet/log_lib.pyi +6 -0
- sky/skylet/providers/command_runner.py +6 -8
- sky/skylet/providers/ibm/node_provider.py +2 -2
- sky/skylet/providers/scp/config.py +11 -3
- sky/skylet/providers/scp/node_provider.py +8 -8
- sky/skylet/skylet.py +3 -1
- sky/skylet/subprocess_daemon.py +69 -17
- sky/skypilot_config.py +119 -57
- sky/task.py +205 -64
- sky/templates/aws-ray.yml.j2 +37 -7
- sky/templates/azure-ray.yml.j2 +27 -82
- sky/templates/cudo-ray.yml.j2 +7 -3
- sky/templates/do-ray.yml.j2 +98 -0
- sky/templates/fluidstack-ray.yml.j2 +7 -4
- sky/templates/gcp-ray.yml.j2 +26 -6
- sky/templates/ibm-ray.yml.j2 +3 -2
- sky/templates/jobs-controller.yaml.j2 +46 -11
- sky/templates/kubernetes-ingress.yml.j2 +7 -0
- sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
- sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
- sky/templates/kubernetes-ray.yml.j2 +292 -25
- sky/templates/lambda-ray.yml.j2 +30 -40
- sky/templates/nebius-ray.yml.j2 +79 -0
- sky/templates/oci-ray.yml.j2 +18 -57
- sky/templates/paperspace-ray.yml.j2 +10 -6
- sky/templates/runpod-ray.yml.j2 +26 -4
- sky/templates/scp-ray.yml.j2 +3 -2
- sky/templates/sky-serve-controller.yaml.j2 +12 -1
- sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
- sky/templates/vast-ray.yml.j2 +70 -0
- sky/templates/vsphere-ray.yml.j2 +8 -3
- sky/templates/websocket_proxy.py +64 -0
- sky/usage/constants.py +10 -1
- sky/usage/usage_lib.py +130 -37
- sky/utils/accelerator_registry.py +35 -51
- sky/utils/admin_policy_utils.py +147 -0
- sky/utils/annotations.py +51 -0
- sky/utils/cli_utils/status_utils.py +81 -23
- sky/utils/cluster_utils.py +356 -0
- sky/utils/command_runner.py +452 -89
- sky/utils/command_runner.pyi +77 -3
- sky/utils/common.py +54 -0
- sky/utils/common_utils.py +319 -108
- sky/utils/config_utils.py +204 -0
- sky/utils/control_master_utils.py +48 -0
- sky/utils/controller_utils.py +548 -266
- sky/utils/dag_utils.py +93 -32
- sky/utils/db_utils.py +18 -4
- sky/utils/env_options.py +29 -7
- sky/utils/kubernetes/create_cluster.sh +8 -60
- sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
- sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
- sky/utils/kubernetes/gpu_labeler.py +4 -4
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
- sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
- sky/utils/kubernetes/rsync_helper.sh +24 -0
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
- sky/utils/log_utils.py +240 -33
- sky/utils/message_utils.py +81 -0
- sky/utils/registry.py +127 -0
- sky/utils/resources_utils.py +94 -22
- sky/utils/rich_utils.py +247 -18
- sky/utils/schemas.py +284 -64
- sky/{status_lib.py → utils/status_lib.py} +12 -7
- sky/utils/subprocess_utils.py +212 -46
- sky/utils/timeline.py +12 -7
- sky/utils/ux_utils.py +168 -15
- skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
- skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
- sky/clouds/cloud_registry.py +0 -31
- sky/jobs/core.py +0 -330
- sky/skylet/providers/azure/__init__.py +0 -2
- sky/skylet/providers/azure/azure-vm-template.json +0 -301
- sky/skylet/providers/azure/config.py +0 -170
- sky/skylet/providers/azure/node_provider.py +0 -466
- sky/skylet/providers/lambda_cloud/__init__.py +0 -2
- sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
- sky/skylet/providers/oci/__init__.py +0 -2
- sky/skylet/providers/oci/node_provider.py +0 -488
- sky/skylet/providers/oci/query_helper.py +0 -383
- sky/skylet/providers/oci/utils.py +0 -21
- sky/utils/cluster_yaml_utils.py +0 -24
- sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
- skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
- skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
sky/adaptors/azure.py
CHANGED
@@ -1,25 +1,58 @@
|
|
1
1
|
"""Azure cli adaptor"""
|
2
2
|
|
3
3
|
# pylint: disable=import-outside-toplevel
|
4
|
-
import
|
4
|
+
import asyncio
|
5
|
+
import datetime
|
6
|
+
import logging
|
5
7
|
import threading
|
8
|
+
import time
|
9
|
+
from typing import Any, Optional
|
10
|
+
import uuid
|
6
11
|
|
12
|
+
from sky import exceptions as sky_exceptions
|
13
|
+
from sky import sky_logging
|
7
14
|
from sky.adaptors import common
|
15
|
+
from sky.skylet import constants
|
16
|
+
from sky.utils import annotations
|
17
|
+
from sky.utils import common_utils
|
18
|
+
from sky.utils import ux_utils
|
8
19
|
|
9
20
|
azure = common.LazyImport(
|
10
21
|
'azure',
|
11
22
|
import_error_message=('Failed to import dependencies for Azure.'
|
12
|
-
'Try pip install "skypilot[azure]"')
|
23
|
+
'Try pip install "skypilot[azure]"'),
|
24
|
+
set_loggers=lambda: logging.getLogger('azure.identity').setLevel(logging.
|
25
|
+
ERROR))
|
26
|
+
Client = Any
|
27
|
+
sky_logger = sky_logging.init_logger(__name__)
|
28
|
+
|
13
29
|
_LAZY_MODULES = (azure,)
|
14
30
|
|
15
31
|
_session_creation_lock = threading.RLock()
|
32
|
+
_MAX_RETRY_FOR_GET_SUBSCRIPTION_ID = 5
|
16
33
|
|
17
34
|
|
18
35
|
@common.load_lazy_modules(modules=_LAZY_MODULES)
|
36
|
+
@annotations.lru_cache(scope='global', maxsize=1)
|
19
37
|
def get_subscription_id() -> str:
|
20
38
|
"""Get the default subscription id."""
|
21
39
|
from azure.common import credentials
|
22
|
-
|
40
|
+
retry = 0
|
41
|
+
backoff = common_utils.Backoff(initial_backoff=0.5, max_backoff_factor=4)
|
42
|
+
while True:
|
43
|
+
try:
|
44
|
+
return credentials.get_cli_profile().get_subscription_id()
|
45
|
+
except Exception as e:
|
46
|
+
if ('Please run \'az login\' to setup account.' in str(e) and
|
47
|
+
retry < _MAX_RETRY_FOR_GET_SUBSCRIPTION_ID):
|
48
|
+
# When there are multiple processes trying to get the
|
49
|
+
# subscription id, it may fail with the above error message.
|
50
|
+
# Retry will fix the issue.
|
51
|
+
retry += 1
|
52
|
+
|
53
|
+
time.sleep(backoff.current_backoff())
|
54
|
+
continue
|
55
|
+
raise
|
23
56
|
|
24
57
|
|
25
58
|
@common.load_lazy_modules(modules=_LAZY_MODULES)
|
@@ -36,30 +69,414 @@ def exceptions():
|
|
36
69
|
return azure_exceptions
|
37
70
|
|
38
71
|
|
39
|
-
@
|
72
|
+
@annotations.lru_cache(scope='global')
|
73
|
+
@common.load_lazy_modules(modules=_LAZY_MODULES)
|
74
|
+
def azure_mgmt_models(name: str):
|
75
|
+
if name == 'compute':
|
76
|
+
from azure.mgmt.compute import models
|
77
|
+
return models
|
78
|
+
elif name == 'network':
|
79
|
+
from azure.mgmt.network import models
|
80
|
+
return models
|
81
|
+
|
82
|
+
|
83
|
+
# We should keep the order of the decorators having 'lru_cache' followed
|
84
|
+
# by 'load_lazy_modules' as we need to make sure a caller can call
|
85
|
+
# 'get_client.cache_clear', which is a function provided by 'lru_cache'
|
86
|
+
@annotations.lru_cache(scope='global')
|
40
87
|
@common.load_lazy_modules(modules=_LAZY_MODULES)
|
41
|
-
def get_client(name: str,
|
88
|
+
def get_client(name: str,
|
89
|
+
subscription_id: Optional[str] = None,
|
90
|
+
**kwargs) -> Client:
|
91
|
+
"""Creates and returns an Azure client for the specified service.
|
92
|
+
|
93
|
+
Args:
|
94
|
+
name: The type of Azure client to create.
|
95
|
+
subscription_id: The Azure subscription ID. Defaults to None.
|
96
|
+
|
97
|
+
Returns:
|
98
|
+
An instance of the specified Azure client.
|
99
|
+
|
100
|
+
Raises:
|
101
|
+
NonExistentStorageAccountError: When storage account provided
|
102
|
+
either through config.yaml or local db does not exist under
|
103
|
+
user's subscription ID.
|
104
|
+
StorageBucketGetError: If there is an error retrieving the container
|
105
|
+
client or if a non-existent public container is specified.
|
106
|
+
ValueError: If an unsupported client type is specified.
|
107
|
+
TimeoutError: If unable to get the container client within the
|
108
|
+
specified time.
|
109
|
+
"""
|
42
110
|
# Sky only supports Azure CLI credential for now.
|
43
111
|
# Increase the timeout to fix the Azure get-access-token timeout issue.
|
44
112
|
# Tracked in
|
45
113
|
# https://github.com/Azure/azure-cli/issues/20404#issuecomment-1249575110
|
46
|
-
from azure
|
114
|
+
from azure import identity
|
47
115
|
with _session_creation_lock:
|
48
|
-
credential = AzureCliCredential(process_timeout=30)
|
116
|
+
credential = identity.AzureCliCredential(process_timeout=30)
|
49
117
|
if name == 'compute':
|
50
|
-
from azure.mgmt
|
51
|
-
return ComputeManagementClient(credential, subscription_id)
|
118
|
+
from azure.mgmt import compute
|
119
|
+
return compute.ComputeManagementClient(credential, subscription_id)
|
52
120
|
elif name == 'network':
|
53
|
-
from azure.mgmt
|
54
|
-
return NetworkManagementClient(credential, subscription_id)
|
121
|
+
from azure.mgmt import network
|
122
|
+
return network.NetworkManagementClient(credential, subscription_id)
|
55
123
|
elif name == 'resource':
|
56
|
-
from azure.mgmt
|
57
|
-
return ResourceManagementClient(credential,
|
124
|
+
from azure.mgmt import resource
|
125
|
+
return resource.ResourceManagementClient(credential,
|
126
|
+
subscription_id)
|
127
|
+
elif name == 'storage':
|
128
|
+
from azure.mgmt import storage
|
129
|
+
return storage.StorageManagementClient(credential, subscription_id)
|
130
|
+
elif name == 'authorization':
|
131
|
+
from azure.mgmt import authorization
|
132
|
+
return authorization.AuthorizationManagementClient(
|
133
|
+
credential, subscription_id)
|
134
|
+
elif name == 'msi':
|
135
|
+
from azure.mgmt import msi
|
136
|
+
return msi.ManagedServiceIdentityClient(credential, subscription_id)
|
137
|
+
elif name == 'graph':
|
138
|
+
import msgraph
|
139
|
+
return msgraph.GraphServiceClient(credential)
|
140
|
+
elif name == 'container':
|
141
|
+
# There is no direct way to check if a container URL is public or
|
142
|
+
# private. Attempting to access a private container without
|
143
|
+
# credentials or a public container with credentials throws an
|
144
|
+
# error. Therefore, we use a try-except block, first assuming the
|
145
|
+
# URL is for a public container. If an error occurs, we retry with
|
146
|
+
# credentials, assuming it's a private container.
|
147
|
+
# Reference: https://github.com/Azure/azure-sdk-for-python/issues/35770 # pylint: disable=line-too-long
|
148
|
+
# Note: Checking a private container without credentials is
|
149
|
+
# faster (~0.2s) than checking a public container with
|
150
|
+
# credentials (~90s).
|
151
|
+
from azure.mgmt import storage
|
152
|
+
from azure.storage import blob
|
153
|
+
container_url = kwargs.pop('container_url', None)
|
154
|
+
assert container_url is not None, ('Must provide container_url'
|
155
|
+
' keyword arguments for '
|
156
|
+
'container client.')
|
157
|
+
storage_account_name = kwargs.pop('storage_account_name', None)
|
158
|
+
assert storage_account_name is not None, ('Must provide '
|
159
|
+
'storage_account_name '
|
160
|
+
'keyword arguments for '
|
161
|
+
'container client.')
|
162
|
+
|
163
|
+
# Check if the given storage account exists. This separate check
|
164
|
+
# is necessary as running container_client.exists() with container
|
165
|
+
# url on a non-existent storage account errors out after a long lag (~90s).
|
166
|
+
storage_client = storage.StorageManagementClient(
|
167
|
+
credential, subscription_id)
|
168
|
+
storage_account_availability = (
|
169
|
+
storage_client.storage_accounts.check_name_availability(
|
170
|
+
{'name': storage_account_name}))
|
171
|
+
if storage_account_availability.name_available:
|
172
|
+
with ux_utils.print_exception_no_traceback():
|
173
|
+
raise sky_exceptions.NonExistentStorageAccountError(
|
174
|
+
f'The storage account {storage_account_name!r} does '
|
175
|
+
'not exist. Please check if the name is correct.')
|
176
|
+
|
177
|
+
# First, assume the URL is from a public container.
|
178
|
+
container_client = blob.ContainerClient.from_container_url(
|
179
|
+
container_url)
|
180
|
+
try:
|
181
|
+
container_client.exists()
|
182
|
+
return container_client
|
183
|
+
except exceptions().ClientAuthenticationError:
|
184
|
+
pass
|
185
|
+
|
186
|
+
# If the URL is not for a public container, assume it's private
|
187
|
+
# and retry with credentials.
|
188
|
+
start_time = time.time()
|
189
|
+
role_assigned = False
|
190
|
+
|
191
|
+
while (time.time() - start_time <
|
192
|
+
constants.WAIT_FOR_STORAGE_ACCOUNT_ROLE_ASSIGNMENT):
|
193
|
+
container_client = blob.ContainerClient.from_container_url(
|
194
|
+
container_url, credential)
|
195
|
+
try:
|
196
|
+
# Suppress noisy logs from Azure SDK when attempting
|
197
|
+
# to run exists() on private container without access.
|
198
|
+
# Reference:
|
199
|
+
# https://github.com/Azure/azure-sdk-for-python/issues/9422
|
200
|
+
azure_logger = logging.getLogger('azure')
|
201
|
+
original_level = azure_logger.getEffectiveLevel()
|
202
|
+
azure_logger.setLevel(logging.CRITICAL)
|
203
|
+
container_client.exists()
|
204
|
+
azure_logger.setLevel(original_level)
|
205
|
+
return container_client
|
206
|
+
except exceptions().ClientAuthenticationError as e:
|
207
|
+
# Caught when user attempted to use private container
|
208
|
+
# without access rights. Raised error is handled at the
|
209
|
+
# upstream.
|
210
|
+
# Reference: https://learn.microsoft.com/en-us/troubleshoot/azure/entra/entra-id/app-integration/error-code-aadsts50020-user-account-identity-provider-does-not-exist # pylint: disable=line-too-long
|
211
|
+
if 'ERROR: AADSTS50020' in str(e):
|
212
|
+
with ux_utils.print_exception_no_traceback():
|
213
|
+
raise e
|
214
|
+
with ux_utils.print_exception_no_traceback():
|
215
|
+
raise sky_exceptions.StorageBucketGetError(
|
216
|
+
'Failed to retreive the container client for the '
|
217
|
+
f'container {container_client.container_name!r}. '
|
218
|
+
f'Details: '
|
219
|
+
f'{common_utils.format_exception(e, use_bracket=True)}'
|
220
|
+
)
|
221
|
+
except exceptions().HttpResponseError as e:
|
222
|
+
# Handle case where user lacks sufficient IAM role for
|
223
|
+
# a private container in the same subscription. Attempt to
|
224
|
+
# assign appropriate role to current user.
|
225
|
+
if 'AuthorizationPermissionMismatch' in str(e):
|
226
|
+
if not role_assigned:
|
227
|
+
# resource_group_name is not None only for private
|
228
|
+
# containers with user access.
|
229
|
+
resource_group_name = kwargs.pop(
|
230
|
+
'resource_group_name', None)
|
231
|
+
assert resource_group_name is not None, (
|
232
|
+
'Must provide resource_group_name keyword '
|
233
|
+
'arguments for container client.')
|
234
|
+
sky_logger.info(
|
235
|
+
'Failed to check the existance of the '
|
236
|
+
f'container {container_url!r} due to '
|
237
|
+
'insufficient IAM role for storage '
|
238
|
+
f'account {storage_account_name!r}.')
|
239
|
+
assign_storage_account_iam_role(
|
240
|
+
storage_account_name=storage_account_name,
|
241
|
+
resource_group_name=resource_group_name)
|
242
|
+
role_assigned = True
|
243
|
+
else:
|
244
|
+
sky_logger.info(
|
245
|
+
'Waiting due to the propagation delay of IAM '
|
246
|
+
'role assignment to the storage account '
|
247
|
+
f'{storage_account_name!r}.')
|
248
|
+
time.sleep(
|
249
|
+
constants.RETRY_INTERVAL_AFTER_ROLE_ASSIGNMENT)
|
250
|
+
continue
|
251
|
+
with ux_utils.print_exception_no_traceback():
|
252
|
+
raise sky_exceptions.StorageBucketGetError(
|
253
|
+
'Failed to retreive the container client for the '
|
254
|
+
f'container {container_client.container_name!r}. '
|
255
|
+
f'Details: '
|
256
|
+
f'{common_utils.format_exception(e, use_bracket=True)}'
|
257
|
+
)
|
258
|
+
else:
|
259
|
+
raise TimeoutError(
|
260
|
+
'Failed to get the container client within '
|
261
|
+
f'{constants.WAIT_FOR_STORAGE_ACCOUNT_ROLE_ASSIGNMENT}'
|
262
|
+
' seconds.')
|
58
263
|
else:
|
59
264
|
raise ValueError(f'Client not supported: "{name}"')
|
60
265
|
|
61
266
|
|
267
|
+
@common.load_lazy_modules(modules=_LAZY_MODULES)
|
268
|
+
def get_az_container_sas_token(
|
269
|
+
storage_account_name: str,
|
270
|
+
storage_account_key: str,
|
271
|
+
container_name: str,
|
272
|
+
) -> str:
|
273
|
+
"""Returns SAS token used to access container.
|
274
|
+
|
275
|
+
Args:
|
276
|
+
storage_account_name: Name of the storage account
|
277
|
+
storage_account_key: Access key for the given storage account
|
278
|
+
container_name: The name of the mounting container
|
279
|
+
|
280
|
+
Returns:
|
281
|
+
A SAS token with a 1-hour lifespan to access the specified container.
|
282
|
+
"""
|
283
|
+
from azure.storage import blob
|
284
|
+
sas_token = blob.generate_container_sas(
|
285
|
+
account_name=storage_account_name,
|
286
|
+
container_name=container_name,
|
287
|
+
account_key=storage_account_key,
|
288
|
+
permission=blob.ContainerSasPermissions(read=True,
|
289
|
+
write=True,
|
290
|
+
list=True,
|
291
|
+
create=True),
|
292
|
+
expiry=datetime.datetime.now(datetime.timezone.utc) +
|
293
|
+
datetime.timedelta(hours=1))
|
294
|
+
return sas_token
|
295
|
+
|
296
|
+
|
297
|
+
@common.load_lazy_modules(modules=_LAZY_MODULES)
|
298
|
+
def get_az_blob_sas_token(storage_account_name: str, storage_account_key: str,
|
299
|
+
container_name: str, blob_name: str) -> str:
|
300
|
+
"""Returns SAS token used to access a blob.
|
301
|
+
|
302
|
+
Args:
|
303
|
+
storage_account_name: Name of the storage account
|
304
|
+
storage_account_key: access key for the given storage
|
305
|
+
account
|
306
|
+
container_name: name of the mounting container
|
307
|
+
blob_name: path to the blob(file)
|
308
|
+
|
309
|
+
Returns:
|
310
|
+
A SAS token with a 1-hour lifespan to access the specified blob.
|
311
|
+
"""
|
312
|
+
from azure.storage import blob
|
313
|
+
sas_token = blob.generate_blob_sas(
|
314
|
+
account_name=storage_account_name,
|
315
|
+
container_name=container_name,
|
316
|
+
blob_name=blob_name,
|
317
|
+
account_key=storage_account_key,
|
318
|
+
permission=blob.BlobSasPermissions(read=True,
|
319
|
+
write=True,
|
320
|
+
list=True,
|
321
|
+
create=True),
|
322
|
+
expiry=datetime.datetime.now(datetime.timezone.utc) +
|
323
|
+
datetime.timedelta(hours=1))
|
324
|
+
return sas_token
|
325
|
+
|
326
|
+
|
327
|
+
def assign_storage_account_iam_role(
|
328
|
+
storage_account_name: str,
|
329
|
+
storage_account_id: Optional[str] = None,
|
330
|
+
resource_group_name: Optional[str] = None) -> None:
|
331
|
+
"""Assigns the Storage Blob Data Owner role to a storage account.
|
332
|
+
|
333
|
+
This function retrieves the current user's object ID, then assigns the
|
334
|
+
Storage Blob Data Owner role to that user for the specified storage
|
335
|
+
account. If the role is already assigned, the function will return without
|
336
|
+
making changes.
|
337
|
+
|
338
|
+
Args:
|
339
|
+
storage_account_name: The name of the storage account.
|
340
|
+
storage_account_id: The ID of the storage account. If not provided,
|
341
|
+
it will be determined using the storage account name.
|
342
|
+
resource_group_name: Name of the resource group the
|
343
|
+
passed storage account belongs to.
|
344
|
+
|
345
|
+
Raises:
|
346
|
+
StorageBucketCreateError: If there is an error assigning the role
|
347
|
+
to the storage account.
|
348
|
+
"""
|
349
|
+
subscription_id = get_subscription_id()
|
350
|
+
authorization_client = get_client('authorization', subscription_id)
|
351
|
+
graph_client = get_client('graph')
|
352
|
+
|
353
|
+
# Obtaining user's object ID to assign role.
|
354
|
+
# Reference: https://github.com/Azure/azure-sdk-for-python/issues/35573 # pylint: disable=line-too-long
|
355
|
+
async def get_object_id() -> str:
|
356
|
+
httpx_logger = logging.getLogger('httpx')
|
357
|
+
original_level = httpx_logger.getEffectiveLevel()
|
358
|
+
# silencing the INFO level response log from httpx request
|
359
|
+
httpx_logger.setLevel(logging.WARNING)
|
360
|
+
user = await graph_client.users.with_url(
|
361
|
+
'https://graph.microsoft.com/v1.0/me').get()
|
362
|
+
httpx_logger.setLevel(original_level)
|
363
|
+
object_id = str(user.additional_data['id'])
|
364
|
+
return object_id
|
365
|
+
|
366
|
+
# Create a new event loop if none exists
|
367
|
+
try:
|
368
|
+
loop = asyncio.get_running_loop()
|
369
|
+
except RuntimeError:
|
370
|
+
loop = asyncio.new_event_loop()
|
371
|
+
asyncio.set_event_loop(loop)
|
372
|
+
|
373
|
+
object_id = loop.run_until_complete(get_object_id())
|
374
|
+
|
375
|
+
# Definition ID of Storage Blob Data Owner role.
|
376
|
+
# Reference: https://learn.microsoft.com/en-us/azure/role-based-access-control/built-in-roles/storage#storage-blob-data-owner # pylint: disable=line-too-long
|
377
|
+
storage_blob_data_owner_role_id = 'b7e6dc6d-f1e8-4753-8033-0f276bb0955b'
|
378
|
+
role_definition_id = ('/subscriptions'
|
379
|
+
f'/{subscription_id}'
|
380
|
+
'/providers/Microsoft.Authorization'
|
381
|
+
'/roleDefinitions'
|
382
|
+
f'/{storage_blob_data_owner_role_id}')
|
383
|
+
|
384
|
+
# Obtain storage account ID to assign role if not provided.
|
385
|
+
if storage_account_id is None:
|
386
|
+
assert resource_group_name is not None, ('resource_group_name should '
|
387
|
+
'be provided if '
|
388
|
+
'storage_account_id is not.')
|
389
|
+
storage_client = get_client('storage', subscription_id)
|
390
|
+
storage_account = storage_client.storage_accounts.get_properties(
|
391
|
+
resource_group_name, storage_account_name)
|
392
|
+
storage_account_id = storage_account.id
|
393
|
+
|
394
|
+
role_assignment_failure_error_msg = (
|
395
|
+
constants.ROLE_ASSIGNMENT_FAILURE_ERROR_MSG.format(
|
396
|
+
storage_account_name=storage_account_name))
|
397
|
+
try:
|
398
|
+
authorization_client.role_assignments.create(
|
399
|
+
scope=storage_account_id,
|
400
|
+
role_assignment_name=uuid.uuid4(),
|
401
|
+
parameters={
|
402
|
+
'properties': {
|
403
|
+
'principalId': object_id,
|
404
|
+
'principalType': 'User',
|
405
|
+
'roleDefinitionId': role_definition_id,
|
406
|
+
}
|
407
|
+
},
|
408
|
+
)
|
409
|
+
sky_logger.info('Assigned Storage Blob Data Owner role to your '
|
410
|
+
f'account on storage account {storage_account_name!r}.')
|
411
|
+
return
|
412
|
+
except exceptions().ResourceExistsError as e:
|
413
|
+
# Return if the storage account already has been assigned
|
414
|
+
# the role.
|
415
|
+
if 'RoleAssignmentExists' in str(e):
|
416
|
+
return
|
417
|
+
else:
|
418
|
+
with ux_utils.print_exception_no_traceback():
|
419
|
+
raise sky_exceptions.StorageBucketCreateError(
|
420
|
+
f'{role_assignment_failure_error_msg}'
|
421
|
+
f'Details: {common_utils.format_exception(e, use_bracket=True)}'
|
422
|
+
)
|
423
|
+
except exceptions().HttpResponseError as e:
|
424
|
+
if 'AuthorizationFailed' in str(e):
|
425
|
+
with ux_utils.print_exception_no_traceback():
|
426
|
+
raise sky_exceptions.StorageBucketCreateError(
|
427
|
+
f'{role_assignment_failure_error_msg}'
|
428
|
+
'Please check to see if you have the authorization'
|
429
|
+
' "Microsoft.Authorization/roleAssignments/write" '
|
430
|
+
'to assign the role to the newly created storage '
|
431
|
+
'account.')
|
432
|
+
else:
|
433
|
+
with ux_utils.print_exception_no_traceback():
|
434
|
+
raise sky_exceptions.StorageBucketCreateError(
|
435
|
+
f'{role_assignment_failure_error_msg}'
|
436
|
+
f'Details: {common_utils.format_exception(e, use_bracket=True)}'
|
437
|
+
)
|
438
|
+
|
439
|
+
|
440
|
+
def get_az_resource_group(
|
441
|
+
storage_account_name: str,
|
442
|
+
storage_client: Optional[Client] = None) -> Optional[str]:
|
443
|
+
"""Returns the resource group name the given storage account belongs to.
|
444
|
+
|
445
|
+
Args:
|
446
|
+
storage_account_name: Name of the storage account
|
447
|
+
storage_client: Client object facing storage
|
448
|
+
|
449
|
+
Returns:
|
450
|
+
Name of the resource group the given storage account belongs to, or
|
451
|
+
None if not found.
|
452
|
+
"""
|
453
|
+
if storage_client is None:
|
454
|
+
subscription_id = get_subscription_id()
|
455
|
+
storage_client = get_client('storage', subscription_id)
|
456
|
+
for account in storage_client.storage_accounts.list():
|
457
|
+
if account.name == storage_account_name:
|
458
|
+
# Extract the resource group name from the account ID
|
459
|
+
# An example of account.id would be the following:
|
460
|
+
# /subscriptions/{subscription_id}/resourceGroups/{resource_group_name}/providers/Microsoft.Storage/storageAccounts/{storage_account_name} # pylint: disable=line-too-long
|
461
|
+
split_account_id = account.id.split('/')
|
462
|
+
assert len(split_account_id) == 9
|
463
|
+
resource_group_name = split_account_id[4]
|
464
|
+
return resource_group_name
|
465
|
+
# resource group cannot be found when using container not created
|
466
|
+
# under the user's subscription id, i.e. public container, or
|
467
|
+
# private containers not belonging to the user or when the storage account
|
468
|
+
# does not exist.
|
469
|
+
return None
|
470
|
+
|
471
|
+
|
62
472
|
@common.load_lazy_modules(modules=_LAZY_MODULES)
|
63
473
|
def create_security_rule(**kwargs):
|
64
|
-
from azure.mgmt.network
|
65
|
-
return SecurityRule(**kwargs)
|
474
|
+
from azure.mgmt.network import models
|
475
|
+
return models.SecurityRule(**kwargs)
|
476
|
+
|
477
|
+
|
478
|
+
@common.load_lazy_modules(modules=_LAZY_MODULES)
|
479
|
+
def deployment_mode():
|
480
|
+
"""Azure deployment mode."""
|
481
|
+
from azure.mgmt.resource.resources.models import DeploymentMode
|
482
|
+
return DeploymentMode
|
sky/adaptors/cloudflare.py
CHANGED
@@ -2,12 +2,12 @@
|
|
2
2
|
# pylint: disable=import-outside-toplevel
|
3
3
|
|
4
4
|
import contextlib
|
5
|
-
import functools
|
6
5
|
import os
|
7
6
|
import threading
|
8
7
|
from typing import Dict, Optional, Tuple
|
9
8
|
|
10
9
|
from sky.adaptors import common
|
10
|
+
from sky.utils import annotations
|
11
11
|
from sky.utils import ux_utils
|
12
12
|
|
13
13
|
_IMPORT_ERROR_MESSAGE = ('Failed to import dependencies for Cloudflare.'
|
@@ -62,7 +62,7 @@ def get_r2_credentials(boto3_session):
|
|
62
62
|
# lru_cache() is thread-safe and it will return the same session object
|
63
63
|
# for different threads.
|
64
64
|
# Reference: https://docs.python.org/3/library/functools.html#functools.lru_cache # pylint: disable=line-too-long
|
65
|
-
@
|
65
|
+
@annotations.lru_cache(scope='global')
|
66
66
|
def session():
|
67
67
|
"""Create an AWS session."""
|
68
68
|
# Creating the session object is not thread-safe for boto3,
|
@@ -76,7 +76,7 @@ def session():
|
|
76
76
|
return session_
|
77
77
|
|
78
78
|
|
79
|
-
@
|
79
|
+
@annotations.lru_cache(scope='global')
|
80
80
|
def resource(resource_name: str, **kwargs):
|
81
81
|
"""Create a Cloudflare resource.
|
82
82
|
|
@@ -102,7 +102,7 @@ def resource(resource_name: str, **kwargs):
|
|
102
102
|
**kwargs)
|
103
103
|
|
104
104
|
|
105
|
-
@
|
105
|
+
@annotations.lru_cache(scope='global')
|
106
106
|
def client(service_name: str, region):
|
107
107
|
"""Create an CLOUDFLARE client of a certain service.
|
108
108
|
|
@@ -177,7 +177,7 @@ def check_credentials() -> Tuple[bool, Optional[str]]:
|
|
177
177
|
hints += f'\n{_INDENT_PREFIX} $ mkdir -p ~/.cloudflare'
|
178
178
|
hints += f'\n{_INDENT_PREFIX} $ echo <YOUR_ACCOUNT_ID_HERE> > ~/.cloudflare/accountid' # pylint: disable=line-too-long
|
179
179
|
hints += f'\n{_INDENT_PREFIX}For more info: '
|
180
|
-
hints += 'https://skypilot.
|
180
|
+
hints += 'https://docs.skypilot.co/en/latest/getting-started/installation.html#cloudflare-r2' # pylint: disable=line-too-long
|
181
181
|
|
182
182
|
return (False, hints) if hints else (True, hints)
|
183
183
|
|
sky/adaptors/common.py
CHANGED
@@ -1,7 +1,8 @@
|
|
1
1
|
"""Lazy import for modules to avoid import error when not used."""
|
2
2
|
import functools
|
3
3
|
import importlib
|
4
|
-
|
4
|
+
import threading
|
5
|
+
from typing import Any, Callable, Optional, Tuple
|
5
6
|
|
6
7
|
|
7
8
|
class LazyImport:
|
@@ -18,19 +19,28 @@ class LazyImport:
|
|
18
19
|
|
19
20
|
def __init__(self,
|
20
21
|
module_name: str,
|
21
|
-
import_error_message: Optional[str] = None
|
22
|
+
import_error_message: Optional[str] = None,
|
23
|
+
set_loggers: Optional[Callable] = None):
|
22
24
|
self._module_name = module_name
|
23
25
|
self._module = None
|
24
26
|
self._import_error_message = import_error_message
|
27
|
+
self._set_loggers = set_loggers
|
28
|
+
self._lock = threading.RLock()
|
25
29
|
|
26
30
|
def load_module(self):
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
31
|
+
# Avoid extra imports when multiple threads try to import the same
|
32
|
+
# module. The overhead is minor since import can only run in serial
|
33
|
+
# due to GIL even in multi-threaded environments.
|
34
|
+
with self._lock:
|
35
|
+
if self._module is None:
|
36
|
+
try:
|
37
|
+
self._module = importlib.import_module(self._module_name)
|
38
|
+
if self._set_loggers is not None:
|
39
|
+
self._set_loggers()
|
40
|
+
except ImportError as e:
|
41
|
+
if self._import_error_message is not None:
|
42
|
+
raise ImportError(self._import_error_message) from e
|
43
|
+
raise
|
34
44
|
return self._module
|
35
45
|
|
36
46
|
def __getattr__(self, name: str) -> Any:
|
sky/adaptors/do.py
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
"""Digital Ocean cloud adaptors"""
|
2
|
+
|
3
|
+
# pylint: disable=import-outside-toplevel
|
4
|
+
|
5
|
+
from sky.adaptors import common
|
6
|
+
|
7
|
+
_IMPORT_ERROR_MESSAGE = ('Failed to import dependencies for DO. '
|
8
|
+
'Try pip install "skypilot[do]"')
|
9
|
+
pydo = common.LazyImport('pydo', import_error_message=_IMPORT_ERROR_MESSAGE)
|
10
|
+
azure = common.LazyImport('azure', import_error_message=_IMPORT_ERROR_MESSAGE)
|
11
|
+
_LAZY_MODULES = (pydo, azure)
|
12
|
+
|
13
|
+
|
14
|
+
# `pydo` inherits Azure exceptions. See:
|
15
|
+
# https://github.com/digitalocean/pydo/blob/7b01498d99eb0d3a772366b642e5fab3d6fc6aa2/examples/poc_droplets_volumes_sshkeys.py#L6
|
16
|
+
@common.load_lazy_modules(modules=_LAZY_MODULES)
|
17
|
+
def exceptions():
|
18
|
+
"""Azure exceptions."""
|
19
|
+
from azure.core import exceptions as azure_exceptions
|
20
|
+
return azure_exceptions
|
sky/adaptors/gcp.py
CHANGED
@@ -21,8 +21,9 @@ def build(service_name: str, version: str, *args, **kwargs):
|
|
21
21
|
service_name: GCP service name (e.g., 'compute', 'storagetransfer').
|
22
22
|
version: Service version (e.g., 'v1').
|
23
23
|
"""
|
24
|
-
|
25
|
-
return discovery.build(service_name, version, *args,
|
24
|
+
|
25
|
+
return googleapiclient.discovery.build(service_name, version, *args,
|
26
|
+
**kwargs)
|
26
27
|
|
27
28
|
|
28
29
|
@common.load_lazy_modules(_LAZY_MODULES)
|