skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +64 -32
- sky/adaptors/aws.py +23 -6
- sky/adaptors/azure.py +432 -15
- sky/adaptors/cloudflare.py +5 -5
- sky/adaptors/common.py +19 -9
- sky/adaptors/do.py +20 -0
- sky/adaptors/gcp.py +3 -2
- sky/adaptors/kubernetes.py +122 -88
- sky/adaptors/nebius.py +100 -0
- sky/adaptors/oci.py +39 -1
- sky/adaptors/vast.py +29 -0
- sky/admin_policy.py +101 -0
- sky/authentication.py +117 -98
- sky/backends/backend.py +52 -20
- sky/backends/backend_utils.py +669 -557
- sky/backends/cloud_vm_ray_backend.py +1099 -808
- sky/backends/local_docker_backend.py +14 -8
- sky/backends/wheel_utils.py +38 -20
- sky/benchmark/benchmark_utils.py +22 -23
- sky/check.py +76 -27
- sky/cli.py +1586 -1139
- sky/client/__init__.py +1 -0
- sky/client/cli.py +5683 -0
- sky/client/common.py +345 -0
- sky/client/sdk.py +1765 -0
- sky/cloud_stores.py +283 -19
- sky/clouds/__init__.py +7 -2
- sky/clouds/aws.py +303 -112
- sky/clouds/azure.py +185 -179
- sky/clouds/cloud.py +115 -37
- sky/clouds/cudo.py +29 -22
- sky/clouds/do.py +313 -0
- sky/clouds/fluidstack.py +44 -54
- sky/clouds/gcp.py +206 -65
- sky/clouds/ibm.py +26 -21
- sky/clouds/kubernetes.py +345 -91
- sky/clouds/lambda_cloud.py +40 -29
- sky/clouds/nebius.py +297 -0
- sky/clouds/oci.py +129 -90
- sky/clouds/paperspace.py +22 -18
- sky/clouds/runpod.py +53 -34
- sky/clouds/scp.py +28 -24
- sky/clouds/service_catalog/__init__.py +19 -13
- sky/clouds/service_catalog/aws_catalog.py +29 -12
- sky/clouds/service_catalog/azure_catalog.py +33 -6
- sky/clouds/service_catalog/common.py +95 -75
- sky/clouds/service_catalog/constants.py +3 -3
- sky/clouds/service_catalog/cudo_catalog.py +13 -3
- sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
- sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
- sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
- sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
- sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
- sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
- sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
- sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
- sky/clouds/service_catalog/do_catalog.py +111 -0
- sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
- sky/clouds/service_catalog/gcp_catalog.py +16 -2
- sky/clouds/service_catalog/ibm_catalog.py +2 -2
- sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
- sky/clouds/service_catalog/lambda_catalog.py +8 -3
- sky/clouds/service_catalog/nebius_catalog.py +116 -0
- sky/clouds/service_catalog/oci_catalog.py +31 -4
- sky/clouds/service_catalog/paperspace_catalog.py +2 -2
- sky/clouds/service_catalog/runpod_catalog.py +2 -2
- sky/clouds/service_catalog/scp_catalog.py +2 -2
- sky/clouds/service_catalog/vast_catalog.py +104 -0
- sky/clouds/service_catalog/vsphere_catalog.py +2 -2
- sky/clouds/utils/aws_utils.py +65 -0
- sky/clouds/utils/azure_utils.py +91 -0
- sky/clouds/utils/gcp_utils.py +5 -9
- sky/clouds/utils/oci_utils.py +47 -5
- sky/clouds/utils/scp_utils.py +4 -3
- sky/clouds/vast.py +280 -0
- sky/clouds/vsphere.py +22 -18
- sky/core.py +361 -107
- sky/dag.py +41 -28
- sky/data/data_transfer.py +37 -0
- sky/data/data_utils.py +211 -32
- sky/data/mounting_utils.py +182 -30
- sky/data/storage.py +2118 -270
- sky/data/storage_utils.py +126 -5
- sky/exceptions.py +179 -8
- sky/execution.py +158 -85
- sky/global_user_state.py +150 -34
- sky/jobs/__init__.py +12 -10
- sky/jobs/client/__init__.py +0 -0
- sky/jobs/client/sdk.py +302 -0
- sky/jobs/constants.py +49 -11
- sky/jobs/controller.py +161 -99
- sky/jobs/dashboard/dashboard.py +171 -25
- sky/jobs/dashboard/templates/index.html +572 -60
- sky/jobs/recovery_strategy.py +157 -156
- sky/jobs/scheduler.py +307 -0
- sky/jobs/server/__init__.py +1 -0
- sky/jobs/server/core.py +598 -0
- sky/jobs/server/dashboard_utils.py +69 -0
- sky/jobs/server/server.py +190 -0
- sky/jobs/state.py +627 -122
- sky/jobs/utils.py +615 -206
- sky/models.py +27 -0
- sky/optimizer.py +142 -83
- sky/provision/__init__.py +20 -5
- sky/provision/aws/config.py +124 -42
- sky/provision/aws/instance.py +130 -53
- sky/provision/azure/__init__.py +7 -0
- sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
- sky/provision/azure/config.py +220 -0
- sky/provision/azure/instance.py +1012 -37
- sky/provision/common.py +31 -3
- sky/provision/constants.py +25 -0
- sky/provision/cudo/__init__.py +2 -1
- sky/provision/cudo/cudo_utils.py +112 -0
- sky/provision/cudo/cudo_wrapper.py +37 -16
- sky/provision/cudo/instance.py +28 -12
- sky/provision/do/__init__.py +11 -0
- sky/provision/do/config.py +14 -0
- sky/provision/do/constants.py +10 -0
- sky/provision/do/instance.py +287 -0
- sky/provision/do/utils.py +301 -0
- sky/provision/docker_utils.py +82 -46
- sky/provision/fluidstack/fluidstack_utils.py +57 -125
- sky/provision/fluidstack/instance.py +15 -43
- sky/provision/gcp/config.py +19 -9
- sky/provision/gcp/constants.py +7 -1
- sky/provision/gcp/instance.py +55 -34
- sky/provision/gcp/instance_utils.py +339 -80
- sky/provision/gcp/mig_utils.py +210 -0
- sky/provision/instance_setup.py +172 -133
- sky/provision/kubernetes/__init__.py +1 -0
- sky/provision/kubernetes/config.py +104 -90
- sky/provision/kubernetes/constants.py +8 -0
- sky/provision/kubernetes/instance.py +680 -325
- sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
- sky/provision/kubernetes/network.py +54 -20
- sky/provision/kubernetes/network_utils.py +70 -21
- sky/provision/kubernetes/utils.py +1370 -251
- sky/provision/lambda_cloud/__init__.py +11 -0
- sky/provision/lambda_cloud/config.py +10 -0
- sky/provision/lambda_cloud/instance.py +265 -0
- sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
- sky/provision/logging.py +1 -1
- sky/provision/nebius/__init__.py +11 -0
- sky/provision/nebius/config.py +11 -0
- sky/provision/nebius/instance.py +285 -0
- sky/provision/nebius/utils.py +318 -0
- sky/provision/oci/__init__.py +15 -0
- sky/provision/oci/config.py +51 -0
- sky/provision/oci/instance.py +436 -0
- sky/provision/oci/query_utils.py +681 -0
- sky/provision/paperspace/constants.py +6 -0
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/paperspace/utils.py +2 -0
- sky/provision/provisioner.py +207 -130
- sky/provision/runpod/__init__.py +1 -0
- sky/provision/runpod/api/__init__.py +3 -0
- sky/provision/runpod/api/commands.py +119 -0
- sky/provision/runpod/api/pods.py +142 -0
- sky/provision/runpod/instance.py +64 -8
- sky/provision/runpod/utils.py +239 -23
- sky/provision/vast/__init__.py +10 -0
- sky/provision/vast/config.py +11 -0
- sky/provision/vast/instance.py +247 -0
- sky/provision/vast/utils.py +162 -0
- sky/provision/vsphere/common/vim_utils.py +1 -1
- sky/provision/vsphere/instance.py +8 -18
- sky/provision/vsphere/vsphere_utils.py +1 -1
- sky/resources.py +247 -102
- sky/serve/__init__.py +9 -9
- sky/serve/autoscalers.py +361 -299
- sky/serve/client/__init__.py +0 -0
- sky/serve/client/sdk.py +366 -0
- sky/serve/constants.py +12 -3
- sky/serve/controller.py +106 -36
- sky/serve/load_balancer.py +63 -12
- sky/serve/load_balancing_policies.py +84 -2
- sky/serve/replica_managers.py +42 -34
- sky/serve/serve_state.py +62 -32
- sky/serve/serve_utils.py +271 -160
- sky/serve/server/__init__.py +0 -0
- sky/serve/{core.py → server/core.py} +271 -90
- sky/serve/server/server.py +112 -0
- sky/serve/service.py +52 -16
- sky/serve/service_spec.py +95 -32
- sky/server/__init__.py +1 -0
- sky/server/common.py +430 -0
- sky/server/constants.py +21 -0
- sky/server/html/log.html +174 -0
- sky/server/requests/__init__.py +0 -0
- sky/server/requests/executor.py +472 -0
- sky/server/requests/payloads.py +487 -0
- sky/server/requests/queues/__init__.py +0 -0
- sky/server/requests/queues/mp_queue.py +76 -0
- sky/server/requests/requests.py +567 -0
- sky/server/requests/serializers/__init__.py +0 -0
- sky/server/requests/serializers/decoders.py +192 -0
- sky/server/requests/serializers/encoders.py +166 -0
- sky/server/server.py +1106 -0
- sky/server/stream_utils.py +141 -0
- sky/setup_files/MANIFEST.in +2 -5
- sky/setup_files/dependencies.py +159 -0
- sky/setup_files/setup.py +14 -125
- sky/sky_logging.py +59 -14
- sky/skylet/autostop_lib.py +2 -2
- sky/skylet/constants.py +183 -50
- sky/skylet/events.py +22 -10
- sky/skylet/job_lib.py +403 -258
- sky/skylet/log_lib.py +111 -71
- sky/skylet/log_lib.pyi +6 -0
- sky/skylet/providers/command_runner.py +6 -8
- sky/skylet/providers/ibm/node_provider.py +2 -2
- sky/skylet/providers/scp/config.py +11 -3
- sky/skylet/providers/scp/node_provider.py +8 -8
- sky/skylet/skylet.py +3 -1
- sky/skylet/subprocess_daemon.py +69 -17
- sky/skypilot_config.py +119 -57
- sky/task.py +205 -64
- sky/templates/aws-ray.yml.j2 +37 -7
- sky/templates/azure-ray.yml.j2 +27 -82
- sky/templates/cudo-ray.yml.j2 +7 -3
- sky/templates/do-ray.yml.j2 +98 -0
- sky/templates/fluidstack-ray.yml.j2 +7 -4
- sky/templates/gcp-ray.yml.j2 +26 -6
- sky/templates/ibm-ray.yml.j2 +3 -2
- sky/templates/jobs-controller.yaml.j2 +46 -11
- sky/templates/kubernetes-ingress.yml.j2 +7 -0
- sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
- sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
- sky/templates/kubernetes-ray.yml.j2 +292 -25
- sky/templates/lambda-ray.yml.j2 +30 -40
- sky/templates/nebius-ray.yml.j2 +79 -0
- sky/templates/oci-ray.yml.j2 +18 -57
- sky/templates/paperspace-ray.yml.j2 +10 -6
- sky/templates/runpod-ray.yml.j2 +26 -4
- sky/templates/scp-ray.yml.j2 +3 -2
- sky/templates/sky-serve-controller.yaml.j2 +12 -1
- sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
- sky/templates/vast-ray.yml.j2 +70 -0
- sky/templates/vsphere-ray.yml.j2 +8 -3
- sky/templates/websocket_proxy.py +64 -0
- sky/usage/constants.py +10 -1
- sky/usage/usage_lib.py +130 -37
- sky/utils/accelerator_registry.py +35 -51
- sky/utils/admin_policy_utils.py +147 -0
- sky/utils/annotations.py +51 -0
- sky/utils/cli_utils/status_utils.py +81 -23
- sky/utils/cluster_utils.py +356 -0
- sky/utils/command_runner.py +452 -89
- sky/utils/command_runner.pyi +77 -3
- sky/utils/common.py +54 -0
- sky/utils/common_utils.py +319 -108
- sky/utils/config_utils.py +204 -0
- sky/utils/control_master_utils.py +48 -0
- sky/utils/controller_utils.py +548 -266
- sky/utils/dag_utils.py +93 -32
- sky/utils/db_utils.py +18 -4
- sky/utils/env_options.py +29 -7
- sky/utils/kubernetes/create_cluster.sh +8 -60
- sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
- sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
- sky/utils/kubernetes/gpu_labeler.py +4 -4
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
- sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
- sky/utils/kubernetes/rsync_helper.sh +24 -0
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
- sky/utils/log_utils.py +240 -33
- sky/utils/message_utils.py +81 -0
- sky/utils/registry.py +127 -0
- sky/utils/resources_utils.py +94 -22
- sky/utils/rich_utils.py +247 -18
- sky/utils/schemas.py +284 -64
- sky/{status_lib.py → utils/status_lib.py} +12 -7
- sky/utils/subprocess_utils.py +212 -46
- sky/utils/timeline.py +12 -7
- sky/utils/ux_utils.py +168 -15
- skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
- skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
- sky/clouds/cloud_registry.py +0 -31
- sky/jobs/core.py +0 -330
- sky/skylet/providers/azure/__init__.py +0 -2
- sky/skylet/providers/azure/azure-vm-template.json +0 -301
- sky/skylet/providers/azure/config.py +0 -170
- sky/skylet/providers/azure/node_provider.py +0 -466
- sky/skylet/providers/lambda_cloud/__init__.py +0 -2
- sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
- sky/skylet/providers/oci/__init__.py +0 -2
- sky/skylet/providers/oci/node_provider.py +0 -488
- sky/skylet/providers/oci/query_helper.py +0 -383
- sky/skylet/providers/oci/utils.py +0 -21
- sky/utils/cluster_yaml_utils.py +0 -24
- sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
- skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
- skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
sky/dag.py
CHANGED
@@ -1,8 +1,12 @@
|
|
1
1
|
"""DAGs: user applications to be run."""
|
2
2
|
import pprint
|
3
3
|
import threading
|
4
|
+
import typing
|
4
5
|
from typing import List, Optional
|
5
6
|
|
7
|
+
if typing.TYPE_CHECKING:
|
8
|
+
from sky import task
|
9
|
+
|
6
10
|
|
7
11
|
class Dag:
|
8
12
|
"""Dag: a user application, represented as a DAG of Tasks.
|
@@ -13,37 +17,38 @@ class Dag:
|
|
13
17
|
>>> task = sky.Task(...)
|
14
18
|
"""
|
15
19
|
|
16
|
-
def __init__(self):
|
17
|
-
self.tasks = []
|
20
|
+
def __init__(self) -> None:
|
21
|
+
self.tasks: List['task.Task'] = []
|
18
22
|
import networkx as nx # pylint: disable=import-outside-toplevel
|
19
23
|
|
20
24
|
self.graph = nx.DiGraph()
|
21
|
-
self.name = None
|
25
|
+
self.name: Optional[str] = None
|
26
|
+
self.policy_applied: bool = False
|
22
27
|
|
23
|
-
def add(self, task):
|
28
|
+
def add(self, task: 'task.Task') -> None:
|
24
29
|
self.graph.add_node(task)
|
25
30
|
self.tasks.append(task)
|
26
31
|
|
27
|
-
def remove(self, task):
|
32
|
+
def remove(self, task: 'task.Task') -> None:
|
28
33
|
self.tasks.remove(task)
|
29
34
|
self.graph.remove_node(task)
|
30
35
|
|
31
|
-
def add_edge(self, op1, op2):
|
36
|
+
def add_edge(self, op1: 'task.Task', op2: 'task.Task') -> None:
|
32
37
|
assert op1 in self.graph.nodes
|
33
38
|
assert op2 in self.graph.nodes
|
34
39
|
self.graph.add_edge(op1, op2)
|
35
40
|
|
36
|
-
def __len__(self):
|
41
|
+
def __len__(self) -> int:
|
37
42
|
return len(self.tasks)
|
38
43
|
|
39
|
-
def __enter__(self):
|
44
|
+
def __enter__(self) -> 'Dag':
|
40
45
|
push_dag(self)
|
41
46
|
return self
|
42
47
|
|
43
|
-
def __exit__(self, exc_type, exc_value, traceback):
|
48
|
+
def __exit__(self, exc_type, exc_value, traceback) -> None:
|
44
49
|
pop_dag()
|
45
50
|
|
46
|
-
def __repr__(self):
|
51
|
+
def __repr__(self) -> str:
|
47
52
|
pformat = pprint.pformat(self.tasks)
|
48
53
|
return f'DAG:\n{pformat}'
|
49
54
|
|
@@ -51,34 +56,42 @@ class Dag:
|
|
51
56
|
return self.graph
|
52
57
|
|
53
58
|
def is_chain(self) -> bool:
    """Report whether the tasks form a single linear chain.

    The check is purely degree-based: every node may have at most one
    incoming and at most one outgoing edge, exactly one node has no
    incoming edge (the head) and exactly one has no outgoing edge (the
    tail). An empty graph counts as a chain.

    Returns:
        True if the degree conditions above hold (or the graph is empty),
        False otherwise.
    """
    graph = self.graph
    members = list(graph.nodes)
    if not members:
        return True

    fan_in = [graph.in_degree(member) for member in members]
    fan_out = [graph.out_degree(member) for member in members]

    def _one_endpoint(degrees) -> bool:
        # No branching (every degree <= 1) and exactly one zero-degree node.
        return max(degrees) <= 1 and degrees.count(0) == 1

    return _one_endpoint(fan_out) and _one_endpoint(fan_in)
|
78
|
+
|
79
|
+
def validate(self, workdir_only: bool = False):
    """Run validation on every task registered in this DAG.

    Args:
        workdir_only: Passed through unchanged to each task's own
            ``validate`` call.
    """
    for member in self.tasks:
        member.validate(workdir_only=workdir_only)
|
69
82
|
|
70
83
|
|
71
84
|
class _DagContext(threading.local):
|
72
85
|
"""A thread-local stack of Dags."""
|
73
|
-
_current_dag = None
|
86
|
+
_current_dag: Optional[Dag] = None
|
74
87
|
_previous_dags: List[Dag] = []
|
75
88
|
|
76
|
-
def push_dag(self, dag):
|
89
|
+
def push_dag(self, dag: Dag):
|
77
90
|
if self._current_dag is not None:
|
78
91
|
self._previous_dags.append(self._current_dag)
|
79
92
|
self._current_dag = dag
|
80
93
|
|
81
|
-
def pop_dag(self):
|
94
|
+
def pop_dag(self) -> Optional[Dag]:
|
82
95
|
old_dag = self._current_dag
|
83
96
|
if self._previous_dags:
|
84
97
|
self._current_dag = self._previous_dags.pop()
|
sky/data/data_transfer.py
CHANGED
@@ -200,3 +200,40 @@ def _add_bucket_iam_member(bucket_name: str, role: str, member: str) -> None:
|
|
200
200
|
bucket.set_iam_policy(policy)
|
201
201
|
|
202
202
|
logger.debug(f'Added {member} with role {role} to {bucket_name}.')
|
203
|
+
|
204
|
+
|
205
|
+
def s3_to_oci(s3_bucket_name: str, oci_bucket_name: str) -> None:
    """Placeholder for a one-time Amazon S3 to OCI Object Storage transfer.

    The transfer is not implemented; calling this function always raises.

    Args:
        s3_bucket_name: str; Name of the Amazon S3 Bucket (currently unused)
        oci_bucket_name: str; Name of the OCI Bucket (currently unused)

    Raises:
        NotImplementedError: Always.
    """
    # TODO(HysunHe): Implement sync with other clouds (s3, gs)
    unsupported_msg = ('Moving data directly from S3 to OCI bucket '
                       'is currently not supported. Please specify '
                       'a local source for the storage object.')
    raise NotImplementedError(unsupported_msg)
|
215
|
+
|
216
|
+
|
217
|
+
def gcs_to_oci(gs_bucket_name: str, oci_bucket_name: str) -> None:
    """Placeholder for a one-time Google Cloud Storage to OCI transfer.

    The transfer is not implemented; calling this function always raises.

    Args:
        gs_bucket_name: str; Name of the Google Cloud Storage Bucket
            (currently unused)
        oci_bucket_name: str; Name of the OCI Bucket (currently unused)

    Raises:
        NotImplementedError: Always.
    """
    # TODO(HysunHe): Implement sync with other clouds (s3, gs)
    unsupported_msg = ('Moving data directly from GCS to OCI bucket '
                       'is currently not supported. Please specify '
                       'a local source for the storage object.')
    raise NotImplementedError(unsupported_msg)
|
228
|
+
|
229
|
+
|
230
|
+
def r2_to_oci(r2_bucket_name: str, oci_bucket_name: str) -> None:
    """Placeholder for a one-time Cloudflare R2 to OCI Bucket transfer.

    The transfer is not implemented; calling this function always raises.

    Args:
        r2_bucket_name: str; Name of the Cloudflare R2 Bucket
            (currently unused)
        oci_bucket_name: str; Name of the OCI Bucket (currently unused)

    Raises:
        NotImplementedError: Always.
    """
    unsupported_msg = ('Moving data directly from Cloudflare R2 to OCI '
                       'bucket is currently not supported. Please specify '
                       'a local source for the storage object.')
    raise NotImplementedError(unsupported_msg)
|
sky/data/data_utils.py
CHANGED
@@ -7,6 +7,7 @@ import os
|
|
7
7
|
import re
|
8
8
|
import subprocess
|
9
9
|
import textwrap
|
10
|
+
import time
|
10
11
|
from typing import Any, Callable, Dict, List, Optional, Tuple
|
11
12
|
import urllib.parse
|
12
13
|
|
@@ -15,15 +16,25 @@ from filelock import FileLock
|
|
15
16
|
from sky import exceptions
|
16
17
|
from sky import sky_logging
|
17
18
|
from sky.adaptors import aws
|
19
|
+
from sky.adaptors import azure
|
18
20
|
from sky.adaptors import cloudflare
|
19
21
|
from sky.adaptors import gcp
|
20
22
|
from sky.adaptors import ibm
|
23
|
+
from sky.skylet import log_lib
|
24
|
+
from sky.utils import common_utils
|
21
25
|
from sky.utils import ux_utils
|
22
26
|
|
23
27
|
Client = Any
|
24
28
|
|
25
29
|
logger = sky_logging.init_logger(__name__)
|
26
30
|
|
31
|
+
AZURE_CONTAINER_URL = (
|
32
|
+
'https://{storage_account_name}.blob.core.windows.net/{container_name}')
|
33
|
+
|
34
|
+
# Retry 5 times by default for delayed propagation to Azure system
|
35
|
+
# when creating Storage Account.
|
36
|
+
_STORAGE_ACCOUNT_KEY_RETRIEVE_MAX_ATTEMPT = 5
|
37
|
+
|
27
38
|
|
28
39
|
def split_s3_path(s3_path: str) -> Tuple[str, str]:
|
29
40
|
"""Splits S3 Path into Bucket name and Relative Path to Bucket
|
@@ -49,6 +60,28 @@ def split_gcs_path(gcs_path: str) -> Tuple[str, str]:
|
|
49
60
|
return bucket, key
|
50
61
|
|
51
62
|
|
63
|
+
def split_az_path(az_path: str) -> Tuple[str, str, str]:
    """Break an Azure container URL into account, container and inner path.

    Args:
        az_path: Container Path,
            e.g. https://azureopendatastorage.blob.core.windows.net/nyctlc

    Returns:
        str: Name of the storage account (the first dot-separated label of
            the service endpoint)
        str: Name of the container
        str: Path of the file/directory within the container; empty when
            nothing follows the container name

    Raises:
        IndexError: If the URL has no container segment after the endpoint.
    """
    segments = az_path.replace('https://', '').split('/')
    # segments[0] is the service endpoint, '<account>.blob.core.windows.net'.
    account = segments[0].split('.')[0]
    container = segments[1]
    inner_path = '/'.join(segments[2:])
    return account, container, inner_path
|
83
|
+
|
84
|
+
|
52
85
|
def split_r2_path(r2_path: str) -> Tuple[str, str]:
|
53
86
|
"""Splits R2 Path into Bucket name and Relative Path to Bucket
|
54
87
|
|
@@ -126,6 +159,145 @@ def verify_gcs_bucket(name: str) -> bool:
|
|
126
159
|
return False
|
127
160
|
|
128
161
|
|
162
|
+
def create_az_client(client_type: str, **kwargs: Any) -> Client:
    """Build an Azure client of the requested type via the azure adaptor.

    Args:
        client_type: str; specify client type, e.g. storage, resource, container
        **kwargs: Optional settings read from here: ``resource_group_name``,
            ``container_url``, ``storage_account_name`` (all default to None)
            and ``refresh_client`` (default False). When ``refresh_client``
            is truthy, ``azure.get_client.cache_clear()`` is called before
            the client is requested.

    Returns:
        Client object facing AZ Resource of the 'client_type'.

    Raises:
        AssertionError: If ``client_type`` is 'container' and either
            ``container_url`` or ``storage_account_name`` is missing.
    """
    refresh_client = kwargs.pop('refresh_client', False)
    storage_account_name = kwargs.pop('storage_account_name', None)
    container_url = kwargs.pop('container_url', None)
    resource_group_name = kwargs.pop('resource_group_name', None)

    if client_type == 'container':
        # resource_group_name is deliberately not asserted: it is None when
        # the container_url points at a public container with user access.
        assert container_url is not None, ('container_url must be provided for '
                                           'container client')
        assert storage_account_name is not None, ('storage_account_name must '
                                                  'be provided for container '
                                                  'client')

    if refresh_client:
        azure.get_client.cache_clear()

    return azure.get_client(client_type,
                            azure.get_subscription_id(),
                            container_url=container_url,
                            storage_account_name=storage_account_name,
                            resource_group_name=resource_group_name)
|
194
|
+
|
195
|
+
|
196
|
+
def verify_az_bucket(storage_account_name: str, container_name: str) -> bool:
    """Check whether an Azure container exists.

    Args:
        storage_account_name: str; Name of the storage account
        container_name: str; Name of the container

    Returns:
        True if the container exists, False otherwise (the result of the
        container client's ``exists()`` call).
    """
    client_options = {
        'container_url': AZURE_CONTAINER_URL.format(
            storage_account_name=storage_account_name,
            container_name=container_name),
        'storage_account_name': storage_account_name,
        'resource_group_name': azure.get_az_resource_group(
            storage_account_name),
    }
    return create_az_client(client_type='container', **client_options).exists()
|
216
|
+
|
217
|
+
|
218
|
+
def get_az_storage_account_key(
    storage_account_name: str,
    resource_group_name: Optional[str] = None,
    storage_client: Optional[Client] = None,
    resource_client: Optional[Client] = None,
) -> Optional[str]:
    """Returns access key of the given name of storage account.

    Args:
        storage_account_name: Name of the storage account
        resource_group_name: Name of the resource group the
            passed storage account belongs to. Looked up through the azure
            adaptor when omitted.
        storage_client: Client object facing Storage. Created on demand
            when omitted.
        resource_client: Client object facing Resource. Created on demand
            when omitted.

    Returns:
        One of the two access keys to the given storage account, or None if
        no resource group could be determined for the account.

    Raises:
        RuntimeError: If the storage account still cannot be found in the
            resource group after the initial lookup plus
            _STORAGE_ACCOUNT_KEY_RETRIEVE_MAX_ATTEMPT retries.
    """
    if resource_client is None:
        resource_client = create_az_client('resource')
    if storage_client is None:
        storage_client = create_az_client('storage')
    if resource_group_name is None:
        resource_group_name = azure.get_az_resource_group(
            storage_account_name, storage_client)
    # resource_group_name is None when using a public container or
    # a private container not belonging to the user.
    if resource_group_name is None:
        return None

    attempt = 0
    backoff = common_utils.Backoff()
    while True:
        storage_account_keys = None
        resources = resource_client.resources.list_by_resource_group(
            resource_group_name)
        # resource group is either created or read when Storage initializes.
        assert resources is not None
        for resource in resources:
            if (resource.type == 'Microsoft.Storage/storageAccounts' and
                    resource.name == storage_account_name):
                # At most one resource is expected to match.
                assert storage_account_keys is None
                keys = storage_client.storage_accounts.list_keys(
                    resource_group_name, storage_account_name)
                storage_account_keys = [key.value for key in keys.keys]
        if storage_account_keys is not None:
            # Azure provides two sets of working storage account keys and we
            # use one of them.
            return storage_account_keys[0]

        # If storage account was created right before call to this method,
        # it is possible to fail to retrieve the key as the creation did not
        # propagate to Azure yet. We retry several times.
        attempt += 1
        # Give up before sleeping: waiting out one more backoff interval
        # only to raise afterwards would delay the error for no benefit.
        if attempt > _STORAGE_ACCOUNT_KEY_RETRIEVE_MAX_ATTEMPT:
            raise RuntimeError('Failed to obtain key value of storage '
                               f'account {storage_account_name!r}. '
                               'Check if the storage account was created.')
        time.sleep(backoff.current_backoff())
|
279
|
+
|
280
|
+
|
281
|
+
def is_az_container_endpoint(endpoint_url: str) -> bool:
    """Tell whether a URL has the shape of an Azure blob container endpoint.

    Args:
        endpoint_url: Url of container endpoint.
            e.g. https://azureopendatastorage.blob.core.windows.net/nyctlc

    Returns:
        bool: True if the endpoint is valid, False otherwise.
    """
    # The storage account label is restricted to 3-24 characters of
    # lowercase letters and digits.
    # Reference: https://learn.microsoft.com/en-us/azure/azure-resource-manager/management/resource-name-rules#microsoftstorage # pylint: disable=line-too-long
    endpoint_regex = (
        r'^https://([a-z0-9]{3,24})\.blob\.core\.windows\.net(/[^/]+)*$')
    return re.match(endpoint_regex, endpoint_url) is not None
|
299
|
+
|
300
|
+
|
129
301
|
def create_r2_client(region: str = 'auto') -> Client:
|
130
302
|
"""Helper method that connects to Boto3 client for R2 Bucket
|
131
303
|
|
@@ -259,6 +431,7 @@ def _group_files_by_dir(
|
|
259
431
|
def parallel_upload(source_path_list: List[str],
|
260
432
|
filesync_command_generator: Callable[[str, List[str]], str],
|
261
433
|
dirsync_command_generator: Callable[[str, str], str],
|
434
|
+
log_path: str,
|
262
435
|
bucket_name: str,
|
263
436
|
access_denied_message: str,
|
264
437
|
create_dirs: bool = False,
|
@@ -274,6 +447,7 @@ def parallel_upload(source_path_list: List[str],
|
|
274
447
|
for a list of files belonging to the same dir.
|
275
448
|
dirsync_command_generator: Callable that generates rsync command
|
276
449
|
for a directory.
|
450
|
+
log_path: Path to the log file.
|
277
451
|
access_denied_message: Message to intercept from the underlying
|
278
452
|
upload utility when permissions are insufficient. Used in
|
279
453
|
exception handling.
|
@@ -306,7 +480,7 @@ def parallel_upload(source_path_list: List[str],
|
|
306
480
|
p.starmap(
|
307
481
|
run_upload_cli,
|
308
482
|
zip(commands, [access_denied_message] * len(commands),
|
309
|
-
[bucket_name] * len(commands)))
|
483
|
+
[bucket_name] * len(commands), [log_path] * len(commands)))
|
310
484
|
|
311
485
|
|
312
486
|
def get_gsutil_command() -> Tuple[str, str]:
|
@@ -347,37 +521,31 @@ def get_gsutil_command() -> Tuple[str, str]:
|
|
347
521
|
return gsutil_alias, alias_gen
|
348
522
|
|
349
523
|
|
350
|
-
def run_upload_cli(command: str, access_denied_message: str, bucket_name: str
|
351
|
-
|
352
|
-
|
353
|
-
|
354
|
-
|
355
|
-
|
356
|
-
|
357
|
-
|
358
|
-
|
359
|
-
|
360
|
-
|
361
|
-
|
362
|
-
|
363
|
-
|
364
|
-
|
365
|
-
|
366
|
-
|
367
|
-
|
368
|
-
|
369
|
-
|
370
|
-
|
371
|
-
|
372
|
-
|
373
|
-
|
374
|
-
|
375
|
-
stderr_str = '\n'.join(stderr)
|
376
|
-
with ux_utils.print_exception_no_traceback():
|
377
|
-
logger.error(stderr_str)
|
378
|
-
raise exceptions.StorageUploadError(
|
379
|
-
f'Upload to bucket failed for store {bucket_name}. '
|
380
|
-
'Please check the logs.')
|
524
|
+
def run_upload_cli(command: str, access_denied_message: str, bucket_name: str,
                   log_path: str):
    """Run one upload shell command and translate failures into exceptions.

    Args:
        command: Shell command string to execute.
        access_denied_message: Substring searched for in the command's
            stderr to detect insufficient write permissions.
        bucket_name: Name reported in the upload-failure error message.
        log_path: Path to the log file handed to ``log_lib.run_with_log``.

    Raises:
        PermissionError: If ``access_denied_message`` occurs in stderr.
        exceptions.StorageUploadError: If the command exits non-zero.
    """
    # Run under bash: some of the cloud commands use bash syntax,
    # such as [[ ... ]]
    exit_code, out, err = log_lib.run_with_log(command,
                                               log_path,
                                               shell=True,
                                               require_outputs=True,
                                               executable='/bin/bash')

    # The permission check runs first, regardless of the exit code.
    if access_denied_message in err:
        with ux_utils.print_exception_no_traceback():
            raise PermissionError('Failed to upload files to '
                                  'the remote bucket. The bucket does not have '
                                  'write permissions. It is possible that '
                                  'the bucket is public.')

    upload_failed = exit_code != 0
    if upload_failed:
        with ux_utils.print_exception_no_traceback():
            logger.error(err)
            raise exceptions.StorageUploadError(
                f'Upload to bucket failed for store {bucket_name}. '
                f'Please check the logs: {log_path}')

    if not out:
        logger.debug('No file uploaded. This could be due to an error or '
                     'because all files already exist on the cloud.')
|
381
549
|
|
382
550
|
|
383
551
|
def get_cos_regions() -> List[str]:
|
@@ -566,3 +734,14 @@ class Rclone():
|
|
566
734
|
lines_to_keep.append(line)
|
567
735
|
|
568
736
|
return lines_to_keep
|
737
|
+
|
738
|
+
|
739
|
+
def split_oci_path(oci_path: str) -> Tuple[str, str]:
    """Separate an OCI path into its bucket name and the key inside it.

    Args:
        oci_path: str; OCI Path, e.g. oci://imagenet/train/

    Returns:
        A (bucket, key) tuple; key is empty when nothing follows the
        bucket name.
    """
    without_scheme = oci_path.replace('oci://', '')
    # Everything before the first '/' is the bucket; the rest is the key.
    bucket, _, key = without_scheme.partition('/')
    return bucket, key
|