skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (299) hide show
  1. sky/__init__.py +64 -32
  2. sky/adaptors/aws.py +23 -6
  3. sky/adaptors/azure.py +432 -15
  4. sky/adaptors/cloudflare.py +5 -5
  5. sky/adaptors/common.py +19 -9
  6. sky/adaptors/do.py +20 -0
  7. sky/adaptors/gcp.py +3 -2
  8. sky/adaptors/kubernetes.py +122 -88
  9. sky/adaptors/nebius.py +100 -0
  10. sky/adaptors/oci.py +39 -1
  11. sky/adaptors/vast.py +29 -0
  12. sky/admin_policy.py +101 -0
  13. sky/authentication.py +117 -98
  14. sky/backends/backend.py +52 -20
  15. sky/backends/backend_utils.py +669 -557
  16. sky/backends/cloud_vm_ray_backend.py +1099 -808
  17. sky/backends/local_docker_backend.py +14 -8
  18. sky/backends/wheel_utils.py +38 -20
  19. sky/benchmark/benchmark_utils.py +22 -23
  20. sky/check.py +76 -27
  21. sky/cli.py +1586 -1139
  22. sky/client/__init__.py +1 -0
  23. sky/client/cli.py +5683 -0
  24. sky/client/common.py +345 -0
  25. sky/client/sdk.py +1765 -0
  26. sky/cloud_stores.py +283 -19
  27. sky/clouds/__init__.py +7 -2
  28. sky/clouds/aws.py +303 -112
  29. sky/clouds/azure.py +185 -179
  30. sky/clouds/cloud.py +115 -37
  31. sky/clouds/cudo.py +29 -22
  32. sky/clouds/do.py +313 -0
  33. sky/clouds/fluidstack.py +44 -54
  34. sky/clouds/gcp.py +206 -65
  35. sky/clouds/ibm.py +26 -21
  36. sky/clouds/kubernetes.py +345 -91
  37. sky/clouds/lambda_cloud.py +40 -29
  38. sky/clouds/nebius.py +297 -0
  39. sky/clouds/oci.py +129 -90
  40. sky/clouds/paperspace.py +22 -18
  41. sky/clouds/runpod.py +53 -34
  42. sky/clouds/scp.py +28 -24
  43. sky/clouds/service_catalog/__init__.py +19 -13
  44. sky/clouds/service_catalog/aws_catalog.py +29 -12
  45. sky/clouds/service_catalog/azure_catalog.py +33 -6
  46. sky/clouds/service_catalog/common.py +95 -75
  47. sky/clouds/service_catalog/constants.py +3 -3
  48. sky/clouds/service_catalog/cudo_catalog.py +13 -3
  49. sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
  50. sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
  51. sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
  52. sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
  53. sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
  54. sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
  55. sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
  56. sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
  57. sky/clouds/service_catalog/do_catalog.py +111 -0
  58. sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
  59. sky/clouds/service_catalog/gcp_catalog.py +16 -2
  60. sky/clouds/service_catalog/ibm_catalog.py +2 -2
  61. sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
  62. sky/clouds/service_catalog/lambda_catalog.py +8 -3
  63. sky/clouds/service_catalog/nebius_catalog.py +116 -0
  64. sky/clouds/service_catalog/oci_catalog.py +31 -4
  65. sky/clouds/service_catalog/paperspace_catalog.py +2 -2
  66. sky/clouds/service_catalog/runpod_catalog.py +2 -2
  67. sky/clouds/service_catalog/scp_catalog.py +2 -2
  68. sky/clouds/service_catalog/vast_catalog.py +104 -0
  69. sky/clouds/service_catalog/vsphere_catalog.py +2 -2
  70. sky/clouds/utils/aws_utils.py +65 -0
  71. sky/clouds/utils/azure_utils.py +91 -0
  72. sky/clouds/utils/gcp_utils.py +5 -9
  73. sky/clouds/utils/oci_utils.py +47 -5
  74. sky/clouds/utils/scp_utils.py +4 -3
  75. sky/clouds/vast.py +280 -0
  76. sky/clouds/vsphere.py +22 -18
  77. sky/core.py +361 -107
  78. sky/dag.py +41 -28
  79. sky/data/data_transfer.py +37 -0
  80. sky/data/data_utils.py +211 -32
  81. sky/data/mounting_utils.py +182 -30
  82. sky/data/storage.py +2118 -270
  83. sky/data/storage_utils.py +126 -5
  84. sky/exceptions.py +179 -8
  85. sky/execution.py +158 -85
  86. sky/global_user_state.py +150 -34
  87. sky/jobs/__init__.py +12 -10
  88. sky/jobs/client/__init__.py +0 -0
  89. sky/jobs/client/sdk.py +302 -0
  90. sky/jobs/constants.py +49 -11
  91. sky/jobs/controller.py +161 -99
  92. sky/jobs/dashboard/dashboard.py +171 -25
  93. sky/jobs/dashboard/templates/index.html +572 -60
  94. sky/jobs/recovery_strategy.py +157 -156
  95. sky/jobs/scheduler.py +307 -0
  96. sky/jobs/server/__init__.py +1 -0
  97. sky/jobs/server/core.py +598 -0
  98. sky/jobs/server/dashboard_utils.py +69 -0
  99. sky/jobs/server/server.py +190 -0
  100. sky/jobs/state.py +627 -122
  101. sky/jobs/utils.py +615 -206
  102. sky/models.py +27 -0
  103. sky/optimizer.py +142 -83
  104. sky/provision/__init__.py +20 -5
  105. sky/provision/aws/config.py +124 -42
  106. sky/provision/aws/instance.py +130 -53
  107. sky/provision/azure/__init__.py +7 -0
  108. sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
  109. sky/provision/azure/config.py +220 -0
  110. sky/provision/azure/instance.py +1012 -37
  111. sky/provision/common.py +31 -3
  112. sky/provision/constants.py +25 -0
  113. sky/provision/cudo/__init__.py +2 -1
  114. sky/provision/cudo/cudo_utils.py +112 -0
  115. sky/provision/cudo/cudo_wrapper.py +37 -16
  116. sky/provision/cudo/instance.py +28 -12
  117. sky/provision/do/__init__.py +11 -0
  118. sky/provision/do/config.py +14 -0
  119. sky/provision/do/constants.py +10 -0
  120. sky/provision/do/instance.py +287 -0
  121. sky/provision/do/utils.py +301 -0
  122. sky/provision/docker_utils.py +82 -46
  123. sky/provision/fluidstack/fluidstack_utils.py +57 -125
  124. sky/provision/fluidstack/instance.py +15 -43
  125. sky/provision/gcp/config.py +19 -9
  126. sky/provision/gcp/constants.py +7 -1
  127. sky/provision/gcp/instance.py +55 -34
  128. sky/provision/gcp/instance_utils.py +339 -80
  129. sky/provision/gcp/mig_utils.py +210 -0
  130. sky/provision/instance_setup.py +172 -133
  131. sky/provision/kubernetes/__init__.py +1 -0
  132. sky/provision/kubernetes/config.py +104 -90
  133. sky/provision/kubernetes/constants.py +8 -0
  134. sky/provision/kubernetes/instance.py +680 -325
  135. sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
  136. sky/provision/kubernetes/network.py +54 -20
  137. sky/provision/kubernetes/network_utils.py +70 -21
  138. sky/provision/kubernetes/utils.py +1370 -251
  139. sky/provision/lambda_cloud/__init__.py +11 -0
  140. sky/provision/lambda_cloud/config.py +10 -0
  141. sky/provision/lambda_cloud/instance.py +265 -0
  142. sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
  143. sky/provision/logging.py +1 -1
  144. sky/provision/nebius/__init__.py +11 -0
  145. sky/provision/nebius/config.py +11 -0
  146. sky/provision/nebius/instance.py +285 -0
  147. sky/provision/nebius/utils.py +318 -0
  148. sky/provision/oci/__init__.py +15 -0
  149. sky/provision/oci/config.py +51 -0
  150. sky/provision/oci/instance.py +436 -0
  151. sky/provision/oci/query_utils.py +681 -0
  152. sky/provision/paperspace/constants.py +6 -0
  153. sky/provision/paperspace/instance.py +4 -3
  154. sky/provision/paperspace/utils.py +2 -0
  155. sky/provision/provisioner.py +207 -130
  156. sky/provision/runpod/__init__.py +1 -0
  157. sky/provision/runpod/api/__init__.py +3 -0
  158. sky/provision/runpod/api/commands.py +119 -0
  159. sky/provision/runpod/api/pods.py +142 -0
  160. sky/provision/runpod/instance.py +64 -8
  161. sky/provision/runpod/utils.py +239 -23
  162. sky/provision/vast/__init__.py +10 -0
  163. sky/provision/vast/config.py +11 -0
  164. sky/provision/vast/instance.py +247 -0
  165. sky/provision/vast/utils.py +162 -0
  166. sky/provision/vsphere/common/vim_utils.py +1 -1
  167. sky/provision/vsphere/instance.py +8 -18
  168. sky/provision/vsphere/vsphere_utils.py +1 -1
  169. sky/resources.py +247 -102
  170. sky/serve/__init__.py +9 -9
  171. sky/serve/autoscalers.py +361 -299
  172. sky/serve/client/__init__.py +0 -0
  173. sky/serve/client/sdk.py +366 -0
  174. sky/serve/constants.py +12 -3
  175. sky/serve/controller.py +106 -36
  176. sky/serve/load_balancer.py +63 -12
  177. sky/serve/load_balancing_policies.py +84 -2
  178. sky/serve/replica_managers.py +42 -34
  179. sky/serve/serve_state.py +62 -32
  180. sky/serve/serve_utils.py +271 -160
  181. sky/serve/server/__init__.py +0 -0
  182. sky/serve/{core.py → server/core.py} +271 -90
  183. sky/serve/server/server.py +112 -0
  184. sky/serve/service.py +52 -16
  185. sky/serve/service_spec.py +95 -32
  186. sky/server/__init__.py +1 -0
  187. sky/server/common.py +430 -0
  188. sky/server/constants.py +21 -0
  189. sky/server/html/log.html +174 -0
  190. sky/server/requests/__init__.py +0 -0
  191. sky/server/requests/executor.py +472 -0
  192. sky/server/requests/payloads.py +487 -0
  193. sky/server/requests/queues/__init__.py +0 -0
  194. sky/server/requests/queues/mp_queue.py +76 -0
  195. sky/server/requests/requests.py +567 -0
  196. sky/server/requests/serializers/__init__.py +0 -0
  197. sky/server/requests/serializers/decoders.py +192 -0
  198. sky/server/requests/serializers/encoders.py +166 -0
  199. sky/server/server.py +1106 -0
  200. sky/server/stream_utils.py +141 -0
  201. sky/setup_files/MANIFEST.in +2 -5
  202. sky/setup_files/dependencies.py +159 -0
  203. sky/setup_files/setup.py +14 -125
  204. sky/sky_logging.py +59 -14
  205. sky/skylet/autostop_lib.py +2 -2
  206. sky/skylet/constants.py +183 -50
  207. sky/skylet/events.py +22 -10
  208. sky/skylet/job_lib.py +403 -258
  209. sky/skylet/log_lib.py +111 -71
  210. sky/skylet/log_lib.pyi +6 -0
  211. sky/skylet/providers/command_runner.py +6 -8
  212. sky/skylet/providers/ibm/node_provider.py +2 -2
  213. sky/skylet/providers/scp/config.py +11 -3
  214. sky/skylet/providers/scp/node_provider.py +8 -8
  215. sky/skylet/skylet.py +3 -1
  216. sky/skylet/subprocess_daemon.py +69 -17
  217. sky/skypilot_config.py +119 -57
  218. sky/task.py +205 -64
  219. sky/templates/aws-ray.yml.j2 +37 -7
  220. sky/templates/azure-ray.yml.j2 +27 -82
  221. sky/templates/cudo-ray.yml.j2 +7 -3
  222. sky/templates/do-ray.yml.j2 +98 -0
  223. sky/templates/fluidstack-ray.yml.j2 +7 -4
  224. sky/templates/gcp-ray.yml.j2 +26 -6
  225. sky/templates/ibm-ray.yml.j2 +3 -2
  226. sky/templates/jobs-controller.yaml.j2 +46 -11
  227. sky/templates/kubernetes-ingress.yml.j2 +7 -0
  228. sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
  229. sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
  230. sky/templates/kubernetes-ray.yml.j2 +292 -25
  231. sky/templates/lambda-ray.yml.j2 +30 -40
  232. sky/templates/nebius-ray.yml.j2 +79 -0
  233. sky/templates/oci-ray.yml.j2 +18 -57
  234. sky/templates/paperspace-ray.yml.j2 +10 -6
  235. sky/templates/runpod-ray.yml.j2 +26 -4
  236. sky/templates/scp-ray.yml.j2 +3 -2
  237. sky/templates/sky-serve-controller.yaml.j2 +12 -1
  238. sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
  239. sky/templates/vast-ray.yml.j2 +70 -0
  240. sky/templates/vsphere-ray.yml.j2 +8 -3
  241. sky/templates/websocket_proxy.py +64 -0
  242. sky/usage/constants.py +10 -1
  243. sky/usage/usage_lib.py +130 -37
  244. sky/utils/accelerator_registry.py +35 -51
  245. sky/utils/admin_policy_utils.py +147 -0
  246. sky/utils/annotations.py +51 -0
  247. sky/utils/cli_utils/status_utils.py +81 -23
  248. sky/utils/cluster_utils.py +356 -0
  249. sky/utils/command_runner.py +452 -89
  250. sky/utils/command_runner.pyi +77 -3
  251. sky/utils/common.py +54 -0
  252. sky/utils/common_utils.py +319 -108
  253. sky/utils/config_utils.py +204 -0
  254. sky/utils/control_master_utils.py +48 -0
  255. sky/utils/controller_utils.py +548 -266
  256. sky/utils/dag_utils.py +93 -32
  257. sky/utils/db_utils.py +18 -4
  258. sky/utils/env_options.py +29 -7
  259. sky/utils/kubernetes/create_cluster.sh +8 -60
  260. sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
  261. sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
  262. sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
  263. sky/utils/kubernetes/gpu_labeler.py +4 -4
  264. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
  265. sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
  266. sky/utils/kubernetes/rsync_helper.sh +24 -0
  267. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
  268. sky/utils/log_utils.py +240 -33
  269. sky/utils/message_utils.py +81 -0
  270. sky/utils/registry.py +127 -0
  271. sky/utils/resources_utils.py +94 -22
  272. sky/utils/rich_utils.py +247 -18
  273. sky/utils/schemas.py +284 -64
  274. sky/{status_lib.py → utils/status_lib.py} +12 -7
  275. sky/utils/subprocess_utils.py +212 -46
  276. sky/utils/timeline.py +12 -7
  277. sky/utils/ux_utils.py +168 -15
  278. skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
  279. skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
  280. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
  281. sky/clouds/cloud_registry.py +0 -31
  282. sky/jobs/core.py +0 -330
  283. sky/skylet/providers/azure/__init__.py +0 -2
  284. sky/skylet/providers/azure/azure-vm-template.json +0 -301
  285. sky/skylet/providers/azure/config.py +0 -170
  286. sky/skylet/providers/azure/node_provider.py +0 -466
  287. sky/skylet/providers/lambda_cloud/__init__.py +0 -2
  288. sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
  289. sky/skylet/providers/oci/__init__.py +0 -2
  290. sky/skylet/providers/oci/node_provider.py +0 -488
  291. sky/skylet/providers/oci/query_helper.py +0 -383
  292. sky/skylet/providers/oci/utils.py +0 -21
  293. sky/utils/cluster_yaml_utils.py +0 -24
  294. sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
  295. skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
  296. skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
  297. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
  298. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
  299. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
sky/dag.py CHANGED
@@ -1,8 +1,12 @@
1
1
  """DAGs: user applications to be run."""
2
2
  import pprint
3
3
  import threading
4
+ import typing
4
5
  from typing import List, Optional
5
6
 
7
+ if typing.TYPE_CHECKING:
8
+ from sky import task
9
+
6
10
 
7
11
  class Dag:
8
12
  """Dag: a user application, represented as a DAG of Tasks.
@@ -13,37 +17,38 @@ class Dag:
13
17
  >>> task = sky.Task(...)
14
18
  """
15
19
 
16
- def __init__(self):
17
- self.tasks = []
20
+ def __init__(self) -> None:
21
+ self.tasks: List['task.Task'] = []
18
22
  import networkx as nx # pylint: disable=import-outside-toplevel
19
23
 
20
24
  self.graph = nx.DiGraph()
21
- self.name = None
25
+ self.name: Optional[str] = None
26
+ self.policy_applied: bool = False
22
27
 
23
- def add(self, task):
28
+ def add(self, task: 'task.Task') -> None:
24
29
  self.graph.add_node(task)
25
30
  self.tasks.append(task)
26
31
 
27
- def remove(self, task):
32
+ def remove(self, task: 'task.Task') -> None:
28
33
  self.tasks.remove(task)
29
34
  self.graph.remove_node(task)
30
35
 
31
- def add_edge(self, op1, op2):
36
+ def add_edge(self, op1: 'task.Task', op2: 'task.Task') -> None:
32
37
  assert op1 in self.graph.nodes
33
38
  assert op2 in self.graph.nodes
34
39
  self.graph.add_edge(op1, op2)
35
40
 
36
- def __len__(self):
41
+ def __len__(self) -> int:
37
42
  return len(self.tasks)
38
43
 
39
- def __enter__(self):
44
+ def __enter__(self) -> 'Dag':
40
45
  push_dag(self)
41
46
  return self
42
47
 
43
- def __exit__(self, exc_type, exc_value, traceback):
48
+ def __exit__(self, exc_type, exc_value, traceback) -> None:
44
49
  pop_dag()
45
50
 
46
- def __repr__(self):
51
+ def __repr__(self) -> str:
47
52
  pformat = pprint.pformat(self.tasks)
48
53
  return f'DAG:\n{pformat}'
49
54
 
@@ -51,34 +56,42 @@ class Dag:
51
56
  return self.graph
52
57
 
53
58
  def is_chain(self) -> bool:
54
- # NOTE: this method assumes that the graph has no cycle.
55
- is_chain = True
56
- visited_zero_out_degree = False
57
- for node in self.graph.nodes:
58
- out_degree = self.graph.out_degree(node)
59
- if out_degree > 1:
60
- is_chain = False
61
- break
62
- elif out_degree == 0:
63
- if visited_zero_out_degree:
64
- is_chain = False
65
- break
66
- else:
67
- visited_zero_out_degree = True
68
- return is_chain
59
+ """Check if the DAG is a linear chain of tasks."""
60
+
61
+ nodes = list(self.graph.nodes)
62
+
63
+ if len(nodes) == 0:
64
+ return True
65
+
66
+ in_degrees = [self.graph.in_degree(node) for node in nodes]
67
+ out_degrees = [self.graph.out_degree(node) for node in nodes]
68
+
69
+ # Check out-degrees: all <= 1 and exactly one node has out_degree == 0
70
+ out_degree_condition = (all(degree <= 1 for degree in out_degrees) and
71
+ sum(degree == 0 for degree in out_degrees) == 1)
72
+
73
+ # Check in-degrees: all <= 1 and exactly one node has in_degree == 0
74
+ in_degree_condition = (all(degree <= 1 for degree in in_degrees) and
75
+ sum(degree == 0 for degree in in_degrees) == 1)
76
+
77
+ return out_degree_condition and in_degree_condition
78
+
79
+ def validate(self, workdir_only: bool = False):
80
+ for task in self.tasks:
81
+ task.validate(workdir_only=workdir_only)
69
82
 
70
83
 
71
84
  class _DagContext(threading.local):
72
85
  """A thread-local stack of Dags."""
73
- _current_dag = None
86
+ _current_dag: Optional[Dag] = None
74
87
  _previous_dags: List[Dag] = []
75
88
 
76
- def push_dag(self, dag):
89
+ def push_dag(self, dag: Dag):
77
90
  if self._current_dag is not None:
78
91
  self._previous_dags.append(self._current_dag)
79
92
  self._current_dag = dag
80
93
 
81
- def pop_dag(self):
94
+ def pop_dag(self) -> Optional[Dag]:
82
95
  old_dag = self._current_dag
83
96
  if self._previous_dags:
84
97
  self._current_dag = self._previous_dags.pop()
sky/data/data_transfer.py CHANGED
@@ -200,3 +200,40 @@ def _add_bucket_iam_member(bucket_name: str, role: str, member: str) -> None:
200
200
  bucket.set_iam_policy(policy)
201
201
 
202
202
  logger.debug(f'Added {member} with role {role} to {bucket_name}.')
203
+
204
+
205
+ def s3_to_oci(s3_bucket_name: str, oci_bucket_name: str) -> None:
206
+ """Creates a one-time transfer from Amazon S3 to OCI Object Storage.
207
+ Args:
208
+ s3_bucket_name: str; Name of the Amazon S3 Bucket
209
+ oci_bucket_name: str; Name of the OCI Bucket
210
+ """
211
+ # TODO(HysunHe): Implement sync with other clouds (s3, gs)
212
+ raise NotImplementedError('Moving data directly from S3 to OCI bucket '
213
+ 'is currently not supported. Please specify '
214
+ 'a local source for the storage object.')
215
+
216
+
217
+ def gcs_to_oci(gs_bucket_name: str, oci_bucket_name: str) -> None:
218
+ """Creates a one-time transfer from Google Cloud Storage to
219
+ OCI Object Storage.
220
+ Args:
221
+ gs_bucket_name: str; Name of the Google Cloud Storage Bucket
222
+ oci_bucket_name: str; Name of the OCI Bucket
223
+ """
224
+ # TODO(HysunHe): Implement sync with other clouds (s3, gs)
225
+ raise NotImplementedError('Moving data directly from GCS to OCI bucket '
226
+ 'is currently not supported. Please specify '
227
+ 'a local source for the storage object.')
228
+
229
+
230
+ def r2_to_oci(r2_bucket_name: str, oci_bucket_name: str) -> None:
231
+ """Creates a one-time transfer from Cloudflare R2 to OCI Bucket.
232
+ Args:
233
+ r2_bucket_name: str; Name of the Cloudflare R2 Bucket
234
+ oci_bucket_name: str; Name of the OCI Bucket
235
+ """
236
+ raise NotImplementedError(
237
+ 'Moving data directly from Cloudflare R2 to OCI '
238
+ 'bucket is currently not supported. Please specify '
239
+ 'a local source for the storage object.')
sky/data/data_utils.py CHANGED
@@ -7,6 +7,7 @@ import os
7
7
  import re
8
8
  import subprocess
9
9
  import textwrap
10
+ import time
10
11
  from typing import Any, Callable, Dict, List, Optional, Tuple
11
12
  import urllib.parse
12
13
 
@@ -15,15 +16,25 @@ from filelock import FileLock
15
16
  from sky import exceptions
16
17
  from sky import sky_logging
17
18
  from sky.adaptors import aws
19
+ from sky.adaptors import azure
18
20
  from sky.adaptors import cloudflare
19
21
  from sky.adaptors import gcp
20
22
  from sky.adaptors import ibm
23
+ from sky.skylet import log_lib
24
+ from sky.utils import common_utils
21
25
  from sky.utils import ux_utils
22
26
 
23
27
  Client = Any
24
28
 
25
29
  logger = sky_logging.init_logger(__name__)
26
30
 
31
+ AZURE_CONTAINER_URL = (
32
+ 'https://{storage_account_name}.blob.core.windows.net/{container_name}')
33
+
34
+ # Retry 5 times by default for delayed propagation to Azure system
35
+ # when creating Storage Account.
36
+ _STORAGE_ACCOUNT_KEY_RETRIEVE_MAX_ATTEMPT = 5
37
+
27
38
 
28
39
  def split_s3_path(s3_path: str) -> Tuple[str, str]:
29
40
  """Splits S3 Path into Bucket name and Relative Path to Bucket
@@ -49,6 +60,28 @@ def split_gcs_path(gcs_path: str) -> Tuple[str, str]:
49
60
  return bucket, key
50
61
 
51
62
 
63
+ def split_az_path(az_path: str) -> Tuple[str, str, str]:
64
+ """Splits Path into Storage account and Container names and Relative Path
65
+
66
+ Args:
67
+ az_path: Container Path,
68
+ e.g. https://azureopendatastorage.blob.core.windows.net/nyctlc
69
+
70
+ Returns:
71
+ str: Name of the storage account
72
+ str: Name of the container
73
+ str: Paths of the file/directory defined within the container
74
+ """
75
+ path_parts = az_path.replace('https://', '').split('/')
76
+ service_endpoint = path_parts.pop(0)
77
+ service_endpoint_parts = service_endpoint.split('.')
78
+ storage_account_name = service_endpoint_parts[0]
79
+ container_name = path_parts.pop(0)
80
+ path = '/'.join(path_parts)
81
+
82
+ return storage_account_name, container_name, path
83
+
84
+
52
85
  def split_r2_path(r2_path: str) -> Tuple[str, str]:
53
86
  """Splits R2 Path into Bucket name and Relative Path to Bucket
54
87
 
@@ -126,6 +159,145 @@ def verify_gcs_bucket(name: str) -> bool:
126
159
  return False
127
160
 
128
161
 
162
+ def create_az_client(client_type: str, **kwargs: Any) -> Client:
163
+ """Helper method that connects to AZ client for diverse Resources.
164
+
165
+ Args:
166
+ client_type: str; specify client type, e.g. storage, resource, container
167
+
168
+ Returns:
169
+ Client object facing AZ Resource of the 'client_type'.
170
+ """
171
+ resource_group_name = kwargs.pop('resource_group_name', None)
172
+ container_url = kwargs.pop('container_url', None)
173
+ storage_account_name = kwargs.pop('storage_account_name', None)
174
+ refresh_client = kwargs.pop('refresh_client', False)
175
+ if client_type == 'container':
176
+ # We do not assert on resource_group_name as it is set to None when the
177
+ # container_url is for public container with user access.
178
+ assert container_url is not None, ('container_url must be provided for '
179
+ 'container client')
180
+ assert storage_account_name is not None, ('storage_account_name must '
181
+ 'be provided for container '
182
+ 'client')
183
+
184
+ if refresh_client:
185
+ azure.get_client.cache_clear()
186
+
187
+ subscription_id = azure.get_subscription_id()
188
+ client = azure.get_client(client_type,
189
+ subscription_id,
190
+ container_url=container_url,
191
+ storage_account_name=storage_account_name,
192
+ resource_group_name=resource_group_name)
193
+ return client
194
+
195
+
196
+ def verify_az_bucket(storage_account_name: str, container_name: str) -> bool:
197
+ """Helper method that checks if the AZ Container exists
198
+
199
+ Args:
200
+ storage_account_name: str; Name of the storage account
201
+ container_name: str; Name of the container
202
+
203
+ Returns:
204
+ True if the container exists, False otherwise.
205
+ """
206
+ container_url = AZURE_CONTAINER_URL.format(
207
+ storage_account_name=storage_account_name,
208
+ container_name=container_name)
209
+ resource_group_name = azure.get_az_resource_group(storage_account_name)
210
+ container_client = create_az_client(
211
+ client_type='container',
212
+ container_url=container_url,
213
+ storage_account_name=storage_account_name,
214
+ resource_group_name=resource_group_name)
215
+ return container_client.exists()
216
+
217
+
218
+ def get_az_storage_account_key(
219
+ storage_account_name: str,
220
+ resource_group_name: Optional[str] = None,
221
+ storage_client: Optional[Client] = None,
222
+ resource_client: Optional[Client] = None,
223
+ ) -> Optional[str]:
224
+ """Returns the access key of the storage account with the given name.
225
+
226
+ Args:
227
+ storage_account_name: Name of the storage account
228
+ resource_group_name: Name of the resource group the
229
+ passed storage account belongs to.
230
+ storage_client: Client object facing Storage
231
+ resource_client: Client object facing Resource
232
+
233
+ Returns:
234
+ One of the two access keys to the given storage account, or None if
235
+ the account is not found.
236
+ """
237
+ if resource_client is None:
238
+ resource_client = create_az_client('resource')
239
+ if storage_client is None:
240
+ storage_client = create_az_client('storage')
241
+ if resource_group_name is None:
242
+ resource_group_name = azure.get_az_resource_group(
243
+ storage_account_name, storage_client)
244
+ # resource_group_name is None when using a public container or
245
+ # a private container not belonging to the user.
246
+ if resource_group_name is None:
247
+ return None
248
+
249
+ attempt = 0
250
+ backoff = common_utils.Backoff()
251
+ while True:
252
+ storage_account_keys = None
253
+ resources = resource_client.resources.list_by_resource_group(
254
+ resource_group_name)
255
+ # resource group is either created or read when Storage initializes.
256
+ assert resources is not None
257
+ for resource in resources:
258
+ if (resource.type == 'Microsoft.Storage/storageAccounts' and
259
+ resource.name == storage_account_name):
260
+ assert storage_account_keys is None
261
+ keys = storage_client.storage_accounts.list_keys(
262
+ resource_group_name, storage_account_name)
263
+ storage_account_keys = [key.value for key in keys.keys]
264
+ # If storage account was created right before call to this method,
265
+ # it is possible to fail to retrieve the key as the creation did not
266
+ # propagate to Azure yet. We retry several times.
267
+ if storage_account_keys is None:
268
+ attempt += 1
269
+ time.sleep(backoff.current_backoff())
270
+ if attempt > _STORAGE_ACCOUNT_KEY_RETRIEVE_MAX_ATTEMPT:
271
+ raise RuntimeError('Failed to obtain key value of storage '
272
+ f'account {storage_account_name!r}. '
273
+ 'Check if the storage account was created.')
274
+ continue
275
+ # Azure provides two sets of working storage account keys and we use
276
+ # one of them.
277
+ storage_account_key = storage_account_keys[0]
278
+ return storage_account_key
279
+
280
+
281
+ def is_az_container_endpoint(endpoint_url: str) -> bool:
282
+ """Checks if provided url follows a valid container endpoint naming format.
283
+
284
+ Args:
285
+ endpoint_url: Url of container endpoint.
286
+ e.g. https://azureopendatastorage.blob.core.windows.net/nyctlc
287
+
288
+ Returns:
289
+ bool: True if the endpoint is valid, False otherwise.
290
+ """
291
+ # Storage account name must be 3-24 characters long
292
+ # Reference: https://learn.microsoft.com/en-us/azure/azure-resource-manager/management/resource-name-rules#microsoftstorage # pylint: disable=line-too-long
293
+ pattern = re.compile(
294
+ r'^https://([a-z0-9]{3,24})\.blob\.core\.windows\.net(/[^/]+)*$')
295
+ match = pattern.match(endpoint_url)
296
+ if match is None:
297
+ return False
298
+ return True
299
+
300
+
129
301
  def create_r2_client(region: str = 'auto') -> Client:
130
302
  """Helper method that connects to Boto3 client for R2 Bucket
131
303
 
@@ -259,6 +431,7 @@ def _group_files_by_dir(
259
431
  def parallel_upload(source_path_list: List[str],
260
432
  filesync_command_generator: Callable[[str, List[str]], str],
261
433
  dirsync_command_generator: Callable[[str, str], str],
434
+ log_path: str,
262
435
  bucket_name: str,
263
436
  access_denied_message: str,
264
437
  create_dirs: bool = False,
@@ -274,6 +447,7 @@ def parallel_upload(source_path_list: List[str],
274
447
  for a list of files belonging to the same dir.
275
448
  dirsync_command_generator: Callable that generates rsync command
276
449
  for a directory.
450
+ log_path: Path to the log file.
277
451
  access_denied_message: Message to intercept from the underlying
278
452
  upload utility when permissions are insufficient. Used in
279
453
  exception handling.
@@ -306,7 +480,7 @@ def parallel_upload(source_path_list: List[str],
306
480
  p.starmap(
307
481
  run_upload_cli,
308
482
  zip(commands, [access_denied_message] * len(commands),
309
- [bucket_name] * len(commands)))
483
+ [bucket_name] * len(commands), [log_path] * len(commands)))
310
484
 
311
485
 
312
486
  def get_gsutil_command() -> Tuple[str, str]:
@@ -347,37 +521,31 @@ def get_gsutil_command() -> Tuple[str, str]:
347
521
  return gsutil_alias, alias_gen
348
522
 
349
523
 
350
- def run_upload_cli(command: str, access_denied_message: str, bucket_name: str):
351
- # TODO(zhwu): Use log_lib.run_with_log() and redirect the output
352
- # to a log file.
353
- with subprocess.Popen(command,
354
- stderr=subprocess.PIPE,
355
- stdout=subprocess.DEVNULL,
356
- shell=True) as process:
357
- stderr = []
358
- assert process.stderr is not None # for mypy
359
- while True:
360
- line = process.stderr.readline()
361
- if not line:
362
- break
363
- str_line = line.decode('utf-8')
364
- stderr.append(str_line)
365
- if access_denied_message in str_line:
366
- process.kill()
367
- with ux_utils.print_exception_no_traceback():
368
- raise PermissionError(
369
- 'Failed to upload files to '
370
- 'the remote bucket. The bucket does not have '
371
- 'write permissions. It is possible that '
372
- 'the bucket is public.')
373
- returncode = process.wait()
374
- if returncode != 0:
375
- stderr_str = '\n'.join(stderr)
376
- with ux_utils.print_exception_no_traceback():
377
- logger.error(stderr_str)
378
- raise exceptions.StorageUploadError(
379
- f'Upload to bucket failed for store {bucket_name}. '
380
- 'Please check the logs.')
524
+ def run_upload_cli(command: str, access_denied_message: str, bucket_name: str,
525
+ log_path: str):
526
+ returncode, stdout, stderr = log_lib.run_with_log(
527
+ command,
528
+ log_path,
529
+ shell=True,
530
+ require_outputs=True,
531
+ # We need to use bash as some of the cloud commands use bash syntax,
532
+ # such as [[ ... ]]
533
+ executable='/bin/bash')
534
+ if access_denied_message in stderr:
535
+ with ux_utils.print_exception_no_traceback():
536
+ raise PermissionError('Failed to upload files to '
537
+ 'the remote bucket. The bucket does not have '
538
+ 'write permissions. It is possible that '
539
+ 'the bucket is public.')
540
+ if returncode != 0:
541
+ with ux_utils.print_exception_no_traceback():
542
+ logger.error(stderr)
543
+ raise exceptions.StorageUploadError(
544
+ f'Upload to bucket failed for store {bucket_name}. '
545
+ f'Please check the logs: {log_path}')
546
+ if not stdout:
547
+ logger.debug('No file uploaded. This could be due to an error or '
548
+ 'because all files already exist on the cloud.')
381
549
 
382
550
 
383
551
  def get_cos_regions() -> List[str]:
@@ -566,3 +734,14 @@ class Rclone():
566
734
  lines_to_keep.append(line)
567
735
 
568
736
  return lines_to_keep
737
+
738
+
739
+ def split_oci_path(oci_path: str) -> Tuple[str, str]:
740
+ """Splits OCI Path into Bucket name and Relative Path to Bucket
741
+ Args:
742
+ oci_path: str; OCI Path, e.g. oci://imagenet/train/
743
+ """
744
+ path_parts = oci_path.replace('oci://', '').split('/')
745
+ bucket = path_parts.pop(0)
746
+ key = '/'.join(path_parts)
747
+ return bucket, key