skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (299) hide show
  1. sky/__init__.py +64 -32
  2. sky/adaptors/aws.py +23 -6
  3. sky/adaptors/azure.py +432 -15
  4. sky/adaptors/cloudflare.py +5 -5
  5. sky/adaptors/common.py +19 -9
  6. sky/adaptors/do.py +20 -0
  7. sky/adaptors/gcp.py +3 -2
  8. sky/adaptors/kubernetes.py +122 -88
  9. sky/adaptors/nebius.py +100 -0
  10. sky/adaptors/oci.py +39 -1
  11. sky/adaptors/vast.py +29 -0
  12. sky/admin_policy.py +101 -0
  13. sky/authentication.py +117 -98
  14. sky/backends/backend.py +52 -20
  15. sky/backends/backend_utils.py +669 -557
  16. sky/backends/cloud_vm_ray_backend.py +1099 -808
  17. sky/backends/local_docker_backend.py +14 -8
  18. sky/backends/wheel_utils.py +38 -20
  19. sky/benchmark/benchmark_utils.py +22 -23
  20. sky/check.py +76 -27
  21. sky/cli.py +1586 -1139
  22. sky/client/__init__.py +1 -0
  23. sky/client/cli.py +5683 -0
  24. sky/client/common.py +345 -0
  25. sky/client/sdk.py +1765 -0
  26. sky/cloud_stores.py +283 -19
  27. sky/clouds/__init__.py +7 -2
  28. sky/clouds/aws.py +303 -112
  29. sky/clouds/azure.py +185 -179
  30. sky/clouds/cloud.py +115 -37
  31. sky/clouds/cudo.py +29 -22
  32. sky/clouds/do.py +313 -0
  33. sky/clouds/fluidstack.py +44 -54
  34. sky/clouds/gcp.py +206 -65
  35. sky/clouds/ibm.py +26 -21
  36. sky/clouds/kubernetes.py +345 -91
  37. sky/clouds/lambda_cloud.py +40 -29
  38. sky/clouds/nebius.py +297 -0
  39. sky/clouds/oci.py +129 -90
  40. sky/clouds/paperspace.py +22 -18
  41. sky/clouds/runpod.py +53 -34
  42. sky/clouds/scp.py +28 -24
  43. sky/clouds/service_catalog/__init__.py +19 -13
  44. sky/clouds/service_catalog/aws_catalog.py +29 -12
  45. sky/clouds/service_catalog/azure_catalog.py +33 -6
  46. sky/clouds/service_catalog/common.py +95 -75
  47. sky/clouds/service_catalog/constants.py +3 -3
  48. sky/clouds/service_catalog/cudo_catalog.py +13 -3
  49. sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
  50. sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
  51. sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
  52. sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
  53. sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
  54. sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
  55. sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
  56. sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
  57. sky/clouds/service_catalog/do_catalog.py +111 -0
  58. sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
  59. sky/clouds/service_catalog/gcp_catalog.py +16 -2
  60. sky/clouds/service_catalog/ibm_catalog.py +2 -2
  61. sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
  62. sky/clouds/service_catalog/lambda_catalog.py +8 -3
  63. sky/clouds/service_catalog/nebius_catalog.py +116 -0
  64. sky/clouds/service_catalog/oci_catalog.py +31 -4
  65. sky/clouds/service_catalog/paperspace_catalog.py +2 -2
  66. sky/clouds/service_catalog/runpod_catalog.py +2 -2
  67. sky/clouds/service_catalog/scp_catalog.py +2 -2
  68. sky/clouds/service_catalog/vast_catalog.py +104 -0
  69. sky/clouds/service_catalog/vsphere_catalog.py +2 -2
  70. sky/clouds/utils/aws_utils.py +65 -0
  71. sky/clouds/utils/azure_utils.py +91 -0
  72. sky/clouds/utils/gcp_utils.py +5 -9
  73. sky/clouds/utils/oci_utils.py +47 -5
  74. sky/clouds/utils/scp_utils.py +4 -3
  75. sky/clouds/vast.py +280 -0
  76. sky/clouds/vsphere.py +22 -18
  77. sky/core.py +361 -107
  78. sky/dag.py +41 -28
  79. sky/data/data_transfer.py +37 -0
  80. sky/data/data_utils.py +211 -32
  81. sky/data/mounting_utils.py +182 -30
  82. sky/data/storage.py +2118 -270
  83. sky/data/storage_utils.py +126 -5
  84. sky/exceptions.py +179 -8
  85. sky/execution.py +158 -85
  86. sky/global_user_state.py +150 -34
  87. sky/jobs/__init__.py +12 -10
  88. sky/jobs/client/__init__.py +0 -0
  89. sky/jobs/client/sdk.py +302 -0
  90. sky/jobs/constants.py +49 -11
  91. sky/jobs/controller.py +161 -99
  92. sky/jobs/dashboard/dashboard.py +171 -25
  93. sky/jobs/dashboard/templates/index.html +572 -60
  94. sky/jobs/recovery_strategy.py +157 -156
  95. sky/jobs/scheduler.py +307 -0
  96. sky/jobs/server/__init__.py +1 -0
  97. sky/jobs/server/core.py +598 -0
  98. sky/jobs/server/dashboard_utils.py +69 -0
  99. sky/jobs/server/server.py +190 -0
  100. sky/jobs/state.py +627 -122
  101. sky/jobs/utils.py +615 -206
  102. sky/models.py +27 -0
  103. sky/optimizer.py +142 -83
  104. sky/provision/__init__.py +20 -5
  105. sky/provision/aws/config.py +124 -42
  106. sky/provision/aws/instance.py +130 -53
  107. sky/provision/azure/__init__.py +7 -0
  108. sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
  109. sky/provision/azure/config.py +220 -0
  110. sky/provision/azure/instance.py +1012 -37
  111. sky/provision/common.py +31 -3
  112. sky/provision/constants.py +25 -0
  113. sky/provision/cudo/__init__.py +2 -1
  114. sky/provision/cudo/cudo_utils.py +112 -0
  115. sky/provision/cudo/cudo_wrapper.py +37 -16
  116. sky/provision/cudo/instance.py +28 -12
  117. sky/provision/do/__init__.py +11 -0
  118. sky/provision/do/config.py +14 -0
  119. sky/provision/do/constants.py +10 -0
  120. sky/provision/do/instance.py +287 -0
  121. sky/provision/do/utils.py +301 -0
  122. sky/provision/docker_utils.py +82 -46
  123. sky/provision/fluidstack/fluidstack_utils.py +57 -125
  124. sky/provision/fluidstack/instance.py +15 -43
  125. sky/provision/gcp/config.py +19 -9
  126. sky/provision/gcp/constants.py +7 -1
  127. sky/provision/gcp/instance.py +55 -34
  128. sky/provision/gcp/instance_utils.py +339 -80
  129. sky/provision/gcp/mig_utils.py +210 -0
  130. sky/provision/instance_setup.py +172 -133
  131. sky/provision/kubernetes/__init__.py +1 -0
  132. sky/provision/kubernetes/config.py +104 -90
  133. sky/provision/kubernetes/constants.py +8 -0
  134. sky/provision/kubernetes/instance.py +680 -325
  135. sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
  136. sky/provision/kubernetes/network.py +54 -20
  137. sky/provision/kubernetes/network_utils.py +70 -21
  138. sky/provision/kubernetes/utils.py +1370 -251
  139. sky/provision/lambda_cloud/__init__.py +11 -0
  140. sky/provision/lambda_cloud/config.py +10 -0
  141. sky/provision/lambda_cloud/instance.py +265 -0
  142. sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
  143. sky/provision/logging.py +1 -1
  144. sky/provision/nebius/__init__.py +11 -0
  145. sky/provision/nebius/config.py +11 -0
  146. sky/provision/nebius/instance.py +285 -0
  147. sky/provision/nebius/utils.py +318 -0
  148. sky/provision/oci/__init__.py +15 -0
  149. sky/provision/oci/config.py +51 -0
  150. sky/provision/oci/instance.py +436 -0
  151. sky/provision/oci/query_utils.py +681 -0
  152. sky/provision/paperspace/constants.py +6 -0
  153. sky/provision/paperspace/instance.py +4 -3
  154. sky/provision/paperspace/utils.py +2 -0
  155. sky/provision/provisioner.py +207 -130
  156. sky/provision/runpod/__init__.py +1 -0
  157. sky/provision/runpod/api/__init__.py +3 -0
  158. sky/provision/runpod/api/commands.py +119 -0
  159. sky/provision/runpod/api/pods.py +142 -0
  160. sky/provision/runpod/instance.py +64 -8
  161. sky/provision/runpod/utils.py +239 -23
  162. sky/provision/vast/__init__.py +10 -0
  163. sky/provision/vast/config.py +11 -0
  164. sky/provision/vast/instance.py +247 -0
  165. sky/provision/vast/utils.py +162 -0
  166. sky/provision/vsphere/common/vim_utils.py +1 -1
  167. sky/provision/vsphere/instance.py +8 -18
  168. sky/provision/vsphere/vsphere_utils.py +1 -1
  169. sky/resources.py +247 -102
  170. sky/serve/__init__.py +9 -9
  171. sky/serve/autoscalers.py +361 -299
  172. sky/serve/client/__init__.py +0 -0
  173. sky/serve/client/sdk.py +366 -0
  174. sky/serve/constants.py +12 -3
  175. sky/serve/controller.py +106 -36
  176. sky/serve/load_balancer.py +63 -12
  177. sky/serve/load_balancing_policies.py +84 -2
  178. sky/serve/replica_managers.py +42 -34
  179. sky/serve/serve_state.py +62 -32
  180. sky/serve/serve_utils.py +271 -160
  181. sky/serve/server/__init__.py +0 -0
  182. sky/serve/{core.py → server/core.py} +271 -90
  183. sky/serve/server/server.py +112 -0
  184. sky/serve/service.py +52 -16
  185. sky/serve/service_spec.py +95 -32
  186. sky/server/__init__.py +1 -0
  187. sky/server/common.py +430 -0
  188. sky/server/constants.py +21 -0
  189. sky/server/html/log.html +174 -0
  190. sky/server/requests/__init__.py +0 -0
  191. sky/server/requests/executor.py +472 -0
  192. sky/server/requests/payloads.py +487 -0
  193. sky/server/requests/queues/__init__.py +0 -0
  194. sky/server/requests/queues/mp_queue.py +76 -0
  195. sky/server/requests/requests.py +567 -0
  196. sky/server/requests/serializers/__init__.py +0 -0
  197. sky/server/requests/serializers/decoders.py +192 -0
  198. sky/server/requests/serializers/encoders.py +166 -0
  199. sky/server/server.py +1106 -0
  200. sky/server/stream_utils.py +141 -0
  201. sky/setup_files/MANIFEST.in +2 -5
  202. sky/setup_files/dependencies.py +159 -0
  203. sky/setup_files/setup.py +14 -125
  204. sky/sky_logging.py +59 -14
  205. sky/skylet/autostop_lib.py +2 -2
  206. sky/skylet/constants.py +183 -50
  207. sky/skylet/events.py +22 -10
  208. sky/skylet/job_lib.py +403 -258
  209. sky/skylet/log_lib.py +111 -71
  210. sky/skylet/log_lib.pyi +6 -0
  211. sky/skylet/providers/command_runner.py +6 -8
  212. sky/skylet/providers/ibm/node_provider.py +2 -2
  213. sky/skylet/providers/scp/config.py +11 -3
  214. sky/skylet/providers/scp/node_provider.py +8 -8
  215. sky/skylet/skylet.py +3 -1
  216. sky/skylet/subprocess_daemon.py +69 -17
  217. sky/skypilot_config.py +119 -57
  218. sky/task.py +205 -64
  219. sky/templates/aws-ray.yml.j2 +37 -7
  220. sky/templates/azure-ray.yml.j2 +27 -82
  221. sky/templates/cudo-ray.yml.j2 +7 -3
  222. sky/templates/do-ray.yml.j2 +98 -0
  223. sky/templates/fluidstack-ray.yml.j2 +7 -4
  224. sky/templates/gcp-ray.yml.j2 +26 -6
  225. sky/templates/ibm-ray.yml.j2 +3 -2
  226. sky/templates/jobs-controller.yaml.j2 +46 -11
  227. sky/templates/kubernetes-ingress.yml.j2 +7 -0
  228. sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
  229. sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
  230. sky/templates/kubernetes-ray.yml.j2 +292 -25
  231. sky/templates/lambda-ray.yml.j2 +30 -40
  232. sky/templates/nebius-ray.yml.j2 +79 -0
  233. sky/templates/oci-ray.yml.j2 +18 -57
  234. sky/templates/paperspace-ray.yml.j2 +10 -6
  235. sky/templates/runpod-ray.yml.j2 +26 -4
  236. sky/templates/scp-ray.yml.j2 +3 -2
  237. sky/templates/sky-serve-controller.yaml.j2 +12 -1
  238. sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
  239. sky/templates/vast-ray.yml.j2 +70 -0
  240. sky/templates/vsphere-ray.yml.j2 +8 -3
  241. sky/templates/websocket_proxy.py +64 -0
  242. sky/usage/constants.py +10 -1
  243. sky/usage/usage_lib.py +130 -37
  244. sky/utils/accelerator_registry.py +35 -51
  245. sky/utils/admin_policy_utils.py +147 -0
  246. sky/utils/annotations.py +51 -0
  247. sky/utils/cli_utils/status_utils.py +81 -23
  248. sky/utils/cluster_utils.py +356 -0
  249. sky/utils/command_runner.py +452 -89
  250. sky/utils/command_runner.pyi +77 -3
  251. sky/utils/common.py +54 -0
  252. sky/utils/common_utils.py +319 -108
  253. sky/utils/config_utils.py +204 -0
  254. sky/utils/control_master_utils.py +48 -0
  255. sky/utils/controller_utils.py +548 -266
  256. sky/utils/dag_utils.py +93 -32
  257. sky/utils/db_utils.py +18 -4
  258. sky/utils/env_options.py +29 -7
  259. sky/utils/kubernetes/create_cluster.sh +8 -60
  260. sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
  261. sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
  262. sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
  263. sky/utils/kubernetes/gpu_labeler.py +4 -4
  264. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
  265. sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
  266. sky/utils/kubernetes/rsync_helper.sh +24 -0
  267. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
  268. sky/utils/log_utils.py +240 -33
  269. sky/utils/message_utils.py +81 -0
  270. sky/utils/registry.py +127 -0
  271. sky/utils/resources_utils.py +94 -22
  272. sky/utils/rich_utils.py +247 -18
  273. sky/utils/schemas.py +284 -64
  274. sky/{status_lib.py → utils/status_lib.py} +12 -7
  275. sky/utils/subprocess_utils.py +212 -46
  276. sky/utils/timeline.py +12 -7
  277. sky/utils/ux_utils.py +168 -15
  278. skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
  279. skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
  280. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
  281. sky/clouds/cloud_registry.py +0 -31
  282. sky/jobs/core.py +0 -330
  283. sky/skylet/providers/azure/__init__.py +0 -2
  284. sky/skylet/providers/azure/azure-vm-template.json +0 -301
  285. sky/skylet/providers/azure/config.py +0 -170
  286. sky/skylet/providers/azure/node_provider.py +0 -466
  287. sky/skylet/providers/lambda_cloud/__init__.py +0 -2
  288. sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
  289. sky/skylet/providers/oci/__init__.py +0 -2
  290. sky/skylet/providers/oci/node_provider.py +0 -488
  291. sky/skylet/providers/oci/query_helper.py +0 -383
  292. sky/skylet/providers/oci/utils.py +0 -21
  293. sky/utils/cluster_yaml_utils.py +0 -24
  294. sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
  295. skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
  296. skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
  297. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
  298. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
  299. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
@@ -7,6 +7,7 @@ import os
7
7
  import tempfile
8
8
  import typing
9
9
  from typing import Any, Dict, Iterable, List, Optional, Set
10
+ import uuid
10
11
 
11
12
  import colorama
12
13
 
@@ -22,12 +23,16 @@ from sky.clouds import gcp
22
23
  from sky.data import data_utils
23
24
  from sky.data import storage as storage_lib
24
25
  from sky.jobs import constants as managed_job_constants
25
- from sky.jobs import utils as managed_job_utils
26
26
  from sky.serve import constants as serve_constants
27
- from sky.serve import serve_utils
27
+ from sky.setup_files import dependencies
28
28
  from sky.skylet import constants
29
+ from sky.skylet import log_lib
30
+ from sky.utils import common
29
31
  from sky.utils import common_utils
32
+ from sky.utils import config_utils
30
33
  from sky.utils import env_options
34
+ from sky.utils import registry
35
+ from sky.utils import rich_utils
31
36
  from sky.utils import ux_utils
32
37
 
33
38
  if typing.TYPE_CHECKING:
@@ -44,8 +49,12 @@ CONTROLLER_RESOURCES_NOT_VALID_MESSAGE = (
44
49
  '{controller_type}.controller.resources is a valid resources spec. '
45
50
  'Details:\n {err}')
46
51
 
47
- # The placeholder for the local skypilot config path in file mounts.
48
- LOCAL_SKYPILOT_CONFIG_PATH_PLACEHOLDER = 'skypilot:local_skypilot_config_path'
52
+ # The suffix for local skypilot config path for a job/service in file mounts
53
+ # that tells the controller logic to update the config with specific settings,
54
+ # e.g., removing the ssh_proxy_command when a job/service is launched in the same
55
+ # cloud as the controller.
56
+ _LOCAL_SKYPILOT_CONFIG_PATH_SUFFIX = (
57
+ '__skypilot:local_skypilot_config_path.yaml')
49
58
 
50
59
 
51
60
  @dataclasses.dataclass
@@ -53,9 +62,7 @@ class _ControllerSpec:
53
62
  """Spec for skypilot controllers."""
54
63
  controller_type: str
55
64
  name: str
56
- # Use a list of strings to support fallback to old names. The list is in the
57
- # fallback order.
58
- candidate_cluster_names: List[str]
65
+ cluster_name: str
59
66
  in_progress_hint: str
60
67
  decline_cancel_hint: str
61
68
  _decline_down_when_failed_to_fetch_status_hint: str
@@ -65,15 +72,6 @@ class _ControllerSpec:
65
72
  connection_error_hint: str
66
73
  default_resources_config: Dict[str, Any]
67
74
 
68
- @property
69
- def cluster_name(self) -> str:
70
- """The name in candidate_cluster_names that exists, else the first."""
71
- for candidate_name in self.candidate_cluster_names:
72
- record = global_user_state.get_cluster_from_name(candidate_name)
73
- if record is not None:
74
- return candidate_name
75
- return self.candidate_cluster_names[0]
76
-
77
75
  @property
78
76
  def decline_down_when_failed_to_fetch_status_hint(self) -> str:
79
77
  return self._decline_down_when_failed_to_fetch_status_hint.format(
@@ -85,6 +83,7 @@ class _ControllerSpec:
85
83
  cluster_name=self.cluster_name)
86
84
 
87
85
 
86
+ # TODO: refactor controller class to not be an enum.
88
87
  class Controllers(enum.Enum):
89
88
  """Skypilot controllers."""
90
89
  # NOTE(dev): Keep this aligned with
@@ -92,10 +91,7 @@ class Controllers(enum.Enum):
92
91
  JOBS_CONTROLLER = _ControllerSpec(
93
92
  controller_type='jobs',
94
93
  name='managed jobs controller',
95
- candidate_cluster_names=[
96
- managed_job_utils.JOB_CONTROLLER_NAME,
97
- managed_job_utils.LEGACY_JOB_CONTROLLER_NAME
98
- ],
94
+ cluster_name=common.JOB_CONTROLLER_NAME,
99
95
  in_progress_hint=(
100
96
  '* {job_info}To see all managed jobs: '
101
97
  f'{colorama.Style.BRIGHT}sky jobs queue{colorama.Style.RESET_ALL}'),
@@ -125,7 +121,7 @@ class Controllers(enum.Enum):
125
121
  SKY_SERVE_CONTROLLER = _ControllerSpec(
126
122
  controller_type='serve',
127
123
  name='serve controller',
128
- candidate_cluster_names=[serve_utils.SKY_SERVE_CONTROLLER_NAME],
124
+ cluster_name=common.SKY_SERVE_CONTROLLER_NAME,
129
125
  in_progress_hint=(
130
126
  f'* To see detailed service status: {colorama.Style.BRIGHT}'
131
127
  f'sky serve status -a{colorama.Style.RESET_ALL}'),
@@ -161,10 +157,23 @@ class Controllers(enum.Enum):
161
157
  The controller if the cluster name is a controller name.
162
158
  Otherwise, returns None.
163
159
  """
164
- for controller in cls:
165
- if name in controller.value.candidate_cluster_names:
166
- return controller
167
- return None
160
+ if name is None:
161
+ return None
162
+ controller = None
163
+ # The controller name is always the same. However, on the client-side,
164
+ # we may not know the exact name, because we are missing the server-side
165
+ # common.SERVER_ID. So, we will assume anything that matches the prefix
166
+ # is a controller.
167
+ if name.startswith(common.SKY_SERVE_CONTROLLER_PREFIX):
168
+ controller = cls.SKY_SERVE_CONTROLLER
169
+ elif name.startswith(common.JOB_CONTROLLER_PREFIX):
170
+ controller = cls.JOBS_CONTROLLER
171
+ if controller is not None and name != controller.value.cluster_name:
172
+ # The client-side cluster_name is not accurate. Assume that `name`
173
+ # is the actual cluster name, so need to set the controller's
174
+ # cluster name to the input name.
175
+ controller.value.cluster_name = name
176
+ return controller
168
177
 
169
178
  @classmethod
170
179
  def from_type(cls, controller_type: str) -> Optional['Controllers']:
@@ -182,63 +191,59 @@ class Controllers(enum.Enum):
182
191
 
183
192
  # Install cli dependencies. Not using SkyPilot wheels because the wheel
184
193
  # can be cleaned up by another process.
185
- # TODO(zhwu): Keep the dependencies align with the ones in setup.py
186
194
  def _get_cloud_dependencies_installation_commands(
187
195
  controller: Controllers) -> List[str]:
188
- # TODO(tian): Make dependency installation command a method of cloud
189
- # class and get all installation command for enabled clouds.
190
- commands = []
191
- prefix_str = 'Check & install cloud dependencies on controller: '
196
+ # We use <step>/<total> instead of string formatting, as we need to update
197
+ # the <total> at the end of the for loop, and python does not support
198
+ # partial string formatting.
199
+ prefix_str = ('[<step>/<total>] Check & install cloud dependencies '
200
+ 'on controller: ')
201
+ commands: List[str] = []
192
202
  # This is to make sure the shorter checking message does not have junk
193
203
  # characters from the previous message.
194
- empty_str = ' ' * 5
195
- aws_dependencies_installation = (
196
- 'pip list | grep boto3 > /dev/null 2>&1 || pip install '
197
- 'botocore>=1.29.10 boto3>=1.26.1; '
198
- # Need to separate the installation of awscli from above because some
199
- # other clouds will install boto3 but not awscli.
200
- 'pip list | grep awscli> /dev/null 2>&1 || pip install "urllib3<2" '
201
- 'awscli>=1.27.10 "colorama<0.4.5" > /dev/null 2>&1')
204
+ empty_str = ' ' * 20
205
+
206
+ # All python dependencies will be accumulated and then installed in one
207
+ # command at the end. This is very fast if the packages are already
208
+ # installed, so we don't check that.
209
+ python_packages: Set[str] = set()
210
+
211
+ # add flask to the controller dependencies for dashboard
212
+ python_packages.add('flask')
213
+
214
+ step_prefix = prefix_str.replace('<step>', str(len(commands) + 1))
215
+ commands.append(f'echo -en "\\r{step_prefix}uv{empty_str}" &&'
216
+ f'{constants.SKY_UV_INSTALL_CMD} >/dev/null 2>&1')
217
+
202
218
  for cloud in sky_check.get_cached_enabled_clouds_or_refresh():
203
- if isinstance(
204
- clouds,
205
- (clouds.Lambda, clouds.SCP, clouds.Fluidstack, clouds.Paperspace)):
206
- # no need to install any cloud dependencies for lambda, scp,
207
- # fluidstack and paperspace
208
- continue
209
- if isinstance(cloud, clouds.AWS):
210
- commands.append(f'echo -en "\\r{prefix_str}AWS{empty_str}" && ' +
211
- aws_dependencies_installation)
212
- elif isinstance(cloud, clouds.Azure):
219
+ cloud_python_dependencies: List[str] = copy.deepcopy(
220
+ dependencies.extras_require[cloud.canonical_name()])
221
+
222
+ if isinstance(cloud, clouds.Azure):
223
+ # azure-cli cannot be normally installed by uv.
224
+ # See comments in sky/skylet/constants.py.
225
+ cloud_python_dependencies.remove(dependencies.AZURE_CLI)
226
+
227
+ step_prefix = prefix_str.replace('<step>', str(len(commands) + 1))
213
228
  commands.append(
214
- f'echo -en "\\r{prefix_str}Azure{empty_str}" && '
215
- 'pip list | grep azure-cli > /dev/null 2>&1 || '
216
- 'pip install "azure-cli>=2.31.0" azure-core '
217
- '"azure-identity>=1.13.0" azure-mgmt-network > /dev/null 2>&1')
229
+ f'echo -en "\\r{step_prefix}azure-cli{empty_str}" &&'
230
+ f'{constants.SKY_UV_PIP_CMD} install --prerelease=allow '
231
+ f'"{dependencies.AZURE_CLI}" > /dev/null 2>&1')
218
232
  elif isinstance(cloud, clouds.GCP):
219
- commands.append(
220
- f'echo -en "\\r{prefix_str}GCP{empty_str}" && '
221
- 'pip list | grep google-api-python-client > /dev/null 2>&1 || '
222
- 'pip install "google-api-python-client>=2.69.0" '
223
- '> /dev/null 2>&1')
224
- # Have to separate the installation of google-cloud-storage from
225
- # above because for a VM launched on GCP, the VM may have
226
- # google-api-python-client installed alone.
227
- commands.append(
228
- 'pip list | grep google-cloud-storage > /dev/null 2>&1 || '
229
- 'pip install google-cloud-storage > /dev/null 2>&1')
230
- commands.append(f'{gcp.GOOGLE_SDK_INSTALLATION_COMMAND}')
233
+ step_prefix = prefix_str.replace('<step>', str(len(commands) + 1))
234
+ commands.append(f'echo -en "\\r{step_prefix}GCP SDK{empty_str}" &&'
235
+ f'{gcp.GOOGLE_SDK_INSTALLATION_COMMAND}')
231
236
  elif isinstance(cloud, clouds.Kubernetes):
237
+ step_prefix = prefix_str.replace('<step>', str(len(commands) + 1))
232
238
  commands.append(
233
- f'echo -en "\\r{prefix_str}Kubernetes{empty_str}" && '
234
- 'pip list | grep kubernetes > /dev/null 2>&1 || '
235
- 'pip install "kubernetes>=20.0.0" > /dev/null 2>&1 &&'
239
+ f'echo -en "\\r{step_prefix}Kubernetes{empty_str}" && '
236
240
  # Install k8s + skypilot dependencies
237
241
  'sudo bash -c "if '
238
242
  '! command -v curl &> /dev/null || '
239
243
  '! command -v socat &> /dev/null || '
240
244
  '! command -v netcat &> /dev/null; '
241
- 'then apt update && apt install curl socat netcat -y; '
245
+ 'then apt update &> /dev/null && '
246
+ 'apt install curl socat netcat -y &> /dev/null; '
242
247
  'fi" && '
243
248
  # Install kubectl
244
249
  '(command -v kubectl &>/dev/null || '
@@ -247,34 +252,42 @@ def _get_cloud_dependencies_installation_commands(
247
252
  '/bin/linux/amd64/kubectl" && '
248
253
  'sudo install -o root -g root -m 0755 '
249
254
  'kubectl /usr/local/bin/kubectl))')
250
- if controller == Controllers.JOBS_CONTROLLER:
251
- if isinstance(cloud, clouds.IBM):
252
- commands.append(
253
- f'echo -en "\\r{prefix_str}IBM{empty_str}" '
254
- '&& pip list | grep ibm-cloud-sdk-core > /dev/null 2>&1 || '
255
- 'pip install ibm-cloud-sdk-core ibm-vpc '
256
- 'ibm-platform-services ibm-cos-sdk > /dev/null 2>&1')
257
- elif isinstance(cloud, clouds.OCI):
258
- commands.append(f'echo -en "\\r{prefix_str}OCI{empty_str}" && '
259
- 'pip list | grep oci > /dev/null 2>&1 || '
260
- 'pip install oci > /dev/null 2>&1')
261
- elif isinstance(cloud, clouds.RunPod):
262
- commands.append(
263
- f'echo -en "\\r{prefix_str}RunPod{empty_str}" && '
264
- 'pip list | grep runpod > /dev/null 2>&1 || '
265
- 'pip install "runpod>=1.5.1" > /dev/null 2>&1')
266
- elif isinstance(cloud, clouds.Cudo):
267
- # cudo doesn't support open port
268
- commands.append(
269
- f'echo -en "\\r{prefix_str}Cudo{empty_str}" && '
270
- 'pip list | grep cudo-compute > /dev/null 2>&1 || '
271
- 'pip install "cudo-compute>=0.1.8" > /dev/null 2>&1')
255
+ elif isinstance(cloud, clouds.Cudo):
256
+ step_prefix = prefix_str.replace('<step>', str(len(commands) + 1))
257
+ commands.append(
258
+ f'echo -en "\\r{step_prefix}cudoctl{empty_str}" && '
259
+ 'wget https://download.cudo.org/compute/cudoctl-0.3.2-amd64.deb -O ~/cudoctl.deb > /dev/null 2>&1 && ' # pylint: disable=line-too-long
260
+ 'sudo dpkg -i ~/cudoctl.deb > /dev/null 2>&1')
261
+ elif isinstance(cloud, clouds.IBM):
262
+ if controller != Controllers.JOBS_CONTROLLER:
263
+ # We only need IBM deps on the jobs controller.
264
+ cloud_python_dependencies = []
265
+ elif isinstance(cloud, clouds.Vast):
266
+ step_prefix = prefix_str.replace('<step>', str(len(commands) + 1))
267
+ commands.append(f'echo -en "\\r{step_prefix}Vast{empty_str}" && '
268
+ 'pip list | grep vastai_sdk > /dev/null 2>&1 || '
269
+ 'pip install "vastai_sdk>=0.1.12" > /dev/null 2>&1')
270
+
271
+ python_packages.update(cloud_python_dependencies)
272
+
272
273
  if (cloudflare.NAME
273
274
  in storage_lib.get_cached_enabled_storage_clouds_or_refresh()):
274
- commands.append(f'echo -en "\\r{prefix_str}Cloudflare{empty_str}" && ' +
275
- aws_dependencies_installation)
276
- commands.append(f'echo -e "\\r{prefix_str}Done for {len(commands)} '
277
- 'clouds."')
275
+ python_packages.update(dependencies.extras_require['cloudflare'])
276
+
277
+ packages_string = ' '.join([f'"{package}"' for package in python_packages])
278
+ step_prefix = prefix_str.replace('<step>', str(len(commands) + 1))
279
+ commands.append(
280
+ f'echo -en "\\r{step_prefix}cloud python packages{empty_str}" && '
281
+ f'{constants.SKY_UV_PIP_CMD} install {packages_string} > /dev/null 2>&1'
282
+ )
283
+
284
+ total_commands = len(commands)
285
+ finish_prefix = prefix_str.replace('[<step>/<total>] ', ' ')
286
+ commands.append(f'echo -e "\\r{finish_prefix}done.{empty_str}"')
287
+
288
+ commands = [
289
+ command.replace('<total>', str(total_commands)) for command in commands
290
+ ]
278
291
  return commands
279
292
 
280
293
 
@@ -308,8 +321,10 @@ def download_and_stream_latest_job_log(
308
321
  """Downloads and streams the latest job log.
309
322
 
310
323
  This function is only used by jobs controller and sky serve controller.
324
+
325
+ If the log cannot be fetched for any reason, return None.
311
326
  """
312
- os.makedirs(local_dir, exist_ok=True)
327
+ os.makedirs(os.path.expanduser(local_dir), exist_ok=True)
313
328
  log_file = None
314
329
  try:
315
330
  log_dirs = backend.sync_down_logs(
@@ -322,29 +337,74 @@ def download_and_stream_latest_job_log(
322
337
  # job_ids all represent the same logical managed job.
323
338
  job_ids=None,
324
339
  local_dir=local_dir)
325
- except exceptions.CommandError as e:
326
- logger.info(f'Failed to download the logs: '
327
- f'{common_utils.format_exception(e)}')
328
- else:
329
- if not log_dirs:
330
- logger.error('Failed to find the logs for the user program.')
331
- else:
332
- log_dir = list(log_dirs.values())[0]
333
- log_file = os.path.join(log_dir, 'run.log')
334
-
335
- # Print the logs to the console.
336
- try:
337
- with open(log_file, 'r', encoding='utf-8') as f:
338
- print(f.read())
339
- except FileNotFoundError:
340
- logger.error('Failed to find the logs for the user '
341
- f'program at {log_file}.')
340
+ except Exception as e: # pylint: disable=broad-except
341
+ # We want to avoid crashing the controller. sync_down_logs() is pretty
342
+ # complicated and could crash in various places (creating remote
343
+ # runners, executing remote code, decoding the payload, etc.). So, we
344
+ # use a broad except and just return None.
345
+ logger.info(
346
+ f'Failed to download the logs: '
347
+ f'{common_utils.format_exception(e)}',
348
+ exc_info=True)
349
+ return None
350
+
351
+ if not log_dirs:
352
+ logger.error('Failed to find the logs for the user program.')
353
+ return None
354
+
355
+ log_dir = list(log_dirs.values())[0]
356
+ log_file = os.path.join(log_dir, 'run.log')
357
+
358
+ # Print the logs to the console.
359
+ # TODO(zhwu): refactor this into log_utils, along with the refactoring for
360
+ # the log_lib.tail_logs.
361
+ try:
362
+ with open(log_file, 'r', encoding='utf-8') as f:
363
+ # Stream the logs to the console without reading the whole file into
364
+ # memory.
365
+ start_streaming = False
366
+ for line in f:
367
+ if log_lib.LOG_FILE_START_STREAMING_AT in line:
368
+ start_streaming = True
369
+ if start_streaming:
370
+ print(line, end='', flush=True)
371
+ except FileNotFoundError:
372
+ logger.error('Failed to find the logs for the user '
373
+ f'program at {log_file}.')
374
+ except Exception as e: # pylint: disable=broad-except
375
+ logger.error(
376
+ f'Failed to stream the logs for the user program at '
377
+ f'{log_file}: {common_utils.format_exception(e)}',
378
+ exc_info=True)
379
+ # Return the log_file anyway.
380
+
342
381
  return log_file
343
382
 
344
383
 
345
384
  def shared_controller_vars_to_fill(
346
- controller: Controllers,
347
- remote_user_config_path: str) -> Dict[str, str]:
385
+ controller: Controllers, remote_user_config_path: str,
386
+ local_user_config: Dict[str, Any]) -> Dict[str, str]:
387
+ if not local_user_config:
388
+ local_user_config_path = None
389
+ else:
390
+ # Remove admin_policy from local_user_config so that it is not applied
391
+ # again on the controller. This is required since admin_policy is not
392
+ # installed on the controller.
393
+ local_user_config.pop('admin_policy', None)
394
+ # Remove allowed_contexts from local_user_config since the controller
395
+ # may be running in a Kubernetes cluster with in-cluster auth and may
396
+ # not have kubeconfig available to it. This is the typical case since
397
+ # remote_identity default for Kubernetes is SERVICE_ACCOUNT.
398
+ # TODO(romilb): We should check the cloud the controller is running on
399
+ # before popping allowed_contexts. If it is not on Kubernetes,
400
+ # we may be able to use allowed_contexts.
401
+ local_user_config.pop('allowed_contexts', None)
402
+ with tempfile.NamedTemporaryFile(
403
+ delete=False,
404
+ suffix=_LOCAL_SKYPILOT_CONFIG_PATH_SUFFIX) as temp_file:
405
+ common_utils.dump_yaml(temp_file.name, dict(**local_user_config))
406
+ local_user_config_path = temp_file.name
407
+
348
408
  vars_to_fill: Dict[str, Any] = {
349
409
  'cloud_dependencies_installation_commands':
350
410
  _get_cloud_dependencies_installation_commands(controller),
@@ -352,9 +412,11 @@ def shared_controller_vars_to_fill(
352
412
  # cloud SDKs are installed in SkyPilot runtime environment and can be
353
413
  # accessed.
354
414
  'sky_activate_python_env': constants.ACTIVATE_SKY_REMOTE_PYTHON_ENV,
415
+ 'sky_python_cmd': constants.SKY_PYTHON_CMD,
416
+ 'local_user_config_path': local_user_config_path,
355
417
  }
356
418
  env_vars: Dict[str, str] = {
357
- env.value: '1' for env in env_options.Options if env.get()
419
+ env.env_key: str(int(env.get())) for env in env_options.Options
358
420
  }
359
421
  env_vars.update({
360
422
  # Should not use $USER here, as that env var can be empty when
@@ -362,7 +424,9 @@ def shared_controller_vars_to_fill(
362
424
  constants.USER_ENV_VAR: getpass.getuser(),
363
425
  constants.USER_ID_ENV_VAR: common_utils.get_user_hash(),
364
426
  # Skip cloud identity check to avoid the overhead.
365
- env_options.Options.SKIP_CLOUD_IDENTITY_CHECK.value: '1',
427
+ env_options.Options.SKIP_CLOUD_IDENTITY_CHECK.env_key: '1',
428
+ # Disable minimize logging to get more details on the controller.
429
+ env_options.Options.MINIMIZE_LOGGING.env_key: '0',
366
430
  })
367
431
  if skypilot_config.loaded():
368
432
  # Only set the SKYPILOT_CONFIG env var if the user has a config file.
@@ -395,10 +459,6 @@ def get_controller_resources(
395
459
  if custom_controller_resources_config is not None:
396
460
  controller_resources_config_copied.update(
397
461
  custom_controller_resources_config)
398
- elif controller == Controllers.JOBS_CONTROLLER:
399
- controller_resources_config_copied.update(
400
- skypilot_config.get_nested(('spot', 'controller', 'resources'),
401
- {}))
402
462
 
403
463
  try:
404
464
  controller_resources = resources.Resources.from_yaml_config(
@@ -431,20 +491,17 @@ def get_controller_resources(
431
491
  if handle is not None:
432
492
  controller_resources_to_use = handle.launched_resources
433
493
 
434
- if controller_resources_to_use.cloud is not None:
435
- return {controller_resources_to_use}
494
+ # If the controller and replicas are from the same cloud (and region/zone),
495
+ # it should provide better connectivity. We will let the controller choose
496
+ # from the clouds (and regions/zones) of the resources if the user does not
497
+ # specify the cloud (and region/zone) for the controller.
436
498
 
437
- # If the controller and replicas are from the same cloud, it should
438
- # provide better connectivity. We will let the controller choose from
439
- # the clouds of the resources if the controller does not exist.
440
- # TODO(tian): Consider respecting the regions/zones specified for the
441
- # resources as well.
442
- requested_clouds: Set['clouds.Cloud'] = set()
499
+ requested_clouds_with_region_zone: Dict[str, Dict[Optional[str],
500
+ Set[Optional[str]]]] = {}
443
501
  for resource in task_resources:
444
- # cloud is an object and will not be able to be distinguished by set.
445
- # Here we manually check if the cloud is in the set.
446
502
  if resource.cloud is not None:
447
- if not clouds.cloud_in_iterable(resource.cloud, requested_clouds):
503
+ cloud_name = str(resource.cloud)
504
+ if cloud_name not in requested_clouds_with_region_zone:
448
505
  try:
449
506
  resource.cloud.check_features_are_supported(
450
507
  resources.Resources(),
@@ -452,7 +509,26 @@ def get_controller_resources(
452
509
  except exceptions.NotSupportedError:
453
510
  # Skip the cloud if it does not support hosting controllers.
454
511
  continue
455
- requested_clouds.add(resource.cloud)
512
+ requested_clouds_with_region_zone[cloud_name] = {}
513
+ if resource.region is None:
514
+ # If one of the resource.region is None, this could represent
515
+ # that the user is unsure about which region the resource is
516
+ # hosted in. In this case, we allow any region for this cloud.
517
+ requested_clouds_with_region_zone[cloud_name] = {None: {None}}
518
+ elif None not in requested_clouds_with_region_zone[cloud_name]:
519
+ if resource.region not in requested_clouds_with_region_zone[
520
+ cloud_name]:
521
+ requested_clouds_with_region_zone[cloud_name][
522
+ resource.region] = set()
523
+ # If one of the resource.zone is None, allow any zone in the
524
+ # region.
525
+ if resource.zone is None:
526
+ requested_clouds_with_region_zone[cloud_name][
527
+ resource.region] = {None}
528
+ elif None not in requested_clouds_with_region_zone[cloud_name][
529
+ resource.region]:
530
+ requested_clouds_with_region_zone[cloud_name][
531
+ resource.region].add(resource.zone)
456
532
  else:
457
533
  # if one of the resource.cloud is None, this could represent user
458
534
  # does not know which cloud is best for the specified resources.
@@ -462,18 +538,54 @@ def get_controller_resources(
462
538
  # - cloud: runpod
463
539
  # accelerators: A40
464
540
  # In this case, we allow the controller to be launched on any cloud.
465
- requested_clouds.clear()
541
+ requested_clouds_with_region_zone.clear()
466
542
  break
467
- if not requested_clouds:
543
+
544
+ # Extract filtering criteria from the controller resources specified by the
545
+ # user.
546
+ controller_cloud = str(
547
+ controller_resources_to_use.cloud
548
+ ) if controller_resources_to_use.cloud is not None else None
549
+ controller_region = controller_resources_to_use.region
550
+ controller_zone = controller_resources_to_use.zone
551
+
552
+ # Filter clouds if controller_resources_to_use.cloud is specified.
553
+ filtered_clouds = ({controller_cloud} if controller_cloud is not None else
554
+ requested_clouds_with_region_zone.keys())
555
+
556
+ # Filter regions and zones and construct the result.
557
+ result: Set[resources.Resources] = set()
558
+ for cloud_name in filtered_clouds:
559
+ regions = requested_clouds_with_region_zone.get(cloud_name,
560
+ {None: {None}})
561
+
562
+ # Filter regions if controller_resources_to_use.region is specified.
563
+ filtered_regions = ({controller_region} if controller_region is not None
564
+ else regions.keys())
565
+
566
+ for region in filtered_regions:
567
+ zones = regions.get(region, {None})
568
+
569
+ # Filter zones if controller_resources_to_use.zone is specified.
570
+ filtered_zones = ({controller_zone}
571
+ if controller_zone is not None else zones)
572
+
573
+ # Create combinations of cloud, region, and zone.
574
+ for zone in filtered_zones:
575
+ resource_copy = controller_resources_to_use.copy(
576
+ cloud=registry.CLOUD_REGISTRY.from_str(cloud_name),
577
+ region=region,
578
+ zone=zone)
579
+ result.add(resource_copy)
580
+
581
+ if not result:
468
582
  return {controller_resources_to_use}
469
- return {
470
- controller_resources_to_use.copy(cloud=controller_cloud)
471
- for controller_cloud in requested_clouds
472
- }
583
+ return result
473
584
 
474
585
 
475
586
  def _setup_proxy_command_on_controller(
476
- controller_launched_cloud: 'clouds.Cloud') -> Dict[str, Any]:
587
+ controller_launched_cloud: 'clouds.Cloud',
588
+ user_config: Dict[str, Any]) -> config_utils.Config:
477
589
  """Sets up proxy command on the controller.
478
590
 
479
591
  This function should be called on the controller (remote cluster), which
@@ -507,21 +619,20 @@ def _setup_proxy_command_on_controller(
507
619
  # (or name). It may not be a sufficient check (as it's always
508
620
  # possible that peering is not set up), but it may catch some
509
621
  # obvious errors.
622
+ config = config_utils.Config.from_dict(user_config)
510
623
  proxy_command_key = (str(controller_launched_cloud).lower(),
511
624
  'ssh_proxy_command')
512
- ssh_proxy_command = skypilot_config.get_nested(proxy_command_key, None)
513
- config_dict = skypilot_config.to_dict()
625
+ ssh_proxy_command = config.get_nested(proxy_command_key, None)
514
626
  if isinstance(ssh_proxy_command, str):
515
- config_dict = skypilot_config.set_nested(proxy_command_key, None)
627
+ config.set_nested(proxy_command_key, None)
516
628
  elif isinstance(ssh_proxy_command, dict):
517
629
  # Instead of removing the key, we set the value to empty string
518
630
  # so that the controller will only try the regions specified by
519
631
  # the keys.
520
632
  ssh_proxy_command = {k: None for k in ssh_proxy_command}
521
- config_dict = skypilot_config.set_nested(proxy_command_key,
522
- ssh_proxy_command)
633
+ config.set_nested(proxy_command_key, ssh_proxy_command)
523
634
 
524
- return config_dict
635
+ return config
525
636
 
526
637
 
527
638
  def replace_skypilot_config_path_in_file_mounts(
@@ -535,29 +646,84 @@ def replace_skypilot_config_path_in_file_mounts(
535
646
  if file_mounts is None:
536
647
  return
537
648
  replaced = False
538
- to_replace = True
539
- with tempfile.NamedTemporaryFile('w', delete=False) as f:
540
- if skypilot_config.loaded():
541
- new_skypilot_config = _setup_proxy_command_on_controller(cloud)
542
- common_utils.dump_yaml(f.name, new_skypilot_config)
543
- to_replace = True
544
- else:
545
- # Empty config. Remove the placeholder below.
546
- to_replace = False
547
- for remote_path, local_path in list(file_mounts.items()):
548
- if local_path == LOCAL_SKYPILOT_CONFIG_PATH_PLACEHOLDER:
549
- if to_replace:
550
- file_mounts[remote_path] = f.name
551
- replaced = True
552
- else:
553
- del file_mounts[remote_path]
649
+ for remote_path, local_path in list(file_mounts.items()):
650
+ if local_path is None:
651
+ del file_mounts[remote_path]
652
+ continue
653
+ if local_path.endswith(_LOCAL_SKYPILOT_CONFIG_PATH_SUFFIX):
654
+ with tempfile.NamedTemporaryFile('w', delete=False) as f:
655
+ user_config = common_utils.read_yaml(local_path)
656
+ config = _setup_proxy_command_on_controller(cloud, user_config)
657
+ common_utils.dump_yaml(f.name, dict(**config))
658
+ file_mounts[remote_path] = f.name
659
+ replaced = True
554
660
  if replaced:
555
- logger.debug(f'Replaced {LOCAL_SKYPILOT_CONFIG_PATH_PLACEHOLDER} with '
556
- f'the real path in file mounts: {file_mounts}')
661
+ logger.debug(f'Replaced {_LOCAL_SKYPILOT_CONFIG_PATH_SUFFIX} '
662
+ f'with the real path in file mounts: {file_mounts}')
557
663
 
558
664
 
665
def _generate_run_uuid() -> str:
    """Return a short random identifier for one job/service launch.

    A fresh ``uuid4`` hex string is passed through
    ``common_utils.base36_encode`` and cut down to at most its first
    8 characters.
    """
    random_hex = uuid.uuid4().hex
    encoded = common_utils.base36_encode(random_hex)
    return encoded[:8]
668
+
669
+
670
def translate_local_file_mounts_to_two_hop(
        task: 'task_lib.Task') -> Dict[str, str]:
    """Translates local->VM mounts into two-hop file mounts.

    This strategy will upload the local files to the controller first, using a
    normal rsync as part of sky.launch() for the controller. Then, when the
    controller launches the task, it will also use local file_mounts from the
    destination path of the first hop.

    Local machine/API server        Controller              Job cluster
    ------------------------  -----------------------  --------------------
    |      local path  ----|--|-> controller path --|--|-> job dst path   |
    ------------------------  -----------------------  --------------------

    Args:
        task: The task whose workdir and file mounts are translated. On
            success it is updated in-place: its workdir is cleared and its
            file mounts are replaced by the second-hop mounts.

    Returns:
        A dict mapping from controller file mount path to local file mount path
        for the first hop. The task is updated in-place to do the second hop.

    Raises:
        exceptions.NotSupportedError: If any source or destination is a cloud
            store URL. This check runs before the task is modified, so the
            task is left untouched when it is raised.
    """
    run_id = _generate_run_uuid()
    base_tmp_dir = os.path.join(constants.FILE_MOUNTS_CONTROLLER_TMP_BASE_PATH,
                                run_id)

    # Work on a copy so that the task's own file mounts are not mutated until
    # the whole translation is known to be valid.
    file_mounts_to_translate = dict(task.file_mounts or {})
    if task.workdir is not None:
        file_mounts_to_translate[constants.SKY_REMOTE_WORKDIR] = task.workdir

    # Validate everything up front; a failure here must not leave the task in
    # a half-translated state (e.g. workdir cleared but mounts unchanged).
    for job_cluster_path, local_path in file_mounts_to_translate.items():
        if data_utils.is_cloud_store_url(
                local_path) or data_utils.is_cloud_store_url(job_cluster_path):
            raise exceptions.NotSupportedError(
                'Cloud-based file_mounts are specified, but no cloud storage '
                'is available. Please specify local file_mounts only.')

    first_hop_file_mounts: Dict[str, str] = {}
    second_hop_file_mounts: Dict[str, str] = {}
    # Use a simple counter to create unique paths within the base_tmp_dir for
    # each mount.
    for file_mount_id, (job_cluster_path, local_path) in enumerate(
            file_mounts_to_translate.items()):
        controller_path = os.path.join(base_tmp_dir, str(file_mount_id))
        first_hop_file_mounts[controller_path] = local_path
        second_hop_file_mounts[job_cluster_path] = controller_path

    # The workdir is now carried as a regular file mount; clear it before
    # installing the new mounts, matching the original statement order.
    task.workdir = None
    # Use set_file_mounts to override existing file mounts, if they exist.
    task.set_file_mounts(second_hop_file_mounts)

    # Return the first hop info so that it can be added to the jobs-controller
    # YAML.
    return first_hop_file_mounts
722
+
723
+
724
+ # (maybe translate local file mounts) and (sync up)
559
725
  def maybe_translate_local_file_mounts_and_sync_up(task: 'task_lib.Task',
560
- path: str) -> None:
726
+ task_type: str) -> None:
561
727
  """Translates local->VM mounts into Storage->VM, then syncs up any Storage.
562
728
 
563
729
  Eagerly syncing up local->Storage ensures Storage->VM would work at task
@@ -566,11 +732,31 @@ def maybe_translate_local_file_mounts_and_sync_up(task: 'task_lib.Task',
566
732
  If there are no local source paths to be translated, this function would
567
733
  still sync up any storage mounts with local source paths (which do not
568
734
  undergo translation).
735
+
736
+ When jobs.bucket or serve.bucket is not specified, an intermediate storage
737
+ dedicated for the job is created for the workdir and local file mounts and
738
+ the storage is deleted when the job finishes. We don't share the storage
739
+ between jobs, because jobs might have different resources requirements, and
740
+ sharing storage between jobs may cause egress costs or slower transfer
741
+ speeds.
569
742
  """
743
+
570
744
  # ================================================================
571
745
  # Translate the workdir and local file mounts to cloud file mounts.
572
746
  # ================================================================
573
- run_id = common_utils.get_usage_run_id()[:8]
747
+
748
+ def _sub_path_join(sub_path: Optional[str], path: str) -> str:
749
+ if sub_path is None:
750
+ return path
751
+ return os.path.join(sub_path, path).strip('/')
752
+
753
+ # We use uuid to generate a unique run id for the job, so that the bucket/
754
+ # subdirectory name is unique across different jobs/services.
755
+ # We should not use common_utils.get_usage_run_id() here, because when
756
+ # Python API is used, the run id will be the same across multiple
757
+ # jobs.launch/serve.up calls after the sky is imported.
758
+ run_id = _generate_run_uuid()
759
+ user_hash = common_utils.get_user_hash()
574
760
  original_file_mounts = task.file_mounts if task.file_mounts else {}
575
761
  original_storage_mounts = task.storage_mounts if task.storage_mounts else {}
576
762
 
@@ -589,14 +775,35 @@ def maybe_translate_local_file_mounts_and_sync_up(task: 'task_lib.Task',
589
775
  elif has_local_source_paths_workdir:
590
776
  msg = 'workdir'
591
777
  if msg:
592
- logger.info(f'{colorama.Fore.YELLOW}Translating {msg} to SkyPilot '
593
- f'Storage...{colorama.Style.RESET_ALL}')
778
+ logger.info(
779
+ ux_utils.starting_message(f'Translating {msg} to '
780
+ 'SkyPilot Storage...'))
781
+ rich_utils.force_update_status(
782
+ ux_utils.spinner_message(
783
+ f'Translating {msg} to SkyPilot Storage...'))
784
+
785
+ # Get the bucket name for the workdir and file mounts,
786
+ # we store all these files in same bucket from config.
787
+ bucket_wth_prefix = skypilot_config.get_nested((task_type, 'bucket'), None)
788
+ store_kwargs: Dict[str, Any] = {}
789
+ if bucket_wth_prefix is None:
790
+ store_type = sub_path = None
791
+ storage_account_name = region = None
792
+ bucket_name = constants.FILE_MOUNTS_BUCKET_NAME.format(
793
+ username=common_utils.get_cleaned_username(),
794
+ user_hash=user_hash,
795
+ id=run_id)
796
+ else:
797
+ (store_type, bucket_name, sub_path, storage_account_name, region) = (
798
+ storage_lib.StoreType.get_fields_from_store_url(bucket_wth_prefix))
799
+ if storage_account_name is not None:
800
+ store_kwargs['storage_account_name'] = storage_account_name
801
+ if region is not None:
802
+ store_kwargs['region'] = region
594
803
 
595
804
  # Step 1: Translate the workdir to SkyPilot storage.
596
805
  new_storage_mounts = {}
597
806
  if task.workdir is not None:
598
- bucket_name = constants.WORKDIR_BUCKET_NAME.format(
599
- username=common_utils.get_cleaned_username(), id=run_id)
600
807
  workdir = task.workdir
601
808
  task.workdir = None
602
809
  if (constants.SKY_REMOTE_WORKDIR in original_file_mounts or
@@ -604,18 +811,29 @@ def maybe_translate_local_file_mounts_and_sync_up(task: 'task_lib.Task',
604
811
  raise ValueError(
605
812
  f'Cannot mount {constants.SKY_REMOTE_WORKDIR} as both the '
606
813
  'workdir and file_mounts contains it as the target.')
607
- new_storage_mounts[
608
- constants.
609
- SKY_REMOTE_WORKDIR] = storage_lib.Storage.from_yaml_config({
610
- 'name': bucket_name,
611
- 'source': workdir,
612
- 'persistent': False,
613
- 'mode': 'COPY',
614
- })
814
+ bucket_sub_path = _sub_path_join(
815
+ sub_path,
816
+ constants.FILE_MOUNTS_WORKDIR_SUBPATH.format(run_id=run_id))
817
+ stores = None
818
+ if store_type is not None:
819
+ stores = [store_type]
820
+
821
+ storage_obj = storage_lib.Storage(
822
+ name=bucket_name,
823
+ source=workdir,
824
+ persistent=False,
825
+ mode=storage_lib.StorageMode.COPY,
826
+ stores=stores,
827
+ # Set `_is_sky_managed` to False when `bucket_with_prefix` is
828
+ # specified, so that the storage is not deleted when job finishes,
829
+ # but only the sub path is deleted.
830
+ _is_sky_managed=bucket_wth_prefix is None,
831
+ _bucket_sub_path=bucket_sub_path)
832
+ new_storage_mounts[constants.SKY_REMOTE_WORKDIR] = storage_obj
615
833
  # Check of the existence of the workdir in file_mounts is done in
616
834
  # the task construction.
617
- logger.info(f'Workdir {workdir!r} will be synced to cloud storage '
618
- f'{bucket_name!r}.')
835
+ logger.info(f' {colorama.Style.DIM}Workdir: {workdir!r} '
836
+ f'-> storage: {bucket_name!r}.{colorama.Style.RESET_ALL}')
619
837
 
620
838
  # Step 2: Translate the local file mounts with folder in src to SkyPilot
621
839
  # storage.
@@ -629,88 +847,111 @@ def maybe_translate_local_file_mounts_and_sync_up(task: 'task_lib.Task',
629
847
  if os.path.isfile(os.path.abspath(os.path.expanduser(src))):
630
848
  copy_mounts_with_file_in_src[dst] = src
631
849
  continue
632
- bucket_name = constants.FILE_MOUNTS_BUCKET_NAME.format(
633
- username=common_utils.get_cleaned_username(),
634
- id=f'{run_id}-{i}',
635
- )
636
- new_storage_mounts[dst] = storage_lib.Storage.from_yaml_config({
637
- 'name': bucket_name,
638
- 'source': src,
639
- 'persistent': False,
640
- 'mode': 'COPY',
641
- })
642
- logger.info(
643
- f'Folder in local file mount {src!r} will be synced to SkyPilot '
644
- f'storage {bucket_name}.')
850
+ bucket_sub_path = _sub_path_join(
851
+ sub_path, constants.FILE_MOUNTS_SUBPATH.format(i=i, run_id=run_id))
852
+ stores = None
853
+ if store_type is not None:
854
+ stores = [store_type]
855
+ storage_obj = storage_lib.Storage(name=bucket_name,
856
+ source=src,
857
+ persistent=False,
858
+ mode=storage_lib.StorageMode.COPY,
859
+ stores=stores,
860
+ _is_sky_managed=not bucket_wth_prefix,
861
+ _bucket_sub_path=bucket_sub_path)
862
+ new_storage_mounts[dst] = storage_obj
863
+ logger.info(f' {colorama.Style.DIM}Folder : {src!r} '
864
+ f'-> storage: {bucket_name!r}.{colorama.Style.RESET_ALL}')
645
865
 
646
866
  # Step 3: Translate local file mounts with file in src to SkyPilot storage.
647
867
  # Hard link the files in src to a temporary directory, and upload folder.
648
- local_fm_path = os.path.join(
649
- tempfile.gettempdir(),
650
- constants.FILE_MOUNTS_LOCAL_TMP_DIR.format(id=run_id))
651
- os.makedirs(local_fm_path, exist_ok=True)
652
- file_bucket_name = constants.FILE_MOUNTS_FILE_ONLY_BUCKET_NAME.format(
653
- username=common_utils.get_cleaned_username(), id=run_id)
654
- file_mount_remote_tmp_dir = constants.FILE_MOUNTS_REMOTE_TMP_DIR.format(
655
- path)
656
- if copy_mounts_with_file_in_src:
657
- src_to_file_id = {}
658
- for i, src in enumerate(set(copy_mounts_with_file_in_src.values())):
659
- src_to_file_id[src] = i
660
- os.link(os.path.abspath(os.path.expanduser(src)),
661
- os.path.join(local_fm_path, f'file-{i}'))
662
-
663
- new_storage_mounts[
664
- file_mount_remote_tmp_dir] = storage_lib.Storage.from_yaml_config({
665
- 'name': file_bucket_name,
666
- 'source': local_fm_path,
667
- 'persistent': False,
668
- 'mode': 'MOUNT',
669
- })
670
- if file_mount_remote_tmp_dir in original_storage_mounts:
671
- with ux_utils.print_exception_no_traceback():
672
- raise ValueError(
673
- 'Failed to translate file mounts, due to the default '
674
- f'destination {file_mount_remote_tmp_dir} '
675
- 'being taken.')
676
- sources = list(src_to_file_id.keys())
677
- sources_str = '\n\t'.join(sources)
678
- logger.info('Source files in file_mounts will be synced to '
679
- f'cloud storage {file_bucket_name}:'
680
- f'\n\t{sources_str}')
681
- task.update_storage_mounts(new_storage_mounts)
682
-
683
- # Step 4: Upload storage from sources
684
- # Upload the local source to a bucket. The task will not be executed
685
- # locally, so we need to upload the files/folders to the bucket manually
686
- # here before sending the task to the remote jobs controller.
687
- if task.storage_mounts:
688
- # There may be existing (non-translated) storage mounts, so log this
689
- # whenever task.storage_mounts is non-empty.
690
- logger.info(f'{colorama.Fore.YELLOW}Uploading sources to cloud storage.'
691
- f'{colorama.Style.RESET_ALL} See: sky storage ls')
692
- try:
693
- task.sync_storage_mounts()
694
- except ValueError as e:
695
- if 'No enabled cloud for storage' in str(e):
696
- data_src = None
697
- if has_local_source_paths_file_mounts:
698
- data_src = 'file_mounts'
699
- if has_local_source_paths_workdir:
700
- if data_src:
701
- data_src += ' and workdir'
702
- else:
703
- data_src = 'workdir'
704
- store_enabled_clouds = ', '.join(storage_lib.STORE_ENABLED_CLOUDS)
705
- with ux_utils.print_exception_no_traceback():
706
- raise exceptions.NotSupportedError(
707
- f'Unable to use {data_src} - no cloud with object store '
708
- 'is enabled. Please enable at least one cloud with '
709
- f'object store support ({store_enabled_clouds}) by running '
710
- f'`sky check`, or remove {data_src} from your task.'
711
- '\nHint: If you do not have any cloud access, you may still'
712
- ' download data and code over the network using curl or '
713
- 'other tools in the `setup` section of the task.') from None
868
+ file_mounts_tmp_subpath = _sub_path_join(
869
+ sub_path, constants.FILE_MOUNTS_TMP_SUBPATH.format(run_id=run_id))
870
+ base_tmp_dir = os.path.expanduser(constants.FILE_MOUNTS_LOCAL_TMP_BASE_PATH)
871
+ os.makedirs(base_tmp_dir, exist_ok=True)
872
+ with tempfile.TemporaryDirectory(dir=base_tmp_dir) as temp_path:
873
+ local_fm_path = os.path.join(
874
+ temp_path, constants.FILE_MOUNTS_LOCAL_TMP_DIR.format(id=run_id))
875
+ os.makedirs(local_fm_path, exist_ok=True)
876
+ file_mount_remote_tmp_dir = constants.FILE_MOUNTS_REMOTE_TMP_DIR.format(
877
+ task_type)
878
+ if copy_mounts_with_file_in_src:
879
+ src_to_file_id = {}
880
+ for i, src in enumerate(set(copy_mounts_with_file_in_src.values())):
881
+ src_to_file_id[src] = i
882
+ os.link(os.path.abspath(os.path.expanduser(src)),
883
+ os.path.join(local_fm_path, f'file-{i}'))
884
+ stores = None
885
+ if store_type is not None:
886
+ stores = [store_type]
887
+ storage_obj = storage_lib.Storage(
888
+ name=bucket_name,
889
+ source=local_fm_path,
890
+ persistent=False,
891
+ mode=storage_lib.StorageMode.MOUNT,
892
+ stores=stores,
893
+ _is_sky_managed=not bucket_wth_prefix,
894
+ _bucket_sub_path=file_mounts_tmp_subpath)
895
+
896
+ new_storage_mounts[file_mount_remote_tmp_dir] = storage_obj
897
+ if file_mount_remote_tmp_dir in original_storage_mounts:
898
+ with ux_utils.print_exception_no_traceback():
899
+ raise ValueError(
900
+ 'Failed to translate file mounts, due to the default '
901
+ f'destination {file_mount_remote_tmp_dir} '
902
+ 'being taken.')
903
+ sources = list(src_to_file_id.keys())
904
+ sources_str = '\n '.join(sources)
905
+ logger.info(f' {colorama.Style.DIM}Files (listed below) '
906
+ f' -> storage: {bucket_name}:'
907
+ f'\n {sources_str}{colorama.Style.RESET_ALL}')
908
+
909
+ rich_utils.force_update_status(
910
+ ux_utils.spinner_message(
911
+ 'Uploading translated local files/folders'))
912
+ task.update_storage_mounts(new_storage_mounts)
913
+
914
+ # Step 4: Upload storage from sources
915
+ # Upload the local source to a bucket. The task will not be executed
916
+ # locally, so we need to upload the files/folders to the bucket manually
917
+ # here before sending the task to the remote jobs controller. This will
918
+ # also upload any storage mounts that are not translated. After
919
+ # sync_storage_mounts, we will also have file_mounts in the task, but
920
+ # these aren't used since the storage_mounts for the same paths take
921
+ # precedence.
922
+ if task.storage_mounts:
923
+ # There may be existing (non-translated) storage mounts, so log this
924
+ # whenever task.storage_mounts is non-empty.
925
+ rich_utils.force_update_status(
926
+ ux_utils.spinner_message(
927
+ 'Uploading local sources to storage[/] '
928
+ '[dim]View storages: sky storage ls'))
929
+ try:
930
+ task.sync_storage_mounts()
931
+ except (ValueError, exceptions.NoCloudAccessError) as e:
932
+ if 'No enabled cloud for storage' in str(e) or isinstance(
933
+ e, exceptions.NoCloudAccessError):
934
+ data_src = None
935
+ if has_local_source_paths_file_mounts:
936
+ data_src = 'file_mounts'
937
+ if has_local_source_paths_workdir:
938
+ if data_src:
939
+ data_src += ' and workdir'
940
+ else:
941
+ data_src = 'workdir'
942
+ store_enabled_clouds = ', '.join(
943
+ storage_lib.STORE_ENABLED_CLOUDS)
944
+ with ux_utils.print_exception_no_traceback():
945
+ raise exceptions.NotSupportedError(
946
+ f'Unable to use {data_src} - no cloud with object '
947
+ 'store support is enabled. Please enable at least one '
948
+ 'cloud with object store support '
949
+ f'({store_enabled_clouds}) by running `sky check`, or '
950
+ f'remove {data_src} from your task.'
951
+ '\nHint: If you do not have any cloud access, you may '
952
+ 'still download data and code over the network using '
953
+ 'curl or other tools in the `setup` section of the '
954
+ 'task.') from None
714
955
 
715
956
  # Step 5: Add the file download into the file mounts, such as
716
957
  # /original-dst: s3://spot-fm-file-only-bucket-name/file-0
@@ -718,10 +959,15 @@ def maybe_translate_local_file_mounts_and_sync_up(task: 'task_lib.Task',
718
959
  if copy_mounts_with_file_in_src:
719
960
  # file_mount_remote_tmp_dir will only exist when there are files in
720
961
  # the src for copy mounts.
721
- storage = task.storage_mounts[file_mount_remote_tmp_dir]
722
- store_type = list(storage.stores.keys())[0]
723
- store_prefix = store_type.store_prefix()
724
- bucket_url = store_prefix + file_bucket_name
962
+ storage_obj = task.storage_mounts[file_mount_remote_tmp_dir]
963
+ assert storage_obj.stores, (storage_obj.__dict__, task.to_yaml_config())
964
+ curr_store_type = list(storage_obj.stores.keys())[0]
965
+ store_object = storage_obj.stores[curr_store_type]
966
+ assert store_object is not None, (storage_obj.__dict__,
967
+ task.to_yaml_config())
968
+ bucket_url = storage_lib.StoreType.get_endpoint_url(
969
+ store_object, bucket_name)
970
+ bucket_url += f'/{file_mounts_tmp_subpath}'
725
971
  for dst, src in copy_mounts_with_file_in_src.items():
726
972
  file_id = src_to_file_id[src]
727
973
  new_file_mounts[dst] = bucket_url + f'/file-{file_id}'
@@ -733,12 +979,48 @@ def maybe_translate_local_file_mounts_and_sync_up(task: 'task_lib.Task',
733
979
  if (storage_obj.source is not None and
734
980
  not data_utils.is_cloud_store_url(storage_obj.source)):
735
981
  # Need to replace the local path with bucket URI, and remove the
736
- # name field, so that the storage mount can work on the spot
982
+ # name field, so that the storage mount can work on the jobs
737
983
  # controller.
738
984
  store_types = list(storage_obj.stores.keys())
739
985
  assert len(store_types) == 1, (
740
986
  'We only support one store type for now.', storage_obj.stores)
741
- store_type = store_types[0]
742
- store_prefix = store_type.store_prefix()
743
- storage_obj.source = f'{store_prefix}{storage_obj.name}'
987
+ curr_store_type = store_types[0]
988
+ store_object = storage_obj.stores[curr_store_type]
989
+ assert store_object is not None and storage_obj.name is not None, (
990
+ store_object, storage_obj.name)
991
+ storage_obj.source = storage_lib.StoreType.get_endpoint_url(
992
+ store_object, storage_obj.name)
744
993
  storage_obj.force_delete = True
994
+
995
+ # Step 7: Convert all `MOUNT` mode storages which don't specify a source
996
+ # to specifying a source. If the source is specified with a local path,
997
+ # it was handled in step 6.
998
+ updated_mount_storages = {}
999
+ for storage_path, storage_obj in task.storage_mounts.items():
1000
+ if (storage_obj.mode == storage_lib.StorageMode.MOUNT and
1001
+ not storage_obj.source):
1002
+ # Construct source URL with first store type and storage name
1003
+ # E.g., s3://my-storage-name
1004
+ store_types = list(storage_obj.stores.keys())
1005
+ assert len(store_types) == 1, (
1006
+ 'We only support one store type for now.', storage_obj.stores)
1007
+ curr_store_type = store_types[0]
1008
+ store_object = storage_obj.stores[curr_store_type]
1009
+ assert store_object is not None and storage_obj.name is not None, (
1010
+ store_object, storage_obj.name)
1011
+ source = storage_lib.StoreType.get_endpoint_url(
1012
+ store_object, storage_obj.name)
1013
+ assert store_object is not None and storage_obj.name is not None, (
1014
+ store_object, storage_obj.name)
1015
+ new_storage = storage_lib.Storage.from_yaml_config({
1016
+ 'source': source,
1017
+ 'persistent': storage_obj.persistent,
1018
+ 'mode': storage_lib.StorageMode.MOUNT.value,
1019
+ # We enable force delete to allow the controller to delete
1020
+ # the object store in case persistent is set to False.
1021
+ '_force_delete': True
1022
+ })
1023
+ updated_mount_storages[storage_path] = new_storage
1024
+ task.update_storage_mounts(updated_mount_storages)
1025
+ if msg:
1026
+ logger.info(ux_utils.finishing_message('Uploaded local files/folders.'))