skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (299) hide show
  1. sky/__init__.py +64 -32
  2. sky/adaptors/aws.py +23 -6
  3. sky/adaptors/azure.py +432 -15
  4. sky/adaptors/cloudflare.py +5 -5
  5. sky/adaptors/common.py +19 -9
  6. sky/adaptors/do.py +20 -0
  7. sky/adaptors/gcp.py +3 -2
  8. sky/adaptors/kubernetes.py +122 -88
  9. sky/adaptors/nebius.py +100 -0
  10. sky/adaptors/oci.py +39 -1
  11. sky/adaptors/vast.py +29 -0
  12. sky/admin_policy.py +101 -0
  13. sky/authentication.py +117 -98
  14. sky/backends/backend.py +52 -20
  15. sky/backends/backend_utils.py +669 -557
  16. sky/backends/cloud_vm_ray_backend.py +1099 -808
  17. sky/backends/local_docker_backend.py +14 -8
  18. sky/backends/wheel_utils.py +38 -20
  19. sky/benchmark/benchmark_utils.py +22 -23
  20. sky/check.py +76 -27
  21. sky/cli.py +1586 -1139
  22. sky/client/__init__.py +1 -0
  23. sky/client/cli.py +5683 -0
  24. sky/client/common.py +345 -0
  25. sky/client/sdk.py +1765 -0
  26. sky/cloud_stores.py +283 -19
  27. sky/clouds/__init__.py +7 -2
  28. sky/clouds/aws.py +303 -112
  29. sky/clouds/azure.py +185 -179
  30. sky/clouds/cloud.py +115 -37
  31. sky/clouds/cudo.py +29 -22
  32. sky/clouds/do.py +313 -0
  33. sky/clouds/fluidstack.py +44 -54
  34. sky/clouds/gcp.py +206 -65
  35. sky/clouds/ibm.py +26 -21
  36. sky/clouds/kubernetes.py +345 -91
  37. sky/clouds/lambda_cloud.py +40 -29
  38. sky/clouds/nebius.py +297 -0
  39. sky/clouds/oci.py +129 -90
  40. sky/clouds/paperspace.py +22 -18
  41. sky/clouds/runpod.py +53 -34
  42. sky/clouds/scp.py +28 -24
  43. sky/clouds/service_catalog/__init__.py +19 -13
  44. sky/clouds/service_catalog/aws_catalog.py +29 -12
  45. sky/clouds/service_catalog/azure_catalog.py +33 -6
  46. sky/clouds/service_catalog/common.py +95 -75
  47. sky/clouds/service_catalog/constants.py +3 -3
  48. sky/clouds/service_catalog/cudo_catalog.py +13 -3
  49. sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
  50. sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
  51. sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
  52. sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
  53. sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
  54. sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
  55. sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
  56. sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
  57. sky/clouds/service_catalog/do_catalog.py +111 -0
  58. sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
  59. sky/clouds/service_catalog/gcp_catalog.py +16 -2
  60. sky/clouds/service_catalog/ibm_catalog.py +2 -2
  61. sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
  62. sky/clouds/service_catalog/lambda_catalog.py +8 -3
  63. sky/clouds/service_catalog/nebius_catalog.py +116 -0
  64. sky/clouds/service_catalog/oci_catalog.py +31 -4
  65. sky/clouds/service_catalog/paperspace_catalog.py +2 -2
  66. sky/clouds/service_catalog/runpod_catalog.py +2 -2
  67. sky/clouds/service_catalog/scp_catalog.py +2 -2
  68. sky/clouds/service_catalog/vast_catalog.py +104 -0
  69. sky/clouds/service_catalog/vsphere_catalog.py +2 -2
  70. sky/clouds/utils/aws_utils.py +65 -0
  71. sky/clouds/utils/azure_utils.py +91 -0
  72. sky/clouds/utils/gcp_utils.py +5 -9
  73. sky/clouds/utils/oci_utils.py +47 -5
  74. sky/clouds/utils/scp_utils.py +4 -3
  75. sky/clouds/vast.py +280 -0
  76. sky/clouds/vsphere.py +22 -18
  77. sky/core.py +361 -107
  78. sky/dag.py +41 -28
  79. sky/data/data_transfer.py +37 -0
  80. sky/data/data_utils.py +211 -32
  81. sky/data/mounting_utils.py +182 -30
  82. sky/data/storage.py +2118 -270
  83. sky/data/storage_utils.py +126 -5
  84. sky/exceptions.py +179 -8
  85. sky/execution.py +158 -85
  86. sky/global_user_state.py +150 -34
  87. sky/jobs/__init__.py +12 -10
  88. sky/jobs/client/__init__.py +0 -0
  89. sky/jobs/client/sdk.py +302 -0
  90. sky/jobs/constants.py +49 -11
  91. sky/jobs/controller.py +161 -99
  92. sky/jobs/dashboard/dashboard.py +171 -25
  93. sky/jobs/dashboard/templates/index.html +572 -60
  94. sky/jobs/recovery_strategy.py +157 -156
  95. sky/jobs/scheduler.py +307 -0
  96. sky/jobs/server/__init__.py +1 -0
  97. sky/jobs/server/core.py +598 -0
  98. sky/jobs/server/dashboard_utils.py +69 -0
  99. sky/jobs/server/server.py +190 -0
  100. sky/jobs/state.py +627 -122
  101. sky/jobs/utils.py +615 -206
  102. sky/models.py +27 -0
  103. sky/optimizer.py +142 -83
  104. sky/provision/__init__.py +20 -5
  105. sky/provision/aws/config.py +124 -42
  106. sky/provision/aws/instance.py +130 -53
  107. sky/provision/azure/__init__.py +7 -0
  108. sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
  109. sky/provision/azure/config.py +220 -0
  110. sky/provision/azure/instance.py +1012 -37
  111. sky/provision/common.py +31 -3
  112. sky/provision/constants.py +25 -0
  113. sky/provision/cudo/__init__.py +2 -1
  114. sky/provision/cudo/cudo_utils.py +112 -0
  115. sky/provision/cudo/cudo_wrapper.py +37 -16
  116. sky/provision/cudo/instance.py +28 -12
  117. sky/provision/do/__init__.py +11 -0
  118. sky/provision/do/config.py +14 -0
  119. sky/provision/do/constants.py +10 -0
  120. sky/provision/do/instance.py +287 -0
  121. sky/provision/do/utils.py +301 -0
  122. sky/provision/docker_utils.py +82 -46
  123. sky/provision/fluidstack/fluidstack_utils.py +57 -125
  124. sky/provision/fluidstack/instance.py +15 -43
  125. sky/provision/gcp/config.py +19 -9
  126. sky/provision/gcp/constants.py +7 -1
  127. sky/provision/gcp/instance.py +55 -34
  128. sky/provision/gcp/instance_utils.py +339 -80
  129. sky/provision/gcp/mig_utils.py +210 -0
  130. sky/provision/instance_setup.py +172 -133
  131. sky/provision/kubernetes/__init__.py +1 -0
  132. sky/provision/kubernetes/config.py +104 -90
  133. sky/provision/kubernetes/constants.py +8 -0
  134. sky/provision/kubernetes/instance.py +680 -325
  135. sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
  136. sky/provision/kubernetes/network.py +54 -20
  137. sky/provision/kubernetes/network_utils.py +70 -21
  138. sky/provision/kubernetes/utils.py +1370 -251
  139. sky/provision/lambda_cloud/__init__.py +11 -0
  140. sky/provision/lambda_cloud/config.py +10 -0
  141. sky/provision/lambda_cloud/instance.py +265 -0
  142. sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
  143. sky/provision/logging.py +1 -1
  144. sky/provision/nebius/__init__.py +11 -0
  145. sky/provision/nebius/config.py +11 -0
  146. sky/provision/nebius/instance.py +285 -0
  147. sky/provision/nebius/utils.py +318 -0
  148. sky/provision/oci/__init__.py +15 -0
  149. sky/provision/oci/config.py +51 -0
  150. sky/provision/oci/instance.py +436 -0
  151. sky/provision/oci/query_utils.py +681 -0
  152. sky/provision/paperspace/constants.py +6 -0
  153. sky/provision/paperspace/instance.py +4 -3
  154. sky/provision/paperspace/utils.py +2 -0
  155. sky/provision/provisioner.py +207 -130
  156. sky/provision/runpod/__init__.py +1 -0
  157. sky/provision/runpod/api/__init__.py +3 -0
  158. sky/provision/runpod/api/commands.py +119 -0
  159. sky/provision/runpod/api/pods.py +142 -0
  160. sky/provision/runpod/instance.py +64 -8
  161. sky/provision/runpod/utils.py +239 -23
  162. sky/provision/vast/__init__.py +10 -0
  163. sky/provision/vast/config.py +11 -0
  164. sky/provision/vast/instance.py +247 -0
  165. sky/provision/vast/utils.py +162 -0
  166. sky/provision/vsphere/common/vim_utils.py +1 -1
  167. sky/provision/vsphere/instance.py +8 -18
  168. sky/provision/vsphere/vsphere_utils.py +1 -1
  169. sky/resources.py +247 -102
  170. sky/serve/__init__.py +9 -9
  171. sky/serve/autoscalers.py +361 -299
  172. sky/serve/client/__init__.py +0 -0
  173. sky/serve/client/sdk.py +366 -0
  174. sky/serve/constants.py +12 -3
  175. sky/serve/controller.py +106 -36
  176. sky/serve/load_balancer.py +63 -12
  177. sky/serve/load_balancing_policies.py +84 -2
  178. sky/serve/replica_managers.py +42 -34
  179. sky/serve/serve_state.py +62 -32
  180. sky/serve/serve_utils.py +271 -160
  181. sky/serve/server/__init__.py +0 -0
  182. sky/serve/{core.py → server/core.py} +271 -90
  183. sky/serve/server/server.py +112 -0
  184. sky/serve/service.py +52 -16
  185. sky/serve/service_spec.py +95 -32
  186. sky/server/__init__.py +1 -0
  187. sky/server/common.py +430 -0
  188. sky/server/constants.py +21 -0
  189. sky/server/html/log.html +174 -0
  190. sky/server/requests/__init__.py +0 -0
  191. sky/server/requests/executor.py +472 -0
  192. sky/server/requests/payloads.py +487 -0
  193. sky/server/requests/queues/__init__.py +0 -0
  194. sky/server/requests/queues/mp_queue.py +76 -0
  195. sky/server/requests/requests.py +567 -0
  196. sky/server/requests/serializers/__init__.py +0 -0
  197. sky/server/requests/serializers/decoders.py +192 -0
  198. sky/server/requests/serializers/encoders.py +166 -0
  199. sky/server/server.py +1106 -0
  200. sky/server/stream_utils.py +141 -0
  201. sky/setup_files/MANIFEST.in +2 -5
  202. sky/setup_files/dependencies.py +159 -0
  203. sky/setup_files/setup.py +14 -125
  204. sky/sky_logging.py +59 -14
  205. sky/skylet/autostop_lib.py +2 -2
  206. sky/skylet/constants.py +183 -50
  207. sky/skylet/events.py +22 -10
  208. sky/skylet/job_lib.py +403 -258
  209. sky/skylet/log_lib.py +111 -71
  210. sky/skylet/log_lib.pyi +6 -0
  211. sky/skylet/providers/command_runner.py +6 -8
  212. sky/skylet/providers/ibm/node_provider.py +2 -2
  213. sky/skylet/providers/scp/config.py +11 -3
  214. sky/skylet/providers/scp/node_provider.py +8 -8
  215. sky/skylet/skylet.py +3 -1
  216. sky/skylet/subprocess_daemon.py +69 -17
  217. sky/skypilot_config.py +119 -57
  218. sky/task.py +205 -64
  219. sky/templates/aws-ray.yml.j2 +37 -7
  220. sky/templates/azure-ray.yml.j2 +27 -82
  221. sky/templates/cudo-ray.yml.j2 +7 -3
  222. sky/templates/do-ray.yml.j2 +98 -0
  223. sky/templates/fluidstack-ray.yml.j2 +7 -4
  224. sky/templates/gcp-ray.yml.j2 +26 -6
  225. sky/templates/ibm-ray.yml.j2 +3 -2
  226. sky/templates/jobs-controller.yaml.j2 +46 -11
  227. sky/templates/kubernetes-ingress.yml.j2 +7 -0
  228. sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
  229. sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
  230. sky/templates/kubernetes-ray.yml.j2 +292 -25
  231. sky/templates/lambda-ray.yml.j2 +30 -40
  232. sky/templates/nebius-ray.yml.j2 +79 -0
  233. sky/templates/oci-ray.yml.j2 +18 -57
  234. sky/templates/paperspace-ray.yml.j2 +10 -6
  235. sky/templates/runpod-ray.yml.j2 +26 -4
  236. sky/templates/scp-ray.yml.j2 +3 -2
  237. sky/templates/sky-serve-controller.yaml.j2 +12 -1
  238. sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
  239. sky/templates/vast-ray.yml.j2 +70 -0
  240. sky/templates/vsphere-ray.yml.j2 +8 -3
  241. sky/templates/websocket_proxy.py +64 -0
  242. sky/usage/constants.py +10 -1
  243. sky/usage/usage_lib.py +130 -37
  244. sky/utils/accelerator_registry.py +35 -51
  245. sky/utils/admin_policy_utils.py +147 -0
  246. sky/utils/annotations.py +51 -0
  247. sky/utils/cli_utils/status_utils.py +81 -23
  248. sky/utils/cluster_utils.py +356 -0
  249. sky/utils/command_runner.py +452 -89
  250. sky/utils/command_runner.pyi +77 -3
  251. sky/utils/common.py +54 -0
  252. sky/utils/common_utils.py +319 -108
  253. sky/utils/config_utils.py +204 -0
  254. sky/utils/control_master_utils.py +48 -0
  255. sky/utils/controller_utils.py +548 -266
  256. sky/utils/dag_utils.py +93 -32
  257. sky/utils/db_utils.py +18 -4
  258. sky/utils/env_options.py +29 -7
  259. sky/utils/kubernetes/create_cluster.sh +8 -60
  260. sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
  261. sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
  262. sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
  263. sky/utils/kubernetes/gpu_labeler.py +4 -4
  264. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
  265. sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
  266. sky/utils/kubernetes/rsync_helper.sh +24 -0
  267. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
  268. sky/utils/log_utils.py +240 -33
  269. sky/utils/message_utils.py +81 -0
  270. sky/utils/registry.py +127 -0
  271. sky/utils/resources_utils.py +94 -22
  272. sky/utils/rich_utils.py +247 -18
  273. sky/utils/schemas.py +284 -64
  274. sky/{status_lib.py → utils/status_lib.py} +12 -7
  275. sky/utils/subprocess_utils.py +212 -46
  276. sky/utils/timeline.py +12 -7
  277. sky/utils/ux_utils.py +168 -15
  278. skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
  279. skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
  280. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
  281. sky/clouds/cloud_registry.py +0 -31
  282. sky/jobs/core.py +0 -330
  283. sky/skylet/providers/azure/__init__.py +0 -2
  284. sky/skylet/providers/azure/azure-vm-template.json +0 -301
  285. sky/skylet/providers/azure/config.py +0 -170
  286. sky/skylet/providers/azure/node_provider.py +0 -466
  287. sky/skylet/providers/lambda_cloud/__init__.py +0 -2
  288. sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
  289. sky/skylet/providers/oci/__init__.py +0 -2
  290. sky/skylet/providers/oci/node_provider.py +0 -488
  291. sky/skylet/providers/oci/query_helper.py +0 -383
  292. sky/skylet/providers/oci/utils.py +0 -21
  293. sky/utils/cluster_yaml_utils.py +0 -24
  294. sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
  295. skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
  296. skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
  297. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
  298. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
  299. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
sky/execution.py CHANGED
@@ -3,12 +3,12 @@
3
3
  See `Stage` for a Task's life cycle.
4
4
  """
5
5
  import enum
6
- import os
6
+ import typing
7
7
  from typing import List, Optional, Tuple, Union
8
8
 
9
9
  import colorama
10
10
 
11
- import sky
11
+ from sky import admin_policy
12
12
  from sky import backends
13
13
  from sky import clouds
14
14
  from sky import global_user_state
@@ -16,14 +16,19 @@ from sky import optimizer
16
16
  from sky import sky_logging
17
17
  from sky.backends import backend_utils
18
18
  from sky.usage import usage_lib
19
+ from sky.utils import admin_policy_utils
20
+ from sky.utils import common
19
21
  from sky.utils import controller_utils
20
22
  from sky.utils import dag_utils
21
- from sky.utils import env_options
23
+ from sky.utils import resources_utils
22
24
  from sky.utils import rich_utils
23
- from sky.utils import subprocess_utils
25
+ from sky.utils import status_lib
24
26
  from sky.utils import timeline
25
27
  from sky.utils import ux_utils
26
28
 
29
+ if typing.TYPE_CHECKING:
30
+ import sky
31
+
27
32
  logger = sky_logging.init_logger(__name__)
28
33
 
29
34
 
@@ -55,8 +60,9 @@ def _maybe_clone_disk_from_cluster(clone_disk_from: Optional[str],
55
60
  with rich_utils.safe_status('Creating image from source cluster '
56
61
  f'{clone_disk_from!r}'):
57
62
  image_id = original_cloud.create_image_from_cluster(
58
- clone_disk_from,
59
- handle.cluster_name_on_cloud,
63
+ cluster_name=resources_utils.ClusterName(
64
+ display_name=clone_disk_from,
65
+ name_on_cloud=handle.cluster_name_on_cloud),
60
66
  region=handle.launched_resources.region,
61
67
  zone=handle.launched_resources.zone,
62
68
  )
@@ -98,7 +104,7 @@ def _execute(
98
104
  handle: Optional[backends.ResourceHandle] = None,
99
105
  backend: Optional[backends.Backend] = None,
100
106
  retry_until_up: bool = False,
101
- optimize_target: optimizer.OptimizeTarget = optimizer.OptimizeTarget.COST,
107
+ optimize_target: common.OptimizeTarget = common.OptimizeTarget.COST,
102
108
  stages: Optional[List[Stage]] = None,
103
109
  cluster_name: Optional[str] = None,
104
110
  detach_setup: bool = False,
@@ -106,8 +112,10 @@ def _execute(
106
112
  idle_minutes_to_autostop: Optional[int] = None,
107
113
  no_setup: bool = False,
108
114
  clone_disk_from: Optional[str] = None,
115
+ skip_unnecessary_provisioning: bool = False,
109
116
  # Internal only:
110
117
  # pylint: disable=invalid-name
118
+ _quiet_optimizer: bool = False,
111
119
  _is_launched_by_jobs_controller: bool = False,
112
120
  _is_launched_by_sky_serve_controller: bool = False,
113
121
  ) -> Tuple[Optional[int], Optional[backends.ResourceHandle]]:
@@ -126,8 +134,9 @@ def _execute(
126
134
  Note that if errors occur during provisioning/data syncing/setting up,
127
135
  the cluster will not be torn down for debugging purposes.
128
136
  stream_logs: bool; whether to stream all tasks' outputs to the client.
129
- handle: Optional[backends.ResourceHandle]; if provided, execution will use
130
- an existing backend cluster handle instead of provisioning a new one.
137
+ handle: Optional[backends.ResourceHandle]; if provided, execution will
138
+ attempt to use an existing backend cluster handle instead of
139
+ provisioning a new one.
131
140
  backend: Backend; backend to use for executing the tasks. Defaults to
132
141
  CloudVmRayBackend()
133
142
  retry_until_up: bool; whether to retry the provisioning until the cluster
@@ -148,6 +157,11 @@ def _execute(
148
157
  idle_minutes_to_autostop: int; if provided, the cluster will be set to
149
158
  autostop after this many minutes of idleness.
150
159
  no_setup: bool; whether to skip setup commands or not when (re-)launching.
160
+ clone_disk_from: Optional[str]; if set, clone the disk from the specified
161
+ cluster.
162
+ skip_unnecessary_provisioning: bool; if True, compare the calculated
163
+ cluster config to the current cluster's config. If they match, shortcut
164
+ provisioning even if we have Stage.PROVISION.
151
165
 
152
166
  Returns:
153
167
  job_id: Optional[int]; the job ID of the submitted job. None if the
@@ -156,21 +170,35 @@ def _execute(
156
170
  handle: Optional[backends.ResourceHandle]; the handle to the cluster. None
157
171
  if dryrun.
158
172
  """
173
+
159
174
  dag = dag_utils.convert_entrypoint_to_dag(entrypoint)
175
+ for task in dag.tasks:
176
+ if task.storage_mounts is not None:
177
+ for storage in task.storage_mounts.values():
178
+ # Ensure the storage is constructed.
179
+ storage.construct()
180
+ dag, _ = admin_policy_utils.apply(
181
+ dag,
182
+ request_options=admin_policy.RequestOptions(
183
+ cluster_name=cluster_name,
184
+ idle_minutes_to_autostop=idle_minutes_to_autostop,
185
+ down=down,
186
+ dryrun=dryrun,
187
+ ))
160
188
  assert len(dag) == 1, f'We support 1 task for now. {dag}'
161
189
  task = dag.tasks[0]
162
190
 
163
191
  if any(r.job_recovery is not None for r in task.resources):
164
- with ux_utils.print_exception_no_traceback():
165
- raise ValueError(
166
- 'Job recovery is specified in the task. To launch a '
167
- 'managed job, please use: sky jobs launch')
192
+ logger.warning(
193
+ f'{colorama.Style.DIM}The task has `job_recovery` specified, '
194
+ 'but is launched as an unmanaged job. It will be ignored.'
195
+ 'To enable job recovery, use managed jobs: sky jobs launch.'
196
+ f'{colorama.Style.RESET_ALL}')
168
197
 
169
198
  cluster_exists = False
170
199
  if cluster_name is not None:
171
- existing_handle = global_user_state.get_handle_from_cluster_name(
172
- cluster_name)
173
- cluster_exists = existing_handle is not None
200
+ cluster_record = global_user_state.get_cluster_from_name(cluster_name)
201
+ cluster_exists = cluster_record is not None
174
202
  # TODO(woosuk): If the cluster exists, print a warning that
175
203
  # `cpus` and `memory` are not used as a job scheduling constraint,
176
204
  # unlike `gpus`.
@@ -206,7 +234,8 @@ def _execute(
206
234
  '(after all jobs finish).'
207
235
  f'{colorama.Style.RESET_ALL}')
208
236
  idle_minutes_to_autostop = 1
209
- stages.remove(Stage.DOWN)
237
+ if Stage.DOWN in stages:
238
+ stages.remove(Stage.DOWN)
210
239
  if idle_minutes_to_autostop >= 0:
211
240
  requested_features.add(
212
241
  clouds.CloudImplementationFeatures.AUTO_TERMINATE)
@@ -238,8 +267,8 @@ def _execute(
238
267
  bold = colorama.Style.BRIGHT
239
268
  reset = colorama.Style.RESET_ALL
240
269
  logger.info(
241
- f'{yellow}Launching an unmanaged spot task, which does not '
242
- f'automatically recover from preemptions.{reset}\n{yellow}To '
270
+ f'{yellow}Launching a spot job that does not '
271
+ f'automatically recover from preemptions. To '
243
272
  'get automatic recovery, use managed job instead: '
244
273
  f'{reset}{bold}sky jobs launch{reset} {yellow}or{reset} '
245
274
  f'{bold}sky.jobs.launch(){reset}.')
@@ -253,7 +282,15 @@ def _execute(
253
282
  # no-credential machine should not enter optimize(), which
254
283
  # would directly error out ('No cloud is enabled...'). Fix
255
284
  # by moving `sky check` checks out of optimize()?
256
- dag = sky.optimize(dag, minimize=optimize_target)
285
+ controller = controller_utils.Controllers.from_name(
286
+ cluster_name)
287
+ if controller is not None:
288
+ logger.info(
289
+ f'Choosing resources for {controller.value.name}...'
290
+ )
291
+ dag = optimizer.Optimizer.optimize(dag,
292
+ minimize=optimize_target,
293
+ quiet=_quiet_optimizer)
257
294
  task = dag.tasks[0] # Keep: dag may have been deep-copied.
258
295
  assert task.best_resources is not None, task
259
296
 
@@ -267,13 +304,18 @@ def _execute(
267
304
 
268
305
  try:
269
306
  if Stage.PROVISION in stages:
270
- if handle is None:
271
- handle = backend.provision(task,
272
- task.best_resources,
273
- dryrun=dryrun,
274
- stream_logs=stream_logs,
275
- cluster_name=cluster_name,
276
- retry_until_up=retry_until_up)
307
+ assert handle is None or skip_unnecessary_provisioning, (
308
+ 'Provisioning requested, but handle is already set. PROVISION '
309
+ 'should be excluded from stages or '
310
+ 'skip_unecessary_provisioning should be set. ')
311
+ handle = backend.provision(
312
+ task,
313
+ task.best_resources,
314
+ dryrun=dryrun,
315
+ stream_logs=stream_logs,
316
+ cluster_name=cluster_name,
317
+ retry_until_up=retry_until_up,
318
+ skip_unnecessary_provisioning=skip_unnecessary_provisioning)
277
319
 
278
320
  if handle is None:
279
321
  assert dryrun, ('If not dryrun, handle must be set or '
@@ -281,11 +323,18 @@ def _execute(
281
323
  logger.info('Dryrun finished.')
282
324
  return None, None
283
325
 
284
- if Stage.SYNC_WORKDIR in stages and not dryrun:
285
- if task.workdir is not None:
286
- backend.sync_workdir(handle, task.workdir)
326
+ do_workdir = (Stage.SYNC_WORKDIR in stages and not dryrun and
327
+ task.workdir is not None)
328
+ do_file_mounts = (Stage.SYNC_FILE_MOUNTS in stages and not dryrun and
329
+ (task.file_mounts is not None or
330
+ task.storage_mounts is not None))
331
+ if do_workdir or do_file_mounts:
332
+ logger.info(ux_utils.starting_message('Syncing files.'))
333
+
334
+ if do_workdir:
335
+ backend.sync_workdir(handle, task.workdir)
287
336
 
288
- if Stage.SYNC_FILE_MOUNTS in stages and not dryrun:
337
+ if do_file_mounts:
289
338
  backend.sync_file_mounts(handle, task.file_mounts,
290
339
  task.storage_mounts)
291
340
 
@@ -318,23 +367,6 @@ def _execute(
318
367
  backend.teardown_ephemeral_storage(task)
319
368
  backend.teardown(handle, terminate=True)
320
369
  finally:
321
- controller = controller_utils.Controllers.from_name(cluster_name)
322
- if controller is None and not _is_launched_by_sky_serve_controller:
323
- # UX: print live clusters to make users aware (to save costs).
324
- #
325
- # Don't print if this job is launched by the jobs controller,
326
- # because managed jobs are serverless, there can be many of them,
327
- # and users tend to continuously monitor managed jobs using `sky
328
- # job queue`. Also don't print if this job is a skyserve controller
329
- # job or launched by a skyserve controller job, because the
330
- # redirect for this subprocess.run won't success and it will
331
- # pollute the controller logs.
332
- #
333
- # Disable the usage collection for this status command.
334
- env = dict(os.environ,
335
- **{env_options.Options.DISABLE_LOGGING.value: '1'})
336
- subprocess_utils.run(
337
- 'sky status --no-show-managed-jobs --no-show-services', env=env)
338
370
  print()
339
371
  print('\x1b[?25h', end='') # Show cursor.
340
372
  return job_id, handle
@@ -351,19 +383,19 @@ def launch(
351
383
  down: bool = False,
352
384
  stream_logs: bool = True,
353
385
  backend: Optional[backends.Backend] = None,
354
- optimize_target: optimizer.OptimizeTarget = optimizer.OptimizeTarget.COST,
355
- detach_setup: bool = False,
356
- detach_run: bool = False,
386
+ optimize_target: common.OptimizeTarget = common.OptimizeTarget.COST,
357
387
  no_setup: bool = False,
358
388
  clone_disk_from: Optional[str] = None,
389
+ fast: bool = False,
359
390
  # Internal only:
360
391
  # pylint: disable=invalid-name
392
+ _quiet_optimizer: bool = False,
361
393
  _is_launched_by_jobs_controller: bool = False,
362
394
  _is_launched_by_sky_serve_controller: bool = False,
363
395
  _disable_controller_check: bool = False,
364
396
  ) -> Tuple[Optional[int], Optional[backends.ResourceHandle]]:
365
397
  # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
366
- """Launch a cluster or task.
398
+ """Launches a cluster or task.
367
399
 
368
400
  The task's setup and run commands are executed under the task's workdir
369
401
  (when specified, it is synced to remote cluster). The task undergoes job
@@ -373,6 +405,16 @@ def launch(
373
405
  usage) a sky.Dag. In the latter case, currently it must contain a single
374
406
  task; support for pipelines/general DAGs is in experimental branches.
375
407
 
408
+ Example:
409
+ .. code-block:: python
410
+
411
+ import sky
412
+ task = sky.Task(run='echo hello SkyPilot')
413
+ task.set_resources(
414
+ sky.Resources(cloud=sky.AWS(), accelerators='V100:4'))
415
+ sky.launch(task, cluster_name='my-cluster')
416
+
417
+
376
418
  Args:
377
419
  task: sky.Task, or sky.Dag (experimental; 1-task only) to launch.
378
420
  cluster_name: name of the cluster to create/reuse. If None,
@@ -384,7 +426,7 @@ def launch(
384
426
  cluster's job queue. Idleness gets reset whenever setting-up/
385
427
  running/pending jobs are found in the job queue. Setting this
386
428
  flag is equivalent to running
387
- ``sky.launch(..., detach_run=True, ...)`` and then
429
+ ``sky.launch(...)`` and then
388
430
  ``sky.autostop(idle_minutes=<minutes>)``. If not set, the cluster
389
431
  will not be autostopped.
390
432
  down: Tear down the cluster after all jobs finish (successfully or
@@ -398,27 +440,12 @@ def launch(
398
440
  (CloudVMRayBackend).
399
441
  optimize_target: target to optimize for. Choices: OptimizeTarget.COST,
400
442
  OptimizeTarget.TIME.
401
- detach_setup: If True, run setup in non-interactive mode as part of the
402
- job itself. You can safely ctrl-c to detach from logging, and it
403
- will not interrupt the setup process. To see the logs again after
404
- detaching, use `sky logs`. To cancel setup, cancel the job via
405
- `sky cancel`. Useful for long-running setup
406
- commands.
407
- detach_run: If True, as soon as a job is submitted, return from this
408
- function and do not stream execution logs.
409
443
  no_setup: if True, do not re-run setup commands.
410
444
  clone_disk_from: [Experimental] if set, clone the disk from the
411
445
  specified cluster. This is useful to migrate the cluster to a
412
446
  different availability zone or region.
413
-
414
- Example:
415
- .. code-block:: python
416
-
417
- import sky
418
- task = sky.Task(run='echo hello SkyPilot')
419
- task.set_resources(
420
- sky.Resources(cloud=sky.AWS(), accelerators='V100:4'))
421
- sky.launch(task, cluster_name='my-cluster')
447
+ fast: [Experimental] If the cluster is already up and available,
448
+ skip provisioning and setup steps.
422
449
 
423
450
  Raises:
424
451
  exceptions.ClusterOwnerIdentityMismatchError: if the cluster is
@@ -448,26 +475,78 @@ def launch(
448
475
  handle: Optional[backends.ResourceHandle]; the handle to the cluster. None
449
476
  if dryrun.
450
477
  """
478
+
451
479
  entrypoint = task
480
+ entrypoint.validate()
452
481
  if not _disable_controller_check:
453
482
  controller_utils.check_cluster_name_not_controller(
454
483
  cluster_name, operation_str='sky.launch')
455
484
 
485
+ handle = None
486
+ stages = None
487
+ skip_unnecessary_provisioning = False
488
+ # Check if cluster exists and we are doing fast provisioning
489
+ if fast and cluster_name is not None:
490
+ cluster_status, maybe_handle = (
491
+ backend_utils.refresh_cluster_status_handle(cluster_name))
492
+ if cluster_status == status_lib.ClusterStatus.INIT:
493
+ # If the cluster is INIT, it may be provisioning. We want to prevent
494
+ # concurrent calls from queueing up many sequential reprovision
495
+ # attempts. Since provisioning will hold the cluster status lock, we
496
+ # wait to hold that lock by force refreshing the status. This will
497
+ # block until the cluster finishes provisioning, then correctly see
498
+ # that it is UP.
499
+ # TODO(cooperc): If multiple processes launched in parallel see that
500
+ # the cluster is STOPPED or does not exist, they will still all try
501
+ # to provision it, since we do not hold the lock continuously from
502
+ # the status check until the provision call. Fixing this requires a
503
+ # bigger refactor.
504
+ cluster_status, maybe_handle = (
505
+ backend_utils.refresh_cluster_status_handle(
506
+ cluster_name,
507
+ force_refresh_statuses=[
508
+ # If the cluster is INIT, we want to try to grab the
509
+ # status lock, which should block until provisioning is
510
+ # finished.
511
+ status_lib.ClusterStatus.INIT,
512
+ ],
513
+ # Wait indefinitely to obtain the lock, so that we don't
514
+ # have multiple processes launching the same cluster at
515
+ # once.
516
+ cluster_status_lock_timeout=-1,
517
+ ))
518
+ if cluster_status == status_lib.ClusterStatus.UP:
519
+ handle = maybe_handle
520
+ stages = [
521
+ # Provisioning will be short-circuited if the existing
522
+ # cluster config hash matches the calculated one.
523
+ Stage.PROVISION,
524
+ Stage.SYNC_WORKDIR,
525
+ Stage.SYNC_FILE_MOUNTS,
526
+ Stage.PRE_EXEC,
527
+ Stage.EXEC,
528
+ Stage.DOWN,
529
+ ]
530
+ skip_unnecessary_provisioning = True
531
+
456
532
  return _execute(
457
533
  entrypoint=entrypoint,
458
534
  dryrun=dryrun,
459
535
  down=down,
460
536
  stream_logs=stream_logs,
461
- handle=None,
537
+ handle=handle,
462
538
  backend=backend,
463
539
  retry_until_up=retry_until_up,
464
540
  optimize_target=optimize_target,
541
+ stages=stages,
465
542
  cluster_name=cluster_name,
466
- detach_setup=detach_setup,
467
- detach_run=detach_run,
543
+ detach_setup=True,
544
+ detach_run=True,
468
545
  idle_minutes_to_autostop=idle_minutes_to_autostop,
469
546
  no_setup=no_setup,
470
547
  clone_disk_from=clone_disk_from,
548
+ skip_unnecessary_provisioning=skip_unnecessary_provisioning,
549
+ _quiet_optimizer=_quiet_optimizer,
471
550
  _is_launched_by_jobs_controller=_is_launched_by_jobs_controller,
472
551
  _is_launched_by_sky_serve_controller=
473
552
  _is_launched_by_sky_serve_controller,
@@ -482,10 +561,9 @@ def exec( # pylint: disable=redefined-builtin
482
561
  down: bool = False,
483
562
  stream_logs: bool = True,
484
563
  backend: Optional[backends.Backend] = None,
485
- detach_run: bool = False,
486
564
  ) -> Tuple[Optional[int], Optional[backends.ResourceHandle]]:
487
565
  # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
488
- """Execute a task on an existing cluster.
566
+ """Executes a task on an existing cluster.
489
567
 
490
568
  This function performs two actions:
491
569
 
@@ -520,12 +598,11 @@ def exec( # pylint: disable=redefined-builtin
520
598
  stream_logs: if True, show the logs in the terminal.
521
599
  backend: backend to use. If None, use the default backend
522
600
  (CloudVMRayBackend).
523
- detach_run: if True, detach from logging once the task has been
524
- submitted.
525
601
 
526
602
  Raises:
527
- ValueError: if the specified cluster does not exist or is not in UP
528
- status.
603
+ ValueError: if the specified cluster is not in UP status.
604
+ sky.exceptions.ClusterDoesNotExist: if the specified cluster does not
605
+ exist.
529
606
  sky.exceptions.NotSupportedError: if the specified cluster is a
530
607
  controller that does not support this operation.
531
608
 
@@ -537,11 +614,7 @@ def exec( # pylint: disable=redefined-builtin
537
614
  if dryrun.
538
615
  """
539
616
  entrypoint = task
540
- if isinstance(entrypoint, sky.Dag):
541
- logger.warning(
542
- f'{colorama.Fore.YELLOW}Passing a sky.Dag to sky.exec() is '
543
- 'deprecated. Pass sky.Task instead.'
544
- f'{colorama.Style.RESET_ALL}')
617
+ entrypoint.validate(workdir_only=True)
545
618
  controller_utils.check_cluster_name_not_controller(cluster_name,
546
619
  operation_str='sky.exec')
547
620
 
@@ -562,5 +635,5 @@ def exec( # pylint: disable=redefined-builtin
562
635
  Stage.EXEC,
563
636
  ],
564
637
  cluster_name=cluster_name,
565
- detach_run=detach_run,
638
+ detach_run=True,
566
639
  )