skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (299)
  1. sky/__init__.py +64 -32
  2. sky/adaptors/aws.py +23 -6
  3. sky/adaptors/azure.py +432 -15
  4. sky/adaptors/cloudflare.py +5 -5
  5. sky/adaptors/common.py +19 -9
  6. sky/adaptors/do.py +20 -0
  7. sky/adaptors/gcp.py +3 -2
  8. sky/adaptors/kubernetes.py +122 -88
  9. sky/adaptors/nebius.py +100 -0
  10. sky/adaptors/oci.py +39 -1
  11. sky/adaptors/vast.py +29 -0
  12. sky/admin_policy.py +101 -0
  13. sky/authentication.py +117 -98
  14. sky/backends/backend.py +52 -20
  15. sky/backends/backend_utils.py +669 -557
  16. sky/backends/cloud_vm_ray_backend.py +1099 -808
  17. sky/backends/local_docker_backend.py +14 -8
  18. sky/backends/wheel_utils.py +38 -20
  19. sky/benchmark/benchmark_utils.py +22 -23
  20. sky/check.py +76 -27
  21. sky/cli.py +1586 -1139
  22. sky/client/__init__.py +1 -0
  23. sky/client/cli.py +5683 -0
  24. sky/client/common.py +345 -0
  25. sky/client/sdk.py +1765 -0
  26. sky/cloud_stores.py +283 -19
  27. sky/clouds/__init__.py +7 -2
  28. sky/clouds/aws.py +303 -112
  29. sky/clouds/azure.py +185 -179
  30. sky/clouds/cloud.py +115 -37
  31. sky/clouds/cudo.py +29 -22
  32. sky/clouds/do.py +313 -0
  33. sky/clouds/fluidstack.py +44 -54
  34. sky/clouds/gcp.py +206 -65
  35. sky/clouds/ibm.py +26 -21
  36. sky/clouds/kubernetes.py +345 -91
  37. sky/clouds/lambda_cloud.py +40 -29
  38. sky/clouds/nebius.py +297 -0
  39. sky/clouds/oci.py +129 -90
  40. sky/clouds/paperspace.py +22 -18
  41. sky/clouds/runpod.py +53 -34
  42. sky/clouds/scp.py +28 -24
  43. sky/clouds/service_catalog/__init__.py +19 -13
  44. sky/clouds/service_catalog/aws_catalog.py +29 -12
  45. sky/clouds/service_catalog/azure_catalog.py +33 -6
  46. sky/clouds/service_catalog/common.py +95 -75
  47. sky/clouds/service_catalog/constants.py +3 -3
  48. sky/clouds/service_catalog/cudo_catalog.py +13 -3
  49. sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
  50. sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
  51. sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
  52. sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
  53. sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
  54. sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
  55. sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
  56. sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
  57. sky/clouds/service_catalog/do_catalog.py +111 -0
  58. sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
  59. sky/clouds/service_catalog/gcp_catalog.py +16 -2
  60. sky/clouds/service_catalog/ibm_catalog.py +2 -2
  61. sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
  62. sky/clouds/service_catalog/lambda_catalog.py +8 -3
  63. sky/clouds/service_catalog/nebius_catalog.py +116 -0
  64. sky/clouds/service_catalog/oci_catalog.py +31 -4
  65. sky/clouds/service_catalog/paperspace_catalog.py +2 -2
  66. sky/clouds/service_catalog/runpod_catalog.py +2 -2
  67. sky/clouds/service_catalog/scp_catalog.py +2 -2
  68. sky/clouds/service_catalog/vast_catalog.py +104 -0
  69. sky/clouds/service_catalog/vsphere_catalog.py +2 -2
  70. sky/clouds/utils/aws_utils.py +65 -0
  71. sky/clouds/utils/azure_utils.py +91 -0
  72. sky/clouds/utils/gcp_utils.py +5 -9
  73. sky/clouds/utils/oci_utils.py +47 -5
  74. sky/clouds/utils/scp_utils.py +4 -3
  75. sky/clouds/vast.py +280 -0
  76. sky/clouds/vsphere.py +22 -18
  77. sky/core.py +361 -107
  78. sky/dag.py +41 -28
  79. sky/data/data_transfer.py +37 -0
  80. sky/data/data_utils.py +211 -32
  81. sky/data/mounting_utils.py +182 -30
  82. sky/data/storage.py +2118 -270
  83. sky/data/storage_utils.py +126 -5
  84. sky/exceptions.py +179 -8
  85. sky/execution.py +158 -85
  86. sky/global_user_state.py +150 -34
  87. sky/jobs/__init__.py +12 -10
  88. sky/jobs/client/__init__.py +0 -0
  89. sky/jobs/client/sdk.py +302 -0
  90. sky/jobs/constants.py +49 -11
  91. sky/jobs/controller.py +161 -99
  92. sky/jobs/dashboard/dashboard.py +171 -25
  93. sky/jobs/dashboard/templates/index.html +572 -60
  94. sky/jobs/recovery_strategy.py +157 -156
  95. sky/jobs/scheduler.py +307 -0
  96. sky/jobs/server/__init__.py +1 -0
  97. sky/jobs/server/core.py +598 -0
  98. sky/jobs/server/dashboard_utils.py +69 -0
  99. sky/jobs/server/server.py +190 -0
  100. sky/jobs/state.py +627 -122
  101. sky/jobs/utils.py +615 -206
  102. sky/models.py +27 -0
  103. sky/optimizer.py +142 -83
  104. sky/provision/__init__.py +20 -5
  105. sky/provision/aws/config.py +124 -42
  106. sky/provision/aws/instance.py +130 -53
  107. sky/provision/azure/__init__.py +7 -0
  108. sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
  109. sky/provision/azure/config.py +220 -0
  110. sky/provision/azure/instance.py +1012 -37
  111. sky/provision/common.py +31 -3
  112. sky/provision/constants.py +25 -0
  113. sky/provision/cudo/__init__.py +2 -1
  114. sky/provision/cudo/cudo_utils.py +112 -0
  115. sky/provision/cudo/cudo_wrapper.py +37 -16
  116. sky/provision/cudo/instance.py +28 -12
  117. sky/provision/do/__init__.py +11 -0
  118. sky/provision/do/config.py +14 -0
  119. sky/provision/do/constants.py +10 -0
  120. sky/provision/do/instance.py +287 -0
  121. sky/provision/do/utils.py +301 -0
  122. sky/provision/docker_utils.py +82 -46
  123. sky/provision/fluidstack/fluidstack_utils.py +57 -125
  124. sky/provision/fluidstack/instance.py +15 -43
  125. sky/provision/gcp/config.py +19 -9
  126. sky/provision/gcp/constants.py +7 -1
  127. sky/provision/gcp/instance.py +55 -34
  128. sky/provision/gcp/instance_utils.py +339 -80
  129. sky/provision/gcp/mig_utils.py +210 -0
  130. sky/provision/instance_setup.py +172 -133
  131. sky/provision/kubernetes/__init__.py +1 -0
  132. sky/provision/kubernetes/config.py +104 -90
  133. sky/provision/kubernetes/constants.py +8 -0
  134. sky/provision/kubernetes/instance.py +680 -325
  135. sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
  136. sky/provision/kubernetes/network.py +54 -20
  137. sky/provision/kubernetes/network_utils.py +70 -21
  138. sky/provision/kubernetes/utils.py +1370 -251
  139. sky/provision/lambda_cloud/__init__.py +11 -0
  140. sky/provision/lambda_cloud/config.py +10 -0
  141. sky/provision/lambda_cloud/instance.py +265 -0
  142. sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
  143. sky/provision/logging.py +1 -1
  144. sky/provision/nebius/__init__.py +11 -0
  145. sky/provision/nebius/config.py +11 -0
  146. sky/provision/nebius/instance.py +285 -0
  147. sky/provision/nebius/utils.py +318 -0
  148. sky/provision/oci/__init__.py +15 -0
  149. sky/provision/oci/config.py +51 -0
  150. sky/provision/oci/instance.py +436 -0
  151. sky/provision/oci/query_utils.py +681 -0
  152. sky/provision/paperspace/constants.py +6 -0
  153. sky/provision/paperspace/instance.py +4 -3
  154. sky/provision/paperspace/utils.py +2 -0
  155. sky/provision/provisioner.py +207 -130
  156. sky/provision/runpod/__init__.py +1 -0
  157. sky/provision/runpod/api/__init__.py +3 -0
  158. sky/provision/runpod/api/commands.py +119 -0
  159. sky/provision/runpod/api/pods.py +142 -0
  160. sky/provision/runpod/instance.py +64 -8
  161. sky/provision/runpod/utils.py +239 -23
  162. sky/provision/vast/__init__.py +10 -0
  163. sky/provision/vast/config.py +11 -0
  164. sky/provision/vast/instance.py +247 -0
  165. sky/provision/vast/utils.py +162 -0
  166. sky/provision/vsphere/common/vim_utils.py +1 -1
  167. sky/provision/vsphere/instance.py +8 -18
  168. sky/provision/vsphere/vsphere_utils.py +1 -1
  169. sky/resources.py +247 -102
  170. sky/serve/__init__.py +9 -9
  171. sky/serve/autoscalers.py +361 -299
  172. sky/serve/client/__init__.py +0 -0
  173. sky/serve/client/sdk.py +366 -0
  174. sky/serve/constants.py +12 -3
  175. sky/serve/controller.py +106 -36
  176. sky/serve/load_balancer.py +63 -12
  177. sky/serve/load_balancing_policies.py +84 -2
  178. sky/serve/replica_managers.py +42 -34
  179. sky/serve/serve_state.py +62 -32
  180. sky/serve/serve_utils.py +271 -160
  181. sky/serve/server/__init__.py +0 -0
  182. sky/serve/{core.py → server/core.py} +271 -90
  183. sky/serve/server/server.py +112 -0
  184. sky/serve/service.py +52 -16
  185. sky/serve/service_spec.py +95 -32
  186. sky/server/__init__.py +1 -0
  187. sky/server/common.py +430 -0
  188. sky/server/constants.py +21 -0
  189. sky/server/html/log.html +174 -0
  190. sky/server/requests/__init__.py +0 -0
  191. sky/server/requests/executor.py +472 -0
  192. sky/server/requests/payloads.py +487 -0
  193. sky/server/requests/queues/__init__.py +0 -0
  194. sky/server/requests/queues/mp_queue.py +76 -0
  195. sky/server/requests/requests.py +567 -0
  196. sky/server/requests/serializers/__init__.py +0 -0
  197. sky/server/requests/serializers/decoders.py +192 -0
  198. sky/server/requests/serializers/encoders.py +166 -0
  199. sky/server/server.py +1106 -0
  200. sky/server/stream_utils.py +141 -0
  201. sky/setup_files/MANIFEST.in +2 -5
  202. sky/setup_files/dependencies.py +159 -0
  203. sky/setup_files/setup.py +14 -125
  204. sky/sky_logging.py +59 -14
  205. sky/skylet/autostop_lib.py +2 -2
  206. sky/skylet/constants.py +183 -50
  207. sky/skylet/events.py +22 -10
  208. sky/skylet/job_lib.py +403 -258
  209. sky/skylet/log_lib.py +111 -71
  210. sky/skylet/log_lib.pyi +6 -0
  211. sky/skylet/providers/command_runner.py +6 -8
  212. sky/skylet/providers/ibm/node_provider.py +2 -2
  213. sky/skylet/providers/scp/config.py +11 -3
  214. sky/skylet/providers/scp/node_provider.py +8 -8
  215. sky/skylet/skylet.py +3 -1
  216. sky/skylet/subprocess_daemon.py +69 -17
  217. sky/skypilot_config.py +119 -57
  218. sky/task.py +205 -64
  219. sky/templates/aws-ray.yml.j2 +37 -7
  220. sky/templates/azure-ray.yml.j2 +27 -82
  221. sky/templates/cudo-ray.yml.j2 +7 -3
  222. sky/templates/do-ray.yml.j2 +98 -0
  223. sky/templates/fluidstack-ray.yml.j2 +7 -4
  224. sky/templates/gcp-ray.yml.j2 +26 -6
  225. sky/templates/ibm-ray.yml.j2 +3 -2
  226. sky/templates/jobs-controller.yaml.j2 +46 -11
  227. sky/templates/kubernetes-ingress.yml.j2 +7 -0
  228. sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
  229. sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
  230. sky/templates/kubernetes-ray.yml.j2 +292 -25
  231. sky/templates/lambda-ray.yml.j2 +30 -40
  232. sky/templates/nebius-ray.yml.j2 +79 -0
  233. sky/templates/oci-ray.yml.j2 +18 -57
  234. sky/templates/paperspace-ray.yml.j2 +10 -6
  235. sky/templates/runpod-ray.yml.j2 +26 -4
  236. sky/templates/scp-ray.yml.j2 +3 -2
  237. sky/templates/sky-serve-controller.yaml.j2 +12 -1
  238. sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
  239. sky/templates/vast-ray.yml.j2 +70 -0
  240. sky/templates/vsphere-ray.yml.j2 +8 -3
  241. sky/templates/websocket_proxy.py +64 -0
  242. sky/usage/constants.py +10 -1
  243. sky/usage/usage_lib.py +130 -37
  244. sky/utils/accelerator_registry.py +35 -51
  245. sky/utils/admin_policy_utils.py +147 -0
  246. sky/utils/annotations.py +51 -0
  247. sky/utils/cli_utils/status_utils.py +81 -23
  248. sky/utils/cluster_utils.py +356 -0
  249. sky/utils/command_runner.py +452 -89
  250. sky/utils/command_runner.pyi +77 -3
  251. sky/utils/common.py +54 -0
  252. sky/utils/common_utils.py +319 -108
  253. sky/utils/config_utils.py +204 -0
  254. sky/utils/control_master_utils.py +48 -0
  255. sky/utils/controller_utils.py +548 -266
  256. sky/utils/dag_utils.py +93 -32
  257. sky/utils/db_utils.py +18 -4
  258. sky/utils/env_options.py +29 -7
  259. sky/utils/kubernetes/create_cluster.sh +8 -60
  260. sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
  261. sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
  262. sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
  263. sky/utils/kubernetes/gpu_labeler.py +4 -4
  264. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
  265. sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
  266. sky/utils/kubernetes/rsync_helper.sh +24 -0
  267. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
  268. sky/utils/log_utils.py +240 -33
  269. sky/utils/message_utils.py +81 -0
  270. sky/utils/registry.py +127 -0
  271. sky/utils/resources_utils.py +94 -22
  272. sky/utils/rich_utils.py +247 -18
  273. sky/utils/schemas.py +284 -64
  274. sky/{status_lib.py → utils/status_lib.py} +12 -7
  275. sky/utils/subprocess_utils.py +212 -46
  276. sky/utils/timeline.py +12 -7
  277. sky/utils/ux_utils.py +168 -15
  278. skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
  279. skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
  280. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
  281. sky/clouds/cloud_registry.py +0 -31
  282. sky/jobs/core.py +0 -330
  283. sky/skylet/providers/azure/__init__.py +0 -2
  284. sky/skylet/providers/azure/azure-vm-template.json +0 -301
  285. sky/skylet/providers/azure/config.py +0 -170
  286. sky/skylet/providers/azure/node_provider.py +0 -466
  287. sky/skylet/providers/lambda_cloud/__init__.py +0 -2
  288. sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
  289. sky/skylet/providers/oci/__init__.py +0 -2
  290. sky/skylet/providers/oci/node_provider.py +0 -488
  291. sky/skylet/providers/oci/query_helper.py +0 -383
  292. sky/skylet/providers/oci/utils.py +0 -21
  293. sky/utils/cluster_yaml_utils.py +0 -24
  294. sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
  295. skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
  296. skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
  297. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
  298. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
  299. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0

sky/provision/paperspace/instance.py
@@ -4,17 +4,17 @@ import time
 from typing import Any, Dict, List, Optional
 
 from sky import sky_logging
-from sky import status_lib
 from sky.provision import common
 from sky.provision.paperspace import utils
 from sky.utils import common_utils
+from sky.utils import status_lib
 from sky.utils import ux_utils
 
 # The maximum number of times to poll for the status of an operation.
 POLL_INTERVAL = 5
 MAX_POLLS = 60 // POLL_INTERVAL
 # Stopping instances can take several minutes, so we increase the timeout
-MAX_POLLS_FOR_UP_OR_STOP = MAX_POLLS * 8
+MAX_POLLS_FOR_UP_OR_STOP = MAX_POLLS * 16
 
 logger = sky_logging.init_logger(__name__)
 
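For context on the constant change above: with POLL_INTERVAL = 5 and MAX_POLLS = 60 // POLL_INTERVAL = 12, raising the multiplier from 8 to 16 doubles the polling budget for stopping or launching from 96 polls (8 minutes) to 192 polls (16 minutes). A quick sanity check of the arithmetic:

    POLL_INTERVAL = 5                # seconds between polls
    MAX_POLLS = 60 // POLL_INTERVAL  # 12 polls ~= one minute of waiting

    # Old budget: 12 polls * 8 = 96 polls * 5 s = 480 s (8 minutes).
    old_budget_s = MAX_POLLS * 8 * POLL_INTERVAL
    # New budget: 12 polls * 16 = 192 polls * 5 s = 960 s (16 minutes).
    new_budget_s = MAX_POLLS * 16 * POLL_INTERVAL
    assert (old_budget_s, new_budget_s) == (480, 960)
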
@@ -286,12 +286,13 @@ def query_instances(
     assert provider_config is not None, (cluster_name_on_cloud, provider_config)
     instances = _filter_instances(cluster_name_on_cloud, None)
 
+    # https://docs.digitalocean.com/reference/paperspace/core/commands/machines/#show
     status_map = {
         'starting': status_lib.ClusterStatus.INIT,
         'restarting': status_lib.ClusterStatus.INIT,
         'upgrading': status_lib.ClusterStatus.INIT,
         'provisioning': status_lib.ClusterStatus.INIT,
-        'stopping': status_lib.ClusterStatus.INIT,
+        'stopping': status_lib.ClusterStatus.STOPPED,
         'serviceready': status_lib.ClusterStatus.INIT,
         'ready': status_lib.ClusterStatus.UP,
         'off': status_lib.ClusterStatus.STOPPED,
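
The map above translates Paperspace machine states into SkyPilot's ClusterStatus enum; the behavioral change is that a machine caught mid-'stopping' is now reported as STOPPED rather than INIT. A minimal sketch of how such a map is typically applied when reporting statuses (the raw_states input and the string stand-ins for the real enum members are illustrative, not the actual Paperspace API):

    from typing import Dict, Optional

    # Illustrative provider-state -> SkyPilot-status map (strings stand in
    # for the real status_lib.ClusterStatus enum members).
    _STATUS_MAP: Dict[str, str] = {
        'starting': 'INIT',
        'stopping': 'STOPPED',  # was 'INIT' before this change
        'ready': 'UP',
        'off': 'STOPPED',
    }

    def normalize(raw_states: Dict[str, str]) -> Dict[str, Optional[str]]:
        """Map instance ids to normalized statuses; unknown states -> None."""
        return {
            instance_id: _STATUS_MAP.get(state)
            for instance_id, state in raw_states.items()
        }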

sky/provision/paperspace/utils.py
@@ -132,6 +132,8 @@ class PaperspaceCloudClient:
             'apt-get update \n'
             'apt-get install -y docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin \n'  # pylint: disable=line-too-long
             'fi \n'
+            # TODO(tian): Maybe remove this as well since we are now adding
+            # users to docker group in the DockerInitializer. Need to test.
             'usermod -aG docker paperspace \n'
             f'echo "{public_key}" >> /home/paperspace/.ssh/authorized_keys \n')
         try:

sky/provision/provisioner.py
@@ -14,9 +14,9 @@ import colorama
 
 import sky
 from sky import clouds
+from sky import exceptions
 from sky import provision
 from sky import sky_logging
-from sky import status_lib
 from sky.adaptors import aws
 from sky.backends import backend_utils
 from sky.provision import common as provision_common
@@ -25,7 +25,12 @@ from sky.provision import logging as provision_logging
 from sky.provision import metadata_utils
 from sky.skylet import constants
 from sky.utils import common_utils
+from sky.utils import message_utils
+from sky.utils import resources_utils
 from sky.utils import rich_utils
+from sky.utils import status_lib
+from sky.utils import subprocess_utils
+from sky.utils import timeline
 from sky.utils import ux_utils
 
 # Do not use __name__ as we do not want to propagate logs to sky.provision,
@@ -38,91 +43,53 @@ _MAX_RETRY = 3
 _TITLE = '\n\n' + '=' * 20 + ' {} ' + '=' * 20 + '\n'
 
 
-@dataclasses.dataclass
-class ClusterName:
-    display_name: str
-    name_on_cloud: str
-
-    def __repr__(self) -> str:
-        return repr(self.display_name)
-
-    def __str__(self) -> str:
-        return self.display_name
-
-
 def _bulk_provision(
     cloud: clouds.Cloud,
     region: clouds.Region,
-    zones: Optional[List[clouds.Zone]],
-    cluster_name: ClusterName,
+    cluster_name: resources_utils.ClusterName,
     bootstrap_config: provision_common.ProvisionConfig,
 ) -> provision_common.ProvisionRecord:
     provider_name = repr(cloud)
     region_name = region.name
 
-    style = colorama.Style
-
-    if not zones:
-        # For Azure, zones is always an empty list.
-        zone_str = 'all zones'
-    else:
-        zone_str = ','.join(z.name for z in zones)
-
-    if isinstance(cloud, clouds.Kubernetes):
-        # Omit the region name for Kubernetes.
-        logger.info(f'{style.BRIGHT}Launching on {cloud}{style.RESET_ALL} '
-                    f'{cluster_name!r}.')
-    else:
-        logger.info(f'{style.BRIGHT}Launching on {cloud} '
-                    f'{region_name}{style.RESET_ALL} ({zone_str})')
-
     start = time.time()
-    with rich_utils.safe_status('[bold cyan]Launching[/]') as status:
+    # TODO(suquark): Should we cache the bootstrapped result?
+    # Currently it is not necessary as bootstrapping takes
+    # only ~3s, caching it seems over-engineering and could
+    # cause other issues like the cache is not synced
+    # with the cloud configuration.
+    config = provision.bootstrap_instances(provider_name, region_name,
+                                           cluster_name.name_on_cloud,
+                                           bootstrap_config)
+
+    provision_record = provision.run_instances(provider_name,
+                                               region_name,
+                                               cluster_name.name_on_cloud,
+                                               config=config)
+
+    backoff = common_utils.Backoff(initial_backoff=1, max_backoff_factor=3)
+    logger.debug(f'\nWaiting for instances of {cluster_name!r} to be ready...')
+    rich_utils.force_update_status(
+        ux_utils.spinner_message('Launching - Checking instance status',
+                                 str(provision_logging.config.log_path)))
+    # AWS would take a very short time (<<1s) updating the state of the
+    # instance.
+    time.sleep(1)
+    for retry_cnt in range(_MAX_RETRY):
         try:
-            # TODO(suquark): Should we cache the bootstrapped result?
-            # Currently it is not necessary as bootstrapping takes
-            # only ~3s, caching it seems over-engineering and could
-            # cause other issues like the cache is not synced
-            # with the cloud configuration.
-            config = provision.bootstrap_instances(provider_name, region_name,
-                                                   cluster_name.name_on_cloud,
-                                                   bootstrap_config)
-        except Exception as e:
-            logger.error(f'{colorama.Fore.YELLOW}Failed to configure '
-                         f'{cluster_name!r} on {cloud} {region} ({zone_str}) '
-                         'with the following error:'
-                         f'{colorama.Style.RESET_ALL}\n'
-                         f'{common_utils.format_exception(e)}')
-            raise
-
-        provision_record = provision.run_instances(provider_name,
-                                                   region_name,
-                                                   cluster_name.name_on_cloud,
-                                                   config=config)
-
-        backoff = common_utils.Backoff(initial_backoff=1, max_backoff_factor=3)
-        logger.debug(
-            f'\nWaiting for instances of {cluster_name!r} to be ready...')
-        status.update('[bold cyan]Launching - Checking instance status[/]')
-        # AWS would take a very short time (<<1s) updating the state of the
-        # instance.
-        time.sleep(1)
-        for retry_cnt in range(_MAX_RETRY):
-            try:
-                provision.wait_instances(provider_name,
-                                         region_name,
-                                         cluster_name.name_on_cloud,
-                                         state=status_lib.ClusterStatus.UP)
-                break
-            except (aws.botocore_exceptions().WaiterError, RuntimeError):
-                time.sleep(backoff.current_backoff())
-        else:
-            raise RuntimeError(
-                f'Failed to wait for instances of {cluster_name!r} to be '
-                f'ready on the cloud provider after max retries {_MAX_RETRY}.')
-        logger.debug(
-            f'Instances of {cluster_name!r} are ready after {retry_cnt} '
-            'retries.')
+            provision.wait_instances(provider_name,
+                                     region_name,
+                                     cluster_name.name_on_cloud,
+                                     state=status_lib.ClusterStatus.UP)
+            break
+        except (aws.botocore_exceptions().WaiterError, RuntimeError):
+            time.sleep(backoff.current_backoff())
+    else:
+        raise RuntimeError(
+            f'Failed to wait for instances of {cluster_name!r} to be '
+            f'ready on the cloud provider after max retries {_MAX_RETRY}.')
+    logger.debug(f'Instances of {cluster_name!r} are ready after {retry_cnt} '
+                 'retries.')
 
     logger.debug(
         f'\nProvisioning {cluster_name!r} took {time.time() - start:.2f} '
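
The rewritten _bulk_provision above leans on Python's for/else idiom: the else branch runs only when the loop finishes without break, i.e. when every retry failed. A self-contained sketch of the same retry shape, with a toy Backoff class standing in for common_utils.Backoff and a caller-supplied check standing in for provision.wait_instances:

    import time
    from typing import Callable

    _MAX_RETRY = 3

    class Backoff:
        """Doubling backoff, similar in spirit to common_utils.Backoff."""

        def __init__(self, initial_backoff: float = 1,
                     max_backoff_factor: int = 3):
            self._backoff = initial_backoff
            self._max = initial_backoff * max_backoff_factor

        def current_backoff(self) -> float:
            current = self._backoff
            self._backoff = min(self._backoff * 2, self._max)
            return current

    def wait_until_ready(check: Callable[[], None]) -> int:
        """Retry `check` up to _MAX_RETRY times; return the retry count used."""
        backoff = Backoff()
        for retry_cnt in range(_MAX_RETRY):
            try:
                check()  # raises RuntimeError while instances are still pending
                break    # success: the `else` clause below is skipped
            except RuntimeError:
                time.sleep(backoff.current_backoff())
        else:
            # Reached only if the loop exhausted all retries without break-ing.
            raise RuntimeError(f'Not ready after {_MAX_RETRY} retries.')
        return retry_cnt
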
@@ -135,11 +102,12 @@ def bulk_provision(
     cloud: clouds.Cloud,
     region: clouds.Region,
     zones: Optional[List[clouds.Zone]],
-    cluster_name: ClusterName,
+    cluster_name: resources_utils.ClusterName,
     num_nodes: int,
     cluster_yaml: str,
     prev_cluster_ever_up: bool,
     log_dir: str,
+    ports_to_open_on_launch: Optional[List[int]] = None,
 ) -> provision_common.ProvisionRecord:
     """Provisions a cluster and wait until fully provisioned.
 
@@ -161,7 +129,8 @@ def bulk_provision(
         ['node_config'],
         count=num_nodes,
         tags={},
-        resume_stopped_nodes=True)
+        resume_stopped_nodes=True,
+        ports_to_open_on_launch=ports_to_open_on_launch)
 
     with provision_logging.setup_provision_logging(log_dir):
         try:
@@ -171,8 +140,11 @@ def bulk_provision(
             logger.debug(
                 'Provision config:\n'
                 f'{json.dumps(dataclasses.asdict(bootstrap_config), indent=2)}')
-            return _bulk_provision(cloud, region, zones, cluster_name,
+            return _bulk_provision(cloud, region, cluster_name,
                                    bootstrap_config)
+        except exceptions.NoClusterLaunchedError:
+            # Skip the teardown if the cluster was never launched.
+            raise
         except Exception:  # pylint: disable=broad-except
             zone_str = 'all zones'
             if zones:
@@ -225,7 +197,7 @@ def bulk_provision(
             raise
 
 
-def teardown_cluster(cloud_name: str, cluster_name: ClusterName,
+def teardown_cluster(cloud_name: str, cluster_name: resources_utils.ClusterName,
                      terminate: bool, provider_config: Dict) -> None:
     """Deleting or stopping a cluster.
 
@@ -268,6 +240,8 @@ def _ssh_probe_command(ip: str,
         '-o',
         'IdentitiesOnly=yes',
         '-o',
+        'AddKeysToAgent=yes',
+        '-o',
         'ExitOnForwardFailure=yes',
         '-o',
         'ServerAliveInterval=5',
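
The hunk above adds AddKeysToAgent=yes to the option list that the SSH readiness probe is built from. For orientation, a stripped-down standalone probe with a similar option set might look like the following (the real SkyPilot command includes more options and proxy handling; the user, key path, and trivial remote command here are placeholders):

    import subprocess

    def ssh_probe(ip: str, port: int, user: str, key_path: str) -> bool:
        """Return True if a trivial SSH command succeeds against ip:port."""
        cmd = [
            'ssh', '-T',
            '-i', key_path,
            '-p', str(port),
            '-o', 'StrictHostKeyChecking=no',
            '-o', 'ConnectTimeout=5',
            '-o', 'IdentitiesOnly=yes',
            '-o', 'AddKeysToAgent=yes',
            '-o', 'ExitOnForwardFailure=yes',
            '-o', 'ServerAliveInterval=5',
            f'{user}@{ip}',
            'exit 0',
        ]
        completed = subprocess.run(cmd, capture_output=True, check=False)
        return completed.returncode == 0
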
@@ -371,6 +345,7 @@ def _wait_ssh_connection_indirect(ip: str,
     return True, ''
 
 
+@timeline.event
 def wait_for_ssh(cluster_info: provision_common.ClusterInfo,
                  ssh_credentials: Dict[str, str]):
     """Wait until SSH is ready.
@@ -394,14 +369,13 @@ def wait_for_ssh(cluster_info: provision_common.ClusterInfo,
     # use a queue for SSH querying
     ips = collections.deque(ip_list)
     ssh_ports = collections.deque(port_list)
-    while ips:
-        ip = ips.popleft()
-        ssh_port = ssh_ports.popleft()
-        success, stderr = waiter(ip, ssh_port, **ssh_credentials)
-        if not success:
-            ips.append(ip)
-            ssh_ports.append(ssh_port)
-            if time.time() - start > timeout:
+
+    def _retry_ssh_thread(ip_ssh_port: Tuple[str, int]):
+        ip, ssh_port = ip_ssh_port
+        success = False
+        while not success:
+            success, stderr = waiter(ip, ssh_port, **ssh_credentials)
+            if not success and time.time() - start > timeout:
                 with ux_utils.print_exception_no_traceback():
                     raise RuntimeError(
                         f'Failed to SSH to {ip} after timeout {timeout}s, with '
@@ -409,10 +383,18 @@ def wait_for_ssh(cluster_info: provision_common.ClusterInfo,
                 logger.debug('Retrying in 1 second...')
                 time.sleep(1)
 
+    # try one node and multiprocess the rest
+    if ips:
+        ip = ips.popleft()
+        ssh_port = ssh_ports.popleft()
+        _retry_ssh_thread((ip, ssh_port))
+    subprocess_utils.run_in_parallel(_retry_ssh_thread,
+                                     list(zip(ips, ssh_ports)))
+
 
 def _post_provision_setup(
-        cloud_name: str, cluster_name: ClusterName, cluster_yaml: str,
-        provision_record: provision_common.ProvisionRecord,
+        cloud_name: str, cluster_name: resources_utils.ClusterName,
+        cluster_yaml: str, provision_record: provision_common.ProvisionRecord,
         custom_resource: Optional[str]) -> provision_common.ClusterInfo:
     config_from_yaml = common_utils.read_yaml(cluster_yaml)
     provider_config = config_from_yaml.get('provider')
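
The two hunks above change wait_for_ssh from probing nodes one at a time off a shared deque to probing the first node serially and the rest concurrently via subprocess_utils.run_in_parallel, so a cluster-wide problem (bad key, blocked port) fails fast once instead of fanning out many doomed pollers. A rough standard-library equivalent of that shape (probe is a placeholder for the real waiter):

    import time
    from concurrent.futures import ThreadPoolExecutor
    from typing import Callable, List, Tuple

    def wait_all_ready(endpoints: List[Tuple[str, int]],
                       probe: Callable[[str, int], bool],
                       timeout: float = 60.0) -> None:
        start = time.time()

        def _retry(endpoint: Tuple[str, int]) -> None:
            ip, port = endpoint
            while not probe(ip, port):
                if time.time() - start > timeout:
                    raise RuntimeError(
                        f'{ip}:{port} not SSH-able within {timeout}s')
                time.sleep(1)

        if not endpoints:
            return
        # Probe one endpoint serially first: a systemic failure surfaces
        # here once, quickly, before any parallel fan-out.
        _retry(endpoints[0])
        rest = endpoints[1:]
        if rest:
            with ThreadPoolExecutor(max_workers=len(rest)) as pool:
                # list() drains the iterator so worker exceptions propagate.
                list(pool.map(_retry, rest))
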
@@ -434,35 +416,53 @@ def _post_provision_setup(
         f'{json.dumps(dataclasses.asdict(provision_record), indent=2)}\n'
         'Cluster info:\n'
         f'{json.dumps(dataclasses.asdict(cluster_info), indent=2)}')
-
     head_instance = cluster_info.get_head_instance()
     if head_instance is None:
-        raise RuntimeError(
-            f'Provision failed for cluster {cluster_name!r}. '
-            'Could not find any head instance. To fix: refresh '
-            'status with: sky status -r; and retry provisioning.')
+        e = RuntimeError(f'Provision failed for cluster {cluster_name!r}. '
+                         'Could not find any head instance. To fix: refresh '
+                         f'status with: sky status -r; and retry provisioning.')
+        setattr(e, 'detailed_reason', str(cluster_info))
+        raise e
 
     # TODO(suquark): Move wheel build here in future PRs.
     # We don't set docker_user here, as we are configuring the VM itself.
     ssh_credentials = backend_utils.ssh_credential_from_yaml(
         cluster_yaml, ssh_user=cluster_info.ssh_user)
+    docker_config = config_from_yaml.get('docker', {})
 
     with rich_utils.safe_status(
-            '[bold cyan]Launching - Waiting for SSH access[/]') as status:
-
-        logger.debug(
-            f'\nWaiting for SSH to be available for {cluster_name!r} ...')
-        wait_for_ssh(cluster_info, ssh_credentials)
-        logger.debug(f'SSH Conection ready for {cluster_name!r}')
+            ux_utils.spinner_message(
+                'Launching - Waiting for SSH access',
+                provision_logging.config.log_path)) as status:
+        # If on Kubernetes, skip SSH check since the pods are guaranteed to be
+        # ready by the provisioner, and we use kubectl instead of SSH to run the
+        # commands and rsync on the pods. SSH will still be ready after a while
+        # for the users to SSH into the pod.
+        if cloud_name.lower() != 'kubernetes':
+            logger.debug(
+                f'\nWaiting for SSH to be available for {cluster_name!r} ...')
+            wait_for_ssh(cluster_info, ssh_credentials)
+            logger.debug(f'SSH Connection ready for {cluster_name!r}')
+        vm_str = 'Instance' if cloud_name.lower() != 'kubernetes' else 'Pod'
         plural = '' if len(cluster_info.instances) == 1 else 's'
-        logger.info(f'{colorama.Fore.GREEN}Successfully provisioned '
-                    f'or found existing instance{plural}.'
-                    f'{colorama.Style.RESET_ALL}')
-
-        docker_config = config_from_yaml.get('docker', {})
+        verb = 'is' if len(cluster_info.instances) == 1 else 'are'
+        indent_str = (ux_utils.INDENT_SYMBOL
+                      if docker_config else ux_utils.INDENT_LAST_SYMBOL)
+        logger.info(f'{indent_str}{colorama.Style.DIM}{vm_str}{plural} {verb} '
+                    f'up.{colorama.Style.RESET_ALL}')
+
+        # It's promised by the cluster config that docker_config does not
+        # exist for docker-native clouds, i.e. they provide docker containers
+        # instead of full VMs, like Kubernetes and RunPod, as it requires some
+        # special handlings to run docker inside their docker virtualization.
+        # For their Docker image settings, we do them when provisioning the
+        # cluster. See provision/{cloud}/instance.py:get_cluster_info for more
+        # details.
         if docker_config:
             status.update(
-                '[bold cyan]Launching - Initializing docker container[/]')
+                ux_utils.spinner_message(
+                    'Launching - Initializing docker container',
+                    provision_logging.config.log_path))
             docker_user = instance_setup.initialize_docker(
                 cluster_name.name_on_cloud,
                 docker_config=docker_config,
@@ -476,6 +476,8 @@ def _post_provision_setup(
             cluster_info.docker_user = docker_user
             ssh_credentials['docker_user'] = docker_user
             logger.debug(f'Docker user: {docker_user}')
+            logger.info(f'{ux_utils.INDENT_LAST_SYMBOL}{colorama.Style.DIM}'
+                        f'Docker container is up.{colorama.Style.RESET_ALL}')
 
     # We mount the metadata with sky wheel for speedup.
     # NOTE: currently we mount all credentials for all nodes, because
@@ -488,8 +490,9 @@ def _post_provision_setup(
         # for later.
         file_mounts = config_from_yaml.get('file_mounts', {})
 
-        runtime_preparation_str = ('[bold cyan]Preparing SkyPilot '
-                                   'runtime ({step}/3 - {step_name})')
+        runtime_preparation_str = (ux_utils.spinner_message(
+            'Preparing SkyPilot runtime ({step}/3 - {step_name})',
+            provision_logging.config.log_path))
         status.update(
             runtime_preparation_str.format(step=1, step_name='initializing'))
         instance_setup.internal_file_mounts(cluster_name.name_on_cloud,
@@ -506,31 +509,94 @@ def _post_provision_setup(
                                             **ssh_credentials)
         head_runner = runners[0]
 
-        status.update(
-            runtime_preparation_str.format(step=3, step_name='runtime'))
-        full_ray_setup = True
-        ray_port = constants.SKY_REMOTE_RAY_PORT
-        if not provision_record.is_instance_just_booted(
-                head_instance.instance_id):
+        def is_ray_cluster_healthy(ray_status_output: str,
+                                   expected_num_nodes: int) -> bool:
+            """Parse the output of `ray status` to get #active nodes.
+
+            The output of `ray status` looks like:
+            Node status
+            ---------------------------------------------------------------
+            Active:
+             1 node_291a8b849439ad6186387c35dc76dc43f9058108f09e8b68108cf9ec
+             1 node_0945fbaaa7f0b15a19d2fd3dc48f3a1e2d7c97e4a50ca965f67acbfd
+            Pending:
+             (no pending nodes)
+            Recent failures:
+             (no failures)
+            """
+            start = ray_status_output.find('Active:')
+            end = ray_status_output.find('Pending:', start)
+            if start == -1 or end == -1:
+                return False
+            num_active_nodes = 0
+            for line in ray_status_output[start:end].split('\n'):
+                if line.strip() and not line.startswith('Active:'):
+                    num_active_nodes += 1
+            return num_active_nodes == expected_num_nodes
+
+        def check_ray_port_and_cluster_healthy() -> Tuple[int, bool, bool]:
+            head_ray_needs_restart = True
+            ray_cluster_healthy = False
+            ray_port = constants.SKY_REMOTE_RAY_PORT
+
             # Check if head node Ray is alive
             returncode, stdout, _ = head_runner.run(
                 instance_setup.RAY_STATUS_WITH_SKY_RAY_PORT_COMMAND,
                 stream_logs=False,
                 require_outputs=True)
-            if returncode:
-                logger.debug('Ray cluster on head is not up. Restarting...')
-            else:
-                logger.debug('Ray cluster on head is up.')
-                ray_port = common_utils.decode_payload(stdout)['ray_port']
-            full_ray_setup = bool(returncode)
-
-        if full_ray_setup:
+            if not returncode:
+                ray_port = message_utils.decode_payload(stdout)['ray_port']
+                logger.debug(f'Ray cluster on head is up with port {ray_port}.')
+
+            head_ray_needs_restart = bool(returncode)
+            # This is a best effort check to see if the ray cluster has expected
+            # number of nodes connected.
+            ray_cluster_healthy = (not head_ray_needs_restart and
+                                   is_ray_cluster_healthy(
+                                       stdout, cluster_info.num_instances))
+            return ray_port, ray_cluster_healthy, head_ray_needs_restart
+
+        status.update(
+            runtime_preparation_str.format(step=3, step_name='runtime'))
+
+        ray_port = constants.SKY_REMOTE_RAY_PORT
+        head_ray_needs_restart = True
+        ray_cluster_healthy = False
+        if (not provision_record.is_instance_just_booted(
+                head_instance.instance_id)):
+            # Check if head node Ray is alive
+            (ray_port, ray_cluster_healthy,
+             head_ray_needs_restart) = check_ray_port_and_cluster_healthy()
+        elif cloud_name.lower() == 'kubernetes':
+            timeout = 90  # 1.5-min maximum timeout
+            start = time.time()
+            while True:
+                # Wait until Ray cluster is ready
+                (ray_port, ray_cluster_healthy,
+                 head_ray_needs_restart) = check_ray_port_and_cluster_healthy()
+                if ray_cluster_healthy:
+                    logger.debug('Ray cluster is ready. Skip head and worker '
+                                 'node ray cluster setup.')
+                    break
+                if time.time() - start > timeout:
+                    # In most cases, the ray cluster will be ready after a few
+                    # seconds. Trigger ray start on head or worker nodes to be
+                    # safe, if the ray cluster is not ready after timeout.
+                    break
+                logger.debug('Ray cluster is not ready yet, waiting for the '
+                             'async setup to complete...')
+                time.sleep(1)
+
+        if head_ray_needs_restart:
             logger.debug('Starting Ray on the entire cluster.')
             instance_setup.start_ray_on_head_node(
                 cluster_name.name_on_cloud,
                 custom_resource=custom_resource,
                 cluster_info=cluster_info,
                 ssh_credentials=ssh_credentials)
+        else:
+            logger.debug('Ray cluster on head is ready. Skip starting ray '
+                         'cluster on head node.')
 
         # NOTE: We have to check all worker nodes to make sure they are all
         #       healthy, otherwise we can only start Ray on newly started worker
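
To make the health check above concrete, here is the docstring's sample `ray status` output run through the same counting rule (slice between 'Active:' and 'Pending:', count non-blank lines other than the 'Active:' header):

    sample = '''Node status
    ---------------------------------------------------------------
    Active:
     1 node_291a8b849439ad6186387c35dc76dc43f9058108f09e8b68108cf9ec
     1 node_0945fbaaa7f0b15a19d2fd3dc48f3a1e2d7c97e4a50ca965f67acbfd
    Pending:
     (no pending nodes)
    Recent failures:
     (no failures)
    '''

    start = sample.find('Active:')
    end = sample.find('Pending:', start)
    active_lines = [
        line for line in sample[start:end].split('\n')
        if line.strip() and not line.startswith('Active:')
    ]
    # Healthy iff the active-node count matches the expected cluster size.
    assert len(active_lines) == 2
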
@@ -541,10 +607,13 @@ def _post_provision_setup(
         #       if provision_record.is_instance_just_booted(inst.instance_id):
         #           worker_ips.append(inst.public_ip)
 
-        if cluster_info.num_instances > 1:
+        # We don't need to restart ray on worker nodes if the ray cluster is
+        # already healthy, i.e. the head node has expected number of nodes
+        # connected to the ray cluster.
+        if cluster_info.num_instances > 1 and not ray_cluster_healthy:
             instance_setup.start_ray_on_worker_nodes(
                 cluster_name.name_on_cloud,
-                no_restart=not full_ray_setup,
+                no_restart=not head_ray_needs_restart,
                 custom_resource=custom_resource,
                 # Pass the ray_port to worker nodes for backward compatibility
                 # as in some existing clusters the ray_port is not dumped with
@@ -553,18 +622,23 @@ def _post_provision_setup(
                 ray_port=ray_port,
                 cluster_info=cluster_info,
                 ssh_credentials=ssh_credentials)
+        elif ray_cluster_healthy:
+            logger.debug('Ray cluster is ready. Skip starting ray cluster on '
+                         'worker nodes.')
 
         instance_setup.start_skylet_on_head_node(cluster_name.name_on_cloud,
                                                  cluster_info, ssh_credentials)
 
-    logger.info(f'{colorama.Fore.GREEN}Successfully provisioned cluster: '
-                f'{cluster_name}{colorama.Style.RESET_ALL}')
+    logger.info(
+        ux_utils.finishing_message(f'Cluster launched: {cluster_name}.',
+                                   provision_logging.config.log_path))
     return cluster_info
 
 
+@timeline.event
 def post_provision_runtime_setup(
-        cloud_name: str, cluster_name: ClusterName, cluster_yaml: str,
-        provision_record: provision_common.ProvisionRecord,
+        cloud_name: str, cluster_name: resources_utils.ClusterName,
+        cluster_yaml: str, provision_record: provision_common.ProvisionRecord,
         custom_resource: Optional[str],
         log_dir: str) -> provision_common.ClusterInfo:
     """Run internal setup commands after provisioning and before user setup.
@@ -588,7 +662,10 @@ def post_provision_runtime_setup(
             provision_record=provision_record,
             custom_resource=custom_resource)
     except Exception:  # pylint: disable=broad-except
-        logger.error('*** Failed setting up cluster. ***')
+        logger.error(
+            ux_utils.error_message(
+                'Failed to set up SkyPilot runtime on cluster.',
+                provision_logging.config.log_path))
         logger.debug(f'Stacktrace:\n{traceback.format_exc()}')
         with ux_utils.print_exception_no_traceback():
             raise

sky/provision/runpod/__init__.py
@@ -4,6 +4,7 @@ from sky.provision.runpod.config import bootstrap_instances
 from sky.provision.runpod.instance import cleanup_ports
 from sky.provision.runpod.instance import get_cluster_info
 from sky.provision.runpod.instance import query_instances
+from sky.provision.runpod.instance import query_ports
 from sky.provision.runpod.instance import run_instances
 from sky.provision.runpod.instance import stop_instances
 from sky.provision.runpod.instance import terminate_instances

sky/provision/runpod/api/__init__.py (new file)
@@ -0,0 +1,3 @@
+"""RunPod low level API support for spot pod."""
+
+from sky.provision.runpod.api.commands import create_spot_pod