skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (299)
  1. sky/__init__.py +64 -32
  2. sky/adaptors/aws.py +23 -6
  3. sky/adaptors/azure.py +432 -15
  4. sky/adaptors/cloudflare.py +5 -5
  5. sky/adaptors/common.py +19 -9
  6. sky/adaptors/do.py +20 -0
  7. sky/adaptors/gcp.py +3 -2
  8. sky/adaptors/kubernetes.py +122 -88
  9. sky/adaptors/nebius.py +100 -0
  10. sky/adaptors/oci.py +39 -1
  11. sky/adaptors/vast.py +29 -0
  12. sky/admin_policy.py +101 -0
  13. sky/authentication.py +117 -98
  14. sky/backends/backend.py +52 -20
  15. sky/backends/backend_utils.py +669 -557
  16. sky/backends/cloud_vm_ray_backend.py +1099 -808
  17. sky/backends/local_docker_backend.py +14 -8
  18. sky/backends/wheel_utils.py +38 -20
  19. sky/benchmark/benchmark_utils.py +22 -23
  20. sky/check.py +76 -27
  21. sky/cli.py +1586 -1139
  22. sky/client/__init__.py +1 -0
  23. sky/client/cli.py +5683 -0
  24. sky/client/common.py +345 -0
  25. sky/client/sdk.py +1765 -0
  26. sky/cloud_stores.py +283 -19
  27. sky/clouds/__init__.py +7 -2
  28. sky/clouds/aws.py +303 -112
  29. sky/clouds/azure.py +185 -179
  30. sky/clouds/cloud.py +115 -37
  31. sky/clouds/cudo.py +29 -22
  32. sky/clouds/do.py +313 -0
  33. sky/clouds/fluidstack.py +44 -54
  34. sky/clouds/gcp.py +206 -65
  35. sky/clouds/ibm.py +26 -21
  36. sky/clouds/kubernetes.py +345 -91
  37. sky/clouds/lambda_cloud.py +40 -29
  38. sky/clouds/nebius.py +297 -0
  39. sky/clouds/oci.py +129 -90
  40. sky/clouds/paperspace.py +22 -18
  41. sky/clouds/runpod.py +53 -34
  42. sky/clouds/scp.py +28 -24
  43. sky/clouds/service_catalog/__init__.py +19 -13
  44. sky/clouds/service_catalog/aws_catalog.py +29 -12
  45. sky/clouds/service_catalog/azure_catalog.py +33 -6
  46. sky/clouds/service_catalog/common.py +95 -75
  47. sky/clouds/service_catalog/constants.py +3 -3
  48. sky/clouds/service_catalog/cudo_catalog.py +13 -3
  49. sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
  50. sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
  51. sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
  52. sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
  53. sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
  54. sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
  55. sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
  56. sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
  57. sky/clouds/service_catalog/do_catalog.py +111 -0
  58. sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
  59. sky/clouds/service_catalog/gcp_catalog.py +16 -2
  60. sky/clouds/service_catalog/ibm_catalog.py +2 -2
  61. sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
  62. sky/clouds/service_catalog/lambda_catalog.py +8 -3
  63. sky/clouds/service_catalog/nebius_catalog.py +116 -0
  64. sky/clouds/service_catalog/oci_catalog.py +31 -4
  65. sky/clouds/service_catalog/paperspace_catalog.py +2 -2
  66. sky/clouds/service_catalog/runpod_catalog.py +2 -2
  67. sky/clouds/service_catalog/scp_catalog.py +2 -2
  68. sky/clouds/service_catalog/vast_catalog.py +104 -0
  69. sky/clouds/service_catalog/vsphere_catalog.py +2 -2
  70. sky/clouds/utils/aws_utils.py +65 -0
  71. sky/clouds/utils/azure_utils.py +91 -0
  72. sky/clouds/utils/gcp_utils.py +5 -9
  73. sky/clouds/utils/oci_utils.py +47 -5
  74. sky/clouds/utils/scp_utils.py +4 -3
  75. sky/clouds/vast.py +280 -0
  76. sky/clouds/vsphere.py +22 -18
  77. sky/core.py +361 -107
  78. sky/dag.py +41 -28
  79. sky/data/data_transfer.py +37 -0
  80. sky/data/data_utils.py +211 -32
  81. sky/data/mounting_utils.py +182 -30
  82. sky/data/storage.py +2118 -270
  83. sky/data/storage_utils.py +126 -5
  84. sky/exceptions.py +179 -8
  85. sky/execution.py +158 -85
  86. sky/global_user_state.py +150 -34
  87. sky/jobs/__init__.py +12 -10
  88. sky/jobs/client/__init__.py +0 -0
  89. sky/jobs/client/sdk.py +302 -0
  90. sky/jobs/constants.py +49 -11
  91. sky/jobs/controller.py +161 -99
  92. sky/jobs/dashboard/dashboard.py +171 -25
  93. sky/jobs/dashboard/templates/index.html +572 -60
  94. sky/jobs/recovery_strategy.py +157 -156
  95. sky/jobs/scheduler.py +307 -0
  96. sky/jobs/server/__init__.py +1 -0
  97. sky/jobs/server/core.py +598 -0
  98. sky/jobs/server/dashboard_utils.py +69 -0
  99. sky/jobs/server/server.py +190 -0
  100. sky/jobs/state.py +627 -122
  101. sky/jobs/utils.py +615 -206
  102. sky/models.py +27 -0
  103. sky/optimizer.py +142 -83
  104. sky/provision/__init__.py +20 -5
  105. sky/provision/aws/config.py +124 -42
  106. sky/provision/aws/instance.py +130 -53
  107. sky/provision/azure/__init__.py +7 -0
  108. sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
  109. sky/provision/azure/config.py +220 -0
  110. sky/provision/azure/instance.py +1012 -37
  111. sky/provision/common.py +31 -3
  112. sky/provision/constants.py +25 -0
  113. sky/provision/cudo/__init__.py +2 -1
  114. sky/provision/cudo/cudo_utils.py +112 -0
  115. sky/provision/cudo/cudo_wrapper.py +37 -16
  116. sky/provision/cudo/instance.py +28 -12
  117. sky/provision/do/__init__.py +11 -0
  118. sky/provision/do/config.py +14 -0
  119. sky/provision/do/constants.py +10 -0
  120. sky/provision/do/instance.py +287 -0
  121. sky/provision/do/utils.py +301 -0
  122. sky/provision/docker_utils.py +82 -46
  123. sky/provision/fluidstack/fluidstack_utils.py +57 -125
  124. sky/provision/fluidstack/instance.py +15 -43
  125. sky/provision/gcp/config.py +19 -9
  126. sky/provision/gcp/constants.py +7 -1
  127. sky/provision/gcp/instance.py +55 -34
  128. sky/provision/gcp/instance_utils.py +339 -80
  129. sky/provision/gcp/mig_utils.py +210 -0
  130. sky/provision/instance_setup.py +172 -133
  131. sky/provision/kubernetes/__init__.py +1 -0
  132. sky/provision/kubernetes/config.py +104 -90
  133. sky/provision/kubernetes/constants.py +8 -0
  134. sky/provision/kubernetes/instance.py +680 -325
  135. sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
  136. sky/provision/kubernetes/network.py +54 -20
  137. sky/provision/kubernetes/network_utils.py +70 -21
  138. sky/provision/kubernetes/utils.py +1370 -251
  139. sky/provision/lambda_cloud/__init__.py +11 -0
  140. sky/provision/lambda_cloud/config.py +10 -0
  141. sky/provision/lambda_cloud/instance.py +265 -0
  142. sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
  143. sky/provision/logging.py +1 -1
  144. sky/provision/nebius/__init__.py +11 -0
  145. sky/provision/nebius/config.py +11 -0
  146. sky/provision/nebius/instance.py +285 -0
  147. sky/provision/nebius/utils.py +318 -0
  148. sky/provision/oci/__init__.py +15 -0
  149. sky/provision/oci/config.py +51 -0
  150. sky/provision/oci/instance.py +436 -0
  151. sky/provision/oci/query_utils.py +681 -0
  152. sky/provision/paperspace/constants.py +6 -0
  153. sky/provision/paperspace/instance.py +4 -3
  154. sky/provision/paperspace/utils.py +2 -0
  155. sky/provision/provisioner.py +207 -130
  156. sky/provision/runpod/__init__.py +1 -0
  157. sky/provision/runpod/api/__init__.py +3 -0
  158. sky/provision/runpod/api/commands.py +119 -0
  159. sky/provision/runpod/api/pods.py +142 -0
  160. sky/provision/runpod/instance.py +64 -8
  161. sky/provision/runpod/utils.py +239 -23
  162. sky/provision/vast/__init__.py +10 -0
  163. sky/provision/vast/config.py +11 -0
  164. sky/provision/vast/instance.py +247 -0
  165. sky/provision/vast/utils.py +162 -0
  166. sky/provision/vsphere/common/vim_utils.py +1 -1
  167. sky/provision/vsphere/instance.py +8 -18
  168. sky/provision/vsphere/vsphere_utils.py +1 -1
  169. sky/resources.py +247 -102
  170. sky/serve/__init__.py +9 -9
  171. sky/serve/autoscalers.py +361 -299
  172. sky/serve/client/__init__.py +0 -0
  173. sky/serve/client/sdk.py +366 -0
  174. sky/serve/constants.py +12 -3
  175. sky/serve/controller.py +106 -36
  176. sky/serve/load_balancer.py +63 -12
  177. sky/serve/load_balancing_policies.py +84 -2
  178. sky/serve/replica_managers.py +42 -34
  179. sky/serve/serve_state.py +62 -32
  180. sky/serve/serve_utils.py +271 -160
  181. sky/serve/server/__init__.py +0 -0
  182. sky/serve/{core.py → server/core.py} +271 -90
  183. sky/serve/server/server.py +112 -0
  184. sky/serve/service.py +52 -16
  185. sky/serve/service_spec.py +95 -32
  186. sky/server/__init__.py +1 -0
  187. sky/server/common.py +430 -0
  188. sky/server/constants.py +21 -0
  189. sky/server/html/log.html +174 -0
  190. sky/server/requests/__init__.py +0 -0
  191. sky/server/requests/executor.py +472 -0
  192. sky/server/requests/payloads.py +487 -0
  193. sky/server/requests/queues/__init__.py +0 -0
  194. sky/server/requests/queues/mp_queue.py +76 -0
  195. sky/server/requests/requests.py +567 -0
  196. sky/server/requests/serializers/__init__.py +0 -0
  197. sky/server/requests/serializers/decoders.py +192 -0
  198. sky/server/requests/serializers/encoders.py +166 -0
  199. sky/server/server.py +1106 -0
  200. sky/server/stream_utils.py +141 -0
  201. sky/setup_files/MANIFEST.in +2 -5
  202. sky/setup_files/dependencies.py +159 -0
  203. sky/setup_files/setup.py +14 -125
  204. sky/sky_logging.py +59 -14
  205. sky/skylet/autostop_lib.py +2 -2
  206. sky/skylet/constants.py +183 -50
  207. sky/skylet/events.py +22 -10
  208. sky/skylet/job_lib.py +403 -258
  209. sky/skylet/log_lib.py +111 -71
  210. sky/skylet/log_lib.pyi +6 -0
  211. sky/skylet/providers/command_runner.py +6 -8
  212. sky/skylet/providers/ibm/node_provider.py +2 -2
  213. sky/skylet/providers/scp/config.py +11 -3
  214. sky/skylet/providers/scp/node_provider.py +8 -8
  215. sky/skylet/skylet.py +3 -1
  216. sky/skylet/subprocess_daemon.py +69 -17
  217. sky/skypilot_config.py +119 -57
  218. sky/task.py +205 -64
  219. sky/templates/aws-ray.yml.j2 +37 -7
  220. sky/templates/azure-ray.yml.j2 +27 -82
  221. sky/templates/cudo-ray.yml.j2 +7 -3
  222. sky/templates/do-ray.yml.j2 +98 -0
  223. sky/templates/fluidstack-ray.yml.j2 +7 -4
  224. sky/templates/gcp-ray.yml.j2 +26 -6
  225. sky/templates/ibm-ray.yml.j2 +3 -2
  226. sky/templates/jobs-controller.yaml.j2 +46 -11
  227. sky/templates/kubernetes-ingress.yml.j2 +7 -0
  228. sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
  229. sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
  230. sky/templates/kubernetes-ray.yml.j2 +292 -25
  231. sky/templates/lambda-ray.yml.j2 +30 -40
  232. sky/templates/nebius-ray.yml.j2 +79 -0
  233. sky/templates/oci-ray.yml.j2 +18 -57
  234. sky/templates/paperspace-ray.yml.j2 +10 -6
  235. sky/templates/runpod-ray.yml.j2 +26 -4
  236. sky/templates/scp-ray.yml.j2 +3 -2
  237. sky/templates/sky-serve-controller.yaml.j2 +12 -1
  238. sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
  239. sky/templates/vast-ray.yml.j2 +70 -0
  240. sky/templates/vsphere-ray.yml.j2 +8 -3
  241. sky/templates/websocket_proxy.py +64 -0
  242. sky/usage/constants.py +10 -1
  243. sky/usage/usage_lib.py +130 -37
  244. sky/utils/accelerator_registry.py +35 -51
  245. sky/utils/admin_policy_utils.py +147 -0
  246. sky/utils/annotations.py +51 -0
  247. sky/utils/cli_utils/status_utils.py +81 -23
  248. sky/utils/cluster_utils.py +356 -0
  249. sky/utils/command_runner.py +452 -89
  250. sky/utils/command_runner.pyi +77 -3
  251. sky/utils/common.py +54 -0
  252. sky/utils/common_utils.py +319 -108
  253. sky/utils/config_utils.py +204 -0
  254. sky/utils/control_master_utils.py +48 -0
  255. sky/utils/controller_utils.py +548 -266
  256. sky/utils/dag_utils.py +93 -32
  257. sky/utils/db_utils.py +18 -4
  258. sky/utils/env_options.py +29 -7
  259. sky/utils/kubernetes/create_cluster.sh +8 -60
  260. sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
  261. sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
  262. sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
  263. sky/utils/kubernetes/gpu_labeler.py +4 -4
  264. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
  265. sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
  266. sky/utils/kubernetes/rsync_helper.sh +24 -0
  267. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
  268. sky/utils/log_utils.py +240 -33
  269. sky/utils/message_utils.py +81 -0
  270. sky/utils/registry.py +127 -0
  271. sky/utils/resources_utils.py +94 -22
  272. sky/utils/rich_utils.py +247 -18
  273. sky/utils/schemas.py +284 -64
  274. sky/{status_lib.py → utils/status_lib.py} +12 -7
  275. sky/utils/subprocess_utils.py +212 -46
  276. sky/utils/timeline.py +12 -7
  277. sky/utils/ux_utils.py +168 -15
  278. skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
  279. skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
  280. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
  281. sky/clouds/cloud_registry.py +0 -31
  282. sky/jobs/core.py +0 -330
  283. sky/skylet/providers/azure/__init__.py +0 -2
  284. sky/skylet/providers/azure/azure-vm-template.json +0 -301
  285. sky/skylet/providers/azure/config.py +0 -170
  286. sky/skylet/providers/azure/node_provider.py +0 -466
  287. sky/skylet/providers/lambda_cloud/__init__.py +0 -2
  288. sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
  289. sky/skylet/providers/oci/__init__.py +0 -2
  290. sky/skylet/providers/oci/node_provider.py +0 -488
  291. sky/skylet/providers/oci/query_helper.py +0 -383
  292. sky/skylet/providers/oci/utils.py +0 -21
  293. sky/utils/cluster_yaml_utils.py +0 -24
  294. sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
  295. skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
  296. skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
  297. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
  298. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
  299. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
sky/clouds/gcp.py CHANGED
@@ -1,30 +1,32 @@
1
1
  """Google Cloud Platform."""
2
2
  import enum
3
- import functools
4
3
  import json
5
4
  import os
6
5
  import re
7
6
  import subprocess
8
7
  import time
9
8
  import typing
10
- from typing import Dict, Iterator, List, Optional, Set, Tuple
9
+ from typing import Any, Dict, Iterator, List, Optional, Set, Tuple, Union
11
10
 
12
11
  import colorama
13
12
 
14
13
  from sky import clouds
15
14
  from sky import exceptions
16
15
  from sky import sky_logging
16
+ from sky import skypilot_config
17
17
  from sky.adaptors import gcp
18
18
  from sky.clouds import service_catalog
19
19
  from sky.clouds.utils import gcp_utils
20
+ from sky.utils import annotations
20
21
  from sky.utils import common_utils
22
+ from sky.utils import registry
21
23
  from sky.utils import resources_utils
22
24
  from sky.utils import subprocess_utils
23
25
  from sky.utils import ux_utils
24
26
 
25
27
  if typing.TYPE_CHECKING:
26
28
  from sky import resources
27
- from sky import status_lib
29
+ from sky.utils import status_lib
28
30
 
29
31
  logger = sky_logging.init_logger(__name__)
30
32
 
@@ -93,6 +95,12 @@ _IMAGE_NOT_FOUND_UX_MESSAGE = (
93
95
  f'\nTo query common AI images: {colorama.Style.BRIGHT}gcloud compute images list --project deeplearning-platform-release | less{colorama.Style.RESET_ALL}'
94
96
  )
95
97
 
98
+ # Image ID tags
99
+ _DEFAULT_CPU_IMAGE_ID = 'skypilot:custom-cpu-ubuntu-2204'
100
+ # For GPU-related package version, see sky/clouds/service_catalog/images/provisioners/cuda.sh
101
+ _DEFAULT_GPU_IMAGE_ID = 'skypilot:custom-gpu-ubuntu-2204'
102
+ _DEFAULT_GPU_K80_IMAGE_ID = 'skypilot:k80-debian-10'
103
+
96
104
 
97
105
  def _run_output(cmd):
98
106
  proc = subprocess.run(cmd,
@@ -125,8 +133,11 @@ class GCPIdentityType(enum.Enum):
125
133
 
126
134
  SHARED_CREDENTIALS_FILE = ''
127
135
 
136
+ def can_credential_expire(self) -> bool:
137
+ return self == GCPIdentityType.SHARED_CREDENTIALS_FILE
138
+
128
139
 
129
- @clouds.CLOUD_REGISTRY.register
140
+ @registry.CLOUD_REGISTRY.register
130
141
  class GCP(clouds.Cloud):
131
142
  """Google Cloud Platform."""
132
143
 
@@ -160,7 +171,7 @@ class GCP(clouds.Cloud):
160
171
  # ~/.config/gcloud/application_default_credentials.json.
161
172
  f'{_INDENT_PREFIX} $ gcloud auth application-default login\n'
162
173
  f'{_INDENT_PREFIX}For more info: '
163
- 'https://skypilot.readthedocs.io/en/latest/getting-started/installation.html#google-cloud-platform-gcp' # pylint: disable=line-too-long
174
+ 'https://docs.skypilot.co/en/latest/getting-started/installation.html#google-cloud-platform-gcp' # pylint: disable=line-too-long
164
175
  )
165
176
  _APPLICATION_CREDENTIAL_HINT = (
166
177
  'Run the following commands:\n'
@@ -168,7 +179,7 @@ class GCP(clouds.Cloud):
168
179
  f'{_INDENT_PREFIX}Or set the environment variable GOOGLE_APPLICATION_CREDENTIALS '
169
180
  'to the path of your service account key file.\n'
170
181
  f'{_INDENT_PREFIX}For more info: '
171
- 'https://skypilot.readthedocs.io/en/latest/getting-started/installation.html#google-cloud-platform-gcp' # pylint: disable=line-too-long
182
+ 'https://docs.skypilot.co/en/latest/getting-started/installation.html#google-cloud-platform-gcp' # pylint: disable=line-too-long
172
183
  )
173
184
 
174
185
  _SUPPORTED_DISK_TIERS = set(resources_utils.DiskTier)
@@ -179,20 +190,33 @@ class GCP(clouds.Cloud):
179
190
  def _unsupported_features_for_resources(
180
191
  cls, resources: 'resources.Resources'
181
192
  ) -> Dict[clouds.CloudImplementationFeatures, str]:
193
+ unsupported = {}
182
194
  if gcp_utils.is_tpu_vm_pod(resources):
183
- return {
195
+ unsupported = {
184
196
  clouds.CloudImplementationFeatures.STOP: (
185
- 'TPU VM pods cannot be stopped. Please refer to: https://cloud.google.com/tpu/docs/managing-tpus-tpu-vm#stopping_your_resources'
197
+ 'TPU VM pods cannot be stopped. Please refer to: '
198
+ 'https://cloud.google.com/tpu/docs/managing-tpus-tpu-vm#stopping_your_resources'
186
199
  )
187
200
  }
188
201
  if gcp_utils.is_tpu(resources) and not gcp_utils.is_tpu_vm(resources):
189
202
  # TPU node does not support multi-node.
190
- return {
191
- clouds.CloudImplementationFeatures.MULTI_NODE:
192
- ('TPU node does not support multi-node. Please set '
193
- 'num_nodes to 1.')
194
- }
195
- return {}
203
+ unsupported[clouds.CloudImplementationFeatures.MULTI_NODE] = (
204
+ 'TPU node does not support multi-node. Please set '
205
+ 'num_nodes to 1.')
206
+ # TODO(zhwu): We probably need to store the MIG requirement in resources
207
+ # because `skypilot_config` may change for an existing cluster.
208
+ # Clusters created with MIG (only GPU clusters) cannot be stopped.
209
+ if (skypilot_config.get_nested(
210
+ ('gcp', 'managed_instance_group'),
211
+ None,
212
+ override_configs=resources.cluster_config_overrides) is not None
213
+ and resources.accelerators):
214
+ unsupported[clouds.CloudImplementationFeatures.STOP] = (
215
+ 'Managed Instance Group (MIG) does not support stopping yet.')
216
+ unsupported[clouds.CloudImplementationFeatures.SPOT_INSTANCE] = (
217
+ 'Managed Instance Group with DWS does not support '
218
+ 'spot instances.')
219
+ return unsupported
196
220
 
197
221
  @classmethod
198
222
  def max_cluster_name_length(cls) -> Optional[int]:
@@ -246,6 +270,10 @@ class GCP(clouds.Cloud):
246
270
  regions = [r for r in regions if r.zones]
247
271
  return regions
248
272
 
273
+ @classmethod
274
+ def optimize_by_zone(cls) -> bool:
275
+ return True
276
+
249
277
  @classmethod
250
278
  def zones_provision_loop(
251
279
  cls,
@@ -321,7 +349,7 @@ class GCP(clouds.Cloud):
321
349
  return find_machine is not None
322
350
 
323
351
  @classmethod
324
- @functools.lru_cache(maxsize=1)
352
+ @annotations.lru_cache(scope='global', maxsize=1)
325
353
  def _get_image_size(cls, image_id: str) -> float:
326
354
  if image_id.startswith('skypilot:'):
327
355
  return DEFAULT_GCP_IMAGE_GB
@@ -390,9 +418,10 @@ class GCP(clouds.Cloud):
390
418
  def make_deploy_resources_variables(
391
419
  self,
392
420
  resources: 'resources.Resources',
393
- cluster_name_on_cloud: str,
421
+ cluster_name: resources_utils.ClusterName,
394
422
  region: 'clouds.Region',
395
423
  zones: Optional[List['clouds.Zone']],
424
+ num_nodes: int,
396
425
  dryrun: bool = False) -> Dict[str, Optional[str]]:
397
426
  assert zones is not None, (region, zones)
398
427
 
@@ -404,7 +433,22 @@ class GCP(clouds.Cloud):
404
433
  # --no-standard-images
405
434
  # We use the debian image, as the ubuntu image has some connectivity
406
435
  # issue when first booted.
407
- image_id = 'skypilot:cpu-debian-11'
436
+ image_id = _DEFAULT_CPU_IMAGE_ID
437
+
438
+ def _failover_disk_tier() -> Optional[resources_utils.DiskTier]:
439
+ if (r.disk_tier is not None and
440
+ r.disk_tier != resources_utils.DiskTier.BEST):
441
+ return r.disk_tier
442
+ # Failover disk tier from ultra to low.
443
+ all_tiers = list(reversed(resources_utils.DiskTier))
444
+ start_index = all_tiers.index(GCP._translate_disk_tier(r.disk_tier))
445
+ while start_index < len(all_tiers):
446
+ disk_tier = all_tiers[start_index]
447
+ ok, _ = GCP.check_disk_tier(r.instance_type, disk_tier)
448
+ if ok:
449
+ return disk_tier
450
+ start_index += 1
451
+ assert False, 'Low disk tier should always be supported on GCP.'
408
452
 
409
453
  r = resources
410
454
  # Find GPU spec, if any.
@@ -419,6 +463,7 @@ class GCP(clouds.Cloud):
419
463
  'custom_resources': None,
420
464
  'use_spot': r.use_spot,
421
465
  'gcp_project_id': self.get_project_id(dryrun),
466
+ **GCP._get_disk_specs(_failover_disk_tier()),
422
467
  }
423
468
  accelerators = r.accelerators
424
469
  if accelerators is not None:
@@ -437,13 +482,16 @@ class GCP(clouds.Cloud):
437
482
  'runtime_version']
438
483
  resources_vars['tpu_node_name'] = r.accelerator_args.get(
439
484
  'tpu_name')
485
+ # TPU VMs require privileged mode for docker containers to
486
+ # access TPU devices.
487
+ resources_vars['docker_run_options'] = ['--privileged']
440
488
  else:
441
489
  # Convert to GCP names:
442
490
  # https://cloud.google.com/compute/docs/gpus
443
491
  if acc in ('A100-80GB', 'L4'):
444
492
  # A100-80GB and L4 have a different name pattern.
445
493
  resources_vars['gpu'] = f'nvidia-{acc.lower()}'
446
- elif acc == 'H100':
494
+ elif acc in ('H100', 'H100-MEGA'):
447
495
  resources_vars['gpu'] = f'nvidia-{acc.lower()}-80gb'
448
496
  else:
449
497
  resources_vars['gpu'] = 'nvidia-tesla-{}'.format(
@@ -453,10 +501,10 @@ class GCP(clouds.Cloud):
453
501
  # Though the image is called cu113, it actually has later
454
502
  # versions of CUDA as noted below.
455
503
  # CUDA driver version 470.57.02, CUDA Library 11.4
456
- image_id = 'skypilot:k80-debian-10'
504
+ image_id = _DEFAULT_GPU_K80_IMAGE_ID
457
505
  else:
458
506
  # CUDA driver version 535.86.10, CUDA Library 12.2
459
- image_id = 'skypilot:gpu-debian-11'
507
+ image_id = _DEFAULT_GPU_IMAGE_ID
460
508
 
461
509
  if (resources.image_id is not None and
462
510
  resources.extract_docker_image() is None):
@@ -477,30 +525,52 @@ class GCP(clouds.Cloud):
477
525
  resources_vars['machine_image'] = image_id
478
526
  resources_vars['image_id'] = None
479
527
 
480
- resources_vars['disk_tier'] = GCP._get_disk_type(r.disk_tier)
481
-
482
528
  firewall_rule = None
483
529
  if resources.ports is not None:
484
- firewall_rule = (
485
- USER_PORTS_FIREWALL_RULE_NAME.format(cluster_name_on_cloud))
530
+ firewall_rule = (USER_PORTS_FIREWALL_RULE_NAME.format(
531
+ cluster_name.name_on_cloud))
486
532
  resources_vars['firewall_rule'] = firewall_rule
487
533
 
488
534
  # For TPU nodes. TPU VMs do not need TPU_NAME.
489
535
  tpu_node_name = resources_vars.get('tpu_node_name')
490
536
  if gcp_utils.is_tpu(resources) and not gcp_utils.is_tpu_vm(resources):
491
537
  if tpu_node_name is None:
492
- tpu_node_name = cluster_name_on_cloud
538
+ tpu_node_name = cluster_name.name_on_cloud
493
539
 
494
540
  resources_vars['tpu_node_name'] = tpu_node_name
495
541
 
542
+ managed_instance_group_config = skypilot_config.get_nested(
543
+ ('gcp', 'managed_instance_group'),
544
+ None,
545
+ override_configs=resources.cluster_config_overrides)
546
+ use_mig = managed_instance_group_config is not None
547
+ resources_vars['gcp_use_managed_instance_group'] = use_mig
548
+ # Convert boolean to 0 or 1 in string, as GCP does not support boolean
549
+ # value in labels for TPU VM APIs.
550
+ resources_vars['gcp_use_managed_instance_group_value'] = str(
551
+ int(use_mig))
552
+ if use_mig:
553
+ resources_vars.update(managed_instance_group_config)
554
+ resources_vars[
555
+ 'force_enable_external_ips'] = skypilot_config.get_nested(
556
+ ('gcp', 'force_enable_external_ips'), False)
557
+
558
+ # Add gVNIC from config
559
+ resources_vars['enable_gvnic'] = skypilot_config.get_nested(
560
+ ('gcp', 'enable_gvnic'), False)
561
+
496
562
  return resources_vars
497
563
 
498
564
  def _get_feasible_launchable_resources(
499
565
  self, resources: 'resources.Resources'
500
- ) -> Tuple[List['resources.Resources'], List[str]]:
566
+ ) -> 'resources_utils.FeasibleResources':
501
567
  if resources.instance_type is not None:
502
568
  assert resources.is_launchable(), resources
503
- return ([resources], [])
569
+ ok, _ = GCP.check_disk_tier(resources.instance_type,
570
+ resources.disk_tier)
571
+ if not ok:
572
+ return resources_utils.FeasibleResources([], [], None)
573
+ return resources_utils.FeasibleResources([resources], [], None)
504
574
 
505
575
  if resources.accelerators is None:
506
576
  # Return a default instance type with the given number of vCPUs.
@@ -509,16 +579,20 @@ class GCP(clouds.Cloud):
509
579
  memory=resources.memory,
510
580
  disk_tier=resources.disk_tier)
511
581
  if host_vm_type is None:
512
- return ([], [])
513
- else:
514
- r = resources.copy(
515
- cloud=GCP(),
516
- instance_type=host_vm_type,
517
- accelerators=None,
518
- cpus=None,
519
- memory=None,
520
- )
521
- return ([r], [])
582
+ # TODO: Add hints to all return values in this method to help
583
+ # users understand why the resources are not launchable.
584
+ return resources_utils.FeasibleResources([], [], None)
585
+ ok, _ = GCP.check_disk_tier(host_vm_type, resources.disk_tier)
586
+ if not ok:
587
+ return resources_utils.FeasibleResources([], [], None)
588
+ r = resources.copy(
589
+ cloud=GCP(),
590
+ instance_type=host_vm_type,
591
+ accelerators=None,
592
+ cpus=None,
593
+ memory=None,
594
+ )
595
+ return resources_utils.FeasibleResources([r], [], None)
522
596
 
523
597
  # Find instance candidates to meet user's requirements
524
598
  assert len(resources.accelerators.items()
@@ -540,7 +614,8 @@ class GCP(clouds.Cloud):
540
614
  clouds='gcp')
541
615
 
542
616
  if instance_list is None:
543
- return ([], fuzzy_candidate_list)
617
+ return resources_utils.FeasibleResources([], fuzzy_candidate_list,
618
+ None)
544
619
  assert len(
545
620
  instance_list
546
621
  ) == 1, f'More than one instance type matched, {instance_list}'
@@ -555,11 +630,13 @@ class GCP(clouds.Cloud):
555
630
  if resources.cpus.endswith('+'):
556
631
  cpus = float(resources.cpus[:-1])
557
632
  if cpus > num_cpus_in_tpu_vm:
558
- return ([], fuzzy_candidate_list)
633
+ return resources_utils.FeasibleResources(
634
+ [], fuzzy_candidate_list, None)
559
635
  else:
560
636
  cpus = float(resources.cpus)
561
637
  if cpus != num_cpus_in_tpu_vm:
562
- return ([], fuzzy_candidate_list)
638
+ return resources_utils.FeasibleResources(
639
+ [], fuzzy_candidate_list, None)
563
640
  # FIXME(woosuk, wei-lin): This leverages the fact that TPU VMs
564
641
  # have 334 GB RAM, and 400 GB RAM for tpu-v4. We need to move
565
642
  # this to service catalog, instead.
@@ -568,14 +645,20 @@ class GCP(clouds.Cloud):
568
645
  if resources.memory.endswith('+'):
569
646
  memory = float(resources.memory[:-1])
570
647
  if memory > memory_in_tpu_vm:
571
- return ([], fuzzy_candidate_list)
648
+ return resources_utils.FeasibleResources(
649
+ [], fuzzy_candidate_list, None)
572
650
  else:
573
651
  memory = float(resources.memory)
574
652
  if memory != memory_in_tpu_vm:
575
- return ([], fuzzy_candidate_list)
653
+ return resources_utils.FeasibleResources(
654
+ [], fuzzy_candidate_list, None)
576
655
  else:
577
656
  host_vm_type = instance_list[0]
578
657
 
658
+ ok, _ = GCP.check_disk_tier(host_vm_type, resources.disk_tier)
659
+ if not ok:
660
+ return resources_utils.FeasibleResources([], fuzzy_candidate_list,
661
+ None)
579
662
  acc_dict = {acc: acc_count}
580
663
  r = resources.copy(
581
664
  cloud=GCP(),
@@ -584,13 +667,14 @@ class GCP(clouds.Cloud):
584
667
  cpus=None,
585
668
  memory=None,
586
669
  )
587
- return ([r], fuzzy_candidate_list)
670
+ return resources_utils.FeasibleResources([r], fuzzy_candidate_list,
671
+ None)
588
672
 
589
673
  @classmethod
590
674
  def get_accelerators_from_instance_type(
591
675
  cls,
592
676
  instance_type: str,
593
- ) -> Optional[Dict[str, int]]:
677
+ ) -> Optional[Dict[str, Union[int, float]]]:
594
678
  # GCP handles accelerators separately from regular instance types,
595
679
  # hence return none here.
596
680
  return None
@@ -675,7 +759,7 @@ class GCP(clouds.Cloud):
675
759
  project_id = cls.get_project_id()
676
760
 
677
761
  # Check if the user is activated.
678
- identity = cls.get_current_user_identity()
762
+ identity = cls.get_active_user_identity()
679
763
  except (auth.exceptions.DefaultCredentialsError,
680
764
  exceptions.CloudUserIdentityError) as e:
681
765
  # See also: https://stackoverflow.com/a/53307505/1165051
@@ -736,13 +820,13 @@ class GCP(clouds.Cloud):
736
820
 
737
821
  # pylint: disable=import-outside-toplevel,unused-import
738
822
  import google.auth
739
- import googleapiclient.discovery
740
823
 
741
824
  # This takes user's credential info from "~/.config/gcloud/application_default_credentials.json". # pylint: disable=line-too-long
742
825
  credentials, project = google.auth.default()
743
- crm = googleapiclient.discovery.build('cloudresourcemanager',
744
- 'v1',
745
- credentials=credentials)
826
+ crm = gcp.build('cloudresourcemanager',
827
+ 'v1',
828
+ credentials=credentials,
829
+ cache_discovery=False)
746
830
  gcp_minimal_permissions = gcp_utils.get_minimal_permissions()
747
831
  permissions = {'permissions': gcp_minimal_permissions}
748
832
  request = crm.projects().testIamPermissions(resource=project,
@@ -750,13 +834,13 @@ class GCP(clouds.Cloud):
750
834
  ret_permissions = request.execute().get('permissions', [])
751
835
 
752
836
  diffs = set(gcp_minimal_permissions).difference(set(ret_permissions))
753
- if len(diffs) > 0:
837
+ if diffs:
754
838
  identity_str = identity[0] if identity else None
755
839
  return False, (
756
840
  'The following permissions are not enabled for the current '
757
841
  f'GCP identity ({identity_str}):\n '
758
842
  f'{diffs}\n '
759
- 'For more details, visit: https://skypilot.readthedocs.io/en/latest/cloud-setup/cloud-permissions/gcp.html') # pylint: disable=line-too-long
843
+ 'For more details, visit: https://docs.skypilot.co/en/latest/cloud-setup/cloud-permissions/gcp.html') # pylint: disable=line-too-long
760
844
  return True, None
761
845
 
762
846
  def get_credential_file_mounts(self) -> Dict[str, str]:
@@ -783,19 +867,29 @@ class GCP(clouds.Cloud):
783
867
  pass
784
868
  return credentials
785
869
 
870
@annotations.lru_cache(scope='global', maxsize=1)
def can_credential_expire(self) -> bool:
    """Returns whether the active GCP credential can expire.

    Returns:
        False when the identity type cannot be determined (i.e.
        `_get_identity_type()` returns None); otherwise whatever the
        identity type's own `can_credential_expire()` reports.
    """
    kind = self._get_identity_type()
    if kind is None:
        return False
    return kind.can_credential_expire()
875
+
786
876
  @classmethod
787
877
  def _get_identity_type(cls) -> Optional[GCPIdentityType]:
788
878
  try:
789
- account = cls.get_current_user_identity()[0]
879
+ account = cls.get_active_user_identity()
790
880
  except exceptions.CloudUserIdentityError:
791
881
  return None
792
- if GCPIdentityType.SERVICE_ACCOUNT.value in account:
882
+ if account is None:
883
+ return None
884
+ assert account is not None
885
+ if GCPIdentityType.SERVICE_ACCOUNT.value in account[0]:
793
886
  return GCPIdentityType.SERVICE_ACCOUNT
794
887
  return GCPIdentityType.SHARED_CREDENTIALS_FILE
795
888
 
796
889
  @classmethod
797
- @functools.lru_cache(maxsize=1) # Cache since getting identity is slow.
798
- def get_current_user_identity(cls) -> List[str]:
890
+ @annotations.lru_cache(scope='request',
891
+ maxsize=1) # Cache since getting identity is slow.
892
+ def get_user_identities(cls) -> List[List[str]]:
799
893
  """Returns the email address + project id of the active user."""
800
894
  try:
801
895
  account = _run_output('gcloud auth list --filter=status:ACTIVE '
@@ -826,11 +920,13 @@ class GCP(clouds.Cloud):
826
920
  ' Reason: '
827
921
  f'{common_utils.format_exception(e, use_bracket=True)}'
828
922
  ) from e
829
- return [f'{account} [project_id={project_id}]']
923
+ # TODO: Return a list of identities in the profile when we support
924
+ # automatic switching for GCP. Currently we only support one identity.
925
+ return [[f'{account} [project_id={project_id}]']]
830
926
 
831
927
  @classmethod
832
- def get_current_user_identity_str(cls) -> Optional[str]:
833
- user_identity = cls.get_current_user_identity()
928
+ def get_active_user_identity_str(cls) -> Optional[str]:
929
+ user_identity = cls.get_active_user_identity()
834
930
  if user_identity is None:
835
931
  return None
836
932
  return user_identity[0].replace('\n', '')
@@ -871,17 +967,59 @@ class GCP(clouds.Cloud):
871
967
  resources.instance_type, resources.accelerators, resources.zone,
872
968
  'gcp')
873
969
 
970
@classmethod
def check_disk_tier(
        cls, instance_type: Optional[str],
        disk_tier: Optional[resources_utils.DiskTier]) -> Tuple[bool, str]:
    """Checks whether `disk_tier` can be used with `instance_type`.

    Only the ULTRA tier (pd-extreme) is restricted; every other tier, and
    any request without an instance type, is accepted.

    Args:
        instance_type: GCP instance type name, e.g. 'n2-standard-64'.
        disk_tier: The requested disk tier.

    Returns:
        A `(ok, reason)` tuple. `reason` is an empty string when `ok` is
        True, and a human-readable explanation otherwise.
    """
    if instance_type is None:
        return True, ''
    if disk_tier != resources_utils.DiskTier.ULTRA:
        return True, ''
    # Ultra disk tier (pd-extreme) only supports m2, m3 and part of the n2
    # instance types, so we fail over to lower tiers for other instance
    # types. Reference:
    # https://cloud.google.com/compute/docs/disks/extreme-persistent-disk#machine_shape_support # pylint: disable=line-too-long
    parts = instance_type.split('-')
    family = parts[0]
    if family not in ('m2', 'm3', 'n2'):
        return False, (f'{family} series is not supported with pd-extreme. '
                       'Only m2, m3 series and n2 series with 64 or more vCPUs '
                       'are supported.')
    # For n2, the third dash-separated field is read as the vCPU count.
    if family == 'n2' and int(parts[2]) < 64:
        return False, ('n2 series with less than 64 vCPUs are '
                       'not supported with pd-extreme.')
    return True, ''
991
+
992
@classmethod
def check_disk_tier_enabled(cls, instance_type: Optional[str],
                            disk_tier: resources_utils.DiskTier) -> None:
    """Raises if `disk_tier` cannot be used with `instance_type`.

    Args:
        instance_type: GCP instance type name, or None.
        disk_tier: The requested disk tier.

    Raises:
        exceptions.NotSupportedError: If `check_disk_tier` rejects the
            combination; the message is the reason it returned.
    """
    supported, reason = cls.check_disk_tier(instance_type, disk_tier)
    if supported:
        return
    with ux_utils.print_exception_no_traceback():
        raise exceptions.NotSupportedError(reason)
999
+
874
1000
  @classmethod
875
1001
  def _get_disk_type(cls,
876
1002
  disk_tier: Optional[resources_utils.DiskTier]) -> str:
877
1003
  tier = cls._translate_disk_tier(disk_tier)
878
1004
  tier2name = {
1005
+ resources_utils.DiskTier.ULTRA: 'pd-extreme',
879
1006
  resources_utils.DiskTier.HIGH: 'pd-ssd',
880
1007
  resources_utils.DiskTier.MEDIUM: 'pd-balanced',
881
1008
  resources_utils.DiskTier.LOW: 'pd-standard',
882
1009
  }
883
1010
  return tier2name[tier]
884
1011
 
1012
@classmethod
def _get_disk_specs(
        cls,
        disk_tier: Optional[resources_utils.DiskTier]) -> Dict[str, Any]:
    """Builds the disk settings for the given disk tier.

    Returns:
        A dict with key 'disk_tier' set to the GCP disk type name from
        `_get_disk_type`, plus 'disk_iops' (20000) when the requested tier
        is ULTRA.
    """
    disk_specs: Dict[str, Any] = {}
    disk_specs['disk_tier'] = cls._get_disk_type(disk_tier)
    # NOTE(review): this compares the raw `disk_tier`, whereas the disk type
    # above comes from the translated tier. Confirm callers resolve the tier
    # to a concrete value before calling.
    wants_ultra = disk_tier == resources_utils.DiskTier.ULTRA
    if wants_ultra:
        # Only pd-extreme supports custom iops.
        # see https://cloud.google.com/compute/docs/disks#disk-types
        disk_specs['disk_iops'] = 20000
    return disk_specs
1022
+
885
1023
  @classmethod
886
1024
  def _label_filter_str(cls, tag_filters: Dict[str, str]) -> str:
887
1025
  return ' '.join(f'labels.{k}={v}' for k, v in tag_filters.items())
@@ -976,8 +1114,8 @@ class GCP(clouds.Cloud):
976
1114
  assert False, 'This code path should not be used.'
977
1115
 
978
1116
  @classmethod
979
- def create_image_from_cluster(cls, cluster_name: str,
980
- cluster_name_on_cloud: str,
1117
+ def create_image_from_cluster(cls,
1118
+ cluster_name: resources_utils.ClusterName,
981
1119
  region: Optional[str],
982
1120
  zone: Optional[str]) -> str:
983
1121
  del region # unused
@@ -986,7 +1124,7 @@ class GCP(clouds.Cloud):
986
1124
  # `ray-cluster-name` tag, which is guaranteed by the current `ray`
987
1125
  # backend. Once the `provision.query_instances` is implemented for GCP,
988
1126
  # we should be able to get rid of this assumption.
989
- tag_filters = {'ray-cluster-name': cluster_name_on_cloud}
1127
+ tag_filters = {'ray-cluster-name': cluster_name.name_on_cloud}
990
1128
  label_filter_str = cls._label_filter_str(tag_filters)
991
1129
  instance_name_cmd = ('gcloud compute instances list '
992
1130
  f'--filter="({label_filter_str})" '
@@ -998,7 +1136,8 @@ class GCP(clouds.Cloud):
998
1136
  subprocess_utils.handle_returncode(
999
1137
  returncode,
1000
1138
  instance_name_cmd,
1001
- error_msg=f'Failed to get instance name for {cluster_name!r}',
1139
+ error_msg=
1140
+ f'Failed to get instance name for {cluster_name.display_name!r}',
1002
1141
  stderr=stderr,
1003
1142
  stream_logs=True)
1004
1143
  instance_names = json.loads(stdout)
@@ -1009,7 +1148,7 @@ class GCP(clouds.Cloud):
1009
1148
  f'instance, but got: {instance_names}')
1010
1149
  instance_name = instance_names[0]['name']
1011
1150
 
1012
- image_name = f'skypilot-{cluster_name}-{int(time.time())}'
1151
+ image_name = f'skypilot-{cluster_name.display_name}-{int(time.time())}'
1013
1152
  create_image_cmd = (f'gcloud compute images create {image_name} '
1014
1153
  f'--source-disk {instance_name} '
1015
1154
  f'--source-disk-zone {zone}')
@@ -1021,7 +1160,8 @@ class GCP(clouds.Cloud):
1021
1160
  subprocess_utils.handle_returncode(
1022
1161
  returncode,
1023
1162
  create_image_cmd,
1024
- error_msg=f'Failed to create image for {cluster_name!r}',
1163
+ error_msg=
1164
+ f'Failed to create image for {cluster_name.display_name!r}',
1025
1165
  stderr=stderr,
1026
1166
  stream_logs=True)
1027
1167
 
@@ -1035,7 +1175,8 @@ class GCP(clouds.Cloud):
1035
1175
  subprocess_utils.handle_returncode(
1036
1176
  returncode,
1037
1177
  image_uri_cmd,
1038
- error_msg=f'Failed to get image uri for {cluster_name!r}',
1178
+ error_msg=
1179
+ f'Failed to get image uri for {cluster_name.display_name!r}',
1039
1180
  stderr=stderr,
1040
1181
  stream_logs=True)
1041
1182