skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (299)
  1. sky/__init__.py +64 -32
  2. sky/adaptors/aws.py +23 -6
  3. sky/adaptors/azure.py +432 -15
  4. sky/adaptors/cloudflare.py +5 -5
  5. sky/adaptors/common.py +19 -9
  6. sky/adaptors/do.py +20 -0
  7. sky/adaptors/gcp.py +3 -2
  8. sky/adaptors/kubernetes.py +122 -88
  9. sky/adaptors/nebius.py +100 -0
  10. sky/adaptors/oci.py +39 -1
  11. sky/adaptors/vast.py +29 -0
  12. sky/admin_policy.py +101 -0
  13. sky/authentication.py +117 -98
  14. sky/backends/backend.py +52 -20
  15. sky/backends/backend_utils.py +669 -557
  16. sky/backends/cloud_vm_ray_backend.py +1099 -808
  17. sky/backends/local_docker_backend.py +14 -8
  18. sky/backends/wheel_utils.py +38 -20
  19. sky/benchmark/benchmark_utils.py +22 -23
  20. sky/check.py +76 -27
  21. sky/cli.py +1586 -1139
  22. sky/client/__init__.py +1 -0
  23. sky/client/cli.py +5683 -0
  24. sky/client/common.py +345 -0
  25. sky/client/sdk.py +1765 -0
  26. sky/cloud_stores.py +283 -19
  27. sky/clouds/__init__.py +7 -2
  28. sky/clouds/aws.py +303 -112
  29. sky/clouds/azure.py +185 -179
  30. sky/clouds/cloud.py +115 -37
  31. sky/clouds/cudo.py +29 -22
  32. sky/clouds/do.py +313 -0
  33. sky/clouds/fluidstack.py +44 -54
  34. sky/clouds/gcp.py +206 -65
  35. sky/clouds/ibm.py +26 -21
  36. sky/clouds/kubernetes.py +345 -91
  37. sky/clouds/lambda_cloud.py +40 -29
  38. sky/clouds/nebius.py +297 -0
  39. sky/clouds/oci.py +129 -90
  40. sky/clouds/paperspace.py +22 -18
  41. sky/clouds/runpod.py +53 -34
  42. sky/clouds/scp.py +28 -24
  43. sky/clouds/service_catalog/__init__.py +19 -13
  44. sky/clouds/service_catalog/aws_catalog.py +29 -12
  45. sky/clouds/service_catalog/azure_catalog.py +33 -6
  46. sky/clouds/service_catalog/common.py +95 -75
  47. sky/clouds/service_catalog/constants.py +3 -3
  48. sky/clouds/service_catalog/cudo_catalog.py +13 -3
  49. sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
  50. sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
  51. sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
  52. sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
  53. sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
  54. sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
  55. sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
  56. sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
  57. sky/clouds/service_catalog/do_catalog.py +111 -0
  58. sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
  59. sky/clouds/service_catalog/gcp_catalog.py +16 -2
  60. sky/clouds/service_catalog/ibm_catalog.py +2 -2
  61. sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
  62. sky/clouds/service_catalog/lambda_catalog.py +8 -3
  63. sky/clouds/service_catalog/nebius_catalog.py +116 -0
  64. sky/clouds/service_catalog/oci_catalog.py +31 -4
  65. sky/clouds/service_catalog/paperspace_catalog.py +2 -2
  66. sky/clouds/service_catalog/runpod_catalog.py +2 -2
  67. sky/clouds/service_catalog/scp_catalog.py +2 -2
  68. sky/clouds/service_catalog/vast_catalog.py +104 -0
  69. sky/clouds/service_catalog/vsphere_catalog.py +2 -2
  70. sky/clouds/utils/aws_utils.py +65 -0
  71. sky/clouds/utils/azure_utils.py +91 -0
  72. sky/clouds/utils/gcp_utils.py +5 -9
  73. sky/clouds/utils/oci_utils.py +47 -5
  74. sky/clouds/utils/scp_utils.py +4 -3
  75. sky/clouds/vast.py +280 -0
  76. sky/clouds/vsphere.py +22 -18
  77. sky/core.py +361 -107
  78. sky/dag.py +41 -28
  79. sky/data/data_transfer.py +37 -0
  80. sky/data/data_utils.py +211 -32
  81. sky/data/mounting_utils.py +182 -30
  82. sky/data/storage.py +2118 -270
  83. sky/data/storage_utils.py +126 -5
  84. sky/exceptions.py +179 -8
  85. sky/execution.py +158 -85
  86. sky/global_user_state.py +150 -34
  87. sky/jobs/__init__.py +12 -10
  88. sky/jobs/client/__init__.py +0 -0
  89. sky/jobs/client/sdk.py +302 -0
  90. sky/jobs/constants.py +49 -11
  91. sky/jobs/controller.py +161 -99
  92. sky/jobs/dashboard/dashboard.py +171 -25
  93. sky/jobs/dashboard/templates/index.html +572 -60
  94. sky/jobs/recovery_strategy.py +157 -156
  95. sky/jobs/scheduler.py +307 -0
  96. sky/jobs/server/__init__.py +1 -0
  97. sky/jobs/server/core.py +598 -0
  98. sky/jobs/server/dashboard_utils.py +69 -0
  99. sky/jobs/server/server.py +190 -0
  100. sky/jobs/state.py +627 -122
  101. sky/jobs/utils.py +615 -206
  102. sky/models.py +27 -0
  103. sky/optimizer.py +142 -83
  104. sky/provision/__init__.py +20 -5
  105. sky/provision/aws/config.py +124 -42
  106. sky/provision/aws/instance.py +130 -53
  107. sky/provision/azure/__init__.py +7 -0
  108. sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
  109. sky/provision/azure/config.py +220 -0
  110. sky/provision/azure/instance.py +1012 -37
  111. sky/provision/common.py +31 -3
  112. sky/provision/constants.py +25 -0
  113. sky/provision/cudo/__init__.py +2 -1
  114. sky/provision/cudo/cudo_utils.py +112 -0
  115. sky/provision/cudo/cudo_wrapper.py +37 -16
  116. sky/provision/cudo/instance.py +28 -12
  117. sky/provision/do/__init__.py +11 -0
  118. sky/provision/do/config.py +14 -0
  119. sky/provision/do/constants.py +10 -0
  120. sky/provision/do/instance.py +287 -0
  121. sky/provision/do/utils.py +301 -0
  122. sky/provision/docker_utils.py +82 -46
  123. sky/provision/fluidstack/fluidstack_utils.py +57 -125
  124. sky/provision/fluidstack/instance.py +15 -43
  125. sky/provision/gcp/config.py +19 -9
  126. sky/provision/gcp/constants.py +7 -1
  127. sky/provision/gcp/instance.py +55 -34
  128. sky/provision/gcp/instance_utils.py +339 -80
  129. sky/provision/gcp/mig_utils.py +210 -0
  130. sky/provision/instance_setup.py +172 -133
  131. sky/provision/kubernetes/__init__.py +1 -0
  132. sky/provision/kubernetes/config.py +104 -90
  133. sky/provision/kubernetes/constants.py +8 -0
  134. sky/provision/kubernetes/instance.py +680 -325
  135. sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
  136. sky/provision/kubernetes/network.py +54 -20
  137. sky/provision/kubernetes/network_utils.py +70 -21
  138. sky/provision/kubernetes/utils.py +1370 -251
  139. sky/provision/lambda_cloud/__init__.py +11 -0
  140. sky/provision/lambda_cloud/config.py +10 -0
  141. sky/provision/lambda_cloud/instance.py +265 -0
  142. sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
  143. sky/provision/logging.py +1 -1
  144. sky/provision/nebius/__init__.py +11 -0
  145. sky/provision/nebius/config.py +11 -0
  146. sky/provision/nebius/instance.py +285 -0
  147. sky/provision/nebius/utils.py +318 -0
  148. sky/provision/oci/__init__.py +15 -0
  149. sky/provision/oci/config.py +51 -0
  150. sky/provision/oci/instance.py +436 -0
  151. sky/provision/oci/query_utils.py +681 -0
  152. sky/provision/paperspace/constants.py +6 -0
  153. sky/provision/paperspace/instance.py +4 -3
  154. sky/provision/paperspace/utils.py +2 -0
  155. sky/provision/provisioner.py +207 -130
  156. sky/provision/runpod/__init__.py +1 -0
  157. sky/provision/runpod/api/__init__.py +3 -0
  158. sky/provision/runpod/api/commands.py +119 -0
  159. sky/provision/runpod/api/pods.py +142 -0
  160. sky/provision/runpod/instance.py +64 -8
  161. sky/provision/runpod/utils.py +239 -23
  162. sky/provision/vast/__init__.py +10 -0
  163. sky/provision/vast/config.py +11 -0
  164. sky/provision/vast/instance.py +247 -0
  165. sky/provision/vast/utils.py +162 -0
  166. sky/provision/vsphere/common/vim_utils.py +1 -1
  167. sky/provision/vsphere/instance.py +8 -18
  168. sky/provision/vsphere/vsphere_utils.py +1 -1
  169. sky/resources.py +247 -102
  170. sky/serve/__init__.py +9 -9
  171. sky/serve/autoscalers.py +361 -299
  172. sky/serve/client/__init__.py +0 -0
  173. sky/serve/client/sdk.py +366 -0
  174. sky/serve/constants.py +12 -3
  175. sky/serve/controller.py +106 -36
  176. sky/serve/load_balancer.py +63 -12
  177. sky/serve/load_balancing_policies.py +84 -2
  178. sky/serve/replica_managers.py +42 -34
  179. sky/serve/serve_state.py +62 -32
  180. sky/serve/serve_utils.py +271 -160
  181. sky/serve/server/__init__.py +0 -0
  182. sky/serve/{core.py → server/core.py} +271 -90
  183. sky/serve/server/server.py +112 -0
  184. sky/serve/service.py +52 -16
  185. sky/serve/service_spec.py +95 -32
  186. sky/server/__init__.py +1 -0
  187. sky/server/common.py +430 -0
  188. sky/server/constants.py +21 -0
  189. sky/server/html/log.html +174 -0
  190. sky/server/requests/__init__.py +0 -0
  191. sky/server/requests/executor.py +472 -0
  192. sky/server/requests/payloads.py +487 -0
  193. sky/server/requests/queues/__init__.py +0 -0
  194. sky/server/requests/queues/mp_queue.py +76 -0
  195. sky/server/requests/requests.py +567 -0
  196. sky/server/requests/serializers/__init__.py +0 -0
  197. sky/server/requests/serializers/decoders.py +192 -0
  198. sky/server/requests/serializers/encoders.py +166 -0
  199. sky/server/server.py +1106 -0
  200. sky/server/stream_utils.py +141 -0
  201. sky/setup_files/MANIFEST.in +2 -5
  202. sky/setup_files/dependencies.py +159 -0
  203. sky/setup_files/setup.py +14 -125
  204. sky/sky_logging.py +59 -14
  205. sky/skylet/autostop_lib.py +2 -2
  206. sky/skylet/constants.py +183 -50
  207. sky/skylet/events.py +22 -10
  208. sky/skylet/job_lib.py +403 -258
  209. sky/skylet/log_lib.py +111 -71
  210. sky/skylet/log_lib.pyi +6 -0
  211. sky/skylet/providers/command_runner.py +6 -8
  212. sky/skylet/providers/ibm/node_provider.py +2 -2
  213. sky/skylet/providers/scp/config.py +11 -3
  214. sky/skylet/providers/scp/node_provider.py +8 -8
  215. sky/skylet/skylet.py +3 -1
  216. sky/skylet/subprocess_daemon.py +69 -17
  217. sky/skypilot_config.py +119 -57
  218. sky/task.py +205 -64
  219. sky/templates/aws-ray.yml.j2 +37 -7
  220. sky/templates/azure-ray.yml.j2 +27 -82
  221. sky/templates/cudo-ray.yml.j2 +7 -3
  222. sky/templates/do-ray.yml.j2 +98 -0
  223. sky/templates/fluidstack-ray.yml.j2 +7 -4
  224. sky/templates/gcp-ray.yml.j2 +26 -6
  225. sky/templates/ibm-ray.yml.j2 +3 -2
  226. sky/templates/jobs-controller.yaml.j2 +46 -11
  227. sky/templates/kubernetes-ingress.yml.j2 +7 -0
  228. sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
  229. sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
  230. sky/templates/kubernetes-ray.yml.j2 +292 -25
  231. sky/templates/lambda-ray.yml.j2 +30 -40
  232. sky/templates/nebius-ray.yml.j2 +79 -0
  233. sky/templates/oci-ray.yml.j2 +18 -57
  234. sky/templates/paperspace-ray.yml.j2 +10 -6
  235. sky/templates/runpod-ray.yml.j2 +26 -4
  236. sky/templates/scp-ray.yml.j2 +3 -2
  237. sky/templates/sky-serve-controller.yaml.j2 +12 -1
  238. sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
  239. sky/templates/vast-ray.yml.j2 +70 -0
  240. sky/templates/vsphere-ray.yml.j2 +8 -3
  241. sky/templates/websocket_proxy.py +64 -0
  242. sky/usage/constants.py +10 -1
  243. sky/usage/usage_lib.py +130 -37
  244. sky/utils/accelerator_registry.py +35 -51
  245. sky/utils/admin_policy_utils.py +147 -0
  246. sky/utils/annotations.py +51 -0
  247. sky/utils/cli_utils/status_utils.py +81 -23
  248. sky/utils/cluster_utils.py +356 -0
  249. sky/utils/command_runner.py +452 -89
  250. sky/utils/command_runner.pyi +77 -3
  251. sky/utils/common.py +54 -0
  252. sky/utils/common_utils.py +319 -108
  253. sky/utils/config_utils.py +204 -0
  254. sky/utils/control_master_utils.py +48 -0
  255. sky/utils/controller_utils.py +548 -266
  256. sky/utils/dag_utils.py +93 -32
  257. sky/utils/db_utils.py +18 -4
  258. sky/utils/env_options.py +29 -7
  259. sky/utils/kubernetes/create_cluster.sh +8 -60
  260. sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
  261. sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
  262. sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
  263. sky/utils/kubernetes/gpu_labeler.py +4 -4
  264. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
  265. sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
  266. sky/utils/kubernetes/rsync_helper.sh +24 -0
  267. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
  268. sky/utils/log_utils.py +240 -33
  269. sky/utils/message_utils.py +81 -0
  270. sky/utils/registry.py +127 -0
  271. sky/utils/resources_utils.py +94 -22
  272. sky/utils/rich_utils.py +247 -18
  273. sky/utils/schemas.py +284 -64
  274. sky/{status_lib.py → utils/status_lib.py} +12 -7
  275. sky/utils/subprocess_utils.py +212 -46
  276. sky/utils/timeline.py +12 -7
  277. sky/utils/ux_utils.py +168 -15
  278. skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
  279. skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
  280. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
  281. sky/clouds/cloud_registry.py +0 -31
  282. sky/jobs/core.py +0 -330
  283. sky/skylet/providers/azure/__init__.py +0 -2
  284. sky/skylet/providers/azure/azure-vm-template.json +0 -301
  285. sky/skylet/providers/azure/config.py +0 -170
  286. sky/skylet/providers/azure/node_provider.py +0 -466
  287. sky/skylet/providers/lambda_cloud/__init__.py +0 -2
  288. sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
  289. sky/skylet/providers/oci/__init__.py +0 -2
  290. sky/skylet/providers/oci/node_provider.py +0 -488
  291. sky/skylet/providers/oci/query_helper.py +0 -383
  292. sky/skylet/providers/oci/utils.py +0 -21
  293. sky/utils/cluster_yaml_utils.py +0 -24
  294. sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
  295. skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
  296. skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
  297. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
  298. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
  299. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
sky/clouds/cloud.py CHANGED
@@ -9,19 +9,21 @@ reused across cloud object creation.
9
9
  """
10
10
  import collections
11
11
  import enum
12
+ import math
12
13
  import typing
13
- from typing import Dict, Iterable, Iterator, List, Optional, Set, Tuple
14
+ from typing import Dict, Iterable, Iterator, List, Optional, Set, Tuple, Union
14
15
 
15
16
  from sky import exceptions
16
17
  from sky import skypilot_config
17
18
  from sky.clouds import service_catalog
18
19
  from sky.utils import log_utils
19
20
  from sky.utils import resources_utils
21
+ from sky.utils import timeline
20
22
  from sky.utils import ux_utils
21
23
 
22
24
  if typing.TYPE_CHECKING:
23
25
  from sky import resources as resources_lib
24
- from sky import status_lib
26
+ from sky.utils import status_lib
25
27
 
26
28
 
27
29
  class CloudImplementationFeatures(enum.Enum):
@@ -93,12 +95,31 @@ class StatusVersion(enum.Enum):
93
95
  return self.value >= other.value
94
96
 
95
97
 
98
+ class OpenPortsVersion(enum.Enum):
99
+ """The version of the open ports implementation.
100
+
101
+ 1: Open ports on launching of the cluster only, cannot be modified after
102
+ provisioning of the cluster. This is for clouds like RunPod which only
103
+ accept port argument on VM creation API, and require Web GUI and a VM
104
+ restart to update ports. We currently do not support this.
105
+ 2: Open ports after provisioning of the cluster, updatable. This is for most
106
+ of the cloud providers which allow opening ports using a programmable API
107
+ and won't affect the running VMs.
108
+ """
109
+ LAUNCH_ONLY = 'LAUNCH ONLY'
110
+ UPDATABLE = 'UPDATABLE'
111
+
112
+ def __le__(self, other):
113
+ versions = list(OpenPortsVersion)
114
+ return versions.index(self) <= versions.index(other)
115
+
116
+
96
117
  class Cloud:
97
118
  """A cloud provider."""
98
119
 
99
120
  _REPR = '<Cloud>'
100
121
  _DEFAULT_DISK_TIER = resources_utils.DiskTier.MEDIUM
101
- _BEST_DISK_TIER = resources_utils.DiskTier.HIGH
122
+ _BEST_DISK_TIER = resources_utils.DiskTier.ULTRA
102
123
  _SUPPORTED_DISK_TIERS = {resources_utils.DiskTier.BEST}
103
124
  _SUPPORTS_SERVICE_ACCOUNT_ON_REMOTE = False
104
125
 
@@ -107,6 +128,7 @@ class Cloud:
107
128
  # NOTE: new clouds being added should use the latest version, i.e. SKYPILOT.
108
129
  PROVISIONER_VERSION = ProvisionerVersion.RAY_AUTOSCALER
109
130
  STATUS_VERSION = StatusVersion.CLOUD_CLI
131
+ OPEN_PORTS_VERSION = OpenPortsVersion.UPDATABLE
110
132
 
111
133
  @classmethod
112
134
  def max_cluster_name_length(cls) -> Optional[int]:
@@ -157,6 +179,11 @@ class Cloud:
157
179
  """
158
180
  raise NotImplementedError
159
181
 
182
+ @classmethod
183
+ def optimize_by_zone(cls) -> bool:
184
+ """Returns whether to optimize this cloud by zone (default: region)."""
185
+ return False
186
+
160
187
  @classmethod
161
188
  def zones_provision_loop(
162
189
  cls,
@@ -253,9 +280,10 @@ class Cloud:
253
280
  def make_deploy_resources_variables(
254
281
  self,
255
282
  resources: 'resources_lib.Resources',
256
- cluster_name_on_cloud: str,
283
+ cluster_name: resources_utils.ClusterName,
257
284
  region: 'Region',
258
285
  zones: Optional[List['Zone']],
286
+ num_nodes: int,
259
287
  dryrun: bool = False,
260
288
  ) -> Dict[str, Optional[str]]:
261
289
  """Converts planned sky.Resources to cloud-specific resource variables.
@@ -281,7 +309,7 @@ class Cloud:
281
309
  def get_accelerators_from_instance_type(
282
310
  cls,
283
311
  instance_type: str,
284
- ) -> Optional[Dict[str, int]]:
312
+ ) -> Optional[Dict[str, Union[int, float]]]:
285
313
  """Returns {acc: acc_count} held by 'instance_type', if any."""
286
314
  raise NotImplementedError
287
315
 
@@ -340,12 +368,12 @@ class Cloud:
340
368
  del label_key, label_value
341
369
  return True, None
342
370
 
371
+ @timeline.event
343
372
  def get_feasible_launchable_resources(
344
- self,
345
- resources: 'resources_lib.Resources',
346
- num_nodes: int = 1
347
- ) -> Tuple[List['resources_lib.Resources'], List[str]]:
348
- """Returns ([feasible and launchable resources], [fuzzy candidates]).
373
+ self,
374
+ resources: 'resources_lib.Resources',
375
+ num_nodes: int = 1) -> 'resources_utils.FeasibleResources':
376
+ """Returns FeasibleResources for the given resources.
349
377
 
350
378
  Feasible resources refer to an offering respecting the resource
351
379
  requirements. Currently, this function implements "filtering" the
@@ -353,10 +381,15 @@ class Cloud:
353
381
 
354
382
  Launchable resources require a cloud and an instance type be assigned.
355
383
 
356
- Fuzzy candidates example: when the requested GPU is A100:1 but is not
357
- available in a cloud/region, the fuzzy candidates are results of a fuzzy
358
- search in the catalog that are offered in the location. E.g.,
359
- ['A100-80GB:1', 'A100-80GB:2', 'A100-80GB:4', 'A100:8']
384
+ The returned dataclass object FeasibleResources contains three fields:
385
+
386
+ - resources_list: a list of resources that are feasible to launch
387
+ - fuzzy_candidate_list: a list of resources that loosely match requested
388
+ resources. E.g., when A100:1 GPU is requested but is not available
389
+ in a cloud/region, the fuzzy candidates are results of a fuzzy
390
+ search in the catalog that are offered in the location. E.g.,
391
+ ['A100-80GB:1', 'A100-80GB:2', 'A100-80GB:4', 'A100:8']
392
+ - hint: an optional string hint if no feasible resources are found.
360
393
  """
361
394
  if resources.is_launchable():
362
395
  self._check_instance_type_accelerators_combination(resources)
@@ -372,13 +405,18 @@ class Cloud:
372
405
  # TODO(zhwu): The resources are now silently filtered out. We
373
406
  # should have some logging telling the user why the resources
374
407
  # are not considered.
375
- return ([], [])
408
+ return resources_utils.FeasibleResources(resources_list=[],
409
+ fuzzy_candidate_list=[],
410
+ hint=None)
376
411
  return self._get_feasible_launchable_resources(resources)
377
412
 
378
413
  def _get_feasible_launchable_resources(
379
414
  self, resources: 'resources_lib.Resources'
380
- ) -> Tuple[List['resources_lib.Resources'], List[str]]:
415
+ ) -> 'resources_utils.FeasibleResources':
381
416
  """See get_feasible_launchable_resources()."""
417
+ # TODO: Currently only the Kubernetes implementation of this method
418
+ # returns hints when no feasible resources are found. This should be
419
+ # implemented for all clouds.
382
420
  raise NotImplementedError
383
421
 
384
422
  def get_reservations_available_resources(
@@ -407,11 +445,11 @@ class Cloud:
407
445
 
408
446
  # TODO(zhwu): Make the return type immutable.
409
447
  @classmethod
410
- def get_current_user_identity(cls) -> Optional[List[str]]:
411
- """(Advanced) Returns currently active user identity of this cloud.
448
+ def get_user_identities(cls) -> Optional[List[List[str]]]:
449
+ """(Advanced) Returns all available user identities of this cloud.
412
450
 
413
451
  The user "identity" is associated with each SkyPilot cluster they
414
- creates. This is used in protecting cluster operations, such as
452
+ create. This is used in protecting cluster operations, such as
415
453
  provision, teardown and status refreshing, in a multi-identity
416
454
  scenario, where the same user/device can switch between different
417
455
  cloud identities. We check that the user identity matches before:
@@ -419,10 +457,16 @@ class Cloud:
419
457
  - Stopping/tearing down a cluster
420
458
  - Refreshing the status of a cluster
421
459
 
422
- Design choice: we allow the operations that can correctly work with
423
- a different user identity, as a user should have full control over
424
- all their clusters (no matter which identity it belongs to), e.g.,
425
- submitting jobs, viewing logs, auto-stopping, etc.
460
+ Design choices:
461
+ 1. We allow the operations that can correctly work with a different
462
+ user identity, as a user should have full control over all their
463
+ clusters (no matter which identity it belongs to), e.g.,
464
+ submitting jobs, viewing logs, auto-stopping, etc.
465
+ 2. A cloud implementation can optionally switch between different
466
+ identities if required for cluster operations. In this case,
467
+ the cloud implementation should return multiple identities
468
+ as a list. E.g., our Kubernetes implementation can use multiple
469
+ kubeconfig contexts to switch between different identities.
426
470
 
427
471
  The choice of what constitutes an identity is up to each cloud's
428
472
  implementation. In general, to suffice for the above purposes,
@@ -430,24 +474,34 @@ class Cloud:
430
474
  resources are used when the user invoked each cloud's default
431
475
  CLI/API.
432
476
 
433
- The returned identity is a list of strings. The list is in the order of
477
+ An identity is a list of strings. The list is in the order of
434
478
  strictness, i.e., the first element is the most strict identity, and
435
479
  the last element is the least strict identity.
436
480
  When performing an identity check between the current active identity
437
481
  and the owner identity associated with a cluster, we compare the two
438
482
  lists in order: if a position does not match, we go to the next. To
439
- see an example, see the docstring of the AWS.get_current_user_identity.
440
-
483
+ see an example, see the docstring of the AWS.get_user_identities.
441
484
 
442
485
  Example identities (see cloud implementations):
443
486
  - AWS: [UserId, AccountId]
444
487
  - GCP: [email address + project ID]
445
488
  - Azure: [email address + subscription ID]
489
+ - Kubernetes: [context name]
490
+
491
+ Example return values:
492
+ - AWS: [[UserId, AccountId]]
493
+ - GCP: [[email address + project ID]]
494
+ - Azure: [[email address + subscription ID]]
495
+ - Kubernetes: [[current active context], [context 2], ...]
446
496
 
447
497
  Returns:
448
498
  None if the cloud does not have a concept of user identity
449
499
  (access protection will be disabled for these clusters);
450
- otherwise the currently active user identity.
500
+ otherwise a list of available identities with the current active
501
+ identity being the first element. Most clouds have only one identity
502
+ available, so the returned list will only have one element: the
503
+ current active identity.
504
+
451
505
  Raises:
452
506
  exceptions.CloudUserIdentityError: If the user identity cannot be
453
507
  retrieved.
@@ -455,13 +509,26 @@ class Cloud:
455
509
  return None
456
510
 
457
511
  @classmethod
458
- def get_current_user_identity_str(cls) -> Optional[str]:
459
- """Returns a user friendly representation of the current identity."""
460
- user_identity = cls.get_current_user_identity()
512
+ def get_active_user_identity_str(cls) -> Optional[str]:
513
+ """Returns a user friendly representation of the active identity."""
514
+ user_identity = cls.get_active_user_identity()
461
515
  if user_identity is None:
462
516
  return None
463
517
  return ', '.join(user_identity)
464
518
 
519
+ @classmethod
520
+ def get_active_user_identity(cls) -> Optional[List[str]]:
521
+ """Returns the currently active user identity of this cloud.
522
+
523
+ See get_user_identities for definition of user identity.
524
+
525
+ Returns:
526
+ None if the cloud does not have a concept of user identity;
527
+ otherwise the current active identity.
528
+ """
529
+ identities = cls.get_user_identities()
530
+ return identities[0] if identities is not None else None
531
+
465
532
  def get_credential_file_mounts(self) -> Dict[str, str]:
466
533
  """Returns the files necessary to access this cloud.
467
534
 
@@ -469,6 +536,10 @@ class Cloud:
469
536
  """
470
537
  raise NotImplementedError
471
538
 
539
+ def can_credential_expire(self) -> bool:
540
+ """Returns whether the cloud credential can expire."""
541
+ return False
542
+
472
543
  @classmethod
473
544
  def get_image_size(cls, image_id: str, region: Optional[str]) -> float:
474
545
  """Check the image size from the cloud.
@@ -610,8 +681,9 @@ class Cloud:
610
681
  assert resources.is_launchable(), resources
611
682
 
612
683
  def _equal_accelerators(
613
- acc_requested: Optional[Dict[str, int]],
614
- acc_from_instance_type: Optional[Dict[str, int]]) -> bool:
684
+ acc_requested: Optional[Dict[str, Union[int, float]]],
685
+ acc_from_instance_type: Optional[Dict[str, Union[int,
686
+ float]]]) -> bool:
615
687
  """Check the requested accelerators equals to the instance type
616
688
 
617
689
  Check the requested accelerators equals to the accelerators
@@ -626,12 +698,14 @@ class Cloud:
626
698
  for acc in acc_requested:
627
699
  if acc not in acc_from_instance_type:
628
700
  return False
629
- if acc_requested[acc] != acc_from_instance_type[acc]:
701
+ # Avoid floating-point precision issues.
702
+ if not math.isclose(acc_requested[acc],
703
+ acc_from_instance_type[acc]):
630
704
  return False
631
705
  return True
632
706
 
633
- acc_from_instance_type = (cls.get_accelerators_from_instance_type(
634
- resources.instance_type))
707
+ acc_from_instance_type = cls.get_accelerators_from_instance_type(
708
+ resources.instance_type)
635
709
  if not _equal_accelerators(resources.accelerators,
636
710
  acc_from_instance_type):
637
711
  with ux_utils.print_exception_no_traceback():
@@ -726,8 +800,8 @@ class Cloud:
726
800
  # cloud._cloud_unsupported_features().
727
801
 
728
802
  @classmethod
729
- def create_image_from_cluster(cls, cluster_name: str,
730
- cluster_name_on_cloud: str,
803
+ def create_image_from_cluster(cls,
804
+ cluster_name: resources_utils.ClusterName,
731
805
  region: Optional[str],
732
806
  zone: Optional[str]) -> str:
733
807
  """Creates an image from the cluster.
@@ -756,6 +830,10 @@ class Cloud:
756
830
 
757
831
  # === End of image related methods ===
758
832
 
833
+ @classmethod
834
+ def canonical_name(cls) -> str:
835
+ return cls.__name__.lower()
836
+
759
837
  def __repr__(self):
760
838
  return self._REPR
761
839
 
sky/clouds/cudo.py CHANGED
@@ -1,12 +1,12 @@
1
1
  """Cudo Compute"""
2
- import json
3
2
  import subprocess
4
3
  import typing
5
- from typing import Dict, Iterator, List, Optional, Tuple
4
+ from typing import Dict, Iterator, List, Optional, Tuple, Union
6
5
 
7
6
  from sky import clouds
8
7
  from sky.clouds import service_catalog
9
8
  from sky.utils import common_utils
9
+ from sky.utils import registry
10
10
  from sky.utils import resources_utils
11
11
 
12
12
  if typing.TYPE_CHECKING:
@@ -28,7 +28,7 @@ def _run_output(cmd):
28
28
  return proc.stdout.decode('ascii')
29
29
 
30
30
 
31
- @clouds.CLOUD_REGISTRY.register
31
+ @registry.CLOUD_REGISTRY.register
32
32
  class Cudo(clouds.Cloud):
33
33
  """Cudo Compute"""
34
34
  _REPR = 'Cudo'
@@ -43,8 +43,7 @@ class Cudo(clouds.Cloud):
43
43
  f'{_INDENT_PREFIX} $ cudoctl init\n'
44
44
  f'{_INDENT_PREFIX}For more info: '
45
45
  # pylint: disable=line-too-long
46
- 'https://skypilot.readthedocs.io/en/latest/getting-started/installation.html'
47
- )
46
+ 'https://docs.skypilot.co/en/latest/getting-started/installation.html')
48
47
 
49
48
  _PROJECT_HINT = (
50
49
  'Create a project and then set it as the default project,:\n'
@@ -52,8 +51,7 @@ class Cudo(clouds.Cloud):
52
51
  f'{_INDENT_PREFIX} $ cudoctl init\n'
53
52
  f'{_INDENT_PREFIX}For more info: '
54
53
  # pylint: disable=line-too-long
55
- 'https://skypilot.readthedocs.io/en/latest/getting-started/installation.html'
56
- )
54
+ 'https://docs.skypilot.co/en/latest/getting-started/installation.html')
57
55
 
58
56
  _CLOUD_UNSUPPORTED_FEATURES = {
59
57
  clouds.CloudImplementationFeatures.STOP: 'Stopping not supported.',
@@ -66,6 +64,10 @@ class Cudo(clouds.Cloud):
66
64
  clouds.CloudImplementationFeatures.DOCKER_IMAGE:
67
65
  ('Docker image is currently not supported on Cudo. You can try '
68
66
  'running docker command inside the `run` section in task.yaml.'),
67
+ clouds.CloudImplementationFeatures.HOST_CONTROLLERS: (
68
+ 'Cudo Compute cannot host a controller as it does not '
69
+ 'autostopping, which will leave the controller to run indefinitely.'
70
+ ),
69
71
  }
70
72
  _MAX_CLUSTER_NAME_LEN_LIMIT = 60
71
73
 
@@ -179,7 +181,7 @@ class Cudo(clouds.Cloud):
179
181
  def get_accelerators_from_instance_type(
180
182
  cls,
181
183
  instance_type: str,
182
- ) -> Optional[Dict[str, int]]:
184
+ ) -> Optional[Dict[str, Union[int, float]]]:
183
185
  return service_catalog.get_accelerators_from_instance_type(
184
186
  instance_type, clouds='cudo')
185
187
 
@@ -190,18 +192,17 @@ class Cudo(clouds.Cloud):
190
192
  def make_deploy_resources_variables(
191
193
  self,
192
194
  resources: 'resources_lib.Resources',
193
- cluster_name_on_cloud: str,
195
+ cluster_name: resources_utils.ClusterName,
194
196
  region: 'clouds.Region',
195
197
  zones: Optional[List['clouds.Zone']],
198
+ num_nodes: int,
196
199
  dryrun: bool = False,
197
200
  ) -> Dict[str, Optional[str]]:
198
- del zones
201
+ del zones, cluster_name # unused
199
202
  r = resources
200
203
  acc_dict = self.get_accelerators_from_instance_type(r.instance_type)
201
- if acc_dict is not None:
202
- custom_resources = json.dumps(acc_dict, separators=(',', ':'))
203
- else:
204
- custom_resources = None
204
+ custom_resources = resources_utils.make_ray_custom_resources_str(
205
+ acc_dict)
205
206
 
206
207
  return {
207
208
  'instance_type': resources.instance_type,
@@ -210,13 +211,16 @@ class Cudo(clouds.Cloud):
210
211
  }
211
212
 
212
213
  def _get_feasible_launchable_resources(
213
- self, resources: 'resources_lib.Resources'):
214
+ self, resources: 'resources_lib.Resources'
215
+ ) -> 'resources_utils.FeasibleResources':
214
216
  if resources.use_spot:
215
- return ([], [])
217
+ # TODO: Add hints to all return values in this method to help
218
+ # users understand why the resources are not launchable.
219
+ return resources_utils.FeasibleResources([], [], None)
216
220
  if resources.instance_type is not None:
217
221
  assert resources.is_launchable(), resources
218
222
  resources = resources.copy(accelerators=None)
219
- return ([resources], [])
223
+ return resources_utils.FeasibleResources([resources], [], None)
220
224
 
221
225
  def _make(instance_list):
222
226
  resource_list = []
@@ -239,9 +243,10 @@ class Cudo(clouds.Cloud):
239
243
  memory=resources.memory,
240
244
  disk_tier=resources.disk_tier)
241
245
  if default_instance_type is None:
242
- return ([], [])
246
+ return resources_utils.FeasibleResources([], [], None)
243
247
  else:
244
- return (_make([default_instance_type]), [])
248
+ return resources_utils.FeasibleResources(
249
+ _make([default_instance_type]), [], None)
245
250
 
246
251
  assert len(accelerators) == 1, resources
247
252
  acc, acc_count = list(accelerators.items())[0]
@@ -256,8 +261,10 @@ class Cudo(clouds.Cloud):
256
261
  zone=resources.zone,
257
262
  clouds='cudo')
258
263
  if instance_list is None:
259
- return ([], fuzzy_candidate_list)
260
- return (_make(instance_list), fuzzy_candidate_list)
264
+ return resources_utils.FeasibleResources([], fuzzy_candidate_list,
265
+ None)
266
+ return resources_utils.FeasibleResources(_make(instance_list),
267
+ fuzzy_candidate_list, None)
261
268
 
262
269
  @classmethod
263
270
  def check_credentials(cls) -> Tuple[bool, Optional[str]]:
@@ -318,7 +325,7 @@ class Cudo(clouds.Cloud):
318
325
  }
319
326
 
320
327
  @classmethod
321
- def get_current_user_identity(cls) -> Optional[List[str]]:
328
+ def get_user_identities(cls) -> Optional[List[List[str]]]:
322
329
  # NOTE: used for very advanced SkyPilot functionality
323
330
  # Can implement later if desired
324
331
  return None