skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (299) hide show
  1. sky/__init__.py +64 -32
  2. sky/adaptors/aws.py +23 -6
  3. sky/adaptors/azure.py +432 -15
  4. sky/adaptors/cloudflare.py +5 -5
  5. sky/adaptors/common.py +19 -9
  6. sky/adaptors/do.py +20 -0
  7. sky/adaptors/gcp.py +3 -2
  8. sky/adaptors/kubernetes.py +122 -88
  9. sky/adaptors/nebius.py +100 -0
  10. sky/adaptors/oci.py +39 -1
  11. sky/adaptors/vast.py +29 -0
  12. sky/admin_policy.py +101 -0
  13. sky/authentication.py +117 -98
  14. sky/backends/backend.py +52 -20
  15. sky/backends/backend_utils.py +669 -557
  16. sky/backends/cloud_vm_ray_backend.py +1099 -808
  17. sky/backends/local_docker_backend.py +14 -8
  18. sky/backends/wheel_utils.py +38 -20
  19. sky/benchmark/benchmark_utils.py +22 -23
  20. sky/check.py +76 -27
  21. sky/cli.py +1586 -1139
  22. sky/client/__init__.py +1 -0
  23. sky/client/cli.py +5683 -0
  24. sky/client/common.py +345 -0
  25. sky/client/sdk.py +1765 -0
  26. sky/cloud_stores.py +283 -19
  27. sky/clouds/__init__.py +7 -2
  28. sky/clouds/aws.py +303 -112
  29. sky/clouds/azure.py +185 -179
  30. sky/clouds/cloud.py +115 -37
  31. sky/clouds/cudo.py +29 -22
  32. sky/clouds/do.py +313 -0
  33. sky/clouds/fluidstack.py +44 -54
  34. sky/clouds/gcp.py +206 -65
  35. sky/clouds/ibm.py +26 -21
  36. sky/clouds/kubernetes.py +345 -91
  37. sky/clouds/lambda_cloud.py +40 -29
  38. sky/clouds/nebius.py +297 -0
  39. sky/clouds/oci.py +129 -90
  40. sky/clouds/paperspace.py +22 -18
  41. sky/clouds/runpod.py +53 -34
  42. sky/clouds/scp.py +28 -24
  43. sky/clouds/service_catalog/__init__.py +19 -13
  44. sky/clouds/service_catalog/aws_catalog.py +29 -12
  45. sky/clouds/service_catalog/azure_catalog.py +33 -6
  46. sky/clouds/service_catalog/common.py +95 -75
  47. sky/clouds/service_catalog/constants.py +3 -3
  48. sky/clouds/service_catalog/cudo_catalog.py +13 -3
  49. sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
  50. sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
  51. sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
  52. sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
  53. sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
  54. sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
  55. sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
  56. sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
  57. sky/clouds/service_catalog/do_catalog.py +111 -0
  58. sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
  59. sky/clouds/service_catalog/gcp_catalog.py +16 -2
  60. sky/clouds/service_catalog/ibm_catalog.py +2 -2
  61. sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
  62. sky/clouds/service_catalog/lambda_catalog.py +8 -3
  63. sky/clouds/service_catalog/nebius_catalog.py +116 -0
  64. sky/clouds/service_catalog/oci_catalog.py +31 -4
  65. sky/clouds/service_catalog/paperspace_catalog.py +2 -2
  66. sky/clouds/service_catalog/runpod_catalog.py +2 -2
  67. sky/clouds/service_catalog/scp_catalog.py +2 -2
  68. sky/clouds/service_catalog/vast_catalog.py +104 -0
  69. sky/clouds/service_catalog/vsphere_catalog.py +2 -2
  70. sky/clouds/utils/aws_utils.py +65 -0
  71. sky/clouds/utils/azure_utils.py +91 -0
  72. sky/clouds/utils/gcp_utils.py +5 -9
  73. sky/clouds/utils/oci_utils.py +47 -5
  74. sky/clouds/utils/scp_utils.py +4 -3
  75. sky/clouds/vast.py +280 -0
  76. sky/clouds/vsphere.py +22 -18
  77. sky/core.py +361 -107
  78. sky/dag.py +41 -28
  79. sky/data/data_transfer.py +37 -0
  80. sky/data/data_utils.py +211 -32
  81. sky/data/mounting_utils.py +182 -30
  82. sky/data/storage.py +2118 -270
  83. sky/data/storage_utils.py +126 -5
  84. sky/exceptions.py +179 -8
  85. sky/execution.py +158 -85
  86. sky/global_user_state.py +150 -34
  87. sky/jobs/__init__.py +12 -10
  88. sky/jobs/client/__init__.py +0 -0
  89. sky/jobs/client/sdk.py +302 -0
  90. sky/jobs/constants.py +49 -11
  91. sky/jobs/controller.py +161 -99
  92. sky/jobs/dashboard/dashboard.py +171 -25
  93. sky/jobs/dashboard/templates/index.html +572 -60
  94. sky/jobs/recovery_strategy.py +157 -156
  95. sky/jobs/scheduler.py +307 -0
  96. sky/jobs/server/__init__.py +1 -0
  97. sky/jobs/server/core.py +598 -0
  98. sky/jobs/server/dashboard_utils.py +69 -0
  99. sky/jobs/server/server.py +190 -0
  100. sky/jobs/state.py +627 -122
  101. sky/jobs/utils.py +615 -206
  102. sky/models.py +27 -0
  103. sky/optimizer.py +142 -83
  104. sky/provision/__init__.py +20 -5
  105. sky/provision/aws/config.py +124 -42
  106. sky/provision/aws/instance.py +130 -53
  107. sky/provision/azure/__init__.py +7 -0
  108. sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
  109. sky/provision/azure/config.py +220 -0
  110. sky/provision/azure/instance.py +1012 -37
  111. sky/provision/common.py +31 -3
  112. sky/provision/constants.py +25 -0
  113. sky/provision/cudo/__init__.py +2 -1
  114. sky/provision/cudo/cudo_utils.py +112 -0
  115. sky/provision/cudo/cudo_wrapper.py +37 -16
  116. sky/provision/cudo/instance.py +28 -12
  117. sky/provision/do/__init__.py +11 -0
  118. sky/provision/do/config.py +14 -0
  119. sky/provision/do/constants.py +10 -0
  120. sky/provision/do/instance.py +287 -0
  121. sky/provision/do/utils.py +301 -0
  122. sky/provision/docker_utils.py +82 -46
  123. sky/provision/fluidstack/fluidstack_utils.py +57 -125
  124. sky/provision/fluidstack/instance.py +15 -43
  125. sky/provision/gcp/config.py +19 -9
  126. sky/provision/gcp/constants.py +7 -1
  127. sky/provision/gcp/instance.py +55 -34
  128. sky/provision/gcp/instance_utils.py +339 -80
  129. sky/provision/gcp/mig_utils.py +210 -0
  130. sky/provision/instance_setup.py +172 -133
  131. sky/provision/kubernetes/__init__.py +1 -0
  132. sky/provision/kubernetes/config.py +104 -90
  133. sky/provision/kubernetes/constants.py +8 -0
  134. sky/provision/kubernetes/instance.py +680 -325
  135. sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
  136. sky/provision/kubernetes/network.py +54 -20
  137. sky/provision/kubernetes/network_utils.py +70 -21
  138. sky/provision/kubernetes/utils.py +1370 -251
  139. sky/provision/lambda_cloud/__init__.py +11 -0
  140. sky/provision/lambda_cloud/config.py +10 -0
  141. sky/provision/lambda_cloud/instance.py +265 -0
  142. sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
  143. sky/provision/logging.py +1 -1
  144. sky/provision/nebius/__init__.py +11 -0
  145. sky/provision/nebius/config.py +11 -0
  146. sky/provision/nebius/instance.py +285 -0
  147. sky/provision/nebius/utils.py +318 -0
  148. sky/provision/oci/__init__.py +15 -0
  149. sky/provision/oci/config.py +51 -0
  150. sky/provision/oci/instance.py +436 -0
  151. sky/provision/oci/query_utils.py +681 -0
  152. sky/provision/paperspace/constants.py +6 -0
  153. sky/provision/paperspace/instance.py +4 -3
  154. sky/provision/paperspace/utils.py +2 -0
  155. sky/provision/provisioner.py +207 -130
  156. sky/provision/runpod/__init__.py +1 -0
  157. sky/provision/runpod/api/__init__.py +3 -0
  158. sky/provision/runpod/api/commands.py +119 -0
  159. sky/provision/runpod/api/pods.py +142 -0
  160. sky/provision/runpod/instance.py +64 -8
  161. sky/provision/runpod/utils.py +239 -23
  162. sky/provision/vast/__init__.py +10 -0
  163. sky/provision/vast/config.py +11 -0
  164. sky/provision/vast/instance.py +247 -0
  165. sky/provision/vast/utils.py +162 -0
  166. sky/provision/vsphere/common/vim_utils.py +1 -1
  167. sky/provision/vsphere/instance.py +8 -18
  168. sky/provision/vsphere/vsphere_utils.py +1 -1
  169. sky/resources.py +247 -102
  170. sky/serve/__init__.py +9 -9
  171. sky/serve/autoscalers.py +361 -299
  172. sky/serve/client/__init__.py +0 -0
  173. sky/serve/client/sdk.py +366 -0
  174. sky/serve/constants.py +12 -3
  175. sky/serve/controller.py +106 -36
  176. sky/serve/load_balancer.py +63 -12
  177. sky/serve/load_balancing_policies.py +84 -2
  178. sky/serve/replica_managers.py +42 -34
  179. sky/serve/serve_state.py +62 -32
  180. sky/serve/serve_utils.py +271 -160
  181. sky/serve/server/__init__.py +0 -0
  182. sky/serve/{core.py → server/core.py} +271 -90
  183. sky/serve/server/server.py +112 -0
  184. sky/serve/service.py +52 -16
  185. sky/serve/service_spec.py +95 -32
  186. sky/server/__init__.py +1 -0
  187. sky/server/common.py +430 -0
  188. sky/server/constants.py +21 -0
  189. sky/server/html/log.html +174 -0
  190. sky/server/requests/__init__.py +0 -0
  191. sky/server/requests/executor.py +472 -0
  192. sky/server/requests/payloads.py +487 -0
  193. sky/server/requests/queues/__init__.py +0 -0
  194. sky/server/requests/queues/mp_queue.py +76 -0
  195. sky/server/requests/requests.py +567 -0
  196. sky/server/requests/serializers/__init__.py +0 -0
  197. sky/server/requests/serializers/decoders.py +192 -0
  198. sky/server/requests/serializers/encoders.py +166 -0
  199. sky/server/server.py +1106 -0
  200. sky/server/stream_utils.py +141 -0
  201. sky/setup_files/MANIFEST.in +2 -5
  202. sky/setup_files/dependencies.py +159 -0
  203. sky/setup_files/setup.py +14 -125
  204. sky/sky_logging.py +59 -14
  205. sky/skylet/autostop_lib.py +2 -2
  206. sky/skylet/constants.py +183 -50
  207. sky/skylet/events.py +22 -10
  208. sky/skylet/job_lib.py +403 -258
  209. sky/skylet/log_lib.py +111 -71
  210. sky/skylet/log_lib.pyi +6 -0
  211. sky/skylet/providers/command_runner.py +6 -8
  212. sky/skylet/providers/ibm/node_provider.py +2 -2
  213. sky/skylet/providers/scp/config.py +11 -3
  214. sky/skylet/providers/scp/node_provider.py +8 -8
  215. sky/skylet/skylet.py +3 -1
  216. sky/skylet/subprocess_daemon.py +69 -17
  217. sky/skypilot_config.py +119 -57
  218. sky/task.py +205 -64
  219. sky/templates/aws-ray.yml.j2 +37 -7
  220. sky/templates/azure-ray.yml.j2 +27 -82
  221. sky/templates/cudo-ray.yml.j2 +7 -3
  222. sky/templates/do-ray.yml.j2 +98 -0
  223. sky/templates/fluidstack-ray.yml.j2 +7 -4
  224. sky/templates/gcp-ray.yml.j2 +26 -6
  225. sky/templates/ibm-ray.yml.j2 +3 -2
  226. sky/templates/jobs-controller.yaml.j2 +46 -11
  227. sky/templates/kubernetes-ingress.yml.j2 +7 -0
  228. sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
  229. sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
  230. sky/templates/kubernetes-ray.yml.j2 +292 -25
  231. sky/templates/lambda-ray.yml.j2 +30 -40
  232. sky/templates/nebius-ray.yml.j2 +79 -0
  233. sky/templates/oci-ray.yml.j2 +18 -57
  234. sky/templates/paperspace-ray.yml.j2 +10 -6
  235. sky/templates/runpod-ray.yml.j2 +26 -4
  236. sky/templates/scp-ray.yml.j2 +3 -2
  237. sky/templates/sky-serve-controller.yaml.j2 +12 -1
  238. sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
  239. sky/templates/vast-ray.yml.j2 +70 -0
  240. sky/templates/vsphere-ray.yml.j2 +8 -3
  241. sky/templates/websocket_proxy.py +64 -0
  242. sky/usage/constants.py +10 -1
  243. sky/usage/usage_lib.py +130 -37
  244. sky/utils/accelerator_registry.py +35 -51
  245. sky/utils/admin_policy_utils.py +147 -0
  246. sky/utils/annotations.py +51 -0
  247. sky/utils/cli_utils/status_utils.py +81 -23
  248. sky/utils/cluster_utils.py +356 -0
  249. sky/utils/command_runner.py +452 -89
  250. sky/utils/command_runner.pyi +77 -3
  251. sky/utils/common.py +54 -0
  252. sky/utils/common_utils.py +319 -108
  253. sky/utils/config_utils.py +204 -0
  254. sky/utils/control_master_utils.py +48 -0
  255. sky/utils/controller_utils.py +548 -266
  256. sky/utils/dag_utils.py +93 -32
  257. sky/utils/db_utils.py +18 -4
  258. sky/utils/env_options.py +29 -7
  259. sky/utils/kubernetes/create_cluster.sh +8 -60
  260. sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
  261. sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
  262. sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
  263. sky/utils/kubernetes/gpu_labeler.py +4 -4
  264. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
  265. sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
  266. sky/utils/kubernetes/rsync_helper.sh +24 -0
  267. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
  268. sky/utils/log_utils.py +240 -33
  269. sky/utils/message_utils.py +81 -0
  270. sky/utils/registry.py +127 -0
  271. sky/utils/resources_utils.py +94 -22
  272. sky/utils/rich_utils.py +247 -18
  273. sky/utils/schemas.py +284 -64
  274. sky/{status_lib.py → utils/status_lib.py} +12 -7
  275. sky/utils/subprocess_utils.py +212 -46
  276. sky/utils/timeline.py +12 -7
  277. sky/utils/ux_utils.py +168 -15
  278. skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
  279. skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
  280. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
  281. sky/clouds/cloud_registry.py +0 -31
  282. sky/jobs/core.py +0 -330
  283. sky/skylet/providers/azure/__init__.py +0 -2
  284. sky/skylet/providers/azure/azure-vm-template.json +0 -301
  285. sky/skylet/providers/azure/config.py +0 -170
  286. sky/skylet/providers/azure/node_provider.py +0 -466
  287. sky/skylet/providers/lambda_cloud/__init__.py +0 -2
  288. sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
  289. sky/skylet/providers/oci/__init__.py +0 -2
  290. sky/skylet/providers/oci/node_provider.py +0 -488
  291. sky/skylet/providers/oci/query_helper.py +0 -383
  292. sky/skylet/providers/oci/utils.py +0 -21
  293. sky/utils/cluster_yaml_utils.py +0 -24
  294. sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
  295. skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
  296. skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
  297. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
  298. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
  299. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
sky/clouds/runpod.py CHANGED
@@ -1,11 +1,11 @@
1
1
  """ RunPod Cloud. """
2
2
 
3
- import json
4
3
  import typing
5
- from typing import Dict, Iterator, List, Optional, Tuple
4
+ from typing import Dict, Iterator, List, Optional, Tuple, Union
6
5
 
7
6
  from sky import clouds
8
7
  from sky.clouds import service_catalog
8
+ from sky.utils import registry
9
9
  from sky.utils import resources_utils
10
10
 
11
11
  if typing.TYPE_CHECKING:
@@ -16,7 +16,7 @@ _CREDENTIAL_FILES = [
16
16
  ]
17
17
 
18
18
 
19
- @clouds.CLOUD_REGISTRY.register
19
+ @registry.CLOUD_REGISTRY.register
20
20
  class RunPod(clouds.Cloud):
21
21
  """ RunPod GPU Cloud
22
22
 
@@ -25,32 +25,22 @@ class RunPod(clouds.Cloud):
25
25
  _REPR = 'RunPod'
26
26
  _CLOUD_UNSUPPORTED_FEATURES = {
27
27
  clouds.CloudImplementationFeatures.STOP: 'Stopping not supported.',
28
- clouds.CloudImplementationFeatures.SPOT_INSTANCE:
29
- ('Spot is not supported, as runpod API does not implement spot.'),
30
28
  clouds.CloudImplementationFeatures.MULTI_NODE:
31
29
  ('Multi-node not supported yet, as the interconnection among nodes '
32
30
  'are non-trivial on RunPod.'),
33
- clouds.CloudImplementationFeatures.OPEN_PORTS:
34
- ('Opening ports is not '
35
- 'supported yet on RunPod.'),
36
- clouds.CloudImplementationFeatures.IMAGE_ID:
37
- ('Specifying image ID is not supported on RunPod.'),
38
- clouds.CloudImplementationFeatures.DOCKER_IMAGE:
39
- (f'Docker image is currently not supported on {_REPR}.'),
40
31
  clouds.CloudImplementationFeatures.CUSTOM_DISK_TIER:
41
32
  ('Customizing disk tier is not supported yet on RunPod.'),
42
33
  clouds.CloudImplementationFeatures.STORAGE_MOUNTING:
43
34
  ('Mounting object stores is not supported on RunPod. To read data '
44
35
  'from object stores on RunPod, use `mode: COPY` to copy the data '
45
36
  'to local disk.'),
46
- clouds.CloudImplementationFeatures.HOST_CONTROLLERS:
47
- ('Host controllers are not supported on RunPod.'),
48
37
  }
49
38
  _MAX_CLUSTER_NAME_LEN_LIMIT = 120
50
39
  _regions: List[clouds.Region] = []
51
40
 
52
41
  PROVISIONER_VERSION = clouds.ProvisionerVersion.SKYPILOT
53
42
  STATUS_VERSION = clouds.StatusVersion.SKYPILOT
43
+ OPEN_PORTS_VERSION = clouds.OpenPortsVersion.LAUNCH_ONLY
54
44
 
55
45
  @classmethod
56
46
  def _unsupported_features_for_resources(
@@ -79,11 +69,8 @@ class RunPod(clouds.Cloud):
79
69
  zone: Optional[str]) -> List[clouds.Region]:
80
70
  assert zone is None, 'RunPod does not support zones.'
81
71
  del accelerators, zone # unused
82
- if use_spot:
83
- return []
84
- else:
85
- regions = service_catalog.get_region_zones_for_instance_type(
86
- instance_type, use_spot, 'runpod')
72
+ regions = service_catalog.get_region_zones_for_instance_type(
73
+ instance_type, use_spot, 'runpod')
87
74
 
88
75
  if region is not None:
89
76
  regions = [r for r in regions if r.name == region]
@@ -155,7 +142,7 @@ class RunPod(clouds.Cloud):
155
142
 
156
143
  @classmethod
157
144
  def get_accelerators_from_instance_type(
158
- cls, instance_type: str) -> Optional[Dict[str, int]]:
145
+ cls, instance_type: str) -> Optional[Dict[str, Union[int, float]]]:
159
146
  return service_catalog.get_accelerators_from_instance_type(
160
147
  instance_type, clouds='runpod')
161
148
 
@@ -166,33 +153,54 @@ class RunPod(clouds.Cloud):
166
153
  def make_deploy_resources_variables(
167
154
  self,
168
155
  resources: 'resources_lib.Resources',
169
- cluster_name_on_cloud: str,
156
+ cluster_name: resources_utils.ClusterName,
170
157
  region: 'clouds.Region',
171
158
  zones: Optional[List['clouds.Zone']],
159
+ num_nodes: int,
172
160
  dryrun: bool = False) -> Dict[str, Optional[str]]:
173
- del zones, dryrun # unused
161
+ del zones, dryrun, cluster_name # unused
174
162
 
175
163
  r = resources
176
164
  acc_dict = self.get_accelerators_from_instance_type(r.instance_type)
177
- if acc_dict is not None:
178
- custom_resources = json.dumps(acc_dict, separators=(',', ':'))
165
+ custom_resources = resources_utils.make_ray_custom_resources_str(
166
+ acc_dict)
167
+
168
+ if r.image_id is None:
169
+ image_id = 'runpod/base:0.0.2'
170
+ elif r.extract_docker_image() is not None:
171
+ image_id = r.extract_docker_image()
179
172
  else:
180
- custom_resources = None
173
+ image_id = r.image_id[r.region]
174
+
175
+ instance_type = resources.instance_type
176
+ use_spot = resources.use_spot
177
+
178
+ hourly_cost = self.instance_type_to_hourly_cost(
179
+ instance_type=instance_type, use_spot=use_spot)
180
+
181
+ # default to root
182
+ docker_username_for_runpod = (resources.docker_username_for_runpod
183
+ if resources.docker_username_for_runpod
184
+ is not None else 'root')
181
185
 
182
186
  return {
183
- 'instance_type': resources.instance_type,
187
+ 'instance_type': instance_type,
184
188
  'custom_resources': custom_resources,
185
189
  'region': region.name,
190
+ 'image_id': image_id,
191
+ 'use_spot': use_spot,
192
+ 'bid_per_gpu': str(hourly_cost),
193
+ 'docker_username_for_runpod': docker_username_for_runpod,
186
194
  }
187
195
 
188
196
  def _get_feasible_launchable_resources(
189
197
  self, resources: 'resources_lib.Resources'
190
- ) -> Tuple[List['resources_lib.Resources'], List[str]]:
198
+ ) -> 'resources_utils.FeasibleResources':
191
199
  """Returns a list of feasible resources for the given resources."""
192
200
  if resources.instance_type is not None:
193
201
  assert resources.is_launchable(), resources
194
202
  resources = resources.copy(accelerators=None)
195
- return ([resources], [])
203
+ return resources_utils.FeasibleResources([resources], [], None)
196
204
 
197
205
  def _make(instance_list):
198
206
  resource_list = []
@@ -215,9 +223,12 @@ class RunPod(clouds.Cloud):
215
223
  memory=resources.memory,
216
224
  disk_tier=resources.disk_tier)
217
225
  if default_instance_type is None:
218
- return ([], [])
226
+ # TODO: Add hints to all return values in this method to help
227
+ # users understand why the resources are not launchable.
228
+ return resources_utils.FeasibleResources([], [], None)
219
229
  else:
220
- return (_make([default_instance_type]), [])
230
+ return resources_utils.FeasibleResources(
231
+ _make([default_instance_type]), [], None)
221
232
 
222
233
  assert len(accelerators) == 1, resources
223
234
  acc, acc_count = list(accelerators.items())[0]
@@ -231,8 +242,10 @@ class RunPod(clouds.Cloud):
231
242
  zone=resources.zone,
232
243
  clouds='runpod')
233
244
  if instance_list is None:
234
- return ([], fuzzy_candidate_list)
235
- return (_make(instance_list), fuzzy_candidate_list)
245
+ return resources_utils.FeasibleResources([], fuzzy_candidate_list,
246
+ None)
247
+ return resources_utils.FeasibleResources(_make(instance_list),
248
+ fuzzy_candidate_list, None)
236
249
 
237
250
  @classmethod
238
251
  def check_credentials(cls) -> Tuple[bool, Optional[str]]:
@@ -247,7 +260,7 @@ class RunPod(clouds.Cloud):
247
260
  ' Credentials can be set up by running: \n'
248
261
  f' $ pip install runpod \n'
249
262
  f' $ runpod config\n'
250
- ' For more information, see https://skypilot.readthedocs.io/en/latest/getting-started/installation.html#runpod' # pylint: disable=line-too-long
263
+ ' For more information, see https://docs.skypilot.co/en/latest/getting-started/installation.html#runpod' # pylint: disable=line-too-long
251
264
  )
252
265
 
253
266
  return True, None
@@ -263,7 +276,7 @@ class RunPod(clouds.Cloud):
263
276
  }
264
277
 
265
278
  @classmethod
266
- def get_current_user_identity(cls) -> Optional[List[str]]:
279
+ def get_user_identities(cls) -> Optional[List[List[str]]]:
267
280
  # NOTE: used for very advanced SkyPilot functionality
268
281
  # Can implement later if desired
269
282
  return None
@@ -275,3 +288,9 @@ class RunPod(clouds.Cloud):
275
288
  return service_catalog.validate_region_zone(region,
276
289
  zone,
277
290
  clouds='runpod')
291
+
292
+ @classmethod
293
+ def get_image_size(cls, image_id: str, region: Optional[str]) -> float:
294
+ # TODO: use 0.0 for now to allow all images. We should change this to
295
+ # return the docker image size.
296
+ return 0.0
sky/clouds/scp.py CHANGED
@@ -4,17 +4,17 @@ This module includes the set of functions
4
4
  to access the SCP catalog and check credentials for the SCP access.
5
5
  """
6
6
 
7
- import json
8
7
  import typing
9
- from typing import Dict, Iterator, List, Optional, Tuple
8
+ from typing import Dict, Iterator, List, Optional, Tuple, Union
10
9
 
11
10
  from sky import clouds
12
11
  from sky import exceptions
13
12
  from sky import sky_logging
14
- from sky import status_lib
15
13
  from sky.clouds import service_catalog
16
14
  from sky.clouds.utils import scp_utils
15
+ from sky.utils import registry
17
16
  from sky.utils import resources_utils
17
+ from sky.utils import status_lib
18
18
 
19
19
  if typing.TYPE_CHECKING:
20
20
  # Renaming to avoid shadowing variables.
@@ -30,7 +30,7 @@ _SCP_MIN_DISK_SIZE_GB = 100
30
30
  _SCP_MAX_DISK_SIZE_GB = 300
31
31
 
32
32
 
33
- @clouds.CLOUD_REGISTRY.register
33
+ @registry.CLOUD_REGISTRY.register
34
34
  class SCP(clouds.Cloud):
35
35
  """SCP Cloud."""
36
36
 
@@ -146,10 +146,10 @@ class SCP(clouds.Cloud):
146
146
 
147
147
  @classmethod
148
148
  def get_default_instance_type(
149
- cls,
150
- cpus: Optional[str] = None,
151
- memory: Optional[str] = None,
152
- disk_tier: Optional[resources_utils.DiskTier] = None
149
+ cls,
150
+ cpus: Optional[str] = None,
151
+ memory: Optional[str] = None,
152
+ disk_tier: Optional['resources_utils.DiskTier'] = None
153
153
  ) -> Optional[str]:
154
154
  return service_catalog.get_default_instance_type(cpus=cpus,
155
155
  memory=memory,
@@ -160,7 +160,7 @@ class SCP(clouds.Cloud):
160
160
  def get_accelerators_from_instance_type(
161
161
  cls,
162
162
  instance_type: str,
163
- ) -> Optional[Dict[str, int]]:
163
+ ) -> Optional[Dict[str, Union[int, float]]]:
164
164
  return service_catalog.get_accelerators_from_instance_type(
165
165
  instance_type, clouds='scp')
166
166
 
@@ -179,20 +179,19 @@ class SCP(clouds.Cloud):
179
179
  def make_deploy_resources_variables(
180
180
  self,
181
181
  resources: 'resources_lib.Resources',
182
- cluster_name_on_cloud: str,
182
+ cluster_name: 'resources_utils.ClusterName',
183
183
  region: 'clouds.Region',
184
184
  zones: Optional[List['clouds.Zone']],
185
+ num_nodes: int,
185
186
  dryrun: bool = False) -> Dict[str, Optional[str]]:
186
- del cluster_name_on_cloud, dryrun # Unused.
187
+ del cluster_name, dryrun # Unused.
187
188
  assert zones is None, 'SCP does not support zones.'
188
189
 
189
190
  r = resources
190
191
  acc_dict = self.get_accelerators_from_instance_type(r.instance_type)
192
+ custom_resources = resources_utils.make_ray_custom_resources_str(
193
+ acc_dict)
191
194
 
192
- if acc_dict is not None:
193
- custom_resources = json.dumps(acc_dict, separators=(',', ':'))
194
- else:
195
- custom_resources = None
196
195
  image_id = self._get_image_id(r.image_id, region.name, r.instance_type)
197
196
  return {
198
197
  'instance_type': resources.instance_type,
@@ -251,16 +250,18 @@ class SCP(clouds.Cloud):
251
250
 
252
251
  def _get_feasible_launchable_resources(
253
252
  self, resources: 'resources_lib.Resources'
254
- ) -> Tuple[List['resources_lib.Resources'], List[str]]:
253
+ ) -> 'resources_utils.FeasibleResources':
255
254
  # Check if the host VM satisfies the min/max disk size limits.
256
255
  is_allowed = self._is_disk_size_allowed(resources)
257
256
  if not is_allowed:
258
- return ([], [])
257
+ # TODO: Add hints to all return values in this method to help
258
+ # users understand why the resources are not launchable.
259
+ return resources_utils.FeasibleResources([], [], None)
259
260
  if resources.instance_type is not None:
260
261
  assert resources.is_launchable(), resources
261
262
  # Accelerators are part of the instance type in SCP Cloud
262
263
  resources = resources.copy(accelerators=None)
263
- return ([resources], [])
264
+ return resources_utils.FeasibleResources([resources], [], None)
264
265
 
265
266
  def _make(instance_list):
266
267
  resource_list = []
@@ -287,9 +288,10 @@ class SCP(clouds.Cloud):
287
288
  memory=resources.memory,
288
289
  disk_tier=resources.disk_tier)
289
290
  if default_instance_type is None:
290
- return ([], [])
291
+ return resources_utils.FeasibleResources([], [], None)
291
292
  else:
292
- return (_make([default_instance_type]), [])
293
+ return resources_utils.FeasibleResources(
294
+ _make([default_instance_type]), [], None)
293
295
 
294
296
  assert len(accelerators) == 1, resources
295
297
  acc, acc_count = list(accelerators.items())[0]
@@ -304,8 +306,10 @@ class SCP(clouds.Cloud):
304
306
  zone=resources.zone,
305
307
  clouds='scp')
306
308
  if instance_list is None:
307
- return ([], fuzzy_candidate_list)
308
- return (_make(instance_list), fuzzy_candidate_list)
309
+ return resources_utils.FeasibleResources([], fuzzy_candidate_list,
310
+ None)
311
+ return resources_utils.FeasibleResources(_make(instance_list),
312
+ fuzzy_candidate_list, None)
309
313
 
310
314
  @classmethod
311
315
  def check_credentials(cls) -> Tuple[bool, Optional[str]]:
@@ -332,8 +336,8 @@ class SCP(clouds.Cloud):
332
336
  }
333
337
 
334
338
  @classmethod
335
- def get_current_user_identity(cls) -> Optional[List[str]]:
336
- # TODO(jgoo1): Implement get_current_user_identity for SCP
339
+ def get_user_identities(cls) -> Optional[List[List[str]]]:
340
+ # TODO(jgoo1): Implement get_user_identities for SCP
337
341
  return None
338
342
 
339
343
  def instance_type_exists(self, instance_type: str) -> bool:
@@ -10,6 +10,7 @@ from sky.clouds.service_catalog.constants import CATALOG_DIR
10
10
  from sky.clouds.service_catalog.constants import CATALOG_SCHEMA_VERSION
11
11
  from sky.clouds.service_catalog.constants import HOSTED_CATALOG_DIR_URL
12
12
  from sky.utils import resources_utils
13
+ from sky.utils import subprocess_utils
13
14
 
14
15
  if typing.TYPE_CHECKING:
15
16
  from sky.clouds import cloud
@@ -31,11 +32,10 @@ def _map_clouds_catalog(clouds: CloudFilter, method_name: str, *args, **kwargs):
31
32
  if single:
32
33
  clouds = [clouds] # type: ignore
33
34
 
34
- results = []
35
- for cloud in clouds:
35
+ def _execute_catalog_method(cloud: str):
36
36
  try:
37
37
  cloud_module = importlib.import_module(
38
- f'sky.clouds.service_catalog.{cloud}_catalog')
38
+ f'sky.clouds.service_catalog.{cloud.lower()}_catalog')
39
39
  except ModuleNotFoundError:
40
40
  raise ValueError(
41
41
  'Cannot find module "sky.clouds.service_catalog'
@@ -46,7 +46,11 @@ def _map_clouds_catalog(clouds: CloudFilter, method_name: str, *args, **kwargs):
46
46
  raise AttributeError(
47
47
  f'Module "{cloud}_catalog" does not '
48
48
  f'implement the "{method_name}" method') from None
49
- results.append(method(*args, **kwargs))
49
+ return method(*args, **kwargs)
50
+
51
+ results = subprocess_utils.run_in_parallel(_execute_catalog_method,
52
+ args=list(clouds),
53
+ num_threads=len(clouds))
50
54
  if single:
51
55
  return results[0]
52
56
  return results
@@ -63,7 +67,7 @@ def list_accelerators(
63
67
  all_regions: bool = False,
64
68
  require_price: bool = True,
65
69
  ) -> 'Dict[str, List[common.InstanceTypeInfo]]':
66
- """List the names of all accelerators offered by Sky.
70
+ """Lists the names of all accelerators offered by Sky.
67
71
 
68
72
  This will include all accelerators offered by Sky, including those
69
73
  that may not be available in the user's account.
@@ -91,7 +95,7 @@ def list_accelerator_counts(
91
95
  quantity_filter: Optional[int] = None,
92
96
  clouds: CloudFilter = None,
93
97
  ) -> Dict[str, List[int]]:
94
- """List all accelerators offered by Sky and available counts.
98
+ """Lists all accelerators offered by Sky and available counts.
95
99
 
96
100
  Returns: A dictionary of canonical accelerator names mapped to a list
97
101
  of available counts. See usage in cli.py.
@@ -125,7 +129,7 @@ def list_accelerator_realtime(
125
129
  clouds: CloudFilter = None,
126
130
  case_sensitive: bool = True,
127
131
  ) -> Tuple[Dict[str, List[int]], Dict[str, int], Dict[str, int]]:
128
- """List all accelerators offered by Sky with their realtime availability.
132
+ """Lists all accelerators offered by Sky with their realtime availability.
129
133
 
130
134
  Realtime availability is the total number of accelerators in the cluster
131
135
  and number of accelerators available at the time of the call.
@@ -238,7 +242,7 @@ def get_default_instance_type(cpus: Optional[str] = None,
238
242
 
239
243
  def get_accelerators_from_instance_type(
240
244
  instance_type: str,
241
- clouds: CloudFilter = None) -> Optional[Dict[str, int]]:
245
+ clouds: CloudFilter = None) -> Optional[Dict[str, Union[int, float]]]:
242
246
  """Returns the accelerators from a instance type."""
243
247
  return _map_clouds_catalog(clouds, 'get_accelerators_from_instance_type',
244
248
  instance_type)
@@ -324,9 +328,8 @@ def get_common_gpus() -> List[str]:
324
328
  'A100',
325
329
  'A100-80GB',
326
330
  'H100',
327
- 'K80',
328
331
  'L4',
329
- 'M60',
332
+ 'L40S',
330
333
  'P100',
331
334
  'T4',
332
335
  'V100',
@@ -337,10 +340,13 @@ def get_common_gpus() -> List[str]:
337
340
  def get_tpus() -> List[str]:
338
341
  """Returns a list of TPU names."""
339
342
  # TODO(wei-lin): refactor below hard-coded list.
343
+ # There are many TPU configurations available; we show some of the smallest
344
+ # ones for each generation, and people should find larger ones with
345
+ # sky show-gpus tpu.
340
346
  return [
341
- 'tpu-v2-8', 'tpu-v2-32', 'tpu-v2-128', 'tpu-v2-256', 'tpu-v2-512',
342
- 'tpu-v3-8', 'tpu-v3-32', 'tpu-v3-64', 'tpu-v3-128', 'tpu-v3-256',
343
- 'tpu-v3-512', 'tpu-v3-1024', 'tpu-v3-2048'
347
+ 'tpu-v2-8', 'tpu-v3-8', 'tpu-v4-8', 'tpu-v4-16', 'tpu-v4-32',
348
+ 'tpu-v5litepod-1', 'tpu-v5litepod-4', 'tpu-v5litepod-8', 'tpu-v5p-8',
349
+ 'tpu-v5p-16', 'tpu-v5p-32', 'tpu-v6e-1', 'tpu-v6e-4', 'tpu-v6e-8'
344
350
  ]
345
351
 
346
352
 
@@ -8,9 +8,7 @@ import hashlib
8
8
  import os
9
9
  import threading
10
10
  import typing
11
- from typing import Dict, List, Optional, Tuple
12
-
13
- import colorama
11
+ from typing import Dict, List, Optional, Tuple, Union
14
12
 
15
13
  from sky import exceptions
16
14
  from sky import sky_logging
@@ -21,6 +19,9 @@ from sky.clouds.service_catalog import config
21
19
  from sky.clouds.service_catalog.data_fetchers import fetch_aws
22
20
  from sky.utils import common_utils
23
21
  from sky.utils import resources_utils
22
+ from sky.utils import rich_utils
23
+ from sky.utils import timeline
24
+ from sky.utils import ux_utils
24
25
 
25
26
  if typing.TYPE_CHECKING:
26
27
  import pandas as pd
@@ -82,11 +83,10 @@ def _get_az_mappings(aws_user_hash: str) -> Optional['pd.DataFrame']:
82
83
  az_mappings = None
83
84
  if aws_user_hash != 'default':
84
85
  # Fetch az mapping from AWS.
85
- print(
86
- f'\r{colorama.Style.DIM}AWS: Fetching availability zones '
87
- f'mapping...{colorama.Style.RESET_ALL}',
88
- end='')
89
- az_mappings = fetch_aws.fetch_availability_zone_mappings()
86
+ with rich_utils.safe_status(
87
+ ux_utils.spinner_message('AWS: Fetching availability '
88
+ 'zones mapping')):
89
+ az_mappings = fetch_aws.fetch_availability_zone_mappings()
90
90
  else:
91
91
  return None
92
92
  az_mappings.to_csv(az_mapping_path, index=False)
@@ -101,6 +101,7 @@ def _get_az_mappings(aws_user_hash: str) -> Optional['pd.DataFrame']:
101
101
  return az_mappings
102
102
 
103
103
 
104
+ @timeline.event
104
105
  def _fetch_and_apply_az_mapping(df: common.LazyDataFrame) -> 'pd.DataFrame':
105
106
  """Maps zone IDs (use1-az1) to zone names (us-east-1x).
106
107
 
@@ -123,11 +124,17 @@ def _fetch_and_apply_az_mapping(df: common.LazyDataFrame) -> 'pd.DataFrame':
123
124
  with the zone name (e.g. us-east-1a).
124
125
  """
125
126
  try:
126
- user_identity_list = aws.AWS.get_current_user_identity()
127
+ user_identity_list = aws.AWS.get_active_user_identity()
127
128
  assert user_identity_list, user_identity_list
128
129
  user_identity = user_identity_list[0]
129
130
  aws_user_hash = hashlib.md5(user_identity.encode()).hexdigest()[:8]
130
- except exceptions.CloudUserIdentityError:
131
+ except (exceptions.CloudUserIdentityError, ImportError):
132
+ # If we fail to get the user identity or to import AWS dependencies, we use the
133
+ # latest mapping file or the default mapping file.
134
+ # The import error can happen on the client side when the user does not
135
+ # have AWS dependencies installed.
136
+ # TODO(zhwu): we should avoid the dependency on the availability zone
137
+ # mapping so as to get rid of the import error.
131
138
  glob_name = common.get_catalog_path('aws/az_mappings-*.csv')
132
139
  # Find the most recent file that matches the glob.
133
140
  # We check the existing files because the user could remove the
@@ -244,7 +251,7 @@ def get_default_instance_type(
244
251
 
245
252
 
246
253
  def get_accelerators_from_instance_type(
247
- instance_type: str) -> Optional[Dict[str, int]]:
254
+ instance_type: str) -> Optional[Dict[str, Union[int, float]]]:
248
255
  return common.get_accelerators_from_instance_type_impl(
249
256
  _get_df(), instance_type)
250
257
 
@@ -309,7 +316,17 @@ def list_accelerators(
309
316
 
310
317
  def get_image_id_from_tag(tag: str, region: Optional[str]) -> Optional[str]:
311
318
  """Returns the image id from the tag."""
312
- return common.get_image_id_from_tag_impl(_image_df, tag, region)
319
+ global _image_df
320
+
321
+ image_id = common.get_image_id_from_tag_impl(_image_df, tag, region)
322
+ if image_id is None:
323
+ # Refresh the image catalog and try again, if the image tag is not
324
+ # found.
325
+ logger.debug('Refreshing the image catalog and trying again.')
326
+ _image_df = common.read_catalog('aws/images.csv',
327
+ pull_frequency_hours=0)
328
+ image_id = common.get_image_id_from_tag_impl(_image_df, tag, region)
329
+ return image_id
313
330
 
314
331
 
315
332
  def is_image_tag_valid(tag: str, region: Optional[str]) -> bool:
@@ -4,14 +4,32 @@ This module loads the service catalog file and can be used to query
4
4
  instance types and pricing information for Azure.
5
5
  """
6
6
  import re
7
- from typing import Dict, List, Optional, Tuple
7
+ from typing import Dict, List, Optional, Tuple, Union
8
8
 
9
9
  from sky import clouds as cloud_lib
10
+ from sky import sky_logging
10
11
  from sky.clouds import Azure
11
12
  from sky.clouds.service_catalog import common
12
13
  from sky.utils import resources_utils
13
14
  from sky.utils import ux_utils
14
15
 
16
+ logger = sky_logging.init_logger(__name__)
17
+
18
+ # This list should match the list of regions in
19
+ # skypilot image generation Packer script's replication_regions
20
+ # sky/clouds/service_catalog/images/skypilot-azure-cpu-ubuntu.pkr.hcl
21
+ COMMUNITY_IMAGE_AVAILABLE_REGIONS = {
22
+ 'centralus',
23
+ 'eastus',
24
+ 'eastus2',
25
+ 'northcentralus',
26
+ 'southcentralus',
27
+ 'westcentralus',
28
+ 'westus',
29
+ 'westus2',
30
+ 'westus3',
31
+ }
32
+
15
33
  # The frequency of pulling the latest catalog from the cloud provider.
16
34
  # Though the catalog update is manual in our skypilot-catalog repo, we
17
35
  # still want to pull the latest catalog periodically to make sure the
@@ -110,7 +128,8 @@ def get_default_instance_type(
110
128
  _DEFAULT_INSTANCE_FAMILY)]
111
129
 
112
130
  def _filter_disk_type(instance_type: str) -> bool:
113
- return Azure.check_disk_tier(instance_type, disk_tier)[0]
131
+ valid, _ = Azure.check_disk_tier(instance_type, disk_tier)
132
+ return valid
114
133
 
115
134
  df = df.loc[df['InstanceType'].apply(_filter_disk_type)]
116
135
  return common.get_instance_type_for_cpus_mem_impl(df, cpus,
@@ -118,7 +137,7 @@ def get_default_instance_type(
118
137
 
119
138
 
120
139
  def get_accelerators_from_instance_type(
121
- instance_type: str) -> Optional[Dict[str, int]]:
140
+ instance_type: str) -> Optional[Dict[str, Union[int, float]]]:
122
141
  return common.get_accelerators_from_instance_type_impl(_df, instance_type)
123
142
 
124
143
 
@@ -138,6 +157,7 @@ def get_instance_type_for_accelerator(
138
157
  if zone is not None:
139
158
  with ux_utils.print_exception_no_traceback():
140
159
  raise ValueError('Azure does not support zones.')
160
+
141
161
  return common.get_instance_type_for_accelerator_impl(df=_df,
142
162
  acc_name=acc_name,
143
163
  acc_count=acc_count,
@@ -175,9 +195,16 @@ def list_accelerators(
175
195
 
176
196
  def get_image_id_from_tag(tag: str, region: Optional[str]) -> Optional[str]:
177
197
  """Returns the image id from the tag."""
178
- # Azure images are not region-specific.
179
- del region # Unused.
180
- return common.get_image_id_from_tag_impl(_image_df, tag, None)
198
+ global _image_df
199
+ image_id = common.get_image_id_from_tag_impl(_image_df, tag, region)
200
+ if image_id is None:
201
+ # Refresh the image catalog and try again, if the image tag is not
202
+ # found.
203
+ logger.debug('Refreshing the image catalog and trying again.')
204
+ _image_df = common.read_catalog('azure/images.csv',
205
+ pull_frequency_hours=0)
206
+ image_id = common.get_image_id_from_tag_impl(_image_df, tag, region)
207
+ return image_id
181
208
 
182
209
 
183
210
  def is_image_tag_valid(tag: str, region: Optional[str]) -> bool: