skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (299)
  1. sky/__init__.py +64 -32
  2. sky/adaptors/aws.py +23 -6
  3. sky/adaptors/azure.py +432 -15
  4. sky/adaptors/cloudflare.py +5 -5
  5. sky/adaptors/common.py +19 -9
  6. sky/adaptors/do.py +20 -0
  7. sky/adaptors/gcp.py +3 -2
  8. sky/adaptors/kubernetes.py +122 -88
  9. sky/adaptors/nebius.py +100 -0
  10. sky/adaptors/oci.py +39 -1
  11. sky/adaptors/vast.py +29 -0
  12. sky/admin_policy.py +101 -0
  13. sky/authentication.py +117 -98
  14. sky/backends/backend.py +52 -20
  15. sky/backends/backend_utils.py +669 -557
  16. sky/backends/cloud_vm_ray_backend.py +1099 -808
  17. sky/backends/local_docker_backend.py +14 -8
  18. sky/backends/wheel_utils.py +38 -20
  19. sky/benchmark/benchmark_utils.py +22 -23
  20. sky/check.py +76 -27
  21. sky/cli.py +1586 -1139
  22. sky/client/__init__.py +1 -0
  23. sky/client/cli.py +5683 -0
  24. sky/client/common.py +345 -0
  25. sky/client/sdk.py +1765 -0
  26. sky/cloud_stores.py +283 -19
  27. sky/clouds/__init__.py +7 -2
  28. sky/clouds/aws.py +303 -112
  29. sky/clouds/azure.py +185 -179
  30. sky/clouds/cloud.py +115 -37
  31. sky/clouds/cudo.py +29 -22
  32. sky/clouds/do.py +313 -0
  33. sky/clouds/fluidstack.py +44 -54
  34. sky/clouds/gcp.py +206 -65
  35. sky/clouds/ibm.py +26 -21
  36. sky/clouds/kubernetes.py +345 -91
  37. sky/clouds/lambda_cloud.py +40 -29
  38. sky/clouds/nebius.py +297 -0
  39. sky/clouds/oci.py +129 -90
  40. sky/clouds/paperspace.py +22 -18
  41. sky/clouds/runpod.py +53 -34
  42. sky/clouds/scp.py +28 -24
  43. sky/clouds/service_catalog/__init__.py +19 -13
  44. sky/clouds/service_catalog/aws_catalog.py +29 -12
  45. sky/clouds/service_catalog/azure_catalog.py +33 -6
  46. sky/clouds/service_catalog/common.py +95 -75
  47. sky/clouds/service_catalog/constants.py +3 -3
  48. sky/clouds/service_catalog/cudo_catalog.py +13 -3
  49. sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
  50. sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
  51. sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
  52. sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
  53. sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
  54. sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
  55. sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
  56. sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
  57. sky/clouds/service_catalog/do_catalog.py +111 -0
  58. sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
  59. sky/clouds/service_catalog/gcp_catalog.py +16 -2
  60. sky/clouds/service_catalog/ibm_catalog.py +2 -2
  61. sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
  62. sky/clouds/service_catalog/lambda_catalog.py +8 -3
  63. sky/clouds/service_catalog/nebius_catalog.py +116 -0
  64. sky/clouds/service_catalog/oci_catalog.py +31 -4
  65. sky/clouds/service_catalog/paperspace_catalog.py +2 -2
  66. sky/clouds/service_catalog/runpod_catalog.py +2 -2
  67. sky/clouds/service_catalog/scp_catalog.py +2 -2
  68. sky/clouds/service_catalog/vast_catalog.py +104 -0
  69. sky/clouds/service_catalog/vsphere_catalog.py +2 -2
  70. sky/clouds/utils/aws_utils.py +65 -0
  71. sky/clouds/utils/azure_utils.py +91 -0
  72. sky/clouds/utils/gcp_utils.py +5 -9
  73. sky/clouds/utils/oci_utils.py +47 -5
  74. sky/clouds/utils/scp_utils.py +4 -3
  75. sky/clouds/vast.py +280 -0
  76. sky/clouds/vsphere.py +22 -18
  77. sky/core.py +361 -107
  78. sky/dag.py +41 -28
  79. sky/data/data_transfer.py +37 -0
  80. sky/data/data_utils.py +211 -32
  81. sky/data/mounting_utils.py +182 -30
  82. sky/data/storage.py +2118 -270
  83. sky/data/storage_utils.py +126 -5
  84. sky/exceptions.py +179 -8
  85. sky/execution.py +158 -85
  86. sky/global_user_state.py +150 -34
  87. sky/jobs/__init__.py +12 -10
  88. sky/jobs/client/__init__.py +0 -0
  89. sky/jobs/client/sdk.py +302 -0
  90. sky/jobs/constants.py +49 -11
  91. sky/jobs/controller.py +161 -99
  92. sky/jobs/dashboard/dashboard.py +171 -25
  93. sky/jobs/dashboard/templates/index.html +572 -60
  94. sky/jobs/recovery_strategy.py +157 -156
  95. sky/jobs/scheduler.py +307 -0
  96. sky/jobs/server/__init__.py +1 -0
  97. sky/jobs/server/core.py +598 -0
  98. sky/jobs/server/dashboard_utils.py +69 -0
  99. sky/jobs/server/server.py +190 -0
  100. sky/jobs/state.py +627 -122
  101. sky/jobs/utils.py +615 -206
  102. sky/models.py +27 -0
  103. sky/optimizer.py +142 -83
  104. sky/provision/__init__.py +20 -5
  105. sky/provision/aws/config.py +124 -42
  106. sky/provision/aws/instance.py +130 -53
  107. sky/provision/azure/__init__.py +7 -0
  108. sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
  109. sky/provision/azure/config.py +220 -0
  110. sky/provision/azure/instance.py +1012 -37
  111. sky/provision/common.py +31 -3
  112. sky/provision/constants.py +25 -0
  113. sky/provision/cudo/__init__.py +2 -1
  114. sky/provision/cudo/cudo_utils.py +112 -0
  115. sky/provision/cudo/cudo_wrapper.py +37 -16
  116. sky/provision/cudo/instance.py +28 -12
  117. sky/provision/do/__init__.py +11 -0
  118. sky/provision/do/config.py +14 -0
  119. sky/provision/do/constants.py +10 -0
  120. sky/provision/do/instance.py +287 -0
  121. sky/provision/do/utils.py +301 -0
  122. sky/provision/docker_utils.py +82 -46
  123. sky/provision/fluidstack/fluidstack_utils.py +57 -125
  124. sky/provision/fluidstack/instance.py +15 -43
  125. sky/provision/gcp/config.py +19 -9
  126. sky/provision/gcp/constants.py +7 -1
  127. sky/provision/gcp/instance.py +55 -34
  128. sky/provision/gcp/instance_utils.py +339 -80
  129. sky/provision/gcp/mig_utils.py +210 -0
  130. sky/provision/instance_setup.py +172 -133
  131. sky/provision/kubernetes/__init__.py +1 -0
  132. sky/provision/kubernetes/config.py +104 -90
  133. sky/provision/kubernetes/constants.py +8 -0
  134. sky/provision/kubernetes/instance.py +680 -325
  135. sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
  136. sky/provision/kubernetes/network.py +54 -20
  137. sky/provision/kubernetes/network_utils.py +70 -21
  138. sky/provision/kubernetes/utils.py +1370 -251
  139. sky/provision/lambda_cloud/__init__.py +11 -0
  140. sky/provision/lambda_cloud/config.py +10 -0
  141. sky/provision/lambda_cloud/instance.py +265 -0
  142. sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
  143. sky/provision/logging.py +1 -1
  144. sky/provision/nebius/__init__.py +11 -0
  145. sky/provision/nebius/config.py +11 -0
  146. sky/provision/nebius/instance.py +285 -0
  147. sky/provision/nebius/utils.py +318 -0
  148. sky/provision/oci/__init__.py +15 -0
  149. sky/provision/oci/config.py +51 -0
  150. sky/provision/oci/instance.py +436 -0
  151. sky/provision/oci/query_utils.py +681 -0
  152. sky/provision/paperspace/constants.py +6 -0
  153. sky/provision/paperspace/instance.py +4 -3
  154. sky/provision/paperspace/utils.py +2 -0
  155. sky/provision/provisioner.py +207 -130
  156. sky/provision/runpod/__init__.py +1 -0
  157. sky/provision/runpod/api/__init__.py +3 -0
  158. sky/provision/runpod/api/commands.py +119 -0
  159. sky/provision/runpod/api/pods.py +142 -0
  160. sky/provision/runpod/instance.py +64 -8
  161. sky/provision/runpod/utils.py +239 -23
  162. sky/provision/vast/__init__.py +10 -0
  163. sky/provision/vast/config.py +11 -0
  164. sky/provision/vast/instance.py +247 -0
  165. sky/provision/vast/utils.py +162 -0
  166. sky/provision/vsphere/common/vim_utils.py +1 -1
  167. sky/provision/vsphere/instance.py +8 -18
  168. sky/provision/vsphere/vsphere_utils.py +1 -1
  169. sky/resources.py +247 -102
  170. sky/serve/__init__.py +9 -9
  171. sky/serve/autoscalers.py +361 -299
  172. sky/serve/client/__init__.py +0 -0
  173. sky/serve/client/sdk.py +366 -0
  174. sky/serve/constants.py +12 -3
  175. sky/serve/controller.py +106 -36
  176. sky/serve/load_balancer.py +63 -12
  177. sky/serve/load_balancing_policies.py +84 -2
  178. sky/serve/replica_managers.py +42 -34
  179. sky/serve/serve_state.py +62 -32
  180. sky/serve/serve_utils.py +271 -160
  181. sky/serve/server/__init__.py +0 -0
  182. sky/serve/{core.py → server/core.py} +271 -90
  183. sky/serve/server/server.py +112 -0
  184. sky/serve/service.py +52 -16
  185. sky/serve/service_spec.py +95 -32
  186. sky/server/__init__.py +1 -0
  187. sky/server/common.py +430 -0
  188. sky/server/constants.py +21 -0
  189. sky/server/html/log.html +174 -0
  190. sky/server/requests/__init__.py +0 -0
  191. sky/server/requests/executor.py +472 -0
  192. sky/server/requests/payloads.py +487 -0
  193. sky/server/requests/queues/__init__.py +0 -0
  194. sky/server/requests/queues/mp_queue.py +76 -0
  195. sky/server/requests/requests.py +567 -0
  196. sky/server/requests/serializers/__init__.py +0 -0
  197. sky/server/requests/serializers/decoders.py +192 -0
  198. sky/server/requests/serializers/encoders.py +166 -0
  199. sky/server/server.py +1106 -0
  200. sky/server/stream_utils.py +141 -0
  201. sky/setup_files/MANIFEST.in +2 -5
  202. sky/setup_files/dependencies.py +159 -0
  203. sky/setup_files/setup.py +14 -125
  204. sky/sky_logging.py +59 -14
  205. sky/skylet/autostop_lib.py +2 -2
  206. sky/skylet/constants.py +183 -50
  207. sky/skylet/events.py +22 -10
  208. sky/skylet/job_lib.py +403 -258
  209. sky/skylet/log_lib.py +111 -71
  210. sky/skylet/log_lib.pyi +6 -0
  211. sky/skylet/providers/command_runner.py +6 -8
  212. sky/skylet/providers/ibm/node_provider.py +2 -2
  213. sky/skylet/providers/scp/config.py +11 -3
  214. sky/skylet/providers/scp/node_provider.py +8 -8
  215. sky/skylet/skylet.py +3 -1
  216. sky/skylet/subprocess_daemon.py +69 -17
  217. sky/skypilot_config.py +119 -57
  218. sky/task.py +205 -64
  219. sky/templates/aws-ray.yml.j2 +37 -7
  220. sky/templates/azure-ray.yml.j2 +27 -82
  221. sky/templates/cudo-ray.yml.j2 +7 -3
  222. sky/templates/do-ray.yml.j2 +98 -0
  223. sky/templates/fluidstack-ray.yml.j2 +7 -4
  224. sky/templates/gcp-ray.yml.j2 +26 -6
  225. sky/templates/ibm-ray.yml.j2 +3 -2
  226. sky/templates/jobs-controller.yaml.j2 +46 -11
  227. sky/templates/kubernetes-ingress.yml.j2 +7 -0
  228. sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
  229. sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
  230. sky/templates/kubernetes-ray.yml.j2 +292 -25
  231. sky/templates/lambda-ray.yml.j2 +30 -40
  232. sky/templates/nebius-ray.yml.j2 +79 -0
  233. sky/templates/oci-ray.yml.j2 +18 -57
  234. sky/templates/paperspace-ray.yml.j2 +10 -6
  235. sky/templates/runpod-ray.yml.j2 +26 -4
  236. sky/templates/scp-ray.yml.j2 +3 -2
  237. sky/templates/sky-serve-controller.yaml.j2 +12 -1
  238. sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
  239. sky/templates/vast-ray.yml.j2 +70 -0
  240. sky/templates/vsphere-ray.yml.j2 +8 -3
  241. sky/templates/websocket_proxy.py +64 -0
  242. sky/usage/constants.py +10 -1
  243. sky/usage/usage_lib.py +130 -37
  244. sky/utils/accelerator_registry.py +35 -51
  245. sky/utils/admin_policy_utils.py +147 -0
  246. sky/utils/annotations.py +51 -0
  247. sky/utils/cli_utils/status_utils.py +81 -23
  248. sky/utils/cluster_utils.py +356 -0
  249. sky/utils/command_runner.py +452 -89
  250. sky/utils/command_runner.pyi +77 -3
  251. sky/utils/common.py +54 -0
  252. sky/utils/common_utils.py +319 -108
  253. sky/utils/config_utils.py +204 -0
  254. sky/utils/control_master_utils.py +48 -0
  255. sky/utils/controller_utils.py +548 -266
  256. sky/utils/dag_utils.py +93 -32
  257. sky/utils/db_utils.py +18 -4
  258. sky/utils/env_options.py +29 -7
  259. sky/utils/kubernetes/create_cluster.sh +8 -60
  260. sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
  261. sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
  262. sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
  263. sky/utils/kubernetes/gpu_labeler.py +4 -4
  264. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
  265. sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
  266. sky/utils/kubernetes/rsync_helper.sh +24 -0
  267. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
  268. sky/utils/log_utils.py +240 -33
  269. sky/utils/message_utils.py +81 -0
  270. sky/utils/registry.py +127 -0
  271. sky/utils/resources_utils.py +94 -22
  272. sky/utils/rich_utils.py +247 -18
  273. sky/utils/schemas.py +284 -64
  274. sky/{status_lib.py → utils/status_lib.py} +12 -7
  275. sky/utils/subprocess_utils.py +212 -46
  276. sky/utils/timeline.py +12 -7
  277. sky/utils/ux_utils.py +168 -15
  278. skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
  279. skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
  280. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
  281. sky/clouds/cloud_registry.py +0 -31
  282. sky/jobs/core.py +0 -330
  283. sky/skylet/providers/azure/__init__.py +0 -2
  284. sky/skylet/providers/azure/azure-vm-template.json +0 -301
  285. sky/skylet/providers/azure/config.py +0 -170
  286. sky/skylet/providers/azure/node_provider.py +0 -466
  287. sky/skylet/providers/lambda_cloud/__init__.py +0 -2
  288. sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
  289. sky/skylet/providers/oci/__init__.py +0 -2
  290. sky/skylet/providers/oci/node_provider.py +0 -488
  291. sky/skylet/providers/oci/query_helper.py +0 -383
  292. sky/skylet/providers/oci/utils.py +0 -21
  293. sky/utils/cluster_yaml_utils.py +0 -24
  294. sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
  295. skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
  296. skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
  297. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
  298. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
  299. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
sky/clouds/oci.py CHANGED
@@ -4,21 +4,37 @@ History:
4
4
  - Hysun He (hysun.he@oracle.com) @ Apr, 2023: Initial implementation
5
5
  - Hysun He (hysun.he@oracle.com) @ May 4, 2023: Support use the default
6
6
  image_id (configurable) if no image_id specified in the task yaml.
7
+ - Hysun He (hysun.he@oracle.com) @ Oct 12, 2024:
8
+ get_credential_file_mounts(): bug fix for sky config
9
+ file path resolution (by os.path.expanduser) when constructing the file
10
+ mounts. This bug will cause the created worker nodes to be located in a different
11
+ compartment and VCN than the head node if the user specifies compartment_id
12
+ in the sky config file, because the ~/.sky/config.yaml is not sync-ed to the
13
+ remote machine.
14
+ The workaround is to set the sky config file path using ENV before running
15
+ the sky launch: export SKYPILOT_CONFIG=/home/ubuntu/.sky/config.yaml
16
+ - Hysun He (hysun.he@oracle.com) @ Oct 12, 2024:
17
+ make_deploy_resources_variables(): Bug fix for specifying the image_id as
18
+ the ocid of the image in the task.yaml file, in this case the image_id
19
+ for the node config should be set to the ocid instead of a dict.
20
+ - Hysun He (hysun.he@oracle.com) @ Oct 13, 2024:
21
+ Support more OS types in addition to Ubuntu for OCI resources.
7
22
  """
8
- import json
9
23
  import logging
10
24
  import os
11
25
  import typing
12
- from typing import Dict, Iterator, List, Optional, Tuple
26
+ from typing import Dict, Iterator, List, Optional, Tuple, Union
13
27
 
14
28
  from sky import clouds
15
29
  from sky import exceptions
16
- from sky import status_lib
17
30
  from sky.adaptors import oci as oci_adaptor
18
31
  from sky.clouds import service_catalog
19
32
  from sky.clouds.utils import oci_utils
33
+ from sky.provision.oci.query_utils import query_helper
20
34
  from sky.utils import common_utils
35
+ from sky.utils import registry
21
36
  from sky.utils import resources_utils
37
+ from sky.utils import status_lib
22
38
  from sky.utils import ux_utils
23
39
 
24
40
  if typing.TYPE_CHECKING:
@@ -30,7 +46,7 @@ logger = logging.getLogger(__name__)
30
46
  _tenancy_prefix: Optional[str] = None
31
47
 
32
48
 
33
- @clouds.CLOUD_REGISTRY.register
49
+ @registry.CLOUD_REGISTRY.register
34
50
  class OCI(clouds.Cloud):
35
51
  """OCI: Oracle Cloud Infrastructure """
36
52
 
@@ -42,7 +58,12 @@ class OCI(clouds.Cloud):
42
58
 
43
59
  _INDENT_PREFIX = ' '
44
60
 
45
- _SUPPORTED_DISK_TIERS = set(resources_utils.DiskTier)
61
+ _SUPPORTED_DISK_TIERS = (set(resources_utils.DiskTier) -
62
+ {resources_utils.DiskTier.ULTRA})
63
+ _BEST_DISK_TIER = resources_utils.DiskTier.HIGH
64
+
65
+ PROVISIONER_VERSION = clouds.ProvisionerVersion.SKYPILOT
66
+ STATUS_VERSION = clouds.StatusVersion.SKYPILOT
46
67
 
47
68
  @classmethod
48
69
  def _unsupported_features_for_resources(
@@ -55,8 +76,6 @@ class OCI(clouds.Cloud):
55
76
  (f'Docker image is currently not supported on {cls._REPR}. '
56
77
  'You can try running docker command inside the '
57
78
  '`run` section in task.yaml.'),
58
- clouds.CloudImplementationFeatures.OPEN_PORTS:
59
- (f'Opening ports is currently not supported on {cls._REPR}.'),
60
79
  }
61
80
  if resources.use_spot:
62
81
  features[clouds.CloudImplementationFeatures.STOP] = (
@@ -176,7 +195,7 @@ class OCI(clouds.Cloud):
176
195
  def get_accelerators_from_instance_type(
177
196
  cls,
178
197
  instance_type: str,
179
- ) -> Optional[Dict[str, int]]:
198
+ ) -> Optional[Dict[str, Union[int, float]]]:
180
199
  return service_catalog.get_accelerators_from_instance_type(
181
200
  instance_type, clouds='oci')
182
201
 
@@ -187,19 +206,18 @@ class OCI(clouds.Cloud):
187
206
  def make_deploy_resources_variables(
188
207
  self,
189
208
  resources: 'resources_lib.Resources',
190
- cluster_name_on_cloud: str,
209
+ cluster_name: resources_utils.ClusterName,
191
210
  region: Optional['clouds.Region'],
192
211
  zones: Optional[List['clouds.Zone']],
212
+ num_nodes: int,
193
213
  dryrun: bool = False) -> Dict[str, Optional[str]]:
194
- del cluster_name_on_cloud, dryrun # Unused.
214
+ del cluster_name, dryrun # Unused.
195
215
  assert region is not None, resources
196
216
 
197
217
  acc_dict = self.get_accelerators_from_instance_type(
198
218
  resources.instance_type)
199
- if acc_dict is not None:
200
- custom_resources = json.dumps(acc_dict, separators=(',', ':'))
201
- else:
202
- custom_resources = None
219
+ custom_resources = resources_utils.make_ray_custom_resources_str(
220
+ acc_dict)
203
221
 
204
222
  image_str = self._get_image_id(resources.image_id, region.name,
205
223
  resources.instance_type)
@@ -209,10 +227,20 @@ class OCI(clouds.Cloud):
209
227
  listing_id = image_cols[1]
210
228
  res_ver = image_cols[2]
211
229
  else:
212
- image_id = resources.image_id
230
+ # Oct.12,2024 by HysunHe: Bug fix - resources.image_id is a
231
+ # dict. The image_id here should be the ocid format.
232
+ image_id = image_str
213
233
  listing_id = None
214
234
  res_ver = None
215
235
 
236
+ os_type = None
237
+ if ':' in image_id:
238
+ # OS type provided in the --image-id. This is the case where
239
+ # custom image's ocid provided in the --image-id parameter.
240
+ # - ocid1.image...aaa:oraclelinux (os type is oraclelinux)
241
+ # - ocid1.image...aaa (OS not provided)
242
+ image_id, os_type = image_id.replace(' ', '').split(':')
243
+
216
244
  cpus = resources.cpus
217
245
  instance_type_arr = resources.instance_type.split(
218
246
  oci_utils.oci_config.INSTANCE_TYPE_RES_SPERATOR)
@@ -278,10 +306,24 @@ class OCI(clouds.Cloud):
278
306
  cpus=None if cpus is None else float(cpus),
279
307
  disk_tier=resources.disk_tier)
280
308
 
309
+ if os_type is None:
310
+ # OS type is not determined yet. So try to get it from vms.csv
311
+ image_str = self._get_image_str(
312
+ image_id=resources.image_id,
313
+ instance_type=resources.instance_type,
314
+ region=region.name)
315
+
316
+ # pylint: disable=import-outside-toplevel
317
+ from sky.clouds.service_catalog import oci_catalog
318
+ os_type = oci_catalog.get_image_os_from_tag(tag=image_str,
319
+ region=region.name)
320
+ logger.debug(f'OS type for the image {image_id} is {os_type}')
321
+
281
322
  return {
282
323
  'instance_type': instance_type,
283
324
  'custom_resources': custom_resources,
284
325
  'region': region.name,
326
+ 'os_type': os_type,
285
327
  'cpus': str(cpus),
286
328
  'memory': resources.memory,
287
329
  'disk_size': resources.disk_size,
@@ -295,11 +337,13 @@ class OCI(clouds.Cloud):
295
337
 
296
338
  def _get_feasible_launchable_resources(
297
339
  self, resources: 'resources_lib.Resources'
298
- ) -> Tuple[List['resources_lib.Resources'], List[str]]:
340
+ ) -> 'resources_utils.FeasibleResources':
299
341
  if resources.instance_type is not None:
300
342
  assert resources.is_launchable(), resources
301
343
  resources = resources.copy(accelerators=None)
302
- return ([resources], [])
344
+ # TODO: Add hints to all return values in this method to help
345
+ # users understand why the resources are not launchable.
346
+ return resources_utils.FeasibleResources([resources], [], None)
303
347
 
304
348
  def _make(instance_list):
305
349
  resource_list = []
@@ -326,9 +370,10 @@ class OCI(clouds.Cloud):
326
370
  disk_tier=resources.disk_tier)
327
371
 
328
372
  if default_instance_type is None:
329
- return ([], [])
373
+ return resources_utils.FeasibleResources([], [], None)
330
374
  else:
331
- return (_make([default_instance_type]), [])
375
+ return resources_utils.FeasibleResources(
376
+ _make([default_instance_type]), [], None)
332
377
 
333
378
  assert len(accelerators) == 1, resources
334
379
 
@@ -344,9 +389,11 @@ class OCI(clouds.Cloud):
344
389
  zone=resources.zone,
345
390
  clouds='oci')
346
391
  if instance_list is None:
347
- return ([], fuzzy_candidate_list)
392
+ return resources_utils.FeasibleResources([], fuzzy_candidate_list,
393
+ None)
348
394
 
349
- return (_make(instance_list), fuzzy_candidate_list)
395
+ return resources_utils.FeasibleResources(_make(instance_list),
396
+ fuzzy_candidate_list, None)
350
397
 
351
398
  @classmethod
352
399
  def check_credentials(cls) -> Tuple[bool, Optional[str]]:
@@ -355,7 +402,7 @@ class OCI(clouds.Cloud):
355
402
  short_credential_help_str = (
356
403
  'For more details, refer to: '
357
404
  # pylint: disable=line-too-long
358
- 'https://skypilot.readthedocs.io/en/latest/getting-started/installation.html#oracle-cloud-infrastructure-oci'
405
+ 'https://docs.skypilot.co/en/latest/getting-started/installation.html#oracle-cloud-infrastructure-oci'
359
406
  )
360
407
  credential_help_str = (
361
408
  'To configure credentials, go to: '
@@ -401,7 +448,7 @@ class OCI(clouds.Cloud):
401
448
  return True, None
402
449
  except (oci_adaptor.oci.exceptions.ConfigFileNotFound,
403
450
  oci_adaptor.oci.exceptions.InvalidConfig,
404
- oci_adaptor.service_exception()) as e:
451
+ oci_adaptor.oci.exceptions.ServiceError) as e:
405
452
  return False, (
406
453
  f'OCI credential is not correctly set. '
407
454
  f'Check the credential file at {conf_file}\n'
@@ -409,22 +456,42 @@ class OCI(clouds.Cloud):
409
456
  f'{cls._INDENT_PREFIX}Error details: '
410
457
  f'{common_utils.format_exception(e, use_bracket=True)}')
411
458
 
459
+ @classmethod
460
+ def check_disk_tier(
461
+ cls, instance_type: Optional[str],
462
+ disk_tier: Optional[resources_utils.DiskTier]) -> Tuple[bool, str]:
463
+ del instance_type # Unused.
464
+ if disk_tier is None or disk_tier == resources_utils.DiskTier.BEST:
465
+ return True, ''
466
+ if disk_tier == resources_utils.DiskTier.ULTRA:
467
+ return False, ('OCI disk_tier=ultra is not supported now. '
468
+ 'Please use disk_tier={low, medium, high, best} '
469
+ 'instead.')
470
+ return True, ''
471
+
412
472
  def get_credential_file_mounts(self) -> Dict[str, str]:
413
473
  """Returns a dict of credential file paths to mount paths."""
414
- oci_cfg_file = oci_adaptor.get_config_file()
415
- # Pass-in a profile parameter so that multiple profile in oci
416
- # config file is supported (2023/06/09).
417
- oci_cfg = oci_adaptor.get_oci_config(
418
- profile=oci_utils.oci_config.get_profile())
419
- api_key_file = oci_cfg[
420
- 'key_file'] if 'key_file' in oci_cfg else 'BadConf'
421
- sky_cfg_file = oci_utils.oci_config.get_sky_user_config_file()
474
+ try:
475
+ oci_cfg_file = oci_adaptor.get_config_file()
476
+ # Pass-in a profile parameter so that multiple profile in oci
477
+ # config file is supported (2023/06/09).
478
+ oci_cfg = oci_adaptor.get_oci_config(
479
+ profile=oci_utils.oci_config.get_profile())
480
+ api_key_file = oci_cfg[
481
+ 'key_file'] if 'key_file' in oci_cfg else 'BadConf'
482
+ sky_cfg_file = oci_utils.oci_config.get_sky_user_config_file()
483
+ # Must catch ImportError before any oci_adaptor.oci.exceptions
484
+ # because oci_adaptor.oci.exceptions can throw ImportError.
485
+ except ImportError:
486
+ return {}
487
+ except oci_adaptor.oci.exceptions.ConfigFileNotFound:
488
+ return {}
422
489
 
423
490
  # OCI config and API key file are mandatory
424
491
  credential_files = [oci_cfg_file, api_key_file]
425
492
 
426
493
  # Sky config file is optional
427
- if os.path.exists(sky_cfg_file):
494
+ if os.path.exists(os.path.expanduser(sky_cfg_file)):
428
495
  credential_files.append(sky_cfg_file)
429
496
 
430
497
  file_mounts = {
@@ -435,7 +502,7 @@ class OCI(clouds.Cloud):
435
502
  return file_mounts
436
503
 
437
504
  @classmethod
438
- def get_current_user_identity(cls) -> Optional[List[str]]:
505
+ def get_user_identities(cls) -> Optional[List[List[str]]]:
439
506
  # NOTE: used for very advanced SkyPilot functionality
440
507
  # Can implement later if desired
441
508
  # If the user switches the compartment_ocid, the existing clusters
@@ -463,59 +530,45 @@ class OCI(clouds.Cloud):
463
530
  region_name: str,
464
531
  instance_type: str,
465
532
  ) -> str:
466
- if image_id is None:
467
- return self._get_default_image(region_name=region_name,
468
- instance_type=instance_type)
469
- if None in image_id:
470
- image_id_str = image_id[None]
471
- else:
472
- assert region_name in image_id, image_id
473
- image_id_str = image_id[region_name]
533
+ image_id_str = self._get_image_str(image_id=image_id,
534
+ instance_type=instance_type,
535
+ region=region_name)
536
+
474
537
  if image_id_str.startswith('skypilot:'):
475
538
  image_id_str = service_catalog.get_image_id_from_tag(image_id_str,
476
539
  region_name,
477
540
  clouds='oci')
478
- if image_id_str is None:
479
- logger.critical(
480
- '! Real image_id not found! - {region_name}:{image_id}')
481
- # Raise ResourcesUnavailableError to make sure the failover
482
- # in CloudVMRayBackend will be correctly triggered.
483
- # TODO(zhwu): This is a information leakage to the cloud
484
- # implementor, we need to find a better way to handle this.
485
- raise exceptions.ResourcesUnavailableError(
486
- '! ERR: No image found in catalog for region '
487
- f'{region_name}. Try setting a valid image_id.')
541
+
542
+ # Image_id should never be None, except for the case when
543
+ # the user specifies an image tag which does not exist in the image.csv
544
+ # catalog file, which is only possible in the "test" / "evaluation" phase.
545
+ # Therefore, we use assert here.
546
+ assert image_id_str is not None
488
547
 
489
548
  logger.debug(f'Got real image_id {image_id_str}')
490
549
  return image_id_str
491
550
 
492
- def _get_default_image(self, region_name: str, instance_type: str) -> str:
551
+ def _get_image_str(self, image_id: Optional[Dict[Optional[str], str]],
552
+ instance_type: str, region: str):
553
+ if image_id is None:
554
+ image_str = self._get_default_image_tag(instance_type)
555
+ elif None in image_id:
556
+ image_str = image_id[None]
557
+ else:
558
+ assert region in image_id, image_id
559
+ image_str = image_id[region]
560
+ return image_str
561
+
562
+ def _get_default_image_tag(self, instance_type: str) -> str:
493
563
  acc = self.get_accelerators_from_instance_type(instance_type)
494
564
 
495
565
  if acc is None:
496
566
  image_tag = oci_utils.oci_config.get_default_image_tag()
497
- image_id_str = service_catalog.get_image_id_from_tag(image_tag,
498
- region_name,
499
- clouds='oci')
500
567
  else:
501
568
  assert len(acc) == 1, acc
502
569
  image_tag = oci_utils.oci_config.get_default_gpu_image_tag()
503
- image_id_str = service_catalog.get_image_id_from_tag(image_tag,
504
- region_name,
505
- clouds='oci')
506
-
507
- if image_id_str is not None:
508
- logger.debug(
509
- f'Got default image_id {image_id_str} from tag {image_tag}')
510
- return image_id_str
511
570
 
512
- # Raise ResourcesUnavailableError to make sure the failover in
513
- # CloudVMRayBackend will be correctly triggered.
514
- # TODO(zhwu): This is a information leakage to the cloud implementor,
515
- # we need to find a better way to handle this.
516
- raise exceptions.ResourcesUnavailableError(
517
- 'ERR: No image found in catalog for region '
518
- f'{region_name}. Try update your default image_id settings.')
571
+ return image_tag
519
572
 
520
573
  def get_vpu_from_disktier(
521
574
  self, cpus: Optional[float],
@@ -559,25 +612,11 @@ class OCI(clouds.Cloud):
559
612
  region: Optional[str], zone: Optional[str],
560
613
  **kwargs) -> List[status_lib.ClusterStatus]:
561
614
  del zone, kwargs # Unused.
562
- # Check the lifecycleState definition from the page
563
- # https://docs.oracle.com/en-us/iaas/api/#/en/iaas/latest/Instance/
564
- status_map = {
565
- 'PROVISIONING': status_lib.ClusterStatus.INIT,
566
- 'STARTING': status_lib.ClusterStatus.INIT,
567
- 'RUNNING': status_lib.ClusterStatus.UP,
568
- 'STOPPING': status_lib.ClusterStatus.STOPPED,
569
- 'STOPPED': status_lib.ClusterStatus.STOPPED,
570
- 'TERMINATED': None,
571
- 'TERMINATING': None,
572
- }
573
-
574
- # pylint: disable=import-outside-toplevel
575
- from sky.skylet.providers.oci.query_helper import oci_query_helper
576
615
 
577
616
  status_list = []
578
617
  try:
579
- vms = oci_query_helper.query_instances_by_tags(
580
- tag_filters=tag_filters, region=region)
618
+ vms = query_helper.query_instances_by_tags(tag_filters=tag_filters,
619
+ region=region)
581
620
  except Exception as e: # pylint: disable=broad-except
582
621
  with ux_utils.print_exception_no_traceback():
583
622
  raise exceptions.ClusterStatusFetchingError(
@@ -587,9 +626,9 @@ class OCI(clouds.Cloud):
587
626
 
588
627
  for node in vms:
589
628
  vm_status = node.lifecycle_state
590
- if vm_status in status_map:
591
- sky_status = status_map[vm_status]
592
- if sky_status is not None:
593
- status_list.append(sky_status)
629
+ sky_status = oci_utils.oci_config.STATE_MAPPING_OCI_TO_SKY.get(
630
+ vm_status, None)
631
+ if sky_status is not None:
632
+ status_list.append(sky_status)
594
633
 
595
634
  return status_list
sky/clouds/paperspace.py CHANGED
@@ -1,14 +1,14 @@
1
1
  """ Paperspace Cloud. """
2
2
 
3
- import json
4
3
  import typing
5
- from typing import Dict, Iterator, List, Optional, Tuple
4
+ from typing import Dict, Iterator, List, Optional, Tuple, Union
6
5
 
7
6
  import requests
8
7
 
9
8
  from sky import clouds
10
9
  from sky.clouds import service_catalog
11
10
  from sky.provision.paperspace import utils
11
+ from sky.utils import registry
12
12
  from sky.utils import resources_utils
13
13
 
14
14
  if typing.TYPE_CHECKING:
@@ -20,7 +20,7 @@ _CREDENTIAL_FILES = [
20
20
  ]
21
21
 
22
22
 
23
- @clouds.CLOUD_REGISTRY.register
23
+ @registry.CLOUD_REGISTRY.register
24
24
  class Paperspace(clouds.Cloud):
25
25
  """Paperspace GPU Cloud"""
26
26
 
@@ -162,7 +162,7 @@ class Paperspace(clouds.Cloud):
162
162
 
163
163
  @classmethod
164
164
  def get_accelerators_from_instance_type(
165
- cls, instance_type: str) -> Optional[Dict[str, int]]:
165
+ cls, instance_type: str) -> Optional[Dict[str, Union[int, float]]]:
166
166
  return service_catalog.get_accelerators_from_instance_type(
167
167
  instance_type, clouds='paperspace')
168
168
 
@@ -173,18 +173,17 @@ class Paperspace(clouds.Cloud):
173
173
  def make_deploy_resources_variables(
174
174
  self,
175
175
  resources: 'resources_lib.Resources',
176
- cluster_name_on_cloud: str,
176
+ cluster_name: resources_utils.ClusterName,
177
177
  region: 'clouds.Region',
178
178
  zones: Optional[List['clouds.Zone']],
179
+ num_nodes: int,
179
180
  dryrun: bool = False) -> Dict[str, Optional[str]]:
180
- del zones, dryrun
181
+ del zones, dryrun, cluster_name
181
182
 
182
183
  r = resources
183
184
  acc_dict = self.get_accelerators_from_instance_type(r.instance_type)
184
- if acc_dict is not None:
185
- custom_resources = json.dumps(acc_dict, separators=(',', ':'))
186
- else:
187
- custom_resources = None
185
+ custom_resources = resources_utils.make_ray_custom_resources_str(
186
+ acc_dict)
188
187
 
189
188
  return {
190
189
  'instance_type': resources.instance_type,
@@ -196,11 +195,13 @@ class Paperspace(clouds.Cloud):
196
195
  self, resources: 'resources_lib.Resources'):
197
196
  """Returns a list of feasible resources for the given resources."""
198
197
  if resources.use_spot:
199
- return ([], [])
198
+ # TODO: Add hints to all return values in this method to help
199
+ # users understand why the resources are not launchable.
200
+ return resources_utils.FeasibleResources([], [], None)
200
201
  if resources.instance_type is not None:
201
202
  assert resources.is_launchable(), resources
202
203
  resources = resources.copy(accelerators=None)
203
- return ([resources], [])
204
+ return resources_utils.FeasibleResources([resources], [], None)
204
205
 
205
206
  def _make(instance_list):
206
207
  resource_list = []
@@ -223,9 +224,10 @@ class Paperspace(clouds.Cloud):
223
224
  memory=resources.memory,
224
225
  disk_tier=resources.disk_tier)
225
226
  if default_instance_type is None:
226
- return ([], [])
227
+ return resources_utils.FeasibleResources([], [], None)
227
228
  else:
228
- return (_make([default_instance_type]), [])
229
+ return resources_utils.FeasibleResources(
230
+ _make([default_instance_type]), [], None)
229
231
 
230
232
  assert len(accelerators) == 1, resources
231
233
  acc, acc_count = list(accelerators.items())[0]
@@ -241,8 +243,10 @@ class Paperspace(clouds.Cloud):
241
243
  clouds='paperspace',
242
244
  ))
243
245
  if instance_list is None:
244
- return ([], fuzzy_candidate_list)
245
- return (_make(instance_list), fuzzy_candidate_list)
246
+ return resources_utils.FeasibleResources([], fuzzy_candidate_list,
247
+ None)
248
+ return resources_utils.FeasibleResources(_make(instance_list),
249
+ fuzzy_candidate_list, None)
246
250
 
247
251
  @classmethod
248
252
  def check_credentials(cls) -> Tuple[bool, Optional[str]]:
@@ -255,7 +259,7 @@ class Paperspace(clouds.Cloud):
255
259
  return False, (
256
260
  'Failed to access Paperspace Cloud with credentials.\n '
257
261
  'To configure credentials, follow the instructions at: '
258
- 'https://skypilot.readthedocs.io/en/latest/getting-started/installation.html#paperspace\n '
262
+ 'https://docs.skypilot.co/en/latest/getting-started/installation.html#paperspace\n '
259
263
  'Generate API key and create a json at `~/.paperspace/config.json` with \n '
260
264
  ' {"apiKey": "[YOUR API KEY]"}\n '
261
265
  f'Reason: {str(e)}')
@@ -275,7 +279,7 @@ class Paperspace(clouds.Cloud):
275
279
  }
276
280
 
277
281
  @classmethod
278
- def get_current_user_identity(cls) -> Optional[List[str]]:
282
+ def get_user_identities(cls) -> Optional[List[List[str]]]:
279
283
  # NOTE: used for very advanced SkyPilot functionality
280
284
  # Can implement later if desired
281
285
  return None