skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (299) hide show
  1. sky/__init__.py +64 -32
  2. sky/adaptors/aws.py +23 -6
  3. sky/adaptors/azure.py +432 -15
  4. sky/adaptors/cloudflare.py +5 -5
  5. sky/adaptors/common.py +19 -9
  6. sky/adaptors/do.py +20 -0
  7. sky/adaptors/gcp.py +3 -2
  8. sky/adaptors/kubernetes.py +122 -88
  9. sky/adaptors/nebius.py +100 -0
  10. sky/adaptors/oci.py +39 -1
  11. sky/adaptors/vast.py +29 -0
  12. sky/admin_policy.py +101 -0
  13. sky/authentication.py +117 -98
  14. sky/backends/backend.py +52 -20
  15. sky/backends/backend_utils.py +669 -557
  16. sky/backends/cloud_vm_ray_backend.py +1099 -808
  17. sky/backends/local_docker_backend.py +14 -8
  18. sky/backends/wheel_utils.py +38 -20
  19. sky/benchmark/benchmark_utils.py +22 -23
  20. sky/check.py +76 -27
  21. sky/cli.py +1586 -1139
  22. sky/client/__init__.py +1 -0
  23. sky/client/cli.py +5683 -0
  24. sky/client/common.py +345 -0
  25. sky/client/sdk.py +1765 -0
  26. sky/cloud_stores.py +283 -19
  27. sky/clouds/__init__.py +7 -2
  28. sky/clouds/aws.py +303 -112
  29. sky/clouds/azure.py +185 -179
  30. sky/clouds/cloud.py +115 -37
  31. sky/clouds/cudo.py +29 -22
  32. sky/clouds/do.py +313 -0
  33. sky/clouds/fluidstack.py +44 -54
  34. sky/clouds/gcp.py +206 -65
  35. sky/clouds/ibm.py +26 -21
  36. sky/clouds/kubernetes.py +345 -91
  37. sky/clouds/lambda_cloud.py +40 -29
  38. sky/clouds/nebius.py +297 -0
  39. sky/clouds/oci.py +129 -90
  40. sky/clouds/paperspace.py +22 -18
  41. sky/clouds/runpod.py +53 -34
  42. sky/clouds/scp.py +28 -24
  43. sky/clouds/service_catalog/__init__.py +19 -13
  44. sky/clouds/service_catalog/aws_catalog.py +29 -12
  45. sky/clouds/service_catalog/azure_catalog.py +33 -6
  46. sky/clouds/service_catalog/common.py +95 -75
  47. sky/clouds/service_catalog/constants.py +3 -3
  48. sky/clouds/service_catalog/cudo_catalog.py +13 -3
  49. sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
  50. sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
  51. sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
  52. sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
  53. sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
  54. sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
  55. sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
  56. sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
  57. sky/clouds/service_catalog/do_catalog.py +111 -0
  58. sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
  59. sky/clouds/service_catalog/gcp_catalog.py +16 -2
  60. sky/clouds/service_catalog/ibm_catalog.py +2 -2
  61. sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
  62. sky/clouds/service_catalog/lambda_catalog.py +8 -3
  63. sky/clouds/service_catalog/nebius_catalog.py +116 -0
  64. sky/clouds/service_catalog/oci_catalog.py +31 -4
  65. sky/clouds/service_catalog/paperspace_catalog.py +2 -2
  66. sky/clouds/service_catalog/runpod_catalog.py +2 -2
  67. sky/clouds/service_catalog/scp_catalog.py +2 -2
  68. sky/clouds/service_catalog/vast_catalog.py +104 -0
  69. sky/clouds/service_catalog/vsphere_catalog.py +2 -2
  70. sky/clouds/utils/aws_utils.py +65 -0
  71. sky/clouds/utils/azure_utils.py +91 -0
  72. sky/clouds/utils/gcp_utils.py +5 -9
  73. sky/clouds/utils/oci_utils.py +47 -5
  74. sky/clouds/utils/scp_utils.py +4 -3
  75. sky/clouds/vast.py +280 -0
  76. sky/clouds/vsphere.py +22 -18
  77. sky/core.py +361 -107
  78. sky/dag.py +41 -28
  79. sky/data/data_transfer.py +37 -0
  80. sky/data/data_utils.py +211 -32
  81. sky/data/mounting_utils.py +182 -30
  82. sky/data/storage.py +2118 -270
  83. sky/data/storage_utils.py +126 -5
  84. sky/exceptions.py +179 -8
  85. sky/execution.py +158 -85
  86. sky/global_user_state.py +150 -34
  87. sky/jobs/__init__.py +12 -10
  88. sky/jobs/client/__init__.py +0 -0
  89. sky/jobs/client/sdk.py +302 -0
  90. sky/jobs/constants.py +49 -11
  91. sky/jobs/controller.py +161 -99
  92. sky/jobs/dashboard/dashboard.py +171 -25
  93. sky/jobs/dashboard/templates/index.html +572 -60
  94. sky/jobs/recovery_strategy.py +157 -156
  95. sky/jobs/scheduler.py +307 -0
  96. sky/jobs/server/__init__.py +1 -0
  97. sky/jobs/server/core.py +598 -0
  98. sky/jobs/server/dashboard_utils.py +69 -0
  99. sky/jobs/server/server.py +190 -0
  100. sky/jobs/state.py +627 -122
  101. sky/jobs/utils.py +615 -206
  102. sky/models.py +27 -0
  103. sky/optimizer.py +142 -83
  104. sky/provision/__init__.py +20 -5
  105. sky/provision/aws/config.py +124 -42
  106. sky/provision/aws/instance.py +130 -53
  107. sky/provision/azure/__init__.py +7 -0
  108. sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
  109. sky/provision/azure/config.py +220 -0
  110. sky/provision/azure/instance.py +1012 -37
  111. sky/provision/common.py +31 -3
  112. sky/provision/constants.py +25 -0
  113. sky/provision/cudo/__init__.py +2 -1
  114. sky/provision/cudo/cudo_utils.py +112 -0
  115. sky/provision/cudo/cudo_wrapper.py +37 -16
  116. sky/provision/cudo/instance.py +28 -12
  117. sky/provision/do/__init__.py +11 -0
  118. sky/provision/do/config.py +14 -0
  119. sky/provision/do/constants.py +10 -0
  120. sky/provision/do/instance.py +287 -0
  121. sky/provision/do/utils.py +301 -0
  122. sky/provision/docker_utils.py +82 -46
  123. sky/provision/fluidstack/fluidstack_utils.py +57 -125
  124. sky/provision/fluidstack/instance.py +15 -43
  125. sky/provision/gcp/config.py +19 -9
  126. sky/provision/gcp/constants.py +7 -1
  127. sky/provision/gcp/instance.py +55 -34
  128. sky/provision/gcp/instance_utils.py +339 -80
  129. sky/provision/gcp/mig_utils.py +210 -0
  130. sky/provision/instance_setup.py +172 -133
  131. sky/provision/kubernetes/__init__.py +1 -0
  132. sky/provision/kubernetes/config.py +104 -90
  133. sky/provision/kubernetes/constants.py +8 -0
  134. sky/provision/kubernetes/instance.py +680 -325
  135. sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
  136. sky/provision/kubernetes/network.py +54 -20
  137. sky/provision/kubernetes/network_utils.py +70 -21
  138. sky/provision/kubernetes/utils.py +1370 -251
  139. sky/provision/lambda_cloud/__init__.py +11 -0
  140. sky/provision/lambda_cloud/config.py +10 -0
  141. sky/provision/lambda_cloud/instance.py +265 -0
  142. sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
  143. sky/provision/logging.py +1 -1
  144. sky/provision/nebius/__init__.py +11 -0
  145. sky/provision/nebius/config.py +11 -0
  146. sky/provision/nebius/instance.py +285 -0
  147. sky/provision/nebius/utils.py +318 -0
  148. sky/provision/oci/__init__.py +15 -0
  149. sky/provision/oci/config.py +51 -0
  150. sky/provision/oci/instance.py +436 -0
  151. sky/provision/oci/query_utils.py +681 -0
  152. sky/provision/paperspace/constants.py +6 -0
  153. sky/provision/paperspace/instance.py +4 -3
  154. sky/provision/paperspace/utils.py +2 -0
  155. sky/provision/provisioner.py +207 -130
  156. sky/provision/runpod/__init__.py +1 -0
  157. sky/provision/runpod/api/__init__.py +3 -0
  158. sky/provision/runpod/api/commands.py +119 -0
  159. sky/provision/runpod/api/pods.py +142 -0
  160. sky/provision/runpod/instance.py +64 -8
  161. sky/provision/runpod/utils.py +239 -23
  162. sky/provision/vast/__init__.py +10 -0
  163. sky/provision/vast/config.py +11 -0
  164. sky/provision/vast/instance.py +247 -0
  165. sky/provision/vast/utils.py +162 -0
  166. sky/provision/vsphere/common/vim_utils.py +1 -1
  167. sky/provision/vsphere/instance.py +8 -18
  168. sky/provision/vsphere/vsphere_utils.py +1 -1
  169. sky/resources.py +247 -102
  170. sky/serve/__init__.py +9 -9
  171. sky/serve/autoscalers.py +361 -299
  172. sky/serve/client/__init__.py +0 -0
  173. sky/serve/client/sdk.py +366 -0
  174. sky/serve/constants.py +12 -3
  175. sky/serve/controller.py +106 -36
  176. sky/serve/load_balancer.py +63 -12
  177. sky/serve/load_balancing_policies.py +84 -2
  178. sky/serve/replica_managers.py +42 -34
  179. sky/serve/serve_state.py +62 -32
  180. sky/serve/serve_utils.py +271 -160
  181. sky/serve/server/__init__.py +0 -0
  182. sky/serve/{core.py → server/core.py} +271 -90
  183. sky/serve/server/server.py +112 -0
  184. sky/serve/service.py +52 -16
  185. sky/serve/service_spec.py +95 -32
  186. sky/server/__init__.py +1 -0
  187. sky/server/common.py +430 -0
  188. sky/server/constants.py +21 -0
  189. sky/server/html/log.html +174 -0
  190. sky/server/requests/__init__.py +0 -0
  191. sky/server/requests/executor.py +472 -0
  192. sky/server/requests/payloads.py +487 -0
  193. sky/server/requests/queues/__init__.py +0 -0
  194. sky/server/requests/queues/mp_queue.py +76 -0
  195. sky/server/requests/requests.py +567 -0
  196. sky/server/requests/serializers/__init__.py +0 -0
  197. sky/server/requests/serializers/decoders.py +192 -0
  198. sky/server/requests/serializers/encoders.py +166 -0
  199. sky/server/server.py +1106 -0
  200. sky/server/stream_utils.py +141 -0
  201. sky/setup_files/MANIFEST.in +2 -5
  202. sky/setup_files/dependencies.py +159 -0
  203. sky/setup_files/setup.py +14 -125
  204. sky/sky_logging.py +59 -14
  205. sky/skylet/autostop_lib.py +2 -2
  206. sky/skylet/constants.py +183 -50
  207. sky/skylet/events.py +22 -10
  208. sky/skylet/job_lib.py +403 -258
  209. sky/skylet/log_lib.py +111 -71
  210. sky/skylet/log_lib.pyi +6 -0
  211. sky/skylet/providers/command_runner.py +6 -8
  212. sky/skylet/providers/ibm/node_provider.py +2 -2
  213. sky/skylet/providers/scp/config.py +11 -3
  214. sky/skylet/providers/scp/node_provider.py +8 -8
  215. sky/skylet/skylet.py +3 -1
  216. sky/skylet/subprocess_daemon.py +69 -17
  217. sky/skypilot_config.py +119 -57
  218. sky/task.py +205 -64
  219. sky/templates/aws-ray.yml.j2 +37 -7
  220. sky/templates/azure-ray.yml.j2 +27 -82
  221. sky/templates/cudo-ray.yml.j2 +7 -3
  222. sky/templates/do-ray.yml.j2 +98 -0
  223. sky/templates/fluidstack-ray.yml.j2 +7 -4
  224. sky/templates/gcp-ray.yml.j2 +26 -6
  225. sky/templates/ibm-ray.yml.j2 +3 -2
  226. sky/templates/jobs-controller.yaml.j2 +46 -11
  227. sky/templates/kubernetes-ingress.yml.j2 +7 -0
  228. sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
  229. sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
  230. sky/templates/kubernetes-ray.yml.j2 +292 -25
  231. sky/templates/lambda-ray.yml.j2 +30 -40
  232. sky/templates/nebius-ray.yml.j2 +79 -0
  233. sky/templates/oci-ray.yml.j2 +18 -57
  234. sky/templates/paperspace-ray.yml.j2 +10 -6
  235. sky/templates/runpod-ray.yml.j2 +26 -4
  236. sky/templates/scp-ray.yml.j2 +3 -2
  237. sky/templates/sky-serve-controller.yaml.j2 +12 -1
  238. sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
  239. sky/templates/vast-ray.yml.j2 +70 -0
  240. sky/templates/vsphere-ray.yml.j2 +8 -3
  241. sky/templates/websocket_proxy.py +64 -0
  242. sky/usage/constants.py +10 -1
  243. sky/usage/usage_lib.py +130 -37
  244. sky/utils/accelerator_registry.py +35 -51
  245. sky/utils/admin_policy_utils.py +147 -0
  246. sky/utils/annotations.py +51 -0
  247. sky/utils/cli_utils/status_utils.py +81 -23
  248. sky/utils/cluster_utils.py +356 -0
  249. sky/utils/command_runner.py +452 -89
  250. sky/utils/command_runner.pyi +77 -3
  251. sky/utils/common.py +54 -0
  252. sky/utils/common_utils.py +319 -108
  253. sky/utils/config_utils.py +204 -0
  254. sky/utils/control_master_utils.py +48 -0
  255. sky/utils/controller_utils.py +548 -266
  256. sky/utils/dag_utils.py +93 -32
  257. sky/utils/db_utils.py +18 -4
  258. sky/utils/env_options.py +29 -7
  259. sky/utils/kubernetes/create_cluster.sh +8 -60
  260. sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
  261. sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
  262. sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
  263. sky/utils/kubernetes/gpu_labeler.py +4 -4
  264. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
  265. sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
  266. sky/utils/kubernetes/rsync_helper.sh +24 -0
  267. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
  268. sky/utils/log_utils.py +240 -33
  269. sky/utils/message_utils.py +81 -0
  270. sky/utils/registry.py +127 -0
  271. sky/utils/resources_utils.py +94 -22
  272. sky/utils/rich_utils.py +247 -18
  273. sky/utils/schemas.py +284 -64
  274. sky/{status_lib.py → utils/status_lib.py} +12 -7
  275. sky/utils/subprocess_utils.py +212 -46
  276. sky/utils/timeline.py +12 -7
  277. sky/utils/ux_utils.py +168 -15
  278. skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
  279. skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
  280. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
  281. sky/clouds/cloud_registry.py +0 -31
  282. sky/jobs/core.py +0 -330
  283. sky/skylet/providers/azure/__init__.py +0 -2
  284. sky/skylet/providers/azure/azure-vm-template.json +0 -301
  285. sky/skylet/providers/azure/config.py +0 -170
  286. sky/skylet/providers/azure/node_provider.py +0 -466
  287. sky/skylet/providers/lambda_cloud/__init__.py +0 -2
  288. sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
  289. sky/skylet/providers/oci/__init__.py +0 -2
  290. sky/skylet/providers/oci/node_provider.py +0 -488
  291. sky/skylet/providers/oci/query_helper.py +0 -383
  292. sky/skylet/providers/oci/utils.py +0 -21
  293. sky/utils/cluster_yaml_utils.py +0 -24
  294. sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
  295. skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
  296. skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
  297. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
  298. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
  299. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,116 @@
1
+ """Nebius Catalog.
2
+
3
+ This module loads the service catalog file and can be used to query
4
+ instance types and pricing information for Nebius.
5
+ """
6
+ import typing
7
+ from typing import Dict, List, Optional, Tuple, Union
8
+
9
+ from sky.clouds.service_catalog import common
10
+ from sky.utils import resources_utils
11
+ from sky.utils import ux_utils
12
+
13
+ if typing.TYPE_CHECKING:
14
+ from sky.clouds import cloud
15
+
16
+ # Keep it synced with the frequency in
17
+ # skypilot-catalog/.github/workflows/update-Nebius-catalog.yml
18
+ _PULL_FREQUENCY_HOURS = 7
19
+
20
+ _df = common.read_catalog('nebius/vms.csv')
21
+
22
+
23
def instance_type_exists(instance_type: str) -> bool:
    """Check `instance_type` against the Nebius VM catalog table.

    Delegates to the shared catalog helper, passing the module-level
    DataFrame loaded from 'nebius/vms.csv'.
    """
    exists = common.instance_type_exists_impl(_df, instance_type)
    return exists
25
+
26
+
27
def validate_region_zone(
        region: Optional[str],
        zone: Optional[str]) -> Tuple[Optional[str], Optional[str]]:
    """Validate a (region, zone) pair for Nebius.

    Args:
        region: Requested region, or None.
        zone: Requested zone; must be None.

    Returns:
        Whatever the shared `validate_region_zone_impl` helper returns for
        the Nebius catalog.

    Raises:
        ValueError: If a zone is given.
    """
    zone_requested = zone is not None
    if zone_requested:
        # Raised inside the context manager so the error is shown to the
        # user without a traceback.
        with ux_utils.print_exception_no_traceback():
            raise ValueError('Nebius does not support zones.')
    validated = common.validate_region_zone_impl('nebius', _df, region, zone)
    return validated
34
+
35
+
36
def get_hourly_cost(instance_type: str,
                    use_spot: bool = False,
                    region: Optional[str] = None,
                    zone: Optional[str] = None) -> float:
    """Return the hourly cost of `instance_type` from the Nebius catalog.

    Args:
        instance_type: Catalog instance type name.
        use_spot: Must be False; spot pricing is rejected by an assertion.
        region: Optional region passed through to the shared helper.
        zone: Must be None.

    Raises:
        AssertionError: If `use_spot` is True.
        ValueError: If a zone is given.
    """
    assert not use_spot, 'Nebius does not support spot.'
    zone_requested = zone is not None
    if zone_requested:
        with ux_utils.print_exception_no_traceback():
            raise ValueError('Nebius does not support zones.')
    cost = common.get_hourly_cost_impl(_df, instance_type, use_spot, region,
                                       zone)
    return cost
47
+
48
+
49
def get_vcpus_mem_from_instance_type(
        instance_type: str) -> Tuple[Optional[float], Optional[float]]:
    """Look up the (vCPUs, memory) pair of `instance_type` for Nebius.

    Either element may be None, per the return annotation; the lookup itself
    is done by the shared catalog helper.
    """
    vcpus_and_mem = common.get_vcpus_mem_from_instance_type_impl(
        _df, instance_type)
    return vcpus_and_mem
52
+
53
+
54
def get_default_instance_type(
        cpus: Optional[str] = None,
        memory: Optional[str] = None,
        disk_tier: Optional[resources_utils.DiskTier] = None) -> Optional[str]:
    """Pick a Nebius instance type for the given CPU/memory request.

    Args:
        cpus: CPU requirement string, or None.
        memory: Memory requirement string, or None.
        disk_tier: Accepted for interface parity; ignored.

    Returns:
        The instance type chosen by the shared helper, or None.
    """
    # The disk tier plays no role in the selection below.
    del disk_tier  # unused
    chosen = common.get_instance_type_for_cpus_mem_impl(_df, cpus, memory)
    return chosen
60
+
61
+
62
def get_accelerators_from_instance_type(
        instance_type: str) -> Optional[Dict[str, Union[int, float]]]:
    """Return the accelerators attached to a Nebius `instance_type`.

    The result maps accelerator name to count (int or float), or is None,
    as produced by the shared catalog helper.
    """
    accelerators = common.get_accelerators_from_instance_type_impl(
        _df, instance_type)
    return accelerators
65
+
66
+
67
def get_instance_type_for_accelerator(
        acc_name: str,
        acc_count: int,
        cpus: Optional[str] = None,
        memory: Optional[str] = None,
        use_spot: bool = False,
        region: Optional[str] = None,
        zone: Optional[str] = None) -> Tuple[Optional[List[str]], List[str]]:
    """Find Nebius instance types matching an accelerator request.

    Returns a list of instance types satisfying the required count of
    accelerators with sorted prices and a list of candidates with fuzzy search.

    Raises:
        ValueError: If a zone is given.
    """
    zone_requested = zone is not None
    if zone_requested:
        with ux_utils.print_exception_no_traceback():
            raise ValueError('Nebius does not support zones.')
    # Collect the query once, then forward it as keyword arguments.
    query = {
        'df': _df,
        'acc_name': acc_name,
        'acc_count': acc_count,
        'cpus': cpus,
        'memory': memory,
        'use_spot': use_spot,
        'region': region,
        'zone': zone,
    }
    return common.get_instance_type_for_accelerator_impl(**query)
91
+
92
+
93
def regions() -> List['cloud.Region']:
    """Return the regions found in the Nebius catalog (non-spot lookup)."""
    all_regions = common.get_region_zones(_df, use_spot=False)
    return all_regions
95
+
96
+
97
def get_region_zones_for_instance_type(instance_type: str,
                                       use_spot: bool) -> List['cloud.Region']:
    """Return the regions whose catalog rows offer `instance_type`.

    The Nebius table is first narrowed to rows whose 'InstanceType' column
    equals `instance_type`; the shared helper then builds the region list.
    """
    matches_type = _df['InstanceType'] == instance_type
    rows_for_type = _df[matches_type]
    return common.get_region_zones(rows_for_type, use_spot)
101
+
102
+
103
def list_accelerators(
        gpus_only: bool,
        name_filter: Optional[str],
        region_filter: Optional[str],
        quantity_filter: Optional[int],
        case_sensitive: bool = True,
        all_regions: bool = False,
        require_price: bool = True) -> Dict[str, List[common.InstanceTypeInfo]]:
    """Returns all instance types in Nebius offering GPUs.

    All filters are forwarded unchanged to the shared catalog helper;
    `require_price` is accepted for interface parity and ignored.
    """
    del require_price  # Unused.
    listing = common.list_accelerators_impl('nebius', _df, gpus_only,
                                            name_filter, region_filter,
                                            quantity_filter, case_sensitive,
                                            all_regions)
    return listing
@@ -7,14 +7,17 @@ History:
7
7
  - Hysun He (hysun.he@oracle.com) @ Apr, 2023: Initial implementation
8
8
  - Hysun He (hysun.he@oracle.com) @ Jun, 2023: Reduce retry times by
9
9
  excluding those unsubscribed regions.
10
+ - Hysun He (hysun.he@oracle.com) @ Oct 14, 2024: Bug fix for validation
11
+ of the Marketplace images
10
12
  """
11
13
 
12
14
  import logging
13
15
  import threading
14
16
  import typing
15
- from typing import Dict, List, Optional, Tuple
17
+ from typing import Dict, List, Optional, Tuple, Union
16
18
 
17
19
  from sky.adaptors import oci as oci_adaptor
20
+ from sky.clouds import OCI
18
21
  from sky.clouds.service_catalog import common
19
22
  from sky.clouds.utils import oci_utils
20
23
  from sky.utils import resources_utils
@@ -63,7 +66,7 @@ def _get_df() -> 'pd.DataFrame':
63
66
  logger.debug(f'It is OK goes here when testing: {str(e)}')
64
67
  subscribed_regions = []
65
68
 
66
- except oci_adaptor.service_exception() as e:
69
+ except oci_adaptor.oci.exceptions.ServiceError as e:
67
70
  # Should never expect going here. However, we still catch
68
71
  # it so that if any OCI call failed, the program can still
69
72
  # proceed with try-and-error way.
@@ -102,7 +105,6 @@ def get_default_instance_type(
102
105
  cpus: Optional[str] = None,
103
106
  memory: Optional[str] = None,
104
107
  disk_tier: Optional[resources_utils.DiskTier] = None) -> Optional[str]:
105
- del disk_tier # unused
106
108
  if cpus is None:
107
109
  cpus = f'{oci_utils.oci_config.DEFAULT_NUM_VCPUS}+'
108
110
 
@@ -111,12 +113,17 @@ def get_default_instance_type(
111
113
  else:
112
114
  memory_gb_or_ratio = memory
113
115
 
116
+ def _filter_disk_type(instance_type: str) -> bool:
117
+ valid, _ = OCI.check_disk_tier(instance_type, disk_tier)
118
+ return valid
119
+
114
120
  instance_type_prefix = tuple(
115
121
  f'{family}' for family in oci_utils.oci_config.DEFAULT_INSTANCE_FAMILY)
116
122
 
117
123
  df = _get_df()
118
124
  df = df[df['InstanceType'].notna()]
119
125
  df = df[df['InstanceType'].str.startswith(instance_type_prefix)]
126
+ df = df.loc[df['InstanceType'].apply(_filter_disk_type)]
120
127
 
121
128
  logger.debug(f'# get_default_instance_type: {df}')
122
129
  return common.get_instance_type_for_cpus_mem_impl(df, cpus,
@@ -124,7 +131,7 @@ def get_default_instance_type(
124
131
 
125
132
 
126
133
  def get_accelerators_from_instance_type(
127
- instance_type: str) -> Optional[Dict[str, int]]:
134
+ instance_type: str) -> Optional[Dict[str, Union[int, float]]]:
128
135
  return common.get_accelerators_from_instance_type_impl(
129
136
  _get_df(), instance_type)
130
137
 
@@ -201,4 +208,24 @@ def get_image_id_from_tag(tag: str, region: Optional[str]) -> Optional[str]:
201
208
 
202
209
  def is_image_tag_valid(tag: str, region: Optional[str]) -> bool:
203
210
  """Returns whether the image tag is valid."""
211
+ # Oct.14, 2024 by Hysun He: Marketplace images are region neutral, so don't
212
+ # check with region for the Marketplace images.
213
+ df = _image_df[_image_df['Tag'].str.fullmatch(tag)]
214
+ if df.empty:
215
+ return False
216
+ app_catalog_listing_id = df['AppCatalogListingId'].iloc[0]
217
+ if app_catalog_listing_id:
218
+ return True
204
219
  return common.is_image_tag_valid_impl(_image_df, tag, region)
220
+
221
+
222
def get_image_os_from_tag(tag: str, region: Optional[str]) -> Optional[str]:
    """Return the OS recorded for an image tag, or the configured default.

    Args:
        tag: Image tag to look up in the image catalog's 'Tag' column. It is
            matched literally, not as a regular expression.
        region: Ignored.

    Returns:
        The 'OS' value of the first catalog row carrying the tag, or the
        default image OS from the OCI config when no row matches.
    """
    # Local import: only this lookup needs `re`.
    import re

    del region
    # Escape the tag so regex metacharacters in it are compared literally
    # (an unescaped tag could mis-match or raise re.error); `na=False` keeps
    # the mask boolean if a 'Tag' cell is missing.
    tag_mask = _image_df['Tag'].str.fullmatch(re.escape(tag), na=False)
    df = _image_df[tag_mask]
    if df.empty:
        os_type = oci_utils.oci_config.get_default_image_os()
    else:
        os_type = df['OS'].iloc[0]

    logger.debug(f'Operation system for the image {tag} is {os_type}')
    return os_type
@@ -5,7 +5,7 @@ query instance types and pricing information for Paperspace.
5
5
  """
6
6
 
7
7
  import typing
8
- from typing import Dict, List, Optional, Tuple
8
+ from typing import Dict, List, Optional, Tuple, Union
9
9
 
10
10
  from sky.clouds.service_catalog import common
11
11
  from sky.utils import ux_utils
@@ -60,7 +60,7 @@ def get_default_instance_type(
60
60
 
61
61
 
62
62
  def get_accelerators_from_instance_type(
63
- instance_type: str) -> Optional[Dict[str, int]]:
63
+ instance_type: str) -> Optional[Dict[str, Union[int, float]]]:
64
64
  return common.get_accelerators_from_instance_type_impl(_df, instance_type)
65
65
 
66
66
 
@@ -5,7 +5,7 @@ query instance types and pricing information for RunPod.
5
5
  """
6
6
 
7
7
  import typing
8
- from typing import Dict, List, Optional, Tuple
8
+ from typing import Dict, List, Optional, Tuple, Union
9
9
 
10
10
  from sky.clouds.service_catalog import common
11
11
  from sky.utils import ux_utils
@@ -56,7 +56,7 @@ def get_default_instance_type(cpus: Optional[str] = None,
56
56
 
57
57
 
58
58
  def get_accelerators_from_instance_type(
59
- instance_type: str) -> Optional[Dict[str, int]]:
59
+ instance_type: str) -> Optional[Dict[str, Union[int, float]]]:
60
60
  return common.get_accelerators_from_instance_type_impl(_df, instance_type)
61
61
 
62
62
 
@@ -5,7 +5,7 @@ instance types and pricing information for SCP.
5
5
  """
6
6
 
7
7
  import typing
8
- from typing import Dict, List, Optional, Tuple
8
+ from typing import Dict, List, Optional, Tuple, Union
9
9
 
10
10
  from sky.clouds.service_catalog import common
11
11
  from sky.utils import resources_utils
@@ -67,7 +67,7 @@ def get_default_instance_type(
67
67
 
68
68
 
69
69
  def get_accelerators_from_instance_type(
70
- instance_type: str) -> Optional[Dict[str, int]]:
70
+ instance_type: str) -> Optional[Dict[str, Union[int, float]]]:
71
71
  return common.get_accelerators_from_instance_type_impl(_df, instance_type)
72
72
 
73
73
 
@@ -0,0 +1,104 @@
1
+ """ Vast | Catalog
2
+
3
+ This module loads the service catalog file and can be used to
4
+ query instance types and pricing information for Vast.ai.
5
+ """
6
+
7
+ import typing
8
+ from typing import Dict, List, Optional, Tuple, Union
9
+
10
+ from sky.clouds.service_catalog import common
11
+ from sky.utils import ux_utils
12
+
13
+ if typing.TYPE_CHECKING:
14
+ from sky.clouds import cloud
15
+
16
+ _df = common.read_catalog('vast/vms.csv')
17
+
18
+
19
def instance_type_exists(instance_type: str) -> bool:
    """Check `instance_type` against the Vast VM catalog table.

    Delegates to the shared catalog helper, passing the module-level
    DataFrame loaded from 'vast/vms.csv'.
    """
    exists = common.instance_type_exists_impl(_df, instance_type)
    return exists
21
+
22
+
23
def validate_region_zone(
        region: Optional[str],
        zone: Optional[str]) -> Tuple[Optional[str], Optional[str]]:
    """Validate a (region, zone) pair for Vast.

    Args:
        region: Requested region, or None.
        zone: Requested zone; must be None.

    Returns:
        Whatever the shared `validate_region_zone_impl` helper returns for
        the Vast catalog.

    Raises:
        ValueError: If a zone is given.
    """
    zone_requested = zone is not None
    if zone_requested:
        # Raised inside the context manager so the error is shown to the
        # user without a traceback.
        with ux_utils.print_exception_no_traceback():
            raise ValueError('Vast does not support zones.')
    validated = common.validate_region_zone_impl('vast', _df, region, zone)
    return validated
30
+
31
+
32
def get_hourly_cost(instance_type: str,
                    use_spot: bool = False,
                    region: Optional[str] = None,
                    zone: Optional[str] = None) -> float:
    """Return the hourly cost of `instance_type` from the Vast catalog.

    Args:
        instance_type: Catalog instance type name.
        use_spot: Forwarded to the shared pricing helper.
        region: Optional region passed through to the shared helper.
        zone: Must be None.

    Raises:
        ValueError: If a zone is given.
    """
    zone_requested = zone is not None
    if zone_requested:
        with ux_utils.print_exception_no_traceback():
            raise ValueError('Vast does not support zones.')
    cost = common.get_hourly_cost_impl(_df, instance_type, use_spot, region,
                                       zone)
    return cost
42
+
43
+
44
def get_vcpus_mem_from_instance_type(
        instance_type: str) -> Tuple[Optional[float], Optional[float]]:
    """Look up the (vCPUs, memory) pair of `instance_type` for Vast.

    Either element may be None, per the return annotation; the lookup itself
    is done by the shared catalog helper.
    """
    vcpus_and_mem = common.get_vcpus_mem_from_instance_type_impl(
        _df, instance_type)
    return vcpus_and_mem
47
+
48
+
49
def get_default_instance_type(cpus: Optional[str] = None,
                              memory: Optional[str] = None,
                              disk_tier: Optional[str] = None) -> Optional[str]:
    """Pick a Vast instance type for the given CPU/memory request.

    Args:
        cpus: CPU requirement string, or None.
        memory: Memory requirement string, or None.
        disk_tier: Accepted for interface parity; ignored.

    Returns:
        The instance type chosen by the shared helper, or None.
    """
    del disk_tier
    # NOTE: After expanding catalog to multiple entries, you may
    # want to specify a default instance type or family.
    chosen = common.get_instance_type_for_cpus_mem_impl(_df, cpus, memory)
    return chosen
56
+
57
+
58
def get_accelerators_from_instance_type(
        instance_type: str) -> Optional[Dict[str, Union[int, float]]]:
    """Return the accelerators attached to a Vast `instance_type`.

    The result maps accelerator name to count (int or float), or is None,
    as produced by the shared catalog helper.
    """
    accelerators = common.get_accelerators_from_instance_type_impl(
        _df, instance_type)
    return accelerators
61
+
62
+
63
def get_instance_type_for_accelerator(
        acc_name: str,
        acc_count: int,
        cpus: Optional[str] = None,
        memory: Optional[str] = None,
        use_spot: bool = False,
        region: Optional[str] = None,
        zone: Optional[str] = None) -> Tuple[Optional[List[str]], List[str]]:
    """Returns a list of instance types that have the given accelerator.

    The second element of the returned tuple is a list of strings produced
    by the shared helper alongside the matches.

    Raises:
        ValueError: If a zone is given.
    """
    zone_requested = zone is not None
    if zone_requested:
        with ux_utils.print_exception_no_traceback():
            raise ValueError('Vast does not support zones.')
    # Collect the query once, then forward it as keyword arguments.
    query = {
        'df': _df,
        'acc_name': acc_name,
        'acc_count': acc_count,
        'cpus': cpus,
        'memory': memory,
        'use_spot': use_spot,
        'region': region,
        'zone': zone,
    }
    return common.get_instance_type_for_accelerator_impl(**query)
83
+
84
+
85
def get_region_zones_for_instance_type(instance_type: str,
                                       use_spot: bool) -> List['cloud.Region']:
    """Return the regions whose catalog rows offer `instance_type`.

    The Vast table is first narrowed to rows whose 'InstanceType' column
    equals `instance_type`; the shared helper then builds the region list.
    """
    matches_type = _df['InstanceType'] == instance_type
    rows_for_type = _df[matches_type]
    return common.get_region_zones(rows_for_type, use_spot)
89
+
90
+
91
+ # TODO: this differs from the fluffy catalog version
92
def list_accelerators(
        gpus_only: bool,
        name_filter: Optional[str],
        region_filter: Optional[str],
        quantity_filter: Optional[int],
        case_sensitive: bool = True,
        all_regions: bool = False,
        require_price: bool = True) -> Dict[str, List[common.InstanceTypeInfo]]:
    """Returns all instance types in Vast offering GPUs.

    All filters are forwarded unchanged to the shared catalog helper;
    `require_price` is accepted for interface parity and ignored.
    """
    del require_price  # Unused.
    listing = common.list_accelerators_impl('Vast', _df, gpus_only,
                                            name_filter, region_filter,
                                            quantity_filter, case_sensitive,
                                            all_regions)
    return listing
@@ -2,7 +2,7 @@
2
2
  import io
3
3
  import os
4
4
  import typing
5
- from typing import Dict, List, Optional, Tuple
5
+ from typing import Dict, List, Optional, Tuple, Union
6
6
 
7
7
  from sky.adaptors import common as adaptors_common
8
8
  from sky.clouds.service_catalog import common
@@ -85,7 +85,7 @@ def get_default_instance_type(
85
85
 
86
86
 
87
87
  def get_accelerators_from_instance_type(
88
- instance_type: str) -> Optional[Dict[str, int]]:
88
+ instance_type: str) -> Optional[Dict[str, Union[int, float]]]:
89
89
  return common.get_accelerators_from_instance_type_impl(
90
90
  _get_df(), instance_type)
91
91
 
@@ -0,0 +1,65 @@
1
+ """Utilities for AWS."""
2
+ import dataclasses
3
+ import enum
4
+ import time
5
+ from typing import List
6
+
7
+ import cachetools
8
+
9
+ from sky import skypilot_config
10
+ from sky.adaptors import aws
11
+
12
+
13
class ReservationType(str, enum.Enum):
    """Kinds of AWS capacity reservation distinguished by this module.

    Members are also `str` instances, so they compare equal to their raw
    string values.
    """
    # Used when a reservation record carries no 'ReservationType' field.
    DEFAULT = 'default'
    BLOCK = 'capacity-block'
16
+
17
+
18
@dataclasses.dataclass
class AWSReservation:
    """One AWS capacity reservation, as built from the EC2 API response."""
    # Filled from 'CapacityReservationId'.
    name: str
    # Filled from 'InstanceType'.
    instance_type: str
    # Filled from 'AvailabilityZone'.
    zone: str
    # Filled from 'AvailableInstanceCount'.
    available_resources: int
    # Whether the reservation is targeted, i.e. can only be consumed when
    # the reservation name is specified.
    targeted: bool
    # Filled from 'ReservationType', falling back to the default kind.
    type: ReservationType
28
+
29
+
30
def use_reservations() -> bool:
    """Return whether AWS capacity reservations should be considered.

    True when the `aws.prioritize_reservations` config value is truthy or
    the `aws.specific_reservations` config value is non-empty.
    """
    prioritize_reservations = skypilot_config.get_nested(
        ('aws', 'prioritize_reservations'), False)
    specific_reservations = skypilot_config.get_nested(
        ('aws', 'specific_reservations'), set())
    # `a or b` yields one of the operands, which may be a set/list taken from
    # the config; coerce so the result honors the declared `bool` return type.
    return bool(prioritize_reservations or specific_reservations)
36
+
37
+
38
@cachetools.cached(cache=cachetools.TTLCache(maxsize=100,
                                             ttl=300,
                                             timer=time.time))
def list_reservations_for_instance_type(
    instance_type: str,
    region: str,
) -> List[AWSReservation]:
    """List active capacity reservations for an instance type in a region.

    Results are cached per (instance_type, region) for 300 seconds,
    including the empty result returned when reservations are not enabled.

    Args:
        instance_type: EC2 instance type to filter on.
        region: AWS region whose EC2 endpoint is queried.

    Returns:
        One `AWSReservation` per active reservation returned by EC2, or an
        empty list when `use_reservations()` is False.
    """
    if not use_reservations():
        return []
    ec2 = aws.client('ec2', region_name=region)
    request_kwargs = {
        'Filters': [{
            'Name': 'instance-type',
            'Values': [instance_type]
        }, {
            'Name': 'state',
            'Values': ['active']
        }]
    }
    reservations = []
    # Follow 'NextToken' so reservations beyond the first response page are
    # not silently dropped.
    while True:
        response = ec2.describe_capacity_reservations(**request_kwargs)
        reservations.extend(response['CapacityReservations'])
        next_token = response.get('NextToken')
        if not next_token:
            break
        request_kwargs['NextToken'] = next_token
    return [
        AWSReservation(name=r['CapacityReservationId'],
                       instance_type=r['InstanceType'],
                       zone=r['AvailabilityZone'],
                       available_resources=r['AvailableInstanceCount'],
                       targeted=r['InstanceMatchCriteria'] == 'targeted',
                       # Older responses may omit 'ReservationType'.
                       type=ReservationType(r.get('ReservationType',
                                                  'default')))
        for r in reservations
    ]
@@ -0,0 +1,91 @@
1
+ """Utilities for Azure"""
2
+
3
+ import typing
4
+
5
+ from sky import exceptions
6
+ from sky.adaptors import azure
7
+ from sky.utils import ux_utils
8
+
9
+ if typing.TYPE_CHECKING:
10
+ from azure.mgmt import compute as azure_compute
11
+ from azure.mgmt.compute import models as azure_compute_models
12
+
13
+
14
def validate_image_id(image_id: str):
    """Validate the textual shape of an Azure image ID.

    Two shapes are accepted:
      * Marketplace: four ':'-separated fields,
        <publisher>:<offer>:<sku>:<version>
      * Community: five '/'-separated fields,
        /CommunityGalleries/<gallery-name>/Images/<image-name>

    Args:
        image_id: The image ID string to check.

    Raises:
        ValueError: If the image ID matches neither shape, or has the
            community shape but the wrong fixed path segments.
    """
    slash_parts = image_id.split('/')
    looks_like_community = len(slash_parts) == 5
    looks_like_marketplace = len(image_id.split(':')) == 4

    if not (looks_like_community or looks_like_marketplace):
        with ux_utils.print_exception_no_traceback():
            raise ValueError(
                f'Invalid image id for Azure: {image_id}. Expected format: \n'
                '* Marketplace image ID: <publisher>:<offer>:<sku>:<version>\n'
                '* Community image ID: '
                '/CommunityGalleries/<gallery-name>/Images/<image-name>')

    if not looks_like_community:
        # Marketplace-shaped ID: nothing further is checked.
        return

    # For a community-shaped ID, the 2nd and 4th path segments are fixed.
    fixed_segments = (slash_parts[1], slash_parts[3])
    if fixed_segments != ('CommunityGalleries', 'Images'):
        with ux_utils.print_exception_no_traceback():
            raise ValueError(
                f'Invalid community image id for Azure: {image_id}.\n'
                'Expected format: '
                '/CommunityGalleries/<gallery-name>/Images/<image-name>')
37
+
38
+
39
def get_community_image(
        compute_client: 'azure_compute.ComputeManagementClient', image_id: str,
        region: str) -> 'azure_compute_models.CommunityGalleryImage':
    """Look up an Azure community gallery image in a region.

    Args:
        compute_client: Client whose `community_gallery_images.get` is used
            for the lookup.
        image_id: /CommunityGalleries/<gallery-name>/Images/<image-name>
        region: Passed as the `location` of the lookup.

    Returns:
        The object returned by `community_gallery_images.get`.

    Raises:
        ResourcesUnavailableError: If the lookup raises an Azure error.
        ValueError: If image_id does not split into exactly five
            '/'-separated parts.
    """
    try:
        # The 3rd and 5th '/'-separated fields carry the gallery and image
        # names; the remaining fields are ignored here.
        _, _, public_gallery, _, gallery_image = image_id.split('/')
        gallery_images = compute_client.community_gallery_images
        return gallery_images.get(location=region,
                                  public_gallery_name=public_gallery,
                                  gallery_image_name=gallery_image)
    except azure.exceptions().AzureError as err:
        message = (
            f'Community image {image_id} does not exist in region {region}.')
        raise exceptions.ResourcesUnavailableError(message) from err
59
+
60
+
61
def get_community_image_size(
        compute_client: 'azure_compute.ComputeManagementClient',
        gallery_name: str, image_name: str, region: str) -> float:
    """Get the OS disk size of an Azure community image.

    The image versions are listed and the last entry of that listing is
    used (presumably the most recent version; ordering is whatever the
    service returns).

    Args:
        compute_client: Client whose `community_gallery_image_versions`
            operations are used.
        gallery_name: Public gallery name of the community image.
        image_name: Image name within the gallery.
        region: Passed as the `location` of both requests.

    Returns:
        The `disk_size_gb` of the chosen version's OS disk image.

    Raises:
        ResourcesUnavailableError: If the image has no versions, or if an
            Azure error occurs while querying.
    """
    try:
        versions_api = compute_client.community_gallery_image_versions
        available_versions = list(
            versions_api.list(
                location=region,
                public_gallery_name=gallery_name,
                gallery_image_name=image_name,
            ))
        if not available_versions:
            raise exceptions.ResourcesUnavailableError(
                f'No versions available for Azure community image {image_name}')
        # Take the final entry of the listing as the version to inspect.
        chosen_version = available_versions[-1].name

        version_details = versions_api.get(
            location=region,
            public_gallery_name=gallery_name,
            gallery_image_name=image_name,
            gallery_image_version_name=chosen_version)
        os_disk = version_details.storage_profile.os_disk_image
        return os_disk.disk_size_gb
    except azure.exceptions().AzureError as err:
        raise exceptions.ResourcesUnavailableError(
            f'Failed to get community image size: {err}.') from err
@@ -17,6 +17,7 @@ import cachetools
17
17
  from sky import sky_logging
18
18
  from sky import skypilot_config
19
19
  from sky.provision.gcp import constants
20
+ from sky.provision.kubernetes import utils as kubernetes_utils
20
21
  from sky.utils import subprocess_utils
21
22
 
22
23
  if typing.TYPE_CHECKING:
@@ -35,7 +36,10 @@ def is_tpu(resources: Optional['resources_lib.Resources']) -> bool:
35
36
  def is_tpu_vm(resources: Optional['resources_lib.Resources']) -> bool:
36
37
  if not is_tpu(resources):
37
38
  return False
38
- assert resources is not None
39
+ assert (resources is not None and len(resources.accelerators) == 1)
40
+ acc, _ = list(resources.accelerators.items())[0]
41
+ if kubernetes_utils.is_tpu_on_gke(acc):
42
+ return False
39
43
  if resources.accelerator_args is None:
40
44
  return True
41
45
  return resources.accelerator_args.get('tpu_vm', True)
@@ -49,14 +53,6 @@ def is_tpu_vm_pod(resources: Optional['resources_lib.Resources']) -> bool:
49
53
  return not acc.endswith('-8')
50
54
 
51
55
 
52
- def get_num_tpu_devices(resources: Optional['resources_lib.Resources']) -> int:
53
- if resources is None or not is_tpu(resources):
54
- raise ValueError('resources must be a valid TPU resource.')
55
- acc, _ = list(resources.accelerators.items())[0]
56
- num_tpu_devices = int(int(acc.split('-')[2]) / 8)
57
- return num_tpu_devices
58
-
59
-
60
56
  @dataclasses.dataclass
61
57
  class SpecificReservation:
62
58
  count: int
@@ -1,15 +1,26 @@
1
1
  """OCI Configuration.
2
2
  History:
3
- - Zhanghao Wu @ Oct 2023: Formatting and refactoring
4
3
  - Hysun He (hysun.he@oracle.com) @ Apr, 2023: Initial implementation
4
+ - Zhanghao Wu @ Oct 2023: Formatting and refactoring
5
+ - Hysun He (hysun.he@oracle.com) @ Oct, 2024: Add default image OS
6
+ configuration.
7
+ - Hysun He (hysun.he@oracle.com) @ Nov.12, 2024: Add the constant
8
+ SERVICE_PORT_RULE_TAG
9
+ - Hysun He (hysun.he@oracle.com) @ Jan.01, 2025: Set the default image
10
+ from ubuntu 20.04 to ubuntu 22.04, including:
11
+ - GPU: skypilot:gpu-ubuntu-2004 -> skypilot:gpu-ubuntu-2204
12
+ - CPU: skypilot:cpu-ubuntu-2004 -> skypilot:cpu-ubuntu-2204
13
+ - Hysun He (hysun.he@oracle.com) @ Jan.01, 2025: Support reuse existing
14
+ VCN for SkyServe.
5
15
  """
6
- import logging
7
16
  import os
8
17
 
18
+ from sky import sky_logging
9
19
  from sky import skypilot_config
10
20
  from sky.utils import resources_utils
21
+ from sky.utils import status_lib
11
22
 
12
- logger = logging.getLogger(__name__)
23
+ logger = sky_logging.init_logger(__name__)
13
24
 
14
25
 
15
26
  class OCIConfig:
@@ -39,6 +50,9 @@ class OCIConfig:
39
50
  VCN_CIDR_INTERNET = '0.0.0.0/0'
40
51
  VCN_CIDR = '192.168.0.0/16'
41
52
  VCN_SUBNET_CIDR = '192.168.0.0/18'
53
+ SERVICE_PORT_RULE_TAG = 'SkyServe-Service-Port'
54
+ # NSG name template
55
+ NSG_NAME_TEMPLATE = 'nsg_{cluster_name}'
42
56
 
43
57
  MAX_RETRY_COUNT = 3
44
58
  RETRY_INTERVAL_BASE_SECONDS = 5
@@ -75,6 +89,19 @@ class OCIConfig:
75
89
  resources_utils.DiskTier.HIGH: DISK_TIER_HIGH,
76
90
  }
77
91
 
92
+ # Oracle instance's lifecycle state to sky state mapping.
93
+ # For Oracle VM instance's lifecycle state, please refer to the link:
94
+ # https://docs.oracle.com/en-us/iaas/api/#/en/iaas/latest/Instance/
95
+ STATE_MAPPING_OCI_TO_SKY = {
96
+ 'PROVISIONING': status_lib.ClusterStatus.INIT,
97
+ 'STARTING': status_lib.ClusterStatus.INIT,
98
+ 'RUNNING': status_lib.ClusterStatus.UP,
99
+ 'STOPPING': status_lib.ClusterStatus.STOPPED,
100
+ 'STOPPED': status_lib.ClusterStatus.STOPPED,
101
+ 'TERMINATED': None,
102
+ 'TERMINATING': None,
103
+ }
104
+
78
105
  @classmethod
79
106
  def get_compartment(cls, region):
80
107
  # Allow task(cluster)-specific compartment/VCN parameters.
@@ -84,8 +111,15 @@ class OCIConfig:
84
111
  ('oci', region, 'compartment_ocid'), default_compartment_ocid)
85
112
  return compartment
86
113
 
114
+ @classmethod
115
+ def get_vcn_ocid(cls, region):
116
+ # Will reuse the regional VCN if specified.
117
+ vcn = skypilot_config.get_nested(('oci', region, 'vcn_ocid'), None)
118
+ return vcn
119
+
87
120
  @classmethod
88
121
  def get_vcn_subnet(cls, region):
122
+ # Will reuse the subnet if specified.
89
123
  vcn = skypilot_config.get_nested(('oci', region, 'vcn_subnet'), None)
90
124
  return vcn
91
125
 
@@ -96,7 +130,7 @@ class OCIConfig:
96
130
  # the sky's user-config file (if not specified, use the hardcode one at
97
131
  # last)
98
132
  return skypilot_config.get_nested(('oci', 'default', 'image_tag_gpu'),
99
- 'skypilot:gpu-ubuntu-2004')
133
+ 'skypilot:gpu-ubuntu-2204')
100
134
 
101
135
  @classmethod
102
136
  def get_default_image_tag(cls) -> str:
@@ -104,7 +138,7 @@ class OCIConfig:
104
138
  # set the default image tag in the sky's user-config file. (if not
105
139
  # specified, use the hardcode one at last)
106
140
  return skypilot_config.get_nested(
107
- ('oci', 'default', 'image_tag_general'), 'skypilot:cpu-ubuntu-2004')
141
+ ('oci', 'default', 'image_tag_general'), 'skypilot:cpu-ubuntu-2204')
108
142
 
109
143
  @classmethod
110
144
  def get_sky_user_config_file(cls) -> str:
@@ -121,5 +155,13 @@ class OCIConfig:
121
155
  return skypilot_config.get_nested(
122
156
  ('oci', 'default', 'oci_config_profile'), 'DEFAULT')
123
157
 
158
+ @classmethod
159
+ def get_default_image_os(cls) -> str:
160
+ # Get the default image OS. Instead of hardcoding, we give a choice to
161
+ # set the default image OS type in the sky's user-config file. (if not
162
+ # specified, use the hardcode one at last)
163
+ return skypilot_config.get_nested(('oci', 'default', 'image_os_type'),
164
+ 'ubuntu')
165
+
124
166
 
125
167
  oci_config = OCIConfig()