skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (299) hide show
  1. sky/__init__.py +64 -32
  2. sky/adaptors/aws.py +23 -6
  3. sky/adaptors/azure.py +432 -15
  4. sky/adaptors/cloudflare.py +5 -5
  5. sky/adaptors/common.py +19 -9
  6. sky/adaptors/do.py +20 -0
  7. sky/adaptors/gcp.py +3 -2
  8. sky/adaptors/kubernetes.py +122 -88
  9. sky/adaptors/nebius.py +100 -0
  10. sky/adaptors/oci.py +39 -1
  11. sky/adaptors/vast.py +29 -0
  12. sky/admin_policy.py +101 -0
  13. sky/authentication.py +117 -98
  14. sky/backends/backend.py +52 -20
  15. sky/backends/backend_utils.py +669 -557
  16. sky/backends/cloud_vm_ray_backend.py +1099 -808
  17. sky/backends/local_docker_backend.py +14 -8
  18. sky/backends/wheel_utils.py +38 -20
  19. sky/benchmark/benchmark_utils.py +22 -23
  20. sky/check.py +76 -27
  21. sky/cli.py +1586 -1139
  22. sky/client/__init__.py +1 -0
  23. sky/client/cli.py +5683 -0
  24. sky/client/common.py +345 -0
  25. sky/client/sdk.py +1765 -0
  26. sky/cloud_stores.py +283 -19
  27. sky/clouds/__init__.py +7 -2
  28. sky/clouds/aws.py +303 -112
  29. sky/clouds/azure.py +185 -179
  30. sky/clouds/cloud.py +115 -37
  31. sky/clouds/cudo.py +29 -22
  32. sky/clouds/do.py +313 -0
  33. sky/clouds/fluidstack.py +44 -54
  34. sky/clouds/gcp.py +206 -65
  35. sky/clouds/ibm.py +26 -21
  36. sky/clouds/kubernetes.py +345 -91
  37. sky/clouds/lambda_cloud.py +40 -29
  38. sky/clouds/nebius.py +297 -0
  39. sky/clouds/oci.py +129 -90
  40. sky/clouds/paperspace.py +22 -18
  41. sky/clouds/runpod.py +53 -34
  42. sky/clouds/scp.py +28 -24
  43. sky/clouds/service_catalog/__init__.py +19 -13
  44. sky/clouds/service_catalog/aws_catalog.py +29 -12
  45. sky/clouds/service_catalog/azure_catalog.py +33 -6
  46. sky/clouds/service_catalog/common.py +95 -75
  47. sky/clouds/service_catalog/constants.py +3 -3
  48. sky/clouds/service_catalog/cudo_catalog.py +13 -3
  49. sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
  50. sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
  51. sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
  52. sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
  53. sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
  54. sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
  55. sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
  56. sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
  57. sky/clouds/service_catalog/do_catalog.py +111 -0
  58. sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
  59. sky/clouds/service_catalog/gcp_catalog.py +16 -2
  60. sky/clouds/service_catalog/ibm_catalog.py +2 -2
  61. sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
  62. sky/clouds/service_catalog/lambda_catalog.py +8 -3
  63. sky/clouds/service_catalog/nebius_catalog.py +116 -0
  64. sky/clouds/service_catalog/oci_catalog.py +31 -4
  65. sky/clouds/service_catalog/paperspace_catalog.py +2 -2
  66. sky/clouds/service_catalog/runpod_catalog.py +2 -2
  67. sky/clouds/service_catalog/scp_catalog.py +2 -2
  68. sky/clouds/service_catalog/vast_catalog.py +104 -0
  69. sky/clouds/service_catalog/vsphere_catalog.py +2 -2
  70. sky/clouds/utils/aws_utils.py +65 -0
  71. sky/clouds/utils/azure_utils.py +91 -0
  72. sky/clouds/utils/gcp_utils.py +5 -9
  73. sky/clouds/utils/oci_utils.py +47 -5
  74. sky/clouds/utils/scp_utils.py +4 -3
  75. sky/clouds/vast.py +280 -0
  76. sky/clouds/vsphere.py +22 -18
  77. sky/core.py +361 -107
  78. sky/dag.py +41 -28
  79. sky/data/data_transfer.py +37 -0
  80. sky/data/data_utils.py +211 -32
  81. sky/data/mounting_utils.py +182 -30
  82. sky/data/storage.py +2118 -270
  83. sky/data/storage_utils.py +126 -5
  84. sky/exceptions.py +179 -8
  85. sky/execution.py +158 -85
  86. sky/global_user_state.py +150 -34
  87. sky/jobs/__init__.py +12 -10
  88. sky/jobs/client/__init__.py +0 -0
  89. sky/jobs/client/sdk.py +302 -0
  90. sky/jobs/constants.py +49 -11
  91. sky/jobs/controller.py +161 -99
  92. sky/jobs/dashboard/dashboard.py +171 -25
  93. sky/jobs/dashboard/templates/index.html +572 -60
  94. sky/jobs/recovery_strategy.py +157 -156
  95. sky/jobs/scheduler.py +307 -0
  96. sky/jobs/server/__init__.py +1 -0
  97. sky/jobs/server/core.py +598 -0
  98. sky/jobs/server/dashboard_utils.py +69 -0
  99. sky/jobs/server/server.py +190 -0
  100. sky/jobs/state.py +627 -122
  101. sky/jobs/utils.py +615 -206
  102. sky/models.py +27 -0
  103. sky/optimizer.py +142 -83
  104. sky/provision/__init__.py +20 -5
  105. sky/provision/aws/config.py +124 -42
  106. sky/provision/aws/instance.py +130 -53
  107. sky/provision/azure/__init__.py +7 -0
  108. sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
  109. sky/provision/azure/config.py +220 -0
  110. sky/provision/azure/instance.py +1012 -37
  111. sky/provision/common.py +31 -3
  112. sky/provision/constants.py +25 -0
  113. sky/provision/cudo/__init__.py +2 -1
  114. sky/provision/cudo/cudo_utils.py +112 -0
  115. sky/provision/cudo/cudo_wrapper.py +37 -16
  116. sky/provision/cudo/instance.py +28 -12
  117. sky/provision/do/__init__.py +11 -0
  118. sky/provision/do/config.py +14 -0
  119. sky/provision/do/constants.py +10 -0
  120. sky/provision/do/instance.py +287 -0
  121. sky/provision/do/utils.py +301 -0
  122. sky/provision/docker_utils.py +82 -46
  123. sky/provision/fluidstack/fluidstack_utils.py +57 -125
  124. sky/provision/fluidstack/instance.py +15 -43
  125. sky/provision/gcp/config.py +19 -9
  126. sky/provision/gcp/constants.py +7 -1
  127. sky/provision/gcp/instance.py +55 -34
  128. sky/provision/gcp/instance_utils.py +339 -80
  129. sky/provision/gcp/mig_utils.py +210 -0
  130. sky/provision/instance_setup.py +172 -133
  131. sky/provision/kubernetes/__init__.py +1 -0
  132. sky/provision/kubernetes/config.py +104 -90
  133. sky/provision/kubernetes/constants.py +8 -0
  134. sky/provision/kubernetes/instance.py +680 -325
  135. sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
  136. sky/provision/kubernetes/network.py +54 -20
  137. sky/provision/kubernetes/network_utils.py +70 -21
  138. sky/provision/kubernetes/utils.py +1370 -251
  139. sky/provision/lambda_cloud/__init__.py +11 -0
  140. sky/provision/lambda_cloud/config.py +10 -0
  141. sky/provision/lambda_cloud/instance.py +265 -0
  142. sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
  143. sky/provision/logging.py +1 -1
  144. sky/provision/nebius/__init__.py +11 -0
  145. sky/provision/nebius/config.py +11 -0
  146. sky/provision/nebius/instance.py +285 -0
  147. sky/provision/nebius/utils.py +318 -0
  148. sky/provision/oci/__init__.py +15 -0
  149. sky/provision/oci/config.py +51 -0
  150. sky/provision/oci/instance.py +436 -0
  151. sky/provision/oci/query_utils.py +681 -0
  152. sky/provision/paperspace/constants.py +6 -0
  153. sky/provision/paperspace/instance.py +4 -3
  154. sky/provision/paperspace/utils.py +2 -0
  155. sky/provision/provisioner.py +207 -130
  156. sky/provision/runpod/__init__.py +1 -0
  157. sky/provision/runpod/api/__init__.py +3 -0
  158. sky/provision/runpod/api/commands.py +119 -0
  159. sky/provision/runpod/api/pods.py +142 -0
  160. sky/provision/runpod/instance.py +64 -8
  161. sky/provision/runpod/utils.py +239 -23
  162. sky/provision/vast/__init__.py +10 -0
  163. sky/provision/vast/config.py +11 -0
  164. sky/provision/vast/instance.py +247 -0
  165. sky/provision/vast/utils.py +162 -0
  166. sky/provision/vsphere/common/vim_utils.py +1 -1
  167. sky/provision/vsphere/instance.py +8 -18
  168. sky/provision/vsphere/vsphere_utils.py +1 -1
  169. sky/resources.py +247 -102
  170. sky/serve/__init__.py +9 -9
  171. sky/serve/autoscalers.py +361 -299
  172. sky/serve/client/__init__.py +0 -0
  173. sky/serve/client/sdk.py +366 -0
  174. sky/serve/constants.py +12 -3
  175. sky/serve/controller.py +106 -36
  176. sky/serve/load_balancer.py +63 -12
  177. sky/serve/load_balancing_policies.py +84 -2
  178. sky/serve/replica_managers.py +42 -34
  179. sky/serve/serve_state.py +62 -32
  180. sky/serve/serve_utils.py +271 -160
  181. sky/serve/server/__init__.py +0 -0
  182. sky/serve/{core.py → server/core.py} +271 -90
  183. sky/serve/server/server.py +112 -0
  184. sky/serve/service.py +52 -16
  185. sky/serve/service_spec.py +95 -32
  186. sky/server/__init__.py +1 -0
  187. sky/server/common.py +430 -0
  188. sky/server/constants.py +21 -0
  189. sky/server/html/log.html +174 -0
  190. sky/server/requests/__init__.py +0 -0
  191. sky/server/requests/executor.py +472 -0
  192. sky/server/requests/payloads.py +487 -0
  193. sky/server/requests/queues/__init__.py +0 -0
  194. sky/server/requests/queues/mp_queue.py +76 -0
  195. sky/server/requests/requests.py +567 -0
  196. sky/server/requests/serializers/__init__.py +0 -0
  197. sky/server/requests/serializers/decoders.py +192 -0
  198. sky/server/requests/serializers/encoders.py +166 -0
  199. sky/server/server.py +1106 -0
  200. sky/server/stream_utils.py +141 -0
  201. sky/setup_files/MANIFEST.in +2 -5
  202. sky/setup_files/dependencies.py +159 -0
  203. sky/setup_files/setup.py +14 -125
  204. sky/sky_logging.py +59 -14
  205. sky/skylet/autostop_lib.py +2 -2
  206. sky/skylet/constants.py +183 -50
  207. sky/skylet/events.py +22 -10
  208. sky/skylet/job_lib.py +403 -258
  209. sky/skylet/log_lib.py +111 -71
  210. sky/skylet/log_lib.pyi +6 -0
  211. sky/skylet/providers/command_runner.py +6 -8
  212. sky/skylet/providers/ibm/node_provider.py +2 -2
  213. sky/skylet/providers/scp/config.py +11 -3
  214. sky/skylet/providers/scp/node_provider.py +8 -8
  215. sky/skylet/skylet.py +3 -1
  216. sky/skylet/subprocess_daemon.py +69 -17
  217. sky/skypilot_config.py +119 -57
  218. sky/task.py +205 -64
  219. sky/templates/aws-ray.yml.j2 +37 -7
  220. sky/templates/azure-ray.yml.j2 +27 -82
  221. sky/templates/cudo-ray.yml.j2 +7 -3
  222. sky/templates/do-ray.yml.j2 +98 -0
  223. sky/templates/fluidstack-ray.yml.j2 +7 -4
  224. sky/templates/gcp-ray.yml.j2 +26 -6
  225. sky/templates/ibm-ray.yml.j2 +3 -2
  226. sky/templates/jobs-controller.yaml.j2 +46 -11
  227. sky/templates/kubernetes-ingress.yml.j2 +7 -0
  228. sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
  229. sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
  230. sky/templates/kubernetes-ray.yml.j2 +292 -25
  231. sky/templates/lambda-ray.yml.j2 +30 -40
  232. sky/templates/nebius-ray.yml.j2 +79 -0
  233. sky/templates/oci-ray.yml.j2 +18 -57
  234. sky/templates/paperspace-ray.yml.j2 +10 -6
  235. sky/templates/runpod-ray.yml.j2 +26 -4
  236. sky/templates/scp-ray.yml.j2 +3 -2
  237. sky/templates/sky-serve-controller.yaml.j2 +12 -1
  238. sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
  239. sky/templates/vast-ray.yml.j2 +70 -0
  240. sky/templates/vsphere-ray.yml.j2 +8 -3
  241. sky/templates/websocket_proxy.py +64 -0
  242. sky/usage/constants.py +10 -1
  243. sky/usage/usage_lib.py +130 -37
  244. sky/utils/accelerator_registry.py +35 -51
  245. sky/utils/admin_policy_utils.py +147 -0
  246. sky/utils/annotations.py +51 -0
  247. sky/utils/cli_utils/status_utils.py +81 -23
  248. sky/utils/cluster_utils.py +356 -0
  249. sky/utils/command_runner.py +452 -89
  250. sky/utils/command_runner.pyi +77 -3
  251. sky/utils/common.py +54 -0
  252. sky/utils/common_utils.py +319 -108
  253. sky/utils/config_utils.py +204 -0
  254. sky/utils/control_master_utils.py +48 -0
  255. sky/utils/controller_utils.py +548 -266
  256. sky/utils/dag_utils.py +93 -32
  257. sky/utils/db_utils.py +18 -4
  258. sky/utils/env_options.py +29 -7
  259. sky/utils/kubernetes/create_cluster.sh +8 -60
  260. sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
  261. sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
  262. sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
  263. sky/utils/kubernetes/gpu_labeler.py +4 -4
  264. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
  265. sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
  266. sky/utils/kubernetes/rsync_helper.sh +24 -0
  267. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
  268. sky/utils/log_utils.py +240 -33
  269. sky/utils/message_utils.py +81 -0
  270. sky/utils/registry.py +127 -0
  271. sky/utils/resources_utils.py +94 -22
  272. sky/utils/rich_utils.py +247 -18
  273. sky/utils/schemas.py +284 -64
  274. sky/{status_lib.py → utils/status_lib.py} +12 -7
  275. sky/utils/subprocess_utils.py +212 -46
  276. sky/utils/timeline.py +12 -7
  277. sky/utils/ux_utils.py +168 -15
  278. skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
  279. skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
  280. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
  281. sky/clouds/cloud_registry.py +0 -31
  282. sky/jobs/core.py +0 -330
  283. sky/skylet/providers/azure/__init__.py +0 -2
  284. sky/skylet/providers/azure/azure-vm-template.json +0 -301
  285. sky/skylet/providers/azure/config.py +0 -170
  286. sky/skylet/providers/azure/node_provider.py +0 -466
  287. sky/skylet/providers/lambda_cloud/__init__.py +0 -2
  288. sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
  289. sky/skylet/providers/oci/__init__.py +0 -2
  290. sky/skylet/providers/oci/node_provider.py +0 -488
  291. sky/skylet/providers/oci/query_helper.py +0 -383
  292. sky/skylet/providers/oci/utils.py +0 -21
  293. sky/utils/cluster_yaml_utils.py +0 -24
  294. sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
  295. skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
  296. skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
  297. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
  298. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
  299. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,436 @@
1
+ """OCI instance provisioning.
2
+
3
+ History:
4
+ - Hysun He (hysun.he@oracle.com) @ Oct.16, 2024: Initial implementation
5
+ - Hysun He (hysun.he@oracle.com) @ Nov.13, 2024: Implement open_ports
6
+ and cleanup_ports for supporting SkyServe.
7
+ """
8
+
9
+ import copy
10
+ from datetime import datetime
11
+ import time
12
+ import typing
13
+ from typing import Any, Dict, List, Optional
14
+
15
+ from sky import exceptions
16
+ from sky import sky_logging
17
+ from sky.adaptors import oci as oci_adaptor
18
+ from sky.clouds.utils import oci_utils
19
+ from sky.provision import common
20
+ from sky.provision import constants
21
+ from sky.provision.oci import query_utils
22
+ from sky.provision.oci.query_utils import query_helper
23
+ from sky.utils import common_utils
24
+ from sky.utils import ux_utils
25
+
26
+ if typing.TYPE_CHECKING:
27
+ from sky.utils import status_lib
28
+
29
+ logger = sky_logging.init_logger(__name__)
30
+
31
+
32
@query_utils.debug_enabled(logger)
@common_utils.retry
def query_instances(
    cluster_name_on_cloud: str,
    provider_config: Optional[Dict[str, Any]] = None,
    non_terminated_only: bool = True,
) -> Dict[str, Optional['status_lib.ClusterStatus']]:
    """Look up the status of every instance tagged with the cluster name.

    Args:
        cluster_name_on_cloud: Value matched against the cluster-name tag.
        provider_config: Provider settings; must not be None and must
            contain the 'region' key.
        non_terminated_only: When True, instances whose mapped status is
            None are left out of the result.

    Returns:
        A mapping of instance ID to the status translated through
        STATE_MAPPING_OCI_TO_SKY. A None status means the instance is
        marked as "terminated" or "terminating".
    """
    assert provider_config is not None, cluster_name_on_cloud
    target_region = provider_config['region']

    oci_to_sky = oci_utils.oci_config.STATE_MAPPING_OCI_TO_SKY
    result: Dict[str, Optional['status_lib.ClusterStatus']] = {}
    cluster_filter = {constants.TAG_RAY_CLUSTER_NAME: cluster_name_on_cloud}

    for vm in _get_filtered_nodes(target_region, cluster_filter):
        mapped_status = oci_to_sky[vm['status']]
        # Keep the entry unless the caller asked to hide instances that
        # map to None.
        if mapped_status is not None or not non_terminated_only:
            result[vm['inst_id']] = mapped_status

    return result
62
+
63
+
64
def _resolve_ocpu_count(vcpu_str: Any, is_arm_shape: bool) -> int:
    """Translate the requested vCPU count into an OCPU count.

    Args:
        vcpu_str: The 'VCPUs' value from the node config. ``None`` or the
            string 'None' means that no vCPU count was requested.
        is_arm_shape: True when the shape name carries the ARM ('.A')
            prefix.

    Returns:
        0 when no vCPU count was requested; otherwise the rounded OCPU
        count, with any positive fractional request below one OCPU bumped
        up to 1.
    """
    if vcpu_str is None or vcpu_str == 'None':
        return 0
    vcpus = float(vcpu_str)
    # For ARM cpu, 1*ocpu = 1*vcpu; for Intel / AMD cpu, 1*ocpu = 2*vcpu.
    raw_ocpus = vcpus if is_arm_shape else vcpus / 2
    # Clamp BEFORE rounding: round(0.5) == 0 in Python, so clamping an
    # already-rounded integer can never take effect.
    if 0 < raw_ocpus < 1:
        return 1
    return round(raw_ocpus)


@query_utils.debug_enabled(logger)
def run_instances(region: str, cluster_name_on_cloud: str,
                  config: common.ProvisionConfig) -> common.ProvisionRecord:
    """Start instances with bootstrapped configuration.

    Steps:
      1. If ``config.resume_stopped_nodes`` is set, look up the cluster's
         existing instances, wait for STOPPING ones to reach STOPPED,
         restart all stopped ones, and keep PROVISIONING / STARTING /
         RUNNING ones as-is.
      2. Launch as many new instances as needed to reach ``config.count``.
         The first new instance becomes the head node when no head was
         found among the resumed instances.
      3. Wait until every resumed and created instance is RUNNING.

    Args:
        region: Region passed to every query_helper call.
        cluster_name_on_cloud: Cluster name used as tag value and as the
            prefix of new instance display names.
        config: Provision configuration (tags, count, node_config,
            resume_stopped_nodes).

    Returns:
        A ProvisionRecord listing the head, created and resumed instance
        IDs.

    Raises:
        RuntimeError: If more existing instances are found than requested.
    """
    tags = dict(sorted(copy.deepcopy(config.tags).items()))

    start_time = round(time.time() * 1000)
    filters = {constants.TAG_RAY_CLUSTER_NAME: cluster_name_on_cloud}

    # Starting stopped nodes if resume_stopped_nodes=True
    resume_instances = []
    if config.resume_stopped_nodes:
        logger.debug('Checking existing stopped nodes.')

        existing_instances = _get_filtered_nodes(region, filters)
        if len(existing_instances) > config.count:
            raise RuntimeError(
                'The number of pending/running/stopped/stopping '
                f'instances combined ({len(existing_instances)}) in '
                f'cluster "{cluster_name_on_cloud}" is greater than the '
                f'number requested by the user ({config.count}). '
                'This is likely a resource leak. '
                'Use "sky down" to terminate the cluster.')

        existing_names = [inst['name'] for inst in existing_instances]
        logger.debug(f'run_instances: Found {existing_names} '
                     'existing instances in cluster.')
        existing_instances.sort(key=lambda x: x['name'])

        stopped_instances = []
        for existing_node in existing_instances:
            if existing_node['status'] == 'STOPPING':
                # A stopping node cannot be started yet; wait it out first.
                query_helper.wait_instance_until_status(
                    region, existing_node['inst_id'], 'STOPPED')
                stopped_instances.append(existing_node)
            elif existing_node['status'] == 'STOPPED':
                stopped_instances.append(existing_node)
            elif existing_node['status'] in ('PROVISIONING', 'STARTING',
                                             'RUNNING'):
                resume_instances.append(existing_node)

        for stopped_node in stopped_instances:
            stopped_node_id = stopped_node['inst_id']
            instance_action_response = query_helper.start_instance(
                region, stopped_node_id)

            starting_inst = instance_action_response.data
            resume_instances.append({
                'inst_id': starting_inst.id,
                'name': starting_inst.display_name,
                'ad': starting_inst.availability_domain,
                'compartment': starting_inst.compartment_id,
                'status': starting_inst.lifecycle_state,
                'oci_tags': starting_inst.freeform_tags,
            })
    # end if config.resume_stopped_nodes

    # Try get head id from the existing instances
    head_instance_id = _get_head_instance_id(resume_instances)
    logger.debug(f'Check existing head node: {head_instance_id}')

    # Let's create additional new nodes (if necessary)
    to_start_count = config.count - len(resume_instances)
    created_instances = []
    node_config = config.node_config
    if to_start_count > 0:
        compartment = query_helper.find_compartment(region)
        vcn = query_helper.find_create_vcn_subnet(region)

        vcpu_str = node_config['VCPUs']
        instance_type_str = node_config['InstanceType']
        ocpu_count = _resolve_ocpu_count(
            vcpu_str,
            instance_type_str.startswith(
                f'{oci_utils.oci_config.VM_PREFIX}.A'))

        # Only pass an explicit shape config when an OCPU count was
        # requested; memory is optional on top of that.
        machine_shape_config = None
        if ocpu_count > 0:
            shape_kwargs: Dict[str, Any] = {'ocpus': ocpu_count}
            mem = node_config['MemoryInGbs']
            if mem is not None and mem != 'None':
                shape_kwargs['memory_in_gbs'] = mem
            machine_shape_config = (
                oci_adaptor.oci.core.models.LaunchInstanceShapeConfigDetails(
                    **shape_kwargs))

        preemptible_config = (
            oci_adaptor.oci.core.models.PreemptibleInstanceConfigDetails(
                preemption_action=oci_adaptor.oci.core.models.
                TerminatePreemptionAction(type='TERMINATE',
                                          preserve_boot_volume=False))
            if node_config['Preemptible'] else None)

        # Shared timestamp so all nodes of this launch carry the same
        # batch marker in their display name.
        batch_id = datetime.now().strftime('%Y%m%d%H%M%S')

        vm_tags_head = {
            **tags,
            **constants.HEAD_NODE_TAGS,
            constants.TAG_RAY_CLUSTER_NAME: cluster_name_on_cloud,
            'sky_spot_flag': str(node_config['Preemptible']).lower(),
        }
        vm_tags_worker = {
            **tags,
            **constants.WORKER_NODE_TAGS,
            constants.TAG_RAY_CLUSTER_NAME: cluster_name_on_cloud,
            'sky_spot_flag': str(node_config['Preemptible']).lower(),
        }

        for seq in range(1, to_start_count + 1):
            # The first node launched without a known head becomes the head.
            if head_instance_id is None:
                vm_tags = vm_tags_head
                node_type = constants.HEAD_NODE_TAGS[
                    constants.TAG_RAY_NODE_KIND]
            else:
                vm_tags = vm_tags_worker
                node_type = constants.WORKER_NODE_TAGS[
                    constants.TAG_RAY_NODE_KIND]

            launch_instance_response = query_helper.launch_instance(
                region,
                oci_adaptor.oci.core.models.LaunchInstanceDetails(
                    availability_domain=node_config['AvailabilityDomain'],
                    compartment_id=compartment,
                    shape=instance_type_str,
                    display_name=
                    f'{cluster_name_on_cloud}_{node_type}_{batch_id}_{seq}',
                    freeform_tags=vm_tags,
                    metadata={
                        'ssh_authorized_keys': node_config['AuthorizedKey']
                    },
                    source_details=oci_adaptor.oci.core.models.
                    InstanceSourceViaImageDetails(
                        source_type='image',
                        image_id=node_config['ImageId'],
                        boot_volume_size_in_gbs=node_config['BootVolumeSize'],
                        boot_volume_vpus_per_gb=int(
                            node_config['BootVolumePerf']),
                    ),
                    create_vnic_details=oci_adaptor.oci.core.models.
                    CreateVnicDetails(
                        assign_public_ip=True,
                        subnet_id=vcn,
                    ),
                    shape_config=machine_shape_config,
                    preemptible_instance_config=preemptible_config,
                ))

            new_inst = launch_instance_response.data
            if head_instance_id is None:
                head_instance_id = new_inst.id
                logger.debug(f'New head node: {head_instance_id}')

            created_instances.append({
                'inst_id': new_inst.id,
                'name': new_inst.display_name,
                'ad': new_inst.availability_domain,
                'compartment': new_inst.compartment_id,
                'status': new_inst.lifecycle_state,
                'oci_tags': new_inst.freeform_tags,
            })
        # end for loop
    # end if to_start_count > 0:...

    for inst in (resume_instances + created_instances):
        logger.debug(f'Provisioning for node {inst["name"]}')
        query_helper.wait_instance_until_status(region, inst['inst_id'],
                                                'RUNNING')
        logger.debug(f'Instance {inst["name"]} is RUNNING.')

    total_time = round(time.time() * 1000) - start_time
    logger.debug(f'Total time elapsed: {total_time} milli-seconds.')

    assert head_instance_id is not None, head_instance_id

    # Format: TenancyPrefix:AvailabilityDomain, e.g. bxtG:US-SANJOSE-1-AD-1
    # Take the part after the first ':'; fall back to the whole value when
    # there is no prefix instead of failing after instances were launched.
    ad = str(node_config['AvailabilityDomain']).split(':', maxsplit=1)[-1]
    return common.ProvisionRecord(
        provider_name='oci',
        region=region,
        zone=ad,
        cluster_name=cluster_name_on_cloud,
        head_instance_id=head_instance_id,
        created_instance_ids=[n['inst_id'] for n in created_instances],
        resumed_instance_ids=[n['inst_id'] for n in resume_instances],
    )
259
+
260
+
261
@query_utils.debug_enabled(logger)
def stop_instances(
    cluster_name_on_cloud: str,
    provider_config: Dict[str, Any],
    worker_only: bool = False,
) -> None:
    """Issue a stop request for each instance tagged with the cluster name.

    Args:
        cluster_name_on_cloud: Value matched against the cluster-name tag.
        provider_config: Provider settings; must not be None and must
            contain the 'region' key.
        worker_only: When True, additionally restrict the selection to
            instances whose node-kind tag is 'worker'.
    """
    assert provider_config is not None, (cluster_name_on_cloud,
                                         provider_config)

    target_region = provider_config['region']
    selection = {constants.TAG_RAY_CLUSTER_NAME: cluster_name_on_cloud}
    if worker_only:
        selection.update({constants.TAG_RAY_NODE_KIND: 'worker'})

    for vm in _get_filtered_nodes(target_region, selection):
        query_helper.stop_instance(target_region, vm['inst_id'])
279
+
280
+
281
@query_utils.debug_enabled(logger)
def terminate_instances(
    cluster_name_on_cloud: str,
    provider_config: Dict[str, Any],
    worker_only: bool = False,
) -> None:
    """Terminate the instances selected by the cluster's tags.

    Args:
        cluster_name_on_cloud: Value matched against the cluster-name tag.
        provider_config: Provider settings; the 'region' key is read.
        worker_only: When True, additionally restrict the selection to
            instances whose node-kind tag is 'worker'.
    """
    target_region = provider_config['region']
    selection = {constants.TAG_RAY_CLUSTER_NAME: cluster_name_on_cloud}
    if worker_only:
        selection.update({constants.TAG_RAY_NODE_KIND: 'worker'})
    # The tag-based lookup and the termination are both delegated to the
    # query helper.
    query_helper.terminate_instances_by_tags(selection, target_region)
293
+
294
+
295
@query_utils.debug_enabled(logger)
def open_ports(
    cluster_name_on_cloud: str,
    ports: List[str],
    provider_config: Optional[Dict[str, Any]] = None,
) -> None:
    """Open ports for inbound traffic by creating NSG rules for the cluster.

    Args:
        cluster_name_on_cloud: Cluster name forwarded to the query helper.
        ports: Ports forwarded unchanged to the query helper.
        provider_config: Provider settings; must not be None and must
            contain the 'region' key.
    """
    assert provider_config is not None, cluster_name_on_cloud
    nsg_request = {
        'region': provider_config['region'],
        'cluster_name': cluster_name_on_cloud,
        'ports': ports,
    }
    query_helper.create_nsg_rules(**nsg_request)
307
+
308
+
309
@query_utils.debug_enabled(logger)
def cleanup_ports(
    cluster_name_on_cloud: str,
    ports: List[str],
    provider_config: Optional[Dict[str, Any]] = None,
) -> None:
    """Delete any opened ports by removing the cluster's NSG.

    Args:
        cluster_name_on_cloud: Cluster name forwarded to the query helper.
        ports: Unused; accepted only for signature compatibility.
        provider_config: Provider settings; must not be None and must
            contain the 'region' key.
    """
    del ports  # Not needed: the removal is per cluster, not per port.
    assert provider_config is not None, cluster_name_on_cloud
    target_region = provider_config['region']
    query_helper.remove_cluster_nsg(cluster_name=cluster_name_on_cloud,
                                    region=target_region)
321
+
322
+
323
@query_utils.debug_enabled(logger)
def wait_instances(region: str, cluster_name_on_cloud: str,
                   state: Optional['status_lib.ClusterStatus']) -> None:
    """Do nothing; all arguments are intentionally discarded.

    Waiting for the RUNNING state already happens inside run_instances.
    The wait logic cannot live here because instances that are still
    provisioning are not retrievable by the QL
    'query instance resources ...'.
    """
    del state
    del cluster_name_on_cloud
    del region
330
+
331
+
332
@query_utils.debug_enabled(logger)
def get_cluster_info(
    region: str,
    cluster_name_on_cloud: str,
    provider_config: Optional[Dict[str, Any]] = None,
) -> common.ClusterInfo:
    """Collect per-instance metadata for the cluster.

    Args:
        region: Region used for the instance and VNIC lookups.
        cluster_name_on_cloud: Value matched against the cluster-name tag.
        provider_config: Passed through unchanged into the result.

    Returns:
        A ClusterInfo whose ``instances`` maps each instance ID (in sorted
        order) to a single-element list of InstanceInfo, together with the
        head instance ID found among the same nodes.
    """
    cluster_filter = {constants.TAG_RAY_CLUSTER_NAME: cluster_name_on_cloud}
    nodes = _get_filtered_nodes(region, cluster_filter)

    unsorted_info = {}
    for node in nodes:
        # Enrich the node record with its IP addresses.
        detail = _get_inst_obj_with_ip(region, node)
        info = common.InstanceInfo(
            instance_id=detail['id'],
            internal_ip=detail['internal_ip'],
            external_ip=detail['external_ip'],
            tags=detail['tags'],
        )
        unsorted_info[detail['id']] = [info]

    # Rebuild the mapping with the instance IDs in ascending order.
    instances = {key: unsorted_info[key] for key in sorted(unsorted_info)}
    logger.debug(f'Cluster info: {instances}')

    head_instance_id = _get_head_instance_id(nodes)
    logger.debug(f'Head instance id is {head_instance_id}')

    return common.ClusterInfo(
        provider_name='oci',
        head_instance_id=head_instance_id,
        instances=instances,
        provider_config=provider_config,
    )
366
+
367
+
368
def _get_filtered_nodes(region: str,
                        tag_filters: Dict[str, str]) -> List[Dict[str, Any]]:
    """Query instances by tags and flatten them into plain dicts.

    Args:
        region: Region passed to the tag query.
        tag_filters: Tag key/value pairs passed to the tag query.

    Returns:
        One dict per returned instance with the keys 'inst_id', 'name',
        'ad', 'compartment', 'status' and 'oci_tags'.

    Raises:
        exceptions.ClusterStatusFetchingError: If the query raises an OCI
            ServiceError.
    """
    try:
        insts = query_helper.query_instances_by_tags(tag_filters, region)
    except oci_adaptor.oci.exceptions.ServiceError as e:
        with ux_utils.print_exception_no_traceback():
            # Note the space before 'Details:' so that the two sentences
            # do not run together in the rendered message.
            raise exceptions.ClusterStatusFetchingError(
                f'Failed to query status for OCI cluster {tag_filters}. '
                'Details: '
                f'{common_utils.format_exception(e, use_bracket=True)}'
            ) from e

    return [{
        'inst_id': inst.identifier,
        'name': inst.display_name,
        'ad': inst.availability_domain,
        'compartment': inst.compartment_id,
        'status': inst.lifecycle_state,
        'oci_tags': inst.freeform_tags,
    } for inst in insts]
393
+
394
+
395
def _get_inst_obj_with_ip(region: str, inst_info: Dict[str,
                                                       Any]) -> Dict[str, Any]:
    """Combine a node record with the IPs of its primary VNIC.

    Args:
        region: Region passed to the VNIC lookup.
        inst_info: Node record; the keys 'inst_id', 'name', 'oci_tags' and
            'status' are read.

    Returns:
        A dict with 'id', 'name', 'external_ip', 'internal_ip', 'tags' and
        'status'. When the VNIC has no public IP, 'external_ip' carries the
        private IP instead.
    """
    vnic = query_helper.get_instance_primary_vnic(region, inst_info)
    private_addr = vnic.private_ip
    public_addr = vnic.public_ip

    node_with_ip = {
        'id': inst_info['inst_id'],
        'name': inst_info['name'],
        # Fall back to the private address when there is no public one.
        'external_ip': private_addr if public_addr is None else public_addr,
        'internal_ip': private_addr,
        'tags': inst_info['oci_tags'],
        'status': inst_info['status'],
    }
    return node_with_ip
412
+
413
+
414
def _get_head_instance_id(instances: List[Dict[str, Any]]) -> Optional[str]:
    """Return the ID of the instance carrying all head-node tags.

    Args:
        instances: Node records; the keys 'oci_tags' and 'inst_id' are read.

    Returns:
        The 'inst_id' of the last instance whose 'oci_tags' contain every
        key/value pair of constants.HEAD_NODE_TAGS, or None when no
        instance matches. A warning is logged for each additional match.
    """
    head_instance_id = None
    head_node_tags = tuple(constants.HEAD_NODE_TAGS.items())
    for inst in instances:
        is_head = all(
            tag in inst['oci_tags'].items() for tag in head_node_tags)
        if not is_head:
            continue

        if head_instance_id is not None:
            logger.warning(
                'There are multiple head nodes in the cluster '
                f'(current head instance id: {head_instance_id}, '
                f'newly discovered id: {inst["inst_id"]}). It is likely '
                'that something goes wrong.')
            # Don't break here so that we can continue to check and
            # warn user about duplicate head instance issue so that
            # user can take further action on the abnormal cluster.

        head_instance_id = inst['inst_id']

    return head_instance_id