skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (299) hide show
  1. sky/__init__.py +64 -32
  2. sky/adaptors/aws.py +23 -6
  3. sky/adaptors/azure.py +432 -15
  4. sky/adaptors/cloudflare.py +5 -5
  5. sky/adaptors/common.py +19 -9
  6. sky/adaptors/do.py +20 -0
  7. sky/adaptors/gcp.py +3 -2
  8. sky/adaptors/kubernetes.py +122 -88
  9. sky/adaptors/nebius.py +100 -0
  10. sky/adaptors/oci.py +39 -1
  11. sky/adaptors/vast.py +29 -0
  12. sky/admin_policy.py +101 -0
  13. sky/authentication.py +117 -98
  14. sky/backends/backend.py +52 -20
  15. sky/backends/backend_utils.py +669 -557
  16. sky/backends/cloud_vm_ray_backend.py +1099 -808
  17. sky/backends/local_docker_backend.py +14 -8
  18. sky/backends/wheel_utils.py +38 -20
  19. sky/benchmark/benchmark_utils.py +22 -23
  20. sky/check.py +76 -27
  21. sky/cli.py +1586 -1139
  22. sky/client/__init__.py +1 -0
  23. sky/client/cli.py +5683 -0
  24. sky/client/common.py +345 -0
  25. sky/client/sdk.py +1765 -0
  26. sky/cloud_stores.py +283 -19
  27. sky/clouds/__init__.py +7 -2
  28. sky/clouds/aws.py +303 -112
  29. sky/clouds/azure.py +185 -179
  30. sky/clouds/cloud.py +115 -37
  31. sky/clouds/cudo.py +29 -22
  32. sky/clouds/do.py +313 -0
  33. sky/clouds/fluidstack.py +44 -54
  34. sky/clouds/gcp.py +206 -65
  35. sky/clouds/ibm.py +26 -21
  36. sky/clouds/kubernetes.py +345 -91
  37. sky/clouds/lambda_cloud.py +40 -29
  38. sky/clouds/nebius.py +297 -0
  39. sky/clouds/oci.py +129 -90
  40. sky/clouds/paperspace.py +22 -18
  41. sky/clouds/runpod.py +53 -34
  42. sky/clouds/scp.py +28 -24
  43. sky/clouds/service_catalog/__init__.py +19 -13
  44. sky/clouds/service_catalog/aws_catalog.py +29 -12
  45. sky/clouds/service_catalog/azure_catalog.py +33 -6
  46. sky/clouds/service_catalog/common.py +95 -75
  47. sky/clouds/service_catalog/constants.py +3 -3
  48. sky/clouds/service_catalog/cudo_catalog.py +13 -3
  49. sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
  50. sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
  51. sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
  52. sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
  53. sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
  54. sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
  55. sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
  56. sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
  57. sky/clouds/service_catalog/do_catalog.py +111 -0
  58. sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
  59. sky/clouds/service_catalog/gcp_catalog.py +16 -2
  60. sky/clouds/service_catalog/ibm_catalog.py +2 -2
  61. sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
  62. sky/clouds/service_catalog/lambda_catalog.py +8 -3
  63. sky/clouds/service_catalog/nebius_catalog.py +116 -0
  64. sky/clouds/service_catalog/oci_catalog.py +31 -4
  65. sky/clouds/service_catalog/paperspace_catalog.py +2 -2
  66. sky/clouds/service_catalog/runpod_catalog.py +2 -2
  67. sky/clouds/service_catalog/scp_catalog.py +2 -2
  68. sky/clouds/service_catalog/vast_catalog.py +104 -0
  69. sky/clouds/service_catalog/vsphere_catalog.py +2 -2
  70. sky/clouds/utils/aws_utils.py +65 -0
  71. sky/clouds/utils/azure_utils.py +91 -0
  72. sky/clouds/utils/gcp_utils.py +5 -9
  73. sky/clouds/utils/oci_utils.py +47 -5
  74. sky/clouds/utils/scp_utils.py +4 -3
  75. sky/clouds/vast.py +280 -0
  76. sky/clouds/vsphere.py +22 -18
  77. sky/core.py +361 -107
  78. sky/dag.py +41 -28
  79. sky/data/data_transfer.py +37 -0
  80. sky/data/data_utils.py +211 -32
  81. sky/data/mounting_utils.py +182 -30
  82. sky/data/storage.py +2118 -270
  83. sky/data/storage_utils.py +126 -5
  84. sky/exceptions.py +179 -8
  85. sky/execution.py +158 -85
  86. sky/global_user_state.py +150 -34
  87. sky/jobs/__init__.py +12 -10
  88. sky/jobs/client/__init__.py +0 -0
  89. sky/jobs/client/sdk.py +302 -0
  90. sky/jobs/constants.py +49 -11
  91. sky/jobs/controller.py +161 -99
  92. sky/jobs/dashboard/dashboard.py +171 -25
  93. sky/jobs/dashboard/templates/index.html +572 -60
  94. sky/jobs/recovery_strategy.py +157 -156
  95. sky/jobs/scheduler.py +307 -0
  96. sky/jobs/server/__init__.py +1 -0
  97. sky/jobs/server/core.py +598 -0
  98. sky/jobs/server/dashboard_utils.py +69 -0
  99. sky/jobs/server/server.py +190 -0
  100. sky/jobs/state.py +627 -122
  101. sky/jobs/utils.py +615 -206
  102. sky/models.py +27 -0
  103. sky/optimizer.py +142 -83
  104. sky/provision/__init__.py +20 -5
  105. sky/provision/aws/config.py +124 -42
  106. sky/provision/aws/instance.py +130 -53
  107. sky/provision/azure/__init__.py +7 -0
  108. sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
  109. sky/provision/azure/config.py +220 -0
  110. sky/provision/azure/instance.py +1012 -37
  111. sky/provision/common.py +31 -3
  112. sky/provision/constants.py +25 -0
  113. sky/provision/cudo/__init__.py +2 -1
  114. sky/provision/cudo/cudo_utils.py +112 -0
  115. sky/provision/cudo/cudo_wrapper.py +37 -16
  116. sky/provision/cudo/instance.py +28 -12
  117. sky/provision/do/__init__.py +11 -0
  118. sky/provision/do/config.py +14 -0
  119. sky/provision/do/constants.py +10 -0
  120. sky/provision/do/instance.py +287 -0
  121. sky/provision/do/utils.py +301 -0
  122. sky/provision/docker_utils.py +82 -46
  123. sky/provision/fluidstack/fluidstack_utils.py +57 -125
  124. sky/provision/fluidstack/instance.py +15 -43
  125. sky/provision/gcp/config.py +19 -9
  126. sky/provision/gcp/constants.py +7 -1
  127. sky/provision/gcp/instance.py +55 -34
  128. sky/provision/gcp/instance_utils.py +339 -80
  129. sky/provision/gcp/mig_utils.py +210 -0
  130. sky/provision/instance_setup.py +172 -133
  131. sky/provision/kubernetes/__init__.py +1 -0
  132. sky/provision/kubernetes/config.py +104 -90
  133. sky/provision/kubernetes/constants.py +8 -0
  134. sky/provision/kubernetes/instance.py +680 -325
  135. sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
  136. sky/provision/kubernetes/network.py +54 -20
  137. sky/provision/kubernetes/network_utils.py +70 -21
  138. sky/provision/kubernetes/utils.py +1370 -251
  139. sky/provision/lambda_cloud/__init__.py +11 -0
  140. sky/provision/lambda_cloud/config.py +10 -0
  141. sky/provision/lambda_cloud/instance.py +265 -0
  142. sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
  143. sky/provision/logging.py +1 -1
  144. sky/provision/nebius/__init__.py +11 -0
  145. sky/provision/nebius/config.py +11 -0
  146. sky/provision/nebius/instance.py +285 -0
  147. sky/provision/nebius/utils.py +318 -0
  148. sky/provision/oci/__init__.py +15 -0
  149. sky/provision/oci/config.py +51 -0
  150. sky/provision/oci/instance.py +436 -0
  151. sky/provision/oci/query_utils.py +681 -0
  152. sky/provision/paperspace/constants.py +6 -0
  153. sky/provision/paperspace/instance.py +4 -3
  154. sky/provision/paperspace/utils.py +2 -0
  155. sky/provision/provisioner.py +207 -130
  156. sky/provision/runpod/__init__.py +1 -0
  157. sky/provision/runpod/api/__init__.py +3 -0
  158. sky/provision/runpod/api/commands.py +119 -0
  159. sky/provision/runpod/api/pods.py +142 -0
  160. sky/provision/runpod/instance.py +64 -8
  161. sky/provision/runpod/utils.py +239 -23
  162. sky/provision/vast/__init__.py +10 -0
  163. sky/provision/vast/config.py +11 -0
  164. sky/provision/vast/instance.py +247 -0
  165. sky/provision/vast/utils.py +162 -0
  166. sky/provision/vsphere/common/vim_utils.py +1 -1
  167. sky/provision/vsphere/instance.py +8 -18
  168. sky/provision/vsphere/vsphere_utils.py +1 -1
  169. sky/resources.py +247 -102
  170. sky/serve/__init__.py +9 -9
  171. sky/serve/autoscalers.py +361 -299
  172. sky/serve/client/__init__.py +0 -0
  173. sky/serve/client/sdk.py +366 -0
  174. sky/serve/constants.py +12 -3
  175. sky/serve/controller.py +106 -36
  176. sky/serve/load_balancer.py +63 -12
  177. sky/serve/load_balancing_policies.py +84 -2
  178. sky/serve/replica_managers.py +42 -34
  179. sky/serve/serve_state.py +62 -32
  180. sky/serve/serve_utils.py +271 -160
  181. sky/serve/server/__init__.py +0 -0
  182. sky/serve/{core.py → server/core.py} +271 -90
  183. sky/serve/server/server.py +112 -0
  184. sky/serve/service.py +52 -16
  185. sky/serve/service_spec.py +95 -32
  186. sky/server/__init__.py +1 -0
  187. sky/server/common.py +430 -0
  188. sky/server/constants.py +21 -0
  189. sky/server/html/log.html +174 -0
  190. sky/server/requests/__init__.py +0 -0
  191. sky/server/requests/executor.py +472 -0
  192. sky/server/requests/payloads.py +487 -0
  193. sky/server/requests/queues/__init__.py +0 -0
  194. sky/server/requests/queues/mp_queue.py +76 -0
  195. sky/server/requests/requests.py +567 -0
  196. sky/server/requests/serializers/__init__.py +0 -0
  197. sky/server/requests/serializers/decoders.py +192 -0
  198. sky/server/requests/serializers/encoders.py +166 -0
  199. sky/server/server.py +1106 -0
  200. sky/server/stream_utils.py +141 -0
  201. sky/setup_files/MANIFEST.in +2 -5
  202. sky/setup_files/dependencies.py +159 -0
  203. sky/setup_files/setup.py +14 -125
  204. sky/sky_logging.py +59 -14
  205. sky/skylet/autostop_lib.py +2 -2
  206. sky/skylet/constants.py +183 -50
  207. sky/skylet/events.py +22 -10
  208. sky/skylet/job_lib.py +403 -258
  209. sky/skylet/log_lib.py +111 -71
  210. sky/skylet/log_lib.pyi +6 -0
  211. sky/skylet/providers/command_runner.py +6 -8
  212. sky/skylet/providers/ibm/node_provider.py +2 -2
  213. sky/skylet/providers/scp/config.py +11 -3
  214. sky/skylet/providers/scp/node_provider.py +8 -8
  215. sky/skylet/skylet.py +3 -1
  216. sky/skylet/subprocess_daemon.py +69 -17
  217. sky/skypilot_config.py +119 -57
  218. sky/task.py +205 -64
  219. sky/templates/aws-ray.yml.j2 +37 -7
  220. sky/templates/azure-ray.yml.j2 +27 -82
  221. sky/templates/cudo-ray.yml.j2 +7 -3
  222. sky/templates/do-ray.yml.j2 +98 -0
  223. sky/templates/fluidstack-ray.yml.j2 +7 -4
  224. sky/templates/gcp-ray.yml.j2 +26 -6
  225. sky/templates/ibm-ray.yml.j2 +3 -2
  226. sky/templates/jobs-controller.yaml.j2 +46 -11
  227. sky/templates/kubernetes-ingress.yml.j2 +7 -0
  228. sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
  229. sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
  230. sky/templates/kubernetes-ray.yml.j2 +292 -25
  231. sky/templates/lambda-ray.yml.j2 +30 -40
  232. sky/templates/nebius-ray.yml.j2 +79 -0
  233. sky/templates/oci-ray.yml.j2 +18 -57
  234. sky/templates/paperspace-ray.yml.j2 +10 -6
  235. sky/templates/runpod-ray.yml.j2 +26 -4
  236. sky/templates/scp-ray.yml.j2 +3 -2
  237. sky/templates/sky-serve-controller.yaml.j2 +12 -1
  238. sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
  239. sky/templates/vast-ray.yml.j2 +70 -0
  240. sky/templates/vsphere-ray.yml.j2 +8 -3
  241. sky/templates/websocket_proxy.py +64 -0
  242. sky/usage/constants.py +10 -1
  243. sky/usage/usage_lib.py +130 -37
  244. sky/utils/accelerator_registry.py +35 -51
  245. sky/utils/admin_policy_utils.py +147 -0
  246. sky/utils/annotations.py +51 -0
  247. sky/utils/cli_utils/status_utils.py +81 -23
  248. sky/utils/cluster_utils.py +356 -0
  249. sky/utils/command_runner.py +452 -89
  250. sky/utils/command_runner.pyi +77 -3
  251. sky/utils/common.py +54 -0
  252. sky/utils/common_utils.py +319 -108
  253. sky/utils/config_utils.py +204 -0
  254. sky/utils/control_master_utils.py +48 -0
  255. sky/utils/controller_utils.py +548 -266
  256. sky/utils/dag_utils.py +93 -32
  257. sky/utils/db_utils.py +18 -4
  258. sky/utils/env_options.py +29 -7
  259. sky/utils/kubernetes/create_cluster.sh +8 -60
  260. sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
  261. sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
  262. sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
  263. sky/utils/kubernetes/gpu_labeler.py +4 -4
  264. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
  265. sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
  266. sky/utils/kubernetes/rsync_helper.sh +24 -0
  267. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
  268. sky/utils/log_utils.py +240 -33
  269. sky/utils/message_utils.py +81 -0
  270. sky/utils/registry.py +127 -0
  271. sky/utils/resources_utils.py +94 -22
  272. sky/utils/rich_utils.py +247 -18
  273. sky/utils/schemas.py +284 -64
  274. sky/{status_lib.py → utils/status_lib.py} +12 -7
  275. sky/utils/subprocess_utils.py +212 -46
  276. sky/utils/timeline.py +12 -7
  277. sky/utils/ux_utils.py +168 -15
  278. skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
  279. skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
  280. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
  281. sky/clouds/cloud_registry.py +0 -31
  282. sky/jobs/core.py +0 -330
  283. sky/skylet/providers/azure/__init__.py +0 -2
  284. sky/skylet/providers/azure/azure-vm-template.json +0 -301
  285. sky/skylet/providers/azure/config.py +0 -170
  286. sky/skylet/providers/azure/node_provider.py +0 -466
  287. sky/skylet/providers/lambda_cloud/__init__.py +0 -2
  288. sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
  289. sky/skylet/providers/oci/__init__.py +0 -2
  290. sky/skylet/providers/oci/node_provider.py +0 -488
  291. sky/skylet/providers/oci/query_helper.py +0 -383
  292. sky/skylet/providers/oci/utils.py +0 -21
  293. sky/utils/cluster_yaml_utils.py +0 -24
  294. sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
  295. skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
  296. skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
  297. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
  298. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
  299. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,681 @@
1
+ """OCI query helper class
2
+
3
+ History:
4
+ - Hysun He (hysun.he@oracle.com) @ Oct.16, 2024: Code here mainly
5
+ migrated from the old provisioning API.
6
+ - Hysun He (hysun.he@oracle.com) @ Oct.18, 2024: Enhancement.
7
+ find_compartment: allow searching the subtree when finding a compartment.
8
+ - Hysun He (hysun.he@oracle.com) @ Nov.12, 2024: Add methods to
9
+ Add/remove security rules: create_nsg_rules & remove_nsg
10
+ - Hysun He (hysun.he@oracle.com) @ Jan.01, 2025: Support reuse existing
11
+ VCN for SkyServe.
12
+ """
13
+ from datetime import datetime
14
+ import functools
15
+ from logging import Logger
16
+ import re
17
+ import time
18
+ import traceback
19
+ import typing
20
+ from typing import List, Optional, Tuple
21
+
22
+ from sky import sky_logging
23
+ from sky.adaptors import common as adaptors_common
24
+ from sky.adaptors import oci as oci_adaptor
25
+ from sky.clouds.utils import oci_utils
26
+ from sky.provision import constants
27
+ from sky.utils import resources_utils
28
+
29
+ if typing.TYPE_CHECKING:
30
+ import pandas as pd
31
+ else:
32
+ pd = adaptors_common.LazyImport('pandas')
33
+
34
+ logger = sky_logging.init_logger(__name__)
35
+
36
+
37
def debug_enabled(log: Logger):
    """Build a decorator that logs entry to and exit from a callable.

    Args:
        log: Logger that receives the debug-level 'Enter' / 'Exit' messages.

    Returns:
        A decorator. The wrapped callable keeps its metadata (through
        functools.wraps), returns whatever the original returns and lets
        any exception propagate; the 'Exit' message is logged either way.
    """

    def _timestamp() -> str:
        # Current local time, formatted the same way for both messages.
        return datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    def decorate(f):

        @functools.wraps(f)
        def wrapper(*args, **kwargs):
            log.debug(f'{_timestamp()} Enter {f}, {args}, {kwargs}')
            try:
                return f(*args, **kwargs)
            finally:
                # Take a fresh timestamp here: reusing the one captured at
                # entry would make every call look instantaneous in the log.
                log.debug(f'{_timestamp()} Exit {f}')

        return wrapper

    return decorate
53
+
54
+
55
+ class QueryHelper:
56
+ """Helper class for some OCI operations
57
+ """
58
+ # Call Cloud API to try getting the satisfied nodes.
59
@classmethod
@debug_enabled(logger)
def query_instances_by_tags(cls, tag_filters, region):
    """Search for instances that carry every one of the given tags.

    Instances whose lifecycle state is TERMINATED or TERMINATING are
    excluded by the query.

    Args:
        tag_filters: Mapping of freeform tag key to the required value.
        region: Region handed to the search client factory.

    Returns:
        The items of the resource search response.
    """
    # One '(key = ... && value = ...)' clause per tag, AND-ed together.
    tag_conditions = ' && '.join(
        (f'(freeformTags.key = \'{tag_key}\''
         f' && freeformTags.value = \'{tag_filters[tag_key]}\')')
        for tag_key in tag_filters)

    query_text = (f'query instance resources where {tag_conditions}'
                  f' && (lifecycleState != \'TERMINATED\''
                  f' && lifecycleState != \'TERMINATING\')')

    search_models = oci_adaptor.oci.resource_search.models
    search_details = search_models.StructuredSearchDetails(
        query=query_text,
        type='Structured',
        matching_context_type=(
            search_models.SearchDetails.MATCHING_CONTEXT_TYPE_NONE),
    )

    search_client = oci_adaptor.get_search_client(
        region, oci_utils.oci_config.get_profile())
    return search_client.search_resources(search_details).data.items
88
+
89
@classmethod
@debug_enabled(logger)
def terminate_instances_by_tags(cls, tag_filters, region) -> int:
    """Terminate all instances matching the tags.

    Args:
        tag_filters: Mapping of freeform tag key to value; it must contain
            the constants.TAG_RAY_CLUSTER_NAME key.
        region: Region the instances live in.

    Returns:
        The number of instances whose termination raised a ServiceError.
    """
    logger.debug(f'Terminate instance by tags: {tag_filters}')

    # Look up the cluster's NSG (never creating it) so that every instance
    # can be detached from it before being terminated.
    cluster = tag_filters[constants.TAG_RAY_CLUSTER_NAME]
    nsg_name = oci_utils.oci_config.NSG_NAME_TEMPLATE.format(
        cluster_name=cluster)
    nsg_ocid = cls.find_nsg(region, nsg_name, create_if_not_exist=False)

    compute = oci_adaptor.get_core_client(
        region, oci_utils.oci_config.get_profile())

    failures = 0
    for instance in cls.query_instances_by_tags(tag_filters, region):
        instance_ocid = instance.identifier
        logger.debug(f'Terminating instance {instance_ocid}')

        try:
            # Dropping the NSG reference first lets the NSG be deleted
            # without waiting for the instance to finish terminating.
            if nsg_ocid is not None:
                cls.detach_nsg(region, instance, nsg_ocid)

            compute.terminate_instance(instance_ocid)
        except oci_adaptor.oci.exceptions.ServiceError as e:
            failures += 1
            logger.error(f'Terminate instance failed: {str(e)}\n: {instance}')
            traceback.print_exc()

    if failures:
        logger.warning(f'Instance teardown result: {failures} failed!')
    else:
        logger.debug('Instance teardown result: OK')

    return failures
128
+
129
@classmethod
@debug_enabled(logger)
def launch_instance(cls, region, launch_config):
    """Launch a new instance from ``launch_config``; return the response."""
    core_client = oci_adaptor.get_core_client(
        region, oci_utils.oci_config.get_profile())
    return core_client.launch_instance(launch_instance_details=launch_config)
136
+
137
@classmethod
@debug_enabled(logger)
def start_instance(cls, region, instance_id):
    """Send the START action to an existing instance; return the response."""
    core_client = oci_adaptor.get_core_client(
        region, oci_utils.oci_config.get_profile())
    return core_client.instance_action(instance_id=instance_id,
                                       action='START')
144
+
145
@classmethod
@debug_enabled(logger)
def stop_instance(cls, region, instance_id):
    """Send the STOP action to an instance; return the response."""
    core_client = oci_adaptor.get_core_client(
        region, oci_utils.oci_config.get_profile())
    return core_client.instance_action(instance_id=instance_id,
                                       action='STOP')
152
+
153
@classmethod
@debug_enabled(logger)
def wait_instance_until_status(cls, region, node_id, status):
    """Wait for an instance to reach the given lifecycle state.

    Fetches the instance once, then hands the response to
    ``oci.wait_until`` with the 'lifecycle_state' property and ``status``.

    Args:
        region: Region the instance lives in.
        node_id: Identifier of the instance to watch.
        status: Target value of the instance's lifecycle_state.
    """
    core_client = oci_adaptor.get_core_client(
        region, oci_utils.oci_config.get_profile())
    current = core_client.get_instance(instance_id=node_id)
    oci_adaptor.oci.wait_until(core_client, current, 'lifecycle_state',
                               status)
168
+
169
@classmethod
def get_instance_primary_vnic(cls, region, inst_info):
    """Return the VNIC behind the instance's first VNIC attachment.

    Args:
        region: Region the instance lives in.
        inst_info: Mapping providing the 'ad' (availability domain),
            'compartment' and 'inst_id' entries of the instance.

    Returns:
        The data of the get_vnic response for the first listed attachment.
    """
    core_client = oci_adaptor.get_core_client(
        region, oci_utils.oci_config.get_profile())
    attachments = core_client.list_vnic_attachments(
        availability_domain=inst_info['ad'],
        compartment_id=inst_info['compartment'],
        instance_id=inst_info['inst_id'],
    ).data
    first_attachment = attachments[0]

    net_client = oci_adaptor.get_net_client(
        region, oci_utils.oci_config.get_profile())
    return net_client.get_vnic(vnic_id=first_attachment.vnic_id).data
182
+
183
@classmethod
@debug_enabled(logger)
def subscribe_image(cls, compartment_id, listing_id, resource_version,
                    region):
    """Subscribe a compartment to an app-catalog listing, if one is given.

    Returns immediately without calling the API when ``listing_id`` is
    missing: NA according to pandas, or the text 'None' / 'nan' once
    stripped.

    Args:
        compartment_id: Compartment that gets the subscription.
        listing_id: Identifier of the app-catalog listing.
        resource_version: Listing resource version to look up agreements for.
        region: Region handed to the core client factory.

    Raises:
        RuntimeError: If fetching the agreements or creating the
            subscription raises a ServiceError (chained as the cause).
    """
    if (pd.isna(listing_id) or listing_id.strip() == 'None' or
            listing_id.strip() == 'nan'):
        return

    core_client = oci_adaptor.get_core_client(
        region, oci_utils.oci_config.get_profile())
    try:
        agreements = core_client.get_app_catalog_listing_agreements(
            listing_id=listing_id, resource_version=resource_version).data

        # Rewrite the trailing '<3 digits>+HH:MM' of the stringified
        # timestamp to 'Z' so that it matches the strptime format below.
        # re.sub replaces all matches by default; the count is no longer
        # passed positionally, which Python 3.13 deprecates.
        retrieved_text = re.sub(r'\d{3}\+\d{2}\:\d{2}', 'Z',
                                str(agreements.time_retrieved))
        time_retrieved = datetime.strptime(retrieved_text,
                                           '%Y-%m-%d %H:%M:%S.%fZ')

        subscription_details = (
            oci_adaptor.oci.core.models.CreateAppCatalogSubscriptionDetails(
                compartment_id=compartment_id,
                listing_id=listing_id,
                listing_resource_version=agreements.listing_resource_version,
                oracle_terms_of_use_link=agreements.oracle_terms_of_use_link,
                time_retrieved=time_retrieved,
                signature=agreements.signature,
                eula_link=agreements.eula_link,
            ))
        core_client.create_app_catalog_subscription(
            create_app_catalog_subscription_details=subscription_details)
    except oci_adaptor.oci.exceptions.ServiceError as e:
        # A separator between the version and the error text keeps the
        # message readable (the two used to run together).
        logger.critical(
            f'[Failed] subscribe_image: {listing_id} - {resource_version} '
            f'Error message: {str(e)}')
        raise RuntimeError('ERR: Image subscription error!') from e
224
+
225
@classmethod
@debug_enabled(logger)
def find_compartment(cls, region) -> str:
    """Resolve the compartment to use in ``region``.

    Resolution order:
      1. The compartment configured for the region, when there is one.
      2. The most recently created ACTIVE and ACCESSIBLE compartment named
         ``oci_config.COMPARTMENT``, searched through the whole subtree
         of the tenancy.
      3. The tenancy (root compartment) itself.
    """
    configured = oci_utils.oci_config.get_compartment(region)
    if configured is not None:
        return configured

    # The profile is passed explicitly so that OCI config files holding
    # several profiles are supported.
    tenancy = oci_adaptor.get_oci_config(
        region, oci_utils.oci_config.get_profile())['tenancy']

    identity_client = oci_adaptor.get_identity_client(
        region, oci_utils.oci_config.get_profile())
    matches = identity_client.list_compartments(
        compartment_id=tenancy,
        name=oci_utils.oci_config.COMPARTMENT,
        compartment_id_in_subtree=True,
        access_level='ACCESSIBLE',
        lifecycle_state='ACTIVE',
        sort_by='TIMECREATED',
        sort_order='DESC',
        limit=1).data

    # Fall back to the root compartment when nothing matched.
    return matches[0].id if matches else tenancy
259
+
260
@classmethod
@debug_enabled(logger)
def find_create_vcn_subnet(cls, region) -> Optional[str]:
    """Return the subnet to place instances in, creating the VCN if needed.

    A subnet configured for the region wins. Otherwise the AVAILABLE VCN
    named ``oci_config.VCN_NAME`` is looked up in the resolved
    compartment: when found, its subnet named
    ``oci_config.VCN_SUBNET_NAME`` is returned; when not found, the VCN
    and its subnet are created through ``create_vcn_subnet``.

    Raises:
        RuntimeError: If the VCN exists but holds no matching subnet.
    """
    configured_subnet = oci_utils.oci_config.get_vcn_subnet(region)
    if configured_subnet is not None:
        # Explicitly set by the user in the sky config.
        return configured_subnet

    net_client = oci_adaptor.get_net_client(
        region, oci_utils.oci_config.get_profile())
    compartment = cls.find_compartment(region)
    existing_vcns = net_client.list_vcns(
        compartment_id=compartment,
        display_name=oci_utils.oci_config.VCN_NAME,
        lifecycle_state='AVAILABLE').data
    if not existing_vcns:
        # No VCN yet: create it together with its related resources.
        return cls.create_vcn_subnet(net_client, compartment)

    # Reuse the existing VCN.
    vcn_ocid = existing_vcns[0].id
    subnets = net_client.list_subnets(
        compartment_id=compartment,
        limit=1,
        vcn_id=vcn_ocid,
        display_name=oci_utils.oci_config.VCN_SUBNET_NAME,
        lifecycle_state='AVAILABLE').data
    logger.debug(f'Got VCN subnet \n{subnets}')
    if len(subnets) < 1:
        logger.error(f'No subnet {oci_utils.oci_config.VCN_SUBNET_NAME} '
                     f'found in the VCN {oci_utils.oci_config.VCN_NAME}')
        raise RuntimeError(
            f'VcnSubnetNotFound Error: No subnet '
            f'{oci_utils.oci_config.VCN_SUBNET_NAME} found in '
            f'the VCN {oci_utils.oci_config.VCN_NAME}')
    return subnets[0].id
301
+
302
@classmethod
@debug_enabled(logger)
def create_vcn_subnet(cls, net_client,
                      skypilot_compartment) -> Optional[str]:
    """Create the SkyPilot VCN and everything its public subnet needs.

    In order: the VCN, an internet gateway, the subnet, a service gateway
    (only when an 'all-...-services-in-oracle-services-network' service is
    listed), the ingress rules of the default security list and the
    default route through the internet gateway.

    Args:
        net_client: Network client used for every call.
        skypilot_compartment: Compartment that owns the new resources.

    Returns:
        The id of the created subnet, or None when a ServiceError was
        raised; in that case whatever had been created so far is handed
        to ``delete_vcn`` for cleanup.
    """
    cfg = oci_utils.oci_config
    models = oci_adaptor.oci.core.models

    vcn_ocid = None  # VCN for the resources
    subnet_ocid = None  # Subnet for the VMs
    internet_gw_ocid = None  # Internet gateway
    service_gw_ocid = None  # Service gateway

    try:
        # pylint: disable=line-too-long
        vcn = net_client.create_vcn(
            create_vcn_details=models.CreateVcnDetails(
                compartment_id=skypilot_compartment,
                cidr_blocks=[cfg.VCN_CIDR],
                display_name=cfg.VCN_NAME,
                is_ipv6_enabled=False,
                dns_label=cfg.VCN_DNS_LABEL)).data
        logger.debug(f'Created VCN \n{vcn}')
        vcn_ocid = vcn.id
        default_route_table = vcn.default_route_table_id
        default_security_list = vcn.default_security_list_id
        default_dhcp_options = vcn.default_dhcp_options_id

        # Internet gateway: gives the VCN internet access.
        internet_gw = net_client.create_internet_gateway(
            create_internet_gateway_details=models.CreateInternetGatewayDetails(
                compartment_id=skypilot_compartment,
                is_enabled=True,
                vcn_id=vcn_ocid,
                display_name=cfg.VCN_INTERNET_GATEWAY_NAME)).data
        logger.debug(f'Created internet gateway \n{internet_gw}')
        internet_gw_ocid = internet_gw.id

        # Public subnet.
        new_subnet = net_client.create_subnet(
            create_subnet_details=models.CreateSubnetDetails(
                cidr_block=cfg.VCN_SUBNET_CIDR,
                compartment_id=skypilot_compartment,
                vcn_id=vcn_ocid,
                dhcp_options_id=default_dhcp_options,
                display_name=cfg.VCN_SUBNET_NAME,
                prohibit_internet_ingress=False,
                prohibit_public_ip_on_vnic=False,
                route_table_id=default_route_table,
                security_list_ids=[default_security_list])).data
        logger.debug(f'Created subnet \n{new_subnet}')
        subnet_ocid = new_subnet.id

        all_services = [
            svc for svc in net_client.list_services(limit=100).data
            if str(svc.cidr_block).startswith('all-') and
            str(svc.cidr_block).endswith(
                '-services-in-oracle-services-network')
        ]
        if all_services:
            # Service gateway for the regional services.
            service_gw = net_client.create_service_gateway(
                create_service_gateway_details=models.CreateServiceGatewayDetails(
                    compartment_id=skypilot_compartment,
                    services=[
                        models.ServiceIdRequestDetails(
                            service_id=all_services[0].id)
                    ],
                    vcn_id=vcn_ocid)).data
            logger.debug(f'Service Gateway: \n{service_gw}')
            service_gw_ocid = service_gw.id

        # Ingress rules: SSH and ICMP from the internet CIDR, all traffic
        # inside the subnet, and ICMP type 3 from the VCN CIDR.
        ingress_rules = [
            models.IngressSecurityRule(
                protocol='6',
                source=cfg.VCN_CIDR_INTERNET,
                is_stateless=False,
                source_type='CIDR_BLOCK',
                tcp_options=models.TcpOptions(
                    destination_port_range=models.PortRange(max=22, min=22),
                    source_port_range=models.PortRange(max=65535, min=1)),
                description='Allow SSH port.'),
            models.IngressSecurityRule(
                protocol='all',
                source=cfg.VCN_SUBNET_CIDR,
                is_stateless=False,
                source_type='CIDR_BLOCK',
                description='Allow all traffic from/to same subnet.'),
            models.IngressSecurityRule(
                protocol='1',
                source=cfg.VCN_CIDR_INTERNET,
                is_stateless=False,
                source_type='CIDR_BLOCK',
                icmp_options=models.IcmpOptions(type=3, code=4),
                description='ICMP traffic.'),
            models.IngressSecurityRule(
                protocol='1',
                source=cfg.VCN_CIDR,
                is_stateless=False,
                source_type='CIDR_BLOCK',
                icmp_options=models.IcmpOptions(type=3),
                description='ICMP traffic (VCN).'),
        ]
        updated_security_list = net_client.update_security_list(
            security_list_id=default_security_list,
            update_security_list_details=models.UpdateSecurityListDetails(
                ingress_security_rules=ingress_rules)).data
        logger.debug(f'Updated security_list: \n{updated_security_list}')

        # Default route: send 0.0.0.0/0 through the internet gateway.
        updated_route_table = net_client.update_route_table(
            rt_id=default_route_table,
            update_route_table_details=models.UpdateRouteTableDetails(
                route_rules=[
                    models.RouteRule(
                        network_entity_id=internet_gw_ocid,
                        destination='0.0.0.0/0',
                        destination_type='CIDR_BLOCK',
                        description='Route table for SkyPilot VCN',
                        route_type='STATIC')
                ])).data
        logger.debug(f'Route table: \n{updated_route_table}')

    except oci_adaptor.oci.exceptions.ServiceError as e:
        logger.error(f'Create VCN Error: Create new VCN '
                     f'{cfg.VCN_NAME} failed: {str(e)}')
        # The creation may have succeeded partially: clean up what exists.
        cls.delete_vcn(net_client, vcn_ocid, subnet_ocid, internet_gw_ocid,
                       service_gw_ocid)
        subnet_ocid = None

    return subnet_ocid
442
+
443
@classmethod
@debug_enabled(logger)
def delete_vcn(cls, net_client, skypilot_vcn, skypilot_subnet,
               internet_gateway, service_gateway):
    """Delete a VCN together with its gateways and subnet.

    Resources passed as None are skipped; nothing happens at all when
    ``skypilot_vcn`` is None. The VCN deletion itself is attempted up to
    ``oci_config.MAX_RETRY_COUNT`` times, sleeping
    ``oci_config.RETRY_INTERVAL_BASE_SECONDS`` between attempts. A
    ServiceError from any step is logged and not re-raised.
    """
    if skypilot_vcn is None:
        return  # Nothing to delete

    try:
        if internet_gateway is not None:
            ig_result = net_client.delete_internet_gateway(
                ig_id=internet_gateway).data
            logger.debug(f'Deleted internet gateway {internet_gateway}'
                         f'-{ig_result}')

        if service_gateway is not None:
            sg_result = net_client.delete_service_gateway(
                service_gateway_id=service_gateway).data
            logger.debug(f'Deleted service gateway {service_gateway}'
                         f'-{sg_result}')

        if skypilot_subnet is not None:
            subnet_result = net_client.delete_subnet(
                subnet_id=skypilot_subnet).data
            logger.debug(f'Deleted subnet {skypilot_subnet}'
                         f'-{subnet_result}')

        # The VCN deletion is retried; the log message below indicates the
        # deletions above may still be in progress when it first fails.
        max_attempts = oci_utils.oci_config.MAX_RETRY_COUNT
        for attempt in range(1, max_attempts + 1):
            try:
                vcn_result = net_client.delete_vcn(vcn_id=skypilot_vcn).data
                logger.debug(f'Deleted vcn {skypilot_vcn}-{vcn_result}')
                break
            except oci_adaptor.oci.exceptions.ServiceError as e:
                logger.info(f'Waiting del SG/IG/Subnet finish: {str(e)}')
                if attempt == max_attempts:
                    # Out of attempts: let the outer handler log it.
                    raise e
                time.sleep(oci_utils.oci_config.RETRY_INTERVAL_BASE_SECONDS)

    except oci_adaptor.oci.exceptions.ServiceError as e:
        logger.error(
            f'Delete VCN {oci_utils.oci_config.VCN_NAME} Error: {str(e)}')
490
+
491
@classmethod
@debug_enabled(logger)
def find_nsg(cls, region: str, nsg_name: str,
             create_if_not_exist: bool) -> Optional[str]:
    """Look up a network security group by display name.

    Args:
        region: Region used to build the network client and to resolve
            the compartment and VCN.
        nsg_name: Display name of the NSG to look up.
        create_if_not_exist: When True and no NSG with that name is
            found, a new one is created in the resolved VCN.

    Returns:
        The id of the existing or newly created NSG. None when no VCN
        could be resolved, or when the NSG is missing and
        create_if_not_exist is False.
    """
    net_client = oci_adaptor.get_net_client(
        region, oci_utils.oci_config.get_profile())
    compartment = cls.find_compartment(region)

    vcn_id = oci_utils.oci_config.get_vcn_ocid(region)
    if vcn_id is None:
        # No VCN id from the configuration: fall back to the available
        # VCN carrying the SkyPilot VCN display name.
        available_vcns = net_client.list_vcns(
            compartment_id=compartment,
            display_name=oci_utils.oci_config.VCN_NAME,
            lifecycle_state='AVAILABLE',
        ).data

        # The list can be empty, in which case there is no VCN to search
        # and therefore no NSG either.
        if not available_vcns:
            return None
        vcn_id = available_vcns[0].id

    matches = net_client.list_network_security_groups(
        compartment_id=compartment,
        vcn_id=vcn_id,
        limit=1,
        display_name=nsg_name,
    ).data

    if matches:
        # The query asked for at most one result.
        assert len(matches) == 1
        return matches[0].id
    if not create_if_not_exist:
        return None

    # Continue to create new NSG if not exists
    nsg_details = oci_adaptor.oci.core.models.CreateNetworkSecurityGroupDetails(
        compartment_id=compartment,
        vcn_id=vcn_id,
        display_name=nsg_name,
    )
    created = net_client.create_network_security_group(
        create_network_security_group_details=nsg_details)
    nsg_response = net_client.get_network_security_group(
        network_security_group_id=created.data.id)
    # Block until the new NSG reports lifecycle_state == 'AVAILABLE'.
    oci_adaptor.oci.wait_until(
        net_client,
        nsg_response,
        'lifecycle_state',
        'AVAILABLE',
    )

    return nsg_response.data.id
548
+
549
@classmethod
def get_range_min_max(cls, port_range: str) -> Tuple[int, int]:
    """Convert a port specification into an inclusive (min, max) pair.

    Args:
        port_range: Either a single port ('8080') or a dash-separated
            range ('8000-8100').

    Returns:
        (port, port) for a single port, (low, high) for a range.

    Raises:
        ValueError: If a part is not an integer or the string contains
            more than one dash.
    """
    bounds = port_range.split('-')
    if len(bounds) != 1:
        # Unpacking fails with ValueError when there are more than two parts.
        low, high = bounds
        return (int(low), int(high))
    port = int(bounds[0])
    return (port, port)
556
+
557
@classmethod
@debug_enabled(logger)
def create_nsg_rules(cls, region: str, cluster_name: str,
                     ports: List[str]) -> None:
    """Create per-cluster NSG with ingress rules.

    Finds (or creates) the cluster's NSG, attaches it to the primary
    VNIC of every cluster instance that has no NSG yet, and adds TCP
    ingress rules for the requested ports that existing rules do not
    already cover.

    Args:
        region: Region of the cluster.
        cluster_name: Cluster name; used to derive the NSG name and to
            find the cluster's instances by tag.
        ports: Ports or port ranges to open (e.g. '8080', '8000-8100').
            Nothing is done when empty.
    """
    if not ports:
        return

    net_client = oci_adaptor.get_net_client(
        region, oci_utils.oci_config.get_profile())

    nsg_name = oci_utils.oci_config.NSG_NAME_TEMPLATE.format(
        cluster_name=cluster_name)
    nsg_id = cls.find_nsg(region, nsg_name, create_if_not_exist=True)

    filters = {constants.TAG_RAY_CLUSTER_NAME: cluster_name}
    insts = query_helper.query_instances_by_tags(filters, region)
    for inst in insts:
        vnic = cls.get_instance_primary_vnic(
            region=region,
            inst_info={
                'inst_id': inst.identifier,
                'ad': inst.availability_domain,
                'compartment': inst.compartment_id,
            })
        # Only attach the cluster NSG when the VNIC has none; a VNIC that
        # already has NSGs is left untouched.
        if not vnic.nsg_ids:
            net_client.update_vnic(
                vnic_id=vnic.id,
                update_vnic_details=oci_adaptor.oci.core.models.
                UpdateVnicDetails(nsg_ids=[nsg_id],
                                  skip_source_dest_check=False),
            )

    # pylint: disable=line-too-long
    list_nsg_rules_resp = net_client.list_network_security_group_security_rules(
        network_security_group_id=nsg_id,
        direction='INGRESS',
        sort_by='TIMECREATED',
        sort_order='DESC',
    )

    existing_port_ranges: List[str] = []
    for rule in list_nsg_rules_resp.data:
        if not rule.tcp_options:
            continue
        options_range = rule.tcp_options.destination_port_range
        if options_range is None:
            # destination_port_range is optional on TcpOptions. Such a rule
            # names no specific ports, so skip it instead of failing on
            # `.min`/`.max` of None.
            continue
        existing_port_ranges.append(
            f'{options_range.min}-{options_range.max}')

    new_ports = resources_utils.port_ranges_to_set(ports)
    existing_ports = resources_utils.port_ranges_to_set(
        existing_port_ranges)
    if new_ports.issubset(existing_ports):
        # Requested ports are already covered by the existing rules,
        # nothing to add.
        return

    # Determine the ports to be added, without overlapping.
    ports_to_open = new_ports - existing_ports
    port_ranges_to_open = resources_utils.port_set_to_ranges(ports_to_open)

    new_rules = []
    for port_range in port_ranges_to_open:
        port_range_min, port_range_max = cls.get_range_min_max(port_range)
        new_rules.append(
            oci_adaptor.oci.core.models.AddSecurityRuleDetails(
                direction='INGRESS',
                protocol='6',  # IANA protocol number for TCP
                is_stateless=False,
                source=oci_utils.oci_config.VCN_CIDR_INTERNET,
                source_type='CIDR_BLOCK',
                tcp_options=oci_adaptor.oci.core.models.TcpOptions(
                    destination_port_range=oci_adaptor.oci.core.models.
                    PortRange(min=port_range_min, max=port_range_max),),
                description=oci_utils.oci_config.SERVICE_PORT_RULE_TAG,
            ))

    net_client.add_network_security_group_security_rules(
        network_security_group_id=nsg_id,
        add_network_security_group_security_rules_details=oci_adaptor.oci.
        core.models.AddNetworkSecurityGroupSecurityRulesDetails(
            security_rules=new_rules),
    )
640
+
641
@classmethod
@debug_enabled(logger)
def detach_nsg(cls, region: str, inst, nsg_id: Optional[str]) -> None:
    """Detach NSGs from the primary VNIC of an instance.

    Does nothing when nsg_id is None. Otherwise the VNIC is updated with
    an empty NSG list, so every NSG attached to it is removed; nsg_id
    itself is only used for the None check.

    Args:
        region: Region of the instance.
        inst: Instance record providing `identifier`,
            `availability_domain` and `compartment_id`.
        nsg_id: Id of the NSG about to be removed, or None.
    """
    if nsg_id is None:
        return

    inst_info = {
        'inst_id': inst.identifier,
        'ad': inst.availability_domain,
        'compartment': inst.compartment_id,
    }
    vnic = cls.get_instance_primary_vnic(region=region, inst_info=inst_info)

    # Detach the NSG before removing it.
    net_client = oci_adaptor.get_net_client(
        region, oci_utils.oci_config.get_profile())
    no_nsg_details = oci_adaptor.oci.core.models.UpdateVnicDetails(
        nsg_ids=[], skip_source_dest_check=False)
    net_client.update_vnic(
        vnic_id=vnic.id,
        update_vnic_details=no_nsg_details,
    )
662
+
663
@classmethod
@debug_enabled(logger)
def remove_cluster_nsg(cls, region: str, cluster_name: str) -> None:
    """Delete the cluster's NSG if it exists.

    The NSG name is derived from NSG_NAME_TEMPLATE and the cluster name.
    When no such NSG is found the method returns without doing anything.

    Args:
        region: Region of the cluster.
        cluster_name: Name of the cluster whose NSG should be removed.
    """
    net_client = oci_adaptor.get_net_client(
        region, oci_utils.oci_config.get_profile())

    nsg_id = cls.find_nsg(
        region,
        oci_utils.oci_config.NSG_NAME_TEMPLATE.format(
            cluster_name=cluster_name),
        create_if_not_exist=False,
    )
    if nsg_id is not None:
        # Delete the NSG
        net_client.delete_network_security_group(
            network_security_group_id=nsg_id)
679
+
680
+
681
# Module-level QueryHelper instance. The class's own methods refer to it by
# this name (e.g. create_nsg_rules calls query_helper.query_instances_by_tags).
query_helper = QueryHelper()
@@ -19,6 +19,12 @@ INSTANCE_TO_TEMPLATEID = {
19
19
  'V100-32Gx2': 'twnlo3zj',
20
20
  'V100-32G': 'twnlo3zj',
21
21
  'V100': 'twnlo3zj',
22
+ 'GPU+': 'twnlo3zj',
23
+ 'P4000': 'twnlo3zj',
24
+ 'P4000x2': 'twnlo3zj',
25
+ 'A4000': 'twnlo3zj',
26
+ 'A4000x2': 'twnlo3zj',
27
+ 'A4000x4': 'twnlo3zj',
22
28
  **CPU_INSTANCES_TEMPLATEID
23
29
  }
24
30
  NVLINK_INSTANCES = {