skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (299) hide show
  1. sky/__init__.py +64 -32
  2. sky/adaptors/aws.py +23 -6
  3. sky/adaptors/azure.py +432 -15
  4. sky/adaptors/cloudflare.py +5 -5
  5. sky/adaptors/common.py +19 -9
  6. sky/adaptors/do.py +20 -0
  7. sky/adaptors/gcp.py +3 -2
  8. sky/adaptors/kubernetes.py +122 -88
  9. sky/adaptors/nebius.py +100 -0
  10. sky/adaptors/oci.py +39 -1
  11. sky/adaptors/vast.py +29 -0
  12. sky/admin_policy.py +101 -0
  13. sky/authentication.py +117 -98
  14. sky/backends/backend.py +52 -20
  15. sky/backends/backend_utils.py +669 -557
  16. sky/backends/cloud_vm_ray_backend.py +1099 -808
  17. sky/backends/local_docker_backend.py +14 -8
  18. sky/backends/wheel_utils.py +38 -20
  19. sky/benchmark/benchmark_utils.py +22 -23
  20. sky/check.py +76 -27
  21. sky/cli.py +1586 -1139
  22. sky/client/__init__.py +1 -0
  23. sky/client/cli.py +5683 -0
  24. sky/client/common.py +345 -0
  25. sky/client/sdk.py +1765 -0
  26. sky/cloud_stores.py +283 -19
  27. sky/clouds/__init__.py +7 -2
  28. sky/clouds/aws.py +303 -112
  29. sky/clouds/azure.py +185 -179
  30. sky/clouds/cloud.py +115 -37
  31. sky/clouds/cudo.py +29 -22
  32. sky/clouds/do.py +313 -0
  33. sky/clouds/fluidstack.py +44 -54
  34. sky/clouds/gcp.py +206 -65
  35. sky/clouds/ibm.py +26 -21
  36. sky/clouds/kubernetes.py +345 -91
  37. sky/clouds/lambda_cloud.py +40 -29
  38. sky/clouds/nebius.py +297 -0
  39. sky/clouds/oci.py +129 -90
  40. sky/clouds/paperspace.py +22 -18
  41. sky/clouds/runpod.py +53 -34
  42. sky/clouds/scp.py +28 -24
  43. sky/clouds/service_catalog/__init__.py +19 -13
  44. sky/clouds/service_catalog/aws_catalog.py +29 -12
  45. sky/clouds/service_catalog/azure_catalog.py +33 -6
  46. sky/clouds/service_catalog/common.py +95 -75
  47. sky/clouds/service_catalog/constants.py +3 -3
  48. sky/clouds/service_catalog/cudo_catalog.py +13 -3
  49. sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
  50. sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
  51. sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
  52. sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
  53. sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
  54. sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
  55. sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
  56. sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
  57. sky/clouds/service_catalog/do_catalog.py +111 -0
  58. sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
  59. sky/clouds/service_catalog/gcp_catalog.py +16 -2
  60. sky/clouds/service_catalog/ibm_catalog.py +2 -2
  61. sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
  62. sky/clouds/service_catalog/lambda_catalog.py +8 -3
  63. sky/clouds/service_catalog/nebius_catalog.py +116 -0
  64. sky/clouds/service_catalog/oci_catalog.py +31 -4
  65. sky/clouds/service_catalog/paperspace_catalog.py +2 -2
  66. sky/clouds/service_catalog/runpod_catalog.py +2 -2
  67. sky/clouds/service_catalog/scp_catalog.py +2 -2
  68. sky/clouds/service_catalog/vast_catalog.py +104 -0
  69. sky/clouds/service_catalog/vsphere_catalog.py +2 -2
  70. sky/clouds/utils/aws_utils.py +65 -0
  71. sky/clouds/utils/azure_utils.py +91 -0
  72. sky/clouds/utils/gcp_utils.py +5 -9
  73. sky/clouds/utils/oci_utils.py +47 -5
  74. sky/clouds/utils/scp_utils.py +4 -3
  75. sky/clouds/vast.py +280 -0
  76. sky/clouds/vsphere.py +22 -18
  77. sky/core.py +361 -107
  78. sky/dag.py +41 -28
  79. sky/data/data_transfer.py +37 -0
  80. sky/data/data_utils.py +211 -32
  81. sky/data/mounting_utils.py +182 -30
  82. sky/data/storage.py +2118 -270
  83. sky/data/storage_utils.py +126 -5
  84. sky/exceptions.py +179 -8
  85. sky/execution.py +158 -85
  86. sky/global_user_state.py +150 -34
  87. sky/jobs/__init__.py +12 -10
  88. sky/jobs/client/__init__.py +0 -0
  89. sky/jobs/client/sdk.py +302 -0
  90. sky/jobs/constants.py +49 -11
  91. sky/jobs/controller.py +161 -99
  92. sky/jobs/dashboard/dashboard.py +171 -25
  93. sky/jobs/dashboard/templates/index.html +572 -60
  94. sky/jobs/recovery_strategy.py +157 -156
  95. sky/jobs/scheduler.py +307 -0
  96. sky/jobs/server/__init__.py +1 -0
  97. sky/jobs/server/core.py +598 -0
  98. sky/jobs/server/dashboard_utils.py +69 -0
  99. sky/jobs/server/server.py +190 -0
  100. sky/jobs/state.py +627 -122
  101. sky/jobs/utils.py +615 -206
  102. sky/models.py +27 -0
  103. sky/optimizer.py +142 -83
  104. sky/provision/__init__.py +20 -5
  105. sky/provision/aws/config.py +124 -42
  106. sky/provision/aws/instance.py +130 -53
  107. sky/provision/azure/__init__.py +7 -0
  108. sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
  109. sky/provision/azure/config.py +220 -0
  110. sky/provision/azure/instance.py +1012 -37
  111. sky/provision/common.py +31 -3
  112. sky/provision/constants.py +25 -0
  113. sky/provision/cudo/__init__.py +2 -1
  114. sky/provision/cudo/cudo_utils.py +112 -0
  115. sky/provision/cudo/cudo_wrapper.py +37 -16
  116. sky/provision/cudo/instance.py +28 -12
  117. sky/provision/do/__init__.py +11 -0
  118. sky/provision/do/config.py +14 -0
  119. sky/provision/do/constants.py +10 -0
  120. sky/provision/do/instance.py +287 -0
  121. sky/provision/do/utils.py +301 -0
  122. sky/provision/docker_utils.py +82 -46
  123. sky/provision/fluidstack/fluidstack_utils.py +57 -125
  124. sky/provision/fluidstack/instance.py +15 -43
  125. sky/provision/gcp/config.py +19 -9
  126. sky/provision/gcp/constants.py +7 -1
  127. sky/provision/gcp/instance.py +55 -34
  128. sky/provision/gcp/instance_utils.py +339 -80
  129. sky/provision/gcp/mig_utils.py +210 -0
  130. sky/provision/instance_setup.py +172 -133
  131. sky/provision/kubernetes/__init__.py +1 -0
  132. sky/provision/kubernetes/config.py +104 -90
  133. sky/provision/kubernetes/constants.py +8 -0
  134. sky/provision/kubernetes/instance.py +680 -325
  135. sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
  136. sky/provision/kubernetes/network.py +54 -20
  137. sky/provision/kubernetes/network_utils.py +70 -21
  138. sky/provision/kubernetes/utils.py +1370 -251
  139. sky/provision/lambda_cloud/__init__.py +11 -0
  140. sky/provision/lambda_cloud/config.py +10 -0
  141. sky/provision/lambda_cloud/instance.py +265 -0
  142. sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
  143. sky/provision/logging.py +1 -1
  144. sky/provision/nebius/__init__.py +11 -0
  145. sky/provision/nebius/config.py +11 -0
  146. sky/provision/nebius/instance.py +285 -0
  147. sky/provision/nebius/utils.py +318 -0
  148. sky/provision/oci/__init__.py +15 -0
  149. sky/provision/oci/config.py +51 -0
  150. sky/provision/oci/instance.py +436 -0
  151. sky/provision/oci/query_utils.py +681 -0
  152. sky/provision/paperspace/constants.py +6 -0
  153. sky/provision/paperspace/instance.py +4 -3
  154. sky/provision/paperspace/utils.py +2 -0
  155. sky/provision/provisioner.py +207 -130
  156. sky/provision/runpod/__init__.py +1 -0
  157. sky/provision/runpod/api/__init__.py +3 -0
  158. sky/provision/runpod/api/commands.py +119 -0
  159. sky/provision/runpod/api/pods.py +142 -0
  160. sky/provision/runpod/instance.py +64 -8
  161. sky/provision/runpod/utils.py +239 -23
  162. sky/provision/vast/__init__.py +10 -0
  163. sky/provision/vast/config.py +11 -0
  164. sky/provision/vast/instance.py +247 -0
  165. sky/provision/vast/utils.py +162 -0
  166. sky/provision/vsphere/common/vim_utils.py +1 -1
  167. sky/provision/vsphere/instance.py +8 -18
  168. sky/provision/vsphere/vsphere_utils.py +1 -1
  169. sky/resources.py +247 -102
  170. sky/serve/__init__.py +9 -9
  171. sky/serve/autoscalers.py +361 -299
  172. sky/serve/client/__init__.py +0 -0
  173. sky/serve/client/sdk.py +366 -0
  174. sky/serve/constants.py +12 -3
  175. sky/serve/controller.py +106 -36
  176. sky/serve/load_balancer.py +63 -12
  177. sky/serve/load_balancing_policies.py +84 -2
  178. sky/serve/replica_managers.py +42 -34
  179. sky/serve/serve_state.py +62 -32
  180. sky/serve/serve_utils.py +271 -160
  181. sky/serve/server/__init__.py +0 -0
  182. sky/serve/{core.py → server/core.py} +271 -90
  183. sky/serve/server/server.py +112 -0
  184. sky/serve/service.py +52 -16
  185. sky/serve/service_spec.py +95 -32
  186. sky/server/__init__.py +1 -0
  187. sky/server/common.py +430 -0
  188. sky/server/constants.py +21 -0
  189. sky/server/html/log.html +174 -0
  190. sky/server/requests/__init__.py +0 -0
  191. sky/server/requests/executor.py +472 -0
  192. sky/server/requests/payloads.py +487 -0
  193. sky/server/requests/queues/__init__.py +0 -0
  194. sky/server/requests/queues/mp_queue.py +76 -0
  195. sky/server/requests/requests.py +567 -0
  196. sky/server/requests/serializers/__init__.py +0 -0
  197. sky/server/requests/serializers/decoders.py +192 -0
  198. sky/server/requests/serializers/encoders.py +166 -0
  199. sky/server/server.py +1106 -0
  200. sky/server/stream_utils.py +141 -0
  201. sky/setup_files/MANIFEST.in +2 -5
  202. sky/setup_files/dependencies.py +159 -0
  203. sky/setup_files/setup.py +14 -125
  204. sky/sky_logging.py +59 -14
  205. sky/skylet/autostop_lib.py +2 -2
  206. sky/skylet/constants.py +183 -50
  207. sky/skylet/events.py +22 -10
  208. sky/skylet/job_lib.py +403 -258
  209. sky/skylet/log_lib.py +111 -71
  210. sky/skylet/log_lib.pyi +6 -0
  211. sky/skylet/providers/command_runner.py +6 -8
  212. sky/skylet/providers/ibm/node_provider.py +2 -2
  213. sky/skylet/providers/scp/config.py +11 -3
  214. sky/skylet/providers/scp/node_provider.py +8 -8
  215. sky/skylet/skylet.py +3 -1
  216. sky/skylet/subprocess_daemon.py +69 -17
  217. sky/skypilot_config.py +119 -57
  218. sky/task.py +205 -64
  219. sky/templates/aws-ray.yml.j2 +37 -7
  220. sky/templates/azure-ray.yml.j2 +27 -82
  221. sky/templates/cudo-ray.yml.j2 +7 -3
  222. sky/templates/do-ray.yml.j2 +98 -0
  223. sky/templates/fluidstack-ray.yml.j2 +7 -4
  224. sky/templates/gcp-ray.yml.j2 +26 -6
  225. sky/templates/ibm-ray.yml.j2 +3 -2
  226. sky/templates/jobs-controller.yaml.j2 +46 -11
  227. sky/templates/kubernetes-ingress.yml.j2 +7 -0
  228. sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
  229. sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
  230. sky/templates/kubernetes-ray.yml.j2 +292 -25
  231. sky/templates/lambda-ray.yml.j2 +30 -40
  232. sky/templates/nebius-ray.yml.j2 +79 -0
  233. sky/templates/oci-ray.yml.j2 +18 -57
  234. sky/templates/paperspace-ray.yml.j2 +10 -6
  235. sky/templates/runpod-ray.yml.j2 +26 -4
  236. sky/templates/scp-ray.yml.j2 +3 -2
  237. sky/templates/sky-serve-controller.yaml.j2 +12 -1
  238. sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
  239. sky/templates/vast-ray.yml.j2 +70 -0
  240. sky/templates/vsphere-ray.yml.j2 +8 -3
  241. sky/templates/websocket_proxy.py +64 -0
  242. sky/usage/constants.py +10 -1
  243. sky/usage/usage_lib.py +130 -37
  244. sky/utils/accelerator_registry.py +35 -51
  245. sky/utils/admin_policy_utils.py +147 -0
  246. sky/utils/annotations.py +51 -0
  247. sky/utils/cli_utils/status_utils.py +81 -23
  248. sky/utils/cluster_utils.py +356 -0
  249. sky/utils/command_runner.py +452 -89
  250. sky/utils/command_runner.pyi +77 -3
  251. sky/utils/common.py +54 -0
  252. sky/utils/common_utils.py +319 -108
  253. sky/utils/config_utils.py +204 -0
  254. sky/utils/control_master_utils.py +48 -0
  255. sky/utils/controller_utils.py +548 -266
  256. sky/utils/dag_utils.py +93 -32
  257. sky/utils/db_utils.py +18 -4
  258. sky/utils/env_options.py +29 -7
  259. sky/utils/kubernetes/create_cluster.sh +8 -60
  260. sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
  261. sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
  262. sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
  263. sky/utils/kubernetes/gpu_labeler.py +4 -4
  264. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
  265. sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
  266. sky/utils/kubernetes/rsync_helper.sh +24 -0
  267. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
  268. sky/utils/log_utils.py +240 -33
  269. sky/utils/message_utils.py +81 -0
  270. sky/utils/registry.py +127 -0
  271. sky/utils/resources_utils.py +94 -22
  272. sky/utils/rich_utils.py +247 -18
  273. sky/utils/schemas.py +284 -64
  274. sky/{status_lib.py → utils/status_lib.py} +12 -7
  275. sky/utils/subprocess_utils.py +212 -46
  276. sky/utils/timeline.py +12 -7
  277. sky/utils/ux_utils.py +168 -15
  278. skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
  279. skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
  280. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
  281. sky/clouds/cloud_registry.py +0 -31
  282. sky/jobs/core.py +0 -330
  283. sky/skylet/providers/azure/__init__.py +0 -2
  284. sky/skylet/providers/azure/azure-vm-template.json +0 -301
  285. sky/skylet/providers/azure/config.py +0 -170
  286. sky/skylet/providers/azure/node_provider.py +0 -466
  287. sky/skylet/providers/lambda_cloud/__init__.py +0 -2
  288. sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
  289. sky/skylet/providers/oci/__init__.py +0 -2
  290. sky/skylet/providers/oci/node_provider.py +0 -488
  291. sky/skylet/providers/oci/query_helper.py +0 -383
  292. sky/skylet/providers/oci/utils.py +0 -21
  293. sky/utils/cluster_yaml_utils.py +0 -24
  294. sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
  295. skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
  296. skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
  297. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
  298. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
  299. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
@@ -1,488 +0,0 @@
1
- """OCI Node Provider.
2
-
3
- Node provider is called by the Ray Autoscaler to provision new compute
4
- resources (head / worker nodes).
5
-
6
- To show debug messages, export SKYPILOT_DEBUG=1
7
-
8
- History:
9
- - Hysun He (hysun.he@oracle.com) @ Apr, 2023: Initial implementation
10
-
11
- """
12
-
13
- import copy
14
- from datetime import datetime
15
- import logging
16
- import threading
17
- import time
18
-
19
- from ray.autoscaler.node_provider import NodeProvider
20
- from ray.autoscaler.tags import TAG_RAY_CLUSTER_NAME
21
- from ray.autoscaler.tags import TAG_RAY_LAUNCH_CONFIG
22
- from ray.autoscaler.tags import TAG_RAY_NODE_KIND
23
- from ray.autoscaler.tags import TAG_RAY_USER_NODE_TYPE
24
-
25
- from sky.adaptors import oci as oci_adaptor
26
- from sky.clouds.utils import oci_utils
27
- from sky.skylet.providers.oci import utils
28
- from sky.skylet.providers.oci.query_helper import oci_query_helper
29
-
30
- logger = logging.getLogger(__name__)
31
-
32
-
33
def synchronized(f):
    """Decorator that runs a method while holding the instance's ``self.lock``.

    The decorated callable must be a method of an object exposing a ``lock``
    attribute that works as a context manager (e.g. ``threading.RLock``).
    The lock is released on both normal return and exception.

    Args:
        f: The method to wrap. It is called as ``f(self, *args, **kwargs)``.

    Returns:
        A wrapper with the same name and docstring as ``f``.
    """
    # Imported locally so the decorator does not depend on a file-level
    # import that this module does not have.
    import functools

    @functools.wraps(f)
    def wrapper(self, *args, **kwargs):
        with self.lock:
            return f(self, *args, **kwargs)

    return wrapper
43
-
44
-
45
class OCINodeProvider(NodeProvider):
    """Node Provider for OracleCloud (OCI).

    Instances are looked up by their freeform tags and mirrored in an
    in-memory cache (``self.cached_nodes``) keyed by instance id. Each cache
    entry is the dict produced by ``get_inst_obj``: ``id``, ``external_ip``,
    ``internal_ip``, ``tags`` and ``status``.
    """

    def __init__(self, provider_config, cluster_name):
        """Initialize the provider and warm up the node cache.

        Args:
            provider_config: Provider section of the cluster config. Must
                contain ``region``; ``cache_stopped_nodes`` defaults to True.
            cluster_name: Name of the cluster this provider manages.
        """
        NodeProvider.__init__(self, provider_config, cluster_name)
        # Re-entrant: synchronized methods call other synchronized methods.
        self.lock = threading.RLock()
        self.cached_nodes = {}
        self.cache_stopped_nodes = provider_config.get("cache_stopped_nodes",
                                                       True)
        self.region = provider_config["region"]

        # Do a read-ahead cache loading to improve performance.
        self._get_filtered_nodes({})

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------

    def _core_client(self):
        """Return the OCI core (compute) client for this provider's region."""
        return oci_adaptor.get_core_client(self.region,
                                           oci_utils.oci_config.get_profile())

    def _wait_for_state(self, instance_id, state):
        """Block until the instance's ``lifecycle_state`` equals ``state``."""
        client = self._core_client()
        oci_adaptor.oci.wait_until(
            client,
            client.get_instance(instance_id=instance_id),
            "lifecycle_state",
            state,
        )

    @staticmethod
    def _instance_info(inst_id, inst):
        """Build the ``inst_info`` dict consumed by ``get_inst_obj``."""
        return {
            "inst_id": inst_id,
            "ad": inst.availability_domain,
            "compartment": inst.compartment_id,
            "lifecycle_state": inst.lifecycle_state,
            "oci_tags": inst.freeform_tags,
        }

    @staticmethod
    def _resolve_ocpu_count(vcpu_str, instance_type_str):
        """Translate a vCPU setting into an OCPU count.

        Returns 0 when no vCPU count is configured (``None`` or the string
        ``"None"``). A positive fractional result below one OCPU is raised
        to 1 *before* rounding, so that e.g. 1 vCPU on an x86 shape does not
        round down to 0.
        """
        if vcpu_str is None or vcpu_str == "None":
            return 0

        vcpus = float(vcpu_str)
        if instance_type_str.startswith(f"{oci_utils.oci_config.VM_PREFIX}.A"):
            # For ARM cpu, 1*ocpu = 1*vcpu
            ocpus = vcpus
        else:
            # For Intel / AMD cpu, 1*ocpu = 2*vcpu
            ocpus = vcpus / 2

        if 0 < ocpus < 1:
            return 1
        return round(ocpus)

    def _restart_stopped_nodes(self, tags, filters, count):
        """Start up to ``count`` stopped nodes and return their info dicts.

        Nodes whose launch-config tag matches ``tags`` are preferred; other
        stopped nodes matching ``filters`` fill the remainder.
        """
        logger.debug("Checking existing stopped nodes.")

        filters_with_launch_config = copy.copy(filters)
        if TAG_RAY_LAUNCH_CONFIG in tags:
            filters_with_launch_config[TAG_RAY_LAUNCH_CONFIG] = tags[
                TAG_RAY_LAUNCH_CONFIG]

        nodes_matching_launch_config = self.stopped_nodes(
            filters_with_launch_config)
        logger.debug(f"Found stopped nodes (with same launch config): "
                     f"{len(nodes_matching_launch_config)}")

        if len(nodes_matching_launch_config) >= count:
            reuse_nodes = nodes_matching_launch_config[:count]
        else:
            nodes_all = self.stopped_nodes(filters)
            logger.debug(f"Found stopped nodes (regardless launch config): "
                         f"{len(nodes_all)}")
            matching_ids = {n["id"] for n in nodes_matching_launch_config}
            nodes_non_matching_launch_config = [
                n for n in nodes_all if n["id"] not in matching_ids
            ]
            reuse_nodes = (nodes_matching_launch_config +
                           nodes_non_matching_launch_config)[:count]

        logger.info(
            f"Reusing nodes {len(reuse_nodes)}: {list(reuse_nodes)}. "
            "To disable reuse, set `cache_stopped_nodes: False` "
            "under `provider` in the cluster configuration.",)

        # A node that is still stopping must reach STOPPED before START.
        for reuse_node in reuse_nodes:
            if reuse_node["status"] == "STOPPING":
                self._wait_for_state(reuse_node["id"], "STOPPED")

        start_time1 = round(time.time() * 1000)
        restarted = []
        for matched_node in reuse_nodes:
            instance_action_response = self._core_client().instance_action(
                instance_id=matched_node["id"], action="START")
            starting_inst = instance_action_response.data
            restarted.append(
                self._instance_info(starting_inst.id, starting_inst))

        launch_stopped_time = round(time.time() * 1000) - start_time1
        logger.debug(
            f"Time elapsed(Launch stopped): {launch_stopped_time} "
            "milli-seconds.")
        return restarted

    def _launch_new_nodes(self, node_config, tags, count):
        """Launch ``count`` new instances and return their info dicts.

        Raises:
            RuntimeError: If no VCN subnet can be found or created.
        """
        models = oci_adaptor.oci.core.models

        compartment = oci_query_helper.find_compartment(self.region)
        vcn = oci_query_helper.find_create_vcn_subnet(self.region)
        if vcn is None:
            raise RuntimeError("VcnSubnetNotFound Error!")

        instance_type_str = node_config["InstanceType"]
        ocpu_count = self._resolve_ocpu_count(node_config["VCPUs"],
                                              instance_type_str)

        machine_shape_config = None
        if ocpu_count > 0:
            mem = node_config["MemoryInGbs"]
            if mem is not None and mem != "None":
                machine_shape_config = models.LaunchInstanceShapeConfigDetails(
                    ocpus=ocpu_count, memory_in_gbs=mem)
            else:
                machine_shape_config = models.LaunchInstanceShapeConfigDetails(
                    ocpus=ocpu_count)

        preemptible_config = None
        if node_config["Preemptible"]:
            preemptible_config = models.PreemptibleInstanceConfigDetails(
                preemption_action=models.TerminatePreemptionAction(
                    type="TERMINATE", preserve_boot_volume=False))

        logger.debug(f"Shape: {instance_type_str}, ocpu: {ocpu_count}")
        logger.debug(f"Shape config is {machine_shape_config}")
        logger.debug(f"Spot config is {preemptible_config}")

        vm_tags = {
            **tags,
            TAG_RAY_CLUSTER_NAME: self.cluster_name,
            "sky_spot_flag": str(node_config["Preemptible"]).lower(),
        }
        # Use UTC time so that header & worker nodes use same rule
        batch_id = datetime.utcnow().strftime("%Y%m%d%H%M%S")
        node_type = tags[TAG_RAY_NODE_KIND]

        oci_query_helper.subscribe_image(
            compartment_id=compartment,
            listing_id=node_config["AppCatalogListingId"],
            resource_version=node_config["ResourceVersion"],
            region=self.region,
        )

        start_time1 = round(time.time() * 1000)
        launched = []
        for seq in range(1, count + 1):
            launch_details = models.LaunchInstanceDetails(
                availability_domain=node_config["AvailabilityDomain"],
                compartment_id=compartment,
                shape=instance_type_str,
                display_name=(
                    f"{self.cluster_name}_{node_type}_{batch_id}_{seq}"),
                freeform_tags=vm_tags,
                metadata={"ssh_authorized_keys": node_config["AuthorizedKey"]},
                source_details=models.InstanceSourceViaImageDetails(
                    source_type="image",
                    image_id=node_config["ImageId"],
                    boot_volume_size_in_gbs=node_config["BootVolumeSize"],
                    boot_volume_vpus_per_gb=int(
                        node_config["BootVolumePerf"]),
                ),
                create_vnic_details=models.CreateVnicDetails(
                    assign_public_ip=True,
                    subnet_id=vcn,
                ),
                shape_config=machine_shape_config,
                preemptible_instance_config=preemptible_config,
            )
            launch_instance_response = self._core_client().launch_instance(
                launch_instance_details=launch_details)

            new_inst = launch_instance_response.data
            launched.append(self._instance_info(new_inst.id, new_inst))

        launch_new_time = round(time.time() * 1000) - start_time1
        logger.debug(f"Time elapsed(Launch): {launch_new_time} milli-seconds.")
        return launched

    # ------------------------------------------------------------------
    # Node lookup
    # ------------------------------------------------------------------

    @synchronized
    def _get_filtered_nodes(self, tag_filters, force=False):
        """Return ``{instance_id: node}`` for nodes matching ``tag_filters``.

        Args:
            tag_filters: Tag name -> required value. The cluster-name tag is
                always added to the filter.
            force: When False, matching cached nodes are returned without a
                remote query if there is at least one. When True (or when
                nothing in the cache matches), instances are queried by tags
                and the cache entries for the returned instances are rebuilt.
        """
        # Make sure the cluster_name is always a criterion
        tag_filters = {**tag_filters, TAG_RAY_CLUSTER_NAME: self.cluster_name}

        if not force:
            # Query cache first to reduce API call.
            cached_matches = {
                node_id: node
                for node_id, node in self.cached_nodes.items()
                if all(name in node["tags"] and node["tags"][name] == value
                       for name, value in tag_filters.items())
            }
            if cached_matches:
                return cached_matches

        return_nodes = {}
        insts = oci_query_helper.query_instances_by_tags(
            tag_filters, self.region)
        for inst in insts:
            inst_id = inst.identifier
            # Drop the stale entry first so a failed refresh below does not
            # leave outdated data in the cache.
            self.cached_nodes.pop(inst_id, None)

            item = self.get_inst_obj(self._instance_info(inst_id, inst))
            return_nodes[inst_id] = item
            self.cached_nodes[inst_id] = item

        return return_nodes

    @utils.debug_enabled(logger=logger)
    def non_terminated_nodes(self, tag_filters):
        """Return a list of node ids filtered by the specified tags dict.

        This list must not include terminated nodes. For performance reasons,
        providers are allowed to cache the result of a call to
        non_terminated_nodes() to serve single-node queries
        (e.g. is_running(node_id)). This means that non_terminated_nodes()
        must be called again to refresh results.

        Only the cluster-name, node-kind, user-node-type and launch-config
        tags of ``tag_filters`` are used, and only nodes whose status is
        ``RUNNING`` are returned.
        """
        VALIDITY_TAGS = [
            TAG_RAY_CLUSTER_NAME,
            TAG_RAY_NODE_KIND,
            TAG_RAY_USER_NODE_TYPE,
            TAG_RAY_LAUNCH_CONFIG,
        ]
        filters = {
            tag: tag_filters[tag] for tag in VALIDITY_TAGS if tag in tag_filters
        }

        nodes = self._get_filtered_nodes(tag_filters=filters)
        return [k for k, v in nodes.items() if v["status"] == "RUNNING"]

    @utils.debug_enabled(logger=logger)
    def is_running(self, node_id):
        """Return whether the specified node is running.

        A node that cannot be found is reported as not running, which keeps
        this consistent with ``is_terminated`` (True for unknown nodes).
        """
        node = self._get_cached_node(node_id=node_id)
        return node is not None and node["status"] == "RUNNING"

    @utils.debug_enabled(logger=logger)
    def is_terminated(self, node_id):
        """Return whether the specified node is terminated.

        Unknown nodes and nodes in ``TERMINATED`` / ``TERMINATING`` status
        count as terminated.
        """
        node = self._get_cached_node(node_id=node_id)
        return node is None or node["status"] in ("TERMINATED", "TERMINATING")

    @utils.debug_enabled(logger=logger)
    def node_tags(self, node_id):
        """Return the tags of the given node.

        Goes through ``_get_cached_node`` so that a cache miss triggers a
        refresh instead of an immediate ``KeyError``.
        """
        return self._get_cached_node(node_id=node_id)["tags"]

    @utils.debug_enabled(logger=logger)
    def external_ip(self, node_id):
        """Returns the external ip of the given node."""
        return self._get_cached_node(node_id=node_id)["external_ip"]

    @utils.debug_enabled(logger=logger)
    def internal_ip(self, node_id):
        """Returns the internal ip (Ray ip) of the given node."""
        return self._get_cached_node(node_id=node_id)["internal_ip"]

    # ------------------------------------------------------------------
    # Node lifecycle
    # ------------------------------------------------------------------

    @synchronized
    @utils.debug_enabled(logger=logger)
    def create_node(self, node_config, tags, count):
        """Creates a number of nodes within the namespace.

        When ``cache_stopped_nodes`` is enabled, matching stopped nodes are
        restarted first; any remaining ``count`` is covered by launching new
        instances. The call blocks until every started instance is
        ``RUNNING`` and then records it in the cache.

        Args:
            node_config: Node settings read by this method: ``VCPUs``,
                ``InstanceType``, ``MemoryInGbs``, ``Preemptible``,
                ``AppCatalogListingId``, ``ResourceVersion``,
                ``AvailabilityDomain``, ``AuthorizedKey``, ``ImageId``,
                ``BootVolumeSize`` and ``BootVolumePerf``.
            tags: Tags to attach to new instances; must include the node-kind
                tag when new instances are launched.
            count: Number of nodes requested.

        Raises:
            RuntimeError: If no VCN subnet can be found or created.
        """
        start_time = round(time.time() * 1000)
        starting_insts = []
        # Check first if it is necessary to create new nodes / start stopped
        # nodes
        VALIDITY_TAGS = [
            TAG_RAY_CLUSTER_NAME,
            TAG_RAY_NODE_KIND,
            TAG_RAY_USER_NODE_TYPE,
        ]
        filters = {tag: tags[tag] for tag in VALIDITY_TAGS if tag in tags}

        # Starting stopped nodes if cache_stopped_nodes=True
        if self.cache_stopped_nodes:
            restarted = self._restart_stopped_nodes(tags, filters, count)
            starting_insts.extend(restarted)
            count -= len(restarted)

        # Let's create additional new nodes (if necessary)
        if count > 0:
            starting_insts.extend(
                self._launch_new_nodes(node_config, tags, count))

        for ninst in starting_insts:
            # Waiting for the instance to be RUNNING state
            self._wait_for_state(ninst["inst_id"], "RUNNING")
            ninst["lifecycle_state"] = "RUNNING"
            self.cached_nodes[ninst["inst_id"]] = self.get_inst_obj(ninst)

        total_time = round(time.time() * 1000) - start_time
        logger.debug(f"Total time elapsed: {total_time} milli-seconds.")

    def get_inst_obj(self, inst_info):
        """Build the cached node record for an instance.

        Args:
            inst_info: Dict with ``inst_id``, ``ad``, ``compartment``,
                ``lifecycle_state`` and ``oci_tags``.

        Returns:
            Dict with ``id``, ``external_ip``, ``internal_ip``, ``tags`` and
            ``status``. The IPs come from the first VNIC attachment; when the
            VNIC has no public IP, the private IP is used as ``external_ip``.
        """
        list_vnic_attachments_response = (
            self._core_client().list_vnic_attachments(
                availability_domain=inst_info["ad"],
                compartment_id=inst_info["compartment"],
                instance_id=inst_info["inst_id"],
            ))

        vnic = list_vnic_attachments_response.data[0]
        net_client = oci_adaptor.get_net_client(
            self.region, oci_utils.oci_config.get_profile())
        vnic_details = net_client.get_vnic(vnic_id=vnic.vnic_id).data

        internal_ip = vnic_details.private_ip
        external_ip = vnic_details.public_ip
        if external_ip is None:
            external_ip = internal_ip

        return {
            "id": inst_info["inst_id"],
            "external_ip": external_ip,
            "internal_ip": internal_ip,
            "tags": inst_info["oci_tags"],
            "status": inst_info["lifecycle_state"],
        }

    @synchronized
    @utils.debug_enabled(logger=logger)
    def set_node_tags(self, node_id, tags):
        """Merge ``tags`` into the node's tags, in the cache and on OCI.

        The cache is updated first. The remote update is retried up to
        ``MAX_RETRY_COUNT`` times with a linearly growing wait; if every
        attempt fails, an error is logged and the method returns.
        """
        existing_tags = self._get_cached_node(node_id)["tags"]
        combined_tags = dict(existing_tags, **tags)
        self.cached_nodes[node_id]["tags"] = combined_tags

        max_attempts = oci_utils.oci_config.MAX_RETRY_COUNT
        for attempt in range(1, max_attempts + 1):
            try:
                self._core_client().update_instance(
                    instance_id=node_id,
                    update_instance_details=oci_adaptor.oci.core.models.
                    UpdateInstanceDetails(freeform_tags=combined_tags),
                )
                logger.info(f"Tags are well set for node {node_id}")
                break
            except Exception as e:  # best-effort retry on any failure
                wait_seconds = (
                    oci_utils.oci_config.RETRY_INTERVAL_BASE_SECONDS * attempt)
                logger.warning(
                    f"Not ready yet, wait {wait_seconds} seconds & retry!")
                logger.warning(f"Exception message is {str(e)}")
                time.sleep(wait_seconds)
        else:
            logger.error(f"Failed to set tags for node {node_id} after "
                         f"{max_attempts} attempts.")

    @synchronized
    def terminate_node(self, node_id):
        """Terminates the specified node.

        Non-preemptible nodes are only stopped when ``cache_stopped_nodes``
        is enabled; otherwise (and always for nodes tagged
        ``sky_spot_flag == "true"``) the instance is terminated and dropped
        from the cache. Unknown nodes are ignored.
        """
        logger.info(f"terminate_node {node_id}...")
        node = self._get_cached_node(node_id)
        if node is None:
            logger.info(f"The node is not existed: {node_id}..")
            return  # Node not exists yet.

        # A node without the tag is treated as non-preemptible.
        spot_flag = node["tags"].get("sky_spot_flag")
        logger.debug(f"sky_spot_flag: {spot_flag}")
        is_preemptible = str(spot_flag) == "true"

        if self.cache_stopped_nodes and not is_preemptible:
            logger.info(f"Stopping instance {node_id}"
                        "(to fully terminate instead, "
                        "set `cache_stopped_nodes: False` "
                        "under `provider` in the cluster configuration)")
            instance_action_response = self._core_client().instance_action(
                instance_id=node_id, action="STOP")
            logger.info(
                f"Stopped the instance {instance_action_response.data.id}")
            if node_id in self.cached_nodes:
                self.cached_nodes[node_id]["status"] = "STOPPED"
            state_word = "Stopped"
        else:
            terminate_instance_response = (
                self._core_client().terminate_instance(node_id))
            logger.debug(terminate_instance_response.data)
            self.cached_nodes.pop(node_id, None)
            state_word = "Terminated"

        logger.info(
            f"{state_word} {node_id} w/ sky_spot_flag: {is_preemptible}.")

    def _get_node(self, node_id):
        """Refresh the cache from OCI and return the node, or None."""
        # All except for those terminated.
        self._get_filtered_nodes({}, force=True)
        return self.cached_nodes.get(node_id, None)

    def _get_cached_node(self, node_id):
        """Return the cached node, refreshing the cache on a miss.

        Returns None when the node is still unknown after the refresh.
        """
        if node_id in self.cached_nodes:
            return self.cached_nodes[node_id]
        return self._get_node(node_id=node_id)

    def stopped_nodes(self, tag_filters):
        """Return a list of stopped nodes filtered by the specified tags dict.

        Always bypasses the cache; nodes in ``STOPPED`` or ``STOPPING``
        status are returned as node records (not ids).
        """
        nodes = self._get_filtered_nodes(tag_filters=tag_filters, force=True)
        return [
            node for node in nodes.values()
            if node["status"] in ("STOPPED", "STOPPING")
        ]

    def running_nodes(self, tag_filters):
        """Return a list of running node ids filtered by the specified tags dict."""
        nodes = self._get_filtered_nodes(tag_filters=tag_filters)
        return [k for k, v in nodes.items() if v["status"] == "RUNNING"]