skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (299)
  1. sky/__init__.py +64 -32
  2. sky/adaptors/aws.py +23 -6
  3. sky/adaptors/azure.py +432 -15
  4. sky/adaptors/cloudflare.py +5 -5
  5. sky/adaptors/common.py +19 -9
  6. sky/adaptors/do.py +20 -0
  7. sky/adaptors/gcp.py +3 -2
  8. sky/adaptors/kubernetes.py +122 -88
  9. sky/adaptors/nebius.py +100 -0
  10. sky/adaptors/oci.py +39 -1
  11. sky/adaptors/vast.py +29 -0
  12. sky/admin_policy.py +101 -0
  13. sky/authentication.py +117 -98
  14. sky/backends/backend.py +52 -20
  15. sky/backends/backend_utils.py +669 -557
  16. sky/backends/cloud_vm_ray_backend.py +1099 -808
  17. sky/backends/local_docker_backend.py +14 -8
  18. sky/backends/wheel_utils.py +38 -20
  19. sky/benchmark/benchmark_utils.py +22 -23
  20. sky/check.py +76 -27
  21. sky/cli.py +1586 -1139
  22. sky/client/__init__.py +1 -0
  23. sky/client/cli.py +5683 -0
  24. sky/client/common.py +345 -0
  25. sky/client/sdk.py +1765 -0
  26. sky/cloud_stores.py +283 -19
  27. sky/clouds/__init__.py +7 -2
  28. sky/clouds/aws.py +303 -112
  29. sky/clouds/azure.py +185 -179
  30. sky/clouds/cloud.py +115 -37
  31. sky/clouds/cudo.py +29 -22
  32. sky/clouds/do.py +313 -0
  33. sky/clouds/fluidstack.py +44 -54
  34. sky/clouds/gcp.py +206 -65
  35. sky/clouds/ibm.py +26 -21
  36. sky/clouds/kubernetes.py +345 -91
  37. sky/clouds/lambda_cloud.py +40 -29
  38. sky/clouds/nebius.py +297 -0
  39. sky/clouds/oci.py +129 -90
  40. sky/clouds/paperspace.py +22 -18
  41. sky/clouds/runpod.py +53 -34
  42. sky/clouds/scp.py +28 -24
  43. sky/clouds/service_catalog/__init__.py +19 -13
  44. sky/clouds/service_catalog/aws_catalog.py +29 -12
  45. sky/clouds/service_catalog/azure_catalog.py +33 -6
  46. sky/clouds/service_catalog/common.py +95 -75
  47. sky/clouds/service_catalog/constants.py +3 -3
  48. sky/clouds/service_catalog/cudo_catalog.py +13 -3
  49. sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
  50. sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
  51. sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
  52. sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
  53. sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
  54. sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
  55. sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
  56. sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
  57. sky/clouds/service_catalog/do_catalog.py +111 -0
  58. sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
  59. sky/clouds/service_catalog/gcp_catalog.py +16 -2
  60. sky/clouds/service_catalog/ibm_catalog.py +2 -2
  61. sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
  62. sky/clouds/service_catalog/lambda_catalog.py +8 -3
  63. sky/clouds/service_catalog/nebius_catalog.py +116 -0
  64. sky/clouds/service_catalog/oci_catalog.py +31 -4
  65. sky/clouds/service_catalog/paperspace_catalog.py +2 -2
  66. sky/clouds/service_catalog/runpod_catalog.py +2 -2
  67. sky/clouds/service_catalog/scp_catalog.py +2 -2
  68. sky/clouds/service_catalog/vast_catalog.py +104 -0
  69. sky/clouds/service_catalog/vsphere_catalog.py +2 -2
  70. sky/clouds/utils/aws_utils.py +65 -0
  71. sky/clouds/utils/azure_utils.py +91 -0
  72. sky/clouds/utils/gcp_utils.py +5 -9
  73. sky/clouds/utils/oci_utils.py +47 -5
  74. sky/clouds/utils/scp_utils.py +4 -3
  75. sky/clouds/vast.py +280 -0
  76. sky/clouds/vsphere.py +22 -18
  77. sky/core.py +361 -107
  78. sky/dag.py +41 -28
  79. sky/data/data_transfer.py +37 -0
  80. sky/data/data_utils.py +211 -32
  81. sky/data/mounting_utils.py +182 -30
  82. sky/data/storage.py +2118 -270
  83. sky/data/storage_utils.py +126 -5
  84. sky/exceptions.py +179 -8
  85. sky/execution.py +158 -85
  86. sky/global_user_state.py +150 -34
  87. sky/jobs/__init__.py +12 -10
  88. sky/jobs/client/__init__.py +0 -0
  89. sky/jobs/client/sdk.py +302 -0
  90. sky/jobs/constants.py +49 -11
  91. sky/jobs/controller.py +161 -99
  92. sky/jobs/dashboard/dashboard.py +171 -25
  93. sky/jobs/dashboard/templates/index.html +572 -60
  94. sky/jobs/recovery_strategy.py +157 -156
  95. sky/jobs/scheduler.py +307 -0
  96. sky/jobs/server/__init__.py +1 -0
  97. sky/jobs/server/core.py +598 -0
  98. sky/jobs/server/dashboard_utils.py +69 -0
  99. sky/jobs/server/server.py +190 -0
  100. sky/jobs/state.py +627 -122
  101. sky/jobs/utils.py +615 -206
  102. sky/models.py +27 -0
  103. sky/optimizer.py +142 -83
  104. sky/provision/__init__.py +20 -5
  105. sky/provision/aws/config.py +124 -42
  106. sky/provision/aws/instance.py +130 -53
  107. sky/provision/azure/__init__.py +7 -0
  108. sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
  109. sky/provision/azure/config.py +220 -0
  110. sky/provision/azure/instance.py +1012 -37
  111. sky/provision/common.py +31 -3
  112. sky/provision/constants.py +25 -0
  113. sky/provision/cudo/__init__.py +2 -1
  114. sky/provision/cudo/cudo_utils.py +112 -0
  115. sky/provision/cudo/cudo_wrapper.py +37 -16
  116. sky/provision/cudo/instance.py +28 -12
  117. sky/provision/do/__init__.py +11 -0
  118. sky/provision/do/config.py +14 -0
  119. sky/provision/do/constants.py +10 -0
  120. sky/provision/do/instance.py +287 -0
  121. sky/provision/do/utils.py +301 -0
  122. sky/provision/docker_utils.py +82 -46
  123. sky/provision/fluidstack/fluidstack_utils.py +57 -125
  124. sky/provision/fluidstack/instance.py +15 -43
  125. sky/provision/gcp/config.py +19 -9
  126. sky/provision/gcp/constants.py +7 -1
  127. sky/provision/gcp/instance.py +55 -34
  128. sky/provision/gcp/instance_utils.py +339 -80
  129. sky/provision/gcp/mig_utils.py +210 -0
  130. sky/provision/instance_setup.py +172 -133
  131. sky/provision/kubernetes/__init__.py +1 -0
  132. sky/provision/kubernetes/config.py +104 -90
  133. sky/provision/kubernetes/constants.py +8 -0
  134. sky/provision/kubernetes/instance.py +680 -325
  135. sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
  136. sky/provision/kubernetes/network.py +54 -20
  137. sky/provision/kubernetes/network_utils.py +70 -21
  138. sky/provision/kubernetes/utils.py +1370 -251
  139. sky/provision/lambda_cloud/__init__.py +11 -0
  140. sky/provision/lambda_cloud/config.py +10 -0
  141. sky/provision/lambda_cloud/instance.py +265 -0
  142. sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
  143. sky/provision/logging.py +1 -1
  144. sky/provision/nebius/__init__.py +11 -0
  145. sky/provision/nebius/config.py +11 -0
  146. sky/provision/nebius/instance.py +285 -0
  147. sky/provision/nebius/utils.py +318 -0
  148. sky/provision/oci/__init__.py +15 -0
  149. sky/provision/oci/config.py +51 -0
  150. sky/provision/oci/instance.py +436 -0
  151. sky/provision/oci/query_utils.py +681 -0
  152. sky/provision/paperspace/constants.py +6 -0
  153. sky/provision/paperspace/instance.py +4 -3
  154. sky/provision/paperspace/utils.py +2 -0
  155. sky/provision/provisioner.py +207 -130
  156. sky/provision/runpod/__init__.py +1 -0
  157. sky/provision/runpod/api/__init__.py +3 -0
  158. sky/provision/runpod/api/commands.py +119 -0
  159. sky/provision/runpod/api/pods.py +142 -0
  160. sky/provision/runpod/instance.py +64 -8
  161. sky/provision/runpod/utils.py +239 -23
  162. sky/provision/vast/__init__.py +10 -0
  163. sky/provision/vast/config.py +11 -0
  164. sky/provision/vast/instance.py +247 -0
  165. sky/provision/vast/utils.py +162 -0
  166. sky/provision/vsphere/common/vim_utils.py +1 -1
  167. sky/provision/vsphere/instance.py +8 -18
  168. sky/provision/vsphere/vsphere_utils.py +1 -1
  169. sky/resources.py +247 -102
  170. sky/serve/__init__.py +9 -9
  171. sky/serve/autoscalers.py +361 -299
  172. sky/serve/client/__init__.py +0 -0
  173. sky/serve/client/sdk.py +366 -0
  174. sky/serve/constants.py +12 -3
  175. sky/serve/controller.py +106 -36
  176. sky/serve/load_balancer.py +63 -12
  177. sky/serve/load_balancing_policies.py +84 -2
  178. sky/serve/replica_managers.py +42 -34
  179. sky/serve/serve_state.py +62 -32
  180. sky/serve/serve_utils.py +271 -160
  181. sky/serve/server/__init__.py +0 -0
  182. sky/serve/{core.py → server/core.py} +271 -90
  183. sky/serve/server/server.py +112 -0
  184. sky/serve/service.py +52 -16
  185. sky/serve/service_spec.py +95 -32
  186. sky/server/__init__.py +1 -0
  187. sky/server/common.py +430 -0
  188. sky/server/constants.py +21 -0
  189. sky/server/html/log.html +174 -0
  190. sky/server/requests/__init__.py +0 -0
  191. sky/server/requests/executor.py +472 -0
  192. sky/server/requests/payloads.py +487 -0
  193. sky/server/requests/queues/__init__.py +0 -0
  194. sky/server/requests/queues/mp_queue.py +76 -0
  195. sky/server/requests/requests.py +567 -0
  196. sky/server/requests/serializers/__init__.py +0 -0
  197. sky/server/requests/serializers/decoders.py +192 -0
  198. sky/server/requests/serializers/encoders.py +166 -0
  199. sky/server/server.py +1106 -0
  200. sky/server/stream_utils.py +141 -0
  201. sky/setup_files/MANIFEST.in +2 -5
  202. sky/setup_files/dependencies.py +159 -0
  203. sky/setup_files/setup.py +14 -125
  204. sky/sky_logging.py +59 -14
  205. sky/skylet/autostop_lib.py +2 -2
  206. sky/skylet/constants.py +183 -50
  207. sky/skylet/events.py +22 -10
  208. sky/skylet/job_lib.py +403 -258
  209. sky/skylet/log_lib.py +111 -71
  210. sky/skylet/log_lib.pyi +6 -0
  211. sky/skylet/providers/command_runner.py +6 -8
  212. sky/skylet/providers/ibm/node_provider.py +2 -2
  213. sky/skylet/providers/scp/config.py +11 -3
  214. sky/skylet/providers/scp/node_provider.py +8 -8
  215. sky/skylet/skylet.py +3 -1
  216. sky/skylet/subprocess_daemon.py +69 -17
  217. sky/skypilot_config.py +119 -57
  218. sky/task.py +205 -64
  219. sky/templates/aws-ray.yml.j2 +37 -7
  220. sky/templates/azure-ray.yml.j2 +27 -82
  221. sky/templates/cudo-ray.yml.j2 +7 -3
  222. sky/templates/do-ray.yml.j2 +98 -0
  223. sky/templates/fluidstack-ray.yml.j2 +7 -4
  224. sky/templates/gcp-ray.yml.j2 +26 -6
  225. sky/templates/ibm-ray.yml.j2 +3 -2
  226. sky/templates/jobs-controller.yaml.j2 +46 -11
  227. sky/templates/kubernetes-ingress.yml.j2 +7 -0
  228. sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
  229. sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
  230. sky/templates/kubernetes-ray.yml.j2 +292 -25
  231. sky/templates/lambda-ray.yml.j2 +30 -40
  232. sky/templates/nebius-ray.yml.j2 +79 -0
  233. sky/templates/oci-ray.yml.j2 +18 -57
  234. sky/templates/paperspace-ray.yml.j2 +10 -6
  235. sky/templates/runpod-ray.yml.j2 +26 -4
  236. sky/templates/scp-ray.yml.j2 +3 -2
  237. sky/templates/sky-serve-controller.yaml.j2 +12 -1
  238. sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
  239. sky/templates/vast-ray.yml.j2 +70 -0
  240. sky/templates/vsphere-ray.yml.j2 +8 -3
  241. sky/templates/websocket_proxy.py +64 -0
  242. sky/usage/constants.py +10 -1
  243. sky/usage/usage_lib.py +130 -37
  244. sky/utils/accelerator_registry.py +35 -51
  245. sky/utils/admin_policy_utils.py +147 -0
  246. sky/utils/annotations.py +51 -0
  247. sky/utils/cli_utils/status_utils.py +81 -23
  248. sky/utils/cluster_utils.py +356 -0
  249. sky/utils/command_runner.py +452 -89
  250. sky/utils/command_runner.pyi +77 -3
  251. sky/utils/common.py +54 -0
  252. sky/utils/common_utils.py +319 -108
  253. sky/utils/config_utils.py +204 -0
  254. sky/utils/control_master_utils.py +48 -0
  255. sky/utils/controller_utils.py +548 -266
  256. sky/utils/dag_utils.py +93 -32
  257. sky/utils/db_utils.py +18 -4
  258. sky/utils/env_options.py +29 -7
  259. sky/utils/kubernetes/create_cluster.sh +8 -60
  260. sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
  261. sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
  262. sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
  263. sky/utils/kubernetes/gpu_labeler.py +4 -4
  264. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
  265. sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
  266. sky/utils/kubernetes/rsync_helper.sh +24 -0
  267. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
  268. sky/utils/log_utils.py +240 -33
  269. sky/utils/message_utils.py +81 -0
  270. sky/utils/registry.py +127 -0
  271. sky/utils/resources_utils.py +94 -22
  272. sky/utils/rich_utils.py +247 -18
  273. sky/utils/schemas.py +284 -64
  274. sky/{status_lib.py → utils/status_lib.py} +12 -7
  275. sky/utils/subprocess_utils.py +212 -46
  276. sky/utils/timeline.py +12 -7
  277. sky/utils/ux_utils.py +168 -15
  278. skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
  279. skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
  280. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
  281. sky/clouds/cloud_registry.py +0 -31
  282. sky/jobs/core.py +0 -330
  283. sky/skylet/providers/azure/__init__.py +0 -2
  284. sky/skylet/providers/azure/azure-vm-template.json +0 -301
  285. sky/skylet/providers/azure/config.py +0 -170
  286. sky/skylet/providers/azure/node_provider.py +0 -466
  287. sky/skylet/providers/lambda_cloud/__init__.py +0 -2
  288. sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
  289. sky/skylet/providers/oci/__init__.py +0 -2
  290. sky/skylet/providers/oci/node_provider.py +0 -488
  291. sky/skylet/providers/oci/query_helper.py +0 -383
  292. sky/skylet/providers/oci/utils.py +0 -21
  293. sky/utils/cluster_yaml_utils.py +0 -24
  294. sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
  295. skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
  296. skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
  297. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
  298. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
  299. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
@@ -1,320 +0,0 @@
1
- import logging
2
- import os
3
- from threading import RLock
4
- import time
5
- from typing import Any, Dict, List, Optional
6
-
7
- from ray.autoscaler.node_provider import NodeProvider
8
- from ray.autoscaler.tags import NODE_KIND_HEAD
9
- from ray.autoscaler.tags import NODE_KIND_WORKER
10
- from ray.autoscaler.tags import STATUS_UP_TO_DATE
11
- from ray.autoscaler.tags import TAG_RAY_CLUSTER_NAME
12
- from ray.autoscaler.tags import TAG_RAY_NODE_KIND
13
- from ray.autoscaler.tags import TAG_RAY_NODE_NAME
14
- from ray.autoscaler.tags import TAG_RAY_NODE_STATUS
15
- from ray.autoscaler.tags import TAG_RAY_USER_NODE_TYPE
16
-
17
- from sky import authentication as auth
18
- from sky.clouds.utils import lambda_utils
19
- from sky.utils import command_runner
20
- from sky.utils import common_utils
21
- from sky.utils import subprocess_utils
22
- from sky.utils import ux_utils
23
-
24
- _TAG_PATH_PREFIX = '~/.sky/generated/lambda_cloud/metadata'
25
- _REMOTE_SSH_KEY_NAME = '~/.lambda_cloud/ssh_key_name'
26
- _REMOTE_RAY_SSH_KEY = '~/ray_bootstrap_key.pem'
27
- _REMOTE_RAY_YAML = '~/ray_bootstrap_config.yaml'
28
- _GET_INTERNAL_IP_CMD = 'ip -4 -br addr show | grep UP | grep -Eo "(10\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)|172\.(1[6-9]|2[0-9]|3[0-1]))\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)"'
29
-
30
- logger = logging.getLogger(__name__)
31
-
32
-
33
- def synchronized(f):
34
-
35
- def wrapper(self, *args, **kwargs):
36
- self.lock.acquire()
37
- try:
38
- return f(self, *args, **kwargs)
39
- finally:
40
- self.lock.release()
41
-
42
- return wrapper
43
-
44
-
45
- class LambdaNodeProvider(NodeProvider):
46
- """Node Provider for Lambda Cloud.
47
-
48
- This provider assumes Lambda Cloud credentials are set.
49
- """
50
-
51
- def __init__(self, provider_config: Dict[str, Any],
52
- cluster_name: str) -> None:
53
- NodeProvider.__init__(self, provider_config, cluster_name)
54
- self.lock = RLock()
55
- self.lambda_client = lambda_utils.LambdaCloudClient()
56
- self.cached_nodes: Dict[str, Dict[str, Any]] = {}
57
- self.metadata = lambda_utils.Metadata(_TAG_PATH_PREFIX, cluster_name)
58
- self.ssh_key_path = os.path.expanduser(auth.PRIVATE_SSH_KEY_PATH)
59
-
60
- def _get_ssh_key_name(prefix: str) -> str:
61
- public_key_path = os.path.expanduser(auth.PUBLIC_SSH_KEY_PATH)
62
- with open(public_key_path, 'r') as f:
63
- public_key = f.read()
64
- name, exists = self.lambda_client.get_unique_ssh_key_name(
65
- prefix, public_key)
66
- if not exists:
67
- raise lambda_utils.LambdaCloudError('SSH key not found')
68
- return name
69
-
70
- ray_yaml_path = os.path.expanduser(_REMOTE_RAY_YAML)
71
- self.on_head = (os.path.exists(ray_yaml_path) and
72
- common_utils.read_yaml(ray_yaml_path)['cluster_name']
73
- == cluster_name)
74
-
75
- if self.on_head:
76
- self.ssh_key_path = os.path.expanduser(_REMOTE_RAY_SSH_KEY)
77
- ssh_key_name_path = os.path.expanduser(_REMOTE_SSH_KEY_NAME)
78
- if os.path.exists(ssh_key_name_path):
79
- with open(ssh_key_name_path, 'r') as f:
80
- self.ssh_key_name = f.read()
81
- else:
82
- # At this point, `~/.ssh/sky-key.pub` contains the public
83
- # key used to launch this cluster. Use it to determine
84
- # ssh key name and store the name in _REMOTE_SSH_KEY_NAME.
85
- # Note: this case only runs during cluster launch, so it is
86
- # not possible for ~/.ssh/sky-key.pub to already be regenerated
87
- # by the user.
88
- self.ssh_key_name = _get_ssh_key_name('')
89
- with open(ssh_key_name_path, 'w', encoding='utf-8') as f:
90
- f.write(self.ssh_key_name)
91
- else:
92
- # On local
93
- self.ssh_key_name = _get_ssh_key_name(
94
- f'sky-key-{common_utils.get_user_hash()}')
95
-
96
- def _guess_and_add_missing_tags(self, vms: List[Dict[str, Any]]) -> None:
97
- """Adds missing vms to local tag file and guesses their tags."""
98
- for node in vms:
99
- if self.metadata.get(node['id']) is not None:
100
- pass
101
- elif node['name'] == f'{self.cluster_name}-head':
102
- self.metadata.set(
103
- node['id'], {
104
- 'tags': {
105
- TAG_RAY_CLUSTER_NAME: self.cluster_name,
106
- TAG_RAY_NODE_STATUS: STATUS_UP_TO_DATE,
107
- TAG_RAY_NODE_KIND: NODE_KIND_HEAD,
108
- TAG_RAY_USER_NODE_TYPE: 'ray_head_default',
109
- TAG_RAY_NODE_NAME: f'ray-{self.cluster_name}-head',
110
- }
111
- })
112
- elif node['name'] == f'{self.cluster_name}-worker':
113
- self.metadata.set(
114
- node['id'], {
115
- 'tags': {
116
- TAG_RAY_CLUSTER_NAME: self.cluster_name,
117
- TAG_RAY_NODE_STATUS: STATUS_UP_TO_DATE,
118
- TAG_RAY_NODE_KIND: NODE_KIND_WORKER,
119
- TAG_RAY_USER_NODE_TYPE: 'ray_worker_default',
120
- TAG_RAY_NODE_NAME: f'ray-{self.cluster_name}-worker',
121
- }
122
- })
123
-
124
- def _list_instances_in_cluster(self) -> List[Dict[str, Any]]:
125
- """List running instances in cluster."""
126
- vms = self.lambda_client.list_instances()
127
- possible_names = [
128
- f'{self.cluster_name}-head', f'{self.cluster_name}-worker'
129
- ]
130
- return [node for node in vms if node.get('name') in possible_names]
131
-
132
- @synchronized
133
- def _get_filtered_nodes(self, tag_filters: Dict[str,
134
- str]) -> Dict[str, Any]:
135
-
136
- def _extract_metadata(vm: Dict[str, Any]) -> Dict[str, Any]:
137
- metadata = {'id': vm['id'], 'status': vm['status'], 'tags': {}}
138
- instance_info = self.metadata.get(vm['id'])
139
- if instance_info is not None:
140
- metadata['tags'] = instance_info['tags']
141
- metadata['external_ip'] = vm.get('ip')
142
- return metadata
143
-
144
- def _match_tags(vm: Dict[str, Any]):
145
- vm_info = self.metadata.get(vm['id'])
146
- tags = {} if vm_info is None else vm_info['tags']
147
- for k, v in tag_filters.items():
148
- if tags.get(k) != v:
149
- return False
150
- return True
151
-
152
- def _get_internal_ip(node: Dict[str, Any]):
153
- # TODO(ewzeng): cache internal ips in metadata file to reduce
154
- # ssh overhead.
155
- if node['external_ip'] is None or node['status'] != 'active':
156
- node['internal_ip'] = None
157
- return
158
- runner = command_runner.SSHCommandRunner(
159
- node=(node['external_ip'], 22),
160
- ssh_user='ubuntu',
161
- ssh_private_key=self.ssh_key_path)
162
- rc, stdout, stderr = runner.run(_GET_INTERNAL_IP_CMD,
163
- require_outputs=True,
164
- stream_logs=False)
165
- subprocess_utils.handle_returncode(
166
- rc,
167
- _GET_INTERNAL_IP_CMD,
168
- 'Failed get obtain private IP from node',
169
- stderr=stdout + stderr)
170
- node['internal_ip'] = stdout.strip()
171
-
172
- vms = self._list_instances_in_cluster()
173
- self.metadata.refresh([node['id'] for node in vms])
174
- self._guess_and_add_missing_tags(vms)
175
- nodes = [_extract_metadata(vm) for vm in filter(_match_tags, vms)]
176
- nodes = [
177
- node for node in nodes
178
- if node['status'] not in ['terminating', 'terminated']
179
- ]
180
- subprocess_utils.run_in_parallel(_get_internal_ip, nodes)
181
- self.cached_nodes = {node['id']: node for node in nodes}
182
- return self.cached_nodes
183
-
184
- def non_terminated_nodes(self, tag_filters: Dict[str, str]) -> List[str]:
185
- """Return a list of node ids filtered by the specified tags dict.
186
-
187
- This list must not include terminated nodes. For performance reasons,
188
- providers are allowed to cache the result of a call to
189
- non_terminated_nodes() to serve single-node queries
190
- (e.g. is_running(node_id)). This means that non_terminated_nodes() must
191
- be called again to refresh results.
192
-
193
- Examples:
194
- >>> provider.non_terminated_nodes({TAG_RAY_NODE_KIND: "worker"})
195
- ["node-1", "node-2"]
196
- """
197
- nodes = self._get_filtered_nodes(tag_filters=tag_filters)
198
- return [k for k, _ in nodes.items()]
199
-
200
- def is_running(self, node_id: str) -> bool:
201
- """Return whether the specified node is running."""
202
- return self._get_cached_node(node_id=node_id) is not None
203
-
204
- def is_terminated(self, node_id: str) -> bool:
205
- """Return whether the specified node is terminated."""
206
- return self._get_cached_node(node_id=node_id) is None
207
-
208
- def node_tags(self, node_id: str) -> Dict[str, str]:
209
- """Returns the tags of the given node (string dict)."""
210
- node = self._get_cached_node(node_id=node_id)
211
- if node is None:
212
- return {}
213
- return node['tags']
214
-
215
- def external_ip(self, node_id: str) -> Optional[str]:
216
- """Returns the external ip of the given node."""
217
- node = self._get_cached_node(node_id=node_id)
218
- if node is None:
219
- return None
220
- ip = node.get('external_ip')
221
- with ux_utils.print_exception_no_traceback():
222
- if ip is None:
223
- raise lambda_utils.LambdaCloudError(
224
- 'A node ip address was not found. Either '
225
- '(1) Lambda Cloud has internally errored, or '
226
- '(2) the cluster is still booting. '
227
- 'You can manually terminate the cluster on the '
228
- 'Lambda Cloud console or (in case 2) wait for '
229
- 'booting to finish (~2 minutes).')
230
- return ip
231
-
232
- def internal_ip(self, node_id: str) -> Optional[str]:
233
- """Returns the internal ip (Ray ip) of the given node."""
234
- node = self._get_cached_node(node_id=node_id)
235
- if node is None:
236
- return None
237
- ip = node.get('internal_ip')
238
- with ux_utils.print_exception_no_traceback():
239
- if ip is None:
240
- raise lambda_utils.LambdaCloudError(
241
- 'A node ip address was not found. Either '
242
- '(1) Lambda Cloud has internally errored, or '
243
- '(2) the cluster is still booting. '
244
- 'You can manually terminate the cluster on the '
245
- 'Lambda Cloud console or (in case 2) wait for '
246
- 'booting to finish (~2 minutes).')
247
- return ip
248
-
249
- def create_node(self, node_config: Dict[str, Any], tags: Dict[str, str],
250
- count: int) -> None:
251
- """Creates a number of nodes within the namespace."""
252
- # Get tags
253
- config_tags = node_config.get('tags', {}).copy()
254
- config_tags.update(tags)
255
- config_tags[TAG_RAY_CLUSTER_NAME] = self.cluster_name
256
-
257
- # Create nodes
258
- instance_type = node_config['InstanceType']
259
- region = self.provider_config['region']
260
-
261
- if config_tags[TAG_RAY_NODE_KIND] == NODE_KIND_HEAD:
262
- name = f'{self.cluster_name}-head'
263
- # Occasionally, the head node will continue running for a short
264
- # period after termination. This can lead to the following bug:
265
- # 1. Head node autodowns but continues running.
266
- # 2. The next autodown event is triggered, which executes ray up.
267
- # 3. Head node stops running.
268
- # In this case, a new head node is created after the cluster has
269
- # terminated. We avoid this with the following check:
270
- if self.on_head:
271
- raise lambda_utils.LambdaCloudError('Head already exists.')
272
- else:
273
- name = f'{self.cluster_name}-worker'
274
-
275
- # Lambda launch api only supports launching one node at a time,
276
- # so we do a loop. Remove loop when launch api allows quantity > 1
277
- booting_list = []
278
- for _ in range(count):
279
- vm_id = self.lambda_client.create_instances(
280
- instance_type=instance_type,
281
- region=region,
282
- quantity=1,
283
- name=name,
284
- ssh_key_name=self.ssh_key_name)[0]
285
- self.metadata.set(vm_id, {'tags': config_tags})
286
- booting_list.append(vm_id)
287
- time.sleep(10) # Avoid api rate limits
288
-
289
- # Wait for nodes to finish booting
290
- while True:
291
- vms = self._list_instances_in_cluster()
292
- for vm_id in booting_list.copy():
293
- for vm in vms:
294
- if vm['id'] == vm_id and vm['status'] == 'active':
295
- booting_list.remove(vm_id)
296
- if len(booting_list) == 0:
297
- return
298
- time.sleep(10)
299
-
300
- @synchronized
301
- def set_node_tags(self, node_id: str, tags: Dict[str, str]) -> None:
302
- """Sets the tag values (string dict) for the specified node."""
303
- node = self._get_node(node_id)
304
- assert node is not None, node_id
305
- node['tags'].update(tags)
306
- self.metadata.set(node_id, {'tags': node['tags']})
307
-
308
- def terminate_node(self, node_id: str) -> None:
309
- """Terminates the specified node."""
310
- self.lambda_client.remove_instances(node_id)
311
- self.metadata.set(node_id, None)
312
-
313
- def _get_node(self, node_id: str) -> Optional[Dict[str, Any]]:
314
- self._get_filtered_nodes({}) # Side effect: updates cache
315
- return self.cached_nodes.get(node_id, None)
316
-
317
- def _get_cached_node(self, node_id: str) -> Optional[Dict[str, Any]]:
318
- if node_id in self.cached_nodes:
319
- return self.cached_nodes[node_id]
320
- return self._get_node(node_id=node_id)
@@ -1,2 +0,0 @@
1
- """OCI node provider"""
2
- from sky.skylet.providers.oci.node_provider import OCINodeProvider