skypilot-nightly 1.0.0.dev20250616__py3-none-any.whl → 1.0.0.dev20250618__py3-none-any.whl
This diff compares two publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
- sky/__init__.py +2 -4
- sky/backends/backend_utils.py +7 -0
- sky/backends/cloud_vm_ray_backend.py +91 -96
- sky/cli.py +5 -6311
- sky/client/cli.py +66 -639
- sky/client/sdk.py +22 -2
- sky/clouds/kubernetes.py +8 -0
- sky/clouds/scp.py +7 -26
- sky/clouds/utils/scp_utils.py +177 -124
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/{OZxMW3bxAJmqgn5f4MdhO → LRpGymRCqq-feuFyoWz4m}/_buildManifest.js +1 -1
- sky/dashboard/out/_next/static/chunks/641.c8e452bc5070a630.js +1 -0
- sky/dashboard/out/_next/static/chunks/984.ae8c08791d274ca0.js +50 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-36bc0962129f72df.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-cf490d1fa38f3740.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/users-928edf039219e47b.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-ebc2404fd6ce581c.js +1 -0
- sky/dashboard/out/_next/static/css/6c12ecc3bd2239b6.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/global_user_state.py +50 -11
- sky/jobs/controller.py +98 -31
- sky/jobs/scheduler.py +37 -29
- sky/jobs/server/core.py +36 -3
- sky/jobs/state.py +69 -9
- sky/jobs/utils.py +11 -0
- sky/logs/__init__.py +17 -0
- sky/logs/agent.py +73 -0
- sky/logs/gcp.py +91 -0
- sky/models.py +1 -0
- sky/provision/__init__.py +1 -0
- sky/provision/instance_setup.py +35 -0
- sky/provision/provisioner.py +11 -0
- sky/provision/scp/__init__.py +15 -0
- sky/provision/scp/config.py +93 -0
- sky/provision/scp/instance.py +528 -0
- sky/resources.py +164 -29
- sky/server/common.py +21 -9
- sky/server/requests/payloads.py +19 -1
- sky/server/server.py +121 -29
- sky/setup_files/dependencies.py +11 -1
- sky/skylet/constants.py +48 -1
- sky/skylet/job_lib.py +83 -19
- sky/task.py +171 -21
- sky/templates/kubernetes-ray.yml.j2 +60 -4
- sky/templates/scp-ray.yml.j2 +3 -50
- sky/users/permission.py +47 -34
- sky/users/rbac.py +10 -1
- sky/users/server.py +274 -9
- sky/utils/command_runner.py +1 -1
- sky/utils/common_utils.py +16 -14
- sky/utils/context.py +1 -1
- sky/utils/controller_utils.py +12 -3
- sky/utils/dag_utils.py +17 -4
- sky/utils/kubernetes/deploy_remote_cluster.py +17 -8
- sky/utils/schemas.py +83 -5
- {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250618.dist-info}/METADATA +9 -1
- {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250618.dist-info}/RECORD +80 -79
- sky/benchmark/__init__.py +0 -0
- sky/benchmark/benchmark_state.py +0 -295
- sky/benchmark/benchmark_utils.py +0 -641
- sky/dashboard/out/_next/static/chunks/600.bd2ed8c076b720ec.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-59950b2f83b66e48.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-b3dbf38b51cb29be.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/users-c69ffcab9d6e5269.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-1b69b196a4dbffef.js +0 -1
- sky/dashboard/out/_next/static/css/8e97adcaacc15293.css +0 -3
- sky/skylet/providers/scp/__init__.py +0 -2
- sky/skylet/providers/scp/config.py +0 -149
- sky/skylet/providers/scp/node_provider.py +0 -578
- /sky/dashboard/out/_next/static/{OZxMW3bxAJmqgn5f4MdhO → LRpGymRCqq-feuFyoWz4m}/_ssgManifest.js +0 -0
- /sky/dashboard/out/_next/static/chunks/{37-824c707421f6f003.js → 37-3a4d77ad62932eaf.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/{843-ab9c4f609239155f.js → 843-b3040e493f6e7947.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/{938-385d190b95815e11.js → 938-1493ac755eadeb35.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/{973-c807fc34f09c7df3.js → 973-db3c97c2bfbceb65.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/pages/{_app-32b2caae3445bf3b.js → _app-c416e87d5c2715cf.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-c8c2191328532b7d.js → [name]-c4ff1ec05e2f3daf.js} +0 -0
- {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250618.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250618.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250618.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250618.dist-info}/top_level.txt +0 -0
sky/skylet/providers/scp/node_provider.py DELETED
@@ -1,578 +0,0 @@
""" SCP Node provider

This module inherits NodeProvider interface
to provide the functions accessing SCP nodes
"""

import copy
from functools import wraps
import logging
import os
from threading import RLock
import time
from typing import Any, Dict, List, Optional

from ray.autoscaler._private.cli_logger import cli_logger
from ray.autoscaler._private.util import hash_launch_conf
from ray.autoscaler.node_provider import NodeProvider
from ray.autoscaler.tags import NODE_KIND_HEAD
from ray.autoscaler.tags import NODE_KIND_WORKER
from ray.autoscaler.tags import STATUS_UP_TO_DATE
from ray.autoscaler.tags import TAG_RAY_CLUSTER_NAME
from ray.autoscaler.tags import TAG_RAY_LAUNCH_CONFIG
from ray.autoscaler.tags import TAG_RAY_NODE_KIND
from ray.autoscaler.tags import TAG_RAY_NODE_NAME
from ray.autoscaler.tags import TAG_RAY_NODE_STATUS
from ray.autoscaler.tags import TAG_RAY_USER_NODE_TYPE

from sky.clouds.utils import scp_utils
from sky.clouds.utils.scp_utils import SCPCreationFailError
from sky.skylet.providers.scp.config import ZoneConfig
from sky.utils import common_utils

TAG_PATH_PREFIX = '~/.sky/generated/scp/metadata'
REMOTE_RAY_YAML = '~/ray_bootstrap_config.yaml'

logger = logging.getLogger(__name__)


def synchronized(f):

    def wrapper(self, *args, **kwargs):
        self.lock.acquire()
        try:
            return f(self, *args, **kwargs)
        finally:
            self.lock.release()

    return wrapper


def _validation_check(node_config):
    err_msg = None
    if 'diskSize' not in node_config:
        err_msg = "Disk size value is mandatory."
    elif node_config['diskSize'] < 100 or node_config['diskSize'] > 300:
        err_msg = f'The disk size must be between 100 and 300. ' \
                  f'Input: {node_config["diskSize"]}'
    if err_msg:
        raise SCPError(err_msg)


class SCPError(Exception):
    pass


def _retry_on_creation(method, max_tries=3, backoff_s=2):

    @wraps(method)
    def method_with_retries(self, *args, **kwargs):
        try_count = 0
        while try_count < max_tries:
            try:
                return method(self, *args, **kwargs)
            except SCPCreationFailError:
                logger.warning("Resource Creation Failed. Retrying.")
                try_count += 1
                if try_count < max_tries:
                    time.sleep(backoff_s)
                else:
                    raise

    return method_with_retries


class SCPNodeProvider(NodeProvider):
    """Node Provider for Lambda Cloud.

    This provider assumes Lambda Cloud credentials are set.
    """

    def __init__(self, provider_config: Dict[str, Any],
                 cluster_name: str) -> None:
        NodeProvider.__init__(self, provider_config, cluster_name)
        self.lock = RLock()
        self.scp_client = scp_utils.SCPClient()
        self.my_service_zones = self.scp_client.list_service_zone_names()

        self.cached_nodes: Dict[str, Any] = {}
        self.cache_stopped_nodes = provider_config.get("cache_stopped_nodes",
                                                       True)
        self.metadata = scp_utils.Metadata(TAG_PATH_PREFIX, cluster_name)
        vms = self._list_instances_in_cluster()
        self._refresh_security_group(vms)

        # The tag file for autodowned clusters is not autoremoved. Hence, if
        # a previous cluster was autodowned and has the same name as the
        # current cluster, then self.metadata might load the old tag file.
        # We prevent this by removing any old vms in the tag file.
        self.metadata.refresh([node['virtualServerId'] for node in vms])

        # If tag file does not exist on head, create it and add basic tags.
        # This is a hack to make sure that ray on head can access some
        # important tags.
        # TODO(ewzeng): change when Lambda Cloud adds tag support.
        ray_yaml_path = os.path.expanduser(REMOTE_RAY_YAML)
        if os.path.exists(ray_yaml_path) and not os.path.exists(
                self.metadata.path):
            config = common_utils.read_yaml(ray_yaml_path)
            # Ensure correct cluster so sky launch on head node works correctly
            if config['cluster_name'] != cluster_name:
                return
            # Compute launch hash
            head_node_config = config.get('head_node', {})
            head_node_type = config.get('head_node_type')
            if head_node_type:
                head_config = config['available_node_types'][head_node_type]
                head_node_config.update(head_config["node_config"])
            launch_hash = hash_launch_conf(head_node_config, config['auth'])
            # Populate tags
            for node in vms:
                self.metadata[node['virtualServerId']] = {
                    'tags': {
                        TAG_RAY_CLUSTER_NAME: cluster_name,
                        TAG_RAY_NODE_STATUS: STATUS_UP_TO_DATE,
                        TAG_RAY_NODE_KIND: NODE_KIND_HEAD,
                        TAG_RAY_USER_NODE_TYPE: 'ray_head_default',
                        TAG_RAY_NODE_NAME: f'ray-{cluster_name}-head',
                        TAG_RAY_LAUNCH_CONFIG: launch_hash,
                    }
                }

    def _list_instances_in_cluster(self) -> List[Dict[str, Any]]:
        """List running instances in cluster."""
        vms = self.scp_client.list_instances()
        node_list = []
        for node in vms:
            if node['virtualServerName'] == self.cluster_name:
                node['external_ip'] = self.scp_client.get_external_ip(
                    virtual_server_id=node['virtualServerId'], ip=node['ip'])
                node_list.append(node)

        return node_list

    @synchronized
    def _get_filtered_nodes(self, tag_filters: Dict[str,
                                                    str]) -> Dict[str, Any]:

        def match_tags(vm):
            vm_info = self.metadata[vm['virtualServerId']]
            tags = {} if vm_info is None else vm_info['tags']
            for k, v in tag_filters.items():
                if tags.get(k) != v:
                    return False
            return True

        vms = self._list_instances_in_cluster()
        nodes = [self._extract_metadata(vm) for vm in filter(match_tags, vms)]
        self.cached_nodes = {node['virtualServerId']: node for node in nodes}
        return self.cached_nodes

    def _extract_metadata(self, vm: Dict[str, Any]) -> Dict[str, Any]:
        metadata = {
            'virtualServerId': vm['virtualServerId'],
            'virtualServerName': vm['virtualServerName'],
            'status': vm['virtualServerState'],
            'tags': {}
        }
        instance_info = self.metadata[vm['virtualServerId']]
        if instance_info is not None:
            metadata['tags'] = instance_info['tags']
        # TODO(ewzeng): The internal ip is hard to get, so set it to the
        # external ip as a hack. This should be changed in the future.
        # https://docs.lambdalabs.com/public-cloud/on-demand/getting-started/#learn-your-instances-private-ip-address
        metadata['internal_ip'] = vm['ip']
        metadata['external_ip'] = vm['external_ip']
        return metadata

    def non_terminated_nodes(self, tag_filters: Dict[str, str]) -> List[str]:
        """Return a list of node ids filtered by the specified tags dict.

        This list must not include terminated nodes. For performance reasons,
        providers are allowed to cache the result of a call to
        non_terminated_nodes() to serve single-node queries
        (e.g. is_running(node_id)). This means that non_terminated_nodes() must
        be called again to refresh results.

        Examples:
            >>> provider.non_terminated_nodes({TAG_RAY_NODE_KIND: "worker"})
            ["node-1", "node-2"]
        """
        nodes = self._get_filtered_nodes(tag_filters=tag_filters)

        if self.cache_stopped_nodes:
            print("cache_stopped_nodes value is True")
            return [
                k for k, v in nodes.items()
                if not v["status"].startswith("STOPPED")
            ]
        else:
            print("cache_stopped_nodes value is False")
            return [k for k, v in nodes.items()]

    def is_running(self, node_id: str) -> bool:
        """Return whether the specified node is running."""
        return self._get_cached_node(node_id=node_id) is not None

    def is_terminated(self, node_id: str) -> bool:
        """Return whether the specified node is terminated."""
        return self._get_cached_node(node_id=node_id) is None

    def node_tags(self, node_id: str) -> Dict[str, str]:
        """Returns the tags of the given node (string dict)."""
        cached_node = self._get_cached_node(node_id=node_id)
        if cached_node is None:
            return {}
        return cached_node['tags']

    def external_ip(self, node_id: str) -> Optional[str]:
        """Returns the external ip of the given node."""
        cached_node = self._get_cached_node(node_id=node_id)
        if cached_node is None:
            return None
        return cached_node['external_ip']

    def internal_ip(self, node_id: str) -> Optional[str]:
        """Returns the internal ip (Ray ip) of the given node."""
        cached_node = self._get_cached_node(node_id=node_id)
        if cached_node is None:
            return None
        return cached_node['internal_ip']

    def _config_security_group(self, zone_id, vpc, cluster_name):
        sg_name = cluster_name.replace("-", "") + "sg"
        if len(sg_name) > 20:
            sg_name = sg_name[:9] + '0' + sg_name[
                -10:]  # should be less than 21

        undo_func_stack = []
        try:
            response = self.scp_client.create_security_group(
                zone_id, vpc, sg_name)
            sg_id = response['resourceId']
            undo_func_stack.append(lambda: self._del_security_group(sg_id))
            while True:
                sg_contents = self.scp_client.list_security_groups(
                    vpc_id=vpc, sg_name=sg_name)
                sg = [
                    sg["securityGroupState"]
                    for sg in sg_contents
                    if sg["securityGroupId"] == sg_id
                ]
                if sg and sg[0] == "ACTIVE":
                    break
                time.sleep(5)

            self.scp_client.add_security_group_in_rule(sg_id)
            self.scp_client.add_security_group_out_rule(sg_id)  # out all

            return sg_id
        except Exception as e:
            logger.error("Security Group Creation Fail.")
            self._undo_funcs(undo_func_stack)
            return None

    def _del_security_group(self, sg_id):
        self.scp_client.del_security_group(sg_id)
        while True:
            time.sleep(5)
            sg_contents = self.scp_client.list_security_groups()
            sg = [
                sg["securityGroupState"]
                for sg in sg_contents
                if sg["securityGroupId"] == sg_id
            ]
            if not sg:
                break

    def _refresh_security_group(self, vms):
        if vms:
            return
        # remove security group if vm does not exist
        keys = self.metadata.keys()
        security_group_id = self.metadata[
            keys[0]]['creation']['securityGroupId'] if keys else None
        if security_group_id:
            try:
                self._del_security_group(security_group_id)
            except Exception as e:
                logger.info(e)

    def _del_vm(self, vm_id):
        self.scp_client.terminate_instance(vm_id)
        while True:
            time.sleep(10)
            vm_contents = self.scp_client.list_instances()
            vms = [
                vm["virtualServerId"]
                for vm in vm_contents
                if vm["virtualServerId"] == vm_id
            ]
            if not vms:
                break

    def _del_firwall_rules(self, firewall_id, rule_ids):
        if not isinstance(rule_ids, list):
            rule_ids = [rule_ids]
        self.scp_client.del_firwall_rules(firewall_id, rule_ids)

    @_retry_on_creation
    def _add_firewall_inbound(self, firewall_id, internal_ip):

        rule_info = self.scp_client.add_firewall_inbound_rule(
            firewall_id, internal_ip)
        rule_id = rule_info['resourceId']
        while True:
            time.sleep(5)
            rule_info = self.scp_client.get_firewal_rule_info(
                firewall_id, rule_id)
            if rule_info['ruleState'] == "ACTIVE":
                break
        return rule_id

    @_retry_on_creation
    def _add_firewall_outbound(self, firewall_id, internal_ip):

        rule_info = self.scp_client.add_firewall_outbound_rule(
            firewall_id, internal_ip)
        rule_id = rule_info['resourceId']
        while True:
            time.sleep(5)
            rule_info = self.scp_client.get_firewal_rule_info(
                firewall_id, rule_id)
            if rule_info['ruleState'] == "ACTIVE":
                break
        return rule_id

    def _get_firewall_id(self, vpc_id):

        firewall_contents = self.scp_client.list_firwalls()
        firewall_id = [
            firewall['firewallId']
            for firewall in firewall_contents
            if firewall['vpcId'] == vpc_id and
            (firewall['firewallState'] in ['ACTIVE', 'DEPLOYING'])
        ][0]

        return firewall_id

    @_retry_on_creation
    def _create_instance(self, instance_config):
        response = self.scp_client.create_instance(instance_config)
        vm_id = response.get('resourceId', None)
        while True:
            time.sleep(10)
            vm_info = self.scp_client.get_vm_info(vm_id)
            if vm_info["virtualServerState"] == "RUNNING":
                break
        return vm_id, vm_info['ip']

    def _create_instance_sequence(self, vpc, instance_config):
        undo_func_stack = []
        try:
            vm_id, vm_internal_ip = self._create_instance(instance_config)

            undo_func_stack.append(lambda: self._del_vm(vm_id))
            firewall_id = self._get_firewall_id(vpc)

            in_rule_id = self._add_firewall_inbound(firewall_id, vm_internal_ip)
            undo_func_stack.append(
                lambda: self._del_firwall_rules(firewall_id, in_rule_id))
            out_rule_id = self._add_firewall_outbound(firewall_id,
                                                      vm_internal_ip)
            undo_func_stack.append(
                lambda: self._del_firwall_rules(firewall_id, in_rule_id))
            firewall_rules = [in_rule_id, out_rule_id]
            return vm_id, vm_internal_ip, firewall_id, firewall_rules

        except Exception as e:
            logger.error("Instance Creation Fails.")
            self._undo_funcs(undo_func_stack)
            return None, None, None, None

    def _undo_funcs(self, undo_func_list):
        while undo_func_list:
            func = undo_func_list.pop()
            func()

    def _try_vm_creation(self, vpc, sg_id, config_tags, instance_config):
        vm_id, vm_internal_ip, firewall_id, firwall_rules = \
            self._create_instance_sequence(vpc, instance_config)
        if vm_id is None:
            return False  # if creation success

        vm_external_ip = self.scp_client.get_external_ip(
            virtual_server_id=vm_id, ip=vm_internal_ip)
        creation_tags = {}
        creation_tags['virtualServerId'] = vm_id
        creation_tags['vmInternalIp'] = vm_internal_ip
        creation_tags['firewallId'] = firewall_id
        creation_tags['firewallRuleIds'] = firwall_rules
        creation_tags['securityGroupId'] = sg_id
        creation_tags['vmExternalIp'] = vm_external_ip
        self.metadata[vm_id] = {'tags': config_tags, 'creation': creation_tags}
        return True

    def create_node(self, node_config: Dict[str, Any], tags: Dict[str, str],
                    count: int) -> None:
        """Creates a number of nodes within the namespace."""
        assert count == 1, count  # Only support 1-node clusters for now
        """
        0. need VPC where IGW attached, and its public subnets
        1. select a VPC
        2. create a security-group belongs to VPC
        3. add an inbound rule into the security-group: 0.0.0.0/0 22port
        4. select a subnet
        5. create a VM
        6. get the VM info including IP
        7. add an inbound rule to a Firewall of the VPC: 0.0.0.0/0 22port -> VM IP
        """
        config_tags = node_config.get('tags', {}).copy()
        config_tags.update(tags)
        config_tags[TAG_RAY_CLUSTER_NAME] = self.cluster_name

        if self.cache_stopped_nodes:
            VALIDITY_TAGS = [
                TAG_RAY_CLUSTER_NAME,
                TAG_RAY_NODE_KIND,
                TAG_RAY_LAUNCH_CONFIG,
                TAG_RAY_USER_NODE_TYPE,
            ]
            filters = {
                tag: config_tags[tag]
                for tag in VALIDITY_TAGS
                if tag in config_tags
            }
            reuse_nodes = self._stopped_nodes(filters)[:count]
            logger.info(
                f"Reusing nodes {list(reuse_nodes)}. "
                "To disable reuse, set `cache_stopped_nodes: False` "
                "under `provider` in the cluster configuration.",)

            for vm_id in reuse_nodes:
                self._start_vm(vm_id=vm_id)
                self.set_node_tags(vm_id, config_tags)

                while True:
                    time.sleep(5)
                    vm_info = self.scp_client.get_vm_info(vm_id)
                    if vm_info["virtualServerState"] == "RUNNING":
                        break

            count -= len(reuse_nodes)

        if count:
            if (node_config['region'] not in self.my_service_zones):
                raise SCPError('This region/zone is not available for '
                               'this project.')

            zone_config = ZoneConfig(self.scp_client, node_config)
            vpc_subnets = zone_config.get_vcp_subnets()
            if not vpc_subnets:
                raise SCPError("This region/zone does not have available VPCs.")

            instance_config = zone_config.bootstrap_instance_config(node_config)
            instance_config['virtualServerName'] = self.cluster_name

            for vpc, subnets in vpc_subnets.items():
                sg_id = self._config_security_group(
                    zone_config.zone_id, vpc, self.cluster_name)  # sg_name
                if sg_id is None:
                    continue

                instance_config['securityGroupIds'] = [sg_id]
                for subnet in subnets:
                    instance_config['nic']['subnetId'] = subnet
                    SUCCESS = self._try_vm_creation(vpc, sg_id, config_tags,
                                                    instance_config)
                    if SUCCESS:
                        return

                self._del_security_group(sg_id)

            raise SCPError("Instance Creation Fails.")

    def _stopped_nodes(self, tag_filters):
        """Return a list of stopped node ids filtered by the specified tags dict."""
        nodes = self._get_filtered_nodes(tag_filters=tag_filters)
        return [
            k for k, v in nodes.items() if v["status"].startswith("STOPPED")
        ]

    @synchronized
    def set_node_tags(self, node_id: str, tags: Dict[str, str]) -> None:
        """Sets the tag values (string dict) for the specified node."""
        node = self._get_node(node_id)
        if node is None:
            return

        node['tags'].update(tags)
        # self.metadata[node_id] = {'tags': node['tags']}
        metadata = self.metadata[node_id]
        metadata['tags'] = node['tags']
        self.metadata[node_id] = metadata

    def terminate_node(self, node_id: str) -> None:
        """Terminates the specified node."""
        if self.cache_stopped_nodes:
            try:
                cli_logger.print(
                    f"Stopping instance {node_id}"
                    "(to fully terminate instead, "
                    "set `cache_stopped_nodes: False` "
                    "under `provider` in the cluster configuration)")
                self._stop_vm(node_id)
            except:
                raise SCPError("Errors during stopping a node")
        else:
            try:
                creation_tags = self.metadata[node_id]['creation']
                self._del_firwall_rules(creation_tags['firewallId'],
                                        creation_tags['firewallRuleIds'])
                self._del_vm(creation_tags['virtualServerId'])
                self._del_security_group(creation_tags['securityGroupId'])
                self.metadata[node_id] = None
            except:
                raise SCPError("Errors during terminating a node")

    def _get_node(self, node_id: str) -> Optional[Dict[str, Any]]:
        self._get_filtered_nodes({})  # Side effect: updates cache
        return self.cached_nodes.get(node_id, None)

    def _get_cached_node(self, node_id: str) -> Optional[Dict[str, Any]]:
        if node_id in self.cached_nodes:
            return self.cached_nodes[node_id]
        return self._get_node(node_id=node_id)

    @staticmethod
    def bootstrap_config(cluster_config):

        node_config = cluster_config['available_node_types'][
            'ray_head_default']['node_config']
        provider_config = cluster_config['provider']
        node_config['region'] = provider_config['region']
        node_config['auth'] = cluster_config['auth']

        #Add file mount: metadata path
        metadata_path = f'{TAG_PATH_PREFIX}-{cluster_config["cluster_name"]}'
        cluster_config['file_mounts'][metadata_path] = metadata_path

        _validation_check(node_config)

        return cluster_config

    def _start_vm(self, vm_id):
        self.scp_client.start_instance(vm_id)
        while True:
            time.sleep(2)
            vm_info = self.scp_client.get_vm_info(vm_id)
            if vm_info["virtualServerState"] == "RUNNING":
                break

    def _stop_vm(self, vm_id):
        self.scp_client.stop_instance(vm_id)
        while True:
            time.sleep(2)
            vm_info = self.scp_client.get_vm_info(vm_id)
            if vm_info["virtualServerState"] == "STOPPED":
                break