skypilot-nightly 1.0.0.dev20250616__py3-none-any.whl → 1.0.0.dev20250618__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
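Since a wheel is just a zip archive, a comparison like this one can be reproduced locally by downloading both versions and diffing the extracted trees. A rough sketch, assuming both nightlies are still downloadable from PyPI (the helper below is illustrative, not part of any registry tooling):

import filecmp
import pathlib
import subprocess
import tempfile
import zipfile


def fetch_and_extract(version: str, workdir: pathlib.Path) -> pathlib.Path:
    """Download one wheel with pip and unzip it for comparison."""
    wheel_dir = workdir / version
    wheel_dir.mkdir(parents=True)
    subprocess.run([
        'pip', 'download', f'skypilot-nightly=={version}', '--no-deps',
        '--dest', str(wheel_dir)
    ], check=True)
    wheel = next(wheel_dir.glob('*.whl'))  # a .whl is a zip archive
    extracted = wheel_dir / 'extracted'
    with zipfile.ZipFile(wheel) as zf:
        zf.extractall(extracted)
    return extracted


with tempfile.TemporaryDirectory() as tmp:
    tmp_path = pathlib.Path(tmp)
    old = fetch_and_extract('1.0.0.dev20250616', tmp_path)
    new = fetch_and_extract('1.0.0.dev20250618', tmp_path)
    # Report added, removed, and changed files, recursing into subpackages.
    filecmp.dircmp(old, new).report_full_closure()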
Files changed (92)
  1. sky/__init__.py +2 -4
  2. sky/backends/backend_utils.py +7 -0
  3. sky/backends/cloud_vm_ray_backend.py +91 -96
  4. sky/cli.py +5 -6311
  5. sky/client/cli.py +66 -639
  6. sky/client/sdk.py +22 -2
  7. sky/clouds/kubernetes.py +8 -0
  8. sky/clouds/scp.py +7 -26
  9. sky/clouds/utils/scp_utils.py +177 -124
  10. sky/dashboard/out/404.html +1 -1
  11. sky/dashboard/out/_next/static/{OZxMW3bxAJmqgn5f4MdhO → LRpGymRCqq-feuFyoWz4m}/_buildManifest.js +1 -1
  12. sky/dashboard/out/_next/static/chunks/641.c8e452bc5070a630.js +1 -0
  13. sky/dashboard/out/_next/static/chunks/984.ae8c08791d274ca0.js +50 -0
  14. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-36bc0962129f72df.js +6 -0
  15. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-cf490d1fa38f3740.js +16 -0
  16. sky/dashboard/out/_next/static/chunks/pages/users-928edf039219e47b.js +1 -0
  17. sky/dashboard/out/_next/static/chunks/webpack-ebc2404fd6ce581c.js +1 -0
  18. sky/dashboard/out/_next/static/css/6c12ecc3bd2239b6.css +3 -0
  19. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  20. sky/dashboard/out/clusters/[cluster].html +1 -1
  21. sky/dashboard/out/clusters.html +1 -1
  22. sky/dashboard/out/config.html +1 -1
  23. sky/dashboard/out/index.html +1 -1
  24. sky/dashboard/out/infra/[context].html +1 -1
  25. sky/dashboard/out/infra.html +1 -1
  26. sky/dashboard/out/jobs/[job].html +1 -1
  27. sky/dashboard/out/jobs.html +1 -1
  28. sky/dashboard/out/users.html +1 -1
  29. sky/dashboard/out/workspace/new.html +1 -1
  30. sky/dashboard/out/workspaces/[name].html +1 -1
  31. sky/dashboard/out/workspaces.html +1 -1
  32. sky/global_user_state.py +50 -11
  33. sky/jobs/controller.py +98 -31
  34. sky/jobs/scheduler.py +37 -29
  35. sky/jobs/server/core.py +36 -3
  36. sky/jobs/state.py +69 -9
  37. sky/jobs/utils.py +11 -0
  38. sky/logs/__init__.py +17 -0
  39. sky/logs/agent.py +73 -0
  40. sky/logs/gcp.py +91 -0
  41. sky/models.py +1 -0
  42. sky/provision/__init__.py +1 -0
  43. sky/provision/instance_setup.py +35 -0
  44. sky/provision/provisioner.py +11 -0
  45. sky/provision/scp/__init__.py +15 -0
  46. sky/provision/scp/config.py +93 -0
  47. sky/provision/scp/instance.py +528 -0
  48. sky/resources.py +164 -29
  49. sky/server/common.py +21 -9
  50. sky/server/requests/payloads.py +19 -1
  51. sky/server/server.py +121 -29
  52. sky/setup_files/dependencies.py +11 -1
  53. sky/skylet/constants.py +48 -1
  54. sky/skylet/job_lib.py +83 -19
  55. sky/task.py +171 -21
  56. sky/templates/kubernetes-ray.yml.j2 +60 -4
  57. sky/templates/scp-ray.yml.j2 +3 -50
  58. sky/users/permission.py +47 -34
  59. sky/users/rbac.py +10 -1
  60. sky/users/server.py +274 -9
  61. sky/utils/command_runner.py +1 -1
  62. sky/utils/common_utils.py +16 -14
  63. sky/utils/context.py +1 -1
  64. sky/utils/controller_utils.py +12 -3
  65. sky/utils/dag_utils.py +17 -4
  66. sky/utils/kubernetes/deploy_remote_cluster.py +17 -8
  67. sky/utils/schemas.py +83 -5
  68. {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250618.dist-info}/METADATA +9 -1
  69. {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250618.dist-info}/RECORD +80 -79
  70. sky/benchmark/__init__.py +0 -0
  71. sky/benchmark/benchmark_state.py +0 -295
  72. sky/benchmark/benchmark_utils.py +0 -641
  73. sky/dashboard/out/_next/static/chunks/600.bd2ed8c076b720ec.js +0 -16
  74. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-59950b2f83b66e48.js +0 -6
  75. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-b3dbf38b51cb29be.js +0 -16
  76. sky/dashboard/out/_next/static/chunks/pages/users-c69ffcab9d6e5269.js +0 -1
  77. sky/dashboard/out/_next/static/chunks/webpack-1b69b196a4dbffef.js +0 -1
  78. sky/dashboard/out/_next/static/css/8e97adcaacc15293.css +0 -3
  79. sky/skylet/providers/scp/__init__.py +0 -2
  80. sky/skylet/providers/scp/config.py +0 -149
  81. sky/skylet/providers/scp/node_provider.py +0 -578
  82. /sky/dashboard/out/_next/static/{OZxMW3bxAJmqgn5f4MdhO → LRpGymRCqq-feuFyoWz4m}/_ssgManifest.js +0 -0
  83. /sky/dashboard/out/_next/static/chunks/{37-824c707421f6f003.js → 37-3a4d77ad62932eaf.js} +0 -0
  84. /sky/dashboard/out/_next/static/chunks/{843-ab9c4f609239155f.js → 843-b3040e493f6e7947.js} +0 -0
  85. /sky/dashboard/out/_next/static/chunks/{938-385d190b95815e11.js → 938-1493ac755eadeb35.js} +0 -0
  86. /sky/dashboard/out/_next/static/chunks/{973-c807fc34f09c7df3.js → 973-db3c97c2bfbceb65.js} +0 -0
  87. /sky/dashboard/out/_next/static/chunks/pages/{_app-32b2caae3445bf3b.js → _app-c416e87d5c2715cf.js} +0 -0
  88. /sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-c8c2191328532b7d.js → [name]-c4ff1ec05e2f3daf.js} +0 -0
  89. {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250618.dist-info}/WHEEL +0 -0
  90. {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250618.dist-info}/entry_points.txt +0 -0
  91. {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250618.dist-info}/licenses/LICENSE +0 -0
  92. {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250618.dist-info}/top_level.txt +0 -0
sky/skylet/providers/scp/node_provider.py
@@ -1,578 +0,0 @@
-""" SCP Node provider
-
-This module inherits NodeProvider interface
-to provide the functions accessing SCP nodes
-"""
-
-import copy
-from functools import wraps
-import logging
-import os
-from threading import RLock
-import time
-from typing import Any, Dict, List, Optional
-
-from ray.autoscaler._private.cli_logger import cli_logger
-from ray.autoscaler._private.util import hash_launch_conf
-from ray.autoscaler.node_provider import NodeProvider
-from ray.autoscaler.tags import NODE_KIND_HEAD
-from ray.autoscaler.tags import NODE_KIND_WORKER
-from ray.autoscaler.tags import STATUS_UP_TO_DATE
-from ray.autoscaler.tags import TAG_RAY_CLUSTER_NAME
-from ray.autoscaler.tags import TAG_RAY_LAUNCH_CONFIG
-from ray.autoscaler.tags import TAG_RAY_NODE_KIND
-from ray.autoscaler.tags import TAG_RAY_NODE_NAME
-from ray.autoscaler.tags import TAG_RAY_NODE_STATUS
-from ray.autoscaler.tags import TAG_RAY_USER_NODE_TYPE
-
-from sky.clouds.utils import scp_utils
-from sky.clouds.utils.scp_utils import SCPCreationFailError
-from sky.skylet.providers.scp.config import ZoneConfig
-from sky.utils import common_utils
-
-TAG_PATH_PREFIX = '~/.sky/generated/scp/metadata'
-REMOTE_RAY_YAML = '~/ray_bootstrap_config.yaml'
-
-logger = logging.getLogger(__name__)
-
-
-def synchronized(f):
-
-    def wrapper(self, *args, **kwargs):
-        self.lock.acquire()
-        try:
-            return f(self, *args, **kwargs)
-        finally:
-            self.lock.release()
-
-    return wrapper
-
-
-def _validation_check(node_config):
-    err_msg = None
-    if 'diskSize' not in node_config:
-        err_msg = "Disk size value is mandatory."
-    elif node_config['diskSize'] < 100 or node_config['diskSize'] > 300:
-        err_msg = f'The disk size must be between 100 and 300. ' \
-                  f'Input: {node_config["diskSize"]}'
-    if err_msg:
-        raise SCPError(err_msg)
-
-
-class SCPError(Exception):
-    pass
-
-
-def _retry_on_creation(method, max_tries=3, backoff_s=2):
-
-    @wraps(method)
-    def method_with_retries(self, *args, **kwargs):
-        try_count = 0
-        while try_count < max_tries:
-            try:
-                return method(self, *args, **kwargs)
-            except SCPCreationFailError:
-                logger.warning("Resource Creation Failed. Retrying.")
-                try_count += 1
-                if try_count < max_tries:
-                    time.sleep(backoff_s)
-                else:
-                    raise
-
-    return method_with_retries
-
-
-class SCPNodeProvider(NodeProvider):
-    """Node Provider for SCP (Samsung Cloud Platform).
-
-    This provider assumes SCP credentials are set.
-    """
-
-    def __init__(self, provider_config: Dict[str, Any],
-                 cluster_name: str) -> None:
-        NodeProvider.__init__(self, provider_config, cluster_name)
-        self.lock = RLock()
-        self.scp_client = scp_utils.SCPClient()
-        self.my_service_zones = self.scp_client.list_service_zone_names()
-
-        self.cached_nodes: Dict[str, Any] = {}
-        self.cache_stopped_nodes = provider_config.get("cache_stopped_nodes",
-                                                       True)
-        self.metadata = scp_utils.Metadata(TAG_PATH_PREFIX, cluster_name)
-        vms = self._list_instances_in_cluster()
-        self._refresh_security_group(vms)
-
-        # The tag file for autodowned clusters is not autoremoved. Hence, if
-        # a previous cluster was autodowned and has the same name as the
-        # current cluster, then self.metadata might load the old tag file.
-        # We prevent this by removing any old vms in the tag file.
-        self.metadata.refresh([node['virtualServerId'] for node in vms])
-
-        # If tag file does not exist on head, create it and add basic tags.
-        # This is a hack to make sure that ray on head can access some
-        # important tags.
-        # TODO(ewzeng): change when SCP adds tag support.
-        ray_yaml_path = os.path.expanduser(REMOTE_RAY_YAML)
-        if os.path.exists(ray_yaml_path) and not os.path.exists(
-                self.metadata.path):
-            config = common_utils.read_yaml(ray_yaml_path)
-            # Ensure correct cluster so sky launch on head node works correctly
-            if config['cluster_name'] != cluster_name:
-                return
-            # Compute launch hash
-            head_node_config = config.get('head_node', {})
-            head_node_type = config.get('head_node_type')
-            if head_node_type:
-                head_config = config['available_node_types'][head_node_type]
-                head_node_config.update(head_config["node_config"])
-            launch_hash = hash_launch_conf(head_node_config, config['auth'])
-            # Populate tags
-            for node in vms:
-                self.metadata[node['virtualServerId']] = {
-                    'tags': {
-                        TAG_RAY_CLUSTER_NAME: cluster_name,
-                        TAG_RAY_NODE_STATUS: STATUS_UP_TO_DATE,
-                        TAG_RAY_NODE_KIND: NODE_KIND_HEAD,
-                        TAG_RAY_USER_NODE_TYPE: 'ray_head_default',
-                        TAG_RAY_NODE_NAME: f'ray-{cluster_name}-head',
-                        TAG_RAY_LAUNCH_CONFIG: launch_hash,
-                    }
-                }
-
-    def _list_instances_in_cluster(self) -> List[Dict[str, Any]]:
-        """List running instances in cluster."""
-        vms = self.scp_client.list_instances()
-        node_list = []
-        for node in vms:
-            if node['virtualServerName'] == self.cluster_name:
-                node['external_ip'] = self.scp_client.get_external_ip(
-                    virtual_server_id=node['virtualServerId'], ip=node['ip'])
-                node_list.append(node)
-
-        return node_list
-
-    @synchronized
-    def _get_filtered_nodes(self, tag_filters: Dict[str,
-                                                    str]) -> Dict[str, Any]:
-
-        def match_tags(vm):
-            vm_info = self.metadata[vm['virtualServerId']]
-            tags = {} if vm_info is None else vm_info['tags']
-            for k, v in tag_filters.items():
-                if tags.get(k) != v:
-                    return False
-            return True
-
-        vms = self._list_instances_in_cluster()
-        nodes = [self._extract_metadata(vm) for vm in filter(match_tags, vms)]
-        self.cached_nodes = {node['virtualServerId']: node for node in nodes}
-        return self.cached_nodes
-
-    def _extract_metadata(self, vm: Dict[str, Any]) -> Dict[str, Any]:
-        metadata = {
-            'virtualServerId': vm['virtualServerId'],
-            'virtualServerName': vm['virtualServerName'],
-            'status': vm['virtualServerState'],
-            'tags': {}
-        }
-        instance_info = self.metadata[vm['virtualServerId']]
-        if instance_info is not None:
-            metadata['tags'] = instance_info['tags']
-        # TODO(ewzeng): The internal ip is hard to get, so set it to the
-        # external ip as a hack. This should be changed in the future.
-        # https://docs.lambdalabs.com/public-cloud/on-demand/getting-started/#learn-your-instances-private-ip-address
-        metadata['internal_ip'] = vm['ip']
-        metadata['external_ip'] = vm['external_ip']
-        return metadata
-
-    def non_terminated_nodes(self, tag_filters: Dict[str, str]) -> List[str]:
-        """Return a list of node ids filtered by the specified tags dict.
-
-        This list must not include terminated nodes. For performance reasons,
-        providers are allowed to cache the result of a call to
-        non_terminated_nodes() to serve single-node queries
-        (e.g. is_running(node_id)). This means that non_terminated_nodes() must
-        be called again to refresh results.
-
-        Examples:
-            >>> provider.non_terminated_nodes({TAG_RAY_NODE_KIND: "worker"})
-            ["node-1", "node-2"]
-        """
-        nodes = self._get_filtered_nodes(tag_filters=tag_filters)
-
-        if self.cache_stopped_nodes:
-            print("cache_stopped_nodes value is True")
-            return [
-                k for k, v in nodes.items()
-                if not v["status"].startswith("STOPPED")
-            ]
-        else:
-            print("cache_stopped_nodes value is False")
-            return [k for k, v in nodes.items()]
-
-    def is_running(self, node_id: str) -> bool:
-        """Return whether the specified node is running."""
-        return self._get_cached_node(node_id=node_id) is not None
-
-    def is_terminated(self, node_id: str) -> bool:
-        """Return whether the specified node is terminated."""
-        return self._get_cached_node(node_id=node_id) is None
-
-    def node_tags(self, node_id: str) -> Dict[str, str]:
-        """Returns the tags of the given node (string dict)."""
-        cached_node = self._get_cached_node(node_id=node_id)
-        if cached_node is None:
-            return {}
-        return cached_node['tags']
-
-    def external_ip(self, node_id: str) -> Optional[str]:
-        """Returns the external ip of the given node."""
-        cached_node = self._get_cached_node(node_id=node_id)
-        if cached_node is None:
-            return None
-        return cached_node['external_ip']
-
-    def internal_ip(self, node_id: str) -> Optional[str]:
-        """Returns the internal ip (Ray ip) of the given node."""
-        cached_node = self._get_cached_node(node_id=node_id)
-        if cached_node is None:
-            return None
-        return cached_node['internal_ip']
-
-    def _config_security_group(self, zone_id, vpc, cluster_name):
-        sg_name = cluster_name.replace("-", "") + "sg"
-        if len(sg_name) > 20:
-            sg_name = sg_name[:9] + '0' + sg_name[
-                -10:]  # should be less than 21
-
-        undo_func_stack = []
-        try:
-            response = self.scp_client.create_security_group(
-                zone_id, vpc, sg_name)
-            sg_id = response['resourceId']
-            undo_func_stack.append(lambda: self._del_security_group(sg_id))
-            while True:
-                sg_contents = self.scp_client.list_security_groups(
-                    vpc_id=vpc, sg_name=sg_name)
-                sg = [
-                    sg["securityGroupState"]
-                    for sg in sg_contents
-                    if sg["securityGroupId"] == sg_id
-                ]
-                if sg and sg[0] == "ACTIVE":
-                    break
-                time.sleep(5)
-
-            self.scp_client.add_security_group_in_rule(sg_id)
-            self.scp_client.add_security_group_out_rule(sg_id)  # out all
-
-            return sg_id
-        except Exception as e:
-            logger.error('Security group creation failed: %s', e)
-            self._undo_funcs(undo_func_stack)
-            return None
-
-    def _del_security_group(self, sg_id):
-        self.scp_client.del_security_group(sg_id)
-        while True:
-            time.sleep(5)
-            sg_contents = self.scp_client.list_security_groups()
-            sg = [
-                sg["securityGroupState"]
-                for sg in sg_contents
-                if sg["securityGroupId"] == sg_id
-            ]
-            if not sg:
-                break
-
-    def _refresh_security_group(self, vms):
-        if vms:
-            return
-        # remove security group if vm does not exist
-        keys = self.metadata.keys()
-        security_group_id = self.metadata[
-            keys[0]]['creation']['securityGroupId'] if keys else None
-        if security_group_id:
-            try:
-                self._del_security_group(security_group_id)
-            except Exception as e:
-                logger.info(e)
-
-    def _del_vm(self, vm_id):
-        self.scp_client.terminate_instance(vm_id)
-        while True:
-            time.sleep(10)
-            vm_contents = self.scp_client.list_instances()
-            vms = [
-                vm["virtualServerId"]
-                for vm in vm_contents
-                if vm["virtualServerId"] == vm_id
-            ]
-            if not vms:
-                break
-
-    def _del_firwall_rules(self, firewall_id, rule_ids):
-        if not isinstance(rule_ids, list):
-            rule_ids = [rule_ids]
-        self.scp_client.del_firwall_rules(firewall_id, rule_ids)
-
-    @_retry_on_creation
-    def _add_firewall_inbound(self, firewall_id, internal_ip):
-
-        rule_info = self.scp_client.add_firewall_inbound_rule(
-            firewall_id, internal_ip)
-        rule_id = rule_info['resourceId']
-        while True:
-            time.sleep(5)
-            rule_info = self.scp_client.get_firewal_rule_info(
-                firewall_id, rule_id)
-            if rule_info['ruleState'] == "ACTIVE":
-                break
-        return rule_id
-
-    @_retry_on_creation
-    def _add_firewall_outbound(self, firewall_id, internal_ip):
-
-        rule_info = self.scp_client.add_firewall_outbound_rule(
-            firewall_id, internal_ip)
-        rule_id = rule_info['resourceId']
-        while True:
-            time.sleep(5)
-            rule_info = self.scp_client.get_firewal_rule_info(
-                firewall_id, rule_id)
-            if rule_info['ruleState'] == "ACTIVE":
-                break
-        return rule_id
-
-    def _get_firewall_id(self, vpc_id):
-
-        firewall_contents = self.scp_client.list_firwalls()
-        firewall_id = [
-            firewall['firewallId']
-            for firewall in firewall_contents
-            if firewall['vpcId'] == vpc_id and
-            (firewall['firewallState'] in ['ACTIVE', 'DEPLOYING'])
-        ][0]
-
-        return firewall_id
-
-    @_retry_on_creation
-    def _create_instance(self, instance_config):
-        response = self.scp_client.create_instance(instance_config)
-        vm_id = response.get('resourceId', None)
-        while True:
-            time.sleep(10)
-            vm_info = self.scp_client.get_vm_info(vm_id)
-            if vm_info["virtualServerState"] == "RUNNING":
-                break
-        return vm_id, vm_info['ip']
-
-    def _create_instance_sequence(self, vpc, instance_config):
-        undo_func_stack = []
-        try:
-            vm_id, vm_internal_ip = self._create_instance(instance_config)
-
-            undo_func_stack.append(lambda: self._del_vm(vm_id))
-            firewall_id = self._get_firewall_id(vpc)
-
-            in_rule_id = self._add_firewall_inbound(firewall_id, vm_internal_ip)
-            undo_func_stack.append(
-                lambda: self._del_firwall_rules(firewall_id, in_rule_id))
-            out_rule_id = self._add_firewall_outbound(firewall_id,
-                                                      vm_internal_ip)
-            undo_func_stack.append(
-                lambda: self._del_firwall_rules(firewall_id, out_rule_id))
-            firewall_rules = [in_rule_id, out_rule_id]
-            return vm_id, vm_internal_ip, firewall_id, firewall_rules
-
-        except Exception as e:
-            logger.error('Instance creation failed: %s', e)
-            self._undo_funcs(undo_func_stack)
-            return None, None, None, None
-
-    def _undo_funcs(self, undo_func_list):
-        while undo_func_list:
-            func = undo_func_list.pop()
-            func()
-
-    def _try_vm_creation(self, vpc, sg_id, config_tags, instance_config):
-        vm_id, vm_internal_ip, firewall_id, firewall_rules = \
-            self._create_instance_sequence(vpc, instance_config)
-        if vm_id is None:
-            return False  # creation failed
-
-        vm_external_ip = self.scp_client.get_external_ip(
-            virtual_server_id=vm_id, ip=vm_internal_ip)
-        creation_tags = {}
-        creation_tags['virtualServerId'] = vm_id
-        creation_tags['vmInternalIp'] = vm_internal_ip
-        creation_tags['firewallId'] = firewall_id
-        creation_tags['firewallRuleIds'] = firewall_rules
-        creation_tags['securityGroupId'] = sg_id
-        creation_tags['vmExternalIp'] = vm_external_ip
-        self.metadata[vm_id] = {'tags': config_tags, 'creation': creation_tags}
-        return True
-
-    def create_node(self, node_config: Dict[str, Any], tags: Dict[str, str],
-                    count: int) -> None:
-        """Creates a number of nodes within the namespace."""
-        assert count == 1, count  # Only support 1-node clusters for now
-        """
-        0. need VPC where IGW attached, and its public subnets
-        1. select a VPC
-        2. create a security-group belongs to VPC
-        3. add an inbound rule into the security-group: 0.0.0.0/0 22port
-        4. select a subnet
-        5. create a VM
-        6. get the VM info including IP
-        7. add an inbound rule to a Firewall of the VPC: 0.0.0.0/0 22port -> VM IP
-        """
-        config_tags = node_config.get('tags', {}).copy()
-        config_tags.update(tags)
-        config_tags[TAG_RAY_CLUSTER_NAME] = self.cluster_name
-
-        if self.cache_stopped_nodes:
-            VALIDITY_TAGS = [
-                TAG_RAY_CLUSTER_NAME,
-                TAG_RAY_NODE_KIND,
-                TAG_RAY_LAUNCH_CONFIG,
-                TAG_RAY_USER_NODE_TYPE,
-            ]
-            filters = {
-                tag: config_tags[tag]
-                for tag in VALIDITY_TAGS
-                if tag in config_tags
-            }
-            reuse_nodes = self._stopped_nodes(filters)[:count]
-            logger.info(
-                f"Reusing nodes {list(reuse_nodes)}. "
-                "To disable reuse, set `cache_stopped_nodes: False` "
-                "under `provider` in the cluster configuration.",)
-
-            for vm_id in reuse_nodes:
-                self._start_vm(vm_id=vm_id)
-                self.set_node_tags(vm_id, config_tags)
-
-                while True:
-                    time.sleep(5)
-                    vm_info = self.scp_client.get_vm_info(vm_id)
-                    if vm_info["virtualServerState"] == "RUNNING":
-                        break
-
-            count -= len(reuse_nodes)
-
-        if count:
-            if (node_config['region'] not in self.my_service_zones):
-                raise SCPError('This region/zone is not available for '
-                               'this project.')
-
-            zone_config = ZoneConfig(self.scp_client, node_config)
-            vpc_subnets = zone_config.get_vcp_subnets()
-            if not vpc_subnets:
-                raise SCPError("This region/zone does not have available VPCs.")
-
-            instance_config = zone_config.bootstrap_instance_config(node_config)
-            instance_config['virtualServerName'] = self.cluster_name
-
-            for vpc, subnets in vpc_subnets.items():
-                sg_id = self._config_security_group(
-                    zone_config.zone_id, vpc, self.cluster_name)  # sg_name
-                if sg_id is None:
-                    continue
-
-                instance_config['securityGroupIds'] = [sg_id]
-                for subnet in subnets:
-                    instance_config['nic']['subnetId'] = subnet
-                    SUCCESS = self._try_vm_creation(vpc, sg_id, config_tags,
-                                                    instance_config)
-                    if SUCCESS:
-                        return
-
-                self._del_security_group(sg_id)
-
-            raise SCPError('Instance creation failed.')
-
-    def _stopped_nodes(self, tag_filters):
-        """Return a list of stopped node ids filtered by the specified tags dict."""
-        nodes = self._get_filtered_nodes(tag_filters=tag_filters)
-        return [
-            k for k, v in nodes.items() if v["status"].startswith("STOPPED")
-        ]
-
-    @synchronized
-    def set_node_tags(self, node_id: str, tags: Dict[str, str]) -> None:
-        """Sets the tag values (string dict) for the specified node."""
-        node = self._get_node(node_id)
-        if node is None:
-            return
-
-        node['tags'].update(tags)
-        # self.metadata[node_id] = {'tags': node['tags']}
-        metadata = self.metadata[node_id]
-        metadata['tags'] = node['tags']
-        self.metadata[node_id] = metadata
-
-    def terminate_node(self, node_id: str) -> None:
-        """Terminates the specified node."""
-        if self.cache_stopped_nodes:
-            try:
-                cli_logger.print(
-                    f"Stopping instance {node_id} "
-                    "(to fully terminate instead, "
-                    "set `cache_stopped_nodes: False` "
-                    "under `provider` in the cluster configuration)")
-                self._stop_vm(node_id)
-            except Exception as e:
-                raise SCPError('Error while stopping a node.') from e
-        else:
-            try:
-                creation_tags = self.metadata[node_id]['creation']
-                self._del_firwall_rules(creation_tags['firewallId'],
-                                        creation_tags['firewallRuleIds'])
-                self._del_vm(creation_tags['virtualServerId'])
-                self._del_security_group(creation_tags['securityGroupId'])
-                self.metadata[node_id] = None
-            except Exception as e:
-                raise SCPError('Error while terminating a node.') from e
-
-    def _get_node(self, node_id: str) -> Optional[Dict[str, Any]]:
-        self._get_filtered_nodes({})  # Side effect: updates cache
-        return self.cached_nodes.get(node_id, None)
-
-    def _get_cached_node(self, node_id: str) -> Optional[Dict[str, Any]]:
-        if node_id in self.cached_nodes:
-            return self.cached_nodes[node_id]
-        return self._get_node(node_id=node_id)
-
-    @staticmethod
-    def bootstrap_config(cluster_config):
-
-        node_config = cluster_config['available_node_types'][
-            'ray_head_default']['node_config']
-        provider_config = cluster_config['provider']
-        node_config['region'] = provider_config['region']
-        node_config['auth'] = cluster_config['auth']
-
-        # Add file mount: metadata path
-        metadata_path = f'{TAG_PATH_PREFIX}-{cluster_config["cluster_name"]}'
-        cluster_config['file_mounts'][metadata_path] = metadata_path
-
-        _validation_check(node_config)
-
-        return cluster_config
-
-    def _start_vm(self, vm_id):
-        self.scp_client.start_instance(vm_id)
-        while True:
-            time.sleep(2)
-            vm_info = self.scp_client.get_vm_info(vm_id)
-            if vm_info["virtualServerState"] == "RUNNING":
-                break
-
-    def _stop_vm(self, vm_id):
-        self.scp_client.stop_instance(vm_id)
-        while True:
-            time.sleep(2)
-            vm_info = self.scp_client.get_vm_info(vm_id)
-            if vm_info["virtualServerState"] == "STOPPED":
-                break
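One detail of the removed provider worth noting: each multi-step creation path (`_config_security_group`, `_create_instance_sequence`) pushes a compensating cleanup callback onto `undo_func_stack` right after every resource it creates, and `_undo_funcs` pops and runs them in reverse on failure, so dependents are deleted before the resources they depend on. A minimal standalone sketch of that rollback idiom (the resource names and `provision` function are illustrative, not SkyPilot API):

from typing import Callable, List


def undo_funcs(undo_stack: List[Callable[[], None]]) -> None:
    """Run compensating actions in reverse (LIFO) order."""
    while undo_stack:
        undo_stack.pop()()


def provision(fail: bool = True) -> str:
    undo_stack: List[Callable[[], None]] = []
    try:
        sg_id = 'sg-1'  # stand-in for a successful create_security_group()
        undo_stack.append(lambda: print(f'delete security group {sg_id}'))
        vm_id = 'vm-1'  # stand-in for a successful create_instance()
        undo_stack.append(lambda: print(f'delete vm {vm_id}'))
        if fail:
            raise RuntimeError('firewall rule creation failed')
        return vm_id
    except RuntimeError:
        undo_funcs(undo_stack)  # unwinds: vm first, then the security group
        raise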