skypilot-nightly 1.0.0.dev20241016__py3-none-any.whl → 1.0.0.dev20241018__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (27)
  1. sky/__init__.py +2 -2
  2. sky/authentication.py +1 -1
  3. sky/backends/backend_utils.py +4 -0
  4. sky/clouds/aws.py +15 -4
  5. sky/clouds/gcp.py +6 -1
  6. sky/clouds/lambda_cloud.py +4 -1
  7. sky/clouds/service_catalog/data_fetchers/fetch_aws.py +6 -4
  8. sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +17 -6
  9. sky/clouds/service_catalog/gcp_catalog.py +3 -0
  10. sky/provision/__init__.py +3 -0
  11. sky/provision/gcp/config.py +5 -1
  12. sky/provision/lambda_cloud/__init__.py +11 -0
  13. sky/provision/lambda_cloud/config.py +10 -0
  14. sky/provision/lambda_cloud/instance.py +261 -0
  15. sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +19 -18
  16. sky/setup_files/MANIFEST.in +0 -1
  17. sky/templates/gcp-ray.yml.j2 +3 -0
  18. sky/templates/lambda-ray.yml.j2 +7 -38
  19. sky/utils/schemas.py +3 -0
  20. {skypilot_nightly-1.0.0.dev20241016.dist-info → skypilot_nightly-1.0.0.dev20241018.dist-info}/METADATA +1 -1
  21. {skypilot_nightly-1.0.0.dev20241016.dist-info → skypilot_nightly-1.0.0.dev20241018.dist-info}/RECORD +25 -24
  22. sky/skylet/providers/lambda_cloud/__init__.py +0 -2
  23. sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
  24. {skypilot_nightly-1.0.0.dev20241016.dist-info → skypilot_nightly-1.0.0.dev20241018.dist-info}/LICENSE +0 -0
  25. {skypilot_nightly-1.0.0.dev20241016.dist-info → skypilot_nightly-1.0.0.dev20241018.dist-info}/WHEEL +0 -0
  26. {skypilot_nightly-1.0.0.dev20241016.dist-info → skypilot_nightly-1.0.0.dev20241018.dist-info}/entry_points.txt +0 -0
  27. {skypilot_nightly-1.0.0.dev20241016.dist-info → skypilot_nightly-1.0.0.dev20241018.dist-info}/top_level.txt +0 -0
sky/__init__.py CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
  import urllib.request
  
  # Replaced with the current commit when building the wheels.
- _SKYPILOT_COMMIT_SHA = '53380e26f01452559012d57b333b17f40dd8a4d1'
+ _SKYPILOT_COMMIT_SHA = '71a95f4bf7f1446e80bb5c24d23c1695bc4fc031'
  
  
  def _get_git_commit():
@@ -35,7 +35,7 @@ def _get_git_commit():
  
  
  __commit__ = _get_git_commit()
- __version__ = '1.0.0.dev20241016'
+ __version__ = '1.0.0.dev20241018'
  __root_dir__ = os.path.dirname(os.path.abspath(__file__))
  
  
sky/authentication.py CHANGED
@@ -43,9 +43,9 @@ from sky.adaptors import gcp
  from sky.adaptors import ibm
  from sky.adaptors import kubernetes
  from sky.adaptors import runpod
- from sky.clouds.utils import lambda_utils
  from sky.provision.fluidstack import fluidstack_utils
  from sky.provision.kubernetes import utils as kubernetes_utils
+ from sky.provision.lambda_cloud import lambda_utils
  from sky.utils import common_utils
  from sky.utils import kubernetes_enums
  from sky.utils import subprocess_utils
sky/backends/backend_utils.py CHANGED
@@ -2772,6 +2772,10 @@ def get_endpoints(cluster: str,
      cluster_records = get_clusters(include_controller=True,
                                     refresh=False,
                                     cluster_names=[cluster])
+     if not cluster_records:
+         with ux_utils.print_exception_no_traceback():
+             raise exceptions.ClusterNotUpError(
+                 f'Cluster {cluster!r} not found.', cluster_status=None)
      assert len(cluster_records) == 1, cluster_records
      cluster_record = cluster_records[0]
      if (not skip_status_check and
sky/clouds/aws.py CHANGED
@@ -32,6 +32,14 @@ if typing.TYPE_CHECKING:
  
  logger = sky_logging.init_logger(__name__)
  
+ # Image ID tags
+ _DEFAULT_CPU_IMAGE_ID = 'skypilot:custom-cpu-ubuntu'
+ # For GPU-related package version,
+ # see sky/clouds/service_catalog/images/provisioners/cuda.sh
+ _DEFAULT_GPU_IMAGE_ID = 'skypilot:custom-gpu-ubuntu'
+ _DEFAULT_GPU_K80_IMAGE_ID = 'skypilot:k80-ubuntu-2004'
+ _DEFAULT_NEURON_IMAGE_ID = 'skypilot:neuron-ubuntu-2204'
+ 
  # This local file (under ~/.aws/) will be uploaded to remote nodes (any
  # cloud), if all of the following conditions hold:
  # - the current user identity is not using AWS SSO
@@ -217,17 +225,20 @@ class AWS(clouds.Cloud):
      @classmethod
      def _get_default_ami(cls, region_name: str, instance_type: str) -> str:
          acc = cls.get_accelerators_from_instance_type(instance_type)
-         image_id = service_catalog.get_image_id_from_tag(
-             'skypilot:gpu-ubuntu-2004', region_name, clouds='aws')
+         image_id = service_catalog.get_image_id_from_tag(_DEFAULT_CPU_IMAGE_ID,
+                                                          region_name,
+                                                          clouds='aws')
          if acc is not None:
+             image_id = service_catalog.get_image_id_from_tag(
+                 _DEFAULT_GPU_IMAGE_ID, region_name, clouds='aws')
              assert len(acc) == 1, acc
              acc_name = list(acc.keys())[0]
              if acc_name == 'K80':
                  image_id = service_catalog.get_image_id_from_tag(
-                     'skypilot:k80-ubuntu-2004', region_name, clouds='aws')
+                     _DEFAULT_GPU_K80_IMAGE_ID, region_name, clouds='aws')
              if acc_name in ['Trainium', 'Inferentia']:
                  image_id = service_catalog.get_image_id_from_tag(
-                     'skypilot:neuron-ubuntu-2204', region_name, clouds='aws')
+                     _DEFAULT_NEURON_IMAGE_ID, region_name, clouds='aws')
          if image_id is not None:
              return image_id
          # Raise ResourcesUnavailableError to make sure the failover in
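Note: the net effect of this hunk is that CPU-only instance types stop defaulting to a GPU image. A condensed sketch of the resulting tag selection (the helper function below is ours, for illustration only; the tag strings and branch order come from the hunk above):

```python
from typing import Optional


def expected_image_tag(acc_name: Optional[str]) -> str:
    """Mirrors the branches of _get_default_ami in the hunk above."""
    if acc_name is None:  # CPU-only instance type
        return 'skypilot:custom-cpu-ubuntu'
    if acc_name == 'K80':
        return 'skypilot:k80-ubuntu-2004'
    if acc_name in ('Trainium', 'Inferentia'):
        return 'skypilot:neuron-ubuntu-2204'
    return 'skypilot:custom-gpu-ubuntu'  # all other accelerators


assert expected_image_tag(None) == 'skypilot:custom-cpu-ubuntu'
assert expected_image_tag('V100') == 'skypilot:custom-gpu-ubuntu'
assert expected_image_tag('K80') == 'skypilot:k80-ubuntu-2004'
```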
sky/clouds/gcp.py CHANGED
@@ -483,7 +483,7 @@ class GCP(clouds.Cloud):
          if acc in ('A100-80GB', 'L4'):
              # A100-80GB and L4 have a different name pattern.
              resources_vars['gpu'] = f'nvidia-{acc.lower()}'
-         elif acc == 'H100':
+         elif acc in ('H100', 'H100-MEGA'):
              resources_vars['gpu'] = f'nvidia-{acc.lower()}-80gb'
          else:
              resources_vars['gpu'] = 'nvidia-tesla-{}'.format(
@@ -546,6 +546,11 @@ class GCP(clouds.Cloud):
          resources_vars[
              'force_enable_external_ips'] = skypilot_config.get_nested(
                  ('gcp', 'force_enable_external_ips'), False)
+ 
+         # Add gVNIC from config
+         resources_vars['enable_gvnic'] = skypilot_config.get_nested(
+             ('gcp', 'enable_gvnic'), False)
+ 
          return resources_vars
  
      def _get_feasible_launchable_resources(
sky/clouds/lambda_cloud.py CHANGED
@@ -8,7 +8,7 @@ import requests
  from sky import clouds
  from sky import status_lib
  from sky.clouds import service_catalog
- from sky.clouds.utils import lambda_utils
+ from sky.provision.lambda_cloud import lambda_utils
  from sky.utils import resources_utils
  
  if typing.TYPE_CHECKING:
@@ -48,6 +48,9 @@ class Lambda(clouds.Cloud):
          clouds.CloudImplementationFeatures.HOST_CONTROLLERS: f'Host controllers are not supported in {_REPR}.',
      }
  
+     PROVISIONER_VERSION = clouds.ProvisionerVersion.SKYPILOT
+     STATUS_VERSION = clouds.StatusVersion.SKYPILOT
+ 
      @classmethod
      def _unsupported_features_for_resources(
          cls, resources: 'resources_lib.Resources'
sky/clouds/service_catalog/data_fetchers/fetch_aws.py CHANGED
@@ -538,11 +538,13 @@ if __name__ == '__main__':
      instance_df.to_csv('aws/vms.csv', index=False)
      print('AWS Service Catalog saved to aws/vms.csv')
  
-     image_df = get_all_regions_images_df(user_regions)
-     _check_regions_integrity(image_df, 'images')
+     # Disable refreshing images.csv as we are using skypilot custom AMIs
+     # See sky/clouds/service_catalog/images/README.md for more details.
+     # image_df = get_all_regions_images_df(user_regions)
+     # _check_regions_integrity(image_df, 'images')
  
-     image_df.to_csv('aws/images.csv', index=False)
-     print('AWS Images saved to aws/images.csv')
+     # image_df.to_csv('aws/images.csv', index=False)
+     # print('AWS Images saved to aws/images.csv')
  
      if args.az_mappings:
          az_mappings_df = fetch_availability_zone_mappings()
sky/clouds/service_catalog/data_fetchers/fetch_gcp.py CHANGED
@@ -419,6 +419,11 @@ def _get_gpus_for_zone(zone: str) -> 'pd.DataFrame':
              if count != 8:
                  # H100 only has 8 cards.
                  continue
+         if 'H100-MEGA-80GB' in gpu_name:
+             gpu_name = 'H100-MEGA'
+             if count != 8:
+                 # H100-MEGA only has 8 cards.
+                 continue
          if 'VWS' in gpu_name:
              continue
          if gpu_name.startswith('TPU-'):
@@ -447,6 +452,7 @@ def _gpu_info_from_name(name: str) -> Optional[Dict[str, List[Dict[str, Any]]]]:
          'A100-80GB': 80 * 1024,
          'A100': 40 * 1024,
          'H100': 80 * 1024,
+         'H100-MEGA': 80 * 1024,
          'P4': 8 * 1024,
          'T4': 16 * 1024,
          'V100': 16 * 1024,
@@ -491,12 +497,17 @@ def get_gpu_df(skus: List[Dict[str, Any]],
          if sku['category']['usageType'] != ondemand_or_spot:
              continue
  
-         gpu_name = row['AcceleratorName']
-         if gpu_name == 'A100-80GB':
-             gpu_name = 'A100 80GB'
-         if gpu_name == 'H100':
-             gpu_name = 'H100 80GB'
-         if f'{gpu_name} GPU' not in sku['description']:
+         gpu_names = [row['AcceleratorName']]
+         if gpu_names[0] == 'A100-80GB':
+             gpu_names = ['A100 80GB']
+         if gpu_names[0] == 'H100':
+             gpu_names = ['H100 80GB']
+         if gpu_names[0] == 'H100-MEGA':
+             # Seems that H100-MEGA has two different descriptions in SKUs in
+             # different regions: 'H100 80GB Mega' and 'H100 80GB Plus'.
+             gpu_names = ['H100 80GB Mega', 'H100 80GB Plus']
+         if not any(f'{gpu_name} GPU' in sku['description']
+                    for gpu_name in gpu_names):
              continue
  
          unit_price = _get_unit_price(sku)
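Note: `gpu_names` becomes a list because a single H100-MEGA accelerator may be described two different ways across regional SKUs. A self-contained illustration of the relaxed match (the description string below is invented):

```python
gpu_names = ['H100 80GB Mega', 'H100 80GB Plus']
description = 'Nvidia H100 80GB Plus GPU running in Americas'
# Same any(...) expression as in the hunk above.
assert any(f'{gpu_name} GPU' in description for gpu_name in gpu_names)
```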
sky/clouds/service_catalog/gcp_catalog.py CHANGED
@@ -98,6 +98,9 @@ _ACC_INSTANCE_TYPE_DICTS = {
      },
      'H100': {
          8: ['a3-highgpu-8g'],
+     },
+     'H100-MEGA': {
+         8: ['a3-megagpu-8g'],
      }
  }
  
sky/provision/__init__.py CHANGED
@@ -19,6 +19,7 @@ from sky.provision import cudo
  from sky.provision import fluidstack
  from sky.provision import gcp
  from sky.provision import kubernetes
+ from sky.provision import lambda_cloud
  from sky.provision import runpod
  from sky.provision import vsphere
  from sky.utils import command_runner
@@ -39,6 +40,8 @@ def _route_to_cloud_impl(func):
          provider_name = kwargs.pop('provider_name')
  
          module_name = provider_name.lower()
+         if module_name == 'lambda':
+             module_name = 'lambda_cloud'
          module = globals().get(module_name)
          assert module is not None, f'Unknown provider: {module_name}'
  
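Note: the special case exists because the cloud is registered as `lambda`, which is a reserved keyword in Python and therefore cannot be a module name; the provisioner package is `lambda_cloud` instead. A standalone sketch of the mapping (the function name below is ours, not SkyPilot's):

```python
def resolve_provider_module(provider_name: str) -> str:
    # 'lambda' is a Python keyword, so the module is named lambda_cloud.
    module_name = provider_name.lower()
    if module_name == 'lambda':
        module_name = 'lambda_cloud'
    return module_name


assert resolve_provider_module('Lambda') == 'lambda_cloud'
assert resolve_provider_module('GCP') == 'gcp'
```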
sky/provision/gcp/config.py CHANGED
@@ -670,8 +670,12 @@ def _configure_subnet(region: str, cluster_name: str,
          'accessConfigs': [{
              'name': 'External NAT',
              'type': 'ONE_TO_ONE_NAT',
-         }],
+         }]
      }]
+     # Add gVNIC if specified in config
+     enable_gvnic = config.provider_config.get('enable_gvnic', False)
+     if enable_gvnic:
+         default_interfaces[0]['nicType'] = 'gVNIC'
      enable_external_ips = _enable_external_ips(config)
      if not enable_external_ips:
          # Removing this key means the VM will not be assigned an external IP.
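Note: end to end, the flag travels from user config (`gcp.enable_gvnic`, validated by the schemas.py change below) through `make_deploy_resources_variables` and the gcp-ray.yml.j2 template into `provider_config`, and is finally applied to the first network interface here. A minimal sketch of that last step (the subnetwork value is a placeholder; only the `nicType` handling mirrors the hunk above):

```python
provider_config = {'enable_gvnic': True}  # as rendered from gcp-ray.yml.j2
default_interfaces = [{
    'subnetwork': 'projects/my-proj/regions/us-central1/subnetworks/default',
    'accessConfigs': [{
        'name': 'External NAT',
        'type': 'ONE_TO_ONE_NAT',
    }]
}]
if provider_config.get('enable_gvnic', False):
    default_interfaces[0]['nicType'] = 'gVNIC'
```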
sky/provision/lambda_cloud/__init__.py ADDED
@@ -0,0 +1,11 @@
+ """Lambda provisioner for SkyPilot."""
+ 
+ from sky.provision.lambda_cloud.config import bootstrap_instances
+ from sky.provision.lambda_cloud.instance import cleanup_ports
+ from sky.provision.lambda_cloud.instance import get_cluster_info
+ from sky.provision.lambda_cloud.instance import open_ports
+ from sky.provision.lambda_cloud.instance import query_instances
+ from sky.provision.lambda_cloud.instance import run_instances
+ from sky.provision.lambda_cloud.instance import stop_instances
+ from sky.provision.lambda_cloud.instance import terminate_instances
+ from sky.provision.lambda_cloud.instance import wait_instances
sky/provision/lambda_cloud/config.py ADDED
@@ -0,0 +1,10 @@
+ """Lambda Cloud configuration bootstrapping"""
+ 
+ from sky.provision import common
+ 
+ 
+ def bootstrap_instances(
+         region: str, cluster_name: str,
+         config: common.ProvisionConfig) -> common.ProvisionConfig:
+     del region, cluster_name  # unused
+     return config
sky/provision/lambda_cloud/instance.py ADDED
@@ -0,0 +1,261 @@
+ """Lambda instance provisioning."""
+ 
+ import time
+ from typing import Any, Dict, List, Optional
+ 
+ from sky import authentication as auth
+ from sky import sky_logging
+ from sky import status_lib
+ from sky.provision import common
+ import sky.provision.lambda_cloud.lambda_utils as lambda_utils
+ from sky.utils import common_utils
+ from sky.utils import ux_utils
+ 
+ POLL_INTERVAL = 1
+ 
+ logger = sky_logging.init_logger(__name__)
+ _lambda_client = None
+ 
+ 
+ def _get_lambda_client():
+     global _lambda_client
+     if _lambda_client is None:
+         _lambda_client = lambda_utils.LambdaCloudClient()
+     return _lambda_client
+ 
+ 
+ def _filter_instances(
+         cluster_name_on_cloud: str,
+         status_filters: Optional[List[str]]) -> Dict[str, Dict[str, Any]]:
+     lambda_client = _get_lambda_client()
+     instances = lambda_client.list_instances()
+     possible_names = [
+         f'{cluster_name_on_cloud}-head',
+         f'{cluster_name_on_cloud}-worker',
+     ]
+ 
+     filtered_instances = {}
+     for instance in instances:
+         if (status_filters is not None and
+                 instance['status'] not in status_filters):
+             continue
+         if instance.get('name') in possible_names:
+             filtered_instances[instance['id']] = instance
+     return filtered_instances
+ 
+ 
+ def _get_head_instance_id(instances: Dict[str, Any]) -> Optional[str]:
+     head_instance_id = None
+     for instance_id, instance in instances.items():
+         if instance['name'].endswith('-head'):
+             head_instance_id = instance_id
+             break
+     return head_instance_id
+ 
+ 
+ def _get_ssh_key_name(prefix: str = '') -> str:
+     lambda_client = _get_lambda_client()
+     _, public_key_path = auth.get_or_generate_keys()
+     with open(public_key_path, 'r', encoding='utf-8') as f:
+         public_key = f.read()
+     name, exists = lambda_client.get_unique_ssh_key_name(prefix, public_key)
+     if not exists:
+         raise lambda_utils.LambdaCloudError('SSH key not found')
+     return name
+ 
+ 
+ def run_instances(region: str, cluster_name_on_cloud: str,
+                   config: common.ProvisionConfig) -> common.ProvisionRecord:
+     """Runs instances for the given cluster"""
+     lambda_client = _get_lambda_client()
+     pending_status = ['booting']
+     while True:
+         instances = _filter_instances(cluster_name_on_cloud, pending_status)
+         if not instances:
+             break
+         logger.info(f'Waiting for {len(instances)} instances to be ready.')
+         time.sleep(POLL_INTERVAL)
+     exist_instances = _filter_instances(cluster_name_on_cloud, ['active'])
+     head_instance_id = _get_head_instance_id(exist_instances)
+ 
+     to_start_count = config.count - len(exist_instances)
+     if to_start_count < 0:
+         raise RuntimeError(
+             f'Cluster {cluster_name_on_cloud} already has '
+             f'{len(exist_instances)} nodes, but {config.count} are required.')
+     if to_start_count == 0:
+         if head_instance_id is None:
+             raise RuntimeError(
+                 f'Cluster {cluster_name_on_cloud} has no head node.')
+         logger.info(f'Cluster {cluster_name_on_cloud} already has '
+                     f'{len(exist_instances)} nodes, no need to start more.')
+         return common.ProvisionRecord(
+             provider_name='lambda',
+             cluster_name=cluster_name_on_cloud,
+             region=region,
+             zone=None,
+             head_instance_id=head_instance_id,
+             resumed_instance_ids=[],
+             created_instance_ids=[],
+         )
+ 
+     created_instance_ids = []
+     ssh_key_name = _get_ssh_key_name()
+ 
+     def launch_nodes(node_type: str, quantity: int) -> List[str]:
+         try:
+             instance_ids = lambda_client.create_instances(
+                 instance_type=config.node_config['InstanceType'],
+                 region=region,
+                 name=f'{cluster_name_on_cloud}-{node_type}',
+                 quantity=quantity,
+                 ssh_key_name=ssh_key_name,
+             )
+             logger.info(f'Launched {len(instance_ids)} {node_type} node(s), '
+                         f'instance_ids: {instance_ids}')
+             return instance_ids
+         except Exception as e:
+             logger.warning(f'run_instances error: {e}')
+             raise
+ 
+     if head_instance_id is None:
+         instance_ids = launch_nodes('head', 1)
+         assert len(instance_ids) == 1
+         created_instance_ids.append(instance_ids[0])
+         head_instance_id = instance_ids[0]
+ 
+     assert head_instance_id is not None, 'head_instance_id should not be None'
+ 
+     worker_node_count = to_start_count - 1
+     if worker_node_count > 0:
+         instance_ids = launch_nodes('worker', worker_node_count)
+         created_instance_ids.extend(instance_ids)
+ 
+     while True:
+         instances = _filter_instances(cluster_name_on_cloud, ['active'])
+         if len(instances) == config.count:
+             break
+ 
+         time.sleep(POLL_INTERVAL)
+ 
+     return common.ProvisionRecord(
+         provider_name='lambda',
+         cluster_name=cluster_name_on_cloud,
+         region=region,
+         zone=None,
+         head_instance_id=head_instance_id,
+         resumed_instance_ids=[],
+         created_instance_ids=created_instance_ids,
+     )
+ 
+ 
+ def wait_instances(region: str, cluster_name_on_cloud: str,
+                    state: Optional[status_lib.ClusterStatus]) -> None:
+     del region, cluster_name_on_cloud, state  # Unused.
+ 
+ 
+ def stop_instances(
+     cluster_name_on_cloud: str,
+     provider_config: Optional[Dict[str, Any]] = None,
+     worker_only: bool = False,
+ ) -> None:
+     raise NotImplementedError(
+         'stop_instances is not supported for Lambda Cloud')
+ 
+ 
+ def terminate_instances(
+     cluster_name_on_cloud: str,
+     provider_config: Optional[Dict[str, Any]] = None,
+     worker_only: bool = False,
+ ) -> None:
+     """See sky/provision/__init__.py"""
+     del provider_config
+     lambda_client = _get_lambda_client()
+     instances = _filter_instances(cluster_name_on_cloud, None)
+ 
+     instance_ids_to_terminate = []
+     for instance_id, instance in instances.items():
+         if worker_only and not instance['name'].endswith('-worker'):
+             continue
+         instance_ids_to_terminate.append(instance_id)
+ 
+     try:
+         logger.debug(
+             f'Terminating instances {", ".join(instance_ids_to_terminate)}')
+         lambda_client.remove_instances(instance_ids_to_terminate)
+     except Exception as e:  # pylint: disable=broad-except
+         with ux_utils.print_exception_no_traceback():
+             raise RuntimeError(
+                 f'Failed to terminate instances {instance_ids_to_terminate}: '
+                 f'{common_utils.format_exception(e, use_bracket=False)}') from e
+ 
+ 
+ def get_cluster_info(
+     region: str,
+     cluster_name_on_cloud: str,
+     provider_config: Optional[Dict[str, Any]] = None,
+ ) -> common.ClusterInfo:
+     del region  # unused
+     running_instances = _filter_instances(cluster_name_on_cloud, ['active'])
+     instances: Dict[str, List[common.InstanceInfo]] = {}
+     head_instance_id = None
+     for instance_id, instance_info in running_instances.items():
+         instances[instance_id] = [
+             common.InstanceInfo(
+                 instance_id=instance_id,
+                 internal_ip=instance_info['private_ip'],
+                 external_ip=instance_info['ip'],
+                 ssh_port=22,
+                 tags={},
+             )
+         ]
+         if instance_info['name'].endswith('-head'):
+             head_instance_id = instance_id
+ 
+     return common.ClusterInfo(
+         instances=instances,
+         head_instance_id=head_instance_id,
+         provider_name='lambda',
+         provider_config=provider_config,
+     )
+ 
+ 
+ def query_instances(
+     cluster_name_on_cloud: str,
+     provider_config: Optional[Dict[str, Any]] = None,
+     non_terminated_only: bool = True,
+ ) -> Dict[str, Optional[status_lib.ClusterStatus]]:
+     """See sky/provision/__init__.py"""
+     assert provider_config is not None, (cluster_name_on_cloud, provider_config)
+     instances = _filter_instances(cluster_name_on_cloud, None)
+ 
+     status_map = {
+         'booting': status_lib.ClusterStatus.INIT,
+         'active': status_lib.ClusterStatus.UP,
+         'unhealthy': status_lib.ClusterStatus.INIT,
+         'terminating': status_lib.ClusterStatus.INIT,
+     }
+     statuses: Dict[str, Optional[status_lib.ClusterStatus]] = {}
+     for instance_id, instance in instances.items():
+         status = status_map.get(instance['status'])
+         if non_terminated_only and status is None:
+             continue
+         statuses[instance_id] = status
+     return statuses
+ 
+ 
+ def open_ports(
+     cluster_name_on_cloud: str,
+     ports: List[str],
+     provider_config: Optional[Dict[str, Any]] = None,
+ ) -> None:
+     raise NotImplementedError('open_ports is not supported for Lambda Cloud')
+ 
+ 
+ def cleanup_ports(
+     cluster_name_on_cloud: str,
+     ports: List[str],
+     provider_config: Optional[Dict[str, Any]] = None,
+ ) -> None:
+     """See sky/provision/__init__.py"""
+     del cluster_name_on_cloud, ports, provider_config  # Unused.
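Note: `query_instances` collapses Lambda's lifecycle states onto SkyPilot's coarser cluster statuses; only `active` maps to UP, and any state missing from the map (e.g. an already-terminated record) is dropped when `non_terminated_only` is set. A toy run of that loop (the records and stand-in status values below are invented):

```python
INIT, UP = 'INIT', 'UP'  # stand-ins for status_lib.ClusterStatus members
status_map = {
    'booting': INIT,
    'active': UP,
    'unhealthy': INIT,
    'terminating': INIT,
}
instances = {
    'i-1': {'status': 'active'},
    'i-2': {'status': 'booting'},
    'i-3': {'status': 'terminated'},  # not in status_map
}
# Equivalent to the query_instances loop with non_terminated_only=True.
statuses = {
    iid: status_map[inst['status']]
    for iid, inst in instances.items()
    if inst['status'] in status_map
}
assert statuses == {'i-1': UP, 'i-2': INIT}
```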
sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py RENAMED
@@ -1,4 +1,5 @@
  """Lambda Cloud helper functions."""
+ 
  import json
  import os
  import time
@@ -76,7 +77,7 @@ class Metadata:
  
  
  def raise_lambda_error(response: requests.Response) -> None:
-     """Raise LambdaCloudError if appropriate. """
+     """Raise LambdaCloudError if appropriate."""
      status_code = response.status_code
      if status_code == 200:
          return
@@ -131,20 +132,22 @@ class LambdaCloudClient:
          self.api_key = self._credentials['api_key']
          self.headers = {'Authorization': f'Bearer {self.api_key}'}
  
-     def create_instances(self,
-                          instance_type: str = 'gpu_1x_a100_sxm4',
-                          region: str = 'us-east-1',
-                          quantity: int = 1,
-                          name: str = '',
-                          ssh_key_name: str = '') -> List[str]:
+     def create_instances(
+         self,
+         instance_type: str = 'gpu_1x_a100_sxm4',
+         region: str = 'us-east-1',
+         quantity: int = 1,
+         name: str = '',
+         ssh_key_name: str = '',
+     ) -> List[str]:
          """Launch new instances."""
          # Optimization:
          # Most API requests are rate limited at ~1 request every second but
          # launch requests are rate limited at ~1 request every 10 seconds.
          # So don't use launch requests to check availability.
          # See https://docs.lambdalabs.com/cloud/rate-limiting/ for more.
-         available_regions = self.list_catalog()[instance_type]\
-             ['regions_with_capacity_available']
+         available_regions = (self.list_catalog()[instance_type]
+                              ['regions_with_capacity_available'])
          available_regions = [reg['name'] for reg in available_regions]
          if region not in available_regions:
              if len(available_regions) > 0:
@@ -163,27 +166,25 @@ class LambdaCloudClient:
              'instance_type_name': instance_type,
              'ssh_key_names': [ssh_key_name],
              'quantity': quantity,
-             'name': name
+             'name': name,
          })
          response = _try_request_with_backoff(
              'post',
              f'{API_ENDPOINT}/instance-operations/launch',
              data=data,
-             headers=self.headers)
+             headers=self.headers,
+         )
          return response.json().get('data', []).get('instance_ids', [])
  
-     def remove_instances(self, *instance_ids: str) -> Dict[str, Any]:
+     def remove_instances(self, instance_ids: List[str]) -> Dict[str, Any]:
          """Terminate instances."""
-         data = json.dumps({
-             'instance_ids': [
-                 instance_ids[0]  # TODO(ewzeng) don't hardcode
-             ]
-         })
+         data = json.dumps({'instance_ids': instance_ids})
          response = _try_request_with_backoff(
              'post',
              f'{API_ENDPOINT}/instance-operations/terminate',
              data=data,
-             headers=self.headers)
+             headers=self.headers,
+         )
          return response.json().get('data', []).get('terminated_instances', [])
  
      def list_instances(self) -> List[Dict[str, Any]]:
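Note: the `remove_instances` change fixes a real bug along with the style: the old varargs version only ever sent `instance_ids[0]` (see the removed TODO), so terminating a multi-node cluster could leak instances. Call sites now pass a single list (a sketch; the IDs are made up and a real call needs Lambda Cloud credentials):

```python
from sky.provision.lambda_cloud import lambda_utils

client = lambda_utils.LambdaCloudClient()
# Old: client.remove_instances('inst-aaa', 'inst-bbb') terminated only
# 'inst-aaa' because the request body hardcoded instance_ids[0].
# New: every ID in the list is sent to the terminate endpoint.
client.remove_instances(['inst-aaa', 'inst-bbb'])
```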
sky/setup_files/MANIFEST.in CHANGED
@@ -6,7 +6,6 @@ include sky/setup_files/*
  include sky/skylet/*.sh
  include sky/skylet/LICENSE
  include sky/skylet/providers/ibm/*
- include sky/skylet/providers/lambda_cloud/*
  include sky/skylet/providers/oci/*
  include sky/skylet/providers/scp/*
  include sky/skylet/providers/*.py
sky/templates/gcp-ray.yml.j2 CHANGED
@@ -64,6 +64,9 @@ provider:
    # leakage.
    disable_launch_config_check: true
    use_managed_instance_group: {{ gcp_use_managed_instance_group }}
+ {%- if enable_gvnic %}
+   enable_gvnic: {{ enable_gvnic }}
+ {%- endif %}
  
  auth:
    ssh_user: gcpuser
sky/templates/lambda-ray.yml.j2 CHANGED
@@ -7,7 +7,7 @@ idle_timeout_minutes: 60
  
  provider:
    type: external
-   module: sky.skylet.providers.lambda_cloud.LambdaNodeProvider
+   module: sky.provision.lambda
    region: {{region}}
    # Disable launch config check for worker nodes as it can cause resource
    # leakage.
@@ -25,14 +25,6 @@ available_node_types:
      resources: {}
      node_config:
        InstanceType: {{instance_type}}
- {% if num_nodes > 1 %}
-   ray_worker_default:
-     min_workers: {{num_nodes - 1}}
-     max_workers: {{num_nodes - 1}}
-     resources: {}
-     node_config:
-       InstanceType: {{instance_type}}
- {%- endif %}
  
  head_node_type: ray_head_default
  
@@ -64,7 +56,10 @@ setup_commands:
    # Line 'sudo grep ..': set the number of threads per process to unlimited to avoid ray job submit stucking issue when the number of running ray jobs increase.
    # Line 'mkdir -p ..': disable host key check
    # Line 'python3 -c ..': patch the buggy ray files and enable `-o allow_other` option for `goofys`
-   - sudo systemctl stop unattended-upgrades || true;
+   - {%- for initial_setup_command in initial_setup_commands %}
+     {{ initial_setup_command }}
+     {%- endfor %}
+     sudo systemctl stop unattended-upgrades || true;
      sudo systemctl disable unattended-upgrades || true;
      sudo sed -i 's/Unattended-Upgrade "1"/Unattended-Upgrade "0"/g' /etc/apt/apt.conf.d/20auto-upgrades || true;
      sudo kill -9 `sudo lsof /var/lib/dpkg/lock-frontend | awk '{print $2}' | tail -n 1` || true;
@@ -81,31 +76,5 @@ setup_commands:
      mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n" >> ~/.ssh/config;
      [ -f /etc/fuse.conf ] && sudo sed -i 's/#user_allow_other/user_allow_other/g' /etc/fuse.conf || (sudo sh -c 'echo "user_allow_other" > /etc/fuse.conf');
  
- # Command to start ray on the head node. You don't need to change this.
- # NOTE: these are very performance-sensitive. Each new item opens/closes an SSH
- # connection, which is expensive. Try your best to co-locate commands into fewer
- # items! The same comment applies for worker_start_ray_commands.
- #
- # Increment the following for catching performance bugs easier:
- # current num items (num SSH connections): 2
- head_start_ray_commands:
-   - {{ sky_activate_python_env }}; {{ sky_ray_cmd }} stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 {{ sky_ray_cmd }} start --disable-usage-stats --head --port={{ray_port}} --min-worker-port 11002 --dashboard-port={{ray_dashboard_port}} --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1;
-     which prlimit && for id in $(pgrep -f raylet/raylet); do sudo prlimit --nofile=1048576:1048576 --pid=$id || true; done;
-     {{dump_port_command}}; {{ray_head_wait_initialized_command}}
- 
- {%- if num_nodes > 1 %}
- worker_start_ray_commands:
-   - {{ sky_activate_python_env }}; {{ sky_ray_cmd }} stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 {{ sky_ray_cmd }} start --disable-usage-stats --address=$RAY_HEAD_IP:{{ray_port}} --min-worker-port 11002 --object-manager-port=8076 {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1;
-     which prlimit && for id in $(pgrep -f raylet/raylet); do sudo prlimit --nofile=1048576:1048576 --pid=$id || true; done;
- {%- else %}
- worker_start_ray_commands: []
- {%- endif %}
- 
- head_node: {}
- worker_nodes: {}
- 
- # These fields are required for external cloud providers.
- head_setup_commands: []
- worker_setup_commands: []
- cluster_synced_files: []
- file_mounts_sync_continuously: False
+ # Command to start ray clusters are now placed in `sky.provision.instance_setup`.
+ # We do not need to list it here anymore.
sky/utils/schemas.py CHANGED
@@ -755,6 +755,9 @@ def get_config_schema():
      'force_enable_external_ips': {
          'type': 'boolean'
      },
+     'enable_gvnic': {
+         'type': 'boolean'
+     },
      **_LABELS_SCHEMA,
      **_NETWORK_CONFIG_SCHEMA,
  },
{skypilot_nightly-1.0.0.dev20241016.dist-info → skypilot_nightly-1.0.0.dev20241018.dist-info}/METADATA RENAMED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: skypilot-nightly
- Version: 1.0.0.dev20241016
+ Version: 1.0.0.dev20241018
  Summary: SkyPilot: An intercloud broker for the clouds
  Author: SkyPilot Team
  License: Apache 2.0
{skypilot_nightly-1.0.0.dev20241016.dist-info → skypilot_nightly-1.0.0.dev20241018.dist-info}/RECORD RENAMED
@@ -1,6 +1,6 @@
- sky/__init__.py,sha256=19EG_Nr4EJcbkyLvfF_ZmWhAbfEysS498RvephjOslM,5854
+ sky/__init__.py,sha256=ooJaoPt0Vq10nF2ftXGThCKQFJ2HbgQNKQ7Dp6Qg6s4,5854
  sky/admin_policy.py,sha256=hPo02f_A32gCqhUueF0QYy1fMSSKqRwYEg_9FxScN_s,3248
- sky/authentication.py,sha256=TfKkVnmRIetATSEVQFp-rOOIRGqVig2i8faSQQt_ixA,20974
+ sky/authentication.py,sha256=pAdCT60OxxiXI9KXDyP2lQ9u9vMc6aMtq5Xi2h_hbdw,20984
  sky/check.py,sha256=jLMIIJrseaZj1_o5WkbaD9XdyXIlCaT6pyAaIFdhdmA,9079
  sky/cli.py,sha256=PJR6W92twf89j17OWLQJ9RawdazJcGslfW2L_fLB2PM,208545
  sky/cloud_stores.py,sha256=RjFgmRhUh1Kk__f6g3KxzLp9s7dA0pFK4W1AukEuUaw,21153
@@ -30,7 +30,7 @@ sky/adaptors/runpod.py,sha256=4Nt_BfZhJAKQNA3wO8cxvvNI8x4NsDGHu_4EhRDlGYQ,225
  sky/adaptors/vsphere.py,sha256=zJP9SeObEoLrpgHW2VHvZE48EhgVf8GfAEIwBeaDMfM,2129
  sky/backends/__init__.py,sha256=UDjwbUgpTRApbPJnNfR786GadUuwgRk3vsWoVu5RB_c,536
  sky/backends/backend.py,sha256=wwfbrxPhjMPs6PSyy3tAHI8WJhl-xhgzWBsAZjmJJ6g,6249
- sky/backends/backend_utils.py,sha256=u9P7Fd3DB9LaOq51fK7kwKpxtgFGGWmgULY6GoLSUPM,126791
+ sky/backends/backend_utils.py,sha256=PA21DAXspXuTZDQ5qA3G5RGJ0oUTpJ7XatRRvhtmtt0,126993
  sky/backends/cloud_vm_ray_backend.py,sha256=9mCLLRUD-x3ksiiPbhrMDsZWIPNU9cVSQwwpmxSia7k,236881
  sky/backends/docker_utils.py,sha256=Hyw1YY20EyghhEbYx6O2FIMDcGkNzBzV9TM7LFynei8,8358
  sky/backends/local_docker_backend.py,sha256=0JL5m0YUgOmOL4aWEUe4tmt89dsxjk4_WXkPwgEKEis,16801
@@ -40,16 +40,16 @@ sky/benchmark/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  sky/benchmark/benchmark_state.py,sha256=X8CXmuU9KgsDRhKedhFgjeRMUFWtQsjFs1qECvPG2yg,8723
  sky/benchmark/benchmark_utils.py,sha256=eb-i6zYoo-Zkod-T9qtCu1FcYLw--Yyos1SyibUPZNE,26194
  sky/clouds/__init__.py,sha256=WuNIJEnZmBO72tU5awgaaL3rdvFRSkgaYNNeuY68dXo,1356
- sky/clouds/aws.py,sha256=uwz4aesgA2o6WNDhZlrXafQFbILv4EiUPNP67Cc8WGE,48978
+ sky/clouds/aws.py,sha256=XJVbOSkVVUHp9HbHDp0rFdHX113JHbY-3sgokGdNJVE,49527
  sky/clouds/azure.py,sha256=Yp_a1Lzvq4s47eRMeyVheDv9pC0hSPogCiTMYf-a5ZE,28687
  sky/clouds/cloud.py,sha256=PPk-Cbf1YbJT8bswcQLtPBtko02OWrRGJKkLzDpytTI,34858
  sky/clouds/cloud_registry.py,sha256=4yQMv-iBSgyN5aNL4Qxbn0JVE-dkVoEUIgj7S1z9S_Q,955
  sky/clouds/cudo.py,sha256=H4VyMo5wWGAv2MXZ3xsbWjlZA_cZYnt4ecNlTOOao8Y,13147
  sky/clouds/fluidstack.py,sha256=iOmoOx52yTrHKMzwBDaxFJCfNo79M61d5tj-Np24Lyc,12436
- sky/clouds/gcp.py,sha256=FKHqtF4YMY06pseloMEbnt4XwIQ5ErDLlrvyXzIzZa4,54308
+ sky/clouds/gcp.py,sha256=lUImS2WJIcUOtrgrVz8zaR4yPGqALqZ0lSmLbjN9xLU,54470
  sky/clouds/ibm.py,sha256=M8QdjeSFlwssfoY2aOodxG4q5R3eT9K-4lTPDHYvEYI,21476
  sky/clouds/kubernetes.py,sha256=aWoXWR-S4puZHzuUHroLKxLdTpkqU7j75dQlXECnsmE,28679
- sky/clouds/lambda_cloud.py,sha256=2Al3qCSl-I4iTi7pPPNXcbaLyVfCUgTl__vYBunLB6k,12439
+ sky/clouds/lambda_cloud.py,sha256=VtJ2mmwMT1X4zrzgt3FXM61zmrrgoELZHFgsdYVesPY,12562
  sky/clouds/oci.py,sha256=WXtxKwDBgi3He4ayi4qzJ4Y659Bi6xU8hWmYLHwiQYs,27371
  sky/clouds/paperspace.py,sha256=lmUZPYAblaqiBmGQwCunccMiTF_dVA1o3vqY9Q_Nc28,10921
  sky/clouds/runpod.py,sha256=lstUC6f4JDhtcH9NfwkbpCJMmfmvMigoanhPXPbTYds,11540
@@ -63,7 +63,7 @@ sky/clouds/service_catalog/config.py,sha256=ylzqewdEBjDg4awvFek6ldYmFrnvD2bVGLZu
  sky/clouds/service_catalog/constants.py,sha256=ai2yOlsVqBnEpbxaEHXt61COsHBLwOfw6GZXntEPj7k,411
  sky/clouds/service_catalog/cudo_catalog.py,sha256=QXAOpx5fJ_cGCr5LbB7wpHMfKIla7G-q_mMJnv_ArTA,4652
  sky/clouds/service_catalog/fluidstack_catalog.py,sha256=c8MMTldG-q97MJ0zJymudQiOVQC_rxS7vqrZgLrgbQA,5038
- sky/clouds/service_catalog/gcp_catalog.py,sha256=MHWq_-jqm68oNpK1i8AlJIGBkSKT-P6xX7DkpvqvpHU,24323
+ sky/clouds/service_catalog/gcp_catalog.py,sha256=v_5fsB3dB9oD8U7lBKnCe5ii6AUWEOiQjNarMnU_qLA,24379
  sky/clouds/service_catalog/ibm_catalog.py,sha256=0dzjmXABFECzaAuIa0E6pVINhVK6-G6U52Mj-L45gK8,4472
  sky/clouds/service_catalog/kubernetes_catalog.py,sha256=6OocEUkgyJtBgHwzu4RPsvru6pj6RwGU-4uSFNQmsSM,8254
  sky/clouds/service_catalog/lambda_catalog.py,sha256=BAhUGqHj8aVe1zUhEQNO7bQUhcd9jAespGvPyQubTJY,5281
@@ -74,17 +74,16 @@ sky/clouds/service_catalog/scp_catalog.py,sha256=4XnaZE5Q4XrrNnDnVhsHkH6jxmWXBeQ
  sky/clouds/service_catalog/vsphere_catalog.py,sha256=yJLWu9SQep-PRn1YdeQ7ZoNqQHTAxJtxf7y6FBrfSW0,4391
  sky/clouds/service_catalog/data_fetchers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  sky/clouds/service_catalog/data_fetchers/analyze.py,sha256=VdksJQs3asFE8H5T3ZV1FJas2xD9WEX6c-V5p7y-wp4,2084
- sky/clouds/service_catalog/data_fetchers/fetch_aws.py,sha256=6gpRtQaQtvT1cMpiVBacNXXQAjBC5MWpAUI-1ELYg0U,22850
+ sky/clouds/service_catalog/data_fetchers/fetch_aws.py,sha256=ro2zazdkDF6z9bE7QFyjoeb4VFxmbNZ1WK5IQrdoQWk,23003
  sky/clouds/service_catalog/data_fetchers/fetch_azure.py,sha256=jsSVqbSbBIw_IYmO-y2u4co20AJ-JF713KFjUKdO_VA,12272
  sky/clouds/service_catalog/data_fetchers/fetch_cudo.py,sha256=52P48lvWN0s1ArjeLPeLemPRpxjSRcHincRle0nqdm4,3440
  sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py,sha256=35nO_VaDOgp5W13kt_lIANSk_CNf7gBiZGJ5fGyZu6o,6808
- sky/clouds/service_catalog/data_fetchers/fetch_gcp.py,sha256=2VebGjKkCckQZgOW3MBlfjFvA56U7eaIAh2q-6NYcL0,29070
+ sky/clouds/service_catalog/data_fetchers/fetch_gcp.py,sha256=VHwYIPX1kGOvGQ67mtvhKe1enmKFF3knveIktDwdYio,29633
  sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py,sha256=B7H14so38zayuJGgUrD1PJYJKiVZHGnwH6JJop3F7o0,4918
  sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py,sha256=SF_gTU74qg6L-DSWneCAbqP0lwZXaaDi5otiMIJbrw0,21462
  sky/clouds/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  sky/clouds/utils/aws_utils.py,sha256=W5BRC-2F_VY4BymRA1kS6-MufsI3V8cfY_hv--4gJBU,1986
  sky/clouds/utils/gcp_utils.py,sha256=Xc_COjJfDt__oqVwrCw7ejY2B7ptHjMjDVb8obcpJ6s,6968
- sky/clouds/utils/lambda_utils.py,sha256=n_GwXhswehDOJrby1fGyPXSQuPmPTPQqXZzV4xQm9KI,9991
  sky/clouds/utils/oci_utils.py,sha256=LT_RtPQ2B1wlSF0e9PSD3NWxFFIzovcZeDjO-dyOghU,4482
  sky/clouds/utils/scp_utils.py,sha256=RUp7NwyhKygOoVOwvdAOGdoQNSJjryOG6WSExCf-yas,15812
  sky/data/__init__.py,sha256=Nhaf1NURisXpZuwWANa2IuCyppIuc720FRwqSE2oEwY,184
@@ -103,7 +102,7 @@ sky/jobs/utils.py,sha256=lYfWkEAPVnYcj2nT6VYdM6PCaWKUH6_AD4TAV_sVCkY,36376
  sky/jobs/dashboard/dashboard.py,sha256=HFShuaxKir97QTeK2x37h6bsY6ncaFaNEg1USZqJPdc,3050
  sky/jobs/dashboard/static/favicon.ico,sha256=uYlvgxSM7gjBmXpZ8wydvZUPAbJiiix-rc2Xe5mma9s,15086
  sky/jobs/dashboard/templates/index.html,sha256=DBKMYEkkJ6sgLYod9ro7drgL8Y_neDsCx_WbwhWDsWM,9837
- sky/provision/__init__.py,sha256=ZBahWZgLw63EvYGBpxTJNnys_bb7TpdtQpESWRyoWC8,6146
+ sky/provision/__init__.py,sha256=UhYsGRribEyK1--PPT0Dom9051jlpdn8UCNhO8qpPOc,6262
  sky/provision/common.py,sha256=E8AlSUFcn0FYQq1erNmoVfMAdsF9tP2yxfyk-9PLvQU,10286
  sky/provision/constants.py,sha256=DvHj3wpqdpaSBHMOGIfVWLLWGJoz0eOQAx73DwYMNEk,531
  sky/provision/docker_utils.py,sha256=Z7vDUs9Yjqks_CsWrACcTgABIZuFi3EJVFwkU0WsdD0,18832
@@ -131,7 +130,7 @@ sky/provision/fluidstack/config.py,sha256=hDqesKEVjIhXLTWej3fDdpbHtKBXoybxFGgC6T
  sky/provision/fluidstack/fluidstack_utils.py,sha256=Y21y2IAiHPLk_b-Lp-ld26SZSfWARhxdDEiu7MtfBmc,5693
  sky/provision/fluidstack/instance.py,sha256=jZb_zJFkyCZw3QV3Dt-GbUxl_2HR1kxvOxS7G3lkWcA,13708
  sky/provision/gcp/__init__.py,sha256=zlgjR2JoaGD7sStGStMRu9bJ62f-8NKEIyb-bFHBlzM,528
- sky/provision/gcp/config.py,sha256=THxFEtl9FjRvn9WJPYHbPjUaYNI5dXJRrcaznhRfM9k,33161
+ sky/provision/gcp/config.py,sha256=i0PhR1ybGErQiPT8cD6E5OFB7LD6sub4Rc-mhgTREVI,33340
  sky/provision/gcp/constants.py,sha256=ojerfnNEeayJn-0aJq2Uq1iTchxOkpruKrPBbHmdiEw,7448
  sky/provision/gcp/instance.py,sha256=l2-1nHj4pUoHqOu8HMN1hT1bwd4Q96X8MXgOPsNJUN8,25184
  sky/provision/gcp/instance_utils.py,sha256=veRBr6Oziv0KaUdC4acuWeaOremNV0gMYCCHaSvY7c8,70943
@@ -144,6 +143,10 @@ sky/provision/kubernetes/network_utils.py,sha256=t1FS3K400fetH7cBuRgQJZl5_jEeMsh
  sky/provision/kubernetes/utils.py,sha256=2N5c4yA7CEn4DjvCiUO73W4XDEjgixcJRVdgs913QQE,89523
  sky/provision/kubernetes/manifests/smarter-device-manager-configmap.yaml,sha256=AMzYzlY0JIlfBWj5eX054Rc1XDW2thUcLSOGMJVhIdA,229
  sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml,sha256=RtTq4F1QUmR2Uunb6zuuRaPhV7hpesz4saHjn3Ncsb4,2010
+ sky/provision/lambda_cloud/__init__.py,sha256=6EEvSgtUeEiup9ivIFevHmgv0GqleroO2X0K7TRa2nE,612
+ sky/provision/lambda_cloud/config.py,sha256=jq1iLzp4Up61r4JGxvtpVbJlgXnea3LHYQhCQyyl7ik,272
+ sky/provision/lambda_cloud/instance.py,sha256=5-XuX-KwlRq8y62NXNzY_p6aJs4iCPGBf5U4pIR4liI,8975
+ sky/provision/lambda_cloud/lambda_utils.py,sha256=H8uaaMEpLn5cqGCdhUH_oJiccv_cuMguUNAl0NqB0Ik,9873
  sky/provision/paperspace/__init__.py,sha256=1nbUPWio7UA5gCQkO_rfEDfgXT17u5OtuByxQx4Ez6g,598
  sky/provision/paperspace/config.py,sha256=oNmffSt-V466pE0DmML8hOCX1CiA24jAqE5JEKuqpyI,1541
  sky/provision/paperspace/constants.py,sha256=NcLJGivJxshJwhR28yVHysWQ2gtMAkTVmHC91d3kyKM,957
@@ -180,7 +183,7 @@ sky/serve/serve_state.py,sha256=5BZSKKKxQRk-0mku17Ch4Veu4qOhaFvaOJY3zrZCkLw,1931
  sky/serve/serve_utils.py,sha256=im_1cJoJmufFxkBVnhK4nI6XlHvEXersQyIivNruJJc,38009
  sky/serve/service.py,sha256=fkfJvNJ2BO6rfV0TblZG-QkOXaCyZlpkwbGgrsTzf2w,11872
  sky/serve/service_spec.py,sha256=iRhW95SERvb4NWtV10uCuhgvW31HuSAmZZ55OX0WK8s,15309
- sky/setup_files/MANIFEST.in,sha256=BAR1TfVIHwBFfV3najggE8HDXTJyO3fNN0Yhu5aTitI,634
+ sky/setup_files/MANIFEST.in,sha256=CXz8lIJMgWlH9TvYgzIL3vPFtSDoQq-UMfD9K62rtH4,590
  sky/setup_files/setup.py,sha256=o4IgiwFoTB6Sdn3MmOirUIS0OSkoh6qo_0vrgcmrYA4,12093
  sky/skylet/LICENSE,sha256=BnFrJSvUFpMUoH5mOpWnEvaC5R6Uux8W6WXgrte8iYg,12381
  sky/skylet/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -200,8 +203,6 @@ sky/skylet/providers/ibm/__init__.py,sha256=GXo5F9ztvs0qMDI_G9wM5KvzySfYugslJMHH
  sky/skylet/providers/ibm/node_provider.py,sha256=olNtCoCxjXTT-C_youwdQ9UF1DPgO8OVwDueotGFaJI,38280
  sky/skylet/providers/ibm/utils.py,sha256=63vhKqLLOhAZdibSp8VWWONeyCER9F6U2VLrSpzlizk,1292
  sky/skylet/providers/ibm/vpc_provider.py,sha256=GiOGlWYqqeBETfAeKqVj2-9shsMSP7z1WnO8UP5JTNo,34630
- sky/skylet/providers/lambda_cloud/__init__.py,sha256=DNxB-NL97FU7ptGcXTnIZZRt6ZLtNVtp3nayJNaxjhY,112
- sky/skylet/providers/lambda_cloud/node_provider.py,sha256=77AoTaXuSW5cr4Z2cEAbAsHx1e4l8c0fc1ABbdLduKE,14042
  sky/skylet/providers/oci/__init__.py,sha256=LRMTj6OhQoxiFJw4uNxG8cn6PllP8A-lGJL3Cs5DJok,91
  sky/skylet/providers/oci/node_provider.py,sha256=YPqiRag_cysvYMIMDGbMn6lOumvHad6FLJB5DGPr00Q,20492
  sky/skylet/providers/oci/query_helper.py,sha256=dUsvPGzWPNF5O2NjQvuC8tkilT4H11gMj6R7Qel2fDc,17202
@@ -221,7 +222,7 @@ sky/templates/aws-ray.yml.j2,sha256=K0rAuyf1XC_GPFp1BR9df42-Be12A6T2UF0BllVSpYg,
  sky/templates/azure-ray.yml.j2,sha256=RtYAcAmFQd6TB3j-pbxi7ekjWhznqFhJtzdkqH_nXqM,6135
  sky/templates/cudo-ray.yml.j2,sha256=SEHVY57iBauCOE2HYJtYVFEKlriAkdwQu_p86a1n_bA,3548
  sky/templates/fluidstack-ray.yml.j2,sha256=t8TCULgiErCZdtFmBZVsA8ZdcqR7ccwsmQhuDFTBEAU,3541
- sky/templates/gcp-ray.yml.j2,sha256=q2xSWxxYI8MVAq_mA__8FF6PwEqXCAW1SOEOGTt0qPw,9591
+ sky/templates/gcp-ray.yml.j2,sha256=y95B-Nk6hFxm6vEIaxI1wFzAIcy_GcKC3XMYo9m-ThI,9662
  sky/templates/ibm-ray.yml.j2,sha256=RMBUqPId8i4CnVwcyfK3DbRapF1jFMuGQlY0E0PFbMU,6669
  sky/templates/jobs-controller.yaml.j2,sha256=Gu3ogFxFYr09VEXP-6zEbrCUOFo1aYxWEjAq7whCrxo,1607
  sky/templates/kubernetes-ingress.yml.j2,sha256=73iDklVDWBMbItg0IexCa6_ClXPJOxw7PWz3leku4nE,1340
@@ -229,7 +230,7 @@ sky/templates/kubernetes-loadbalancer.yml.j2,sha256=IxrNYM366N01bbkJEbZ_UPYxUP8w
  sky/templates/kubernetes-port-forward-proxy-command.sh,sha256=HlG7CPBBedCVBlL9qv0erW_eKm6Irj0LFyaAWuJW_lc,3148
  sky/templates/kubernetes-ray.yml.j2,sha256=Wq9luXc6-t141uyHbtOy1IDmLMM0PBbePTZfZEtAKw0,18160
  sky/templates/kubernetes-ssh-jump.yml.j2,sha256=k5W5sOIMppU7dDkJMwPlqsUcb92y7L5_TVG3hkgMy8M,2747
- sky/templates/lambda-ray.yml.j2,sha256=UrYOUh4EliPlWcfQWPZzQSiIIYSoRloujV2xsZejYPM,5786
+ sky/templates/lambda-ray.yml.j2,sha256=oMbrfv3zHoD1v1XXMLCLK1vB7wLBU1Z_jNpC4-5lGVo,3985
  sky/templates/local-ray.yml.j2,sha256=FNHeyHF6nW9nU9QLIZceUWfvrFTTcO51KqhTnYCEFaA,1185
  sky/templates/oci-ray.yml.j2,sha256=5XfIobW9XuspIpEhI4vFIEcJEFCdtFJqEGfX03zL6DE,7032
  sky/templates/paperspace-ray.yml.j2,sha256=HQjZNamrB_a4fOMCxQXSVdV5JIHtbGtAE0JzEO8uuVQ,4021
@@ -255,7 +256,7 @@ sky/utils/kubernetes_enums.py,sha256=imGqHSa8O07zD_6xH1SDMM7dBU5lF5fzFFlQuQy00QM
  sky/utils/log_utils.py,sha256=ptv2sbsiJSgk4NvdccrMsUR-MvOKnbu4BQiRSishgk0,12472
  sky/utils/resources_utils.py,sha256=sJuPextjJKHhvDGAaOPEzeEkteryF2fGNgNgBLqnLic,7419
  sky/utils/rich_utils.py,sha256=hmnI1X5dKvRIQzB7EyNb34FT97qFNve-0QHqM5r0mVk,3066
- sky/utils/schemas.py,sha256=QT0Fxri2o0SiWkky1DlZhA1dzQRQoB5OdVaej0wJvhc,28787
+ sky/utils/schemas.py,sha256=qo9j1TJZXqgJlBgbQfqz1oIZAxc3CN8uWooKYPQXXIY,28878
  sky/utils/subprocess_utils.py,sha256=3R54Elc2n8DQeO6Y8MCDJ6N6v27HDGpbNMIfCquqXYQ,6552
  sky/utils/timeline.py,sha256=ao_nm0y52ZQILfL7Y92c3pSEFRyPm_ElORC3DrI5BwQ,3936
  sky/utils/ux_utils.py,sha256=CqyIFGDuSE8fQasPkna_loZMwtboC9KedR09WEQ7qz0,6502
@@ -273,9 +274,9 @@ sky/utils/kubernetes/k8s_gpu_labeler_job.yaml,sha256=KPqp23B-zQ2SZK03jdHeF9fLTog
  sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml,sha256=VLKT2KKimZu1GDg_4AIlIt488oMQvhRZWwsj9vBbPUg,3812
  sky/utils/kubernetes/rsync_helper.sh,sha256=aRMa_0JRHtXFOPtEg4rFAwR1t57wvvAoGZhn3H3BtGk,1059
  sky/utils/kubernetes/ssh_jump_lifecycle_manager.py,sha256=RFLJ3k7MR5UN4SKHykQ0lV9SgXumoULpKYIAt1vh-HU,6560
- skypilot_nightly-1.0.0.dev20241016.dist-info/LICENSE,sha256=emRJAvE7ngL6x0RhQvlns5wJzGI3NEQ_WMjNmd9TZc4,12170
- skypilot_nightly-1.0.0.dev20241016.dist-info/METADATA,sha256=3cBwGMlr5S-mHHm8ZXtnMUNjCcSYeAzwOEf-N4LxLEU,18945
- skypilot_nightly-1.0.0.dev20241016.dist-info/WHEEL,sha256=OVMc5UfuAQiSplgO0_WdW7vXVGAt9Hdd6qtN4HotdyA,91
- skypilot_nightly-1.0.0.dev20241016.dist-info/entry_points.txt,sha256=StA6HYpuHj-Y61L2Ze-hK2IcLWgLZcML5gJu8cs6nU4,36
- skypilot_nightly-1.0.0.dev20241016.dist-info/top_level.txt,sha256=qA8QuiNNb6Y1OF-pCUtPEr6sLEwy2xJX06Bd_CrtrHY,4
- skypilot_nightly-1.0.0.dev20241016.dist-info/RECORD,,
+ skypilot_nightly-1.0.0.dev20241018.dist-info/LICENSE,sha256=emRJAvE7ngL6x0RhQvlns5wJzGI3NEQ_WMjNmd9TZc4,12170
+ skypilot_nightly-1.0.0.dev20241018.dist-info/METADATA,sha256=hKti-qYovHe9BXjvZnYoV-88kOo2Qz0-xTDwY08RzrM,18945
+ skypilot_nightly-1.0.0.dev20241018.dist-info/WHEEL,sha256=OVMc5UfuAQiSplgO0_WdW7vXVGAt9Hdd6qtN4HotdyA,91
+ skypilot_nightly-1.0.0.dev20241018.dist-info/entry_points.txt,sha256=StA6HYpuHj-Y61L2Ze-hK2IcLWgLZcML5gJu8cs6nU4,36
+ skypilot_nightly-1.0.0.dev20241018.dist-info/top_level.txt,sha256=qA8QuiNNb6Y1OF-pCUtPEr6sLEwy2xJX06Bd_CrtrHY,4
+ skypilot_nightly-1.0.0.dev20241018.dist-info/RECORD,,
sky/skylet/providers/lambda_cloud/__init__.py DELETED
@@ -1,2 +0,0 @@
- """Lambda Cloud node provider"""
- from sky.skylet.providers.lambda_cloud.node_provider import LambdaNodeProvider
sky/skylet/providers/lambda_cloud/node_provider.py DELETED
@@ -1,320 +0,0 @@
- import logging
- import os
- from threading import RLock
- import time
- from typing import Any, Dict, List, Optional
- 
- from ray.autoscaler.node_provider import NodeProvider
- from ray.autoscaler.tags import NODE_KIND_HEAD
- from ray.autoscaler.tags import NODE_KIND_WORKER
- from ray.autoscaler.tags import STATUS_UP_TO_DATE
- from ray.autoscaler.tags import TAG_RAY_CLUSTER_NAME
- from ray.autoscaler.tags import TAG_RAY_NODE_KIND
- from ray.autoscaler.tags import TAG_RAY_NODE_NAME
- from ray.autoscaler.tags import TAG_RAY_NODE_STATUS
- from ray.autoscaler.tags import TAG_RAY_USER_NODE_TYPE
- 
- from sky import authentication as auth
- from sky.clouds.utils import lambda_utils
- from sky.utils import command_runner
- from sky.utils import common_utils
- from sky.utils import subprocess_utils
- from sky.utils import ux_utils
- 
- _TAG_PATH_PREFIX = '~/.sky/generated/lambda_cloud/metadata'
- _REMOTE_SSH_KEY_NAME = '~/.lambda_cloud/ssh_key_name'
- _REMOTE_RAY_SSH_KEY = '~/ray_bootstrap_key.pem'
- _REMOTE_RAY_YAML = '~/ray_bootstrap_config.yaml'
- _GET_INTERNAL_IP_CMD = 's=$(ip -4 -br addr show | grep UP); echo "$s"; echo "$s" | grep -Eo "(10\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)|172\.(1[6-9]|2[0-9]|3[0-1])|104\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?))\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)"'
- 
- logger = logging.getLogger(__name__)
- 
- 
- def synchronized(f):
- 
-     def wrapper(self, *args, **kwargs):
-         self.lock.acquire()
-         try:
-             return f(self, *args, **kwargs)
-         finally:
-             self.lock.release()
- 
-     return wrapper
- 
- 
- class LambdaNodeProvider(NodeProvider):
-     """Node Provider for Lambda Cloud.
- 
-     This provider assumes Lambda Cloud credentials are set.
-     """
- 
-     def __init__(self, provider_config: Dict[str, Any],
-                  cluster_name: str) -> None:
-         NodeProvider.__init__(self, provider_config, cluster_name)
-         self.lock = RLock()
-         self.lambda_client = lambda_utils.LambdaCloudClient()
-         self.cached_nodes: Dict[str, Dict[str, Any]] = {}
-         self.metadata = lambda_utils.Metadata(_TAG_PATH_PREFIX, cluster_name)
-         self.ssh_key_path = os.path.expanduser(auth.PRIVATE_SSH_KEY_PATH)
- 
-         def _get_ssh_key_name(prefix: str) -> str:
-             public_key_path = os.path.expanduser(auth.PUBLIC_SSH_KEY_PATH)
-             with open(public_key_path, 'r') as f:
-                 public_key = f.read()
-             name, exists = self.lambda_client.get_unique_ssh_key_name(
-                 prefix, public_key)
-             if not exists:
-                 raise lambda_utils.LambdaCloudError('SSH key not found')
-             return name
- 
-         ray_yaml_path = os.path.expanduser(_REMOTE_RAY_YAML)
-         self.on_head = (os.path.exists(ray_yaml_path) and
-                         common_utils.read_yaml(ray_yaml_path)['cluster_name']
-                         == cluster_name)
- 
-         if self.on_head:
-             self.ssh_key_path = os.path.expanduser(_REMOTE_RAY_SSH_KEY)
-             ssh_key_name_path = os.path.expanduser(_REMOTE_SSH_KEY_NAME)
-             if os.path.exists(ssh_key_name_path):
-                 with open(ssh_key_name_path, 'r') as f:
-                     self.ssh_key_name = f.read()
-             else:
-                 # At this point, `~/.ssh/sky-key.pub` contains the public
-                 # key used to launch this cluster. Use it to determine
-                 # ssh key name and store the name in _REMOTE_SSH_KEY_NAME.
-                 # Note: this case only runs during cluster launch, so it is
-                 # not possible for ~/.ssh/sky-key.pub to already be regenerated
-                 # by the user.
-                 self.ssh_key_name = _get_ssh_key_name('')
-                 with open(ssh_key_name_path, 'w', encoding='utf-8') as f:
-                     f.write(self.ssh_key_name)
-         else:
-             # On local
-             self.ssh_key_name = _get_ssh_key_name(
-                 f'sky-key-{common_utils.get_user_hash()}')
- 
-     def _guess_and_add_missing_tags(self, vms: List[Dict[str, Any]]) -> None:
-         """Adds missing vms to local tag file and guesses their tags."""
-         for node in vms:
-             if self.metadata.get(node['id']) is not None:
-                 pass
-             elif node['name'] == f'{self.cluster_name}-head':
-                 self.metadata.set(
-                     node['id'], {
-                         'tags': {
-                             TAG_RAY_CLUSTER_NAME: self.cluster_name,
-                             TAG_RAY_NODE_STATUS: STATUS_UP_TO_DATE,
-                             TAG_RAY_NODE_KIND: NODE_KIND_HEAD,
-                             TAG_RAY_USER_NODE_TYPE: 'ray_head_default',
-                             TAG_RAY_NODE_NAME: f'ray-{self.cluster_name}-head',
-                         }
-                     })
-             elif node['name'] == f'{self.cluster_name}-worker':
-                 self.metadata.set(
-                     node['id'], {
-                         'tags': {
-                             TAG_RAY_CLUSTER_NAME: self.cluster_name,
-                             TAG_RAY_NODE_STATUS: STATUS_UP_TO_DATE,
-                             TAG_RAY_NODE_KIND: NODE_KIND_WORKER,
-                             TAG_RAY_USER_NODE_TYPE: 'ray_worker_default',
-                             TAG_RAY_NODE_NAME: f'ray-{self.cluster_name}-worker',
-                         }
-                     })
- 
-     def _list_instances_in_cluster(self) -> List[Dict[str, Any]]:
-         """List running instances in cluster."""
-         vms = self.lambda_client.list_instances()
-         possible_names = [
-             f'{self.cluster_name}-head', f'{self.cluster_name}-worker'
-         ]
-         return [node for node in vms if node.get('name') in possible_names]
- 
-     @synchronized
-     def _get_filtered_nodes(self, tag_filters: Dict[str,
-                                                     str]) -> Dict[str, Any]:
- 
-         def _extract_metadata(vm: Dict[str, Any]) -> Dict[str, Any]:
-             metadata = {'id': vm['id'], 'status': vm['status'], 'tags': {}}
-             instance_info = self.metadata.get(vm['id'])
-             if instance_info is not None:
-                 metadata['tags'] = instance_info['tags']
-             metadata['external_ip'] = vm.get('ip')
-             return metadata
- 
-         def _match_tags(vm: Dict[str, Any]):
-             vm_info = self.metadata.get(vm['id'])
-             tags = {} if vm_info is None else vm_info['tags']
-             for k, v in tag_filters.items():
-                 if tags.get(k) != v:
-                     return False
-             return True
- 
-         def _get_internal_ip(node: Dict[str, Any]):
-             # TODO(ewzeng): cache internal ips in metadata file to reduce
-             # ssh overhead.
-             if node['external_ip'] is None or node['status'] != 'active':
-                 node['internal_ip'] = None
-                 return
-             runner = command_runner.SSHCommandRunner(
-                 node=(node['external_ip'], 22),
-                 ssh_user='ubuntu',
-                 ssh_private_key=self.ssh_key_path)
-             rc, stdout, stderr = runner.run(_GET_INTERNAL_IP_CMD,
-                                             require_outputs=True,
-                                             stream_logs=False)
-             subprocess_utils.handle_returncode(
-                 rc,
-                 _GET_INTERNAL_IP_CMD,
-                 'Failed get obtain private IP from node',
-                 stderr=stdout + stderr)
-             node['internal_ip'] = stdout.strip()
- 
-         vms = self._list_instances_in_cluster()
-         self.metadata.refresh([node['id'] for node in vms])
-         self._guess_and_add_missing_tags(vms)
-         nodes = [_extract_metadata(vm) for vm in filter(_match_tags, vms)]
-         nodes = [
-             node for node in nodes
-             if node['status'] not in ['terminating', 'terminated']
-         ]
-         subprocess_utils.run_in_parallel(_get_internal_ip, nodes)
-         self.cached_nodes = {node['id']: node for node in nodes}
-         return self.cached_nodes
- 
-     def non_terminated_nodes(self, tag_filters: Dict[str, str]) -> List[str]:
-         """Return a list of node ids filtered by the specified tags dict.
- 
-         This list must not include terminated nodes. For performance reasons,
-         providers are allowed to cache the result of a call to
-         non_terminated_nodes() to serve single-node queries
-         (e.g. is_running(node_id)). This means that non_terminated_nodes() must
-         be called again to refresh results.
- 
-         Examples:
-             >>> provider.non_terminated_nodes({TAG_RAY_NODE_KIND: "worker"})
-             ["node-1", "node-2"]
-         """
-         nodes = self._get_filtered_nodes(tag_filters=tag_filters)
-         return [k for k, _ in nodes.items()]
- 
-     def is_running(self, node_id: str) -> bool:
-         """Return whether the specified node is running."""
-         return self._get_cached_node(node_id=node_id) is not None
- 
-     def is_terminated(self, node_id: str) -> bool:
-         """Return whether the specified node is terminated."""
-         return self._get_cached_node(node_id=node_id) is None
- 
-     def node_tags(self, node_id: str) -> Dict[str, str]:
-         """Returns the tags of the given node (string dict)."""
-         node = self._get_cached_node(node_id=node_id)
-         if node is None:
-             return {}
-         return node['tags']
- 
-     def external_ip(self, node_id: str) -> Optional[str]:
-         """Returns the external ip of the given node."""
-         node = self._get_cached_node(node_id=node_id)
-         if node is None:
-             return None
-         ip = node.get('external_ip')
-         with ux_utils.print_exception_no_traceback():
-             if ip is None:
-                 raise lambda_utils.LambdaCloudError(
-                     'A node ip address was not found. Either '
-                     '(1) Lambda Cloud has internally errored, or '
-                     '(2) the cluster is still booting. '
-                     'You can manually terminate the cluster on the '
-                     'Lambda Cloud console or (in case 2) wait for '
-                     'booting to finish (~2 minutes).')
-         return ip
- 
-     def internal_ip(self, node_id: str) -> Optional[str]:
-         """Returns the internal ip (Ray ip) of the given node."""
-         node = self._get_cached_node(node_id=node_id)
-         if node is None:
-             return None
-         ip = node.get('internal_ip')
-         with ux_utils.print_exception_no_traceback():
-             if ip is None:
-                 raise lambda_utils.LambdaCloudError(
-                     'A node ip address was not found. Either '
-                     '(1) Lambda Cloud has internally errored, or '
-                     '(2) the cluster is still booting. '
-                     'You can manually terminate the cluster on the '
-                     'Lambda Cloud console or (in case 2) wait for '
-                     'booting to finish (~2 minutes).')
-         return ip
- 
-     def create_node(self, node_config: Dict[str, Any], tags: Dict[str, str],
-                     count: int) -> None:
-         """Creates a number of nodes within the namespace."""
-         # Get tags
-         config_tags = node_config.get('tags', {}).copy()
-         config_tags.update(tags)
-         config_tags[TAG_RAY_CLUSTER_NAME] = self.cluster_name
- 
-         # Create nodes
-         instance_type = node_config['InstanceType']
-         region = self.provider_config['region']
- 
-         if config_tags[TAG_RAY_NODE_KIND] == NODE_KIND_HEAD:
-             name = f'{self.cluster_name}-head'
-             # Occasionally, the head node will continue running for a short
-             # period after termination. This can lead to the following bug:
-             # 1. Head node autodowns but continues running.
-             # 2. The next autodown event is triggered, which executes ray up.
-             # 3. Head node stops running.
-             # In this case, a new head node is created after the cluster has
-             # terminated. We avoid this with the following check:
-             if self.on_head:
-                 raise lambda_utils.LambdaCloudError('Head already exists.')
-         else:
-             name = f'{self.cluster_name}-worker'
- 
-         # Lambda launch api only supports launching one node at a time,
-         # so we do a loop. Remove loop when launch api allows quantity > 1
-         booting_list = []
-         for _ in range(count):
-             vm_id = self.lambda_client.create_instances(
-                 instance_type=instance_type,
-                 region=region,
-                 quantity=1,
-                 name=name,
-                 ssh_key_name=self.ssh_key_name)[0]
-             self.metadata.set(vm_id, {'tags': config_tags})
-             booting_list.append(vm_id)
-             time.sleep(10)  # Avoid api rate limits
- 
-         # Wait for nodes to finish booting
-         while True:
-             vms = self._list_instances_in_cluster()
-             for vm_id in booting_list.copy():
-                 for vm in vms:
-                     if vm['id'] == vm_id and vm['status'] == 'active':
-                         booting_list.remove(vm_id)
-             if len(booting_list) == 0:
-                 return
-             time.sleep(10)
- 
-     @synchronized
-     def set_node_tags(self, node_id: str, tags: Dict[str, str]) -> None:
-         """Sets the tag values (string dict) for the specified node."""
-         node = self._get_node(node_id)
-         assert node is not None, node_id
-         node['tags'].update(tags)
-         self.metadata.set(node_id, {'tags': node['tags']})
- 
-     def terminate_node(self, node_id: str) -> None:
-         """Terminates the specified node."""
-         self.lambda_client.remove_instances(node_id)
-         self.metadata.set(node_id, None)
- 
-     def _get_node(self, node_id: str) -> Optional[Dict[str, Any]]:
-         self._get_filtered_nodes({})  # Side effect: updates cache
-         return self.cached_nodes.get(node_id, None)
- 
-     def _get_cached_node(self, node_id: str) -> Optional[Dict[str, Any]]:
-         if node_id in self.cached_nodes:
-             return self.cached_nodes[node_id]
-         return self._get_node(node_id=node_id)