skypilot-nightly 1.0.0.dev20241016__py3-none-any.whl → 1.0.0.dev20241018__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/authentication.py +1 -1
- sky/backends/backend_utils.py +4 -0
- sky/clouds/aws.py +15 -4
- sky/clouds/gcp.py +6 -1
- sky/clouds/lambda_cloud.py +4 -1
- sky/clouds/service_catalog/data_fetchers/fetch_aws.py +6 -4
- sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +17 -6
- sky/clouds/service_catalog/gcp_catalog.py +3 -0
- sky/provision/__init__.py +3 -0
- sky/provision/gcp/config.py +5 -1
- sky/provision/lambda_cloud/__init__.py +11 -0
- sky/provision/lambda_cloud/config.py +10 -0
- sky/provision/lambda_cloud/instance.py +261 -0
- sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +19 -18
- sky/setup_files/MANIFEST.in +0 -1
- sky/templates/gcp-ray.yml.j2 +3 -0
- sky/templates/lambda-ray.yml.j2 +7 -38
- sky/utils/schemas.py +3 -0
- {skypilot_nightly-1.0.0.dev20241016.dist-info → skypilot_nightly-1.0.0.dev20241018.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20241016.dist-info → skypilot_nightly-1.0.0.dev20241018.dist-info}/RECORD +25 -24
- sky/skylet/providers/lambda_cloud/__init__.py +0 -2
- sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
- {skypilot_nightly-1.0.0.dev20241016.dist-info → skypilot_nightly-1.0.0.dev20241018.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20241016.dist-info → skypilot_nightly-1.0.0.dev20241018.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20241016.dist-info → skypilot_nightly-1.0.0.dev20241018.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20241016.dist-info → skypilot_nightly-1.0.0.dev20241018.dist-info}/top_level.txt +0 -0
sky/__init__.py
CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
 import urllib.request

 # Replaced with the current commit when building the wheels.
-_SKYPILOT_COMMIT_SHA = '
+_SKYPILOT_COMMIT_SHA = '71a95f4bf7f1446e80bb5c24d23c1695bc4fc031'


 def _get_git_commit():
@@ -35,7 +35,7 @@ def _get_git_commit():


 __commit__ = _get_git_commit()
-__version__ = '1.0.0.
+__version__ = '1.0.0.dev20241018'
 __root_dir__ = os.path.dirname(os.path.abspath(__file__))

sky/authentication.py
CHANGED
@@ -43,9 +43,9 @@ from sky.adaptors import gcp
 from sky.adaptors import ibm
 from sky.adaptors import kubernetes
 from sky.adaptors import runpod
-from sky.clouds.utils import lambda_utils
 from sky.provision.fluidstack import fluidstack_utils
 from sky.provision.kubernetes import utils as kubernetes_utils
+from sky.provision.lambda_cloud import lambda_utils
 from sky.utils import common_utils
 from sky.utils import kubernetes_enums
 from sky.utils import subprocess_utils
sky/backends/backend_utils.py
CHANGED
@@ -2772,6 +2772,10 @@ def get_endpoints(cluster: str,
     cluster_records = get_clusters(include_controller=True,
                                    refresh=False,
                                    cluster_names=[cluster])
+    if not cluster_records:
+        with ux_utils.print_exception_no_traceback():
+            raise exceptions.ClusterNotUpError(
+                f'Cluster {cluster!r} not found.', cluster_status=None)
     assert len(cluster_records) == 1, cluster_records
     cluster_record = cluster_records[0]
     if (not skip_status_check and
sky/clouds/aws.py
CHANGED
@@ -32,6 +32,14 @@ if typing.TYPE_CHECKING:

 logger = sky_logging.init_logger(__name__)

+# Image ID tags
+_DEFAULT_CPU_IMAGE_ID = 'skypilot:custom-cpu-ubuntu'
+# For GPU-related package version,
+# see sky/clouds/service_catalog/images/provisioners/cuda.sh
+_DEFAULT_GPU_IMAGE_ID = 'skypilot:custom-gpu-ubuntu'
+_DEFAULT_GPU_K80_IMAGE_ID = 'skypilot:k80-ubuntu-2004'
+_DEFAULT_NEURON_IMAGE_ID = 'skypilot:neuron-ubuntu-2204'
+
 # This local file (under ~/.aws/) will be uploaded to remote nodes (any
 # cloud), if all of the following conditions hold:
 # - the current user identity is not using AWS SSO
@@ -217,17 +225,20 @@ class AWS(clouds.Cloud):
     @classmethod
     def _get_default_ami(cls, region_name: str, instance_type: str) -> str:
         acc = cls.get_accelerators_from_instance_type(instance_type)
-        image_id = service_catalog.get_image_id_from_tag(
-
+        image_id = service_catalog.get_image_id_from_tag(_DEFAULT_CPU_IMAGE_ID,
+                                                          region_name,
+                                                          clouds='aws')
         if acc is not None:
+            image_id = service_catalog.get_image_id_from_tag(
+                _DEFAULT_GPU_IMAGE_ID, region_name, clouds='aws')
             assert len(acc) == 1, acc
             acc_name = list(acc.keys())[0]
             if acc_name == 'K80':
                 image_id = service_catalog.get_image_id_from_tag(
-
+                    _DEFAULT_GPU_K80_IMAGE_ID, region_name, clouds='aws')
             if acc_name in ['Trainium', 'Inferentia']:
                 image_id = service_catalog.get_image_id_from_tag(
-
+                    _DEFAULT_NEURON_IMAGE_ID, region_name, clouds='aws')
         if image_id is not None:
             return image_id
         # Raise ResourcesUnavailableError to make sure the failover in
sky/clouds/gcp.py
CHANGED
@@ -483,7 +483,7 @@ class GCP(clouds.Cloud):
                 if acc in ('A100-80GB', 'L4'):
                     # A100-80GB and L4 have a different name pattern.
                     resources_vars['gpu'] = f'nvidia-{acc.lower()}'
-                elif acc
+                elif acc in ('H100', 'H100-MEGA'):
                     resources_vars['gpu'] = f'nvidia-{acc.lower()}-80gb'
                 else:
                     resources_vars['gpu'] = 'nvidia-tesla-{}'.format(
@@ -546,6 +546,11 @@ class GCP(clouds.Cloud):
         resources_vars[
             'force_enable_external_ips'] = skypilot_config.get_nested(
                 ('gcp', 'force_enable_external_ips'), False)
+
+        # Add gVNIC from config
+        resources_vars['enable_gvnic'] = skypilot_config.get_nested(
+            ('gcp', 'enable_gvnic'), False)
+
         return resources_vars

     def _get_feasible_launchable_resources(
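The two gVNIC hunks in this release (here and in sky/provision/gcp/config.py below) add a `('gcp', 'enable_gvnic')` knob that is read with `skypilot_config.get_nested` and, when true, stamps `nicType: 'gVNIC'` on the first network interface of the created VM. A minimal sketch of that flow is shown below; the `from sky import skypilot_config` import path and the `node_config` dict shape are illustrative assumptions, not taken verbatim from the wheel.

# Hedged sketch only: mirrors the gcp.py and provision/gcp/config.py hunks.
from sky import skypilot_config  # assumed import path for the nested config reader

def apply_gvnic(node_config: dict) -> dict:
    # Read the new user-facing knob; it defaults to False when unset.
    enable_gvnic = skypilot_config.get_nested(('gcp', 'enable_gvnic'), False)
    if enable_gvnic:
        # Same effect as the config.py hunk: request gVNIC on the first NIC.
        node_config['networkInterfaces'][0]['nicType'] = 'gVNIC'
    return node_config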
sky/clouds/lambda_cloud.py
CHANGED
@@ -8,7 +8,7 @@ import requests
 from sky import clouds
 from sky import status_lib
 from sky.clouds import service_catalog
-from sky.
+from sky.provision.lambda_cloud import lambda_utils
 from sky.utils import resources_utils

 if typing.TYPE_CHECKING:
@@ -48,6 +48,9 @@ class Lambda(clouds.Cloud):
         clouds.CloudImplementationFeatures.HOST_CONTROLLERS: f'Host controllers are not supported in {_REPR}.',
     }

+    PROVISIONER_VERSION = clouds.ProvisionerVersion.SKYPILOT
+    STATUS_VERSION = clouds.StatusVersion.SKYPILOT
+
     @classmethod
     def _unsupported_features_for_resources(
             cls, resources: 'resources_lib.Resources'
sky/clouds/service_catalog/data_fetchers/fetch_aws.py
CHANGED
@@ -538,11 +538,13 @@ if __name__ == '__main__':
     instance_df.to_csv('aws/vms.csv', index=False)
     print('AWS Service Catalog saved to aws/vms.csv')

-
-
+    # Disable refreshing images.csv as we are using skypilot custom AMIs
+    # See sky/clouds/service_catalog/images/README.md for more details.
+    # image_df = get_all_regions_images_df(user_regions)
+    # _check_regions_integrity(image_df, 'images')

-    image_df.to_csv('aws/images.csv', index=False)
-    print('AWS Images saved to aws/images.csv')
+    # image_df.to_csv('aws/images.csv', index=False)
+    # print('AWS Images saved to aws/images.csv')

     if args.az_mappings:
         az_mappings_df = fetch_availability_zone_mappings()
sky/clouds/service_catalog/data_fetchers/fetch_gcp.py
CHANGED
@@ -419,6 +419,11 @@ def _get_gpus_for_zone(zone: str) -> 'pd.DataFrame':
             if count != 8:
                 # H100 only has 8 cards.
                 continue
+        if 'H100-MEGA-80GB' in gpu_name:
+            gpu_name = 'H100-MEGA'
+            if count != 8:
+                # H100-MEGA only has 8 cards.
+                continue
         if 'VWS' in gpu_name:
             continue
         if gpu_name.startswith('TPU-'):
@@ -447,6 +452,7 @@ def _gpu_info_from_name(name: str) -> Optional[Dict[str, List[Dict[str, Any]]]]:
         'A100-80GB': 80 * 1024,
         'A100': 40 * 1024,
         'H100': 80 * 1024,
+        'H100-MEGA': 80 * 1024,
         'P4': 8 * 1024,
         'T4': 16 * 1024,
         'V100': 16 * 1024,
@@ -491,12 +497,17 @@ def get_gpu_df(skus: List[Dict[str, Any]],
         if sku['category']['usageType'] != ondemand_or_spot:
             continue

-
-        if
-
-        if
-
-        if
+        gpu_names = [row['AcceleratorName']]
+        if gpu_names[0] == 'A100-80GB':
+            gpu_names = ['A100 80GB']
+        if gpu_names[0] == 'H100':
+            gpu_names = ['H100 80GB']
+        if gpu_names[0] == 'H100-MEGA':
+            # Seems that H100-MEGA has two different descriptions in SKUs in
+            # different regions: 'H100 80GB Mega' and 'H100 80GB Plus'.
+            gpu_names = ['H100 80GB Mega', 'H100 80GB Plus']
+        if not any(f'{gpu_name} GPU' in sku['description']
+                   for gpu_name in gpu_names):
             continue

         unit_price = _get_unit_price(sku)
sky/provision/__init__.py
CHANGED
@@ -19,6 +19,7 @@ from sky.provision import cudo
 from sky.provision import fluidstack
 from sky.provision import gcp
 from sky.provision import kubernetes
+from sky.provision import lambda_cloud
 from sky.provision import runpod
 from sky.provision import vsphere
 from sky.utils import command_runner
@@ -39,6 +40,8 @@ def _route_to_cloud_impl(func):
         provider_name = kwargs.pop('provider_name')

         module_name = provider_name.lower()
+        if module_name == 'lambda':
+            module_name = 'lambda_cloud'
         module = globals().get(module_name)
         assert module is not None, f'Unknown provider: {module_name}'

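The alias in the second hunk exists because `lambda` is a reserved keyword in Python, so the provisioner package has to be named `lambda_cloud` even though the cloud is registered as "lambda". A hypothetical standalone version of just that routing step, for clarity:

# Sketch of the routing rule above; function name is hypothetical.
def resolve_provision_module(provider_name: str) -> str:
    module_name = provider_name.lower()
    if module_name == 'lambda':       # cloud name as SkyPilot records it
        module_name = 'lambda_cloud'  # importable package name
    return module_name

assert resolve_provision_module('Lambda') == 'lambda_cloud'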
sky/provision/gcp/config.py
CHANGED
@@ -670,8 +670,12 @@ def _configure_subnet(region: str, cluster_name: str,
         'accessConfigs': [{
             'name': 'External NAT',
             'type': 'ONE_TO_ONE_NAT',
-        }]
+        }]
     }]
+    # Add gVNIC if specified in config
+    enable_gvnic = config.provider_config.get('enable_gvnic', False)
+    if enable_gvnic:
+        default_interfaces[0]['nicType'] = 'gVNIC'
     enable_external_ips = _enable_external_ips(config)
     if not enable_external_ips:
         # Removing this key means the VM will not be assigned an external IP.
sky/provision/lambda_cloud/__init__.py
ADDED
@@ -0,0 +1,11 @@
+"""Lambda provisioner for SkyPilot."""
+
+from sky.provision.lambda_cloud.config import bootstrap_instances
+from sky.provision.lambda_cloud.instance import cleanup_ports
+from sky.provision.lambda_cloud.instance import get_cluster_info
+from sky.provision.lambda_cloud.instance import open_ports
+from sky.provision.lambda_cloud.instance import query_instances
+from sky.provision.lambda_cloud.instance import run_instances
+from sky.provision.lambda_cloud.instance import stop_instances
+from sky.provision.lambda_cloud.instance import terminate_instances
+from sky.provision.lambda_cloud.instance import wait_instances
sky/provision/lambda_cloud/config.py
ADDED
@@ -0,0 +1,10 @@
+"""Lambda Cloud configuration bootstrapping"""
+
+from sky.provision import common
+
+
+def bootstrap_instances(
+        region: str, cluster_name: str,
+        config: common.ProvisionConfig) -> common.ProvisionConfig:
+    del region, cluster_name  # unused
+    return config
sky/provision/lambda_cloud/instance.py
ADDED
@@ -0,0 +1,261 @@
+"""Lambda instance provisioning."""
+
+import time
+from typing import Any, Dict, List, Optional
+
+from sky import authentication as auth
+from sky import sky_logging
+from sky import status_lib
+from sky.provision import common
+import sky.provision.lambda_cloud.lambda_utils as lambda_utils
+from sky.utils import common_utils
+from sky.utils import ux_utils
+
+POLL_INTERVAL = 1
+
+logger = sky_logging.init_logger(__name__)
+_lambda_client = None
+
+
+def _get_lambda_client():
+    global _lambda_client
+    if _lambda_client is None:
+        _lambda_client = lambda_utils.LambdaCloudClient()
+    return _lambda_client
+
+
+def _filter_instances(
+        cluster_name_on_cloud: str,
+        status_filters: Optional[List[str]]) -> Dict[str, Dict[str, Any]]:
+    lambda_client = _get_lambda_client()
+    instances = lambda_client.list_instances()
+    possible_names = [
+        f'{cluster_name_on_cloud}-head',
+        f'{cluster_name_on_cloud}-worker',
+    ]
+
+    filtered_instances = {}
+    for instance in instances:
+        if (status_filters is not None and
+                instance['status'] not in status_filters):
+            continue
+        if instance.get('name') in possible_names:
+            filtered_instances[instance['id']] = instance
+    return filtered_instances
+
+
+def _get_head_instance_id(instances: Dict[str, Any]) -> Optional[str]:
+    head_instance_id = None
+    for instance_id, instance in instances.items():
+        if instance['name'].endswith('-head'):
+            head_instance_id = instance_id
+            break
+    return head_instance_id
+
+
+def _get_ssh_key_name(prefix: str = '') -> str:
+    lambda_client = _get_lambda_client()
+    _, public_key_path = auth.get_or_generate_keys()
+    with open(public_key_path, 'r', encoding='utf-8') as f:
+        public_key = f.read()
+    name, exists = lambda_client.get_unique_ssh_key_name(prefix, public_key)
+    if not exists:
+        raise lambda_utils.LambdaCloudError('SSH key not found')
+    return name
+
+
+def run_instances(region: str, cluster_name_on_cloud: str,
+                  config: common.ProvisionConfig) -> common.ProvisionRecord:
+    """Runs instances for the given cluster"""
+    lambda_client = _get_lambda_client()
+    pending_status = ['booting']
+    while True:
+        instances = _filter_instances(cluster_name_on_cloud, pending_status)
+        if not instances:
+            break
+        logger.info(f'Waiting for {len(instances)} instances to be ready.')
+        time.sleep(POLL_INTERVAL)
+    exist_instances = _filter_instances(cluster_name_on_cloud, ['active'])
+    head_instance_id = _get_head_instance_id(exist_instances)
+
+    to_start_count = config.count - len(exist_instances)
+    if to_start_count < 0:
+        raise RuntimeError(
+            f'Cluster {cluster_name_on_cloud} already has '
+            f'{len(exist_instances)} nodes, but {config.count} are required.')
+    if to_start_count == 0:
+        if head_instance_id is None:
+            raise RuntimeError(
+                f'Cluster {cluster_name_on_cloud} has no head node.')
+        logger.info(f'Cluster {cluster_name_on_cloud} already has '
+                    f'{len(exist_instances)} nodes, no need to start more.')
+        return common.ProvisionRecord(
+            provider_name='lambda',
+            cluster_name=cluster_name_on_cloud,
+            region=region,
+            zone=None,
+            head_instance_id=head_instance_id,
+            resumed_instance_ids=[],
+            created_instance_ids=[],
+        )
+
+    created_instance_ids = []
+    ssh_key_name = _get_ssh_key_name()
+
+    def launch_nodes(node_type: str, quantity: int) -> List[str]:
+        try:
+            instance_ids = lambda_client.create_instances(
+                instance_type=config.node_config['InstanceType'],
+                region=region,
+                name=f'{cluster_name_on_cloud}-{node_type}',
+                quantity=quantity,
+                ssh_key_name=ssh_key_name,
+            )
+            logger.info(f'Launched {len(instance_ids)} {node_type} node(s), '
+                        f'instance_ids: {instance_ids}')
+            return instance_ids
+        except Exception as e:
+            logger.warning(f'run_instances error: {e}')
+            raise
+
+    if head_instance_id is None:
+        instance_ids = launch_nodes('head', 1)
+        assert len(instance_ids) == 1
+        created_instance_ids.append(instance_ids[0])
+        head_instance_id = instance_ids[0]
+
+    assert head_instance_id is not None, 'head_instance_id should not be None'
+
+    worker_node_count = to_start_count - 1
+    if worker_node_count > 0:
+        instance_ids = launch_nodes('worker', worker_node_count)
+        created_instance_ids.extend(instance_ids)
+
+    while True:
+        instances = _filter_instances(cluster_name_on_cloud, ['active'])
+        if len(instances) == config.count:
+            break
+
+        time.sleep(POLL_INTERVAL)
+
+    return common.ProvisionRecord(
+        provider_name='lambda',
+        cluster_name=cluster_name_on_cloud,
+        region=region,
+        zone=None,
+        head_instance_id=head_instance_id,
+        resumed_instance_ids=[],
+        created_instance_ids=created_instance_ids,
+    )
+
+
+def wait_instances(region: str, cluster_name_on_cloud: str,
+                   state: Optional[status_lib.ClusterStatus]) -> None:
+    del region, cluster_name_on_cloud, state  # Unused.
+
+
+def stop_instances(
+    cluster_name_on_cloud: str,
+    provider_config: Optional[Dict[str, Any]] = None,
+    worker_only: bool = False,
+) -> None:
+    raise NotImplementedError(
+        'stop_instances is not supported for Lambda Cloud')
+
+
+def terminate_instances(
+    cluster_name_on_cloud: str,
+    provider_config: Optional[Dict[str, Any]] = None,
+    worker_only: bool = False,
+) -> None:
+    """See sky/provision/__init__.py"""
+    del provider_config
+    lambda_client = _get_lambda_client()
+    instances = _filter_instances(cluster_name_on_cloud, None)
+
+    instance_ids_to_terminate = []
+    for instance_id, instance in instances.items():
+        if worker_only and not instance['name'].endswith('-worker'):
+            continue
+        instance_ids_to_terminate.append(instance_id)
+
+    try:
+        logger.debug(
+            f'Terminating instances {", ".join(instance_ids_to_terminate)}')
+        lambda_client.remove_instances(instance_ids_to_terminate)
+    except Exception as e:  # pylint: disable=broad-except
+        with ux_utils.print_exception_no_traceback():
+            raise RuntimeError(
+                f'Failed to terminate instances {instance_ids_to_terminate}: '
+                f'{common_utils.format_exception(e, use_bracket=False)}') from e
+
+
+def get_cluster_info(
+    region: str,
+    cluster_name_on_cloud: str,
+    provider_config: Optional[Dict[str, Any]] = None,
+) -> common.ClusterInfo:
+    del region  # unused
+    running_instances = _filter_instances(cluster_name_on_cloud, ['active'])
+    instances: Dict[str, List[common.InstanceInfo]] = {}
+    head_instance_id = None
+    for instance_id, instance_info in running_instances.items():
+        instances[instance_id] = [
+            common.InstanceInfo(
+                instance_id=instance_id,
+                internal_ip=instance_info['private_ip'],
+                external_ip=instance_info['ip'],
+                ssh_port=22,
+                tags={},
+            )
+        ]
+        if instance_info['name'].endswith('-head'):
+            head_instance_id = instance_id
+
+    return common.ClusterInfo(
+        instances=instances,
+        head_instance_id=head_instance_id,
+        provider_name='lambda',
+        provider_config=provider_config,
+    )
+
+
+def query_instances(
+    cluster_name_on_cloud: str,
+    provider_config: Optional[Dict[str, Any]] = None,
+    non_terminated_only: bool = True,
+) -> Dict[str, Optional[status_lib.ClusterStatus]]:
+    """See sky/provision/__init__.py"""
+    assert provider_config is not None, (cluster_name_on_cloud, provider_config)
+    instances = _filter_instances(cluster_name_on_cloud, None)
+
+    status_map = {
+        'booting': status_lib.ClusterStatus.INIT,
+        'active': status_lib.ClusterStatus.UP,
+        'unhealthy': status_lib.ClusterStatus.INIT,
+        'terminating': status_lib.ClusterStatus.INIT,
+    }
+    statuses: Dict[str, Optional[status_lib.ClusterStatus]] = {}
+    for instance_id, instance in instances.items():
+        status = status_map.get(instance['status'])
+        if non_terminated_only and status is None:
+            continue
+        statuses[instance_id] = status
+    return statuses
+
+
+def open_ports(
+    cluster_name_on_cloud: str,
+    ports: List[str],
+    provider_config: Optional[Dict[str, Any]] = None,
+) -> None:
+    raise NotImplementedError('open_ports is not supported for Lambda Cloud')
+
+
+def cleanup_ports(
+    cluster_name_on_cloud: str,
+    ports: List[str],
+    provider_config: Optional[Dict[str, Any]] = None,
+) -> None:
+    """See sky/provision/__init__.py"""
+    del cluster_name_on_cloud, ports, provider_config  # Unused.
sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py
RENAMED
@@ -1,4 +1,5 @@
 """Lambda Cloud helper functions."""
+
 import json
 import os
 import time
@@ -76,7 +77,7 @@ class Metadata:


 def raise_lambda_error(response: requests.Response) -> None:
-    """Raise LambdaCloudError if appropriate.
+    """Raise LambdaCloudError if appropriate."""
     status_code = response.status_code
     if status_code == 200:
         return
@@ -131,20 +132,22 @@ class LambdaCloudClient:
         self.api_key = self._credentials['api_key']
         self.headers = {'Authorization': f'Bearer {self.api_key}'}

-    def create_instances(
-
-
-
-
-
+    def create_instances(
+        self,
+        instance_type: str = 'gpu_1x_a100_sxm4',
+        region: str = 'us-east-1',
+        quantity: int = 1,
+        name: str = '',
+        ssh_key_name: str = '',
+    ) -> List[str]:
         """Launch new instances."""
         # Optimization:
         # Most API requests are rate limited at ~1 request every second but
         # launch requests are rate limited at ~1 request every 10 seconds.
         # So don't use launch requests to check availability.
         # See https://docs.lambdalabs.com/cloud/rate-limiting/ for more.
-        available_regions = self.list_catalog()[instance_type]
-
+        available_regions = (self.list_catalog()[instance_type]
+                             ['regions_with_capacity_available'])
         available_regions = [reg['name'] for reg in available_regions]
         if region not in available_regions:
             if len(available_regions) > 0:
@@ -163,27 +166,25 @@ class LambdaCloudClient:
             'instance_type_name': instance_type,
             'ssh_key_names': [ssh_key_name],
             'quantity': quantity,
-            'name': name
+            'name': name,
         })
         response = _try_request_with_backoff(
             'post',
             f'{API_ENDPOINT}/instance-operations/launch',
             data=data,
-            headers=self.headers
+            headers=self.headers,
+        )
         return response.json().get('data', []).get('instance_ids', [])

-    def remove_instances(self,
+    def remove_instances(self, instance_ids: List[str]) -> Dict[str, Any]:
         """Terminate instances."""
-        data = json.dumps({
-            'instance_ids': [
-                instance_ids[0]  # TODO(ewzeng) don't hardcode
-            ]
-        })
+        data = json.dumps({'instance_ids': instance_ids})
         response = _try_request_with_backoff(
             'post',
             f'{API_ENDPOINT}/instance-operations/terminate',
             data=data,
-            headers=self.headers
+            headers=self.headers,
+        )
         return response.json().get('data', []).get('terminated_instances', [])

     def list_instances(self) -> List[Dict[str, Any]]:
sky/setup_files/MANIFEST.in
CHANGED
@@ -6,7 +6,6 @@ include sky/setup_files/*
 include sky/skylet/*.sh
 include sky/skylet/LICENSE
 include sky/skylet/providers/ibm/*
-include sky/skylet/providers/lambda_cloud/*
 include sky/skylet/providers/oci/*
 include sky/skylet/providers/scp/*
 include sky/skylet/providers/*.py
sky/templates/gcp-ray.yml.j2
CHANGED
sky/templates/lambda-ray.yml.j2
CHANGED
@@ -7,7 +7,7 @@ idle_timeout_minutes: 60

 provider:
   type: external
-  module: sky.
+  module: sky.provision.lambda
   region: {{region}}
   # Disable launch config check for worker nodes as it can cause resource
   # leakage.
@@ -25,14 +25,6 @@ available_node_types:
     resources: {}
     node_config:
       InstanceType: {{instance_type}}
-{% if num_nodes > 1 %}
-  ray_worker_default:
-    min_workers: {{num_nodes - 1}}
-    max_workers: {{num_nodes - 1}}
-    resources: {}
-    node_config:
-      InstanceType: {{instance_type}}
-{%- endif %}

 head_node_type: ray_head_default

@@ -64,7 +56,10 @@ setup_commands:
   # Line 'sudo grep ..': set the number of threads per process to unlimited to avoid ray job submit stucking issue when the number of running ray jobs increase.
   # Line 'mkdir -p ..': disable host key check
   # Line 'python3 -c ..': patch the buggy ray files and enable `-o allow_other` option for `goofys`
-  -
+  - {%- for initial_setup_command in initial_setup_commands %}
+    {{ initial_setup_command }}
+    {%- endfor %}
+    sudo systemctl stop unattended-upgrades || true;
     sudo systemctl disable unattended-upgrades || true;
     sudo sed -i 's/Unattended-Upgrade "1"/Unattended-Upgrade "0"/g' /etc/apt/apt.conf.d/20auto-upgrades || true;
     sudo kill -9 `sudo lsof /var/lib/dpkg/lock-frontend | awk '{print $2}' | tail -n 1` || true;
@@ -81,31 +76,5 @@ setup_commands:
     mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n" >> ~/.ssh/config;
     [ -f /etc/fuse.conf ] && sudo sed -i 's/#user_allow_other/user_allow_other/g' /etc/fuse.conf || (sudo sh -c 'echo "user_allow_other" > /etc/fuse.conf');

-# Command to start ray
-#
-# connection, which is expensive. Try your best to co-locate commands into fewer
-# items! The same comment applies for worker_start_ray_commands.
-#
-# Increment the following for catching performance bugs easier:
-# current num items (num SSH connections): 2
-head_start_ray_commands:
-  - {{ sky_activate_python_env }}; {{ sky_ray_cmd }} stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 {{ sky_ray_cmd }} start --disable-usage-stats --head --port={{ray_port}} --min-worker-port 11002 --dashboard-port={{ray_dashboard_port}} --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1;
-    which prlimit && for id in $(pgrep -f raylet/raylet); do sudo prlimit --nofile=1048576:1048576 --pid=$id || true; done;
-    {{dump_port_command}}; {{ray_head_wait_initialized_command}}
-
-{%- if num_nodes > 1 %}
-worker_start_ray_commands:
-  - {{ sky_activate_python_env }}; {{ sky_ray_cmd }} stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 {{ sky_ray_cmd }} start --disable-usage-stats --address=$RAY_HEAD_IP:{{ray_port}} --min-worker-port 11002 --object-manager-port=8076 {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1;
-    which prlimit && for id in $(pgrep -f raylet/raylet); do sudo prlimit --nofile=1048576:1048576 --pid=$id || true; done;
-{%- else %}
-worker_start_ray_commands: []
-{%- endif %}
-
-head_node: {}
-worker_nodes: {}
-
-# These fields are required for external cloud providers.
-head_setup_commands: []
-worker_setup_commands: []
-cluster_synced_files: []
-file_mounts_sync_continuously: False
+# Command to start ray clusters are now placed in `sky.provision.instance_setup`.
+# We do not need to list it here anymore.
sky/utils/schemas.py
CHANGED
{skypilot_nightly-1.0.0.dev20241016.dist-info → skypilot_nightly-1.0.0.dev20241018.dist-info}/RECORD
RENAMED
@@ -1,6 +1,6 @@
-sky/__init__.py,sha256=
+sky/__init__.py,sha256=ooJaoPt0Vq10nF2ftXGThCKQFJ2HbgQNKQ7Dp6Qg6s4,5854
 sky/admin_policy.py,sha256=hPo02f_A32gCqhUueF0QYy1fMSSKqRwYEg_9FxScN_s,3248
-sky/authentication.py,sha256=
+sky/authentication.py,sha256=pAdCT60OxxiXI9KXDyP2lQ9u9vMc6aMtq5Xi2h_hbdw,20984
 sky/check.py,sha256=jLMIIJrseaZj1_o5WkbaD9XdyXIlCaT6pyAaIFdhdmA,9079
 sky/cli.py,sha256=PJR6W92twf89j17OWLQJ9RawdazJcGslfW2L_fLB2PM,208545
 sky/cloud_stores.py,sha256=RjFgmRhUh1Kk__f6g3KxzLp9s7dA0pFK4W1AukEuUaw,21153
@@ -30,7 +30,7 @@ sky/adaptors/runpod.py,sha256=4Nt_BfZhJAKQNA3wO8cxvvNI8x4NsDGHu_4EhRDlGYQ,225
 sky/adaptors/vsphere.py,sha256=zJP9SeObEoLrpgHW2VHvZE48EhgVf8GfAEIwBeaDMfM,2129
 sky/backends/__init__.py,sha256=UDjwbUgpTRApbPJnNfR786GadUuwgRk3vsWoVu5RB_c,536
 sky/backends/backend.py,sha256=wwfbrxPhjMPs6PSyy3tAHI8WJhl-xhgzWBsAZjmJJ6g,6249
-sky/backends/backend_utils.py,sha256=
+sky/backends/backend_utils.py,sha256=PA21DAXspXuTZDQ5qA3G5RGJ0oUTpJ7XatRRvhtmtt0,126993
 sky/backends/cloud_vm_ray_backend.py,sha256=9mCLLRUD-x3ksiiPbhrMDsZWIPNU9cVSQwwpmxSia7k,236881
 sky/backends/docker_utils.py,sha256=Hyw1YY20EyghhEbYx6O2FIMDcGkNzBzV9TM7LFynei8,8358
 sky/backends/local_docker_backend.py,sha256=0JL5m0YUgOmOL4aWEUe4tmt89dsxjk4_WXkPwgEKEis,16801
@@ -40,16 +40,16 @@ sky/benchmark/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sky/benchmark/benchmark_state.py,sha256=X8CXmuU9KgsDRhKedhFgjeRMUFWtQsjFs1qECvPG2yg,8723
 sky/benchmark/benchmark_utils.py,sha256=eb-i6zYoo-Zkod-T9qtCu1FcYLw--Yyos1SyibUPZNE,26194
 sky/clouds/__init__.py,sha256=WuNIJEnZmBO72tU5awgaaL3rdvFRSkgaYNNeuY68dXo,1356
-sky/clouds/aws.py,sha256=
+sky/clouds/aws.py,sha256=XJVbOSkVVUHp9HbHDp0rFdHX113JHbY-3sgokGdNJVE,49527
 sky/clouds/azure.py,sha256=Yp_a1Lzvq4s47eRMeyVheDv9pC0hSPogCiTMYf-a5ZE,28687
 sky/clouds/cloud.py,sha256=PPk-Cbf1YbJT8bswcQLtPBtko02OWrRGJKkLzDpytTI,34858
 sky/clouds/cloud_registry.py,sha256=4yQMv-iBSgyN5aNL4Qxbn0JVE-dkVoEUIgj7S1z9S_Q,955
 sky/clouds/cudo.py,sha256=H4VyMo5wWGAv2MXZ3xsbWjlZA_cZYnt4ecNlTOOao8Y,13147
 sky/clouds/fluidstack.py,sha256=iOmoOx52yTrHKMzwBDaxFJCfNo79M61d5tj-Np24Lyc,12436
-sky/clouds/gcp.py,sha256=
+sky/clouds/gcp.py,sha256=lUImS2WJIcUOtrgrVz8zaR4yPGqALqZ0lSmLbjN9xLU,54470
 sky/clouds/ibm.py,sha256=M8QdjeSFlwssfoY2aOodxG4q5R3eT9K-4lTPDHYvEYI,21476
 sky/clouds/kubernetes.py,sha256=aWoXWR-S4puZHzuUHroLKxLdTpkqU7j75dQlXECnsmE,28679
-sky/clouds/lambda_cloud.py,sha256=
+sky/clouds/lambda_cloud.py,sha256=VtJ2mmwMT1X4zrzgt3FXM61zmrrgoELZHFgsdYVesPY,12562
 sky/clouds/oci.py,sha256=WXtxKwDBgi3He4ayi4qzJ4Y659Bi6xU8hWmYLHwiQYs,27371
 sky/clouds/paperspace.py,sha256=lmUZPYAblaqiBmGQwCunccMiTF_dVA1o3vqY9Q_Nc28,10921
 sky/clouds/runpod.py,sha256=lstUC6f4JDhtcH9NfwkbpCJMmfmvMigoanhPXPbTYds,11540
@@ -63,7 +63,7 @@ sky/clouds/service_catalog/config.py,sha256=ylzqewdEBjDg4awvFek6ldYmFrnvD2bVGLZu
 sky/clouds/service_catalog/constants.py,sha256=ai2yOlsVqBnEpbxaEHXt61COsHBLwOfw6GZXntEPj7k,411
 sky/clouds/service_catalog/cudo_catalog.py,sha256=QXAOpx5fJ_cGCr5LbB7wpHMfKIla7G-q_mMJnv_ArTA,4652
 sky/clouds/service_catalog/fluidstack_catalog.py,sha256=c8MMTldG-q97MJ0zJymudQiOVQC_rxS7vqrZgLrgbQA,5038
-sky/clouds/service_catalog/gcp_catalog.py,sha256=
+sky/clouds/service_catalog/gcp_catalog.py,sha256=v_5fsB3dB9oD8U7lBKnCe5ii6AUWEOiQjNarMnU_qLA,24379
 sky/clouds/service_catalog/ibm_catalog.py,sha256=0dzjmXABFECzaAuIa0E6pVINhVK6-G6U52Mj-L45gK8,4472
 sky/clouds/service_catalog/kubernetes_catalog.py,sha256=6OocEUkgyJtBgHwzu4RPsvru6pj6RwGU-4uSFNQmsSM,8254
 sky/clouds/service_catalog/lambda_catalog.py,sha256=BAhUGqHj8aVe1zUhEQNO7bQUhcd9jAespGvPyQubTJY,5281
@@ -74,17 +74,16 @@ sky/clouds/service_catalog/scp_catalog.py,sha256=4XnaZE5Q4XrrNnDnVhsHkH6jxmWXBeQ
 sky/clouds/service_catalog/vsphere_catalog.py,sha256=yJLWu9SQep-PRn1YdeQ7ZoNqQHTAxJtxf7y6FBrfSW0,4391
 sky/clouds/service_catalog/data_fetchers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sky/clouds/service_catalog/data_fetchers/analyze.py,sha256=VdksJQs3asFE8H5T3ZV1FJas2xD9WEX6c-V5p7y-wp4,2084
-sky/clouds/service_catalog/data_fetchers/fetch_aws.py,sha256=
+sky/clouds/service_catalog/data_fetchers/fetch_aws.py,sha256=ro2zazdkDF6z9bE7QFyjoeb4VFxmbNZ1WK5IQrdoQWk,23003
 sky/clouds/service_catalog/data_fetchers/fetch_azure.py,sha256=jsSVqbSbBIw_IYmO-y2u4co20AJ-JF713KFjUKdO_VA,12272
 sky/clouds/service_catalog/data_fetchers/fetch_cudo.py,sha256=52P48lvWN0s1ArjeLPeLemPRpxjSRcHincRle0nqdm4,3440
 sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py,sha256=35nO_VaDOgp5W13kt_lIANSk_CNf7gBiZGJ5fGyZu6o,6808
-sky/clouds/service_catalog/data_fetchers/fetch_gcp.py,sha256=
+sky/clouds/service_catalog/data_fetchers/fetch_gcp.py,sha256=VHwYIPX1kGOvGQ67mtvhKe1enmKFF3knveIktDwdYio,29633
 sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py,sha256=B7H14so38zayuJGgUrD1PJYJKiVZHGnwH6JJop3F7o0,4918
 sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py,sha256=SF_gTU74qg6L-DSWneCAbqP0lwZXaaDi5otiMIJbrw0,21462
 sky/clouds/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sky/clouds/utils/aws_utils.py,sha256=W5BRC-2F_VY4BymRA1kS6-MufsI3V8cfY_hv--4gJBU,1986
 sky/clouds/utils/gcp_utils.py,sha256=Xc_COjJfDt__oqVwrCw7ejY2B7ptHjMjDVb8obcpJ6s,6968
-sky/clouds/utils/lambda_utils.py,sha256=n_GwXhswehDOJrby1fGyPXSQuPmPTPQqXZzV4xQm9KI,9991
 sky/clouds/utils/oci_utils.py,sha256=LT_RtPQ2B1wlSF0e9PSD3NWxFFIzovcZeDjO-dyOghU,4482
 sky/clouds/utils/scp_utils.py,sha256=RUp7NwyhKygOoVOwvdAOGdoQNSJjryOG6WSExCf-yas,15812
 sky/data/__init__.py,sha256=Nhaf1NURisXpZuwWANa2IuCyppIuc720FRwqSE2oEwY,184
@@ -103,7 +102,7 @@ sky/jobs/utils.py,sha256=lYfWkEAPVnYcj2nT6VYdM6PCaWKUH6_AD4TAV_sVCkY,36376
 sky/jobs/dashboard/dashboard.py,sha256=HFShuaxKir97QTeK2x37h6bsY6ncaFaNEg1USZqJPdc,3050
 sky/jobs/dashboard/static/favicon.ico,sha256=uYlvgxSM7gjBmXpZ8wydvZUPAbJiiix-rc2Xe5mma9s,15086
 sky/jobs/dashboard/templates/index.html,sha256=DBKMYEkkJ6sgLYod9ro7drgL8Y_neDsCx_WbwhWDsWM,9837
-sky/provision/__init__.py,sha256=
+sky/provision/__init__.py,sha256=UhYsGRribEyK1--PPT0Dom9051jlpdn8UCNhO8qpPOc,6262
 sky/provision/common.py,sha256=E8AlSUFcn0FYQq1erNmoVfMAdsF9tP2yxfyk-9PLvQU,10286
 sky/provision/constants.py,sha256=DvHj3wpqdpaSBHMOGIfVWLLWGJoz0eOQAx73DwYMNEk,531
 sky/provision/docker_utils.py,sha256=Z7vDUs9Yjqks_CsWrACcTgABIZuFi3EJVFwkU0WsdD0,18832
@@ -131,7 +130,7 @@ sky/provision/fluidstack/config.py,sha256=hDqesKEVjIhXLTWej3fDdpbHtKBXoybxFGgC6T
 sky/provision/fluidstack/fluidstack_utils.py,sha256=Y21y2IAiHPLk_b-Lp-ld26SZSfWARhxdDEiu7MtfBmc,5693
 sky/provision/fluidstack/instance.py,sha256=jZb_zJFkyCZw3QV3Dt-GbUxl_2HR1kxvOxS7G3lkWcA,13708
 sky/provision/gcp/__init__.py,sha256=zlgjR2JoaGD7sStGStMRu9bJ62f-8NKEIyb-bFHBlzM,528
-sky/provision/gcp/config.py,sha256=
+sky/provision/gcp/config.py,sha256=i0PhR1ybGErQiPT8cD6E5OFB7LD6sub4Rc-mhgTREVI,33340
 sky/provision/gcp/constants.py,sha256=ojerfnNEeayJn-0aJq2Uq1iTchxOkpruKrPBbHmdiEw,7448
 sky/provision/gcp/instance.py,sha256=l2-1nHj4pUoHqOu8HMN1hT1bwd4Q96X8MXgOPsNJUN8,25184
 sky/provision/gcp/instance_utils.py,sha256=veRBr6Oziv0KaUdC4acuWeaOremNV0gMYCCHaSvY7c8,70943
@@ -144,6 +143,10 @@ sky/provision/kubernetes/network_utils.py,sha256=t1FS3K400fetH7cBuRgQJZl5_jEeMsh
 sky/provision/kubernetes/utils.py,sha256=2N5c4yA7CEn4DjvCiUO73W4XDEjgixcJRVdgs913QQE,89523
 sky/provision/kubernetes/manifests/smarter-device-manager-configmap.yaml,sha256=AMzYzlY0JIlfBWj5eX054Rc1XDW2thUcLSOGMJVhIdA,229
 sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml,sha256=RtTq4F1QUmR2Uunb6zuuRaPhV7hpesz4saHjn3Ncsb4,2010
+sky/provision/lambda_cloud/__init__.py,sha256=6EEvSgtUeEiup9ivIFevHmgv0GqleroO2X0K7TRa2nE,612
+sky/provision/lambda_cloud/config.py,sha256=jq1iLzp4Up61r4JGxvtpVbJlgXnea3LHYQhCQyyl7ik,272
+sky/provision/lambda_cloud/instance.py,sha256=5-XuX-KwlRq8y62NXNzY_p6aJs4iCPGBf5U4pIR4liI,8975
+sky/provision/lambda_cloud/lambda_utils.py,sha256=H8uaaMEpLn5cqGCdhUH_oJiccv_cuMguUNAl0NqB0Ik,9873
 sky/provision/paperspace/__init__.py,sha256=1nbUPWio7UA5gCQkO_rfEDfgXT17u5OtuByxQx4Ez6g,598
 sky/provision/paperspace/config.py,sha256=oNmffSt-V466pE0DmML8hOCX1CiA24jAqE5JEKuqpyI,1541
 sky/provision/paperspace/constants.py,sha256=NcLJGivJxshJwhR28yVHysWQ2gtMAkTVmHC91d3kyKM,957
@@ -180,7 +183,7 @@ sky/serve/serve_state.py,sha256=5BZSKKKxQRk-0mku17Ch4Veu4qOhaFvaOJY3zrZCkLw,1931
 sky/serve/serve_utils.py,sha256=im_1cJoJmufFxkBVnhK4nI6XlHvEXersQyIivNruJJc,38009
 sky/serve/service.py,sha256=fkfJvNJ2BO6rfV0TblZG-QkOXaCyZlpkwbGgrsTzf2w,11872
 sky/serve/service_spec.py,sha256=iRhW95SERvb4NWtV10uCuhgvW31HuSAmZZ55OX0WK8s,15309
-sky/setup_files/MANIFEST.in,sha256=
+sky/setup_files/MANIFEST.in,sha256=CXz8lIJMgWlH9TvYgzIL3vPFtSDoQq-UMfD9K62rtH4,590
 sky/setup_files/setup.py,sha256=o4IgiwFoTB6Sdn3MmOirUIS0OSkoh6qo_0vrgcmrYA4,12093
 sky/skylet/LICENSE,sha256=BnFrJSvUFpMUoH5mOpWnEvaC5R6Uux8W6WXgrte8iYg,12381
 sky/skylet/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -200,8 +203,6 @@ sky/skylet/providers/ibm/__init__.py,sha256=GXo5F9ztvs0qMDI_G9wM5KvzySfYugslJMHH
 sky/skylet/providers/ibm/node_provider.py,sha256=olNtCoCxjXTT-C_youwdQ9UF1DPgO8OVwDueotGFaJI,38280
 sky/skylet/providers/ibm/utils.py,sha256=63vhKqLLOhAZdibSp8VWWONeyCER9F6U2VLrSpzlizk,1292
 sky/skylet/providers/ibm/vpc_provider.py,sha256=GiOGlWYqqeBETfAeKqVj2-9shsMSP7z1WnO8UP5JTNo,34630
-sky/skylet/providers/lambda_cloud/__init__.py,sha256=DNxB-NL97FU7ptGcXTnIZZRt6ZLtNVtp3nayJNaxjhY,112
-sky/skylet/providers/lambda_cloud/node_provider.py,sha256=77AoTaXuSW5cr4Z2cEAbAsHx1e4l8c0fc1ABbdLduKE,14042
 sky/skylet/providers/oci/__init__.py,sha256=LRMTj6OhQoxiFJw4uNxG8cn6PllP8A-lGJL3Cs5DJok,91
 sky/skylet/providers/oci/node_provider.py,sha256=YPqiRag_cysvYMIMDGbMn6lOumvHad6FLJB5DGPr00Q,20492
 sky/skylet/providers/oci/query_helper.py,sha256=dUsvPGzWPNF5O2NjQvuC8tkilT4H11gMj6R7Qel2fDc,17202
@@ -221,7 +222,7 @@ sky/templates/aws-ray.yml.j2,sha256=K0rAuyf1XC_GPFp1BR9df42-Be12A6T2UF0BllVSpYg,
 sky/templates/azure-ray.yml.j2,sha256=RtYAcAmFQd6TB3j-pbxi7ekjWhznqFhJtzdkqH_nXqM,6135
 sky/templates/cudo-ray.yml.j2,sha256=SEHVY57iBauCOE2HYJtYVFEKlriAkdwQu_p86a1n_bA,3548
 sky/templates/fluidstack-ray.yml.j2,sha256=t8TCULgiErCZdtFmBZVsA8ZdcqR7ccwsmQhuDFTBEAU,3541
-sky/templates/gcp-ray.yml.j2,sha256=
+sky/templates/gcp-ray.yml.j2,sha256=y95B-Nk6hFxm6vEIaxI1wFzAIcy_GcKC3XMYo9m-ThI,9662
 sky/templates/ibm-ray.yml.j2,sha256=RMBUqPId8i4CnVwcyfK3DbRapF1jFMuGQlY0E0PFbMU,6669
 sky/templates/jobs-controller.yaml.j2,sha256=Gu3ogFxFYr09VEXP-6zEbrCUOFo1aYxWEjAq7whCrxo,1607
 sky/templates/kubernetes-ingress.yml.j2,sha256=73iDklVDWBMbItg0IexCa6_ClXPJOxw7PWz3leku4nE,1340
@@ -229,7 +230,7 @@ sky/templates/kubernetes-loadbalancer.yml.j2,sha256=IxrNYM366N01bbkJEbZ_UPYxUP8w
 sky/templates/kubernetes-port-forward-proxy-command.sh,sha256=HlG7CPBBedCVBlL9qv0erW_eKm6Irj0LFyaAWuJW_lc,3148
 sky/templates/kubernetes-ray.yml.j2,sha256=Wq9luXc6-t141uyHbtOy1IDmLMM0PBbePTZfZEtAKw0,18160
 sky/templates/kubernetes-ssh-jump.yml.j2,sha256=k5W5sOIMppU7dDkJMwPlqsUcb92y7L5_TVG3hkgMy8M,2747
-sky/templates/lambda-ray.yml.j2,sha256=
+sky/templates/lambda-ray.yml.j2,sha256=oMbrfv3zHoD1v1XXMLCLK1vB7wLBU1Z_jNpC4-5lGVo,3985
 sky/templates/local-ray.yml.j2,sha256=FNHeyHF6nW9nU9QLIZceUWfvrFTTcO51KqhTnYCEFaA,1185
 sky/templates/oci-ray.yml.j2,sha256=5XfIobW9XuspIpEhI4vFIEcJEFCdtFJqEGfX03zL6DE,7032
 sky/templates/paperspace-ray.yml.j2,sha256=HQjZNamrB_a4fOMCxQXSVdV5JIHtbGtAE0JzEO8uuVQ,4021
@@ -255,7 +256,7 @@ sky/utils/kubernetes_enums.py,sha256=imGqHSa8O07zD_6xH1SDMM7dBU5lF5fzFFlQuQy00QM
 sky/utils/log_utils.py,sha256=ptv2sbsiJSgk4NvdccrMsUR-MvOKnbu4BQiRSishgk0,12472
 sky/utils/resources_utils.py,sha256=sJuPextjJKHhvDGAaOPEzeEkteryF2fGNgNgBLqnLic,7419
 sky/utils/rich_utils.py,sha256=hmnI1X5dKvRIQzB7EyNb34FT97qFNve-0QHqM5r0mVk,3066
-sky/utils/schemas.py,sha256=
+sky/utils/schemas.py,sha256=qo9j1TJZXqgJlBgbQfqz1oIZAxc3CN8uWooKYPQXXIY,28878
 sky/utils/subprocess_utils.py,sha256=3R54Elc2n8DQeO6Y8MCDJ6N6v27HDGpbNMIfCquqXYQ,6552
 sky/utils/timeline.py,sha256=ao_nm0y52ZQILfL7Y92c3pSEFRyPm_ElORC3DrI5BwQ,3936
 sky/utils/ux_utils.py,sha256=CqyIFGDuSE8fQasPkna_loZMwtboC9KedR09WEQ7qz0,6502
@@ -273,9 +274,9 @@ sky/utils/kubernetes/k8s_gpu_labeler_job.yaml,sha256=KPqp23B-zQ2SZK03jdHeF9fLTog
 sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml,sha256=VLKT2KKimZu1GDg_4AIlIt488oMQvhRZWwsj9vBbPUg,3812
 sky/utils/kubernetes/rsync_helper.sh,sha256=aRMa_0JRHtXFOPtEg4rFAwR1t57wvvAoGZhn3H3BtGk,1059
 sky/utils/kubernetes/ssh_jump_lifecycle_manager.py,sha256=RFLJ3k7MR5UN4SKHykQ0lV9SgXumoULpKYIAt1vh-HU,6560
-skypilot_nightly-1.0.0.
-skypilot_nightly-1.0.0.
-skypilot_nightly-1.0.0.
-skypilot_nightly-1.0.0.
-skypilot_nightly-1.0.0.
-skypilot_nightly-1.0.0.
+skypilot_nightly-1.0.0.dev20241018.dist-info/LICENSE,sha256=emRJAvE7ngL6x0RhQvlns5wJzGI3NEQ_WMjNmd9TZc4,12170
+skypilot_nightly-1.0.0.dev20241018.dist-info/METADATA,sha256=hKti-qYovHe9BXjvZnYoV-88kOo2Qz0-xTDwY08RzrM,18945
+skypilot_nightly-1.0.0.dev20241018.dist-info/WHEEL,sha256=OVMc5UfuAQiSplgO0_WdW7vXVGAt9Hdd6qtN4HotdyA,91
+skypilot_nightly-1.0.0.dev20241018.dist-info/entry_points.txt,sha256=StA6HYpuHj-Y61L2Ze-hK2IcLWgLZcML5gJu8cs6nU4,36
+skypilot_nightly-1.0.0.dev20241018.dist-info/top_level.txt,sha256=qA8QuiNNb6Y1OF-pCUtPEr6sLEwy2xJX06Bd_CrtrHY,4
+skypilot_nightly-1.0.0.dev20241018.dist-info/RECORD,,
sky/skylet/providers/lambda_cloud/node_provider.py
DELETED
@@ -1,320 +0,0 @@
-import logging
-import os
-from threading import RLock
-import time
-from typing import Any, Dict, List, Optional
-
-from ray.autoscaler.node_provider import NodeProvider
-from ray.autoscaler.tags import NODE_KIND_HEAD
-from ray.autoscaler.tags import NODE_KIND_WORKER
-from ray.autoscaler.tags import STATUS_UP_TO_DATE
-from ray.autoscaler.tags import TAG_RAY_CLUSTER_NAME
-from ray.autoscaler.tags import TAG_RAY_NODE_KIND
-from ray.autoscaler.tags import TAG_RAY_NODE_NAME
-from ray.autoscaler.tags import TAG_RAY_NODE_STATUS
-from ray.autoscaler.tags import TAG_RAY_USER_NODE_TYPE
-
-from sky import authentication as auth
-from sky.clouds.utils import lambda_utils
-from sky.utils import command_runner
-from sky.utils import common_utils
-from sky.utils import subprocess_utils
-from sky.utils import ux_utils
-
-_TAG_PATH_PREFIX = '~/.sky/generated/lambda_cloud/metadata'
-_REMOTE_SSH_KEY_NAME = '~/.lambda_cloud/ssh_key_name'
-_REMOTE_RAY_SSH_KEY = '~/ray_bootstrap_key.pem'
-_REMOTE_RAY_YAML = '~/ray_bootstrap_config.yaml'
-_GET_INTERNAL_IP_CMD = 's=$(ip -4 -br addr show | grep UP); echo "$s"; echo "$s" | grep -Eo "(10\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)|172\.(1[6-9]|2[0-9]|3[0-1])|104\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?))\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)"'
-
-logger = logging.getLogger(__name__)
-
-
-def synchronized(f):
-
-    def wrapper(self, *args, **kwargs):
-        self.lock.acquire()
-        try:
-            return f(self, *args, **kwargs)
-        finally:
-            self.lock.release()
-
-    return wrapper
-
-
-class LambdaNodeProvider(NodeProvider):
-    """Node Provider for Lambda Cloud.
-
-    This provider assumes Lambda Cloud credentials are set.
-    """
-
-    def __init__(self, provider_config: Dict[str, Any],
-                 cluster_name: str) -> None:
-        NodeProvider.__init__(self, provider_config, cluster_name)
-        self.lock = RLock()
-        self.lambda_client = lambda_utils.LambdaCloudClient()
-        self.cached_nodes: Dict[str, Dict[str, Any]] = {}
-        self.metadata = lambda_utils.Metadata(_TAG_PATH_PREFIX, cluster_name)
-        self.ssh_key_path = os.path.expanduser(auth.PRIVATE_SSH_KEY_PATH)
-
-        def _get_ssh_key_name(prefix: str) -> str:
-            public_key_path = os.path.expanduser(auth.PUBLIC_SSH_KEY_PATH)
-            with open(public_key_path, 'r') as f:
-                public_key = f.read()
-            name, exists = self.lambda_client.get_unique_ssh_key_name(
-                prefix, public_key)
-            if not exists:
-                raise lambda_utils.LambdaCloudError('SSH key not found')
-            return name
-
-        ray_yaml_path = os.path.expanduser(_REMOTE_RAY_YAML)
-        self.on_head = (os.path.exists(ray_yaml_path) and
-                        common_utils.read_yaml(ray_yaml_path)['cluster_name']
-                        == cluster_name)
-
-        if self.on_head:
-            self.ssh_key_path = os.path.expanduser(_REMOTE_RAY_SSH_KEY)
-            ssh_key_name_path = os.path.expanduser(_REMOTE_SSH_KEY_NAME)
-            if os.path.exists(ssh_key_name_path):
-                with open(ssh_key_name_path, 'r') as f:
-                    self.ssh_key_name = f.read()
-            else:
-                # At this point, `~/.ssh/sky-key.pub` contains the public
-                # key used to launch this cluster. Use it to determine
-                # ssh key name and store the name in _REMOTE_SSH_KEY_NAME.
-                # Note: this case only runs during cluster launch, so it is
-                # not possible for ~/.ssh/sky-key.pub to already be regenerated
-                # by the user.
-                self.ssh_key_name = _get_ssh_key_name('')
-                with open(ssh_key_name_path, 'w', encoding='utf-8') as f:
-                    f.write(self.ssh_key_name)
-        else:
-            # On local
-            self.ssh_key_name = _get_ssh_key_name(
-                f'sky-key-{common_utils.get_user_hash()}')
-
-    def _guess_and_add_missing_tags(self, vms: List[Dict[str, Any]]) -> None:
-        """Adds missing vms to local tag file and guesses their tags."""
-        for node in vms:
-            if self.metadata.get(node['id']) is not None:
-                pass
-            elif node['name'] == f'{self.cluster_name}-head':
-                self.metadata.set(
-                    node['id'], {
-                        'tags': {
-                            TAG_RAY_CLUSTER_NAME: self.cluster_name,
-                            TAG_RAY_NODE_STATUS: STATUS_UP_TO_DATE,
-                            TAG_RAY_NODE_KIND: NODE_KIND_HEAD,
-                            TAG_RAY_USER_NODE_TYPE: 'ray_head_default',
-                            TAG_RAY_NODE_NAME: f'ray-{self.cluster_name}-head',
-                        }
-                    })
-            elif node['name'] == f'{self.cluster_name}-worker':
-                self.metadata.set(
-                    node['id'], {
-                        'tags': {
-                            TAG_RAY_CLUSTER_NAME: self.cluster_name,
-                            TAG_RAY_NODE_STATUS: STATUS_UP_TO_DATE,
-                            TAG_RAY_NODE_KIND: NODE_KIND_WORKER,
-                            TAG_RAY_USER_NODE_TYPE: 'ray_worker_default',
-                            TAG_RAY_NODE_NAME: f'ray-{self.cluster_name}-worker',
-                        }
-                    })
-
-    def _list_instances_in_cluster(self) -> List[Dict[str, Any]]:
-        """List running instances in cluster."""
-        vms = self.lambda_client.list_instances()
-        possible_names = [
-            f'{self.cluster_name}-head', f'{self.cluster_name}-worker'
-        ]
-        return [node for node in vms if node.get('name') in possible_names]
-
-    @synchronized
-    def _get_filtered_nodes(self, tag_filters: Dict[str,
-                                                    str]) -> Dict[str, Any]:
-
-        def _extract_metadata(vm: Dict[str, Any]) -> Dict[str, Any]:
-            metadata = {'id': vm['id'], 'status': vm['status'], 'tags': {}}
-            instance_info = self.metadata.get(vm['id'])
-            if instance_info is not None:
-                metadata['tags'] = instance_info['tags']
-            metadata['external_ip'] = vm.get('ip')
-            return metadata
-
-        def _match_tags(vm: Dict[str, Any]):
-            vm_info = self.metadata.get(vm['id'])
-            tags = {} if vm_info is None else vm_info['tags']
-            for k, v in tag_filters.items():
-                if tags.get(k) != v:
-                    return False
-            return True
-
-        def _get_internal_ip(node: Dict[str, Any]):
-            # TODO(ewzeng): cache internal ips in metadata file to reduce
-            # ssh overhead.
-            if node['external_ip'] is None or node['status'] != 'active':
-                node['internal_ip'] = None
-                return
-            runner = command_runner.SSHCommandRunner(
-                node=(node['external_ip'], 22),
-                ssh_user='ubuntu',
-                ssh_private_key=self.ssh_key_path)
-            rc, stdout, stderr = runner.run(_GET_INTERNAL_IP_CMD,
-                                            require_outputs=True,
-                                            stream_logs=False)
-            subprocess_utils.handle_returncode(
-                rc,
-                _GET_INTERNAL_IP_CMD,
-                'Failed get obtain private IP from node',
-                stderr=stdout + stderr)
-            node['internal_ip'] = stdout.strip()
-
-        vms = self._list_instances_in_cluster()
-        self.metadata.refresh([node['id'] for node in vms])
-        self._guess_and_add_missing_tags(vms)
-        nodes = [_extract_metadata(vm) for vm in filter(_match_tags, vms)]
-        nodes = [
-            node for node in nodes
-            if node['status'] not in ['terminating', 'terminated']
-        ]
-        subprocess_utils.run_in_parallel(_get_internal_ip, nodes)
-        self.cached_nodes = {node['id']: node for node in nodes}
-        return self.cached_nodes
-
-    def non_terminated_nodes(self, tag_filters: Dict[str, str]) -> List[str]:
-        """Return a list of node ids filtered by the specified tags dict.
-
-        This list must not include terminated nodes. For performance reasons,
-        providers are allowed to cache the result of a call to
-        non_terminated_nodes() to serve single-node queries
-        (e.g. is_running(node_id)). This means that non_terminated_nodes() must
-        be called again to refresh results.
-
-        Examples:
-            >>> provider.non_terminated_nodes({TAG_RAY_NODE_KIND: "worker"})
-            ["node-1", "node-2"]
-        """
-        nodes = self._get_filtered_nodes(tag_filters=tag_filters)
-        return [k for k, _ in nodes.items()]
-
-    def is_running(self, node_id: str) -> bool:
-        """Return whether the specified node is running."""
-        return self._get_cached_node(node_id=node_id) is not None
-
-    def is_terminated(self, node_id: str) -> bool:
-        """Return whether the specified node is terminated."""
-        return self._get_cached_node(node_id=node_id) is None
-
-    def node_tags(self, node_id: str) -> Dict[str, str]:
-        """Returns the tags of the given node (string dict)."""
-        node = self._get_cached_node(node_id=node_id)
-        if node is None:
-            return {}
-        return node['tags']
-
-    def external_ip(self, node_id: str) -> Optional[str]:
-        """Returns the external ip of the given node."""
-        node = self._get_cached_node(node_id=node_id)
-        if node is None:
-            return None
-        ip = node.get('external_ip')
-        with ux_utils.print_exception_no_traceback():
-            if ip is None:
-                raise lambda_utils.LambdaCloudError(
-                    'A node ip address was not found. Either '
-                    '(1) Lambda Cloud has internally errored, or '
-                    '(2) the cluster is still booting. '
-                    'You can manually terminate the cluster on the '
-                    'Lambda Cloud console or (in case 2) wait for '
-                    'booting to finish (~2 minutes).')
-        return ip
-
-    def internal_ip(self, node_id: str) -> Optional[str]:
-        """Returns the internal ip (Ray ip) of the given node."""
-        node = self._get_cached_node(node_id=node_id)
-        if node is None:
-            return None
-        ip = node.get('internal_ip')
-        with ux_utils.print_exception_no_traceback():
-            if ip is None:
-                raise lambda_utils.LambdaCloudError(
-                    'A node ip address was not found. Either '
-                    '(1) Lambda Cloud has internally errored, or '
-                    '(2) the cluster is still booting. '
-                    'You can manually terminate the cluster on the '
-                    'Lambda Cloud console or (in case 2) wait for '
-                    'booting to finish (~2 minutes).')
-        return ip
-
-    def create_node(self, node_config: Dict[str, Any], tags: Dict[str, str],
-                    count: int) -> None:
-        """Creates a number of nodes within the namespace."""
-        # Get tags
-        config_tags = node_config.get('tags', {}).copy()
-        config_tags.update(tags)
-        config_tags[TAG_RAY_CLUSTER_NAME] = self.cluster_name
-
-        # Create nodes
-        instance_type = node_config['InstanceType']
-        region = self.provider_config['region']
-
-        if config_tags[TAG_RAY_NODE_KIND] == NODE_KIND_HEAD:
-            name = f'{self.cluster_name}-head'
-            # Occasionally, the head node will continue running for a short
-            # period after termination. This can lead to the following bug:
-            # 1. Head node autodowns but continues running.
-            # 2. The next autodown event is triggered, which executes ray up.
-            # 3. Head node stops running.
-            # In this case, a new head node is created after the cluster has
-            # terminated. We avoid this with the following check:
-            if self.on_head:
-                raise lambda_utils.LambdaCloudError('Head already exists.')
-        else:
-            name = f'{self.cluster_name}-worker'
-
-        # Lambda launch api only supports launching one node at a time,
-        # so we do a loop. Remove loop when launch api allows quantity > 1
-        booting_list = []
-        for _ in range(count):
-            vm_id = self.lambda_client.create_instances(
-                instance_type=instance_type,
-                region=region,
-                quantity=1,
-                name=name,
-                ssh_key_name=self.ssh_key_name)[0]
-            self.metadata.set(vm_id, {'tags': config_tags})
-            booting_list.append(vm_id)
-            time.sleep(10)  # Avoid api rate limits
-
-        # Wait for nodes to finish booting
-        while True:
-            vms = self._list_instances_in_cluster()
-            for vm_id in booting_list.copy():
-                for vm in vms:
-                    if vm['id'] == vm_id and vm['status'] == 'active':
-                        booting_list.remove(vm_id)
-            if len(booting_list) == 0:
-                return
-            time.sleep(10)
-
-    @synchronized
-    def set_node_tags(self, node_id: str, tags: Dict[str, str]) -> None:
-        """Sets the tag values (string dict) for the specified node."""
-        node = self._get_node(node_id)
-        assert node is not None, node_id
-        node['tags'].update(tags)
-        self.metadata.set(node_id, {'tags': node['tags']})
-
-    def terminate_node(self, node_id: str) -> None:
-        """Terminates the specified node."""
-        self.lambda_client.remove_instances(node_id)
-        self.metadata.set(node_id, None)
-
-    def _get_node(self, node_id: str) -> Optional[Dict[str, Any]]:
-        self._get_filtered_nodes({})  # Side effect: updates cache
-        return self.cached_nodes.get(node_id, None)
-
-    def _get_cached_node(self, node_id: str) -> Optional[Dict[str, Any]]:
-        if node_id in self.cached_nodes:
-            return self.cached_nodes[node_id]
-        return self._get_node(node_id=node_id)
{skypilot_nightly-1.0.0.dev20241016.dist-info → skypilot_nightly-1.0.0.dev20241018.dist-info}/LICENSE
RENAMED
File without changes
{skypilot_nightly-1.0.0.dev20241016.dist-info → skypilot_nightly-1.0.0.dev20241018.dist-info}/WHEEL
RENAMED
File without changes
{skypilot_nightly-1.0.0.dev20241016.dist-info → skypilot_nightly-1.0.0.dev20241018.dist-info}/entry_points.txt
RENAMED
File without changes
{skypilot_nightly-1.0.0.dev20241016.dist-info → skypilot_nightly-1.0.0.dev20241018.dist-info}/top_level.txt
RENAMED
File without changes