skypilot-nightly 1.0.0.dev20241227__py3-none-any.whl → 1.0.0.dev20250124__py3-none-any.whl
This diff compares two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the package contents exactly as they appear in the public registry.
- sky/__init__.py +2 -2
- sky/adaptors/common.py +15 -9
- sky/adaptors/do.py +20 -0
- sky/adaptors/oci.py +32 -1
- sky/authentication.py +20 -8
- sky/backends/backend_utils.py +44 -0
- sky/backends/cloud_vm_ray_backend.py +202 -41
- sky/backends/wheel_utils.py +4 -1
- sky/check.py +31 -1
- sky/cli.py +39 -43
- sky/cloud_stores.py +71 -2
- sky/clouds/__init__.py +2 -0
- sky/clouds/aws.py +137 -50
- sky/clouds/cloud.py +4 -0
- sky/clouds/do.py +303 -0
- sky/clouds/gcp.py +9 -0
- sky/clouds/kubernetes.py +3 -3
- sky/clouds/oci.py +20 -9
- sky/clouds/service_catalog/__init__.py +7 -3
- sky/clouds/service_catalog/constants.py +1 -1
- sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +10 -51
- sky/clouds/service_catalog/do_catalog.py +111 -0
- sky/clouds/service_catalog/kubernetes_catalog.py +14 -0
- sky/clouds/utils/oci_utils.py +15 -2
- sky/core.py +8 -5
- sky/data/data_transfer.py +37 -0
- sky/data/data_utils.py +19 -4
- sky/data/mounting_utils.py +99 -15
- sky/data/storage.py +961 -130
- sky/global_user_state.py +1 -1
- sky/jobs/__init__.py +2 -0
- sky/jobs/constants.py +8 -7
- sky/jobs/controller.py +19 -22
- sky/jobs/core.py +46 -2
- sky/jobs/recovery_strategy.py +114 -143
- sky/jobs/scheduler.py +283 -0
- sky/jobs/state.py +290 -21
- sky/jobs/utils.py +346 -95
- sky/optimizer.py +6 -3
- sky/provision/aws/config.py +59 -29
- sky/provision/azure/instance.py +1 -1
- sky/provision/do/__init__.py +11 -0
- sky/provision/do/config.py +14 -0
- sky/provision/do/constants.py +10 -0
- sky/provision/do/instance.py +287 -0
- sky/provision/do/utils.py +306 -0
- sky/provision/docker_utils.py +22 -11
- sky/provision/gcp/instance_utils.py +15 -9
- sky/provision/kubernetes/instance.py +3 -2
- sky/provision/kubernetes/utils.py +125 -20
- sky/provision/oci/query_utils.py +17 -14
- sky/provision/provisioner.py +0 -1
- sky/provision/runpod/instance.py +10 -1
- sky/provision/runpod/utils.py +170 -13
- sky/resources.py +1 -1
- sky/serve/autoscalers.py +359 -301
- sky/serve/controller.py +10 -8
- sky/serve/core.py +84 -7
- sky/serve/load_balancer.py +27 -10
- sky/serve/replica_managers.py +1 -3
- sky/serve/serve_state.py +10 -5
- sky/serve/serve_utils.py +28 -1
- sky/serve/service.py +4 -3
- sky/serve/service_spec.py +31 -0
- sky/setup_files/dependencies.py +4 -1
- sky/skylet/constants.py +8 -4
- sky/skylet/events.py +7 -3
- sky/skylet/job_lib.py +10 -30
- sky/skylet/log_lib.py +8 -8
- sky/skylet/log_lib.pyi +3 -0
- sky/skylet/providers/command_runner.py +5 -7
- sky/skylet/skylet.py +1 -1
- sky/task.py +28 -1
- sky/templates/do-ray.yml.j2 +98 -0
- sky/templates/jobs-controller.yaml.j2 +41 -7
- sky/templates/runpod-ray.yml.j2 +13 -0
- sky/templates/sky-serve-controller.yaml.j2 +4 -0
- sky/usage/usage_lib.py +10 -2
- sky/utils/accelerator_registry.py +12 -8
- sky/utils/controller_utils.py +114 -39
- sky/utils/db_utils.py +18 -4
- sky/utils/kubernetes/deploy_remote_cluster.sh +5 -5
- sky/utils/log_utils.py +2 -0
- sky/utils/resources_utils.py +25 -21
- sky/utils/schemas.py +27 -0
- sky/utils/subprocess_utils.py +54 -10
- {skypilot_nightly-1.0.0.dev20241227.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/METADATA +23 -4
- {skypilot_nightly-1.0.0.dev20241227.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/RECORD +92 -82
- {skypilot_nightly-1.0.0.dev20241227.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/WHEEL +1 -1
- {skypilot_nightly-1.0.0.dev20241227.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20241227.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20241227.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/top_level.txt +0 -0
sky/provision/aws/config.py
CHANGED
@@ -383,10 +383,13 @@ def _usable_subnets(
             raise exc
 
     if not subnets:
+        vpc_msg = (f'Does a default VPC exist in region '
+                   f'{ec2.meta.client.meta.region_name}? ') if (
+                       vpc_id_of_sg is None) else ''
         _skypilot_log_error_and_exit_for_failover(
-            'No usable subnets found, try '
-            'manually creating an instance in your specified region to '
-            'populate the list of subnets and try again. '
+            f'No usable subnets found. {vpc_msg}'
+            'Try manually creating an instance in your specified region to '
+            'populate the list of subnets and try again. '
             'Note that the subnet must map public IPs '
             'on instance launch unless you set `use_internal_ips: true` in '
             'the `provider` config.')
@@ -495,6 +498,11 @@ def _get_subnet_and_vpc_id(ec2, security_group_ids: Optional[List[str]],
         vpc_id_of_sg = None
 
     all_subnets = list(ec2.subnets.all())
+    # If no VPC is specified, use the default VPC.
+    # We filter only for default VPCs to avoid using subnets that users may
+    # not want SkyPilot to use.
+    if vpc_id_of_sg is None:
+        all_subnets = [s for s in all_subnets if s.vpc.is_default]
     subnets, vpc_id = _usable_subnets(
         ec2,
         user_specified_subnets=None,
@@ -545,17 +553,28 @@ def _configure_security_group(ec2, vpc_id: str, expected_sg_name: str,
 
 def _get_or_create_vpc_security_group(ec2, vpc_id: str,
                                       expected_sg_name: str) -> Any:
-
-    vpc_to_existing_sg = {
-        sg.vpc_id: sg for sg in _get_security_groups_from_vpc_ids(
-            ec2,
-            [vpc_id],
-            [expected_sg_name],
-        )
-    }
+    """Find or create a security group in the specified VPC.
 
-
-
+    Args:
+        ec2: The initialized EC2 client object.
+        vpc_id: The ID of the VPC where the security group should be queried
+            or created.
+        expected_sg_name: The expected name of the security group.
+
+    Returns:
+        The security group object containing the details of the security group.
+
+    Raises:
+        exceptions.NoClusterLaunchedError: If the security group creation fails
+            and is not due to an existing duplicate.
+        botocore.exceptions.ClientError: If the security group creation fails
+            due to AWS service issues.
+    """
+    # Figure out which security groups with this name exist for each VPC...
+    security_group = _get_security_group_from_vpc_id(ec2, vpc_id,
+                                                     expected_sg_name)
+    if security_group is not None:
+        return security_group
 
     try:
         # create a new security group
@@ -565,34 +584,45 @@ def _get_or_create_vpc_security_group(ec2, vpc_id: str,
             VpcId=vpc_id,
         )
     except ec2.meta.client.exceptions.ClientError as e:
+        if e.response['Error']['Code'] == 'InvalidGroup.Duplicate':
+            # The security group already exists, but we didn't see it
+            # because of eventual consistency.
+            logger.warning(f'{expected_sg_name} already exists when creating.')
+            security_group = _get_security_group_from_vpc_id(
+                ec2, vpc_id, expected_sg_name)
+            assert (security_group is not None and
+                    security_group.group_name == expected_sg_name), (
+                        f'Expected {expected_sg_name} but got {security_group}')
+            logger.info(
+                f'Found existing security group {colorama.Style.BRIGHT}'
+                f'{security_group.group_name}{colorama.Style.RESET_ALL} '
+                f'[id={security_group.id}]')
+            return security_group
         message = ('Failed to create security group. Error: '
                    f'{common_utils.format_exception(e)}')
         logger.warning(message)
         raise exceptions.NoClusterLaunchedError(message) from e
 
-    security_group =
-
-
-    assert security_group, 'Failed to create security group'
-    security_group = security_group[0]
-
+    security_group = _get_security_group_from_vpc_id(ec2, vpc_id,
+                                                     expected_sg_name)
+    assert security_group is not None, 'Failed to create security group'
     logger.info(f'Created new security group {colorama.Style.BRIGHT}'
                 f'{security_group.group_name}{colorama.Style.RESET_ALL} '
                 f'[id={security_group.id}]')
     return security_group
 
 
-def
-
-
-    unique_group_names = set(group_names)
-
+def _get_security_group_from_vpc_id(ec2, vpc_id: str,
+                                    group_name: str) -> Optional[Any]:
+    """Get security group by VPC ID and group name."""
     existing_groups = list(
         ec2.security_groups.filter(Filters=[{
             'Name': 'vpc-id',
-            'Values':
+            'Values': [vpc_id]
         }]))
-
-
-
-
+
+    for sg in existing_groups:
+        if sg.group_name == group_name:
+            return sg
+
+    return None
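Taken together, the config.py changes replace the old multi-VPC lookup with a single-VPC helper and make creation tolerant of the InvalidGroup.Duplicate race. Below is a minimal, self-contained sketch of that get-or-create pattern against plain boto3; the function names, description string, and placeholder VPC ID are illustrative, not SkyPilot's API.

from typing import Any, Optional

import boto3
from botocore.exceptions import ClientError


def find_security_group(ec2, vpc_id: str, group_name: str) -> Optional[Any]:
    # Same idea as _get_security_group_from_vpc_id(): filter by VPC,
    # then match on the group name.
    for sg in ec2.security_groups.filter(Filters=[{
            'Name': 'vpc-id',
            'Values': [vpc_id],
    }]):
        if sg.group_name == group_name:
            return sg
    return None


def get_or_create_security_group(ec2, vpc_id: str, group_name: str) -> Any:
    sg = find_security_group(ec2, vpc_id, group_name)
    if sg is not None:
        return sg
    try:
        return ec2.create_security_group(GroupName=group_name,
                                         Description='example group',
                                         VpcId=vpc_id)
    except ClientError as e:
        # Another writer may have created the group after our lookup
        # (eventual consistency); re-read it instead of failing.
        if e.response['Error']['Code'] == 'InvalidGroup.Duplicate':
            return find_security_group(ec2, vpc_id, group_name)
        raise


if __name__ == '__main__':
    ec2 = boto3.resource('ec2')
    # 'vpc-0123456789abcdef0' is a placeholder VPC ID for illustration.
    print(get_or_create_security_group(ec2, 'vpc-0123456789abcdef0',
                                       'sky-example-sg'))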
sky/provision/azure/instance.py
CHANGED
@@ -343,7 +343,7 @@ def _create_instances(compute_client: 'azure_compute.ComputeManagementClient',
         _create_vm(compute_client, vm_name, node_tags, provider_config,
                    node_config, network_interface.id)
 
-    subprocess_utils.run_in_parallel(create_single_instance, range(count))
+    subprocess_utils.run_in_parallel(create_single_instance, list(range(count)))
 
     # Update disk performance tier
     performance_tier = node_config.get('disk_performance_tier', None)
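The one-line Azure change materializes the range into a list before handing it to the parallel runner, presumably so the helper receives the concrete List its signature expects. The sketch below is not SkyPilot's subprocess_utils.run_in_parallel; it is a generic, hypothetical stand-in for a helper that takes an explicit list of arguments.

import concurrent.futures
from typing import Callable, List, TypeVar

T = TypeVar('T')


def run_in_parallel(fn: Callable[[T], None], args: List[T]) -> None:
    # A hypothetical runner: it needs len(args) up front to size the pool
    # and then submits each argument, so a concrete list is a natural fit.
    workers = max(1, len(args))
    with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as pool:
        list(pool.map(fn, args))


run_in_parallel(print, list(range(3)))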
sky/provision/do/__init__.py
ADDED
@@ -0,0 +1,11 @@
+"""DO provisioner for SkyPilot."""
+
+from sky.provision.do.config import bootstrap_instances
+from sky.provision.do.instance import cleanup_ports
+from sky.provision.do.instance import get_cluster_info
+from sky.provision.do.instance import open_ports
+from sky.provision.do.instance import query_instances
+from sky.provision.do.instance import run_instances
+from sky.provision.do.instance import stop_instances
+from sky.provision.do.instance import terminate_instances
+from sky.provision.do.instance import wait_instances
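The new package re-exports the instance-level entry points at sky.provision.do so that, presumably, the provisioner framework can resolve them by provider name alone. A hypothetical sketch of that kind of name-based dispatch (not SkyPilot's actual router):

import importlib
from typing import Any, Callable


def resolve_provider_fn(provider: str, fn_name: str) -> Callable[..., Any]:
    # e.g. resolve_provider_fn('do', 'query_instances') would return the
    # query_instances re-exported by sky/provision/do/__init__.py above.
    module = importlib.import_module(f'sky.provision.{provider}')
    return getattr(module, fn_name)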
sky/provision/do/config.py
ADDED
@@ -0,0 +1,14 @@
+"""Paperspace configuration bootstrapping."""
+
+from sky import sky_logging
+from sky.provision import common
+
+logger = sky_logging.init_logger(__name__)
+
+
+def bootstrap_instances(
+        region: str, cluster_name: str,
+        config: common.ProvisionConfig) -> common.ProvisionConfig:
+    """Bootstraps instances for the given cluster."""
+    del region, cluster_name
+    return config
sky/provision/do/instance.py
ADDED
@@ -0,0 +1,287 @@
+"""DigitalOcean instance provisioning."""
+
+import time
+from typing import Any, Dict, List, Optional
+import uuid
+
+from sky import sky_logging
+from sky import status_lib
+from sky.provision import common
+from sky.provision.do import constants
+from sky.provision.do import utils
+
+# The maximum number of times to poll for the status of an operation
+MAX_POLLS = 60 // constants.POLL_INTERVAL
+# Stopping instances can take several minutes, so we increase the timeout
+MAX_POLLS_FOR_UP_OR_STOP = MAX_POLLS * 8
+
+logger = sky_logging.init_logger(__name__)
+
+
+def _get_head_instance(
+        instances: Dict[str, Dict[str, Any]]) -> Optional[Dict[str, Any]]:
+    for instance_name, instance_meta in instances.items():
+        if instance_name.endswith('-head'):
+            return instance_meta
+    return None
+
+
+def run_instances(region: str, cluster_name_on_cloud: str,
+                  config: common.ProvisionConfig) -> common.ProvisionRecord:
+    """Runs instances for the given cluster."""
+
+    pending_status = ['new']
+    newly_started_instances = utils.filter_instances(cluster_name_on_cloud,
+                                                     pending_status + ['off'])
+    while True:
+        instances = utils.filter_instances(cluster_name_on_cloud,
+                                           pending_status)
+        if not instances:
+            break
+        instance_statuses = [
+            instance['status'] for instance in instances.values()
+        ]
+        logger.info(f'Waiting for {len(instances)} instances to be ready: '
+                    f'{instance_statuses}')
+        time.sleep(constants.POLL_INTERVAL)
+
+    exist_instances = utils.filter_instances(cluster_name_on_cloud,
+                                             status_filters=pending_status +
+                                             ['active', 'off'])
+    if len(exist_instances) > config.count:
+        raise RuntimeError(
+            f'Cluster {cluster_name_on_cloud} already has '
+            f'{len(exist_instances)} nodes, but {config.count} are required.')
+
+    stopped_instances = utils.filter_instances(cluster_name_on_cloud,
+                                               status_filters=['off'])
+    for instance in stopped_instances.values():
+        utils.start_instance(instance)
+    for _ in range(MAX_POLLS_FOR_UP_OR_STOP):
+        instances = utils.filter_instances(cluster_name_on_cloud, ['off'])
+        if len(instances) == 0:
+            break
+        num_stopped_instances = len(stopped_instances)
+        num_restarted_instances = num_stopped_instances - len(instances)
+        logger.info(
+            f'Waiting for {num_restarted_instances}/{num_stopped_instances} '
+            'stopped instances to be restarted.')
+        time.sleep(constants.POLL_INTERVAL)
+    else:
+        msg = ('run_instances: Failed to restart all'
+               'instances possibly due to to capacity issue.')
+        logger.warning(msg)
+        raise RuntimeError(msg)
+
+    exist_instances = utils.filter_instances(cluster_name_on_cloud,
+                                             status_filters=['active'])
+    head_instance = _get_head_instance(exist_instances)
+    to_start_count = config.count - len(exist_instances)
+    if to_start_count < 0:
+        raise RuntimeError(
+            f'Cluster {cluster_name_on_cloud} already has '
+            f'{len(exist_instances)} nodes, but {config.count} are required.')
+    if to_start_count == 0:
+        if head_instance is None:
+            head_instance = list(exist_instances.values())[0]
+            utils.rename_instance(
+                head_instance,
+                f'{cluster_name_on_cloud}-{uuid.uuid4().hex[:4]}-head')
+        assert head_instance is not None, ('`head_instance` should not be None')
+        logger.info(f'Cluster {cluster_name_on_cloud} already has '
+                    f'{len(exist_instances)} nodes, no need to start more.')
+        return common.ProvisionRecord(
+            provider_name='do',
+            cluster_name=cluster_name_on_cloud,
+            region=region,
+            zone=None,
+            head_instance_id=head_instance['name'],
+            resumed_instance_ids=list(newly_started_instances.keys()),
+            created_instance_ids=[],
+        )
+
+    created_instances: List[Dict[str, Any]] = []
+    for _ in range(to_start_count):
+        instance_type = 'head' if head_instance is None else 'worker'
+        instance = utils.create_instance(
+            region=region,
+            cluster_name_on_cloud=cluster_name_on_cloud,
+            instance_type=instance_type,
+            config=config)
+        logger.info(f'Launched instance {instance["name"]}.')
+        created_instances.append(instance)
+        if head_instance is None:
+            head_instance = instance
+
+    # Wait for instances to be ready.
+    for _ in range(MAX_POLLS_FOR_UP_OR_STOP):
+        instances = utils.filter_instances(cluster_name_on_cloud,
+                                           status_filters=['active'])
+        logger.info('Waiting for instances to be ready: '
+                    f'({len(instances)}/{config.count}).')
+        if len(instances) == config.count:
+            break
+
+        time.sleep(constants.POLL_INTERVAL)
+    else:
+        # Failed to launch config.count of instances after max retries
+        msg = 'run_instances: Failed to create the instances'
+        logger.warning(msg)
+        raise RuntimeError(msg)
+    assert head_instance is not None, 'head_instance should not be None'
+    return common.ProvisionRecord(
+        provider_name='do',
+        cluster_name=cluster_name_on_cloud,
+        region=region,
+        zone=None,
+        head_instance_id=head_instance['name'],
+        resumed_instance_ids=list(stopped_instances.keys()),
+        created_instance_ids=[
+            instance['name'] for instance in created_instances
+        ],
+    )
+
+
+def wait_instances(region: str, cluster_name_on_cloud: str,
+                   state: Optional[status_lib.ClusterStatus]) -> None:
+    del region, cluster_name_on_cloud, state  # unused
+    # We already wait on ready state in `run_instances` no need
+
+
+def stop_instances(
+    cluster_name_on_cloud: str,
+    provider_config: Optional[Dict[str, Any]] = None,
+    worker_only: bool = False,
+) -> None:
+    del provider_config  # unused
+    all_instances = utils.filter_instances(cluster_name_on_cloud,
+                                           status_filters=None)
+    num_instances = len(all_instances)
+
+    # Request a stop on all instances
+    for instance_name, instance_meta in all_instances.items():
+        if worker_only and instance_name.endswith('-head'):
+            num_instances -= 1
+            continue
+        utils.stop_instance(instance_meta)
+
+    # Wait for instances to stop
+    for _ in range(MAX_POLLS_FOR_UP_OR_STOP):
+        all_instances = utils.filter_instances(cluster_name_on_cloud, ['off'])
+        if len(all_instances) >= num_instances:
+            break
+        time.sleep(constants.POLL_INTERVAL)
+    else:
+        raise RuntimeError(f'Maximum number of polls: '
+                           f'{MAX_POLLS_FOR_UP_OR_STOP} reached. '
+                           f'Instance {all_instances} is still not in '
+                           'STOPPED status.')
+
+
+def terminate_instances(
+    cluster_name_on_cloud: str,
+    provider_config: Optional[Dict[str, Any]] = None,
+    worker_only: bool = False,
+) -> None:
+    """See sky/provision/__init__.py"""
+    del provider_config  # unused
+    instances = utils.filter_instances(cluster_name_on_cloud,
+                                       status_filters=None)
+    for instance_name, instance_meta in instances.items():
+        logger.debug(f'Terminating instance {instance_name}')
+        if worker_only and instance_name.endswith('-head'):
+            continue
+        utils.down_instance(instance_meta)
+
+    for _ in range(MAX_POLLS_FOR_UP_OR_STOP):
+        instances = utils.filter_instances(cluster_name_on_cloud,
+                                           status_filters=None)
+        if len(instances) == 0 or len(instances) <= 1 and worker_only:
+            break
+        time.sleep(constants.POLL_INTERVAL)
+    else:
+        msg = ('Failed to delete all instances')
+        logger.warning(msg)
+        raise RuntimeError(msg)
+
+
+def get_cluster_info(
+    region: str,
+    cluster_name_on_cloud: str,
+    provider_config: Optional[Dict[str, Any]] = None,
+) -> common.ClusterInfo:
+    del region  # unused
+    running_instances = utils.filter_instances(cluster_name_on_cloud,
+                                               ['active'])
+    instances: Dict[str, List[common.InstanceInfo]] = {}
+    head_instance: Optional[str] = None
+    for instance_name, instance_meta in running_instances.items():
+        if instance_name.endswith('-head'):
+            head_instance = instance_name
+        for net in instance_meta['networks']['v4']:
+            if net['type'] == 'public':
+                instance_ip = net['ip_address']
+                break
+        instances[instance_name] = [
+            common.InstanceInfo(
+                instance_id=instance_meta['name'],
+                internal_ip=instance_ip,
+                external_ip=instance_ip,
+                ssh_port=22,
+                tags={},
+            )
+        ]
+
+    assert head_instance is not None, 'no head instance found'
+    return common.ClusterInfo(
+        instances=instances,
+        head_instance_id=head_instance,
+        provider_name='do',
+        provider_config=provider_config,
+    )
+
+
+def query_instances(
+    cluster_name_on_cloud: str,
+    provider_config: Optional[Dict[str, Any]] = None,
+    non_terminated_only: bool = True,
+) -> Dict[str, Optional[status_lib.ClusterStatus]]:
+    """See sky/provision/__init__.py"""
+    # terminated instances are not retrieved by the
+    # API making `non_terminated_only` argument moot.
+    del non_terminated_only
+    assert provider_config is not None, (cluster_name_on_cloud, provider_config)
+    instances = utils.filter_instances(cluster_name_on_cloud,
+                                       status_filters=None)
+
+    status_map = {
+        'new': status_lib.ClusterStatus.INIT,
+        'archive': status_lib.ClusterStatus.INIT,
+        'active': status_lib.ClusterStatus.UP,
+        'off': status_lib.ClusterStatus.STOPPED,
+    }
+    statuses: Dict[str, Optional[status_lib.ClusterStatus]] = {}
+    for instance_meta in instances.values():
+        status = status_map[instance_meta['status']]
+        statuses[instance_meta['name']] = status
+    return statuses
+
+
+def open_ports(
+    cluster_name_on_cloud: str,
+    ports: List[str],
+    provider_config: Optional[Dict[str, Any]] = None,
+) -> None:
+    """See sky/provision/__init__.py"""
+    logger.debug(
+        f'Skip opening ports {ports} for DigitalOcean instances, as all '
+        'ports are open by default.')
+    del cluster_name_on_cloud, provider_config, ports
+
+
+def cleanup_ports(
+    cluster_name_on_cloud: str,
+    ports: List[str],
+    provider_config: Optional[Dict[str, Any]] = None,
+) -> None:
+    del cluster_name_on_cloud, provider_config, ports
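run_instances, stop_instances, and terminate_instances above all share the same shape: poll inside a bounded for loop and rely on Python's for/else so the else branch fires only when the retry budget is exhausted without a break. A self-contained sketch of that pattern (the interval, budget, and predicate here are made up for illustration):

import time
from typing import Callable

POLL_INTERVAL = 1  # seconds; illustrative only
MAX_POLLS = 10


def wait_until(done: Callable[[], bool], max_polls: int = MAX_POLLS) -> None:
    for _ in range(max_polls):
        if done():
            break
        time.sleep(POLL_INTERVAL)
    else:
        # Runs only if the loop finished without ever hitting `break`,
        # i.e. the condition never became true within the poll budget.
        raise RuntimeError(f'Condition not met after {max_polls} polls.')


if __name__ == '__main__':
    start = time.time()
    wait_until(lambda: time.time() - start > 2)
    print('done')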