skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +64 -32
- sky/adaptors/aws.py +23 -6
- sky/adaptors/azure.py +432 -15
- sky/adaptors/cloudflare.py +5 -5
- sky/adaptors/common.py +19 -9
- sky/adaptors/do.py +20 -0
- sky/adaptors/gcp.py +3 -2
- sky/adaptors/kubernetes.py +122 -88
- sky/adaptors/nebius.py +100 -0
- sky/adaptors/oci.py +39 -1
- sky/adaptors/vast.py +29 -0
- sky/admin_policy.py +101 -0
- sky/authentication.py +117 -98
- sky/backends/backend.py +52 -20
- sky/backends/backend_utils.py +669 -557
- sky/backends/cloud_vm_ray_backend.py +1099 -808
- sky/backends/local_docker_backend.py +14 -8
- sky/backends/wheel_utils.py +38 -20
- sky/benchmark/benchmark_utils.py +22 -23
- sky/check.py +76 -27
- sky/cli.py +1586 -1139
- sky/client/__init__.py +1 -0
- sky/client/cli.py +5683 -0
- sky/client/common.py +345 -0
- sky/client/sdk.py +1765 -0
- sky/cloud_stores.py +283 -19
- sky/clouds/__init__.py +7 -2
- sky/clouds/aws.py +303 -112
- sky/clouds/azure.py +185 -179
- sky/clouds/cloud.py +115 -37
- sky/clouds/cudo.py +29 -22
- sky/clouds/do.py +313 -0
- sky/clouds/fluidstack.py +44 -54
- sky/clouds/gcp.py +206 -65
- sky/clouds/ibm.py +26 -21
- sky/clouds/kubernetes.py +345 -91
- sky/clouds/lambda_cloud.py +40 -29
- sky/clouds/nebius.py +297 -0
- sky/clouds/oci.py +129 -90
- sky/clouds/paperspace.py +22 -18
- sky/clouds/runpod.py +53 -34
- sky/clouds/scp.py +28 -24
- sky/clouds/service_catalog/__init__.py +19 -13
- sky/clouds/service_catalog/aws_catalog.py +29 -12
- sky/clouds/service_catalog/azure_catalog.py +33 -6
- sky/clouds/service_catalog/common.py +95 -75
- sky/clouds/service_catalog/constants.py +3 -3
- sky/clouds/service_catalog/cudo_catalog.py +13 -3
- sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
- sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
- sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
- sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
- sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
- sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
- sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
- sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
- sky/clouds/service_catalog/do_catalog.py +111 -0
- sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
- sky/clouds/service_catalog/gcp_catalog.py +16 -2
- sky/clouds/service_catalog/ibm_catalog.py +2 -2
- sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
- sky/clouds/service_catalog/lambda_catalog.py +8 -3
- sky/clouds/service_catalog/nebius_catalog.py +116 -0
- sky/clouds/service_catalog/oci_catalog.py +31 -4
- sky/clouds/service_catalog/paperspace_catalog.py +2 -2
- sky/clouds/service_catalog/runpod_catalog.py +2 -2
- sky/clouds/service_catalog/scp_catalog.py +2 -2
- sky/clouds/service_catalog/vast_catalog.py +104 -0
- sky/clouds/service_catalog/vsphere_catalog.py +2 -2
- sky/clouds/utils/aws_utils.py +65 -0
- sky/clouds/utils/azure_utils.py +91 -0
- sky/clouds/utils/gcp_utils.py +5 -9
- sky/clouds/utils/oci_utils.py +47 -5
- sky/clouds/utils/scp_utils.py +4 -3
- sky/clouds/vast.py +280 -0
- sky/clouds/vsphere.py +22 -18
- sky/core.py +361 -107
- sky/dag.py +41 -28
- sky/data/data_transfer.py +37 -0
- sky/data/data_utils.py +211 -32
- sky/data/mounting_utils.py +182 -30
- sky/data/storage.py +2118 -270
- sky/data/storage_utils.py +126 -5
- sky/exceptions.py +179 -8
- sky/execution.py +158 -85
- sky/global_user_state.py +150 -34
- sky/jobs/__init__.py +12 -10
- sky/jobs/client/__init__.py +0 -0
- sky/jobs/client/sdk.py +302 -0
- sky/jobs/constants.py +49 -11
- sky/jobs/controller.py +161 -99
- sky/jobs/dashboard/dashboard.py +171 -25
- sky/jobs/dashboard/templates/index.html +572 -60
- sky/jobs/recovery_strategy.py +157 -156
- sky/jobs/scheduler.py +307 -0
- sky/jobs/server/__init__.py +1 -0
- sky/jobs/server/core.py +598 -0
- sky/jobs/server/dashboard_utils.py +69 -0
- sky/jobs/server/server.py +190 -0
- sky/jobs/state.py +627 -122
- sky/jobs/utils.py +615 -206
- sky/models.py +27 -0
- sky/optimizer.py +142 -83
- sky/provision/__init__.py +20 -5
- sky/provision/aws/config.py +124 -42
- sky/provision/aws/instance.py +130 -53
- sky/provision/azure/__init__.py +7 -0
- sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
- sky/provision/azure/config.py +220 -0
- sky/provision/azure/instance.py +1012 -37
- sky/provision/common.py +31 -3
- sky/provision/constants.py +25 -0
- sky/provision/cudo/__init__.py +2 -1
- sky/provision/cudo/cudo_utils.py +112 -0
- sky/provision/cudo/cudo_wrapper.py +37 -16
- sky/provision/cudo/instance.py +28 -12
- sky/provision/do/__init__.py +11 -0
- sky/provision/do/config.py +14 -0
- sky/provision/do/constants.py +10 -0
- sky/provision/do/instance.py +287 -0
- sky/provision/do/utils.py +301 -0
- sky/provision/docker_utils.py +82 -46
- sky/provision/fluidstack/fluidstack_utils.py +57 -125
- sky/provision/fluidstack/instance.py +15 -43
- sky/provision/gcp/config.py +19 -9
- sky/provision/gcp/constants.py +7 -1
- sky/provision/gcp/instance.py +55 -34
- sky/provision/gcp/instance_utils.py +339 -80
- sky/provision/gcp/mig_utils.py +210 -0
- sky/provision/instance_setup.py +172 -133
- sky/provision/kubernetes/__init__.py +1 -0
- sky/provision/kubernetes/config.py +104 -90
- sky/provision/kubernetes/constants.py +8 -0
- sky/provision/kubernetes/instance.py +680 -325
- sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
- sky/provision/kubernetes/network.py +54 -20
- sky/provision/kubernetes/network_utils.py +70 -21
- sky/provision/kubernetes/utils.py +1370 -251
- sky/provision/lambda_cloud/__init__.py +11 -0
- sky/provision/lambda_cloud/config.py +10 -0
- sky/provision/lambda_cloud/instance.py +265 -0
- sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
- sky/provision/logging.py +1 -1
- sky/provision/nebius/__init__.py +11 -0
- sky/provision/nebius/config.py +11 -0
- sky/provision/nebius/instance.py +285 -0
- sky/provision/nebius/utils.py +318 -0
- sky/provision/oci/__init__.py +15 -0
- sky/provision/oci/config.py +51 -0
- sky/provision/oci/instance.py +436 -0
- sky/provision/oci/query_utils.py +681 -0
- sky/provision/paperspace/constants.py +6 -0
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/paperspace/utils.py +2 -0
- sky/provision/provisioner.py +207 -130
- sky/provision/runpod/__init__.py +1 -0
- sky/provision/runpod/api/__init__.py +3 -0
- sky/provision/runpod/api/commands.py +119 -0
- sky/provision/runpod/api/pods.py +142 -0
- sky/provision/runpod/instance.py +64 -8
- sky/provision/runpod/utils.py +239 -23
- sky/provision/vast/__init__.py +10 -0
- sky/provision/vast/config.py +11 -0
- sky/provision/vast/instance.py +247 -0
- sky/provision/vast/utils.py +162 -0
- sky/provision/vsphere/common/vim_utils.py +1 -1
- sky/provision/vsphere/instance.py +8 -18
- sky/provision/vsphere/vsphere_utils.py +1 -1
- sky/resources.py +247 -102
- sky/serve/__init__.py +9 -9
- sky/serve/autoscalers.py +361 -299
- sky/serve/client/__init__.py +0 -0
- sky/serve/client/sdk.py +366 -0
- sky/serve/constants.py +12 -3
- sky/serve/controller.py +106 -36
- sky/serve/load_balancer.py +63 -12
- sky/serve/load_balancing_policies.py +84 -2
- sky/serve/replica_managers.py +42 -34
- sky/serve/serve_state.py +62 -32
- sky/serve/serve_utils.py +271 -160
- sky/serve/server/__init__.py +0 -0
- sky/serve/{core.py → server/core.py} +271 -90
- sky/serve/server/server.py +112 -0
- sky/serve/service.py +52 -16
- sky/serve/service_spec.py +95 -32
- sky/server/__init__.py +1 -0
- sky/server/common.py +430 -0
- sky/server/constants.py +21 -0
- sky/server/html/log.html +174 -0
- sky/server/requests/__init__.py +0 -0
- sky/server/requests/executor.py +472 -0
- sky/server/requests/payloads.py +487 -0
- sky/server/requests/queues/__init__.py +0 -0
- sky/server/requests/queues/mp_queue.py +76 -0
- sky/server/requests/requests.py +567 -0
- sky/server/requests/serializers/__init__.py +0 -0
- sky/server/requests/serializers/decoders.py +192 -0
- sky/server/requests/serializers/encoders.py +166 -0
- sky/server/server.py +1106 -0
- sky/server/stream_utils.py +141 -0
- sky/setup_files/MANIFEST.in +2 -5
- sky/setup_files/dependencies.py +159 -0
- sky/setup_files/setup.py +14 -125
- sky/sky_logging.py +59 -14
- sky/skylet/autostop_lib.py +2 -2
- sky/skylet/constants.py +183 -50
- sky/skylet/events.py +22 -10
- sky/skylet/job_lib.py +403 -258
- sky/skylet/log_lib.py +111 -71
- sky/skylet/log_lib.pyi +6 -0
- sky/skylet/providers/command_runner.py +6 -8
- sky/skylet/providers/ibm/node_provider.py +2 -2
- sky/skylet/providers/scp/config.py +11 -3
- sky/skylet/providers/scp/node_provider.py +8 -8
- sky/skylet/skylet.py +3 -1
- sky/skylet/subprocess_daemon.py +69 -17
- sky/skypilot_config.py +119 -57
- sky/task.py +205 -64
- sky/templates/aws-ray.yml.j2 +37 -7
- sky/templates/azure-ray.yml.j2 +27 -82
- sky/templates/cudo-ray.yml.j2 +7 -3
- sky/templates/do-ray.yml.j2 +98 -0
- sky/templates/fluidstack-ray.yml.j2 +7 -4
- sky/templates/gcp-ray.yml.j2 +26 -6
- sky/templates/ibm-ray.yml.j2 +3 -2
- sky/templates/jobs-controller.yaml.j2 +46 -11
- sky/templates/kubernetes-ingress.yml.j2 +7 -0
- sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
- sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
- sky/templates/kubernetes-ray.yml.j2 +292 -25
- sky/templates/lambda-ray.yml.j2 +30 -40
- sky/templates/nebius-ray.yml.j2 +79 -0
- sky/templates/oci-ray.yml.j2 +18 -57
- sky/templates/paperspace-ray.yml.j2 +10 -6
- sky/templates/runpod-ray.yml.j2 +26 -4
- sky/templates/scp-ray.yml.j2 +3 -2
- sky/templates/sky-serve-controller.yaml.j2 +12 -1
- sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
- sky/templates/vast-ray.yml.j2 +70 -0
- sky/templates/vsphere-ray.yml.j2 +8 -3
- sky/templates/websocket_proxy.py +64 -0
- sky/usage/constants.py +10 -1
- sky/usage/usage_lib.py +130 -37
- sky/utils/accelerator_registry.py +35 -51
- sky/utils/admin_policy_utils.py +147 -0
- sky/utils/annotations.py +51 -0
- sky/utils/cli_utils/status_utils.py +81 -23
- sky/utils/cluster_utils.py +356 -0
- sky/utils/command_runner.py +452 -89
- sky/utils/command_runner.pyi +77 -3
- sky/utils/common.py +54 -0
- sky/utils/common_utils.py +319 -108
- sky/utils/config_utils.py +204 -0
- sky/utils/control_master_utils.py +48 -0
- sky/utils/controller_utils.py +548 -266
- sky/utils/dag_utils.py +93 -32
- sky/utils/db_utils.py +18 -4
- sky/utils/env_options.py +29 -7
- sky/utils/kubernetes/create_cluster.sh +8 -60
- sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
- sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
- sky/utils/kubernetes/gpu_labeler.py +4 -4
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
- sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
- sky/utils/kubernetes/rsync_helper.sh +24 -0
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
- sky/utils/log_utils.py +240 -33
- sky/utils/message_utils.py +81 -0
- sky/utils/registry.py +127 -0
- sky/utils/resources_utils.py +94 -22
- sky/utils/rich_utils.py +247 -18
- sky/utils/schemas.py +284 -64
- sky/{status_lib.py → utils/status_lib.py} +12 -7
- sky/utils/subprocess_utils.py +212 -46
- sky/utils/timeline.py +12 -7
- sky/utils/ux_utils.py +168 -15
- skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
- skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
- sky/clouds/cloud_registry.py +0 -31
- sky/jobs/core.py +0 -330
- sky/skylet/providers/azure/__init__.py +0 -2
- sky/skylet/providers/azure/azure-vm-template.json +0 -301
- sky/skylet/providers/azure/config.py +0 -170
- sky/skylet/providers/azure/node_provider.py +0 -466
- sky/skylet/providers/lambda_cloud/__init__.py +0 -2
- sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
- sky/skylet/providers/oci/__init__.py +0 -2
- sky/skylet/providers/oci/node_provider.py +0 -488
- sky/skylet/providers/oci/query_helper.py +0 -383
- sky/skylet/providers/oci/utils.py +0 -21
- sky/utils/cluster_yaml_utils.py +0 -24
- sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
- skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
- skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
sky/provision/do/instance.py (new file)
@@ -0,0 +1,287 @@
+"""DigitalOcean instance provisioning."""
+
+import time
+from typing import Any, Dict, List, Optional
+import uuid
+
+from sky import sky_logging
+from sky.provision import common
+from sky.provision.do import constants
+from sky.provision.do import utils
+from sky.utils import status_lib
+
+# The maximum number of times to poll for the status of an operation.
+MAX_POLLS = 60 // constants.POLL_INTERVAL
+# Stopping instances can take several minutes, so we increase the timeout.
+MAX_POLLS_FOR_UP_OR_STOP = MAX_POLLS * 8
+
+logger = sky_logging.init_logger(__name__)
+
+
+def _get_head_instance(
+        instances: Dict[str, Dict[str, Any]]) -> Optional[Dict[str, Any]]:
+    for instance_name, instance_meta in instances.items():
+        if instance_name.endswith('-head'):
+            return instance_meta
+    return None
+
+
+def run_instances(region: str, cluster_name_on_cloud: str,
+                  config: common.ProvisionConfig) -> common.ProvisionRecord:
+    """Runs instances for the given cluster."""
+
+    pending_status = ['new']
+    newly_started_instances = utils.filter_instances(cluster_name_on_cloud,
+                                                     pending_status + ['off'])
+    while True:
+        instances = utils.filter_instances(cluster_name_on_cloud,
+                                           pending_status)
+        if not instances:
+            break
+        instance_statuses = [
+            instance['status'] for instance in instances.values()
+        ]
+        logger.info(f'Waiting for {len(instances)} instances to be ready: '
+                    f'{instance_statuses}')
+        time.sleep(constants.POLL_INTERVAL)
+
+    exist_instances = utils.filter_instances(cluster_name_on_cloud,
+                                             status_filters=pending_status +
+                                             ['active', 'off'])
+    if len(exist_instances) > config.count:
+        raise RuntimeError(
+            f'Cluster {cluster_name_on_cloud} already has '
+            f'{len(exist_instances)} nodes, but {config.count} are required.')
+
+    stopped_instances = utils.filter_instances(cluster_name_on_cloud,
+                                               status_filters=['off'])
+    for instance in stopped_instances.values():
+        utils.start_instance(instance)
+    for _ in range(MAX_POLLS_FOR_UP_OR_STOP):
+        instances = utils.filter_instances(cluster_name_on_cloud, ['off'])
+        if len(instances) == 0:
+            break
+        num_stopped_instances = len(stopped_instances)
+        num_restarted_instances = num_stopped_instances - len(instances)
+        logger.info(
+            f'Waiting for {num_restarted_instances}/{num_stopped_instances} '
+            'stopped instances to be restarted.')
+        time.sleep(constants.POLL_INTERVAL)
+    else:
+        msg = ('run_instances: Failed to restart all '
+               'instances, possibly due to a capacity issue.')
+        logger.warning(msg)
+        raise RuntimeError(msg)
+
+    exist_instances = utils.filter_instances(cluster_name_on_cloud,
+                                             status_filters=['active'])
+    head_instance = _get_head_instance(exist_instances)
+    to_start_count = config.count - len(exist_instances)
+    if to_start_count < 0:
+        raise RuntimeError(
+            f'Cluster {cluster_name_on_cloud} already has '
+            f'{len(exist_instances)} nodes, but {config.count} are required.')
+    if to_start_count == 0:
+        if head_instance is None:
+            head_instance = list(exist_instances.values())[0]
+            utils.rename_instance(
+                head_instance,
+                f'{cluster_name_on_cloud}-{uuid.uuid4().hex[:4]}-head')
+        assert head_instance is not None, '`head_instance` should not be None'
+        logger.info(f'Cluster {cluster_name_on_cloud} already has '
+                    f'{len(exist_instances)} nodes, no need to start more.')
+        return common.ProvisionRecord(
+            provider_name='do',
+            cluster_name=cluster_name_on_cloud,
+            region=region,
+            zone=None,
+            head_instance_id=head_instance['name'],
+            resumed_instance_ids=list(newly_started_instances.keys()),
+            created_instance_ids=[],
+        )
+
+    created_instances: List[Dict[str, Any]] = []
+    for _ in range(to_start_count):
+        instance_type = 'head' if head_instance is None else 'worker'
+        instance = utils.create_instance(
+            region=region,
+            cluster_name_on_cloud=cluster_name_on_cloud,
+            instance_type=instance_type,
+            config=config)
+        logger.info(f'Launched instance {instance["name"]}.')
+        created_instances.append(instance)
+        if head_instance is None:
+            head_instance = instance
+
+    # Wait for instances to be ready.
+    for _ in range(MAX_POLLS_FOR_UP_OR_STOP):
+        instances = utils.filter_instances(cluster_name_on_cloud,
+                                           status_filters=['active'])
+        logger.info('Waiting for instances to be ready: '
+                    f'({len(instances)}/{config.count}).')
+        if len(instances) == config.count:
+            break
+
+        time.sleep(constants.POLL_INTERVAL)
+    else:
+        # Failed to launch config.count instances after the max retries.
+        msg = 'run_instances: Failed to create the instances'
+        logger.warning(msg)
+        raise RuntimeError(msg)
+    assert head_instance is not None, 'head_instance should not be None'
+    return common.ProvisionRecord(
+        provider_name='do',
+        cluster_name=cluster_name_on_cloud,
+        region=region,
+        zone=None,
+        head_instance_id=head_instance['name'],
+        resumed_instance_ids=list(stopped_instances.keys()),
+        created_instance_ids=[
+            instance['name'] for instance in created_instances
+        ],
+    )
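
Note: the polling loops in this module lean on Python's for/else: the else
branch runs only when the loop exhausts its poll budget without hitting
break, which is exactly the timeout case. A minimal sketch of the idiom,
with a hypothetical check_ready callable standing in for the status queries
above:

    import time

    POLL_INTERVAL = 5  # hypothetical stand-in for constants.POLL_INTERVAL
    MAX_POLLS = 12     # hypothetical poll budget

    def wait_until(check_ready) -> None:
        for _ in range(MAX_POLLS):
            if check_ready():
                break  # success: the else clause below is skipped
            time.sleep(POLL_INTERVAL)
        else:
            # Runs only if the loop finished without `break`, i.e. every
            # poll came back not-ready.
            raise RuntimeError(f'not ready after {MAX_POLLS} polls')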
+
+
+def wait_instances(region: str, cluster_name_on_cloud: str,
+                   state: Optional[status_lib.ClusterStatus]) -> None:
+    del region, cluster_name_on_cloud, state  # unused
+    # We already wait for instances to be ready in `run_instances`.
+
+
+def stop_instances(
+    cluster_name_on_cloud: str,
+    provider_config: Optional[Dict[str, Any]] = None,
+    worker_only: bool = False,
+) -> None:
+    del provider_config  # unused
+    all_instances = utils.filter_instances(cluster_name_on_cloud,
+                                           status_filters=None)
+    num_instances = len(all_instances)
+
+    # Request a stop on all instances.
+    for instance_name, instance_meta in all_instances.items():
+        if worker_only and instance_name.endswith('-head'):
+            num_instances -= 1
+            continue
+        utils.stop_instance(instance_meta)
+
+    # Wait for the instances to stop.
+    for _ in range(MAX_POLLS_FOR_UP_OR_STOP):
+        all_instances = utils.filter_instances(cluster_name_on_cloud, ['off'])
+        if len(all_instances) >= num_instances:
+            break
+        time.sleep(constants.POLL_INTERVAL)
+    else:
+        raise RuntimeError(f'Maximum number of polls: '
+                           f'{MAX_POLLS_FOR_UP_OR_STOP} reached. '
+                           f'Instance {all_instances} is still not in '
+                           'STOPPED status.')
+
+
+def terminate_instances(
+    cluster_name_on_cloud: str,
+    provider_config: Optional[Dict[str, Any]] = None,
+    worker_only: bool = False,
+) -> None:
+    """See sky/provision/__init__.py"""
+    del provider_config  # unused
+    instances = utils.filter_instances(cluster_name_on_cloud,
+                                       status_filters=None)
+    for instance_name, instance_meta in instances.items():
+        logger.debug(f'Terminating instance {instance_name}')
+        if worker_only and instance_name.endswith('-head'):
+            continue
+        utils.down_instance(instance_meta)
+
+    for _ in range(MAX_POLLS_FOR_UP_OR_STOP):
+        instances = utils.filter_instances(cluster_name_on_cloud,
+                                           status_filters=None)
+        if len(instances) == 0 or (len(instances) <= 1 and worker_only):
+            break
+        time.sleep(constants.POLL_INTERVAL)
+    else:
+        msg = 'Failed to delete all instances'
+        logger.warning(msg)
+        raise RuntimeError(msg)
+
+
+def get_cluster_info(
+    region: str,
+    cluster_name_on_cloud: str,
+    provider_config: Optional[Dict[str, Any]] = None,
+) -> common.ClusterInfo:
+    del region  # unused
+    running_instances = utils.filter_instances(cluster_name_on_cloud,
+                                               ['active'])
+    instances: Dict[str, List[common.InstanceInfo]] = {}
+    head_instance: Optional[str] = None
+    for instance_name, instance_meta in running_instances.items():
+        if instance_name.endswith('-head'):
+            head_instance = instance_name
+        for net in instance_meta['networks']['v4']:
+            if net['type'] == 'public':
+                instance_ip = net['ip_address']
+                break
+        instances[instance_name] = [
+            common.InstanceInfo(
+                instance_id=instance_meta['name'],
+                internal_ip=instance_ip,
+                external_ip=instance_ip,
+                ssh_port=22,
+                tags={},
+            )
+        ]
+
+    assert head_instance is not None, 'no head instance found'
+    return common.ClusterInfo(
+        instances=instances,
+        head_instance_id=head_instance,
+        provider_name='do',
+        provider_config=provider_config,
+    )
+
+
+def query_instances(
+    cluster_name_on_cloud: str,
+    provider_config: Optional[Dict[str, Any]] = None,
+    non_terminated_only: bool = True,
+) -> Dict[str, Optional[status_lib.ClusterStatus]]:
+    """See sky/provision/__init__.py"""
+    # Terminated instances are not retrieved by the
+    # API, making the `non_terminated_only` argument moot.
+    del non_terminated_only
+    assert provider_config is not None, (cluster_name_on_cloud, provider_config)
+    instances = utils.filter_instances(cluster_name_on_cloud,
+                                       status_filters=None)
+
+    status_map = {
+        'new': status_lib.ClusterStatus.INIT,
+        'archive': status_lib.ClusterStatus.INIT,
+        'active': status_lib.ClusterStatus.UP,
+        'off': status_lib.ClusterStatus.STOPPED,
+    }
+    statuses: Dict[str, Optional[status_lib.ClusterStatus]] = {}
+    for instance_meta in instances.values():
+        status = status_map[instance_meta['status']]
+        statuses[instance_meta['name']] = status
+    return statuses
+
+
+def open_ports(
+    cluster_name_on_cloud: str,
+    ports: List[str],
+    provider_config: Optional[Dict[str, Any]] = None,
+) -> None:
+    """See sky/provision/__init__.py"""
+    logger.debug(
+        f'Skip opening ports {ports} for DigitalOcean instances, as all '
+        'ports are open by default.')
+    del cluster_name_on_cloud, provider_config, ports
+
+
+def cleanup_ports(
+    cluster_name_on_cloud: str,
+    ports: List[str],
+    provider_config: Optional[Dict[str, Any]] = None,
+) -> None:
+    del cluster_name_on_cloud, provider_config, ports
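
Note: this module implements SkyPilot's function-based provisioner interface
(run_instances, wait_instances, stop_instances, terminate_instances,
get_cluster_info, query_instances, open_ports, cleanup_ports; see the
"See sky/provision/__init__.py" docstrings), so the generic machinery can
drive DigitalOcean without provider-specific branching. A hedged sketch of
name-based dispatch to such a module; the helper below is illustrative, not
SkyPilot's actual dispatcher:

    import importlib

    def _provision_module(provider_name: str):
        # e.g. 'do' -> sky.provision.do.instance
        return importlib.import_module(
            f'sky.provision.{provider_name}.instance')

    # Tear down a hypothetical cluster through the generic interface.
    _provision_module('do').terminate_instances('my-cluster-abcd',
                                                worker_only=False)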
sky/provision/do/utils.py (new file)
@@ -0,0 +1,301 @@
+"""DigitalOcean API client wrapper for SkyPilot.
+
+Example usage of the `pydo` client library was mostly taken from here:
+https://github.com/digitalocean/pydo/blob/main/examples/poc_droplets_volumes_sshkeys.py
+"""
+
+import copy
+import os
+from typing import Any, Dict, List, Optional
+import urllib.parse
+import uuid
+
+from sky import sky_logging
+from sky.adaptors import do
+from sky.provision import common
+from sky.provision import constants as provision_constants
+from sky.provision.do import constants
+from sky.utils import common_utils
+
+logger = sky_logging.init_logger(__name__)
+
+POSSIBLE_CREDENTIALS_PATHS = [
+    os.path.expanduser(
+        '~/Library/Application Support/doctl/config.yaml'),  # OS X
+    os.path.expanduser(
+        os.path.join(os.getenv('XDG_CONFIG_HOME', '~/.config/'),
+                     'doctl/config.yaml')),  # Linux
+]
+INITIAL_BACKOFF_SECONDS = 10
+MAX_BACKOFF_FACTOR = 10
+MAX_ATTEMPTS = 6
+SSH_KEY_NAME_ON_DO = f'sky-key-{common_utils.get_user_hash()}'
+
+CREDENTIALS_PATH = '~/.config/doctl/config.yaml'
+_client = None
+_ssh_key_id = None
+
+
+class DigitalOceanError(Exception):
+    pass
+
+
+def _init_client():
+    global _client, CREDENTIALS_PATH
+    assert _client is None
+    CREDENTIALS_PATH = None
+    credentials_found = 0
+    for path in POSSIBLE_CREDENTIALS_PATHS:
+        if os.path.exists(path):
+            CREDENTIALS_PATH = path
+            credentials_found += 1
+            logger.debug(f'DigitalOcean credential path found at {path}')
+    if credentials_found > 1:
+        logger.debug('more than 1 credential file found')
+    if CREDENTIALS_PATH is None:
+        raise DigitalOceanError(
+            'no credentials file found from '
+            f'the following paths {POSSIBLE_CREDENTIALS_PATHS}')
+
+    # Attempt the default context first.
+    credentials = common_utils.read_yaml(CREDENTIALS_PATH)
+    default_token = credentials.get('access-token', None)
+    if default_token is not None:
+        try:
+            test_client = do.pydo.Client(token=default_token)
+            test_client.droplets.list()
+            logger.debug('trying `default` context')
+            _client = test_client
+            return _client
+        except do.exceptions().HttpResponseError:
+            pass
+
+    auth_contexts = credentials.get('auth-contexts', None)
+    if auth_contexts is not None:
+        for context, api_token in auth_contexts.items():
+            try:
+                test_client = do.pydo.Client(token=api_token)
+                test_client.droplets.list()
+                logger.debug(f'using {context} context')
+                _client = test_client
+                break
+            except do.exceptions().HttpResponseError:
+                continue
+        else:
+            raise DigitalOceanError(
+                'no valid api tokens found; try '
+                'setting a new API token with `doctl auth init`')
+    return _client
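
Note: for reference, `_init_client` consumes doctl's config.yaml as parsed
by `common_utils.read_yaml`; the keys it reads look roughly like the
following Python dict (token values are placeholders, and real doctl
configs carry additional keys):

    credentials = {
        'access-token': 'dop_v1_placeholder_default',
        'auth-contexts': {
            'work': 'dop_v1_placeholder_work',
            'personal': 'dop_v1_placeholder_personal',
        },
    }

Each candidate token is validated by issuing a cheap `droplets.list()` call
before the client is accepted.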
+
+
+def client():
+    global _client
+    if _client is None:
+        _client = _init_client()
+    return _client
+
+
+def ssh_key_id(public_key: str):
+    global _ssh_key_id
+    if _ssh_key_id is None:
+        page = 1
+        paginated = True
+        while paginated:
+            try:
+                resp = client().ssh_keys.list(per_page=50, page=page)
+                for ssh_key in resp['ssh_keys']:
+                    if ssh_key['public_key'] == public_key:
+                        _ssh_key_id = ssh_key
+                        return _ssh_key_id
+            except do.exceptions().HttpResponseError as err:
+                raise DigitalOceanError(
+                    f'Error: {err.status_code} {err.reason}: '
+                    f'{err.error.message}') from err
+
+            pages = resp['links']
+            if 'pages' in pages and 'next' in pages['pages']:
+                pages = pages['pages']
+                parsed_url = urllib.parse.urlparse(pages['next'])
+                page = int(urllib.parse.parse_qs(parsed_url.query)['page'][0])
+            else:
+                paginated = False
+
+        request = {
+            'public_key': public_key,
+            'name': SSH_KEY_NAME_ON_DO,
+        }
+        _ssh_key_id = client().ssh_keys.create(body=request)['ssh_key']
+    return _ssh_key_id
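
Note: the pagination above follows DigitalOcean's list API, which embeds the
next page as a full URL under links.pages.next rather than a bare page
number. A minimal sketch of the extraction, factored into a helper
(`_next_page` is illustrative, not part of the module):

    from typing import Optional
    import urllib.parse

    def _next_page(resp: dict) -> Optional[int]:
        # The next page number is the `page` query parameter of the URL.
        pages = resp.get('links', {}).get('pages', {})
        if 'next' not in pages:
            return None  # last page reached
        parsed = urllib.parse.urlparse(pages['next'])
        return int(urllib.parse.parse_qs(parsed.query)['page'][0])

    # _next_page({'links': {'pages': {
    #     'next': 'https://api.digitalocean.com/v2/account/keys?page=3'}}})
    # returns 3.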
+
+
+def _create_volume(request: Dict[str, Any]) -> Dict[str, Any]:
+    try:
+        resp = client().volumes.create(body=request)
+        volume = resp['volume']
+    except do.exceptions().HttpResponseError as err:
+        raise DigitalOceanError(
+            f'Error: {err.status_code} {err.reason}: {err.error.message}'
+        ) from err
+    else:
+        return volume
+
+
+def _create_droplet(request: Dict[str, Any]) -> Dict[str, Any]:
+    try:
+        resp = client().droplets.create(body=request)
+        droplet_id = resp['droplet']['id']
+
+        get_resp = client().droplets.get(droplet_id)
+        droplet = get_resp['droplet']
+    except do.exceptions().HttpResponseError as err:
+        raise DigitalOceanError(
+            f'Error: {err.status_code} {err.reason}: {err.error.message}'
+        ) from err
+    return droplet
+
+
+def create_instance(region: str, cluster_name_on_cloud: str, instance_type: str,
+                    config: common.ProvisionConfig) -> Dict[str, Any]:
+    """Creates an instance and mounts the requested block storage.
+
+    Args:
+        region (str): instance region
+        instance_type (str): 'head' or 'worker' instance type
+        config (common.ProvisionConfig): provisioner configuration
+
+    Returns:
+        Dict[str, Any]: instance metadata
+    """
+    # Sort tags by key to support deterministic unit test stubbing.
+    tags = dict(sorted(copy.deepcopy(config.tags).items()))
+    tags = {
+        'Name': cluster_name_on_cloud,
+        provision_constants.TAG_RAY_CLUSTER_NAME: cluster_name_on_cloud,
+        provision_constants.TAG_SKYPILOT_CLUSTER_NAME: cluster_name_on_cloud,
+        **tags
+    }
+    tags = [f'{key}:{value}' for key, value in tags.items()]
+    default_image = constants.GPU_IMAGES.get(
+        config.node_config['InstanceType'],
+        'gpu-h100x1-base',
+    )
+    image_id = config.node_config['ImageId']
+    image_id = image_id if image_id is not None else default_image
+    instance_name = (f'{cluster_name_on_cloud}-'
+                     f'{uuid.uuid4().hex[:4]}-{instance_type}')
+    instance_request = {
+        'name': instance_name,
+        'region': region,
+        'size': config.node_config['InstanceType'],
+        'image': image_id,
+        'ssh_keys': [
+            ssh_key_id(
+                config.authentication_config['ssh_public_key'])['fingerprint']
+        ],
+        'tags': tags,
+    }
+    instance = _create_droplet(instance_request)
+
+    volume_request = {
+        'size_gigabytes': config.node_config['DiskSize'],
+        'name': instance_name,
+        'region': region,
+        'filesystem_type': 'ext4',
+        'tags': tags
+    }
+    volume = _create_volume(volume_request)
+
+    attach_request = {'type': 'attach', 'droplet_id': instance['id']}
+    try:
+        client().volume_actions.post_by_id(volume['id'], attach_request)
+    except do.exceptions().HttpResponseError as err:
+        raise DigitalOceanError(
+            f'Error: {err.status_code} {err.reason}: {err.error.message}'
+        ) from err
+    logger.debug(f'{instance_name} created')
+    return instance
+
+
+def start_instance(instance: Dict[str, Any]):
+    try:
+        client().droplet_actions.post(droplet_id=instance['id'],
+                                      body={'type': 'power_on'})
+    except do.exceptions().HttpResponseError as err:
+        raise DigitalOceanError(
+            f'Error: {err.status_code} {err.reason}: {err.error.message}'
+        ) from err
+
+
+def stop_instance(instance: Dict[str, Any]):
+    try:
+        client().droplet_actions.post(
+            droplet_id=instance['id'],
+            body={'type': 'shutdown'},
+        )
+    except do.exceptions().HttpResponseError as err:
+        raise DigitalOceanError(
+            f'Error: {err.status_code} {err.reason}: {err.error.message}'
+        ) from err
+
+
+def down_instance(instance: Dict[str, Any]):
+    # We use the dangerous destroy endpoint to atomically delete the
+    # instance and its block storage on autodown.
+    try:
+        client().droplets.destroy_with_associated_resources_dangerous(
+            droplet_id=instance['id'], x_dangerous=True)
+    except do.exceptions().HttpResponseError as err:
+        if 'a destroy is already in progress' in err.error.message:
+            return
+        raise DigitalOceanError(
+            f'Error: {err.status_code} {err.reason}: {err.error.message}'
+        ) from err
+
+
+def rename_instance(instance: Dict[str, Any], new_name: str):
+    try:
+        client().droplet_actions.rename(droplet=instance['id'],
+                                        body={
+                                            'type': 'rename',
+                                            'name': new_name
+                                        })
+    except do.exceptions().HttpResponseError as err:
+        raise DigitalOceanError(
+            f'Error: {err.status_code} {err.reason}: {err.error.message}'
+        ) from err
+
+
+def filter_instances(
+        cluster_name_on_cloud: str,
+        status_filters: Optional[List[str]] = None) -> Dict[str, Any]:
+    """Returns a dict mapping instance name
+    to instance metadata, filtered by status.
+    """
+
+    filtered_instances: Dict[str, Any] = {}
+    page = 1
+    paginated = True
+    while paginated:
+        try:
+            resp = client().droplets.list(
+                tag_name=f'{provision_constants.TAG_SKYPILOT_CLUSTER_NAME}:'
+                f'{cluster_name_on_cloud}',
+                per_page=50,
+                page=page)
+            for instance in resp['droplets']:
+                if (status_filters is None or
+                        instance['status'] in status_filters):
+                    filtered_instances[instance['name']] = instance
+        except do.exceptions().HttpResponseError as err:
+            raise DigitalOceanError(
+                f'Error: {err.status_code} {err.reason}: {err.error.message}'
+            ) from err
+
+        pages = resp['links']
+        if 'pages' in pages and 'next' in pages['pages']:
+            pages = pages['pages']
+            parsed_url = urllib.parse.urlparse(pages['next'])
+            page = int(urllib.parse.parse_qs(parsed_url.query)['page'][0])
+        else:
+            paginated = False
+    return filtered_instances
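
Note: every wrapper in this module repeats the same try/except that rewraps
pydo's HttpResponseError into DigitalOceanError. A hedged sketch of how the
repetition could be factored into a decorator, reusing the module's `do`
adaptor, `client()` and `DigitalOceanError` from above (the decorator itself
is illustrative, not part of the shipped code):

    import functools

    def _wrap_do_errors(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            try:
                return func(*args, **kwargs)
            except do.exceptions().HttpResponseError as err:
                raise DigitalOceanError(
                    f'Error: {err.status_code} {err.reason}: '
                    f'{err.error.message}') from err
        return wrapper

    @_wrap_do_errors
    def start_instance(instance):
        client().droplet_actions.post(droplet_id=instance['id'],
                                      body={'type': 'power_on'})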