skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +64 -32
- sky/adaptors/aws.py +23 -6
- sky/adaptors/azure.py +432 -15
- sky/adaptors/cloudflare.py +5 -5
- sky/adaptors/common.py +19 -9
- sky/adaptors/do.py +20 -0
- sky/adaptors/gcp.py +3 -2
- sky/adaptors/kubernetes.py +122 -88
- sky/adaptors/nebius.py +100 -0
- sky/adaptors/oci.py +39 -1
- sky/adaptors/vast.py +29 -0
- sky/admin_policy.py +101 -0
- sky/authentication.py +117 -98
- sky/backends/backend.py +52 -20
- sky/backends/backend_utils.py +669 -557
- sky/backends/cloud_vm_ray_backend.py +1099 -808
- sky/backends/local_docker_backend.py +14 -8
- sky/backends/wheel_utils.py +38 -20
- sky/benchmark/benchmark_utils.py +22 -23
- sky/check.py +76 -27
- sky/cli.py +1586 -1139
- sky/client/__init__.py +1 -0
- sky/client/cli.py +5683 -0
- sky/client/common.py +345 -0
- sky/client/sdk.py +1765 -0
- sky/cloud_stores.py +283 -19
- sky/clouds/__init__.py +7 -2
- sky/clouds/aws.py +303 -112
- sky/clouds/azure.py +185 -179
- sky/clouds/cloud.py +115 -37
- sky/clouds/cudo.py +29 -22
- sky/clouds/do.py +313 -0
- sky/clouds/fluidstack.py +44 -54
- sky/clouds/gcp.py +206 -65
- sky/clouds/ibm.py +26 -21
- sky/clouds/kubernetes.py +345 -91
- sky/clouds/lambda_cloud.py +40 -29
- sky/clouds/nebius.py +297 -0
- sky/clouds/oci.py +129 -90
- sky/clouds/paperspace.py +22 -18
- sky/clouds/runpod.py +53 -34
- sky/clouds/scp.py +28 -24
- sky/clouds/service_catalog/__init__.py +19 -13
- sky/clouds/service_catalog/aws_catalog.py +29 -12
- sky/clouds/service_catalog/azure_catalog.py +33 -6
- sky/clouds/service_catalog/common.py +95 -75
- sky/clouds/service_catalog/constants.py +3 -3
- sky/clouds/service_catalog/cudo_catalog.py +13 -3
- sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
- sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
- sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
- sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
- sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
- sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
- sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
- sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
- sky/clouds/service_catalog/do_catalog.py +111 -0
- sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
- sky/clouds/service_catalog/gcp_catalog.py +16 -2
- sky/clouds/service_catalog/ibm_catalog.py +2 -2
- sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
- sky/clouds/service_catalog/lambda_catalog.py +8 -3
- sky/clouds/service_catalog/nebius_catalog.py +116 -0
- sky/clouds/service_catalog/oci_catalog.py +31 -4
- sky/clouds/service_catalog/paperspace_catalog.py +2 -2
- sky/clouds/service_catalog/runpod_catalog.py +2 -2
- sky/clouds/service_catalog/scp_catalog.py +2 -2
- sky/clouds/service_catalog/vast_catalog.py +104 -0
- sky/clouds/service_catalog/vsphere_catalog.py +2 -2
- sky/clouds/utils/aws_utils.py +65 -0
- sky/clouds/utils/azure_utils.py +91 -0
- sky/clouds/utils/gcp_utils.py +5 -9
- sky/clouds/utils/oci_utils.py +47 -5
- sky/clouds/utils/scp_utils.py +4 -3
- sky/clouds/vast.py +280 -0
- sky/clouds/vsphere.py +22 -18
- sky/core.py +361 -107
- sky/dag.py +41 -28
- sky/data/data_transfer.py +37 -0
- sky/data/data_utils.py +211 -32
- sky/data/mounting_utils.py +182 -30
- sky/data/storage.py +2118 -270
- sky/data/storage_utils.py +126 -5
- sky/exceptions.py +179 -8
- sky/execution.py +158 -85
- sky/global_user_state.py +150 -34
- sky/jobs/__init__.py +12 -10
- sky/jobs/client/__init__.py +0 -0
- sky/jobs/client/sdk.py +302 -0
- sky/jobs/constants.py +49 -11
- sky/jobs/controller.py +161 -99
- sky/jobs/dashboard/dashboard.py +171 -25
- sky/jobs/dashboard/templates/index.html +572 -60
- sky/jobs/recovery_strategy.py +157 -156
- sky/jobs/scheduler.py +307 -0
- sky/jobs/server/__init__.py +1 -0
- sky/jobs/server/core.py +598 -0
- sky/jobs/server/dashboard_utils.py +69 -0
- sky/jobs/server/server.py +190 -0
- sky/jobs/state.py +627 -122
- sky/jobs/utils.py +615 -206
- sky/models.py +27 -0
- sky/optimizer.py +142 -83
- sky/provision/__init__.py +20 -5
- sky/provision/aws/config.py +124 -42
- sky/provision/aws/instance.py +130 -53
- sky/provision/azure/__init__.py +7 -0
- sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
- sky/provision/azure/config.py +220 -0
- sky/provision/azure/instance.py +1012 -37
- sky/provision/common.py +31 -3
- sky/provision/constants.py +25 -0
- sky/provision/cudo/__init__.py +2 -1
- sky/provision/cudo/cudo_utils.py +112 -0
- sky/provision/cudo/cudo_wrapper.py +37 -16
- sky/provision/cudo/instance.py +28 -12
- sky/provision/do/__init__.py +11 -0
- sky/provision/do/config.py +14 -0
- sky/provision/do/constants.py +10 -0
- sky/provision/do/instance.py +287 -0
- sky/provision/do/utils.py +301 -0
- sky/provision/docker_utils.py +82 -46
- sky/provision/fluidstack/fluidstack_utils.py +57 -125
- sky/provision/fluidstack/instance.py +15 -43
- sky/provision/gcp/config.py +19 -9
- sky/provision/gcp/constants.py +7 -1
- sky/provision/gcp/instance.py +55 -34
- sky/provision/gcp/instance_utils.py +339 -80
- sky/provision/gcp/mig_utils.py +210 -0
- sky/provision/instance_setup.py +172 -133
- sky/provision/kubernetes/__init__.py +1 -0
- sky/provision/kubernetes/config.py +104 -90
- sky/provision/kubernetes/constants.py +8 -0
- sky/provision/kubernetes/instance.py +680 -325
- sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
- sky/provision/kubernetes/network.py +54 -20
- sky/provision/kubernetes/network_utils.py +70 -21
- sky/provision/kubernetes/utils.py +1370 -251
- sky/provision/lambda_cloud/__init__.py +11 -0
- sky/provision/lambda_cloud/config.py +10 -0
- sky/provision/lambda_cloud/instance.py +265 -0
- sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
- sky/provision/logging.py +1 -1
- sky/provision/nebius/__init__.py +11 -0
- sky/provision/nebius/config.py +11 -0
- sky/provision/nebius/instance.py +285 -0
- sky/provision/nebius/utils.py +318 -0
- sky/provision/oci/__init__.py +15 -0
- sky/provision/oci/config.py +51 -0
- sky/provision/oci/instance.py +436 -0
- sky/provision/oci/query_utils.py +681 -0
- sky/provision/paperspace/constants.py +6 -0
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/paperspace/utils.py +2 -0
- sky/provision/provisioner.py +207 -130
- sky/provision/runpod/__init__.py +1 -0
- sky/provision/runpod/api/__init__.py +3 -0
- sky/provision/runpod/api/commands.py +119 -0
- sky/provision/runpod/api/pods.py +142 -0
- sky/provision/runpod/instance.py +64 -8
- sky/provision/runpod/utils.py +239 -23
- sky/provision/vast/__init__.py +10 -0
- sky/provision/vast/config.py +11 -0
- sky/provision/vast/instance.py +247 -0
- sky/provision/vast/utils.py +162 -0
- sky/provision/vsphere/common/vim_utils.py +1 -1
- sky/provision/vsphere/instance.py +8 -18
- sky/provision/vsphere/vsphere_utils.py +1 -1
- sky/resources.py +247 -102
- sky/serve/__init__.py +9 -9
- sky/serve/autoscalers.py +361 -299
- sky/serve/client/__init__.py +0 -0
- sky/serve/client/sdk.py +366 -0
- sky/serve/constants.py +12 -3
- sky/serve/controller.py +106 -36
- sky/serve/load_balancer.py +63 -12
- sky/serve/load_balancing_policies.py +84 -2
- sky/serve/replica_managers.py +42 -34
- sky/serve/serve_state.py +62 -32
- sky/serve/serve_utils.py +271 -160
- sky/serve/server/__init__.py +0 -0
- sky/serve/{core.py → server/core.py} +271 -90
- sky/serve/server/server.py +112 -0
- sky/serve/service.py +52 -16
- sky/serve/service_spec.py +95 -32
- sky/server/__init__.py +1 -0
- sky/server/common.py +430 -0
- sky/server/constants.py +21 -0
- sky/server/html/log.html +174 -0
- sky/server/requests/__init__.py +0 -0
- sky/server/requests/executor.py +472 -0
- sky/server/requests/payloads.py +487 -0
- sky/server/requests/queues/__init__.py +0 -0
- sky/server/requests/queues/mp_queue.py +76 -0
- sky/server/requests/requests.py +567 -0
- sky/server/requests/serializers/__init__.py +0 -0
- sky/server/requests/serializers/decoders.py +192 -0
- sky/server/requests/serializers/encoders.py +166 -0
- sky/server/server.py +1106 -0
- sky/server/stream_utils.py +141 -0
- sky/setup_files/MANIFEST.in +2 -5
- sky/setup_files/dependencies.py +159 -0
- sky/setup_files/setup.py +14 -125
- sky/sky_logging.py +59 -14
- sky/skylet/autostop_lib.py +2 -2
- sky/skylet/constants.py +183 -50
- sky/skylet/events.py +22 -10
- sky/skylet/job_lib.py +403 -258
- sky/skylet/log_lib.py +111 -71
- sky/skylet/log_lib.pyi +6 -0
- sky/skylet/providers/command_runner.py +6 -8
- sky/skylet/providers/ibm/node_provider.py +2 -2
- sky/skylet/providers/scp/config.py +11 -3
- sky/skylet/providers/scp/node_provider.py +8 -8
- sky/skylet/skylet.py +3 -1
- sky/skylet/subprocess_daemon.py +69 -17
- sky/skypilot_config.py +119 -57
- sky/task.py +205 -64
- sky/templates/aws-ray.yml.j2 +37 -7
- sky/templates/azure-ray.yml.j2 +27 -82
- sky/templates/cudo-ray.yml.j2 +7 -3
- sky/templates/do-ray.yml.j2 +98 -0
- sky/templates/fluidstack-ray.yml.j2 +7 -4
- sky/templates/gcp-ray.yml.j2 +26 -6
- sky/templates/ibm-ray.yml.j2 +3 -2
- sky/templates/jobs-controller.yaml.j2 +46 -11
- sky/templates/kubernetes-ingress.yml.j2 +7 -0
- sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
- sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
- sky/templates/kubernetes-ray.yml.j2 +292 -25
- sky/templates/lambda-ray.yml.j2 +30 -40
- sky/templates/nebius-ray.yml.j2 +79 -0
- sky/templates/oci-ray.yml.j2 +18 -57
- sky/templates/paperspace-ray.yml.j2 +10 -6
- sky/templates/runpod-ray.yml.j2 +26 -4
- sky/templates/scp-ray.yml.j2 +3 -2
- sky/templates/sky-serve-controller.yaml.j2 +12 -1
- sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
- sky/templates/vast-ray.yml.j2 +70 -0
- sky/templates/vsphere-ray.yml.j2 +8 -3
- sky/templates/websocket_proxy.py +64 -0
- sky/usage/constants.py +10 -1
- sky/usage/usage_lib.py +130 -37
- sky/utils/accelerator_registry.py +35 -51
- sky/utils/admin_policy_utils.py +147 -0
- sky/utils/annotations.py +51 -0
- sky/utils/cli_utils/status_utils.py +81 -23
- sky/utils/cluster_utils.py +356 -0
- sky/utils/command_runner.py +452 -89
- sky/utils/command_runner.pyi +77 -3
- sky/utils/common.py +54 -0
- sky/utils/common_utils.py +319 -108
- sky/utils/config_utils.py +204 -0
- sky/utils/control_master_utils.py +48 -0
- sky/utils/controller_utils.py +548 -266
- sky/utils/dag_utils.py +93 -32
- sky/utils/db_utils.py +18 -4
- sky/utils/env_options.py +29 -7
- sky/utils/kubernetes/create_cluster.sh +8 -60
- sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
- sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
- sky/utils/kubernetes/gpu_labeler.py +4 -4
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
- sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
- sky/utils/kubernetes/rsync_helper.sh +24 -0
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
- sky/utils/log_utils.py +240 -33
- sky/utils/message_utils.py +81 -0
- sky/utils/registry.py +127 -0
- sky/utils/resources_utils.py +94 -22
- sky/utils/rich_utils.py +247 -18
- sky/utils/schemas.py +284 -64
- sky/{status_lib.py → utils/status_lib.py} +12 -7
- sky/utils/subprocess_utils.py +212 -46
- sky/utils/timeline.py +12 -7
- sky/utils/ux_utils.py +168 -15
- skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
- skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
- sky/clouds/cloud_registry.py +0 -31
- sky/jobs/core.py +0 -330
- sky/skylet/providers/azure/__init__.py +0 -2
- sky/skylet/providers/azure/azure-vm-template.json +0 -301
- sky/skylet/providers/azure/config.py +0 -170
- sky/skylet/providers/azure/node_provider.py +0 -466
- sky/skylet/providers/lambda_cloud/__init__.py +0 -2
- sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
- sky/skylet/providers/oci/__init__.py +0 -2
- sky/skylet/providers/oci/node_provider.py +0 -488
- sky/skylet/providers/oci/query_helper.py +0 -383
- sky/skylet/providers/oci/utils.py +0 -21
- sky/utils/cluster_yaml_utils.py +0 -24
- sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
- skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
- skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
@@ -4,17 +4,17 @@ import time
 from typing import Any, Dict, List, Optional
 
 from sky import sky_logging
-from sky import status_lib
 from sky.provision import common
 from sky.provision.paperspace import utils
 from sky.utils import common_utils
+from sky.utils import status_lib
 from sky.utils import ux_utils
 
 # The maximum number of times to poll for the status of an operation.
 POLL_INTERVAL = 5
 MAX_POLLS = 60 // POLL_INTERVAL
 # Stopping instances can take several minutes, so we increase the timeout
-MAX_POLLS_FOR_UP_OR_STOP = MAX_POLLS *
+MAX_POLLS_FOR_UP_OR_STOP = MAX_POLLS * 16
 
 logger = sky_logging.init_logger(__name__)
 
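For scale, the constants above work out as follows: POLL_INTERVAL = 5 gives MAX_POLLS = 60 // 5 = 12 (about one minute of polling at five-second intervals), so the bumped MAX_POLLS_FOR_UP_OR_STOP = 12 * 16 = 192 polls gives stop and boot operations roughly 16 minutes to settle.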
@@ -286,12 +286,13 @@ def query_instances(
     assert provider_config is not None, (cluster_name_on_cloud, provider_config)
     instances = _filter_instances(cluster_name_on_cloud, None)
 
+    # https://docs.digitalocean.com/reference/paperspace/core/commands/machines/#show
     status_map = {
         'starting': status_lib.ClusterStatus.INIT,
         'restarting': status_lib.ClusterStatus.INIT,
         'upgrading': status_lib.ClusterStatus.INIT,
         'provisioning': status_lib.ClusterStatus.INIT,
-        'stopping': status_lib.ClusterStatus.
+        'stopping': status_lib.ClusterStatus.STOPPED,
         'serviceready': status_lib.ClusterStatus.INIT,
         'ready': status_lib.ClusterStatus.UP,
         'off': status_lib.ClusterStatus.STOPPED,
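For context, a minimal self-contained sketch of how such a state map is typically consumed when translating provider machine states. Everything here is illustrative: ClusterStatus is a stand-in enum for sky.utils.status_lib.ClusterStatus, STATUS_MAP is a subset of the map in the diff, and map_statuses is a hypothetical helper, not SkyPilot's actual query_instances.

from enum import Enum
from typing import Dict, List, Optional, Tuple


class ClusterStatus(Enum):
    # Stand-in for sky.utils.status_lib.ClusterStatus.
    INIT = 'INIT'
    UP = 'UP'
    STOPPED = 'STOPPED'


STATUS_MAP: Dict[str, Optional[ClusterStatus]] = {
    'starting': ClusterStatus.INIT,
    'stopping': ClusterStatus.STOPPED,
    'ready': ClusterStatus.UP,
    'off': ClusterStatus.STOPPED,
}


def map_statuses(
        machines: List[Tuple[str, str]]) -> Dict[str, Optional[ClusterStatus]]:
    """Translate raw provider machine states into cluster statuses."""
    statuses = {}
    for name, state in machines:
        if state not in STATUS_MAP:
            # Surface unknown provider states instead of guessing.
            raise RuntimeError(f'Unknown machine state: {state}')
        statuses[name] = STATUS_MAP[state]
    return statuses


print(map_statuses([('head', 'ready'), ('worker-1', 'stopping')]))
# {'head': <ClusterStatus.UP: 'UP'>, 'worker-1': <ClusterStatus.STOPPED: 'STOPPED'>}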
sky/provision/paperspace/utils.py
CHANGED
@@ -132,6 +132,8 @@ class PaperspaceCloudClient:
             'apt-get update \n'
             'apt-get install -y docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin \n'  # pylint: disable=line-too-long
             'fi \n'
+            # TODO(tian): Maybe remove this as well since we are now adding
+            # users to docker group in the DockerInitializer. Need to test.
             'usermod -aG docker paperspace \n'
             f'echo "{public_key}" >> /home/paperspace/.ssh/authorized_keys \n')
         try:
sky/provision/provisioner.py
CHANGED
@@ -14,9 +14,9 @@ import colorama
 
 import sky
 from sky import clouds
+from sky import exceptions
 from sky import provision
 from sky import sky_logging
-from sky import status_lib
 from sky.adaptors import aws
 from sky.backends import backend_utils
 from sky.provision import common as provision_common
@@ -25,7 +25,12 @@ from sky.provision import logging as provision_logging
 from sky.provision import metadata_utils
 from sky.skylet import constants
 from sky.utils import common_utils
+from sky.utils import message_utils
+from sky.utils import resources_utils
 from sky.utils import rich_utils
+from sky.utils import status_lib
+from sky.utils import subprocess_utils
+from sky.utils import timeline
 from sky.utils import ux_utils
 
 # Do not use __name__ as we do not want to propagate logs to sky.provision,
@@ -38,91 +43,53 @@ _MAX_RETRY = 3
 _TITLE = '\n\n' + '=' * 20 + ' {} ' + '=' * 20 + '\n'
 
 
-@dataclasses.dataclass
-class ClusterName:
-    display_name: str
-    name_on_cloud: str
-
-    def __repr__(self) -> str:
-        return repr(self.display_name)
-
-    def __str__(self) -> str:
-        return self.display_name
-
-
 def _bulk_provision(
     cloud: clouds.Cloud,
     region: clouds.Region,
-
-    cluster_name: ClusterName,
+    cluster_name: resources_utils.ClusterName,
     bootstrap_config: provision_common.ProvisionConfig,
 ) -> provision_common.ProvisionRecord:
     provider_name = repr(cloud)
     region_name = region.name
 
-    style = colorama.Style
-
-    if not zones:
-        # For Azure, zones is always an empty list.
-        zone_str = 'all zones'
-    else:
-        zone_str = ','.join(z.name for z in zones)
-
-    if isinstance(cloud, clouds.Kubernetes):
-        # Omit the region name for Kubernetes.
-        logger.info(f'{style.BRIGHT}Launching on {cloud}{style.RESET_ALL} '
-                    f'{cluster_name!r}.')
-    else:
-        logger.info(f'{style.BRIGHT}Launching on {cloud} '
-                    f'{region_name}{style.RESET_ALL} ({zone_str})')
-
     start = time.time()
-
+    # TODO(suquark): Should we cache the bootstrapped result?
+    # Currently it is not necessary as bootstrapping takes
+    # only ~3s, caching it seems over-engineering and could
+    # cause other issues like the cache is not synced
+    # with the cloud configuration.
+    config = provision.bootstrap_instances(provider_name, region_name,
+                                           cluster_name.name_on_cloud,
+                                           bootstrap_config)
+
+    provision_record = provision.run_instances(provider_name,
+                                               region_name,
+                                               cluster_name.name_on_cloud,
+                                               config=config)
+
+    backoff = common_utils.Backoff(initial_backoff=1, max_backoff_factor=3)
+    logger.debug(f'\nWaiting for instances of {cluster_name!r} to be ready...')
+    rich_utils.force_update_status(
+        ux_utils.spinner_message('Launching - Checking instance status',
+                                 str(provision_logging.config.log_path)))
+    # AWS would take a very short time (<<1s) updating the state of the
+    # instance.
+    time.sleep(1)
+    for retry_cnt in range(_MAX_RETRY):
         try:
-
-
-
-
-
-
-
-
-
-
-
-
-
-                         f'{common_utils.format_exception(e)}')
-            raise
-
-        provision_record = provision.run_instances(provider_name,
-                                                   region_name,
-                                                   cluster_name.name_on_cloud,
-                                                   config=config)
-
-        backoff = common_utils.Backoff(initial_backoff=1, max_backoff_factor=3)
-        logger.debug(
-            f'\nWaiting for instances of {cluster_name!r} to be ready...')
-        status.update('[bold cyan]Launching - Checking instance status[/]')
-        # AWS would take a very short time (<<1s) updating the state of the
-        # instance.
-        time.sleep(1)
-        for retry_cnt in range(_MAX_RETRY):
-            try:
-                provision.wait_instances(provider_name,
-                                         region_name,
-                                         cluster_name.name_on_cloud,
-                                         state=status_lib.ClusterStatus.UP)
-                break
-            except (aws.botocore_exceptions().WaiterError, RuntimeError):
-                time.sleep(backoff.current_backoff())
-        else:
-            raise RuntimeError(
-                f'Failed to wait for instances of {cluster_name!r} to be '
-                f'ready on the cloud provider after max retries {_MAX_RETRY}.')
-        logger.debug(
-            f'Instances of {cluster_name!r} are ready after {retry_cnt} '
-            'retries.')
+            provision.wait_instances(provider_name,
+                                     region_name,
+                                     cluster_name.name_on_cloud,
+                                     state=status_lib.ClusterStatus.UP)
+            break
+        except (aws.botocore_exceptions().WaiterError, RuntimeError):
+            time.sleep(backoff.current_backoff())
+    else:
+        raise RuntimeError(
+            f'Failed to wait for instances of {cluster_name!r} to be '
+            f'ready on the cloud provider after max retries {_MAX_RETRY}.')
+    logger.debug(f'Instances of {cluster_name!r} are ready after {retry_cnt} '
+                 'retries.')
 
     logger.debug(
         f'\nProvisioning {cluster_name!r} took {time.time() - start:.2f} '
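The hunk above centers on Python's for/else retry idiom: the else clause fires only if the loop exhausts all retries without hitting break. A self-contained sketch of the same pattern, where Backoff and wait_instances are simplified stand-ins for SkyPilot's common_utils.Backoff and provision.wait_instances:

import time


class Backoff:
    """Simplified stand-in for sky.utils.common_utils.Backoff."""

    def __init__(self, initial_backoff: float = 1.0,
                 max_backoff_factor: int = 3):
        self._backoff = initial_backoff
        self._cap = initial_backoff * max_backoff_factor

    def current_backoff(self) -> float:
        # Exponential growth, capped; jitter omitted for brevity.
        backoff = self._backoff
        self._backoff = min(self._backoff * 2, self._cap)
        return backoff


_MAX_RETRY = 3
_attempts = {'n': 0}


def wait_instances() -> None:
    """Hypothetical waiter: fails twice, then succeeds."""
    _attempts['n'] += 1
    if _attempts['n'] < 3:
        raise RuntimeError('instances not ready yet')


backoff = Backoff(initial_backoff=1, max_backoff_factor=3)
for retry_cnt in range(_MAX_RETRY):
    try:
        wait_instances()
        break  # success: skips the else clause below
    except RuntimeError:
        time.sleep(backoff.current_backoff())
else:
    # Runs only if the loop exhausted all retries without `break`.
    raise RuntimeError(f'Not ready after max retries {_MAX_RETRY}.')
print(f'Ready after {retry_cnt} retries.')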
@@ -135,11 +102,12 @@ def bulk_provision(
     cloud: clouds.Cloud,
     region: clouds.Region,
     zones: Optional[List[clouds.Zone]],
-    cluster_name: ClusterName,
+    cluster_name: resources_utils.ClusterName,
     num_nodes: int,
     cluster_yaml: str,
     prev_cluster_ever_up: bool,
     log_dir: str,
+    ports_to_open_on_launch: Optional[List[int]] = None,
 ) -> provision_common.ProvisionRecord:
     """Provisions a cluster and wait until fully provisioned.
 
@@ -161,7 +129,8 @@ def bulk_provision(
         ['node_config'],
         count=num_nodes,
         tags={},
-        resume_stopped_nodes=True
+        resume_stopped_nodes=True,
+        ports_to_open_on_launch=ports_to_open_on_launch)
 
     with provision_logging.setup_provision_logging(log_dir):
         try:
@@ -171,8 +140,11 @@ def bulk_provision(
             logger.debug(
                 'Provision config:\n'
                 f'{json.dumps(dataclasses.asdict(bootstrap_config), indent=2)}')
-            return _bulk_provision(cloud, region,
+            return _bulk_provision(cloud, region, cluster_name,
                                    bootstrap_config)
+        except exceptions.NoClusterLaunchedError:
+            # Skip the teardown if the cluster was never launched.
+            raise
         except Exception:  # pylint: disable=broad-except
             zone_str = 'all zones'
             if zones:
@@ -225,7 +197,7 @@ def bulk_provision(
         raise
 
 
-def teardown_cluster(cloud_name: str, cluster_name: ClusterName,
+def teardown_cluster(cloud_name: str, cluster_name: resources_utils.ClusterName,
                      terminate: bool, provider_config: Dict) -> None:
     """Deleting or stopping a cluster.
 
@@ -268,6 +240,8 @@ def _ssh_probe_command(ip: str,
         '-o',
         'IdentitiesOnly=yes',
         '-o',
+        'AddKeysToAgent=yes',
+        '-o',
         'ExitOnForwardFailure=yes',
         '-o',
         'ServerAliveInterval=5',
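These -o flags are standard OpenSSH options passed as ssh -o Key=Value pairs; AddKeysToAgent caches a decrypted key with a running ssh-agent so repeated probes do not re-prompt. A sketch of how such a probe command might be assembled; the host, user, and key path are placeholders, and the real _ssh_probe_command includes more options than shown here:

from typing import List


def ssh_probe_command(ip: str, ssh_user: str, key_path: str) -> List[str]:
    # Sketch only: a non-interactive probe that exits immediately on success.
    return [
        'ssh', '-T',
        '-i', key_path,
        '-o', 'StrictHostKeyChecking=no',
        '-o', 'IdentitiesOnly=yes',
        '-o', 'AddKeysToAgent=yes',  # cache the key with a running ssh-agent
        '-o', 'ExitOnForwardFailure=yes',
        '-o', 'ServerAliveInterval=5',
        f'{ssh_user}@{ip}',
        'exit 0',
    ]

# Example: subprocess.run(ssh_probe_command('203.0.113.7', 'ubuntu',
#                                           '~/.ssh/sky-key'), check=False)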
@@ -371,6 +345,7 @@ def _wait_ssh_connection_indirect(ip: str,
     return True, ''
 
 
+@timeline.event
 def wait_for_ssh(cluster_info: provision_common.ClusterInfo,
                  ssh_credentials: Dict[str, str]):
     """Wait until SSH is ready.
@@ -394,14 +369,13 @@ def wait_for_ssh(cluster_info: provision_common.ClusterInfo,
     # use a queue for SSH querying
     ips = collections.deque(ip_list)
     ssh_ports = collections.deque(port_list)
-
-
-    ssh_port =
-    success
-
-
-
-        if time.time() - start > timeout:
+
+    def _retry_ssh_thread(ip_ssh_port: Tuple[str, int]):
+        ip, ssh_port = ip_ssh_port
+        success = False
+        while not success:
+            success, stderr = waiter(ip, ssh_port, **ssh_credentials)
+            if not success and time.time() - start > timeout:
                 with ux_utils.print_exception_no_traceback():
                     raise RuntimeError(
                         f'Failed to SSH to {ip} after timeout {timeout}s, with '
@@ -409,10 +383,18 @@ def wait_for_ssh(cluster_info: provision_common.ClusterInfo,
             logger.debug('Retrying in 1 second...')
             time.sleep(1)
 
+    # try one node and multiprocess the rest
+    if ips:
+        ip = ips.popleft()
+        ssh_port = ssh_ports.popleft()
+        _retry_ssh_thread((ip, ssh_port))
+    subprocess_utils.run_in_parallel(_retry_ssh_thread,
+                                     list(zip(ips, ssh_ports)))
+
 
 def _post_provision_setup(
-        cloud_name: str, cluster_name: ClusterName,
-        provision_record: provision_common.ProvisionRecord,
+        cloud_name: str, cluster_name: resources_utils.ClusterName,
+        cluster_yaml: str, provision_record: provision_common.ProvisionRecord,
         custom_resource: Optional[str]) -> provision_common.ClusterInfo:
     config_from_yaml = common_utils.read_yaml(cluster_yaml)
     provider_config = config_from_yaml.get('provider')
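The "try one node and multiprocess the rest" comment above describes a common fan-out pattern: probe the first node serially so that systematic failures (bad key, wrong user) surface once and fast, then check the remaining nodes concurrently. A self-contained sketch, with a thread pool standing in for SkyPilot's subprocess_utils.run_in_parallel and a hypothetical probe function in place of the real SSH waiter:

import collections
from concurrent.futures import ThreadPoolExecutor
from typing import Tuple


def probe(ip_ssh_port: Tuple[str, int]) -> None:
    # Hypothetical stand-in for the per-node SSH waiter.
    ip, port = ip_ssh_port
    print(f'probing {ip}:{port}')


ips = collections.deque(['10.0.0.1', '10.0.0.2', '10.0.0.3'])
ssh_ports = collections.deque([22, 22, 22])

if ips:
    # First node serially: fails fast instead of spawning N workers
    # that would all fail the same way.
    probe((ips.popleft(), ssh_ports.popleft()))
with ThreadPoolExecutor() as pool:
    # Remaining nodes in parallel.
    list(pool.map(probe, zip(ips, ssh_ports)))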
@@ -434,35 +416,53 @@ def _post_provision_setup(
         f'{json.dumps(dataclasses.asdict(provision_record), indent=2)}\n'
         'Cluster info:\n'
         f'{json.dumps(dataclasses.asdict(cluster_info), indent=2)}')
-
     head_instance = cluster_info.get_head_instance()
     if head_instance is None:
-
-
-
-
+        e = RuntimeError(f'Provision failed for cluster {cluster_name!r}. '
+                         'Could not find any head instance. To fix: refresh '
+                         f'status with: sky status -r; and retry provisioning.')
+        setattr(e, 'detailed_reason', str(cluster_info))
+        raise e
 
     # TODO(suquark): Move wheel build here in future PRs.
     # We don't set docker_user here, as we are configuring the VM itself.
     ssh_credentials = backend_utils.ssh_credential_from_yaml(
         cluster_yaml, ssh_user=cluster_info.ssh_user)
+    docker_config = config_from_yaml.get('docker', {})
 
     with rich_utils.safe_status(
-
-
-
-
-
-
+            ux_utils.spinner_message(
+                'Launching - Waiting for SSH access',
+                provision_logging.config.log_path)) as status:
+        # If on Kubernetes, skip SSH check since the pods are guaranteed to be
+        # ready by the provisioner, and we use kubectl instead of SSH to run the
+        # commands and rsync on the pods. SSH will still be ready after a while
+        # for the users to SSH into the pod.
+        if cloud_name.lower() != 'kubernetes':
+            logger.debug(
+                f'\nWaiting for SSH to be available for {cluster_name!r} ...')
+            wait_for_ssh(cluster_info, ssh_credentials)
+            logger.debug(f'SSH Connection ready for {cluster_name!r}')
+        vm_str = 'Instance' if cloud_name.lower() != 'kubernetes' else 'Pod'
         plural = '' if len(cluster_info.instances) == 1 else 's'
-
-
-
-
-
+        verb = 'is' if len(cluster_info.instances) == 1 else 'are'
+        indent_str = (ux_utils.INDENT_SYMBOL
+                      if docker_config else ux_utils.INDENT_LAST_SYMBOL)
+        logger.info(f'{indent_str}{colorama.Style.DIM}{vm_str}{plural} {verb} '
+                    f'up.{colorama.Style.RESET_ALL}')
+
+        # It's promised by the cluster config that docker_config does not
+        # exist for docker-native clouds, i.e. they provide docker containers
+        # instead of full VMs, like Kubernetes and RunPod, as it requires some
+        # special handlings to run docker inside their docker virtualization.
+        # For their Docker image settings, we do them when provisioning the
+        # cluster. See provision/{cloud}/instance.py:get_cluster_info for more
+        # details.
         if docker_config:
             status.update(
-
+                ux_utils.spinner_message(
+                    'Launching - Initializing docker container',
+                    provision_logging.config.log_path))
             docker_user = instance_setup.initialize_docker(
                 cluster_name.name_on_cloud,
                 docker_config=docker_config,
@@ -476,6 +476,8 @@ def _post_provision_setup(
             cluster_info.docker_user = docker_user
             ssh_credentials['docker_user'] = docker_user
             logger.debug(f'Docker user: {docker_user}')
+            logger.info(f'{ux_utils.INDENT_LAST_SYMBOL}{colorama.Style.DIM}'
+                        f'Docker container is up.{colorama.Style.RESET_ALL}')
 
         # We mount the metadata with sky wheel for speedup.
         # NOTE: currently we mount all credentials for all nodes, because
@@ -488,8 +490,9 @@ def _post_provision_setup(
         # for later.
         file_mounts = config_from_yaml.get('file_mounts', {})
 
-        runtime_preparation_str = (
-
+        runtime_preparation_str = (ux_utils.spinner_message(
+            'Preparing SkyPilot runtime ({step}/3 - {step_name})',
+            provision_logging.config.log_path))
         status.update(
             runtime_preparation_str.format(step=1, step_name='initializing'))
         instance_setup.internal_file_mounts(cluster_name.name_on_cloud,
@@ -506,31 +509,94 @@ def _post_provision_setup(
                                             **ssh_credentials)
         head_runner = runners[0]
 
-
-
-
-
-
-
+        def is_ray_cluster_healthy(ray_status_output: str,
+                                   expected_num_nodes: int) -> bool:
+            """Parse the output of `ray status` to get #active nodes.
+
+            The output of `ray status` looks like:
+            Node status
+            ---------------------------------------------------------------
+            Active:
+             1 node_291a8b849439ad6186387c35dc76dc43f9058108f09e8b68108cf9ec
+             1 node_0945fbaaa7f0b15a19d2fd3dc48f3a1e2d7c97e4a50ca965f67acbfd
+            Pending:
+             (no pending nodes)
+            Recent failures:
+             (no failures)
+            """
+            start = ray_status_output.find('Active:')
+            end = ray_status_output.find('Pending:', start)
+            if start == -1 or end == -1:
+                return False
+            num_active_nodes = 0
+            for line in ray_status_output[start:end].split('\n'):
+                if line.strip() and not line.startswith('Active:'):
+                    num_active_nodes += 1
+            return num_active_nodes == expected_num_nodes
+
+        def check_ray_port_and_cluster_healthy() -> Tuple[int, bool, bool]:
+            head_ray_needs_restart = True
+            ray_cluster_healthy = False
+            ray_port = constants.SKY_REMOTE_RAY_PORT
+
             # Check if head node Ray is alive
             returncode, stdout, _ = head_runner.run(
                 instance_setup.RAY_STATUS_WITH_SKY_RAY_PORT_COMMAND,
                 stream_logs=False,
                 require_outputs=True)
-        if returncode:
-
-
-
-
-
-
-
+            if not returncode:
+                ray_port = message_utils.decode_payload(stdout)['ray_port']
+                logger.debug(f'Ray cluster on head is up with port {ray_port}.')
+
+            head_ray_needs_restart = bool(returncode)
+            # This is a best effort check to see if the ray cluster has expected
+            # number of nodes connected.
+            ray_cluster_healthy = (not head_ray_needs_restart and
+                                   is_ray_cluster_healthy(
+                                       stdout, cluster_info.num_instances))
+            return ray_port, ray_cluster_healthy, head_ray_needs_restart
+
+        status.update(
+            runtime_preparation_str.format(step=3, step_name='runtime'))
+
+        ray_port = constants.SKY_REMOTE_RAY_PORT
+        head_ray_needs_restart = True
+        ray_cluster_healthy = False
+        if (not provision_record.is_instance_just_booted(
+                head_instance.instance_id)):
+            # Check if head node Ray is alive
+            (ray_port, ray_cluster_healthy,
+             head_ray_needs_restart) = check_ray_port_and_cluster_healthy()
+        elif cloud_name.lower() == 'kubernetes':
+            timeout = 90  # 1.5-min maximum timeout
+            start = time.time()
+            while True:
+                # Wait until Ray cluster is ready
+                (ray_port, ray_cluster_healthy,
+                 head_ray_needs_restart) = check_ray_port_and_cluster_healthy()
+                if ray_cluster_healthy:
+                    logger.debug('Ray cluster is ready. Skip head and worker '
+                                 'node ray cluster setup.')
+                    break
+                if time.time() - start > timeout:
+                    # In most cases, the ray cluster will be ready after a few
+                    # seconds. Trigger ray start on head or worker nodes to be
+                    # safe, if the ray cluster is not ready after timeout.
+                    break
+                logger.debug('Ray cluster is not ready yet, waiting for the '
+                             'async setup to complete...')
+                time.sleep(1)
+
+        if head_ray_needs_restart:
             logger.debug('Starting Ray on the entire cluster.')
             instance_setup.start_ray_on_head_node(
                 cluster_name.name_on_cloud,
                 custom_resource=custom_resource,
                 cluster_info=cluster_info,
                 ssh_credentials=ssh_credentials)
+        else:
+            logger.debug('Ray cluster on head is ready. Skip starting ray '
+                         'cluster on head node.')
 
         # NOTE: We have to check all worker nodes to make sure they are all
         # healthy, otherwise we can only start Ray on newly started worker
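To make the parsing in is_ray_cluster_healthy concrete, here is the same counting logic restated standalone and run against the sample output from the docstring above (the function body matches the diff; the prints are just a worked check):

def is_ray_cluster_healthy(ray_status_output: str,
                           expected_num_nodes: int) -> bool:
    # Count the node lines between 'Active:' and 'Pending:'.
    start = ray_status_output.find('Active:')
    end = ray_status_output.find('Pending:', start)
    if start == -1 or end == -1:
        return False
    num_active_nodes = 0
    for line in ray_status_output[start:end].split('\n'):
        if line.strip() and not line.startswith('Active:'):
            num_active_nodes += 1
    return num_active_nodes == expected_num_nodes


SAMPLE = '''Node status
---------------------------------------------------------------
Active:
 1 node_291a8b849439ad6186387c35dc76dc43f9058108f09e8b68108cf9ec
 1 node_0945fbaaa7f0b15a19d2fd3dc48f3a1e2d7c97e4a50ca965f67acbfd
Pending:
 (no pending nodes)
Recent failures:
 (no failures)
'''

print(is_ray_cluster_healthy(SAMPLE, 2))  # True: both nodes are active
print(is_ray_cluster_healthy(SAMPLE, 3))  # False: a node is missing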
@@ -541,10 +607,13 @@ def _post_provision_setup(
         #   if provision_record.is_instance_just_booted(inst.instance_id):
         #     worker_ips.append(inst.public_ip)
 
-        if
+        # We don't need to restart ray on worker nodes if the ray cluster is
+        # already healthy, i.e. the head node has expected number of nodes
+        # connected to the ray cluster.
+        if cluster_info.num_instances > 1 and not ray_cluster_healthy:
             instance_setup.start_ray_on_worker_nodes(
                 cluster_name.name_on_cloud,
-                no_restart=not
+                no_restart=not head_ray_needs_restart,
                 custom_resource=custom_resource,
                 # Pass the ray_port to worker nodes for backward compatibility
                 # as in some existing clusters the ray_port is not dumped with
@@ -553,18 +622,23 @@ def _post_provision_setup(
                 ray_port=ray_port,
                 cluster_info=cluster_info,
                 ssh_credentials=ssh_credentials)
+        elif ray_cluster_healthy:
+            logger.debug('Ray cluster is ready. Skip starting ray cluster on '
+                         'worker nodes.')
 
         instance_setup.start_skylet_on_head_node(cluster_name.name_on_cloud,
                                                  cluster_info, ssh_credentials)
 
-    logger.info(
-
+    logger.info(
+        ux_utils.finishing_message(f'Cluster launched: {cluster_name}.',
+                                   provision_logging.config.log_path))
     return cluster_info
 
 
+@timeline.event
 def post_provision_runtime_setup(
-        cloud_name: str, cluster_name: ClusterName,
-        provision_record: provision_common.ProvisionRecord,
+        cloud_name: str, cluster_name: resources_utils.ClusterName,
+        cluster_yaml: str, provision_record: provision_common.ProvisionRecord,
         custom_resource: Optional[str],
         log_dir: str) -> provision_common.ClusterInfo:
     """Run internal setup commands after provisioning and before user setup.
@@ -588,7 +662,10 @@ def post_provision_runtime_setup(
             provision_record=provision_record,
             custom_resource=custom_resource)
     except Exception:  # pylint: disable=broad-except
-        logger.error(
+        logger.error(
+            ux_utils.error_message(
+                'Failed to set up SkyPilot runtime on cluster.',
+                provision_logging.config.log_path))
         logger.debug(f'Stacktrace:\n{traceback.format_exc()}')
         with ux_utils.print_exception_no_traceback():
             raise
sky/provision/runpod/__init__.py
CHANGED
@@ -4,6 +4,7 @@ from sky.provision.runpod.config import bootstrap_instances
 from sky.provision.runpod.instance import cleanup_ports
 from sky.provision.runpod.instance import get_cluster_info
 from sky.provision.runpod.instance import query_instances
+from sky.provision.runpod.instance import query_ports
 from sky.provision.runpod.instance import run_instances
 from sky.provision.runpod.instance import stop_instances
 from sky.provision.runpod.instance import terminate_instances