skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +64 -32
- sky/adaptors/aws.py +23 -6
- sky/adaptors/azure.py +432 -15
- sky/adaptors/cloudflare.py +5 -5
- sky/adaptors/common.py +19 -9
- sky/adaptors/do.py +20 -0
- sky/adaptors/gcp.py +3 -2
- sky/adaptors/kubernetes.py +122 -88
- sky/adaptors/nebius.py +100 -0
- sky/adaptors/oci.py +39 -1
- sky/adaptors/vast.py +29 -0
- sky/admin_policy.py +101 -0
- sky/authentication.py +117 -98
- sky/backends/backend.py +52 -20
- sky/backends/backend_utils.py +669 -557
- sky/backends/cloud_vm_ray_backend.py +1099 -808
- sky/backends/local_docker_backend.py +14 -8
- sky/backends/wheel_utils.py +38 -20
- sky/benchmark/benchmark_utils.py +22 -23
- sky/check.py +76 -27
- sky/cli.py +1586 -1139
- sky/client/__init__.py +1 -0
- sky/client/cli.py +5683 -0
- sky/client/common.py +345 -0
- sky/client/sdk.py +1765 -0
- sky/cloud_stores.py +283 -19
- sky/clouds/__init__.py +7 -2
- sky/clouds/aws.py +303 -112
- sky/clouds/azure.py +185 -179
- sky/clouds/cloud.py +115 -37
- sky/clouds/cudo.py +29 -22
- sky/clouds/do.py +313 -0
- sky/clouds/fluidstack.py +44 -54
- sky/clouds/gcp.py +206 -65
- sky/clouds/ibm.py +26 -21
- sky/clouds/kubernetes.py +345 -91
- sky/clouds/lambda_cloud.py +40 -29
- sky/clouds/nebius.py +297 -0
- sky/clouds/oci.py +129 -90
- sky/clouds/paperspace.py +22 -18
- sky/clouds/runpod.py +53 -34
- sky/clouds/scp.py +28 -24
- sky/clouds/service_catalog/__init__.py +19 -13
- sky/clouds/service_catalog/aws_catalog.py +29 -12
- sky/clouds/service_catalog/azure_catalog.py +33 -6
- sky/clouds/service_catalog/common.py +95 -75
- sky/clouds/service_catalog/constants.py +3 -3
- sky/clouds/service_catalog/cudo_catalog.py +13 -3
- sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
- sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
- sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
- sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
- sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
- sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
- sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
- sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
- sky/clouds/service_catalog/do_catalog.py +111 -0
- sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
- sky/clouds/service_catalog/gcp_catalog.py +16 -2
- sky/clouds/service_catalog/ibm_catalog.py +2 -2
- sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
- sky/clouds/service_catalog/lambda_catalog.py +8 -3
- sky/clouds/service_catalog/nebius_catalog.py +116 -0
- sky/clouds/service_catalog/oci_catalog.py +31 -4
- sky/clouds/service_catalog/paperspace_catalog.py +2 -2
- sky/clouds/service_catalog/runpod_catalog.py +2 -2
- sky/clouds/service_catalog/scp_catalog.py +2 -2
- sky/clouds/service_catalog/vast_catalog.py +104 -0
- sky/clouds/service_catalog/vsphere_catalog.py +2 -2
- sky/clouds/utils/aws_utils.py +65 -0
- sky/clouds/utils/azure_utils.py +91 -0
- sky/clouds/utils/gcp_utils.py +5 -9
- sky/clouds/utils/oci_utils.py +47 -5
- sky/clouds/utils/scp_utils.py +4 -3
- sky/clouds/vast.py +280 -0
- sky/clouds/vsphere.py +22 -18
- sky/core.py +361 -107
- sky/dag.py +41 -28
- sky/data/data_transfer.py +37 -0
- sky/data/data_utils.py +211 -32
- sky/data/mounting_utils.py +182 -30
- sky/data/storage.py +2118 -270
- sky/data/storage_utils.py +126 -5
- sky/exceptions.py +179 -8
- sky/execution.py +158 -85
- sky/global_user_state.py +150 -34
- sky/jobs/__init__.py +12 -10
- sky/jobs/client/__init__.py +0 -0
- sky/jobs/client/sdk.py +302 -0
- sky/jobs/constants.py +49 -11
- sky/jobs/controller.py +161 -99
- sky/jobs/dashboard/dashboard.py +171 -25
- sky/jobs/dashboard/templates/index.html +572 -60
- sky/jobs/recovery_strategy.py +157 -156
- sky/jobs/scheduler.py +307 -0
- sky/jobs/server/__init__.py +1 -0
- sky/jobs/server/core.py +598 -0
- sky/jobs/server/dashboard_utils.py +69 -0
- sky/jobs/server/server.py +190 -0
- sky/jobs/state.py +627 -122
- sky/jobs/utils.py +615 -206
- sky/models.py +27 -0
- sky/optimizer.py +142 -83
- sky/provision/__init__.py +20 -5
- sky/provision/aws/config.py +124 -42
- sky/provision/aws/instance.py +130 -53
- sky/provision/azure/__init__.py +7 -0
- sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
- sky/provision/azure/config.py +220 -0
- sky/provision/azure/instance.py +1012 -37
- sky/provision/common.py +31 -3
- sky/provision/constants.py +25 -0
- sky/provision/cudo/__init__.py +2 -1
- sky/provision/cudo/cudo_utils.py +112 -0
- sky/provision/cudo/cudo_wrapper.py +37 -16
- sky/provision/cudo/instance.py +28 -12
- sky/provision/do/__init__.py +11 -0
- sky/provision/do/config.py +14 -0
- sky/provision/do/constants.py +10 -0
- sky/provision/do/instance.py +287 -0
- sky/provision/do/utils.py +301 -0
- sky/provision/docker_utils.py +82 -46
- sky/provision/fluidstack/fluidstack_utils.py +57 -125
- sky/provision/fluidstack/instance.py +15 -43
- sky/provision/gcp/config.py +19 -9
- sky/provision/gcp/constants.py +7 -1
- sky/provision/gcp/instance.py +55 -34
- sky/provision/gcp/instance_utils.py +339 -80
- sky/provision/gcp/mig_utils.py +210 -0
- sky/provision/instance_setup.py +172 -133
- sky/provision/kubernetes/__init__.py +1 -0
- sky/provision/kubernetes/config.py +104 -90
- sky/provision/kubernetes/constants.py +8 -0
- sky/provision/kubernetes/instance.py +680 -325
- sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
- sky/provision/kubernetes/network.py +54 -20
- sky/provision/kubernetes/network_utils.py +70 -21
- sky/provision/kubernetes/utils.py +1370 -251
- sky/provision/lambda_cloud/__init__.py +11 -0
- sky/provision/lambda_cloud/config.py +10 -0
- sky/provision/lambda_cloud/instance.py +265 -0
- sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
- sky/provision/logging.py +1 -1
- sky/provision/nebius/__init__.py +11 -0
- sky/provision/nebius/config.py +11 -0
- sky/provision/nebius/instance.py +285 -0
- sky/provision/nebius/utils.py +318 -0
- sky/provision/oci/__init__.py +15 -0
- sky/provision/oci/config.py +51 -0
- sky/provision/oci/instance.py +436 -0
- sky/provision/oci/query_utils.py +681 -0
- sky/provision/paperspace/constants.py +6 -0
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/paperspace/utils.py +2 -0
- sky/provision/provisioner.py +207 -130
- sky/provision/runpod/__init__.py +1 -0
- sky/provision/runpod/api/__init__.py +3 -0
- sky/provision/runpod/api/commands.py +119 -0
- sky/provision/runpod/api/pods.py +142 -0
- sky/provision/runpod/instance.py +64 -8
- sky/provision/runpod/utils.py +239 -23
- sky/provision/vast/__init__.py +10 -0
- sky/provision/vast/config.py +11 -0
- sky/provision/vast/instance.py +247 -0
- sky/provision/vast/utils.py +162 -0
- sky/provision/vsphere/common/vim_utils.py +1 -1
- sky/provision/vsphere/instance.py +8 -18
- sky/provision/vsphere/vsphere_utils.py +1 -1
- sky/resources.py +247 -102
- sky/serve/__init__.py +9 -9
- sky/serve/autoscalers.py +361 -299
- sky/serve/client/__init__.py +0 -0
- sky/serve/client/sdk.py +366 -0
- sky/serve/constants.py +12 -3
- sky/serve/controller.py +106 -36
- sky/serve/load_balancer.py +63 -12
- sky/serve/load_balancing_policies.py +84 -2
- sky/serve/replica_managers.py +42 -34
- sky/serve/serve_state.py +62 -32
- sky/serve/serve_utils.py +271 -160
- sky/serve/server/__init__.py +0 -0
- sky/serve/{core.py → server/core.py} +271 -90
- sky/serve/server/server.py +112 -0
- sky/serve/service.py +52 -16
- sky/serve/service_spec.py +95 -32
- sky/server/__init__.py +1 -0
- sky/server/common.py +430 -0
- sky/server/constants.py +21 -0
- sky/server/html/log.html +174 -0
- sky/server/requests/__init__.py +0 -0
- sky/server/requests/executor.py +472 -0
- sky/server/requests/payloads.py +487 -0
- sky/server/requests/queues/__init__.py +0 -0
- sky/server/requests/queues/mp_queue.py +76 -0
- sky/server/requests/requests.py +567 -0
- sky/server/requests/serializers/__init__.py +0 -0
- sky/server/requests/serializers/decoders.py +192 -0
- sky/server/requests/serializers/encoders.py +166 -0
- sky/server/server.py +1106 -0
- sky/server/stream_utils.py +141 -0
- sky/setup_files/MANIFEST.in +2 -5
- sky/setup_files/dependencies.py +159 -0
- sky/setup_files/setup.py +14 -125
- sky/sky_logging.py +59 -14
- sky/skylet/autostop_lib.py +2 -2
- sky/skylet/constants.py +183 -50
- sky/skylet/events.py +22 -10
- sky/skylet/job_lib.py +403 -258
- sky/skylet/log_lib.py +111 -71
- sky/skylet/log_lib.pyi +6 -0
- sky/skylet/providers/command_runner.py +6 -8
- sky/skylet/providers/ibm/node_provider.py +2 -2
- sky/skylet/providers/scp/config.py +11 -3
- sky/skylet/providers/scp/node_provider.py +8 -8
- sky/skylet/skylet.py +3 -1
- sky/skylet/subprocess_daemon.py +69 -17
- sky/skypilot_config.py +119 -57
- sky/task.py +205 -64
- sky/templates/aws-ray.yml.j2 +37 -7
- sky/templates/azure-ray.yml.j2 +27 -82
- sky/templates/cudo-ray.yml.j2 +7 -3
- sky/templates/do-ray.yml.j2 +98 -0
- sky/templates/fluidstack-ray.yml.j2 +7 -4
- sky/templates/gcp-ray.yml.j2 +26 -6
- sky/templates/ibm-ray.yml.j2 +3 -2
- sky/templates/jobs-controller.yaml.j2 +46 -11
- sky/templates/kubernetes-ingress.yml.j2 +7 -0
- sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
- sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
- sky/templates/kubernetes-ray.yml.j2 +292 -25
- sky/templates/lambda-ray.yml.j2 +30 -40
- sky/templates/nebius-ray.yml.j2 +79 -0
- sky/templates/oci-ray.yml.j2 +18 -57
- sky/templates/paperspace-ray.yml.j2 +10 -6
- sky/templates/runpod-ray.yml.j2 +26 -4
- sky/templates/scp-ray.yml.j2 +3 -2
- sky/templates/sky-serve-controller.yaml.j2 +12 -1
- sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
- sky/templates/vast-ray.yml.j2 +70 -0
- sky/templates/vsphere-ray.yml.j2 +8 -3
- sky/templates/websocket_proxy.py +64 -0
- sky/usage/constants.py +10 -1
- sky/usage/usage_lib.py +130 -37
- sky/utils/accelerator_registry.py +35 -51
- sky/utils/admin_policy_utils.py +147 -0
- sky/utils/annotations.py +51 -0
- sky/utils/cli_utils/status_utils.py +81 -23
- sky/utils/cluster_utils.py +356 -0
- sky/utils/command_runner.py +452 -89
- sky/utils/command_runner.pyi +77 -3
- sky/utils/common.py +54 -0
- sky/utils/common_utils.py +319 -108
- sky/utils/config_utils.py +204 -0
- sky/utils/control_master_utils.py +48 -0
- sky/utils/controller_utils.py +548 -266
- sky/utils/dag_utils.py +93 -32
- sky/utils/db_utils.py +18 -4
- sky/utils/env_options.py +29 -7
- sky/utils/kubernetes/create_cluster.sh +8 -60
- sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
- sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
- sky/utils/kubernetes/gpu_labeler.py +4 -4
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
- sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
- sky/utils/kubernetes/rsync_helper.sh +24 -0
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
- sky/utils/log_utils.py +240 -33
- sky/utils/message_utils.py +81 -0
- sky/utils/registry.py +127 -0
- sky/utils/resources_utils.py +94 -22
- sky/utils/rich_utils.py +247 -18
- sky/utils/schemas.py +284 -64
- sky/{status_lib.py → utils/status_lib.py} +12 -7
- sky/utils/subprocess_utils.py +212 -46
- sky/utils/timeline.py +12 -7
- sky/utils/ux_utils.py +168 -15
- skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
- skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
- sky/clouds/cloud_registry.py +0 -31
- sky/jobs/core.py +0 -330
- sky/skylet/providers/azure/__init__.py +0 -2
- sky/skylet/providers/azure/azure-vm-template.json +0 -301
- sky/skylet/providers/azure/config.py +0 -170
- sky/skylet/providers/azure/node_provider.py +0 -466
- sky/skylet/providers/lambda_cloud/__init__.py +0 -2
- sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
- sky/skylet/providers/oci/__init__.py +0 -2
- sky/skylet/providers/oci/node_provider.py +0 -488
- sky/skylet/providers/oci/query_helper.py +0 -383
- sky/skylet/providers/oci/utils.py +0 -21
- sky/utils/cluster_yaml_utils.py +0 -24
- sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
- skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
- skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
@@ -1,320 +0,0 @@
|
|
1
|
-
import logging
|
2
|
-
import os
|
3
|
-
from threading import RLock
|
4
|
-
import time
|
5
|
-
from typing import Any, Dict, List, Optional
|
6
|
-
|
7
|
-
from ray.autoscaler.node_provider import NodeProvider
|
8
|
-
from ray.autoscaler.tags import NODE_KIND_HEAD
|
9
|
-
from ray.autoscaler.tags import NODE_KIND_WORKER
|
10
|
-
from ray.autoscaler.tags import STATUS_UP_TO_DATE
|
11
|
-
from ray.autoscaler.tags import TAG_RAY_CLUSTER_NAME
|
12
|
-
from ray.autoscaler.tags import TAG_RAY_NODE_KIND
|
13
|
-
from ray.autoscaler.tags import TAG_RAY_NODE_NAME
|
14
|
-
from ray.autoscaler.tags import TAG_RAY_NODE_STATUS
|
15
|
-
from ray.autoscaler.tags import TAG_RAY_USER_NODE_TYPE
|
16
|
-
|
17
|
-
from sky import authentication as auth
|
18
|
-
from sky.clouds.utils import lambda_utils
|
19
|
-
from sky.utils import command_runner
|
20
|
-
from sky.utils import common_utils
|
21
|
-
from sky.utils import subprocess_utils
|
22
|
-
from sky.utils import ux_utils
|
23
|
-
|
24
|
-
_TAG_PATH_PREFIX = '~/.sky/generated/lambda_cloud/metadata'
|
25
|
-
_REMOTE_SSH_KEY_NAME = '~/.lambda_cloud/ssh_key_name'
|
26
|
-
_REMOTE_RAY_SSH_KEY = '~/ray_bootstrap_key.pem'
|
27
|
-
_REMOTE_RAY_YAML = '~/ray_bootstrap_config.yaml'
|
28
|
-
_GET_INTERNAL_IP_CMD = 'ip -4 -br addr show | grep UP | grep -Eo "(10\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)|172\.(1[6-9]|2[0-9]|3[0-1]))\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)"'
|
29
|
-
|
30
|
-
logger = logging.getLogger(__name__)
|
31
|
-
|
32
|
-
|
33
|
-
def synchronized(f):
    """Decorator that runs a method while holding ``self.lock``.

    The decorated callable must be a method of an object exposing a ``lock``
    attribute usable as a context manager (this file uses a
    ``threading.RLock``). The lock is released even if the method raises.

    Args:
        f: The method to wrap; it is called as ``f(self, *args, **kwargs)``.

    Returns:
        A wrapper with the same name and docstring as ``f``.
    """
    # Imported here so this helper does not depend on the module's import
    # block being changed.
    import functools

    @functools.wraps(f)
    def wrapper(self, *args, **kwargs):
        with self.lock:
            return f(self, *args, **kwargs)

    return wrapper
|
43
|
-
|
44
|
-
|
45
|
-
class LambdaNodeProvider(NodeProvider):
    """Node Provider for Lambda Cloud.

    This provider assumes Lambda Cloud credentials are set.

    Instance tags are not read from the Lambda API responses; they are kept
    in a ``lambda_utils.Metadata`` store keyed by instance id. Instances are
    associated with the cluster purely by name (``<cluster>-head`` /
    ``<cluster>-worker``).
    """

    def __init__(self, provider_config: Dict[str, Any],
                 cluster_name: str) -> None:
        """Sets up the API client, tag store and SSH key information.

        Args:
            provider_config: Provider section of the cluster config;
                ``create_node`` reads its ``'region'`` entry.
            cluster_name: Name of the cluster this provider manages.

        Raises:
            lambda_utils.LambdaCloudError: If no registered SSH key matching
                the local public key is found.
        """
        NodeProvider.__init__(self, provider_config, cluster_name)
        self.lock = RLock()
        self.lambda_client = lambda_utils.LambdaCloudClient()
        self.cached_nodes: Dict[str, Dict[str, Any]] = {}
        self.metadata = lambda_utils.Metadata(_TAG_PATH_PREFIX, cluster_name)
        self.ssh_key_path = os.path.expanduser(auth.PRIVATE_SSH_KEY_PATH)

        def _get_ssh_key_name(prefix: str) -> str:
            """Looks up the Lambda key name registered for our public key."""
            public_key_path = os.path.expanduser(auth.PUBLIC_SSH_KEY_PATH)
            with open(public_key_path, 'r', encoding='utf-8') as f:
                public_key = f.read()
            name, exists = self.lambda_client.get_unique_ssh_key_name(
                prefix, public_key)
            if not exists:
                raise lambda_utils.LambdaCloudError('SSH key not found')
            return name

        # We are on the head node iff the Ray bootstrap YAML exists and
        # refers to this very cluster.
        ray_yaml_path = os.path.expanduser(_REMOTE_RAY_YAML)
        self.on_head = (os.path.exists(ray_yaml_path) and
                        common_utils.read_yaml(ray_yaml_path)['cluster_name']
                        == cluster_name)

        if self.on_head:
            self.ssh_key_path = os.path.expanduser(_REMOTE_RAY_SSH_KEY)
            ssh_key_name_path = os.path.expanduser(_REMOTE_SSH_KEY_NAME)
            if os.path.exists(ssh_key_name_path):
                # Read with the same encoding the name was written with.
                with open(ssh_key_name_path, 'r', encoding='utf-8') as f:
                    self.ssh_key_name = f.read()
            else:
                # At this point, `~/.ssh/sky-key.pub` contains the public
                # key used to launch this cluster. Use it to determine
                # ssh key name and store the name in _REMOTE_SSH_KEY_NAME.
                # Note: this case only runs during cluster launch, so it is
                # not possible for ~/.ssh/sky-key.pub to already be regenerated
                # by the user.
                self.ssh_key_name = _get_ssh_key_name('')
                with open(ssh_key_name_path, 'w', encoding='utf-8') as f:
                    f.write(self.ssh_key_name)
        else:
            # On local
            self.ssh_key_name = _get_ssh_key_name(
                f'sky-key-{common_utils.get_user_hash()}')

    def _guess_and_add_missing_tags(self, vms: List[Dict[str, Any]]) -> None:
        """Adds missing vms to local tag file and guesses their tags.

        Only instances without an existing metadata entry are touched, and
        only if their name is exactly the cluster's head or worker name.
        """
        # instance name -> (Ray node kind, Ray user node type)
        guesses = {
            f'{self.cluster_name}-head': (NODE_KIND_HEAD, 'ray_head_default'),
            f'{self.cluster_name}-worker':
                (NODE_KIND_WORKER, 'ray_worker_default'),
        }
        for node in vms:
            if self.metadata.get(node['id']) is not None:
                continue
            name = node['name']
            guess = guesses.get(name)
            if guess is None:
                continue
            node_kind, node_type = guess
            self.metadata.set(
                node['id'], {
                    'tags': {
                        TAG_RAY_CLUSTER_NAME: self.cluster_name,
                        TAG_RAY_NODE_STATUS: STATUS_UP_TO_DATE,
                        TAG_RAY_NODE_KIND: node_kind,
                        TAG_RAY_USER_NODE_TYPE: node_type,
                        TAG_RAY_NODE_NAME: f'ray-{name}',
                    }
                })

    def _list_instances_in_cluster(self) -> List[Dict[str, Any]]:
        """List instances whose name marks them as part of this cluster."""
        vms = self.lambda_client.list_instances()
        possible_names = (f'{self.cluster_name}-head',
                          f'{self.cluster_name}-worker')
        return [node for node in vms if node.get('name') in possible_names]

    @synchronized
    def _get_filtered_nodes(self, tag_filters: Dict[str,
                                                    str]) -> Dict[str, Any]:
        """Refreshes and returns the node cache, filtered by tags.

        Side effect: ``self.cached_nodes`` is replaced by the result, so a
        call with a non-empty filter narrows the cache to matching nodes.

        Args:
            tag_filters: Tag key/value pairs every returned node must have.

        Returns:
            Mapping of instance id to a dict with keys ``id``, ``status``,
            ``tags``, ``external_ip`` and ``internal_ip``. Instances in
            status 'terminating' or 'terminated' are excluded.
        """

        def _extract_metadata(vm: Dict[str, Any]) -> Dict[str, Any]:
            metadata = {'id': vm['id'], 'status': vm['status'], 'tags': {}}
            instance_info = self.metadata.get(vm['id'])
            if instance_info is not None:
                metadata['tags'] = instance_info['tags']
            metadata['external_ip'] = vm.get('ip')
            return metadata

        def _match_tags(vm: Dict[str, Any]) -> bool:
            vm_info = self.metadata.get(vm['id'])
            tags = {} if vm_info is None else vm_info['tags']
            return all(tags.get(k) == v for k, v in tag_filters.items())

        def _get_internal_ip(node: Dict[str, Any]) -> None:
            # TODO(ewzeng): cache internal ips in metadata file to reduce
            # ssh overhead.
            if node['external_ip'] is None or node['status'] != 'active':
                node['internal_ip'] = None
                return
            runner = command_runner.SSHCommandRunner(
                node=(node['external_ip'], 22),
                ssh_user='ubuntu',
                ssh_private_key=self.ssh_key_path)
            rc, stdout, stderr = runner.run(_GET_INTERNAL_IP_CMD,
                                            require_outputs=True,
                                            stream_logs=False)
            subprocess_utils.handle_returncode(
                rc,
                _GET_INTERNAL_IP_CMD,
                'Failed to obtain private IP from node',
                stderr=stdout + stderr)
            node['internal_ip'] = stdout.strip()

        vms = self._list_instances_in_cluster()
        self.metadata.refresh([node['id'] for node in vms])
        self._guess_and_add_missing_tags(vms)
        nodes = [_extract_metadata(vm) for vm in filter(_match_tags, vms)]
        nodes = [
            node for node in nodes
            if node['status'] not in ['terminating', 'terminated']
        ]
        # Internal IPs are obtained over SSH, one connection per node.
        subprocess_utils.run_in_parallel(_get_internal_ip, nodes)
        self.cached_nodes = {node['id']: node for node in nodes}
        return self.cached_nodes

    def non_terminated_nodes(self, tag_filters: Dict[str, str]) -> List[str]:
        """Return a list of node ids filtered by the specified tags dict.

        This list must not include terminated nodes. For performance reasons,
        providers are allowed to cache the result of a call to
        non_terminated_nodes() to serve single-node queries
        (e.g. is_running(node_id)). This means that non_terminated_nodes() must
        be called again to refresh results.

        Examples:
            >>> provider.non_terminated_nodes({TAG_RAY_NODE_KIND: "worker"})
            ["node-1", "node-2"]
        """
        nodes = self._get_filtered_nodes(tag_filters=tag_filters)
        return list(nodes)

    def is_running(self, node_id: str) -> bool:
        """Return whether the specified node is running.

        A node counts as running when it is present in the (possibly
        refreshed) node cache.
        """
        return self._get_cached_node(node_id=node_id) is not None

    def is_terminated(self, node_id: str) -> bool:
        """Return whether the specified node is terminated.

        This is the exact negation of ``is_running``.
        """
        return self._get_cached_node(node_id=node_id) is None

    def node_tags(self, node_id: str) -> Dict[str, str]:
        """Returns the tags of the given node (string dict).

        An empty dict is returned for an unknown node.
        """
        node = self._get_cached_node(node_id=node_id)
        if node is None:
            return {}
        return node['tags']

    def _get_node_ip(self, node_id: str, key: str) -> Optional[str]:
        """Returns ``node[key]`` for the given node, or None if unknown.

        Args:
            node_id: Instance id to look up.
            key: Either 'external_ip' or 'internal_ip'.

        Raises:
            lambda_utils.LambdaCloudError: If the node is known but the
                requested address is missing.
        """
        node = self._get_cached_node(node_id=node_id)
        if node is None:
            return None
        ip = node.get(key)
        with ux_utils.print_exception_no_traceback():
            if ip is None:
                raise lambda_utils.LambdaCloudError(
                    'A node ip address was not found. Either '
                    '(1) Lambda Cloud has internally errored, or '
                    '(2) the cluster is still booting. '
                    'You can manually terminate the cluster on the '
                    'Lambda Cloud console or (in case 2) wait for '
                    'booting to finish (~2 minutes).')
        return ip

    def external_ip(self, node_id: str) -> Optional[str]:
        """Returns the external ip of the given node."""
        return self._get_node_ip(node_id, 'external_ip')

    def internal_ip(self, node_id: str) -> Optional[str]:
        """Returns the internal ip (Ray ip) of the given node."""
        return self._get_node_ip(node_id, 'internal_ip')

    def create_node(self, node_config: Dict[str, Any], tags: Dict[str, str],
                    count: int) -> None:
        """Creates a number of nodes within the namespace.

        Launches ``count`` instances one at a time, records their tags, and
        blocks until every launched instance reports status 'active'.

        Args:
            node_config: Node config; must contain 'InstanceType' and may
                contain 'tags'.
            tags: Tags to apply; they override tags from ``node_config``.
            count: Number of instances to launch.

        Raises:
            lambda_utils.LambdaCloudError: If a head node is requested while
                running on the head node.
        """
        # Get tags
        config_tags = node_config.get('tags', {}).copy()
        config_tags.update(tags)
        config_tags[TAG_RAY_CLUSTER_NAME] = self.cluster_name

        # Create nodes
        instance_type = node_config['InstanceType']
        region = self.provider_config['region']

        if config_tags[TAG_RAY_NODE_KIND] == NODE_KIND_HEAD:
            name = f'{self.cluster_name}-head'
            # Occasionally, the head node will continue running for a short
            # period after termination. This can lead to the following bug:
            # 1. Head node autodowns but continues running.
            # 2. The next autodown event is triggered, which executes ray up.
            # 3. Head node stops running.
            # In this case, a new head node is created after the cluster has
            # terminated. We avoid this with the following check:
            if self.on_head:
                raise lambda_utils.LambdaCloudError('Head already exists.')
        else:
            name = f'{self.cluster_name}-worker'

        # Lambda launch api only supports launching one node at a time,
        # so we do a loop. Remove loop when launch api allows quantity > 1
        pending = set()
        for _ in range(count):
            vm_id = self.lambda_client.create_instances(
                instance_type=instance_type,
                region=region,
                quantity=1,
                name=name,
                ssh_key_name=self.ssh_key_name)[0]
            self.metadata.set(vm_id, {'tags': config_tags})
            pending.add(vm_id)
            time.sleep(10)  # Avoid api rate limits

        # Wait for nodes to finish booting. Set subtraction tolerates an id
        # being listed more than once and skips polling when nothing was
        # launched.
        while pending:
            active_ids = {
                vm['id']
                for vm in self._list_instances_in_cluster()
                if vm.get('status') == 'active'
            }
            pending -= active_ids
            if pending:
                time.sleep(10)

    @synchronized
    def set_node_tags(self, node_id: str, tags: Dict[str, str]) -> None:
        """Sets the tag values (string dict) for the specified node.

        The node list is refreshed first; the merged tags are then written
        back to the metadata store.
        """
        node = self._get_node(node_id)
        assert node is not None, node_id
        node['tags'].update(tags)
        self.metadata.set(node_id, {'tags': node['tags']})

    def terminate_node(self, node_id: str) -> None:
        """Terminates the specified node and clears its stored tags."""
        self.lambda_client.remove_instances(node_id)
        self.metadata.set(node_id, None)

    def _get_node(self, node_id: str) -> Optional[Dict[str, Any]]:
        """Refreshes the node cache and returns the entry for ``node_id``."""
        self._get_filtered_nodes({})  # Side effect: updates cache
        return self.cached_nodes.get(node_id, None)

    def _get_cached_node(self, node_id: str) -> Optional[Dict[str, Any]]:
        """Returns the cached entry for ``node_id``, refreshing on a miss."""
        # Single lookup: the cache dict may be replaced by a concurrent
        # refresh between a membership test and an index.
        cached = self.cached_nodes.get(node_id)
        if cached is not None:
            return cached
        return self._get_node(node_id=node_id)
|