skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +64 -32
- sky/adaptors/aws.py +23 -6
- sky/adaptors/azure.py +432 -15
- sky/adaptors/cloudflare.py +5 -5
- sky/adaptors/common.py +19 -9
- sky/adaptors/do.py +20 -0
- sky/adaptors/gcp.py +3 -2
- sky/adaptors/kubernetes.py +122 -88
- sky/adaptors/nebius.py +100 -0
- sky/adaptors/oci.py +39 -1
- sky/adaptors/vast.py +29 -0
- sky/admin_policy.py +101 -0
- sky/authentication.py +117 -98
- sky/backends/backend.py +52 -20
- sky/backends/backend_utils.py +669 -557
- sky/backends/cloud_vm_ray_backend.py +1099 -808
- sky/backends/local_docker_backend.py +14 -8
- sky/backends/wheel_utils.py +38 -20
- sky/benchmark/benchmark_utils.py +22 -23
- sky/check.py +76 -27
- sky/cli.py +1586 -1139
- sky/client/__init__.py +1 -0
- sky/client/cli.py +5683 -0
- sky/client/common.py +345 -0
- sky/client/sdk.py +1765 -0
- sky/cloud_stores.py +283 -19
- sky/clouds/__init__.py +7 -2
- sky/clouds/aws.py +303 -112
- sky/clouds/azure.py +185 -179
- sky/clouds/cloud.py +115 -37
- sky/clouds/cudo.py +29 -22
- sky/clouds/do.py +313 -0
- sky/clouds/fluidstack.py +44 -54
- sky/clouds/gcp.py +206 -65
- sky/clouds/ibm.py +26 -21
- sky/clouds/kubernetes.py +345 -91
- sky/clouds/lambda_cloud.py +40 -29
- sky/clouds/nebius.py +297 -0
- sky/clouds/oci.py +129 -90
- sky/clouds/paperspace.py +22 -18
- sky/clouds/runpod.py +53 -34
- sky/clouds/scp.py +28 -24
- sky/clouds/service_catalog/__init__.py +19 -13
- sky/clouds/service_catalog/aws_catalog.py +29 -12
- sky/clouds/service_catalog/azure_catalog.py +33 -6
- sky/clouds/service_catalog/common.py +95 -75
- sky/clouds/service_catalog/constants.py +3 -3
- sky/clouds/service_catalog/cudo_catalog.py +13 -3
- sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
- sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
- sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
- sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
- sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
- sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
- sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
- sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
- sky/clouds/service_catalog/do_catalog.py +111 -0
- sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
- sky/clouds/service_catalog/gcp_catalog.py +16 -2
- sky/clouds/service_catalog/ibm_catalog.py +2 -2
- sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
- sky/clouds/service_catalog/lambda_catalog.py +8 -3
- sky/clouds/service_catalog/nebius_catalog.py +116 -0
- sky/clouds/service_catalog/oci_catalog.py +31 -4
- sky/clouds/service_catalog/paperspace_catalog.py +2 -2
- sky/clouds/service_catalog/runpod_catalog.py +2 -2
- sky/clouds/service_catalog/scp_catalog.py +2 -2
- sky/clouds/service_catalog/vast_catalog.py +104 -0
- sky/clouds/service_catalog/vsphere_catalog.py +2 -2
- sky/clouds/utils/aws_utils.py +65 -0
- sky/clouds/utils/azure_utils.py +91 -0
- sky/clouds/utils/gcp_utils.py +5 -9
- sky/clouds/utils/oci_utils.py +47 -5
- sky/clouds/utils/scp_utils.py +4 -3
- sky/clouds/vast.py +280 -0
- sky/clouds/vsphere.py +22 -18
- sky/core.py +361 -107
- sky/dag.py +41 -28
- sky/data/data_transfer.py +37 -0
- sky/data/data_utils.py +211 -32
- sky/data/mounting_utils.py +182 -30
- sky/data/storage.py +2118 -270
- sky/data/storage_utils.py +126 -5
- sky/exceptions.py +179 -8
- sky/execution.py +158 -85
- sky/global_user_state.py +150 -34
- sky/jobs/__init__.py +12 -10
- sky/jobs/client/__init__.py +0 -0
- sky/jobs/client/sdk.py +302 -0
- sky/jobs/constants.py +49 -11
- sky/jobs/controller.py +161 -99
- sky/jobs/dashboard/dashboard.py +171 -25
- sky/jobs/dashboard/templates/index.html +572 -60
- sky/jobs/recovery_strategy.py +157 -156
- sky/jobs/scheduler.py +307 -0
- sky/jobs/server/__init__.py +1 -0
- sky/jobs/server/core.py +598 -0
- sky/jobs/server/dashboard_utils.py +69 -0
- sky/jobs/server/server.py +190 -0
- sky/jobs/state.py +627 -122
- sky/jobs/utils.py +615 -206
- sky/models.py +27 -0
- sky/optimizer.py +142 -83
- sky/provision/__init__.py +20 -5
- sky/provision/aws/config.py +124 -42
- sky/provision/aws/instance.py +130 -53
- sky/provision/azure/__init__.py +7 -0
- sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
- sky/provision/azure/config.py +220 -0
- sky/provision/azure/instance.py +1012 -37
- sky/provision/common.py +31 -3
- sky/provision/constants.py +25 -0
- sky/provision/cudo/__init__.py +2 -1
- sky/provision/cudo/cudo_utils.py +112 -0
- sky/provision/cudo/cudo_wrapper.py +37 -16
- sky/provision/cudo/instance.py +28 -12
- sky/provision/do/__init__.py +11 -0
- sky/provision/do/config.py +14 -0
- sky/provision/do/constants.py +10 -0
- sky/provision/do/instance.py +287 -0
- sky/provision/do/utils.py +301 -0
- sky/provision/docker_utils.py +82 -46
- sky/provision/fluidstack/fluidstack_utils.py +57 -125
- sky/provision/fluidstack/instance.py +15 -43
- sky/provision/gcp/config.py +19 -9
- sky/provision/gcp/constants.py +7 -1
- sky/provision/gcp/instance.py +55 -34
- sky/provision/gcp/instance_utils.py +339 -80
- sky/provision/gcp/mig_utils.py +210 -0
- sky/provision/instance_setup.py +172 -133
- sky/provision/kubernetes/__init__.py +1 -0
- sky/provision/kubernetes/config.py +104 -90
- sky/provision/kubernetes/constants.py +8 -0
- sky/provision/kubernetes/instance.py +680 -325
- sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
- sky/provision/kubernetes/network.py +54 -20
- sky/provision/kubernetes/network_utils.py +70 -21
- sky/provision/kubernetes/utils.py +1370 -251
- sky/provision/lambda_cloud/__init__.py +11 -0
- sky/provision/lambda_cloud/config.py +10 -0
- sky/provision/lambda_cloud/instance.py +265 -0
- sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
- sky/provision/logging.py +1 -1
- sky/provision/nebius/__init__.py +11 -0
- sky/provision/nebius/config.py +11 -0
- sky/provision/nebius/instance.py +285 -0
- sky/provision/nebius/utils.py +318 -0
- sky/provision/oci/__init__.py +15 -0
- sky/provision/oci/config.py +51 -0
- sky/provision/oci/instance.py +436 -0
- sky/provision/oci/query_utils.py +681 -0
- sky/provision/paperspace/constants.py +6 -0
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/paperspace/utils.py +2 -0
- sky/provision/provisioner.py +207 -130
- sky/provision/runpod/__init__.py +1 -0
- sky/provision/runpod/api/__init__.py +3 -0
- sky/provision/runpod/api/commands.py +119 -0
- sky/provision/runpod/api/pods.py +142 -0
- sky/provision/runpod/instance.py +64 -8
- sky/provision/runpod/utils.py +239 -23
- sky/provision/vast/__init__.py +10 -0
- sky/provision/vast/config.py +11 -0
- sky/provision/vast/instance.py +247 -0
- sky/provision/vast/utils.py +162 -0
- sky/provision/vsphere/common/vim_utils.py +1 -1
- sky/provision/vsphere/instance.py +8 -18
- sky/provision/vsphere/vsphere_utils.py +1 -1
- sky/resources.py +247 -102
- sky/serve/__init__.py +9 -9
- sky/serve/autoscalers.py +361 -299
- sky/serve/client/__init__.py +0 -0
- sky/serve/client/sdk.py +366 -0
- sky/serve/constants.py +12 -3
- sky/serve/controller.py +106 -36
- sky/serve/load_balancer.py +63 -12
- sky/serve/load_balancing_policies.py +84 -2
- sky/serve/replica_managers.py +42 -34
- sky/serve/serve_state.py +62 -32
- sky/serve/serve_utils.py +271 -160
- sky/serve/server/__init__.py +0 -0
- sky/serve/{core.py → server/core.py} +271 -90
- sky/serve/server/server.py +112 -0
- sky/serve/service.py +52 -16
- sky/serve/service_spec.py +95 -32
- sky/server/__init__.py +1 -0
- sky/server/common.py +430 -0
- sky/server/constants.py +21 -0
- sky/server/html/log.html +174 -0
- sky/server/requests/__init__.py +0 -0
- sky/server/requests/executor.py +472 -0
- sky/server/requests/payloads.py +487 -0
- sky/server/requests/queues/__init__.py +0 -0
- sky/server/requests/queues/mp_queue.py +76 -0
- sky/server/requests/requests.py +567 -0
- sky/server/requests/serializers/__init__.py +0 -0
- sky/server/requests/serializers/decoders.py +192 -0
- sky/server/requests/serializers/encoders.py +166 -0
- sky/server/server.py +1106 -0
- sky/server/stream_utils.py +141 -0
- sky/setup_files/MANIFEST.in +2 -5
- sky/setup_files/dependencies.py +159 -0
- sky/setup_files/setup.py +14 -125
- sky/sky_logging.py +59 -14
- sky/skylet/autostop_lib.py +2 -2
- sky/skylet/constants.py +183 -50
- sky/skylet/events.py +22 -10
- sky/skylet/job_lib.py +403 -258
- sky/skylet/log_lib.py +111 -71
- sky/skylet/log_lib.pyi +6 -0
- sky/skylet/providers/command_runner.py +6 -8
- sky/skylet/providers/ibm/node_provider.py +2 -2
- sky/skylet/providers/scp/config.py +11 -3
- sky/skylet/providers/scp/node_provider.py +8 -8
- sky/skylet/skylet.py +3 -1
- sky/skylet/subprocess_daemon.py +69 -17
- sky/skypilot_config.py +119 -57
- sky/task.py +205 -64
- sky/templates/aws-ray.yml.j2 +37 -7
- sky/templates/azure-ray.yml.j2 +27 -82
- sky/templates/cudo-ray.yml.j2 +7 -3
- sky/templates/do-ray.yml.j2 +98 -0
- sky/templates/fluidstack-ray.yml.j2 +7 -4
- sky/templates/gcp-ray.yml.j2 +26 -6
- sky/templates/ibm-ray.yml.j2 +3 -2
- sky/templates/jobs-controller.yaml.j2 +46 -11
- sky/templates/kubernetes-ingress.yml.j2 +7 -0
- sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
- sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
- sky/templates/kubernetes-ray.yml.j2 +292 -25
- sky/templates/lambda-ray.yml.j2 +30 -40
- sky/templates/nebius-ray.yml.j2 +79 -0
- sky/templates/oci-ray.yml.j2 +18 -57
- sky/templates/paperspace-ray.yml.j2 +10 -6
- sky/templates/runpod-ray.yml.j2 +26 -4
- sky/templates/scp-ray.yml.j2 +3 -2
- sky/templates/sky-serve-controller.yaml.j2 +12 -1
- sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
- sky/templates/vast-ray.yml.j2 +70 -0
- sky/templates/vsphere-ray.yml.j2 +8 -3
- sky/templates/websocket_proxy.py +64 -0
- sky/usage/constants.py +10 -1
- sky/usage/usage_lib.py +130 -37
- sky/utils/accelerator_registry.py +35 -51
- sky/utils/admin_policy_utils.py +147 -0
- sky/utils/annotations.py +51 -0
- sky/utils/cli_utils/status_utils.py +81 -23
- sky/utils/cluster_utils.py +356 -0
- sky/utils/command_runner.py +452 -89
- sky/utils/command_runner.pyi +77 -3
- sky/utils/common.py +54 -0
- sky/utils/common_utils.py +319 -108
- sky/utils/config_utils.py +204 -0
- sky/utils/control_master_utils.py +48 -0
- sky/utils/controller_utils.py +548 -266
- sky/utils/dag_utils.py +93 -32
- sky/utils/db_utils.py +18 -4
- sky/utils/env_options.py +29 -7
- sky/utils/kubernetes/create_cluster.sh +8 -60
- sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
- sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
- sky/utils/kubernetes/gpu_labeler.py +4 -4
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
- sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
- sky/utils/kubernetes/rsync_helper.sh +24 -0
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
- sky/utils/log_utils.py +240 -33
- sky/utils/message_utils.py +81 -0
- sky/utils/registry.py +127 -0
- sky/utils/resources_utils.py +94 -22
- sky/utils/rich_utils.py +247 -18
- sky/utils/schemas.py +284 -64
- sky/{status_lib.py → utils/status_lib.py} +12 -7
- sky/utils/subprocess_utils.py +212 -46
- sky/utils/timeline.py +12 -7
- sky/utils/ux_utils.py +168 -15
- skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
- skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
- sky/clouds/cloud_registry.py +0 -31
- sky/jobs/core.py +0 -330
- sky/skylet/providers/azure/__init__.py +0 -2
- sky/skylet/providers/azure/azure-vm-template.json +0 -301
- sky/skylet/providers/azure/config.py +0 -170
- sky/skylet/providers/azure/node_provider.py +0 -466
- sky/skylet/providers/lambda_cloud/__init__.py +0 -2
- sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
- sky/skylet/providers/oci/__init__.py +0 -2
- sky/skylet/providers/oci/node_provider.py +0 -488
- sky/skylet/providers/oci/query_helper.py +0 -383
- sky/skylet/providers/oci/utils.py +0 -21
- sky/utils/cluster_yaml_utils.py +0 -24
- sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
- skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
- skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
@@ -1,488 +0,0 @@
|
|
1
|
-
"""OCI Node Provider.
|
2
|
-
|
3
|
-
Node provider is called by the Ray Autoscaler to provision new compute
|
4
|
-
resources (head / worker nodes).
|
5
|
-
|
6
|
-
To show debug messages, export SKYPILOT_DEBUG=1
|
7
|
-
|
8
|
-
History:
|
9
|
-
- Hysun He (hysun.he@oracle.com) @ Apr, 2023: Initial implementation
|
10
|
-
|
11
|
-
"""
|
12
|
-
|
13
|
-
import copy
|
14
|
-
from datetime import datetime
|
15
|
-
import logging
|
16
|
-
import threading
|
17
|
-
import time
|
18
|
-
|
19
|
-
from ray.autoscaler.node_provider import NodeProvider
|
20
|
-
from ray.autoscaler.tags import TAG_RAY_CLUSTER_NAME
|
21
|
-
from ray.autoscaler.tags import TAG_RAY_LAUNCH_CONFIG
|
22
|
-
from ray.autoscaler.tags import TAG_RAY_NODE_KIND
|
23
|
-
from ray.autoscaler.tags import TAG_RAY_USER_NODE_TYPE
|
24
|
-
|
25
|
-
from sky.adaptors import oci as oci_adaptor
|
26
|
-
from sky.clouds.utils import oci_utils
|
27
|
-
from sky.skylet.providers.oci import utils
|
28
|
-
from sky.skylet.providers.oci.query_helper import oci_query_helper
|
29
|
-
|
30
|
-
logger = logging.getLogger(__name__)
|
31
|
-
|
32
|
-
|
33
|
-
def synchronized(f):
    """Decorate a method so it runs while holding ``self.lock``.

    The lock is taken before the wrapped method is called and released when
    it returns or raises. The wrapped method's name and docstring are
    preserved on the wrapper.

    Args:
        f: Method whose first argument is an object exposing a ``lock``
            attribute usable as a context manager.

    Returns:
        The wrapping function.
    """
    # Imported locally so this block does not depend on a file-level import.
    import functools

    @functools.wraps(f)
    def wrapper(self, *args, **kwargs):
        with self.lock:
            return f(self, *args, **kwargs)

    return wrapper
|
43
|
-
|
44
|
-
|
45
|
-
class OCINodeProvider(NodeProvider):
|
46
|
-
"""Node Provider for OracleCloud (OCI)."""
|
47
|
-
|
48
|
-
def __init__(self, provider_config, cluster_name):
    """Initialise provider state and pre-load the node cache.

    Args:
        provider_config: Provider settings; "region" is required and
            "cache_stopped_nodes" is optional (defaults to True).
        cluster_name: Name of the cluster handled by this provider.
    """
    NodeProvider.__init__(self, provider_config, cluster_name)

    # Re-entrant lock used by the @synchronized methods.
    self.lock = threading.RLock()
    # Maps instance id -> node dict as built by get_inst_obj().
    self.cached_nodes = {}
    reuse_stopped = provider_config.get("cache_stopped_nodes", True)
    self.cache_stopped_nodes = reuse_stopped
    self.region = provider_config["region"]

    # Read-ahead: fill the cache now to improve later lookups.
    self._get_filtered_nodes({})
|
58
|
-
|
59
|
-
@synchronized
def _get_filtered_nodes(self, tag_filters, force=False):
    """Return ``{instance_id: node}`` for this cluster's nodes matching tags.

    Unless ``force`` is set, the local cache is consulted first and its
    matches are returned when there is at least one. Otherwise the
    instances are queried by tag and the cache entries for every returned
    instance are rebuilt.

    Args:
        tag_filters: Tag name -> required value.
        force: Skip the cache and always query.
    """
    # The cluster name is always one of the criteria.
    criteria = dict(tag_filters)
    criteria[TAG_RAY_CLUSTER_NAME] = self.cluster_name

    matched = {}
    if not force:
        # Look in the cache first to save an API call.
        for node_id, node in self.cached_nodes.items():
            node_tags = node["tags"]
            if all(name in node_tags and wanted == node_tags[name]
                   for name, wanted in criteria.items()):
                matched[node_id] = node

        if matched:
            return matched

    instances = oci_query_helper.query_instances_by_tags(
        criteria, self.region)
    for instance in instances:
        instance_id = instance.identifier
        # Drop any stale entry before rebuilding it.
        self.cached_nodes.pop(instance_id, None)

        node = self.get_inst_obj({
            "inst_id": instance_id,
            "ad": instance.availability_domain,
            "compartment": instance.compartment_id,
            "lifecycle_state": instance.lifecycle_state,
            "oci_tags": instance.freeform_tags,
        })
        matched[instance_id] = node
        self.cached_nodes[instance_id] = node

    return matched
|
99
|
-
|
100
|
-
@utils.debug_enabled(logger=logger)
def non_terminated_nodes(self, tag_filters):
    """Return the ids of RUNNING nodes that match the given tags.

    Only the cluster-name, node-kind, user-node-type and launch-config tags
    from ``tag_filters`` are used; any other key is ignored. The lookup may
    be served from the provider's cache, so this method has to be called
    again to observe fresh results.
    """
    recognised_tags = (
        TAG_RAY_CLUSTER_NAME,
        TAG_RAY_NODE_KIND,
        TAG_RAY_USER_NODE_TYPE,
        TAG_RAY_LAUNCH_CONFIG,
    )
    filters = {}
    for tag in recognised_tags:
        if tag in tag_filters:
            filters[tag] = tag_filters[tag]

    nodes = self._get_filtered_nodes(tag_filters=filters)
    running_ids = []
    for node_id, node in nodes.items():
        if node["status"] == "RUNNING":
            running_ids.append(node_id)
    return running_ids
|
122
|
-
|
123
|
-
@utils.debug_enabled(logger=logger)
def is_running(self, node_id):
    """Return whether the specified node is running.

    A node that cannot be found is reported as not running; is_terminated()
    reports that same node as terminated, so the two answers stay
    consistent with each other.
    """
    node = self._get_cached_node(node_id=node_id)
    return node is not None and node["status"] == "RUNNING"
|
130
|
-
|
131
|
-
@utils.debug_enabled(logger=logger)
def is_terminated(self, node_id):
    """Return whether the specified node is terminated.

    A node that cannot be found counts as terminated, as does one whose
    status is TERMINATED or TERMINATING.
    """
    node = self._get_cached_node(node_id=node_id)
    if node is None:
        return True
    return node["status"] in ("TERMINATED", "TERMINATING")
|
139
|
-
|
140
|
-
@utils.debug_enabled(logger=logger)
def node_tags(self, node_id):
    """Return the tags of the given node.

    The node is looked up through _get_cached_node(), so a node missing
    from the cache triggers a refresh instead of failing straight away.

    Raises:
        KeyError: If the node is still unknown after the refresh.
    """
    node = self._get_cached_node(node_id=node_id)
    if node is None:
        # Same exception type callers got from the direct cache lookup.
        raise KeyError(node_id)
    return node["tags"]
|
143
|
-
|
144
|
-
@utils.debug_enabled(logger=logger)
def external_ip(self, node_id):
    """Return the "external_ip" entry of the given node."""
    node = self._get_cached_node(node_id=node_id)
    return node["external_ip"]
|
148
|
-
|
149
|
-
@utils.debug_enabled(logger=logger)
def internal_ip(self, node_id):
    """Return the "internal_ip" entry (the Ray ip) of the given node."""
    node = self._get_cached_node(node_id=node_id)
    return node["internal_ip"]
|
153
|
-
|
154
|
-
@synchronized
@utils.debug_enabled(logger=logger)
def create_node(self, node_config, tags, count):
    """Bring up ``count`` nodes, restarting stopped ones before launching.

    When ``cache_stopped_nodes`` is enabled, stopped nodes matching the
    tags are started first; whatever is still missing is launched as new
    instances. Every started instance is waited on until RUNNING and then
    stored in the node cache.

    Args:
        node_config: Node settings. Keys read here: "VCPUs",
            "InstanceType", "MemoryInGbs", "Preemptible",
            "AppCatalogListingId", "ResourceVersion", "AvailabilityDomain",
            "AuthorizedKey", "ImageId", "BootVolumeSize", "BootVolumePerf".
        tags: Tags to filter reusable nodes by and to attach to new ones.
        count: Number of nodes requested.

    Raises:
        RuntimeError: If no VCN subnet can be found or created.
    """
    start_time = round(time.time() * 1000)
    starting_insts = []

    # Start stopped nodes first if cache_stopped_nodes=True.
    if self.cache_stopped_nodes:
        restarted = self._restart_stopped_nodes(tags, count)
        starting_insts.extend(restarted)
        count -= len(restarted)

    # Create additional new nodes (if necessary).
    if count > 0:
        starting_insts.extend(
            self._launch_new_nodes(node_config, tags, count))

    self._wait_until_running(starting_insts)

    total_time = round(time.time() * 1000) - start_time
    logger.debug(
        "Total time elapsed: {0} milli-seconds.".format(total_time))


def _core_client(self):
    """Return the compute client for this provider's region and profile."""
    return oci_adaptor.get_core_client(self.region,
                                       oci_utils.oci_config.get_profile())


def _instance_info(self, inst):
    """Collect the instance fields that get_inst_obj() consumes."""
    return {
        "inst_id": inst.id,
        "ad": inst.availability_domain,
        "compartment": inst.compartment_id,
        "lifecycle_state": inst.lifecycle_state,
        "oci_tags": inst.freeform_tags,
    }


def _restart_stopped_nodes(self, tags, count):
    """Start up to ``count`` stopped nodes; return their instance infos.

    Nodes whose launch-config tag matches are preferred; other stopped
    nodes with matching cluster / kind / type tags fill the remainder.
    """
    logger.debug("Checking existing stopped nodes.")

    validity_tags = (
        TAG_RAY_CLUSTER_NAME,
        TAG_RAY_NODE_KIND,
        TAG_RAY_USER_NODE_TYPE,
    )
    filters = {tag: tags[tag] for tag in validity_tags if tag in tags}

    filters_with_launch_config = copy.copy(filters)
    if TAG_RAY_LAUNCH_CONFIG in tags:
        filters_with_launch_config[TAG_RAY_LAUNCH_CONFIG] = tags[
            TAG_RAY_LAUNCH_CONFIG]

    nodes_matching_launch_config = self.stopped_nodes(
        filters_with_launch_config)
    logger.debug(f"Found stopped nodes (with same launch config): "
                 f"{len(nodes_matching_launch_config)}")

    if len(nodes_matching_launch_config) >= count:
        reuse_nodes = nodes_matching_launch_config[:count]
    else:
        nodes_all = self.stopped_nodes(filters)
        logger.debug(f"Found stopped nodes (regardless launch config): "
                     f"{len(nodes_all)}")
        matching_ids = {n["id"] for n in nodes_matching_launch_config}
        nodes_non_matching_launch_config = [
            n for n in nodes_all if n["id"] not in matching_ids
        ]
        reuse_nodes = (nodes_matching_launch_config +
                       nodes_non_matching_launch_config)[:count]

    logger.info(
        f"Reusing nodes {len(reuse_nodes)}: {list(reuse_nodes)}. "
        "To disable reuse, set `cache_stopped_nodes: False` "
        "under `provider` in the cluster configuration.")

    core_client = self._core_client()

    # A node that is still STOPPING has to reach STOPPED before the START
    # action is issued for it.
    for reuse_node in reuse_nodes:
        if reuse_node["status"] == "STOPPING":
            get_instance_response = core_client.get_instance(
                instance_id=reuse_node["id"])
            oci_adaptor.oci.wait_until(
                core_client,
                get_instance_response,
                "lifecycle_state",
                "STOPPED",
            )

    phase_start = round(time.time() * 1000)
    restarted = []
    for matched_node in reuse_nodes:
        instance_action_response = core_client.instance_action(
            instance_id=matched_node["id"], action="START")
        restarted.append(
            self._instance_info(instance_action_response.data))

    launch_stopped_time = round(time.time() * 1000) - phase_start
    logger.debug("Time elapsed(Launch stopped): {0} milli-seconds.".format(
        launch_stopped_time))
    return restarted


def _resolve_ocpu_count(self, vcpu_str, instance_type_str):
    """Translate the requested vCPU count into a whole number of OCPUs.

    Returns 0 when no vCPU count was requested. A positive request that
    maps to less than one OCPU is raised to 1; the check is made before
    rounding, because rounding first would turn 0.5 into 0.
    """
    if vcpu_str is None or vcpu_str == "None":
        return 0

    if instance_type_str.startswith(f"{oci_utils.oci_config.VM_PREFIX}.A"):
        # For ARM cpu, 1*ocpu = 1*vcpu
        raw_ocpus = float(vcpu_str)
    else:
        # For Intel / AMD cpu, 1*ocpu = 2*vcpu
        raw_ocpus = float(vcpu_str) / 2

    if 0 < raw_ocpus < 1:
        return 1
    return round(raw_ocpus)


def _launch_new_nodes(self, node_config, tags, count):
    """Launch ``count`` new instances and return their instance infos."""
    # Imported locally: the file only imports `datetime` from this module.
    from datetime import timezone

    compartment = oci_query_helper.find_compartment(self.region)
    vcn = oci_query_helper.find_create_vcn_subnet(self.region)
    if vcn is None:
        raise RuntimeError("VcnSubnetNotFound Error!")

    models = oci_adaptor.oci.core.models
    instance_type_str = node_config["InstanceType"]
    ocpu_count = self._resolve_ocpu_count(node_config["VCPUs"],
                                          instance_type_str)

    # A shape config is only sent when an OCPU count was requested.
    machine_shape_config = None
    if ocpu_count > 0:
        shape_kwargs = {"ocpus": ocpu_count}
        mem = node_config["MemoryInGbs"]
        if mem is not None and mem != "None":
            shape_kwargs["memory_in_gbs"] = mem
        machine_shape_config = models.LaunchInstanceShapeConfigDetails(
            **shape_kwargs)

    preemptible_config = None
    if node_config["Preemptible"]:
        preemptible_config = models.PreemptibleInstanceConfigDetails(
            preemption_action=models.TerminatePreemptionAction(
                type="TERMINATE", preserve_boot_volume=False))

    logger.debug(f"Shape: {instance_type_str}, ocpu: {ocpu_count}")
    logger.debug(f"Shape config is {machine_shape_config}")
    logger.debug(f"Spot config is {preemptible_config}")

    vm_tags = {
        **tags,
        TAG_RAY_CLUSTER_NAME: self.cluster_name,
        "sky_spot_flag": str(node_config["Preemptible"]).lower(),
    }
    # Use UTC time so that head & worker nodes use the same rule.
    batch_id = datetime.now(timezone.utc).strftime("%Y%m%d%H%M%S")
    node_type = tags[TAG_RAY_NODE_KIND]

    oci_query_helper.subscribe_image(
        compartment_id=compartment,
        listing_id=node_config["AppCatalogListingId"],
        resource_version=node_config["ResourceVersion"],
        region=self.region,
    )

    core_client = self._core_client()
    phase_start = round(time.time() * 1000)
    launched = []
    for seq in range(1, count + 1):
        launch_details = models.LaunchInstanceDetails(
            availability_domain=node_config["AvailabilityDomain"],
            compartment_id=compartment,
            shape=instance_type_str,
            display_name=(
                f"{self.cluster_name}_{node_type}_{batch_id}_{seq}"),
            freeform_tags=vm_tags,
            metadata={"ssh_authorized_keys": node_config["AuthorizedKey"]},
            source_details=models.InstanceSourceViaImageDetails(
                source_type="image",
                image_id=node_config["ImageId"],
                boot_volume_size_in_gbs=node_config["BootVolumeSize"],
                boot_volume_vpus_per_gb=int(node_config["BootVolumePerf"]),
            ),
            create_vnic_details=models.CreateVnicDetails(
                assign_public_ip=True,
                subnet_id=vcn,
            ),
            shape_config=machine_shape_config,
            preemptible_instance_config=preemptible_config,
        )
        launch_instance_response = core_client.launch_instance(
            launch_instance_details=launch_details)
        launched.append(self._instance_info(launch_instance_response.data))

    launch_new_time = round(time.time() * 1000) - phase_start
    logger.debug(
        "Time elapsed(Launch): {0} milli-seconds.".format(launch_new_time))
    return launched


def _wait_until_running(self, starting_insts):
    """Wait for each instance to be RUNNING, then store it in the cache."""
    if not starting_insts:
        return

    core_client = self._core_client()
    for ninst in starting_insts:
        get_instance_response = core_client.get_instance(
            instance_id=ninst["inst_id"])
        oci_adaptor.oci.wait_until(
            core_client,
            get_instance_response,
            "lifecycle_state",
            "RUNNING",
        )
        ninst["lifecycle_state"] = "RUNNING"
        self.cached_nodes[ninst["inst_id"]] = self.get_inst_obj(ninst)
|
372
|
-
|
373
|
-
def get_inst_obj(self, inst_info):
    """Build the cached node dict for one instance.

    Looks up the first VNIC attachment of the instance and reads its
    private and public IP. When there is no public IP, the private IP is
    used as the external one.

    Args:
        inst_info: Dict with "inst_id", "ad", "compartment", "oci_tags"
            and "lifecycle_state".

    Returns:
        Dict with "id", "external_ip", "internal_ip", "tags" and "status".
    """
    core_client = oci_adaptor.get_core_client(
        self.region, oci_utils.oci_config.get_profile())
    attachments = core_client.list_vnic_attachments(
        availability_domain=inst_info["ad"],
        compartment_id=inst_info["compartment"],
        instance_id=inst_info["inst_id"],
    )

    first_attachment = attachments.data[0]
    net_client = oci_adaptor.get_net_client(
        self.region, oci_utils.oci_config.get_profile())
    vnic_details = net_client.get_vnic(
        vnic_id=first_attachment.vnic_id).data

    private_ip = vnic_details.private_ip
    public_ip = vnic_details.public_ip

    return {
        "id": inst_info["inst_id"],
        "external_ip": private_ip if public_ip is None else public_ip,
        "internal_ip": private_ip,
        "tags": inst_info["oci_tags"],
        "status": inst_info["lifecycle_state"],
    }
|
399
|
-
|
400
|
-
@synchronized
@utils.debug_enabled(logger=logger)
def set_node_tags(self, node_id, tags):
    """Merge ``tags`` into the node's tags, in the cache and on the instance.

    The cached tags are updated first. The instance update is then
    attempted up to ``MAX_RETRY_COUNT`` times, sleeping
    ``RETRY_INTERVAL_BASE_SECONDS * attempt`` between attempts. If every
    attempt fails, an error is logged and the method returns without
    raising.

    Args:
        node_id: Id of the instance to tag.
        tags: Tag name -> value; overrides existing tags of the same name.
    """
    existing_tags = self._get_cached_node(node_id)["tags"]
    combined_tags = dict(existing_tags, **tags)
    self.cached_nodes[node_id]["tags"] = combined_tags

    max_retries = oci_utils.oci_config.MAX_RETRY_COUNT
    for attempt in range(1, max_retries + 1):
        try:
            oci_adaptor.get_core_client(
                self.region,
                oci_utils.oci_config.get_profile()).update_instance(
                    instance_id=node_id,
                    update_instance_details=oci_adaptor.oci.core.models.
                    UpdateInstanceDetails(freeform_tags=combined_tags),
                )
        except Exception as e:  # pylint: disable=broad-except
            if attempt >= max_retries:
                # Out of retries: report it instead of giving up silently,
                # and skip the sleep that nothing would follow.
                logger.warning(f"Exception message is {str(e)}")
                logger.error(f"Failed to set tags for node {node_id} "
                             f"after {max_retries} attempts.")
                return
            wait_seconds = (
                oci_utils.oci_config.RETRY_INTERVAL_BASE_SECONDS * attempt)
            logger.warning(
                f"Not ready yet, wait {wait_seconds} seconds & retry!")
            logger.warning(f"Exception message is {str(e)}")
            time.sleep(wait_seconds)
        else:
            logger.info(f"Tags are well set for node {node_id}")
            return
|
426
|
-
|
427
|
-
@synchronized
def terminate_node(self, node_id):
    """Stop or terminate the specified node.

    The node is stopped (and kept in the cache with status STOPPED) when
    ``cache_stopped_nodes`` is enabled and the node is not tagged as spot;
    otherwise it is terminated and removed from the cache. An unknown node
    is ignored. A node without the ``sky_spot_flag`` tag is treated as
    non-spot.
    """
    logger.info(f"terminate_node {node_id}...")
    node = self._get_cached_node(node_id)
    if node is None:
        logger.info(f"The node is not existed: {node_id}..")
        return  # Node does not exist yet.

    # .get(): a missing tag must not fail the whole termination.
    spot_flag = node["tags"].get("sky_spot_flag")
    logger.debug(f"sky_spot_flag: {spot_flag}")
    is_preemptible = str(spot_flag) == "true"

    if self.cache_stopped_nodes and not is_preemptible:
        logger.info(f"Stopping instance {node_id}"
                    "(to fully terminate instead, "
                    "set `cache_stopped_nodes: False` "
                    "under `provider` in the cluster configuration)")
        instance_action_response = oci_adaptor.get_core_client(
            self.region,
            oci_utils.oci_config.get_profile()).instance_action(
                instance_id=node_id, action="STOP")
        logger.info(
            f"Stopped the instance {instance_action_response.data.id}")
        if node_id in self.cached_nodes:
            self.cached_nodes[node_id]["status"] = "STOPPED"
        state_word = "Stopped"
    else:
        terminate_instance_response = oci_adaptor.get_core_client(
            self.region,
            oci_utils.oci_config.get_profile()).terminate_instance(node_id)
        logger.debug(terminate_instance_response.data)
        self.cached_nodes.pop(node_id, None)
        state_word = "Terminated"

    logger.info(
        f"{state_word} {node_id} w/ sky_spot_flag: {is_preemptible}.")
|
466
|
-
|
467
|
-
def _get_node(self, node_id):
    """Refresh the node cache, then return the cached node or None."""
    # Forced query with no extra tag filters: all except terminated nodes.
    self._get_filtered_nodes({}, force=True)
    refreshed = self.cached_nodes
    return refreshed.get(node_id, None)
|
471
|
-
|
472
|
-
def _get_cached_node(self, node_id):
    """Return the node from the cache, refreshing the cache on a miss."""
    if node_id not in self.cached_nodes:
        return self._get_node(node_id=node_id)
    return self.cached_nodes[node_id]
|
476
|
-
|
477
|
-
def stopped_nodes(self, tag_filters):
    """Return the node dicts in STOPPED or STOPPING state matching the tags.

    The lookup is always forced, i.e. it does not rely on the cache.
    """
    candidates = self._get_filtered_nodes(tag_filters=tag_filters,
                                          force=True)
    stopped = []
    for node in candidates.values():
        if node["status"] in ("STOPPED", "STOPPING"):
            stopped.append(node)
    return stopped
|
484
|
-
|
485
|
-
def running_nodes(self, tag_filters):
    """Return the ids of RUNNING nodes matching the tags (cache allowed)."""
    candidates = self._get_filtered_nodes(tag_filters=tag_filters)
    running_ids = []
    for node_id, node in candidates.items():
        if node["status"] == "RUNNING":
            running_ids.append(node_id)
    return running_ids
|