skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +64 -32
- sky/adaptors/aws.py +23 -6
- sky/adaptors/azure.py +432 -15
- sky/adaptors/cloudflare.py +5 -5
- sky/adaptors/common.py +19 -9
- sky/adaptors/do.py +20 -0
- sky/adaptors/gcp.py +3 -2
- sky/adaptors/kubernetes.py +122 -88
- sky/adaptors/nebius.py +100 -0
- sky/adaptors/oci.py +39 -1
- sky/adaptors/vast.py +29 -0
- sky/admin_policy.py +101 -0
- sky/authentication.py +117 -98
- sky/backends/backend.py +52 -20
- sky/backends/backend_utils.py +669 -557
- sky/backends/cloud_vm_ray_backend.py +1099 -808
- sky/backends/local_docker_backend.py +14 -8
- sky/backends/wheel_utils.py +38 -20
- sky/benchmark/benchmark_utils.py +22 -23
- sky/check.py +76 -27
- sky/cli.py +1586 -1139
- sky/client/__init__.py +1 -0
- sky/client/cli.py +5683 -0
- sky/client/common.py +345 -0
- sky/client/sdk.py +1765 -0
- sky/cloud_stores.py +283 -19
- sky/clouds/__init__.py +7 -2
- sky/clouds/aws.py +303 -112
- sky/clouds/azure.py +185 -179
- sky/clouds/cloud.py +115 -37
- sky/clouds/cudo.py +29 -22
- sky/clouds/do.py +313 -0
- sky/clouds/fluidstack.py +44 -54
- sky/clouds/gcp.py +206 -65
- sky/clouds/ibm.py +26 -21
- sky/clouds/kubernetes.py +345 -91
- sky/clouds/lambda_cloud.py +40 -29
- sky/clouds/nebius.py +297 -0
- sky/clouds/oci.py +129 -90
- sky/clouds/paperspace.py +22 -18
- sky/clouds/runpod.py +53 -34
- sky/clouds/scp.py +28 -24
- sky/clouds/service_catalog/__init__.py +19 -13
- sky/clouds/service_catalog/aws_catalog.py +29 -12
- sky/clouds/service_catalog/azure_catalog.py +33 -6
- sky/clouds/service_catalog/common.py +95 -75
- sky/clouds/service_catalog/constants.py +3 -3
- sky/clouds/service_catalog/cudo_catalog.py +13 -3
- sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
- sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
- sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
- sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
- sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
- sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
- sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
- sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
- sky/clouds/service_catalog/do_catalog.py +111 -0
- sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
- sky/clouds/service_catalog/gcp_catalog.py +16 -2
- sky/clouds/service_catalog/ibm_catalog.py +2 -2
- sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
- sky/clouds/service_catalog/lambda_catalog.py +8 -3
- sky/clouds/service_catalog/nebius_catalog.py +116 -0
- sky/clouds/service_catalog/oci_catalog.py +31 -4
- sky/clouds/service_catalog/paperspace_catalog.py +2 -2
- sky/clouds/service_catalog/runpod_catalog.py +2 -2
- sky/clouds/service_catalog/scp_catalog.py +2 -2
- sky/clouds/service_catalog/vast_catalog.py +104 -0
- sky/clouds/service_catalog/vsphere_catalog.py +2 -2
- sky/clouds/utils/aws_utils.py +65 -0
- sky/clouds/utils/azure_utils.py +91 -0
- sky/clouds/utils/gcp_utils.py +5 -9
- sky/clouds/utils/oci_utils.py +47 -5
- sky/clouds/utils/scp_utils.py +4 -3
- sky/clouds/vast.py +280 -0
- sky/clouds/vsphere.py +22 -18
- sky/core.py +361 -107
- sky/dag.py +41 -28
- sky/data/data_transfer.py +37 -0
- sky/data/data_utils.py +211 -32
- sky/data/mounting_utils.py +182 -30
- sky/data/storage.py +2118 -270
- sky/data/storage_utils.py +126 -5
- sky/exceptions.py +179 -8
- sky/execution.py +158 -85
- sky/global_user_state.py +150 -34
- sky/jobs/__init__.py +12 -10
- sky/jobs/client/__init__.py +0 -0
- sky/jobs/client/sdk.py +302 -0
- sky/jobs/constants.py +49 -11
- sky/jobs/controller.py +161 -99
- sky/jobs/dashboard/dashboard.py +171 -25
- sky/jobs/dashboard/templates/index.html +572 -60
- sky/jobs/recovery_strategy.py +157 -156
- sky/jobs/scheduler.py +307 -0
- sky/jobs/server/__init__.py +1 -0
- sky/jobs/server/core.py +598 -0
- sky/jobs/server/dashboard_utils.py +69 -0
- sky/jobs/server/server.py +190 -0
- sky/jobs/state.py +627 -122
- sky/jobs/utils.py +615 -206
- sky/models.py +27 -0
- sky/optimizer.py +142 -83
- sky/provision/__init__.py +20 -5
- sky/provision/aws/config.py +124 -42
- sky/provision/aws/instance.py +130 -53
- sky/provision/azure/__init__.py +7 -0
- sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
- sky/provision/azure/config.py +220 -0
- sky/provision/azure/instance.py +1012 -37
- sky/provision/common.py +31 -3
- sky/provision/constants.py +25 -0
- sky/provision/cudo/__init__.py +2 -1
- sky/provision/cudo/cudo_utils.py +112 -0
- sky/provision/cudo/cudo_wrapper.py +37 -16
- sky/provision/cudo/instance.py +28 -12
- sky/provision/do/__init__.py +11 -0
- sky/provision/do/config.py +14 -0
- sky/provision/do/constants.py +10 -0
- sky/provision/do/instance.py +287 -0
- sky/provision/do/utils.py +301 -0
- sky/provision/docker_utils.py +82 -46
- sky/provision/fluidstack/fluidstack_utils.py +57 -125
- sky/provision/fluidstack/instance.py +15 -43
- sky/provision/gcp/config.py +19 -9
- sky/provision/gcp/constants.py +7 -1
- sky/provision/gcp/instance.py +55 -34
- sky/provision/gcp/instance_utils.py +339 -80
- sky/provision/gcp/mig_utils.py +210 -0
- sky/provision/instance_setup.py +172 -133
- sky/provision/kubernetes/__init__.py +1 -0
- sky/provision/kubernetes/config.py +104 -90
- sky/provision/kubernetes/constants.py +8 -0
- sky/provision/kubernetes/instance.py +680 -325
- sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
- sky/provision/kubernetes/network.py +54 -20
- sky/provision/kubernetes/network_utils.py +70 -21
- sky/provision/kubernetes/utils.py +1370 -251
- sky/provision/lambda_cloud/__init__.py +11 -0
- sky/provision/lambda_cloud/config.py +10 -0
- sky/provision/lambda_cloud/instance.py +265 -0
- sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
- sky/provision/logging.py +1 -1
- sky/provision/nebius/__init__.py +11 -0
- sky/provision/nebius/config.py +11 -0
- sky/provision/nebius/instance.py +285 -0
- sky/provision/nebius/utils.py +318 -0
- sky/provision/oci/__init__.py +15 -0
- sky/provision/oci/config.py +51 -0
- sky/provision/oci/instance.py +436 -0
- sky/provision/oci/query_utils.py +681 -0
- sky/provision/paperspace/constants.py +6 -0
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/paperspace/utils.py +2 -0
- sky/provision/provisioner.py +207 -130
- sky/provision/runpod/__init__.py +1 -0
- sky/provision/runpod/api/__init__.py +3 -0
- sky/provision/runpod/api/commands.py +119 -0
- sky/provision/runpod/api/pods.py +142 -0
- sky/provision/runpod/instance.py +64 -8
- sky/provision/runpod/utils.py +239 -23
- sky/provision/vast/__init__.py +10 -0
- sky/provision/vast/config.py +11 -0
- sky/provision/vast/instance.py +247 -0
- sky/provision/vast/utils.py +162 -0
- sky/provision/vsphere/common/vim_utils.py +1 -1
- sky/provision/vsphere/instance.py +8 -18
- sky/provision/vsphere/vsphere_utils.py +1 -1
- sky/resources.py +247 -102
- sky/serve/__init__.py +9 -9
- sky/serve/autoscalers.py +361 -299
- sky/serve/client/__init__.py +0 -0
- sky/serve/client/sdk.py +366 -0
- sky/serve/constants.py +12 -3
- sky/serve/controller.py +106 -36
- sky/serve/load_balancer.py +63 -12
- sky/serve/load_balancing_policies.py +84 -2
- sky/serve/replica_managers.py +42 -34
- sky/serve/serve_state.py +62 -32
- sky/serve/serve_utils.py +271 -160
- sky/serve/server/__init__.py +0 -0
- sky/serve/{core.py → server/core.py} +271 -90
- sky/serve/server/server.py +112 -0
- sky/serve/service.py +52 -16
- sky/serve/service_spec.py +95 -32
- sky/server/__init__.py +1 -0
- sky/server/common.py +430 -0
- sky/server/constants.py +21 -0
- sky/server/html/log.html +174 -0
- sky/server/requests/__init__.py +0 -0
- sky/server/requests/executor.py +472 -0
- sky/server/requests/payloads.py +487 -0
- sky/server/requests/queues/__init__.py +0 -0
- sky/server/requests/queues/mp_queue.py +76 -0
- sky/server/requests/requests.py +567 -0
- sky/server/requests/serializers/__init__.py +0 -0
- sky/server/requests/serializers/decoders.py +192 -0
- sky/server/requests/serializers/encoders.py +166 -0
- sky/server/server.py +1106 -0
- sky/server/stream_utils.py +141 -0
- sky/setup_files/MANIFEST.in +2 -5
- sky/setup_files/dependencies.py +159 -0
- sky/setup_files/setup.py +14 -125
- sky/sky_logging.py +59 -14
- sky/skylet/autostop_lib.py +2 -2
- sky/skylet/constants.py +183 -50
- sky/skylet/events.py +22 -10
- sky/skylet/job_lib.py +403 -258
- sky/skylet/log_lib.py +111 -71
- sky/skylet/log_lib.pyi +6 -0
- sky/skylet/providers/command_runner.py +6 -8
- sky/skylet/providers/ibm/node_provider.py +2 -2
- sky/skylet/providers/scp/config.py +11 -3
- sky/skylet/providers/scp/node_provider.py +8 -8
- sky/skylet/skylet.py +3 -1
- sky/skylet/subprocess_daemon.py +69 -17
- sky/skypilot_config.py +119 -57
- sky/task.py +205 -64
- sky/templates/aws-ray.yml.j2 +37 -7
- sky/templates/azure-ray.yml.j2 +27 -82
- sky/templates/cudo-ray.yml.j2 +7 -3
- sky/templates/do-ray.yml.j2 +98 -0
- sky/templates/fluidstack-ray.yml.j2 +7 -4
- sky/templates/gcp-ray.yml.j2 +26 -6
- sky/templates/ibm-ray.yml.j2 +3 -2
- sky/templates/jobs-controller.yaml.j2 +46 -11
- sky/templates/kubernetes-ingress.yml.j2 +7 -0
- sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
- sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
- sky/templates/kubernetes-ray.yml.j2 +292 -25
- sky/templates/lambda-ray.yml.j2 +30 -40
- sky/templates/nebius-ray.yml.j2 +79 -0
- sky/templates/oci-ray.yml.j2 +18 -57
- sky/templates/paperspace-ray.yml.j2 +10 -6
- sky/templates/runpod-ray.yml.j2 +26 -4
- sky/templates/scp-ray.yml.j2 +3 -2
- sky/templates/sky-serve-controller.yaml.j2 +12 -1
- sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
- sky/templates/vast-ray.yml.j2 +70 -0
- sky/templates/vsphere-ray.yml.j2 +8 -3
- sky/templates/websocket_proxy.py +64 -0
- sky/usage/constants.py +10 -1
- sky/usage/usage_lib.py +130 -37
- sky/utils/accelerator_registry.py +35 -51
- sky/utils/admin_policy_utils.py +147 -0
- sky/utils/annotations.py +51 -0
- sky/utils/cli_utils/status_utils.py +81 -23
- sky/utils/cluster_utils.py +356 -0
- sky/utils/command_runner.py +452 -89
- sky/utils/command_runner.pyi +77 -3
- sky/utils/common.py +54 -0
- sky/utils/common_utils.py +319 -108
- sky/utils/config_utils.py +204 -0
- sky/utils/control_master_utils.py +48 -0
- sky/utils/controller_utils.py +548 -266
- sky/utils/dag_utils.py +93 -32
- sky/utils/db_utils.py +18 -4
- sky/utils/env_options.py +29 -7
- sky/utils/kubernetes/create_cluster.sh +8 -60
- sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
- sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
- sky/utils/kubernetes/gpu_labeler.py +4 -4
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
- sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
- sky/utils/kubernetes/rsync_helper.sh +24 -0
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
- sky/utils/log_utils.py +240 -33
- sky/utils/message_utils.py +81 -0
- sky/utils/registry.py +127 -0
- sky/utils/resources_utils.py +94 -22
- sky/utils/rich_utils.py +247 -18
- sky/utils/schemas.py +284 -64
- sky/{status_lib.py → utils/status_lib.py} +12 -7
- sky/utils/subprocess_utils.py +212 -46
- sky/utils/timeline.py +12 -7
- sky/utils/ux_utils.py +168 -15
- skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
- skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
- sky/clouds/cloud_registry.py +0 -31
- sky/jobs/core.py +0 -330
- sky/skylet/providers/azure/__init__.py +0 -2
- sky/skylet/providers/azure/azure-vm-template.json +0 -301
- sky/skylet/providers/azure/config.py +0 -170
- sky/skylet/providers/azure/node_provider.py +0 -466
- sky/skylet/providers/lambda_cloud/__init__.py +0 -2
- sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
- sky/skylet/providers/oci/__init__.py +0 -2
- sky/skylet/providers/oci/node_provider.py +0 -488
- sky/skylet/providers/oci/query_helper.py +0 -383
- sky/skylet/providers/oci/utils.py +0 -21
- sky/utils/cluster_yaml_utils.py +0 -24
- sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
- skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
- skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,436 @@
|
|
1
|
+
"""OCI instance provisioning.
|
2
|
+
|
3
|
+
History:
|
4
|
+
- Hysun He (hysun.he@oracle.com) @ Oct.16, 2024: Initial implementation
|
5
|
+
- Hysun He (hysun.he@oracle.com) @ Nov.13, 2024: Implement open_ports
|
6
|
+
and cleanup_ports for supporting SkyServe.
|
7
|
+
"""
|
8
|
+
|
9
|
+
import copy
|
10
|
+
from datetime import datetime
|
11
|
+
import time
|
12
|
+
import typing
|
13
|
+
from typing import Any, Dict, List, Optional
|
14
|
+
|
15
|
+
from sky import exceptions
|
16
|
+
from sky import sky_logging
|
17
|
+
from sky.adaptors import oci as oci_adaptor
|
18
|
+
from sky.clouds.utils import oci_utils
|
19
|
+
from sky.provision import common
|
20
|
+
from sky.provision import constants
|
21
|
+
from sky.provision.oci import query_utils
|
22
|
+
from sky.provision.oci.query_utils import query_helper
|
23
|
+
from sky.utils import common_utils
|
24
|
+
from sky.utils import ux_utils
|
25
|
+
|
26
|
+
if typing.TYPE_CHECKING:
|
27
|
+
from sky.utils import status_lib
|
28
|
+
|
29
|
+
logger = sky_logging.init_logger(__name__)
|
30
|
+
|
31
|
+
|
32
|
+
@query_utils.debug_enabled(logger)
@common_utils.retry
def query_instances(
    cluster_name_on_cloud: str,
    provider_config: Optional[Dict[str, Any]] = None,
    non_terminated_only: bool = True,
) -> Dict[str, Optional['status_lib.ClusterStatus']]:
    """Query instances.

    Returns a dictionary of instance IDs and status.

    A None status means the instance is marked as "terminated"
    or "terminating".
    """
    assert provider_config is not None, cluster_name_on_cloud
    region = provider_config['region']

    # Translate OCI lifecycle states into SkyPilot cluster statuses.
    state_mapping = oci_utils.oci_config.STATE_MAPPING_OCI_TO_SKY
    tag_filters = {constants.TAG_RAY_CLUSTER_NAME: cluster_name_on_cloud}

    result: Dict[str, Optional['status_lib.ClusterStatus']] = {}
    for node in _get_filtered_nodes(region, tag_filters):
        mapped_status = state_mapping[node['status']]
        # A None mapping means terminated/terminating; skip those when the
        # caller only wants live instances.
        if mapped_status is None and non_terminated_only:
            continue
        result[node['inst_id']] = mapped_status
    return result
|
62
|
+
|
63
|
+
|
64
|
+
@query_utils.debug_enabled(logger)
def run_instances(region: str, cluster_name_on_cloud: str,
                  config: common.ProvisionConfig) -> common.ProvisionRecord:
    """Start instances with bootstrapped configuration.

    Flow: (1) optionally resume stopped/stopping nodes of the cluster,
    (2) launch any additional nodes needed to reach ``config.count``,
    (3) block until every touched node reaches RUNNING, then return a
    ProvisionRecord with the head id and the created/resumed instance ids.

    Raises:
        RuntimeError: if more existing instances are found for the cluster
            than the user requested (likely a resource leak).
    """
    tags = dict(sorted(copy.deepcopy(config.tags).items()))

    # Millisecond timestamp, used only for the elapsed-time debug log below.
    start_time = round(time.time() * 1000)
    filters = {constants.TAG_RAY_CLUSTER_NAME: cluster_name_on_cloud}

    # Starting stopped nodes if resume_stopped_nodes=True
    resume_instances = []
    if config.resume_stopped_nodes:
        logger.debug('Checking existing stopped nodes.')

        existing_instances = _get_filtered_nodes(region, filters)
        if len(existing_instances) > config.count:
            raise RuntimeError(
                'The number of pending/running/stopped/stopping '
                f'instances combined ({len(existing_instances)}) in '
                f'cluster "{cluster_name_on_cloud}" is greater than the '
                f'number requested by the user ({config.count}). '
                'This is likely a resource leak. '
                'Use "sky down" to terminate the cluster.')

        # pylint: disable=line-too-long
        logger.debug(
            f'run_instances: Found {[inst["name"] for inst in existing_instances]} '
            'existing instances in cluster.')
        existing_instances.sort(key=lambda x: x['name'])

        # Partition existing nodes: STOPPING nodes are first waited into
        # STOPPED so they can be started; already-live nodes are treated
        # as resumed directly.
        stopped_instances = []
        for existing_node in existing_instances:
            if existing_node['status'] == 'STOPPING':
                query_helper.wait_instance_until_status(
                    region, existing_node['inst_id'], 'STOPPED')
                stopped_instances.append(existing_node)
            elif existing_node['status'] == 'STOPPED':
                stopped_instances.append(existing_node)
            elif existing_node['status'] in ('PROVISIONING', 'STARTING',
                                             'RUNNING'):
                resume_instances.append(existing_node)

        for stopped_node in stopped_instances:
            stopped_node_id = stopped_node['inst_id']
            instance_action_response = query_helper.start_instance(
                region, stopped_node_id)

            # Re-read the node record from the START response so the
            # status reflects the post-start lifecycle state.
            starting_inst = instance_action_response.data
            resume_instances.append({
                'inst_id': starting_inst.id,
                'name': starting_inst.display_name,
                'ad': starting_inst.availability_domain,
                'compartment': starting_inst.compartment_id,
                'status': starting_inst.lifecycle_state,
                'oci_tags': starting_inst.freeform_tags,
            })
    # end if config.resume_stopped_nodes

    # Try get head id from the existing instances
    head_instance_id = _get_head_instance_id(resume_instances)
    logger.debug(f'Check existing head node: {head_instance_id}')

    # Let's create additional new nodes (if neccessary)
    to_start_count = config.count - len(resume_instances)
    created_instances = []
    node_config = config.node_config
    if to_start_count > 0:
        compartment = query_helper.find_compartment(region)
        vcn = query_helper.find_create_vcn_subnet(region)

        # node_config values may be the string 'None' (template-rendered),
        # hence the explicit string comparisons below.
        ocpu_count = 0
        vcpu_str = node_config['VCPUs']
        instance_type_str = node_config['InstanceType']

        if vcpu_str is not None and vcpu_str != 'None':
            if instance_type_str.startswith(
                    f'{oci_utils.oci_config.VM_PREFIX}.A'):
                # For ARM cpu, 1*ocpu = 1*vcpu
                ocpu_count = round(float(vcpu_str))
            else:
                # For Intel / AMD cpu, 1*ocpu = 2*vcpu
                ocpu_count = round(float(vcpu_str) / 2)
        # NOTE(review): round() returns an int, so `0 < ocpu_count < 1` can
        # never hold — this clamp looks like a no-op; confirm the intent was
        # to guard against sub-1 OCPU requests (e.g. use math.ceil upstream).
        ocpu_count = 1 if (ocpu_count > 0 and ocpu_count < 1) else ocpu_count

        machine_shape_config = None
        if ocpu_count > 0:
            mem = node_config['MemoryInGbs']
            if mem is not None and mem != 'None':
                # pylint: disable=line-too-long
                machine_shape_config = oci_adaptor.oci.core.models.LaunchInstanceShapeConfigDetails(
                    ocpus=ocpu_count, memory_in_gbs=mem)
            else:
                # pylint: disable=line-too-long
                machine_shape_config = oci_adaptor.oci.core.models.LaunchInstanceShapeConfigDetails(
                    ocpus=ocpu_count)

        # Spot (preemptible) instances are terminated (not stopped) on
        # preemption, and their boot volume is not preserved.
        preempitible_config = (
            oci_adaptor.oci.core.models.PreemptibleInstanceConfigDetails(
                preemption_action=oci_adaptor.oci.core.models.
                TerminatePreemptionAction(type='TERMINATE',
                                          preserve_boot_volume=False))
            if node_config['Preemptible'] else None)

        # Batch id makes display names unique across repeated launches.
        batch_id = datetime.now().strftime('%Y%m%d%H%M%S')

        vm_tags_head = {
            **tags,
            **constants.HEAD_NODE_TAGS,
            constants.TAG_RAY_CLUSTER_NAME: cluster_name_on_cloud,
            'sky_spot_flag': str(node_config['Preemptible']).lower(),
        }
        vm_tags_worker = {
            **tags,
            **constants.WORKER_NODE_TAGS,
            constants.TAG_RAY_CLUSTER_NAME: cluster_name_on_cloud,
            'sky_spot_flag': str(node_config['Preemptible']).lower(),
        }

        for seq in range(1, to_start_count + 1):
            # The first node launched when no head exists yet becomes the
            # head; all subsequent nodes are workers.
            if head_instance_id is None:
                vm_tags = vm_tags_head
                node_type = constants.HEAD_NODE_TAGS[
                    constants.TAG_RAY_NODE_KIND]
            else:
                vm_tags = vm_tags_worker
                node_type = constants.WORKER_NODE_TAGS[
                    constants.TAG_RAY_NODE_KIND]

            launch_instance_response = query_helper.launch_instance(
                region,
                oci_adaptor.oci.core.models.LaunchInstanceDetails(
                    availability_domain=node_config['AvailabilityDomain'],
                    compartment_id=compartment,
                    shape=instance_type_str,
                    display_name=
                    f'{cluster_name_on_cloud}_{node_type}_{batch_id}_{seq}',
                    freeform_tags=vm_tags,
                    metadata={
                        'ssh_authorized_keys': node_config['AuthorizedKey']
                    },
                    source_details=oci_adaptor.oci.core.models.
                    InstanceSourceViaImageDetails(
                        source_type='image',
                        image_id=node_config['ImageId'],
                        boot_volume_size_in_gbs=node_config['BootVolumeSize'],
                        boot_volume_vpus_per_gb=int(
                            node_config['BootVolumePerf']),
                    ),
                    create_vnic_details=oci_adaptor.oci.core.models.
                    CreateVnicDetails(
                        assign_public_ip=True,
                        subnet_id=vcn,
                    ),
                    shape_config=machine_shape_config,
                    preemptible_instance_config=preempitible_config,
                ))

            new_inst = launch_instance_response.data
            if head_instance_id is None:
                head_instance_id = new_inst.id
                logger.debug(f'New head node: {head_instance_id}')

            created_instances.append({
                'inst_id': new_inst.id,
                'name': new_inst.display_name,
                'ad': new_inst.availability_domain,
                'compartment': new_inst.compartment_id,
                'status': new_inst.lifecycle_state,
                'oci_tags': new_inst.freeform_tags,
            })
        # end for loop
    # end if to_start_count > 0:...

    # Block until every resumed/created node is RUNNING (wait_instances is
    # a no-op for OCI, so the wait has to happen here).
    for inst in (resume_instances + created_instances):
        logger.debug(f'Provisioning for node {inst["name"]}')
        query_helper.wait_instance_until_status(region, inst['inst_id'],
                                                'RUNNING')
        logger.debug(f'Instance {inst["name"]} is RUNNING.')

    total_time = round(time.time() * 1000) - start_time
    logger.debug('Total time elapsed: {0} milli-seconds.'.format(total_time))

    assert head_instance_id is not None, head_instance_id

    # Format: TenancyPrefix:AvailabilityDomain, e.g. bxtG:US-SANJOSE-1-AD-1
    _, ad = str(node_config['AvailabilityDomain']).split(':', maxsplit=1)
    return common.ProvisionRecord(
        provider_name='oci',
        region=region,
        zone=ad,
        cluster_name=cluster_name_on_cloud,
        head_instance_id=head_instance_id,
        created_instance_ids=[n['inst_id'] for n in created_instances],
        resumed_instance_ids=[n['inst_id'] for n in resume_instances],
    )
|
259
|
+
|
260
|
+
|
261
|
+
@query_utils.debug_enabled(logger)
def stop_instances(
    cluster_name_on_cloud: str,
    provider_config: Dict[str, Any],
    worker_only: bool = False,
) -> None:
    """Stop running instances.

    With ``worker_only=True`` only worker-tagged nodes are stopped and
    the head node is left running.
    """
    # pylint: disable=line-too-long
    assert provider_config is not None, (cluster_name_on_cloud, provider_config)

    target_region = provider_config['region']
    filters = {constants.TAG_RAY_CLUSTER_NAME: cluster_name_on_cloud}
    if worker_only:
        filters[constants.TAG_RAY_NODE_KIND] = 'worker'

    for matched_node in _get_filtered_nodes(target_region, filters):
        query_helper.stop_instance(target_region, matched_node['inst_id'])
|
279
|
+
|
280
|
+
|
281
|
+
@query_utils.debug_enabled(logger)
def terminate_instances(
    cluster_name_on_cloud: str,
    provider_config: Dict[str, Any],
    worker_only: bool = False,
) -> None:
    """Terminate running or stopped instances."""
    target_region = provider_config['region']
    filters = {constants.TAG_RAY_CLUSTER_NAME: cluster_name_on_cloud}
    if worker_only:
        # Restrict the tag match so the head node survives.
        filters[constants.TAG_RAY_NODE_KIND] = 'worker'
    query_helper.terminate_instances_by_tags(filters, target_region)
|
293
|
+
|
294
|
+
|
295
|
+
@query_utils.debug_enabled(logger)
def open_ports(
    cluster_name_on_cloud: str,
    ports: List[str],
    provider_config: Optional[Dict[str, Any]] = None,
) -> None:
    """Open ports for inbound traffic."""
    assert provider_config is not None, cluster_name_on_cloud
    # Ports are opened by adding NSG ingress rules for the cluster.
    query_helper.create_nsg_rules(region=provider_config['region'],
                                  cluster_name=cluster_name_on_cloud,
                                  ports=ports)
|
307
|
+
|
308
|
+
|
309
|
+
@query_utils.debug_enabled(logger)
def cleanup_ports(
    cluster_name_on_cloud: str,
    ports: List[str],
    provider_config: Optional[Dict[str, Any]] = None,
) -> None:
    """Delete any opened ports."""
    assert provider_config is not None, cluster_name_on_cloud
    # The entire cluster NSG is removed at once, so the individual port
    # list is not needed here.
    del ports
    query_helper.remove_cluster_nsg(region=provider_config['region'],
                                    cluster_name=cluster_name_on_cloud)
|
321
|
+
|
322
|
+
|
323
|
+
@query_utils.debug_enabled(logger)
def wait_instances(region: str, cluster_name_on_cloud: str,
                   state: Optional['status_lib.ClusterStatus']) -> None:
    """No-op: run_instances already waits until the nodes are RUNNING.

    The wait cannot be implemented here because instances that are still
    provisioning are not retrievable by the QL 'query instance
    resources ...' search.
    """
    del region, cluster_name_on_cloud, state  # Unused.
|
330
|
+
|
331
|
+
|
332
|
+
@query_utils.debug_enabled(logger)
def get_cluster_info(
    region: str,
    cluster_name_on_cloud: str,
    provider_config: Optional[Dict[str, Any]] = None,
) -> common.ClusterInfo:
    """Get the metadata of instances in a cluster."""
    tag_filters = {constants.TAG_RAY_CLUSTER_NAME: cluster_name_on_cloud}
    nodes = _get_filtered_nodes(region, tag_filters)

    instance_map = {}
    for node in nodes:
        # Each record is enriched with the primary VNIC's IPs.
        enriched = _get_inst_obj_with_ip(region, node)
        instance_map[enriched['id']] = [
            common.InstanceInfo(
                instance_id=enriched['id'],
                internal_ip=enriched['internal_ip'],
                external_ip=enriched['external_ip'],
                tags=enriched['tags'],
            )
        ]

    # Deterministic ordering: sort by instance id.
    instance_map = dict(sorted(instance_map.items(), key=lambda kv: kv[0]))
    logger.debug(f'Cluster info: {instance_map}')

    head_id = _get_head_instance_id(nodes)
    logger.debug(f'Head instance id is {head_id}')

    return common.ClusterInfo(
        provider_name='oci',
        head_instance_id=head_id,
        instances=instance_map,
        provider_config=provider_config,
    )
|
366
|
+
|
367
|
+
|
368
|
+
def _get_filtered_nodes(region: str,
                        tag_filters: Dict[str, str]) -> List[Dict[str, Any]]:
    """Return cluster node records matching the given freeform-tag filters.

    Raises:
        exceptions.ClusterStatusFetchingError: if the OCI query fails.
    """
    try:
        matched = query_helper.query_instances_by_tags(tag_filters, region)
    except oci_adaptor.oci.exceptions.ServiceError as e:
        with ux_utils.print_exception_no_traceback():
            raise exceptions.ClusterStatusFetchingError(
                f'Failed to query status for OCI cluster {tag_filters}.'
                'Details: '
                f'{common_utils.format_exception(e, use_bracket=True)}')

    # Normalize each OCI search result into the plain-dict node record
    # used throughout this module.
    return [{
        'inst_id': inst.identifier,
        'name': inst.display_name,
        'ad': inst.availability_domain,
        'compartment': inst.compartment_id,
        'status': inst.lifecycle_state,
        'oci_tags': inst.freeform_tags,
    } for inst in matched]
|
393
|
+
|
394
|
+
|
395
|
+
def _get_inst_obj_with_ip(region: str, inst_info: Dict[str,
                                                       Any]) -> Dict[str, Any]:
    """Augment a node record with the IPs of its primary VNIC."""
    vnic = query_helper.get_instance_primary_vnic(region, inst_info)
    private_ip = vnic.private_ip
    # Fall back to the private IP when no public IP is attached.
    public_ip = vnic.public_ip if vnic.public_ip is not None else private_ip
    return {
        'id': inst_info['inst_id'],
        'name': inst_info['name'],
        'external_ip': public_ip,
        'internal_ip': private_ip,
        'tags': inst_info['oci_tags'],
        'status': inst_info['status'],
    }
|
412
|
+
|
413
|
+
|
414
|
+
def _get_head_instance_id(instances: List[Dict[str, Any]]) -> Optional[str]:
    """Return the id of the node whose OCI tags contain all head-node tags.

    Returns None when no head node is present. If several nodes match,
    every duplicate is warned about and the last match wins.
    """
    found_id: Optional[str] = None
    required_tags = tuple(constants.HEAD_NODE_TAGS.items())
    for node in instances:
        node_tags = node['oci_tags'].items()
        if not all(pair in node_tags for pair in required_tags):
            continue
        if found_id is not None:
            logger.warning(
                'There are multiple head nodes in the cluster '
                f'(current head instance id: {found_id}, '
                f'newly discovered id: {node["inst_id"]}. It is likely '
                f'that something goes wrong.')
            # Don't break here so that we can continue to check and
            # warn user about duplicate head instance issue so that
            # user can take further action on the abnormal cluster.
        found_id = node['inst_id']
    return found_id
|