skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +64 -32
- sky/adaptors/aws.py +23 -6
- sky/adaptors/azure.py +432 -15
- sky/adaptors/cloudflare.py +5 -5
- sky/adaptors/common.py +19 -9
- sky/adaptors/do.py +20 -0
- sky/adaptors/gcp.py +3 -2
- sky/adaptors/kubernetes.py +122 -88
- sky/adaptors/nebius.py +100 -0
- sky/adaptors/oci.py +39 -1
- sky/adaptors/vast.py +29 -0
- sky/admin_policy.py +101 -0
- sky/authentication.py +117 -98
- sky/backends/backend.py +52 -20
- sky/backends/backend_utils.py +669 -557
- sky/backends/cloud_vm_ray_backend.py +1099 -808
- sky/backends/local_docker_backend.py +14 -8
- sky/backends/wheel_utils.py +38 -20
- sky/benchmark/benchmark_utils.py +22 -23
- sky/check.py +76 -27
- sky/cli.py +1586 -1139
- sky/client/__init__.py +1 -0
- sky/client/cli.py +5683 -0
- sky/client/common.py +345 -0
- sky/client/sdk.py +1765 -0
- sky/cloud_stores.py +283 -19
- sky/clouds/__init__.py +7 -2
- sky/clouds/aws.py +303 -112
- sky/clouds/azure.py +185 -179
- sky/clouds/cloud.py +115 -37
- sky/clouds/cudo.py +29 -22
- sky/clouds/do.py +313 -0
- sky/clouds/fluidstack.py +44 -54
- sky/clouds/gcp.py +206 -65
- sky/clouds/ibm.py +26 -21
- sky/clouds/kubernetes.py +345 -91
- sky/clouds/lambda_cloud.py +40 -29
- sky/clouds/nebius.py +297 -0
- sky/clouds/oci.py +129 -90
- sky/clouds/paperspace.py +22 -18
- sky/clouds/runpod.py +53 -34
- sky/clouds/scp.py +28 -24
- sky/clouds/service_catalog/__init__.py +19 -13
- sky/clouds/service_catalog/aws_catalog.py +29 -12
- sky/clouds/service_catalog/azure_catalog.py +33 -6
- sky/clouds/service_catalog/common.py +95 -75
- sky/clouds/service_catalog/constants.py +3 -3
- sky/clouds/service_catalog/cudo_catalog.py +13 -3
- sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
- sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
- sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
- sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
- sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
- sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
- sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
- sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
- sky/clouds/service_catalog/do_catalog.py +111 -0
- sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
- sky/clouds/service_catalog/gcp_catalog.py +16 -2
- sky/clouds/service_catalog/ibm_catalog.py +2 -2
- sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
- sky/clouds/service_catalog/lambda_catalog.py +8 -3
- sky/clouds/service_catalog/nebius_catalog.py +116 -0
- sky/clouds/service_catalog/oci_catalog.py +31 -4
- sky/clouds/service_catalog/paperspace_catalog.py +2 -2
- sky/clouds/service_catalog/runpod_catalog.py +2 -2
- sky/clouds/service_catalog/scp_catalog.py +2 -2
- sky/clouds/service_catalog/vast_catalog.py +104 -0
- sky/clouds/service_catalog/vsphere_catalog.py +2 -2
- sky/clouds/utils/aws_utils.py +65 -0
- sky/clouds/utils/azure_utils.py +91 -0
- sky/clouds/utils/gcp_utils.py +5 -9
- sky/clouds/utils/oci_utils.py +47 -5
- sky/clouds/utils/scp_utils.py +4 -3
- sky/clouds/vast.py +280 -0
- sky/clouds/vsphere.py +22 -18
- sky/core.py +361 -107
- sky/dag.py +41 -28
- sky/data/data_transfer.py +37 -0
- sky/data/data_utils.py +211 -32
- sky/data/mounting_utils.py +182 -30
- sky/data/storage.py +2118 -270
- sky/data/storage_utils.py +126 -5
- sky/exceptions.py +179 -8
- sky/execution.py +158 -85
- sky/global_user_state.py +150 -34
- sky/jobs/__init__.py +12 -10
- sky/jobs/client/__init__.py +0 -0
- sky/jobs/client/sdk.py +302 -0
- sky/jobs/constants.py +49 -11
- sky/jobs/controller.py +161 -99
- sky/jobs/dashboard/dashboard.py +171 -25
- sky/jobs/dashboard/templates/index.html +572 -60
- sky/jobs/recovery_strategy.py +157 -156
- sky/jobs/scheduler.py +307 -0
- sky/jobs/server/__init__.py +1 -0
- sky/jobs/server/core.py +598 -0
- sky/jobs/server/dashboard_utils.py +69 -0
- sky/jobs/server/server.py +190 -0
- sky/jobs/state.py +627 -122
- sky/jobs/utils.py +615 -206
- sky/models.py +27 -0
- sky/optimizer.py +142 -83
- sky/provision/__init__.py +20 -5
- sky/provision/aws/config.py +124 -42
- sky/provision/aws/instance.py +130 -53
- sky/provision/azure/__init__.py +7 -0
- sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
- sky/provision/azure/config.py +220 -0
- sky/provision/azure/instance.py +1012 -37
- sky/provision/common.py +31 -3
- sky/provision/constants.py +25 -0
- sky/provision/cudo/__init__.py +2 -1
- sky/provision/cudo/cudo_utils.py +112 -0
- sky/provision/cudo/cudo_wrapper.py +37 -16
- sky/provision/cudo/instance.py +28 -12
- sky/provision/do/__init__.py +11 -0
- sky/provision/do/config.py +14 -0
- sky/provision/do/constants.py +10 -0
- sky/provision/do/instance.py +287 -0
- sky/provision/do/utils.py +301 -0
- sky/provision/docker_utils.py +82 -46
- sky/provision/fluidstack/fluidstack_utils.py +57 -125
- sky/provision/fluidstack/instance.py +15 -43
- sky/provision/gcp/config.py +19 -9
- sky/provision/gcp/constants.py +7 -1
- sky/provision/gcp/instance.py +55 -34
- sky/provision/gcp/instance_utils.py +339 -80
- sky/provision/gcp/mig_utils.py +210 -0
- sky/provision/instance_setup.py +172 -133
- sky/provision/kubernetes/__init__.py +1 -0
- sky/provision/kubernetes/config.py +104 -90
- sky/provision/kubernetes/constants.py +8 -0
- sky/provision/kubernetes/instance.py +680 -325
- sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
- sky/provision/kubernetes/network.py +54 -20
- sky/provision/kubernetes/network_utils.py +70 -21
- sky/provision/kubernetes/utils.py +1370 -251
- sky/provision/lambda_cloud/__init__.py +11 -0
- sky/provision/lambda_cloud/config.py +10 -0
- sky/provision/lambda_cloud/instance.py +265 -0
- sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
- sky/provision/logging.py +1 -1
- sky/provision/nebius/__init__.py +11 -0
- sky/provision/nebius/config.py +11 -0
- sky/provision/nebius/instance.py +285 -0
- sky/provision/nebius/utils.py +318 -0
- sky/provision/oci/__init__.py +15 -0
- sky/provision/oci/config.py +51 -0
- sky/provision/oci/instance.py +436 -0
- sky/provision/oci/query_utils.py +681 -0
- sky/provision/paperspace/constants.py +6 -0
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/paperspace/utils.py +2 -0
- sky/provision/provisioner.py +207 -130
- sky/provision/runpod/__init__.py +1 -0
- sky/provision/runpod/api/__init__.py +3 -0
- sky/provision/runpod/api/commands.py +119 -0
- sky/provision/runpod/api/pods.py +142 -0
- sky/provision/runpod/instance.py +64 -8
- sky/provision/runpod/utils.py +239 -23
- sky/provision/vast/__init__.py +10 -0
- sky/provision/vast/config.py +11 -0
- sky/provision/vast/instance.py +247 -0
- sky/provision/vast/utils.py +162 -0
- sky/provision/vsphere/common/vim_utils.py +1 -1
- sky/provision/vsphere/instance.py +8 -18
- sky/provision/vsphere/vsphere_utils.py +1 -1
- sky/resources.py +247 -102
- sky/serve/__init__.py +9 -9
- sky/serve/autoscalers.py +361 -299
- sky/serve/client/__init__.py +0 -0
- sky/serve/client/sdk.py +366 -0
- sky/serve/constants.py +12 -3
- sky/serve/controller.py +106 -36
- sky/serve/load_balancer.py +63 -12
- sky/serve/load_balancing_policies.py +84 -2
- sky/serve/replica_managers.py +42 -34
- sky/serve/serve_state.py +62 -32
- sky/serve/serve_utils.py +271 -160
- sky/serve/server/__init__.py +0 -0
- sky/serve/{core.py → server/core.py} +271 -90
- sky/serve/server/server.py +112 -0
- sky/serve/service.py +52 -16
- sky/serve/service_spec.py +95 -32
- sky/server/__init__.py +1 -0
- sky/server/common.py +430 -0
- sky/server/constants.py +21 -0
- sky/server/html/log.html +174 -0
- sky/server/requests/__init__.py +0 -0
- sky/server/requests/executor.py +472 -0
- sky/server/requests/payloads.py +487 -0
- sky/server/requests/queues/__init__.py +0 -0
- sky/server/requests/queues/mp_queue.py +76 -0
- sky/server/requests/requests.py +567 -0
- sky/server/requests/serializers/__init__.py +0 -0
- sky/server/requests/serializers/decoders.py +192 -0
- sky/server/requests/serializers/encoders.py +166 -0
- sky/server/server.py +1106 -0
- sky/server/stream_utils.py +141 -0
- sky/setup_files/MANIFEST.in +2 -5
- sky/setup_files/dependencies.py +159 -0
- sky/setup_files/setup.py +14 -125
- sky/sky_logging.py +59 -14
- sky/skylet/autostop_lib.py +2 -2
- sky/skylet/constants.py +183 -50
- sky/skylet/events.py +22 -10
- sky/skylet/job_lib.py +403 -258
- sky/skylet/log_lib.py +111 -71
- sky/skylet/log_lib.pyi +6 -0
- sky/skylet/providers/command_runner.py +6 -8
- sky/skylet/providers/ibm/node_provider.py +2 -2
- sky/skylet/providers/scp/config.py +11 -3
- sky/skylet/providers/scp/node_provider.py +8 -8
- sky/skylet/skylet.py +3 -1
- sky/skylet/subprocess_daemon.py +69 -17
- sky/skypilot_config.py +119 -57
- sky/task.py +205 -64
- sky/templates/aws-ray.yml.j2 +37 -7
- sky/templates/azure-ray.yml.j2 +27 -82
- sky/templates/cudo-ray.yml.j2 +7 -3
- sky/templates/do-ray.yml.j2 +98 -0
- sky/templates/fluidstack-ray.yml.j2 +7 -4
- sky/templates/gcp-ray.yml.j2 +26 -6
- sky/templates/ibm-ray.yml.j2 +3 -2
- sky/templates/jobs-controller.yaml.j2 +46 -11
- sky/templates/kubernetes-ingress.yml.j2 +7 -0
- sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
- sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
- sky/templates/kubernetes-ray.yml.j2 +292 -25
- sky/templates/lambda-ray.yml.j2 +30 -40
- sky/templates/nebius-ray.yml.j2 +79 -0
- sky/templates/oci-ray.yml.j2 +18 -57
- sky/templates/paperspace-ray.yml.j2 +10 -6
- sky/templates/runpod-ray.yml.j2 +26 -4
- sky/templates/scp-ray.yml.j2 +3 -2
- sky/templates/sky-serve-controller.yaml.j2 +12 -1
- sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
- sky/templates/vast-ray.yml.j2 +70 -0
- sky/templates/vsphere-ray.yml.j2 +8 -3
- sky/templates/websocket_proxy.py +64 -0
- sky/usage/constants.py +10 -1
- sky/usage/usage_lib.py +130 -37
- sky/utils/accelerator_registry.py +35 -51
- sky/utils/admin_policy_utils.py +147 -0
- sky/utils/annotations.py +51 -0
- sky/utils/cli_utils/status_utils.py +81 -23
- sky/utils/cluster_utils.py +356 -0
- sky/utils/command_runner.py +452 -89
- sky/utils/command_runner.pyi +77 -3
- sky/utils/common.py +54 -0
- sky/utils/common_utils.py +319 -108
- sky/utils/config_utils.py +204 -0
- sky/utils/control_master_utils.py +48 -0
- sky/utils/controller_utils.py +548 -266
- sky/utils/dag_utils.py +93 -32
- sky/utils/db_utils.py +18 -4
- sky/utils/env_options.py +29 -7
- sky/utils/kubernetes/create_cluster.sh +8 -60
- sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
- sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
- sky/utils/kubernetes/gpu_labeler.py +4 -4
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
- sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
- sky/utils/kubernetes/rsync_helper.sh +24 -0
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
- sky/utils/log_utils.py +240 -33
- sky/utils/message_utils.py +81 -0
- sky/utils/registry.py +127 -0
- sky/utils/resources_utils.py +94 -22
- sky/utils/rich_utils.py +247 -18
- sky/utils/schemas.py +284 -64
- sky/{status_lib.py → utils/status_lib.py} +12 -7
- sky/utils/subprocess_utils.py +212 -46
- sky/utils/timeline.py +12 -7
- sky/utils/ux_utils.py +168 -15
- skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
- skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
- sky/clouds/cloud_registry.py +0 -31
- sky/jobs/core.py +0 -330
- sky/skylet/providers/azure/__init__.py +0 -2
- sky/skylet/providers/azure/azure-vm-template.json +0 -301
- sky/skylet/providers/azure/config.py +0 -170
- sky/skylet/providers/azure/node_provider.py +0 -466
- sky/skylet/providers/lambda_cloud/__init__.py +0 -2
- sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
- sky/skylet/providers/oci/__init__.py +0 -2
- sky/skylet/providers/oci/node_provider.py +0 -488
- sky/skylet/providers/oci/query_helper.py +0 -383
- sky/skylet/providers/oci/utils.py +0 -21
- sky/utils/cluster_yaml_utils.py +0 -24
- sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
- skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
- skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
sky/provision/nebius/instance.py (new file):
@@ -0,0 +1,285 @@
+"""Nebius instance provisioning."""
+import time
+from typing import Any, Dict, List, Optional
+
+from sky import sky_logging
+from sky.provision import common
+from sky.provision.nebius import utils
+from sky.utils import common_utils
+from sky.utils import status_lib
+from sky.utils import ux_utils
+
+PENDING_STATUS = ['STARTING', 'DELETING', 'STOPPING']
+
+MAX_RETRIES_TO_LAUNCH = 120  # Maximum number of retries
+
+logger = sky_logging.init_logger(__name__)
+
+
+def _filter_instances(region: str,
+                      cluster_name_on_cloud: str,
+                      status_filters: Optional[List[str]],
+                      head_only: bool = False) -> Dict[str, Any]:
+    project_id = utils.get_project_by_region(region)
+    instances = utils.list_instances(project_id)
+    filtered_instances = {}
+    for instance_id, instance in instances.items():
+        if (status_filters is not None and
+                instance['status'] not in status_filters):
+            continue
+
+        if instance['name'] and instance['name'].startswith(
+                f'{cluster_name_on_cloud}-'):
+            if head_only and instance['name'].endswith('-worker'):
+                continue
+            else:
+                filtered_instances[instance_id] = instance
+    return filtered_instances
+
+
+def _get_head_instance_id(instances: Dict[str, Any]) -> Optional[str]:
+    head_instance_id = None
+    for inst_id, inst in instances.items():
+        if inst['name'].endswith('-head'):
+            head_instance_id = inst_id
+            break
+    return head_instance_id
+
+
+def _wait_until_no_pending(region: str, cluster_name_on_cloud: str) -> None:
+    retry_count = 0
+    while retry_count < MAX_RETRIES_TO_LAUNCH:
+        instances = _filter_instances(region, cluster_name_on_cloud,
+                                      PENDING_STATUS)
+        if not instances:
+            break
+        logger.info(f'Waiting for {len(instances)} instances to be ready '
+                    f'(Attempt {retry_count + 1}/{MAX_RETRIES_TO_LAUNCH}).')
+        time.sleep(utils.POLL_INTERVAL)
+        retry_count += 1
+
+    if retry_count == MAX_RETRIES_TO_LAUNCH:
+        raise TimeoutError(f'Exceeded maximum retries '
+                           f'({MAX_RETRIES_TO_LAUNCH * utils.POLL_INTERVAL}'
+                           f' seconds) while waiting for instances'
+                           f' to be ready.')
+
+
+def run_instances(region: str, cluster_name_on_cloud: str,
+                  config: common.ProvisionConfig) -> common.ProvisionRecord:
+    """Runs instances for the given cluster."""
+    _wait_until_no_pending(region, cluster_name_on_cloud)
+    running_instances = _filter_instances(region, cluster_name_on_cloud,
+                                          ['RUNNING'])
+    head_instance_id = _get_head_instance_id(running_instances)
+    to_start_count = config.count - len(running_instances)
+    if to_start_count < 0:
+        raise RuntimeError(
+            f'Cluster {cluster_name_on_cloud} already has '
+            f'{len(running_instances)} nodes, but {config.count} are required.')
+    if to_start_count == 0:
+        if head_instance_id is None:
+            raise RuntimeError(
+                f'Cluster {cluster_name_on_cloud} has no head node.')
+        logger.info(f'Cluster {cluster_name_on_cloud} already has '
+                    f'{len(running_instances)} nodes, no need to start more.')
+        return common.ProvisionRecord(provider_name='nebius',
+                                      cluster_name=cluster_name_on_cloud,
+                                      region=region,
+                                      zone=None,
+                                      head_instance_id=head_instance_id,
+                                      resumed_instance_ids=[],
+                                      created_instance_ids=[])
+
+    created_instance_ids = []
+    resumed_instance_ids = []
+    stopped_instances = _filter_instances(region, cluster_name_on_cloud,
+                                          ['STOPPED'])
+    if config.resume_stopped_nodes and len(stopped_instances) > to_start_count:
+        raise RuntimeError(
+            'The number of running/stopped/stopping instances combined '
+            f'({len(stopped_instances) + len(running_instances)}) in '
+            f'cluster "{cluster_name_on_cloud}" is greater than the '
+            f'number requested by the user ({config.count}). '
+            'This is likely a resource leak. '
+            'Use "sky down" to terminate the cluster.')
+
+    for stopped_instance_id, _ in stopped_instances.items():
+        if to_start_count > 0:
+            try:
+                utils.start(stopped_instance_id)
+                resumed_instance_ids.append(stopped_instance_id)
+                to_start_count -= 1
+                if stopped_instances[stopped_instance_id]['name'].endswith(
+                        '-head'):
+                    head_instance_id = stopped_instance_id
+            except Exception as e:  # pylint: disable=broad-except
+                logger.warning(f'Start instance error: {e}')
+                raise
+            time.sleep(utils.POLL_INTERVAL)  # to avoid fake STOPPED status
+            logger.info(f'Started instance {stopped_instance_id}.')
+
+    for _ in range(to_start_count):
+        node_type = 'head' if head_instance_id is None else 'worker'
+        try:
+            platform, preset = config.node_config['InstanceType'].split('_')
+            instance_id = utils.launch(
+                cluster_name_on_cloud=cluster_name_on_cloud,
+                node_type=node_type,
+                platform=platform,
+                preset=preset,
+                region=region,
+                image_family=config.node_config['ImageId'],
+                disk_size=config.node_config['DiskSize'],
+                user_data=config.node_config['UserData'])
+        except Exception as e:  # pylint: disable=broad-except
+            logger.warning(f'run_instances error: {e}')
+            raise
+        logger.info(f'Launched instance {instance_id}.')
+        created_instance_ids.append(instance_id)
+        if head_instance_id is None:
+            head_instance_id = instance_id
+    assert head_instance_id is not None, 'head_instance_id should not be None'
+    return common.ProvisionRecord(provider_name='nebius',
+                                  cluster_name=cluster_name_on_cloud,
+                                  region=region,
+                                  zone=None,
+                                  head_instance_id=head_instance_id,
+                                  resumed_instance_ids=resumed_instance_ids,
+                                  created_instance_ids=created_instance_ids)
+
+
+def wait_instances(region: str, cluster_name_on_cloud: str,
+                   state: Optional[status_lib.ClusterStatus]) -> None:
+    _wait_until_no_pending(region, cluster_name_on_cloud)
+    if state is not None:
+        if state == status_lib.ClusterStatus.UP:
+            stopped_instances = _filter_instances(region, cluster_name_on_cloud,
+                                                  ['STOPPED'])
+            if stopped_instances:
+                raise RuntimeError(
+                    f'Cluster {cluster_name_on_cloud} is in UP state, but '
+                    f'{len(stopped_instances)} instances are stopped.')
+        if state == status_lib.ClusterStatus.STOPPED:
+            running_instances = _filter_instances(region, cluster_name_on_cloud,
+                                                  ['RUNNING'])
+            if running_instances:
+                raise RuntimeError(
+                    f'Cluster {cluster_name_on_cloud} is in STOPPED state, but '
+                    f'{len(running_instances)} instances are running.')
+
+
+def stop_instances(
+    cluster_name_on_cloud: str,
+    provider_config: Optional[Dict[str, Any]] = None,
+    worker_only: bool = False,
+) -> None:
+    assert provider_config is not None
+    exist_instances = _filter_instances(provider_config['region'],
+                                        cluster_name_on_cloud, ['RUNNING'])
+    for instance in exist_instances:
+        if worker_only and instance.endswith('-head'):
+            continue
+        utils.stop(instance)
+
+
+def terminate_instances(
+    cluster_name_on_cloud: str,
+    provider_config: Optional[Dict[str, Any]] = None,
+    worker_only: bool = False,
+) -> None:
+    """See sky/provision/__init__.py"""
+    assert provider_config is not None
+    instances = _filter_instances(provider_config['region'],
+                                  cluster_name_on_cloud,
+                                  status_filters=None)
+    for inst_id, inst in instances.items():
+        logger.debug(f'Terminating instance {inst_id}: {inst}')
+        if worker_only and inst['name'].endswith('-head'):
+            continue
+        try:
+            utils.remove(inst_id)
+        except Exception as e:  # pylint: disable=broad-except
+            with ux_utils.print_exception_no_traceback():
+                raise RuntimeError(
+                    f'Failed to terminate instance {inst_id}: '
+                    f'{common_utils.format_exception(e, use_bracket=False)}'
+                ) from e
+    utils.delete_cluster(cluster_name_on_cloud, provider_config['region'])
+
+
+def get_cluster_info(
+        region: str,
+        cluster_name_on_cloud: str,
+        provider_config: Optional[Dict[str, Any]] = None) -> common.ClusterInfo:
+    _wait_until_no_pending(region, cluster_name_on_cloud)
+    running_instances = _filter_instances(region, cluster_name_on_cloud,
+                                          ['RUNNING'])
+    instances: Dict[str, List[common.InstanceInfo]] = {}
+    head_instance_id = None
+    for instance_id, instance_info in running_instances.items():
+        instances[instance_id] = [
+            common.InstanceInfo(
+                instance_id=instance_id,
+                internal_ip=instance_info['internal_ip'],
+                external_ip=instance_info['external_ip'],
+                tags={},
+            )
+        ]
+        if instance_info['name'].endswith('-head'):
+            head_instance_id = instance_id
+    assert head_instance_id is not None
+    return common.ClusterInfo(
+        instances=instances,
+        head_instance_id=head_instance_id,
+        provider_name='nebius',
+        provider_config=provider_config,
+    )
+
+
+def query_instances(
+    cluster_name_on_cloud: str,
+    provider_config: Optional[Dict[str, Any]] = None,
+    non_terminated_only: bool = True,
+) -> Dict[str, Optional[status_lib.ClusterStatus]]:
+    """See sky/provision/__init__.py"""
+    assert provider_config is not None, (cluster_name_on_cloud, provider_config)
+    instances = _filter_instances(provider_config['region'],
+                                  cluster_name_on_cloud, None)
+
+    status_map = {
+        'STARTING': status_lib.ClusterStatus.INIT,
+        'RUNNING': status_lib.ClusterStatus.UP,
+        'STOPPED': status_lib.ClusterStatus.STOPPED,
+        'STOPPING': status_lib.ClusterStatus.STOPPED,
+        'DELETING': status_lib.ClusterStatus.STOPPED,
+    }
+    statuses: Dict[str, Optional[status_lib.ClusterStatus]] = {}
+    for inst_id, inst in instances.items():
+        status = status_map[inst['status']]
+        if non_terminated_only and status is None:
+            continue
+        statuses[inst_id] = status
+    return statuses
+
+
+def open_ports(
+    cluster_name_on_cloud: str,
+    ports: List[str],
+    provider_config: Optional[Dict[str, Any]] = None,
+) -> None:
+    """See sky/provision/__init__.py"""
+    logger.debug(f'Skip opening ports {ports} for Nebius instances, as all '
+                 'ports are open by default.')
+    del cluster_name_on_cloud, provider_config, ports
+
+
+def cleanup_ports(
+    cluster_name_on_cloud: str,
+    ports: List[str],
+    provider_config: Optional[Dict[str, Any]] = None,
+) -> None:
+    del cluster_name_on_cloud, ports, provider_config  # Unused.
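Note on the conventions above: the provisioner recognizes cluster membership purely by the `<cluster_name_on_cloud>-` name prefix and tells head from worker nodes by the `-head`/`-worker` suffix. A minimal, self-contained sketch of that filtering rule, using a made-up instance map in the shape returned by `utils.list_instances()` (IDs and names below are hypothetical):

```python
# Hypothetical instance map, mirroring the dicts built by list_instances():
instances = {
    'instance-e00aaa': {'status': 'RUNNING', 'name': 'mycluster-1a2b-head'},
    'instance-e00bbb': {'status': 'RUNNING', 'name': 'mycluster-3c4d-worker'},
    'instance-e00ccc': {'status': 'STOPPED', 'name': 'other-5e6f-head'},
}


def filter_cluster(instances, cluster_name, status_filters=None):
    """Same matching rule as _filter_instances: name prefix + status."""
    return {
        inst_id: inst
        for inst_id, inst in instances.items()
        if inst['name'].startswith(f'{cluster_name}-') and
        (status_filters is None or inst['status'] in status_filters)
    }


running = filter_cluster(instances, 'mycluster', ['RUNNING'])
head_id = next((inst_id for inst_id, inst in running.items()
                if inst['name'].endswith('-head')), None)
print(head_id)  # -> 'instance-e00aaa'
```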
sky/provision/nebius/utils.py (new file):
@@ -0,0 +1,318 @@
+"""Nebius library wrapper for SkyPilot."""
+import time
+from typing import Any, Dict
+import uuid
+
+from sky import sky_logging
+from sky.adaptors import nebius
+from sky.utils import common_utils
+
+logger = sky_logging.init_logger(__name__)
+
+POLL_INTERVAL = 5
+
+
+def retry(func):
+    """Decorator to retry a function."""
+
+    def wrapper(*args, **kwargs):
+        """Wrapper for retrying a function."""
+        cnt = 0
+        while True:
+            try:
+                return func(*args, **kwargs)
+            except nebius.nebius.error.QueryError as e:
+                if cnt >= 3:
+                    raise
+                logger.warning('Retrying for exception: '
+                               f'{common_utils.format_exception(e)}.')
+                time.sleep(POLL_INTERVAL)
+
+    return wrapper
+
+
+def get_project_by_region(region: str) -> str:
+    service = nebius.iam().ProjectServiceClient(nebius.sdk())
+    projects = service.list(nebius.iam().ListProjectsRequest(
+        parent_id=nebius.get_tenant_id())).wait()
+    # To find a project in a specific region, we rely on the project ID to
+    # deduce the region, since there is currently no method to retrieve region
+    # information directly from the project. Additionally, there is only one
+    # project per region, and projects cannot be created at this time.
+    # The region is determined from the project ID using a region-specific
+    # identifier embedded in it.
+    # Project id looks like project-e00xxxxxxxxxxxxxx where
+    # e00 - id of region 'eu-north1'
+    # e01 - id of region 'eu-west1'
+    region_ids = {'eu-north1': 'e00', 'eu-west1': 'e01'}
+    # TODO(SalikovAlex): fix when info about region will be in projects list
+    # Currently, Nebius cloud supports 2 regions. We manually enumerate
+    # them here. Reference: https://docs.nebius.com/overview/regions
+
+    # Check whether a project ID is specified in the config.
+    preferable_project_id = nebius.get_project_id()
+    if preferable_project_id is not None:
+        if preferable_project_id[8:11] == region_ids[region]:
+            return preferable_project_id
+        logger.warning(
+            f'Can\'t use customized NEBIUS_PROJECT_ID ({preferable_project_id})'
+            f' for region {region}. Please check if the project ID is correct.')
+    for project in projects.items:
+        if project.metadata.id[8:11] == region_ids[region]:
+            return project.metadata.id
+    raise Exception(f'No project found for region "{region}".')
+
+
+def get_or_create_gpu_cluster(name: str, region: str) -> str:
+    """Creates a GPU cluster.
+    When creating a GPU cluster, select an InfiniBand fabric for it:
+
+    fabric-2, fabric-3 or fabric-4 for projects in the eu-north1 region.
+    fabric-5 for projects in the eu-west1 region.
+
+    https://docs.nebius.com/compute/clusters/gpu
+    """
+    project_id = get_project_by_region(region)
+    service = nebius.compute().GpuClusterServiceClient(nebius.sdk())
+    try:
+        cluster = service.get_by_name(nebius.nebius_common().GetByNameRequest(
+            parent_id=project_id,
+            name=name,
+        )).wait()
+        cluster_id = cluster.metadata.id
+    except nebius.request_error() as no_cluster_found_error:
+        if region == 'eu-north1':
+            fabric = 'fabric-4'
+        elif region == 'eu-west1':
+            fabric = 'fabric-5'
+        else:
+            raise RuntimeError(
+                f'Unsupported region {region}.') from no_cluster_found_error
+        cluster = service.create(nebius.compute().CreateGpuClusterRequest(
+            metadata=nebius.nebius_common().ResourceMetadata(
+                parent_id=project_id,
+                name=name,
+            ),
+            spec=nebius.compute().GpuClusterSpec(
+                infiniband_fabric=fabric))).wait()
+        cluster_id = cluster.resource_id
+    return cluster_id
+
+
+def delete_cluster(name: str, region: str) -> None:
+    """Delete a GPU cluster."""
+    project_id = get_project_by_region(region)
+    service = nebius.compute().GpuClusterServiceClient(nebius.sdk())
+    try:
+        cluster = service.get_by_name(nebius.nebius_common().GetByNameRequest(
+            parent_id=project_id,
+            name=name,
+        )).wait()
+        cluster_id = cluster.metadata.id
+        logger.debug(f'Found GPU Cluster : {cluster_id}.')
+        service.delete(
+            nebius.compute().DeleteGpuClusterRequest(id=cluster_id)).wait()
+        logger.debug(f'Deleted GPU Cluster : {cluster_id}.')
+    except nebius.request_error():
+        logger.debug('GPU Cluster does not exist.')
+
+
+def list_instances(project_id: str) -> Dict[str, Dict[str, Any]]:
+    """Lists instances associated with API key."""
+    service = nebius.compute().InstanceServiceClient(nebius.sdk())
+    result = service.list(
+        nebius.compute().ListInstancesRequest(parent_id=project_id)).wait()
+
+    instances = result
+
+    instance_dict: Dict[str, Dict[str, Any]] = {}
+    for instance in instances.items:
+        info = {}
+        info['status'] = instance.status.state.name
+        info['name'] = instance.metadata.name
+        if instance.status.network_interfaces:
+            info['external_ip'] = instance.status.network_interfaces[
+                0].public_ip_address.address.split('/')[0]
+            info['internal_ip'] = instance.status.network_interfaces[
+                0].ip_address.address.split('/')[0]
+        instance_dict[instance.metadata.id] = info
+
+    return instance_dict
+
+
+def stop(instance_id: str) -> None:
+    service = nebius.compute().InstanceServiceClient(nebius.sdk())
+    service.stop(nebius.compute().StopInstanceRequest(id=instance_id)).wait()
+    retry_count = 0
+    while retry_count < nebius.MAX_RETRIES_TO_INSTANCE_STOP:
+        service = nebius.compute().InstanceServiceClient(nebius.sdk())
+        instance = service.get(nebius.compute().GetInstanceRequest(
+            id=instance_id,)).wait()
+        if instance.status.state.name == 'STOPPED':
+            break
+        time.sleep(POLL_INTERVAL)
+        logger.debug(f'Waiting for instance {instance_id} stopping.')
+        retry_count += 1
+
+    if retry_count == nebius.MAX_RETRIES_TO_INSTANCE_STOP:
+        raise TimeoutError(
+            f'Exceeded maximum retries '
+            f'({nebius.MAX_RETRIES_TO_INSTANCE_STOP * POLL_INTERVAL}'
+            f' seconds) while waiting for instance {instance_id}'
+            f' to be stopped.')
+
+
+def start(instance_id: str) -> None:
+    service = nebius.compute().InstanceServiceClient(nebius.sdk())
+    service.start(nebius.compute().StartInstanceRequest(id=instance_id)).wait()
+    retry_count = 0
+    while retry_count < nebius.MAX_RETRIES_TO_INSTANCE_START:
+        service = nebius.compute().InstanceServiceClient(nebius.sdk())
+        instance = service.get(nebius.compute().GetInstanceRequest(
+            id=instance_id,)).wait()
+        if instance.status.state.name == 'RUNNING':
+            break
+        time.sleep(POLL_INTERVAL)
+        logger.debug(f'Waiting for instance {instance_id} starting.')
+        retry_count += 1
+
+    if retry_count == nebius.MAX_RETRIES_TO_INSTANCE_START:
+        raise TimeoutError(
+            f'Exceeded maximum retries '
+            f'({nebius.MAX_RETRIES_TO_INSTANCE_START * POLL_INTERVAL}'
+            f' seconds) while waiting for instance {instance_id}'
+            f' to be ready.')
+
+
+def launch(cluster_name_on_cloud: str, node_type: str, platform: str,
+           preset: str, region: str, image_family: str, disk_size: int,
+           user_data: str) -> str:
+    # Each node must have a unique name to avoid conflicts between
+    # multiple worker VMs. To ensure uniqueness, a UUID is appended
+    # to the node name.
+    instance_name = (f'{cluster_name_on_cloud}-'
+                     f'{uuid.uuid4().hex[:4]}-{node_type}')
+    logger.debug(f'Launching instance: {instance_name}')
+
+    disk_name = 'disk-' + instance_name
+    cluster_id = None
+    # 8 GPU virtual machines can be grouped into a GPU cluster.
+    # The GPU clusters are built with InfiniBand secure high-speed networking.
+    # https://docs.nebius.com/compute/clusters/gpu
+    if platform in ('gpu-h100-sxm', 'gpu-h200-sxm'):
+        if preset == '8gpu-128vcpu-1600gb':
+            cluster_id = get_or_create_gpu_cluster(cluster_name_on_cloud,
+                                                   region)
+
+    project_id = get_project_by_region(region)
+    service = nebius.compute().DiskServiceClient(nebius.sdk())
+    disk = service.create(nebius.compute().CreateDiskRequest(
+        metadata=nebius.nebius_common().ResourceMetadata(
+            parent_id=project_id,
+            name=disk_name,
+        ),
+        spec=nebius.compute().DiskSpec(
+            source_image_family=nebius.compute().SourceImageFamily(
+                image_family=image_family),
+            size_gibibytes=disk_size,
+            type=nebius.compute().DiskSpec.DiskType.NETWORK_SSD,
+        ))).wait()
+    disk_id = disk.resource_id
+    retry_count = 0
+    while retry_count < nebius.MAX_RETRIES_TO_DISK_CREATE:
+        disk = service.get_by_name(nebius.nebius_common().GetByNameRequest(
+            parent_id=project_id,
+            name=disk_name,
+        )).wait()
+        if disk.status.state.name == 'READY':
+            break
+        logger.debug(f'Waiting for disk {disk_name} to be ready.')
+        time.sleep(POLL_INTERVAL)
+        retry_count += 1
+
+    if retry_count == nebius.MAX_RETRIES_TO_DISK_CREATE:
+        raise TimeoutError(
+            f'Exceeded maximum retries '
+            f'({nebius.MAX_RETRIES_TO_DISK_CREATE * POLL_INTERVAL}'
+            f' seconds) while waiting for disk {disk_name}'
+            f' to be ready.')
+
+    service = nebius.vpc().SubnetServiceClient(nebius.sdk())
+    sub_net = service.list(nebius.vpc().ListSubnetsRequest(
+        parent_id=project_id,)).wait()
+
+    service = nebius.compute().InstanceServiceClient(nebius.sdk())
+    service.create(nebius.compute().CreateInstanceRequest(
+        metadata=nebius.nebius_common().ResourceMetadata(
+            parent_id=project_id,
+            name=instance_name,
+        ),
+        spec=nebius.compute().InstanceSpec(
+            gpu_cluster=nebius.compute().InstanceGpuClusterSpec(id=cluster_id,)
+            if cluster_id is not None else None,
+            boot_disk=nebius.compute().AttachedDiskSpec(
+                attach_mode=nebius.compute(
+                ).AttachedDiskSpec.AttachMode.READ_WRITE,
+                existing_disk=nebius.compute().ExistingDisk(id=disk_id)),
+            cloud_init_user_data=user_data,
+            resources=nebius.compute().ResourcesSpec(platform=platform,
+                                                     preset=preset),
+            network_interfaces=[
+                nebius.compute().NetworkInterfaceSpec(
+                    subnet_id=sub_net.items[0].metadata.id,
+                    ip_address=nebius.compute().IPAddress(),
+                    name='network-interface-0',
+                    public_ip_address=nebius.compute().PublicIPAddress())
+            ]))).wait()
+    instance_id = ''
+    retry_count = 0
+    while retry_count < nebius.MAX_RETRIES_TO_INSTANCE_READY:
+        service = nebius.compute().InstanceServiceClient(nebius.sdk())
+        instance = service.get_by_name(nebius.nebius_common().GetByNameRequest(
+            parent_id=project_id,
+            name=instance_name,
+        )).wait()
+        if instance.status.state.name == 'STARTING':
+            instance_id = instance.metadata.id
+            break
+        time.sleep(POLL_INTERVAL)
+        logger.debug(f'Waiting for instance {instance_name} start running.')
+        retry_count += 1
+
+    if retry_count == nebius.MAX_RETRIES_TO_INSTANCE_READY:
+        raise TimeoutError(
+            f'Exceeded maximum retries '
+            f'({nebius.MAX_RETRIES_TO_INSTANCE_READY * POLL_INTERVAL}'
+            f' seconds) while waiting for instance {instance_name}'
+            f' to be ready.')
+    return instance_id
+
+
+def remove(instance_id: str) -> None:
+    """Terminates the given instance."""
+    service = nebius.compute().InstanceServiceClient(nebius.sdk())
+    result = service.get(
+        nebius.compute().GetInstanceRequest(id=instance_id)).wait()
+    disk_id = result.spec.boot_disk.existing_disk.id
+    service.delete(
+        nebius.compute().DeleteInstanceRequest(id=instance_id)).wait()
+    retry_count = 0
+    # The instance begins deleting and attempts to delete the disk.
+    # Must wait until the disk is unlocked and becomes deletable.
+    while retry_count < nebius.MAX_RETRIES_TO_DISK_DELETE:
+        try:
+            service = nebius.compute().DiskServiceClient(nebius.sdk())
+            service.delete(
+                nebius.compute().DeleteDiskRequest(id=disk_id)).wait()
+            break
+        except nebius.request_error():
+            logger.debug('Waiting for disk deletion.')
+            time.sleep(POLL_INTERVAL)
+            retry_count += 1
+
+    if retry_count == nebius.MAX_RETRIES_TO_DISK_DELETE:
+        raise TimeoutError(
+            f'Exceeded maximum retries '
+            f'({nebius.MAX_RETRIES_TO_DISK_DELETE * POLL_INTERVAL}'
+            f' seconds) while waiting for disk {disk_id}'
+            f' to be deleted.')
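Two string conventions carry most of the configuration in this module: the SkyPilot `InstanceType` for Nebius packs the platform and preset into one underscore-separated string, and the region is inferred from characters 8-10 of the project ID. A small worked example (the project ID below is made up):

```python
# The InstanceType string is split into the Nebius platform and preset,
# as done in run_instances() in sky/provision/nebius/instance.py.
instance_type = 'gpu-h100-sxm_8gpu-128vcpu-1600gb'
platform, preset = instance_type.split('_')
# platform == 'gpu-h100-sxm', preset == '8gpu-128vcpu-1600gb'

# The region is deduced from the project ID prefix: 'project-e00...' maps to
# eu-north1 and 'project-e01...' to eu-west1.
region_ids = {'eu-north1': 'e00', 'eu-west1': 'e01'}
project_id = 'project-e00aaaaaaaaaaaaaa'  # hypothetical ID
region = next(name for name, code in region_ids.items()
              if project_id[8:11] == code)
# region == 'eu-north1'
```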
sky/provision/oci/__init__.py (new file):
@@ -0,0 +1,15 @@
+"""OCI provisioner for SkyPilot.
+
+History:
+ - Hysun He (hysun.he@oracle.com) @ Oct.16, 2024: Initial implementation
+"""
+
+from sky.provision.oci.config import bootstrap_instances
+from sky.provision.oci.instance import cleanup_ports
+from sky.provision.oci.instance import get_cluster_info
+from sky.provision.oci.instance import open_ports
+from sky.provision.oci.instance import query_instances
+from sky.provision.oci.instance import run_instances
+from sky.provision.oci.instance import stop_instances
+from sky.provision.oci.instance import terminate_instances
+from sky.provision.oci.instance import wait_instances
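These flat re-exports are what plug OCI into the new provisioner interface: the framework resolves the per-cloud implementation by importing `sky.provision.<provider>` and looking up the function by name. A simplified sketch of that dispatch pattern (illustrative only, not the exact SkyPilot routing code):

```python
import importlib
from typing import Any


def call_provider(provider: str, func_name: str, *args: Any,
                  **kwargs: Any) -> Any:
    # Illustrative dispatch: import the per-cloud module and call the
    # function of the same name (run_instances, terminate_instances, ...).
    module = importlib.import_module(f'sky.provision.{provider.lower()}')
    return getattr(module, func_name)(*args, **kwargs)


# e.g. call_provider('oci', 'wait_instances', region, cluster_name, state=None)
```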
sky/provision/oci/config.py (new file):
@@ -0,0 +1,51 @@
+"""OCI configuration bootstrapping.
+
+Creates the resource group and deploys the configuration template to OCI for
+a cluster to be launched.
+
+History:
+ - Hysun He (hysun.he@oracle.com) @ Oct.16, 2024: Initial implementation
+"""
+
+from sky import exceptions
+from sky import sky_logging
+from sky.adaptors import oci as oci_adaptor
+from sky.clouds.utils import oci_utils
+from sky.provision import common
+from sky.provision.oci.query_utils import query_helper
+
+logger = sky_logging.init_logger(__name__)
+
+
+@common.log_function_start_end
+def bootstrap_instances(
+        region: str, cluster_name_on_cloud: str,
+        config: common.ProvisionConfig) -> common.ProvisionConfig:
+    """See sky/provision/__init__.py"""
+    # OCI module import and oci client
+    oci_adaptor.get_core_client(region, oci_utils.oci_config.get_profile())
+
+    # Find / create a compartment for creating instances.
+    compartment = query_helper.find_compartment(region)
+
+    # Find the configured VCN, or create a new one.
+    vcn = query_helper.find_create_vcn_subnet(region)
+    if vcn is None:
+        # pylint: disable=line-too-long
+        raise exceptions.ResourcesUnavailableError(
+            'Failed to create a new VCN, possibly you hit the resource limitation.'
+        )
+
+    node_config = config.node_config
+
+    # Subscribe the image if it is from Marketplace listing.
+    query_helper.subscribe_image(
+        compartment_id=compartment,
+        listing_id=node_config['AppCatalogListingId'],
+        resource_version=node_config['ResourceVersion'],
+        region=region,
+    )
+
+    logger.info(f'Using cluster name: {cluster_name_on_cloud}')
+
+    return config
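For reference, `bootstrap_instances` above reads only two keys from `node_config`, which are expected to be filled in by the rendered OCI cluster template (sky/templates/oci-ray.yml.j2 in this wheel). A hypothetical fragment, with placeholder values:

```python
# Placeholder values for illustration only; real values are OCI listing IDs
# and resource versions produced when the cluster YAML is rendered.
node_config = {
    'AppCatalogListingId': 'ocid1.appcataloglisting.oc1..example',
    'ResourceVersion': '1.0.0',
}
```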