skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +64 -32
- sky/adaptors/aws.py +23 -6
- sky/adaptors/azure.py +432 -15
- sky/adaptors/cloudflare.py +5 -5
- sky/adaptors/common.py +19 -9
- sky/adaptors/do.py +20 -0
- sky/adaptors/gcp.py +3 -2
- sky/adaptors/kubernetes.py +122 -88
- sky/adaptors/nebius.py +100 -0
- sky/adaptors/oci.py +39 -1
- sky/adaptors/vast.py +29 -0
- sky/admin_policy.py +101 -0
- sky/authentication.py +117 -98
- sky/backends/backend.py +52 -20
- sky/backends/backend_utils.py +669 -557
- sky/backends/cloud_vm_ray_backend.py +1099 -808
- sky/backends/local_docker_backend.py +14 -8
- sky/backends/wheel_utils.py +38 -20
- sky/benchmark/benchmark_utils.py +22 -23
- sky/check.py +76 -27
- sky/cli.py +1586 -1139
- sky/client/__init__.py +1 -0
- sky/client/cli.py +5683 -0
- sky/client/common.py +345 -0
- sky/client/sdk.py +1765 -0
- sky/cloud_stores.py +283 -19
- sky/clouds/__init__.py +7 -2
- sky/clouds/aws.py +303 -112
- sky/clouds/azure.py +185 -179
- sky/clouds/cloud.py +115 -37
- sky/clouds/cudo.py +29 -22
- sky/clouds/do.py +313 -0
- sky/clouds/fluidstack.py +44 -54
- sky/clouds/gcp.py +206 -65
- sky/clouds/ibm.py +26 -21
- sky/clouds/kubernetes.py +345 -91
- sky/clouds/lambda_cloud.py +40 -29
- sky/clouds/nebius.py +297 -0
- sky/clouds/oci.py +129 -90
- sky/clouds/paperspace.py +22 -18
- sky/clouds/runpod.py +53 -34
- sky/clouds/scp.py +28 -24
- sky/clouds/service_catalog/__init__.py +19 -13
- sky/clouds/service_catalog/aws_catalog.py +29 -12
- sky/clouds/service_catalog/azure_catalog.py +33 -6
- sky/clouds/service_catalog/common.py +95 -75
- sky/clouds/service_catalog/constants.py +3 -3
- sky/clouds/service_catalog/cudo_catalog.py +13 -3
- sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
- sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
- sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
- sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
- sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
- sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
- sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
- sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
- sky/clouds/service_catalog/do_catalog.py +111 -0
- sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
- sky/clouds/service_catalog/gcp_catalog.py +16 -2
- sky/clouds/service_catalog/ibm_catalog.py +2 -2
- sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
- sky/clouds/service_catalog/lambda_catalog.py +8 -3
- sky/clouds/service_catalog/nebius_catalog.py +116 -0
- sky/clouds/service_catalog/oci_catalog.py +31 -4
- sky/clouds/service_catalog/paperspace_catalog.py +2 -2
- sky/clouds/service_catalog/runpod_catalog.py +2 -2
- sky/clouds/service_catalog/scp_catalog.py +2 -2
- sky/clouds/service_catalog/vast_catalog.py +104 -0
- sky/clouds/service_catalog/vsphere_catalog.py +2 -2
- sky/clouds/utils/aws_utils.py +65 -0
- sky/clouds/utils/azure_utils.py +91 -0
- sky/clouds/utils/gcp_utils.py +5 -9
- sky/clouds/utils/oci_utils.py +47 -5
- sky/clouds/utils/scp_utils.py +4 -3
- sky/clouds/vast.py +280 -0
- sky/clouds/vsphere.py +22 -18
- sky/core.py +361 -107
- sky/dag.py +41 -28
- sky/data/data_transfer.py +37 -0
- sky/data/data_utils.py +211 -32
- sky/data/mounting_utils.py +182 -30
- sky/data/storage.py +2118 -270
- sky/data/storage_utils.py +126 -5
- sky/exceptions.py +179 -8
- sky/execution.py +158 -85
- sky/global_user_state.py +150 -34
- sky/jobs/__init__.py +12 -10
- sky/jobs/client/__init__.py +0 -0
- sky/jobs/client/sdk.py +302 -0
- sky/jobs/constants.py +49 -11
- sky/jobs/controller.py +161 -99
- sky/jobs/dashboard/dashboard.py +171 -25
- sky/jobs/dashboard/templates/index.html +572 -60
- sky/jobs/recovery_strategy.py +157 -156
- sky/jobs/scheduler.py +307 -0
- sky/jobs/server/__init__.py +1 -0
- sky/jobs/server/core.py +598 -0
- sky/jobs/server/dashboard_utils.py +69 -0
- sky/jobs/server/server.py +190 -0
- sky/jobs/state.py +627 -122
- sky/jobs/utils.py +615 -206
- sky/models.py +27 -0
- sky/optimizer.py +142 -83
- sky/provision/__init__.py +20 -5
- sky/provision/aws/config.py +124 -42
- sky/provision/aws/instance.py +130 -53
- sky/provision/azure/__init__.py +7 -0
- sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
- sky/provision/azure/config.py +220 -0
- sky/provision/azure/instance.py +1012 -37
- sky/provision/common.py +31 -3
- sky/provision/constants.py +25 -0
- sky/provision/cudo/__init__.py +2 -1
- sky/provision/cudo/cudo_utils.py +112 -0
- sky/provision/cudo/cudo_wrapper.py +37 -16
- sky/provision/cudo/instance.py +28 -12
- sky/provision/do/__init__.py +11 -0
- sky/provision/do/config.py +14 -0
- sky/provision/do/constants.py +10 -0
- sky/provision/do/instance.py +287 -0
- sky/provision/do/utils.py +301 -0
- sky/provision/docker_utils.py +82 -46
- sky/provision/fluidstack/fluidstack_utils.py +57 -125
- sky/provision/fluidstack/instance.py +15 -43
- sky/provision/gcp/config.py +19 -9
- sky/provision/gcp/constants.py +7 -1
- sky/provision/gcp/instance.py +55 -34
- sky/provision/gcp/instance_utils.py +339 -80
- sky/provision/gcp/mig_utils.py +210 -0
- sky/provision/instance_setup.py +172 -133
- sky/provision/kubernetes/__init__.py +1 -0
- sky/provision/kubernetes/config.py +104 -90
- sky/provision/kubernetes/constants.py +8 -0
- sky/provision/kubernetes/instance.py +680 -325
- sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
- sky/provision/kubernetes/network.py +54 -20
- sky/provision/kubernetes/network_utils.py +70 -21
- sky/provision/kubernetes/utils.py +1370 -251
- sky/provision/lambda_cloud/__init__.py +11 -0
- sky/provision/lambda_cloud/config.py +10 -0
- sky/provision/lambda_cloud/instance.py +265 -0
- sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
- sky/provision/logging.py +1 -1
- sky/provision/nebius/__init__.py +11 -0
- sky/provision/nebius/config.py +11 -0
- sky/provision/nebius/instance.py +285 -0
- sky/provision/nebius/utils.py +318 -0
- sky/provision/oci/__init__.py +15 -0
- sky/provision/oci/config.py +51 -0
- sky/provision/oci/instance.py +436 -0
- sky/provision/oci/query_utils.py +681 -0
- sky/provision/paperspace/constants.py +6 -0
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/paperspace/utils.py +2 -0
- sky/provision/provisioner.py +207 -130
- sky/provision/runpod/__init__.py +1 -0
- sky/provision/runpod/api/__init__.py +3 -0
- sky/provision/runpod/api/commands.py +119 -0
- sky/provision/runpod/api/pods.py +142 -0
- sky/provision/runpod/instance.py +64 -8
- sky/provision/runpod/utils.py +239 -23
- sky/provision/vast/__init__.py +10 -0
- sky/provision/vast/config.py +11 -0
- sky/provision/vast/instance.py +247 -0
- sky/provision/vast/utils.py +162 -0
- sky/provision/vsphere/common/vim_utils.py +1 -1
- sky/provision/vsphere/instance.py +8 -18
- sky/provision/vsphere/vsphere_utils.py +1 -1
- sky/resources.py +247 -102
- sky/serve/__init__.py +9 -9
- sky/serve/autoscalers.py +361 -299
- sky/serve/client/__init__.py +0 -0
- sky/serve/client/sdk.py +366 -0
- sky/serve/constants.py +12 -3
- sky/serve/controller.py +106 -36
- sky/serve/load_balancer.py +63 -12
- sky/serve/load_balancing_policies.py +84 -2
- sky/serve/replica_managers.py +42 -34
- sky/serve/serve_state.py +62 -32
- sky/serve/serve_utils.py +271 -160
- sky/serve/server/__init__.py +0 -0
- sky/serve/{core.py → server/core.py} +271 -90
- sky/serve/server/server.py +112 -0
- sky/serve/service.py +52 -16
- sky/serve/service_spec.py +95 -32
- sky/server/__init__.py +1 -0
- sky/server/common.py +430 -0
- sky/server/constants.py +21 -0
- sky/server/html/log.html +174 -0
- sky/server/requests/__init__.py +0 -0
- sky/server/requests/executor.py +472 -0
- sky/server/requests/payloads.py +487 -0
- sky/server/requests/queues/__init__.py +0 -0
- sky/server/requests/queues/mp_queue.py +76 -0
- sky/server/requests/requests.py +567 -0
- sky/server/requests/serializers/__init__.py +0 -0
- sky/server/requests/serializers/decoders.py +192 -0
- sky/server/requests/serializers/encoders.py +166 -0
- sky/server/server.py +1106 -0
- sky/server/stream_utils.py +141 -0
- sky/setup_files/MANIFEST.in +2 -5
- sky/setup_files/dependencies.py +159 -0
- sky/setup_files/setup.py +14 -125
- sky/sky_logging.py +59 -14
- sky/skylet/autostop_lib.py +2 -2
- sky/skylet/constants.py +183 -50
- sky/skylet/events.py +22 -10
- sky/skylet/job_lib.py +403 -258
- sky/skylet/log_lib.py +111 -71
- sky/skylet/log_lib.pyi +6 -0
- sky/skylet/providers/command_runner.py +6 -8
- sky/skylet/providers/ibm/node_provider.py +2 -2
- sky/skylet/providers/scp/config.py +11 -3
- sky/skylet/providers/scp/node_provider.py +8 -8
- sky/skylet/skylet.py +3 -1
- sky/skylet/subprocess_daemon.py +69 -17
- sky/skypilot_config.py +119 -57
- sky/task.py +205 -64
- sky/templates/aws-ray.yml.j2 +37 -7
- sky/templates/azure-ray.yml.j2 +27 -82
- sky/templates/cudo-ray.yml.j2 +7 -3
- sky/templates/do-ray.yml.j2 +98 -0
- sky/templates/fluidstack-ray.yml.j2 +7 -4
- sky/templates/gcp-ray.yml.j2 +26 -6
- sky/templates/ibm-ray.yml.j2 +3 -2
- sky/templates/jobs-controller.yaml.j2 +46 -11
- sky/templates/kubernetes-ingress.yml.j2 +7 -0
- sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
- sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
- sky/templates/kubernetes-ray.yml.j2 +292 -25
- sky/templates/lambda-ray.yml.j2 +30 -40
- sky/templates/nebius-ray.yml.j2 +79 -0
- sky/templates/oci-ray.yml.j2 +18 -57
- sky/templates/paperspace-ray.yml.j2 +10 -6
- sky/templates/runpod-ray.yml.j2 +26 -4
- sky/templates/scp-ray.yml.j2 +3 -2
- sky/templates/sky-serve-controller.yaml.j2 +12 -1
- sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
- sky/templates/vast-ray.yml.j2 +70 -0
- sky/templates/vsphere-ray.yml.j2 +8 -3
- sky/templates/websocket_proxy.py +64 -0
- sky/usage/constants.py +10 -1
- sky/usage/usage_lib.py +130 -37
- sky/utils/accelerator_registry.py +35 -51
- sky/utils/admin_policy_utils.py +147 -0
- sky/utils/annotations.py +51 -0
- sky/utils/cli_utils/status_utils.py +81 -23
- sky/utils/cluster_utils.py +356 -0
- sky/utils/command_runner.py +452 -89
- sky/utils/command_runner.pyi +77 -3
- sky/utils/common.py +54 -0
- sky/utils/common_utils.py +319 -108
- sky/utils/config_utils.py +204 -0
- sky/utils/control_master_utils.py +48 -0
- sky/utils/controller_utils.py +548 -266
- sky/utils/dag_utils.py +93 -32
- sky/utils/db_utils.py +18 -4
- sky/utils/env_options.py +29 -7
- sky/utils/kubernetes/create_cluster.sh +8 -60
- sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
- sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
- sky/utils/kubernetes/gpu_labeler.py +4 -4
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
- sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
- sky/utils/kubernetes/rsync_helper.sh +24 -0
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
- sky/utils/log_utils.py +240 -33
- sky/utils/message_utils.py +81 -0
- sky/utils/registry.py +127 -0
- sky/utils/resources_utils.py +94 -22
- sky/utils/rich_utils.py +247 -18
- sky/utils/schemas.py +284 -64
- sky/{status_lib.py → utils/status_lib.py} +12 -7
- sky/utils/subprocess_utils.py +212 -46
- sky/utils/timeline.py +12 -7
- sky/utils/ux_utils.py +168 -15
- skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
- skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
- sky/clouds/cloud_registry.py +0 -31
- sky/jobs/core.py +0 -330
- sky/skylet/providers/azure/__init__.py +0 -2
- sky/skylet/providers/azure/azure-vm-template.json +0 -301
- sky/skylet/providers/azure/config.py +0 -170
- sky/skylet/providers/azure/node_provider.py +0 -466
- sky/skylet/providers/lambda_cloud/__init__.py +0 -2
- sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
- sky/skylet/providers/oci/__init__.py +0 -2
- sky/skylet/providers/oci/node_provider.py +0 -488
- sky/skylet/providers/oci/query_helper.py +0 -383
- sky/skylet/providers/oci/utils.py +0 -21
- sky/utils/cluster_yaml_utils.py +0 -24
- sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
- skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
- skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
sky/clouds/lambda_cloud.py
CHANGED
@@ -1,15 +1,15 @@
|
|
1
1
|
"""Lambda Cloud."""
|
2
|
-
import json
|
3
2
|
import typing
|
4
|
-
from typing import Dict, Iterator, List, Optional, Tuple
|
3
|
+
from typing import Dict, Iterator, List, Optional, Tuple, Union
|
5
4
|
|
6
5
|
import requests
|
7
6
|
|
8
7
|
from sky import clouds
|
9
|
-
from sky import status_lib
|
10
8
|
from sky.clouds import service_catalog
|
11
|
-
from sky.
|
9
|
+
from sky.provision.lambda_cloud import lambda_utils
|
10
|
+
from sky.utils import registry
|
12
11
|
from sky.utils import resources_utils
|
12
|
+
from sky.utils import status_lib
|
13
13
|
|
14
14
|
if typing.TYPE_CHECKING:
|
15
15
|
# Renaming to avoid shadowing variables.
|
@@ -21,7 +21,7 @@ _CREDENTIAL_FILES = [
|
|
21
21
|
]
|
22
22
|
|
23
23
|
|
24
|
-
@
|
24
|
+
@registry.CLOUD_REGISTRY.register
|
25
25
|
class Lambda(clouds.Cloud):
|
26
26
|
"""Lambda Labs GPU Cloud."""
|
27
27
|
|
@@ -37,10 +37,6 @@ class Lambda(clouds.Cloud):
|
|
37
37
|
_CLOUD_UNSUPPORTED_FEATURES = {
|
38
38
|
clouds.CloudImplementationFeatures.STOP: 'Lambda cloud does not support stopping VMs.',
|
39
39
|
clouds.CloudImplementationFeatures.CLONE_DISK_FROM_CLUSTER: f'Migrating disk is currently not supported on {_REPR}.',
|
40
|
-
clouds.CloudImplementationFeatures.DOCKER_IMAGE: (
|
41
|
-
f'Docker image is currently not supported on {_REPR}. '
|
42
|
-
'You can try running docker command inside the `run` section in task.yaml.'
|
43
|
-
),
|
44
40
|
clouds.CloudImplementationFeatures.SPOT_INSTANCE: f'Spot instances are not supported in {_REPR}.',
|
45
41
|
clouds.CloudImplementationFeatures.IMAGE_ID: f'Specifying image ID is not supported in {_REPR}.',
|
46
42
|
clouds.CloudImplementationFeatures.CUSTOM_DISK_TIER: f'Custom disk tiers are not supported in {_REPR}.',
|
@@ -48,6 +44,9 @@ class Lambda(clouds.Cloud):
|
|
48
44
|
clouds.CloudImplementationFeatures.HOST_CONTROLLERS: f'Host controllers are not supported in {_REPR}.',
|
49
45
|
}
|
50
46
|
|
47
|
+
PROVISIONER_VERSION = clouds.ProvisionerVersion.SKYPILOT
|
48
|
+
STATUS_VERSION = clouds.StatusVersion.SKYPILOT
|
49
|
+
|
51
50
|
@classmethod
|
52
51
|
def _unsupported_features_for_resources(
|
53
52
|
cls, resources: 'resources_lib.Resources'
|
@@ -123,10 +122,10 @@ class Lambda(clouds.Cloud):
|
|
123
122
|
|
124
123
|
@classmethod
|
125
124
|
def get_default_instance_type(
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
125
|
+
cls,
|
126
|
+
cpus: Optional[str] = None,
|
127
|
+
memory: Optional[str] = None,
|
128
|
+
disk_tier: Optional['resources_utils.DiskTier'] = None
|
130
129
|
) -> Optional[str]:
|
131
130
|
return service_catalog.get_default_instance_type(cpus=cpus,
|
132
131
|
memory=memory,
|
@@ -137,7 +136,7 @@ class Lambda(clouds.Cloud):
|
|
137
136
|
def get_accelerators_from_instance_type(
|
138
137
|
cls,
|
139
138
|
instance_type: str,
|
140
|
-
) -> Optional[Dict[str, int]]:
|
139
|
+
) -> Optional[Dict[str, Union[int, float]]]:
|
141
140
|
return service_catalog.get_accelerators_from_instance_type(
|
142
141
|
instance_type, clouds='lambda')
|
143
142
|
|
@@ -156,34 +155,43 @@ class Lambda(clouds.Cloud):
|
|
156
155
|
def make_deploy_resources_variables(
|
157
156
|
self,
|
158
157
|
resources: 'resources_lib.Resources',
|
159
|
-
|
158
|
+
cluster_name: 'resources_utils.ClusterName',
|
160
159
|
region: 'clouds.Region',
|
161
160
|
zones: Optional[List['clouds.Zone']],
|
161
|
+
num_nodes: int,
|
162
162
|
dryrun: bool = False) -> Dict[str, Optional[str]]:
|
163
|
-
del
|
163
|
+
del cluster_name, dryrun # Unused.
|
164
164
|
assert zones is None, 'Lambda does not support zones.'
|
165
165
|
|
166
166
|
r = resources
|
167
167
|
acc_dict = self.get_accelerators_from_instance_type(r.instance_type)
|
168
|
-
|
169
|
-
|
170
|
-
else:
|
171
|
-
custom_resources = None
|
168
|
+
custom_resources = resources_utils.make_ray_custom_resources_str(
|
169
|
+
acc_dict)
|
172
170
|
|
173
|
-
|
171
|
+
resources_vars = {
|
174
172
|
'instance_type': resources.instance_type,
|
175
173
|
'custom_resources': custom_resources,
|
176
174
|
'region': region.name,
|
177
175
|
}
|
178
176
|
|
177
|
+
if acc_dict is not None:
|
178
|
+
# Lambda cloud's docker runtime information does not contain
|
179
|
+
# 'nvidia-container-runtime', causing no GPU option is added to
|
180
|
+
# the docker run command. We patch this by adding it here.
|
181
|
+
resources_vars['docker_run_options'] = ['--gpus all']
|
182
|
+
|
183
|
+
return resources_vars
|
184
|
+
|
179
185
|
def _get_feasible_launchable_resources(
|
180
186
|
self, resources: 'resources_lib.Resources'
|
181
|
-
) ->
|
187
|
+
) -> 'resources_utils.FeasibleResources':
|
182
188
|
if resources.instance_type is not None:
|
183
189
|
assert resources.is_launchable(), resources
|
184
190
|
# Accelerators are part of the instance type in Lambda Cloud
|
185
191
|
resources = resources.copy(accelerators=None)
|
186
|
-
return
|
192
|
+
# TODO: Add hints to all return values in this method to help
|
193
|
+
# users understand why the resources are not launchable.
|
194
|
+
return resources_utils.FeasibleResources([resources], [], None)
|
187
195
|
|
188
196
|
def _make(instance_list):
|
189
197
|
resource_list = []
|
@@ -209,9 +217,10 @@ class Lambda(clouds.Cloud):
|
|
209
217
|
memory=resources.memory,
|
210
218
|
disk_tier=resources.disk_tier)
|
211
219
|
if default_instance_type is None:
|
212
|
-
return ([], [])
|
220
|
+
return resources_utils.FeasibleResources([], [], None)
|
213
221
|
else:
|
214
|
-
return (
|
222
|
+
return resources_utils.FeasibleResources(
|
223
|
+
_make([default_instance_type]), [], None)
|
215
224
|
|
216
225
|
assert len(accelerators) == 1, resources
|
217
226
|
acc, acc_count = list(accelerators.items())[0]
|
@@ -226,8 +235,10 @@ class Lambda(clouds.Cloud):
|
|
226
235
|
zone=resources.zone,
|
227
236
|
clouds='lambda')
|
228
237
|
if instance_list is None:
|
229
|
-
return ([], fuzzy_candidate_list
|
230
|
-
|
238
|
+
return resources_utils.FeasibleResources([], fuzzy_candidate_list,
|
239
|
+
None)
|
240
|
+
return resources_utils.FeasibleResources(_make(instance_list),
|
241
|
+
fuzzy_candidate_list, None)
|
231
242
|
|
232
243
|
@classmethod
|
233
244
|
def check_credentials(cls) -> Tuple[bool, Optional[str]]:
|
@@ -253,8 +264,8 @@ class Lambda(clouds.Cloud):
|
|
253
264
|
}
|
254
265
|
|
255
266
|
@classmethod
|
256
|
-
def
|
257
|
-
# TODO(ewzeng): Implement
|
267
|
+
def get_user_identities(cls) -> Optional[List[List[str]]]:
|
268
|
+
# TODO(ewzeng): Implement get_user_identities for Lambda
|
258
269
|
return None
|
259
270
|
|
260
271
|
def instance_type_exists(self, instance_type: str) -> bool:
|
sky/clouds/nebius.py
ADDED
@@ -0,0 +1,297 @@
|
|
1
|
+
""" Nebius Cloud. """
|
2
|
+
import logging
|
3
|
+
import typing
|
4
|
+
from typing import Dict, Iterator, List, Optional, Tuple, Union
|
5
|
+
|
6
|
+
from sky import clouds
|
7
|
+
from sky.adaptors import nebius
|
8
|
+
from sky.clouds import service_catalog
|
9
|
+
from sky.utils import registry
|
10
|
+
from sky.utils import resources_utils
|
11
|
+
|
12
|
+
if typing.TYPE_CHECKING:
|
13
|
+
from sky import resources as resources_lib
|
14
|
+
|
15
|
+
# File names of the Nebius credentials (tenant ID, IAM token, project ID).
_CREDENTIAL_FILES = [
    nebius.NEBIUS_TENANT_ID_FILENAME,
    nebius.NEBIUS_IAM_TOKEN_FILENAME,
    nebius.NEBIUS_PROJECT_ID_FILENAME,
]
|
21
|
+
|
22
|
+
|
23
|
+
@registry.CLOUD_REGISTRY.register
class Nebius(clouds.Cloud):
    """Nebius GPU Cloud.

    Catalog-backed cloud implementation. Instance types, prices and regions
    are looked up through ``service_catalog`` with ``clouds='nebius'``.
    Zones and spot instances are not supported.
    """
    _REPR = 'Nebius'
    _CLOUD_UNSUPPORTED_FEATURES = {
        # Autostop functionality can be implemented, but currently,
        # there is only a single flag for both autostop and autodown.
        clouds.CloudImplementationFeatures.AUTO_TERMINATE:
            ('Autodown and Autostop not supported. Can\'t delete disk.'),
        clouds.CloudImplementationFeatures.SPOT_INSTANCE:
            ('Spot is not supported, as Nebius API does not implement spot.'),
        clouds.CloudImplementationFeatures.CLONE_DISK_FROM_CLUSTER:
            (f'Migrating disk is currently not supported on {_REPR}.'),
        clouds.CloudImplementationFeatures.DOCKER_IMAGE:
            (f'Docker image is currently not supported on {_REPR}. '
             'You can try running docker command inside the '
             '`run` section in task.yaml.'),
        clouds.CloudImplementationFeatures.CUSTOM_DISK_TIER:
            (f'Custom disk tier is currently not supported on {_REPR}.'),
    }
    # Nebius limits instance names to <= 63 characters (hostname length).
    # 63 - 8 - 5 = 50 characters are left for the cluster name, since
    # we add 4 characters from a UUID to make the name unique (`-xxxx`) and
    # our provisioner adds an additional `-worker`.
    _MAX_CLUSTER_NAME_LEN_LIMIT = 50
    _regions: List[clouds.Region] = []

    # Using the latest SkyPilot provisioner API to provision and check status.
    PROVISIONER_VERSION = clouds.ProvisionerVersion.SKYPILOT
    STATUS_VERSION = clouds.StatusVersion.SKYPILOT

    @classmethod
    def _unsupported_features_for_resources(
        cls, resources: 'resources_lib.Resources'
    ) -> Dict[clouds.CloudImplementationFeatures, str]:
        """Returns the unsupported features; identical for all resources."""
        del resources  # unused
        return cls._CLOUD_UNSUPPORTED_FEATURES

    @classmethod
    def _max_cluster_name_length(cls) -> Optional[int]:
        """Returns the maximum cluster name length accepted for Nebius."""
        return cls._MAX_CLUSTER_NAME_LEN_LIMIT

    @classmethod
    def regions_with_offering(cls, instance_type: str,
                              accelerators: Optional[Dict[str, int]],
                              use_spot: bool, region: Optional[str],
                              zone: Optional[str]) -> List[clouds.Region]:
        """Returns the catalog regions that offer ``instance_type``.

        Spot requests always yield an empty list. When ``region`` is given,
        only the region with that name is kept.
        """
        assert zone is None, 'Nebius does not support zones.'
        del accelerators, zone  # unused
        if use_spot:
            return []
        regions = service_catalog.get_region_zones_for_instance_type(
            instance_type, use_spot, 'nebius')

        if region is not None:
            regions = [r for r in regions if r.name == region]
        return regions

    @classmethod
    def get_vcpus_mem_from_instance_type(
        cls,
        instance_type: str,
    ) -> Tuple[Optional[float], Optional[float]]:
        """Returns the catalog's (vCPUs, memory) pair for ``instance_type``."""
        return service_catalog.get_vcpus_mem_from_instance_type(instance_type,
                                                                clouds='nebius')

    @classmethod
    def zones_provision_loop(
        cls,
        *,
        region: str,
        num_nodes: int,
        instance_type: str,
        accelerators: Optional[Dict[str, int]] = None,
        use_spot: bool = False,
    ) -> Iterator[None]:
        """Yields once per matching region; the yielded zones are ``None``."""
        del num_nodes  # unused
        regions = cls.regions_with_offering(instance_type,
                                            accelerators,
                                            use_spot,
                                            region=region,
                                            zone=None)
        for r in regions:
            assert r.zones is None, r
            yield r.zones

    def instance_type_to_hourly_cost(self,
                                     instance_type: str,
                                     use_spot: bool,
                                     region: Optional[str] = None,
                                     zone: Optional[str] = None) -> float:
        """Returns the catalog's hourly cost for ``instance_type``."""
        return service_catalog.get_hourly_cost(instance_type,
                                               use_spot=use_spot,
                                               region=region,
                                               zone=zone,
                                               clouds='nebius')

    def accelerators_to_hourly_cost(self,
                                    accelerators: Dict[str, int],
                                    use_spot: bool,
                                    region: Optional[str] = None,
                                    zone: Optional[str] = None) -> float:
        """Returns the hourly cost of the accelerators, in dollars/hour."""
        del accelerators, use_spot, region, zone  # unused
        return 0.0

    def get_egress_cost(self, num_gigabytes: float) -> float:
        """Returns the egress cost; always 0.0 for Nebius."""
        del num_gigabytes  # unused
        return 0.0

    def __repr__(self):
        return self._REPR

    def is_same_cloud(self, other: clouds.Cloud) -> bool:
        """Returns true if the two clouds are the same cloud type."""
        return isinstance(other, Nebius)

    @classmethod
    def get_default_instance_type(
        cls,
        cpus: Optional[str] = None,
        memory: Optional[str] = None,
        disk_tier: Optional[resources_utils.DiskTier] = None
    ) -> Optional[str]:
        """Returns the default instance type for Nebius."""
        return service_catalog.get_default_instance_type(cpus=cpus,
                                                         memory=memory,
                                                         disk_tier=disk_tier,
                                                         clouds='nebius')

    @classmethod
    def get_accelerators_from_instance_type(
        cls,
        instance_type: str,
    ) -> Optional[Dict[str, Union[int, float]]]:
        """Returns the catalog's accelerators for ``instance_type``."""
        return service_catalog.get_accelerators_from_instance_type(
            instance_type, clouds='nebius')

    @classmethod
    def get_zone_shell_cmd(cls) -> Optional[str]:
        """Returns None: no zone lookup command is provided for Nebius."""
        return None

    def make_deploy_resources_variables(
            self,
            resources: 'resources_lib.Resources',
            cluster_name: resources_utils.ClusterName,
            region: 'clouds.Region',
            zones: Optional[List['clouds.Zone']],
            num_nodes: int,
            dryrun: bool = False) -> Dict[str, Optional[str]]:
        """Builds the template variables used to deploy ``resources``.

        The image family is derived from the platform, i.e. the part of the
        instance type before the first underscore.

        Raises:
            RuntimeError: if the platform of the instance type is not one of
                the platforms handled below.
        """
        del cluster_name, num_nodes, dryrun  # unused
        assert zones is None, ('Nebius does not support zones', zones)

        acc_dict = self.get_accelerators_from_instance_type(
            resources.instance_type)
        custom_resources = resources_utils.make_ray_custom_resources_str(
            acc_dict)
        # partition() never fails on unexpected names, so a malformed
        # instance type reaches the explicit RuntimeError below instead of
        # an opaque tuple-unpacking ValueError.
        platform, _, _ = resources.instance_type.partition('_')

        if platform in ('cpu-d3', 'cpu-e2'):
            image_family = 'ubuntu22.04-driverless'
        elif platform in ('gpu-h100-sxm', 'gpu-h200-sxm', 'gpu-l40s-a'):
            image_family = 'ubuntu22.04-cuda12'
        else:
            raise RuntimeError('Unsupported instance type for Nebius cloud:'
                               f' {resources.instance_type}')
        return {
            'instance_type': resources.instance_type,
            'custom_resources': custom_resources,
            'region': region.name,
            'image_id': image_family,
            # Nebius does not support specific zones.
            'zones': None,
        }

    def _get_feasible_launchable_resources(
        self, resources: 'resources_lib.Resources'
    ) -> 'resources_utils.FeasibleResources':
        """Returns a list of feasible resources for the given resources."""
        if resources.instance_type is not None:
            assert resources.is_launchable(), resources
            resources = resources.copy(accelerators=None)
            return resources_utils.FeasibleResources([resources], [], None)

        def _make(instance_list):
            # One launchable copy per instance type; accelerators and cpus
            # are cleared on each copy.
            return [
                resources.copy(
                    cloud=Nebius(),
                    instance_type=instance_type,
                    accelerators=None,
                    cpus=None,
                ) for instance_type in instance_list
            ]

        # Currently, handle a filter on accelerators only.
        accelerators = resources.accelerators
        if accelerators is None:
            # Return a default instance type
            default_instance_type = Nebius.get_default_instance_type(
                cpus=resources.cpus,
                memory=resources.memory,
                disk_tier=resources.disk_tier)
            if default_instance_type is None:
                # TODO: Add hints to all return values in this method to help
                #  users understand why the resources are not launchable.
                return resources_utils.FeasibleResources([], [], None)
            return resources_utils.FeasibleResources(
                _make([default_instance_type]), [], None)

        assert len(accelerators) == 1, resources
        acc, acc_count = next(iter(accelerators.items()))
        # NOTE(review): resources.memory is not forwarded to this lookup --
        # confirm that this is intended.
        (instance_list, fuzzy_candidate_list
        ) = service_catalog.get_instance_type_for_accelerator(
            acc,
            acc_count,
            use_spot=resources.use_spot,
            cpus=resources.cpus,
            region=resources.region,
            zone=resources.zone,
            clouds='nebius')
        if instance_list is None:
            return resources_utils.FeasibleResources([], fuzzy_candidate_list,
                                                     None)
        return resources_utils.FeasibleResources(_make(instance_list),
                                                 fuzzy_candidate_list, None)

    @classmethod
    def check_credentials(cls) -> Tuple[bool, Optional[str]]:
        """Verify that the user has valid credentials for Nebius.

        Returns:
            ``(True, None)`` when an IAM token and a tenant ID are available
            and listing the tenant's projects succeeds; otherwise
            ``(False, hint)`` where ``hint`` explains how to set up the
            missing credentials.
        """
        logging.debug('Nebius cloud check credentials')
        token = nebius.get_iam_token()
        token_msg = (
            ' Credentials can be set up by running: \n'
            f' $ nebius iam get-access-token > {nebius.NEBIUS_IAM_TOKEN_PATH} \n'  # pylint: disable=line-too-long
        )
        # NOTE(review): `$NEBIUS_TENANT_ID_PATH` below looks like it should
        # name a variable holding the tenant ID rather than a path -- confirm.
        tenant_msg = (
            ' Copy your tenant ID from the web console and save it to file \n'
            f' $ echo $NEBIUS_TENANT_ID_PATH > {nebius.NEBIUS_TENANT_ID_PATH} \n'  # pylint: disable=line-too-long
            ' Or if you have 1 tenant you can run:\n'
            f' $ nebius --format json iam whoami|jq -r \'.user_profile.tenants[0].tenant_id\' > {nebius.NEBIUS_TENANT_ID_PATH} \n'  # pylint: disable=line-too-long
        )
        if token is None:
            return False, token_msg
        sdk = nebius.sdk()
        tenant_id = nebius.get_tenant_id()
        if tenant_id is None:
            return False, tenant_msg
        try:
            # Listing the tenant's projects exercises both the token and the
            # tenant ID.
            service = nebius.iam().ProjectServiceClient(sdk)
            service.list(
                nebius.iam().ListProjectsRequest(parent_id=tenant_id)).wait()
        except nebius.request_error() as e:
            return False, (
                f'{e.status} \n'  # First line is indented by 4 spaces
                f'{token_msg}'
                f'{tenant_msg}')
        return True, None

    def get_credential_file_mounts(self) -> Dict[str, str]:
        """Maps each ``~/.nebius`` credential file to the same remote path."""
        return {
            f'~/.nebius/{filename}': f'~/.nebius/{filename}'
            for filename in _CREDENTIAL_FILES
        }

    @classmethod
    def get_user_identities(cls) -> Optional[List[List[str]]]:
        """Returns the user identities; not implemented for Nebius.

        Named ``get_user_identities`` to match the hook that the Lambda cloud
        implements in this release.
        """
        # NOTE: used for very advanced SkyPilot functionality
        # Can implement later if desired
        return None

    @classmethod
    def get_current_user_identity(cls) -> Optional[List[str]]:
        """Single-identity variant, kept for backward compatibility."""
        return None

    def instance_type_exists(self, instance_type: str) -> bool:
        """Returns whether ``instance_type`` exists in the Nebius catalog."""
        return service_catalog.instance_type_exists(instance_type, 'nebius')

    def validate_region_zone(self, region: Optional[str], zone: Optional[str]):
        """Validates ``region``/``zone`` against the Nebius catalog."""
        return service_catalog.validate_region_zone(region,
                                                    zone,
                                                    clouds='nebius')
|