skypilot-nightly 1.0.0.dev20250219__py3-none-any.whl → 1.0.0.dev20250221__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +4 -2
- sky/adaptors/nebius.py +85 -0
- sky/backends/backend_utils.py +8 -0
- sky/backends/cloud_vm_ray_backend.py +10 -2
- sky/client/sdk.py +8 -3
- sky/clouds/__init__.py +2 -0
- sky/clouds/nebius.py +294 -0
- sky/clouds/service_catalog/constants.py +1 -1
- sky/clouds/service_catalog/nebius_catalog.py +116 -0
- sky/jobs/controller.py +17 -0
- sky/jobs/server/core.py +31 -3
- sky/provision/__init__.py +1 -0
- sky/provision/kubernetes/instance.py +5 -1
- sky/provision/kubernetes/utils.py +8 -7
- sky/provision/nebius/__init__.py +11 -0
- sky/provision/nebius/config.py +11 -0
- sky/provision/nebius/instance.py +285 -0
- sky/provision/nebius/utils.py +310 -0
- sky/server/common.py +5 -7
- sky/server/requests/executor.py +94 -87
- sky/server/server.py +10 -5
- sky/server/stream_utils.py +8 -11
- sky/setup_files/dependencies.py +9 -1
- sky/skylet/constants.py +3 -6
- sky/task.py +6 -0
- sky/templates/jobs-controller.yaml.j2 +3 -0
- sky/templates/nebius-ray.yml.j2 +79 -0
- sky/utils/common_utils.py +38 -0
- sky/utils/controller_utils.py +66 -2
- {skypilot_nightly-1.0.0.dev20250219.dist-info → skypilot_nightly-1.0.0.dev20250221.dist-info}/METADATA +8 -4
- {skypilot_nightly-1.0.0.dev20250219.dist-info → skypilot_nightly-1.0.0.dev20250221.dist-info}/RECORD +35 -27
- {skypilot_nightly-1.0.0.dev20250219.dist-info → skypilot_nightly-1.0.0.dev20250221.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250219.dist-info → skypilot_nightly-1.0.0.dev20250221.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250219.dist-info → skypilot_nightly-1.0.0.dev20250221.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250219.dist-info → skypilot_nightly-1.0.0.dev20250221.dist-info}/top_level.txt +0 -0
sky/__init__.py
CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
 import urllib.request
 
 # Replaced with the current commit when building the wheels.
-_SKYPILOT_COMMIT_SHA = '
+_SKYPILOT_COMMIT_SHA = 'aa3c387f04fbdd4468751b7d66fcb381bd3449dc'
 
 
 def _get_git_commit():
@@ -35,7 +35,7 @@ def _get_git_commit():
 
 
 __commit__ = _get_git_commit()
-__version__ = '1.0.0.dev20250219'
+__version__ = '1.0.0.dev20250221'
 __root_dir__ = os.path.dirname(os.path.abspath(__file__))
 
 
@@ -143,6 +143,7 @@ RunPod = clouds.RunPod
 Vast = clouds.Vast
 Vsphere = clouds.Vsphere
 Fluidstack = clouds.Fluidstack
+Nebius = clouds.Nebius
 
 __all__ = [
     '__version__',
@@ -161,6 +162,7 @@ __all__ = [
     'SCP',
     'Vsphere',
     'Fluidstack',
+    'Nebius',
     'Optimizer',
     'OptimizeTarget',
     'backends',

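For context (illustration only, not part of the diff): the two added lines above re-export the new cloud at the package top level. A minimal sketch of what this enables, assuming the new nightly wheel is installed:

    import sky

    # 'Nebius' is now listed alongside the other clouds.
    assert 'Nebius' in sky.__all__
    print(sky.Nebius)  # the sky.clouds.nebius.Nebius class added in this release
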
sky/adaptors/nebius.py
ADDED
@@ -0,0 +1,85 @@
+"""Nebius cloud adaptor."""
+import os
+
+from sky.adaptors import common
+
+NEBIUS_TENANT_ID_FILENAME = 'NEBIUS_TENANT_ID.txt'
+NEBIUS_IAM_TOKEN_FILENAME = 'NEBIUS_IAM_TOKEN.txt'
+NEBIUS_TENANT_ID_PATH = '~/.nebius/' + NEBIUS_TENANT_ID_FILENAME
+NEBIUS_IAM_TOKEN_PATH = '~/.nebius/' + NEBIUS_IAM_TOKEN_FILENAME
+
+MAX_RETRIES_TO_DISK_CREATE = 120
+MAX_RETRIES_TO_INSTANCE_STOP = 120
+MAX_RETRIES_TO_INSTANCE_START = 120
+MAX_RETRIES_TO_INSTANCE_READY = 240
+
+MAX_RETRIES_TO_DISK_DELETE = 120
+MAX_RETRIES_TO_INSTANCE_WAIT = 120  # Maximum number of retries
+
+POLL_INTERVAL = 5
+
+_iam_token = None
+_tenant_id = None
+
+nebius = common.LazyImport(
+    'nebius',
+    import_error_message='Failed to import dependencies for Nebius AI Cloud. '
+    'Try running: pip install "skypilot[nebius]"',
+    # https://github.com/grpc/grpc/issues/37642 to avoid spam in console
+    set_loggers=lambda: os.environ.update({'GRPC_VERBOSITY': 'NONE'}))
+
+
+def request_error():
+    return nebius.aio.service_error.RequestError
+
+
+def compute():
+    # pylint: disable=import-outside-toplevel
+    from nebius.api.nebius.compute import v1 as compute_v1
+    return compute_v1
+
+
+def iam():
+    # pylint: disable=import-outside-toplevel
+    from nebius.api.nebius.iam import v1 as iam_v1
+    return iam_v1
+
+
+def nebius_common():
+    # pylint: disable=import-outside-toplevel
+    from nebius.api.nebius.common import v1 as common_v1
+    return common_v1
+
+
+def vpc():
+    # pylint: disable=import-outside-toplevel
+    from nebius.api.nebius.vpc import v1 as vpc_v1
+    return vpc_v1
+
+
+def get_iam_token():
+    global _iam_token
+    if _iam_token is None:
+        try:
+            with open(os.path.expanduser(NEBIUS_IAM_TOKEN_PATH),
+                      encoding='utf-8') as file:
+                _iam_token = file.read().strip()
+        except FileNotFoundError:
+            return None
+    return _iam_token
+
+
+def get_tenant_id():
+    global _tenant_id
+    if _tenant_id is None:
+        try:
+            with open(os.path.expanduser(NEBIUS_TENANT_ID_PATH),
+                      encoding='utf-8') as file:
+                _tenant_id = file.read().strip()
+        except FileNotFoundError:
+            return None
+    return _tenant_id
+
+
+def sdk():
+    return nebius.sdk.SDK(credentials=get_iam_token())

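A short usage sketch of the adaptor above (illustration only, not part of the diff). It mirrors how check_credentials() in sky/clouds/nebius.py uses these helpers, and assumes the nebius SDK extra is installed and the two files under ~/.nebius/ exist:

    from sky.adaptors import nebius

    token = nebius.get_iam_token()      # contents of ~/.nebius/NEBIUS_IAM_TOKEN.txt, or None
    tenant_id = nebius.get_tenant_id()  # contents of ~/.nebius/NEBIUS_TENANT_ID.txt, or None
    if token is not None and tenant_id is not None:
        sdk = nebius.sdk()  # nebius.sdk.SDK(credentials=token), imported lazily
        nebius.iam().ProjectServiceClient(sdk).list(
            nebius.iam().ListProjectsRequest(parent_id=tenant_id)).wait()
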
sky/backends/backend_utils.py
CHANGED
@@ -197,6 +197,9 @@ def _get_yaml_path_from_cluster_name(cluster_name: str,
     return str(output_path)
 
 
+# Add retry for the file mounts optimization, as the underlying cp command may
+# experience transient errors, #4758.
+@common_utils.retry
 def _optimize_file_mounts(yaml_path: str) -> None:
     """Optimize file mounts in the given ray yaml file.
 
@@ -206,6 +209,10 @@ def _optimize_file_mounts(yaml_path: str) -> None:
     - wheel
     - credentials
     Format is {dst: src}.
+
+    Raises:
+        subprocess.CalledProcessError: If the file mounts are failed to be
+            copied.
     """
     yaml_config = common_utils.read_yaml(yaml_path)
 
@@ -863,6 +870,7 @@ def _add_auth_to_cluster_config(cloud: clouds.Cloud, cluster_config_file: str):
             clouds.Paperspace,
             clouds.Azure,
             clouds.DO,
+            clouds.Nebius,
     )):
         config = auth.configure_ssh_info(config)
     elif isinstance(cloud, clouds.GCP):

sky/backends/cloud_vm_ray_backend.py
CHANGED
@@ -191,7 +191,8 @@ def _get_cluster_config_template(cloud):
         clouds.Kubernetes: 'kubernetes-ray.yml.j2',
         clouds.Vsphere: 'vsphere-ray.yml.j2',
         clouds.Vast: 'vast-ray.yml.j2',
-        clouds.Fluidstack: 'fluidstack-ray.yml.j2'
+        clouds.Fluidstack: 'fluidstack-ray.yml.j2',
+        clouds.Nebius: 'nebius-ray.yml.j2'
     }
     return cloud_to_template[type(cloud)]
 
@@ -3233,7 +3234,14 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         all_file_mounts: Optional[Dict[Path, Path]],
         storage_mounts: Optional[Dict[Path, storage_lib.Storage]],
     ) -> None:
-        """Mounts all user files to the remote nodes."""
+        """Mounts all user files to the remote nodes.
+
+        Note: This does not handle COPY storage_mounts. These should have
+        already been translated into file_mounts by task.sync_storage_mounts().
+
+        TODO: Delete COPY storage_mounts in task.sync_storage_mounts(), and
+        assert here that all storage_mounts are MOUNT mode.
+        """
         with rich_utils.safe_status(ux_utils.spinner_message('Syncing files')):
             controller_utils.replace_skypilot_config_path_in_file_mounts(
                 handle.launched_resources.cloud, all_file_mounts)

sky/client/sdk.py
CHANGED
@@ -1503,14 +1503,14 @@ def stream_and_get(
 
 @usage_lib.entrypoint
 @annotations.client_api
-def api_cancel(request_ids: Optional[List[str]] = None,
+def api_cancel(request_ids: Optional[Union[str, List[str]]] = None,
                all_users: bool = False,
                silent: bool = False) -> server_common.RequestId:
     """Aborts a request or all requests.
 
     Args:
-
-
+        request_ids: The request ID(s) to abort. Can be a single string or a
+            list of strings.
         all_users: Whether to abort all requests from all users.
         silent: Whether to suppress the output.
 
@@ -1528,6 +1528,11 @@ def api_cancel(request_ids: Optional[List[str]] = None,
     user_id = None
     if not all_users:
         user_id = common_utils.get_user_hash()
+
+    # Convert single request ID to list if needed
+    if isinstance(request_ids, str):
+        request_ids = [request_ids]
+
     body = payloads.RequestCancelBody(request_ids=request_ids, user_id=user_id)
     if all_users:
         echo('Cancelling all users\' requests...')

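For illustration (not part of the diff): after this change api_cancel accepts either a single request ID string or a list of IDs; the IDs below are hypothetical placeholders.

    from sky.client import sdk

    # Both forms are now accepted; a bare string is wrapped into a list internally.
    cancel_request = sdk.api_cancel('abc123-request-id')
    cancel_request = sdk.api_cancel(['abc123-request-id', 'def456-request-id'])
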
sky/clouds/__init__.py
CHANGED
@@ -20,6 +20,7 @@ from sky.clouds.gcp import GCP
 from sky.clouds.ibm import IBM
 from sky.clouds.kubernetes import Kubernetes
 from sky.clouds.lambda_cloud import Lambda
+from sky.clouds.nebius import Nebius
 from sky.clouds.oci import OCI
 from sky.clouds.paperspace import Paperspace
 from sky.clouds.runpod import RunPod
@@ -49,6 +50,7 @@ __all__ = [
     'ProvisionerVersion',
     'StatusVersion',
     'Fluidstack',
+    'Nebius',
     # Utility functions
     'cloud_in_iterable',
 ]

sky/clouds/nebius.py
ADDED
@@ -0,0 +1,294 @@
+""" Nebius Cloud. """
+import logging
+import typing
+from typing import Dict, Iterator, List, Optional, Tuple, Union
+
+from sky import clouds
+from sky.adaptors import nebius
+from sky.clouds import service_catalog
+from sky.utils import registry
+from sky.utils import resources_utils
+
+if typing.TYPE_CHECKING:
+    from sky import resources as resources_lib
+
+_CREDENTIAL_FILES = [
+    # credential files for Nebius
+    nebius.NEBIUS_TENANT_ID_FILENAME,
+    nebius.NEBIUS_IAM_TOKEN_FILENAME
+]
+
+
+@registry.CLOUD_REGISTRY.register
+class Nebius(clouds.Cloud):
+    """Nebius GPU Cloud"""
+    _REPR = 'Nebius'
+    _CLOUD_UNSUPPORTED_FEATURES = {
+        clouds.CloudImplementationFeatures.AUTO_TERMINATE:
+            ('Autodown and Autostop not supported. Can\'t delete disk.'),
+        # Autostop functionality can be implemented, but currently,
+        # there is only a single flag for both autostop and autodown.
+        clouds.CloudImplementationFeatures.SPOT_INSTANCE:
+            ('Spot is not supported, as Nebius API does not implement spot.'),
+        clouds.CloudImplementationFeatures.CLONE_DISK_FROM_CLUSTER:
+            (f'Migrating disk is currently not supported on {_REPR}.'),
+        clouds.CloudImplementationFeatures.DOCKER_IMAGE:
+            (f'Docker image is currently not supported on {_REPR}. '
+             'You can try running docker command inside the '
+             '`run` section in task.yaml.'),
+        clouds.CloudImplementationFeatures.CUSTOM_DISK_TIER:
+            (f'Custom disk tier is currently not supported on {_REPR}.'),
+    }
+    # Nebius maximum instance name length defined as <= 63 as a hostname length
+    # 63 - 8 - 5 = 50 characters since
+    # we add 4 character from UUID to make uniq `-xxxx`
+    # our provisioner adds additional `-worker`.
+    _MAX_CLUSTER_NAME_LEN_LIMIT = 50
+    _regions: List[clouds.Region] = []
+
+    # Using the latest SkyPilot provisioner API to provision and check status.
+    PROVISIONER_VERSION = clouds.ProvisionerVersion.SKYPILOT
+    STATUS_VERSION = clouds.StatusVersion.SKYPILOT
+
+    @classmethod
+    def _unsupported_features_for_resources(
+            cls, resources: 'resources_lib.Resources'
+    ) -> Dict[clouds.CloudImplementationFeatures, str]:
+        del resources  # unused
+        return cls._CLOUD_UNSUPPORTED_FEATURES
+
+    @classmethod
+    def _max_cluster_name_length(cls) -> Optional[int]:
+        return cls._MAX_CLUSTER_NAME_LEN_LIMIT
+
+    @classmethod
+    def regions_with_offering(cls, instance_type: str,
+                              accelerators: Optional[Dict[str, int]],
+                              use_spot: bool, region: Optional[str],
+                              zone: Optional[str]) -> List[clouds.Region]:
+        assert zone is None, 'Nebius does not support zones.'
+        del accelerators, zone  # unused
+        if use_spot:
+            return []
+        regions = service_catalog.get_region_zones_for_instance_type(
+            instance_type, use_spot, 'nebius')
+
+        if region is not None:
+            regions = [r for r in regions if r.name == region]
+        return regions
+
+    @classmethod
+    def get_vcpus_mem_from_instance_type(
+        cls,
+        instance_type: str,
+    ) -> Tuple[Optional[float], Optional[float]]:
+        return service_catalog.get_vcpus_mem_from_instance_type(instance_type,
+                                                                clouds='nebius')
+
+    @classmethod
+    def zones_provision_loop(
+        cls,
+        *,
+        region: str,
+        num_nodes: int,
+        instance_type: str,
+        accelerators: Optional[Dict[str, int]] = None,
+        use_spot: bool = False,
+    ) -> Iterator[None]:
+        del num_nodes  # unused
+        regions = cls.regions_with_offering(instance_type,
+                                            accelerators,
+                                            use_spot,
+                                            region=region,
+                                            zone=None)
+        for r in regions:
+            assert r.zones is None, r
+            yield r.zones
+
+    def instance_type_to_hourly_cost(self,
+                                     instance_type: str,
+                                     use_spot: bool,
+                                     region: Optional[str] = None,
+                                     zone: Optional[str] = None) -> float:
+        return service_catalog.get_hourly_cost(instance_type,
+                                               use_spot=use_spot,
+                                               region=region,
+                                               zone=zone,
+                                               clouds='nebius')
+
+    def accelerators_to_hourly_cost(self,
+                                    accelerators: Dict[str, int],
+                                    use_spot: bool,
+                                    region: Optional[str] = None,
+                                    zone: Optional[str] = None) -> float:
+        """Returns the hourly cost of the accelerators, in dollars/hour."""
+        del accelerators, use_spot, region, zone  # unused
+        return 0.0
+
+    def get_egress_cost(self, num_gigabytes: float) -> float:
+        return 0.0
+
+    def __repr__(self):
+        return self._REPR
+
+    def is_same_cloud(self, other: clouds.Cloud) -> bool:
+        # Returns true if the two clouds are the same cloud type.
+        return isinstance(other, Nebius)
+
+    @classmethod
+    def get_default_instance_type(
+            cls,
+            cpus: Optional[str] = None,
+            memory: Optional[str] = None,
+            disk_tier: Optional[resources_utils.DiskTier] = None
+    ) -> Optional[str]:
+        """Returns the default instance type for Nebius."""
+        return service_catalog.get_default_instance_type(cpus=cpus,
+                                                         memory=memory,
+                                                         disk_tier=disk_tier,
+                                                         clouds='nebius')
+
+    @classmethod
+    def get_accelerators_from_instance_type(
+        cls,
+        instance_type: str,
+    ) -> Optional[Dict[str, Union[int, float]]]:
+        return service_catalog.get_accelerators_from_instance_type(
+            instance_type, clouds='nebius')
+
+    @classmethod
+    def get_zone_shell_cmd(cls) -> Optional[str]:
+        return None
+
+    def make_deploy_resources_variables(
+            self,
+            resources: 'resources_lib.Resources',
+            cluster_name: resources_utils.ClusterName,
+            region: 'clouds.Region',
+            zones: Optional[List['clouds.Zone']],
+            num_nodes: int,
+            dryrun: bool = False) -> Dict[str, Optional[str]]:
+        del dryrun, cluster_name
+        assert zones is None, ('Nebius does not support zones', zones)
+
+        r = resources
+        acc_dict = self.get_accelerators_from_instance_type(r.instance_type)
+        custom_resources = resources_utils.make_ray_custom_resources_str(
+            acc_dict)
+        platform, _ = resources.instance_type.split('_')
+
+        if platform in ('cpu-d3', 'cpu-e2'):
+            image_family = 'ubuntu22.04-driverless'
+        elif platform in ('gpu-h100-sxm', 'gpu-h200-sxm', 'gpu-l40s-a'):
+            image_family = 'ubuntu22.04-cuda12'
+        else:
+            raise RuntimeError('Unsupported instance type for Nebius cloud:'
+                               f' {resources.instance_type}')
+        return {
+            'instance_type': resources.instance_type,
+            'custom_resources': custom_resources,
+            'region': region.name,
+            'image_id': image_family,
+            # Nebius does not support specific zones.
+            'zones': None,
+        }
+
+    def _get_feasible_launchable_resources(
+        self, resources: 'resources_lib.Resources'
+    ) -> 'resources_utils.FeasibleResources':
+        """Returns a list of feasible resources for the given resources."""
+        if resources.instance_type is not None:
+            assert resources.is_launchable(), resources
+            resources = resources.copy(accelerators=None)
+            return resources_utils.FeasibleResources([resources], [], None)
+
+        def _make(instance_list):
+            resource_list = []
+            for instance_type in instance_list:
+                r = resources.copy(
+                    cloud=Nebius(),
+                    instance_type=instance_type,
+                    accelerators=None,
+                    cpus=None,
+                )
+                resource_list.append(r)
+            return resource_list
+
+        # Currently, handle a filter on accelerators only.
+        accelerators = resources.accelerators
+        if accelerators is None:
+            # Return a default instance type
+            default_instance_type = Nebius.get_default_instance_type(
+                cpus=resources.cpus,
+                memory=resources.memory,
+                disk_tier=resources.disk_tier)
+            if default_instance_type is None:
+                # TODO: Add hints to all return values in this method to help
+                # users understand why the resources are not launchable.
+                return resources_utils.FeasibleResources([], [], None)
+            else:
+                return resources_utils.FeasibleResources(
+                    _make([default_instance_type]), [], None)
+
+        assert len(accelerators) == 1, resources
+        acc, acc_count = list(accelerators.items())[0]
+        (instance_list, fuzzy_candidate_list
+        ) = service_catalog.get_instance_type_for_accelerator(
+            acc,
+            acc_count,
+            use_spot=resources.use_spot,
+            cpus=resources.cpus,
+            region=resources.region,
+            zone=resources.zone,
+            clouds='nebius')
+        if instance_list is None:
+            return resources_utils.FeasibleResources([], fuzzy_candidate_list,
+                                                     None)
+        return resources_utils.FeasibleResources(_make(instance_list),
+                                                 fuzzy_candidate_list, None)
+
+    @classmethod
+    def check_credentials(cls) -> Tuple[bool, Optional[str]]:
+        """ Verify that the user has valid credentials for Nebius. """
+        logging.debug('Nebius cloud check credentials')
+        token = nebius.get_iam_token()
+        token_msg = (' Credentials can be set up by running: \n'\
+                     f' $ nebius iam get-access-token > {nebius.NEBIUS_IAM_TOKEN_PATH} \n')  # pylint: disable=line-too-long
+        tenant_msg = (' Copy your tenat ID from the web console and save it to file \n'  # pylint: disable=line-too-long
+                      f' $ nebius --format json iam whoami|jq -r \'.user_profile.tenants[0].tenant_id\' > {nebius.NEBIUS_TENANT_ID_PATH} \n')  # pylint: disable=line-too-long
+        if token is None:
+            return False, f'{token_msg}'
+        sdk = nebius.sdk()
+        tenant_id = nebius.get_tenant_id()
+        if tenant_id is None:
+            return False, f'{tenant_msg}'
+        try:
+            service = nebius.iam().ProjectServiceClient(sdk)
+            service.list(
+                nebius.iam().ListProjectsRequest(parent_id=tenant_id)).wait()
+        except nebius.request_error() as e:
+            return False, (
+                f'{e.status} \n'  # First line is indented by 4 spaces
+                f'{token_msg}'
+                f'{tenant_msg}')
+        return True, None
+
+    def get_credential_file_mounts(self) -> Dict[str, str]:
+        return {
+            f'~/.nebius/{filename}': f'~/.nebius/{filename}'
+            for filename in _CREDENTIAL_FILES
+        }
+
+    @classmethod
+    def get_current_user_identity(cls) -> Optional[List[str]]:
+        # NOTE: used for very advanced SkyPilot functionality
+        # Can implement later if desired
+        return None
+
+    def instance_type_exists(self, instance_type: str) -> bool:
+        return service_catalog.instance_type_exists(instance_type, 'nebius')
+
+    def validate_region_zone(self, region: Optional[str], zone: Optional[str]):
+        return service_catalog.validate_region_zone(region,
+                                                    zone,
+                                                    clouds='nebius')

sky/clouds/service_catalog/nebius_catalog.py
ADDED
@@ -0,0 +1,116 @@
+"""Nebius Catalog.
+
+This module loads the service catalog file and can be used to query
+instance types and pricing information for Nebius.
+"""
+import typing
+from typing import Dict, List, Optional, Tuple, Union
+
+from sky.clouds.service_catalog import common
+from sky.utils import resources_utils
+from sky.utils import ux_utils
+
+if typing.TYPE_CHECKING:
+    from sky.clouds import cloud
+
+# Keep it synced with the frequency in
+# skypilot-catalog/.github/workflows/update-Nebius-catalog.yml
+_PULL_FREQUENCY_HOURS = 7
+
+_df = common.read_catalog('nebius/vms.csv')
+
+
+def instance_type_exists(instance_type: str) -> bool:
+    return common.instance_type_exists_impl(_df, instance_type)
+
+
+def validate_region_zone(
+        region: Optional[str],
+        zone: Optional[str]) -> Tuple[Optional[str], Optional[str]]:
+    if zone is not None:
+        with ux_utils.print_exception_no_traceback():
+            raise ValueError('Nebius does not support zones.')
+    return common.validate_region_zone_impl('nebius', _df, region, zone)
+
+
+def get_hourly_cost(instance_type: str,
+                    use_spot: bool = False,
+                    region: Optional[str] = None,
+                    zone: Optional[str] = None) -> float:
+    """Returns the cost, or the cheapest cost among all zones for spot."""
+    assert not use_spot, 'Nebius does not support spot.'
+    if zone is not None:
+        with ux_utils.print_exception_no_traceback():
+            raise ValueError('Nebius does not support zones.')
+    return common.get_hourly_cost_impl(_df, instance_type, use_spot, region,
+                                       zone)
+
+
+def get_vcpus_mem_from_instance_type(
+        instance_type: str) -> Tuple[Optional[float], Optional[float]]:
+    return common.get_vcpus_mem_from_instance_type_impl(_df, instance_type)
+
+
+def get_default_instance_type(
+        cpus: Optional[str] = None,
+        memory: Optional[str] = None,
+        disk_tier: Optional[resources_utils.DiskTier] = None) -> Optional[str]:
+    del disk_tier  # unused
+    return common.get_instance_type_for_cpus_mem_impl(_df, cpus, memory)
+
+
+def get_accelerators_from_instance_type(
+        instance_type: str) -> Optional[Dict[str, Union[int, float]]]:
+    return common.get_accelerators_from_instance_type_impl(_df, instance_type)
+
+
+def get_instance_type_for_accelerator(
+        acc_name: str,
+        acc_count: int,
+        cpus: Optional[str] = None,
+        memory: Optional[str] = None,
+        use_spot: bool = False,
+        region: Optional[str] = None,
+        zone: Optional[str] = None) -> Tuple[Optional[List[str]], List[str]]:
+    """Filter the instance types based on resource requirements.
+
+    Returns a list of instance types satisfying the required count of
+    accelerators with sorted prices and a list of candidates with fuzzy search.
+    """
+    if zone is not None:
+        with ux_utils.print_exception_no_traceback():
+            raise ValueError('Nebius does not support zones.')
+    return common.get_instance_type_for_accelerator_impl(df=_df,
+                                                         acc_name=acc_name,
+                                                         acc_count=acc_count,
+                                                         cpus=cpus,
+                                                         memory=memory,
+                                                         use_spot=use_spot,
+                                                         region=region,
+                                                         zone=zone)
+
+
+def regions() -> List['cloud.Region']:
+    return common.get_region_zones(_df, use_spot=False)
+
+
+def get_region_zones_for_instance_type(instance_type: str,
+                                       use_spot: bool) -> List['cloud.Region']:
+    df = _df[_df['InstanceType'] == instance_type]
+    return common.get_region_zones(df, use_spot)
+
+
+def list_accelerators(
+        gpus_only: bool,
+        name_filter: Optional[str],
+        region_filter: Optional[str],
+        quantity_filter: Optional[int],
+        case_sensitive: bool = True,
+        all_regions: bool = False,
+        require_price: bool = True) -> Dict[str, List[common.InstanceTypeInfo]]:
+    """Returns all instance types in Nebius offering GPUs."""
+
+    del require_price  # Unused.
+    return common.list_accelerators_impl('nebius', _df, gpus_only, name_filter,
+                                         region_filter, quantity_filter,
+                                         case_sensitive, all_regions)

sky/jobs/controller.py
CHANGED
@@ -6,6 +6,7 @@ import argparse
 import multiprocessing
 import os
 import pathlib
+import shutil
 import time
 import traceback
 import typing
@@ -17,6 +18,7 @@ from sky import exceptions
 from sky import sky_logging
 from sky.backends import backend_utils
 from sky.backends import cloud_vm_ray_backend
+from sky.data import data_utils
 from sky.jobs import recovery_strategy
 from sky.jobs import scheduler
 from sky.jobs import state as managed_job_state
@@ -488,6 +490,7 @@ def _cleanup(job_id: int, dag_yaml: str):
         cluster_name = managed_job_utils.generate_managed_job_cluster_name(
             task.name, job_id)
         managed_job_utils.terminate_cluster(cluster_name)
+
         # Clean up Storages with persistent=False.
         # TODO(zhwu): this assumes the specific backend.
         backend = cloud_vm_ray_backend.CloudVmRayBackend()
@@ -499,6 +502,20 @@
             storage.construct()
         backend.teardown_ephemeral_storage(task)
 
+        # Clean up any files mounted from the local disk, such as two-hop file
+        # mounts.
+        for file_mount in (task.file_mounts or {}).values():
+            try:
+                if not data_utils.is_cloud_store_url(file_mount):
+                    path = os.path.expanduser(file_mount)
+                    if os.path.isdir(path):
+                        shutil.rmtree(path)
+                    else:
+                        os.remove(path)
+            except Exception as e:  # pylint: disable=broad-except
+                logger.warning(
+                    f'Failed to clean up file mount {file_mount}: {e}')
+
 
 def start(job_id, dag_yaml):
     """Start the controller."""