skypilot-nightly 1.0.0.dev20250220__py3-none-any.whl → 1.0.0.dev20250221__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30)
  1. sky/__init__.py +4 -2
  2. sky/adaptors/nebius.py +85 -0
  3. sky/backends/backend_utils.py +8 -0
  4. sky/backends/cloud_vm_ray_backend.py +10 -2
  5. sky/client/sdk.py +8 -3
  6. sky/clouds/__init__.py +2 -0
  7. sky/clouds/nebius.py +294 -0
  8. sky/clouds/service_catalog/constants.py +1 -1
  9. sky/clouds/service_catalog/nebius_catalog.py +116 -0
  10. sky/jobs/controller.py +17 -0
  11. sky/jobs/server/core.py +31 -3
  12. sky/provision/__init__.py +1 -0
  13. sky/provision/kubernetes/instance.py +5 -1
  14. sky/provision/kubernetes/utils.py +8 -7
  15. sky/provision/nebius/__init__.py +11 -0
  16. sky/provision/nebius/config.py +11 -0
  17. sky/provision/nebius/instance.py +285 -0
  18. sky/provision/nebius/utils.py +310 -0
  19. sky/setup_files/dependencies.py +9 -1
  20. sky/skylet/constants.py +3 -6
  21. sky/task.py +6 -0
  22. sky/templates/jobs-controller.yaml.j2 +3 -0
  23. sky/templates/nebius-ray.yml.j2 +79 -0
  24. sky/utils/controller_utils.py +66 -2
  25. {skypilot_nightly-1.0.0.dev20250220.dist-info → skypilot_nightly-1.0.0.dev20250221.dist-info}/METADATA +8 -4
  26. {skypilot_nightly-1.0.0.dev20250220.dist-info → skypilot_nightly-1.0.0.dev20250221.dist-info}/RECORD +30 -22
  27. {skypilot_nightly-1.0.0.dev20250220.dist-info → skypilot_nightly-1.0.0.dev20250221.dist-info}/LICENSE +0 -0
  28. {skypilot_nightly-1.0.0.dev20250220.dist-info → skypilot_nightly-1.0.0.dev20250221.dist-info}/WHEEL +0 -0
  29. {skypilot_nightly-1.0.0.dev20250220.dist-info → skypilot_nightly-1.0.0.dev20250221.dist-info}/entry_points.txt +0 -0
  30. {skypilot_nightly-1.0.0.dev20250220.dist-info → skypilot_nightly-1.0.0.dev20250221.dist-info}/top_level.txt +0 -0
sky/__init__.py CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
5
5
  import urllib.request
6
6
 
7
7
  # Replaced with the current commit when building the wheels.
8
- _SKYPILOT_COMMIT_SHA = '6b2b31d8358f3ff8394a7a33ec49e9985ada230f'
8
+ _SKYPILOT_COMMIT_SHA = 'aa3c387f04fbdd4468751b7d66fcb381bd3449dc'
9
9
 
10
10
 
11
11
  def _get_git_commit():
@@ -35,7 +35,7 @@ def _get_git_commit():
35
35
 
36
36
 
37
37
  __commit__ = _get_git_commit()
38
- __version__ = '1.0.0.dev20250220'
38
+ __version__ = '1.0.0.dev20250221'
39
39
  __root_dir__ = os.path.dirname(os.path.abspath(__file__))
40
40
 
41
41
 
@@ -143,6 +143,7 @@ RunPod = clouds.RunPod
143
143
  Vast = clouds.Vast
144
144
  Vsphere = clouds.Vsphere
145
145
  Fluidstack = clouds.Fluidstack
146
+ Nebius = clouds.Nebius
146
147
 
147
148
  __all__ = [
148
149
  '__version__',
@@ -161,6 +162,7 @@ __all__ = [
161
162
  'SCP',
162
163
  'Vsphere',
163
164
  'Fluidstack',
165
+ 'Nebius',
164
166
  'Optimizer',
165
167
  'OptimizeTarget',
166
168
  'backends',
sky/adaptors/nebius.py ADDED
@@ -0,0 +1,85 @@
1
+ """Nebius cloud adaptor."""
2
+ import os
3
+
4
+ from sky.adaptors import common
5
+
6
+ NEBIUS_TENANT_ID_FILENAME = 'NEBIUS_TENANT_ID.txt'
7
+ NEBIUS_IAM_TOKEN_FILENAME = 'NEBIUS_IAM_TOKEN.txt'
8
+ NEBIUS_TENANT_ID_PATH = '~/.nebius/' + NEBIUS_TENANT_ID_FILENAME
9
+ NEBIUS_IAM_TOKEN_PATH = '~/.nebius/' + NEBIUS_IAM_TOKEN_FILENAME
10
+
11
+ MAX_RETRIES_TO_DISK_CREATE = 120
12
+ MAX_RETRIES_TO_INSTANCE_STOP = 120
13
+ MAX_RETRIES_TO_INSTANCE_START = 120
14
+ MAX_RETRIES_TO_INSTANCE_READY = 240
15
+
16
+ MAX_RETRIES_TO_DISK_DELETE = 120
17
+ MAX_RETRIES_TO_INSTANCE_WAIT = 120 # Maximum number of retries
18
+
19
+ POLL_INTERVAL = 5
20
+
21
+ _iam_token = None
22
+ _tenant_id = None
23
+
24
+ nebius = common.LazyImport(
25
+ 'nebius',
26
+ import_error_message='Failed to import dependencies for Nebius AI Cloud. '
27
+ 'Try running: pip install "skypilot[nebius]"',
28
+ # https://github.com/grpc/grpc/issues/37642 to avoid spam in console
29
+ set_loggers=lambda: os.environ.update({'GRPC_VERBOSITY': 'NONE'}))
30
+
31
+
32
+ def request_error():
33
+ return nebius.aio.service_error.RequestError
34
+
35
+
36
+ def compute():
37
+ # pylint: disable=import-outside-toplevel
38
+ from nebius.api.nebius.compute import v1 as compute_v1
39
+ return compute_v1
40
+
41
+
42
+ def iam():
43
+ # pylint: disable=import-outside-toplevel
44
+ from nebius.api.nebius.iam import v1 as iam_v1
45
+ return iam_v1
46
+
47
+
48
+ def nebius_common():
49
+ # pylint: disable=import-outside-toplevel
50
+ from nebius.api.nebius.common import v1 as common_v1
51
+ return common_v1
52
+
53
+
54
+ def vpc():
55
+ # pylint: disable=import-outside-toplevel
56
+ from nebius.api.nebius.vpc import v1 as vpc_v1
57
+ return vpc_v1
58
+
59
+
60
+ def get_iam_token():
61
+ global _iam_token
62
+ if _iam_token is None:
63
+ try:
64
+ with open(os.path.expanduser(NEBIUS_IAM_TOKEN_PATH),
65
+ encoding='utf-8') as file:
66
+ _iam_token = file.read().strip()
67
+ except FileNotFoundError:
68
+ return None
69
+ return _iam_token
70
+
71
+
72
+ def get_tenant_id():
73
+ global _tenant_id
74
+ if _tenant_id is None:
75
+ try:
76
+ with open(os.path.expanduser(NEBIUS_TENANT_ID_PATH),
77
+ encoding='utf-8') as file:
78
+ _tenant_id = file.read().strip()
79
+ except FileNotFoundError:
80
+ return None
81
+ return _tenant_id
82
+
83
+
84
+ def sdk():
85
+ return nebius.sdk.SDK(credentials=get_iam_token())
sky/backends/backend_utils.py CHANGED
@@ -197,6 +197,9 @@ def _get_yaml_path_from_cluster_name(cluster_name: str,
197
197
  return str(output_path)
198
198
 
199
199
 
200
+ # Add retry for the file mounts optimization, as the underlying cp command may
201
+ # experience transient errors, #4758.
202
+ @common_utils.retry
200
203
  def _optimize_file_mounts(yaml_path: str) -> None:
201
204
  """Optimize file mounts in the given ray yaml file.
202
205
 
@@ -206,6 +209,10 @@ def _optimize_file_mounts(yaml_path: str) -> None:
206
209
  - wheel
207
210
  - credentials
208
211
  Format is {dst: src}.
212
+
213
+ Raises:
214
+ subprocess.CalledProcessError: If the file mounts are failed to be
215
+ copied.
209
216
  """
210
217
  yaml_config = common_utils.read_yaml(yaml_path)
211
218
 
@@ -863,6 +870,7 @@ def _add_auth_to_cluster_config(cloud: clouds.Cloud, cluster_config_file: str):
863
870
  clouds.Paperspace,
864
871
  clouds.Azure,
865
872
  clouds.DO,
873
+ clouds.Nebius,
866
874
  )):
867
875
  config = auth.configure_ssh_info(config)
868
876
  elif isinstance(cloud, clouds.GCP):
sky/backends/cloud_vm_ray_backend.py CHANGED
@@ -191,7 +191,8 @@ def _get_cluster_config_template(cloud):
191
191
  clouds.Kubernetes: 'kubernetes-ray.yml.j2',
192
192
  clouds.Vsphere: 'vsphere-ray.yml.j2',
193
193
  clouds.Vast: 'vast-ray.yml.j2',
194
- clouds.Fluidstack: 'fluidstack-ray.yml.j2'
194
+ clouds.Fluidstack: 'fluidstack-ray.yml.j2',
195
+ clouds.Nebius: 'nebius-ray.yml.j2'
195
196
  }
196
197
  return cloud_to_template[type(cloud)]
197
198
 
@@ -3233,7 +3234,14 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3233
3234
  all_file_mounts: Optional[Dict[Path, Path]],
3234
3235
  storage_mounts: Optional[Dict[Path, storage_lib.Storage]],
3235
3236
  ) -> None:
3236
- """Mounts all user files to the remote nodes."""
3237
+ """Mounts all user files to the remote nodes.
3238
+
3239
+ Note: This does not handle COPY storage_mounts. These should have
3240
+ already been translated into file_mounts by task.sync_storage_mounts().
3241
+
3242
+ TODO: Delete COPY storage_mounts in task.sync_storage_mounts(), and
3243
+ assert here that all storage_mounts are MOUNT mode.
3244
+ """
3237
3245
  with rich_utils.safe_status(ux_utils.spinner_message('Syncing files')):
3238
3246
  controller_utils.replace_skypilot_config_path_in_file_mounts(
3239
3247
  handle.launched_resources.cloud, all_file_mounts)
sky/client/sdk.py CHANGED
@@ -1503,14 +1503,14 @@ def stream_and_get(
1503
1503
 
1504
1504
  @usage_lib.entrypoint
1505
1505
  @annotations.client_api
1506
- def api_cancel(request_ids: Optional[List[str]] = None,
1506
+ def api_cancel(request_ids: Optional[Union[str, List[str]]] = None,
1507
1507
  all_users: bool = False,
1508
1508
  silent: bool = False) -> server_common.RequestId:
1509
1509
  """Aborts a request or all requests.
1510
1510
 
1511
1511
  Args:
1512
- request_id: The prefix of the request ID of the request to abort.
1513
- all: Whether to abort all requests.
1512
+ request_ids: The request ID(s) to abort. Can be a single string or a
1513
+ list of strings.
1514
1514
  all_users: Whether to abort all requests from all users.
1515
1515
  silent: Whether to suppress the output.
1516
1516
 
@@ -1528,6 +1528,11 @@ def api_cancel(request_ids: Optional[List[str]] = None,
1528
1528
  user_id = None
1529
1529
  if not all_users:
1530
1530
  user_id = common_utils.get_user_hash()
1531
+
1532
+ # Convert single request ID to list if needed
1533
+ if isinstance(request_ids, str):
1534
+ request_ids = [request_ids]
1535
+
1531
1536
  body = payloads.RequestCancelBody(request_ids=request_ids, user_id=user_id)
1532
1537
  if all_users:
1533
1538
  echo('Cancelling all users\' requests...')
sky/clouds/__init__.py CHANGED
@@ -20,6 +20,7 @@ from sky.clouds.gcp import GCP
20
20
  from sky.clouds.ibm import IBM
21
21
  from sky.clouds.kubernetes import Kubernetes
22
22
  from sky.clouds.lambda_cloud import Lambda
23
+ from sky.clouds.nebius import Nebius
23
24
  from sky.clouds.oci import OCI
24
25
  from sky.clouds.paperspace import Paperspace
25
26
  from sky.clouds.runpod import RunPod
@@ -49,6 +50,7 @@ __all__ = [
49
50
  'ProvisionerVersion',
50
51
  'StatusVersion',
51
52
  'Fluidstack',
53
+ 'Nebius',
52
54
  # Utility functions
53
55
  'cloud_in_iterable',
54
56
  ]
sky/clouds/nebius.py ADDED
@@ -0,0 +1,294 @@
1
+ """ Nebius Cloud. """
2
+ import logging
3
+ import typing
4
+ from typing import Dict, Iterator, List, Optional, Tuple, Union
5
+
6
+ from sky import clouds
7
+ from sky.adaptors import nebius
8
+ from sky.clouds import service_catalog
9
+ from sky.utils import registry
10
+ from sky.utils import resources_utils
11
+
12
+ if typing.TYPE_CHECKING:
13
+ from sky import resources as resources_lib
14
+
15
+ _CREDENTIAL_FILES = [
16
+ # credential files for Nebius
17
+ nebius.NEBIUS_TENANT_ID_FILENAME,
18
+ nebius.NEBIUS_IAM_TOKEN_FILENAME
19
+ ]
20
+
21
+
22
+ @registry.CLOUD_REGISTRY.register
23
+ class Nebius(clouds.Cloud):
24
+ """Nebius GPU Cloud"""
25
+ _REPR = 'Nebius'
26
+ _CLOUD_UNSUPPORTED_FEATURES = {
27
+ clouds.CloudImplementationFeatures.AUTO_TERMINATE:
28
+ ('Autodown and Autostop not supported. Can\'t delete disk.'),
29
+ # Autostop functionality can be implemented, but currently,
30
+ # there is only a single flag for both autostop and autodown.
31
+ clouds.CloudImplementationFeatures.SPOT_INSTANCE:
32
+ ('Spot is not supported, as Nebius API does not implement spot.'),
33
+ clouds.CloudImplementationFeatures.CLONE_DISK_FROM_CLUSTER:
34
+ (f'Migrating disk is currently not supported on {_REPR}.'),
35
+ clouds.CloudImplementationFeatures.DOCKER_IMAGE:
36
+ (f'Docker image is currently not supported on {_REPR}. '
37
+ 'You can try running docker command inside the '
38
+ '`run` section in task.yaml.'),
39
+ clouds.CloudImplementationFeatures.CUSTOM_DISK_TIER:
40
+ (f'Custom disk tier is currently not supported on {_REPR}.'),
41
+ }
42
+ # Nebius maximum instance name length defined as <= 63 as a hostname length
43
+ # 63 - 8 - 5 = 50 characters since
44
+ # we add 4 character from UUID to make uniq `-xxxx`
45
+ # our provisioner adds additional `-worker`.
46
+ _MAX_CLUSTER_NAME_LEN_LIMIT = 50
47
+ _regions: List[clouds.Region] = []
48
+
49
+ # Using the latest SkyPilot provisioner API to provision and check status.
50
+ PROVISIONER_VERSION = clouds.ProvisionerVersion.SKYPILOT
51
+ STATUS_VERSION = clouds.StatusVersion.SKYPILOT
52
+
53
+ @classmethod
54
+ def _unsupported_features_for_resources(
55
+ cls, resources: 'resources_lib.Resources'
56
+ ) -> Dict[clouds.CloudImplementationFeatures, str]:
57
+ del resources # unused
58
+ return cls._CLOUD_UNSUPPORTED_FEATURES
59
+
60
+ @classmethod
61
+ def _max_cluster_name_length(cls) -> Optional[int]:
62
+ return cls._MAX_CLUSTER_NAME_LEN_LIMIT
63
+
64
+ @classmethod
65
+ def regions_with_offering(cls, instance_type: str,
66
+ accelerators: Optional[Dict[str, int]],
67
+ use_spot: bool, region: Optional[str],
68
+ zone: Optional[str]) -> List[clouds.Region]:
69
+ assert zone is None, 'Nebius does not support zones.'
70
+ del accelerators, zone # unused
71
+ if use_spot:
72
+ return []
73
+ regions = service_catalog.get_region_zones_for_instance_type(
74
+ instance_type, use_spot, 'nebius')
75
+
76
+ if region is not None:
77
+ regions = [r for r in regions if r.name == region]
78
+ return regions
79
+
80
+ @classmethod
81
+ def get_vcpus_mem_from_instance_type(
82
+ cls,
83
+ instance_type: str,
84
+ ) -> Tuple[Optional[float], Optional[float]]:
85
+ return service_catalog.get_vcpus_mem_from_instance_type(instance_type,
86
+ clouds='nebius')
87
+
88
+ @classmethod
89
+ def zones_provision_loop(
90
+ cls,
91
+ *,
92
+ region: str,
93
+ num_nodes: int,
94
+ instance_type: str,
95
+ accelerators: Optional[Dict[str, int]] = None,
96
+ use_spot: bool = False,
97
+ ) -> Iterator[None]:
98
+ del num_nodes # unused
99
+ regions = cls.regions_with_offering(instance_type,
100
+ accelerators,
101
+ use_spot,
102
+ region=region,
103
+ zone=None)
104
+ for r in regions:
105
+ assert r.zones is None, r
106
+ yield r.zones
107
+
108
+ def instance_type_to_hourly_cost(self,
109
+ instance_type: str,
110
+ use_spot: bool,
111
+ region: Optional[str] = None,
112
+ zone: Optional[str] = None) -> float:
113
+ return service_catalog.get_hourly_cost(instance_type,
114
+ use_spot=use_spot,
115
+ region=region,
116
+ zone=zone,
117
+ clouds='nebius')
118
+
119
+ def accelerators_to_hourly_cost(self,
120
+ accelerators: Dict[str, int],
121
+ use_spot: bool,
122
+ region: Optional[str] = None,
123
+ zone: Optional[str] = None) -> float:
124
+ """Returns the hourly cost of the accelerators, in dollars/hour."""
125
+ del accelerators, use_spot, region, zone # unused
126
+ return 0.0
127
+
128
+ def get_egress_cost(self, num_gigabytes: float) -> float:
129
+ return 0.0
130
+
131
+ def __repr__(self):
132
+ return self._REPR
133
+
134
+ def is_same_cloud(self, other: clouds.Cloud) -> bool:
135
+ # Returns true if the two clouds are the same cloud type.
136
+ return isinstance(other, Nebius)
137
+
138
+ @classmethod
139
+ def get_default_instance_type(
140
+ cls,
141
+ cpus: Optional[str] = None,
142
+ memory: Optional[str] = None,
143
+ disk_tier: Optional[resources_utils.DiskTier] = None
144
+ ) -> Optional[str]:
145
+ """Returns the default instance type for Nebius."""
146
+ return service_catalog.get_default_instance_type(cpus=cpus,
147
+ memory=memory,
148
+ disk_tier=disk_tier,
149
+ clouds='nebius')
150
+
151
+ @classmethod
152
+ def get_accelerators_from_instance_type(
153
+ cls,
154
+ instance_type: str,
155
+ ) -> Optional[Dict[str, Union[int, float]]]:
156
+ return service_catalog.get_accelerators_from_instance_type(
157
+ instance_type, clouds='nebius')
158
+
159
+ @classmethod
160
+ def get_zone_shell_cmd(cls) -> Optional[str]:
161
+ return None
162
+
163
+ def make_deploy_resources_variables(
164
+ self,
165
+ resources: 'resources_lib.Resources',
166
+ cluster_name: resources_utils.ClusterName,
167
+ region: 'clouds.Region',
168
+ zones: Optional[List['clouds.Zone']],
169
+ num_nodes: int,
170
+ dryrun: bool = False) -> Dict[str, Optional[str]]:
171
+ del dryrun, cluster_name
172
+ assert zones is None, ('Nebius does not support zones', zones)
173
+
174
+ r = resources
175
+ acc_dict = self.get_accelerators_from_instance_type(r.instance_type)
176
+ custom_resources = resources_utils.make_ray_custom_resources_str(
177
+ acc_dict)
178
+ platform, _ = resources.instance_type.split('_')
179
+
180
+ if platform in ('cpu-d3', 'cpu-e2'):
181
+ image_family = 'ubuntu22.04-driverless'
182
+ elif platform in ('gpu-h100-sxm', 'gpu-h200-sxm', 'gpu-l40s-a'):
183
+ image_family = 'ubuntu22.04-cuda12'
184
+ else:
185
+ raise RuntimeError('Unsupported instance type for Nebius cloud:'
186
+ f' {resources.instance_type}')
187
+ return {
188
+ 'instance_type': resources.instance_type,
189
+ 'custom_resources': custom_resources,
190
+ 'region': region.name,
191
+ 'image_id': image_family,
192
+ # Nebius does not support specific zones.
193
+ 'zones': None,
194
+ }
195
+
196
+ def _get_feasible_launchable_resources(
197
+ self, resources: 'resources_lib.Resources'
198
+ ) -> 'resources_utils.FeasibleResources':
199
+ """Returns a list of feasible resources for the given resources."""
200
+ if resources.instance_type is not None:
201
+ assert resources.is_launchable(), resources
202
+ resources = resources.copy(accelerators=None)
203
+ return resources_utils.FeasibleResources([resources], [], None)
204
+
205
+ def _make(instance_list):
206
+ resource_list = []
207
+ for instance_type in instance_list:
208
+ r = resources.copy(
209
+ cloud=Nebius(),
210
+ instance_type=instance_type,
211
+ accelerators=None,
212
+ cpus=None,
213
+ )
214
+ resource_list.append(r)
215
+ return resource_list
216
+
217
+ # Currently, handle a filter on accelerators only.
218
+ accelerators = resources.accelerators
219
+ if accelerators is None:
220
+ # Return a default instance type
221
+ default_instance_type = Nebius.get_default_instance_type(
222
+ cpus=resources.cpus,
223
+ memory=resources.memory,
224
+ disk_tier=resources.disk_tier)
225
+ if default_instance_type is None:
226
+ # TODO: Add hints to all return values in this method to help
227
+ # users understand why the resources are not launchable.
228
+ return resources_utils.FeasibleResources([], [], None)
229
+ else:
230
+ return resources_utils.FeasibleResources(
231
+ _make([default_instance_type]), [], None)
232
+
233
+ assert len(accelerators) == 1, resources
234
+ acc, acc_count = list(accelerators.items())[0]
235
+ (instance_list, fuzzy_candidate_list
236
+ ) = service_catalog.get_instance_type_for_accelerator(
237
+ acc,
238
+ acc_count,
239
+ use_spot=resources.use_spot,
240
+ cpus=resources.cpus,
241
+ region=resources.region,
242
+ zone=resources.zone,
243
+ clouds='nebius')
244
+ if instance_list is None:
245
+ return resources_utils.FeasibleResources([], fuzzy_candidate_list,
246
+ None)
247
+ return resources_utils.FeasibleResources(_make(instance_list),
248
+ fuzzy_candidate_list, None)
249
+
250
+ @classmethod
251
+ def check_credentials(cls) -> Tuple[bool, Optional[str]]:
252
+ """ Verify that the user has valid credentials for Nebius. """
253
+ logging.debug('Nebius cloud check credentials')
254
+ token = nebius.get_iam_token()
255
+ token_msg = (' Credentials can be set up by running: \n'\
256
+ f' $ nebius iam get-access-token > {nebius.NEBIUS_IAM_TOKEN_PATH} \n') # pylint: disable=line-too-long
257
+ tenant_msg = (' Copy your tenat ID from the web console and save it to file \n' # pylint: disable=line-too-long
258
+ f' $ nebius --format json iam whoami|jq -r \'.user_profile.tenants[0].tenant_id\' > {nebius.NEBIUS_TENANT_ID_PATH} \n') # pylint: disable=line-too-long
259
+ if token is None:
260
+ return False, f'{token_msg}'
261
+ sdk = nebius.sdk()
262
+ tenant_id = nebius.get_tenant_id()
263
+ if tenant_id is None:
264
+ return False, f'{tenant_msg}'
265
+ try:
266
+ service = nebius.iam().ProjectServiceClient(sdk)
267
+ service.list(
268
+ nebius.iam().ListProjectsRequest(parent_id=tenant_id)).wait()
269
+ except nebius.request_error() as e:
270
+ return False, (
271
+ f'{e.status} \n' # First line is indented by 4 spaces
272
+ f'{token_msg}'
273
+ f'{tenant_msg}')
274
+ return True, None
275
+
276
+ def get_credential_file_mounts(self) -> Dict[str, str]:
277
+ return {
278
+ f'~/.nebius/{filename}': f'~/.nebius/{filename}'
279
+ for filename in _CREDENTIAL_FILES
280
+ }
281
+
282
+ @classmethod
283
+ def get_current_user_identity(cls) -> Optional[List[str]]:
284
+ # NOTE: used for very advanced SkyPilot functionality
285
+ # Can implement later if desired
286
+ return None
287
+
288
+ def instance_type_exists(self, instance_type: str) -> bool:
289
+ return service_catalog.instance_type_exists(instance_type, 'nebius')
290
+
291
+ def validate_region_zone(self, region: Optional[str], zone: Optional[str]):
292
+ return service_catalog.validate_region_zone(region,
293
+ zone,
294
+ clouds='nebius')
sky/clouds/service_catalog/constants.py CHANGED
@@ -4,4 +4,4 @@ CATALOG_SCHEMA_VERSION = 'v6'
4
4
  CATALOG_DIR = '~/.sky/catalogs'
5
5
  ALL_CLOUDS = ('aws', 'azure', 'gcp', 'ibm', 'lambda', 'scp', 'oci',
6
6
  'kubernetes', 'runpod', 'vast', 'vsphere', 'cudo', 'fluidstack',
7
- 'paperspace', 'do')
7
+ 'paperspace', 'do', 'nebius')
sky/clouds/service_catalog/nebius_catalog.py ADDED
@@ -0,0 +1,116 @@
1
+ """Nebius Catalog.
2
+
3
+ This module loads the service catalog file and can be used to query
4
+ instance types and pricing information for Nebius.
5
+ """
6
+ import typing
7
+ from typing import Dict, List, Optional, Tuple, Union
8
+
9
+ from sky.clouds.service_catalog import common
10
+ from sky.utils import resources_utils
11
+ from sky.utils import ux_utils
12
+
13
+ if typing.TYPE_CHECKING:
14
+ from sky.clouds import cloud
15
+
16
+ # Keep it synced with the frequency in
17
+ # skypilot-catalog/.github/workflows/update-Nebius-catalog.yml
18
+ _PULL_FREQUENCY_HOURS = 7
19
+
20
+ _df = common.read_catalog('nebius/vms.csv')
21
+
22
+
23
+ def instance_type_exists(instance_type: str) -> bool:
24
+ return common.instance_type_exists_impl(_df, instance_type)
25
+
26
+
27
+ def validate_region_zone(
28
+ region: Optional[str],
29
+ zone: Optional[str]) -> Tuple[Optional[str], Optional[str]]:
30
+ if zone is not None:
31
+ with ux_utils.print_exception_no_traceback():
32
+ raise ValueError('Nebius does not support zones.')
33
+ return common.validate_region_zone_impl('nebius', _df, region, zone)
34
+
35
+
36
+ def get_hourly_cost(instance_type: str,
37
+ use_spot: bool = False,
38
+ region: Optional[str] = None,
39
+ zone: Optional[str] = None) -> float:
40
+ """Returns the cost, or the cheapest cost among all zones for spot."""
41
+ assert not use_spot, 'Nebius does not support spot.'
42
+ if zone is not None:
43
+ with ux_utils.print_exception_no_traceback():
44
+ raise ValueError('Nebius does not support zones.')
45
+ return common.get_hourly_cost_impl(_df, instance_type, use_spot, region,
46
+ zone)
47
+
48
+
49
+ def get_vcpus_mem_from_instance_type(
50
+ instance_type: str) -> Tuple[Optional[float], Optional[float]]:
51
+ return common.get_vcpus_mem_from_instance_type_impl(_df, instance_type)
52
+
53
+
54
+ def get_default_instance_type(
55
+ cpus: Optional[str] = None,
56
+ memory: Optional[str] = None,
57
+ disk_tier: Optional[resources_utils.DiskTier] = None) -> Optional[str]:
58
+ del disk_tier # unused
59
+ return common.get_instance_type_for_cpus_mem_impl(_df, cpus, memory)
60
+
61
+
62
+ def get_accelerators_from_instance_type(
63
+ instance_type: str) -> Optional[Dict[str, Union[int, float]]]:
64
+ return common.get_accelerators_from_instance_type_impl(_df, instance_type)
65
+
66
+
67
+ def get_instance_type_for_accelerator(
68
+ acc_name: str,
69
+ acc_count: int,
70
+ cpus: Optional[str] = None,
71
+ memory: Optional[str] = None,
72
+ use_spot: bool = False,
73
+ region: Optional[str] = None,
74
+ zone: Optional[str] = None) -> Tuple[Optional[List[str]], List[str]]:
75
+ """Filter the instance types based on resource requirements.
76
+
77
+ Returns a list of instance types satisfying the required count of
78
+ accelerators with sorted prices and a list of candidates with fuzzy search.
79
+ """
80
+ if zone is not None:
81
+ with ux_utils.print_exception_no_traceback():
82
+ raise ValueError('Nebius does not support zones.')
83
+ return common.get_instance_type_for_accelerator_impl(df=_df,
84
+ acc_name=acc_name,
85
+ acc_count=acc_count,
86
+ cpus=cpus,
87
+ memory=memory,
88
+ use_spot=use_spot,
89
+ region=region,
90
+ zone=zone)
91
+
92
+
93
+ def regions() -> List['cloud.Region']:
94
+ return common.get_region_zones(_df, use_spot=False)
95
+
96
+
97
+ def get_region_zones_for_instance_type(instance_type: str,
98
+ use_spot: bool) -> List['cloud.Region']:
99
+ df = _df[_df['InstanceType'] == instance_type]
100
+ return common.get_region_zones(df, use_spot)
101
+
102
+
103
+ def list_accelerators(
104
+ gpus_only: bool,
105
+ name_filter: Optional[str],
106
+ region_filter: Optional[str],
107
+ quantity_filter: Optional[int],
108
+ case_sensitive: bool = True,
109
+ all_regions: bool = False,
110
+ require_price: bool = True) -> Dict[str, List[common.InstanceTypeInfo]]:
111
+ """Returns all instance types in Nebius offering GPUs."""
112
+
113
+ del require_price # Unused.
114
+ return common.list_accelerators_impl('nebius', _df, gpus_only, name_filter,
115
+ region_filter, quantity_filter,
116
+ case_sensitive, all_regions)
sky/jobs/controller.py CHANGED
@@ -6,6 +6,7 @@ import argparse
6
6
  import multiprocessing
7
7
  import os
8
8
  import pathlib
9
+ import shutil
9
10
  import time
10
11
  import traceback
11
12
  import typing
@@ -17,6 +18,7 @@ from sky import exceptions
17
18
  from sky import sky_logging
18
19
  from sky.backends import backend_utils
19
20
  from sky.backends import cloud_vm_ray_backend
21
+ from sky.data import data_utils
20
22
  from sky.jobs import recovery_strategy
21
23
  from sky.jobs import scheduler
22
24
  from sky.jobs import state as managed_job_state
@@ -488,6 +490,7 @@ def _cleanup(job_id: int, dag_yaml: str):
488
490
  cluster_name = managed_job_utils.generate_managed_job_cluster_name(
489
491
  task.name, job_id)
490
492
  managed_job_utils.terminate_cluster(cluster_name)
493
+
491
494
  # Clean up Storages with persistent=False.
492
495
  # TODO(zhwu): this assumes the specific backend.
493
496
  backend = cloud_vm_ray_backend.CloudVmRayBackend()
@@ -499,6 +502,20 @@ def _cleanup(job_id: int, dag_yaml: str):
499
502
  storage.construct()
500
503
  backend.teardown_ephemeral_storage(task)
501
504
 
505
+ # Clean up any files mounted from the local disk, such as two-hop file
506
+ # mounts.
507
+ for file_mount in (task.file_mounts or {}).values():
508
+ try:
509
+ if not data_utils.is_cloud_store_url(file_mount):
510
+ path = os.path.expanduser(file_mount)
511
+ if os.path.isdir(path):
512
+ shutil.rmtree(path)
513
+ else:
514
+ os.remove(path)
515
+ except Exception as e: # pylint: disable=broad-except
516
+ logger.warning(
517
+ f'Failed to clean up file mount {file_mount}: {e}')
518
+
502
519
 
503
520
  def start(job_id, dag_yaml):
504
521
  """Start the controller."""