skypilot-nightly 1.0.0.dev20250220__py3-none-any.whl → 1.0.0.dev20250221__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +4 -2
- sky/adaptors/nebius.py +85 -0
- sky/backends/backend_utils.py +8 -0
- sky/backends/cloud_vm_ray_backend.py +10 -2
- sky/client/sdk.py +8 -3
- sky/clouds/__init__.py +2 -0
- sky/clouds/nebius.py +294 -0
- sky/clouds/service_catalog/constants.py +1 -1
- sky/clouds/service_catalog/nebius_catalog.py +116 -0
- sky/jobs/controller.py +17 -0
- sky/jobs/server/core.py +31 -3
- sky/provision/__init__.py +1 -0
- sky/provision/kubernetes/instance.py +5 -1
- sky/provision/kubernetes/utils.py +8 -7
- sky/provision/nebius/__init__.py +11 -0
- sky/provision/nebius/config.py +11 -0
- sky/provision/nebius/instance.py +285 -0
- sky/provision/nebius/utils.py +310 -0
- sky/setup_files/dependencies.py +9 -1
- sky/skylet/constants.py +3 -6
- sky/task.py +6 -0
- sky/templates/jobs-controller.yaml.j2 +3 -0
- sky/templates/nebius-ray.yml.j2 +79 -0
- sky/utils/controller_utils.py +66 -2
- {skypilot_nightly-1.0.0.dev20250220.dist-info → skypilot_nightly-1.0.0.dev20250221.dist-info}/METADATA +8 -4
- {skypilot_nightly-1.0.0.dev20250220.dist-info → skypilot_nightly-1.0.0.dev20250221.dist-info}/RECORD +30 -22
- {skypilot_nightly-1.0.0.dev20250220.dist-info → skypilot_nightly-1.0.0.dev20250221.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250220.dist-info → skypilot_nightly-1.0.0.dev20250221.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250220.dist-info → skypilot_nightly-1.0.0.dev20250221.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250220.dist-info → skypilot_nightly-1.0.0.dev20250221.dist-info}/top_level.txt +0 -0
sky/provision/nebius/utils.py
ADDED
@@ -0,0 +1,310 @@
+"""Nebius library wrapper for SkyPilot."""
+import time
+from typing import Any, Dict
+import uuid
+
+from sky import sky_logging
+from sky.adaptors import nebius
+from sky.utils import common_utils
+
+logger = sky_logging.init_logger(__name__)
+
+POLL_INTERVAL = 5
+
+
+def retry(func):
+    """Decorator to retry a function."""
+
+    def wrapper(*args, **kwargs):
+        """Wrapper for retrying a function."""
+        cnt = 0
+        while True:
+            try:
+                return func(*args, **kwargs)
+            except nebius.nebius.error.QueryError as e:
+                if cnt >= 3:
+                    raise
+                logger.warning('Retrying for exception: '
+                               f'{common_utils.format_exception(e)}.')
+                time.sleep(POLL_INTERVAL)
+
+    return wrapper
+
+
+def get_project_by_region(region: str) -> str:
+    service = nebius.iam().ProjectServiceClient(nebius.sdk())
+    projects = service.list(nebius.iam().ListProjectsRequest(
+        parent_id=nebius.get_tenant_id())).wait()
+    # To find a project in a specific region, we rely on the project ID to
+    # deduce the region, since there is currently no method to retrieve region
+    # information directly from the project. Additionally, there is only one
+    # project per region, and projects cannot be created at this time.
+    # The region is determined from the project ID using a region-specific
+    # identifier embedded in it.
+    # Project id looks like project-e00xxxxxxxxxxxxxx where
+    # e00 - id of region 'eu-north1'
+    # e01 - id of region 'eu-west1'
+    # TODO(SalikovAlex): fix when info about region will be in projects list
+    # Currently, Nebius cloud supports 2 regions. We manually enumerate
+    # them here. Reference: https://docs.nebius.com/overview/regions
+    for project in projects.items:
+        if region == 'eu-north1' and project.metadata.id[8:11] == 'e00':
+            return project.metadata.id
+        if region == 'eu-west1' and project.metadata.id[8:11] == 'e01':
+            return project.metadata.id
+    raise Exception(f'No project found for region "{region}".')
+
+
+def get_or_create_gpu_cluster(name: str, region: str) -> str:
+    """Creates a GPU cluster.
+    When creating a GPU cluster, select an InfiniBand fabric for it:
+
+    fabric-2, fabric-3 or fabric-4 for projects in the eu-north1 region.
+    fabric-5 for projects in the eu-west1 region.
+
+    https://docs.nebius.com/compute/clusters/gpu
+    """
+    project_id = get_project_by_region(region)
+    service = nebius.compute().GpuClusterServiceClient(nebius.sdk())
+    try:
+        cluster = service.get_by_name(nebius.nebius_common().GetByNameRequest(
+            parent_id=project_id,
+            name=name,
+        )).wait()
+        cluster_id = cluster.metadata.id
+    except nebius.request_error() as no_cluster_found_error:
+        if region == 'eu-north1':
+            fabric = 'fabric-4'
+        elif region == 'eu-west1':
+            fabric = 'fabric-5'
+        else:
+            raise RuntimeError(
+                f'Unsupported region {region}.') from no_cluster_found_error
+        cluster = service.create(nebius.compute().CreateGpuClusterRequest(
+            metadata=nebius.nebius_common().ResourceMetadata(
+                parent_id=project_id,
+                name=name,
+            ),
+            spec=nebius.compute().GpuClusterSpec(
+                infiniband_fabric=fabric))).wait()
+        cluster_id = cluster.resource_id
+    return cluster_id
+
+
+def delete_cluster(name: str, region: str) -> None:
+    """Delete a GPU cluster."""
+    project_id = get_project_by_region(region)
+    service = nebius.compute().GpuClusterServiceClient(nebius.sdk())
+    try:
+        cluster = service.get_by_name(nebius.nebius_common().GetByNameRequest(
+            parent_id=project_id,
+            name=name,
+        )).wait()
+        cluster_id = cluster.metadata.id
+        logger.debug(f'Found GPU Cluster : {cluster_id}.')
+        service.delete(
+            nebius.compute().DeleteGpuClusterRequest(id=cluster_id)).wait()
+        logger.debug(f'Deleted GPU Cluster : {cluster_id}.')
+    except nebius.request_error():
+        logger.debug('GPU Cluster does not exist.')
+
+
+def list_instances(project_id: str) -> Dict[str, Dict[str, Any]]:
+    """Lists instances associated with API key."""
+    service = nebius.compute().InstanceServiceClient(nebius.sdk())
+    result = service.list(
+        nebius.compute().ListInstancesRequest(parent_id=project_id)).wait()
+
+    instances = result
+
+    instance_dict: Dict[str, Dict[str, Any]] = {}
+    for instance in instances.items:
+        info = {}
+        info['status'] = instance.status.state.name
+        info['name'] = instance.metadata.name
+        if instance.status.network_interfaces:
+            info['external_ip'] = instance.status.network_interfaces[
+                0].public_ip_address.address.split('/')[0]
+            info['internal_ip'] = instance.status.network_interfaces[
+                0].ip_address.address.split('/')[0]
+        instance_dict[instance.metadata.id] = info
+
+    return instance_dict
+
+
+def stop(instance_id: str) -> None:
+    service = nebius.compute().InstanceServiceClient(nebius.sdk())
+    service.stop(nebius.compute().StopInstanceRequest(id=instance_id)).wait()
+    retry_count = 0
+    while retry_count < nebius.MAX_RETRIES_TO_INSTANCE_STOP:
+        service = nebius.compute().InstanceServiceClient(nebius.sdk())
+        instance = service.get(nebius.compute().GetInstanceRequest(
+            id=instance_id,)).wait()
+        if instance.status.state.name == 'STOPPED':
+            break
+        time.sleep(POLL_INTERVAL)
+        logger.debug(f'Waiting for instance {instance_id} stopping.')
+        retry_count += 1
+
+    if retry_count == nebius.MAX_RETRIES_TO_INSTANCE_STOP:
+        raise TimeoutError(
+            f'Exceeded maximum retries '
+            f'({nebius.MAX_RETRIES_TO_INSTANCE_STOP * POLL_INTERVAL}'
+            f' seconds) while waiting for instance {instance_id}'
+            f' to be stopped.')
+
+
+def start(instance_id: str) -> None:
+    service = nebius.compute().InstanceServiceClient(nebius.sdk())
+    service.start(nebius.compute().StartInstanceRequest(id=instance_id)).wait()
+    retry_count = 0
+    while retry_count < nebius.MAX_RETRIES_TO_INSTANCE_START:
+        service = nebius.compute().InstanceServiceClient(nebius.sdk())
+        instance = service.get(nebius.compute().GetInstanceRequest(
+            id=instance_id,)).wait()
+        if instance.status.state.name == 'RUNNING':
+            break
+        time.sleep(POLL_INTERVAL)
+        logger.debug(f'Waiting for instance {instance_id} starting.')
+        retry_count += 1
+
+    if retry_count == nebius.MAX_RETRIES_TO_INSTANCE_START:
+        raise TimeoutError(
+            f'Exceeded maximum retries '
+            f'({nebius.MAX_RETRIES_TO_INSTANCE_START * POLL_INTERVAL}'
+            f' seconds) while waiting for instance {instance_id}'
+            f' to be ready.')
+
+
+def launch(cluster_name_on_cloud: str, node_type: str, platform: str,
+           preset: str, region: str, image_family: str, disk_size: int,
+           user_data: str) -> str:
+    # Each node must have a unique name to avoid conflicts between
+    # multiple worker VMs. To ensure uniqueness, a UUID is appended
+    # to the node name.
+    instance_name = (f'{cluster_name_on_cloud}-'
+                     f'{uuid.uuid4().hex[:4]}-{node_type}')
+    logger.debug(f'Launching instance: {instance_name}')
+
+    disk_name = 'disk-' + instance_name
+    cluster_id = None
+    # 8 GPU virtual machines can be grouped into a GPU cluster.
+    # The GPU clusters are built with InfiniBand secure high-speed networking.
+    # https://docs.nebius.com/compute/clusters/gpu
+    if platform in ('gpu-h100-sxm', 'gpu-h200-sxm'):
+        if preset == '8gpu-128vcpu-1600gb':
+            cluster_id = get_or_create_gpu_cluster(cluster_name_on_cloud,
+                                                   region)
+
+    project_id = get_project_by_region(region)
+    service = nebius.compute().DiskServiceClient(nebius.sdk())
+    disk = service.create(nebius.compute().CreateDiskRequest(
+        metadata=nebius.nebius_common().ResourceMetadata(
+            parent_id=project_id,
+            name=disk_name,
+        ),
+        spec=nebius.compute().DiskSpec(
+            source_image_family=nebius.compute().SourceImageFamily(
+                image_family=image_family),
+            size_gibibytes=disk_size,
+            type=nebius.compute().DiskSpec.DiskType.NETWORK_SSD,
+        ))).wait()
+    disk_id = disk.resource_id
+    retry_count = 0
+    while retry_count < nebius.MAX_RETRIES_TO_DISK_CREATE:
+        disk = service.get_by_name(nebius.nebius_common().GetByNameRequest(
+            parent_id=project_id,
+            name=disk_name,
+        )).wait()
+        if disk.status.state.name == 'READY':
+            break
+        logger.debug(f'Waiting for disk {disk_name} to be ready.')
+        time.sleep(POLL_INTERVAL)
+        retry_count += 1
+
+    if retry_count == nebius.MAX_RETRIES_TO_DISK_CREATE:
+        raise TimeoutError(
+            f'Exceeded maximum retries '
+            f'({nebius.MAX_RETRIES_TO_DISK_CREATE * POLL_INTERVAL}'
+            f' seconds) while waiting for disk {disk_name}'
+            f' to be ready.')
+
+    service = nebius.vpc().SubnetServiceClient(nebius.sdk())
+    sub_net = service.list(nebius.vpc().ListSubnetsRequest(
+        parent_id=project_id,)).wait()
+
+    service = nebius.compute().InstanceServiceClient(nebius.sdk())
+    service.create(nebius.compute().CreateInstanceRequest(
+        metadata=nebius.nebius_common().ResourceMetadata(
+            parent_id=project_id,
+            name=instance_name,
+        ),
+        spec=nebius.compute().InstanceSpec(
+            gpu_cluster=nebius.compute().InstanceGpuClusterSpec(id=cluster_id,)
+            if cluster_id is not None else None,
+            boot_disk=nebius.compute().AttachedDiskSpec(
+                attach_mode=nebius.compute(
+                ).AttachedDiskSpec.AttachMode.READ_WRITE,
+                existing_disk=nebius.compute().ExistingDisk(id=disk_id)),
+            cloud_init_user_data=user_data,
+            resources=nebius.compute().ResourcesSpec(platform=platform,
+                                                     preset=preset),
+            network_interfaces=[
+                nebius.compute().NetworkInterfaceSpec(
+                    subnet_id=sub_net.items[0].metadata.id,
+                    ip_address=nebius.compute().IPAddress(),
+                    name='network-interface-0',
+                    public_ip_address=nebius.compute().PublicIPAddress())
+            ]))).wait()
+    instance_id = ''
+    retry_count = 0
+    while retry_count < nebius.MAX_RETRIES_TO_INSTANCE_READY:
+        service = nebius.compute().InstanceServiceClient(nebius.sdk())
+        instance = service.get_by_name(nebius.nebius_common().GetByNameRequest(
+            parent_id=project_id,
+            name=instance_name,
+        )).wait()
+        if instance.status.state.name == 'STARTING':
+            instance_id = instance.metadata.id
+            break
+        time.sleep(POLL_INTERVAL)
+        logger.debug(f'Waiting for instance {instance_name} start running.')
+        retry_count += 1
+
+    if retry_count == nebius.MAX_RETRIES_TO_INSTANCE_READY:
+        raise TimeoutError(
+            f'Exceeded maximum retries '
+            f'({nebius.MAX_RETRIES_TO_INSTANCE_READY * POLL_INTERVAL}'
+            f' seconds) while waiting for instance {instance_name}'
+            f' to be ready.')
+    return instance_id
+
+
+def remove(instance_id: str) -> None:
+    """Terminates the given instance."""
+    service = nebius.compute().InstanceServiceClient(nebius.sdk())
+    result = service.get(
+        nebius.compute().GetInstanceRequest(id=instance_id)).wait()
+    disk_id = result.spec.boot_disk.existing_disk.id
+    service.delete(
+        nebius.compute().DeleteInstanceRequest(id=instance_id)).wait()
+    retry_count = 0
+    # The instance begins deleting and attempts to delete the disk.
+    # Must wait until the disk is unlocked and becomes deletable.
+    while retry_count < nebius.MAX_RETRIES_TO_DISK_DELETE:
+        try:
+            service = nebius.compute().DiskServiceClient(nebius.sdk())
+            service.delete(
+                nebius.compute().DeleteDiskRequest(id=disk_id)).wait()
+            break
+        except nebius.request_error():
+            logger.debug('Waiting for disk deletion.')
+            time.sleep(POLL_INTERVAL)
+            retry_count += 1
+
+    if retry_count == nebius.MAX_RETRIES_TO_DISK_DELETE:
+        raise TimeoutError(
+            f'Exceeded maximum retries '
+            f'({nebius.MAX_RETRIES_TO_DISK_DELETE * POLL_INTERVAL}'
+            f' seconds) while waiting for disk {disk_id}'
+            f' to be deleted.')
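A minimal sketch (not part of this diff) of how a provisioner such as sky/provision/nebius/instance.py might drive the helpers above: launch a node, then poll its state via the project-scoped listing. The node_type label, image family, and disk size below are hypothetical placeholders; the platform, preset, and region values are taken from the code above.

```python
from sky.provision.nebius import utils as nebius_utils


def provision_single_node(cluster_name_on_cloud: str) -> None:
    region = 'eu-north1'  # One of the two regions enumerated above.
    instance_id = nebius_utils.launch(
        cluster_name_on_cloud=cluster_name_on_cloud,
        node_type='head',                   # Hypothetical node-type label.
        platform='gpu-h100-sxm',
        preset='8gpu-128vcpu-1600gb',       # Triggers GPU-cluster creation.
        region=region,
        image_family='ubuntu22.04-cuda12',  # Hypothetical image family.
        disk_size=256,                      # Hypothetical disk size in GiB.
        user_data='')
    # Poll the instance state through the project-scoped listing helper.
    project_id = nebius_utils.get_project_by_region(region)
    info = nebius_utils.list_instances(project_id).get(instance_id, {})
    print(instance_id, info.get('status'), info.get('external_ip'))
```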
sky/setup_files/dependencies.py
CHANGED
@@ -5,6 +5,7 @@ This file is imported by setup.py, so:
   correct.
 - It should not import any dependencies, as they may not be installed yet.
 """
+import sys
 from typing import Dict, List
 
 install_requires = [
@@ -146,6 +147,13 @@ extras_require: Dict[str, List[str]] = {
         # docs instead.
         # 'vsphere-automation-sdk @ git+https://github.com/vmware/vsphere-automation-sdk-python.git@v8.0.1.0' pylint: disable=line-too-long
     ],
+    'nebius': ['nebius>=0.2.0',]
 }
 
-
+# Nebius needs python3.10. If python 3.9 [all] will not install nebius
+if sys.version_info < (3, 10):
+    filtered_keys = [k for k in extras_require if k != 'nebius']
+    extras_require['all'] = sum(
+        [v for k, v in extras_require.items() if k != 'nebius'], [])
+else:
+    extras_require['all'] = sum(extras_require.values(), [])
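An illustrative check of the version gate above (assumed usage, not part of the diff): on Python 3.10+ the new 'nebius' extra is folded into 'all', while on Python 3.9 it is kept out of 'all' but still exists as an explicit extra.

```python
from sky.setup_files import dependencies

# The extra is always declared.
print('nebius' in dependencies.extras_require)  # True
# Whether it is part of [all] depends on the interpreter version:
print(any('nebius' in dep for dep in dependencies.extras_require['all']))
# -> True on Python >= 3.10, False on Python 3.9
```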
sky/skylet/constants.py
CHANGED
@@ -281,12 +281,9 @@ FILE_MOUNTS_REMOTE_TMP_DIR = '/tmp/sky-{}-filemounts-files'
 # linking. E.g., in our API server deployment on k8s, ~/.sky/ is mounted from a
 # persistent volume, so any contents in ~/.sky/ cannot be hard linked elsewhere.
 FILE_MOUNTS_LOCAL_TMP_BASE_PATH = '~/.sky/tmp/'
-
-#
-
-FILE_MOUNTS_WORKDIR_SUBPATH = 'job-{run_id}/workdir'
-FILE_MOUNTS_SUBPATH = 'job-{run_id}/local-file-mounts/{i}'
-FILE_MOUNTS_TMP_SUBPATH = 'job-{run_id}/tmp-files'
+# Base path for two-hop file mounts translation. See
+# controller_utils.translate_local_file_mounts_to_two_hop().
+FILE_MOUNTS_CONTROLLER_TMP_BASE_PATH = '~/.sky/tmp/controller'
 
 # Used when an managed jobs are created and
 # files are synced up to the cloud.
sky/task.py
CHANGED
@@ -1132,6 +1132,12 @@ class Task:
                     raise ValueError(f'Storage Type {store_type} '
                                      'does not exist!')
 
+        # TODO: Delete from storage_mounts, now that the storage is
+        # translated into file_mounts. Note: as is, this will break
+        # controller_utils.
+        # _maybe_translate_local_file_mounts_and_sync_up(), which still
+        # needs the storage, but not the file_mounts.
+
     def get_local_to_remote_file_mounts(self) -> Optional[Dict[str, str]]:
         """Returns file mounts of the form (dst=VM path, src=local path).
 
sky/templates/jobs-controller.yaml.j2
CHANGED
@@ -10,6 +10,9 @@ file_mounts:
 {%- for remote_catalog_path, local_catalog_path in modified_catalogs.items() %}
   {{remote_catalog_path}}: {{local_catalog_path}}
 {%- endfor %}
+{%- for controller_file_mount_path, local_file_mount_path in local_to_controller_file_mounts.items() %}
+  {{controller_file_mount_path}}: {{local_file_mount_path}}
+{%- endfor %}
 
 setup: |
   {{ sky_activate_python_env }}
sky/templates/nebius-ray.yml.j2
ADDED
@@ -0,0 +1,79 @@
+cluster_name: {{cluster_name_on_cloud}}
+
+# The maximum number of workers nodes to launch in addition to the head node.
+max_workers: {{num_nodes - 1}}
+upscaling_speed: {{num_nodes - 1}}
+idle_timeout_minutes: 60
+
+provider:
+  type: external
+  module: sky.provision.nebius
+  region: "{{region}}"
+
+auth:
+  ssh_user: ubuntu
+  ssh_private_key: {{ssh_private_key}}
+
+available_node_types:
+  ray_head_default:
+    resources: {}
+    node_config:
+      InstanceType: {{instance_type}}
+      ImageId: {{image_id}}
+      DiskSize: {{disk_size}}
+      UserData: |
+        users:
+          - name: skypilot:ssh_user
+            shell: /bin/bash
+            sudo: ALL=(ALL) NOPASSWD:ALL
+            ssh_authorized_keys:
+              - |-
+                skypilot:ssh_public_key_content
+
+head_node_type: ray_head_default
+
+# Format: `REMOTE_PATH : LOCAL_PATH`
+file_mounts: {
+  "{{sky_ray_yaml_remote_path}}": "{{sky_ray_yaml_local_path}}",
+  "{{sky_remote_path}}/{{sky_wheel_hash}}": "{{sky_local_path}}",
+{%- for remote_path, local_path in credentials.items() %}
+  "{{remote_path}}": "{{local_path}}",
+{%- endfor %}
+}
+
+rsync_exclude: []
+
+initialization_commands: []
+
+# List of shell commands to run to set up nodes.
+# NOTE: these are very performance-sensitive. Each new item opens/closes an SSH
+# connection, which is expensive. Try your best to co-locate commands into fewer
+# items!
+#
+# Increment the following for catching performance bugs easier:
+# current num items (num SSH connections): 1
+setup_commands:
+  # Disable `unattended-upgrades` to prevent apt-get from hanging. It should be called at the beginning before the process started to avoid being blocked. (This is a temporary fix.)
+  # Create ~/.ssh/config file in case the file does not exist in the image.
+  # Line 'rm ..': there is another installation of pip.
+  # Line 'sudo bash ..': set the ulimit as suggested by ray docs for performance. https://docs.ray.io/en/latest/cluster/vms/user-guides/large-cluster-best-practices.html#system-configuration
+  # Line 'sudo grep ..': set the number of threads per process to unlimited to avoid ray job submit stucking issue when the number of running ray jobs increase.
+  # Line 'mkdir -p ..': disable host key check
+  # Line 'python3 -c ..': patch the buggy ray files and enable `-o allow_other` option for `goofys`
+  - {%- for initial_setup_command in initial_setup_commands %}
+    {{ initial_setup_command }}
+    {%- endfor %}
+    sudo systemctl stop unattended-upgrades || true;
+    sudo systemctl disable unattended-upgrades || true;
+    sudo sed -i 's/Unattended-Upgrade "1"/Unattended-Upgrade "0"/g' /etc/apt/apt.conf.d/20auto-upgrades || true;
+    sudo kill -9 `sudo lsof /var/lib/dpkg/lock-frontend | awk '{print $2}' | tail -n 1` || true;
+    sudo pkill -9 apt-get;
+    sudo pkill -9 dpkg;
+    sudo dpkg --configure -a;
+    mkdir -p ~/.ssh; touch ~/.ssh/config;
+    {{ conda_installation_commands }}
+    {{ ray_skypilot_installation_commands }}
+    sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
+    sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
+    mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n  StrictHostKeyChecking no" ~/.ssh/config) || printf "Host *\n  StrictHostKeyChecking no\n" >> ~/.ssh/config;
+    [ -f /etc/fuse.conf ] && sudo sed -i 's/#user_allow_other/user_allow_other/g' /etc/fuse.conf || (sudo sh -c 'echo "user_allow_other" > /etc/fuse.conf');
sky/utils/controller_utils.py
CHANGED
@@ -662,6 +662,66 @@ def replace_skypilot_config_path_in_file_mounts(
                 f'with the real path in file mounts: {file_mounts}')
 
 
+def _generate_run_uuid() -> str:
+    """Generates a unique run id for the job."""
+    return common_utils.base36_encode(uuid.uuid4().hex)[:8]
+
+
+def translate_local_file_mounts_to_two_hop(
+        task: 'task_lib.Task') -> Dict[str, str]:
+    """Translates local->VM mounts into two-hop file mounts.
+
+    This strategy will upload the local files to the controller first, using a
+    normal rsync as part of sky.launch() for the controller. Then, when the
+    controller launches the task, it will also use local file_mounts from the
+    destination path of the first hop.
+
+    Local machine/API server   Controller                Job cluster
+    ------------------------   -----------------------   --------------------
+    |      local path  ----|---|->  controller path --|--|->  job dst path  |
+    ------------------------   -----------------------   --------------------
+
+    Returns:
+        A dict mapping from controller file mount path to local file mount path
+        for the first hop. The task is updated in-place to do the second hop.
+    """
+    first_hop_file_mounts = {}
+    second_hop_file_mounts = {}
+
+    run_id = _generate_run_uuid()
+    base_tmp_dir = os.path.join(constants.FILE_MOUNTS_CONTROLLER_TMP_BASE_PATH,
+                                run_id)
+
+    # Use a simple counter to create unique paths within the base_tmp_dir for
+    # each mount.
+    file_mount_id = 0
+
+    file_mounts_to_translate = task.file_mounts or {}
+    if task.workdir is not None:
+        file_mounts_to_translate[constants.SKY_REMOTE_WORKDIR] = task.workdir
+        task.workdir = None
+
+    for job_cluster_path, local_path in file_mounts_to_translate.items():
+        if data_utils.is_cloud_store_url(
+                local_path) or data_utils.is_cloud_store_url(job_cluster_path):
+            raise exceptions.NotSupportedError(
+                'Cloud-based file_mounts are specified, but no cloud storage '
+                'is available. Please specify local file_mounts only.')
+
+        controller_path = os.path.join(base_tmp_dir, f'{file_mount_id}')
+        file_mount_id += 1
+        first_hop_file_mounts[controller_path] = local_path
+        second_hop_file_mounts[job_cluster_path] = controller_path
+
+    # Use set_file_mounts to override existing file mounts, if they exist.
+    task.set_file_mounts(second_hop_file_mounts)
+
+    # Return the first hop info so that it can be added to the jobs-controller
+    # YAML.
+    return first_hop_file_mounts
+
+
+# (maybe translate local file mounts) and (sync up)
 def maybe_translate_local_file_mounts_and_sync_up(task: 'task_lib.Task',
                                                   task_type: str) -> None:
     """Translates local->VM mounts into Storage->VM, then syncs up any Storage.
@@ -695,7 +755,7 @@ def maybe_translate_local_file_mounts_and_sync_up(task: 'task_lib.Task',
     # We should not use common_utils.get_usage_run_id() here, because when
     # Python API is used, the run id will be the same across multiple
     # jobs.launch/serve.up calls after the sky is imported.
-    run_id =
+    run_id = _generate_run_uuid()
     user_hash = common_utils.get_user_hash()
     original_file_mounts = task.file_mounts if task.file_mounts else {}
     original_storage_mounts = task.storage_mounts if task.storage_mounts else {}
@@ -854,7 +914,11 @@ def maybe_translate_local_file_mounts_and_sync_up(task: 'task_lib.Task',
     # Step 4: Upload storage from sources
     # Upload the local source to a bucket. The task will not be executed
     # locally, so we need to upload the files/folders to the bucket manually
-    # here before sending the task to the remote jobs controller.
+    # here before sending the task to the remote jobs controller. This will
+    # also upload any storage mounts that are not translated. After
+    # sync_storage_mounts, we will also have file_mounts in the task, but
+    # these aren't used since the storage_mounts for the same paths take
+    # precedence.
     if task.storage_mounts:
         # There may be existing (non-translated) storage mounts, so log this
         # whenever task.storage_mounts is non-empty.
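A hedged usage sketch (assumed, not taken from this diff) of the two-hop helper: the returned mapping is what jobs-controller.yaml.j2 renders via the new `local_to_controller_file_mounts` loop, while the task is rewritten in place so the controller can complete the second hop.

```python
import sky
from sky.utils import controller_utils

task = sky.Task(run='python train.py', workdir='./project')
task.set_file_mounts({'/remote/config.yaml': './config.yaml'})

# First hop: controller-side path -> local path (uploaded alongside the
# controller launch and fed into the jobs-controller YAML).
local_to_controller_file_mounts = (
    controller_utils.translate_local_file_mounts_to_two_hop(task))

# Second hop: task.file_mounts now maps job-cluster paths to paths under
# ~/.sky/tmp/controller/<run_id>/<i> on the controller; workdir is cleared.
print(local_to_controller_file_mounts)
print(task.file_mounts)
```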
{skypilot_nightly-1.0.0.dev20250220.dist-info → skypilot_nightly-1.0.0.dev20250221.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: skypilot-nightly
-Version: 1.0.0.dev20250220
+Version: 1.0.0.dev20250221
 Summary: SkyPilot: An intercloud broker for the clouds
 Author: SkyPilot Team
 License: Apache 2.0
@@ -107,6 +107,8 @@ Provides-Extra: vast
 Requires-Dist: vastai-sdk>=0.1.12; extra == "vast"
 Provides-Extra: vsphere
 Requires-Dist: pyvmomi==8.0.1.0.2; extra == "vsphere"
+Provides-Extra: nebius
+Requires-Dist: nebius>=0.2.0; extra == "nebius"
 Provides-Extra: all
 Requires-Dist: urllib3<2; extra == "all"
 Requires-Dist: awscli>=1.27.10; extra == "all"
@@ -150,6 +152,7 @@ Requires-Dist: azure-core>=1.24.0; extra == "all"
 Requires-Dist: azure-common; extra == "all"
 Requires-Dist: vastai-sdk>=0.1.12; extra == "all"
 Requires-Dist: pyvmomi==8.0.1.0.2; extra == "all"
+Requires-Dist: nebius>=0.2.0; extra == "all"
 Dynamic: author
 Dynamic: classifier
 Dynamic: description
@@ -224,15 +227,16 @@ SkyPilot supports your existing GPU, TPU, and CPU workloads, with no code changes
 Install with pip:
 ```bash
 # Choose your clouds:
-pip install -U "skypilot[kubernetes,aws,gcp,azure,oci,lambda,runpod,fluidstack,paperspace,cudo,ibm,scp]"
+pip install -U "skypilot[kubernetes,aws,gcp,azure,oci,lambda,runpod,fluidstack,paperspace,cudo,ibm,scp,nebius]"
 ```
 To get the latest features and fixes, use the nightly build or [install from source](https://docs.skypilot.co/en/latest/getting-started/installation.html):
 ```bash
 # Choose your clouds:
-pip install "skypilot-nightly[kubernetes,aws,gcp,azure,oci,lambda,runpod,fluidstack,paperspace,cudo,ibm,scp]"
+pip install "skypilot-nightly[kubernetes,aws,gcp,azure,oci,lambda,runpod,fluidstack,paperspace,cudo,ibm,scp,nebius]"
 ```
 
-
+
+[Current supported infra](https://docs.skypilot.co/en/latest/getting-started/installation.html) (Kubernetes; AWS, GCP, Azure, OCI, Lambda Cloud, Fluidstack, RunPod, Cudo, Digital Ocean, Paperspace, Cloudflare, Samsung, IBM, Vast.ai, VMware vSphere, Nebius):
 <p align="center">
   <img alt="SkyPilot" src="https://raw.githubusercontent.com/skypilot-org/skypilot/master/docs/source/images/cloud-logos-light.png" width=85%>
 </p>