dstack 0.19.1__py3-none-any.whl → 0.19.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dstack/_internal/cli/commands/metrics.py +138 -0
- dstack/_internal/cli/commands/stats.py +5 -119
- dstack/_internal/cli/main.py +2 -0
- dstack/_internal/cli/services/profile.py +9 -0
- dstack/_internal/core/backends/aws/configurator.py +1 -0
- dstack/_internal/core/backends/base/compute.py +4 -1
- dstack/_internal/core/backends/base/models.py +7 -7
- dstack/_internal/core/backends/configurators.py +9 -0
- dstack/_internal/core/backends/cudo/compute.py +2 -0
- dstack/_internal/core/backends/cudo/configurator.py +0 -13
- dstack/_internal/core/backends/datacrunch/compute.py +118 -32
- dstack/_internal/core/backends/datacrunch/configurator.py +16 -11
- dstack/_internal/core/backends/gcp/compute.py +140 -26
- dstack/_internal/core/backends/gcp/configurator.py +2 -0
- dstack/_internal/core/backends/gcp/features/__init__.py +0 -0
- dstack/_internal/core/backends/gcp/features/tcpx.py +34 -0
- dstack/_internal/core/backends/gcp/models.py +13 -1
- dstack/_internal/core/backends/gcp/resources.py +64 -27
- dstack/_internal/core/backends/lambdalabs/compute.py +2 -4
- dstack/_internal/core/backends/lambdalabs/configurator.py +0 -21
- dstack/_internal/core/backends/models.py +8 -0
- dstack/_internal/core/backends/nebius/__init__.py +0 -0
- dstack/_internal/core/backends/nebius/backend.py +16 -0
- dstack/_internal/core/backends/nebius/compute.py +272 -0
- dstack/_internal/core/backends/nebius/configurator.py +74 -0
- dstack/_internal/core/backends/nebius/models.py +108 -0
- dstack/_internal/core/backends/nebius/resources.py +240 -0
- dstack/_internal/core/backends/tensordock/api_client.py +5 -4
- dstack/_internal/core/backends/tensordock/compute.py +2 -15
- dstack/_internal/core/errors.py +14 -0
- dstack/_internal/core/models/backends/base.py +2 -0
- dstack/_internal/core/models/profiles.py +3 -0
- dstack/_internal/proxy/lib/schemas/model_proxy.py +3 -3
- dstack/_internal/server/background/tasks/process_instances.py +12 -7
- dstack/_internal/server/background/tasks/process_running_jobs.py +20 -0
- dstack/_internal/server/background/tasks/process_submitted_jobs.py +3 -2
- dstack/_internal/server/routers/prometheus.py +5 -0
- dstack/_internal/server/security/permissions.py +19 -1
- dstack/_internal/server/services/instances.py +14 -6
- dstack/_internal/server/services/jobs/__init__.py +3 -3
- dstack/_internal/server/services/offers.py +4 -2
- dstack/_internal/server/services/runs.py +0 -2
- dstack/_internal/server/statics/index.html +1 -1
- dstack/_internal/server/statics/{main-da9f8c06a69c20dac23e.css → main-8f9c66f404e9c7e7e020.css} +1 -1
- dstack/_internal/server/statics/{main-4a0fe83e84574654e397.js → main-e190de603dc1e9f485ec.js} +7306 -149
- dstack/_internal/server/statics/{main-4a0fe83e84574654e397.js.map → main-e190de603dc1e9f485ec.js.map} +1 -1
- dstack/_internal/utils/common.py +8 -2
- dstack/_internal/utils/event_loop.py +30 -0
- dstack/_internal/utils/ignore.py +2 -0
- dstack/api/server/_fleets.py +3 -5
- dstack/api/server/_runs.py +6 -7
- dstack/version.py +1 -1
- {dstack-0.19.1.dist-info → dstack-0.19.3.dist-info}/METADATA +27 -11
- {dstack-0.19.1.dist-info → dstack-0.19.3.dist-info}/RECORD +67 -57
- tests/_internal/core/backends/datacrunch/test_configurator.py +6 -2
- tests/_internal/server/background/tasks/test_process_instances.py +4 -2
- tests/_internal/server/background/tasks/test_process_submitted_jobs.py +29 -0
- tests/_internal/server/routers/test_backends.py +116 -0
- tests/_internal/server/routers/test_fleets.py +2 -0
- tests/_internal/server/routers/test_prometheus.py +21 -0
- tests/_internal/server/routers/test_runs.py +4 -0
- tests/_internal/utils/test_common.py +16 -1
- tests/_internal/utils/test_event_loop.py +18 -0
- dstack/_internal/core/backends/datacrunch/api_client.py +0 -77
- {dstack-0.19.1.dist-info → dstack-0.19.3.dist-info}/LICENSE.md +0 -0
- {dstack-0.19.1.dist-info → dstack-0.19.3.dist-info}/WHEEL +0 -0
- {dstack-0.19.1.dist-info → dstack-0.19.3.dist-info}/entry_points.txt +0 -0
- {dstack-0.19.1.dist-info → dstack-0.19.3.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,272 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import shlex
|
|
3
|
+
import time
|
|
4
|
+
from functools import cached_property
|
|
5
|
+
from typing import List, Optional
|
|
6
|
+
|
|
7
|
+
from nebius.aio.operation import Operation as SDKOperation
|
|
8
|
+
from nebius.aio.service_error import RequestError, StatusCode
|
|
9
|
+
from nebius.api.nebius.common.v1 import Operation
|
|
10
|
+
from nebius.sdk import SDK
|
|
11
|
+
|
|
12
|
+
from dstack._internal.core.backends.base.backend import Compute
|
|
13
|
+
from dstack._internal.core.backends.base.compute import (
|
|
14
|
+
ComputeWithCreateInstanceSupport,
|
|
15
|
+
ComputeWithMultinodeSupport,
|
|
16
|
+
generate_unique_instance_name,
|
|
17
|
+
get_user_data,
|
|
18
|
+
)
|
|
19
|
+
from dstack._internal.core.backends.base.offers import get_catalog_offers
|
|
20
|
+
from dstack._internal.core.backends.nebius import resources
|
|
21
|
+
from dstack._internal.core.backends.nebius.models import NebiusConfig, NebiusServiceAccountCreds
|
|
22
|
+
from dstack._internal.core.errors import BackendError, NotYetTerminated, ProvisioningError
|
|
23
|
+
from dstack._internal.core.models.backends.base import BackendType
|
|
24
|
+
from dstack._internal.core.models.common import CoreModel
|
|
25
|
+
from dstack._internal.core.models.instances import (
|
|
26
|
+
InstanceAvailability,
|
|
27
|
+
InstanceConfiguration,
|
|
28
|
+
InstanceOffer,
|
|
29
|
+
InstanceOfferWithAvailability,
|
|
30
|
+
)
|
|
31
|
+
from dstack._internal.core.models.resources import Memory, Range
|
|
32
|
+
from dstack._internal.core.models.runs import JobProvisioningData, Requirements
|
|
33
|
+
from dstack._internal.utils.logging import get_logger
|
|
34
|
+
|
|
35
|
+
logger = get_logger(__name__)

# Boot disk size bounds applied when filtering catalog offers.
CONFIGURABLE_DISK_SIZE = Range[Memory](
    min=Memory.parse("40GB"),  # min for the ubuntu22.04-cuda12 image
    max=Memory.parse("8192GB"),  # max for the NETWORK_SSD disk type
)
# Timeouts/intervals below are in seconds (compared against time.monotonic()/time.sleep()).
WAIT_FOR_DISK_TIMEOUT = 20
WAIT_FOR_INSTANCE_TIMEOUT = 30
WAIT_FOR_INSTANCE_UPDATE_INTERVAL = 2.5
DELETE_INSTANCE_TIMEOUT = 25
# Written to /etc/docker/daemon.json on the instance via SETUP_COMMANDS.
DOCKER_DAEMON_CONFIG = {
    "runtimes": {"nvidia": {"args": [], "path": "nvidia-container-runtime"}},
    # Workaround for https://github.com/NVIDIA/nvidia-container-toolkit/issues/48
    "exec-opts": ["native.cgroupdriver=cgroupfs"],
}
# Shell commands injected into cloud-init user data: lock down the firewall
# (SSH + private ranges only), enable TCP forwarding, and install the Docker
# daemon config above.
SETUP_COMMANDS = [
    "ufw allow ssh",
    "ufw allow from 10.0.0.0/8",
    "ufw allow from 172.16.0.0/12",
    "ufw allow from 192.168.0.0/16",
    "ufw default deny incoming",
    "ufw default allow outgoing",
    "ufw enable",
    'sed -i "s/.*AllowTcpForwarding.*/AllowTcpForwarding yes/g" /etc/ssh/sshd_config',
    "service ssh restart",
    f"echo {shlex.quote(json.dumps(DOCKER_DAEMON_CONFIG))} > /etc/docker/daemon.json",
    "service docker restart",
]
# Nebius platform names accepted by the offer filter (_supported_instances).
SUPPORTED_PLATFORMS = [
    "gpu-h100-sxm",
    "gpu-h200-sxm",
    "gpu-l40s-a",
    "gpu-l40s-d",
    "cpu-d3",
    "cpu-e2",
]
70
|
+
|
|
71
|
+
|
|
72
|
+
class NebiusCompute(
    ComputeWithCreateInstanceSupport,
    ComputeWithMultinodeSupport,
    Compute,
):
    """Compute implementation for the Nebius backend.

    Provisioning is two-step: a boot disk is created first, then an instance
    attached to it. Termination mirrors this: the instance is deleted first and
    the boot disk only after the instance is gone.
    """

    def __init__(self, config: NebiusConfig):
        super().__init__()
        self.config = config
        # region -> default subnet ID, filled lazily by _get_subnet_id().
        self._subnet_id_cache: dict[str, str] = {}

    @cached_property
    def _sdk(self) -> SDK:
        """Lazily built Nebius SDK client authenticated with service account creds."""
        assert isinstance(self.config.creds, NebiusServiceAccountCreds)
        return resources.make_sdk(self.config.creds)

    @cached_property
    def _region_to_project_id(self) -> dict[str, str]:
        """Mapping of region name -> default project ID, fetched once per Compute instance."""
        return resources.get_region_to_project_id_map(self._sdk)

    def _get_subnet_id(self, region: str) -> str:
        """Return the default subnet ID for `region`, caching lookups."""
        if region not in self._subnet_id_cache:
            self._subnet_id_cache[region] = resources.get_default_subnet(
                self._sdk, self._region_to_project_id[region]
            ).metadata.id
        return self._subnet_id_cache[region]

    def get_offers(
        self, requirements: Optional[Requirements] = None
    ) -> List[InstanceOfferWithAvailability]:
        """List catalog offers for the configured (or all known) regions.

        Availability is always reported as UNKNOWN — no live capacity check is made.
        """
        offers = get_catalog_offers(
            backend=BackendType.NEBIUS,
            locations=self.config.regions or list(self._region_to_project_id),
            requirements=requirements,
            extra_filter=_supported_instances,
            configurable_disk_size=CONFIGURABLE_DISK_SIZE,
        )
        return [
            InstanceOfferWithAvailability(
                **offer.dict(),
                availability=InstanceAvailability.UNKNOWN,
            )
            for offer in offers
        ]

    def create_instance(
        self,
        instance_offer: InstanceOfferWithAvailability,
        instance_config: InstanceConfiguration,
    ) -> JobProvisioningData:
        """Create a boot disk and an instance attached to it.

        On any failure (including cancellation) both the instance and the disk
        created so far are cleaned up on a best-effort basis before re-raising.

        Raises:
            ProvisioningError: if the disk-create operation completes unsuccessfully.
            NoCapacityError: via resources helpers, on quota/capacity errors.
            BackendError: via _wait_for_instance, if the instance never starts booting.
        """
        # NOTE: This method can block for a long time as it waits for the boot disk to be created
        # and the instance to enter the STARTING state. This has to be done in create_instance so
        # that we can handle quota and availability errors that may occur even after creating an
        # instance.
        instance_name = generate_unique_instance_name(instance_config)
        # Offer names are "<platform> <preset>" — raises ValueError otherwise.
        platform, preset = instance_offer.instance.name.split()
        create_disk_op = resources.create_disk(
            sdk=self._sdk,
            name=instance_name,
            project_id=self._region_to_project_id[instance_offer.region],
            size_mib=instance_offer.instance.resources.disk.size_mib,
            image_family="ubuntu22.04-cuda12",
        )
        create_instance_op = None
        try:
            logger.debug("Blocking until disk %s is created", create_disk_op.resource_id)
            resources.wait_for_operation(create_disk_op, timeout=WAIT_FOR_DISK_TIMEOUT)
            if not create_disk_op.successful():
                raw_op = create_disk_op.raw()
                raise ProvisioningError(
                    f"Create disk operation failed. Message: {raw_op.status.message}."
                    f" Details: {raw_op.status.details}"
                )
            create_instance_op = resources.create_instance(
                sdk=self._sdk,
                name=instance_name,
                project_id=self._region_to_project_id[instance_offer.region],
                user_data=get_user_data(
                    instance_config.get_public_keys(),
                    backend_specific_commands=SETUP_COMMANDS,
                ),
                platform=platform,
                preset=preset,
                disk_id=create_disk_op.resource_id,
                subnet_id=self._get_subnet_id(instance_offer.region),
            )
            _wait_for_instance(self._sdk, create_instance_op)
        except BaseException:
            # Best-effort cleanup: delete the instance first (and wait for it),
            # then the boot disk. Cleanup failures are logged, never raised —
            # the original exception is what propagates.
            if create_instance_op is not None:
                try:
                    with resources.ignore_errors([StatusCode.NOT_FOUND]):
                        delete_instance_op = resources.delete_instance(
                            self._sdk, create_instance_op.resource_id
                        )
                        resources.wait_for_operation(
                            delete_instance_op, timeout=DELETE_INSTANCE_TIMEOUT
                        )
                except Exception as e:
                    logger.exception(
                        "Could not delete instance %s: %s", create_instance_op.resource_id, e
                    )
            try:
                with resources.ignore_errors([StatusCode.NOT_FOUND]):
                    resources.delete_disk(self._sdk, create_disk_op.resource_id)
            except Exception as e:
                logger.exception(
                    "Could not delete boot disk %s: %s", create_disk_op.resource_id, e
                )
            raise
        return JobProvisioningData(
            backend=instance_offer.backend,
            instance_type=instance_offer.instance,
            instance_id=create_instance_op.resource_id,
            # hostname is filled later by update_provisioning_data() once the
            # instance has a network interface.
            hostname=None,
            region=instance_offer.region,
            price=instance_offer.price,
            ssh_port=22,
            username="ubuntu",
            dockerized=True,
            backend_data=NebiusInstanceBackendData(boot_disk_id=create_disk_op.resource_id).json(),
        )

    def update_provisioning_data(
        self, provisioning_data, project_ssh_public_key, project_ssh_private_key
    ):
        """Fill hostname/internal_ip from the instance's first network interface.

        No-op while the instance has no network interfaces yet. Addresses appear
        to be reported in CIDR form (`ip/prefix`) — only the IP part is kept.
        """
        instance = resources.get_instance(self._sdk, provisioning_data.instance_id)
        if not instance.status.network_interfaces:
            return
        interface = instance.status.network_interfaces[0]
        provisioning_data.hostname, _ = interface.public_ip_address.address.split("/")
        provisioning_data.internal_ip, _ = interface.ip_address.address.split("/")

    def terminate_instance(
        self, instance_id: str, region: str, backend_data: Optional[str] = None
    ):
        """Delete the instance, then its boot disk.

        Raises NotYetTerminated while the instance still exists so the caller
        retries later; the boot disk is only deleted once the instance is gone.
        """
        backend_data_parsed = NebiusInstanceBackendData.load(backend_data)
        try:
            instance = resources.get_instance(self._sdk, instance_id)
        except RequestError as e:
            if e.status.code != StatusCode.NOT_FOUND:
                raise
            # Already gone — proceed straight to disk deletion.
            instance = None
        if instance is not None:
            if instance.status.state != instance.status.InstanceState.DELETING:
                resources.delete_instance(self._sdk, instance_id)
                raise NotYetTerminated(
                    "Requested instance deletion."
                    " Will wait for deletion before deleting the boot disk."
                    f" Instance state was: {instance.status.state.name}"
                )
            else:
                raise NotYetTerminated(
                    "Waiting for instance deletion before deleting the boot disk."
                    f" Instance state: {instance.status.state.name}"
                )
        with resources.ignore_errors([StatusCode.NOT_FOUND]):
            resources.delete_disk(self._sdk, backend_data_parsed.boot_disk_id)
|
|
228
|
+
|
|
229
|
+
|
|
230
|
+
class NebiusInstanceBackendData(CoreModel):
    """Backend-specific data serialized into JobProvisioningData.backend_data.

    Stores the boot disk ID so the disk can be deleted after the instance
    itself is gone (see NebiusCompute.terminate_instance).
    """

    # ID of the instance's boot disk.
    boot_disk_id: str

    @classmethod
    def load(cls, raw: Optional[str]) -> "NebiusInstanceBackendData":
        """Parse the JSON produced by .json(); `raw` must not be None for this backend."""
        assert raw is not None
        return cls.__response__.parse_raw(raw)
|
|
237
|
+
|
|
238
|
+
|
|
239
|
+
def _wait_for_instance(sdk: SDK, op: SDKOperation[Operation]) -> None:
    """Block until the instance created by `op` reaches STARTING or RUNNING.

    Polls the instance state every WAIT_FOR_INSTANCE_UPDATE_INTERVAL seconds,
    refreshing the operation between polls.

    Raises:
        ProvisioningError: if the create-instance operation finished unsuccessfully.
        BackendError: if the instance does not start booting within
            WAIT_FOR_INSTANCE_TIMEOUT seconds.
    """
    start = time.monotonic()
    while True:
        # Only a *finished and failed* operation is fatal; an in-progress
        # operation just means we keep polling.
        if op.done() and not op.successful():
            raise ProvisioningError(
                f"Create instance operation failed. Message: {op.raw().status.message}."
                f" Details: {op.raw().status.details}"
            )
        instance = resources.get_instance(sdk, op.resource_id)
        if instance.status.state in [
            instance.status.InstanceState.STARTING,
            instance.status.InstanceState.RUNNING,
        ]:
            break
        if time.monotonic() - start > WAIT_FOR_INSTANCE_TIMEOUT:
            raise BackendError(
                f"Instance {instance.metadata.id} did not start booting in time."
                f" Status: {instance.status.state.name}"
            )
        logger.debug(
            "Waiting for instance %s. Status: %s. Operation status: %s",
            instance.metadata.name,
            instance.status.state.name,
            op.status(),
        )
        time.sleep(WAIT_FOR_INSTANCE_UPDATE_INTERVAL)
        # Refresh the operation so the failure check above sees fresh state.
        resources.LOOP.await_(
            op.update(timeout=resources.REQUEST_TIMEOUT, metadata=resources.REQUEST_MD)
        )
|
|
268
|
+
|
|
269
|
+
|
|
270
|
+
def _supported_instances(offer: InstanceOffer) -> bool:
    """Offer filter: keep only non-spot offers on supported Nebius platforms.

    Catalog instance names are expected to be "<platform> <preset>" (the same
    convention NebiusCompute.create_instance relies on when splitting the name).
    A malformed name is rejected (returns False) rather than raising, so a
    single bad catalog entry cannot break offer listing with a ValueError.
    """
    parts = offer.instance.name.split()
    if len(parts) != 2:
        # Not "<platform> <preset>" — cannot be provisioned by this backend.
        return False
    platform = parts[0]
    return platform in SUPPORTED_PLATFORMS and not offer.instance.resources.spot
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
import json
|
|
2
|
+
|
|
3
|
+
from nebius.aio.service_error import RequestError
|
|
4
|
+
|
|
5
|
+
from dstack._internal.core.backends.base.configurator import (
|
|
6
|
+
BackendRecord,
|
|
7
|
+
Configurator,
|
|
8
|
+
raise_invalid_credentials_error,
|
|
9
|
+
)
|
|
10
|
+
from dstack._internal.core.backends.nebius import resources
|
|
11
|
+
from dstack._internal.core.backends.nebius.backend import NebiusBackend
|
|
12
|
+
from dstack._internal.core.backends.nebius.models import (
|
|
13
|
+
AnyNebiusBackendConfig,
|
|
14
|
+
NebiusBackendConfig,
|
|
15
|
+
NebiusBackendConfigWithCreds,
|
|
16
|
+
NebiusConfig,
|
|
17
|
+
NebiusCreds,
|
|
18
|
+
NebiusServiceAccountCreds,
|
|
19
|
+
NebiusStoredConfig,
|
|
20
|
+
)
|
|
21
|
+
from dstack._internal.core.models.backends.base import BackendType
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class NebiusConfigurator(Configurator):
    """Configurator for the Nebius backend.

    Validates credentials/regions against the live API and converts between
    the API config models, the stored DB representation, and the runtime
    NebiusConfig used by NebiusBackend.
    """

    TYPE = BackendType.NEBIUS
    BACKEND_CLASS = NebiusBackend

    def validate_config(self, config: NebiusBackendConfigWithCreds, default_creds_enabled: bool):
        """Check the creds work and that configured regions exist in the tenancy.

        Raises (via raise_invalid_credentials_error) on bad credentials or on
        regions not present in the tenancy. `default_creds_enabled` is unused —
        Nebius has no default-creds mechanism here.
        """
        assert isinstance(config.creds, NebiusServiceAccountCreds)
        try:
            # Listing projects doubles as a credentials check.
            sdk = resources.make_sdk(config.creds)
            available_regions = set(resources.get_region_to_project_id_map(sdk))
        except (ValueError, RequestError) as e:
            raise_invalid_credentials_error(
                fields=[["creds"]],
                details=str(e),
            )
        if invalid_regions := set(config.regions or []) - available_regions:
            raise_invalid_credentials_error(
                fields=[["regions"]],
                details=(
                    f"Configured regions {invalid_regions} do not exist in this Nebius tenancy."
                    " Omit `regions` to use all regions or select some of the available regions:"
                    f" {available_regions}"
                ),
            )

    def create_backend(
        self, project_name: str, config: NebiusBackendConfigWithCreds
    ) -> BackendRecord:
        """Split the config into stored config JSON and separately-stored creds JSON."""
        return BackendRecord(
            config=NebiusStoredConfig(
                **NebiusBackendConfig.__response__.parse_obj(config).dict()
            ).json(),
            auth=NebiusCreds.parse_obj(config.creds).json(),
        )

    def get_backend_config(
        self, record: BackendRecord, include_creds: bool
    ) -> AnyNebiusBackendConfig:
        """Rebuild the API-facing config from a DB record, with or without creds."""
        config = self._get_config(record)
        if include_creds:
            return NebiusBackendConfigWithCreds.__response__.parse_obj(config)
        return NebiusBackendConfig.__response__.parse_obj(config)

    def get_backend(self, record: BackendRecord) -> NebiusBackend:
        """Instantiate the runtime backend from a DB record."""
        config = self._get_config(record)
        return NebiusBackend(config=config)

    def _get_config(self, record: BackendRecord) -> NebiusConfig:
        # Merge the stored config JSON with the separately stored creds.
        return NebiusConfig.__response__(
            **json.loads(record.config),
            creds=NebiusCreds.parse_raw(record.auth),
        )
|
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
from typing import Annotated, Literal, Optional, Union
|
|
2
|
+
|
|
3
|
+
from pydantic import Field, root_validator
|
|
4
|
+
|
|
5
|
+
from dstack._internal.core.backends.base.models import fill_data
|
|
6
|
+
from dstack._internal.core.models.common import CoreModel
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class NebiusServiceAccountCreds(CoreModel):
    """Fully-resolved service account credentials (key content is mandatory).

    Counterpart of NebiusServiceAccountFileCreds, which additionally allows
    loading the key content from a file.
    """

    type: Annotated[Literal["service_account"], Field(description="The type of credentials")] = (
        "service_account"
    )
    service_account_id: Annotated[str, Field(description="Service account ID")]
    public_key_id: Annotated[str, Field(description="ID of the service account public key")]
    # Kept for interface parity with the file-creds model; content is what is used.
    private_key_file: Annotated[
        Optional[str], Field(description=("Path to the service account private key"))
    ] = None
    private_key_content: Annotated[
        str,
        Field(
            description=(
                "Content of the service account private key. When configuring via"
                " `server/config.yml`, it's automatically filled from `private_key_file`."
                " When configuring via UI, it has to be specified explicitly."
            )
        ),
    ]
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class NebiusServiceAccountFileCreds(CoreModel):
    """Service account credentials as written in `server/config.yml`.

    Same fields as NebiusServiceAccountCreds, but `private_key_content` is
    optional and is filled from `private_key_file` by the root validator.
    """

    type: Annotated[Literal["service_account"], Field(description="The type of credentials")] = (
        "service_account"
    )
    service_account_id: Annotated[str, Field(description="Service account ID")]
    public_key_id: Annotated[str, Field(description="ID of the service account public key")]
    private_key_file: Annotated[
        Optional[str], Field(description=("Path to the service account private key"))
    ] = None
    private_key_content: Annotated[
        Optional[str],
        Field(
            description=(
                "Content of the service account private key. When configuring via"
                " `server/config.yml`, it's automatically filled from `private_key_file`."
                " When configuring via UI, it has to be specified explicitly."
            )
        ),
    ] = None

    @root_validator
    def fill_data(cls, values):
        # Read private_key_file into private_key_content when only the path is given.
        return fill_data(
            values, filename_field="private_key_file", data_field="private_key_content"
        )
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
# Aliases kept for symmetry with other backends that support several creds types;
# Nebius currently has exactly one creds type of each flavor.
AnyNebiusCreds = NebiusServiceAccountCreds
NebiusCreds = AnyNebiusCreds
AnyNebiusFileCreds = NebiusServiceAccountFileCreds
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
class NebiusBackendConfig(CoreModel):
    """
    The backend config used in the API, server/config.yml, `NebiusConfigurator`.
    It also serves as a base class for other backend config models.
    Should not include creds.
    """

    type: Annotated[
        Literal["nebius"],
        Field(description="The type of backend"),
    ] = "nebius"
    # None means "all regions available in the tenancy".
    regions: Annotated[
        Optional[list[str]],
        Field(description="The list of Nebius regions. Omit to use all regions"),
    ] = None
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
class NebiusBackendConfigWithCreds(NebiusBackendConfig):
    """
    Same as `NebiusBackendConfig` but also includes creds.
    """

    creds: Annotated[AnyNebiusCreds, Field(description="The credentials")]
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
class NebiusBackendFileConfigWithCreds(NebiusBackendConfig):
    """Config variant for `server/config.yml`: creds may reference a key file."""

    creds: AnyNebiusFileCreds = Field(description="The credentials")
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
# Union returned by the configurator API layer: config with or without creds.
AnyNebiusBackendConfig = Union[NebiusBackendConfig, NebiusBackendConfigWithCreds]
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
class NebiusStoredConfig(NebiusBackendConfig):
    """
    The backend config used for config parameters in the DB.
    Can extend `NebiusBackendConfig` with additional parameters.
    """

    pass
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
class NebiusConfig(NebiusStoredConfig):
    """
    The backend config used by `NebiusBackend` and `NebiusCompute`.
    """

    # Resolved creds (merged back in from the separately stored auth record).
    creds: AnyNebiusCreds
|
|
@@ -0,0 +1,240 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import time
|
|
3
|
+
from collections import defaultdict
|
|
4
|
+
from collections.abc import Container as ContainerT
|
|
5
|
+
from collections.abc import Generator
|
|
6
|
+
from contextlib import contextmanager
|
|
7
|
+
from tempfile import NamedTemporaryFile
|
|
8
|
+
|
|
9
|
+
from nebius.aio.authorization.options import options_to_metadata
|
|
10
|
+
from nebius.aio.operation import Operation as SDKOperation
|
|
11
|
+
from nebius.aio.service_error import RequestError, StatusCode
|
|
12
|
+
from nebius.aio.token.renewable import OPTION_RENEW_REQUEST_TIMEOUT, OPTION_RENEW_SYNCHRONOUS
|
|
13
|
+
from nebius.api.nebius.common.v1 import Operation, ResourceMetadata
|
|
14
|
+
from nebius.api.nebius.compute.v1 import (
|
|
15
|
+
AttachedDiskSpec,
|
|
16
|
+
CreateDiskRequest,
|
|
17
|
+
CreateInstanceRequest,
|
|
18
|
+
DeleteDiskRequest,
|
|
19
|
+
DeleteInstanceRequest,
|
|
20
|
+
DiskServiceClient,
|
|
21
|
+
DiskSpec,
|
|
22
|
+
ExistingDisk,
|
|
23
|
+
GetInstanceRequest,
|
|
24
|
+
Instance,
|
|
25
|
+
InstanceServiceClient,
|
|
26
|
+
InstanceSpec,
|
|
27
|
+
IPAddress,
|
|
28
|
+
NetworkInterfaceSpec,
|
|
29
|
+
PublicIPAddress,
|
|
30
|
+
ResourcesSpec,
|
|
31
|
+
SourceImageFamily,
|
|
32
|
+
)
|
|
33
|
+
from nebius.api.nebius.iam.v1 import (
|
|
34
|
+
Container,
|
|
35
|
+
ListProjectsRequest,
|
|
36
|
+
ListTenantsRequest,
|
|
37
|
+
ProjectServiceClient,
|
|
38
|
+
TenantServiceClient,
|
|
39
|
+
)
|
|
40
|
+
from nebius.api.nebius.vpc.v1 import ListSubnetsRequest, Subnet, SubnetServiceClient
|
|
41
|
+
from nebius.sdk import SDK
|
|
42
|
+
|
|
43
|
+
from dstack._internal.core.backends.nebius.models import NebiusServiceAccountCreds
|
|
44
|
+
from dstack._internal.core.errors import BackendError, NoCapacityError
|
|
45
|
+
from dstack._internal.utils.event_loop import DaemonEventLoop
|
|
46
|
+
from dstack._internal.utils.logging import get_logger
|
|
47
|
+
|
|
48
|
+
#
# Guidelines on using the Nebius SDK:
#
# Do not use Request.wait() or other sync SDK methods, they suffer from deadlocks.
# Instead, use async methods and await them with LOOP.await_()
# (a dedicated daemon-thread event loop shared by all helpers in this module).
LOOP = DaemonEventLoop()
# Pass a timeout to all methods to avoid infinite waiting
REQUEST_TIMEOUT = 10
# Pass REQUEST_MD to all methods to avoid infinite retries in case of invalid credentials
REQUEST_MD = options_to_metadata(
    {
        OPTION_RENEW_SYNCHRONOUS: "true",
        OPTION_RENEW_REQUEST_TIMEOUT: "5",
    }
)

# disables log messages about errors such as invalid creds or expired timeouts
logging.getLogger("nebius").setLevel(logging.CRITICAL)
logger = get_logger(__name__)
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
@contextmanager
def wrap_capacity_errors() -> Generator[None, None, None]:
    """Translate capacity/quota RequestErrors into NoCapacityError.

    A RequestError with status RESOURCE_EXHAUSTED, or whose message contains
    "Quota limit exceeded" (quota errors apparently arrive with a different
    status code), is re-raised as NoCapacityError. Any other error propagates
    unchanged.
    """
    try:
        yield
    except RequestError as e:
        if e.status.code == StatusCode.RESOURCE_EXHAUSTED or "Quota limit exceeded" in str(e):
            raise NoCapacityError(e)
        raise
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
@contextmanager
def ignore_errors(status_codes: ContainerT[StatusCode]) -> Generator[None, None, None]:
    """Suppress RequestErrors whose status code is in `status_codes`.

    Typical use: ignore NOT_FOUND when deleting resources that may already be
    gone. Errors with other status codes (and non-RequestErrors) propagate.
    """
    try:
        yield
    except RequestError as e:
        if e.status.code not in status_codes:
            raise
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def make_sdk(creds: NebiusServiceAccountCreds) -> SDK:
    """Build a Nebius SDK client authenticated with the service account key.

    The key content is written to a temporary file because the SDK takes a
    file path. NOTE(review): the temp file is removed when the `with` block
    exits, immediately after SDK(...) returns — this assumes the SDK reads
    the key file eagerly during construction; confirm against the SDK docs.
    """
    with NamedTemporaryFile("w") as f:
        f.write(creds.private_key_content)
        f.flush()
        return SDK(
            service_account_private_key_file_name=f.name,
            service_account_public_key_id=creds.public_key_id,
            service_account_id=creds.service_account_id,
        )
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def wait_for_operation(
    op: SDKOperation[Operation],
    timeout: float,
    interval: float = 1,
) -> None:
    """Poll `op` until it completes, refreshing its state every `interval` seconds.

    Re-implementation of SDKOperation.wait() to avoid
    https://github.com/nebius/pysdk/issues/74.

    Raises TimeoutError if the operation is still running near the `timeout`
    deadline (measured on the monotonic clock).
    """
    give_up_at = time.monotonic() + timeout
    while True:
        if op.done():
            return
        # Bail out early if the next poll would land past the deadline.
        if time.monotonic() + interval > give_up_at:
            raise TimeoutError(f"Operation {op.id} wait timeout")
        time.sleep(interval)
        LOOP.await_(op.update(timeout=REQUEST_TIMEOUT, metadata=REQUEST_MD))
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
def get_region_to_project_id_map(sdk: SDK) -> dict[str, str]:
    """Map each region in the tenancy to its default project ID.

    Assumes the credentials can see exactly one tenant (raises ValueError
    otherwise). Regions where a single default project cannot be determined
    are logged and skipped.
    """
    tenants = LOOP.await_(
        TenantServiceClient(sdk).list(
            ListTenantsRequest(), timeout=REQUEST_TIMEOUT, metadata=REQUEST_MD
        )
    )
    if len(tenants.items) != 1:
        raise ValueError(f"Expected to find 1 tenant, found {(len(tenants.items))}")
    tenant_id = tenants.items[0].metadata.id
    projects = LOOP.await_(
        ProjectServiceClient(sdk).list(
            ListProjectsRequest(parent_id=tenant_id, page_size=999),
            timeout=REQUEST_TIMEOUT,
            metadata=REQUEST_MD,
        )
    )
    # Group projects by region, then pick the single (default) project per region.
    region_to_projects: defaultdict[str, list[Container]] = defaultdict(list)
    for project in projects.items:
        region_to_projects[project.status.region].append(project)
    region_to_project_id = {}
    for region, region_projects in region_to_projects.items():
        if len(region_projects) != 1:
            # Currently, there can only be one project per region.
            # This condition is implemented just in case Nebius suddenly allows more projects.
            region_projects = [
                p for p in region_projects if p.metadata.name.startswith("default-project")
            ]
            if len(region_projects) != 1:
                logger.warning(
                    "Could not find the default project in region %s, tenant %s", region, tenant_id
                )
                continue
        region_to_project_id[region] = region_projects[0].metadata.id
    return region_to_project_id
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
def get_default_subnet(sdk: SDK, project_id: str) -> Subnet:
    """Return the project's default subnet (name starting with "default-subnet").

    Raises BackendError when the project has no such subnet.
    """
    listing = LOOP.await_(
        SubnetServiceClient(sdk).list(
            ListSubnetsRequest(parent_id=project_id, page_size=999),
            timeout=REQUEST_TIMEOUT,
            metadata=REQUEST_MD,
        )
    )
    found = next(
        (s for s in listing.items if s.metadata.name.startswith("default-subnet")),
        None,
    )
    if found is None:
        raise BackendError(f"Could not find default subnet in project {project_id}")
    return found
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
def create_disk(
    sdk: SDK, name: str, project_id: str, size_mib: int, image_family: str
) -> SDKOperation[Operation]:
    """Start creating a NETWORK_SSD boot disk from `image_family`.

    Returns the (possibly still running) create operation; callers wait on it
    with wait_for_operation(). Capacity/quota errors are raised as
    NoCapacityError via wrap_capacity_errors().
    """
    client = DiskServiceClient(sdk)
    request = CreateDiskRequest(
        metadata=ResourceMetadata(
            name=name,
            parent_id=project_id,
        ),
        spec=DiskSpec(
            size_mebibytes=size_mib,
            type=DiskSpec.DiskType.NETWORK_SSD,
            source_image_family=SourceImageFamily(image_family=image_family),
        ),
    )
    with wrap_capacity_errors():
        return LOOP.await_(client.create(request, timeout=REQUEST_TIMEOUT, metadata=REQUEST_MD))
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
def delete_disk(sdk: SDK, disk_id: str) -> None:
    """Request deletion of the disk `disk_id` (does not wait for completion)."""
    client = DiskServiceClient(sdk)
    request = DeleteDiskRequest(id=disk_id)
    LOOP.await_(client.delete(request, timeout=REQUEST_TIMEOUT, metadata=REQUEST_MD))
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
def create_instance(
    sdk: SDK,
    name: str,
    project_id: str,
    user_data: str,
    platform: str,
    preset: str,
    disk_id: str,
    subnet_id: str,
) -> SDKOperation[Operation]:
    """Start creating an instance attached to the existing boot disk `disk_id`.

    The instance gets a single network interface on `subnet_id` with a static
    public IP. Returns the (possibly still running) create operation.
    Capacity/quota errors are raised as NoCapacityError via wrap_capacity_errors().
    """
    client = InstanceServiceClient(sdk)
    request = CreateInstanceRequest(
        metadata=ResourceMetadata(
            name=name,
            parent_id=project_id,
        ),
        spec=InstanceSpec(
            cloud_init_user_data=user_data,
            resources=ResourcesSpec(platform=platform, preset=preset),
            boot_disk=AttachedDiskSpec(
                attach_mode=AttachedDiskSpec.AttachMode.READ_WRITE,
                existing_disk=ExistingDisk(id=disk_id),
            ),
            network_interfaces=[
                NetworkInterfaceSpec(
                    name="dstack-default-interface",
                    subnet_id=subnet_id,
                    ip_address=IPAddress(),
                    public_ip_address=PublicIPAddress(static=True),
                )
            ],
        ),
    )
    with wrap_capacity_errors():
        return LOOP.await_(client.create(request, timeout=REQUEST_TIMEOUT, metadata=REQUEST_MD))
|
|
225
|
+
|
|
226
|
+
|
|
227
|
+
def get_instance(sdk: SDK, instance_id: str) -> Instance:
    """Fetch the current state of the instance `instance_id`."""
    client = InstanceServiceClient(sdk)
    request = GetInstanceRequest(id=instance_id)
    return LOOP.await_(client.get(request, timeout=REQUEST_TIMEOUT, metadata=REQUEST_MD))
|
|
233
|
+
|
|
234
|
+
|
|
235
|
+
def delete_instance(sdk: SDK, instance_id: str) -> SDKOperation[Operation]:
    """Request deletion of the instance; returns the (possibly still running) operation."""
    return LOOP.await_(
        InstanceServiceClient(sdk).delete(
            DeleteInstanceRequest(id=instance_id), timeout=REQUEST_TIMEOUT, metadata=REQUEST_MD
        )
    )
|