dstack-0.19.1-py3-none-any.whl → dstack-0.19.3-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68)
  1. dstack/_internal/cli/commands/metrics.py +138 -0
  2. dstack/_internal/cli/commands/stats.py +5 -119
  3. dstack/_internal/cli/main.py +2 -0
  4. dstack/_internal/cli/services/profile.py +9 -0
  5. dstack/_internal/core/backends/aws/configurator.py +1 -0
  6. dstack/_internal/core/backends/base/compute.py +4 -1
  7. dstack/_internal/core/backends/base/models.py +7 -7
  8. dstack/_internal/core/backends/configurators.py +9 -0
  9. dstack/_internal/core/backends/cudo/compute.py +2 -0
  10. dstack/_internal/core/backends/cudo/configurator.py +0 -13
  11. dstack/_internal/core/backends/datacrunch/compute.py +118 -32
  12. dstack/_internal/core/backends/datacrunch/configurator.py +16 -11
  13. dstack/_internal/core/backends/gcp/compute.py +140 -26
  14. dstack/_internal/core/backends/gcp/configurator.py +2 -0
  15. dstack/_internal/core/backends/gcp/features/__init__.py +0 -0
  16. dstack/_internal/core/backends/gcp/features/tcpx.py +34 -0
  17. dstack/_internal/core/backends/gcp/models.py +13 -1
  18. dstack/_internal/core/backends/gcp/resources.py +64 -27
  19. dstack/_internal/core/backends/lambdalabs/compute.py +2 -4
  20. dstack/_internal/core/backends/lambdalabs/configurator.py +0 -21
  21. dstack/_internal/core/backends/models.py +8 -0
  22. dstack/_internal/core/backends/nebius/__init__.py +0 -0
  23. dstack/_internal/core/backends/nebius/backend.py +16 -0
  24. dstack/_internal/core/backends/nebius/compute.py +272 -0
  25. dstack/_internal/core/backends/nebius/configurator.py +74 -0
  26. dstack/_internal/core/backends/nebius/models.py +108 -0
  27. dstack/_internal/core/backends/nebius/resources.py +240 -0
  28. dstack/_internal/core/backends/tensordock/api_client.py +5 -4
  29. dstack/_internal/core/backends/tensordock/compute.py +2 -15
  30. dstack/_internal/core/errors.py +14 -0
  31. dstack/_internal/core/models/backends/base.py +2 -0
  32. dstack/_internal/core/models/profiles.py +3 -0
  33. dstack/_internal/proxy/lib/schemas/model_proxy.py +3 -3
  34. dstack/_internal/server/background/tasks/process_instances.py +12 -7
  35. dstack/_internal/server/background/tasks/process_running_jobs.py +20 -0
  36. dstack/_internal/server/background/tasks/process_submitted_jobs.py +3 -2
  37. dstack/_internal/server/routers/prometheus.py +5 -0
  38. dstack/_internal/server/security/permissions.py +19 -1
  39. dstack/_internal/server/services/instances.py +14 -6
  40. dstack/_internal/server/services/jobs/__init__.py +3 -3
  41. dstack/_internal/server/services/offers.py +4 -2
  42. dstack/_internal/server/services/runs.py +0 -2
  43. dstack/_internal/server/statics/index.html +1 -1
  44. dstack/_internal/server/statics/{main-da9f8c06a69c20dac23e.css → main-8f9c66f404e9c7e7e020.css} +1 -1
  45. dstack/_internal/server/statics/{main-4a0fe83e84574654e397.js → main-e190de603dc1e9f485ec.js} +7306 -149
  46. dstack/_internal/server/statics/{main-4a0fe83e84574654e397.js.map → main-e190de603dc1e9f485ec.js.map} +1 -1
  47. dstack/_internal/utils/common.py +8 -2
  48. dstack/_internal/utils/event_loop.py +30 -0
  49. dstack/_internal/utils/ignore.py +2 -0
  50. dstack/api/server/_fleets.py +3 -5
  51. dstack/api/server/_runs.py +6 -7
  52. dstack/version.py +1 -1
  53. {dstack-0.19.1.dist-info → dstack-0.19.3.dist-info}/METADATA +27 -11
  54. {dstack-0.19.1.dist-info → dstack-0.19.3.dist-info}/RECORD +67 -57
  55. tests/_internal/core/backends/datacrunch/test_configurator.py +6 -2
  56. tests/_internal/server/background/tasks/test_process_instances.py +4 -2
  57. tests/_internal/server/background/tasks/test_process_submitted_jobs.py +29 -0
  58. tests/_internal/server/routers/test_backends.py +116 -0
  59. tests/_internal/server/routers/test_fleets.py +2 -0
  60. tests/_internal/server/routers/test_prometheus.py +21 -0
  61. tests/_internal/server/routers/test_runs.py +4 -0
  62. tests/_internal/utils/test_common.py +16 -1
  63. tests/_internal/utils/test_event_loop.py +18 -0
  64. dstack/_internal/core/backends/datacrunch/api_client.py +0 -77
  65. {dstack-0.19.1.dist-info → dstack-0.19.3.dist-info}/LICENSE.md +0 -0
  66. {dstack-0.19.1.dist-info → dstack-0.19.3.dist-info}/WHEEL +0 -0
  67. {dstack-0.19.1.dist-info → dstack-0.19.3.dist-info}/entry_points.txt +0 -0
  68. {dstack-0.19.1.dist-info → dstack-0.19.3.dist-info}/top_level.txt +0 -0
dstack/_internal/core/backends/nebius/compute.py (new file)
@@ -0,0 +1,272 @@
+import json
+import shlex
+import time
+from functools import cached_property
+from typing import List, Optional
+
+from nebius.aio.operation import Operation as SDKOperation
+from nebius.aio.service_error import RequestError, StatusCode
+from nebius.api.nebius.common.v1 import Operation
+from nebius.sdk import SDK
+
+from dstack._internal.core.backends.base.backend import Compute
+from dstack._internal.core.backends.base.compute import (
+    ComputeWithCreateInstanceSupport,
+    ComputeWithMultinodeSupport,
+    generate_unique_instance_name,
+    get_user_data,
+)
+from dstack._internal.core.backends.base.offers import get_catalog_offers
+from dstack._internal.core.backends.nebius import resources
+from dstack._internal.core.backends.nebius.models import NebiusConfig, NebiusServiceAccountCreds
+from dstack._internal.core.errors import BackendError, NotYetTerminated, ProvisioningError
+from dstack._internal.core.models.backends.base import BackendType
+from dstack._internal.core.models.common import CoreModel
+from dstack._internal.core.models.instances import (
+    InstanceAvailability,
+    InstanceConfiguration,
+    InstanceOffer,
+    InstanceOfferWithAvailability,
+)
+from dstack._internal.core.models.resources import Memory, Range
+from dstack._internal.core.models.runs import JobProvisioningData, Requirements
+from dstack._internal.utils.logging import get_logger
+
+logger = get_logger(__name__)
+CONFIGURABLE_DISK_SIZE = Range[Memory](
+    min=Memory.parse("40GB"),  # min for the ubuntu22.04-cuda12 image
+    max=Memory.parse("8192GB"),  # max for the NETWORK_SSD disk type
+)
+WAIT_FOR_DISK_TIMEOUT = 20
+WAIT_FOR_INSTANCE_TIMEOUT = 30
+WAIT_FOR_INSTANCE_UPDATE_INTERVAL = 2.5
+DELETE_INSTANCE_TIMEOUT = 25
+DOCKER_DAEMON_CONFIG = {
+    "runtimes": {"nvidia": {"args": [], "path": "nvidia-container-runtime"}},
+    # Workaround for https://github.com/NVIDIA/nvidia-container-toolkit/issues/48
+    "exec-opts": ["native.cgroupdriver=cgroupfs"],
+}
+SETUP_COMMANDS = [
+    "ufw allow ssh",
+    "ufw allow from 10.0.0.0/8",
+    "ufw allow from 172.16.0.0/12",
+    "ufw allow from 192.168.0.0/16",
+    "ufw default deny incoming",
+    "ufw default allow outgoing",
+    "ufw enable",
+    'sed -i "s/.*AllowTcpForwarding.*/AllowTcpForwarding yes/g" /etc/ssh/sshd_config',
+    "service ssh restart",
+    f"echo {shlex.quote(json.dumps(DOCKER_DAEMON_CONFIG))} > /etc/docker/daemon.json",
+    "service docker restart",
+]
+SUPPORTED_PLATFORMS = [
+    "gpu-h100-sxm",
+    "gpu-h200-sxm",
+    "gpu-l40s-a",
+    "gpu-l40s-d",
+    "cpu-d3",
+    "cpu-e2",
+]
+
+
+class NebiusCompute(
+    ComputeWithCreateInstanceSupport,
+    ComputeWithMultinodeSupport,
+    Compute,
+):
+    def __init__(self, config: NebiusConfig):
+        super().__init__()
+        self.config = config
+        self._subnet_id_cache: dict[str, str] = {}
+
+    @cached_property
+    def _sdk(self) -> SDK:
+        assert isinstance(self.config.creds, NebiusServiceAccountCreds)
+        return resources.make_sdk(self.config.creds)
+
+    @cached_property
+    def _region_to_project_id(self) -> dict[str, str]:
+        return resources.get_region_to_project_id_map(self._sdk)
+
+    def _get_subnet_id(self, region: str) -> str:
+        if region not in self._subnet_id_cache:
+            self._subnet_id_cache[region] = resources.get_default_subnet(
+                self._sdk, self._region_to_project_id[region]
+            ).metadata.id
+        return self._subnet_id_cache[region]
+
+    def get_offers(
+        self, requirements: Optional[Requirements] = None
+    ) -> List[InstanceOfferWithAvailability]:
+        offers = get_catalog_offers(
+            backend=BackendType.NEBIUS,
+            locations=self.config.regions or list(self._region_to_project_id),
+            requirements=requirements,
+            extra_filter=_supported_instances,
+            configurable_disk_size=CONFIGURABLE_DISK_SIZE,
+        )
+        return [
+            InstanceOfferWithAvailability(
+                **offer.dict(),
+                availability=InstanceAvailability.UNKNOWN,
+            )
+            for offer in offers
+        ]
+
+    def create_instance(
+        self,
+        instance_offer: InstanceOfferWithAvailability,
+        instance_config: InstanceConfiguration,
+    ) -> JobProvisioningData:
+        # NOTE: This method can block for a long time as it waits for the boot disk to be created
+        # and the instance to enter the STARTING state. This has to be done in create_instance so
+        # that we can handle quota and availability errors that may occur even after creating an
+        # instance.
+        instance_name = generate_unique_instance_name(instance_config)
+        platform, preset = instance_offer.instance.name.split()
+        create_disk_op = resources.create_disk(
+            sdk=self._sdk,
+            name=instance_name,
+            project_id=self._region_to_project_id[instance_offer.region],
+            size_mib=instance_offer.instance.resources.disk.size_mib,
+            image_family="ubuntu22.04-cuda12",
+        )
+        create_instance_op = None
+        try:
+            logger.debug("Blocking until disk %s is created", create_disk_op.resource_id)
+            resources.wait_for_operation(create_disk_op, timeout=WAIT_FOR_DISK_TIMEOUT)
+            if not create_disk_op.successful():
+                raw_op = create_disk_op.raw()
+                raise ProvisioningError(
+                    f"Create disk operation failed. Message: {raw_op.status.message}."
+                    f" Details: {raw_op.status.details}"
+                )
+            create_instance_op = resources.create_instance(
+                sdk=self._sdk,
+                name=instance_name,
+                project_id=self._region_to_project_id[instance_offer.region],
+                user_data=get_user_data(
+                    instance_config.get_public_keys(),
+                    backend_specific_commands=SETUP_COMMANDS,
+                ),
+                platform=platform,
+                preset=preset,
+                disk_id=create_disk_op.resource_id,
+                subnet_id=self._get_subnet_id(instance_offer.region),
+            )
+            _wait_for_instance(self._sdk, create_instance_op)
+        except BaseException:
+            if create_instance_op is not None:
+                try:
+                    with resources.ignore_errors([StatusCode.NOT_FOUND]):
+                        delete_instance_op = resources.delete_instance(
+                            self._sdk, create_instance_op.resource_id
+                        )
+                        resources.wait_for_operation(
+                            delete_instance_op, timeout=DELETE_INSTANCE_TIMEOUT
+                        )
+                except Exception as e:
+                    logger.exception(
+                        "Could not delete instance %s: %s", create_instance_op.resource_id, e
+                    )
+            try:
+                with resources.ignore_errors([StatusCode.NOT_FOUND]):
+                    resources.delete_disk(self._sdk, create_disk_op.resource_id)
+            except Exception as e:
+                logger.exception(
+                    "Could not delete boot disk %s: %s", create_disk_op.resource_id, e
+                )
+            raise
+        return JobProvisioningData(
+            backend=instance_offer.backend,
+            instance_type=instance_offer.instance,
+            instance_id=create_instance_op.resource_id,
+            hostname=None,
+            region=instance_offer.region,
+            price=instance_offer.price,
+            ssh_port=22,
+            username="ubuntu",
+            dockerized=True,
+            backend_data=NebiusInstanceBackendData(boot_disk_id=create_disk_op.resource_id).json(),
+        )
+
+    def update_provisioning_data(
+        self, provisioning_data, project_ssh_public_key, project_ssh_private_key
+    ):
+        instance = resources.get_instance(self._sdk, provisioning_data.instance_id)
+        if not instance.status.network_interfaces:
+            return
+        interface = instance.status.network_interfaces[0]
+        provisioning_data.hostname, _ = interface.public_ip_address.address.split("/")
+        provisioning_data.internal_ip, _ = interface.ip_address.address.split("/")
+
+    def terminate_instance(
+        self, instance_id: str, region: str, backend_data: Optional[str] = None
+    ):
+        backend_data_parsed = NebiusInstanceBackendData.load(backend_data)
+        try:
+            instance = resources.get_instance(self._sdk, instance_id)
+        except RequestError as e:
+            if e.status.code != StatusCode.NOT_FOUND:
+                raise
+            instance = None
+        if instance is not None:
+            if instance.status.state != instance.status.InstanceState.DELETING:
+                resources.delete_instance(self._sdk, instance_id)
+                raise NotYetTerminated(
+                    "Requested instance deletion."
+                    " Will wait for deletion before deleting the boot disk."
+                    f" Instance state was: {instance.status.state.name}"
+                )
+            else:
+                raise NotYetTerminated(
+                    "Waiting for instance deletion before deleting the boot disk."
+                    f" Instance state: {instance.status.state.name}"
+                )
+        with resources.ignore_errors([StatusCode.NOT_FOUND]):
+            resources.delete_disk(self._sdk, backend_data_parsed.boot_disk_id)
+
+
+class NebiusInstanceBackendData(CoreModel):
+    boot_disk_id: str
+
+    @classmethod
+    def load(cls, raw: Optional[str]) -> "NebiusInstanceBackendData":
+        assert raw is not None
+        return cls.__response__.parse_raw(raw)
+
+
+def _wait_for_instance(sdk: SDK, op: SDKOperation[Operation]) -> None:
+    start = time.monotonic()
+    while True:
+        if op.done() and not op.successful():
+            raise ProvisioningError(
+                f"Create instance operation failed. Message: {op.raw().status.message}."
+                f" Details: {op.raw().status.details}"
+            )
+        instance = resources.get_instance(sdk, op.resource_id)
+        if instance.status.state in [
+            instance.status.InstanceState.STARTING,
+            instance.status.InstanceState.RUNNING,
+        ]:
+            break
+        if time.monotonic() - start > WAIT_FOR_INSTANCE_TIMEOUT:
+            raise BackendError(
+                f"Instance {instance.metadata.id} did not start booting in time."
+                f" Status: {instance.status.state.name}"
+            )
+        logger.debug(
+            "Waiting for instance %s. Status: %s. Operation status: %s",
+            instance.metadata.name,
+            instance.status.state.name,
+            op.status(),
+        )
+        time.sleep(WAIT_FOR_INSTANCE_UPDATE_INTERVAL)
+        resources.LOOP.await_(
+            op.update(timeout=resources.REQUEST_TIMEOUT, metadata=resources.REQUEST_MD)
+        )
+
+
+def _supported_instances(offer: InstanceOffer) -> bool:
+    platform, _ = offer.instance.name.split()
+    return platform in SUPPORTED_PLATFORMS and not offer.instance.resources.spot
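The termination path above is retry-driven: `terminate_instance()` raises `NotYetTerminated` while the instance is still being deleted and only removes the boot disk once the instance is gone. A minimal sketch of a caller driving that contract (the standalone loop and the poll interval are illustrative assumptions; in dstack the server's background processing performs the retries):

```python
import time

from dstack._internal.core.errors import NotYetTerminated


def terminate_and_wait(compute, instance_id: str, region: str, backend_data: str) -> None:
    # Keep calling terminate_instance until it stops raising NotYetTerminated.
    # Each call either requests deletion, reports that deletion is still in
    # progress, or (finally) deletes the boot disk and returns.
    while True:
        try:
            compute.terminate_instance(instance_id, region, backend_data)
            return
        except NotYetTerminated as e:
            print(f"Still terminating: {e}")
            time.sleep(5)  # illustrative poll interval
```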
dstack/_internal/core/backends/nebius/configurator.py (new file)
@@ -0,0 +1,74 @@
+import json
+
+from nebius.aio.service_error import RequestError
+
+from dstack._internal.core.backends.base.configurator import (
+    BackendRecord,
+    Configurator,
+    raise_invalid_credentials_error,
+)
+from dstack._internal.core.backends.nebius import resources
+from dstack._internal.core.backends.nebius.backend import NebiusBackend
+from dstack._internal.core.backends.nebius.models import (
+    AnyNebiusBackendConfig,
+    NebiusBackendConfig,
+    NebiusBackendConfigWithCreds,
+    NebiusConfig,
+    NebiusCreds,
+    NebiusServiceAccountCreds,
+    NebiusStoredConfig,
+)
+from dstack._internal.core.models.backends.base import BackendType
+
+
+class NebiusConfigurator(Configurator):
+    TYPE = BackendType.NEBIUS
+    BACKEND_CLASS = NebiusBackend
+
+    def validate_config(self, config: NebiusBackendConfigWithCreds, default_creds_enabled: bool):
+        assert isinstance(config.creds, NebiusServiceAccountCreds)
+        try:
+            sdk = resources.make_sdk(config.creds)
+            available_regions = set(resources.get_region_to_project_id_map(sdk))
+        except (ValueError, RequestError) as e:
+            raise_invalid_credentials_error(
+                fields=[["creds"]],
+                details=str(e),
+            )
+        if invalid_regions := set(config.regions or []) - available_regions:
+            raise_invalid_credentials_error(
+                fields=[["regions"]],
+                details=(
+                    f"Configured regions {invalid_regions} do not exist in this Nebius tenancy."
+                    " Omit `regions` to use all regions or select some of the available regions:"
+                    f" {available_regions}"
+                ),
+            )
+
+    def create_backend(
+        self, project_name: str, config: NebiusBackendConfigWithCreds
+    ) -> BackendRecord:
+        return BackendRecord(
+            config=NebiusStoredConfig(
+                **NebiusBackendConfig.__response__.parse_obj(config).dict()
+            ).json(),
+            auth=NebiusCreds.parse_obj(config.creds).json(),
+        )
+
+    def get_backend_config(
+        self, record: BackendRecord, include_creds: bool
+    ) -> AnyNebiusBackendConfig:
+        config = self._get_config(record)
+        if include_creds:
+            return NebiusBackendConfigWithCreds.__response__.parse_obj(config)
+        return NebiusBackendConfig.__response__.parse_obj(config)
+
+    def get_backend(self, record: BackendRecord) -> NebiusBackend:
+        config = self._get_config(record)
+        return NebiusBackend(config=config)
+
+    def _get_config(self, record: BackendRecord) -> NebiusConfig:
+        return NebiusConfig.__response__(
+            **json.loads(record.config),
+            creds=NebiusCreds.parse_raw(record.auth),
+        )
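A minimal sketch of exercising `NebiusConfigurator.validate_config()` directly; all credential values and the region name below are placeholders, and the call only succeeds against a real Nebius tenancy:

```python
from dstack._internal.core.backends.nebius.configurator import NebiusConfigurator
from dstack._internal.core.backends.nebius.models import (
    NebiusBackendConfigWithCreds,
    NebiusServiceAccountCreds,
)

# Placeholder credentials and region; real values come from the Nebius console.
config = NebiusBackendConfigWithCreds(
    regions=["eu-north1"],
    creds=NebiusServiceAccountCreds(
        service_account_id="serviceaccount-e00...",
        public_key_id="publickey-e00...",
        private_key_content="-----BEGIN PRIVATE KEY-----\n...",
    ),
)

configurator = NebiusConfigurator()
# Raises an "invalid credentials" server error if Nebius rejects the creds
# or if any configured region is not present in the tenancy.
configurator.validate_config(config, default_creds_enabled=False)
```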
dstack/_internal/core/backends/nebius/models.py (new file)
@@ -0,0 +1,108 @@
+from typing import Annotated, Literal, Optional, Union
+
+from pydantic import Field, root_validator
+
+from dstack._internal.core.backends.base.models import fill_data
+from dstack._internal.core.models.common import CoreModel
+
+
+class NebiusServiceAccountCreds(CoreModel):
+    type: Annotated[Literal["service_account"], Field(description="The type of credentials")] = (
+        "service_account"
+    )
+    service_account_id: Annotated[str, Field(description="Service account ID")]
+    public_key_id: Annotated[str, Field(description="ID of the service account public key")]
+    private_key_file: Annotated[
+        Optional[str], Field(description=("Path to the service account private key"))
+    ] = None
+    private_key_content: Annotated[
+        str,
+        Field(
+            description=(
+                "Content of the service account private key. When configuring via"
+                " `server/config.yml`, it's automatically filled from `private_key_file`."
+                " When configuring via UI, it has to be specified explicitly."
+            )
+        ),
+    ]
+
+
+class NebiusServiceAccountFileCreds(CoreModel):
+    type: Annotated[Literal["service_account"], Field(description="The type of credentials")] = (
+        "service_account"
+    )
+    service_account_id: Annotated[str, Field(description="Service account ID")]
+    public_key_id: Annotated[str, Field(description="ID of the service account public key")]
+    private_key_file: Annotated[
+        Optional[str], Field(description=("Path to the service account private key"))
+    ] = None
+    private_key_content: Annotated[
+        Optional[str],
+        Field(
+            description=(
+                "Content of the service account private key. When configuring via"
+                " `server/config.yml`, it's automatically filled from `private_key_file`."
+                " When configuring via UI, it has to be specified explicitly."
+            )
+        ),
+    ] = None
+
+    @root_validator
+    def fill_data(cls, values):
+        return fill_data(
+            values, filename_field="private_key_file", data_field="private_key_content"
+        )
+
+
+AnyNebiusCreds = NebiusServiceAccountCreds
+NebiusCreds = AnyNebiusCreds
+AnyNebiusFileCreds = NebiusServiceAccountFileCreds
+
+
+class NebiusBackendConfig(CoreModel):
+    """
+    The backend config used in the API, server/config.yml, `NebiusConfigurator`.
+    It also serves as a base class for other backend config models.
+    Should not include creds.
+    """
+
+    type: Annotated[
+        Literal["nebius"],
+        Field(description="The type of backend"),
+    ] = "nebius"
+    regions: Annotated[
+        Optional[list[str]],
+        Field(description="The list of Nebius regions. Omit to use all regions"),
+    ] = None
+
+
+class NebiusBackendConfigWithCreds(NebiusBackendConfig):
+    """
+    Same as `NebiusBackendConfig` but also includes creds.
+    """
+
+    creds: Annotated[AnyNebiusCreds, Field(description="The credentials")]
+
+
+class NebiusBackendFileConfigWithCreds(NebiusBackendConfig):
+    creds: AnyNebiusFileCreds = Field(description="The credentials")
+
+
+AnyNebiusBackendConfig = Union[NebiusBackendConfig, NebiusBackendConfigWithCreds]
+
+
+class NebiusStoredConfig(NebiusBackendConfig):
+    """
+    The backend config used for config parameters in the DB.
+    Can extend `NebiusBackendConfig` with additional parameters.
+    """
+
+    pass
+
+
+class NebiusConfig(NebiusStoredConfig):
+    """
+    The backend config used by `NebiusBackend` and `NebiusCompute`.
+    """
+
+    creds: AnyNebiusCreds
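A minimal sketch of the difference between the two creds models: `NebiusServiceAccountFileCreds` accepts a `private_key_file`, and its `fill_data` root validator copies the file contents into `private_key_content` at parse time. The IDs and path below are placeholders, and the sketch assumes the key file exists at that path:

```python
from dstack._internal.core.backends.nebius.models import NebiusServiceAccountFileCreds

# Placeholder IDs and path; the root_validator reads the file during parsing
# and fills private_key_content from it.
creds = NebiusServiceAccountFileCreds(
    service_account_id="serviceaccount-e00...",
    public_key_id="publickey-e00...",
    private_key_file="/path/to/private_key.pem",
)
assert creds.private_key_content is not None  # filled from private_key_file
```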
dstack/_internal/core/backends/nebius/resources.py (new file)
@@ -0,0 +1,240 @@
+import logging
+import time
+from collections import defaultdict
+from collections.abc import Container as ContainerT
+from collections.abc import Generator
+from contextlib import contextmanager
+from tempfile import NamedTemporaryFile
+
+from nebius.aio.authorization.options import options_to_metadata
+from nebius.aio.operation import Operation as SDKOperation
+from nebius.aio.service_error import RequestError, StatusCode
+from nebius.aio.token.renewable import OPTION_RENEW_REQUEST_TIMEOUT, OPTION_RENEW_SYNCHRONOUS
+from nebius.api.nebius.common.v1 import Operation, ResourceMetadata
+from nebius.api.nebius.compute.v1 import (
+    AttachedDiskSpec,
+    CreateDiskRequest,
+    CreateInstanceRequest,
+    DeleteDiskRequest,
+    DeleteInstanceRequest,
+    DiskServiceClient,
+    DiskSpec,
+    ExistingDisk,
+    GetInstanceRequest,
+    Instance,
+    InstanceServiceClient,
+    InstanceSpec,
+    IPAddress,
+    NetworkInterfaceSpec,
+    PublicIPAddress,
+    ResourcesSpec,
+    SourceImageFamily,
+)
+from nebius.api.nebius.iam.v1 import (
+    Container,
+    ListProjectsRequest,
+    ListTenantsRequest,
+    ProjectServiceClient,
+    TenantServiceClient,
+)
+from nebius.api.nebius.vpc.v1 import ListSubnetsRequest, Subnet, SubnetServiceClient
+from nebius.sdk import SDK
+
+from dstack._internal.core.backends.nebius.models import NebiusServiceAccountCreds
+from dstack._internal.core.errors import BackendError, NoCapacityError
+from dstack._internal.utils.event_loop import DaemonEventLoop
+from dstack._internal.utils.logging import get_logger
+
+#
+# Guidelines on using the Nebius SDK:
+#
+# Do not use Request.wait() or other sync SDK methods, they suffer from deadlocks.
+# Instead, use async methods and await them with LOOP.await_()
+LOOP = DaemonEventLoop()
+# Pass a timeout to all methods to avoid infinite waiting
+REQUEST_TIMEOUT = 10
+# Pass REQUEST_MD to all methods to avoid infinite retries in case of invalid credentials
+REQUEST_MD = options_to_metadata(
+    {
+        OPTION_RENEW_SYNCHRONOUS: "true",
+        OPTION_RENEW_REQUEST_TIMEOUT: "5",
+    }
+)
+
+# disables log messages about errors such as invalid creds or expired timeouts
+logging.getLogger("nebius").setLevel(logging.CRITICAL)
+logger = get_logger(__name__)
+
+
+@contextmanager
+def wrap_capacity_errors() -> Generator[None, None, None]:
+    try:
+        yield
+    except RequestError as e:
+        if e.status.code == StatusCode.RESOURCE_EXHAUSTED or "Quota limit exceeded" in str(e):
+            raise NoCapacityError(e)
+        raise
+
+
+@contextmanager
+def ignore_errors(status_codes: ContainerT[StatusCode]) -> Generator[None, None, None]:
+    try:
+        yield
+    except RequestError as e:
+        if e.status.code not in status_codes:
+            raise
+
+
+def make_sdk(creds: NebiusServiceAccountCreds) -> SDK:
+    with NamedTemporaryFile("w") as f:
+        f.write(creds.private_key_content)
+        f.flush()
+        return SDK(
+            service_account_private_key_file_name=f.name,
+            service_account_public_key_id=creds.public_key_id,
+            service_account_id=creds.service_account_id,
+        )
+
+
+def wait_for_operation(
+    op: SDKOperation[Operation],
+    timeout: float,
+    interval: float = 1,
+) -> None:
+    # Re-implementation of SDKOperation.wait() to avoid https://github.com/nebius/pysdk/issues/74
+    deadline = time.monotonic() + timeout
+    while not op.done():
+        if time.monotonic() + interval > deadline:
+            raise TimeoutError(f"Operation {op.id} wait timeout")
+        time.sleep(interval)
+        LOOP.await_(op.update(timeout=REQUEST_TIMEOUT, metadata=REQUEST_MD))
+
+
+def get_region_to_project_id_map(sdk: SDK) -> dict[str, str]:
+    tenants = LOOP.await_(
+        TenantServiceClient(sdk).list(
+            ListTenantsRequest(), timeout=REQUEST_TIMEOUT, metadata=REQUEST_MD
+        )
+    )
+    if len(tenants.items) != 1:
+        raise ValueError(f"Expected to find 1 tenant, found {len(tenants.items)}")
+    tenant_id = tenants.items[0].metadata.id
+    projects = LOOP.await_(
+        ProjectServiceClient(sdk).list(
+            ListProjectsRequest(parent_id=tenant_id, page_size=999),
+            timeout=REQUEST_TIMEOUT,
+            metadata=REQUEST_MD,
+        )
+    )
+    region_to_projects: defaultdict[str, list[Container]] = defaultdict(list)
+    for project in projects.items:
+        region_to_projects[project.status.region].append(project)
+    region_to_project_id = {}
+    for region, region_projects in region_to_projects.items():
+        if len(region_projects) != 1:
+            # Currently, there can only be one project per region.
+            # This condition is implemented just in case Nebius suddenly allows more projects.
+            region_projects = [
+                p for p in region_projects if p.metadata.name.startswith("default-project")
+            ]
+            if len(region_projects) != 1:
+                logger.warning(
+                    "Could not find the default project in region %s, tenant %s", region, tenant_id
+                )
+                continue
+        region_to_project_id[region] = region_projects[0].metadata.id
+    return region_to_project_id
+
+
+def get_default_subnet(sdk: SDK, project_id: str) -> Subnet:
+    subnets = LOOP.await_(
+        SubnetServiceClient(sdk).list(
+            ListSubnetsRequest(parent_id=project_id, page_size=999),
+            timeout=REQUEST_TIMEOUT,
+            metadata=REQUEST_MD,
+        )
+    )
+    for subnet in subnets.items:
+        if subnet.metadata.name.startswith("default-subnet"):
+            return subnet
+    raise BackendError(f"Could not find default subnet in project {project_id}")
+
+
+def create_disk(
+    sdk: SDK, name: str, project_id: str, size_mib: int, image_family: str
+) -> SDKOperation[Operation]:
+    client = DiskServiceClient(sdk)
+    request = CreateDiskRequest(
+        metadata=ResourceMetadata(
+            name=name,
+            parent_id=project_id,
+        ),
+        spec=DiskSpec(
+            size_mebibytes=size_mib,
+            type=DiskSpec.DiskType.NETWORK_SSD,
+            source_image_family=SourceImageFamily(image_family=image_family),
+        ),
+    )
+    with wrap_capacity_errors():
+        return LOOP.await_(client.create(request, timeout=REQUEST_TIMEOUT, metadata=REQUEST_MD))
+
+
+def delete_disk(sdk: SDK, disk_id: str) -> None:
+    LOOP.await_(
+        DiskServiceClient(sdk).delete(
+            DeleteDiskRequest(id=disk_id), timeout=REQUEST_TIMEOUT, metadata=REQUEST_MD
+        )
+    )
+
+
+def create_instance(
+    sdk: SDK,
+    name: str,
+    project_id: str,
+    user_data: str,
+    platform: str,
+    preset: str,
+    disk_id: str,
+    subnet_id: str,
+) -> SDKOperation[Operation]:
+    client = InstanceServiceClient(sdk)
+    request = CreateInstanceRequest(
+        metadata=ResourceMetadata(
+            name=name,
+            parent_id=project_id,
+        ),
+        spec=InstanceSpec(
+            cloud_init_user_data=user_data,
+            resources=ResourcesSpec(platform=platform, preset=preset),
+            boot_disk=AttachedDiskSpec(
+                attach_mode=AttachedDiskSpec.AttachMode.READ_WRITE,
+                existing_disk=ExistingDisk(id=disk_id),
+            ),
+            network_interfaces=[
+                NetworkInterfaceSpec(
+                    name="dstack-default-interface",
+                    subnet_id=subnet_id,
+                    ip_address=IPAddress(),
+                    public_ip_address=PublicIPAddress(static=True),
+                )
+            ],
+        ),
+    )
+    with wrap_capacity_errors():
+        return LOOP.await_(client.create(request, timeout=REQUEST_TIMEOUT, metadata=REQUEST_MD))
+
+
+def get_instance(sdk: SDK, instance_id: str) -> Instance:
+    return LOOP.await_(
+        InstanceServiceClient(sdk).get(
+            GetInstanceRequest(id=instance_id), timeout=REQUEST_TIMEOUT, metadata=REQUEST_MD
+        )
+    )
+
+
+def delete_instance(sdk: SDK, instance_id: str) -> SDKOperation[Operation]:
+    return LOOP.await_(
+        InstanceServiceClient(sdk).delete(
+            DeleteInstanceRequest(id=instance_id), timeout=REQUEST_TIMEOUT, metadata=REQUEST_MD
+        )
+    )
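A minimal sketch of the usage pattern the guidelines at the top of this module describe: construct the SDK once via `make_sdk()`, then drive every async SDK call to completion on the daemon event loop with `LOOP.await_()`, always passing `REQUEST_TIMEOUT` and `REQUEST_MD`. The credential values are placeholders, and the calls only succeed against a real Nebius tenancy:

```python
from nebius.api.nebius.iam.v1 import ListTenantsRequest, TenantServiceClient

from dstack._internal.core.backends.nebius import resources
from dstack._internal.core.backends.nebius.models import NebiusServiceAccountCreds

# Placeholder credentials; real values come from the backend config.
creds = NebiusServiceAccountCreds(
    service_account_id="serviceaccount-e00...",
    public_key_id="publickey-e00...",
    private_key_content="-----BEGIN PRIVATE KEY-----\n...",
)
sdk = resources.make_sdk(creds)

# Same pattern as the helpers above: build the async request, then run it to
# completion on the daemon event loop with a timeout and REQUEST_MD.
tenants = resources.LOOP.await_(
    TenantServiceClient(sdk).list(
        ListTenantsRequest(), timeout=resources.REQUEST_TIMEOUT, metadata=resources.REQUEST_MD
    )
)
print([t.metadata.id for t in tenants.items])

# The higher-level helpers in this module wrap the same calls.
print(resources.get_region_to_project_id_map(sdk))
```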