dstack 0.19.0rc1__py3-none-any.whl → 0.19.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dstack/_internal/cli/commands/metrics.py +138 -0
- dstack/_internal/cli/commands/stats.py +5 -119
- dstack/_internal/cli/main.py +2 -0
- dstack/_internal/core/backends/base/compute.py +3 -0
- dstack/_internal/core/backends/base/models.py +7 -7
- dstack/_internal/core/backends/configurators.py +9 -0
- dstack/_internal/core/backends/models.py +8 -0
- dstack/_internal/core/backends/nebius/__init__.py +0 -0
- dstack/_internal/core/backends/nebius/backend.py +16 -0
- dstack/_internal/core/backends/nebius/compute.py +270 -0
- dstack/_internal/core/backends/nebius/configurator.py +74 -0
- dstack/_internal/core/backends/nebius/models.py +108 -0
- dstack/_internal/core/backends/nebius/resources.py +222 -0
- dstack/_internal/core/errors.py +14 -0
- dstack/_internal/core/models/backends/base.py +2 -0
- dstack/_internal/proxy/lib/schemas/model_proxy.py +3 -3
- dstack/_internal/server/background/tasks/process_instances.py +26 -12
- dstack/_internal/server/routers/prometheus.py +5 -12
- dstack/_internal/server/security/permissions.py +19 -1
- dstack/_internal/server/services/jobs/configurators/extensions/cursor.py +1 -1
- dstack/_internal/server/services/prometheus.py +175 -112
- dstack/_internal/server/statics/index.html +1 -1
- dstack/_internal/server/statics/{main-4fd5a4770eff59325ee3.js → main-bcb3228138bc8483cc0b.js} +7278 -131
- dstack/_internal/server/statics/{main-4fd5a4770eff59325ee3.js.map → main-bcb3228138bc8483cc0b.js.map} +1 -1
- dstack/_internal/server/statics/{main-da9f8c06a69c20dac23e.css → main-c0bdaac8f1ea67d499eb.css} +1 -1
- dstack/_internal/utils/event_loop.py +30 -0
- dstack/version.py +1 -1
- {dstack-0.19.0rc1.dist-info → dstack-0.19.2.dist-info}/METADATA +27 -11
- {dstack-0.19.0rc1.dist-info → dstack-0.19.2.dist-info}/RECORD +37 -28
- tests/_internal/server/background/tasks/test_process_instances.py +68 -2
- tests/_internal/server/routers/test_backends.py +116 -0
- tests/_internal/server/routers/test_prometheus.py +158 -120
- tests/_internal/utils/test_event_loop.py +18 -0
- {dstack-0.19.0rc1.dist-info → dstack-0.19.2.dist-info}/LICENSE.md +0 -0
- {dstack-0.19.0rc1.dist-info → dstack-0.19.2.dist-info}/WHEEL +0 -0
- {dstack-0.19.0rc1.dist-info → dstack-0.19.2.dist-info}/entry_points.txt +0 -0
- {dstack-0.19.0rc1.dist-info → dstack-0.19.2.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
import json
|
|
2
|
+
|
|
3
|
+
from nebius.aio.service_error import RequestError
|
|
4
|
+
|
|
5
|
+
from dstack._internal.core.backends.base.configurator import (
|
|
6
|
+
BackendRecord,
|
|
7
|
+
Configurator,
|
|
8
|
+
raise_invalid_credentials_error,
|
|
9
|
+
)
|
|
10
|
+
from dstack._internal.core.backends.nebius import resources
|
|
11
|
+
from dstack._internal.core.backends.nebius.backend import NebiusBackend
|
|
12
|
+
from dstack._internal.core.backends.nebius.models import (
|
|
13
|
+
AnyNebiusBackendConfig,
|
|
14
|
+
NebiusBackendConfig,
|
|
15
|
+
NebiusBackendConfigWithCreds,
|
|
16
|
+
NebiusConfig,
|
|
17
|
+
NebiusCreds,
|
|
18
|
+
NebiusServiceAccountCreds,
|
|
19
|
+
NebiusStoredConfig,
|
|
20
|
+
)
|
|
21
|
+
from dstack._internal.core.models.backends.base import BackendType
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class NebiusConfigurator(Configurator):
    """Configurator for the Nebius backend.

    Validates user-supplied configs against the Nebius API and converts
    between API-facing configs and stored `BackendRecord` values.
    """

    TYPE = BackendType.NEBIUS
    BACKEND_CLASS = NebiusBackend

    def validate_config(self, config: NebiusBackendConfigWithCreds, default_creds_enabled: bool):
        """Check the credentials and the configured regions.

        Builds an SDK client from `config.creds` and lists the regions it can
        see. `ValueError` and `RequestError` raised while doing so are reported
        through `raise_invalid_credentials_error` for the `creds` field.
        Configured regions that are not among the listed ones are reported
        the same way for the `regions` field.

        `default_creds_enabled` is not used by this implementation.
        """
        assert isinstance(config.creds, NebiusServiceAccountCreds)
        try:
            sdk = resources.make_sdk(config.creds)
            available_regions = set(resources.get_region_to_project_id_map(sdk))
        except (ValueError, RequestError) as e:
            raise_invalid_credentials_error(fields=[["creds"]], details=str(e))
        requested_regions = set(config.regions or [])
        invalid_regions = requested_regions - available_regions
        if invalid_regions:
            message = (
                f"Configured regions {invalid_regions} do not exist in this Nebius tenancy."
                " Omit `regions` to use all regions or select some of the available regions:"
                f" {available_regions}"
            )
            raise_invalid_credentials_error(fields=[["regions"]], details=message)

    def create_backend(
        self, project_name: str, config: NebiusBackendConfigWithCreds
    ) -> BackendRecord:
        """Serialize `config` into a record: non-secret part in `config`, creds in `auth`."""
        stored_config = NebiusStoredConfig(
            **NebiusBackendConfig.__response__.parse_obj(config).dict()
        )
        stored_creds = NebiusCreds.parse_obj(config.creds)
        return BackendRecord(config=stored_config.json(), auth=stored_creds.json())

    def get_backend_config(
        self, record: BackendRecord, include_creds: bool
    ) -> AnyNebiusBackendConfig:
        """Restore the API-facing config from `record`, with creds only if requested."""
        config = self._get_config(record)
        model = NebiusBackendConfigWithCreds if include_creds else NebiusBackendConfig
        return model.__response__.parse_obj(config)

    def get_backend(self, record: BackendRecord) -> NebiusBackend:
        """Instantiate the backend from a stored record."""
        return NebiusBackend(config=self._get_config(record))

    def _get_config(self, record: BackendRecord) -> NebiusConfig:
        """Combine the JSON config and the stored creds of `record` into one model."""
        stored_fields = json.loads(record.config)
        creds = NebiusCreds.parse_raw(record.auth)
        return NebiusConfig.__response__(**stored_fields, creds=creds)
|
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
from typing import Annotated, Literal, Optional, Union
|
|
2
|
+
|
|
3
|
+
from pydantic import Field, root_validator
|
|
4
|
+
|
|
5
|
+
from dstack._internal.core.backends.base.models import fill_data
|
|
6
|
+
from dstack._internal.core.models.common import CoreModel
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class NebiusServiceAccountCreds(CoreModel):
    # Service account credentials in which the private key content is mandatory.
    # Documented with `#` comments only, so that no class docstring is introduced.
    type: Annotated[
        Literal["service_account"],
        Field(description="The type of credentials"),
    ] = "service_account"
    service_account_id: Annotated[str, Field(description="Service account ID")]
    public_key_id: Annotated[str, Field(description="ID of the service account public key")]
    # The path is optional in this model; only `private_key_content` is required.
    private_key_file: Annotated[
        Optional[str],
        Field(description="Path to the service account private key"),
    ] = None
    private_key_content: Annotated[
        str,
        Field(
            description=(
                "Content of the service account private key. When configuring via"
                " `server/config.yml`, it's automatically filled from `private_key_file`."
                " When configuring via UI, it has to be specified explicitly."
            )
        ),
    ]
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class NebiusServiceAccountFileCreds(CoreModel):
    # Variant of the service account credentials in which both the key path and
    # the key content are optional; a root validator passes the values through
    # the module-level `fill_data` helper with the file/content field names.
    type: Annotated[
        Literal["service_account"],
        Field(description="The type of credentials"),
    ] = "service_account"
    service_account_id: Annotated[str, Field(description="Service account ID")]
    public_key_id: Annotated[str, Field(description="ID of the service account public key")]
    private_key_file: Annotated[
        Optional[str],
        Field(description="Path to the service account private key"),
    ] = None
    private_key_content: Annotated[
        Optional[str],
        Field(
            description=(
                "Content of the service account private key. When configuring via"
                " `server/config.yml`, it's automatically filled from `private_key_file`."
                " When configuring via UI, it has to be specified explicitly."
            )
        ),
    ] = None

    @root_validator
    def fill_data(cls, values):
        # Inside the method body `fill_data` resolves to the imported module-level
        # helper, not to this validator.
        return fill_data(
            values,
            filename_field="private_key_file",
            data_field="private_key_content",
        )
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
# Only one credentials kind exists, so both aliases point at the same model.
NebiusCreds = AnyNebiusCreds = NebiusServiceAccountCreds
# File-based counterpart of the credentials model.
AnyNebiusFileCreds = NebiusServiceAccountFileCreds
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
class NebiusBackendConfig(CoreModel):
    """
    The backend config used in the API, server/config.yml, `NebiusConfigurator`.
    It also serves as a base class for other backend config models.
    Should not include creds.
    """

    # Discriminator value identifying this backend.
    type: Annotated[Literal["nebius"], Field(description="The type of backend")] = "nebius"
    # `None` means that no region restriction is configured.
    regions: Annotated[
        Optional[list[str]],
        Field(description="The list of Nebius regions. Omit to use all regions"),
    ] = None
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
class NebiusBackendConfigWithCreds(NebiusBackendConfig):
    """
    Same as `NebiusBackendConfig` but also includes creds.
    """

    # Required: no default value is provided.
    creds: Annotated[
        AnyNebiusCreds,
        Field(description="The credentials"),
    ]
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
class NebiusBackendFileConfigWithCreds(NebiusBackendConfig):
    # Backend config whose credentials use the file-based creds model.
    # Required field: no default value is provided.
    creds: Annotated[AnyNebiusFileCreds, Field(description="The credentials")]
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
# Either form of the API-facing backend config: without or with credentials.
AnyNebiusBackendConfig = Union[NebiusBackendConfig, NebiusBackendConfigWithCreds]
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
class NebiusStoredConfig(NebiusBackendConfig):
    """
    The backend config used for config parameters in the DB.
    Can extend `NebiusBackendConfig` with additional parameters.
    """
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
class NebiusConfig(NebiusStoredConfig):
    """
    The backend config used by `NebiusBackend` and `NebiusCompute`.
    """

    # Stored config fields plus the credentials needed to build an SDK client.
    creds: AnyNebiusCreds
|
|
@@ -0,0 +1,222 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import time
|
|
3
|
+
from collections.abc import Container as ContainerT
|
|
4
|
+
from collections.abc import Generator
|
|
5
|
+
from contextlib import contextmanager
|
|
6
|
+
from tempfile import NamedTemporaryFile
|
|
7
|
+
|
|
8
|
+
from nebius.aio.authorization.options import options_to_metadata
|
|
9
|
+
from nebius.aio.operation import Operation as SDKOperation
|
|
10
|
+
from nebius.aio.service_error import RequestError, StatusCode
|
|
11
|
+
from nebius.aio.token.renewable import OPTION_RENEW_REQUEST_TIMEOUT, OPTION_RENEW_SYNCHRONOUS
|
|
12
|
+
from nebius.api.nebius.common.v1 import Operation, ResourceMetadata
|
|
13
|
+
from nebius.api.nebius.compute.v1 import (
|
|
14
|
+
AttachedDiskSpec,
|
|
15
|
+
CreateDiskRequest,
|
|
16
|
+
CreateInstanceRequest,
|
|
17
|
+
DeleteDiskRequest,
|
|
18
|
+
DeleteInstanceRequest,
|
|
19
|
+
DiskServiceClient,
|
|
20
|
+
DiskSpec,
|
|
21
|
+
ExistingDisk,
|
|
22
|
+
GetInstanceRequest,
|
|
23
|
+
Instance,
|
|
24
|
+
InstanceServiceClient,
|
|
25
|
+
InstanceSpec,
|
|
26
|
+
IPAddress,
|
|
27
|
+
NetworkInterfaceSpec,
|
|
28
|
+
PublicIPAddress,
|
|
29
|
+
ResourcesSpec,
|
|
30
|
+
SourceImageFamily,
|
|
31
|
+
)
|
|
32
|
+
from nebius.api.nebius.iam.v1 import (
|
|
33
|
+
ListProjectsRequest,
|
|
34
|
+
ListTenantsRequest,
|
|
35
|
+
ProjectServiceClient,
|
|
36
|
+
TenantServiceClient,
|
|
37
|
+
)
|
|
38
|
+
from nebius.api.nebius.vpc.v1 import ListSubnetsRequest, Subnet, SubnetServiceClient
|
|
39
|
+
from nebius.sdk import SDK
|
|
40
|
+
|
|
41
|
+
from dstack._internal.core.backends.nebius.models import NebiusServiceAccountCreds
|
|
42
|
+
from dstack._internal.core.errors import BackendError, NoCapacityError
|
|
43
|
+
from dstack._internal.utils.event_loop import DaemonEventLoop
|
|
44
|
+
|
|
45
|
+
#
# Guidelines on using the Nebius SDK:
#
# 1. Do not use Request.wait() or other sync SDK methods, they suffer from deadlocks.
#    Instead, use async methods and await them with LOOP.await_()
LOOP = DaemonEventLoop()
# 2. Pass a timeout to all methods to avoid infinite waiting
REQUEST_TIMEOUT = 10
# 3. Pass REQUEST_MD to all methods to avoid infinite retries in case of invalid credentials
REQUEST_MD = options_to_metadata(
    {
        OPTION_RENEW_SYNCHRONOUS: "true",
        OPTION_RENEW_REQUEST_TIMEOUT: "5",
    }
)

# Raise the SDK logger threshold to CRITICAL: this disables its log messages
# about errors such as invalid creds or expired timeouts
logging.getLogger("nebius").setLevel(logging.CRITICAL)
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
@contextmanager
def wrap_capacity_errors() -> Generator[None, None, None]:
    """Translate Nebius capacity and quota failures into `NoCapacityError`.

    A `RequestError` raised inside the `with` body is converted when its status
    code is `RESOURCE_EXHAUSTED` or its text contains "Quota limit exceeded".
    Any other `RequestError` is re-raised unchanged.

    Raises:
        NoCapacityError: for capacity/quota failures, chained to the SDK error.
        RequestError: for every other request failure.
    """
    try:
        yield
    except RequestError as e:
        if e.status.code == StatusCode.RESOURCE_EXHAUSTED or "Quota limit exceeded" in str(e):
            # Chain explicitly so the SDK error is recorded as the direct cause.
            raise NoCapacityError(e) from e
        raise
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
@contextmanager
def ignore_errors(status_codes: ContainerT[StatusCode]) -> Generator[None, None, None]:
    """Suppress `RequestError`s whose status code is listed in `status_codes`.

    A `RequestError` with any other status code propagates unchanged.
    """
    try:
        yield
    except RequestError as e:
        if e.status.code in status_codes:
            # Listed status code: swallow the error.
            return
        raise
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def make_sdk(creds: NebiusServiceAccountCreds) -> SDK:
    """Create an SDK client authenticated as the given service account.

    The SDK is given a private key file name, so the key content is written
    to a temporary file first.
    """
    with NamedTemporaryFile("w") as key_file:
        key_file.write(creds.private_key_content)
        # Flush so the content is on disk before the file name is handed over.
        key_file.flush()
        # NOTE(review): the temporary file is removed when this block exits, so
        # the SDK presumably reads the key during construction - confirm.
        sdk = SDK(
            service_account_private_key_file_name=key_file.name,
            service_account_public_key_id=creds.public_key_id,
            service_account_id=creds.service_account_id,
        )
    return sdk
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def wait_for_operation(
    op: SDKOperation[Operation],
    timeout: float,
    interval: float = 1,
) -> None:
    """Block until `op` reports completion, polling every `interval` seconds.

    Raises:
        TimeoutError: if the operation is not done and sleeping for another
            `interval` would go past `timeout` seconds from the call.
    """
    # Re-implementation of SDKOperation.wait() to avoid https://github.com/nebius/pysdk/issues/74
    deadline = time.monotonic() + timeout
    while True:
        if op.done():
            return
        if time.monotonic() + interval > deadline:
            raise TimeoutError(f"Operation {op.id} wait timeout")
        time.sleep(interval)
        # Refresh the operation state before the next check.
        LOOP.await_(op.update(timeout=REQUEST_TIMEOUT, metadata=REQUEST_MD))
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def get_region_to_project_id_map(sdk: SDK) -> dict[str, str]:
    """Map each region to the ID of its `default-project-<region>` project.

    Raises:
        ValueError: if the credentials do not see exactly one tenant.
    """
    tenant_client = TenantServiceClient(sdk)
    tenants = LOOP.await_(
        tenant_client.list(ListTenantsRequest(), timeout=REQUEST_TIMEOUT, metadata=REQUEST_MD)
    )
    tenant_count = len(tenants.items)
    if tenant_count != 1:
        raise ValueError(f"Expected to find 1 tenant, found {tenant_count}")
    tenant_id = tenants.items[0].metadata.id
    project_client = ProjectServiceClient(sdk)
    projects = LOOP.await_(
        project_client.list(
            ListProjectsRequest(parent_id=tenant_id, page_size=999),
            timeout=REQUEST_TIMEOUT,
            metadata=REQUEST_MD,
        )
    )
    # Only projects named after their own region are included.
    return {
        project.status.region: project.metadata.id
        for project in projects.items
        if project.metadata.name == f"default-project-{project.status.region}"
    }
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
def get_default_subnet(sdk: SDK, project_id: str) -> Subnet:
    """Return the first subnet of the project whose name starts with "default-subnet".

    Raises:
        BackendError: if the project has no such subnet.
    """
    listing = LOOP.await_(
        SubnetServiceClient(sdk).list(
            ListSubnetsRequest(parent_id=project_id, page_size=999),
            timeout=REQUEST_TIMEOUT,
            metadata=REQUEST_MD,
        )
    )
    default_subnet = next(
        (item for item in listing.items if item.metadata.name.startswith("default-subnet")),
        None,
    )
    if default_subnet is None:
        raise BackendError(f"Could not find default subnet in project {project_id}")
    return default_subnet
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
def create_disk(
    sdk: SDK, name: str, project_id: str, size_mib: int, image_family: str
) -> SDKOperation[Operation]:
    """Request creation of a network SSD disk populated from an image family.

    Returns the operation handle from the create call. Capacity and quota
    failures are translated by `wrap_capacity_errors`.
    """
    client = DiskServiceClient(sdk)
    resource_metadata = ResourceMetadata(name=name, parent_id=project_id)
    disk_spec = DiskSpec(
        size_mebibytes=size_mib,
        type=DiskSpec.DiskType.NETWORK_SSD,
        source_image_family=SourceImageFamily(image_family=image_family),
    )
    request = CreateDiskRequest(metadata=resource_metadata, spec=disk_spec)
    with wrap_capacity_errors():
        return LOOP.await_(client.create(request, timeout=REQUEST_TIMEOUT, metadata=REQUEST_MD))
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
def delete_disk(sdk: SDK, disk_id: str) -> None:
    """Send a delete request for the disk; the result of the call is discarded."""
    request = DeleteDiskRequest(id=disk_id)
    client = DiskServiceClient(sdk)
    LOOP.await_(client.delete(request, timeout=REQUEST_TIMEOUT, metadata=REQUEST_MD))
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
def create_instance(
    sdk: SDK,
    name: str,
    project_id: str,
    user_data: str,
    platform: str,
    preset: str,
    disk_id: str,
    subnet_id: str,
) -> SDKOperation[Operation]:
    """Request creation of an instance that boots from an existing disk.

    The instance gets a single network interface in `subnet_id` with a static
    public IP address, and `user_data` as its cloud-init user data. Returns the
    operation handle from the create call. Capacity and quota failures are
    translated by `wrap_capacity_errors`.
    """
    client = InstanceServiceClient(sdk)
    resource_metadata = ResourceMetadata(name=name, parent_id=project_id)
    compute_resources = ResourcesSpec(platform=platform, preset=preset)
    boot_disk = AttachedDiskSpec(
        attach_mode=AttachedDiskSpec.AttachMode.READ_WRITE,
        existing_disk=ExistingDisk(id=disk_id),
    )
    default_interface = NetworkInterfaceSpec(
        name="dstack-default-interface",
        subnet_id=subnet_id,
        ip_address=IPAddress(),
        public_ip_address=PublicIPAddress(static=True),
    )
    request = CreateInstanceRequest(
        metadata=resource_metadata,
        spec=InstanceSpec(
            cloud_init_user_data=user_data,
            resources=compute_resources,
            boot_disk=boot_disk,
            network_interfaces=[default_interface],
        ),
    )
    with wrap_capacity_errors():
        return LOOP.await_(client.create(request, timeout=REQUEST_TIMEOUT, metadata=REQUEST_MD))
|
|
207
|
+
|
|
208
|
+
|
|
209
|
+
def get_instance(sdk: SDK, instance_id: str) -> Instance:
    """Fetch the instance with the given ID."""
    request = GetInstanceRequest(id=instance_id)
    client = InstanceServiceClient(sdk)
    return LOOP.await_(client.get(request, timeout=REQUEST_TIMEOUT, metadata=REQUEST_MD))
|
|
215
|
+
|
|
216
|
+
|
|
217
|
+
def delete_instance(sdk: SDK, instance_id: str) -> SDKOperation[Operation]:
    """Send a delete request for the instance and return the operation handle."""
    request = DeleteInstanceRequest(id=instance_id)
    client = InstanceServiceClient(sdk)
    return LOOP.await_(client.delete(request, timeout=REQUEST_TIMEOUT, metadata=REQUEST_MD))
|
dstack/_internal/core/errors.py
CHANGED
|
@@ -102,6 +102,20 @@ class PlacementGroupInUseError(ComputeError):
|
|
|
102
102
|
pass
|
|
103
103
|
|
|
104
104
|
|
|
105
|
+
class NotYetTerminated(ComputeError):
    """
    Raised by Compute.terminate_instance to report that termination has not finished yet;
    the method is expected to be called again later to continue the termination.
    """

    def __init__(self, details: str) -> None:
        """
        Args:
            details: a description of the current termination status
        """
        super().__init__(details)
|
|
117
|
+
|
|
118
|
+
|
|
105
119
|
class CLIError(DstackError):
|
|
106
120
|
pass
|
|
107
121
|
|
|
@@ -12,6 +12,7 @@ class BackendType(str, enum.Enum):
|
|
|
12
12
|
DATACRUNCH (BackendType): DataCrunch
|
|
13
13
|
KUBERNETES (BackendType): Kubernetes
|
|
14
14
|
LAMBDA (BackendType): Lambda Cloud
|
|
15
|
+
NEBIUS (BackendType): Nebius AI Cloud
|
|
15
16
|
OCI (BackendType): Oracle Cloud Infrastructure
|
|
16
17
|
RUNPOD (BackendType): Runpod Cloud
|
|
17
18
|
TENSORDOCK (BackendType): TensorDock Marketplace
|
|
@@ -29,6 +30,7 @@ class BackendType(str, enum.Enum):
|
|
|
29
30
|
LAMBDA = "lambda"
|
|
30
31
|
LOCAL = "local"
|
|
31
32
|
REMOTE = "remote" # TODO: replace for LOCAL
|
|
33
|
+
NEBIUS = "nebius"
|
|
32
34
|
OCI = "oci"
|
|
33
35
|
RUNPOD = "runpod"
|
|
34
36
|
TENSORDOCK = "tensordock"
|
|
@@ -57,11 +57,11 @@ class ChatCompletionsResponse(CoreModel):
|
|
|
57
57
|
|
|
58
58
|
|
|
59
59
|
class ChatCompletionsChunk(CoreModel):
|
|
60
|
-
id: str
|
|
60
|
+
id: Optional[str] = None
|
|
61
61
|
choices: List[ChatCompletionsChunkChoice]
|
|
62
|
-
created: int
|
|
62
|
+
created: Optional[int] = None
|
|
63
63
|
model: str
|
|
64
|
-
system_fingerprint: str = ""
|
|
64
|
+
system_fingerprint: Optional[str] = ""
|
|
65
65
|
object: Literal["chat.completion.chunk"] = "chat.completion.chunk"
|
|
66
66
|
|
|
67
67
|
|
|
@@ -39,7 +39,7 @@ from dstack._internal.core.backends.remote.provisioning import (
|
|
|
39
39
|
from dstack._internal.core.consts import DSTACK_SHIM_HTTP_PORT
|
|
40
40
|
|
|
41
41
|
# FIXME: ProvisioningError is a subclass of ComputeError and should not be used outside of Compute
|
|
42
|
-
from dstack._internal.core.errors import BackendError, ProvisioningError
|
|
42
|
+
from dstack._internal.core.errors import BackendError, NotYetTerminated, ProvisioningError
|
|
43
43
|
from dstack._internal.core.models.backends.base import BackendType
|
|
44
44
|
from dstack._internal.core.models.fleets import InstanceGroupPlacement
|
|
45
45
|
from dstack._internal.core.models.instances import (
|
|
@@ -64,6 +64,7 @@ from dstack._internal.core.models.runs import (
|
|
|
64
64
|
Retry,
|
|
65
65
|
)
|
|
66
66
|
from dstack._internal.core.services.profiles import get_retry
|
|
67
|
+
from dstack._internal.server import settings as server_settings
|
|
67
68
|
from dstack._internal.server.background.tasks.common import get_provisioning_timeout
|
|
68
69
|
from dstack._internal.server.db import get_session_ctx
|
|
69
70
|
from dstack._internal.server.models import (
|
|
@@ -529,7 +530,9 @@ async def _create_instance(session: AsyncSession, instance: InstanceModel) -> No
|
|
|
529
530
|
session=session, fleet_id=instance.fleet_id
|
|
530
531
|
)
|
|
531
532
|
|
|
532
|
-
|
|
533
|
+
# Limit number of offers tried to prevent long-running processing
|
|
534
|
+
# in case all offers fail.
|
|
535
|
+
for backend, instance_offer in offers[: server_settings.MAX_OFFERS_TRIED]:
|
|
533
536
|
if instance_offer.backend not in BACKENDS_WITH_CREATE_INSTANCE_SUPPORT:
|
|
534
537
|
continue
|
|
535
538
|
compute = backend.compute()
|
|
@@ -578,8 +581,13 @@ async def _create_instance(session: AsyncSession, instance: InstanceModel) -> No
|
|
|
578
581
|
extra={"instance_name": instance.name},
|
|
579
582
|
)
|
|
580
583
|
continue
|
|
581
|
-
except
|
|
582
|
-
|
|
584
|
+
except Exception:
|
|
585
|
+
logger.exception(
|
|
586
|
+
"Got exception when launching %s in %s/%s",
|
|
587
|
+
instance_offer.instance.name,
|
|
588
|
+
instance_offer.backend.value,
|
|
589
|
+
instance_offer.region,
|
|
590
|
+
)
|
|
583
591
|
continue
|
|
584
592
|
|
|
585
593
|
instance.status = InstanceStatus.PROVISIONING
|
|
@@ -607,10 +615,11 @@ async def _create_instance(session: AsyncSession, instance: InstanceModel) -> No
|
|
|
607
615
|
|
|
608
616
|
if not should_retry:
|
|
609
617
|
instance.status = InstanceStatus.TERMINATED
|
|
610
|
-
instance.termination_reason = "No offers found"
|
|
618
|
+
instance.termination_reason = "All offers failed" if offers else "No offers found"
|
|
611
619
|
logger.info(
|
|
612
|
-
"
|
|
620
|
+
"Terminated instance %s: %s",
|
|
613
621
|
instance.name,
|
|
622
|
+
instance.termination_reason,
|
|
614
623
|
extra={
|
|
615
624
|
"instance_name": instance.name,
|
|
616
625
|
"instance_status": InstanceStatus.TERMINATED.value,
|
|
@@ -837,12 +846,17 @@ async def _terminate(instance: InstanceModel) -> None:
|
|
|
837
846
|
instance.first_termination_retry_at = get_current_datetime()
|
|
838
847
|
instance.last_termination_retry_at = get_current_datetime()
|
|
839
848
|
if _next_termination_retry_at(instance) < _get_termination_deadline(instance):
|
|
840
|
-
|
|
841
|
-
|
|
842
|
-
|
|
843
|
-
|
|
844
|
-
|
|
845
|
-
|
|
849
|
+
if isinstance(e, NotYetTerminated):
|
|
850
|
+
logger.debug(
|
|
851
|
+
"Instance %s termination in progress: %s", instance.name, e
|
|
852
|
+
)
|
|
853
|
+
else:
|
|
854
|
+
logger.warning(
|
|
855
|
+
"Failed to terminate instance %s. Will retry. Error: %r",
|
|
856
|
+
instance.name,
|
|
857
|
+
e,
|
|
858
|
+
exc_info=not isinstance(e, BackendError),
|
|
859
|
+
)
|
|
846
860
|
return
|
|
847
861
|
logger.error(
|
|
848
862
|
"Failed all attempts to terminate instance %s."
|
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
import os
|
|
1
2
|
from typing import Annotated
|
|
2
3
|
|
|
3
4
|
from fastapi import APIRouter, Depends
|
|
@@ -6,14 +7,16 @@ from sqlalchemy.ext.asyncio import AsyncSession
|
|
|
6
7
|
|
|
7
8
|
from dstack._internal.server import settings
|
|
8
9
|
from dstack._internal.server.db import get_session
|
|
9
|
-
from dstack._internal.server.
|
|
10
|
-
from dstack._internal.server.models import ProjectModel
|
|
10
|
+
from dstack._internal.server.security.permissions import OptionalServiceAccount
|
|
11
11
|
from dstack._internal.server.services import prometheus
|
|
12
12
|
from dstack._internal.server.utils.routers import error_not_found
|
|
13
13
|
|
|
14
|
+
_auth = OptionalServiceAccount(os.getenv("DSTACK_PROMETHEUS_AUTH_TOKEN"))
|
|
15
|
+
|
|
14
16
|
router = APIRouter(
|
|
15
17
|
tags=["prometheus"],
|
|
16
18
|
default_response_class=PlainTextResponse,
|
|
19
|
+
dependencies=[Depends(_auth)],
|
|
17
20
|
)
|
|
18
21
|
|
|
19
22
|
|
|
@@ -24,13 +27,3 @@ async def get_prometheus_metrics(
|
|
|
24
27
|
if not settings.ENABLE_PROMETHEUS_METRICS:
|
|
25
28
|
raise error_not_found()
|
|
26
29
|
return await prometheus.get_metrics(session=session)
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
@router.get("/metrics/project/{project_name}", deprecated=True)
|
|
30
|
-
async def get_project_prometheus_metrics(
|
|
31
|
-
session: Annotated[AsyncSession, Depends(get_session)],
|
|
32
|
-
project: Annotated[ProjectModel, Depends(Project())],
|
|
33
|
-
) -> str:
|
|
34
|
-
if not settings.ENABLE_PROMETHEUS_METRICS:
|
|
35
|
-
raise error_not_found()
|
|
36
|
-
return await prometheus.get_project_metrics(session=session, project=project)
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
from typing import Tuple
|
|
1
|
+
from typing import Annotated, Optional, Tuple
|
|
2
2
|
|
|
3
3
|
from fastapi import Depends, HTTPException, Security
|
|
4
4
|
from fastapi.security import HTTPBearer
|
|
@@ -99,6 +99,24 @@ class ProjectMember:
|
|
|
99
99
|
return await get_project_member(session, project_name, token.credentials)
|
|
100
100
|
|
|
101
101
|
|
|
102
|
+
class OptionalServiceAccount:
    """Callable dependency that guards a route with an optional static bearer token.

    When constructed with `None`, every request is allowed. Otherwise the
    request must carry a bearer token equal to the configured one.
    """

    def __init__(self, token: Optional[str]) -> None:
        """
        Args:
            token: the expected bearer token, or `None` to disable the check
        """
        self._token = token

    async def __call__(
        self,
        token: Annotated[
            Optional[HTTPAuthorizationCredentials], Security(HTTPBearer(auto_error=False))
        ],
    ) -> None:
        """Validate the bearer token of the request.

        Raises:
            the error built by `error_forbidden()` when a token is required but
            the request has none; the error built by `error_invalid_token()`
            when the provided token does not match.
        """
        # Local import keeps this block self-contained.
        import hmac

        if self._token is None:
            return
        if token is None:
            raise error_forbidden()
        # Compare in constant time so response timing does not leak how much of
        # the secret matched. Compare bytes because `compare_digest` rejects
        # non-ASCII `str`; `surrogateescape` keeps encoding from failing on
        # undecodable bytes that may come from the environment.
        provided = token.credentials.encode("utf-8", "surrogateescape")
        expected = self._token.encode("utf-8", "surrogateescape")
        if not hmac.compare_digest(provided, expected):
            raise error_invalid_token()
|
|
118
|
+
|
|
119
|
+
|
|
102
120
|
async def get_project_member(
|
|
103
121
|
session: AsyncSession, project_name: str, token: str
|
|
104
122
|
) -> Tuple[UserModel, ProjectModel]:
|
|
@@ -35,7 +35,7 @@ class CursorDesktop:
|
|
|
35
35
|
|
|
36
36
|
def get_print_readme_commands(self) -> List[str]:
|
|
37
37
|
return [
|
|
38
|
-
"echo To open in
|
|
38
|
+
"echo To open in Cursor, use link below:",
|
|
39
39
|
"echo ''",
|
|
40
40
|
f"echo ' cursor://vscode-remote/ssh-remote+{self.run_name}/workflow'", # TODO use $REPO_DIR
|
|
41
41
|
"echo ''",
|