dstack 0.19.0rc1__py3-none-any.whl → 0.19.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37) hide show
  1. dstack/_internal/cli/commands/metrics.py +138 -0
  2. dstack/_internal/cli/commands/stats.py +5 -119
  3. dstack/_internal/cli/main.py +2 -0
  4. dstack/_internal/core/backends/base/compute.py +3 -0
  5. dstack/_internal/core/backends/base/models.py +7 -7
  6. dstack/_internal/core/backends/configurators.py +9 -0
  7. dstack/_internal/core/backends/models.py +8 -0
  8. dstack/_internal/core/backends/nebius/__init__.py +0 -0
  9. dstack/_internal/core/backends/nebius/backend.py +16 -0
  10. dstack/_internal/core/backends/nebius/compute.py +270 -0
  11. dstack/_internal/core/backends/nebius/configurator.py +74 -0
  12. dstack/_internal/core/backends/nebius/models.py +108 -0
  13. dstack/_internal/core/backends/nebius/resources.py +222 -0
  14. dstack/_internal/core/errors.py +14 -0
  15. dstack/_internal/core/models/backends/base.py +2 -0
  16. dstack/_internal/proxy/lib/schemas/model_proxy.py +3 -3
  17. dstack/_internal/server/background/tasks/process_instances.py +26 -12
  18. dstack/_internal/server/routers/prometheus.py +5 -12
  19. dstack/_internal/server/security/permissions.py +19 -1
  20. dstack/_internal/server/services/jobs/configurators/extensions/cursor.py +1 -1
  21. dstack/_internal/server/services/prometheus.py +175 -112
  22. dstack/_internal/server/statics/index.html +1 -1
  23. dstack/_internal/server/statics/{main-4fd5a4770eff59325ee3.js → main-bcb3228138bc8483cc0b.js} +7278 -131
  24. dstack/_internal/server/statics/{main-4fd5a4770eff59325ee3.js.map → main-bcb3228138bc8483cc0b.js.map} +1 -1
  25. dstack/_internal/server/statics/{main-da9f8c06a69c20dac23e.css → main-c0bdaac8f1ea67d499eb.css} +1 -1
  26. dstack/_internal/utils/event_loop.py +30 -0
  27. dstack/version.py +1 -1
  28. {dstack-0.19.0rc1.dist-info → dstack-0.19.2.dist-info}/METADATA +27 -11
  29. {dstack-0.19.0rc1.dist-info → dstack-0.19.2.dist-info}/RECORD +37 -28
  30. tests/_internal/server/background/tasks/test_process_instances.py +68 -2
  31. tests/_internal/server/routers/test_backends.py +116 -0
  32. tests/_internal/server/routers/test_prometheus.py +158 -120
  33. tests/_internal/utils/test_event_loop.py +18 -0
  34. {dstack-0.19.0rc1.dist-info → dstack-0.19.2.dist-info}/LICENSE.md +0 -0
  35. {dstack-0.19.0rc1.dist-info → dstack-0.19.2.dist-info}/WHEEL +0 -0
  36. {dstack-0.19.0rc1.dist-info → dstack-0.19.2.dist-info}/entry_points.txt +0 -0
  37. {dstack-0.19.0rc1.dist-info → dstack-0.19.2.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,74 @@
1
+ import json
2
+
3
+ from nebius.aio.service_error import RequestError
4
+
5
+ from dstack._internal.core.backends.base.configurator import (
6
+ BackendRecord,
7
+ Configurator,
8
+ raise_invalid_credentials_error,
9
+ )
10
+ from dstack._internal.core.backends.nebius import resources
11
+ from dstack._internal.core.backends.nebius.backend import NebiusBackend
12
+ from dstack._internal.core.backends.nebius.models import (
13
+ AnyNebiusBackendConfig,
14
+ NebiusBackendConfig,
15
+ NebiusBackendConfigWithCreds,
16
+ NebiusConfig,
17
+ NebiusCreds,
18
+ NebiusServiceAccountCreds,
19
+ NebiusStoredConfig,
20
+ )
21
+ from dstack._internal.core.models.backends.base import BackendType
22
+
23
+
24
class NebiusConfigurator(Configurator):
    """
    Configurator for the Nebius AI Cloud backend: validates user-provided
    credentials and regions, serializes configs for DB storage, and builds
    `NebiusBackend` instances from stored records.
    """

    TYPE = BackendType.NEBIUS
    BACKEND_CLASS = NebiusBackend

    def validate_config(self, config: NebiusBackendConfigWithCreds, default_creds_enabled: bool):
        """
        Validate credentials by making authenticated API calls and check that
        every configured region exists in the tenancy. Raises via
        `raise_invalid_credentials_error` on failure.
        """
        # Only service-account credentials are supported for Nebius.
        assert isinstance(config.creds, NebiusServiceAccountCreds)
        try:
            sdk = resources.make_sdk(config.creds)
            # Listing the region->project mapping doubles as a creds check:
            # it performs authenticated API requests.
            available_regions = set(resources.get_region_to_project_id_map(sdk))
        except (ValueError, RequestError) as e:
            raise_invalid_credentials_error(
                fields=[["creds"]],
                details=str(e),
            )
        # Reject regions that the tenancy does not actually have.
        if invalid_regions := set(config.regions or []) - available_regions:
            raise_invalid_credentials_error(
                fields=[["regions"]],
                details=(
                    f"Configured regions {invalid_regions} do not exist in this Nebius tenancy."
                    " Omit `regions` to use all regions or select some of the available regions:"
                    f" {available_regions}"
                ),
            )

    def create_backend(
        self, project_name: str, config: NebiusBackendConfigWithCreds
    ) -> BackendRecord:
        """Serialize the validated config into a DB record; creds are stored separately."""
        return BackendRecord(
            config=NebiusStoredConfig(
                **NebiusBackendConfig.__response__.parse_obj(config).dict()
            ).json(),
            auth=NebiusCreds.parse_obj(config.creds).json(),
        )

    def get_backend_config(
        self, record: BackendRecord, include_creds: bool
    ) -> AnyNebiusBackendConfig:
        """Deserialize a DB record into an API config model, with or without creds."""
        config = self._get_config(record)
        if include_creds:
            return NebiusBackendConfigWithCreds.__response__.parse_obj(config)
        return NebiusBackendConfig.__response__.parse_obj(config)

    def get_backend(self, record: BackendRecord) -> NebiusBackend:
        """Instantiate the backend from a stored record."""
        config = self._get_config(record)
        return NebiusBackend(config=config)

    def _get_config(self, record: BackendRecord) -> NebiusConfig:
        # Config parameters (record.config) and creds (record.auth) live in
        # separate DB fields; merge them back into one runtime config model.
        return NebiusConfig.__response__(
            **json.loads(record.config),
            creds=NebiusCreds.parse_raw(record.auth),
        )
@@ -0,0 +1,108 @@
1
+ from typing import Annotated, Literal, Optional, Union
2
+
3
+ from pydantic import Field, root_validator
4
+
5
+ from dstack._internal.core.backends.base.models import fill_data
6
+ from dstack._internal.core.models.common import CoreModel
7
+
8
+
9
class NebiusServiceAccountCreds(CoreModel):
    """
    Nebius service account credentials with the private key content resolved.
    This is the form used by the configurator and compute at runtime.
    """

    # Discriminator for the credentials union.
    type: Annotated[Literal["service_account"], Field(description="The type of credentials")] = (
        "service_account"
    )
    service_account_id: Annotated[str, Field(description="Service account ID")]
    public_key_id: Annotated[str, Field(description="ID of the service account public key")]
    # Kept for round-tripping configs that referenced a key file; the content
    # below is what is actually used.
    private_key_file: Annotated[
        Optional[str], Field(description=("Path to the service account private key"))
    ] = None
    # Required here (unlike in `NebiusServiceAccountFileCreds`).
    private_key_content: Annotated[
        str,
        Field(
            description=(
                "Content of the service account private key. When configuring via"
                " `server/config.yml`, it's automatically filled from `private_key_file`."
                " When configuring via UI, it has to be specified explicitly."
            )
        ),
    ]
28
+
29
+
30
class NebiusServiceAccountFileCreds(CoreModel):
    """
    Same as `NebiusServiceAccountCreds`, but the private key content is
    optional and may be loaded from `private_key_file` by the validator
    (used when configuring via `server/config.yml`).
    """

    type: Annotated[Literal["service_account"], Field(description="The type of credentials")] = (
        "service_account"
    )
    service_account_id: Annotated[str, Field(description="Service account ID")]
    public_key_id: Annotated[str, Field(description="ID of the service account public key")]
    private_key_file: Annotated[
        Optional[str], Field(description=("Path to the service account private key"))
    ] = None
    private_key_content: Annotated[
        Optional[str],
        Field(
            description=(
                "Content of the service account private key. When configuring via"
                " `server/config.yml`, it's automatically filled from `private_key_file`."
                " When configuring via UI, it has to be specified explicitly."
            )
        ),
    ] = None

    @root_validator
    def fill_data(cls, values):
        # Populate `private_key_content` from `private_key_file` when only
        # the file path is given.
        return fill_data(
            values, filename_field="private_key_file", data_field="private_key_content"
        )
55
+
56
+
57
# Service-account creds are currently the only supported kind; these aliases
# let more credential types be added later (as a Union) without changing callers.
AnyNebiusCreds = NebiusServiceAccountCreds
NebiusCreds = AnyNebiusCreds
AnyNebiusFileCreds = NebiusServiceAccountFileCreds
60
+
61
+
62
class NebiusBackendConfig(CoreModel):
    """
    The backend config used in the API, server/config.yml, `NebiusConfigurator`.
    It also serves as a base class for other backend config models.
    Should not include creds.
    """

    # Discriminator for the backend config union.
    type: Annotated[
        Literal["nebius"],
        Field(description="The type of backend"),
    ] = "nebius"
    # None means "all regions available to the tenancy".
    regions: Annotated[
        Optional[list[str]],
        Field(description="The list of Nebius regions. Omit to use all regions"),
    ] = None
77
+
78
+
79
class NebiusBackendConfigWithCreds(NebiusBackendConfig):
    """
    Same as `NebiusBackendConfig` but also includes creds.
    """

    # Fully-resolved creds (private key content required).
    creds: Annotated[AnyNebiusCreds, Field(description="The credentials")]
85
+
86
+
87
class NebiusBackendFileConfigWithCreds(NebiusBackendConfig):
    """
    Same as `NebiusBackendConfigWithCreds` but accepts file-based creds, whose
    private key content may be filled from `private_key_file` by a validator.
    """

    creds: AnyNebiusFileCreds = Field(description="The credentials")
89
+
90
+
91
+ AnyNebiusBackendConfig = Union[NebiusBackendConfig, NebiusBackendConfigWithCreds]
92
+
93
+
94
class NebiusStoredConfig(NebiusBackendConfig):
    """
    The backend config used for config parameters in the DB.
    Can extend `NebiusBackendConfig` with additional parameters.
    """

    # No stored-only parameters yet.
    pass
101
+
102
+
103
class NebiusConfig(NebiusStoredConfig):
    """
    The backend config used by `NebiusBackend` and `NebiusCompute`.
    """

    # Creds are stored in a separate DB field and merged in by the configurator.
    creds: AnyNebiusCreds
@@ -0,0 +1,222 @@
1
+ import logging
2
+ import time
3
+ from collections.abc import Container as ContainerT
4
+ from collections.abc import Generator
5
+ from contextlib import contextmanager
6
+ from tempfile import NamedTemporaryFile
7
+
8
+ from nebius.aio.authorization.options import options_to_metadata
9
+ from nebius.aio.operation import Operation as SDKOperation
10
+ from nebius.aio.service_error import RequestError, StatusCode
11
+ from nebius.aio.token.renewable import OPTION_RENEW_REQUEST_TIMEOUT, OPTION_RENEW_SYNCHRONOUS
12
+ from nebius.api.nebius.common.v1 import Operation, ResourceMetadata
13
+ from nebius.api.nebius.compute.v1 import (
14
+ AttachedDiskSpec,
15
+ CreateDiskRequest,
16
+ CreateInstanceRequest,
17
+ DeleteDiskRequest,
18
+ DeleteInstanceRequest,
19
+ DiskServiceClient,
20
+ DiskSpec,
21
+ ExistingDisk,
22
+ GetInstanceRequest,
23
+ Instance,
24
+ InstanceServiceClient,
25
+ InstanceSpec,
26
+ IPAddress,
27
+ NetworkInterfaceSpec,
28
+ PublicIPAddress,
29
+ ResourcesSpec,
30
+ SourceImageFamily,
31
+ )
32
+ from nebius.api.nebius.iam.v1 import (
33
+ ListProjectsRequest,
34
+ ListTenantsRequest,
35
+ ProjectServiceClient,
36
+ TenantServiceClient,
37
+ )
38
+ from nebius.api.nebius.vpc.v1 import ListSubnetsRequest, Subnet, SubnetServiceClient
39
+ from nebius.sdk import SDK
40
+
41
+ from dstack._internal.core.backends.nebius.models import NebiusServiceAccountCreds
42
+ from dstack._internal.core.errors import BackendError, NoCapacityError
43
+ from dstack._internal.utils.event_loop import DaemonEventLoop
44
+
45
#
# Guidelines on using the Nebius SDK:
#
# Do not use Request.wait() or other sync SDK methods, they suffer from deadlocks.
# Instead, use async methods and await them with LOOP.await_()
LOOP = DaemonEventLoop()
# Pass a timeout (seconds) to all methods to avoid infinite waiting
REQUEST_TIMEOUT = 10
# Pass REQUEST_MD to all methods to avoid infinite retries in case of invalid credentials
REQUEST_MD = options_to_metadata(
    {
        # Renew the auth token synchronously with a short timeout so that
        # bad credentials fail fast instead of retrying in the background.
        OPTION_RENEW_SYNCHRONOUS: "true",
        OPTION_RENEW_REQUEST_TIMEOUT: "5",
    }
)

# disables log messages about errors such as invalid creds or expired timeouts
logging.getLogger("nebius").setLevel(logging.CRITICAL)
63
+
64
+
65
@contextmanager
def wrap_capacity_errors() -> Generator[None, None, None]:
    """Re-raise Nebius capacity/quota failures as `NoCapacityError`.

    Any other `RequestError` propagates unchanged.
    """
    try:
        yield
    except RequestError as e:
        out_of_capacity = (
            e.status.code == StatusCode.RESOURCE_EXHAUSTED or "Quota limit exceeded" in str(e)
        )
        if not out_of_capacity:
            raise
        raise NoCapacityError(e)
73
+
74
+
75
@contextmanager
def ignore_errors(status_codes: ContainerT[StatusCode]) -> Generator[None, None, None]:
    """Suppress a `RequestError` whose status code is in `status_codes`."""
    try:
        yield
    except RequestError as e:
        if e.status.code in status_codes:
            return
        raise
82
+
83
+
84
def make_sdk(creds: NebiusServiceAccountCreds) -> SDK:
    """Build an authenticated Nebius SDK from service-account creds.

    The SDK constructor takes the private key as a file, so the in-memory key
    content is written to a temporary file that is deleted when the `with`
    block exits.
    """
    # NOTE(review): assumes SDK() reads the key file during construction —
    # the file no longer exists after this function returns. TODO confirm
    # against the nebius SDK, since a lazy read would fail later.
    with NamedTemporaryFile("w") as f:
        f.write(creds.private_key_content)
        f.flush()
        return SDK(
            service_account_private_key_file_name=f.name,
            service_account_public_key_id=creds.public_key_id,
            service_account_id=creds.service_account_id,
        )
93
+
94
+
95
+ def wait_for_operation(
96
+ op: SDKOperation[Operation],
97
+ timeout: float,
98
+ interval: float = 1,
99
+ ) -> None:
100
+ # Re-implementation of SDKOperation.wait() to avoid https://github.com/nebius/pysdk/issues/74
101
+ deadline = time.monotonic() + timeout
102
+ while not op.done():
103
+ if time.monotonic() + interval > deadline:
104
+ raise TimeoutError(f"Operation {op.id} wait timeout")
105
+ time.sleep(interval)
106
+ LOOP.await_(op.update(timeout=REQUEST_TIMEOUT, metadata=REQUEST_MD))
107
+
108
+
109
def get_region_to_project_id_map(sdk: SDK) -> dict[str, str]:
    """Map each Nebius region to the ID of its default project.

    Raises:
        ValueError: if the service account does not see exactly one tenant.
    """
    tenant_list = LOOP.await_(
        TenantServiceClient(sdk).list(
            ListTenantsRequest(), timeout=REQUEST_TIMEOUT, metadata=REQUEST_MD
        )
    )
    tenant_count = len(tenant_list.items)
    if tenant_count != 1:
        raise ValueError(f"Expected to find 1 tenant, found {tenant_count}")
    tenant_id = tenant_list.items[0].metadata.id
    project_list = LOOP.await_(
        ProjectServiceClient(sdk).list(
            ListProjectsRequest(parent_id=tenant_id, page_size=999),
            timeout=REQUEST_TIMEOUT,
            metadata=REQUEST_MD,
        )
    )
    # Only projects named "default-project-<region>" are considered —
    # one per region.
    return {
        project.status.region: project.metadata.id
        for project in project_list.items
        if project.metadata.name == f"default-project-{project.status.region}"
    }
129
+
130
+
131
def get_default_subnet(sdk: SDK, project_id: str) -> Subnet:
    """Return the first subnet in `project_id` whose name starts with "default-subnet".

    Raises:
        BackendError: when no such subnet exists in the project.
    """
    listing = LOOP.await_(
        SubnetServiceClient(sdk).list(
            ListSubnetsRequest(parent_id=project_id, page_size=999),
            timeout=REQUEST_TIMEOUT,
            metadata=REQUEST_MD,
        )
    )
    default_subnet = next(
        (s for s in listing.items if s.metadata.name.startswith("default-subnet")),
        None,
    )
    if default_subnet is None:
        raise BackendError(f"Could not find default subnet in project {project_id}")
    return default_subnet
143
+
144
+
145
def create_disk(
    sdk: SDK, name: str, project_id: str, size_mib: int, image_family: str
) -> SDKOperation[Operation]:
    """Start creation of a network SSD disk from an image family.

    Args:
        sdk: authenticated Nebius SDK.
        name: resource name for the new disk.
        project_id: parent project the disk is created in.
        size_mib: disk size in mebibytes.
        image_family: source image family to initialize the disk from.

    Returns:
        The (possibly still running) create operation. Capacity/quota errors
        are raised as `NoCapacityError`.
    """
    client = DiskServiceClient(sdk)
    request = CreateDiskRequest(
        metadata=ResourceMetadata(
            name=name,
            parent_id=project_id,
        ),
        spec=DiskSpec(
            size_mebibytes=size_mib,
            type=DiskSpec.DiskType.NETWORK_SSD,
            source_image_family=SourceImageFamily(image_family=image_family),
        ),
    )
    with wrap_capacity_errors():
        return LOOP.await_(client.create(request, timeout=REQUEST_TIMEOUT, metadata=REQUEST_MD))
162
+
163
+
164
def delete_disk(sdk: SDK, disk_id: str) -> None:
    """Request deletion of the disk with the given ID."""
    client = DiskServiceClient(sdk)
    request = DeleteDiskRequest(id=disk_id)
    LOOP.await_(client.delete(request, timeout=REQUEST_TIMEOUT, metadata=REQUEST_MD))
170
+
171
+
172
def create_instance(
    sdk: SDK,
    name: str,
    project_id: str,
    user_data: str,
    platform: str,
    preset: str,
    disk_id: str,
    subnet_id: str,
) -> SDKOperation[Operation]:
    """Start creation of a compute instance booted from an existing disk.

    Args:
        sdk: authenticated Nebius SDK.
        name: resource name for the new instance.
        project_id: parent project the instance is created in.
        user_data: cloud-init user data for first boot.
        platform: platform identifier passed to `ResourcesSpec`.
        preset: resource preset passed to `ResourcesSpec`.
        disk_id: ID of an existing disk attached read-write as the boot disk.
        subnet_id: subnet for the instance's single network interface.

    Returns:
        The (possibly still running) create operation. Capacity/quota errors
        are raised as `NoCapacityError`.
    """
    client = InstanceServiceClient(sdk)
    request = CreateInstanceRequest(
        metadata=ResourceMetadata(
            name=name,
            parent_id=project_id,
        ),
        spec=InstanceSpec(
            cloud_init_user_data=user_data,
            resources=ResourcesSpec(platform=platform, preset=preset),
            boot_disk=AttachedDiskSpec(
                attach_mode=AttachedDiskSpec.AttachMode.READ_WRITE,
                existing_disk=ExistingDisk(id=disk_id),
            ),
            network_interfaces=[
                NetworkInterfaceSpec(
                    name="dstack-default-interface",
                    subnet_id=subnet_id,
                    ip_address=IPAddress(),
                    # static=True requests a static public IP — presumably so the
                    # address is stable for SSH access; confirm with Nebius docs.
                    public_ip_address=PublicIPAddress(static=True),
                )
            ],
        ),
    )
    with wrap_capacity_errors():
        return LOOP.await_(client.create(request, timeout=REQUEST_TIMEOUT, metadata=REQUEST_MD))
207
+
208
+
209
def get_instance(sdk: SDK, instance_id: str) -> Instance:
    """Fetch the current state of the compute instance with the given ID."""
    client = InstanceServiceClient(sdk)
    request = GetInstanceRequest(id=instance_id)
    return LOOP.await_(client.get(request, timeout=REQUEST_TIMEOUT, metadata=REQUEST_MD))
215
+
216
+
217
def delete_instance(sdk: SDK, instance_id: str) -> SDKOperation[Operation]:
    """Request deletion of the instance and return the delete operation."""
    client = InstanceServiceClient(sdk)
    request = DeleteInstanceRequest(id=instance_id)
    return LOOP.await_(client.delete(request, timeout=REQUEST_TIMEOUT, metadata=REQUEST_MD))
@@ -102,6 +102,20 @@ class PlacementGroupInUseError(ComputeError):
102
102
  pass
103
103
 
104
104
 
105
class NotYetTerminated(ComputeError):
    """
    Used by Compute.terminate_instance to signal that instance termination is not complete
    and the method should be called again after some time to continue termination.
    """

    def __init__(self, details: str) -> None:
        """
        Args:
            details: some details about the termination status
        """
        # `__init__` must not return a value; the original
        # `return super().__init__(details)` only worked because
        # `__init__` returns None.
        super().__init__(details)
117
+
118
+
105
119
  class CLIError(DstackError):
106
120
  pass
107
121
 
@@ -12,6 +12,7 @@ class BackendType(str, enum.Enum):
12
12
  DATACRUNCH (BackendType): DataCrunch
13
13
  KUBERNETES (BackendType): Kubernetes
14
14
  LAMBDA (BackendType): Lambda Cloud
15
+ NEBIUS (BackendType): Nebius AI Cloud
15
16
  OCI (BackendType): Oracle Cloud Infrastructure
16
17
  RUNPOD (BackendType): Runpod Cloud
17
18
  TENSORDOCK (BackendType): TensorDock Marketplace
@@ -29,6 +30,7 @@ class BackendType(str, enum.Enum):
29
30
  LAMBDA = "lambda"
30
31
  LOCAL = "local"
31
32
  REMOTE = "remote" # TODO: replace for LOCAL
33
+ NEBIUS = "nebius"
32
34
  OCI = "oci"
33
35
  RUNPOD = "runpod"
34
36
  TENSORDOCK = "tensordock"
@@ -57,11 +57,11 @@ class ChatCompletionsResponse(CoreModel):
57
57
 
58
58
 
59
59
  class ChatCompletionsChunk(CoreModel):
60
- id: str
60
+ id: Optional[str] = None
61
61
  choices: List[ChatCompletionsChunkChoice]
62
- created: int
62
+ created: Optional[int] = None
63
63
  model: str
64
- system_fingerprint: str = ""
64
+ system_fingerprint: Optional[str] = ""
65
65
  object: Literal["chat.completion.chunk"] = "chat.completion.chunk"
66
66
 
67
67
 
@@ -39,7 +39,7 @@ from dstack._internal.core.backends.remote.provisioning import (
39
39
  from dstack._internal.core.consts import DSTACK_SHIM_HTTP_PORT
40
40
 
41
41
  # FIXME: ProvisioningError is a subclass of ComputeError and should not be used outside of Compute
42
- from dstack._internal.core.errors import BackendError, ProvisioningError
42
+ from dstack._internal.core.errors import BackendError, NotYetTerminated, ProvisioningError
43
43
  from dstack._internal.core.models.backends.base import BackendType
44
44
  from dstack._internal.core.models.fleets import InstanceGroupPlacement
45
45
  from dstack._internal.core.models.instances import (
@@ -64,6 +64,7 @@ from dstack._internal.core.models.runs import (
64
64
  Retry,
65
65
  )
66
66
  from dstack._internal.core.services.profiles import get_retry
67
+ from dstack._internal.server import settings as server_settings
67
68
  from dstack._internal.server.background.tasks.common import get_provisioning_timeout
68
69
  from dstack._internal.server.db import get_session_ctx
69
70
  from dstack._internal.server.models import (
@@ -529,7 +530,9 @@ async def _create_instance(session: AsyncSession, instance: InstanceModel) -> No
529
530
  session=session, fleet_id=instance.fleet_id
530
531
  )
531
532
 
532
- for backend, instance_offer in offers:
533
+ # Limit number of offers tried to prevent long-running processing
534
+ # in case all offers fail.
535
+ for backend, instance_offer in offers[: server_settings.MAX_OFFERS_TRIED]:
533
536
  if instance_offer.backend not in BACKENDS_WITH_CREATE_INSTANCE_SUPPORT:
534
537
  continue
535
538
  compute = backend.compute()
@@ -578,8 +581,13 @@ async def _create_instance(session: AsyncSession, instance: InstanceModel) -> No
578
581
  extra={"instance_name": instance.name},
579
582
  )
580
583
  continue
581
- except NotImplementedError:
582
- # skip a backend without create_instance support, continue with next backend and offer
584
+ except Exception:
585
+ logger.exception(
586
+ "Got exception when launching %s in %s/%s",
587
+ instance_offer.instance.name,
588
+ instance_offer.backend.value,
589
+ instance_offer.region,
590
+ )
583
591
  continue
584
592
 
585
593
  instance.status = InstanceStatus.PROVISIONING
@@ -607,10 +615,11 @@ async def _create_instance(session: AsyncSession, instance: InstanceModel) -> No
607
615
 
608
616
  if not should_retry:
609
617
  instance.status = InstanceStatus.TERMINATED
610
- instance.termination_reason = "No offers found"
618
+ instance.termination_reason = "All offers failed" if offers else "No offers found"
611
619
  logger.info(
612
- "No offers found. Terminated instance %s",
620
+ "Terminated instance %s: %s",
613
621
  instance.name,
622
+ instance.termination_reason,
614
623
  extra={
615
624
  "instance_name": instance.name,
616
625
  "instance_status": InstanceStatus.TERMINATED.value,
@@ -837,12 +846,17 @@ async def _terminate(instance: InstanceModel) -> None:
837
846
  instance.first_termination_retry_at = get_current_datetime()
838
847
  instance.last_termination_retry_at = get_current_datetime()
839
848
  if _next_termination_retry_at(instance) < _get_termination_deadline(instance):
840
- logger.warning(
841
- "Failed to terminate instance %s. Will retry. Error: %r",
842
- instance.name,
843
- e,
844
- exc_info=not isinstance(e, BackendError),
845
- )
849
+ if isinstance(e, NotYetTerminated):
850
+ logger.debug(
851
+ "Instance %s termination in progress: %s", instance.name, e
852
+ )
853
+ else:
854
+ logger.warning(
855
+ "Failed to terminate instance %s. Will retry. Error: %r",
856
+ instance.name,
857
+ e,
858
+ exc_info=not isinstance(e, BackendError),
859
+ )
846
860
  return
847
861
  logger.error(
848
862
  "Failed all attempts to terminate instance %s."
@@ -1,3 +1,4 @@
1
+ import os
1
2
  from typing import Annotated
2
3
 
3
4
  from fastapi import APIRouter, Depends
@@ -6,14 +7,16 @@ from sqlalchemy.ext.asyncio import AsyncSession
6
7
 
7
8
  from dstack._internal.server import settings
8
9
  from dstack._internal.server.db import get_session
9
- from dstack._internal.server.deps import Project
10
- from dstack._internal.server.models import ProjectModel
10
+ from dstack._internal.server.security.permissions import OptionalServiceAccount
11
11
  from dstack._internal.server.services import prometheus
12
12
  from dstack._internal.server.utils.routers import error_not_found
13
13
 
14
+ _auth = OptionalServiceAccount(os.getenv("DSTACK_PROMETHEUS_AUTH_TOKEN"))
15
+
14
16
  router = APIRouter(
15
17
  tags=["prometheus"],
16
18
  default_response_class=PlainTextResponse,
19
+ dependencies=[Depends(_auth)],
17
20
  )
18
21
 
19
22
 
@@ -24,13 +27,3 @@ async def get_prometheus_metrics(
24
27
  if not settings.ENABLE_PROMETHEUS_METRICS:
25
28
  raise error_not_found()
26
29
  return await prometheus.get_metrics(session=session)
27
-
28
-
29
- @router.get("/metrics/project/{project_name}", deprecated=True)
30
- async def get_project_prometheus_metrics(
31
- session: Annotated[AsyncSession, Depends(get_session)],
32
- project: Annotated[ProjectModel, Depends(Project())],
33
- ) -> str:
34
- if not settings.ENABLE_PROMETHEUS_METRICS:
35
- raise error_not_found()
36
- return await prometheus.get_project_metrics(session=session, project=project)
@@ -1,4 +1,4 @@
1
- from typing import Tuple
1
+ from typing import Annotated, Optional, Tuple
2
2
 
3
3
  from fastapi import Depends, HTTPException, Security
4
4
  from fastapi.security import HTTPBearer
@@ -99,6 +99,24 @@ class ProjectMember:
99
99
  return await get_project_member(session, project_name, token.credentials)
100
100
 
101
101
 
102
class OptionalServiceAccount:
    """
    FastAPI dependency enforcing optional static bearer-token auth.

    Constructed with `token=None`, authentication is disabled and every
    request is accepted. Otherwise requests must carry a matching bearer
    token in the `Authorization` header.
    """

    def __init__(self, token: Optional[str]) -> None:
        # The expected token, or None to disable auth entirely.
        self._token = token

    async def __call__(
        self,
        token: Annotated[
            Optional[HTTPAuthorizationCredentials], Security(HTTPBearer(auto_error=False))
        ],
    ) -> None:
        # Auth disabled — accept any request, with or without a token.
        if self._token is None:
            return
        # Token required but the request sent none.
        if token is None:
            raise error_forbidden()
        # Token present but does not match the configured value.
        if token.credentials != self._token:
            raise error_invalid_token()
119
+
102
120
  async def get_project_member(
103
121
  session: AsyncSession, project_name: str, token: str
104
122
  ) -> Tuple[UserModel, ProjectModel]:
@@ -35,7 +35,7 @@ class CursorDesktop:
35
35
 
36
36
  def get_print_readme_commands(self) -> List[str]:
37
37
  return [
38
- "echo To open in VS Code Desktop, use link below:",
38
+ "echo To open in Cursor, use link below:",
39
39
  "echo ''",
40
40
  f"echo ' cursor://vscode-remote/ssh-remote+{self.run_name}/workflow'", # TODO use $REPO_DIR
41
41
  "echo ''",