dstack 0.19.19__py3-none-any.whl → 0.19.21__py3-none-any.whl
This diff compares the contents of two publicly released versions of the dstack package, exactly as they appear in their public registry. It is provided for informational purposes only.
- dstack/_internal/core/backends/__init__.py +0 -65
- dstack/_internal/core/backends/cloudrift/api_client.py +13 -1
- dstack/_internal/core/backends/features.py +64 -0
- dstack/_internal/core/backends/oci/resources.py +5 -5
- dstack/_internal/core/compatibility/fleets.py +2 -0
- dstack/_internal/core/compatibility/runs.py +4 -0
- dstack/_internal/core/models/profiles.py +37 -0
- dstack/_internal/server/app.py +22 -10
- dstack/_internal/server/background/__init__.py +5 -6
- dstack/_internal/server/background/tasks/process_fleets.py +52 -38
- dstack/_internal/server/background/tasks/process_gateways.py +2 -2
- dstack/_internal/server/background/tasks/process_idle_volumes.py +5 -4
- dstack/_internal/server/background/tasks/process_instances.py +62 -48
- dstack/_internal/server/background/tasks/process_metrics.py +9 -2
- dstack/_internal/server/background/tasks/process_placement_groups.py +2 -0
- dstack/_internal/server/background/tasks/process_prometheus_metrics.py +14 -2
- dstack/_internal/server/background/tasks/process_running_jobs.py +129 -124
- dstack/_internal/server/background/tasks/process_runs.py +63 -20
- dstack/_internal/server/background/tasks/process_submitted_jobs.py +12 -10
- dstack/_internal/server/background/tasks/process_terminating_jobs.py +12 -4
- dstack/_internal/server/background/tasks/process_volumes.py +4 -1
- dstack/_internal/server/migrations/versions/50dd7ea98639_index_status_columns.py +55 -0
- dstack/_internal/server/migrations/versions/ec02a26a256c_add_runmodel_next_triggered_at.py +38 -0
- dstack/_internal/server/models.py +16 -16
- dstack/_internal/server/schemas/logs.py +1 -9
- dstack/_internal/server/services/fleets.py +19 -10
- dstack/_internal/server/services/gateways/__init__.py +17 -17
- dstack/_internal/server/services/instances.py +10 -14
- dstack/_internal/server/services/jobs/__init__.py +10 -12
- dstack/_internal/server/services/logs/aws.py +45 -3
- dstack/_internal/server/services/logs/filelog.py +121 -11
- dstack/_internal/server/services/offers.py +3 -3
- dstack/_internal/server/services/projects.py +35 -15
- dstack/_internal/server/services/prometheus/client_metrics.py +3 -0
- dstack/_internal/server/services/prometheus/custom_metrics.py +22 -3
- dstack/_internal/server/services/runs.py +74 -34
- dstack/_internal/server/services/services/__init__.py +4 -1
- dstack/_internal/server/services/users.py +2 -3
- dstack/_internal/server/services/volumes.py +11 -11
- dstack/_internal/server/settings.py +3 -0
- dstack/_internal/server/statics/index.html +1 -1
- dstack/_internal/server/statics/{main-64f8273740c4b52c18f5.js → main-39a767528976f8078166.js} +7 -26
- dstack/_internal/server/statics/{main-64f8273740c4b52c18f5.js.map → main-39a767528976f8078166.js.map} +1 -1
- dstack/_internal/server/statics/{main-d58fc0460cb0eae7cb5c.css → main-8f9ee218d3eb45989682.css} +2 -2
- dstack/_internal/server/testing/common.py +7 -0
- dstack/_internal/server/utils/sentry_utils.py +12 -0
- dstack/_internal/utils/common.py +10 -21
- dstack/_internal/utils/cron.py +5 -0
- dstack/version.py +1 -1
- {dstack-0.19.19.dist-info → dstack-0.19.21.dist-info}/METADATA +2 -11
- {dstack-0.19.19.dist-info → dstack-0.19.21.dist-info}/RECORD +54 -49
- {dstack-0.19.19.dist-info → dstack-0.19.21.dist-info}/WHEEL +0 -0
- {dstack-0.19.19.dist-info → dstack-0.19.21.dist-info}/entry_points.txt +0 -0
- {dstack-0.19.19.dist-info → dstack-0.19.21.dist-info}/licenses/LICENSE.md +0 -0
dstack/_internal/core/backends/__init__.py CHANGED
@@ -1,65 +0,0 @@
-from dstack._internal.core.backends.base.compute import (
-    ComputeWithCreateInstanceSupport,
-    ComputeWithGatewaySupport,
-    ComputeWithMultinodeSupport,
-    ComputeWithPlacementGroupSupport,
-    ComputeWithPrivateGatewaySupport,
-    ComputeWithReservationSupport,
-    ComputeWithVolumeSupport,
-)
-from dstack._internal.core.backends.base.configurator import Configurator
-from dstack._internal.core.backends.configurators import list_available_configurator_classes
-from dstack._internal.core.backends.local.compute import LocalCompute
-from dstack._internal.core.models.backends.base import BackendType
-from dstack._internal.settings import LOCAL_BACKEND_ENABLED
-
-
-def _get_backends_with_compute_feature(
-    configurator_classes: list[type[Configurator]],
-    compute_feature_class: type,
-) -> list[BackendType]:
-    backend_types_and_computes = [
-        (configurator_class.TYPE, configurator_class.BACKEND_CLASS.COMPUTE_CLASS)
-        for configurator_class in configurator_classes
-    ]
-    if LOCAL_BACKEND_ENABLED:
-        backend_types_and_computes.append((BackendType.LOCAL, LocalCompute))
-    backend_types = []
-    for backend_type, compute_class in backend_types_and_computes:
-        if issubclass(compute_class, compute_feature_class):
-            backend_types.append(backend_type)
-    return backend_types
-
-
-_configurator_classes = list_available_configurator_classes()
-
-
-# The following backend lists do not include unavailable backends (i.e. backends missing deps).
-BACKENDS_WITH_CREATE_INSTANCE_SUPPORT = _get_backends_with_compute_feature(
-    configurator_classes=_configurator_classes,
-    compute_feature_class=ComputeWithCreateInstanceSupport,
-)
-BACKENDS_WITH_MULTINODE_SUPPORT = [BackendType.REMOTE] + _get_backends_with_compute_feature(
-    configurator_classes=_configurator_classes,
-    compute_feature_class=ComputeWithMultinodeSupport,
-)
-BACKENDS_WITH_PLACEMENT_GROUPS_SUPPORT = _get_backends_with_compute_feature(
-    configurator_classes=_configurator_classes,
-    compute_feature_class=ComputeWithPlacementGroupSupport,
-)
-BACKENDS_WITH_RESERVATION_SUPPORT = _get_backends_with_compute_feature(
-    configurator_classes=_configurator_classes,
-    compute_feature_class=ComputeWithReservationSupport,
-)
-BACKENDS_WITH_GATEWAY_SUPPORT = _get_backends_with_compute_feature(
-    configurator_classes=_configurator_classes,
-    compute_feature_class=ComputeWithGatewaySupport,
-)
-BACKENDS_WITH_PRIVATE_GATEWAY_SUPPORT = _get_backends_with_compute_feature(
-    configurator_classes=_configurator_classes,
-    compute_feature_class=ComputeWithPrivateGatewaySupport,
-)
-BACKENDS_WITH_VOLUMES_SUPPORT = _get_backends_with_compute_feature(
-    configurator_classes=_configurator_classes,
-    compute_feature_class=ComputeWithVolumeSupport,
-)
dstack/_internal/core/backends/cloudrift/api_client.py CHANGED
@@ -155,8 +155,20 @@ class RiftClient:
         logger.debug("Terminating instance with request data: %s", request_data)
         response_data = self._make_request("instances/terminate", request_data)
         if isinstance(response_data, dict):
+            logger.debug("Terminating instance with response: %s", response_data)
             info = response_data.get("terminated", [])
-            return len(info) > 0
+            is_terminated = len(info) > 0
+            if not is_terminated:
+                # check if the instance is already terminated
+                instance_info = self.get_instance_by_id(instance_id)
+                is_terminated = instance_info is None or instance_info.get("status") == "Inactive"
+                logger.debug(
+                    "Instance %s is already terminated: %s response: %s",
+                    instance_id,
+                    is_terminated,
+                    instance_info,
+                )
+            return is_terminated
 
         return False
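The CloudRift client change makes termination idempotent: when the terminate call reports nothing terminated, it re-checks whether the instance is already gone instead of treating the call as failed. A minimal sketch of the same pattern, with hypothetical api_terminate/api_get_instance stubs standing in for the real RiftClient methods:

    from typing import Optional

    def api_terminate(instance_id: str) -> dict:
        # Illustrative stub for the "instances/terminate" request.
        return {"terminated": []}

    def api_get_instance(instance_id: str) -> Optional[dict]:
        # Illustrative stub for get_instance_by_id; None means not found.
        return {"status": "Inactive"}

    def terminate_idempotently(instance_id: str) -> bool:
        if api_terminate(instance_id).get("terminated"):
            return True
        # Nothing was terminated now; the instance may already be gone.
        info = api_get_instance(instance_id)
        return info is None or info.get("status") == "Inactive"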
dstack/_internal/core/backends/features.py CHANGED
@@ -0,0 +1,64 @@
+from dstack._internal.core.backends.base.compute import (
+    ComputeWithCreateInstanceSupport,
+    ComputeWithGatewaySupport,
+    ComputeWithMultinodeSupport,
+    ComputeWithPlacementGroupSupport,
+    ComputeWithPrivateGatewaySupport,
+    ComputeWithReservationSupport,
+    ComputeWithVolumeSupport,
+)
+from dstack._internal.core.backends.base.configurator import Configurator
+from dstack._internal.core.backends.configurators import list_available_configurator_classes
+from dstack._internal.core.backends.local.compute import LocalCompute
+from dstack._internal.core.models.backends.base import BackendType
+from dstack._internal.settings import LOCAL_BACKEND_ENABLED
+
+_configurator_classes = list_available_configurator_classes()
+
+
+def _get_backends_with_compute_feature(
+    configurator_classes: list[type[Configurator]],
+    compute_feature_class: type,
+) -> list[BackendType]:
+    backend_types_and_computes = [
+        (configurator_class.TYPE, configurator_class.BACKEND_CLASS.COMPUTE_CLASS)
+        for configurator_class in configurator_classes
+    ]
+    if LOCAL_BACKEND_ENABLED:
+        backend_types_and_computes.append((BackendType.LOCAL, LocalCompute))
+    backend_types = []
+    for backend_type, compute_class in backend_types_and_computes:
+        if issubclass(compute_class, compute_feature_class):
+            backend_types.append(backend_type)
+    return backend_types
+
+
+# The following backend lists do not include unavailable backends (i.e. backends missing deps).
+BACKENDS_WITH_CREATE_INSTANCE_SUPPORT = _get_backends_with_compute_feature(
+    configurator_classes=_configurator_classes,
+    compute_feature_class=ComputeWithCreateInstanceSupport,
+)
+BACKENDS_WITH_MULTINODE_SUPPORT = [BackendType.REMOTE] + _get_backends_with_compute_feature(
+    configurator_classes=_configurator_classes,
+    compute_feature_class=ComputeWithMultinodeSupport,
+)
+BACKENDS_WITH_PLACEMENT_GROUPS_SUPPORT = _get_backends_with_compute_feature(
+    configurator_classes=_configurator_classes,
+    compute_feature_class=ComputeWithPlacementGroupSupport,
+)
+BACKENDS_WITH_RESERVATION_SUPPORT = _get_backends_with_compute_feature(
+    configurator_classes=_configurator_classes,
+    compute_feature_class=ComputeWithReservationSupport,
+)
+BACKENDS_WITH_GATEWAY_SUPPORT = _get_backends_with_compute_feature(
+    configurator_classes=_configurator_classes,
+    compute_feature_class=ComputeWithGatewaySupport,
+)
+BACKENDS_WITH_PRIVATE_GATEWAY_SUPPORT = _get_backends_with_compute_feature(
+    configurator_classes=_configurator_classes,
+    compute_feature_class=ComputeWithPrivateGatewaySupport,
+)
+BACKENDS_WITH_VOLUMES_SUPPORT = _get_backends_with_compute_feature(
+    configurator_classes=_configurator_classes,
+    compute_feature_class=ComputeWithVolumeSupport,
+)
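The 65 lines deleted from backends/__init__.py reappear above, nearly verbatim, as the new features.py module, leaving the package __init__ free of import-time side effects. The detection mechanism is plain issubclass checks against marker mixins; a tiny self-contained illustration of the pattern (the classes and backend names here are stand-ins, not dstack's real ones):

    class ComputeWithVolumeSupport:
        """Marker mixin: the compute implementation can attach volumes."""

    class AwsCompute(ComputeWithVolumeSupport):
        pass

    class RunpodCompute:
        pass

    COMPUTES = {"aws": AwsCompute, "runpod": RunpodCompute}

    # Backend names whose compute class opts into the feature.
    BACKENDS_WITH_VOLUMES_SUPPORT = [
        name for name, cls in COMPUTES.items() if issubclass(cls, ComputeWithVolumeSupport)
    ]
    print(BACKENDS_WITH_VOLUMES_SUPPORT)  # ['aws']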
dstack/_internal/core/backends/oci/resources.py CHANGED
@@ -26,7 +26,7 @@ from dstack import version
 from dstack._internal.core.backends.oci.region import OCIRegionClient
 from dstack._internal.core.errors import BackendError
 from dstack._internal.core.models.instances import InstanceOffer
-from dstack._internal.utils.common import …
+from dstack._internal.utils.common import batched
 from dstack._internal.utils.logging import get_logger
 
 logger = get_logger(__name__)
@@ -667,21 +667,21 @@ def add_security_group_rules(
     security_group_id: str, rules: Iterable[SecurityRule], client: oci.core.VirtualNetworkClient
 ) -> None:
     rules_details = map(SecurityRule.to_sdk_add_rule_details, rules)
-    for …
+    for batch in batched(rules_details, ADD_SECURITY_RULES_MAX_CHUNK_SIZE):
         client.add_network_security_group_security_rules(
             security_group_id,
-            oci.core.models.AddNetworkSecurityGroupSecurityRulesDetails(security_rules=…
+            oci.core.models.AddNetworkSecurityGroupSecurityRulesDetails(security_rules=batch),
         )
 
 
 def remove_security_group_rules(
     security_group_id: str, rule_ids: Iterable[str], client: oci.core.VirtualNetworkClient
 ) -> None:
-    for …
+    for batch in batched(rule_ids, REMOVE_SECURITY_RULES_MAX_CHUNK_SIZE):
         client.remove_network_security_group_security_rules(
             security_group_id,
             oci.core.models.RemoveNetworkSecurityGroupSecurityRulesDetails(
-                security_rule_ids=…
+                security_rule_ids=batch
             ),
         )
 
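Both security-rule loops now chunk their payloads with a `batched` helper from dstack/_internal/utils/common.py, whose +10/-21 diff is not included in this excerpt. A plausible sketch consistent with how it is called above, mirroring the itertools.batched recipe added in Python 3.12:

    from itertools import islice
    from typing import Iterable, Iterator, Tuple, TypeVar

    T = TypeVar("T")

    def batched(iterable: Iterable[T], n: int) -> Iterator[Tuple[T, ...]]:
        # Yield successive tuples of at most n items each.
        if n < 1:
            raise ValueError("n must be at least one")
        it = iter(iterable)
        while batch := tuple(islice(it, n)):
            yield batch

With this, add_security_group_rules issues one OCI request per chunk of ADD_SECURITY_RULES_MAX_CHUNK_SIZE rules instead of assuming everything fits in a single call.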
dstack/_internal/core/compatibility/fleets.py CHANGED
@@ -57,6 +57,8 @@ def get_fleet_spec_excludes(fleet_spec: FleetSpec) -> Optional[IncludeExcludeDictType]:
         profile_excludes.add("startup_order")
     if profile.stop_criteria is None:
         profile_excludes.add("stop_criteria")
+    if profile.schedule is None:
+        profile_excludes.add("schedule")
     if configuration_excludes:
         spec_excludes["configuration"] = configuration_excludes
     if profile_excludes:
dstack/_internal/core/compatibility/runs.py CHANGED
@@ -126,6 +126,10 @@ def get_run_spec_excludes(run_spec: RunSpec) -> IncludeExcludeDictType:
         configuration_excludes["files"] = True
     if not run_spec.file_archives:
         spec_excludes["file_archives"] = True
+    if configuration.schedule is None:
+        configuration_excludes["schedule"] = True
+    if profile is not None and profile.schedule is None:
+        profile_excludes.add("schedule")
 
     if configuration_excludes:
         spec_excludes["configuration"] = configuration_excludes
dstack/_internal/core/models/profiles.py CHANGED
@@ -8,6 +8,7 @@ from typing_extensions import Annotated, Literal
 from dstack._internal.core.models.backends.base import BackendType
 from dstack._internal.core.models.common import CoreModel, Duration
 from dstack._internal.utils.common import list_enum_values_for_annotation
+from dstack._internal.utils.cron import validate_cron
 from dstack._internal.utils.json_utils import pydantic_orjson_dumps_with_indent
 from dstack._internal.utils.tags import tags_validator
 
@@ -167,6 +168,38 @@ class UtilizationPolicy(CoreModel):
         return v
 
 
+class Schedule(CoreModel):
+    cron: Annotated[
+        Union[List[str], str],
+        Field(
+            description=(
+                "A cron expression or a list of cron expressions specifying the UTC time when the run needs to be started"
+            )
+        ),
+    ]
+
+    @validator("cron")
+    def _validate_cron(cls, v: Union[List[str], str]) -> List[str]:
+        if isinstance(v, str):
+            values = [v]
+        else:
+            values = v
+        if len(values) == 0:
+            raise ValueError("At least one cron expression must be specified")
+        for value in values:
+            validate_cron(value)
+        return values
+
+    @property
+    def crons(self) -> List[str]:
+        """
+        Access `cron` attribute as a list.
+        """
+        if isinstance(self.cron, str):
+            return [self.cron]
+        return self.cron
+
+
 class ProfileParams(CoreModel):
     backends: Annotated[
         Optional[List[BackendType]],
@@ -281,6 +314,10 @@ class ProfileParams(CoreModel):
             )
         ),
     ] = None
+    schedule: Annotated[
+        Optional[Schedule],
+        Field(description=("The schedule for starting the run at specified time")),
+    ] = None
     fleets: Annotated[
         Optional[list[str]], Field(description="The fleets considered for reuse")
     ] = None
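`validate_cron` is imported from dstack/_internal/utils/cron.py, whose five added lines are not part of this excerpt. A plausible sketch, assuming it delegates to the croniter package (an assumption, not confirmed by this diff):

    from croniter import croniter

    def validate_cron(value: str) -> None:
        # Reject anything croniter cannot parse as a cron expression.
        if not croniter.is_valid(value):
            raise ValueError(f"Invalid cron expression: {value}")

With the Schedule model above, both Schedule(cron="0 8 * * *") and Schedule(cron=["0 8 * * *", "0 20 * * *"]) validate, and the crons property always returns a list.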
dstack/_internal/server/app.py CHANGED
@@ -13,6 +13,7 @@ from fastapi.datastructures import URL
 from fastapi.responses import HTMLResponse, RedirectResponse
 from fastapi.staticfiles import StaticFiles
 from prometheus_client import Counter, Histogram
+from sentry_sdk.types import SamplingContext
 
 from dstack._internal.cli.utils.common import console
 from dstack._internal.core.errors import ForbiddenError, ServerClientError
@@ -81,16 +82,6 @@ REQUEST_DURATION = Histogram(
 
 
 def create_app() -> FastAPI:
-    if settings.SENTRY_DSN is not None:
-        sentry_sdk.init(
-            dsn=settings.SENTRY_DSN,
-            release=DSTACK_VERSION,
-            environment=settings.SERVER_ENVIRONMENT,
-            enable_tracing=True,
-            traces_sample_rate=settings.SENTRY_TRACES_SAMPLE_RATE,
-            profiles_sample_rate=settings.SENTRY_PROFILES_SAMPLE_RATE,
-        )
-
     app = FastAPI(
         docs_url="/api/docs",
         lifespan=lifespan,
@@ -102,6 +93,15 @@ def create_app() -> FastAPI:
 @asynccontextmanager
 async def lifespan(app: FastAPI):
     configure_logging()
+    if settings.SENTRY_DSN is not None:
+        sentry_sdk.init(
+            dsn=settings.SENTRY_DSN,
+            release=DSTACK_VERSION,
+            environment=settings.SERVER_ENVIRONMENT,
+            enable_tracing=True,
+            traces_sampler=_sentry_traces_sampler,
+            profiles_sample_rate=settings.SENTRY_PROFILES_SAMPLE_RATE,
+        )
     server_executor = ThreadPoolExecutor(max_workers=settings.SERVER_EXECUTOR_MAX_WORKERS)
     asyncio.get_running_loop().set_default_executor(server_executor)
     await migrate()
@@ -379,3 +379,15 @@ def _print_dstack_logo():
         ╰━━┻━━┻╯╱╰╯╰━━┻╯
         [/]"""
     )
+
+
+def _sentry_traces_sampler(sampling_context: SamplingContext) -> float:
+    parent_sampling_decision = sampling_context["parent_sampled"]
+    if parent_sampling_decision is not None:
+        return float(parent_sampling_decision)
+    transaction_context = sampling_context["transaction_context"]
+    name = transaction_context.get("name")
+    if name is not None:
+        if name.startswith("background."):
+            return settings.SENTRY_TRACES_BACKGROUND_SAMPLE_RATE
+    return settings.SENTRY_TRACES_SAMPLE_RATE
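The sampler keys off transaction names with a `background.` prefix, which the new instrument_background_task decorator from dstack/_internal/server/utils/sentry_utils.py (+12 lines, not shown here) presumably attaches. A plausible sketch of such a decorator; the naming scheme is inferred from the sampler, not confirmed by this diff:

    import functools

    import sentry_sdk

    def instrument_background_task(func):
        # Run the task inside a Sentry transaction named "background.<task>",
        # so _sentry_traces_sampler can apply the background sample rate.
        @functools.wraps(func)
        async def wrapper(*args, **kwargs):
            name = f"background.{func.__name__}"
            with sentry_sdk.start_transaction(op="function", name=name):
                return await func(*args, **kwargs)

        return wrapper

Note also that sentry_sdk.init() moved from create_app() into the lifespan hook, so initialization now happens once the event loop is running.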
dstack/_internal/server/background/__init__.py CHANGED
@@ -79,6 +79,11 @@ def start_background_tasks() -> AsyncIOScheduler:
         process_idle_volumes, IntervalTrigger(seconds=60, jitter=10), max_instances=1
     )
     _scheduler.add_job(process_placement_groups, IntervalTrigger(seconds=30, jitter=5))
+    _scheduler.add_job(
+        process_fleets,
+        IntervalTrigger(seconds=10, jitter=2),
+        max_instances=1,
+    )
     for replica in range(settings.SERVER_BACKGROUND_PROCESSING_FACTOR):
         # Add multiple copies of tasks if requested.
         # max_instances=1 for additional copies to avoid running too many tasks.
@@ -113,11 +118,5 @@ def start_background_tasks() -> AsyncIOScheduler:
             kwargs={"batch_size": 5},
             max_instances=2 if replica == 0 else 1,
         )
-        _scheduler.add_job(
-            process_fleets,
-            IntervalTrigger(seconds=10, jitter=2),
-            kwargs={"batch_size": 5},
-            max_instances=2 if replica == 0 else 1,
-        )
     _scheduler.start()
     return _scheduler
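process_fleets now runs as a single batched job rather than one copy per processing replica. For reference, the scheduling API used throughout this module is APScheduler's AsyncIOScheduler; a minimal runnable sketch:

    import asyncio

    from apscheduler.schedulers.asyncio import AsyncIOScheduler
    from apscheduler.triggers.interval import IntervalTrigger

    async def process_fleets():
        print("processing a batch of fleets")

    async def main():
        scheduler = AsyncIOScheduler()
        # jitter staggers start times; max_instances=1 prevents overlapping runs.
        scheduler.add_job(process_fleets, IntervalTrigger(seconds=10, jitter=2), max_instances=1)
        scheduler.start()  # requires a running asyncio event loop
        await asyncio.sleep(30)

    asyncio.run(main())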
dstack/_internal/server/background/tasks/process_fleets.py CHANGED
@@ -1,36 +1,37 @@
-import asyncio
 from datetime import timedelta
+from typing import List
 
-from sqlalchemy import select
+from sqlalchemy import select, update
 from sqlalchemy.ext.asyncio import AsyncSession
-from sqlalchemy.orm import joinedload
+from sqlalchemy.orm import joinedload, load_only
 
 from dstack._internal.core.models.fleets import FleetStatus
 from dstack._internal.server.db import get_db, get_session_ctx
-from dstack._internal.server.models import …
+from dstack._internal.server.models import (
+    FleetModel,
+    InstanceModel,
+    JobModel,
+    PlacementGroupModel,
+    RunModel,
+)
 from dstack._internal.server.services.fleets import (
     is_fleet_empty,
     is_fleet_in_use,
 )
 from dstack._internal.server.services.locking import get_locker
-from dstack._internal.server.…
+from dstack._internal.server.utils import sentry_utils
 from dstack._internal.utils.common import get_current_datetime
 from dstack._internal.utils.logging import get_logger
 
 logger = get_logger(__name__)
 
 
+BATCH_SIZE = 10
 MIN_PROCESSING_INTERVAL = timedelta(seconds=30)
 
 
-async def process_fleets(batch_size: int):
-    tasks = []
-    for _ in range(batch_size):
-        tasks.append(_process_next_fleet())
-    await asyncio.gather(*tasks)
-
-
-async def _process_next_fleet():
+@sentry_utils.instrument_background_task
+async def process_fleets():
     lock, lockset = get_locker(get_db().dialect_name).get_lockset(FleetModel.__tablename__)
     async with get_session_ctx() as session:
         async with lock:
@@ -40,51 +41,64 @@ async def _process_next_fleet():
                     FleetModel.deleted == False,
                     FleetModel.id.not_in(lockset),
                     FleetModel.last_processed_at
-                    < get_current_datetime()…
+                    < get_current_datetime() - MIN_PROCESSING_INTERVAL,
                 )
+                .options(load_only(FleetModel.id))
                 .order_by(FleetModel.last_processed_at.asc())
-                .limit(…
+                .limit(BATCH_SIZE)
                 .with_for_update(skip_locked=True, key_share=True)
             )
-…
-…
-…
-…
+            fleet_models = list(res.scalars().all())
+            fleet_ids = [fm.id for fm in fleet_models]
+            for fleet_id in fleet_ids:
+                lockset.add(fleet_id)
         try:
-…
-            await _process_fleet(session=session, fleet_model=fleet_model)
+            await _process_fleets(session=session, fleet_models=fleet_models)
         finally:
-            lockset.difference_update(…
+            lockset.difference_update(fleet_ids)
 
 
-async def …
-…
+async def _process_fleets(session: AsyncSession, fleet_models: List[FleetModel]):
+    fleet_ids = [fm.id for fm in fleet_models]
     # Refetch to load related attributes.
-    # joinedload produces LEFT OUTER JOIN that can't be used with FOR UPDATE.
     res = await session.execute(
         select(FleetModel)
-        .where(FleetModel.id…
-        .options(joinedload(FleetModel.…
-        .options(…
-…
+        .where(FleetModel.id.in_(fleet_ids))
+        .options(joinedload(FleetModel.instances).load_only(InstanceModel.deleted))
+        .options(
+            joinedload(FleetModel.instances).joinedload(InstanceModel.jobs).load_only(JobModel.id)
+        )
+        .options(joinedload(FleetModel.runs).load_only(RunModel.status))
         .execution_options(populate_existing=True)
     )
-…
-…
+    fleet_models = list(res.unique().scalars().all())
+
+    deleted_fleets_ids = []
+    now = get_current_datetime()
+    for fleet_model in fleet_models:
+        deleted = _autodelete_fleet(fleet_model)
+        if deleted:
+            deleted_fleets_ids.append(fleet_model.id)
+        fleet_model.last_processed_at = now
+
+    await session.execute(
+        update(PlacementGroupModel)
+        .where(
+            PlacementGroupModel.fleet_id.in_(deleted_fleets_ids),
+        )
+        .values(fleet_deleted=True)
+    )
+    await session.commit()
 
 
-…
+def _autodelete_fleet(fleet_model: FleetModel) -> bool:
     # Currently all empty fleets are autodeleted.
     # TODO: If fleets with `nodes: 0..` are supported, their deletion should be skipped.
     if is_fleet_in_use(fleet_model) or not is_fleet_empty(fleet_model):
-…
-        await session.commit()
-        return
+        return False
 
     logger.info("Automatic cleanup of an empty fleet %s", fleet_model.name)
     fleet_model.status = FleetStatus.TERMINATED
     fleet_model.deleted = True
-    fleet_model.last_processed_at = get_current_datetime()
-    await schedule_fleet_placement_groups_deletion(session=session, fleet_id=fleet_model.id)
-    await session.commit()
     logger.info("Fleet %s deleted", fleet_model.name)
+    return True
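The rewrite keeps dstack's usual concurrency recipe, SELECT ... FOR UPDATE SKIP LOCKED plus an in-process lockset, but now locks a batch of ids (with load_only to skip unneeded columns) instead of one full row at a time. A condensed sketch of just the locking step (the model argument is a stand-in for any mapped class with id and last_processed_at columns):

    from sqlalchemy import select
    from sqlalchemy.orm import load_only

    async def pick_batch(session, model, lockset, batch_size=10):
        # Lock up to batch_size rows no other worker holds; SKIP LOCKED
        # means concurrent server replicas never block on each other.
        res = await session.execute(
            select(model)
            .where(model.id.not_in(lockset))
            .order_by(model.last_processed_at.asc())
            .options(load_only(model.id))
            .limit(batch_size)
            .with_for_update(skip_locked=True, key_share=True)
        )
        ids = [m.id for m in res.scalars().all()]
        lockset.update(ids)  # also guard against tasks in this same process
        return ids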
dstack/_internal/server/background/tasks/process_gateways.py CHANGED
@@ -17,6 +17,7 @@ from dstack._internal.server.services.gateways import (
 )
 from dstack._internal.server.services.locking import advisory_lock_ctx, get_locker
 from dstack._internal.server.services.logging import fmt
+from dstack._internal.server.utils import sentry_utils
 from dstack._internal.utils.common import get_current_datetime
 from dstack._internal.utils.logging import get_logger
 
@@ -28,6 +29,7 @@ async def process_gateways_connections():
     await _process_active_connections()
 
 
+@sentry_utils.instrument_background_task
 async def process_gateways():
     lock, lockset = get_locker(get_db().dialect_name).get_lockset(GatewayModel.__tablename__)
     async with get_session_ctx() as session:
@@ -110,7 +112,6 @@ async def _process_connection(conn: GatewayConnection):
 async def _process_submitted_gateway(session: AsyncSession, gateway_model: GatewayModel):
     logger.info("%s: started gateway provisioning", fmt(gateway_model))
     # Refetch to load related attributes.
-    # joinedload produces LEFT OUTER JOIN that can't be used with FOR UPDATE.
     res = await session.execute(
         select(GatewayModel)
         .where(GatewayModel.id == gateway_model.id)
@@ -157,7 +158,6 @@ async def _process_provisioning_gateway(
     session: AsyncSession, gateway_model: GatewayModel
 ) -> None:
     # Refetch to load related attributes.
-    # joinedload produces LEFT OUTER JOIN that can't be used with FOR UPDATE.
     res = await session.execute(
         select(GatewayModel)
         .where(GatewayModel.id == gateway_model.id)
dstack/_internal/server/background/tasks/process_idle_volumes.py CHANGED
@@ -10,13 +10,14 @@ from dstack._internal.core.errors import BackendNotAvailable
 from dstack._internal.core.models.profiles import parse_duration
 from dstack._internal.core.models.volumes import VolumeStatus
 from dstack._internal.server.db import get_db, get_session_ctx
-from dstack._internal.server.models import ProjectModel, VolumeModel
+from dstack._internal.server.models import ProjectModel, UserModel, VolumeModel
 from dstack._internal.server.services import backends as backends_services
 from dstack._internal.server.services.locking import get_locker
 from dstack._internal.server.services.volumes import (
     get_volume_configuration,
     volume_model_to_volume,
 )
+from dstack._internal.server.utils import sentry_utils
 from dstack._internal.utils import common
 from dstack._internal.utils.common import get_current_datetime
 from dstack._internal.utils.logging import get_logger
@@ -24,6 +25,7 @@ from dstack._internal.utils.logging import get_logger
 logger = get_logger(__name__)
 
 
+@sentry_utils.instrument_background_task
 async def process_idle_volumes():
     lock, lockset = get_locker(get_db().dialect_name).get_lockset(VolumeModel.__tablename__)
     async with get_session_ctx() as session:
@@ -49,7 +51,7 @@ async def process_idle_volumes():
         select(VolumeModel)
         .where(VolumeModel.id.in_(volume_ids))
         .options(joinedload(VolumeModel.project).joinedload(ProjectModel.backends))
-        .options(joinedload(VolumeModel.user))
+        .options(joinedload(VolumeModel.user).load_only(UserModel.name))
         .options(joinedload(VolumeModel.attachments))
         .execution_options(populate_existing=True)
     )
@@ -82,8 +84,7 @@ def _should_delete_volume(volume: VolumeModel) -> bool:
 
 def _get_idle_time(volume: VolumeModel) -> datetime.timedelta:
     last_used = volume.last_job_processed_at or volume.created_at
-…
-    idle_time = get_current_datetime() - last_used_utc
+    idle_time = get_current_datetime() - last_used
     return max(idle_time, datetime.timedelta(0))
 
 
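The _get_idle_time fix drops an intermediate last_used_utc conversion and subtracts the stored timestamp directly. The subtraction only works when both datetimes agree on timezone awareness; a small illustration (assuming, as elsewhere in dstack, that the current time is taken as timezone-aware UTC):

    import datetime

    def get_idle_time(now: datetime.datetime, last_used: datetime.datetime) -> datetime.timedelta:
        # Mixing aware and naive datetimes raises TypeError; the max()
        # clamp guards against clock skew producing negative idle time.
        return max(now - last_used, datetime.timedelta(0))

    now = datetime.datetime.now(datetime.timezone.utc)
    print(get_idle_time(now, now - datetime.timedelta(minutes=5)))  # 0:05:00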