dstack 0.19.20__py3-none-any.whl → 0.19.22__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dstack/_internal/cli/commands/apply.py +8 -3
- dstack/_internal/cli/services/configurators/__init__.py +8 -0
- dstack/_internal/cli/services/configurators/fleet.py +1 -1
- dstack/_internal/cli/services/configurators/gateway.py +1 -1
- dstack/_internal/cli/services/configurators/run.py +11 -1
- dstack/_internal/cli/services/configurators/volume.py +1 -1
- dstack/_internal/cli/utils/common.py +48 -5
- dstack/_internal/cli/utils/fleet.py +5 -5
- dstack/_internal/cli/utils/run.py +32 -0
- dstack/_internal/core/backends/__init__.py +0 -65
- dstack/_internal/core/backends/configurators.py +9 -0
- dstack/_internal/core/backends/features.py +64 -0
- dstack/_internal/core/backends/hotaisle/__init__.py +1 -0
- dstack/_internal/core/backends/hotaisle/api_client.py +109 -0
- dstack/_internal/core/backends/hotaisle/backend.py +16 -0
- dstack/_internal/core/backends/hotaisle/compute.py +225 -0
- dstack/_internal/core/backends/hotaisle/configurator.py +60 -0
- dstack/_internal/core/backends/hotaisle/models.py +45 -0
- dstack/_internal/core/backends/lambdalabs/compute.py +2 -1
- dstack/_internal/core/backends/models.py +8 -0
- dstack/_internal/core/compatibility/fleets.py +2 -0
- dstack/_internal/core/compatibility/runs.py +12 -0
- dstack/_internal/core/models/backends/base.py +2 -0
- dstack/_internal/core/models/configurations.py +139 -1
- dstack/_internal/core/models/health.py +28 -0
- dstack/_internal/core/models/instances.py +2 -0
- dstack/_internal/core/models/logs.py +2 -1
- dstack/_internal/core/models/profiles.py +37 -0
- dstack/_internal/core/models/runs.py +21 -1
- dstack/_internal/core/services/ssh/tunnel.py +7 -0
- dstack/_internal/server/app.py +26 -10
- dstack/_internal/server/background/__init__.py +9 -6
- dstack/_internal/server/background/tasks/process_fleets.py +52 -38
- dstack/_internal/server/background/tasks/process_gateways.py +2 -2
- dstack/_internal/server/background/tasks/process_idle_volumes.py +5 -4
- dstack/_internal/server/background/tasks/process_instances.py +168 -103
- dstack/_internal/server/background/tasks/process_metrics.py +9 -2
- dstack/_internal/server/background/tasks/process_placement_groups.py +2 -0
- dstack/_internal/server/background/tasks/process_probes.py +164 -0
- dstack/_internal/server/background/tasks/process_prometheus_metrics.py +14 -2
- dstack/_internal/server/background/tasks/process_running_jobs.py +142 -124
- dstack/_internal/server/background/tasks/process_runs.py +84 -34
- dstack/_internal/server/background/tasks/process_submitted_jobs.py +12 -10
- dstack/_internal/server/background/tasks/process_terminating_jobs.py +12 -4
- dstack/_internal/server/background/tasks/process_volumes.py +4 -1
- dstack/_internal/server/migrations/versions/25479f540245_add_probes.py +43 -0
- dstack/_internal/server/migrations/versions/50dd7ea98639_index_status_columns.py +55 -0
- dstack/_internal/server/migrations/versions/728b1488b1b4_add_instance_health.py +50 -0
- dstack/_internal/server/migrations/versions/ec02a26a256c_add_runmodel_next_triggered_at.py +38 -0
- dstack/_internal/server/models.py +57 -16
- dstack/_internal/server/routers/instances.py +33 -5
- dstack/_internal/server/schemas/health/dcgm.py +56 -0
- dstack/_internal/server/schemas/instances.py +32 -0
- dstack/_internal/server/schemas/runner.py +5 -0
- dstack/_internal/server/services/fleets.py +19 -10
- dstack/_internal/server/services/gateways/__init__.py +17 -17
- dstack/_internal/server/services/instances.py +113 -15
- dstack/_internal/server/services/jobs/__init__.py +18 -13
- dstack/_internal/server/services/jobs/configurators/base.py +26 -0
- dstack/_internal/server/services/logging.py +4 -2
- dstack/_internal/server/services/logs/aws.py +13 -1
- dstack/_internal/server/services/logs/gcp.py +16 -1
- dstack/_internal/server/services/offers.py +3 -3
- dstack/_internal/server/services/probes.py +6 -0
- dstack/_internal/server/services/projects.py +51 -19
- dstack/_internal/server/services/prometheus/client_metrics.py +3 -0
- dstack/_internal/server/services/prometheus/custom_metrics.py +2 -3
- dstack/_internal/server/services/runner/client.py +52 -20
- dstack/_internal/server/services/runner/ssh.py +4 -4
- dstack/_internal/server/services/runs.py +115 -39
- dstack/_internal/server/services/services/__init__.py +4 -1
- dstack/_internal/server/services/ssh.py +66 -0
- dstack/_internal/server/services/users.py +2 -3
- dstack/_internal/server/services/volumes.py +11 -11
- dstack/_internal/server/settings.py +16 -0
- dstack/_internal/server/statics/index.html +1 -1
- dstack/_internal/server/statics/{main-8f9ee218d3eb45989682.css → main-03e818b110e1d5705378.css} +1 -1
- dstack/_internal/server/statics/{main-39a767528976f8078166.js → main-cc067b7fd1a8f33f97da.js} +26 -15
- dstack/_internal/server/statics/{main-39a767528976f8078166.js.map → main-cc067b7fd1a8f33f97da.js.map} +1 -1
- dstack/_internal/server/testing/common.py +51 -0
- dstack/_internal/{core/backends/remote → server/utils}/provisioning.py +22 -17
- dstack/_internal/server/utils/sentry_utils.py +12 -0
- dstack/_internal/settings.py +3 -0
- dstack/_internal/utils/common.py +15 -0
- dstack/_internal/utils/cron.py +5 -0
- dstack/api/server/__init__.py +1 -1
- dstack/version.py +1 -1
- {dstack-0.19.20.dist-info → dstack-0.19.22.dist-info}/METADATA +13 -22
- {dstack-0.19.20.dist-info → dstack-0.19.22.dist-info}/RECORD +93 -75
- /dstack/_internal/{core/backends/remote → server/schemas/health}/__init__.py +0 -0
- {dstack-0.19.20.dist-info → dstack-0.19.22.dist-info}/WHEEL +0 -0
- {dstack-0.19.20.dist-info → dstack-0.19.22.dist-info}/entry_points.txt +0 -0
- {dstack-0.19.20.dist-info → dstack-0.19.22.dist-info}/licenses/LICENSE.md +0 -0
dstack/_internal/core/models/profiles.py
CHANGED

@@ -8,6 +8,7 @@ from typing_extensions import Annotated, Literal
 from dstack._internal.core.models.backends.base import BackendType
 from dstack._internal.core.models.common import CoreModel, Duration
 from dstack._internal.utils.common import list_enum_values_for_annotation
+from dstack._internal.utils.cron import validate_cron
 from dstack._internal.utils.json_utils import pydantic_orjson_dumps_with_indent
 from dstack._internal.utils.tags import tags_validator

@@ -167,6 +168,38 @@ class UtilizationPolicy(CoreModel):
         return v


+class Schedule(CoreModel):
+    cron: Annotated[
+        Union[List[str], str],
+        Field(
+            description=(
+                "A cron expression or a list of cron expressions specifying the UTC time when the run needs to be started"
+            )
+        ),
+    ]
+
+    @validator("cron")
+    def _validate_cron(cls, v: Union[List[str], str]) -> List[str]:
+        if isinstance(v, str):
+            values = [v]
+        else:
+            values = v
+        if len(values) == 0:
+            raise ValueError("At least one cron expression must be specified")
+        for value in values:
+            validate_cron(value)
+        return values
+
+    @property
+    def crons(self) -> List[str]:
+        """
+        Access `cron` attribute as a list.
+        """
+        if isinstance(self.cron, str):
+            return [self.cron]
+        return self.cron
+
+
 class ProfileParams(CoreModel):
     backends: Annotated[
         Optional[List[BackendType]],

@@ -281,6 +314,10 @@ class ProfileParams(CoreModel):
             )
         ),
     ] = None
+    schedule: Annotated[
+        Optional[Schedule],
+        Field(description=("The schedule for starting the run at specified time")),
+    ] = None
     fleets: Annotated[
         Optional[list[str]], Field(description="The fleets considered for reuse")
     ] = None
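
Note: a hedged usage sketch of the new Schedule model (not part of the diff itself). Since CoreModel is a pydantic model, the cron validator runs on construction, normalizing a single expression to a list and rejecting empty lists:

    from dstack._internal.core.models.profiles import Schedule

    # A single cron expression passes validation and is exposed as a list.
    s = Schedule(cron="0 8 * * *")
    assert s.crons == ["0 8 * * *"]

    # A list is validated expression by expression; an empty list raises ValueError.
    s = Schedule(cron=["0 8 * * *", "0 20 * * *"])
    assert s.crons == ["0 8 * * *", "0 20 * * *"]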
dstack/_internal/core/models/runs.py
CHANGED

@@ -1,6 +1,6 @@
 from datetime import datetime, timedelta
 from enum import Enum
-from typing import Any, Dict, List, Optional, Type
+from typing import Any, Dict, List, Literal, Optional, Type

 from pydantic import UUID4, Field, root_validator
 from typing_extensions import Annotated

@@ -8,8 +8,11 @@ from typing_extensions import Annotated
 from dstack._internal.core.models.backends.base import BackendType
 from dstack._internal.core.models.common import ApplyAction, CoreModel, NetworkMode, RegistryAuth
 from dstack._internal.core.models.configurations import (
+    DEFAULT_PROBE_METHOD,
     DEFAULT_REPO_DIR,
     AnyRunConfiguration,
+    HTTPHeaderSpec,
+    HTTPMethod,
     RunConfiguration,
     ServiceConfiguration,
 )

@@ -223,6 +226,17 @@ class JobSSHKey(CoreModel):
     public: str


+class ProbeSpec(CoreModel):
+    type: Literal["http"]  # expect other probe types in the future, namely `exec`
+    url: str
+    method: HTTPMethod = DEFAULT_PROBE_METHOD
+    headers: list[HTTPHeaderSpec] = []
+    body: Optional[str] = None
+    timeout: int
+    interval: int
+    ready_after: int
+
+
 class JobSpec(CoreModel):
     replica_num: int = 0  # default value for backward compatibility
     job_num: int

@@ -256,6 +270,7 @@ class JobSpec(CoreModel):
     file_archives: list[FileArchiveMapping] = []
     # None for non-services and pre-0.19.19 services. See `get_service_port`
     service_port: Optional[int] = None
+    probes: list[ProbeSpec] = []


 class JobProvisioningData(CoreModel):

@@ -325,6 +340,10 @@ class ClusterInfo(CoreModel):
     gpus_per_job: int


+class Probe(CoreModel):
+    success_streak: int
+
+
 class JobSubmission(CoreModel):
     id: UUID4
     submission_num: int

@@ -341,6 +360,7 @@ class JobSubmission(CoreModel):
     job_provisioning_data: Optional[JobProvisioningData]
     job_runtime_data: Optional[JobRuntimeData]
     error: Optional[str] = None
+    probes: list[Probe] = []

     @property
     def age(self) -> timedelta:
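
Note: a sketch of what a filled-in probe spec might look like (illustrative values only; field units and readiness semantics are inferred from the diff — `Probe.success_streak` on submissions suggests a replica counts as ready after `ready_after` consecutive successes):

    from dstack._internal.core.models.runs import ProbeSpec

    probe = ProbeSpec(
        type="http",     # the only probe type so far; `exec` is anticipated
        url="/health",   # illustrative endpoint
        timeout=5,       # assumed: seconds to wait for a response
        interval=10,     # assumed: seconds between probe attempts
        ready_after=3,   # assumed: consecutive successes required for readiness
    )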
dstack/_internal/core/services/ssh/tunnel.py
CHANGED

@@ -236,6 +236,13 @@ class SSHTunnel:
     def __exit__(self, exc_type, exc_val, exc_tb):
         self.close()

+    async def __aenter__(self):
+        await self.aopen()
+        return self
+
+    async def __aexit__(self, exc_type, exc_val, exc_tb):
+        await self.aclose()
+
     def _get_proxy_command(self) -> Optional[str]:
         proxy_command: Optional[str] = None
         for params, identity_path in self.ssh_proxies:
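
Note: with __aenter__/__aexit__ added, SSHTunnel becomes an async context manager. A minimal usage sketch (constructor arguments elided; aopen()/aclose() are the tunnel's existing async open/close, as called by the new methods above):

    from dstack._internal.core.services.ssh.tunnel import SSHTunnel

    async def query_through_tunnel(tunnel: SSHTunnel) -> None:
        async with tunnel:  # awaits aopen() on entry, aclose() on exit
            ...  # talk to the remote service through the forwarded port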
dstack/_internal/server/app.py
CHANGED
@@ -13,6 +13,7 @@ from fastapi.datastructures import URL
 from fastapi.responses import HTMLResponse, RedirectResponse
 from fastapi.staticfiles import StaticFiles
 from prometheus_client import Counter, Histogram
+from sentry_sdk.types import SamplingContext

 from dstack._internal.cli.utils.common import console
 from dstack._internal.core.errors import ForbiddenError, ServerClientError

@@ -21,6 +22,7 @@ from dstack._internal.proxy.lib.deps import get_injector_from_app
 from dstack._internal.proxy.lib.routers import model_proxy
 from dstack._internal.server import settings
 from dstack._internal.server.background import start_background_tasks
+from dstack._internal.server.background.tasks.process_probes import PROBES_SCHEDULER
 from dstack._internal.server.db import get_db, get_session_ctx, migrate
 from dstack._internal.server.routers import (
     backends,

@@ -81,16 +83,6 @@ REQUEST_DURATION = Histogram(


 def create_app() -> FastAPI:
-    if settings.SENTRY_DSN is not None:
-        sentry_sdk.init(
-            dsn=settings.SENTRY_DSN,
-            release=DSTACK_VERSION,
-            environment=settings.SERVER_ENVIRONMENT,
-            enable_tracing=True,
-            traces_sample_rate=settings.SENTRY_TRACES_SAMPLE_RATE,
-            profiles_sample_rate=settings.SENTRY_PROFILES_SAMPLE_RATE,
-        )
-
     app = FastAPI(
         docs_url="/api/docs",
         lifespan=lifespan,

@@ -102,6 +94,15 @@ def create_app() -> FastAPI:
 @asynccontextmanager
 async def lifespan(app: FastAPI):
     configure_logging()
+    if settings.SENTRY_DSN is not None:
+        sentry_sdk.init(
+            dsn=settings.SENTRY_DSN,
+            release=DSTACK_VERSION,
+            environment=settings.SERVER_ENVIRONMENT,
+            enable_tracing=True,
+            traces_sampler=_sentry_traces_sampler,
+            profiles_sample_rate=settings.SENTRY_PROFILES_SAMPLE_RATE,
+        )
     server_executor = ThreadPoolExecutor(max_workers=settings.SERVER_EXECUTOR_MAX_WORKERS)
     asyncio.get_running_loop().set_default_executor(server_executor)
     await migrate()

@@ -155,6 +156,7 @@ async def lifespan(app: FastAPI):
         scheduler = start_background_tasks()
     else:
         logger.info("Background processing is disabled")
+    PROBES_SCHEDULER.start()
     dstack_version = DSTACK_VERSION if DSTACK_VERSION else "(no version)"
     logger.info(f"The admin token is {admin.token.get_plaintext_or_error()}", {"show_path": False})
     logger.info(

@@ -166,6 +168,7 @@ async def lifespan(app: FastAPI):
     yield
     if settings.SERVER_BACKGROUND_PROCESSING_ENABLED:
         scheduler.shutdown()
+    PROBES_SCHEDULER.shutdown(wait=False)
     await gateway_connections_pool.remove_all()
     service_conn_pool = await get_injector_from_app(app).get_service_connection_pool()
     await service_conn_pool.remove_all()

@@ -197,6 +200,7 @@ def register_routes(app: FastAPI, ui: bool = True):
     app.include_router(fleets.root_router)
     app.include_router(fleets.project_router)
     app.include_router(instances.root_router)
+    app.include_router(instances.project_router)
     app.include_router(repos.router)
     app.include_router(runs.root_router)
     app.include_router(runs.project_router)

@@ -379,3 +383,15 @@ def _print_dstack_logo():
     ╰━━┻━━┻╯╱╰╯╰━━┻╯
     [/]"""
     )
+
+
+def _sentry_traces_sampler(sampling_context: SamplingContext) -> float:
+    parent_sampling_decision = sampling_context["parent_sampled"]
+    if parent_sampling_decision is not None:
+        return float(parent_sampling_decision)
+    transaction_context = sampling_context["transaction_context"]
+    name = transaction_context.get("name")
+    if name is not None:
+        if name.startswith("background."):
+            return settings.SENTRY_TRACES_BACKGROUND_SAMPLE_RATE
+    return settings.SENTRY_TRACES_SAMPLE_RATE
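
Note: the effect of the new sampler, in a hedged sketch. Rates are illustrative, and the "background." transaction-name prefix is presumably produced by the new instrument_background_task decorator in sentry_utils.py (not shown in this diff):

    # With SENTRY_TRACES_SAMPLE_RATE=0.5 and SENTRY_TRACES_BACKGROUND_SAMPLE_RATE=0.1:
    _sentry_traces_sampler({"parent_sampled": True, "transaction_context": {}})
    # -> 1.0 (inherit the parent's sampling decision)
    _sentry_traces_sampler(
        {"parent_sampled": None, "transaction_context": {"name": "background.process_runs"}}
    )  # -> 0.1 (background tasks use the lower background rate; name is hypothetical)
    _sentry_traces_sampler(
        {"parent_sampled": None, "transaction_context": {"name": "GET /api/runs"}}
    )  # -> 0.5 (everything else uses the default rate)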
dstack/_internal/server/background/__init__.py
CHANGED

@@ -9,6 +9,7 @@ from dstack._internal.server.background.tasks.process_gateways import (
 )
 from dstack._internal.server.background.tasks.process_idle_volumes import process_idle_volumes
 from dstack._internal.server.background.tasks.process_instances import (
+    delete_instance_health_checks,
     process_instances,
 )
 from dstack._internal.server.background.tasks.process_metrics import (

@@ -18,6 +19,7 @@ from dstack._internal.server.background.tasks.process_metrics import (
 from dstack._internal.server.background.tasks.process_placement_groups import (
     process_placement_groups,
 )
+from dstack._internal.server.background.tasks.process_probes import process_probes
 from dstack._internal.server.background.tasks.process_prometheus_metrics import (
     collect_prometheus_metrics,
     delete_prometheus_metrics,

@@ -63,6 +65,7 @@ def start_background_tasks() -> AsyncIOScheduler:
     # that the first waiting for the lock will acquire it.
     # The jitter is needed to give all tasks a chance to acquire locks.

+    _scheduler.add_job(process_probes, IntervalTrigger(seconds=3, jitter=1))
     _scheduler.add_job(collect_metrics, IntervalTrigger(seconds=10), max_instances=1)
     _scheduler.add_job(delete_metrics, IntervalTrigger(minutes=5), max_instances=1)
     if settings.ENABLE_PROMETHEUS_METRICS:

@@ -79,6 +82,12 @@ def start_background_tasks() -> AsyncIOScheduler:
         process_idle_volumes, IntervalTrigger(seconds=60, jitter=10), max_instances=1
     )
     _scheduler.add_job(process_placement_groups, IntervalTrigger(seconds=30, jitter=5))
+    _scheduler.add_job(
+        process_fleets,
+        IntervalTrigger(seconds=10, jitter=2),
+        max_instances=1,
+    )
+    _scheduler.add_job(delete_instance_health_checks, IntervalTrigger(minutes=5), max_instances=1)
     for replica in range(settings.SERVER_BACKGROUND_PROCESSING_FACTOR):
         # Add multiple copies of tasks if requested.
         # max_instances=1 for additional copies to avoid running too many tasks.

@@ -113,11 +122,5 @@ def start_background_tasks() -> AsyncIOScheduler:
             kwargs={"batch_size": 5},
            max_instances=2 if replica == 0 else 1,
        )
-        _scheduler.add_job(
-            process_fleets,
-            IntervalTrigger(seconds=10, jitter=2),
-            kwargs={"batch_size": 5},
-            max_instances=2 if replica == 0 else 1,
-        )
     _scheduler.start()
     return _scheduler
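
Note: the net effect is that process_fleets is no longer scheduled once per background-processing replica with a batch_size kwarg; it now runs as a single interval job and batches internally (see the rewritten process_fleets.py below). A hedged APScheduler sketch of the resulting pattern:

    # Illustrative: one jittered interval job instead of per-replica copies.
    from apscheduler.schedulers.asyncio import AsyncIOScheduler
    from apscheduler.triggers.interval import IntervalTrigger

    async def process_fleets() -> None:
        ...  # claims and processes a batch of fleets on each tick

    scheduler = AsyncIOScheduler()
    # max_instances=1 prevents overlapping runs if one tick takes longer than 10s.
    scheduler.add_job(process_fleets, IntervalTrigger(seconds=10, jitter=2), max_instances=1)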
dstack/_internal/server/background/tasks/process_fleets.py
CHANGED

@@ -1,36 +1,37 @@
-import asyncio
 from datetime import timedelta
+from typing import List

-from sqlalchemy import select
+from sqlalchemy import select, update
 from sqlalchemy.ext.asyncio import AsyncSession
-from sqlalchemy.orm import joinedload
+from sqlalchemy.orm import joinedload, load_only

 from dstack._internal.core.models.fleets import FleetStatus
 from dstack._internal.server.db import get_db, get_session_ctx
-from dstack._internal.server.models import FleetModel
+from dstack._internal.server.models import (
+    FleetModel,
+    InstanceModel,
+    JobModel,
+    PlacementGroupModel,
+    RunModel,
+)
 from dstack._internal.server.services.fleets import (
     is_fleet_empty,
     is_fleet_in_use,
 )
 from dstack._internal.server.services.locking import get_locker
-from dstack._internal.server.services.placement import schedule_fleet_placement_groups_deletion
+from dstack._internal.server.utils import sentry_utils
 from dstack._internal.utils.common import get_current_datetime
 from dstack._internal.utils.logging import get_logger

 logger = get_logger(__name__)


+BATCH_SIZE = 10
 MIN_PROCESSING_INTERVAL = timedelta(seconds=30)


-async def process_fleets(batch_size: int):
-    tasks = []
-    for _ in range(batch_size):
-        tasks.append(_process_next_fleet())
-    await asyncio.gather(*tasks)
-
-
-async def _process_next_fleet():
+@sentry_utils.instrument_background_task
+async def process_fleets():
     lock, lockset = get_locker(get_db().dialect_name).get_lockset(FleetModel.__tablename__)
     async with get_session_ctx() as session:
         async with lock:

@@ -40,51 +41,64 @@ async def _process_next_fleet():
                     FleetModel.deleted == False,
                     FleetModel.id.not_in(lockset),
                     FleetModel.last_processed_at
-                    < get_current_datetime() - MIN_PROCESSING_INTERVAL
+                    < get_current_datetime() - MIN_PROCESSING_INTERVAL,
                 )
+                .options(load_only(FleetModel.id))
                 .order_by(FleetModel.last_processed_at.asc())
-                .limit(1)
+                .limit(BATCH_SIZE)
                 .with_for_update(skip_locked=True, key_share=True)
             )
-            fleet_model = res.scalar()
-            if fleet_model is None:
-                return
-            lockset.add(fleet_model.id)
+            fleet_models = list(res.scalars().all())
+            fleet_ids = [fm.id for fm in fleet_models]
+            for fleet_id in fleet_ids:
+                lockset.add(fleet_id)
         try:
-            fleet_model_id = fleet_model.id
-            await _process_fleet(session=session, fleet_model=fleet_model)
+            await _process_fleets(session=session, fleet_models=fleet_models)
         finally:
-            lockset.difference_update([fleet_model_id])
+            lockset.difference_update(fleet_ids)


-async def _process_fleet(session: AsyncSession, fleet_model: FleetModel):
+async def _process_fleets(session: AsyncSession, fleet_models: List[FleetModel]):
+    fleet_ids = [fm.id for fm in fleet_models]
     # Refetch to load related attributes.
-    # joinedload produces LEFT OUTER JOIN that can't be used with FOR UPDATE.
     res = await session.execute(
         select(FleetModel)
-        .where(FleetModel.id == fleet_model.id)
-        .options(joinedload(FleetModel.instances))
-        .options(joinedload(FleetModel.runs))
+        .where(FleetModel.id.in_(fleet_ids))
+        .options(joinedload(FleetModel.instances).load_only(InstanceModel.deleted))
+        .options(
+            joinedload(FleetModel.instances).joinedload(InstanceModel.jobs).load_only(JobModel.id)
+        )
+        .options(joinedload(FleetModel.runs).load_only(RunModel.status))
         .execution_options(populate_existing=True)
     )
-    fleet_model = res.unique().scalar_one()
+    fleet_models = list(res.unique().scalars().all())
+
+    deleted_fleets_ids = []
+    now = get_current_datetime()
+    for fleet_model in fleet_models:
+        deleted = _autodelete_fleet(fleet_model)
+        if deleted:
+            deleted_fleets_ids.append(fleet_model.id)
+        fleet_model.last_processed_at = now
+
+    await session.execute(
+        update(PlacementGroupModel)
+        .where(
+            PlacementGroupModel.fleet_id.in_(deleted_fleets_ids),
+        )
+        .values(fleet_deleted=True)
+    )
+    await session.commit()


+def _autodelete_fleet(fleet_model: FleetModel) -> bool:
     # Currently all empty fleets are autodeleted.
     # TODO: If fleets with `nodes: 0..` are supported, their deletion should be skipped.
     if is_fleet_in_use(fleet_model) or not is_fleet_empty(fleet_model):
-        fleet_model.last_processed_at = get_current_datetime()
-        await session.commit()
-        return
+        return False

     logger.info("Automatic cleanup of an empty fleet %s", fleet_model.name)
     fleet_model.status = FleetStatus.TERMINATED
     fleet_model.deleted = True
-    fleet_model.last_processed_at = get_current_datetime()
-    await schedule_fleet_placement_groups_deletion(session=session, fleet_id=fleet_model.id)
-    await session.commit()
     logger.info("Fleet %s deleted", fleet_model.name)
+    return True
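
Note: the core of this rewrite is the batch-claim pattern — select up to BATCH_SIZE rows with FOR UPDATE ... SKIP LOCKED so concurrent workers never contend for the same fleets. A minimal, self-contained sketch of the same pattern, using a hypothetical Item model (PostgreSQL locking semantics assumed):

    from datetime import datetime

    from sqlalchemy import Boolean, DateTime, select
    from sqlalchemy.ext.asyncio import AsyncSession
    from sqlalchemy.orm import DeclarativeBase, Mapped, load_only, mapped_column

    class Base(DeclarativeBase):
        pass

    class Item(Base):  # hypothetical stand-in for FleetModel
        __tablename__ = "items"
        id: Mapped[int] = mapped_column(primary_key=True)
        deleted: Mapped[bool] = mapped_column(Boolean, default=False)
        last_processed_at: Mapped[datetime] = mapped_column(DateTime)

    async def claim_batch(session: AsyncSession, batch_size: int = 10) -> list[Item]:
        # Rows locked by another transaction are skipped rather than waited on,
        # so two workers running this concurrently claim disjoint batches.
        res = await session.execute(
            select(Item)
            .where(Item.deleted == False)  # noqa: E712 (SQLAlchemy column comparison)
            .options(load_only(Item.id))   # fetch only what's needed for claiming
            .order_by(Item.last_processed_at.asc())
            .limit(batch_size)
            .with_for_update(skip_locked=True, key_share=True)
        )
        return list(res.scalars().all())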
dstack/_internal/server/background/tasks/process_gateways.py
CHANGED

@@ -17,6 +17,7 @@ from dstack._internal.server.services.gateways import (
 )
 from dstack._internal.server.services.locking import advisory_lock_ctx, get_locker
 from dstack._internal.server.services.logging import fmt
+from dstack._internal.server.utils import sentry_utils
 from dstack._internal.utils.common import get_current_datetime
 from dstack._internal.utils.logging import get_logger

@@ -28,6 +29,7 @@ async def process_gateways_connections():
     await _process_active_connections()


+@sentry_utils.instrument_background_task
 async def process_gateways():
     lock, lockset = get_locker(get_db().dialect_name).get_lockset(GatewayModel.__tablename__)
     async with get_session_ctx() as session:

@@ -110,7 +112,6 @@ async def _process_connection(conn: GatewayConnection):
 async def _process_submitted_gateway(session: AsyncSession, gateway_model: GatewayModel):
     logger.info("%s: started gateway provisioning", fmt(gateway_model))
     # Refetch to load related attributes.
-    # joinedload produces LEFT OUTER JOIN that can't be used with FOR UPDATE.
     res = await session.execute(
         select(GatewayModel)
         .where(GatewayModel.id == gateway_model.id)

@@ -157,7 +158,6 @@ async def _process_provisioning_gateway(
     session: AsyncSession, gateway_model: GatewayModel
 ) -> None:
     # Refetch to load related attributes.
-    # joinedload produces LEFT OUTER JOIN that can't be used with FOR UPDATE.
     res = await session.execute(
         select(GatewayModel)
         .where(GatewayModel.id == gateway_model.id)
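
Note: several task entry points gain a @sentry_utils.instrument_background_task decorator. Its body (sentry_utils.py, +12 lines) is not shown in this diff; given the "background." name prefix checked by _sentry_traces_sampler in app.py, a plausible but hypothetical reconstruction is:

    # Hypothetical reconstruction -- the real sentry_utils.py is not shown here.
    import functools

    import sentry_sdk

    def instrument_background_task(func):
        @functools.wraps(func)
        async def wrapper(*args, **kwargs):
            # Name the transaction "background.<task>" so the traces sampler
            # can apply SENTRY_TRACES_BACKGROUND_SAMPLE_RATE to it.
            with sentry_sdk.start_transaction(op="task", name=f"background.{func.__name__}"):
                return await func(*args, **kwargs)

        return wrapper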
dstack/_internal/server/background/tasks/process_idle_volumes.py
CHANGED

@@ -10,13 +10,14 @@ from dstack._internal.core.errors import BackendNotAvailable
 from dstack._internal.core.models.profiles import parse_duration
 from dstack._internal.core.models.volumes import VolumeStatus
 from dstack._internal.server.db import get_db, get_session_ctx
-from dstack._internal.server.models import ProjectModel, VolumeModel
+from dstack._internal.server.models import ProjectModel, UserModel, VolumeModel
 from dstack._internal.server.services import backends as backends_services
 from dstack._internal.server.services.locking import get_locker
 from dstack._internal.server.services.volumes import (
     get_volume_configuration,
     volume_model_to_volume,
 )
+from dstack._internal.server.utils import sentry_utils
 from dstack._internal.utils import common
 from dstack._internal.utils.common import get_current_datetime
 from dstack._internal.utils.logging import get_logger

@@ -24,6 +25,7 @@ from dstack._internal.utils.logging import get_logger
 logger = get_logger(__name__)


+@sentry_utils.instrument_background_task
 async def process_idle_volumes():
     lock, lockset = get_locker(get_db().dialect_name).get_lockset(VolumeModel.__tablename__)
     async with get_session_ctx() as session:

@@ -49,7 +51,7 @@ async def process_idle_volumes():
         select(VolumeModel)
         .where(VolumeModel.id.in_(volume_ids))
         .options(joinedload(VolumeModel.project).joinedload(ProjectModel.backends))
-        .options(joinedload(VolumeModel.user))
+        .options(joinedload(VolumeModel.user).load_only(UserModel.name))
         .options(joinedload(VolumeModel.attachments))
         .execution_options(populate_existing=True)
     )

@@ -82,8 +84,7 @@ def _should_delete_volume(volume: VolumeModel) -> bool:

 def _get_idle_time(volume: VolumeModel) -> datetime.timedelta:
     last_used = volume.last_job_processed_at or volume.created_at
-    last_used_utc = last_used.replace(tzinfo=datetime.timezone.utc)
-    idle_time = get_current_datetime() - last_used_utc
+    idle_time = get_current_datetime() - last_used
     return max(idle_time, datetime.timedelta(0))
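
Note: the _get_idle_time change drops the intermediate UTC conversion, presumably because stored timestamps are now consistently timezone-aware. The old conversion existed because Python refuses to mix naive and aware datetimes, as this short sketch shows:

    from datetime import datetime, timezone

    aware = datetime.now(timezone.utc)
    naive = datetime(2025, 1, 1)  # no tzinfo
    try:
        _ = aware - naive
    except TypeError as e:
        print(e)  # can't subtract offset-naive and offset-aware datetimes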