dstack 0.19.18__py3-none-any.whl → 0.19.20__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dstack might be problematic. Click here for more details.
- dstack/_internal/cli/services/configurators/fleet.py +99 -1
- dstack/_internal/cli/services/profile.py +1 -1
- dstack/_internal/core/backends/cloudrift/api_client.py +13 -1
- dstack/_internal/core/backends/oci/resources.py +5 -5
- dstack/_internal/core/compatibility/runs.py +12 -1
- dstack/_internal/core/compatibility/volumes.py +2 -0
- dstack/_internal/core/models/common.py +38 -2
- dstack/_internal/core/models/configurations.py +9 -1
- dstack/_internal/core/models/fleets.py +2 -1
- dstack/_internal/core/models/profiles.py +8 -5
- dstack/_internal/core/models/resources.py +15 -8
- dstack/_internal/core/models/runs.py +41 -138
- dstack/_internal/core/models/volumes.py +14 -0
- dstack/_internal/core/services/diff.py +30 -10
- dstack/_internal/core/services/ssh/attach.py +2 -0
- dstack/_internal/server/app.py +17 -9
- dstack/_internal/server/background/__init__.py +5 -3
- dstack/_internal/server/background/tasks/process_gateways.py +46 -28
- dstack/_internal/server/background/tasks/process_idle_volumes.py +139 -0
- dstack/_internal/server/background/tasks/process_submitted_jobs.py +2 -0
- dstack/_internal/server/migrations/versions/35e90e1b0d3e_add_rolling_deployment_fields.py +6 -6
- dstack/_internal/server/migrations/versions/d5863798bf41_add_volumemodel_last_job_processed_at.py +40 -0
- dstack/_internal/server/models.py +1 -0
- dstack/_internal/server/routers/backends.py +23 -16
- dstack/_internal/server/routers/files.py +7 -6
- dstack/_internal/server/routers/fleets.py +47 -36
- dstack/_internal/server/routers/gateways.py +27 -18
- dstack/_internal/server/routers/instances.py +18 -13
- dstack/_internal/server/routers/logs.py +7 -3
- dstack/_internal/server/routers/metrics.py +14 -8
- dstack/_internal/server/routers/projects.py +33 -22
- dstack/_internal/server/routers/repos.py +7 -6
- dstack/_internal/server/routers/runs.py +49 -28
- dstack/_internal/server/routers/secrets.py +20 -15
- dstack/_internal/server/routers/server.py +7 -4
- dstack/_internal/server/routers/users.py +22 -19
- dstack/_internal/server/routers/volumes.py +34 -25
- dstack/_internal/server/schemas/logs.py +3 -11
- dstack/_internal/server/schemas/runs.py +17 -5
- dstack/_internal/server/services/fleets.py +354 -72
- dstack/_internal/server/services/gateways/__init__.py +13 -4
- dstack/_internal/server/services/gateways/client.py +5 -3
- dstack/_internal/server/services/instances.py +8 -0
- dstack/_internal/server/services/jobs/__init__.py +45 -0
- dstack/_internal/server/services/jobs/configurators/base.py +7 -0
- dstack/_internal/server/services/locking.py +3 -1
- dstack/_internal/server/services/logging.py +4 -2
- dstack/_internal/server/services/logs/__init__.py +15 -2
- dstack/_internal/server/services/logs/aws.py +47 -7
- dstack/_internal/server/services/logs/filelog.py +148 -32
- dstack/_internal/server/services/logs/gcp.py +3 -5
- dstack/_internal/server/services/prometheus/custom_metrics.py +20 -0
- dstack/_internal/server/services/proxy/repo.py +4 -1
- dstack/_internal/server/services/runs.py +115 -32
- dstack/_internal/server/services/services/__init__.py +2 -1
- dstack/_internal/server/services/users.py +3 -1
- dstack/_internal/server/services/volumes.py +13 -0
- dstack/_internal/server/settings.py +7 -2
- dstack/_internal/server/statics/index.html +1 -1
- dstack/_internal/server/statics/{main-d1ac2e8c38ed5f08a114.js → main-39a767528976f8078166.js} +11 -30
- dstack/_internal/server/statics/{main-d1ac2e8c38ed5f08a114.js.map → main-39a767528976f8078166.js.map} +1 -1
- dstack/_internal/server/statics/{main-d58fc0460cb0eae7cb5c.css → main-8f9ee218d3eb45989682.css} +2 -2
- dstack/_internal/server/testing/common.py +41 -5
- dstack/_internal/server/utils/routers.py +31 -8
- dstack/_internal/utils/common.py +10 -21
- dstack/_internal/utils/json_utils.py +54 -0
- dstack/api/_public/runs.py +13 -2
- dstack/api/server/_runs.py +12 -2
- dstack/version.py +1 -1
- {dstack-0.19.18.dist-info → dstack-0.19.20.dist-info}/METADATA +7 -5
- {dstack-0.19.18.dist-info → dstack-0.19.20.dist-info}/RECORD +74 -71
- {dstack-0.19.18.dist-info → dstack-0.19.20.dist-info}/WHEEL +0 -0
- {dstack-0.19.18.dist-info → dstack-0.19.20.dist-info}/entry_points.txt +0 -0
- {dstack-0.19.18.dist-info → dstack-0.19.20.dist-info}/licenses/LICENSE.md +0 -0
|
@@ -9,6 +9,7 @@ from typing_extensions import Annotated, Self
|
|
|
9
9
|
|
|
10
10
|
from dstack._internal.core.models.backends.base import BackendType
|
|
11
11
|
from dstack._internal.core.models.common import CoreModel
|
|
12
|
+
from dstack._internal.core.models.profiles import parse_idle_duration
|
|
12
13
|
from dstack._internal.core.models.resources import Memory
|
|
13
14
|
from dstack._internal.utils.common import get_or_error
|
|
14
15
|
from dstack._internal.utils.tags import tags_validator
|
|
@@ -44,6 +45,16 @@ class VolumeConfiguration(CoreModel):
|
|
|
44
45
|
Optional[str],
|
|
45
46
|
Field(description="The volume ID. Must be specified when registering external volumes"),
|
|
46
47
|
] = None
|
|
48
|
+
auto_cleanup_duration: Annotated[
|
|
49
|
+
Optional[Union[str, int]],
|
|
50
|
+
Field(
|
|
51
|
+
description=(
|
|
52
|
+
"Time to wait after volume is no longer used by any job before deleting it. "
|
|
53
|
+
"Defaults to keep the volume indefinitely. "
|
|
54
|
+
"Use the value 'off' or -1 to disable auto-cleanup."
|
|
55
|
+
)
|
|
56
|
+
),
|
|
57
|
+
] = None
|
|
47
58
|
tags: Annotated[
|
|
48
59
|
Optional[Dict[str, str]],
|
|
49
60
|
Field(
|
|
@@ -56,6 +67,9 @@ class VolumeConfiguration(CoreModel):
|
|
|
56
67
|
] = None
|
|
57
68
|
|
|
58
69
|
_validate_tags = validator("tags", pre=True, allow_reuse=True)(tags_validator)
|
|
70
|
+
_validate_auto_cleanup_duration = validator(
|
|
71
|
+
"auto_cleanup_duration", pre=True, allow_reuse=True
|
|
72
|
+
)(parse_idle_duration)
|
|
59
73
|
|
|
60
74
|
@property
|
|
61
75
|
def size_gb(self) -> int:
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
from typing import Any, Optional, TypedDict
|
|
1
|
+
from typing import Any, Optional, TypedDict, TypeVar
|
|
2
2
|
|
|
3
3
|
from pydantic import BaseModel
|
|
4
4
|
|
|
@@ -15,20 +15,19 @@ ModelDiff = dict[str, ModelFieldDiff]
|
|
|
15
15
|
|
|
16
16
|
# TODO: calculate nested diffs
|
|
17
17
|
def diff_models(
|
|
18
|
-
old: BaseModel, new: BaseModel,
|
|
18
|
+
old: BaseModel, new: BaseModel, reset: Optional[IncludeExcludeType] = None
|
|
19
19
|
) -> ModelDiff:
|
|
20
20
|
"""
|
|
21
21
|
Returns a diff of model instances fields.
|
|
22
22
|
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
2) the default value must be equal to itself (e.g. `math.nan` != `math.nan`).
|
|
23
|
+
The fields specified in the `reset` option are reset to their default values, effectively
|
|
24
|
+
excluding them from comparison (assuming that the default value is equal to itself, e.g,
|
|
25
|
+
`None == None`, `"task" == "task"`, but `math.nan != math.nan`).
|
|
27
26
|
|
|
28
27
|
Args:
|
|
29
28
|
old: The "old" model instance.
|
|
30
29
|
new: The "new" model instance.
|
|
31
|
-
|
|
30
|
+
reset: Fields to reset to their default values before comparison.
|
|
32
31
|
|
|
33
32
|
Returns:
|
|
34
33
|
A dict of changed fields in the form of
|
|
@@ -37,9 +36,9 @@ def diff_models(
|
|
|
37
36
|
if type(old) is not type(new):
|
|
38
37
|
raise TypeError("Both instances must be of the same Pydantic model class.")
|
|
39
38
|
|
|
40
|
-
if
|
|
41
|
-
old =
|
|
42
|
-
new =
|
|
39
|
+
if reset is not None:
|
|
40
|
+
old = copy_model(old, reset=reset)
|
|
41
|
+
new = copy_model(new, reset=reset)
|
|
43
42
|
|
|
44
43
|
changes: ModelDiff = {}
|
|
45
44
|
for field in old.__fields__:
|
|
@@ -49,3 +48,24 @@ def diff_models(
|
|
|
49
48
|
changes[field] = {"old": old_value, "new": new_value}
|
|
50
49
|
|
|
51
50
|
return changes
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
M = TypeVar("M", bound=BaseModel)
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def copy_model(model: M, reset: Optional[IncludeExcludeType] = None) -> M:
|
|
57
|
+
"""
|
|
58
|
+
Returns a deep copy of the model instance.
|
|
59
|
+
|
|
60
|
+
Implemented as `BaseModel.parse_obj(BaseModel.dict())`, thus,
|
|
61
|
+
unlike `BaseModel.copy(deep=True)`, runs all validations.
|
|
62
|
+
|
|
63
|
+
The fields specified in the `reset` option are reset to their default values.
|
|
64
|
+
|
|
65
|
+
Args:
|
|
66
|
+
reset: Fields to reset to their default values.
|
|
67
|
+
|
|
68
|
+
Returns:
|
|
69
|
+
A deep copy of the model instance.
|
|
70
|
+
"""
|
|
71
|
+
return type(model).parse_obj(model.dict(exclude=reset))
|
|
@@ -64,6 +64,7 @@ class SSHAttach:
|
|
|
64
64
|
run_name: str,
|
|
65
65
|
dockerized: bool,
|
|
66
66
|
ssh_proxy: Optional[SSHConnectionParams] = None,
|
|
67
|
+
service_port: Optional[int] = None,
|
|
67
68
|
local_backend: bool = False,
|
|
68
69
|
bind_address: Optional[str] = None,
|
|
69
70
|
):
|
|
@@ -90,6 +91,7 @@ class SSHAttach:
|
|
|
90
91
|
},
|
|
91
92
|
)
|
|
92
93
|
self.ssh_proxy = ssh_proxy
|
|
94
|
+
self.service_port = service_port
|
|
93
95
|
|
|
94
96
|
hosts: dict[str, dict[str, Union[str, int, FilePath]]] = {}
|
|
95
97
|
self.hosts = hosts
|
dstack/_internal/server/app.py
CHANGED
|
@@ -10,7 +10,7 @@ from typing import Awaitable, Callable, List
|
|
|
10
10
|
import sentry_sdk
|
|
11
11
|
from fastapi import FastAPI, Request, Response, status
|
|
12
12
|
from fastapi.datastructures import URL
|
|
13
|
-
from fastapi.responses import HTMLResponse,
|
|
13
|
+
from fastapi.responses import HTMLResponse, RedirectResponse
|
|
14
14
|
from fastapi.staticfiles import StaticFiles
|
|
15
15
|
from prometheus_client import Counter, Histogram
|
|
16
16
|
|
|
@@ -56,6 +56,7 @@ from dstack._internal.server.settings import (
|
|
|
56
56
|
)
|
|
57
57
|
from dstack._internal.server.utils.logging import configure_logging
|
|
58
58
|
from dstack._internal.server.utils.routers import (
|
|
59
|
+
CustomORJSONResponse,
|
|
59
60
|
check_client_server_compatibility,
|
|
60
61
|
error_detail,
|
|
61
62
|
get_server_client_error_details,
|
|
@@ -90,7 +91,10 @@ def create_app() -> FastAPI:
|
|
|
90
91
|
profiles_sample_rate=settings.SENTRY_PROFILES_SAMPLE_RATE,
|
|
91
92
|
)
|
|
92
93
|
|
|
93
|
-
app = FastAPI(
|
|
94
|
+
app = FastAPI(
|
|
95
|
+
docs_url="/api/docs",
|
|
96
|
+
lifespan=lifespan,
|
|
97
|
+
)
|
|
94
98
|
app.state.proxy_dependency_injector = ServerProxyDependencyInjector()
|
|
95
99
|
return app
|
|
96
100
|
|
|
@@ -147,7 +151,10 @@ async def lifespan(app: FastAPI):
|
|
|
147
151
|
)
|
|
148
152
|
if settings.SERVER_S3_BUCKET is not None or settings.SERVER_GCS_BUCKET is not None:
|
|
149
153
|
init_default_storage()
|
|
150
|
-
|
|
154
|
+
if settings.SERVER_BACKGROUND_PROCESSING_ENABLED:
|
|
155
|
+
scheduler = start_background_tasks()
|
|
156
|
+
else:
|
|
157
|
+
logger.info("Background processing is disabled")
|
|
151
158
|
dstack_version = DSTACK_VERSION if DSTACK_VERSION else "(no version)"
|
|
152
159
|
logger.info(f"The admin token is {admin.token.get_plaintext_or_error()}", {"show_path": False})
|
|
153
160
|
logger.info(
|
|
@@ -157,7 +164,8 @@ async def lifespan(app: FastAPI):
|
|
|
157
164
|
for func in _ON_STARTUP_HOOKS:
|
|
158
165
|
await func(app)
|
|
159
166
|
yield
|
|
160
|
-
|
|
167
|
+
if settings.SERVER_BACKGROUND_PROCESSING_ENABLED:
|
|
168
|
+
scheduler.shutdown()
|
|
161
169
|
await gateway_connections_pool.remove_all()
|
|
162
170
|
service_conn_pool = await get_injector_from_app(app).get_service_connection_pool()
|
|
163
171
|
await service_conn_pool.remove_all()
|
|
@@ -208,14 +216,14 @@ def register_routes(app: FastAPI, ui: bool = True):
|
|
|
208
216
|
msg = "Access denied"
|
|
209
217
|
if len(exc.args) > 0:
|
|
210
218
|
msg = exc.args[0]
|
|
211
|
-
return
|
|
219
|
+
return CustomORJSONResponse(
|
|
212
220
|
status_code=status.HTTP_403_FORBIDDEN,
|
|
213
221
|
content=error_detail(msg),
|
|
214
222
|
)
|
|
215
223
|
|
|
216
224
|
@app.exception_handler(ServerClientError)
|
|
217
225
|
async def server_client_error_handler(request: Request, exc: ServerClientError):
|
|
218
|
-
return
|
|
226
|
+
return CustomORJSONResponse(
|
|
219
227
|
status_code=status.HTTP_400_BAD_REQUEST,
|
|
220
228
|
content={"detail": get_server_client_error_details(exc)},
|
|
221
229
|
)
|
|
@@ -223,7 +231,7 @@ def register_routes(app: FastAPI, ui: bool = True):
|
|
|
223
231
|
@app.exception_handler(OSError)
|
|
224
232
|
async def os_error_handler(request, exc: OSError):
|
|
225
233
|
if exc.errno in [36, 63]:
|
|
226
|
-
return
|
|
234
|
+
return CustomORJSONResponse(
|
|
227
235
|
{"detail": "Filename too long"},
|
|
228
236
|
status_code=status.HTTP_400_BAD_REQUEST,
|
|
229
237
|
)
|
|
@@ -309,7 +317,7 @@ def register_routes(app: FastAPI, ui: bool = True):
|
|
|
309
317
|
|
|
310
318
|
@app.get("/healthcheck")
|
|
311
319
|
async def healthcheck():
|
|
312
|
-
return
|
|
320
|
+
return CustomORJSONResponse(content={"status": "running"})
|
|
313
321
|
|
|
314
322
|
if ui and Path(__file__).parent.joinpath("statics").exists():
|
|
315
323
|
app.mount(
|
|
@@ -323,7 +331,7 @@ def register_routes(app: FastAPI, ui: bool = True):
|
|
|
323
331
|
or _is_proxy_request(request)
|
|
324
332
|
or _is_prometheus_request(request)
|
|
325
333
|
):
|
|
326
|
-
return
|
|
334
|
+
return CustomORJSONResponse(
|
|
327
335
|
{"detail": exc.detail},
|
|
328
336
|
status_code=status.HTTP_404_NOT_FOUND,
|
|
329
337
|
)
|
|
@@ -4,9 +4,10 @@ from apscheduler.triggers.interval import IntervalTrigger
|
|
|
4
4
|
from dstack._internal.server import settings
|
|
5
5
|
from dstack._internal.server.background.tasks.process_fleets import process_fleets
|
|
6
6
|
from dstack._internal.server.background.tasks.process_gateways import (
|
|
7
|
+
process_gateways,
|
|
7
8
|
process_gateways_connections,
|
|
8
|
-
process_submitted_gateways,
|
|
9
9
|
)
|
|
10
|
+
from dstack._internal.server.background.tasks.process_idle_volumes import process_idle_volumes
|
|
10
11
|
from dstack._internal.server.background.tasks.process_instances import (
|
|
11
12
|
process_instances,
|
|
12
13
|
)
|
|
@@ -70,11 +71,12 @@ def start_background_tasks() -> AsyncIOScheduler:
|
|
|
70
71
|
)
|
|
71
72
|
_scheduler.add_job(delete_prometheus_metrics, IntervalTrigger(minutes=5), max_instances=1)
|
|
72
73
|
_scheduler.add_job(process_gateways_connections, IntervalTrigger(seconds=15))
|
|
74
|
+
_scheduler.add_job(process_gateways, IntervalTrigger(seconds=10, jitter=2), max_instances=5)
|
|
73
75
|
_scheduler.add_job(
|
|
74
|
-
|
|
76
|
+
process_submitted_volumes, IntervalTrigger(seconds=10, jitter=2), max_instances=5
|
|
75
77
|
)
|
|
76
78
|
_scheduler.add_job(
|
|
77
|
-
|
|
79
|
+
process_idle_volumes, IntervalTrigger(seconds=60, jitter=10), max_instances=1
|
|
78
80
|
)
|
|
79
81
|
_scheduler.add_job(process_placement_groups, IntervalTrigger(seconds=30, jitter=5))
|
|
80
82
|
for replica in range(settings.SERVER_BACKGROUND_PROCESSING_FACTOR):
|
|
@@ -16,6 +16,7 @@ from dstack._internal.server.services.gateways import (
|
|
|
16
16
|
gateway_connections_pool,
|
|
17
17
|
)
|
|
18
18
|
from dstack._internal.server.services.locking import advisory_lock_ctx, get_locker
|
|
19
|
+
from dstack._internal.server.services.logging import fmt
|
|
19
20
|
from dstack._internal.utils.common import get_current_datetime
|
|
20
21
|
from dstack._internal.utils.logging import get_logger
|
|
21
22
|
|
|
@@ -27,14 +28,14 @@ async def process_gateways_connections():
|
|
|
27
28
|
await _process_active_connections()
|
|
28
29
|
|
|
29
30
|
|
|
30
|
-
async def
|
|
31
|
+
async def process_gateways():
|
|
31
32
|
lock, lockset = get_locker(get_db().dialect_name).get_lockset(GatewayModel.__tablename__)
|
|
32
33
|
async with get_session_ctx() as session:
|
|
33
34
|
async with lock:
|
|
34
35
|
res = await session.execute(
|
|
35
36
|
select(GatewayModel)
|
|
36
37
|
.where(
|
|
37
|
-
GatewayModel.status
|
|
38
|
+
GatewayModel.status.in_([GatewayStatus.SUBMITTED, GatewayStatus.PROVISIONING]),
|
|
38
39
|
GatewayModel.id.not_in(lockset),
|
|
39
40
|
)
|
|
40
41
|
.options(lazyload(GatewayModel.gateway_compute))
|
|
@@ -48,7 +49,25 @@ async def process_submitted_gateways():
|
|
|
48
49
|
lockset.add(gateway_model.id)
|
|
49
50
|
try:
|
|
50
51
|
gateway_model_id = gateway_model.id
|
|
51
|
-
|
|
52
|
+
initial_status = gateway_model.status
|
|
53
|
+
if initial_status == GatewayStatus.SUBMITTED:
|
|
54
|
+
await _process_submitted_gateway(session=session, gateway_model=gateway_model)
|
|
55
|
+
elif initial_status == GatewayStatus.PROVISIONING:
|
|
56
|
+
await _process_provisioning_gateway(session=session, gateway_model=gateway_model)
|
|
57
|
+
else:
|
|
58
|
+
logger.error(
|
|
59
|
+
"%s: unexpected gateway status %r", fmt(gateway_model), initial_status.upper()
|
|
60
|
+
)
|
|
61
|
+
if gateway_model.status != initial_status:
|
|
62
|
+
logger.info(
|
|
63
|
+
"%s: gateway status has changed %s -> %s%s",
|
|
64
|
+
fmt(gateway_model),
|
|
65
|
+
initial_status.upper(),
|
|
66
|
+
gateway_model.status.upper(),
|
|
67
|
+
f": {gateway_model.status_message}" if gateway_model.status_message else "",
|
|
68
|
+
)
|
|
69
|
+
gateway_model.last_processed_at = get_current_datetime()
|
|
70
|
+
await session.commit()
|
|
52
71
|
finally:
|
|
53
72
|
lockset.difference_update([gateway_model_id])
|
|
54
73
|
|
|
@@ -89,7 +108,7 @@ async def _process_connection(conn: GatewayConnection):
|
|
|
89
108
|
|
|
90
109
|
|
|
91
110
|
async def _process_submitted_gateway(session: AsyncSession, gateway_model: GatewayModel):
|
|
92
|
-
logger.info("
|
|
111
|
+
logger.info("%s: started gateway provisioning", fmt(gateway_model))
|
|
93
112
|
# Refetch to load related attributes.
|
|
94
113
|
# joinedload produces LEFT OUTER JOIN that can't be used with FOR UPDATE.
|
|
95
114
|
res = await session.execute(
|
|
@@ -110,8 +129,6 @@ async def _process_submitted_gateway(session: AsyncSession, gateway_model: Gatew
|
|
|
110
129
|
except BackendNotAvailable:
|
|
111
130
|
gateway_model.status = GatewayStatus.FAILED
|
|
112
131
|
gateway_model.status_message = "Backend not available"
|
|
113
|
-
gateway_model.last_processed_at = get_current_datetime()
|
|
114
|
-
await session.commit()
|
|
115
132
|
return
|
|
116
133
|
|
|
117
134
|
try:
|
|
@@ -123,53 +140,54 @@ async def _process_submitted_gateway(session: AsyncSession, gateway_model: Gatew
|
|
|
123
140
|
)
|
|
124
141
|
session.add(gateway_model)
|
|
125
142
|
gateway_model.status = GatewayStatus.PROVISIONING
|
|
126
|
-
await session.commit()
|
|
127
|
-
await session.refresh(gateway_model)
|
|
128
143
|
except BackendError as e:
|
|
129
|
-
logger.info(
|
|
130
|
-
"Failed to create gateway compute for gateway %s: %s", gateway_model.name, repr(e)
|
|
131
|
-
)
|
|
144
|
+
logger.info("%s: failed to create gateway compute: %r", fmt(gateway_model), e)
|
|
132
145
|
gateway_model.status = GatewayStatus.FAILED
|
|
133
146
|
status_message = f"Backend error: {repr(e)}"
|
|
134
147
|
if len(e.args) > 0:
|
|
135
148
|
status_message = str(e.args[0])
|
|
136
149
|
gateway_model.status_message = status_message
|
|
137
|
-
gateway_model.last_processed_at = get_current_datetime()
|
|
138
|
-
await session.commit()
|
|
139
|
-
return
|
|
140
150
|
except Exception as e:
|
|
141
|
-
logger.exception(
|
|
142
|
-
"Got exception when creating gateway compute for gateway %s", gateway_model.name
|
|
143
|
-
)
|
|
151
|
+
logger.exception("%s: got exception when creating gateway compute", fmt(gateway_model))
|
|
144
152
|
gateway_model.status = GatewayStatus.FAILED
|
|
145
153
|
gateway_model.status_message = f"Unexpected error: {repr(e)}"
|
|
146
|
-
gateway_model.last_processed_at = get_current_datetime()
|
|
147
|
-
await session.commit()
|
|
148
|
-
return
|
|
149
154
|
|
|
155
|
+
|
|
156
|
+
async def _process_provisioning_gateway(
|
|
157
|
+
session: AsyncSession, gateway_model: GatewayModel
|
|
158
|
+
) -> None:
|
|
159
|
+
# Refetch to load related attributes.
|
|
160
|
+
# joinedload produces LEFT OUTER JOIN that can't be used with FOR UPDATE.
|
|
161
|
+
res = await session.execute(
|
|
162
|
+
select(GatewayModel)
|
|
163
|
+
.where(GatewayModel.id == gateway_model.id)
|
|
164
|
+
.execution_options(populate_existing=True)
|
|
165
|
+
)
|
|
166
|
+
gateway_model = res.unique().scalar_one()
|
|
167
|
+
|
|
168
|
+
# FIXME: problems caused by blocking on connect_to_gateway_with_retry and configure_gateway:
|
|
169
|
+
# - cannot delete the gateway before it is provisioned because the DB model is locked
|
|
170
|
+
# - connection retry counter is reset on server restart
|
|
171
|
+
# - only one server replica is processing the gateway
|
|
172
|
+
# Easy to fix by doing only one connection/configuration attempt per processing iteration. The
|
|
173
|
+
# main challenge is applying the same provisioning model to the dstack Sky gateway to avoid
|
|
174
|
+
# maintaining a different model for Sky.
|
|
150
175
|
connection = await gateways_services.connect_to_gateway_with_retry(
|
|
151
176
|
gateway_model.gateway_compute
|
|
152
177
|
)
|
|
153
178
|
if connection is None:
|
|
154
179
|
gateway_model.status = GatewayStatus.FAILED
|
|
155
180
|
gateway_model.status_message = "Failed to connect to gateway"
|
|
156
|
-
gateway_model.last_processed_at = get_current_datetime()
|
|
157
181
|
gateway_model.gateway_compute.deleted = True
|
|
158
|
-
await session.commit()
|
|
159
182
|
return
|
|
160
|
-
|
|
161
183
|
try:
|
|
162
184
|
await gateways_services.configure_gateway(connection)
|
|
163
185
|
except Exception:
|
|
164
|
-
logger.exception("
|
|
186
|
+
logger.exception("%s: failed to configure gateway", fmt(gateway_model))
|
|
165
187
|
gateway_model.status = GatewayStatus.FAILED
|
|
166
188
|
gateway_model.status_message = "Failed to configure gateway"
|
|
167
|
-
gateway_model.last_processed_at = get_current_datetime()
|
|
168
189
|
await gateway_connections_pool.remove(gateway_model.gateway_compute.ip_address)
|
|
169
190
|
gateway_model.gateway_compute.active = False
|
|
170
|
-
await session.commit()
|
|
171
191
|
return
|
|
172
192
|
|
|
173
193
|
gateway_model.status = GatewayStatus.RUNNING
|
|
174
|
-
gateway_model.last_processed_at = get_current_datetime()
|
|
175
|
-
await session.commit()
|
|
@@ -0,0 +1,139 @@
|
|
|
1
|
+
import datetime
|
|
2
|
+
from typing import List
|
|
3
|
+
|
|
4
|
+
from sqlalchemy import select
|
|
5
|
+
from sqlalchemy.ext.asyncio import AsyncSession
|
|
6
|
+
from sqlalchemy.orm import joinedload
|
|
7
|
+
|
|
8
|
+
from dstack._internal.core.backends.base.compute import ComputeWithVolumeSupport
|
|
9
|
+
from dstack._internal.core.errors import BackendNotAvailable
|
|
10
|
+
from dstack._internal.core.models.profiles import parse_duration
|
|
11
|
+
from dstack._internal.core.models.volumes import VolumeStatus
|
|
12
|
+
from dstack._internal.server.db import get_db, get_session_ctx
|
|
13
|
+
from dstack._internal.server.models import ProjectModel, VolumeModel
|
|
14
|
+
from dstack._internal.server.services import backends as backends_services
|
|
15
|
+
from dstack._internal.server.services.locking import get_locker
|
|
16
|
+
from dstack._internal.server.services.volumes import (
|
|
17
|
+
get_volume_configuration,
|
|
18
|
+
volume_model_to_volume,
|
|
19
|
+
)
|
|
20
|
+
from dstack._internal.utils import common
|
|
21
|
+
from dstack._internal.utils.common import get_current_datetime
|
|
22
|
+
from dstack._internal.utils.logging import get_logger
|
|
23
|
+
|
|
24
|
+
logger = get_logger(__name__)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
async def process_idle_volumes():
|
|
28
|
+
lock, lockset = get_locker(get_db().dialect_name).get_lockset(VolumeModel.__tablename__)
|
|
29
|
+
async with get_session_ctx() as session:
|
|
30
|
+
async with lock:
|
|
31
|
+
res = await session.execute(
|
|
32
|
+
select(VolumeModel.id)
|
|
33
|
+
.where(
|
|
34
|
+
VolumeModel.status == VolumeStatus.ACTIVE,
|
|
35
|
+
VolumeModel.deleted == False,
|
|
36
|
+
VolumeModel.id.not_in(lockset),
|
|
37
|
+
)
|
|
38
|
+
.order_by(VolumeModel.last_processed_at.asc())
|
|
39
|
+
.limit(10)
|
|
40
|
+
.with_for_update(skip_locked=True, key_share=True)
|
|
41
|
+
)
|
|
42
|
+
volume_ids = list(res.scalars().all())
|
|
43
|
+
if not volume_ids:
|
|
44
|
+
return
|
|
45
|
+
for volume_id in volume_ids:
|
|
46
|
+
lockset.add(volume_id)
|
|
47
|
+
|
|
48
|
+
res = await session.execute(
|
|
49
|
+
select(VolumeModel)
|
|
50
|
+
.where(VolumeModel.id.in_(volume_ids))
|
|
51
|
+
.options(joinedload(VolumeModel.project).joinedload(ProjectModel.backends))
|
|
52
|
+
.options(joinedload(VolumeModel.user))
|
|
53
|
+
.options(joinedload(VolumeModel.attachments))
|
|
54
|
+
.execution_options(populate_existing=True)
|
|
55
|
+
)
|
|
56
|
+
volume_models = list(res.unique().scalars().all())
|
|
57
|
+
try:
|
|
58
|
+
volumes_to_delete = [v for v in volume_models if _should_delete_volume(v)]
|
|
59
|
+
if not volumes_to_delete:
|
|
60
|
+
return
|
|
61
|
+
await _delete_idle_volumes(session, volumes_to_delete)
|
|
62
|
+
finally:
|
|
63
|
+
lockset.difference_update(volume_ids)
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def _should_delete_volume(volume: VolumeModel) -> bool:
|
|
67
|
+
if volume.attachments:
|
|
68
|
+
return False
|
|
69
|
+
|
|
70
|
+
config = get_volume_configuration(volume)
|
|
71
|
+
if not config.auto_cleanup_duration:
|
|
72
|
+
return False
|
|
73
|
+
|
|
74
|
+
duration_seconds = parse_duration(config.auto_cleanup_duration)
|
|
75
|
+
if not duration_seconds or duration_seconds <= 0:
|
|
76
|
+
return False
|
|
77
|
+
|
|
78
|
+
idle_time = _get_idle_time(volume)
|
|
79
|
+
threshold = datetime.timedelta(seconds=duration_seconds)
|
|
80
|
+
return idle_time > threshold
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def _get_idle_time(volume: VolumeModel) -> datetime.timedelta:
|
|
84
|
+
last_used = volume.last_job_processed_at or volume.created_at
|
|
85
|
+
last_used_utc = last_used.replace(tzinfo=datetime.timezone.utc)
|
|
86
|
+
idle_time = get_current_datetime() - last_used_utc
|
|
87
|
+
return max(idle_time, datetime.timedelta(0))
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
async def _delete_idle_volumes(session: AsyncSession, volumes: List[VolumeModel]):
|
|
91
|
+
# Note: Multiple volumes are deleted in the same transaction,
|
|
92
|
+
# so long deletion of one volume may block processing other volumes.
|
|
93
|
+
for volume_model in volumes:
|
|
94
|
+
logger.info("Deleting idle volume %s", volume_model.name)
|
|
95
|
+
try:
|
|
96
|
+
await _delete_idle_volume(session, volume_model)
|
|
97
|
+
except Exception:
|
|
98
|
+
logger.exception("Error when deleting idle volume %s", volume_model.name)
|
|
99
|
+
|
|
100
|
+
volume_model.deleted = True
|
|
101
|
+
volume_model.deleted_at = get_current_datetime()
|
|
102
|
+
|
|
103
|
+
logger.info("Deleted idle volume %s", volume_model.name)
|
|
104
|
+
|
|
105
|
+
await session.commit()
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
async def _delete_idle_volume(session: AsyncSession, volume_model: VolumeModel):
|
|
109
|
+
volume = volume_model_to_volume(volume_model)
|
|
110
|
+
|
|
111
|
+
if volume.provisioning_data is None:
|
|
112
|
+
logger.error(
|
|
113
|
+
f"Failed to delete volume {volume_model.name}. volume.provisioning_data is None."
|
|
114
|
+
)
|
|
115
|
+
return
|
|
116
|
+
|
|
117
|
+
if volume.provisioning_data.backend is None:
|
|
118
|
+
logger.error(
|
|
119
|
+
f"Failed to delete volume {volume_model.name}. volume.provisioning_data.backend is None."
|
|
120
|
+
)
|
|
121
|
+
return
|
|
122
|
+
|
|
123
|
+
try:
|
|
124
|
+
backend = await backends_services.get_project_backend_by_type_or_error(
|
|
125
|
+
project=volume_model.project,
|
|
126
|
+
backend_type=volume.provisioning_data.backend,
|
|
127
|
+
)
|
|
128
|
+
except BackendNotAvailable:
|
|
129
|
+
logger.error(
|
|
130
|
+
f"Failed to delete volume {volume_model.name}. Backend {volume.configuration.backend} not available."
|
|
131
|
+
)
|
|
132
|
+
return
|
|
133
|
+
|
|
134
|
+
compute = backend.compute()
|
|
135
|
+
assert isinstance(compute, ComputeWithVolumeSupport)
|
|
136
|
+
await common.run_async(
|
|
137
|
+
compute.delete_volume,
|
|
138
|
+
volume=volume,
|
|
139
|
+
)
|
|
@@ -17,12 +17,6 @@ depends_on = None
|
|
|
17
17
|
|
|
18
18
|
|
|
19
19
|
def upgrade() -> None:
|
|
20
|
-
with op.batch_alter_table("jobs", schema=None) as batch_op:
|
|
21
|
-
batch_op.add_column(sa.Column("deployment_num", sa.Integer(), nullable=True))
|
|
22
|
-
with op.batch_alter_table("jobs", schema=None) as batch_op:
|
|
23
|
-
batch_op.execute("UPDATE jobs SET deployment_num = 0")
|
|
24
|
-
batch_op.alter_column("deployment_num", nullable=False)
|
|
25
|
-
|
|
26
20
|
with op.batch_alter_table("runs", schema=None) as batch_op:
|
|
27
21
|
batch_op.add_column(sa.Column("deployment_num", sa.Integer(), nullable=True))
|
|
28
22
|
batch_op.add_column(sa.Column("desired_replica_count", sa.Integer(), nullable=True))
|
|
@@ -32,6 +26,12 @@ def upgrade() -> None:
|
|
|
32
26
|
batch_op.alter_column("deployment_num", nullable=False)
|
|
33
27
|
batch_op.alter_column("desired_replica_count", nullable=False)
|
|
34
28
|
|
|
29
|
+
with op.batch_alter_table("jobs", schema=None) as batch_op:
|
|
30
|
+
batch_op.add_column(sa.Column("deployment_num", sa.Integer(), nullable=True))
|
|
31
|
+
with op.batch_alter_table("jobs", schema=None) as batch_op:
|
|
32
|
+
batch_op.execute("UPDATE jobs SET deployment_num = 0")
|
|
33
|
+
batch_op.alter_column("deployment_num", nullable=False)
|
|
34
|
+
|
|
35
35
|
|
|
36
36
|
def downgrade() -> None:
|
|
37
37
|
with op.batch_alter_table("runs", schema=None) as batch_op:
|
dstack/_internal/server/migrations/versions/d5863798bf41_add_volumemodel_last_job_processed_at.py
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
"""Add VolumeModel.last_job_processed_at
|
|
2
|
+
|
|
3
|
+
Revision ID: d5863798bf41
|
|
4
|
+
Revises: 644b8a114187
|
|
5
|
+
Create Date: 2025-07-15 14:26:22.981687
|
|
6
|
+
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import sqlalchemy as sa
|
|
10
|
+
from alembic import op
|
|
11
|
+
|
|
12
|
+
import dstack._internal.server.models
|
|
13
|
+
|
|
14
|
+
# revision identifiers, used by Alembic.
|
|
15
|
+
revision = "d5863798bf41"
|
|
16
|
+
down_revision = "644b8a114187"
|
|
17
|
+
branch_labels = None
|
|
18
|
+
depends_on = None
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def upgrade() -> None:
|
|
22
|
+
# ### commands auto generated by Alembic - please adjust! ###
|
|
23
|
+
with op.batch_alter_table("volumes", schema=None) as batch_op:
|
|
24
|
+
batch_op.add_column(
|
|
25
|
+
sa.Column(
|
|
26
|
+
"last_job_processed_at",
|
|
27
|
+
dstack._internal.server.models.NaiveDateTime(),
|
|
28
|
+
nullable=True,
|
|
29
|
+
)
|
|
30
|
+
)
|
|
31
|
+
|
|
32
|
+
# ### end Alembic commands ###
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def downgrade() -> None:
|
|
36
|
+
# ### commands auto generated by Alembic - please adjust! ###
|
|
37
|
+
with op.batch_alter_table("volumes", schema=None) as batch_op:
|
|
38
|
+
batch_op.drop_column("last_job_processed_at")
|
|
39
|
+
|
|
40
|
+
# ### end Alembic commands ###
|
|
@@ -645,6 +645,7 @@ class VolumeModel(BaseModel):
|
|
|
645
645
|
last_processed_at: Mapped[datetime] = mapped_column(
|
|
646
646
|
NaiveDateTime, default=get_current_datetime
|
|
647
647
|
)
|
|
648
|
+
last_job_processed_at: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime)
|
|
648
649
|
deleted: Mapped[bool] = mapped_column(Boolean, default=False)
|
|
649
650
|
deleted_at: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime)
|
|
650
651
|
|