dstack 0.19.18__py3-none-any.whl → 0.19.19__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of dstack might be problematic.

Files changed (69)
  1. dstack/_internal/cli/services/configurators/fleet.py +99 -1
  2. dstack/_internal/cli/services/profile.py +1 -1
  3. dstack/_internal/core/compatibility/runs.py +12 -1
  4. dstack/_internal/core/compatibility/volumes.py +2 -0
  5. dstack/_internal/core/models/common.py +38 -2
  6. dstack/_internal/core/models/configurations.py +9 -1
  7. dstack/_internal/core/models/fleets.py +2 -1
  8. dstack/_internal/core/models/profiles.py +8 -5
  9. dstack/_internal/core/models/resources.py +15 -8
  10. dstack/_internal/core/models/runs.py +41 -138
  11. dstack/_internal/core/models/volumes.py +14 -0
  12. dstack/_internal/core/services/diff.py +30 -10
  13. dstack/_internal/core/services/ssh/attach.py +2 -0
  14. dstack/_internal/server/app.py +17 -9
  15. dstack/_internal/server/background/__init__.py +5 -3
  16. dstack/_internal/server/background/tasks/process_gateways.py +46 -28
  17. dstack/_internal/server/background/tasks/process_idle_volumes.py +139 -0
  18. dstack/_internal/server/background/tasks/process_submitted_jobs.py +2 -0
  19. dstack/_internal/server/migrations/versions/35e90e1b0d3e_add_rolling_deployment_fields.py +6 -6
  20. dstack/_internal/server/migrations/versions/d5863798bf41_add_volumemodel_last_job_processed_at.py +40 -0
  21. dstack/_internal/server/models.py +1 -0
  22. dstack/_internal/server/routers/backends.py +23 -16
  23. dstack/_internal/server/routers/files.py +7 -6
  24. dstack/_internal/server/routers/fleets.py +47 -36
  25. dstack/_internal/server/routers/gateways.py +27 -18
  26. dstack/_internal/server/routers/instances.py +18 -13
  27. dstack/_internal/server/routers/logs.py +7 -3
  28. dstack/_internal/server/routers/metrics.py +14 -8
  29. dstack/_internal/server/routers/projects.py +33 -22
  30. dstack/_internal/server/routers/repos.py +7 -6
  31. dstack/_internal/server/routers/runs.py +49 -28
  32. dstack/_internal/server/routers/secrets.py +20 -15
  33. dstack/_internal/server/routers/server.py +7 -4
  34. dstack/_internal/server/routers/users.py +22 -19
  35. dstack/_internal/server/routers/volumes.py +34 -25
  36. dstack/_internal/server/schemas/logs.py +2 -2
  37. dstack/_internal/server/schemas/runs.py +17 -5
  38. dstack/_internal/server/services/fleets.py +354 -72
  39. dstack/_internal/server/services/gateways/__init__.py +13 -4
  40. dstack/_internal/server/services/gateways/client.py +5 -3
  41. dstack/_internal/server/services/instances.py +8 -0
  42. dstack/_internal/server/services/jobs/__init__.py +45 -0
  43. dstack/_internal/server/services/jobs/configurators/base.py +7 -0
  44. dstack/_internal/server/services/locking.py +3 -1
  45. dstack/_internal/server/services/logging.py +4 -2
  46. dstack/_internal/server/services/logs/__init__.py +15 -2
  47. dstack/_internal/server/services/logs/aws.py +2 -4
  48. dstack/_internal/server/services/logs/filelog.py +33 -27
  49. dstack/_internal/server/services/logs/gcp.py +3 -5
  50. dstack/_internal/server/services/proxy/repo.py +4 -1
  51. dstack/_internal/server/services/runs.py +115 -32
  52. dstack/_internal/server/services/services/__init__.py +2 -1
  53. dstack/_internal/server/services/users.py +3 -1
  54. dstack/_internal/server/services/volumes.py +13 -0
  55. dstack/_internal/server/settings.py +7 -2
  56. dstack/_internal/server/statics/index.html +1 -1
  57. dstack/_internal/server/statics/{main-d1ac2e8c38ed5f08a114.js → main-64f8273740c4b52c18f5.js} +6 -6
  58. dstack/_internal/server/statics/{main-d1ac2e8c38ed5f08a114.js.map → main-64f8273740c4b52c18f5.js.map} +1 -1
  59. dstack/_internal/server/testing/common.py +41 -5
  60. dstack/_internal/server/utils/routers.py +31 -8
  61. dstack/_internal/utils/json_utils.py +54 -0
  62. dstack/api/_public/runs.py +13 -2
  63. dstack/api/server/_runs.py +12 -2
  64. dstack/version.py +1 -1
  65. {dstack-0.19.18.dist-info → dstack-0.19.19.dist-info}/METADATA +7 -5
  66. {dstack-0.19.18.dist-info → dstack-0.19.19.dist-info}/RECORD +69 -66
  67. {dstack-0.19.18.dist-info → dstack-0.19.19.dist-info}/WHEEL +0 -0
  68. {dstack-0.19.18.dist-info → dstack-0.19.19.dist-info}/entry_points.txt +0 -0
  69. {dstack-0.19.18.dist-info → dstack-0.19.19.dist-info}/licenses/LICENSE.md +0 -0

dstack/_internal/core/services/diff.py
@@ -1,4 +1,4 @@
-from typing import Any, Optional, TypedDict
+from typing import Any, Optional, TypedDict, TypeVar
 
 from pydantic import BaseModel
 
@@ -15,20 +15,19 @@ ModelDiff = dict[str, ModelFieldDiff]
 
 # TODO: calculate nested diffs
 def diff_models(
-    old: BaseModel, new: BaseModel, ignore: Optional[IncludeExcludeType] = None
+    old: BaseModel, new: BaseModel, reset: Optional[IncludeExcludeType] = None
 ) -> ModelDiff:
     """
     Returns a diff of model instances fields.
 
-    NOTE: `ignore` is implemented as `BaseModel.parse_obj(BaseModel.dict(exclude=ignore))`,
-    that is, the "ignored" fields are actually not ignored but reset to the default values
-    before comparison, meaning that 1) any field in `ignore` must have a default value,
-    2) the default value must be equal to itself (e.g. `math.nan` != `math.nan`).
+    The fields specified in the `reset` option are reset to their default values, effectively
+    excluding them from comparison (assuming that the default value is equal to itself, e.g,
+    `None == None`, `"task" == "task"`, but `math.nan != math.nan`).
 
     Args:
         old: The "old" model instance.
         new: The "new" model instance.
-        ignore: Optional fields to ignore.
+        reset: Fields to reset to their default values before comparison.
 
     Returns:
         A dict of changed fields in the form of
@@ -37,9 +36,9 @@ def diff_models(
     if type(old) is not type(new):
         raise TypeError("Both instances must be of the same Pydantic model class.")
 
-    if ignore is not None:
-        old = type(old).parse_obj(old.dict(exclude=ignore))
-        new = type(new).parse_obj(new.dict(exclude=ignore))
+    if reset is not None:
+        old = copy_model(old, reset=reset)
+        new = copy_model(new, reset=reset)
 
     changes: ModelDiff = {}
     for field in old.__fields__:
@@ -49,3 +48,24 @@ def diff_models(
         changes[field] = {"old": old_value, "new": new_value}
 
     return changes
+
+
+M = TypeVar("M", bound=BaseModel)
+
+
+def copy_model(model: M, reset: Optional[IncludeExcludeType] = None) -> M:
+    """
+    Returns a deep copy of the model instance.
+
+    Implemented as `BaseModel.parse_obj(BaseModel.dict())`, thus,
+    unlike `BaseModel.copy(deep=True)`, runs all validations.
+
+    The fields specified in the `reset` option are reset to their default values.
+
+    Args:
+        reset: Fields to reset to their default values.
+
+    Returns:
+        A deep copy of the model instance.
+    """
+    return type(model).parse_obj(model.dict(exclude=reset))
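
The `ignore` parameter of `diff_models` is renamed to `reset`, and the reset-and-revalidate step is factored out into the new `copy_model` helper. A minimal usage sketch based on the signatures above (the `RunSpec` model here is hypothetical and only for illustration, not part of dstack):

    from pydantic import BaseModel

    from dstack._internal.core.services.diff import copy_model, diff_models


    class RunSpec(BaseModel):  # hypothetical example model
        name: str = "run"
        replicas: int = 1


    old = RunSpec(name="train", replicas=1)
    new = RunSpec(name="train-v2", replicas=2)

    # Both fields differ, so both show up in the diff.
    print(diff_models(old, new))
    # {'name': {'old': 'train', 'new': 'train-v2'}, 'replicas': {'old': 1, 'new': 2}}

    # Reset `name` to its default before comparison, so only `replicas` is reported.
    print(diff_models(old, new, reset={"name"}))
    # {'replicas': {'old': 1, 'new': 2}}

    # copy_model re-runs validation, unlike BaseModel.copy(deep=True).
    copy = copy_model(new, reset={"replicas"})
    assert copy.replicas == 1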

dstack/_internal/core/services/ssh/attach.py
@@ -64,6 +64,7 @@ class SSHAttach:
         run_name: str,
         dockerized: bool,
         ssh_proxy: Optional[SSHConnectionParams] = None,
+        service_port: Optional[int] = None,
         local_backend: bool = False,
         bind_address: Optional[str] = None,
     ):
@@ -90,6 +91,7 @@ class SSHAttach:
             },
         )
         self.ssh_proxy = ssh_proxy
+        self.service_port = service_port
 
         hosts: dict[str, dict[str, Union[str, int, FilePath]]] = {}
         self.hosts = hosts

dstack/_internal/server/app.py
@@ -10,7 +10,7 @@ from typing import Awaitable, Callable, List
 import sentry_sdk
 from fastapi import FastAPI, Request, Response, status
 from fastapi.datastructures import URL
-from fastapi.responses import HTMLResponse, JSONResponse, RedirectResponse
+from fastapi.responses import HTMLResponse, RedirectResponse
 from fastapi.staticfiles import StaticFiles
 from prometheus_client import Counter, Histogram
 
@@ -56,6 +56,7 @@ from dstack._internal.server.settings import (
 )
 from dstack._internal.server.utils.logging import configure_logging
 from dstack._internal.server.utils.routers import (
+    CustomORJSONResponse,
     check_client_server_compatibility,
     error_detail,
     get_server_client_error_details,
@@ -90,7 +91,10 @@ def create_app() -> FastAPI:
         profiles_sample_rate=settings.SENTRY_PROFILES_SAMPLE_RATE,
     )
 
-    app = FastAPI(docs_url="/api/docs", lifespan=lifespan)
+    app = FastAPI(
+        docs_url="/api/docs",
+        lifespan=lifespan,
+    )
     app.state.proxy_dependency_injector = ServerProxyDependencyInjector()
     return app
 
@@ -147,7 +151,10 @@ async def lifespan(app: FastAPI):
     )
     if settings.SERVER_S3_BUCKET is not None or settings.SERVER_GCS_BUCKET is not None:
         init_default_storage()
-    scheduler = start_background_tasks()
+    if settings.SERVER_BACKGROUND_PROCESSING_ENABLED:
+        scheduler = start_background_tasks()
+    else:
+        logger.info("Background processing is disabled")
     dstack_version = DSTACK_VERSION if DSTACK_VERSION else "(no version)"
     logger.info(f"The admin token is {admin.token.get_plaintext_or_error()}", {"show_path": False})
     logger.info(
@@ -157,7 +164,8 @@ async def lifespan(app: FastAPI):
     for func in _ON_STARTUP_HOOKS:
         await func(app)
     yield
-    scheduler.shutdown()
+    if settings.SERVER_BACKGROUND_PROCESSING_ENABLED:
+        scheduler.shutdown()
     await gateway_connections_pool.remove_all()
     service_conn_pool = await get_injector_from_app(app).get_service_connection_pool()
     await service_conn_pool.remove_all()
@@ -208,14 +216,14 @@ def register_routes(app: FastAPI, ui: bool = True):
         msg = "Access denied"
         if len(exc.args) > 0:
             msg = exc.args[0]
-        return JSONResponse(
+        return CustomORJSONResponse(
            status_code=status.HTTP_403_FORBIDDEN,
            content=error_detail(msg),
        )
 
     @app.exception_handler(ServerClientError)
     async def server_client_error_handler(request: Request, exc: ServerClientError):
-        return JSONResponse(
+        return CustomORJSONResponse(
            status_code=status.HTTP_400_BAD_REQUEST,
            content={"detail": get_server_client_error_details(exc)},
        )
@@ -223,7 +231,7 @@ def register_routes(app: FastAPI, ui: bool = True):
     @app.exception_handler(OSError)
     async def os_error_handler(request, exc: OSError):
         if exc.errno in [36, 63]:
-            return JSONResponse(
+            return CustomORJSONResponse(
                {"detail": "Filename too long"},
                status_code=status.HTTP_400_BAD_REQUEST,
            )
@@ -309,7 +317,7 @@ def register_routes(app: FastAPI, ui: bool = True):
 
     @app.get("/healthcheck")
     async def healthcheck():
-        return JSONResponse(content={"status": "running"})
+        return CustomORJSONResponse(content={"status": "running"})
 
     if ui and Path(__file__).parent.joinpath("statics").exists():
         app.mount(
@@ -323,7 +331,7 @@ def register_routes(app: FastAPI, ui: bool = True):
            or _is_proxy_request(request)
            or _is_prometheus_request(request)
        ):
-            return JSONResponse(
+            return CustomORJSONResponse(
                {"detail": exc.detail},
                status_code=status.HTTP_404_NOT_FOUND,
            )
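
Several handlers above switch from FastAPI's stock `JSONResponse` to `CustomORJSONResponse`, imported from `dstack._internal.server.utils.routers` (its implementation lives in the changed `utils/routers.py` and `json_utils.py` files, which are not shown in this excerpt). For orientation only, a rough sketch of what an orjson-backed response class typically looks like; dstack's actual class may differ:

    import orjson  # assumption: orjson is what backs the new response class
    from fastapi.responses import JSONResponse


    class ORJSONishResponse(JSONResponse):
        """Hypothetical stand-in for CustomORJSONResponse."""

        media_type = "application/json"

        def render(self, content) -> bytes:
            # orjson natively serializes datetime and UUID values and is much
            # faster than the stdlib json encoder for large payloads.
            return orjson.dumps(content)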

dstack/_internal/server/background/__init__.py
@@ -4,9 +4,10 @@ from apscheduler.triggers.interval import IntervalTrigger
 from dstack._internal.server import settings
 from dstack._internal.server.background.tasks.process_fleets import process_fleets
 from dstack._internal.server.background.tasks.process_gateways import (
+    process_gateways,
     process_gateways_connections,
-    process_submitted_gateways,
 )
+from dstack._internal.server.background.tasks.process_idle_volumes import process_idle_volumes
 from dstack._internal.server.background.tasks.process_instances import (
     process_instances,
 )
@@ -70,11 +71,12 @@ def start_background_tasks() -> AsyncIOScheduler:
     )
     _scheduler.add_job(delete_prometheus_metrics, IntervalTrigger(minutes=5), max_instances=1)
     _scheduler.add_job(process_gateways_connections, IntervalTrigger(seconds=15))
+    _scheduler.add_job(process_gateways, IntervalTrigger(seconds=10, jitter=2), max_instances=5)
     _scheduler.add_job(
-        process_submitted_gateways, IntervalTrigger(seconds=10, jitter=2), max_instances=5
+        process_submitted_volumes, IntervalTrigger(seconds=10, jitter=2), max_instances=5
     )
     _scheduler.add_job(
-        process_submitted_volumes, IntervalTrigger(seconds=10, jitter=2), max_instances=5
+        process_idle_volumes, IntervalTrigger(seconds=60, jitter=10), max_instances=1
     )
     _scheduler.add_job(process_placement_groups, IntervalTrigger(seconds=30, jitter=5))
     for replica in range(settings.SERVER_BACKGROUND_PROCESSING_FACTOR):

dstack/_internal/server/background/tasks/process_gateways.py
@@ -16,6 +16,7 @@ from dstack._internal.server.services.gateways import (
     gateway_connections_pool,
 )
 from dstack._internal.server.services.locking import advisory_lock_ctx, get_locker
+from dstack._internal.server.services.logging import fmt
 from dstack._internal.utils.common import get_current_datetime
 from dstack._internal.utils.logging import get_logger
 
@@ -27,14 +28,14 @@ async def process_gateways_connections():
     await _process_active_connections()
 
 
-async def process_submitted_gateways():
+async def process_gateways():
     lock, lockset = get_locker(get_db().dialect_name).get_lockset(GatewayModel.__tablename__)
     async with get_session_ctx() as session:
         async with lock:
             res = await session.execute(
                 select(GatewayModel)
                 .where(
-                    GatewayModel.status == GatewayStatus.SUBMITTED,
+                    GatewayModel.status.in_([GatewayStatus.SUBMITTED, GatewayStatus.PROVISIONING]),
                     GatewayModel.id.not_in(lockset),
                 )
                 .options(lazyload(GatewayModel.gateway_compute))
@@ -48,7 +49,25 @@ async def process_submitted_gateways():
             lockset.add(gateway_model.id)
         try:
             gateway_model_id = gateway_model.id
-            await _process_submitted_gateway(session=session, gateway_model=gateway_model)
+            initial_status = gateway_model.status
+            if initial_status == GatewayStatus.SUBMITTED:
+                await _process_submitted_gateway(session=session, gateway_model=gateway_model)
+            elif initial_status == GatewayStatus.PROVISIONING:
+                await _process_provisioning_gateway(session=session, gateway_model=gateway_model)
+            else:
+                logger.error(
+                    "%s: unexpected gateway status %r", fmt(gateway_model), initial_status.upper()
+                )
+            if gateway_model.status != initial_status:
+                logger.info(
+                    "%s: gateway status has changed %s -> %s%s",
+                    fmt(gateway_model),
+                    initial_status.upper(),
+                    gateway_model.status.upper(),
+                    f": {gateway_model.status_message}" if gateway_model.status_message else "",
+                )
+            gateway_model.last_processed_at = get_current_datetime()
+            await session.commit()
         finally:
             lockset.difference_update([gateway_model_id])
 
@@ -89,7 +108,7 @@ async def _process_connection(conn: GatewayConnection):
 
 
 async def _process_submitted_gateway(session: AsyncSession, gateway_model: GatewayModel):
-    logger.info("Started gateway %s provisioning", gateway_model.name)
+    logger.info("%s: started gateway provisioning", fmt(gateway_model))
     # Refetch to load related attributes.
     # joinedload produces LEFT OUTER JOIN that can't be used with FOR UPDATE.
     res = await session.execute(
@@ -110,8 +129,6 @@ async def _process_submitted_gateway(session: AsyncSession, gateway_model: Gatew
     except BackendNotAvailable:
         gateway_model.status = GatewayStatus.FAILED
         gateway_model.status_message = "Backend not available"
-        gateway_model.last_processed_at = get_current_datetime()
-        await session.commit()
         return
 
     try:
@@ -123,53 +140,54 @@ async def _process_submitted_gateway(session: AsyncSession, gateway_model: Gatew
         )
         session.add(gateway_model)
         gateway_model.status = GatewayStatus.PROVISIONING
-        await session.commit()
-        await session.refresh(gateway_model)
     except BackendError as e:
-        logger.info(
-            "Failed to create gateway compute for gateway %s: %s", gateway_model.name, repr(e)
-        )
+        logger.info("%s: failed to create gateway compute: %r", fmt(gateway_model), e)
         gateway_model.status = GatewayStatus.FAILED
         status_message = f"Backend error: {repr(e)}"
         if len(e.args) > 0:
             status_message = str(e.args[0])
         gateway_model.status_message = status_message
-        gateway_model.last_processed_at = get_current_datetime()
-        await session.commit()
-        return
     except Exception as e:
-        logger.exception(
-            "Got exception when creating gateway compute for gateway %s", gateway_model.name
-        )
+        logger.exception("%s: got exception when creating gateway compute", fmt(gateway_model))
         gateway_model.status = GatewayStatus.FAILED
         gateway_model.status_message = f"Unexpected error: {repr(e)}"
-        gateway_model.last_processed_at = get_current_datetime()
-        await session.commit()
-        return
 
+
+async def _process_provisioning_gateway(
+    session: AsyncSession, gateway_model: GatewayModel
+) -> None:
+    # Refetch to load related attributes.
+    # joinedload produces LEFT OUTER JOIN that can't be used with FOR UPDATE.
+    res = await session.execute(
+        select(GatewayModel)
+        .where(GatewayModel.id == gateway_model.id)
+        .execution_options(populate_existing=True)
+    )
+    gateway_model = res.unique().scalar_one()
+
+    # FIXME: problems caused by blocking on connect_to_gateway_with_retry and configure_gateway:
+    # - cannot delete the gateway before it is provisioned because the DB model is locked
+    # - connection retry counter is reset on server restart
+    # - only one server replica is processing the gateway
+    # Easy to fix by doing only one connection/configuration attempt per processing iteration. The
+    # main challenge is applying the same provisioning model to the dstack Sky gateway to avoid
+    # maintaining a different model for Sky.
     connection = await gateways_services.connect_to_gateway_with_retry(
         gateway_model.gateway_compute
     )
     if connection is None:
         gateway_model.status = GatewayStatus.FAILED
         gateway_model.status_message = "Failed to connect to gateway"
-        gateway_model.last_processed_at = get_current_datetime()
         gateway_model.gateway_compute.deleted = True
-        await session.commit()
         return
-
     try:
         await gateways_services.configure_gateway(connection)
     except Exception:
-        logger.exception("Failed to configure gateway %s", gateway_model.name)
+        logger.exception("%s: failed to configure gateway", fmt(gateway_model))
        gateway_model.status = GatewayStatus.FAILED
        gateway_model.status_message = "Failed to configure gateway"
-        gateway_model.last_processed_at = get_current_datetime()
        await gateway_connections_pool.remove(gateway_model.gateway_compute.ip_address)
        gateway_model.gateway_compute.active = False
-        await session.commit()
        return
 
     gateway_model.status = GatewayStatus.RUNNING
-    gateway_model.last_processed_at = get_current_datetime()
-    await session.commit()

dstack/_internal/server/background/tasks/process_idle_volumes.py (new file)
@@ -0,0 +1,139 @@
+import datetime
+from typing import List
+
+from sqlalchemy import select
+from sqlalchemy.ext.asyncio import AsyncSession
+from sqlalchemy.orm import joinedload
+
+from dstack._internal.core.backends.base.compute import ComputeWithVolumeSupport
+from dstack._internal.core.errors import BackendNotAvailable
+from dstack._internal.core.models.profiles import parse_duration
+from dstack._internal.core.models.volumes import VolumeStatus
+from dstack._internal.server.db import get_db, get_session_ctx
+from dstack._internal.server.models import ProjectModel, VolumeModel
+from dstack._internal.server.services import backends as backends_services
+from dstack._internal.server.services.locking import get_locker
+from dstack._internal.server.services.volumes import (
+    get_volume_configuration,
+    volume_model_to_volume,
+)
+from dstack._internal.utils import common
+from dstack._internal.utils.common import get_current_datetime
+from dstack._internal.utils.logging import get_logger
+
+logger = get_logger(__name__)
+
+
+async def process_idle_volumes():
+    lock, lockset = get_locker(get_db().dialect_name).get_lockset(VolumeModel.__tablename__)
+    async with get_session_ctx() as session:
+        async with lock:
+            res = await session.execute(
+                select(VolumeModel.id)
+                .where(
+                    VolumeModel.status == VolumeStatus.ACTIVE,
+                    VolumeModel.deleted == False,
+                    VolumeModel.id.not_in(lockset),
+                )
+                .order_by(VolumeModel.last_processed_at.asc())
+                .limit(10)
+                .with_for_update(skip_locked=True, key_share=True)
+            )
+            volume_ids = list(res.scalars().all())
+            if not volume_ids:
+                return
+            for volume_id in volume_ids:
+                lockset.add(volume_id)
+
+        res = await session.execute(
+            select(VolumeModel)
+            .where(VolumeModel.id.in_(volume_ids))
+            .options(joinedload(VolumeModel.project).joinedload(ProjectModel.backends))
+            .options(joinedload(VolumeModel.user))
+            .options(joinedload(VolumeModel.attachments))
+            .execution_options(populate_existing=True)
+        )
+        volume_models = list(res.unique().scalars().all())
+        try:
+            volumes_to_delete = [v for v in volume_models if _should_delete_volume(v)]
+            if not volumes_to_delete:
+                return
+            await _delete_idle_volumes(session, volumes_to_delete)
+        finally:
+            lockset.difference_update(volume_ids)
+
+
+def _should_delete_volume(volume: VolumeModel) -> bool:
+    if volume.attachments:
+        return False
+
+    config = get_volume_configuration(volume)
+    if not config.auto_cleanup_duration:
+        return False
+
+    duration_seconds = parse_duration(config.auto_cleanup_duration)
+    if not duration_seconds or duration_seconds <= 0:
+        return False
+
+    idle_time = _get_idle_time(volume)
+    threshold = datetime.timedelta(seconds=duration_seconds)
+    return idle_time > threshold
+
+
+def _get_idle_time(volume: VolumeModel) -> datetime.timedelta:
+    last_used = volume.last_job_processed_at or volume.created_at
+    last_used_utc = last_used.replace(tzinfo=datetime.timezone.utc)
+    idle_time = get_current_datetime() - last_used_utc
+    return max(idle_time, datetime.timedelta(0))
+
+
+async def _delete_idle_volumes(session: AsyncSession, volumes: List[VolumeModel]):
+    # Note: Multiple volumes are deleted in the same transaction,
+    # so long deletion of one volume may block processing other volumes.
+    for volume_model in volumes:
+        logger.info("Deleting idle volume %s", volume_model.name)
+        try:
+            await _delete_idle_volume(session, volume_model)
+        except Exception:
+            logger.exception("Error when deleting idle volume %s", volume_model.name)
+
+        volume_model.deleted = True
+        volume_model.deleted_at = get_current_datetime()
+
+        logger.info("Deleted idle volume %s", volume_model.name)
+
+    await session.commit()
+
+
+async def _delete_idle_volume(session: AsyncSession, volume_model: VolumeModel):
+    volume = volume_model_to_volume(volume_model)
+
+    if volume.provisioning_data is None:
+        logger.error(
+            f"Failed to delete volume {volume_model.name}. volume.provisioning_data is None."
+        )
+        return
+
+    if volume.provisioning_data.backend is None:
+        logger.error(
+            f"Failed to delete volume {volume_model.name}. volume.provisioning_data.backend is None."
+        )
+        return
+
+    try:
+        backend = await backends_services.get_project_backend_by_type_or_error(
+            project=volume_model.project,
+            backend_type=volume.provisioning_data.backend,
+        )
+    except BackendNotAvailable:
+        logger.error(
+            f"Failed to delete volume {volume_model.name}. Backend {volume.configuration.backend} not available."
+        )
+        return
+
+    compute = backend.compute()
+    assert isinstance(compute, ComputeWithVolumeSupport)
+    await common.run_async(
+        compute.delete_volume,
+        volume=volume,
+    )

dstack/_internal/server/background/tasks/process_submitted_jobs.py
@@ -739,3 +739,5 @@ async def _attach_volume(
         attachment_data=attachment_data.json(),
     )
     instance.volume_attachments.append(volume_attachment_model)
+
+    volume_model.last_job_processed_at = common_utils.get_current_datetime()
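
Taken together, the new background task and the `last_job_processed_at` bookkeeping above implement automatic cleanup of idle volumes: a volume whose configuration sets `auto_cleanup_duration` is deleted once it has had no attachments for longer than that duration, measured from the last time a job used it (or from creation if it was never used). A small worked example of the threshold math, assuming the usual dstack duration syntax (e.g. `3d` for three days); the timestamps are illustrative and not part of the diff:

    import datetime

    from dstack._internal.core.models.profiles import parse_duration

    duration_seconds = parse_duration("3d")  # 3 days -> 259200 seconds
    threshold = datetime.timedelta(seconds=duration_seconds)

    last_job_processed_at = datetime.datetime(2025, 7, 10, 12, 0, tzinfo=datetime.timezone.utc)
    now = datetime.datetime(2025, 7, 15, 12, 0, tzinfo=datetime.timezone.utc)

    idle_time = now - last_job_processed_at  # 5 days
    print(idle_time > threshold)  # True -> the volume would be deleted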

dstack/_internal/server/migrations/versions/35e90e1b0d3e_add_rolling_deployment_fields.py
@@ -17,12 +17,6 @@ depends_on = None
 
 
 def upgrade() -> None:
-    with op.batch_alter_table("jobs", schema=None) as batch_op:
-        batch_op.add_column(sa.Column("deployment_num", sa.Integer(), nullable=True))
-    with op.batch_alter_table("jobs", schema=None) as batch_op:
-        batch_op.execute("UPDATE jobs SET deployment_num = 0")
-        batch_op.alter_column("deployment_num", nullable=False)
-
     with op.batch_alter_table("runs", schema=None) as batch_op:
         batch_op.add_column(sa.Column("deployment_num", sa.Integer(), nullable=True))
         batch_op.add_column(sa.Column("desired_replica_count", sa.Integer(), nullable=True))
@@ -32,6 +26,12 @@ def upgrade() -> None:
         batch_op.alter_column("deployment_num", nullable=False)
         batch_op.alter_column("desired_replica_count", nullable=False)
 
+    with op.batch_alter_table("jobs", schema=None) as batch_op:
+        batch_op.add_column(sa.Column("deployment_num", sa.Integer(), nullable=True))
+    with op.batch_alter_table("jobs", schema=None) as batch_op:
+        batch_op.execute("UPDATE jobs SET deployment_num = 0")
+        batch_op.alter_column("deployment_num", nullable=False)
+
 
 def downgrade() -> None:
     with op.batch_alter_table("runs", schema=None) as batch_op:

dstack/_internal/server/migrations/versions/d5863798bf41_add_volumemodel_last_job_processed_at.py (new file)
@@ -0,0 +1,40 @@
+"""Add VolumeModel.last_job_processed_at
+
+Revision ID: d5863798bf41
+Revises: 644b8a114187
+Create Date: 2025-07-15 14:26:22.981687
+
+"""
+
+import sqlalchemy as sa
+from alembic import op
+
+import dstack._internal.server.models
+
+# revision identifiers, used by Alembic.
+revision = "d5863798bf41"
+down_revision = "644b8a114187"
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    with op.batch_alter_table("volumes", schema=None) as batch_op:
+        batch_op.add_column(
+            sa.Column(
+                "last_job_processed_at",
+                dstack._internal.server.models.NaiveDateTime(),
+                nullable=True,
+            )
+        )
+
+    # ### end Alembic commands ###
+
+
+def downgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    with op.batch_alter_table("volumes", schema=None) as batch_op:
+        batch_op.drop_column("last_job_processed_at")
+
+    # ### end Alembic commands ###

dstack/_internal/server/models.py
@@ -645,6 +645,7 @@ class VolumeModel(BaseModel):
     last_processed_at: Mapped[datetime] = mapped_column(
         NaiveDateTime, default=get_current_datetime
     )
+    last_job_processed_at: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime)
     deleted: Mapped[bool] = mapped_column(Boolean, default=False)
     deleted_at: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime)
 

dstack/_internal/server/routers/backends.py
@@ -27,7 +27,10 @@ from dstack._internal.server.services.config import (
     get_backend_config_yaml,
     update_backend_config_yaml,
 )
-from dstack._internal.server.utils.routers import get_base_api_additional_responses
+from dstack._internal.server.utils.routers import (
+    CustomORJSONResponse,
+    get_base_api_additional_responses,
+)
 
 root_router = APIRouter(
     prefix="/api/backends",
@@ -41,35 +44,37 @@ project_router = APIRouter(
 )
 
 
-@root_router.post("/list_types")
-async def list_backend_types() -> List[BackendType]:
-    return dstack._internal.core.backends.configurators.list_available_backend_types()
+@root_router.post("/list_types", response_model=List[BackendType])
+async def list_backend_types():
+    return CustomORJSONResponse(
+        dstack._internal.core.backends.configurators.list_available_backend_types()
+    )
 
 
-@project_router.post("/create")
+@project_router.post("/create", response_model=AnyBackendConfigWithCreds)
 async def create_backend(
     body: AnyBackendConfigWithCreds,
     session: AsyncSession = Depends(get_session),
     user_project: Tuple[UserModel, ProjectModel] = Depends(ProjectAdmin()),
-) -> AnyBackendConfigWithCreds:
+):
     _, project = user_project
     config = await backends.create_backend(session=session, project=project, config=body)
     if settings.SERVER_CONFIG_ENABLED:
         await ServerConfigManager().sync_config(session=session)
-    return config
+    return CustomORJSONResponse(config)
 
 
-@project_router.post("/update")
+@project_router.post("/update", response_model=AnyBackendConfigWithCreds)
 async def update_backend(
     body: AnyBackendConfigWithCreds,
     session: AsyncSession = Depends(get_session),
     user_project: Tuple[UserModel, ProjectModel] = Depends(ProjectAdmin()),
-) -> AnyBackendConfigWithCreds:
+):
     _, project = user_project
     config = await backends.update_backend(session=session, project=project, config=body)
     if settings.SERVER_CONFIG_ENABLED:
         await ServerConfigManager().sync_config(session=session)
-    return config
+    return CustomORJSONResponse(config)
 
 
 @project_router.post("/delete")
@@ -86,16 +91,16 @@ async def delete_backends(
         await ServerConfigManager().sync_config(session=session)
 
 
-@project_router.post("/{backend_name}/config_info")
+@project_router.post("/{backend_name}/config_info", response_model=AnyBackendConfigWithCreds)
 async def get_backend_config_info(
     backend_name: BackendType,
     user_project: Tuple[UserModel, ProjectModel] = Depends(ProjectAdmin()),
-) -> AnyBackendConfigWithCreds:
+):
     _, project = user_project
     config = await backends.get_backend_config(project=project, backend_type=backend_name)
     if config is None:
         raise ResourceNotExistsError()
-    return config
+    return CustomORJSONResponse(config)
 
 
 @project_router.post("/create_yaml")
@@ -126,10 +131,12 @@ async def update_backend_yaml(
     )
 
 
-@project_router.post("/{backend_name}/get_yaml")
+@project_router.post("/{backend_name}/get_yaml", response_model=BackendInfoYAML)
 async def get_backend_yaml(
     backend_name: BackendType,
     user_project: Tuple[UserModel, ProjectModel] = Depends(ProjectAdmin()),
-) -> BackendInfoYAML:
+):
     _, project = user_project
-    return await get_backend_config_yaml(project=project, backend_type=backend_name)
+    return CustomORJSONResponse(
+        await get_backend_config_yaml(project=project, backend_type=backend_name)
+    )