dstack 0.19.20-py3-none-any.whl → 0.19.22-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of dstack might be problematic.

Files changed (93)
  1. dstack/_internal/cli/commands/apply.py +8 -3
  2. dstack/_internal/cli/services/configurators/__init__.py +8 -0
  3. dstack/_internal/cli/services/configurators/fleet.py +1 -1
  4. dstack/_internal/cli/services/configurators/gateway.py +1 -1
  5. dstack/_internal/cli/services/configurators/run.py +11 -1
  6. dstack/_internal/cli/services/configurators/volume.py +1 -1
  7. dstack/_internal/cli/utils/common.py +48 -5
  8. dstack/_internal/cli/utils/fleet.py +5 -5
  9. dstack/_internal/cli/utils/run.py +32 -0
  10. dstack/_internal/core/backends/__init__.py +0 -65
  11. dstack/_internal/core/backends/configurators.py +9 -0
  12. dstack/_internal/core/backends/features.py +64 -0
  13. dstack/_internal/core/backends/hotaisle/__init__.py +1 -0
  14. dstack/_internal/core/backends/hotaisle/api_client.py +109 -0
  15. dstack/_internal/core/backends/hotaisle/backend.py +16 -0
  16. dstack/_internal/core/backends/hotaisle/compute.py +225 -0
  17. dstack/_internal/core/backends/hotaisle/configurator.py +60 -0
  18. dstack/_internal/core/backends/hotaisle/models.py +45 -0
  19. dstack/_internal/core/backends/lambdalabs/compute.py +2 -1
  20. dstack/_internal/core/backends/models.py +8 -0
  21. dstack/_internal/core/compatibility/fleets.py +2 -0
  22. dstack/_internal/core/compatibility/runs.py +12 -0
  23. dstack/_internal/core/models/backends/base.py +2 -0
  24. dstack/_internal/core/models/configurations.py +139 -1
  25. dstack/_internal/core/models/health.py +28 -0
  26. dstack/_internal/core/models/instances.py +2 -0
  27. dstack/_internal/core/models/logs.py +2 -1
  28. dstack/_internal/core/models/profiles.py +37 -0
  29. dstack/_internal/core/models/runs.py +21 -1
  30. dstack/_internal/core/services/ssh/tunnel.py +7 -0
  31. dstack/_internal/server/app.py +26 -10
  32. dstack/_internal/server/background/__init__.py +9 -6
  33. dstack/_internal/server/background/tasks/process_fleets.py +52 -38
  34. dstack/_internal/server/background/tasks/process_gateways.py +2 -2
  35. dstack/_internal/server/background/tasks/process_idle_volumes.py +5 -4
  36. dstack/_internal/server/background/tasks/process_instances.py +168 -103
  37. dstack/_internal/server/background/tasks/process_metrics.py +9 -2
  38. dstack/_internal/server/background/tasks/process_placement_groups.py +2 -0
  39. dstack/_internal/server/background/tasks/process_probes.py +164 -0
  40. dstack/_internal/server/background/tasks/process_prometheus_metrics.py +14 -2
  41. dstack/_internal/server/background/tasks/process_running_jobs.py +142 -124
  42. dstack/_internal/server/background/tasks/process_runs.py +84 -34
  43. dstack/_internal/server/background/tasks/process_submitted_jobs.py +12 -10
  44. dstack/_internal/server/background/tasks/process_terminating_jobs.py +12 -4
  45. dstack/_internal/server/background/tasks/process_volumes.py +4 -1
  46. dstack/_internal/server/migrations/versions/25479f540245_add_probes.py +43 -0
  47. dstack/_internal/server/migrations/versions/50dd7ea98639_index_status_columns.py +55 -0
  48. dstack/_internal/server/migrations/versions/728b1488b1b4_add_instance_health.py +50 -0
  49. dstack/_internal/server/migrations/versions/ec02a26a256c_add_runmodel_next_triggered_at.py +38 -0
  50. dstack/_internal/server/models.py +57 -16
  51. dstack/_internal/server/routers/instances.py +33 -5
  52. dstack/_internal/server/schemas/health/dcgm.py +56 -0
  53. dstack/_internal/server/schemas/instances.py +32 -0
  54. dstack/_internal/server/schemas/runner.py +5 -0
  55. dstack/_internal/server/services/fleets.py +19 -10
  56. dstack/_internal/server/services/gateways/__init__.py +17 -17
  57. dstack/_internal/server/services/instances.py +113 -15
  58. dstack/_internal/server/services/jobs/__init__.py +18 -13
  59. dstack/_internal/server/services/jobs/configurators/base.py +26 -0
  60. dstack/_internal/server/services/logging.py +4 -2
  61. dstack/_internal/server/services/logs/aws.py +13 -1
  62. dstack/_internal/server/services/logs/gcp.py +16 -1
  63. dstack/_internal/server/services/offers.py +3 -3
  64. dstack/_internal/server/services/probes.py +6 -0
  65. dstack/_internal/server/services/projects.py +51 -19
  66. dstack/_internal/server/services/prometheus/client_metrics.py +3 -0
  67. dstack/_internal/server/services/prometheus/custom_metrics.py +2 -3
  68. dstack/_internal/server/services/runner/client.py +52 -20
  69. dstack/_internal/server/services/runner/ssh.py +4 -4
  70. dstack/_internal/server/services/runs.py +115 -39
  71. dstack/_internal/server/services/services/__init__.py +4 -1
  72. dstack/_internal/server/services/ssh.py +66 -0
  73. dstack/_internal/server/services/users.py +2 -3
  74. dstack/_internal/server/services/volumes.py +11 -11
  75. dstack/_internal/server/settings.py +16 -0
  76. dstack/_internal/server/statics/index.html +1 -1
  77. dstack/_internal/server/statics/{main-8f9ee218d3eb45989682.css → main-03e818b110e1d5705378.css} +1 -1
  78. dstack/_internal/server/statics/{main-39a767528976f8078166.js → main-cc067b7fd1a8f33f97da.js} +26 -15
  79. dstack/_internal/server/statics/{main-39a767528976f8078166.js.map → main-cc067b7fd1a8f33f97da.js.map} +1 -1
  80. dstack/_internal/server/testing/common.py +51 -0
  81. dstack/_internal/{core/backends/remote → server/utils}/provisioning.py +22 -17
  82. dstack/_internal/server/utils/sentry_utils.py +12 -0
  83. dstack/_internal/settings.py +3 -0
  84. dstack/_internal/utils/common.py +15 -0
  85. dstack/_internal/utils/cron.py +5 -0
  86. dstack/api/server/__init__.py +1 -1
  87. dstack/version.py +1 -1
  88. {dstack-0.19.20.dist-info → dstack-0.19.22.dist-info}/METADATA +13 -22
  89. {dstack-0.19.20.dist-info → dstack-0.19.22.dist-info}/RECORD +93 -75
  90. /dstack/_internal/{core/backends/remote → server/schemas/health}/__init__.py +0 -0
  91. {dstack-0.19.20.dist-info → dstack-0.19.22.dist-info}/WHEEL +0 -0
  92. {dstack-0.19.20.dist-info → dstack-0.19.22.dist-info}/entry_points.txt +0 -0
  93. {dstack-0.19.20.dist-info → dstack-0.19.22.dist-info}/licenses/LICENSE.md +0 -0
dstack/_internal/core/models/profiles.py:

@@ -8,6 +8,7 @@ from typing_extensions import Annotated, Literal
 from dstack._internal.core.models.backends.base import BackendType
 from dstack._internal.core.models.common import CoreModel, Duration
 from dstack._internal.utils.common import list_enum_values_for_annotation
+from dstack._internal.utils.cron import validate_cron
 from dstack._internal.utils.json_utils import pydantic_orjson_dumps_with_indent
 from dstack._internal.utils.tags import tags_validator
 
@@ -167,6 +168,38 @@ class UtilizationPolicy(CoreModel):
         return v
 
 
+class Schedule(CoreModel):
+    cron: Annotated[
+        Union[List[str], str],
+        Field(
+            description=(
+                "A cron expression or a list of cron expressions specifying the UTC time when the run needs to be started"
+            )
+        ),
+    ]
+
+    @validator("cron")
+    def _validate_cron(cls, v: Union[List[str], str]) -> List[str]:
+        if isinstance(v, str):
+            values = [v]
+        else:
+            values = v
+        if len(values) == 0:
+            raise ValueError("At least one cron expression must be specified")
+        for value in values:
+            validate_cron(value)
+        return values
+
+    @property
+    def crons(self) -> List[str]:
+        """
+        Access `cron` attribute as a list.
+        """
+        if isinstance(self.cron, str):
+            return [self.cron]
+        return self.cron
+
+
 class ProfileParams(CoreModel):
     backends: Annotated[
         Optional[List[BackendType]],
@@ -281,6 +314,10 @@ class ProfileParams(CoreModel):
             )
         ),
     ] = None
+    schedule: Annotated[
+        Optional[Schedule],
+        Field(description=("The schedule for starting the run at specified time")),
+    ] = None
     fleets: Annotated[
         Optional[list[str]], Field(description="The fleets considered for reuse")
     ] = None
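
For reference, a minimal sketch of how the new Schedule model behaves (the import path follows the file shown above; the cron expressions themselves are illustrative):

from dstack._internal.core.models.profiles import Schedule

# A single expression is accepted and normalized to a list via the `crons` property.
schedule = Schedule(cron="0 9 * * *")  # every day at 09:00 UTC
assert schedule.crons == ["0 9 * * *"]

# A list of expressions is validated item by item with validate_cron();
# an empty list raises "At least one cron expression must be specified".
schedule = Schedule(cron=["0 9 * * 1-5", "0 18 * * 1-5"])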
dstack/_internal/core/models/runs.py:

@@ -1,6 +1,6 @@
 from datetime import datetime, timedelta
 from enum import Enum
-from typing import Any, Dict, List, Optional, Type
+from typing import Any, Dict, List, Literal, Optional, Type
 
 from pydantic import UUID4, Field, root_validator
 from typing_extensions import Annotated
@@ -8,8 +8,11 @@ from typing_extensions import Annotated
 from dstack._internal.core.models.backends.base import BackendType
 from dstack._internal.core.models.common import ApplyAction, CoreModel, NetworkMode, RegistryAuth
 from dstack._internal.core.models.configurations import (
+    DEFAULT_PROBE_METHOD,
     DEFAULT_REPO_DIR,
     AnyRunConfiguration,
+    HTTPHeaderSpec,
+    HTTPMethod,
     RunConfiguration,
     ServiceConfiguration,
 )
@@ -223,6 +226,17 @@ class JobSSHKey(CoreModel):
     public: str
 
 
+class ProbeSpec(CoreModel):
+    type: Literal["http"]  # expect other probe types in the future, namely `exec`
+    url: str
+    method: HTTPMethod = DEFAULT_PROBE_METHOD
+    headers: list[HTTPHeaderSpec] = []
+    body: Optional[str] = None
+    timeout: int
+    interval: int
+    ready_after: int
+
+
 class JobSpec(CoreModel):
     replica_num: int = 0  # default value for backward compatibility
     job_num: int
@@ -256,6 +270,7 @@ class JobSpec(CoreModel):
     file_archives: list[FileArchiveMapping] = []
     # None for non-services and pre-0.19.19 services. See `get_service_port`
     service_port: Optional[int] = None
+    probes: list[ProbeSpec] = []
 
 
 class JobProvisioningData(CoreModel):
@@ -325,6 +340,10 @@ class ClusterInfo(CoreModel):
     gpus_per_job: int
 
 
+class Probe(CoreModel):
+    success_streak: int
+
+
 class JobSubmission(CoreModel):
     id: UUID4
     submission_num: int
@@ -341,6 +360,7 @@ class JobSubmission(CoreModel):
     job_provisioning_data: Optional[JobProvisioningData]
     job_runtime_data: Optional[JobRuntimeData]
     error: Optional[str] = None
+    probes: list[Probe] = []
 
     @property
     def age(self) -> timedelta:
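
A sketch of the new probe models in use (field values are illustrative; the time units of timeout/interval and the exact relation between ready_after and Probe.success_streak are assumptions, not spelled out in this diff):

from dstack._internal.core.models.runs import Probe, ProbeSpec

spec = ProbeSpec(
    type="http",
    url="/health",      # illustrative endpoint
    timeout=10,         # presumably seconds
    interval=15,        # presumably seconds
    ready_after=3,
)

def probe_ready(probe: Probe, spec: ProbeSpec) -> bool:
    # Hypothetical helper: treat a probe as ready once its streak of
    # consecutive successful checks reaches the configured threshold.
    return probe.success_streak >= spec.ready_after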
dstack/_internal/core/services/ssh/tunnel.py:

@@ -236,6 +236,13 @@ class SSHTunnel:
     def __exit__(self, exc_type, exc_val, exc_tb):
         self.close()
 
+    async def __aenter__(self):
+        await self.aopen()
+        return self
+
+    async def __aexit__(self, exc_type, exc_val, exc_tb):
+        await self.aclose()
+
     def _get_proxy_command(self) -> Optional[str]:
         proxy_command: Optional[str] = None
         for params, identity_path in self.ssh_proxies:
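
With __aenter__/__aexit__ added, an SSHTunnel can now be used as an async context manager; a rough usage sketch (constructor arguments are omitted since they are outside this diff):

async def query_through_tunnel(tunnel: SSHTunnel) -> None:
    # Equivalent to calling `await tunnel.aopen()` and `await tunnel.aclose()` manually.
    async with tunnel:
        ...  # talk to the remote endpoint through the forwarded port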
dstack/_internal/server/app.py:

@@ -13,6 +13,7 @@ from fastapi.datastructures import URL
 from fastapi.responses import HTMLResponse, RedirectResponse
 from fastapi.staticfiles import StaticFiles
 from prometheus_client import Counter, Histogram
+from sentry_sdk.types import SamplingContext
 
 from dstack._internal.cli.utils.common import console
 from dstack._internal.core.errors import ForbiddenError, ServerClientError
@@ -21,6 +22,7 @@ from dstack._internal.proxy.lib.deps import get_injector_from_app
 from dstack._internal.proxy.lib.routers import model_proxy
 from dstack._internal.server import settings
 from dstack._internal.server.background import start_background_tasks
+from dstack._internal.server.background.tasks.process_probes import PROBES_SCHEDULER
 from dstack._internal.server.db import get_db, get_session_ctx, migrate
 from dstack._internal.server.routers import (
     backends,
@@ -81,16 +83,6 @@ REQUEST_DURATION = Histogram(
 
 
 def create_app() -> FastAPI:
-    if settings.SENTRY_DSN is not None:
-        sentry_sdk.init(
-            dsn=settings.SENTRY_DSN,
-            release=DSTACK_VERSION,
-            environment=settings.SERVER_ENVIRONMENT,
-            enable_tracing=True,
-            traces_sample_rate=settings.SENTRY_TRACES_SAMPLE_RATE,
-            profiles_sample_rate=settings.SENTRY_PROFILES_SAMPLE_RATE,
-        )
-
     app = FastAPI(
         docs_url="/api/docs",
         lifespan=lifespan,
@@ -102,6 +94,15 @@ def create_app() -> FastAPI:
 @asynccontextmanager
 async def lifespan(app: FastAPI):
     configure_logging()
+    if settings.SENTRY_DSN is not None:
+        sentry_sdk.init(
+            dsn=settings.SENTRY_DSN,
+            release=DSTACK_VERSION,
+            environment=settings.SERVER_ENVIRONMENT,
+            enable_tracing=True,
+            traces_sampler=_sentry_traces_sampler,
+            profiles_sample_rate=settings.SENTRY_PROFILES_SAMPLE_RATE,
+        )
     server_executor = ThreadPoolExecutor(max_workers=settings.SERVER_EXECUTOR_MAX_WORKERS)
     asyncio.get_running_loop().set_default_executor(server_executor)
     await migrate()
@@ -155,6 +156,7 @@ async def lifespan(app: FastAPI):
         scheduler = start_background_tasks()
     else:
         logger.info("Background processing is disabled")
+    PROBES_SCHEDULER.start()
     dstack_version = DSTACK_VERSION if DSTACK_VERSION else "(no version)"
     logger.info(f"The admin token is {admin.token.get_plaintext_or_error()}", {"show_path": False})
     logger.info(
@@ -166,6 +168,7 @@ async def lifespan(app: FastAPI):
     yield
     if settings.SERVER_BACKGROUND_PROCESSING_ENABLED:
         scheduler.shutdown()
+    PROBES_SCHEDULER.shutdown(wait=False)
     await gateway_connections_pool.remove_all()
     service_conn_pool = await get_injector_from_app(app).get_service_connection_pool()
     await service_conn_pool.remove_all()
@@ -197,6 +200,7 @@ def register_routes(app: FastAPI, ui: bool = True):
     app.include_router(fleets.root_router)
     app.include_router(fleets.project_router)
     app.include_router(instances.root_router)
+    app.include_router(instances.project_router)
     app.include_router(repos.router)
     app.include_router(runs.root_router)
     app.include_router(runs.project_router)
@@ -379,3 +383,15 @@ def _print_dstack_logo():
 ╰━━┻━━┻╯╱╰╯╰━━┻╯
 [/]"""
     )
+
+
+def _sentry_traces_sampler(sampling_context: SamplingContext) -> float:
+    parent_sampling_decision = sampling_context["parent_sampled"]
+    if parent_sampling_decision is not None:
+        return float(parent_sampling_decision)
+    transaction_context = sampling_context["transaction_context"]
+    name = transaction_context.get("name")
+    if name is not None:
+        if name.startswith("background."):
+            return settings.SENTRY_TRACES_BACKGROUND_SAMPLE_RATE
+    return settings.SENTRY_TRACES_SAMPLE_RATE
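
The sampler gives background-task transactions their own rate while everything else keeps the regular one, and an upstream sampling decision always wins. A small illustration (the dict literals stand in for the SamplingContext passed by sentry_sdk, and the "background.*" transaction naming is assumed to come from the task instrumentation):

# Background transactions: sampled at the dedicated background rate.
_sentry_traces_sampler(
    {"parent_sampled": None, "transaction_context": {"name": "background.process_probes"}}
)  # -> settings.SENTRY_TRACES_BACKGROUND_SAMPLE_RATE

# Everything else: the regular rate.
_sentry_traces_sampler(
    {"parent_sampled": None, "transaction_context": {"name": "GET /api/projects"}}
)  # -> settings.SENTRY_TRACES_SAMPLE_RATE

# A parent decision short-circuits to 1.0 or 0.0.
_sentry_traces_sampler(
    {"parent_sampled": True, "transaction_context": {"name": "anything"}}
)  # -> 1.0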
dstack/_internal/server/background/__init__.py:

@@ -9,6 +9,7 @@ from dstack._internal.server.background.tasks.process_gateways import (
 )
 from dstack._internal.server.background.tasks.process_idle_volumes import process_idle_volumes
 from dstack._internal.server.background.tasks.process_instances import (
+    delete_instance_health_checks,
     process_instances,
 )
 from dstack._internal.server.background.tasks.process_metrics import (
@@ -18,6 +19,7 @@ from dstack._internal.server.background.tasks.process_metrics import (
 from dstack._internal.server.background.tasks.process_placement_groups import (
     process_placement_groups,
 )
+from dstack._internal.server.background.tasks.process_probes import process_probes
 from dstack._internal.server.background.tasks.process_prometheus_metrics import (
     collect_prometheus_metrics,
     delete_prometheus_metrics,
@@ -63,6 +65,7 @@ def start_background_tasks() -> AsyncIOScheduler:
     # that the first waiting for the lock will acquire it.
     # The jitter is needed to give all tasks a chance to acquire locks.
 
+    _scheduler.add_job(process_probes, IntervalTrigger(seconds=3, jitter=1))
     _scheduler.add_job(collect_metrics, IntervalTrigger(seconds=10), max_instances=1)
     _scheduler.add_job(delete_metrics, IntervalTrigger(minutes=5), max_instances=1)
     if settings.ENABLE_PROMETHEUS_METRICS:
@@ -79,6 +82,12 @@ def start_background_tasks() -> AsyncIOScheduler:
         process_idle_volumes, IntervalTrigger(seconds=60, jitter=10), max_instances=1
     )
     _scheduler.add_job(process_placement_groups, IntervalTrigger(seconds=30, jitter=5))
+    _scheduler.add_job(
+        process_fleets,
+        IntervalTrigger(seconds=10, jitter=2),
+        max_instances=1,
+    )
+    _scheduler.add_job(delete_instance_health_checks, IntervalTrigger(minutes=5), max_instances=1)
     for replica in range(settings.SERVER_BACKGROUND_PROCESSING_FACTOR):
         # Add multiple copies of tasks if requested.
         # max_instances=1 for additional copies to avoid running too many tasks.
@@ -113,11 +122,5 @@ def start_background_tasks() -> AsyncIOScheduler:
             kwargs={"batch_size": 5},
             max_instances=2 if replica == 0 else 1,
         )
-        _scheduler.add_job(
-            process_fleets,
-            IntervalTrigger(seconds=10, jitter=2),
-            kwargs={"batch_size": 5},
-            max_instances=2 if replica == 0 else 1,
-        )
     _scheduler.start()
     return _scheduler
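
The scheduling pattern above is plain APScheduler: coroutines registered on an AsyncIOScheduler with an IntervalTrigger, jitter to de-synchronize lock contention, and max_instances to avoid overlapping runs. A self-contained sketch of the same pattern (the task body is a stand-in):

from apscheduler.schedulers.asyncio import AsyncIOScheduler
from apscheduler.triggers.interval import IntervalTrigger

async def example_task() -> None:
    ...  # stand-in for a task such as process_probes or process_fleets

scheduler = AsyncIOScheduler()
# Fire roughly every 3 seconds, shifted randomly by up to 1 second,
# and never run two copies of the task at the same time.
scheduler.add_job(example_task, IntervalTrigger(seconds=3, jitter=1), max_instances=1)
scheduler.start()  # requires a running asyncio event loop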
dstack/_internal/server/background/tasks/process_fleets.py:

@@ -1,36 +1,37 @@
-import asyncio
 from datetime import timedelta
+from typing import List
 
-from sqlalchemy import select
+from sqlalchemy import select, update
 from sqlalchemy.ext.asyncio import AsyncSession
-from sqlalchemy.orm import joinedload
+from sqlalchemy.orm import joinedload, load_only
 
 from dstack._internal.core.models.fleets import FleetStatus
 from dstack._internal.server.db import get_db, get_session_ctx
-from dstack._internal.server.models import FleetModel
+from dstack._internal.server.models import (
+    FleetModel,
+    InstanceModel,
+    JobModel,
+    PlacementGroupModel,
+    RunModel,
+)
 from dstack._internal.server.services.fleets import (
     is_fleet_empty,
     is_fleet_in_use,
 )
 from dstack._internal.server.services.locking import get_locker
-from dstack._internal.server.services.placement import schedule_fleet_placement_groups_deletion
+from dstack._internal.server.utils import sentry_utils
 from dstack._internal.utils.common import get_current_datetime
 from dstack._internal.utils.logging import get_logger
 
 logger = get_logger(__name__)
 
 
+BATCH_SIZE = 10
 MIN_PROCESSING_INTERVAL = timedelta(seconds=30)
 
 
-async def process_fleets(batch_size: int = 1):
-    tasks = []
-    for _ in range(batch_size):
-        tasks.append(_process_next_fleet())
-    await asyncio.gather(*tasks)
-
-
-async def _process_next_fleet():
+@sentry_utils.instrument_background_task
+async def process_fleets():
     lock, lockset = get_locker(get_db().dialect_name).get_lockset(FleetModel.__tablename__)
     async with get_session_ctx() as session:
         async with lock:
@@ -40,51 +41,64 @@ async def _process_next_fleet():
                     FleetModel.deleted == False,
                     FleetModel.id.not_in(lockset),
                     FleetModel.last_processed_at
-                    < get_current_datetime().replace(tzinfo=None) - MIN_PROCESSING_INTERVAL,
+                    < get_current_datetime() - MIN_PROCESSING_INTERVAL,
                 )
+                .options(load_only(FleetModel.id))
                 .order_by(FleetModel.last_processed_at.asc())
-                .limit(1)
+                .limit(BATCH_SIZE)
                 .with_for_update(skip_locked=True, key_share=True)
             )
-            fleet_model = res.scalar()
-            if fleet_model is None:
-                return
-            lockset.add(fleet_model.id)
+            fleet_models = list(res.scalars().all())
+            fleet_ids = [fm.id for fm in fleet_models]
+            for fleet_id in fleet_ids:
+                lockset.add(fleet_id)
             try:
-                fleet_model_id = fleet_model.id
-                await _process_fleet(session=session, fleet_model=fleet_model)
+                await _process_fleets(session=session, fleet_models=fleet_models)
             finally:
-                lockset.difference_update([fleet_model_id])
+                lockset.difference_update(fleet_ids)
 
 
-async def _process_fleet(session: AsyncSession, fleet_model: FleetModel):
-    logger.debug("Processing fleet %s", fleet_model.name)
+async def _process_fleets(session: AsyncSession, fleet_models: List[FleetModel]):
+    fleet_ids = [fm.id for fm in fleet_models]
     # Refetch to load related attributes.
-    # joinedload produces LEFT OUTER JOIN that can't be used with FOR UPDATE.
     res = await session.execute(
         select(FleetModel)
-        .where(FleetModel.id == fleet_model.id)
-        .options(joinedload(FleetModel.project))
-        .options(joinedload(FleetModel.instances))
-        .options(joinedload(FleetModel.runs))
+        .where(FleetModel.id.in_(fleet_ids))
+        .options(joinedload(FleetModel.instances).load_only(InstanceModel.deleted))
+        .options(
+            joinedload(FleetModel.instances).joinedload(InstanceModel.jobs).load_only(JobModel.id)
+        )
+        .options(joinedload(FleetModel.runs).load_only(RunModel.status))
         .execution_options(populate_existing=True)
     )
-    fleet_model = res.unique().scalar_one()
-    await _autodelete_fleet(session=session, fleet_model=fleet_model)
+    fleet_models = list(res.unique().scalars().all())
+
+    deleted_fleets_ids = []
+    now = get_current_datetime()
+    for fleet_model in fleet_models:
+        deleted = _autodelete_fleet(fleet_model)
+        if deleted:
+            deleted_fleets_ids.append(fleet_model.id)
+        fleet_model.last_processed_at = now
+
+    await session.execute(
+        update(PlacementGroupModel)
+        .where(
+            PlacementGroupModel.fleet_id.in_(deleted_fleets_ids),
+        )
+        .values(fleet_deleted=True)
+    )
+    await session.commit()
 
 
-async def _autodelete_fleet(session: AsyncSession, fleet_model: FleetModel):
+def _autodelete_fleet(fleet_model: FleetModel) -> bool:
     # Currently all empty fleets are autodeleted.
     # TODO: If fleets with `nodes: 0..` are supported, their deletion should be skipped.
     if is_fleet_in_use(fleet_model) or not is_fleet_empty(fleet_model):
-        fleet_model.last_processed_at = get_current_datetime()
-        await session.commit()
-        return
+        return False
 
     logger.info("Automatic cleanup of an empty fleet %s", fleet_model.name)
     fleet_model.status = FleetStatus.TERMINATED
     fleet_model.deleted = True
-    fleet_model.last_processed_at = get_current_datetime()
-    await schedule_fleet_placement_groups_deletion(session=session, fleet_id=fleet_model.id)
-    await session.commit()
     logger.info("Fleet %s deleted", fleet_model.name)
+    return True
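
The rewritten task claims up to BATCH_SIZE fleets per tick with SELECT ... FOR UPDATE SKIP LOCKED, processes them in one transaction, and flags placement groups of deleted fleets with a single bulk UPDATE. The generic claim idiom it instantiates looks roughly like this (a sketch; it assumes the model has id and last_processed_at columns):

from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy.orm import load_only


async def claim_batch(session: AsyncSession, model, batch_size: int):
    # Lock up to `batch_size` rows, skipping rows already locked by another
    # worker, so concurrent server replicas never process the same row twice.
    res = await session.execute(
        select(model)
        .options(load_only(model.id))
        .order_by(model.last_processed_at.asc())
        .limit(batch_size)
        .with_for_update(skip_locked=True, key_share=True)
    )
    return list(res.scalars().all())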
dstack/_internal/server/background/tasks/process_gateways.py:

@@ -17,6 +17,7 @@ from dstack._internal.server.services.gateways import (
 )
 from dstack._internal.server.services.locking import advisory_lock_ctx, get_locker
 from dstack._internal.server.services.logging import fmt
+from dstack._internal.server.utils import sentry_utils
 from dstack._internal.utils.common import get_current_datetime
 from dstack._internal.utils.logging import get_logger
 
@@ -28,6 +29,7 @@ async def process_gateways_connections():
     await _process_active_connections()
 
 
+@sentry_utils.instrument_background_task
 async def process_gateways():
     lock, lockset = get_locker(get_db().dialect_name).get_lockset(GatewayModel.__tablename__)
     async with get_session_ctx() as session:
@@ -110,7 +112,6 @@ async def _process_connection(conn: GatewayConnection):
 async def _process_submitted_gateway(session: AsyncSession, gateway_model: GatewayModel):
     logger.info("%s: started gateway provisioning", fmt(gateway_model))
     # Refetch to load related attributes.
-    # joinedload produces LEFT OUTER JOIN that can't be used with FOR UPDATE.
     res = await session.execute(
         select(GatewayModel)
         .where(GatewayModel.id == gateway_model.id)
@@ -157,7 +158,6 @@ async def _process_provisioning_gateway(
     session: AsyncSession, gateway_model: GatewayModel
 ) -> None:
     # Refetch to load related attributes.
-    # joinedload produces LEFT OUTER JOIN that can't be used with FOR UPDATE.
     res = await session.execute(
         select(GatewayModel)
         .where(GatewayModel.id == gateway_model.id)
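
process_gateways, process_fleets, and process_idle_volumes are now wrapped in sentry_utils.instrument_background_task (from dstack/_internal/server/utils/sentry_utils.py, which is not shown in these hunks). Its body is not part of this diff; a hypothetical reconstruction, consistent with the "background." prefix that _sentry_traces_sampler keys on, might look like:

import functools

import sentry_sdk


def instrument_background_task(func):
    # Hypothetical sketch: start a Sentry transaction named after the task so
    # that the traces sampler can apply the background sample rate.
    @functools.wraps(func)
    async def wrapper(*args, **kwargs):
        with sentry_sdk.start_transaction(op="background.task", name=f"background.{func.__name__}"):
            return await func(*args, **kwargs)

    return wrapper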
dstack/_internal/server/background/tasks/process_idle_volumes.py:

@@ -10,13 +10,14 @@ from dstack._internal.core.errors import BackendNotAvailable
 from dstack._internal.core.models.profiles import parse_duration
 from dstack._internal.core.models.volumes import VolumeStatus
 from dstack._internal.server.db import get_db, get_session_ctx
-from dstack._internal.server.models import ProjectModel, VolumeModel
+from dstack._internal.server.models import ProjectModel, UserModel, VolumeModel
 from dstack._internal.server.services import backends as backends_services
 from dstack._internal.server.services.locking import get_locker
 from dstack._internal.server.services.volumes import (
     get_volume_configuration,
     volume_model_to_volume,
 )
+from dstack._internal.server.utils import sentry_utils
 from dstack._internal.utils import common
 from dstack._internal.utils.common import get_current_datetime
 from dstack._internal.utils.logging import get_logger
@@ -24,6 +25,7 @@ from dstack._internal.utils.logging import get_logger
 logger = get_logger(__name__)
 
 
+@sentry_utils.instrument_background_task
 async def process_idle_volumes():
     lock, lockset = get_locker(get_db().dialect_name).get_lockset(VolumeModel.__tablename__)
     async with get_session_ctx() as session:
@@ -49,7 +51,7 @@ async def process_idle_volumes():
         select(VolumeModel)
         .where(VolumeModel.id.in_(volume_ids))
         .options(joinedload(VolumeModel.project).joinedload(ProjectModel.backends))
-        .options(joinedload(VolumeModel.user))
+        .options(joinedload(VolumeModel.user).load_only(UserModel.name))
         .options(joinedload(VolumeModel.attachments))
         .execution_options(populate_existing=True)
     )
@@ -82,8 +84,7 @@ def _should_delete_volume(volume: VolumeModel) -> bool:
 
 def _get_idle_time(volume: VolumeModel) -> datetime.timedelta:
     last_used = volume.last_job_processed_at or volume.created_at
-    last_used_utc = last_used.replace(tzinfo=datetime.timezone.utc)
-    idle_time = get_current_datetime() - last_used_utc
+    idle_time = get_current_datetime() - last_used
     return max(idle_time, datetime.timedelta(0))
 
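
The _get_idle_time change drops the manual tzinfo patching, which only works if both operands are timezone-aware (mixing naive and aware datetimes raises TypeError); presumably get_current_datetime() and the stored timestamps are now both aware. A quick stdlib illustration:

from datetime import datetime, timezone

aware_now = datetime.now(timezone.utc)
aware_then = datetime(2024, 1, 1, tzinfo=timezone.utc)
naive_then = datetime(2024, 1, 1)

print(aware_now - aware_then)  # fine: both aware, result is a timedelta
# aware_now - naive_then       # TypeError: can't subtract offset-naive and offset-aware datetimes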