dstack 0.19.19__py3-none-any.whl → 0.19.21__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of dstack might be problematic. More details are available on the registry's page for this release.

Files changed (54):
  1. dstack/_internal/core/backends/__init__.py +0 -65
  2. dstack/_internal/core/backends/cloudrift/api_client.py +13 -1
  3. dstack/_internal/core/backends/features.py +64 -0
  4. dstack/_internal/core/backends/oci/resources.py +5 -5
  5. dstack/_internal/core/compatibility/fleets.py +2 -0
  6. dstack/_internal/core/compatibility/runs.py +4 -0
  7. dstack/_internal/core/models/profiles.py +37 -0
  8. dstack/_internal/server/app.py +22 -10
  9. dstack/_internal/server/background/__init__.py +5 -6
  10. dstack/_internal/server/background/tasks/process_fleets.py +52 -38
  11. dstack/_internal/server/background/tasks/process_gateways.py +2 -2
  12. dstack/_internal/server/background/tasks/process_idle_volumes.py +5 -4
  13. dstack/_internal/server/background/tasks/process_instances.py +62 -48
  14. dstack/_internal/server/background/tasks/process_metrics.py +9 -2
  15. dstack/_internal/server/background/tasks/process_placement_groups.py +2 -0
  16. dstack/_internal/server/background/tasks/process_prometheus_metrics.py +14 -2
  17. dstack/_internal/server/background/tasks/process_running_jobs.py +129 -124
  18. dstack/_internal/server/background/tasks/process_runs.py +63 -20
  19. dstack/_internal/server/background/tasks/process_submitted_jobs.py +12 -10
  20. dstack/_internal/server/background/tasks/process_terminating_jobs.py +12 -4
  21. dstack/_internal/server/background/tasks/process_volumes.py +4 -1
  22. dstack/_internal/server/migrations/versions/50dd7ea98639_index_status_columns.py +55 -0
  23. dstack/_internal/server/migrations/versions/ec02a26a256c_add_runmodel_next_triggered_at.py +38 -0
  24. dstack/_internal/server/models.py +16 -16
  25. dstack/_internal/server/schemas/logs.py +1 -9
  26. dstack/_internal/server/services/fleets.py +19 -10
  27. dstack/_internal/server/services/gateways/__init__.py +17 -17
  28. dstack/_internal/server/services/instances.py +10 -14
  29. dstack/_internal/server/services/jobs/__init__.py +10 -12
  30. dstack/_internal/server/services/logs/aws.py +45 -3
  31. dstack/_internal/server/services/logs/filelog.py +121 -11
  32. dstack/_internal/server/services/offers.py +3 -3
  33. dstack/_internal/server/services/projects.py +35 -15
  34. dstack/_internal/server/services/prometheus/client_metrics.py +3 -0
  35. dstack/_internal/server/services/prometheus/custom_metrics.py +22 -3
  36. dstack/_internal/server/services/runs.py +74 -34
  37. dstack/_internal/server/services/services/__init__.py +4 -1
  38. dstack/_internal/server/services/users.py +2 -3
  39. dstack/_internal/server/services/volumes.py +11 -11
  40. dstack/_internal/server/settings.py +3 -0
  41. dstack/_internal/server/statics/index.html +1 -1
  42. dstack/_internal/server/statics/{main-64f8273740c4b52c18f5.js → main-39a767528976f8078166.js} +7 -26
  43. dstack/_internal/server/statics/{main-64f8273740c4b52c18f5.js.map → main-39a767528976f8078166.js.map} +1 -1
  44. dstack/_internal/server/statics/{main-d58fc0460cb0eae7cb5c.css → main-8f9ee218d3eb45989682.css} +2 -2
  45. dstack/_internal/server/testing/common.py +7 -0
  46. dstack/_internal/server/utils/sentry_utils.py +12 -0
  47. dstack/_internal/utils/common.py +10 -21
  48. dstack/_internal/utils/cron.py +5 -0
  49. dstack/version.py +1 -1
  50. {dstack-0.19.19.dist-info → dstack-0.19.21.dist-info}/METADATA +2 -11
  51. {dstack-0.19.19.dist-info → dstack-0.19.21.dist-info}/RECORD +54 -49
  52. {dstack-0.19.19.dist-info → dstack-0.19.21.dist-info}/WHEEL +0 -0
  53. {dstack-0.19.19.dist-info → dstack-0.19.21.dist-info}/entry_points.txt +0 -0
  54. {dstack-0.19.19.dist-info → dstack-0.19.21.dist-info}/licenses/LICENSE.md +0 -0
@@ -1,65 +0,0 @@
1
- from dstack._internal.core.backends.base.compute import (
2
- ComputeWithCreateInstanceSupport,
3
- ComputeWithGatewaySupport,
4
- ComputeWithMultinodeSupport,
5
- ComputeWithPlacementGroupSupport,
6
- ComputeWithPrivateGatewaySupport,
7
- ComputeWithReservationSupport,
8
- ComputeWithVolumeSupport,
9
- )
10
- from dstack._internal.core.backends.base.configurator import Configurator
11
- from dstack._internal.core.backends.configurators import list_available_configurator_classes
12
- from dstack._internal.core.backends.local.compute import LocalCompute
13
- from dstack._internal.core.models.backends.base import BackendType
14
- from dstack._internal.settings import LOCAL_BACKEND_ENABLED
15
-
16
-
17
- def _get_backends_with_compute_feature(
18
- configurator_classes: list[type[Configurator]],
19
- compute_feature_class: type,
20
- ) -> list[BackendType]:
21
- backend_types_and_computes = [
22
- (configurator_class.TYPE, configurator_class.BACKEND_CLASS.COMPUTE_CLASS)
23
- for configurator_class in configurator_classes
24
- ]
25
- if LOCAL_BACKEND_ENABLED:
26
- backend_types_and_computes.append((BackendType.LOCAL, LocalCompute))
27
- backend_types = []
28
- for backend_type, compute_class in backend_types_and_computes:
29
- if issubclass(compute_class, compute_feature_class):
30
- backend_types.append(backend_type)
31
- return backend_types
32
-
33
-
34
- _configurator_classes = list_available_configurator_classes()
35
-
36
-
37
- # The following backend lists do not include unavailable backends (i.e. backends missing deps).
38
- BACKENDS_WITH_CREATE_INSTANCE_SUPPORT = _get_backends_with_compute_feature(
39
- configurator_classes=_configurator_classes,
40
- compute_feature_class=ComputeWithCreateInstanceSupport,
41
- )
42
- BACKENDS_WITH_MULTINODE_SUPPORT = [BackendType.REMOTE] + _get_backends_with_compute_feature(
43
- configurator_classes=_configurator_classes,
44
- compute_feature_class=ComputeWithMultinodeSupport,
45
- )
46
- BACKENDS_WITH_PLACEMENT_GROUPS_SUPPORT = _get_backends_with_compute_feature(
47
- configurator_classes=_configurator_classes,
48
- compute_feature_class=ComputeWithPlacementGroupSupport,
49
- )
50
- BACKENDS_WITH_RESERVATION_SUPPORT = _get_backends_with_compute_feature(
51
- configurator_classes=_configurator_classes,
52
- compute_feature_class=ComputeWithReservationSupport,
53
- )
54
- BACKENDS_WITH_GATEWAY_SUPPORT = _get_backends_with_compute_feature(
55
- configurator_classes=_configurator_classes,
56
- compute_feature_class=ComputeWithGatewaySupport,
57
- )
58
- BACKENDS_WITH_PRIVATE_GATEWAY_SUPPORT = _get_backends_with_compute_feature(
59
- configurator_classes=_configurator_classes,
60
- compute_feature_class=ComputeWithPrivateGatewaySupport,
61
- )
62
- BACKENDS_WITH_VOLUMES_SUPPORT = _get_backends_with_compute_feature(
63
- configurator_classes=_configurator_classes,
64
- compute_feature_class=ComputeWithVolumeSupport,
65
- )
@@ -155,8 +155,20 @@ class RiftClient:
155
155
  logger.debug("Terminating instance with request data: %s", request_data)
156
156
  response_data = self._make_request("instances/terminate", request_data)
157
157
  if isinstance(response_data, dict):
158
+ logger.debug("Terminating instance with response: %s", response_data)
158
159
  info = response_data.get("terminated", [])
159
- return len(info) > 0
160
+ is_terminated = len(info) > 0
161
+ if not is_terminated:
162
+ # check if the instance is already terminated
163
+ instance_info = self.get_instance_by_id(instance_id)
164
+ is_terminated = instance_info is None or instance_info.get("status") == "Inactive"
165
+ logger.debug(
166
+ "Instance %s is already terminated: %s response: %s",
167
+ instance_id,
168
+ is_terminated,
169
+ instance_info,
170
+ )
171
+ return is_terminated
160
172
 
161
173
  return False
162
174
 
@@ -0,0 +1,64 @@
1
+ from dstack._internal.core.backends.base.compute import (
2
+ ComputeWithCreateInstanceSupport,
3
+ ComputeWithGatewaySupport,
4
+ ComputeWithMultinodeSupport,
5
+ ComputeWithPlacementGroupSupport,
6
+ ComputeWithPrivateGatewaySupport,
7
+ ComputeWithReservationSupport,
8
+ ComputeWithVolumeSupport,
9
+ )
10
+ from dstack._internal.core.backends.base.configurator import Configurator
11
+ from dstack._internal.core.backends.configurators import list_available_configurator_classes
12
+ from dstack._internal.core.backends.local.compute import LocalCompute
13
+ from dstack._internal.core.models.backends.base import BackendType
14
+ from dstack._internal.settings import LOCAL_BACKEND_ENABLED
15
+
16
+ _configurator_classes = list_available_configurator_classes()
17
+
18
+
19
+ def _get_backends_with_compute_feature(
20
+ configurator_classes: list[type[Configurator]],
21
+ compute_feature_class: type,
22
+ ) -> list[BackendType]:
23
+ backend_types_and_computes = [
24
+ (configurator_class.TYPE, configurator_class.BACKEND_CLASS.COMPUTE_CLASS)
25
+ for configurator_class in configurator_classes
26
+ ]
27
+ if LOCAL_BACKEND_ENABLED:
28
+ backend_types_and_computes.append((BackendType.LOCAL, LocalCompute))
29
+ backend_types = []
30
+ for backend_type, compute_class in backend_types_and_computes:
31
+ if issubclass(compute_class, compute_feature_class):
32
+ backend_types.append(backend_type)
33
+ return backend_types
34
+
35
+
36
+ # The following backend lists do not include unavailable backends (i.e. backends missing deps).
37
+ BACKENDS_WITH_CREATE_INSTANCE_SUPPORT = _get_backends_with_compute_feature(
38
+ configurator_classes=_configurator_classes,
39
+ compute_feature_class=ComputeWithCreateInstanceSupport,
40
+ )
41
+ BACKENDS_WITH_MULTINODE_SUPPORT = [BackendType.REMOTE] + _get_backends_with_compute_feature(
42
+ configurator_classes=_configurator_classes,
43
+ compute_feature_class=ComputeWithMultinodeSupport,
44
+ )
45
+ BACKENDS_WITH_PLACEMENT_GROUPS_SUPPORT = _get_backends_with_compute_feature(
46
+ configurator_classes=_configurator_classes,
47
+ compute_feature_class=ComputeWithPlacementGroupSupport,
48
+ )
49
+ BACKENDS_WITH_RESERVATION_SUPPORT = _get_backends_with_compute_feature(
50
+ configurator_classes=_configurator_classes,
51
+ compute_feature_class=ComputeWithReservationSupport,
52
+ )
53
+ BACKENDS_WITH_GATEWAY_SUPPORT = _get_backends_with_compute_feature(
54
+ configurator_classes=_configurator_classes,
55
+ compute_feature_class=ComputeWithGatewaySupport,
56
+ )
57
+ BACKENDS_WITH_PRIVATE_GATEWAY_SUPPORT = _get_backends_with_compute_feature(
58
+ configurator_classes=_configurator_classes,
59
+ compute_feature_class=ComputeWithPrivateGatewaySupport,
60
+ )
61
+ BACKENDS_WITH_VOLUMES_SUPPORT = _get_backends_with_compute_feature(
62
+ configurator_classes=_configurator_classes,
63
+ compute_feature_class=ComputeWithVolumeSupport,
64
+ )
@@ -26,7 +26,7 @@ from dstack import version
26
26
  from dstack._internal.core.backends.oci.region import OCIRegionClient
27
27
  from dstack._internal.core.errors import BackendError
28
28
  from dstack._internal.core.models.instances import InstanceOffer
29
- from dstack._internal.utils.common import split_chunks
29
+ from dstack._internal.utils.common import batched
30
30
  from dstack._internal.utils.logging import get_logger
31
31
 
32
32
  logger = get_logger(__name__)
@@ -667,21 +667,21 @@ def add_security_group_rules(
667
667
  security_group_id: str, rules: Iterable[SecurityRule], client: oci.core.VirtualNetworkClient
668
668
  ) -> None:
669
669
  rules_details = map(SecurityRule.to_sdk_add_rule_details, rules)
670
- for chunk in split_chunks(rules_details, ADD_SECURITY_RULES_MAX_CHUNK_SIZE):
670
+ for batch in batched(rules_details, ADD_SECURITY_RULES_MAX_CHUNK_SIZE):
671
671
  client.add_network_security_group_security_rules(
672
672
  security_group_id,
673
- oci.core.models.AddNetworkSecurityGroupSecurityRulesDetails(security_rules=chunk),
673
+ oci.core.models.AddNetworkSecurityGroupSecurityRulesDetails(security_rules=batch),
674
674
  )
675
675
 
676
676
 
677
677
  def remove_security_group_rules(
678
678
  security_group_id: str, rule_ids: Iterable[str], client: oci.core.VirtualNetworkClient
679
679
  ) -> None:
680
- for chunk in split_chunks(rule_ids, REMOVE_SECURITY_RULES_MAX_CHUNK_SIZE):
680
+ for batch in batched(rule_ids, REMOVE_SECURITY_RULES_MAX_CHUNK_SIZE):
681
681
  client.remove_network_security_group_security_rules(
682
682
  security_group_id,
683
683
  oci.core.models.RemoveNetworkSecurityGroupSecurityRulesDetails(
684
- security_rule_ids=chunk
684
+ security_rule_ids=batch
685
685
  ),
686
686
  )
687
687
 
@@ -57,6 +57,8 @@ def get_fleet_spec_excludes(fleet_spec: FleetSpec) -> Optional[IncludeExcludeDic
57
57
  profile_excludes.add("startup_order")
58
58
  if profile.stop_criteria is None:
59
59
  profile_excludes.add("stop_criteria")
60
+ if profile.schedule is None:
61
+ profile_excludes.add("schedule")
60
62
  if configuration_excludes:
61
63
  spec_excludes["configuration"] = configuration_excludes
62
64
  if profile_excludes:
@@ -126,6 +126,10 @@ def get_run_spec_excludes(run_spec: RunSpec) -> IncludeExcludeDictType:
126
126
  configuration_excludes["files"] = True
127
127
  if not run_spec.file_archives:
128
128
  spec_excludes["file_archives"] = True
129
+ if configuration.schedule is None:
130
+ configuration_excludes["schedule"] = True
131
+ if profile is not None and profile.schedule is None:
132
+ profile_excludes.add("schedule")
129
133
 
130
134
  if configuration_excludes:
131
135
  spec_excludes["configuration"] = configuration_excludes
@@ -8,6 +8,7 @@ from typing_extensions import Annotated, Literal
8
8
  from dstack._internal.core.models.backends.base import BackendType
9
9
  from dstack._internal.core.models.common import CoreModel, Duration
10
10
  from dstack._internal.utils.common import list_enum_values_for_annotation
11
+ from dstack._internal.utils.cron import validate_cron
11
12
  from dstack._internal.utils.json_utils import pydantic_orjson_dumps_with_indent
12
13
  from dstack._internal.utils.tags import tags_validator
13
14
 
@@ -167,6 +168,38 @@ class UtilizationPolicy(CoreModel):
167
168
  return v
168
169
 
169
170
 
171
+ class Schedule(CoreModel):
172
+ cron: Annotated[
173
+ Union[List[str], str],
174
+ Field(
175
+ description=(
176
+ "A cron expression or a list of cron expressions specifying the UTC time when the run needs to be started"
177
+ )
178
+ ),
179
+ ]
180
+
181
+ @validator("cron")
182
+ def _validate_cron(cls, v: Union[List[str], str]) -> List[str]:
183
+ if isinstance(v, str):
184
+ values = [v]
185
+ else:
186
+ values = v
187
+ if len(values) == 0:
188
+ raise ValueError("At least one cron expression must be specified")
189
+ for value in values:
190
+ validate_cron(value)
191
+ return values
192
+
193
+ @property
194
+ def crons(self) -> List[str]:
195
+ """
196
+ Access `cron` attribute as a list.
197
+ """
198
+ if isinstance(self.cron, str):
199
+ return [self.cron]
200
+ return self.cron
201
+
202
+
170
203
  class ProfileParams(CoreModel):
171
204
  backends: Annotated[
172
205
  Optional[List[BackendType]],
@@ -281,6 +314,10 @@ class ProfileParams(CoreModel):
281
314
  )
282
315
  ),
283
316
  ] = None
317
+ schedule: Annotated[
318
+ Optional[Schedule],
319
+ Field(description=("The schedule for starting the run at specified time")),
320
+ ] = None
284
321
  fleets: Annotated[
285
322
  Optional[list[str]], Field(description="The fleets considered for reuse")
286
323
  ] = None
@@ -13,6 +13,7 @@ from fastapi.datastructures import URL
13
13
  from fastapi.responses import HTMLResponse, RedirectResponse
14
14
  from fastapi.staticfiles import StaticFiles
15
15
  from prometheus_client import Counter, Histogram
16
+ from sentry_sdk.types import SamplingContext
16
17
 
17
18
  from dstack._internal.cli.utils.common import console
18
19
  from dstack._internal.core.errors import ForbiddenError, ServerClientError
@@ -81,16 +82,6 @@ REQUEST_DURATION = Histogram(
81
82
 
82
83
 
83
84
  def create_app() -> FastAPI:
84
- if settings.SENTRY_DSN is not None:
85
- sentry_sdk.init(
86
- dsn=settings.SENTRY_DSN,
87
- release=DSTACK_VERSION,
88
- environment=settings.SERVER_ENVIRONMENT,
89
- enable_tracing=True,
90
- traces_sample_rate=settings.SENTRY_TRACES_SAMPLE_RATE,
91
- profiles_sample_rate=settings.SENTRY_PROFILES_SAMPLE_RATE,
92
- )
93
-
94
85
  app = FastAPI(
95
86
  docs_url="/api/docs",
96
87
  lifespan=lifespan,
@@ -102,6 +93,15 @@ def create_app() -> FastAPI:
102
93
  @asynccontextmanager
103
94
  async def lifespan(app: FastAPI):
104
95
  configure_logging()
96
+ if settings.SENTRY_DSN is not None:
97
+ sentry_sdk.init(
98
+ dsn=settings.SENTRY_DSN,
99
+ release=DSTACK_VERSION,
100
+ environment=settings.SERVER_ENVIRONMENT,
101
+ enable_tracing=True,
102
+ traces_sampler=_sentry_traces_sampler,
103
+ profiles_sample_rate=settings.SENTRY_PROFILES_SAMPLE_RATE,
104
+ )
105
105
  server_executor = ThreadPoolExecutor(max_workers=settings.SERVER_EXECUTOR_MAX_WORKERS)
106
106
  asyncio.get_running_loop().set_default_executor(server_executor)
107
107
  await migrate()
@@ -379,3 +379,15 @@ def _print_dstack_logo():
379
379
  ╰━━┻━━┻╯╱╰╯╰━━┻╯
380
380
  [/]"""
381
381
  )
382
+
383
+
384
+ def _sentry_traces_sampler(sampling_context: SamplingContext) -> float:
385
+ parent_sampling_decision = sampling_context["parent_sampled"]
386
+ if parent_sampling_decision is not None:
387
+ return float(parent_sampling_decision)
388
+ transaction_context = sampling_context["transaction_context"]
389
+ name = transaction_context.get("name")
390
+ if name is not None:
391
+ if name.startswith("background."):
392
+ return settings.SENTRY_TRACES_BACKGROUND_SAMPLE_RATE
393
+ return settings.SENTRY_TRACES_SAMPLE_RATE
@@ -79,6 +79,11 @@ def start_background_tasks() -> AsyncIOScheduler:
79
79
  process_idle_volumes, IntervalTrigger(seconds=60, jitter=10), max_instances=1
80
80
  )
81
81
  _scheduler.add_job(process_placement_groups, IntervalTrigger(seconds=30, jitter=5))
82
+ _scheduler.add_job(
83
+ process_fleets,
84
+ IntervalTrigger(seconds=10, jitter=2),
85
+ max_instances=1,
86
+ )
82
87
  for replica in range(settings.SERVER_BACKGROUND_PROCESSING_FACTOR):
83
88
  # Add multiple copies of tasks if requested.
84
89
  # max_instances=1 for additional copies to avoid running too many tasks.
@@ -113,11 +118,5 @@ def start_background_tasks() -> AsyncIOScheduler:
113
118
  kwargs={"batch_size": 5},
114
119
  max_instances=2 if replica == 0 else 1,
115
120
  )
116
- _scheduler.add_job(
117
- process_fleets,
118
- IntervalTrigger(seconds=10, jitter=2),
119
- kwargs={"batch_size": 5},
120
- max_instances=2 if replica == 0 else 1,
121
- )
122
121
  _scheduler.start()
123
122
  return _scheduler
@@ -1,36 +1,37 @@
1
- import asyncio
2
1
  from datetime import timedelta
2
+ from typing import List
3
3
 
4
- from sqlalchemy import select
4
+ from sqlalchemy import select, update
5
5
  from sqlalchemy.ext.asyncio import AsyncSession
6
- from sqlalchemy.orm import joinedload
6
+ from sqlalchemy.orm import joinedload, load_only
7
7
 
8
8
  from dstack._internal.core.models.fleets import FleetStatus
9
9
  from dstack._internal.server.db import get_db, get_session_ctx
10
- from dstack._internal.server.models import FleetModel
10
+ from dstack._internal.server.models import (
11
+ FleetModel,
12
+ InstanceModel,
13
+ JobModel,
14
+ PlacementGroupModel,
15
+ RunModel,
16
+ )
11
17
  from dstack._internal.server.services.fleets import (
12
18
  is_fleet_empty,
13
19
  is_fleet_in_use,
14
20
  )
15
21
  from dstack._internal.server.services.locking import get_locker
16
- from dstack._internal.server.services.placement import schedule_fleet_placement_groups_deletion
22
+ from dstack._internal.server.utils import sentry_utils
17
23
  from dstack._internal.utils.common import get_current_datetime
18
24
  from dstack._internal.utils.logging import get_logger
19
25
 
20
26
  logger = get_logger(__name__)
21
27
 
22
28
 
29
+ BATCH_SIZE = 10
23
30
  MIN_PROCESSING_INTERVAL = timedelta(seconds=30)
24
31
 
25
32
 
26
- async def process_fleets(batch_size: int = 1):
27
- tasks = []
28
- for _ in range(batch_size):
29
- tasks.append(_process_next_fleet())
30
- await asyncio.gather(*tasks)
31
-
32
-
33
- async def _process_next_fleet():
33
+ @sentry_utils.instrument_background_task
34
+ async def process_fleets():
34
35
  lock, lockset = get_locker(get_db().dialect_name).get_lockset(FleetModel.__tablename__)
35
36
  async with get_session_ctx() as session:
36
37
  async with lock:
@@ -40,51 +41,64 @@ async def _process_next_fleet():
40
41
  FleetModel.deleted == False,
41
42
  FleetModel.id.not_in(lockset),
42
43
  FleetModel.last_processed_at
43
- < get_current_datetime().replace(tzinfo=None) - MIN_PROCESSING_INTERVAL,
44
+ < get_current_datetime() - MIN_PROCESSING_INTERVAL,
44
45
  )
46
+ .options(load_only(FleetModel.id))
45
47
  .order_by(FleetModel.last_processed_at.asc())
46
- .limit(1)
48
+ .limit(BATCH_SIZE)
47
49
  .with_for_update(skip_locked=True, key_share=True)
48
50
  )
49
- fleet_model = res.scalar()
50
- if fleet_model is None:
51
- return
52
- lockset.add(fleet_model.id)
51
+ fleet_models = list(res.scalars().all())
52
+ fleet_ids = [fm.id for fm in fleet_models]
53
+ for fleet_id in fleet_ids:
54
+ lockset.add(fleet_id)
53
55
  try:
54
- fleet_model_id = fleet_model.id
55
- await _process_fleet(session=session, fleet_model=fleet_model)
56
+ await _process_fleets(session=session, fleet_models=fleet_models)
56
57
  finally:
57
- lockset.difference_update([fleet_model_id])
58
+ lockset.difference_update(fleet_ids)
58
59
 
59
60
 
60
- async def _process_fleet(session: AsyncSession, fleet_model: FleetModel):
61
- logger.debug("Processing fleet %s", fleet_model.name)
61
+ async def _process_fleets(session: AsyncSession, fleet_models: List[FleetModel]):
62
+ fleet_ids = [fm.id for fm in fleet_models]
62
63
  # Refetch to load related attributes.
63
- # joinedload produces LEFT OUTER JOIN that can't be used with FOR UPDATE.
64
64
  res = await session.execute(
65
65
  select(FleetModel)
66
- .where(FleetModel.id == fleet_model.id)
67
- .options(joinedload(FleetModel.project))
68
- .options(joinedload(FleetModel.instances))
69
- .options(joinedload(FleetModel.runs))
66
+ .where(FleetModel.id.in_(fleet_ids))
67
+ .options(joinedload(FleetModel.instances).load_only(InstanceModel.deleted))
68
+ .options(
69
+ joinedload(FleetModel.instances).joinedload(InstanceModel.jobs).load_only(JobModel.id)
70
+ )
71
+ .options(joinedload(FleetModel.runs).load_only(RunModel.status))
70
72
  .execution_options(populate_existing=True)
71
73
  )
72
- fleet_model = res.unique().scalar_one()
73
- await _autodelete_fleet(session=session, fleet_model=fleet_model)
74
+ fleet_models = list(res.unique().scalars().all())
75
+
76
+ deleted_fleets_ids = []
77
+ now = get_current_datetime()
78
+ for fleet_model in fleet_models:
79
+ deleted = _autodelete_fleet(fleet_model)
80
+ if deleted:
81
+ deleted_fleets_ids.append(fleet_model.id)
82
+ fleet_model.last_processed_at = now
83
+
84
+ await session.execute(
85
+ update(PlacementGroupModel)
86
+ .where(
87
+ PlacementGroupModel.fleet_id.in_(deleted_fleets_ids),
88
+ )
89
+ .values(fleet_deleted=True)
90
+ )
91
+ await session.commit()
74
92
 
75
93
 
76
- async def _autodelete_fleet(session: AsyncSession, fleet_model: FleetModel):
94
+ def _autodelete_fleet(fleet_model: FleetModel) -> bool:
77
95
  # Currently all empty fleets are autodeleted.
78
96
  # TODO: If fleets with `nodes: 0..` are supported, their deletion should be skipped.
79
97
  if is_fleet_in_use(fleet_model) or not is_fleet_empty(fleet_model):
80
- fleet_model.last_processed_at = get_current_datetime()
81
- await session.commit()
82
- return
98
+ return False
83
99
 
84
100
  logger.info("Automatic cleanup of an empty fleet %s", fleet_model.name)
85
101
  fleet_model.status = FleetStatus.TERMINATED
86
102
  fleet_model.deleted = True
87
- fleet_model.last_processed_at = get_current_datetime()
88
- await schedule_fleet_placement_groups_deletion(session=session, fleet_id=fleet_model.id)
89
- await session.commit()
90
103
  logger.info("Fleet %s deleted", fleet_model.name)
104
+ return True
@@ -17,6 +17,7 @@ from dstack._internal.server.services.gateways import (
17
17
  )
18
18
  from dstack._internal.server.services.locking import advisory_lock_ctx, get_locker
19
19
  from dstack._internal.server.services.logging import fmt
20
+ from dstack._internal.server.utils import sentry_utils
20
21
  from dstack._internal.utils.common import get_current_datetime
21
22
  from dstack._internal.utils.logging import get_logger
22
23
 
@@ -28,6 +29,7 @@ async def process_gateways_connections():
28
29
  await _process_active_connections()
29
30
 
30
31
 
32
+ @sentry_utils.instrument_background_task
31
33
  async def process_gateways():
32
34
  lock, lockset = get_locker(get_db().dialect_name).get_lockset(GatewayModel.__tablename__)
33
35
  async with get_session_ctx() as session:
@@ -110,7 +112,6 @@ async def _process_connection(conn: GatewayConnection):
110
112
  async def _process_submitted_gateway(session: AsyncSession, gateway_model: GatewayModel):
111
113
  logger.info("%s: started gateway provisioning", fmt(gateway_model))
112
114
  # Refetch to load related attributes.
113
- # joinedload produces LEFT OUTER JOIN that can't be used with FOR UPDATE.
114
115
  res = await session.execute(
115
116
  select(GatewayModel)
116
117
  .where(GatewayModel.id == gateway_model.id)
@@ -157,7 +158,6 @@ async def _process_provisioning_gateway(
157
158
  session: AsyncSession, gateway_model: GatewayModel
158
159
  ) -> None:
159
160
  # Refetch to load related attributes.
160
- # joinedload produces LEFT OUTER JOIN that can't be used with FOR UPDATE.
161
161
  res = await session.execute(
162
162
  select(GatewayModel)
163
163
  .where(GatewayModel.id == gateway_model.id)
@@ -10,13 +10,14 @@ from dstack._internal.core.errors import BackendNotAvailable
10
10
  from dstack._internal.core.models.profiles import parse_duration
11
11
  from dstack._internal.core.models.volumes import VolumeStatus
12
12
  from dstack._internal.server.db import get_db, get_session_ctx
13
- from dstack._internal.server.models import ProjectModel, VolumeModel
13
+ from dstack._internal.server.models import ProjectModel, UserModel, VolumeModel
14
14
  from dstack._internal.server.services import backends as backends_services
15
15
  from dstack._internal.server.services.locking import get_locker
16
16
  from dstack._internal.server.services.volumes import (
17
17
  get_volume_configuration,
18
18
  volume_model_to_volume,
19
19
  )
20
+ from dstack._internal.server.utils import sentry_utils
20
21
  from dstack._internal.utils import common
21
22
  from dstack._internal.utils.common import get_current_datetime
22
23
  from dstack._internal.utils.logging import get_logger
@@ -24,6 +25,7 @@ from dstack._internal.utils.logging import get_logger
24
25
  logger = get_logger(__name__)
25
26
 
26
27
 
28
+ @sentry_utils.instrument_background_task
27
29
  async def process_idle_volumes():
28
30
  lock, lockset = get_locker(get_db().dialect_name).get_lockset(VolumeModel.__tablename__)
29
31
  async with get_session_ctx() as session:
@@ -49,7 +51,7 @@ async def process_idle_volumes():
49
51
  select(VolumeModel)
50
52
  .where(VolumeModel.id.in_(volume_ids))
51
53
  .options(joinedload(VolumeModel.project).joinedload(ProjectModel.backends))
52
- .options(joinedload(VolumeModel.user))
54
+ .options(joinedload(VolumeModel.user).load_only(UserModel.name))
53
55
  .options(joinedload(VolumeModel.attachments))
54
56
  .execution_options(populate_existing=True)
55
57
  )
@@ -82,8 +84,7 @@ def _should_delete_volume(volume: VolumeModel) -> bool:
82
84
 
83
85
  def _get_idle_time(volume: VolumeModel) -> datetime.timedelta:
84
86
  last_used = volume.last_job_processed_at or volume.created_at
85
- last_used_utc = last_used.replace(tzinfo=datetime.timezone.utc)
86
- idle_time = get_current_datetime() - last_used_utc
87
+ idle_time = get_current_datetime() - last_used
87
88
  return max(idle_time, datetime.timedelta(0))
88
89
 
89
90