dstack 0.19.19__py3-none-any.whl → 0.19.21__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of dstack might be problematic. Click here for more details.

Files changed (54)
  1. dstack/_internal/core/backends/__init__.py +0 -65
  2. dstack/_internal/core/backends/cloudrift/api_client.py +13 -1
  3. dstack/_internal/core/backends/features.py +64 -0
  4. dstack/_internal/core/backends/oci/resources.py +5 -5
  5. dstack/_internal/core/compatibility/fleets.py +2 -0
  6. dstack/_internal/core/compatibility/runs.py +4 -0
  7. dstack/_internal/core/models/profiles.py +37 -0
  8. dstack/_internal/server/app.py +22 -10
  9. dstack/_internal/server/background/__init__.py +5 -6
  10. dstack/_internal/server/background/tasks/process_fleets.py +52 -38
  11. dstack/_internal/server/background/tasks/process_gateways.py +2 -2
  12. dstack/_internal/server/background/tasks/process_idle_volumes.py +5 -4
  13. dstack/_internal/server/background/tasks/process_instances.py +62 -48
  14. dstack/_internal/server/background/tasks/process_metrics.py +9 -2
  15. dstack/_internal/server/background/tasks/process_placement_groups.py +2 -0
  16. dstack/_internal/server/background/tasks/process_prometheus_metrics.py +14 -2
  17. dstack/_internal/server/background/tasks/process_running_jobs.py +129 -124
  18. dstack/_internal/server/background/tasks/process_runs.py +63 -20
  19. dstack/_internal/server/background/tasks/process_submitted_jobs.py +12 -10
  20. dstack/_internal/server/background/tasks/process_terminating_jobs.py +12 -4
  21. dstack/_internal/server/background/tasks/process_volumes.py +4 -1
  22. dstack/_internal/server/migrations/versions/50dd7ea98639_index_status_columns.py +55 -0
  23. dstack/_internal/server/migrations/versions/ec02a26a256c_add_runmodel_next_triggered_at.py +38 -0
  24. dstack/_internal/server/models.py +16 -16
  25. dstack/_internal/server/schemas/logs.py +1 -9
  26. dstack/_internal/server/services/fleets.py +19 -10
  27. dstack/_internal/server/services/gateways/__init__.py +17 -17
  28. dstack/_internal/server/services/instances.py +10 -14
  29. dstack/_internal/server/services/jobs/__init__.py +10 -12
  30. dstack/_internal/server/services/logs/aws.py +45 -3
  31. dstack/_internal/server/services/logs/filelog.py +121 -11
  32. dstack/_internal/server/services/offers.py +3 -3
  33. dstack/_internal/server/services/projects.py +35 -15
  34. dstack/_internal/server/services/prometheus/client_metrics.py +3 -0
  35. dstack/_internal/server/services/prometheus/custom_metrics.py +22 -3
  36. dstack/_internal/server/services/runs.py +74 -34
  37. dstack/_internal/server/services/services/__init__.py +4 -1
  38. dstack/_internal/server/services/users.py +2 -3
  39. dstack/_internal/server/services/volumes.py +11 -11
  40. dstack/_internal/server/settings.py +3 -0
  41. dstack/_internal/server/statics/index.html +1 -1
  42. dstack/_internal/server/statics/{main-64f8273740c4b52c18f5.js → main-39a767528976f8078166.js} +7 -26
  43. dstack/_internal/server/statics/{main-64f8273740c4b52c18f5.js.map → main-39a767528976f8078166.js.map} +1 -1
  44. dstack/_internal/server/statics/{main-d58fc0460cb0eae7cb5c.css → main-8f9ee218d3eb45989682.css} +2 -2
  45. dstack/_internal/server/testing/common.py +7 -0
  46. dstack/_internal/server/utils/sentry_utils.py +12 -0
  47. dstack/_internal/utils/common.py +10 -21
  48. dstack/_internal/utils/cron.py +5 -0
  49. dstack/version.py +1 -1
  50. {dstack-0.19.19.dist-info → dstack-0.19.21.dist-info}/METADATA +2 -11
  51. {dstack-0.19.19.dist-info → dstack-0.19.21.dist-info}/RECORD +54 -49
  52. {dstack-0.19.19.dist-info → dstack-0.19.21.dist-info}/WHEEL +0 -0
  53. {dstack-0.19.19.dist-info → dstack-0.19.21.dist-info}/entry_points.txt +0 -0
  54. {dstack-0.19.19.dist-info → dstack-0.19.21.dist-info}/licenses/LICENSE.md +0 -0
@@ -2,9 +2,9 @@ import asyncio
2
2
  import datetime
3
3
  from typing import List, Optional, Set, Tuple
4
4
 
5
- from sqlalchemy import select
5
+ from sqlalchemy import and_, or_, select
6
6
  from sqlalchemy.ext.asyncio import AsyncSession
7
- from sqlalchemy.orm import joinedload, selectinload
7
+ from sqlalchemy.orm import joinedload, load_only, selectinload
8
8
 
9
9
  import dstack._internal.server.services.services.autoscalers as autoscalers
10
10
  from dstack._internal.core.errors import ServerError
@@ -20,7 +20,13 @@ from dstack._internal.core.models.runs import (
20
20
  RunTerminationReason,
21
21
  )
22
22
  from dstack._internal.server.db import get_db, get_session_ctx
23
- from dstack._internal.server.models import JobModel, ProjectModel, RunModel
23
+ from dstack._internal.server.models import (
24
+ InstanceModel,
25
+ JobModel,
26
+ ProjectModel,
27
+ RunModel,
28
+ UserModel,
29
+ )
24
30
  from dstack._internal.server.services.jobs import (
25
31
  find_job,
26
32
  get_job_specs_from_run_spec,
@@ -37,6 +43,7 @@ from dstack._internal.server.services.runs import (
37
43
  )
38
44
  from dstack._internal.server.services.secrets import get_project_secrets_mapping
39
45
  from dstack._internal.server.services.services import update_service_desired_replica_count
46
+ from dstack._internal.server.utils import sentry_utils
40
47
  from dstack._internal.utils import common
41
48
  from dstack._internal.utils.logging import get_logger
42
49
 
@@ -53,22 +60,54 @@ async def process_runs(batch_size: int = 1):
53
60
  await asyncio.gather(*tasks)
54
61
 
55
62
 
63
+ @sentry_utils.instrument_background_task
56
64
  async def _process_next_run():
57
65
  run_lock, run_lockset = get_locker(get_db().dialect_name).get_lockset(RunModel.__tablename__)
58
66
  job_lock, job_lockset = get_locker(get_db().dialect_name).get_lockset(JobModel.__tablename__)
67
+ now = common.get_current_datetime()
59
68
  async with get_session_ctx() as session:
60
69
  async with run_lock, job_lock:
61
70
  res = await session.execute(
62
71
  select(RunModel)
63
72
  .where(
64
- RunModel.status.not_in(RunStatus.finished_statuses()),
65
73
  RunModel.id.not_in(run_lockset),
66
- RunModel.last_processed_at
67
- < common.get_current_datetime().replace(tzinfo=None) - MIN_PROCESSING_INTERVAL,
74
+ RunModel.last_processed_at < now - MIN_PROCESSING_INTERVAL,
75
+ # Filter out runs that don't need to be processed.
76
+ # This is only to reduce unnecessary commits.
77
+ # Otherwise, we could fetch all active runs and filter them when processing.
78
+ or_(
79
+ # Active non-pending runs:
80
+ RunModel.status.not_in(
81
+ RunStatus.finished_statuses() + [RunStatus.PENDING]
82
+ ),
83
+ # Retrying runs:
84
+ and_(
85
+ RunModel.status == RunStatus.PENDING,
86
+ RunModel.resubmission_attempt > 0,
87
+ ),
88
+ # Scheduled ready runs:
89
+ and_(
90
+ RunModel.status == RunStatus.PENDING,
91
+ RunModel.resubmission_attempt == 0,
92
+ RunModel.next_triggered_at.is_not(None),
93
+ RunModel.next_triggered_at < now,
94
+ ),
95
+ # Scaled-to-zero runs:
96
+ # Such runs cannot be scheduled, thus we check next_triggered_at.
97
+ # If we allow scheduled services with downscaling to zero,
98
+ # this check won't pass.
99
+ and_(
100
+ RunModel.status == RunStatus.PENDING,
101
+ RunModel.resubmission_attempt == 0,
102
+ RunModel.next_triggered_at.is_(None),
103
+ ),
104
+ ),
68
105
  )
106
+ .options(joinedload(RunModel.jobs).load_only(JobModel.id))
107
+ .options(load_only(RunModel.id))
69
108
  .order_by(RunModel.last_processed_at.asc())
70
109
  .limit(1)
71
- .with_for_update(skip_locked=True, key_share=True)
110
+ .with_for_update(skip_locked=True, key_share=True, of=RunModel)
72
111
  )
73
112
  run_model = res.scalar()
74
113
  if run_model is None:
@@ -98,20 +137,22 @@ async def _process_next_run():
98
137
 
99
138
 
100
139
  async def _process_run(session: AsyncSession, run_model: RunModel):
101
- logger.debug("%s: processing run", fmt(run_model))
102
140
  # Refetch to load related attributes.
103
- # joinedload produces LEFT OUTER JOIN that can't be used with FOR UPDATE.
104
141
  res = await session.execute(
105
142
  select(RunModel)
106
143
  .where(RunModel.id == run_model.id)
107
144
  .execution_options(populate_existing=True)
108
- .options(joinedload(RunModel.project).joinedload(ProjectModel.backends))
109
- .options(joinedload(RunModel.user))
110
- .options(joinedload(RunModel.repo))
111
- .options(selectinload(RunModel.jobs).joinedload(JobModel.instance))
145
+ .options(joinedload(RunModel.project).load_only(ProjectModel.id, ProjectModel.name))
146
+ .options(joinedload(RunModel.user).load_only(UserModel.name))
147
+ .options(
148
+ selectinload(RunModel.jobs)
149
+ .joinedload(JobModel.instance)
150
+ .load_only(InstanceModel.fleet_id)
151
+ )
112
152
  .execution_options(populate_existing=True)
113
153
  )
114
154
  run_model = res.unique().scalar_one()
155
+ logger.debug("%s: processing run", fmt(run_model))
115
156
  try:
116
157
  if run_model.status == RunStatus.PENDING:
117
158
  await _process_pending_run(session, run_model)
@@ -135,8 +176,12 @@ async def _process_run(session: AsyncSession, run_model: RunModel):
135
176
  async def _process_pending_run(session: AsyncSession, run_model: RunModel):
136
177
  """Jobs are not created yet"""
137
178
  run = run_model_to_run(run_model)
138
- if not _pending_run_ready_for_resubmission(run_model, run):
139
- logger.debug("%s: pending run is not yet ready for resubmission", fmt(run_model))
179
+
180
+ # TODO: Do not select such runs in the first place to avoid redundant processing
181
+ if run_model.resubmission_attempt > 0 and not _retrying_run_ready_for_resubmission(
182
+ run_model, run
183
+ ):
184
+ logger.debug("%s: retrying run is not yet ready for resubmission", fmt(run_model))
140
185
  return
141
186
 
142
187
  run_model.desired_replica_count = 1
@@ -160,7 +205,7 @@ async def _process_pending_run(session: AsyncSession, run_model: RunModel):
160
205
  logger.info("%s: run status has changed PENDING -> SUBMITTED", fmt(run_model))
161
206
 
162
207
 
163
- def _pending_run_ready_for_resubmission(run_model: RunModel, run: Run) -> bool:
208
+ def _retrying_run_ready_for_resubmission(run_model: RunModel, run: Run) -> bool:
164
209
  if run.latest_job_submission is None:
165
210
  # Should not be possible
166
211
  return True
@@ -197,7 +242,7 @@ async def _process_active_run(session: AsyncSession, run_model: RunModel):
197
242
  We handle fails, scaling, and status changes.
198
243
  """
199
244
  run = run_model_to_run(run_model)
200
- run_spec = RunSpec.__response__.parse_raw(run_model.run_spec)
245
+ run_spec = run.run_spec
201
246
  retry_single_job = _can_retry_single_job(run_spec)
202
247
 
203
248
  run_statuses: Set[RunStatus] = set()
@@ -337,9 +382,7 @@ async def _process_active_run(session: AsyncSession, run_model: RunModel):
337
382
  )
338
383
  if run_model.status == RunStatus.SUBMITTED and new_status == RunStatus.PROVISIONING:
339
384
  current_time = common.get_current_datetime()
340
- submit_to_provision_duration = (
341
- current_time - run_model.submitted_at.replace(tzinfo=datetime.timezone.utc)
342
- ).total_seconds()
385
+ submit_to_provision_duration = (current_time - run_model.submitted_at).total_seconds()
343
386
  logger.info(
344
387
  "%s: run took %.2f seconds from submission to provisioning.",
345
388
  fmt(run_model),
@@ -5,7 +5,7 @@ from typing import List, Optional, Tuple
5
5
 
6
6
  from sqlalchemy import select
7
7
  from sqlalchemy.ext.asyncio import AsyncSession
8
- from sqlalchemy.orm import joinedload, lazyload, selectinload
8
+ from sqlalchemy.orm import joinedload, load_only, selectinload
9
9
 
10
10
  from dstack._internal.core.backends.base.backend import Backend
11
11
  from dstack._internal.core.backends.base.compute import ComputeWithVolumeSupport
@@ -43,6 +43,7 @@ from dstack._internal.server.models import (
43
43
  JobModel,
44
44
  ProjectModel,
45
45
  RunModel,
46
+ UserModel,
46
47
  VolumeAttachmentModel,
47
48
  VolumeModel,
48
49
  )
@@ -74,6 +75,7 @@ from dstack._internal.server.services.runs import (
74
75
  from dstack._internal.server.services.volumes import (
75
76
  volume_model_to_volume,
76
77
  )
78
+ from dstack._internal.server.utils import sentry_utils
77
79
  from dstack._internal.utils import common as common_utils
78
80
  from dstack._internal.utils import env as env_utils
79
81
  from dstack._internal.utils.logging import get_logger
@@ -108,6 +110,7 @@ def _get_effective_batch_size(batch_size: int) -> int:
108
110
  return batch_size
109
111
 
110
112
 
113
+ @sentry_utils.instrument_background_task
111
114
  async def _process_next_submitted_job():
112
115
  lock, lockset = get_locker(get_db().dialect_name).get_lockset(JobModel.__tablename__)
113
116
  async with get_session_ctx() as session:
@@ -119,6 +122,7 @@ async def _process_next_submitted_job():
119
122
  JobModel.status == JobStatus.SUBMITTED,
120
123
  JobModel.id.not_in(lockset),
121
124
  )
125
+ .options(load_only(JobModel.id))
122
126
  # Jobs are processed in FIFO sorted by priority globally,
123
127
  # thus runs from different projects can "overtake" each other by using higher priorities.
124
128
  # That's not a big problem as long as projects do not compete for the same compute resources.
@@ -151,9 +155,7 @@ async def _process_next_submitted_job():
151
155
 
152
156
 
153
157
  async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
154
- logger.debug("%s: provisioning has started", fmt(job_model))
155
158
  # Refetch to load related attributes.
156
- # joinedload produces LEFT OUTER JOIN that can't be used with FOR UPDATE.
157
159
  res = await session.execute(
158
160
  select(JobModel).where(JobModel.id == job_model.id).options(joinedload(JobModel.instance))
159
161
  )
@@ -162,15 +164,16 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
162
164
  select(RunModel)
163
165
  .where(RunModel.id == job_model.run_id)
164
166
  .options(joinedload(RunModel.project).joinedload(ProjectModel.backends))
165
- .options(joinedload(RunModel.user))
167
+ .options(joinedload(RunModel.user).load_only(UserModel.name))
166
168
  .options(joinedload(RunModel.fleet).joinedload(FleetModel.instances))
167
169
  )
168
170
  run_model = res.unique().scalar_one()
169
- project = run_model.project
170
- run_spec = RunSpec.__response__.parse_raw(run_model.run_spec)
171
- profile = run_spec.merged_profile
171
+ logger.debug("%s: provisioning has started", fmt(job_model))
172
172
 
173
+ project = run_model.project
173
174
  run = run_model_to_run(run_model)
175
+ run_spec = run.run_spec
176
+ profile = run_spec.merged_profile
174
177
  job = find_job(run.jobs, job_model.replica_num, job_model.job_num)
175
178
 
176
179
  master_job = find_job(run.jobs, job_model.replica_num, 0)
@@ -228,7 +231,6 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
228
231
  InstanceModel.deleted == False,
229
232
  InstanceModel.total_blocks > InstanceModel.busy_blocks,
230
233
  )
231
- .options(lazyload(InstanceModel.jobs))
232
234
  .order_by(InstanceModel.id) # take locks in order
233
235
  .with_for_update(key_share=True)
234
236
  )
@@ -357,9 +359,9 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
357
359
  await session.execute(
358
360
  select(VolumeModel)
359
361
  .where(VolumeModel.id.in_(volumes_ids))
360
- .options(selectinload(VolumeModel.user))
362
+ .options(joinedload(VolumeModel.user).load_only(UserModel.name))
361
363
  .order_by(VolumeModel.id) # take locks in order
362
- .with_for_update(key_share=True)
364
+ .with_for_update(key_share=True, of=VolumeModel)
363
365
  )
364
366
  async with get_locker(get_db().dialect_name).lock_ctx(VolumeModel.__tablename__, volumes_ids):
365
367
  if len(volume_models) > 0:
@@ -2,7 +2,7 @@ import asyncio
2
2
 
3
3
  from sqlalchemy import or_, select
4
4
  from sqlalchemy.ext.asyncio import AsyncSession
5
- from sqlalchemy.orm import joinedload, lazyload
5
+ from sqlalchemy.orm import joinedload
6
6
 
7
7
  from dstack._internal.core.models.runs import JobStatus
8
8
  from dstack._internal.server.db import get_db, get_session_ctx
@@ -18,7 +18,11 @@ from dstack._internal.server.services.jobs import (
18
18
  )
19
19
  from dstack._internal.server.services.locking import get_locker
20
20
  from dstack._internal.server.services.logging import fmt
21
- from dstack._internal.utils.common import get_current_datetime, get_or_error
21
+ from dstack._internal.server.utils import sentry_utils
22
+ from dstack._internal.utils.common import (
23
+ get_current_datetime,
24
+ get_or_error,
25
+ )
22
26
  from dstack._internal.utils.logging import get_logger
23
27
 
24
28
  logger = get_logger(__name__)
@@ -31,6 +35,7 @@ async def process_terminating_jobs(batch_size: int = 1):
31
35
  await asyncio.gather(*tasks)
32
36
 
33
37
 
38
+ @sentry_utils.instrument_background_task
34
39
  async def _process_next_terminating_job():
35
40
  job_lock, job_lockset = get_locker(get_db().dialect_name).get_lockset(JobModel.__tablename__)
36
41
  instance_lock, instance_lockset = get_locker(get_db().dialect_name).get_lockset(
@@ -43,7 +48,10 @@ async def _process_next_terminating_job():
43
48
  .where(
44
49
  JobModel.id.not_in(job_lockset),
45
50
  JobModel.status == JobStatus.TERMINATING,
46
- or_(JobModel.remove_at.is_(None), JobModel.remove_at < get_current_datetime()),
51
+ or_(
52
+ JobModel.remove_at.is_(None),
53
+ JobModel.remove_at < get_current_datetime(),
54
+ ),
47
55
  )
48
56
  .order_by(JobModel.last_processed_at.asc())
49
57
  .limit(1)
@@ -59,7 +67,6 @@ async def _process_next_terminating_job():
59
67
  InstanceModel.id == job_model.used_instance_id,
60
68
  InstanceModel.id.not_in(instance_lockset),
61
69
  )
62
- .options(lazyload(InstanceModel.jobs))
63
70
  .with_for_update(skip_locked=True, key_share=True)
64
71
  )
65
72
  instance_model = res.scalar()
@@ -88,6 +95,7 @@ async def _process_job(session: AsyncSession, job_model: JobModel):
88
95
  .options(
89
96
  joinedload(InstanceModel.project).joinedload(ProjectModel.backends),
90
97
  joinedload(InstanceModel.volume_attachments).joinedload(VolumeAttachmentModel.volume),
98
+ joinedload(InstanceModel.jobs).load_only(JobModel.id),
91
99
  )
92
100
  )
93
101
  instance_model = res.unique().scalar()
@@ -7,6 +7,7 @@ from dstack._internal.core.errors import BackendError, BackendNotAvailable
7
7
  from dstack._internal.core.models.volumes import VolumeStatus
8
8
  from dstack._internal.server.db import get_db, get_session_ctx
9
9
  from dstack._internal.server.models import (
10
+ FleetModel,
10
11
  InstanceModel,
11
12
  ProjectModel,
12
13
  VolumeAttachmentModel,
@@ -15,12 +16,14 @@ from dstack._internal.server.models import (
15
16
  from dstack._internal.server.services import backends as backends_services
16
17
  from dstack._internal.server.services import volumes as volumes_services
17
18
  from dstack._internal.server.services.locking import get_locker
19
+ from dstack._internal.server.utils import sentry_utils
18
20
  from dstack._internal.utils.common import get_current_datetime, run_async
19
21
  from dstack._internal.utils.logging import get_logger
20
22
 
21
23
  logger = get_logger(__name__)
22
24
 
23
25
 
26
+ @sentry_utils.instrument_background_task
24
27
  async def process_submitted_volumes():
25
28
  lock, lockset = get_locker(get_db().dialect_name).get_lockset(VolumeModel.__tablename__)
26
29
  async with get_session_ctx() as session:
@@ -49,7 +52,6 @@ async def process_submitted_volumes():
49
52
  async def _process_submitted_volume(session: AsyncSession, volume_model: VolumeModel):
50
53
  logger.info("Started submitted volume %s processing", volume_model.name)
51
54
  # Refetch to load related attributes.
52
- # joinedload produces LEFT OUTER JOIN that can't be used with FOR UPDATE.
53
55
  res = await session.execute(
54
56
  select(VolumeModel)
55
57
  .where(VolumeModel.id == volume_model.id)
@@ -59,6 +61,7 @@ async def _process_submitted_volume(session: AsyncSession, volume_model: VolumeM
59
61
  joinedload(VolumeModel.attachments)
60
62
  .joinedload(VolumeAttachmentModel.instance)
61
63
  .joinedload(InstanceModel.fleet)
64
+ .load_only(FleetModel.name)
62
65
  )
63
66
  .execution_options(populate_existing=True)
64
67
  )
@@ -0,0 +1,55 @@
1
+ """Index status columns
2
+
3
+ Revision ID: 50dd7ea98639
4
+ Revises: ec02a26a256c
5
+ Create Date: 2025-07-25 10:36:25.127923
6
+
7
+ """
8
+
9
+ from alembic import op
10
+
11
+ # revision identifiers, used by Alembic.
12
+ revision = "50dd7ea98639"
13
+ down_revision = "ec02a26a256c"
14
+ branch_labels = None
15
+ depends_on = None
16
+
17
+
18
+ def upgrade() -> None:
19
+ # ### commands auto generated by Alembic - please adjust! ###
20
+ with op.batch_alter_table("runs", schema=None) as batch_op:
21
+ batch_op.create_index(batch_op.f("ix_runs_status"), ["status"], unique=False)
22
+
23
+ with op.batch_alter_table("jobs", schema=None) as batch_op:
24
+ batch_op.create_index(batch_op.f("ix_jobs_status"), ["status"], unique=False)
25
+
26
+ with op.batch_alter_table("fleets", schema=None) as batch_op:
27
+ batch_op.create_index(batch_op.f("ix_fleets_status"), ["status"], unique=False)
28
+
29
+ with op.batch_alter_table("instances", schema=None) as batch_op:
30
+ batch_op.create_index(batch_op.f("ix_instances_status"), ["status"], unique=False)
31
+
32
+ with op.batch_alter_table("volumes", schema=None) as batch_op:
33
+ batch_op.create_index(batch_op.f("ix_volumes_status"), ["status"], unique=False)
34
+
35
+ # ### end Alembic commands ###
36
+
37
+
38
+ def downgrade() -> None:
39
+ # ### commands auto generated by Alembic - please adjust! ###
40
+ with op.batch_alter_table("runs", schema=None) as batch_op:
41
+ batch_op.drop_index(batch_op.f("ix_runs_status"))
42
+
43
+ with op.batch_alter_table("jobs", schema=None) as batch_op:
44
+ batch_op.drop_index(batch_op.f("ix_jobs_status"))
45
+
46
+ with op.batch_alter_table("fleets", schema=None) as batch_op:
47
+ batch_op.drop_index(batch_op.f("ix_fleets_status"))
48
+
49
+ with op.batch_alter_table("instances", schema=None) as batch_op:
50
+ batch_op.drop_index(batch_op.f("ix_instances_status"))
51
+
52
+ with op.batch_alter_table("volumes", schema=None) as batch_op:
53
+ batch_op.drop_index(batch_op.f("ix_volumes_status"))
54
+
55
+ # ### end Alembic commands ###
@@ -0,0 +1,38 @@
1
+ """Add RunModel.next_triggered_at
2
+
3
+ Revision ID: ec02a26a256c
4
+ Revises: d5863798bf41
5
+ Create Date: 2025-07-17 15:47:00.443217
6
+
7
+ """
8
+
9
+ import sqlalchemy as sa
10
+ from alembic import op
11
+
12
+ import dstack._internal.server.models
13
+
14
+ # revision identifiers, used by Alembic.
15
+ revision = "ec02a26a256c"
16
+ down_revision = "d5863798bf41"
17
+ branch_labels = None
18
+ depends_on = None
19
+
20
+
21
+ def upgrade() -> None:
22
+ # ### commands auto generated by Alembic - please adjust! ###
23
+ with op.batch_alter_table("runs", schema=None) as batch_op:
24
+ batch_op.add_column(
25
+ sa.Column(
26
+ "next_triggered_at", dstack._internal.server.models.NaiveDateTime(), nullable=True
27
+ )
28
+ )
29
+
30
+ # ### end Alembic commands ###
31
+
32
+
33
+ def downgrade() -> None:
34
+ # ### commands auto generated by Alembic - please adjust! ###
35
+ with op.batch_alter_table("runs", schema=None) as batch_op:
36
+ batch_op.drop_column("next_triggered_at")
37
+
38
+ # ### end Alembic commands ###
@@ -1,6 +1,6 @@
1
1
  import enum
2
2
  import uuid
3
- from datetime import datetime
3
+ from datetime import datetime, timezone
4
4
  from typing import Callable, List, Optional, Union
5
5
 
6
6
  from sqlalchemy import (
@@ -51,9 +51,10 @@ logger = get_logger(__name__)
51
51
 
52
52
  class NaiveDateTime(TypeDecorator):
53
53
  """
54
- A custom type decorator that ensures datetime objects are offset-naive when stored in the database.
55
- This is needed because we use datetimes in UTC only and store them as offset-naive.
56
- Some databases (e.g. Postgres) throw an error if the timezone is set.
54
+ A custom type decorator that ensures datetime objects are offset-naive when stored in the database
55
+ and offset-aware with UTC timezone when loaded from the database.
56
+ This is because we use datetimes in UTC everywhere, and
57
+ some databases (e.g. Postgres) throw an error if the timezone is set.
57
58
  """
58
59
 
59
60
  impl = DateTime
@@ -65,7 +66,9 @@ class NaiveDateTime(TypeDecorator):
65
66
  return value
66
67
 
67
68
  def process_result_value(self, value, dialect):
68
- return value
69
+ if value is None:
70
+ return None
71
+ return value.replace(tzinfo=timezone.utc)
69
72
 
70
73
 
71
74
  class DecryptedString(CoreModel):
@@ -355,7 +358,8 @@ class RunModel(BaseModel):
355
358
  run_name: Mapped[str] = mapped_column(String(100))
356
359
  submitted_at: Mapped[datetime] = mapped_column(NaiveDateTime)
357
360
  last_processed_at: Mapped[datetime] = mapped_column(NaiveDateTime)
358
- status: Mapped[RunStatus] = mapped_column(Enum(RunStatus))
361
+ next_triggered_at: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime)
362
+ status: Mapped[RunStatus] = mapped_column(Enum(RunStatus), index=True)
359
363
  termination_reason: Mapped[Optional[RunTerminationReason]] = mapped_column(
360
364
  Enum(RunTerminationReason)
361
365
  )
@@ -396,7 +400,7 @@ class JobModel(BaseModel):
396
400
  submission_num: Mapped[int] = mapped_column(Integer)
397
401
  submitted_at: Mapped[datetime] = mapped_column(NaiveDateTime)
398
402
  last_processed_at: Mapped[datetime] = mapped_column(NaiveDateTime)
399
- status: Mapped[JobStatus] = mapped_column(Enum(JobStatus))
403
+ status: Mapped[JobStatus] = mapped_column(Enum(JobStatus), index=True)
400
404
  termination_reason: Mapped[Optional[JobTerminationReason]] = mapped_column(
401
405
  Enum(JobTerminationReason)
402
406
  )
@@ -524,7 +528,7 @@ class FleetModel(BaseModel):
524
528
  deleted: Mapped[bool] = mapped_column(Boolean, default=False)
525
529
  deleted_at: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime)
526
530
 
527
- status: Mapped[FleetStatus] = mapped_column(Enum(FleetStatus))
531
+ status: Mapped[FleetStatus] = mapped_column(Enum(FleetStatus), index=True)
528
532
  status_message: Mapped[Optional[str]] = mapped_column(Text)
529
533
 
530
534
  spec: Mapped[str] = mapped_column(Text)
@@ -543,7 +547,6 @@ class InstanceModel(BaseModel):
543
547
 
544
548
  instance_num: Mapped[int] = mapped_column(Integer, default=0)
545
549
 
546
- # instance
547
550
  created_at: Mapped[datetime] = mapped_column(NaiveDateTime, default=get_current_datetime)
548
551
  last_processed_at: Mapped[datetime] = mapped_column(
549
552
  NaiveDateTime, default=get_current_datetime
@@ -564,7 +567,7 @@ class InstanceModel(BaseModel):
564
567
  fleet_id: Mapped[Optional[uuid.UUID]] = mapped_column(ForeignKey("fleets.id"))
565
568
  fleet: Mapped[Optional["FleetModel"]] = relationship(back_populates="instances")
566
569
 
567
- status: Mapped[InstanceStatus] = mapped_column(Enum(InstanceStatus))
570
+ status: Mapped[InstanceStatus] = mapped_column(Enum(InstanceStatus), index=True)
568
571
  unreachable: Mapped[bool] = mapped_column(Boolean)
569
572
 
570
573
  # VM
@@ -580,7 +583,6 @@ class InstanceModel(BaseModel):
580
583
  requirements: Mapped[Optional[str]] = mapped_column(Text)
581
584
  instance_configuration: Mapped[Optional[str]] = mapped_column(Text)
582
585
 
583
- # temination policy
584
586
  termination_policy: Mapped[Optional[TerminationPolicy]] = mapped_column(String(100))
585
587
  # TODO: Suggestion: do not assign DEFAULT_FLEET_TERMINATION_IDLE_TIME as the default here
586
588
  # (make Optional instead; also instead of -1)
@@ -598,11 +600,9 @@ class InstanceModel(BaseModel):
598
600
  first_termination_retry_at: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime)
599
601
  last_termination_retry_at: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime)
600
602
 
601
- # backend
602
603
  backend: Mapped[Optional[BackendType]] = mapped_column(EnumAsString(BackendType, 100))
603
604
  backend_data: Mapped[Optional[str]] = mapped_column(Text)
604
605
 
605
- # offer
606
606
  offer: Mapped[Optional[str]] = mapped_column(Text)
607
607
  region: Mapped[Optional[str]] = mapped_column(String(2000))
608
608
  price: Mapped[Optional[float]] = mapped_column(Float)
@@ -615,14 +615,14 @@ class InstanceModel(BaseModel):
615
615
  total_blocks: Mapped[Optional[int]] = mapped_column(Integer)
616
616
  busy_blocks: Mapped[int] = mapped_column(Integer, default=0)
617
617
 
618
- jobs: Mapped[list["JobModel"]] = relationship(back_populates="instance", lazy="joined")
618
+ jobs: Mapped[list["JobModel"]] = relationship(back_populates="instance")
619
619
  last_job_processed_at: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime)
620
620
 
621
621
  volume_attachments: Mapped[List["VolumeAttachmentModel"]] = relationship(
622
622
  back_populates="instance",
623
623
  # Add delete-orphan option so that removing entries from volume_attachments
624
624
  # automatically marks them for deletion.
625
- # SQLalchemy requires delete when using delete-orphan.
625
+ # SQLAlchemy requires delete when using delete-orphan.
626
626
  cascade="save-update, merge, delete-orphan, delete",
627
627
  )
628
628
 
@@ -649,7 +649,7 @@ class VolumeModel(BaseModel):
649
649
  deleted: Mapped[bool] = mapped_column(Boolean, default=False)
650
650
  deleted_at: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime)
651
651
 
652
- status: Mapped[VolumeStatus] = mapped_column(Enum(VolumeStatus))
652
+ status: Mapped[VolumeStatus] = mapped_column(Enum(VolumeStatus), index=True)
653
653
  status_message: Mapped[Optional[str]] = mapped_column(Text)
654
654
 
655
655
  configuration: Mapped[str] = mapped_column(Text)
@@ -1,7 +1,7 @@
1
1
  from datetime import datetime
2
2
  from typing import Optional
3
3
 
4
- from pydantic import UUID4, Field, validator
4
+ from pydantic import UUID4, Field
5
5
 
6
6
  from dstack._internal.core.models.common import CoreModel
7
7
 
@@ -15,11 +15,3 @@ class PollLogsRequest(CoreModel):
15
15
  next_token: Optional[str] = None
16
16
  limit: int = Field(100, ge=0, le=1000)
17
17
  diagnose: bool = False
18
-
19
- @validator("descending")
20
- @classmethod
21
- def validate_descending(cls, v):
22
- # Descending is not supported until we migrate from base64-encoded logs to plain text logs.
23
- if v is True:
24
- raise ValueError("descending: true is not supported")
25
- return v
@@ -1,6 +1,6 @@
1
1
  import uuid
2
2
  from collections.abc import Callable
3
- from datetime import datetime, timezone
3
+ from datetime import datetime
4
4
  from functools import wraps
5
5
  from typing import List, Literal, Optional, Tuple, TypeVar, Union, cast
6
6
 
@@ -8,8 +8,8 @@ from sqlalchemy import and_, func, or_, select
8
8
  from sqlalchemy.ext.asyncio import AsyncSession
9
9
  from sqlalchemy.orm import joinedload, selectinload
10
10
 
11
- from dstack._internal.core.backends import BACKENDS_WITH_CREATE_INSTANCE_SUPPORT
12
11
  from dstack._internal.core.backends.base.backend import Backend
12
+ from dstack._internal.core.backends.features import BACKENDS_WITH_CREATE_INSTANCE_SUPPORT
13
13
  from dstack._internal.core.errors import (
14
14
  ForbiddenError,
15
15
  ResourceExistsError,
@@ -49,6 +49,7 @@ from dstack._internal.server.db import get_db
49
49
  from dstack._internal.server.models import (
50
50
  FleetModel,
51
51
  InstanceModel,
52
+ JobModel,
52
53
  ProjectModel,
53
54
  UserModel,
54
55
  )
@@ -66,7 +67,6 @@ from dstack._internal.server.services.plugins import apply_plugin_policies
66
67
  from dstack._internal.server.services.projects import (
67
68
  get_member,
68
69
  get_member_permissions,
69
- list_project_models,
70
70
  list_user_project_models,
71
71
  )
72
72
  from dstack._internal.server.services.resources import set_resources_defaults
@@ -87,10 +87,11 @@ async def list_fleets(
87
87
  limit: int,
88
88
  ascending: bool,
89
89
  ) -> List[Fleet]:
90
- if user.global_role == GlobalRole.ADMIN:
91
- projects = await list_project_models(session=session)
92
- else:
93
- projects = await list_user_project_models(session=session, user=user)
90
+ projects = await list_user_project_models(
91
+ session=session,
92
+ user=user,
93
+ only_names=True,
94
+ )
94
95
  if project_name is not None:
95
96
  projects = [p for p in projects if p.name == project_name]
96
97
  fleet_models = await list_projects_fleet_models(
@@ -398,7 +399,11 @@ async def apply_plan(
398
399
  FleetModel.id == fleet_model.id,
399
400
  FleetModel.deleted == False,
400
401
  )
401
- .options(selectinload(FleetModel.instances))
402
+ .options(
403
+ selectinload(FleetModel.instances)
404
+ .joinedload(InstanceModel.jobs)
405
+ .load_only(JobModel.id)
406
+ )
402
407
  .options(selectinload(FleetModel.runs))
403
408
  .execution_options(populate_existing=True)
404
409
  .order_by(FleetModel.id) # take locks in order
@@ -563,7 +568,11 @@ async def delete_fleets(
563
568
  FleetModel.name.in_(names),
564
569
  FleetModel.deleted == False,
565
570
  )
566
- .options(selectinload(FleetModel.instances))
571
+ .options(
572
+ selectinload(FleetModel.instances)
573
+ .joinedload(InstanceModel.jobs)
574
+ .load_only(JobModel.id)
575
+ )
567
576
  .options(selectinload(FleetModel.runs))
568
577
  .execution_options(populate_existing=True)
569
578
  .order_by(FleetModel.id) # take locks in order
@@ -600,7 +609,7 @@ def fleet_model_to_fleet(
600
609
  name=fleet_model.name,
601
610
  project_name=fleet_model.project.name,
602
611
  spec=spec,
603
- created_at=fleet_model.created_at.replace(tzinfo=timezone.utc),
612
+ created_at=fleet_model.created_at,
604
613
  status=fleet_model.status,
605
614
  status_message=fleet_model.status_message,
606
615
  instances=instances,