dstack 0.19.19__py3-none-any.whl → 0.19.21__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: the registry has flagged this version of dstack.
- dstack/_internal/core/backends/__init__.py +0 -65
- dstack/_internal/core/backends/cloudrift/api_client.py +13 -1
- dstack/_internal/core/backends/features.py +64 -0
- dstack/_internal/core/backends/oci/resources.py +5 -5
- dstack/_internal/core/compatibility/fleets.py +2 -0
- dstack/_internal/core/compatibility/runs.py +4 -0
- dstack/_internal/core/models/profiles.py +37 -0
- dstack/_internal/server/app.py +22 -10
- dstack/_internal/server/background/__init__.py +5 -6
- dstack/_internal/server/background/tasks/process_fleets.py +52 -38
- dstack/_internal/server/background/tasks/process_gateways.py +2 -2
- dstack/_internal/server/background/tasks/process_idle_volumes.py +5 -4
- dstack/_internal/server/background/tasks/process_instances.py +62 -48
- dstack/_internal/server/background/tasks/process_metrics.py +9 -2
- dstack/_internal/server/background/tasks/process_placement_groups.py +2 -0
- dstack/_internal/server/background/tasks/process_prometheus_metrics.py +14 -2
- dstack/_internal/server/background/tasks/process_running_jobs.py +129 -124
- dstack/_internal/server/background/tasks/process_runs.py +63 -20
- dstack/_internal/server/background/tasks/process_submitted_jobs.py +12 -10
- dstack/_internal/server/background/tasks/process_terminating_jobs.py +12 -4
- dstack/_internal/server/background/tasks/process_volumes.py +4 -1
- dstack/_internal/server/migrations/versions/50dd7ea98639_index_status_columns.py +55 -0
- dstack/_internal/server/migrations/versions/ec02a26a256c_add_runmodel_next_triggered_at.py +38 -0
- dstack/_internal/server/models.py +16 -16
- dstack/_internal/server/schemas/logs.py +1 -9
- dstack/_internal/server/services/fleets.py +19 -10
- dstack/_internal/server/services/gateways/__init__.py +17 -17
- dstack/_internal/server/services/instances.py +10 -14
- dstack/_internal/server/services/jobs/__init__.py +10 -12
- dstack/_internal/server/services/logs/aws.py +45 -3
- dstack/_internal/server/services/logs/filelog.py +121 -11
- dstack/_internal/server/services/offers.py +3 -3
- dstack/_internal/server/services/projects.py +35 -15
- dstack/_internal/server/services/prometheus/client_metrics.py +3 -0
- dstack/_internal/server/services/prometheus/custom_metrics.py +22 -3
- dstack/_internal/server/services/runs.py +74 -34
- dstack/_internal/server/services/services/__init__.py +4 -1
- dstack/_internal/server/services/users.py +2 -3
- dstack/_internal/server/services/volumes.py +11 -11
- dstack/_internal/server/settings.py +3 -0
- dstack/_internal/server/statics/index.html +1 -1
- dstack/_internal/server/statics/{main-64f8273740c4b52c18f5.js → main-39a767528976f8078166.js} +7 -26
- dstack/_internal/server/statics/{main-64f8273740c4b52c18f5.js.map → main-39a767528976f8078166.js.map} +1 -1
- dstack/_internal/server/statics/{main-d58fc0460cb0eae7cb5c.css → main-8f9ee218d3eb45989682.css} +2 -2
- dstack/_internal/server/testing/common.py +7 -0
- dstack/_internal/server/utils/sentry_utils.py +12 -0
- dstack/_internal/utils/common.py +10 -21
- dstack/_internal/utils/cron.py +5 -0
- dstack/version.py +1 -1
- {dstack-0.19.19.dist-info → dstack-0.19.21.dist-info}/METADATA +2 -11
- {dstack-0.19.19.dist-info → dstack-0.19.21.dist-info}/RECORD +54 -49
- {dstack-0.19.19.dist-info → dstack-0.19.21.dist-info}/WHEEL +0 -0
- {dstack-0.19.19.dist-info → dstack-0.19.21.dist-info}/entry_points.txt +0 -0
- {dstack-0.19.19.dist-info → dstack-0.19.21.dist-info}/licenses/LICENSE.md +0 -0
dstack/_internal/server/background/tasks/process_runs.py:

```diff
@@ -2,9 +2,9 @@ import asyncio
 import datetime
 from typing import List, Optional, Set, Tuple
 
-from sqlalchemy import select
+from sqlalchemy import and_, or_, select
 from sqlalchemy.ext.asyncio import AsyncSession
-from sqlalchemy.orm import joinedload, selectinload
+from sqlalchemy.orm import joinedload, load_only, selectinload
 
 import dstack._internal.server.services.services.autoscalers as autoscalers
 from dstack._internal.core.errors import ServerError
@@ -20,7 +20,13 @@ from dstack._internal.core.models.runs import (
     RunTerminationReason,
 )
 from dstack._internal.server.db import get_db, get_session_ctx
-from dstack._internal.server.models import JobModel, RunModel
+from dstack._internal.server.models import (
+    InstanceModel,
+    JobModel,
+    ProjectModel,
+    RunModel,
+    UserModel,
+)
 from dstack._internal.server.services.jobs import (
     find_job,
     get_job_specs_from_run_spec,
@@ -37,6 +43,7 @@ from dstack._internal.server.services.runs import (
 )
 from dstack._internal.server.services.secrets import get_project_secrets_mapping
 from dstack._internal.server.services.services import update_service_desired_replica_count
+from dstack._internal.server.utils import sentry_utils
 from dstack._internal.utils import common
 from dstack._internal.utils.logging import get_logger
 
```
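Several background-task entry points in this release gain a `@sentry_utils.instrument_background_task` decorator; the new `dstack/_internal/server/utils/sentry_utils.py` helper itself (+12 lines per the file list) is not shown in this diff. One plausible shape for such a decorator, purely as an illustration, is a wrapper that opens a Sentry transaction around each task invocation:

```python
# Illustrative sketch only; the real dstack helper may differ.
import functools

import sentry_sdk


def instrument_background_task(func):
    @functools.wraps(func)
    async def wrapper(*args, **kwargs):
        # One Sentry transaction per background-task run, named after the task.
        with sentry_sdk.start_transaction(op="background.task", name=func.__qualname__):
            return await func(*args, **kwargs)

    return wrapper
```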
```diff
@@ -53,22 +60,54 @@ async def process_runs(batch_size: int = 1):
     await asyncio.gather(*tasks)
 
 
+@sentry_utils.instrument_background_task
 async def _process_next_run():
     run_lock, run_lockset = get_locker(get_db().dialect_name).get_lockset(RunModel.__tablename__)
     job_lock, job_lockset = get_locker(get_db().dialect_name).get_lockset(JobModel.__tablename__)
+    now = common.get_current_datetime()
     async with get_session_ctx() as session:
         async with run_lock, job_lock:
             res = await session.execute(
                 select(RunModel)
                 .where(
-                    RunModel.status.not_in(RunStatus.finished_statuses()),
                     RunModel.id.not_in(run_lockset),
-                    RunModel.last_processed_at
-                    < common.get_current_datetime() - MIN_PROCESSING_INTERVAL,
+                    RunModel.last_processed_at < now - MIN_PROCESSING_INTERVAL,
+                    # Filter out runs that don't need to be processed.
+                    # This is only to reduce unnecessary commits.
+                    # Otherwise, we could fetch all active runs and filter them when processing.
+                    or_(
+                        # Active non-pending runs:
+                        RunModel.status.not_in(
+                            RunStatus.finished_statuses() + [RunStatus.PENDING]
+                        ),
+                        # Retrying runs:
+                        and_(
+                            RunModel.status == RunStatus.PENDING,
+                            RunModel.resubmission_attempt > 0,
+                        ),
+                        # Scheduled ready runs:
+                        and_(
+                            RunModel.status == RunStatus.PENDING,
+                            RunModel.resubmission_attempt == 0,
+                            RunModel.next_triggered_at.is_not(None),
+                            RunModel.next_triggered_at < now,
+                        ),
+                        # Scaled-to-zero runs:
+                        # Such runs cannot be scheduled, thus we check next_triggered_at.
+                        # If we allow scheduled services with downscaling to zero,
+                        # this check won't pass.
+                        and_(
+                            RunModel.status == RunStatus.PENDING,
+                            RunModel.resubmission_attempt == 0,
+                            RunModel.next_triggered_at.is_(None),
+                        ),
+                    ),
                 )
+                .options(joinedload(RunModel.jobs).load_only(JobModel.id))
+                .options(load_only(RunModel.id))
                 .order_by(RunModel.last_processed_at.asc())
                 .limit(1)
-                .with_for_update(skip_locked=True, key_share=True)
+                .with_for_update(skip_locked=True, key_share=True, of=RunModel)
             )
             run_model = res.scalar()
             if run_model is None:
```
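The new `or_(...)` filter narrows polling to runs that can actually make progress, instead of every unfinished run. Restated in plain Python for clarity (a hypothetical helper; the string statuses stand in for `RunStatus` members):

```python
from datetime import datetime
from typing import Optional

FINISHED_STATUSES = {"done", "failed", "terminated", "aborted"}  # illustrative values


def needs_processing(
    status: str,
    resubmission_attempt: int,
    next_triggered_at: Optional[datetime],
    now: datetime,
) -> bool:
    if status not in FINISHED_STATUSES and status != "pending":
        return True  # active non-pending run
    if status == "pending" and resubmission_attempt > 0:
        return True  # retrying run
    if status == "pending" and resubmission_attempt == 0:
        if next_triggered_at is not None and next_triggered_at < now:
            return True  # scheduled run whose trigger time has passed
        if next_triggered_at is None:
            return True  # service scaled down to zero
    return False
```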
```diff
@@ -98,20 +137,22 @@ async def _process_next_run():
 
 
 async def _process_run(session: AsyncSession, run_model: RunModel):
-    logger.debug("%s: processing run", fmt(run_model))
     # Refetch to load related attributes.
-    # joinedload produces LEFT OUTER JOIN that can't be used with FOR UPDATE.
     res = await session.execute(
         select(RunModel)
         .where(RunModel.id == run_model.id)
         .execution_options(populate_existing=True)
-        .options(joinedload(RunModel.project))
-        .options(joinedload(RunModel.user))
-        .options(selectinload(RunModel.jobs).joinedload(JobModel.instance))
+        .options(joinedload(RunModel.project).load_only(ProjectModel.id, ProjectModel.name))
+        .options(joinedload(RunModel.user).load_only(UserModel.name))
+        .options(
+            selectinload(RunModel.jobs)
+            .joinedload(JobModel.instance)
+            .load_only(InstanceModel.fleet_id)
+        )
         .execution_options(populate_existing=True)
     )
     run_model = res.unique().scalar_one()
+    logger.debug("%s: processing run", fmt(run_model))
     try:
         if run_model.status == RunStatus.PENDING:
             await _process_pending_run(session, run_model)
```
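The refetch above combines three loader techniques: `joinedload` for many-to-one links (`project`, `user`), `selectinload` for the `jobs` collection, and `load_only` to fetch just the columns processing needs. A self-contained sketch of the same pattern, with generic `Run`/`Project`/`Job` stand-in models:

```python
from sqlalchemy import ForeignKey, select
from sqlalchemy.orm import (
    DeclarativeBase,
    Mapped,
    joinedload,
    load_only,
    mapped_column,
    relationship,
    selectinload,
)


class Base(DeclarativeBase):
    pass


class Project(Base):
    __tablename__ = "projects"
    id: Mapped[int] = mapped_column(primary_key=True)
    name: Mapped[str]


class Run(Base):
    __tablename__ = "runs"
    id: Mapped[int] = mapped_column(primary_key=True)
    project_id: Mapped[int] = mapped_column(ForeignKey("projects.id"))
    project: Mapped[Project] = relationship()
    jobs: Mapped[list["Job"]] = relationship(back_populates="run")


class Job(Base):
    __tablename__ = "jobs"
    id: Mapped[int] = mapped_column(primary_key=True)
    run_id: Mapped[int] = mapped_column(ForeignKey("runs.id"))
    run: Mapped[Run] = relationship(back_populates="jobs")


stmt = (
    select(Run)
    # many-to-one: same SELECT via LEFT OUTER JOIN, only id and name columns
    .options(joinedload(Run.project).load_only(Project.id, Project.name))
    # collection: a second "SELECT ... WHERE run_id IN (...)" query, ids only
    .options(selectinload(Run.jobs).load_only(Job.id))
)
print(stmt)
```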
```diff
@@ -135,8 +176,12 @@ async def _process_run(session: AsyncSession, run_model: RunModel):
 async def _process_pending_run(session: AsyncSession, run_model: RunModel):
     """Jobs are not created yet"""
     run = run_model_to_run(run_model)
-    …
-    …
+
+    # TODO: Do not select such runs in the first place to avoid redundant processing
+    if run_model.resubmission_attempt > 0 and not _retrying_run_ready_for_resubmission(
+        run_model, run
+    ):
+        logger.debug("%s: retrying run is not yet ready for resubmission", fmt(run_model))
         return
@@ -160,7 +205,7 @@ async def _process_pending_run(session: AsyncSession, run_model: RunModel):
     logger.info("%s: run status has changed PENDING -> SUBMITTED", fmt(run_model))
 
 
-def …
+def _retrying_run_ready_for_resubmission(run_model: RunModel, run: Run) -> bool:
     if run.latest_job_submission is None:
         # Should not be possible
         return True
@@ -197,7 +242,7 @@ async def _process_active_run(session: AsyncSession, run_model: RunModel):
     We handle fails, scaling, and status changes.
     """
     run = run_model_to_run(run_model)
-    run_spec = RunSpec.__response__.parse_raw(run_model.run_spec)
+    run_spec = run.run_spec
     retry_single_job = _can_retry_single_job(run_spec)
 
     run_statuses: Set[RunStatus] = set()
@@ -337,9 +382,7 @@ async def _process_active_run(session: AsyncSession, run_model: RunModel):
         )
         if run_model.status == RunStatus.SUBMITTED and new_status == RunStatus.PROVISIONING:
             current_time = common.get_current_datetime()
-            submit_to_provision_duration = (
-                current_time - run_model.submitted_at.replace(tzinfo=datetime.timezone.utc)
-            ).total_seconds()
+            submit_to_provision_duration = (current_time - run_model.submitted_at).total_seconds()
             logger.info(
                 "%s: run took %.2f seconds from submission to provisioning.",
                 fmt(run_model),
```
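The removed `.replace(tzinfo=datetime.timezone.utc)` fix-up is no longer needed because `NaiveDateTime` (see the `models.py` hunks below) now returns timezone-aware UTC datetimes. A quick illustration of why the fix-up existed:

```python
from datetime import datetime, timezone

naive = datetime(2025, 7, 25, 10, 0)                       # what columns used to return
aware = datetime(2025, 7, 25, 10, 0, tzinfo=timezone.utc)  # what they return now
now = datetime.now(timezone.utc)

print((now - aware).total_seconds())  # fine: aware minus aware
# (now - naive) would raise TypeError: naive and aware datetimes don't mix
```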
dstack/_internal/server/background/tasks/process_submitted_jobs.py:

```diff
@@ -5,7 +5,7 @@ from typing import List, Optional, Tuple
 
 from sqlalchemy import select
 from sqlalchemy.ext.asyncio import AsyncSession
-from sqlalchemy.orm import joinedload, lazyload
+from sqlalchemy.orm import joinedload, load_only, selectinload
 
 from dstack._internal.core.backends.base.backend import Backend
 from dstack._internal.core.backends.base.compute import ComputeWithVolumeSupport
@@ -43,6 +43,7 @@ from dstack._internal.server.models import (
     JobModel,
     ProjectModel,
     RunModel,
+    UserModel,
     VolumeAttachmentModel,
     VolumeModel,
 )
@@ -74,6 +75,7 @@ from dstack._internal.server.services.runs import (
 from dstack._internal.server.services.volumes import (
     volume_model_to_volume,
 )
+from dstack._internal.server.utils import sentry_utils
 from dstack._internal.utils import common as common_utils
 from dstack._internal.utils import env as env_utils
 from dstack._internal.utils.logging import get_logger
@@ -108,6 +110,7 @@ def _get_effective_batch_size(batch_size: int) -> int:
     return batch_size
 
 
+@sentry_utils.instrument_background_task
 async def _process_next_submitted_job():
    lock, lockset = get_locker(get_db().dialect_name).get_lockset(JobModel.__tablename__)
     async with get_session_ctx() as session:
@@ -119,6 +122,7 @@ async def _process_next_submitted_job():
                     JobModel.status == JobStatus.SUBMITTED,
                     JobModel.id.not_in(lockset),
                 )
+                .options(load_only(JobModel.id))
                 # Jobs are process in FIFO sorted by priority globally,
                 # thus runs from different projects can "overtake" each other by using higher priorities.
                 # That's not a big problem as long as projects do not compete for the same compute resources.
@@ -151,9 +155,7 @@ async def _process_next_submitted_job():
 
 
 async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
-    logger.debug("%s: provisioning has started", fmt(job_model))
     # Refetch to load related attributes.
-    # joinedload produces LEFT OUTER JOIN that can't be used with FOR UPDATE.
     res = await session.execute(
         select(JobModel).where(JobModel.id == job_model.id).options(joinedload(JobModel.instance))
     )
@@ -162,15 +164,16 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
         select(RunModel)
         .where(RunModel.id == job_model.run_id)
         .options(joinedload(RunModel.project).joinedload(ProjectModel.backends))
-        .options(joinedload(RunModel.user))
+        .options(joinedload(RunModel.user).load_only(UserModel.name))
         .options(joinedload(RunModel.fleet).joinedload(FleetModel.instances))
     )
     run_model = res.unique().scalar_one()
-
-    run_spec = RunSpec.__response__.parse_raw(run_model.run_spec)
-    profile = run_spec.merged_profile
+    logger.debug("%s: provisioning has started", fmt(job_model))
 
+    project = run_model.project
     run = run_model_to_run(run_model)
+    run_spec = run.run_spec
+    profile = run_spec.merged_profile
     job = find_job(run.jobs, job_model.replica_num, job_model.job_num)
 
     master_job = find_job(run.jobs, job_model.replica_num, 0)
@@ -228,7 +231,6 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
                     InstanceModel.deleted == False,
                     InstanceModel.total_blocks > InstanceModel.busy_blocks,
                 )
-                .options(lazyload(InstanceModel.jobs))
                 .order_by(InstanceModel.id)  # take locks in order
                 .with_for_update(key_share=True)
             )
@@ -357,9 +359,9 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
     await session.execute(
         select(VolumeModel)
         .where(VolumeModel.id.in_(volumes_ids))
-        .options(joinedload(VolumeModel.user))
+        .options(joinedload(VolumeModel.user).load_only(UserModel.name))
         .order_by(VolumeModel.id)  # take locks in order
-        .with_for_update(key_share=True)
+        .with_for_update(key_share=True, of=VolumeModel)
     )
     async with get_locker(get_db().dialect_name).lock_ctx(VolumeModel.__tablename__, volumes_ids):
         if len(volume_models) > 0:
```
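Both refetch queries above now pass `of=...` to `with_for_update`, so the row locks stay on the intended table even though the statement eager-loads related rows. A minimal sketch of the effect (generic models, assumed for illustration, compiled for PostgreSQL):

```python
from sqlalchemy import ForeignKey, select
from sqlalchemy.dialects import postgresql
from sqlalchemy.orm import DeclarativeBase, Mapped, joinedload, mapped_column, relationship


class Base(DeclarativeBase):
    pass


class User(Base):
    __tablename__ = "users"
    id: Mapped[int] = mapped_column(primary_key=True)


class Volume(Base):
    __tablename__ = "volumes"
    id: Mapped[int] = mapped_column(primary_key=True)
    user_id: Mapped[int] = mapped_column(ForeignKey("users.id"))
    user: Mapped[User] = relationship()


stmt = (
    select(Volume)
    .options(joinedload(Volume.user))
    .with_for_update(key_share=True, of=Volume)
)
# Renders "... FOR NO KEY UPDATE OF volumes": the users rows pulled in by the
# eager-load join are not locked; only the volumes rows are.
print(stmt.compile(dialect=postgresql.dialect()))
```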
dstack/_internal/server/background/tasks/process_terminating_jobs.py:

```diff
@@ -2,7 +2,7 @@ import asyncio
 
 from sqlalchemy import or_, select
 from sqlalchemy.ext.asyncio import AsyncSession
-from sqlalchemy.orm import joinedload, lazyload
+from sqlalchemy.orm import joinedload
 
 from dstack._internal.core.models.runs import JobStatus
 from dstack._internal.server.db import get_db, get_session_ctx
@@ -18,7 +18,11 @@ from dstack._internal.server.services.jobs import (
 )
 from dstack._internal.server.services.locking import get_locker
 from dstack._internal.server.services.logging import fmt
-from dstack._internal.utils.common import get_or_error
+from dstack._internal.server.utils import sentry_utils
+from dstack._internal.utils.common import (
+    get_current_datetime,
+    get_or_error,
+)
 from dstack._internal.utils.logging import get_logger
 
 logger = get_logger(__name__)
@@ -31,6 +35,7 @@ async def process_terminating_jobs(batch_size: int = 1):
     await asyncio.gather(*tasks)
 
 
+@sentry_utils.instrument_background_task
 async def _process_next_terminating_job():
     job_lock, job_lockset = get_locker(get_db().dialect_name).get_lockset(JobModel.__tablename__)
     instance_lock, instance_lockset = get_locker(get_db().dialect_name).get_lockset(
@@ -43,7 +48,10 @@ async def _process_next_terminating_job():
             .where(
                 JobModel.id.not_in(job_lockset),
                 JobModel.status == JobStatus.TERMINATING,
-                or_(…),
+                or_(
+                    JobModel.remove_at.is_(None),
+                    JobModel.remove_at < get_current_datetime(),
+                ),
             )
             .order_by(JobModel.last_processed_at.asc())
             .limit(1)
```
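The new `remove_at` guard defers processing of terminating jobs that have a scheduled removal time. In plain terms (a hypothetical helper for illustration):

```python
from datetime import datetime
from typing import Optional


def ready_to_process(remove_at: Optional[datetime], now: datetime) -> bool:
    # No removal delay set, or the scheduled removal time has passed.
    return remove_at is None or remove_at < now
```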
```diff
@@ -59,7 +67,6 @@ async def _process_next_terminating_job():
                 InstanceModel.id == job_model.used_instance_id,
                 InstanceModel.id.not_in(instance_lockset),
             )
-            .options(lazyload(InstanceModel.jobs))
             .with_for_update(skip_locked=True, key_share=True)
         )
         instance_model = res.scalar()
@@ -88,6 +95,7 @@ async def _process_job(session: AsyncSession, job_model: JobModel):
         .options(
             joinedload(InstanceModel.project).joinedload(ProjectModel.backends),
             joinedload(InstanceModel.volume_attachments).joinedload(VolumeAttachmentModel.volume),
+            joinedload(InstanceModel.jobs).load_only(JobModel.id),
         )
     )
     instance_model = res.unique().scalar()
```
dstack/_internal/server/background/tasks/process_volumes.py:

```diff
@@ -7,6 +7,7 @@ from dstack._internal.core.errors import BackendError, BackendNotAvailable
 from dstack._internal.core.models.volumes import VolumeStatus
 from dstack._internal.server.db import get_db, get_session_ctx
 from dstack._internal.server.models import (
+    FleetModel,
     InstanceModel,
     ProjectModel,
     VolumeAttachmentModel,
@@ -15,12 +16,14 @@ from dstack._internal.server.models import (
 from dstack._internal.server.services import backends as backends_services
 from dstack._internal.server.services import volumes as volumes_services
 from dstack._internal.server.services.locking import get_locker
+from dstack._internal.server.utils import sentry_utils
 from dstack._internal.utils.common import get_current_datetime, run_async
 from dstack._internal.utils.logging import get_logger
 
 logger = get_logger(__name__)
 
 
+@sentry_utils.instrument_background_task
 async def process_submitted_volumes():
     lock, lockset = get_locker(get_db().dialect_name).get_lockset(VolumeModel.__tablename__)
     async with get_session_ctx() as session:
@@ -49,7 +52,6 @@ async def process_submitted_volumes():
 async def _process_submitted_volume(session: AsyncSession, volume_model: VolumeModel):
     logger.info("Started submitted volume %s processing", volume_model.name)
     # Refetch to load related attributes.
-    # joinedload produces LEFT OUTER JOIN that can't be used with FOR UPDATE.
     res = await session.execute(
         select(VolumeModel)
         .where(VolumeModel.id == volume_model.id)
@@ -59,6 +61,7 @@ async def _process_submitted_volume(session: AsyncSession, volume_model: VolumeM
             joinedload(VolumeModel.attachments)
             .joinedload(VolumeAttachmentModel.instance)
             .joinedload(InstanceModel.fleet)
+            .load_only(FleetModel.name)
         )
         .execution_options(populate_existing=True)
     )
```
dstack/_internal/server/migrations/versions/50dd7ea98639_index_status_columns.py (new file):

```diff
@@ -0,0 +1,55 @@
+"""Index status columns
+
+Revision ID: 50dd7ea98639
+Revises: ec02a26a256c
+Create Date: 2025-07-25 10:36:25.127923
+
+"""
+
+from alembic import op
+
+# revision identifiers, used by Alembic.
+revision = "50dd7ea98639"
+down_revision = "ec02a26a256c"
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    with op.batch_alter_table("runs", schema=None) as batch_op:
+        batch_op.create_index(batch_op.f("ix_runs_status"), ["status"], unique=False)
+
+    with op.batch_alter_table("jobs", schema=None) as batch_op:
+        batch_op.create_index(batch_op.f("ix_jobs_status"), ["status"], unique=False)
+
+    with op.batch_alter_table("fleets", schema=None) as batch_op:
+        batch_op.create_index(batch_op.f("ix_fleets_status"), ["status"], unique=False)
+
+    with op.batch_alter_table("instances", schema=None) as batch_op:
+        batch_op.create_index(batch_op.f("ix_instances_status"), ["status"], unique=False)
+
+    with op.batch_alter_table("volumes", schema=None) as batch_op:
+        batch_op.create_index(batch_op.f("ix_volumes_status"), ["status"], unique=False)
+
+    # ### end Alembic commands ###
+
+
+def downgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    with op.batch_alter_table("runs", schema=None) as batch_op:
+        batch_op.drop_index(batch_op.f("ix_runs_status"))
+
+    with op.batch_alter_table("jobs", schema=None) as batch_op:
+        batch_op.drop_index(batch_op.f("ix_jobs_status"))
+
+    with op.batch_alter_table("fleets", schema=None) as batch_op:
+        batch_op.drop_index(batch_op.f("ix_fleets_status"))
+
+    with op.batch_alter_table("instances", schema=None) as batch_op:
+        batch_op.drop_index(batch_op.f("ix_instances_status"))
+
+    with op.batch_alter_table("volumes", schema=None) as batch_op:
+        batch_op.drop_index(batch_op.f("ix_volumes_status"))
+
+    # ### end Alembic commands ###
```
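The migration uses `batch_alter_table` so the index changes also work on SQLite, which cannot alter tables in place. After upgrading, the new indexes can be verified with SQLAlchemy's inspector (the database URL below is just an example):

```python
from sqlalchemy import create_engine, inspect

engine = create_engine("sqlite:///server_data.db")  # example URL; point at your server DB
inspector = inspect(engine)
for table in ("runs", "jobs", "fleets", "instances", "volumes"):
    print(table, [index["name"] for index in inspector.get_indexes(table)])
```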
dstack/_internal/server/migrations/versions/ec02a26a256c_add_runmodel_next_triggered_at.py (new file):

```diff
@@ -0,0 +1,38 @@
+"""Add RunModel.next_triggered_at
+
+Revision ID: ec02a26a256c
+Revises: d5863798bf41
+Create Date: 2025-07-17 15:47:00.443217
+
+"""
+
+import sqlalchemy as sa
+from alembic import op
+
+import dstack._internal.server.models
+
+# revision identifiers, used by Alembic.
+revision = "ec02a26a256c"
+down_revision = "d5863798bf41"
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    with op.batch_alter_table("runs", schema=None) as batch_op:
+        batch_op.add_column(
+            sa.Column(
+                "next_triggered_at", dstack._internal.server.models.NaiveDateTime(), nullable=True
+            )
+        )
+
+    # ### end Alembic commands ###
+
+
+def downgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    with op.batch_alter_table("runs", schema=None) as batch_op:
+        batch_op.drop_column("next_triggered_at")
+
+    # ### end Alembic commands ###
```
dstack/_internal/server/models.py:

```diff
@@ -1,6 +1,6 @@
 import enum
 import uuid
-from datetime import datetime
+from datetime import datetime, timezone
 from typing import Callable, List, Optional, Union
 
 from sqlalchemy import (
@@ -51,9 +51,10 @@ logger = get_logger(__name__)
 
 class NaiveDateTime(TypeDecorator):
     """
-    A custom type decorator that ensures datetime objects are offset-naive when stored in the database
-    …
+    A custom type decorator that ensures datetime objects are offset-naive when stored in the database
+    and offset-aware with UTC timezone when loaded from the database.
+    This is because we use datetimes in UTC everywhere, and
+    some databases (e.g. Postgres) throw an error if the timezone is set.
     """
 
     impl = DateTime
@@ -65,7 +66,9 @@ class NaiveDateTime(TypeDecorator):
         return value
 
     def process_result_value(self, value, dialect):
-        return value
+        if value is None:
+            return None
+        return value.replace(tzinfo=timezone.utc)
 
 
 class DecryptedString(CoreModel):
```
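Putting the pieces together, the round-trip behavior of the updated type looks like this (a standalone sketch; `process_bind_param` is an assumed paraphrase, since the diff shows it only in context):

```python
from datetime import timezone

from sqlalchemy.types import DateTime, TypeDecorator


class NaiveUTCDateTime(TypeDecorator):
    """Store naive UTC datetimes; return tz-aware UTC datetimes."""

    impl = DateTime
    cache_ok = True

    def process_bind_param(self, value, dialect):
        # Assumed behavior: strip tzinfo before writing so that databases
        # like Postgres accept the value in a naive DateTime column.
        if value is not None and value.tzinfo is not None:
            value = value.astimezone(timezone.utc).replace(tzinfo=None)
        return value

    def process_result_value(self, value, dialect):
        # As in the diff: re-attach UTC on the way out.
        if value is None:
            return None
        return value.replace(tzinfo=timezone.utc)
```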
```diff
@@ -355,7 +358,8 @@ class RunModel(BaseModel):
     run_name: Mapped[str] = mapped_column(String(100))
     submitted_at: Mapped[datetime] = mapped_column(NaiveDateTime)
     last_processed_at: Mapped[datetime] = mapped_column(NaiveDateTime)
-    status: Mapped[RunStatus] = mapped_column(Enum(RunStatus))
+    next_triggered_at: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime)
+    status: Mapped[RunStatus] = mapped_column(Enum(RunStatus), index=True)
     termination_reason: Mapped[Optional[RunTerminationReason]] = mapped_column(
         Enum(RunTerminationReason)
     )
@@ -396,7 +400,7 @@ class JobModel(BaseModel):
     submission_num: Mapped[int] = mapped_column(Integer)
     submitted_at: Mapped[datetime] = mapped_column(NaiveDateTime)
     last_processed_at: Mapped[datetime] = mapped_column(NaiveDateTime)
-    status: Mapped[JobStatus] = mapped_column(Enum(JobStatus))
+    status: Mapped[JobStatus] = mapped_column(Enum(JobStatus), index=True)
     termination_reason: Mapped[Optional[JobTerminationReason]] = mapped_column(
         Enum(JobTerminationReason)
     )
@@ -524,7 +528,7 @@ class FleetModel(BaseModel):
     deleted: Mapped[bool] = mapped_column(Boolean, default=False)
     deleted_at: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime)
 
-    status: Mapped[FleetStatus] = mapped_column(Enum(FleetStatus))
+    status: Mapped[FleetStatus] = mapped_column(Enum(FleetStatus), index=True)
     status_message: Mapped[Optional[str]] = mapped_column(Text)
 
     spec: Mapped[str] = mapped_column(Text)
@@ -543,7 +547,6 @@ class InstanceModel(BaseModel):
 
     instance_num: Mapped[int] = mapped_column(Integer, default=0)
 
-    # instance
     created_at: Mapped[datetime] = mapped_column(NaiveDateTime, default=get_current_datetime)
     last_processed_at: Mapped[datetime] = mapped_column(
         NaiveDateTime, default=get_current_datetime
@@ -564,7 +567,7 @@ class InstanceModel(BaseModel):
     fleet_id: Mapped[Optional[uuid.UUID]] = mapped_column(ForeignKey("fleets.id"))
     fleet: Mapped[Optional["FleetModel"]] = relationship(back_populates="instances")
 
-    status: Mapped[InstanceStatus] = mapped_column(Enum(InstanceStatus))
+    status: Mapped[InstanceStatus] = mapped_column(Enum(InstanceStatus), index=True)
     unreachable: Mapped[bool] = mapped_column(Boolean)
 
     # VM
@@ -580,7 +583,6 @@ class InstanceModel(BaseModel):
     requirements: Mapped[Optional[str]] = mapped_column(Text)
     instance_configuration: Mapped[Optional[str]] = mapped_column(Text)
 
-    # temination policy
     termination_policy: Mapped[Optional[TerminationPolicy]] = mapped_column(String(100))
     # TODO: Suggestion: do not assign DEFAULT_FLEET_TERMINATION_IDLE_TIME as the default here
     # (make Optional instead; also instead of -1)
@@ -598,11 +600,9 @@ class InstanceModel(BaseModel):
     first_termination_retry_at: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime)
     last_termination_retry_at: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime)
 
-    # backend
     backend: Mapped[Optional[BackendType]] = mapped_column(EnumAsString(BackendType, 100))
     backend_data: Mapped[Optional[str]] = mapped_column(Text)
 
-    # offer
     offer: Mapped[Optional[str]] = mapped_column(Text)
     region: Mapped[Optional[str]] = mapped_column(String(2000))
     price: Mapped[Optional[float]] = mapped_column(Float)
@@ -615,14 +615,14 @@ class InstanceModel(BaseModel):
     total_blocks: Mapped[Optional[int]] = mapped_column(Integer)
     busy_blocks: Mapped[int] = mapped_column(Integer, default=0)
 
-    jobs: Mapped[list["JobModel"]] = relationship(back_populates="instance", lazy="joined")
+    jobs: Mapped[list["JobModel"]] = relationship(back_populates="instance")
     last_job_processed_at: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime)
 
     volume_attachments: Mapped[List["VolumeAttachmentModel"]] = relationship(
         back_populates="instance",
         # Add delete-orphan option so that removing entries from volume_attachments
         # automatically marks them for deletion.
-        # …
+        # SQLAlchemy requires delete when using delete-orphan.
         cascade="save-update, merge, delete-orphan, delete",
     )
 
@@ -649,7 +649,7 @@ class VolumeModel(BaseModel):
     deleted: Mapped[bool] = mapped_column(Boolean, default=False)
     deleted_at: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime)
 
-    status: Mapped[VolumeStatus] = mapped_column(Enum(VolumeStatus))
+    status: Mapped[VolumeStatus] = mapped_column(Enum(VolumeStatus), index=True)
     status_message: Mapped[Optional[str]] = mapped_column(Text)
 
     configuration: Mapped[str] = mapped_column(Text)
```
dstack/_internal/server/schemas/logs.py:

```diff
@@ -1,7 +1,7 @@
 from datetime import datetime
 from typing import Optional
 
-from pydantic import UUID4, Field, validator
+from pydantic import UUID4, Field
 
 from dstack._internal.core.models.common import CoreModel
 
@@ -15,11 +15,3 @@ class PollLogsRequest(CoreModel):
     next_token: Optional[str] = None
     limit: int = Field(100, ge=0, le=1000)
     diagnose: bool = False
-
-    @validator("descending")
-    @classmethod
-    def validate_descending(cls, v):
-        # Descending is not supported until we migrate from base64-encoded logs to plain text logs.
-        if v is True:
-            raise ValueError("descending: true is not supported")
-        return v
```
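With the validator gone, `descending: true` is now accepted by the request schema. A pared-down sketch of the model (using plain pydantic `BaseModel` instead of dstack's `CoreModel`, and omitting the request's other fields):

```python
from typing import Optional

from pydantic import BaseModel, Field


class PollLogsRequest(BaseModel):
    descending: bool = False
    next_token: Optional[str] = None
    limit: int = Field(100, ge=0, le=1000)
    diagnose: bool = False


# Previously this raised "descending: true is not supported"; now it validates.
print(PollLogsRequest(descending=True, limit=50))
```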
dstack/_internal/server/services/fleets.py:

```diff
@@ -1,6 +1,6 @@
 import uuid
 from collections.abc import Callable
-from datetime import datetime, timezone
+from datetime import datetime
 from functools import wraps
 from typing import List, Literal, Optional, Tuple, TypeVar, Union, cast
 
@@ -8,8 +8,8 @@ from sqlalchemy import and_, func, or_, select
 from sqlalchemy.ext.asyncio import AsyncSession
 from sqlalchemy.orm import joinedload, selectinload
 
-from dstack._internal.core.backends import BACKENDS_WITH_CREATE_INSTANCE_SUPPORT
 from dstack._internal.core.backends.base.backend import Backend
+from dstack._internal.core.backends.features import BACKENDS_WITH_CREATE_INSTANCE_SUPPORT
 from dstack._internal.core.errors import (
     ForbiddenError,
     ResourceExistsError,
@@ -49,6 +49,7 @@ from dstack._internal.server.db import get_db
 from dstack._internal.server.models import (
     FleetModel,
     InstanceModel,
+    JobModel,
     ProjectModel,
     UserModel,
 )
@@ -66,7 +67,6 @@ from dstack._internal.server.services.plugins import apply_plugin_policies
 from dstack._internal.server.services.projects import (
     get_member,
     get_member_permissions,
-    list_project_models,
     list_user_project_models,
 )
 from dstack._internal.server.services.resources import set_resources_defaults
@@ -87,10 +87,11 @@ async def list_fleets(
     limit: int,
     ascending: bool,
 ) -> List[Fleet]:
-    …
+    projects = await list_user_project_models(
+        session=session,
+        user=user,
+        only_names=True,
+    )
     if project_name is not None:
         projects = [p for p in projects if p.name == project_name]
     fleet_models = await list_projects_fleet_models(
@@ -398,7 +399,11 @@ async def apply_plan(
             FleetModel.id == fleet_model.id,
             FleetModel.deleted == False,
         )
-        .options(selectinload(FleetModel.instances))
+        .options(
+            selectinload(FleetModel.instances)
+            .joinedload(InstanceModel.jobs)
+            .load_only(JobModel.id)
+        )
         .options(selectinload(FleetModel.runs))
         .execution_options(populate_existing=True)
         .order_by(FleetModel.id)  # take locks in order
@@ -563,7 +568,11 @@ async def delete_fleets(
             FleetModel.name.in_(names),
             FleetModel.deleted == False,
         )
-        .options(selectinload(FleetModel.instances))
+        .options(
+            selectinload(FleetModel.instances)
+            .joinedload(InstanceModel.jobs)
+            .load_only(JobModel.id)
+        )
         .options(selectinload(FleetModel.runs))
         .execution_options(populate_existing=True)
         .order_by(FleetModel.id)  # take locks in order
@@ -600,7 +609,7 @@ def fleet_model_to_fleet(
         name=fleet_model.name,
         project_name=fleet_model.project.name,
         spec=spec,
-        created_at=fleet_model.created_at.replace(tzinfo=timezone.utc),
+        created_at=fleet_model.created_at,
         status=fleet_model.status,
         status_message=fleet_model.status_message,
         instances=instances,
```