dstack 0.19.17__py3-none-any.whl → 0.19.19__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dstack/_internal/cli/services/configurators/fleet.py +111 -1
- dstack/_internal/cli/services/profile.py +1 -1
- dstack/_internal/core/backends/aws/compute.py +237 -18
- dstack/_internal/core/backends/base/compute.py +20 -2
- dstack/_internal/core/backends/cudo/compute.py +23 -9
- dstack/_internal/core/backends/gcp/compute.py +13 -7
- dstack/_internal/core/backends/lambdalabs/compute.py +2 -1
- dstack/_internal/core/compatibility/fleets.py +12 -11
- dstack/_internal/core/compatibility/gateways.py +9 -8
- dstack/_internal/core/compatibility/logs.py +4 -3
- dstack/_internal/core/compatibility/runs.py +29 -21
- dstack/_internal/core/compatibility/volumes.py +11 -8
- dstack/_internal/core/errors.py +4 -0
- dstack/_internal/core/models/common.py +45 -2
- dstack/_internal/core/models/configurations.py +9 -1
- dstack/_internal/core/models/fleets.py +2 -1
- dstack/_internal/core/models/profiles.py +8 -5
- dstack/_internal/core/models/resources.py +15 -8
- dstack/_internal/core/models/runs.py +41 -138
- dstack/_internal/core/models/volumes.py +14 -0
- dstack/_internal/core/services/diff.py +56 -3
- dstack/_internal/core/services/ssh/attach.py +2 -0
- dstack/_internal/server/app.py +37 -9
- dstack/_internal/server/background/__init__.py +66 -40
- dstack/_internal/server/background/tasks/process_fleets.py +19 -3
- dstack/_internal/server/background/tasks/process_gateways.py +47 -29
- dstack/_internal/server/background/tasks/process_idle_volumes.py +139 -0
- dstack/_internal/server/background/tasks/process_instances.py +13 -2
- dstack/_internal/server/background/tasks/process_placement_groups.py +4 -2
- dstack/_internal/server/background/tasks/process_running_jobs.py +14 -3
- dstack/_internal/server/background/tasks/process_runs.py +8 -4
- dstack/_internal/server/background/tasks/process_submitted_jobs.py +38 -7
- dstack/_internal/server/background/tasks/process_terminating_jobs.py +5 -3
- dstack/_internal/server/background/tasks/process_volumes.py +2 -2
- dstack/_internal/server/migrations/versions/35e90e1b0d3e_add_rolling_deployment_fields.py +6 -6
- dstack/_internal/server/migrations/versions/d5863798bf41_add_volumemodel_last_job_processed_at.py +40 -0
- dstack/_internal/server/models.py +1 -0
- dstack/_internal/server/routers/backends.py +23 -16
- dstack/_internal/server/routers/files.py +7 -6
- dstack/_internal/server/routers/fleets.py +47 -36
- dstack/_internal/server/routers/gateways.py +27 -18
- dstack/_internal/server/routers/instances.py +18 -13
- dstack/_internal/server/routers/logs.py +7 -3
- dstack/_internal/server/routers/metrics.py +14 -8
- dstack/_internal/server/routers/projects.py +33 -22
- dstack/_internal/server/routers/repos.py +7 -6
- dstack/_internal/server/routers/runs.py +49 -28
- dstack/_internal/server/routers/secrets.py +20 -15
- dstack/_internal/server/routers/server.py +7 -4
- dstack/_internal/server/routers/users.py +22 -19
- dstack/_internal/server/routers/volumes.py +34 -25
- dstack/_internal/server/schemas/logs.py +2 -2
- dstack/_internal/server/schemas/runs.py +17 -5
- dstack/_internal/server/services/fleets.py +358 -75
- dstack/_internal/server/services/gateways/__init__.py +17 -6
- dstack/_internal/server/services/gateways/client.py +5 -3
- dstack/_internal/server/services/instances.py +8 -0
- dstack/_internal/server/services/jobs/__init__.py +45 -0
- dstack/_internal/server/services/jobs/configurators/base.py +12 -1
- dstack/_internal/server/services/locking.py +104 -13
- dstack/_internal/server/services/logging.py +4 -2
- dstack/_internal/server/services/logs/__init__.py +15 -2
- dstack/_internal/server/services/logs/aws.py +2 -4
- dstack/_internal/server/services/logs/filelog.py +33 -27
- dstack/_internal/server/services/logs/gcp.py +3 -5
- dstack/_internal/server/services/proxy/repo.py +4 -1
- dstack/_internal/server/services/runs.py +139 -72
- dstack/_internal/server/services/services/__init__.py +2 -1
- dstack/_internal/server/services/users.py +3 -1
- dstack/_internal/server/services/volumes.py +15 -2
- dstack/_internal/server/settings.py +25 -6
- dstack/_internal/server/statics/index.html +1 -1
- dstack/_internal/server/statics/{main-d151637af20f70b2e796.js → main-64f8273740c4b52c18f5.js} +71 -67
- dstack/_internal/server/statics/{main-d151637af20f70b2e796.js.map → main-64f8273740c4b52c18f5.js.map} +1 -1
- dstack/_internal/server/statics/{main-d48635d8fe670d53961c.css → main-d58fc0460cb0eae7cb5c.css} +1 -1
- dstack/_internal/server/testing/common.py +48 -8
- dstack/_internal/server/utils/routers.py +31 -8
- dstack/_internal/utils/json_utils.py +54 -0
- dstack/api/_public/runs.py +13 -2
- dstack/api/server/_runs.py +12 -2
- dstack/version.py +1 -1
- {dstack-0.19.17.dist-info → dstack-0.19.19.dist-info}/METADATA +17 -14
- {dstack-0.19.17.dist-info → dstack-0.19.19.dist-info}/RECORD +86 -83
- {dstack-0.19.17.dist-info → dstack-0.19.19.dist-info}/WHEEL +0 -0
- {dstack-0.19.17.dist-info → dstack-0.19.19.dist-info}/entry_points.txt +0 -0
- {dstack-0.19.17.dist-info → dstack-0.19.19.dist-info}/licenses/LICENSE.md +0 -0
dstack/_internal/server/background/tasks/process_gateways.py
@@ -16,6 +16,7 @@ from dstack._internal.server.services.gateways import (
     gateway_connections_pool,
 )
 from dstack._internal.server.services.locking import advisory_lock_ctx, get_locker
+from dstack._internal.server.services.logging import fmt
 from dstack._internal.utils.common import get_current_datetime
 from dstack._internal.utils.logging import get_logger
 
@@ -27,14 +28,14 @@ async def process_gateways_connections():
     await _process_active_connections()
 
 
-async def process_submitted_gateways():
-    lock, lockset = get_locker().get_lockset(GatewayModel.__tablename__)
+async def process_gateways():
+    lock, lockset = get_locker(get_db().dialect_name).get_lockset(GatewayModel.__tablename__)
     async with get_session_ctx() as session:
         async with lock:
             res = await session.execute(
                 select(GatewayModel)
                 .where(
-                    GatewayModel.status == GatewayStatus.SUBMITTED,
+                    GatewayModel.status.in_([GatewayStatus.SUBMITTED, GatewayStatus.PROVISIONING]),
                     GatewayModel.id.not_in(lockset),
                 )
                 .options(lazyload(GatewayModel.gateway_compute))
@@ -48,7 +49,25 @@ async def process_submitted_gateways():
             lockset.add(gateway_model.id)
         try:
             gateway_model_id = gateway_model.id
-            await _process_submitted_gateway(session=session, gateway_model=gateway_model)
+            initial_status = gateway_model.status
+            if initial_status == GatewayStatus.SUBMITTED:
+                await _process_submitted_gateway(session=session, gateway_model=gateway_model)
+            elif initial_status == GatewayStatus.PROVISIONING:
+                await _process_provisioning_gateway(session=session, gateway_model=gateway_model)
+            else:
+                logger.error(
+                    "%s: unexpected gateway status %r", fmt(gateway_model), initial_status.upper()
+                )
+            if gateway_model.status != initial_status:
+                logger.info(
+                    "%s: gateway status has changed %s -> %s%s",
+                    fmt(gateway_model),
+                    initial_status.upper(),
+                    gateway_model.status.upper(),
+                    f": {gateway_model.status_message}" if gateway_model.status_message else "",
+                )
+            gateway_model.last_processed_at = get_current_datetime()
+            await session.commit()
         finally:
             lockset.difference_update([gateway_model_id])
 
@@ -89,7 +108,7 @@ async def _process_connection(conn: GatewayConnection):
 
 
 async def _process_submitted_gateway(session: AsyncSession, gateway_model: GatewayModel):
-    logger.info("
+    logger.info("%s: started gateway provisioning", fmt(gateway_model))
     # Refetch to load related attributes.
     # joinedload produces LEFT OUTER JOIN that can't be used with FOR UPDATE.
     res = await session.execute(
@@ -110,8 +129,6 @@ async def _process_submitted_gateway(session: AsyncSession, gateway_model: GatewayModel):
     except BackendNotAvailable:
         gateway_model.status = GatewayStatus.FAILED
         gateway_model.status_message = "Backend not available"
-        gateway_model.last_processed_at = get_current_datetime()
-        await session.commit()
         return
 
     try:
@@ -123,53 +140,54 @@ async def _process_submitted_gateway(session: AsyncSession, gateway_model: GatewayModel):
         )
         session.add(gateway_model)
         gateway_model.status = GatewayStatus.PROVISIONING
-        await session.commit()
-        await session.refresh(gateway_model)
     except BackendError as e:
-        logger.info(
-            "Failed to create gateway compute for gateway %s: %s", gateway_model.name, repr(e)
-        )
+        logger.info("%s: failed to create gateway compute: %r", fmt(gateway_model), e)
         gateway_model.status = GatewayStatus.FAILED
         status_message = f"Backend error: {repr(e)}"
         if len(e.args) > 0:
             status_message = str(e.args[0])
         gateway_model.status_message = status_message
-        gateway_model.last_processed_at = get_current_datetime()
-        await session.commit()
-        return
     except Exception as e:
-        logger.exception(
-            "Got exception when creating gateway compute for gateway %s", gateway_model.name
-        )
+        logger.exception("%s: got exception when creating gateway compute", fmt(gateway_model))
         gateway_model.status = GatewayStatus.FAILED
         gateway_model.status_message = f"Unexpected error: {repr(e)}"
-        gateway_model.last_processed_at = get_current_datetime()
-        await session.commit()
-        return
 
+
+async def _process_provisioning_gateway(
+    session: AsyncSession, gateway_model: GatewayModel
+) -> None:
+    # Refetch to load related attributes.
+    # joinedload produces LEFT OUTER JOIN that can't be used with FOR UPDATE.
+    res = await session.execute(
+        select(GatewayModel)
+        .where(GatewayModel.id == gateway_model.id)
+        .execution_options(populate_existing=True)
+    )
+    gateway_model = res.unique().scalar_one()
+
+    # FIXME: problems caused by blocking on connect_to_gateway_with_retry and configure_gateway:
+    # - cannot delete the gateway before it is provisioned because the DB model is locked
+    # - connection retry counter is reset on server restart
+    # - only one server replica is processing the gateway
+    # Easy to fix by doing only one connection/configuration attempt per processing iteration. The
+    # main challenge is applying the same provisioning model to the dstack Sky gateway to avoid
+    # maintaining a different model for Sky.
     connection = await gateways_services.connect_to_gateway_with_retry(
         gateway_model.gateway_compute
     )
     if connection is None:
         gateway_model.status = GatewayStatus.FAILED
         gateway_model.status_message = "Failed to connect to gateway"
-        gateway_model.last_processed_at = get_current_datetime()
         gateway_model.gateway_compute.deleted = True
-        await session.commit()
         return
-
     try:
         await gateways_services.configure_gateway(connection)
     except Exception:
-        logger.exception("
+        logger.exception("%s: failed to configure gateway", fmt(gateway_model))
         gateway_model.status = GatewayStatus.FAILED
         gateway_model.status_message = "Failed to configure gateway"
-        gateway_model.last_processed_at = get_current_datetime()
         await gateway_connections_pool.remove(gateway_model.gateway_compute.ip_address)
         gateway_model.gateway_compute.active = False
-        await session.commit()
         return
 
     gateway_model.status = GatewayStatus.RUNNING
-    gateway_model.last_processed_at = get_current_datetime()
-    await session.commit()
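Note: the switch from get_locker() to get_locker(get_db().dialect_name) recurs in every background task below and pairs with the dstack/_internal/server/services/locking.py +104 -13 entry in the file list. That module is not shown in this section; the sketch below only illustrates the dialect-based dispatch the new call signature implies. Class and variable names are assumptions for illustration, not dstack's actual locking code.

import asyncio
from collections import defaultdict
from typing import Dict, Set, Tuple


class InMemoryLocker:
    """In-process locks plus a per-table set of row ids currently being processed."""

    def __init__(self) -> None:
        self._locks: Dict[str, asyncio.Lock] = defaultdict(asyncio.Lock)
        self._locksets: Dict[str, Set] = defaultdict(set)

    def get_lockset(self, namespace: str) -> Tuple[asyncio.Lock, Set]:
        # Same (lock, lockset) shape the tasks above unpack.
        return self._locks[namespace], self._locksets[namespace]


_sqlite_locker = InMemoryLocker()    # hypothetical module-level instances
_postgres_locker = InMemoryLocker()  # stand-in; a real one could build on advisory locks


def get_locker(dialect_name: str) -> InMemoryLocker:
    # Choose a locking strategy per database dialect instead of one global locker.
    return _sqlite_locker if dialect_name == "sqlite" else _postgres_locker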
dstack/_internal/server/background/tasks/process_idle_volumes.py
ADDED
@@ -0,0 +1,139 @@
+import datetime
+from typing import List
+
+from sqlalchemy import select
+from sqlalchemy.ext.asyncio import AsyncSession
+from sqlalchemy.orm import joinedload
+
+from dstack._internal.core.backends.base.compute import ComputeWithVolumeSupport
+from dstack._internal.core.errors import BackendNotAvailable
+from dstack._internal.core.models.profiles import parse_duration
+from dstack._internal.core.models.volumes import VolumeStatus
+from dstack._internal.server.db import get_db, get_session_ctx
+from dstack._internal.server.models import ProjectModel, VolumeModel
+from dstack._internal.server.services import backends as backends_services
+from dstack._internal.server.services.locking import get_locker
+from dstack._internal.server.services.volumes import (
+    get_volume_configuration,
+    volume_model_to_volume,
+)
+from dstack._internal.utils import common
+from dstack._internal.utils.common import get_current_datetime
+from dstack._internal.utils.logging import get_logger
+
+logger = get_logger(__name__)
+
+
+async def process_idle_volumes():
+    lock, lockset = get_locker(get_db().dialect_name).get_lockset(VolumeModel.__tablename__)
+    async with get_session_ctx() as session:
+        async with lock:
+            res = await session.execute(
+                select(VolumeModel.id)
+                .where(
+                    VolumeModel.status == VolumeStatus.ACTIVE,
+                    VolumeModel.deleted == False,
+                    VolumeModel.id.not_in(lockset),
+                )
+                .order_by(VolumeModel.last_processed_at.asc())
+                .limit(10)
+                .with_for_update(skip_locked=True, key_share=True)
+            )
+            volume_ids = list(res.scalars().all())
+            if not volume_ids:
+                return
+            for volume_id in volume_ids:
+                lockset.add(volume_id)
+
+        res = await session.execute(
+            select(VolumeModel)
+            .where(VolumeModel.id.in_(volume_ids))
+            .options(joinedload(VolumeModel.project).joinedload(ProjectModel.backends))
+            .options(joinedload(VolumeModel.user))
+            .options(joinedload(VolumeModel.attachments))
+            .execution_options(populate_existing=True)
+        )
+        volume_models = list(res.unique().scalars().all())
+        try:
+            volumes_to_delete = [v for v in volume_models if _should_delete_volume(v)]
+            if not volumes_to_delete:
+                return
+            await _delete_idle_volumes(session, volumes_to_delete)
+        finally:
+            lockset.difference_update(volume_ids)
+
+
+def _should_delete_volume(volume: VolumeModel) -> bool:
+    if volume.attachments:
+        return False
+
+    config = get_volume_configuration(volume)
+    if not config.auto_cleanup_duration:
+        return False
+
+    duration_seconds = parse_duration(config.auto_cleanup_duration)
+    if not duration_seconds or duration_seconds <= 0:
+        return False
+
+    idle_time = _get_idle_time(volume)
+    threshold = datetime.timedelta(seconds=duration_seconds)
+    return idle_time > threshold
+
+
+def _get_idle_time(volume: VolumeModel) -> datetime.timedelta:
+    last_used = volume.last_job_processed_at or volume.created_at
+    last_used_utc = last_used.replace(tzinfo=datetime.timezone.utc)
+    idle_time = get_current_datetime() - last_used_utc
+    return max(idle_time, datetime.timedelta(0))
+
+
+async def _delete_idle_volumes(session: AsyncSession, volumes: List[VolumeModel]):
+    # Note: Multiple volumes are deleted in the same transaction,
+    # so long deletion of one volume may block processing other volumes.
+    for volume_model in volumes:
+        logger.info("Deleting idle volume %s", volume_model.name)
+        try:
+            await _delete_idle_volume(session, volume_model)
+        except Exception:
+            logger.exception("Error when deleting idle volume %s", volume_model.name)
+
+        volume_model.deleted = True
+        volume_model.deleted_at = get_current_datetime()
+
+        logger.info("Deleted idle volume %s", volume_model.name)
+
+    await session.commit()
+
+
+async def _delete_idle_volume(session: AsyncSession, volume_model: VolumeModel):
+    volume = volume_model_to_volume(volume_model)
+
+    if volume.provisioning_data is None:
+        logger.error(
+            f"Failed to delete volume {volume_model.name}. volume.provisioning_data is None."
+        )
+        return
+
+    if volume.provisioning_data.backend is None:
+        logger.error(
+            f"Failed to delete volume {volume_model.name}. volume.provisioning_data.backend is None."
+        )
+        return
+
+    try:
+        backend = await backends_services.get_project_backend_by_type_or_error(
+            project=volume_model.project,
+            backend_type=volume.provisioning_data.backend,
+        )
+    except BackendNotAvailable:
+        logger.error(
+            f"Failed to delete volume {volume_model.name}. Backend {volume.configuration.backend} not available."
+        )
+        return
+
+    compute = backend.compute()
+    assert isinstance(compute, ComputeWithVolumeSupport)
+    await common.run_async(
+        compute.delete_volume,
+        volume=volume,
+    )
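The new task deletes volumes that have had no attachments for longer than the configured auto_cleanup_duration. A small stand-alone illustration of the threshold check above; the configuration value and idle time are hypothetical, and only parse_duration and the comparison come from the diff:

import datetime

from dstack._internal.core.models.profiles import parse_duration

auto_cleanup_duration = "72h"             # hypothetical volume configuration value
idle_for = datetime.timedelta(hours=100)  # hypothetical time since the volume was last attached

threshold = datetime.timedelta(seconds=parse_duration(auto_cleanup_duration))
print(idle_for > threshold)  # True -> process_idle_volumes() would delete the volume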
dstack/_internal/server/background/tasks/process_instances.py
@@ -45,6 +45,7 @@ from dstack._internal.core.consts import DSTACK_SHIM_HTTP_PORT
 from dstack._internal.core.errors import (
     BackendError,
     NotYetTerminated,
+    PlacementGroupNotSupportedError,
     ProvisioningError,
 )
 from dstack._internal.core.models.backends.base import BackendType
@@ -73,7 +74,7 @@ from dstack._internal.core.models.runs import (
 from dstack._internal.core.services.profiles import get_retry
 from dstack._internal.server import settings as server_settings
 from dstack._internal.server.background.tasks.common import get_provisioning_timeout
-from dstack._internal.server.db import get_session_ctx
+from dstack._internal.server.db import get_db, get_session_ctx
 from dstack._internal.server.models import (
     FleetModel,
     InstanceModel,
@@ -110,6 +111,8 @@ from dstack._internal.utils.ssh import (
     pkey_from_str,
 )
 
+MIN_PROCESSING_INTERVAL = timedelta(seconds=10)
+
 PENDING_JOB_RETRY_INTERVAL = timedelta(seconds=60)
 
 TERMINATION_DEADLINE_OFFSET = timedelta(minutes=20)
@@ -129,7 +132,7 @@ async def process_instances(batch_size: int = 1):
 
 
 async def _process_next_instance():
-    lock, lockset = get_locker().get_lockset(InstanceModel.__tablename__)
+    lock, lockset = get_locker(get_db().dialect_name).get_lockset(InstanceModel.__tablename__)
     async with get_session_ctx() as session:
         async with lock:
             res = await session.execute(
@@ -145,6 +148,8 @@ async def _process_next_instance():
                        ]
                    ),
                    InstanceModel.id.not_in(lockset),
+                    InstanceModel.last_processed_at
+                    < get_current_datetime().replace(tzinfo=None) - MIN_PROCESSING_INTERVAL,
                )
                .options(lazyload(InstanceModel.jobs))
                .order_by(InstanceModel.last_processed_at.asc())
@@ -1063,6 +1068,12 @@ async def _create_placement_group(
             placement_group_model_to_placement_group(placement_group_model),
             master_instance_offer,
         )
+    except PlacementGroupNotSupportedError:
+        logger.debug(
+            "Skipping offer %s because placement group not supported",
+            master_instance_offer.instance.name,
+        )
+        return None
     except BackendError as e:
         logger.warning(
             "Failed to create placement group %s in %s/%s: %r",
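The new except branch above lets a backend decline to create a placement group for a particular offer, and _create_placement_group() then skips that offer instead of treating it as a provisioning failure. A minimal sketch of a backend raising the error; the class and method body are illustrative, not one of dstack's real backends:

from dstack._internal.core.errors import PlacementGroupNotSupportedError


class ExampleCompute:  # hypothetical backend compute class
    def create_placement_group(self, placement_group, master_instance_offer):
        # Offers that cannot be grouped are skipped by the caller shown above.
        raise PlacementGroupNotSupportedError()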
dstack/_internal/server/background/tasks/process_placement_groups.py
@@ -7,7 +7,7 @@ from sqlalchemy.orm import joinedload
 
 from dstack._internal.core.backends.base.compute import ComputeWithPlacementGroupSupport
 from dstack._internal.core.errors import PlacementGroupInUseError
-from dstack._internal.server.db import get_session_ctx
+from dstack._internal.server.db import get_db, get_session_ctx
 from dstack._internal.server.models import PlacementGroupModel, ProjectModel
 from dstack._internal.server.services import backends as backends_services
 from dstack._internal.server.services.locking import get_locker
@@ -19,7 +19,9 @@ logger = get_logger(__name__)
 
 
 async def process_placement_groups():
-    lock, lockset = get_locker().get_lockset(PlacementGroupModel.__tablename__)
+    lock, lockset = get_locker(get_db().dialect_name).get_lockset(
+        PlacementGroupModel.__tablename__
+    )
     async with get_session_ctx() as session:
         async with lock:
             res = await session.execute(
dstack/_internal/server/background/tasks/process_running_jobs.py
@@ -34,10 +34,11 @@ from dstack._internal.core.models.runs import (
     JobTerminationReason,
     Run,
     RunSpec,
+    RunStatus,
 )
 from dstack._internal.core.models.volumes import InstanceMountPoint, Volume, VolumeMountPoint
 from dstack._internal.server.background.tasks.common import get_provisioning_timeout
-from dstack._internal.server.db import get_session_ctx
+from dstack._internal.server.db import get_db, get_session_ctx
 from dstack._internal.server.models import (
     InstanceModel,
     JobModel,
@@ -79,6 +80,7 @@ from dstack._internal.utils.logging import get_logger
 logger = get_logger(__name__)
 
 
+MIN_PROCESSING_INTERVAL = timedelta(seconds=10)
 # Minimum time before terminating active job in case of connectivity issues.
 # Should be sufficient to survive most problems caused by
 # the server network flickering and providers' glitches.
@@ -93,20 +95,29 @@ async def process_running_jobs(batch_size: int = 1):
 
 
 async def _process_next_running_job():
-    lock, lockset = get_locker().get_lockset(JobModel.__tablename__)
+    lock, lockset = get_locker(get_db().dialect_name).get_lockset(JobModel.__tablename__)
     async with get_session_ctx() as session:
         async with lock:
             res = await session.execute(
                 select(JobModel)
+                .join(JobModel.run)
                 .where(
                     JobModel.status.in_(
                         [JobStatus.PROVISIONING, JobStatus.PULLING, JobStatus.RUNNING]
                     ),
+                    RunModel.status.not_in([RunStatus.TERMINATING]),
                     JobModel.id.not_in(lockset),
+                    JobModel.last_processed_at
+                    < common_utils.get_current_datetime().replace(tzinfo=None)
+                    - MIN_PROCESSING_INTERVAL,
                 )
                 .order_by(JobModel.last_processed_at.asc())
                 .limit(1)
-                .with_for_update(
+                .with_for_update(
+                    skip_locked=True,
+                    key_share=True,
+                    of=JobModel,
+                )
             )
             job_model = res.unique().scalar()
             if job_model is None:
dstack/_internal/server/background/tasks/process_runs.py
@@ -19,7 +19,7 @@ from dstack._internal.core.models.runs import (
     RunStatus,
     RunTerminationReason,
 )
-from dstack._internal.server.db import get_session_ctx
+from dstack._internal.server.db import get_db, get_session_ctx
 from dstack._internal.server.models import JobModel, ProjectModel, RunModel
 from dstack._internal.server.services.jobs import (
     find_job,
@@ -41,6 +41,8 @@ from dstack._internal.utils import common
 from dstack._internal.utils.logging import get_logger
 
 logger = get_logger(__name__)
+
+MIN_PROCESSING_INTERVAL = datetime.timedelta(seconds=5)
 ROLLING_DEPLOYMENT_MAX_SURGE = 1  # at most one extra replica during rolling deployment
 
 
@@ -52,8 +54,8 @@ async def process_runs(batch_size: int = 1):
 
 
 async def _process_next_run():
-    run_lock, run_lockset = get_locker().get_lockset(RunModel.__tablename__)
-    job_lock, job_lockset = get_locker().get_lockset(JobModel.__tablename__)
+    run_lock, run_lockset = get_locker(get_db().dialect_name).get_lockset(RunModel.__tablename__)
+    job_lock, job_lockset = get_locker(get_db().dialect_name).get_lockset(JobModel.__tablename__)
     async with get_session_ctx() as session:
         async with run_lock, job_lock:
             res = await session.execute(
@@ -61,6 +63,8 @@ async def _process_next_run():
                 .where(
                     RunModel.status.not_in(RunStatus.finished_statuses()),
                     RunModel.id.not_in(run_lockset),
+                    RunModel.last_processed_at
+                    < common.get_current_datetime().replace(tzinfo=None) - MIN_PROCESSING_INTERVAL,
                 )
                 .order_by(RunModel.last_processed_at.asc())
                 .limit(1)
@@ -337,7 +341,7 @@ async def _process_active_run(session: AsyncSession, run_model: RunModel):
             current_time - run_model.submitted_at.replace(tzinfo=datetime.timezone.utc)
         ).total_seconds()
         logger.info(
-            "%s: run took %.2f seconds from
+            "%s: run took %.2f seconds from submission to provisioning.",
             fmt(run_model),
             submit_to_provision_duration,
         )
dstack/_internal/server/background/tasks/process_submitted_jobs.py
@@ -1,5 +1,6 @@
 import asyncio
 import uuid
+from datetime import datetime, timedelta
 from typing import List, Optional, Tuple
 
 from sqlalchemy import select
@@ -80,15 +81,35 @@ from dstack._internal.utils.logging import get_logger
 logger = get_logger(__name__)
 
 
+# Track when we last processed a job.
+# This is needed for a trick:
+# If no tasks were processed recently, we force batch_size 1.
+# If there are lots of runs/jobs with same offers submitted,
+# we warm up the cache instead of requesting the offers concurrently.
+# Mostly useful when runs are submitted via API without getting run plan first.
+BATCH_SIZE_RESET_TIMEOUT = timedelta(minutes=2)
+last_processed_at: Optional[datetime] = None
+
+
 async def process_submitted_jobs(batch_size: int = 1):
     tasks = []
-    for _ in range(batch_size):
+    effective_batch_size = _get_effective_batch_size(batch_size)
+    for _ in range(effective_batch_size):
         tasks.append(_process_next_submitted_job())
     await asyncio.gather(*tasks)
 
 
+def _get_effective_batch_size(batch_size: int) -> int:
+    if (
+        last_processed_at is None
+        or last_processed_at < common_utils.get_current_datetime() - BATCH_SIZE_RESET_TIMEOUT
+    ):
+        return 1
+    return batch_size
+
+
 async def _process_next_submitted_job():
-    lock, lockset = get_locker().get_lockset(JobModel.__tablename__)
+    lock, lockset = get_locker(get_db().dialect_name).get_lockset(JobModel.__tablename__)
     async with get_session_ctx() as session:
         async with lock:
             res = await session.execute(
@@ -125,6 +146,8 @@ async def _process_next_submitted_job():
                 await _process_submitted_job(session=session, job_model=job_model)
             finally:
                 lockset.difference_update([job_model_id])
+    global last_processed_at
+    last_processed_at = common_utils.get_current_datetime()
 
 
 async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
@@ -214,7 +237,9 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
     if get_db().dialect_name == "sqlite":
         # Start new transaction to see committed changes after lock
        await session.commit()
-    async with get_locker().lock_ctx(InstanceModel.__tablename__, instances_ids):
+    async with get_locker(get_db().dialect_name).lock_ctx(
+        InstanceModel.__tablename__, instances_ids
+    ):
         # If another job freed the instance but is still trying to detach volumes,
         # do not provision on it to prevent attaching volumes that are currently detaching.
         detaching_instances_ids = await get_instances_ids_with_detaching_volumes(session)
@@ -243,8 +268,10 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
         )
         job_model.instance_assigned = True
         job_model.last_processed_at = common_utils.get_current_datetime()
-
-
+        if len(pool_instances) > 0:
+            await session.commit()
+            return
+        # If no instances were locked, we can proceed in the same transaction.
 
         if job_model.instance is not None:
             res = await session.execute(
@@ -334,7 +361,7 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
            .order_by(VolumeModel.id)  # take locks in order
            .with_for_update(key_share=True)
        )
-    async with get_locker().lock_ctx(VolumeModel.__tablename__, volumes_ids):
+    async with get_locker(get_db().dialect_name).lock_ctx(VolumeModel.__tablename__, volumes_ids):
         if len(volume_models) > 0:
             await _attach_volumes(
                 session=session,
@@ -527,7 +554,9 @@ async def _get_next_instance_num(session: AsyncSession, fleet_model: FleetModel)
     if len(fleet_model.instances) == 0:
         # No instances means the fleet is not in the db yet, so don't lock.
         return 0
-    async with get_locker().lock_ctx(FleetModel.__tablename__, [fleet_model.id]):
+    async with get_locker(get_db().dialect_name).lock_ctx(
+        FleetModel.__tablename__, [fleet_model.id]
+    ):
         fleet_model = (
             (
                 await session.execute(
@@ -710,3 +739,5 @@ async def _attach_volume(
         attachment_data=attachment_data.json(),
     )
     instance.volume_attachments.append(volume_attachment_model)
+
+    volume_model.last_job_processed_at = common_utils.get_current_datetime()
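The module-level last_processed_at above implements a warm-up: after BATCH_SIZE_RESET_TIMEOUT without any submitted-job processing, the next tick handles a single job so the offers cache is populated before the full batch size is used again. A stand-alone restatement of the decision with hypothetical timestamps:

from datetime import datetime, timedelta, timezone

BATCH_SIZE_RESET_TIMEOUT = timedelta(minutes=2)


def effective_batch_size(batch_size, last_processed_at, now):
    # Mirrors _get_effective_batch_size() above, with the clock passed in for clarity.
    if last_processed_at is None or last_processed_at < now - BATCH_SIZE_RESET_TIMEOUT:
        return 1
    return batch_size


now = datetime(2025, 7, 15, 12, 0, tzinfo=timezone.utc)
print(effective_batch_size(5, None, now))                         # 1: nothing processed yet
print(effective_batch_size(5, now - timedelta(minutes=10), now))  # 1: idle, warm the offers cache
print(effective_batch_size(5, now - timedelta(seconds=30), now))  # 5: recently active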
dstack/_internal/server/background/tasks/process_terminating_jobs.py
@@ -5,7 +5,7 @@ from sqlalchemy.ext.asyncio import AsyncSession
 from sqlalchemy.orm import joinedload, lazyload
 
 from dstack._internal.core.models.runs import JobStatus
-from dstack._internal.server.db import get_session_ctx
+from dstack._internal.server.db import get_db, get_session_ctx
 from dstack._internal.server.models import (
     InstanceModel,
     JobModel,
@@ -32,8 +32,10 @@ async def process_terminating_jobs(batch_size: int = 1):
 
 
 async def _process_next_terminating_job():
-    job_lock, job_lockset = get_locker().get_lockset(JobModel.__tablename__)
-    instance_lock, instance_lockset = get_locker().get_lockset(InstanceModel.__tablename__)
+    job_lock, job_lockset = get_locker(get_db().dialect_name).get_lockset(JobModel.__tablename__)
+    instance_lock, instance_lockset = get_locker(get_db().dialect_name).get_lockset(
+        InstanceModel.__tablename__
+    )
     async with get_session_ctx() as session:
         async with job_lock, instance_lock:
             res = await session.execute(
dstack/_internal/server/background/tasks/process_volumes.py
@@ -5,7 +5,7 @@ from sqlalchemy.orm import joinedload
 from dstack._internal.core.backends.base.compute import ComputeWithVolumeSupport
 from dstack._internal.core.errors import BackendError, BackendNotAvailable
 from dstack._internal.core.models.volumes import VolumeStatus
-from dstack._internal.server.db import get_session_ctx
+from dstack._internal.server.db import get_db, get_session_ctx
 from dstack._internal.server.models import (
     InstanceModel,
     ProjectModel,
@@ -22,7 +22,7 @@ logger = get_logger(__name__)
 
 
 async def process_submitted_volumes():
-    lock, lockset = get_locker().get_lockset(VolumeModel.__tablename__)
+    lock, lockset = get_locker(get_db().dialect_name).get_lockset(VolumeModel.__tablename__)
     async with get_session_ctx() as session:
         async with lock:
             res = await session.execute(
dstack/_internal/server/migrations/versions/35e90e1b0d3e_add_rolling_deployment_fields.py
@@ -17,12 +17,6 @@ depends_on = None
 
 
 def upgrade() -> None:
-    with op.batch_alter_table("jobs", schema=None) as batch_op:
-        batch_op.add_column(sa.Column("deployment_num", sa.Integer(), nullable=True))
-    with op.batch_alter_table("jobs", schema=None) as batch_op:
-        batch_op.execute("UPDATE jobs SET deployment_num = 0")
-        batch_op.alter_column("deployment_num", nullable=False)
-
     with op.batch_alter_table("runs", schema=None) as batch_op:
         batch_op.add_column(sa.Column("deployment_num", sa.Integer(), nullable=True))
         batch_op.add_column(sa.Column("desired_replica_count", sa.Integer(), nullable=True))
@@ -32,6 +26,12 @@ def upgrade() -> None:
         batch_op.alter_column("deployment_num", nullable=False)
         batch_op.alter_column("desired_replica_count", nullable=False)
 
+    with op.batch_alter_table("jobs", schema=None) as batch_op:
+        batch_op.add_column(sa.Column("deployment_num", sa.Integer(), nullable=True))
+    with op.batch_alter_table("jobs", schema=None) as batch_op:
+        batch_op.execute("UPDATE jobs SET deployment_num = 0")
+        batch_op.alter_column("deployment_num", nullable=False)
+
 
 def downgrade() -> None:
     with op.batch_alter_table("runs", schema=None) as batch_op:
dstack/_internal/server/migrations/versions/d5863798bf41_add_volumemodel_last_job_processed_at.py
ADDED
@@ -0,0 +1,40 @@
+"""Add VolumeModel.last_job_processed_at
+
+Revision ID: d5863798bf41
+Revises: 644b8a114187
+Create Date: 2025-07-15 14:26:22.981687
+
+"""
+
+import sqlalchemy as sa
+from alembic import op
+
+import dstack._internal.server.models
+
+# revision identifiers, used by Alembic.
+revision = "d5863798bf41"
+down_revision = "644b8a114187"
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    with op.batch_alter_table("volumes", schema=None) as batch_op:
+        batch_op.add_column(
+            sa.Column(
+                "last_job_processed_at",
+                dstack._internal.server.models.NaiveDateTime(),
+                nullable=True,
+            )
+        )
+
+    # ### end Alembic commands ###
+
+
+def downgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    with op.batch_alter_table("volumes", schema=None) as batch_op:
+        batch_op.drop_column("last_job_processed_at")
+
+    # ### end Alembic commands ###