dstack 0.19.17__py3-none-any.whl → 0.19.19__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (86)
  1. dstack/_internal/cli/services/configurators/fleet.py +111 -1
  2. dstack/_internal/cli/services/profile.py +1 -1
  3. dstack/_internal/core/backends/aws/compute.py +237 -18
  4. dstack/_internal/core/backends/base/compute.py +20 -2
  5. dstack/_internal/core/backends/cudo/compute.py +23 -9
  6. dstack/_internal/core/backends/gcp/compute.py +13 -7
  7. dstack/_internal/core/backends/lambdalabs/compute.py +2 -1
  8. dstack/_internal/core/compatibility/fleets.py +12 -11
  9. dstack/_internal/core/compatibility/gateways.py +9 -8
  10. dstack/_internal/core/compatibility/logs.py +4 -3
  11. dstack/_internal/core/compatibility/runs.py +29 -21
  12. dstack/_internal/core/compatibility/volumes.py +11 -8
  13. dstack/_internal/core/errors.py +4 -0
  14. dstack/_internal/core/models/common.py +45 -2
  15. dstack/_internal/core/models/configurations.py +9 -1
  16. dstack/_internal/core/models/fleets.py +2 -1
  17. dstack/_internal/core/models/profiles.py +8 -5
  18. dstack/_internal/core/models/resources.py +15 -8
  19. dstack/_internal/core/models/runs.py +41 -138
  20. dstack/_internal/core/models/volumes.py +14 -0
  21. dstack/_internal/core/services/diff.py +56 -3
  22. dstack/_internal/core/services/ssh/attach.py +2 -0
  23. dstack/_internal/server/app.py +37 -9
  24. dstack/_internal/server/background/__init__.py +66 -40
  25. dstack/_internal/server/background/tasks/process_fleets.py +19 -3
  26. dstack/_internal/server/background/tasks/process_gateways.py +47 -29
  27. dstack/_internal/server/background/tasks/process_idle_volumes.py +139 -0
  28. dstack/_internal/server/background/tasks/process_instances.py +13 -2
  29. dstack/_internal/server/background/tasks/process_placement_groups.py +4 -2
  30. dstack/_internal/server/background/tasks/process_running_jobs.py +14 -3
  31. dstack/_internal/server/background/tasks/process_runs.py +8 -4
  32. dstack/_internal/server/background/tasks/process_submitted_jobs.py +38 -7
  33. dstack/_internal/server/background/tasks/process_terminating_jobs.py +5 -3
  34. dstack/_internal/server/background/tasks/process_volumes.py +2 -2
  35. dstack/_internal/server/migrations/versions/35e90e1b0d3e_add_rolling_deployment_fields.py +6 -6
  36. dstack/_internal/server/migrations/versions/d5863798bf41_add_volumemodel_last_job_processed_at.py +40 -0
  37. dstack/_internal/server/models.py +1 -0
  38. dstack/_internal/server/routers/backends.py +23 -16
  39. dstack/_internal/server/routers/files.py +7 -6
  40. dstack/_internal/server/routers/fleets.py +47 -36
  41. dstack/_internal/server/routers/gateways.py +27 -18
  42. dstack/_internal/server/routers/instances.py +18 -13
  43. dstack/_internal/server/routers/logs.py +7 -3
  44. dstack/_internal/server/routers/metrics.py +14 -8
  45. dstack/_internal/server/routers/projects.py +33 -22
  46. dstack/_internal/server/routers/repos.py +7 -6
  47. dstack/_internal/server/routers/runs.py +49 -28
  48. dstack/_internal/server/routers/secrets.py +20 -15
  49. dstack/_internal/server/routers/server.py +7 -4
  50. dstack/_internal/server/routers/users.py +22 -19
  51. dstack/_internal/server/routers/volumes.py +34 -25
  52. dstack/_internal/server/schemas/logs.py +2 -2
  53. dstack/_internal/server/schemas/runs.py +17 -5
  54. dstack/_internal/server/services/fleets.py +358 -75
  55. dstack/_internal/server/services/gateways/__init__.py +17 -6
  56. dstack/_internal/server/services/gateways/client.py +5 -3
  57. dstack/_internal/server/services/instances.py +8 -0
  58. dstack/_internal/server/services/jobs/__init__.py +45 -0
  59. dstack/_internal/server/services/jobs/configurators/base.py +12 -1
  60. dstack/_internal/server/services/locking.py +104 -13
  61. dstack/_internal/server/services/logging.py +4 -2
  62. dstack/_internal/server/services/logs/__init__.py +15 -2
  63. dstack/_internal/server/services/logs/aws.py +2 -4
  64. dstack/_internal/server/services/logs/filelog.py +33 -27
  65. dstack/_internal/server/services/logs/gcp.py +3 -5
  66. dstack/_internal/server/services/proxy/repo.py +4 -1
  67. dstack/_internal/server/services/runs.py +139 -72
  68. dstack/_internal/server/services/services/__init__.py +2 -1
  69. dstack/_internal/server/services/users.py +3 -1
  70. dstack/_internal/server/services/volumes.py +15 -2
  71. dstack/_internal/server/settings.py +25 -6
  72. dstack/_internal/server/statics/index.html +1 -1
  73. dstack/_internal/server/statics/{main-d151637af20f70b2e796.js → main-64f8273740c4b52c18f5.js} +71 -67
  74. dstack/_internal/server/statics/{main-d151637af20f70b2e796.js.map → main-64f8273740c4b52c18f5.js.map} +1 -1
  75. dstack/_internal/server/statics/{main-d48635d8fe670d53961c.css → main-d58fc0460cb0eae7cb5c.css} +1 -1
  76. dstack/_internal/server/testing/common.py +48 -8
  77. dstack/_internal/server/utils/routers.py +31 -8
  78. dstack/_internal/utils/json_utils.py +54 -0
  79. dstack/api/_public/runs.py +13 -2
  80. dstack/api/server/_runs.py +12 -2
  81. dstack/version.py +1 -1
  82. {dstack-0.19.17.dist-info → dstack-0.19.19.dist-info}/METADATA +17 -14
  83. {dstack-0.19.17.dist-info → dstack-0.19.19.dist-info}/RECORD +86 -83
  84. {dstack-0.19.17.dist-info → dstack-0.19.19.dist-info}/WHEEL +0 -0
  85. {dstack-0.19.17.dist-info → dstack-0.19.19.dist-info}/entry_points.txt +0 -0
  86. {dstack-0.19.17.dist-info → dstack-0.19.19.dist-info}/licenses/LICENSE.md +0 -0

dstack/_internal/server/background/tasks/process_gateways.py

@@ -16,6 +16,7 @@ from dstack._internal.server.services.gateways import (
     gateway_connections_pool,
 )
 from dstack._internal.server.services.locking import advisory_lock_ctx, get_locker
+from dstack._internal.server.services.logging import fmt
 from dstack._internal.utils.common import get_current_datetime
 from dstack._internal.utils.logging import get_logger
 
@@ -27,14 +28,14 @@ async def process_gateways_connections():
     await _process_active_connections()
 
 
-async def process_submitted_gateways():
-    lock, lockset = get_locker().get_lockset(GatewayModel.__tablename__)
+async def process_gateways():
+    lock, lockset = get_locker(get_db().dialect_name).get_lockset(GatewayModel.__tablename__)
     async with get_session_ctx() as session:
         async with lock:
             res = await session.execute(
                 select(GatewayModel)
                 .where(
-                    GatewayModel.status == GatewayStatus.SUBMITTED,
+                    GatewayModel.status.in_([GatewayStatus.SUBMITTED, GatewayStatus.PROVISIONING]),
                     GatewayModel.id.not_in(lockset),
                 )
                 .options(lazyload(GatewayModel.gateway_compute))
@@ -48,7 +49,25 @@ async def process_submitted_gateways():
             lockset.add(gateway_model.id)
         try:
             gateway_model_id = gateway_model.id
-            await _process_submitted_gateway(session=session, gateway_model=gateway_model)
+            initial_status = gateway_model.status
+            if initial_status == GatewayStatus.SUBMITTED:
+                await _process_submitted_gateway(session=session, gateway_model=gateway_model)
+            elif initial_status == GatewayStatus.PROVISIONING:
+                await _process_provisioning_gateway(session=session, gateway_model=gateway_model)
+            else:
+                logger.error(
+                    "%s: unexpected gateway status %r", fmt(gateway_model), initial_status.upper()
+                )
+            if gateway_model.status != initial_status:
+                logger.info(
+                    "%s: gateway status has changed %s -> %s%s",
+                    fmt(gateway_model),
+                    initial_status.upper(),
+                    gateway_model.status.upper(),
+                    f": {gateway_model.status_message}" if gateway_model.status_message else "",
+                )
+            gateway_model.last_processed_at = get_current_datetime()
+            await session.commit()
         finally:
             lockset.difference_update([gateway_model_id])
 
@@ -89,7 +108,7 @@ async def _process_connection(conn: GatewayConnection):
 
 
 async def _process_submitted_gateway(session: AsyncSession, gateway_model: GatewayModel):
-    logger.info("Started gateway %s provisioning", gateway_model.name)
+    logger.info("%s: started gateway provisioning", fmt(gateway_model))
     # Refetch to load related attributes.
     # joinedload produces LEFT OUTER JOIN that can't be used with FOR UPDATE.
     res = await session.execute(
@@ -110,8 +129,6 @@ async def _process_submitted_gateway(session: AsyncSession, gateway_model: Gatew
     except BackendNotAvailable:
         gateway_model.status = GatewayStatus.FAILED
         gateway_model.status_message = "Backend not available"
-        gateway_model.last_processed_at = get_current_datetime()
-        await session.commit()
         return
 
     try:
@@ -123,53 +140,54 @@ async def _process_submitted_gateway(session: AsyncSession, gateway_model: Gatew
         )
         session.add(gateway_model)
         gateway_model.status = GatewayStatus.PROVISIONING
-        await session.commit()
-        await session.refresh(gateway_model)
     except BackendError as e:
-        logger.info(
-            "Failed to create gateway compute for gateway %s: %s", gateway_model.name, repr(e)
-        )
+        logger.info("%s: failed to create gateway compute: %r", fmt(gateway_model), e)
         gateway_model.status = GatewayStatus.FAILED
         status_message = f"Backend error: {repr(e)}"
         if len(e.args) > 0:
             status_message = str(e.args[0])
         gateway_model.status_message = status_message
-        gateway_model.last_processed_at = get_current_datetime()
-        await session.commit()
-        return
     except Exception as e:
-        logger.exception(
-            "Got exception when creating gateway compute for gateway %s", gateway_model.name
-        )
+        logger.exception("%s: got exception when creating gateway compute", fmt(gateway_model))
         gateway_model.status = GatewayStatus.FAILED
         gateway_model.status_message = f"Unexpected error: {repr(e)}"
-        gateway_model.last_processed_at = get_current_datetime()
-        await session.commit()
-        return
 
+
+async def _process_provisioning_gateway(
+    session: AsyncSession, gateway_model: GatewayModel
+) -> None:
+    # Refetch to load related attributes.
+    # joinedload produces LEFT OUTER JOIN that can't be used with FOR UPDATE.
+    res = await session.execute(
+        select(GatewayModel)
+        .where(GatewayModel.id == gateway_model.id)
+        .execution_options(populate_existing=True)
+    )
+    gateway_model = res.unique().scalar_one()
+
+    # FIXME: problems caused by blocking on connect_to_gateway_with_retry and configure_gateway:
+    # - cannot delete the gateway before it is provisioned because the DB model is locked
+    # - connection retry counter is reset on server restart
+    # - only one server replica is processing the gateway
+    # Easy to fix by doing only one connection/configuration attempt per processing iteration. The
+    # main challenge is applying the same provisioning model to the dstack Sky gateway to avoid
+    # maintaining a different model for Sky.
     connection = await gateways_services.connect_to_gateway_with_retry(
         gateway_model.gateway_compute
     )
     if connection is None:
         gateway_model.status = GatewayStatus.FAILED
         gateway_model.status_message = "Failed to connect to gateway"
-        gateway_model.last_processed_at = get_current_datetime()
         gateway_model.gateway_compute.deleted = True
-        await session.commit()
         return
-
     try:
         await gateways_services.configure_gateway(connection)
     except Exception:
-        logger.exception("Failed to configure gateway %s", gateway_model.name)
+        logger.exception("%s: failed to configure gateway", fmt(gateway_model))
         gateway_model.status = GatewayStatus.FAILED
         gateway_model.status_message = "Failed to configure gateway"
-        gateway_model.last_processed_at = get_current_datetime()
         await gateway_connections_pool.remove(gateway_model.gateway_compute.ip_address)
         gateway_model.gateway_compute.active = False
-        await session.commit()
         return
 
     gateway_model.status = GatewayStatus.RUNNING
-    gateway_model.last_processed_at = get_current_datetime()
-    await session.commit()

dstack/_internal/server/background/tasks/process_idle_volumes.py (new file)

@@ -0,0 +1,139 @@
+import datetime
+from typing import List
+
+from sqlalchemy import select
+from sqlalchemy.ext.asyncio import AsyncSession
+from sqlalchemy.orm import joinedload
+
+from dstack._internal.core.backends.base.compute import ComputeWithVolumeSupport
+from dstack._internal.core.errors import BackendNotAvailable
+from dstack._internal.core.models.profiles import parse_duration
+from dstack._internal.core.models.volumes import VolumeStatus
+from dstack._internal.server.db import get_db, get_session_ctx
+from dstack._internal.server.models import ProjectModel, VolumeModel
+from dstack._internal.server.services import backends as backends_services
+from dstack._internal.server.services.locking import get_locker
+from dstack._internal.server.services.volumes import (
+    get_volume_configuration,
+    volume_model_to_volume,
+)
+from dstack._internal.utils import common
+from dstack._internal.utils.common import get_current_datetime
+from dstack._internal.utils.logging import get_logger
+
+logger = get_logger(__name__)
+
+
+async def process_idle_volumes():
+    lock, lockset = get_locker(get_db().dialect_name).get_lockset(VolumeModel.__tablename__)
+    async with get_session_ctx() as session:
+        async with lock:
+            res = await session.execute(
+                select(VolumeModel.id)
+                .where(
+                    VolumeModel.status == VolumeStatus.ACTIVE,
+                    VolumeModel.deleted == False,
+                    VolumeModel.id.not_in(lockset),
+                )
+                .order_by(VolumeModel.last_processed_at.asc())
+                .limit(10)
+                .with_for_update(skip_locked=True, key_share=True)
+            )
+            volume_ids = list(res.scalars().all())
+            if not volume_ids:
+                return
+            for volume_id in volume_ids:
+                lockset.add(volume_id)
+
+        res = await session.execute(
+            select(VolumeModel)
+            .where(VolumeModel.id.in_(volume_ids))
+            .options(joinedload(VolumeModel.project).joinedload(ProjectModel.backends))
+            .options(joinedload(VolumeModel.user))
+            .options(joinedload(VolumeModel.attachments))
+            .execution_options(populate_existing=True)
+        )
+        volume_models = list(res.unique().scalars().all())
+        try:
+            volumes_to_delete = [v for v in volume_models if _should_delete_volume(v)]
+            if not volumes_to_delete:
+                return
+            await _delete_idle_volumes(session, volumes_to_delete)
+        finally:
+            lockset.difference_update(volume_ids)
+
+
+def _should_delete_volume(volume: VolumeModel) -> bool:
+    if volume.attachments:
+        return False
+
+    config = get_volume_configuration(volume)
+    if not config.auto_cleanup_duration:
+        return False
+
+    duration_seconds = parse_duration(config.auto_cleanup_duration)
+    if not duration_seconds or duration_seconds <= 0:
+        return False
+
+    idle_time = _get_idle_time(volume)
+    threshold = datetime.timedelta(seconds=duration_seconds)
+    return idle_time > threshold
+
+
+def _get_idle_time(volume: VolumeModel) -> datetime.timedelta:
+    last_used = volume.last_job_processed_at or volume.created_at
+    last_used_utc = last_used.replace(tzinfo=datetime.timezone.utc)
+    idle_time = get_current_datetime() - last_used_utc
+    return max(idle_time, datetime.timedelta(0))
+
+
+async def _delete_idle_volumes(session: AsyncSession, volumes: List[VolumeModel]):
+    # Note: Multiple volumes are deleted in the same transaction,
+    # so long deletion of one volume may block processing other volumes.
+    for volume_model in volumes:
+        logger.info("Deleting idle volume %s", volume_model.name)
+        try:
+            await _delete_idle_volume(session, volume_model)
+        except Exception:
+            logger.exception("Error when deleting idle volume %s", volume_model.name)
+
+        volume_model.deleted = True
+        volume_model.deleted_at = get_current_datetime()
+
+        logger.info("Deleted idle volume %s", volume_model.name)
+
+    await session.commit()
+
+
+async def _delete_idle_volume(session: AsyncSession, volume_model: VolumeModel):
+    volume = volume_model_to_volume(volume_model)
+
+    if volume.provisioning_data is None:
+        logger.error(
+            f"Failed to delete volume {volume_model.name}. volume.provisioning_data is None."
+        )
+        return
+
+    if volume.provisioning_data.backend is None:
+        logger.error(
+            f"Failed to delete volume {volume_model.name}. volume.provisioning_data.backend is None."
+        )
+        return
+
+    try:
+        backend = await backends_services.get_project_backend_by_type_or_error(
+            project=volume_model.project,
+            backend_type=volume.provisioning_data.backend,
+        )
+    except BackendNotAvailable:
+        logger.error(
+            f"Failed to delete volume {volume_model.name}. Backend {volume.configuration.backend} not available."
+        )
+        return
+
+    compute = backend.compute()
+    assert isinstance(compute, ComputeWithVolumeSupport)
+    await common.run_async(
+        compute.delete_volume,
+        volume=volume,
+    )
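
Note: the deletion criterion in _should_delete_volume above boils down to "detached longer than the configured auto_cleanup_duration". A minimal, self-contained sketch of that check follows; the dates and the parsed duration are hypothetical stand-ins for the VolumeModel fields and for parse_duration(config.auto_cleanup_duration), not additional dstack API.

import datetime

# Hypothetical stand-ins; the actual task reads VolumeModel.last_job_processed_at,
# VolumeModel.created_at, and the attachments list from the database.
created_at = datetime.datetime(2025, 7, 1, tzinfo=datetime.timezone.utc)
last_job_processed_at = datetime.datetime(2025, 7, 10, tzinfo=datetime.timezone.utc)
auto_cleanup_seconds = 5 * 24 * 3600  # e.g. a five-day cleanup duration
is_attached = False

now = datetime.datetime.now(datetime.timezone.utc)
last_used = last_job_processed_at or created_at
idle = max(now - last_used, datetime.timedelta(0))

# A volume qualifies for cleanup only if it is detached and has been idle
# longer than the configured threshold.
should_delete = not is_attached and idle > datetime.timedelta(seconds=auto_cleanup_seconds)
print(should_delete)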

dstack/_internal/server/background/tasks/process_instances.py

@@ -45,6 +45,7 @@ from dstack._internal.core.consts import DSTACK_SHIM_HTTP_PORT
 from dstack._internal.core.errors import (
     BackendError,
     NotYetTerminated,
+    PlacementGroupNotSupportedError,
     ProvisioningError,
 )
 from dstack._internal.core.models.backends.base import BackendType
@@ -73,7 +74,7 @@ from dstack._internal.core.models.runs import (
 from dstack._internal.core.services.profiles import get_retry
 from dstack._internal.server import settings as server_settings
 from dstack._internal.server.background.tasks.common import get_provisioning_timeout
-from dstack._internal.server.db import get_session_ctx
+from dstack._internal.server.db import get_db, get_session_ctx
 from dstack._internal.server.models import (
     FleetModel,
     InstanceModel,
@@ -110,6 +111,8 @@ from dstack._internal.utils.ssh import (
     pkey_from_str,
 )
 
+MIN_PROCESSING_INTERVAL = timedelta(seconds=10)
+
 PENDING_JOB_RETRY_INTERVAL = timedelta(seconds=60)
 
 TERMINATION_DEADLINE_OFFSET = timedelta(minutes=20)
@@ -129,7 +132,7 @@ async def process_instances(batch_size: int = 1):
 
 
 async def _process_next_instance():
-    lock, lockset = get_locker().get_lockset(InstanceModel.__tablename__)
+    lock, lockset = get_locker(get_db().dialect_name).get_lockset(InstanceModel.__tablename__)
     async with get_session_ctx() as session:
         async with lock:
             res = await session.execute(
@@ -145,6 +148,8 @@ async def _process_next_instance():
                         ]
                     ),
                     InstanceModel.id.not_in(lockset),
+                    InstanceModel.last_processed_at
+                    < get_current_datetime().replace(tzinfo=None) - MIN_PROCESSING_INTERVAL,
                 )
                 .options(lazyload(InstanceModel.jobs))
                 .order_by(InstanceModel.last_processed_at.asc())
@@ -1063,6 +1068,12 @@ async def _create_placement_group(
             placement_group_model_to_placement_group(placement_group_model),
             master_instance_offer,
         )
+    except PlacementGroupNotSupportedError:
+        logger.debug(
+            "Skipping offer %s because placement group not supported",
+            master_instance_offer.instance.name,
+        )
+        return None
     except BackendError as e:
         logger.warning(
             "Failed to create placement group %s in %s/%s: %r",

dstack/_internal/server/background/tasks/process_placement_groups.py

@@ -7,7 +7,7 @@ from sqlalchemy.orm import joinedload
 
 from dstack._internal.core.backends.base.compute import ComputeWithPlacementGroupSupport
 from dstack._internal.core.errors import PlacementGroupInUseError
-from dstack._internal.server.db import get_session_ctx
+from dstack._internal.server.db import get_db, get_session_ctx
 from dstack._internal.server.models import PlacementGroupModel, ProjectModel
 from dstack._internal.server.services import backends as backends_services
 from dstack._internal.server.services.locking import get_locker
@@ -19,7 +19,9 @@ logger = get_logger(__name__)
 
 
 async def process_placement_groups():
-    lock, lockset = get_locker().get_lockset(PlacementGroupModel.__tablename__)
+    lock, lockset = get_locker(get_db().dialect_name).get_lockset(
+        PlacementGroupModel.__tablename__
+    )
     async with get_session_ctx() as session:
         async with lock:
             res = await session.execute(

dstack/_internal/server/background/tasks/process_running_jobs.py

@@ -34,10 +34,11 @@ from dstack._internal.core.models.runs import (
     JobTerminationReason,
     Run,
     RunSpec,
+    RunStatus,
 )
 from dstack._internal.core.models.volumes import InstanceMountPoint, Volume, VolumeMountPoint
 from dstack._internal.server.background.tasks.common import get_provisioning_timeout
-from dstack._internal.server.db import get_session_ctx
+from dstack._internal.server.db import get_db, get_session_ctx
 from dstack._internal.server.models import (
     InstanceModel,
     JobModel,
@@ -79,6 +80,7 @@ from dstack._internal.utils.logging import get_logger
 logger = get_logger(__name__)
 
 
+MIN_PROCESSING_INTERVAL = timedelta(seconds=10)
 # Minimum time before terminating active job in case of connectivity issues.
 # Should be sufficient to survive most problems caused by
 # the server network flickering and providers' glitches.
@@ -93,20 +95,29 @@ async def process_running_jobs(batch_size: int = 1):
 
 
 async def _process_next_running_job():
-    lock, lockset = get_locker().get_lockset(JobModel.__tablename__)
+    lock, lockset = get_locker(get_db().dialect_name).get_lockset(JobModel.__tablename__)
     async with get_session_ctx() as session:
         async with lock:
             res = await session.execute(
                 select(JobModel)
+                .join(JobModel.run)
                 .where(
                     JobModel.status.in_(
                         [JobStatus.PROVISIONING, JobStatus.PULLING, JobStatus.RUNNING]
                     ),
+                    RunModel.status.not_in([RunStatus.TERMINATING]),
                     JobModel.id.not_in(lockset),
+                    JobModel.last_processed_at
+                    < common_utils.get_current_datetime().replace(tzinfo=None)
+                    - MIN_PROCESSING_INTERVAL,
                 )
                 .order_by(JobModel.last_processed_at.asc())
                 .limit(1)
-                .with_for_update(skip_locked=True, key_share=True)
+                .with_for_update(
+                    skip_locked=True,
+                    key_share=True,
+                    of=JobModel,
+                )
             )
             job_model = res.unique().scalar()
             if job_model is None:

dstack/_internal/server/background/tasks/process_runs.py

@@ -19,7 +19,7 @@ from dstack._internal.core.models.runs import (
     RunStatus,
     RunTerminationReason,
 )
-from dstack._internal.server.db import get_session_ctx
+from dstack._internal.server.db import get_db, get_session_ctx
 from dstack._internal.server.models import JobModel, ProjectModel, RunModel
 from dstack._internal.server.services.jobs import (
     find_job,
@@ -41,6 +41,8 @@ from dstack._internal.utils import common
 from dstack._internal.utils.logging import get_logger
 
 logger = get_logger(__name__)
+
+MIN_PROCESSING_INTERVAL = datetime.timedelta(seconds=5)
 ROLLING_DEPLOYMENT_MAX_SURGE = 1  # at most one extra replica during rolling deployment
 
 
@@ -52,8 +54,8 @@ async def process_runs(batch_size: int = 1):
 
 
 async def _process_next_run():
-    run_lock, run_lockset = get_locker().get_lockset(RunModel.__tablename__)
-    job_lock, job_lockset = get_locker().get_lockset(JobModel.__tablename__)
+    run_lock, run_lockset = get_locker(get_db().dialect_name).get_lockset(RunModel.__tablename__)
+    job_lock, job_lockset = get_locker(get_db().dialect_name).get_lockset(JobModel.__tablename__)
     async with get_session_ctx() as session:
         async with run_lock, job_lock:
             res = await session.execute(
@@ -61,6 +63,8 @@ async def _process_next_run():
                 .where(
                     RunModel.status.not_in(RunStatus.finished_statuses()),
                     RunModel.id.not_in(run_lockset),
+                    RunModel.last_processed_at
+                    < common.get_current_datetime().replace(tzinfo=None) - MIN_PROCESSING_INTERVAL,
                 )
                 .order_by(RunModel.last_processed_at.asc())
                 .limit(1)
@@ -337,7 +341,7 @@ async def _process_active_run(session: AsyncSession, run_model: RunModel):
             current_time - run_model.submitted_at.replace(tzinfo=datetime.timezone.utc)
         ).total_seconds()
         logger.info(
-            "%s: run took %.2f seconds from submision to provisioning.",
+            "%s: run took %.2f seconds from submission to provisioning.",
             fmt(run_model),
             submit_to_provision_duration,
         )

dstack/_internal/server/background/tasks/process_submitted_jobs.py

@@ -1,5 +1,6 @@
 import asyncio
 import uuid
+from datetime import datetime, timedelta
 from typing import List, Optional, Tuple
 
 from sqlalchemy import select
@@ -80,15 +81,35 @@ from dstack._internal.utils.logging import get_logger
 logger = get_logger(__name__)
 
 
+# Track when we last processed a job.
+# This is needed for a trick:
+# If no tasks were processed recently, we force batch_size 1.
+# If there are lots of runs/jobs with same offers submitted,
+# we warm up the cache instead of requesting the offers concurrently.
+# Mostly useful when runs are submitted via API without getting run plan first.
+BATCH_SIZE_RESET_TIMEOUT = timedelta(minutes=2)
+last_processed_at: Optional[datetime] = None
+
+
 async def process_submitted_jobs(batch_size: int = 1):
     tasks = []
-    for _ in range(batch_size):
+    effective_batch_size = _get_effective_batch_size(batch_size)
+    for _ in range(effective_batch_size):
         tasks.append(_process_next_submitted_job())
     await asyncio.gather(*tasks)
 
 
+def _get_effective_batch_size(batch_size: int) -> int:
+    if (
+        last_processed_at is None
+        or last_processed_at < common_utils.get_current_datetime() - BATCH_SIZE_RESET_TIMEOUT
+    ):
+        return 1
+    return batch_size
+
+
 async def _process_next_submitted_job():
-    lock, lockset = get_locker().get_lockset(JobModel.__tablename__)
+    lock, lockset = get_locker(get_db().dialect_name).get_lockset(JobModel.__tablename__)
     async with get_session_ctx() as session:
         async with lock:
             res = await session.execute(
@@ -125,6 +146,8 @@ async def _process_next_submitted_job():
             await _process_submitted_job(session=session, job_model=job_model)
         finally:
             lockset.difference_update([job_model_id])
+    global last_processed_at
+    last_processed_at = common_utils.get_current_datetime()
 
 
 async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
@@ -214,7 +237,9 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
     if get_db().dialect_name == "sqlite":
         # Start new transaction to see committed changes after lock
        await session.commit()
-    async with get_locker().lock_ctx(InstanceModel.__tablename__, instances_ids):
+    async with get_locker(get_db().dialect_name).lock_ctx(
+        InstanceModel.__tablename__, instances_ids
+    ):
         # If another job freed the instance but is still trying to detach volumes,
         # do not provision on it to prevent attaching volumes that are currently detaching.
         detaching_instances_ids = await get_instances_ids_with_detaching_volumes(session)
@@ -243,8 +268,10 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
         )
         job_model.instance_assigned = True
         job_model.last_processed_at = common_utils.get_current_datetime()
-        await session.commit()
-        return
+        if len(pool_instances) > 0:
+            await session.commit()
+            return
+        # If no instances were locked, we can proceed in the same transaction.
 
         if job_model.instance is not None:
             res = await session.execute(
@@ -334,7 +361,7 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
         .order_by(VolumeModel.id)  # take locks in order
         .with_for_update(key_share=True)
     )
-    async with get_locker().lock_ctx(VolumeModel.__tablename__, volumes_ids):
+    async with get_locker(get_db().dialect_name).lock_ctx(VolumeModel.__tablename__, volumes_ids):
         if len(volume_models) > 0:
             await _attach_volumes(
                 session=session,
@@ -527,7 +554,9 @@ async def _get_next_instance_num(session: AsyncSession, fleet_model: FleetModel)
     if len(fleet_model.instances) == 0:
         # No instances means the fleet is not in the db yet, so don't lock.
         return 0
-    async with get_locker().lock_ctx(FleetModel.__tablename__, [fleet_model.id]):
+    async with get_locker(get_db().dialect_name).lock_ctx(
+        FleetModel.__tablename__, [fleet_model.id]
+    ):
         fleet_model = (
             (
                 await session.execute(
@@ -710,3 +739,5 @@ async def _attach_volume(
         attachment_data=attachment_data.json(),
     )
     instance.volume_attachments.append(volume_attachment_model)
+
+    volume_model.last_job_processed_at = common_utils.get_current_datetime()
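
Note: the batch-size trick introduced above can be read in isolation as the following sketch; it is a distilled restatement of _get_effective_batch_size and the module-level last_processed_at, not additional dstack API.

import datetime
from typing import Optional

BATCH_SIZE_RESET_TIMEOUT = datetime.timedelta(minutes=2)
last_processed_at: Optional[datetime.datetime] = None  # module-level state, updated after each job


def effective_batch_size(batch_size: int) -> int:
    # After a quiet period, process a single job first so that the offers cache
    # is warmed before concurrent submitted-job processing resumes.
    now = datetime.datetime.now(datetime.timezone.utc)
    if last_processed_at is None or last_processed_at < now - BATCH_SIZE_RESET_TIMEOUT:
        return 1
    return batch_size


print(effective_batch_size(4))  # 1 on the first call after a quiet period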

dstack/_internal/server/background/tasks/process_terminating_jobs.py

@@ -5,7 +5,7 @@ from sqlalchemy.ext.asyncio import AsyncSession
 from sqlalchemy.orm import joinedload, lazyload
 
 from dstack._internal.core.models.runs import JobStatus
-from dstack._internal.server.db import get_session_ctx
+from dstack._internal.server.db import get_db, get_session_ctx
 from dstack._internal.server.models import (
     InstanceModel,
     JobModel,
@@ -32,8 +32,10 @@ async def process_terminating_jobs(batch_size: int = 1):
 
 
 async def _process_next_terminating_job():
-    job_lock, job_lockset = get_locker().get_lockset(JobModel.__tablename__)
-    instance_lock, instance_lockset = get_locker().get_lockset(InstanceModel.__tablename__)
+    job_lock, job_lockset = get_locker(get_db().dialect_name).get_lockset(JobModel.__tablename__)
+    instance_lock, instance_lockset = get_locker(get_db().dialect_name).get_lockset(
+        InstanceModel.__tablename__
+    )
     async with get_session_ctx() as session:
         async with job_lock, instance_lock:
             res = await session.execute(

dstack/_internal/server/background/tasks/process_volumes.py

@@ -5,7 +5,7 @@ from sqlalchemy.orm import joinedload
 from dstack._internal.core.backends.base.compute import ComputeWithVolumeSupport
 from dstack._internal.core.errors import BackendError, BackendNotAvailable
 from dstack._internal.core.models.volumes import VolumeStatus
-from dstack._internal.server.db import get_session_ctx
+from dstack._internal.server.db import get_db, get_session_ctx
 from dstack._internal.server.models import (
     InstanceModel,
     ProjectModel,
@@ -22,7 +22,7 @@ logger = get_logger(__name__)
 
 
 async def process_submitted_volumes():
-    lock, lockset = get_locker().get_lockset(VolumeModel.__tablename__)
+    lock, lockset = get_locker(get_db().dialect_name).get_lockset(VolumeModel.__tablename__)
     async with get_session_ctx() as session:
         async with lock:
             res = await session.execute(

dstack/_internal/server/migrations/versions/35e90e1b0d3e_add_rolling_deployment_fields.py

@@ -17,12 +17,6 @@ depends_on = None
 
 
 def upgrade() -> None:
-    with op.batch_alter_table("jobs", schema=None) as batch_op:
-        batch_op.add_column(sa.Column("deployment_num", sa.Integer(), nullable=True))
-    with op.batch_alter_table("jobs", schema=None) as batch_op:
-        batch_op.execute("UPDATE jobs SET deployment_num = 0")
-        batch_op.alter_column("deployment_num", nullable=False)
-
     with op.batch_alter_table("runs", schema=None) as batch_op:
         batch_op.add_column(sa.Column("deployment_num", sa.Integer(), nullable=True))
         batch_op.add_column(sa.Column("desired_replica_count", sa.Integer(), nullable=True))
@@ -32,6 +26,12 @@ def upgrade() -> None:
         batch_op.alter_column("deployment_num", nullable=False)
         batch_op.alter_column("desired_replica_count", nullable=False)
 
+    with op.batch_alter_table("jobs", schema=None) as batch_op:
+        batch_op.add_column(sa.Column("deployment_num", sa.Integer(), nullable=True))
+    with op.batch_alter_table("jobs", schema=None) as batch_op:
+        batch_op.execute("UPDATE jobs SET deployment_num = 0")
+        batch_op.alter_column("deployment_num", nullable=False)
+
 
 def downgrade() -> None:
     with op.batch_alter_table("runs", schema=None) as batch_op:

dstack/_internal/server/migrations/versions/d5863798bf41_add_volumemodel_last_job_processed_at.py (new file)

@@ -0,0 +1,40 @@
+"""Add VolumeModel.last_job_processed_at
+
+Revision ID: d5863798bf41
+Revises: 644b8a114187
+Create Date: 2025-07-15 14:26:22.981687
+
+"""
+
+import sqlalchemy as sa
+from alembic import op
+
+import dstack._internal.server.models
+
+# revision identifiers, used by Alembic.
+revision = "d5863798bf41"
+down_revision = "644b8a114187"
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    with op.batch_alter_table("volumes", schema=None) as batch_op:
+        batch_op.add_column(
+            sa.Column(
+                "last_job_processed_at",
+                dstack._internal.server.models.NaiveDateTime(),
+                nullable=True,
+            )
+        )
+
+    # ### end Alembic commands ###
+
+
+def downgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    with op.batch_alter_table("volumes", schema=None) as batch_op:
+        batch_op.drop_column("last_job_processed_at")
+
+    # ### end Alembic commands ###