dstack 0.18.43__py3-none-any.whl → 0.19.0rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (278):
  1. dstack/_internal/cli/commands/gateway.py +15 -3
  2. dstack/_internal/cli/commands/logs.py +0 -22
  3. dstack/_internal/cli/commands/stats.py +8 -17
  4. dstack/_internal/cli/main.py +1 -5
  5. dstack/_internal/cli/services/configurators/fleet.py +4 -39
  6. dstack/_internal/cli/services/configurators/run.py +22 -20
  7. dstack/_internal/cli/services/profile.py +34 -83
  8. dstack/_internal/cli/utils/gateway.py +1 -1
  9. dstack/_internal/cli/utils/run.py +11 -0
  10. dstack/_internal/core/backends/__init__.py +56 -39
  11. dstack/_internal/core/backends/aws/__init__.py +0 -25
  12. dstack/_internal/core/backends/aws/auth.py +1 -10
  13. dstack/_internal/core/backends/aws/backend.py +26 -0
  14. dstack/_internal/core/backends/aws/compute.py +21 -45
  15. dstack/_internal/{server/services/backends/configurators/aws.py → core/backends/aws/configurator.py} +46 -85
  16. dstack/_internal/core/backends/aws/models.py +135 -0
  17. dstack/_internal/core/backends/aws/resources.py +1 -1
  18. dstack/_internal/core/backends/azure/__init__.py +0 -20
  19. dstack/_internal/core/backends/azure/auth.py +2 -11
  20. dstack/_internal/core/backends/azure/backend.py +21 -0
  21. dstack/_internal/core/backends/azure/compute.py +14 -28
  22. dstack/_internal/{server/services/backends/configurators/azure.py → core/backends/azure/configurator.py} +141 -210
  23. dstack/_internal/core/backends/azure/models.py +89 -0
  24. dstack/_internal/core/backends/base/__init__.py +0 -12
  25. dstack/_internal/core/backends/base/backend.py +18 -0
  26. dstack/_internal/core/backends/base/compute.py +153 -33
  27. dstack/_internal/core/backends/base/configurator.py +105 -0
  28. dstack/_internal/core/backends/base/models.py +14 -0
  29. dstack/_internal/core/backends/configurators.py +138 -0
  30. dstack/_internal/core/backends/cudo/__init__.py +0 -15
  31. dstack/_internal/core/backends/cudo/backend.py +16 -0
  32. dstack/_internal/core/backends/cudo/compute.py +8 -26
  33. dstack/_internal/core/backends/cudo/configurator.py +72 -0
  34. dstack/_internal/core/backends/cudo/models.py +37 -0
  35. dstack/_internal/core/backends/datacrunch/__init__.py +0 -15
  36. dstack/_internal/core/backends/datacrunch/backend.py +16 -0
  37. dstack/_internal/core/backends/datacrunch/compute.py +8 -25
  38. dstack/_internal/core/backends/datacrunch/configurator.py +66 -0
  39. dstack/_internal/core/backends/datacrunch/models.py +38 -0
  40. dstack/_internal/core/{models/backends/dstack.py → backends/dstack/models.py} +7 -7
  41. dstack/_internal/core/backends/gcp/__init__.py +0 -16
  42. dstack/_internal/core/backends/gcp/auth.py +2 -11
  43. dstack/_internal/core/backends/gcp/backend.py +17 -0
  44. dstack/_internal/core/backends/gcp/compute.py +14 -44
  45. dstack/_internal/{server/services/backends/configurators/gcp.py → core/backends/gcp/configurator.py} +46 -103
  46. dstack/_internal/core/backends/gcp/models.py +125 -0
  47. dstack/_internal/core/backends/kubernetes/__init__.py +0 -15
  48. dstack/_internal/core/backends/kubernetes/backend.py +16 -0
  49. dstack/_internal/core/backends/kubernetes/compute.py +16 -5
  50. dstack/_internal/core/backends/kubernetes/configurator.py +55 -0
  51. dstack/_internal/core/backends/kubernetes/models.py +72 -0
  52. dstack/_internal/core/backends/lambdalabs/__init__.py +0 -16
  53. dstack/_internal/core/backends/lambdalabs/backend.py +17 -0
  54. dstack/_internal/core/backends/lambdalabs/compute.py +7 -28
  55. dstack/_internal/core/backends/lambdalabs/configurator.py +82 -0
  56. dstack/_internal/core/backends/lambdalabs/models.py +37 -0
  57. dstack/_internal/core/backends/local/__init__.py +0 -13
  58. dstack/_internal/core/backends/local/backend.py +14 -0
  59. dstack/_internal/core/backends/local/compute.py +16 -2
  60. dstack/_internal/core/backends/models.py +128 -0
  61. dstack/_internal/core/backends/oci/__init__.py +0 -15
  62. dstack/_internal/core/backends/oci/auth.py +1 -5
  63. dstack/_internal/core/backends/oci/backend.py +16 -0
  64. dstack/_internal/core/backends/oci/compute.py +9 -23
  65. dstack/_internal/{server/services/backends/configurators/oci.py → core/backends/oci/configurator.py} +40 -85
  66. dstack/_internal/core/{models/backends/oci.py → backends/oci/models.py} +24 -25
  67. dstack/_internal/core/backends/oci/region.py +1 -1
  68. dstack/_internal/core/backends/runpod/__init__.py +0 -15
  69. dstack/_internal/core/backends/runpod/backend.py +16 -0
  70. dstack/_internal/core/backends/runpod/compute.py +28 -6
  71. dstack/_internal/core/backends/runpod/configurator.py +59 -0
  72. dstack/_internal/core/backends/runpod/models.py +54 -0
  73. dstack/_internal/core/backends/template/__init__.py +0 -0
  74. dstack/_internal/core/backends/tensordock/__init__.py +0 -15
  75. dstack/_internal/core/backends/tensordock/backend.py +16 -0
  76. dstack/_internal/core/backends/tensordock/compute.py +8 -27
  77. dstack/_internal/core/backends/tensordock/configurator.py +68 -0
  78. dstack/_internal/core/backends/tensordock/models.py +38 -0
  79. dstack/_internal/core/backends/vastai/__init__.py +0 -15
  80. dstack/_internal/core/backends/vastai/backend.py +16 -0
  81. dstack/_internal/core/backends/vastai/compute.py +2 -2
  82. dstack/_internal/core/backends/vastai/configurator.py +66 -0
  83. dstack/_internal/core/backends/vastai/models.py +37 -0
  84. dstack/_internal/core/backends/vultr/__init__.py +0 -15
  85. dstack/_internal/core/backends/vultr/backend.py +16 -0
  86. dstack/_internal/core/backends/vultr/compute.py +10 -24
  87. dstack/_internal/core/backends/vultr/configurator.py +64 -0
  88. dstack/_internal/core/backends/vultr/models.py +34 -0
  89. dstack/_internal/core/models/backends/__init__.py +0 -184
  90. dstack/_internal/core/models/backends/base.py +0 -19
  91. dstack/_internal/core/models/configurations.py +22 -16
  92. dstack/_internal/core/models/envs.py +4 -3
  93. dstack/_internal/core/models/fleets.py +17 -22
  94. dstack/_internal/core/models/gateways.py +3 -3
  95. dstack/_internal/core/models/instances.py +24 -0
  96. dstack/_internal/core/models/profiles.py +85 -45
  97. dstack/_internal/core/models/projects.py +1 -1
  98. dstack/_internal/core/models/repos/base.py +0 -5
  99. dstack/_internal/core/models/repos/local.py +3 -3
  100. dstack/_internal/core/models/repos/remote.py +26 -12
  101. dstack/_internal/core/models/repos/virtual.py +1 -1
  102. dstack/_internal/core/models/resources.py +45 -76
  103. dstack/_internal/core/models/runs.py +21 -19
  104. dstack/_internal/core/models/volumes.py +1 -3
  105. dstack/_internal/core/services/profiles.py +7 -16
  106. dstack/_internal/core/services/repos.py +0 -4
  107. dstack/_internal/server/app.py +11 -4
  108. dstack/_internal/server/background/__init__.py +10 -0
  109. dstack/_internal/server/background/tasks/process_gateways.py +4 -8
  110. dstack/_internal/server/background/tasks/process_instances.py +14 -9
  111. dstack/_internal/server/background/tasks/process_metrics.py +1 -1
  112. dstack/_internal/server/background/tasks/process_placement_groups.py +5 -1
  113. dstack/_internal/server/background/tasks/process_prometheus_metrics.py +135 -0
  114. dstack/_internal/server/background/tasks/process_running_jobs.py +80 -24
  115. dstack/_internal/server/background/tasks/process_runs.py +1 -0
  116. dstack/_internal/server/background/tasks/process_submitted_jobs.py +20 -38
  117. dstack/_internal/server/background/tasks/process_volumes.py +5 -2
  118. dstack/_internal/server/migrations/versions/60e444118b6d_add_jobprometheusmetrics.py +40 -0
  119. dstack/_internal/server/migrations/versions/7bc2586e8b9e_make_instancemodel_pool_id_optional.py +36 -0
  120. dstack/_internal/server/migrations/versions/98d1b92988bc_add_jobterminationreason_terminated_due_.py +140 -0
  121. dstack/_internal/server/migrations/versions/bc8ca4a505c6_store_backendtype_as_string.py +171 -0
  122. dstack/_internal/server/models.py +59 -9
  123. dstack/_internal/server/routers/backends.py +14 -23
  124. dstack/_internal/server/routers/instances.py +3 -4
  125. dstack/_internal/server/routers/metrics.py +31 -10
  126. dstack/_internal/server/routers/prometheus.py +36 -0
  127. dstack/_internal/server/routers/repos.py +1 -2
  128. dstack/_internal/server/routers/runs.py +13 -59
  129. dstack/_internal/server/schemas/gateways.py +14 -23
  130. dstack/_internal/server/schemas/projects.py +7 -2
  131. dstack/_internal/server/schemas/repos.py +2 -38
  132. dstack/_internal/server/schemas/runner.py +1 -0
  133. dstack/_internal/server/schemas/runs.py +1 -24
  134. dstack/_internal/server/security/permissions.py +1 -1
  135. dstack/_internal/server/services/backends/__init__.py +85 -158
  136. dstack/_internal/server/services/config.py +53 -567
  137. dstack/_internal/server/services/fleets.py +9 -103
  138. dstack/_internal/server/services/gateways/__init__.py +13 -4
  139. dstack/_internal/server/services/{pools.py → instances.py} +22 -329
  140. dstack/_internal/server/services/jobs/__init__.py +9 -6
  141. dstack/_internal/server/services/jobs/configurators/base.py +25 -1
  142. dstack/_internal/server/services/jobs/configurators/dev.py +9 -1
  143. dstack/_internal/server/services/jobs/configurators/extensions/cursor.py +42 -0
  144. dstack/_internal/server/services/metrics.py +131 -72
  145. dstack/_internal/server/services/offers.py +1 -1
  146. dstack/_internal/server/services/projects.py +23 -14
  147. dstack/_internal/server/services/prometheus.py +245 -0
  148. dstack/_internal/server/services/runner/client.py +14 -3
  149. dstack/_internal/server/services/runs.py +67 -31
  150. dstack/_internal/server/services/volumes.py +9 -4
  151. dstack/_internal/server/settings.py +3 -0
  152. dstack/_internal/server/statics/index.html +1 -1
  153. dstack/_internal/server/statics/{main-fe8fd9db55df8d10e648.js → main-4fd5a4770eff59325ee3.js} +68 -15
  154. dstack/_internal/server/statics/{main-fe8fd9db55df8d10e648.js.map → main-4fd5a4770eff59325ee3.js.map} +1 -1
  155. dstack/_internal/server/statics/{main-7510e71dfa9749a4e70e.css → main-da9f8c06a69c20dac23e.css} +1 -1
  156. dstack/_internal/server/statics/static/media/entraID.d65d1f3e9486a8e56d24fc07b3230885.svg +9 -0
  157. dstack/_internal/server/testing/common.py +75 -32
  158. dstack/_internal/utils/json_schema.py +6 -0
  159. dstack/_internal/utils/ssh.py +2 -1
  160. dstack/api/__init__.py +4 -0
  161. dstack/api/_public/__init__.py +16 -20
  162. dstack/api/_public/backends.py +1 -1
  163. dstack/api/_public/repos.py +36 -36
  164. dstack/api/_public/runs.py +170 -83
  165. dstack/api/server/__init__.py +11 -13
  166. dstack/api/server/_backends.py +12 -16
  167. dstack/api/server/_fleets.py +15 -55
  168. dstack/api/server/_gateways.py +3 -14
  169. dstack/api/server/_repos.py +1 -4
  170. dstack/api/server/_runs.py +21 -96
  171. dstack/api/server/_volumes.py +10 -5
  172. dstack/api/utils.py +3 -0
  173. dstack/version.py +1 -1
  174. {dstack-0.18.43.dist-info → dstack-0.19.0rc1.dist-info}/METADATA +10 -1
  175. {dstack-0.18.43.dist-info → dstack-0.19.0rc1.dist-info}/RECORD +229 -206
  176. tests/_internal/cli/services/configurators/test_profile.py +6 -6
  177. tests/_internal/core/backends/aws/test_configurator.py +35 -0
  178. tests/_internal/core/backends/aws/test_resources.py +1 -1
  179. tests/_internal/core/backends/azure/test_configurator.py +61 -0
  180. tests/_internal/core/backends/cudo/__init__.py +0 -0
  181. tests/_internal/core/backends/cudo/test_configurator.py +37 -0
  182. tests/_internal/core/backends/datacrunch/__init__.py +0 -0
  183. tests/_internal/core/backends/datacrunch/test_configurator.py +17 -0
  184. tests/_internal/core/backends/gcp/test_configurator.py +42 -0
  185. tests/_internal/core/backends/kubernetes/test_configurator.py +43 -0
  186. tests/_internal/core/backends/lambdalabs/__init__.py +0 -0
  187. tests/_internal/core/backends/lambdalabs/test_configurator.py +38 -0
  188. tests/_internal/core/backends/oci/test_configurator.py +55 -0
  189. tests/_internal/core/backends/runpod/__init__.py +0 -0
  190. tests/_internal/core/backends/runpod/test_configurator.py +33 -0
  191. tests/_internal/core/backends/tensordock/__init__.py +0 -0
  192. tests/_internal/core/backends/tensordock/test_configurator.py +38 -0
  193. tests/_internal/core/backends/vastai/__init__.py +0 -0
  194. tests/_internal/core/backends/vastai/test_configurator.py +33 -0
  195. tests/_internal/core/backends/vultr/__init__.py +0 -0
  196. tests/_internal/core/backends/vultr/test_configurator.py +33 -0
  197. tests/_internal/server/background/tasks/test_process_gateways.py +4 -0
  198. tests/_internal/server/background/tasks/test_process_instances.py +49 -48
  199. tests/_internal/server/background/tasks/test_process_metrics.py +0 -3
  200. tests/_internal/server/background/tasks/test_process_placement_groups.py +2 -0
  201. tests/_internal/server/background/tasks/test_process_prometheus_metrics.py +186 -0
  202. tests/_internal/server/background/tasks/test_process_running_jobs.py +123 -19
  203. tests/_internal/server/background/tasks/test_process_runs.py +8 -22
  204. tests/_internal/server/background/tasks/test_process_submitted_jobs.py +3 -40
  205. tests/_internal/server/background/tasks/test_process_submitted_volumes.py +2 -0
  206. tests/_internal/server/background/tasks/test_process_terminating_jobs.py +10 -15
  207. tests/_internal/server/routers/test_backends.py +6 -764
  208. tests/_internal/server/routers/test_fleets.py +2 -26
  209. tests/_internal/server/routers/test_gateways.py +27 -3
  210. tests/_internal/server/routers/test_instances.py +0 -10
  211. tests/_internal/server/routers/test_metrics.py +42 -0
  212. tests/_internal/server/routers/test_projects.py +56 -0
  213. tests/_internal/server/routers/test_prometheus.py +333 -0
  214. tests/_internal/server/routers/test_repos.py +0 -15
  215. tests/_internal/server/routers/test_runs.py +83 -275
  216. tests/_internal/server/routers/test_volumes.py +2 -3
  217. tests/_internal/server/services/backends/__init__.py +0 -0
  218. tests/_internal/server/services/jobs/configurators/test_task.py +35 -0
  219. tests/_internal/server/services/test_config.py +7 -4
  220. tests/_internal/server/services/test_fleets.py +1 -4
  221. tests/_internal/server/services/{test_pools.py → test_instances.py} +11 -49
  222. tests/_internal/server/services/test_metrics.py +167 -0
  223. tests/_internal/server/services/test_repos.py +1 -14
  224. tests/_internal/server/services/test_runs.py +0 -4
  225. dstack/_internal/cli/commands/pool.py +0 -581
  226. dstack/_internal/cli/commands/run.py +0 -75
  227. dstack/_internal/core/backends/aws/config.py +0 -18
  228. dstack/_internal/core/backends/azure/config.py +0 -12
  229. dstack/_internal/core/backends/base/config.py +0 -5
  230. dstack/_internal/core/backends/cudo/config.py +0 -9
  231. dstack/_internal/core/backends/datacrunch/config.py +0 -9
  232. dstack/_internal/core/backends/gcp/config.py +0 -22
  233. dstack/_internal/core/backends/kubernetes/config.py +0 -6
  234. dstack/_internal/core/backends/lambdalabs/config.py +0 -9
  235. dstack/_internal/core/backends/nebius/__init__.py +0 -15
  236. dstack/_internal/core/backends/nebius/api_client.py +0 -319
  237. dstack/_internal/core/backends/nebius/compute.py +0 -220
  238. dstack/_internal/core/backends/nebius/config.py +0 -6
  239. dstack/_internal/core/backends/nebius/types.py +0 -37
  240. dstack/_internal/core/backends/oci/config.py +0 -6
  241. dstack/_internal/core/backends/runpod/config.py +0 -9
  242. dstack/_internal/core/backends/tensordock/config.py +0 -9
  243. dstack/_internal/core/backends/vastai/config.py +0 -6
  244. dstack/_internal/core/backends/vultr/config.py +0 -9
  245. dstack/_internal/core/models/backends/aws.py +0 -86
  246. dstack/_internal/core/models/backends/azure.py +0 -68
  247. dstack/_internal/core/models/backends/cudo.py +0 -43
  248. dstack/_internal/core/models/backends/datacrunch.py +0 -44
  249. dstack/_internal/core/models/backends/gcp.py +0 -67
  250. dstack/_internal/core/models/backends/kubernetes.py +0 -40
  251. dstack/_internal/core/models/backends/lambdalabs.py +0 -43
  252. dstack/_internal/core/models/backends/nebius.py +0 -54
  253. dstack/_internal/core/models/backends/runpod.py +0 -40
  254. dstack/_internal/core/models/backends/tensordock.py +0 -44
  255. dstack/_internal/core/models/backends/vastai.py +0 -43
  256. dstack/_internal/core/models/backends/vultr.py +0 -40
  257. dstack/_internal/core/models/pools.py +0 -43
  258. dstack/_internal/server/routers/pools.py +0 -142
  259. dstack/_internal/server/schemas/pools.py +0 -38
  260. dstack/_internal/server/services/backends/configurators/base.py +0 -72
  261. dstack/_internal/server/services/backends/configurators/cudo.py +0 -87
  262. dstack/_internal/server/services/backends/configurators/datacrunch.py +0 -79
  263. dstack/_internal/server/services/backends/configurators/kubernetes.py +0 -63
  264. dstack/_internal/server/services/backends/configurators/lambdalabs.py +0 -98
  265. dstack/_internal/server/services/backends/configurators/nebius.py +0 -85
  266. dstack/_internal/server/services/backends/configurators/runpod.py +0 -97
  267. dstack/_internal/server/services/backends/configurators/tensordock.py +0 -82
  268. dstack/_internal/server/services/backends/configurators/vastai.py +0 -80
  269. dstack/_internal/server/services/backends/configurators/vultr.py +0 -80
  270. dstack/api/_public/pools.py +0 -41
  271. dstack/api/_public/resources.py +0 -105
  272. dstack/api/server/_pools.py +0 -63
  273. tests/_internal/server/routers/test_pools.py +0 -612
  274. /dstack/_internal/{server/services/backends/configurators → core/backends/dstack}/__init__.py +0 -0
  275. {dstack-0.18.43.dist-info → dstack-0.19.0rc1.dist-info}/LICENSE.md +0 -0
  276. {dstack-0.18.43.dist-info → dstack-0.19.0rc1.dist-info}/WHEEL +0 -0
  277. {dstack-0.18.43.dist-info → dstack-0.19.0rc1.dist-info}/entry_points.txt +0 -0
  278. {dstack-0.18.43.dist-info → dstack-0.19.0rc1.dist-info}/top_level.txt +0 -0
@@ -6,7 +6,8 @@ from sqlalchemy import select
6
6
  from sqlalchemy.ext.asyncio import AsyncSession
7
7
  from sqlalchemy.orm import joinedload, lazyload, selectinload
8
8
 
9
- from dstack._internal.core.backends.base import Backend
9
+ from dstack._internal.core.backends.base.backend import Backend
10
+ from dstack._internal.core.backends.base.compute import ComputeWithVolumeSupport
10
11
  from dstack._internal.core.errors import BackendError, ServerClientError
11
12
  from dstack._internal.core.models.common import NetworkMode
12
13
  from dstack._internal.core.models.fleets import (
@@ -17,10 +18,8 @@ from dstack._internal.core.models.fleets import (
17
18
  )
18
19
  from dstack._internal.core.models.instances import InstanceOfferWithAvailability, InstanceStatus
19
20
  from dstack._internal.core.models.profiles import (
20
- DEFAULT_POOL_NAME,
21
21
  DEFAULT_RUN_TERMINATION_IDLE_TIME,
22
22
  CreationPolicy,
23
- Profile,
24
23
  TerminationPolicy,
25
24
  )
26
25
  from dstack._internal.core.models.resources import Memory
@@ -35,12 +34,12 @@ from dstack._internal.core.models.runs import (
35
34
  )
36
35
  from dstack._internal.core.models.volumes import Volume
37
36
  from dstack._internal.core.services.profiles import get_termination
37
+ from dstack._internal.server import settings
38
38
  from dstack._internal.server.db import get_db, get_session_ctx
39
39
  from dstack._internal.server.models import (
40
40
  FleetModel,
41
41
  InstanceModel,
42
42
  JobModel,
43
- PoolModel,
44
43
  ProjectModel,
45
44
  RunModel,
46
45
  VolumeAttachmentModel,
@@ -50,6 +49,12 @@ from dstack._internal.server.services.backends import get_project_backend_by_typ
50
49
  from dstack._internal.server.services.fleets import (
51
50
  fleet_model_to_fleet,
52
51
  )
52
+ from dstack._internal.server.services.instances import (
53
+ filter_pool_instances,
54
+ get_instance_offer,
55
+ get_instance_provisioning_data,
56
+ get_shared_pool_instances_with_offers,
57
+ )
53
58
  from dstack._internal.server.services.jobs import (
54
59
  check_can_attach_job_volumes,
55
60
  find_job,
@@ -61,12 +66,6 @@ from dstack._internal.server.services.jobs import (
61
66
  from dstack._internal.server.services.locking import get_locker
62
67
  from dstack._internal.server.services.logging import fmt
63
68
  from dstack._internal.server.services.offers import get_offers_by_requirements
64
- from dstack._internal.server.services.pools import (
65
- filter_pool_instances,
66
- get_instance_offer,
67
- get_instance_provisioning_data,
68
- get_shared_pool_instances_with_offers,
69
- )
70
69
  from dstack._internal.server.services.runs import (
71
70
  check_run_spec_requires_instance_mounts,
72
71
  run_model_to_run,
@@ -172,29 +171,27 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
172
171
  except ServerClientError as e:
173
172
  logger.warning("%s: failed to prepare run volumes: %s", fmt(job_model), repr(e))
174
173
  job_model.status = JobStatus.TERMINATING
175
- # TODO: Replace with JobTerminationReason.VOLUME_ERROR in 0.19
176
- job_model.termination_reason = JobTerminationReason.TERMINATED_BY_SERVER
174
+ job_model.termination_reason = JobTerminationReason.VOLUME_ERROR
177
175
  job_model.termination_reason_message = e.msg
178
176
  job_model.last_processed_at = common_utils.get_current_datetime()
179
177
  await session.commit()
180
178
  return
181
179
 
182
- pool = await _get_pool(session=session, project=project, profile=profile)
183
-
184
180
  # Submitted jobs processing happens in two steps (transactions).
185
181
  # First, the jobs gets an instance assigned (or no instance).
186
182
  # Then, the job runs on the assigned instance or a new instance is provisioned.
187
183
  # This is needed to avoid holding instances lock for a long time.
188
184
  if not job_model.instance_assigned:
189
- # Try assigning instances from the pool.
185
+ # Try assigning an existing instance
190
186
  res = await session.execute(
191
187
  select(InstanceModel)
192
188
  .where(
193
- InstanceModel.pool_id == pool.id,
189
+ InstanceModel.project_id == project.id,
194
190
  InstanceModel.deleted == False,
195
191
  InstanceModel.total_blocks > InstanceModel.busy_blocks,
196
192
  )
197
193
  .options(lazyload(InstanceModel.jobs))
194
+ .order_by(InstanceModel.id) # take locks in order
198
195
  .with_for_update()
199
196
  )
200
197
  pool_instances = list(res.unique().scalars().all())
@@ -287,7 +284,6 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
287
284
  )
288
285
  instance = _create_instance_model_for_job(
289
286
  project=project,
290
- pool=pool,
291
287
  fleet_model=fleet_model,
292
288
  run_spec=run_spec,
293
289
  job_model=job_model,
@@ -319,6 +315,7 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
319
315
  select(VolumeModel)
320
316
  .where(VolumeModel.id.in_(volumes_ids))
321
317
  .options(selectinload(VolumeModel.user))
318
+ .order_by(VolumeModel.id) # take locks in order
322
319
  .with_for_update()
323
320
  )
324
321
  async with get_locker().lock_ctx(VolumeModel.__tablename__, volumes_ids):
@@ -334,19 +331,6 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
334
331
  await session.commit()
335
332
 
336
333
 
337
- async def _get_pool(session: AsyncSession, project: ProjectModel, profile: Profile) -> PoolModel:
338
- res = await session.execute(
339
- select(PoolModel)
340
- .where(
341
- PoolModel.project_id == project.id,
342
- PoolModel.name == (profile.pool_name or DEFAULT_POOL_NAME),
343
- PoolModel.deleted == False,
344
- )
345
- .options(lazyload(PoolModel.instances))
346
- )
347
- return res.scalar_one()
348
-
349
-
350
334
  async def _assign_job_to_pool_instance(
351
335
  session: AsyncSession,
352
336
  pool_instances: List[InstanceModel],
@@ -450,7 +434,7 @@ async def _run_job_on_new_instance(
450
434
  )
451
435
  # Limit number of offers tried to prevent long-running processing
452
436
  # in case all offers fail.
453
- for backend, offer in offers[:15]:
437
+ for backend, offer in offers[: settings.MAX_OFFERS_TRIED]:
454
438
  logger.debug(
455
439
  "%s: trying %s in %s/%s for $%0.4f per hour",
456
440
  fmt(job_model),
@@ -545,7 +529,6 @@ async def _get_next_instance_num(session: AsyncSession, fleet_model: FleetModel)
545
529
 
546
530
  def _create_instance_model_for_job(
547
531
  project: ProjectModel,
548
- pool: PoolModel,
549
532
  fleet_model: FleetModel,
550
533
  run_spec: RunSpec,
551
534
  job_model: JobModel,
@@ -568,7 +551,6 @@ def _create_instance_model_for_job(
568
551
  name=f"{fleet_model.name}-{instance_num}",
569
552
  instance_num=instance_num,
570
553
  project=project,
571
- pool=pool,
572
554
  created_at=common_utils.get_current_datetime(),
573
555
  started_at=common_utils.get_current_datetime(),
574
556
  status=InstanceStatus.PROVISIONING,
@@ -674,8 +656,7 @@ async def _attach_volumes(
674
656
  except (ServerClientError, BackendError) as e:
675
657
  logger.warning("%s: failed to attached volume: %s", fmt(job_model), repr(e))
676
658
  job_model.status = JobStatus.TERMINATING
677
- # TODO: Replace with JobTerminationReason.VOLUME_ERROR in 0.19
678
- job_model.termination_reason = JobTerminationReason.TERMINATED_BY_SERVER
659
+ job_model.termination_reason = JobTerminationReason.VOLUME_ERROR
679
660
  job_model.termination_reason_message = "Failed to attach volume"
680
661
  except Exception:
681
662
  logger.exception(
@@ -683,8 +664,7 @@ async def _attach_volumes(
683
664
  fmt(job_model),
684
665
  )
685
666
  job_model.status = JobStatus.TERMINATING
686
- # TODO: Replace with JobTerminationReason.VOLUME_ERROR in 0.19
687
- job_model.termination_reason = JobTerminationReason.TERMINATED_BY_SERVER
667
+ job_model.termination_reason = JobTerminationReason.VOLUME_ERROR
688
668
  job_model.termination_reason_message = "Failed to attach volume"
689
669
  finally:
690
670
  job_model.job_runtime_data = job_runtime_data.json()
@@ -697,13 +677,15 @@ async def _attach_volume(
697
677
  instance: InstanceModel,
698
678
  instance_id: str,
699
679
  ):
680
+ compute = backend.compute()
681
+ assert isinstance(compute, ComputeWithVolumeSupport)
700
682
  volume = volume_model_to_volume(volume_model)
701
683
  # Refresh only to check if the volume wasn't deleted before the lock
702
684
  await session.refresh(volume_model)
703
685
  if volume_model.deleted:
704
686
  raise ServerClientError("Cannot attach a deleted volume")
705
687
  attachment_data = await common_utils.run_async(
706
- backend.compute().attach_volume,
688
+ compute.attach_volume,
707
689
  volume=volume,
708
690
  instance_id=instance_id,
709
691
  )
@@ -2,6 +2,7 @@ from sqlalchemy import select
2
2
  from sqlalchemy.ext.asyncio import AsyncSession
3
3
  from sqlalchemy.orm import joinedload
4
4
 
5
+ from dstack._internal.core.backends.base.compute import ComputeWithVolumeSupport
5
6
  from dstack._internal.core.errors import BackendError, BackendNotAvailable
6
7
  from dstack._internal.core.models.volumes import VolumeStatus
7
8
  from dstack._internal.server.db import get_session_ctx
@@ -81,17 +82,19 @@ async def _process_submitted_volume(session: AsyncSession, volume_model: VolumeM
81
82
  await session.commit()
82
83
  return
83
84
 
85
+ compute = backend.compute()
86
+ assert isinstance(compute, ComputeWithVolumeSupport)
84
87
  try:
85
88
  if volume.configuration.volume_id is not None:
86
89
  logger.info("Registering external volume %s", volume_model.name)
87
90
  vpd = await run_async(
88
- backend.compute().register_volume,
91
+ compute.register_volume,
89
92
  volume=volume,
90
93
  )
91
94
  else:
92
95
  logger.info("Provisioning new volume %s", volume_model.name)
93
96
  vpd = await run_async(
94
- backend.compute().create_volume,
97
+ compute.create_volume,
95
98
  volume=volume,
96
99
  )
97
100
  except BackendError as e:
@@ -0,0 +1,40 @@
1
+ """Add JobPrometheusMetrics
2
+
3
+ Revision ID: 60e444118b6d
4
+ Revises: a751ef183f27
5
+ Create Date: 2025-02-21 10:59:26.339353
6
+
7
+ """
8
+
9
+ import sqlalchemy as sa
10
+ import sqlalchemy_utils
11
+ from alembic import op
12
+
13
+ import dstack._internal.server.models
14
+
15
+ # revision identifiers, used by Alembic.
16
+ revision = "60e444118b6d"
17
+ down_revision = "a751ef183f27"
18
+ branch_labels = None
19
+ depends_on = None
20
+
21
+
22
+ def upgrade() -> None:
23
+ # ### commands auto generated by Alembic - please adjust! ###
24
+ op.create_table(
25
+ "job_prometheus_metrics",
26
+ sa.Column("job_id", sqlalchemy_utils.types.uuid.UUIDType(binary=False), nullable=False),
27
+ sa.Column("collected_at", dstack._internal.server.models.NaiveDateTime(), nullable=False),
28
+ sa.Column("text", sa.Text(), nullable=False),
29
+ sa.ForeignKeyConstraint(
30
+ ["job_id"], ["jobs.id"], name=op.f("fk_job_prometheus_metrics_job_id_jobs")
31
+ ),
32
+ sa.PrimaryKeyConstraint("job_id", name=op.f("pk_job_prometheus_metrics")),
33
+ )
34
+ # ### end Alembic commands ###
35
+
36
+
37
+ def downgrade() -> None:
38
+ # ### commands auto generated by Alembic - please adjust! ###
39
+ op.drop_table("job_prometheus_metrics")
40
+ # ### end Alembic commands ###
@@ -0,0 +1,36 @@
1
+ """Make InstanceModel.pool_id optional
2
+
3
+ Revision ID: 7bc2586e8b9e
4
+ Revises: bc8ca4a505c6
5
+ Create Date: 2025-03-13 11:13:39.748303
6
+
7
+ """
8
+
9
+ import sqlalchemy_utils
10
+ from alembic import op
11
+
12
+ # revision identifiers, used by Alembic.
13
+ revision = "7bc2586e8b9e"
14
+ down_revision = "bc8ca4a505c6"
15
+ branch_labels = None
16
+ depends_on = None
17
+
18
+
19
+ def upgrade() -> None:
20
+ # ### commands auto generated by Alembic - please adjust! ###
21
+ with op.batch_alter_table("instances", schema=None) as batch_op:
22
+ batch_op.alter_column(
23
+ "pool_id", existing_type=sqlalchemy_utils.UUIDType(binary=False), nullable=True
24
+ )
25
+
26
+ # ### end Alembic commands ###
27
+
28
+
29
+ def downgrade() -> None:
30
+ # ### commands auto generated by Alembic - please adjust! ###
31
+ with op.batch_alter_table("instances", schema=None) as batch_op:
32
+ batch_op.alter_column(
33
+ "pool_id", existing_type=sqlalchemy_utils.UUIDType(binary=False), nullable=False
34
+ )
35
+
36
+ # ### end Alembic commands ###
@@ -0,0 +1,140 @@
1
+ """Add JobTerminationReason.TERMINATED_DUE_TO_UTILIZATION_POLICY
2
+
3
+ Revision ID: 98d1b92988bc
4
+ Revises: 60e444118b6d
5
+ Create Date: 2025-02-28 15:12:37.649876
6
+
7
+ """
8
+
9
+ import sqlalchemy as sa
10
+ from alembic import op
11
+ from alembic_postgresql_enum import TableReference
12
+
13
+ # revision identifiers, used by Alembic.
14
+ revision = "98d1b92988bc"
15
+ down_revision = "60e444118b6d"
16
+ branch_labels = None
17
+ depends_on = None
18
+
19
+
20
def upgrade() -> None:
    """Add ``TERMINATED_DUE_TO_UTILIZATION_POLICY`` to ``jobterminationreason``.

    Two steps are issued: a batch ``alter_column`` on
    ``jobs.termination_reason`` (labelled "SQLite" by the original author)
    and an ``op.sync_enum_values`` call (labelled "PostgreSQL").
    """
    enum_name = "jobterminationreason"

    # Complete set of termination reasons after this migration. The order is
    # kept exactly as given to both the Enum type and sync_enum_values.
    termination_reasons = (
        "FAILED_TO_START_DUE_TO_NO_CAPACITY",
        "INTERRUPTED_BY_NO_CAPACITY",
        "WAITING_INSTANCE_LIMIT_EXCEEDED",
        "WAITING_RUNNER_LIMIT_EXCEEDED",
        "TERMINATED_BY_USER",
        "VOLUME_ERROR",
        "GATEWAY_ERROR",
        "SCALED_DOWN",
        "DONE_BY_RUNNER",
        "ABORTED_BY_USER",
        "TERMINATED_BY_SERVER",
        "INACTIVITY_DURATION_EXCEEDED",
        "TERMINATED_DUE_TO_UTILIZATION_POLICY",
        "CONTAINER_EXITED_WITH_ERROR",
        "PORTS_BINDING_FAILED",
        "CREATING_CONTAINER_ERROR",
        "EXECUTOR_ERROR",
        "MAX_DURATION_EXCEEDED",
    )

    # SQLite
    with op.batch_alter_table("jobs", schema=None) as jobs_table:
        jobs_table.alter_column(
            "termination_reason",
            existing_type=sa.VARCHAR(length=34),
            type_=sa.Enum(*termination_reasons, name=enum_name),
            existing_nullable=True,
        )

    # PostgreSQL
    termination_reason_column = TableReference(
        table_schema="public", table_name="jobs", column_name="termination_reason"
    )
    op.sync_enum_values(
        enum_schema="public",
        enum_name=enum_name,
        new_values=list(termination_reasons),
        affected_columns=[termination_reason_column],
        enum_values_to_rename=[],
    )
80
+
81
+
82
def downgrade() -> None:
    """Remove ``TERMINATED_DUE_TO_UTILIZATION_POLICY`` from ``jobterminationreason``.

    Two steps are issued: a batch ``alter_column`` on
    ``jobs.termination_reason`` (labelled "SQLite" by the original author)
    and an ``op.sync_enum_values`` call (labelled "PostgreSQL") that is given
    the value list without the removed member.
    """
    enum_name = "jobterminationreason"
    removed_reason = "TERMINATED_DUE_TO_UTILIZATION_POLICY"

    # Values present before the downgrade (i.e. including the removed one).
    current_reasons = (
        "FAILED_TO_START_DUE_TO_NO_CAPACITY",
        "INTERRUPTED_BY_NO_CAPACITY",
        "WAITING_INSTANCE_LIMIT_EXCEEDED",
        "WAITING_RUNNER_LIMIT_EXCEEDED",
        "TERMINATED_BY_USER",
        "VOLUME_ERROR",
        "GATEWAY_ERROR",
        "SCALED_DOWN",
        "DONE_BY_RUNNER",
        "ABORTED_BY_USER",
        "TERMINATED_BY_SERVER",
        "INACTIVITY_DURATION_EXCEEDED",
        "TERMINATED_DUE_TO_UTILIZATION_POLICY",
        "CONTAINER_EXITED_WITH_ERROR",
        "PORTS_BINDING_FAILED",
        "CREATING_CONTAINER_ERROR",
        "EXECUTOR_ERROR",
        "MAX_DURATION_EXCEEDED",
    )
    # Same values, same order, minus the member being removed.
    previous_reasons = [reason for reason in current_reasons if reason != removed_reason]

    # SQLite
    with op.batch_alter_table("jobs", schema=None) as jobs_table:
        jobs_table.alter_column(
            "termination_reason",
            existing_type=sa.Enum(*current_reasons, name=enum_name),
            type_=sa.VARCHAR(length=34),
            existing_nullable=True,
        )

    # PostgreSQL
    termination_reason_column = TableReference(
        table_schema="public", table_name="jobs", column_name="termination_reason"
    )
    op.sync_enum_values(
        enum_schema="public",
        enum_name=enum_name,
        new_values=previous_reasons,
        affected_columns=[termination_reason_column],
        enum_values_to_rename=[],
    )
@@ -0,0 +1,171 @@
1
+ """Store BackendType as string
2
+
3
+ Revision ID: bc8ca4a505c6
4
+ Revises: 98d1b92988bc
5
+ Create Date: 2025-03-10 14:49:06.837118
6
+
7
+ """
8
+
9
+ import sqlalchemy as sa
10
+ from alembic import op
11
+ from sqlalchemy.dialects import postgresql
12
+
13
+ # revision identifiers, used by Alembic.
14
+ revision = "bc8ca4a505c6"
15
+ down_revision = "98d1b92988bc"
16
+ branch_labels = None
17
+ depends_on = None
18
+
19
+
20
def upgrade() -> None:
    """Change backend type columns from the ``backendtype`` enum to strings.

    ``backends.type`` and ``instances.backend`` are altered to
    ``String(100)``, after which ``drop`` is called on the ``backendtype``
    enum type.
    """
    enum_name = "backendtype"
    backend_types = (
        "AWS",
        "AZURE",
        "CUDO",
        "DATACRUNCH",
        "DSTACK",
        "GCP",
        "KUBERNETES",
        "LAMBDA",
        "LOCAL",
        "REMOTE",
        "NEBIUS",
        "OCI",
        "RUNPOD",
        "TENSORDOCK",
        "VASTAI",
        "VULTR",
    )

    # (table, column, existing nullability), processed in this order.
    enum_columns = (
        ("backends", "type", False),
        ("instances", "backend", True),
    )
    for table_name, column_name, is_nullable in enum_columns:
        with op.batch_alter_table(table_name, schema=None) as table_op:
            table_op.alter_column(
                column_name,
                # A fresh ENUM object is built for every column.
                existing_type=postgresql.ENUM(*backend_types, name=enum_name),
                type_=sa.String(length=100),
                existing_nullable=is_nullable,
            )

    # Both columns have been converted above, so drop the enum type itself.
    obsolete_enum = sa.Enum(*backend_types, name=enum_name)
    obsolete_enum.drop(op.get_bind())
94
+
95
+
96
def downgrade() -> None:
    """Change backend type columns from strings back to the ``backendtype`` enum.

    ``create`` is called on the enum type first; then ``instances.backend``
    and ``backends.type`` are altered from ``String(100)`` to the enum, each
    with an explicit ``postgresql_using`` cast expression.
    """
    enum_name = "backendtype"
    backend_types = (
        "AWS",
        "AZURE",
        "CUDO",
        "DATACRUNCH",
        "DSTACK",
        "GCP",
        "KUBERNETES",
        "LAMBDA",
        "LOCAL",
        "REMOTE",
        "NEBIUS",
        "OCI",
        "RUNPOD",
        "TENSORDOCK",
        "VASTAI",
        "VULTR",
    )

    # The enum type must exist before any column can be converted to it.
    restored_enum = sa.Enum(*backend_types, name=enum_name)
    restored_enum.create(op.get_bind())

    # (table, column, existing nullability, USING expression), in this order.
    string_columns = (
        ("instances", "backend", True, "backend::VARCHAR::backendtype"),
        ("backends", "type", False, "type::VARCHAR::backendtype"),
    )
    for table_name, column_name, is_nullable, using_expression in string_columns:
        with op.batch_alter_table(table_name, schema=None) as table_op:
            table_op.alter_column(
                column_name,
                existing_type=sa.String(length=100),
                # A fresh ENUM object is built for every column.
                type_=postgresql.ENUM(*backend_types, name=enum_name),
                existing_nullable=is_nullable,
                postgresql_using=using_expression,
            )