dstack 0.18.43__py3-none-any.whl → 0.19.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of dstack might be problematic. Click here for more details.

Files changed (278)
  1. dstack/_internal/cli/commands/gateway.py +15 -3
  2. dstack/_internal/cli/commands/logs.py +0 -22
  3. dstack/_internal/cli/commands/stats.py +8 -17
  4. dstack/_internal/cli/main.py +1 -5
  5. dstack/_internal/cli/services/configurators/fleet.py +4 -39
  6. dstack/_internal/cli/services/configurators/run.py +22 -20
  7. dstack/_internal/cli/services/profile.py +34 -83
  8. dstack/_internal/cli/utils/gateway.py +1 -1
  9. dstack/_internal/cli/utils/run.py +11 -0
  10. dstack/_internal/core/backends/__init__.py +56 -39
  11. dstack/_internal/core/backends/aws/__init__.py +0 -25
  12. dstack/_internal/core/backends/aws/auth.py +1 -10
  13. dstack/_internal/core/backends/aws/backend.py +26 -0
  14. dstack/_internal/core/backends/aws/compute.py +21 -45
  15. dstack/_internal/{server/services/backends/configurators/aws.py → core/backends/aws/configurator.py} +46 -85
  16. dstack/_internal/core/backends/aws/models.py +135 -0
  17. dstack/_internal/core/backends/aws/resources.py +1 -1
  18. dstack/_internal/core/backends/azure/__init__.py +0 -20
  19. dstack/_internal/core/backends/azure/auth.py +2 -11
  20. dstack/_internal/core/backends/azure/backend.py +21 -0
  21. dstack/_internal/core/backends/azure/compute.py +14 -28
  22. dstack/_internal/{server/services/backends/configurators/azure.py → core/backends/azure/configurator.py} +141 -210
  23. dstack/_internal/core/backends/azure/models.py +89 -0
  24. dstack/_internal/core/backends/base/__init__.py +0 -12
  25. dstack/_internal/core/backends/base/backend.py +18 -0
  26. dstack/_internal/core/backends/base/compute.py +153 -33
  27. dstack/_internal/core/backends/base/configurator.py +105 -0
  28. dstack/_internal/core/backends/base/models.py +14 -0
  29. dstack/_internal/core/backends/configurators.py +138 -0
  30. dstack/_internal/core/backends/cudo/__init__.py +0 -15
  31. dstack/_internal/core/backends/cudo/backend.py +16 -0
  32. dstack/_internal/core/backends/cudo/compute.py +8 -26
  33. dstack/_internal/core/backends/cudo/configurator.py +72 -0
  34. dstack/_internal/core/backends/cudo/models.py +37 -0
  35. dstack/_internal/core/backends/datacrunch/__init__.py +0 -15
  36. dstack/_internal/core/backends/datacrunch/backend.py +16 -0
  37. dstack/_internal/core/backends/datacrunch/compute.py +8 -25
  38. dstack/_internal/core/backends/datacrunch/configurator.py +66 -0
  39. dstack/_internal/core/backends/datacrunch/models.py +38 -0
  40. dstack/_internal/core/{models/backends/dstack.py → backends/dstack/models.py} +7 -7
  41. dstack/_internal/core/backends/gcp/__init__.py +0 -16
  42. dstack/_internal/core/backends/gcp/auth.py +2 -11
  43. dstack/_internal/core/backends/gcp/backend.py +17 -0
  44. dstack/_internal/core/backends/gcp/compute.py +14 -44
  45. dstack/_internal/{server/services/backends/configurators/gcp.py → core/backends/gcp/configurator.py} +46 -103
  46. dstack/_internal/core/backends/gcp/models.py +125 -0
  47. dstack/_internal/core/backends/kubernetes/__init__.py +0 -15
  48. dstack/_internal/core/backends/kubernetes/backend.py +16 -0
  49. dstack/_internal/core/backends/kubernetes/compute.py +16 -5
  50. dstack/_internal/core/backends/kubernetes/configurator.py +55 -0
  51. dstack/_internal/core/backends/kubernetes/models.py +72 -0
  52. dstack/_internal/core/backends/lambdalabs/__init__.py +0 -16
  53. dstack/_internal/core/backends/lambdalabs/backend.py +17 -0
  54. dstack/_internal/core/backends/lambdalabs/compute.py +7 -28
  55. dstack/_internal/core/backends/lambdalabs/configurator.py +82 -0
  56. dstack/_internal/core/backends/lambdalabs/models.py +37 -0
  57. dstack/_internal/core/backends/local/__init__.py +0 -13
  58. dstack/_internal/core/backends/local/backend.py +14 -0
  59. dstack/_internal/core/backends/local/compute.py +16 -2
  60. dstack/_internal/core/backends/models.py +128 -0
  61. dstack/_internal/core/backends/oci/__init__.py +0 -15
  62. dstack/_internal/core/backends/oci/auth.py +1 -5
  63. dstack/_internal/core/backends/oci/backend.py +16 -0
  64. dstack/_internal/core/backends/oci/compute.py +9 -23
  65. dstack/_internal/{server/services/backends/configurators/oci.py → core/backends/oci/configurator.py} +40 -85
  66. dstack/_internal/core/{models/backends/oci.py → backends/oci/models.py} +24 -25
  67. dstack/_internal/core/backends/oci/region.py +1 -1
  68. dstack/_internal/core/backends/runpod/__init__.py +0 -15
  69. dstack/_internal/core/backends/runpod/backend.py +16 -0
  70. dstack/_internal/core/backends/runpod/compute.py +28 -6
  71. dstack/_internal/core/backends/runpod/configurator.py +59 -0
  72. dstack/_internal/core/backends/runpod/models.py +54 -0
  73. dstack/_internal/core/backends/template/__init__.py +0 -0
  74. dstack/_internal/core/backends/tensordock/__init__.py +0 -15
  75. dstack/_internal/core/backends/tensordock/backend.py +16 -0
  76. dstack/_internal/core/backends/tensordock/compute.py +8 -27
  77. dstack/_internal/core/backends/tensordock/configurator.py +68 -0
  78. dstack/_internal/core/backends/tensordock/models.py +38 -0
  79. dstack/_internal/core/backends/vastai/__init__.py +0 -15
  80. dstack/_internal/core/backends/vastai/backend.py +16 -0
  81. dstack/_internal/core/backends/vastai/compute.py +2 -2
  82. dstack/_internal/core/backends/vastai/configurator.py +66 -0
  83. dstack/_internal/core/backends/vastai/models.py +37 -0
  84. dstack/_internal/core/backends/vultr/__init__.py +0 -15
  85. dstack/_internal/core/backends/vultr/backend.py +16 -0
  86. dstack/_internal/core/backends/vultr/compute.py +10 -24
  87. dstack/_internal/core/backends/vultr/configurator.py +64 -0
  88. dstack/_internal/core/backends/vultr/models.py +34 -0
  89. dstack/_internal/core/models/backends/__init__.py +0 -184
  90. dstack/_internal/core/models/backends/base.py +0 -19
  91. dstack/_internal/core/models/configurations.py +22 -16
  92. dstack/_internal/core/models/envs.py +4 -3
  93. dstack/_internal/core/models/fleets.py +17 -22
  94. dstack/_internal/core/models/gateways.py +3 -3
  95. dstack/_internal/core/models/instances.py +24 -0
  96. dstack/_internal/core/models/profiles.py +85 -45
  97. dstack/_internal/core/models/projects.py +1 -1
  98. dstack/_internal/core/models/repos/base.py +0 -5
  99. dstack/_internal/core/models/repos/local.py +3 -3
  100. dstack/_internal/core/models/repos/remote.py +26 -12
  101. dstack/_internal/core/models/repos/virtual.py +1 -1
  102. dstack/_internal/core/models/resources.py +45 -76
  103. dstack/_internal/core/models/runs.py +21 -19
  104. dstack/_internal/core/models/volumes.py +1 -3
  105. dstack/_internal/core/services/profiles.py +7 -16
  106. dstack/_internal/core/services/repos.py +0 -4
  107. dstack/_internal/server/app.py +11 -4
  108. dstack/_internal/server/background/__init__.py +10 -0
  109. dstack/_internal/server/background/tasks/process_gateways.py +4 -8
  110. dstack/_internal/server/background/tasks/process_instances.py +14 -9
  111. dstack/_internal/server/background/tasks/process_metrics.py +1 -1
  112. dstack/_internal/server/background/tasks/process_placement_groups.py +5 -1
  113. dstack/_internal/server/background/tasks/process_prometheus_metrics.py +135 -0
  114. dstack/_internal/server/background/tasks/process_running_jobs.py +80 -24
  115. dstack/_internal/server/background/tasks/process_runs.py +1 -0
  116. dstack/_internal/server/background/tasks/process_submitted_jobs.py +20 -38
  117. dstack/_internal/server/background/tasks/process_volumes.py +5 -2
  118. dstack/_internal/server/migrations/versions/60e444118b6d_add_jobprometheusmetrics.py +40 -0
  119. dstack/_internal/server/migrations/versions/7bc2586e8b9e_make_instancemodel_pool_id_optional.py +36 -0
  120. dstack/_internal/server/migrations/versions/98d1b92988bc_add_jobterminationreason_terminated_due_.py +140 -0
  121. dstack/_internal/server/migrations/versions/bc8ca4a505c6_store_backendtype_as_string.py +171 -0
  122. dstack/_internal/server/models.py +59 -9
  123. dstack/_internal/server/routers/backends.py +14 -23
  124. dstack/_internal/server/routers/instances.py +3 -4
  125. dstack/_internal/server/routers/metrics.py +31 -10
  126. dstack/_internal/server/routers/prometheus.py +36 -0
  127. dstack/_internal/server/routers/repos.py +1 -2
  128. dstack/_internal/server/routers/runs.py +13 -59
  129. dstack/_internal/server/schemas/gateways.py +14 -23
  130. dstack/_internal/server/schemas/projects.py +7 -2
  131. dstack/_internal/server/schemas/repos.py +2 -38
  132. dstack/_internal/server/schemas/runner.py +1 -0
  133. dstack/_internal/server/schemas/runs.py +1 -24
  134. dstack/_internal/server/security/permissions.py +1 -1
  135. dstack/_internal/server/services/backends/__init__.py +85 -158
  136. dstack/_internal/server/services/config.py +53 -567
  137. dstack/_internal/server/services/fleets.py +9 -103
  138. dstack/_internal/server/services/gateways/__init__.py +13 -4
  139. dstack/_internal/server/services/{pools.py → instances.py} +22 -329
  140. dstack/_internal/server/services/jobs/__init__.py +9 -6
  141. dstack/_internal/server/services/jobs/configurators/base.py +25 -1
  142. dstack/_internal/server/services/jobs/configurators/dev.py +9 -1
  143. dstack/_internal/server/services/jobs/configurators/extensions/cursor.py +42 -0
  144. dstack/_internal/server/services/metrics.py +131 -72
  145. dstack/_internal/server/services/offers.py +1 -1
  146. dstack/_internal/server/services/projects.py +23 -14
  147. dstack/_internal/server/services/prometheus.py +245 -0
  148. dstack/_internal/server/services/runner/client.py +14 -3
  149. dstack/_internal/server/services/runs.py +67 -31
  150. dstack/_internal/server/services/volumes.py +9 -4
  151. dstack/_internal/server/settings.py +3 -0
  152. dstack/_internal/server/statics/index.html +1 -1
  153. dstack/_internal/server/statics/{main-fe8fd9db55df8d10e648.js → main-4a0fe83e84574654e397.js} +76 -19
  154. dstack/_internal/server/statics/{main-fe8fd9db55df8d10e648.js.map → main-4a0fe83e84574654e397.js.map} +1 -1
  155. dstack/_internal/server/statics/{main-7510e71dfa9749a4e70e.css → main-da9f8c06a69c20dac23e.css} +1 -1
  156. dstack/_internal/server/statics/static/media/entraID.d65d1f3e9486a8e56d24fc07b3230885.svg +9 -0
  157. dstack/_internal/server/testing/common.py +75 -32
  158. dstack/_internal/utils/json_schema.py +6 -0
  159. dstack/_internal/utils/ssh.py +2 -1
  160. dstack/api/__init__.py +4 -0
  161. dstack/api/_public/__init__.py +16 -20
  162. dstack/api/_public/backends.py +1 -1
  163. dstack/api/_public/repos.py +36 -36
  164. dstack/api/_public/runs.py +170 -83
  165. dstack/api/server/__init__.py +11 -13
  166. dstack/api/server/_backends.py +12 -16
  167. dstack/api/server/_fleets.py +15 -55
  168. dstack/api/server/_gateways.py +3 -14
  169. dstack/api/server/_repos.py +1 -4
  170. dstack/api/server/_runs.py +21 -96
  171. dstack/api/server/_volumes.py +10 -5
  172. dstack/api/utils.py +3 -0
  173. dstack/version.py +1 -1
  174. {dstack-0.18.43.dist-info → dstack-0.19.0.dist-info}/METADATA +10 -1
  175. {dstack-0.18.43.dist-info → dstack-0.19.0.dist-info}/RECORD +229 -206
  176. tests/_internal/cli/services/configurators/test_profile.py +6 -6
  177. tests/_internal/core/backends/aws/test_configurator.py +35 -0
  178. tests/_internal/core/backends/aws/test_resources.py +1 -1
  179. tests/_internal/core/backends/azure/test_configurator.py +61 -0
  180. tests/_internal/core/backends/cudo/__init__.py +0 -0
  181. tests/_internal/core/backends/cudo/test_configurator.py +37 -0
  182. tests/_internal/core/backends/datacrunch/__init__.py +0 -0
  183. tests/_internal/core/backends/datacrunch/test_configurator.py +17 -0
  184. tests/_internal/core/backends/gcp/test_configurator.py +42 -0
  185. tests/_internal/core/backends/kubernetes/test_configurator.py +43 -0
  186. tests/_internal/core/backends/lambdalabs/__init__.py +0 -0
  187. tests/_internal/core/backends/lambdalabs/test_configurator.py +38 -0
  188. tests/_internal/core/backends/oci/test_configurator.py +55 -0
  189. tests/_internal/core/backends/runpod/__init__.py +0 -0
  190. tests/_internal/core/backends/runpod/test_configurator.py +33 -0
  191. tests/_internal/core/backends/tensordock/__init__.py +0 -0
  192. tests/_internal/core/backends/tensordock/test_configurator.py +38 -0
  193. tests/_internal/core/backends/vastai/__init__.py +0 -0
  194. tests/_internal/core/backends/vastai/test_configurator.py +33 -0
  195. tests/_internal/core/backends/vultr/__init__.py +0 -0
  196. tests/_internal/core/backends/vultr/test_configurator.py +33 -0
  197. tests/_internal/server/background/tasks/test_process_gateways.py +4 -0
  198. tests/_internal/server/background/tasks/test_process_instances.py +49 -48
  199. tests/_internal/server/background/tasks/test_process_metrics.py +0 -3
  200. tests/_internal/server/background/tasks/test_process_placement_groups.py +2 -0
  201. tests/_internal/server/background/tasks/test_process_prometheus_metrics.py +186 -0
  202. tests/_internal/server/background/tasks/test_process_running_jobs.py +123 -19
  203. tests/_internal/server/background/tasks/test_process_runs.py +8 -22
  204. tests/_internal/server/background/tasks/test_process_submitted_jobs.py +3 -40
  205. tests/_internal/server/background/tasks/test_process_submitted_volumes.py +2 -0
  206. tests/_internal/server/background/tasks/test_process_terminating_jobs.py +10 -15
  207. tests/_internal/server/routers/test_backends.py +6 -764
  208. tests/_internal/server/routers/test_fleets.py +2 -26
  209. tests/_internal/server/routers/test_gateways.py +27 -3
  210. tests/_internal/server/routers/test_instances.py +0 -10
  211. tests/_internal/server/routers/test_metrics.py +42 -0
  212. tests/_internal/server/routers/test_projects.py +56 -0
  213. tests/_internal/server/routers/test_prometheus.py +333 -0
  214. tests/_internal/server/routers/test_repos.py +0 -15
  215. tests/_internal/server/routers/test_runs.py +83 -275
  216. tests/_internal/server/routers/test_volumes.py +2 -3
  217. tests/_internal/server/services/backends/__init__.py +0 -0
  218. tests/_internal/server/services/jobs/configurators/test_task.py +35 -0
  219. tests/_internal/server/services/test_config.py +7 -4
  220. tests/_internal/server/services/test_fleets.py +1 -4
  221. tests/_internal/server/services/{test_pools.py → test_instances.py} +11 -49
  222. tests/_internal/server/services/test_metrics.py +167 -0
  223. tests/_internal/server/services/test_repos.py +1 -14
  224. tests/_internal/server/services/test_runs.py +0 -4
  225. dstack/_internal/cli/commands/pool.py +0 -581
  226. dstack/_internal/cli/commands/run.py +0 -75
  227. dstack/_internal/core/backends/aws/config.py +0 -18
  228. dstack/_internal/core/backends/azure/config.py +0 -12
  229. dstack/_internal/core/backends/base/config.py +0 -5
  230. dstack/_internal/core/backends/cudo/config.py +0 -9
  231. dstack/_internal/core/backends/datacrunch/config.py +0 -9
  232. dstack/_internal/core/backends/gcp/config.py +0 -22
  233. dstack/_internal/core/backends/kubernetes/config.py +0 -6
  234. dstack/_internal/core/backends/lambdalabs/config.py +0 -9
  235. dstack/_internal/core/backends/nebius/__init__.py +0 -15
  236. dstack/_internal/core/backends/nebius/api_client.py +0 -319
  237. dstack/_internal/core/backends/nebius/compute.py +0 -220
  238. dstack/_internal/core/backends/nebius/config.py +0 -6
  239. dstack/_internal/core/backends/nebius/types.py +0 -37
  240. dstack/_internal/core/backends/oci/config.py +0 -6
  241. dstack/_internal/core/backends/runpod/config.py +0 -9
  242. dstack/_internal/core/backends/tensordock/config.py +0 -9
  243. dstack/_internal/core/backends/vastai/config.py +0 -6
  244. dstack/_internal/core/backends/vultr/config.py +0 -9
  245. dstack/_internal/core/models/backends/aws.py +0 -86
  246. dstack/_internal/core/models/backends/azure.py +0 -68
  247. dstack/_internal/core/models/backends/cudo.py +0 -43
  248. dstack/_internal/core/models/backends/datacrunch.py +0 -44
  249. dstack/_internal/core/models/backends/gcp.py +0 -67
  250. dstack/_internal/core/models/backends/kubernetes.py +0 -40
  251. dstack/_internal/core/models/backends/lambdalabs.py +0 -43
  252. dstack/_internal/core/models/backends/nebius.py +0 -54
  253. dstack/_internal/core/models/backends/runpod.py +0 -40
  254. dstack/_internal/core/models/backends/tensordock.py +0 -44
  255. dstack/_internal/core/models/backends/vastai.py +0 -43
  256. dstack/_internal/core/models/backends/vultr.py +0 -40
  257. dstack/_internal/core/models/pools.py +0 -43
  258. dstack/_internal/server/routers/pools.py +0 -142
  259. dstack/_internal/server/schemas/pools.py +0 -38
  260. dstack/_internal/server/services/backends/configurators/base.py +0 -72
  261. dstack/_internal/server/services/backends/configurators/cudo.py +0 -87
  262. dstack/_internal/server/services/backends/configurators/datacrunch.py +0 -79
  263. dstack/_internal/server/services/backends/configurators/kubernetes.py +0 -63
  264. dstack/_internal/server/services/backends/configurators/lambdalabs.py +0 -98
  265. dstack/_internal/server/services/backends/configurators/nebius.py +0 -85
  266. dstack/_internal/server/services/backends/configurators/runpod.py +0 -97
  267. dstack/_internal/server/services/backends/configurators/tensordock.py +0 -82
  268. dstack/_internal/server/services/backends/configurators/vastai.py +0 -80
  269. dstack/_internal/server/services/backends/configurators/vultr.py +0 -80
  270. dstack/api/_public/pools.py +0 -41
  271. dstack/api/_public/resources.py +0 -105
  272. dstack/api/server/_pools.py +0 -63
  273. tests/_internal/server/routers/test_pools.py +0 -612
  274. /dstack/_internal/{server/services/backends/configurators → core/backends/dstack}/__init__.py +0 -0
  275. {dstack-0.18.43.dist-info → dstack-0.19.0.dist-info}/LICENSE.md +0 -0
  276. {dstack-0.18.43.dist-info → dstack-0.19.0.dist-info}/WHEEL +0 -0
  277. {dstack-0.18.43.dist-info → dstack-0.19.0.dist-info}/entry_points.txt +0 -0
  278. {dstack-0.18.43.dist-info → dstack-0.19.0.dist-info}/top_level.txt +0 -0
@@ -4,6 +4,7 @@ from typing import Optional
4
4
  from unittest.mock import MagicMock, Mock, patch
5
5
 
6
6
  import pytest
7
+ from freezegun import freeze_time
7
8
  from sqlalchemy.ext.asyncio import AsyncSession
8
9
 
9
10
  from dstack._internal.core.errors import SSHError
@@ -11,6 +12,7 @@ from dstack._internal.core.models.backends.base import BackendType
11
12
  from dstack._internal.core.models.common import NetworkMode
12
13
  from dstack._internal.core.models.configurations import DevEnvironmentConfiguration
13
14
  from dstack._internal.core.models.instances import InstanceStatus
15
+ from dstack._internal.core.models.profiles import UtilizationPolicy
14
16
  from dstack._internal.core.models.runs import (
15
17
  JobRuntimeData,
16
18
  JobStatus,
@@ -39,7 +41,7 @@ from dstack._internal.server.services.volumes import (
39
41
  from dstack._internal.server.testing.common import (
40
42
  create_instance,
41
43
  create_job,
42
- create_pool,
44
+ create_job_metrics_point,
43
45
  create_project,
44
46
  create_repo,
45
47
  create_run,
@@ -103,11 +105,9 @@ class TestProcessRunningJobs:
103
105
  repo=repo,
104
106
  user=user,
105
107
  )
106
- pool = await create_pool(session=session, project=project)
107
108
  instance = await create_instance(
108
109
  session=session,
109
110
  project=project,
110
- pool=pool,
111
111
  status=InstanceStatus.BUSY,
112
112
  )
113
113
  job_provisioning_data = get_job_provisioning_data(dockerized=False)
@@ -153,11 +153,9 @@ class TestProcessRunningJobs:
153
153
  repo=repo,
154
154
  user=user,
155
155
  )
156
- pool = await create_pool(session=session, project=project)
157
156
  instance = await create_instance(
158
157
  session=session,
159
158
  project=project,
160
- pool=pool,
161
159
  status=InstanceStatus.BUSY,
162
160
  )
163
161
  job_provisioning_data = get_job_provisioning_data(dockerized=False)
@@ -204,11 +202,9 @@ class TestProcessRunningJobs:
204
202
  repo=repo,
205
203
  user=user,
206
204
  )
207
- pool = await create_pool(session=session, project=project)
208
205
  instance = await create_instance(
209
206
  session=session,
210
207
  project=project,
211
- pool=pool,
212
208
  status=InstanceStatus.BUSY,
213
209
  )
214
210
  job_provisioning_data = get_job_provisioning_data(dockerized=False)
@@ -304,11 +300,9 @@ class TestProcessRunningJobs:
304
300
  run_name="test-run",
305
301
  run_spec=run_spec,
306
302
  )
307
- pool = await create_pool(session=session, project=project)
308
303
  instance = await create_instance(
309
304
  session=session,
310
305
  project=project,
311
- pool=pool,
312
306
  status=InstanceStatus.BUSY,
313
307
  )
314
308
  job_provisioning_data = get_job_provisioning_data(dockerized=True)
@@ -374,11 +368,9 @@ class TestProcessRunningJobs:
374
368
  repo=repo,
375
369
  user=user,
376
370
  )
377
- pool = await create_pool(session=session, project=project)
378
371
  instance = await create_instance(
379
372
  session=session,
380
373
  project=project,
381
- pool=pool,
382
374
  status=InstanceStatus.BUSY,
383
375
  )
384
376
  job = await create_job(
@@ -431,11 +423,9 @@ class TestProcessRunningJobs:
431
423
  repo=repo,
432
424
  user=user,
433
425
  )
434
- pool = await create_pool(session=session, project=project)
435
426
  instance = await create_instance(
436
427
  session=session,
437
428
  project=project,
438
- pool=pool,
439
429
  status=InstanceStatus.BUSY,
440
430
  )
441
431
  job_provisioning_data = get_job_provisioning_data(dockerized=True)
@@ -476,11 +466,9 @@ class TestProcessRunningJobs:
476
466
  repo=repo,
477
467
  user=user,
478
468
  )
479
- pool = await create_pool(session, project)
480
469
  instance = await create_instance(
481
470
  session=session,
482
471
  project=project,
483
- pool=pool,
484
472
  status=InstanceStatus.IDLE,
485
473
  )
486
474
  job_provisioning_data = get_job_provisioning_data(dockerized=True)
@@ -525,11 +513,9 @@ class TestProcessRunningJobs:
525
513
  run_name="test-run",
526
514
  run_spec=run_spec,
527
515
  )
528
- pool = await create_pool(session=session, project=project)
529
516
  instance = await create_instance(
530
517
  session=session,
531
518
  project=project,
532
- pool=pool,
533
519
  status=InstanceStatus.BUSY,
534
520
  )
535
521
  job = await create_job(
@@ -652,11 +638,9 @@ class TestProcessRunningJobs:
652
638
  ),
653
639
  ),
654
640
  )
655
- pool = await create_pool(session=session, project=project)
656
641
  instance = await create_instance(
657
642
  session=session,
658
643
  project=project,
659
- pool=pool,
660
644
  status=InstanceStatus.BUSY,
661
645
  )
662
646
  job = await create_job(
@@ -688,3 +672,123 @@ class TestProcessRunningJobs:
688
672
  assert job.status == expected_status
689
673
  assert job.termination_reason == expected_termination_reason
690
674
  assert job.inactivity_secs == expected_inactivity_secs
675
+
676
+ @pytest.mark.asyncio
677
+ @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True)
678
+ @pytest.mark.parametrize(
679
+ ["samples", "expected_status"],
680
+ [
681
+ pytest.param(
682
+ [
683
+ (datetime(2023, 1, 1, 12, 25, 20, tzinfo=timezone.utc), 30),
684
+ (datetime(2023, 1, 1, 12, 25, 30, tzinfo=timezone.utc), 30),
685
+ (datetime(2023, 1, 1, 12, 29, 50, tzinfo=timezone.utc), 40),
686
+ ],
687
+ JobStatus.RUNNING,
688
+ id="not-enough-points",
689
+ ),
690
+ pytest.param(
691
+ [
692
+ (datetime(2023, 1, 1, 12, 20, 10, tzinfo=timezone.utc), 30),
693
+ (datetime(2023, 1, 1, 12, 20, 20, tzinfo=timezone.utc), 30),
694
+ (datetime(2023, 1, 1, 12, 29, 50, tzinfo=timezone.utc), 80),
695
+ ],
696
+ JobStatus.RUNNING,
697
+ id="any-above-min",
698
+ ),
699
+ pytest.param(
700
+ [
701
+ (datetime(2023, 1, 1, 12, 10, 10, tzinfo=timezone.utc), 80), # outside window
702
+ (datetime(2023, 1, 1, 12, 10, 20, tzinfo=timezone.utc), 80), # outside window
703
+ (datetime(2023, 1, 1, 12, 20, 10, tzinfo=timezone.utc), 30),
704
+ (datetime(2023, 1, 1, 12, 20, 20, tzinfo=timezone.utc), 30),
705
+ (datetime(2023, 1, 1, 12, 29, 50, tzinfo=timezone.utc), 40),
706
+ ],
707
+ JobStatus.TERMINATING,
708
+ id="all-below-min",
709
+ ),
710
+ ],
711
+ )
712
+ @freeze_time(datetime(2023, 1, 1, 12, 30, tzinfo=timezone.utc))
713
+ async def test_gpu_utilization(
714
+ self,
715
+ test_db,
716
+ session: AsyncSession,
717
+ samples: list[tuple[datetime, int]],
718
+ expected_status: JobStatus,
719
+ ) -> None:
720
+ project = await create_project(session=session)
721
+ user = await create_user(session=session)
722
+ repo = await create_repo(
723
+ session=session,
724
+ project_id=project.id,
725
+ )
726
+ run = await create_run(
727
+ session=session,
728
+ project=project,
729
+ repo=repo,
730
+ user=user,
731
+ status=RunStatus.RUNNING,
732
+ run_name="test-run",
733
+ run_spec=get_run_spec(
734
+ run_name="test-run",
735
+ repo_id=repo.name,
736
+ configuration=DevEnvironmentConfiguration(
737
+ name="test-run",
738
+ ide="vscode",
739
+ utilization_policy=UtilizationPolicy(
740
+ min_gpu_utilization=80,
741
+ time_window=600,
742
+ ),
743
+ ),
744
+ ),
745
+ )
746
+ instance = await create_instance(
747
+ session=session,
748
+ project=project,
749
+ status=InstanceStatus.BUSY,
750
+ )
751
+ job = await create_job(
752
+ session=session,
753
+ run=run,
754
+ status=JobStatus.RUNNING,
755
+ job_provisioning_data=get_job_provisioning_data(),
756
+ instance=instance,
757
+ instance_assigned=True,
758
+ )
759
+ for timestamp, gpu_util in samples:
760
+ # two GPUs, the second one always 100% utilized
761
+ await create_job_metrics_point(
762
+ session=session,
763
+ job_model=job,
764
+ timestamp=timestamp,
765
+ gpus_memory_usage_bytes=[1024, 1024],
766
+ gpus_util_percent=[gpu_util, 100],
767
+ )
768
+ with (
769
+ patch("dstack._internal.server.services.runner.ssh.SSHTunnel") as SSHTunnelMock,
770
+ patch(
771
+ "dstack._internal.server.services.runner.client.RunnerClient"
772
+ ) as RunnerClientMock,
773
+ ):
774
+ runner_client_mock = RunnerClientMock.return_value
775
+ runner_client_mock.pull.return_value = PullResponse(
776
+ job_states=[],
777
+ job_logs=[],
778
+ runner_logs=[],
779
+ last_updated=0,
780
+ no_connections_secs=0,
781
+ )
782
+ await process_running_jobs()
783
+ SSHTunnelMock.assert_called_once()
784
+ runner_client_mock.pull.assert_called_once()
785
+ await session.refresh(job)
786
+ assert job.status == expected_status
787
+ if expected_status == JobStatus.TERMINATING:
788
+ assert job.termination_reason == JobTerminationReason.TERMINATED_BY_SERVER
789
+ assert job.termination_reason_message == (
790
+ "The job GPU utilization below 80% for 600 seconds"
791
+ )
792
+ else:
793
+ assert job.termination_reason is None
794
+ assert job.termination_reason_message is None
@@ -21,7 +21,6 @@ from dstack._internal.server.models import RunModel
21
21
  from dstack._internal.server.testing.common import (
22
22
  create_instance,
23
23
  create_job,
24
- create_pool,
25
24
  create_project,
26
25
  create_repo,
27
26
  create_run,
@@ -42,9 +41,6 @@ async def make_run(
42
41
  session=session,
43
42
  project_id=project.id,
44
43
  )
45
- project.default_pool = await create_pool(
46
- session=session, project=project, pool_name="default-pool"
47
- )
48
44
  run_name = "test-run"
49
45
  profile = Profile(
50
46
  name="test-profile",
@@ -60,7 +56,7 @@ async def make_run(
60
56
  replicas=parse_obj_as(Range[int], replicas),
61
57
  ),
62
58
  )
63
- return await create_run(
59
+ run = await create_run(
64
60
  session=session,
65
61
  project=project,
66
62
  repo=repo,
@@ -69,6 +65,8 @@ async def make_run(
69
65
  run_spec=run_spec,
70
66
  status=status,
71
67
  )
68
+ run.project = project
69
+ return run
72
70
 
73
71
 
74
72
  class TestProcessRuns:
@@ -117,11 +115,9 @@ class TestProcessRuns:
117
115
  async def test_terminate_run_jobs(self, test_db, session: AsyncSession):
118
116
  run = await make_run(session, status=RunStatus.TERMINATING)
119
117
  run.termination_reason = RunTerminationReason.JOB_FAILED
120
- pool = await create_pool(session=session, project=run.project)
121
118
  instance = await create_instance(
122
119
  session=session,
123
120
  project=run.project,
124
- pool=pool,
125
121
  status=InstanceStatus.BUSY,
126
122
  )
127
123
  job = await create_job(
@@ -146,9 +142,7 @@ class TestProcessRuns:
146
142
  @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True)
147
143
  async def test_retry_running_to_pending(self, test_db, session: AsyncSession):
148
144
  run = await make_run(session, status=RunStatus.RUNNING)
149
- instance = await create_instance(
150
- session, project=run.project, pool=run.project.default_pool, spot=True
151
- )
145
+ instance = await create_instance(session, project=run.project, spot=True)
152
146
  await create_job(
153
147
  session=session,
154
148
  run=run,
@@ -169,9 +163,7 @@ class TestProcessRuns:
169
163
  @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True)
170
164
  async def test_retry_running_to_failed(self, test_db, session: AsyncSession):
171
165
  run = await make_run(session, status=RunStatus.RUNNING)
172
- instance = await create_instance(
173
- session, project=run.project, pool=run.project.default_pool, spot=True
174
- )
166
+ instance = await create_instance(session, project=run.project, spot=True)
175
167
  # job exited with non-zero code
176
168
  await create_job(
177
169
  session=session,
@@ -237,9 +229,7 @@ class TestProcessRunsReplicas:
237
229
  submitted_at=run.submitted_at,
238
230
  last_processed_at=run.submitted_at,
239
231
  replica_num=0,
240
- instance=await create_instance(
241
- session, project=run.project, pool=run.project.default_pool, spot=True
242
- ),
232
+ instance=await create_instance(session, project=run.project, spot=True),
243
233
  job_provisioning_data=get_job_provisioning_data(),
244
234
  )
245
235
  await create_job(
@@ -250,9 +240,7 @@ class TestProcessRunsReplicas:
250
240
  submitted_at=run.submitted_at,
251
241
  last_processed_at=run.submitted_at,
252
242
  replica_num=1,
253
- instance=await create_instance(
254
- session, project=run.project, pool=run.project.default_pool, spot=True
255
- ),
243
+ instance=await create_instance(session, project=run.project, spot=True),
256
244
  job_provisioning_data=get_job_provisioning_data(),
257
245
  )
258
246
  with patch("dstack._internal.utils.common.get_current_datetime") as datetime_mock:
@@ -273,9 +261,7 @@ class TestProcessRunsReplicas:
273
261
  submitted_at=run.submitted_at,
274
262
  last_processed_at=run.last_processed_at,
275
263
  replica_num=0,
276
- instance=await create_instance(
277
- session, project=run.project, pool=run.project.default_pool, spot=True
278
- ),
264
+ instance=await create_instance(session, project=run.project, spot=True),
279
265
  job_provisioning_data=get_job_provisioning_data(),
280
266
  )
281
267
  await create_job(
@@ -14,7 +14,7 @@ from dstack._internal.core.models.instances import (
14
14
  InstanceType,
15
15
  Resources,
16
16
  )
17
- from dstack._internal.core.models.profiles import Profile, ProfileRetryPolicy
17
+ from dstack._internal.core.models.profiles import Profile
18
18
  from dstack._internal.core.models.runs import (
19
19
  JobProvisioningData,
20
20
  JobStatus,
@@ -29,10 +29,10 @@ from dstack._internal.core.models.volumes import (
29
29
  from dstack._internal.server.background.tasks.process_submitted_jobs import process_submitted_jobs
30
30
  from dstack._internal.server.models import InstanceModel, JobModel, VolumeAttachmentModel
31
31
  from dstack._internal.server.testing.common import (
32
+ ComputeMockSpec,
32
33
  create_fleet,
33
34
  create_instance,
34
35
  create_job,
35
- create_pool,
36
36
  create_project,
37
37
  create_repo,
38
38
  create_run,
@@ -52,7 +52,6 @@ class TestProcessSubmittedJobs:
52
52
  async def test_fails_job_when_no_backends(self, test_db, session: AsyncSession):
53
53
  project = await create_project(session=session)
54
54
  user = await create_user(session=session)
55
- await create_pool(session=session, project=project)
56
55
  repo = await create_repo(
57
56
  session=session,
58
57
  project_id=project.id,
@@ -93,7 +92,6 @@ class TestProcessSubmittedJobs:
93
92
  ):
94
93
  project = await create_project(session=session)
95
94
  user = await create_user(session=session)
96
- pool = await create_pool(session=session, project=project)
97
95
  repo = await create_repo(
98
96
  session=session,
99
97
  project_id=project.id,
@@ -151,12 +149,6 @@ class TestProcessSubmittedJobs:
151
149
  assert job is not None
152
150
  assert job.status == JobStatus.PROVISIONING
153
151
 
154
- await session.refresh(pool)
155
- instance_offer = InstanceOfferWithAvailability.parse_raw(pool.instances[0].offer)
156
- assert offer == instance_offer
157
- pool_job_provisioning_data = pool.instances[0].job_provisioning_data
158
- assert pool_job_provisioning_data == job.job_provisioning_data
159
-
160
152
  @pytest.mark.asyncio
161
153
  @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True)
162
154
  async def test_fails_job_when_privileged_true_and_no_offers_with_create_instance_support(
@@ -166,7 +158,6 @@ class TestProcessSubmittedJobs:
166
158
  ):
167
159
  project = await create_project(session=session)
168
160
  user = await create_user(session=session)
169
- pool = await create_pool(session=session, project=project)
170
161
  repo = await create_repo(
171
162
  session=session,
172
163
  project_id=project.id,
@@ -227,9 +218,6 @@ class TestProcessSubmittedJobs:
227
218
  assert job.status == JobStatus.TERMINATING
228
219
  assert job.termination_reason == JobTerminationReason.FAILED_TO_START_DUE_TO_NO_CAPACITY
229
220
 
230
- await session.refresh(pool)
231
- assert not pool.instances
232
-
233
221
  @pytest.mark.asyncio
234
222
  @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True)
235
223
  async def test_fails_job_when_instance_mounts_and_no_offers_with_create_instance_support(
@@ -239,7 +227,6 @@ class TestProcessSubmittedJobs:
239
227
  ):
240
228
  project = await create_project(session=session)
241
229
  user = await create_user(session=session)
242
- pool = await create_pool(session=session, project=project)
243
230
  repo = await create_repo(
244
231
  session=session,
245
232
  project_id=project.id,
@@ -300,9 +287,6 @@ class TestProcessSubmittedJobs:
300
287
  assert job.status == JobStatus.TERMINATING
301
288
  assert job.termination_reason == JobTerminationReason.FAILED_TO_START_DUE_TO_NO_CAPACITY
302
289
 
303
- await session.refresh(pool)
304
- assert not pool.instances
305
-
306
290
  @pytest.mark.asyncio
307
291
  @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True)
308
292
  async def test_provisions_job_with_optional_instance_volume_not_attached(
@@ -312,7 +296,6 @@ class TestProcessSubmittedJobs:
312
296
  ):
313
297
  project = await create_project(session=session)
314
298
  user = await create_user(session=session)
315
- pool = await create_pool(session=session, project=project)
316
299
  repo = await create_repo(
317
300
  session=session,
318
301
  project_id=project.id,
@@ -369,18 +352,11 @@ class TestProcessSubmittedJobs:
369
352
  assert job is not None
370
353
  assert job.status == JobStatus.PROVISIONING
371
354
 
372
- await session.refresh(pool)
373
- instance_offer = InstanceOfferWithAvailability.parse_raw(pool.instances[0].offer)
374
- assert offer == instance_offer
375
- pool_job_provisioning_data = pool.instances[0].job_provisioning_data
376
- assert pool_job_provisioning_data == job.job_provisioning_data
377
-
378
355
  @pytest.mark.asyncio
379
356
  @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True)
380
357
  async def test_fails_job_when_no_capacity(self, test_db, session: AsyncSession):
381
358
  project = await create_project(session=session)
382
359
  user = await create_user(session=session)
383
- pool = await create_pool(session=session, project=project)
384
360
  repo = await create_repo(
385
361
  session=session,
386
362
  project_id=project.id,
@@ -396,7 +372,6 @@ class TestProcessSubmittedJobs:
396
372
  repo_id=repo.name,
397
373
  profile=Profile(
398
374
  name="default",
399
- retry_policy=ProfileRetryPolicy(retry=True, duration=3600),
400
375
  ),
401
376
  ),
402
377
  )
@@ -414,15 +389,12 @@ class TestProcessSubmittedJobs:
414
389
  assert job is not None
415
390
  assert job.status == JobStatus.TERMINATING
416
391
  assert job.termination_reason == JobTerminationReason.FAILED_TO_START_DUE_TO_NO_CAPACITY
417
- await session.refresh(pool)
418
- assert not pool.instances
419
392
 
420
393
  @pytest.mark.asyncio
421
394
  @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True)
422
395
  async def test_assignes_job_to_instance(self, test_db, session: AsyncSession):
423
396
  project = await create_project(session)
424
397
  user = await create_user(session)
425
- pool = await create_pool(session=session, project=project)
426
398
  repo = await create_repo(
427
399
  session=session,
428
400
  project_id=project.id,
@@ -430,10 +402,8 @@ class TestProcessSubmittedJobs:
430
402
  instance = await create_instance(
431
403
  session=session,
432
404
  project=project,
433
- pool=pool,
434
405
  status=InstanceStatus.IDLE,
435
406
  )
436
- await session.refresh(pool)
437
407
  run = await create_run(
438
408
  session=session,
439
409
  project=project,
@@ -459,7 +429,6 @@ class TestProcessSubmittedJobs:
459
429
  async def test_assigns_job_to_instance_with_volumes(self, test_db, session: AsyncSession):
460
430
  project = await create_project(session)
461
431
  user = await create_user(session)
462
- pool = await create_pool(session=session, project=project)
463
432
  repo = await create_repo(
464
433
  session=session,
465
434
  project_id=project.id,
@@ -476,12 +445,10 @@ class TestProcessSubmittedJobs:
476
445
  instance = await create_instance(
477
446
  session=session,
478
447
  project=project,
479
- pool=pool,
480
448
  status=InstanceStatus.IDLE,
481
449
  backend=BackendType.AWS,
482
450
  region="us-east-1",
483
451
  )
484
- await session.refresh(pool)
485
452
  run_spec = get_run_spec(run_name="test-run", repo_id=repo.name)
486
453
  run_spec.configuration.volumes = [
487
454
  VolumeMountPoint(name=volume.name, path="/volume"),
@@ -506,6 +473,7 @@ class TestProcessSubmittedJobs:
506
473
  backend_mock = Mock()
507
474
  m.return_value = backend_mock
508
475
  backend_mock.TYPE = BackendType.AWS
476
+ backend_mock.compute.return_value = Mock(spec=ComputeMockSpec)
509
477
  backend_mock.compute.return_value.attach_volume.return_value = VolumeAttachmentData()
510
478
  # Submitted jobs processing happens in two steps
511
479
  await process_submitted_jobs()
@@ -532,7 +500,6 @@ class TestProcessSubmittedJobs:
532
500
  async def test_assigns_job_to_shared_instance(self, test_db, session: AsyncSession):
533
501
  project = await create_project(session)
534
502
  user = await create_user(session)
535
- pool = await create_pool(session=session, project=project)
536
503
  repo = await create_repo(
537
504
  session=session,
538
505
  project_id=project.id,
@@ -541,13 +508,11 @@ class TestProcessSubmittedJobs:
541
508
  instance = await create_instance(
542
509
  session=session,
543
510
  project=project,
544
- pool=pool,
545
511
  status=InstanceStatus.IDLE,
546
512
  offer=offer,
547
513
  total_blocks=4,
548
514
  busy_blocks=1,
549
515
  )
550
- await session.refresh(pool)
551
516
  run = await create_run(
552
517
  session=session,
553
518
  project=project,
@@ -577,12 +542,10 @@ class TestProcessSubmittedJobs:
577
542
  project = await create_project(session)
578
543
  user = await create_user(session)
579
544
  repo = await create_repo(session=session, project_id=project.id)
580
- pool = await create_pool(session=session, project=project)
581
545
  fleet = await create_fleet(session=session, project=project)
582
546
  instance = await create_instance(
583
547
  session=session,
584
548
  project=project,
585
- pool=pool,
586
549
  instance_num=0,
587
550
  status=InstanceStatus.BUSY,
588
551
  )
@@ -7,6 +7,7 @@ from dstack._internal.core.models.backends.base import BackendType
7
7
  from dstack._internal.core.models.volumes import VolumeProvisioningData, VolumeStatus
8
8
  from dstack._internal.server.background.tasks.process_volumes import process_submitted_volumes
9
9
  from dstack._internal.server.testing.common import (
10
+ ComputeMockSpec,
10
11
  create_project,
11
12
  create_user,
12
13
  create_volume,
@@ -40,6 +41,7 @@ class TestProcessSubmittedVolumes:
40
41
  ) as m:
41
42
  aws_mock = Mock()
42
43
  m.return_value = aws_mock
44
+ aws_mock.compute.return_value = Mock(spec=ComputeMockSpec)
43
45
  aws_mock.compute.return_value.create_volume.return_value = VolumeProvisioningData(
44
46
  backend=BackendType.AWS,
45
47
  volume_id="1234",