dstack 0.18.43__py3-none-any.whl → 0.19.0rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (278) hide show
  1. dstack/_internal/cli/commands/gateway.py +15 -3
  2. dstack/_internal/cli/commands/logs.py +0 -22
  3. dstack/_internal/cli/commands/stats.py +8 -17
  4. dstack/_internal/cli/main.py +1 -5
  5. dstack/_internal/cli/services/configurators/fleet.py +4 -39
  6. dstack/_internal/cli/services/configurators/run.py +22 -20
  7. dstack/_internal/cli/services/profile.py +34 -83
  8. dstack/_internal/cli/utils/gateway.py +1 -1
  9. dstack/_internal/cli/utils/run.py +11 -0
  10. dstack/_internal/core/backends/__init__.py +56 -39
  11. dstack/_internal/core/backends/aws/__init__.py +0 -25
  12. dstack/_internal/core/backends/aws/auth.py +1 -10
  13. dstack/_internal/core/backends/aws/backend.py +26 -0
  14. dstack/_internal/core/backends/aws/compute.py +21 -45
  15. dstack/_internal/{server/services/backends/configurators/aws.py → core/backends/aws/configurator.py} +46 -85
  16. dstack/_internal/core/backends/aws/models.py +135 -0
  17. dstack/_internal/core/backends/aws/resources.py +1 -1
  18. dstack/_internal/core/backends/azure/__init__.py +0 -20
  19. dstack/_internal/core/backends/azure/auth.py +2 -11
  20. dstack/_internal/core/backends/azure/backend.py +21 -0
  21. dstack/_internal/core/backends/azure/compute.py +14 -28
  22. dstack/_internal/{server/services/backends/configurators/azure.py → core/backends/azure/configurator.py} +141 -210
  23. dstack/_internal/core/backends/azure/models.py +89 -0
  24. dstack/_internal/core/backends/base/__init__.py +0 -12
  25. dstack/_internal/core/backends/base/backend.py +18 -0
  26. dstack/_internal/core/backends/base/compute.py +153 -33
  27. dstack/_internal/core/backends/base/configurator.py +105 -0
  28. dstack/_internal/core/backends/base/models.py +14 -0
  29. dstack/_internal/core/backends/configurators.py +138 -0
  30. dstack/_internal/core/backends/cudo/__init__.py +0 -15
  31. dstack/_internal/core/backends/cudo/backend.py +16 -0
  32. dstack/_internal/core/backends/cudo/compute.py +8 -26
  33. dstack/_internal/core/backends/cudo/configurator.py +72 -0
  34. dstack/_internal/core/backends/cudo/models.py +37 -0
  35. dstack/_internal/core/backends/datacrunch/__init__.py +0 -15
  36. dstack/_internal/core/backends/datacrunch/backend.py +16 -0
  37. dstack/_internal/core/backends/datacrunch/compute.py +8 -25
  38. dstack/_internal/core/backends/datacrunch/configurator.py +66 -0
  39. dstack/_internal/core/backends/datacrunch/models.py +38 -0
  40. dstack/_internal/core/{models/backends/dstack.py → backends/dstack/models.py} +7 -7
  41. dstack/_internal/core/backends/gcp/__init__.py +0 -16
  42. dstack/_internal/core/backends/gcp/auth.py +2 -11
  43. dstack/_internal/core/backends/gcp/backend.py +17 -0
  44. dstack/_internal/core/backends/gcp/compute.py +14 -44
  45. dstack/_internal/{server/services/backends/configurators/gcp.py → core/backends/gcp/configurator.py} +46 -103
  46. dstack/_internal/core/backends/gcp/models.py +125 -0
  47. dstack/_internal/core/backends/kubernetes/__init__.py +0 -15
  48. dstack/_internal/core/backends/kubernetes/backend.py +16 -0
  49. dstack/_internal/core/backends/kubernetes/compute.py +16 -5
  50. dstack/_internal/core/backends/kubernetes/configurator.py +55 -0
  51. dstack/_internal/core/backends/kubernetes/models.py +72 -0
  52. dstack/_internal/core/backends/lambdalabs/__init__.py +0 -16
  53. dstack/_internal/core/backends/lambdalabs/backend.py +17 -0
  54. dstack/_internal/core/backends/lambdalabs/compute.py +7 -28
  55. dstack/_internal/core/backends/lambdalabs/configurator.py +82 -0
  56. dstack/_internal/core/backends/lambdalabs/models.py +37 -0
  57. dstack/_internal/core/backends/local/__init__.py +0 -13
  58. dstack/_internal/core/backends/local/backend.py +14 -0
  59. dstack/_internal/core/backends/local/compute.py +16 -2
  60. dstack/_internal/core/backends/models.py +128 -0
  61. dstack/_internal/core/backends/oci/__init__.py +0 -15
  62. dstack/_internal/core/backends/oci/auth.py +1 -5
  63. dstack/_internal/core/backends/oci/backend.py +16 -0
  64. dstack/_internal/core/backends/oci/compute.py +9 -23
  65. dstack/_internal/{server/services/backends/configurators/oci.py → core/backends/oci/configurator.py} +40 -85
  66. dstack/_internal/core/{models/backends/oci.py → backends/oci/models.py} +24 -25
  67. dstack/_internal/core/backends/oci/region.py +1 -1
  68. dstack/_internal/core/backends/runpod/__init__.py +0 -15
  69. dstack/_internal/core/backends/runpod/backend.py +16 -0
  70. dstack/_internal/core/backends/runpod/compute.py +28 -6
  71. dstack/_internal/core/backends/runpod/configurator.py +59 -0
  72. dstack/_internal/core/backends/runpod/models.py +54 -0
  73. dstack/_internal/core/backends/template/__init__.py +0 -0
  74. dstack/_internal/core/backends/tensordock/__init__.py +0 -15
  75. dstack/_internal/core/backends/tensordock/backend.py +16 -0
  76. dstack/_internal/core/backends/tensordock/compute.py +8 -27
  77. dstack/_internal/core/backends/tensordock/configurator.py +68 -0
  78. dstack/_internal/core/backends/tensordock/models.py +38 -0
  79. dstack/_internal/core/backends/vastai/__init__.py +0 -15
  80. dstack/_internal/core/backends/vastai/backend.py +16 -0
  81. dstack/_internal/core/backends/vastai/compute.py +2 -2
  82. dstack/_internal/core/backends/vastai/configurator.py +66 -0
  83. dstack/_internal/core/backends/vastai/models.py +37 -0
  84. dstack/_internal/core/backends/vultr/__init__.py +0 -15
  85. dstack/_internal/core/backends/vultr/backend.py +16 -0
  86. dstack/_internal/core/backends/vultr/compute.py +10 -24
  87. dstack/_internal/core/backends/vultr/configurator.py +64 -0
  88. dstack/_internal/core/backends/vultr/models.py +34 -0
  89. dstack/_internal/core/models/backends/__init__.py +0 -184
  90. dstack/_internal/core/models/backends/base.py +0 -19
  91. dstack/_internal/core/models/configurations.py +22 -16
  92. dstack/_internal/core/models/envs.py +4 -3
  93. dstack/_internal/core/models/fleets.py +17 -22
  94. dstack/_internal/core/models/gateways.py +3 -3
  95. dstack/_internal/core/models/instances.py +24 -0
  96. dstack/_internal/core/models/profiles.py +85 -45
  97. dstack/_internal/core/models/projects.py +1 -1
  98. dstack/_internal/core/models/repos/base.py +0 -5
  99. dstack/_internal/core/models/repos/local.py +3 -3
  100. dstack/_internal/core/models/repos/remote.py +26 -12
  101. dstack/_internal/core/models/repos/virtual.py +1 -1
  102. dstack/_internal/core/models/resources.py +45 -76
  103. dstack/_internal/core/models/runs.py +21 -19
  104. dstack/_internal/core/models/volumes.py +1 -3
  105. dstack/_internal/core/services/profiles.py +7 -16
  106. dstack/_internal/core/services/repos.py +0 -4
  107. dstack/_internal/server/app.py +11 -4
  108. dstack/_internal/server/background/__init__.py +10 -0
  109. dstack/_internal/server/background/tasks/process_gateways.py +4 -8
  110. dstack/_internal/server/background/tasks/process_instances.py +14 -9
  111. dstack/_internal/server/background/tasks/process_metrics.py +1 -1
  112. dstack/_internal/server/background/tasks/process_placement_groups.py +5 -1
  113. dstack/_internal/server/background/tasks/process_prometheus_metrics.py +135 -0
  114. dstack/_internal/server/background/tasks/process_running_jobs.py +80 -24
  115. dstack/_internal/server/background/tasks/process_runs.py +1 -0
  116. dstack/_internal/server/background/tasks/process_submitted_jobs.py +20 -38
  117. dstack/_internal/server/background/tasks/process_volumes.py +5 -2
  118. dstack/_internal/server/migrations/versions/60e444118b6d_add_jobprometheusmetrics.py +40 -0
  119. dstack/_internal/server/migrations/versions/7bc2586e8b9e_make_instancemodel_pool_id_optional.py +36 -0
  120. dstack/_internal/server/migrations/versions/98d1b92988bc_add_jobterminationreason_terminated_due_.py +140 -0
  121. dstack/_internal/server/migrations/versions/bc8ca4a505c6_store_backendtype_as_string.py +171 -0
  122. dstack/_internal/server/models.py +59 -9
  123. dstack/_internal/server/routers/backends.py +14 -23
  124. dstack/_internal/server/routers/instances.py +3 -4
  125. dstack/_internal/server/routers/metrics.py +31 -10
  126. dstack/_internal/server/routers/prometheus.py +36 -0
  127. dstack/_internal/server/routers/repos.py +1 -2
  128. dstack/_internal/server/routers/runs.py +13 -59
  129. dstack/_internal/server/schemas/gateways.py +14 -23
  130. dstack/_internal/server/schemas/projects.py +7 -2
  131. dstack/_internal/server/schemas/repos.py +2 -38
  132. dstack/_internal/server/schemas/runner.py +1 -0
  133. dstack/_internal/server/schemas/runs.py +1 -24
  134. dstack/_internal/server/security/permissions.py +1 -1
  135. dstack/_internal/server/services/backends/__init__.py +85 -158
  136. dstack/_internal/server/services/config.py +53 -567
  137. dstack/_internal/server/services/fleets.py +9 -103
  138. dstack/_internal/server/services/gateways/__init__.py +13 -4
  139. dstack/_internal/server/services/{pools.py → instances.py} +22 -329
  140. dstack/_internal/server/services/jobs/__init__.py +9 -6
  141. dstack/_internal/server/services/jobs/configurators/base.py +25 -1
  142. dstack/_internal/server/services/jobs/configurators/dev.py +9 -1
  143. dstack/_internal/server/services/jobs/configurators/extensions/cursor.py +42 -0
  144. dstack/_internal/server/services/metrics.py +131 -72
  145. dstack/_internal/server/services/offers.py +1 -1
  146. dstack/_internal/server/services/projects.py +23 -14
  147. dstack/_internal/server/services/prometheus.py +245 -0
  148. dstack/_internal/server/services/runner/client.py +14 -3
  149. dstack/_internal/server/services/runs.py +67 -31
  150. dstack/_internal/server/services/volumes.py +9 -4
  151. dstack/_internal/server/settings.py +3 -0
  152. dstack/_internal/server/statics/index.html +1 -1
  153. dstack/_internal/server/statics/{main-fe8fd9db55df8d10e648.js → main-4fd5a4770eff59325ee3.js} +68 -15
  154. dstack/_internal/server/statics/{main-fe8fd9db55df8d10e648.js.map → main-4fd5a4770eff59325ee3.js.map} +1 -1
  155. dstack/_internal/server/statics/{main-7510e71dfa9749a4e70e.css → main-da9f8c06a69c20dac23e.css} +1 -1
  156. dstack/_internal/server/statics/static/media/entraID.d65d1f3e9486a8e56d24fc07b3230885.svg +9 -0
  157. dstack/_internal/server/testing/common.py +75 -32
  158. dstack/_internal/utils/json_schema.py +6 -0
  159. dstack/_internal/utils/ssh.py +2 -1
  160. dstack/api/__init__.py +4 -0
  161. dstack/api/_public/__init__.py +16 -20
  162. dstack/api/_public/backends.py +1 -1
  163. dstack/api/_public/repos.py +36 -36
  164. dstack/api/_public/runs.py +170 -83
  165. dstack/api/server/__init__.py +11 -13
  166. dstack/api/server/_backends.py +12 -16
  167. dstack/api/server/_fleets.py +15 -55
  168. dstack/api/server/_gateways.py +3 -14
  169. dstack/api/server/_repos.py +1 -4
  170. dstack/api/server/_runs.py +21 -96
  171. dstack/api/server/_volumes.py +10 -5
  172. dstack/api/utils.py +3 -0
  173. dstack/version.py +1 -1
  174. {dstack-0.18.43.dist-info → dstack-0.19.0rc1.dist-info}/METADATA +10 -1
  175. {dstack-0.18.43.dist-info → dstack-0.19.0rc1.dist-info}/RECORD +229 -206
  176. tests/_internal/cli/services/configurators/test_profile.py +6 -6
  177. tests/_internal/core/backends/aws/test_configurator.py +35 -0
  178. tests/_internal/core/backends/aws/test_resources.py +1 -1
  179. tests/_internal/core/backends/azure/test_configurator.py +61 -0
  180. tests/_internal/core/backends/cudo/__init__.py +0 -0
  181. tests/_internal/core/backends/cudo/test_configurator.py +37 -0
  182. tests/_internal/core/backends/datacrunch/__init__.py +0 -0
  183. tests/_internal/core/backends/datacrunch/test_configurator.py +17 -0
  184. tests/_internal/core/backends/gcp/test_configurator.py +42 -0
  185. tests/_internal/core/backends/kubernetes/test_configurator.py +43 -0
  186. tests/_internal/core/backends/lambdalabs/__init__.py +0 -0
  187. tests/_internal/core/backends/lambdalabs/test_configurator.py +38 -0
  188. tests/_internal/core/backends/oci/test_configurator.py +55 -0
  189. tests/_internal/core/backends/runpod/__init__.py +0 -0
  190. tests/_internal/core/backends/runpod/test_configurator.py +33 -0
  191. tests/_internal/core/backends/tensordock/__init__.py +0 -0
  192. tests/_internal/core/backends/tensordock/test_configurator.py +38 -0
  193. tests/_internal/core/backends/vastai/__init__.py +0 -0
  194. tests/_internal/core/backends/vastai/test_configurator.py +33 -0
  195. tests/_internal/core/backends/vultr/__init__.py +0 -0
  196. tests/_internal/core/backends/vultr/test_configurator.py +33 -0
  197. tests/_internal/server/background/tasks/test_process_gateways.py +4 -0
  198. tests/_internal/server/background/tasks/test_process_instances.py +49 -48
  199. tests/_internal/server/background/tasks/test_process_metrics.py +0 -3
  200. tests/_internal/server/background/tasks/test_process_placement_groups.py +2 -0
  201. tests/_internal/server/background/tasks/test_process_prometheus_metrics.py +186 -0
  202. tests/_internal/server/background/tasks/test_process_running_jobs.py +123 -19
  203. tests/_internal/server/background/tasks/test_process_runs.py +8 -22
  204. tests/_internal/server/background/tasks/test_process_submitted_jobs.py +3 -40
  205. tests/_internal/server/background/tasks/test_process_submitted_volumes.py +2 -0
  206. tests/_internal/server/background/tasks/test_process_terminating_jobs.py +10 -15
  207. tests/_internal/server/routers/test_backends.py +6 -764
  208. tests/_internal/server/routers/test_fleets.py +2 -26
  209. tests/_internal/server/routers/test_gateways.py +27 -3
  210. tests/_internal/server/routers/test_instances.py +0 -10
  211. tests/_internal/server/routers/test_metrics.py +42 -0
  212. tests/_internal/server/routers/test_projects.py +56 -0
  213. tests/_internal/server/routers/test_prometheus.py +333 -0
  214. tests/_internal/server/routers/test_repos.py +0 -15
  215. tests/_internal/server/routers/test_runs.py +83 -275
  216. tests/_internal/server/routers/test_volumes.py +2 -3
  217. tests/_internal/server/services/backends/__init__.py +0 -0
  218. tests/_internal/server/services/jobs/configurators/test_task.py +35 -0
  219. tests/_internal/server/services/test_config.py +7 -4
  220. tests/_internal/server/services/test_fleets.py +1 -4
  221. tests/_internal/server/services/{test_pools.py → test_instances.py} +11 -49
  222. tests/_internal/server/services/test_metrics.py +167 -0
  223. tests/_internal/server/services/test_repos.py +1 -14
  224. tests/_internal/server/services/test_runs.py +0 -4
  225. dstack/_internal/cli/commands/pool.py +0 -581
  226. dstack/_internal/cli/commands/run.py +0 -75
  227. dstack/_internal/core/backends/aws/config.py +0 -18
  228. dstack/_internal/core/backends/azure/config.py +0 -12
  229. dstack/_internal/core/backends/base/config.py +0 -5
  230. dstack/_internal/core/backends/cudo/config.py +0 -9
  231. dstack/_internal/core/backends/datacrunch/config.py +0 -9
  232. dstack/_internal/core/backends/gcp/config.py +0 -22
  233. dstack/_internal/core/backends/kubernetes/config.py +0 -6
  234. dstack/_internal/core/backends/lambdalabs/config.py +0 -9
  235. dstack/_internal/core/backends/nebius/__init__.py +0 -15
  236. dstack/_internal/core/backends/nebius/api_client.py +0 -319
  237. dstack/_internal/core/backends/nebius/compute.py +0 -220
  238. dstack/_internal/core/backends/nebius/config.py +0 -6
  239. dstack/_internal/core/backends/nebius/types.py +0 -37
  240. dstack/_internal/core/backends/oci/config.py +0 -6
  241. dstack/_internal/core/backends/runpod/config.py +0 -9
  242. dstack/_internal/core/backends/tensordock/config.py +0 -9
  243. dstack/_internal/core/backends/vastai/config.py +0 -6
  244. dstack/_internal/core/backends/vultr/config.py +0 -9
  245. dstack/_internal/core/models/backends/aws.py +0 -86
  246. dstack/_internal/core/models/backends/azure.py +0 -68
  247. dstack/_internal/core/models/backends/cudo.py +0 -43
  248. dstack/_internal/core/models/backends/datacrunch.py +0 -44
  249. dstack/_internal/core/models/backends/gcp.py +0 -67
  250. dstack/_internal/core/models/backends/kubernetes.py +0 -40
  251. dstack/_internal/core/models/backends/lambdalabs.py +0 -43
  252. dstack/_internal/core/models/backends/nebius.py +0 -54
  253. dstack/_internal/core/models/backends/runpod.py +0 -40
  254. dstack/_internal/core/models/backends/tensordock.py +0 -44
  255. dstack/_internal/core/models/backends/vastai.py +0 -43
  256. dstack/_internal/core/models/backends/vultr.py +0 -40
  257. dstack/_internal/core/models/pools.py +0 -43
  258. dstack/_internal/server/routers/pools.py +0 -142
  259. dstack/_internal/server/schemas/pools.py +0 -38
  260. dstack/_internal/server/services/backends/configurators/base.py +0 -72
  261. dstack/_internal/server/services/backends/configurators/cudo.py +0 -87
  262. dstack/_internal/server/services/backends/configurators/datacrunch.py +0 -79
  263. dstack/_internal/server/services/backends/configurators/kubernetes.py +0 -63
  264. dstack/_internal/server/services/backends/configurators/lambdalabs.py +0 -98
  265. dstack/_internal/server/services/backends/configurators/nebius.py +0 -85
  266. dstack/_internal/server/services/backends/configurators/runpod.py +0 -97
  267. dstack/_internal/server/services/backends/configurators/tensordock.py +0 -82
  268. dstack/_internal/server/services/backends/configurators/vastai.py +0 -80
  269. dstack/_internal/server/services/backends/configurators/vultr.py +0 -80
  270. dstack/api/_public/pools.py +0 -41
  271. dstack/api/_public/resources.py +0 -105
  272. dstack/api/server/_pools.py +0 -63
  273. tests/_internal/server/routers/test_pools.py +0 -612
  274. /dstack/_internal/{server/services/backends/configurators → core/backends/dstack}/__init__.py +0 -0
  275. {dstack-0.18.43.dist-info → dstack-0.19.0rc1.dist-info}/LICENSE.md +0 -0
  276. {dstack-0.18.43.dist-info → dstack-0.19.0rc1.dist-info}/WHEEL +0 -0
  277. {dstack-0.18.43.dist-info → dstack-0.19.0rc1.dist-info}/entry_points.txt +0 -0
  278. {dstack-0.18.43.dist-info → dstack-0.19.0rc1.dist-info}/top_level.txt +0 -0
@@ -28,9 +28,9 @@ from dstack._internal.server.background.tasks.process_instances import (
28
28
  process_instances,
29
29
  )
30
30
  from dstack._internal.server.testing.common import (
31
+ ComputeMockSpec,
31
32
  create_instance,
32
33
  create_job,
33
- create_pool,
34
34
  create_project,
35
35
  create_repo,
36
36
  create_run,
@@ -49,10 +49,10 @@ class TestCheckShim:
49
49
  self, test_db, session: AsyncSession
50
50
  ):
51
51
  project = await create_project(session=session)
52
- pool = await create_pool(session, project)
53
-
54
52
  instance = await create_instance(
55
- session, project, pool, status=InstanceStatus.PROVISIONING
53
+ session=session,
54
+ project=project,
55
+ status=InstanceStatus.PROVISIONING,
56
56
  )
57
57
  instance.termination_deadline = get_current_datetime() + dt.timedelta(days=1)
58
58
  instance.health_status = "ssh connect problem"
@@ -78,10 +78,10 @@ class TestCheckShim:
78
78
  self, test_db, session: AsyncSession
79
79
  ):
80
80
  project = await create_project(session=session)
81
- pool = await create_pool(session, project)
82
-
83
81
  instance = await create_instance(
84
- session, project, pool, status=InstanceStatus.PROVISIONING
82
+ session=session,
83
+ project=project,
84
+ status=InstanceStatus.PROVISIONING,
85
85
  )
86
86
  instance.started_at = get_current_datetime() + dt.timedelta(minutes=-20)
87
87
  instance.health_status = "ssh connect problem"
@@ -110,7 +110,6 @@ class TestCheckShim:
110
110
  ):
111
111
  user = await create_user(session=session)
112
112
  project = await create_project(session=session, owner=user)
113
- pool = await create_pool(session, project)
114
113
  repo = await create_repo(
115
114
  session=session,
116
115
  project_id=project.id,
@@ -121,9 +120,10 @@ class TestCheckShim:
121
120
  repo=repo,
122
121
  user=user,
123
122
  )
124
-
125
123
  instance = await create_instance(
126
- session, project, pool, status=InstanceStatus.PROVISIONING
124
+ session=session,
125
+ project=project,
126
+ status=InstanceStatus.PROVISIONING,
127
127
  )
128
128
  instance.termination_deadline = get_current_datetime().replace(
129
129
  tzinfo=dt.timezone.utc
@@ -158,10 +158,11 @@ class TestCheckShim:
158
158
  @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True)
159
159
  async def test_check_shim_start_termination_deadline(self, test_db, session: AsyncSession):
160
160
  project = await create_project(session=session)
161
- pool = await create_pool(session, project)
162
-
163
- instance = await create_instance(session, project, pool, status=InstanceStatus.IDLE)
164
-
161
+ instance = await create_instance(
162
+ session=session,
163
+ project=project,
164
+ status=InstanceStatus.IDLE,
165
+ )
165
166
  health_status = "SSH connection fail"
166
167
  with patch(
167
168
  "dstack._internal.server.background.tasks.process_instances._instance_healthcheck"
@@ -183,9 +184,11 @@ class TestCheckShim:
183
184
  @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True)
184
185
  async def test_check_shim_stop_termination_deadline(self, test_db, session: AsyncSession):
185
186
  project = await create_project(session=session)
186
- pool = await create_pool(session, project)
187
-
188
- instance = await create_instance(session, project, pool, status=InstanceStatus.IDLE)
187
+ instance = await create_instance(
188
+ session=session,
189
+ project=project,
190
+ status=InstanceStatus.IDLE,
191
+ )
189
192
  instance.termination_deadline = get_current_datetime() + dt.timedelta(minutes=19)
190
193
  await session.commit()
191
194
 
@@ -206,9 +209,11 @@ class TestCheckShim:
206
209
  @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True)
207
210
  async def test_check_shim_terminate_instance_by_dedaline(self, test_db, session: AsyncSession):
208
211
  project = await create_project(session=session)
209
- pool = await create_pool(session, project)
210
-
211
- instance = await create_instance(session, project, pool, status=InstanceStatus.IDLE)
212
+ instance = await create_instance(
213
+ session=session,
214
+ project=project,
215
+ status=InstanceStatus.IDLE,
216
+ )
212
217
  termination_deadline_time = get_current_datetime() + dt.timedelta(minutes=-19)
213
218
  instance.termination_deadline = termination_deadline_time
214
219
  await session.commit()
@@ -251,7 +256,6 @@ class TestCheckShim:
251
256
  ):
252
257
  # see https://github.com/dstackai/dstack/issues/2041
253
258
  project = await create_project(session=session)
254
- pool = await create_pool(session, project)
255
259
  if has_job:
256
260
  user = await create_user(session=session)
257
261
  repo = await create_repo(
@@ -272,9 +276,8 @@ class TestCheckShim:
272
276
  else:
273
277
  job = None
274
278
  instance = await create_instance(
275
- session,
276
- project,
277
- pool,
279
+ session=session,
280
+ project=project,
278
281
  created_at=get_current_datetime(),
279
282
  termination_policy=termination_policy,
280
283
  status=InstanceStatus.IDLE,
@@ -302,8 +305,9 @@ class TestTerminateIdleTime:
302
305
  @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True)
303
306
  async def test_terminate_by_idle_timeout(self, test_db, session: AsyncSession):
304
307
  project = await create_project(session=session)
305
- pool = await create_pool(session, project)
306
- instance = await create_instance(session, project, pool, status=InstanceStatus.IDLE)
308
+ instance = await create_instance(
309
+ session=session, project=project, status=InstanceStatus.IDLE
310
+ )
307
311
  instance.termination_idle_time = 300
308
312
  instance.termination_policy = TerminationPolicy.DESTROY_AFTER_IDLE
309
313
  instance.last_job_processed_at = get_current_datetime() + dt.timedelta(minutes=-19)
@@ -320,11 +324,9 @@ class TestSSHInstanceTerminateProvisionTimeoutExpired:
320
324
  @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True)
321
325
  async def test_terminate_by_idle_timeout(self, test_db, session: AsyncSession):
322
326
  project = await create_project(session=session)
323
- pool = await create_pool(session, project)
324
327
  instance = await create_instance(
325
- session,
326
- project,
327
- pool,
328
+ session=session,
329
+ project=project,
328
330
  status=InstanceStatus.PENDING,
329
331
  created_at=get_current_datetime() - dt.timedelta(days=100),
330
332
  )
@@ -357,10 +359,9 @@ class TestTerminate:
357
359
  @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True)
358
360
  async def test_terminate(self, test_db, session: AsyncSession):
359
361
  project = await create_project(session=session)
360
- pool = await create_pool(session, project)
361
-
362
- instance = await create_instance(session, project, pool, status=InstanceStatus.TERMINATING)
363
-
362
+ instance = await create_instance(
363
+ session=session, project=project, status=InstanceStatus.TERMINATING
364
+ )
364
365
  reason = "some reason"
365
366
  instance.termination_reason = reason
366
367
  instance.last_job_processed_at = get_current_datetime() + dt.timedelta(minutes=-19)
@@ -384,8 +385,9 @@ class TestTerminate:
384
385
  @pytest.mark.parametrize("error", [BackendError("err"), RuntimeError("err")])
385
386
  async def test_terminate_retry(self, test_db, session: AsyncSession, error: Exception):
386
387
  project = await create_project(session=session)
387
- pool = await create_pool(session, project)
388
- instance = await create_instance(session, project, pool, status=InstanceStatus.TERMINATING)
388
+ instance = await create_instance(
389
+ session=session, project=project, status=InstanceStatus.TERMINATING
390
+ )
389
391
  instance.termination_reason = "some reason"
390
392
  initial_time = dt.datetime(2025, 1, 1, tzinfo=dt.timezone.utc)
391
393
  instance.last_job_processed_at = initial_time
@@ -415,8 +417,9 @@ class TestTerminate:
415
417
  @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True)
416
418
  async def test_terminate_not_retries_if_too_early(self, test_db, session: AsyncSession):
417
419
  project = await create_project(session=session)
418
- pool = await create_pool(session, project)
419
- instance = await create_instance(session, project, pool, status=InstanceStatus.TERMINATING)
420
+ instance = await create_instance(
421
+ session=session, project=project, status=InstanceStatus.TERMINATING
422
+ )
420
423
  instance.termination_reason = "some reason"
421
424
  initial_time = dt.datetime(2025, 1, 1, tzinfo=dt.timezone.utc)
422
425
  instance.last_job_processed_at = initial_time
@@ -446,8 +449,9 @@ class TestTerminate:
446
449
  @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True)
447
450
  async def test_terminate_on_termination_deadline(self, test_db, session: AsyncSession):
448
451
  project = await create_project(session=session)
449
- pool = await create_pool(session, project)
450
- instance = await create_instance(session, project, pool, status=InstanceStatus.TERMINATING)
452
+ instance = await create_instance(
453
+ session=session, project=project, status=InstanceStatus.TERMINATING
454
+ )
451
455
  instance.termination_reason = "some reason"
452
456
  initial_time = dt.datetime(2025, 1, 1, tzinfo=dt.timezone.utc)
453
457
  instance.last_job_processed_at = initial_time
@@ -505,11 +509,9 @@ class TestCreateInstance:
505
509
  expected_blocks: int,
506
510
  ):
507
511
  project = await create_project(session=session)
508
- pool = await create_pool(session, project)
509
512
  instance = await create_instance(
510
- session,
511
- project,
512
- pool,
513
+ session=session,
514
+ project=project,
513
515
  status=InstanceStatus.PENDING,
514
516
  total_blocks=requested_blocks,
515
517
  busy_blocks=0,
@@ -531,6 +533,7 @@ class TestCreateInstance:
531
533
  price=1.0,
532
534
  availability=InstanceAvailability.AVAILABLE,
533
535
  )
536
+ backend_mock.compute.return_value = Mock(spec=ComputeMockSpec)
534
537
  backend_mock.compute.return_value.get_offers_cached.return_value = [offer]
535
538
  backend_mock.compute.return_value.create_instance.return_value = JobProvisioningData(
536
539
  backend=offer.backend,
@@ -611,11 +614,9 @@ class TestAddSSHInstance:
611
614
  host_info["cpus"] = cpus
612
615
  host_info["gpu_count"] = gpus
613
616
  project = await create_project(session=session)
614
- pool = await create_pool(session, project)
615
617
  instance = await create_instance(
616
- session,
617
- project,
618
- pool,
618
+ session=session,
619
+ project=project,
619
620
  status=InstanceStatus.PENDING,
620
621
  created_at=get_current_datetime(),
621
622
  remote_connection_info=get_remote_connection_info(),
@@ -21,7 +21,6 @@ from dstack._internal.server.testing.common import (
21
21
  create_instance,
22
22
  create_job,
23
23
  create_job_metrics_point,
24
- create_pool,
25
24
  create_project,
26
25
  create_repo,
27
26
  create_run,
@@ -45,11 +44,9 @@ class TestCollectMetrics:
45
44
  session=session,
46
45
  project_id=project.id,
47
46
  )
48
- pool = await create_pool(session=session, project=project)
49
47
  instance = await create_instance(
50
48
  session=session,
51
49
  project=project,
52
- pool=pool,
53
50
  status=InstanceStatus.BUSY,
54
51
  )
55
52
  run = await create_run(
@@ -7,6 +7,7 @@ from dstack._internal.server.background.tasks.process_placement_groups import (
7
7
  process_placement_groups,
8
8
  )
9
9
  from dstack._internal.server.testing.common import (
10
+ ComputeMockSpec,
10
11
  create_fleet,
11
12
  create_placement_group,
12
13
  create_project,
@@ -34,6 +35,7 @@ class TestProcessPlacementGroups:
34
35
  with patch("dstack._internal.server.services.backends.get_project_backend_by_type") as m:
35
36
  aws_mock = Mock()
36
37
  m.return_value = aws_mock
38
+ aws_mock.compute.return_value = Mock(spec=ComputeMockSpec)
37
39
  await process_placement_groups()
38
40
  aws_mock.compute.return_value.delete_placement_group.assert_called_once()
39
41
  await session.refresh(placement_group1)
@@ -0,0 +1,186 @@
1
+ from collections.abc import Generator
2
+ from datetime import datetime, timezone
3
+ from unittest.mock import Mock, patch
4
+
5
+ import pytest
6
+ import pytest_asyncio
7
+ from freezegun import freeze_time
8
+ from sqlalchemy import select
9
+ from sqlalchemy.ext.asyncio import AsyncSession
10
+
11
+ from dstack._internal.core.models.instances import InstanceStatus
12
+ from dstack._internal.core.models.runs import JobStatus
13
+ from dstack._internal.core.models.users import GlobalRole, ProjectRole
14
+ from dstack._internal.server.background.tasks.process_prometheus_metrics import (
15
+ collect_prometheus_metrics,
16
+ delete_prometheus_metrics,
17
+ )
18
+ from dstack._internal.server.models import JobModel, JobPrometheusMetrics
19
+ from dstack._internal.server.services.projects import add_project_member
20
+ from dstack._internal.server.testing.common import (
21
+ create_instance,
22
+ create_job,
23
+ create_job_prometheus_metrics,
24
+ create_project,
25
+ create_repo,
26
+ create_run,
27
+ create_user,
28
+ get_job_provisioning_data,
29
+ )
30
+
31
+
32
+ @pytest.mark.asyncio
33
+ @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True)
34
+ @pytest.mark.usefixtures("test_db", "image_config_mock")
35
+ class TestCollectPrometheusMetrics:
36
+ @pytest_asyncio.fixture
37
+ async def job(self, session: AsyncSession) -> JobModel:
38
+ user = await create_user(session=session, global_role=GlobalRole.USER)
39
+ project = await create_project(session=session, owner=user)
40
+ await add_project_member(
41
+ session=session, project=project, user=user, project_role=ProjectRole.USER
42
+ )
43
+ repo = await create_repo(
44
+ session=session,
45
+ project_id=project.id,
46
+ )
47
+ instance = await create_instance(
48
+ session=session,
49
+ project=project,
50
+ status=InstanceStatus.BUSY,
51
+ )
52
+ run = await create_run(
53
+ session=session,
54
+ project=project,
55
+ repo=repo,
56
+ user=user,
57
+ )
58
+ job = await create_job(
59
+ session=session,
60
+ run=run,
61
+ status=JobStatus.RUNNING,
62
+ job_provisioning_data=get_job_provisioning_data(),
63
+ instance_assigned=True,
64
+ instance=instance,
65
+ )
66
+ return job
67
+
68
+ @pytest.fixture
69
+ def ssh_tunnel_mock(self) -> Generator[Mock, None, None]:
70
+ with patch("dstack._internal.server.services.runner.ssh.SSHTunnel") as SSHTunnelMock:
71
+ yield SSHTunnelMock
72
+
73
+ @pytest.fixture
74
+ def shim_client_mock(self) -> Generator[Mock, None, None]:
75
+ with patch("dstack._internal.server.services.runner.client.ShimClient") as ShimClientMock:
76
+ yield ShimClientMock.return_value
77
+
78
+ @freeze_time(datetime(2023, 1, 2, 3, 5, 20, tzinfo=timezone.utc))
79
+ async def test_inserts_new_record(
80
+ self, session: AsyncSession, job: JobModel, ssh_tunnel_mock: Mock, shim_client_mock: Mock
81
+ ):
82
+ shim_client_mock.get_task_metrics.return_value = "# prom response"
83
+
84
+ await collect_prometheus_metrics()
85
+
86
+ ssh_tunnel_mock.assert_called_once()
87
+ shim_client_mock.get_task_metrics.assert_called_once()
88
+ res = await session.execute(
89
+ select(JobPrometheusMetrics).where(JobPrometheusMetrics.job_id == job.id)
90
+ )
91
+ metrics = res.scalar_one()
92
+ assert metrics.text == "# prom response"
93
+ assert metrics.collected_at == datetime(2023, 1, 2, 3, 5, 20)
94
+
95
+ @freeze_time(datetime(2023, 1, 2, 3, 5, 20, tzinfo=timezone.utc))
96
+ async def test_updates_record(
97
+ self, session: AsyncSession, job: JobModel, ssh_tunnel_mock: Mock, shim_client_mock: Mock
98
+ ):
99
+ metrics = await create_job_prometheus_metrics(
100
+ session=session,
101
+ job=job,
102
+ collected_at=datetime(2023, 1, 2, 3, 5, 0),
103
+ text="# prom old response",
104
+ )
105
+ shim_client_mock.get_task_metrics.return_value = "# prom new response"
106
+
107
+ await collect_prometheus_metrics()
108
+
109
+ ssh_tunnel_mock.assert_called_once()
110
+ shim_client_mock.get_task_metrics.assert_called_once()
111
+ res = await session.execute(
112
+ select(JobPrometheusMetrics)
113
+ .where(JobPrometheusMetrics.job_id == job.id)
114
+ .execution_options(populate_existing=True)
115
+ )
116
+ metrics = res.scalar_one()
117
+ assert metrics.text == "# prom new response"
118
+ assert metrics.collected_at == datetime(2023, 1, 2, 3, 5, 20)
119
+
120
+ @freeze_time(datetime(2023, 1, 2, 3, 5, 20, tzinfo=timezone.utc))
121
+ async def test_skips_recently_updated(
122
+ self, session: AsyncSession, job: JobModel, ssh_tunnel_mock: Mock, shim_client_mock: Mock
123
+ ):
124
+ metrics = await create_job_prometheus_metrics(
125
+ session=session,
126
+ job=job,
127
+ collected_at=datetime(2023, 1, 2, 3, 5, 15),
128
+ text="# prom old response",
129
+ )
130
+ shim_client_mock.get_task_metrics.return_value = "# prom new response"
131
+
132
+ await collect_prometheus_metrics()
133
+
134
+ ssh_tunnel_mock.assert_not_called()
135
+ shim_client_mock.get_task_metrics.assert_not_called()
136
+ res = await session.execute(
137
+ select(JobPrometheusMetrics)
138
+ .where(JobPrometheusMetrics.job_id == job.id)
139
+ .execution_options(populate_existing=True)
140
+ )
141
+ metrics = res.scalar_one()
142
+ assert metrics.text == "# prom old response"
143
+ assert metrics.collected_at == datetime(2023, 1, 2, 3, 5, 15)
144
+
145
+
146
+ @pytest.mark.asyncio
147
+ @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True)
148
+ @pytest.mark.usefixtures("test_db", "image_config_mock")
149
+ class TestDeletePrometheusMetrics:
150
+ @freeze_time(datetime(2023, 1, 2, 3, 5, 20, tzinfo=timezone.utc))
151
+ async def test_deletes_old_metrics(self, session: AsyncSession):
152
+ user = await create_user(session=session, global_role=GlobalRole.USER)
153
+ project = await create_project(session=session, owner=user)
154
+ await add_project_member(
155
+ session=session, project=project, user=user, project_role=ProjectRole.USER
156
+ )
157
+ repo = await create_repo(session=session, project_id=project.id)
158
+ run_1 = await create_run(
159
+ session=session, project=project, repo=repo, user=user, run_name="run-1"
160
+ )
161
+ job_1 = await create_job(session=session, run=run_1)
162
+ # old metrics
163
+ await create_job_prometheus_metrics(
164
+ session=session,
165
+ job=job_1,
166
+ collected_at=datetime(2023, 1, 2, 2, 3, 30),
167
+ )
168
+ run_2 = await create_run(
169
+ session=session, project=project, repo=repo, user=user, run_name="run-2"
170
+ )
171
+ job_2 = await create_job(session=session, run=run_2)
172
+ # recent metrics
173
+ metrics_2 = await create_job_prometheus_metrics(
174
+ session=session,
175
+ job=job_2,
176
+ collected_at=datetime(2023, 1, 2, 3, 5, 0),
177
+ )
178
+
179
+ await delete_prometheus_metrics()
180
+
181
+ res = await session.execute(
182
+ select(JobPrometheusMetrics).join(JobModel).where(JobModel.project_id == project.id)
183
+ )
184
+ all_metrics = res.scalars().all()
185
+ assert len(all_metrics) == 1
186
+ assert all_metrics[0] == metrics_2