dstack 0.18.43__py3-none-any.whl → 0.19.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (278)
  1. dstack/_internal/cli/commands/gateway.py +15 -3
  2. dstack/_internal/cli/commands/logs.py +0 -22
  3. dstack/_internal/cli/commands/stats.py +8 -17
  4. dstack/_internal/cli/main.py +1 -5
  5. dstack/_internal/cli/services/configurators/fleet.py +4 -39
  6. dstack/_internal/cli/services/configurators/run.py +22 -20
  7. dstack/_internal/cli/services/profile.py +34 -83
  8. dstack/_internal/cli/utils/gateway.py +1 -1
  9. dstack/_internal/cli/utils/run.py +11 -0
  10. dstack/_internal/core/backends/__init__.py +56 -39
  11. dstack/_internal/core/backends/aws/__init__.py +0 -25
  12. dstack/_internal/core/backends/aws/auth.py +1 -10
  13. dstack/_internal/core/backends/aws/backend.py +26 -0
  14. dstack/_internal/core/backends/aws/compute.py +21 -45
  15. dstack/_internal/{server/services/backends/configurators/aws.py → core/backends/aws/configurator.py} +46 -85
  16. dstack/_internal/core/backends/aws/models.py +135 -0
  17. dstack/_internal/core/backends/aws/resources.py +1 -1
  18. dstack/_internal/core/backends/azure/__init__.py +0 -20
  19. dstack/_internal/core/backends/azure/auth.py +2 -11
  20. dstack/_internal/core/backends/azure/backend.py +21 -0
  21. dstack/_internal/core/backends/azure/compute.py +14 -28
  22. dstack/_internal/{server/services/backends/configurators/azure.py → core/backends/azure/configurator.py} +141 -210
  23. dstack/_internal/core/backends/azure/models.py +89 -0
  24. dstack/_internal/core/backends/base/__init__.py +0 -12
  25. dstack/_internal/core/backends/base/backend.py +18 -0
  26. dstack/_internal/core/backends/base/compute.py +153 -33
  27. dstack/_internal/core/backends/base/configurator.py +105 -0
  28. dstack/_internal/core/backends/base/models.py +14 -0
  29. dstack/_internal/core/backends/configurators.py +138 -0
  30. dstack/_internal/core/backends/cudo/__init__.py +0 -15
  31. dstack/_internal/core/backends/cudo/backend.py +16 -0
  32. dstack/_internal/core/backends/cudo/compute.py +8 -26
  33. dstack/_internal/core/backends/cudo/configurator.py +72 -0
  34. dstack/_internal/core/backends/cudo/models.py +37 -0
  35. dstack/_internal/core/backends/datacrunch/__init__.py +0 -15
  36. dstack/_internal/core/backends/datacrunch/backend.py +16 -0
  37. dstack/_internal/core/backends/datacrunch/compute.py +8 -25
  38. dstack/_internal/core/backends/datacrunch/configurator.py +66 -0
  39. dstack/_internal/core/backends/datacrunch/models.py +38 -0
  40. dstack/_internal/core/{models/backends/dstack.py → backends/dstack/models.py} +7 -7
  41. dstack/_internal/core/backends/gcp/__init__.py +0 -16
  42. dstack/_internal/core/backends/gcp/auth.py +2 -11
  43. dstack/_internal/core/backends/gcp/backend.py +17 -0
  44. dstack/_internal/core/backends/gcp/compute.py +14 -44
  45. dstack/_internal/{server/services/backends/configurators/gcp.py → core/backends/gcp/configurator.py} +46 -103
  46. dstack/_internal/core/backends/gcp/models.py +125 -0
  47. dstack/_internal/core/backends/kubernetes/__init__.py +0 -15
  48. dstack/_internal/core/backends/kubernetes/backend.py +16 -0
  49. dstack/_internal/core/backends/kubernetes/compute.py +16 -5
  50. dstack/_internal/core/backends/kubernetes/configurator.py +55 -0
  51. dstack/_internal/core/backends/kubernetes/models.py +72 -0
  52. dstack/_internal/core/backends/lambdalabs/__init__.py +0 -16
  53. dstack/_internal/core/backends/lambdalabs/backend.py +17 -0
  54. dstack/_internal/core/backends/lambdalabs/compute.py +7 -28
  55. dstack/_internal/core/backends/lambdalabs/configurator.py +82 -0
  56. dstack/_internal/core/backends/lambdalabs/models.py +37 -0
  57. dstack/_internal/core/backends/local/__init__.py +0 -13
  58. dstack/_internal/core/backends/local/backend.py +14 -0
  59. dstack/_internal/core/backends/local/compute.py +16 -2
  60. dstack/_internal/core/backends/models.py +128 -0
  61. dstack/_internal/core/backends/oci/__init__.py +0 -15
  62. dstack/_internal/core/backends/oci/auth.py +1 -5
  63. dstack/_internal/core/backends/oci/backend.py +16 -0
  64. dstack/_internal/core/backends/oci/compute.py +9 -23
  65. dstack/_internal/{server/services/backends/configurators/oci.py → core/backends/oci/configurator.py} +40 -85
  66. dstack/_internal/core/{models/backends/oci.py → backends/oci/models.py} +24 -25
  67. dstack/_internal/core/backends/oci/region.py +1 -1
  68. dstack/_internal/core/backends/runpod/__init__.py +0 -15
  69. dstack/_internal/core/backends/runpod/backend.py +16 -0
  70. dstack/_internal/core/backends/runpod/compute.py +28 -6
  71. dstack/_internal/core/backends/runpod/configurator.py +59 -0
  72. dstack/_internal/core/backends/runpod/models.py +54 -0
  73. dstack/_internal/core/backends/template/__init__.py +0 -0
  74. dstack/_internal/core/backends/tensordock/__init__.py +0 -15
  75. dstack/_internal/core/backends/tensordock/backend.py +16 -0
  76. dstack/_internal/core/backends/tensordock/compute.py +8 -27
  77. dstack/_internal/core/backends/tensordock/configurator.py +68 -0
  78. dstack/_internal/core/backends/tensordock/models.py +38 -0
  79. dstack/_internal/core/backends/vastai/__init__.py +0 -15
  80. dstack/_internal/core/backends/vastai/backend.py +16 -0
  81. dstack/_internal/core/backends/vastai/compute.py +2 -2
  82. dstack/_internal/core/backends/vastai/configurator.py +66 -0
  83. dstack/_internal/core/backends/vastai/models.py +37 -0
  84. dstack/_internal/core/backends/vultr/__init__.py +0 -15
  85. dstack/_internal/core/backends/vultr/backend.py +16 -0
  86. dstack/_internal/core/backends/vultr/compute.py +10 -24
  87. dstack/_internal/core/backends/vultr/configurator.py +64 -0
  88. dstack/_internal/core/backends/vultr/models.py +34 -0
  89. dstack/_internal/core/models/backends/__init__.py +0 -184
  90. dstack/_internal/core/models/backends/base.py +0 -19
  91. dstack/_internal/core/models/configurations.py +22 -16
  92. dstack/_internal/core/models/envs.py +4 -3
  93. dstack/_internal/core/models/fleets.py +17 -22
  94. dstack/_internal/core/models/gateways.py +3 -3
  95. dstack/_internal/core/models/instances.py +24 -0
  96. dstack/_internal/core/models/profiles.py +85 -45
  97. dstack/_internal/core/models/projects.py +1 -1
  98. dstack/_internal/core/models/repos/base.py +0 -5
  99. dstack/_internal/core/models/repos/local.py +3 -3
  100. dstack/_internal/core/models/repos/remote.py +26 -12
  101. dstack/_internal/core/models/repos/virtual.py +1 -1
  102. dstack/_internal/core/models/resources.py +45 -76
  103. dstack/_internal/core/models/runs.py +21 -19
  104. dstack/_internal/core/models/volumes.py +1 -3
  105. dstack/_internal/core/services/profiles.py +7 -16
  106. dstack/_internal/core/services/repos.py +0 -4
  107. dstack/_internal/server/app.py +11 -4
  108. dstack/_internal/server/background/__init__.py +10 -0
  109. dstack/_internal/server/background/tasks/process_gateways.py +4 -8
  110. dstack/_internal/server/background/tasks/process_instances.py +14 -9
  111. dstack/_internal/server/background/tasks/process_metrics.py +1 -1
  112. dstack/_internal/server/background/tasks/process_placement_groups.py +5 -1
  113. dstack/_internal/server/background/tasks/process_prometheus_metrics.py +135 -0
  114. dstack/_internal/server/background/tasks/process_running_jobs.py +80 -24
  115. dstack/_internal/server/background/tasks/process_runs.py +1 -0
  116. dstack/_internal/server/background/tasks/process_submitted_jobs.py +20 -38
  117. dstack/_internal/server/background/tasks/process_volumes.py +5 -2
  118. dstack/_internal/server/migrations/versions/60e444118b6d_add_jobprometheusmetrics.py +40 -0
  119. dstack/_internal/server/migrations/versions/7bc2586e8b9e_make_instancemodel_pool_id_optional.py +36 -0
  120. dstack/_internal/server/migrations/versions/98d1b92988bc_add_jobterminationreason_terminated_due_.py +140 -0
  121. dstack/_internal/server/migrations/versions/bc8ca4a505c6_store_backendtype_as_string.py +171 -0
  122. dstack/_internal/server/models.py +59 -9
  123. dstack/_internal/server/routers/backends.py +14 -23
  124. dstack/_internal/server/routers/instances.py +3 -4
  125. dstack/_internal/server/routers/metrics.py +31 -10
  126. dstack/_internal/server/routers/prometheus.py +36 -0
  127. dstack/_internal/server/routers/repos.py +1 -2
  128. dstack/_internal/server/routers/runs.py +13 -59
  129. dstack/_internal/server/schemas/gateways.py +14 -23
  130. dstack/_internal/server/schemas/projects.py +7 -2
  131. dstack/_internal/server/schemas/repos.py +2 -38
  132. dstack/_internal/server/schemas/runner.py +1 -0
  133. dstack/_internal/server/schemas/runs.py +1 -24
  134. dstack/_internal/server/security/permissions.py +1 -1
  135. dstack/_internal/server/services/backends/__init__.py +85 -158
  136. dstack/_internal/server/services/config.py +53 -567
  137. dstack/_internal/server/services/fleets.py +9 -103
  138. dstack/_internal/server/services/gateways/__init__.py +13 -4
  139. dstack/_internal/server/services/{pools.py → instances.py} +22 -329
  140. dstack/_internal/server/services/jobs/__init__.py +9 -6
  141. dstack/_internal/server/services/jobs/configurators/base.py +25 -1
  142. dstack/_internal/server/services/jobs/configurators/dev.py +9 -1
  143. dstack/_internal/server/services/jobs/configurators/extensions/cursor.py +42 -0
  144. dstack/_internal/server/services/metrics.py +131 -72
  145. dstack/_internal/server/services/offers.py +1 -1
  146. dstack/_internal/server/services/projects.py +23 -14
  147. dstack/_internal/server/services/prometheus.py +245 -0
  148. dstack/_internal/server/services/runner/client.py +14 -3
  149. dstack/_internal/server/services/runs.py +67 -31
  150. dstack/_internal/server/services/volumes.py +9 -4
  151. dstack/_internal/server/settings.py +3 -0
  152. dstack/_internal/server/statics/index.html +1 -1
  153. dstack/_internal/server/statics/{main-fe8fd9db55df8d10e648.js → main-4a0fe83e84574654e397.js} +76 -19
  154. dstack/_internal/server/statics/{main-fe8fd9db55df8d10e648.js.map → main-4a0fe83e84574654e397.js.map} +1 -1
  155. dstack/_internal/server/statics/{main-7510e71dfa9749a4e70e.css → main-da9f8c06a69c20dac23e.css} +1 -1
  156. dstack/_internal/server/statics/static/media/entraID.d65d1f3e9486a8e56d24fc07b3230885.svg +9 -0
  157. dstack/_internal/server/testing/common.py +75 -32
  158. dstack/_internal/utils/json_schema.py +6 -0
  159. dstack/_internal/utils/ssh.py +2 -1
  160. dstack/api/__init__.py +4 -0
  161. dstack/api/_public/__init__.py +16 -20
  162. dstack/api/_public/backends.py +1 -1
  163. dstack/api/_public/repos.py +36 -36
  164. dstack/api/_public/runs.py +170 -83
  165. dstack/api/server/__init__.py +11 -13
  166. dstack/api/server/_backends.py +12 -16
  167. dstack/api/server/_fleets.py +15 -55
  168. dstack/api/server/_gateways.py +3 -14
  169. dstack/api/server/_repos.py +1 -4
  170. dstack/api/server/_runs.py +21 -96
  171. dstack/api/server/_volumes.py +10 -5
  172. dstack/api/utils.py +3 -0
  173. dstack/version.py +1 -1
  174. {dstack-0.18.43.dist-info → dstack-0.19.0.dist-info}/METADATA +10 -1
  175. {dstack-0.18.43.dist-info → dstack-0.19.0.dist-info}/RECORD +229 -206
  176. tests/_internal/cli/services/configurators/test_profile.py +6 -6
  177. tests/_internal/core/backends/aws/test_configurator.py +35 -0
  178. tests/_internal/core/backends/aws/test_resources.py +1 -1
  179. tests/_internal/core/backends/azure/test_configurator.py +61 -0
  180. tests/_internal/core/backends/cudo/__init__.py +0 -0
  181. tests/_internal/core/backends/cudo/test_configurator.py +37 -0
  182. tests/_internal/core/backends/datacrunch/__init__.py +0 -0
  183. tests/_internal/core/backends/datacrunch/test_configurator.py +17 -0
  184. tests/_internal/core/backends/gcp/test_configurator.py +42 -0
  185. tests/_internal/core/backends/kubernetes/test_configurator.py +43 -0
  186. tests/_internal/core/backends/lambdalabs/__init__.py +0 -0
  187. tests/_internal/core/backends/lambdalabs/test_configurator.py +38 -0
  188. tests/_internal/core/backends/oci/test_configurator.py +55 -0
  189. tests/_internal/core/backends/runpod/__init__.py +0 -0
  190. tests/_internal/core/backends/runpod/test_configurator.py +33 -0
  191. tests/_internal/core/backends/tensordock/__init__.py +0 -0
  192. tests/_internal/core/backends/tensordock/test_configurator.py +38 -0
  193. tests/_internal/core/backends/vastai/__init__.py +0 -0
  194. tests/_internal/core/backends/vastai/test_configurator.py +33 -0
  195. tests/_internal/core/backends/vultr/__init__.py +0 -0
  196. tests/_internal/core/backends/vultr/test_configurator.py +33 -0
  197. tests/_internal/server/background/tasks/test_process_gateways.py +4 -0
  198. tests/_internal/server/background/tasks/test_process_instances.py +49 -48
  199. tests/_internal/server/background/tasks/test_process_metrics.py +0 -3
  200. tests/_internal/server/background/tasks/test_process_placement_groups.py +2 -0
  201. tests/_internal/server/background/tasks/test_process_prometheus_metrics.py +186 -0
  202. tests/_internal/server/background/tasks/test_process_running_jobs.py +123 -19
  203. tests/_internal/server/background/tasks/test_process_runs.py +8 -22
  204. tests/_internal/server/background/tasks/test_process_submitted_jobs.py +3 -40
  205. tests/_internal/server/background/tasks/test_process_submitted_volumes.py +2 -0
  206. tests/_internal/server/background/tasks/test_process_terminating_jobs.py +10 -15
  207. tests/_internal/server/routers/test_backends.py +6 -764
  208. tests/_internal/server/routers/test_fleets.py +2 -26
  209. tests/_internal/server/routers/test_gateways.py +27 -3
  210. tests/_internal/server/routers/test_instances.py +0 -10
  211. tests/_internal/server/routers/test_metrics.py +42 -0
  212. tests/_internal/server/routers/test_projects.py +56 -0
  213. tests/_internal/server/routers/test_prometheus.py +333 -0
  214. tests/_internal/server/routers/test_repos.py +0 -15
  215. tests/_internal/server/routers/test_runs.py +83 -275
  216. tests/_internal/server/routers/test_volumes.py +2 -3
  217. tests/_internal/server/services/backends/__init__.py +0 -0
  218. tests/_internal/server/services/jobs/configurators/test_task.py +35 -0
  219. tests/_internal/server/services/test_config.py +7 -4
  220. tests/_internal/server/services/test_fleets.py +1 -4
  221. tests/_internal/server/services/{test_pools.py → test_instances.py} +11 -49
  222. tests/_internal/server/services/test_metrics.py +167 -0
  223. tests/_internal/server/services/test_repos.py +1 -14
  224. tests/_internal/server/services/test_runs.py +0 -4
  225. dstack/_internal/cli/commands/pool.py +0 -581
  226. dstack/_internal/cli/commands/run.py +0 -75
  227. dstack/_internal/core/backends/aws/config.py +0 -18
  228. dstack/_internal/core/backends/azure/config.py +0 -12
  229. dstack/_internal/core/backends/base/config.py +0 -5
  230. dstack/_internal/core/backends/cudo/config.py +0 -9
  231. dstack/_internal/core/backends/datacrunch/config.py +0 -9
  232. dstack/_internal/core/backends/gcp/config.py +0 -22
  233. dstack/_internal/core/backends/kubernetes/config.py +0 -6
  234. dstack/_internal/core/backends/lambdalabs/config.py +0 -9
  235. dstack/_internal/core/backends/nebius/__init__.py +0 -15
  236. dstack/_internal/core/backends/nebius/api_client.py +0 -319
  237. dstack/_internal/core/backends/nebius/compute.py +0 -220
  238. dstack/_internal/core/backends/nebius/config.py +0 -6
  239. dstack/_internal/core/backends/nebius/types.py +0 -37
  240. dstack/_internal/core/backends/oci/config.py +0 -6
  241. dstack/_internal/core/backends/runpod/config.py +0 -9
  242. dstack/_internal/core/backends/tensordock/config.py +0 -9
  243. dstack/_internal/core/backends/vastai/config.py +0 -6
  244. dstack/_internal/core/backends/vultr/config.py +0 -9
  245. dstack/_internal/core/models/backends/aws.py +0 -86
  246. dstack/_internal/core/models/backends/azure.py +0 -68
  247. dstack/_internal/core/models/backends/cudo.py +0 -43
  248. dstack/_internal/core/models/backends/datacrunch.py +0 -44
  249. dstack/_internal/core/models/backends/gcp.py +0 -67
  250. dstack/_internal/core/models/backends/kubernetes.py +0 -40
  251. dstack/_internal/core/models/backends/lambdalabs.py +0 -43
  252. dstack/_internal/core/models/backends/nebius.py +0 -54
  253. dstack/_internal/core/models/backends/runpod.py +0 -40
  254. dstack/_internal/core/models/backends/tensordock.py +0 -44
  255. dstack/_internal/core/models/backends/vastai.py +0 -43
  256. dstack/_internal/core/models/backends/vultr.py +0 -40
  257. dstack/_internal/core/models/pools.py +0 -43
  258. dstack/_internal/server/routers/pools.py +0 -142
  259. dstack/_internal/server/schemas/pools.py +0 -38
  260. dstack/_internal/server/services/backends/configurators/base.py +0 -72
  261. dstack/_internal/server/services/backends/configurators/cudo.py +0 -87
  262. dstack/_internal/server/services/backends/configurators/datacrunch.py +0 -79
  263. dstack/_internal/server/services/backends/configurators/kubernetes.py +0 -63
  264. dstack/_internal/server/services/backends/configurators/lambdalabs.py +0 -98
  265. dstack/_internal/server/services/backends/configurators/nebius.py +0 -85
  266. dstack/_internal/server/services/backends/configurators/runpod.py +0 -97
  267. dstack/_internal/server/services/backends/configurators/tensordock.py +0 -82
  268. dstack/_internal/server/services/backends/configurators/vastai.py +0 -80
  269. dstack/_internal/server/services/backends/configurators/vultr.py +0 -80
  270. dstack/api/_public/pools.py +0 -41
  271. dstack/api/_public/resources.py +0 -105
  272. dstack/api/server/_pools.py +0 -63
  273. tests/_internal/server/routers/test_pools.py +0 -612
  274. /dstack/_internal/{server/services/backends/configurators → core/backends/dstack}/__init__.py +0 -0
  275. {dstack-0.18.43.dist-info → dstack-0.19.0.dist-info}/LICENSE.md +0 -0
  276. {dstack-0.18.43.dist-info → dstack-0.19.0.dist-info}/WHEEL +0 -0
  277. {dstack-0.18.43.dist-info → dstack-0.19.0.dist-info}/entry_points.txt +0 -0
  278. {dstack-0.18.43.dist-info → dstack-0.19.0.dist-info}/top_level.txt +0 -0
dstack/_internal/server/services/metrics.py
@@ -1,113 +1,172 @@
  import json
+ from collections import defaultdict
+ from collections.abc import Sequence
  from datetime import datetime, timezone
+ from typing import Optional

  from sqlalchemy import select
  from sqlalchemy.ext.asyncio import AsyncSession

- from dstack._internal.core.errors import ResourceNotExistsError
+ from dstack._internal.core.models.instances import Resources
  from dstack._internal.core.models.metrics import JobMetrics, Metric
- from dstack._internal.server.models import JobMetricsPoint, JobModel, ProjectModel
- from dstack._internal.server.services.jobs import get_run_job_model
+ from dstack._internal.server.models import JobMetricsPoint, JobModel
+ from dstack._internal.server.services.jobs import get_job_provisioning_data, get_job_runtime_data
+ from dstack._internal.utils.common import get_or_error
+ from dstack._internal.utils.logging import get_logger

-
- async def get_job_metrics(
-     session: AsyncSession,
-     project: ProjectModel,
-     run_name: str,
-     replica_num: int,
-     job_num: int,
- ) -> JobMetrics:
-     job_model = await get_run_job_model(
-         session=session,
-         project=project,
-         run_name=run_name,
-         replica_num=replica_num,
-         job_num=job_num,
-     )
-     if job_model is None:
-         raise ResourceNotExistsError("Found no job with given parameters")
-     job_metrics = await _get_job_metrics(
-         session=session,
-         job_model=job_model,
-     )
-     return job_metrics
+ logger = get_logger(__name__)


- async def _get_job_metrics(
+ async def get_job_metrics(
      session: AsyncSession,
      job_model: JobModel,
+     after: Optional[datetime] = None,
+     before: Optional[datetime] = None,
+     limit: Optional[int] = None,
  ) -> JobMetrics:
-     res = await session.execute(
+     """
+     Returns metrics ordered from the latest to the earliest.
+
+     Expected usage:
+     * limit=100 — get the latest 100 points
+     * after=<now - 1 hour> — get points for the last one hour
+     * before=<earliest timestamp from the last batch>, limit=100 — paginate back in history
+     """
+     stmt = (
          select(JobMetricsPoint)
          .where(JobMetricsPoint.job_id == job_model.id)
          .order_by(JobMetricsPoint.timestamp_micro.desc())
-         .limit(2)
      )
+     if after is not None:
+         # we need +1 point for cpu_usage_percent, thus >=
+         stmt = stmt.where(JobMetricsPoint.timestamp_micro >= _datetime_to_unix_time_micro(after))
+     if before is not None:
+         stmt = stmt.where(JobMetricsPoint.timestamp_micro < _datetime_to_unix_time_micro(before))
+     if limit is not None:
+         # +1 for cpu_usage_percent
+         stmt = stmt.limit(limit + 1)
+     res = await session.execute(stmt)
      points = res.scalars().all()
+     # we need at least 2 points to calculate cpu_usage_percent
      if len(points) < 2:
          return JobMetrics(metrics=[])
-     last_point = points[0]
-     prev_point = points[1]
-     return _calculate_job_metrics(last_point, prev_point)
+     return _calculate_job_metrics(job_model, points)


- def _calculate_job_metrics(last_point: JobMetricsPoint, prev_point: JobMetricsPoint) -> JobMetrics:
-     metrics = []
-     timestamp = _unix_time_micro_to_datetime(last_point.timestamp_micro)
-     metrics.append(
+ def _calculate_job_metrics(job_model: JobModel, points: Sequence[JobMetricsPoint]) -> JobMetrics:
+     timestamps: list[datetime] = []
+     cpu_usage_points: list[int] = []
+     memory_usage_points: list[int] = []
+     memory_working_set_points: list[int] = []
+     gpus_memory_usage_points: defaultdict[int, list[int]] = defaultdict(list)
+     gpus_util_points: defaultdict[int, list[int]] = defaultdict(list)
+
+     cpus_detected_num: Optional[int] = None
+     memory_total: Optional[int] = None
+     gpu_memory_total: Optional[int] = None
+     resources: Optional[Resources] = None
+     jrd = get_job_runtime_data(job_model)
+     if jrd is not None and jrd.offer is not None:
+         resources = jrd.offer.instance.resources
+     else:
+         jpd = get_job_provisioning_data(job_model)
+         if jpd is not None:
+             resources = jpd.instance_type.resources
+     if resources is not None:
+         cpus_detected_num = resources.cpus
+         memory_total = resources.memory_mib * 1024 * 1024
+         if len(resources.gpus) > 0:
+             gpu_memory_total = resources.gpus[0].memory_mib * 1024 * 1024
+
+     gpus_detected_num: Optional[int] = None
+     gpus_detected_num_mismatch: bool = False
+     for point, prev_point in zip(points, points[1:]):
+         timestamps.append(_unix_time_micro_to_datetime(point.timestamp_micro))
+         cpu_usage_points.append(_get_cpu_usage(point, prev_point))
+         memory_usage_points.append(point.memory_usage_bytes)
+         memory_working_set_points.append(point.memory_working_set_bytes)
+         gpus_memory_usage = json.loads(point.gpus_memory_usage_bytes)
+         gpus_util = json.loads(point.gpus_util_percent)
+         if gpus_detected_num is None:
+             gpus_detected_num = len(gpus_memory_usage)
+         if len(gpus_memory_usage) != gpus_detected_num or len(gpus_util) != gpus_detected_num:
+             gpus_detected_num_mismatch = True
+         if not gpus_detected_num_mismatch:
+             for i in range(gpus_detected_num):
+                 gpus_memory_usage_points[i].append(gpus_memory_usage[i])
+                 gpus_util_points[i].append(gpus_util[i])
+
+     metrics: list[Metric] = [
          Metric(
              name="cpu_usage_percent",
-             timestamps=[timestamp],
-             values=[_get_cpu_usage(last_point, prev_point)],
-         )
-     )
-     metrics.append(
+             timestamps=timestamps,
+             values=cpu_usage_points,
+         ),
          Metric(
              name="memory_usage_bytes",
-             timestamps=[timestamp],
-             values=[last_point.memory_usage_bytes],
-         )
-     )
-     metrics.append(
+             timestamps=timestamps,
+             values=memory_usage_points,
+         ),
          Metric(
              name="memory_working_set_bytes",
-             timestamps=[timestamp],
-             values=[last_point.memory_working_set_bytes],
-         )
-     )
-
-     gpus_memory_usage_bytes = json.loads(last_point.gpus_memory_usage_bytes)
-     gpus_util_percent = json.loads(last_point.gpus_util_percent)
-     gpus_detected_num = len(gpus_memory_usage_bytes)
-     metrics.append(
-         Metric(
-             name="gpus_detected_num",
-             timestamps=[timestamp],
-             values=[gpus_detected_num],
-         )
-     )
-     for i in range(gpus_detected_num):
+             timestamps=timestamps,
+             values=memory_working_set_points,
+         ),
+     ]
+     if cpus_detected_num is not None:
+         metrics.append(_make_constant_metric("cpus_detected_num", timestamps, cpus_detected_num))
+     if memory_total is not None:
+         metrics.append(_make_constant_metric("memory_total_bytes", timestamps, memory_total))
+     if gpus_detected_num_mismatch:
+         # If number of GPUs changed in the time window, skip GPU metrics altogether, otherwise
+         # results can be unpredictable (e.g, one GPU takes place of another, as they are
+         # identified by an array index only).
+         logger.warning("gpus_detected_num mismatch, skipping GPU metrics")
+     else:
          metrics.append(
-             Metric(
-                 name=f"gpu_memory_usage_bytes_gpu{i}",
-                 timestamps=[timestamp],
-                 values=[gpus_memory_usage_bytes[i]],
-             )
+             _make_constant_metric("gpus_detected_num", timestamps, get_or_error(gpus_detected_num))
          )
-         metrics.append(
-             Metric(
-                 name=f"gpu_util_percent_gpu{i}",
-                 timestamps=[timestamp],
-                 values=[gpus_util_percent[i]],
+         if gpu_memory_total is not None:
+             metrics.append(
+                 _make_constant_metric("gpu_memory_total_bytes", timestamps, gpu_memory_total)
+             )
+         for index, gpu_memory_usage_points in gpus_memory_usage_points.items():
+             metrics.append(
+                 Metric(
+                     name=f"gpu_memory_usage_bytes_gpu{index}",
+                     timestamps=timestamps,
+                     values=gpu_memory_usage_points,
+                 )
+             )
+         for index, gpu_util_points in gpus_util_points.items():
+             metrics.append(
+                 Metric(
+                     name=f"gpu_util_percent_gpu{index}",
+                     timestamps=timestamps,
+                     values=gpu_util_points,
+                 )
              )
-         )
      return JobMetrics(metrics=metrics)


+ def _make_constant_metric(name: str, timestamps: list[datetime], value: float) -> Metric:
+     return Metric(
+         name=name,
+         timestamps=timestamps,
+         values=[value] * len(timestamps),
+     )
+
+
  def _get_cpu_usage(last_point: JobMetricsPoint, prev_point: JobMetricsPoint) -> int:
      window = last_point.timestamp_micro - prev_point.timestamp_micro
+     if window == 0:
+         return 0
      return round((last_point.cpu_usage_micro - prev_point.cpu_usage_micro) / window * 100)


  def _unix_time_micro_to_datetime(unix_time_ms: int) -> datetime:
      return datetime.fromtimestamp(unix_time_ms / 1_000_000, tz=timezone.utc)
+
+
+ def _datetime_to_unix_time_micro(dt: datetime) -> int:
+     return int(dt.timestamp() * 1_000_000)
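
Note: the reworked get_job_metrics above accepts optional after, before, and limit filters and returns points ordered from the latest to the earliest. A minimal usage sketch, assuming an AsyncSession and a JobModel are already at hand; the wrapper function and control flow are illustrative, not part of the package:

    from datetime import datetime, timedelta, timezone

    from dstack._internal.server.services.metrics import get_job_metrics


    async def fetch_recent_and_page_back(session, job_model):
        # Points from the last hour (newest first)
        last_hour = await get_job_metrics(
            session=session,
            job_model=job_model,
            after=datetime.now(tz=timezone.utc) - timedelta(hours=1),
        )
        # Page back through history 100 points at a time, using the earliest
        # timestamp of the previous batch as the `before` cursor
        batch = await get_job_metrics(session=session, job_model=job_model, limit=100)
        while batch.metrics:
            earliest = batch.metrics[0].timestamps[-1]
            batch = await get_job_metrics(
                session=session, job_model=job_model, before=earliest, limit=100
            )
        return last_hour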

dstack/_internal/server/services/offers.py
@@ -7,7 +7,7 @@ from dstack._internal.core.backends import (
      BACKENDS_WITH_MULTINODE_SUPPORT,
      BACKENDS_WITH_RESERVATION_SUPPORT,
  )
- from dstack._internal.core.backends.base import Backend
+ from dstack._internal.core.backends.base.backend import Backend
  from dstack._internal.core.models.backends.base import BackendType
  from dstack._internal.core.models.instances import (
      InstanceOfferWithAvailability,

dstack/_internal/server/services/projects.py
@@ -7,19 +7,22 @@ from sqlalchemy import func as safunc
  from sqlalchemy.ext.asyncio import AsyncSession
  from sqlalchemy.orm import joinedload

- from dstack._internal.core.errors import ForbiddenError, ResourceExistsError, ServerClientError
- from dstack._internal.core.models.backends import BackendInfo
- from dstack._internal.core.models.backends.dstack import (
-     DstackBaseBackendConfigInfo,
-     DstackConfigInfo,
+ from dstack._internal.core.backends.configurators import get_configurator
+ from dstack._internal.core.backends.dstack.models import (
+     DstackBackendConfig,
+     DstackBaseBackendConfig,
  )
+ from dstack._internal.core.backends.models import BackendInfo
+ from dstack._internal.core.errors import ForbiddenError, ResourceExistsError, ServerClientError
  from dstack._internal.core.models.common import is_core_model_instance
  from dstack._internal.core.models.projects import Member, MemberPermissions, Project
  from dstack._internal.core.models.users import GlobalRole, ProjectRole
  from dstack._internal.server.models import MemberModel, ProjectModel, UserModel
  from dstack._internal.server.schemas.projects import MemberSetting
  from dstack._internal.server.services import users
- from dstack._internal.server.services.backends import get_configurator
+ from dstack._internal.server.services.backends import (
+     get_backend_config_from_backend_model,
+ )
  from dstack._internal.server.services.permissions import get_default_permissions
  from dstack._internal.server.settings import DEFAULT_PROJECT_NAME
  from dstack._internal.utils.common import get_current_datetime, run_async
@@ -176,12 +179,16 @@ async def set_project_members(
      # FIXME: potentially long write transaction
      # clear_project_members() issues DELETE without commit
      await clear_project_members(session=session, project=project)
-     usernames = [m.username for m in members]
-     res = await session.execute(select(UserModel).where(UserModel.name.in_(usernames)))
+     names = [m.username for m in members]
+     res = await session.execute(
+         select(UserModel).where((UserModel.name.in_(names)) | (UserModel.email.in_(names)))
+     )
      users = res.scalars().all()
+     # Create lookup maps for both username and email
      username_to_user = {user.name: user for user in users}
+     email_to_user = {user.email: user for user in users if user.email}
      for i, member in enumerate(members):
-         user_to_add = username_to_user.get(member.username)
+         user_to_add = username_to_user.get(member.username) or email_to_user.get(member.username)
          if user_to_add is None:
              continue
          await add_project_member(
@@ -376,20 +383,22 @@ def project_model_to_project(
                  b.type.value,
              )
              continue
-         config_info = configurator.get_config_info(model=b, include_creds=False)
-         if is_core_model_instance(config_info, DstackConfigInfo):
-             for backend_type in config_info.base_backends:
+         backend_config = get_backend_config_from_backend_model(
+             configurator, b, include_creds=False
+         )
+         if is_core_model_instance(backend_config, DstackBackendConfig):
+             for backend_type in backend_config.base_backends:
                  backends.append(
                      BackendInfo(
                          name=backend_type,
-                         config=DstackBaseBackendConfigInfo(type=backend_type),
+                         config=DstackBaseBackendConfig(type=backend_type),
                      )
                  )
          else:
              backends.append(
                  BackendInfo(
                      name=b.type,
-                     config=config_info,
+                     config=backend_config,
                  )
              )
      return Project(

dstack/_internal/server/services/prometheus.py
@@ -0,0 +1,245 @@
+ import itertools
+ from collections.abc import Generator, Iterable
+ from datetime import timezone
+
+ from prometheus_client import Metric
+ from prometheus_client.parser import text_string_to_metric_families
+ from prometheus_client.samples import Sample
+ from sqlalchemy import select
+ from sqlalchemy.ext.asyncio import AsyncSession
+ from sqlalchemy.orm import joinedload
+
+ from dstack._internal.core.models.instances import InstanceStatus
+ from dstack._internal.core.models.runs import JobStatus, RunSpec
+ from dstack._internal.server.models import (
+     InstanceModel,
+     JobModel,
+     JobPrometheusMetrics,
+     ProjectModel,
+     RunModel,
+ )
+ from dstack._internal.server.services.instances import get_instance_offer
+ from dstack._internal.server.services.jobs import get_job_provisioning_data, get_job_runtime_data
+ from dstack._internal.utils.common import get_current_datetime
+
+ _INSTANCE_DURATION = "dstack_instance_duration_seconds_total"
+ _INSTANCE_PRICE = "dstack_instance_price_dollars_per_hour"
+ _INSTANCE_GPU_COUNT = "dstack_instance_gpu_count"
+ _JOB_DURATION = "dstack_job_duration_seconds_total"
+ _JOB_PRICE = "dstack_job_price_dollars_per_hour"
+ _JOB_GPU_COUNT = "dstack_job_gpu_count"
+
+
+ async def get_metrics(session: AsyncSession) -> str:
+     metrics_iter = itertools.chain(
+         await get_instance_metrics(session),
+         await get_job_metrics(session),
+         await get_job_gpu_metrics(session),
+     )
+     return "\n".join(_render_metrics(metrics_iter)) + "\n"
+
+
+ async def get_instance_metrics(session: AsyncSession) -> Iterable[Metric]:
+     res = await session.execute(
+         select(InstanceModel)
+         .join(ProjectModel)
+         .where(
+             InstanceModel.deleted == False,
+             InstanceModel.status.in_(
+                 [
+                     InstanceStatus.PROVISIONING,
+                     InstanceStatus.IDLE,
+                     InstanceStatus.BUSY,
+                     InstanceStatus.TERMINATING,
+                 ]
+             ),
+         )
+         .order_by(ProjectModel.name, InstanceModel.name)
+         .options(
+             joinedload(InstanceModel.project),
+             joinedload(InstanceModel.fleet),
+         )
+     )
+     instances = res.unique().scalars().all()
+     metrics: dict[str, Metric] = {
+         _INSTANCE_DURATION: Metric(
+             name=_INSTANCE_DURATION,
+             documentation="Total seconds the instance is running",
+             typ="counter",
+         ),
+         _INSTANCE_PRICE: Metric(
+             name=_INSTANCE_PRICE, documentation="Instance price, USD/hour", typ="gauge"
+         ),
+         _INSTANCE_GPU_COUNT: Metric(
+             name=_INSTANCE_GPU_COUNT, documentation="Instance GPU count", typ="gauge"
+         ),
+     }
+     now = get_current_datetime()
+     for instance in instances:
+         fleet = instance.fleet
+         offer = get_instance_offer(instance)
+         gpu = ""
+         gpu_count = 0
+         if offer is not None and len(offer.instance.resources.gpus) > 0:
+             gpu = offer.instance.resources.gpus[0].name
+             gpu_count = len(offer.instance.resources.gpus)
+         labels: dict[str, str] = {
+             "dstack_project_name": instance.project.name,
+             "dstack_fleet_name": fleet.name if fleet is not None else "",
+             "dstack_fleet_id": str(fleet.id) if fleet is not None else "",
+             "dstack_instance_name": str(instance.name),
+             "dstack_instance_id": str(instance.id),
+             "dstack_instance_type": offer.instance.name if offer is not None else "",
+             "dstack_backend": instance.backend.value if instance.backend is not None else "",
+             "dstack_gpu": gpu,
+         }
+         duration = (now - instance.created_at.replace(tzinfo=timezone.utc)).total_seconds()
+         metrics[_INSTANCE_DURATION].add_sample(
+             name=_INSTANCE_DURATION, labels=labels, value=duration
+         )
+         metrics[_INSTANCE_PRICE].add_sample(
+             name=_INSTANCE_PRICE, labels=labels, value=instance.price or 0.0
+         )
+         metrics[_INSTANCE_GPU_COUNT].add_sample(
+             name=_INSTANCE_GPU_COUNT, labels=labels, value=gpu_count
+         )
+     return metrics.values()
+
+
+ async def get_job_metrics(session: AsyncSession) -> Iterable[Metric]:
+     res = await session.execute(
+         select(JobModel)
+         .join(ProjectModel)
+         .where(
+             JobModel.status.in_(
+                 [
+                     JobStatus.PROVISIONING,
+                     JobStatus.PULLING,
+                     JobStatus.RUNNING,
+                     JobStatus.TERMINATING,
+                 ]
+             )
+         )
+         .order_by(ProjectModel.name, JobModel.job_name)
+         .options(
+             joinedload(JobModel.project),
+             joinedload(JobModel.run).joinedload(RunModel.user),
+         )
+     )
+     jobs = res.scalars().all()
+     metrics: dict[str, Metric] = {
+         _JOB_DURATION: Metric(
+             name=_JOB_DURATION, documentation="Total seconds the job is running", typ="counter"
+         ),
+         _JOB_PRICE: Metric(
+             name=_JOB_PRICE, documentation="Job instance price, USD/hour", typ="gauge"
+         ),
+         _JOB_GPU_COUNT: Metric(name=_JOB_GPU_COUNT, documentation="Job GPU count", typ="gauge"),
+     }
+     now = get_current_datetime()
+     for job in jobs:
+         jpd = get_job_provisioning_data(job)
+         if jpd is None:
+             continue
+         jrd = get_job_runtime_data(job)
+         gpus = jpd.instance_type.resources.gpus
+         price = jpd.price
+         if jrd is not None and jrd.offer is not None:
+             gpus = jrd.offer.instance.resources.gpus
+             price = jrd.offer.price
+         run_spec = RunSpec.__response__.parse_raw(job.run.run_spec)
+         labels = _get_job_labels(job)
+         labels["dstack_run_type"] = run_spec.configuration.type
+         labels["dstack_backend"] = jpd.get_base_backend().value
+         labels["dstack_gpu"] = gpus[0].name if gpus else ""
+         duration = (now - job.submitted_at.replace(tzinfo=timezone.utc)).total_seconds()
+         metrics[_JOB_DURATION].add_sample(name=_JOB_DURATION, labels=labels, value=duration)
+         metrics[_JOB_PRICE].add_sample(name=_JOB_PRICE, labels=labels, value=price)
+         metrics[_JOB_GPU_COUNT].add_sample(name=_JOB_GPU_COUNT, labels=labels, value=len(gpus))
+     return metrics.values()
+
+
+ async def get_job_gpu_metrics(session: AsyncSession) -> Iterable[Metric]:
+     res = await session.execute(
+         select(JobPrometheusMetrics)
+         .join(JobModel)
+         .join(ProjectModel)
+         .where(JobModel.status.in_([JobStatus.RUNNING]))
+         .order_by(ProjectModel.name, JobModel.job_name)
+         .options(
+             joinedload(JobPrometheusMetrics.job).joinedload(JobModel.project),
+             joinedload(JobPrometheusMetrics.job)
+             .joinedload(JobModel.run)
+             .joinedload(RunModel.user),
+         )
+     )
+     metrics_models = res.scalars().all()
+     return _parse_and_enrich_job_gpu_metrics(metrics_models)
+
+
+ async def get_project_metrics(session: AsyncSession, project: ProjectModel) -> str:
+     res = await session.execute(
+         select(JobPrometheusMetrics)
+         .join(JobModel)
+         .where(
+             JobModel.project_id == project.id,
+             JobModel.status.in_([JobStatus.RUNNING]),
+         )
+         .order_by(JobModel.job_name)
+         .options(
+             joinedload(JobPrometheusMetrics.job).joinedload(JobModel.project),
+             joinedload(JobPrometheusMetrics.job)
+             .joinedload(JobModel.run)
+             .joinedload(RunModel.user),
+         )
+     )
+     metrics_models = res.scalars().all()
+     return "\n".join(_render_metrics(_parse_and_enrich_job_gpu_metrics(metrics_models))) + "\n"
+
+
+ def _parse_and_enrich_job_gpu_metrics(
+     metrics_models: Iterable[JobPrometheusMetrics],
+ ) -> Iterable[Metric]:
+     metrics: dict[str, Metric] = {}
+     for metrics_model in metrics_models:
+         for metric in text_string_to_metric_families(metrics_model.text):
+             samples = metric.samples
+             metric.samples = []
+             name = metric.name
+             metric = metrics.setdefault(name, metric)
+             for sample in samples:
+                 labels = sample.labels
+                 labels.update(_get_job_labels(metrics_model.job))
+                 # text_string_to_metric_families "fixes" counter names appending _total,
+                 # we rebuild Sample to revert this
+                 metric.samples.append(Sample(name, labels, *sample[2:]))
+     return metrics.values()
+
+
+ def _get_job_labels(job: JobModel) -> dict[str, str]:
+     return {
+         "dstack_project_name": job.project.name,
+         "dstack_user_name": job.run.user.name,
+         "dstack_run_name": job.run_name,
+         "dstack_run_id": str(job.run_id),
+         "dstack_job_name": job.job_name,
+         "dstack_job_id": str(job.id),
+         "dstack_job_num": str(job.job_num),
+         "dstack_replica_num": str(job.replica_num),
+     }
+
+
+ def _render_metrics(metrics: Iterable[Metric]) -> Generator[str, None, None]:
+     for metric in metrics:
+         if not metric.samples:
+             continue
+         yield f"# HELP {metric.name} {metric.documentation}"
+         yield f"# TYPE {metric.name} {metric.type}"
+         for sample in metric.samples:
+             parts: list[str] = [f"{sample.name}{{"]
+             parts.extend(",".join(f'{name}="{value}"' for name, value in sample.labels.items()))
+             parts.append(f"}} {float(sample.value)}")
+             # text_string_to_metric_families converts milliseconds to float seconds
+             if isinstance(sample.timestamp, float):
+                 parts.append(f" {int(sample.timestamp * 1000)}")
+             yield "".join(parts)
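
For orientation, _render_metrics above emits the standard Prometheus text exposition format. A hand-written example of what a single instance series could look like; all label values below are placeholders, not taken from the package or any real deployment:

    # HELP dstack_instance_duration_seconds_total Total seconds the instance is running
    # TYPE dstack_instance_duration_seconds_total counter
    dstack_instance_duration_seconds_total{dstack_project_name="main",dstack_fleet_name="my-fleet",dstack_fleet_id="<fleet-uuid>",dstack_instance_name="my-fleet-0",dstack_instance_id="<instance-uuid>",dstack_instance_type="g5.xlarge",dstack_backend="aws",dstack_gpu="A10G"} 3600.0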

dstack/_internal/server/services/runner/client.py
@@ -178,9 +178,6 @@ class ShimClient:
      # API v1 (a.k.a. Legacy API) — `/api/{submit,pull,stop}`
      _API_V2_MIN_SHIM_VERSION = (0, 18, 34)

-     # A surrogate task ID for API-v1-over-v2 emulation (`_v2_compat_*` methods)
-     _LEGACY_TASK_ID = "00000000-0000-0000-0000-000000000000"
-
      _shim_version: Optional["_Version"]
      _api_version: int
      _negotiated: bool = False
@@ -339,6 +336,20 @@
          resp = self._request("GET", "/api/pull", raise_for_status=True)
          return self._response(LegacyPullResponse, resp)

+     # Metrics
+
+     def get_task_metrics(self, task_id: "_TaskID") -> Optional[str]:
+         resp = self._request("GET", f"/metrics/tasks/{task_id}")
+         if resp.status_code == HTTPStatus.NOT_FOUND:
+             # Metrics exporter is not installed or old shim version
+             return None
+         if resp.status_code == HTTPStatus.BAD_GATEWAY:
+             # Metrics exporter is not available or returned an error
+             logger.info("failed to collect metrics for task %s: %s", task_id, resp.text)
+             return None
+         self._raise_for_status(resp)
+         return resp.text
+
      # Private methods used for public methods implementations

      def _request(
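
The new ShimClient.get_task_metrics above returns raw Prometheus exposition text, or None when the shim has no metrics exporter (404) or the exporter fails (502). A rough sketch of how a caller might consume the result, assuming shim_client is an already-configured ShimClient and task_id is a known task ID; the wrapper is illustrative, not the server's actual collection logic:

    from prometheus_client.parser import text_string_to_metric_families


    def collect_task_metrics(shim_client, task_id):
        # Returns the exposition text plus the metric family names it contains,
        # or None when the shim reports 404/502 (see get_task_metrics above)
        text = shim_client.get_task_metrics(task_id)
        if text is None:
            return None
        families = [m.name for m in text_string_to_metric_families(text)]
        return text, families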