dstack 0.18.43__py3-none-any.whl → 0.19.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (278) hide show
  1. dstack/_internal/cli/commands/gateway.py +15 -3
  2. dstack/_internal/cli/commands/logs.py +0 -22
  3. dstack/_internal/cli/commands/stats.py +8 -17
  4. dstack/_internal/cli/main.py +1 -5
  5. dstack/_internal/cli/services/configurators/fleet.py +4 -39
  6. dstack/_internal/cli/services/configurators/run.py +22 -20
  7. dstack/_internal/cli/services/profile.py +34 -83
  8. dstack/_internal/cli/utils/gateway.py +1 -1
  9. dstack/_internal/cli/utils/run.py +11 -0
  10. dstack/_internal/core/backends/__init__.py +56 -39
  11. dstack/_internal/core/backends/aws/__init__.py +0 -25
  12. dstack/_internal/core/backends/aws/auth.py +1 -10
  13. dstack/_internal/core/backends/aws/backend.py +26 -0
  14. dstack/_internal/core/backends/aws/compute.py +21 -45
  15. dstack/_internal/{server/services/backends/configurators/aws.py → core/backends/aws/configurator.py} +46 -85
  16. dstack/_internal/core/backends/aws/models.py +135 -0
  17. dstack/_internal/core/backends/aws/resources.py +1 -1
  18. dstack/_internal/core/backends/azure/__init__.py +0 -20
  19. dstack/_internal/core/backends/azure/auth.py +2 -11
  20. dstack/_internal/core/backends/azure/backend.py +21 -0
  21. dstack/_internal/core/backends/azure/compute.py +14 -28
  22. dstack/_internal/{server/services/backends/configurators/azure.py → core/backends/azure/configurator.py} +141 -210
  23. dstack/_internal/core/backends/azure/models.py +89 -0
  24. dstack/_internal/core/backends/base/__init__.py +0 -12
  25. dstack/_internal/core/backends/base/backend.py +18 -0
  26. dstack/_internal/core/backends/base/compute.py +153 -33
  27. dstack/_internal/core/backends/base/configurator.py +105 -0
  28. dstack/_internal/core/backends/base/models.py +14 -0
  29. dstack/_internal/core/backends/configurators.py +138 -0
  30. dstack/_internal/core/backends/cudo/__init__.py +0 -15
  31. dstack/_internal/core/backends/cudo/backend.py +16 -0
  32. dstack/_internal/core/backends/cudo/compute.py +8 -26
  33. dstack/_internal/core/backends/cudo/configurator.py +72 -0
  34. dstack/_internal/core/backends/cudo/models.py +37 -0
  35. dstack/_internal/core/backends/datacrunch/__init__.py +0 -15
  36. dstack/_internal/core/backends/datacrunch/backend.py +16 -0
  37. dstack/_internal/core/backends/datacrunch/compute.py +8 -25
  38. dstack/_internal/core/backends/datacrunch/configurator.py +66 -0
  39. dstack/_internal/core/backends/datacrunch/models.py +38 -0
  40. dstack/_internal/core/{models/backends/dstack.py → backends/dstack/models.py} +7 -7
  41. dstack/_internal/core/backends/gcp/__init__.py +0 -16
  42. dstack/_internal/core/backends/gcp/auth.py +2 -11
  43. dstack/_internal/core/backends/gcp/backend.py +17 -0
  44. dstack/_internal/core/backends/gcp/compute.py +14 -44
  45. dstack/_internal/{server/services/backends/configurators/gcp.py → core/backends/gcp/configurator.py} +46 -103
  46. dstack/_internal/core/backends/gcp/models.py +125 -0
  47. dstack/_internal/core/backends/kubernetes/__init__.py +0 -15
  48. dstack/_internal/core/backends/kubernetes/backend.py +16 -0
  49. dstack/_internal/core/backends/kubernetes/compute.py +16 -5
  50. dstack/_internal/core/backends/kubernetes/configurator.py +55 -0
  51. dstack/_internal/core/backends/kubernetes/models.py +72 -0
  52. dstack/_internal/core/backends/lambdalabs/__init__.py +0 -16
  53. dstack/_internal/core/backends/lambdalabs/backend.py +17 -0
  54. dstack/_internal/core/backends/lambdalabs/compute.py +7 -28
  55. dstack/_internal/core/backends/lambdalabs/configurator.py +82 -0
  56. dstack/_internal/core/backends/lambdalabs/models.py +37 -0
  57. dstack/_internal/core/backends/local/__init__.py +0 -13
  58. dstack/_internal/core/backends/local/backend.py +14 -0
  59. dstack/_internal/core/backends/local/compute.py +16 -2
  60. dstack/_internal/core/backends/models.py +128 -0
  61. dstack/_internal/core/backends/oci/__init__.py +0 -15
  62. dstack/_internal/core/backends/oci/auth.py +1 -5
  63. dstack/_internal/core/backends/oci/backend.py +16 -0
  64. dstack/_internal/core/backends/oci/compute.py +9 -23
  65. dstack/_internal/{server/services/backends/configurators/oci.py → core/backends/oci/configurator.py} +40 -85
  66. dstack/_internal/core/{models/backends/oci.py → backends/oci/models.py} +24 -25
  67. dstack/_internal/core/backends/oci/region.py +1 -1
  68. dstack/_internal/core/backends/runpod/__init__.py +0 -15
  69. dstack/_internal/core/backends/runpod/backend.py +16 -0
  70. dstack/_internal/core/backends/runpod/compute.py +28 -6
  71. dstack/_internal/core/backends/runpod/configurator.py +59 -0
  72. dstack/_internal/core/backends/runpod/models.py +54 -0
  73. dstack/_internal/core/backends/template/__init__.py +0 -0
  74. dstack/_internal/core/backends/tensordock/__init__.py +0 -15
  75. dstack/_internal/core/backends/tensordock/backend.py +16 -0
  76. dstack/_internal/core/backends/tensordock/compute.py +8 -27
  77. dstack/_internal/core/backends/tensordock/configurator.py +68 -0
  78. dstack/_internal/core/backends/tensordock/models.py +38 -0
  79. dstack/_internal/core/backends/vastai/__init__.py +0 -15
  80. dstack/_internal/core/backends/vastai/backend.py +16 -0
  81. dstack/_internal/core/backends/vastai/compute.py +2 -2
  82. dstack/_internal/core/backends/vastai/configurator.py +66 -0
  83. dstack/_internal/core/backends/vastai/models.py +37 -0
  84. dstack/_internal/core/backends/vultr/__init__.py +0 -15
  85. dstack/_internal/core/backends/vultr/backend.py +16 -0
  86. dstack/_internal/core/backends/vultr/compute.py +10 -24
  87. dstack/_internal/core/backends/vultr/configurator.py +64 -0
  88. dstack/_internal/core/backends/vultr/models.py +34 -0
  89. dstack/_internal/core/models/backends/__init__.py +0 -184
  90. dstack/_internal/core/models/backends/base.py +0 -19
  91. dstack/_internal/core/models/configurations.py +22 -16
  92. dstack/_internal/core/models/envs.py +4 -3
  93. dstack/_internal/core/models/fleets.py +17 -22
  94. dstack/_internal/core/models/gateways.py +3 -3
  95. dstack/_internal/core/models/instances.py +24 -0
  96. dstack/_internal/core/models/profiles.py +85 -45
  97. dstack/_internal/core/models/projects.py +1 -1
  98. dstack/_internal/core/models/repos/base.py +0 -5
  99. dstack/_internal/core/models/repos/local.py +3 -3
  100. dstack/_internal/core/models/repos/remote.py +26 -12
  101. dstack/_internal/core/models/repos/virtual.py +1 -1
  102. dstack/_internal/core/models/resources.py +45 -76
  103. dstack/_internal/core/models/runs.py +21 -19
  104. dstack/_internal/core/models/volumes.py +1 -3
  105. dstack/_internal/core/services/profiles.py +7 -16
  106. dstack/_internal/core/services/repos.py +0 -4
  107. dstack/_internal/server/app.py +11 -4
  108. dstack/_internal/server/background/__init__.py +10 -0
  109. dstack/_internal/server/background/tasks/process_gateways.py +4 -8
  110. dstack/_internal/server/background/tasks/process_instances.py +14 -9
  111. dstack/_internal/server/background/tasks/process_metrics.py +1 -1
  112. dstack/_internal/server/background/tasks/process_placement_groups.py +5 -1
  113. dstack/_internal/server/background/tasks/process_prometheus_metrics.py +135 -0
  114. dstack/_internal/server/background/tasks/process_running_jobs.py +80 -24
  115. dstack/_internal/server/background/tasks/process_runs.py +1 -0
  116. dstack/_internal/server/background/tasks/process_submitted_jobs.py +20 -38
  117. dstack/_internal/server/background/tasks/process_volumes.py +5 -2
  118. dstack/_internal/server/migrations/versions/60e444118b6d_add_jobprometheusmetrics.py +40 -0
  119. dstack/_internal/server/migrations/versions/7bc2586e8b9e_make_instancemodel_pool_id_optional.py +36 -0
  120. dstack/_internal/server/migrations/versions/98d1b92988bc_add_jobterminationreason_terminated_due_.py +140 -0
  121. dstack/_internal/server/migrations/versions/bc8ca4a505c6_store_backendtype_as_string.py +171 -0
  122. dstack/_internal/server/models.py +59 -9
  123. dstack/_internal/server/routers/backends.py +14 -23
  124. dstack/_internal/server/routers/instances.py +3 -4
  125. dstack/_internal/server/routers/metrics.py +31 -10
  126. dstack/_internal/server/routers/prometheus.py +36 -0
  127. dstack/_internal/server/routers/repos.py +1 -2
  128. dstack/_internal/server/routers/runs.py +13 -59
  129. dstack/_internal/server/schemas/gateways.py +14 -23
  130. dstack/_internal/server/schemas/projects.py +7 -2
  131. dstack/_internal/server/schemas/repos.py +2 -38
  132. dstack/_internal/server/schemas/runner.py +1 -0
  133. dstack/_internal/server/schemas/runs.py +1 -24
  134. dstack/_internal/server/security/permissions.py +1 -1
  135. dstack/_internal/server/services/backends/__init__.py +85 -158
  136. dstack/_internal/server/services/config.py +53 -567
  137. dstack/_internal/server/services/fleets.py +9 -103
  138. dstack/_internal/server/services/gateways/__init__.py +13 -4
  139. dstack/_internal/server/services/{pools.py → instances.py} +22 -329
  140. dstack/_internal/server/services/jobs/__init__.py +9 -6
  141. dstack/_internal/server/services/jobs/configurators/base.py +25 -1
  142. dstack/_internal/server/services/jobs/configurators/dev.py +9 -1
  143. dstack/_internal/server/services/jobs/configurators/extensions/cursor.py +42 -0
  144. dstack/_internal/server/services/metrics.py +131 -72
  145. dstack/_internal/server/services/offers.py +1 -1
  146. dstack/_internal/server/services/projects.py +23 -14
  147. dstack/_internal/server/services/prometheus.py +245 -0
  148. dstack/_internal/server/services/runner/client.py +14 -3
  149. dstack/_internal/server/services/runs.py +67 -31
  150. dstack/_internal/server/services/volumes.py +9 -4
  151. dstack/_internal/server/settings.py +3 -0
  152. dstack/_internal/server/statics/index.html +1 -1
  153. dstack/_internal/server/statics/{main-fe8fd9db55df8d10e648.js → main-4a0fe83e84574654e397.js} +76 -19
  154. dstack/_internal/server/statics/{main-fe8fd9db55df8d10e648.js.map → main-4a0fe83e84574654e397.js.map} +1 -1
  155. dstack/_internal/server/statics/{main-7510e71dfa9749a4e70e.css → main-da9f8c06a69c20dac23e.css} +1 -1
  156. dstack/_internal/server/statics/static/media/entraID.d65d1f3e9486a8e56d24fc07b3230885.svg +9 -0
  157. dstack/_internal/server/testing/common.py +75 -32
  158. dstack/_internal/utils/json_schema.py +6 -0
  159. dstack/_internal/utils/ssh.py +2 -1
  160. dstack/api/__init__.py +4 -0
  161. dstack/api/_public/__init__.py +16 -20
  162. dstack/api/_public/backends.py +1 -1
  163. dstack/api/_public/repos.py +36 -36
  164. dstack/api/_public/runs.py +170 -83
  165. dstack/api/server/__init__.py +11 -13
  166. dstack/api/server/_backends.py +12 -16
  167. dstack/api/server/_fleets.py +15 -55
  168. dstack/api/server/_gateways.py +3 -14
  169. dstack/api/server/_repos.py +1 -4
  170. dstack/api/server/_runs.py +21 -96
  171. dstack/api/server/_volumes.py +10 -5
  172. dstack/api/utils.py +3 -0
  173. dstack/version.py +1 -1
  174. {dstack-0.18.43.dist-info → dstack-0.19.0.dist-info}/METADATA +10 -1
  175. {dstack-0.18.43.dist-info → dstack-0.19.0.dist-info}/RECORD +229 -206
  176. tests/_internal/cli/services/configurators/test_profile.py +6 -6
  177. tests/_internal/core/backends/aws/test_configurator.py +35 -0
  178. tests/_internal/core/backends/aws/test_resources.py +1 -1
  179. tests/_internal/core/backends/azure/test_configurator.py +61 -0
  180. tests/_internal/core/backends/cudo/__init__.py +0 -0
  181. tests/_internal/core/backends/cudo/test_configurator.py +37 -0
  182. tests/_internal/core/backends/datacrunch/__init__.py +0 -0
  183. tests/_internal/core/backends/datacrunch/test_configurator.py +17 -0
  184. tests/_internal/core/backends/gcp/test_configurator.py +42 -0
  185. tests/_internal/core/backends/kubernetes/test_configurator.py +43 -0
  186. tests/_internal/core/backends/lambdalabs/__init__.py +0 -0
  187. tests/_internal/core/backends/lambdalabs/test_configurator.py +38 -0
  188. tests/_internal/core/backends/oci/test_configurator.py +55 -0
  189. tests/_internal/core/backends/runpod/__init__.py +0 -0
  190. tests/_internal/core/backends/runpod/test_configurator.py +33 -0
  191. tests/_internal/core/backends/tensordock/__init__.py +0 -0
  192. tests/_internal/core/backends/tensordock/test_configurator.py +38 -0
  193. tests/_internal/core/backends/vastai/__init__.py +0 -0
  194. tests/_internal/core/backends/vastai/test_configurator.py +33 -0
  195. tests/_internal/core/backends/vultr/__init__.py +0 -0
  196. tests/_internal/core/backends/vultr/test_configurator.py +33 -0
  197. tests/_internal/server/background/tasks/test_process_gateways.py +4 -0
  198. tests/_internal/server/background/tasks/test_process_instances.py +49 -48
  199. tests/_internal/server/background/tasks/test_process_metrics.py +0 -3
  200. tests/_internal/server/background/tasks/test_process_placement_groups.py +2 -0
  201. tests/_internal/server/background/tasks/test_process_prometheus_metrics.py +186 -0
  202. tests/_internal/server/background/tasks/test_process_running_jobs.py +123 -19
  203. tests/_internal/server/background/tasks/test_process_runs.py +8 -22
  204. tests/_internal/server/background/tasks/test_process_submitted_jobs.py +3 -40
  205. tests/_internal/server/background/tasks/test_process_submitted_volumes.py +2 -0
  206. tests/_internal/server/background/tasks/test_process_terminating_jobs.py +10 -15
  207. tests/_internal/server/routers/test_backends.py +6 -764
  208. tests/_internal/server/routers/test_fleets.py +2 -26
  209. tests/_internal/server/routers/test_gateways.py +27 -3
  210. tests/_internal/server/routers/test_instances.py +0 -10
  211. tests/_internal/server/routers/test_metrics.py +42 -0
  212. tests/_internal/server/routers/test_projects.py +56 -0
  213. tests/_internal/server/routers/test_prometheus.py +333 -0
  214. tests/_internal/server/routers/test_repos.py +0 -15
  215. tests/_internal/server/routers/test_runs.py +83 -275
  216. tests/_internal/server/routers/test_volumes.py +2 -3
  217. tests/_internal/server/services/backends/__init__.py +0 -0
  218. tests/_internal/server/services/jobs/configurators/test_task.py +35 -0
  219. tests/_internal/server/services/test_config.py +7 -4
  220. tests/_internal/server/services/test_fleets.py +1 -4
  221. tests/_internal/server/services/{test_pools.py → test_instances.py} +11 -49
  222. tests/_internal/server/services/test_metrics.py +167 -0
  223. tests/_internal/server/services/test_repos.py +1 -14
  224. tests/_internal/server/services/test_runs.py +0 -4
  225. dstack/_internal/cli/commands/pool.py +0 -581
  226. dstack/_internal/cli/commands/run.py +0 -75
  227. dstack/_internal/core/backends/aws/config.py +0 -18
  228. dstack/_internal/core/backends/azure/config.py +0 -12
  229. dstack/_internal/core/backends/base/config.py +0 -5
  230. dstack/_internal/core/backends/cudo/config.py +0 -9
  231. dstack/_internal/core/backends/datacrunch/config.py +0 -9
  232. dstack/_internal/core/backends/gcp/config.py +0 -22
  233. dstack/_internal/core/backends/kubernetes/config.py +0 -6
  234. dstack/_internal/core/backends/lambdalabs/config.py +0 -9
  235. dstack/_internal/core/backends/nebius/__init__.py +0 -15
  236. dstack/_internal/core/backends/nebius/api_client.py +0 -319
  237. dstack/_internal/core/backends/nebius/compute.py +0 -220
  238. dstack/_internal/core/backends/nebius/config.py +0 -6
  239. dstack/_internal/core/backends/nebius/types.py +0 -37
  240. dstack/_internal/core/backends/oci/config.py +0 -6
  241. dstack/_internal/core/backends/runpod/config.py +0 -9
  242. dstack/_internal/core/backends/tensordock/config.py +0 -9
  243. dstack/_internal/core/backends/vastai/config.py +0 -6
  244. dstack/_internal/core/backends/vultr/config.py +0 -9
  245. dstack/_internal/core/models/backends/aws.py +0 -86
  246. dstack/_internal/core/models/backends/azure.py +0 -68
  247. dstack/_internal/core/models/backends/cudo.py +0 -43
  248. dstack/_internal/core/models/backends/datacrunch.py +0 -44
  249. dstack/_internal/core/models/backends/gcp.py +0 -67
  250. dstack/_internal/core/models/backends/kubernetes.py +0 -40
  251. dstack/_internal/core/models/backends/lambdalabs.py +0 -43
  252. dstack/_internal/core/models/backends/nebius.py +0 -54
  253. dstack/_internal/core/models/backends/runpod.py +0 -40
  254. dstack/_internal/core/models/backends/tensordock.py +0 -44
  255. dstack/_internal/core/models/backends/vastai.py +0 -43
  256. dstack/_internal/core/models/backends/vultr.py +0 -40
  257. dstack/_internal/core/models/pools.py +0 -43
  258. dstack/_internal/server/routers/pools.py +0 -142
  259. dstack/_internal/server/schemas/pools.py +0 -38
  260. dstack/_internal/server/services/backends/configurators/base.py +0 -72
  261. dstack/_internal/server/services/backends/configurators/cudo.py +0 -87
  262. dstack/_internal/server/services/backends/configurators/datacrunch.py +0 -79
  263. dstack/_internal/server/services/backends/configurators/kubernetes.py +0 -63
  264. dstack/_internal/server/services/backends/configurators/lambdalabs.py +0 -98
  265. dstack/_internal/server/services/backends/configurators/nebius.py +0 -85
  266. dstack/_internal/server/services/backends/configurators/runpod.py +0 -97
  267. dstack/_internal/server/services/backends/configurators/tensordock.py +0 -82
  268. dstack/_internal/server/services/backends/configurators/vastai.py +0 -80
  269. dstack/_internal/server/services/backends/configurators/vultr.py +0 -80
  270. dstack/api/_public/pools.py +0 -41
  271. dstack/api/_public/resources.py +0 -105
  272. dstack/api/server/_pools.py +0 -63
  273. tests/_internal/server/routers/test_pools.py +0 -612
  274. /dstack/_internal/{server/services/backends/configurators → core/backends/dstack}/__init__.py +0 -0
  275. {dstack-0.18.43.dist-info → dstack-0.19.0.dist-info}/LICENSE.md +0 -0
  276. {dstack-0.18.43.dist-info → dstack-0.19.0.dist-info}/WHEEL +0 -0
  277. {dstack-0.18.43.dist-info → dstack-0.19.0.dist-info}/entry_points.txt +0 -0
  278. {dstack-0.18.43.dist-info → dstack-0.19.0.dist-info}/top_level.txt +0 -0
@@ -86,9 +86,7 @@ class VolumeAttachment(CoreModel):
86
86
  class Volume(CoreModel):
87
87
  id: uuid.UUID
88
88
  name: str
89
- # Default user to "" for client backward compatibility (old 0.18 servers).
90
- # TODO: Remove in 0.19
91
- user: str = ""
89
+ user: str
92
90
  project_name: str
93
91
  configuration: VolumeConfiguration
94
92
  external: bool
@@ -12,18 +12,7 @@ from dstack._internal.core.models.runs import Retry
12
12
  def get_retry(profile: Profile) -> Optional[Retry]:
13
13
  profile_retry = profile.retry
14
14
  if profile_retry is None:
15
- # Handle retry_policy before retry was introduced
16
- # TODO: Remove once retry_policy no longer supported
17
- profile_retry_policy = profile.retry_policy
18
- if profile_retry_policy is None:
19
- return None
20
- if not profile_retry_policy.retry:
21
- return None
22
- duration = profile_retry_policy.duration or DEFAULT_RETRY_DURATION
23
- return Retry(
24
- on_events=[RetryEvent.NO_CAPACITY, RetryEvent.INTERRUPTION, RetryEvent.ERROR],
25
- duration=duration,
26
- )
15
+ return None
27
16
  if isinstance(profile_retry, bool):
28
17
  if profile_retry:
29
18
  return Retry(
@@ -32,6 +21,12 @@ def get_retry(profile: Profile) -> Optional[Retry]:
32
21
  )
33
22
  return None
34
23
  profile_retry = profile_retry.copy()
24
+ if profile_retry.on_events is None:
25
+ profile_retry.on_events = [
26
+ RetryEvent.NO_CAPACITY,
27
+ RetryEvent.INTERRUPTION,
28
+ RetryEvent.ERROR,
29
+ ]
35
30
  if profile_retry.duration is None:
36
31
  profile_retry.duration = DEFAULT_RETRY_DURATION
37
32
  return Retry.parse_obj(profile_retry)
@@ -42,10 +37,6 @@ def get_termination(
42
37
  ) -> Tuple[TerminationPolicy, int]:
43
38
  termination_policy = TerminationPolicy.DESTROY_AFTER_IDLE
44
39
  termination_idle_time = default_termination_idle_time
45
- if profile.termination_policy is not None:
46
- termination_policy = profile.termination_policy
47
- if profile.termination_idle_time is not None:
48
- termination_idle_time = profile.termination_idle_time
49
40
  if profile.idle_duration is not None and int(profile.idle_duration) < 0:
50
41
  termination_policy = TerminationPolicy.DONT_DESTROY
51
42
  elif profile.idle_duration is not None:
@@ -10,7 +10,6 @@ from git.exc import GitCommandError
10
10
  from dstack._internal.core.errors import DstackError
11
11
  from dstack._internal.core.models.config import RepoConfig
12
12
  from dstack._internal.core.models.repos import LocalRepo, RemoteRepo, RemoteRepoCreds
13
- from dstack._internal.core.models.repos.base import RepoProtocol
14
13
  from dstack._internal.core.models.repos.remote import GitRepoURL
15
14
  from dstack._internal.utils.logging import get_logger
16
15
  from dstack._internal.utils.path import PathLike
@@ -41,7 +40,6 @@ def get_local_repo_credentials(
41
40
  r = requests.get(f"{url.as_https()}/info/refs?service=git-upload-pack", timeout=10)
42
41
  if r.status_code == 200:
43
42
  return RemoteRepoCreds(
44
- protocol=RepoProtocol.HTTPS,
45
43
  clone_url=url.as_https(),
46
44
  private_key=None,
47
45
  oauth_token=None,
@@ -93,7 +91,6 @@ def check_remote_repo_credentials_https(url: GitRepoURL, oauth_token: str) -> Re
93
91
  f"Can't access `{url.as_https()}` using the `{masked}` token"
94
92
  )
95
93
  return RemoteRepoCreds(
96
- protocol=RepoProtocol.HTTPS,
97
94
  clone_url=url.as_https(),
98
95
  oauth_token=oauth_token,
99
96
  private_key=None,
@@ -123,7 +120,6 @@ def check_remote_repo_credentials_ssh(url: GitRepoURL, identity_file: PathLike)
123
120
  )
124
121
 
125
122
  return RemoteRepoCreds(
126
- protocol=RepoProtocol.SSH,
127
123
  clone_url=url.as_ssh(),
128
124
  private_key=private_key,
129
125
  oauth_token=None,
@@ -27,8 +27,8 @@ from dstack._internal.server.routers import (
27
27
  instances,
28
28
  logs,
29
29
  metrics,
30
- pools,
31
30
  projects,
31
+ prometheus,
32
32
  repos,
33
33
  runs,
34
34
  secrets,
@@ -183,8 +183,7 @@ def register_routes(app: FastAPI, ui: bool = True):
183
183
  app.include_router(volumes.project_router)
184
184
  app.include_router(service_proxy.router, prefix="/proxy/services", tags=["service-proxy"])
185
185
  app.include_router(model_proxy.router, prefix="/proxy/models", tags=["model-proxy"])
186
- app.include_router(pools.root_router)
187
- app.include_router(pools.router)
186
+ app.include_router(prometheus.router)
188
187
 
189
188
  @app.exception_handler(ForbiddenError)
190
189
  async def forbidden_error_handler(request: Request, exc: ForbiddenError):
@@ -252,7 +251,11 @@ def register_routes(app: FastAPI, ui: bool = True):
252
251
 
253
252
  @app.exception_handler(404)
254
253
  async def custom_http_exception_handler(request, exc):
255
- if request.url.path.startswith("/api") or _is_proxy_request(request):
254
+ if (
255
+ request.url.path.startswith("/api")
256
+ or _is_proxy_request(request)
257
+ or _is_prometheus_request(request)
258
+ ):
256
259
  return JSONResponse(
257
260
  {"detail": exc.detail},
258
261
  status_code=status.HTTP_404_NOT_FOUND,
@@ -283,6 +286,10 @@ def _is_proxy_request(request: Request) -> bool:
283
286
  ) and referrer.path.startswith("/proxy")
284
287
 
285
288
 
289
+ def _is_prometheus_request(request: Request) -> bool:
290
+ return request.url.path.startswith("/metrics")
291
+
292
+
286
293
  def _print_dstack_logo():
287
294
  console.print(
288
295
  """[purple]╱╱╭╮╱╱╭╮╱╱╱╱╱╱╭╮
@@ -1,6 +1,7 @@
1
1
  from apscheduler.schedulers.asyncio import AsyncIOScheduler
2
2
  from apscheduler.triggers.interval import IntervalTrigger
3
3
 
4
+ from dstack._internal.server import settings
4
5
  from dstack._internal.server.background.tasks.process_fleets import process_fleets
5
6
  from dstack._internal.server.background.tasks.process_gateways import (
6
7
  process_gateways_connections,
@@ -16,6 +17,10 @@ from dstack._internal.server.background.tasks.process_metrics import (
16
17
  from dstack._internal.server.background.tasks.process_placement_groups import (
17
18
  process_placement_groups,
18
19
  )
20
+ from dstack._internal.server.background.tasks.process_prometheus_metrics import (
21
+ collect_prometheus_metrics,
22
+ delete_prometheus_metrics,
23
+ )
19
24
  from dstack._internal.server.background.tasks.process_running_jobs import process_running_jobs
20
25
  from dstack._internal.server.background.tasks.process_runs import process_runs
21
26
  from dstack._internal.server.background.tasks.process_submitted_jobs import process_submitted_jobs
@@ -43,6 +48,11 @@ def start_background_tasks() -> AsyncIOScheduler:
43
48
  # * 150 active instances with up to 2 minutes processing latency
44
49
  _scheduler.add_job(collect_metrics, IntervalTrigger(seconds=10), max_instances=1)
45
50
  _scheduler.add_job(delete_metrics, IntervalTrigger(minutes=5), max_instances=1)
51
+ if settings.ENABLE_PROMETHEUS_METRICS:
52
+ _scheduler.add_job(
53
+ collect_prometheus_metrics, IntervalTrigger(seconds=10), max_instances=1
54
+ )
55
+ _scheduler.add_job(delete_prometheus_metrics, IntervalTrigger(minutes=5), max_instances=1)
46
56
  # process_submitted_jobs and process_instances max processing rate is 75 jobs(instances) per minute.
47
57
  _scheduler.add_job(
48
58
  process_submitted_jobs,
@@ -54,17 +54,13 @@ async def process_submitted_gateways():
54
54
 
55
55
 
56
56
  async def _remove_inactive_connections():
57
- connections = await gateway_connections_pool.all()
58
- ip_addresses = [c.ip_address for c in connections]
59
57
  async with get_session_ctx() as session:
60
58
  res = await session.execute(
61
- select(GatewayComputeModel).where(
62
- GatewayComputeModel.ip_address.in_(ip_addresses),
63
- GatewayComputeModel.active == False,
64
- )
59
+ select(GatewayComputeModel.ip_address).where(GatewayComputeModel.active == True)
65
60
  )
66
- removed_connections = res.scalars().all()
67
- for conn in removed_connections:
61
+ active_connection_ips = set(res.scalars().all())
62
+ for conn in await gateway_connections_pool.all():
63
+ if conn.ip_address not in active_connection_ips:
68
64
  await gateway_connections_pool.remove(conn.ip_address)
69
65
 
70
66
 
@@ -20,6 +20,8 @@ from dstack._internal.core.backends.base.compute import (
20
20
  DSTACK_RUNNER_BINARY_PATH,
21
21
  DSTACK_SHIM_BINARY_PATH,
22
22
  DSTACK_WORKING_DIR,
23
+ ComputeWithCreateInstanceSupport,
24
+ ComputeWithPlacementGroupSupport,
23
25
  get_shim_env,
24
26
  get_shim_pre_start_commands,
25
27
  )
@@ -76,19 +78,19 @@ from dstack._internal.server.services.fleets import (
76
78
  fleet_model_to_fleet,
77
79
  get_create_instance_offers,
78
80
  )
79
- from dstack._internal.server.services.locking import get_locker
80
- from dstack._internal.server.services.offers import is_divisible_into_blocks
81
- from dstack._internal.server.services.placement import (
82
- get_fleet_placement_groups,
83
- placement_group_model_to_placement_group,
84
- )
85
- from dstack._internal.server.services.pools import (
81
+ from dstack._internal.server.services.instances import (
86
82
  get_instance_configuration,
87
83
  get_instance_profile,
88
84
  get_instance_provisioning_data,
89
85
  get_instance_requirements,
90
86
  get_instance_ssh_private_keys,
91
87
  )
88
+ from dstack._internal.server.services.locking import get_locker
89
+ from dstack._internal.server.services.offers import is_divisible_into_blocks
90
+ from dstack._internal.server.services.placement import (
91
+ get_fleet_placement_groups,
92
+ placement_group_model_to_placement_group,
93
+ )
92
94
  from dstack._internal.server.services.runner import client as runner_client
93
95
  from dstack._internal.server.services.runner.client import HealthStatus
94
96
  from dstack._internal.server.services.runner.ssh import runner_ssh_tunnel
@@ -530,12 +532,15 @@ async def _create_instance(session: AsyncSession, instance: InstanceModel) -> No
530
532
  for backend, instance_offer in offers:
531
533
  if instance_offer.backend not in BACKENDS_WITH_CREATE_INSTANCE_SUPPORT:
532
534
  continue
535
+ compute = backend.compute()
536
+ assert isinstance(compute, ComputeWithCreateInstanceSupport)
533
537
  instance_offer = _get_instance_offer_for_instance(instance_offer, instance)
534
538
  if (
535
539
  instance_offer.backend in BACKENDS_WITH_PLACEMENT_GROUPS_SUPPORT
536
540
  and instance.fleet
537
541
  and instance_configuration.placement_group_name
538
542
  ):
543
+ assert isinstance(compute, ComputeWithPlacementGroupSupport)
539
544
  placement_group_model = _create_placement_group_if_does_not_exist(
540
545
  session=session,
541
546
  fleet_model=instance.fleet,
@@ -546,7 +551,7 @@ async def _create_instance(session: AsyncSession, instance: InstanceModel) -> No
546
551
  )
547
552
  if placement_group_model is not None:
548
553
  placement_group = placement_group_model_to_placement_group(placement_group_model)
549
- pgpd = await run_async(backend.compute().create_placement_group, placement_group)
554
+ pgpd = await run_async(compute.create_placement_group, placement_group)
550
555
  placement_group_model.provisioning_data = pgpd.json()
551
556
  session.add(placement_group_model)
552
557
  placement_groups.append(placement_group)
@@ -559,7 +564,7 @@ async def _create_instance(session: AsyncSession, instance: InstanceModel) -> No
559
564
  )
560
565
  try:
561
566
  job_provisioning_data = await run_async(
562
- backend.compute().create_instance,
567
+ compute.create_instance,
563
568
  instance_offer,
564
569
  instance_configuration,
565
570
  )
@@ -11,8 +11,8 @@ from dstack._internal.server import settings
11
11
  from dstack._internal.server.db import get_session_ctx
12
12
  from dstack._internal.server.models import InstanceModel, JobMetricsPoint, JobModel
13
13
  from dstack._internal.server.schemas.runner import MetricsResponse
14
+ from dstack._internal.server.services.instances import get_instance_ssh_private_keys
14
15
  from dstack._internal.server.services.jobs import get_job_provisioning_data, get_job_runtime_data
15
- from dstack._internal.server.services.pools import get_instance_ssh_private_keys
16
16
  from dstack._internal.server.services.runner import client
17
17
  from dstack._internal.server.services.runner.ssh import runner_ssh_tunnel
18
18
  from dstack._internal.utils.common import batched, get_current_datetime, get_or_error, run_async
@@ -5,6 +5,7 @@ from sqlalchemy import select
5
5
  from sqlalchemy.ext.asyncio import AsyncSession
6
6
  from sqlalchemy.orm import joinedload
7
7
 
8
+ from dstack._internal.core.backends.base.compute import ComputeWithPlacementGroupSupport
8
9
  from dstack._internal.core.errors import PlacementGroupInUseError
9
10
  from dstack._internal.server.db import get_session_ctx
10
11
  from dstack._internal.server.models import PlacementGroupModel, ProjectModel
@@ -28,6 +29,7 @@ async def process_placement_groups():
28
29
  PlacementGroupModel.deleted == False,
29
30
  PlacementGroupModel.id.not_in(lockset),
30
31
  )
32
+ .order_by(PlacementGroupModel.id) # take locks in order
31
33
  .with_for_update(skip_locked=True)
32
34
  )
33
35
  placement_group_models = res.scalars().all()
@@ -80,8 +82,10 @@ async def _delete_placement_group(placement_group_model: PlacementGroupModel):
80
82
  "Failed to delete placement group %s. Backend not available.", placement_group.name
81
83
  )
82
84
  return
85
+ compute = backend.compute()
86
+ assert isinstance(compute, ComputeWithPlacementGroupSupport)
83
87
  try:
84
- await run_async(backend.compute().delete_placement_group, placement_group)
88
+ await run_async(compute.delete_placement_group, placement_group)
85
89
  except PlacementGroupInUseError:
86
90
  logger.info(
87
91
  "Placement group %s is still in use. Skipping deletion for now.", placement_group.name
@@ -0,0 +1,135 @@
1
+ import uuid
2
+ from datetime import datetime, timedelta
3
+ from typing import Optional
4
+
5
+ import sqlalchemy.exc
6
+ from sqlalchemy import delete, or_, select, update
7
+ from sqlalchemy.orm import joinedload
8
+
9
+ from dstack._internal.core.consts import DSTACK_SHIM_HTTP_PORT
10
+ from dstack._internal.core.models.runs import JobStatus
11
+ from dstack._internal.server.db import get_session_ctx
12
+ from dstack._internal.server.models import InstanceModel, JobModel, JobPrometheusMetrics
13
+ from dstack._internal.server.services.instances import get_instance_ssh_private_keys
14
+ from dstack._internal.server.services.jobs import get_job_provisioning_data, get_job_runtime_data
15
+ from dstack._internal.server.services.runner import client
16
+ from dstack._internal.server.services.runner.ssh import runner_ssh_tunnel
17
+ from dstack._internal.server.utils.common import gather_map_async
18
+ from dstack._internal.utils.common import batched, get_current_datetime, get_or_error, run_async
19
+ from dstack._internal.utils.logging import get_logger
20
+
21
+ logger = get_logger(__name__)
22
+
23
+
24
+ MAX_JOBS_FETCHED = 100
25
+ BATCH_SIZE = 10
26
+ MIN_COLLECT_INTERVAL_SECONDS = 9
27
+ # 10 minutes should be more than enough to scrape metrics, and, in any case,
28
+ # 10 minutes old metrics has little to no value
29
+ METRICS_TTL_SECONDS = 600
30
+
31
+
32
async def collect_prometheus_metrics():
    """Scrape Prometheus metrics for running jobs whose stored metrics are
    missing or older than MIN_COLLECT_INTERVAL_SECONDS, in batches.

    At most MAX_JOBS_FETCHED jobs are considered per invocation, least
    recently processed first.
    """
    collection_time = get_current_datetime()
    stale_before = collection_time - timedelta(seconds=MIN_COLLECT_INTERVAL_SECONDS)
    stmt = (
        select(JobModel)
        .join(JobPrometheusMetrics, isouter=True)
        .where(
            JobModel.status.in_([JobStatus.RUNNING]),
            or_(
                # No metrics row yet, or the row is stale.
                JobPrometheusMetrics.job_id.is_(None),
                JobPrometheusMetrics.collected_at < stale_before,
            ),
        )
        .options(joinedload(JobModel.instance).joinedload(InstanceModel.project))
        .order_by(JobModel.last_processed_at.asc())
        .limit(MAX_JOBS_FETCHED)
    )
    async with get_session_ctx() as session:
        res = await session.execute(stmt)
        job_models = res.unique().scalars().all()
    # Session is closed here; collection opens its own session per batch.
    for batch in batched(job_models, BATCH_SIZE):
        await _collect_jobs_metrics(batch, collection_time)
53
+
54
+
55
async def delete_prometheus_metrics():
    """Delete stored job metrics older than METRICS_TTL_SECONDS."""
    expired_before = get_current_datetime() - timedelta(seconds=METRICS_TTL_SECONDS)
    stmt = delete(JobPrometheusMetrics).where(
        JobPrometheusMetrics.collected_at < expired_before
    )
    async with get_session_ctx() as session:
        await session.execute(stmt)
        await session.commit()
63
+
64
+
65
async def _collect_jobs_metrics(job_models: list[JobModel], collected_at: datetime):
    """Pull metrics for a batch of jobs concurrently and upsert them.

    Each successfully scraped metrics text is stored in JobPrometheusMetrics,
    one row per job, stamped with `collected_at`. Failures for individual jobs
    are logged and skipped; they never abort the batch.
    """
    results = await gather_map_async(job_models, _collect_job_metrics, return_exceptions=True)
    async with get_session_ctx() as session:
        for job_model, result in results:
            if result is None:
                # Nothing collected for this job (see _collect_job_metrics).
                continue
            if isinstance(result, BaseException):
                logger.error(
                    "Failed to collect job %s Prometheus metrics: %r", job_model.job_name, result
                )
                continue
            # Upsert: try UPDATE first; RETURNING tells us whether a row existed.
            res = await session.execute(
                update(JobPrometheusMetrics)
                .where(JobPrometheusMetrics.job_id == job_model.id)
                .values(
                    collected_at=collected_at,
                    text=result,
                )
                .returning(JobPrometheusMetrics)
            )
            metrics = res.scalar()
            if metrics is None:
                # No existing row — INSERT inside a SAVEPOINT so a concurrent
                # insert by another replica doesn't poison the outer transaction.
                metrics = JobPrometheusMetrics(
                    job_id=job_model.id,
                    collected_at=collected_at,
                    text=result,
                )
                try:
                    async with session.begin_nested():
                        session.add(metrics)
                except sqlalchemy.exc.IntegrityError:
                    # Concurrent server replica already committed, ignoring
                    pass
        await session.commit()
99
+
100
+
101
async def _collect_job_metrics(job_model: JobModel) -> Optional[str]:
    """Fetch the Prometheus metrics text for one job via its instance's shim.

    Returns the raw metrics text, or None when the job has no provisioning
    data, the connection failed, or the shim/exporter provides no metrics.
    Never raises: errors are logged and reported as None.
    """
    instance = get_or_error(job_model.instance)
    ssh_keys = get_instance_ssh_private_keys(instance)
    provisioning_data = get_job_provisioning_data(job_model)
    runtime_data = get_job_runtime_data(job_model)
    if provisioning_data is None:
        return None
    try:
        outcome = await run_async(
            _pull_job_metrics,
            ssh_keys,
            provisioning_data,
            runtime_data,
            job_model.id,
        )
    except Exception:
        logger.exception("Failed to collect job %s Prometheus metrics", job_model.job_name)
        return None
    if isinstance(outcome, bool):
        # A bool result signals the SSH tunnel could not be established.
        logger.warning(
            "Failed to connect to job %s to collect Prometheus metrics", job_model.job_name
        )
        return None
    if outcome is None:
        # Either not supported by shim or exporter is not available
        return None
    return outcome
130
+
131
+
132
@runner_ssh_tunnel(ports=[DSTACK_SHIM_HTTP_PORT], retries=1)
def _pull_job_metrics(ports: dict[int, int], task_id: uuid.UUID) -> Optional[str]:
    # Executed inside an SSH tunnel to the instance; `ports` presumably maps
    # the shim's remote port to the local forwarded port — provided by
    # runner_ssh_tunnel (confirm against the decorator's contract).
    shim_client = client.ShimClient(port=ports[DSTACK_SHIM_HTTP_PORT])
    return shim_client.get_task_metrics(task_id)
@@ -1,4 +1,6 @@
1
1
  import asyncio
2
+ from collections.abc import Iterable
3
+ from datetime import timedelta
2
4
  from typing import Dict, List, Optional
3
5
 
4
6
  from sqlalchemy import select
@@ -15,6 +17,7 @@ from dstack._internal.core.models.instances import (
15
17
  RemoteConnectionInfo,
16
18
  SSHConnectionParams,
17
19
  )
20
+ from dstack._internal.core.models.metrics import Metric
18
21
  from dstack._internal.core.models.repos import RemoteRepoCreds
19
22
  from dstack._internal.core.models.runs import (
20
23
  ClusterInfo,
@@ -40,6 +43,7 @@ from dstack._internal.server.models import (
40
43
  from dstack._internal.server.schemas.runner import TaskStatus
41
44
  from dstack._internal.server.services import logs as logs_services
42
45
  from dstack._internal.server.services import services
46
+ from dstack._internal.server.services.instances import get_instance_ssh_private_keys
43
47
  from dstack._internal.server.services.jobs import (
44
48
  find_job,
45
49
  get_job_attached_volumes,
@@ -48,7 +52,7 @@ from dstack._internal.server.services.jobs import (
48
52
  )
49
53
  from dstack._internal.server.services.locking import get_locker
50
54
  from dstack._internal.server.services.logging import fmt
51
- from dstack._internal.server.services.pools import get_instance_ssh_private_keys
55
+ from dstack._internal.server.services.metrics import get_job_metrics
52
56
  from dstack._internal.server.services.repos import (
53
57
  get_code_model,
54
58
  get_repo_creds,
@@ -123,7 +127,7 @@ async def _process_running_job(session: AsyncSession, job_model: JobModel):
123
127
  run_model = res.unique().scalar_one()
124
128
  repo_model = run_model.repo
125
129
  project = run_model.project
126
- run = run_model_to_run(run_model)
130
+ run = run_model_to_run(run_model, include_sensitive=True)
127
131
  job_submission = job_model_to_job_submission(job_model)
128
132
  job_provisioning_data = job_submission.job_provisioning_data
129
133
  if job_provisioning_data is None:
@@ -343,6 +347,9 @@ async def _process_running_job(session: AsyncSession, job_model: JobModel):
343
347
  job_model.status = JobStatus.TERMINATING
344
348
  job_model.termination_reason = JobTerminationReason.GATEWAY_ERROR
345
349
 
350
+ if job_model.status == JobStatus.RUNNING:
351
+ await _check_gpu_utilization(session, job_model, job)
352
+
346
353
  job_model.last_processed_at = common_utils.get_current_datetime()
347
354
  await session.commit()
348
355
 
def _terminate_if_inactivity_duration_exceeded(
    run_model: RunModel, job_model: JobModel, no_connections_secs: Optional[int]
) -> None:
    """Terminate a dev environment job that exceeded its inactivity_duration.

    Only applies to DevEnvironmentConfiguration runs with an int
    inactivity_duration; for all other runs the tracked inactivity counter is
    cleared. Mutates job_model in place; committing is up to the caller.
    """
    conf = RunSpec.__response__.parse_raw(run_model.run_spec).configuration
    if not is_core_model_instance(conf, DevEnvironmentConfiguration) or not isinstance(
        conf.inactivity_duration, int
    ):
        # reset in case inactivity_duration was disabled via in-place update
        job_model.inactivity_secs = None
        return
    logger.debug("%s: no SSH connections for %s seconds", fmt(job_model), no_connections_secs)
    job_model.inactivity_secs = no_connections_secs
    if no_connections_secs is None:
        # Instance predates connection tracking — cannot enforce the policy.
        # TODO(0.19 or earlier): make no_connections_secs required
        job_model.status = JobStatus.TERMINATING
        job_model.termination_reason = JobTerminationReason.INTERRUPTED_BY_NO_CAPACITY
        job_model.termination_reason_message = (
            "The selected instance was created before dstack 0.18.41"
            " and does not support inactivity_duration"
        )
    elif no_connections_secs >= conf.inactivity_duration:
        job_model.status = JobStatus.TERMINATING
        # TODO(0.19 or earlier): set JobTerminationReason.INACTIVITY_DURATION_EXCEEDED
        job_model.termination_reason = JobTerminationReason.TERMINATED_BY_SERVER
        job_model.termination_reason_message = (
            f"The job was inactive for {no_connections_secs} seconds,"
            f" exceeding the inactivity_duration of {conf.inactivity_duration} seconds"
        )
+ )
680
+
681
+
682
async def _check_gpu_utilization(session: AsyncSession, job_model: JobModel, job: Job) -> None:
    """Enforce the job's utilization_policy.

    Marks the job TERMINATING when at least one GPU stayed below
    min_gpu_utilization for every sample in the policy's time window.
    No-op when the job has no utilization_policy or too few samples exist.
    """
    policy = job.job_spec.utilization_policy
    if policy is None:
        return
    window_start = common_utils.get_current_datetime() - timedelta(seconds=policy.time_window)
    job_metrics = await get_job_metrics(session, job_model, after=window_start)
    gpu_metrics: list[Metric] = [
        m for m in job_metrics.metrics if m.name.startswith("gpu_util_percent_gpu")
    ]
    if not gpu_metrics or gpu_metrics[0].timestamps[-1] > window_start + timedelta(minutes=1):
        # Job has started recently, not enough points collected.
        # Assuming that metrics collection interval less than 1 minute.
        logger.debug("%s: GPU utilization check: not enough samples", fmt(job_model))
        return
    samples_per_gpu = [m.values for m in gpu_metrics]
    if not _should_terminate_due_to_low_gpu_util(policy.min_gpu_utilization, samples_per_gpu):
        logger.debug("%s: GPU utilization check: OK", fmt(job_model))
        return
    logger.info("%s: GPU utilization check: terminating", fmt(job_model))
    job_model.status = JobStatus.TERMINATING
    # TODO(0.19 or earlier): set JobTerminationReason.TERMINATED_DUE_TO_UTILIZATION_POLICY
    job_model.termination_reason = JobTerminationReason.TERMINATED_BY_SERVER
    job_model.termination_reason_message = (
        f"The job GPU utilization below {policy.min_gpu_utilization}%"
        f" for {policy.time_window} seconds"
    )
710
+
711
+
712
+ def _should_terminate_due_to_low_gpu_util(min_util: int, gpus_util: Iterable[Iterable[int]]):
713
+ for gpu_util in gpus_util:
714
+ if all(util < min_util for util in gpu_util):
715
+ return True
716
+ return False
670
717
 
671
718
 
672
719
  def _get_cluster_info(
@@ -696,20 +743,29 @@ def _get_cluster_info(
696
743
 
697
744
 
698
745
async def _get_job_code(
    session: AsyncSession, project: ProjectModel, repo: RepoModel, code_hash: Optional[str]
) -> bytes:
    """Return the repo code archive identified by code_hash, or b"" if unavailable.

    Resolution order: the blob stored inline in the DB, then the default object
    storage. Missing code is never fatal — an empty archive is returned and
    (for storage misses) an error is logged.
    """
    if code_hash is None:
        # No code hash recorded for this job — nothing to fetch.
        return b""
    code_model = await get_code_model(session=session, repo=repo, code_hash=code_hash)
    if code_model is None:
        return b""
    if code_model.blob is not None:
        # Code stored directly in the database row.
        return code_model.blob
    storage = get_default_storage()
    if storage is None:
        # Blob lives in external storage, but no storage is configured.
        return b""
    blob = await common_utils.run_async(
        storage.get_code,
        project.name,
        repo.name,
        code_hash,
    )
    if blob is None:
        logger.error(
            "Failed to get repo code hash %s from storage for repo %s", code_hash, repo.name
        )
        return b""
    return blob
714
770
 
715
771
 
@@ -74,6 +74,7 @@ async def _process_next_run():
74
74
  JobModel.run_id == run_model.id,
75
75
  JobModel.id.not_in(job_lockset),
76
76
  )
77
+ .order_by(JobModel.id) # take locks in order
77
78
  .with_for_update(skip_locked=True)
78
79
  )
79
80
  job_models = res.scalars().all()