dstack 0.18.44__py3-none-any.whl → 0.19.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of dstack might be problematic.

Files changed (267)
  1. dstack/_internal/cli/commands/gateway.py +15 -3
  2. dstack/_internal/cli/commands/logs.py +0 -22
  3. dstack/_internal/cli/commands/stats.py +8 -17
  4. dstack/_internal/cli/main.py +1 -5
  5. dstack/_internal/cli/services/configurators/fleet.py +4 -39
  6. dstack/_internal/cli/services/configurators/run.py +22 -21
  7. dstack/_internal/cli/services/profile.py +34 -83
  8. dstack/_internal/cli/utils/gateway.py +1 -1
  9. dstack/_internal/core/backends/__init__.py +56 -39
  10. dstack/_internal/core/backends/aws/__init__.py +0 -25
  11. dstack/_internal/core/backends/aws/auth.py +1 -10
  12. dstack/_internal/core/backends/aws/backend.py +26 -0
  13. dstack/_internal/core/backends/aws/compute.py +20 -45
  14. dstack/_internal/{server/services/backends/configurators/aws.py → core/backends/aws/configurator.py} +46 -85
  15. dstack/_internal/core/backends/aws/models.py +135 -0
  16. dstack/_internal/core/backends/aws/resources.py +1 -1
  17. dstack/_internal/core/backends/azure/__init__.py +0 -20
  18. dstack/_internal/core/backends/azure/auth.py +2 -11
  19. dstack/_internal/core/backends/azure/backend.py +21 -0
  20. dstack/_internal/core/backends/azure/compute.py +13 -27
  21. dstack/_internal/{server/services/backends/configurators/azure.py → core/backends/azure/configurator.py} +141 -210
  22. dstack/_internal/core/backends/azure/models.py +89 -0
  23. dstack/_internal/core/backends/base/__init__.py +0 -12
  24. dstack/_internal/core/backends/base/backend.py +18 -0
  25. dstack/_internal/core/backends/base/compute.py +153 -33
  26. dstack/_internal/core/backends/base/configurator.py +105 -0
  27. dstack/_internal/core/backends/base/models.py +14 -0
  28. dstack/_internal/core/backends/configurators.py +138 -0
  29. dstack/_internal/core/backends/cudo/__init__.py +0 -15
  30. dstack/_internal/core/backends/cudo/backend.py +16 -0
  31. dstack/_internal/core/backends/cudo/compute.py +8 -26
  32. dstack/_internal/core/backends/cudo/configurator.py +72 -0
  33. dstack/_internal/core/backends/cudo/models.py +37 -0
  34. dstack/_internal/core/backends/datacrunch/__init__.py +0 -15
  35. dstack/_internal/core/backends/datacrunch/backend.py +16 -0
  36. dstack/_internal/core/backends/datacrunch/compute.py +8 -25
  37. dstack/_internal/core/backends/datacrunch/configurator.py +66 -0
  38. dstack/_internal/core/backends/datacrunch/models.py +38 -0
  39. dstack/_internal/core/{models/backends/dstack.py → backends/dstack/models.py} +7 -7
  40. dstack/_internal/core/backends/gcp/__init__.py +0 -16
  41. dstack/_internal/core/backends/gcp/auth.py +2 -11
  42. dstack/_internal/core/backends/gcp/backend.py +17 -0
  43. dstack/_internal/core/backends/gcp/compute.py +13 -43
  44. dstack/_internal/{server/services/backends/configurators/gcp.py → core/backends/gcp/configurator.py} +46 -103
  45. dstack/_internal/core/backends/gcp/models.py +125 -0
  46. dstack/_internal/core/backends/kubernetes/__init__.py +0 -15
  47. dstack/_internal/core/backends/kubernetes/backend.py +16 -0
  48. dstack/_internal/core/backends/kubernetes/compute.py +16 -5
  49. dstack/_internal/core/backends/kubernetes/configurator.py +55 -0
  50. dstack/_internal/core/backends/kubernetes/models.py +72 -0
  51. dstack/_internal/core/backends/lambdalabs/__init__.py +0 -16
  52. dstack/_internal/core/backends/lambdalabs/backend.py +17 -0
  53. dstack/_internal/core/backends/lambdalabs/compute.py +7 -28
  54. dstack/_internal/core/backends/lambdalabs/configurator.py +82 -0
  55. dstack/_internal/core/backends/lambdalabs/models.py +37 -0
  56. dstack/_internal/core/backends/local/__init__.py +0 -13
  57. dstack/_internal/core/backends/local/backend.py +14 -0
  58. dstack/_internal/core/backends/local/compute.py +16 -2
  59. dstack/_internal/core/backends/models.py +128 -0
  60. dstack/_internal/core/backends/oci/__init__.py +0 -15
  61. dstack/_internal/core/backends/oci/auth.py +1 -5
  62. dstack/_internal/core/backends/oci/backend.py +16 -0
  63. dstack/_internal/core/backends/oci/compute.py +9 -23
  64. dstack/_internal/{server/services/backends/configurators/oci.py → core/backends/oci/configurator.py} +40 -85
  65. dstack/_internal/core/{models/backends/oci.py → backends/oci/models.py} +24 -25
  66. dstack/_internal/core/backends/oci/region.py +1 -1
  67. dstack/_internal/core/backends/runpod/__init__.py +0 -15
  68. dstack/_internal/core/backends/runpod/backend.py +16 -0
  69. dstack/_internal/core/backends/runpod/compute.py +7 -3
  70. dstack/_internal/core/backends/runpod/configurator.py +59 -0
  71. dstack/_internal/core/backends/runpod/models.py +54 -0
  72. dstack/_internal/core/backends/template/__init__.py +0 -0
  73. dstack/_internal/core/backends/tensordock/__init__.py +0 -15
  74. dstack/_internal/core/backends/tensordock/backend.py +16 -0
  75. dstack/_internal/core/backends/tensordock/compute.py +8 -27
  76. dstack/_internal/core/backends/tensordock/configurator.py +68 -0
  77. dstack/_internal/core/backends/tensordock/models.py +38 -0
  78. dstack/_internal/core/backends/vastai/__init__.py +0 -15
  79. dstack/_internal/core/backends/vastai/backend.py +16 -0
  80. dstack/_internal/core/backends/vastai/compute.py +2 -2
  81. dstack/_internal/core/backends/vastai/configurator.py +66 -0
  82. dstack/_internal/core/backends/vastai/models.py +37 -0
  83. dstack/_internal/core/backends/vultr/__init__.py +0 -15
  84. dstack/_internal/core/backends/vultr/backend.py +16 -0
  85. dstack/_internal/core/backends/vultr/compute.py +10 -24
  86. dstack/_internal/core/backends/vultr/configurator.py +64 -0
  87. dstack/_internal/core/backends/vultr/models.py +34 -0
  88. dstack/_internal/core/models/backends/__init__.py +0 -184
  89. dstack/_internal/core/models/backends/base.py +0 -19
  90. dstack/_internal/core/models/configurations.py +20 -15
  91. dstack/_internal/core/models/envs.py +4 -3
  92. dstack/_internal/core/models/fleets.py +17 -22
  93. dstack/_internal/core/models/gateways.py +3 -3
  94. dstack/_internal/core/models/instances.py +24 -0
  95. dstack/_internal/core/models/profiles.py +41 -46
  96. dstack/_internal/core/models/projects.py +1 -1
  97. dstack/_internal/core/models/repos/base.py +0 -5
  98. dstack/_internal/core/models/repos/local.py +3 -3
  99. dstack/_internal/core/models/repos/remote.py +26 -12
  100. dstack/_internal/core/models/repos/virtual.py +1 -1
  101. dstack/_internal/core/models/resources.py +45 -76
  102. dstack/_internal/core/models/runs.py +17 -19
  103. dstack/_internal/core/models/volumes.py +1 -3
  104. dstack/_internal/core/services/profiles.py +7 -16
  105. dstack/_internal/core/services/repos.py +0 -4
  106. dstack/_internal/server/app.py +0 -3
  107. dstack/_internal/server/background/tasks/process_gateways.py +4 -8
  108. dstack/_internal/server/background/tasks/process_instances.py +14 -9
  109. dstack/_internal/server/background/tasks/process_metrics.py +1 -1
  110. dstack/_internal/server/background/tasks/process_placement_groups.py +4 -1
  111. dstack/_internal/server/background/tasks/process_prometheus_metrics.py +1 -1
  112. dstack/_internal/server/background/tasks/process_running_jobs.py +14 -5
  113. dstack/_internal/server/background/tasks/process_submitted_jobs.py +16 -37
  114. dstack/_internal/server/background/tasks/process_volumes.py +5 -2
  115. dstack/_internal/server/migrations/versions/7bc2586e8b9e_make_instancemodel_pool_id_optional.py +36 -0
  116. dstack/_internal/server/migrations/versions/bc8ca4a505c6_store_backendtype_as_string.py +171 -0
  117. dstack/_internal/server/models.py +48 -9
  118. dstack/_internal/server/routers/backends.py +14 -23
  119. dstack/_internal/server/routers/instances.py +3 -4
  120. dstack/_internal/server/routers/metrics.py +10 -8
  121. dstack/_internal/server/routers/prometheus.py +1 -1
  122. dstack/_internal/server/routers/repos.py +1 -2
  123. dstack/_internal/server/routers/runs.py +13 -59
  124. dstack/_internal/server/schemas/gateways.py +14 -23
  125. dstack/_internal/server/schemas/projects.py +7 -2
  126. dstack/_internal/server/schemas/repos.py +2 -38
  127. dstack/_internal/server/schemas/runner.py +1 -0
  128. dstack/_internal/server/schemas/runs.py +1 -24
  129. dstack/_internal/server/services/backends/__init__.py +85 -158
  130. dstack/_internal/server/services/config.py +52 -576
  131. dstack/_internal/server/services/fleets.py +8 -103
  132. dstack/_internal/server/services/gateways/__init__.py +12 -4
  133. dstack/_internal/server/services/{pools.py → instances.py} +22 -329
  134. dstack/_internal/server/services/jobs/__init__.py +9 -6
  135. dstack/_internal/server/services/jobs/configurators/base.py +16 -0
  136. dstack/_internal/server/services/jobs/configurators/dev.py +9 -1
  137. dstack/_internal/server/services/jobs/configurators/extensions/cursor.py +42 -0
  138. dstack/_internal/server/services/metrics.py +39 -13
  139. dstack/_internal/server/services/offers.py +1 -1
  140. dstack/_internal/server/services/projects.py +23 -14
  141. dstack/_internal/server/services/prometheus.py +176 -18
  142. dstack/_internal/server/services/runs.py +24 -16
  143. dstack/_internal/server/services/volumes.py +8 -4
  144. dstack/_internal/server/statics/index.html +1 -1
  145. dstack/_internal/server/statics/{main-4eb116b97819badd1e2c.js → main-4a0fe83e84574654e397.js} +18 -14
  146. dstack/_internal/server/statics/{main-4eb116b97819badd1e2c.js.map → main-4a0fe83e84574654e397.js.map} +1 -1
  147. dstack/_internal/server/testing/common.py +58 -32
  148. dstack/_internal/utils/json_schema.py +6 -0
  149. dstack/_internal/utils/ssh.py +2 -1
  150. dstack/api/__init__.py +4 -0
  151. dstack/api/_public/__init__.py +16 -20
  152. dstack/api/_public/backends.py +1 -1
  153. dstack/api/_public/repos.py +36 -36
  154. dstack/api/_public/runs.py +167 -83
  155. dstack/api/server/__init__.py +11 -13
  156. dstack/api/server/_backends.py +12 -16
  157. dstack/api/server/_fleets.py +15 -57
  158. dstack/api/server/_gateways.py +3 -14
  159. dstack/api/server/_repos.py +1 -4
  160. dstack/api/server/_runs.py +21 -100
  161. dstack/api/server/_volumes.py +10 -5
  162. dstack/version.py +1 -1
  163. {dstack-0.18.44.dist-info → dstack-0.19.0.dist-info}/METADATA +1 -1
  164. {dstack-0.18.44.dist-info → dstack-0.19.0.dist-info}/RECORD +218 -204
  165. tests/_internal/cli/services/configurators/test_profile.py +6 -6
  166. tests/_internal/core/backends/aws/test_configurator.py +35 -0
  167. tests/_internal/core/backends/aws/test_resources.py +1 -1
  168. tests/_internal/core/backends/azure/test_configurator.py +61 -0
  169. tests/_internal/core/backends/cudo/__init__.py +0 -0
  170. tests/_internal/core/backends/cudo/test_configurator.py +37 -0
  171. tests/_internal/core/backends/datacrunch/__init__.py +0 -0
  172. tests/_internal/core/backends/datacrunch/test_configurator.py +17 -0
  173. tests/_internal/core/backends/gcp/test_configurator.py +42 -0
  174. tests/_internal/core/backends/kubernetes/test_configurator.py +43 -0
  175. tests/_internal/core/backends/lambdalabs/__init__.py +0 -0
  176. tests/_internal/core/backends/lambdalabs/test_configurator.py +38 -0
  177. tests/_internal/core/backends/oci/test_configurator.py +55 -0
  178. tests/_internal/core/backends/runpod/__init__.py +0 -0
  179. tests/_internal/core/backends/runpod/test_configurator.py +33 -0
  180. tests/_internal/core/backends/tensordock/__init__.py +0 -0
  181. tests/_internal/core/backends/tensordock/test_configurator.py +38 -0
  182. tests/_internal/core/backends/vastai/__init__.py +0 -0
  183. tests/_internal/core/backends/vastai/test_configurator.py +33 -0
  184. tests/_internal/core/backends/vultr/__init__.py +0 -0
  185. tests/_internal/core/backends/vultr/test_configurator.py +33 -0
  186. tests/_internal/server/background/tasks/test_process_gateways.py +4 -0
  187. tests/_internal/server/background/tasks/test_process_instances.py +49 -48
  188. tests/_internal/server/background/tasks/test_process_metrics.py +0 -3
  189. tests/_internal/server/background/tasks/test_process_placement_groups.py +2 -0
  190. tests/_internal/server/background/tasks/test_process_prometheus_metrics.py +0 -3
  191. tests/_internal/server/background/tasks/test_process_running_jobs.py +0 -21
  192. tests/_internal/server/background/tasks/test_process_runs.py +8 -22
  193. tests/_internal/server/background/tasks/test_process_submitted_jobs.py +3 -40
  194. tests/_internal/server/background/tasks/test_process_submitted_volumes.py +2 -0
  195. tests/_internal/server/background/tasks/test_process_terminating_jobs.py +10 -15
  196. tests/_internal/server/routers/test_backends.py +6 -764
  197. tests/_internal/server/routers/test_fleets.py +0 -26
  198. tests/_internal/server/routers/test_gateways.py +27 -3
  199. tests/_internal/server/routers/test_instances.py +0 -10
  200. tests/_internal/server/routers/test_metrics.py +27 -0
  201. tests/_internal/server/routers/test_projects.py +56 -0
  202. tests/_internal/server/routers/test_prometheus.py +116 -27
  203. tests/_internal/server/routers/test_repos.py +0 -15
  204. tests/_internal/server/routers/test_runs.py +4 -219
  205. tests/_internal/server/routers/test_volumes.py +2 -3
  206. tests/_internal/server/services/backends/__init__.py +0 -0
  207. tests/_internal/server/services/jobs/configurators/test_task.py +35 -0
  208. tests/_internal/server/services/test_config.py +7 -4
  209. tests/_internal/server/services/test_fleets.py +1 -4
  210. tests/_internal/server/services/{test_pools.py → test_instances.py} +11 -49
  211. tests/_internal/server/services/test_metrics.py +9 -5
  212. tests/_internal/server/services/test_repos.py +1 -14
  213. tests/_internal/server/services/test_runs.py +0 -4
  214. dstack/_internal/cli/commands/pool.py +0 -581
  215. dstack/_internal/cli/commands/run.py +0 -75
  216. dstack/_internal/core/backends/aws/config.py +0 -18
  217. dstack/_internal/core/backends/azure/config.py +0 -12
  218. dstack/_internal/core/backends/base/config.py +0 -5
  219. dstack/_internal/core/backends/cudo/config.py +0 -9
  220. dstack/_internal/core/backends/datacrunch/config.py +0 -9
  221. dstack/_internal/core/backends/gcp/config.py +0 -22
  222. dstack/_internal/core/backends/kubernetes/config.py +0 -6
  223. dstack/_internal/core/backends/lambdalabs/config.py +0 -9
  224. dstack/_internal/core/backends/nebius/__init__.py +0 -15
  225. dstack/_internal/core/backends/nebius/api_client.py +0 -319
  226. dstack/_internal/core/backends/nebius/compute.py +0 -220
  227. dstack/_internal/core/backends/nebius/config.py +0 -6
  228. dstack/_internal/core/backends/nebius/types.py +0 -37
  229. dstack/_internal/core/backends/oci/config.py +0 -6
  230. dstack/_internal/core/backends/runpod/config.py +0 -17
  231. dstack/_internal/core/backends/tensordock/config.py +0 -9
  232. dstack/_internal/core/backends/vastai/config.py +0 -6
  233. dstack/_internal/core/backends/vultr/config.py +0 -9
  234. dstack/_internal/core/models/backends/aws.py +0 -86
  235. dstack/_internal/core/models/backends/azure.py +0 -68
  236. dstack/_internal/core/models/backends/cudo.py +0 -43
  237. dstack/_internal/core/models/backends/datacrunch.py +0 -44
  238. dstack/_internal/core/models/backends/gcp.py +0 -67
  239. dstack/_internal/core/models/backends/kubernetes.py +0 -40
  240. dstack/_internal/core/models/backends/lambdalabs.py +0 -43
  241. dstack/_internal/core/models/backends/nebius.py +0 -54
  242. dstack/_internal/core/models/backends/runpod.py +0 -42
  243. dstack/_internal/core/models/backends/tensordock.py +0 -44
  244. dstack/_internal/core/models/backends/vastai.py +0 -43
  245. dstack/_internal/core/models/backends/vultr.py +0 -40
  246. dstack/_internal/core/models/pools.py +0 -43
  247. dstack/_internal/server/routers/pools.py +0 -142
  248. dstack/_internal/server/schemas/pools.py +0 -38
  249. dstack/_internal/server/services/backends/configurators/base.py +0 -72
  250. dstack/_internal/server/services/backends/configurators/cudo.py +0 -87
  251. dstack/_internal/server/services/backends/configurators/datacrunch.py +0 -79
  252. dstack/_internal/server/services/backends/configurators/kubernetes.py +0 -63
  253. dstack/_internal/server/services/backends/configurators/lambdalabs.py +0 -98
  254. dstack/_internal/server/services/backends/configurators/nebius.py +0 -85
  255. dstack/_internal/server/services/backends/configurators/runpod.py +0 -67
  256. dstack/_internal/server/services/backends/configurators/tensordock.py +0 -82
  257. dstack/_internal/server/services/backends/configurators/vastai.py +0 -80
  258. dstack/_internal/server/services/backends/configurators/vultr.py +0 -80
  259. dstack/api/_public/pools.py +0 -41
  260. dstack/api/_public/resources.py +0 -105
  261. dstack/api/server/_pools.py +0 -63
  262. tests/_internal/server/routers/test_pools.py +0 -612
  263. /dstack/_internal/{server/services/backends/configurators → core/backends/dstack}/__init__.py +0 -0
  264. {dstack-0.18.44.dist-info → dstack-0.19.0.dist-info}/LICENSE.md +0 -0
  265. {dstack-0.18.44.dist-info → dstack-0.19.0.dist-info}/WHEEL +0 -0
  266. {dstack-0.18.44.dist-info → dstack-0.19.0.dist-info}/entry_points.txt +0 -0
  267. {dstack-0.18.44.dist-info → dstack-0.19.0.dist-info}/top_level.txt +0 -0
dstack/_internal/core/services/repos.py

@@ -10,7 +10,6 @@ from git.exc import GitCommandError
 from dstack._internal.core.errors import DstackError
 from dstack._internal.core.models.config import RepoConfig
 from dstack._internal.core.models.repos import LocalRepo, RemoteRepo, RemoteRepoCreds
-from dstack._internal.core.models.repos.base import RepoProtocol
 from dstack._internal.core.models.repos.remote import GitRepoURL
 from dstack._internal.utils.logging import get_logger
 from dstack._internal.utils.path import PathLike
@@ -41,7 +40,6 @@ def get_local_repo_credentials(
     r = requests.get(f"{url.as_https()}/info/refs?service=git-upload-pack", timeout=10)
     if r.status_code == 200:
         return RemoteRepoCreds(
-            protocol=RepoProtocol.HTTPS,
             clone_url=url.as_https(),
             private_key=None,
             oauth_token=None,
@@ -93,7 +91,6 @@ def check_remote_repo_credentials_https(url: GitRepoURL, oauth_token: str) -> Re
             f"Can't access `{url.as_https()}` using the `{masked}` token"
         )
     return RemoteRepoCreds(
-        protocol=RepoProtocol.HTTPS,
         clone_url=url.as_https(),
         oauth_token=oauth_token,
         private_key=None,
@@ -123,7 +120,6 @@ def check_remote_repo_credentials_ssh(url: GitRepoURL, identity_file: PathLike)
         )

     return RemoteRepoCreds(
-        protocol=RepoProtocol.SSH,
         clone_url=url.as_ssh(),
         private_key=private_key,
         oauth_token=None,
dstack/_internal/server/app.py

@@ -27,7 +27,6 @@ from dstack._internal.server.routers import (
     instances,
     logs,
     metrics,
-    pools,
     projects,
     prometheus,
     repos,
@@ -184,8 +183,6 @@ def register_routes(app: FastAPI, ui: bool = True):
     app.include_router(volumes.project_router)
     app.include_router(service_proxy.router, prefix="/proxy/services", tags=["service-proxy"])
     app.include_router(model_proxy.router, prefix="/proxy/models", tags=["model-proxy"])
-    app.include_router(pools.root_router)
-    app.include_router(pools.router)
     app.include_router(prometheus.router)

     @app.exception_handler(ForbiddenError)
dstack/_internal/server/background/tasks/process_gateways.py

@@ -54,17 +54,13 @@ async def process_submitted_gateways():


 async def _remove_inactive_connections():
-    connections = await gateway_connections_pool.all()
-    ip_addresses = [c.ip_address for c in connections]
     async with get_session_ctx() as session:
         res = await session.execute(
-            select(GatewayComputeModel).where(
-                GatewayComputeModel.ip_address.in_(ip_addresses),
-                GatewayComputeModel.active == False,
-            )
+            select(GatewayComputeModel.ip_address).where(GatewayComputeModel.active == True)
         )
-        removed_connections = res.scalars().all()
-        for conn in removed_connections:
+        active_connection_ips = set(res.scalars().all())
+    for conn in await gateway_connections_pool.all():
+        if conn.ip_address not in active_connection_ips:
             await gateway_connections_pool.remove(conn.ip_address)


dstack/_internal/server/background/tasks/process_instances.py

@@ -20,6 +20,8 @@ from dstack._internal.core.backends.base.compute import (
     DSTACK_RUNNER_BINARY_PATH,
     DSTACK_SHIM_BINARY_PATH,
     DSTACK_WORKING_DIR,
+    ComputeWithCreateInstanceSupport,
+    ComputeWithPlacementGroupSupport,
     get_shim_env,
     get_shim_pre_start_commands,
 )
@@ -76,19 +78,19 @@ from dstack._internal.server.services.fleets import (
     fleet_model_to_fleet,
     get_create_instance_offers,
 )
-from dstack._internal.server.services.locking import get_locker
-from dstack._internal.server.services.offers import is_divisible_into_blocks
-from dstack._internal.server.services.placement import (
-    get_fleet_placement_groups,
-    placement_group_model_to_placement_group,
-)
-from dstack._internal.server.services.pools import (
+from dstack._internal.server.services.instances import (
     get_instance_configuration,
     get_instance_profile,
     get_instance_provisioning_data,
     get_instance_requirements,
     get_instance_ssh_private_keys,
 )
+from dstack._internal.server.services.locking import get_locker
+from dstack._internal.server.services.offers import is_divisible_into_blocks
+from dstack._internal.server.services.placement import (
+    get_fleet_placement_groups,
+    placement_group_model_to_placement_group,
+)
 from dstack._internal.server.services.runner import client as runner_client
 from dstack._internal.server.services.runner.client import HealthStatus
 from dstack._internal.server.services.runner.ssh import runner_ssh_tunnel
@@ -530,12 +532,15 @@ async def _create_instance(session: AsyncSession, instance: InstanceModel) -> No
     for backend, instance_offer in offers:
         if instance_offer.backend not in BACKENDS_WITH_CREATE_INSTANCE_SUPPORT:
             continue
+        compute = backend.compute()
+        assert isinstance(compute, ComputeWithCreateInstanceSupport)
         instance_offer = _get_instance_offer_for_instance(instance_offer, instance)
         if (
             instance_offer.backend in BACKENDS_WITH_PLACEMENT_GROUPS_SUPPORT
             and instance.fleet
             and instance_configuration.placement_group_name
         ):
+            assert isinstance(compute, ComputeWithPlacementGroupSupport)
             placement_group_model = _create_placement_group_if_does_not_exist(
                 session=session,
                 fleet_model=instance.fleet,
@@ -546,7 +551,7 @@ async def _create_instance(session: AsyncSession, instance: InstanceModel) -> No
             )
             if placement_group_model is not None:
                 placement_group = placement_group_model_to_placement_group(placement_group_model)
-                pgpd = await run_async(backend.compute().create_placement_group, placement_group)
+                pgpd = await run_async(compute.create_placement_group, placement_group)
                 placement_group_model.provisioning_data = pgpd.json()
                 session.add(placement_group_model)
                 placement_groups.append(placement_group)
@@ -559,7 +564,7 @@ async def _create_instance(session: AsyncSession, instance: InstanceModel) -> No
         )
         try:
             job_provisioning_data = await run_async(
-                backend.compute().create_instance,
+                compute.create_instance,
                 instance_offer,
                 instance_configuration,
             )
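The `assert isinstance(compute, ...)` checks introduced above rely on the capability classes that 0.19 exports from dstack/_internal/core/backends/base/compute.py (the +153 -33 change in the file list). The snippet below is a minimal, hypothetical sketch of that pattern: optional capabilities expressed as abstract mixins that a backend's compute class opts into, with callers narrowing the generic compute object before calling capability-specific methods. The class and method signatures here are illustrative assumptions, not dstack's actual definitions.

# Illustrative sketch only: the real classes live in
# dstack/_internal/core/backends/base/compute.py and differ in detail.
from abc import ABC, abstractmethod


class Compute(ABC):
    """Base interface every backend's compute implementation provides."""


class ComputeWithCreateInstanceSupport(ABC):
    """Capability mixin: the backend can provision standalone instances."""

    @abstractmethod
    def create_instance(self, instance_offer, instance_configuration):
        ...


class ComputeWithPlacementGroupSupport(ABC):
    """Capability mixin: the backend can manage placement groups."""

    @abstractmethod
    def create_placement_group(self, placement_group):
        ...

    @abstractmethod
    def delete_placement_group(self, placement_group):
        ...


class ExampleCompute(Compute, ComputeWithCreateInstanceSupport, ComputeWithPlacementGroupSupport):
    """A made-up backend compute that opts into both capabilities."""

    def create_instance(self, instance_offer, instance_configuration):
        ...  # call the provider API here

    def create_placement_group(self, placement_group):
        ...

    def delete_placement_group(self, placement_group):
        ...


def narrow_to_create_support(compute: Compute) -> ComputeWithCreateInstanceSupport:
    # Mirrors the narrowing done in process_instances.py above: the isinstance
    # check guards at runtime and lets type checkers see the capability.
    assert isinstance(compute, ComputeWithCreateInstanceSupport)
    return compute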
dstack/_internal/server/background/tasks/process_metrics.py

@@ -11,8 +11,8 @@ from dstack._internal.server import settings
 from dstack._internal.server.db import get_session_ctx
 from dstack._internal.server.models import InstanceModel, JobMetricsPoint, JobModel
 from dstack._internal.server.schemas.runner import MetricsResponse
+from dstack._internal.server.services.instances import get_instance_ssh_private_keys
 from dstack._internal.server.services.jobs import get_job_provisioning_data, get_job_runtime_data
-from dstack._internal.server.services.pools import get_instance_ssh_private_keys
 from dstack._internal.server.services.runner import client
 from dstack._internal.server.services.runner.ssh import runner_ssh_tunnel
 from dstack._internal.utils.common import batched, get_current_datetime, get_or_error, run_async
dstack/_internal/server/background/tasks/process_placement_groups.py

@@ -5,6 +5,7 @@ from sqlalchemy import select
 from sqlalchemy.ext.asyncio import AsyncSession
 from sqlalchemy.orm import joinedload

+from dstack._internal.core.backends.base.compute import ComputeWithPlacementGroupSupport
 from dstack._internal.core.errors import PlacementGroupInUseError
 from dstack._internal.server.db import get_session_ctx
 from dstack._internal.server.models import PlacementGroupModel, ProjectModel
@@ -81,8 +82,10 @@ async def _delete_placement_group(placement_group_model: PlacementGroupModel):
             "Failed to delete placement group %s. Backend not available.", placement_group.name
         )
         return
+    compute = backend.compute()
+    assert isinstance(compute, ComputeWithPlacementGroupSupport)
     try:
-        await run_async(backend.compute().delete_placement_group, placement_group)
+        await run_async(compute.delete_placement_group, placement_group)
     except PlacementGroupInUseError:
         logger.info(
             "Placement group %s is still in use. Skipping deletion for now.", placement_group.name
dstack/_internal/server/background/tasks/process_prometheus_metrics.py

@@ -10,8 +10,8 @@ from dstack._internal.core.consts import DSTACK_SHIM_HTTP_PORT
 from dstack._internal.core.models.runs import JobStatus
 from dstack._internal.server.db import get_session_ctx
 from dstack._internal.server.models import InstanceModel, JobModel, JobPrometheusMetrics
+from dstack._internal.server.services.instances import get_instance_ssh_private_keys
 from dstack._internal.server.services.jobs import get_job_provisioning_data, get_job_runtime_data
-from dstack._internal.server.services.pools import get_instance_ssh_private_keys
 from dstack._internal.server.services.runner import client
 from dstack._internal.server.services.runner.ssh import runner_ssh_tunnel
 from dstack._internal.server.utils.common import gather_map_async
dstack/_internal/server/background/tasks/process_running_jobs.py

@@ -43,6 +43,7 @@ from dstack._internal.server.models import (
 from dstack._internal.server.schemas.runner import TaskStatus
 from dstack._internal.server.services import logs as logs_services
 from dstack._internal.server.services import services
+from dstack._internal.server.services.instances import get_instance_ssh_private_keys
 from dstack._internal.server.services.jobs import (
     find_job,
     get_job_attached_volumes,
@@ -52,7 +53,6 @@ from dstack._internal.server.services.jobs import (
 from dstack._internal.server.services.locking import get_locker
 from dstack._internal.server.services.logging import fmt
 from dstack._internal.server.services.metrics import get_job_metrics
-from dstack._internal.server.services.pools import get_instance_ssh_private_keys
 from dstack._internal.server.services.repos import (
     get_code_model,
     get_repo_creds,
@@ -127,7 +127,7 @@ async def _process_running_job(session: AsyncSession, job_model: JobModel):
     run_model = res.unique().scalar_one()
     repo_model = run_model.repo
     project = run_model.project
-    run = run_model_to_run(run_model)
+    run = run_model_to_run(run_model, include_sensitive=True)
     job_submission = job_model_to_job_submission(job_model)
     job_provisioning_data = job_submission.job_provisioning_data
     if job_provisioning_data is None:
@@ -743,20 +743,29 @@ def _get_cluster_info(


 async def _get_job_code(
-    session: AsyncSession, project: ProjectModel, repo: RepoModel, code_hash: str
+    session: AsyncSession, project: ProjectModel, repo: RepoModel, code_hash: Optional[str]
 ) -> bytes:
+    if code_hash is None:
+        return b""
     code_model = await get_code_model(session=session, repo=repo, code_hash=code_hash)
     if code_model is None:
         return b""
-    storage = get_default_storage()
-    if storage is None or code_model.blob is not None:
+    if code_model.blob is not None:
         return code_model.blob
+    storage = get_default_storage()
+    if storage is None:
+        return b""
     blob = await common_utils.run_async(
         storage.get_code,
         project.name,
         repo.name,
         code_hash,
     )
+    if blob is None:
+        logger.error(
+            "Failed to get repo code hash %s from storage for repo %s", code_hash, repo.name
+        )
+        return b""
     return blob


dstack/_internal/server/background/tasks/process_submitted_jobs.py

@@ -6,7 +6,8 @@ from sqlalchemy import select
 from sqlalchemy.ext.asyncio import AsyncSession
 from sqlalchemy.orm import joinedload, lazyload, selectinload

-from dstack._internal.core.backends.base import Backend
+from dstack._internal.core.backends.base.backend import Backend
+from dstack._internal.core.backends.base.compute import ComputeWithVolumeSupport
 from dstack._internal.core.errors import BackendError, ServerClientError
 from dstack._internal.core.models.common import NetworkMode
 from dstack._internal.core.models.fleets import (
@@ -17,10 +18,8 @@ from dstack._internal.core.models.fleets import (
 )
 from dstack._internal.core.models.instances import InstanceOfferWithAvailability, InstanceStatus
 from dstack._internal.core.models.profiles import (
-    DEFAULT_POOL_NAME,
     DEFAULT_RUN_TERMINATION_IDLE_TIME,
     CreationPolicy,
-    Profile,
     TerminationPolicy,
 )
 from dstack._internal.core.models.resources import Memory
@@ -41,7 +40,6 @@ from dstack._internal.server.models import (
     FleetModel,
     InstanceModel,
     JobModel,
-    PoolModel,
     ProjectModel,
     RunModel,
     VolumeAttachmentModel,
@@ -51,6 +49,12 @@ from dstack._internal.server.services.backends import get_project_backend_by_typ
 from dstack._internal.server.services.fleets import (
     fleet_model_to_fleet,
 )
+from dstack._internal.server.services.instances import (
+    filter_pool_instances,
+    get_instance_offer,
+    get_instance_provisioning_data,
+    get_shared_pool_instances_with_offers,
+)
 from dstack._internal.server.services.jobs import (
     check_can_attach_job_volumes,
     find_job,
@@ -62,12 +66,6 @@ from dstack._internal.server.services.jobs import (
 from dstack._internal.server.services.locking import get_locker
 from dstack._internal.server.services.logging import fmt
 from dstack._internal.server.services.offers import get_offers_by_requirements
-from dstack._internal.server.services.pools import (
-    filter_pool_instances,
-    get_instance_offer,
-    get_instance_provisioning_data,
-    get_shared_pool_instances_with_offers,
-)
 from dstack._internal.server.services.runs import (
     check_run_spec_requires_instance_mounts,
     run_model_to_run,
@@ -173,25 +171,22 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
     except ServerClientError as e:
         logger.warning("%s: failed to prepare run volumes: %s", fmt(job_model), repr(e))
         job_model.status = JobStatus.TERMINATING
-        # TODO: Replace with JobTerminationReason.VOLUME_ERROR in 0.19
-        job_model.termination_reason = JobTerminationReason.TERMINATED_BY_SERVER
+        job_model.termination_reason = JobTerminationReason.VOLUME_ERROR
         job_model.termination_reason_message = e.msg
        job_model.last_processed_at = common_utils.get_current_datetime()
        await session.commit()
        return

-    pool = await _get_pool(session=session, project=project, profile=profile)
-
     # Submitted jobs processing happens in two steps (transactions).
     # First, the jobs gets an instance assigned (or no instance).
     # Then, the job runs on the assigned instance or a new instance is provisioned.
     # This is needed to avoid holding instances lock for a long time.
     if not job_model.instance_assigned:
-        # Try assigning instances from the pool.
+        # Try assigning an existing instance
         res = await session.execute(
             select(InstanceModel)
             .where(
-                InstanceModel.pool_id == pool.id,
+                InstanceModel.project_id == project.id,
                 InstanceModel.deleted == False,
                 InstanceModel.total_blocks > InstanceModel.busy_blocks,
             )
@@ -289,7 +284,6 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
         )
         instance = _create_instance_model_for_job(
             project=project,
-            pool=pool,
             fleet_model=fleet_model,
             run_spec=run_spec,
             job_model=job_model,
@@ -337,19 +331,6 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
     await session.commit()


-async def _get_pool(session: AsyncSession, project: ProjectModel, profile: Profile) -> PoolModel:
-    res = await session.execute(
-        select(PoolModel)
-        .where(
-            PoolModel.project_id == project.id,
-            PoolModel.name == (profile.pool_name or DEFAULT_POOL_NAME),
-            PoolModel.deleted == False,
-        )
-        .options(lazyload(PoolModel.instances))
-    )
-    return res.scalar_one()
-
-
 async def _assign_job_to_pool_instance(
     session: AsyncSession,
     pool_instances: List[InstanceModel],
@@ -548,7 +529,6 @@ async def _get_next_instance_num(session: AsyncSession, fleet_model: FleetModel)

 def _create_instance_model_for_job(
     project: ProjectModel,
-    pool: PoolModel,
     fleet_model: FleetModel,
     run_spec: RunSpec,
     job_model: JobModel,
@@ -571,7 +551,6 @@ def _create_instance_model_for_job(
         name=f"{fleet_model.name}-{instance_num}",
         instance_num=instance_num,
         project=project,
-        pool=pool,
         created_at=common_utils.get_current_datetime(),
         started_at=common_utils.get_current_datetime(),
         status=InstanceStatus.PROVISIONING,
@@ -677,8 +656,7 @@ async def _attach_volumes(
         except (ServerClientError, BackendError) as e:
             logger.warning("%s: failed to attached volume: %s", fmt(job_model), repr(e))
             job_model.status = JobStatus.TERMINATING
-            # TODO: Replace with JobTerminationReason.VOLUME_ERROR in 0.19
-            job_model.termination_reason = JobTerminationReason.TERMINATED_BY_SERVER
+            job_model.termination_reason = JobTerminationReason.VOLUME_ERROR
             job_model.termination_reason_message = "Failed to attach volume"
         except Exception:
             logger.exception(
@@ -686,8 +664,7 @@ async def _attach_volumes(
                 fmt(job_model),
             )
             job_model.status = JobStatus.TERMINATING
-            # TODO: Replace with JobTerminationReason.VOLUME_ERROR in 0.19
-            job_model.termination_reason = JobTerminationReason.TERMINATED_BY_SERVER
+            job_model.termination_reason = JobTerminationReason.VOLUME_ERROR
             job_model.termination_reason_message = "Failed to attach volume"
         finally:
             job_model.job_runtime_data = job_runtime_data.json()
@@ -700,13 +677,15 @@ async def _attach_volume(
     instance: InstanceModel,
     instance_id: str,
 ):
+    compute = backend.compute()
+    assert isinstance(compute, ComputeWithVolumeSupport)
     volume = volume_model_to_volume(volume_model)
     # Refresh only to check if the volume wasn't deleted before the lock
     await session.refresh(volume_model)
     if volume_model.deleted:
         raise ServerClientError("Cannot attach a deleted volume")
     attachment_data = await common_utils.run_async(
-        backend.compute().attach_volume,
+        compute.attach_volume,
         volume=volume,
         instance_id=instance_id,
     )
dstack/_internal/server/background/tasks/process_volumes.py

@@ -2,6 +2,7 @@ from sqlalchemy import select
 from sqlalchemy.ext.asyncio import AsyncSession
 from sqlalchemy.orm import joinedload

+from dstack._internal.core.backends.base.compute import ComputeWithVolumeSupport
 from dstack._internal.core.errors import BackendError, BackendNotAvailable
 from dstack._internal.core.models.volumes import VolumeStatus
 from dstack._internal.server.db import get_session_ctx
@@ -81,17 +82,19 @@ async def _process_submitted_volume(session: AsyncSession, volume_model: VolumeM
         await session.commit()
         return

+    compute = backend.compute()
+    assert isinstance(compute, ComputeWithVolumeSupport)
     try:
         if volume.configuration.volume_id is not None:
             logger.info("Registering external volume %s", volume_model.name)
             vpd = await run_async(
-                backend.compute().register_volume,
+                compute.register_volume,
                 volume=volume,
             )
         else:
             logger.info("Provisioning new volume %s", volume_model.name)
             vpd = await run_async(
-                backend.compute().create_volume,
+                compute.create_volume,
                 volume=volume,
             )
     except BackendError as e:
dstack/_internal/server/migrations/versions/7bc2586e8b9e_make_instancemodel_pool_id_optional.py

@@ -0,0 +1,36 @@
+"""Make InstanceModel.pool_id optional
+
+Revision ID: 7bc2586e8b9e
+Revises: bc8ca4a505c6
+Create Date: 2025-03-13 11:13:39.748303
+
+"""
+
+import sqlalchemy_utils
+from alembic import op
+
+# revision identifiers, used by Alembic.
+revision = "7bc2586e8b9e"
+down_revision = "bc8ca4a505c6"
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    with op.batch_alter_table("instances", schema=None) as batch_op:
+        batch_op.alter_column(
+            "pool_id", existing_type=sqlalchemy_utils.UUIDType(binary=False), nullable=True
+        )
+
+    # ### end Alembic commands ###
+
+
+def downgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    with op.batch_alter_table("instances", schema=None) as batch_op:
+        batch_op.alter_column(
+            "pool_id", existing_type=sqlalchemy_utils.UUIDType(binary=False), nullable=False
+        )
+
+    # ### end Alembic commands ###
dstack/_internal/server/migrations/versions/bc8ca4a505c6_store_backendtype_as_string.py

@@ -0,0 +1,171 @@
+"""Store BackendType as string
+
+Revision ID: bc8ca4a505c6
+Revises: 98d1b92988bc
+Create Date: 2025-03-10 14:49:06.837118
+
+"""
+
+import sqlalchemy as sa
+from alembic import op
+from sqlalchemy.dialects import postgresql
+
+# revision identifiers, used by Alembic.
+revision = "bc8ca4a505c6"
+down_revision = "98d1b92988bc"
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    with op.batch_alter_table("backends", schema=None) as batch_op:
+        batch_op.alter_column(
+            "type",
+            existing_type=postgresql.ENUM(
+                "AWS",
+                "AZURE",
+                "CUDO",
+                "DATACRUNCH",
+                "DSTACK",
+                "GCP",
+                "KUBERNETES",
+                "LAMBDA",
+                "LOCAL",
+                "REMOTE",
+                "NEBIUS",
+                "OCI",
+                "RUNPOD",
+                "TENSORDOCK",
+                "VASTAI",
+                "VULTR",
+                name="backendtype",
+            ),
+            type_=sa.String(length=100),
+            existing_nullable=False,
+        )
+
+    with op.batch_alter_table("instances", schema=None) as batch_op:
+        batch_op.alter_column(
+            "backend",
+            existing_type=postgresql.ENUM(
+                "AWS",
+                "AZURE",
+                "CUDO",
+                "DATACRUNCH",
+                "DSTACK",
+                "GCP",
+                "KUBERNETES",
+                "LAMBDA",
+                "LOCAL",
+                "REMOTE",
+                "NEBIUS",
+                "OCI",
+                "RUNPOD",
+                "TENSORDOCK",
+                "VASTAI",
+                "VULTR",
+                name="backendtype",
+            ),
+            type_=sa.String(length=100),
+            existing_nullable=True,
+        )
+
+    sa.Enum(
+        "AWS",
+        "AZURE",
+        "CUDO",
+        "DATACRUNCH",
+        "DSTACK",
+        "GCP",
+        "KUBERNETES",
+        "LAMBDA",
+        "LOCAL",
+        "REMOTE",
+        "NEBIUS",
+        "OCI",
+        "RUNPOD",
+        "TENSORDOCK",
+        "VASTAI",
+        "VULTR",
+        name="backendtype",
+    ).drop(op.get_bind())
+    # ### end Alembic commands ###
+
+
+def downgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    sa.Enum(
+        "AWS",
+        "AZURE",
+        "CUDO",
+        "DATACRUNCH",
+        "DSTACK",
+        "GCP",
+        "KUBERNETES",
+        "LAMBDA",
+        "LOCAL",
+        "REMOTE",
+        "NEBIUS",
+        "OCI",
+        "RUNPOD",
+        "TENSORDOCK",
+        "VASTAI",
+        "VULTR",
+        name="backendtype",
+    ).create(op.get_bind())
+    with op.batch_alter_table("instances", schema=None) as batch_op:
+        batch_op.alter_column(
+            "backend",
+            existing_type=sa.String(length=100),
+            type_=postgresql.ENUM(
+                "AWS",
+                "AZURE",
+                "CUDO",
+                "DATACRUNCH",
+                "DSTACK",
+                "GCP",
+                "KUBERNETES",
+                "LAMBDA",
+                "LOCAL",
+                "REMOTE",
+                "NEBIUS",
+                "OCI",
+                "RUNPOD",
+                "TENSORDOCK",
+                "VASTAI",
+                "VULTR",
+                name="backendtype",
+            ),
+            existing_nullable=True,
+            postgresql_using="backend::VARCHAR::backendtype",
+        )
+
+    with op.batch_alter_table("backends", schema=None) as batch_op:
+        batch_op.alter_column(
+            "type",
+            existing_type=sa.String(length=100),
+            type_=postgresql.ENUM(
+                "AWS",
+                "AZURE",
+                "CUDO",
+                "DATACRUNCH",
+                "DSTACK",
+                "GCP",
+                "KUBERNETES",
+                "LAMBDA",
+                "LOCAL",
+                "REMOTE",
+                "NEBIUS",
+                "OCI",
+                "RUNPOD",
+                "TENSORDOCK",
+                "VASTAI",
+                "VULTR",
+                name="backendtype",
+            ),
+            existing_nullable=False,
+            postgresql_using="type::VARCHAR::backendtype",
+        )
+
+    # ### end Alembic commands ###
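The bc8ca4a505c6 migration above replaces the database-level `backendtype` enum with a plain string column, so adding or removing a backend no longer requires an ALTER TYPE migration. Below is a minimal, hypothetical sketch of how the model side of such a change can look: a Python enum persisted through a VARCHAR column via a SQLAlchemy TypeDecorator. The class names, enum values, and table name are illustrative assumptions, not dstack's actual models.py.

# Illustrative sketch only; dstack's actual models are in
# dstack/_internal/server/models.py and may handle the conversion differently.
import enum

import sqlalchemy as sa
from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column


class BackendType(str, enum.Enum):
    AWS = "aws"
    GCP = "gcp"
    # ... remaining providers elided


class BackendTypeAsString(sa.types.TypeDecorator):
    """Persist BackendType as its string value in a VARCHAR(100) column."""

    impl = sa.String(100)
    cache_ok = True

    def process_bind_param(self, value, dialect):
        return None if value is None else BackendType(value).value

    def process_result_value(self, value, dialect):
        return None if value is None else BackendType(value)


class Base(DeclarativeBase):
    pass


class ExampleBackendModel(Base):
    __tablename__ = "backends_example"  # illustrative name, not the real table

    id: Mapped[int] = mapped_column(primary_key=True)
    # A plain string at the DB level (matching the migration), an enum in Python.
    type: Mapped[BackendType] = mapped_column(BackendTypeAsString(), nullable=False)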