dstack 0.0.9__py3-none-any.whl → 0.20.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (879)
  1. dstack/_internal/cli/commands/__init__.py +80 -0
  2. dstack/_internal/cli/commands/apply.py +100 -0
  3. dstack/_internal/cli/commands/attach.py +161 -0
  4. dstack/_internal/cli/commands/completion.py +22 -0
  5. dstack/_internal/cli/commands/delete.py +44 -0
  6. dstack/_internal/cli/commands/event.py +168 -0
  7. dstack/_internal/cli/commands/fleet.py +161 -0
  8. dstack/_internal/cli/commands/gateway.py +159 -0
  9. dstack/_internal/cli/commands/init.py +64 -0
  10. dstack/_internal/cli/commands/login.py +352 -0
  11. dstack/_internal/cli/commands/logs.py +62 -0
  12. dstack/_internal/cli/commands/metrics.py +153 -0
  13. dstack/_internal/cli/commands/offer.py +146 -0
  14. dstack/_internal/cli/commands/project.py +259 -0
  15. dstack/_internal/cli/commands/ps.py +81 -0
  16. dstack/_internal/cli/commands/run.py +69 -0
  17. dstack/_internal/cli/commands/secrets.py +92 -0
  18. dstack/_internal/cli/commands/server.py +96 -0
  19. dstack/_internal/cli/commands/stop.py +26 -0
  20. dstack/_internal/cli/commands/volume.py +117 -0
  21. dstack/_internal/cli/main.py +101 -0
  22. dstack/_internal/cli/models/gateways.py +16 -0
  23. dstack/_internal/cli/models/offers.py +47 -0
  24. dstack/_internal/cli/models/runs.py +16 -0
  25. dstack/_internal/cli/services/args.py +31 -0
  26. dstack/_internal/cli/services/completion.py +91 -0
  27. dstack/_internal/cli/services/configurators/__init__.py +86 -0
  28. dstack/_internal/cli/services/configurators/base.py +103 -0
  29. dstack/_internal/cli/services/configurators/fleet.py +475 -0
  30. dstack/_internal/cli/services/configurators/gateway.py +231 -0
  31. dstack/_internal/cli/services/configurators/run.py +882 -0
  32. dstack/_internal/cli/services/configurators/volume.py +222 -0
  33. dstack/_internal/cli/services/events.py +68 -0
  34. dstack/_internal/cli/services/profile.py +182 -0
  35. dstack/_internal/cli/services/repos.py +71 -0
  36. dstack/_internal/cli/services/resources.py +54 -0
  37. dstack/_internal/cli/utils/common.py +159 -0
  38. dstack/_internal/cli/utils/fleet.py +106 -0
  39. dstack/_internal/cli/utils/gateway.py +56 -0
  40. dstack/_internal/cli/utils/gpu.py +178 -0
  41. dstack/_internal/cli/utils/rich.py +156 -0
  42. dstack/_internal/cli/utils/run.py +517 -0
  43. dstack/_internal/cli/utils/secrets.py +25 -0
  44. dstack/_internal/cli/utils/updates.py +98 -0
  45. dstack/_internal/cli/utils/volume.py +58 -0
  46. dstack/_internal/compat.py +3 -0
  47. dstack/_internal/core/backends/amddevcloud/__init__.py +1 -0
  48. dstack/_internal/core/backends/amddevcloud/backend.py +16 -0
  49. dstack/_internal/core/backends/amddevcloud/compute.py +5 -0
  50. dstack/_internal/core/backends/amddevcloud/configurator.py +29 -0
  51. dstack/_internal/core/backends/aws/auth.py +30 -0
  52. dstack/_internal/core/backends/aws/backend.py +31 -0
  53. dstack/_internal/core/backends/aws/compute.py +1153 -0
  54. dstack/_internal/core/backends/aws/configurator.py +191 -0
  55. dstack/_internal/core/backends/aws/models.py +135 -0
  56. dstack/_internal/core/backends/aws/resources.py +700 -0
  57. dstack/_internal/core/backends/azure/auth.py +39 -0
  58. dstack/_internal/core/backends/azure/backend.py +21 -0
  59. dstack/_internal/core/backends/azure/compute.py +676 -0
  60. dstack/_internal/core/backends/azure/configurator.py +472 -0
  61. dstack/_internal/core/backends/azure/models.py +98 -0
  62. dstack/_internal/core/backends/azure/resources.py +116 -0
  63. dstack/_internal/core/backends/azure/utils.py +42 -0
  64. dstack/_internal/core/backends/base/backend.py +18 -0
  65. dstack/_internal/core/backends/base/compute.py +1101 -0
  66. dstack/_internal/core/backends/base/configurator.py +117 -0
  67. dstack/_internal/core/backends/base/models.py +24 -0
  68. dstack/_internal/core/backends/base/offers.py +232 -0
  69. dstack/_internal/core/backends/cloudrift/api_client.py +220 -0
  70. dstack/_internal/core/backends/cloudrift/backend.py +16 -0
  71. dstack/_internal/core/backends/cloudrift/compute.py +138 -0
  72. dstack/_internal/core/backends/cloudrift/configurator.py +72 -0
  73. dstack/_internal/core/backends/cloudrift/models.py +40 -0
  74. dstack/_internal/core/backends/configurators.py +181 -0
  75. dstack/_internal/core/backends/cudo/__init__.py +0 -0
  76. dstack/_internal/core/backends/cudo/api_client.py +111 -0
  77. dstack/_internal/core/backends/cudo/backend.py +16 -0
  78. dstack/_internal/core/backends/cudo/compute.py +174 -0
  79. dstack/_internal/core/backends/cudo/configurator.py +63 -0
  80. dstack/_internal/core/backends/cudo/models.py +37 -0
  81. dstack/_internal/core/backends/datacrunch/__init__.py +1 -0
  82. dstack/_internal/core/backends/datacrunch/backend.py +18 -0
  83. dstack/_internal/core/backends/datacrunch/compute.py +8 -0
  84. dstack/_internal/core/backends/datacrunch/configurator.py +17 -0
  85. dstack/_internal/core/backends/digitalocean/__init__.py +1 -0
  86. dstack/_internal/core/backends/digitalocean/backend.py +16 -0
  87. dstack/_internal/core/backends/digitalocean/compute.py +5 -0
  88. dstack/_internal/core/backends/digitalocean/configurator.py +31 -0
  89. dstack/_internal/core/backends/digitalocean_base/__init__.py +1 -0
  90. dstack/_internal/core/backends/digitalocean_base/api_client.py +104 -0
  91. dstack/_internal/core/backends/digitalocean_base/backend.py +5 -0
  92. dstack/_internal/core/backends/digitalocean_base/compute.py +174 -0
  93. dstack/_internal/core/backends/digitalocean_base/configurator.py +57 -0
  94. dstack/_internal/core/backends/digitalocean_base/models.py +43 -0
  95. dstack/_internal/core/backends/dstack/__init__.py +0 -0
  96. dstack/_internal/core/backends/dstack/models.py +26 -0
  97. dstack/_internal/core/backends/features.py +74 -0
  98. dstack/_internal/core/backends/gcp/__init__.py +0 -0
  99. dstack/_internal/core/backends/gcp/auth.py +57 -0
  100. dstack/_internal/core/backends/gcp/backend.py +17 -0
  101. dstack/_internal/core/backends/gcp/compute.py +1257 -0
  102. dstack/_internal/core/backends/gcp/configurator.py +206 -0
  103. dstack/_internal/core/backends/gcp/features/__init__.py +0 -0
  104. dstack/_internal/core/backends/gcp/features/tcpx.py +65 -0
  105. dstack/_internal/core/backends/gcp/models.py +160 -0
  106. dstack/_internal/core/backends/gcp/resources.py +585 -0
  107. dstack/_internal/core/backends/hotaisle/__init__.py +1 -0
  108. dstack/_internal/core/backends/hotaisle/api_client.py +101 -0
  109. dstack/_internal/core/backends/hotaisle/backend.py +16 -0
  110. dstack/_internal/core/backends/hotaisle/compute.py +188 -0
  111. dstack/_internal/core/backends/hotaisle/configurator.py +66 -0
  112. dstack/_internal/core/backends/hotaisle/models.py +45 -0
  113. dstack/_internal/core/backends/kubernetes/__init__.py +0 -0
  114. dstack/_internal/core/backends/kubernetes/backend.py +16 -0
  115. dstack/_internal/core/backends/kubernetes/compute.py +1077 -0
  116. dstack/_internal/core/backends/kubernetes/configurator.py +61 -0
  117. dstack/_internal/core/backends/kubernetes/models.py +71 -0
  118. dstack/_internal/core/backends/kubernetes/utils.py +81 -0
  119. dstack/_internal/core/backends/lambdalabs/__init__.py +0 -0
  120. dstack/_internal/core/backends/lambdalabs/api_client.py +87 -0
  121. dstack/_internal/core/backends/lambdalabs/backend.py +17 -0
  122. dstack/_internal/core/backends/lambdalabs/compute.py +233 -0
  123. dstack/_internal/core/backends/lambdalabs/configurator.py +65 -0
  124. dstack/_internal/core/backends/lambdalabs/models.py +37 -0
  125. dstack/_internal/core/backends/local/__init__.py +0 -0
  126. dstack/_internal/core/backends/local/backend.py +14 -0
  127. dstack/_internal/core/backends/local/compute.py +130 -0
  128. dstack/_internal/core/backends/models.py +158 -0
  129. dstack/_internal/core/backends/nebius/__init__.py +0 -0
  130. dstack/_internal/core/backends/nebius/backend.py +16 -0
  131. dstack/_internal/core/backends/nebius/compute.py +401 -0
  132. dstack/_internal/core/backends/nebius/configurator.py +98 -0
  133. dstack/_internal/core/backends/nebius/models.py +185 -0
  134. dstack/_internal/core/backends/nebius/resources.py +433 -0
  135. dstack/_internal/core/backends/oci/__init__.py +0 -0
  136. dstack/_internal/core/backends/oci/auth.py +21 -0
  137. dstack/_internal/core/backends/oci/backend.py +16 -0
  138. dstack/_internal/core/backends/oci/compute.py +209 -0
  139. dstack/_internal/core/backends/oci/configurator.py +156 -0
  140. dstack/_internal/core/backends/oci/exceptions.py +15 -0
  141. dstack/_internal/core/backends/oci/models.py +87 -0
  142. dstack/_internal/core/backends/oci/region.py +86 -0
  143. dstack/_internal/core/backends/oci/resources.py +836 -0
  144. dstack/_internal/core/backends/runpod/__init__.py +0 -0
  145. dstack/_internal/core/backends/runpod/api_client.py +627 -0
  146. dstack/_internal/core/backends/runpod/backend.py +16 -0
  147. dstack/_internal/core/backends/runpod/compute.py +444 -0
  148. dstack/_internal/core/backends/runpod/configurator.py +63 -0
  149. dstack/_internal/core/backends/runpod/models.py +54 -0
  150. dstack/_internal/core/backends/template/__init__.py +0 -0
  151. dstack/_internal/core/backends/template/backend.py.jinja +16 -0
  152. dstack/_internal/core/backends/template/compute.py.jinja +95 -0
  153. dstack/_internal/core/backends/template/configurator.py.jinja +69 -0
  154. dstack/_internal/core/backends/template/models.py.jinja +62 -0
  155. dstack/_internal/core/backends/tensordock/models.py +40 -0
  156. dstack/_internal/core/backends/vastai/__init__.py +0 -0
  157. dstack/_internal/core/backends/vastai/api_client.py +143 -0
  158. dstack/_internal/core/backends/vastai/backend.py +16 -0
  159. dstack/_internal/core/backends/vastai/compute.py +141 -0
  160. dstack/_internal/core/backends/vastai/configurator.py +69 -0
  161. dstack/_internal/core/backends/vastai/models.py +37 -0
  162. dstack/_internal/core/backends/verda/__init__.py +0 -0
  163. dstack/_internal/core/backends/verda/backend.py +16 -0
  164. dstack/_internal/core/backends/verda/compute.py +266 -0
  165. dstack/_internal/core/backends/verda/configurator.py +73 -0
  166. dstack/_internal/core/backends/verda/models.py +38 -0
  167. dstack/_internal/core/backends/vultr/__init__.py +0 -0
  168. dstack/_internal/core/backends/vultr/api_client.py +116 -0
  169. dstack/_internal/core/backends/vultr/backend.py +16 -0
  170. dstack/_internal/core/backends/vultr/compute.py +167 -0
  171. dstack/_internal/core/backends/vultr/configurator.py +71 -0
  172. dstack/_internal/core/backends/vultr/models.py +34 -0
  173. dstack/_internal/core/compatibility/__init__.py +0 -0
  174. dstack/_internal/core/compatibility/events.py +13 -0
  175. dstack/_internal/core/compatibility/fleets.py +58 -0
  176. dstack/_internal/core/compatibility/gateways.py +39 -0
  177. dstack/_internal/core/compatibility/gpus.py +13 -0
  178. dstack/_internal/core/compatibility/logs.py +14 -0
  179. dstack/_internal/core/compatibility/runs.py +86 -0
  180. dstack/_internal/core/compatibility/volumes.py +37 -0
  181. dstack/_internal/core/consts.py +8 -0
  182. dstack/_internal/core/errors.py +160 -0
  183. dstack/_internal/core/models/__init__.py +0 -0
  184. dstack/_internal/core/models/auth.py +28 -0
  185. dstack/_internal/core/models/backends/__init__.py +0 -0
  186. dstack/_internal/core/models/backends/base.py +48 -0
  187. dstack/_internal/core/models/common.py +143 -0
  188. dstack/_internal/core/models/compute_groups.py +39 -0
  189. dstack/_internal/core/models/config.py +28 -0
  190. dstack/_internal/core/models/configurations.py +1123 -0
  191. dstack/_internal/core/models/envs.py +149 -0
  192. dstack/_internal/core/models/events.py +98 -0
  193. dstack/_internal/core/models/files.py +67 -0
  194. dstack/_internal/core/models/fleets.py +437 -0
  195. dstack/_internal/core/models/gateways.py +146 -0
  196. dstack/_internal/core/models/gpus.py +45 -0
  197. dstack/_internal/core/models/health.py +28 -0
  198. dstack/_internal/core/models/instances.py +346 -0
  199. dstack/_internal/core/models/logs.py +27 -0
  200. dstack/_internal/core/models/metrics.py +14 -0
  201. dstack/_internal/core/models/placement.py +27 -0
  202. dstack/_internal/core/models/profiles.py +431 -0
  203. dstack/_internal/core/models/projects.py +46 -0
  204. dstack/_internal/core/models/repos/__init__.py +34 -0
  205. dstack/_internal/core/models/repos/base.py +36 -0
  206. dstack/_internal/core/models/repos/local.py +96 -0
  207. dstack/_internal/core/models/repos/remote.py +341 -0
  208. dstack/_internal/core/models/repos/virtual.py +85 -0
  209. dstack/_internal/core/models/resources.py +424 -0
  210. dstack/_internal/core/models/routers.py +24 -0
  211. dstack/_internal/core/models/runs.py +618 -0
  212. dstack/_internal/core/models/secrets.py +16 -0
  213. dstack/_internal/core/models/server.py +7 -0
  214. dstack/_internal/core/models/services.py +76 -0
  215. dstack/_internal/core/models/unix.py +53 -0
  216. dstack/_internal/core/models/users.py +60 -0
  217. dstack/_internal/core/models/volumes.py +221 -0
  218. dstack/_internal/core/services/__init__.py +16 -0
  219. dstack/_internal/core/services/api_client.py +15 -0
  220. dstack/_internal/core/services/configs/__init__.py +116 -0
  221. dstack/_internal/core/services/diff.py +71 -0
  222. dstack/_internal/core/services/logs.py +58 -0
  223. dstack/_internal/core/services/profiles.py +46 -0
  224. dstack/_internal/core/services/repos.py +236 -0
  225. dstack/_internal/core/services/ssh/__init__.py +27 -0
  226. dstack/_internal/core/services/ssh/attach.py +241 -0
  227. dstack/_internal/core/services/ssh/client.py +113 -0
  228. dstack/_internal/core/services/ssh/key_manager.py +53 -0
  229. dstack/_internal/core/services/ssh/ports.py +89 -0
  230. dstack/_internal/core/services/ssh/tunnel.py +337 -0
  231. dstack/_internal/proxy/__init__.py +8 -0
  232. dstack/_internal/proxy/gateway/__init__.py +0 -0
  233. dstack/_internal/proxy/gateway/app.py +89 -0
  234. dstack/_internal/proxy/gateway/auth.py +26 -0
  235. dstack/_internal/proxy/gateway/const.py +7 -0
  236. dstack/_internal/proxy/gateway/deps.py +73 -0
  237. dstack/_internal/proxy/gateway/main.py +17 -0
  238. dstack/_internal/proxy/gateway/models.py +23 -0
  239. dstack/_internal/proxy/gateway/repo/__init__.py +0 -0
  240. dstack/_internal/proxy/gateway/repo/repo.py +121 -0
  241. dstack/_internal/proxy/gateway/repo/state_v1.py +164 -0
  242. dstack/_internal/proxy/gateway/resources/nginx/00-log-format.conf +11 -0
  243. dstack/_internal/proxy/gateway/resources/nginx/entrypoint.jinja2 +27 -0
  244. dstack/_internal/proxy/gateway/resources/nginx/router_workers.jinja2 +23 -0
  245. dstack/_internal/proxy/gateway/resources/nginx/service.jinja2 +105 -0
  246. dstack/_internal/proxy/gateway/routers/__init__.py +0 -0
  247. dstack/_internal/proxy/gateway/routers/auth.py +10 -0
  248. dstack/_internal/proxy/gateway/routers/config.py +28 -0
  249. dstack/_internal/proxy/gateway/routers/registry.py +124 -0
  250. dstack/_internal/proxy/gateway/routers/stats.py +18 -0
  251. dstack/_internal/proxy/gateway/schemas/__init__.py +0 -0
  252. dstack/_internal/proxy/gateway/schemas/common.py +5 -0
  253. dstack/_internal/proxy/gateway/schemas/config.py +9 -0
  254. dstack/_internal/proxy/gateway/schemas/registry.py +63 -0
  255. dstack/_internal/proxy/gateway/schemas/stats.py +15 -0
  256. dstack/_internal/proxy/gateway/services/__init__.py +0 -0
  257. dstack/_internal/proxy/gateway/services/model_routers/__init__.py +18 -0
  258. dstack/_internal/proxy/gateway/services/model_routers/base.py +91 -0
  259. dstack/_internal/proxy/gateway/services/model_routers/sglang.py +269 -0
  260. dstack/_internal/proxy/gateway/services/nginx.py +455 -0
  261. dstack/_internal/proxy/gateway/services/registry.py +426 -0
  262. dstack/_internal/proxy/gateway/services/server_client.py +95 -0
  263. dstack/_internal/proxy/gateway/services/stats.py +170 -0
  264. dstack/_internal/proxy/gateway/testing/__init__.py +0 -0
  265. dstack/_internal/proxy/gateway/testing/common.py +13 -0
  266. dstack/_internal/proxy/lib/__init__.py +0 -0
  267. dstack/_internal/proxy/lib/auth.py +7 -0
  268. dstack/_internal/proxy/lib/deps.py +106 -0
  269. dstack/_internal/proxy/lib/errors.py +14 -0
  270. dstack/_internal/proxy/lib/models.py +112 -0
  271. dstack/_internal/proxy/lib/repo.py +27 -0
  272. dstack/_internal/proxy/lib/routers/__init__.py +0 -0
  273. dstack/_internal/proxy/lib/routers/model_proxy.py +102 -0
  274. dstack/_internal/proxy/lib/schemas/__init__.py +0 -0
  275. dstack/_internal/proxy/lib/schemas/model_proxy.py +77 -0
  276. dstack/_internal/proxy/lib/services/__init__.py +0 -0
  277. dstack/_internal/proxy/lib/services/model_proxy/__init__.py +0 -0
  278. dstack/_internal/proxy/lib/services/model_proxy/clients/__init__.py +0 -0
  279. dstack/_internal/proxy/lib/services/model_proxy/clients/base.py +18 -0
  280. dstack/_internal/proxy/lib/services/model_proxy/clients/openai.py +67 -0
  281. dstack/_internal/proxy/lib/services/model_proxy/clients/tgi.py +208 -0
  282. dstack/_internal/proxy/lib/services/model_proxy/model_proxy.py +23 -0
  283. dstack/_internal/proxy/lib/services/service_connection.py +160 -0
  284. dstack/_internal/proxy/lib/testing/__init__.py +0 -0
  285. dstack/_internal/proxy/lib/testing/auth.py +11 -0
  286. dstack/_internal/proxy/lib/testing/common.py +51 -0
  287. dstack/_internal/server/__init__.py +0 -0
  288. dstack/_internal/server/alembic.ini +100 -0
  289. dstack/_internal/server/app.py +432 -0
  290. dstack/_internal/server/background/__init__.py +142 -0
  291. dstack/_internal/server/background/tasks/__init__.py +0 -0
  292. dstack/_internal/server/background/tasks/common.py +24 -0
  293. dstack/_internal/server/background/tasks/process_compute_groups.py +167 -0
  294. dstack/_internal/server/background/tasks/process_events.py +17 -0
  295. dstack/_internal/server/background/tasks/process_fleets.py +289 -0
  296. dstack/_internal/server/background/tasks/process_gateways.py +188 -0
  297. dstack/_internal/server/background/tasks/process_idle_volumes.py +145 -0
  298. dstack/_internal/server/background/tasks/process_instances.py +1186 -0
  299. dstack/_internal/server/background/tasks/process_metrics.py +172 -0
  300. dstack/_internal/server/background/tasks/process_placement_groups.py +104 -0
  301. dstack/_internal/server/background/tasks/process_probes.py +164 -0
  302. dstack/_internal/server/background/tasks/process_prometheus_metrics.py +150 -0
  303. dstack/_internal/server/background/tasks/process_running_jobs.py +1238 -0
  304. dstack/_internal/server/background/tasks/process_runs.py +842 -0
  305. dstack/_internal/server/background/tasks/process_submitted_jobs.py +1106 -0
  306. dstack/_internal/server/background/tasks/process_terminating_jobs.py +108 -0
  307. dstack/_internal/server/background/tasks/process_volumes.py +129 -0
  308. dstack/_internal/server/compatibility/__init__.py +0 -0
  309. dstack/_internal/server/compatibility/common.py +20 -0
  310. dstack/_internal/server/compatibility/gpus.py +22 -0
  311. dstack/_internal/server/db.py +127 -0
  312. dstack/_internal/server/deps.py +19 -0
  313. dstack/_internal/server/main.py +4 -0
  314. dstack/_internal/server/migrations/__init__.py +0 -0
  315. dstack/_internal/server/migrations/env.py +112 -0
  316. dstack/_internal/server/migrations/script.py.mako +28 -0
  317. dstack/_internal/server/migrations/versions/006512f572b4_add_projects_original_name.py +38 -0
  318. dstack/_internal/server/migrations/versions/065588ec72b8_add_vultr_to_backendtype_enum.py +81 -0
  319. dstack/_internal/server/migrations/versions/06e977bc61c7_add_usermodel_deleted_and_original_name.py +45 -0
  320. dstack/_internal/server/migrations/versions/0e33559e16ed_update_instancestatus.py +64 -0
  321. dstack/_internal/server/migrations/versions/112753bc17dd_remove_nullable_fields.py +50 -0
  322. dstack/_internal/server/migrations/versions/1338b788b612_reverse_job_instance_relationship.py +71 -0
  323. dstack/_internal/server/migrations/versions/14f2cb002fc2_add_jobmodel_removed_flag.py +44 -0
  324. dstack/_internal/server/migrations/versions/1a48dfe44a40_rework_termination_handling.py +42 -0
  325. dstack/_internal/server/migrations/versions/1aa9638ad963_added_email_index.py +31 -0
  326. dstack/_internal/server/migrations/versions/1e3fb39ef74b_add_remote_connection_details.py +26 -0
  327. dstack/_internal/server/migrations/versions/1e76fb0dde87_add_jobmodel_inactivity_secs.py +32 -0
  328. dstack/_internal/server/migrations/versions/20166748b60c_add_jobmodel_disconnected_at.py +100 -0
  329. dstack/_internal/server/migrations/versions/22d74df9897e_add_events_and_event_targets.py +99 -0
  330. dstack/_internal/server/migrations/versions/23e01c56279a_make_blob_nullable.py +32 -0
  331. dstack/_internal/server/migrations/versions/2498ab323443_add_fleetmodel_consolidation_attempt_.py +44 -0
  332. dstack/_internal/server/migrations/versions/252d3743b641_.py +40 -0
  333. dstack/_internal/server/migrations/versions/25479f540245_add_probes.py +43 -0
  334. dstack/_internal/server/migrations/versions/27d3e55759fa_add_pools.py +152 -0
  335. dstack/_internal/server/migrations/versions/29826f417010_remove_instancemodel_retry_policy.py +34 -0
  336. dstack/_internal/server/migrations/versions/29c08c6a8cb3_.py +36 -0
  337. dstack/_internal/server/migrations/versions/35e90e1b0d3e_add_rolling_deployment_fields.py +42 -0
  338. dstack/_internal/server/migrations/versions/35f732ee4cf5_add_projectmodel_is_public.py +39 -0
  339. dstack/_internal/server/migrations/versions/3cf77fb8bcf1_store_repo_clone_url.py +85 -0
  340. dstack/_internal/server/migrations/versions/3d7f6c2ec000_add_jobmodel_registered.py +28 -0
  341. dstack/_internal/server/migrations/versions/3dbdce90d0e0_fix_code_uq_constraint.py +33 -0
  342. dstack/_internal/server/migrations/versions/48ad3ecbaea2_do_not_delete_projects_and_runs.py +46 -0
  343. dstack/_internal/server/migrations/versions/4ae1a5b0e7f1_add_run_list_index.py +34 -0
  344. dstack/_internal/server/migrations/versions/4b4319398164_introduce_runs_processing.py +144 -0
  345. dstack/_internal/server/migrations/versions/50dd7ea98639_index_status_columns.py +55 -0
  346. dstack/_internal/server/migrations/versions/51d45659d574_add_instancemodel_blocks_fields.py +43 -0
  347. dstack/_internal/server/migrations/versions/54a77e19c64c_add_manager_project_role.py +67 -0
  348. dstack/_internal/server/migrations/versions/555138b1f77f_change_instancemodel_for_asynchronous_.py +61 -0
  349. dstack/_internal/server/migrations/versions/58aa5162dcc3_add_gatewaymodel_configuration.py +32 -0
  350. dstack/_internal/server/migrations/versions/5ad8debc8fe6_fixes_for_psql.py +329 -0
  351. dstack/_internal/server/migrations/versions/5ec538b70e71_replace_instansestatus.py +31 -0
  352. dstack/_internal/server/migrations/versions/5f1707c525d2_add_filearchivemodel.py +39 -0
  353. dstack/_internal/server/migrations/versions/5fd659afca82_add_ix_instances_fleet_id.py +31 -0
  354. dstack/_internal/server/migrations/versions/60e444118b6d_add_jobprometheusmetrics.py +40 -0
  355. dstack/_internal/server/migrations/versions/63c3f19cb184_add_jobterminationreason_inactivity_.py +83 -0
  356. dstack/_internal/server/migrations/versions/644b8a114187_add_secretmodel.py +49 -0
  357. dstack/_internal/server/migrations/versions/686fb8341ea5_add_user_emails.py +32 -0
  358. dstack/_internal/server/migrations/versions/6c1a9d6530ee_add_jobmodel_exit_status.py +26 -0
  359. dstack/_internal/server/migrations/versions/706e0acc3a7d_add_runmodel_desired_replica_counts.py +26 -0
  360. dstack/_internal/server/migrations/versions/710e5b3fac8f_add_encryption.py +54 -0
  361. dstack/_internal/server/migrations/versions/728b1488b1b4_add_instance_health.py +50 -0
  362. dstack/_internal/server/migrations/versions/74a1f55209bd_store_enums_as_strings.py +484 -0
  363. dstack/_internal/server/migrations/versions/7b24b1c8eba7_add_instancemodel_last_processed_at.py +68 -0
  364. dstack/_internal/server/migrations/versions/7ba3b59d7ca6_add_runmodel_resubmission_attempt.py +35 -0
  365. dstack/_internal/server/migrations/versions/7bc2586e8b9e_make_instancemodel_pool_id_optional.py +36 -0
  366. dstack/_internal/server/migrations/versions/7d1ec2b920ac_add_computegroupmodel.py +91 -0
  367. dstack/_internal/server/migrations/versions/803c7e9ed85d_add_jobmodel_job_runtime_data.py +32 -0
  368. dstack/_internal/server/migrations/versions/82b32a135ea2_.py +58 -0
  369. dstack/_internal/server/migrations/versions/866ec1d67184_replace_retrypolicy_limit_with_.py +93 -0
  370. dstack/_internal/server/migrations/versions/903c91e24634_add_instances_termination_reason_message.py +34 -0
  371. dstack/_internal/server/migrations/versions/91a12fff6c76_add_repocredsmodel.py +43 -0
  372. dstack/_internal/server/migrations/versions/91ac5e543037_extend_repos_creds_column.py +36 -0
  373. dstack/_internal/server/migrations/versions/98cd9c8b5927_add_volumemodel.py +73 -0
  374. dstack/_internal/server/migrations/versions/98d1b92988bc_add_jobterminationreason_terminated_due_.py +140 -0
  375. dstack/_internal/server/migrations/versions/99b4c8c954ea_add_termination_reason_message.py +71 -0
  376. dstack/_internal/server/migrations/versions/9eea6af28e10_added_fail_reason_for_instancemodel.py +36 -0
  377. dstack/_internal/server/migrations/versions/__init__.py +0 -0
  378. dstack/_internal/server/migrations/versions/a060e2440936_.py +206 -0
  379. dstack/_internal/server/migrations/versions/a751ef183f27_move_attachment_data_to_volumes_.py +34 -0
  380. dstack/_internal/server/migrations/versions/a7b46c073fa1_add_placementgroupmodel.py +58 -0
  381. dstack/_internal/server/migrations/versions/afbc600ff2b2_add_created_at_to_usermodel_and_.py +102 -0
  382. dstack/_internal/server/migrations/versions/b4d6ad60db08_add_instancemodel_unreachable.py +37 -0
  383. dstack/_internal/server/migrations/versions/b88d55c2a07d_replace_instancestatus_ready.py +21 -0
  384. dstack/_internal/server/migrations/versions/bc8ca4a505c6_store_backendtype_as_string.py +171 -0
  385. dstack/_internal/server/migrations/versions/bca2fdf130bf_add_runmodel_priority.py +34 -0
  386. dstack/_internal/server/migrations/versions/bfba43f6def2_.py +32 -0
  387. dstack/_internal/server/migrations/versions/c00090eaef21_support_fleets.py +108 -0
  388. dstack/_internal/server/migrations/versions/c154eece89da_add_fields_for_async_gateway_creation.py +74 -0
  389. dstack/_internal/server/migrations/versions/c20626d03cfb_add_jobmetricspoint.py +43 -0
  390. dstack/_internal/server/migrations/versions/c48df7985d57_add_instance_termination_retries.py +38 -0
  391. dstack/_internal/server/migrations/versions/c83d45f9a971_replace_string_with_text.py +150 -0
  392. dstack/_internal/server/migrations/versions/d0bb68e48b9f_add_project_owners_and_quotas.py +106 -0
  393. dstack/_internal/server/migrations/versions/d3e8af4786fa_gateway_compute_flag_deleted.py +34 -0
  394. dstack/_internal/server/migrations/versions/d4d9dc26cf58_add_ix_jobs_run_id.py +31 -0
  395. dstack/_internal/server/migrations/versions/d5863798bf41_add_volumemodel_last_job_processed_at.py +40 -0
  396. dstack/_internal/server/migrations/versions/d6b11105f659_add_usermodel_active.py +36 -0
  397. dstack/_internal/server/migrations/versions/da574e93fee0_add_jobmodel_volumes_detached_at.py +40 -0
  398. dstack/_internal/server/migrations/versions/dfffd6a1165c_add_fields_for_gateways_behind_alb.py +36 -0
  399. dstack/_internal/server/migrations/versions/e2d08cd1b8d9_add_jobmodel_fleet.py +41 -0
  400. dstack/_internal/server/migrations/versions/e3b7db07727f_add_gatewaycomputemodel_app_updated_at.py +61 -0
  401. dstack/_internal/server/migrations/versions/e6391ca6a264_separate_gateways_from_compute.py +72 -0
  402. dstack/_internal/server/migrations/versions/ea60480f82bb_add_membermodel_member_num.py +32 -0
  403. dstack/_internal/server/migrations/versions/ec02a26a256c_add_runmodel_next_triggered_at.py +38 -0
  404. dstack/_internal/server/migrations/versions/ed0ca30e13bb_migrate_instancestatus_provisioning.py +29 -0
  405. dstack/_internal/server/migrations/versions/fe72c4de8376_add_gateways.py +81 -0
  406. dstack/_internal/server/migrations/versions/ff1d94f65b08_user_ssh_key.py +34 -0
  407. dstack/_internal/server/migrations/versions/ffa99edd1988_add_jobterminationreason_max_duration_.py +81 -0
  408. dstack/_internal/server/models.py +930 -0
  409. dstack/_internal/server/routers/__init__.py +0 -0
  410. dstack/_internal/server/routers/auth.py +34 -0
  411. dstack/_internal/server/routers/backends.py +142 -0
  412. dstack/_internal/server/routers/events.py +60 -0
  413. dstack/_internal/server/routers/files.py +68 -0
  414. dstack/_internal/server/routers/fleets.py +202 -0
  415. dstack/_internal/server/routers/gateways.py +109 -0
  416. dstack/_internal/server/routers/gpus.py +32 -0
  417. dstack/_internal/server/routers/instances.py +77 -0
  418. dstack/_internal/server/routers/logs.py +34 -0
  419. dstack/_internal/server/routers/metrics.py +82 -0
  420. dstack/_internal/server/routers/projects.py +205 -0
  421. dstack/_internal/server/routers/prometheus.py +35 -0
  422. dstack/_internal/server/routers/repos.py +118 -0
  423. dstack/_internal/server/routers/runs.py +216 -0
  424. dstack/_internal/server/routers/secrets.py +86 -0
  425. dstack/_internal/server/routers/server.py +19 -0
  426. dstack/_internal/server/routers/users.py +158 -0
  427. dstack/_internal/server/routers/volumes.py +122 -0
  428. dstack/_internal/server/schemas/__init__.py +0 -0
  429. dstack/_internal/server/schemas/auth.py +83 -0
  430. dstack/_internal/server/schemas/backends.py +16 -0
  431. dstack/_internal/server/schemas/common.py +9 -0
  432. dstack/_internal/server/schemas/events.py +211 -0
  433. dstack/_internal/server/schemas/files.py +5 -0
  434. dstack/_internal/server/schemas/fleets.py +49 -0
  435. dstack/_internal/server/schemas/gateways.py +31 -0
  436. dstack/_internal/server/schemas/gpus.py +26 -0
  437. dstack/_internal/server/schemas/health/__init__.py +0 -0
  438. dstack/_internal/server/schemas/health/dcgm.py +56 -0
  439. dstack/_internal/server/schemas/instances.py +47 -0
  440. dstack/_internal/server/schemas/logs.py +17 -0
  441. dstack/_internal/server/schemas/projects.py +81 -0
  442. dstack/_internal/server/schemas/repos.py +24 -0
  443. dstack/_internal/server/schemas/runner.py +269 -0
  444. dstack/_internal/server/schemas/runs.py +66 -0
  445. dstack/_internal/server/schemas/secrets.py +16 -0
  446. dstack/_internal/server/schemas/users.py +72 -0
  447. dstack/_internal/server/schemas/volumes.py +29 -0
  448. dstack/_internal/server/security/__init__.py +0 -0
  449. dstack/_internal/server/security/permissions.py +251 -0
  450. dstack/_internal/server/services/__init__.py +0 -0
  451. dstack/_internal/server/services/auth.py +77 -0
  452. dstack/_internal/server/services/backends/__init__.py +404 -0
  453. dstack/_internal/server/services/backends/handlers.py +105 -0
  454. dstack/_internal/server/services/compute_groups.py +22 -0
  455. dstack/_internal/server/services/config.py +279 -0
  456. dstack/_internal/server/services/docker.py +162 -0
  457. dstack/_internal/server/services/encryption/__init__.py +102 -0
  458. dstack/_internal/server/services/encryption/keys/__init__.py +0 -0
  459. dstack/_internal/server/services/encryption/keys/aes.py +68 -0
  460. dstack/_internal/server/services/encryption/keys/base.py +19 -0
  461. dstack/_internal/server/services/encryption/keys/identity.py +28 -0
  462. dstack/_internal/server/services/events.py +477 -0
  463. dstack/_internal/server/services/files.py +91 -0
  464. dstack/_internal/server/services/fleets.py +1224 -0
  465. dstack/_internal/server/services/gateways/__init__.py +686 -0
  466. dstack/_internal/server/services/gateways/client.py +209 -0
  467. dstack/_internal/server/services/gateways/connection.py +139 -0
  468. dstack/_internal/server/services/gateways/pool.py +58 -0
  469. dstack/_internal/server/services/gpus.py +387 -0
  470. dstack/_internal/server/services/instances.py +731 -0
  471. dstack/_internal/server/services/jobs/__init__.py +840 -0
  472. dstack/_internal/server/services/jobs/configurators/__init__.py +0 -0
  473. dstack/_internal/server/services/jobs/configurators/base.py +469 -0
  474. dstack/_internal/server/services/jobs/configurators/dev.py +69 -0
  475. dstack/_internal/server/services/jobs/configurators/extensions/__init__.py +0 -0
  476. dstack/_internal/server/services/jobs/configurators/extensions/base.py +15 -0
  477. dstack/_internal/server/services/jobs/configurators/extensions/cursor.py +42 -0
  478. dstack/_internal/server/services/jobs/configurators/extensions/vscode.py +42 -0
  479. dstack/_internal/server/services/jobs/configurators/extensions/windsurf.py +43 -0
  480. dstack/_internal/server/services/jobs/configurators/service.py +28 -0
  481. dstack/_internal/server/services/jobs/configurators/task.py +39 -0
  482. dstack/_internal/server/services/locking.py +187 -0
  483. dstack/_internal/server/services/logging.py +29 -0
  484. dstack/_internal/server/services/logs/__init__.py +122 -0
  485. dstack/_internal/server/services/logs/aws.py +373 -0
  486. dstack/_internal/server/services/logs/base.py +47 -0
  487. dstack/_internal/server/services/logs/filelog.py +261 -0
  488. dstack/_internal/server/services/logs/fluentbit.py +329 -0
  489. dstack/_internal/server/services/logs/gcp.py +181 -0
  490. dstack/_internal/server/services/metrics.py +172 -0
  491. dstack/_internal/server/services/offers.py +249 -0
  492. dstack/_internal/server/services/permissions.py +37 -0
  493. dstack/_internal/server/services/placement.py +234 -0
  494. dstack/_internal/server/services/plugins.py +109 -0
  495. dstack/_internal/server/services/probes.py +10 -0
  496. dstack/_internal/server/services/projects.py +835 -0
  497. dstack/_internal/server/services/prometheus/__init__.py +0 -0
  498. dstack/_internal/server/services/prometheus/client_metrics.py +55 -0
  499. dstack/_internal/server/services/prometheus/custom_metrics.py +327 -0
  500. dstack/_internal/server/services/proxy/__init__.py +3 -0
  501. dstack/_internal/server/services/proxy/auth.py +12 -0
  502. dstack/_internal/server/services/proxy/deps.py +18 -0
  503. dstack/_internal/server/services/proxy/repo.py +189 -0
  504. dstack/_internal/server/services/proxy/routers/__init__.py +0 -0
  505. dstack/_internal/server/services/proxy/routers/service_proxy.py +49 -0
  506. dstack/_internal/server/services/proxy/services/__init__.py +0 -0
  507. dstack/_internal/server/services/proxy/services/service_proxy.py +135 -0
  508. dstack/_internal/server/services/repos.py +362 -0
  509. dstack/_internal/server/services/requirements/__init__.py +0 -0
  510. dstack/_internal/server/services/requirements/combine.py +260 -0
  511. dstack/_internal/server/services/resources.py +21 -0
  512. dstack/_internal/server/services/runner/__init__.py +0 -0
  513. dstack/_internal/server/services/runner/client.py +646 -0
  514. dstack/_internal/server/services/runner/ssh.py +128 -0
  515. dstack/_internal/server/services/runs/__init__.py +1026 -0
  516. dstack/_internal/server/services/runs/plan.py +703 -0
  517. dstack/_internal/server/services/runs/replicas.py +317 -0
  518. dstack/_internal/server/services/runs/spec.py +191 -0
  519. dstack/_internal/server/services/secrets.py +245 -0
  520. dstack/_internal/server/services/services/__init__.py +345 -0
  521. dstack/_internal/server/services/services/autoscalers.py +140 -0
  522. dstack/_internal/server/services/services/options.py +53 -0
  523. dstack/_internal/server/services/ssh.py +67 -0
  524. dstack/_internal/server/services/storage/__init__.py +37 -0
  525. dstack/_internal/server/services/storage/base.py +48 -0
  526. dstack/_internal/server/services/storage/gcs.py +66 -0
  527. dstack/_internal/server/services/storage/s3.py +69 -0
  528. dstack/_internal/server/services/users.py +461 -0
  529. dstack/_internal/server/services/volumes.py +496 -0
  530. dstack/_internal/server/settings.py +161 -0
  531. dstack/_internal/server/statics/00a6e1fb461ed2929fb9.png +0 -0
  532. dstack/_internal/server/statics/0cae4d9f0a36034984a7.png +0 -0
  533. dstack/_internal/server/statics/391de232cc0e30cae513.png +0 -0
  534. dstack/_internal/server/statics/4e0eead8c1a73689ef9d.svg +1 -0
  535. dstack/_internal/server/statics/544afa2f63428c2235b0.png +0 -0
  536. dstack/_internal/server/statics/54a4f50f74c6b9381530.svg +7 -0
  537. dstack/_internal/server/statics/68dd1360a7d2611e0132.svg +4 -0
  538. dstack/_internal/server/statics/69544b4c81973b54a66f.png +0 -0
  539. dstack/_internal/server/statics/77a8b02b17af19e39266.png +0 -0
  540. dstack/_internal/server/statics/83a93a8871c219104367.svg +9 -0
  541. dstack/_internal/server/statics/8f28bb8e9999e5e6a48b.svg +4 -0
  542. dstack/_internal/server/statics/9124086961ab8c366bc4.svg +9 -0
  543. dstack/_internal/server/statics/9a9ebaeb54b025dbac0a.svg +5 -0
  544. dstack/_internal/server/statics/a3428392dc534f3b15c4.svg +7 -0
  545. dstack/_internal/server/statics/ae22625574d69361f72c.png +0 -0
  546. dstack/_internal/server/statics/assets/android-chrome-144x144.png +0 -0
  547. dstack/_internal/server/statics/assets/android-chrome-192x192.png +0 -0
  548. dstack/_internal/server/statics/assets/android-chrome-256x256.png +0 -0
  549. dstack/_internal/server/statics/assets/android-chrome-36x36.png +0 -0
  550. dstack/_internal/server/statics/assets/android-chrome-384x384.png +0 -0
  551. dstack/_internal/server/statics/assets/android-chrome-48x48.png +0 -0
  552. dstack/_internal/server/statics/assets/android-chrome-512x512.png +0 -0
  553. dstack/_internal/server/statics/assets/android-chrome-72x72.png +0 -0
  554. dstack/_internal/server/statics/assets/android-chrome-96x96.png +0 -0
  555. dstack/_internal/server/statics/assets/apple-touch-icon-1024x1024.png +0 -0
  556. dstack/_internal/server/statics/assets/apple-touch-icon-114x114.png +0 -0
  557. dstack/_internal/server/statics/assets/apple-touch-icon-120x120.png +0 -0
  558. dstack/_internal/server/statics/assets/apple-touch-icon-144x144.png +0 -0
  559. dstack/_internal/server/statics/assets/apple-touch-icon-152x152.png +0 -0
  560. dstack/_internal/server/statics/assets/apple-touch-icon-167x167.png +0 -0
  561. dstack/_internal/server/statics/assets/apple-touch-icon-180x180.png +0 -0
  562. dstack/_internal/server/statics/assets/apple-touch-icon-57x57.png +0 -0
  563. dstack/_internal/server/statics/assets/apple-touch-icon-60x60.png +0 -0
  564. dstack/_internal/server/statics/assets/apple-touch-icon-72x72.png +0 -0
  565. dstack/_internal/server/statics/assets/apple-touch-icon-76x76.png +0 -0
  566. dstack/_internal/server/statics/assets/apple-touch-icon-precomposed.png +0 -0
  567. dstack/_internal/server/statics/assets/apple-touch-icon.png +0 -0
  568. dstack/_internal/server/statics/assets/apple-touch-startup-image-1125x2436.png +0 -0
  569. dstack/_internal/server/statics/assets/apple-touch-startup-image-1136x640.png +0 -0
  570. dstack/_internal/server/statics/assets/apple-touch-startup-image-1170x2532.png +0 -0
  571. dstack/_internal/server/statics/assets/apple-touch-startup-image-1179x2556.png +0 -0
  572. dstack/_internal/server/statics/assets/apple-touch-startup-image-1242x2208.png +0 -0
  573. dstack/_internal/server/statics/assets/apple-touch-startup-image-1242x2688.png +0 -0
  574. dstack/_internal/server/statics/assets/apple-touch-startup-image-1284x2778.png +0 -0
  575. dstack/_internal/server/statics/assets/apple-touch-startup-image-1290x2796.png +0 -0
  576. dstack/_internal/server/statics/assets/apple-touch-startup-image-1334x750.png +0 -0
  577. dstack/_internal/server/statics/assets/apple-touch-startup-image-1488x2266.png +0 -0
  578. dstack/_internal/server/statics/assets/apple-touch-startup-image-1536x2048.png +0 -0
  579. dstack/_internal/server/statics/assets/apple-touch-startup-image-1620x2160.png +0 -0
  580. dstack/_internal/server/statics/assets/apple-touch-startup-image-1640x2160.png +0 -0
  581. dstack/_internal/server/statics/assets/apple-touch-startup-image-1668x2224.png +0 -0
  582. dstack/_internal/server/statics/assets/apple-touch-startup-image-1668x2388.png +0 -0
  583. dstack/_internal/server/statics/assets/apple-touch-startup-image-1792x828.png +0 -0
  584. dstack/_internal/server/statics/assets/apple-touch-startup-image-2048x1536.png +0 -0
  585. dstack/_internal/server/statics/assets/apple-touch-startup-image-2048x2732.png +0 -0
  586. dstack/_internal/server/statics/assets/apple-touch-startup-image-2160x1620.png +0 -0
  587. dstack/_internal/server/statics/assets/apple-touch-startup-image-2160x1640.png +0 -0
  588. dstack/_internal/server/statics/assets/apple-touch-startup-image-2208x1242.png +0 -0
  589. dstack/_internal/server/statics/assets/apple-touch-startup-image-2224x1668.png +0 -0
  590. dstack/_internal/server/statics/assets/apple-touch-startup-image-2266x1488.png +0 -0
  591. dstack/_internal/server/statics/assets/apple-touch-startup-image-2388x1668.png +0 -0
  592. dstack/_internal/server/statics/assets/apple-touch-startup-image-2436x1125.png +0 -0
  593. dstack/_internal/server/statics/assets/apple-touch-startup-image-2532x1170.png +0 -0
  594. dstack/_internal/server/statics/assets/apple-touch-startup-image-2556x1179.png +0 -0
  595. dstack/_internal/server/statics/assets/apple-touch-startup-image-2688x1242.png +0 -0
  596. dstack/_internal/server/statics/assets/apple-touch-startup-image-2732x2048.png +0 -0
  597. dstack/_internal/server/statics/assets/apple-touch-startup-image-2778x1284.png +0 -0
  598. dstack/_internal/server/statics/assets/apple-touch-startup-image-2796x1290.png +0 -0
  599. dstack/_internal/server/statics/assets/apple-touch-startup-image-640x1136.png +0 -0
  600. dstack/_internal/server/statics/assets/apple-touch-startup-image-750x1334.png +0 -0
  601. dstack/_internal/server/statics/assets/apple-touch-startup-image-828x1792.png +0 -0
  602. dstack/_internal/server/statics/assets/browserconfig.xml +12 -0
  603. dstack/_internal/server/statics/assets/favicon-16x16.png +0 -0
  604. dstack/_internal/server/statics/assets/favicon-32x32.png +0 -0
  605. dstack/_internal/server/statics/assets/favicon-48x48.png +0 -0
  606. dstack/_internal/server/statics/assets/favicon.ico +0 -0
  607. dstack/{dashboard/statics/assets/manifest.json → _internal/server/statics/assets/manifest.webmanifest} +18 -9
  608. dstack/_internal/server/statics/assets/mstile-144x144.png +0 -0
  609. dstack/_internal/server/statics/assets/mstile-150x150.png +0 -0
  610. dstack/_internal/server/statics/assets/mstile-310x150.png +0 -0
  611. dstack/_internal/server/statics/assets/mstile-310x310.png +0 -0
  612. dstack/_internal/server/statics/assets/mstile-70x70.png +0 -0
  613. dstack/_internal/server/statics/assets/yandex-browser-50x50.png +0 -0
  614. dstack/_internal/server/statics/b7ae68f44193474fc578.png +0 -0
  615. dstack/_internal/server/statics/d2f008c75b2b5b191f3f.png +0 -0
  616. dstack/_internal/server/statics/d44c33e1b92e05c379fd.png +0 -0
  617. dstack/_internal/server/statics/dd43ff0552815179d7ab.png +0 -0
  618. dstack/_internal/server/statics/dd4e7166c0b9aac197d7.png +0 -0
  619. dstack/_internal/server/statics/e30b27916930d43d2271.png +0 -0
  620. dstack/_internal/server/statics/e467d7d60aae81ab198b.svg +6 -0
  621. dstack/_internal/server/statics/eb9b344b73818fe2b71a.png +0 -0
  622. dstack/_internal/server/statics/f517dd626eb964120de0.png +0 -0
  623. dstack/_internal/server/statics/f958aecddee5d8e3222c.png +0 -0
  624. dstack/_internal/server/statics/index.html +3 -0
  625. dstack/_internal/server/statics/logo-notext.svg +116 -0
  626. dstack/_internal/server/statics/main-2e6967bad9f29395eea6.css +3 -0
  627. dstack/_internal/server/statics/main-7dc0f6d20b8b41659acc.js +155547 -0
  628. dstack/_internal/server/statics/main-7dc0f6d20b8b41659acc.js.map +1 -0
  629. dstack/{dashboard → _internal/server}/statics/manifest.json +2 -2
  630. dstack/_internal/server/statics/static/media/entraID.d65d1f3e9486a8e56d24fc07b3230885.svg +9 -0
  631. dstack/_internal/server/statics/static/media/google.b194b06fafd0a52aeb566922160ea514.svg +1 -0
  632. dstack/{dashboard/statics/static/media/logo.f9d7170678f68f796e270698633770ec.svg → _internal/server/statics/static/media/logo.f602feeb138844eda97c8cb641461448.svg} +8 -6
  633. dstack/_internal/server/statics/static/media/okta.12f178e6873a1100965f2a4dbd18fcec.svg +2 -0
  634. dstack/_internal/server/statics/static/media/theme.3994c817bb7dda191c1c9640dee0bf42.svg +3 -0
  635. dstack/_internal/server/testing/__init__.py +0 -0
  636. dstack/_internal/server/testing/common.py +1220 -0
  637. dstack/_internal/server/testing/conf.py +53 -0
  638. dstack/_internal/server/testing/matchers.py +31 -0
  639. dstack/_internal/server/utils/__init__.py +0 -0
  640. dstack/_internal/server/utils/common.py +55 -0
  641. dstack/_internal/server/utils/logging.py +51 -0
  642. dstack/_internal/server/utils/provisioning.py +368 -0
  643. dstack/_internal/server/utils/routers.py +166 -0
  644. dstack/_internal/server/utils/sentry_utils.py +24 -0
  645. dstack/_internal/settings.py +49 -0
  646. dstack/_internal/utils/__init__.py +0 -0
  647. dstack/_internal/utils/common.py +318 -0
  648. dstack/_internal/utils/cron.py +5 -0
  649. dstack/_internal/utils/crypto.py +40 -0
  650. dstack/_internal/utils/env.py +88 -0
  651. dstack/_internal/utils/event_loop.py +30 -0
  652. dstack/_internal/utils/files.py +69 -0
  653. dstack/_internal/utils/gpu.py +59 -0
  654. dstack/_internal/utils/hash.py +31 -0
  655. dstack/_internal/utils/interpolator.py +91 -0
  656. dstack/_internal/utils/json_schema.py +11 -0
  657. dstack/_internal/utils/json_utils.py +54 -0
  658. dstack/_internal/utils/logging.py +5 -0
  659. dstack/_internal/utils/nested_list.py +47 -0
  660. dstack/_internal/utils/network.py +50 -0
  661. dstack/_internal/utils/path.py +57 -0
  662. dstack/_internal/utils/random_names.py +258 -0
  663. dstack/_internal/utils/ssh.py +346 -0
  664. dstack/_internal/utils/tags.py +42 -0
  665. dstack/_internal/utils/typing.py +14 -0
  666. dstack/_internal/utils/version.py +22 -0
  667. dstack/api/__init__.py +46 -0
  668. dstack/api/_public/__init__.py +96 -0
  669. dstack/api/_public/backends.py +42 -0
  670. dstack/api/_public/common.py +5 -0
  671. dstack/api/_public/repos.py +202 -0
  672. dstack/api/_public/runs.py +714 -0
  673. dstack/api/server/__init__.py +206 -0
  674. dstack/api/server/_auth.py +30 -0
  675. dstack/api/server/_backends.py +38 -0
  676. dstack/api/server/_events.py +64 -0
  677. dstack/api/server/_files.py +18 -0
  678. dstack/api/server/_fleets.py +82 -0
  679. dstack/api/server/_gateways.py +54 -0
  680. dstack/api/server/_gpus.py +27 -0
  681. dstack/api/server/_group.py +22 -0
  682. dstack/api/server/_logs.py +15 -0
  683. dstack/api/server/_metrics.py +23 -0
  684. dstack/api/server/_projects.py +124 -0
  685. dstack/api/server/_repos.py +64 -0
  686. dstack/api/server/_runs.py +102 -0
  687. dstack/api/server/_secrets.py +36 -0
  688. dstack/api/server/_users.py +82 -0
  689. dstack/api/server/_volumes.py +39 -0
  690. dstack/api/server/utils.py +34 -0
  691. dstack/api/utils.py +105 -0
  692. dstack/core/__init__.py +0 -0
  693. dstack/plugins/__init__.py +8 -0
  694. dstack/plugins/_base.py +72 -0
  695. dstack/plugins/_models.py +8 -0
  696. dstack/plugins/_utils.py +19 -0
  697. dstack/plugins/builtin/__init__.py +0 -0
  698. dstack/plugins/builtin/rest_plugin/__init__.py +18 -0
  699. dstack/plugins/builtin/rest_plugin/_models.py +48 -0
  700. dstack/plugins/builtin/rest_plugin/_plugin.py +147 -0
  701. dstack/version.py +3 -1
  702. dstack-0.20.7.dist-info/METADATA +519 -0
  703. dstack-0.20.7.dist-info/RECORD +720 -0
  704. {dstack-0.0.9.dist-info → dstack-0.20.7.dist-info}/WHEEL +1 -2
  705. dstack-0.20.7.dist-info/entry_points.txt +2 -0
  706. dstack-0.20.7.dist-info/licenses/LICENSE.md +353 -0
  707. dstack/aws/__init__.py +0 -180
  708. dstack/aws/artifacts.py +0 -111
  709. dstack/aws/config.py +0 -40
  710. dstack/aws/jobs.py +0 -245
  711. dstack/aws/logs.py +0 -186
  712. dstack/aws/repos.py +0 -137
  713. dstack/aws/run_names.py +0 -17
  714. dstack/aws/runners.py +0 -693
  715. dstack/aws/runs.py +0 -79
  716. dstack/aws/secrets.py +0 -99
  717. dstack/aws/tags.py +0 -138
  718. dstack/backend.py +0 -299
  719. dstack/cli/app.py +0 -41
  720. dstack/cli/artifacts.py +0 -87
  721. dstack/cli/common.py +0 -57
  722. dstack/cli/config.py +0 -194
  723. dstack/cli/dashboard.py +0 -26
  724. dstack/cli/delete.py +0 -49
  725. dstack/cli/init.py +0 -33
  726. dstack/cli/logs.py +0 -87
  727. dstack/cli/main.py +0 -81
  728. dstack/cli/restart.py +0 -43
  729. dstack/cli/run.py +0 -223
  730. dstack/cli/schema.py +0 -46
  731. dstack/cli/secrets.py +0 -97
  732. dstack/cli/status.py +0 -140
  733. dstack/cli/stop.py +0 -53
  734. dstack/cli/tags.py +0 -100
  735. dstack/config.py +0 -80
  736. dstack/dashboard/artifacts.py +0 -26
  737. dstack/dashboard/logs.py +0 -73
  738. dstack/dashboard/main.py +0 -45
  739. dstack/dashboard/repos.py +0 -41
  740. dstack/dashboard/runs.py +0 -140
  741. dstack/dashboard/secrets.py +0 -53
  742. dstack/dashboard/statics/4d6a4e032505c1efd23c.png +0 -0
  743. dstack/dashboard/statics/7e018c3e5566d7c349a8.png +0 -0
  744. dstack/dashboard/statics/assets/android-chrome-144x144.png +0 -0
  745. dstack/dashboard/statics/assets/android-chrome-192x192.png +0 -0
  746. dstack/dashboard/statics/assets/android-chrome-256x256.png +0 -0
  747. dstack/dashboard/statics/assets/android-chrome-36x36.png +0 -0
  748. dstack/dashboard/statics/assets/android-chrome-384x384.png +0 -0
  749. dstack/dashboard/statics/assets/android-chrome-48x48.png +0 -0
  750. dstack/dashboard/statics/assets/android-chrome-512x512.png +0 -0
  751. dstack/dashboard/statics/assets/android-chrome-72x72.png +0 -0
  752. dstack/dashboard/statics/assets/android-chrome-96x96.png +0 -0
  753. dstack/dashboard/statics/assets/apple-touch-icon-1024x1024.png +0 -0
  754. dstack/dashboard/statics/assets/apple-touch-icon-114x114.png +0 -0
  755. dstack/dashboard/statics/assets/apple-touch-icon-120x120.png +0 -0
  756. dstack/dashboard/statics/assets/apple-touch-icon-144x144.png +0 -0
  757. dstack/dashboard/statics/assets/apple-touch-icon-152x152.png +0 -0
  758. dstack/dashboard/statics/assets/apple-touch-icon-167x167.png +0 -0
  759. dstack/dashboard/statics/assets/apple-touch-icon-180x180.png +0 -0
  760. dstack/dashboard/statics/assets/apple-touch-icon-57x57.png +0 -0
  761. dstack/dashboard/statics/assets/apple-touch-icon-60x60.png +0 -0
  762. dstack/dashboard/statics/assets/apple-touch-icon-72x72.png +0 -0
  763. dstack/dashboard/statics/assets/apple-touch-icon-76x76.png +0 -0
  764. dstack/dashboard/statics/assets/apple-touch-icon-precomposed.png +0 -0
  765. dstack/dashboard/statics/assets/apple-touch-icon.png +0 -0
  766. dstack/dashboard/statics/assets/apple-touch-startup-image-1125x2436.png +0 -0
  767. dstack/dashboard/statics/assets/apple-touch-startup-image-1136x640.png +0 -0
  768. dstack/dashboard/statics/assets/apple-touch-startup-image-1242x2208.png +0 -0
  769. dstack/dashboard/statics/assets/apple-touch-startup-image-1242x2688.png +0 -0
  770. dstack/dashboard/statics/assets/apple-touch-startup-image-1334x750.png +0 -0
  771. dstack/dashboard/statics/assets/apple-touch-startup-image-1536x2048.png +0 -0
  772. dstack/dashboard/statics/assets/apple-touch-startup-image-1620x2160.png +0 -0
  773. dstack/dashboard/statics/assets/apple-touch-startup-image-1668x2224.png +0 -0
  774. dstack/dashboard/statics/assets/apple-touch-startup-image-1668x2388.png +0 -0
  775. dstack/dashboard/statics/assets/apple-touch-startup-image-1792x828.png +0 -0
  776. dstack/dashboard/statics/assets/apple-touch-startup-image-2048x1536.png +0 -0
  777. dstack/dashboard/statics/assets/apple-touch-startup-image-2048x2732.png +0 -0
  778. dstack/dashboard/statics/assets/apple-touch-startup-image-2160x1620.png +0 -0
  779. dstack/dashboard/statics/assets/apple-touch-startup-image-2208x1242.png +0 -0
  780. dstack/dashboard/statics/assets/apple-touch-startup-image-2224x1668.png +0 -0
  781. dstack/dashboard/statics/assets/apple-touch-startup-image-2388x1668.png +0 -0
  782. dstack/dashboard/statics/assets/apple-touch-startup-image-2436x1125.png +0 -0
  783. dstack/dashboard/statics/assets/apple-touch-startup-image-2688x1242.png +0 -0
  784. dstack/dashboard/statics/assets/apple-touch-startup-image-2732x2048.png +0 -0
  785. dstack/dashboard/statics/assets/apple-touch-startup-image-640x1136.png +0 -0
  786. dstack/dashboard/statics/assets/apple-touch-startup-image-750x1334.png +0 -0
  787. dstack/dashboard/statics/assets/apple-touch-startup-image-828x1792.png +0 -0
  788. dstack/dashboard/statics/assets/browserconfig.xml +0 -15
  789. dstack/dashboard/statics/assets/coast-228x228.png +0 -0
  790. dstack/dashboard/statics/assets/favicon-16x16.png +0 -0
  791. dstack/dashboard/statics/assets/favicon-32x32.png +0 -0
  792. dstack/dashboard/statics/assets/favicon-48x48.png +0 -0
  793. dstack/dashboard/statics/assets/favicon.ico +0 -0
  794. dstack/dashboard/statics/assets/firefox_app_128x128.png +0 -0
  795. dstack/dashboard/statics/assets/firefox_app_512x512.png +0 -0
  796. dstack/dashboard/statics/assets/firefox_app_60x60.png +0 -0
  797. dstack/dashboard/statics/assets/manifest.webapp +0 -14
  798. dstack/dashboard/statics/assets/mstile-144x144.png +0 -0
  799. dstack/dashboard/statics/assets/mstile-150x150.png +0 -0
  800. dstack/dashboard/statics/assets/mstile-310x150.png +0 -0
  801. dstack/dashboard/statics/assets/mstile-310x310.png +0 -0
  802. dstack/dashboard/statics/assets/mstile-70x70.png +0 -0
  803. dstack/dashboard/statics/assets/yandex-browser-50x50.png +0 -0
  804. dstack/dashboard/statics/d0f71e48806e25d72553.png +0 -0
  805. dstack/dashboard/statics/index.html +0 -7
  806. dstack/dashboard/statics/main-1d87e34eb0454da8ebb4.js +0 -3
  807. dstack/dashboard/statics/main-1d87e34eb0454da8ebb4.js.LICENSE.txt +0 -102
  808. dstack/dashboard/statics/main-1d87e34eb0454da8ebb4.js.map +0 -1
  809. dstack/dashboard/statics/main.css +0 -5058
  810. dstack/dashboard/statics/splash_thumbnail.png +0 -0
  811. dstack/dashboard/statics/static/media/check.3f68ffc787a15c0476793a6d18ecb71a.svg +0 -3
  812. dstack/dashboard/statics/static/media/chevron-down.bfd8f22c4a5db4d443e76bca3b02f334.svg +0 -3
  813. dstack/dashboard/statics/static/media/chevron-up.bade0c5d82d741cead615813264140c9.svg +0 -3
  814. dstack/dashboard/statics/static/media/clock.583b744f29b9d143718a55e7c35fe38e.svg +0 -3
  815. dstack/dashboard/statics/static/media/close.a8bb9e47361b03a3b5084dad676ba1da.svg +0 -3
  816. dstack/dashboard/statics/static/media/content-copy.73f5f2a175094757758e315243a4111e.svg +0 -3
  817. dstack/dashboard/statics/static/media/delete-outline.6a8abf4e4f9cb777781967efd56efe9b.svg +0 -3
  818. dstack/dashboard/statics/static/media/dots-vertical.82fc618192e0c7dc4d615ff93269246a.svg +0 -3
  819. dstack/dashboard/statics/static/media/earth.1ad57c7f59f4be5c8bb2fa00439c3149.svg +0 -3
  820. dstack/dashboard/statics/static/media/email.320bc3af24a5f1bb41ebd85f66a5dd70.svg +0 -3
  821. dstack/dashboard/statics/static/media/external-link.99b88e699c15afb820a1779d9a2261ed.svg +0 -3
  822. dstack/dashboard/statics/static/media/eye-off-outline.5b4afb7ad624a44dd307518ff93d1faa.svg +0 -3
  823. dstack/dashboard/statics/static/media/eye-outline.ca41708feaaed1edb15c5fff021fbafe.svg +0 -3
  824. dstack/dashboard/statics/static/media/file-download-outline.3634b41923ba79b297ff294ef898661c.svg +0 -3
  825. dstack/dashboard/statics/static/media/folder-outline.33378387af61821dd1207e4b2d061a07.svg +0 -3
  826. dstack/dashboard/statics/static/media/github-circle.1bb85d171c31a3c2eebad07319377171.svg +0 -3
  827. dstack/dashboard/statics/static/media/infinity.915f92939afc0a37f94adba211ceb172.svg +0 -3
  828. dstack/dashboard/statics/static/media/layers.b4b02cea267a617d7aa44c2719250c89.svg +0 -3
  829. dstack/dashboard/statics/static/media/linkedin.1c52fae553eee54397f0e63a79455a5e.svg +0 -3
  830. dstack/dashboard/statics/static/media/loading.e466be7b2c1f0ac9e7e51ca929d0e37d.svg +0 -3
  831. dstack/dashboard/statics/static/media/lock.4a4c7768d0fa60c716609ddc483470ef.svg +0 -3
  832. dstack/dashboard/statics/static/media/magnify.0c803314d039d21f3cb1504ccd1437a4.svg +0 -3
  833. dstack/dashboard/statics/static/media/mark.3f68ffc787a15c0476793a6d18ecb71a.svg +0 -3
  834. dstack/dashboard/statics/static/media/menu-close.3ee84714181017c6ff837830297c8437.svg +0 -3
  835. dstack/dashboard/statics/static/media/menu.922f81e0972fbcbb5adcd8def20c86a3.svg +0 -3
  836. dstack/dashboard/statics/static/media/pencil.f706a3b9dcbff4959a91bf72e1e6324f.svg +0 -3
  837. dstack/dashboard/statics/static/media/refresh.a80edb948e98b322cd73b67814a57a48.svg +0 -3
  838. dstack/dashboard/statics/static/media/shape-plus.63b093c7f4b44c3def774f30fcfbceca.svg +0 -3
  839. dstack/dashboard/statics/static/media/slack.ec2fca99c6b944950ac65404ddd26880.svg +0 -4
  840. dstack/dashboard/statics/static/media/small-logo.b9cc8d09f646a553e65fa336dafd8b10.svg +0 -116
  841. dstack/dashboard/statics/static/media/source-branch.b8d22cfc42a7bed81f0fc08130818e85.svg +0 -3
  842. dstack/dashboard/statics/static/media/source-commit.be2bb53c081b9b6836adffccc0b8d3e6.svg +0 -3
  843. dstack/dashboard/statics/static/media/stop.11488ff1437ad929476be8924a3b7075.svg +0 -3
  844. dstack/dashboard/statics/static/media/tag-minus.15680a815b0b8d027e973c84832c05e6.svg +0 -3
  845. dstack/dashboard/statics/static/media/tag-outline.19b0bf86a8afd7d6d9c716e9a91d94ca.svg +0 -3
  846. dstack/dashboard/statics/static/media/twitter.4af18861c84a2f3044c7546b55d5739c.svg +0 -3
  847. dstack/dashboard/tags.py +0 -119
  848. dstack/jobs.py +0 -255
  849. dstack/providers/__init__.py +0 -316
  850. dstack/providers/_python/main.py +0 -88
  851. dstack/providers/_tensorboard/main.py +0 -93
  852. dstack/providers/_torchrun/main.py +0 -121
  853. dstack/providers/bash/main.py +0 -90
  854. dstack/providers/code/main.py +0 -95
  855. dstack/providers/docker/main.py +0 -79
  856. dstack/providers/lab/main.py +0 -95
  857. dstack/providers/notebook/main.py +0 -90
  858. dstack/random_name.py +0 -29
  859. dstack/repo.py +0 -135
  860. dstack/runners.py +0 -35
  861. dstack/util.py +0 -15
  862. dstack-0.0.9.dist-info/METADATA +0 -176
  863. dstack-0.0.9.dist-info/RECORD +0 -179
  864. dstack-0.0.9.dist-info/entry_points.txt +0 -3
  865. dstack-0.0.9.dist-info/top_level.txt +0 -2
  866. tests/test_config.py +0 -70
  867. /dstack/{cli → _internal}/__init__.py +0 -0
  868. /dstack/{dashboard → _internal/cli}/__init__.py +0 -0
  869. /dstack/{providers/_python → _internal/cli/models}/__init__.py +0 -0
  870. /dstack/{providers/_tensorboard → _internal/cli/services}/__init__.py +0 -0
  871. /dstack/{providers/_torchrun → _internal/cli/utils}/__init__.py +0 -0
  872. /dstack/{providers/bash → _internal/core}/__init__.py +0 -0
  873. /dstack/{providers/code → _internal/core/backends}/__init__.py +0 -0
  874. /dstack/{providers/docker → _internal/core/backends/aws}/__init__.py +0 -0
  875. /dstack/{providers/lab → _internal/core/backends/azure}/__init__.py +0 -0
  876. /dstack/{providers/notebook → _internal/core/backends/base}/__init__.py +0 -0
  877. {tests → dstack/_internal/core/backends/cloudrift}/__init__.py +0 -0
  878. /dstack/{dashboard → _internal/server}/statics/assets/yandex-browser-manifest.json +0 -0
  879. /dstack/{dashboard → _internal/server}/statics/robots.txt +0 -0
@@ -0,0 +1,1106 @@
1
+ import asyncio
2
+ import itertools
3
+ import uuid
4
+ from contextlib import AsyncExitStack
5
+ from datetime import datetime, timedelta
6
+ from typing import List, Optional, Union
7
+
8
+ from sqlalchemy import func, or_, select
9
+ from sqlalchemy.ext.asyncio import AsyncSession
10
+ from sqlalchemy.orm import (
11
+ contains_eager,
12
+ joinedload,
13
+ load_only,
14
+ noload,
15
+ selectinload,
16
+ with_loader_criteria,
17
+ )
18
+
19
+ from dstack._internal.core.backends.base.backend import Backend
20
+ from dstack._internal.core.backends.base.compute import (
21
+ ComputeWithGroupProvisioningSupport,
22
+ ComputeWithPlacementGroupSupport,
23
+ ComputeWithVolumeSupport,
24
+ )
25
+ from dstack._internal.core.backends.base.models import JobConfiguration
26
+ from dstack._internal.core.backends.features import (
27
+ BACKENDS_WITH_GROUP_PROVISIONING_SUPPORT,
28
+ BACKENDS_WITH_PLACEMENT_GROUPS_SUPPORT,
29
+ )
30
+ from dstack._internal.core.errors import BackendError, ServerClientError
31
+ from dstack._internal.core.models.common import NetworkMode
32
+ from dstack._internal.core.models.compute_groups import ComputeGroupProvisioningData
33
+ from dstack._internal.core.models.fleets import (
34
+ FleetConfiguration,
35
+ FleetNodesSpec,
36
+ FleetSpec,
37
+ FleetStatus,
38
+ InstanceGroupPlacement,
39
+ )
40
+ from dstack._internal.core.models.instances import InstanceOfferWithAvailability, InstanceStatus
41
+ from dstack._internal.core.models.profiles import (
42
+ DEFAULT_RUN_TERMINATION_IDLE_TIME,
43
+ CreationPolicy,
44
+ Profile,
45
+ TerminationPolicy,
46
+ )
47
+ from dstack._internal.core.models.resources import Memory
48
+ from dstack._internal.core.models.runs import (
49
+ Job,
50
+ JobProvisioningData,
51
+ JobRuntimeData,
52
+ JobStatus,
53
+ JobTerminationReason,
54
+ Requirements,
55
+ Run,
56
+ )
57
+ from dstack._internal.core.models.volumes import Volume
58
+ from dstack._internal.core.services.profiles import get_termination
59
+ from dstack._internal.server import settings
60
+ from dstack._internal.server.background.tasks.process_compute_groups import ComputeGroupStatus
61
+ from dstack._internal.server.db import (
62
+ get_db,
63
+ get_session_ctx,
64
+ is_db_postgres,
65
+ is_db_sqlite,
66
+ sqlite_commit,
67
+ )
68
+ from dstack._internal.server.models import (
69
+ ComputeGroupModel,
70
+ FleetModel,
71
+ InstanceModel,
72
+ JobModel,
73
+ ProjectModel,
74
+ RunModel,
75
+ UserModel,
76
+ VolumeAttachmentModel,
77
+ VolumeModel,
78
+ )
79
+ from dstack._internal.server.services import events
80
+ from dstack._internal.server.services.backends import get_project_backend_by_type_or_error
81
+ from dstack._internal.server.services.fleets import (
82
+ check_can_create_new_cloud_instance_in_fleet,
83
+ fleet_model_to_fleet,
84
+ generate_fleet_name,
85
+ get_fleet_master_instance_provisioning_data,
86
+ get_next_instance_num,
87
+ is_cloud_cluster,
88
+ )
89
+ from dstack._internal.server.services.instances import (
90
+ format_instance_blocks_for_event,
91
+ get_instance_provisioning_data,
92
+ switch_instance_status,
93
+ )
94
+ from dstack._internal.server.services.jobs import (
95
+ check_can_attach_job_volumes,
96
+ find_job,
97
+ find_jobs,
98
+ get_job_configured_volume_models,
99
+ get_job_configured_volumes,
100
+ get_job_runtime_data,
101
+ is_master_job,
102
+ is_multinode_job,
103
+ switch_job_status,
104
+ )
105
+ from dstack._internal.server.services.locking import get_locker, string_to_lock_id
106
+ from dstack._internal.server.services.logging import fmt
107
+ from dstack._internal.server.services.offers import get_offers_by_requirements
108
+ from dstack._internal.server.services.placement import (
109
+ find_or_create_suitable_placement_group,
110
+ get_fleet_placement_group_models,
111
+ get_placement_group_model_for_job,
112
+ placement_group_model_to_placement_group_optional,
113
+ schedule_fleet_placement_groups_deletion,
114
+ )
115
+ from dstack._internal.server.services.runs import (
116
+ run_model_to_run,
117
+ )
118
+ from dstack._internal.server.services.runs.plan import (
119
+ find_optimal_fleet_with_offers,
120
+ get_run_candidate_fleet_models_filters,
121
+ get_run_profile_and_requirements_in_fleet,
122
+ select_run_candidate_fleet_models_with_filters,
123
+ )
124
+ from dstack._internal.server.services.runs.spec import (
125
+ check_run_spec_requires_instance_mounts,
126
+ get_nodes_required_num,
127
+ )
128
+ from dstack._internal.server.services.volumes import (
129
+ volume_model_to_volume,
130
+ )
131
+ from dstack._internal.server.utils import sentry_utils
132
+ from dstack._internal.settings import FeatureFlags
133
+ from dstack._internal.utils import common as common_utils
134
+ from dstack._internal.utils.logging import get_logger
135
+
136
+ logger = get_logger(__name__)
137
+
138
+
139
# Track when we last processed a job.
# This is needed for a warm-up trick:
# if no job was processed within BATCH_SIZE_RESET_TIMEOUT, we force batch_size 1
# (see `_get_effective_batch_size`).
# If there are lots of runs/jobs with the same offers submitted,
# we warm up the cache instead of requesting the offers concurrently.
# Mostly useful when runs are submitted via API without getting run plan first.
BATCH_SIZE_RESET_TIMEOUT = timedelta(minutes=2)
last_processed_at: Optional[datetime] = None
147
+
148
+
149
async def process_submitted_jobs(batch_size: int = 1):
    """
    Processes submitted jobs concurrently.

    At most `batch_size` jobs are processed per call; the actual number
    is decided by `_get_effective_batch_size`.
    """
    concurrency = _get_effective_batch_size(batch_size)
    pending = [_process_next_submitted_job() for _ in range(concurrency)]
    await asyncio.gather(*pending)
155
+
156
+
157
def _get_effective_batch_size(batch_size: int) -> int:
    """
    Returns `batch_size` if a job was processed within `BATCH_SIZE_RESET_TIMEOUT`, and 1 otherwise.
    """
    if last_processed_at is None:
        return 1
    reset_threshold = common_utils.get_current_datetime() - BATCH_SIZE_RESET_TIMEOUT
    if last_processed_at < reset_threshold:
        return 1
    return batch_size
164
+
165
+
166
@sentry_utils.instrument_background_task
async def _process_next_submitted_job():
    """
    Selects the next submitted job that is not in the lockset and processes it.

    The job ID stays in the lockset while the job is being processed.
    Once processing finishes (successfully or not), the module-level
    `last_processed_at` is updated. If there is no job to process, returns right away.
    """
    global last_processed_at
    locker = get_locker(get_db().dialect_name)
    lock, lockset = locker.get_lockset(JobModel.__tablename__)
    async with get_session_ctx() as session:
        async with lock:
            next_job_stmt = (
                select(JobModel)
                .join(JobModel.run)
                .where(
                    JobModel.status == JobStatus.SUBMITTED,
                    JobModel.waiting_master_job.is_not(True),
                    JobModel.id.not_in(lockset),
                )
                .options(load_only(JobModel.id))
                # Jobs are processed in FIFO order sorted by priority globally,
                # thus runs from different projects can "overtake" each other by using higher priorities.
                # That's not a big problem as long as projects do not compete for the same compute resources.
                # Jobs with lower priorities from other projects will be processed without major lag
                # as long as new higher priority runs are not constantly submitted.
                # TODO: Consider processing jobs from different projects fairly/round-robin
                # Fully fair processing can be tricky to implement via the current DB queue as
                # there can be many projects and we are limited by the max DB connections.
                .order_by(RunModel.priority.desc(), JobModel.last_processed_at.asc())
                .limit(1)
                .with_for_update(
                    skip_locked=True,
                    key_share=True,
                    # Do not lock joined run, only job.
                    # Locking run here may cause deadlock.
                    of=JobModel,
                )
            )
            res = await session.execute(next_job_stmt)
            next_job_model = res.scalar()
            if next_job_model is None:
                return
            lockset.add(next_job_model.id)
        # Remember the ID before processing so that it can be released in `finally`.
        locked_job_id = next_job_model.id
        try:
            async with AsyncExitStack() as exit_stack:
                await _process_submitted_job(
                    exit_stack=exit_stack,
                    session=session,
                    job_model=next_job_model,
                )
        finally:
            lockset.difference_update([locked_job_id])
            last_processed_at = common_utils.get_current_datetime()
214
+
215
+
216
async def _process_submitted_job(
    exit_stack: AsyncExitStack, session: AsyncSession, job_model: JobModel
):
    """
    Processes one submitted job. Processing is split into two steps (transactions)
    selected by `job_model.instance_assigned`:

    1. Not assigned yet: picks a fleet and, if available, an existing instance
       for the job, commits, and returns.
    2. Already assigned: moves the job to PROVISIONING on the assigned instance,
       or provisions new instance(s) for it, then attaches the job volumes if needed.

    On errors (volume errors, no suitable fleet, no capacity) the job is switched
    to TERMINATING with a termination reason set and the session is committed.

    Args:
        exit_stack: receives the instance, fleet, and volume lock contexts taken
            during processing so that they are released by the caller.
        session: DB session used for all queries and commits.
        job_model: the job to process; it is refetched with related attributes.
    """
    # Refetch to load related attributes.
    res = await session.execute(
        select(JobModel)
        .where(JobModel.id == job_model.id)
        .options(joinedload(JobModel.instance))
        .options(
            joinedload(JobModel.fleet).joinedload(FleetModel.instances),
            with_loader_criteria(
                InstanceModel, InstanceModel.deleted == False, include_aliases=True
            ),
        )
    )
    job_model = res.unique().scalar_one()
    res = await session.execute(
        select(RunModel)
        .where(RunModel.id == job_model.run_id)
        .options(joinedload(RunModel.project).joinedload(ProjectModel.backends))
        .options(joinedload(RunModel.user).load_only(UserModel.name))
        .options(
            joinedload(RunModel.fleet).joinedload(FleetModel.instances),
            with_loader_criteria(
                InstanceModel, InstanceModel.deleted == False, include_aliases=True
            ),
        )
    )
    run_model = res.unique().scalar_one()
    logger.debug("%s: provisioning has started", fmt(job_model))

    project = run_model.project
    run = run_model_to_run(run_model)
    run_spec = run.run_spec
    run_profile = run_spec.merged_profile
    job = find_job(run.jobs, job_model.replica_num, job_model.job_num)
    replica_jobs = find_jobs(run.jobs, replica_num=job_model.replica_num)
    replica_job_models = _get_job_models_for_jobs(run_model.jobs, replica_jobs)
    multinode = job.job_spec.jobs_per_replica > 1

    # Master job chooses fleet for the run.
    # Due to two-step processing, it's saved to job_model.fleet.
    # Other jobs just inherit fleet from run_model.fleet.
    # If master job chooses no fleet, the new fleet will be created.
    fleet_model = run_model.fleet or job_model.fleet

    master_job = find_job(run.jobs, job_model.replica_num, 0)
    master_job_provisioning_data = None
    if job.job_spec.job_num != 0:
        # Non-master jobs wait until the master job of the replica is provisioned.
        if master_job.job_submissions[-1].job_provisioning_data is None:
            logger.debug("%s: waiting for master job to be provisioned", fmt(job_model))
            job_model.last_processed_at = common_utils.get_current_datetime()
            await session.commit()
            return
        master_job_provisioning_data = JobProvisioningData.__response__.parse_obj(
            master_job.job_submissions[-1].job_provisioning_data
        )
    if job.job_spec.job_num != 0 or job.job_spec.replica_num != 0:
        # Every job except job 0 of replica 0 waits until the run has a fleet.
        if run_model.fleet_id is None:
            logger.debug("%s: waiting for the run to be assigned to the fleet", fmt(job_model))
            job_model.last_processed_at = common_utils.get_current_datetime()
            await session.commit()
            return
    try:
        volume_models = await get_job_configured_volume_models(
            session=session,
            project=project,
            run_spec=run_spec,
            job_num=job.job_spec.job_num,
            job_spec=job.job_spec,
        )
        volumes = await get_job_configured_volumes(
            session=session,
            project=project,
            run_spec=run_spec,
            job_num=job.job_spec.job_num,
            job_spec=job.job_spec,
        )
        check_can_attach_job_volumes(volumes)
    except ServerClientError as e:
        logger.warning("%s: failed to prepare run volumes: %s", fmt(job_model), repr(e))
        job_model.termination_reason = JobTerminationReason.VOLUME_ERROR
        job_model.termination_reason_message = e.msg
        switch_job_status(session, job_model, JobStatus.TERMINATING)
        job_model.last_processed_at = common_utils.get_current_datetime()
        await session.commit()
        return

    # Submitted jobs processing happens in two steps (transactions).
    # First, the jobs gets an instance assigned (or no instance).
    # Then, the job runs on the assigned instance or a new instance is provisioned.
    # This is needed to avoid holding instances lock for a long time.
    if not job_model.instance_assigned:
        fleet_filters, instance_filters = await get_run_candidate_fleet_models_filters(
            session=session,
            project=project,
            run_model=run_model,
            run_spec=run_spec,
        )
        (
            fleet_models_with_instances,
            fleet_models_without_instances,
        ) = await select_run_candidate_fleet_models_with_filters(
            session=session,
            fleet_filters=fleet_filters,
            instance_filters=instance_filters,
            lock_instances=True,
        )
        # IDs are sorted so that the locks below are requested in a stable order.
        instances_ids = sorted(
            itertools.chain.from_iterable(
                [i.id for i in f.instances] for f in fleet_models_with_instances
            )
        )
        await sqlite_commit(session)
        await exit_stack.enter_async_context(
            get_locker(get_db().dialect_name).lock_ctx(InstanceModel.__tablename__, instances_ids)
        )
        if is_db_sqlite():
            # On SQLite the fleets are selected again after the instance locks are taken.
            fleets_with_instances_ids = [f.id for f in fleet_models_with_instances]
            fleet_models_with_instances = await _refetch_fleet_models_with_instances(
                session=session,
                fleets_ids=fleets_with_instances_ids,
                instances_ids=instances_ids,
                fleet_filters=fleet_filters,
                instance_filters=instance_filters,
            )
        fleet_models = fleet_models_with_instances + fleet_models_without_instances
        fleet_model, fleet_instances_with_offers, _ = await find_optimal_fleet_with_offers(
            project=project,
            fleet_models=fleet_models,
            run_model=run_model,
            run_spec=run.run_spec,
            job=job,
            master_job_provisioning_data=master_job_provisioning_data,
            volumes=volumes,
            exclude_not_available=True,
        )
        if fleet_model is None:
            if run_spec.merged_profile.fleets is not None:
                # Run cannot create new fleets when fleets are specified
                logger.debug("%s: failed to use specified fleets", fmt(job_model))
                job_model.termination_reason = (
                    JobTerminationReason.FAILED_TO_START_DUE_TO_NO_CAPACITY
                )
                job_model.termination_reason_message = "Failed to use specified fleets"
                switch_job_status(session, job_model, JobStatus.TERMINATING)
                job_model.last_processed_at = common_utils.get_current_datetime()
                await session.commit()
                return
            if not FeatureFlags.AUTOCREATED_FLEETS_ENABLED:
                logger.debug("%s: no fleet found", fmt(job_model))
                job_model.termination_reason = (
                    JobTerminationReason.FAILED_TO_START_DUE_TO_NO_CAPACITY
                )
                # Note: `_get_job_status_message` relies on the "No fleet found" substring to return "no fleets"
                # NOTE(review): the message below reads "No matching fleet found", which does not
                # contain the "No fleet found" substring. Confirm the check in
                # `_get_job_status_message` still matches.
                job_model.termination_reason_message = (
                    "No matching fleet found. Possible reasons: "
                    "https://dstack.ai/docs/guides/troubleshooting/#no-fleets"
                )
                switch_job_status(session, job_model, JobStatus.TERMINATING)
                job_model.last_processed_at = common_utils.get_current_datetime()
                await session.commit()
                return
        instance = await _assign_job_to_fleet_instance(
            session=session,
            fleet_model=fleet_model,
            instances_with_offers=fleet_instances_with_offers,
            job_model=job_model,
            multinode=multinode,
        )
        job_model.last_processed_at = common_utils.get_current_datetime()
        # End of the first step: the second step runs on a later processing of this job.
        await session.commit()
        return

    jobs_to_provision = _get_jobs_to_provision(job, replica_jobs, job_model)
    # TODO: Volume attachment for compute groups is not yet supported since
    # currently supported compute groups (e.g. Runpod) don't need explicit volume attachment.
    need_volume_attachment = True

    if job_model.instance is not None:
        # The job got an existing instance at the first step: reload it with volume attachments.
        res = await session.execute(
            select(InstanceModel)
            .where(InstanceModel.id == job_model.instance.id)
            .options(selectinload(InstanceModel.volume_attachments))
            .execution_options(populate_existing=True)
        )
        instance = res.unique().scalar_one()
        switch_job_status(session, job_model, JobStatus.PROVISIONING)
    else:
        if run_profile.creation_policy == CreationPolicy.REUSE:
            logger.debug("%s: reuse instance failed", fmt(job_model))
            job_model.termination_reason = JobTerminationReason.FAILED_TO_START_DUE_TO_NO_CAPACITY
            job_model.termination_reason_message = "Could not reuse any instances for this job"
            switch_job_status(session, job_model, JobStatus.TERMINATING)
            job_model.last_processed_at = common_utils.get_current_datetime()
            await session.commit()
            return

        master_instance_provisioning_data = (
            await _fetch_fleet_with_master_instance_provisioning_data(
                exit_stack=exit_stack,
                session=session,
                fleet_model=fleet_model,
                job=job,
            )
        )
        master_provisioning_data = (
            master_job_provisioning_data or master_instance_provisioning_data
        )
        run_job_result = await _run_jobs_on_new_instances(
            session=session,
            project=project,
            fleet_model=fleet_model,
            job_model=job_model,
            run=run,
            jobs=jobs_to_provision,
            project_ssh_public_key=project.ssh_public_key,
            project_ssh_private_key=project.ssh_private_key,
            master_job_provisioning_data=master_provisioning_data,
            volumes=volumes,
        )
        if run_job_result is None:
            logger.debug("%s: provisioning failed", fmt(job_model))
            job_model.termination_reason = JobTerminationReason.FAILED_TO_START_DUE_TO_NO_CAPACITY
            switch_job_status(session, job_model, JobStatus.TERMINATING)
            job_model.last_processed_at = common_utils.get_current_datetime()
            await session.commit()
            return

        if fleet_model is None:
            # No fleet was chosen at the first step: create one for the provisioned instance(s).
            fleet_model = await _create_fleet_model_for_job(
                exit_stack=exit_stack,
                session=session,
                project=project,
                run=run,
            )
            session.add(fleet_model)
            events.emit(
                session,
                f"Fleet created for job. Fleet status: {fleet_model.status.upper()}",
                actor=events.SystemActor(),
                targets=[
                    events.Target.from_model(fleet_model),
                    events.Target.from_model(job_model),
                ],
            )

        provisioning_data, offer, effective_profile, _ = run_job_result
        compute_group_model = None
        if isinstance(provisioning_data, ComputeGroupProvisioningData):
            # All the jobs passed for provisioning were provisioned together as a compute group.
            need_volume_attachment = False
            provisioned_jobs = jobs_to_provision
            jpds = provisioning_data.job_provisioning_datas
            compute_group_model = ComputeGroupModel(
                id=uuid.uuid4(),
                project=project,
                fleet=fleet_model,
                status=ComputeGroupStatus.RUNNING,
                provisioning_data=provisioning_data.json(),
            )
            session.add(compute_group_model)
        else:
            provisioned_jobs = [job]
            jpds = [provisioning_data]

        logger.info("%s: provisioned %s new instance(s)", fmt(job_model), len(provisioned_jobs))
        provisioned_job_models = _get_job_models_for_jobs(run_model.jobs, provisioned_jobs)
        instance = None  # Instance for attaching volumes in case of single job provisioned
        for provisioned_job_model, jpd in zip(provisioned_job_models, jpds):
            provisioned_job_model.job_provisioning_data = jpd.json()
            switch_job_status(session, provisioned_job_model, JobStatus.PROVISIONING)
            # FIXME: Fleet is not locked which may lead to duplicate instance_num.
            # This is currently hard to fix without locking the fleet for entire provisioning duration.
            # Processing should be done in multiple steps so that
            # InstanceModel is created before provisioning.
            instance_num = await _get_next_instance_num(
                session=session,
                fleet_model=fleet_model,
            )
            instance = _create_instance_model_for_job(
                project=project,
                fleet_model=fleet_model,
                compute_group_model=compute_group_model,
                job_model=provisioned_job_model,
                job_provisioning_data=jpd,
                offer=offer,
                instance_num=instance_num,
                profile=effective_profile,
            )
            provisioned_job_model.job_runtime_data = _prepare_job_runtime_data(
                offer, multinode
            ).json()
            session.add(instance)
            events.emit(
                session,
                f"Instance created for job. Instance status: {instance.status.upper()}",
                actor=events.SystemActor(),
                targets=[
                    events.Target.from_model(instance),
                    events.Target.from_model(provisioned_job_model),
                ],
            )
            provisioned_job_model.used_instance_id = instance.id
            provisioned_job_model.last_processed_at = common_utils.get_current_datetime()

    _allow_other_replica_jobs_to_provision(job_model, replica_job_models, jobs_to_provision)

    volumes_ids = sorted([v.id for vs in volume_models for v in vs])
    if need_volume_attachment:
        # Take lock to prevent attaching volumes that are to be deleted.
        # If the volume was deleted before the lock, the volume will fail to attach and the job will fail.
        # TODO: Lock instances for attaching volumes?
        await session.execute(
            select(VolumeModel)
            .where(VolumeModel.id.in_(volumes_ids))
            .options(joinedload(VolumeModel.user).load_only(UserModel.name))
            .order_by(VolumeModel.id)  # take locks in order
            .with_for_update(key_share=True, of=VolumeModel)
        )
        await exit_stack.enter_async_context(
            get_locker(get_db().dialect_name).lock_ctx(VolumeModel.__tablename__, volumes_ids)
        )
        if len(volume_models) > 0:
            assert instance is not None
            await _attach_volumes(
                session=session,
                project=project,
                job_model=job_model,
                instance=instance,
                volume_models=volume_models,
            )
    await session.commit()
548
+
549
+
550
async def _refetch_fleet_models_with_instances(
    session: AsyncSession,
    fleets_ids: list[uuid.UUID],
    instances_ids: list[uuid.UUID],
    fleet_filters: list,
    instance_filters: list,
) -> list[FleetModel]:
    """
    Selects the given fleets again, restricted to the given instances.

    Both the original `fleet_filters` and `instance_filters` are applied again,
    and `FleetModel.instances` of every returned fleet is populated from the joined rows.
    """
    refetch_stmt = (
        select(FleetModel)
        .outerjoin(FleetModel.instances)
        .where(
            FleetModel.id.in_(fleets_ids),
            *fleet_filters,
        )
        .where(
            InstanceModel.id.in_(instances_ids),
            *instance_filters,
        )
        .options(contains_eager(FleetModel.instances))
    )
    res = await session.execute(refetch_stmt)
    return list(res.unique().scalars().all())
572
+
573
+
574
async def _fetch_fleet_with_master_instance_provisioning_data(
    exit_stack: AsyncExitStack,
    session: AsyncSession,
    fleet_model: Optional[FleetModel],
    job: Job,
) -> Optional[JobProvisioningData]:
    """
    Returns the fleet's master instance provisioning data as computed by
    `get_fleet_master_instance_provisioning_data`.

    Returns None right away if `job` is not a master job or `fleet_model` is None.
    For fleets with cluster placement, the fleet lock is entered into `exit_stack`
    (so it is held until the caller's exit stack is closed) and the fleet is
    refetched before the provisioning data is computed.
    """
    master_instance_provisioning_data = None
    if is_master_job(job) and fleet_model is not None:
        fleet = fleet_model_to_fleet(fleet_model)
        if fleet.spec.configuration.placement == InstanceGroupPlacement.CLUSTER:
            # To avoid violating fleet placement cluster during master provisioning,
            # we must lock empty fleets and respect existing instances in non-empty fleets.
            # On SQLite always take the lock during master provisioning for simplicity.
            await exit_stack.enter_async_context(
                get_locker(get_db().dialect_name).lock_ctx(
                    FleetModel.__tablename__, [fleet_model.id]
                )
            )
            await sqlite_commit(session)
            # Select the fleet FOR UPDATE via rows that have no instance or a deleted instance.
            # Instances are not loaded for this result (`noload`).
            res = await session.execute(
                select(FleetModel)
                .outerjoin(FleetModel.instances)
                .where(
                    FleetModel.id == fleet_model.id,
                    or_(
                        InstanceModel.id.is_(None),
                        InstanceModel.deleted == True,
                    ),
                )
                .with_for_update(key_share=True, of=FleetModel)
                .execution_options(populate_existing=True)
                .options(noload(FleetModel.instances))
            )
            empty_fleet_model = res.unique().scalar()
            if empty_fleet_model is not None:
                fleet_model = empty_fleet_model
            else:
                # Otherwise refetch the fleet together with its non-deleted instances.
                res = await session.execute(
                    select(FleetModel)
                    .join(FleetModel.instances)
                    .where(
                        FleetModel.id == fleet_model.id,
                        InstanceModel.deleted == False,
                    )
                    .options(contains_eager(FleetModel.instances))
                    .execution_options(populate_existing=True)
                )
                fleet_model = res.unique().scalar_one()
        master_instance_provisioning_data = get_fleet_master_instance_provisioning_data(
            fleet_model=fleet_model,
            fleet_spec=fleet.spec,
        )
    return master_instance_provisioning_data
627
+
628
+
629
async def _assign_job_to_fleet_instance(
    session: AsyncSession,
    fleet_model: Optional[FleetModel],
    job_model: JobModel,
    instances_with_offers: list[tuple[InstanceModel, InstanceOfferWithAvailability]],
    multinode: bool,
) -> Optional[InstanceModel]:
    """
    Assigns the job to the fleet and to the cheapest of the passed instances, if any.

    The job is always marked as `instance_assigned` and gets `fleet_model` as its fleet,
    even if there are no instances to choose from.

    Args:
        session: DB session used to reload the chosen instance and to emit the event.
        fleet_model: the fleet chosen for the job (may be None).
        job_model: the job to update.
        instances_with_offers: candidate (instance, offer) pairs. The list is not modified.
        multinode: passed to `_prepare_job_runtime_data`.

    Returns:
        The instance the job was assigned to, or None if `instances_with_offers` is empty.
    """
    job_model.fleet = fleet_model
    job_model.instance_assigned = True
    if not instances_with_offers:
        return None

    # Pick the cheapest instance (missing price counts as 0). On ties `min` returns
    # the first pair, which is what a stable sort followed by `[0]` would give,
    # but without reordering the caller's list.
    instance, offer = min(
        instances_with_offers,
        key=lambda instance_with_offer: instance_with_offer[0].price or 0,
    )
    # Reload InstanceModel with volume attachments
    res = await session.execute(
        select(InstanceModel)
        .where(InstanceModel.id == instance.id)
        .options(joinedload(InstanceModel.volume_attachments))
    )
    instance = res.unique().scalar_one()
    switch_instance_status(session, instance, InstanceStatus.BUSY)
    instance.busy_blocks += offer.blocks

    job_model.instance = instance
    job_model.used_instance_id = instance.id
    job_model.job_provisioning_data = instance.job_provisioning_data
    job_model.job_runtime_data = _prepare_job_runtime_data(offer, multinode).json()
    events.emit(
        session,
        (
            "Job assigned to instance."
            f" Instance blocks: {format_instance_blocks_for_event(instance)}"
        ),
        actor=events.SystemActor(),
        targets=[
            events.Target.from_model(job_model),
            events.Target.from_model(instance),
        ],
    )
    return instance
670
+
671
+
672
def _get_jobs_to_provision(job: Job, replica_jobs: list[Job], job_model: JobModel) -> list[Job]:
    """
    Selects the jobs to provision together with the passed job:
    all replica jobs for master jobs in multinode setups, and only the passed job otherwise.
    """
    provisions_whole_replica = (
        is_multinode_job(job)
        and is_master_job(job)
        # job_model.waiting_master_job is not set for legacy jobs.
        # In this case compute group provisioning not supported
        # and jobs always provision one-by-one.
        and job_model.waiting_master_job is not None
    )
    if provisions_whole_replica:
        return replica_jobs
    return [job]
687
+
688
+
689
def _allow_other_replica_jobs_to_provision(
    job_model: JobModel,
    replica_job_models: list[JobModel],
    jobs_to_provision: list[Job],
):
    """
    Resets `waiting_master_job` on all replica job models if more than one job
    was passed for provisioning. Does nothing otherwise.
    """
    if len(jobs_to_provision) <= 1:
        return
    logger.debug("%s: allow replica jobs to be provisioned one-by-one", fmt(job_model))
    for model in replica_job_models:
        model.waiting_master_job = False
698
+
699
+
700
async def _run_jobs_on_new_instances(
    session: AsyncSession,
    project: ProjectModel,
    job_model: JobModel,
    run: Run,
    jobs: list[Job],
    project_ssh_public_key: str,
    project_ssh_private_key: str,
    master_job_provisioning_data: Optional[JobProvisioningData] = None,
    volumes: Optional[list[list[Volume]]] = None,
    fleet_model: Optional[FleetModel] = None,
) -> Optional[
    tuple[
        Union[JobProvisioningData, ComputeGroupProvisioningData],
        InstanceOfferWithAvailability,
        Profile,
        Requirements,
    ]
]:
    """
    Provisions an instance for a job or a compute group for multiple jobs and runs the jobs.
    Even when multiple jobs are passed, it may still provision only one instance
    and run only the master job in case there are no offers supporting cluster groups.
    Other jobs should be provisioned one-by-one later.

    Offers are tried in order, at most `settings.MAX_OFFERS_TRIED` of them;
    an offer that fails to launch is logged and skipped.

    Returns:
        A tuple of (provisioning data, the offer used, the effective profile,
        the effective requirements) for the first offer that launched successfully.
        None if the fleet check raised `ValueError` or no tried offer succeeded.
    """
    if volumes is None:
        volumes = []
    # The first job is the one used for requirements and for single-job provisioning.
    job = jobs[0]
    profile = run.run_spec.merged_profile
    requirements = job.job_spec.requirements
    fleet = None
    if fleet_model is not None:
        fleet = fleet_model_to_fleet(fleet_model)
        try:
            check_can_create_new_cloud_instance_in_fleet(fleet)
            profile, requirements = get_run_profile_and_requirements_in_fleet(
                job=job,
                run_spec=run.run_spec,
                fleet=fleet,
            )
        except ValueError as e:
            logger.debug("%s: %s", fmt(job_model), e.args[0])
            return None
        # TODO: Respect fleet provisioning properties such as tags

    # The placement group is determined when provisioning the master instance
    # and used for all other instances in the fleet.
    placement_group_models = await get_fleet_placement_group_models(
        session=session,
        fleet_id=fleet_model.id if fleet_model else None,
    )
    placement_group_model = get_placement_group_model_for_job(
        placement_group_models=placement_group_models,
        fleet_model=fleet_model,
    )
    multinode = requirements.multinode or is_multinode_job(job)
    offers = await get_offers_by_requirements(
        project=project,
        profile=profile,
        requirements=requirements,
        exclude_not_available=True,
        multinode=multinode,
        master_job_provisioning_data=master_job_provisioning_data,
        volumes=volumes,
        privileged=job.job_spec.privileged,
        instance_mounts=check_run_spec_requires_instance_mounts(run.run_spec),
        placement_group=placement_group_model_to_placement_group_optional(placement_group_model),
    )
    # Limit number of offers tried to prevent long-running processing
    # in case all offers fail.
    for backend, offer in offers[: settings.MAX_OFFERS_TRIED]:
        logger.debug(
            "%s: trying %s in %s/%s for $%0.4f per hour",
            fmt(job_model),
            offer.instance.name,
            offer.backend.value,
            offer.region,
            offer.price,
        )
        offer_volumes = _get_offer_volumes(volumes, offer)
        job_configurations = [JobConfiguration(job=j, volumes=offer_volumes) for j in jobs]
        compute = backend.compute()
        # A placement group is looked up or created only when provisioning into
        # an empty cloud cluster fleet on a backend that supports placement groups.
        if (
            fleet_model is not None
            and len(fleet_model.instances) == 0
            and is_cloud_cluster(fleet_model)
            and offer.backend in BACKENDS_WITH_PLACEMENT_GROUPS_SUPPORT
            and isinstance(compute, ComputeWithPlacementGroupSupport)
            and (
                compute.are_placement_groups_compatible_with_reservations(offer.backend)
                or job.job_spec.requirements.reservation is None
            )
        ):
            placement_group_model = await find_or_create_suitable_placement_group(
                fleet_model=fleet_model,
                placement_groups=placement_group_models,
                instance_offer=offer,
                compute=compute,
            )
            if placement_group_model is None:  # error occurred
                continue
            session.add(placement_group_model)
            placement_group_models.append(placement_group_model)
        try:
            if len(jobs) > 1 and offer.backend in BACKENDS_WITH_GROUP_PROVISIONING_SUPPORT:
                # Provision all the passed jobs at once as a compute group.
                assert isinstance(compute, ComputeWithGroupProvisioningSupport)
                cgpd = await common_utils.run_async(
                    compute.run_jobs,
                    run,
                    job_configurations,
                    offer,
                    project_ssh_public_key,
                    project_ssh_private_key,
                    placement_group_model_to_placement_group_optional(placement_group_model),
                )
                return cgpd, offer, profile, requirements
            else:
                # Provision a single instance for the first job only.
                jpd = await common_utils.run_async(
                    compute.run_job,
                    run,
                    job,
                    offer,
                    project_ssh_public_key,
                    project_ssh_private_key,
                    offer_volumes,
                    placement_group_model_to_placement_group_optional(placement_group_model),
                )
                return jpd, offer, profile, requirements
        except BackendError as e:
            logger.warning(
                "%s: %s launch in %s/%s failed: %s",
                fmt(job_model),
                offer.instance.name,
                offer.backend.value,
                offer.region,
                repr(e),
            )
            continue
        except Exception:
            logger.exception(
                "%s: got exception when launching %s in %s/%s",
                fmt(job_model),
                offer.instance.name,
                offer.backend.value,
                offer.region,
            )
            continue
        finally:
            # Runs after every attempt, including the successful one that returns above.
            if fleet_model is not None and len(fleet_model.instances) == 0:
                # Clean up placement groups that did not end up being used.
                # Flush to update still uncommitted placement groups.
                await session.flush()
                await schedule_fleet_placement_groups_deletion(
                    session=session,
                    fleet_id=fleet_model.id,
                    except_placement_group_ids=(
                        [placement_group_model.id] if placement_group_model is not None else []
                    ),
                )
    return None
860
+
861
+
862
async def _create_fleet_model_for_job(
    exit_stack: AsyncExitStack,
    session: AsyncSession,
    project: ProjectModel,
    run: Run,
) -> FleetModel:
    """
    Builds a new autocreated ACTIVE fleet model for the run.

    The fleet gets cluster placement for tasks with more than one node and any placement
    otherwise. The project's fleet names lock is entered into `exit_stack` before
    the fleet name is generated. The returned model is not added to the session.
    """
    configuration = run.run_spec.configuration
    is_multinode_task = configuration.type == "task" and configuration.nodes > 1
    placement = (
        InstanceGroupPlacement.CLUSTER if is_multinode_task else InstanceGroupPlacement.ANY
    )
    nodes = get_nodes_required_num(run.run_spec)

    lock_namespace = f"fleet_names_{project.name}"
    if is_db_sqlite():
        # Start new transaction to see committed changes after lock
        await session.commit()
    elif is_db_postgres():
        advisory_lock_id = string_to_lock_id(lock_namespace)
        await session.execute(select(func.pg_advisory_xact_lock(advisory_lock_id)))
    names_lock, _ = get_locker(get_db().dialect_name).get_lockset(lock_namespace)
    await exit_stack.enter_async_context(names_lock)

    fleet_name = await generate_fleet_name(session=session, project=project)
    fleet_configuration = FleetConfiguration(
        name=fleet_name,
        placement=placement,
        reservation=configuration.reservation,
        nodes=FleetNodesSpec(
            min=nodes,
            target=nodes,
            max=None,
        ),
    )
    spec = FleetSpec(
        configuration=fleet_configuration,
        profile=run.run_spec.merged_profile,
        autocreated=True,
    )
    return FleetModel(
        id=uuid.uuid4(),
        name=fleet_name,
        project=project,
        status=FleetStatus.ACTIVE,
        spec=spec.json(),
        instances=[],
    )
907
+
908
+
909
async def _get_next_instance_num(session: AsyncSession, fleet_model: FleetModel) -> int:
    """
    Computes the next instance number for the fleet from the numbers
    taken by the fleet's non-deleted instances.
    """
    taken_nums_stmt = select(InstanceModel.instance_num).where(
        InstanceModel.fleet_id == fleet_model.id,
        InstanceModel.deleted.is_(False),
    )
    res = await session.execute(taken_nums_stmt)
    return get_next_instance_num(set(res.scalars().all()))
918
+
919
+
920
def _create_instance_model_for_job(
    project: ProjectModel,
    fleet_model: FleetModel,
    compute_group_model: Optional[ComputeGroupModel],
    job_model: JobModel,
    job_provisioning_data: JobProvisioningData,
    offer: InstanceOfferWithAvailability,
    instance_num: int,
    profile: Profile,
) -> InstanceModel:
    """
    Builds a PROVISIONING instance model for a job that was just provisioned.

    The instance is named `<fleet name>-<instance_num>`, has a single block that is busy,
    and has `job_model` as its only job. The termination settings come from `profile`
    for dockerized instances; non-dockerized instances are destroyed with zero idle time.
    """
    if job_provisioning_data.dockerized:
        termination_policy, termination_idle_time = get_termination(
            profile, DEFAULT_RUN_TERMINATION_IDLE_TIME
        )
    else:
        # terminate vastai/k8s instances immediately
        termination_policy = TerminationPolicy.DESTROY_AFTER_IDLE
        termination_idle_time = 0
    instance_name = f"{fleet_model.name}-{instance_num}"
    return InstanceModel(
        id=uuid.uuid4(),
        name=instance_name,
        instance_num=instance_num,
        project=project,
        fleet=fleet_model,
        compute_group=compute_group_model,
        created_at=common_utils.get_current_datetime(),
        started_at=common_utils.get_current_datetime(),
        status=InstanceStatus.PROVISIONING,
        unreachable=False,
        job_provisioning_data=job_provisioning_data.json(),
        offer=offer.json(),
        termination_policy=termination_policy,
        termination_idle_time=termination_idle_time,
        jobs=[job_model],
        backend=offer.backend,
        price=offer.price,
        region=offer.region,
        volume_attachments=[],
        total_blocks=1,
        busy_blocks=1,
    )
962
+
963
+
964
def _prepare_job_runtime_data(
    offer: InstanceOfferWithAvailability, multinode: bool
) -> JobRuntimeData:
    """
    Builds the initial `JobRuntimeData` for a job that is going to run on `offer`.

    If the offer does not cover all blocks of the instance, the job always gets the
    bridge network mode, and the offer's CPU count, GPU count and memory are recorded
    explicitly. Otherwise only the network mode is chosen, based on
    `settings.JOB_NETWORK_MODE` and, for `HOST_FOR_MULTINODE_ONLY`, on `multinode`.
    """
    if offer.blocks != offer.total_blocks:
        resources = offer.instance.resources
        return JobRuntimeData(
            network_mode=NetworkMode.BRIDGE,
            offer=offer,
            cpu=resources.cpus,
            gpu=len(resources.gpus),
            # memory_mib / 1024 converts MiB to GiB before wrapping into Memory.
            memory=Memory(resources.memory_mib / 1024),
        )

    mode_setting = settings.JOB_NETWORK_MODE
    if mode_setting == settings.JobNetworkMode.FORCED_BRIDGE:
        network_mode = NetworkMode.BRIDGE
    elif mode_setting == settings.JobNetworkMode.HOST_WHEN_POSSIBLE:
        network_mode = NetworkMode.HOST
    else:
        # The only remaining supported setting.
        assert mode_setting == settings.JobNetworkMode.HOST_FOR_MULTINODE_ONLY
        if multinode:
            network_mode = NetworkMode.HOST
        else:
            network_mode = NetworkMode.BRIDGE
    return JobRuntimeData(
        network_mode=network_mode,
        offer=offer,
    )
986
+
987
+
988
def _get_offer_volumes(
    volumes: List[List[Volume]],
    offer: InstanceOfferWithAvailability,
) -> List[Volume]:
    """
    Picks one volume per mount point that is suitable for the offer.

    `volumes` holds the candidate volumes of each mount point; the result contains
    the volume chosen by `_get_offer_mount_point_volume()` for every mount point,
    in the same order. Errors raised by that helper propagate to the caller.
    """
    return [
        _get_offer_mount_point_volume(candidates, offer)
        for candidates in volumes
    ]
999
+
1000
+
1001
def _get_offer_mount_point_volume(
    volumes: List[Volume],
    offer: InstanceOfferWithAvailability,
) -> Volume:
    """
    Picks the first of the mount point's candidate volumes that suits the offer.

    A volume is suitable if its configured backend equals the offer's backend and
    its configured region equals the offer's region, compared case-insensitively.

    Raises:
        ServerClientError: if none of the volumes is suitable.
    """
    # The generator is lazy, so candidates after the first match are never inspected,
    # and the region is only compared when the backend already matches.
    eligible_volume = next(
        (
            candidate
            for candidate in volumes
            if candidate.configuration.backend == offer.backend
            and candidate.configuration.region.lower() == offer.region.lower()
        ),
        None,
    )
    if eligible_volume is None:
        raise ServerClientError("Failed to find an eligible volume for the mount point")
    return eligible_volume
1016
+
1017
+
1018
async def _attach_volumes(
    session: AsyncSession,
    project: ProjectModel,
    job_model: JobModel,
    instance: InstanceModel,
    volume_models: List[List[VolumeModel]],
):
    """
    Attaches one volume per mount point to the instance and records the names of
    the attached volumes in the job's runtime data.

    `volume_models` holds the candidate volumes of each mount point. For every mount
    point the candidates are tried in order: volumes whose backend or region does not
    match the instance's provisioning data are skipped, and so are volumes without
    attachable provisioning data. The first volume that gets attached ends the
    processing of that mount point.

    If attaching raises, the job is switched to `TERMINATING` with the `VOLUME_ERROR`
    termination reason. The exception is not re-raised and the loop is not interrupted.
    The job's serialized runtime data is rewritten after every processed candidate.
    """

    def _terminate_with_volume_error() -> None:
        # Shared bookkeeping for both failure branches below.
        job_model.termination_reason = JobTerminationReason.VOLUME_ERROR
        job_model.termination_reason_message = "Failed to attach volume"
        switch_job_status(session, job_model, JobStatus.TERMINATING)

    instance_jpd = common_utils.get_or_error(get_instance_provisioning_data(instance))
    backend = await get_project_backend_by_type_or_error(
        project=project,
        backend_type=instance_jpd.backend,
    )
    runtime_data = common_utils.get_or_error(get_job_runtime_data(job_model))
    runtime_data.volume_names = []
    logger.info("Attaching volumes: %s", [[v.name for v in vs] for vs in volume_models])
    for candidate_models in volume_models:
        for candidate_model in candidate_models:
            volume = volume_model_to_volume(candidate_model)
            try:
                same_backend = instance_jpd.get_base_backend() == volume.configuration.backend
                if (
                    not same_backend
                    or instance_jpd.region.lower() != volume.configuration.region.lower()
                ):
                    continue
                volume_pd = volume.provisioning_data
                if volume_pd is None or not volume_pd.attachable:
                    # Nothing to attach for this candidate; try the next one.
                    continue
                await _attach_volume(
                    session=session,
                    backend=backend,
                    volume_model=candidate_model,
                    instance=instance,
                    jpd=instance_jpd,
                )
                runtime_data.volume_names.append(volume.name)
                break  # attach next mount point
            except (ServerClientError, BackendError) as e:
                logger.warning("%s: failed to attached volume: %s", fmt(job_model), repr(e))
                _terminate_with_volume_error()
            except Exception:
                logger.exception(
                    "%s: got exception when attaching volume",
                    fmt(job_model),
                )
                _terminate_with_volume_error()
            finally:
                # Runs on `continue` and `break` too, so the stored runtime data
                # always reflects the volume names collected so far.
                job_model.job_runtime_data = runtime_data.json()
1067
+
1068
+
1069
async def _attach_volume(
    session: AsyncSession,
    backend: Backend,
    volume_model: VolumeModel,
    instance: InstanceModel,
    jpd: JobProvisioningData,
):
    """
    Attaches a single volume to the instance via the backend's compute and
    registers the attachment on the instance.

    On success a `VolumeAttachmentModel` with the serialized attachment data is
    appended to `instance.volume_attachments`, and `volume_model.last_job_processed_at`
    is set to the current time.

    Raises:
        ServerClientError: if the volume turns out to be deleted after the refresh.
    """
    volume_compute = backend.compute()
    assert isinstance(volume_compute, ComputeWithVolumeSupport)
    volume = volume_model_to_volume(volume_model)
    # Refresh only to check if the volume wasn't deleted before the lock
    await session.refresh(volume_model)
    if volume_model.deleted:
        raise ServerClientError("Cannot attach a deleted volume")
    # attach_volume is passed to run_async rather than called directly here.
    attach_result = await common_utils.run_async(
        volume_compute.attach_volume,
        volume=volume,
        provisioning_data=jpd,
    )
    new_attachment = VolumeAttachmentModel(
        volume=volume_model,
        attachment_data=attach_result.json(),
    )
    instance.volume_attachments.append(new_attachment)

    volume_model.last_job_processed_at = common_utils.get_current_datetime()
1095
+
1096
+
1097
def _get_job_models_for_jobs(
    job_models: list[JobModel],
    jobs: list[Job],
) -> list[JobModel]:
    """
    Maps each job to the job model of its latest submission.

    The latest submission is the last element of `job.job_submissions`; its `id`
    is looked up among `job_models`. The result follows the order of `jobs`.

    Raises `KeyError` if a latest submission has no matching job model and
    `IndexError` if a job has no submissions.
    """
    models_by_id = {model.id: model for model in job_models}
    latest_submission_ids = (job.job_submissions[-1].id for job in jobs)
    return [models_by_id[submission_id] for submission_id in latest_submission_ids]