dstack 0.0.9__py3-none-any.whl → 0.20.7__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between package versions as they appear in their respective public registries.
Files changed (879) hide show
  1. dstack/_internal/cli/commands/__init__.py +80 -0
  2. dstack/_internal/cli/commands/apply.py +100 -0
  3. dstack/_internal/cli/commands/attach.py +161 -0
  4. dstack/_internal/cli/commands/completion.py +22 -0
  5. dstack/_internal/cli/commands/delete.py +44 -0
  6. dstack/_internal/cli/commands/event.py +168 -0
  7. dstack/_internal/cli/commands/fleet.py +161 -0
  8. dstack/_internal/cli/commands/gateway.py +159 -0
  9. dstack/_internal/cli/commands/init.py +64 -0
  10. dstack/_internal/cli/commands/login.py +352 -0
  11. dstack/_internal/cli/commands/logs.py +62 -0
  12. dstack/_internal/cli/commands/metrics.py +153 -0
  13. dstack/_internal/cli/commands/offer.py +146 -0
  14. dstack/_internal/cli/commands/project.py +259 -0
  15. dstack/_internal/cli/commands/ps.py +81 -0
  16. dstack/_internal/cli/commands/run.py +69 -0
  17. dstack/_internal/cli/commands/secrets.py +92 -0
  18. dstack/_internal/cli/commands/server.py +96 -0
  19. dstack/_internal/cli/commands/stop.py +26 -0
  20. dstack/_internal/cli/commands/volume.py +117 -0
  21. dstack/_internal/cli/main.py +101 -0
  22. dstack/_internal/cli/models/gateways.py +16 -0
  23. dstack/_internal/cli/models/offers.py +47 -0
  24. dstack/_internal/cli/models/runs.py +16 -0
  25. dstack/_internal/cli/services/args.py +31 -0
  26. dstack/_internal/cli/services/completion.py +91 -0
  27. dstack/_internal/cli/services/configurators/__init__.py +86 -0
  28. dstack/_internal/cli/services/configurators/base.py +103 -0
  29. dstack/_internal/cli/services/configurators/fleet.py +475 -0
  30. dstack/_internal/cli/services/configurators/gateway.py +231 -0
  31. dstack/_internal/cli/services/configurators/run.py +882 -0
  32. dstack/_internal/cli/services/configurators/volume.py +222 -0
  33. dstack/_internal/cli/services/events.py +68 -0
  34. dstack/_internal/cli/services/profile.py +182 -0
  35. dstack/_internal/cli/services/repos.py +71 -0
  36. dstack/_internal/cli/services/resources.py +54 -0
  37. dstack/_internal/cli/utils/common.py +159 -0
  38. dstack/_internal/cli/utils/fleet.py +106 -0
  39. dstack/_internal/cli/utils/gateway.py +56 -0
  40. dstack/_internal/cli/utils/gpu.py +178 -0
  41. dstack/_internal/cli/utils/rich.py +156 -0
  42. dstack/_internal/cli/utils/run.py +517 -0
  43. dstack/_internal/cli/utils/secrets.py +25 -0
  44. dstack/_internal/cli/utils/updates.py +98 -0
  45. dstack/_internal/cli/utils/volume.py +58 -0
  46. dstack/_internal/compat.py +3 -0
  47. dstack/_internal/core/backends/amddevcloud/__init__.py +1 -0
  48. dstack/_internal/core/backends/amddevcloud/backend.py +16 -0
  49. dstack/_internal/core/backends/amddevcloud/compute.py +5 -0
  50. dstack/_internal/core/backends/amddevcloud/configurator.py +29 -0
  51. dstack/_internal/core/backends/aws/auth.py +30 -0
  52. dstack/_internal/core/backends/aws/backend.py +31 -0
  53. dstack/_internal/core/backends/aws/compute.py +1153 -0
  54. dstack/_internal/core/backends/aws/configurator.py +191 -0
  55. dstack/_internal/core/backends/aws/models.py +135 -0
  56. dstack/_internal/core/backends/aws/resources.py +700 -0
  57. dstack/_internal/core/backends/azure/auth.py +39 -0
  58. dstack/_internal/core/backends/azure/backend.py +21 -0
  59. dstack/_internal/core/backends/azure/compute.py +676 -0
  60. dstack/_internal/core/backends/azure/configurator.py +472 -0
  61. dstack/_internal/core/backends/azure/models.py +98 -0
  62. dstack/_internal/core/backends/azure/resources.py +116 -0
  63. dstack/_internal/core/backends/azure/utils.py +42 -0
  64. dstack/_internal/core/backends/base/backend.py +18 -0
  65. dstack/_internal/core/backends/base/compute.py +1101 -0
  66. dstack/_internal/core/backends/base/configurator.py +117 -0
  67. dstack/_internal/core/backends/base/models.py +24 -0
  68. dstack/_internal/core/backends/base/offers.py +232 -0
  69. dstack/_internal/core/backends/cloudrift/api_client.py +220 -0
  70. dstack/_internal/core/backends/cloudrift/backend.py +16 -0
  71. dstack/_internal/core/backends/cloudrift/compute.py +138 -0
  72. dstack/_internal/core/backends/cloudrift/configurator.py +72 -0
  73. dstack/_internal/core/backends/cloudrift/models.py +40 -0
  74. dstack/_internal/core/backends/configurators.py +181 -0
  75. dstack/_internal/core/backends/cudo/__init__.py +0 -0
  76. dstack/_internal/core/backends/cudo/api_client.py +111 -0
  77. dstack/_internal/core/backends/cudo/backend.py +16 -0
  78. dstack/_internal/core/backends/cudo/compute.py +174 -0
  79. dstack/_internal/core/backends/cudo/configurator.py +63 -0
  80. dstack/_internal/core/backends/cudo/models.py +37 -0
  81. dstack/_internal/core/backends/datacrunch/__init__.py +1 -0
  82. dstack/_internal/core/backends/datacrunch/backend.py +18 -0
  83. dstack/_internal/core/backends/datacrunch/compute.py +8 -0
  84. dstack/_internal/core/backends/datacrunch/configurator.py +17 -0
  85. dstack/_internal/core/backends/digitalocean/__init__.py +1 -0
  86. dstack/_internal/core/backends/digitalocean/backend.py +16 -0
  87. dstack/_internal/core/backends/digitalocean/compute.py +5 -0
  88. dstack/_internal/core/backends/digitalocean/configurator.py +31 -0
  89. dstack/_internal/core/backends/digitalocean_base/__init__.py +1 -0
  90. dstack/_internal/core/backends/digitalocean_base/api_client.py +104 -0
  91. dstack/_internal/core/backends/digitalocean_base/backend.py +5 -0
  92. dstack/_internal/core/backends/digitalocean_base/compute.py +174 -0
  93. dstack/_internal/core/backends/digitalocean_base/configurator.py +57 -0
  94. dstack/_internal/core/backends/digitalocean_base/models.py +43 -0
  95. dstack/_internal/core/backends/dstack/__init__.py +0 -0
  96. dstack/_internal/core/backends/dstack/models.py +26 -0
  97. dstack/_internal/core/backends/features.py +74 -0
  98. dstack/_internal/core/backends/gcp/__init__.py +0 -0
  99. dstack/_internal/core/backends/gcp/auth.py +57 -0
  100. dstack/_internal/core/backends/gcp/backend.py +17 -0
  101. dstack/_internal/core/backends/gcp/compute.py +1257 -0
  102. dstack/_internal/core/backends/gcp/configurator.py +206 -0
  103. dstack/_internal/core/backends/gcp/features/__init__.py +0 -0
  104. dstack/_internal/core/backends/gcp/features/tcpx.py +65 -0
  105. dstack/_internal/core/backends/gcp/models.py +160 -0
  106. dstack/_internal/core/backends/gcp/resources.py +585 -0
  107. dstack/_internal/core/backends/hotaisle/__init__.py +1 -0
  108. dstack/_internal/core/backends/hotaisle/api_client.py +101 -0
  109. dstack/_internal/core/backends/hotaisle/backend.py +16 -0
  110. dstack/_internal/core/backends/hotaisle/compute.py +188 -0
  111. dstack/_internal/core/backends/hotaisle/configurator.py +66 -0
  112. dstack/_internal/core/backends/hotaisle/models.py +45 -0
  113. dstack/_internal/core/backends/kubernetes/__init__.py +0 -0
  114. dstack/_internal/core/backends/kubernetes/backend.py +16 -0
  115. dstack/_internal/core/backends/kubernetes/compute.py +1077 -0
  116. dstack/_internal/core/backends/kubernetes/configurator.py +61 -0
  117. dstack/_internal/core/backends/kubernetes/models.py +71 -0
  118. dstack/_internal/core/backends/kubernetes/utils.py +81 -0
  119. dstack/_internal/core/backends/lambdalabs/__init__.py +0 -0
  120. dstack/_internal/core/backends/lambdalabs/api_client.py +87 -0
  121. dstack/_internal/core/backends/lambdalabs/backend.py +17 -0
  122. dstack/_internal/core/backends/lambdalabs/compute.py +233 -0
  123. dstack/_internal/core/backends/lambdalabs/configurator.py +65 -0
  124. dstack/_internal/core/backends/lambdalabs/models.py +37 -0
  125. dstack/_internal/core/backends/local/__init__.py +0 -0
  126. dstack/_internal/core/backends/local/backend.py +14 -0
  127. dstack/_internal/core/backends/local/compute.py +130 -0
  128. dstack/_internal/core/backends/models.py +158 -0
  129. dstack/_internal/core/backends/nebius/__init__.py +0 -0
  130. dstack/_internal/core/backends/nebius/backend.py +16 -0
  131. dstack/_internal/core/backends/nebius/compute.py +401 -0
  132. dstack/_internal/core/backends/nebius/configurator.py +98 -0
  133. dstack/_internal/core/backends/nebius/models.py +185 -0
  134. dstack/_internal/core/backends/nebius/resources.py +433 -0
  135. dstack/_internal/core/backends/oci/__init__.py +0 -0
  136. dstack/_internal/core/backends/oci/auth.py +21 -0
  137. dstack/_internal/core/backends/oci/backend.py +16 -0
  138. dstack/_internal/core/backends/oci/compute.py +209 -0
  139. dstack/_internal/core/backends/oci/configurator.py +156 -0
  140. dstack/_internal/core/backends/oci/exceptions.py +15 -0
  141. dstack/_internal/core/backends/oci/models.py +87 -0
  142. dstack/_internal/core/backends/oci/region.py +86 -0
  143. dstack/_internal/core/backends/oci/resources.py +836 -0
  144. dstack/_internal/core/backends/runpod/__init__.py +0 -0
  145. dstack/_internal/core/backends/runpod/api_client.py +627 -0
  146. dstack/_internal/core/backends/runpod/backend.py +16 -0
  147. dstack/_internal/core/backends/runpod/compute.py +444 -0
  148. dstack/_internal/core/backends/runpod/configurator.py +63 -0
  149. dstack/_internal/core/backends/runpod/models.py +54 -0
  150. dstack/_internal/core/backends/template/__init__.py +0 -0
  151. dstack/_internal/core/backends/template/backend.py.jinja +16 -0
  152. dstack/_internal/core/backends/template/compute.py.jinja +95 -0
  153. dstack/_internal/core/backends/template/configurator.py.jinja +69 -0
  154. dstack/_internal/core/backends/template/models.py.jinja +62 -0
  155. dstack/_internal/core/backends/tensordock/models.py +40 -0
  156. dstack/_internal/core/backends/vastai/__init__.py +0 -0
  157. dstack/_internal/core/backends/vastai/api_client.py +143 -0
  158. dstack/_internal/core/backends/vastai/backend.py +16 -0
  159. dstack/_internal/core/backends/vastai/compute.py +141 -0
  160. dstack/_internal/core/backends/vastai/configurator.py +69 -0
  161. dstack/_internal/core/backends/vastai/models.py +37 -0
  162. dstack/_internal/core/backends/verda/__init__.py +0 -0
  163. dstack/_internal/core/backends/verda/backend.py +16 -0
  164. dstack/_internal/core/backends/verda/compute.py +266 -0
  165. dstack/_internal/core/backends/verda/configurator.py +73 -0
  166. dstack/_internal/core/backends/verda/models.py +38 -0
  167. dstack/_internal/core/backends/vultr/__init__.py +0 -0
  168. dstack/_internal/core/backends/vultr/api_client.py +116 -0
  169. dstack/_internal/core/backends/vultr/backend.py +16 -0
  170. dstack/_internal/core/backends/vultr/compute.py +167 -0
  171. dstack/_internal/core/backends/vultr/configurator.py +71 -0
  172. dstack/_internal/core/backends/vultr/models.py +34 -0
  173. dstack/_internal/core/compatibility/__init__.py +0 -0
  174. dstack/_internal/core/compatibility/events.py +13 -0
  175. dstack/_internal/core/compatibility/fleets.py +58 -0
  176. dstack/_internal/core/compatibility/gateways.py +39 -0
  177. dstack/_internal/core/compatibility/gpus.py +13 -0
  178. dstack/_internal/core/compatibility/logs.py +14 -0
  179. dstack/_internal/core/compatibility/runs.py +86 -0
  180. dstack/_internal/core/compatibility/volumes.py +37 -0
  181. dstack/_internal/core/consts.py +8 -0
  182. dstack/_internal/core/errors.py +160 -0
  183. dstack/_internal/core/models/__init__.py +0 -0
  184. dstack/_internal/core/models/auth.py +28 -0
  185. dstack/_internal/core/models/backends/__init__.py +0 -0
  186. dstack/_internal/core/models/backends/base.py +48 -0
  187. dstack/_internal/core/models/common.py +143 -0
  188. dstack/_internal/core/models/compute_groups.py +39 -0
  189. dstack/_internal/core/models/config.py +28 -0
  190. dstack/_internal/core/models/configurations.py +1123 -0
  191. dstack/_internal/core/models/envs.py +149 -0
  192. dstack/_internal/core/models/events.py +98 -0
  193. dstack/_internal/core/models/files.py +67 -0
  194. dstack/_internal/core/models/fleets.py +437 -0
  195. dstack/_internal/core/models/gateways.py +146 -0
  196. dstack/_internal/core/models/gpus.py +45 -0
  197. dstack/_internal/core/models/health.py +28 -0
  198. dstack/_internal/core/models/instances.py +346 -0
  199. dstack/_internal/core/models/logs.py +27 -0
  200. dstack/_internal/core/models/metrics.py +14 -0
  201. dstack/_internal/core/models/placement.py +27 -0
  202. dstack/_internal/core/models/profiles.py +431 -0
  203. dstack/_internal/core/models/projects.py +46 -0
  204. dstack/_internal/core/models/repos/__init__.py +34 -0
  205. dstack/_internal/core/models/repos/base.py +36 -0
  206. dstack/_internal/core/models/repos/local.py +96 -0
  207. dstack/_internal/core/models/repos/remote.py +341 -0
  208. dstack/_internal/core/models/repos/virtual.py +85 -0
  209. dstack/_internal/core/models/resources.py +424 -0
  210. dstack/_internal/core/models/routers.py +24 -0
  211. dstack/_internal/core/models/runs.py +618 -0
  212. dstack/_internal/core/models/secrets.py +16 -0
  213. dstack/_internal/core/models/server.py +7 -0
  214. dstack/_internal/core/models/services.py +76 -0
  215. dstack/_internal/core/models/unix.py +53 -0
  216. dstack/_internal/core/models/users.py +60 -0
  217. dstack/_internal/core/models/volumes.py +221 -0
  218. dstack/_internal/core/services/__init__.py +16 -0
  219. dstack/_internal/core/services/api_client.py +15 -0
  220. dstack/_internal/core/services/configs/__init__.py +116 -0
  221. dstack/_internal/core/services/diff.py +71 -0
  222. dstack/_internal/core/services/logs.py +58 -0
  223. dstack/_internal/core/services/profiles.py +46 -0
  224. dstack/_internal/core/services/repos.py +236 -0
  225. dstack/_internal/core/services/ssh/__init__.py +27 -0
  226. dstack/_internal/core/services/ssh/attach.py +241 -0
  227. dstack/_internal/core/services/ssh/client.py +113 -0
  228. dstack/_internal/core/services/ssh/key_manager.py +53 -0
  229. dstack/_internal/core/services/ssh/ports.py +89 -0
  230. dstack/_internal/core/services/ssh/tunnel.py +337 -0
  231. dstack/_internal/proxy/__init__.py +8 -0
  232. dstack/_internal/proxy/gateway/__init__.py +0 -0
  233. dstack/_internal/proxy/gateway/app.py +89 -0
  234. dstack/_internal/proxy/gateway/auth.py +26 -0
  235. dstack/_internal/proxy/gateway/const.py +7 -0
  236. dstack/_internal/proxy/gateway/deps.py +73 -0
  237. dstack/_internal/proxy/gateway/main.py +17 -0
  238. dstack/_internal/proxy/gateway/models.py +23 -0
  239. dstack/_internal/proxy/gateway/repo/__init__.py +0 -0
  240. dstack/_internal/proxy/gateway/repo/repo.py +121 -0
  241. dstack/_internal/proxy/gateway/repo/state_v1.py +164 -0
  242. dstack/_internal/proxy/gateway/resources/nginx/00-log-format.conf +11 -0
  243. dstack/_internal/proxy/gateway/resources/nginx/entrypoint.jinja2 +27 -0
  244. dstack/_internal/proxy/gateway/resources/nginx/router_workers.jinja2 +23 -0
  245. dstack/_internal/proxy/gateway/resources/nginx/service.jinja2 +105 -0
  246. dstack/_internal/proxy/gateway/routers/__init__.py +0 -0
  247. dstack/_internal/proxy/gateway/routers/auth.py +10 -0
  248. dstack/_internal/proxy/gateway/routers/config.py +28 -0
  249. dstack/_internal/proxy/gateway/routers/registry.py +124 -0
  250. dstack/_internal/proxy/gateway/routers/stats.py +18 -0
  251. dstack/_internal/proxy/gateway/schemas/__init__.py +0 -0
  252. dstack/_internal/proxy/gateway/schemas/common.py +5 -0
  253. dstack/_internal/proxy/gateway/schemas/config.py +9 -0
  254. dstack/_internal/proxy/gateway/schemas/registry.py +63 -0
  255. dstack/_internal/proxy/gateway/schemas/stats.py +15 -0
  256. dstack/_internal/proxy/gateway/services/__init__.py +0 -0
  257. dstack/_internal/proxy/gateway/services/model_routers/__init__.py +18 -0
  258. dstack/_internal/proxy/gateway/services/model_routers/base.py +91 -0
  259. dstack/_internal/proxy/gateway/services/model_routers/sglang.py +269 -0
  260. dstack/_internal/proxy/gateway/services/nginx.py +455 -0
  261. dstack/_internal/proxy/gateway/services/registry.py +426 -0
  262. dstack/_internal/proxy/gateway/services/server_client.py +95 -0
  263. dstack/_internal/proxy/gateway/services/stats.py +170 -0
  264. dstack/_internal/proxy/gateway/testing/__init__.py +0 -0
  265. dstack/_internal/proxy/gateway/testing/common.py +13 -0
  266. dstack/_internal/proxy/lib/__init__.py +0 -0
  267. dstack/_internal/proxy/lib/auth.py +7 -0
  268. dstack/_internal/proxy/lib/deps.py +106 -0
  269. dstack/_internal/proxy/lib/errors.py +14 -0
  270. dstack/_internal/proxy/lib/models.py +112 -0
  271. dstack/_internal/proxy/lib/repo.py +27 -0
  272. dstack/_internal/proxy/lib/routers/__init__.py +0 -0
  273. dstack/_internal/proxy/lib/routers/model_proxy.py +102 -0
  274. dstack/_internal/proxy/lib/schemas/__init__.py +0 -0
  275. dstack/_internal/proxy/lib/schemas/model_proxy.py +77 -0
  276. dstack/_internal/proxy/lib/services/__init__.py +0 -0
  277. dstack/_internal/proxy/lib/services/model_proxy/__init__.py +0 -0
  278. dstack/_internal/proxy/lib/services/model_proxy/clients/__init__.py +0 -0
  279. dstack/_internal/proxy/lib/services/model_proxy/clients/base.py +18 -0
  280. dstack/_internal/proxy/lib/services/model_proxy/clients/openai.py +67 -0
  281. dstack/_internal/proxy/lib/services/model_proxy/clients/tgi.py +208 -0
  282. dstack/_internal/proxy/lib/services/model_proxy/model_proxy.py +23 -0
  283. dstack/_internal/proxy/lib/services/service_connection.py +160 -0
  284. dstack/_internal/proxy/lib/testing/__init__.py +0 -0
  285. dstack/_internal/proxy/lib/testing/auth.py +11 -0
  286. dstack/_internal/proxy/lib/testing/common.py +51 -0
  287. dstack/_internal/server/__init__.py +0 -0
  288. dstack/_internal/server/alembic.ini +100 -0
  289. dstack/_internal/server/app.py +432 -0
  290. dstack/_internal/server/background/__init__.py +142 -0
  291. dstack/_internal/server/background/tasks/__init__.py +0 -0
  292. dstack/_internal/server/background/tasks/common.py +24 -0
  293. dstack/_internal/server/background/tasks/process_compute_groups.py +167 -0
  294. dstack/_internal/server/background/tasks/process_events.py +17 -0
  295. dstack/_internal/server/background/tasks/process_fleets.py +289 -0
  296. dstack/_internal/server/background/tasks/process_gateways.py +188 -0
  297. dstack/_internal/server/background/tasks/process_idle_volumes.py +145 -0
  298. dstack/_internal/server/background/tasks/process_instances.py +1186 -0
  299. dstack/_internal/server/background/tasks/process_metrics.py +172 -0
  300. dstack/_internal/server/background/tasks/process_placement_groups.py +104 -0
  301. dstack/_internal/server/background/tasks/process_probes.py +164 -0
  302. dstack/_internal/server/background/tasks/process_prometheus_metrics.py +150 -0
  303. dstack/_internal/server/background/tasks/process_running_jobs.py +1238 -0
  304. dstack/_internal/server/background/tasks/process_runs.py +842 -0
  305. dstack/_internal/server/background/tasks/process_submitted_jobs.py +1106 -0
  306. dstack/_internal/server/background/tasks/process_terminating_jobs.py +108 -0
  307. dstack/_internal/server/background/tasks/process_volumes.py +129 -0
  308. dstack/_internal/server/compatibility/__init__.py +0 -0
  309. dstack/_internal/server/compatibility/common.py +20 -0
  310. dstack/_internal/server/compatibility/gpus.py +22 -0
  311. dstack/_internal/server/db.py +127 -0
  312. dstack/_internal/server/deps.py +19 -0
  313. dstack/_internal/server/main.py +4 -0
  314. dstack/_internal/server/migrations/__init__.py +0 -0
  315. dstack/_internal/server/migrations/env.py +112 -0
  316. dstack/_internal/server/migrations/script.py.mako +28 -0
  317. dstack/_internal/server/migrations/versions/006512f572b4_add_projects_original_name.py +38 -0
  318. dstack/_internal/server/migrations/versions/065588ec72b8_add_vultr_to_backendtype_enum.py +81 -0
  319. dstack/_internal/server/migrations/versions/06e977bc61c7_add_usermodel_deleted_and_original_name.py +45 -0
  320. dstack/_internal/server/migrations/versions/0e33559e16ed_update_instancestatus.py +64 -0
  321. dstack/_internal/server/migrations/versions/112753bc17dd_remove_nullable_fields.py +50 -0
  322. dstack/_internal/server/migrations/versions/1338b788b612_reverse_job_instance_relationship.py +71 -0
  323. dstack/_internal/server/migrations/versions/14f2cb002fc2_add_jobmodel_removed_flag.py +44 -0
  324. dstack/_internal/server/migrations/versions/1a48dfe44a40_rework_termination_handling.py +42 -0
  325. dstack/_internal/server/migrations/versions/1aa9638ad963_added_email_index.py +31 -0
  326. dstack/_internal/server/migrations/versions/1e3fb39ef74b_add_remote_connection_details.py +26 -0
  327. dstack/_internal/server/migrations/versions/1e76fb0dde87_add_jobmodel_inactivity_secs.py +32 -0
  328. dstack/_internal/server/migrations/versions/20166748b60c_add_jobmodel_disconnected_at.py +100 -0
  329. dstack/_internal/server/migrations/versions/22d74df9897e_add_events_and_event_targets.py +99 -0
  330. dstack/_internal/server/migrations/versions/23e01c56279a_make_blob_nullable.py +32 -0
  331. dstack/_internal/server/migrations/versions/2498ab323443_add_fleetmodel_consolidation_attempt_.py +44 -0
  332. dstack/_internal/server/migrations/versions/252d3743b641_.py +40 -0
  333. dstack/_internal/server/migrations/versions/25479f540245_add_probes.py +43 -0
  334. dstack/_internal/server/migrations/versions/27d3e55759fa_add_pools.py +152 -0
  335. dstack/_internal/server/migrations/versions/29826f417010_remove_instancemodel_retry_policy.py +34 -0
  336. dstack/_internal/server/migrations/versions/29c08c6a8cb3_.py +36 -0
  337. dstack/_internal/server/migrations/versions/35e90e1b0d3e_add_rolling_deployment_fields.py +42 -0
  338. dstack/_internal/server/migrations/versions/35f732ee4cf5_add_projectmodel_is_public.py +39 -0
  339. dstack/_internal/server/migrations/versions/3cf77fb8bcf1_store_repo_clone_url.py +85 -0
  340. dstack/_internal/server/migrations/versions/3d7f6c2ec000_add_jobmodel_registered.py +28 -0
  341. dstack/_internal/server/migrations/versions/3dbdce90d0e0_fix_code_uq_constraint.py +33 -0
  342. dstack/_internal/server/migrations/versions/48ad3ecbaea2_do_not_delete_projects_and_runs.py +46 -0
  343. dstack/_internal/server/migrations/versions/4ae1a5b0e7f1_add_run_list_index.py +34 -0
  344. dstack/_internal/server/migrations/versions/4b4319398164_introduce_runs_processing.py +144 -0
  345. dstack/_internal/server/migrations/versions/50dd7ea98639_index_status_columns.py +55 -0
  346. dstack/_internal/server/migrations/versions/51d45659d574_add_instancemodel_blocks_fields.py +43 -0
  347. dstack/_internal/server/migrations/versions/54a77e19c64c_add_manager_project_role.py +67 -0
  348. dstack/_internal/server/migrations/versions/555138b1f77f_change_instancemodel_for_asynchronous_.py +61 -0
  349. dstack/_internal/server/migrations/versions/58aa5162dcc3_add_gatewaymodel_configuration.py +32 -0
  350. dstack/_internal/server/migrations/versions/5ad8debc8fe6_fixes_for_psql.py +329 -0
  351. dstack/_internal/server/migrations/versions/5ec538b70e71_replace_instansestatus.py +31 -0
  352. dstack/_internal/server/migrations/versions/5f1707c525d2_add_filearchivemodel.py +39 -0
  353. dstack/_internal/server/migrations/versions/5fd659afca82_add_ix_instances_fleet_id.py +31 -0
  354. dstack/_internal/server/migrations/versions/60e444118b6d_add_jobprometheusmetrics.py +40 -0
  355. dstack/_internal/server/migrations/versions/63c3f19cb184_add_jobterminationreason_inactivity_.py +83 -0
  356. dstack/_internal/server/migrations/versions/644b8a114187_add_secretmodel.py +49 -0
  357. dstack/_internal/server/migrations/versions/686fb8341ea5_add_user_emails.py +32 -0
  358. dstack/_internal/server/migrations/versions/6c1a9d6530ee_add_jobmodel_exit_status.py +26 -0
  359. dstack/_internal/server/migrations/versions/706e0acc3a7d_add_runmodel_desired_replica_counts.py +26 -0
  360. dstack/_internal/server/migrations/versions/710e5b3fac8f_add_encryption.py +54 -0
  361. dstack/_internal/server/migrations/versions/728b1488b1b4_add_instance_health.py +50 -0
  362. dstack/_internal/server/migrations/versions/74a1f55209bd_store_enums_as_strings.py +484 -0
  363. dstack/_internal/server/migrations/versions/7b24b1c8eba7_add_instancemodel_last_processed_at.py +68 -0
  364. dstack/_internal/server/migrations/versions/7ba3b59d7ca6_add_runmodel_resubmission_attempt.py +35 -0
  365. dstack/_internal/server/migrations/versions/7bc2586e8b9e_make_instancemodel_pool_id_optional.py +36 -0
  366. dstack/_internal/server/migrations/versions/7d1ec2b920ac_add_computegroupmodel.py +91 -0
  367. dstack/_internal/server/migrations/versions/803c7e9ed85d_add_jobmodel_job_runtime_data.py +32 -0
  368. dstack/_internal/server/migrations/versions/82b32a135ea2_.py +58 -0
  369. dstack/_internal/server/migrations/versions/866ec1d67184_replace_retrypolicy_limit_with_.py +93 -0
  370. dstack/_internal/server/migrations/versions/903c91e24634_add_instances_termination_reason_message.py +34 -0
  371. dstack/_internal/server/migrations/versions/91a12fff6c76_add_repocredsmodel.py +43 -0
  372. dstack/_internal/server/migrations/versions/91ac5e543037_extend_repos_creds_column.py +36 -0
  373. dstack/_internal/server/migrations/versions/98cd9c8b5927_add_volumemodel.py +73 -0
  374. dstack/_internal/server/migrations/versions/98d1b92988bc_add_jobterminationreason_terminated_due_.py +140 -0
  375. dstack/_internal/server/migrations/versions/99b4c8c954ea_add_termination_reason_message.py +71 -0
  376. dstack/_internal/server/migrations/versions/9eea6af28e10_added_fail_reason_for_instancemodel.py +36 -0
  377. dstack/_internal/server/migrations/versions/__init__.py +0 -0
  378. dstack/_internal/server/migrations/versions/a060e2440936_.py +206 -0
  379. dstack/_internal/server/migrations/versions/a751ef183f27_move_attachment_data_to_volumes_.py +34 -0
  380. dstack/_internal/server/migrations/versions/a7b46c073fa1_add_placementgroupmodel.py +58 -0
  381. dstack/_internal/server/migrations/versions/afbc600ff2b2_add_created_at_to_usermodel_and_.py +102 -0
  382. dstack/_internal/server/migrations/versions/b4d6ad60db08_add_instancemodel_unreachable.py +37 -0
  383. dstack/_internal/server/migrations/versions/b88d55c2a07d_replace_instancestatus_ready.py +21 -0
  384. dstack/_internal/server/migrations/versions/bc8ca4a505c6_store_backendtype_as_string.py +171 -0
  385. dstack/_internal/server/migrations/versions/bca2fdf130bf_add_runmodel_priority.py +34 -0
  386. dstack/_internal/server/migrations/versions/bfba43f6def2_.py +32 -0
  387. dstack/_internal/server/migrations/versions/c00090eaef21_support_fleets.py +108 -0
  388. dstack/_internal/server/migrations/versions/c154eece89da_add_fields_for_async_gateway_creation.py +74 -0
  389. dstack/_internal/server/migrations/versions/c20626d03cfb_add_jobmetricspoint.py +43 -0
  390. dstack/_internal/server/migrations/versions/c48df7985d57_add_instance_termination_retries.py +38 -0
  391. dstack/_internal/server/migrations/versions/c83d45f9a971_replace_string_with_text.py +150 -0
  392. dstack/_internal/server/migrations/versions/d0bb68e48b9f_add_project_owners_and_quotas.py +106 -0
  393. dstack/_internal/server/migrations/versions/d3e8af4786fa_gateway_compute_flag_deleted.py +34 -0
  394. dstack/_internal/server/migrations/versions/d4d9dc26cf58_add_ix_jobs_run_id.py +31 -0
  395. dstack/_internal/server/migrations/versions/d5863798bf41_add_volumemodel_last_job_processed_at.py +40 -0
  396. dstack/_internal/server/migrations/versions/d6b11105f659_add_usermodel_active.py +36 -0
  397. dstack/_internal/server/migrations/versions/da574e93fee0_add_jobmodel_volumes_detached_at.py +40 -0
  398. dstack/_internal/server/migrations/versions/dfffd6a1165c_add_fields_for_gateways_behind_alb.py +36 -0
  399. dstack/_internal/server/migrations/versions/e2d08cd1b8d9_add_jobmodel_fleet.py +41 -0
  400. dstack/_internal/server/migrations/versions/e3b7db07727f_add_gatewaycomputemodel_app_updated_at.py +61 -0
  401. dstack/_internal/server/migrations/versions/e6391ca6a264_separate_gateways_from_compute.py +72 -0
  402. dstack/_internal/server/migrations/versions/ea60480f82bb_add_membermodel_member_num.py +32 -0
  403. dstack/_internal/server/migrations/versions/ec02a26a256c_add_runmodel_next_triggered_at.py +38 -0
  404. dstack/_internal/server/migrations/versions/ed0ca30e13bb_migrate_instancestatus_provisioning.py +29 -0
  405. dstack/_internal/server/migrations/versions/fe72c4de8376_add_gateways.py +81 -0
  406. dstack/_internal/server/migrations/versions/ff1d94f65b08_user_ssh_key.py +34 -0
  407. dstack/_internal/server/migrations/versions/ffa99edd1988_add_jobterminationreason_max_duration_.py +81 -0
  408. dstack/_internal/server/models.py +930 -0
  409. dstack/_internal/server/routers/__init__.py +0 -0
  410. dstack/_internal/server/routers/auth.py +34 -0
  411. dstack/_internal/server/routers/backends.py +142 -0
  412. dstack/_internal/server/routers/events.py +60 -0
  413. dstack/_internal/server/routers/files.py +68 -0
  414. dstack/_internal/server/routers/fleets.py +202 -0
  415. dstack/_internal/server/routers/gateways.py +109 -0
  416. dstack/_internal/server/routers/gpus.py +32 -0
  417. dstack/_internal/server/routers/instances.py +77 -0
  418. dstack/_internal/server/routers/logs.py +34 -0
  419. dstack/_internal/server/routers/metrics.py +82 -0
  420. dstack/_internal/server/routers/projects.py +205 -0
  421. dstack/_internal/server/routers/prometheus.py +35 -0
  422. dstack/_internal/server/routers/repos.py +118 -0
  423. dstack/_internal/server/routers/runs.py +216 -0
  424. dstack/_internal/server/routers/secrets.py +86 -0
  425. dstack/_internal/server/routers/server.py +19 -0
  426. dstack/_internal/server/routers/users.py +158 -0
  427. dstack/_internal/server/routers/volumes.py +122 -0
  428. dstack/_internal/server/schemas/__init__.py +0 -0
  429. dstack/_internal/server/schemas/auth.py +83 -0
  430. dstack/_internal/server/schemas/backends.py +16 -0
  431. dstack/_internal/server/schemas/common.py +9 -0
  432. dstack/_internal/server/schemas/events.py +211 -0
  433. dstack/_internal/server/schemas/files.py +5 -0
  434. dstack/_internal/server/schemas/fleets.py +49 -0
  435. dstack/_internal/server/schemas/gateways.py +31 -0
  436. dstack/_internal/server/schemas/gpus.py +26 -0
  437. dstack/_internal/server/schemas/health/__init__.py +0 -0
  438. dstack/_internal/server/schemas/health/dcgm.py +56 -0
  439. dstack/_internal/server/schemas/instances.py +47 -0
  440. dstack/_internal/server/schemas/logs.py +17 -0
  441. dstack/_internal/server/schemas/projects.py +81 -0
  442. dstack/_internal/server/schemas/repos.py +24 -0
  443. dstack/_internal/server/schemas/runner.py +269 -0
  444. dstack/_internal/server/schemas/runs.py +66 -0
  445. dstack/_internal/server/schemas/secrets.py +16 -0
  446. dstack/_internal/server/schemas/users.py +72 -0
  447. dstack/_internal/server/schemas/volumes.py +29 -0
  448. dstack/_internal/server/security/__init__.py +0 -0
  449. dstack/_internal/server/security/permissions.py +251 -0
  450. dstack/_internal/server/services/__init__.py +0 -0
  451. dstack/_internal/server/services/auth.py +77 -0
  452. dstack/_internal/server/services/backends/__init__.py +404 -0
  453. dstack/_internal/server/services/backends/handlers.py +105 -0
  454. dstack/_internal/server/services/compute_groups.py +22 -0
  455. dstack/_internal/server/services/config.py +279 -0
  456. dstack/_internal/server/services/docker.py +162 -0
  457. dstack/_internal/server/services/encryption/__init__.py +102 -0
  458. dstack/_internal/server/services/encryption/keys/__init__.py +0 -0
  459. dstack/_internal/server/services/encryption/keys/aes.py +68 -0
  460. dstack/_internal/server/services/encryption/keys/base.py +19 -0
  461. dstack/_internal/server/services/encryption/keys/identity.py +28 -0
  462. dstack/_internal/server/services/events.py +477 -0
  463. dstack/_internal/server/services/files.py +91 -0
  464. dstack/_internal/server/services/fleets.py +1224 -0
  465. dstack/_internal/server/services/gateways/__init__.py +686 -0
  466. dstack/_internal/server/services/gateways/client.py +209 -0
  467. dstack/_internal/server/services/gateways/connection.py +139 -0
  468. dstack/_internal/server/services/gateways/pool.py +58 -0
  469. dstack/_internal/server/services/gpus.py +387 -0
  470. dstack/_internal/server/services/instances.py +731 -0
  471. dstack/_internal/server/services/jobs/__init__.py +840 -0
  472. dstack/_internal/server/services/jobs/configurators/__init__.py +0 -0
  473. dstack/_internal/server/services/jobs/configurators/base.py +469 -0
  474. dstack/_internal/server/services/jobs/configurators/dev.py +69 -0
  475. dstack/_internal/server/services/jobs/configurators/extensions/__init__.py +0 -0
  476. dstack/_internal/server/services/jobs/configurators/extensions/base.py +15 -0
  477. dstack/_internal/server/services/jobs/configurators/extensions/cursor.py +42 -0
  478. dstack/_internal/server/services/jobs/configurators/extensions/vscode.py +42 -0
  479. dstack/_internal/server/services/jobs/configurators/extensions/windsurf.py +43 -0
  480. dstack/_internal/server/services/jobs/configurators/service.py +28 -0
  481. dstack/_internal/server/services/jobs/configurators/task.py +39 -0
  482. dstack/_internal/server/services/locking.py +187 -0
  483. dstack/_internal/server/services/logging.py +29 -0
  484. dstack/_internal/server/services/logs/__init__.py +122 -0
  485. dstack/_internal/server/services/logs/aws.py +373 -0
  486. dstack/_internal/server/services/logs/base.py +47 -0
  487. dstack/_internal/server/services/logs/filelog.py +261 -0
  488. dstack/_internal/server/services/logs/fluentbit.py +329 -0
  489. dstack/_internal/server/services/logs/gcp.py +181 -0
  490. dstack/_internal/server/services/metrics.py +172 -0
  491. dstack/_internal/server/services/offers.py +249 -0
  492. dstack/_internal/server/services/permissions.py +37 -0
  493. dstack/_internal/server/services/placement.py +234 -0
  494. dstack/_internal/server/services/plugins.py +109 -0
  495. dstack/_internal/server/services/probes.py +10 -0
  496. dstack/_internal/server/services/projects.py +835 -0
  497. dstack/_internal/server/services/prometheus/__init__.py +0 -0
  498. dstack/_internal/server/services/prometheus/client_metrics.py +55 -0
  499. dstack/_internal/server/services/prometheus/custom_metrics.py +327 -0
  500. dstack/_internal/server/services/proxy/__init__.py +3 -0
  501. dstack/_internal/server/services/proxy/auth.py +12 -0
  502. dstack/_internal/server/services/proxy/deps.py +18 -0
  503. dstack/_internal/server/services/proxy/repo.py +189 -0
  504. dstack/_internal/server/services/proxy/routers/__init__.py +0 -0
  505. dstack/_internal/server/services/proxy/routers/service_proxy.py +49 -0
  506. dstack/_internal/server/services/proxy/services/__init__.py +0 -0
  507. dstack/_internal/server/services/proxy/services/service_proxy.py +135 -0
  508. dstack/_internal/server/services/repos.py +362 -0
  509. dstack/_internal/server/services/requirements/__init__.py +0 -0
  510. dstack/_internal/server/services/requirements/combine.py +260 -0
  511. dstack/_internal/server/services/resources.py +21 -0
  512. dstack/_internal/server/services/runner/__init__.py +0 -0
  513. dstack/_internal/server/services/runner/client.py +646 -0
  514. dstack/_internal/server/services/runner/ssh.py +128 -0
  515. dstack/_internal/server/services/runs/__init__.py +1026 -0
  516. dstack/_internal/server/services/runs/plan.py +703 -0
  517. dstack/_internal/server/services/runs/replicas.py +317 -0
  518. dstack/_internal/server/services/runs/spec.py +191 -0
  519. dstack/_internal/server/services/secrets.py +245 -0
  520. dstack/_internal/server/services/services/__init__.py +345 -0
  521. dstack/_internal/server/services/services/autoscalers.py +140 -0
  522. dstack/_internal/server/services/services/options.py +53 -0
  523. dstack/_internal/server/services/ssh.py +67 -0
  524. dstack/_internal/server/services/storage/__init__.py +37 -0
  525. dstack/_internal/server/services/storage/base.py +48 -0
  526. dstack/_internal/server/services/storage/gcs.py +66 -0
  527. dstack/_internal/server/services/storage/s3.py +69 -0
  528. dstack/_internal/server/services/users.py +461 -0
  529. dstack/_internal/server/services/volumes.py +496 -0
  530. dstack/_internal/server/settings.py +161 -0
  531. dstack/_internal/server/statics/00a6e1fb461ed2929fb9.png +0 -0
  532. dstack/_internal/server/statics/0cae4d9f0a36034984a7.png +0 -0
  533. dstack/_internal/server/statics/391de232cc0e30cae513.png +0 -0
  534. dstack/_internal/server/statics/4e0eead8c1a73689ef9d.svg +1 -0
  535. dstack/_internal/server/statics/544afa2f63428c2235b0.png +0 -0
  536. dstack/_internal/server/statics/54a4f50f74c6b9381530.svg +7 -0
  537. dstack/_internal/server/statics/68dd1360a7d2611e0132.svg +4 -0
  538. dstack/_internal/server/statics/69544b4c81973b54a66f.png +0 -0
  539. dstack/_internal/server/statics/77a8b02b17af19e39266.png +0 -0
  540. dstack/_internal/server/statics/83a93a8871c219104367.svg +9 -0
  541. dstack/_internal/server/statics/8f28bb8e9999e5e6a48b.svg +4 -0
  542. dstack/_internal/server/statics/9124086961ab8c366bc4.svg +9 -0
  543. dstack/_internal/server/statics/9a9ebaeb54b025dbac0a.svg +5 -0
  544. dstack/_internal/server/statics/a3428392dc534f3b15c4.svg +7 -0
  545. dstack/_internal/server/statics/ae22625574d69361f72c.png +0 -0
  546. dstack/_internal/server/statics/assets/android-chrome-144x144.png +0 -0
  547. dstack/_internal/server/statics/assets/android-chrome-192x192.png +0 -0
  548. dstack/_internal/server/statics/assets/android-chrome-256x256.png +0 -0
  549. dstack/_internal/server/statics/assets/android-chrome-36x36.png +0 -0
  550. dstack/_internal/server/statics/assets/android-chrome-384x384.png +0 -0
  551. dstack/_internal/server/statics/assets/android-chrome-48x48.png +0 -0
  552. dstack/_internal/server/statics/assets/android-chrome-512x512.png +0 -0
  553. dstack/_internal/server/statics/assets/android-chrome-72x72.png +0 -0
  554. dstack/_internal/server/statics/assets/android-chrome-96x96.png +0 -0
  555. dstack/_internal/server/statics/assets/apple-touch-icon-1024x1024.png +0 -0
  556. dstack/_internal/server/statics/assets/apple-touch-icon-114x114.png +0 -0
  557. dstack/_internal/server/statics/assets/apple-touch-icon-120x120.png +0 -0
  558. dstack/_internal/server/statics/assets/apple-touch-icon-144x144.png +0 -0
  559. dstack/_internal/server/statics/assets/apple-touch-icon-152x152.png +0 -0
  560. dstack/_internal/server/statics/assets/apple-touch-icon-167x167.png +0 -0
  561. dstack/_internal/server/statics/assets/apple-touch-icon-180x180.png +0 -0
  562. dstack/_internal/server/statics/assets/apple-touch-icon-57x57.png +0 -0
  563. dstack/_internal/server/statics/assets/apple-touch-icon-60x60.png +0 -0
  564. dstack/_internal/server/statics/assets/apple-touch-icon-72x72.png +0 -0
  565. dstack/_internal/server/statics/assets/apple-touch-icon-76x76.png +0 -0
  566. dstack/_internal/server/statics/assets/apple-touch-icon-precomposed.png +0 -0
  567. dstack/_internal/server/statics/assets/apple-touch-icon.png +0 -0
  568. dstack/_internal/server/statics/assets/apple-touch-startup-image-1125x2436.png +0 -0
  569. dstack/_internal/server/statics/assets/apple-touch-startup-image-1136x640.png +0 -0
  570. dstack/_internal/server/statics/assets/apple-touch-startup-image-1170x2532.png +0 -0
  571. dstack/_internal/server/statics/assets/apple-touch-startup-image-1179x2556.png +0 -0
  572. dstack/_internal/server/statics/assets/apple-touch-startup-image-1242x2208.png +0 -0
  573. dstack/_internal/server/statics/assets/apple-touch-startup-image-1242x2688.png +0 -0
  574. dstack/_internal/server/statics/assets/apple-touch-startup-image-1284x2778.png +0 -0
  575. dstack/_internal/server/statics/assets/apple-touch-startup-image-1290x2796.png +0 -0
  576. dstack/_internal/server/statics/assets/apple-touch-startup-image-1334x750.png +0 -0
  577. dstack/_internal/server/statics/assets/apple-touch-startup-image-1488x2266.png +0 -0
  578. dstack/_internal/server/statics/assets/apple-touch-startup-image-1536x2048.png +0 -0
  579. dstack/_internal/server/statics/assets/apple-touch-startup-image-1620x2160.png +0 -0
  580. dstack/_internal/server/statics/assets/apple-touch-startup-image-1640x2160.png +0 -0
  581. dstack/_internal/server/statics/assets/apple-touch-startup-image-1668x2224.png +0 -0
  582. dstack/_internal/server/statics/assets/apple-touch-startup-image-1668x2388.png +0 -0
  583. dstack/_internal/server/statics/assets/apple-touch-startup-image-1792x828.png +0 -0
  584. dstack/_internal/server/statics/assets/apple-touch-startup-image-2048x1536.png +0 -0
  585. dstack/_internal/server/statics/assets/apple-touch-startup-image-2048x2732.png +0 -0
  586. dstack/_internal/server/statics/assets/apple-touch-startup-image-2160x1620.png +0 -0
  587. dstack/_internal/server/statics/assets/apple-touch-startup-image-2160x1640.png +0 -0
  588. dstack/_internal/server/statics/assets/apple-touch-startup-image-2208x1242.png +0 -0
  589. dstack/_internal/server/statics/assets/apple-touch-startup-image-2224x1668.png +0 -0
  590. dstack/_internal/server/statics/assets/apple-touch-startup-image-2266x1488.png +0 -0
  591. dstack/_internal/server/statics/assets/apple-touch-startup-image-2388x1668.png +0 -0
  592. dstack/_internal/server/statics/assets/apple-touch-startup-image-2436x1125.png +0 -0
  593. dstack/_internal/server/statics/assets/apple-touch-startup-image-2532x1170.png +0 -0
  594. dstack/_internal/server/statics/assets/apple-touch-startup-image-2556x1179.png +0 -0
  595. dstack/_internal/server/statics/assets/apple-touch-startup-image-2688x1242.png +0 -0
  596. dstack/_internal/server/statics/assets/apple-touch-startup-image-2732x2048.png +0 -0
  597. dstack/_internal/server/statics/assets/apple-touch-startup-image-2778x1284.png +0 -0
  598. dstack/_internal/server/statics/assets/apple-touch-startup-image-2796x1290.png +0 -0
  599. dstack/_internal/server/statics/assets/apple-touch-startup-image-640x1136.png +0 -0
  600. dstack/_internal/server/statics/assets/apple-touch-startup-image-750x1334.png +0 -0
  601. dstack/_internal/server/statics/assets/apple-touch-startup-image-828x1792.png +0 -0
  602. dstack/_internal/server/statics/assets/browserconfig.xml +12 -0
  603. dstack/_internal/server/statics/assets/favicon-16x16.png +0 -0
  604. dstack/_internal/server/statics/assets/favicon-32x32.png +0 -0
  605. dstack/_internal/server/statics/assets/favicon-48x48.png +0 -0
  606. dstack/_internal/server/statics/assets/favicon.ico +0 -0
  607. dstack/{dashboard/statics/assets/manifest.json → _internal/server/statics/assets/manifest.webmanifest} +18 -9
  608. dstack/_internal/server/statics/assets/mstile-144x144.png +0 -0
  609. dstack/_internal/server/statics/assets/mstile-150x150.png +0 -0
  610. dstack/_internal/server/statics/assets/mstile-310x150.png +0 -0
  611. dstack/_internal/server/statics/assets/mstile-310x310.png +0 -0
  612. dstack/_internal/server/statics/assets/mstile-70x70.png +0 -0
  613. dstack/_internal/server/statics/assets/yandex-browser-50x50.png +0 -0
  614. dstack/_internal/server/statics/b7ae68f44193474fc578.png +0 -0
  615. dstack/_internal/server/statics/d2f008c75b2b5b191f3f.png +0 -0
  616. dstack/_internal/server/statics/d44c33e1b92e05c379fd.png +0 -0
  617. dstack/_internal/server/statics/dd43ff0552815179d7ab.png +0 -0
  618. dstack/_internal/server/statics/dd4e7166c0b9aac197d7.png +0 -0
  619. dstack/_internal/server/statics/e30b27916930d43d2271.png +0 -0
  620. dstack/_internal/server/statics/e467d7d60aae81ab198b.svg +6 -0
  621. dstack/_internal/server/statics/eb9b344b73818fe2b71a.png +0 -0
  622. dstack/_internal/server/statics/f517dd626eb964120de0.png +0 -0
  623. dstack/_internal/server/statics/f958aecddee5d8e3222c.png +0 -0
  624. dstack/_internal/server/statics/index.html +3 -0
  625. dstack/_internal/server/statics/logo-notext.svg +116 -0
  626. dstack/_internal/server/statics/main-2e6967bad9f29395eea6.css +3 -0
  627. dstack/_internal/server/statics/main-7dc0f6d20b8b41659acc.js +155547 -0
  628. dstack/_internal/server/statics/main-7dc0f6d20b8b41659acc.js.map +1 -0
  629. dstack/{dashboard → _internal/server}/statics/manifest.json +2 -2
  630. dstack/_internal/server/statics/static/media/entraID.d65d1f3e9486a8e56d24fc07b3230885.svg +9 -0
  631. dstack/_internal/server/statics/static/media/google.b194b06fafd0a52aeb566922160ea514.svg +1 -0
  632. dstack/{dashboard/statics/static/media/logo.f9d7170678f68f796e270698633770ec.svg → _internal/server/statics/static/media/logo.f602feeb138844eda97c8cb641461448.svg} +8 -6
  633. dstack/_internal/server/statics/static/media/okta.12f178e6873a1100965f2a4dbd18fcec.svg +2 -0
  634. dstack/_internal/server/statics/static/media/theme.3994c817bb7dda191c1c9640dee0bf42.svg +3 -0
  635. dstack/_internal/server/testing/__init__.py +0 -0
  636. dstack/_internal/server/testing/common.py +1220 -0
  637. dstack/_internal/server/testing/conf.py +53 -0
  638. dstack/_internal/server/testing/matchers.py +31 -0
  639. dstack/_internal/server/utils/__init__.py +0 -0
  640. dstack/_internal/server/utils/common.py +55 -0
  641. dstack/_internal/server/utils/logging.py +51 -0
  642. dstack/_internal/server/utils/provisioning.py +368 -0
  643. dstack/_internal/server/utils/routers.py +166 -0
  644. dstack/_internal/server/utils/sentry_utils.py +24 -0
  645. dstack/_internal/settings.py +49 -0
  646. dstack/_internal/utils/__init__.py +0 -0
  647. dstack/_internal/utils/common.py +318 -0
  648. dstack/_internal/utils/cron.py +5 -0
  649. dstack/_internal/utils/crypto.py +40 -0
  650. dstack/_internal/utils/env.py +88 -0
  651. dstack/_internal/utils/event_loop.py +30 -0
  652. dstack/_internal/utils/files.py +69 -0
  653. dstack/_internal/utils/gpu.py +59 -0
  654. dstack/_internal/utils/hash.py +31 -0
  655. dstack/_internal/utils/interpolator.py +91 -0
  656. dstack/_internal/utils/json_schema.py +11 -0
  657. dstack/_internal/utils/json_utils.py +54 -0
  658. dstack/_internal/utils/logging.py +5 -0
  659. dstack/_internal/utils/nested_list.py +47 -0
  660. dstack/_internal/utils/network.py +50 -0
  661. dstack/_internal/utils/path.py +57 -0
  662. dstack/_internal/utils/random_names.py +258 -0
  663. dstack/_internal/utils/ssh.py +346 -0
  664. dstack/_internal/utils/tags.py +42 -0
  665. dstack/_internal/utils/typing.py +14 -0
  666. dstack/_internal/utils/version.py +22 -0
  667. dstack/api/__init__.py +46 -0
  668. dstack/api/_public/__init__.py +96 -0
  669. dstack/api/_public/backends.py +42 -0
  670. dstack/api/_public/common.py +5 -0
  671. dstack/api/_public/repos.py +202 -0
  672. dstack/api/_public/runs.py +714 -0
  673. dstack/api/server/__init__.py +206 -0
  674. dstack/api/server/_auth.py +30 -0
  675. dstack/api/server/_backends.py +38 -0
  676. dstack/api/server/_events.py +64 -0
  677. dstack/api/server/_files.py +18 -0
  678. dstack/api/server/_fleets.py +82 -0
  679. dstack/api/server/_gateways.py +54 -0
  680. dstack/api/server/_gpus.py +27 -0
  681. dstack/api/server/_group.py +22 -0
  682. dstack/api/server/_logs.py +15 -0
  683. dstack/api/server/_metrics.py +23 -0
  684. dstack/api/server/_projects.py +124 -0
  685. dstack/api/server/_repos.py +64 -0
  686. dstack/api/server/_runs.py +102 -0
  687. dstack/api/server/_secrets.py +36 -0
  688. dstack/api/server/_users.py +82 -0
  689. dstack/api/server/_volumes.py +39 -0
  690. dstack/api/server/utils.py +34 -0
  691. dstack/api/utils.py +105 -0
  692. dstack/core/__init__.py +0 -0
  693. dstack/plugins/__init__.py +8 -0
  694. dstack/plugins/_base.py +72 -0
  695. dstack/plugins/_models.py +8 -0
  696. dstack/plugins/_utils.py +19 -0
  697. dstack/plugins/builtin/__init__.py +0 -0
  698. dstack/plugins/builtin/rest_plugin/__init__.py +18 -0
  699. dstack/plugins/builtin/rest_plugin/_models.py +48 -0
  700. dstack/plugins/builtin/rest_plugin/_plugin.py +147 -0
  701. dstack/version.py +3 -1
  702. dstack-0.20.7.dist-info/METADATA +519 -0
  703. dstack-0.20.7.dist-info/RECORD +720 -0
  704. {dstack-0.0.9.dist-info → dstack-0.20.7.dist-info}/WHEEL +1 -2
  705. dstack-0.20.7.dist-info/entry_points.txt +2 -0
  706. dstack-0.20.7.dist-info/licenses/LICENSE.md +353 -0
  707. dstack/aws/__init__.py +0 -180
  708. dstack/aws/artifacts.py +0 -111
  709. dstack/aws/config.py +0 -40
  710. dstack/aws/jobs.py +0 -245
  711. dstack/aws/logs.py +0 -186
  712. dstack/aws/repos.py +0 -137
  713. dstack/aws/run_names.py +0 -17
  714. dstack/aws/runners.py +0 -693
  715. dstack/aws/runs.py +0 -79
  716. dstack/aws/secrets.py +0 -99
  717. dstack/aws/tags.py +0 -138
  718. dstack/backend.py +0 -299
  719. dstack/cli/app.py +0 -41
  720. dstack/cli/artifacts.py +0 -87
  721. dstack/cli/common.py +0 -57
  722. dstack/cli/config.py +0 -194
  723. dstack/cli/dashboard.py +0 -26
  724. dstack/cli/delete.py +0 -49
  725. dstack/cli/init.py +0 -33
  726. dstack/cli/logs.py +0 -87
  727. dstack/cli/main.py +0 -81
  728. dstack/cli/restart.py +0 -43
  729. dstack/cli/run.py +0 -223
  730. dstack/cli/schema.py +0 -46
  731. dstack/cli/secrets.py +0 -97
  732. dstack/cli/status.py +0 -140
  733. dstack/cli/stop.py +0 -53
  734. dstack/cli/tags.py +0 -100
  735. dstack/config.py +0 -80
  736. dstack/dashboard/artifacts.py +0 -26
  737. dstack/dashboard/logs.py +0 -73
  738. dstack/dashboard/main.py +0 -45
  739. dstack/dashboard/repos.py +0 -41
  740. dstack/dashboard/runs.py +0 -140
  741. dstack/dashboard/secrets.py +0 -53
  742. dstack/dashboard/statics/4d6a4e032505c1efd23c.png +0 -0
  743. dstack/dashboard/statics/7e018c3e5566d7c349a8.png +0 -0
  744. dstack/dashboard/statics/assets/android-chrome-144x144.png +0 -0
  745. dstack/dashboard/statics/assets/android-chrome-192x192.png +0 -0
  746. dstack/dashboard/statics/assets/android-chrome-256x256.png +0 -0
  747. dstack/dashboard/statics/assets/android-chrome-36x36.png +0 -0
  748. dstack/dashboard/statics/assets/android-chrome-384x384.png +0 -0
  749. dstack/dashboard/statics/assets/android-chrome-48x48.png +0 -0
  750. dstack/dashboard/statics/assets/android-chrome-512x512.png +0 -0
  751. dstack/dashboard/statics/assets/android-chrome-72x72.png +0 -0
  752. dstack/dashboard/statics/assets/android-chrome-96x96.png +0 -0
  753. dstack/dashboard/statics/assets/apple-touch-icon-1024x1024.png +0 -0
  754. dstack/dashboard/statics/assets/apple-touch-icon-114x114.png +0 -0
  755. dstack/dashboard/statics/assets/apple-touch-icon-120x120.png +0 -0
  756. dstack/dashboard/statics/assets/apple-touch-icon-144x144.png +0 -0
  757. dstack/dashboard/statics/assets/apple-touch-icon-152x152.png +0 -0
  758. dstack/dashboard/statics/assets/apple-touch-icon-167x167.png +0 -0
  759. dstack/dashboard/statics/assets/apple-touch-icon-180x180.png +0 -0
  760. dstack/dashboard/statics/assets/apple-touch-icon-57x57.png +0 -0
  761. dstack/dashboard/statics/assets/apple-touch-icon-60x60.png +0 -0
  762. dstack/dashboard/statics/assets/apple-touch-icon-72x72.png +0 -0
  763. dstack/dashboard/statics/assets/apple-touch-icon-76x76.png +0 -0
  764. dstack/dashboard/statics/assets/apple-touch-icon-precomposed.png +0 -0
  765. dstack/dashboard/statics/assets/apple-touch-icon.png +0 -0
  766. dstack/dashboard/statics/assets/apple-touch-startup-image-1125x2436.png +0 -0
  767. dstack/dashboard/statics/assets/apple-touch-startup-image-1136x640.png +0 -0
  768. dstack/dashboard/statics/assets/apple-touch-startup-image-1242x2208.png +0 -0
  769. dstack/dashboard/statics/assets/apple-touch-startup-image-1242x2688.png +0 -0
  770. dstack/dashboard/statics/assets/apple-touch-startup-image-1334x750.png +0 -0
  771. dstack/dashboard/statics/assets/apple-touch-startup-image-1536x2048.png +0 -0
  772. dstack/dashboard/statics/assets/apple-touch-startup-image-1620x2160.png +0 -0
  773. dstack/dashboard/statics/assets/apple-touch-startup-image-1668x2224.png +0 -0
  774. dstack/dashboard/statics/assets/apple-touch-startup-image-1668x2388.png +0 -0
  775. dstack/dashboard/statics/assets/apple-touch-startup-image-1792x828.png +0 -0
  776. dstack/dashboard/statics/assets/apple-touch-startup-image-2048x1536.png +0 -0
  777. dstack/dashboard/statics/assets/apple-touch-startup-image-2048x2732.png +0 -0
  778. dstack/dashboard/statics/assets/apple-touch-startup-image-2160x1620.png +0 -0
  779. dstack/dashboard/statics/assets/apple-touch-startup-image-2208x1242.png +0 -0
  780. dstack/dashboard/statics/assets/apple-touch-startup-image-2224x1668.png +0 -0
  781. dstack/dashboard/statics/assets/apple-touch-startup-image-2388x1668.png +0 -0
  782. dstack/dashboard/statics/assets/apple-touch-startup-image-2436x1125.png +0 -0
  783. dstack/dashboard/statics/assets/apple-touch-startup-image-2688x1242.png +0 -0
  784. dstack/dashboard/statics/assets/apple-touch-startup-image-2732x2048.png +0 -0
  785. dstack/dashboard/statics/assets/apple-touch-startup-image-640x1136.png +0 -0
  786. dstack/dashboard/statics/assets/apple-touch-startup-image-750x1334.png +0 -0
  787. dstack/dashboard/statics/assets/apple-touch-startup-image-828x1792.png +0 -0
  788. dstack/dashboard/statics/assets/browserconfig.xml +0 -15
  789. dstack/dashboard/statics/assets/coast-228x228.png +0 -0
  790. dstack/dashboard/statics/assets/favicon-16x16.png +0 -0
  791. dstack/dashboard/statics/assets/favicon-32x32.png +0 -0
  792. dstack/dashboard/statics/assets/favicon-48x48.png +0 -0
  793. dstack/dashboard/statics/assets/favicon.ico +0 -0
  794. dstack/dashboard/statics/assets/firefox_app_128x128.png +0 -0
  795. dstack/dashboard/statics/assets/firefox_app_512x512.png +0 -0
  796. dstack/dashboard/statics/assets/firefox_app_60x60.png +0 -0
  797. dstack/dashboard/statics/assets/manifest.webapp +0 -14
  798. dstack/dashboard/statics/assets/mstile-144x144.png +0 -0
  799. dstack/dashboard/statics/assets/mstile-150x150.png +0 -0
  800. dstack/dashboard/statics/assets/mstile-310x150.png +0 -0
  801. dstack/dashboard/statics/assets/mstile-310x310.png +0 -0
  802. dstack/dashboard/statics/assets/mstile-70x70.png +0 -0
  803. dstack/dashboard/statics/assets/yandex-browser-50x50.png +0 -0
  804. dstack/dashboard/statics/d0f71e48806e25d72553.png +0 -0
  805. dstack/dashboard/statics/index.html +0 -7
  806. dstack/dashboard/statics/main-1d87e34eb0454da8ebb4.js +0 -3
  807. dstack/dashboard/statics/main-1d87e34eb0454da8ebb4.js.LICENSE.txt +0 -102
  808. dstack/dashboard/statics/main-1d87e34eb0454da8ebb4.js.map +0 -1
  809. dstack/dashboard/statics/main.css +0 -5058
  810. dstack/dashboard/statics/splash_thumbnail.png +0 -0
  811. dstack/dashboard/statics/static/media/check.3f68ffc787a15c0476793a6d18ecb71a.svg +0 -3
  812. dstack/dashboard/statics/static/media/chevron-down.bfd8f22c4a5db4d443e76bca3b02f334.svg +0 -3
  813. dstack/dashboard/statics/static/media/chevron-up.bade0c5d82d741cead615813264140c9.svg +0 -3
  814. dstack/dashboard/statics/static/media/clock.583b744f29b9d143718a55e7c35fe38e.svg +0 -3
  815. dstack/dashboard/statics/static/media/close.a8bb9e47361b03a3b5084dad676ba1da.svg +0 -3
  816. dstack/dashboard/statics/static/media/content-copy.73f5f2a175094757758e315243a4111e.svg +0 -3
  817. dstack/dashboard/statics/static/media/delete-outline.6a8abf4e4f9cb777781967efd56efe9b.svg +0 -3
  818. dstack/dashboard/statics/static/media/dots-vertical.82fc618192e0c7dc4d615ff93269246a.svg +0 -3
  819. dstack/dashboard/statics/static/media/earth.1ad57c7f59f4be5c8bb2fa00439c3149.svg +0 -3
  820. dstack/dashboard/statics/static/media/email.320bc3af24a5f1bb41ebd85f66a5dd70.svg +0 -3
  821. dstack/dashboard/statics/static/media/external-link.99b88e699c15afb820a1779d9a2261ed.svg +0 -3
  822. dstack/dashboard/statics/static/media/eye-off-outline.5b4afb7ad624a44dd307518ff93d1faa.svg +0 -3
  823. dstack/dashboard/statics/static/media/eye-outline.ca41708feaaed1edb15c5fff021fbafe.svg +0 -3
  824. dstack/dashboard/statics/static/media/file-download-outline.3634b41923ba79b297ff294ef898661c.svg +0 -3
  825. dstack/dashboard/statics/static/media/folder-outline.33378387af61821dd1207e4b2d061a07.svg +0 -3
  826. dstack/dashboard/statics/static/media/github-circle.1bb85d171c31a3c2eebad07319377171.svg +0 -3
  827. dstack/dashboard/statics/static/media/infinity.915f92939afc0a37f94adba211ceb172.svg +0 -3
  828. dstack/dashboard/statics/static/media/layers.b4b02cea267a617d7aa44c2719250c89.svg +0 -3
  829. dstack/dashboard/statics/static/media/linkedin.1c52fae553eee54397f0e63a79455a5e.svg +0 -3
  830. dstack/dashboard/statics/static/media/loading.e466be7b2c1f0ac9e7e51ca929d0e37d.svg +0 -3
  831. dstack/dashboard/statics/static/media/lock.4a4c7768d0fa60c716609ddc483470ef.svg +0 -3
  832. dstack/dashboard/statics/static/media/magnify.0c803314d039d21f3cb1504ccd1437a4.svg +0 -3
  833. dstack/dashboard/statics/static/media/mark.3f68ffc787a15c0476793a6d18ecb71a.svg +0 -3
  834. dstack/dashboard/statics/static/media/menu-close.3ee84714181017c6ff837830297c8437.svg +0 -3
  835. dstack/dashboard/statics/static/media/menu.922f81e0972fbcbb5adcd8def20c86a3.svg +0 -3
  836. dstack/dashboard/statics/static/media/pencil.f706a3b9dcbff4959a91bf72e1e6324f.svg +0 -3
  837. dstack/dashboard/statics/static/media/refresh.a80edb948e98b322cd73b67814a57a48.svg +0 -3
  838. dstack/dashboard/statics/static/media/shape-plus.63b093c7f4b44c3def774f30fcfbceca.svg +0 -3
  839. dstack/dashboard/statics/static/media/slack.ec2fca99c6b944950ac65404ddd26880.svg +0 -4
  840. dstack/dashboard/statics/static/media/small-logo.b9cc8d09f646a553e65fa336dafd8b10.svg +0 -116
  841. dstack/dashboard/statics/static/media/source-branch.b8d22cfc42a7bed81f0fc08130818e85.svg +0 -3
  842. dstack/dashboard/statics/static/media/source-commit.be2bb53c081b9b6836adffccc0b8d3e6.svg +0 -3
  843. dstack/dashboard/statics/static/media/stop.11488ff1437ad929476be8924a3b7075.svg +0 -3
  844. dstack/dashboard/statics/static/media/tag-minus.15680a815b0b8d027e973c84832c05e6.svg +0 -3
  845. dstack/dashboard/statics/static/media/tag-outline.19b0bf86a8afd7d6d9c716e9a91d94ca.svg +0 -3
  846. dstack/dashboard/statics/static/media/twitter.4af18861c84a2f3044c7546b55d5739c.svg +0 -3
  847. dstack/dashboard/tags.py +0 -119
  848. dstack/jobs.py +0 -255
  849. dstack/providers/__init__.py +0 -316
  850. dstack/providers/_python/main.py +0 -88
  851. dstack/providers/_tensorboard/main.py +0 -93
  852. dstack/providers/_torchrun/main.py +0 -121
  853. dstack/providers/bash/main.py +0 -90
  854. dstack/providers/code/main.py +0 -95
  855. dstack/providers/docker/main.py +0 -79
  856. dstack/providers/lab/main.py +0 -95
  857. dstack/providers/notebook/main.py +0 -90
  858. dstack/random_name.py +0 -29
  859. dstack/repo.py +0 -135
  860. dstack/runners.py +0 -35
  861. dstack/util.py +0 -15
  862. dstack-0.0.9.dist-info/METADATA +0 -176
  863. dstack-0.0.9.dist-info/RECORD +0 -179
  864. dstack-0.0.9.dist-info/entry_points.txt +0 -3
  865. dstack-0.0.9.dist-info/top_level.txt +0 -2
  866. tests/test_config.py +0 -70
  867. /dstack/{cli → _internal}/__init__.py +0 -0
  868. /dstack/{dashboard → _internal/cli}/__init__.py +0 -0
  869. /dstack/{providers/_python → _internal/cli/models}/__init__.py +0 -0
  870. /dstack/{providers/_tensorboard → _internal/cli/services}/__init__.py +0 -0
  871. /dstack/{providers/_torchrun → _internal/cli/utils}/__init__.py +0 -0
  872. /dstack/{providers/bash → _internal/core}/__init__.py +0 -0
  873. /dstack/{providers/code → _internal/core/backends}/__init__.py +0 -0
  874. /dstack/{providers/docker → _internal/core/backends/aws}/__init__.py +0 -0
  875. /dstack/{providers/lab → _internal/core/backends/azure}/__init__.py +0 -0
  876. /dstack/{providers/notebook → _internal/core/backends/base}/__init__.py +0 -0
  877. {tests → dstack/_internal/core/backends/cloudrift}/__init__.py +0 -0
  878. /dstack/{dashboard → _internal/server}/statics/assets/yandex-browser-manifest.json +0 -0
  879. /dstack/{dashboard → _internal/server}/statics/robots.txt +0 -0
@@ -0,0 +1,842 @@
1
+ import asyncio
2
+ import datetime
3
+ import json
4
+ from typing import List, Optional, Set, Tuple
5
+
6
+ from sqlalchemy import and_, func, or_, select
7
+ from sqlalchemy.ext.asyncio import AsyncSession
8
+ from sqlalchemy.orm import aliased, contains_eager, joinedload, load_only, with_loader_criteria
9
+
10
+ import dstack._internal.server.services.services.autoscalers as autoscalers
11
+ from dstack._internal.core.errors import ServerError
12
+ from dstack._internal.core.models.configurations import ReplicaGroup
13
+ from dstack._internal.core.models.profiles import RetryEvent, StopCriteria
14
+ from dstack._internal.core.models.runs import (
15
+ Job,
16
+ JobSpec,
17
+ JobStatus,
18
+ JobTerminationReason,
19
+ Run,
20
+ RunSpec,
21
+ RunStatus,
22
+ RunTerminationReason,
23
+ )
24
+ from dstack._internal.server.db import get_db, get_session_ctx
25
+ from dstack._internal.server.models import (
26
+ FleetModel,
27
+ InstanceModel,
28
+ JobModel,
29
+ ProjectModel,
30
+ RunModel,
31
+ UserModel,
32
+ )
33
+ from dstack._internal.server.services.jobs import (
34
+ find_job,
35
+ get_job_specs_from_run_spec,
36
+ group_jobs_by_replica_latest,
37
+ is_master_job,
38
+ job_model_to_job_submission,
39
+ switch_job_status,
40
+ )
41
+ from dstack._internal.server.services.locking import get_locker
42
+ from dstack._internal.server.services.prometheus.client_metrics import run_metrics
43
+ from dstack._internal.server.services.runs import (
44
+ fmt,
45
+ process_terminating_run,
46
+ run_model_to_run,
47
+ switch_run_status,
48
+ )
49
+ from dstack._internal.server.services.runs.replicas import (
50
+ build_replica_lists,
51
+ has_out_of_date_replicas,
52
+ is_replica_registered,
53
+ job_belongs_to_group,
54
+ retry_run_replica_jobs,
55
+ scale_down_replicas,
56
+ scale_run_replicas,
57
+ scale_run_replicas_per_group,
58
+ )
59
+ from dstack._internal.server.services.secrets import get_project_secrets_mapping
60
+ from dstack._internal.server.services.services import update_service_desired_replica_count
61
+ from dstack._internal.server.utils import sentry_utils
62
+ from dstack._internal.utils import common
63
+ from dstack._internal.utils.logging import get_logger
64
+
65
+ logger = get_logger(__name__)
66
+
67
+ MIN_PROCESSING_INTERVAL = datetime.timedelta(seconds=5) # _process_next_run only selects runs whose last_processed_at is older than now minus this interval
68
+ ROLLING_DEPLOYMENT_MAX_SURGE = 1 # at most one extra replica during rolling deployment
69
+
70
+
71
async def process_runs(batch_size: int = 1):
    """Start ``batch_size`` run-processing coroutines and wait for all of them.

    Each coroutine is one call to ``_process_next_run``; they are awaited
    together through ``asyncio.gather``.
    """
    pending = [_process_next_run() for _ in range(batch_size)]
    await asyncio.gather(*pending)
76
+
77
+
78
@sentry_utils.instrument_background_task
async def _process_next_run():
    """Select one run that is due for processing, lock it with its jobs, and process it.

    The run row and the rows of its unfinished jobs are selected with
    ``with_for_update(skip_locked=True, ...)``, and their ids are added to the
    in-memory locksets until ``_process_run`` has finished. The function
    returns without processing anything when no run matches the filter or when
    the number of job rows that could be locked differs from the number of
    unfinished jobs loaded with the run.
    """
    runs_locker = get_locker(get_db().dialect_name)
    run_lock, run_lockset = runs_locker.get_lockset(RunModel.__tablename__)
    jobs_locker = get_locker(get_db().dialect_name)
    job_lock, job_lockset = jobs_locker.get_lockset(JobModel.__tablename__)
    now = common.get_current_datetime()
    async with get_session_ctx() as session:
        async with run_lock, job_lock:
            processed_before = now - MIN_PROCESSING_INTERVAL

            # The status filter below only exists to reduce unnecessary commits.
            # Alternatively, all active runs could be fetched and filtered
            # while processing.

            # Active non-pending runs.
            active_non_pending = RunModel.status.not_in(
                RunStatus.finished_statuses() + [RunStatus.PENDING]
            )
            # Retrying runs.
            retrying = and_(
                RunModel.status == RunStatus.PENDING,
                RunModel.resubmission_attempt > 0,
            )
            # Scheduled runs whose trigger time has passed.
            scheduled_ready = and_(
                RunModel.status == RunStatus.PENDING,
                RunModel.resubmission_attempt == 0,
                RunModel.next_triggered_at.is_not(None),
                RunModel.next_triggered_at < now,
            )
            # Scaled-to-zero runs. Such runs cannot be scheduled, hence the
            # next_triggered_at check. If scheduled services with downscaling
            # to zero are ever allowed, this check won't pass.
            scaled_to_zero = and_(
                RunModel.status == RunStatus.PENDING,
                RunModel.resubmission_attempt == 0,
                RunModel.next_triggered_at.is_(None),
            )

            run_query = (
                select(RunModel)
                .where(
                    RunModel.id.not_in(run_lockset),
                    RunModel.last_processed_at < processed_before,
                    or_(active_non_pending, retrying, scheduled_ready, scaled_to_zero),
                )
                .options(
                    joinedload(RunModel.jobs).load_only(JobModel.id),
                    # No need to lock finished jobs
                    with_loader_criteria(
                        JobModel,
                        JobModel.status.not_in(JobStatus.finished_statuses()),
                        include_aliases=True,
                    ),
                )
                .options(load_only(RunModel.id))
                .order_by(RunModel.last_processed_at.asc())
                .limit(1)
                .with_for_update(skip_locked=True, key_share=True, of=RunModel)
            )
            run_result = await session.execute(run_query)
            run_model = run_result.scalar()
            if run_model is None:
                return

            job_query = (
                select(JobModel)
                .where(
                    JobModel.run_id == run_model.id,
                    JobModel.id.not_in(job_lockset),
                )
                .options(
                    load_only(JobModel.id),
                    with_loader_criteria(
                        JobModel,
                        JobModel.status.not_in(JobStatus.finished_statuses()),
                        include_aliases=True,
                    ),
                )
                .order_by(JobModel.id)  # take locks in order
                .with_for_update(skip_locked=True, key_share=True)
            )
            job_result = await session.execute(job_query)
            job_models = job_result.scalars().all()
            if len(run_model.jobs) != len(job_models):
                # Some jobs are locked or there was a non-repeatable read
                return

            job_ids = [job.id for job in run_model.jobs]
            run_lockset.add(run_model.id)
            job_lockset.update(job_ids)

        # The id is captured up front so the cleanup below does not read run_model.
        run_model_id = run_model.id
        try:
            await _process_run(session=session, run_model=run_model)
        finally:
            run_lockset.difference_update([run_model_id])
            job_lockset.difference_update(job_ids)
168
+
169
+
170
async def _process_run(session: AsyncSession, run_model: RunModel):
    """
    Refetch the run and dispatch it to the handler matching its status.

    A run with an unexpected status, or one whose processing raises `ServerError`,
    is switched to TERMINATING with the SERVER_ERROR termination reason.
    `last_processed_at` is updated and the session is committed at the end.
    """
    run_model = await _refetch_run_model(session, run_model)
    logger.debug("%s: processing run", fmt(run_model))
    active_statuses = {RunStatus.SUBMITTED, RunStatus.PROVISIONING, RunStatus.RUNNING}
    try:
        current_status = run_model.status
        if current_status == RunStatus.PENDING:
            await _process_pending_run(session, run_model)
        elif current_status in active_statuses:
            await _process_active_run(session, run_model)
        elif current_status == RunStatus.TERMINATING:
            await process_terminating_run(session, run_model)
        else:
            logger.error("%s: unexpected status %s", fmt(run_model), current_status.name)
            run_model.termination_reason = RunTerminationReason.SERVER_ERROR
            switch_run_status(session, run_model, RunStatus.TERMINATING)
    except ServerError as e:
        logger.error("%s: run processing error: %s", fmt(run_model), e)
        run_model.termination_reason = RunTerminationReason.SERVER_ERROR
        switch_run_status(session, run_model, RunStatus.TERMINATING)

    run_model.last_processed_at = common.get_current_datetime()
    await session.commit()
191
+
192
+
193
async def _refetch_run_model(session: AsyncSession, run_model: RunModel) -> RunModel:
    """
    Reload the run with its project, user, and fleet, keeping in `RunModel.jobs`
    only the latest submission of every (replica_num, job_num) pair.

    Already loaded attributes are overwritten (`populate_existing=True`).
    """
    # Highest submission_num for every job of this run.
    newest_sq = (
        select(
            JobModel.run_id.label("run_id"),
            JobModel.replica_num.label("replica_num"),
            JobModel.job_num.label("job_num"),
            func.max(JobModel.submission_num).label("max_submission_num"),
        )
        .where(JobModel.run_id == run_model.id)
        .group_by(JobModel.run_id, JobModel.replica_num, JobModel.job_num)
        .subquery()
    )
    newest_job = aliased(JobModel)
    is_newest_submission = and_(
        newest_job.run_id == newest_sq.c.run_id,
        newest_job.replica_num == newest_sq.c.replica_num,
        newest_job.job_num == newest_sq.c.job_num,
        newest_job.submission_num == newest_sq.c.max_submission_num,
    )
    stmt = (
        select(RunModel)
        .where(RunModel.id == run_model.id)
        .outerjoin(newest_sq, newest_sq.c.run_id == RunModel.id)
        .outerjoin(newest_job, onclause=is_newest_submission)
        .options(
            joinedload(RunModel.project).load_only(ProjectModel.id, ProjectModel.name),
            joinedload(RunModel.user).load_only(UserModel.name),
            joinedload(RunModel.fleet).load_only(FleetModel.id, FleetModel.name),
            contains_eager(RunModel.jobs, alias=newest_job)
            .joinedload(JobModel.instance)
            .load_only(InstanceModel.fleet_id),
        )
        .execution_options(populate_existing=True)
    )
    result = await session.execute(stmt)
    return result.unique().scalar_one()
231
+
232
+
233
async def _process_pending_run(session: AsyncSession, run_model: RunModel):
    """
    Process a pending run, i.e. a run whose jobs are not created yet.

    Creates the replicas and switches the run to SUBMITTED, unless the run is a retrying
    run that is not ready for resubmission yet or a service that stays scaled to zero.
    """
    run = run_model_to_run(run_model)

    # TODO: Do not select such runs in the first place to avoid redundant processing
    is_retrying = run_model.resubmission_attempt > 0
    if is_retrying and not _retrying_run_ready_for_resubmission(run_model, run):
        logger.debug("%s: retrying run is not yet ready for resubmission", fmt(run_model))
        return

    configuration = run.run_spec.configuration
    if configuration.type != "service":
        run_model.desired_replica_count = 1
        await scale_run_replicas(
            session, run_model, replicas_diff=run_model.desired_replica_count
        )
    else:
        run_model.desired_replica_count = sum(
            group.count.min or 0 for group in configuration.replica_groups
        )
        await update_service_desired_replica_count(
            session,
            run_model,
            configuration,
            # does not matter for pending services, since 0->n scaling should happen without delay
            last_scaled_at=None,
        )
        if run_model.desired_replica_count == 0:
            # stay zero scaled
            return

        replicas: List[ReplicaGroup] = configuration.replica_groups
        await scale_run_replicas_per_group(session, run_model, replicas)

    switch_run_status(session=session, run_model=run_model, new_status=RunStatus.SUBMITTED)
268
+
269
+
270
def _retrying_run_ready_for_resubmission(run_model: RunModel, run: Run) -> bool:
    """
    Tell whether enough time has passed since the latest job submission was last
    processed for the retrying run to be resubmitted.

    The required delay depends on `run_model.resubmission_attempt`.
    """
    latest_submission = run.latest_job_submission
    if latest_submission is None:
        # Should not be possible
        return True
    elapsed = common.get_current_datetime() - latest_submission.last_processed_at
    return elapsed >= _get_retry_delay(run_model.resubmission_attempt)
280
+
281
+
282
+ # We use exponentially increasing retry delays for pending runs.
283
+ # This prevents creation of too many job submissions for runs stuck in pending,
284
+ # e.g. when users set retry for a long period without capacity.
285
+ _PENDING_RETRY_DELAYS = [
286
+ datetime.timedelta(seconds=15),
287
+ datetime.timedelta(seconds=30),
288
+ datetime.timedelta(minutes=1),
289
+ datetime.timedelta(minutes=2),
290
+ datetime.timedelta(minutes=5),
291
+ datetime.timedelta(minutes=10),
292
+ ]
293
+
294
+
295
+ def _get_retry_delay(resubmission_attempt: int) -> datetime.timedelta:
296
+ if resubmission_attempt - 1 < len(_PENDING_RETRY_DELAYS):
297
+ return _PENDING_RETRY_DELAYS[resubmission_attempt - 1]
298
+ return _PENDING_RETRY_DELAYS[-1]
299
+
300
+
301
async def _process_active_run(session: AsyncSession, run_model: RunModel):
    """
    Process a run that is submitted, provisioning, or running.

    Derives the new run status from the latest job submissions of every replica,
    collects replicas that have to be retried, and, while the run stays active,
    delegates retry/scaling/deployment to `_handle_run_replicas`.
    """
    run = run_model_to_run(run_model)
    run_spec = run.run_spec
    retry_single_job = _can_retry_single_job(run_spec)

    # Job statuses that map directly to a replica status.
    in_progress_statuses = {
        JobStatus.RUNNING: RunStatus.RUNNING,
        JobStatus.PROVISIONING: RunStatus.PROVISIONING,
        JobStatus.PULLING: RunStatus.PROVISIONING,
        JobStatus.SUBMITTED: RunStatus.SUBMITTED,
    }
    stopping_statuses = [JobStatus.TERMINATING, JobStatus.TERMINATED, JobStatus.ABORTED]
    non_failure_reasons = {
        JobTerminationReason.DONE_BY_RUNNER,
        JobTerminationReason.SCALED_DOWN,
    }

    run_statuses: Set[RunStatus] = set()
    run_termination_reasons: Set[RunTerminationReason] = set()
    replicas_to_retry: List[Tuple[int, List[JobModel]]] = []
    replicas_info: List[autoscalers.ReplicaInfo] = []

    for replica_num, replica_jobs in group_jobs_by_replica_latest(run_model.jobs):
        replica_statuses: Set[RunStatus] = set()
        needs_retry = False
        is_active = True
        done_count = 0
        for job_model in replica_jobs:
            job = find_job(run.jobs, job_model.replica_num, job_model.job_num)
            if (
                run_model.fleet_id is None
                and job_model.instance is not None
                and job_model.instance.fleet_id is not None
            ):
                # Remember the fleet of the first instance that has one.
                run_model.fleet_id = job_model.instance.fleet_id

            status = job_model.status
            if status == JobStatus.DONE or (
                status == JobStatus.TERMINATING
                and job_model.termination_reason == JobTerminationReason.DONE_BY_RUNNER
            ):
                # the job is done or going to be done
                replica_statuses.add(RunStatus.DONE)
                done_count += 1
            elif job_model.termination_reason == JobTerminationReason.SCALED_DOWN:
                # the job was scaled down
                is_active = False
            elif status in in_progress_statuses:
                # the job is running, provisioning, or submitted
                replica_statuses.add(in_progress_statuses[status])
            elif status == JobStatus.FAILED or (
                status in stopping_statuses
                and job_model.termination_reason not in non_failure_reasons
            ):
                retrying_for = await _should_retry_job(session, run, job, job_model)
                if retrying_for is None:
                    replica_statuses.add(RunStatus.FAILED)
                    run_termination_reasons.add(RunTerminationReason.JOB_FAILED)
                elif _is_retry_duration_exceeded(job, retrying_for):
                    replica_statuses.add(RunStatus.FAILED)
                    run_termination_reasons.add(RunTerminationReason.RETRY_LIMIT_EXCEEDED)
                else:
                    needs_retry = True
            else:
                raise ValueError(f"Unexpected job status {job_model.status}")

        if RunStatus.FAILED in replica_statuses:
            run_statuses.add(RunStatus.FAILED)
        else:
            if needs_retry:
                replicas_to_retry.append((replica_num, replica_jobs))
            if retry_single_job or not needs_retry:
                run_statuses.update(replica_statuses)

        if done_count == len(replica_jobs):
            # Consider replica inactive if all its jobs are done for some reason.
            # If only some jobs are done, replica is considered active to avoid
            # provisioning new replicas for partially done multi-node tasks.
            is_active = False

        replicas_info.append(_get_replica_info(replica_jobs, is_active))

    termination_reason: Optional[RunTerminationReason] = None
    if RunStatus.FAILED in run_statuses:
        new_status = RunStatus.TERMINATING
        if RunTerminationReason.JOB_FAILED in run_termination_reasons:
            termination_reason = RunTerminationReason.JOB_FAILED
        elif RunTerminationReason.RETRY_LIMIT_EXCEEDED in run_termination_reasons:
            termination_reason = RunTerminationReason.RETRY_LIMIT_EXCEEDED
        else:
            raise ValueError(f"Unexpected termination reason {run_termination_reasons}")
    elif _should_stop_on_master_done(run):
        new_status = RunStatus.TERMINATING
        # ALL_JOBS_DONE is used for all DONE reasons including master-done
        termination_reason = RunTerminationReason.ALL_JOBS_DONE
    else:
        # The most advanced in-progress status wins.
        in_progress_status = next(
            (
                candidate
                for candidate in (
                    RunStatus.RUNNING,
                    RunStatus.PROVISIONING,
                    RunStatus.SUBMITTED,
                )
                if candidate in run_statuses
            ),
            None,
        )
        if in_progress_status is not None:
            new_status = in_progress_status
        elif RunStatus.DONE in run_statuses and not replicas_to_retry:
            new_status = RunStatus.TERMINATING
            termination_reason = RunTerminationReason.ALL_JOBS_DONE
        else:
            new_status = RunStatus.PENDING

    # Terminate active jobs if the run is to be resubmitted
    if new_status == RunStatus.PENDING and not retry_single_job:
        for _, jobs_to_terminate in replicas_to_retry:
            for job_model in jobs_to_terminate:
                if job_model.status.is_finished() or job_model.status == JobStatus.TERMINATING:
                    continue
                job_model.termination_reason = JobTerminationReason.TERMINATED_BY_SERVER
                job_model.termination_reason_message = "Run is to be resubmitted"
                switch_job_status(session, job_model, JobStatus.TERMINATING)

    if new_status not in {RunStatus.TERMINATING, RunStatus.PENDING}:
        # No need to retry, scale, or redeploy replicas if the run is terminating,
        # pending run will retry replicas in `process_pending_run`
        await _handle_run_replicas(
            session, run_model, run_spec, replicas_to_retry, retry_single_job, replicas_info
        )

    if run_model.status == new_status:
        return

    if run_model.status == RunStatus.SUBMITTED and new_status == RunStatus.PROVISIONING:
        current_time = common.get_current_datetime()
        submit_to_provision_duration = (current_time - run_model.submitted_at).total_seconds()
        logger.info(
            "%s: run took %.2f seconds from submission to provisioning.",
            fmt(run_model),
            submit_to_provision_duration,
        )
        project_name = run_model.project.name
        run_metrics.log_submit_to_provision_duration(
            submit_to_provision_duration, project_name, run_spec.configuration.type
        )

    if new_status == RunStatus.PENDING:
        run_metrics.increment_pending_runs(run_model.project.name, run_spec.configuration.type)
        # Unassign run from fleet so that the new fleet can be chosen when retrying
        run_model.fleet = None

    run_model.termination_reason = termination_reason
    switch_run_status(session, run_model, new_status)
    # While a run goes to pending without provisioning, resubmission_attempt increases.
    if new_status == RunStatus.PROVISIONING:
        run_model.resubmission_attempt = 0
    elif new_status == RunStatus.PENDING:
        run_model.resubmission_attempt += 1
452
+
453
+
454
def _get_replica_info(
    replica_job_models: list[JobModel],
    replica_active: bool,
) -> autoscalers.ReplicaInfo:
    """
    Build the autoscaler's view of a replica from its job models.

    For an active replica the timestamp is the earliest `submitted_at` of its jobs;
    for an inactive one it is the latest `last_processed_at`.
    """
    if not replica_active:
        # last_processed_at = replica scaled down
        scaled_down_at = max(job.last_processed_at for job in replica_job_models)
        return autoscalers.ReplicaInfo(active=False, timestamp=scaled_down_at)
    # submitted_at = replica created
    created_at = min(job.submitted_at for job in replica_job_models)
    return autoscalers.ReplicaInfo(active=True, timestamp=created_at)
469
+
470
+
471
async def _handle_run_replicas(
    session: AsyncSession,
    run_model: RunModel,
    run_spec: RunSpec,
    replicas_to_retry: list[tuple[int, list[JobModel]]],
    retry_single_job: bool,
    replicas_info: list[autoscalers.ReplicaInfo],
) -> None:
    """
    Does ONE of:
    - replica retry
    - replica scaling
    - replica rolling deployment

    Does not do everything at once to avoid conflicts between the stages and long DB transactions.

    For services, scaling, in-place deployment updates, per-group rolling deployment,
    and termination of replicas from removed groups are all handled per replica group.

    Args:
        session: DB session used for all changes.
        run_model: the run being processed.
        run_spec: spec of the run.
        replicas_to_retry: `(replica_num, job_models)` pairs of replicas that must be retried.
        retry_single_job: passed to `retry_run_replica_jobs` as `only_failed`.
        replicas_info: autoscaler info of every replica of the run.
    """

    if replicas_to_retry:
        for _, replica_jobs in replicas_to_retry:
            await retry_run_replica_jobs(
                session, run_model, replica_jobs, only_failed=retry_single_job
            )
        return

    if run_spec.configuration.type == "service":
        await update_service_desired_replica_count(
            session,
            run_model,
            run_spec.configuration,
            # FIXME: should only include scaling events, not retries and deployments
            last_scaled_at=max((r.timestamp for r in replicas_info), default=None),
        )
        replicas: List[ReplicaGroup] = run_spec.configuration.replica_groups
        assert replicas, "replica groups should always return at least one group"

        await scale_run_replicas_per_group(session, run_model, replicas)

        # Handle per-group rolling deployment
        await _update_jobs_to_new_deployment_in_place(
            session=session,
            run_model=run_model,
            run_spec=run_spec,
            replicas=replicas,
        )
        # Process per-group rolling deployment
        for group in replicas:
            await _handle_rolling_deployment_for_group(
                session=session, run_model=run_model, group=group, run_spec=run_spec
            )
        # Terminate replicas from groups that were removed from the configuration
        existing_group_names = set()
        for job in run_model.jobs:
            if job.status.is_finished():
                continue
            try:
                job_spec = JobSpec.__response__.parse_raw(job.job_spec_data)
                existing_group_names.add(job_spec.replica_group)
            except Exception as e:
                # Best effort: a job whose spec cannot be read is skipped,
                # but the failure is logged instead of being silently dropped.
                logger.warning(
                    "%s: failed to read replica group of job %s, skipping it: %r",
                    fmt(run_model),
                    job.id,
                    e,
                )
                continue
        new_group_names = {group.name for group in replicas}
        # NOTE(review): if a job spec has no replica group (None), None ends up here and is
        # passed as `group_filter` below - confirm `build_replica_lists` handles that as intended.
        removed_group_names = existing_group_names - new_group_names
        for removed_group_name in removed_group_names:
            # Build replica lists for this removed group
            active_replicas, inactive_replicas = build_replica_lists(
                run_model=run_model,
                group_filter=removed_group_name,
            )

            total_replicas = len(active_replicas) + len(inactive_replicas)
            if total_replicas > 0:
                logger.info(
                    "%s: terminating %d replica(s) from removed group '%s'",
                    fmt(run_model),
                    total_replicas,
                    removed_group_name,
                )
                # Terminate all active replicas in the removed group
                if active_replicas:
                    scale_down_replicas(session, active_replicas, len(active_replicas))
                # Terminate all inactive replicas in the removed group
                if inactive_replicas:
                    scale_down_replicas(session, inactive_replicas, len(inactive_replicas))
        return

    max_replica_count = run_model.desired_replica_count
    if has_out_of_date_replicas(run_model):
        # allow extra replicas when deployment is in progress
        max_replica_count += ROLLING_DEPLOYMENT_MAX_SURGE

    active_replica_count = sum(1 for r in replicas_info if r.active)
    if active_replica_count not in range(run_model.desired_replica_count, max_replica_count + 1):
        await scale_run_replicas(
            session,
            run_model,
            replicas_diff=run_model.desired_replica_count - active_replica_count,
        )
        return

    await _update_jobs_to_new_deployment_in_place(
        session=session,
        run_model=run_model,
        run_spec=run_spec,
    )
    if has_out_of_date_replicas(run_model):
        assert run_spec.configuration.type == "service", (
            "Rolling deployment is only supported for services"
        )
        non_terminated_replica_count = len(
            {j.replica_num for j in run_model.jobs if not j.status.is_finished()}
        )
        # Avoid using too much hardware during a deployment - never have
        # more than max_replica_count non-terminated replicas.
        if non_terminated_replica_count < max_replica_count:
            # Start more up-to-date replicas that will eventually replace out-of-date replicas.
            await scale_run_replicas(
                session,
                run_model,
                replicas_diff=max_replica_count - non_terminated_replica_count,
            )

        replicas_to_stop_count = 0
        # stop any out-of-date replicas that are not registered
        replicas_to_stop_count += sum(
            any(j.deployment_num < run_model.deployment_num for j in jobs)
            and any(
                j.status not in [JobStatus.TERMINATING] + JobStatus.finished_statuses()
                for j in jobs
            )
            and not is_replica_registered(jobs)
            for _, jobs in group_jobs_by_replica_latest(run_model.jobs)
        )
        # stop excessive registered out-of-date replicas, except those that are already `terminating`
        non_terminating_registered_replicas_count = sum(
            is_replica_registered(jobs) and all(j.status != JobStatus.TERMINATING for j in jobs)
            for _, jobs in group_jobs_by_replica_latest(run_model.jobs)
        )
        replicas_to_stop_count += max(
            0, non_terminating_registered_replicas_count - run_model.desired_replica_count
        )
        if replicas_to_stop_count:
            await scale_run_replicas(
                session,
                run_model,
                replicas_diff=-replicas_to_stop_count,
            )
616
+
617
+
618
async def _update_jobs_to_new_deployment_in_place(
    session: AsyncSession,
    run_model: RunModel,
    run_spec: RunSpec,
    replicas: Optional[List] = None,
) -> None:
    """
    Bump deployment_num for jobs that do not require redeployment.

    A replica is updated in place only if the job specs generated from the current
    run spec are equal to the specs of all its existing jobs. When `replicas` is
    given, the replica group name is taken from the replica's first job spec.
    """
    secrets = await get_project_secrets_mapping(
        session=session,
        project=run_model.project,
    )

    target_deployment_num = run_model.deployment_num
    for replica_num, replica_jobs in group_jobs_by_replica_latest(run_model.jobs):
        if all(j.status.is_finished() for j in replica_jobs):
            continue
        if all(j.deployment_num == run_model.deployment_num for j in replica_jobs):
            continue

        # Determine which group this replica belongs to
        replica_group_name = None
        if replicas:
            first_job_spec = JobSpec.__response__.parse_raw(replica_jobs[0].job_spec_data)
            replica_group_name = first_job_spec.replica_group

        # FIXME: Handle getting image configuration errors or skip it.
        new_job_specs = await get_job_specs_from_run_spec(
            run_spec=run_spec,
            secrets=secrets,
            replica_num=replica_num,
            replica_group_name=replica_group_name,
        )
        assert len(new_job_specs) == len(replica_jobs), (
            "Changing the number of jobs within a replica is not yet supported"
        )
        # Stops at the first job whose spec has changed.
        spec_changed = any(
            new_job_spec != JobSpec.__response__.parse_raw(old_job_model.job_spec_data)
            for old_job_model, new_job_spec in zip(replica_jobs, new_job_specs)
        )
        if spec_changed:
            continue
        for job_model in replica_jobs:
            job_model.deployment_num = target_deployment_num
664
+
665
+
666
async def _should_retry_job(
    session: AsyncSession,
    run: Run,
    job: Job,
    job_model: JobModel,
) -> Optional[datetime.timedelta]:
    """
    Checks if the job should be retried.
    Returns the current duration of retrying if retry is enabled.
    Retrying duration is calculated as the time since `last_processed_at`
    of the latest provisioned submission.

    If the job has never been provisioned and failed with no capacity, the duration
    is counted from `run.submitted_at` instead. Returns `None` if the job should
    not be retried.
    """
    retry = job.job_spec.retry
    if retry is None:
        return None

    last_provisioned_submission = None
    if job.job_submissions:
        latest_submission = job.job_submissions[-1]
        if latest_submission.job_provisioning_data is not None:
            last_provisioned_submission = latest_submission
        else:
            # The caller passes at most one latest submission in job.job_submissions, so check the db.
            stmt = (
                select(JobModel)
                .where(
                    JobModel.run_id == job_model.run_id,
                    JobModel.replica_num == job_model.replica_num,
                    JobModel.job_num == job_model.job_num,
                    JobModel.job_provisioning_data.is_not(None),
                )
                .order_by(JobModel.last_processed_at.desc())
                .limit(1)
            )
            provisioned_model = (await session.execute(stmt)).scalar()
            if provisioned_model is not None:
                last_provisioned_submission = job_model_to_job_submission(provisioned_model)

    if job_model.termination_reason is None:
        return None
    retry_event = job_model.termination_reason.to_retry_event()

    if last_provisioned_submission is None:
        # Never provisioned: only no-capacity failures are retried.
        if retry_event == RetryEvent.NO_CAPACITY and RetryEvent.NO_CAPACITY in retry.on_events:
            return common.get_current_datetime() - run.submitted_at
        return None

    if retry_event in retry.on_events:
        return common.get_current_datetime() - last_provisioned_submission.last_processed_at
    return None
721
+
722
+
723
+ def _is_retry_duration_exceeded(job: Job, current_duration: datetime.timedelta) -> bool:
724
+ if job.job_spec.retry is None:
725
+ return True
726
+ return current_duration > datetime.timedelta(seconds=job.job_spec.retry.duration)
727
+
728
+
729
+ def _can_retry_single_job(run_spec: RunSpec) -> bool:
730
+ # TODO: Currently, we terminate and retry the entire replica if one of the job fails.
731
+ # We could make partial retry in some multi-node cases.
732
+ # E.g. restarting a worker node, independent jobs.
733
+ return False
734
+
735
+
736
def _should_stop_on_master_done(run: Run) -> bool:
    """
    Tell whether the run must be stopped because its master job is done.

    Applies only when the stop criteria of the merged profile is MASTER_DONE;
    the latest submission of a master job must have the DONE status.
    """
    if run.run_spec.merged_profile.stop_criteria != StopCriteria.MASTER_DONE:
        return False
    return any(
        is_master_job(job) and job.job_submissions[-1].status == JobStatus.DONE
        for job in run.jobs
    )
743
+
744
+
745
async def _handle_rolling_deployment_for_group(
    session: AsyncSession, run_model: RunModel, group: ReplicaGroup, run_spec: RunSpec
) -> None:
    """
    Handle rolling deployment for a single replica group.

    Does nothing if the group has no out-of-date replicas. Otherwise starts new
    replicas while the group stays within its surge limit, then stops out-of-date
    replicas that are not registered and registered replicas in excess of the
    desired count.
    """
    from dstack._internal.server.services.runs.replicas import (
        build_replica_lists,
        scale_run_replicas_for_group,
    )

    if run_model.desired_replica_counts:
        desired_replica_counts = json.loads(run_model.desired_replica_counts)
    else:
        desired_replica_counts = {}
    group_desired = desired_replica_counts.get(group.name, group.count.min or 0)

    # Check if group has out-of-date replicas
    if not has_out_of_date_replicas(run_model, group_filter=group.name):
        return  # Group is up-to-date

    # Calculate max replicas (allow surge during deployment)
    group_max_replica_count = group_desired + ROLLING_DEPLOYMENT_MAX_SURGE

    # Count non-terminated replicas for this group only
    non_terminated_replica_nums = {
        j.replica_num
        for j in run_model.jobs
        if not j.status.is_finished()
        and group.name is not None
        and job_belongs_to_group(job=j, group_name=group.name)
    }
    non_terminated_replica_count = len(non_terminated_replica_nums)

    # Start new up-to-date replicas if needed
    if non_terminated_replica_count < group_max_replica_count:
        active_replicas, inactive_replicas = build_replica_lists(
            run_model=run_model,
            group_filter=group.name,
        )
        await scale_run_replicas_for_group(
            session=session,
            run_model=run_model,
            group=group,
            replicas_diff=group_max_replica_count - non_terminated_replica_count,
            run_spec=run_spec,
            active_replicas=active_replicas,
            inactive_replicas=inactive_replicas,
        )

    # Stop out-of-date replicas that are not registered
    stopping_statuses = [JobStatus.TERMINATING] + JobStatus.finished_statuses()
    replicas_to_stop_count = 0
    for _, replica_jobs in group_jobs_by_replica_latest(run_model.jobs):
        assert group.name is not None, "Group name is always set"
        if not job_belongs_to_group(replica_jobs[0], group.name):
            continue
        # Check if replica is out-of-date and not registered
        is_out_of_date = any(j.deployment_num < run_model.deployment_num for j in replica_jobs)
        if (
            is_out_of_date
            and any(j.status not in stopping_statuses for j in replica_jobs)
            and not is_replica_registered(replica_jobs)
        ):
            replicas_to_stop_count += 1

    # Stop excessive registered out-of-date replicas
    non_terminating_registered_replicas_count = 0
    for _, replica_jobs in group_jobs_by_replica_latest(run_model.jobs):
        assert group.name is not None, "Group name is always set"
        if not job_belongs_to_group(replica_jobs[0], group.name):
            continue
        if is_replica_registered(replica_jobs) and all(
            j.status != JobStatus.TERMINATING for j in replica_jobs
        ):
            non_terminating_registered_replicas_count += 1

    replicas_to_stop_count += max(0, non_terminating_registered_replicas_count - group_desired)

    if replicas_to_stop_count <= 0:
        return

    # Build lists again to get current state
    active_replicas, inactive_replicas = build_replica_lists(
        run_model=run_model,
        group_filter=group.name,
    )
    await scale_run_replicas_for_group(
        session=session,
        run_model=run_model,
        group=group,
        replicas_diff=-replicas_to_stop_count,
        run_spec=run_spec,
        active_replicas=active_replicas,
        inactive_replicas=inactive_replicas,
    )