dstack 0.0.9__py3-none-any.whl → 0.20.7__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
Files changed (879)
  1. dstack/_internal/cli/commands/__init__.py +80 -0
  2. dstack/_internal/cli/commands/apply.py +100 -0
  3. dstack/_internal/cli/commands/attach.py +161 -0
  4. dstack/_internal/cli/commands/completion.py +22 -0
  5. dstack/_internal/cli/commands/delete.py +44 -0
  6. dstack/_internal/cli/commands/event.py +168 -0
  7. dstack/_internal/cli/commands/fleet.py +161 -0
  8. dstack/_internal/cli/commands/gateway.py +159 -0
  9. dstack/_internal/cli/commands/init.py +64 -0
  10. dstack/_internal/cli/commands/login.py +352 -0
  11. dstack/_internal/cli/commands/logs.py +62 -0
  12. dstack/_internal/cli/commands/metrics.py +153 -0
  13. dstack/_internal/cli/commands/offer.py +146 -0
  14. dstack/_internal/cli/commands/project.py +259 -0
  15. dstack/_internal/cli/commands/ps.py +81 -0
  16. dstack/_internal/cli/commands/run.py +69 -0
  17. dstack/_internal/cli/commands/secrets.py +92 -0
  18. dstack/_internal/cli/commands/server.py +96 -0
  19. dstack/_internal/cli/commands/stop.py +26 -0
  20. dstack/_internal/cli/commands/volume.py +117 -0
  21. dstack/_internal/cli/main.py +101 -0
  22. dstack/_internal/cli/models/gateways.py +16 -0
  23. dstack/_internal/cli/models/offers.py +47 -0
  24. dstack/_internal/cli/models/runs.py +16 -0
  25. dstack/_internal/cli/services/args.py +31 -0
  26. dstack/_internal/cli/services/completion.py +91 -0
  27. dstack/_internal/cli/services/configurators/__init__.py +86 -0
  28. dstack/_internal/cli/services/configurators/base.py +103 -0
  29. dstack/_internal/cli/services/configurators/fleet.py +475 -0
  30. dstack/_internal/cli/services/configurators/gateway.py +231 -0
  31. dstack/_internal/cli/services/configurators/run.py +882 -0
  32. dstack/_internal/cli/services/configurators/volume.py +222 -0
  33. dstack/_internal/cli/services/events.py +68 -0
  34. dstack/_internal/cli/services/profile.py +182 -0
  35. dstack/_internal/cli/services/repos.py +71 -0
  36. dstack/_internal/cli/services/resources.py +54 -0
  37. dstack/_internal/cli/utils/common.py +159 -0
  38. dstack/_internal/cli/utils/fleet.py +106 -0
  39. dstack/_internal/cli/utils/gateway.py +56 -0
  40. dstack/_internal/cli/utils/gpu.py +178 -0
  41. dstack/_internal/cli/utils/rich.py +156 -0
  42. dstack/_internal/cli/utils/run.py +517 -0
  43. dstack/_internal/cli/utils/secrets.py +25 -0
  44. dstack/_internal/cli/utils/updates.py +98 -0
  45. dstack/_internal/cli/utils/volume.py +58 -0
  46. dstack/_internal/compat.py +3 -0
  47. dstack/_internal/core/backends/amddevcloud/__init__.py +1 -0
  48. dstack/_internal/core/backends/amddevcloud/backend.py +16 -0
  49. dstack/_internal/core/backends/amddevcloud/compute.py +5 -0
  50. dstack/_internal/core/backends/amddevcloud/configurator.py +29 -0
  51. dstack/_internal/core/backends/aws/auth.py +30 -0
  52. dstack/_internal/core/backends/aws/backend.py +31 -0
  53. dstack/_internal/core/backends/aws/compute.py +1153 -0
  54. dstack/_internal/core/backends/aws/configurator.py +191 -0
  55. dstack/_internal/core/backends/aws/models.py +135 -0
  56. dstack/_internal/core/backends/aws/resources.py +700 -0
  57. dstack/_internal/core/backends/azure/auth.py +39 -0
  58. dstack/_internal/core/backends/azure/backend.py +21 -0
  59. dstack/_internal/core/backends/azure/compute.py +676 -0
  60. dstack/_internal/core/backends/azure/configurator.py +472 -0
  61. dstack/_internal/core/backends/azure/models.py +98 -0
  62. dstack/_internal/core/backends/azure/resources.py +116 -0
  63. dstack/_internal/core/backends/azure/utils.py +42 -0
  64. dstack/_internal/core/backends/base/backend.py +18 -0
  65. dstack/_internal/core/backends/base/compute.py +1101 -0
  66. dstack/_internal/core/backends/base/configurator.py +117 -0
  67. dstack/_internal/core/backends/base/models.py +24 -0
  68. dstack/_internal/core/backends/base/offers.py +232 -0
  69. dstack/_internal/core/backends/cloudrift/api_client.py +220 -0
  70. dstack/_internal/core/backends/cloudrift/backend.py +16 -0
  71. dstack/_internal/core/backends/cloudrift/compute.py +138 -0
  72. dstack/_internal/core/backends/cloudrift/configurator.py +72 -0
  73. dstack/_internal/core/backends/cloudrift/models.py +40 -0
  74. dstack/_internal/core/backends/configurators.py +181 -0
  75. dstack/_internal/core/backends/cudo/__init__.py +0 -0
  76. dstack/_internal/core/backends/cudo/api_client.py +111 -0
  77. dstack/_internal/core/backends/cudo/backend.py +16 -0
  78. dstack/_internal/core/backends/cudo/compute.py +174 -0
  79. dstack/_internal/core/backends/cudo/configurator.py +63 -0
  80. dstack/_internal/core/backends/cudo/models.py +37 -0
  81. dstack/_internal/core/backends/datacrunch/__init__.py +1 -0
  82. dstack/_internal/core/backends/datacrunch/backend.py +18 -0
  83. dstack/_internal/core/backends/datacrunch/compute.py +8 -0
  84. dstack/_internal/core/backends/datacrunch/configurator.py +17 -0
  85. dstack/_internal/core/backends/digitalocean/__init__.py +1 -0
  86. dstack/_internal/core/backends/digitalocean/backend.py +16 -0
  87. dstack/_internal/core/backends/digitalocean/compute.py +5 -0
  88. dstack/_internal/core/backends/digitalocean/configurator.py +31 -0
  89. dstack/_internal/core/backends/digitalocean_base/__init__.py +1 -0
  90. dstack/_internal/core/backends/digitalocean_base/api_client.py +104 -0
  91. dstack/_internal/core/backends/digitalocean_base/backend.py +5 -0
  92. dstack/_internal/core/backends/digitalocean_base/compute.py +174 -0
  93. dstack/_internal/core/backends/digitalocean_base/configurator.py +57 -0
  94. dstack/_internal/core/backends/digitalocean_base/models.py +43 -0
  95. dstack/_internal/core/backends/dstack/__init__.py +0 -0
  96. dstack/_internal/core/backends/dstack/models.py +26 -0
  97. dstack/_internal/core/backends/features.py +74 -0
  98. dstack/_internal/core/backends/gcp/__init__.py +0 -0
  99. dstack/_internal/core/backends/gcp/auth.py +57 -0
  100. dstack/_internal/core/backends/gcp/backend.py +17 -0
  101. dstack/_internal/core/backends/gcp/compute.py +1257 -0
  102. dstack/_internal/core/backends/gcp/configurator.py +206 -0
  103. dstack/_internal/core/backends/gcp/features/__init__.py +0 -0
  104. dstack/_internal/core/backends/gcp/features/tcpx.py +65 -0
  105. dstack/_internal/core/backends/gcp/models.py +160 -0
  106. dstack/_internal/core/backends/gcp/resources.py +585 -0
  107. dstack/_internal/core/backends/hotaisle/__init__.py +1 -0
  108. dstack/_internal/core/backends/hotaisle/api_client.py +101 -0
  109. dstack/_internal/core/backends/hotaisle/backend.py +16 -0
  110. dstack/_internal/core/backends/hotaisle/compute.py +188 -0
  111. dstack/_internal/core/backends/hotaisle/configurator.py +66 -0
  112. dstack/_internal/core/backends/hotaisle/models.py +45 -0
  113. dstack/_internal/core/backends/kubernetes/__init__.py +0 -0
  114. dstack/_internal/core/backends/kubernetes/backend.py +16 -0
  115. dstack/_internal/core/backends/kubernetes/compute.py +1077 -0
  116. dstack/_internal/core/backends/kubernetes/configurator.py +61 -0
  117. dstack/_internal/core/backends/kubernetes/models.py +71 -0
  118. dstack/_internal/core/backends/kubernetes/utils.py +81 -0
  119. dstack/_internal/core/backends/lambdalabs/__init__.py +0 -0
  120. dstack/_internal/core/backends/lambdalabs/api_client.py +87 -0
  121. dstack/_internal/core/backends/lambdalabs/backend.py +17 -0
  122. dstack/_internal/core/backends/lambdalabs/compute.py +233 -0
  123. dstack/_internal/core/backends/lambdalabs/configurator.py +65 -0
  124. dstack/_internal/core/backends/lambdalabs/models.py +37 -0
  125. dstack/_internal/core/backends/local/__init__.py +0 -0
  126. dstack/_internal/core/backends/local/backend.py +14 -0
  127. dstack/_internal/core/backends/local/compute.py +130 -0
  128. dstack/_internal/core/backends/models.py +158 -0
  129. dstack/_internal/core/backends/nebius/__init__.py +0 -0
  130. dstack/_internal/core/backends/nebius/backend.py +16 -0
  131. dstack/_internal/core/backends/nebius/compute.py +401 -0
  132. dstack/_internal/core/backends/nebius/configurator.py +98 -0
  133. dstack/_internal/core/backends/nebius/models.py +185 -0
  134. dstack/_internal/core/backends/nebius/resources.py +433 -0
  135. dstack/_internal/core/backends/oci/__init__.py +0 -0
  136. dstack/_internal/core/backends/oci/auth.py +21 -0
  137. dstack/_internal/core/backends/oci/backend.py +16 -0
  138. dstack/_internal/core/backends/oci/compute.py +209 -0
  139. dstack/_internal/core/backends/oci/configurator.py +156 -0
  140. dstack/_internal/core/backends/oci/exceptions.py +15 -0
  141. dstack/_internal/core/backends/oci/models.py +87 -0
  142. dstack/_internal/core/backends/oci/region.py +86 -0
  143. dstack/_internal/core/backends/oci/resources.py +836 -0
  144. dstack/_internal/core/backends/runpod/__init__.py +0 -0
  145. dstack/_internal/core/backends/runpod/api_client.py +627 -0
  146. dstack/_internal/core/backends/runpod/backend.py +16 -0
  147. dstack/_internal/core/backends/runpod/compute.py +444 -0
  148. dstack/_internal/core/backends/runpod/configurator.py +63 -0
  149. dstack/_internal/core/backends/runpod/models.py +54 -0
  150. dstack/_internal/core/backends/template/__init__.py +0 -0
  151. dstack/_internal/core/backends/template/backend.py.jinja +16 -0
  152. dstack/_internal/core/backends/template/compute.py.jinja +95 -0
  153. dstack/_internal/core/backends/template/configurator.py.jinja +69 -0
  154. dstack/_internal/core/backends/template/models.py.jinja +62 -0
  155. dstack/_internal/core/backends/tensordock/models.py +40 -0
  156. dstack/_internal/core/backends/vastai/__init__.py +0 -0
  157. dstack/_internal/core/backends/vastai/api_client.py +143 -0
  158. dstack/_internal/core/backends/vastai/backend.py +16 -0
  159. dstack/_internal/core/backends/vastai/compute.py +141 -0
  160. dstack/_internal/core/backends/vastai/configurator.py +69 -0
  161. dstack/_internal/core/backends/vastai/models.py +37 -0
  162. dstack/_internal/core/backends/verda/__init__.py +0 -0
  163. dstack/_internal/core/backends/verda/backend.py +16 -0
  164. dstack/_internal/core/backends/verda/compute.py +266 -0
  165. dstack/_internal/core/backends/verda/configurator.py +73 -0
  166. dstack/_internal/core/backends/verda/models.py +38 -0
  167. dstack/_internal/core/backends/vultr/__init__.py +0 -0
  168. dstack/_internal/core/backends/vultr/api_client.py +116 -0
  169. dstack/_internal/core/backends/vultr/backend.py +16 -0
  170. dstack/_internal/core/backends/vultr/compute.py +167 -0
  171. dstack/_internal/core/backends/vultr/configurator.py +71 -0
  172. dstack/_internal/core/backends/vultr/models.py +34 -0
  173. dstack/_internal/core/compatibility/__init__.py +0 -0
  174. dstack/_internal/core/compatibility/events.py +13 -0
  175. dstack/_internal/core/compatibility/fleets.py +58 -0
  176. dstack/_internal/core/compatibility/gateways.py +39 -0
  177. dstack/_internal/core/compatibility/gpus.py +13 -0
  178. dstack/_internal/core/compatibility/logs.py +14 -0
  179. dstack/_internal/core/compatibility/runs.py +86 -0
  180. dstack/_internal/core/compatibility/volumes.py +37 -0
  181. dstack/_internal/core/consts.py +8 -0
  182. dstack/_internal/core/errors.py +160 -0
  183. dstack/_internal/core/models/__init__.py +0 -0
  184. dstack/_internal/core/models/auth.py +28 -0
  185. dstack/_internal/core/models/backends/__init__.py +0 -0
  186. dstack/_internal/core/models/backends/base.py +48 -0
  187. dstack/_internal/core/models/common.py +143 -0
  188. dstack/_internal/core/models/compute_groups.py +39 -0
  189. dstack/_internal/core/models/config.py +28 -0
  190. dstack/_internal/core/models/configurations.py +1123 -0
  191. dstack/_internal/core/models/envs.py +149 -0
  192. dstack/_internal/core/models/events.py +98 -0
  193. dstack/_internal/core/models/files.py +67 -0
  194. dstack/_internal/core/models/fleets.py +437 -0
  195. dstack/_internal/core/models/gateways.py +146 -0
  196. dstack/_internal/core/models/gpus.py +45 -0
  197. dstack/_internal/core/models/health.py +28 -0
  198. dstack/_internal/core/models/instances.py +346 -0
  199. dstack/_internal/core/models/logs.py +27 -0
  200. dstack/_internal/core/models/metrics.py +14 -0
  201. dstack/_internal/core/models/placement.py +27 -0
  202. dstack/_internal/core/models/profiles.py +431 -0
  203. dstack/_internal/core/models/projects.py +46 -0
  204. dstack/_internal/core/models/repos/__init__.py +34 -0
  205. dstack/_internal/core/models/repos/base.py +36 -0
  206. dstack/_internal/core/models/repos/local.py +96 -0
  207. dstack/_internal/core/models/repos/remote.py +341 -0
  208. dstack/_internal/core/models/repos/virtual.py +85 -0
  209. dstack/_internal/core/models/resources.py +424 -0
  210. dstack/_internal/core/models/routers.py +24 -0
  211. dstack/_internal/core/models/runs.py +618 -0
  212. dstack/_internal/core/models/secrets.py +16 -0
  213. dstack/_internal/core/models/server.py +7 -0
  214. dstack/_internal/core/models/services.py +76 -0
  215. dstack/_internal/core/models/unix.py +53 -0
  216. dstack/_internal/core/models/users.py +60 -0
  217. dstack/_internal/core/models/volumes.py +221 -0
  218. dstack/_internal/core/services/__init__.py +16 -0
  219. dstack/_internal/core/services/api_client.py +15 -0
  220. dstack/_internal/core/services/configs/__init__.py +116 -0
  221. dstack/_internal/core/services/diff.py +71 -0
  222. dstack/_internal/core/services/logs.py +58 -0
  223. dstack/_internal/core/services/profiles.py +46 -0
  224. dstack/_internal/core/services/repos.py +236 -0
  225. dstack/_internal/core/services/ssh/__init__.py +27 -0
  226. dstack/_internal/core/services/ssh/attach.py +241 -0
  227. dstack/_internal/core/services/ssh/client.py +113 -0
  228. dstack/_internal/core/services/ssh/key_manager.py +53 -0
  229. dstack/_internal/core/services/ssh/ports.py +89 -0
  230. dstack/_internal/core/services/ssh/tunnel.py +337 -0
  231. dstack/_internal/proxy/__init__.py +8 -0
  232. dstack/_internal/proxy/gateway/__init__.py +0 -0
  233. dstack/_internal/proxy/gateway/app.py +89 -0
  234. dstack/_internal/proxy/gateway/auth.py +26 -0
  235. dstack/_internal/proxy/gateway/const.py +7 -0
  236. dstack/_internal/proxy/gateway/deps.py +73 -0
  237. dstack/_internal/proxy/gateway/main.py +17 -0
  238. dstack/_internal/proxy/gateway/models.py +23 -0
  239. dstack/_internal/proxy/gateway/repo/__init__.py +0 -0
  240. dstack/_internal/proxy/gateway/repo/repo.py +121 -0
  241. dstack/_internal/proxy/gateway/repo/state_v1.py +164 -0
  242. dstack/_internal/proxy/gateway/resources/nginx/00-log-format.conf +11 -0
  243. dstack/_internal/proxy/gateway/resources/nginx/entrypoint.jinja2 +27 -0
  244. dstack/_internal/proxy/gateway/resources/nginx/router_workers.jinja2 +23 -0
  245. dstack/_internal/proxy/gateway/resources/nginx/service.jinja2 +105 -0
  246. dstack/_internal/proxy/gateway/routers/__init__.py +0 -0
  247. dstack/_internal/proxy/gateway/routers/auth.py +10 -0
  248. dstack/_internal/proxy/gateway/routers/config.py +28 -0
  249. dstack/_internal/proxy/gateway/routers/registry.py +124 -0
  250. dstack/_internal/proxy/gateway/routers/stats.py +18 -0
  251. dstack/_internal/proxy/gateway/schemas/__init__.py +0 -0
  252. dstack/_internal/proxy/gateway/schemas/common.py +5 -0
  253. dstack/_internal/proxy/gateway/schemas/config.py +9 -0
  254. dstack/_internal/proxy/gateway/schemas/registry.py +63 -0
  255. dstack/_internal/proxy/gateway/schemas/stats.py +15 -0
  256. dstack/_internal/proxy/gateway/services/__init__.py +0 -0
  257. dstack/_internal/proxy/gateway/services/model_routers/__init__.py +18 -0
  258. dstack/_internal/proxy/gateway/services/model_routers/base.py +91 -0
  259. dstack/_internal/proxy/gateway/services/model_routers/sglang.py +269 -0
  260. dstack/_internal/proxy/gateway/services/nginx.py +455 -0
  261. dstack/_internal/proxy/gateway/services/registry.py +426 -0
  262. dstack/_internal/proxy/gateway/services/server_client.py +95 -0
  263. dstack/_internal/proxy/gateway/services/stats.py +170 -0
  264. dstack/_internal/proxy/gateway/testing/__init__.py +0 -0
  265. dstack/_internal/proxy/gateway/testing/common.py +13 -0
  266. dstack/_internal/proxy/lib/__init__.py +0 -0
  267. dstack/_internal/proxy/lib/auth.py +7 -0
  268. dstack/_internal/proxy/lib/deps.py +106 -0
  269. dstack/_internal/proxy/lib/errors.py +14 -0
  270. dstack/_internal/proxy/lib/models.py +112 -0
  271. dstack/_internal/proxy/lib/repo.py +27 -0
  272. dstack/_internal/proxy/lib/routers/__init__.py +0 -0
  273. dstack/_internal/proxy/lib/routers/model_proxy.py +102 -0
  274. dstack/_internal/proxy/lib/schemas/__init__.py +0 -0
  275. dstack/_internal/proxy/lib/schemas/model_proxy.py +77 -0
  276. dstack/_internal/proxy/lib/services/__init__.py +0 -0
  277. dstack/_internal/proxy/lib/services/model_proxy/__init__.py +0 -0
  278. dstack/_internal/proxy/lib/services/model_proxy/clients/__init__.py +0 -0
  279. dstack/_internal/proxy/lib/services/model_proxy/clients/base.py +18 -0
  280. dstack/_internal/proxy/lib/services/model_proxy/clients/openai.py +67 -0
  281. dstack/_internal/proxy/lib/services/model_proxy/clients/tgi.py +208 -0
  282. dstack/_internal/proxy/lib/services/model_proxy/model_proxy.py +23 -0
  283. dstack/_internal/proxy/lib/services/service_connection.py +160 -0
  284. dstack/_internal/proxy/lib/testing/__init__.py +0 -0
  285. dstack/_internal/proxy/lib/testing/auth.py +11 -0
  286. dstack/_internal/proxy/lib/testing/common.py +51 -0
  287. dstack/_internal/server/__init__.py +0 -0
  288. dstack/_internal/server/alembic.ini +100 -0
  289. dstack/_internal/server/app.py +432 -0
  290. dstack/_internal/server/background/__init__.py +142 -0
  291. dstack/_internal/server/background/tasks/__init__.py +0 -0
  292. dstack/_internal/server/background/tasks/common.py +24 -0
  293. dstack/_internal/server/background/tasks/process_compute_groups.py +167 -0
  294. dstack/_internal/server/background/tasks/process_events.py +17 -0
  295. dstack/_internal/server/background/tasks/process_fleets.py +289 -0
  296. dstack/_internal/server/background/tasks/process_gateways.py +188 -0
  297. dstack/_internal/server/background/tasks/process_idle_volumes.py +145 -0
  298. dstack/_internal/server/background/tasks/process_instances.py +1186 -0
  299. dstack/_internal/server/background/tasks/process_metrics.py +172 -0
  300. dstack/_internal/server/background/tasks/process_placement_groups.py +104 -0
  301. dstack/_internal/server/background/tasks/process_probes.py +164 -0
  302. dstack/_internal/server/background/tasks/process_prometheus_metrics.py +150 -0
  303. dstack/_internal/server/background/tasks/process_running_jobs.py +1238 -0
  304. dstack/_internal/server/background/tasks/process_runs.py +842 -0
  305. dstack/_internal/server/background/tasks/process_submitted_jobs.py +1106 -0
  306. dstack/_internal/server/background/tasks/process_terminating_jobs.py +108 -0
  307. dstack/_internal/server/background/tasks/process_volumes.py +129 -0
  308. dstack/_internal/server/compatibility/__init__.py +0 -0
  309. dstack/_internal/server/compatibility/common.py +20 -0
  310. dstack/_internal/server/compatibility/gpus.py +22 -0
  311. dstack/_internal/server/db.py +127 -0
  312. dstack/_internal/server/deps.py +19 -0
  313. dstack/_internal/server/main.py +4 -0
  314. dstack/_internal/server/migrations/__init__.py +0 -0
  315. dstack/_internal/server/migrations/env.py +112 -0
  316. dstack/_internal/server/migrations/script.py.mako +28 -0
  317. dstack/_internal/server/migrations/versions/006512f572b4_add_projects_original_name.py +38 -0
  318. dstack/_internal/server/migrations/versions/065588ec72b8_add_vultr_to_backendtype_enum.py +81 -0
  319. dstack/_internal/server/migrations/versions/06e977bc61c7_add_usermodel_deleted_and_original_name.py +45 -0
  320. dstack/_internal/server/migrations/versions/0e33559e16ed_update_instancestatus.py +64 -0
  321. dstack/_internal/server/migrations/versions/112753bc17dd_remove_nullable_fields.py +50 -0
  322. dstack/_internal/server/migrations/versions/1338b788b612_reverse_job_instance_relationship.py +71 -0
  323. dstack/_internal/server/migrations/versions/14f2cb002fc2_add_jobmodel_removed_flag.py +44 -0
  324. dstack/_internal/server/migrations/versions/1a48dfe44a40_rework_termination_handling.py +42 -0
  325. dstack/_internal/server/migrations/versions/1aa9638ad963_added_email_index.py +31 -0
  326. dstack/_internal/server/migrations/versions/1e3fb39ef74b_add_remote_connection_details.py +26 -0
  327. dstack/_internal/server/migrations/versions/1e76fb0dde87_add_jobmodel_inactivity_secs.py +32 -0
  328. dstack/_internal/server/migrations/versions/20166748b60c_add_jobmodel_disconnected_at.py +100 -0
  329. dstack/_internal/server/migrations/versions/22d74df9897e_add_events_and_event_targets.py +99 -0
  330. dstack/_internal/server/migrations/versions/23e01c56279a_make_blob_nullable.py +32 -0
  331. dstack/_internal/server/migrations/versions/2498ab323443_add_fleetmodel_consolidation_attempt_.py +44 -0
  332. dstack/_internal/server/migrations/versions/252d3743b641_.py +40 -0
  333. dstack/_internal/server/migrations/versions/25479f540245_add_probes.py +43 -0
  334. dstack/_internal/server/migrations/versions/27d3e55759fa_add_pools.py +152 -0
  335. dstack/_internal/server/migrations/versions/29826f417010_remove_instancemodel_retry_policy.py +34 -0
  336. dstack/_internal/server/migrations/versions/29c08c6a8cb3_.py +36 -0
  337. dstack/_internal/server/migrations/versions/35e90e1b0d3e_add_rolling_deployment_fields.py +42 -0
  338. dstack/_internal/server/migrations/versions/35f732ee4cf5_add_projectmodel_is_public.py +39 -0
  339. dstack/_internal/server/migrations/versions/3cf77fb8bcf1_store_repo_clone_url.py +85 -0
  340. dstack/_internal/server/migrations/versions/3d7f6c2ec000_add_jobmodel_registered.py +28 -0
  341. dstack/_internal/server/migrations/versions/3dbdce90d0e0_fix_code_uq_constraint.py +33 -0
  342. dstack/_internal/server/migrations/versions/48ad3ecbaea2_do_not_delete_projects_and_runs.py +46 -0
  343. dstack/_internal/server/migrations/versions/4ae1a5b0e7f1_add_run_list_index.py +34 -0
  344. dstack/_internal/server/migrations/versions/4b4319398164_introduce_runs_processing.py +144 -0
  345. dstack/_internal/server/migrations/versions/50dd7ea98639_index_status_columns.py +55 -0
  346. dstack/_internal/server/migrations/versions/51d45659d574_add_instancemodel_blocks_fields.py +43 -0
  347. dstack/_internal/server/migrations/versions/54a77e19c64c_add_manager_project_role.py +67 -0
  348. dstack/_internal/server/migrations/versions/555138b1f77f_change_instancemodel_for_asynchronous_.py +61 -0
  349. dstack/_internal/server/migrations/versions/58aa5162dcc3_add_gatewaymodel_configuration.py +32 -0
  350. dstack/_internal/server/migrations/versions/5ad8debc8fe6_fixes_for_psql.py +329 -0
  351. dstack/_internal/server/migrations/versions/5ec538b70e71_replace_instansestatus.py +31 -0
  352. dstack/_internal/server/migrations/versions/5f1707c525d2_add_filearchivemodel.py +39 -0
  353. dstack/_internal/server/migrations/versions/5fd659afca82_add_ix_instances_fleet_id.py +31 -0
  354. dstack/_internal/server/migrations/versions/60e444118b6d_add_jobprometheusmetrics.py +40 -0
  355. dstack/_internal/server/migrations/versions/63c3f19cb184_add_jobterminationreason_inactivity_.py +83 -0
  356. dstack/_internal/server/migrations/versions/644b8a114187_add_secretmodel.py +49 -0
  357. dstack/_internal/server/migrations/versions/686fb8341ea5_add_user_emails.py +32 -0
  358. dstack/_internal/server/migrations/versions/6c1a9d6530ee_add_jobmodel_exit_status.py +26 -0
  359. dstack/_internal/server/migrations/versions/706e0acc3a7d_add_runmodel_desired_replica_counts.py +26 -0
  360. dstack/_internal/server/migrations/versions/710e5b3fac8f_add_encryption.py +54 -0
  361. dstack/_internal/server/migrations/versions/728b1488b1b4_add_instance_health.py +50 -0
  362. dstack/_internal/server/migrations/versions/74a1f55209bd_store_enums_as_strings.py +484 -0
  363. dstack/_internal/server/migrations/versions/7b24b1c8eba7_add_instancemodel_last_processed_at.py +68 -0
  364. dstack/_internal/server/migrations/versions/7ba3b59d7ca6_add_runmodel_resubmission_attempt.py +35 -0
  365. dstack/_internal/server/migrations/versions/7bc2586e8b9e_make_instancemodel_pool_id_optional.py +36 -0
  366. dstack/_internal/server/migrations/versions/7d1ec2b920ac_add_computegroupmodel.py +91 -0
  367. dstack/_internal/server/migrations/versions/803c7e9ed85d_add_jobmodel_job_runtime_data.py +32 -0
  368. dstack/_internal/server/migrations/versions/82b32a135ea2_.py +58 -0
  369. dstack/_internal/server/migrations/versions/866ec1d67184_replace_retrypolicy_limit_with_.py +93 -0
  370. dstack/_internal/server/migrations/versions/903c91e24634_add_instances_termination_reason_message.py +34 -0
  371. dstack/_internal/server/migrations/versions/91a12fff6c76_add_repocredsmodel.py +43 -0
  372. dstack/_internal/server/migrations/versions/91ac5e543037_extend_repos_creds_column.py +36 -0
  373. dstack/_internal/server/migrations/versions/98cd9c8b5927_add_volumemodel.py +73 -0
  374. dstack/_internal/server/migrations/versions/98d1b92988bc_add_jobterminationreason_terminated_due_.py +140 -0
  375. dstack/_internal/server/migrations/versions/99b4c8c954ea_add_termination_reason_message.py +71 -0
  376. dstack/_internal/server/migrations/versions/9eea6af28e10_added_fail_reason_for_instancemodel.py +36 -0
  377. dstack/_internal/server/migrations/versions/__init__.py +0 -0
  378. dstack/_internal/server/migrations/versions/a060e2440936_.py +206 -0
  379. dstack/_internal/server/migrations/versions/a751ef183f27_move_attachment_data_to_volumes_.py +34 -0
  380. dstack/_internal/server/migrations/versions/a7b46c073fa1_add_placementgroupmodel.py +58 -0
  381. dstack/_internal/server/migrations/versions/afbc600ff2b2_add_created_at_to_usermodel_and_.py +102 -0
  382. dstack/_internal/server/migrations/versions/b4d6ad60db08_add_instancemodel_unreachable.py +37 -0
  383. dstack/_internal/server/migrations/versions/b88d55c2a07d_replace_instancestatus_ready.py +21 -0
  384. dstack/_internal/server/migrations/versions/bc8ca4a505c6_store_backendtype_as_string.py +171 -0
  385. dstack/_internal/server/migrations/versions/bca2fdf130bf_add_runmodel_priority.py +34 -0
  386. dstack/_internal/server/migrations/versions/bfba43f6def2_.py +32 -0
  387. dstack/_internal/server/migrations/versions/c00090eaef21_support_fleets.py +108 -0
  388. dstack/_internal/server/migrations/versions/c154eece89da_add_fields_for_async_gateway_creation.py +74 -0
  389. dstack/_internal/server/migrations/versions/c20626d03cfb_add_jobmetricspoint.py +43 -0
  390. dstack/_internal/server/migrations/versions/c48df7985d57_add_instance_termination_retries.py +38 -0
  391. dstack/_internal/server/migrations/versions/c83d45f9a971_replace_string_with_text.py +150 -0
  392. dstack/_internal/server/migrations/versions/d0bb68e48b9f_add_project_owners_and_quotas.py +106 -0
  393. dstack/_internal/server/migrations/versions/d3e8af4786fa_gateway_compute_flag_deleted.py +34 -0
  394. dstack/_internal/server/migrations/versions/d4d9dc26cf58_add_ix_jobs_run_id.py +31 -0
  395. dstack/_internal/server/migrations/versions/d5863798bf41_add_volumemodel_last_job_processed_at.py +40 -0
  396. dstack/_internal/server/migrations/versions/d6b11105f659_add_usermodel_active.py +36 -0
  397. dstack/_internal/server/migrations/versions/da574e93fee0_add_jobmodel_volumes_detached_at.py +40 -0
  398. dstack/_internal/server/migrations/versions/dfffd6a1165c_add_fields_for_gateways_behind_alb.py +36 -0
  399. dstack/_internal/server/migrations/versions/e2d08cd1b8d9_add_jobmodel_fleet.py +41 -0
  400. dstack/_internal/server/migrations/versions/e3b7db07727f_add_gatewaycomputemodel_app_updated_at.py +61 -0
  401. dstack/_internal/server/migrations/versions/e6391ca6a264_separate_gateways_from_compute.py +72 -0
  402. dstack/_internal/server/migrations/versions/ea60480f82bb_add_membermodel_member_num.py +32 -0
  403. dstack/_internal/server/migrations/versions/ec02a26a256c_add_runmodel_next_triggered_at.py +38 -0
  404. dstack/_internal/server/migrations/versions/ed0ca30e13bb_migrate_instancestatus_provisioning.py +29 -0
  405. dstack/_internal/server/migrations/versions/fe72c4de8376_add_gateways.py +81 -0
  406. dstack/_internal/server/migrations/versions/ff1d94f65b08_user_ssh_key.py +34 -0
  407. dstack/_internal/server/migrations/versions/ffa99edd1988_add_jobterminationreason_max_duration_.py +81 -0
  408. dstack/_internal/server/models.py +930 -0
  409. dstack/_internal/server/routers/__init__.py +0 -0
  410. dstack/_internal/server/routers/auth.py +34 -0
  411. dstack/_internal/server/routers/backends.py +142 -0
  412. dstack/_internal/server/routers/events.py +60 -0
  413. dstack/_internal/server/routers/files.py +68 -0
  414. dstack/_internal/server/routers/fleets.py +202 -0
  415. dstack/_internal/server/routers/gateways.py +109 -0
  416. dstack/_internal/server/routers/gpus.py +32 -0
  417. dstack/_internal/server/routers/instances.py +77 -0
  418. dstack/_internal/server/routers/logs.py +34 -0
  419. dstack/_internal/server/routers/metrics.py +82 -0
  420. dstack/_internal/server/routers/projects.py +205 -0
  421. dstack/_internal/server/routers/prometheus.py +35 -0
  422. dstack/_internal/server/routers/repos.py +118 -0
  423. dstack/_internal/server/routers/runs.py +216 -0
  424. dstack/_internal/server/routers/secrets.py +86 -0
  425. dstack/_internal/server/routers/server.py +19 -0
  426. dstack/_internal/server/routers/users.py +158 -0
  427. dstack/_internal/server/routers/volumes.py +122 -0
  428. dstack/_internal/server/schemas/__init__.py +0 -0
  429. dstack/_internal/server/schemas/auth.py +83 -0
  430. dstack/_internal/server/schemas/backends.py +16 -0
  431. dstack/_internal/server/schemas/common.py +9 -0
  432. dstack/_internal/server/schemas/events.py +211 -0
  433. dstack/_internal/server/schemas/files.py +5 -0
  434. dstack/_internal/server/schemas/fleets.py +49 -0
  435. dstack/_internal/server/schemas/gateways.py +31 -0
  436. dstack/_internal/server/schemas/gpus.py +26 -0
  437. dstack/_internal/server/schemas/health/__init__.py +0 -0
  438. dstack/_internal/server/schemas/health/dcgm.py +56 -0
  439. dstack/_internal/server/schemas/instances.py +47 -0
  440. dstack/_internal/server/schemas/logs.py +17 -0
  441. dstack/_internal/server/schemas/projects.py +81 -0
  442. dstack/_internal/server/schemas/repos.py +24 -0
  443. dstack/_internal/server/schemas/runner.py +269 -0
  444. dstack/_internal/server/schemas/runs.py +66 -0
  445. dstack/_internal/server/schemas/secrets.py +16 -0
  446. dstack/_internal/server/schemas/users.py +72 -0
  447. dstack/_internal/server/schemas/volumes.py +29 -0
  448. dstack/_internal/server/security/__init__.py +0 -0
  449. dstack/_internal/server/security/permissions.py +251 -0
  450. dstack/_internal/server/services/__init__.py +0 -0
  451. dstack/_internal/server/services/auth.py +77 -0
  452. dstack/_internal/server/services/backends/__init__.py +404 -0
  453. dstack/_internal/server/services/backends/handlers.py +105 -0
  454. dstack/_internal/server/services/compute_groups.py +22 -0
  455. dstack/_internal/server/services/config.py +279 -0
  456. dstack/_internal/server/services/docker.py +162 -0
  457. dstack/_internal/server/services/encryption/__init__.py +102 -0
  458. dstack/_internal/server/services/encryption/keys/__init__.py +0 -0
  459. dstack/_internal/server/services/encryption/keys/aes.py +68 -0
  460. dstack/_internal/server/services/encryption/keys/base.py +19 -0
  461. dstack/_internal/server/services/encryption/keys/identity.py +28 -0
  462. dstack/_internal/server/services/events.py +477 -0
  463. dstack/_internal/server/services/files.py +91 -0
  464. dstack/_internal/server/services/fleets.py +1224 -0
  465. dstack/_internal/server/services/gateways/__init__.py +686 -0
  466. dstack/_internal/server/services/gateways/client.py +209 -0
  467. dstack/_internal/server/services/gateways/connection.py +139 -0
  468. dstack/_internal/server/services/gateways/pool.py +58 -0
  469. dstack/_internal/server/services/gpus.py +387 -0
  470. dstack/_internal/server/services/instances.py +731 -0
  471. dstack/_internal/server/services/jobs/__init__.py +840 -0
  472. dstack/_internal/server/services/jobs/configurators/__init__.py +0 -0
  473. dstack/_internal/server/services/jobs/configurators/base.py +469 -0
  474. dstack/_internal/server/services/jobs/configurators/dev.py +69 -0
  475. dstack/_internal/server/services/jobs/configurators/extensions/__init__.py +0 -0
  476. dstack/_internal/server/services/jobs/configurators/extensions/base.py +15 -0
  477. dstack/_internal/server/services/jobs/configurators/extensions/cursor.py +42 -0
  478. dstack/_internal/server/services/jobs/configurators/extensions/vscode.py +42 -0
  479. dstack/_internal/server/services/jobs/configurators/extensions/windsurf.py +43 -0
  480. dstack/_internal/server/services/jobs/configurators/service.py +28 -0
  481. dstack/_internal/server/services/jobs/configurators/task.py +39 -0
  482. dstack/_internal/server/services/locking.py +187 -0
  483. dstack/_internal/server/services/logging.py +29 -0
  484. dstack/_internal/server/services/logs/__init__.py +122 -0
  485. dstack/_internal/server/services/logs/aws.py +373 -0
  486. dstack/_internal/server/services/logs/base.py +47 -0
  487. dstack/_internal/server/services/logs/filelog.py +261 -0
  488. dstack/_internal/server/services/logs/fluentbit.py +329 -0
  489. dstack/_internal/server/services/logs/gcp.py +181 -0
  490. dstack/_internal/server/services/metrics.py +172 -0
  491. dstack/_internal/server/services/offers.py +249 -0
  492. dstack/_internal/server/services/permissions.py +37 -0
  493. dstack/_internal/server/services/placement.py +234 -0
  494. dstack/_internal/server/services/plugins.py +109 -0
  495. dstack/_internal/server/services/probes.py +10 -0
  496. dstack/_internal/server/services/projects.py +835 -0
  497. dstack/_internal/server/services/prometheus/__init__.py +0 -0
  498. dstack/_internal/server/services/prometheus/client_metrics.py +55 -0
  499. dstack/_internal/server/services/prometheus/custom_metrics.py +327 -0
  500. dstack/_internal/server/services/proxy/__init__.py +3 -0
  501. dstack/_internal/server/services/proxy/auth.py +12 -0
  502. dstack/_internal/server/services/proxy/deps.py +18 -0
  503. dstack/_internal/server/services/proxy/repo.py +189 -0
  504. dstack/_internal/server/services/proxy/routers/__init__.py +0 -0
  505. dstack/_internal/server/services/proxy/routers/service_proxy.py +49 -0
  506. dstack/_internal/server/services/proxy/services/__init__.py +0 -0
  507. dstack/_internal/server/services/proxy/services/service_proxy.py +135 -0
  508. dstack/_internal/server/services/repos.py +362 -0
  509. dstack/_internal/server/services/requirements/__init__.py +0 -0
  510. dstack/_internal/server/services/requirements/combine.py +260 -0
  511. dstack/_internal/server/services/resources.py +21 -0
  512. dstack/_internal/server/services/runner/__init__.py +0 -0
  513. dstack/_internal/server/services/runner/client.py +646 -0
  514. dstack/_internal/server/services/runner/ssh.py +128 -0
  515. dstack/_internal/server/services/runs/__init__.py +1026 -0
  516. dstack/_internal/server/services/runs/plan.py +703 -0
  517. dstack/_internal/server/services/runs/replicas.py +317 -0
  518. dstack/_internal/server/services/runs/spec.py +191 -0
  519. dstack/_internal/server/services/secrets.py +245 -0
  520. dstack/_internal/server/services/services/__init__.py +345 -0
  521. dstack/_internal/server/services/services/autoscalers.py +140 -0
  522. dstack/_internal/server/services/services/options.py +53 -0
  523. dstack/_internal/server/services/ssh.py +67 -0
  524. dstack/_internal/server/services/storage/__init__.py +37 -0
  525. dstack/_internal/server/services/storage/base.py +48 -0
  526. dstack/_internal/server/services/storage/gcs.py +66 -0
  527. dstack/_internal/server/services/storage/s3.py +69 -0
  528. dstack/_internal/server/services/users.py +461 -0
  529. dstack/_internal/server/services/volumes.py +496 -0
  530. dstack/_internal/server/settings.py +161 -0
  531. dstack/_internal/server/statics/00a6e1fb461ed2929fb9.png +0 -0
  532. dstack/_internal/server/statics/0cae4d9f0a36034984a7.png +0 -0
  533. dstack/_internal/server/statics/391de232cc0e30cae513.png +0 -0
  534. dstack/_internal/server/statics/4e0eead8c1a73689ef9d.svg +1 -0
  535. dstack/_internal/server/statics/544afa2f63428c2235b0.png +0 -0
  536. dstack/_internal/server/statics/54a4f50f74c6b9381530.svg +7 -0
  537. dstack/_internal/server/statics/68dd1360a7d2611e0132.svg +4 -0
  538. dstack/_internal/server/statics/69544b4c81973b54a66f.png +0 -0
  539. dstack/_internal/server/statics/77a8b02b17af19e39266.png +0 -0
  540. dstack/_internal/server/statics/83a93a8871c219104367.svg +9 -0
  541. dstack/_internal/server/statics/8f28bb8e9999e5e6a48b.svg +4 -0
  542. dstack/_internal/server/statics/9124086961ab8c366bc4.svg +9 -0
  543. dstack/_internal/server/statics/9a9ebaeb54b025dbac0a.svg +5 -0
  544. dstack/_internal/server/statics/a3428392dc534f3b15c4.svg +7 -0
  545. dstack/_internal/server/statics/ae22625574d69361f72c.png +0 -0
  546. dstack/_internal/server/statics/assets/android-chrome-144x144.png +0 -0
  547. dstack/_internal/server/statics/assets/android-chrome-192x192.png +0 -0
  548. dstack/_internal/server/statics/assets/android-chrome-256x256.png +0 -0
  549. dstack/_internal/server/statics/assets/android-chrome-36x36.png +0 -0
  550. dstack/_internal/server/statics/assets/android-chrome-384x384.png +0 -0
  551. dstack/_internal/server/statics/assets/android-chrome-48x48.png +0 -0
  552. dstack/_internal/server/statics/assets/android-chrome-512x512.png +0 -0
  553. dstack/_internal/server/statics/assets/android-chrome-72x72.png +0 -0
  554. dstack/_internal/server/statics/assets/android-chrome-96x96.png +0 -0
  555. dstack/_internal/server/statics/assets/apple-touch-icon-1024x1024.png +0 -0
  556. dstack/_internal/server/statics/assets/apple-touch-icon-114x114.png +0 -0
  557. dstack/_internal/server/statics/assets/apple-touch-icon-120x120.png +0 -0
  558. dstack/_internal/server/statics/assets/apple-touch-icon-144x144.png +0 -0
  559. dstack/_internal/server/statics/assets/apple-touch-icon-152x152.png +0 -0
  560. dstack/_internal/server/statics/assets/apple-touch-icon-167x167.png +0 -0
  561. dstack/_internal/server/statics/assets/apple-touch-icon-180x180.png +0 -0
  562. dstack/_internal/server/statics/assets/apple-touch-icon-57x57.png +0 -0
  563. dstack/_internal/server/statics/assets/apple-touch-icon-60x60.png +0 -0
  564. dstack/_internal/server/statics/assets/apple-touch-icon-72x72.png +0 -0
  565. dstack/_internal/server/statics/assets/apple-touch-icon-76x76.png +0 -0
  566. dstack/_internal/server/statics/assets/apple-touch-icon-precomposed.png +0 -0
  567. dstack/_internal/server/statics/assets/apple-touch-icon.png +0 -0
  568. dstack/_internal/server/statics/assets/apple-touch-startup-image-1125x2436.png +0 -0
  569. dstack/_internal/server/statics/assets/apple-touch-startup-image-1136x640.png +0 -0
  570. dstack/_internal/server/statics/assets/apple-touch-startup-image-1170x2532.png +0 -0
  571. dstack/_internal/server/statics/assets/apple-touch-startup-image-1179x2556.png +0 -0
  572. dstack/_internal/server/statics/assets/apple-touch-startup-image-1242x2208.png +0 -0
  573. dstack/_internal/server/statics/assets/apple-touch-startup-image-1242x2688.png +0 -0
  574. dstack/_internal/server/statics/assets/apple-touch-startup-image-1284x2778.png +0 -0
  575. dstack/_internal/server/statics/assets/apple-touch-startup-image-1290x2796.png +0 -0
  576. dstack/_internal/server/statics/assets/apple-touch-startup-image-1334x750.png +0 -0
  577. dstack/_internal/server/statics/assets/apple-touch-startup-image-1488x2266.png +0 -0
  578. dstack/_internal/server/statics/assets/apple-touch-startup-image-1536x2048.png +0 -0
  579. dstack/_internal/server/statics/assets/apple-touch-startup-image-1620x2160.png +0 -0
  580. dstack/_internal/server/statics/assets/apple-touch-startup-image-1640x2160.png +0 -0
  581. dstack/_internal/server/statics/assets/apple-touch-startup-image-1668x2224.png +0 -0
  582. dstack/_internal/server/statics/assets/apple-touch-startup-image-1668x2388.png +0 -0
  583. dstack/_internal/server/statics/assets/apple-touch-startup-image-1792x828.png +0 -0
  584. dstack/_internal/server/statics/assets/apple-touch-startup-image-2048x1536.png +0 -0
  585. dstack/_internal/server/statics/assets/apple-touch-startup-image-2048x2732.png +0 -0
  586. dstack/_internal/server/statics/assets/apple-touch-startup-image-2160x1620.png +0 -0
  587. dstack/_internal/server/statics/assets/apple-touch-startup-image-2160x1640.png +0 -0
  588. dstack/_internal/server/statics/assets/apple-touch-startup-image-2208x1242.png +0 -0
  589. dstack/_internal/server/statics/assets/apple-touch-startup-image-2224x1668.png +0 -0
  590. dstack/_internal/server/statics/assets/apple-touch-startup-image-2266x1488.png +0 -0
  591. dstack/_internal/server/statics/assets/apple-touch-startup-image-2388x1668.png +0 -0
  592. dstack/_internal/server/statics/assets/apple-touch-startup-image-2436x1125.png +0 -0
  593. dstack/_internal/server/statics/assets/apple-touch-startup-image-2532x1170.png +0 -0
  594. dstack/_internal/server/statics/assets/apple-touch-startup-image-2556x1179.png +0 -0
  595. dstack/_internal/server/statics/assets/apple-touch-startup-image-2688x1242.png +0 -0
  596. dstack/_internal/server/statics/assets/apple-touch-startup-image-2732x2048.png +0 -0
  597. dstack/_internal/server/statics/assets/apple-touch-startup-image-2778x1284.png +0 -0
  598. dstack/_internal/server/statics/assets/apple-touch-startup-image-2796x1290.png +0 -0
  599. dstack/_internal/server/statics/assets/apple-touch-startup-image-640x1136.png +0 -0
  600. dstack/_internal/server/statics/assets/apple-touch-startup-image-750x1334.png +0 -0
  601. dstack/_internal/server/statics/assets/apple-touch-startup-image-828x1792.png +0 -0
  602. dstack/_internal/server/statics/assets/browserconfig.xml +12 -0
  603. dstack/_internal/server/statics/assets/favicon-16x16.png +0 -0
  604. dstack/_internal/server/statics/assets/favicon-32x32.png +0 -0
  605. dstack/_internal/server/statics/assets/favicon-48x48.png +0 -0
  606. dstack/_internal/server/statics/assets/favicon.ico +0 -0
  607. dstack/{dashboard/statics/assets/manifest.json → _internal/server/statics/assets/manifest.webmanifest} +18 -9
  608. dstack/_internal/server/statics/assets/mstile-144x144.png +0 -0
  609. dstack/_internal/server/statics/assets/mstile-150x150.png +0 -0
  610. dstack/_internal/server/statics/assets/mstile-310x150.png +0 -0
  611. dstack/_internal/server/statics/assets/mstile-310x310.png +0 -0
  612. dstack/_internal/server/statics/assets/mstile-70x70.png +0 -0
  613. dstack/_internal/server/statics/assets/yandex-browser-50x50.png +0 -0
  614. dstack/_internal/server/statics/b7ae68f44193474fc578.png +0 -0
  615. dstack/_internal/server/statics/d2f008c75b2b5b191f3f.png +0 -0
  616. dstack/_internal/server/statics/d44c33e1b92e05c379fd.png +0 -0
  617. dstack/_internal/server/statics/dd43ff0552815179d7ab.png +0 -0
  618. dstack/_internal/server/statics/dd4e7166c0b9aac197d7.png +0 -0
  619. dstack/_internal/server/statics/e30b27916930d43d2271.png +0 -0
  620. dstack/_internal/server/statics/e467d7d60aae81ab198b.svg +6 -0
  621. dstack/_internal/server/statics/eb9b344b73818fe2b71a.png +0 -0
  622. dstack/_internal/server/statics/f517dd626eb964120de0.png +0 -0
  623. dstack/_internal/server/statics/f958aecddee5d8e3222c.png +0 -0
  624. dstack/_internal/server/statics/index.html +3 -0
  625. dstack/_internal/server/statics/logo-notext.svg +116 -0
  626. dstack/_internal/server/statics/main-2e6967bad9f29395eea6.css +3 -0
  627. dstack/_internal/server/statics/main-7dc0f6d20b8b41659acc.js +155547 -0
  628. dstack/_internal/server/statics/main-7dc0f6d20b8b41659acc.js.map +1 -0
  629. dstack/{dashboard → _internal/server}/statics/manifest.json +2 -2
  630. dstack/_internal/server/statics/static/media/entraID.d65d1f3e9486a8e56d24fc07b3230885.svg +9 -0
  631. dstack/_internal/server/statics/static/media/google.b194b06fafd0a52aeb566922160ea514.svg +1 -0
  632. dstack/{dashboard/statics/static/media/logo.f9d7170678f68f796e270698633770ec.svg → _internal/server/statics/static/media/logo.f602feeb138844eda97c8cb641461448.svg} +8 -6
  633. dstack/_internal/server/statics/static/media/okta.12f178e6873a1100965f2a4dbd18fcec.svg +2 -0
  634. dstack/_internal/server/statics/static/media/theme.3994c817bb7dda191c1c9640dee0bf42.svg +3 -0
  635. dstack/_internal/server/testing/__init__.py +0 -0
  636. dstack/_internal/server/testing/common.py +1220 -0
  637. dstack/_internal/server/testing/conf.py +53 -0
  638. dstack/_internal/server/testing/matchers.py +31 -0
  639. dstack/_internal/server/utils/__init__.py +0 -0
  640. dstack/_internal/server/utils/common.py +55 -0
  641. dstack/_internal/server/utils/logging.py +51 -0
  642. dstack/_internal/server/utils/provisioning.py +368 -0
  643. dstack/_internal/server/utils/routers.py +166 -0
  644. dstack/_internal/server/utils/sentry_utils.py +24 -0
  645. dstack/_internal/settings.py +49 -0
  646. dstack/_internal/utils/__init__.py +0 -0
  647. dstack/_internal/utils/common.py +318 -0
  648. dstack/_internal/utils/cron.py +5 -0
  649. dstack/_internal/utils/crypto.py +40 -0
  650. dstack/_internal/utils/env.py +88 -0
  651. dstack/_internal/utils/event_loop.py +30 -0
  652. dstack/_internal/utils/files.py +69 -0
  653. dstack/_internal/utils/gpu.py +59 -0
  654. dstack/_internal/utils/hash.py +31 -0
  655. dstack/_internal/utils/interpolator.py +91 -0
  656. dstack/_internal/utils/json_schema.py +11 -0
  657. dstack/_internal/utils/json_utils.py +54 -0
  658. dstack/_internal/utils/logging.py +5 -0
  659. dstack/_internal/utils/nested_list.py +47 -0
  660. dstack/_internal/utils/network.py +50 -0
  661. dstack/_internal/utils/path.py +57 -0
  662. dstack/_internal/utils/random_names.py +258 -0
  663. dstack/_internal/utils/ssh.py +346 -0
  664. dstack/_internal/utils/tags.py +42 -0
  665. dstack/_internal/utils/typing.py +14 -0
  666. dstack/_internal/utils/version.py +22 -0
  667. dstack/api/__init__.py +46 -0
  668. dstack/api/_public/__init__.py +96 -0
  669. dstack/api/_public/backends.py +42 -0
  670. dstack/api/_public/common.py +5 -0
  671. dstack/api/_public/repos.py +202 -0
  672. dstack/api/_public/runs.py +714 -0
  673. dstack/api/server/__init__.py +206 -0
  674. dstack/api/server/_auth.py +30 -0
  675. dstack/api/server/_backends.py +38 -0
  676. dstack/api/server/_events.py +64 -0
  677. dstack/api/server/_files.py +18 -0
  678. dstack/api/server/_fleets.py +82 -0
  679. dstack/api/server/_gateways.py +54 -0
  680. dstack/api/server/_gpus.py +27 -0
  681. dstack/api/server/_group.py +22 -0
  682. dstack/api/server/_logs.py +15 -0
  683. dstack/api/server/_metrics.py +23 -0
  684. dstack/api/server/_projects.py +124 -0
  685. dstack/api/server/_repos.py +64 -0
  686. dstack/api/server/_runs.py +102 -0
  687. dstack/api/server/_secrets.py +36 -0
  688. dstack/api/server/_users.py +82 -0
  689. dstack/api/server/_volumes.py +39 -0
  690. dstack/api/server/utils.py +34 -0
  691. dstack/api/utils.py +105 -0
  692. dstack/core/__init__.py +0 -0
  693. dstack/plugins/__init__.py +8 -0
  694. dstack/plugins/_base.py +72 -0
  695. dstack/plugins/_models.py +8 -0
  696. dstack/plugins/_utils.py +19 -0
  697. dstack/plugins/builtin/__init__.py +0 -0
  698. dstack/plugins/builtin/rest_plugin/__init__.py +18 -0
  699. dstack/plugins/builtin/rest_plugin/_models.py +48 -0
  700. dstack/plugins/builtin/rest_plugin/_plugin.py +147 -0
  701. dstack/version.py +3 -1
  702. dstack-0.20.7.dist-info/METADATA +519 -0
  703. dstack-0.20.7.dist-info/RECORD +720 -0
  704. {dstack-0.0.9.dist-info → dstack-0.20.7.dist-info}/WHEEL +1 -2
  705. dstack-0.20.7.dist-info/entry_points.txt +2 -0
  706. dstack-0.20.7.dist-info/licenses/LICENSE.md +353 -0
  707. dstack/aws/__init__.py +0 -180
  708. dstack/aws/artifacts.py +0 -111
  709. dstack/aws/config.py +0 -40
  710. dstack/aws/jobs.py +0 -245
  711. dstack/aws/logs.py +0 -186
  712. dstack/aws/repos.py +0 -137
  713. dstack/aws/run_names.py +0 -17
  714. dstack/aws/runners.py +0 -693
  715. dstack/aws/runs.py +0 -79
  716. dstack/aws/secrets.py +0 -99
  717. dstack/aws/tags.py +0 -138
  718. dstack/backend.py +0 -299
  719. dstack/cli/app.py +0 -41
  720. dstack/cli/artifacts.py +0 -87
  721. dstack/cli/common.py +0 -57
  722. dstack/cli/config.py +0 -194
  723. dstack/cli/dashboard.py +0 -26
  724. dstack/cli/delete.py +0 -49
  725. dstack/cli/init.py +0 -33
  726. dstack/cli/logs.py +0 -87
  727. dstack/cli/main.py +0 -81
  728. dstack/cli/restart.py +0 -43
  729. dstack/cli/run.py +0 -223
  730. dstack/cli/schema.py +0 -46
  731. dstack/cli/secrets.py +0 -97
  732. dstack/cli/status.py +0 -140
  733. dstack/cli/stop.py +0 -53
  734. dstack/cli/tags.py +0 -100
  735. dstack/config.py +0 -80
  736. dstack/dashboard/artifacts.py +0 -26
  737. dstack/dashboard/logs.py +0 -73
  738. dstack/dashboard/main.py +0 -45
  739. dstack/dashboard/repos.py +0 -41
  740. dstack/dashboard/runs.py +0 -140
  741. dstack/dashboard/secrets.py +0 -53
  742. dstack/dashboard/statics/4d6a4e032505c1efd23c.png +0 -0
  743. dstack/dashboard/statics/7e018c3e5566d7c349a8.png +0 -0
  744. dstack/dashboard/statics/assets/android-chrome-144x144.png +0 -0
  745. dstack/dashboard/statics/assets/android-chrome-192x192.png +0 -0
  746. dstack/dashboard/statics/assets/android-chrome-256x256.png +0 -0
  747. dstack/dashboard/statics/assets/android-chrome-36x36.png +0 -0
  748. dstack/dashboard/statics/assets/android-chrome-384x384.png +0 -0
  749. dstack/dashboard/statics/assets/android-chrome-48x48.png +0 -0
  750. dstack/dashboard/statics/assets/android-chrome-512x512.png +0 -0
  751. dstack/dashboard/statics/assets/android-chrome-72x72.png +0 -0
  752. dstack/dashboard/statics/assets/android-chrome-96x96.png +0 -0
  753. dstack/dashboard/statics/assets/apple-touch-icon-1024x1024.png +0 -0
  754. dstack/dashboard/statics/assets/apple-touch-icon-114x114.png +0 -0
  755. dstack/dashboard/statics/assets/apple-touch-icon-120x120.png +0 -0
  756. dstack/dashboard/statics/assets/apple-touch-icon-144x144.png +0 -0
  757. dstack/dashboard/statics/assets/apple-touch-icon-152x152.png +0 -0
  758. dstack/dashboard/statics/assets/apple-touch-icon-167x167.png +0 -0
  759. dstack/dashboard/statics/assets/apple-touch-icon-180x180.png +0 -0
  760. dstack/dashboard/statics/assets/apple-touch-icon-57x57.png +0 -0
  761. dstack/dashboard/statics/assets/apple-touch-icon-60x60.png +0 -0
  762. dstack/dashboard/statics/assets/apple-touch-icon-72x72.png +0 -0
  763. dstack/dashboard/statics/assets/apple-touch-icon-76x76.png +0 -0
  764. dstack/dashboard/statics/assets/apple-touch-icon-precomposed.png +0 -0
  765. dstack/dashboard/statics/assets/apple-touch-icon.png +0 -0
  766. dstack/dashboard/statics/assets/apple-touch-startup-image-1125x2436.png +0 -0
  767. dstack/dashboard/statics/assets/apple-touch-startup-image-1136x640.png +0 -0
  768. dstack/dashboard/statics/assets/apple-touch-startup-image-1242x2208.png +0 -0
  769. dstack/dashboard/statics/assets/apple-touch-startup-image-1242x2688.png +0 -0
  770. dstack/dashboard/statics/assets/apple-touch-startup-image-1334x750.png +0 -0
  771. dstack/dashboard/statics/assets/apple-touch-startup-image-1536x2048.png +0 -0
  772. dstack/dashboard/statics/assets/apple-touch-startup-image-1620x2160.png +0 -0
  773. dstack/dashboard/statics/assets/apple-touch-startup-image-1668x2224.png +0 -0
  774. dstack/dashboard/statics/assets/apple-touch-startup-image-1668x2388.png +0 -0
  775. dstack/dashboard/statics/assets/apple-touch-startup-image-1792x828.png +0 -0
  776. dstack/dashboard/statics/assets/apple-touch-startup-image-2048x1536.png +0 -0
  777. dstack/dashboard/statics/assets/apple-touch-startup-image-2048x2732.png +0 -0
  778. dstack/dashboard/statics/assets/apple-touch-startup-image-2160x1620.png +0 -0
  779. dstack/dashboard/statics/assets/apple-touch-startup-image-2208x1242.png +0 -0
  780. dstack/dashboard/statics/assets/apple-touch-startup-image-2224x1668.png +0 -0
  781. dstack/dashboard/statics/assets/apple-touch-startup-image-2388x1668.png +0 -0
  782. dstack/dashboard/statics/assets/apple-touch-startup-image-2436x1125.png +0 -0
  783. dstack/dashboard/statics/assets/apple-touch-startup-image-2688x1242.png +0 -0
  784. dstack/dashboard/statics/assets/apple-touch-startup-image-2732x2048.png +0 -0
  785. dstack/dashboard/statics/assets/apple-touch-startup-image-640x1136.png +0 -0
  786. dstack/dashboard/statics/assets/apple-touch-startup-image-750x1334.png +0 -0
  787. dstack/dashboard/statics/assets/apple-touch-startup-image-828x1792.png +0 -0
  788. dstack/dashboard/statics/assets/browserconfig.xml +0 -15
  789. dstack/dashboard/statics/assets/coast-228x228.png +0 -0
  790. dstack/dashboard/statics/assets/favicon-16x16.png +0 -0
  791. dstack/dashboard/statics/assets/favicon-32x32.png +0 -0
  792. dstack/dashboard/statics/assets/favicon-48x48.png +0 -0
  793. dstack/dashboard/statics/assets/favicon.ico +0 -0
  794. dstack/dashboard/statics/assets/firefox_app_128x128.png +0 -0
  795. dstack/dashboard/statics/assets/firefox_app_512x512.png +0 -0
  796. dstack/dashboard/statics/assets/firefox_app_60x60.png +0 -0
  797. dstack/dashboard/statics/assets/manifest.webapp +0 -14
  798. dstack/dashboard/statics/assets/mstile-144x144.png +0 -0
  799. dstack/dashboard/statics/assets/mstile-150x150.png +0 -0
  800. dstack/dashboard/statics/assets/mstile-310x150.png +0 -0
  801. dstack/dashboard/statics/assets/mstile-310x310.png +0 -0
  802. dstack/dashboard/statics/assets/mstile-70x70.png +0 -0
  803. dstack/dashboard/statics/assets/yandex-browser-50x50.png +0 -0
  804. dstack/dashboard/statics/d0f71e48806e25d72553.png +0 -0
  805. dstack/dashboard/statics/index.html +0 -7
  806. dstack/dashboard/statics/main-1d87e34eb0454da8ebb4.js +0 -3
  807. dstack/dashboard/statics/main-1d87e34eb0454da8ebb4.js.LICENSE.txt +0 -102
  808. dstack/dashboard/statics/main-1d87e34eb0454da8ebb4.js.map +0 -1
  809. dstack/dashboard/statics/main.css +0 -5058
  810. dstack/dashboard/statics/splash_thumbnail.png +0 -0
  811. dstack/dashboard/statics/static/media/check.3f68ffc787a15c0476793a6d18ecb71a.svg +0 -3
  812. dstack/dashboard/statics/static/media/chevron-down.bfd8f22c4a5db4d443e76bca3b02f334.svg +0 -3
  813. dstack/dashboard/statics/static/media/chevron-up.bade0c5d82d741cead615813264140c9.svg +0 -3
  814. dstack/dashboard/statics/static/media/clock.583b744f29b9d143718a55e7c35fe38e.svg +0 -3
  815. dstack/dashboard/statics/static/media/close.a8bb9e47361b03a3b5084dad676ba1da.svg +0 -3
  816. dstack/dashboard/statics/static/media/content-copy.73f5f2a175094757758e315243a4111e.svg +0 -3
  817. dstack/dashboard/statics/static/media/delete-outline.6a8abf4e4f9cb777781967efd56efe9b.svg +0 -3
  818. dstack/dashboard/statics/static/media/dots-vertical.82fc618192e0c7dc4d615ff93269246a.svg +0 -3
  819. dstack/dashboard/statics/static/media/earth.1ad57c7f59f4be5c8bb2fa00439c3149.svg +0 -3
  820. dstack/dashboard/statics/static/media/email.320bc3af24a5f1bb41ebd85f66a5dd70.svg +0 -3
  821. dstack/dashboard/statics/static/media/external-link.99b88e699c15afb820a1779d9a2261ed.svg +0 -3
  822. dstack/dashboard/statics/static/media/eye-off-outline.5b4afb7ad624a44dd307518ff93d1faa.svg +0 -3
  823. dstack/dashboard/statics/static/media/eye-outline.ca41708feaaed1edb15c5fff021fbafe.svg +0 -3
  824. dstack/dashboard/statics/static/media/file-download-outline.3634b41923ba79b297ff294ef898661c.svg +0 -3
  825. dstack/dashboard/statics/static/media/folder-outline.33378387af61821dd1207e4b2d061a07.svg +0 -3
  826. dstack/dashboard/statics/static/media/github-circle.1bb85d171c31a3c2eebad07319377171.svg +0 -3
  827. dstack/dashboard/statics/static/media/infinity.915f92939afc0a37f94adba211ceb172.svg +0 -3
  828. dstack/dashboard/statics/static/media/layers.b4b02cea267a617d7aa44c2719250c89.svg +0 -3
  829. dstack/dashboard/statics/static/media/linkedin.1c52fae553eee54397f0e63a79455a5e.svg +0 -3
  830. dstack/dashboard/statics/static/media/loading.e466be7b2c1f0ac9e7e51ca929d0e37d.svg +0 -3
  831. dstack/dashboard/statics/static/media/lock.4a4c7768d0fa60c716609ddc483470ef.svg +0 -3
  832. dstack/dashboard/statics/static/media/magnify.0c803314d039d21f3cb1504ccd1437a4.svg +0 -3
  833. dstack/dashboard/statics/static/media/mark.3f68ffc787a15c0476793a6d18ecb71a.svg +0 -3
  834. dstack/dashboard/statics/static/media/menu-close.3ee84714181017c6ff837830297c8437.svg +0 -3
  835. dstack/dashboard/statics/static/media/menu.922f81e0972fbcbb5adcd8def20c86a3.svg +0 -3
  836. dstack/dashboard/statics/static/media/pencil.f706a3b9dcbff4959a91bf72e1e6324f.svg +0 -3
  837. dstack/dashboard/statics/static/media/refresh.a80edb948e98b322cd73b67814a57a48.svg +0 -3
  838. dstack/dashboard/statics/static/media/shape-plus.63b093c7f4b44c3def774f30fcfbceca.svg +0 -3
  839. dstack/dashboard/statics/static/media/slack.ec2fca99c6b944950ac65404ddd26880.svg +0 -4
  840. dstack/dashboard/statics/static/media/small-logo.b9cc8d09f646a553e65fa336dafd8b10.svg +0 -116
  841. dstack/dashboard/statics/static/media/source-branch.b8d22cfc42a7bed81f0fc08130818e85.svg +0 -3
  842. dstack/dashboard/statics/static/media/source-commit.be2bb53c081b9b6836adffccc0b8d3e6.svg +0 -3
  843. dstack/dashboard/statics/static/media/stop.11488ff1437ad929476be8924a3b7075.svg +0 -3
  844. dstack/dashboard/statics/static/media/tag-minus.15680a815b0b8d027e973c84832c05e6.svg +0 -3
  845. dstack/dashboard/statics/static/media/tag-outline.19b0bf86a8afd7d6d9c716e9a91d94ca.svg +0 -3
  846. dstack/dashboard/statics/static/media/twitter.4af18861c84a2f3044c7546b55d5739c.svg +0 -3
  847. dstack/dashboard/tags.py +0 -119
  848. dstack/jobs.py +0 -255
  849. dstack/providers/__init__.py +0 -316
  850. dstack/providers/_python/main.py +0 -88
  851. dstack/providers/_tensorboard/main.py +0 -93
  852. dstack/providers/_torchrun/main.py +0 -121
  853. dstack/providers/bash/main.py +0 -90
  854. dstack/providers/code/main.py +0 -95
  855. dstack/providers/docker/main.py +0 -79
  856. dstack/providers/lab/main.py +0 -95
  857. dstack/providers/notebook/main.py +0 -90
  858. dstack/random_name.py +0 -29
  859. dstack/repo.py +0 -135
  860. dstack/runners.py +0 -35
  861. dstack/util.py +0 -15
  862. dstack-0.0.9.dist-info/METADATA +0 -176
  863. dstack-0.0.9.dist-info/RECORD +0 -179
  864. dstack-0.0.9.dist-info/entry_points.txt +0 -3
  865. dstack-0.0.9.dist-info/top_level.txt +0 -2
  866. tests/test_config.py +0 -70
  867. /dstack/{cli → _internal}/__init__.py +0 -0
  868. /dstack/{dashboard → _internal/cli}/__init__.py +0 -0
  869. /dstack/{providers/_python → _internal/cli/models}/__init__.py +0 -0
  870. /dstack/{providers/_tensorboard → _internal/cli/services}/__init__.py +0 -0
  871. /dstack/{providers/_torchrun → _internal/cli/utils}/__init__.py +0 -0
  872. /dstack/{providers/bash → _internal/core}/__init__.py +0 -0
  873. /dstack/{providers/code → _internal/core/backends}/__init__.py +0 -0
  874. /dstack/{providers/docker → _internal/core/backends/aws}/__init__.py +0 -0
  875. /dstack/{providers/lab → _internal/core/backends/azure}/__init__.py +0 -0
  876. /dstack/{providers/notebook → _internal/core/backends/base}/__init__.py +0 -0
  877. {tests → dstack/_internal/core/backends/cloudrift}/__init__.py +0 -0
  878. /dstack/{dashboard → _internal/server}/statics/assets/yandex-browser-manifest.json +0 -0
  879. /dstack/{dashboard → _internal/server}/statics/robots.txt +0 -0
@@ -0,0 +1,1186 @@
1
+ import asyncio
2
+ import datetime
3
+ import logging
4
+ from datetime import timedelta
5
+ from typing import Any, Dict, Optional, cast
6
+
7
+ import gpuhunt
8
+ import requests
9
+ from paramiko.pkey import PKey
10
+ from paramiko.ssh_exception import PasswordRequiredException
11
+ from pydantic import ValidationError
12
+ from sqlalchemy import and_, delete, func, not_, select
13
+ from sqlalchemy.ext.asyncio import AsyncSession
14
+ from sqlalchemy.orm import joinedload
15
+
16
+ from dstack._internal import settings
17
+ from dstack._internal.core.backends.base.compute import (
18
+ ComputeWithCreateInstanceSupport,
19
+ ComputeWithPlacementGroupSupport,
20
+ GoArchType,
21
+ get_dstack_runner_binary_path,
22
+ get_dstack_runner_download_url,
23
+ get_dstack_runner_version,
24
+ get_dstack_shim_binary_path,
25
+ get_dstack_shim_download_url,
26
+ get_dstack_shim_version,
27
+ get_dstack_working_dir,
28
+ get_shim_env,
29
+ get_shim_pre_start_commands,
30
+ )
31
+ from dstack._internal.core.backends.features import (
32
+ BACKENDS_WITH_CREATE_INSTANCE_SUPPORT,
33
+ BACKENDS_WITH_PLACEMENT_GROUPS_SUPPORT,
34
+ )
35
+ from dstack._internal.core.consts import DSTACK_SHIM_HTTP_PORT
36
+
37
+ # FIXME: ProvisioningError is a subclass of ComputeError and should not be used outside of Compute
38
+ from dstack._internal.core.errors import (
39
+ BackendError,
40
+ NotYetTerminated,
41
+ ProvisioningError,
42
+ )
43
+ from dstack._internal.core.models.backends.base import BackendType
44
+ from dstack._internal.core.models.fleets import InstanceGroupPlacement
45
+ from dstack._internal.core.models.instances import (
46
+ HealthStatus,
47
+ InstanceAvailability,
48
+ InstanceOfferWithAvailability,
49
+ InstanceRuntime,
50
+ InstanceStatus,
51
+ InstanceTerminationReason,
52
+ RemoteConnectionInfo,
53
+ SSHKey,
54
+ )
55
+ from dstack._internal.core.models.profiles import (
56
+ TerminationPolicy,
57
+ )
58
+ from dstack._internal.core.models.runs import (
59
+ JobProvisioningData,
60
+ )
61
+ from dstack._internal.server import settings as server_settings
62
+ from dstack._internal.server.background.tasks.common import get_provisioning_timeout
63
+ from dstack._internal.server.db import get_db, get_session_ctx
64
+ from dstack._internal.server.models import (
65
+ FleetModel,
66
+ InstanceHealthCheckModel,
67
+ InstanceModel,
68
+ JobModel,
69
+ ProjectModel,
70
+ )
71
+ from dstack._internal.server.schemas.instances import InstanceCheck
72
+ from dstack._internal.server.schemas.runner import (
73
+ ComponentInfo,
74
+ ComponentStatus,
75
+ HealthcheckResponse,
76
+ InstanceHealthResponse,
77
+ )
78
+ from dstack._internal.server.services import backends as backends_services
79
+ from dstack._internal.server.services import events
80
+ from dstack._internal.server.services.fleets import (
81
+ fleet_model_to_fleet,
82
+ get_create_instance_offers,
83
+ is_cloud_cluster,
84
+ )
85
+ from dstack._internal.server.services.instances import (
86
+ get_instance_configuration,
87
+ get_instance_profile,
88
+ get_instance_provisioning_data,
89
+ get_instance_requirements,
90
+ get_instance_ssh_private_keys,
91
+ remove_dangling_tasks_from_instance,
92
+ switch_instance_status,
93
+ )
94
+ from dstack._internal.server.services.locking import get_locker
95
+ from dstack._internal.server.services.logging import fmt
96
+ from dstack._internal.server.services.offers import (
97
+ get_instance_offer_with_restricted_az,
98
+ is_divisible_into_blocks,
99
+ )
100
+ from dstack._internal.server.services.placement import (
101
+ find_or_create_suitable_placement_group,
102
+ get_fleet_placement_group_models,
103
+ get_placement_group_model_for_instance,
104
+ placement_group_model_to_placement_group_optional,
105
+ schedule_fleet_placement_groups_deletion,
106
+ )
107
+ from dstack._internal.server.services.runner import client as runner_client
108
+ from dstack._internal.server.services.runner.ssh import runner_ssh_tunnel
109
+ from dstack._internal.server.utils import sentry_utils
110
+ from dstack._internal.server.utils.provisioning import (
111
+ detect_cpu_arch,
112
+ get_host_info,
113
+ get_paramiko_connection,
114
+ get_shim_healthcheck,
115
+ host_info_to_instance_type,
116
+ remove_dstack_runner_if_exists,
117
+ remove_host_info_if_exists,
118
+ run_pre_start_commands,
119
+ run_shim_as_systemd_service,
120
+ upload_envs,
121
+ )
122
+ from dstack._internal.utils.common import (
123
+ get_current_datetime,
124
+ get_or_error,
125
+ run_async,
126
+ )
127
+ from dstack._internal.utils.logging import get_logger
128
+ from dstack._internal.utils.network import get_ip_from_network, is_ip_among_addresses
129
+ from dstack._internal.utils.ssh import (
130
+ pkey_from_str,
131
+ )
132
+
133
# An instance is picked for processing only if it was last processed
# longer ago than this interval.
MIN_PROCESSING_INTERVAL = timedelta(seconds=10)

# NOTE(review): not referenced in the visible part of this module — confirm usage.
PENDING_JOB_RETRY_INTERVAL = timedelta(minutes=1)

# Added to the current time to set the termination deadline of an instance
# that failed its reachability check.
TERMINATION_DEADLINE_OFFSET = timedelta(minutes=20)
# NOTE(review): presumably used by the termination retry logic further down
# in this module — confirm.
TERMINATION_RETRY_TIMEOUT = timedelta(seconds=30)
TERMINATION_RETRY_MAX_DURATION = timedelta(minutes=15)
# Time budget, counted from instance creation, for adding an SSH instance.
PROVISIONING_TIMEOUT_SECONDS = 10 * 60  # 10 minutes in seconds
141
+
142
+
143
+ logger = get_logger(__name__)
144
+
145
+
146
async def process_instances(batch_size: int = 1):
    """Run ``batch_size`` instance-processing passes concurrently.

    Each pass is a separate ``_process_next_instance()`` coroutine; this
    coroutine completes once all of them have finished.
    """
    await asyncio.gather(*(_process_next_instance() for _ in range(batch_size)))
151
+
152
+
153
@sentry_utils.instrument_background_task
async def delete_instance_health_checks():
    """Delete health check records collected before the TTL cutoff.

    The cutoff is the current time minus
    ``server_settings.SERVER_INSTANCE_HEALTH_TTL_SECONDS``.
    """
    cutoff = get_current_datetime() - timedelta(
        seconds=server_settings.SERVER_INSTANCE_HEALTH_TTL_SECONDS
    )
    expired_checks = delete(InstanceHealthCheckModel).where(
        InstanceHealthCheckModel.collected_at < cutoff
    )
    async with get_session_ctx() as session:
        await session.execute(expired_checks)
        await session.commit()
162
+
163
+
164
@sentry_utils.instrument_background_task
async def _process_next_instance():
    """Select one instance that is due for processing and process it.

    The candidate is the least recently processed instance in a processable
    status that is not already in the in-process lockset. Its id is added to
    the lockset for the duration of processing and always removed afterwards.
    """
    lock, lockset = get_locker(get_db().dialect_name).get_lockset(InstanceModel.__tablename__)
    async with get_session_ctx() as session:
        # The lock only covers choosing the instance and registering its id.
        async with lock:
            processable_statuses = [
                InstanceStatus.PENDING,
                InstanceStatus.PROVISIONING,
                InstanceStatus.BUSY,
                InstanceStatus.IDLE,
                InstanceStatus.TERMINATING,
            ]
            # Terminating instances belonging to a compute group
            # are handled by process_compute_groups.
            terminating_in_compute_group = and_(
                InstanceModel.status == InstanceStatus.TERMINATING,
                InstanceModel.compute_group_id.is_not(None),
            )
            query = (
                select(InstanceModel)
                .where(
                    InstanceModel.status.in_(processable_statuses),
                    not_(terminating_in_compute_group),
                    InstanceModel.id.not_in(lockset),
                    InstanceModel.last_processed_at
                    < get_current_datetime() - MIN_PROCESSING_INTERVAL,
                )
                .options(joinedload(InstanceModel.jobs).load_only(JobModel.id, JobModel.status))
                .options(joinedload(InstanceModel.project).load_only(ProjectModel.ssh_private_key))
                .order_by(InstanceModel.last_processed_at.asc())
                .limit(1)
                .with_for_update(skip_locked=True, key_share=True, of=InstanceModel)
            )
            result = await session.execute(query)
            instance = result.scalar()
            if instance is None:
                # Nothing is due for processing right now.
                return
            lockset.add(instance.id)
        # Remember the id up front so the cleanup does not depend on the
        # state of the ORM object after processing.
        locked_instance_id = instance.id
        try:
            await _process_instance(session=session, instance=instance)
        finally:
            lockset.difference_update([locked_instance_id])
208
+
209
+
210
async def _process_instance(session: AsyncSession, instance: InstanceModel):
    """Advance a single instance according to its current status.

    PENDING instances are added over SSH or created in a backend,
    PROVISIONING/IDLE/BUSY instances are checked (IDLE ones may instead be
    marked for termination on idle timeout), and TERMINATING instances are
    terminated. ``last_processed_at`` is updated and the session committed
    at the end.
    """
    logger.debug("%s: processing instance, status: %s", fmt(instance), instance.status.upper())

    # Refetch to load related attributes.
    # Load related attributes only for statuses that always need them.
    if instance.status in (InstanceStatus.PENDING, InstanceStatus.TERMINATING):
        # These statuses also need the project's backends.
        project_loading = joinedload(InstanceModel.project).joinedload(ProjectModel.backends)
    elif instance.status == InstanceStatus.IDLE:
        project_loading = joinedload(InstanceModel.project)
    else:
        project_loading = None

    if project_loading is not None:
        refetch_query = (
            select(InstanceModel)
            .where(InstanceModel.id == instance.id)
            .options(project_loading)
            .options(joinedload(InstanceModel.jobs).load_only(JobModel.id, JobModel.status))
            .options(
                joinedload(InstanceModel.fleet).joinedload(
                    FleetModel.instances.and_(InstanceModel.deleted == False)
                ),
            )
            .execution_options(populate_existing=True)
        )
        res = await session.execute(refetch_query)
        instance = res.unique().scalar_one()

    if instance.status == InstanceStatus.PENDING:
        # A pending instance with remote connection info is an SSH instance.
        if instance.remote_connection_info is None:
            await _create_instance(
                session=session,
                instance=instance,
            )
        else:
            await _add_remote(session, instance)
    elif instance.status in (
        InstanceStatus.PROVISIONING,
        InstanceStatus.IDLE,
        InstanceStatus.BUSY,
    ):
        marked_terminating = _check_and_mark_terminating_if_idle_duration_expired(
            session, instance
        )
        if not marked_terminating:
            await _check_instance(session, instance)
    elif instance.status == InstanceStatus.TERMINATING:
        await _terminate(session, instance)

    instance.last_processed_at = get_current_datetime()
    await session.commit()
269
+
270
+
271
def _check_and_mark_terminating_if_idle_duration_expired(
    session: AsyncSession, instance: InstanceModel
) -> bool:
    """Mark an idle instance as TERMINATING once its idle time limit is exceeded.

    Only applies to IDLE instances with the ``DESTROY_AFTER_IDLE`` termination
    policy and no jobs. Instances of a fleet that may not shrink further are
    left alone.

    Returns:
        True if the instance was switched to TERMINATING, False otherwise.
    """
    eligible = (
        instance.status == InstanceStatus.IDLE
        and instance.termination_policy == TerminationPolicy.DESTROY_AFTER_IDLE
        and not instance.jobs
    )
    if not eligible:
        return False
    if instance.fleet is not None and not _can_terminate_fleet_instances_on_idle_duration(
        instance.fleet
    ):
        logger.debug(
            "Skipping instance %s termination on idle duration. Fleet is already at `nodes.min`.",
            instance.name,
        )
        return False
    idle_duration = _get_instance_idle_duration(instance)
    max_idle_duration = timedelta(seconds=instance.termination_idle_time)
    if idle_duration <= max_idle_duration:
        return False
    instance.termination_reason = InstanceTerminationReason.IDLE_TIMEOUT
    # `timedelta.seconds` is only the sub-day component and drops whole days;
    # `total_seconds()` reports the full idle time.
    idle_seconds = int(idle_duration.total_seconds())
    instance.termination_reason_message = f"Instance idle for {idle_seconds}s"
    switch_instance_status(session, instance, InstanceStatus.TERMINATING)
    return True
297
+
298
+
299
def _can_terminate_fleet_instances_on_idle_duration(fleet_model: FleetModel) -> bool:
    """Tell whether idle-timeout termination is allowed for this fleet's instances.

    Returns True when the fleet has no ``nodes`` configuration, is
    autocreated, or has more active instances than ``nodes.min``.
    """
    # Do not terminate instances on idle duration if fleet is already at `nodes.min`.
    # This is an optimization to avoid terminate-create loop.
    # There may be race conditions since we don't take the fleet lock.
    # That's ok: in the worst case we go below `nodes.min`, but
    # the fleet consolidation logic will provision new nodes.
    fleet = fleet_model_to_fleet(fleet_model)
    nodes = fleet.spec.configuration.nodes
    if nodes is None or fleet.spec.autocreated:
        return True
    active_count = sum(1 for inst in fleet_model.instances if inst.status.is_active())
    return active_count > nodes.min
311
+
312
+
313
async def _add_remote(session: AsyncSession, instance: InstanceModel) -> None:
    """Deploy dstack to a pending SSH instance and record the outcome.

    On success the instance becomes IDLE (or PROVISIONING when the deploy
    health check is falsy) and its offer and provisioning data are filled in.
    Unrecoverable problems switch it to TERMINATED with a reason; a
    ``ProvisioningError`` during deployment leaves it PENDING for a later retry.
    """

    def _fail(reason: InstanceTerminationReason, message: str) -> None:
        # Record why the instance is given up on and move it to TERMINATED.
        instance.termination_reason = reason
        instance.termination_reason_message = message
        switch_instance_status(session, instance, InstanceStatus.TERMINATED)

    logger.info("Adding ssh instance %s...", instance.name)

    # Give up if the instance has been pending past the provisioning timeout.
    retry_deadline = instance.created_at + timedelta(seconds=PROVISIONING_TIMEOUT_SECONDS)
    if retry_deadline < get_current_datetime():
        _fail(
            InstanceTerminationReason.PROVISIONING_TIMEOUT,
            f"Failed to add SSH instance in {PROVISIONING_TIMEOUT_SECONDS}s",
        )
        return

    try:
        remote_details = RemoteConnectionInfo.parse_raw(cast(str, instance.remote_connection_info))
        # Prepare connection key
        try:
            pkeys = _ssh_keys_to_pkeys(remote_details.ssh_keys)
            ssh_proxy_pkeys = (
                None
                if remote_details.ssh_proxy_keys is None
                else _ssh_keys_to_pkeys(remote_details.ssh_proxy_keys)
            )
        except (ValueError, PasswordRequiredException):
            _fail(InstanceTerminationReason.ERROR, "Unsupported private SSH key type")
            return

        # The host must accept both the user-provided keys and the project key.
        authorized_keys = [
            *(pk.public.strip() for pk in remote_details.ssh_keys),
            instance.project.ssh_public_key.strip(),
        ]

        try:
            deploy_future = run_async(
                _deploy_instance, remote_details, pkeys, ssh_proxy_pkeys, authorized_keys
            )
            deploy_timeout = 20 * 60  # 20 minutes
            health, host_info, arch = await asyncio.wait_for(
                deploy_future, timeout=deploy_timeout
            )
        except (asyncio.TimeoutError, TimeoutError) as e:
            raise ProvisioningError(f"Deploy timeout: {e}") from e
        except Exception as e:
            raise ProvisioningError(f"Deploy instance raised an error: {e}") from e
    except ProvisioningError as e:
        logger.warning(
            "Provisioning instance %s could not be completed because of the error: %s",
            instance.name,
            e,
        )
        # Stays in PENDING, may retry later
        return

    instance_type = host_info_to_instance_type(host_info, arch)

    # Network settings may already be stored in the instance's provisioning data;
    # if it cannot be parsed, continue without them.
    instance_network = None
    internal_ip = None
    try:
        stored_jpd = JobProvisioningData.__response__.parse_raw(instance.job_provisioning_data)
        instance_network = stored_jpd.instance_network
        internal_ip = stored_jpd.internal_ip
    except ValidationError:
        pass

    host_network_addresses = host_info.get("addresses", [])
    if internal_ip is None:
        internal_ip = get_ip_from_network(
            network=instance_network,
            addresses=host_network_addresses,
        )
    if instance_network is not None and internal_ip is None:
        _fail(
            InstanceTerminationReason.ERROR,
            "Failed to locate internal IP address on the given network",
        )
        return
    if internal_ip is not None and not is_ip_among_addresses(
        ip_address=internal_ip, addresses=host_network_addresses
    ):
        _fail(
            InstanceTerminationReason.ERROR,
            "Specified internal IP not found among instance interfaces",
        )
        return

    divisible, blocks = is_divisible_into_blocks(
        cpu_count=instance_type.resources.cpus,
        gpu_count=len(instance_type.resources.gpus),
        blocks="auto" if instance.total_blocks is None else instance.total_blocks,
    )
    if not divisible:
        _fail(InstanceTerminationReason.ERROR, "Cannot split into blocks")
        return
    instance.total_blocks = blocks

    region = instance.region
    assert region is not None  # always set for ssh instances
    jpd = JobProvisioningData(
        backend=BackendType.REMOTE,
        instance_type=instance_type,
        instance_id="instance_id",
        hostname=remote_details.host,
        region=region,
        price=0,
        internal_ip=internal_ip,
        instance_network=instance_network,
        username=remote_details.ssh_user,
        ssh_port=remote_details.port,
        dockerized=True,
        backend_data=None,
        ssh_proxy=remote_details.ssh_proxy,
    )

    next_status = InstanceStatus.IDLE if health else InstanceStatus.PROVISIONING
    switch_instance_status(session, instance, next_status)
    instance.backend = BackendType.REMOTE
    remote_offer = InstanceOfferWithAvailability(
        backend=BackendType.REMOTE,
        instance=instance_type,
        region=region,
        price=0,
        availability=InstanceAvailability.AVAILABLE,
        instance_runtime=InstanceRuntime.SHIM,
    )
    instance.price = 0
    instance.offer = remote_offer.json()
    instance.job_provisioning_data = jpd.json()
    instance.started_at = get_current_datetime()
442
+
443
+
444
def _deploy_instance(
    remote_details: RemoteConnectionInfo,
    pkeys: list[PKey],
    ssh_proxy_pkeys: Optional[list[PKey]],
    authorized_keys: list[str],
) -> tuple[InstanceCheck, dict[str, Any], GoArchType]:
    """Install and start dstack-shim on a remote host over SSH.

    Returns:
        A tuple of the instance check built from the shim healthcheck,
        the host info, and the detected CPU architecture.

    Raises:
        ProvisioningError: if the configured env is invalid or the shim
            healthcheck output cannot be parsed.
    """
    with get_paramiko_connection(
        remote_details.ssh_user,
        remote_details.host,
        remote_details.port,
        pkeys,
        remote_details.ssh_proxy,
        ssh_proxy_pkeys,
    ) as ssh_client:
        logger.info(f"Connected to {remote_details.ssh_user} {remote_details.host}")

        arch = detect_cpu_arch(ssh_client)
        logger.info("%s: CPU arch is %s", remote_details.host, arch)

        # Execute pre start commands
        pre_start_commands = get_shim_pre_start_commands(arch=arch)
        run_pre_start_commands(ssh_client, pre_start_commands, authorized_keys)
        logger.debug("The script for installing dstack has been executed")

        # Upload envs
        shim_envs = get_shim_env(arch=arch)
        try:
            configured_envs = remote_details.env.as_dict()
        except ValueError as e:
            raise ProvisioningError(f"Invalid Env: {e}") from e
        # Envs from the fleet configuration override the shim defaults.
        shim_envs.update(configured_envs)
        working_dir = get_dstack_working_dir()
        shim_binary_path = get_dstack_shim_binary_path()
        runner_binary_path = get_dstack_runner_binary_path()
        upload_envs(ssh_client, working_dir, shim_envs)
        logger.debug("The dstack-shim environment variables have been installed")

        # Ensure we have fresh versions of host info.json and dstack-runner
        remove_host_info_if_exists(ssh_client, working_dir)
        remove_dstack_runner_if_exists(ssh_client, runner_binary_path)

        # Run dstack-shim as a systemd service
        run_shim_as_systemd_service(
            client=ssh_client,
            binary_path=shim_binary_path,
            working_dir=working_dir,
            dev=settings.DSTACK_VERSION is None,
        )

        # Get host info
        host_info = get_host_info(ssh_client, working_dir)
        logger.debug("Received a host_info %s", host_info)

        raw_healthcheck = get_shim_healthcheck(ssh_client)
        try:
            healthcheck = HealthcheckResponse.__response__.parse_raw(raw_healthcheck)
        except ValueError as e:
            raise ProvisioningError(f"Cannot parse HealthcheckResponse: {e}") from e
        instance_check = runner_client.healthcheck_response_to_instance_check(healthcheck)

        return instance_check, host_info, arch
505
+
506
+
507
async def _create_instance(session: AsyncSession, instance: InstanceModel) -> None:
    """Provision a pending cloud instance by trying offers in order.

    The first offer whose backend creates the instance successfully moves the
    instance to PROVISIONING. If no offer succeeds, the instance is TERMINATED
    with ``NO_OFFERS``; when it is the master of a cloud cluster fleet, the
    other instances of the fleet are terminated too.
    """
    master_instance = await _get_fleet_master_instance(session, instance)
    if _need_to_wait_fleet_provisioning(instance, master_instance):
        logger.debug(
            "%s: waiting for the first instance in the fleet to be provisioned", fmt(instance)
        )
        return
    is_master = instance.id == master_instance.id

    try:
        instance_configuration = get_instance_configuration(instance)
        profile = get_instance_profile(instance)
        requirements = get_instance_requirements(instance)
    except ValidationError as e:
        instance.termination_reason = InstanceTerminationReason.ERROR
        instance.termination_reason_message = (
            f"Error to parse profile, requirements or instance_configuration: {e}"
        )
        switch_instance_status(session, instance, InstanceStatus.TERMINATED)
        logger.exception(
            "%s: error parsing profile, requirements or instance configuration", fmt(instance)
        )
        return

    # The placement group is determined when provisioning the master instance
    # and used for all other instances in the fleet.
    placement_group_models = await get_fleet_placement_group_models(
        session=session,
        fleet_id=instance.fleet_id,
    )
    placement_group_model = get_placement_group_model_for_instance(
        placement_group_models=placement_group_models,
        instance_model=instance,
        master_instance_model=master_instance,
    )
    offers = await get_create_instance_offers(
        project=instance.project,
        profile=profile,
        requirements=requirements,
        fleet_model=instance.fleet,
        placement_group=placement_group_model_to_placement_group_optional(placement_group_model),
        blocks="auto" if instance.total_blocks is None else instance.total_blocks,
        exclude_not_available=True,
    )

    # Limit number of offers tried to prevent long-running processing
    # in case all offers fail.
    offers_to_try = offers[: server_settings.MAX_OFFERS_TRIED]
    for backend, instance_offer in offers_to_try:
        if instance_offer.backend not in BACKENDS_WITH_CREATE_INSTANCE_SUPPORT:
            continue
        compute = backend.compute()
        assert isinstance(compute, ComputeWithCreateInstanceSupport)
        instance_offer = _get_instance_offer_for_instance(
            instance_offer=instance_offer,
            instance=instance,
            master_instance=master_instance,
        )
        # Only the master instance of a cloud cluster picks the placement group.
        if (
            instance.fleet
            and is_cloud_cluster(instance.fleet)
            and is_master
            and instance_offer.backend in BACKENDS_WITH_PLACEMENT_GROUPS_SUPPORT
            and isinstance(compute, ComputeWithPlacementGroupSupport)
            and (
                compute.are_placement_groups_compatible_with_reservations(instance_offer.backend)
                or instance_configuration.reservation is None
            )
        ):
            placement_group_model = await find_or_create_suitable_placement_group(
                fleet_model=instance.fleet,
                placement_groups=placement_group_models,
                instance_offer=instance_offer,
                compute=compute,
            )
            if placement_group_model is None:  # error occurred
                continue
            session.add(placement_group_model)
            placement_group_models.append(placement_group_model)
        logger.debug(
            "Trying %s in %s/%s for $%0.4f per hour",
            instance_offer.instance.name,
            instance_offer.backend.value,
            instance_offer.region,
            instance_offer.price,
        )
        try:
            job_provisioning_data = await run_async(
                compute.create_instance,
                instance_offer,
                instance_configuration,
                placement_group_model_to_placement_group_optional(placement_group_model),
            )
        except BackendError as e:
            logger.warning(
                "%s launch in %s/%s failed: %s",
                instance_offer.instance.name,
                instance_offer.backend.value,
                instance_offer.region,
                repr(e),
                extra={"instance_name": instance.name},
            )
            continue
        except Exception:
            logger.exception(
                "Got exception when launching %s in %s/%s",
                instance_offer.instance.name,
                instance_offer.backend.value,
                instance_offer.region,
            )
            continue

        # The offer worked: record the provisioning details and stop trying.
        switch_instance_status(session, instance, InstanceStatus.PROVISIONING)
        instance.backend = backend.TYPE
        instance.region = instance_offer.region
        instance.price = instance_offer.price
        instance.instance_configuration = instance_configuration.json()
        instance.job_provisioning_data = job_provisioning_data.json()
        instance.offer = instance_offer.json()
        instance.total_blocks = instance_offer.total_blocks
        instance.started_at = get_current_datetime()

        if instance.fleet_id and is_master:
            # Clean up placement groups that did not end up being used.
            # Flush to update still uncommitted placement groups.
            await session.flush()
            used_placement_group_ids = (
                [placement_group_model.id] if placement_group_model is not None else []
            )
            await schedule_fleet_placement_groups_deletion(
                session=session,
                fleet_id=instance.fleet_id,
                except_placement_group_ids=used_placement_group_ids,
            )
        return

    instance.termination_reason = InstanceTerminationReason.NO_OFFERS
    instance.termination_reason_message = "All offers failed" if offers else "No offers found"
    switch_instance_status(session, instance, InstanceStatus.TERMINATED)
    if instance.fleet and is_master and is_cloud_cluster(instance.fleet):
        # Do not attempt to deploy other instances, as they won't determine the correct cluster
        # backend, region, and placement group without a successfully deployed master instance
        siblings = (s for s in instance.fleet.instances if s.id != instance.id)
        for sibling_instance in siblings:
            sibling_instance.termination_reason = InstanceTerminationReason.MASTER_FAILED
            switch_instance_status(session, sibling_instance, InstanceStatus.TERMINATED)
651
+
652
+
653
async def _get_fleet_master_instance(
    session: AsyncSession, instance: InstanceModel
) -> InstanceModel:
    """Return the first instance of the given instance's fleet.

    "First" means lowest ``instance_num``, ties broken by ``created_at``.
    """
    # The "master" fleet instance is relevant for cloud clusters only:
    # it can be any fixed instance that is chosen to be provisioned first.
    master_query = (
        select(InstanceModel)
        .where(InstanceModel.fleet_id == instance.fleet_id)
        .order_by(InstanceModel.instance_num, InstanceModel.created_at)
        .limit(1)
    )
    result = await session.execute(master_query)
    return result.scalar_one()
665
+
666
+
667
async def _check_instance(session: AsyncSession, instance: InstanceModel) -> None:
    """Check a PROVISIONING, IDLE or BUSY instance and update its state.

    Handles, in order: busy instances whose jobs have all finished, instances
    still waiting for a hostname from the backend, non-dockerized instances,
    and finally the shim reachability/health check that updates health,
    reachability, termination deadline, and status.
    """
    if (
        instance.status == InstanceStatus.BUSY
        and instance.jobs
        and all(job.status.is_finished() for job in instance.jobs)
    ):
        # A busy instance could have no active jobs due to this bug: https://github.com/dstackai/dstack/issues/2068
        instance.termination_reason = InstanceTerminationReason.JOB_FINISHED
        switch_instance_status(session, instance, InstanceStatus.TERMINATING)
        logger.warning(
            "Detected busy instance %s with finished job. Marked as TERMINATING",
            instance.name,
            extra={
                "instance_name": instance.name,
                "instance_status": instance.status.value,
            },
        )
        return

    job_provisioning_data = get_or_error(get_instance_provisioning_data(instance))
    if job_provisioning_data.hostname is None:
        # No hostname yet: ask the backend for updated provisioning data.
        project_query = (
            select(ProjectModel)
            .where(ProjectModel.id == instance.project_id)
            .options(joinedload(ProjectModel.backends))
        )
        res = await session.execute(project_query)
        project = res.unique().scalar_one()
        await _wait_for_instance_provisioning_data(
            session=session,
            project=project,
            instance=instance,
            job_provisioning_data=job_provisioning_data,
        )
        return

    if not job_provisioning_data.dockerized:
        if instance.status == InstanceStatus.PROVISIONING:
            switch_instance_status(session, instance, InstanceStatus.BUSY)
        return

    ssh_private_keys = get_instance_ssh_private_keys(instance)

    # Collect health only if no health check was stored within the minimum interval.
    health_check_cutoff = get_current_datetime() - timedelta(
        seconds=server_settings.SERVER_INSTANCE_HEALTH_MIN_COLLECT_INTERVAL_SECONDS
    )
    recent_checks_query = select(func.count(1)).where(
        InstanceHealthCheckModel.instance_id == instance.id,
        InstanceHealthCheckModel.collected_at > health_check_cutoff,
    )
    res = await session.execute(recent_checks_query)
    check_instance_health = res.scalar_one() == 0

    # May return False if fails to establish ssh connection
    instance_check = await run_async(
        _check_instance_inner,
        ssh_private_keys,
        job_provisioning_data,
        None,
        instance=instance,
        check_instance_health=check_instance_health,
    )
    if instance_check is False:
        instance_check = InstanceCheck(reachable=False, message="SSH or tunnel error")

    if instance_check.reachable and check_instance_health:
        health_status = instance_check.get_health_status()
    else:
        # Keep previous health status
        health_status = instance.health

    # Warn when an available instance is unreachable or a fresh health check is not healthy.
    if (not instance_check.reachable and instance.status.is_available()) or (
        check_instance_health and not health_status.is_healthy()
    ):
        loglevel = logging.WARNING
    else:
        loglevel = logging.DEBUG
    logger.log(
        loglevel,
        "Instance %s check: reachable=%s health_status=%s message=%r",
        instance.name,
        instance_check.reachable,
        health_status.name,
        instance_check.message,
        extra={"instance_name": instance.name, "health_status": health_status},
    )

    if instance_check.has_health_checks():
        # ensured by has_health_checks()
        assert instance_check.health_response is not None
        session.add(
            InstanceHealthCheckModel(
                instance_id=instance.id,
                collected_at=get_current_datetime(),
                status=health_status,
                response=instance_check.health_response.json(),
            )
        )

    _set_health(session, instance, health_status)
    _set_unreachable(session, instance, unreachable=not instance_check.reachable)

    if instance_check.reachable:
        instance.termination_deadline = None

        if instance.status == InstanceStatus.PROVISIONING:
            ready_status = InstanceStatus.IDLE if not instance.jobs else InstanceStatus.BUSY
            switch_instance_status(session, instance, ready_status)
        return

    # The instance is unreachable from here on.
    if instance.termination_deadline is None:
        instance.termination_deadline = get_current_datetime() + TERMINATION_DEADLINE_OFFSET

    if instance.status == InstanceStatus.PROVISIONING and instance.started_at is not None:
        provisioning_deadline = _get_provisioning_deadline(
            instance=instance,
            job_provisioning_data=job_provisioning_data,
        )
        if get_current_datetime() > provisioning_deadline:
            instance.termination_reason = InstanceTerminationReason.PROVISIONING_TIMEOUT
            instance.termination_reason_message = "Instance did not become reachable in time"
            switch_instance_status(session, instance, InstanceStatus.TERMINATING)
    elif instance.status.is_available():
        unreachable_deadline = instance.termination_deadline
        if get_current_datetime() > unreachable_deadline:
            instance.termination_reason = InstanceTerminationReason.UNREACHABLE
            switch_instance_status(session, instance, InstanceStatus.TERMINATING)
795
+
796
+
797
async def _wait_for_instance_provisioning_data(
    session: AsyncSession,
    project: ProjectModel,
    instance: InstanceModel,
    job_provisioning_data: JobProvisioningData,
):
    """Ask the backend to update the provisioning data of an instance.

    The instance is switched to TERMINATING if the provisioning deadline has
    passed, the backend is unavailable, or the backend raises
    ``ProvisioningError``. Any other exception is logged and the instance is
    left unchanged.
    """

    def _mark_terminating(reason: InstanceTerminationReason, message: str) -> None:
        # Store the termination reason and switch the instance to TERMINATING.
        instance.termination_reason = reason
        instance.termination_reason_message = message
        switch_instance_status(session, instance, InstanceStatus.TERMINATING)

    logger.debug(
        "Waiting for instance %s to become running",
        instance.name,
    )
    provisioning_deadline = _get_provisioning_deadline(
        instance=instance,
        job_provisioning_data=job_provisioning_data,
    )
    if get_current_datetime() > provisioning_deadline:
        _mark_terminating(
            InstanceTerminationReason.PROVISIONING_TIMEOUT,
            "Backend did not complete provisioning in time",
        )
        return

    backend = await backends_services.get_project_backend_by_type(
        project=project,
        backend_type=job_provisioning_data.backend,
    )
    if backend is None:
        logger.warning(
            "Instance %s failed because instance's backend is not available",
            instance.name,
        )
        _mark_terminating(InstanceTerminationReason.ERROR, "Backend not available")
        return
    try:
        # The result is read back from `job_provisioning_data` after the call
        # and stored on the instance.
        await run_async(
            backend.compute().update_provisioning_data,
            job_provisioning_data,
            project.ssh_public_key,
            project.ssh_private_key,
        )
        instance.job_provisioning_data = job_provisioning_data.json()
    except ProvisioningError as e:
        logger.warning(
            "Error while waiting for instance %s to become running: %s",
            instance.name,
            repr(e),
        )
        _mark_terminating(
            InstanceTerminationReason.ERROR,
            "Error while waiting for instance to become running",
        )
    except Exception:
        # Unexpected errors are only logged, leaving the instance as is.
        logger.exception(
            "Got exception when updating instance %s provisioning data", instance.name
        )
851
+
852
+
853
@runner_ssh_tunnel(ports=[DSTACK_SHIM_HTTP_PORT], retries=1)
def _check_instance_inner(
    ports: Dict[int, int], *, instance: InstanceModel, check_instance_health: bool = False
) -> InstanceCheck:
    """Query the shim for its healthcheck and, optionally, instance health.

    Any request failure yields an unreachable ``InstanceCheck`` carrying the
    error message. On success, dangling tasks are removed and component
    installation may be requested before the check result is built.
    """
    shim_client = runner_client.ShimClient(port=ports[DSTACK_SHIM_HTTP_PORT])
    instance_health_response: Optional[InstanceHealthResponse] = None
    # `current_call` tracks the shim method in flight so that the error
    # messages below name the call that actually failed.
    current_call = shim_client.healthcheck
    try:
        healthcheck_response = current_call(unmask_exceptions=True)
        if check_instance_health:
            current_call = shim_client.get_instance_health
            instance_health_response = current_call()
    except requests.RequestException as e:
        message_format = "shim.%s(): request error: %s"
        message_args = (current_call.__func__.__name__, e)
        logger.debug(message_format, *message_args)
        return InstanceCheck(reachable=False, message=message_format % message_args)
    except Exception as e:
        message_format = "shim.%s(): unexpected exception %s: %s"
        message_args = (current_call.__func__.__name__, e.__class__.__name__, e)
        logger.exception(message_format, *message_args)
        return InstanceCheck(reachable=False, message=message_format % message_args)

    # Cleanup is best-effort: a failure here must not fail the check.
    try:
        remove_dangling_tasks_from_instance(shim_client, instance)
    except Exception as e:
        logger.exception("%s: error removing dangling tasks: %s", fmt(instance), e)

    # There should be no shim API calls after this function call since it can request shim restart.
    _maybe_install_components(instance, shim_client)

    return runner_client.healthcheck_response_to_instance_check(
        healthcheck_response, instance_health_response
    )
887
+
888
+
889
def _maybe_install_components(
    instance: InstanceModel, shim_client: runner_client.ShimClient
) -> None:
    """Request runner/shim installation if needed and restart the shim when appropriate.

    A shim restart is requested only when a different shim version is already
    installed, nothing is being installed or was just requested, and the shim
    reports that a restart is safe.
    """
    try:
        components = shim_client.get_components()
    except requests.RequestException as e:
        logger.warning("Instance %s: shim.get_components(): request error: %s", instance.name, e)
        return
    if components is None:
        logger.debug("Instance %s: no components info", instance.name)
        return

    installed_shim_version: Optional[str] = None
    installation_requested = False

    runner_info = components.runner
    if runner_info is None:
        logger.debug("Instance %s: no runner info", instance.name)
    else:
        installation_requested |= _maybe_install_runner(instance, shim_client, runner_info)

    shim_info = components.shim
    if shim_info is None:
        logger.debug("Instance %s: no shim info", instance.name)
    else:
        if shim_info.status == ComponentStatus.INSTALLED:
            installed_shim_version = shim_info.version
        installation_requested |= _maybe_install_shim(instance, shim_client, shim_info)

    running_shim_version = shim_client.get_version_string()
    # The order of the checks matters: the shim is asked whether a restart
    # is safe only when every cheaper condition has failed to rule it out.
    skip_restart = (
        # old shim without `dstack-shim` component and `/api/shutdown` support
        installed_shim_version is None
        # or the same version is already running
        or installed_shim_version == running_shim_version
        # or we just requested installation of at least one component
        or installation_requested
        # or at least one component is already being installed
        or any(c.status == ComponentStatus.INSTALLING for c in components)
        # or at least one shim task won't survive restart
        or not shim_client.is_safe_to_restart()
    )
    if skip_restart:
        return

    if not shim_client.shutdown(force=False):
        logger.debug("Instance %s: cannot restart shim", instance.name)
        return
    logger.debug(
        "Instance %s: restarting shim %s -> %s",
        instance.name,
        running_shim_version,
        installed_shim_version,
    )
940
+
941
+
942
def _maybe_install_runner(
    instance: InstanceModel, shim_client: runner_client.ShimClient, runner_info: ComponentInfo
) -> bool:
    """Ask the shim to install the expected runner version if it is not installed yet.

    Returns:
        True when an installation request was sent to the shim; False when nothing
        was requested (expected version unknown, installation already in progress,
        expected version already installed) or the request failed.
    """
    # Developer notes:
    # * DSTACK_USE_LATEST_FROM_BRANCH=1 installs the latest dev build for the current
    #   branch from the CI.
    # * DSTACK_RUNNER_VERSION_URL and DSTACK_RUNNER_DOWNLOAD_URL point to your own build.
    target_version = get_dstack_runner_version()
    if target_version is None:
        logger.debug("Cannot determine the expected runner version")
        return False

    current_version = runner_info.version
    logger.debug(
        "Instance %s: runner status=%s installed_version=%s",
        instance.name,
        runner_info.status.value,
        current_version or "(no version)",
    )

    if runner_info.status == ComponentStatus.INSTALLING:
        logger.debug("Instance %s: runner is already being installed", instance.name)
        return False

    if current_version and current_version == target_version:
        logger.debug("Instance %s: expected runner version already installed", instance.name)
        return False

    cpu_arch = _get_instance_cpu_arch(instance)
    download_url = get_dstack_runner_download_url(arch=cpu_arch, version=target_version)
    logger.debug(
        "Instance %s: installing runner %s -> %s from %s",
        instance.name,
        current_version or "(no version)",
        target_version,
        download_url,
    )
    try:
        shim_client.install_runner(download_url)
    except requests.RequestException as exc:
        logger.warning("Instance %s: shim.install_runner(): %s", instance.name, exc)
        return False
    else:
        return True
986
+
987
+
988
def _maybe_install_shim(
    instance: InstanceModel, shim_client: runner_client.ShimClient, shim_info: ComponentInfo
) -> bool:
    """Ask the shim to install the expected shim version if it is not installed yet.

    Returns:
        True when an installation request was sent to the shim; False when nothing
        was requested (expected version unknown, installation already in progress,
        expected version already installed) or the request failed.
    """
    # Developer notes:
    # * DSTACK_USE_LATEST_FROM_BRANCH=1 installs the latest dev build for the current
    #   branch from the CI.
    # * DSTACK_SHIM_VERSION_URL and DSTACK_SHIM_DOWNLOAD_URL point to your own build.
    target_version = get_dstack_shim_version()
    if target_version is None:
        logger.debug("Cannot determine the expected shim version")
        return False

    current_version = shim_info.version
    logger.debug(
        "Instance %s: shim status=%s installed_version=%s running_version=%s",
        instance.name,
        shim_info.status.value,
        current_version or "(no version)",
        shim_client.get_version_string(),
    )

    if shim_info.status == ComponentStatus.INSTALLING:
        logger.debug("Instance %s: shim is already being installed", instance.name)
        return False

    if current_version and current_version == target_version:
        logger.debug("Instance %s: expected shim version already installed", instance.name)
        return False

    cpu_arch = _get_instance_cpu_arch(instance)
    download_url = get_dstack_shim_download_url(arch=cpu_arch, version=target_version)
    logger.debug(
        "Instance %s: installing shim %s -> %s from %s",
        instance.name,
        current_version or "(no version)",
        target_version,
        download_url,
    )
    try:
        shim_client.install_shim(download_url)
    except requests.RequestException as exc:
        logger.warning("Instance %s: shim.install_shim(): %s", instance.name, exc)
        return False
    else:
        return True
1033
+
1034
+
1035
def _get_instance_cpu_arch(instance: InstanceModel) -> Optional[gpuhunt.CPUArchitecture]:
    """Return the CPU architecture from the instance's provisioning data, if there is any."""
    provisioning_data = get_instance_provisioning_data(instance)
    return (
        None
        if provisioning_data is None
        else provisioning_data.instance_type.resources.cpu_arch
    )
1040
+
1041
+
1042
async def _terminate(session: AsyncSession, instance: InstanceModel) -> None:
    """Terminate the instance in its backend (with retries) and mark it as terminated.

    Does nothing while the next retry time (per `_next_termination_retry_at`) is
    still in the future. For instances with provisioning data from a non-REMOTE
    backend, the backend's `terminate_instance` is invoked. If that call fails and
    the retry deadline has not been reached, the retry timestamps are updated and the
    function returns without marking the instance. In every other case the instance
    is flagged as deleted and switched to `InstanceStatus.TERMINATED`.
    """
    if instance.last_termination_retry_at is not None:
        retry_not_before = _next_termination_retry_at(instance)
        if retry_not_before > get_current_datetime():
            # Too early for another termination attempt.
            return

    jpd = get_instance_provisioning_data(instance)
    if jpd is not None and jpd.backend != BackendType.REMOTE:
        backend = await backends_services.get_project_backend_by_type(
            project=instance.project, backend_type=jpd.backend
        )
        if backend is not None:
            logger.debug("Terminating runner instance %s", jpd.hostname)
            try:
                await run_async(
                    backend.compute().terminate_instance,
                    jpd.instance_id,
                    jpd.region,
                    jpd.backend_data,
                )
            except Exception as exc:
                if instance.first_termination_retry_at is None:
                    instance.first_termination_retry_at = get_current_datetime()
                instance.last_termination_retry_at = get_current_datetime()

                # Tracebacks are attached only for errors other than BackendError.
                with_traceback = not isinstance(exc, BackendError)
                can_retry = _next_termination_retry_at(instance) < _get_termination_deadline(
                    instance
                )
                if not can_retry:
                    # Out of retries: report and fall through to mark the instance terminated.
                    logger.error(
                        "Failed all attempts to terminate instance %s."
                        " Please terminate the instance manually to avoid unexpected charges."
                        " Error: %r",
                        instance.name,
                        exc,
                        exc_info=with_traceback,
                    )
                elif isinstance(exc, NotYetTerminated):
                    logger.debug("Instance %s termination in progress: %s", instance.name, exc)
                    return
                else:
                    logger.warning(
                        "Failed to terminate instance %s. Will retry. Error: %r",
                        instance.name,
                        exc,
                        exc_info=with_traceback,
                    )
                    return
        else:
            logger.error(
                "Failed to terminate instance %s. Backend %s not available.",
                instance.name,
                jpd.backend,
            )

    instance.deleted = True
    instance.deleted_at = get_current_datetime()
    instance.finished_at = get_current_datetime()
    switch_instance_status(session, instance, InstanceStatus.TERMINATED)
1096
+
1097
+
1098
def _set_health(session: AsyncSession, instance: InstanceModel, health: HealthStatus) -> None:
    """Store `health` on the instance, emitting an event when the value changes."""
    previous = instance.health
    if previous != health:
        message = f"Instance health changed {previous.upper()} -> {health.upper()}"
        events.emit(
            session,
            message,
            actor=events.SystemActor(),
            targets=[events.Target.from_model(instance)],
        )
    instance.health = health
1107
+
1108
+
1109
def _set_unreachable(session: AsyncSession, instance: InstanceModel, unreachable: bool) -> None:
    """Store the `unreachable` flag, emitting an event when it flips.

    The event is emitted only when `instance.status.is_available()` is true, which
    avoids a misleading event during provisioning. The flag itself is always updated.
    """
    flipped_while_available = (
        instance.status.is_available() and instance.unreachable != unreachable
    )
    if flipped_while_available:
        if unreachable:
            message = "Instance became unreachable"
        else:
            message = "Instance became reachable"
        events.emit(
            session,
            message,
            actor=events.SystemActor(),
            targets=[events.Target.from_model(instance)],
        )
    instance.unreachable = unreachable
1121
+
1122
+
1123
def _next_termination_retry_at(instance: InstanceModel) -> datetime.datetime:
    """Return the last termination retry time plus `TERMINATION_RETRY_TIMEOUT`.

    Asserts that `instance.last_termination_retry_at` is set.
    """
    last_retry_at = instance.last_termination_retry_at
    assert last_retry_at is not None
    return last_retry_at + TERMINATION_RETRY_TIMEOUT
1126
+
1127
+
1128
def _get_termination_deadline(instance: InstanceModel) -> datetime.datetime:
    """Return the first termination retry time plus `TERMINATION_RETRY_MAX_DURATION`.

    Asserts that `instance.first_termination_retry_at` is set.
    """
    first_retry_at = instance.first_termination_retry_at
    assert first_retry_at is not None
    return first_retry_at + TERMINATION_RETRY_MAX_DURATION
1131
+
1132
+
1133
def _need_to_wait_fleet_provisioning(
    instance: InstanceModel, master_instance: InstanceModel
) -> bool:
    """Tell whether `instance` must wait for the fleet's master instance to be provisioned.

    Cluster cloud instances wait for the first fleet instance so that they end up
    provisioned in the same backend/region.
    """
    fleet = instance.fleet
    if fleet is None:
        return False
    # The master itself never waits.
    if instance.id == master_instance.id:
        return False
    # The master already has provisioning data.
    if master_instance.job_provisioning_data is not None:
        return False
    # The master is terminated.
    if master_instance.status == InstanceStatus.TERMINATED:
        return False
    return is_cloud_cluster(fleet)
1147
+
1148
+
1149
def _get_instance_offer_for_instance(
    instance_offer: InstanceOfferWithAvailability,
    instance: InstanceModel,
    master_instance: InstanceModel,
) -> InstanceOfferWithAvailability:
    """Return the offer to use for `instance`.

    For an instance in a fleet with CLUSTER placement, the offer is passed through
    `get_instance_offer_with_restricted_az` together with the master instance's
    provisioning data. Otherwise the offer is returned unchanged.
    """
    if instance.fleet is None:
        return instance_offer

    placement = fleet_model_to_fleet(instance.fleet).spec.configuration.placement
    if placement != InstanceGroupPlacement.CLUSTER:
        return instance_offer

    return get_instance_offer_with_restricted_az(
        instance_offer=instance_offer,
        master_job_provisioning_data=get_instance_provisioning_data(master_instance),
    )
1164
+
1165
+
1166
def _get_instance_idle_duration(instance: InstanceModel) -> datetime.timedelta:
    """Return the time elapsed since the last processed job, or since creation if there was none."""
    created_at = instance.created_at
    last_job_at = instance.last_job_processed_at
    idle_since = created_at if last_job_at is None else last_job_at
    return get_current_datetime() - idle_since
1171
+
1172
+
1173
def _get_provisioning_deadline(
    instance: InstanceModel,
    job_provisioning_data: JobProvisioningData,
) -> datetime.datetime:
    """Return `instance.started_at` plus the provisioning timeout.

    The timeout is looked up by base backend type and instance type name.
    Asserts that `instance.started_at` is set.
    """
    started_at = instance.started_at
    assert started_at is not None
    backend_type = job_provisioning_data.get_base_backend()
    instance_type_name = job_provisioning_data.instance_type.name
    return started_at + get_provisioning_timeout(
        backend_type=backend_type,
        instance_type_name=instance_type_name,
    )
1183
+
1184
+
1185
def _ssh_keys_to_pkeys(ssh_keys: list[SSHKey]) -> list[PKey]:
    """Convert SSH keys to `PKey` objects, skipping keys that have no private part."""
    pkeys = []
    for ssh_key in ssh_keys:
        if ssh_key.private is None:
            continue
        pkeys.append(pkey_from_str(ssh_key.private))
    return pkeys