dstack 0.0.9__py3-none-any.whl → 0.20.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (879):
  1. dstack/_internal/cli/commands/__init__.py +80 -0
  2. dstack/_internal/cli/commands/apply.py +100 -0
  3. dstack/_internal/cli/commands/attach.py +161 -0
  4. dstack/_internal/cli/commands/completion.py +22 -0
  5. dstack/_internal/cli/commands/delete.py +44 -0
  6. dstack/_internal/cli/commands/event.py +168 -0
  7. dstack/_internal/cli/commands/fleet.py +161 -0
  8. dstack/_internal/cli/commands/gateway.py +159 -0
  9. dstack/_internal/cli/commands/init.py +64 -0
  10. dstack/_internal/cli/commands/login.py +352 -0
  11. dstack/_internal/cli/commands/logs.py +62 -0
  12. dstack/_internal/cli/commands/metrics.py +153 -0
  13. dstack/_internal/cli/commands/offer.py +146 -0
  14. dstack/_internal/cli/commands/project.py +259 -0
  15. dstack/_internal/cli/commands/ps.py +81 -0
  16. dstack/_internal/cli/commands/run.py +69 -0
  17. dstack/_internal/cli/commands/secrets.py +92 -0
  18. dstack/_internal/cli/commands/server.py +96 -0
  19. dstack/_internal/cli/commands/stop.py +26 -0
  20. dstack/_internal/cli/commands/volume.py +117 -0
  21. dstack/_internal/cli/main.py +101 -0
  22. dstack/_internal/cli/models/gateways.py +16 -0
  23. dstack/_internal/cli/models/offers.py +47 -0
  24. dstack/_internal/cli/models/runs.py +16 -0
  25. dstack/_internal/cli/services/args.py +31 -0
  26. dstack/_internal/cli/services/completion.py +91 -0
  27. dstack/_internal/cli/services/configurators/__init__.py +86 -0
  28. dstack/_internal/cli/services/configurators/base.py +103 -0
  29. dstack/_internal/cli/services/configurators/fleet.py +475 -0
  30. dstack/_internal/cli/services/configurators/gateway.py +231 -0
  31. dstack/_internal/cli/services/configurators/run.py +882 -0
  32. dstack/_internal/cli/services/configurators/volume.py +222 -0
  33. dstack/_internal/cli/services/events.py +68 -0
  34. dstack/_internal/cli/services/profile.py +182 -0
  35. dstack/_internal/cli/services/repos.py +71 -0
  36. dstack/_internal/cli/services/resources.py +54 -0
  37. dstack/_internal/cli/utils/common.py +159 -0
  38. dstack/_internal/cli/utils/fleet.py +106 -0
  39. dstack/_internal/cli/utils/gateway.py +56 -0
  40. dstack/_internal/cli/utils/gpu.py +178 -0
  41. dstack/_internal/cli/utils/rich.py +156 -0
  42. dstack/_internal/cli/utils/run.py +517 -0
  43. dstack/_internal/cli/utils/secrets.py +25 -0
  44. dstack/_internal/cli/utils/updates.py +98 -0
  45. dstack/_internal/cli/utils/volume.py +58 -0
  46. dstack/_internal/compat.py +3 -0
  47. dstack/_internal/core/backends/amddevcloud/__init__.py +1 -0
  48. dstack/_internal/core/backends/amddevcloud/backend.py +16 -0
  49. dstack/_internal/core/backends/amddevcloud/compute.py +5 -0
  50. dstack/_internal/core/backends/amddevcloud/configurator.py +29 -0
  51. dstack/_internal/core/backends/aws/auth.py +30 -0
  52. dstack/_internal/core/backends/aws/backend.py +31 -0
  53. dstack/_internal/core/backends/aws/compute.py +1153 -0
  54. dstack/_internal/core/backends/aws/configurator.py +191 -0
  55. dstack/_internal/core/backends/aws/models.py +135 -0
  56. dstack/_internal/core/backends/aws/resources.py +700 -0
  57. dstack/_internal/core/backends/azure/auth.py +39 -0
  58. dstack/_internal/core/backends/azure/backend.py +21 -0
  59. dstack/_internal/core/backends/azure/compute.py +676 -0
  60. dstack/_internal/core/backends/azure/configurator.py +472 -0
  61. dstack/_internal/core/backends/azure/models.py +98 -0
  62. dstack/_internal/core/backends/azure/resources.py +116 -0
  63. dstack/_internal/core/backends/azure/utils.py +42 -0
  64. dstack/_internal/core/backends/base/backend.py +18 -0
  65. dstack/_internal/core/backends/base/compute.py +1101 -0
  66. dstack/_internal/core/backends/base/configurator.py +117 -0
  67. dstack/_internal/core/backends/base/models.py +24 -0
  68. dstack/_internal/core/backends/base/offers.py +232 -0
  69. dstack/_internal/core/backends/cloudrift/api_client.py +220 -0
  70. dstack/_internal/core/backends/cloudrift/backend.py +16 -0
  71. dstack/_internal/core/backends/cloudrift/compute.py +138 -0
  72. dstack/_internal/core/backends/cloudrift/configurator.py +72 -0
  73. dstack/_internal/core/backends/cloudrift/models.py +40 -0
  74. dstack/_internal/core/backends/configurators.py +181 -0
  75. dstack/_internal/core/backends/cudo/__init__.py +0 -0
  76. dstack/_internal/core/backends/cudo/api_client.py +111 -0
  77. dstack/_internal/core/backends/cudo/backend.py +16 -0
  78. dstack/_internal/core/backends/cudo/compute.py +174 -0
  79. dstack/_internal/core/backends/cudo/configurator.py +63 -0
  80. dstack/_internal/core/backends/cudo/models.py +37 -0
  81. dstack/_internal/core/backends/datacrunch/__init__.py +1 -0
  82. dstack/_internal/core/backends/datacrunch/backend.py +18 -0
  83. dstack/_internal/core/backends/datacrunch/compute.py +8 -0
  84. dstack/_internal/core/backends/datacrunch/configurator.py +17 -0
  85. dstack/_internal/core/backends/digitalocean/__init__.py +1 -0
  86. dstack/_internal/core/backends/digitalocean/backend.py +16 -0
  87. dstack/_internal/core/backends/digitalocean/compute.py +5 -0
  88. dstack/_internal/core/backends/digitalocean/configurator.py +31 -0
  89. dstack/_internal/core/backends/digitalocean_base/__init__.py +1 -0
  90. dstack/_internal/core/backends/digitalocean_base/api_client.py +104 -0
  91. dstack/_internal/core/backends/digitalocean_base/backend.py +5 -0
  92. dstack/_internal/core/backends/digitalocean_base/compute.py +174 -0
  93. dstack/_internal/core/backends/digitalocean_base/configurator.py +57 -0
  94. dstack/_internal/core/backends/digitalocean_base/models.py +43 -0
  95. dstack/_internal/core/backends/dstack/__init__.py +0 -0
  96. dstack/_internal/core/backends/dstack/models.py +26 -0
  97. dstack/_internal/core/backends/features.py +74 -0
  98. dstack/_internal/core/backends/gcp/__init__.py +0 -0
  99. dstack/_internal/core/backends/gcp/auth.py +57 -0
  100. dstack/_internal/core/backends/gcp/backend.py +17 -0
  101. dstack/_internal/core/backends/gcp/compute.py +1257 -0
  102. dstack/_internal/core/backends/gcp/configurator.py +206 -0
  103. dstack/_internal/core/backends/gcp/features/__init__.py +0 -0
  104. dstack/_internal/core/backends/gcp/features/tcpx.py +65 -0
  105. dstack/_internal/core/backends/gcp/models.py +160 -0
  106. dstack/_internal/core/backends/gcp/resources.py +585 -0
  107. dstack/_internal/core/backends/hotaisle/__init__.py +1 -0
  108. dstack/_internal/core/backends/hotaisle/api_client.py +101 -0
  109. dstack/_internal/core/backends/hotaisle/backend.py +16 -0
  110. dstack/_internal/core/backends/hotaisle/compute.py +188 -0
  111. dstack/_internal/core/backends/hotaisle/configurator.py +66 -0
  112. dstack/_internal/core/backends/hotaisle/models.py +45 -0
  113. dstack/_internal/core/backends/kubernetes/__init__.py +0 -0
  114. dstack/_internal/core/backends/kubernetes/backend.py +16 -0
  115. dstack/_internal/core/backends/kubernetes/compute.py +1077 -0
  116. dstack/_internal/core/backends/kubernetes/configurator.py +61 -0
  117. dstack/_internal/core/backends/kubernetes/models.py +71 -0
  118. dstack/_internal/core/backends/kubernetes/utils.py +81 -0
  119. dstack/_internal/core/backends/lambdalabs/__init__.py +0 -0
  120. dstack/_internal/core/backends/lambdalabs/api_client.py +87 -0
  121. dstack/_internal/core/backends/lambdalabs/backend.py +17 -0
  122. dstack/_internal/core/backends/lambdalabs/compute.py +233 -0
  123. dstack/_internal/core/backends/lambdalabs/configurator.py +65 -0
  124. dstack/_internal/core/backends/lambdalabs/models.py +37 -0
  125. dstack/_internal/core/backends/local/__init__.py +0 -0
  126. dstack/_internal/core/backends/local/backend.py +14 -0
  127. dstack/_internal/core/backends/local/compute.py +130 -0
  128. dstack/_internal/core/backends/models.py +158 -0
  129. dstack/_internal/core/backends/nebius/__init__.py +0 -0
  130. dstack/_internal/core/backends/nebius/backend.py +16 -0
  131. dstack/_internal/core/backends/nebius/compute.py +401 -0
  132. dstack/_internal/core/backends/nebius/configurator.py +98 -0
  133. dstack/_internal/core/backends/nebius/models.py +185 -0
  134. dstack/_internal/core/backends/nebius/resources.py +433 -0
  135. dstack/_internal/core/backends/oci/__init__.py +0 -0
  136. dstack/_internal/core/backends/oci/auth.py +21 -0
  137. dstack/_internal/core/backends/oci/backend.py +16 -0
  138. dstack/_internal/core/backends/oci/compute.py +209 -0
  139. dstack/_internal/core/backends/oci/configurator.py +156 -0
  140. dstack/_internal/core/backends/oci/exceptions.py +15 -0
  141. dstack/_internal/core/backends/oci/models.py +87 -0
  142. dstack/_internal/core/backends/oci/region.py +86 -0
  143. dstack/_internal/core/backends/oci/resources.py +836 -0
  144. dstack/_internal/core/backends/runpod/__init__.py +0 -0
  145. dstack/_internal/core/backends/runpod/api_client.py +627 -0
  146. dstack/_internal/core/backends/runpod/backend.py +16 -0
  147. dstack/_internal/core/backends/runpod/compute.py +444 -0
  148. dstack/_internal/core/backends/runpod/configurator.py +63 -0
  149. dstack/_internal/core/backends/runpod/models.py +54 -0
  150. dstack/_internal/core/backends/template/__init__.py +0 -0
  151. dstack/_internal/core/backends/template/backend.py.jinja +16 -0
  152. dstack/_internal/core/backends/template/compute.py.jinja +95 -0
  153. dstack/_internal/core/backends/template/configurator.py.jinja +69 -0
  154. dstack/_internal/core/backends/template/models.py.jinja +62 -0
  155. dstack/_internal/core/backends/tensordock/models.py +40 -0
  156. dstack/_internal/core/backends/vastai/__init__.py +0 -0
  157. dstack/_internal/core/backends/vastai/api_client.py +143 -0
  158. dstack/_internal/core/backends/vastai/backend.py +16 -0
  159. dstack/_internal/core/backends/vastai/compute.py +141 -0
  160. dstack/_internal/core/backends/vastai/configurator.py +69 -0
  161. dstack/_internal/core/backends/vastai/models.py +37 -0
  162. dstack/_internal/core/backends/verda/__init__.py +0 -0
  163. dstack/_internal/core/backends/verda/backend.py +16 -0
  164. dstack/_internal/core/backends/verda/compute.py +266 -0
  165. dstack/_internal/core/backends/verda/configurator.py +73 -0
  166. dstack/_internal/core/backends/verda/models.py +38 -0
  167. dstack/_internal/core/backends/vultr/__init__.py +0 -0
  168. dstack/_internal/core/backends/vultr/api_client.py +116 -0
  169. dstack/_internal/core/backends/vultr/backend.py +16 -0
  170. dstack/_internal/core/backends/vultr/compute.py +167 -0
  171. dstack/_internal/core/backends/vultr/configurator.py +71 -0
  172. dstack/_internal/core/backends/vultr/models.py +34 -0
  173. dstack/_internal/core/compatibility/__init__.py +0 -0
  174. dstack/_internal/core/compatibility/events.py +13 -0
  175. dstack/_internal/core/compatibility/fleets.py +58 -0
  176. dstack/_internal/core/compatibility/gateways.py +39 -0
  177. dstack/_internal/core/compatibility/gpus.py +13 -0
  178. dstack/_internal/core/compatibility/logs.py +14 -0
  179. dstack/_internal/core/compatibility/runs.py +86 -0
  180. dstack/_internal/core/compatibility/volumes.py +37 -0
  181. dstack/_internal/core/consts.py +8 -0
  182. dstack/_internal/core/errors.py +160 -0
  183. dstack/_internal/core/models/__init__.py +0 -0
  184. dstack/_internal/core/models/auth.py +28 -0
  185. dstack/_internal/core/models/backends/__init__.py +0 -0
  186. dstack/_internal/core/models/backends/base.py +48 -0
  187. dstack/_internal/core/models/common.py +143 -0
  188. dstack/_internal/core/models/compute_groups.py +39 -0
  189. dstack/_internal/core/models/config.py +28 -0
  190. dstack/_internal/core/models/configurations.py +1123 -0
  191. dstack/_internal/core/models/envs.py +149 -0
  192. dstack/_internal/core/models/events.py +98 -0
  193. dstack/_internal/core/models/files.py +67 -0
  194. dstack/_internal/core/models/fleets.py +437 -0
  195. dstack/_internal/core/models/gateways.py +146 -0
  196. dstack/_internal/core/models/gpus.py +45 -0
  197. dstack/_internal/core/models/health.py +28 -0
  198. dstack/_internal/core/models/instances.py +346 -0
  199. dstack/_internal/core/models/logs.py +27 -0
  200. dstack/_internal/core/models/metrics.py +14 -0
  201. dstack/_internal/core/models/placement.py +27 -0
  202. dstack/_internal/core/models/profiles.py +431 -0
  203. dstack/_internal/core/models/projects.py +46 -0
  204. dstack/_internal/core/models/repos/__init__.py +34 -0
  205. dstack/_internal/core/models/repos/base.py +36 -0
  206. dstack/_internal/core/models/repos/local.py +96 -0
  207. dstack/_internal/core/models/repos/remote.py +341 -0
  208. dstack/_internal/core/models/repos/virtual.py +85 -0
  209. dstack/_internal/core/models/resources.py +424 -0
  210. dstack/_internal/core/models/routers.py +24 -0
  211. dstack/_internal/core/models/runs.py +618 -0
  212. dstack/_internal/core/models/secrets.py +16 -0
  213. dstack/_internal/core/models/server.py +7 -0
  214. dstack/_internal/core/models/services.py +76 -0
  215. dstack/_internal/core/models/unix.py +53 -0
  216. dstack/_internal/core/models/users.py +60 -0
  217. dstack/_internal/core/models/volumes.py +221 -0
  218. dstack/_internal/core/services/__init__.py +16 -0
  219. dstack/_internal/core/services/api_client.py +15 -0
  220. dstack/_internal/core/services/configs/__init__.py +116 -0
  221. dstack/_internal/core/services/diff.py +71 -0
  222. dstack/_internal/core/services/logs.py +58 -0
  223. dstack/_internal/core/services/profiles.py +46 -0
  224. dstack/_internal/core/services/repos.py +236 -0
  225. dstack/_internal/core/services/ssh/__init__.py +27 -0
  226. dstack/_internal/core/services/ssh/attach.py +241 -0
  227. dstack/_internal/core/services/ssh/client.py +113 -0
  228. dstack/_internal/core/services/ssh/key_manager.py +53 -0
  229. dstack/_internal/core/services/ssh/ports.py +89 -0
  230. dstack/_internal/core/services/ssh/tunnel.py +337 -0
  231. dstack/_internal/proxy/__init__.py +8 -0
  232. dstack/_internal/proxy/gateway/__init__.py +0 -0
  233. dstack/_internal/proxy/gateway/app.py +89 -0
  234. dstack/_internal/proxy/gateway/auth.py +26 -0
  235. dstack/_internal/proxy/gateway/const.py +7 -0
  236. dstack/_internal/proxy/gateway/deps.py +73 -0
  237. dstack/_internal/proxy/gateway/main.py +17 -0
  238. dstack/_internal/proxy/gateway/models.py +23 -0
  239. dstack/_internal/proxy/gateway/repo/__init__.py +0 -0
  240. dstack/_internal/proxy/gateway/repo/repo.py +121 -0
  241. dstack/_internal/proxy/gateway/repo/state_v1.py +164 -0
  242. dstack/_internal/proxy/gateway/resources/nginx/00-log-format.conf +11 -0
  243. dstack/_internal/proxy/gateway/resources/nginx/entrypoint.jinja2 +27 -0
  244. dstack/_internal/proxy/gateway/resources/nginx/router_workers.jinja2 +23 -0
  245. dstack/_internal/proxy/gateway/resources/nginx/service.jinja2 +105 -0
  246. dstack/_internal/proxy/gateway/routers/__init__.py +0 -0
  247. dstack/_internal/proxy/gateway/routers/auth.py +10 -0
  248. dstack/_internal/proxy/gateway/routers/config.py +28 -0
  249. dstack/_internal/proxy/gateway/routers/registry.py +124 -0
  250. dstack/_internal/proxy/gateway/routers/stats.py +18 -0
  251. dstack/_internal/proxy/gateway/schemas/__init__.py +0 -0
  252. dstack/_internal/proxy/gateway/schemas/common.py +5 -0
  253. dstack/_internal/proxy/gateway/schemas/config.py +9 -0
  254. dstack/_internal/proxy/gateway/schemas/registry.py +63 -0
  255. dstack/_internal/proxy/gateway/schemas/stats.py +15 -0
  256. dstack/_internal/proxy/gateway/services/__init__.py +0 -0
  257. dstack/_internal/proxy/gateway/services/model_routers/__init__.py +18 -0
  258. dstack/_internal/proxy/gateway/services/model_routers/base.py +91 -0
  259. dstack/_internal/proxy/gateway/services/model_routers/sglang.py +269 -0
  260. dstack/_internal/proxy/gateway/services/nginx.py +455 -0
  261. dstack/_internal/proxy/gateway/services/registry.py +426 -0
  262. dstack/_internal/proxy/gateway/services/server_client.py +95 -0
  263. dstack/_internal/proxy/gateway/services/stats.py +170 -0
  264. dstack/_internal/proxy/gateway/testing/__init__.py +0 -0
  265. dstack/_internal/proxy/gateway/testing/common.py +13 -0
  266. dstack/_internal/proxy/lib/__init__.py +0 -0
  267. dstack/_internal/proxy/lib/auth.py +7 -0
  268. dstack/_internal/proxy/lib/deps.py +106 -0
  269. dstack/_internal/proxy/lib/errors.py +14 -0
  270. dstack/_internal/proxy/lib/models.py +112 -0
  271. dstack/_internal/proxy/lib/repo.py +27 -0
  272. dstack/_internal/proxy/lib/routers/__init__.py +0 -0
  273. dstack/_internal/proxy/lib/routers/model_proxy.py +102 -0
  274. dstack/_internal/proxy/lib/schemas/__init__.py +0 -0
  275. dstack/_internal/proxy/lib/schemas/model_proxy.py +77 -0
  276. dstack/_internal/proxy/lib/services/__init__.py +0 -0
  277. dstack/_internal/proxy/lib/services/model_proxy/__init__.py +0 -0
  278. dstack/_internal/proxy/lib/services/model_proxy/clients/__init__.py +0 -0
  279. dstack/_internal/proxy/lib/services/model_proxy/clients/base.py +18 -0
  280. dstack/_internal/proxy/lib/services/model_proxy/clients/openai.py +67 -0
  281. dstack/_internal/proxy/lib/services/model_proxy/clients/tgi.py +208 -0
  282. dstack/_internal/proxy/lib/services/model_proxy/model_proxy.py +23 -0
  283. dstack/_internal/proxy/lib/services/service_connection.py +160 -0
  284. dstack/_internal/proxy/lib/testing/__init__.py +0 -0
  285. dstack/_internal/proxy/lib/testing/auth.py +11 -0
  286. dstack/_internal/proxy/lib/testing/common.py +51 -0
  287. dstack/_internal/server/__init__.py +0 -0
  288. dstack/_internal/server/alembic.ini +100 -0
  289. dstack/_internal/server/app.py +432 -0
  290. dstack/_internal/server/background/__init__.py +142 -0
  291. dstack/_internal/server/background/tasks/__init__.py +0 -0
  292. dstack/_internal/server/background/tasks/common.py +24 -0
  293. dstack/_internal/server/background/tasks/process_compute_groups.py +167 -0
  294. dstack/_internal/server/background/tasks/process_events.py +17 -0
  295. dstack/_internal/server/background/tasks/process_fleets.py +289 -0
  296. dstack/_internal/server/background/tasks/process_gateways.py +188 -0
  297. dstack/_internal/server/background/tasks/process_idle_volumes.py +145 -0
  298. dstack/_internal/server/background/tasks/process_instances.py +1186 -0
  299. dstack/_internal/server/background/tasks/process_metrics.py +172 -0
  300. dstack/_internal/server/background/tasks/process_placement_groups.py +104 -0
  301. dstack/_internal/server/background/tasks/process_probes.py +164 -0
  302. dstack/_internal/server/background/tasks/process_prometheus_metrics.py +150 -0
  303. dstack/_internal/server/background/tasks/process_running_jobs.py +1238 -0
  304. dstack/_internal/server/background/tasks/process_runs.py +842 -0
  305. dstack/_internal/server/background/tasks/process_submitted_jobs.py +1106 -0
  306. dstack/_internal/server/background/tasks/process_terminating_jobs.py +108 -0
  307. dstack/_internal/server/background/tasks/process_volumes.py +129 -0
  308. dstack/_internal/server/compatibility/__init__.py +0 -0
  309. dstack/_internal/server/compatibility/common.py +20 -0
  310. dstack/_internal/server/compatibility/gpus.py +22 -0
  311. dstack/_internal/server/db.py +127 -0
  312. dstack/_internal/server/deps.py +19 -0
  313. dstack/_internal/server/main.py +4 -0
  314. dstack/_internal/server/migrations/__init__.py +0 -0
  315. dstack/_internal/server/migrations/env.py +112 -0
  316. dstack/_internal/server/migrations/script.py.mako +28 -0
  317. dstack/_internal/server/migrations/versions/006512f572b4_add_projects_original_name.py +38 -0
  318. dstack/_internal/server/migrations/versions/065588ec72b8_add_vultr_to_backendtype_enum.py +81 -0
  319. dstack/_internal/server/migrations/versions/06e977bc61c7_add_usermodel_deleted_and_original_name.py +45 -0
  320. dstack/_internal/server/migrations/versions/0e33559e16ed_update_instancestatus.py +64 -0
  321. dstack/_internal/server/migrations/versions/112753bc17dd_remove_nullable_fields.py +50 -0
  322. dstack/_internal/server/migrations/versions/1338b788b612_reverse_job_instance_relationship.py +71 -0
  323. dstack/_internal/server/migrations/versions/14f2cb002fc2_add_jobmodel_removed_flag.py +44 -0
  324. dstack/_internal/server/migrations/versions/1a48dfe44a40_rework_termination_handling.py +42 -0
  325. dstack/_internal/server/migrations/versions/1aa9638ad963_added_email_index.py +31 -0
  326. dstack/_internal/server/migrations/versions/1e3fb39ef74b_add_remote_connection_details.py +26 -0
  327. dstack/_internal/server/migrations/versions/1e76fb0dde87_add_jobmodel_inactivity_secs.py +32 -0
  328. dstack/_internal/server/migrations/versions/20166748b60c_add_jobmodel_disconnected_at.py +100 -0
  329. dstack/_internal/server/migrations/versions/22d74df9897e_add_events_and_event_targets.py +99 -0
  330. dstack/_internal/server/migrations/versions/23e01c56279a_make_blob_nullable.py +32 -0
  331. dstack/_internal/server/migrations/versions/2498ab323443_add_fleetmodel_consolidation_attempt_.py +44 -0
  332. dstack/_internal/server/migrations/versions/252d3743b641_.py +40 -0
  333. dstack/_internal/server/migrations/versions/25479f540245_add_probes.py +43 -0
  334. dstack/_internal/server/migrations/versions/27d3e55759fa_add_pools.py +152 -0
  335. dstack/_internal/server/migrations/versions/29826f417010_remove_instancemodel_retry_policy.py +34 -0
  336. dstack/_internal/server/migrations/versions/29c08c6a8cb3_.py +36 -0
  337. dstack/_internal/server/migrations/versions/35e90e1b0d3e_add_rolling_deployment_fields.py +42 -0
  338. dstack/_internal/server/migrations/versions/35f732ee4cf5_add_projectmodel_is_public.py +39 -0
  339. dstack/_internal/server/migrations/versions/3cf77fb8bcf1_store_repo_clone_url.py +85 -0
  340. dstack/_internal/server/migrations/versions/3d7f6c2ec000_add_jobmodel_registered.py +28 -0
  341. dstack/_internal/server/migrations/versions/3dbdce90d0e0_fix_code_uq_constraint.py +33 -0
  342. dstack/_internal/server/migrations/versions/48ad3ecbaea2_do_not_delete_projects_and_runs.py +46 -0
  343. dstack/_internal/server/migrations/versions/4ae1a5b0e7f1_add_run_list_index.py +34 -0
  344. dstack/_internal/server/migrations/versions/4b4319398164_introduce_runs_processing.py +144 -0
  345. dstack/_internal/server/migrations/versions/50dd7ea98639_index_status_columns.py +55 -0
  346. dstack/_internal/server/migrations/versions/51d45659d574_add_instancemodel_blocks_fields.py +43 -0
  347. dstack/_internal/server/migrations/versions/54a77e19c64c_add_manager_project_role.py +67 -0
  348. dstack/_internal/server/migrations/versions/555138b1f77f_change_instancemodel_for_asynchronous_.py +61 -0
  349. dstack/_internal/server/migrations/versions/58aa5162dcc3_add_gatewaymodel_configuration.py +32 -0
  350. dstack/_internal/server/migrations/versions/5ad8debc8fe6_fixes_for_psql.py +329 -0
  351. dstack/_internal/server/migrations/versions/5ec538b70e71_replace_instansestatus.py +31 -0
  352. dstack/_internal/server/migrations/versions/5f1707c525d2_add_filearchivemodel.py +39 -0
  353. dstack/_internal/server/migrations/versions/5fd659afca82_add_ix_instances_fleet_id.py +31 -0
  354. dstack/_internal/server/migrations/versions/60e444118b6d_add_jobprometheusmetrics.py +40 -0
  355. dstack/_internal/server/migrations/versions/63c3f19cb184_add_jobterminationreason_inactivity_.py +83 -0
  356. dstack/_internal/server/migrations/versions/644b8a114187_add_secretmodel.py +49 -0
  357. dstack/_internal/server/migrations/versions/686fb8341ea5_add_user_emails.py +32 -0
  358. dstack/_internal/server/migrations/versions/6c1a9d6530ee_add_jobmodel_exit_status.py +26 -0
  359. dstack/_internal/server/migrations/versions/706e0acc3a7d_add_runmodel_desired_replica_counts.py +26 -0
  360. dstack/_internal/server/migrations/versions/710e5b3fac8f_add_encryption.py +54 -0
  361. dstack/_internal/server/migrations/versions/728b1488b1b4_add_instance_health.py +50 -0
  362. dstack/_internal/server/migrations/versions/74a1f55209bd_store_enums_as_strings.py +484 -0
  363. dstack/_internal/server/migrations/versions/7b24b1c8eba7_add_instancemodel_last_processed_at.py +68 -0
  364. dstack/_internal/server/migrations/versions/7ba3b59d7ca6_add_runmodel_resubmission_attempt.py +35 -0
  365. dstack/_internal/server/migrations/versions/7bc2586e8b9e_make_instancemodel_pool_id_optional.py +36 -0
  366. dstack/_internal/server/migrations/versions/7d1ec2b920ac_add_computegroupmodel.py +91 -0
  367. dstack/_internal/server/migrations/versions/803c7e9ed85d_add_jobmodel_job_runtime_data.py +32 -0
  368. dstack/_internal/server/migrations/versions/82b32a135ea2_.py +58 -0
  369. dstack/_internal/server/migrations/versions/866ec1d67184_replace_retrypolicy_limit_with_.py +93 -0
  370. dstack/_internal/server/migrations/versions/903c91e24634_add_instances_termination_reason_message.py +34 -0
  371. dstack/_internal/server/migrations/versions/91a12fff6c76_add_repocredsmodel.py +43 -0
  372. dstack/_internal/server/migrations/versions/91ac5e543037_extend_repos_creds_column.py +36 -0
  373. dstack/_internal/server/migrations/versions/98cd9c8b5927_add_volumemodel.py +73 -0
  374. dstack/_internal/server/migrations/versions/98d1b92988bc_add_jobterminationreason_terminated_due_.py +140 -0
  375. dstack/_internal/server/migrations/versions/99b4c8c954ea_add_termination_reason_message.py +71 -0
  376. dstack/_internal/server/migrations/versions/9eea6af28e10_added_fail_reason_for_instancemodel.py +36 -0
  377. dstack/_internal/server/migrations/versions/__init__.py +0 -0
  378. dstack/_internal/server/migrations/versions/a060e2440936_.py +206 -0
  379. dstack/_internal/server/migrations/versions/a751ef183f27_move_attachment_data_to_volumes_.py +34 -0
  380. dstack/_internal/server/migrations/versions/a7b46c073fa1_add_placementgroupmodel.py +58 -0
  381. dstack/_internal/server/migrations/versions/afbc600ff2b2_add_created_at_to_usermodel_and_.py +102 -0
  382. dstack/_internal/server/migrations/versions/b4d6ad60db08_add_instancemodel_unreachable.py +37 -0
  383. dstack/_internal/server/migrations/versions/b88d55c2a07d_replace_instancestatus_ready.py +21 -0
  384. dstack/_internal/server/migrations/versions/bc8ca4a505c6_store_backendtype_as_string.py +171 -0
  385. dstack/_internal/server/migrations/versions/bca2fdf130bf_add_runmodel_priority.py +34 -0
  386. dstack/_internal/server/migrations/versions/bfba43f6def2_.py +32 -0
  387. dstack/_internal/server/migrations/versions/c00090eaef21_support_fleets.py +108 -0
  388. dstack/_internal/server/migrations/versions/c154eece89da_add_fields_for_async_gateway_creation.py +74 -0
  389. dstack/_internal/server/migrations/versions/c20626d03cfb_add_jobmetricspoint.py +43 -0
  390. dstack/_internal/server/migrations/versions/c48df7985d57_add_instance_termination_retries.py +38 -0
  391. dstack/_internal/server/migrations/versions/c83d45f9a971_replace_string_with_text.py +150 -0
  392. dstack/_internal/server/migrations/versions/d0bb68e48b9f_add_project_owners_and_quotas.py +106 -0
  393. dstack/_internal/server/migrations/versions/d3e8af4786fa_gateway_compute_flag_deleted.py +34 -0
  394. dstack/_internal/server/migrations/versions/d4d9dc26cf58_add_ix_jobs_run_id.py +31 -0
  395. dstack/_internal/server/migrations/versions/d5863798bf41_add_volumemodel_last_job_processed_at.py +40 -0
  396. dstack/_internal/server/migrations/versions/d6b11105f659_add_usermodel_active.py +36 -0
  397. dstack/_internal/server/migrations/versions/da574e93fee0_add_jobmodel_volumes_detached_at.py +40 -0
  398. dstack/_internal/server/migrations/versions/dfffd6a1165c_add_fields_for_gateways_behind_alb.py +36 -0
  399. dstack/_internal/server/migrations/versions/e2d08cd1b8d9_add_jobmodel_fleet.py +41 -0
  400. dstack/_internal/server/migrations/versions/e3b7db07727f_add_gatewaycomputemodel_app_updated_at.py +61 -0
  401. dstack/_internal/server/migrations/versions/e6391ca6a264_separate_gateways_from_compute.py +72 -0
  402. dstack/_internal/server/migrations/versions/ea60480f82bb_add_membermodel_member_num.py +32 -0
  403. dstack/_internal/server/migrations/versions/ec02a26a256c_add_runmodel_next_triggered_at.py +38 -0
  404. dstack/_internal/server/migrations/versions/ed0ca30e13bb_migrate_instancestatus_provisioning.py +29 -0
  405. dstack/_internal/server/migrations/versions/fe72c4de8376_add_gateways.py +81 -0
  406. dstack/_internal/server/migrations/versions/ff1d94f65b08_user_ssh_key.py +34 -0
  407. dstack/_internal/server/migrations/versions/ffa99edd1988_add_jobterminationreason_max_duration_.py +81 -0
  408. dstack/_internal/server/models.py +930 -0
  409. dstack/_internal/server/routers/__init__.py +0 -0
  410. dstack/_internal/server/routers/auth.py +34 -0
  411. dstack/_internal/server/routers/backends.py +142 -0
  412. dstack/_internal/server/routers/events.py +60 -0
  413. dstack/_internal/server/routers/files.py +68 -0
  414. dstack/_internal/server/routers/fleets.py +202 -0
  415. dstack/_internal/server/routers/gateways.py +109 -0
  416. dstack/_internal/server/routers/gpus.py +32 -0
  417. dstack/_internal/server/routers/instances.py +77 -0
  418. dstack/_internal/server/routers/logs.py +34 -0
  419. dstack/_internal/server/routers/metrics.py +82 -0
  420. dstack/_internal/server/routers/projects.py +205 -0
  421. dstack/_internal/server/routers/prometheus.py +35 -0
  422. dstack/_internal/server/routers/repos.py +118 -0
  423. dstack/_internal/server/routers/runs.py +216 -0
  424. dstack/_internal/server/routers/secrets.py +86 -0
  425. dstack/_internal/server/routers/server.py +19 -0
  426. dstack/_internal/server/routers/users.py +158 -0
  427. dstack/_internal/server/routers/volumes.py +122 -0
  428. dstack/_internal/server/schemas/__init__.py +0 -0
  429. dstack/_internal/server/schemas/auth.py +83 -0
  430. dstack/_internal/server/schemas/backends.py +16 -0
  431. dstack/_internal/server/schemas/common.py +9 -0
  432. dstack/_internal/server/schemas/events.py +211 -0
  433. dstack/_internal/server/schemas/files.py +5 -0
  434. dstack/_internal/server/schemas/fleets.py +49 -0
  435. dstack/_internal/server/schemas/gateways.py +31 -0
  436. dstack/_internal/server/schemas/gpus.py +26 -0
  437. dstack/_internal/server/schemas/health/__init__.py +0 -0
  438. dstack/_internal/server/schemas/health/dcgm.py +56 -0
  439. dstack/_internal/server/schemas/instances.py +47 -0
  440. dstack/_internal/server/schemas/logs.py +17 -0
  441. dstack/_internal/server/schemas/projects.py +81 -0
  442. dstack/_internal/server/schemas/repos.py +24 -0
  443. dstack/_internal/server/schemas/runner.py +269 -0
  444. dstack/_internal/server/schemas/runs.py +66 -0
  445. dstack/_internal/server/schemas/secrets.py +16 -0
  446. dstack/_internal/server/schemas/users.py +72 -0
  447. dstack/_internal/server/schemas/volumes.py +29 -0
  448. dstack/_internal/server/security/__init__.py +0 -0
  449. dstack/_internal/server/security/permissions.py +251 -0
  450. dstack/_internal/server/services/__init__.py +0 -0
  451. dstack/_internal/server/services/auth.py +77 -0
  452. dstack/_internal/server/services/backends/__init__.py +404 -0
  453. dstack/_internal/server/services/backends/handlers.py +105 -0
  454. dstack/_internal/server/services/compute_groups.py +22 -0
  455. dstack/_internal/server/services/config.py +279 -0
  456. dstack/_internal/server/services/docker.py +162 -0
  457. dstack/_internal/server/services/encryption/__init__.py +102 -0
  458. dstack/_internal/server/services/encryption/keys/__init__.py +0 -0
  459. dstack/_internal/server/services/encryption/keys/aes.py +68 -0
  460. dstack/_internal/server/services/encryption/keys/base.py +19 -0
  461. dstack/_internal/server/services/encryption/keys/identity.py +28 -0
  462. dstack/_internal/server/services/events.py +477 -0
  463. dstack/_internal/server/services/files.py +91 -0
  464. dstack/_internal/server/services/fleets.py +1224 -0
  465. dstack/_internal/server/services/gateways/__init__.py +686 -0
  466. dstack/_internal/server/services/gateways/client.py +209 -0
  467. dstack/_internal/server/services/gateways/connection.py +139 -0
  468. dstack/_internal/server/services/gateways/pool.py +58 -0
  469. dstack/_internal/server/services/gpus.py +387 -0
  470. dstack/_internal/server/services/instances.py +731 -0
  471. dstack/_internal/server/services/jobs/__init__.py +840 -0
  472. dstack/_internal/server/services/jobs/configurators/__init__.py +0 -0
  473. dstack/_internal/server/services/jobs/configurators/base.py +469 -0
  474. dstack/_internal/server/services/jobs/configurators/dev.py +69 -0
  475. dstack/_internal/server/services/jobs/configurators/extensions/__init__.py +0 -0
  476. dstack/_internal/server/services/jobs/configurators/extensions/base.py +15 -0
  477. dstack/_internal/server/services/jobs/configurators/extensions/cursor.py +42 -0
  478. dstack/_internal/server/services/jobs/configurators/extensions/vscode.py +42 -0
  479. dstack/_internal/server/services/jobs/configurators/extensions/windsurf.py +43 -0
  480. dstack/_internal/server/services/jobs/configurators/service.py +28 -0
  481. dstack/_internal/server/services/jobs/configurators/task.py +39 -0
  482. dstack/_internal/server/services/locking.py +187 -0
  483. dstack/_internal/server/services/logging.py +29 -0
  484. dstack/_internal/server/services/logs/__init__.py +122 -0
  485. dstack/_internal/server/services/logs/aws.py +373 -0
  486. dstack/_internal/server/services/logs/base.py +47 -0
  487. dstack/_internal/server/services/logs/filelog.py +261 -0
  488. dstack/_internal/server/services/logs/fluentbit.py +329 -0
  489. dstack/_internal/server/services/logs/gcp.py +181 -0
  490. dstack/_internal/server/services/metrics.py +172 -0
  491. dstack/_internal/server/services/offers.py +249 -0
  492. dstack/_internal/server/services/permissions.py +37 -0
  493. dstack/_internal/server/services/placement.py +234 -0
  494. dstack/_internal/server/services/plugins.py +109 -0
  495. dstack/_internal/server/services/probes.py +10 -0
  496. dstack/_internal/server/services/projects.py +835 -0
  497. dstack/_internal/server/services/prometheus/__init__.py +0 -0
  498. dstack/_internal/server/services/prometheus/client_metrics.py +55 -0
  499. dstack/_internal/server/services/prometheus/custom_metrics.py +327 -0
  500. dstack/_internal/server/services/proxy/__init__.py +3 -0
  501. dstack/_internal/server/services/proxy/auth.py +12 -0
  502. dstack/_internal/server/services/proxy/deps.py +18 -0
  503. dstack/_internal/server/services/proxy/repo.py +189 -0
  504. dstack/_internal/server/services/proxy/routers/__init__.py +0 -0
  505. dstack/_internal/server/services/proxy/routers/service_proxy.py +49 -0
  506. dstack/_internal/server/services/proxy/services/__init__.py +0 -0
  507. dstack/_internal/server/services/proxy/services/service_proxy.py +135 -0
  508. dstack/_internal/server/services/repos.py +362 -0
  509. dstack/_internal/server/services/requirements/__init__.py +0 -0
  510. dstack/_internal/server/services/requirements/combine.py +260 -0
  511. dstack/_internal/server/services/resources.py +21 -0
  512. dstack/_internal/server/services/runner/__init__.py +0 -0
  513. dstack/_internal/server/services/runner/client.py +646 -0
  514. dstack/_internal/server/services/runner/ssh.py +128 -0
  515. dstack/_internal/server/services/runs/__init__.py +1026 -0
  516. dstack/_internal/server/services/runs/plan.py +703 -0
  517. dstack/_internal/server/services/runs/replicas.py +317 -0
  518. dstack/_internal/server/services/runs/spec.py +191 -0
  519. dstack/_internal/server/services/secrets.py +245 -0
  520. dstack/_internal/server/services/services/__init__.py +345 -0
  521. dstack/_internal/server/services/services/autoscalers.py +140 -0
  522. dstack/_internal/server/services/services/options.py +53 -0
  523. dstack/_internal/server/services/ssh.py +67 -0
  524. dstack/_internal/server/services/storage/__init__.py +37 -0
  525. dstack/_internal/server/services/storage/base.py +48 -0
  526. dstack/_internal/server/services/storage/gcs.py +66 -0
  527. dstack/_internal/server/services/storage/s3.py +69 -0
  528. dstack/_internal/server/services/users.py +461 -0
  529. dstack/_internal/server/services/volumes.py +496 -0
  530. dstack/_internal/server/settings.py +161 -0
  531. dstack/_internal/server/statics/00a6e1fb461ed2929fb9.png +0 -0
  532. dstack/_internal/server/statics/0cae4d9f0a36034984a7.png +0 -0
  533. dstack/_internal/server/statics/391de232cc0e30cae513.png +0 -0
  534. dstack/_internal/server/statics/4e0eead8c1a73689ef9d.svg +1 -0
  535. dstack/_internal/server/statics/544afa2f63428c2235b0.png +0 -0
  536. dstack/_internal/server/statics/54a4f50f74c6b9381530.svg +7 -0
  537. dstack/_internal/server/statics/68dd1360a7d2611e0132.svg +4 -0
  538. dstack/_internal/server/statics/69544b4c81973b54a66f.png +0 -0
  539. dstack/_internal/server/statics/77a8b02b17af19e39266.png +0 -0
  540. dstack/_internal/server/statics/83a93a8871c219104367.svg +9 -0
  541. dstack/_internal/server/statics/8f28bb8e9999e5e6a48b.svg +4 -0
  542. dstack/_internal/server/statics/9124086961ab8c366bc4.svg +9 -0
  543. dstack/_internal/server/statics/9a9ebaeb54b025dbac0a.svg +5 -0
  544. dstack/_internal/server/statics/a3428392dc534f3b15c4.svg +7 -0
  545. dstack/_internal/server/statics/ae22625574d69361f72c.png +0 -0
  546. dstack/_internal/server/statics/assets/android-chrome-144x144.png +0 -0
  547. dstack/_internal/server/statics/assets/android-chrome-192x192.png +0 -0
  548. dstack/_internal/server/statics/assets/android-chrome-256x256.png +0 -0
  549. dstack/_internal/server/statics/assets/android-chrome-36x36.png +0 -0
  550. dstack/_internal/server/statics/assets/android-chrome-384x384.png +0 -0
  551. dstack/_internal/server/statics/assets/android-chrome-48x48.png +0 -0
  552. dstack/_internal/server/statics/assets/android-chrome-512x512.png +0 -0
  553. dstack/_internal/server/statics/assets/android-chrome-72x72.png +0 -0
  554. dstack/_internal/server/statics/assets/android-chrome-96x96.png +0 -0
  555. dstack/_internal/server/statics/assets/apple-touch-icon-1024x1024.png +0 -0
  556. dstack/_internal/server/statics/assets/apple-touch-icon-114x114.png +0 -0
  557. dstack/_internal/server/statics/assets/apple-touch-icon-120x120.png +0 -0
  558. dstack/_internal/server/statics/assets/apple-touch-icon-144x144.png +0 -0
  559. dstack/_internal/server/statics/assets/apple-touch-icon-152x152.png +0 -0
  560. dstack/_internal/server/statics/assets/apple-touch-icon-167x167.png +0 -0
  561. dstack/_internal/server/statics/assets/apple-touch-icon-180x180.png +0 -0
  562. dstack/_internal/server/statics/assets/apple-touch-icon-57x57.png +0 -0
  563. dstack/_internal/server/statics/assets/apple-touch-icon-60x60.png +0 -0
  564. dstack/_internal/server/statics/assets/apple-touch-icon-72x72.png +0 -0
  565. dstack/_internal/server/statics/assets/apple-touch-icon-76x76.png +0 -0
  566. dstack/_internal/server/statics/assets/apple-touch-icon-precomposed.png +0 -0
  567. dstack/_internal/server/statics/assets/apple-touch-icon.png +0 -0
  568. dstack/_internal/server/statics/assets/apple-touch-startup-image-1125x2436.png +0 -0
  569. dstack/_internal/server/statics/assets/apple-touch-startup-image-1136x640.png +0 -0
  570. dstack/_internal/server/statics/assets/apple-touch-startup-image-1170x2532.png +0 -0
  571. dstack/_internal/server/statics/assets/apple-touch-startup-image-1179x2556.png +0 -0
  572. dstack/_internal/server/statics/assets/apple-touch-startup-image-1242x2208.png +0 -0
  573. dstack/_internal/server/statics/assets/apple-touch-startup-image-1242x2688.png +0 -0
  574. dstack/_internal/server/statics/assets/apple-touch-startup-image-1284x2778.png +0 -0
  575. dstack/_internal/server/statics/assets/apple-touch-startup-image-1290x2796.png +0 -0
  576. dstack/_internal/server/statics/assets/apple-touch-startup-image-1334x750.png +0 -0
  577. dstack/_internal/server/statics/assets/apple-touch-startup-image-1488x2266.png +0 -0
  578. dstack/_internal/server/statics/assets/apple-touch-startup-image-1536x2048.png +0 -0
  579. dstack/_internal/server/statics/assets/apple-touch-startup-image-1620x2160.png +0 -0
  580. dstack/_internal/server/statics/assets/apple-touch-startup-image-1640x2160.png +0 -0
  581. dstack/_internal/server/statics/assets/apple-touch-startup-image-1668x2224.png +0 -0
  582. dstack/_internal/server/statics/assets/apple-touch-startup-image-1668x2388.png +0 -0
  583. dstack/_internal/server/statics/assets/apple-touch-startup-image-1792x828.png +0 -0
  584. dstack/_internal/server/statics/assets/apple-touch-startup-image-2048x1536.png +0 -0
  585. dstack/_internal/server/statics/assets/apple-touch-startup-image-2048x2732.png +0 -0
  586. dstack/_internal/server/statics/assets/apple-touch-startup-image-2160x1620.png +0 -0
  587. dstack/_internal/server/statics/assets/apple-touch-startup-image-2160x1640.png +0 -0
  588. dstack/_internal/server/statics/assets/apple-touch-startup-image-2208x1242.png +0 -0
  589. dstack/_internal/server/statics/assets/apple-touch-startup-image-2224x1668.png +0 -0
  590. dstack/_internal/server/statics/assets/apple-touch-startup-image-2266x1488.png +0 -0
  591. dstack/_internal/server/statics/assets/apple-touch-startup-image-2388x1668.png +0 -0
  592. dstack/_internal/server/statics/assets/apple-touch-startup-image-2436x1125.png +0 -0
  593. dstack/_internal/server/statics/assets/apple-touch-startup-image-2532x1170.png +0 -0
  594. dstack/_internal/server/statics/assets/apple-touch-startup-image-2556x1179.png +0 -0
  595. dstack/_internal/server/statics/assets/apple-touch-startup-image-2688x1242.png +0 -0
  596. dstack/_internal/server/statics/assets/apple-touch-startup-image-2732x2048.png +0 -0
  597. dstack/_internal/server/statics/assets/apple-touch-startup-image-2778x1284.png +0 -0
  598. dstack/_internal/server/statics/assets/apple-touch-startup-image-2796x1290.png +0 -0
  599. dstack/_internal/server/statics/assets/apple-touch-startup-image-640x1136.png +0 -0
  600. dstack/_internal/server/statics/assets/apple-touch-startup-image-750x1334.png +0 -0
  601. dstack/_internal/server/statics/assets/apple-touch-startup-image-828x1792.png +0 -0
  602. dstack/_internal/server/statics/assets/browserconfig.xml +12 -0
  603. dstack/_internal/server/statics/assets/favicon-16x16.png +0 -0
  604. dstack/_internal/server/statics/assets/favicon-32x32.png +0 -0
  605. dstack/_internal/server/statics/assets/favicon-48x48.png +0 -0
  606. dstack/_internal/server/statics/assets/favicon.ico +0 -0
  607. dstack/{dashboard/statics/assets/manifest.json → _internal/server/statics/assets/manifest.webmanifest} +18 -9
  608. dstack/_internal/server/statics/assets/mstile-144x144.png +0 -0
  609. dstack/_internal/server/statics/assets/mstile-150x150.png +0 -0
  610. dstack/_internal/server/statics/assets/mstile-310x150.png +0 -0
  611. dstack/_internal/server/statics/assets/mstile-310x310.png +0 -0
  612. dstack/_internal/server/statics/assets/mstile-70x70.png +0 -0
  613. dstack/_internal/server/statics/assets/yandex-browser-50x50.png +0 -0
  614. dstack/_internal/server/statics/b7ae68f44193474fc578.png +0 -0
  615. dstack/_internal/server/statics/d2f008c75b2b5b191f3f.png +0 -0
  616. dstack/_internal/server/statics/d44c33e1b92e05c379fd.png +0 -0
  617. dstack/_internal/server/statics/dd43ff0552815179d7ab.png +0 -0
  618. dstack/_internal/server/statics/dd4e7166c0b9aac197d7.png +0 -0
  619. dstack/_internal/server/statics/e30b27916930d43d2271.png +0 -0
  620. dstack/_internal/server/statics/e467d7d60aae81ab198b.svg +6 -0
  621. dstack/_internal/server/statics/eb9b344b73818fe2b71a.png +0 -0
  622. dstack/_internal/server/statics/f517dd626eb964120de0.png +0 -0
  623. dstack/_internal/server/statics/f958aecddee5d8e3222c.png +0 -0
  624. dstack/_internal/server/statics/index.html +3 -0
  625. dstack/_internal/server/statics/logo-notext.svg +116 -0
  626. dstack/_internal/server/statics/main-2e6967bad9f29395eea6.css +3 -0
  627. dstack/_internal/server/statics/main-7dc0f6d20b8b41659acc.js +155547 -0
  628. dstack/_internal/server/statics/main-7dc0f6d20b8b41659acc.js.map +1 -0
  629. dstack/{dashboard → _internal/server}/statics/manifest.json +2 -2
  630. dstack/_internal/server/statics/static/media/entraID.d65d1f3e9486a8e56d24fc07b3230885.svg +9 -0
  631. dstack/_internal/server/statics/static/media/google.b194b06fafd0a52aeb566922160ea514.svg +1 -0
  632. dstack/{dashboard/statics/static/media/logo.f9d7170678f68f796e270698633770ec.svg → _internal/server/statics/static/media/logo.f602feeb138844eda97c8cb641461448.svg} +8 -6
  633. dstack/_internal/server/statics/static/media/okta.12f178e6873a1100965f2a4dbd18fcec.svg +2 -0
  634. dstack/_internal/server/statics/static/media/theme.3994c817bb7dda191c1c9640dee0bf42.svg +3 -0
  635. dstack/_internal/server/testing/__init__.py +0 -0
  636. dstack/_internal/server/testing/common.py +1220 -0
  637. dstack/_internal/server/testing/conf.py +53 -0
  638. dstack/_internal/server/testing/matchers.py +31 -0
  639. dstack/_internal/server/utils/__init__.py +0 -0
  640. dstack/_internal/server/utils/common.py +55 -0
  641. dstack/_internal/server/utils/logging.py +51 -0
  642. dstack/_internal/server/utils/provisioning.py +368 -0
  643. dstack/_internal/server/utils/routers.py +166 -0
  644. dstack/_internal/server/utils/sentry_utils.py +24 -0
  645. dstack/_internal/settings.py +49 -0
  646. dstack/_internal/utils/__init__.py +0 -0
  647. dstack/_internal/utils/common.py +318 -0
  648. dstack/_internal/utils/cron.py +5 -0
  649. dstack/_internal/utils/crypto.py +40 -0
  650. dstack/_internal/utils/env.py +88 -0
  651. dstack/_internal/utils/event_loop.py +30 -0
  652. dstack/_internal/utils/files.py +69 -0
  653. dstack/_internal/utils/gpu.py +59 -0
  654. dstack/_internal/utils/hash.py +31 -0
  655. dstack/_internal/utils/interpolator.py +91 -0
  656. dstack/_internal/utils/json_schema.py +11 -0
  657. dstack/_internal/utils/json_utils.py +54 -0
  658. dstack/_internal/utils/logging.py +5 -0
  659. dstack/_internal/utils/nested_list.py +47 -0
  660. dstack/_internal/utils/network.py +50 -0
  661. dstack/_internal/utils/path.py +57 -0
  662. dstack/_internal/utils/random_names.py +258 -0
  663. dstack/_internal/utils/ssh.py +346 -0
  664. dstack/_internal/utils/tags.py +42 -0
  665. dstack/_internal/utils/typing.py +14 -0
  666. dstack/_internal/utils/version.py +22 -0
  667. dstack/api/__init__.py +46 -0
  668. dstack/api/_public/__init__.py +96 -0
  669. dstack/api/_public/backends.py +42 -0
  670. dstack/api/_public/common.py +5 -0
  671. dstack/api/_public/repos.py +202 -0
  672. dstack/api/_public/runs.py +714 -0
  673. dstack/api/server/__init__.py +206 -0
  674. dstack/api/server/_auth.py +30 -0
  675. dstack/api/server/_backends.py +38 -0
  676. dstack/api/server/_events.py +64 -0
  677. dstack/api/server/_files.py +18 -0
  678. dstack/api/server/_fleets.py +82 -0
  679. dstack/api/server/_gateways.py +54 -0
  680. dstack/api/server/_gpus.py +27 -0
  681. dstack/api/server/_group.py +22 -0
  682. dstack/api/server/_logs.py +15 -0
  683. dstack/api/server/_metrics.py +23 -0
  684. dstack/api/server/_projects.py +124 -0
  685. dstack/api/server/_repos.py +64 -0
  686. dstack/api/server/_runs.py +102 -0
  687. dstack/api/server/_secrets.py +36 -0
  688. dstack/api/server/_users.py +82 -0
  689. dstack/api/server/_volumes.py +39 -0
  690. dstack/api/server/utils.py +34 -0
  691. dstack/api/utils.py +105 -0
  692. dstack/core/__init__.py +0 -0
  693. dstack/plugins/__init__.py +8 -0
  694. dstack/plugins/_base.py +72 -0
  695. dstack/plugins/_models.py +8 -0
  696. dstack/plugins/_utils.py +19 -0
  697. dstack/plugins/builtin/__init__.py +0 -0
  698. dstack/plugins/builtin/rest_plugin/__init__.py +18 -0
  699. dstack/plugins/builtin/rest_plugin/_models.py +48 -0
  700. dstack/plugins/builtin/rest_plugin/_plugin.py +147 -0
  701. dstack/version.py +3 -1
  702. dstack-0.20.7.dist-info/METADATA +519 -0
  703. dstack-0.20.7.dist-info/RECORD +720 -0
  704. {dstack-0.0.9.dist-info → dstack-0.20.7.dist-info}/WHEEL +1 -2
  705. dstack-0.20.7.dist-info/entry_points.txt +2 -0
  706. dstack-0.20.7.dist-info/licenses/LICENSE.md +353 -0
  707. dstack/aws/__init__.py +0 -180
  708. dstack/aws/artifacts.py +0 -111
  709. dstack/aws/config.py +0 -40
  710. dstack/aws/jobs.py +0 -245
  711. dstack/aws/logs.py +0 -186
  712. dstack/aws/repos.py +0 -137
  713. dstack/aws/run_names.py +0 -17
  714. dstack/aws/runners.py +0 -693
  715. dstack/aws/runs.py +0 -79
  716. dstack/aws/secrets.py +0 -99
  717. dstack/aws/tags.py +0 -138
  718. dstack/backend.py +0 -299
  719. dstack/cli/app.py +0 -41
  720. dstack/cli/artifacts.py +0 -87
  721. dstack/cli/common.py +0 -57
  722. dstack/cli/config.py +0 -194
  723. dstack/cli/dashboard.py +0 -26
  724. dstack/cli/delete.py +0 -49
  725. dstack/cli/init.py +0 -33
  726. dstack/cli/logs.py +0 -87
  727. dstack/cli/main.py +0 -81
  728. dstack/cli/restart.py +0 -43
  729. dstack/cli/run.py +0 -223
  730. dstack/cli/schema.py +0 -46
  731. dstack/cli/secrets.py +0 -97
  732. dstack/cli/status.py +0 -140
  733. dstack/cli/stop.py +0 -53
  734. dstack/cli/tags.py +0 -100
  735. dstack/config.py +0 -80
  736. dstack/dashboard/artifacts.py +0 -26
  737. dstack/dashboard/logs.py +0 -73
  738. dstack/dashboard/main.py +0 -45
  739. dstack/dashboard/repos.py +0 -41
  740. dstack/dashboard/runs.py +0 -140
  741. dstack/dashboard/secrets.py +0 -53
  742. dstack/dashboard/statics/4d6a4e032505c1efd23c.png +0 -0
  743. dstack/dashboard/statics/7e018c3e5566d7c349a8.png +0 -0
  744. dstack/dashboard/statics/assets/android-chrome-144x144.png +0 -0
  745. dstack/dashboard/statics/assets/android-chrome-192x192.png +0 -0
  746. dstack/dashboard/statics/assets/android-chrome-256x256.png +0 -0
  747. dstack/dashboard/statics/assets/android-chrome-36x36.png +0 -0
  748. dstack/dashboard/statics/assets/android-chrome-384x384.png +0 -0
  749. dstack/dashboard/statics/assets/android-chrome-48x48.png +0 -0
  750. dstack/dashboard/statics/assets/android-chrome-512x512.png +0 -0
  751. dstack/dashboard/statics/assets/android-chrome-72x72.png +0 -0
  752. dstack/dashboard/statics/assets/android-chrome-96x96.png +0 -0
  753. dstack/dashboard/statics/assets/apple-touch-icon-1024x1024.png +0 -0
  754. dstack/dashboard/statics/assets/apple-touch-icon-114x114.png +0 -0
  755. dstack/dashboard/statics/assets/apple-touch-icon-120x120.png +0 -0
  756. dstack/dashboard/statics/assets/apple-touch-icon-144x144.png +0 -0
  757. dstack/dashboard/statics/assets/apple-touch-icon-152x152.png +0 -0
  758. dstack/dashboard/statics/assets/apple-touch-icon-167x167.png +0 -0
  759. dstack/dashboard/statics/assets/apple-touch-icon-180x180.png +0 -0
  760. dstack/dashboard/statics/assets/apple-touch-icon-57x57.png +0 -0
  761. dstack/dashboard/statics/assets/apple-touch-icon-60x60.png +0 -0
  762. dstack/dashboard/statics/assets/apple-touch-icon-72x72.png +0 -0
  763. dstack/dashboard/statics/assets/apple-touch-icon-76x76.png +0 -0
  764. dstack/dashboard/statics/assets/apple-touch-icon-precomposed.png +0 -0
  765. dstack/dashboard/statics/assets/apple-touch-icon.png +0 -0
  766. dstack/dashboard/statics/assets/apple-touch-startup-image-1125x2436.png +0 -0
  767. dstack/dashboard/statics/assets/apple-touch-startup-image-1136x640.png +0 -0
  768. dstack/dashboard/statics/assets/apple-touch-startup-image-1242x2208.png +0 -0
  769. dstack/dashboard/statics/assets/apple-touch-startup-image-1242x2688.png +0 -0
  770. dstack/dashboard/statics/assets/apple-touch-startup-image-1334x750.png +0 -0
  771. dstack/dashboard/statics/assets/apple-touch-startup-image-1536x2048.png +0 -0
  772. dstack/dashboard/statics/assets/apple-touch-startup-image-1620x2160.png +0 -0
  773. dstack/dashboard/statics/assets/apple-touch-startup-image-1668x2224.png +0 -0
  774. dstack/dashboard/statics/assets/apple-touch-startup-image-1668x2388.png +0 -0
  775. dstack/dashboard/statics/assets/apple-touch-startup-image-1792x828.png +0 -0
  776. dstack/dashboard/statics/assets/apple-touch-startup-image-2048x1536.png +0 -0
  777. dstack/dashboard/statics/assets/apple-touch-startup-image-2048x2732.png +0 -0
  778. dstack/dashboard/statics/assets/apple-touch-startup-image-2160x1620.png +0 -0
  779. dstack/dashboard/statics/assets/apple-touch-startup-image-2208x1242.png +0 -0
  780. dstack/dashboard/statics/assets/apple-touch-startup-image-2224x1668.png +0 -0
  781. dstack/dashboard/statics/assets/apple-touch-startup-image-2388x1668.png +0 -0
  782. dstack/dashboard/statics/assets/apple-touch-startup-image-2436x1125.png +0 -0
  783. dstack/dashboard/statics/assets/apple-touch-startup-image-2688x1242.png +0 -0
  784. dstack/dashboard/statics/assets/apple-touch-startup-image-2732x2048.png +0 -0
  785. dstack/dashboard/statics/assets/apple-touch-startup-image-640x1136.png +0 -0
  786. dstack/dashboard/statics/assets/apple-touch-startup-image-750x1334.png +0 -0
  787. dstack/dashboard/statics/assets/apple-touch-startup-image-828x1792.png +0 -0
  788. dstack/dashboard/statics/assets/browserconfig.xml +0 -15
  789. dstack/dashboard/statics/assets/coast-228x228.png +0 -0
  790. dstack/dashboard/statics/assets/favicon-16x16.png +0 -0
  791. dstack/dashboard/statics/assets/favicon-32x32.png +0 -0
  792. dstack/dashboard/statics/assets/favicon-48x48.png +0 -0
  793. dstack/dashboard/statics/assets/favicon.ico +0 -0
  794. dstack/dashboard/statics/assets/firefox_app_128x128.png +0 -0
  795. dstack/dashboard/statics/assets/firefox_app_512x512.png +0 -0
  796. dstack/dashboard/statics/assets/firefox_app_60x60.png +0 -0
  797. dstack/dashboard/statics/assets/manifest.webapp +0 -14
  798. dstack/dashboard/statics/assets/mstile-144x144.png +0 -0
  799. dstack/dashboard/statics/assets/mstile-150x150.png +0 -0
  800. dstack/dashboard/statics/assets/mstile-310x150.png +0 -0
  801. dstack/dashboard/statics/assets/mstile-310x310.png +0 -0
  802. dstack/dashboard/statics/assets/mstile-70x70.png +0 -0
  803. dstack/dashboard/statics/assets/yandex-browser-50x50.png +0 -0
  804. dstack/dashboard/statics/d0f71e48806e25d72553.png +0 -0
  805. dstack/dashboard/statics/index.html +0 -7
  806. dstack/dashboard/statics/main-1d87e34eb0454da8ebb4.js +0 -3
  807. dstack/dashboard/statics/main-1d87e34eb0454da8ebb4.js.LICENSE.txt +0 -102
  808. dstack/dashboard/statics/main-1d87e34eb0454da8ebb4.js.map +0 -1
  809. dstack/dashboard/statics/main.css +0 -5058
  810. dstack/dashboard/statics/splash_thumbnail.png +0 -0
  811. dstack/dashboard/statics/static/media/check.3f68ffc787a15c0476793a6d18ecb71a.svg +0 -3
  812. dstack/dashboard/statics/static/media/chevron-down.bfd8f22c4a5db4d443e76bca3b02f334.svg +0 -3
  813. dstack/dashboard/statics/static/media/chevron-up.bade0c5d82d741cead615813264140c9.svg +0 -3
  814. dstack/dashboard/statics/static/media/clock.583b744f29b9d143718a55e7c35fe38e.svg +0 -3
  815. dstack/dashboard/statics/static/media/close.a8bb9e47361b03a3b5084dad676ba1da.svg +0 -3
  816. dstack/dashboard/statics/static/media/content-copy.73f5f2a175094757758e315243a4111e.svg +0 -3
  817. dstack/dashboard/statics/static/media/delete-outline.6a8abf4e4f9cb777781967efd56efe9b.svg +0 -3
  818. dstack/dashboard/statics/static/media/dots-vertical.82fc618192e0c7dc4d615ff93269246a.svg +0 -3
  819. dstack/dashboard/statics/static/media/earth.1ad57c7f59f4be5c8bb2fa00439c3149.svg +0 -3
  820. dstack/dashboard/statics/static/media/email.320bc3af24a5f1bb41ebd85f66a5dd70.svg +0 -3
  821. dstack/dashboard/statics/static/media/external-link.99b88e699c15afb820a1779d9a2261ed.svg +0 -3
  822. dstack/dashboard/statics/static/media/eye-off-outline.5b4afb7ad624a44dd307518ff93d1faa.svg +0 -3
  823. dstack/dashboard/statics/static/media/eye-outline.ca41708feaaed1edb15c5fff021fbafe.svg +0 -3
  824. dstack/dashboard/statics/static/media/file-download-outline.3634b41923ba79b297ff294ef898661c.svg +0 -3
  825. dstack/dashboard/statics/static/media/folder-outline.33378387af61821dd1207e4b2d061a07.svg +0 -3
  826. dstack/dashboard/statics/static/media/github-circle.1bb85d171c31a3c2eebad07319377171.svg +0 -3
  827. dstack/dashboard/statics/static/media/infinity.915f92939afc0a37f94adba211ceb172.svg +0 -3
  828. dstack/dashboard/statics/static/media/layers.b4b02cea267a617d7aa44c2719250c89.svg +0 -3
  829. dstack/dashboard/statics/static/media/linkedin.1c52fae553eee54397f0e63a79455a5e.svg +0 -3
  830. dstack/dashboard/statics/static/media/loading.e466be7b2c1f0ac9e7e51ca929d0e37d.svg +0 -3
  831. dstack/dashboard/statics/static/media/lock.4a4c7768d0fa60c716609ddc483470ef.svg +0 -3
  832. dstack/dashboard/statics/static/media/magnify.0c803314d039d21f3cb1504ccd1437a4.svg +0 -3
  833. dstack/dashboard/statics/static/media/mark.3f68ffc787a15c0476793a6d18ecb71a.svg +0 -3
  834. dstack/dashboard/statics/static/media/menu-close.3ee84714181017c6ff837830297c8437.svg +0 -3
  835. dstack/dashboard/statics/static/media/menu.922f81e0972fbcbb5adcd8def20c86a3.svg +0 -3
  836. dstack/dashboard/statics/static/media/pencil.f706a3b9dcbff4959a91bf72e1e6324f.svg +0 -3
  837. dstack/dashboard/statics/static/media/refresh.a80edb948e98b322cd73b67814a57a48.svg +0 -3
  838. dstack/dashboard/statics/static/media/shape-plus.63b093c7f4b44c3def774f30fcfbceca.svg +0 -3
  839. dstack/dashboard/statics/static/media/slack.ec2fca99c6b944950ac65404ddd26880.svg +0 -4
  840. dstack/dashboard/statics/static/media/small-logo.b9cc8d09f646a553e65fa336dafd8b10.svg +0 -116
  841. dstack/dashboard/statics/static/media/source-branch.b8d22cfc42a7bed81f0fc08130818e85.svg +0 -3
  842. dstack/dashboard/statics/static/media/source-commit.be2bb53c081b9b6836adffccc0b8d3e6.svg +0 -3
  843. dstack/dashboard/statics/static/media/stop.11488ff1437ad929476be8924a3b7075.svg +0 -3
  844. dstack/dashboard/statics/static/media/tag-minus.15680a815b0b8d027e973c84832c05e6.svg +0 -3
  845. dstack/dashboard/statics/static/media/tag-outline.19b0bf86a8afd7d6d9c716e9a91d94ca.svg +0 -3
  846. dstack/dashboard/statics/static/media/twitter.4af18861c84a2f3044c7546b55d5739c.svg +0 -3
  847. dstack/dashboard/tags.py +0 -119
  848. dstack/jobs.py +0 -255
  849. dstack/providers/__init__.py +0 -316
  850. dstack/providers/_python/main.py +0 -88
  851. dstack/providers/_tensorboard/main.py +0 -93
  852. dstack/providers/_torchrun/main.py +0 -121
  853. dstack/providers/bash/main.py +0 -90
  854. dstack/providers/code/main.py +0 -95
  855. dstack/providers/docker/main.py +0 -79
  856. dstack/providers/lab/main.py +0 -95
  857. dstack/providers/notebook/main.py +0 -90
  858. dstack/random_name.py +0 -29
  859. dstack/repo.py +0 -135
  860. dstack/runners.py +0 -35
  861. dstack/util.py +0 -15
  862. dstack-0.0.9.dist-info/METADATA +0 -176
  863. dstack-0.0.9.dist-info/RECORD +0 -179
  864. dstack-0.0.9.dist-info/entry_points.txt +0 -3
  865. dstack-0.0.9.dist-info/top_level.txt +0 -2
  866. tests/test_config.py +0 -70
  867. /dstack/{cli → _internal}/__init__.py +0 -0
  868. /dstack/{dashboard → _internal/cli}/__init__.py +0 -0
  869. /dstack/{providers/_python → _internal/cli/models}/__init__.py +0 -0
  870. /dstack/{providers/_tensorboard → _internal/cli/services}/__init__.py +0 -0
  871. /dstack/{providers/_torchrun → _internal/cli/utils}/__init__.py +0 -0
  872. /dstack/{providers/bash → _internal/core}/__init__.py +0 -0
  873. /dstack/{providers/code → _internal/core/backends}/__init__.py +0 -0
  874. /dstack/{providers/docker → _internal/core/backends/aws}/__init__.py +0 -0
  875. /dstack/{providers/lab → _internal/core/backends/azure}/__init__.py +0 -0
  876. /dstack/{providers/notebook → _internal/core/backends/base}/__init__.py +0 -0
  877. {tests → dstack/_internal/core/backends/cloudrift}/__init__.py +0 -0
  878. /dstack/{dashboard → _internal/server}/statics/assets/yandex-browser-manifest.json +0 -0
  879. /dstack/{dashboard → _internal/server}/statics/robots.txt +0 -0
@@ -0,0 +1,1238 @@
1
+ import asyncio
2
+ import re
3
+ import uuid
4
+ from collections.abc import Iterable
5
+ from datetime import timedelta
6
+ from typing import Dict, List, Optional
7
+
8
+ from sqlalchemy import and_, func, select
9
+ from sqlalchemy.ext.asyncio import AsyncSession
10
+ from sqlalchemy.orm import aliased, contains_eager, joinedload, load_only
11
+
12
+ from dstack._internal import settings
13
+ from dstack._internal.core.consts import DSTACK_RUNNER_HTTP_PORT, DSTACK_SHIM_HTTP_PORT
14
+ from dstack._internal.core.errors import GatewayError
15
+ from dstack._internal.core.models.backends.base import BackendType
16
+ from dstack._internal.core.models.common import NetworkMode, RegistryAuth
17
+ from dstack._internal.core.models.configurations import DevEnvironmentConfiguration
18
+ from dstack._internal.core.models.files import FileArchiveMapping
19
+ from dstack._internal.core.models.instances import (
20
+ InstanceStatus,
21
+ RemoteConnectionInfo,
22
+ SSHConnectionParams,
23
+ )
24
+ from dstack._internal.core.models.metrics import Metric
25
+ from dstack._internal.core.models.profiles import StartupOrder
26
+ from dstack._internal.core.models.repos import RemoteRepoCreds
27
+ from dstack._internal.core.models.runs import (
28
+ ClusterInfo,
29
+ Job,
30
+ JobProvisioningData,
31
+ JobRuntimeData,
32
+ JobSpec,
33
+ JobStatus,
34
+ JobTerminationReason,
35
+ ProbeSpec,
36
+ Run,
37
+ RunSpec,
38
+ RunStatus,
39
+ )
40
+ from dstack._internal.core.models.volumes import InstanceMountPoint, Volume, VolumeMountPoint
41
+ from dstack._internal.server.background.tasks.common import get_provisioning_timeout
42
+ from dstack._internal.server.db import get_db, get_session_ctx
43
+ from dstack._internal.server.models import (
44
+ FleetModel,
45
+ InstanceModel,
46
+ JobModel,
47
+ ProbeModel,
48
+ ProjectModel,
49
+ RepoModel,
50
+ RunModel,
51
+ UserModel,
52
+ )
53
+ from dstack._internal.server.schemas.runner import GPUDevice, TaskStatus
54
+ from dstack._internal.server.services import events, services
55
+ from dstack._internal.server.services import files as files_services
56
+ from dstack._internal.server.services import logs as logs_services
57
+ from dstack._internal.server.services.instances import get_instance_ssh_private_keys
58
+ from dstack._internal.server.services.jobs import (
59
+ find_job,
60
+ get_job_attached_volumes,
61
+ get_job_runtime_data,
62
+ is_master_job,
63
+ job_model_to_job_submission,
64
+ switch_job_status,
65
+ )
66
+ from dstack._internal.server.services.locking import get_locker
67
+ from dstack._internal.server.services.logging import fmt
68
+ from dstack._internal.server.services.metrics import get_job_metrics
69
+ from dstack._internal.server.services.repos import (
70
+ get_code_model,
71
+ get_repo_creds,
72
+ repo_model_to_repo_head_with_creds,
73
+ )
74
+ from dstack._internal.server.services.runner import client
75
+ from dstack._internal.server.services.runner.ssh import runner_ssh_tunnel
76
+ from dstack._internal.server.services.runs import (
77
+ is_job_ready,
78
+ run_model_to_run,
79
+ )
80
+ from dstack._internal.server.services.secrets import get_project_secrets_mapping
81
+ from dstack._internal.server.services.storage import get_default_storage
82
+ from dstack._internal.server.utils import sentry_utils
83
+ from dstack._internal.utils import common as common_utils
84
+ from dstack._internal.utils.interpolator import InterpolatorError, VariablesInterpolator
85
+ from dstack._internal.utils.logging import get_logger
86
+
87
+ logger = get_logger(__name__)
88
+
89
+
90
+ MIN_PROCESSING_INTERVAL = timedelta(seconds=10)
91
+ # Minimum time before terminating active job in case of connectivity issues.
92
+ # Should be sufficient to survive most problems caused by
93
+ # the server network flickering and providers' glitches.
94
+ JOB_DISCONNECTED_RETRY_TIMEOUT = timedelta(minutes=2)
95
+
96
+
97
async def process_running_jobs(batch_size: int = 1):
    """
    Process up to `batch_size` active jobs concurrently.

    One `_process_next_running_job()` coroutine is created per batch slot and
    all of them are awaited together; each coroutine selects its own job.
    """
    await asyncio.gather(*(_process_next_running_job() for _ in range(batch_size)))
102
+
103
+
104
@sentry_utils.instrument_background_task
async def _process_next_running_job():
    """
    Pick the least recently processed active job and process it.

    A job is eligible when it is PROVISIONING, PULLING, or RUNNING, its run is not
    TERMINATING, it is not already in the in-memory lockset, and it was last processed
    more than `MIN_PROCESSING_INTERVAL` ago. Returns without doing anything if no job
    is eligible.
    """
    lock, lockset = get_locker(get_db().dialect_name).get_lockset(JobModel.__tablename__)
    async with get_session_ctx() as session:
        async with lock:
            # Build the statement under the lock so that the lockset snapshot and
            # the processing cutoff are taken right before the query is executed.
            active_statuses = [JobStatus.PROVISIONING, JobStatus.PULLING, JobStatus.RUNNING]
            stmt = (
                select(JobModel)
                .join(JobModel.run)
                .where(
                    JobModel.status.in_(active_statuses),
                    RunModel.status.not_in([RunStatus.TERMINATING]),
                    JobModel.id.not_in(lockset),
                    JobModel.last_processed_at
                    < common_utils.get_current_datetime() - MIN_PROCESSING_INTERVAL,
                )
                .options(load_only(JobModel.id))
                .order_by(JobModel.last_processed_at.asc())
                .limit(1)
                .with_for_update(
                    skip_locked=True,
                    key_share=True,
                    of=JobModel,
                )
            )
            result = await session.execute(stmt)
            job_model = result.unique().scalar()
            if job_model is None:
                return
            lockset.add(job_model.id)
        # Remember the id up front: it is needed to release the lockset entry
        # even if processing fails.
        job_model_id = job_model.id
        try:
            await _process_running_job(session=session, job_model=job_model)
        finally:
            lockset.difference_update([job_model_id])
139
+
140
+
141
async def _process_running_job(session: AsyncSession, job_model: JobModel):
    """
    Advance a single active job (PROVISIONING, PULLING, or RUNNING) by one step.

    The job and its run are re-fetched, then handling depends on the job status
    observed at the start of the call:

    - PROVISIONING: wait for the instance hostname and for other nodes of the replica,
      then submit the job to the shim (dockerized instances) or directly to the runner.
      Failures are tolerated until the provisioning timeout is exceeded.
    - PULLING: poll the shim and submit the job to the runner once the container is up.
    - RUNNING: pull state and logs from the runner.

    For PULLING and RUNNING, a failure with `termination_reason` set terminates the job;
    a failure without it is treated as a connectivity problem and terminates the job
    only after `JOB_DISCONNECTED_RETRY_TIMEOUT`.

    Every exit path updates `job_model.last_processed_at` and commits the session.

    Args:
        session: DB session used for all queries and for the final commit.
        job_model: The job to process; re-fetched with related models before use.
    """
    job_model = await _refetch_job_model(session, job_model)
    run_model = await _fetch_run_model(session, job_model.run_id)
    repo_model = run_model.repo
    project = run_model.project
    run = run_model_to_run(run_model, include_sensitive=True)
    job_submission = job_model_to_job_submission(job_model)
    job_provisioning_data = job_submission.job_provisioning_data
    if job_provisioning_data is None:
        logger.error("%s: job_provisioning_data of an active job is None", fmt(job_model))
        job_model.termination_reason = JobTerminationReason.TERMINATED_BY_SERVER
        job_model.termination_reason_message = (
            "Unexpected server error: job_provisioning_data of an active job is None"
        )
        switch_job_status(session, job_model, JobStatus.TERMINATING)
        job_model.last_processed_at = common_utils.get_current_datetime()
        await session.commit()
        return

    job = find_job(run.jobs, job_model.replica_num, job_model.job_num)

    # These are only needed (and only loaded) before the job is submitted to the runner.
    volumes = []
    secrets = {}
    cluster_info = None
    repo_creds = None

    initial_status = job_model.status
    if initial_status in [JobStatus.PROVISIONING, JobStatus.PULLING]:
        for other_job in run.jobs:
            if (
                other_job.job_spec.replica_num == job.job_spec.replica_num
                and other_job.job_submissions[-1].status == JobStatus.SUBMITTED
            ):
                logger.debug(
                    "%s: waiting for all jobs in the replica to be provisioned",
                    fmt(job_model),
                )
                job_model.last_processed_at = common_utils.get_current_datetime()
                await session.commit()
                return

        cluster_info = _get_cluster_info(
            jobs=run.jobs,
            replica_num=job.job_spec.replica_num,
            job_provisioning_data=job_provisioning_data,
            job_runtime_data=job_submission.job_runtime_data,
        )

        volumes = await get_job_attached_volumes(
            session=session,
            project=project,
            run_spec=run.run_spec,
            job_num=job.job_spec.job_num,
            job_provisioning_data=job_provisioning_data,
        )

        repo_creds_model = await get_repo_creds(
            session=session, repo=repo_model, user=run_model.user
        )
        repo_creds = repo_model_to_repo_head_with_creds(repo_model, repo_creds_model).repo_creds

        secrets = await get_project_secrets_mapping(session=session, project=project)
        try:
            _interpolate_secrets(secrets, job.job_spec)
        except InterpolatorError as e:
            job_model.termination_reason = JobTerminationReason.TERMINATED_BY_SERVER
            job_model.termination_reason_message = f"Secrets interpolation error: {e.args[0]}"
            switch_job_status(session, job_model, JobStatus.TERMINATING)
            job_model.last_processed_at = common_utils.get_current_datetime()
            await session.commit()
            return

    server_ssh_private_keys = get_instance_ssh_private_keys(
        common_utils.get_or_error(job_model.instance)
    )

    if initial_status == JobStatus.PROVISIONING:
        if job_provisioning_data.hostname is None:
            await _wait_for_instance_provisioning_data(session, job_model)
            job_model.last_processed_at = common_utils.get_current_datetime()
            await session.commit()
            return
        if _should_wait_for_other_nodes(run, job, job_model):
            job_model.last_processed_at = common_utils.get_current_datetime()
            await session.commit()
            return

        # fails are acceptable until timeout is exceeded
        if job_provisioning_data.dockerized:
            logger.debug(
                "%s: process provisioning job with shim, age=%s",
                fmt(job_model),
                job_submission.age,
            )
            ssh_user = job_provisioning_data.username
            assert run.run_spec.ssh_key_pub is not None
            user_ssh_key = run.run_spec.ssh_key_pub.strip()
            public_keys = [project.ssh_public_key.strip(), user_ssh_key]
            if job_provisioning_data.backend == BackendType.LOCAL:
                # No need to update ~/.ssh/authorized_keys when running shim locally
                user_ssh_key = ""
            success = await common_utils.run_async(
                _process_provisioning_with_shim,
                server_ssh_private_keys,
                job_provisioning_data,
                None,
                session,
                run,
                job_model,
                job_provisioning_data,
                volumes,
                job.job_spec.registry_auth,
                public_keys,
                ssh_user,
                user_ssh_key,
            )
        else:
            assert cluster_info is not None
            logger.debug(
                "%s: process provisioning job without shim, age=%s",
                fmt(job_model),
                job_submission.age,
            )
            # FIXME: downloading file archives and code here is a waste of time if
            # the runner is not ready yet
            file_archives = await _get_job_file_archives(
                session=session,
                archive_mappings=job.job_spec.file_archives,
                user=run_model.user,
            )
            code = await _get_job_code(
                session=session,
                project=project,
                repo=repo_model,
                code_hash=_get_repo_code_hash(run, job),
            )
            success = await common_utils.run_async(
                _submit_job_to_runner,
                server_ssh_private_keys,
                job_provisioning_data,
                None,
                session,
                run,
                job_model,
                job,
                cluster_info,
                code,
                file_archives,
                secrets,
                repo_creds,
                success_if_not_available=False,
            )

        if not success:
            # check timeout
            provisioning_timeout = get_provisioning_timeout(
                backend_type=job_provisioning_data.get_base_backend(),
                instance_type_name=job_provisioning_data.instance_type.name,
            )
            if job_submission.age > provisioning_timeout:
                job_model.termination_reason = JobTerminationReason.WAITING_RUNNER_LIMIT_EXCEEDED
                job_model.termination_reason_message = (
                    f"Runner did not become available within {provisioning_timeout.total_seconds()}s."
                    f" Job submission age: {job_submission.age.total_seconds()}s"
                )
                switch_job_status(session, job_model, JobStatus.TERMINATING)
                # instance will be emptied by process_terminating_jobs

    else:  # fails are not acceptable
        if initial_status == JobStatus.PULLING:
            assert cluster_info is not None
            logger.debug(
                "%s: process pulling job with shim, age=%s", fmt(job_model), job_submission.age
            )
            # FIXME: downloading file archives and code here is a waste of time if
            # the runner is not ready yet
            file_archives = await _get_job_file_archives(
                session=session,
                archive_mappings=job.job_spec.file_archives,
                user=run_model.user,
            )
            code = await _get_job_code(
                session=session,
                project=project,
                repo=repo_model,
                code_hash=_get_repo_code_hash(run, job),
            )
            success = await common_utils.run_async(
                _process_pulling_with_shim,
                server_ssh_private_keys,
                job_provisioning_data,
                None,
                session,
                run,
                job_model,
                job,
                cluster_info,
                code,
                file_archives,
                secrets,
                repo_creds,
                server_ssh_private_keys,
                job_provisioning_data,
            )
        else:
            logger.debug("%s: process running job, age=%s", fmt(job_model), job_submission.age)
            success = await common_utils.run_async(
                _process_running,
                server_ssh_private_keys,
                job_provisioning_data,
                job_submission.job_runtime_data,
                session,
                run_model,
                job_model,
            )

        if success:
            _reset_disconnected_at(session, job_model)
        else:
            if job_model.termination_reason:
                logger.warning(
                    "%s: failed due to %s, age=%s",
                    fmt(job_model),
                    job_model.termination_reason.value,
                    job_submission.age,
                )
                switch_job_status(session, job_model, JobStatus.TERMINATING)
                # job will be terminated and instance will be emptied by process_terminating_jobs
            else:
                # No job_model.termination_reason set means ssh connection failed
                _set_disconnected_at_now(session, job_model)
                if _should_terminate_job_due_to_disconnect(job_model):
                    if job_provisioning_data.instance_type.resources.spot:
                        job_model.termination_reason = (
                            JobTerminationReason.INTERRUPTED_BY_NO_CAPACITY
                        )
                    else:
                        job_model.termination_reason = JobTerminationReason.INSTANCE_UNREACHABLE
                    job_model.termination_reason_message = "Instance is unreachable"
                    switch_job_status(session, job_model, JobStatus.TERMINATING)
                else:
                    logger.warning(
                        "%s: is unreachable, waiting for the instance to become reachable again, age=%s",
                        fmt(job_model),
                        job_submission.age,
                    )

    # The job has just become RUNNING: create a fresh probe record per probe spec.
    if initial_status != job_model.status and job_model.status == JobStatus.RUNNING:
        job_model.probes = []
        for probe_num in range(len(job.job_spec.probes)):
            job_model.probes.append(
                ProbeModel(
                    name=f"{job_model.job_name}-{probe_num}",
                    probe_num=probe_num,
                    due=common_utils.get_current_datetime(),
                    success_streak=0,
                    active=True,
                )
            )

    if job_model.status == JobStatus.RUNNING:
        await _maybe_register_replica(session, run_model, run, job_model, job.job_spec.probes)
    # Re-check the status: replica registration may have switched the job to TERMINATING.
    if job_model.status == JobStatus.RUNNING:
        await _check_gpu_utilization(session, job_model, job)

    job_model.last_processed_at = common_utils.get_current_datetime()
    await session.commit()
408
+
409
+
410
async def _refetch_job_model(session: AsyncSession, job_model: JobModel) -> JobModel:
    """
    Reload the job by id, refreshing the already loaded object (`populate_existing`).

    The instance (with its project) and the probes (only `success_streak`) are
    loaded eagerly in the same query.
    """
    stmt = (
        select(JobModel)
        .where(JobModel.id == job_model.id)
        .options(
            joinedload(JobModel.instance).joinedload(InstanceModel.project),
            joinedload(JobModel.probes).load_only(ProbeModel.success_streak),
        )
        .execution_options(populate_existing=True)
    )
    result = await session.execute(stmt)
    return result.unique().scalar_one()
419
+
420
+
421
async def _fetch_run_model(session: AsyncSession, run_id: uuid.UUID) -> RunModel:
    """
    Load the run with its project, user, repo, and fleet (id and name only).

    `RunModel.jobs` is populated only with the latest submission of every
    (replica_num, job_num) pair of the run.
    """
    # Highest submission_num per job of this run.
    latest_sq = (
        select(
            JobModel.run_id.label("run_id"),
            JobModel.replica_num.label("replica_num"),
            JobModel.job_num.label("job_num"),
            func.max(JobModel.submission_num).label("max_submission_num"),
        )
        .where(JobModel.run_id == run_id)
        .group_by(JobModel.run_id, JobModel.replica_num, JobModel.job_num)
        .subquery()
    )
    latest_job = aliased(JobModel)
    is_latest_submission = and_(
        latest_job.run_id == latest_sq.c.run_id,
        latest_job.replica_num == latest_sq.c.replica_num,
        latest_job.job_num == latest_sq.c.job_num,
        latest_job.submission_num == latest_sq.c.max_submission_num,
    )
    stmt = (
        select(RunModel)
        .where(RunModel.id == run_id)
        .join(latest_job, latest_job.run_id == RunModel.id)
        .join(latest_sq, onclause=is_latest_submission)
        .options(
            joinedload(RunModel.project),
            joinedload(RunModel.user),
            joinedload(RunModel.repo),
            joinedload(RunModel.fleet).load_only(FleetModel.id, FleetModel.name),
            # Fill RunModel.jobs from the explicitly joined (filtered) alias.
            contains_eager(RunModel.jobs, alias=latest_job),
        )
    )
    result = await session.execute(stmt)
    return result.unique().scalar_one()
455
+
456
+
457
async def _wait_for_instance_provisioning_data(session: AsyncSession, job_model: JobModel):
    """
    Copy the instance's provisioning data to the job, or terminate the job.

    This function will be called until instance IP address appears
    in `job_model.instance.job_provisioning_data` or instance is terminated on timeout.

    If the job has no instance, or the instance has no provisioning data, an error is
    logged and the job is left unchanged. If the instance is already TERMINATED, the job
    is switched to TERMINATING with `WAITING_INSTANCE_LIMIT_EXCEEDED`.

    Args:
        session: DB session passed to `switch_job_status`.
        job_model: The job to update; `job_model.instance` must already be loaded.
    """
    if job_model.instance is None:
        logger.error(
            "%s: cannot update job_provisioning_data. job_model.instance is None.",
            fmt(job_model),
        )
        return
    if job_model.instance.job_provisioning_data is None:
        # Name the attribute that was actually checked (the instance's data, not the job's).
        logger.error(
            "%s: cannot update job_provisioning_data."
            " job_model.instance.job_provisioning_data is None.",
            fmt(job_model),
        )
        return

    if job_model.instance.status == InstanceStatus.TERMINATED:
        job_model.termination_reason = JobTerminationReason.WAITING_INSTANCE_LIMIT_EXCEEDED
        job_model.termination_reason_message = "Instance is terminated"
        switch_job_status(session, job_model, JobStatus.TERMINATING)
        return

    job_model.job_provisioning_data = job_model.instance.job_provisioning_data
482
+
483
+
484
def _should_wait_for_other_nodes(run: Run, job: Job, job_model: JobModel) -> bool:
    """
    Tell whether the job must hold off because of other jobs in the same replica.

    Returns True when:
    - another job of the replica is PROVISIONING with provisioning data but no hostname;
    - the job is not job 0, the startup order is MASTER_FIRST, and job 0 is not RUNNING;
    - the job is the master, the startup order is WORKERS_FIRST, and some other job
      of the replica is not RUNNING.
    """
    replica_num = job.job_spec.replica_num

    ip_pending = any(
        other.job_spec.replica_num == replica_num
        and other.job_submissions[-1].status == JobStatus.PROVISIONING
        and other.job_submissions[-1].job_provisioning_data is not None
        and other.job_submissions[-1].job_provisioning_data.hostname is None
        for other in run.jobs
    )
    if ip_pending:
        logger.debug(
            "%s: waiting for other job to have IP assigned",
            fmt(job_model),
        )
        return True

    master_job = find_job(run.jobs, replica_num, 0)
    if (
        job.job_spec.job_num != 0
        and run.run_spec.merged_profile.startup_order == StartupOrder.MASTER_FIRST
        and master_job.job_submissions[-1].status != JobStatus.RUNNING
    ):
        logger.debug(
            "%s: waiting for master job to become running",
            fmt(job_model),
        )
        return True

    if (
        is_master_job(job)
        and run.run_spec.merged_profile.startup_order == StartupOrder.WORKERS_FIRST
    ):
        worker_not_running = any(
            other.job_spec.replica_num == replica_num
            and other.job_spec.job_num != job.job_spec.job_num
            and other.job_submissions[-1].status != JobStatus.RUNNING
            for other in run.jobs
        )
        if worker_not_running:
            logger.debug(
                "%s: waiting for worker job to become running",
                fmt(job_model),
            )
            return True

    return False
524
+
525
+
526
@runner_ssh_tunnel(ports=[DSTACK_SHIM_HTTP_PORT], retries=1)
def _process_provisioning_with_shim(
    ports: Dict[int, int],
    session: AsyncSession,
    run: Run,
    job_model: JobModel,
    job_provisioning_data: JobProvisioningData,
    volumes: List[Volume],
    registry_auth: Optional[RegistryAuth],
    public_keys: List[str],
    ssh_user: str,
    ssh_key: str,
) -> bool:
    """
    Submit the job to the shim once the shim responds to a healthcheck.

    On success the job is switched to JobStatus.PULLING. Termination on timeout
    is not decided here.

    Returns:
        True if the job was submitted; False if the shim is not available yet or
        the legacy shim API refused the submission (the shim is then force-stopped
        so that a later attempt can succeed).
    """
    job_spec = JobSpec.__response__.parse_raw(job_model.job_spec_data)
    shim_client = client.ShimClient(port=ports[DSTACK_SHIM_HTTP_PORT])

    if shim_client.healthcheck() is None:
        logger.debug("%s: shim is not available yet", fmt(job_model))
        return False  # shim is not available yet

    if registry_auth is None:
        registry_username = ""
        registry_password = ""
    else:
        registry_username = registry_auth.username
        registry_password = registry_auth.password

    # Split configured mounts by kind; volume mounts are copied because
    # their names are rewritten below.
    volume_mounts: List[VolumeMountPoint] = []
    instance_mounts: List[InstanceMountPoint] = []
    for mount in run.run_spec.configuration.volumes:
        if isinstance(mount, VolumeMountPoint):
            volume_mounts.append(mount.copy())
        elif isinstance(mount, InstanceMountPoint):
            instance_mounts.append(mount)
        else:
            assert False, f"unexpected mount point: {mount!r}"

    # The run configuration may list several candidate volume names per mount.
    # Resolve each mount to the volume that was actually attached.
    for attached_volume, volume_mount in zip(volumes, volume_mounts):
        volume_mount.name = attached_volume.name

    backend = job_provisioning_data.backend
    instance_type_name = job_provisioning_data.instance_type.name
    instance_mounts.extend(_get_instance_specific_mounts(backend, instance_type_name))
    gpu_devices = _get_instance_specific_gpu_devices(backend, instance_type_name)

    container_user = "root"

    job_runtime_data = get_job_runtime_data(job_model)
    if job_runtime_data is None:
        # Older jobs may have been submitted before JobRuntimeData was introduced.
        gpu = None
        cpu = None
        memory = None
        network_mode = NetworkMode.HOST
    else:
        gpu = job_runtime_data.gpu
        cpu = job_runtime_data.cpu
        memory = job_runtime_data.memory
        network_mode = job_runtime_data.network_mode

    image_name = _patch_base_image_for_aws_efa(job_spec, job_provisioning_data)
    if shim_client.is_api_v2_supported():
        shim_client.submit_task(
            task_id=job_model.id,
            name=job_model.job_name,
            registry_username=registry_username,
            registry_password=registry_password,
            image_name=image_name,
            container_user=container_user,
            privileged=job_spec.privileged,
            gpu=gpu,
            cpu=cpu,
            memory=memory,
            shm_size=job_spec.requirements.resources.shm_size,
            network_mode=network_mode,
            volumes=volumes,
            volume_mounts=volume_mounts,
            instance_mounts=instance_mounts,
            gpu_devices=gpu_devices,
            host_ssh_user=ssh_user,
            host_ssh_keys=[ssh_key] if ssh_key else [],
            container_ssh_keys=public_keys,
            instance_id=job_provisioning_data.instance_id,
        )
    elif not shim_client.submit(
        username=registry_username,
        password=registry_password,
        image_name=image_name,
        privileged=job_spec.privileged,
        container_name=job_model.job_name,
        container_user=container_user,
        shm_size=job_spec.requirements.resources.shm_size,
        public_keys=public_keys,
        ssh_user=ssh_user,
        ssh_key=ssh_key,
        mounts=volume_mounts,
        volumes=volumes,
        instance_mounts=instance_mounts,
        instance_id=job_provisioning_data.instance_id,
    ):
        # This can happen when we lost connection to the runner (e.g., network issues), marked
        # the job as failed, released the instance (status=BUSY->IDLE, job_id={id}->None),
        # but the job container is in fact alive, running the previous job. As we force-stop
        # the container via shim API when cancelling the current job anyway (when either the
        # user aborts the submission process or the submission deadline is reached), it's safe
        # to kill the previous job container now, making the shim available
        # (state=running->pending) for the next try.
        logger.warning(
            "%s: failed to submit, shim is already running a job, stopping it now, retry later",
            fmt(job_model),
        )
        shim_client.stop(force=True)
        return False

    switch_job_status(session, job_model, JobStatus.PULLING)
    return True
658
+
659
+
660
@runner_ssh_tunnel(ports=[DSTACK_SHIM_HTTP_PORT])
def _process_pulling_with_shim(
    ports: Dict[int, int],
    session: AsyncSession,
    run: Run,
    job_model: JobModel,
    job: Job,
    cluster_info: ClusterInfo,
    code: bytes,
    file_archives: Iterable[tuple[uuid.UUID, bytes]],
    secrets: Dict[str, str],
    repo_credentials: Optional[RemoteRepoCreds],
    server_ssh_private_keys: tuple[str, Optional[str]],
    job_provisioning_data: JobProvisioningData,
) -> bool:
    """
    Poll the shim for the container state and hand the job to the runner when ready.

    Returns:
        True while the shim is still preparing the container (nothing else is done),
        False with `job_model.termination_reason` set if the shim reported a failure,
        otherwise the result of `_submit_job_to_runner()`.
    """
    shim_client = client.ShimClient(port=ports[DSTACK_SHIM_HTTP_PORT])
    job_runtime_data = None
    if shim_client.is_api_v2_supported():  # raises error if shim is down, causes retry
        task = shim_client.get_task(job_model.id)

        # A task that terminates before the job reaches the runner means an error occurred
        if task.status == TaskStatus.TERMINATED:
            logger.warning(
                "shim failed to execute job %s: %s (%s)",
                job_model.job_name,
                task.termination_reason,
                task.termination_message,
            )
            logger.debug("task status: %s", task.dict())
            job_model.termination_reason = JobTerminationReason(task.termination_reason.lower())
            job_model.termination_reason_message = task.termination_message
            return False

        if task.status != TaskStatus.RUNNING:
            return True

        job_runtime_data = get_job_runtime_data(job_model)
        # May be None for older jobs submitted before JobRuntimeData was introduced
        if job_runtime_data is not None:
            if task.ports is None:
                # port mapping is not yet available, waiting
                return True
            job_runtime_data.ports = {
                port_mapping.container: port_mapping.host for port_mapping in task.ports
            }
            job_model.job_runtime_data = job_runtime_data.json()
    else:
        shim_status = shim_client.pull()  # raises error if shim is down, causes retry
        shim_result = shim_status.result

        # If shim goes to pending before the job is submitted to runner, then an error occurred
        if shim_status.state == "pending" and shim_result is not None and shim_result.reason != "":
            logger.warning(
                "shim failed to execute job %s: %s (%s)",
                job_model.job_name,
                shim_result.reason,
                shim_result.reason_message,
            )
            logger.debug("shim status: %s", shim_status.dict())
            job_model.termination_reason = JobTerminationReason(shim_result.reason.lower())
            job_model.termination_reason_message = shim_result.reason_message
            return False

        if shim_status.state in ("pulling", "creating"):
            return True

    return _submit_job_to_runner(
        server_ssh_private_keys,
        job_provisioning_data,
        job_runtime_data,
        session=session,
        run=run,
        job_model=job_model,
        job=job,
        cluster_info=cluster_info,
        code=code,
        file_archives=file_archives,
        secrets=secrets,
        repo_credentials=repo_credentials,
        success_if_not_available=True,
    )
752
+
753
+
754
@runner_ssh_tunnel(ports=[DSTACK_RUNNER_HTTP_PORT])
def _process_running(
    ports: Dict[int, int],
    session: AsyncSession,
    run_model: RunModel,
    job_model: JobModel,
) -> bool:
    """
    Pull new state and logs from the runner and apply them to the job.

    The job is switched to JobStatus.TERMINATING if the latest runner state is
    DONE, FAILED, or TERMINATED. If the runner reported no state changes, the
    inactivity duration check is performed instead.

    Returns:
        True whenever the runner was reached and its response was processed.
    """
    runner_client = client.RunnerClient(port=ports[DSTACK_RUNNER_HTTP_PORT])
    timestamp = job_model.runner_timestamp if job_model.runner_timestamp is not None else 0
    resp = runner_client.pull(timestamp)  # raises error if runner is down, causes retry
    job_model.runner_timestamp = resp.last_updated
    # may raise LogStorageError, causing a retry
    logs_services.write_logs(
        project=run_model.project,
        run_name=run_model.run_name,
        job_submission_id=job_model.id,
        runner_logs=resp.runner_logs,
        job_logs=resp.job_logs,
    )

    if not resp.job_states:
        _terminate_if_inactivity_duration_exceeded(
            session, run_model, job_model, resp.no_connections_secs
        )
        return True

    # Only the most recent state event matters.
    last_event = resp.job_states[-1]
    last_status = last_event.state
    if last_status == JobStatus.DONE:
        job_model.termination_reason = JobTerminationReason.DONE_BY_RUNNER
        switch_job_status(session, job_model, JobStatus.TERMINATING)
    elif last_status in {JobStatus.FAILED, JobStatus.TERMINATED}:
        # Default reason, overridden below if the runner provided a specific one.
        job_model.termination_reason = JobTerminationReason.CONTAINER_EXITED_WITH_ERROR
        if last_event.termination_reason:
            job_model.termination_reason = JobTerminationReason(
                last_event.termination_reason.lower()
            )
        if last_event.termination_message:
            job_model.termination_reason_message = last_event.termination_message
        switch_job_status(session, job_model, JobStatus.TERMINATING)

    exit_status = last_event.exit_status
    if exit_status is not None:
        job_model.exit_status = exit_status
        if exit_status != 0:
            logger.info("%s: non-zero exit status %s", fmt(job_model), exit_status)
    return True
807
+
808
+
809
def _terminate_if_inactivity_duration_exceeded(
    session: AsyncSession,
    run_model: RunModel,
    job_model: JobModel,
    no_connections_secs: Optional[int],
) -> None:
    """
    Record the job's inactivity time and terminate it if the limit is reached.

    Applies only to dev environments whose `inactivity_duration` is an int;
    otherwise `job_model.inactivity_secs` is reset to None. The job is also
    terminated if the runner did not report `no_connections_secs` at all.
    """
    conf = RunSpec.__response__.parse_raw(run_model.run_spec).configuration
    inactivity_enabled = isinstance(conf, DevEnvironmentConfiguration) and isinstance(
        conf.inactivity_duration, int
    )
    if not inactivity_enabled:
        # reset in case inactivity_duration was disabled via in-place update
        job_model.inactivity_secs = None
        return

    logger.debug("%s: no SSH connections for %s seconds", fmt(job_model), no_connections_secs)
    job_model.inactivity_secs = no_connections_secs

    if no_connections_secs is None:
        # TODO(0.19 or earlier): make no_connections_secs required
        reason = JobTerminationReason.INTERRUPTED_BY_NO_CAPACITY
        message = (
            "The selected instance was created before dstack 0.18.41"
            " and does not support inactivity_duration"
        )
    elif no_connections_secs >= conf.inactivity_duration:
        # TODO(0.19 or earlier): set JobTerminationReason.INACTIVITY_DURATION_EXCEEDED
        reason = JobTerminationReason.TERMINATED_BY_SERVER
        message = (
            f"The job was inactive for {no_connections_secs} seconds,"
            f" exceeding the inactivity_duration of {conf.inactivity_duration} seconds"
        )
    else:
        # Still within the allowed inactivity window.
        return

    job_model.termination_reason = reason
    job_model.termination_reason_message = message
    switch_job_status(session, job_model, JobStatus.TERMINATING)
840
+
841
+
842
def _should_terminate_job_due_to_disconnect(job_model: JobModel) -> bool:
    """
    Tell whether the job has been disconnected for longer than
    `JOB_DISCONNECTED_RETRY_TIMEOUT`.

    Returns False if the job has no `disconnected_at` timestamp.
    """
    disconnected_at = job_model.disconnected_at
    if disconnected_at is None:
        return False
    deadline = disconnected_at + JOB_DISCONNECTED_RETRY_TIMEOUT
    return common_utils.get_current_datetime() > deadline
849
+
850
+
851
async def _maybe_register_replica(
    session: AsyncSession,
    run_model: RunModel,
    run: Run,
    job_model: JobModel,
    probe_specs: Iterable[ProbeSpec],
) -> None:
    """
    Register the replica represented by this job to receive service requests if it is ready.

    Nothing is done unless the run is a service, the job is job 0 of its replica,
    it is not registered yet, and `is_job_ready()` accepts its probes. If registration
    fails with a GatewayError, the job is switched to TERMINATING.
    """
    if run.run_spec.configuration.type != "service" or job_model.registered:
        return
    if job_model.job_num != 0:
        # only the first job in the replica receives service requests
        return
    if not is_job_ready(job_model.probes, probe_specs):
        return

    # Head-node SSH proxy parameters, if the instance is reached through one.
    ssh_head_proxy: Optional[SSHConnectionParams] = None
    ssh_head_proxy_private_key: Optional[str] = None
    instance = common_utils.get_or_error(job_model.instance)
    if instance.remote_connection_info is not None:
        rci: RemoteConnectionInfo = RemoteConnectionInfo.__response__.parse_raw(
            instance.remote_connection_info
        )
        if rci.ssh_proxy is not None:
            ssh_head_proxy = rci.ssh_proxy
            proxy_keys = common_utils.get_or_error(rci.ssh_proxy_keys)
            ssh_head_proxy_private_key = proxy_keys[0].private

    try:
        await services.register_replica(
            session,
            run_model.gateway_id,
            run,
            job_model,
            ssh_head_proxy,
            ssh_head_proxy_private_key,
        )
    except GatewayError as e:
        logger.warning(
            "%s: failed to register service replica: %s",
            fmt(job_model),
            e,
        )
        job_model.termination_reason = JobTerminationReason.GATEWAY_ERROR
        # Not including e.args[0] in the message to avoid exposing internal details
        job_model.termination_reason_message = "Failed to register service replica"
        switch_job_status(session, job_model, JobStatus.TERMINATING)
900
+
901
+
902
async def _check_gpu_utilization(session: AsyncSession, job_model: JobModel, job: Job) -> None:
    """
    Terminate the job when it violates the utilization policy from its job spec.

    Does nothing if the job spec has no utilization policy. Otherwise the job metrics
    collected after `now - policy.time_window` seconds are loaded, and the job is
    switched to `JobStatus.TERMINATING` if `_should_terminate_due_to_low_gpu_util`
    reports that utilization stayed below `policy.min_gpu_utilization`.
    """
    policy = job.job_spec.utilization_policy
    if policy is None:
        return

    window_start = common_utils.get_current_datetime() - timedelta(seconds=policy.time_window)
    job_metrics = await get_job_metrics(session, job_model, after=window_start)
    gpus_util_metrics: list[Metric] = [
        metric for metric in job_metrics.metrics if metric.name.startswith("gpu_util_percent_gpu")
    ]

    warmup_deadline = window_start + timedelta(minutes=1)
    if not gpus_util_metrics or gpus_util_metrics[0].timestamps[-1] > warmup_deadline:
        # Job has started recently, not enough points collected.
        # Assuming that the metrics collection interval is less than 1 minute.
        logger.debug("%s: GPU utilization check: not enough samples", fmt(job_model))
        return

    per_gpu_values = [m.values for m in gpus_util_metrics]
    if not _should_terminate_due_to_low_gpu_util(policy.min_gpu_utilization, per_gpu_values):
        logger.debug("%s: GPU utilization check: OK", fmt(job_model))
        return

    logger.debug("%s: GPU utilization check: terminating", fmt(job_model))
    # TODO(0.19 or earlier): set JobTerminationReason.TERMINATED_DUE_TO_UTILIZATION_POLICY
    job_model.termination_reason = JobTerminationReason.TERMINATED_BY_SERVER
    job_model.termination_reason_message = (
        f"The job GPU utilization below {policy.min_gpu_utilization}%"
        f" for {policy.time_window} seconds"
    )
    switch_job_status(session, job_model, JobStatus.TERMINATING)
930
+
931
+
932
def _should_terminate_due_to_low_gpu_util(min_util: int, gpus_util: Iterable[Iterable[int]]):
    """
    Tell whether at least one GPU stayed strictly below `min_util` in all of its samples.

    Args:
        min_util: utilization threshold; a sample equal to it does not count as low.
        gpus_util: one iterable of utilization samples per GPU.

    Returns:
        True if some GPU has every sample below `min_util`, False otherwise (including
        when `gpus_util` is empty). A GPU with no samples at all counts as low, because
        `all()` of an empty iterable is true.
    """
    return any(all(sample < min_util for sample in samples) for samples in gpus_util)
937
+
938
+
939
def _set_disconnected_at_now(session: AsyncSession, job_model: JobModel) -> None:
    """
    Mark the job as disconnected starting from the current time.

    If `job_model.disconnected_at` is already set, it is left untouched and no event is
    emitted. Otherwise it is set to the current datetime and a "Job became unreachable"
    event is emitted for the job.
    """
    if job_model.disconnected_at is not None:
        return
    job_model.disconnected_at = common_utils.get_current_datetime()
    events.emit(
        session,
        "Job became unreachable",
        actor=events.SystemActor(),
        targets=[events.Target.from_model(job_model)],
    )
948
+
949
+
950
def _reset_disconnected_at(session: AsyncSession, job_model: JobModel) -> None:
    """
    Clear the job's disconnection timestamp.

    If `job_model.disconnected_at` is not set, nothing changes and no event is emitted.
    Otherwise it is reset to None and a "Job became reachable" event is emitted for
    the job.
    """
    if job_model.disconnected_at is None:
        return
    job_model.disconnected_at = None
    events.emit(
        session,
        "Job became reachable",
        actor=events.SystemActor(),
        targets=[events.Target.from_model(job_model)],
    )
959
+
960
+
961
def _get_cluster_info(
    jobs: List[Job],
    replica_num: int,
    job_provisioning_data: JobProvisioningData,
    job_runtime_data: Optional[JobRuntimeData],
) -> ClusterInfo:
    """
    Build the `ClusterInfo` for the jobs of one replica.

    Args:
        jobs: jobs to pick from; only those whose spec has the given `replica_num` are used.
        replica_num: the replica to build the cluster info for.
        job_provisioning_data: provides the default GPU count (from its instance type).
        job_runtime_data: if set and it has an offer, the GPU count is taken from
            that offer's instance resources instead.

    Returns:
        `ClusterInfo` with the internal IP of the latest submission of each selected job
        (an empty string when the IP is missing), the first of those IPs as the master
        job IP, and the number of GPUs per job.

    Raises:
        IndexError: if no job in `jobs` has the given `replica_num`.
    """
    replica_job_ips = [
        common_utils.get_or_error(job.job_submissions[-1].job_provisioning_data).internal_ip or ""
        for job in jobs
        if job.job_spec.replica_num == replica_num
    ]

    gpu_count = len(job_provisioning_data.instance_type.resources.gpus)
    if job_runtime_data is not None and job_runtime_data.offer is not None:
        gpu_count = len(job_runtime_data.offer.instance.resources.gpus)

    return ClusterInfo(
        job_ips=replica_job_ips,
        master_job_ip=replica_job_ips[0],
        gpus_per_job=gpu_count,
    )
985
+
986
+
987
def _get_repo_code_hash(run: Run, job: Job) -> Optional[str]:
    """
    Return the repo code hash to use for the job.

    The hash from the job spec is preferred. If the job spec has none, the hash from the
    run spec is used, but only when the job's latest submission has the same
    `deployment_num` as the run. In every other case None is returned.
    """
    # TODO: drop this function when supporting jobs submitted before 0.19.17 is no longer relevant.
    job_hash = job.job_spec.repo_code_hash
    if job_hash is not None:
        return job_hash

    # The job spec does not have `repo_code_hash`, because it was submitted before 0.19.17.
    # Use `repo_code_hash` from the run, provided the job belongs to the run's deployment.
    run_hash = run.run_spec.repo_code_hash
    if run_hash is None:
        return None
    if job.job_submissions[-1].deployment_num != run.deployment_num:
        return None
    return run_hash
998
+
999
+
1000
async def _get_job_code(
    session: AsyncSession, project: ProjectModel, repo: RepoModel, code_hash: Optional[str]
) -> bytes:
    """
    Load the code blob identified by `code_hash` for the given repo.

    The blob stored on the code model is returned when present; otherwise the blob is
    fetched from the default storage using the project name, repo name and code hash.

    Returns:
        The code blob, or `b""` when `code_hash` is None, the code model is not found,
        no default storage is configured, or the storage returns nothing (the last
        case is also logged as an error).
    """
    empty = b""
    if code_hash is None:
        return empty

    code_model = await get_code_model(session=session, repo=repo, code_hash=code_hash)
    if code_model is None:
        return empty

    inline_blob = code_model.blob
    if inline_blob is not None:
        return inline_blob

    storage = get_default_storage()
    if storage is None:
        return empty

    stored_blob = await common_utils.run_async(
        storage.get_code, project.name, repo.name, code_hash
    )
    if stored_blob is not None:
        return stored_blob

    logger.error("Failed to get repo code hash %s from storage for repo %s", code_hash, repo.name)
    return empty
1025
+
1026
+
1027
async def _get_job_file_archives(
    session: AsyncSession,
    archive_mappings: Iterable[FileArchiveMapping],
    user: UserModel,
) -> list[tuple[uuid.UUID, bytes]]:
    """
    Load the blob of every file archive referenced by `archive_mappings`.

    Archives are fetched one at a time, in the order of `archive_mappings`, via
    `_get_job_file_archive`.

    Returns:
        A list of `(archive id, archive blob)` pairs, one per mapping.
    """
    return [
        (
            mapping.id,
            await _get_job_file_archive(session=session, archive_id=mapping.id, user=user),
        )
        for mapping in archive_mappings
    ]
1040
+
1041
+
1042
async def _get_job_file_archive(
    session: AsyncSession, archive_id: uuid.UUID, user: UserModel
) -> bytes:
    """
    Load the blob of a single file archive.

    The blob stored on the archive model is returned when present; otherwise the blob is
    fetched from the default storage using the archive's user id and blob hash.

    Returns:
        The archive blob, or `b""` when the archive model is not found, no default
        storage is configured, or the storage returns nothing (the last case is also
        logged as an error).
    """
    empty = b""
    archive_model = await files_services.get_archive_model(session, id=archive_id, user=user)
    if archive_model is None:
        return empty

    inline_blob = archive_model.blob
    if inline_blob is not None:
        return inline_blob

    storage = get_default_storage()
    if storage is None:
        return empty

    stored_blob = await common_utils.run_async(
        storage.get_archive, str(archive_model.user_id), archive_model.blob_hash
    )
    if stored_blob is not None:
        return stored_blob

    logger.error("Failed to get file archive %s from storage", archive_id)
    return empty
1062
+
1063
+
1064
@runner_ssh_tunnel(ports=[DSTACK_RUNNER_HTTP_PORT], retries=1)
def _submit_job_to_runner(
    ports: Dict[int, int],
    session: AsyncSession,
    run: Run,
    job_model: JobModel,
    job: Job,
    cluster_info: ClusterInfo,
    code: bytes,
    file_archives: Iterable[tuple[uuid.UUID, bytes]],
    secrets: Dict[str, str],
    repo_credentials: Optional[RemoteRepoCreds],
    success_if_not_available: bool,
) -> bool:
    """
    Submit the job to the runner and start it.

    The runner is reached on `ports[DSTACK_RUNNER_HTTP_PORT]`. If its healthcheck returns
    nothing, the job is left as is and `success_if_not_available` is returned. Otherwise
    the job spec is submitted, the file archives and the code are uploaded, the job is
    started, and the job is switched to `JobStatus.RUNNING`.

    Note that `secrets` is accepted but not forwarded: an empty dict is sent to the runner.

    Returns:
        True if the job was submitted and started, `success_if_not_available` if
        the runner did not answer the healthcheck.
    """
    logger.debug("%s: submitting job spec", fmt(job_model))
    clone_url = None if repo_credentials is None else repo_credentials.clone_url
    logger.debug("%s: repo clone URL is %s", fmt(job_model), clone_url)

    # Instance-level env comes from the remote connection info, when the instance has one.
    instance_env = None
    instance = job_model.instance
    if instance is not None and instance.remote_connection_info is not None:
        remote_info = RemoteConnectionInfo.__response__.parse_raw(instance.remote_connection_info)
        instance_env = remote_info.env

    runner_client = client.RunnerClient(port=ports[DSTACK_RUNNER_HTTP_PORT])
    if runner_client.healthcheck() is None:
        # runner is not available yet
        return success_if_not_available

    runner_client.submit_job(
        run=run,
        job=job,
        cluster_info=cluster_info,
        # Do not send all the secrets since interpolation is already done by the server.
        # TODO: Passing secrets may be necessary for filtering out secret values from logs.
        secrets={},
        repo_credentials=repo_credentials,
        instance_env=instance_env,
    )

    logger.debug("%s: uploading file archive(s)", fmt(job_model))
    for archive_id, archive_blob in file_archives:
        runner_client.upload_archive(archive_id, archive_blob)

    logger.debug("%s: uploading code", fmt(job_model))
    runner_client.upload_code(code)

    logger.debug("%s: starting job", fmt(job_model))
    runner_client.run_job()

    # do not log here, because the runner will send a new status
    switch_job_status(session, job_model, JobStatus.RUNNING)
    return True
1127
+
1128
+
1129
def _interpolate_secrets(secrets: Dict[str, str], job_spec: JobSpec):
    """
    Interpolate variables in the job spec in place, with `secrets` available under "secrets".

    Every value of `job_spec.env` is passed through
    `VariablesInterpolator.interpolate_or_error`. If `job_spec.registry_auth` is set, it
    is replaced with a new `RegistryAuth` whose username and password are interpolated
    the same way.
    """
    interpolator = VariablesInterpolator({"secrets": secrets})

    interpolated_env = {}
    for name, value in job_spec.env.items():
        interpolated_env[name] = interpolator.interpolate_or_error(value)
    job_spec.env = interpolated_env

    registry_auth = job_spec.registry_auth
    if registry_auth is None:
        return
    job_spec.registry_auth = RegistryAuth(
        username=interpolator.interpolate_or_error(registry_auth.username),
        password=interpolator.interpolate_or_error(registry_auth.password),
    )
1137
+
1138
+
1139
def _get_instance_specific_mounts(
    backend_type: BackendType, instance_type_name: str
) -> List[InstanceMountPoint]:
    """
    Return the extra instance mount points required by specific GCP instance types.

    Returns:
        Mount points for "a3-megagpu-8g", "a3-edgegpu-8g" and "a3-highgpu-8g" on the GCP
        backend; an empty list for any other backend or instance type. A new list is
        built on every call.
    """
    if backend_type != BackendType.GCP:
        return []

    # (path on the instance, path inside the job) pairs
    if instance_type_name == "a3-megagpu-8g":
        path_pairs = [
            ("/dev/aperture_devices", "/dev/aperture_devices"),
            ("/var/lib/tcpxo/lib64", "/var/lib/tcpxo/lib64"),
            ("/var/lib/fastrak/lib64", "/var/lib/fastrak/lib64"),
        ]
    elif instance_type_name in ["a3-edgegpu-8g", "a3-highgpu-8g"]:
        path_pairs = [
            ("/var/lib/nvidia/lib64", "/usr/local/nvidia/lib64"),
            ("/var/lib/nvidia/bin", "/usr/local/nvidia/bin"),
            ("/var/lib/tcpx/lib64", "/usr/local/tcpx/lib64"),
            ("/run/tcpx", "/run/tcpx"),
        ]
    else:
        path_pairs = []

    return [
        InstanceMountPoint(instance_path=instance_path, path=path)
        for instance_path, path in path_pairs
    ]
1178
+
1179
+
1180
def _get_instance_specific_gpu_devices(
    backend_type: BackendType, instance_type_name: str
) -> List[GPUDevice]:
    """
    Return the GPU device mappings required by specific GCP instance types.

    Returns:
        For "a3-edgegpu-8g" and "a3-highgpu-8g" on the GCP backend: `/dev/nvidia0` to
        `/dev/nvidia7`, then `/dev/nvidia-uvm` and `/dev/nvidiactl`, each mapped to
        the same path in the container. An empty list otherwise.
    """
    if backend_type != BackendType.GCP:
        return []
    if instance_type_name not in ["a3-edgegpu-8g", "a3-highgpu-8g"]:
        return []

    device_paths = [f"/dev/nvidia{i}" for i in range(8)]
    device_paths += ["/dev/nvidia-uvm", "/dev/nvidiactl"]
    return [
        GPUDevice(path_on_host=device_path, path_in_container=device_path)
        for device_path in device_paths
    ]
1199
+
1200
+
1201
def _patch_base_image_for_aws_efa(
    job_spec: JobSpec, job_provisioning_data: JobProvisioningData
) -> str:
    """
    Return the image name for the job, switching the dstack base image to its
    `-devel-efa-` variant on AWS instance types matching the EFA-enabled patterns.

    The image name from the job spec is returned unchanged unless all of these hold:
    the backend is AWS, the instance type name matches one of the EFA-enabled patterns,
    the image name starts with `{settings.DSTACK_BASE_IMAGE}:`, and it ends with
    `-base-ubuntu<version>` or `-devel-ubuntu<version>`, where `<version>` is
    `settings.DSTACK_BASE_IMAGE_UBUNTU_VERSION`. In that case the matched suffix is
    replaced with `-devel-efa-ubuntu<version>`.
    """
    image_name = job_spec.image_name

    if job_provisioning_data.backend != BackendType.AWS:
        return image_name

    instance_type = job_provisioning_data.instance_type.name
    efa_enabled_patterns = [
        # TODO: p6-b200 isn't supported yet in gpuhunt
        r"^p6-b200\.(48xlarge)$",
        r"^p5\.(4xlarge|48xlarge)$",
        r"^p5e\.(48xlarge)$",
        r"^p5en\.(48xlarge)$",
        r"^p4d\.(24xlarge)$",
        r"^p4de\.(24xlarge)$",
        r"^g6\.(8xlarge|12xlarge|16xlarge|24xlarge|48xlarge)$",
        r"^g6e\.(8xlarge|12xlarge|16xlarge|24xlarge|48xlarge)$",
        r"^gr6\.8xlarge$",
        r"^g5\.(8xlarge|12xlarge|16xlarge|24xlarge|48xlarge)$",
        r"^g4dn\.(8xlarge|12xlarge|16xlarge|metal)$",
        r"^p3dn\.(24xlarge)$",
    ]

    is_efa_enabled = any(re.match(pattern, instance_type) for pattern in efa_enabled_patterns)
    if not is_efa_enabled:
        return image_name

    if not image_name.startswith(f"{settings.DSTACK_BASE_IMAGE}:"):
        return image_name

    ubuntu_suffix = f"ubuntu{settings.DSTACK_BASE_IMAGE_UBUNTU_VERSION}"
    efa_suffix = f"-devel-efa-{ubuntu_suffix}"
    for replaceable_suffix in (f"-base-{ubuntu_suffix}", f"-devel-{ubuntu_suffix}"):
        if image_name.endswith(replaceable_suffix):
            # Cut exactly the matched suffix. A fixed character count would silently
            # corrupt the tag if the configured Ubuntu version string changed length.
            return image_name[: -len(replaceable_suffix)] + efa_suffix

    return image_name