dstack 0.0.9__py3-none-any.whl → 0.20.7__py3-none-any.whl

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (879)
  1. dstack/_internal/cli/commands/__init__.py +80 -0
  2. dstack/_internal/cli/commands/apply.py +100 -0
  3. dstack/_internal/cli/commands/attach.py +161 -0
  4. dstack/_internal/cli/commands/completion.py +22 -0
  5. dstack/_internal/cli/commands/delete.py +44 -0
  6. dstack/_internal/cli/commands/event.py +168 -0
  7. dstack/_internal/cli/commands/fleet.py +161 -0
  8. dstack/_internal/cli/commands/gateway.py +159 -0
  9. dstack/_internal/cli/commands/init.py +64 -0
  10. dstack/_internal/cli/commands/login.py +352 -0
  11. dstack/_internal/cli/commands/logs.py +62 -0
  12. dstack/_internal/cli/commands/metrics.py +153 -0
  13. dstack/_internal/cli/commands/offer.py +146 -0
  14. dstack/_internal/cli/commands/project.py +259 -0
  15. dstack/_internal/cli/commands/ps.py +81 -0
  16. dstack/_internal/cli/commands/run.py +69 -0
  17. dstack/_internal/cli/commands/secrets.py +92 -0
  18. dstack/_internal/cli/commands/server.py +96 -0
  19. dstack/_internal/cli/commands/stop.py +26 -0
  20. dstack/_internal/cli/commands/volume.py +117 -0
  21. dstack/_internal/cli/main.py +101 -0
  22. dstack/_internal/cli/models/gateways.py +16 -0
  23. dstack/_internal/cli/models/offers.py +47 -0
  24. dstack/_internal/cli/models/runs.py +16 -0
  25. dstack/_internal/cli/services/args.py +31 -0
  26. dstack/_internal/cli/services/completion.py +91 -0
  27. dstack/_internal/cli/services/configurators/__init__.py +86 -0
  28. dstack/_internal/cli/services/configurators/base.py +103 -0
  29. dstack/_internal/cli/services/configurators/fleet.py +475 -0
  30. dstack/_internal/cli/services/configurators/gateway.py +231 -0
  31. dstack/_internal/cli/services/configurators/run.py +882 -0
  32. dstack/_internal/cli/services/configurators/volume.py +222 -0
  33. dstack/_internal/cli/services/events.py +68 -0
  34. dstack/_internal/cli/services/profile.py +182 -0
  35. dstack/_internal/cli/services/repos.py +71 -0
  36. dstack/_internal/cli/services/resources.py +54 -0
  37. dstack/_internal/cli/utils/common.py +159 -0
  38. dstack/_internal/cli/utils/fleet.py +106 -0
  39. dstack/_internal/cli/utils/gateway.py +56 -0
  40. dstack/_internal/cli/utils/gpu.py +178 -0
  41. dstack/_internal/cli/utils/rich.py +156 -0
  42. dstack/_internal/cli/utils/run.py +517 -0
  43. dstack/_internal/cli/utils/secrets.py +25 -0
  44. dstack/_internal/cli/utils/updates.py +98 -0
  45. dstack/_internal/cli/utils/volume.py +58 -0
  46. dstack/_internal/compat.py +3 -0
  47. dstack/_internal/core/backends/amddevcloud/__init__.py +1 -0
  48. dstack/_internal/core/backends/amddevcloud/backend.py +16 -0
  49. dstack/_internal/core/backends/amddevcloud/compute.py +5 -0
  50. dstack/_internal/core/backends/amddevcloud/configurator.py +29 -0
  51. dstack/_internal/core/backends/aws/auth.py +30 -0
  52. dstack/_internal/core/backends/aws/backend.py +31 -0
  53. dstack/_internal/core/backends/aws/compute.py +1153 -0
  54. dstack/_internal/core/backends/aws/configurator.py +191 -0
  55. dstack/_internal/core/backends/aws/models.py +135 -0
  56. dstack/_internal/core/backends/aws/resources.py +700 -0
  57. dstack/_internal/core/backends/azure/auth.py +39 -0
  58. dstack/_internal/core/backends/azure/backend.py +21 -0
  59. dstack/_internal/core/backends/azure/compute.py +676 -0
  60. dstack/_internal/core/backends/azure/configurator.py +472 -0
  61. dstack/_internal/core/backends/azure/models.py +98 -0
  62. dstack/_internal/core/backends/azure/resources.py +116 -0
  63. dstack/_internal/core/backends/azure/utils.py +42 -0
  64. dstack/_internal/core/backends/base/backend.py +18 -0
  65. dstack/_internal/core/backends/base/compute.py +1101 -0
  66. dstack/_internal/core/backends/base/configurator.py +117 -0
  67. dstack/_internal/core/backends/base/models.py +24 -0
  68. dstack/_internal/core/backends/base/offers.py +232 -0
  69. dstack/_internal/core/backends/cloudrift/api_client.py +220 -0
  70. dstack/_internal/core/backends/cloudrift/backend.py +16 -0
  71. dstack/_internal/core/backends/cloudrift/compute.py +138 -0
  72. dstack/_internal/core/backends/cloudrift/configurator.py +72 -0
  73. dstack/_internal/core/backends/cloudrift/models.py +40 -0
  74. dstack/_internal/core/backends/configurators.py +181 -0
  75. dstack/_internal/core/backends/cudo/__init__.py +0 -0
  76. dstack/_internal/core/backends/cudo/api_client.py +111 -0
  77. dstack/_internal/core/backends/cudo/backend.py +16 -0
  78. dstack/_internal/core/backends/cudo/compute.py +174 -0
  79. dstack/_internal/core/backends/cudo/configurator.py +63 -0
  80. dstack/_internal/core/backends/cudo/models.py +37 -0
  81. dstack/_internal/core/backends/datacrunch/__init__.py +1 -0
  82. dstack/_internal/core/backends/datacrunch/backend.py +18 -0
  83. dstack/_internal/core/backends/datacrunch/compute.py +8 -0
  84. dstack/_internal/core/backends/datacrunch/configurator.py +17 -0
  85. dstack/_internal/core/backends/digitalocean/__init__.py +1 -0
  86. dstack/_internal/core/backends/digitalocean/backend.py +16 -0
  87. dstack/_internal/core/backends/digitalocean/compute.py +5 -0
  88. dstack/_internal/core/backends/digitalocean/configurator.py +31 -0
  89. dstack/_internal/core/backends/digitalocean_base/__init__.py +1 -0
  90. dstack/_internal/core/backends/digitalocean_base/api_client.py +104 -0
  91. dstack/_internal/core/backends/digitalocean_base/backend.py +5 -0
  92. dstack/_internal/core/backends/digitalocean_base/compute.py +174 -0
  93. dstack/_internal/core/backends/digitalocean_base/configurator.py +57 -0
  94. dstack/_internal/core/backends/digitalocean_base/models.py +43 -0
  95. dstack/_internal/core/backends/dstack/__init__.py +0 -0
  96. dstack/_internal/core/backends/dstack/models.py +26 -0
  97. dstack/_internal/core/backends/features.py +74 -0
  98. dstack/_internal/core/backends/gcp/__init__.py +0 -0
  99. dstack/_internal/core/backends/gcp/auth.py +57 -0
  100. dstack/_internal/core/backends/gcp/backend.py +17 -0
  101. dstack/_internal/core/backends/gcp/compute.py +1257 -0
  102. dstack/_internal/core/backends/gcp/configurator.py +206 -0
  103. dstack/_internal/core/backends/gcp/features/__init__.py +0 -0
  104. dstack/_internal/core/backends/gcp/features/tcpx.py +65 -0
  105. dstack/_internal/core/backends/gcp/models.py +160 -0
  106. dstack/_internal/core/backends/gcp/resources.py +585 -0
  107. dstack/_internal/core/backends/hotaisle/__init__.py +1 -0
  108. dstack/_internal/core/backends/hotaisle/api_client.py +101 -0
  109. dstack/_internal/core/backends/hotaisle/backend.py +16 -0
  110. dstack/_internal/core/backends/hotaisle/compute.py +188 -0
  111. dstack/_internal/core/backends/hotaisle/configurator.py +66 -0
  112. dstack/_internal/core/backends/hotaisle/models.py +45 -0
  113. dstack/_internal/core/backends/kubernetes/__init__.py +0 -0
  114. dstack/_internal/core/backends/kubernetes/backend.py +16 -0
  115. dstack/_internal/core/backends/kubernetes/compute.py +1077 -0
  116. dstack/_internal/core/backends/kubernetes/configurator.py +61 -0
  117. dstack/_internal/core/backends/kubernetes/models.py +71 -0
  118. dstack/_internal/core/backends/kubernetes/utils.py +81 -0
  119. dstack/_internal/core/backends/lambdalabs/__init__.py +0 -0
  120. dstack/_internal/core/backends/lambdalabs/api_client.py +87 -0
  121. dstack/_internal/core/backends/lambdalabs/backend.py +17 -0
  122. dstack/_internal/core/backends/lambdalabs/compute.py +233 -0
  123. dstack/_internal/core/backends/lambdalabs/configurator.py +65 -0
  124. dstack/_internal/core/backends/lambdalabs/models.py +37 -0
  125. dstack/_internal/core/backends/local/__init__.py +0 -0
  126. dstack/_internal/core/backends/local/backend.py +14 -0
  127. dstack/_internal/core/backends/local/compute.py +130 -0
  128. dstack/_internal/core/backends/models.py +158 -0
  129. dstack/_internal/core/backends/nebius/__init__.py +0 -0
  130. dstack/_internal/core/backends/nebius/backend.py +16 -0
  131. dstack/_internal/core/backends/nebius/compute.py +401 -0
  132. dstack/_internal/core/backends/nebius/configurator.py +98 -0
  133. dstack/_internal/core/backends/nebius/models.py +185 -0
  134. dstack/_internal/core/backends/nebius/resources.py +433 -0
  135. dstack/_internal/core/backends/oci/__init__.py +0 -0
  136. dstack/_internal/core/backends/oci/auth.py +21 -0
  137. dstack/_internal/core/backends/oci/backend.py +16 -0
  138. dstack/_internal/core/backends/oci/compute.py +209 -0
  139. dstack/_internal/core/backends/oci/configurator.py +156 -0
  140. dstack/_internal/core/backends/oci/exceptions.py +15 -0
  141. dstack/_internal/core/backends/oci/models.py +87 -0
  142. dstack/_internal/core/backends/oci/region.py +86 -0
  143. dstack/_internal/core/backends/oci/resources.py +836 -0
  144. dstack/_internal/core/backends/runpod/__init__.py +0 -0
  145. dstack/_internal/core/backends/runpod/api_client.py +627 -0
  146. dstack/_internal/core/backends/runpod/backend.py +16 -0
  147. dstack/_internal/core/backends/runpod/compute.py +444 -0
  148. dstack/_internal/core/backends/runpod/configurator.py +63 -0
  149. dstack/_internal/core/backends/runpod/models.py +54 -0
  150. dstack/_internal/core/backends/template/__init__.py +0 -0
  151. dstack/_internal/core/backends/template/backend.py.jinja +16 -0
  152. dstack/_internal/core/backends/template/compute.py.jinja +95 -0
  153. dstack/_internal/core/backends/template/configurator.py.jinja +69 -0
  154. dstack/_internal/core/backends/template/models.py.jinja +62 -0
  155. dstack/_internal/core/backends/tensordock/models.py +40 -0
  156. dstack/_internal/core/backends/vastai/__init__.py +0 -0
  157. dstack/_internal/core/backends/vastai/api_client.py +143 -0
  158. dstack/_internal/core/backends/vastai/backend.py +16 -0
  159. dstack/_internal/core/backends/vastai/compute.py +141 -0
  160. dstack/_internal/core/backends/vastai/configurator.py +69 -0
  161. dstack/_internal/core/backends/vastai/models.py +37 -0
  162. dstack/_internal/core/backends/verda/__init__.py +0 -0
  163. dstack/_internal/core/backends/verda/backend.py +16 -0
  164. dstack/_internal/core/backends/verda/compute.py +266 -0
  165. dstack/_internal/core/backends/verda/configurator.py +73 -0
  166. dstack/_internal/core/backends/verda/models.py +38 -0
  167. dstack/_internal/core/backends/vultr/__init__.py +0 -0
  168. dstack/_internal/core/backends/vultr/api_client.py +116 -0
  169. dstack/_internal/core/backends/vultr/backend.py +16 -0
  170. dstack/_internal/core/backends/vultr/compute.py +167 -0
  171. dstack/_internal/core/backends/vultr/configurator.py +71 -0
  172. dstack/_internal/core/backends/vultr/models.py +34 -0
  173. dstack/_internal/core/compatibility/__init__.py +0 -0
  174. dstack/_internal/core/compatibility/events.py +13 -0
  175. dstack/_internal/core/compatibility/fleets.py +58 -0
  176. dstack/_internal/core/compatibility/gateways.py +39 -0
  177. dstack/_internal/core/compatibility/gpus.py +13 -0
  178. dstack/_internal/core/compatibility/logs.py +14 -0
  179. dstack/_internal/core/compatibility/runs.py +86 -0
  180. dstack/_internal/core/compatibility/volumes.py +37 -0
  181. dstack/_internal/core/consts.py +8 -0
  182. dstack/_internal/core/errors.py +160 -0
  183. dstack/_internal/core/models/__init__.py +0 -0
  184. dstack/_internal/core/models/auth.py +28 -0
  185. dstack/_internal/core/models/backends/__init__.py +0 -0
  186. dstack/_internal/core/models/backends/base.py +48 -0
  187. dstack/_internal/core/models/common.py +143 -0
  188. dstack/_internal/core/models/compute_groups.py +39 -0
  189. dstack/_internal/core/models/config.py +28 -0
  190. dstack/_internal/core/models/configurations.py +1123 -0
  191. dstack/_internal/core/models/envs.py +149 -0
  192. dstack/_internal/core/models/events.py +98 -0
  193. dstack/_internal/core/models/files.py +67 -0
  194. dstack/_internal/core/models/fleets.py +437 -0
  195. dstack/_internal/core/models/gateways.py +146 -0
  196. dstack/_internal/core/models/gpus.py +45 -0
  197. dstack/_internal/core/models/health.py +28 -0
  198. dstack/_internal/core/models/instances.py +346 -0
  199. dstack/_internal/core/models/logs.py +27 -0
  200. dstack/_internal/core/models/metrics.py +14 -0
  201. dstack/_internal/core/models/placement.py +27 -0
  202. dstack/_internal/core/models/profiles.py +431 -0
  203. dstack/_internal/core/models/projects.py +46 -0
  204. dstack/_internal/core/models/repos/__init__.py +34 -0
  205. dstack/_internal/core/models/repos/base.py +36 -0
  206. dstack/_internal/core/models/repos/local.py +96 -0
  207. dstack/_internal/core/models/repos/remote.py +341 -0
  208. dstack/_internal/core/models/repos/virtual.py +85 -0
  209. dstack/_internal/core/models/resources.py +424 -0
  210. dstack/_internal/core/models/routers.py +24 -0
  211. dstack/_internal/core/models/runs.py +618 -0
  212. dstack/_internal/core/models/secrets.py +16 -0
  213. dstack/_internal/core/models/server.py +7 -0
  214. dstack/_internal/core/models/services.py +76 -0
  215. dstack/_internal/core/models/unix.py +53 -0
  216. dstack/_internal/core/models/users.py +60 -0
  217. dstack/_internal/core/models/volumes.py +221 -0
  218. dstack/_internal/core/services/__init__.py +16 -0
  219. dstack/_internal/core/services/api_client.py +15 -0
  220. dstack/_internal/core/services/configs/__init__.py +116 -0
  221. dstack/_internal/core/services/diff.py +71 -0
  222. dstack/_internal/core/services/logs.py +58 -0
  223. dstack/_internal/core/services/profiles.py +46 -0
  224. dstack/_internal/core/services/repos.py +236 -0
  225. dstack/_internal/core/services/ssh/__init__.py +27 -0
  226. dstack/_internal/core/services/ssh/attach.py +241 -0
  227. dstack/_internal/core/services/ssh/client.py +113 -0
  228. dstack/_internal/core/services/ssh/key_manager.py +53 -0
  229. dstack/_internal/core/services/ssh/ports.py +89 -0
  230. dstack/_internal/core/services/ssh/tunnel.py +337 -0
  231. dstack/_internal/proxy/__init__.py +8 -0
  232. dstack/_internal/proxy/gateway/__init__.py +0 -0
  233. dstack/_internal/proxy/gateway/app.py +89 -0
  234. dstack/_internal/proxy/gateway/auth.py +26 -0
  235. dstack/_internal/proxy/gateway/const.py +7 -0
  236. dstack/_internal/proxy/gateway/deps.py +73 -0
  237. dstack/_internal/proxy/gateway/main.py +17 -0
  238. dstack/_internal/proxy/gateway/models.py +23 -0
  239. dstack/_internal/proxy/gateway/repo/__init__.py +0 -0
  240. dstack/_internal/proxy/gateway/repo/repo.py +121 -0
  241. dstack/_internal/proxy/gateway/repo/state_v1.py +164 -0
  242. dstack/_internal/proxy/gateway/resources/nginx/00-log-format.conf +11 -0
  243. dstack/_internal/proxy/gateway/resources/nginx/entrypoint.jinja2 +27 -0
  244. dstack/_internal/proxy/gateway/resources/nginx/router_workers.jinja2 +23 -0
  245. dstack/_internal/proxy/gateway/resources/nginx/service.jinja2 +105 -0
  246. dstack/_internal/proxy/gateway/routers/__init__.py +0 -0
  247. dstack/_internal/proxy/gateway/routers/auth.py +10 -0
  248. dstack/_internal/proxy/gateway/routers/config.py +28 -0
  249. dstack/_internal/proxy/gateway/routers/registry.py +124 -0
  250. dstack/_internal/proxy/gateway/routers/stats.py +18 -0
  251. dstack/_internal/proxy/gateway/schemas/__init__.py +0 -0
  252. dstack/_internal/proxy/gateway/schemas/common.py +5 -0
  253. dstack/_internal/proxy/gateway/schemas/config.py +9 -0
  254. dstack/_internal/proxy/gateway/schemas/registry.py +63 -0
  255. dstack/_internal/proxy/gateway/schemas/stats.py +15 -0
  256. dstack/_internal/proxy/gateway/services/__init__.py +0 -0
  257. dstack/_internal/proxy/gateway/services/model_routers/__init__.py +18 -0
  258. dstack/_internal/proxy/gateway/services/model_routers/base.py +91 -0
  259. dstack/_internal/proxy/gateway/services/model_routers/sglang.py +269 -0
  260. dstack/_internal/proxy/gateway/services/nginx.py +455 -0
  261. dstack/_internal/proxy/gateway/services/registry.py +426 -0
  262. dstack/_internal/proxy/gateway/services/server_client.py +95 -0
  263. dstack/_internal/proxy/gateway/services/stats.py +170 -0
  264. dstack/_internal/proxy/gateway/testing/__init__.py +0 -0
  265. dstack/_internal/proxy/gateway/testing/common.py +13 -0
  266. dstack/_internal/proxy/lib/__init__.py +0 -0
  267. dstack/_internal/proxy/lib/auth.py +7 -0
  268. dstack/_internal/proxy/lib/deps.py +106 -0
  269. dstack/_internal/proxy/lib/errors.py +14 -0
  270. dstack/_internal/proxy/lib/models.py +112 -0
  271. dstack/_internal/proxy/lib/repo.py +27 -0
  272. dstack/_internal/proxy/lib/routers/__init__.py +0 -0
  273. dstack/_internal/proxy/lib/routers/model_proxy.py +102 -0
  274. dstack/_internal/proxy/lib/schemas/__init__.py +0 -0
  275. dstack/_internal/proxy/lib/schemas/model_proxy.py +77 -0
  276. dstack/_internal/proxy/lib/services/__init__.py +0 -0
  277. dstack/_internal/proxy/lib/services/model_proxy/__init__.py +0 -0
  278. dstack/_internal/proxy/lib/services/model_proxy/clients/__init__.py +0 -0
  279. dstack/_internal/proxy/lib/services/model_proxy/clients/base.py +18 -0
  280. dstack/_internal/proxy/lib/services/model_proxy/clients/openai.py +67 -0
  281. dstack/_internal/proxy/lib/services/model_proxy/clients/tgi.py +208 -0
  282. dstack/_internal/proxy/lib/services/model_proxy/model_proxy.py +23 -0
  283. dstack/_internal/proxy/lib/services/service_connection.py +160 -0
  284. dstack/_internal/proxy/lib/testing/__init__.py +0 -0
  285. dstack/_internal/proxy/lib/testing/auth.py +11 -0
  286. dstack/_internal/proxy/lib/testing/common.py +51 -0
  287. dstack/_internal/server/__init__.py +0 -0
  288. dstack/_internal/server/alembic.ini +100 -0
  289. dstack/_internal/server/app.py +432 -0
  290. dstack/_internal/server/background/__init__.py +142 -0
  291. dstack/_internal/server/background/tasks/__init__.py +0 -0
  292. dstack/_internal/server/background/tasks/common.py +24 -0
  293. dstack/_internal/server/background/tasks/process_compute_groups.py +167 -0
  294. dstack/_internal/server/background/tasks/process_events.py +17 -0
  295. dstack/_internal/server/background/tasks/process_fleets.py +289 -0
  296. dstack/_internal/server/background/tasks/process_gateways.py +188 -0
  297. dstack/_internal/server/background/tasks/process_idle_volumes.py +145 -0
  298. dstack/_internal/server/background/tasks/process_instances.py +1186 -0
  299. dstack/_internal/server/background/tasks/process_metrics.py +172 -0
  300. dstack/_internal/server/background/tasks/process_placement_groups.py +104 -0
  301. dstack/_internal/server/background/tasks/process_probes.py +164 -0
  302. dstack/_internal/server/background/tasks/process_prometheus_metrics.py +150 -0
  303. dstack/_internal/server/background/tasks/process_running_jobs.py +1238 -0
  304. dstack/_internal/server/background/tasks/process_runs.py +842 -0
  305. dstack/_internal/server/background/tasks/process_submitted_jobs.py +1106 -0
  306. dstack/_internal/server/background/tasks/process_terminating_jobs.py +108 -0
  307. dstack/_internal/server/background/tasks/process_volumes.py +129 -0
  308. dstack/_internal/server/compatibility/__init__.py +0 -0
  309. dstack/_internal/server/compatibility/common.py +20 -0
  310. dstack/_internal/server/compatibility/gpus.py +22 -0
  311. dstack/_internal/server/db.py +127 -0
  312. dstack/_internal/server/deps.py +19 -0
  313. dstack/_internal/server/main.py +4 -0
  314. dstack/_internal/server/migrations/__init__.py +0 -0
  315. dstack/_internal/server/migrations/env.py +112 -0
  316. dstack/_internal/server/migrations/script.py.mako +28 -0
  317. dstack/_internal/server/migrations/versions/006512f572b4_add_projects_original_name.py +38 -0
  318. dstack/_internal/server/migrations/versions/065588ec72b8_add_vultr_to_backendtype_enum.py +81 -0
  319. dstack/_internal/server/migrations/versions/06e977bc61c7_add_usermodel_deleted_and_original_name.py +45 -0
  320. dstack/_internal/server/migrations/versions/0e33559e16ed_update_instancestatus.py +64 -0
  321. dstack/_internal/server/migrations/versions/112753bc17dd_remove_nullable_fields.py +50 -0
  322. dstack/_internal/server/migrations/versions/1338b788b612_reverse_job_instance_relationship.py +71 -0
  323. dstack/_internal/server/migrations/versions/14f2cb002fc2_add_jobmodel_removed_flag.py +44 -0
  324. dstack/_internal/server/migrations/versions/1a48dfe44a40_rework_termination_handling.py +42 -0
  325. dstack/_internal/server/migrations/versions/1aa9638ad963_added_email_index.py +31 -0
  326. dstack/_internal/server/migrations/versions/1e3fb39ef74b_add_remote_connection_details.py +26 -0
  327. dstack/_internal/server/migrations/versions/1e76fb0dde87_add_jobmodel_inactivity_secs.py +32 -0
  328. dstack/_internal/server/migrations/versions/20166748b60c_add_jobmodel_disconnected_at.py +100 -0
  329. dstack/_internal/server/migrations/versions/22d74df9897e_add_events_and_event_targets.py +99 -0
  330. dstack/_internal/server/migrations/versions/23e01c56279a_make_blob_nullable.py +32 -0
  331. dstack/_internal/server/migrations/versions/2498ab323443_add_fleetmodel_consolidation_attempt_.py +44 -0
  332. dstack/_internal/server/migrations/versions/252d3743b641_.py +40 -0
  333. dstack/_internal/server/migrations/versions/25479f540245_add_probes.py +43 -0
  334. dstack/_internal/server/migrations/versions/27d3e55759fa_add_pools.py +152 -0
  335. dstack/_internal/server/migrations/versions/29826f417010_remove_instancemodel_retry_policy.py +34 -0
  336. dstack/_internal/server/migrations/versions/29c08c6a8cb3_.py +36 -0
  337. dstack/_internal/server/migrations/versions/35e90e1b0d3e_add_rolling_deployment_fields.py +42 -0
  338. dstack/_internal/server/migrations/versions/35f732ee4cf5_add_projectmodel_is_public.py +39 -0
  339. dstack/_internal/server/migrations/versions/3cf77fb8bcf1_store_repo_clone_url.py +85 -0
  340. dstack/_internal/server/migrations/versions/3d7f6c2ec000_add_jobmodel_registered.py +28 -0
  341. dstack/_internal/server/migrations/versions/3dbdce90d0e0_fix_code_uq_constraint.py +33 -0
  342. dstack/_internal/server/migrations/versions/48ad3ecbaea2_do_not_delete_projects_and_runs.py +46 -0
  343. dstack/_internal/server/migrations/versions/4ae1a5b0e7f1_add_run_list_index.py +34 -0
  344. dstack/_internal/server/migrations/versions/4b4319398164_introduce_runs_processing.py +144 -0
  345. dstack/_internal/server/migrations/versions/50dd7ea98639_index_status_columns.py +55 -0
  346. dstack/_internal/server/migrations/versions/51d45659d574_add_instancemodel_blocks_fields.py +43 -0
  347. dstack/_internal/server/migrations/versions/54a77e19c64c_add_manager_project_role.py +67 -0
  348. dstack/_internal/server/migrations/versions/555138b1f77f_change_instancemodel_for_asynchronous_.py +61 -0
  349. dstack/_internal/server/migrations/versions/58aa5162dcc3_add_gatewaymodel_configuration.py +32 -0
  350. dstack/_internal/server/migrations/versions/5ad8debc8fe6_fixes_for_psql.py +329 -0
  351. dstack/_internal/server/migrations/versions/5ec538b70e71_replace_instansestatus.py +31 -0
  352. dstack/_internal/server/migrations/versions/5f1707c525d2_add_filearchivemodel.py +39 -0
  353. dstack/_internal/server/migrations/versions/5fd659afca82_add_ix_instances_fleet_id.py +31 -0
  354. dstack/_internal/server/migrations/versions/60e444118b6d_add_jobprometheusmetrics.py +40 -0
  355. dstack/_internal/server/migrations/versions/63c3f19cb184_add_jobterminationreason_inactivity_.py +83 -0
  356. dstack/_internal/server/migrations/versions/644b8a114187_add_secretmodel.py +49 -0
  357. dstack/_internal/server/migrations/versions/686fb8341ea5_add_user_emails.py +32 -0
  358. dstack/_internal/server/migrations/versions/6c1a9d6530ee_add_jobmodel_exit_status.py +26 -0
  359. dstack/_internal/server/migrations/versions/706e0acc3a7d_add_runmodel_desired_replica_counts.py +26 -0
  360. dstack/_internal/server/migrations/versions/710e5b3fac8f_add_encryption.py +54 -0
  361. dstack/_internal/server/migrations/versions/728b1488b1b4_add_instance_health.py +50 -0
  362. dstack/_internal/server/migrations/versions/74a1f55209bd_store_enums_as_strings.py +484 -0
  363. dstack/_internal/server/migrations/versions/7b24b1c8eba7_add_instancemodel_last_processed_at.py +68 -0
  364. dstack/_internal/server/migrations/versions/7ba3b59d7ca6_add_runmodel_resubmission_attempt.py +35 -0
  365. dstack/_internal/server/migrations/versions/7bc2586e8b9e_make_instancemodel_pool_id_optional.py +36 -0
  366. dstack/_internal/server/migrations/versions/7d1ec2b920ac_add_computegroupmodel.py +91 -0
  367. dstack/_internal/server/migrations/versions/803c7e9ed85d_add_jobmodel_job_runtime_data.py +32 -0
  368. dstack/_internal/server/migrations/versions/82b32a135ea2_.py +58 -0
  369. dstack/_internal/server/migrations/versions/866ec1d67184_replace_retrypolicy_limit_with_.py +93 -0
  370. dstack/_internal/server/migrations/versions/903c91e24634_add_instances_termination_reason_message.py +34 -0
  371. dstack/_internal/server/migrations/versions/91a12fff6c76_add_repocredsmodel.py +43 -0
  372. dstack/_internal/server/migrations/versions/91ac5e543037_extend_repos_creds_column.py +36 -0
  373. dstack/_internal/server/migrations/versions/98cd9c8b5927_add_volumemodel.py +73 -0
  374. dstack/_internal/server/migrations/versions/98d1b92988bc_add_jobterminationreason_terminated_due_.py +140 -0
  375. dstack/_internal/server/migrations/versions/99b4c8c954ea_add_termination_reason_message.py +71 -0
  376. dstack/_internal/server/migrations/versions/9eea6af28e10_added_fail_reason_for_instancemodel.py +36 -0
  377. dstack/_internal/server/migrations/versions/__init__.py +0 -0
  378. dstack/_internal/server/migrations/versions/a060e2440936_.py +206 -0
  379. dstack/_internal/server/migrations/versions/a751ef183f27_move_attachment_data_to_volumes_.py +34 -0
  380. dstack/_internal/server/migrations/versions/a7b46c073fa1_add_placementgroupmodel.py +58 -0
  381. dstack/_internal/server/migrations/versions/afbc600ff2b2_add_created_at_to_usermodel_and_.py +102 -0
  382. dstack/_internal/server/migrations/versions/b4d6ad60db08_add_instancemodel_unreachable.py +37 -0
  383. dstack/_internal/server/migrations/versions/b88d55c2a07d_replace_instancestatus_ready.py +21 -0
  384. dstack/_internal/server/migrations/versions/bc8ca4a505c6_store_backendtype_as_string.py +171 -0
  385. dstack/_internal/server/migrations/versions/bca2fdf130bf_add_runmodel_priority.py +34 -0
  386. dstack/_internal/server/migrations/versions/bfba43f6def2_.py +32 -0
  387. dstack/_internal/server/migrations/versions/c00090eaef21_support_fleets.py +108 -0
  388. dstack/_internal/server/migrations/versions/c154eece89da_add_fields_for_async_gateway_creation.py +74 -0
  389. dstack/_internal/server/migrations/versions/c20626d03cfb_add_jobmetricspoint.py +43 -0
  390. dstack/_internal/server/migrations/versions/c48df7985d57_add_instance_termination_retries.py +38 -0
  391. dstack/_internal/server/migrations/versions/c83d45f9a971_replace_string_with_text.py +150 -0
  392. dstack/_internal/server/migrations/versions/d0bb68e48b9f_add_project_owners_and_quotas.py +106 -0
  393. dstack/_internal/server/migrations/versions/d3e8af4786fa_gateway_compute_flag_deleted.py +34 -0
  394. dstack/_internal/server/migrations/versions/d4d9dc26cf58_add_ix_jobs_run_id.py +31 -0
  395. dstack/_internal/server/migrations/versions/d5863798bf41_add_volumemodel_last_job_processed_at.py +40 -0
  396. dstack/_internal/server/migrations/versions/d6b11105f659_add_usermodel_active.py +36 -0
  397. dstack/_internal/server/migrations/versions/da574e93fee0_add_jobmodel_volumes_detached_at.py +40 -0
  398. dstack/_internal/server/migrations/versions/dfffd6a1165c_add_fields_for_gateways_behind_alb.py +36 -0
  399. dstack/_internal/server/migrations/versions/e2d08cd1b8d9_add_jobmodel_fleet.py +41 -0
  400. dstack/_internal/server/migrations/versions/e3b7db07727f_add_gatewaycomputemodel_app_updated_at.py +61 -0
  401. dstack/_internal/server/migrations/versions/e6391ca6a264_separate_gateways_from_compute.py +72 -0
  402. dstack/_internal/server/migrations/versions/ea60480f82bb_add_membermodel_member_num.py +32 -0
  403. dstack/_internal/server/migrations/versions/ec02a26a256c_add_runmodel_next_triggered_at.py +38 -0
  404. dstack/_internal/server/migrations/versions/ed0ca30e13bb_migrate_instancestatus_provisioning.py +29 -0
  405. dstack/_internal/server/migrations/versions/fe72c4de8376_add_gateways.py +81 -0
  406. dstack/_internal/server/migrations/versions/ff1d94f65b08_user_ssh_key.py +34 -0
  407. dstack/_internal/server/migrations/versions/ffa99edd1988_add_jobterminationreason_max_duration_.py +81 -0
  408. dstack/_internal/server/models.py +930 -0
  409. dstack/_internal/server/routers/__init__.py +0 -0
  410. dstack/_internal/server/routers/auth.py +34 -0
  411. dstack/_internal/server/routers/backends.py +142 -0
  412. dstack/_internal/server/routers/events.py +60 -0
  413. dstack/_internal/server/routers/files.py +68 -0
  414. dstack/_internal/server/routers/fleets.py +202 -0
  415. dstack/_internal/server/routers/gateways.py +109 -0
  416. dstack/_internal/server/routers/gpus.py +32 -0
  417. dstack/_internal/server/routers/instances.py +77 -0
  418. dstack/_internal/server/routers/logs.py +34 -0
  419. dstack/_internal/server/routers/metrics.py +82 -0
  420. dstack/_internal/server/routers/projects.py +205 -0
  421. dstack/_internal/server/routers/prometheus.py +35 -0
  422. dstack/_internal/server/routers/repos.py +118 -0
  423. dstack/_internal/server/routers/runs.py +216 -0
  424. dstack/_internal/server/routers/secrets.py +86 -0
  425. dstack/_internal/server/routers/server.py +19 -0
  426. dstack/_internal/server/routers/users.py +158 -0
  427. dstack/_internal/server/routers/volumes.py +122 -0
  428. dstack/_internal/server/schemas/__init__.py +0 -0
  429. dstack/_internal/server/schemas/auth.py +83 -0
  430. dstack/_internal/server/schemas/backends.py +16 -0
  431. dstack/_internal/server/schemas/common.py +9 -0
  432. dstack/_internal/server/schemas/events.py +211 -0
  433. dstack/_internal/server/schemas/files.py +5 -0
  434. dstack/_internal/server/schemas/fleets.py +49 -0
  435. dstack/_internal/server/schemas/gateways.py +31 -0
  436. dstack/_internal/server/schemas/gpus.py +26 -0
  437. dstack/_internal/server/schemas/health/__init__.py +0 -0
  438. dstack/_internal/server/schemas/health/dcgm.py +56 -0
  439. dstack/_internal/server/schemas/instances.py +47 -0
  440. dstack/_internal/server/schemas/logs.py +17 -0
  441. dstack/_internal/server/schemas/projects.py +81 -0
  442. dstack/_internal/server/schemas/repos.py +24 -0
  443. dstack/_internal/server/schemas/runner.py +269 -0
  444. dstack/_internal/server/schemas/runs.py +66 -0
  445. dstack/_internal/server/schemas/secrets.py +16 -0
  446. dstack/_internal/server/schemas/users.py +72 -0
  447. dstack/_internal/server/schemas/volumes.py +29 -0
  448. dstack/_internal/server/security/__init__.py +0 -0
  449. dstack/_internal/server/security/permissions.py +251 -0
  450. dstack/_internal/server/services/__init__.py +0 -0
  451. dstack/_internal/server/services/auth.py +77 -0
  452. dstack/_internal/server/services/backends/__init__.py +404 -0
  453. dstack/_internal/server/services/backends/handlers.py +105 -0
  454. dstack/_internal/server/services/compute_groups.py +22 -0
  455. dstack/_internal/server/services/config.py +279 -0
  456. dstack/_internal/server/services/docker.py +162 -0
  457. dstack/_internal/server/services/encryption/__init__.py +102 -0
  458. dstack/_internal/server/services/encryption/keys/__init__.py +0 -0
  459. dstack/_internal/server/services/encryption/keys/aes.py +68 -0
  460. dstack/_internal/server/services/encryption/keys/base.py +19 -0
  461. dstack/_internal/server/services/encryption/keys/identity.py +28 -0
  462. dstack/_internal/server/services/events.py +477 -0
  463. dstack/_internal/server/services/files.py +91 -0
  464. dstack/_internal/server/services/fleets.py +1224 -0
  465. dstack/_internal/server/services/gateways/__init__.py +686 -0
  466. dstack/_internal/server/services/gateways/client.py +209 -0
  467. dstack/_internal/server/services/gateways/connection.py +139 -0
  468. dstack/_internal/server/services/gateways/pool.py +58 -0
  469. dstack/_internal/server/services/gpus.py +387 -0
  470. dstack/_internal/server/services/instances.py +731 -0
  471. dstack/_internal/server/services/jobs/__init__.py +840 -0
  472. dstack/_internal/server/services/jobs/configurators/__init__.py +0 -0
  473. dstack/_internal/server/services/jobs/configurators/base.py +469 -0
  474. dstack/_internal/server/services/jobs/configurators/dev.py +69 -0
  475. dstack/_internal/server/services/jobs/configurators/extensions/__init__.py +0 -0
  476. dstack/_internal/server/services/jobs/configurators/extensions/base.py +15 -0
  477. dstack/_internal/server/services/jobs/configurators/extensions/cursor.py +42 -0
  478. dstack/_internal/server/services/jobs/configurators/extensions/vscode.py +42 -0
  479. dstack/_internal/server/services/jobs/configurators/extensions/windsurf.py +43 -0
  480. dstack/_internal/server/services/jobs/configurators/service.py +28 -0
  481. dstack/_internal/server/services/jobs/configurators/task.py +39 -0
  482. dstack/_internal/server/services/locking.py +187 -0
  483. dstack/_internal/server/services/logging.py +29 -0
  484. dstack/_internal/server/services/logs/__init__.py +122 -0
  485. dstack/_internal/server/services/logs/aws.py +373 -0
  486. dstack/_internal/server/services/logs/base.py +47 -0
  487. dstack/_internal/server/services/logs/filelog.py +261 -0
  488. dstack/_internal/server/services/logs/fluentbit.py +329 -0
  489. dstack/_internal/server/services/logs/gcp.py +181 -0
  490. dstack/_internal/server/services/metrics.py +172 -0
  491. dstack/_internal/server/services/offers.py +249 -0
  492. dstack/_internal/server/services/permissions.py +37 -0
  493. dstack/_internal/server/services/placement.py +234 -0
  494. dstack/_internal/server/services/plugins.py +109 -0
  495. dstack/_internal/server/services/probes.py +10 -0
  496. dstack/_internal/server/services/projects.py +835 -0
  497. dstack/_internal/server/services/prometheus/__init__.py +0 -0
  498. dstack/_internal/server/services/prometheus/client_metrics.py +55 -0
  499. dstack/_internal/server/services/prometheus/custom_metrics.py +327 -0
  500. dstack/_internal/server/services/proxy/__init__.py +3 -0
  501. dstack/_internal/server/services/proxy/auth.py +12 -0
  502. dstack/_internal/server/services/proxy/deps.py +18 -0
  503. dstack/_internal/server/services/proxy/repo.py +189 -0
  504. dstack/_internal/server/services/proxy/routers/__init__.py +0 -0
  505. dstack/_internal/server/services/proxy/routers/service_proxy.py +49 -0
  506. dstack/_internal/server/services/proxy/services/__init__.py +0 -0
  507. dstack/_internal/server/services/proxy/services/service_proxy.py +135 -0
  508. dstack/_internal/server/services/repos.py +362 -0
  509. dstack/_internal/server/services/requirements/__init__.py +0 -0
  510. dstack/_internal/server/services/requirements/combine.py +260 -0
  511. dstack/_internal/server/services/resources.py +21 -0
  512. dstack/_internal/server/services/runner/__init__.py +0 -0
  513. dstack/_internal/server/services/runner/client.py +646 -0
  514. dstack/_internal/server/services/runner/ssh.py +128 -0
  515. dstack/_internal/server/services/runs/__init__.py +1026 -0
  516. dstack/_internal/server/services/runs/plan.py +703 -0
  517. dstack/_internal/server/services/runs/replicas.py +317 -0
  518. dstack/_internal/server/services/runs/spec.py +191 -0
  519. dstack/_internal/server/services/secrets.py +245 -0
  520. dstack/_internal/server/services/services/__init__.py +345 -0
  521. dstack/_internal/server/services/services/autoscalers.py +140 -0
  522. dstack/_internal/server/services/services/options.py +53 -0
  523. dstack/_internal/server/services/ssh.py +67 -0
  524. dstack/_internal/server/services/storage/__init__.py +37 -0
  525. dstack/_internal/server/services/storage/base.py +48 -0
  526. dstack/_internal/server/services/storage/gcs.py +66 -0
  527. dstack/_internal/server/services/storage/s3.py +69 -0
  528. dstack/_internal/server/services/users.py +461 -0
  529. dstack/_internal/server/services/volumes.py +496 -0
  530. dstack/_internal/server/settings.py +161 -0
  531. dstack/_internal/server/statics/00a6e1fb461ed2929fb9.png +0 -0
  532. dstack/_internal/server/statics/0cae4d9f0a36034984a7.png +0 -0
  533. dstack/_internal/server/statics/391de232cc0e30cae513.png +0 -0
  534. dstack/_internal/server/statics/4e0eead8c1a73689ef9d.svg +1 -0
  535. dstack/_internal/server/statics/544afa2f63428c2235b0.png +0 -0
  536. dstack/_internal/server/statics/54a4f50f74c6b9381530.svg +7 -0
  537. dstack/_internal/server/statics/68dd1360a7d2611e0132.svg +4 -0
  538. dstack/_internal/server/statics/69544b4c81973b54a66f.png +0 -0
  539. dstack/_internal/server/statics/77a8b02b17af19e39266.png +0 -0
  540. dstack/_internal/server/statics/83a93a8871c219104367.svg +9 -0
  541. dstack/_internal/server/statics/8f28bb8e9999e5e6a48b.svg +4 -0
  542. dstack/_internal/server/statics/9124086961ab8c366bc4.svg +9 -0
  543. dstack/_internal/server/statics/9a9ebaeb54b025dbac0a.svg +5 -0
  544. dstack/_internal/server/statics/a3428392dc534f3b15c4.svg +7 -0
  545. dstack/_internal/server/statics/ae22625574d69361f72c.png +0 -0
  546. dstack/_internal/server/statics/assets/android-chrome-144x144.png +0 -0
  547. dstack/_internal/server/statics/assets/android-chrome-192x192.png +0 -0
  548. dstack/_internal/server/statics/assets/android-chrome-256x256.png +0 -0
  549. dstack/_internal/server/statics/assets/android-chrome-36x36.png +0 -0
  550. dstack/_internal/server/statics/assets/android-chrome-384x384.png +0 -0
  551. dstack/_internal/server/statics/assets/android-chrome-48x48.png +0 -0
  552. dstack/_internal/server/statics/assets/android-chrome-512x512.png +0 -0
  553. dstack/_internal/server/statics/assets/android-chrome-72x72.png +0 -0
  554. dstack/_internal/server/statics/assets/android-chrome-96x96.png +0 -0
  555. dstack/_internal/server/statics/assets/apple-touch-icon-1024x1024.png +0 -0
  556. dstack/_internal/server/statics/assets/apple-touch-icon-114x114.png +0 -0
  557. dstack/_internal/server/statics/assets/apple-touch-icon-120x120.png +0 -0
  558. dstack/_internal/server/statics/assets/apple-touch-icon-144x144.png +0 -0
  559. dstack/_internal/server/statics/assets/apple-touch-icon-152x152.png +0 -0
  560. dstack/_internal/server/statics/assets/apple-touch-icon-167x167.png +0 -0
  561. dstack/_internal/server/statics/assets/apple-touch-icon-180x180.png +0 -0
  562. dstack/_internal/server/statics/assets/apple-touch-icon-57x57.png +0 -0
  563. dstack/_internal/server/statics/assets/apple-touch-icon-60x60.png +0 -0
  564. dstack/_internal/server/statics/assets/apple-touch-icon-72x72.png +0 -0
  565. dstack/_internal/server/statics/assets/apple-touch-icon-76x76.png +0 -0
  566. dstack/_internal/server/statics/assets/apple-touch-icon-precomposed.png +0 -0
  567. dstack/_internal/server/statics/assets/apple-touch-icon.png +0 -0
  568. dstack/_internal/server/statics/assets/apple-touch-startup-image-1125x2436.png +0 -0
  569. dstack/_internal/server/statics/assets/apple-touch-startup-image-1136x640.png +0 -0
  570. dstack/_internal/server/statics/assets/apple-touch-startup-image-1170x2532.png +0 -0
  571. dstack/_internal/server/statics/assets/apple-touch-startup-image-1179x2556.png +0 -0
  572. dstack/_internal/server/statics/assets/apple-touch-startup-image-1242x2208.png +0 -0
  573. dstack/_internal/server/statics/assets/apple-touch-startup-image-1242x2688.png +0 -0
  574. dstack/_internal/server/statics/assets/apple-touch-startup-image-1284x2778.png +0 -0
  575. dstack/_internal/server/statics/assets/apple-touch-startup-image-1290x2796.png +0 -0
  576. dstack/_internal/server/statics/assets/apple-touch-startup-image-1334x750.png +0 -0
  577. dstack/_internal/server/statics/assets/apple-touch-startup-image-1488x2266.png +0 -0
  578. dstack/_internal/server/statics/assets/apple-touch-startup-image-1536x2048.png +0 -0
  579. dstack/_internal/server/statics/assets/apple-touch-startup-image-1620x2160.png +0 -0
  580. dstack/_internal/server/statics/assets/apple-touch-startup-image-1640x2160.png +0 -0
  581. dstack/_internal/server/statics/assets/apple-touch-startup-image-1668x2224.png +0 -0
  582. dstack/_internal/server/statics/assets/apple-touch-startup-image-1668x2388.png +0 -0
  583. dstack/_internal/server/statics/assets/apple-touch-startup-image-1792x828.png +0 -0
  584. dstack/_internal/server/statics/assets/apple-touch-startup-image-2048x1536.png +0 -0
  585. dstack/_internal/server/statics/assets/apple-touch-startup-image-2048x2732.png +0 -0
  586. dstack/_internal/server/statics/assets/apple-touch-startup-image-2160x1620.png +0 -0
  587. dstack/_internal/server/statics/assets/apple-touch-startup-image-2160x1640.png +0 -0
  588. dstack/_internal/server/statics/assets/apple-touch-startup-image-2208x1242.png +0 -0
  589. dstack/_internal/server/statics/assets/apple-touch-startup-image-2224x1668.png +0 -0
  590. dstack/_internal/server/statics/assets/apple-touch-startup-image-2266x1488.png +0 -0
  591. dstack/_internal/server/statics/assets/apple-touch-startup-image-2388x1668.png +0 -0
  592. dstack/_internal/server/statics/assets/apple-touch-startup-image-2436x1125.png +0 -0
  593. dstack/_internal/server/statics/assets/apple-touch-startup-image-2532x1170.png +0 -0
  594. dstack/_internal/server/statics/assets/apple-touch-startup-image-2556x1179.png +0 -0
  595. dstack/_internal/server/statics/assets/apple-touch-startup-image-2688x1242.png +0 -0
  596. dstack/_internal/server/statics/assets/apple-touch-startup-image-2732x2048.png +0 -0
  597. dstack/_internal/server/statics/assets/apple-touch-startup-image-2778x1284.png +0 -0
  598. dstack/_internal/server/statics/assets/apple-touch-startup-image-2796x1290.png +0 -0
  599. dstack/_internal/server/statics/assets/apple-touch-startup-image-640x1136.png +0 -0
  600. dstack/_internal/server/statics/assets/apple-touch-startup-image-750x1334.png +0 -0
  601. dstack/_internal/server/statics/assets/apple-touch-startup-image-828x1792.png +0 -0
  602. dstack/_internal/server/statics/assets/browserconfig.xml +12 -0
  603. dstack/_internal/server/statics/assets/favicon-16x16.png +0 -0
  604. dstack/_internal/server/statics/assets/favicon-32x32.png +0 -0
  605. dstack/_internal/server/statics/assets/favicon-48x48.png +0 -0
  606. dstack/_internal/server/statics/assets/favicon.ico +0 -0
  607. dstack/{dashboard/statics/assets/manifest.json → _internal/server/statics/assets/manifest.webmanifest} +18 -9
  608. dstack/_internal/server/statics/assets/mstile-144x144.png +0 -0
  609. dstack/_internal/server/statics/assets/mstile-150x150.png +0 -0
  610. dstack/_internal/server/statics/assets/mstile-310x150.png +0 -0
  611. dstack/_internal/server/statics/assets/mstile-310x310.png +0 -0
  612. dstack/_internal/server/statics/assets/mstile-70x70.png +0 -0
  613. dstack/_internal/server/statics/assets/yandex-browser-50x50.png +0 -0
  614. dstack/_internal/server/statics/b7ae68f44193474fc578.png +0 -0
  615. dstack/_internal/server/statics/d2f008c75b2b5b191f3f.png +0 -0
  616. dstack/_internal/server/statics/d44c33e1b92e05c379fd.png +0 -0
  617. dstack/_internal/server/statics/dd43ff0552815179d7ab.png +0 -0
  618. dstack/_internal/server/statics/dd4e7166c0b9aac197d7.png +0 -0
  619. dstack/_internal/server/statics/e30b27916930d43d2271.png +0 -0
  620. dstack/_internal/server/statics/e467d7d60aae81ab198b.svg +6 -0
  621. dstack/_internal/server/statics/eb9b344b73818fe2b71a.png +0 -0
  622. dstack/_internal/server/statics/f517dd626eb964120de0.png +0 -0
  623. dstack/_internal/server/statics/f958aecddee5d8e3222c.png +0 -0
  624. dstack/_internal/server/statics/index.html +3 -0
  625. dstack/_internal/server/statics/logo-notext.svg +116 -0
  626. dstack/_internal/server/statics/main-2e6967bad9f29395eea6.css +3 -0
  627. dstack/_internal/server/statics/main-7dc0f6d20b8b41659acc.js +155547 -0
  628. dstack/_internal/server/statics/main-7dc0f6d20b8b41659acc.js.map +1 -0
  629. dstack/{dashboard → _internal/server}/statics/manifest.json +2 -2
  630. dstack/_internal/server/statics/static/media/entraID.d65d1f3e9486a8e56d24fc07b3230885.svg +9 -0
  631. dstack/_internal/server/statics/static/media/google.b194b06fafd0a52aeb566922160ea514.svg +1 -0
  632. dstack/{dashboard/statics/static/media/logo.f9d7170678f68f796e270698633770ec.svg → _internal/server/statics/static/media/logo.f602feeb138844eda97c8cb641461448.svg} +8 -6
  633. dstack/_internal/server/statics/static/media/okta.12f178e6873a1100965f2a4dbd18fcec.svg +2 -0
  634. dstack/_internal/server/statics/static/media/theme.3994c817bb7dda191c1c9640dee0bf42.svg +3 -0
  635. dstack/_internal/server/testing/__init__.py +0 -0
  636. dstack/_internal/server/testing/common.py +1220 -0
  637. dstack/_internal/server/testing/conf.py +53 -0
  638. dstack/_internal/server/testing/matchers.py +31 -0
  639. dstack/_internal/server/utils/__init__.py +0 -0
  640. dstack/_internal/server/utils/common.py +55 -0
  641. dstack/_internal/server/utils/logging.py +51 -0
  642. dstack/_internal/server/utils/provisioning.py +368 -0
  643. dstack/_internal/server/utils/routers.py +166 -0
  644. dstack/_internal/server/utils/sentry_utils.py +24 -0
  645. dstack/_internal/settings.py +49 -0
  646. dstack/_internal/utils/__init__.py +0 -0
  647. dstack/_internal/utils/common.py +318 -0
  648. dstack/_internal/utils/cron.py +5 -0
  649. dstack/_internal/utils/crypto.py +40 -0
  650. dstack/_internal/utils/env.py +88 -0
  651. dstack/_internal/utils/event_loop.py +30 -0
  652. dstack/_internal/utils/files.py +69 -0
  653. dstack/_internal/utils/gpu.py +59 -0
  654. dstack/_internal/utils/hash.py +31 -0
  655. dstack/_internal/utils/interpolator.py +91 -0
  656. dstack/_internal/utils/json_schema.py +11 -0
  657. dstack/_internal/utils/json_utils.py +54 -0
  658. dstack/_internal/utils/logging.py +5 -0
  659. dstack/_internal/utils/nested_list.py +47 -0
  660. dstack/_internal/utils/network.py +50 -0
  661. dstack/_internal/utils/path.py +57 -0
  662. dstack/_internal/utils/random_names.py +258 -0
  663. dstack/_internal/utils/ssh.py +346 -0
  664. dstack/_internal/utils/tags.py +42 -0
  665. dstack/_internal/utils/typing.py +14 -0
  666. dstack/_internal/utils/version.py +22 -0
  667. dstack/api/__init__.py +46 -0
  668. dstack/api/_public/__init__.py +96 -0
  669. dstack/api/_public/backends.py +42 -0
  670. dstack/api/_public/common.py +5 -0
  671. dstack/api/_public/repos.py +202 -0
  672. dstack/api/_public/runs.py +714 -0
  673. dstack/api/server/__init__.py +206 -0
  674. dstack/api/server/_auth.py +30 -0
  675. dstack/api/server/_backends.py +38 -0
  676. dstack/api/server/_events.py +64 -0
  677. dstack/api/server/_files.py +18 -0
  678. dstack/api/server/_fleets.py +82 -0
  679. dstack/api/server/_gateways.py +54 -0
  680. dstack/api/server/_gpus.py +27 -0
  681. dstack/api/server/_group.py +22 -0
  682. dstack/api/server/_logs.py +15 -0
  683. dstack/api/server/_metrics.py +23 -0
  684. dstack/api/server/_projects.py +124 -0
  685. dstack/api/server/_repos.py +64 -0
  686. dstack/api/server/_runs.py +102 -0
  687. dstack/api/server/_secrets.py +36 -0
  688. dstack/api/server/_users.py +82 -0
  689. dstack/api/server/_volumes.py +39 -0
  690. dstack/api/server/utils.py +34 -0
  691. dstack/api/utils.py +105 -0
  692. dstack/core/__init__.py +0 -0
  693. dstack/plugins/__init__.py +8 -0
  694. dstack/plugins/_base.py +72 -0
  695. dstack/plugins/_models.py +8 -0
  696. dstack/plugins/_utils.py +19 -0
  697. dstack/plugins/builtin/__init__.py +0 -0
  698. dstack/plugins/builtin/rest_plugin/__init__.py +18 -0
  699. dstack/plugins/builtin/rest_plugin/_models.py +48 -0
  700. dstack/plugins/builtin/rest_plugin/_plugin.py +147 -0
  701. dstack/version.py +3 -1
  702. dstack-0.20.7.dist-info/METADATA +519 -0
  703. dstack-0.20.7.dist-info/RECORD +720 -0
  704. {dstack-0.0.9.dist-info → dstack-0.20.7.dist-info}/WHEEL +1 -2
  705. dstack-0.20.7.dist-info/entry_points.txt +2 -0
  706. dstack-0.20.7.dist-info/licenses/LICENSE.md +353 -0
  707. dstack/aws/__init__.py +0 -180
  708. dstack/aws/artifacts.py +0 -111
  709. dstack/aws/config.py +0 -40
  710. dstack/aws/jobs.py +0 -245
  711. dstack/aws/logs.py +0 -186
  712. dstack/aws/repos.py +0 -137
  713. dstack/aws/run_names.py +0 -17
  714. dstack/aws/runners.py +0 -693
  715. dstack/aws/runs.py +0 -79
  716. dstack/aws/secrets.py +0 -99
  717. dstack/aws/tags.py +0 -138
  718. dstack/backend.py +0 -299
  719. dstack/cli/app.py +0 -41
  720. dstack/cli/artifacts.py +0 -87
  721. dstack/cli/common.py +0 -57
  722. dstack/cli/config.py +0 -194
  723. dstack/cli/dashboard.py +0 -26
  724. dstack/cli/delete.py +0 -49
  725. dstack/cli/init.py +0 -33
  726. dstack/cli/logs.py +0 -87
  727. dstack/cli/main.py +0 -81
  728. dstack/cli/restart.py +0 -43
  729. dstack/cli/run.py +0 -223
  730. dstack/cli/schema.py +0 -46
  731. dstack/cli/secrets.py +0 -97
  732. dstack/cli/status.py +0 -140
  733. dstack/cli/stop.py +0 -53
  734. dstack/cli/tags.py +0 -100
  735. dstack/config.py +0 -80
  736. dstack/dashboard/artifacts.py +0 -26
  737. dstack/dashboard/logs.py +0 -73
  738. dstack/dashboard/main.py +0 -45
  739. dstack/dashboard/repos.py +0 -41
  740. dstack/dashboard/runs.py +0 -140
  741. dstack/dashboard/secrets.py +0 -53
  742. dstack/dashboard/statics/4d6a4e032505c1efd23c.png +0 -0
  743. dstack/dashboard/statics/7e018c3e5566d7c349a8.png +0 -0
  744. dstack/dashboard/statics/assets/android-chrome-144x144.png +0 -0
  745. dstack/dashboard/statics/assets/android-chrome-192x192.png +0 -0
  746. dstack/dashboard/statics/assets/android-chrome-256x256.png +0 -0
  747. dstack/dashboard/statics/assets/android-chrome-36x36.png +0 -0
  748. dstack/dashboard/statics/assets/android-chrome-384x384.png +0 -0
  749. dstack/dashboard/statics/assets/android-chrome-48x48.png +0 -0
  750. dstack/dashboard/statics/assets/android-chrome-512x512.png +0 -0
  751. dstack/dashboard/statics/assets/android-chrome-72x72.png +0 -0
  752. dstack/dashboard/statics/assets/android-chrome-96x96.png +0 -0
  753. dstack/dashboard/statics/assets/apple-touch-icon-1024x1024.png +0 -0
  754. dstack/dashboard/statics/assets/apple-touch-icon-114x114.png +0 -0
  755. dstack/dashboard/statics/assets/apple-touch-icon-120x120.png +0 -0
  756. dstack/dashboard/statics/assets/apple-touch-icon-144x144.png +0 -0
  757. dstack/dashboard/statics/assets/apple-touch-icon-152x152.png +0 -0
  758. dstack/dashboard/statics/assets/apple-touch-icon-167x167.png +0 -0
  759. dstack/dashboard/statics/assets/apple-touch-icon-180x180.png +0 -0
  760. dstack/dashboard/statics/assets/apple-touch-icon-57x57.png +0 -0
  761. dstack/dashboard/statics/assets/apple-touch-icon-60x60.png +0 -0
  762. dstack/dashboard/statics/assets/apple-touch-icon-72x72.png +0 -0
  763. dstack/dashboard/statics/assets/apple-touch-icon-76x76.png +0 -0
  764. dstack/dashboard/statics/assets/apple-touch-icon-precomposed.png +0 -0
  765. dstack/dashboard/statics/assets/apple-touch-icon.png +0 -0
  766. dstack/dashboard/statics/assets/apple-touch-startup-image-1125x2436.png +0 -0
  767. dstack/dashboard/statics/assets/apple-touch-startup-image-1136x640.png +0 -0
  768. dstack/dashboard/statics/assets/apple-touch-startup-image-1242x2208.png +0 -0
  769. dstack/dashboard/statics/assets/apple-touch-startup-image-1242x2688.png +0 -0
  770. dstack/dashboard/statics/assets/apple-touch-startup-image-1334x750.png +0 -0
  771. dstack/dashboard/statics/assets/apple-touch-startup-image-1536x2048.png +0 -0
  772. dstack/dashboard/statics/assets/apple-touch-startup-image-1620x2160.png +0 -0
  773. dstack/dashboard/statics/assets/apple-touch-startup-image-1668x2224.png +0 -0
  774. dstack/dashboard/statics/assets/apple-touch-startup-image-1668x2388.png +0 -0
  775. dstack/dashboard/statics/assets/apple-touch-startup-image-1792x828.png +0 -0
  776. dstack/dashboard/statics/assets/apple-touch-startup-image-2048x1536.png +0 -0
  777. dstack/dashboard/statics/assets/apple-touch-startup-image-2048x2732.png +0 -0
  778. dstack/dashboard/statics/assets/apple-touch-startup-image-2160x1620.png +0 -0
  779. dstack/dashboard/statics/assets/apple-touch-startup-image-2208x1242.png +0 -0
  780. dstack/dashboard/statics/assets/apple-touch-startup-image-2224x1668.png +0 -0
  781. dstack/dashboard/statics/assets/apple-touch-startup-image-2388x1668.png +0 -0
  782. dstack/dashboard/statics/assets/apple-touch-startup-image-2436x1125.png +0 -0
  783. dstack/dashboard/statics/assets/apple-touch-startup-image-2688x1242.png +0 -0
  784. dstack/dashboard/statics/assets/apple-touch-startup-image-2732x2048.png +0 -0
  785. dstack/dashboard/statics/assets/apple-touch-startup-image-640x1136.png +0 -0
  786. dstack/dashboard/statics/assets/apple-touch-startup-image-750x1334.png +0 -0
  787. dstack/dashboard/statics/assets/apple-touch-startup-image-828x1792.png +0 -0
  788. dstack/dashboard/statics/assets/browserconfig.xml +0 -15
  789. dstack/dashboard/statics/assets/coast-228x228.png +0 -0
  790. dstack/dashboard/statics/assets/favicon-16x16.png +0 -0
  791. dstack/dashboard/statics/assets/favicon-32x32.png +0 -0
  792. dstack/dashboard/statics/assets/favicon-48x48.png +0 -0
  793. dstack/dashboard/statics/assets/favicon.ico +0 -0
  794. dstack/dashboard/statics/assets/firefox_app_128x128.png +0 -0
  795. dstack/dashboard/statics/assets/firefox_app_512x512.png +0 -0
  796. dstack/dashboard/statics/assets/firefox_app_60x60.png +0 -0
  797. dstack/dashboard/statics/assets/manifest.webapp +0 -14
  798. dstack/dashboard/statics/assets/mstile-144x144.png +0 -0
  799. dstack/dashboard/statics/assets/mstile-150x150.png +0 -0
  800. dstack/dashboard/statics/assets/mstile-310x150.png +0 -0
  801. dstack/dashboard/statics/assets/mstile-310x310.png +0 -0
  802. dstack/dashboard/statics/assets/mstile-70x70.png +0 -0
  803. dstack/dashboard/statics/assets/yandex-browser-50x50.png +0 -0
  804. dstack/dashboard/statics/d0f71e48806e25d72553.png +0 -0
  805. dstack/dashboard/statics/index.html +0 -7
  806. dstack/dashboard/statics/main-1d87e34eb0454da8ebb4.js +0 -3
  807. dstack/dashboard/statics/main-1d87e34eb0454da8ebb4.js.LICENSE.txt +0 -102
  808. dstack/dashboard/statics/main-1d87e34eb0454da8ebb4.js.map +0 -1
  809. dstack/dashboard/statics/main.css +0 -5058
  810. dstack/dashboard/statics/splash_thumbnail.png +0 -0
  811. dstack/dashboard/statics/static/media/check.3f68ffc787a15c0476793a6d18ecb71a.svg +0 -3
  812. dstack/dashboard/statics/static/media/chevron-down.bfd8f22c4a5db4d443e76bca3b02f334.svg +0 -3
  813. dstack/dashboard/statics/static/media/chevron-up.bade0c5d82d741cead615813264140c9.svg +0 -3
  814. dstack/dashboard/statics/static/media/clock.583b744f29b9d143718a55e7c35fe38e.svg +0 -3
  815. dstack/dashboard/statics/static/media/close.a8bb9e47361b03a3b5084dad676ba1da.svg +0 -3
  816. dstack/dashboard/statics/static/media/content-copy.73f5f2a175094757758e315243a4111e.svg +0 -3
  817. dstack/dashboard/statics/static/media/delete-outline.6a8abf4e4f9cb777781967efd56efe9b.svg +0 -3
  818. dstack/dashboard/statics/static/media/dots-vertical.82fc618192e0c7dc4d615ff93269246a.svg +0 -3
  819. dstack/dashboard/statics/static/media/earth.1ad57c7f59f4be5c8bb2fa00439c3149.svg +0 -3
  820. dstack/dashboard/statics/static/media/email.320bc3af24a5f1bb41ebd85f66a5dd70.svg +0 -3
  821. dstack/dashboard/statics/static/media/external-link.99b88e699c15afb820a1779d9a2261ed.svg +0 -3
  822. dstack/dashboard/statics/static/media/eye-off-outline.5b4afb7ad624a44dd307518ff93d1faa.svg +0 -3
  823. dstack/dashboard/statics/static/media/eye-outline.ca41708feaaed1edb15c5fff021fbafe.svg +0 -3
  824. dstack/dashboard/statics/static/media/file-download-outline.3634b41923ba79b297ff294ef898661c.svg +0 -3
  825. dstack/dashboard/statics/static/media/folder-outline.33378387af61821dd1207e4b2d061a07.svg +0 -3
  826. dstack/dashboard/statics/static/media/github-circle.1bb85d171c31a3c2eebad07319377171.svg +0 -3
  827. dstack/dashboard/statics/static/media/infinity.915f92939afc0a37f94adba211ceb172.svg +0 -3
  828. dstack/dashboard/statics/static/media/layers.b4b02cea267a617d7aa44c2719250c89.svg +0 -3
  829. dstack/dashboard/statics/static/media/linkedin.1c52fae553eee54397f0e63a79455a5e.svg +0 -3
  830. dstack/dashboard/statics/static/media/loading.e466be7b2c1f0ac9e7e51ca929d0e37d.svg +0 -3
  831. dstack/dashboard/statics/static/media/lock.4a4c7768d0fa60c716609ddc483470ef.svg +0 -3
  832. dstack/dashboard/statics/static/media/magnify.0c803314d039d21f3cb1504ccd1437a4.svg +0 -3
  833. dstack/dashboard/statics/static/media/mark.3f68ffc787a15c0476793a6d18ecb71a.svg +0 -3
  834. dstack/dashboard/statics/static/media/menu-close.3ee84714181017c6ff837830297c8437.svg +0 -3
  835. dstack/dashboard/statics/static/media/menu.922f81e0972fbcbb5adcd8def20c86a3.svg +0 -3
  836. dstack/dashboard/statics/static/media/pencil.f706a3b9dcbff4959a91bf72e1e6324f.svg +0 -3
  837. dstack/dashboard/statics/static/media/refresh.a80edb948e98b322cd73b67814a57a48.svg +0 -3
  838. dstack/dashboard/statics/static/media/shape-plus.63b093c7f4b44c3def774f30fcfbceca.svg +0 -3
  839. dstack/dashboard/statics/static/media/slack.ec2fca99c6b944950ac65404ddd26880.svg +0 -4
  840. dstack/dashboard/statics/static/media/small-logo.b9cc8d09f646a553e65fa336dafd8b10.svg +0 -116
  841. dstack/dashboard/statics/static/media/source-branch.b8d22cfc42a7bed81f0fc08130818e85.svg +0 -3
  842. dstack/dashboard/statics/static/media/source-commit.be2bb53c081b9b6836adffccc0b8d3e6.svg +0 -3
  843. dstack/dashboard/statics/static/media/stop.11488ff1437ad929476be8924a3b7075.svg +0 -3
  844. dstack/dashboard/statics/static/media/tag-minus.15680a815b0b8d027e973c84832c05e6.svg +0 -3
  845. dstack/dashboard/statics/static/media/tag-outline.19b0bf86a8afd7d6d9c716e9a91d94ca.svg +0 -3
  846. dstack/dashboard/statics/static/media/twitter.4af18861c84a2f3044c7546b55d5739c.svg +0 -3
  847. dstack/dashboard/tags.py +0 -119
  848. dstack/jobs.py +0 -255
  849. dstack/providers/__init__.py +0 -316
  850. dstack/providers/_python/main.py +0 -88
  851. dstack/providers/_tensorboard/main.py +0 -93
  852. dstack/providers/_torchrun/main.py +0 -121
  853. dstack/providers/bash/main.py +0 -90
  854. dstack/providers/code/main.py +0 -95
  855. dstack/providers/docker/main.py +0 -79
  856. dstack/providers/lab/main.py +0 -95
  857. dstack/providers/notebook/main.py +0 -90
  858. dstack/random_name.py +0 -29
  859. dstack/repo.py +0 -135
  860. dstack/runners.py +0 -35
  861. dstack/util.py +0 -15
  862. dstack-0.0.9.dist-info/METADATA +0 -176
  863. dstack-0.0.9.dist-info/RECORD +0 -179
  864. dstack-0.0.9.dist-info/entry_points.txt +0 -3
  865. dstack-0.0.9.dist-info/top_level.txt +0 -2
  866. tests/test_config.py +0 -70
  867. /dstack/{cli → _internal}/__init__.py +0 -0
  868. /dstack/{dashboard → _internal/cli}/__init__.py +0 -0
  869. /dstack/{providers/_python → _internal/cli/models}/__init__.py +0 -0
  870. /dstack/{providers/_tensorboard → _internal/cli/services}/__init__.py +0 -0
  871. /dstack/{providers/_torchrun → _internal/cli/utils}/__init__.py +0 -0
  872. /dstack/{providers/bash → _internal/core}/__init__.py +0 -0
  873. /dstack/{providers/code → _internal/core/backends}/__init__.py +0 -0
  874. /dstack/{providers/docker → _internal/core/backends/aws}/__init__.py +0 -0
  875. /dstack/{providers/lab → _internal/core/backends/azure}/__init__.py +0 -0
  876. /dstack/{providers/notebook → _internal/core/backends/base}/__init__.py +0 -0
  877. {tests → dstack/_internal/core/backends/cloudrift}/__init__.py +0 -0
  878. /dstack/{dashboard → _internal/server}/statics/assets/yandex-browser-manifest.json +0 -0
  879. /dstack/{dashboard → _internal/server}/statics/robots.txt +0 -0
@@ -0,0 +1,1257 @@
1
+ import concurrent.futures
2
+ import json
3
+ import re
4
+ from collections import defaultdict
5
+ from collections.abc import Iterable
6
+ from dataclasses import dataclass
7
+ from typing import Callable, Dict, List, Literal, Optional, Tuple
8
+
9
+ import google.api_core.exceptions
10
+ import google.cloud.compute_v1 as compute_v1
11
+ from cachetools import TTLCache, cachedmethod
12
+ from google.cloud import tpu_v2
13
+ from google.cloud.compute_v1.types.compute import Instance
14
+ from gpuhunt import KNOWN_TPUS
15
+
16
+ import dstack._internal.core.backends.gcp.auth as auth
17
+ import dstack._internal.core.backends.gcp.resources as gcp_resources
18
+ from dstack import version
19
+ from dstack._internal.core.backends.base.compute import (
20
+ Compute,
21
+ ComputeTTLCache,
22
+ ComputeWithAllOffersCached,
23
+ ComputeWithCreateInstanceSupport,
24
+ ComputeWithGatewaySupport,
25
+ ComputeWithMultinodeSupport,
26
+ ComputeWithPlacementGroupSupport,
27
+ ComputeWithPrivateGatewaySupport,
28
+ ComputeWithPrivilegedSupport,
29
+ ComputeWithReservationSupport,
30
+ ComputeWithVolumeSupport,
31
+ generate_unique_gateway_instance_name,
32
+ generate_unique_instance_name,
33
+ generate_unique_volume_name,
34
+ get_gateway_user_data,
35
+ get_shim_commands,
36
+ get_user_data,
37
+ merge_tags,
38
+ requires_nvidia_proprietary_kernel_modules,
39
+ )
40
+ from dstack._internal.core.backends.base.offers import (
41
+ OfferModifier,
42
+ get_catalog_offers,
43
+ get_offers_disk_modifier,
44
+ )
45
+ from dstack._internal.core.backends.gcp.features import tcpx as tcpx_features
46
+ from dstack._internal.core.backends.gcp.models import GCPConfig
47
+ from dstack._internal.core.consts import DSTACK_OS_IMAGE_WITH_PROPRIETARY_NVIDIA_KERNEL_MODULES
48
+ from dstack._internal.core.errors import (
49
+ ComputeError,
50
+ ComputeResourceNotFoundError,
51
+ NoCapacityError,
52
+ PlacementGroupInUseError,
53
+ ProvisioningError,
54
+ )
55
+ from dstack._internal.core.models.backends.base import BackendType
56
+ from dstack._internal.core.models.common import CoreModel
57
+ from dstack._internal.core.models.gateways import (
58
+ GatewayComputeConfiguration,
59
+ GatewayProvisioningData,
60
+ )
61
+ from dstack._internal.core.models.instances import (
62
+ InstanceAvailability,
63
+ InstanceConfiguration,
64
+ InstanceOffer,
65
+ InstanceOfferWithAvailability,
66
+ InstanceType,
67
+ Resources,
68
+ )
69
+ from dstack._internal.core.models.placement import PlacementGroup, PlacementGroupProvisioningData
70
+ from dstack._internal.core.models.resources import Memory, Range
71
+ from dstack._internal.core.models.runs import JobProvisioningData, Requirements
72
+ from dstack._internal.core.models.volumes import (
73
+ Volume,
74
+ VolumeAttachmentData,
75
+ VolumeProvisioningData,
76
+ )
77
+ from dstack._internal.utils.common import get_or_error
78
+ from dstack._internal.utils.logging import get_logger
79
+
80
# Module-level logger named after this module.
logger = get_logger(__name__)

# pd-balanced disks can be 10GB-64TB, but dstack images are 20GB and cannot grow larger
# than 32TB because of filesystem settings
CONFIGURABLE_DISK_SIZE = Range[Memory](min=Memory.parse("20GB"), max=Memory.parse("32TB"))
# Pattern from https://cloud.google.com/compute/docs/instances/reservations-consume#consuming_instances_from_a_specific_reservation
# Captures the named groups `project_id` and `reservation_name`.
RESERVATION_PATTERN = re.compile(
    r"projects/(?P<project_id>[a-z0-9-]+)/reservations/(?P<reservation_name>[a-z0-9-]+)"
)
# One or more lowercase letters, digits, or dashes.
RESOURCE_NAME_PATTERN = re.compile(r"[a-z0-9-]+")
# Names of every TPU listed in gpuhunt's KNOWN_TPUS.
TPU_VERSIONS = [tpu.name for tpu in KNOWN_TPUS]
# Machine type used by create_gateway() when the configuration sets no instance type.
DEFAULT_GATEWAY_INSTANCE_TYPE = "e2-medium"
92
+
93
+
94
# Backend-specific data attached to GCP offers.
class GCPOfferBackendData(CoreModel):
    # When True, get_offers_post_filter() drops the offer unless the
    # requirements name a reservation.
    is_dws_calendar_mode: bool = False
96
+
97
+
98
# Backend-specific data stored (as JSON) in VolumeProvisioningData.backend_data
# for volumes backed by a persistent disk.
class GCPVolumeDiskBackendData(CoreModel):
    # Fixed discriminator value identifying this payload as a disk.
    type: Literal["disk"] = "disk"
    # Short disk type name derived from the disk's full `type_` resource name.
    disk_type: str
101
+
102
+
103
+ class GCPCompute(
104
+ ComputeWithAllOffersCached,
105
+ ComputeWithCreateInstanceSupport,
106
+ ComputeWithPrivilegedSupport,
107
+ ComputeWithMultinodeSupport,
108
+ ComputeWithReservationSupport,
109
+ ComputeWithPlacementGroupSupport,
110
+ ComputeWithGatewaySupport,
111
+ ComputeWithPrivateGatewaySupport,
112
+ ComputeWithVolumeSupport,
113
+ Compute,
114
+ ):
115
def __init__(self, config: GCPConfig):
    """Authenticate with GCP and create the API clients and caches used by the backend.

    Args:
        config: GCP backend configuration (credentials, project id, etc.).
    """
    super().__init__()
    self.config = config
    credentials, _ = auth.authenticate(config.creds, config.project_id)
    self.credentials = credentials

    # All clients share the same credentials object.
    self.instances_client = compute_v1.InstancesClient(credentials=credentials)
    self.firewalls_client = compute_v1.FirewallsClient(credentials=credentials)
    self.regions_client = compute_v1.RegionsClient(credentials=credentials)
    self.subnetworks_client = compute_v1.SubnetworksClient(credentials=credentials)
    self.routers_client = compute_v1.RoutersClient(credentials=credentials)
    self.tpu_client = tpu_v2.TpuClient(credentials=credentials)
    self.disk_client = compute_v1.DisksClient(credentials=credentials)
    self.resource_policies_client = compute_v1.ResourcePoliciesClient(credentials=credentials)
    self.reservations_client = compute_v1.ReservationsClient(credentials=credentials)

    self._usable_subnets_cache = ComputeTTLCache(cache=TTLCache(maxsize=1, ttl=120))
    # Smaller TTL since we check the reservation's in_use_count, which can change often
    self._reservation_cache = ComputeTTLCache(cache=TTLCache(maxsize=8, ttl=20))
133
+
134
def get_all_offers_with_availability(self) -> List[InstanceOfferWithAvailability]:
    """Return catalog offers merged per (instance, region) with quota-based availability.

    Catalog offers are listed per zone. Offers for the same instance in the same
    region are collapsed into one offer whose `availability_zones` lists every
    zone, and whose `region` is the zone name with its last two characters removed.
    """
    catalog_offers = get_catalog_offers(
        backend=BackendType.GCP,
        extra_filter=_supported_instances_and_zones(get_or_error(self.config.regions)),
    )

    # Remaining quota (limit - usage) per region and metric.
    remaining_quota: Dict[str, Dict[str, float]] = defaultdict(dict)
    for gcp_region in self.regions_client.list(project=self.config.project_id):
        for quota in gcp_region.quotas:
            remaining_quota[gcp_region.name][quota.metric] = quota.limit - quota.usage

    merged_by_key = {}
    result = []
    for catalog_offer in catalog_offers:
        zone = catalog_offer.region
        region_name = zone[:-2]  # strip zone
        dedup_key = (_unique_instance_name(catalog_offer.instance), region_name)
        already_merged = merged_by_key.get(dedup_key)
        if already_merged is not None:
            already_merged.availability_zones.append(zone)
            continue
        # todo quotas: cpu, memory, global gpu, tpu
        if _has_gpu_quota(remaining_quota[region_name], catalog_offer.instance.resources):
            availability = InstanceAvailability.UNKNOWN
        else:
            availability = InstanceAvailability.NO_QUOTA
        merged = InstanceOfferWithAvailability(
            **catalog_offer.dict(),
            availability=availability,
            availability_zones=[zone],
        )
        merged_by_key[dedup_key] = merged
        result.append(merged)
        merged.region = region_name
    return result
166
+
167
def get_offers_modifiers(self, requirements: Requirements) -> Iterable[OfferModifier]:
    """Build the offer modifiers that apply to the given requirements.

    When a reservation is requested, a modifier is added that drops spot offers
    and offers with no zone matching the reservation, and narrows the remaining
    offers to the matching zones. The disk-size modifier is always added last.
    """
    modifiers = []

    if requirements.reservation:
        reservations_by_zone = self._find_reservation(requirements.reservation)

        def reservation_modifier(
            offer: InstanceOfferWithAvailability,
        ) -> Optional[InstanceOfferWithAvailability]:
            if offer.instance.resources.spot:
                return None
            assert offer.availability_zones is not None
            matching_zones, zones_with_capacity = [], []
            for zone in offer.availability_zones:
                reservation = reservations_by_zone.get(zone)
                if reservation is None or not _offer_matches_reservation(offer, reservation):
                    continue
                matching_zones.append(zone)
                if _reservation_has_capacity(reservation):
                    zones_with_capacity.append(zone)
            if not matching_zones:
                return None
            narrowed = offer.copy(deep=True)
            if zones_with_capacity:
                narrowed.availability_zones = zones_with_capacity
                return narrowed
            # The reservation matches but is fully used in every matching zone.
            narrowed.availability_zones = matching_zones
            narrowed.availability = InstanceAvailability.NOT_AVAILABLE
            return narrowed

        modifiers.append(reservation_modifier)

    modifiers.append(get_offers_disk_modifier(CONFIGURABLE_DISK_SIZE, requirements))
    return modifiers
201
+
202
def get_offers_post_filter(
    self, requirements: Requirements
) -> Optional[Callable[[InstanceOfferWithAvailability], bool]]:
    """Return a filter that hides reserved-only offers, or None if a reservation is requested."""
    if requirements.reservation is not None:
        return None

    def reserved_offers_filter(offer: InstanceOfferWithAvailability) -> bool:
        """Remove reserved-only offers"""
        backend_data = GCPOfferBackendData.__response__.parse_obj(offer.backend_data)
        return not backend_data.is_dws_calendar_mode

    return reserved_offers_filter
218
+
219
def terminate_instance(
    self, instance_id: str, region: str, backend_data: Optional[str] = None
) -> None:
    """Delete a VM or TPU node; a missing instance is silently ignored.

    Args:
        instance_id: Name of the VM instance or TPU node.
        region: Used as the zone when `backend_data` is not provided.
        backend_data: Optional JSON with a required "zone" key and an optional
            "is_tpu" flag.
    """
    # Old instances have region set to zone, e.g. us-central1-a.
    # New instance have region set to region, e.g. us-central1. Zone is stored in backend_data.
    if backend_data is None:
        zone, is_tpu = region, False
    else:
        parsed = json.loads(backend_data)
        zone = parsed["zone"]
        is_tpu = parsed.get("is_tpu", False)

    try:
        if not is_tpu:
            self.instances_client.delete(
                project=self.config.project_id,
                zone=zone,
                instance=instance_id,
            )
        else:
            node_name = f"projects/{self.config.project_id}/locations/{zone}/nodes/{instance_id}"
            self.tpu_client.delete_node(request=tpu_v2.DeleteNodeRequest(name=node_name))
    except google.api_core.exceptions.NotFound:
        pass
243
+
244
def create_instance(
    self,
    instance_offer: InstanceOfferWithAvailability,
    instance_config: InstanceConfiguration,
    placement_group: Optional[PlacementGroup],
) -> JobProvisioningData:
    """Request a VM or TPU node for the offer, trying each availability zone in turn.

    Args:
        instance_offer: Offer to provision; its `availability_zones` are tried in order.
        instance_config: Instance settings (name, keys, tags, volumes, reservation).
        placement_group: Optional placement group whose policy is applied to the VM.

    Returns:
        Provisioning data with `hostname` and `internal_ip` left as None; the zone
        that accepted the request is stored in `availability_zone` and `backend_data`.

    Raises:
        NoCapacityError: If there are no zones or every zone was skipped.
    """
    instance_name = generate_unique_instance_name(
        instance_config, max_length=gcp_resources.MAX_RESOURCE_NAME_LEN
    )
    allocate_public_ip = self.config.allocate_public_ips
    authorized_keys = instance_config.get_public_keys()

    # get_offers always fills instance_offer.availability_zones
    zones = get_or_error(instance_offer.availability_zones)
    if len(zones) == 0:
        raise NoCapacityError("No eligible availability zones")
    # If a shared VPC is not used, we can create firewall rules for user
    if self.config.vpc_project_id is None:
        gcp_resources.create_runner_firewall_rules(
            firewalls_client=self.firewalls_client,
            project_id=self.config.project_id,
            network=self.config.vpc_resource_name,
        )
    # size_mib / 1024, rounded to a whole number
    disk_size = round(instance_offer.instance.resources.disk.size_mib / 1024)
    # Choose any usable subnet in a VPC.
    # Configuring a specific subnet per region is not supported yet.
    subnetwork = self._get_vpc_subnet(instance_offer.region)
    extra_subnets = self._get_extra_subnets(
        region=instance_offer.region,
        instance_type_name=instance_offer.instance.name,
    )
    roce_subnets = self._get_roce_subnets(
        region=instance_offer.region,
        instance_type_name=instance_offer.instance.name,
    )
    placement_policy = None
    if placement_group is not None:
        placement_policy = gcp_resources.get_placement_policy_resource_name(
            project_id=self.config.project_id,
            region=instance_offer.region,
            placement_policy=placement_group.name,
        )
    labels = {
        "owner": "dstack",
        "dstack_project": instance_config.project_name.lower(),
        "dstack_name": instance_config.instance_name,
        "dstack_user": instance_config.user.lower(),
    }
    labels = merge_tags(
        base_tags=labels,
        backend_tags=self.config.tags,
        resource_tags=instance_config.tags,
    )
    labels = gcp_resources.filter_invalid_labels(labels)
    # TPU-ness is decided by the name of the first accelerator, if there is one.
    is_tpu = (
        _is_tpu(instance_offer.instance.resources.gpus[0].name)
        if instance_offer.instance.resources.gpus
        else False
    )
    if is_tpu:
        instance_id = instance_name
        startup_script = _get_tpu_startup_script()
        # GCP does not allow attaching disks while a TPU is being created,
        # so we need to attach the disks on creation.
        data_disks = _get_tpu_data_disks(self.config.project_id, instance_config.volumes)
        for zone in zones:
            tpu_node = gcp_resources.create_tpu_node_struct(
                instance_name=instance_offer.instance.name,
                startup_script=startup_script,
                authorized_keys=authorized_keys,
                spot=instance_offer.instance.resources.spot,
                labels=labels,
                runtime_version=_get_tpu_runtime_version(instance_offer.instance.name),
                network=self.config.vpc_resource_name,
                subnetwork=subnetwork,
                allocate_public_ip=allocate_public_ip,
                service_account=self.config.vm_service_account,
                data_disks=data_disks,
            )
            create_node_request = tpu_v2.CreateNodeRequest(
                parent=f"projects/{self.config.project_id}/locations/{zone}",
                node_id=instance_id,
                node=tpu_node,
            )
            try:
                # GCP needs some time to return an error in case of no capacity (< 30s).
                # Call wait_for_operation() to get the capacity error and try another option.
                # If the request succeeds, we'll probably timeout and update_provisioning_data() will get hostname.
                operation = self.tpu_client.create_node(request=create_node_request)
                gcp_resources.wait_for_operation(operation, timeout=30)
            except (
                google.api_core.exceptions.ServiceUnavailable,
                google.api_core.exceptions.NotFound,
                google.api_core.exceptions.ResourceExhausted,
            ) as e:
                # These errors skip the current zone; the next zone is tried.
                logger.debug("Got GCP error when provisioning a TPU: %s", e)
                continue
            except concurrent.futures.TimeoutError:
                # No error within the timeout: proceed as if the request was accepted.
                pass
            return JobProvisioningData(
                backend=instance_offer.backend,
                instance_type=instance_offer.instance,
                instance_id=instance_id,
                hostname=None,
                internal_ip=None,
                region=instance_offer.region,
                availability_zone=zone,
                price=instance_offer.price,
                ssh_port=22,
                username="ubuntu",
                ssh_proxy=None,
                dockerized=True,
                backend_data=json.dumps({"is_tpu": is_tpu, "zone": zone}),
            )
        # Every zone was skipped.
        raise NoCapacityError()

    image = _get_image(
        instance_type_name=instance_offer.instance.name,
        gpu_name=(
            instance_offer.instance.resources.gpus[0].name
            if len(instance_offer.instance.resources.gpus) > 0
            else None
        ),
    )

    for zone in zones:
        reservation = None
        if instance_config.reservation:
            # Look the reservation up again for this zone; skip the zone if it is gone.
            reservation = self._find_reservation(instance_config.reservation).get(zone)
            if reservation is None:
                logger.warning(
                    "Reservation %s no longer exists in zone %s",
                    instance_config.reservation,
                    zone,
                )
                continue
        request = compute_v1.InsertInstanceRequest()
        request.zone = zone
        request.project = self.config.project_id
        request.instance_resource = gcp_resources.create_instance_struct(
            disk_size=disk_size,
            image_id=image.id,
            machine_type=instance_offer.instance.name,
            accelerators=gcp_resources.get_accelerators(
                project_id=self.config.project_id,
                zone=zone,
                gpus=instance_offer.instance.resources.gpus,
            ),
            spot=instance_offer.instance.resources.spot,
            user_data=_get_user_data(
                authorized_keys=authorized_keys,
                instance_type_name=instance_offer.instance.name,
                is_ufw_installed=image.is_ufw_installed,
            ),
            authorized_keys=authorized_keys,
            labels=labels,
            tags=[gcp_resources.DSTACK_INSTANCE_TAG],
            instance_name=instance_name,
            zone=zone,
            service_account=self.config.vm_service_account,
            network=self.config.vpc_resource_name,
            subnetwork=subnetwork,
            extra_subnetworks=extra_subnets,
            roce_subnetworks=roce_subnets,
            allocate_public_ip=allocate_public_ip,
            placement_policy=placement_policy,
            reservation=reservation,
        )
        try:
            # GCP needs some time to return an error in case of no capacity (< 30s).
            # Call wait_for_operation() to get the capacity error and try another option.
            # If the request succeeds, we'll probably timeout and update_provisioning_data() will get hostname.
            operation = self.instances_client.insert(request=request)
            gcp_resources.wait_for_extended_operation(operation, timeout=30)
        except google.api_core.exceptions.BadRequest as e:
            if "Network profile only allows resource creation in location" in e.message:
                # A hack to find the correct RoCE VPC zone by trial and error.
                # Could be better to find it via the API.
                logger.debug("Got GCP error when provisioning a VM: %s", e)
                continue
            # Any other BadRequest is re-raised to the caller.
            raise
        except (
            google.api_core.exceptions.ServiceUnavailable,
            google.api_core.exceptions.NotFound,
        ) as e:
            # These errors skip the current zone; the next zone is tried.
            logger.debug("Got GCP error when provisioning a VM: %s", e)
            continue
        except concurrent.futures.TimeoutError:
            # No error within the timeout: proceed as if the request was accepted.
            pass
        return JobProvisioningData(
            backend=instance_offer.backend,
            instance_type=instance_offer.instance,
            instance_id=instance_name,
            public_ip_enabled=allocate_public_ip,
            hostname=None,
            internal_ip=None,
            region=instance_offer.region,
            availability_zone=zone,
            price=instance_offer.price,
            username="ubuntu",
            ssh_port=22,
            dockerized=True,
            ssh_proxy=None,
            backend_data=json.dumps({"zone": zone}),
        )
    # Every zone was skipped.
    raise NoCapacityError()
450
+
451
def update_provisioning_data(
    self,
    provisioning_data: JobProvisioningData,
    project_ssh_public_key: str,
    project_ssh_private_key: str,
):
    """Fill in `hostname` and `internal_ip` once the VM or TPU node is up.

    Returns without changes while the instance is still starting.

    Raises:
        ProvisioningError: If the instance is not found or is in any state other
            than starting or running.
    """
    use_public_ip = self.config.allocate_public_ips
    if provisioning_data.backend_data is None:
        zone, is_tpu = provisioning_data.region, False
    else:
        backend_data = json.loads(provisioning_data.backend_data)
        zone = backend_data["zone"]
        is_tpu = backend_data.get("is_tpu", False)

    if not is_tpu:
        try:
            vm = self.instances_client.get(
                project=self.config.project_id, zone=zone, instance=provisioning_data.instance_id
            )
        except google.api_core.exceptions.NotFound:
            raise ProvisioningError("Failed to get instance IP address. Instance not found.")

        if vm.status in ("PROVISIONING", "STAGING"):
            return
        if vm.status != "RUNNING":
            raise ProvisioningError(
                f"Failed to get instance IP address. Instance status: {vm.status}"
            )
        provisioning_data.hostname = _get_instance_ip(vm, use_public_ip)
        provisioning_data.internal_ip = vm.network_interfaces[0].network_i_p
        return

    node_request = tpu_v2.GetNodeRequest(
        name=f"projects/{self.config.project_id}/locations/{zone}/nodes/{provisioning_data.instance_id}",
    )
    try:
        node = self.tpu_client.get_node(request=node_request)
    except google.api_core.exceptions.NotFound:
        raise ProvisioningError("Failed to get instance IP address. Instance not found.")

    # See states https://cloud.google.com/python/docs/reference/tpu/latest/google.cloud.tpu_v2.types.Node.State
    if node.state in (0, 1):
        return
    if node.state != 2:
        raise ProvisioningError(
            f"Failed to get instance IP address. Instance state: {node.state}"
        )
    if use_public_ip:
        provisioning_data.hostname = node.network_endpoints[0].access_config.external_ip
    else:
        provisioning_data.hostname = node.network_endpoints[0].ip_address
    provisioning_data.internal_ip = node.network_endpoints[0].ip_address
505
+
506
def create_placement_group(
    self,
    placement_group: PlacementGroup,
    master_instance_offer: InstanceOffer,
) -> PlacementGroupProvisioningData:
    """Submit a COLLOCATED group placement policy named after the placement group.

    The policy is inserted in the region from the placement group configuration.
    `master_instance_offer` is not used.
    """
    region = placement_group.configuration.region
    collocated_policy = compute_v1.ResourcePolicyGroupPlacementPolicy(
        availability_domain_count=1,
        collocation="COLLOCATED",
    )
    resource_policy = compute_v1.ResourcePolicy(
        name=placement_group.name,
        region=region,
        group_placement_policy=collocated_policy,
    )
    self.resource_policies_client.insert(
        project=self.config.project_id,
        region=region,
        resource_policy_resource=resource_policy,
    )
    return PlacementGroupProvisioningData(backend=BackendType.GCP)
525
+
526
def delete_placement_group(
    self,
    placement_group: PlacementGroup,
):
    """Delete the placement policy and wait for the deletion to finish.

    A missing policy is only logged.

    Raises:
        PlacementGroupInUseError: If GCP reports the policy is still in use.
    """
    policy_name = placement_group.name
    try:
        pending = self.resource_policies_client.delete(
            project=self.config.project_id,
            region=placement_group.configuration.region,
            resource_policy=policy_name,
        )
        pending.result()  # Wait for operation to complete
    except google.api_core.exceptions.NotFound:
        logger.debug("Placement group %s not found", policy_name)
    except google.api_core.exceptions.BadRequest as e:
        if "is already being used by" not in e.message:
            raise
        raise PlacementGroupInUseError()
543
+
544
def is_suitable_placement_group(
    self,
    placement_group: PlacementGroup,
    instance_offer: InstanceOffer,
) -> bool:
    """Return True when the placement group and the offer are in the same region."""
    group_region = placement_group.configuration.region
    offer_region = instance_offer.region
    return group_region == offer_region
550
+
551
def are_placement_groups_compatible_with_reservations(self, backend_type: BackendType) -> bool:
    """Always return False.

    Cannot use our own placement policies when provisioning in a reservation.
    Instead, we use the placement policy defined in reservation settings.
    """
    compatible = False
    return compatible
555
+
556
def create_gateway(
    self,
    configuration: GatewayComputeConfiguration,
) -> GatewayProvisioningData:
    """Create a gateway VM in the first zone of the configured region and wait for it.

    Raises:
        ComputeResourceNotFoundError: If the configured region is not in the project.
        ComputeError: If GCP rejects the instance creation.
    """
    project_id = self.config.project_id
    if self.config.vpc_project_id is None:
        gcp_resources.create_gateway_firewall_rules(
            firewalls_client=self.firewalls_client,
            project_id=project_id,
            network=self.config.vpc_resource_name,
        )

    # Find the requested region; stop at the first match.
    gcp_region = next(
        (
            candidate
            for candidate in self.regions_client.list(project=project_id)
            if candidate.name == configuration.region
        ),
        None,
    )
    if gcp_region is None:
        raise ComputeResourceNotFoundError()
    zone = gcp_region.zones[0].split("/")[-1]

    instance_name = generate_unique_gateway_instance_name(
        configuration, max_length=gcp_resources.MAX_RESOURCE_NAME_LEN
    )
    # Choose any usable subnet in a VPC.
    # Configuring a specific subnet per region is not supported yet.
    subnetwork = self._get_vpc_subnet(configuration.region)

    base_labels = {
        "owner": "dstack",
        "dstack_project": configuration.project_name.lower(),
        "dstack_name": configuration.instance_name,
    }
    merged_labels = merge_tags(
        base_tags=base_labels,
        backend_tags=self.config.tags,
        resource_tags=configuration.tags,
    )
    labels = gcp_resources.filter_invalid_labels(merged_labels)

    request = compute_v1.InsertInstanceRequest()
    request.zone = zone
    request.project = project_id
    request.instance_resource = gcp_resources.create_instance_struct(
        disk_size=10,
        image_id=_get_gateway_image_id(),
        machine_type=configuration.instance_type or DEFAULT_GATEWAY_INSTANCE_TYPE,
        accelerators=[],
        spot=False,
        user_data=get_gateway_user_data(
            configuration.ssh_key_pub, router=configuration.router
        ),
        authorized_keys=[configuration.ssh_key_pub],
        labels=labels,
        tags=[gcp_resources.DSTACK_GATEWAY_TAG],
        instance_name=instance_name,
        zone=zone,
        service_account=self.config.vm_service_account,
        network=self.config.vpc_resource_name,
        subnetwork=subnetwork,
        allocate_public_ip=configuration.public_ip,
    )
    try:
        insert_operation = self.instances_client.insert(request=request)
        gcp_resources.wait_for_extended_operation(insert_operation, "instance creation")
    except (
        google.api_core.exceptions.ServiceUnavailable,
        google.api_core.exceptions.ClientError,
    ) as e:
        raise ComputeError(f"GCP error: {e.message}")

    gateway_vm = self.instances_client.get(
        project=project_id, zone=zone, instance=instance_name
    )
    return GatewayProvisioningData(
        instance_id=instance_name,
        region=configuration.region,  # used for instance termination
        availability_zone=zone,
        ip_address=_get_instance_ip(gateway_vm, configuration.public_ip),
        backend_data=json.dumps({"zone": zone}),
    )
632
+
633
def terminate_gateway(
    self,
    instance_id: str,
    configuration: GatewayComputeConfiguration,
    backend_data: Optional[str] = None,
):
    """Terminate the gateway VM by delegating to `terminate_instance`."""
    gateway_region = configuration.region
    self.terminate_instance(
        instance_id=instance_id,
        region=gateway_region,
        backend_data=backend_data,
    )
644
+
645
def register_volume(self, volume: Volume) -> VolumeProvisioningData:
    """Find an existing persistent disk by id in any zone of the volume's region.

    Raises:
        ComputeError: If no zone of the region contains the disk.
    """
    volume_id = volume.configuration.volume_id
    logger.debug("Requesting persistent disk %s", volume_id)
    candidate_zones = gcp_resources.get_availability_zones(
        regions_client=self.regions_client,
        project_id=self.config.project_id,
        region=volume.configuration.region,
    )
    for zone in candidate_zones:
        try:
            found_disk = self.disk_client.get(
                project=self.config.project_id,
                zone=zone,
                disk=volume_id,
            )
        except google.api_core.exceptions.NotFound:
            # Not in this zone; try the next one.
            continue
        logger.debug("Found persistent disk %s", volume_id)
        disk_backend_data = GCPVolumeDiskBackendData(
            disk_type=gcp_resources.full_resource_name_to_name(found_disk.type_),
        )
        return VolumeProvisioningData(
            backend=BackendType.GCP,
            volume_id=found_disk.name,
            size_gb=found_disk.size_gb,
            availability_zone=zone,
            attachable=True,
            detachable=True,
            backend_data=disk_backend_data.json(),
        )
    raise ComputeError(f"Persistent disk {volume_id} not found")
675
+
676
def create_volume(self, volume: Volume) -> VolumeProvisioningData:
    """Create a pd-balanced persistent disk for the volume and wait for it.

    The disk goes into the configured availability zone if one is set and
    belongs to the region, otherwise into the first zone of the region.

    Raises:
        ComputeError: If no suitable zone exists or a disk with the generated
            name already exists.
    """
    zones = gcp_resources.get_availability_zones(
        regions_client=self.regions_client,
        project_id=self.config.project_id,
        region=volume.configuration.region,
    )
    if volume.configuration.availability_zone is not None:
        zones = [z for z in zones if z == volume.configuration.availability_zone]
    if not zones:
        raise ComputeError(
            f"Failed to find availability zone in region {volume.configuration.region}"
        )
    zone = zones[0]

    disk_name = generate_unique_volume_name(
        volume, max_length=gcp_resources.MAX_RESOURCE_NAME_LEN
    )

    labels = {
        "owner": "dstack",
        "dstack_project": volume.project_name.lower(),
        "dstack_name": volume.name,
        # GCP label values must be lowercase; lowercase the user name the same
        # way create_instance() does so the label is not rejected as invalid.
        "dstack_user": volume.user.lower(),
    }
    labels = merge_tags(
        base_tags=labels,
        backend_tags=self.config.tags,
        resource_tags=volume.configuration.tags,
    )
    labels = gcp_resources.filter_invalid_labels(labels)

    disk = compute_v1.Disk()
    disk.name = disk_name
    disk.size_gb = volume.configuration.size_gb
    disk.type_ = f"zones/{zone}/diskTypes/pd-balanced"
    disk.labels = labels

    logger.debug("Creating persistent disk for volume %s", volume.name)
    try:
        operation = self.disk_client.insert(
            project=self.config.project_id,
            zone=zone,
            disk_resource=disk,
        )
        gcp_resources.wait_for_extended_operation(operation, "persistent disk creation")
    except google.api_core.exceptions.Conflict:
        raise ComputeError(f"Volume {volume.name} already exists")
    # Re-read the disk so the returned name and size come from GCP.
    created_disk = self.disk_client.get(
        project=self.config.project_id,
        zone=zone,
        disk=disk_name,
    )
    logger.debug("Created persistent disk for volume %s", volume.name)
    return VolumeProvisioningData(
        backend=BackendType.GCP,
        volume_id=created_disk.name,
        size_gb=created_disk.size_gb,
        availability_zone=zone,
        price=_get_volume_price(created_disk.size_gb),
        attachable=True,
        detachable=True,
        backend_data=GCPVolumeDiskBackendData(
            disk_type=gcp_resources.full_resource_name_to_name(disk.type_),
        ).json(),
    )
741
+
742
def delete_volume(self, volume: Volume):
    """Delete the volume's persistent disk and wait for the deletion to finish.

    A disk that no longer exists is treated as already deleted: it is logged
    and the method returns without raising.
    """
    logger.debug("Deleting persistent disk for volume %s", volume.name)
    try:
        operation = self.disk_client.delete(
            project=self.config.project_id,
            zone=get_or_error(volume.provisioning_data).availability_zone,
            disk=volume.volume_id,
        )
        gcp_resources.wait_for_extended_operation(operation, "persistent disk deletion")
    except google.api_core.exceptions.NotFound:
        logger.debug("Failed to find persistent disk for volume %s", volume.name)
        # Return here so the "Deleted" message below is only logged for a real deletion.
        return
    logger.debug("Deleted persistent disk for volume %s", volume.name)
755
+
756
def attach_volume(
    self, volume: Volume, provisioning_data: JobProvisioningData
) -> VolumeAttachmentData:
    """Attach the volume's persistent disk to a VM or TPU node and wait for it.

    For TPUs, a disk already listed in the node's data disks is not attached again.

    Returns:
        Attachment data with the device name of the disk on the instance.

    Raises:
        ComputeError: If the instance type does not support persistent disks,
            or the disk or the instance cannot be found.
    """
    instance_id = provisioning_data.instance_id
    logger.debug(
        "Attaching persistent disk for volume %s to instance %s",
        volume.volume_id,
        instance_id,
    )
    if not gcp_resources.instance_type_supports_persistent_disk(
        provisioning_data.instance_type.name
    ):
        raise ComputeError(
            f"Instance type {provisioning_data.instance_type.name} does not support Persistent disk volumes"
        )

    zone = get_or_error(volume.provisioning_data).availability_zone
    is_tpu = _is_tpu_provisioning_data(provisioning_data)
    try:
        disk = self.disk_client.get(
            project=self.config.project_id,
            zone=zone,
            disk=volume.volume_id,
        )
        disk_url = disk.self_link
    except google.api_core.exceptions.NotFound:
        # The message used to read "Persistent disk found", the opposite of the failure.
        raise ComputeError("Persistent disk not found")

    try:
        if is_tpu:
            get_node_request = tpu_v2.GetNodeRequest(
                name=f"projects/{self.config.project_id}/locations/{zone}/nodes/{instance_id}",
            )
            tpu_node = self.tpu_client.get_node(get_node_request)

            # Python API to attach a disk to a TPU is not documented,
            # so we follow the code from the gcloud CLI:
            # https://github.com/twistedpair/google-cloud-sdk/blob/26ab5a281d56b384cc25750f3279a27afe5b499f/google-cloud-sdk/lib/googlecloudsdk/command_lib/compute/tpus/tpu_vm/util.py#L113
            source_disk = (
                f"projects/{self.config.project_id}/zones/{zone}/disks/{volume.volume_id}"
            )
            # create_instance() has already attached the disks
            # if the TPU is provisioned on the run submission via run_job()
            for i, existing_disk in enumerate(tpu_node.data_disks, start=1):
                if existing_disk.source_disk == source_disk:
                    device_name = f"persistent-disk-{i}"
                    logger.debug(
                        "Persistent disk for volume %s is already attached to instance %s",
                        volume.volume_id,
                        instance_id,
                    )
                    return VolumeAttachmentData(device_name=device_name)
            attached_disk = tpu_v2.AttachedDisk(
                source_disk=source_disk,
                mode=tpu_v2.AttachedDisk.DiskMode.READ_WRITE,
            )
            tpu_node.data_disks.append(attached_disk)
            # Cannot set device name for TPUs, so use default naming
            device_name = f"persistent-disk-{len(tpu_node.data_disks)}"
            update_node_request = tpu_v2.UpdateNodeRequest(
                node=tpu_node,
                update_mask="dataDisks",
            )
            operation = self.tpu_client.update_node(update_node_request)
            gcp_resources.wait_for_operation(operation, "persistent disk attachment")
        else:
            attached_disk = compute_v1.AttachedDisk()
            attached_disk.source = disk_url
            # Keep the disk when the instance is deleted.
            attached_disk.auto_delete = False
            attached_disk.device_name = f"pd-{volume.volume_id}"
            device_name = attached_disk.device_name
            operation = self.instances_client.attach_disk(
                project=self.config.project_id,
                zone=zone,
                instance=instance_id,
                attached_disk_resource=attached_disk,
            )
            gcp_resources.wait_for_extended_operation(operation, "persistent disk attachment")
    except google.api_core.exceptions.NotFound:
        raise ComputeError("Disk or instance not found")
    logger.debug(
        "Attached persistent disk for volume %s to instance %s", volume.volume_id, instance_id
    )
    return VolumeAttachmentData(device_name=device_name)
840
+
841
def detach_volume(
    self, volume: Volume, provisioning_data: JobProvisioningData, force: bool = False
):
    """Detach the volume's persistent disk from a TPU node or a compute instance.

    For TPUs the node is fetched, the disk is removed from its `data_disks`,
    and the node is updated. For other instances the disk is detached by the
    device name recorded in the volume's attachment data.

    `force` is accepted but not used by this implementation.

    Raises:
        ComputeError: if the TPU node lookup reports that the node is not found.
    """
    instance_id = provisioning_data.instance_id
    logger.debug(
        "Detaching persistent disk for volume %s from instance %s",
        volume.volume_id,
        instance_id,
    )
    zone = get_or_error(volume.provisioning_data).availability_zone
    attachment_data = get_or_error(volume.get_attachment_data_for_instance(instance_id))

    if not _is_tpu_provisioning_data(provisioning_data):
        operation = self.instances_client.detach_disk(
            project=self.config.project_id,
            zone=zone,
            instance=instance_id,
            device_name=attachment_data.device_name,
        )
        gcp_resources.wait_for_extended_operation(operation, "persistent disk detachment")
    else:
        node_name = f"projects/{self.config.project_id}/locations/{zone}/nodes/{instance_id}"
        try:
            tpu_node = self.tpu_client.get_node(tpu_v2.GetNodeRequest(name=node_name))
        except google.api_core.exceptions.NotFound:
            raise ComputeError("Instance not found")

        source_disk = f"projects/{self.config.project_id}/zones/{zone}/disks/{volume.volume_id}"
        # Keep every data disk except the one that backs this volume
        remaining_disks = [
            disk for disk in tpu_node.data_disks if disk.source_disk != source_disk
        ]
        tpu_node.data_disks = remaining_disks
        operation = self.tpu_client.update_node(
            tpu_v2.UpdateNodeRequest(
                node=tpu_node,
                update_mask="dataDisks",
            )
        )
        gcp_resources.wait_for_operation(operation, "persistent disk detachment")

    logger.debug(
        "Detached persistent disk for volume %s from instance %s",
        volume.volume_id,
        instance_id,
    )
887
+
888
def _get_extra_subnets(
    self,
    region: str,
    instance_type_name: str,
) -> List[Tuple[str, str]]:
    """Return (VPC resource name, subnet) pairs for the extra NICs of an instance type.

    Only the first N entries of `config.extra_vpcs` are used, where N depends on
    the instance type. An empty list is returned when no extra VPCs are configured
    or the instance type is not one of the known multi-NIC types.
    """
    if self.config.extra_vpcs is None:
        return []
    subnets_num_by_instance_type = {
        "a3-megagpu-8g": 8,
        "a3-edgegpu-8g": 4,
        "a3-highgpu-8g": 4,
        "a4-highgpu-8g": 1,  # 1 main + 1 extra + 8 RoCE
    }
    subnets_num = subnets_num_by_instance_type.get(instance_type_name)
    if subnets_num is None:
        return []
    vpc_project_id = self.config.vpc_project_id or self.config.project_id
    pairs = []
    for vpc_name in self.config.extra_vpcs[:subnets_num]:
        # Resolve the subnet first: this lookup is the one that may fail
        subnet = gcp_resources.get_vpc_subnet_or_error(
            vpc_name=vpc_name,
            region=region,
            usable_subnets=self._list_usable_subnets(),
        )
        resource_name = gcp_resources.vpc_name_to_vpc_resource_name(
            project_id=vpc_project_id,
            vpc_name=vpc_name,
        )
        pairs.append((resource_name, subnet))
    return pairs
916
+
917
def _get_roce_subnets(
    self,
    region: str,
    instance_type_name: str,
) -> List[Tuple[str, str]]:
    """Return (VPC resource name, subnet) pairs for the RoCE NICs of an instance type.

    Only `a4-highgpu-8g` gets RoCE NICs (8 of them); for any other instance type,
    or when no RoCE VPC is configured, the result is an empty list.

    Raises:
        ComputeError: if the RoCE VPC has fewer subnets in the region than required.
    """
    if not self.config.roce_vpcs or instance_type_name != "a4-highgpu-8g":
        return []
    nics_num = 8
    roce_vpc = self.config.roce_vpcs[0]  # roce_vpcs is validated to have at most 1 item
    subnets = gcp_resources.get_vpc_subnets(
        vpc_name=roce_vpc,
        region=region,
        usable_subnets=self._list_usable_subnets(),
    )
    if len(subnets) < nics_num:
        raise ComputeError(
            f"{instance_type_name} requires {nics_num} RoCE subnets,"
            f" but only {len(subnets)} are available in VPC {roce_vpc}"
        )
    vpc_resource_name = gcp_resources.vpc_name_to_vpc_resource_name(
        project_id=self.config.vpc_project_id or self.config.project_id,
        vpc_name=roce_vpc,
    )
    # One NIC per subnet, all of them in the same RoCE VPC
    return [(vpc_resource_name, subnet) for subnet in subnets[:nics_num]]
947
+
948
@cachedmethod(
    cache=lambda self: self._usable_subnets_cache.cache,
    lock=lambda self: self._usable_subnets_cache.lock,
)
def _list_usable_subnets(self) -> list[compute_v1.UsableSubnetwork]:
    """List the usable subnets of the VPC project (or the main project if unset).

    The result is cached through `self._usable_subnets_cache`.
    """
    # To avoid hitting the `ListUsable requests per minute` system limit, we fetch all subnets
    # at once and cache them
    subnets_project_id = self.config.vpc_project_id or self.config.project_id
    return gcp_resources.list_project_usable_subnets(
        subnetworks_client=self.subnetworks_client,
        project_id=subnets_project_id,
    )
959
+
960
def _get_vpc_subnet(self, region: str) -> Optional[str]:
    """Return the subnet of the configured VPC in `region`, or None if no VPC is configured."""
    vpc_name = self.config.vpc_name
    if vpc_name is not None:
        return gcp_resources.get_vpc_subnet_or_error(
            vpc_name=vpc_name,
            region=region,
            usable_subnets=self._list_usable_subnets(),
        )
    return None
968
+
969
@cachedmethod(
    cache=lambda self: self._reservation_cache.cache,
    lock=lambda self: self._reservation_cache.lock,
)
def _find_reservation(self, configured_name: str) -> dict[str, compute_v1.Reservation]:
    """Look up a reservation by its configured name.

    `configured_name` may either fully match `RESERVATION_PATTERN` (which carries
    the project ID and the reservation name) or be a bare resource name, in which
    case the backend's own project is used. Any other value yields an empty dict.
    The result is cached through `self._reservation_cache`.
    """
    qualified = RESERVATION_PATTERN.fullmatch(configured_name)
    if qualified:
        project_id = qualified.group("project_id")
        name = qualified.group("reservation_name")
    elif RESOURCE_NAME_PATTERN.fullmatch(configured_name):
        project_id = self.config.project_id
        name = configured_name
    else:
        # misconfigured or non-GCP
        return {}
    return gcp_resources.find_reservation(
        reservations_client=self.reservations_client,
        project_id=project_id,
        name=name,
    )
988
+
989
+
990
def _supported_instances_and_zones(
    regions: List[str],
) -> Optional[Callable[[InstanceOffer], bool]]:
    """Build an offer filter that keeps supported instance types in the given regions.

    The returned predicate rejects offers outside `regions` and multi-host TPUs,
    accepts instances from a fixed set of machine families, and accepts any other
    GPU instance whose first GPU is not a K80 or P4.
    """
    supported_families = (
        "m4-",
        "c4-",
        "n4-",
        "h3-",
        "n2-",
        "e2-medium",
        "e2-standard-",
        "e2-highmem-",
        "e2-highcpu-",
        "m1-",
        "a2-",
        "a3-",
        "g2-",
    )
    excluded_gpus = {"K80", "P4"}

    def _filter(offer: InstanceOffer) -> bool:
        # strip zone
        if offer.region[:-2] not in regions:
            return False
        instance_name = offer.instance.name
        # remove multi-host TPUs for initial release
        if _is_tpu(instance_name) and not _is_single_host_tpu(instance_name):
            return False
        if instance_name.startswith(supported_families):
            return True
        gpus = offer.instance.resources.gpus
        return bool(gpus) and gpus[0].name not in excluded_gpus

    return _filter
1023
+
1024
+
1025
def _has_gpu_quota(quotas: Dict[str, float], resources: Resources) -> bool:
    """Check whether `quotas` allows the number of GPUs requested in `resources`.

    Instances without GPUs, TPUs, and GPUs whose quotas cannot be listed are
    always considered to fit. Otherwise the GPU count is compared with the quota
    named after the first GPU (prefixed with `PREEMPTIBLE_` for spot); a missing
    quota counts as 0.
    """
    gpus = resources.gpus
    if not gpus:
        return True
    gpu = gpus[0]
    # B200, H100, H100_MEGA, and RTXPRO6000 quotas are not returned by `regions_client.list`
    if _is_tpu(gpu.name) or gpu.name in ("B200", "H100", "RTXPRO6000"):
        return True
    if gpu.name == "A100" and gpu.memory_mib == 80 * 1024:
        quota_name = "NVIDIA_A100_80GB_GPUS"
    else:
        quota_name = f"NVIDIA_{gpu.name}_GPUS"
    if resources.spot:
        quota_name = "PREEMPTIBLE_" + quota_name
    return len(gpus) <= quotas.get(quota_name, 0)
1040
+
1041
+
1042
def _offer_matches_reservation(
    offer: InstanceOfferWithAvailability, reservation: compute_v1.Reservation
) -> bool:
    """Tell whether the offer's instance matches the reservation's instance properties.

    The machine type, the accelerator count, and the accelerator type must all
    agree. Reservations without specific instance properties, or with more than
    one accelerator type, never match.
    """
    specific = reservation.specific_reservation
    if specific is None or specific.instance_properties is None:
        return False
    properties = specific.instance_properties
    if properties.machine_type != offer.instance.name:
        return False
    offer_gpus = offer.instance.resources.gpus
    accelerators = properties.guest_accelerators or []
    if not accelerators:
        # A reservation without accelerators only fits offers without GPUs
        return not offer_gpus
    if len(accelerators) > 1:
        logger.warning(
            "Expected 0 or 1 accelerator types per instance,"
            f" but {properties.machine_type} has {len(accelerators)}."
            f" Ignoring reservation {reservation.self_link}"
        )
        return False
    accelerator = accelerators[0]
    if accelerator.accelerator_count != len(offer_gpus):
        return False
    if offer_gpus:
        first_gpu = offer_gpus[0]
        expected_type = gcp_resources.find_accelerator_name(
            first_gpu.name,
            first_gpu.memory_mib,
        )
        if expected_type != accelerator.accelerator_type:
            return False
    return True
1076
+
1077
+
1078
def _reservation_has_capacity(reservation: compute_v1.Reservation) -> bool:
    """Return True if the specific reservation has fewer instances in use than assured.

    Missing reservation details or missing counters are treated as no capacity.
    """
    specific = reservation.specific_reservation
    if specific is None:
        return False
    if specific.in_use_count is None:
        return False
    if specific.assured_count is None:
        return False
    return specific.in_use_count < specific.assured_count
1086
+
1087
+
1088
def _unique_instance_name(instance: InstanceType) -> str:
    """Build a name that distinguishes an instance type by spot flag and first GPU.

    The result is the instance name, followed by `-spot` for spot instances and
    by `-<gpu name>-<gpu memory in MiB>` when the instance has GPUs.
    """
    resources = instance.resources
    name = f"{instance.name}-spot" if resources.spot else instance.name
    if resources.gpus:
        gpu = resources.gpus[0]
        return f"{name}-{gpu.name}-{gpu.memory_mib}"
    return name
1097
+
1098
+
1099
@dataclass
class GCPImage:
    """An image to boot an instance from, plus what is known about its firewall tooling."""

    # Image resource path, e.g. "projects/<project>/global/images/<name>"
    id: str
    # Whether ufw is available in the image
    is_ufw_installed: bool
1103
+
1104
+
1105
def _get_image(instance_type_name: str, gpu_name: Optional[str]) -> GCPImage:
    """Pick the boot image for an instance type and (optional) GPU.

    `a3-edgegpu-8g` and `a3-highgpu-8g` use a fixed COS image. All other
    instances use an image from the `dstack` project: a dedicated image for
    `a3-megagpu-8g`, a CUDA image for GPU instances, and the base image otherwise.
    """
    if instance_type_name in ("a3-edgegpu-8g", "a3-highgpu-8g"):
        return GCPImage(
            id="projects/cos-cloud/global/images/cos-105-17412-535-78",
            is_ufw_installed=False,
        )
    if instance_type_name == "a3-megagpu-8g":
        image_name = "dstack-a3mega-5"
        is_ufw_installed = False
    elif gpu_name is None:
        image_name = f"dstack-{version.base_image}"
        is_ufw_installed = True
    elif requires_nvidia_proprietary_kernel_modules(gpu_name):
        image_name = f"dstack-cuda-{DSTACK_OS_IMAGE_WITH_PROPRIETARY_NVIDIA_KERNEL_MODULES}"
        is_ufw_installed = True
    else:
        image_name = f"dstack-cuda-{version.base_image}"
        is_ufw_installed = True
    # Dots from the version are replaced with dashes in the image name
    image_name = image_name.replace(".", "-")
    return GCPImage(
        id=f"projects/dstack/global/images/{image_name}",
        is_ufw_installed=is_ufw_installed,
    )
1128
+
1129
+
1130
def _get_gateway_image_id() -> str:
    """Return the resource path of the pinned Ubuntu 22.04 image used for gateways."""
    gateway_image_id = "projects/ubuntu-os-cloud/global/images/ubuntu-2204-jammy-v20230714"
    return gateway_image_id
1132
+
1133
+
1134
def _get_user_data(
    authorized_keys: List[str], instance_type_name: str, is_ufw_installed: bool
) -> str:
    """Build the instance user data, with path and env overrides for COS-based instances.

    For `a3-edgegpu-8g` and `a3-highgpu-8g` the shim/runner paths point to `/etc`
    and the shim's `PATH` is extended with the NVIDIA binaries directory; all other
    instance types pass None for these overrides.
    """
    uses_cos_image = instance_type_name in ("a3-edgegpu-8g", "a3-highgpu-8g")
    if uses_cos_image:
        # In the COS image the / file system is not writable.
        # /home and /var are writable but not executable.
        # Only /etc is both writable and executable, so use it for shim/runner binaries.
        # See: https://cloud.google.com/container-optimized-os/docs/concepts/disks-and-filesystem
        base_path = bin_path = "/etc"
        backend_shim_env = {
            # In COS nvidia binaries are not installed on PATH by default.
            # Set so that shim can run nvidia-smi.
            "PATH": "/var/lib/nvidia/bin:$PATH",
        }
    else:
        base_path = bin_path = backend_shim_env = None
    backend_specific_commands = _get_backend_specific_commands(
        instance_type_name=instance_type_name,
    )
    return get_user_data(
        authorized_keys=authorized_keys,
        backend_specific_commands=backend_specific_commands,
        base_path=base_path,
        bin_path=bin_path,
        backend_shim_env=backend_shim_env,
        # Instance-level firewall is optional on GCP. The main protection comes from GCP firewalls.
        # So only set up instance-level firewall as an additional measure if ufw is available.
        skip_firewall_setup=not is_ufw_installed,
    )
1163
+
1164
+
1165
def _get_backend_specific_commands(instance_type_name: str) -> List[str]:
    """Return the TCPX/TCPXO setup commands for A3 instance types, or an empty list otherwise."""
    if instance_type_name == "a3-megagpu-8g":
        commands = tcpx_features.get_backend_specific_commands_tcpxo()
    elif instance_type_name in ("a3-edgegpu-8g", "a3-highgpu-8g"):
        commands = tcpx_features.get_backend_specific_commands_tcpx()
    else:
        commands = []
    return commands
1171
+
1172
+
1173
def _get_volume_price(size: int) -> float:
    """Estimate the price of a persistent disk of the given size.

    NOTE(review): the size unit and the billing period are not stated here —
    presumably GB and per month; confirm against the pricing page below.
    """
    # https://cloud.google.com/compute/disks-image-pricing#persistentdisk
    # The price is different in different regions. Take max across supported regions.
    max_price_per_size_unit = 0.12
    return size * max_price_per_size_unit
1177
+
1178
+
1179
def _get_tpu_startup_script() -> str:
    """Build the bash startup script that runs the shim commands on a TPU VM.

    The shim commands are chained with `&&` on a single line after the shebang.
    """
    shim_commands = get_shim_commands(is_privileged=True, pjrt_device="TPU")
    chained_commands = " && ".join(shim_commands)
    return "#! /bin/bash\n" + chained_commands
1184
+
1185
+
1186
def _is_tpu(instance_name: str) -> bool:
    """Return True if the name looks like `<tpu version>-<cores>` with a known TPU version."""
    parts = instance_name.split("-")
    if len(parts) != 2:
        return False
    tpu_version, cores = parts
    return tpu_version in TPU_VERSIONS and cores.isdigit()
1193
+
1194
+
1195
def _get_tpu_runtime_version(instance_name: str) -> str:
    """Map a TPU instance name to its runtime version.

    v6e and v5litepod have dedicated runtimes; every other TPU version uses
    `tpu-ubuntu2204-base`.
    """
    runtime_by_tpu_version = {
        "v6e": "v2-alpha-tpuv6e",
        "v5litepod": "v2-alpha-tpuv5-lite",
    }
    return runtime_by_tpu_version.get(
        _get_tpu_version(instance_name),
        "tpu-ubuntu2204-base",
    )
1202
+
1203
+
1204
def _get_tpu_version(instance_name: str) -> str:
    """Return the part of the instance name before the first dash (the TPU version)."""
    tpu_version, _, _ = instance_name.partition("-")
    return tpu_version
1206
+
1207
+
1208
def _is_single_host_tpu(instance_name: str) -> bool:
    """Return True if the TPU named `<version>-<tensor cores>` is a single-host TPU.

    TPUs of versions v2, v3, v5p, v5litepod and v6e count as single-host up to
    8 tensor cores; v4 never does. Names that cannot be parsed or have an
    unknown version are logged and treated as not single-host.
    """
    parts = instance_name.split("-")
    if len(parts) != 2:
        logger.info("Skipping unknown TPU: %s", instance_name)
        return False
    tpu_version, raw_tensor_cores = parts
    try:
        tensor_cores_num = int(raw_tensor_cores)
    except ValueError:
        logger.info("Skipping TPU due to invalid number of tensor cores: %s", raw_tensor_cores)
        return False
    if tpu_version == "v4":
        return False
    if tpu_version in ("v2", "v3", "v5p", "v5litepod", "v6e"):
        return tensor_cores_num <= 8
    logger.info("Skipping unknown TPU: %s", instance_name)
    return False
1226
+
1227
+
1228
def _get_tpu_data_disks(
    project_id: str, volumes: Optional[List[Volume]]
) -> List[tpu_v2.AttachedDisk]:
    """Convert volumes to TPU data disks; no volumes (None) yields an empty list."""
    requested_volumes = [] if volumes is None else volumes
    return [
        _get_tpu_data_disk_for_volume(project_id, volume) for volume in requested_volumes
    ]
1234
+
1235
+
1236
def _get_tpu_data_disk_for_volume(project_id: str, volume: Volume) -> tpu_v2.AttachedDisk:
    """Describe the volume's persistent disk as a read-write TPU data disk.

    The disk is addressed by project, the volume's availability zone, and the volume ID.
    """
    zone = get_or_error(volume.provisioning_data).availability_zone
    return tpu_v2.AttachedDisk(
        source_disk=f"projects/{project_id}/zones/{zone}/disks/{volume.volume_id}",
        mode=tpu_v2.AttachedDisk.DiskMode.READ_WRITE,
    )
1244
+
1245
+
1246
def _is_tpu_provisioning_data(provisioning_data: JobProvisioningData) -> bool:
    """Read the `is_tpu` flag from the JSON-encoded backend data (False if absent)."""
    if not provisioning_data.backend_data:
        return False
    backend_data = json.loads(provisioning_data.backend_data)
    return backend_data.get("is_tpu", False)
1252
+
1253
+
1254
def _get_instance_ip(instance: Instance, public_ip: bool) -> str:
    """Return the IP of the instance's first network interface.

    With `public_ip` set, this is the NAT IP of the interface's first access
    config; otherwise it is the interface's internal network IP.
    """
    primary_interface = instance.network_interfaces[0]
    if not public_ip:
        return primary_interface.network_i_p
    return primary_interface.access_configs[0].nat_i_p