dstack 0.18.43__py3-none-any.whl → 0.19.0rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (278) hide show
  1. dstack/_internal/cli/commands/gateway.py +15 -3
  2. dstack/_internal/cli/commands/logs.py +0 -22
  3. dstack/_internal/cli/commands/stats.py +8 -17
  4. dstack/_internal/cli/main.py +1 -5
  5. dstack/_internal/cli/services/configurators/fleet.py +4 -39
  6. dstack/_internal/cli/services/configurators/run.py +22 -20
  7. dstack/_internal/cli/services/profile.py +34 -83
  8. dstack/_internal/cli/utils/gateway.py +1 -1
  9. dstack/_internal/cli/utils/run.py +11 -0
  10. dstack/_internal/core/backends/__init__.py +56 -39
  11. dstack/_internal/core/backends/aws/__init__.py +0 -25
  12. dstack/_internal/core/backends/aws/auth.py +1 -10
  13. dstack/_internal/core/backends/aws/backend.py +26 -0
  14. dstack/_internal/core/backends/aws/compute.py +21 -45
  15. dstack/_internal/{server/services/backends/configurators/aws.py → core/backends/aws/configurator.py} +46 -85
  16. dstack/_internal/core/backends/aws/models.py +135 -0
  17. dstack/_internal/core/backends/aws/resources.py +1 -1
  18. dstack/_internal/core/backends/azure/__init__.py +0 -20
  19. dstack/_internal/core/backends/azure/auth.py +2 -11
  20. dstack/_internal/core/backends/azure/backend.py +21 -0
  21. dstack/_internal/core/backends/azure/compute.py +14 -28
  22. dstack/_internal/{server/services/backends/configurators/azure.py → core/backends/azure/configurator.py} +141 -210
  23. dstack/_internal/core/backends/azure/models.py +89 -0
  24. dstack/_internal/core/backends/base/__init__.py +0 -12
  25. dstack/_internal/core/backends/base/backend.py +18 -0
  26. dstack/_internal/core/backends/base/compute.py +153 -33
  27. dstack/_internal/core/backends/base/configurator.py +105 -0
  28. dstack/_internal/core/backends/base/models.py +14 -0
  29. dstack/_internal/core/backends/configurators.py +138 -0
  30. dstack/_internal/core/backends/cudo/__init__.py +0 -15
  31. dstack/_internal/core/backends/cudo/backend.py +16 -0
  32. dstack/_internal/core/backends/cudo/compute.py +8 -26
  33. dstack/_internal/core/backends/cudo/configurator.py +72 -0
  34. dstack/_internal/core/backends/cudo/models.py +37 -0
  35. dstack/_internal/core/backends/datacrunch/__init__.py +0 -15
  36. dstack/_internal/core/backends/datacrunch/backend.py +16 -0
  37. dstack/_internal/core/backends/datacrunch/compute.py +8 -25
  38. dstack/_internal/core/backends/datacrunch/configurator.py +66 -0
  39. dstack/_internal/core/backends/datacrunch/models.py +38 -0
  40. dstack/_internal/core/{models/backends/dstack.py → backends/dstack/models.py} +7 -7
  41. dstack/_internal/core/backends/gcp/__init__.py +0 -16
  42. dstack/_internal/core/backends/gcp/auth.py +2 -11
  43. dstack/_internal/core/backends/gcp/backend.py +17 -0
  44. dstack/_internal/core/backends/gcp/compute.py +14 -44
  45. dstack/_internal/{server/services/backends/configurators/gcp.py → core/backends/gcp/configurator.py} +46 -103
  46. dstack/_internal/core/backends/gcp/models.py +125 -0
  47. dstack/_internal/core/backends/kubernetes/__init__.py +0 -15
  48. dstack/_internal/core/backends/kubernetes/backend.py +16 -0
  49. dstack/_internal/core/backends/kubernetes/compute.py +16 -5
  50. dstack/_internal/core/backends/kubernetes/configurator.py +55 -0
  51. dstack/_internal/core/backends/kubernetes/models.py +72 -0
  52. dstack/_internal/core/backends/lambdalabs/__init__.py +0 -16
  53. dstack/_internal/core/backends/lambdalabs/backend.py +17 -0
  54. dstack/_internal/core/backends/lambdalabs/compute.py +7 -28
  55. dstack/_internal/core/backends/lambdalabs/configurator.py +82 -0
  56. dstack/_internal/core/backends/lambdalabs/models.py +37 -0
  57. dstack/_internal/core/backends/local/__init__.py +0 -13
  58. dstack/_internal/core/backends/local/backend.py +14 -0
  59. dstack/_internal/core/backends/local/compute.py +16 -2
  60. dstack/_internal/core/backends/models.py +128 -0
  61. dstack/_internal/core/backends/oci/__init__.py +0 -15
  62. dstack/_internal/core/backends/oci/auth.py +1 -5
  63. dstack/_internal/core/backends/oci/backend.py +16 -0
  64. dstack/_internal/core/backends/oci/compute.py +9 -23
  65. dstack/_internal/{server/services/backends/configurators/oci.py → core/backends/oci/configurator.py} +40 -85
  66. dstack/_internal/core/{models/backends/oci.py → backends/oci/models.py} +24 -25
  67. dstack/_internal/core/backends/oci/region.py +1 -1
  68. dstack/_internal/core/backends/runpod/__init__.py +0 -15
  69. dstack/_internal/core/backends/runpod/backend.py +16 -0
  70. dstack/_internal/core/backends/runpod/compute.py +28 -6
  71. dstack/_internal/core/backends/runpod/configurator.py +59 -0
  72. dstack/_internal/core/backends/runpod/models.py +54 -0
  73. dstack/_internal/core/backends/template/__init__.py +0 -0
  74. dstack/_internal/core/backends/tensordock/__init__.py +0 -15
  75. dstack/_internal/core/backends/tensordock/backend.py +16 -0
  76. dstack/_internal/core/backends/tensordock/compute.py +8 -27
  77. dstack/_internal/core/backends/tensordock/configurator.py +68 -0
  78. dstack/_internal/core/backends/tensordock/models.py +38 -0
  79. dstack/_internal/core/backends/vastai/__init__.py +0 -15
  80. dstack/_internal/core/backends/vastai/backend.py +16 -0
  81. dstack/_internal/core/backends/vastai/compute.py +2 -2
  82. dstack/_internal/core/backends/vastai/configurator.py +66 -0
  83. dstack/_internal/core/backends/vastai/models.py +37 -0
  84. dstack/_internal/core/backends/vultr/__init__.py +0 -15
  85. dstack/_internal/core/backends/vultr/backend.py +16 -0
  86. dstack/_internal/core/backends/vultr/compute.py +10 -24
  87. dstack/_internal/core/backends/vultr/configurator.py +64 -0
  88. dstack/_internal/core/backends/vultr/models.py +34 -0
  89. dstack/_internal/core/models/backends/__init__.py +0 -184
  90. dstack/_internal/core/models/backends/base.py +0 -19
  91. dstack/_internal/core/models/configurations.py +22 -16
  92. dstack/_internal/core/models/envs.py +4 -3
  93. dstack/_internal/core/models/fleets.py +17 -22
  94. dstack/_internal/core/models/gateways.py +3 -3
  95. dstack/_internal/core/models/instances.py +24 -0
  96. dstack/_internal/core/models/profiles.py +85 -45
  97. dstack/_internal/core/models/projects.py +1 -1
  98. dstack/_internal/core/models/repos/base.py +0 -5
  99. dstack/_internal/core/models/repos/local.py +3 -3
  100. dstack/_internal/core/models/repos/remote.py +26 -12
  101. dstack/_internal/core/models/repos/virtual.py +1 -1
  102. dstack/_internal/core/models/resources.py +45 -76
  103. dstack/_internal/core/models/runs.py +21 -19
  104. dstack/_internal/core/models/volumes.py +1 -3
  105. dstack/_internal/core/services/profiles.py +7 -16
  106. dstack/_internal/core/services/repos.py +0 -4
  107. dstack/_internal/server/app.py +11 -4
  108. dstack/_internal/server/background/__init__.py +10 -0
  109. dstack/_internal/server/background/tasks/process_gateways.py +4 -8
  110. dstack/_internal/server/background/tasks/process_instances.py +14 -9
  111. dstack/_internal/server/background/tasks/process_metrics.py +1 -1
  112. dstack/_internal/server/background/tasks/process_placement_groups.py +5 -1
  113. dstack/_internal/server/background/tasks/process_prometheus_metrics.py +135 -0
  114. dstack/_internal/server/background/tasks/process_running_jobs.py +80 -24
  115. dstack/_internal/server/background/tasks/process_runs.py +1 -0
  116. dstack/_internal/server/background/tasks/process_submitted_jobs.py +20 -38
  117. dstack/_internal/server/background/tasks/process_volumes.py +5 -2
  118. dstack/_internal/server/migrations/versions/60e444118b6d_add_jobprometheusmetrics.py +40 -0
  119. dstack/_internal/server/migrations/versions/7bc2586e8b9e_make_instancemodel_pool_id_optional.py +36 -0
  120. dstack/_internal/server/migrations/versions/98d1b92988bc_add_jobterminationreason_terminated_due_.py +140 -0
  121. dstack/_internal/server/migrations/versions/bc8ca4a505c6_store_backendtype_as_string.py +171 -0
  122. dstack/_internal/server/models.py +59 -9
  123. dstack/_internal/server/routers/backends.py +14 -23
  124. dstack/_internal/server/routers/instances.py +3 -4
  125. dstack/_internal/server/routers/metrics.py +31 -10
  126. dstack/_internal/server/routers/prometheus.py +36 -0
  127. dstack/_internal/server/routers/repos.py +1 -2
  128. dstack/_internal/server/routers/runs.py +13 -59
  129. dstack/_internal/server/schemas/gateways.py +14 -23
  130. dstack/_internal/server/schemas/projects.py +7 -2
  131. dstack/_internal/server/schemas/repos.py +2 -38
  132. dstack/_internal/server/schemas/runner.py +1 -0
  133. dstack/_internal/server/schemas/runs.py +1 -24
  134. dstack/_internal/server/security/permissions.py +1 -1
  135. dstack/_internal/server/services/backends/__init__.py +85 -158
  136. dstack/_internal/server/services/config.py +53 -567
  137. dstack/_internal/server/services/fleets.py +9 -103
  138. dstack/_internal/server/services/gateways/__init__.py +13 -4
  139. dstack/_internal/server/services/{pools.py → instances.py} +22 -329
  140. dstack/_internal/server/services/jobs/__init__.py +9 -6
  141. dstack/_internal/server/services/jobs/configurators/base.py +25 -1
  142. dstack/_internal/server/services/jobs/configurators/dev.py +9 -1
  143. dstack/_internal/server/services/jobs/configurators/extensions/cursor.py +42 -0
  144. dstack/_internal/server/services/metrics.py +131 -72
  145. dstack/_internal/server/services/offers.py +1 -1
  146. dstack/_internal/server/services/projects.py +23 -14
  147. dstack/_internal/server/services/prometheus.py +245 -0
  148. dstack/_internal/server/services/runner/client.py +14 -3
  149. dstack/_internal/server/services/runs.py +67 -31
  150. dstack/_internal/server/services/volumes.py +9 -4
  151. dstack/_internal/server/settings.py +3 -0
  152. dstack/_internal/server/statics/index.html +1 -1
  153. dstack/_internal/server/statics/{main-fe8fd9db55df8d10e648.js → main-4fd5a4770eff59325ee3.js} +68 -15
  154. dstack/_internal/server/statics/{main-fe8fd9db55df8d10e648.js.map → main-4fd5a4770eff59325ee3.js.map} +1 -1
  155. dstack/_internal/server/statics/{main-7510e71dfa9749a4e70e.css → main-da9f8c06a69c20dac23e.css} +1 -1
  156. dstack/_internal/server/statics/static/media/entraID.d65d1f3e9486a8e56d24fc07b3230885.svg +9 -0
  157. dstack/_internal/server/testing/common.py +75 -32
  158. dstack/_internal/utils/json_schema.py +6 -0
  159. dstack/_internal/utils/ssh.py +2 -1
  160. dstack/api/__init__.py +4 -0
  161. dstack/api/_public/__init__.py +16 -20
  162. dstack/api/_public/backends.py +1 -1
  163. dstack/api/_public/repos.py +36 -36
  164. dstack/api/_public/runs.py +170 -83
  165. dstack/api/server/__init__.py +11 -13
  166. dstack/api/server/_backends.py +12 -16
  167. dstack/api/server/_fleets.py +15 -55
  168. dstack/api/server/_gateways.py +3 -14
  169. dstack/api/server/_repos.py +1 -4
  170. dstack/api/server/_runs.py +21 -96
  171. dstack/api/server/_volumes.py +10 -5
  172. dstack/api/utils.py +3 -0
  173. dstack/version.py +1 -1
  174. {dstack-0.18.43.dist-info → dstack-0.19.0rc1.dist-info}/METADATA +10 -1
  175. {dstack-0.18.43.dist-info → dstack-0.19.0rc1.dist-info}/RECORD +229 -206
  176. tests/_internal/cli/services/configurators/test_profile.py +6 -6
  177. tests/_internal/core/backends/aws/test_configurator.py +35 -0
  178. tests/_internal/core/backends/aws/test_resources.py +1 -1
  179. tests/_internal/core/backends/azure/test_configurator.py +61 -0
  180. tests/_internal/core/backends/cudo/__init__.py +0 -0
  181. tests/_internal/core/backends/cudo/test_configurator.py +37 -0
  182. tests/_internal/core/backends/datacrunch/__init__.py +0 -0
  183. tests/_internal/core/backends/datacrunch/test_configurator.py +17 -0
  184. tests/_internal/core/backends/gcp/test_configurator.py +42 -0
  185. tests/_internal/core/backends/kubernetes/test_configurator.py +43 -0
  186. tests/_internal/core/backends/lambdalabs/__init__.py +0 -0
  187. tests/_internal/core/backends/lambdalabs/test_configurator.py +38 -0
  188. tests/_internal/core/backends/oci/test_configurator.py +55 -0
  189. tests/_internal/core/backends/runpod/__init__.py +0 -0
  190. tests/_internal/core/backends/runpod/test_configurator.py +33 -0
  191. tests/_internal/core/backends/tensordock/__init__.py +0 -0
  192. tests/_internal/core/backends/tensordock/test_configurator.py +38 -0
  193. tests/_internal/core/backends/vastai/__init__.py +0 -0
  194. tests/_internal/core/backends/vastai/test_configurator.py +33 -0
  195. tests/_internal/core/backends/vultr/__init__.py +0 -0
  196. tests/_internal/core/backends/vultr/test_configurator.py +33 -0
  197. tests/_internal/server/background/tasks/test_process_gateways.py +4 -0
  198. tests/_internal/server/background/tasks/test_process_instances.py +49 -48
  199. tests/_internal/server/background/tasks/test_process_metrics.py +0 -3
  200. tests/_internal/server/background/tasks/test_process_placement_groups.py +2 -0
  201. tests/_internal/server/background/tasks/test_process_prometheus_metrics.py +186 -0
  202. tests/_internal/server/background/tasks/test_process_running_jobs.py +123 -19
  203. tests/_internal/server/background/tasks/test_process_runs.py +8 -22
  204. tests/_internal/server/background/tasks/test_process_submitted_jobs.py +3 -40
  205. tests/_internal/server/background/tasks/test_process_submitted_volumes.py +2 -0
  206. tests/_internal/server/background/tasks/test_process_terminating_jobs.py +10 -15
  207. tests/_internal/server/routers/test_backends.py +6 -764
  208. tests/_internal/server/routers/test_fleets.py +2 -26
  209. tests/_internal/server/routers/test_gateways.py +27 -3
  210. tests/_internal/server/routers/test_instances.py +0 -10
  211. tests/_internal/server/routers/test_metrics.py +42 -0
  212. tests/_internal/server/routers/test_projects.py +56 -0
  213. tests/_internal/server/routers/test_prometheus.py +333 -0
  214. tests/_internal/server/routers/test_repos.py +0 -15
  215. tests/_internal/server/routers/test_runs.py +83 -275
  216. tests/_internal/server/routers/test_volumes.py +2 -3
  217. tests/_internal/server/services/backends/__init__.py +0 -0
  218. tests/_internal/server/services/jobs/configurators/test_task.py +35 -0
  219. tests/_internal/server/services/test_config.py +7 -4
  220. tests/_internal/server/services/test_fleets.py +1 -4
  221. tests/_internal/server/services/{test_pools.py → test_instances.py} +11 -49
  222. tests/_internal/server/services/test_metrics.py +167 -0
  223. tests/_internal/server/services/test_repos.py +1 -14
  224. tests/_internal/server/services/test_runs.py +0 -4
  225. dstack/_internal/cli/commands/pool.py +0 -581
  226. dstack/_internal/cli/commands/run.py +0 -75
  227. dstack/_internal/core/backends/aws/config.py +0 -18
  228. dstack/_internal/core/backends/azure/config.py +0 -12
  229. dstack/_internal/core/backends/base/config.py +0 -5
  230. dstack/_internal/core/backends/cudo/config.py +0 -9
  231. dstack/_internal/core/backends/datacrunch/config.py +0 -9
  232. dstack/_internal/core/backends/gcp/config.py +0 -22
  233. dstack/_internal/core/backends/kubernetes/config.py +0 -6
  234. dstack/_internal/core/backends/lambdalabs/config.py +0 -9
  235. dstack/_internal/core/backends/nebius/__init__.py +0 -15
  236. dstack/_internal/core/backends/nebius/api_client.py +0 -319
  237. dstack/_internal/core/backends/nebius/compute.py +0 -220
  238. dstack/_internal/core/backends/nebius/config.py +0 -6
  239. dstack/_internal/core/backends/nebius/types.py +0 -37
  240. dstack/_internal/core/backends/oci/config.py +0 -6
  241. dstack/_internal/core/backends/runpod/config.py +0 -9
  242. dstack/_internal/core/backends/tensordock/config.py +0 -9
  243. dstack/_internal/core/backends/vastai/config.py +0 -6
  244. dstack/_internal/core/backends/vultr/config.py +0 -9
  245. dstack/_internal/core/models/backends/aws.py +0 -86
  246. dstack/_internal/core/models/backends/azure.py +0 -68
  247. dstack/_internal/core/models/backends/cudo.py +0 -43
  248. dstack/_internal/core/models/backends/datacrunch.py +0 -44
  249. dstack/_internal/core/models/backends/gcp.py +0 -67
  250. dstack/_internal/core/models/backends/kubernetes.py +0 -40
  251. dstack/_internal/core/models/backends/lambdalabs.py +0 -43
  252. dstack/_internal/core/models/backends/nebius.py +0 -54
  253. dstack/_internal/core/models/backends/runpod.py +0 -40
  254. dstack/_internal/core/models/backends/tensordock.py +0 -44
  255. dstack/_internal/core/models/backends/vastai.py +0 -43
  256. dstack/_internal/core/models/backends/vultr.py +0 -40
  257. dstack/_internal/core/models/pools.py +0 -43
  258. dstack/_internal/server/routers/pools.py +0 -142
  259. dstack/_internal/server/schemas/pools.py +0 -38
  260. dstack/_internal/server/services/backends/configurators/base.py +0 -72
  261. dstack/_internal/server/services/backends/configurators/cudo.py +0 -87
  262. dstack/_internal/server/services/backends/configurators/datacrunch.py +0 -79
  263. dstack/_internal/server/services/backends/configurators/kubernetes.py +0 -63
  264. dstack/_internal/server/services/backends/configurators/lambdalabs.py +0 -98
  265. dstack/_internal/server/services/backends/configurators/nebius.py +0 -85
  266. dstack/_internal/server/services/backends/configurators/runpod.py +0 -97
  267. dstack/_internal/server/services/backends/configurators/tensordock.py +0 -82
  268. dstack/_internal/server/services/backends/configurators/vastai.py +0 -80
  269. dstack/_internal/server/services/backends/configurators/vultr.py +0 -80
  270. dstack/api/_public/pools.py +0 -41
  271. dstack/api/_public/resources.py +0 -105
  272. dstack/api/server/_pools.py +0 -63
  273. tests/_internal/server/routers/test_pools.py +0 -612
  274. /dstack/_internal/{server/services/backends/configurators → core/backends/dstack}/__init__.py +0 -0
  275. {dstack-0.18.43.dist-info → dstack-0.19.0rc1.dist-info}/LICENSE.md +0 -0
  276. {dstack-0.18.43.dist-info → dstack-0.19.0rc1.dist-info}/WHEEL +0 -0
  277. {dstack-0.18.43.dist-info → dstack-0.19.0rc1.dist-info}/entry_points.txt +0 -0
  278. {dstack-0.18.43.dist-info → dstack-0.19.0rc1.dist-info}/top_level.txt +0 -0
@@ -25,6 +25,7 @@ from dstack._internal.core.models.gateways import (
25
25
  from dstack._internal.core.models.instances import (
26
26
  InstanceConfiguration,
27
27
  InstanceOfferWithAvailability,
28
+ SSHKey,
28
29
  )
29
30
  from dstack._internal.core.models.placement import PlacementGroup, PlacementGroupProvisioningData
30
31
  from dstack._internal.core.models.runs import Job, JobProvisioningData, Requirements, Run
@@ -46,6 +47,11 @@ DSTACK_RUNNER_BINARY_PATH = f"/usr/local/bin/{DSTACK_RUNNER_BINARY_NAME}"
46
47
 
47
48
 
48
49
  class Compute(ABC):
50
+ """
51
+ A base class for all compute implementations with minimal features.
52
+ If a compute supports additional features, it must also subclass `ComputeWith*` classes.
53
+ """
54
+
49
55
  def __init__(self):
50
56
  self._offers_cache_lock = threading.Lock()
51
57
  self._offers_cache = TTLCache(maxsize=5, ttl=30)
@@ -54,6 +60,11 @@ class Compute(ABC):
54
60
  def get_offers(
55
61
  self, requirements: Optional[Requirements] = None
56
62
  ) -> List[InstanceOfferWithAvailability]:
63
+ """
64
+ Returns offers with availability matching `requirements`.
65
+ If the provider is added to gpuhunt, typically gets offers using `base.offers.get_catalog_offers()`
66
+ and extends them with availability info.
67
+ """
57
68
  pass
58
69
 
59
70
  @abstractmethod
@@ -86,6 +97,47 @@ class Compute(ABC):
86
97
  """
87
98
  pass
88
99
 
100
+ def update_provisioning_data(
101
+ self,
102
+ provisioning_data: JobProvisioningData,
103
+ project_ssh_public_key: str,
104
+ project_ssh_private_key: str,
105
+ ):
106
+ """
107
+ This method is called if `JobProvisioningData` returned from `run_job()`/`create_instance()`
108
+ is not complete, e.g. missing `hostname` or `ssh_port`.
109
+ It can be used if getting complete provisioning data takes a long time.
110
+ It should not wait but return immediately.
111
+ If it raises `ProvisioningError`, there will be no further attempts to update the provisioning data,
112
+ and the run will be terminated.
113
+ """
114
+ pass
115
+
116
+ def _get_offers_cached_key(self, requirements: Optional[Requirements] = None) -> int:
117
+ # Requirements is not hashable, so we use a hack to get arguments hash
118
+ if requirements is None:
119
+ return hash(None)
120
+ return hash(requirements.json())
121
+
122
+ @cachedmethod(
123
+ cache=lambda self: self._offers_cache,
124
+ key=_get_offers_cached_key,
125
+ lock=lambda self: self._offers_cache_lock,
126
+ )
127
+ def get_offers_cached(
128
+ self, requirements: Optional[Requirements] = None
129
+ ) -> List[InstanceOfferWithAvailability]:
130
+ return self.get_offers(requirements)
131
+
132
+
133
+ class ComputeWithCreateInstanceSupport(ABC):
134
+ """
135
+ Must be subclassed and implemented to support fleets (instance creation without running a job).
136
+ Typically, a compute that runs VMs would implement it,
137
+ and a compute that runs containers would not.
138
+ """
139
+
140
+ @abstractmethod
89
141
  def create_instance(
90
142
  self,
91
143
  instance_offer: InstanceOfferWithAvailability,
@@ -96,24 +148,77 @@ class Compute(ABC):
96
148
  If required to wait to get the IP address or SSH port, return partially filled `JobProvisioningData`
97
149
  and implement `update_provisioning_data()`.
98
150
  """
99
- raise NotImplementedError()
151
+ pass
100
152
 
101
- def update_provisioning_data(
153
+ def run_job(
102
154
  self,
103
- provisioning_data: JobProvisioningData,
155
+ run: Run,
156
+ job: Job,
157
+ instance_offer: InstanceOfferWithAvailability,
104
158
  project_ssh_public_key: str,
105
159
  project_ssh_private_key: str,
106
- ):
160
+ volumes: List[Volume],
161
+ ) -> JobProvisioningData:
107
162
  """
108
- This method is called if `JobProvisioningData` returned from `run_job()`/`create_instance()`
109
- is not complete, e.g. missing `hostname` or `ssh_port`.
110
- It can be used if getting complete provisioning data takes a long of time.
111
- It should not wait but return immediately.
112
- If it raises `ProvisioningError`, there will be no further attempts to update the provisioning data,
113
- and the run will be terminated.
163
+ The default `run_job()` implementation for all backends that support `create_instance()`.
164
+ Override only if custom `run_job()` behavior is required.
114
165
  """
115
- pass
166
+ instance_config = InstanceConfiguration(
167
+ project_name=run.project_name,
168
+ instance_name=get_job_instance_name(run, job),
169
+ user=run.user,
170
+ ssh_keys=[SSHKey(public=project_ssh_public_key.strip())],
171
+ volumes=volumes,
172
+ reservation=run.run_spec.configuration.reservation,
173
+ )
174
+ instance_offer = instance_offer.copy()
175
+ self._restrict_instance_offer_az_to_volumes_az(instance_offer, volumes)
176
+ return self.create_instance(instance_offer, instance_config)
177
+
178
+ def _restrict_instance_offer_az_to_volumes_az(
179
+ self,
180
+ instance_offer: InstanceOfferWithAvailability,
181
+ volumes: List[Volume],
182
+ ):
183
+ if len(volumes) == 0:
184
+ return
185
+ volume = volumes[0]
186
+ if (
187
+ volume.provisioning_data is not None
188
+ and volume.provisioning_data.availability_zone is not None
189
+ ):
190
+ if instance_offer.availability_zones is None:
191
+ instance_offer.availability_zones = [volume.provisioning_data.availability_zone]
192
+ instance_offer.availability_zones = [
193
+ z
194
+ for z in instance_offer.availability_zones
195
+ if z == volume.provisioning_data.availability_zone
196
+ ]
197
+
198
+
199
+ class ComputeWithMultinodeSupport:
200
+ """
201
+ Must be subclassed to support multinode tasks and cluster fleets.
202
+ Instances provisioned in the same project/region must be interconnected.
203
+ """
204
+
205
+ pass
206
+
207
+
208
+ class ComputeWithReservationSupport:
209
+ """
210
+ Must be subclassed to support provisioning from reservations.
211
+ """
212
+
213
+ pass
214
+
116
215
 
216
+ class ComputeWithPlacementGroupSupport(ABC):
217
+ """
218
+ Must be subclassed and implemented to support placement groups.
219
+ """
220
+
221
+ @abstractmethod
117
222
  def create_placement_group(
118
223
  self,
119
224
  placement_group: PlacementGroup,
@@ -121,8 +226,9 @@ class Compute(ABC):
121
226
  """
122
227
  Creates a placement group.
123
228
  """
124
- raise NotImplementedError()
229
+ pass
125
230
 
231
+ @abstractmethod
126
232
  def delete_placement_group(
127
233
  self,
128
234
  placement_group: PlacementGroup,
@@ -131,8 +237,15 @@ class Compute(ABC):
131
237
  Deletes a placement group.
132
238
  If the group does not exist, it should not raise errors but return silently.
133
239
  """
134
- raise NotImplementedError()
240
+ pass
241
+
135
242
 
243
+ class ComputeWithGatewaySupport(ABC):
244
+ """
245
+ Must be subclassed and implemented to support gateways.
246
+ """
247
+
248
+ @abstractmethod
136
249
  def create_gateway(
137
250
  self,
138
251
  configuration: GatewayComputeConfiguration,
@@ -140,8 +253,9 @@ class Compute(ABC):
140
253
  """
141
254
  Creates a gateway instance.
142
255
  """
143
- raise NotImplementedError()
256
+ pass
144
257
 
258
+ @abstractmethod
145
259
  def terminate_gateway(
146
260
  self,
147
261
  instance_id: str,
@@ -152,21 +266,39 @@ class Compute(ABC):
152
266
  Terminates a gateway instance. Generally, it passes the call to `terminate_instance()`,
153
267
  but may perform additional work such as deleting a load balancer when a gateway has one.
154
268
  """
155
- raise NotImplementedError()
269
+ pass
156
270
 
271
+
272
+ class ComputeWithPrivateGatewaySupport:
273
+ """
274
+ Must be subclassed to support private gateways.
275
+ `create_gateway()` must be able to create private gateways.
276
+ """
277
+
278
+ pass
279
+
280
+
281
+ class ComputeWithVolumeSupport(ABC):
282
+ """
283
+ Must be subclassed and implemented to support volumes.
284
+ """
285
+
286
+ @abstractmethod
157
287
  def register_volume(self, volume: Volume) -> VolumeProvisioningData:
158
288
  """
159
289
  Returns VolumeProvisioningData for an existing volume.
160
290
  Used to add external volumes to dstack.
161
291
  """
162
- raise NotImplementedError()
292
+ pass
163
293
 
294
+ @abstractmethod
164
295
  def create_volume(self, volume: Volume) -> VolumeProvisioningData:
165
296
  """
166
297
  Creates a new volume.
167
298
  """
168
299
  raise NotImplementedError()
169
300
 
301
+ @abstractmethod
170
302
  def delete_volume(self, volume: Volume):
171
303
  """
172
304
  Deletes a volume.
@@ -176,13 +308,17 @@ class Compute(ABC):
176
308
  def attach_volume(self, volume: Volume, instance_id: str) -> VolumeAttachmentData:
177
309
  """
178
310
  Attaches a volume to the instance.
179
- If the volume is not found, it should raise `ComputeError()` instead of a thrid-party exception.
311
+ If the volume is not found, it should raise `ComputeError()`.
312
+ Implement only if compute may return `VolumeProvisioningData.attachable`.
313
+ Otherwise, volumes should be attached by `run_job()`.
180
314
  """
181
315
  raise NotImplementedError()
182
316
 
183
317
  def detach_volume(self, volume: Volume, instance_id: str, force: bool = False):
184
318
  """
185
319
  Detaches a volume from the instance.
320
+ Implement only if compute may return `VolumeProvisioningData.detachable`.
321
+ Otherwise, volumes should be detached on instance termination.
186
322
  """
187
323
  raise NotImplementedError()
188
324
 
@@ -195,22 +331,6 @@ class Compute(ABC):
195
331
  """
196
332
  return True
197
333
 
198
- def _get_offers_cached_key(self, requirements: Optional[Requirements] = None) -> int:
199
- # Requirements is not hashable, so we use a hack to get arguments hash
200
- if requirements is None:
201
- return hash(None)
202
- return hash(requirements.json())
203
-
204
- @cachedmethod(
205
- cache=lambda self: self._offers_cache,
206
- key=_get_offers_cached_key,
207
- lock=lambda self: self._offers_cache_lock,
208
- )
209
- def get_offers_cached(
210
- self, requirements: Optional[Requirements] = None
211
- ) -> List[InstanceOfferWithAvailability]:
212
- return self.get_offers(requirements)
213
-
214
334
 
215
335
  def get_job_instance_name(run: Run, job: Job) -> str:
216
336
  return job.job_spec.job_name
@@ -0,0 +1,105 @@
1
+ from abc import ABC, abstractmethod
2
+ from typing import Any, ClassVar, List, Optional
3
+ from uuid import UUID
4
+
5
+ from dstack._internal.core.backends.base.backend import Backend
6
+ from dstack._internal.core.backends.models import (
7
+ AnyBackendConfig,
8
+ AnyBackendConfigWithCreds,
9
+ )
10
+ from dstack._internal.core.errors import BackendInvalidCredentialsError
11
+ from dstack._internal.core.models.backends.base import BackendType
12
+ from dstack._internal.core.models.common import CoreModel
13
+
14
+ # Most clouds allow ~ 40-60 tags/labels per resource.
15
+ # We'll introduce our own base limit that can be customized per backend if required.
16
+ TAGS_MAX_NUM = 25
17
+
18
+
19
+ class BackendRecord(CoreModel):
20
+ """
21
+ This model includes backend parameters to store in the DB.
22
+ """
23
+
24
+ # `config` stores text-encoded non-sensitive backend config parameters (e.g. json)
25
+ config: str
26
+ # `auth` stores text-encoded sensitive backend config parameters (e.g. json).
27
+ # Configurator should not encrypt/decrypt it. This is done by the caller.
28
+ auth: str
29
+
30
+
31
+ class StoredBackendRecord(BackendRecord):
32
+ """
33
+ This model includes backend parameters stored in the DB.
34
+ """
35
+
36
+ # IDs of DB models.
37
+ # Can be used by externally-registered Configurator to work with the DB directly.
38
+ project_id: UUID
39
+ backend_id: UUID
40
+
41
+
42
+ class Configurator(ABC):
43
+ """
44
+ `Configurator` is responsible for configuring backends
45
+ and initializing `Backend` instances from backend configs.
46
+ Every backend must implement `Configurator` and register it
47
+ in `dstack._internal.core.backends.configurators`.
48
+ """
49
+
50
+ TYPE: ClassVar[BackendType]
51
+ # `BACKEND_CLASS` is used to introspect backend features without initializing it.
52
+ BACKEND_CLASS: ClassVar[type[Backend]]
53
+
54
+ @abstractmethod
55
+ def validate_config(self, config: AnyBackendConfigWithCreds, default_creds_enabled: bool):
56
+ """
57
+ Validates backend config including backend creds and other parameters.
58
+ Raises `ServerClientError` or its subclass if config is invalid.
59
+ If the backend supports default creds and not `default_creds_enabled`, should raise an error.
60
+ """
61
+ pass
62
+
63
+ @abstractmethod
64
+ def create_backend(
65
+ self, project_name: str, config: AnyBackendConfigWithCreds
66
+ ) -> BackendRecord:
67
+ """
68
+ Sets up backend given backend config and returns
69
+ text-encoded config and creds to be stored in the DB.
70
+ It may perform backend initialization, create
71
+ cloud resources such as networks and managed identities, and
72
+ save additional configuration parameters.
73
+ It does not need to duplicate validation done by `validate_config()`
74
+ since the caller guarantees to call `validate_config()` first.
75
+ It may perform additional validation not possible in `validate_config()`
76
+ and raise `ServerClientError` or its subclass if config is invalid.
77
+ """
78
+ pass
79
+
80
+ @abstractmethod
81
+ def get_backend_config(
82
+ self, record: StoredBackendRecord, include_creds: bool
83
+ ) -> AnyBackendConfig:
84
+ """
85
+ Constructs `BackendConfig` to be returned in API responses.
86
+ Project admins may need to see backend's creds. In this case `include_creds` will be `True`.
87
+ Otherwise, no sensitive information should be included.
88
+ """
89
+ pass
90
+
91
+ @abstractmethod
92
+ def get_backend(self, record: StoredBackendRecord) -> Backend:
93
+ """
94
+ Returns `Backend` instance from config and creds stored in `record`.
95
+ """
96
+ pass
97
+
98
+
99
+ def raise_invalid_credentials_error(
100
+ fields: Optional[List[List[str]]] = None, details: Optional[Any] = None
101
+ ):
102
+ msg = BackendInvalidCredentialsError.msg
103
+ if details:
104
+ msg += f": {details}"
105
+ raise BackendInvalidCredentialsError(fields=fields, msg=msg)
@@ -0,0 +1,14 @@
1
+ from pathlib import Path
2
+
3
+
4
def fill_data(values: dict):
    """
    Make sure `values["data"]` is populated, loading it from a file if needed.

    If `values` already has a non-None "data" entry, it is returned as is.
    Otherwise the file named by `values["filename"]` (with `~` expanded) is
    read as text and its contents are stored under "data". The dict is
    modified in place and also returned.

    Raises:
        ValueError: if "data" is absent/None and "filename" is absent/None,
            or if the file cannot be opened or read. In the latter case the
            underlying `OSError` is attached as `__cause__`.
    """
    if values.get("data") is not None:
        return values
    filename = values.get("filename")
    if filename is None:
        # Covers both a missing key and an explicit None, so the caller gets a
        # ValueError with a message instead of a bare ValueError or a TypeError.
        raise ValueError("Either `data` or `filename` must be specified")
    try:
        with open(Path(filename).expanduser()) as f:
            values["data"] = f.read()
    except OSError as e:
        # Chain explicitly so the real reason (missing file, permissions, ...) is kept.
        raise ValueError(f"No such file {filename}") from e
    return values
@@ -0,0 +1,138 @@
1
+ from typing import List, Optional, Type, Union
2
+
3
+ from dstack._internal.core.backends.base.configurator import Configurator
4
+ from dstack._internal.core.models.backends.base import BackendType
5
+
6
# Configurator classes whose modules could be imported in this environment.
# Each backend is attempted independently: if importing a backend's
# configurator module raises ImportError, that backend is simply left out.
_CONFIGURATOR_CLASSES: List[Type[Configurator]] = []


try:
    from dstack._internal.core.backends.aws.configurator import AWSConfigurator
except ImportError:
    pass
else:
    _CONFIGURATOR_CLASSES.append(AWSConfigurator)

try:
    from dstack._internal.core.backends.azure.configurator import AzureConfigurator
except ImportError:
    pass
else:
    _CONFIGURATOR_CLASSES.append(AzureConfigurator)

try:
    from dstack._internal.core.backends.cudo.configurator import (
        CudoConfigurator,
    )
except ImportError:
    pass
else:
    _CONFIGURATOR_CLASSES.append(CudoConfigurator)

try:
    from dstack._internal.core.backends.datacrunch.configurator import (
        DataCrunchConfigurator,
    )
except ImportError:
    pass
else:
    _CONFIGURATOR_CLASSES.append(DataCrunchConfigurator)

try:
    from dstack._internal.core.backends.gcp.configurator import GCPConfigurator
except ImportError:
    pass
else:
    _CONFIGURATOR_CLASSES.append(GCPConfigurator)

try:
    from dstack._internal.core.backends.kubernetes.configurator import (
        KubernetesConfigurator,
    )
except ImportError:
    pass
else:
    _CONFIGURATOR_CLASSES.append(KubernetesConfigurator)

try:
    from dstack._internal.core.backends.lambdalabs.configurator import (
        LambdaConfigurator,
    )
except ImportError:
    pass
else:
    _CONFIGURATOR_CLASSES.append(LambdaConfigurator)

try:
    from dstack._internal.core.backends.oci.configurator import OCIConfigurator
except ImportError:
    pass
else:
    _CONFIGURATOR_CLASSES.append(OCIConfigurator)

try:
    from dstack._internal.core.backends.runpod.configurator import RunpodConfigurator
except ImportError:
    pass
else:
    _CONFIGURATOR_CLASSES.append(RunpodConfigurator)

try:
    from dstack._internal.core.backends.tensordock.configurator import (
        TensorDockConfigurator,
    )
except ImportError:
    pass
else:
    _CONFIGURATOR_CLASSES.append(TensorDockConfigurator)

try:
    from dstack._internal.core.backends.vastai.configurator import VastAIConfigurator
except ImportError:
    pass
else:
    _CONFIGURATOR_CLASSES.append(VastAIConfigurator)

try:
    from dstack._internal.core.backends.vultr.configurator import VultrConfigurator
except ImportError:
    pass
else:
    _CONFIGURATOR_CLASSES.append(VultrConfigurator)


# Lookup structures derived once, at import time, from the classes collected above.
_BACKEND_TYPE_TO_CONFIGURATOR_CLASS_MAP = {cls.TYPE: cls for cls in _CONFIGURATOR_CLASSES}
_BACKEND_TYPES = [cls.TYPE for cls in _CONFIGURATOR_CLASSES]
106
+
107
+
108
def get_configurator(backend_type: Union[BackendType, str]) -> Optional[Configurator]:
    """
    Look up and instantiate the `Configurator` available for `backend_type`.

    `backend_type` is first converted with `BackendType(...)`. Returns `None`
    when no configurator class is mapped to the resulting type; otherwise a
    new instance of the mapped class is returned on every call.
    """
    cls = _BACKEND_TYPE_TO_CONFIGURATOR_CLASS_MAP.get(BackendType(backend_type))
    return cls() if cls is not None else None
117
+
118
+
119
def list_available_backend_types() -> List[BackendType]:
    """
    Return the backend types available on the server.

    The result is the module-level list built at import time (the same
    object on every call, not a copy).
    """
    return _BACKEND_TYPES
124
+
125
+
126
def list_available_configurator_classes() -> List[Type[Configurator]]:
    """
    Lists all backend configurator classes available on the server.

    The result is the module-level list built at import time (the same
    object on every call, not a copy).
    """
    # `typing.Type` is used, matching the other annotations in this module;
    # the builtin `type[...]` form is evaluated at definition time and is not
    # subscriptable on interpreters that predate PEP 585.
    return _CONFIGURATOR_CLASSES
131
+
132
+
133
def register_configurator(configurator: Type[Configurator]):
    """
    A hook for registering new configurators without importing them.
    Can be used to extend dstack functionality.

    The class is stored in the type-to-configurator map under
    `configurator.TYPE`, replacing any class already stored for that type.
    Only that map is updated here.
    """
    backend_type = configurator.TYPE
    _BACKEND_TYPE_TO_CONFIGURATOR_CLASS_MAP[backend_type] = configurator
@@ -1,15 +0,0 @@
1
- from dstack._internal.core.backends.base import Backend
2
- from dstack._internal.core.backends.cudo.compute import CudoCompute
3
- from dstack._internal.core.backends.cudo.config import CudoConfig
4
- from dstack._internal.core.models.backends.base import BackendType
5
-
6
-
7
class CudoBackend(Backend):
    """Cudo backend that owns a single `CudoCompute` built from its config."""

    TYPE: BackendType = BackendType.CUDO

    def __init__(self, config: CudoConfig):
        # The compute object is created once here and handed out by `compute()`.
        self.config = config
        self._compute = CudoCompute(config)

    def compute(self) -> CudoCompute:
        """Return the `CudoCompute` created in `__init__`."""
        return self._compute
@@ -0,0 +1,16 @@
1
+ from dstack._internal.core.backends.base.backend import Backend
2
+ from dstack._internal.core.backends.cudo.compute import CudoCompute
3
+ from dstack._internal.core.backends.cudo.models import CudoConfig
4
+ from dstack._internal.core.models.backends.base import BackendType
5
+
6
+
7
class CudoBackend(Backend):
    """Cudo backend that owns a single `CudoCompute` built from its config."""

    TYPE = BackendType.CUDO
    COMPUTE_CLASS = CudoCompute

    def __init__(self, config: CudoConfig):
        # The compute object is created once here and handed out by `compute()`.
        self.config = config
        self._compute = CudoCompute(config)

    def compute(self) -> CudoCompute:
        """Return the `CudoCompute` created in `__init__`."""
        return self._compute
@@ -2,25 +2,23 @@ from typing import List, Optional
2
2
 
3
3
  import requests
4
4
 
5
- from dstack._internal.core.backends.base import Compute
5
+ from dstack._internal.core.backends.base.backend import Compute
6
6
  from dstack._internal.core.backends.base.compute import (
7
+ ComputeWithCreateInstanceSupport,
7
8
  generate_unique_instance_name,
8
- get_job_instance_name,
9
9
  get_shim_commands,
10
10
  )
11
11
  from dstack._internal.core.backends.base.offers import get_catalog_offers
12
12
  from dstack._internal.core.backends.cudo.api_client import CudoApiClient
13
- from dstack._internal.core.backends.cudo.config import CudoConfig
13
+ from dstack._internal.core.backends.cudo.models import CudoConfig
14
14
  from dstack._internal.core.errors import BackendError, NoCapacityError, ProvisioningError
15
15
  from dstack._internal.core.models.backends.base import BackendType
16
16
  from dstack._internal.core.models.instances import (
17
17
  InstanceAvailability,
18
18
  InstanceConfiguration,
19
19
  InstanceOfferWithAvailability,
20
- SSHKey,
21
20
  )
22
- from dstack._internal.core.models.runs import Job, JobProvisioningData, Requirements, Run
23
- from dstack._internal.core.models.volumes import Volume
21
+ from dstack._internal.core.models.runs import JobProvisioningData, Requirements
24
22
  from dstack._internal.utils.logging import get_logger
25
23
 
26
24
  logger = get_logger(__name__)
@@ -29,7 +27,10 @@ logger = get_logger(__name__)
29
27
  MAX_RESOURCE_NAME_LEN = 30
30
28
 
31
29
 
32
- class CudoCompute(Compute):
30
+ class CudoCompute(
31
+ ComputeWithCreateInstanceSupport,
32
+ Compute,
33
+ ):
33
34
  def __init__(self, config: CudoConfig):
34
35
  super().__init__()
35
36
  self.config = config
@@ -51,25 +52,6 @@ class CudoCompute(Compute):
51
52
  ]
52
53
  return offers
53
54
 
54
def run_job(
    self,
    run: Run,
    job: Job,
    instance_offer: InstanceOfferWithAvailability,
    project_ssh_public_key: str,
    project_ssh_private_key: str,
    volumes: List[Volume],
) -> JobProvisioningData:
    """
    Provision an instance for `job` by delegating to `create_instance()`.

    An `InstanceConfiguration` is built from the run's project name and user,
    the instance name from `get_job_instance_name(run, job)`, and the
    whitespace-stripped project public key as the only SSH key.
    `project_ssh_private_key` and `volumes` are not used here.
    """
    project_key = SSHKey(public=project_ssh_public_key.strip())
    instance_config = InstanceConfiguration(
        project_name=run.project_name,
        instance_name=get_job_instance_name(run, job),
        ssh_keys=[project_key],
        user=run.user,
    )
    return self.create_instance(instance_offer, instance_config)
72
-
73
55
  def create_instance(
74
56
  self,
75
57
  instance_offer: InstanceOfferWithAvailability,