dstack 0.19.28__py3-none-any.whl → 0.19.30__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of dstack has been flagged as potentially problematic; review the release details in the registry before depending on it.

Files changed (51)
  1. dstack/_internal/cli/main.py +3 -1
  2. dstack/_internal/cli/services/configurators/fleet.py +20 -6
  3. dstack/_internal/cli/utils/gpu.py +2 -2
  4. dstack/_internal/core/backends/aws/compute.py +62 -41
  5. dstack/_internal/core/backends/aws/resources.py +11 -6
  6. dstack/_internal/core/backends/azure/compute.py +25 -13
  7. dstack/_internal/core/backends/base/compute.py +121 -14
  8. dstack/_internal/core/backends/base/offers.py +34 -4
  9. dstack/_internal/core/backends/cloudrift/compute.py +5 -7
  10. dstack/_internal/core/backends/cudo/compute.py +4 -2
  11. dstack/_internal/core/backends/datacrunch/compute.py +13 -11
  12. dstack/_internal/core/backends/digitalocean_base/compute.py +4 -5
  13. dstack/_internal/core/backends/gcp/compute.py +25 -11
  14. dstack/_internal/core/backends/hotaisle/compute.py +4 -7
  15. dstack/_internal/core/backends/kubernetes/compute.py +6 -4
  16. dstack/_internal/core/backends/lambdalabs/compute.py +4 -5
  17. dstack/_internal/core/backends/local/compute.py +1 -3
  18. dstack/_internal/core/backends/nebius/compute.py +10 -7
  19. dstack/_internal/core/backends/oci/compute.py +15 -8
  20. dstack/_internal/core/backends/oci/resources.py +8 -3
  21. dstack/_internal/core/backends/runpod/compute.py +15 -6
  22. dstack/_internal/core/backends/template/compute.py.jinja +3 -1
  23. dstack/_internal/core/backends/tensordock/compute.py +1 -3
  24. dstack/_internal/core/backends/tensordock/models.py +2 -0
  25. dstack/_internal/core/backends/vastai/compute.py +7 -3
  26. dstack/_internal/core/backends/vultr/compute.py +5 -5
  27. dstack/_internal/core/consts.py +2 -0
  28. dstack/_internal/core/models/projects.py +8 -0
  29. dstack/_internal/core/services/repos.py +101 -10
  30. dstack/_internal/server/background/tasks/process_instances.py +3 -2
  31. dstack/_internal/server/background/tasks/process_running_jobs.py +1 -1
  32. dstack/_internal/server/background/tasks/process_submitted_jobs.py +100 -47
  33. dstack/_internal/server/services/backends/__init__.py +1 -1
  34. dstack/_internal/server/services/projects.py +11 -3
  35. dstack/_internal/server/services/runs.py +2 -0
  36. dstack/_internal/server/statics/index.html +1 -1
  37. dstack/_internal/server/statics/main-56191fbfe77f49b251de.css +3 -0
  38. dstack/_internal/server/statics/{main-a2a16772fbf11a14d191.js → main-c51afa7f243e24d3e446.js} +61081 -49037
  39. dstack/_internal/server/statics/{main-a2a16772fbf11a14d191.js.map → main-c51afa7f243e24d3e446.js.map} +1 -1
  40. dstack/_internal/utils/ssh.py +22 -2
  41. dstack/version.py +2 -2
  42. {dstack-0.19.28.dist-info → dstack-0.19.30.dist-info}/METADATA +8 -6
  43. {dstack-0.19.28.dist-info → dstack-0.19.30.dist-info}/RECORD +46 -50
  44. dstack/_internal/core/backends/tensordock/__init__.py +0 -0
  45. dstack/_internal/core/backends/tensordock/api_client.py +0 -104
  46. dstack/_internal/core/backends/tensordock/backend.py +0 -16
  47. dstack/_internal/core/backends/tensordock/configurator.py +0 -74
  48. dstack/_internal/server/statics/main-5e0d56245c4bd241ec27.css +0 -3
  49. {dstack-0.19.28.dist-info → dstack-0.19.30.dist-info}/WHEEL +0 -0
  50. {dstack-0.19.28.dist-info → dstack-0.19.30.dist-info}/entry_points.txt +0 -0
  51. {dstack-0.19.28.dist-info → dstack-0.19.30.dist-info}/licenses/LICENSE.md +0 -0
@@ -1,7 +1,8 @@
1
1
  from typing import Dict, List, Optional
2
2
 
3
- from dstack._internal.core.backends.base.backend import Compute
4
3
  from dstack._internal.core.backends.base.compute import (
4
+ Compute,
5
+ ComputeWithAllOffersCached,
5
6
  ComputeWithCreateInstanceSupport,
6
7
  get_shim_commands,
7
8
  )
@@ -17,13 +18,14 @@ from dstack._internal.core.models.instances import (
17
18
  InstanceOfferWithAvailability,
18
19
  )
19
20
  from dstack._internal.core.models.placement import PlacementGroup
20
- from dstack._internal.core.models.runs import JobProvisioningData, Requirements
21
+ from dstack._internal.core.models.runs import JobProvisioningData
21
22
  from dstack._internal.utils.logging import get_logger
22
23
 
23
24
  logger = get_logger(__name__)
24
25
 
25
26
 
26
27
  class CloudRiftCompute(
28
+ ComputeWithAllOffersCached,
27
29
  ComputeWithCreateInstanceSupport,
28
30
  Compute,
29
31
  ):
@@ -32,15 +34,11 @@ class CloudRiftCompute(
32
34
  self.config = config
33
35
  self.client = RiftClient(self.config.creds.api_key)
34
36
 
35
- def get_offers(
36
- self, requirements: Optional[Requirements] = None
37
- ) -> List[InstanceOfferWithAvailability]:
37
+ def get_all_offers_with_availability(self) -> List[InstanceOfferWithAvailability]:
38
38
  offers = get_catalog_offers(
39
39
  backend=BackendType.CLOUDRIFT,
40
40
  locations=self.config.regions or None,
41
- requirements=requirements,
42
41
  )
43
-
44
42
  offers_with_availabilities = self._get_offers_with_availability(offers)
45
43
  return offers_with_availabilities
46
44
 
@@ -5,6 +5,7 @@ import requests
5
5
  from dstack._internal.core.backends.base.backend import Compute
6
6
  from dstack._internal.core.backends.base.compute import (
7
7
  ComputeWithCreateInstanceSupport,
8
+ ComputeWithFilteredOffersCached,
8
9
  generate_unique_instance_name,
9
10
  get_shim_commands,
10
11
  )
@@ -29,6 +30,7 @@ MAX_RESOURCE_NAME_LEN = 30
29
30
 
30
31
 
31
32
  class CudoCompute(
33
+ ComputeWithFilteredOffersCached,
32
34
  ComputeWithCreateInstanceSupport,
33
35
  Compute,
34
36
  ):
@@ -37,8 +39,8 @@ class CudoCompute(
37
39
  self.config = config
38
40
  self.api_client = CudoApiClient(config.creds.api_key)
39
41
 
40
- def get_offers(
41
- self, requirements: Optional[Requirements] = None
42
+ def get_offers_by_requirements(
43
+ self, requirements: Requirements
42
44
  ) -> List[InstanceOfferWithAvailability]:
43
45
  offers = get_catalog_offers(
44
46
  backend=BackendType.CUDO,
@@ -1,4 +1,4 @@
1
- from typing import Dict, List, Optional
1
+ from typing import Callable, Dict, List, Optional
2
2
 
3
3
  from datacrunch import DataCrunchClient
4
4
  from datacrunch.exceptions import APIException
@@ -6,11 +6,12 @@ from datacrunch.instances.instances import Instance
6
6
 
7
7
  from dstack._internal.core.backends.base.backend import Compute
8
8
  from dstack._internal.core.backends.base.compute import (
9
+ ComputeWithAllOffersCached,
9
10
  ComputeWithCreateInstanceSupport,
10
11
  generate_unique_instance_name,
11
12
  get_shim_commands,
12
13
  )
13
- from dstack._internal.core.backends.base.offers import get_catalog_offers
14
+ from dstack._internal.core.backends.base.offers import get_catalog_offers, get_offers_disk_modifier
14
15
  from dstack._internal.core.backends.datacrunch.models import DataCrunchConfig
15
16
  from dstack._internal.core.errors import NoCapacityError
16
17
  from dstack._internal.core.models.backends.base import BackendType
@@ -36,6 +37,7 @@ CONFIGURABLE_DISK_SIZE = Range[Memory](min=IMAGE_SIZE, max=None)
36
37
 
37
38
 
38
39
  class DataCrunchCompute(
40
+ ComputeWithAllOffersCached,
39
41
  ComputeWithCreateInstanceSupport,
40
42
  Compute,
41
43
  ):
@@ -47,18 +49,19 @@ class DataCrunchCompute(
47
49
  client_secret=self.config.creds.client_secret,
48
50
  )
49
51
 
50
- def get_offers(
51
- self, requirements: Optional[Requirements] = None
52
- ) -> List[InstanceOfferWithAvailability]:
52
+ def get_all_offers_with_availability(self) -> List[InstanceOfferWithAvailability]:
53
53
  offers = get_catalog_offers(
54
54
  backend=BackendType.DATACRUNCH,
55
55
  locations=self.config.regions,
56
- requirements=requirements,
57
- configurable_disk_size=CONFIGURABLE_DISK_SIZE,
58
56
  )
59
57
  offers_with_availability = self._get_offers_with_availability(offers)
60
58
  return offers_with_availability
61
59
 
60
+ def get_offers_modifier(
61
+ self, requirements: Requirements
62
+ ) -> Callable[[InstanceOfferWithAvailability], Optional[InstanceOfferWithAvailability]]:
63
+ return get_offers_disk_modifier(CONFIGURABLE_DISK_SIZE, requirements)
64
+
62
65
  def _get_offers_with_availability(
63
66
  self, offers: List[InstanceOffer]
64
67
  ) -> List[InstanceOfferWithAvailability]:
@@ -182,10 +185,9 @@ class DataCrunchCompute(
182
185
 
183
186
  def _get_vm_image_id(instance_offer: InstanceOfferWithAvailability) -> str:
184
187
  # https://api.datacrunch.io/v1/images
185
- if (
186
- len(instance_offer.instance.resources.gpus) > 0
187
- and instance_offer.instance.resources.gpus[0].name == "V100"
188
- ):
188
+ if len(instance_offer.instance.resources.gpus) > 0 and instance_offer.instance.resources.gpus[
189
+ 0
190
+ ].name in ["V100", "A6000"]:
189
191
  # Ubuntu 22.04 + CUDA 12.0 + Docker
190
192
  return "2088da25-bb0d-41cc-a191-dccae45d96fd"
191
193
  # Ubuntu 24.04 + CUDA 12.8 Open + Docker
@@ -5,6 +5,7 @@ from gpuhunt.providers.digitalocean import DigitalOceanProvider
5
5
 
6
6
  from dstack._internal.core.backends.base.backend import Compute
7
7
  from dstack._internal.core.backends.base.compute import (
8
+ ComputeWithAllOffersCached,
8
9
  ComputeWithCreateInstanceSupport,
9
10
  generate_unique_instance_name,
10
11
  get_user_data,
@@ -20,7 +21,7 @@ from dstack._internal.core.models.instances import (
20
21
  InstanceOfferWithAvailability,
21
22
  )
22
23
  from dstack._internal.core.models.placement import PlacementGroup
23
- from dstack._internal.core.models.runs import JobProvisioningData, Requirements
24
+ from dstack._internal.core.models.runs import JobProvisioningData
24
25
  from dstack._internal.utils.logging import get_logger
25
26
 
26
27
  logger = get_logger(__name__)
@@ -37,6 +38,7 @@ DOCKER_INSTALL_COMMANDS = [
37
38
 
38
39
 
39
40
  class BaseDigitalOceanCompute(
41
+ ComputeWithAllOffersCached,
40
42
  ComputeWithCreateInstanceSupport,
41
43
  Compute,
42
44
  ):
@@ -50,13 +52,10 @@ class BaseDigitalOceanCompute(
50
52
  DigitalOceanProvider(api_key=config.creds.api_key, api_url=api_url)
51
53
  )
52
54
 
53
- def get_offers(
54
- self, requirements: Optional[Requirements] = None
55
- ) -> List[InstanceOfferWithAvailability]:
55
+ def get_all_offers_with_availability(self) -> List[InstanceOfferWithAvailability]:
56
56
  offers = get_catalog_offers(
57
57
  backend=self.BACKEND_TYPE,
58
58
  locations=self.config.regions,
59
- requirements=requirements,
60
59
  catalog=self.catalog,
61
60
  )
62
61
  return [
@@ -17,6 +17,7 @@ import dstack._internal.core.backends.gcp.resources as gcp_resources
17
17
  from dstack import version
18
18
  from dstack._internal.core.backends.base.compute import (
19
19
  Compute,
20
+ ComputeWithAllOffersCached,
20
21
  ComputeWithCreateInstanceSupport,
21
22
  ComputeWithGatewaySupport,
22
23
  ComputeWithMultinodeSupport,
@@ -30,10 +31,15 @@ from dstack._internal.core.backends.base.compute import (
30
31
  get_shim_commands,
31
32
  get_user_data,
32
33
  merge_tags,
34
+ requires_nvidia_proprietary_kernel_modules,
35
+ )
36
+ from dstack._internal.core.backends.base.offers import (
37
+ get_catalog_offers,
38
+ get_offers_disk_modifier,
33
39
  )
34
- from dstack._internal.core.backends.base.offers import get_catalog_offers
35
40
  from dstack._internal.core.backends.gcp.features import tcpx as tcpx_features
36
41
  from dstack._internal.core.backends.gcp.models import GCPConfig
42
+ from dstack._internal.core.consts import DSTACK_OS_IMAGE_WITH_PROPRIETARY_NVIDIA_KERNEL_MODULES
37
43
  from dstack._internal.core.errors import (
38
44
  ComputeError,
39
45
  ComputeResourceNotFoundError,
@@ -82,6 +88,7 @@ class GCPVolumeDiskBackendData(CoreModel):
82
88
 
83
89
 
84
90
  class GCPCompute(
91
+ ComputeWithAllOffersCached,
85
92
  ComputeWithCreateInstanceSupport,
86
93
  ComputeWithMultinodeSupport,
87
94
  ComputeWithPlacementGroupSupport,
@@ -107,14 +114,10 @@ class GCPCompute(
107
114
  self._extra_subnets_cache_lock = threading.Lock()
108
115
  self._extra_subnets_cache = TTLCache(maxsize=30, ttl=60)
109
116
 
110
- def get_offers(
111
- self, requirements: Optional[Requirements] = None
112
- ) -> List[InstanceOfferWithAvailability]:
117
+ def get_all_offers_with_availability(self) -> List[InstanceOfferWithAvailability]:
113
118
  regions = get_or_error(self.config.regions)
114
119
  offers = get_catalog_offers(
115
120
  backend=BackendType.GCP,
116
- requirements=requirements,
117
- configurable_disk_size=CONFIGURABLE_DISK_SIZE,
118
121
  extra_filter=_supported_instances_and_zones(regions),
119
122
  )
120
123
  quotas: Dict[str, Dict[str, float]] = defaultdict(dict)
@@ -142,9 +145,13 @@ class GCPCompute(
142
145
  offer_keys_to_offers[key] = offer_with_availability
143
146
  offers_with_availability.append(offer_with_availability)
144
147
  offers_with_availability[-1].region = region
145
-
146
148
  return offers_with_availability
147
149
 
150
+ def get_offers_modifier(
151
+ self, requirements: Requirements
152
+ ) -> Callable[[InstanceOfferWithAvailability], Optional[InstanceOfferWithAvailability]]:
153
+ return get_offers_disk_modifier(CONFIGURABLE_DISK_SIZE, requirements)
154
+
148
155
  def terminate_instance(
149
156
  self, instance_id: str, region: str, backend_data: Optional[str] = None
150
157
  ) -> None:
@@ -288,7 +295,11 @@ class GCPCompute(
288
295
 
289
296
  image = _get_image(
290
297
  instance_type_name=instance_offer.instance.name,
291
- cuda=len(instance_offer.instance.resources.gpus) > 0,
298
+ gpu_name=(
299
+ instance_offer.instance.resources.gpus[0].name
300
+ if len(instance_offer.instance.resources.gpus) > 0
301
+ else None
302
+ ),
292
303
  )
293
304
 
294
305
  for zone in zones:
@@ -899,7 +910,7 @@ class GCPImage:
899
910
  is_ufw_installed: bool
900
911
 
901
912
 
902
- def _get_image(instance_type_name: str, cuda: bool) -> GCPImage:
913
+ def _get_image(instance_type_name: str, gpu_name: Optional[str]) -> GCPImage:
903
914
  if instance_type_name == "a3-megagpu-8g":
904
915
  image_name = "dstack-a3mega-5"
905
916
  is_ufw_installed = False
@@ -908,8 +919,11 @@ def _get_image(instance_type_name: str, cuda: bool) -> GCPImage:
908
919
  id="projects/cos-cloud/global/images/cos-105-17412-535-78",
909
920
  is_ufw_installed=False,
910
921
  )
911
- elif cuda:
912
- image_name = f"dstack-cuda-{version.base_image}"
922
+ elif gpu_name is not None:
923
+ if not requires_nvidia_proprietary_kernel_modules(gpu_name):
924
+ image_name = f"dstack-cuda-{version.base_image}"
925
+ else:
926
+ image_name = f"dstack-cuda-{DSTACK_OS_IMAGE_WITH_PROPRIETARY_NVIDIA_KERNEL_MODULES}"
913
927
  is_ufw_installed = True
914
928
  else:
915
929
  image_name = f"dstack-{version.base_image}"
@@ -9,6 +9,7 @@ from gpuhunt.providers.hotaisle import HotAisleProvider
9
9
 
10
10
  from dstack._internal.core.backends.base.compute import (
11
11
  Compute,
12
+ ComputeWithAllOffersCached,
12
13
  ComputeWithCreateInstanceSupport,
13
14
  get_shim_commands,
14
15
  )
@@ -23,7 +24,7 @@ from dstack._internal.core.models.instances import (
23
24
  InstanceOfferWithAvailability,
24
25
  )
25
26
  from dstack._internal.core.models.placement import PlacementGroup
26
- from dstack._internal.core.models.runs import JobProvisioningData, Requirements
27
+ from dstack._internal.core.models.runs import JobProvisioningData
27
28
  from dstack._internal.utils.logging import get_logger
28
29
 
29
30
  logger = get_logger(__name__)
@@ -44,6 +45,7 @@ INSTANCE_TYPE_SPECS = {
44
45
 
45
46
 
46
47
  class HotAisleCompute(
48
+ ComputeWithAllOffersCached,
47
49
  ComputeWithCreateInstanceSupport,
48
50
  Compute,
49
51
  ):
@@ -56,16 +58,12 @@ class HotAisleCompute(
56
58
  HotAisleProvider(api_key=config.creds.api_key, team_handle=config.team_handle)
57
59
  )
58
60
 
59
- def get_offers(
60
- self, requirements: Optional[Requirements] = None
61
- ) -> List[InstanceOfferWithAvailability]:
61
+ def get_all_offers_with_availability(self) -> List[InstanceOfferWithAvailability]:
62
62
  offers = get_catalog_offers(
63
63
  backend=BackendType.HOTAISLE,
64
64
  locations=self.config.regions or None,
65
- requirements=requirements,
66
65
  catalog=self.catalog,
67
66
  )
68
-
69
67
  supported_offers = []
70
68
  for offer in offers:
71
69
  if offer.instance.name in INSTANCE_TYPE_SPECS:
@@ -78,7 +76,6 @@ class HotAisleCompute(
78
76
  logger.warning(
79
77
  f"Skipping unsupported Hot Aisle instance type: {offer.instance.name}"
80
78
  )
81
-
82
79
  return supported_offers
83
80
 
84
81
  def get_payload_from_offer(self, instance_type) -> dict:
@@ -9,13 +9,14 @@ from kubernetes import client
9
9
 
10
10
  from dstack._internal.core.backends.base.compute import (
11
11
  Compute,
12
+ ComputeWithFilteredOffersCached,
12
13
  ComputeWithGatewaySupport,
13
14
  generate_unique_gateway_instance_name,
14
15
  generate_unique_instance_name_for_job,
15
16
  get_docker_commands,
16
17
  get_dstack_gateway_commands,
17
18
  )
18
- from dstack._internal.core.backends.base.offers import match_requirements
19
+ from dstack._internal.core.backends.base.offers import filter_offers_by_requirements
19
20
  from dstack._internal.core.backends.kubernetes.models import (
20
21
  KubernetesConfig,
21
22
  KubernetesNetworkingConfig,
@@ -58,6 +59,7 @@ NVIDIA_GPU_NAMES = NVIDIA_GPU_NAME_TO_GPU_INFO.keys()
58
59
 
59
60
 
60
61
  class KubernetesCompute(
62
+ ComputeWithFilteredOffersCached,
61
63
  ComputeWithGatewaySupport,
62
64
  Compute,
63
65
  ):
@@ -70,8 +72,8 @@ class KubernetesCompute(
70
72
  self.networking_config = networking_config
71
73
  self.api = get_api_from_config_data(config.kubeconfig.data)
72
74
 
73
- def get_offers(
74
- self, requirements: Optional[Requirements] = None
75
+ def get_offers_by_requirements(
76
+ self, requirements: Requirements
75
77
  ) -> List[InstanceOfferWithAvailability]:
76
78
  nodes = self.api.list_node()
77
79
  instance_offers = []
@@ -99,7 +101,7 @@ class KubernetesCompute(
99
101
  availability=InstanceAvailability.AVAILABLE,
100
102
  instance_runtime=InstanceRuntime.RUNNER,
101
103
  )
102
- instance_offers.extend(match_requirements([instance_offer], requirements))
104
+ instance_offers.extend(filter_offers_by_requirements([instance_offer], requirements))
103
105
  return instance_offers
104
106
 
105
107
  def run_job(
@@ -7,6 +7,7 @@ from typing import Dict, List, Optional
7
7
 
8
8
  from dstack._internal.core.backends.base.compute import (
9
9
  Compute,
10
+ ComputeWithAllOffersCached,
10
11
  ComputeWithCreateInstanceSupport,
11
12
  generate_unique_instance_name,
12
13
  get_shim_commands,
@@ -22,12 +23,13 @@ from dstack._internal.core.models.instances import (
22
23
  InstanceOfferWithAvailability,
23
24
  )
24
25
  from dstack._internal.core.models.placement import PlacementGroup
25
- from dstack._internal.core.models.runs import JobProvisioningData, Requirements
26
+ from dstack._internal.core.models.runs import JobProvisioningData
26
27
 
27
28
  MAX_INSTANCE_NAME_LEN = 60
28
29
 
29
30
 
30
31
  class LambdaCompute(
32
+ ComputeWithAllOffersCached,
31
33
  ComputeWithCreateInstanceSupport,
32
34
  Compute,
33
35
  ):
@@ -36,13 +38,10 @@ class LambdaCompute(
36
38
  self.config = config
37
39
  self.api_client = LambdaAPIClient(config.creds.api_key)
38
40
 
39
- def get_offers(
40
- self, requirements: Optional[Requirements] = None
41
- ) -> List[InstanceOfferWithAvailability]:
41
+ def get_all_offers_with_availability(self) -> List[InstanceOfferWithAvailability]:
42
42
  offers = get_catalog_offers(
43
43
  backend=BackendType.LAMBDA,
44
44
  locations=self.config.regions or None,
45
- requirements=requirements,
46
45
  )
47
46
  offers_with_availability = self._get_offers_with_availability(offers)
48
47
  return offers_with_availability
@@ -28,9 +28,7 @@ class LocalCompute(
28
28
  ComputeWithVolumeSupport,
29
29
  Compute,
30
30
  ):
31
- def get_offers(
32
- self, requirements: Optional[Requirements] = None
33
- ) -> List[InstanceOfferWithAvailability]:
31
+ def get_offers(self, requirements: Requirements) -> List[InstanceOfferWithAvailability]:
34
32
  return [
35
33
  InstanceOfferWithAvailability(
36
34
  backend=BackendType.LOCAL,
@@ -3,7 +3,7 @@ import random
3
3
  import shlex
4
4
  import time
5
5
  from functools import cached_property
6
- from typing import List, Optional
6
+ from typing import Callable, List, Optional
7
7
 
8
8
  from nebius.aio.operation import Operation as SDKOperation
9
9
  from nebius.aio.service_error import RequestError, StatusCode
@@ -12,13 +12,14 @@ from nebius.sdk import SDK
12
12
 
13
13
  from dstack._internal.core.backends.base.backend import Compute
14
14
  from dstack._internal.core.backends.base.compute import (
15
+ ComputeWithAllOffersCached,
15
16
  ComputeWithCreateInstanceSupport,
16
17
  ComputeWithMultinodeSupport,
17
18
  ComputeWithPlacementGroupSupport,
18
19
  generate_unique_instance_name,
19
20
  get_user_data,
20
21
  )
21
- from dstack._internal.core.backends.base.offers import get_catalog_offers
22
+ from dstack._internal.core.backends.base.offers import get_catalog_offers, get_offers_disk_modifier
22
23
  from dstack._internal.core.backends.nebius import resources
23
24
  from dstack._internal.core.backends.nebius.fabrics import get_suitable_infiniband_fabrics
24
25
  from dstack._internal.core.backends.nebius.models import NebiusConfig, NebiusServiceAccountCreds
@@ -76,6 +77,7 @@ SUPPORTED_PLATFORMS = [
76
77
 
77
78
 
78
79
  class NebiusCompute(
80
+ ComputeWithAllOffersCached,
79
81
  ComputeWithCreateInstanceSupport,
80
82
  ComputeWithMultinodeSupport,
81
83
  ComputeWithPlacementGroupSupport,
@@ -106,15 +108,11 @@ class NebiusCompute(
106
108
  ).metadata.id
107
109
  return self._subnet_id_cache[region]
108
110
 
109
- def get_offers(
110
- self, requirements: Optional[Requirements] = None
111
- ) -> List[InstanceOfferWithAvailability]:
111
+ def get_all_offers_with_availability(self) -> List[InstanceOfferWithAvailability]:
112
112
  offers = get_catalog_offers(
113
113
  backend=BackendType.NEBIUS,
114
114
  locations=list(self._region_to_project_id),
115
- requirements=requirements,
116
115
  extra_filter=_supported_instances,
117
- configurable_disk_size=CONFIGURABLE_DISK_SIZE,
118
116
  )
119
117
  return [
120
118
  InstanceOfferWithAvailability(
@@ -124,6 +122,11 @@ class NebiusCompute(
124
122
  for offer in offers
125
123
  ]
126
124
 
125
+ def get_offers_modifier(
126
+ self, requirements: Requirements
127
+ ) -> Callable[[InstanceOfferWithAvailability], Optional[InstanceOfferWithAvailability]]:
128
+ return get_offers_disk_modifier(CONFIGURABLE_DISK_SIZE, requirements)
129
+
127
130
  def create_instance(
128
131
  self,
129
132
  instance_offer: InstanceOfferWithAvailability,
@@ -1,17 +1,18 @@
1
1
  from concurrent.futures import ThreadPoolExecutor
2
2
  from functools import cached_property
3
- from typing import List, Optional
3
+ from typing import Callable, List, Optional
4
4
 
5
5
  import oci
6
6
 
7
7
  from dstack._internal.core.backends.base.compute import (
8
8
  Compute,
9
+ ComputeWithAllOffersCached,
9
10
  ComputeWithCreateInstanceSupport,
10
11
  ComputeWithMultinodeSupport,
11
12
  generate_unique_instance_name,
12
13
  get_user_data,
13
14
  )
14
- from dstack._internal.core.backends.base.offers import get_catalog_offers
15
+ from dstack._internal.core.backends.base.offers import get_catalog_offers, get_offers_disk_modifier
15
16
  from dstack._internal.core.backends.oci import resources
16
17
  from dstack._internal.core.backends.oci.models import OCIConfig
17
18
  from dstack._internal.core.backends.oci.region import make_region_clients_map
@@ -47,6 +48,7 @@ CONFIGURABLE_DISK_SIZE = Range[Memory](min=Memory.parse("50GB"), max=Memory.pars
47
48
 
48
49
 
49
50
  class OCICompute(
51
+ ComputeWithAllOffersCached,
50
52
  ComputeWithCreateInstanceSupport,
51
53
  ComputeWithMultinodeSupport,
52
54
  Compute,
@@ -60,14 +62,10 @@ class OCICompute(
60
62
  def shapes_quota(self) -> resources.ShapesQuota:
61
63
  return resources.ShapesQuota.load(self.regions, self.config.compartment_id)
62
64
 
63
- def get_offers(
64
- self, requirements: Optional[Requirements] = None
65
- ) -> List[InstanceOfferWithAvailability]:
65
+ def get_all_offers_with_availability(self) -> List[InstanceOfferWithAvailability]:
66
66
  offers = get_catalog_offers(
67
67
  backend=BackendType.OCI,
68
68
  locations=self.config.regions,
69
- requirements=requirements,
70
- configurable_disk_size=CONFIGURABLE_DISK_SIZE,
71
69
  extra_filter=_supported_instances,
72
70
  )
73
71
 
@@ -96,6 +94,11 @@ class OCICompute(
96
94
 
97
95
  return offers_with_availability
98
96
 
97
+ def get_offers_modifier(
98
+ self, requirements: Requirements
99
+ ) -> Callable[[InstanceOfferWithAvailability], Optional[InstanceOfferWithAvailability]]:
100
+ return get_offers_disk_modifier(CONFIGURABLE_DISK_SIZE, requirements)
101
+
99
102
  def terminate_instance(
100
103
  self, instance_id: str, region: str, backend_data: Optional[str] = None
101
104
  ) -> None:
@@ -115,7 +118,11 @@ class OCICompute(
115
118
  availability_domain = instance_offer.availability_zones[0]
116
119
 
117
120
  listing, package = resources.get_marketplace_listing_and_package(
118
- cuda=len(instance_offer.instance.resources.gpus) > 0,
121
+ gpu_name=(
122
+ instance_offer.instance.resources.gpus[0].name
123
+ if len(instance_offer.instance.resources.gpus) > 0
124
+ else None
125
+ ),
119
126
  client=region.marketplace_client,
120
127
  )
121
128
  resources.accept_marketplace_listing_agreements(
@@ -23,7 +23,9 @@ import oci
23
23
  from oci.object_storage.models import CreatePreauthenticatedRequestDetails
24
24
 
25
25
  from dstack import version
26
+ from dstack._internal.core.backends.base.compute import requires_nvidia_proprietary_kernel_modules
26
27
  from dstack._internal.core.backends.oci.region import OCIRegionClient
28
+ from dstack._internal.core.consts import DSTACK_OS_IMAGE_WITH_PROPRIETARY_NVIDIA_KERNEL_MODULES
27
29
  from dstack._internal.core.errors import BackendError
28
30
  from dstack._internal.core.models.instances import InstanceOffer
29
31
  from dstack._internal.utils.common import batched
@@ -352,11 +354,14 @@ def terminate_instance_if_exists(client: oci.core.ComputeClient, instance_id: st
352
354
 
353
355
 
354
356
  def get_marketplace_listing_and_package(
355
- cuda: bool, client: oci.marketplace.MarketplaceClient
357
+ gpu_name: Optional[str], client: oci.marketplace.MarketplaceClient
356
358
  ) -> Tuple[oci.marketplace.models.Listing, oci.marketplace.models.ImageListingPackage]:
357
359
  listing_name = f"dstack-{version.base_image}"
358
- if cuda:
359
- listing_name = f"dstack-cuda-{version.base_image}"
360
+ if gpu_name is not None:
361
+ if not requires_nvidia_proprietary_kernel_modules(gpu_name):
362
+ listing_name = f"dstack-cuda-{version.base_image}"
363
+ else:
364
+ listing_name = f"dstack-cuda-{DSTACK_OS_IMAGE_WITH_PROPRIETARY_NVIDIA_KERNEL_MODULES}"
360
365
 
361
366
  listing_summaries = list_marketplace_listings(listing_name, client)
362
367
  if len(listing_summaries) != 1:
@@ -1,17 +1,18 @@
1
1
  import json
2
2
  import uuid
3
3
  from datetime import timedelta
4
- from typing import List, Optional
4
+ from typing import Callable, List, Optional
5
5
 
6
6
  from dstack._internal.core.backends.base.backend import Compute
7
7
  from dstack._internal.core.backends.base.compute import (
8
+ ComputeWithAllOffersCached,
8
9
  ComputeWithVolumeSupport,
9
10
  generate_unique_instance_name,
10
11
  generate_unique_volume_name,
11
12
  get_docker_commands,
12
13
  get_job_instance_name,
13
14
  )
14
- from dstack._internal.core.backends.base.offers import get_catalog_offers
15
+ from dstack._internal.core.backends.base.offers import get_catalog_offers, get_offers_disk_modifier
15
16
  from dstack._internal.core.backends.runpod.api_client import RunpodApiClient
16
17
  from dstack._internal.core.backends.runpod.models import RunpodConfig
17
18
  from dstack._internal.core.consts import DSTACK_RUNNER_SSH_PORT
@@ -27,6 +28,7 @@ from dstack._internal.core.models.instances import (
27
28
  InstanceOfferWithAvailability,
28
29
  SSHKey,
29
30
  )
31
+ from dstack._internal.core.models.resources import Memory, Range
30
32
  from dstack._internal.core.models.runs import Job, JobProvisioningData, Requirements, Run
31
33
  from dstack._internal.core.models.volumes import Volume, VolumeProvisioningData
32
34
  from dstack._internal.utils.common import get_current_datetime
@@ -39,8 +41,12 @@ MAX_RESOURCE_NAME_LEN = 60
39
41
 
40
42
  CONTAINER_REGISTRY_AUTH_CLEANUP_INTERVAL = 60 * 60 * 24 # 24 hour
41
43
 
44
+ # RunPod does not seem to have any limits on the disk size.
45
+ CONFIGURABLE_DISK_SIZE = Range[Memory](min=Memory.parse("1GB"), max=None)
46
+
42
47
 
43
48
  class RunpodCompute(
49
+ ComputeWithAllOffersCached,
44
50
  ComputeWithVolumeSupport,
45
51
  Compute,
46
52
  ):
@@ -51,13 +57,11 @@ class RunpodCompute(
51
57
  self.config = config
52
58
  self.api_client = RunpodApiClient(config.creds.api_key)
53
59
 
54
- def get_offers(
55
- self, requirements: Optional[Requirements] = None
56
- ) -> List[InstanceOfferWithAvailability]:
60
+ def get_all_offers_with_availability(self) -> List[InstanceOfferWithAvailability]:
57
61
  offers = get_catalog_offers(
58
62
  backend=BackendType.RUNPOD,
59
63
  locations=self.config.regions or None,
60
- requirements=requirements,
64
+ requirements=None,
61
65
  extra_filter=lambda o: _is_secure_cloud(o.region) or self.config.allow_community_cloud,
62
66
  )
63
67
  offers = [
@@ -68,6 +72,11 @@ class RunpodCompute(
68
72
  ]
69
73
  return offers
70
74
 
75
+ def get_offers_modifier(
76
+ self, requirements: Requirements
77
+ ) -> Callable[[InstanceOfferWithAvailability], Optional[InstanceOfferWithAvailability]]:
78
+ return get_offers_disk_modifier(CONFIGURABLE_DISK_SIZE, requirements)
79
+
71
80
  def run_job(
72
81
  self,
73
82
  run: Run,
@@ -2,6 +2,7 @@ from typing import List, Optional
2
2
 
3
3
  from dstack._internal.core.backends.base.backend import Compute
4
4
  from dstack._internal.core.backends.base.compute import (
5
+ ComputeWithAllOffersCached,
5
6
  ComputeWithCreateInstanceSupport,
6
7
  ComputeWithGatewaySupport,
7
8
  ComputeWithMultinodeSupport,
@@ -28,6 +29,7 @@ logger = get_logger(__name__)
28
29
 
29
30
  class {{ backend_name }}Compute(
30
31
  # TODO: Choose ComputeWith* classes to extend and implement
32
+ # ComputeWithAllOffersCached,
31
33
  # ComputeWithCreateInstanceSupport,
32
34
  # ComputeWithMultinodeSupport,
33
35
  # ComputeWithReservationSupport,
@@ -42,7 +44,7 @@ class {{ backend_name }}Compute(
42
44
  self.config = config
43
45
 
44
46
  def get_offers(
45
- self, requirements: Optional[Requirements] = None
47
+ self, requirements: Requirements
46
48
  ) -> List[InstanceOfferWithAvailability]:
47
49
  # If the provider is added to gpuhunt, you'd typically get offers
48
50
  # using `get_catalog_offers()` and extend them with availability info.