dstack 0.19.31__py3-none-any.whl → 0.19.33__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of dstack might be problematic.

Files changed (53)
  1. dstack/_internal/cli/commands/offer.py +1 -1
  2. dstack/_internal/cli/services/configurators/run.py +1 -5
  3. dstack/_internal/core/backends/aws/compute.py +8 -5
  4. dstack/_internal/core/backends/azure/compute.py +9 -6
  5. dstack/_internal/core/backends/base/compute.py +40 -17
  6. dstack/_internal/core/backends/base/offers.py +5 -1
  7. dstack/_internal/core/backends/datacrunch/compute.py +9 -6
  8. dstack/_internal/core/backends/gcp/compute.py +137 -7
  9. dstack/_internal/core/backends/gcp/models.py +7 -0
  10. dstack/_internal/core/backends/gcp/resources.py +87 -5
  11. dstack/_internal/core/backends/hotaisle/compute.py +30 -0
  12. dstack/_internal/core/backends/kubernetes/compute.py +218 -77
  13. dstack/_internal/core/backends/kubernetes/models.py +4 -2
  14. dstack/_internal/core/backends/nebius/compute.py +24 -6
  15. dstack/_internal/core/backends/nebius/configurator.py +15 -0
  16. dstack/_internal/core/backends/nebius/models.py +57 -5
  17. dstack/_internal/core/backends/nebius/resources.py +45 -2
  18. dstack/_internal/core/backends/oci/compute.py +9 -6
  19. dstack/_internal/core/backends/runpod/compute.py +10 -6
  20. dstack/_internal/core/backends/vastai/compute.py +3 -1
  21. dstack/_internal/core/backends/vastai/configurator.py +0 -1
  22. dstack/_internal/core/compatibility/runs.py +8 -0
  23. dstack/_internal/core/models/fleets.py +1 -1
  24. dstack/_internal/core/models/profiles.py +12 -5
  25. dstack/_internal/core/models/runs.py +3 -2
  26. dstack/_internal/core/models/users.py +10 -0
  27. dstack/_internal/core/services/configs/__init__.py +1 -0
  28. dstack/_internal/server/background/tasks/process_fleets.py +75 -17
  29. dstack/_internal/server/background/tasks/process_instances.py +6 -4
  30. dstack/_internal/server/background/tasks/process_running_jobs.py +1 -0
  31. dstack/_internal/server/background/tasks/process_runs.py +27 -23
  32. dstack/_internal/server/background/tasks/process_submitted_jobs.py +63 -20
  33. dstack/_internal/server/migrations/versions/ff1d94f65b08_user_ssh_key.py +34 -0
  34. dstack/_internal/server/models.py +3 -0
  35. dstack/_internal/server/routers/runs.py +5 -1
  36. dstack/_internal/server/routers/users.py +14 -2
  37. dstack/_internal/server/services/runs.py +9 -4
  38. dstack/_internal/server/services/users.py +35 -2
  39. dstack/_internal/server/statics/index.html +1 -1
  40. dstack/_internal/server/statics/main-720ce3a11140daa480cc.css +3 -0
  41. dstack/_internal/server/statics/{main-c51afa7f243e24d3e446.js → main-97c7e184573ca23f9fe4.js} +12218 -7625
  42. dstack/_internal/server/statics/{main-c51afa7f243e24d3e446.js.map → main-97c7e184573ca23f9fe4.js.map} +1 -1
  43. dstack/api/_public/__init__.py +9 -12
  44. dstack/api/_public/repos.py +0 -21
  45. dstack/api/_public/runs.py +64 -9
  46. dstack/api/server/_users.py +17 -2
  47. dstack/version.py +2 -2
  48. {dstack-0.19.31.dist-info → dstack-0.19.33.dist-info}/METADATA +12 -14
  49. {dstack-0.19.31.dist-info → dstack-0.19.33.dist-info}/RECORD +52 -51
  50. dstack/_internal/server/statics/main-56191fbfe77f49b251de.css +0 -3
  51. {dstack-0.19.31.dist-info → dstack-0.19.33.dist-info}/WHEEL +0 -0
  52. {dstack-0.19.31.dist-info → dstack-0.19.33.dist-info}/entry_points.txt +0 -0
  53. {dstack-0.19.31.dist-info → dstack-0.19.33.dist-info}/licenses/LICENSE.md +0 -0
dstack/_internal/core/backends/hotaisle/compute.py

@@ -42,6 +42,36 @@ INSTANCE_TYPE_SPECS = {
         "cpu_frequency": 2000000000,
         "cpu_manufacturer": "Intel",
     },
+    "2x MI300X 26x Xeon Platinum 8470": {
+        "cpu_model": "Xeon Platinum 8470",
+        "cpu_frequency": 2000000000,
+        "cpu_manufacturer": "Intel",
+    },
+    "2x MI300X 26x Xeon Platinum 8462Y+": {
+        "cpu_model": "Xeon Platinum 8462Y+",
+        "cpu_frequency": 2800000000,
+        "cpu_manufacturer": "Intel",
+    },
+    "4x MI300X 52x Xeon Platinum 8470": {
+        "cpu_model": "Xeon Platinum 8470",
+        "cpu_frequency": 2000000000,
+        "cpu_manufacturer": "Intel",
+    },
+    "4x MI300X 52x Xeon Platinum 8462Y+": {
+        "cpu_model": "Xeon Platinum 8462Y+",
+        "cpu_frequency": 2800000000,
+        "cpu_manufacturer": "Intel",
+    },
+    "8x MI300X 104x Xeon Platinum 8470": {
+        "cpu_model": "Xeon Platinum 8470",
+        "cpu_frequency": 2000000000,
+        "cpu_manufacturer": "Intel",
+    },
+    "8x MI300X 104x Xeon Platinum 8462Y+": {
+        "cpu_model": "Xeon Platinum 8462Y+",
+        "cpu_frequency": 2800000000,
+        "cpu_manufacturer": "Intel",
+    },
 }
 
 
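For illustration only (not part of the diff): each new key is a Hot Aisle instance type name whose value describes the host CPU, so a lookup in that module would read, assuming INSTANCE_TYPE_SPECS is in scope:

    specs = INSTANCE_TYPE_SPECS["8x MI300X 104x Xeon Platinum 8470"]
    cpu_model = specs["cpu_model"]       # "Xeon Platinum 8470"
    cpu_hz = specs["cpu_frequency"]      # 2000000000, i.e. 2.0 GHz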
dstack/_internal/core/backends/kubernetes/compute.py

@@ -2,9 +2,10 @@ import subprocess
 import tempfile
 import threading
 import time
+from enum import Enum
 from typing import List, Optional, Tuple
 
-from gpuhunt import KNOWN_NVIDIA_GPUS, AcceleratorVendor
+from gpuhunt import KNOWN_AMD_GPUS, KNOWN_NVIDIA_GPUS, AcceleratorVendor
 from kubernetes import client
 
 from dstack._internal.core.backends.base.compute import (
@@ -58,11 +59,42 @@ from dstack._internal.utils.logging import get_logger
 logger = get_logger(__name__)
 
 JUMP_POD_SSH_PORT = 22
+DUMMY_REGION = "-"
+
+NVIDIA_GPU_RESOURCE = "nvidia.com/gpu"
+NVIDIA_GPU_NODE_TAINT = NVIDIA_GPU_RESOURCE
+NVIDIA_GPU_PRODUCT_LABEL = f"{NVIDIA_GPU_RESOURCE}.product"
+
+AMD_GPU_RESOURCE = "amd.com/gpu"
+AMD_GPU_NODE_TAINT = AMD_GPU_RESOURCE
+# The oldest but still supported label format, the safest option, see the commit message:
+# https://github.com/ROCm/k8s-device-plugin/commit/c0b0231b391a56bc9da4f362d561e25e960d7a48
+# E.g., beta.amd.com/gpu.device-id.74b5=4 - A node with four MI300X VF (0x74b5) GPUs
+# We cannot rely on the beta.amd.com/gpu.product-name.* label, as it may be missing, see the issue:
+# https://github.com/ROCm/k8s-device-plugin/issues/112
+AMD_GPU_DEVICE_ID_LABEL_PREFIX = f"beta.{AMD_GPU_RESOURCE}.device-id."
+
+# Taints we know and tolerate when creating our objects, e.g., the jump pod.
+TOLERATED_NODE_TAINTS = (NVIDIA_GPU_NODE_TAINT, AMD_GPU_NODE_TAINT)
 
 NVIDIA_GPU_NAME_TO_GPU_INFO = {gpu.name: gpu for gpu in KNOWN_NVIDIA_GPUS}
 NVIDIA_GPU_NAMES = NVIDIA_GPU_NAME_TO_GPU_INFO.keys()
 
-DUMMY_REGION = "-"
+AMD_GPU_DEVICE_ID_TO_GPU_INFO = {
+    device_id: gpu_info for gpu_info in KNOWN_AMD_GPUS for device_id in gpu_info.device_ids
+}
+AMD_GPU_NAME_TO_DEVICE_IDS = {gpu.name: gpu.device_ids for gpu in KNOWN_AMD_GPUS}
+
+
+class Operator(str, Enum):
+    EXISTS = "Exists"
+    IN = "In"
+
+
+class TaintEffect(str, Enum):
+    NO_EXECUTE = "NoExecute"
+    NO_SCHEDULE = "NoSchedule"
+    PREFER_NO_SCHEDULE = "PreferNoSchedule"
 
 
 class KubernetesCompute(
@@ -92,21 +124,15 @@ class KubernetesCompute(
         nodes = get_value(node_list, ".items", list[client.V1Node], required=True)
         for node in nodes:
             try:
-                labels = get_value(node, ".metadata.labels", dict[str, str]) or {}
                 name = get_value(node, ".metadata.name", str, required=True)
-                cpus = _parse_cpu(
-                    get_value(node, ".status.allocatable['cpu']", str, required=True)
-                )
                 cpu_arch = normalize_arch(
                     get_value(node, ".status.node_info.architecture", str)
                 ).to_cpu_architecture()
-                memory_mib = _parse_memory(
-                    get_value(node, ".status.allocatable['memory']", str, required=True)
-                )
-                gpus, _ = _get_gpus_from_node_labels(labels)
-                disk_size_mib = _parse_memory(
-                    get_value(node, ".status.allocatable['ephemeral-storage']", str, required=True)
-                )
+                allocatable = get_value(node, ".status.allocatable", dict[str, str], required=True)
+                cpus = _parse_cpu(allocatable["cpu"])
+                memory_mib = _parse_memory(allocatable["memory"])
+                disk_size_mib = _parse_memory(allocatable["ephemeral-storage"])
+                gpus = _get_node_gpus(node)
             except (AttributeError, KeyError, ValueError) as e:
                 logger.exception("Failed to process node: %s: %s", type(e).__name__, e)
                 continue
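The refactored loop reads the node's `.status.allocatable` map once and indexes it directly; GPU counts are taken from the same map by `_get_node_gpus` (added further below). For illustration only, an allocatable map typically looks like this (values are Kubernetes quantity strings that the `_parse_*` helpers convert; the figures are made up):

    allocatable = {
        "cpu": "7910m",                   # millicores
        "memory": "28833820Ki",           # converted to MiB by _parse_memory
        "ephemeral-storage": "94315077978",
        "nvidia.com/gpu": "1",            # extended resource advertised by the device plugin
    }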
@@ -141,6 +167,7 @@ class KubernetesCompute(
         volumes: List[Volume],
     ) -> JobProvisioningData:
         instance_name = generate_unique_instance_name_for_job(run, job)
+        assert run.run_spec.ssh_key_pub is not None
         commands = get_docker_commands(
             [run.run_spec.ssh_key_pub.strip(), project_ssh_public_key.strip()]
         )
@@ -181,6 +208,7 @@ class KubernetesCompute(
         resources_requests: dict[str, str] = {}
         resources_limits: dict[str, str] = {}
         node_affinity: Optional[client.V1NodeAffinity] = None
+        tolerations: list[client.V1Toleration] = []
         volumes_: list[client.V1Volume] = []
         volume_mounts: list[client.V1VolumeMount] = []
 
@@ -196,52 +224,18 @@ class KubernetesCompute(
                     "GPU is requested but the offer has no GPUs:"
                     f" {gpu_spec=} {instance_offer=}",
                 )
-            offer_gpu = offer_gpus[0]
-            matching_gpu_label_values: set[str] = set()
-            # We cannot generate an expected GPU label value from the Gpu model instance
-            # as the actual values may have additional components (socket, memory type, etc.)
-            # that we don't preserve in the Gpu model, e.g., "NVIDIA-H100-80GB-HBM3".
-            # Moreover, a single Gpu may match multiple label values.
-            # As a workaround, we iterate and process all node labels once again (we already
-            # processed them in `get_offers_by_requirements()`).
-            node_list = call_api_method(
-                self.api.list_node,
-                client.V1NodeList,
+            gpu_resource, node_affinity, node_taint = _get_pod_spec_parameters_for_gpu(
+                self.api, offer_gpus[0]
             )
-            nodes = get_value(node_list, ".items", list[client.V1Node], required=True)
-            for node in nodes:
-                labels = get_value(node, ".metadata.labels", dict[str, str])
-                if not labels:
-                    continue
-                gpus, gpu_label_value = _get_gpus_from_node_labels(labels)
-                if not gpus or gpu_label_value is None:
-                    continue
-                if gpus[0] == offer_gpu:
-                    matching_gpu_label_values.add(gpu_label_value)
-            if not matching_gpu_label_values:
-                raise ComputeError(
-                    f"GPU is requested but no matching GPU labels found: {gpu_spec=}"
+            logger.debug("Requesting GPU resource: %s=%d", gpu_resource, gpu_min)
+            resources_requests[gpu_resource] = resources_limits[gpu_resource] = str(gpu_min)
+            # It should be NoSchedule, but we also add NoExecute toleration just in case.
+            for effect in [TaintEffect.NO_SCHEDULE, TaintEffect.NO_EXECUTE]:
+                tolerations.append(
+                    client.V1Toleration(
+                        key=node_taint, operator=Operator.EXISTS, effect=effect
+                    )
                 )
-            logger.debug(
-                "Requesting %d GPU(s), node labels: %s", gpu_min, matching_gpu_label_values
-            )
-            # TODO: support other GPU vendors
-            resources_requests["nvidia.com/gpu"] = str(gpu_min)
-            resources_limits["nvidia.com/gpu"] = str(gpu_min)
-            node_affinity = client.V1NodeAffinity(
-                required_during_scheduling_ignored_during_execution=[
-                    client.V1NodeSelectorTerm(
-                        match_expressions=[
-                            client.V1NodeSelectorRequirement(
-                                key="nvidia.com/gpu.product",
-                                operator="In",
-                                values=list(matching_gpu_label_values),
-                            ),
-                        ],
-                    ),
-                ],
-            )
-
         if (memory_min := resources_spec.memory.min) is not None:
             resources_requests["memory"] = _render_memory(memory_min)
         if (
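For a single NVIDIA GPU this path boils down to requesting the vendor's extended resource and tolerating the matching node taint. A minimal sketch of the resulting pod-spec pieces, using the kubernetes client as elsewhere in this module (values illustrative):

    from kubernetes import client

    resources_requests = {"nvidia.com/gpu": "1"}  # mirrored into resources_limits
    tolerations = [
        client.V1Toleration(key="nvidia.com/gpu", operator="Exists", effect="NoSchedule"),
        client.V1Toleration(key="nvidia.com/gpu", operator="Exists", effect="NoExecute"),
    ]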
@@ -303,7 +297,10 @@ class KubernetesCompute(
                     volume_mounts=volume_mounts,
                 )
             ],
-            affinity=node_affinity,
+            affinity=client.V1Affinity(
+                node_affinity=node_affinity,
+            ),
+            tolerations=tolerations,
             volumes=volumes_,
         ),
     )
@@ -521,34 +518,144 @@ def _render_memory(memory: Memory) -> str:
     return f"{float(memory)}Gi"
 
 
-def _get_gpus_from_node_labels(labels: dict[str, str]) -> tuple[list[Gpu], Optional[str]]:
+def _get_node_gpus(node: client.V1Node) -> list[Gpu]:
+    node_name = get_value(node, ".metadata.name", str, required=True)
+    allocatable = get_value(node, ".status.allocatable", dict[str, str], required=True)
+    labels = get_value(node, ".metadata.labels", dict[str, str]) or {}
+    for gpu_resource, gpu_getter in (
+        (NVIDIA_GPU_RESOURCE, _get_nvidia_gpu_from_node_labels),
+        (AMD_GPU_RESOURCE, _get_amd_gpu_from_node_labels),
+    ):
+        _gpu_count = allocatable.get(gpu_resource)
+        if not _gpu_count:
+            continue
+        gpu_count = int(_gpu_count)
+        if gpu_count < 1:
+            continue
+        gpu = gpu_getter(labels)
+        if gpu is None:
+            logger.warning(
+                "Node %s: GPU resource found, but failed to detect its model: %s=%d",
+                node_name,
+                gpu_resource,
+                gpu_count,
+            )
+            return []
+        return [gpu] * gpu_count
+    logger.debug("Node %s: no GPU resource found", node_name)
+    return []
+
+
+def _get_nvidia_gpu_from_node_labels(labels: dict[str, str]) -> Optional[Gpu]:
     # We rely on https://github.com/NVIDIA/k8s-device-plugin/tree/main/docs/gpu-feature-discovery
     # to detect gpus. Note that "nvidia.com/gpu.product" is not a short gpu name like "T4" or
     # "A100" but a product name like "Tesla-T4" or "A100-SXM4-40GB".
     # Thus, we convert the product name to a known gpu name.
-    # TODO: support other GPU vendors
-    gpu_count = labels.get("nvidia.com/gpu.count")
-    gpu_product = labels.get("nvidia.com/gpu.product")
-    if gpu_count is None or gpu_product is None:
-        return [], None
-    gpu_count = int(gpu_count)
-    gpu_name = None
-    for known_gpu_name in NVIDIA_GPU_NAMES:
-        if known_gpu_name.lower() in gpu_product.lower().split("-"):
-            gpu_name = known_gpu_name
+    gpu_product = labels.get(NVIDIA_GPU_PRODUCT_LABEL)
+    if gpu_product is None:
+        return None
+    for gpu_name in NVIDIA_GPU_NAMES:
+        if gpu_name.lower() in gpu_product.lower().split("-"):
             break
-    if gpu_name is None:
-        return [], None
+    else:
+        return None
     gpu_info = NVIDIA_GPU_NAME_TO_GPU_INFO[gpu_name]
     gpu_memory = gpu_info.memory * 1024
     # A100 may come in two variants
     if "40GB" in gpu_product:
         gpu_memory = 40 * 1024
-    gpus = [
-        Gpu(vendor=AcceleratorVendor.NVIDIA, name=gpu_name, memory_mib=gpu_memory)
-        for _ in range(gpu_count)
-    ]
-    return gpus, gpu_product
+    return Gpu(vendor=AcceleratorVendor.NVIDIA, name=gpu_name, memory_mib=gpu_memory)
+
+
+def _get_amd_gpu_from_node_labels(labels: dict[str, str]) -> Optional[Gpu]:
+    # (AMDGPUInfo.name, AMDGPUInfo.memory) pairs
+    gpus: set[tuple[str, int]] = set()
+    for label in labels:
+        if not label.startswith(AMD_GPU_DEVICE_ID_LABEL_PREFIX):
+            continue
+        _, _, _device_id = label.rpartition(".")
+        device_id = int(_device_id, 16)
+        gpu_info = AMD_GPU_DEVICE_ID_TO_GPU_INFO.get(device_id)
+        if gpu_info is None:
+            logger.warning("Unknown AMD GPU device id: %X", device_id)
+            continue
+        gpus.add((gpu_info.name, gpu_info.memory))
+    if not gpus:
+        return None
+    if len(gpus) == 1:
+        gpu_name, gpu_memory_gib = next(iter(gpus))
+        return Gpu(vendor=AcceleratorVendor.AMD, name=gpu_name, memory_mib=gpu_memory_gib * 1024)
+    logger.warning("Multiple AMD GPU models detected: %s, ignoring all GPUs", gpus)
+    return None
+
+
+def _get_pod_spec_parameters_for_gpu(
+    api: client.CoreV1Api, gpu: Gpu
+) -> tuple[str, client.V1NodeAffinity, str]:
+    gpu_vendor = gpu.vendor
+    assert gpu_vendor is not None
+    if gpu_vendor == AcceleratorVendor.NVIDIA:
+        node_affinity = _get_nvidia_gpu_node_affinity(api, gpu)
+        return NVIDIA_GPU_RESOURCE, node_affinity, NVIDIA_GPU_NODE_TAINT
+    if gpu_vendor == AcceleratorVendor.AMD:
+        node_affinity = _get_amd_gpu_node_affinity(gpu)
+        return AMD_GPU_RESOURCE, node_affinity, AMD_GPU_NODE_TAINT
+    raise ComputeError(f"Unsupported GPU vendor: {gpu_vendor}")
+
+
+def _get_nvidia_gpu_node_affinity(api: client.CoreV1Api, gpu: Gpu) -> client.V1NodeAffinity:
+    matching_gpu_label_values: set[str] = set()
+    # We cannot generate an expected GPU label value from the Gpu model instance
+    # as the actual values may have additional components (socket, memory type, etc.)
+    # that we don't preserve in the Gpu model, e.g., "NVIDIA-H100-80GB-HBM3".
+    # Moreover, a single Gpu may match multiple label values.
+    # As a workaround, we iterate and process all node labels once again (we already
+    # processed them in `get_offers_by_requirements()`).
+    node_list = call_api_method(api.list_node, client.V1NodeList)
+    nodes = get_value(node_list, ".items", list[client.V1Node], required=True)
+    for node in nodes:
+        labels = get_value(node, ".metadata.labels", dict[str, str]) or {}
+        if _get_nvidia_gpu_from_node_labels(labels) == gpu:
+            matching_gpu_label_values.add(labels[NVIDIA_GPU_PRODUCT_LABEL])
+    if not matching_gpu_label_values:
+        raise ComputeError(f"NVIDIA GPU is requested but no matching GPU labels found: {gpu=}")
+    logger.debug("Selecting nodes by labels %s for NVIDIA %s", matching_gpu_label_values, gpu.name)
+    return client.V1NodeAffinity(
+        required_during_scheduling_ignored_during_execution=client.V1NodeSelector(
+            node_selector_terms=[
+                client.V1NodeSelectorTerm(
+                    match_expressions=[
+                        client.V1NodeSelectorRequirement(
+                            key=NVIDIA_GPU_PRODUCT_LABEL,
+                            operator=Operator.IN,
+                            values=list(matching_gpu_label_values),
+                        ),
+                    ],
+                ),
+            ],
+        ),
+    )
+
+
+def _get_amd_gpu_node_affinity(gpu: Gpu) -> client.V1NodeAffinity:
+    device_ids = AMD_GPU_NAME_TO_DEVICE_IDS.get(gpu.name)
+    if device_ids is None:
+        raise ComputeError(f"AMD GPU is requested but no matching device ids found: {gpu=}")
+    return client.V1NodeAffinity(
+        required_during_scheduling_ignored_during_execution=client.V1NodeSelector(
+            node_selector_terms=[
+                client.V1NodeSelectorTerm(
+                    match_expressions=[
+                        client.V1NodeSelectorRequirement(
+                            key=f"{AMD_GPU_DEVICE_ID_LABEL_PREFIX}{device_id:x}",
+                            operator=Operator.EXISTS,
+                        ),
+                    ],
+                )
+                for device_id in device_ids
+            ],
+        ),
+    )
 
 
 def _continue_setup_jump_pod(
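As the comments above note, AMD GPUs are recognized from `beta.amd.com/gpu.device-id.<hex>` node labels rather than product-name labels. A stdlib-only sketch of that parsing step (the device-id-to-model lookup itself comes from gpuhunt's KNOWN_AMD_GPUS; the label is hypothetical):

    prefix = "beta.amd.com/gpu.device-id."
    labels = {"beta.amd.com/gpu.device-id.74b5": "4"}  # e.g. a node with MI300X VF GPUs
    for label in labels:
        if label.startswith(prefix):
            device_id = int(label.rpartition(".")[-1], 16)  # 0x74B5 == 29877
            # device_id is then resolved via AMD_GPU_DEVICE_ID_TO_GPU_INFO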
@@ -647,6 +754,39 @@ def _create_jump_pod_service(
         namespace=namespace,
         name=pod_name,
     )
+
+    node_list = call_api_method(api.list_node, client.V1NodeList)
+    nodes = get_value(node_list, ".items", list[client.V1Node], required=True)
+    # False if we found at least one node without any "hard" taint, that is, if we don't need to
+    # specify the toleration.
+    toleration_required = True
+    # (key, effect) pairs.
+    tolerated_taints: set[tuple[str, str]] = set()
+    for node in nodes:
+        # True if the node has at least one NoExecute or NoSchedule taint.
+        has_hard_taint = False
+        taints = get_value(node, ".spec.taints", list[client.V1Taint]) or []
+        for taint in taints:
+            effect = get_value(taint, ".effect", str, required=True)
+            # A "soft" taint, ignore.
+            if effect == TaintEffect.PREFER_NO_SCHEDULE:
+                continue
+            has_hard_taint = True
+            key = get_value(taint, ".key", str, required=True)
+            if key in TOLERATED_NODE_TAINTS:
+                tolerated_taints.add((key, effect))
+        if not has_hard_taint:
+            toleration_required = False
+            break
+    tolerations: list[client.V1Toleration] = []
+    if toleration_required:
+        for key, effect in tolerated_taints:
+            tolerations.append(
+                client.V1Toleration(key=key, operator=Operator.EXISTS, effect=effect)
+            )
+        if not tolerations:
+            logger.warning("No appropriate node found, the jump pod may never be scheduled")
+
     commands = _get_jump_pod_commands(authorized_keys=ssh_public_keys)
     pod = client.V1Pod(
         metadata=client.V1ObjectMeta(
@@ -667,7 +807,8 @@ def _create_jump_pod_service(
                         )
                     ],
                 )
-            ]
+            ],
+            tolerations=tolerations,
         ),
     )
     call_api_method(
dstack/_internal/core/backends/kubernetes/models.py

@@ -37,7 +37,7 @@ class KubernetesBackendConfigWithCreds(KubernetesBackendConfig):
 
 
 class KubeconfigFileConfig(CoreModel):
-    filename: Annotated[str, Field(description="The path to the kubeconfig file")]
+    filename: Annotated[str, Field(description="The path to the kubeconfig file")] = ""
     data: Annotated[
         Optional[str],
         Field(
@@ -50,7 +50,9 @@ class KubeconfigFileConfig(CoreModel):
     ] = None
 
     @root_validator
-    def fill_data(cls, values):
+    def fill_data(cls, values: dict) -> dict:
+        if values.get("filename") == "" and values.get("data") is None:
+            raise ValueError("filename or data must be specified")
         return fill_data(values)
 
 
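With `filename` now defaulting to an empty string, the validator rejects a kubeconfig section that provides neither a path nor inline data. A hedged usage sketch (import path per this package's internal layout):

    from pydantic import ValidationError
    from dstack._internal.core.backends.kubernetes.models import KubeconfigFileConfig

    try:
        KubeconfigFileConfig()  # neither filename nor data given
    except ValidationError as e:
        print(e)  # ... filename or data must be specified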
dstack/_internal/core/backends/nebius/compute.py

@@ -2,8 +2,9 @@ import json
 import random
 import shlex
 import time
+from collections.abc import Iterable
 from functools import cached_property
-from typing import Callable, List, Optional
+from typing import List, Optional
 
 from nebius.aio.operation import Operation as SDKOperation
 from nebius.aio.service_error import RequestError, StatusCode
@@ -19,8 +20,13 @@ from dstack._internal.core.backends.base.compute import (
     ComputeWithPrivilegedSupport,
     generate_unique_instance_name,
     get_user_data,
+    merge_tags,
+)
+from dstack._internal.core.backends.base.offers import (
+    OfferModifier,
+    get_catalog_offers,
+    get_offers_disk_modifier,
 )
-from dstack._internal.core.backends.base.offers import get_catalog_offers, get_offers_disk_modifier
 from dstack._internal.core.backends.nebius import resources
 from dstack._internal.core.backends.nebius.fabrics import get_suitable_infiniband_fabrics
 from dstack._internal.core.backends.nebius.models import NebiusConfig, NebiusServiceAccountCreds
@@ -124,10 +130,8 @@ class NebiusCompute(
             for offer in offers
         ]
 
-    def get_offers_modifier(
-        self, requirements: Requirements
-    ) -> Callable[[InstanceOfferWithAvailability], Optional[InstanceOfferWithAvailability]]:
-        return get_offers_disk_modifier(CONFIGURABLE_DISK_SIZE, requirements)
+    def get_offers_modifiers(self, requirements: Requirements) -> Iterable[OfferModifier]:
+        return [get_offers_disk_modifier(CONFIGURABLE_DISK_SIZE, requirements)]
 
     def create_instance(
         self,
@@ -150,6 +154,18 @@ class NebiusCompute(
         if backend_data.cluster is not None:
             cluster_id = backend_data.cluster.id
 
+        labels = {
+            "owner": "dstack",
+            "dstack_project": instance_config.project_name.lower(),
+            "dstack_name": instance_config.instance_name,
+            "dstack_user": instance_config.user.lower(),
+        }
+        labels = merge_tags(
+            base_tags=labels,
+            backend_tags=self.config.tags,
+            resource_tags=instance_config.tags,
+        )
+        labels = resources.filter_invalid_labels(labels)
         gpus = instance_offer.instance.resources.gpus
         create_disk_op = resources.create_disk(
             sdk=self._sdk,
@@ -159,6 +175,7 @@ class NebiusCompute(
             image_family="ubuntu24.04-cuda12"
             if gpus and gpus[0].name == "B200"
             else "ubuntu22.04-cuda12",
+            labels=labels,
         )
         create_instance_op = None
         try:
@@ -184,6 +201,7 @@ class NebiusCompute(
                 disk_id=create_disk_op.resource_id,
                 subnet_id=self._get_subnet_id(instance_offer.region),
                 preemptible=instance_offer.instance.resources.spot,
+                labels=labels,
             )
             _wait_for_instance(self._sdk, create_instance_op)
         except BaseException:
dstack/_internal/core/backends/nebius/configurator.py

@@ -3,6 +3,7 @@ import json
 from nebius.aio.service_error import RequestError
 
 from dstack._internal.core.backends.base.configurator import (
+    TAGS_MAX_NUM,
     BackendRecord,
     Configurator,
     raise_invalid_credentials_error,
@@ -18,6 +19,7 @@ from dstack._internal.core.backends.nebius.models import (
     NebiusServiceAccountCreds,
     NebiusStoredConfig,
 )
+from dstack._internal.core.errors import BackendError, ServerClientError
 from dstack._internal.core.models.backends.base import BackendType
 
 
@@ -53,6 +55,19 @@ class NebiusConfigurator(
                     f" some of the valid options: {sorted(valid_fabrics)}"
                 ),
             )
+        self._check_config_tags(config)
+
+    def _check_config_tags(self, config: NebiusBackendConfigWithCreds):
+        if not config.tags:
+            return
+        if len(config.tags) > TAGS_MAX_NUM:
+            raise ServerClientError(
+                f"Maximum number of tags exceeded. Up to {TAGS_MAX_NUM} tags is allowed."
+            )
+        try:
+            resources.validate_labels(config.tags)
+        except BackendError as e:
+            raise ServerClientError(e.args[0])
 
     def create_backend(
         self, project_name: str, config: NebiusBackendConfigWithCreds
dstack/_internal/core/backends/nebius/models.py

@@ -1,4 +1,6 @@
-from typing import Annotated, Literal, Optional, Union
+import json
+from pathlib import Path
+from typing import Annotated, Dict, Literal, Optional, Union
 
 from pydantic import Field, root_validator
 
@@ -27,16 +29,38 @@ class NebiusServiceAccountCreds(CoreModel):
             )
         ),
     ]
+    filename: Annotated[
+        Optional[str], Field(description="The path to the service account credentials file")
+    ] = None
 
 
 class NebiusServiceAccountFileCreds(CoreModel):
     type: Annotated[Literal["service_account"], Field(description="The type of credentials")] = (
         "service_account"
     )
-    service_account_id: Annotated[str, Field(description="Service account ID")]
-    public_key_id: Annotated[str, Field(description="ID of the service account public key")]
+    service_account_id: Annotated[
+        Optional[str],
+        Field(
+            description=(
+                "Service account ID. Set automatically if `filename` is specified. When configuring via the UI, it must be specified explicitly"
+            )
+        ),
+    ] = None
+    public_key_id: Annotated[
+        Optional[str],
+        Field(
+            description=(
+                "ID of the service account public key. Set automatically if `filename` is specified. When configuring via the UI, it must be specified explicitly"
+            )
+        ),
+    ] = None
     private_key_file: Annotated[
-        Optional[str], Field(description=("Path to the service account private key"))
+        Optional[str],
+        Field(
+            description=(
+                "Path to the service account private key. Set automatically if `filename` or `private_key_content` is specified. When configuring via the UI, it must be specified explicitly"
+            )
+        ),
     ] = None
     private_key_content: Annotated[
         Optional[str],
@@ -44,13 +68,35 @@ class NebiusServiceAccountFileCreds(CoreModel):
             description=(
                 "Content of the service account private key. When configuring via"
                 " `server/config.yml`, it's automatically filled from `private_key_file`."
-                " When configuring via UI, it has to be specified explicitly."
+                " When configuring via UI, it has to be specified explicitly"
             )
        ),
     ] = None
+    filename: Annotated[
+        Optional[str], Field(description="The path to the service account credentials file")
+    ] = None
 
     @root_validator
     def fill_data(cls, values):
+        if filename := values.get("filename"):
+            try:
+                with open(Path(filename).expanduser()) as f:
+                    data = json.load(f)
+                from nebius.base.service_account.credentials_file import (
+                    ServiceAccountCredentials,
+                )
+
+                credentials = ServiceAccountCredentials.from_json(data)
+                subject = credentials.subject_credentials
+                values["service_account_id"] = subject.sub
+                values["public_key_id"] = subject.kid
+                values["private_key_content"] = subject.private_key
+            except OSError:
+                raise ValueError(f"No such file {filename}")
+            except Exception as e:
+                raise ValueError(f"Failed to parse credentials file {filename}: {e}")
+            return values
+
         return fill_data(
             values, filename_field="private_key_file", data_field="private_key_content"
         )
@@ -95,6 +141,12 @@ class NebiusBackendConfig(CoreModel):
             )
         ),
     ] = None
+    tags: Annotated[
+        Optional[Dict[str, str]],
+        Field(
+            description="The tags (labels) that will be assigned to resources created by `dstack`"
+        ),
+    ] = None
 
 
 class NebiusBackendConfigWithCreds(NebiusBackendConfig):