dstack 0.19.6rc1__py3-none-any.whl → 0.19.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (69)
  1. dstack/_internal/cli/services/args.py +2 -2
  2. dstack/_internal/cli/services/configurators/fleet.py +3 -2
  3. dstack/_internal/cli/services/configurators/run.py +50 -4
  4. dstack/_internal/cli/utils/fleet.py +3 -1
  5. dstack/_internal/cli/utils/run.py +25 -28
  6. dstack/_internal/core/backends/aws/compute.py +13 -1
  7. dstack/_internal/core/backends/azure/compute.py +42 -13
  8. dstack/_internal/core/backends/azure/configurator.py +21 -0
  9. dstack/_internal/core/backends/azure/models.py +9 -0
  10. dstack/_internal/core/backends/base/compute.py +101 -27
  11. dstack/_internal/core/backends/base/offers.py +13 -3
  12. dstack/_internal/core/backends/cudo/compute.py +2 -0
  13. dstack/_internal/core/backends/datacrunch/compute.py +2 -0
  14. dstack/_internal/core/backends/gcp/auth.py +1 -1
  15. dstack/_internal/core/backends/gcp/compute.py +51 -35
  16. dstack/_internal/core/backends/gcp/resources.py +6 -1
  17. dstack/_internal/core/backends/lambdalabs/compute.py +20 -8
  18. dstack/_internal/core/backends/local/compute.py +2 -0
  19. dstack/_internal/core/backends/nebius/compute.py +95 -1
  20. dstack/_internal/core/backends/nebius/configurator.py +11 -0
  21. dstack/_internal/core/backends/nebius/fabrics.py +47 -0
  22. dstack/_internal/core/backends/nebius/models.py +8 -0
  23. dstack/_internal/core/backends/nebius/resources.py +29 -0
  24. dstack/_internal/core/backends/oci/compute.py +2 -0
  25. dstack/_internal/core/backends/remote/provisioning.py +27 -2
  26. dstack/_internal/core/backends/template/compute.py.jinja +2 -0
  27. dstack/_internal/core/backends/tensordock/compute.py +2 -0
  28. dstack/_internal/core/backends/vastai/compute.py +2 -1
  29. dstack/_internal/core/backends/vultr/compute.py +5 -1
  30. dstack/_internal/core/errors.py +4 -0
  31. dstack/_internal/core/models/fleets.py +2 -0
  32. dstack/_internal/core/models/instances.py +4 -3
  33. dstack/_internal/core/models/resources.py +80 -3
  34. dstack/_internal/core/models/runs.py +10 -3
  35. dstack/_internal/core/models/volumes.py +1 -1
  36. dstack/_internal/server/background/tasks/process_fleets.py +4 -13
  37. dstack/_internal/server/background/tasks/process_instances.py +176 -55
  38. dstack/_internal/server/background/tasks/process_placement_groups.py +1 -1
  39. dstack/_internal/server/background/tasks/process_prometheus_metrics.py +5 -2
  40. dstack/_internal/server/background/tasks/process_submitted_jobs.py +1 -1
  41. dstack/_internal/server/models.py +1 -0
  42. dstack/_internal/server/routers/gateways.py +2 -1
  43. dstack/_internal/server/services/config.py +7 -2
  44. dstack/_internal/server/services/fleets.py +24 -26
  45. dstack/_internal/server/services/gateways/__init__.py +17 -2
  46. dstack/_internal/server/services/instances.py +0 -2
  47. dstack/_internal/server/services/offers.py +15 -0
  48. dstack/_internal/server/services/placement.py +27 -6
  49. dstack/_internal/server/services/plugins.py +77 -0
  50. dstack/_internal/server/services/resources.py +21 -0
  51. dstack/_internal/server/services/runs.py +41 -17
  52. dstack/_internal/server/services/volumes.py +10 -1
  53. dstack/_internal/server/testing/common.py +35 -26
  54. dstack/_internal/utils/common.py +22 -9
  55. dstack/_internal/utils/json_schema.py +6 -3
  56. dstack/api/__init__.py +1 -0
  57. dstack/api/server/__init__.py +8 -1
  58. dstack/api/server/_fleets.py +16 -0
  59. dstack/api/server/_runs.py +44 -3
  60. dstack/plugins/__init__.py +8 -0
  61. dstack/plugins/_base.py +72 -0
  62. dstack/plugins/_models.py +8 -0
  63. dstack/plugins/_utils.py +19 -0
  64. dstack/version.py +1 -1
  65. {dstack-0.19.6rc1.dist-info → dstack-0.19.8.dist-info}/METADATA +14 -2
  66. {dstack-0.19.6rc1.dist-info → dstack-0.19.8.dist-info}/RECORD +69 -62
  67. {dstack-0.19.6rc1.dist-info → dstack-0.19.8.dist-info}/WHEEL +0 -0
  68. {dstack-0.19.6rc1.dist-info → dstack-0.19.8.dist-info}/entry_points.txt +0 -0
  69. {dstack-0.19.6rc1.dist-info → dstack-0.19.8.dist-info}/licenses/LICENSE.md +0 -0

dstack/_internal/core/backends/nebius/compute.py
@@ -1,4 +1,5 @@
  import json
+ import random
  import shlex
  import time
  from functools import cached_property
@@ -13,13 +14,19 @@ from dstack._internal.core.backends.base.backend import Compute
  from dstack._internal.core.backends.base.compute import (
  ComputeWithCreateInstanceSupport,
  ComputeWithMultinodeSupport,
+ ComputeWithPlacementGroupSupport,
  generate_unique_instance_name,
  get_user_data,
  )
  from dstack._internal.core.backends.base.offers import get_catalog_offers
  from dstack._internal.core.backends.nebius import resources
+ from dstack._internal.core.backends.nebius.fabrics import get_suitable_infiniband_fabrics
  from dstack._internal.core.backends.nebius.models import NebiusConfig, NebiusServiceAccountCreds
- from dstack._internal.core.errors import BackendError, NotYetTerminated, ProvisioningError
+ from dstack._internal.core.errors import (
+ BackendError,
+ NotYetTerminated,
+ ProvisioningError,
+ )
  from dstack._internal.core.models.backends.base import BackendType
  from dstack._internal.core.models.common import CoreModel
  from dstack._internal.core.models.instances import (
@@ -28,6 +35,11 @@ from dstack._internal.core.models.instances import (
  InstanceOffer,
  InstanceOfferWithAvailability,
  )
+ from dstack._internal.core.models.placement import (
+ PlacementGroup,
+ PlacementGroupProvisioningData,
+ PlacementStrategy,
+ )
  from dstack._internal.core.models.resources import Memory, Range
  from dstack._internal.core.models.runs import JobProvisioningData, Requirements
  from dstack._internal.utils.logging import get_logger
@@ -72,6 +84,7 @@ SUPPORTED_PLATFORMS = [
  class NebiusCompute(
  ComputeWithCreateInstanceSupport,
  ComputeWithMultinodeSupport,
+ ComputeWithPlacementGroupSupport,
  Compute,
  ):
  def __init__(self, config: NebiusConfig):
@@ -121,6 +134,7 @@ class NebiusCompute(
  self,
  instance_offer: InstanceOfferWithAvailability,
  instance_config: InstanceConfiguration,
+ placement_group: Optional[PlacementGroup],
  ) -> JobProvisioningData:
  # NOTE: This method can block for a long time as it waits for the boot disk to be created
  # and the instance to enter the STARTING state. This has to be done in create_instance so
@@ -128,6 +142,14 @@ class NebiusCompute(
  # instance.
  instance_name = generate_unique_instance_name(instance_config)
  platform, preset = instance_offer.instance.name.split()
+ cluster_id = None
+ if placement_group:
+ assert placement_group.provisioning_data is not None
+ backend_data = NebiusPlacementGroupBackendData.load(
+ placement_group.provisioning_data.backend_data
+ )
+ if backend_data.cluster is not None:
+ cluster_id = backend_data.cluster.id
  create_disk_op = resources.create_disk(
  sdk=self._sdk,
  name=instance_name,
@@ -155,6 +177,7 @@ class NebiusCompute(
  ),
  platform=platform,
  preset=preset,
+ cluster_id=cluster_id,
  disk_id=create_disk_op.resource_id,
  subnet_id=self._get_subnet_id(instance_offer.region),
  )
@@ -230,6 +253,63 @@ class NebiusCompute(
  with resources.ignore_errors([StatusCode.NOT_FOUND]):
  resources.delete_disk(self._sdk, backend_data_parsed.boot_disk_id)

+ def create_placement_group(
+ self,
+ placement_group: PlacementGroup,
+ master_instance_offer: InstanceOffer,
+ ) -> PlacementGroupProvisioningData:
+ assert placement_group.configuration.placement_strategy == PlacementStrategy.CLUSTER
+ backend_data = NebiusPlacementGroupBackendData(cluster=None)
+ # Only create a Nebius cluster if the instance supports it.
+ # For other instances, return dummy PlacementGroupProvisioningData.
+ if fabrics := get_suitable_infiniband_fabrics(
+ master_instance_offer, allowed_fabrics=self.config.fabrics
+ ):
+ fabric = random.choice(fabrics)
+ op = resources.create_cluster(
+ self._sdk,
+ name=placement_group.name,
+ project_id=self._region_to_project_id[placement_group.configuration.region],
+ fabric=fabric,
+ )
+ backend_data.cluster = NebiusClusterBackendData(id=op.resource_id, fabric=fabric)
+ return PlacementGroupProvisioningData(
+ backend=BackendType.NEBIUS,
+ backend_data=backend_data.json(),
+ )
+
+ def delete_placement_group(self, placement_group: PlacementGroup) -> None:
+ assert placement_group.provisioning_data is not None
+ backend_data = NebiusPlacementGroupBackendData.load(
+ placement_group.provisioning_data.backend_data
+ )
+ if backend_data.cluster is not None:
+ with resources.ignore_errors([StatusCode.NOT_FOUND]):
+ resources.delete_cluster(self._sdk, backend_data.cluster.id)
+
+ def is_suitable_placement_group(
+ self,
+ placement_group: PlacementGroup,
+ instance_offer: InstanceOffer,
+ ) -> bool:
+ if not (
+ placement_group.configuration.backend == BackendType.NEBIUS
+ and placement_group.configuration.region == instance_offer.region
+ ):
+ return False
+ assert placement_group.provisioning_data is not None
+ backend_data = NebiusPlacementGroupBackendData.load(
+ placement_group.provisioning_data.backend_data
+ )
+ return (
+ backend_data.cluster is None
+ or backend_data.cluster.fabric
+ in get_suitable_infiniband_fabrics(
+ instance_offer,
+ allowed_fabrics=None, # enforced at cluster creation time, no need to enforce here
+ )
+ )
+

  class NebiusInstanceBackendData(CoreModel):
  boot_disk_id: str
@@ -240,6 +320,20 @@ class NebiusInstanceBackendData(CoreModel):
  return cls.__response__.parse_raw(raw)


+ class NebiusClusterBackendData(CoreModel):
+ id: str
+ fabric: str
+
+
+ class NebiusPlacementGroupBackendData(CoreModel):
+ cluster: Optional[NebiusClusterBackendData]
+
+ @classmethod
+ def load(cls, raw: Optional[str]) -> "NebiusPlacementGroupBackendData":
+ assert raw is not None
+ return cls.__response__.parse_raw(raw)
+
+
  def _wait_for_instance(sdk: SDK, op: SDKOperation[Operation]) -> None:
  start = time.monotonic()
  while True:

dstack/_internal/core/backends/nebius/configurator.py
@@ -9,6 +9,7 @@ from dstack._internal.core.backends.base.configurator import (
  )
  from dstack._internal.core.backends.nebius import resources
  from dstack._internal.core.backends.nebius.backend import NebiusBackend
+ from dstack._internal.core.backends.nebius.fabrics import get_all_infiniband_fabrics
  from dstack._internal.core.backends.nebius.models import (
  AnyNebiusBackendConfig,
  NebiusBackendConfig,
@@ -38,6 +39,16 @@ class NebiusConfigurator(Configurator):
  fields=[["creds"]],
  details=str(e),
  )
+ valid_fabrics = get_all_infiniband_fabrics()
+ if invalid_fabrics := set(config.fabrics or []) - valid_fabrics:
+ raise_invalid_credentials_error(
+ fields=[["fabrics"]],
+ details=(
+ "These InfiniBand fabrics do not exist or are not known to dstack:"
+ f" {sorted(invalid_fabrics)}. Omit `fabrics` to allow all fabrics or select"
+ f" some of the valid options: {sorted(valid_fabrics)}"
+ ),
+ )

  def create_backend(
  self, project_name: str, config: NebiusBackendConfigWithCreds

dstack/_internal/core/backends/nebius/fabrics.py (new file)
@@ -0,0 +1,47 @@
+ from collections.abc import Container
+ from dataclasses import dataclass
+ from typing import Optional
+
+ from dstack._internal.core.models.instances import InstanceOffer
+
+
+ @dataclass(frozen=True)
+ class InfinibandFabric:
+ name: str
+ platform: str
+ region: str
+
+
+ # https://docs.nebius.com/compute/clusters/gpu#fabrics
+ INFINIBAND_FABRICS = [
+ InfinibandFabric("fabric-2", "gpu-h100-sxm", "eu-north1"),
+ InfinibandFabric("fabric-3", "gpu-h100-sxm", "eu-north1"),
+ InfinibandFabric("fabric-4", "gpu-h100-sxm", "eu-north1"),
+ InfinibandFabric("fabric-5", "gpu-h200-sxm", "eu-west1"),
+ InfinibandFabric("fabric-6", "gpu-h100-sxm", "eu-north1"),
+ InfinibandFabric("fabric-7", "gpu-h200-sxm", "eu-north1"),
+ ]
+
+
+ def get_suitable_infiniband_fabrics(
+ offer: InstanceOffer, allowed_fabrics: Optional[Container[str]]
+ ) -> list[str]:
+ if len(offer.instance.resources.gpus) < 8:
+ # From the create VM page in the Nebius Console:
+ # > Only virtual machines with at least 8 NVIDIA® Hopper® H100 or H200 GPUs
+ # > can be added to the cluster
+ return []
+ platform, _ = offer.instance.name.split()
+ return [
+ f.name
+ for f in INFINIBAND_FABRICS
+ if (
+ f.platform == platform
+ and f.region == offer.region
+ and (allowed_fabrics is None or f.name in allowed_fabrics)
+ )
+ ]
+
+
+ def get_all_infiniband_fabrics() -> set[str]:
+ return {f.name for f in INFINIBAND_FABRICS}
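
Note: a minimal sketch of how the new fabric helpers behave, using a duck-typed stand-in for InstanceOffer (the region, preset string, and GPU count below are illustrative, not real catalog data):

    from types import SimpleNamespace

    from dstack._internal.core.backends.nebius.fabrics import (
        get_all_infiniband_fabrics,
        get_suitable_infiniband_fabrics,
    )

    # Stand-in offer: only the attributes the helpers read are filled in
    # (instance.name, instance.resources.gpus, region).
    offer = SimpleNamespace(
        region="eu-north1",
        instance=SimpleNamespace(
            name="gpu-h100-sxm 8gpu-128vcpu-1600gb",
            resources=SimpleNamespace(gpus=[object()] * 8),
        ),
    )

    print(get_suitable_infiniband_fabrics(offer, allowed_fabrics=None))
    # ['fabric-2', 'fabric-3', 'fabric-4', 'fabric-6']
    print(get_suitable_infiniband_fabrics(offer, allowed_fabrics={"fabric-2"}))
    # ['fabric-2']
    print(sorted(get_all_infiniband_fabrics()))
    # ['fabric-2', 'fabric-3', 'fabric-4', 'fabric-5', 'fabric-6', 'fabric-7']

Offers with fewer than 8 GPUs get no fabric, so create_placement_group above falls back to the dummy placement group data and no GPU cluster is created.
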

dstack/_internal/core/backends/nebius/models.py
@@ -87,6 +87,14 @@ class NebiusBackendConfig(CoreModel):
  Optional[list[str]],
  Field(description="The list of allowed Nebius regions. Omit to allow all regions"),
  ] = None
+ fabrics: Annotated[
+ Optional[list[str]],
+ Field(
+ description=(
+ "The list of allowed fabrics for InfiniBand clusters. Omit to allow all fabrics"
+ )
+ ),
+ ] = None


  class NebiusBackendConfigWithCreds(NebiusBackendConfig):
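
Note: with this field, a Nebius backend entry in the server's config.yml can restrict which InfiniBand fabrics dstack may create GPU clusters on. A hypothetical fragment (only `regions` and `fabrics` come from this diff; the surrounding keys follow dstack's usual backend config layout and are assumptions):

    projects:
    - name: main
      backends:
      - type: nebius
        creds:
          type: service_account
          # ...
        regions: ["eu-north1"]
        # Restrict GPU cluster creation to specific fabrics; omit to allow all.
        fabrics: ["fabric-2", "fabric-4"]
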

dstack/_internal/core/backends/nebius/resources.py
@@ -15,14 +15,19 @@ from nebius.api.nebius.common.v1 import Operation, ResourceMetadata
  from nebius.api.nebius.compute.v1 import (
  AttachedDiskSpec,
  CreateDiskRequest,
+ CreateGpuClusterRequest,
  CreateInstanceRequest,
  DeleteDiskRequest,
+ DeleteGpuClusterRequest,
  DeleteInstanceRequest,
  DiskServiceClient,
  DiskSpec,
  ExistingDisk,
  GetInstanceRequest,
+ GpuClusterServiceClient,
+ GpuClusterSpec,
  Instance,
+ InstanceGpuClusterSpec,
  InstanceServiceClient,
  InstanceSpec,
  IPAddress,
@@ -275,6 +280,7 @@ def create_instance(
  user_data: str,
  platform: str,
  preset: str,
+ cluster_id: Optional[str],
  disk_id: str,
  subnet_id: str,
  ) -> SDKOperation[Operation]:
@@ -287,6 +293,7 @@ def create_instance(
  spec=InstanceSpec(
  cloud_init_user_data=user_data,
  resources=ResourcesSpec(platform=platform, preset=preset),
+ gpu_cluster=InstanceGpuClusterSpec(id=cluster_id) if cluster_id is not None else None,
  boot_disk=AttachedDiskSpec(
  attach_mode=AttachedDiskSpec.AttachMode.READ_WRITE,
  existing_disk=ExistingDisk(id=disk_id),
@@ -319,3 +326,25 @@ def delete_instance(sdk: SDK, instance_id: str) -> SDKOperation[Operation]:
  DeleteInstanceRequest(id=instance_id), timeout=REQUEST_TIMEOUT, metadata=REQUEST_MD
  )
  )
+
+
+ def create_cluster(sdk: SDK, name: str, project_id: str, fabric: str) -> SDKOperation[Operation]:
+ with wrap_capacity_errors():
+ return LOOP.await_(
+ GpuClusterServiceClient(sdk).create(
+ CreateGpuClusterRequest(
+ metadata=ResourceMetadata(name=name, parent_id=project_id),
+ spec=GpuClusterSpec(infiniband_fabric=fabric),
+ ),
+ timeout=REQUEST_TIMEOUT,
+ metadata=REQUEST_MD,
+ )
+ )
+
+
+ def delete_cluster(sdk: SDK, cluster_id: str) -> None:
+ return LOOP.await_(
+ GpuClusterServiceClient(sdk).delete(
+ DeleteGpuClusterRequest(id=cluster_id), timeout=REQUEST_TIMEOUT, metadata=REQUEST_MD
+ )
+ )

dstack/_internal/core/backends/oci/compute.py
@@ -23,6 +23,7 @@ from dstack._internal.core.models.instances import (
  InstanceOffer,
  InstanceOfferWithAvailability,
  )
+ from dstack._internal.core.models.placement import PlacementGroup
  from dstack._internal.core.models.resources import Memory, Range
  from dstack._internal.core.models.runs import JobProvisioningData, Requirements

@@ -105,6 +106,7 @@ class OCICompute(
  self,
  instance_offer: InstanceOfferWithAvailability,
  instance_config: InstanceConfiguration,
+ placement_group: Optional[PlacementGroup],
  ) -> JobProvisioningData:
  region = self.regions[instance_offer.region]


dstack/_internal/core/backends/remote/provisioning.py
@@ -6,8 +6,9 @@ from textwrap import dedent
  from typing import Any, Dict, Generator, List, Optional

  import paramiko
- from gpuhunt import AcceleratorVendor, correct_gpu_memory_gib
+ from gpuhunt import AcceleratorVendor, CPUArchitecture, correct_gpu_memory_gib

+ from dstack._internal.core.backends.base.compute import GoArchType, normalize_arch
  from dstack._internal.core.consts import DSTACK_SHIM_HTTP_PORT

  # FIXME: ProvisioningError is a subclass of ComputeError and should not be used outside of Compute
@@ -36,6 +37,22 @@ DSTACK_SHIM_ENV_FILE = "shim.env"
  HOST_INFO_FILE = "host_info.json"


+ def detect_cpu_arch(client: paramiko.SSHClient) -> GoArchType:
+ cmd = "uname -m"
+ try:
+ _, stdout, stderr = client.exec_command(cmd, timeout=20)
+ except (paramiko.SSHException, OSError) as e:
+ raise ProvisioningError(f"detect_cpu_arch: {e}") from e
+ out = stdout.read().strip().decode()
+ err = stderr.read().strip().decode()
+ if err:
+ raise ProvisioningError(f"detect_cpu_arch: {cmd} failed, stdout: {out}, stderr: {err}")
+ try:
+ return normalize_arch(out)
+ except ValueError as e:
+ raise ProvisioningError(f"detect_cpu_arch: failed to normalize arch: {e}") from e
+
+
  def sftp_upload(client: paramiko.SSHClient, path: str, body: str) -> None:
  try:
  sftp = client.open_sftp()
@@ -226,7 +243,14 @@ def get_shim_healthcheck(client: paramiko.SSHClient) -> str:
  raise ProvisioningError(f"get_shim_healthcheck failed: {e}") from e


- def host_info_to_instance_type(host_info: Dict[str, Any]) -> InstanceType:
+ def host_info_to_instance_type(host_info: Dict[str, Any], cpu_arch: GoArchType) -> InstanceType:
+ _cpu_arch: CPUArchitecture
+ if cpu_arch == "amd64":
+ _cpu_arch = CPUArchitecture.X86
+ elif cpu_arch == "arm64":
+ _cpu_arch = CPUArchitecture.ARM
+ else:
+ raise ValueError(f"Unexpected cpu_arch: {cpu_arch}")
  gpu_count = host_info.get("gpu_count", 0)
  if gpu_count > 0:
  gpu_vendor = AcceleratorVendor.cast(host_info.get("gpu_vendor", "nvidia"))
@@ -251,6 +275,7 @@ def host_info_to_instance_type(host_info: Dict[str, Any]) -> InstanceType:
  instance_type = InstanceType(
  name="instance",
  resources=Resources(
+ cpu_arch=_cpu_arch,
  cpus=host_info["cpus"],
  memory_mib=host_info["memory"] / 1024 / 1024,
  spot=False,
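
Note: normalize_arch is imported from dstack/_internal/core/backends/base/compute.py and is not part of this diff. Based on how detect_cpu_arch and host_info_to_instance_type use it, it presumably maps `uname -m` output to the Go-style arch strings; a rough sketch of that assumed mapping:

    def normalize_arch_sketch(machine: str) -> str:
        # Assumed behavior only; the real normalize_arch may accept more aliases.
        machine = machine.strip().lower()
        if machine in ("x86_64", "amd64"):
            return "amd64"
        if machine in ("aarch64", "arm64"):
            return "arm64"
        raise ValueError(f"unsupported CPU architecture: {machine}")
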

dstack/_internal/core/backends/template/compute.py.jinja
@@ -18,6 +18,7 @@ from dstack._internal.core.models.instances import (
  InstanceConfiguration,
  InstanceOfferWithAvailability,
  )
+ from dstack._internal.core.models.placement import PlacementGroup
  from dstack._internal.core.models.runs import Job, JobProvisioningData, Requirements, Run
  from dstack._internal.core.models.volumes import Volume
  from dstack._internal.utils.logging import get_logger
@@ -64,6 +65,7 @@ class {{ backend_name }}Compute(
  self,
  instance_offer: InstanceOfferWithAvailability,
  instance_config: InstanceConfiguration,
+ placement_group: Optional[PlacementGroup],
  ) -> JobProvisioningData:
  # TODO: Implement if backend supports creating instances (VM-based).
  # Delete if backend can only run jobs (container-based).

dstack/_internal/core/backends/tensordock/compute.py
@@ -19,6 +19,7 @@ from dstack._internal.core.models.instances import (
  InstanceConfiguration,
  InstanceOfferWithAvailability,
  )
+ from dstack._internal.core.models.placement import PlacementGroup
  from dstack._internal.core.models.runs import JobProvisioningData, Requirements
  from dstack._internal.utils.logging import get_logger

@@ -57,6 +58,7 @@ class TensorDockCompute(
  self,
  instance_offer: InstanceOfferWithAvailability,
  instance_config: InstanceConfiguration,
+ placement_group: Optional[PlacementGroup],
  ) -> JobProvisioningData:
  instance_name = generate_unique_instance_name(
  instance_config, max_length=MAX_INSTANCE_NAME_LEN

dstack/_internal/core/backends/vastai/compute.py
@@ -43,7 +43,8 @@ class VastAICompute(Compute):
  "reliability2": {"gte": 0.9},
  "inet_down": {"gt": 128},
  "verified": {"eq": True},
- "cuda_max_good": {"gte": 11.8},
+ "cuda_max_good": {"gte": 12.1},
+ "compute_cap": {"gte": 600},
  }
  )
  )

dstack/_internal/core/backends/vultr/compute.py
@@ -22,6 +22,7 @@ from dstack._internal.core.models.instances import (
  InstanceOffer,
  InstanceOfferWithAvailability,
  )
+ from dstack._internal.core.models.placement import PlacementGroup
  from dstack._internal.core.models.runs import JobProvisioningData, Requirements
  from dstack._internal.utils.logging import get_logger

@@ -58,7 +59,10 @@ class VultrCompute(
  return offers

  def create_instance(
- self, instance_offer: InstanceOfferWithAvailability, instance_config: InstanceConfiguration
+ self,
+ instance_offer: InstanceOfferWithAvailability,
+ instance_config: InstanceConfiguration,
+ placement_group: Optional[PlacementGroup],
  ) -> JobProvisioningData:
  instance_name = generate_unique_instance_name(
  instance_config, max_length=MAX_INSTANCE_NAME_LEN

dstack/_internal/core/errors.py
@@ -22,6 +22,10 @@ class URLNotFoundError(ClientError):
  pass


+ class MethodNotAllowedError(ClientError):
+ pass
+
+
  class ServerClientErrorCode(str, enum.Enum):
  UNSPECIFIED_ERROR = "error"
  RESOURCE_EXISTS = "resource_exists"

dstack/_internal/core/models/fleets.py
@@ -269,6 +269,8 @@ class FleetSpec(CoreModel):
  configuration_path: Optional[str] = None
  profile: Profile
  autocreated: bool = False
+ # merged_profile stores profile parameters merged from profile and configuration.
+ # Read profile parameters from merged_profile instead of profile directly.
  # TODO: make merged_profile a computed field after migrating to pydanticV2
  merged_profile: Annotated[Profile, Field(exclude=True)] = None


dstack/_internal/core/models/instances.py
@@ -49,15 +49,17 @@ class Resources(CoreModel):
  spot: bool
  disk: Disk = Disk(size_mib=102400) # the default value (100GB) for backward compatibility
  description: str = ""
+ cpu_arch: Optional[gpuhunt.CPUArchitecture] = None

  def pretty_format(self, include_spot: bool = False) -> str:
  resources = {}
  if self.cpus > 0:
  resources["cpus"] = self.cpus
+ resources["cpu_arch"] = self.cpu_arch
  if self.memory_mib > 0:
  resources["memory"] = f"{self.memory_mib / 1024:.0f}GB"
  if self.disk.size_mib > 0:
- resources["disk_size"] = f"{self.disk.size_mib / 1024:.1f}GB"
+ resources["disk_size"] = f"{self.disk.size_mib / 1024:.0f}GB"
  if self.gpus:
  gpu = self.gpus[0]
  resources["gpu_name"] = gpu.name
@@ -66,7 +68,7 @@ class Resources(CoreModel):
  resources["gpu_memory"] = f"{gpu.memory_mib / 1024:.0f}GB"
  output = pretty_resources(**resources)
  if include_spot and self.spot:
- output += ", SPOT"
+ output += " (spot)"
  return output


@@ -105,7 +107,6 @@ class InstanceConfiguration(CoreModel):
  user: str # dstack user name
  ssh_keys: List[SSHKey]
  instance_id: Optional[str] = None
- placement_group_name: Optional[str] = None
  reservation: Optional[str] = None
  volumes: Optional[List[Volume]] = None
  tags: Optional[Dict[str, str]] = None

dstack/_internal/core/models/resources.py
@@ -1,8 +1,9 @@
  import math
+ from collections.abc import Mapping
  from typing import Any, Dict, Generic, List, Optional, Tuple, TypeVar, Union

  import gpuhunt
- from pydantic import Field, root_validator, validator
+ from pydantic import Field, parse_obj_as, root_validator, validator
  from pydantic.generics import GenericModel
  from typing_extensions import Annotated

@@ -128,6 +129,67 @@ DEFAULT_MEMORY_SIZE = Range[Memory](min=Memory.parse("8GB"))
  DEFAULT_GPU_COUNT = Range[int](min=1, max=1)


+ class CPUSpec(CoreModel):
+ class Config:
+ @staticmethod
+ def schema_extra(schema: Dict[str, Any]):
+ add_extra_schema_types(
+ schema["properties"]["count"],
+ extra_types=[{"type": "integer"}, {"type": "string"}],
+ )
+
+ arch: Annotated[
+ Optional[gpuhunt.CPUArchitecture],
+ Field(description="The CPU architecture, one of: `x86`, `arm`"),
+ ] = None
+ count: Annotated[Range[int], Field(description="The number of CPU cores")] = DEFAULT_CPU_COUNT
+
+ @classmethod
+ def __get_validators__(cls):
+ yield cls.parse
+ yield cls.validate
+
+ @classmethod
+ def parse(cls, v: Any) -> Any:
+ if isinstance(v, int):
+ v = str(v)
+ if isinstance(v, str):
+ tokens = v.replace(" ", "").split(":")
+ spec = {}
+ for token in tokens:
+ if not token:
+ raise ValueError(f"CPU spec contains empty token: {v}")
+ if ".." in token or token.isdigit():
+ if "count" in spec:
+ raise ValueError(f"CPU spec count conflict: {v}")
+ spec["count"] = token
+ else:
+ try:
+ arch = gpuhunt.CPUArchitecture.cast(token)
+ except ValueError:
+ raise ValueError(f"Invalid CPU architecture: {v}")
+ if "arch" in spec:
+ raise ValueError(f"CPU spec arch conflict: {v}")
+ spec["arch"] = arch
+ return spec
+ # Range and min/max dict - for backward compatibility
+ if isinstance(v, Range):
+ return {"arch": None, "count": v}
+ if isinstance(v, Mapping) and v.keys() == {"min", "max"}:
+ return {"arch": None, "count": v}
+ return v
+
+ @validator("arch", pre=True)
+ def _validate_arch(cls, v: Any) -> Any:
+ if v is None:
+ return None
+ if isinstance(v, gpuhunt.CPUArchitecture):
+ return v
+ if isinstance(v, str):
+ return gpuhunt.CPUArchitecture.cast(v)
+ return v
+
+
  class GPUSpec(CoreModel):
  class Config:
  @staticmethod
@@ -246,6 +308,8 @@ class GPUSpec(CoreModel):
  v = v.lower()
  if v == "tpu":
  return gpuhunt.AcceleratorVendor.GOOGLE
+ if v == "tt":
+ return gpuhunt.AcceleratorVendor.TENSTORRENT
  return gpuhunt.AcceleratorVendor.cast(v)


@@ -300,7 +364,10 @@ class ResourcesSpec(CoreModel):
  extra_types=[{"type": "integer"}, {"type": "string"}],
  )

- cpu: Annotated[Range[int], Field(description="The number of CPU cores")] = DEFAULT_CPU_COUNT
+ # TODO: Remove Range[int] in 0.20. Range[int] for backward compatibility only.
+ cpu: Annotated[Union[CPUSpec, Range[int]], Field(description="The CPU requirements")] = (
+ CPUSpec()
+ )
  memory: Annotated[Range[Memory], Field(description="The RAM size (e.g., `8GB`)")] = (
  DEFAULT_MEMORY_SIZE
  )
@@ -315,8 +382,18 @@ class ResourcesSpec(CoreModel):
  gpu: Annotated[Optional[GPUSpec], Field(description="The GPU requirements")] = None
  disk: Annotated[Optional[DiskSpec], Field(description="The disk resources")] = DEFAULT_DISK

+ # TODO: Remove in 0.20. Added for backward compatibility.
+ @root_validator
+ def _post_validate(cls, values):
+ cpu = values.get("cpu")
+ if isinstance(cpu, CPUSpec) and cpu.arch in [None, gpuhunt.CPUArchitecture.X86]:
+ values["cpu"] = cpu.count
+ return values
+
  def pretty_format(self) -> str:
- resources: Dict[str, Any] = dict(cpus=self.cpu, memory=self.memory)
+ # TODO: Remove in 0.20. Use self.cpu directly
+ cpu = parse_obj_as(CPUSpec, self.cpu)
+ resources: Dict[str, Any] = dict(cpu_arch=cpu.arch, cpus=cpu.count, memory=self.memory)
  if self.gpu:
  gpu = self.gpu
  resources.update(
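
Note: the new CPUSpec accepts both the old count-only form and an `arch:count` string. A short sketch of the accepted spellings, using the ResourcesSpec model from this file (exact Range semantics follow dstack's existing Range[int] parsing):

    from dstack._internal.core.models.resources import ResourcesSpec

    ResourcesSpec(cpu=4)            # old form: plain count, still accepted
    ResourcesSpec(cpu="4..16")      # old form: count range
    ResourcesSpec(cpu="arm:8..")    # new form: at least 8 ARM cores
    ResourcesSpec(cpu="x86:4..16")  # new form: 4 to 16 x86 cores

    # Per the _post_validate root validator above, specs whose arch is unset or x86
    # are collapsed back to a bare count range for backward compatibility.
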

dstack/_internal/core/models/runs.py
@@ -162,7 +162,7 @@ class Requirements(CoreModel):
  if self.spot is not None:
  res += f", {'spot' if self.spot else 'on-demand'}"
  if self.max_price is not None:
- res += f" under ${self.max_price:g} per hour"
+ res += f" under ${self.max_price:3f}".rstrip("0").rstrip(".") + " per hour"
  return res


@@ -357,6 +357,8 @@ class RunSpec(CoreModel):
  description="The contents of the SSH public key that will be used to connect to the run."
  ),
  ]
+ # merged_profile stores profile parameters merged from profile and configuration.
+ # Read profile parameters from merged_profile instead of profile directly.
  # TODO: make merged_profile a computed field after migrating to pydanticV2
  merged_profile: Annotated[Profile, Field(exclude=True)] = None

@@ -437,9 +439,14 @@ class Run(CoreModel):

  @root_validator
  def _error(cls, values) -> Dict:
+ try:
+ termination_reason = values["termination_reason"]
+ jobs = values["jobs"]
+ except KeyError:
+ return values
  values["error"] = _get_run_error(
- run_termination_reason=values["termination_reason"],
- run_jobs=values["jobs"],
+ run_termination_reason=termination_reason,
+ run_jobs=jobs,
  )
  return values


dstack/_internal/core/models/volumes.py
@@ -159,7 +159,7 @@ class VolumeMountPoint(CoreModel):
  description=(
  "The network volume name or the list of network volume names to mount."
  " If a list is specified, one of the volumes in the list will be mounted."
- " Specify volumes from different backends/regions to increase availability."
+ " Specify volumes from different backends/regions to increase availability"
  )
  ),
  ]