dstack 0.19.31__py3-none-any.whl → 0.19.32__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dstack/_internal/core/backends/hotaisle/compute.py +20 -0
- dstack/_internal/core/backends/kubernetes/compute.py +70 -7
- dstack/_internal/core/backends/nebius/compute.py +15 -0
- dstack/_internal/core/backends/nebius/configurator.py +15 -0
- dstack/_internal/core/backends/nebius/models.py +57 -5
- dstack/_internal/core/backends/nebius/resources.py +45 -2
- dstack/_internal/core/compatibility/runs.py +8 -0
- dstack/_internal/core/models/profiles.py +11 -4
- dstack/_internal/server/background/tasks/process_fleets.py +75 -17
- dstack/_internal/server/background/tasks/process_instances.py +1 -3
- dstack/_internal/server/background/tasks/process_runs.py +27 -23
- dstack/_internal/server/background/tasks/process_submitted_jobs.py +63 -20
- dstack/version.py +1 -1
- {dstack-0.19.31.dist-info → dstack-0.19.32.dist-info}/METADATA +11 -13
- {dstack-0.19.31.dist-info → dstack-0.19.32.dist-info}/RECORD +18 -18
- {dstack-0.19.31.dist-info → dstack-0.19.32.dist-info}/WHEEL +0 -0
- {dstack-0.19.31.dist-info → dstack-0.19.32.dist-info}/entry_points.txt +0 -0
- {dstack-0.19.31.dist-info → dstack-0.19.32.dist-info}/licenses/LICENSE.md +0 -0
dstack/_internal/core/backends/hotaisle/compute.py CHANGED
@@ -42,6 +42,26 @@ INSTANCE_TYPE_SPECS = {
         "cpu_frequency": 2000000000,
         "cpu_manufacturer": "Intel",
     },
+    "2x MI300X 26x Xeon Platinum 8470": {
+        "cpu_model": "Xeon Platinum 8470",
+        "cpu_frequency": 2000000000,
+        "cpu_manufacturer": "Intel",
+    },
+    "2x MI300X 26x Xeon Platinum 8462Y+": {
+        "cpu_model": "Xeon Platinum 8462Y+",
+        "cpu_frequency": 2800000000,
+        "cpu_manufacturer": "Intel",
+    },
+    "4x MI300X 52x Xeon Platinum 8462Y": {
+        "cpu_model": "Xeon Platinum 8470",
+        "cpu_frequency": 2000000000,
+        "cpu_manufacturer": "Intel",
+    },
+    "4x MI300X 52x Xeon Platinum 8462Y+": {
+        "cpu_model": "Xeon Platinum 8462Y+",
+        "cpu_frequency": 2800000000,
+        "cpu_manufacturer": "Intel",
+    },
 }
 
 
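For reference, the `cpu_frequency` values in these specs are plain hertz. A quick illustrative sketch of converting one of the added entries for display (the dict literal is copied from the diff above):

    spec = {
        "cpu_model": "Xeon Platinum 8462Y+",
        "cpu_frequency": 2800000000,  # Hz
        "cpu_manufacturer": "Intel",
    }
    print(f"{spec['cpu_model']}: {spec['cpu_frequency'] / 1e9:.1f} GHz")  # 2.8 GHz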
dstack/_internal/core/backends/kubernetes/compute.py CHANGED
@@ -2,6 +2,7 @@ import subprocess
 import tempfile
 import threading
 import time
+from enum import Enum
 from typing import List, Optional, Tuple
 
 from gpuhunt import KNOWN_NVIDIA_GPUS, AcceleratorVendor
@@ -62,9 +63,28 @@ JUMP_POD_SSH_PORT = 22
 NVIDIA_GPU_NAME_TO_GPU_INFO = {gpu.name: gpu for gpu in KNOWN_NVIDIA_GPUS}
 NVIDIA_GPU_NAMES = NVIDIA_GPU_NAME_TO_GPU_INFO.keys()
 
+NVIDIA_GPU_RESOURCE = "nvidia.com/gpu"
+NVIDIA_GPU_COUNT_LABEL = f"{NVIDIA_GPU_RESOURCE}.count"
+NVIDIA_GPU_PRODUCT_LABEL = f"{NVIDIA_GPU_RESOURCE}.product"
+NVIDIA_GPU_NODE_TAINT = NVIDIA_GPU_RESOURCE
+
+# Taints we know and tolerate when creating our objects, e.g., the jump pod.
+TOLERATED_NODE_TAINTS = (NVIDIA_GPU_NODE_TAINT,)
+
 DUMMY_REGION = "-"
 
 
+class Operator(str, Enum):
+    EXISTS = "Exists"
+    IN = "In"
+
+
+class TaintEffect(str, Enum):
+    NO_EXECUTE = "NoExecute"
+    NO_SCHEDULE = "NoSchedule"
+    PREFER_NO_SCHEDULE = "PreferNoSchedule"
+
+
 class KubernetesCompute(
     ComputeWithFilteredOffersCached,
     ComputeWithPrivilegedSupport,
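Because `Operator` and `TaintEffect` subclass `str`, their members compare equal to the raw strings the Kubernetes API returns, which is why later checks like `effect == TaintEffect.PREFER_NO_SCHEDULE` work against plain-string taint effects. A minimal standalone check:

    from enum import Enum

    class TaintEffect(str, Enum):
        NO_EXECUTE = "NoExecute"
        NO_SCHEDULE = "NoSchedule"
        PREFER_NO_SCHEDULE = "PreferNoSchedule"

    # Strings coming back from the API compare equal to the enum members:
    assert TaintEffect.PREFER_NO_SCHEDULE == "PreferNoSchedule"
    assert "NoSchedule" == TaintEffect.NO_SCHEDULE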
@@ -181,6 +201,7 @@ class KubernetesCompute(
         resources_requests: dict[str, str] = {}
         resources_limits: dict[str, str] = {}
         node_affinity: Optional[client.V1NodeAffinity] = None
+        tolerations: list[client.V1Toleration] = []
         volumes_: list[client.V1Volume] = []
         volume_mounts: list[client.V1VolumeMount] = []
 
@@ -226,21 +247,28 @@ class KubernetesCompute(
                 "Requesting %d GPU(s), node labels: %s", gpu_min, matching_gpu_label_values
             )
             # TODO: support other GPU vendors
-            resources_requests[
-            resources_limits[
+            resources_requests[NVIDIA_GPU_RESOURCE] = str(gpu_min)
+            resources_limits[NVIDIA_GPU_RESOURCE] = str(gpu_min)
             node_affinity = client.V1NodeAffinity(
                 required_during_scheduling_ignored_during_execution=[
                     client.V1NodeSelectorTerm(
                         match_expressions=[
                             client.V1NodeSelectorRequirement(
-                                key=
-                                operator=
+                                key=NVIDIA_GPU_PRODUCT_LABEL,
+                                operator=Operator.IN,
                                 values=list(matching_gpu_label_values),
                             ),
                         ],
                     ),
                 ],
             )
+            # It should be NoSchedule, but we also add NoExecute toleration just in case.
+            for effect in [TaintEffect.NO_SCHEDULE, TaintEffect.NO_EXECUTE]:
+                tolerations.append(
+                    client.V1Toleration(
+                        key=NVIDIA_GPU_NODE_TAINT, operator=Operator.EXISTS, effect=effect
+                    )
+                )
 
         if (memory_min := resources_spec.memory.min) is not None:
             resources_requests["memory"] = _render_memory(memory_min)
@@ -304,6 +332,7 @@ class KubernetesCompute(
                     )
                 ],
                 affinity=node_affinity,
+                tolerations=tolerations,
                 volumes=volumes_,
             ),
         )
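For a job that requests GPUs, the pod now carries tolerations equivalent to the following sketch of the serialized pod-spec fragment (plain dicts are used here instead of the kubernetes client models, for illustration only):

    NVIDIA_GPU_NODE_TAINT = "nvidia.com/gpu"

    tolerations = [
        {"key": NVIDIA_GPU_NODE_TAINT, "operator": "Exists", "effect": effect}
        for effect in ("NoSchedule", "NoExecute")
    ]
    # Each entry lets the pod schedule onto (and keep running on) nodes
    # tainted with nvidia.com/gpu, a taint commonly applied by GPU operators.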
@@ -527,8 +556,8 @@ def _get_gpus_from_node_labels(labels: dict[str, str]) -> tuple[list[Gpu], Optio
     # "A100" but a product name like "Tesla-T4" or "A100-SXM4-40GB".
     # Thus, we convert the product name to a known gpu name.
     # TODO: support other GPU vendors
-    gpu_count = labels.get(
-    gpu_product = labels.get(
+    gpu_count = labels.get(NVIDIA_GPU_COUNT_LABEL)
+    gpu_product = labels.get(NVIDIA_GPU_PRODUCT_LABEL)
     if gpu_count is None or gpu_product is None:
         return [], None
     gpu_count = int(gpu_count)
@@ -647,6 +676,39 @@ def _create_jump_pod_service(
         namespace=namespace,
         name=pod_name,
     )
+
+    node_list = call_api_method(api.list_node, client.V1NodeList)
+    nodes = get_value(node_list, ".items", list[client.V1Node], required=True)
+    # False if we found at least one node without any "hard" taint, that is, if we don't need to
+    # specify the toleration.
+    toleration_required = True
+    # (key, effect) pairs.
+    tolerated_taints: set[tuple[str, str]] = set()
+    for node in nodes:
+        # True if the node has at least one NoExecute or NoSchedule taint.
+        has_hard_taint = False
+        taints = get_value(node, ".spec.taints", list[client.V1Taint]) or []
+        for taint in taints:
+            effect = get_value(taint, ".effect", str, required=True)
+            # A "soft" taint, ignore.
+            if effect == TaintEffect.PREFER_NO_SCHEDULE:
+                continue
+            has_hard_taint = True
+            key = get_value(taint, ".key", str, required=True)
+            if key in TOLERATED_NODE_TAINTS:
+                tolerated_taints.add((key, effect))
+        if not has_hard_taint:
+            toleration_required = False
+            break
+    tolerations: list[client.V1Toleration] = []
+    if toleration_required:
+        for key, effect in tolerated_taints:
+            tolerations.append(
+                client.V1Toleration(key=key, operator=Operator.EXISTS, effect=effect)
+            )
+        if not tolerations:
+            logger.warning("No appropriate node found, the jump pod may never be scheduled")
+
     commands = _get_jump_pod_commands(authorized_keys=ssh_public_keys)
     pod = client.V1Pod(
         metadata=client.V1ObjectMeta(
@@ -667,7 +729,8 @@ def _create_jump_pod_service(
                     )
                 ],
             )
-            ]
+            ],
+            tolerations=tolerations,
         ),
     )
    call_api_method(
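The node scan above decides whether the jump pod needs tolerations at all: if any node is schedulable without tolerating a hard taint, none are added. A self-contained sketch of the same decision over plain dicts (the node and taint shapes are simplified stand-ins for the kubernetes client models):

    TOLERATED_NODE_TAINTS = ("nvidia.com/gpu",)

    def jump_pod_tolerations(nodes: list[dict]) -> list[dict]:
        tolerated: set[tuple[str, str]] = set()
        for node in nodes:
            # PreferNoSchedule is a "soft" taint and does not block scheduling.
            hard = [t for t in node.get("taints", []) if t["effect"] != "PreferNoSchedule"]
            if not hard:
                return []  # at least one untainted node exists; no tolerations needed
            for t in hard:
                if t["key"] in TOLERATED_NODE_TAINTS:
                    tolerated.add((t["key"], t["effect"]))
        return [{"key": k, "operator": "Exists", "effect": e} for k, e in tolerated]

    # Every node is tainted, but with a taint we know how to tolerate:
    nodes = [{"taints": [{"key": "nvidia.com/gpu", "effect": "NoSchedule"}]}]
    assert jump_pod_tolerations(nodes) == [
        {"key": "nvidia.com/gpu", "operator": "Exists", "effect": "NoSchedule"}
    ]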
dstack/_internal/core/backends/nebius/compute.py CHANGED
@@ -19,6 +19,7 @@ from dstack._internal.core.backends.base.compute import (
     ComputeWithPrivilegedSupport,
     generate_unique_instance_name,
     get_user_data,
+    merge_tags,
 )
 from dstack._internal.core.backends.base.offers import get_catalog_offers, get_offers_disk_modifier
 from dstack._internal.core.backends.nebius import resources
@@ -150,6 +151,18 @@ class NebiusCompute(
         if backend_data.cluster is not None:
             cluster_id = backend_data.cluster.id
 
+        labels = {
+            "owner": "dstack",
+            "dstack_project": instance_config.project_name.lower(),
+            "dstack_name": instance_config.instance_name,
+            "dstack_user": instance_config.user.lower(),
+        }
+        labels = merge_tags(
+            base_tags=labels,
+            backend_tags=self.config.tags,
+            resource_tags=instance_config.tags,
+        )
+        labels = resources.filter_invalid_labels(labels)
         gpus = instance_offer.instance.resources.gpus
         create_disk_op = resources.create_disk(
             sdk=self._sdk,
@@ -159,6 +172,7 @@ class NebiusCompute(
             image_family="ubuntu24.04-cuda12"
             if gpus and gpus[0].name == "B200"
             else "ubuntu22.04-cuda12",
+            labels=labels,
         )
         create_instance_op = None
         try:
@@ -184,6 +198,7 @@ class NebiusCompute(
                 disk_id=create_disk_op.resource_id,
                 subnet_id=self._get_subnet_id(instance_offer.region),
                 preemptible=instance_offer.instance.resources.spot,
+                labels=labels,
             )
             _wait_for_instance(self._sdk, create_instance_op)
         except BaseException:
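The base labels are derived from the instance configuration before backend- and resource-level tags are merged in and invalid entries are dropped. For hypothetical project and user values (chosen only for illustration), the starting dict looks like:

    # Hypothetical values for illustration.
    project_name, instance_name, user = "My-Project", "my-fleet-0", "Alice"

    labels = {
        "owner": "dstack",
        "dstack_project": project_name.lower(),  # "my-project"
        "dstack_name": instance_name,            # "my-fleet-0"
        "dstack_user": user.lower(),             # "alice"
    }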
dstack/_internal/core/backends/nebius/configurator.py CHANGED
@@ -3,6 +3,7 @@ import json
 from nebius.aio.service_error import RequestError
 
 from dstack._internal.core.backends.base.configurator import (
+    TAGS_MAX_NUM,
     BackendRecord,
     Configurator,
     raise_invalid_credentials_error,
@@ -18,6 +19,7 @@ from dstack._internal.core.backends.nebius.models import (
     NebiusServiceAccountCreds,
     NebiusStoredConfig,
 )
+from dstack._internal.core.errors import BackendError, ServerClientError
 from dstack._internal.core.models.backends.base import BackendType
 
 
@@ -53,6 +55,19 @@ class NebiusConfigurator(
                     f" some of the valid options: {sorted(valid_fabrics)}"
                 ),
             )
+        self._check_config_tags(config)
+
+    def _check_config_tags(self, config: NebiusBackendConfigWithCreds):
+        if not config.tags:
+            return
+        if len(config.tags) > TAGS_MAX_NUM:
+            raise ServerClientError(
+                f"Maximum number of tags exceeded. Up to {TAGS_MAX_NUM} tags is allowed."
+            )
+        try:
+            resources.validate_labels(config.tags)
+        except BackendError as e:
+            raise ServerClientError(e.args[0])
 
     def create_backend(
         self, project_name: str, config: NebiusBackendConfigWithCreds
dstack/_internal/core/backends/nebius/models.py CHANGED
@@ -1,4 +1,6 @@
-
+import json
+from pathlib import Path
+from typing import Annotated, Dict, Literal, Optional, Union
 
 from pydantic import Field, root_validator
 
@@ -27,16 +29,38 @@ class NebiusServiceAccountCreds(CoreModel):
             )
         ),
     ]
+    filename: Annotated[
+        Optional[str], Field(description="The path to the service account credentials file")
+    ] = None
 
 
 class NebiusServiceAccountFileCreds(CoreModel):
     type: Annotated[Literal["service_account"], Field(description="The type of credentials")] = (
         "service_account"
     )
-    service_account_id: Annotated[
-
+    service_account_id: Annotated[
+        Optional[str],
+        Field(
+            description=(
+                "Service account ID. Set automatically if `filename` is specified. When configuring via the UI, it must be specified explicitly"
+            )
+        ),
+    ] = None
+    public_key_id: Annotated[
+        Optional[str],
+        Field(
+            description=(
+                "ID of the service account public key. Set automatically if `filename` is specified. When configuring via the UI, it must be specified explicitly"
+            )
+        ),
+    ] = None
     private_key_file: Annotated[
-        Optional[str],
+        Optional[str],
+        Field(
+            description=(
+                "Path to the service account private key. Set automatically if `filename` or `private_key_content` is specified. When configuring via the UI, it must be specified explicitly"
+            )
+        ),
     ] = None
     private_key_content: Annotated[
         Optional[str],
@@ -44,13 +68,35 @@ class NebiusServiceAccountFileCreds(CoreModel):
             description=(
                 "Content of the service account private key. When configuring via"
                 " `server/config.yml`, it's automatically filled from `private_key_file`."
-                " When configuring via UI, it has to be specified explicitly
+                " When configuring via UI, it has to be specified explicitly"
            )
        ),
    ] = None
+    filename: Annotated[
+        Optional[str], Field(description="The path to the service account credentials file")
+    ] = None
 
     @root_validator
     def fill_data(cls, values):
+        if filename := values.get("filename"):
+            try:
+                with open(Path(filename).expanduser()) as f:
+                    data = json.load(f)
+                from nebius.base.service_account.credentials_file import (
+                    ServiceAccountCredentials,
+                )
+
+                credentials = ServiceAccountCredentials.from_json(data)
+                subject = credentials.subject_credentials
+                values["service_account_id"] = subject.sub
+                values["public_key_id"] = subject.kid
+                values["private_key_content"] = subject.private_key
+            except OSError:
+                raise ValueError(f"No such file {filename}")
+            except Exception as e:
+                raise ValueError(f"Failed to parse credentials file {filename}: {e}")
+            return values
+
         return fill_data(
             values, filename_field="private_key_file", data_field="private_key_content"
         )
@@ -95,6 +141,12 @@ class NebiusBackendConfig(CoreModel):
             )
         ),
     ] = None
+    tags: Annotated[
+        Optional[Dict[str, str]],
+        Field(
+            description="The tags (labels) that will be assigned to resources created by `dstack`"
+        ),
+    ] = None
 
 
 class NebiusBackendConfigWithCreds(NebiusBackendConfig):
dstack/_internal/core/backends/nebius/resources.py CHANGED
@@ -1,11 +1,12 @@
 import logging
+import re
 import time
 from collections import defaultdict
 from collections.abc import Container as ContainerT
 from collections.abc import Generator, Iterable, Sequence
 from contextlib import contextmanager
 from tempfile import NamedTemporaryFile
-from typing import Optional
+from typing import Dict, Optional
 
 from nebius.aio.authorization.options import options_to_metadata
 from nebius.aio.operation import Operation as SDKOperation
@@ -249,13 +250,14 @@ def get_default_subnet(sdk: SDK, project_id: str) -> Subnet:
 
 
 def create_disk(
-    sdk: SDK, name: str, project_id: str, size_mib: int, image_family: str
+    sdk: SDK, name: str, project_id: str, size_mib: int, image_family: str, labels: Dict[str, str]
 ) -> SDKOperation[Operation]:
     client = DiskServiceClient(sdk)
     request = CreateDiskRequest(
         metadata=ResourceMetadata(
             name=name,
             parent_id=project_id,
+            labels=labels,
         ),
         spec=DiskSpec(
             size_mebibytes=size_mib,
@@ -288,12 +290,14 @@ def create_instance(
     disk_id: str,
     subnet_id: str,
     preemptible: bool,
+    labels: Dict[str, str],
 ) -> SDKOperation[Operation]:
     client = InstanceServiceClient(sdk)
     request = CreateInstanceRequest(
         metadata=ResourceMetadata(
             name=name,
             parent_id=project_id,
+            labels=labels,
         ),
         spec=InstanceSpec(
             cloud_init_user_data=user_data,
@@ -367,3 +371,42 @@ def delete_cluster(sdk: SDK, cluster_id: str) -> None:
             metadata=REQUEST_MD,
         )
     )
+
+
+def filter_invalid_labels(labels: Dict[str, str]) -> Dict[str, str]:
+    filtered_labels = {}
+    for k, v in labels.items():
+        if not _is_valid_label(k, v):
+            logger.warning("Skipping invalid label '%s: %s'", k, v)
+            continue
+        filtered_labels[k] = v
+    return filtered_labels
+
+
+def validate_labels(labels: Dict[str, str]):
+    for k, v in labels.items():
+        if not _is_valid_label(k, v):
+            raise BackendError("Invalid resource labels")
+
+
+def _is_valid_label(key: str, value: str) -> bool:
+    # TODO: [Nebius] current validation logic reuses GCP's approach.
+    # There is no public information on Nebius labels restrictions.
+    return is_valid_resource_name(key) and is_valid_label_value(value)
+
+
+MAX_RESOURCE_NAME_LEN = 63
+NAME_PATTERN = re.compile(r"^[a-z][_\-a-z0-9]{0,62}$")
+LABEL_VALUE_PATTERN = re.compile(r"^[_\-a-z0-9]{0,63}$")
+
+
+def is_valid_resource_name(name: str) -> bool:
+    if len(name) < 1 or len(name) > MAX_RESOURCE_NAME_LEN:
+        return False
+    match = re.match(NAME_PATTERN, name)
+    return match is not None
+
+
+def is_valid_label_value(value: str) -> bool:
+    match = re.match(LABEL_VALUE_PATTERN, value)
+    return match is not None
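The validation patterns mirror GCP's label rules: keys must start with a lowercase letter, and both keys and values are limited to lowercase letters, digits, `-`, and `_`. A quick standalone check of what passes:

    import re

    NAME_PATTERN = re.compile(r"^[a-z][_\-a-z0-9]{0,62}$")
    LABEL_VALUE_PATTERN = re.compile(r"^[_\-a-z0-9]{0,63}$")

    assert NAME_PATTERN.match("dstack_project")             # valid key
    assert NAME_PATTERN.match("Owner") is None              # uppercase key rejected
    assert LABEL_VALUE_PATTERN.match("my-project-42")       # valid value
    assert LABEL_VALUE_PATTERN.match("My Project") is None  # uppercase/space rejected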
dstack/_internal/core/compatibility/runs.py CHANGED
@@ -53,6 +53,10 @@ def get_apply_plan_excludes(plan: ApplyRunPlanInput) -> Optional[IncludeExcludeD
     }
     if all(js.exit_status is None for js in job_submissions):
         job_submissions_excludes["exit_status"] = True
+    if all(js.status_message == "" for js in job_submissions):
+        job_submissions_excludes["status_message"] = True
+    if all(js.error is None for js in job_submissions):
+        job_submissions_excludes["error"] = True
     if all(js.deployment_num == 0 for js in job_submissions):
         job_submissions_excludes["deployment_num"] = True
     if all(not js.probes for js in job_submissions):
@@ -71,6 +75,10 @@ def get_apply_plan_excludes(plan: ApplyRunPlanInput) -> Optional[IncludeExcludeD
     }
     if latest_job_submission.exit_status is None:
         latest_job_submission_excludes["exit_status"] = True
+    if latest_job_submission.status_message == "":
+        latest_job_submission_excludes["status_message"] = True
+    if latest_job_submission.error is None:
+        latest_job_submission_excludes["error"] = True
     if latest_job_submission.deployment_num == 0:
         latest_job_submission_excludes["deployment_num"] = True
     if not latest_job_submission.probes:
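These excludes keep request payloads backward-compatible: a field is omitted only when every submission carries its default value, so older servers never receive keys they don't recognize. A minimal sketch of the pattern with plain data (the dataclass is a simplified stand-in for the real model):

    from dataclasses import dataclass

    @dataclass
    class JobSubmission:  # simplified stand-in
        status_message: str = ""
        error: str | None = None

    def job_submission_excludes(subs: list[JobSubmission]) -> dict[str, bool]:
        excludes: dict[str, bool] = {}
        if all(s.status_message == "" for s in subs):
            excludes["status_message"] = True
        if all(s.error is None for s in subs):
            excludes["error"] = True
        return excludes

    assert job_submission_excludes([JobSubmission()]) == {
        "status_message": True,
        "error": True,
    }
    assert job_submission_excludes([JobSubmission(error="boom")]) == {"status_message": True}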
dstack/_internal/core/models/profiles.py CHANGED
@@ -80,14 +80,21 @@ def parse_stop_duration(
 def parse_off_duration(v: Optional[Union[int, str, bool]]) -> Optional[Union[Literal["off"], int]]:
     if v == "off" or v is False:
         return "off"
-    if v is True:
+    if v is True or v is None:
         return None
-
+    duration = parse_duration(v)
+    if duration < 0:
+        raise ValueError("Duration cannot be negative")
+    return duration
 
 
-def parse_idle_duration(v: Optional[Union[int, str]]) -> Optional[int]:
-
+def parse_idle_duration(v: Optional[Union[int, str, bool]]) -> Optional[int]:
+    # Differs from `parse_off_duration` to accept negative durations as `off`
+    # for backward compatibility.
+    if v == "off" or v is False or v == -1:
         return -1
+    if v is True:
+        return None
     return parse_duration(v)
 
 
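The net behavior change: `parse_off_duration` now treats `None` like `True` (no off duration) and rejects negative values, while `parse_idle_duration` keeps accepting `-1`, `False`, and `"off"` as "never shut down" and gains `True` as "use the default". A behavior sketch with a simplified `parse_duration` (the real parser also accepts strings like `"1h"` or `"30m"`):

    from typing import Optional, Union

    def parse_duration(v: Union[int, str]) -> int:
        # Simplified stand-in: the real parser also handles "30m", "1h", etc.
        return int(v)

    def parse_idle_duration(v: Optional[Union[int, str, bool]]) -> Optional[int]:
        if v == "off" or v is False or v == -1:
            return -1    # never shut down idle instances
        if v is True:
            return None  # fall back to the default
        return parse_duration(v)

    assert parse_idle_duration(-1) == -1
    assert parse_idle_duration(False) == -1
    assert parse_idle_duration(True) is None
    assert parse_idle_duration(300) == 300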
dstack/_internal/server/background/tasks/process_fleets.py CHANGED
@@ -1,10 +1,11 @@
+from collections import defaultdict
 from datetime import timedelta
 from typing import List
 from uuid import UUID
 
 from sqlalchemy import select, update
 from sqlalchemy.ext.asyncio import AsyncSession
-from sqlalchemy.orm import joinedload, load_only
+from sqlalchemy.orm import joinedload, load_only, selectinload
 
 from dstack._internal.core.models.fleets import FleetSpec, FleetStatus
 from dstack._internal.core.models.instances import InstanceStatus
@@ -37,30 +38,68 @@ MIN_PROCESSING_INTERVAL = timedelta(seconds=30)
 
 @sentry_utils.instrument_background_task
 async def process_fleets():
-
+    fleet_lock, fleet_lockset = get_locker(get_db().dialect_name).get_lockset(
+        FleetModel.__tablename__
+    )
+    instance_lock, instance_lockset = get_locker(get_db().dialect_name).get_lockset(
+        InstanceModel.__tablename__
+    )
     async with get_session_ctx() as session:
-        async with
+        async with fleet_lock, instance_lock:
             res = await session.execute(
                 select(FleetModel)
                 .where(
                     FleetModel.deleted == False,
-                    FleetModel.id.not_in(
+                    FleetModel.id.not_in(fleet_lockset),
                     FleetModel.last_processed_at
                     < get_current_datetime() - MIN_PROCESSING_INTERVAL,
                 )
-                .options(
+                .options(
+                    load_only(FleetModel.id, FleetModel.name),
+                    selectinload(FleetModel.instances).load_only(InstanceModel.id),
+                )
                 .order_by(FleetModel.last_processed_at.asc())
                 .limit(BATCH_SIZE)
                 .with_for_update(skip_locked=True, key_share=True)
            )
-            fleet_models = list(res.scalars().all())
+            fleet_models = list(res.scalars().unique().all())
             fleet_ids = [fm.id for fm in fleet_models]
+            res = await session.execute(
+                select(InstanceModel)
+                .where(
+                    InstanceModel.id.not_in(instance_lockset),
+                    InstanceModel.fleet_id.in_(fleet_ids),
+                )
+                .options(load_only(InstanceModel.id, InstanceModel.fleet_id))
+                .order_by(InstanceModel.id)
+                .with_for_update(skip_locked=True, key_share=True)
+            )
+            instance_models = list(res.scalars().all())
+            fleet_id_to_locked_instances = defaultdict(list)
+            for instance_model in instance_models:
+                fleet_id_to_locked_instances[instance_model.fleet_id].append(instance_model)
+            # Process only fleets with all instances locked.
+            # Other fleets won't be processed but will still be locked to avoid new transaction.
+            # This should not be problematic as long as process_fleets is quick.
+            fleet_models_to_process = []
+            for fleet_model in fleet_models:
+                if len(fleet_model.instances) == len(fleet_id_to_locked_instances[fleet_model.id]):
+                    fleet_models_to_process.append(fleet_model)
+                else:
+                    logger.debug(
+                        "Fleet %s processing will be skipped: some instance were not locked",
+                        fleet_model.name,
+                    )
             for fleet_id in fleet_ids:
-
+                fleet_lockset.add(fleet_id)
+            instance_ids = [im.id for im in instance_models]
+            for instance_id in instance_ids:
+                instance_lockset.add(instance_id)
             try:
-                await _process_fleets(session=session, fleet_models=
+                await _process_fleets(session=session, fleet_models=fleet_models_to_process)
             finally:
-
+                fleet_lockset.difference_update(fleet_ids)
+                instance_lockset.difference_update(instance_ids)
 
 
 async def _process_fleets(session: AsyncSession, fleet_models: List[FleetModel]):
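The task now takes both the fleet and instance in-memory locksets before selecting rows, and a fleet is processed only when all of its instances could be locked, so no other background task mutates them mid-consolidation. The acquire/release shape, reduced to its essentials (this is a sketch of the lockset pattern, not dstack's real locker API):

    import asyncio

    # A shared set of "locked" ids guarded by an asyncio.Lock,
    # mirroring the shape of the real locker.
    fleet_lock, fleet_lockset = asyncio.Lock(), set()

    async def process_batch(fleet_ids: list[int]) -> None:
        async with fleet_lock:
            candidates = [fid for fid in fleet_ids if fid not in fleet_lockset]
            fleet_lockset.update(candidates)
            try:
                ...  # process the locked fleets
            finally:
                # Always release, even if processing raises.
                fleet_lockset.difference_update(candidates)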
@@ -99,8 +138,8 @@ def _consolidate_fleet_state_with_spec(session: AsyncSession, fleet_model: Fleet
         return
     if not _is_fleet_ready_for_consolidation(fleet_model):
         return
-
-    if
+    changed_instances = _maintain_fleet_nodes_in_min_max_range(session, fleet_model, fleet_spec)
+    if changed_instances:
         fleet_model.consolidation_attempt += 1
     else:
         # The fleet is already consolidated or consolidation is in progress.
@@ -138,28 +177,47 @@ def _get_consolidation_retry_delay(consolidation_attempt: int) -> timedelta:
     return _CONSOLIDATION_RETRY_DELAYS[-1]
 
 
-def
+def _maintain_fleet_nodes_in_min_max_range(
     session: AsyncSession,
     fleet_model: FleetModel,
     fleet_spec: FleetSpec,
 ) -> bool:
     """
-    Ensures the fleet has at least `nodes.min` instances.
-    Returns `True` if retried
+    Ensures the fleet has at least `nodes.min` and at most `nodes.max` instances.
+    Returns `True` if retried, added new instances, or terminated redundant instances and `False` otherwise.
     """
     assert fleet_spec.configuration.nodes is not None
     for instance in fleet_model.instances:
         # Delete terminated but not deleted instances since
         # they are going to be replaced with new pending instances.
         if instance.status == InstanceStatus.TERMINATED and not instance.deleted:
-            # It's safe to modify instances without instance lock since
-            # no other task modifies already terminated instances.
             instance.deleted = True
             instance.deleted_at = get_current_datetime()
     active_instances = [i for i in fleet_model.instances if not i.deleted]
     active_instances_num = len(active_instances)
     if active_instances_num >= fleet_spec.configuration.nodes.min:
-
+        if (
+            fleet_spec.configuration.nodes.max is None
+            or active_instances_num <= fleet_spec.configuration.nodes.max
+        ):
+            return False
+        # Fleet has more instances than allowed by nodes.max.
+        # This is possible due to race conditions (e.g. provisioning jobs in a fleet concurrently)
+        # or if nodes.max is updated.
+        nodes_redundant = active_instances_num - fleet_spec.configuration.nodes.max
+        for instance in fleet_model.instances:
+            if nodes_redundant == 0:
+                break
+            if instance.status in [InstanceStatus.IDLE]:
+                instance.status = InstanceStatus.TERMINATING
+                instance.termination_reason = "Fleet has too many instances"
+                nodes_redundant -= 1
+                logger.info(
+                    "Terminating instance %s: %s",
+                    instance.name,
+                    instance.termination_reason,
+                )
+        return True
     nodes_missing = fleet_spec.configuration.nodes.min - active_instances_num
     for i in range(nodes_missing):
         instance_model = create_fleet_instance_model(
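A worked example of the min/max maintenance: with `nodes.min = 2`, `nodes.max = 4`, and 6 active instances, the two surplus idle instances are marked for termination; with 1 active instance, one new instance is created. A data-only sketch of that arithmetic (a simplified model of the logic, not the real function):

    def plan_consolidation(active: int, idle: int, nodes_min: int, nodes_max: int | None):
        """Return (to_create, to_terminate) for a fleet."""
        if active >= nodes_min:
            if nodes_max is None or active <= nodes_max:
                return 0, 0  # already within [min, max]
            # Terminate surplus, but only instances that are currently idle.
            return 0, min(active - nodes_max, idle)
        return nodes_min - active, 0

    assert plan_consolidation(active=6, idle=3, nodes_min=2, nodes_max=4) == (0, 2)
    assert plan_consolidation(active=1, idle=0, nodes_min=2, nodes_max=4) == (1, 0)
    assert plan_consolidation(active=3, idle=1, nodes_min=2, nodes_max=None) == (0, 0)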
dstack/_internal/server/background/tasks/process_instances.py CHANGED
@@ -259,9 +259,7 @@ async def _add_remote(instance: InstanceModel) -> None:
     if instance.status == InstanceStatus.PENDING:
         instance.status = InstanceStatus.PROVISIONING
 
-    retry_duration_deadline = instance.created_at
-        tzinfo=datetime.timezone.utc
-    ) + timedelta(seconds=PROVISIONING_TIMEOUT_SECONDS)
+    retry_duration_deadline = instance.created_at + timedelta(seconds=PROVISIONING_TIMEOUT_SECONDS)
     if retry_duration_deadline < get_current_datetime():
         instance.status = InstanceStatus.TERMINATED
         instance.termination_reason = "Provisioning timeout expired"
dstack/_internal/server/background/tasks/process_runs.py CHANGED
@@ -256,8 +256,8 @@ async def _process_active_run(session: AsyncSession, run_model: RunModel):
     for replica_num, job_models in group_jobs_by_replica_latest(run_model.jobs):
         replica_statuses: Set[RunStatus] = set()
         replica_needs_retry = False
-
         replica_active = True
+        jobs_done_num = 0
         for job_model in job_models:
             job = find_job(run.jobs, job_model.replica_num, job_model.job_num)
             if (
@@ -272,8 +272,7 @@ async def _process_active_run(session: AsyncSession, run_model: RunModel):
             ):
                 # the job is done or going to be done
                 replica_statuses.add(RunStatus.DONE)
-
-                replica_active = False
+                jobs_done_num += 1
             elif job_model.termination_reason == JobTerminationReason.SCALED_DOWN:
                 # the job was scaled down
                 replica_active = False
@@ -313,26 +312,14 @@ async def _process_active_run(session: AsyncSession, run_model: RunModel):
         if not replica_needs_retry or retry_single_job:
             run_statuses.update(replica_statuses)
 
-        if
-            #
-
-
-
-
-
-            )
-        )
-        else:
-            # last_processed_at = replica scaled down
-            replicas_info.append(
-                autoscalers.ReplicaInfo(
-                    active=False,
-                    timestamp=max(job.last_processed_at for job in job_models).replace(
-                        tzinfo=datetime.timezone.utc
-                    ),
-                )
-            )
+        if jobs_done_num == len(job_models):
+            # Consider replica inactive if all its jobs are done for some reason.
+            # If only some jobs are done, replica is considered active to avoid
+            # provisioning new replicas for partially done multi-node tasks.
+            replica_active = False
+
+        replica_info = _get_replica_info(job_models, replica_active)
+        replicas_info.append(replica_info)
 
     termination_reason: Optional[RunTerminationReason] = None
     if RunStatus.FAILED in run_statuses:
@@ -410,6 +397,23 @@ async def _process_active_run(session: AsyncSession, run_model: RunModel):
     run_model.resubmission_attempt += 1
 
 
+def _get_replica_info(
+    replica_job_models: list[JobModel],
+    replica_active: bool,
+) -> autoscalers.ReplicaInfo:
+    if replica_active:
+        # submitted_at = replica created
+        return autoscalers.ReplicaInfo(
+            active=True,
+            timestamp=min(job.submitted_at for job in replica_job_models),
+        )
+    # last_processed_at = replica scaled down
+    return autoscalers.ReplicaInfo(
+        active=False,
+        timestamp=max(job.last_processed_at for job in replica_job_models),
+    )
+
+
 async def _handle_run_replicas(
     session: AsyncSession,
     run_model: RunModel,
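`_get_replica_info` picks the autoscaler timestamp differently per state: an active replica reports when it was created (earliest job `submitted_at`), while an inactive one reports when it was scaled down (latest job `last_processed_at`). A quick sketch with a simplified job stand-in:

    from dataclasses import dataclass
    from datetime import datetime

    @dataclass
    class Job:  # simplified stand-in for JobModel
        submitted_at: datetime
        last_processed_at: datetime

    jobs = [
        Job(datetime(2025, 10, 1, 12, 0), datetime(2025, 10, 1, 14, 0)),
        Job(datetime(2025, 10, 1, 12, 5), datetime(2025, 10, 1, 13, 30)),
    ]
    replica_created = min(j.submitted_at for j in jobs)           # 12:00, first job submitted
    replica_scaled_down = max(j.last_processed_at for j in jobs)  # 14:00, last job touched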
dstack/_internal/server/background/tasks/process_submitted_jobs.py CHANGED
@@ -260,7 +260,6 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
 
     instance_filters = [
         InstanceModel.deleted == False,
-        InstanceModel.total_blocks > InstanceModel.busy_blocks,
         InstanceModel.id.not_in(detaching_instances_ids),
     ]
 
@@ -514,9 +513,6 @@ async def _find_optimal_fleet_with_offers(
         )
         return run_model.fleet, fleet_instances_with_pool_offers
 
-    if len(fleet_models) == 0:
-        return None, []
-
     nodes_required_num = _get_nodes_required_num_for_run(run_spec)
     # The current strategy is first to consider fleets that can accommodate
     # the run without additional provisioning and choose the one with the cheapest pool offer.
@@ -534,6 +530,7 @@ async def _find_optimal_fleet_with_offers(
         ]
     ] = []
     for candidate_fleet_model in fleet_models:
+        candidate_fleet = fleet_model_to_fleet(candidate_fleet_model)
         fleet_instances_with_pool_offers = _get_fleet_instances_with_pool_offers(
             fleet_model=candidate_fleet_model,
             run_spec=run_spec,
@@ -541,24 +538,21 @@ async def _find_optimal_fleet_with_offers(
             master_job_provisioning_data=master_job_provisioning_data,
             volumes=volumes,
         )
-
+        fleet_has_pool_capacity = nodes_required_num <= len(fleet_instances_with_pool_offers)
         fleet_cheapest_pool_offer = math.inf
         if len(fleet_instances_with_pool_offers) > 0:
             fleet_cheapest_pool_offer = fleet_instances_with_pool_offers[0][1].price
 
-        candidate_fleet = fleet_model_to_fleet(candidate_fleet_model)
-        profile = None
-        requirements = None
         try:
+            _check_can_create_new_instance_in_fleet(candidate_fleet)
             profile, requirements = _get_run_profile_and_requirements_in_fleet(
                 job=job,
                 run_spec=run_spec,
                 fleet=candidate_fleet,
             )
         except ValueError:
-
-
-        if profile is not None and requirements is not None:
+            fleet_backend_offers = []
+        else:
             multinode = (
                 candidate_fleet.spec.configuration.placement == InstanceGroupPlacement.CLUSTER
                 or job.job_spec.jobs_per_replica > 1
@@ -579,8 +573,12 @@ async def _find_optimal_fleet_with_offers(
         if len(fleet_backend_offers) > 0:
             fleet_cheapest_backend_offer = fleet_backend_offers[0][1].price
 
+        if not _run_can_fit_into_fleet(run_spec, candidate_fleet):
+            logger.debug("Skipping fleet %s from consideration: run cannot fit into fleet")
+            continue
+
         fleet_priority = (
-            not
+            not fleet_has_pool_capacity,
             fleet_cheapest_pool_offer,
             fleet_cheapest_backend_offer,
         )
@@ -593,10 +591,13 @@ async def _find_optimal_fleet_with_offers(
             fleet_priority,
         )
     )
+    if len(candidate_fleets_with_offers) == 0:
+        return None, []
     if run_spec.merged_profile.fleets is None and all(
         t[2] == 0 and t[3] == 0 for t in candidate_fleets_with_offers
     ):
-        # If fleets are not specified and no fleets have available pool
+        # If fleets are not specified and no fleets have available pool
+        # or backend offers, create a new fleet.
         # This is for compatibility with non-fleet-first UX when runs created new fleets
         # if there are no instances to reuse.
         return None, []
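Fleet candidates are ranked by the tuple `(not fleet_has_pool_capacity, cheapest_pool_offer, cheapest_backend_offer)`. Python sorts tuples lexicographically and `False < True`, so fleets that can host the run on existing instances always outrank fleets that would need provisioning, with price as the tie-breaker. A self-contained demonstration (fleet names and prices are made up):

    import math

    # (fleet name, has_pool_capacity, cheapest pool offer, cheapest backend offer)
    candidates = [
        ("fleet-a", False, math.inf, 2.10),
        ("fleet-b", True, 3.50, math.inf),
        ("fleet-c", True, 1.20, 4.00),
    ]
    ranked = sorted(candidates, key=lambda c: (not c[1], c[2], c[3]))
    assert [c[0] for c in ranked] == ["fleet-c", "fleet-b", "fleet-a"]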
@@ -616,6 +617,39 @@ def _get_nodes_required_num_for_run(run_spec: RunSpec) -> int:
     return nodes_required_num
 
 
+def _run_can_fit_into_fleet(run_spec: RunSpec, fleet: Fleet) -> bool:
+    """
+    Returns `False` if the run cannot fit into fleet for sure.
+    This is helpful heuristic to avoid even considering fleets too small for a run.
+    A run may not fit even if this function returns `True`.
+    This will lead to some jobs failing due to exceeding `nodes.max`
+    or more than `nodes.max` instances being provisioned
+    and eventually removed by the fleet consolidation logic.
+    """
+    # No check for cloud fleets with blocks > 1 since we don't know
+    # how many jobs such fleets can accommodate.
+    nodes_required_num = _get_nodes_required_num_for_run(run_spec)
+    if (
+        fleet.spec.configuration.nodes is not None
+        and fleet.spec.configuration.blocks == 1
+        and fleet.spec.configuration.nodes.max is not None
+    ):
+        busy_instances = [i for i in fleet.instances if i.busy_blocks > 0]
+        fleet_available_capacity = fleet.spec.configuration.nodes.max - len(busy_instances)
+        if fleet_available_capacity < nodes_required_num:
+            return False
+    elif fleet.spec.configuration.ssh_config is not None:
+        # Currently assume that each idle block can run a job.
+        # TODO: Take resources / eligible offers into account.
+        total_idle_blocks = 0
+        for instance in fleet.instances:
+            total_blocks = instance.total_blocks or 1
+            total_idle_blocks += total_blocks - instance.busy_blocks
+        if total_idle_blocks < nodes_required_num:
+            return False
+    return True
+
+
 def _get_fleet_instances_with_pool_offers(
     fleet_model: FleetModel,
     run_spec: RunSpec,
@@ -713,6 +747,7 @@ async def _run_job_on_new_instance(
     if fleet_model is not None:
         fleet = fleet_model_to_fleet(fleet_model)
         try:
+            _check_can_create_new_instance_in_fleet(fleet)
             profile, requirements = _get_run_profile_and_requirements_in_fleet(
                 job=job,
                 run_spec=run.run_spec,
@@ -787,8 +822,6 @@ def _get_run_profile_and_requirements_in_fleet(
     run_spec: RunSpec,
     fleet: Fleet,
 ) -> tuple[Profile, Requirements]:
-    if not _check_can_create_new_instance_in_fleet(fleet):
-        raise ValueError("Cannot fit new instance into fleet")
     profile = combine_fleet_and_run_profiles(fleet.spec.merged_profile, run_spec.merged_profile)
     if profile is None:
         raise ValueError("Cannot combine fleet profile")
@@ -801,13 +834,23 @@ def _get_run_profile_and_requirements_in_fleet(
     return profile, requirements
 
 
-def _check_can_create_new_instance_in_fleet(fleet: Fleet)
+def _check_can_create_new_instance_in_fleet(fleet: Fleet):
+    if not _can_create_new_instance_in_fleet(fleet):
+        raise ValueError("Cannot fit new instance into fleet")
+
+
+def _can_create_new_instance_in_fleet(fleet: Fleet) -> bool:
     if fleet.spec.configuration.ssh_config is not None:
         return False
-
-    #
-    #
-
+    active_instances = [i for i in fleet.instances if i.status.is_active()]
+    # nodes.max is a soft limit that can be exceeded when provisioning concurrently.
+    # The fleet consolidation logic will remove redundant nodes eventually.
+    if (
+        fleet.spec.configuration.nodes is not None
+        and fleet.spec.configuration.nodes.max is not None
+        and len(active_instances) >= fleet.spec.configuration.nodes.max
+    ):
+        return False
     return True
 
 
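For SSH fleets, `_run_can_fit_into_fleet` sums idle blocks across instances and compares the total against the nodes the run needs. A data-only sketch of that branch (the dataclass is a simplified stand-in for the real instance model):

    from dataclasses import dataclass

    @dataclass
    class Instance:  # simplified stand-in
        total_blocks: int
        busy_blocks: int

    def ssh_fleet_can_fit(instances: list[Instance], nodes_required: int) -> bool:
        # Assume each idle block can run a job, the same heuristic as the real code.
        idle = sum(i.total_blocks - i.busy_blocks for i in instances)
        return idle >= nodes_required

    instances = [Instance(total_blocks=4, busy_blocks=3), Instance(total_blocks=2, busy_blocks=0)]
    assert ssh_fleet_can_fit(instances, nodes_required=3)       # 1 + 2 idle blocks
    assert not ssh_fleet_can_fit(instances, nodes_required=4)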
dstack/version.py CHANGED
{dstack-0.19.31.dist-info → dstack-0.19.32.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dstack
-Version: 0.19.31
+Version: 0.19.32
 Summary: dstack is an open-source orchestration engine for running AI workloads on any cloud or on-premises.
 Project-URL: Homepage, https://dstack.ai
 Project-URL: Source, https://github.com/dstackai/dstack
@@ -73,7 +73,7 @@ Requires-Dist: grpcio>=1.50; extra == 'all'
 Requires-Dist: httpx; extra == 'all'
 Requires-Dist: jinja2; extra == 'all'
 Requires-Dist: kubernetes; extra == 'all'
-Requires-Dist: nebius
+Requires-Dist: nebius<=0.2.72,>=0.2.40; (python_version >= '3.10') and extra == 'all'
 Requires-Dist: oci>=2.150.0; extra == 'all'
 Requires-Dist: prometheus-client; extra == 'all'
 Requires-Dist: pyopenssl>=23.2.0; extra == 'all'
@@ -259,7 +259,7 @@ Requires-Dist: fastapi; extra == 'nebius'
 Requires-Dist: grpcio>=1.50; extra == 'nebius'
 Requires-Dist: httpx; extra == 'nebius'
 Requires-Dist: jinja2; extra == 'nebius'
-Requires-Dist: nebius
+Requires-Dist: nebius<=0.2.72,>=0.2.40; (python_version >= '3.10') and extra == 'nebius'
 Requires-Dist: prometheus-client; extra == 'nebius'
 Requires-Dist: python-dxf==12.1.0; extra == 'nebius'
 Requires-Dist: python-json-logger>=3.1.0; extra == 'nebius'
@@ -340,15 +340,13 @@ It streamlines development, training, and inference, and is compatible with any
 `dstack` supports `NVIDIA`, `AMD`, `Google TPU`, `Intel Gaudi`, and `Tenstorrent` accelerators out of the box.
 
 ## Latest news ✨
-- [2025/
-- [2025/08] [dstack 0.19.26: Repos
-- [2025/08] [dstack 0.19.
-- [2025/08] [dstack 0.19.22: Service probes, GPU health-checks, Tenstorrent Galaxy, Secrets UI](https://github.com/dstackai/dstack/releases/tag/0.19.22)
+- [2025/10] [dstack 0.19.31: Kubernetes, GCP A4 spot](https://github.com/dstackai/dstack/releases/tag/0.19.31)
+- [2025/08] [dstack 0.19.26: Repos](https://github.com/dstackai/dstack/releases/tag/0.19.26)
+- [2025/08] [dstack 0.19.22: Service probes, GPU health-checks, Tenstorrent Galaxy](https://github.com/dstackai/dstack/releases/tag/0.19.22)
 - [2025/07] [dstack 0.19.21: Scheduled tasks](https://github.com/dstackai/dstack/releases/tag/0.19.21)
 - [2025/07] [dstack 0.19.17: Secrets, Files, Rolling deployment](https://github.com/dstackai/dstack/releases/tag/0.19.17)
-- [2025/06] [dstack 0.19.16: Docker in Docker
-- [2025/06] [dstack 0.19.13:
-- [2025/06] [dstack 0.19.12: Simplified use of MPI](https://github.com/dstackai/dstack/releases/tag/0.19.12)
+- [2025/06] [dstack 0.19.16: Docker in Docker](https://github.com/dstackai/dstack/releases/tag/0.19.16)
+- [2025/06] [dstack 0.19.13: Default images with InfiniBand support](https://github.com/dstackai/dstack/releases/tag/0.19.13)
 
 ## How does it work?
 
@@ -364,11 +362,11 @@ It streamlines development, training, and inference, and is compatible with any
 
 To orchestrate compute across cloud providers or existing Kubernetes clusters, you need to configure backends.
 
-Backends can be set up in `~/.dstack/server/config.yml` or through the [project settings page](
+Backends can be set up in `~/.dstack/server/config.yml` or through the [project settings page](https://dstack.ai/docs/concepts/projects#backends) in the UI.
 
-For more details, see [Backends](
+For more details, see [Backends](https://dstack.ai/docs/concepts/backends).
 
-> When using `dstack` with on-prem servers, backend configuration isn’t required. Simply create [SSH fleets](
+> When using `dstack` with on-prem servers, backend configuration isn’t required. Simply create [SSH fleets](https://dstack.ai/docs/concepts/fleets#ssh) once the server is up.
 
 ##### Start the server
 
{dstack-0.19.31.dist-info → dstack-0.19.32.dist-info}/RECORD CHANGED
@@ -1,5 +1,5 @@
 dstack/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-dstack/version.py,sha256=
+dstack/version.py,sha256=DLiOZq8Gabr_DjHGIzjxI9IasDON-4xNaF3b4Rt2BBI,105
 dstack/_internal/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 dstack/_internal/compat.py,sha256=bF9U9fTMfL8UVhCouedoUSTYFl7UAOiU0WXrnRoByxw,40
 dstack/_internal/settings.py,sha256=FYtd7tRk17Oc62Kl_3O8NuT5JHb8TKhLThl1TsfjjVs,1390
@@ -118,12 +118,12 @@ dstack/_internal/core/backends/gcp/features/tcpx.py,sha256=8bDR5kwF5qke5EWNdBscd
 dstack/_internal/core/backends/hotaisle/__init__.py,sha256=CYMaS1jd9Km0Y6Jvg4ePjYOtfqL9swGsRo5kcXGFrFQ,30
 dstack/_internal/core/backends/hotaisle/api_client.py,sha256=Fd1TOg4_orwQyJtoZ657zJweLeBhzj_9ObfL538S5uI,3640
 dstack/_internal/core/backends/hotaisle/backend.py,sha256=o0cqLIKGcrXhvksHHGvjCpLShoQxT2IKdJy9sm0H9gE,586
-dstack/_internal/core/backends/hotaisle/compute.py,sha256=
+dstack/_internal/core/backends/hotaisle/compute.py,sha256=X9XbIatbFH5wqLoSH3Z9nNOhBMrXnVVayFn6xi4zu-g,8224
 dstack/_internal/core/backends/hotaisle/configurator.py,sha256=EJwdKFfC0ab0pe4lzeV65b80Ok21rR0OfupOmuqCp6c,2287
 dstack/_internal/core/backends/hotaisle/models.py,sha256=CmJ20SbpKzFldX7rrR0CpVytSJSN2YWKQ3Ixnta_A1M,1334
 dstack/_internal/core/backends/kubernetes/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 dstack/_internal/core/backends/kubernetes/backend.py,sha256=Jy0_Nwn6Oro8McJIo_QeNxxq4Pmwsd7JPd5_YE8Fz9U,606
-dstack/_internal/core/backends/kubernetes/compute.py,sha256=
+dstack/_internal/core/backends/kubernetes/compute.py,sha256=7xVdo2HK-dTZjkQQtuIfiTXLhBSzhUO0BomZBaPG5UM,34989
 dstack/_internal/core/backends/kubernetes/configurator.py,sha256=RK8_eznv1AFrcG3fM-KIxyolsaJ8UTBAO7c3P3RCBnw,2228
 dstack/_internal/core/backends/kubernetes/models.py,sha256=vGOhRYP4OzhF62BN5bfRGd4E2tKPaqdlZY8tMmjZoJ0,2308
 dstack/_internal/core/backends/kubernetes/utils.py,sha256=1DkkL_VWShFFqN-Crh0ddebRyXXyL435FyrjVkFLR1Q,6286
@@ -138,11 +138,11 @@ dstack/_internal/core/backends/local/backend.py,sha256=KJuNXUXrg60NhLywnExD1EXH2
 dstack/_internal/core/backends/local/compute.py,sha256=tWNsKGKYlPx9yeqwlpAL_XExOYMPLcb6AsGAji3YO3M,3825
 dstack/_internal/core/backends/nebius/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 dstack/_internal/core/backends/nebius/backend.py,sha256=2XqZIbSR8VzlfOnuVklXlDxNmwAkQj7txQN8VXF1j2E,566
-dstack/_internal/core/backends/nebius/compute.py,sha256=
-dstack/_internal/core/backends/nebius/configurator.py,sha256=
+dstack/_internal/core/backends/nebius/compute.py,sha256=US5W8Q0UT09huQybDTBLVE2EDpRG3UdMqq1DZFFaCI4,15454
+dstack/_internal/core/backends/nebius/configurator.py,sha256=eybolJi5rlEeU8GBXC7pdOU7To32ASQGHDAiE2cNeFo,3794
 dstack/_internal/core/backends/nebius/fabrics.py,sha256=-X-nSPV2pUin2PAYDHGm-j14KPboIFRpLi93PKHUXTM,1616
-dstack/_internal/core/backends/nebius/models.py,sha256=
-dstack/_internal/core/backends/nebius/resources.py,sha256=
+dstack/_internal/core/backends/nebius/models.py,sha256=OSiUANBf893Xdm7-4WDoPfmd3YFk5-oRjdXiWUjvDdk,6194
+dstack/_internal/core/backends/nebius/resources.py,sha256=MVyMaS0-mZu2g-tJ4HF7GiT1hFFL7Mha9hXtP3XeT7o,14070
 dstack/_internal/core/backends/oci/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 dstack/_internal/core/backends/oci/auth.py,sha256=8Cr18y_LOsyRP-16yfFpT70Cofpm0clB3KawS_7aRl4,717
 dstack/_internal/core/backends/oci/backend.py,sha256=yXjVCt7n6BVLH0byYFbNFf-P9J0FwlNfxsYbKGMdoI4,536
@@ -182,7 +182,7 @@ dstack/_internal/core/compatibility/fleets.py,sha256=jg42A7OmprqATKKt6JpLL1qOQSZ
 dstack/_internal/core/compatibility/gateways.py,sha256=4h_lfpN9KJFyLTFexq-wlu74Rwpk0anV67v38aJ-SnI,1463
 dstack/_internal/core/compatibility/gpus.py,sha256=myWVUjaK2S1QuYgRZyMtD8-DPKQTjadSbsnECtfoHHs,575
 dstack/_internal/core/compatibility/logs.py,sha256=keXt3OFKR0CjD_XMsetzRu8yQGCz7CWBwycuP267L_Q,629
-dstack/_internal/core/compatibility/runs.py,sha256=
+dstack/_internal/core/compatibility/runs.py,sha256=pT5RxheOzJJXFpJM1th-ku9-inj3McMBcdEHxcMBp9U,9357
 dstack/_internal/core/compatibility/volumes.py,sha256=ofjpVusuc-pq285bGrIh8PAqu0QlAd6NQgU3gfJQIc0,1546
 dstack/_internal/core/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 dstack/_internal/core/models/common.py,sha256=QKdZM7L2NepzOPavkkI_q3g6WYCauMTbtZSZwWZVYHE,4704
@@ -197,7 +197,7 @@ dstack/_internal/core/models/instances.py,sha256=Gpv46fu3uWO-3f8w1A6rBzU5dYhmO_w
 dstack/_internal/core/models/logs.py,sha256=VOsgEsvUIRNNHivD6OZnPZNC52ioqafv7ccdnFQ1YI8,529
 dstack/_internal/core/models/metrics.py,sha256=Xb8hCXUL-ncQ3PMsErIUAJTe9gwh5jyrQ4UQoZbibsc,269
 dstack/_internal/core/models/placement.py,sha256=WJVq5ENJykyRarQzL2EeYQag_9_jV7VSAtR_xoFvPVM,720
-dstack/_internal/core/models/profiles.py,sha256=
+dstack/_internal/core/models/profiles.py,sha256=YW7XeztKBnVCptvUyR8-E-dbB06nu9GIqwLvmtHgSms,14501
 dstack/_internal/core/models/projects.py,sha256=hOZoL85q-873vT_Aw7FhzpS6DGVt0Y3yT8kpElrLFto,833
 dstack/_internal/core/models/resources.py,sha256=-dLupzud5BSqxNABBjLVYTCKekbr9_mhaNGD1ZWBjgM,14544
 dstack/_internal/core/models/runs.py,sha256=ehGSyCSx5OAaEqCEd2YvCRiP_uewUeDOHbJGSocu6w0,22609
@@ -289,17 +289,17 @@ dstack/_internal/server/settings.py,sha256=7SRzSlTnUPNNjlZH-vpgwbwj95gI-LLtQite8
 dstack/_internal/server/background/__init__.py,sha256=QftEjgQZffu83sY-F0WL65vRp28FbBEtezfowQYcTv4,5606
 dstack/_internal/server/background/tasks/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 dstack/_internal/server/background/tasks/common.py,sha256=n87hFjDNtS2x8mYbBnKLqhXus1P8qkdfqXG1TSeIJjM,1089
-dstack/_internal/server/background/tasks/process_fleets.py,sha256=
+dstack/_internal/server/background/tasks/process_fleets.py,sha256=0i_S3HCZp4AjQjER7j_pvIm22eYbrBwZnt9-0kgsq3E,11547
 dstack/_internal/server/background/tasks/process_gateways.py,sha256=FH9RY3Tfmtw_UctCdYZDIRb2rgtmHdxTg6Oc4IBiDBA,8356
 dstack/_internal/server/background/tasks/process_idle_volumes.py,sha256=mqnl8wvWaKTYvJMbgFJbOP-bZMRQG2vrhUnaNcyBldE,5223
-dstack/_internal/server/background/tasks/process_instances.py,sha256=
+dstack/_internal/server/background/tasks/process_instances.py,sha256=IeKl28NSC-va5QABW0PC3eDGU6r-Xz1TNuSK2EtIX08,44007
 dstack/_internal/server/background/tasks/process_metrics.py,sha256=yKXe9J7m3oleK0C-oGJaYNkcPT8kqkz0nw-A7xqYbjE,6390
 dstack/_internal/server/background/tasks/process_placement_groups.py,sha256=lgYIzjHG9EITK31yG6uQjlIcSwW5jsP9ZOBBZqW_eNs,4263
 dstack/_internal/server/background/tasks/process_probes.py,sha256=dmug-_rmYiVLLF-imto-Ju1gPtENvHvCjHyilqgYuJw,6457
 dstack/_internal/server/background/tasks/process_prometheus_metrics.py,sha256=_UZm37FVV4rhdd0So7HtcKbIgrSdAr5Vx-Uen_xizec,5459
 dstack/_internal/server/background/tasks/process_running_jobs.py,sha256=IoQi7mm4upEZgujTkWYrXDKrC5rSZ5Q4_jAR4OpajaM,44973
-dstack/_internal/server/background/tasks/process_runs.py,sha256=
-dstack/_internal/server/background/tasks/process_submitted_jobs.py,sha256=
+dstack/_internal/server/background/tasks/process_runs.py,sha256=K4km4XT0JYUf6JYbpKbEAyumUDBT21lqcMFTQ7pIsoY,25200
+dstack/_internal/server/background/tasks/process_submitted_jobs.py,sha256=fDuLfnSJqWDL2oPrxXQMkaqKgplMdKT0SRD_AyB4n-0,43099
 dstack/_internal/server/background/tasks/process_terminating_jobs.py,sha256=S7ZSDVMX-N0XMaMgwFa1QG_RAi48BP432s9AqHw4PMM,4066
 dstack/_internal/server/background/tasks/process_volumes.py,sha256=_fMmkwLYsyX-kpW9pDrZVJvFTZEOPp0gpjyKBMW-zw0,5204
 dstack/_internal/server/migrations/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -662,8 +662,8 @@ dstack/plugins/builtin/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3
 dstack/plugins/builtin/rest_plugin/__init__.py,sha256=lgTsq8Z6Km2F2UhPRChVB4vDM5ZpWtdk1iB1aa20ypA,440
 dstack/plugins/builtin/rest_plugin/_models.py,sha256=9hgVuU6OGSxidar88XhQnNo9izYWeQvVH45ciErv-Es,1910
 dstack/plugins/builtin/rest_plugin/_plugin.py,sha256=h3r3Yc3h22i93fifPTgTm9Oojd1sN1O4DP7ZTV-kWpM,5386
-dstack-0.19.31.dist-info/METADATA,sha256=
-dstack-0.19.31.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-dstack-0.19.31.dist-info/entry_points.txt,sha256=GnLrMS8hx3rWAySQjA7tPNhtixV6a-brRkmal1PKoHc,58
-dstack-0.19.31.dist-info/licenses/LICENSE.md,sha256=qDABaRGjSKVOib1U8viw2P_96sIK7Puo426784oD9f8,15976
-dstack-0.19.31.dist-info/RECORD,,
+dstack-0.19.32.dist-info/METADATA,sha256=R_c6-NfPoaFBeuJVdHTD6dENAKZZsBo_syd6wCdiJ6M,20834
+dstack-0.19.32.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+dstack-0.19.32.dist-info/entry_points.txt,sha256=GnLrMS8hx3rWAySQjA7tPNhtixV6a-brRkmal1PKoHc,58
+dstack-0.19.32.dist-info/licenses/LICENSE.md,sha256=qDABaRGjSKVOib1U8viw2P_96sIK7Puo426784oD9f8,15976
+dstack-0.19.32.dist-info/RECORD,,
{dstack-0.19.31.dist-info → dstack-0.19.32.dist-info}/WHEEL: file without changes
{dstack-0.19.31.dist-info → dstack-0.19.32.dist-info}/entry_points.txt: file without changes
{dstack-0.19.31.dist-info → dstack-0.19.32.dist-info}/licenses/LICENSE.md: file without changes