dstack 0.19.12__py3-none-any.whl → 0.19.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dstack might be problematic. Click here for more details.
- dstack/_internal/cli/services/configurators/run.py +1 -6
- dstack/_internal/cli/utils/run.py +5 -1
- dstack/_internal/core/backends/aws/compute.py +22 -9
- dstack/_internal/core/backends/aws/resources.py +26 -0
- dstack/_internal/core/backends/base/offers.py +0 -1
- dstack/_internal/core/compatibility/__init__.py +0 -0
- dstack/_internal/core/compatibility/fleets.py +72 -0
- dstack/_internal/core/compatibility/gateways.py +34 -0
- dstack/_internal/core/compatibility/runs.py +125 -0
- dstack/_internal/core/compatibility/volumes.py +32 -0
- dstack/_internal/core/models/instances.py +51 -12
- dstack/_internal/core/models/runs.py +41 -0
- dstack/_internal/server/app.py +1 -1
- dstack/_internal/server/services/storage/__init__.py +38 -0
- dstack/_internal/server/services/storage/base.py +27 -0
- dstack/_internal/server/services/storage/gcs.py +44 -0
- dstack/_internal/server/services/{storage.py → storage/s3.py} +4 -27
- dstack/_internal/server/settings.py +7 -3
- dstack/_internal/server/statics/index.html +1 -1
- dstack/_internal/server/statics/{main-b0e80f8e26a168c129e9.js → main-2066f1f22ddb4557bcde.js} +1615 -31
- dstack/_internal/server/statics/{main-b0e80f8e26a168c129e9.js.map → main-2066f1f22ddb4557bcde.js.map} +1 -1
- dstack/_internal/server/statics/{main-8f9c66f404e9c7e7e020.css → main-f39c418b05fe14772dd8.css} +1 -1
- dstack/api/server/_fleets.py +9 -73
- dstack/api/server/_gateways.py +3 -14
- dstack/api/server/_runs.py +4 -124
- dstack/api/server/_volumes.py +3 -14
- dstack/plugins/builtin/rest_plugin/_plugin.py +24 -5
- dstack/version.py +2 -2
- {dstack-0.19.12.dist-info → dstack-0.19.13.dist-info}/METADATA +1 -1
- {dstack-0.19.12.dist-info → dstack-0.19.13.dist-info}/RECORD +33 -25
- {dstack-0.19.12.dist-info → dstack-0.19.13.dist-info}/WHEEL +0 -0
- {dstack-0.19.12.dist-info → dstack-0.19.13.dist-info}/entry_points.txt +0 -0
- {dstack-0.19.12.dist-info → dstack-0.19.13.dist-info}/licenses/LICENSE.md +0 -0
|
@@ -166,12 +166,7 @@ class BaseRunConfigurator(ApplyEnvVarsConfiguratorMixin, BaseApplyConfigurator):
|
|
|
166
166
|
# We can attach to run multiple times if it goes from running to pending (retried).
|
|
167
167
|
while True:
|
|
168
168
|
with MultiItemStatus(f"Launching [code]{run.name}[/]...", console=console) as live:
|
|
169
|
-
while
|
|
170
|
-
RunStatus.SUBMITTED,
|
|
171
|
-
RunStatus.PENDING,
|
|
172
|
-
RunStatus.PROVISIONING,
|
|
173
|
-
RunStatus.TERMINATING,
|
|
174
|
-
):
|
|
169
|
+
while not _is_ready_to_attach(run):
|
|
175
170
|
table = get_runs_table([run])
|
|
176
171
|
live.update(table)
|
|
177
172
|
time.sleep(5)
|
|
@@ -166,11 +166,15 @@ def get_runs_table(
|
|
|
166
166
|
run_row: Dict[Union[str, int], Any] = {
|
|
167
167
|
"NAME": run.run_spec.run_name,
|
|
168
168
|
"SUBMITTED": format_date(run.submitted_at),
|
|
169
|
+
"STATUS": (
|
|
170
|
+
run.latest_job_submission.status_message
|
|
171
|
+
if run.status.is_finished() and run.latest_job_submission
|
|
172
|
+
else run.status_message
|
|
173
|
+
),
|
|
169
174
|
}
|
|
170
175
|
if run.error:
|
|
171
176
|
run_row["ERROR"] = run.error
|
|
172
177
|
if len(run.jobs) != 1:
|
|
173
|
-
run_row["STATUS"] = run.status
|
|
174
178
|
add_row_from_dict(table, run_row)
|
|
175
179
|
|
|
176
180
|
for job in run.jobs:
|
|
@@ -132,7 +132,8 @@ class AWSCompute(
|
|
|
132
132
|
availability_offers = []
|
|
133
133
|
for offer in offers:
|
|
134
134
|
availability = InstanceAvailability.UNKNOWN
|
|
135
|
-
|
|
135
|
+
quota = _has_quota(regions_to_quotas[offer.region], offer.instance.name)
|
|
136
|
+
if quota is not None and not quota:
|
|
136
137
|
availability = InstanceAvailability.NO_QUOTA
|
|
137
138
|
availability_offers.append(
|
|
138
139
|
InstanceOfferWithAvailability(
|
|
@@ -231,6 +232,7 @@ class AWSCompute(
|
|
|
231
232
|
image_id, username = aws_resources.get_image_id_and_username(
|
|
232
233
|
ec2_client=ec2_client,
|
|
233
234
|
cuda=len(instance_offer.instance.resources.gpus) > 0,
|
|
235
|
+
instance_type=instance_offer.instance.name,
|
|
234
236
|
image_config=self.config.os_images,
|
|
235
237
|
)
|
|
236
238
|
response = ec2_resource.create_instances(
|
|
@@ -781,10 +783,18 @@ def _get_regions_to_quotas(
|
|
|
781
783
|
) -> Dict[str, Dict[str, int]]:
|
|
782
784
|
def get_region_quotas(client: botocore.client.BaseClient) -> Dict[str, int]:
|
|
783
785
|
region_quotas = {}
|
|
784
|
-
|
|
785
|
-
for
|
|
786
|
-
|
|
787
|
-
|
|
786
|
+
try:
|
|
787
|
+
for page in client.get_paginator("list_service_quotas").paginate(ServiceCode="ec2"):
|
|
788
|
+
for q in page["Quotas"]:
|
|
789
|
+
if "On-Demand" in q["QuotaName"]:
|
|
790
|
+
region_quotas[q["UsageMetric"]["MetricDimensions"]["Class"]] = q["Value"]
|
|
791
|
+
except botocore.exceptions.ClientError as e:
|
|
792
|
+
if len(e.args) > 0 and "TooManyRequestsException" in e.args[0]:
|
|
793
|
+
logger.warning(
|
|
794
|
+
"Failed to get quotas due to rate limits. Quotas won't be accounted for."
|
|
795
|
+
)
|
|
796
|
+
else:
|
|
797
|
+
logger.exception(e)
|
|
788
798
|
return region_quotas
|
|
789
799
|
|
|
790
800
|
regions_to_quotas = {}
|
|
@@ -800,12 +810,15 @@ def _get_regions_to_quotas(
|
|
|
800
810
|
return regions_to_quotas
|
|
801
811
|
|
|
802
812
|
|
|
803
|
-
def _has_quota(quotas: Dict[str, int], instance_name: str) -> bool:
|
|
813
|
+
def _has_quota(quotas: Dict[str, int], instance_name: str) -> Optional[bool]:
    """
    Tells whether the on-demand quota that applies to `instance_name` is positive.

    Names starting with "g" are checked against the "G/OnDemand" entry of `quotas`,
    names starting with "p" against "P/OnDemand", and every other name against
    "Standard/OnDemand".

    Returns `None` if `quotas` has no entry for the selected class, so that
    callers can tell an unknown quota apart from an exhausted one.
    """
    if instance_name.startswith("g"):
        quota_class = "G/OnDemand"
    elif instance_name.startswith("p"):
        quota_class = "P/OnDemand"
    else:
        quota_class = "Standard/OnDemand"
    limit = quotas.get(quota_class)
    return None if limit is None else limit > 0
|
|
809
822
|
|
|
810
823
|
|
|
811
824
|
def _get_regions_to_zones(session: boto3.Session, regions: List[str]) -> Dict[str, List[str]]:
|
|
@@ -12,11 +12,13 @@ from dstack._internal.utils.logging import get_logger
|
|
|
12
12
|
logger = get_logger(__name__)
|
|
13
13
|
|
|
14
14
|
DSTACK_ACCOUNT_ID = "142421590066"
|
|
15
|
+
DLAMI_OWNER_ACCOUNT_ID = "898082745236"
|
|
15
16
|
|
|
16
17
|
|
|
17
18
|
def get_image_id_and_username(
|
|
18
19
|
ec2_client: botocore.client.BaseClient,
|
|
19
20
|
cuda: bool,
|
|
21
|
+
instance_type: str,
|
|
20
22
|
image_config: Optional[AWSOSImageConfig] = None,
|
|
21
23
|
) -> tuple[str, str]:
|
|
22
24
|
if image_config is not None:
|
|
@@ -27,6 +29,11 @@ def get_image_id_and_username(
|
|
|
27
29
|
image_name = image.name
|
|
28
30
|
image_owner = image.owner
|
|
29
31
|
username = image.user
|
|
32
|
+
elif _supported_by_dlami(instance_type):
|
|
33
|
+
# TODO: Update DLAMI image version from time to time
|
|
34
|
+
image_name = "Deep Learning Base OSS Nvidia Driver GPU AMI (Ubuntu 22.04) 20250516"
|
|
35
|
+
image_owner = DLAMI_OWNER_ACCOUNT_ID
|
|
36
|
+
username = "ubuntu"
|
|
30
37
|
else:
|
|
31
38
|
image_name = (
|
|
32
39
|
f"dstack-{version.base_image}" if not cuda else f"dstack-cuda-{version.base_image}"
|
|
@@ -628,6 +635,25 @@ def _is_private_subnet_with_internet_egress(
|
|
|
628
635
|
return False
|
|
629
636
|
|
|
630
637
|
|
|
638
|
+
def _supported_by_dlami(instance_type: str) -> bool:
    """
    Tells whether `instance_type` starts with one of the instance family
    prefixes listed below.
    """
    # Currently only p3. instances are not supported by DLAMI among GPU instances.
    dlami_families = (
        "g4dn.",
        "g5.",
        "g6.",
        "gr6.",
        "g6e.",
        "p4d.",
        "p4de.",
        "p5.",
        "p5e.",
        "p6-b200.",
    )
    # `str.startswith` accepts a tuple and matches if any prefix matches.
    return instance_type.startswith(dlami_families)
|
|
655
|
+
|
|
656
|
+
|
|
631
657
|
def get_reservation(
|
|
632
658
|
ec2_client: botocore.client.BaseClient,
|
|
633
659
|
reservation_id: str,
|
|
File without changes
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
from typing import Any, Dict, Optional
|
|
2
|
+
|
|
3
|
+
from dstack._internal.core.models.fleets import ApplyFleetPlanInput, FleetSpec
|
|
4
|
+
from dstack._internal.core.models.instances import Instance
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def get_get_plan_excludes(fleet_spec: FleetSpec) -> Dict:
    """
    Returns an exclude mapping for the get fleet plan request.

    The mapping has a `spec` key only when `get_fleet_spec_excludes` reports
    something to exclude; otherwise it is empty.
    """
    spec_excludes = get_fleet_spec_excludes(fleet_spec)
    return {"spec": spec_excludes} if spec_excludes else {}
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def get_apply_plan_excludes(plan_input: ApplyFleetPlanInput) -> Dict:
    """
    Returns an exclude mapping for the apply fleet plan request.

    The result is always of the form `{"plan": {...}}`. Inside it:
    - `spec` is present only if the fleet spec has fields to exclude;
    - `current_resource` is present whenever the plan has a current resource,
      and excludes `cpu_arch` of all instances only if none of them has it set.
    """
    plan_excludes: Dict[str, Any] = {}
    spec_excludes = get_fleet_spec_excludes(plan_input.spec)
    if spec_excludes:
        plan_excludes["spec"] = spec_excludes
    current = plan_input.current_resource
    if current is not None:
        no_cpu_arch = all(
            _should_exclude_instance_cpu_arch(instance) for instance in current.instances
        )
        if no_cpu_arch:
            plan_excludes["current_resource"] = {
                "instances": {"__all__": {"instance_type": {"resources": {"cpu_arch"}}}}
            }
        else:
            plan_excludes["current_resource"] = {}
    return {"plan": plan_excludes}
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def get_create_fleet_excludes(fleet_spec: FleetSpec) -> Dict:
    """
    Returns an exclude mapping for the create fleet request.

    The mapping has a `spec` key only when `get_fleet_spec_excludes` reports
    something to exclude; otherwise it is empty.
    """
    spec_excludes = get_fleet_spec_excludes(fleet_spec)
    if not spec_excludes:
        return {}
    return {"spec": spec_excludes}
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def get_fleet_spec_excludes(fleet_spec: FleetSpec) -> Optional[Dict]:
    """
    Returns `fleet_spec` exclude mapping to exclude certain fields from the request.
    Use this method to exclude new fields when they are not set to keep
    clients backward-compatibility with older servers.

    Returns `None` when there is nothing to exclude.
    """
    profile = fleet_spec.profile

    # New configuration-level fields that are dropped from the request while unset.
    configuration_excludes: Dict[str, Any] = {}
    if fleet_spec.configuration.tags is None:
        configuration_excludes["tags"] = True

    # New profile-level fields that are dropped from the request while unset.
    profile_excludes = {
        field
        for field in ("fleets", "tags", "startup_order", "stop_criteria")
        if getattr(profile, field) is None
    }

    spec_excludes: Dict[str, Any] = {}
    if configuration_excludes:
        spec_excludes["configuration"] = configuration_excludes
    if profile_excludes:
        spec_excludes["profile"] = profile_excludes
    return spec_excludes or None
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def _should_exclude_instance_cpu_arch(instance: Instance) -> bool:
    """
    Tells whether `cpu_arch` is unset for `instance`.

    Also returns `True` when the `instance_type.resources.cpu_arch` chain
    cannot be followed because an attribute along the way is missing.
    """
    try:
        cpu_arch = instance.instance_type.resources.cpu_arch
    except AttributeError:
        return True
    return cpu_arch is None
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
from typing import Dict
|
|
2
|
+
|
|
3
|
+
from dstack._internal.core.models.gateways import GatewayConfiguration, GatewaySpec
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def get_gateway_spec_excludes(gateway_spec: GatewaySpec) -> Dict:
    """
    Returns `gateway_spec` exclude mapping to exclude certain fields from the request.
    Use this method to exclude new fields when they are not set to keep
    clients backward-compatibility with older servers.

    The `configuration` key is always present, even if its mapping is empty.
    """
    return {
        "configuration": _get_gateway_configuration_excludes(gateway_spec.configuration),
    }
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def get_create_gateway_excludes(configuration: GatewayConfiguration) -> Dict:
    """
    Returns an exclude mapping to exclude certain fields from the create gateway request.
    Use this method to exclude new fields when they are not set to keep
    clients backward-compatibility with older servers.

    The `configuration` key is always present, even if its mapping is empty.
    """
    return {"configuration": _get_gateway_configuration_excludes(configuration)}
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def _get_gateway_configuration_excludes(configuration: GatewayConfiguration) -> Dict:
    """
    Returns the exclude mapping for a gateway configuration:
    `tags` is excluded when it is not set, otherwise nothing is excluded.
    """
    if configuration.tags is None:
        return {"tags": True}
    return {}
|
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
from typing import Any, Dict, Optional
|
|
2
|
+
|
|
3
|
+
from dstack._internal.core.models.configurations import ServiceConfiguration
|
|
4
|
+
from dstack._internal.core.models.runs import ApplyRunPlanInput, JobSubmission, RunSpec
|
|
5
|
+
from dstack._internal.server.schemas.runs import GetRunPlanRequest
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def _get_job_submissions_excludes(job_submissions: "list[JobSubmission]") -> Dict[str, Any]:
    """
    Returns the exclude mapping shared by the given job submissions.
    A field is excluded only if it is unset in every submission.
    """
    excludes: Dict[str, Any] = {}
    if all(_should_exclude_job_submission_jpd_cpu_arch(js) for js in job_submissions):
        excludes["job_provisioning_data"] = {"instance_type": {"resources": {"cpu_arch"}}}
    if all(_should_exclude_job_submission_jrd_cpu_arch(js) for js in job_submissions):
        excludes["job_runtime_data"] = {"offer": {"instance": {"resources": {"cpu_arch"}}}}
    if all(js.exit_status is None for js in job_submissions):
        excludes["exit_status"] = True
    return excludes


def get_apply_plan_excludes(plan: ApplyRunPlanInput) -> Optional[Dict]:
    """
    Returns `plan` exclude mapping to exclude certain fields from the request.
    Use this method to exclude new fields when they are not set to keep
    clients backward-compatibility with older servers.

    The result is always of the form `{"plan": {...}}`.
    """
    plan_excludes: Dict[str, Any] = {}
    run_spec_excludes = get_run_spec_excludes(plan.run_spec)
    if run_spec_excludes is not None:
        plan_excludes["run_spec"] = run_spec_excludes

    current = plan.current_resource
    if current is not None:
        # `status_message` is always dropped from the current resource.
        current_excludes: Dict[str, Any] = {"status_message": True}
        plan_excludes["current_resource"] = current_excludes
        # May be `None` when the current run spec has nothing to exclude.
        current_excludes["run_spec"] = get_run_spec_excludes(current.run_spec)

        # One mapping is applied to every submission of every job,
        # so it is computed over all submissions at once.
        all_submissions = [js for job in current.jobs for js in job.job_submissions]
        current_excludes["jobs"] = {
            "__all__": {
                "job_submissions": {"__all__": _get_job_submissions_excludes(all_submissions)}
            }
        }

        latest = current.latest_job_submission
        if latest is not None:
            current_excludes["latest_job_submission"] = _get_job_submissions_excludes([latest])
    return {"plan": plan_excludes}
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def get_get_plan_excludes(request: GetRunPlanRequest) -> Optional[Dict]:
    """
    Excludes new fields when they are not set to keep
    clients backward-compatibility with older servers.

    `run_spec` is included only if the run spec has something to exclude,
    and `max_offers` is excluded only if it is not set.
    """
    excludes: Dict[str, Any] = {}
    run_spec_excludes = get_run_spec_excludes(request.run_spec)
    if run_spec_excludes is not None:
        excludes["run_spec"] = run_spec_excludes
    if request.max_offers is None:
        excludes["max_offers"] = True
    return excludes
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def get_run_spec_excludes(run_spec: RunSpec) -> Optional[Dict]:
    """
    Returns `run_spec` exclude mapping to exclude certain fields from the request.
    Use this method to exclude new fields when they are not set to keep
    clients backward-compatibility with older servers.

    Returns `None` when there is nothing to exclude.
    """
    configuration = run_spec.configuration
    profile = run_spec.profile

    configuration_excludes: Dict[str, Any] = {}
    for field in ("fleets", "tags"):
        if getattr(configuration, field) is None:
            configuration_excludes[field] = True
    # `rate_limits` is checked only for services and is dropped when empty, not only when unset.
    if isinstance(configuration, ServiceConfiguration) and not configuration.rate_limits:
        configuration_excludes["rate_limits"] = True
    for field in ("shell", "priority", "startup_order", "stop_criteria"):
        if getattr(configuration, field) is None:
            configuration_excludes[field] = True

    # The profile is optional; without it there are no profile fields to exclude.
    profile_excludes = set()
    if profile is not None:
        profile_excludes = {
            field
            for field in ("fleets", "tags", "startup_order", "stop_criteria")
            if getattr(profile, field) is None
        }

    spec_excludes: Dict[str, Any] = {}
    if configuration_excludes:
        spec_excludes["configuration"] = configuration_excludes
    if profile_excludes:
        spec_excludes["profile"] = profile_excludes
    return spec_excludes or None
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def _should_exclude_job_submission_jpd_cpu_arch(job_submission: JobSubmission) -> bool:
    """
    Tells whether `cpu_arch` is unset in the job provisioning data of `job_submission`.

    Also returns `True` when the `job_provisioning_data.instance_type.resources.cpu_arch`
    chain cannot be followed because an attribute along the way is missing.
    """
    try:
        cpu_arch = job_submission.job_provisioning_data.instance_type.resources.cpu_arch
    except AttributeError:
        return True
    return cpu_arch is None
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
def _should_exclude_job_submission_jrd_cpu_arch(job_submission: JobSubmission) -> bool:
    """
    Tells whether `cpu_arch` is unset in the job runtime data of `job_submission`.

    Also returns `True` when the `job_runtime_data.offer.instance.resources.cpu_arch`
    chain cannot be followed because an attribute along the way is missing.
    """
    try:
        cpu_arch = job_submission.job_runtime_data.offer.instance.resources.cpu_arch
    except AttributeError:
        return True
    return cpu_arch is None
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
from typing import Dict
|
|
2
|
+
|
|
3
|
+
from dstack._internal.core.models.volumes import VolumeConfiguration, VolumeSpec
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def get_volume_spec_excludes(volume_spec: VolumeSpec) -> Dict:
    """
    Returns `volume_spec` exclude mapping to exclude certain fields from the request.
    Use this method to exclude new fields when they are not set to keep
    clients backward-compatibility with older servers.

    The `configuration` key is always present, even if its mapping is empty.
    """
    return {"configuration": _get_volume_configuration_excludes(volume_spec.configuration)}
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def get_create_volume_excludes(configuration: VolumeConfiguration) -> Dict:
    """
    Returns an exclude mapping to exclude certain fields from the create volume request.
    Use this method to exclude new fields when they are not set to keep
    clients backward-compatibility with older servers.

    The `configuration` key is always present, even if its mapping is empty.
    """
    return {"configuration": _get_volume_configuration_excludes(configuration)}
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def _get_volume_configuration_excludes(configuration: VolumeConfiguration) -> Dict:
    """
    Returns the exclude mapping for a volume configuration:
    `tags` is excluded when it is not set, otherwise nothing is excluded.
    """
    if configuration.tags is None:
        return {"tags": True}
    return {}
|
|
@@ -48,29 +48,68 @@ class Resources(CoreModel):
|
|
|
48
48
|
gpus: List[Gpu]
|
|
49
49
|
spot: bool
|
|
50
50
|
disk: Disk = Disk(size_mib=102400) # the default value (100GB) for backward compatibility
|
|
51
|
+
# TODO: make description a computed field after migrating to pydanticV2
|
|
51
52
|
description: str = ""
|
|
52
53
|
cpu_arch: Optional[gpuhunt.CPUArchitecture] = None
|
|
53
54
|
|
|
54
|
-
|
|
55
|
+
@root_validator
def _description(cls, values) -> Dict:
    """
    Fills in `description` with the formatted resources when it is empty.

    If any of the fields needed for formatting is missing from `values`,
    the values are returned unchanged.
    """
    try:
        if not values["description"]:
            # Keyword arguments keep the original lookup order of the fields.
            values["description"] = Resources._pretty_format(
                cpus=values["cpus"],
                memory_mib=values["memory_mib"],
                gpus=values["gpus"],
                disk_size_mib=values["disk"].size_mib,
                spot=values["spot"],
                cpu_arch=values["cpu_arch"],
                include_spot=True,
            )
    except KeyError:
        pass
    return values
|
|
72
|
+
|
|
73
|
+
@staticmethod
def _pretty_format(
    cpus: int,
    cpu_arch: Optional[gpuhunt.CPUArchitecture],
    memory_mib: int,
    disk_size_mib: int,
    gpus: List[Gpu],
    spot: bool,
    include_spot: bool = False,
) -> str:
    """
    Formats the given resources into a string using `pretty_resources`.

    Non-positive CPU count, memory, disk size, and GPU memory are left out.
    GPU name and memory are taken from the first GPU in `gpus`.
    " (spot)" is appended only if both `include_spot` and `spot` are true.
    """

    def as_gb(size_mib: int) -> str:
        return f"{size_mib / 1024:.0f}GB"

    fields = {}
    if cpus > 0:
        fields["cpus"] = cpus
        fields["cpu_arch"] = cpu_arch
    if memory_mib > 0:
        fields["memory"] = as_gb(memory_mib)
    if disk_size_mib > 0:
        fields["disk_size"] = as_gb(disk_size_mib)
    if gpus:
        first_gpu = gpus[0]
        fields["gpu_name"] = first_gpu.name
        fields["gpu_count"] = len(gpus)
        if first_gpu.memory_mib > 0:
            fields["gpu_memory"] = as_gb(first_gpu.memory_mib)
    formatted = pretty_resources(**fields)
    return formatted + " (spot)" if include_spot and spot else formatted
|
|
73
101
|
|
|
102
|
+
def pretty_format(self, include_spot: bool = False) -> str:
    """
    Formats these resources into a string.
    The spot flag is included in the output only if `include_spot` is set.
    """
    return Resources._pretty_format(
        cpus=self.cpus,
        cpu_arch=self.cpu_arch,
        memory_mib=self.memory_mib,
        disk_size_mib=self.disk.size_mib,
        gpus=self.gpus,
        spot=self.spot,
        include_spot=include_spot,
    )
|
|
112
|
+
|
|
74
113
|
|
|
75
114
|
class InstanceType(CoreModel):
|
|
76
115
|
name: str
|
|
@@ -496,6 +496,7 @@ class Run(CoreModel):
|
|
|
496
496
|
submitted_at: datetime
|
|
497
497
|
last_processed_at: datetime
|
|
498
498
|
status: RunStatus
|
|
499
|
+
status_message: Optional[str] = None
|
|
499
500
|
termination_reason: Optional[RunTerminationReason]
|
|
500
501
|
run_spec: RunSpec
|
|
501
502
|
jobs: List[Job]
|
|
@@ -524,6 +525,46 @@ class Run(CoreModel):
|
|
|
524
525
|
else:
|
|
525
526
|
return None
|
|
526
527
|
|
|
528
|
+
@root_validator
def _status_message(cls, values) -> Dict:
    """
    Sets `status_message` from the run status, the retry events of the first job,
    and the last termination reason of the first job.

    If `status` or `jobs` is missing from `values`, the values are returned unchanged.
    """
    try:
        status = values["status"]
        jobs: List[Job] = values["jobs"]
        if jobs:
            first_job = jobs[0]
            retry = first_job.job_spec.retry
            retry_on_events = retry.on_events if retry else []
            termination_reason = Run.get_last_termination_reason(first_job)
        else:
            retry_on_events = []
            termination_reason = None
    except KeyError:
        return values
    values["status_message"] = Run._get_status_message(
        status=status,
        retry_on_events=retry_on_events,
        termination_reason=termination_reason,
    )
    return values
|
|
545
|
+
|
|
546
|
+
@staticmethod
def get_last_termination_reason(job: "Job") -> Optional[JobTerminationReason]:
    """
    Returns the termination reason of the most recent submission of `job`
    that has one, or `None` if no submission has a termination reason.
    """
    reasons = (
        submission.termination_reason
        for submission in reversed(job.job_submissions)
        if submission.termination_reason is not None
    )
    return next(reasons, None)
|
|
552
|
+
|
|
553
|
+
@staticmethod
def _get_status_message(
    status: RunStatus,
    retry_on_events: List[RetryEvent],
    termination_reason: Optional[JobTerminationReason],
) -> str:
    """
    Returns the message to show for the run status:
    "retrying" for a submitted or pending run whose last termination reason is
    no capacity and which retries on no-capacity events, the status value otherwise.
    """
    # Currently, `retrying` is shown only for `no-capacity` events
    if status not in [RunStatus.SUBMITTED, RunStatus.PENDING]:
        return status.value
    if not (termination_reason == JobTerminationReason.FAILED_TO_START_DUE_TO_NO_CAPACITY):
        return status.value
    if RetryEvent.NO_CAPACITY not in retry_on_events:
        return status.value
    return "retrying"
|
|
567
|
+
|
|
527
568
|
|
|
528
569
|
class JobPlan(CoreModel):
|
|
529
570
|
job_spec: JobSpec
|
dstack/_internal/server/app.py
CHANGED
|
@@ -128,7 +128,7 @@ async def lifespan(app: FastAPI):
|
|
|
128
128
|
yes=UPDATE_DEFAULT_PROJECT,
|
|
129
129
|
no=DO_NOT_UPDATE_DEFAULT_PROJECT,
|
|
130
130
|
)
|
|
131
|
-
if settings.
|
|
131
|
+
if settings.SERVER_S3_BUCKET is not None or settings.SERVER_GCS_BUCKET is not None:
|
|
132
132
|
init_default_storage()
|
|
133
133
|
scheduler = start_background_tasks()
|
|
134
134
|
dstack_version = DSTACK_VERSION if DSTACK_VERSION else "(no version)"
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
from typing import Optional
|
|
2
|
+
|
|
3
|
+
from dstack._internal.server import settings
|
|
4
|
+
from dstack._internal.server.services.storage.base import BaseStorage
|
|
5
|
+
from dstack._internal.server.services.storage.gcs import GCS_AVAILABLE, GCSStorage
|
|
6
|
+
from dstack._internal.server.services.storage.s3 import BOTO_AVAILABLE, S3Storage
|
|
7
|
+
|
|
8
|
+
_default_storage = None
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def init_default_storage():
    """
    Initializes the module-level default storage from the server settings.

    Uses `S3Storage` if `settings.SERVER_S3_BUCKET` is set and `GCSStorage`
    if `settings.SERVER_GCS_BUCKET` is set.

    Raises `ValueError` if neither or both buckets are configured,
    or if the dependencies of the selected backend are not installed.
    """
    global _default_storage
    s3_bucket = settings.SERVER_S3_BUCKET
    gcs_bucket = settings.SERVER_GCS_BUCKET
    if s3_bucket is None and gcs_bucket is None:
        raise ValueError(
            "Either settings.SERVER_S3_BUCKET or settings.SERVER_GCS_BUCKET must be set"
        )
    if s3_bucket and gcs_bucket:
        raise ValueError(
            "Only one of settings.SERVER_S3_BUCKET or settings.SERVER_GCS_BUCKET can be set"
        )

    if s3_bucket:
        if not BOTO_AVAILABLE:
            raise ValueError("AWS dependencies are not installed")
        _default_storage = S3Storage(
            bucket=s3_bucket,
            region=settings.SERVER_S3_BUCKET_REGION,
        )
        return
    if gcs_bucket:
        if not GCS_AVAILABLE:
            raise ValueError("GCS dependencies are not installed")
        _default_storage = GCSStorage(bucket=gcs_bucket)
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def get_default_storage() -> Optional[BaseStorage]:
    """
    Returns the current module-level default storage,
    which is `None` until a storage has been assigned to it.
    """
    return _default_storage
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
from abc import ABC, abstractmethod
|
|
2
|
+
from typing import Optional
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class BaseStorage(ABC):
    """
    Abstract interface of a storage for code blobs.
    Blobs are addressed by project ID, repo ID, and code hash.
    """

    @abstractmethod
    def upload_code(
        self,
        project_id: str,
        repo_id: str,
        code_hash: str,
        blob: bytes,
    ):
        """Stores `blob` for the given project, repo, and code hash."""

    @abstractmethod
    def get_code(
        self,
        project_id: str,
        repo_id: str,
        code_hash: str,
    ) -> Optional[bytes]:
        """Returns the blob for the given project, repo, and code hash, if any."""

    @staticmethod
    def _get_code_key(project_id: str, repo_id: str, code_hash: str) -> str:
        """Builds the storage key under which a code blob is kept."""
        return f"data/projects/{project_id}/codes/{repo_id}/{code_hash}"
|