dstack 0.19.12rc1__py3-none-any.whl → 0.19.14__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dstack might be problematic. Click here for more details.
- dstack/_internal/cli/commands/attach.py +4 -4
- dstack/_internal/cli/services/configurators/run.py +44 -47
- dstack/_internal/cli/utils/run.py +31 -31
- dstack/_internal/core/backends/aws/compute.py +22 -9
- dstack/_internal/core/backends/aws/resources.py +26 -0
- dstack/_internal/core/backends/base/offers.py +0 -1
- dstack/_internal/core/backends/template/configurator.py.jinja +1 -6
- dstack/_internal/core/backends/template/models.py.jinja +4 -0
- dstack/_internal/core/compatibility/__init__.py +0 -0
- dstack/_internal/core/compatibility/fleets.py +72 -0
- dstack/_internal/core/compatibility/gateways.py +34 -0
- dstack/_internal/core/compatibility/runs.py +131 -0
- dstack/_internal/core/compatibility/volumes.py +32 -0
- dstack/_internal/core/models/configurations.py +1 -1
- dstack/_internal/core/models/fleets.py +6 -1
- dstack/_internal/core/models/instances.py +51 -12
- dstack/_internal/core/models/profiles.py +43 -3
- dstack/_internal/core/models/projects.py +1 -0
- dstack/_internal/core/models/repos/local.py +3 -3
- dstack/_internal/core/models/runs.py +139 -43
- dstack/_internal/server/app.py +46 -1
- dstack/_internal/server/background/tasks/process_running_jobs.py +92 -15
- dstack/_internal/server/background/tasks/process_runs.py +163 -80
- dstack/_internal/server/migrations/versions/35e90e1b0d3e_add_rolling_deployment_fields.py +42 -0
- dstack/_internal/server/migrations/versions/35f732ee4cf5_add_projectmodel_is_public.py +39 -0
- dstack/_internal/server/models.py +4 -0
- dstack/_internal/server/routers/projects.py +4 -3
- dstack/_internal/server/routers/prometheus.py +4 -1
- dstack/_internal/server/schemas/projects.py +1 -0
- dstack/_internal/server/security/permissions.py +36 -0
- dstack/_internal/server/services/jobs/__init__.py +1 -0
- dstack/_internal/server/services/jobs/configurators/base.py +11 -7
- dstack/_internal/server/services/projects.py +54 -1
- dstack/_internal/server/services/runner/client.py +4 -1
- dstack/_internal/server/services/runs.py +49 -29
- dstack/_internal/server/services/services/__init__.py +19 -0
- dstack/_internal/server/services/services/autoscalers.py +37 -26
- dstack/_internal/server/services/storage/__init__.py +38 -0
- dstack/_internal/server/services/storage/base.py +27 -0
- dstack/_internal/server/services/storage/gcs.py +44 -0
- dstack/_internal/server/services/{storage.py → storage/s3.py} +4 -27
- dstack/_internal/server/settings.py +7 -3
- dstack/_internal/server/statics/index.html +1 -1
- dstack/_internal/server/statics/{main-5b9786c955b42bf93581.js → main-0ac1e1583684417ae4d1.js} +1695 -62
- dstack/_internal/server/statics/{main-5b9786c955b42bf93581.js.map → main-0ac1e1583684417ae4d1.js.map} +1 -1
- dstack/_internal/server/statics/{main-8f9c66f404e9c7e7e020.css → main-f39c418b05fe14772dd8.css} +1 -1
- dstack/_internal/server/testing/common.py +11 -1
- dstack/_internal/settings.py +3 -0
- dstack/_internal/utils/common.py +4 -0
- dstack/api/_public/runs.py +14 -5
- dstack/api/server/_fleets.py +9 -69
- dstack/api/server/_gateways.py +3 -14
- dstack/api/server/_projects.py +2 -2
- dstack/api/server/_runs.py +4 -116
- dstack/api/server/_volumes.py +3 -14
- dstack/plugins/builtin/rest_plugin/_plugin.py +24 -5
- dstack/version.py +2 -2
- {dstack-0.19.12rc1.dist-info → dstack-0.19.14.dist-info}/METADATA +1 -1
- {dstack-0.19.12rc1.dist-info → dstack-0.19.14.dist-info}/RECORD +62 -52
- {dstack-0.19.12rc1.dist-info → dstack-0.19.14.dist-info}/WHEEL +0 -0
- {dstack-0.19.12rc1.dist-info → dstack-0.19.14.dist-info}/entry_points.txt +0 -0
- {dstack-0.19.12rc1.dist-info → dstack-0.19.14.dist-info}/licenses/LICENSE.md +0 -0
|
@@ -52,9 +52,8 @@ class AttachCommand(APIBaseCommand):
|
|
|
52
52
|
)
|
|
53
53
|
self._parser.add_argument(
|
|
54
54
|
"--replica",
|
|
55
|
-
help="The replica number. Defaults to
|
|
55
|
+
help="The replica number. Defaults to any running replica.",
|
|
56
56
|
type=int,
|
|
57
|
-
default=0,
|
|
58
57
|
)
|
|
59
58
|
self._parser.add_argument(
|
|
60
59
|
"--job",
|
|
@@ -129,14 +128,15 @@ _IGNORED_PORTS = [DSTACK_RUNNER_HTTP_PORT]
|
|
|
129
128
|
def _print_attached_message(
|
|
130
129
|
run: Run,
|
|
131
130
|
bind_address: Optional[str],
|
|
132
|
-
replica_num: int,
|
|
131
|
+
replica_num: Optional[int],
|
|
133
132
|
job_num: int,
|
|
134
133
|
):
|
|
135
134
|
if bind_address is None:
|
|
136
135
|
bind_address = "localhost"
|
|
137
136
|
|
|
138
|
-
output = f"Attached to run [code]{run.name}[/] (replica={replica_num} job={job_num})\n"
|
|
139
137
|
job = get_or_error(run._find_job(replica_num=replica_num, job_num=job_num))
|
|
138
|
+
replica_num = job.job_spec.replica_num
|
|
139
|
+
output = f"Attached to run [code]{run.name}[/] (replica={replica_num} job={job_num})\n"
|
|
140
140
|
name = run.name
|
|
141
141
|
if replica_num != 0 or job_num != 0:
|
|
142
142
|
name = job.job_spec.job_name
|
|
@@ -3,7 +3,7 @@ import subprocess
|
|
|
3
3
|
import sys
|
|
4
4
|
import time
|
|
5
5
|
from pathlib import Path
|
|
6
|
-
from typing import Dict, List, Optional, Set
|
|
6
|
+
from typing import Dict, List, Optional, Set
|
|
7
7
|
|
|
8
8
|
import gpuhunt
|
|
9
9
|
from pydantic import parse_obj_as
|
|
@@ -41,7 +41,7 @@ from dstack._internal.core.models.configurations import (
|
|
|
41
41
|
)
|
|
42
42
|
from dstack._internal.core.models.repos.base import Repo
|
|
43
43
|
from dstack._internal.core.models.resources import CPUSpec
|
|
44
|
-
from dstack._internal.core.models.runs import
|
|
44
|
+
from dstack._internal.core.models.runs import JobStatus, JobSubmission, RunStatus
|
|
45
45
|
from dstack._internal.core.services.configs import ConfigManager
|
|
46
46
|
from dstack._internal.core.services.diff import diff_models
|
|
47
47
|
from dstack._internal.utils.common import local_time
|
|
@@ -166,12 +166,7 @@ class BaseRunConfigurator(ApplyEnvVarsConfiguratorMixin, BaseApplyConfigurator):
|
|
|
166
166
|
# We can attach to run multiple times if it goes from running to pending (retried).
|
|
167
167
|
while True:
|
|
168
168
|
with MultiItemStatus(f"Launching [code]{run.name}[/]...", console=console) as live:
|
|
169
|
-
while
|
|
170
|
-
RunStatus.SUBMITTED,
|
|
171
|
-
RunStatus.PENDING,
|
|
172
|
-
RunStatus.PROVISIONING,
|
|
173
|
-
RunStatus.TERMINATING,
|
|
174
|
-
):
|
|
169
|
+
while not _is_ready_to_attach(run):
|
|
175
170
|
table = get_runs_table([run])
|
|
176
171
|
live.update(table)
|
|
177
172
|
time.sleep(5)
|
|
@@ -553,35 +548,38 @@ def _print_service_urls(run: Run) -> None:
|
|
|
553
548
|
|
|
554
549
|
|
|
555
550
|
def print_finished_message(run: Run):
|
|
551
|
+
status_message = (
|
|
552
|
+
run._run.latest_job_submission.status_message
|
|
553
|
+
if run._run.latest_job_submission
|
|
554
|
+
else run._run.status_message
|
|
555
|
+
)
|
|
556
|
+
error = (
|
|
557
|
+
run._run.latest_job_submission.error if run._run.latest_job_submission else run._run.error
|
|
558
|
+
)
|
|
559
|
+
termination_reason = (
|
|
560
|
+
run._run.latest_job_submission.termination_reason
|
|
561
|
+
if run._run.latest_job_submission
|
|
562
|
+
else None
|
|
563
|
+
)
|
|
564
|
+
termination_reason_message = (
|
|
565
|
+
run._run.latest_job_submission.termination_reason_message
|
|
566
|
+
if run._run.latest_job_submission
|
|
567
|
+
else None
|
|
568
|
+
)
|
|
556
569
|
if run.status == RunStatus.DONE:
|
|
557
|
-
console.print("[code]
|
|
570
|
+
console.print(f"[code]{status_message.capitalize()}[/code]")
|
|
558
571
|
return
|
|
572
|
+
else:
|
|
573
|
+
str = f"[error]{status_message.capitalize()}[/error]"
|
|
574
|
+
if error:
|
|
575
|
+
str += f" ([error]{error.capitalize()}[/error])"
|
|
576
|
+
console.print(str)
|
|
559
577
|
|
|
560
|
-
|
|
561
|
-
|
|
562
|
-
|
|
563
|
-
|
|
564
|
-
|
|
565
|
-
message = "Run terminated due to unknown reason. Check CLI, server, and run logs."
|
|
566
|
-
|
|
567
|
-
if termination_reason == JobTerminationReason.FAILED_TO_START_DUE_TO_NO_CAPACITY:
|
|
568
|
-
message = (
|
|
569
|
-
"All provisioning attempts failed. "
|
|
570
|
-
"This is likely due to cloud providers not having enough capacity. "
|
|
571
|
-
"Check CLI and server logs for more details."
|
|
572
|
-
)
|
|
573
|
-
elif termination_reason is not None:
|
|
574
|
-
exit_status_details = f"Exit status: {exit_status}.\n" if exit_status else ""
|
|
575
|
-
error_details = (
|
|
576
|
-
f"Error: {termination_reason_message}\n" if termination_reason_message else ""
|
|
577
|
-
)
|
|
578
|
-
message = (
|
|
579
|
-
f"Run failed with error code {termination_reason.name}.\n"
|
|
580
|
-
f"{exit_status_details}"
|
|
581
|
-
f"{error_details}"
|
|
582
|
-
f"Check [bold]dstack logs -d {run.name}[/bold] for more details."
|
|
583
|
-
)
|
|
584
|
-
console.print(f"[error]{message}[/]")
|
|
578
|
+
if termination_reason_message:
|
|
579
|
+
console.print(f"[error]{termination_reason_message}[/error]")
|
|
580
|
+
|
|
581
|
+
if termination_reason:
|
|
582
|
+
console.print(f"Check [code]dstack logs -d {run.name}[/code] for more details.")
|
|
585
583
|
|
|
586
584
|
|
|
587
585
|
def get_run_exit_code(run: Run) -> int:
|
|
@@ -590,19 +588,18 @@ def get_run_exit_code(run: Run) -> int:
|
|
|
590
588
|
return 1
|
|
591
589
|
|
|
592
590
|
|
|
593
|
-
def
|
|
594
|
-
|
|
595
|
-
|
|
596
|
-
|
|
597
|
-
|
|
598
|
-
|
|
599
|
-
|
|
600
|
-
|
|
601
|
-
|
|
602
|
-
|
|
603
|
-
|
|
604
|
-
|
|
605
|
-
job_submission.exit_status,
|
|
591
|
+
def _is_ready_to_attach(run: Run) -> bool:
|
|
592
|
+
return not (
|
|
593
|
+
run.status
|
|
594
|
+
in [
|
|
595
|
+
RunStatus.SUBMITTED,
|
|
596
|
+
RunStatus.PENDING,
|
|
597
|
+
RunStatus.PROVISIONING,
|
|
598
|
+
RunStatus.TERMINATING,
|
|
599
|
+
]
|
|
600
|
+
or run._run.jobs[0].job_submissions[-1].status
|
|
601
|
+
in [JobStatus.SUBMITTED, JobStatus.PROVISIONING, JobStatus.PULLING]
|
|
602
|
+
or run._run.is_deployment_in_progress()
|
|
606
603
|
)
|
|
607
604
|
|
|
608
605
|
|
|
@@ -12,7 +12,6 @@ from dstack._internal.core.models.profiles import (
|
|
|
12
12
|
TerminationPolicy,
|
|
13
13
|
)
|
|
14
14
|
from dstack._internal.core.models.runs import (
|
|
15
|
-
Job,
|
|
16
15
|
RunPlan,
|
|
17
16
|
)
|
|
18
17
|
from dstack._internal.core.services.profiles import get_termination
|
|
@@ -154,8 +153,7 @@ def get_runs_table(
|
|
|
154
153
|
table.add_column("BACKEND", style="grey58", ratio=2)
|
|
155
154
|
table.add_column("RESOURCES", ratio=3 if not verbose else 2)
|
|
156
155
|
if verbose:
|
|
157
|
-
table.add_column("INSTANCE", no_wrap=True, ratio=1)
|
|
158
|
-
table.add_column("RESERVATION", no_wrap=True, ratio=1)
|
|
156
|
+
table.add_column("INSTANCE TYPE", no_wrap=True, ratio=1)
|
|
159
157
|
table.add_column("PRICE", style="grey58", ratio=1)
|
|
160
158
|
table.add_column("STATUS", no_wrap=True, ratio=1)
|
|
161
159
|
table.add_column("SUBMITTED", style="grey58", no_wrap=True, ratio=1)
|
|
@@ -163,16 +161,27 @@ def get_runs_table(
|
|
|
163
161
|
table.add_column("ERROR", no_wrap=True, ratio=2)
|
|
164
162
|
|
|
165
163
|
for run in runs:
|
|
166
|
-
run_error = _get_run_error(run)
|
|
167
164
|
run = run._run # TODO(egor-s): make public attribute
|
|
165
|
+
show_deployment_num = (
|
|
166
|
+
verbose
|
|
167
|
+
and run.run_spec.configuration.type == "service"
|
|
168
|
+
or run.is_deployment_in_progress()
|
|
169
|
+
)
|
|
170
|
+
merge_job_rows = len(run.jobs) == 1 and not show_deployment_num
|
|
168
171
|
|
|
169
172
|
run_row: Dict[Union[str, int], Any] = {
|
|
170
|
-
"NAME": run.run_spec.run_name
|
|
173
|
+
"NAME": run.run_spec.run_name
|
|
174
|
+
+ (f" [secondary]deployment={run.deployment_num}[/]" if show_deployment_num else ""),
|
|
171
175
|
"SUBMITTED": format_date(run.submitted_at),
|
|
172
|
-
"
|
|
176
|
+
"STATUS": (
|
|
177
|
+
run.latest_job_submission.status_message
|
|
178
|
+
if run.status.is_finished() and run.latest_job_submission
|
|
179
|
+
else run.status_message
|
|
180
|
+
),
|
|
173
181
|
}
|
|
174
|
-
if
|
|
175
|
-
run_row["
|
|
182
|
+
if run.error:
|
|
183
|
+
run_row["ERROR"] = run.error
|
|
184
|
+
if not merge_job_rows:
|
|
176
185
|
add_row_from_dict(table, run_row)
|
|
177
186
|
|
|
178
187
|
for job in run.jobs:
|
|
@@ -182,47 +191,38 @@ def get_runs_table(
|
|
|
182
191
|
inactive_for = format_duration_multiunit(latest_job_submission.inactivity_secs)
|
|
183
192
|
status += f" (inactive for {inactive_for})"
|
|
184
193
|
job_row: Dict[Union[str, int], Any] = {
|
|
185
|
-
"NAME": f" replica={job.job_spec.replica_num} job={job.job_spec.job_num}"
|
|
186
|
-
|
|
194
|
+
"NAME": f" replica={job.job_spec.replica_num} job={job.job_spec.job_num}"
|
|
195
|
+
+ (
|
|
196
|
+
f" deployment={latest_job_submission.deployment_num}"
|
|
197
|
+
if show_deployment_num
|
|
198
|
+
else ""
|
|
199
|
+
),
|
|
200
|
+
"STATUS": latest_job_submission.status_message,
|
|
187
201
|
"SUBMITTED": format_date(latest_job_submission.submitted_at),
|
|
188
|
-
"ERROR":
|
|
202
|
+
"ERROR": latest_job_submission.error,
|
|
189
203
|
}
|
|
190
204
|
jpd = latest_job_submission.job_provisioning_data
|
|
191
205
|
if jpd is not None:
|
|
192
206
|
resources = jpd.instance_type.resources
|
|
193
|
-
|
|
207
|
+
instance_type = jpd.instance_type.name
|
|
194
208
|
jrd = latest_job_submission.job_runtime_data
|
|
195
209
|
if jrd is not None and jrd.offer is not None:
|
|
196
210
|
resources = jrd.offer.instance.resources
|
|
197
211
|
if jrd.offer.total_blocks > 1:
|
|
198
|
-
|
|
212
|
+
instance_type += f" ({jrd.offer.blocks}/{jrd.offer.total_blocks})"
|
|
213
|
+
if jpd.reservation:
|
|
214
|
+
instance_type += f" ({jpd.reservation})"
|
|
199
215
|
job_row.update(
|
|
200
216
|
{
|
|
201
217
|
"BACKEND": f"{jpd.backend.value.replace('remote', 'ssh')} ({jpd.region})",
|
|
202
218
|
"RESOURCES": resources.pretty_format(include_spot=True),
|
|
203
|
-
"INSTANCE":
|
|
204
|
-
"RESERVATION": jpd.reservation,
|
|
219
|
+
"INSTANCE TYPE": instance_type,
|
|
205
220
|
"PRICE": f"${jpd.price:.4f}".rstrip("0").rstrip("."),
|
|
206
221
|
}
|
|
207
222
|
)
|
|
208
|
-
if
|
|
223
|
+
if merge_job_rows:
|
|
209
224
|
# merge rows
|
|
210
225
|
job_row.update(run_row)
|
|
211
226
|
add_row_from_dict(table, job_row, style="secondary" if len(run.jobs) != 1 else None)
|
|
212
227
|
|
|
213
228
|
return table
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
def _get_run_error(run: Run) -> str:
|
|
217
|
-
return run._run.error or ""
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
def _get_job_error(job: Job) -> str:
|
|
221
|
-
job_submission = job.job_submissions[-1]
|
|
222
|
-
termination_reason = job_submission.termination_reason
|
|
223
|
-
exit_status = job_submission.exit_status
|
|
224
|
-
if termination_reason is None:
|
|
225
|
-
return ""
|
|
226
|
-
if exit_status:
|
|
227
|
-
return f"{termination_reason.name} {exit_status}"
|
|
228
|
-
return termination_reason.name
|
|
@@ -132,7 +132,8 @@ class AWSCompute(
|
|
|
132
132
|
availability_offers = []
|
|
133
133
|
for offer in offers:
|
|
134
134
|
availability = InstanceAvailability.UNKNOWN
|
|
135
|
-
|
|
135
|
+
quota = _has_quota(regions_to_quotas[offer.region], offer.instance.name)
|
|
136
|
+
if quota is not None and not quota:
|
|
136
137
|
availability = InstanceAvailability.NO_QUOTA
|
|
137
138
|
availability_offers.append(
|
|
138
139
|
InstanceOfferWithAvailability(
|
|
@@ -231,6 +232,7 @@ class AWSCompute(
|
|
|
231
232
|
image_id, username = aws_resources.get_image_id_and_username(
|
|
232
233
|
ec2_client=ec2_client,
|
|
233
234
|
cuda=len(instance_offer.instance.resources.gpus) > 0,
|
|
235
|
+
instance_type=instance_offer.instance.name,
|
|
234
236
|
image_config=self.config.os_images,
|
|
235
237
|
)
|
|
236
238
|
response = ec2_resource.create_instances(
|
|
@@ -781,10 +783,18 @@ def _get_regions_to_quotas(
|
|
|
781
783
|
) -> Dict[str, Dict[str, int]]:
|
|
782
784
|
def get_region_quotas(client: botocore.client.BaseClient) -> Dict[str, int]:
|
|
783
785
|
region_quotas = {}
|
|
784
|
-
|
|
785
|
-
for
|
|
786
|
-
|
|
787
|
-
|
|
786
|
+
try:
|
|
787
|
+
for page in client.get_paginator("list_service_quotas").paginate(ServiceCode="ec2"):
|
|
788
|
+
for q in page["Quotas"]:
|
|
789
|
+
if "On-Demand" in q["QuotaName"]:
|
|
790
|
+
region_quotas[q["UsageMetric"]["MetricDimensions"]["Class"]] = q["Value"]
|
|
791
|
+
except botocore.exceptions.ClientError as e:
|
|
792
|
+
if len(e.args) > 0 and "TooManyRequestsException" in e.args[0]:
|
|
793
|
+
logger.warning(
|
|
794
|
+
"Failed to get quotas due to rate limits. Quotas won't be accounted for."
|
|
795
|
+
)
|
|
796
|
+
else:
|
|
797
|
+
logger.exception(e)
|
|
788
798
|
return region_quotas
|
|
789
799
|
|
|
790
800
|
regions_to_quotas = {}
|
|
@@ -800,12 +810,15 @@ def _get_regions_to_quotas(
|
|
|
800
810
|
return regions_to_quotas
|
|
801
811
|
|
|
802
812
|
|
|
803
|
-
def _has_quota(quotas: Dict[str, int], instance_name: str) -> bool:
|
|
813
|
+
def _has_quota(quotas: Dict[str, int], instance_name: str) -> Optional[bool]:
|
|
814
|
+
quota = quotas.get("Standard/OnDemand")
|
|
804
815
|
if instance_name.startswith("p"):
|
|
805
|
-
|
|
816
|
+
quota = quotas.get("P/OnDemand")
|
|
806
817
|
if instance_name.startswith("g"):
|
|
807
|
-
|
|
808
|
-
|
|
818
|
+
quota = quotas.get("G/OnDemand")
|
|
819
|
+
if quota is None:
|
|
820
|
+
return None
|
|
821
|
+
return quota > 0
|
|
809
822
|
|
|
810
823
|
|
|
811
824
|
def _get_regions_to_zones(session: boto3.Session, regions: List[str]) -> Dict[str, List[str]]:
|
|
@@ -12,11 +12,13 @@ from dstack._internal.utils.logging import get_logger
|
|
|
12
12
|
logger = get_logger(__name__)
|
|
13
13
|
|
|
14
14
|
DSTACK_ACCOUNT_ID = "142421590066"
|
|
15
|
+
DLAMI_OWNER_ACCOUNT_ID = "898082745236"
|
|
15
16
|
|
|
16
17
|
|
|
17
18
|
def get_image_id_and_username(
|
|
18
19
|
ec2_client: botocore.client.BaseClient,
|
|
19
20
|
cuda: bool,
|
|
21
|
+
instance_type: str,
|
|
20
22
|
image_config: Optional[AWSOSImageConfig] = None,
|
|
21
23
|
) -> tuple[str, str]:
|
|
22
24
|
if image_config is not None:
|
|
@@ -27,6 +29,11 @@ def get_image_id_and_username(
|
|
|
27
29
|
image_name = image.name
|
|
28
30
|
image_owner = image.owner
|
|
29
31
|
username = image.user
|
|
32
|
+
elif _supported_by_dlami(instance_type):
|
|
33
|
+
# TODO: Update DLAMI image version from time to time
|
|
34
|
+
image_name = "Deep Learning Base OSS Nvidia Driver GPU AMI (Ubuntu 22.04) 20250516"
|
|
35
|
+
image_owner = DLAMI_OWNER_ACCOUNT_ID
|
|
36
|
+
username = "ubuntu"
|
|
30
37
|
else:
|
|
31
38
|
image_name = (
|
|
32
39
|
f"dstack-{version.base_image}" if not cuda else f"dstack-cuda-{version.base_image}"
|
|
@@ -628,6 +635,25 @@ def _is_private_subnet_with_internet_egress(
|
|
|
628
635
|
return False
|
|
629
636
|
|
|
630
637
|
|
|
638
|
+
def _supported_by_dlami(instance_type: str) -> bool:
|
|
639
|
+
# Currently only p3. instances are not supported by DLAMI among GPU instances.
|
|
640
|
+
return any(
|
|
641
|
+
instance_type.startswith(family)
|
|
642
|
+
for family in [
|
|
643
|
+
"g4dn.",
|
|
644
|
+
"g5.",
|
|
645
|
+
"g6.",
|
|
646
|
+
"gr6.",
|
|
647
|
+
"g6e.",
|
|
648
|
+
"p4d.",
|
|
649
|
+
"p4de.",
|
|
650
|
+
"p5.",
|
|
651
|
+
"p5e.",
|
|
652
|
+
"p6-b200.",
|
|
653
|
+
]
|
|
654
|
+
)
|
|
655
|
+
|
|
656
|
+
|
|
631
657
|
def get_reservation(
|
|
632
658
|
ec2_client: botocore.client.BaseClient,
|
|
633
659
|
reservation_id: str,
|
|
@@ -19,9 +19,6 @@ from dstack._internal.core.models.backends.base import (
|
|
|
19
19
|
BackendType,
|
|
20
20
|
)
|
|
21
21
|
|
|
22
|
-
# TODO: Add all supported regions and default regions
|
|
23
|
-
REGIONS = []
|
|
24
|
-
|
|
25
22
|
|
|
26
23
|
class {{ backend_name }}Configurator(Configurator):
|
|
27
24
|
TYPE = BackendType.{{ backend_name|upper }}
|
|
@@ -31,13 +28,11 @@ class {{ backend_name }}Configurator(Configurator):
|
|
|
31
28
|
self, config: {{ backend_name }}BackendConfigWithCreds, default_creds_enabled: bool
|
|
32
29
|
):
|
|
33
30
|
self._validate_creds(config.creds)
|
|
34
|
-
# TODO:
|
|
31
|
+
# TODO: If possible, validate config.regions and any other config parameters
|
|
35
32
|
|
|
36
33
|
def create_backend(
|
|
37
34
|
self, project_name: str, config: {{ backend_name }}BackendConfigWithCreds
|
|
38
35
|
) -> BackendRecord:
|
|
39
|
-
if config.regions is None:
|
|
40
|
-
config.regions = REGIONS
|
|
41
36
|
return BackendRecord(
|
|
42
37
|
config={{ backend_name }}StoredConfig(
|
|
43
38
|
**{{ backend_name }}BackendConfig.__response__.parse_obj(config).dict()
|
|
@@ -22,6 +22,7 @@ class {{ backend_name }}BackendConfig(CoreModel):
|
|
|
22
22
|
It also serves as a base class for other backend config models.
|
|
23
23
|
Should not include creds.
|
|
24
24
|
"""
|
|
25
|
+
|
|
25
26
|
type: Annotated[
|
|
26
27
|
Literal["{{ backend_name|lower }}"],
|
|
27
28
|
Field(description="The type of backend"),
|
|
@@ -37,6 +38,7 @@ class {{ backend_name }}BackendConfigWithCreds({{ backend_name }}BackendConfig):
|
|
|
37
38
|
"""
|
|
38
39
|
Same as `{{ backend_name }}BackendConfig` but also includes creds.
|
|
39
40
|
"""
|
|
41
|
+
|
|
40
42
|
creds: Annotated[Any{{ backend_name }}Creds, Field(description="The credentials")]
|
|
41
43
|
|
|
42
44
|
|
|
@@ -48,6 +50,7 @@ class {{ backend_name }}StoredConfig({{ backend_name }}BackendConfig):
|
|
|
48
50
|
The backend config used for config parameters in the DB.
|
|
49
51
|
Can extend `{{ backend_name }}BackendConfig` with additional parameters.
|
|
50
52
|
"""
|
|
53
|
+
|
|
51
54
|
pass
|
|
52
55
|
|
|
53
56
|
|
|
@@ -55,4 +58,5 @@ class {{ backend_name }}Config({{ backend_name }}StoredConfig):
|
|
|
55
58
|
"""
|
|
56
59
|
The backend config used by `{{ backend_name }}Backend` and `{{ backend_name }}Compute`.
|
|
57
60
|
"""
|
|
61
|
+
|
|
58
62
|
creds: Any{{ backend_name }}Creds
|
|
File without changes
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
from typing import Any, Dict, Optional
|
|
2
|
+
|
|
3
|
+
from dstack._internal.core.models.fleets import ApplyFleetPlanInput, FleetSpec
|
|
4
|
+
from dstack._internal.core.models.instances import Instance
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def get_get_plan_excludes(fleet_spec: FleetSpec) -> Dict:
|
|
8
|
+
get_plan_excludes = {}
|
|
9
|
+
spec_excludes = get_fleet_spec_excludes(fleet_spec)
|
|
10
|
+
if spec_excludes:
|
|
11
|
+
get_plan_excludes["spec"] = spec_excludes
|
|
12
|
+
return get_plan_excludes
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def get_apply_plan_excludes(plan_input: ApplyFleetPlanInput) -> Dict:
|
|
16
|
+
apply_plan_excludes = {}
|
|
17
|
+
spec_excludes = get_fleet_spec_excludes(plan_input.spec)
|
|
18
|
+
if spec_excludes:
|
|
19
|
+
apply_plan_excludes["spec"] = spec_excludes
|
|
20
|
+
current_resource = plan_input.current_resource
|
|
21
|
+
if current_resource is not None:
|
|
22
|
+
current_resource_excludes = {}
|
|
23
|
+
apply_plan_excludes["current_resource"] = current_resource_excludes
|
|
24
|
+
if all(map(_should_exclude_instance_cpu_arch, current_resource.instances)):
|
|
25
|
+
current_resource_excludes["instances"] = {
|
|
26
|
+
"__all__": {"instance_type": {"resources": {"cpu_arch"}}}
|
|
27
|
+
}
|
|
28
|
+
return {"plan": apply_plan_excludes}
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def get_create_fleet_excludes(fleet_spec: FleetSpec) -> Dict:
|
|
32
|
+
create_fleet_excludes = {}
|
|
33
|
+
spec_excludes = get_fleet_spec_excludes(fleet_spec)
|
|
34
|
+
if spec_excludes:
|
|
35
|
+
create_fleet_excludes["spec"] = spec_excludes
|
|
36
|
+
return create_fleet_excludes
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def get_fleet_spec_excludes(fleet_spec: FleetSpec) -> Optional[Dict]:
|
|
40
|
+
"""
|
|
41
|
+
Returns `fleet_spec` exclude mapping to exclude certain fields from the request.
|
|
42
|
+
Use this method to exclude new fields when they are not set to keep
|
|
43
|
+
clients backward-compatibility with older servers.
|
|
44
|
+
"""
|
|
45
|
+
spec_excludes: Dict[str, Any] = {}
|
|
46
|
+
configuration_excludes: Dict[str, Any] = {}
|
|
47
|
+
profile_excludes: set[str] = set()
|
|
48
|
+
profile = fleet_spec.profile
|
|
49
|
+
if profile.fleets is None:
|
|
50
|
+
profile_excludes.add("fleets")
|
|
51
|
+
if fleet_spec.configuration.tags is None:
|
|
52
|
+
configuration_excludes["tags"] = True
|
|
53
|
+
if profile.tags is None:
|
|
54
|
+
profile_excludes.add("tags")
|
|
55
|
+
if profile.startup_order is None:
|
|
56
|
+
profile_excludes.add("startup_order")
|
|
57
|
+
if profile.stop_criteria is None:
|
|
58
|
+
profile_excludes.add("stop_criteria")
|
|
59
|
+
if configuration_excludes:
|
|
60
|
+
spec_excludes["configuration"] = configuration_excludes
|
|
61
|
+
if profile_excludes:
|
|
62
|
+
spec_excludes["profile"] = profile_excludes
|
|
63
|
+
if spec_excludes:
|
|
64
|
+
return spec_excludes
|
|
65
|
+
return None
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def _should_exclude_instance_cpu_arch(instance: Instance) -> bool:
|
|
69
|
+
try:
|
|
70
|
+
return instance.instance_type.resources.cpu_arch is None
|
|
71
|
+
except AttributeError:
|
|
72
|
+
return True
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
from typing import Dict
|
|
2
|
+
|
|
3
|
+
from dstack._internal.core.models.gateways import GatewayConfiguration, GatewaySpec
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def get_gateway_spec_excludes(gateway_spec: GatewaySpec) -> Dict:
|
|
7
|
+
"""
|
|
8
|
+
Returns `gateway_spec` exclude mapping to exclude certain fields from the request.
|
|
9
|
+
Use this method to exclude new fields when they are not set to keep
|
|
10
|
+
clients backward-compatibility with older servers.
|
|
11
|
+
"""
|
|
12
|
+
spec_excludes = {}
|
|
13
|
+
spec_excludes["configuration"] = _get_gateway_configuration_excludes(
|
|
14
|
+
gateway_spec.configuration
|
|
15
|
+
)
|
|
16
|
+
return spec_excludes
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def get_create_gateway_excludes(configuration: GatewayConfiguration) -> Dict:
|
|
20
|
+
"""
|
|
21
|
+
Returns an exclude mapping to exclude certain fields from the create gateway request.
|
|
22
|
+
Use this method to exclude new fields when they are not set to keep
|
|
23
|
+
clients backward-compatibility with older servers.
|
|
24
|
+
"""
|
|
25
|
+
create_gateway_excludes = {}
|
|
26
|
+
create_gateway_excludes["configuration"] = _get_gateway_configuration_excludes(configuration)
|
|
27
|
+
return create_gateway_excludes
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def _get_gateway_configuration_excludes(configuration: GatewayConfiguration) -> Dict:
|
|
31
|
+
configuration_excludes = {}
|
|
32
|
+
if configuration.tags is None:
|
|
33
|
+
configuration_excludes["tags"] = True
|
|
34
|
+
return configuration_excludes
|