dstack 0.19.12__py3-none-any.whl → 0.19.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of dstack might be problematic. Click here for more details.

Files changed (33)
  1. dstack/_internal/cli/services/configurators/run.py +1 -6
  2. dstack/_internal/cli/utils/run.py +5 -1
  3. dstack/_internal/core/backends/aws/compute.py +22 -9
  4. dstack/_internal/core/backends/aws/resources.py +26 -0
  5. dstack/_internal/core/backends/base/offers.py +0 -1
  6. dstack/_internal/core/compatibility/__init__.py +0 -0
  7. dstack/_internal/core/compatibility/fleets.py +72 -0
  8. dstack/_internal/core/compatibility/gateways.py +34 -0
  9. dstack/_internal/core/compatibility/runs.py +125 -0
  10. dstack/_internal/core/compatibility/volumes.py +32 -0
  11. dstack/_internal/core/models/instances.py +51 -12
  12. dstack/_internal/core/models/runs.py +41 -0
  13. dstack/_internal/server/app.py +1 -1
  14. dstack/_internal/server/services/storage/__init__.py +38 -0
  15. dstack/_internal/server/services/storage/base.py +27 -0
  16. dstack/_internal/server/services/storage/gcs.py +44 -0
  17. dstack/_internal/server/services/{storage.py → storage/s3.py} +4 -27
  18. dstack/_internal/server/settings.py +7 -3
  19. dstack/_internal/server/statics/index.html +1 -1
  20. dstack/_internal/server/statics/{main-b0e80f8e26a168c129e9.js → main-2066f1f22ddb4557bcde.js} +1615 -31
  21. dstack/_internal/server/statics/{main-b0e80f8e26a168c129e9.js.map → main-2066f1f22ddb4557bcde.js.map} +1 -1
  22. dstack/_internal/server/statics/{main-8f9c66f404e9c7e7e020.css → main-f39c418b05fe14772dd8.css} +1 -1
  23. dstack/api/server/_fleets.py +9 -73
  24. dstack/api/server/_gateways.py +3 -14
  25. dstack/api/server/_runs.py +4 -124
  26. dstack/api/server/_volumes.py +3 -14
  27. dstack/plugins/builtin/rest_plugin/_plugin.py +24 -5
  28. dstack/version.py +2 -2
  29. {dstack-0.19.12.dist-info → dstack-0.19.13.dist-info}/METADATA +1 -1
  30. {dstack-0.19.12.dist-info → dstack-0.19.13.dist-info}/RECORD +33 -25
  31. {dstack-0.19.12.dist-info → dstack-0.19.13.dist-info}/WHEEL +0 -0
  32. {dstack-0.19.12.dist-info → dstack-0.19.13.dist-info}/entry_points.txt +0 -0
  33. {dstack-0.19.12.dist-info → dstack-0.19.13.dist-info}/licenses/LICENSE.md +0 -0
@@ -166,12 +166,7 @@ class BaseRunConfigurator(ApplyEnvVarsConfiguratorMixin, BaseApplyConfigurator):
166
166
  # We can attach to run multiple times if it goes from running to pending (retried).
167
167
  while True:
168
168
  with MultiItemStatus(f"Launching [code]{run.name}[/]...", console=console) as live:
169
- while run.status in (
170
- RunStatus.SUBMITTED,
171
- RunStatus.PENDING,
172
- RunStatus.PROVISIONING,
173
- RunStatus.TERMINATING,
174
- ):
169
+ while not _is_ready_to_attach(run):
175
170
  table = get_runs_table([run])
176
171
  live.update(table)
177
172
  time.sleep(5)
@@ -166,11 +166,15 @@ def get_runs_table(
166
166
  run_row: Dict[Union[str, int], Any] = {
167
167
  "NAME": run.run_spec.run_name,
168
168
  "SUBMITTED": format_date(run.submitted_at),
169
+ "STATUS": (
170
+ run.latest_job_submission.status_message
171
+ if run.status.is_finished() and run.latest_job_submission
172
+ else run.status_message
173
+ ),
169
174
  }
170
175
  if run.error:
171
176
  run_row["ERROR"] = run.error
172
177
  if len(run.jobs) != 1:
173
- run_row["STATUS"] = run.status
174
178
  add_row_from_dict(table, run_row)
175
179
 
176
180
  for job in run.jobs:
@@ -132,7 +132,8 @@ class AWSCompute(
132
132
  availability_offers = []
133
133
  for offer in offers:
134
134
  availability = InstanceAvailability.UNKNOWN
135
- if not _has_quota(regions_to_quotas[offer.region], offer.instance.name):
135
+ quota = _has_quota(regions_to_quotas[offer.region], offer.instance.name)
136
+ if quota is not None and not quota:
136
137
  availability = InstanceAvailability.NO_QUOTA
137
138
  availability_offers.append(
138
139
  InstanceOfferWithAvailability(
@@ -231,6 +232,7 @@ class AWSCompute(
231
232
  image_id, username = aws_resources.get_image_id_and_username(
232
233
  ec2_client=ec2_client,
233
234
  cuda=len(instance_offer.instance.resources.gpus) > 0,
235
+ instance_type=instance_offer.instance.name,
234
236
  image_config=self.config.os_images,
235
237
  )
236
238
  response = ec2_resource.create_instances(
@@ -781,10 +783,18 @@ def _get_regions_to_quotas(
781
783
  ) -> Dict[str, Dict[str, int]]:
782
784
  def get_region_quotas(client: botocore.client.BaseClient) -> Dict[str, int]:
783
785
  region_quotas = {}
784
- for page in client.get_paginator("list_service_quotas").paginate(ServiceCode="ec2"):
785
- for q in page["Quotas"]:
786
- if "On-Demand" in q["QuotaName"]:
787
- region_quotas[q["UsageMetric"]["MetricDimensions"]["Class"]] = q["Value"]
786
+ try:
787
+ for page in client.get_paginator("list_service_quotas").paginate(ServiceCode="ec2"):
788
+ for q in page["Quotas"]:
789
+ if "On-Demand" in q["QuotaName"]:
790
+ region_quotas[q["UsageMetric"]["MetricDimensions"]["Class"]] = q["Value"]
791
+ except botocore.exceptions.ClientError as e:
792
+ if len(e.args) > 0 and "TooManyRequestsException" in e.args[0]:
793
+ logger.warning(
794
+ "Failed to get quotas due to rate limits. Quotas won't be accounted for."
795
+ )
796
+ else:
797
+ logger.exception(e)
788
798
  return region_quotas
789
799
 
790
800
  regions_to_quotas = {}
@@ -800,12 +810,15 @@ def _get_regions_to_quotas(
800
810
  return regions_to_quotas
801
811
 
802
812
 
803
- def _has_quota(quotas: Dict[str, int], instance_name: str) -> bool:
813
+ def _has_quota(quotas: Dict[str, int], instance_name: str) -> Optional[bool]:
814
+ quota = quotas.get("Standard/OnDemand")
804
815
  if instance_name.startswith("p"):
805
- return quotas.get("P/OnDemand", 0) > 0
816
+ quota = quotas.get("P/OnDemand")
806
817
  if instance_name.startswith("g"):
807
- return quotas.get("G/OnDemand", 0) > 0
808
- return quotas.get("Standard/OnDemand", 0) > 0
818
+ quota = quotas.get("G/OnDemand")
819
+ if quota is None:
820
+ return None
821
+ return quota > 0
809
822
 
810
823
 
811
824
  def _get_regions_to_zones(session: boto3.Session, regions: List[str]) -> Dict[str, List[str]]:
@@ -12,11 +12,13 @@ from dstack._internal.utils.logging import get_logger
12
12
  logger = get_logger(__name__)
13
13
 
14
14
  DSTACK_ACCOUNT_ID = "142421590066"
15
+ DLAMI_OWNER_ACCOUNT_ID = "898082745236"
15
16
 
16
17
 
17
18
  def get_image_id_and_username(
18
19
  ec2_client: botocore.client.BaseClient,
19
20
  cuda: bool,
21
+ instance_type: str,
20
22
  image_config: Optional[AWSOSImageConfig] = None,
21
23
  ) -> tuple[str, str]:
22
24
  if image_config is not None:
@@ -27,6 +29,11 @@ def get_image_id_and_username(
27
29
  image_name = image.name
28
30
  image_owner = image.owner
29
31
  username = image.user
32
+ elif _supported_by_dlami(instance_type):
33
+ # TODO: Update DLAMI image version from time to time
34
+ image_name = "Deep Learning Base OSS Nvidia Driver GPU AMI (Ubuntu 22.04) 20250516"
35
+ image_owner = DLAMI_OWNER_ACCOUNT_ID
36
+ username = "ubuntu"
30
37
  else:
31
38
  image_name = (
32
39
  f"dstack-{version.base_image}" if not cuda else f"dstack-cuda-{version.base_image}"
@@ -628,6 +635,25 @@ def _is_private_subnet_with_internet_egress(
628
635
  return False
629
636
 
630
637
 
638
+ def _supported_by_dlami(instance_type: str) -> bool:
639
+ # Currently only p3. instances are not supported by DLAMI among GPU instances.
640
+ return any(
641
+ instance_type.startswith(family)
642
+ for family in [
643
+ "g4dn.",
644
+ "g5.",
645
+ "g6.",
646
+ "gr6.",
647
+ "g6e.",
648
+ "p4d.",
649
+ "p4de.",
650
+ "p5.",
651
+ "p5e.",
652
+ "p6-b200.",
653
+ ]
654
+ )
655
+
656
+
631
657
  def get_reservation(
632
658
  ec2_client: botocore.client.BaseClient,
633
659
  reservation_id: str,
@@ -80,7 +80,6 @@ def catalog_item_to_offer(
80
80
  spot=item.spot,
81
81
  disk=Disk(size_mib=disk_size_mib),
82
82
  )
83
- resources.description = resources.pretty_format()
84
83
  return InstanceOffer(
85
84
  backend=backend,
86
85
  instance=InstanceType(
File without changes
@@ -0,0 +1,72 @@
1
+ from typing import Any, Dict, Optional
2
+
3
+ from dstack._internal.core.models.fleets import ApplyFleetPlanInput, FleetSpec
4
+ from dstack._internal.core.models.instances import Instance
5
+
6
+
7
+ def get_get_plan_excludes(fleet_spec: FleetSpec) -> Dict:
8
+ get_plan_excludes = {}
9
+ spec_excludes = get_fleet_spec_excludes(fleet_spec)
10
+ if spec_excludes:
11
+ get_plan_excludes["spec"] = spec_excludes
12
+ return get_plan_excludes
13
+
14
+
15
+ def get_apply_plan_excludes(plan_input: ApplyFleetPlanInput) -> Dict:
16
+ apply_plan_excludes = {}
17
+ spec_excludes = get_fleet_spec_excludes(plan_input.spec)
18
+ if spec_excludes:
19
+ apply_plan_excludes["spec"] = spec_excludes
20
+ current_resource = plan_input.current_resource
21
+ if current_resource is not None:
22
+ current_resource_excludes = {}
23
+ apply_plan_excludes["current_resource"] = current_resource_excludes
24
+ if all(map(_should_exclude_instance_cpu_arch, current_resource.instances)):
25
+ current_resource_excludes["instances"] = {
26
+ "__all__": {"instance_type": {"resources": {"cpu_arch"}}}
27
+ }
28
+ return {"plan": apply_plan_excludes}
29
+
30
+
31
+ def get_create_fleet_excludes(fleet_spec: FleetSpec) -> Dict:
32
+ create_fleet_excludes = {}
33
+ spec_excludes = get_fleet_spec_excludes(fleet_spec)
34
+ if spec_excludes:
35
+ create_fleet_excludes["spec"] = spec_excludes
36
+ return create_fleet_excludes
37
+
38
+
39
+ def get_fleet_spec_excludes(fleet_spec: FleetSpec) -> Optional[Dict]:
40
+ """
41
+ Returns `fleet_spec` exclude mapping to exclude certain fields from the request.
42
+ Use this method to exclude new fields when they are not set to keep
43
+ clients backward-compatibility with older servers.
44
+ """
45
+ spec_excludes: Dict[str, Any] = {}
46
+ configuration_excludes: Dict[str, Any] = {}
47
+ profile_excludes: set[str] = set()
48
+ profile = fleet_spec.profile
49
+ if profile.fleets is None:
50
+ profile_excludes.add("fleets")
51
+ if fleet_spec.configuration.tags is None:
52
+ configuration_excludes["tags"] = True
53
+ if profile.tags is None:
54
+ profile_excludes.add("tags")
55
+ if profile.startup_order is None:
56
+ profile_excludes.add("startup_order")
57
+ if profile.stop_criteria is None:
58
+ profile_excludes.add("stop_criteria")
59
+ if configuration_excludes:
60
+ spec_excludes["configuration"] = configuration_excludes
61
+ if profile_excludes:
62
+ spec_excludes["profile"] = profile_excludes
63
+ if spec_excludes:
64
+ return spec_excludes
65
+ return None
66
+
67
+
68
+ def _should_exclude_instance_cpu_arch(instance: Instance) -> bool:
69
+ try:
70
+ return instance.instance_type.resources.cpu_arch is None
71
+ except AttributeError:
72
+ return True
@@ -0,0 +1,34 @@
1
+ from typing import Dict
2
+
3
+ from dstack._internal.core.models.gateways import GatewayConfiguration, GatewaySpec
4
+
5
+
6
+ def get_gateway_spec_excludes(gateway_spec: GatewaySpec) -> Dict:
7
+ """
8
+ Returns `gateway_spec` exclude mapping to exclude certain fields from the request.
9
+ Use this method to exclude new fields when they are not set to keep
10
+ clients backward-compatibility with older servers.
11
+ """
12
+ spec_excludes = {}
13
+ spec_excludes["configuration"] = _get_gateway_configuration_excludes(
14
+ gateway_spec.configuration
15
+ )
16
+ return spec_excludes
17
+
18
+
19
+ def get_create_gateway_excludes(configuration: GatewayConfiguration) -> Dict:
20
+ """
21
+ Returns an exclude mapping to exclude certain fields from the create gateway request.
22
+ Use this method to exclude new fields when they are not set to keep
23
+ clients backward-compatibility with older servers.
24
+ """
25
+ create_gateway_excludes = {}
26
+ create_gateway_excludes["configuration"] = _get_gateway_configuration_excludes(configuration)
27
+ return create_gateway_excludes
28
+
29
+
30
+ def _get_gateway_configuration_excludes(configuration: GatewayConfiguration) -> Dict:
31
+ configuration_excludes = {}
32
+ if configuration.tags is None:
33
+ configuration_excludes["tags"] = True
34
+ return configuration_excludes
@@ -0,0 +1,125 @@
1
+ from typing import Any, Dict, Optional
2
+
3
+ from dstack._internal.core.models.configurations import ServiceConfiguration
4
+ from dstack._internal.core.models.runs import ApplyRunPlanInput, JobSubmission, RunSpec
5
+ from dstack._internal.server.schemas.runs import GetRunPlanRequest
6
+
7
+
8
+ def get_apply_plan_excludes(plan: ApplyRunPlanInput) -> Optional[Dict]:
9
+ """
10
+ Returns `plan` exclude mapping to exclude certain fields from the request.
11
+ Use this method to exclude new fields when they are not set to keep
12
+ clients backward-compatibility with older servers.
13
+ """
14
+ apply_plan_excludes = {}
15
+ run_spec_excludes = get_run_spec_excludes(plan.run_spec)
16
+ if run_spec_excludes is not None:
17
+ apply_plan_excludes["run_spec"] = run_spec_excludes
18
+ current_resource = plan.current_resource
19
+ if current_resource is not None:
20
+ current_resource_excludes = {}
21
+ current_resource_excludes["status_message"] = True
22
+ apply_plan_excludes["current_resource"] = current_resource_excludes
23
+ current_resource_excludes["run_spec"] = get_run_spec_excludes(current_resource.run_spec)
24
+ job_submissions_excludes = {}
25
+ current_resource_excludes["jobs"] = {
26
+ "__all__": {"job_submissions": {"__all__": job_submissions_excludes}}
27
+ }
28
+ job_submissions = [js for j in current_resource.jobs for js in j.job_submissions]
29
+ if all(map(_should_exclude_job_submission_jpd_cpu_arch, job_submissions)):
30
+ job_submissions_excludes["job_provisioning_data"] = {
31
+ "instance_type": {"resources": {"cpu_arch"}}
32
+ }
33
+ if all(map(_should_exclude_job_submission_jrd_cpu_arch, job_submissions)):
34
+ job_submissions_excludes["job_runtime_data"] = {
35
+ "offer": {"instance": {"resources": {"cpu_arch"}}}
36
+ }
37
+ if all(js.exit_status is None for js in job_submissions):
38
+ job_submissions_excludes["exit_status"] = True
39
+ latest_job_submission = current_resource.latest_job_submission
40
+ if latest_job_submission is not None:
41
+ latest_job_submission_excludes = {}
42
+ current_resource_excludes["latest_job_submission"] = latest_job_submission_excludes
43
+ if _should_exclude_job_submission_jpd_cpu_arch(latest_job_submission):
44
+ latest_job_submission_excludes["job_provisioning_data"] = {
45
+ "instance_type": {"resources": {"cpu_arch"}}
46
+ }
47
+ if _should_exclude_job_submission_jrd_cpu_arch(latest_job_submission):
48
+ latest_job_submission_excludes["job_runtime_data"] = {
49
+ "offer": {"instance": {"resources": {"cpu_arch"}}}
50
+ }
51
+ if latest_job_submission.exit_status is None:
52
+ latest_job_submission_excludes["exit_status"] = True
53
+ return {"plan": apply_plan_excludes}
54
+
55
+
56
+ def get_get_plan_excludes(request: GetRunPlanRequest) -> Optional[Dict]:
57
+ """
58
+ Excludes new fields when they are not set to keep
59
+ clients backward-compatibility with older servers.
60
+ """
61
+ get_plan_excludes = {}
62
+ run_spec_excludes = get_run_spec_excludes(request.run_spec)
63
+ if run_spec_excludes is not None:
64
+ get_plan_excludes["run_spec"] = run_spec_excludes
65
+ if request.max_offers is None:
66
+ get_plan_excludes["max_offers"] = True
67
+ return get_plan_excludes
68
+
69
+
70
+ def get_run_spec_excludes(run_spec: RunSpec) -> Optional[Dict]:
71
+ """
72
+ Returns `run_spec` exclude mapping to exclude certain fields from the request.
73
+ Use this method to exclude new fields when they are not set to keep
74
+ clients backward-compatibility with older servers.
75
+ """
76
+ spec_excludes: dict[str, Any] = {}
77
+ configuration_excludes: dict[str, Any] = {}
78
+ profile_excludes: set[str] = set()
79
+ configuration = run_spec.configuration
80
+ profile = run_spec.profile
81
+
82
+ if configuration.fleets is None:
83
+ configuration_excludes["fleets"] = True
84
+ if profile is not None and profile.fleets is None:
85
+ profile_excludes.add("fleets")
86
+ if configuration.tags is None:
87
+ configuration_excludes["tags"] = True
88
+ if profile is not None and profile.tags is None:
89
+ profile_excludes.add("tags")
90
+ if isinstance(configuration, ServiceConfiguration) and not configuration.rate_limits:
91
+ configuration_excludes["rate_limits"] = True
92
+ if configuration.shell is None:
93
+ configuration_excludes["shell"] = True
94
+ if configuration.priority is None:
95
+ configuration_excludes["priority"] = True
96
+ if configuration.startup_order is None:
97
+ configuration_excludes["startup_order"] = True
98
+ if profile is not None and profile.startup_order is None:
99
+ profile_excludes.add("startup_order")
100
+ if configuration.stop_criteria is None:
101
+ configuration_excludes["stop_criteria"] = True
102
+ if profile is not None and profile.stop_criteria is None:
103
+ profile_excludes.add("stop_criteria")
104
+
105
+ if configuration_excludes:
106
+ spec_excludes["configuration"] = configuration_excludes
107
+ if profile_excludes:
108
+ spec_excludes["profile"] = profile_excludes
109
+ if spec_excludes:
110
+ return spec_excludes
111
+ return None
112
+
113
+
114
+ def _should_exclude_job_submission_jpd_cpu_arch(job_submission: JobSubmission) -> bool:
115
+ try:
116
+ return job_submission.job_provisioning_data.instance_type.resources.cpu_arch is None
117
+ except AttributeError:
118
+ return True
119
+
120
+
121
+ def _should_exclude_job_submission_jrd_cpu_arch(job_submission: JobSubmission) -> bool:
122
+ try:
123
+ return job_submission.job_runtime_data.offer.instance.resources.cpu_arch is None
124
+ except AttributeError:
125
+ return True
@@ -0,0 +1,32 @@
1
+ from typing import Dict
2
+
3
+ from dstack._internal.core.models.volumes import VolumeConfiguration, VolumeSpec
4
+
5
+
6
+ def get_volume_spec_excludes(volume_spec: VolumeSpec) -> Dict:
7
+ """
8
+ Returns `volume_spec` exclude mapping to exclude certain fields from the request.
9
+ Use this method to exclude new fields when they are not set to keep
10
+ clients backward-compatibility with older servers.
11
+ """
12
+ spec_excludes = {}
13
+ spec_excludes["configuration"] = _get_volume_configuration_excludes(volume_spec.configuration)
14
+ return spec_excludes
15
+
16
+
17
+ def get_create_volume_excludes(configuration: VolumeConfiguration) -> Dict:
18
+ """
19
+ Returns an exclude mapping to exclude certain fields from the create volume request.
20
+ Use this method to exclude new fields when they are not set to keep
21
+ clients backward-compatibility with older servers.
22
+ """
23
+ create_volume_excludes = {}
24
+ create_volume_excludes["configuration"] = _get_volume_configuration_excludes(configuration)
25
+ return create_volume_excludes
26
+
27
+
28
+ def _get_volume_configuration_excludes(configuration: VolumeConfiguration) -> Dict:
29
+ configuration_excludes = {}
30
+ if configuration.tags is None:
31
+ configuration_excludes["tags"] = True
32
+ return configuration_excludes
@@ -48,29 +48,68 @@ class Resources(CoreModel):
48
48
  gpus: List[Gpu]
49
49
  spot: bool
50
50
  disk: Disk = Disk(size_mib=102400) # the default value (100GB) for backward compatibility
51
+ # TODO: make description a computed field after migrating to pydanticV2
51
52
  description: str = ""
52
53
  cpu_arch: Optional[gpuhunt.CPUArchitecture] = None
53
54
 
54
- def pretty_format(self, include_spot: bool = False) -> str:
55
+ @root_validator
56
+ def _description(cls, values) -> Dict:
57
+ try:
58
+ description = values["description"]
59
+ if not description:
60
+ cpus = values["cpus"]
61
+ memory_mib = values["memory_mib"]
62
+ gpus = values["gpus"]
63
+ disk_size_mib = values["disk"].size_mib
64
+ spot = values["spot"]
65
+ cpu_arch = values["cpu_arch"]
66
+ values["description"] = Resources._pretty_format(
67
+ cpus, cpu_arch, memory_mib, disk_size_mib, gpus, spot, include_spot=True
68
+ )
69
+ except KeyError:
70
+ return values
71
+ return values
72
+
73
+ @staticmethod
74
+ def _pretty_format(
75
+ cpus: int,
76
+ cpu_arch: Optional[gpuhunt.CPUArchitecture],
77
+ memory_mib: int,
78
+ disk_size_mib: int,
79
+ gpus: List[Gpu],
80
+ spot: bool,
81
+ include_spot: bool = False,
82
+ ) -> str:
55
83
  resources = {}
56
- if self.cpus > 0:
57
- resources["cpus"] = self.cpus
58
- resources["cpu_arch"] = self.cpu_arch
59
- if self.memory_mib > 0:
60
- resources["memory"] = f"{self.memory_mib / 1024:.0f}GB"
61
- if self.disk.size_mib > 0:
62
- resources["disk_size"] = f"{self.disk.size_mib / 1024:.0f}GB"
63
- if self.gpus:
64
- gpu = self.gpus[0]
84
+ if cpus > 0:
85
+ resources["cpus"] = cpus
86
+ resources["cpu_arch"] = cpu_arch
87
+ if memory_mib > 0:
88
+ resources["memory"] = f"{memory_mib / 1024:.0f}GB"
89
+ if disk_size_mib > 0:
90
+ resources["disk_size"] = f"{disk_size_mib / 1024:.0f}GB"
91
+ if gpus:
92
+ gpu = gpus[0]
65
93
  resources["gpu_name"] = gpu.name
66
- resources["gpu_count"] = len(self.gpus)
94
+ resources["gpu_count"] = len(gpus)
67
95
  if gpu.memory_mib > 0:
68
96
  resources["gpu_memory"] = f"{gpu.memory_mib / 1024:.0f}GB"
69
97
  output = pretty_resources(**resources)
70
- if include_spot and self.spot:
98
+ if include_spot and spot:
71
99
  output += " (spot)"
72
100
  return output
73
101
 
102
+ def pretty_format(self, include_spot: bool = False) -> str:
103
+ return Resources._pretty_format(
104
+ self.cpus,
105
+ self.cpu_arch,
106
+ self.memory_mib,
107
+ self.disk.size_mib,
108
+ self.gpus,
109
+ self.spot,
110
+ include_spot,
111
+ )
112
+
74
113
 
75
114
  class InstanceType(CoreModel):
76
115
  name: str
@@ -496,6 +496,7 @@ class Run(CoreModel):
496
496
  submitted_at: datetime
497
497
  last_processed_at: datetime
498
498
  status: RunStatus
499
+ status_message: Optional[str] = None
499
500
  termination_reason: Optional[RunTerminationReason]
500
501
  run_spec: RunSpec
501
502
  jobs: List[Job]
@@ -524,6 +525,46 @@ class Run(CoreModel):
524
525
  else:
525
526
  return None
526
527
 
528
+ @root_validator
529
+ def _status_message(cls, values) -> Dict:
530
+ try:
531
+ status = values["status"]
532
+ jobs: List[Job] = values["jobs"]
533
+ retry_on_events = (
534
+ jobs[0].job_spec.retry.on_events if jobs and jobs[0].job_spec.retry else []
535
+ )
536
+ termination_reason = Run.get_last_termination_reason(jobs[0]) if jobs else None
537
+ except KeyError:
538
+ return values
539
+ values["status_message"] = Run._get_status_message(
540
+ status=status,
541
+ retry_on_events=retry_on_events,
542
+ termination_reason=termination_reason,
543
+ )
544
+ return values
545
+
546
+ @staticmethod
547
+ def get_last_termination_reason(job: "Job") -> Optional[JobTerminationReason]:
548
+ for submission in reversed(job.job_submissions):
549
+ if submission.termination_reason is not None:
550
+ return submission.termination_reason
551
+ return None
552
+
553
+ @staticmethod
554
+ def _get_status_message(
555
+ status: RunStatus,
556
+ retry_on_events: List[RetryEvent],
557
+ termination_reason: Optional[JobTerminationReason],
558
+ ) -> str:
559
+ # Currently, `retrying` is shown only for `no-capacity` events
560
+ if (
561
+ status in [RunStatus.SUBMITTED, RunStatus.PENDING]
562
+ and termination_reason == JobTerminationReason.FAILED_TO_START_DUE_TO_NO_CAPACITY
563
+ and RetryEvent.NO_CAPACITY in retry_on_events
564
+ ):
565
+ return "retrying"
566
+ return status.value
567
+
527
568
 
528
569
  class JobPlan(CoreModel):
529
570
  job_spec: JobSpec
@@ -128,7 +128,7 @@ async def lifespan(app: FastAPI):
128
128
  yes=UPDATE_DEFAULT_PROJECT,
129
129
  no=DO_NOT_UPDATE_DEFAULT_PROJECT,
130
130
  )
131
- if settings.SERVER_BUCKET is not None:
131
+ if settings.SERVER_S3_BUCKET is not None or settings.SERVER_GCS_BUCKET is not None:
132
132
  init_default_storage()
133
133
  scheduler = start_background_tasks()
134
134
  dstack_version = DSTACK_VERSION if DSTACK_VERSION else "(no version)"
@@ -0,0 +1,38 @@
1
+ from typing import Optional
2
+
3
+ from dstack._internal.server import settings
4
+ from dstack._internal.server.services.storage.base import BaseStorage
5
+ from dstack._internal.server.services.storage.gcs import GCS_AVAILABLE, GCSStorage
6
+ from dstack._internal.server.services.storage.s3 import BOTO_AVAILABLE, S3Storage
7
+
8
+ _default_storage = None
9
+
10
+
11
+ def init_default_storage():
12
+ global _default_storage
13
+ if settings.SERVER_S3_BUCKET is None and settings.SERVER_GCS_BUCKET is None:
14
+ raise ValueError(
15
+ "Either settings.SERVER_S3_BUCKET or settings.SERVER_GCS_BUCKET must be set"
16
+ )
17
+ if settings.SERVER_S3_BUCKET and settings.SERVER_GCS_BUCKET:
18
+ raise ValueError(
19
+ "Only one of settings.SERVER_S3_BUCKET or settings.SERVER_GCS_BUCKET can be set"
20
+ )
21
+
22
+ if settings.SERVER_S3_BUCKET:
23
+ if not BOTO_AVAILABLE:
24
+ raise ValueError("AWS dependencies are not installed")
25
+ _default_storage = S3Storage(
26
+ bucket=settings.SERVER_S3_BUCKET,
27
+ region=settings.SERVER_S3_BUCKET_REGION,
28
+ )
29
+ elif settings.SERVER_GCS_BUCKET:
30
+ if not GCS_AVAILABLE:
31
+ raise ValueError("GCS dependencies are not installed")
32
+ _default_storage = GCSStorage(
33
+ bucket=settings.SERVER_GCS_BUCKET,
34
+ )
35
+
36
+
37
+ def get_default_storage() -> Optional[BaseStorage]:
38
+ return _default_storage
@@ -0,0 +1,27 @@
1
+ from abc import ABC, abstractmethod
2
+ from typing import Optional
3
+
4
+
5
+ class BaseStorage(ABC):
6
+ @abstractmethod
7
+ def upload_code(
8
+ self,
9
+ project_id: str,
10
+ repo_id: str,
11
+ code_hash: str,
12
+ blob: bytes,
13
+ ):
14
+ pass
15
+
16
+ @abstractmethod
17
+ def get_code(
18
+ self,
19
+ project_id: str,
20
+ repo_id: str,
21
+ code_hash: str,
22
+ ) -> Optional[bytes]:
23
+ pass
24
+
25
+ @staticmethod
26
+ def _get_code_key(project_id: str, repo_id: str, code_hash: str) -> str:
27
+ return f"data/projects/{project_id}/codes/{repo_id}/{code_hash}"