dstack 0.19.26__py3-none-any.whl → 0.19.27__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dstack/_internal/cli/commands/init.py +2 -2
- dstack/_internal/cli/services/configurators/run.py +114 -16
- dstack/_internal/cli/services/repos.py +1 -18
- dstack/_internal/core/backends/amddevcloud/__init__.py +1 -0
- dstack/_internal/core/backends/amddevcloud/backend.py +16 -0
- dstack/_internal/core/backends/amddevcloud/compute.py +5 -0
- dstack/_internal/core/backends/amddevcloud/configurator.py +29 -0
- dstack/_internal/core/backends/aws/compute.py +6 -1
- dstack/_internal/core/backends/base/compute.py +33 -5
- dstack/_internal/core/backends/base/offers.py +2 -0
- dstack/_internal/core/backends/configurators.py +15 -0
- dstack/_internal/core/backends/digitalocean/__init__.py +1 -0
- dstack/_internal/core/backends/digitalocean/backend.py +16 -0
- dstack/_internal/core/backends/digitalocean/compute.py +5 -0
- dstack/_internal/core/backends/digitalocean/configurator.py +31 -0
- dstack/_internal/core/backends/digitalocean_base/__init__.py +1 -0
- dstack/_internal/core/backends/digitalocean_base/api_client.py +104 -0
- dstack/_internal/core/backends/digitalocean_base/backend.py +5 -0
- dstack/_internal/core/backends/digitalocean_base/compute.py +173 -0
- dstack/_internal/core/backends/digitalocean_base/configurator.py +57 -0
- dstack/_internal/core/backends/digitalocean_base/models.py +43 -0
- dstack/_internal/core/backends/gcp/compute.py +32 -8
- dstack/_internal/core/backends/hotaisle/api_client.py +25 -33
- dstack/_internal/core/backends/hotaisle/compute.py +1 -6
- dstack/_internal/core/backends/models.py +7 -0
- dstack/_internal/core/backends/nebius/compute.py +0 -7
- dstack/_internal/core/backends/oci/compute.py +4 -5
- dstack/_internal/core/backends/vultr/compute.py +1 -5
- dstack/_internal/core/compatibility/fleets.py +5 -0
- dstack/_internal/core/compatibility/runs.py +8 -1
- dstack/_internal/core/models/backends/base.py +5 -1
- dstack/_internal/core/models/configurations.py +21 -7
- dstack/_internal/core/models/files.py +1 -1
- dstack/_internal/core/models/fleets.py +75 -2
- dstack/_internal/core/models/runs.py +24 -5
- dstack/_internal/core/services/repos.py +85 -80
- dstack/_internal/server/background/tasks/process_fleets.py +109 -13
- dstack/_internal/server/background/tasks/process_instances.py +12 -71
- dstack/_internal/server/background/tasks/process_running_jobs.py +2 -0
- dstack/_internal/server/background/tasks/process_runs.py +2 -0
- dstack/_internal/server/background/tasks/process_submitted_jobs.py +18 -6
- dstack/_internal/server/migrations/versions/2498ab323443_add_fleetmodel_consolidation_attempt_.py +44 -0
- dstack/_internal/server/models.py +5 -2
- dstack/_internal/server/schemas/runner.py +1 -0
- dstack/_internal/server/services/fleets.py +23 -25
- dstack/_internal/server/services/instances.py +3 -3
- dstack/_internal/server/services/jobs/configurators/base.py +46 -6
- dstack/_internal/server/services/jobs/configurators/dev.py +4 -4
- dstack/_internal/server/services/jobs/configurators/extensions/cursor.py +3 -5
- dstack/_internal/server/services/jobs/configurators/extensions/vscode.py +4 -6
- dstack/_internal/server/services/jobs/configurators/service.py +0 -3
- dstack/_internal/server/services/jobs/configurators/task.py +0 -3
- dstack/_internal/server/services/runs.py +16 -0
- dstack/_internal/server/statics/index.html +1 -1
- dstack/_internal/server/statics/{main-d151b300fcac3933213d.js → main-4eecc75fbe64067eb1bc.js} +1146 -899
- dstack/_internal/server/statics/{main-d151b300fcac3933213d.js.map → main-4eecc75fbe64067eb1bc.js.map} +1 -1
- dstack/_internal/server/statics/{main-aec4762350e34d6fbff9.css → main-56191c63d516fd0041c4.css} +1 -1
- dstack/_internal/server/testing/common.py +6 -3
- dstack/_internal/utils/path.py +8 -1
- dstack/_internal/utils/ssh.py +7 -0
- dstack/api/_public/repos.py +41 -6
- dstack/api/_public/runs.py +14 -1
- dstack/version.py +1 -1
- {dstack-0.19.26.dist-info → dstack-0.19.27.dist-info}/METADATA +2 -2
- {dstack-0.19.26.dist-info → dstack-0.19.27.dist-info}/RECORD +68 -53
- {dstack-0.19.26.dist-info → dstack-0.19.27.dist-info}/WHEEL +0 -0
- {dstack-0.19.26.dist-info → dstack-0.19.27.dist-info}/entry_points.txt +0 -0
- {dstack-0.19.26.dist-info → dstack-0.19.27.dist-info}/licenses/LICENSE.md +0 -0
dstack/_internal/core/backends/digitalocean_base/compute.py

@@ -0,0 +1,173 @@
+from typing import List, Optional
+
+import gpuhunt
+from gpuhunt.providers.digitalocean import DigitalOceanProvider
+
+from dstack._internal.core.backends.base.backend import Compute
+from dstack._internal.core.backends.base.compute import (
+    ComputeWithCreateInstanceSupport,
+    generate_unique_instance_name,
+    get_user_data,
+)
+from dstack._internal.core.backends.base.offers import get_catalog_offers
+from dstack._internal.core.backends.digitalocean_base.api_client import DigitalOceanAPIClient
+from dstack._internal.core.backends.digitalocean_base.models import BaseDigitalOceanConfig
+from dstack._internal.core.errors import BackendError
+from dstack._internal.core.models.backends.base import BackendType
+from dstack._internal.core.models.instances import (
+    InstanceAvailability,
+    InstanceConfiguration,
+    InstanceOfferWithAvailability,
+)
+from dstack._internal.core.models.placement import PlacementGroup
+from dstack._internal.core.models.runs import JobProvisioningData, Requirements
+from dstack._internal.utils.logging import get_logger
+
+logger = get_logger(__name__)
+
+MAX_INSTANCE_NAME_LEN = 60
+DOCKER_INSTALL_COMMANDS = [
+    "export DEBIAN_FRONTEND=noninteractive",
+    "mkdir -p /etc/apt/keyrings",
+    "curl --max-time 60 -fsSL https://download.docker.com/linux/ubuntu/gpg | gpg --dearmor -o /etc/apt/keyrings/docker.gpg",
+    'echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.gpg] https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable" | tee /etc/apt/sources.list.d/docker.list > /dev/null',
+    "apt-get update",
+    "apt-get --assume-yes install docker-ce docker-ce-cli containerd.io docker-compose-plugin",
+]
+
+
+class BaseDigitalOceanCompute(
+    ComputeWithCreateInstanceSupport,
+    Compute,
+):
+    def __init__(self, config: BaseDigitalOceanConfig, api_url: str, type: BackendType):
+        super().__init__()
+        self.config = config
+        self.api_client = DigitalOceanAPIClient(config.creds.api_key, api_url)
+        self.catalog = gpuhunt.Catalog(balance_resources=False, auto_reload=False)
+        self.BACKEND_TYPE = type
+        self.catalog.add_provider(
+            DigitalOceanProvider(api_key=config.creds.api_key, api_url=api_url)
+        )
+
+    def get_offers(
+        self, requirements: Optional[Requirements] = None
+    ) -> List[InstanceOfferWithAvailability]:
+        offers = get_catalog_offers(
+            backend=self.BACKEND_TYPE,
+            locations=self.config.regions,
+            requirements=requirements,
+            catalog=self.catalog,
+        )
+        return [
+            InstanceOfferWithAvailability(
+                **offer.dict(),
+                availability=InstanceAvailability.AVAILABLE,
+            )
+            for offer in offers
+        ]
+
+    def create_instance(
+        self,
+        instance_offer: InstanceOfferWithAvailability,
+        instance_config: InstanceConfiguration,
+        placement_group: Optional[PlacementGroup],
+    ) -> JobProvisioningData:
+        instance_name = generate_unique_instance_name(
+            instance_config, max_length=MAX_INSTANCE_NAME_LEN
+        )
+
+        project_ssh_key = instance_config.ssh_keys[0]
+        ssh_key_id = self.api_client.get_or_create_ssh_key(
+            name=f"dstack-{instance_config.project_name}",
+            public_key=project_ssh_key.public,
+        )
+        size_slug = instance_offer.instance.name
+
+        if not instance_offer.instance.resources.gpus:
+            backend_specific_commands = DOCKER_INSTALL_COMMANDS
+        else:
+            backend_specific_commands = None
+
+        project_id = None
+        if self.config.project_name:
+            project_id = self.api_client.get_project_id(self.config.project_name)
+            if project_id is None:
+                raise BackendError(f"Project {self.config.project_name} does not exist")
+        droplet_config = {
+            "name": instance_name,
+            "region": instance_offer.region,
+            "size": size_slug,
+            "image": self._get_image_for_instance(instance_offer),
+            "ssh_keys": [ssh_key_id],
+            "backups": False,
+            "ipv6": False,
+            "monitoring": False,
+            "tags": [],
+            "user_data": get_user_data(
+                authorized_keys=instance_config.get_public_keys(),
+                backend_specific_commands=backend_specific_commands,
+            ),
+            **({"project_id": project_id} if project_id is not None else {}),
+        }
+
+        droplet = self.api_client.create_droplet(droplet_config)
+
+        return JobProvisioningData(
+            backend=instance_offer.backend,
+            instance_type=instance_offer.instance,
+            instance_id=str(droplet["id"]),
+            hostname=None,
+            internal_ip=None,
+            region=instance_offer.region,
+            price=instance_offer.price,
+            username="root",
+            ssh_port=22,
+            dockerized=True,
+            ssh_proxy=None,
+            backend_data=None,
+        )
+
+    def update_provisioning_data(
+        self,
+        provisioning_data: JobProvisioningData,
+        project_ssh_public_key: str,
+        project_ssh_private_key: str,
+    ):
+        droplet = self.api_client.get_droplet(provisioning_data.instance_id)
+        if droplet["status"] == "active":
+            for network in droplet["networks"]["v4"]:
+                if network["type"] == "public":
+                    provisioning_data.hostname = network["ip_address"]
+                    break
+
+    def terminate_instance(
+        self, instance_id: str, region: str, backend_data: Optional[str] = None
+    ):
+        self.api_client.delete_droplet(instance_id)
+
+    def _get_image_for_instance(self, instance_offer: InstanceOfferWithAvailability) -> str:
+        if not instance_offer.instance.resources.gpus:
+            # No GPUs, use CPU image
+            return "ubuntu-24-04-x64"
+
+        gpu_count = len(instance_offer.instance.resources.gpus)
+        gpu_vendor = instance_offer.instance.resources.gpus[0].vendor
+
+        if gpu_vendor == gpuhunt.AcceleratorVendor.AMD:
+            # AMD GPU
+            return "digitaloceanai-rocmjupyter"
+        else:
+            # NVIDIA GPUs - DO only supports 1 and 8 GPU configurations.
+            # DO says for single GPU plans using GPUs other than H100s use "gpu-h100x1-base". DO does not provide guidance for x8 GPUs so assuming the same applies.
+            # See (https://docs.digitalocean.com/products/droplets/getting-started/recommended-gpu-setup/#aiml-ready-image)
+            if gpu_count == 8:
+                return "gpu-h100x8-base"
+            elif gpu_count == 1:
+                return "gpu-h100x1-base"
+            else:
+                # For Unsupported GPU count - use single GPU image and log warning
+                logger.warning(
+                    f"Unsupported NVIDIA GPU count: {gpu_count}, using single GPU image"
+                )
+                return "gpu-h100x1-base"
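For context, the file listing above also adds thin per-backend compute modules (dstack/_internal/core/backends/digitalocean/compute.py and dstack/_internal/core/backends/amddevcloud/compute.py, each only a few lines). A minimal sketch of how they might specialize BaseDigitalOceanCompute follows; the class names, constructor wiring, and the DigitalOcean endpoint are assumptions for illustration, not taken from this diff.

from dstack._internal.core.backends.digitalocean_base.compute import BaseDigitalOceanCompute
from dstack._internal.core.backends.digitalocean_base.models import BaseDigitalOceanConfig
from dstack._internal.core.models.backends.base import BackendType


class DigitalOceanCompute(BaseDigitalOceanCompute):
    def __init__(self, config: BaseDigitalOceanConfig):
        # Assumption: the public DigitalOcean API endpoint is passed as api_url.
        super().__init__(
            config=config,
            api_url="https://api.digitalocean.com",
            type=BackendType.DIGITALOCEAN,
        )


class AMDDevCloudCompute(BaseDigitalOceanCompute):
    def __init__(self, config: BaseDigitalOceanConfig, api_url: str):
        # Assumption: AMD Developer Cloud reuses the DigitalOcean-compatible API,
        # so only the endpoint and the backend type differ.
        super().__init__(config=config, api_url=api_url, type=BackendType.AMDDEVCLOUD)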
dstack/_internal/core/backends/digitalocean_base/configurator.py

@@ -0,0 +1,57 @@
+import json
+from typing import Optional
+
+from dstack._internal.core.backends.base.configurator import (
+    BackendRecord,
+    Configurator,
+)
+from dstack._internal.core.backends.digitalocean_base.backend import BaseDigitalOceanBackend
+from dstack._internal.core.backends.digitalocean_base.models import (
+    AnyBaseDigitalOceanCreds,
+    BaseDigitalOceanBackendConfig,
+    BaseDigitalOceanBackendConfigWithCreds,
+    BaseDigitalOceanConfig,
+    BaseDigitalOceanCreds,
+    BaseDigitalOceanStoredConfig,
+)
+
+
+class BaseDigitalOceanConfigurator(Configurator):
+    def validate_config(
+        self, config: BaseDigitalOceanBackendConfigWithCreds, default_creds_enabled: bool
+    ):
+        self._validate_creds(config.creds, config.project_name)
+
+    def create_backend(
+        self, project_name: str, config: BaseDigitalOceanBackendConfigWithCreds
+    ) -> BackendRecord:
+        return BackendRecord(
+            config=BaseDigitalOceanStoredConfig(
+                **BaseDigitalOceanBackendConfig.__response__.parse_obj(config).dict()
+            ).json(),
+            auth=BaseDigitalOceanCreds.parse_obj(config.creds).json(),
+        )
+
+    def get_backend_config_with_creds(
+        self, record: BackendRecord
+    ) -> BaseDigitalOceanBackendConfigWithCreds:
+        config = self._get_config(record)
+        return BaseDigitalOceanBackendConfigWithCreds.__response__.parse_obj(config)
+
+    def get_backend_config_without_creds(
+        self, record: BackendRecord
+    ) -> BaseDigitalOceanBackendConfig:
+        config = self._get_config(record)
+        return BaseDigitalOceanBackendConfig.__response__.parse_obj(config)
+
+    def get_backend(self, record: BackendRecord) -> BaseDigitalOceanBackend:
+        raise NotImplementedError("Subclasses must implement get_backend")
+
+    def _get_config(self, record: BackendRecord) -> BaseDigitalOceanConfig:
+        return BaseDigitalOceanConfig.__response__(
+            **json.loads(record.config),
+            creds=BaseDigitalOceanCreds.parse_raw(record.auth),
+        )
+
+    def _validate_creds(self, creds: AnyBaseDigitalOceanCreds, project_name: Optional[str] = None):
+        pass
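The base configurator deliberately leaves get_backend to subclasses. A hypothetical sketch of what a concrete subclass could look like (the actual dstack/_internal/core/backends/digitalocean/configurator.py is not shown in this excerpt, so the names and wiring below are illustrative only):

from dstack._internal.core.backends.base.configurator import BackendRecord
from dstack._internal.core.backends.digitalocean.backend import DigitalOceanBackend
from dstack._internal.core.backends.digitalocean_base.configurator import (
    BaseDigitalOceanConfigurator,
)
from dstack._internal.core.models.backends.base import BackendType


class DigitalOceanConfigurator(BaseDigitalOceanConfigurator):
    TYPE = BackendType.DIGITALOCEAN  # illustrative; attribute name is an assumption

    def get_backend(self, record: BackendRecord) -> DigitalOceanBackend:
        # Reuse the shared config loader provided by the base class.
        return DigitalOceanBackend(config=self._get_config(record))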
dstack/_internal/core/backends/digitalocean_base/models.py

@@ -0,0 +1,43 @@
+from typing import Annotated, List, Literal, Optional, Union
+
+from pydantic import Field
+
+from dstack._internal.core.models.common import CoreModel
+
+
+class BaseDigitalOceanAPIKeyCreds(CoreModel):
+    type: Annotated[Literal["api_key"], Field(description="The type of credentials")] = "api_key"
+    api_key: Annotated[str, Field(description="The API key")]
+
+
+AnyBaseDigitalOceanCreds = BaseDigitalOceanAPIKeyCreds
+BaseDigitalOceanCreds = AnyBaseDigitalOceanCreds
+
+
+class BaseDigitalOceanBackendConfig(CoreModel):
+    type: Annotated[
+        Literal["amddevcloud", "digitalocean"],
+        Field(description="The type of backend"),
+    ]
+    project_name: Annotated[Optional[str], Field(description="The name of the project")] = None
+    regions: Annotated[
+        Optional[List[str]],
+        Field(description="The list of regions. Omit to use all regions"),
+    ] = None
+
+
+class BaseDigitalOceanBackendConfigWithCreds(BaseDigitalOceanBackendConfig):
+    creds: Annotated[AnyBaseDigitalOceanCreds, Field(description="The credentials")]
+
+
+AnyBaseDigitalOceanBackendConfig = Union[
+    BaseDigitalOceanBackendConfig, BaseDigitalOceanBackendConfigWithCreds
+]
+
+
+class BaseDigitalOceanStoredConfig(BaseDigitalOceanBackendConfig):
+    pass
+
+
+class BaseDigitalOceanConfig(BaseDigitalOceanStoredConfig):
+    creds: AnyBaseDigitalOceanCreds
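A minimal usage sketch of the new credential and config models (CoreModel is pydantic-based, so construction and serialization below assume standard pydantic behavior; the values are illustrative):

from dstack._internal.core.backends.digitalocean_base.models import (
    BaseDigitalOceanAPIKeyCreds,
    BaseDigitalOceanBackendConfigWithCreds,
)

config = BaseDigitalOceanBackendConfigWithCreds(
    type="digitalocean",
    project_name="my-do-project",  # optional; per compute.py, it must exist or a BackendError is raised
    regions=["nyc2"],              # omit to allow all regions
    creds=BaseDigitalOceanAPIKeyCreds(api_key="dop_v1_example"),
)
print(config.json())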
dstack/_internal/core/backends/gcp/compute.py

@@ -2,6 +2,7 @@ import concurrent.futures
 import json
 import threading
 from collections import defaultdict
+from dataclasses import dataclass
 from typing import Callable, Dict, List, Literal, Optional, Tuple
 
 import google.api_core.exceptions
@@ -285,16 +286,18 @@ class GCPCompute(
             )
             raise NoCapacityError()
 
+        image = _get_image(
+            instance_type_name=instance_offer.instance.name,
+            cuda=len(instance_offer.instance.resources.gpus) > 0,
+        )
+
         for zone in zones:
            request = compute_v1.InsertInstanceRequest()
            request.zone = zone
            request.project = self.config.project_id
            request.instance_resource = gcp_resources.create_instance_struct(
                disk_size=disk_size,
-                image_id=
-                    instance_type_name=instance_offer.instance.name,
-                    cuda=len(instance_offer.instance.resources.gpus) > 0,
-                ),
+                image_id=image.id,
                machine_type=instance_offer.instance.name,
                accelerators=gcp_resources.get_accelerators(
                    project_id=self.config.project_id,
@@ -305,6 +308,7 @@ class GCPCompute(
                user_data=_get_user_data(
                    authorized_keys=authorized_keys,
                    instance_type_name=instance_offer.instance.name,
+                    is_ufw_installed=image.is_ufw_installed,
                ),
                authorized_keys=authorized_keys,
                labels=labels,
@@ -889,24 +893,41 @@ def _get_vpc_subnet(
     )
 
 
-
+@dataclass
+class GCPImage:
+    id: str
+    is_ufw_installed: bool
+
+
+def _get_image(instance_type_name: str, cuda: bool) -> GCPImage:
     if instance_type_name == "a3-megagpu-8g":
         image_name = "dstack-a3mega-5"
+        is_ufw_installed = False
     elif instance_type_name in ["a3-edgegpu-8g", "a3-highgpu-8g"]:
-        return
+        return GCPImage(
+            id="projects/cos-cloud/global/images/cos-105-17412-535-78",
+            is_ufw_installed=False,
+        )
     elif cuda:
         image_name = f"dstack-cuda-{version.base_image}"
+        is_ufw_installed = True
     else:
         image_name = f"dstack-{version.base_image}"
+        is_ufw_installed = True
     image_name = image_name.replace(".", "-")
-    return
+    return GCPImage(
+        id=f"projects/dstack/global/images/{image_name}",
+        is_ufw_installed=is_ufw_installed,
+    )
 
 
 def _get_gateway_image_id() -> str:
     return "projects/ubuntu-os-cloud/global/images/ubuntu-2204-jammy-v20230714"
 
 
-def _get_user_data(
+def _get_user_data(
+    authorized_keys: List[str], instance_type_name: str, is_ufw_installed: bool
+) -> str:
     base_path = None
     bin_path = None
     backend_shim_env = None
@@ -929,6 +950,9 @@ def _get_user_data(authorized_keys: List[str], instance_type_name: str) -> str:
        base_path=base_path,
        bin_path=bin_path,
        backend_shim_env=backend_shim_env,
+        # Instance-level firewall is optional on GCP. The main protection comes from GCP firewalls.
+        # So only set up instance-level firewall as an additional measure if ufw is available.
+        skip_firewall_setup=not is_ufw_installed,
     )
 
 
dstack/_internal/core/backends/hotaisle/api_client.py

@@ -16,46 +16,38 @@ class HotAisleAPIClient:
         self.team_handle = team_handle
 
     def validate_api_key(self) -> bool:
+        url = f"{API_URL}/user/"
         try:
-            self.
-
+            response = self._make_request("GET", url)
+            response.raise_for_status()
         except requests.HTTPError as e:
-            if e.response
-
-
-
-
-
-
-
-
-
-
-            error_message = str(e)
-            if "No Hot Aisle teams found" in error_message:
-                raise_invalid_credentials_error(
-                    fields=[["creds", "api_key"]],
-                    details="Valid API key but no teams found for this user",
-                )
-            elif "not found" in error_message:
-                raise_invalid_credentials_error(
-                    fields=[["team_handle"]], details=f"Team handle '{self.team_handle}' not found"
-                )
-            raise e
-
-    def _validate_user_and_team(self) -> None:
-        url = f"{API_URL}/user/"
-        response = self._make_request("GET", url)
-        response.raise_for_status()
-        user_data = response.json()
+            if e.response is not None:
+                if e.response.status_code == 401:
+                    raise_invalid_credentials_error(
+                        fields=[["creds", "api_key"]], details="Invalid API key"
+                    )
+                if e.response.status_code == 403:
+                    raise_invalid_credentials_error(
+                        fields=[["creds", "api_key"]],
+                        details="Authenticated user does not have required permissions",
+                    )
+            raise
 
-
+        user_data = response.json()
+        teams = user_data["teams"]
         if not teams:
-
+            raise_invalid_credentials_error(
+                fields=[["creds", "api_key"]],
+                details="Valid API key but no teams found for this user",
+            )
 
         available_teams = [team["handle"] for team in teams]
         if self.team_handle not in available_teams:
-
+            raise_invalid_credentials_error(
+                fields=[["team_handle"]],
+                details=f"Team handle '{self.team_handle}' not found",
+            )
+        return True
 
     def upload_ssh_key(self, public_key: str) -> bool:
         url = f"{API_URL}/user/ssh_keys/"
dstack/_internal/core/backends/hotaisle/compute.py

@@ -28,8 +28,6 @@ from dstack._internal.utils.logging import get_logger
 
 logger = get_logger(__name__)
 
-MAX_INSTANCE_NAME_LEN = 60
-
 
 INSTANCE_TYPE_SPECS = {
     "1x MI300X 8x Xeon Platinum 8462Y+": {
@@ -130,9 +128,7 @@ class HotAisleCompute(
            ssh_port=22,
            dockerized=True,
            ssh_proxy=None,
-            backend_data=HotAisleInstanceBackendData(
-                ip_address=vm_data["ip_address"], vm_id=vm_data["name"]
-            ).json(),
+            backend_data=HotAisleInstanceBackendData(ip_address=vm_data["ip_address"]).json(),
        )
 
    def update_provisioning_data(
@@ -217,7 +213,6 @@ def _run_ssh_command(hostname: str, ssh_private_key: str, command: str):
 
 class HotAisleInstanceBackendData(CoreModel):
     ip_address: str
-    vm_id: Optional[str] = None
 
     @classmethod
     def load(cls, raw: Optional[str]) -> "HotAisleInstanceBackendData":
dstack/_internal/core/backends/models.py

@@ -20,6 +20,10 @@ from dstack._internal.core.backends.datacrunch.models import (
     DataCrunchBackendConfig,
     DataCrunchBackendConfigWithCreds,
 )
+from dstack._internal.core.backends.digitalocean_base.models import (
+    BaseDigitalOceanBackendConfig,
+    BaseDigitalOceanBackendConfigWithCreds,
+)
 from dstack._internal.core.backends.dstack.models import (
     DstackBackendConfig,
     DstackBaseBackendConfig,
@@ -77,6 +81,7 @@ AnyBackendConfigWithoutCreds = Union[
     CloudRiftBackendConfig,
     CudoBackendConfig,
     DataCrunchBackendConfig,
+    BaseDigitalOceanBackendConfig,
     GCPBackendConfig,
     HotAisleBackendConfig,
     KubernetesBackendConfig,
@@ -100,6 +105,7 @@ AnyBackendConfigWithCreds = Union[
     CloudRiftBackendConfigWithCreds,
     CudoBackendConfigWithCreds,
     DataCrunchBackendConfigWithCreds,
+    BaseDigitalOceanBackendConfigWithCreds,
     GCPBackendConfigWithCreds,
     HotAisleBackendConfigWithCreds,
     KubernetesBackendConfigWithCreds,
@@ -122,6 +128,7 @@ AnyBackendFileConfigWithCreds = Union[
     CloudRiftBackendConfigWithCreds,
     CudoBackendConfigWithCreds,
     DataCrunchBackendConfigWithCreds,
+    BaseDigitalOceanBackendConfigWithCreds,
     GCPBackendFileConfigWithCreds,
     HotAisleBackendFileConfigWithCreds,
     KubernetesBackendFileConfigWithCreds,
dstack/_internal/core/backends/nebius/compute.py

@@ -59,13 +59,6 @@ DOCKER_DAEMON_CONFIG = {
     "exec-opts": ["native.cgroupdriver=cgroupfs"],
 }
 SETUP_COMMANDS = [
-    "ufw allow ssh",
-    "ufw allow from 10.0.0.0/8",
-    "ufw allow from 172.16.0.0/12",
-    "ufw allow from 192.168.0.0/16",
-    "ufw default deny incoming",
-    "ufw default allow outgoing",
-    "ufw enable",
     'sed -i "s/.*AllowTcpForwarding.*/AllowTcpForwarding yes/g" /etc/ssh/sshd_config',
     "service ssh restart",
     f"echo {shlex.quote(json.dumps(DOCKER_DAEMON_CONFIG))} > /etc/docker/daemon.json",
dstack/_internal/core/backends/oci/compute.py

@@ -135,11 +135,10 @@ class OCICompute(
            security_group.id, region.virtual_network_client
        )
 
-
-
-
-
-        cloud_init_user_data = get_user_data(instance_config.get_public_keys(), setup_commands)
+        cloud_init_user_data = get_user_data(
+            authorized_keys=instance_config.get_public_keys(),
+            firewall_allow_from_subnets=[resources.VCN_CIDR],
+        )
 
        display_name = generate_unique_instance_name(instance_config)
        try:
dstack/_internal/core/backends/vultr/compute.py

@@ -75,17 +75,13 @@ class VultrCompute(
        subnet = vpc["v4_subnet"]
        subnet_mask = vpc["v4_subnet_mask"]
 
-        setup_commands = [
-            f"sudo ufw allow from {subnet}/{subnet_mask}",
-            "sudo ufw reload",
-        ]
        instance_id = self.api_client.launch_instance(
            region=instance_offer.region,
            label=instance_name,
            plan=instance_offer.instance.name,
            user_data=get_user_data(
                authorized_keys=instance_config.get_public_keys(),
-
+                firewall_allow_from_subnets=[f"{subnet}/{subnet_mask}"],
            ),
            vpc_id=vpc["id"],
        )
dstack/_internal/core/compatibility/fleets.py

@@ -59,6 +59,11 @@ def get_fleet_spec_excludes(fleet_spec: FleetSpec) -> Optional[IncludeExcludeDictType]:
         profile_excludes.add("stop_criteria")
     if profile.schedule is None:
         profile_excludes.add("schedule")
+    if (
+        fleet_spec.configuration.nodes
+        and fleet_spec.configuration.nodes.min == fleet_spec.configuration.nodes.target
+    ):
+        configuration_excludes["nodes"] = {"target"}
     if configuration_excludes:
         spec_excludes["configuration"] = configuration_excludes
     if profile_excludes:
dstack/_internal/core/compatibility/runs.py

@@ -1,7 +1,7 @@
 from typing import Optional
 
 from dstack._internal.core.models.common import IncludeExcludeDictType, IncludeExcludeSetType
-from dstack._internal.core.models.configurations import ServiceConfiguration
+from dstack._internal.core.models.configurations import LEGACY_REPO_DIR, ServiceConfiguration
 from dstack._internal.core.models.runs import ApplyRunPlanInput, JobSpec, JobSubmission, RunSpec
 from dstack._internal.server.schemas.runs import GetRunPlanRequest, ListRunsRequest
 
@@ -102,6 +102,11 @@ def get_run_spec_excludes(run_spec: RunSpec) -> IncludeExcludeDictType:
     configuration = run_spec.configuration
     profile = run_spec.profile
 
+    if run_spec.repo_dir in [None, LEGACY_REPO_DIR]:
+        spec_excludes["repo_dir"] = True
+    elif run_spec.repo_dir == "." and configuration.working_dir in [None, LEGACY_REPO_DIR, "."]:
+        spec_excludes["repo_dir"] = True
+
     if configuration.fleets is None:
         configuration_excludes["fleets"] = True
     if profile is not None and profile.fleets is None:
@@ -163,6 +168,8 @@ def get_job_spec_excludes(job_specs: list[JobSpec]) -> IncludeExcludeDictType:
         spec_excludes["service_port"] = True
     if all(not s.probes for s in job_specs):
         spec_excludes["probes"] = True
+    if all(s.repo_dir in [None, LEGACY_REPO_DIR] for s in job_specs):
+        spec_excludes["repo_dir"] = True
 
     return spec_excludes
 
dstack/_internal/core/models/backends/base.py

@@ -4,13 +4,15 @@ import enum
 class BackendType(str, enum.Enum):
     """
     Attributes:
+        AMDDEVCLOUD (BackendType): AMD Developer Cloud
         AWS (BackendType): Amazon Web Services
         AZURE (BackendType): Microsoft Azure
         CLOUDRIFT (BackendType): CloudRift
         CUDO (BackendType): Cudo
+        DATACRUNCH (BackendType): DataCrunch
+        DIGITALOCEAN (BackendType): DigitalOcean
         DSTACK (BackendType): dstack Sky
         GCP (BackendType): Google Cloud Platform
-        DATACRUNCH (BackendType): DataCrunch
         HOTAISLE (BackendType): Hot Aisle
         KUBERNETES (BackendType): Kubernetes
         LAMBDA (BackendType): Lambda Cloud
@@ -22,11 +24,13 @@ class BackendType(str, enum.Enum):
         VULTR (BackendType): Vultr
     """
 
+    AMDDEVCLOUD = "amddevcloud"
     AWS = "aws"
     AZURE = "azure"
     CLOUDRIFT = "cloudrift"
     CUDO = "cudo"
     DATACRUNCH = "datacrunch"
+    DIGITALOCEAN = "digitalocean"
     DSTACK = "dstack"
     GCP = "gcp"
     HOTAISLE = "hotaisle"