dstack-0.19.26-py3-none-any.whl → dstack-0.19.28-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dstack has been flagged as potentially problematic; consult the registry's advisory page for details.
- dstack/_internal/cli/commands/__init__.py +11 -8
- dstack/_internal/cli/commands/apply.py +6 -3
- dstack/_internal/cli/commands/completion.py +3 -1
- dstack/_internal/cli/commands/config.py +1 -0
- dstack/_internal/cli/commands/init.py +4 -4
- dstack/_internal/cli/commands/offer.py +1 -1
- dstack/_internal/cli/commands/project.py +1 -0
- dstack/_internal/cli/commands/server.py +2 -2
- dstack/_internal/cli/main.py +1 -1
- dstack/_internal/cli/services/configurators/base.py +2 -4
- dstack/_internal/cli/services/configurators/fleet.py +4 -5
- dstack/_internal/cli/services/configurators/gateway.py +3 -5
- dstack/_internal/cli/services/configurators/run.py +165 -43
- dstack/_internal/cli/services/configurators/volume.py +3 -5
- dstack/_internal/cli/services/repos.py +1 -18
- dstack/_internal/core/backends/amddevcloud/__init__.py +1 -0
- dstack/_internal/core/backends/amddevcloud/backend.py +16 -0
- dstack/_internal/core/backends/amddevcloud/compute.py +5 -0
- dstack/_internal/core/backends/amddevcloud/configurator.py +29 -0
- dstack/_internal/core/backends/aws/compute.py +6 -1
- dstack/_internal/core/backends/base/compute.py +33 -5
- dstack/_internal/core/backends/base/offers.py +2 -0
- dstack/_internal/core/backends/configurators.py +15 -0
- dstack/_internal/core/backends/digitalocean/__init__.py +1 -0
- dstack/_internal/core/backends/digitalocean/backend.py +16 -0
- dstack/_internal/core/backends/digitalocean/compute.py +5 -0
- dstack/_internal/core/backends/digitalocean/configurator.py +31 -0
- dstack/_internal/core/backends/digitalocean_base/__init__.py +1 -0
- dstack/_internal/core/backends/digitalocean_base/api_client.py +104 -0
- dstack/_internal/core/backends/digitalocean_base/backend.py +5 -0
- dstack/_internal/core/backends/digitalocean_base/compute.py +173 -0
- dstack/_internal/core/backends/digitalocean_base/configurator.py +57 -0
- dstack/_internal/core/backends/digitalocean_base/models.py +43 -0
- dstack/_internal/core/backends/gcp/compute.py +32 -8
- dstack/_internal/core/backends/hotaisle/api_client.py +25 -33
- dstack/_internal/core/backends/hotaisle/compute.py +1 -6
- dstack/_internal/core/backends/models.py +7 -0
- dstack/_internal/core/backends/nebius/compute.py +0 -7
- dstack/_internal/core/backends/oci/compute.py +4 -5
- dstack/_internal/core/backends/vultr/compute.py +1 -5
- dstack/_internal/core/compatibility/fleets.py +5 -0
- dstack/_internal/core/compatibility/runs.py +10 -1
- dstack/_internal/core/models/backends/base.py +5 -1
- dstack/_internal/core/models/common.py +67 -43
- dstack/_internal/core/models/configurations.py +109 -69
- dstack/_internal/core/models/files.py +1 -1
- dstack/_internal/core/models/fleets.py +115 -25
- dstack/_internal/core/models/instances.py +5 -5
- dstack/_internal/core/models/profiles.py +66 -47
- dstack/_internal/core/models/repos/remote.py +21 -16
- dstack/_internal/core/models/resources.py +69 -65
- dstack/_internal/core/models/runs.py +41 -14
- dstack/_internal/core/services/repos.py +85 -80
- dstack/_internal/server/app.py +5 -0
- dstack/_internal/server/background/tasks/process_fleets.py +117 -13
- dstack/_internal/server/background/tasks/process_instances.py +12 -71
- dstack/_internal/server/background/tasks/process_running_jobs.py +2 -0
- dstack/_internal/server/background/tasks/process_runs.py +2 -0
- dstack/_internal/server/background/tasks/process_submitted_jobs.py +48 -16
- dstack/_internal/server/migrations/versions/2498ab323443_add_fleetmodel_consolidation_attempt_.py +44 -0
- dstack/_internal/server/models.py +11 -7
- dstack/_internal/server/schemas/gateways.py +10 -9
- dstack/_internal/server/schemas/runner.py +1 -0
- dstack/_internal/server/services/backends/handlers.py +2 -0
- dstack/_internal/server/services/docker.py +8 -7
- dstack/_internal/server/services/fleets.py +23 -25
- dstack/_internal/server/services/instances.py +3 -3
- dstack/_internal/server/services/jobs/configurators/base.py +46 -6
- dstack/_internal/server/services/jobs/configurators/dev.py +4 -4
- dstack/_internal/server/services/jobs/configurators/extensions/cursor.py +3 -5
- dstack/_internal/server/services/jobs/configurators/extensions/vscode.py +4 -6
- dstack/_internal/server/services/jobs/configurators/service.py +0 -3
- dstack/_internal/server/services/jobs/configurators/task.py +0 -3
- dstack/_internal/server/services/projects.py +52 -1
- dstack/_internal/server/services/runs.py +16 -0
- dstack/_internal/server/settings.py +46 -0
- dstack/_internal/server/statics/index.html +1 -1
- dstack/_internal/server/statics/{main-aec4762350e34d6fbff9.css → main-5e0d56245c4bd241ec27.css} +1 -1
- dstack/_internal/server/statics/{main-d151b300fcac3933213d.js → main-a2a16772fbf11a14d191.js} +1215 -998
- dstack/_internal/server/statics/{main-d151b300fcac3933213d.js.map → main-a2a16772fbf11a14d191.js.map} +1 -1
- dstack/_internal/server/testing/common.py +6 -3
- dstack/_internal/utils/env.py +85 -11
- dstack/_internal/utils/path.py +8 -1
- dstack/_internal/utils/ssh.py +7 -0
- dstack/api/_public/repos.py +41 -6
- dstack/api/_public/runs.py +14 -1
- dstack/version.py +1 -1
- {dstack-0.19.26.dist-info → dstack-0.19.28.dist-info}/METADATA +2 -2
- {dstack-0.19.26.dist-info → dstack-0.19.28.dist-info}/RECORD +92 -78
- dstack/_internal/server/statics/static/media/github.1f7102513534c83a9d8d735d2b8c12a2.svg +0 -3
- {dstack-0.19.26.dist-info → dstack-0.19.28.dist-info}/WHEEL +0 -0
- {dstack-0.19.26.dist-info → dstack-0.19.28.dist-info}/entry_points.txt +0 -0
- {dstack-0.19.26.dist-info → dstack-0.19.28.dist-info}/licenses/LICENSE.md +0 -0
|
@@ -1,6 +1,5 @@
|
|
|
1
1
|
import argparse
|
|
2
2
|
import time
|
|
3
|
-
from typing import List
|
|
4
3
|
|
|
5
4
|
from rich.table import Table
|
|
6
5
|
|
|
@@ -26,7 +25,7 @@ from dstack.api._public import Client
|
|
|
26
25
|
|
|
27
26
|
|
|
28
27
|
class VolumeConfigurator(BaseApplyConfigurator[VolumeConfiguration]):
|
|
29
|
-
TYPE
|
|
28
|
+
TYPE = ApplyConfigurationType.VOLUME
|
|
30
29
|
|
|
31
30
|
def apply_configuration(
|
|
32
31
|
self,
|
|
@@ -34,9 +33,8 @@ class VolumeConfigurator(BaseApplyConfigurator[VolumeConfiguration]):
|
|
|
34
33
|
configuration_path: str,
|
|
35
34
|
command_args: argparse.Namespace,
|
|
36
35
|
configurator_args: argparse.Namespace,
|
|
37
|
-
unknown_args: List[str],
|
|
38
36
|
):
|
|
39
|
-
self.apply_args(conf, configurator_args
|
|
37
|
+
self.apply_args(conf, configurator_args)
|
|
40
38
|
spec = VolumeSpec(
|
|
41
39
|
configuration=conf,
|
|
42
40
|
configuration_path=configuration_path,
|
|
@@ -167,7 +165,7 @@ class VolumeConfigurator(BaseApplyConfigurator[VolumeConfiguration]):
|
|
|
167
165
|
help="The volume name",
|
|
168
166
|
)
|
|
169
167
|
|
|
170
|
-
def apply_args(self, conf: VolumeConfiguration, args: argparse.Namespace
|
|
168
|
+
def apply_args(self, conf: VolumeConfiguration, args: argparse.Namespace):
|
|
171
169
|
if args.name:
|
|
172
170
|
conf.name = args.name
|
|
173
171
|
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import argparse
|
|
2
|
-
from typing import Literal,
|
|
2
|
+
from typing import Literal, Union, overload
|
|
3
3
|
|
|
4
4
|
import git
|
|
5
5
|
|
|
@@ -8,7 +8,6 @@ from dstack._internal.core.errors import CLIError
|
|
|
8
8
|
from dstack._internal.core.models.repos.local import LocalRepo
|
|
9
9
|
from dstack._internal.core.models.repos.remote import GitRepoURL, RemoteRepo, RepoError
|
|
10
10
|
from dstack._internal.core.models.repos.virtual import VirtualRepo
|
|
11
|
-
from dstack._internal.core.services.repos import get_default_branch
|
|
12
11
|
from dstack._internal.utils.path import PathLike
|
|
13
12
|
from dstack.api._public import Client
|
|
14
13
|
|
|
@@ -43,22 +42,6 @@ def init_default_virtual_repo(api: Client) -> VirtualRepo:
|
|
|
43
42
|
return repo
|
|
44
43
|
|
|
45
44
|
|
|
46
|
-
def get_repo_from_url(
|
|
47
|
-
repo_url: str, repo_branch: Optional[str] = None, repo_hash: Optional[str] = None
|
|
48
|
-
) -> RemoteRepo:
|
|
49
|
-
if repo_branch is None and repo_hash is None:
|
|
50
|
-
repo_branch = get_default_branch(repo_url)
|
|
51
|
-
if repo_branch is None:
|
|
52
|
-
raise CLIError(
|
|
53
|
-
"Failed to automatically detect remote repo branch. Specify branch or hash."
|
|
54
|
-
)
|
|
55
|
-
return RemoteRepo.from_url(
|
|
56
|
-
repo_url=repo_url,
|
|
57
|
-
repo_branch=repo_branch,
|
|
58
|
-
repo_hash=repo_hash,
|
|
59
|
-
)
|
|
60
|
-
|
|
61
|
-
|
|
62
45
|
@overload
|
|
63
46
|
def get_repo_from_dir(repo_dir: PathLike, local: Literal[False] = False) -> RemoteRepo: ...
|
|
64
47
|
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# This package contains the implementation for the AMDDevCloud backend.
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
from dstack._internal.core.backends.amddevcloud.compute import AMDDevCloudCompute
|
|
2
|
+
from dstack._internal.core.backends.digitalocean_base.backend import BaseDigitalOceanBackend
|
|
3
|
+
from dstack._internal.core.backends.digitalocean_base.models import BaseDigitalOceanConfig
|
|
4
|
+
from dstack._internal.core.models.backends.base import BackendType
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class AMDDevCloudBackend(BaseDigitalOceanBackend):
|
|
8
|
+
TYPE = BackendType.AMDDEVCLOUD
|
|
9
|
+
COMPUTE_CLASS = AMDDevCloudCompute
|
|
10
|
+
|
|
11
|
+
def __init__(self, config: BaseDigitalOceanConfig, api_url: str):
|
|
12
|
+
self.config = config
|
|
13
|
+
self._compute = AMDDevCloudCompute(self.config, api_url=api_url, type=self.TYPE)
|
|
14
|
+
|
|
15
|
+
def compute(self) -> AMDDevCloudCompute:
|
|
16
|
+
return self._compute
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
from typing import Optional
|
|
2
|
+
|
|
3
|
+
from dstack._internal.core.backends.amddevcloud.backend import AMDDevCloudBackend
|
|
4
|
+
from dstack._internal.core.backends.base.configurator import BackendRecord
|
|
5
|
+
from dstack._internal.core.backends.digitalocean_base.api_client import DigitalOceanAPIClient
|
|
6
|
+
from dstack._internal.core.backends.digitalocean_base.backend import BaseDigitalOceanBackend
|
|
7
|
+
from dstack._internal.core.backends.digitalocean_base.configurator import (
|
|
8
|
+
BaseDigitalOceanConfigurator,
|
|
9
|
+
)
|
|
10
|
+
from dstack._internal.core.backends.digitalocean_base.models import AnyBaseDigitalOceanCreds
|
|
11
|
+
from dstack._internal.core.models.backends.base import (
|
|
12
|
+
BackendType,
|
|
13
|
+
)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class AMDDevCloudConfigurator(BaseDigitalOceanConfigurator):
|
|
17
|
+
TYPE = BackendType.AMDDEVCLOUD
|
|
18
|
+
BACKEND_CLASS = AMDDevCloudBackend
|
|
19
|
+
API_URL = "https://api-amd.digitalocean.com"
|
|
20
|
+
|
|
21
|
+
def get_backend(self, record: BackendRecord) -> BaseDigitalOceanBackend:
|
|
22
|
+
config = self._get_config(record)
|
|
23
|
+
return AMDDevCloudBackend(config=config, api_url=self.API_URL)
|
|
24
|
+
|
|
25
|
+
def _validate_creds(self, creds: AnyBaseDigitalOceanCreds, project_name: Optional[str] = None):
|
|
26
|
+
api_client = DigitalOceanAPIClient(creds.api_key, self.API_URL)
|
|
27
|
+
api_client.validate_api_key()
|
|
28
|
+
if project_name:
|
|
29
|
+
api_client.validate_project_name(project_name)
|
|
@@ -292,7 +292,12 @@ class AWSCompute(
|
|
|
292
292
|
image_id=image_id,
|
|
293
293
|
instance_type=instance_offer.instance.name,
|
|
294
294
|
iam_instance_profile=self.config.iam_instance_profile,
|
|
295
|
-
user_data=get_user_data(
|
|
295
|
+
user_data=get_user_data(
|
|
296
|
+
authorized_keys=instance_config.get_public_keys(),
|
|
297
|
+
# Custom OS images may lack ufw, so don't attempt to set up the firewall.
|
|
298
|
+
# Rely on security groups and the image's built-in firewall rules instead.
|
|
299
|
+
skip_firewall_setup=self.config.os_images is not None,
|
|
300
|
+
),
|
|
296
301
|
tags=aws_resources.make_tags(tags),
|
|
297
302
|
security_group_id=security_group_id,
|
|
298
303
|
spot=instance_offer.instance.resources.spot,
|
|
@@ -4,6 +4,7 @@ import re
|
|
|
4
4
|
import string
|
|
5
5
|
import threading
|
|
6
6
|
from abc import ABC, abstractmethod
|
|
7
|
+
from collections.abc import Iterable
|
|
7
8
|
from functools import lru_cache
|
|
8
9
|
from pathlib import Path
|
|
9
10
|
from typing import Dict, List, Literal, Optional
|
|
@@ -19,7 +20,7 @@ from dstack._internal.core.consts import (
|
|
|
19
20
|
DSTACK_RUNNER_SSH_PORT,
|
|
20
21
|
DSTACK_SHIM_HTTP_PORT,
|
|
21
22
|
)
|
|
22
|
-
from dstack._internal.core.models.configurations import
|
|
23
|
+
from dstack._internal.core.models.configurations import LEGACY_REPO_DIR
|
|
23
24
|
from dstack._internal.core.models.gateways import (
|
|
24
25
|
GatewayComputeConfiguration,
|
|
25
26
|
GatewayProvisioningData,
|
|
@@ -45,6 +46,7 @@ logger = get_logger(__name__)
|
|
|
45
46
|
|
|
46
47
|
DSTACK_SHIM_BINARY_NAME = "dstack-shim"
|
|
47
48
|
DSTACK_RUNNER_BINARY_NAME = "dstack-runner"
|
|
49
|
+
DEFAULT_PRIVATE_SUBNETS = ("10.0.0.0/8", "172.16.0.0/12", "192.168.0.0/16")
|
|
48
50
|
|
|
49
51
|
GoArchType = Literal["amd64", "arm64"]
|
|
50
52
|
|
|
@@ -507,12 +509,16 @@ def get_user_data(
|
|
|
507
509
|
base_path: Optional[PathLike] = None,
|
|
508
510
|
bin_path: Optional[PathLike] = None,
|
|
509
511
|
backend_shim_env: Optional[Dict[str, str]] = None,
|
|
512
|
+
skip_firewall_setup: bool = False,
|
|
513
|
+
firewall_allow_from_subnets: Iterable[str] = DEFAULT_PRIVATE_SUBNETS,
|
|
510
514
|
) -> str:
|
|
511
515
|
shim_commands = get_shim_commands(
|
|
512
516
|
authorized_keys=authorized_keys,
|
|
513
517
|
base_path=base_path,
|
|
514
518
|
bin_path=bin_path,
|
|
515
519
|
backend_shim_env=backend_shim_env,
|
|
520
|
+
skip_firewall_setup=skip_firewall_setup,
|
|
521
|
+
firewall_allow_from_subnets=firewall_allow_from_subnets,
|
|
516
522
|
)
|
|
517
523
|
commands = (backend_specific_commands or []) + shim_commands
|
|
518
524
|
return get_cloud_config(
|
|
@@ -554,8 +560,13 @@ def get_shim_commands(
|
|
|
554
560
|
bin_path: Optional[PathLike] = None,
|
|
555
561
|
backend_shim_env: Optional[Dict[str, str]] = None,
|
|
556
562
|
arch: Optional[str] = None,
|
|
563
|
+
skip_firewall_setup: bool = False,
|
|
564
|
+
firewall_allow_from_subnets: Iterable[str] = DEFAULT_PRIVATE_SUBNETS,
|
|
557
565
|
) -> List[str]:
|
|
558
|
-
commands = get_setup_cloud_instance_commands(
|
|
566
|
+
commands = get_setup_cloud_instance_commands(
|
|
567
|
+
skip_firewall_setup=skip_firewall_setup,
|
|
568
|
+
firewall_allow_from_subnets=firewall_allow_from_subnets,
|
|
569
|
+
)
|
|
559
570
|
commands += get_shim_pre_start_commands(
|
|
560
571
|
base_path=base_path,
|
|
561
572
|
bin_path=bin_path,
|
|
@@ -638,8 +649,11 @@ def get_dstack_shim_download_url(arch: Optional[str] = None) -> str:
|
|
|
638
649
|
return url_template.format(version=version, arch=arch)
|
|
639
650
|
|
|
640
651
|
|
|
641
|
-
def get_setup_cloud_instance_commands(
|
|
642
|
-
|
|
652
|
+
def get_setup_cloud_instance_commands(
|
|
653
|
+
skip_firewall_setup: bool,
|
|
654
|
+
firewall_allow_from_subnets: Iterable[str],
|
|
655
|
+
) -> list[str]:
|
|
656
|
+
commands = [
|
|
643
657
|
# Workaround for https://github.com/NVIDIA/nvidia-container-toolkit/issues/48
|
|
644
658
|
# Attempts to patch /etc/docker/daemon.json while keeping any custom settings it may have.
|
|
645
659
|
(
|
|
@@ -653,6 +667,19 @@ def get_setup_cloud_instance_commands() -> list[str]:
|
|
|
653
667
|
"'"
|
|
654
668
|
),
|
|
655
669
|
]
|
|
670
|
+
if not skip_firewall_setup:
|
|
671
|
+
commands += [
|
|
672
|
+
"ufw --force reset", # Some OS images have default rules like `allow 80`. Delete them
|
|
673
|
+
"ufw default deny incoming",
|
|
674
|
+
"ufw default allow outgoing",
|
|
675
|
+
"ufw allow ssh",
|
|
676
|
+
]
|
|
677
|
+
for subnet in firewall_allow_from_subnets:
|
|
678
|
+
commands.append(f"ufw allow from {subnet}")
|
|
679
|
+
commands += [
|
|
680
|
+
"ufw --force enable",
|
|
681
|
+
]
|
|
682
|
+
return commands
|
|
656
683
|
|
|
657
684
|
|
|
658
685
|
def get_shim_pre_start_commands(
|
|
@@ -773,7 +800,8 @@ def get_docker_commands(
|
|
|
773
800
|
f" --ssh-port {DSTACK_RUNNER_SSH_PORT}"
|
|
774
801
|
" --temp-dir /tmp/runner"
|
|
775
802
|
" --home-dir /root"
|
|
776
|
-
|
|
803
|
+
# TODO: Not used, left for compatibility with old runners. Remove eventually.
|
|
804
|
+
f" --working-dir {LEGACY_REPO_DIR}"
|
|
777
805
|
),
|
|
778
806
|
]
|
|
779
807
|
|
|
@@ -34,6 +34,8 @@ def get_catalog_offers(
|
|
|
34
34
|
provider = backend.value
|
|
35
35
|
if backend == BackendType.LAMBDA:
|
|
36
36
|
provider = "lambdalabs"
|
|
37
|
+
if backend == BackendType.AMDDEVCLOUD:
|
|
38
|
+
provider = "digitalocean"
|
|
37
39
|
q = requirements_to_query_filter(requirements)
|
|
38
40
|
q.provider = [provider]
|
|
39
41
|
offers = []
|
|
@@ -5,6 +5,12 @@ from dstack._internal.core.models.backends.base import BackendType
|
|
|
5
5
|
|
|
6
6
|
_CONFIGURATOR_CLASSES: List[Type[Configurator]] = []
|
|
7
7
|
|
|
8
|
+
try:
|
|
9
|
+
from dstack._internal.core.backends.amddevcloud.configurator import AMDDevCloudConfigurator
|
|
10
|
+
|
|
11
|
+
_CONFIGURATOR_CLASSES.append(AMDDevCloudConfigurator)
|
|
12
|
+
except ImportError:
|
|
13
|
+
pass
|
|
8
14
|
|
|
9
15
|
try:
|
|
10
16
|
from dstack._internal.core.backends.aws.configurator import AWSConfigurator
|
|
@@ -47,6 +53,15 @@ try:
|
|
|
47
53
|
except ImportError:
|
|
48
54
|
pass
|
|
49
55
|
|
|
56
|
+
try:
|
|
57
|
+
from dstack._internal.core.backends.digitalocean.configurator import (
|
|
58
|
+
DigitalOceanConfigurator,
|
|
59
|
+
)
|
|
60
|
+
|
|
61
|
+
_CONFIGURATOR_CLASSES.append(DigitalOceanConfigurator)
|
|
62
|
+
except ImportError:
|
|
63
|
+
pass
|
|
64
|
+
|
|
50
65
|
try:
|
|
51
66
|
from dstack._internal.core.backends.gcp.configurator import GCPConfigurator
|
|
52
67
|
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# DigitalOcean backend for dstack
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
from dstack._internal.core.backends.digitalocean.compute import DigitalOceanCompute
|
|
2
|
+
from dstack._internal.core.backends.digitalocean_base.backend import BaseDigitalOceanBackend
|
|
3
|
+
from dstack._internal.core.backends.digitalocean_base.models import BaseDigitalOceanConfig
|
|
4
|
+
from dstack._internal.core.models.backends.base import BackendType
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class DigitalOceanBackend(BaseDigitalOceanBackend):
|
|
8
|
+
TYPE = BackendType.DIGITALOCEAN
|
|
9
|
+
COMPUTE_CLASS = DigitalOceanCompute
|
|
10
|
+
|
|
11
|
+
def __init__(self, config: BaseDigitalOceanConfig, api_url: str):
|
|
12
|
+
self.config = config
|
|
13
|
+
self._compute = DigitalOceanCompute(self.config, api_url=api_url, type=self.TYPE)
|
|
14
|
+
|
|
15
|
+
def compute(self) -> DigitalOceanCompute:
|
|
16
|
+
return self._compute
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
from typing import Optional
|
|
2
|
+
|
|
3
|
+
from dstack._internal.core.backends.base.configurator import BackendRecord
|
|
4
|
+
from dstack._internal.core.backends.digitalocean.backend import DigitalOceanBackend
|
|
5
|
+
from dstack._internal.core.backends.digitalocean_base.api_client import DigitalOceanAPIClient
|
|
6
|
+
from dstack._internal.core.backends.digitalocean_base.backend import BaseDigitalOceanBackend
|
|
7
|
+
from dstack._internal.core.backends.digitalocean_base.configurator import (
|
|
8
|
+
BaseDigitalOceanConfigurator,
|
|
9
|
+
)
|
|
10
|
+
from dstack._internal.core.backends.digitalocean_base.models import (
|
|
11
|
+
AnyBaseDigitalOceanCreds,
|
|
12
|
+
)
|
|
13
|
+
from dstack._internal.core.models.backends.base import (
|
|
14
|
+
BackendType,
|
|
15
|
+
)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class DigitalOceanConfigurator(BaseDigitalOceanConfigurator):
|
|
19
|
+
TYPE = BackendType.DIGITALOCEAN
|
|
20
|
+
BACKEND_CLASS = DigitalOceanBackend
|
|
21
|
+
API_URL = "https://api.digitalocean.com"
|
|
22
|
+
|
|
23
|
+
def get_backend(self, record: BackendRecord) -> BaseDigitalOceanBackend:
|
|
24
|
+
config = self._get_config(record)
|
|
25
|
+
return DigitalOceanBackend(config=config, api_url=self.API_URL)
|
|
26
|
+
|
|
27
|
+
def _validate_creds(self, creds: AnyBaseDigitalOceanCreds, project_name: Optional[str] = None):
|
|
28
|
+
api_client = DigitalOceanAPIClient(creds.api_key, self.API_URL)
|
|
29
|
+
api_client.validate_api_key()
|
|
30
|
+
if project_name:
|
|
31
|
+
api_client.validate_project_name(project_name)
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# This package contains the base classes for DigitalOcean and AMDDevCloud backends.
|
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
from typing import Any, Dict, List, Optional
|
|
2
|
+
|
|
3
|
+
import requests
|
|
4
|
+
|
|
5
|
+
from dstack._internal.core.backends.base.configurator import raise_invalid_credentials_error
|
|
6
|
+
from dstack._internal.core.errors import NoCapacityError
|
|
7
|
+
from dstack._internal.utils.logging import get_logger
|
|
8
|
+
|
|
9
|
+
logger = get_logger(__name__)
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class DigitalOceanAPIClient:
|
|
13
|
+
def __init__(self, api_key: str, api_url: str):
|
|
14
|
+
self.api_key = api_key
|
|
15
|
+
self.base_url = api_url
|
|
16
|
+
|
|
17
|
+
def validate_api_key(self) -> bool:
|
|
18
|
+
try:
|
|
19
|
+
response = self._make_request("GET", "/v2/account")
|
|
20
|
+
response.raise_for_status()
|
|
21
|
+
return True
|
|
22
|
+
except requests.HTTPError as e:
|
|
23
|
+
status = e.response.status_code
|
|
24
|
+
if status == 401:
|
|
25
|
+
raise_invalid_credentials_error(
|
|
26
|
+
fields=[["creds", "api_key"]], details="Invaild API key"
|
|
27
|
+
)
|
|
28
|
+
raise e
|
|
29
|
+
|
|
30
|
+
def validate_project_name(self, project_name: str) -> bool:
|
|
31
|
+
if self.get_project_id(project_name) is None:
|
|
32
|
+
raise_invalid_credentials_error(
|
|
33
|
+
fields=[["project_name"]],
|
|
34
|
+
details=f"Project with name '{project_name}' does not exist",
|
|
35
|
+
)
|
|
36
|
+
return True
|
|
37
|
+
|
|
38
|
+
def list_ssh_keys(self) -> List[Dict[str, Any]]:
|
|
39
|
+
response = self._make_request("GET", "/v2/account/keys")
|
|
40
|
+
response.raise_for_status()
|
|
41
|
+
return response.json()["ssh_keys"]
|
|
42
|
+
|
|
43
|
+
def list_projects(self) -> List[Dict[str, Any]]:
|
|
44
|
+
response = self._make_request("GET", "/v2/projects")
|
|
45
|
+
response.raise_for_status()
|
|
46
|
+
return response.json()["projects"]
|
|
47
|
+
|
|
48
|
+
def get_project_id(self, project_name: str) -> Optional[str]:
|
|
49
|
+
projects = self.list_projects()
|
|
50
|
+
for project in projects:
|
|
51
|
+
if project["name"] == project_name:
|
|
52
|
+
return project["id"]
|
|
53
|
+
return None
|
|
54
|
+
|
|
55
|
+
def create_ssh_key(self, name: str, public_key: str) -> Dict[str, Any]:
|
|
56
|
+
payload = {"name": name, "public_key": public_key}
|
|
57
|
+
response = self._make_request("POST", "/v2/account/keys", json=payload)
|
|
58
|
+
response.raise_for_status()
|
|
59
|
+
return response.json()["ssh_key"]
|
|
60
|
+
|
|
61
|
+
def get_or_create_ssh_key(self, name: str, public_key: str) -> int:
|
|
62
|
+
ssh_keys = self.list_ssh_keys()
|
|
63
|
+
for ssh_key in ssh_keys:
|
|
64
|
+
if ssh_key["public_key"].strip() == public_key.strip():
|
|
65
|
+
return ssh_key["id"]
|
|
66
|
+
|
|
67
|
+
ssh_key = self.create_ssh_key(name, public_key)
|
|
68
|
+
return ssh_key["id"]
|
|
69
|
+
|
|
70
|
+
def create_droplet(self, droplet_config: Dict[str, Any]) -> Dict[str, Any]:
|
|
71
|
+
response = self._make_request("POST", "/v2/droplets", json=droplet_config)
|
|
72
|
+
if response.status_code == 422:
|
|
73
|
+
raise NoCapacityError(response.json()["message"])
|
|
74
|
+
response.raise_for_status()
|
|
75
|
+
return response.json()["droplet"]
|
|
76
|
+
|
|
77
|
+
def get_droplet(self, droplet_id: str) -> Dict[str, Any]:
|
|
78
|
+
response = self._make_request("GET", f"/v2/droplets/{droplet_id}")
|
|
79
|
+
response.raise_for_status()
|
|
80
|
+
return response.json()["droplet"]
|
|
81
|
+
|
|
82
|
+
def delete_droplet(self, droplet_id: str) -> None:
|
|
83
|
+
response = self._make_request("DELETE", f"/v2/droplets/{droplet_id}")
|
|
84
|
+
if response.status_code == 404:
|
|
85
|
+
logger.debug("DigitalOcean droplet %s not found", droplet_id)
|
|
86
|
+
return
|
|
87
|
+
response.raise_for_status()
|
|
88
|
+
|
|
89
|
+
def _make_request(
|
|
90
|
+
self, method: str, endpoint: str, json: Optional[Dict[str, Any]] = None, timeout: int = 30
|
|
91
|
+
) -> requests.Response:
|
|
92
|
+
url = f"{self.base_url}{endpoint}"
|
|
93
|
+
headers = {
|
|
94
|
+
"Authorization": f"Bearer {self.api_key}",
|
|
95
|
+
}
|
|
96
|
+
|
|
97
|
+
response = requests.request(
|
|
98
|
+
method=method,
|
|
99
|
+
url=url,
|
|
100
|
+
headers=headers,
|
|
101
|
+
json=json,
|
|
102
|
+
timeout=timeout,
|
|
103
|
+
)
|
|
104
|
+
return response
|
|
@@ -0,0 +1,173 @@
|
|
|
1
|
+
from typing import List, Optional
|
|
2
|
+
|
|
3
|
+
import gpuhunt
|
|
4
|
+
from gpuhunt.providers.digitalocean import DigitalOceanProvider
|
|
5
|
+
|
|
6
|
+
from dstack._internal.core.backends.base.backend import Compute
|
|
7
|
+
from dstack._internal.core.backends.base.compute import (
|
|
8
|
+
ComputeWithCreateInstanceSupport,
|
|
9
|
+
generate_unique_instance_name,
|
|
10
|
+
get_user_data,
|
|
11
|
+
)
|
|
12
|
+
from dstack._internal.core.backends.base.offers import get_catalog_offers
|
|
13
|
+
from dstack._internal.core.backends.digitalocean_base.api_client import DigitalOceanAPIClient
|
|
14
|
+
from dstack._internal.core.backends.digitalocean_base.models import BaseDigitalOceanConfig
|
|
15
|
+
from dstack._internal.core.errors import BackendError
|
|
16
|
+
from dstack._internal.core.models.backends.base import BackendType
|
|
17
|
+
from dstack._internal.core.models.instances import (
|
|
18
|
+
InstanceAvailability,
|
|
19
|
+
InstanceConfiguration,
|
|
20
|
+
InstanceOfferWithAvailability,
|
|
21
|
+
)
|
|
22
|
+
from dstack._internal.core.models.placement import PlacementGroup
|
|
23
|
+
from dstack._internal.core.models.runs import JobProvisioningData, Requirements
|
|
24
|
+
from dstack._internal.utils.logging import get_logger
|
|
25
|
+
|
|
26
|
+
logger = get_logger(__name__)
|
|
27
|
+
|
|
28
|
+
MAX_INSTANCE_NAME_LEN = 60
|
|
29
|
+
DOCKER_INSTALL_COMMANDS = [
|
|
30
|
+
"export DEBIAN_FRONTEND=noninteractive",
|
|
31
|
+
"mkdir -p /etc/apt/keyrings",
|
|
32
|
+
"curl --max-time 60 -fsSL https://download.docker.com/linux/ubuntu/gpg | gpg --dearmor -o /etc/apt/keyrings/docker.gpg",
|
|
33
|
+
'echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.gpg] https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable" | tee /etc/apt/sources.list.d/docker.list > /dev/null',
|
|
34
|
+
"apt-get update",
|
|
35
|
+
"apt-get --assume-yes install docker-ce docker-ce-cli containerd.io docker-compose-plugin",
|
|
36
|
+
]
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
class BaseDigitalOceanCompute(
|
|
40
|
+
ComputeWithCreateInstanceSupport,
|
|
41
|
+
Compute,
|
|
42
|
+
):
|
|
43
|
+
def __init__(self, config: BaseDigitalOceanConfig, api_url: str, type: BackendType):
|
|
44
|
+
super().__init__()
|
|
45
|
+
self.config = config
|
|
46
|
+
self.api_client = DigitalOceanAPIClient(config.creds.api_key, api_url)
|
|
47
|
+
self.catalog = gpuhunt.Catalog(balance_resources=False, auto_reload=False)
|
|
48
|
+
self.BACKEND_TYPE = type
|
|
49
|
+
self.catalog.add_provider(
|
|
50
|
+
DigitalOceanProvider(api_key=config.creds.api_key, api_url=api_url)
|
|
51
|
+
)
|
|
52
|
+
|
|
53
|
+
def get_offers(
|
|
54
|
+
self, requirements: Optional[Requirements] = None
|
|
55
|
+
) -> List[InstanceOfferWithAvailability]:
|
|
56
|
+
offers = get_catalog_offers(
|
|
57
|
+
backend=self.BACKEND_TYPE,
|
|
58
|
+
locations=self.config.regions,
|
|
59
|
+
requirements=requirements,
|
|
60
|
+
catalog=self.catalog,
|
|
61
|
+
)
|
|
62
|
+
return [
|
|
63
|
+
InstanceOfferWithAvailability(
|
|
64
|
+
**offer.dict(),
|
|
65
|
+
availability=InstanceAvailability.AVAILABLE,
|
|
66
|
+
)
|
|
67
|
+
for offer in offers
|
|
68
|
+
]
|
|
69
|
+
|
|
70
|
+
def create_instance(
|
|
71
|
+
self,
|
|
72
|
+
instance_offer: InstanceOfferWithAvailability,
|
|
73
|
+
instance_config: InstanceConfiguration,
|
|
74
|
+
placement_group: Optional[PlacementGroup],
|
|
75
|
+
) -> JobProvisioningData:
|
|
76
|
+
instance_name = generate_unique_instance_name(
|
|
77
|
+
instance_config, max_length=MAX_INSTANCE_NAME_LEN
|
|
78
|
+
)
|
|
79
|
+
|
|
80
|
+
project_ssh_key = instance_config.ssh_keys[0]
|
|
81
|
+
ssh_key_id = self.api_client.get_or_create_ssh_key(
|
|
82
|
+
name=f"dstack-{instance_config.project_name}",
|
|
83
|
+
public_key=project_ssh_key.public,
|
|
84
|
+
)
|
|
85
|
+
size_slug = instance_offer.instance.name
|
|
86
|
+
|
|
87
|
+
if not instance_offer.instance.resources.gpus:
|
|
88
|
+
backend_specific_commands = DOCKER_INSTALL_COMMANDS
|
|
89
|
+
else:
|
|
90
|
+
backend_specific_commands = None
|
|
91
|
+
|
|
92
|
+
project_id = None
|
|
93
|
+
if self.config.project_name:
|
|
94
|
+
project_id = self.api_client.get_project_id(self.config.project_name)
|
|
95
|
+
if project_id is None:
|
|
96
|
+
raise BackendError(f"Project {self.config.project_name} does not exist")
|
|
97
|
+
droplet_config = {
|
|
98
|
+
"name": instance_name,
|
|
99
|
+
"region": instance_offer.region,
|
|
100
|
+
"size": size_slug,
|
|
101
|
+
"image": self._get_image_for_instance(instance_offer),
|
|
102
|
+
"ssh_keys": [ssh_key_id],
|
|
103
|
+
"backups": False,
|
|
104
|
+
"ipv6": False,
|
|
105
|
+
"monitoring": False,
|
|
106
|
+
"tags": [],
|
|
107
|
+
"user_data": get_user_data(
|
|
108
|
+
authorized_keys=instance_config.get_public_keys(),
|
|
109
|
+
backend_specific_commands=backend_specific_commands,
|
|
110
|
+
),
|
|
111
|
+
**({"project_id": project_id} if project_id is not None else {}),
|
|
112
|
+
}
|
|
113
|
+
|
|
114
|
+
droplet = self.api_client.create_droplet(droplet_config)
|
|
115
|
+
|
|
116
|
+
return JobProvisioningData(
|
|
117
|
+
backend=instance_offer.backend,
|
|
118
|
+
instance_type=instance_offer.instance,
|
|
119
|
+
instance_id=str(droplet["id"]),
|
|
120
|
+
hostname=None,
|
|
121
|
+
internal_ip=None,
|
|
122
|
+
region=instance_offer.region,
|
|
123
|
+
price=instance_offer.price,
|
|
124
|
+
username="root",
|
|
125
|
+
ssh_port=22,
|
|
126
|
+
dockerized=True,
|
|
127
|
+
ssh_proxy=None,
|
|
128
|
+
backend_data=None,
|
|
129
|
+
)
|
|
130
|
+
|
|
131
|
+
def update_provisioning_data(
|
|
132
|
+
self,
|
|
133
|
+
provisioning_data: JobProvisioningData,
|
|
134
|
+
project_ssh_public_key: str,
|
|
135
|
+
project_ssh_private_key: str,
|
|
136
|
+
):
|
|
137
|
+
droplet = self.api_client.get_droplet(provisioning_data.instance_id)
|
|
138
|
+
if droplet["status"] == "active":
|
|
139
|
+
for network in droplet["networks"]["v4"]:
|
|
140
|
+
if network["type"] == "public":
|
|
141
|
+
provisioning_data.hostname = network["ip_address"]
|
|
142
|
+
break
|
|
143
|
+
|
|
144
|
+
def terminate_instance(
|
|
145
|
+
self, instance_id: str, region: str, backend_data: Optional[str] = None
|
|
146
|
+
):
|
|
147
|
+
self.api_client.delete_droplet(instance_id)
|
|
148
|
+
|
|
149
|
+
def _get_image_for_instance(self, instance_offer: InstanceOfferWithAvailability) -> str:
|
|
150
|
+
if not instance_offer.instance.resources.gpus:
|
|
151
|
+
# No GPUs, use CPU image
|
|
152
|
+
return "ubuntu-24-04-x64"
|
|
153
|
+
|
|
154
|
+
gpu_count = len(instance_offer.instance.resources.gpus)
|
|
155
|
+
gpu_vendor = instance_offer.instance.resources.gpus[0].vendor
|
|
156
|
+
|
|
157
|
+
if gpu_vendor == gpuhunt.AcceleratorVendor.AMD:
|
|
158
|
+
# AMD GPU
|
|
159
|
+
return "digitaloceanai-rocmjupyter"
|
|
160
|
+
else:
|
|
161
|
+
# NVIDIA GPUs - DO only supports 1 and 8 GPU configurations.
|
|
162
|
+
# DO says for single GPU plans using GPUs other than H100s use "gpu-h100x1-base". DO does not provide guidance for x8 GPUs so assuming the same applies.
|
|
163
|
+
# See (https://docs.digitalocean.com/products/droplets/getting-started/recommended-gpu-setup/#aiml-ready-image)
|
|
164
|
+
if gpu_count == 8:
|
|
165
|
+
return "gpu-h100x8-base"
|
|
166
|
+
elif gpu_count == 1:
|
|
167
|
+
return "gpu-h100x1-base"
|
|
168
|
+
else:
|
|
169
|
+
# For Unsupported GPU count - use single GPU image and log warning
|
|
170
|
+
logger.warning(
|
|
171
|
+
f"Unsupported NVIDIA GPU count: {gpu_count}, using single GPU image"
|
|
172
|
+
)
|
|
173
|
+
return "gpu-h100x1-base"
|