dstack 0.19.15rc1__py3-none-any.whl → 0.19.17__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- dstack/_internal/cli/commands/secrets.py +92 -0
- dstack/_internal/cli/main.py +2 -0
- dstack/_internal/cli/services/completion.py +5 -0
- dstack/_internal/cli/services/configurators/run.py +59 -17
- dstack/_internal/cli/utils/secrets.py +25 -0
- dstack/_internal/core/backends/__init__.py +10 -4
- dstack/_internal/core/backends/cloudrift/__init__.py +0 -0
- dstack/_internal/core/backends/cloudrift/api_client.py +208 -0
- dstack/_internal/core/backends/cloudrift/backend.py +16 -0
- dstack/_internal/core/backends/cloudrift/compute.py +138 -0
- dstack/_internal/core/backends/cloudrift/configurator.py +66 -0
- dstack/_internal/core/backends/cloudrift/models.py +40 -0
- dstack/_internal/core/backends/configurators.py +9 -0
- dstack/_internal/core/backends/models.py +7 -0
- dstack/_internal/core/compatibility/logs.py +15 -0
- dstack/_internal/core/compatibility/runs.py +31 -2
- dstack/_internal/core/models/backends/base.py +2 -0
- dstack/_internal/core/models/configurations.py +33 -2
- dstack/_internal/core/models/files.py +67 -0
- dstack/_internal/core/models/logs.py +2 -1
- dstack/_internal/core/models/runs.py +24 -1
- dstack/_internal/core/models/secrets.py +9 -2
- dstack/_internal/server/app.py +2 -0
- dstack/_internal/server/background/tasks/process_fleets.py +1 -1
- dstack/_internal/server/background/tasks/process_gateways.py +1 -1
- dstack/_internal/server/background/tasks/process_instances.py +1 -1
- dstack/_internal/server/background/tasks/process_placement_groups.py +1 -1
- dstack/_internal/server/background/tasks/process_running_jobs.py +110 -13
- dstack/_internal/server/background/tasks/process_runs.py +36 -5
- dstack/_internal/server/background/tasks/process_submitted_jobs.py +10 -4
- dstack/_internal/server/background/tasks/process_terminating_jobs.py +2 -2
- dstack/_internal/server/background/tasks/process_volumes.py +1 -1
- dstack/_internal/server/migrations/versions/5f1707c525d2_add_filearchivemodel.py +39 -0
- dstack/_internal/server/migrations/versions/644b8a114187_add_secretmodel.py +49 -0
- dstack/_internal/server/models.py +33 -0
- dstack/_internal/server/routers/files.py +67 -0
- dstack/_internal/server/routers/gateways.py +6 -3
- dstack/_internal/server/routers/projects.py +63 -0
- dstack/_internal/server/routers/prometheus.py +5 -5
- dstack/_internal/server/routers/secrets.py +57 -15
- dstack/_internal/server/schemas/files.py +5 -0
- dstack/_internal/server/schemas/logs.py +10 -1
- dstack/_internal/server/schemas/projects.py +12 -0
- dstack/_internal/server/schemas/runner.py +2 -0
- dstack/_internal/server/schemas/secrets.py +7 -11
- dstack/_internal/server/security/permissions.py +75 -2
- dstack/_internal/server/services/backends/__init__.py +1 -1
- dstack/_internal/server/services/files.py +91 -0
- dstack/_internal/server/services/fleets.py +1 -1
- dstack/_internal/server/services/gateways/__init__.py +1 -1
- dstack/_internal/server/services/jobs/__init__.py +19 -8
- dstack/_internal/server/services/jobs/configurators/base.py +27 -3
- dstack/_internal/server/services/jobs/configurators/dev.py +3 -3
- dstack/_internal/server/services/logs/aws.py +38 -38
- dstack/_internal/server/services/logs/filelog.py +48 -14
- dstack/_internal/server/services/logs/gcp.py +17 -16
- dstack/_internal/server/services/projects.py +164 -5
- dstack/_internal/server/services/prometheus/__init__.py +0 -0
- dstack/_internal/server/services/prometheus/client_metrics.py +52 -0
- dstack/_internal/server/services/proxy/repo.py +3 -0
- dstack/_internal/server/services/runner/client.py +8 -0
- dstack/_internal/server/services/runs.py +55 -10
- dstack/_internal/server/services/secrets.py +204 -0
- dstack/_internal/server/services/services/__init__.py +2 -1
- dstack/_internal/server/services/storage/base.py +21 -0
- dstack/_internal/server/services/storage/gcs.py +28 -6
- dstack/_internal/server/services/storage/s3.py +27 -9
- dstack/_internal/server/services/users.py +1 -3
- dstack/_internal/server/services/volumes.py +1 -1
- dstack/_internal/server/settings.py +2 -2
- dstack/_internal/server/statics/index.html +1 -1
- dstack/_internal/server/statics/{main-0ac1e1583684417ae4d1.js → main-d151637af20f70b2e796.js} +104 -48
- dstack/_internal/server/statics/{main-0ac1e1583684417ae4d1.js.map → main-d151637af20f70b2e796.js.map} +1 -1
- dstack/_internal/server/statics/{main-f39c418b05fe14772dd8.css → main-d48635d8fe670d53961c.css} +1 -1
- dstack/_internal/server/statics/static/media/google.b194b06fafd0a52aeb566922160ea514.svg +1 -0
- dstack/_internal/server/testing/common.py +43 -5
- dstack/_internal/settings.py +5 -0
- dstack/_internal/utils/files.py +69 -0
- dstack/_internal/utils/nested_list.py +47 -0
- dstack/_internal/utils/path.py +12 -4
- dstack/api/_public/runs.py +73 -12
- dstack/api/server/__init__.py +6 -0
- dstack/api/server/_files.py +18 -0
- dstack/api/server/_logs.py +5 -1
- dstack/api/server/_projects.py +24 -0
- dstack/api/server/_secrets.py +15 -15
- dstack/version.py +1 -1
- {dstack-0.19.15rc1.dist-info → dstack-0.19.17.dist-info}/METADATA +3 -4
- {dstack-0.19.15rc1.dist-info → dstack-0.19.17.dist-info}/RECORD +93 -71
- /dstack/_internal/server/services/{prometheus.py → prometheus/custom_metrics.py} +0 -0
- {dstack-0.19.15rc1.dist-info → dstack-0.19.17.dist-info}/WHEEL +0 -0
- {dstack-0.19.15rc1.dist-info → dstack-0.19.17.dist-info}/entry_points.txt +0 -0
- {dstack-0.19.15rc1.dist-info → dstack-0.19.17.dist-info}/licenses/LICENSE.md +0 -0
dstack/_internal/cli/commands/secrets.py
ADDED

@@ -0,0 +1,92 @@
+import argparse
+
+from dstack._internal.cli.commands import APIBaseCommand
+from dstack._internal.cli.services.completion import SecretNameCompleter
+from dstack._internal.cli.utils.common import (
+    confirm_ask,
+    console,
+)
+from dstack._internal.cli.utils.secrets import print_secrets_table
+
+
+class SecretCommand(APIBaseCommand):
+    NAME = "secret"
+    DESCRIPTION = "Manage secrets"
+
+    def _register(self):
+        super()._register()
+        self._parser.set_defaults(subfunc=self._list)
+        subparsers = self._parser.add_subparsers(dest="action")
+
+        list_parser = subparsers.add_parser(
+            "list", help="List secrets", formatter_class=self._parser.formatter_class
+        )
+        list_parser.set_defaults(subfunc=self._list)
+
+        get_parser = subparsers.add_parser(
+            "get", help="Get secret value", formatter_class=self._parser.formatter_class
+        )
+        get_parser.add_argument(
+            "name",
+            help="The name of the secret",
+        ).completer = SecretNameCompleter()
+        get_parser.set_defaults(subfunc=self._get)
+
+        set_parser = subparsers.add_parser(
+            "set", help="Set secret", formatter_class=self._parser.formatter_class
+        )
+        set_parser.add_argument(
+            "name",
+            help="The name of the secret",
+        )
+        set_parser.add_argument(
+            "value",
+            help="The value of the secret",
+        )
+        set_parser.set_defaults(subfunc=self._set)
+
+        delete_parser = subparsers.add_parser(
+            "delete",
+            help="Delete secrets",
+            formatter_class=self._parser.formatter_class,
+        )
+        delete_parser.add_argument(
+            "name",
+            help="The name of the secret",
+        ).completer = SecretNameCompleter()
+        delete_parser.add_argument(
+            "-y", "--yes", help="Don't ask for confirmation", action="store_true"
+        )
+        delete_parser.set_defaults(subfunc=self._delete)
+
+    def _command(self, args: argparse.Namespace):
+        super()._command(args)
+        args.subfunc(args)
+
+    def _list(self, args: argparse.Namespace):
+        secrets = self.api.client.secrets.list(self.api.project)
+        print_secrets_table(secrets)
+
+    def _get(self, args: argparse.Namespace):
+        secret = self.api.client.secrets.get(self.api.project, name=args.name)
+        print_secrets_table([secret])
+
+    def _set(self, args: argparse.Namespace):
+        self.api.client.secrets.create_or_update(
+            self.api.project,
+            name=args.name,
+            value=args.value,
+        )
+        console.print("[grey58]OK[/]")
+
+    def _delete(self, args: argparse.Namespace):
+        if not args.yes and not confirm_ask(f"Delete the secret [code]{args.name}[/]?"):
+            console.print("\nExiting...")
+            return
+
+        with console.status("Deleting secret..."):
+            self.api.client.secrets.delete(
+                project_name=self.api.project,
+                names=[args.name],
+            )
+            console.print("[grey58]OK[/]")
dstack/_internal/cli/main.py
CHANGED
@@ -17,6 +17,7 @@ from dstack._internal.cli.commands.metrics import MetricsCommand
 from dstack._internal.cli.commands.offer import OfferCommand
 from dstack._internal.cli.commands.project import ProjectCommand
 from dstack._internal.cli.commands.ps import PsCommand
+from dstack._internal.cli.commands.secrets import SecretCommand
 from dstack._internal.cli.commands.server import ServerCommand
 from dstack._internal.cli.commands.stats import StatsCommand
 from dstack._internal.cli.commands.stop import StopCommand
@@ -72,6 +73,7 @@ def main():
     MetricsCommand.register(subparsers)
     ProjectCommand.register(subparsers)
     PsCommand.register(subparsers)
+    SecretCommand.register(subparsers)
     ServerCommand.register(subparsers)
     StatsCommand.register(subparsers)
     StopCommand.register(subparsers)
dstack/_internal/cli/services/completion.py
CHANGED

@@ -75,6 +75,11 @@ class GatewayNameCompleter(BaseAPINameCompleter):
         return [r.name for r in api.client.gateways.list(api.project)]


+class SecretNameCompleter(BaseAPINameCompleter):
+    def fetch_resource_names(self, api: Client) -> Iterable[str]:
+        return [r.name for r in api.client.secrets.list(api.project)]
+
+
 class ProjectNameCompleter(BaseCompleter):
     """
     Completer for local project names.
dstack/_internal/cli/services/configurators/run.py
CHANGED

@@ -41,12 +41,13 @@ from dstack._internal.core.models.configurations import (
 )
 from dstack._internal.core.models.repos.base import Repo
 from dstack._internal.core.models.resources import CPUSpec
-from dstack._internal.core.models.runs import JobStatus, JobSubmission, RunStatus
+from dstack._internal.core.models.runs import JobStatus, JobSubmission, RunSpec, RunStatus
 from dstack._internal.core.services.configs import ConfigManager
 from dstack._internal.core.services.diff import diff_models
 from dstack._internal.utils.common import local_time
 from dstack._internal.utils.interpolator import InterpolatorError, VariablesInterpolator
 from dstack._internal.utils.logging import get_logger
+from dstack._internal.utils.nested_list import NestedList, NestedListItem
 from dstack.api._public.repos import get_ssh_keypair
 from dstack.api._public.runs import Run
 from dstack.api.utils import load_profile
@@ -102,25 +103,20 @@ class BaseRunConfigurator(ApplyEnvVarsConfiguratorMixin, BaseApplyConfigurator):
         confirm_message = f"Submit the run [code]{conf.name}[/]?"
         stop_run_name = None
         if run_plan.current_resource is not None:
-
-
-
-
-
-            )
-            changed_fields = list(diff.keys())
-            if run_plan.action == ApplyAction.UPDATE and len(changed_fields) > 0:
+            diff = render_run_spec_diff(
+                run_plan.get_effective_run_spec(),
+                run_plan.current_resource.run_spec,
+            )
+            if run_plan.action == ApplyAction.UPDATE and diff is not None:
                 console.print(
                     f"Active run [code]{conf.name}[/] already exists."
-                    " Detected
-                    f" {changed_fields}"
+                    f" Detected changes that [code]can[/] be updated in-place:\n{diff}"
                 )
                 confirm_message = "Update the run?"
-            elif run_plan.action == ApplyAction.UPDATE and
+            elif run_plan.action == ApplyAction.UPDATE and diff is None:
                 stop_run_name = run_plan.current_resource.run_spec.run_name
                 console.print(
-                    f"Active run [code]{conf.name}[/] already exists."
-                    " Detected no configuration changes."
+                    f"Active run [code]{conf.name}[/] already exists. Detected no changes."
                 )
                 if command_args.yes and not command_args.force:
                     console.print("Use --force to apply anyway.")
@@ -129,7 +125,8 @@ class BaseRunConfigurator(ApplyEnvVarsConfiguratorMixin, BaseApplyConfigurator):
             elif not run_plan.current_resource.status.is_finished():
                 stop_run_name = run_plan.current_resource.run_spec.run_name
                 console.print(
-                    f"Active run [code]{conf.name}[/] already exists
+                    f"Active run [code]{conf.name}[/] already exists."
+                    f" Detected changes that [error]cannot[/] be updated in-place:\n{diff}"
                 )
                 confirm_message = "Stop and override the run?"

@@ -398,9 +395,10 @@ class BaseRunConfigurator(ApplyEnvVarsConfiguratorMixin, BaseApplyConfigurator):
         else:
             has_amd_gpu = vendor == gpuhunt.AcceleratorVendor.AMD
             has_tt_gpu = vendor == gpuhunt.AcceleratorVendor.TENSTORRENT
-        if has_amd_gpu and conf.image is None:
+        # When docker=True, the system uses Docker-in-Docker image, so no custom image is required
+        if has_amd_gpu and conf.image is None and conf.docker is not True:
             raise ConfigurationError("`image` is required if `resources.gpu.vendor` is `amd`")
-        if has_tt_gpu and conf.image is None:
+        if has_tt_gpu and conf.image is None and conf.docker is not True:
             raise ConfigurationError(
                 "`image` is required if `resources.gpu.vendor` is `tenstorrent`"
             )
@@ -610,3 +608,47 @@ def _run_resubmitted(run: Run, current_job_submission: Optional[JobSubmission])
         not run.status.is_finished()
         and run._run.latest_job_submission.submitted_at > current_job_submission.submitted_at
     )
+
+
+def render_run_spec_diff(old_spec: RunSpec, new_spec: RunSpec) -> Optional[str]:
+    changed_spec_fields = list(diff_models(old_spec, new_spec))
+    if not changed_spec_fields:
+        return None
+    friendly_spec_field_names = {
+        "repo_id": "Repo ID",
+        "repo_code_hash": "Repo files",
+        "repo_data": "Repo state (branch, commit, or other)",
+        "ssh_key_pub": "Public SSH key",
+    }
+    nested_list = NestedList()
+    for spec_field in changed_spec_fields:
+        if spec_field == "merged_profile":
+            continue
+        elif spec_field == "configuration":
+            if type(old_spec.configuration) is not type(new_spec.configuration):
+                item = NestedListItem("Configuration type")
+            else:
+                item = NestedListItem(
+                    "Configuration properties:",
+                    children=[
+                        NestedListItem(field)
+                        for field in diff_models(old_spec.configuration, new_spec.configuration)
+                    ],
+                )
+        elif spec_field == "profile":
+            if type(old_spec.profile) is not type(new_spec.profile):
+                item = NestedListItem("Profile")
+            else:
+                item = NestedListItem(
+                    "Profile properties:",
+                    children=[
+                        NestedListItem(field)
+                        for field in diff_models(old_spec.profile, new_spec.profile)
+                    ],
+                )
+        elif spec_field in friendly_spec_field_names:
+            item = NestedListItem(friendly_spec_field_names[spec_field])
+        else:
+            item = NestedListItem(spec_field.replace("_", " ").capitalize())
+        nested_list.children.append(item)
+    return nested_list.render()
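render_run_spec_diff builds its output with the NestedList helper introduced in dstack/_internal/utils/nested_list.py (+47, not shown in this section). A minimal sketch of the pattern it relies on; the exact output format of render() is an assumption here:

from dstack._internal.utils.nested_list import NestedList, NestedListItem

# Build a two-level tree the way render_run_spec_diff does for changed fields.
nested_list = NestedList()
nested_list.children.append(
    NestedListItem(
        "Configuration properties:",
        children=[NestedListItem("image"), NestedListItem("env")],
    )
)
nested_list.children.append(NestedListItem("Repo files"))
print(nested_list.render())  # presumably an indented bullet list of the items above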
dstack/_internal/cli/utils/secrets.py
ADDED

@@ -0,0 +1,25 @@
+from typing import List
+
+from rich.table import Table
+
+from dstack._internal.cli.utils.common import add_row_from_dict, console
+from dstack._internal.core.models.secrets import Secret
+
+
+def print_secrets_table(secrets: List[Secret]) -> None:
+    console.print(get_secrets_table(secrets))
+    console.print()
+
+
+def get_secrets_table(secrets: List[Secret]) -> Table:
+    table = Table(box=None)
+    table.add_column("NAME", no_wrap=True)
+    table.add_column("VALUE")
+
+    for secret in secrets:
+        row = {
+            "NAME": secret.name,
+            "VALUE": secret.value or "*" * 6,
+        }
+        add_row_from_dict(table, row)
+    return table
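Note the VALUE column: when the server omits a secret's value (secret.value is None, as in list responses), the cell falls back to the six-character mask. A quick illustration; constructing Secret directly like this is only for demonstration and assumes name and value are the only required fields:

from dstack._internal.cli.utils.secrets import print_secrets_table
from dstack._internal.core.models.secrets import Secret

# A listed secret without a value is shown as ******; a fetched one prints its value.
print_secrets_table(
    [
        Secret(name="hf_token", value=None),
        Secret(name="wandb_key", value="wb-123"),  # placeholder value
    ]
)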
dstack/_internal/core/backends/__init__.py
CHANGED

@@ -9,18 +9,25 @@ from dstack._internal.core.backends.base.compute import (
 )
 from dstack._internal.core.backends.base.configurator import Configurator
 from dstack._internal.core.backends.configurators import list_available_configurator_classes
+from dstack._internal.core.backends.local.compute import LocalCompute
 from dstack._internal.core.models.backends.base import BackendType
+from dstack._internal.settings import LOCAL_BACKEND_ENABLED


 def _get_backends_with_compute_feature(
     configurator_classes: list[type[Configurator]],
     compute_feature_class: type,
 ) -> list[BackendType]:
+    backend_types_and_computes = [
+        (configurator_class.TYPE, configurator_class.BACKEND_CLASS.COMPUTE_CLASS)
+        for configurator_class in configurator_classes
+    ]
+    if LOCAL_BACKEND_ENABLED:
+        backend_types_and_computes.append((BackendType.LOCAL, LocalCompute))
     backend_types = []
-    for configurator_class in configurator_classes:
-        compute_class = configurator_class.BACKEND_CLASS.COMPUTE_CLASS
+    for backend_type, compute_class in backend_types_and_computes:
         if issubclass(compute_class, compute_feature_class):
-            backend_types.append(configurator_class.TYPE)
+            backend_types.append(backend_type)
     return backend_types


@@ -28,7 +35,6 @@ _configurator_classes = list_available_configurator_classes()


 # The following backend lists do not include unavailable backends (i.e. backends missing deps).
-# TODO: Add LocalBackend to lists if it's enabled
 BACKENDS_WITH_CREATE_INSTANCE_SUPPORT = _get_backends_with_compute_feature(
     configurator_classes=_configurator_classes,
     compute_feature_class=ComputeWithCreateInstanceSupport,
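The refactored helper keeps the same capability-detection idea: a backend advertises a feature by having its compute class inherit the corresponding mixin, and issubclass does the filtering, now over explicit (backend_type, compute_class) pairs so that BackendType.LOCAL can be appended conditionally. A self-contained sketch of the pattern with made-up classes:

class CreateInstanceSupport:  # stand-in for ComputeWithCreateInstanceSupport
    pass

class ExampleCompute(CreateInstanceSupport):
    pass

class OtherCompute:
    pass

pairs = [("example", ExampleCompute), ("other", OtherCompute)]
supported = [name for name, compute in pairs if issubclass(compute, CreateInstanceSupport)]
print(supported)  # ['example']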
dstack/_internal/core/backends/cloudrift/__init__.py
ADDED

File without changes
dstack/_internal/core/backends/cloudrift/api_client.py
ADDED

@@ -0,0 +1,208 @@
+import os
+import re
+from typing import Any, Dict, List, Mapping, Optional, Union
+
+import requests
+from packaging import version
+from requests import Response
+
+from dstack._internal.core.errors import BackendError, BackendInvalidCredentialsError
+from dstack._internal.utils.logging import get_logger
+
+logger = get_logger(__name__)
+
+
+CLOUDRIFT_SERVER_ADDRESS = "https://api.cloudrift.ai"
+CLOUDRIFT_API_VERSION = "2025-05-29"
+
+
+class RiftClient:
+    def __init__(self, api_key: Optional[str] = None):
+        self.public_api_root = os.path.join(CLOUDRIFT_SERVER_ADDRESS, "api/v1")
+        self.api_key = api_key
+
+    def validate_api_key(self) -> bool:
+        """
+        Validates the API key by making a request to the server.
+        Returns True if the API key is valid, False otherwise.
+        """
+        try:
+            response = self._make_request("auth/me")
+            if isinstance(response, dict):
+                return "email" in response
+            return False
+        except BackendInvalidCredentialsError:
+            return False
+        except Exception as e:
+            logger.error(f"Error validating API key: {e}")
+            return False
+
+    def get_instance_types(self) -> List[Dict]:
+        request_data = {"selector": {"ByServiceAndLocation": {"services": ["vm"]}}}
+        response_data = self._make_request("instance-types/list", request_data)
+        if isinstance(response_data, dict):
+            return response_data.get("instance_types", [])
+        return []
+
+    def list_recipes(self) -> List[Dict]:
+        request_data = {}
+        response_data = self._make_request("recipes/list", request_data)
+        if isinstance(response_data, dict):
+            return response_data.get("groups", [])
+        return []
+
+    def get_vm_recipies(self) -> List[Dict]:
+        """
+        Retrieves a list of VM recipes from the CloudRift API.
+        Returns a list of dictionaries containing recipe information.
+        """
+        recipe_group = self.list_recipes()
+        vm_recipes = []
+        for group in recipe_group:
+            tags = group.get("tags", [])
+            has_vm = "vm" in map(str.lower, tags)
+            if group.get("name", "").lower() != "linux" or not has_vm:
+                continue
+
+            recipes = group.get("recipes", [])
+            for recipe in recipes:
+                details = recipe.get("details", {})
+                if details.get("VirtualMachine", False):
+                    vm_recipes.append(recipe)
+
+        return vm_recipes
+
+    def get_vm_image_url(self) -> Optional[str]:
+        recipes = self.get_vm_recipies()
+        ubuntu_images = []
+        for recipe in recipes:
+            has_nvidia_driver = "nvidia-driver" in recipe.get("tags", [])
+            if not has_nvidia_driver:
+                continue
+
+            recipe_name = recipe.get("name", "")
+            if "Ubuntu" not in recipe_name:
+                continue
+
+            url = recipe["details"].get("VirtualMachine", {}).get("image_url", None)
+            version_match = re.search(r".* (\d+\.\d+)", recipe_name)
+            if url and version_match and version_match.group(1):
+                ubuntu_version = version.parse(version_match.group(1))
+                ubuntu_images.append((ubuntu_version, url))
+
+        ubuntu_images.sort(key=lambda x: x[0])  # Sort by version
+        if ubuntu_images:
+            return ubuntu_images[-1][1]
+
+        return None
+
+    def deploy_instance(
+        self, instance_type: str, region: str, ssh_keys: List[str], cmd: str
+    ) -> List[str]:
+        image_url = self.get_vm_image_url()
+        if not image_url:
+            raise BackendError("No suitable VM image found.")
+
+        request_data = {
+            "config": {
+                "VirtualMachine": {
+                    "cloudinit_commands": cmd,
+                    "image_url": image_url,
+                    "ssh_key": {"PublicKeys": ssh_keys},
+                }
+            },
+            "selector": {
+                "ByInstanceTypeAndLocation": {
+                    "datacenters": [region],
+                    "instance_type": instance_type,
+                }
+            },
+            "with_public_ip": True,
+        }
+        logger.debug("Deploying instance with request data: %s", request_data)
+
+        response_data = self._make_request("instances/rent", request_data)
+        if isinstance(response_data, dict):
+            return response_data.get("instance_ids", [])
+        return []
+
+    def list_instances(self, instance_ids: Optional[List[str]] = None) -> List[Dict]:
+        request_data = {
+            "selector": {
+                "ByStatus": ["Initializing", "Active", "Deactivating"],
+            }
+        }
+        logger.debug("Listing instances with request data: %s", request_data)
+        response_data = self._make_request("instances/list", request_data)
+        if isinstance(response_data, dict):
+            return response_data.get("instances", [])
+
+        return []
+
+    def get_instance_by_id(self, instance_id: str) -> Optional[Dict]:
+        request_data = {"selector": {"ById": [instance_id]}}
+        logger.debug("Getting instance with request data: %s", request_data)
+        response_data = self._make_request("instances/list", request_data)
+        if isinstance(response_data, dict):
+            instances = response_data.get("instances", [])
+            if isinstance(instances, list) and len(instances) > 0:
+                return instances[0]
+
+        return None
+
+    def terminate_instance(self, instance_id: str) -> bool:
+        request_data = {"selector": {"ById": [instance_id]}}
+        logger.debug("Terminating instance with request data: %s", request_data)
+        response_data = self._make_request("instances/terminate", request_data)
+        if isinstance(response_data, dict):
+            info = response_data.get("terminated", [])
+            return len(info) > 0
+
+        return False
+
+    def _make_request(
+        self,
+        endpoint: str,
+        data: Optional[Mapping[str, Any]] = None,
+        method: str = "POST",
+        **kwargs,
+    ) -> Union[Mapping[str, Any], str, Response]:
+        headers = {}
+        if self.api_key is not None:
+            headers["X-API-Key"] = self.api_key
+
+        version = CLOUDRIFT_API_VERSION
+        full_url = f"{self.public_api_root}/{endpoint}"
+
+        try:
+            response = requests.request(
+                method,
+                full_url,
+                headers=headers,
+                json={"version": version, "data": data},
+                timeout=15,
+                **kwargs,
+            )
+
+            if not response.ok:
+                response.raise_for_status()
+            try:
+                response_json = response.json()
+                if isinstance(response_json, str):
+                    return response_json
+                if version is not None and version < response_json["version"]:
+                    logger.warning(
+                        "The API version %s is lower than the server version %s. ",
+                        version,
+                        response_json["version"],
+                    )
+                return response_json["data"]
+            except requests.exceptions.JSONDecodeError:
+                return response
+        except requests.HTTPError as e:
+            if e.response is not None and e.response.status_code in (
+                requests.codes.forbidden,
+                requests.codes.unauthorized,
+            ):
+                raise BackendInvalidCredentialsError(e.response.text)
+            raise
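Every RiftClient call goes through _make_request, which wraps the payload in a versioned envelope ({"version": ..., "data": ...}), unwraps "data" from the response, and converts 401/403 responses into BackendInvalidCredentialsError. A minimal usage sketch based on the methods above (the API key is a placeholder):

from dstack._internal.core.backends.cloudrift.api_client import RiftClient

client = RiftClient(api_key="cr-...")  # placeholder credential
if not client.validate_api_key():
    raise SystemExit("Invalid CloudRift API key")

# Each instance type lists variants with per-datacenter availability counts,
# which CloudRiftCompute later joins against catalog offers.
for instance_type in client.get_instance_types():
    for variant in instance_type.get("variants", []):
        print(variant.get("name"), variant.get("available_nodes_per_dc"))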
dstack/_internal/core/backends/cloudrift/backend.py
ADDED

@@ -0,0 +1,16 @@
+from dstack._internal.core.backends.base.backend import Backend
+from dstack._internal.core.backends.cloudrift.compute import CloudRiftCompute
+from dstack._internal.core.backends.cloudrift.models import CloudRiftConfig
+from dstack._internal.core.models.backends.base import BackendType
+
+
+class CloudRiftBackend(Backend):
+    TYPE = BackendType.CLOUDRIFT
+    COMPUTE_CLASS = CloudRiftCompute
+
+    def __init__(self, config: CloudRiftConfig):
+        self.config = config
+        self._compute = CloudRiftCompute(self.config)
+
+    def compute(self) -> CloudRiftCompute:
+        return self._compute
dstack/_internal/core/backends/cloudrift/compute.py
ADDED

@@ -0,0 +1,138 @@
+from typing import Dict, List, Optional
+
+from dstack._internal.core.backends.base.backend import Compute
+from dstack._internal.core.backends.base.compute import (
+    ComputeWithCreateInstanceSupport,
+    get_shim_commands,
+)
+from dstack._internal.core.backends.base.offers import get_catalog_offers
+from dstack._internal.core.backends.cloudrift.api_client import RiftClient
+from dstack._internal.core.backends.cloudrift.models import CloudRiftConfig
+from dstack._internal.core.errors import ComputeError
+from dstack._internal.core.models.backends.base import BackendType
+from dstack._internal.core.models.instances import (
+    InstanceAvailability,
+    InstanceConfiguration,
+    InstanceOffer,
+    InstanceOfferWithAvailability,
+)
+from dstack._internal.core.models.placement import PlacementGroup
+from dstack._internal.core.models.runs import JobProvisioningData, Requirements
+from dstack._internal.utils.logging import get_logger
+
+logger = get_logger(__name__)
+
+
+class CloudRiftCompute(
+    ComputeWithCreateInstanceSupport,
+    Compute,
+):
+    def __init__(self, config: CloudRiftConfig):
+        super().__init__()
+        self.config = config
+        self.client = RiftClient(self.config.creds.api_key)
+
+    def get_offers(
+        self, requirements: Optional[Requirements] = None
+    ) -> List[InstanceOfferWithAvailability]:
+        offers = get_catalog_offers(
+            backend=BackendType.CLOUDRIFT,
+            locations=self.config.regions or None,
+            requirements=requirements,
+        )
+
+        offers_with_availabilities = self._get_offers_with_availability(offers)
+        return offers_with_availabilities
+
+    def _get_offers_with_availability(
+        self, offers: List[InstanceOffer]
+    ) -> List[InstanceOfferWithAvailability]:
+        instance_types_with_availabilities: List[Dict] = self.client.get_instance_types()
+
+        region_availabilities = {}
+        for instance_type in instance_types_with_availabilities:
+            for variant in instance_type["variants"]:
+                for dc, count in variant["available_nodes_per_dc"].items():
+                    if count > 0:
+                        key = (variant["name"], dc)
+                        region_availabilities[key] = InstanceAvailability.AVAILABLE
+
+        availability_offers = []
+        for offer in offers:
+            key = (offer.instance.name, offer.region)
+            availability = region_availabilities.get(key, InstanceAvailability.NOT_AVAILABLE)
+            availability_offers.append(
+                InstanceOfferWithAvailability(**offer.dict(), availability=availability)
+            )
+
+        return availability_offers
+
+    def create_instance(
+        self,
+        instance_offer: InstanceOfferWithAvailability,
+        instance_config: InstanceConfiguration,
+        placement_group: Optional[PlacementGroup],
+    ) -> JobProvisioningData:
+        commands = get_shim_commands(authorized_keys=instance_config.get_public_keys())
+        startup_script = " ".join([" && ".join(commands)])
+        logger.debug(
+            f"Creating instance for offer {instance_offer.instance.name} in region {instance_offer.region} with commands: {startup_script}"
+        )
+
+        instance_ids = self.client.deploy_instance(
+            instance_type=instance_offer.instance.name,
+            region=instance_offer.region,
+            ssh_keys=instance_config.get_public_keys(),
+            cmd=startup_script,
+        )
+
+        if len(instance_ids) == 0:
+            raise ComputeError(
+                f"Failed to create instance for offer {instance_offer.instance.name} in region {instance_offer.region}."
+            )
+
+        return JobProvisioningData(
+            backend=instance_offer.backend,
+            instance_type=instance_offer.instance,
+            instance_id=instance_ids[0],
+            hostname=None,
+            internal_ip=None,
+            region=instance_offer.region,
+            price=instance_offer.price,
+            username="riftuser",
+            ssh_port=22,
+            dockerized=True,
+            ssh_proxy=None,
+            backend_data=None,
+        )
+
+    def update_provisioning_data(
+        self,
+        provisioning_data: JobProvisioningData,
+        project_ssh_public_key: str,
+        project_ssh_private_key: str,
+    ):
+        instance_info = self.client.get_instance_by_id(provisioning_data.instance_id)
+
+        if not instance_info:
+            return
+
+        instance_mode = instance_info.get("node_mode", "")
+
+        if not instance_mode or instance_mode != "VirtualMachine":
+            return
+
+        vms = instance_info.get("virtual_machines", [])
+        if len(vms) == 0:
+            return
+
+        vm_ready = vms[0].get("ready", False)
+        if vm_ready:
+            provisioning_data.hostname = instance_info.get("host_address", None)
+
+    def terminate_instance(
+        self, instance_id: str, region: str, backend_data: Optional[str] = None
+    ):
+        terminated = self.client.terminate_instance(instance_id=instance_id)
+        if not terminated:
+            raise ComputeError(f"Failed to terminate instance {instance_id} in region {region}.")