dstack 0.19.15__py3-none-any.whl → 0.19.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dstack has been flagged as potentially problematic. Click here for more details.
- dstack/_internal/core/backends/cloudrift/__init__.py +0 -0
- dstack/_internal/core/backends/cloudrift/api_client.py +208 -0
- dstack/_internal/core/backends/cloudrift/backend.py +16 -0
- dstack/_internal/core/backends/cloudrift/compute.py +138 -0
- dstack/_internal/core/backends/cloudrift/configurator.py +66 -0
- dstack/_internal/core/backends/cloudrift/models.py +40 -0
- dstack/_internal/core/backends/configurators.py +9 -0
- dstack/_internal/core/backends/models.py +7 -0
- dstack/_internal/core/compatibility/logs.py +15 -0
- dstack/_internal/core/compatibility/runs.py +2 -0
- dstack/_internal/core/models/backends/base.py +2 -0
- dstack/_internal/core/models/configurations.py +22 -2
- dstack/_internal/core/models/logs.py +2 -1
- dstack/_internal/core/models/runs.py +10 -1
- dstack/_internal/server/background/tasks/process_fleets.py +1 -1
- dstack/_internal/server/background/tasks/process_gateways.py +1 -1
- dstack/_internal/server/background/tasks/process_instances.py +1 -1
- dstack/_internal/server/background/tasks/process_placement_groups.py +1 -1
- dstack/_internal/server/background/tasks/process_running_jobs.py +1 -1
- dstack/_internal/server/background/tasks/process_runs.py +21 -2
- dstack/_internal/server/background/tasks/process_submitted_jobs.py +10 -4
- dstack/_internal/server/background/tasks/process_terminating_jobs.py +2 -2
- dstack/_internal/server/background/tasks/process_volumes.py +1 -1
- dstack/_internal/server/routers/gateways.py +6 -3
- dstack/_internal/server/routers/projects.py +63 -0
- dstack/_internal/server/routers/prometheus.py +5 -5
- dstack/_internal/server/schemas/logs.py +10 -1
- dstack/_internal/server/schemas/projects.py +12 -0
- dstack/_internal/server/security/permissions.py +75 -2
- dstack/_internal/server/services/fleets.py +1 -1
- dstack/_internal/server/services/gateways/__init__.py +1 -1
- dstack/_internal/server/services/jobs/configurators/base.py +7 -1
- dstack/_internal/server/services/logs/aws.py +38 -38
- dstack/_internal/server/services/logs/filelog.py +48 -14
- dstack/_internal/server/services/logs/gcp.py +17 -16
- dstack/_internal/server/services/projects.py +164 -5
- dstack/_internal/server/services/prometheus/__init__.py +0 -0
- dstack/_internal/server/services/prometheus/client_metrics.py +52 -0
- dstack/_internal/server/services/runs.py +3 -3
- dstack/_internal/server/services/services/__init__.py +2 -1
- dstack/_internal/server/services/users.py +1 -3
- dstack/_internal/server/services/volumes.py +1 -1
- dstack/_internal/server/statics/index.html +1 -1
- dstack/_internal/server/statics/{main-0ac1e1583684417ae4d1.js → main-a4eafa74304e587d037c.js} +51 -43
- dstack/_internal/server/statics/{main-0ac1e1583684417ae4d1.js.map → main-a4eafa74304e587d037c.js.map} +1 -1
- dstack/_internal/server/statics/{main-f39c418b05fe14772dd8.css → main-f53d6d0d42f8d61df1de.css} +1 -1
- dstack/_internal/settings.py +1 -0
- dstack/api/_public/runs.py +6 -5
- dstack/api/server/_logs.py +5 -1
- dstack/api/server/_projects.py +24 -0
- dstack/version.py +1 -1
- {dstack-0.19.15.dist-info → dstack-0.19.16.dist-info}/METADATA +1 -1
- {dstack-0.19.15.dist-info → dstack-0.19.16.dist-info}/RECORD +57 -48
- /dstack/_internal/server/services/{prometheus.py → prometheus/custom_metrics.py} +0 -0
- {dstack-0.19.15.dist-info → dstack-0.19.16.dist-info}/WHEEL +0 -0
- {dstack-0.19.15.dist-info → dstack-0.19.16.dist-info}/entry_points.txt +0 -0
- {dstack-0.19.15.dist-info → dstack-0.19.16.dist-info}/licenses/LICENSE.md +0 -0
|
File without changes
|
|
@@ -0,0 +1,208 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import re
|
|
3
|
+
from typing import Any, Dict, List, Mapping, Optional, Union
|
|
4
|
+
|
|
5
|
+
import requests
|
|
6
|
+
from packaging import version
|
|
7
|
+
from requests import Response
|
|
8
|
+
|
|
9
|
+
from dstack._internal.core.errors import BackendError, BackendInvalidCredentialsError
|
|
10
|
+
from dstack._internal.utils.logging import get_logger
|
|
11
|
+
|
|
12
|
+
logger = get_logger(__name__)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
CLOUDRIFT_SERVER_ADDRESS = "https://api.cloudrift.ai"
CLOUDRIFT_API_VERSION = "2025-05-29"


class RiftClient:
    """Minimal HTTP client for the CloudRift public API.

    Every call is a JSON request to ``<server>/api/v1/<endpoint>`` wrapped in
    a versioned envelope ``{"version": ..., "data": ...}``. Responses unwrap
    the same envelope and return its ``data`` member.
    """

    def __init__(self, api_key: Optional[str] = None):
        # Bug fix: the URL was previously built with os.path.join, which uses
        # the OS path separator ("\" on Windows) and would produce an invalid
        # URL there. Plain string formatting is always correct for URLs.
        self.public_api_root = f"{CLOUDRIFT_SERVER_ADDRESS}/api/v1"
        self.api_key = api_key

    def validate_api_key(self) -> bool:
        """
        Validates the API key by making a request to the server.
        Returns True if the API key is valid, False otherwise.
        """
        try:
            response = self._make_request("auth/me")
            # A valid key yields the account profile, which includes "email".
            if isinstance(response, dict):
                return "email" in response
            return False
        except BackendInvalidCredentialsError:
            return False
        except Exception as e:
            logger.error(f"Error validating API key: {e}")
            return False

    def get_instance_types(self) -> List[Dict]:
        """Return instance types that offer the "vm" service."""
        request_data = {"selector": {"ByServiceAndLocation": {"services": ["vm"]}}}
        response_data = self._make_request("instance-types/list", request_data)
        if isinstance(response_data, dict):
            return response_data.get("instance_types", [])
        return []

    def list_recipes(self) -> List[Dict]:
        """Return all recipe groups known to the server."""
        request_data = {}
        response_data = self._make_request("recipes/list", request_data)
        if isinstance(response_data, dict):
            return response_data.get("groups", [])
        return []

    def get_vm_recipes(self) -> List[Dict]:
        """
        Retrieves a list of VM recipes from the CloudRift API.
        Returns a list of dictionaries containing recipe information.
        """
        recipe_group = self.list_recipes()
        vm_recipes = []
        for group in recipe_group:
            tags = group.get("tags", [])
            has_vm = "vm" in map(str.lower, tags)
            # Only Linux groups tagged as VM-capable are relevant here.
            if group.get("name", "").lower() != "linux" or not has_vm:
                continue

            recipes = group.get("recipes", [])
            for recipe in recipes:
                details = recipe.get("details", {})
                if details.get("VirtualMachine", False):
                    vm_recipes.append(recipe)

        return vm_recipes

    # Backward-compatible alias for the original (misspelled) method name.
    get_vm_recipies = get_vm_recipes

    def get_vm_image_url(self) -> Optional[str]:
        """Return the image URL of the newest Ubuntu VM recipe that ships the
        NVIDIA driver, or None when no suitable recipe exists."""
        recipes = self.get_vm_recipes()
        ubuntu_images = []
        for recipe in recipes:
            has_nvidia_driver = "nvidia-driver" in recipe.get("tags", [])
            if not has_nvidia_driver:
                continue

            recipe_name = recipe.get("name", "")
            if "Ubuntu" not in recipe_name:
                continue

            url = recipe["details"].get("VirtualMachine", {}).get("image_url", None)
            # Recipe names presumably look like "Ubuntu 22.04 ..."; pull out
            # the release number for version-aware sorting.
            version_match = re.search(r".* (\d+\.\d+)", recipe_name)
            if url and version_match:
                ubuntu_version = version.parse(version_match.group(1))
                ubuntu_images.append((ubuntu_version, url))

        ubuntu_images.sort(key=lambda x: x[0])  # Sort by version, ascending
        if ubuntu_images:
            return ubuntu_images[-1][1]  # Newest Ubuntu release wins

        return None

    def deploy_instance(
        self, instance_type: str, region: str, ssh_keys: List[str], cmd: str
    ) -> List[str]:
        """Rent a VM of `instance_type` in `region`, authorizing `ssh_keys`
        and running `cmd` via cloud-init.

        Returns the list of rented instance IDs (possibly empty).
        Raises BackendError when no suitable VM image can be found.
        """
        image_url = self.get_vm_image_url()
        if not image_url:
            raise BackendError("No suitable VM image found.")

        request_data = {
            "config": {
                "VirtualMachine": {
                    "cloudinit_commands": cmd,
                    "image_url": image_url,
                    "ssh_key": {"PublicKeys": ssh_keys},
                }
            },
            "selector": {
                "ByInstanceTypeAndLocation": {
                    "datacenters": [region],
                    "instance_type": instance_type,
                }
            },
            "with_public_ip": True,
        }
        logger.debug("Deploying instance with request data: %s", request_data)

        response_data = self._make_request("instances/rent", request_data)
        if isinstance(response_data, dict):
            return response_data.get("instance_ids", [])
        return []

    def list_instances(self, instance_ids: Optional[List[str]] = None) -> List[Dict]:
        """Return all instances in a non-terminal state.

        NOTE(review): the `instance_ids` parameter is currently ignored — the
        request always selects by status. Kept for interface compatibility;
        confirm intent before honoring it with a "ById" selector.
        """
        request_data = {
            "selector": {
                "ByStatus": ["Initializing", "Active", "Deactivating"],
            }
        }
        logger.debug("Listing instances with request data: %s", request_data)
        response_data = self._make_request("instances/list", request_data)
        if isinstance(response_data, dict):
            return response_data.get("instances", [])

        return []

    def get_instance_by_id(self, instance_id: str) -> Optional[Dict]:
        """Return the instance dict for `instance_id`, or None if absent."""
        request_data = {"selector": {"ById": [instance_id]}}
        logger.debug("Getting instance with request data: %s", request_data)
        response_data = self._make_request("instances/list", request_data)
        if isinstance(response_data, dict):
            instances = response_data.get("instances", [])
            if isinstance(instances, list) and len(instances) > 0:
                return instances[0]

        return None

    def terminate_instance(self, instance_id: str) -> bool:
        """Terminate `instance_id`; return True if the server reports at
        least one terminated instance."""
        request_data = {"selector": {"ById": [instance_id]}}
        logger.debug("Terminating instance with request data: %s", request_data)
        response_data = self._make_request("instances/terminate", request_data)
        if isinstance(response_data, dict):
            info = response_data.get("terminated", [])
            return len(info) > 0

        return False

    def _make_request(
        self,
        endpoint: str,
        data: Optional[Mapping[str, Any]] = None,
        method: str = "POST",
        **kwargs,
    ) -> Union[Mapping[str, Any], str, Response]:
        """Send a versioned JSON request to `endpoint` and unwrap the reply.

        Returns the envelope's "data" member (or the raw string / Response
        when the body is not the expected JSON envelope).
        Raises BackendInvalidCredentialsError on HTTP 401/403 and re-raises
        any other requests.HTTPError.
        """
        headers = {}
        if self.api_key is not None:
            headers["X-API-Key"] = self.api_key

        # Renamed from `version`: the old local shadowed the
        # `packaging.version` module imported at file level.
        api_version = CLOUDRIFT_API_VERSION
        full_url = f"{self.public_api_root}/{endpoint}"

        try:
            response = requests.request(
                method,
                full_url,
                headers=headers,
                json={"version": api_version, "data": data},
                timeout=15,
                **kwargs,
            )

            # raise_for_status() is a no-op on successful responses, so the
            # previous `if not response.ok` guard was redundant.
            response.raise_for_status()
            try:
                response_json = response.json()
                if isinstance(response_json, str):
                    return response_json
                # Versions are ISO dates ("YYYY-MM-DD"), so lexicographic
                # comparison orders them chronologically.
                if api_version is not None and api_version < response_json["version"]:
                    logger.warning(
                        "The API version %s is lower than the server version %s. ",
                        api_version,
                        response_json["version"],
                    )
                return response_json["data"]
            except requests.exceptions.JSONDecodeError:
                return response
        except requests.HTTPError as e:
            if e.response is not None and e.response.status_code in (
                requests.codes.forbidden,
                requests.codes.unauthorized,
            ):
                raise BackendInvalidCredentialsError(e.response.text)
            raise
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
from dstack._internal.core.backends.base.backend import Backend
|
|
2
|
+
from dstack._internal.core.backends.cloudrift.compute import CloudRiftCompute
|
|
3
|
+
from dstack._internal.core.backends.cloudrift.models import CloudRiftConfig
|
|
4
|
+
from dstack._internal.core.models.backends.base import BackendType
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class CloudRiftBackend(Backend):
    """dstack Backend implementation for the CloudRift provider."""

    # Registration metadata consumed by the backend/configurator machinery.
    TYPE = BackendType.CLOUDRIFT
    COMPUTE_CLASS = CloudRiftCompute

    def __init__(self, config: CloudRiftConfig):
        """Store the config and eagerly build the compute implementation."""
        self.config = config
        self._compute = CloudRiftCompute(self.config)

    def compute(self) -> CloudRiftCompute:
        """Return the CloudRiftCompute instance created in __init__."""
        return self._compute
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
from typing import Dict, List, Optional
|
|
2
|
+
|
|
3
|
+
from dstack._internal.core.backends.base.backend import Compute
|
|
4
|
+
from dstack._internal.core.backends.base.compute import (
|
|
5
|
+
ComputeWithCreateInstanceSupport,
|
|
6
|
+
get_shim_commands,
|
|
7
|
+
)
|
|
8
|
+
from dstack._internal.core.backends.base.offers import get_catalog_offers
|
|
9
|
+
from dstack._internal.core.backends.cloudrift.api_client import RiftClient
|
|
10
|
+
from dstack._internal.core.backends.cloudrift.models import CloudRiftConfig
|
|
11
|
+
from dstack._internal.core.errors import ComputeError
|
|
12
|
+
from dstack._internal.core.models.backends.base import BackendType
|
|
13
|
+
from dstack._internal.core.models.instances import (
|
|
14
|
+
InstanceAvailability,
|
|
15
|
+
InstanceConfiguration,
|
|
16
|
+
InstanceOffer,
|
|
17
|
+
InstanceOfferWithAvailability,
|
|
18
|
+
)
|
|
19
|
+
from dstack._internal.core.models.placement import PlacementGroup
|
|
20
|
+
from dstack._internal.core.models.runs import JobProvisioningData, Requirements
|
|
21
|
+
from dstack._internal.utils.logging import get_logger
|
|
22
|
+
|
|
23
|
+
logger = get_logger(__name__)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class CloudRiftCompute(
    ComputeWithCreateInstanceSupport,
    Compute,
):
    """Compute implementation for the CloudRift backend: offer discovery,
    VM provisioning, readiness polling, and termination."""

    def __init__(self, config: CloudRiftConfig):
        super().__init__()
        self.config = config
        self.client = RiftClient(self.config.creds.api_key)

    def get_offers(
        self, requirements: Optional[Requirements] = None
    ) -> List[InstanceOfferWithAvailability]:
        """Return catalog offers for the configured regions, annotated with
        live availability queried from the CloudRift API."""
        offers = get_catalog_offers(
            backend=BackendType.CLOUDRIFT,
            locations=self.config.regions or None,
            requirements=requirements,
        )

        offers_with_availabilities = self._get_offers_with_availability(offers)
        return offers_with_availabilities

    def _get_offers_with_availability(
        self, offers: List[InstanceOffer]
    ) -> List[InstanceOfferWithAvailability]:
        """Mark an offer AVAILABLE when its (variant name, datacenter) pair
        has at least one free node; NOT_AVAILABLE otherwise."""
        instance_types_with_availabilities: List[Dict] = self.client.get_instance_types()

        region_availabilities = {}
        for instance_type in instance_types_with_availabilities:
            for variant in instance_type["variants"]:
                for dc, count in variant["available_nodes_per_dc"].items():
                    if count > 0:
                        key = (variant["name"], dc)
                        region_availabilities[key] = InstanceAvailability.AVAILABLE

        availability_offers = []
        for offer in offers:
            key = (offer.instance.name, offer.region)
            availability = region_availabilities.get(key, InstanceAvailability.NOT_AVAILABLE)
            availability_offers.append(
                InstanceOfferWithAvailability(**offer.dict(), availability=availability)
            )

        return availability_offers

    def create_instance(
        self,
        instance_offer: InstanceOfferWithAvailability,
        instance_config: InstanceConfiguration,
        placement_group: Optional[PlacementGroup],
    ) -> JobProvisioningData:
        """Rent a CloudRift VM matching the offer and return its initial
        provisioning data. Raises ComputeError when the rent request yields
        no instance IDs."""
        commands = get_shim_commands(authorized_keys=instance_config.get_public_keys())
        # Fix: the old `" ".join([" && ".join(commands)])` wrapped a single
        # string in a list and space-joined it — a no-op round trip.
        startup_script = " && ".join(commands)
        # Lazy %-style args match the file's other debug calls and skip
        # message formatting unless debug logging is enabled.
        logger.debug(
            "Creating instance for offer %s in region %s with commands: %s",
            instance_offer.instance.name,
            instance_offer.region,
            startup_script,
        )

        instance_ids = self.client.deploy_instance(
            instance_type=instance_offer.instance.name,
            region=instance_offer.region,
            ssh_keys=instance_config.get_public_keys(),
            cmd=startup_script,
        )

        if len(instance_ids) == 0:
            raise ComputeError(
                f"Failed to create instance for offer {instance_offer.instance.name} in region {instance_offer.region}."
            )

        return JobProvisioningData(
            backend=instance_offer.backend,
            instance_type=instance_offer.instance,
            instance_id=instance_ids[0],
            hostname=None,  # filled in later by update_provisioning_data
            internal_ip=None,
            region=instance_offer.region,
            price=instance_offer.price,
            username="riftuser",
            ssh_port=22,
            dockerized=True,
            ssh_proxy=None,
            backend_data=None,
        )

    def update_provisioning_data(
        self,
        provisioning_data: JobProvisioningData,
        project_ssh_public_key: str,
        project_ssh_private_key: str,
    ):
        """Poll the instance and, once its first VM reports ready, publish
        the host address as the provisioning hostname. No-op until then."""
        instance_info = self.client.get_instance_by_id(provisioning_data.instance_id)

        if not instance_info:
            return

        instance_mode = instance_info.get("node_mode", "")

        # Simplified from `not instance_mode or instance_mode != ...`:
        # the inequality alone already rejects the empty/missing case.
        if instance_mode != "VirtualMachine":
            return

        vms = instance_info.get("virtual_machines", [])
        if len(vms) == 0:
            return

        vm_ready = vms[0].get("ready", False)
        if vm_ready:
            provisioning_data.hostname = instance_info.get("host_address", None)

    def terminate_instance(
        self, instance_id: str, region: str, backend_data: Optional[str] = None
    ):
        """Terminate the instance; raises ComputeError if the API does not
        confirm the termination."""
        terminated = self.client.terminate_instance(instance_id=instance_id)
        if not terminated:
            raise ComputeError(f"Failed to terminate instance {instance_id} in region {region}.")
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
import json
|
|
2
|
+
|
|
3
|
+
from dstack._internal.core.backends.base.configurator import (
|
|
4
|
+
BackendRecord,
|
|
5
|
+
Configurator,
|
|
6
|
+
raise_invalid_credentials_error,
|
|
7
|
+
)
|
|
8
|
+
from dstack._internal.core.backends.cloudrift.api_client import RiftClient
|
|
9
|
+
from dstack._internal.core.backends.cloudrift.backend import CloudRiftBackend
|
|
10
|
+
from dstack._internal.core.backends.cloudrift.models import (
|
|
11
|
+
AnyCloudRiftBackendConfig,
|
|
12
|
+
AnyCloudRiftCreds,
|
|
13
|
+
CloudRiftBackendConfig,
|
|
14
|
+
CloudRiftBackendConfigWithCreds,
|
|
15
|
+
CloudRiftConfig,
|
|
16
|
+
CloudRiftCreds,
|
|
17
|
+
CloudRiftStoredConfig,
|
|
18
|
+
)
|
|
19
|
+
from dstack._internal.core.models.backends.base import (
|
|
20
|
+
BackendType,
|
|
21
|
+
)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class CloudRiftConfigurator(Configurator):
    """Configurator for the CloudRift backend: validates credentials and
    converts between API config models and stored backend records."""

    TYPE = BackendType.CLOUDRIFT
    BACKEND_CLASS = CloudRiftBackend

    def validate_config(
        self, config: CloudRiftBackendConfigWithCreds, default_creds_enabled: bool
    ):
        """Validate a submitted config; only the credentials are checked."""
        self._validate_creds(config.creds)

    def create_backend(
        self, project_name: str, config: CloudRiftBackendConfigWithCreds
    ) -> BackendRecord:
        """Serialize the config and creds into a storable BackendRecord."""
        stored_config = CloudRiftStoredConfig(
            **CloudRiftBackendConfig.__response__.parse_obj(config).dict()
        )
        stored_creds = CloudRiftCreds.parse_obj(config.creds)
        return BackendRecord(
            config=stored_config.json(),
            auth=stored_creds.json(),
        )

    def get_backend_config(
        self, record: BackendRecord, include_creds: bool
    ) -> AnyCloudRiftBackendConfig:
        """Deserialize a record into a config model, with or without creds."""
        config = self._get_config(record)
        model = (
            CloudRiftBackendConfigWithCreds if include_creds else CloudRiftBackendConfig
        )
        return model.__response__.parse_obj(config)

    def get_backend(self, record: BackendRecord) -> CloudRiftBackend:
        """Instantiate a CloudRiftBackend from a stored record."""
        return CloudRiftBackend(config=self._get_config(record))

    def _get_config(self, record: BackendRecord) -> CloudRiftConfig:
        # Recombine the JSON-stored config with the separately stored creds.
        return CloudRiftConfig.__response__(
            **json.loads(record.config),
            creds=CloudRiftCreds.parse_raw(record.auth),
        )

    def _validate_creds(self, creds: AnyCloudRiftCreds):
        # Reject unexpected credential types first, then verify the API key
        # against the live CloudRift API.
        if not isinstance(creds, CloudRiftCreds):
            raise_invalid_credentials_error(fields=[["creds"]])
        if not RiftClient(creds.api_key).validate_api_key():
            raise_invalid_credentials_error(fields=[["creds", "api_key"]])
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
from typing import Annotated, List, Literal, Optional, Union
|
|
2
|
+
|
|
3
|
+
from pydantic import Field
|
|
4
|
+
|
|
5
|
+
from dstack._internal.core.models.common import CoreModel
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class CloudRiftAPIKeyCreds(CoreModel):
    # API-key credentials — the only credential type defined for CloudRift.
    type: Annotated[Literal["api_key"], Field(description="The type of credentials")] = "api_key"
    api_key: Annotated[str, Field(description="The API key")]


# Single creds variant today; the aliases keep naming parallel with other
# backends that declare Any*/plain creds unions.
AnyCloudRiftCreds = CloudRiftAPIKeyCreds
CloudRiftCreds = AnyCloudRiftCreds


class CloudRiftBackendConfig(CoreModel):
    # Credential-less backend configuration (e.g. for API responses).
    type: Annotated[
        Literal["cloudrift"],
        Field(description="The type of backend"),
    ] = "cloudrift"
    regions: Annotated[
        Optional[List[str]],
        Field(description="The list of CloudRift regions. Omit to use all regions"),
    ] = None


class CloudRiftBackendConfigWithCreds(CloudRiftBackendConfig):
    # The same configuration plus credentials, used on create/update.
    creds: Annotated[AnyCloudRiftCreds, Field(description="The credentials")]


AnyCloudRiftBackendConfig = Union[CloudRiftBackendConfig, CloudRiftBackendConfigWithCreds]


class CloudRiftStoredConfig(CloudRiftBackendConfig):
    # Shape of the config persisted in the backend record
    # (credentials are stored separately).
    pass


class CloudRiftConfig(CloudRiftStoredConfig):
    # Full runtime configuration: stored config plus credentials.
    creds: AnyCloudRiftCreds
|
@@ -20,6 +20,15 @@ try:
|
|
|
20
20
|
except ImportError:
|
|
21
21
|
pass
|
|
22
22
|
|
|
23
|
+
try:
|
|
24
|
+
from dstack._internal.core.backends.cloudrift.configurator import (
|
|
25
|
+
CloudRiftConfigurator,
|
|
26
|
+
)
|
|
27
|
+
|
|
28
|
+
_CONFIGURATOR_CLASSES.append(CloudRiftConfigurator)
|
|
29
|
+
except ImportError:
|
|
30
|
+
pass
|
|
31
|
+
|
|
23
32
|
try:
|
|
24
33
|
from dstack._internal.core.backends.cudo.configurator import (
|
|
25
34
|
CudoConfigurator,
|
|
@@ -8,6 +8,10 @@ from dstack._internal.core.backends.azure.models import (
|
|
|
8
8
|
AzureBackendConfig,
|
|
9
9
|
AzureBackendConfigWithCreds,
|
|
10
10
|
)
|
|
11
|
+
from dstack._internal.core.backends.cloudrift.models import (
|
|
12
|
+
CloudRiftBackendConfig,
|
|
13
|
+
CloudRiftBackendConfigWithCreds,
|
|
14
|
+
)
|
|
11
15
|
from dstack._internal.core.backends.cudo.models import (
|
|
12
16
|
CudoBackendConfig,
|
|
13
17
|
CudoBackendConfigWithCreds,
|
|
@@ -65,6 +69,7 @@ from dstack._internal.core.models.common import CoreModel
|
|
|
65
69
|
AnyBackendConfigWithoutCreds = Union[
|
|
66
70
|
AWSBackendConfig,
|
|
67
71
|
AzureBackendConfig,
|
|
72
|
+
CloudRiftBackendConfig,
|
|
68
73
|
CudoBackendConfig,
|
|
69
74
|
DataCrunchBackendConfig,
|
|
70
75
|
GCPBackendConfig,
|
|
@@ -86,6 +91,7 @@ AnyBackendConfigWithoutCreds = Union[
|
|
|
86
91
|
AnyBackendConfigWithCreds = Union[
|
|
87
92
|
AWSBackendConfigWithCreds,
|
|
88
93
|
AzureBackendConfigWithCreds,
|
|
94
|
+
CloudRiftBackendConfigWithCreds,
|
|
89
95
|
CudoBackendConfigWithCreds,
|
|
90
96
|
DataCrunchBackendConfigWithCreds,
|
|
91
97
|
GCPBackendConfigWithCreds,
|
|
@@ -106,6 +112,7 @@ AnyBackendConfigWithCreds = Union[
|
|
|
106
112
|
AnyBackendFileConfigWithCreds = Union[
|
|
107
113
|
AWSBackendConfigWithCreds,
|
|
108
114
|
AzureBackendConfigWithCreds,
|
|
115
|
+
CloudRiftBackendConfigWithCreds,
|
|
109
116
|
CudoBackendConfigWithCreds,
|
|
110
117
|
DataCrunchBackendConfigWithCreds,
|
|
111
118
|
GCPBackendFileConfigWithCreds,
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
from typing import Dict, Optional
|
|
2
|
+
|
|
3
|
+
from dstack._internal.server.schemas.logs import PollLogsRequest
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def get_poll_logs_excludes(request: PollLogsRequest) -> Optional[Dict]:
    """
    Returns exclude mapping to exclude certain fields from the request.
    Use this method to exclude new fields when they are not set to keep
    clients backward-compatibility with older servers.
    """
    # Older servers don't know about `next_token`; omit it from the
    # serialized request whenever it is unset.
    if request.next_token is None:
        return {"next_token": True}
    return None
|
|
@@ -97,6 +97,8 @@ def get_run_spec_excludes(run_spec: RunSpec) -> Optional[Dict]:
|
|
|
97
97
|
configuration_excludes["rate_limits"] = True
|
|
98
98
|
if configuration.shell is None:
|
|
99
99
|
configuration_excludes["shell"] = True
|
|
100
|
+
if configuration.docker is None:
|
|
101
|
+
configuration_excludes["docker"] = True
|
|
100
102
|
if configuration.priority is None:
|
|
101
103
|
configuration_excludes["priority"] = True
|
|
102
104
|
if configuration.startup_order is None:
|
|
@@ -6,6 +6,7 @@ class BackendType(str, enum.Enum):
|
|
|
6
6
|
Attributes:
|
|
7
7
|
AWS (BackendType): Amazon Web Services
|
|
8
8
|
AZURE (BackendType): Microsoft Azure
|
|
9
|
+
CLOUDRIFT (BackendType): CloudRift
|
|
9
10
|
CUDO (BackendType): Cudo
|
|
10
11
|
DSTACK (BackendType): dstack Sky
|
|
11
12
|
GCP (BackendType): Google Cloud Platform
|
|
@@ -22,6 +23,7 @@ class BackendType(str, enum.Enum):
|
|
|
22
23
|
|
|
23
24
|
AWS = "aws"
|
|
24
25
|
AZURE = "azure"
|
|
26
|
+
CLOUDRIFT = "cloudrift"
|
|
25
27
|
CUDO = "cudo"
|
|
26
28
|
DATACRUNCH = "datacrunch"
|
|
27
29
|
DSTACK = "dstack"
|
|
@@ -194,12 +194,14 @@ class BaseRunConfiguration(CoreModel):
|
|
|
194
194
|
] = None
|
|
195
195
|
python: Annotated[
|
|
196
196
|
Optional[PythonVersion],
|
|
197
|
-
Field(
|
|
197
|
+
Field(
|
|
198
|
+
description="The major version of Python. Mutually exclusive with `image` and `docker`"
|
|
199
|
+
),
|
|
198
200
|
] = None
|
|
199
201
|
nvcc: Annotated[
|
|
200
202
|
Optional[bool],
|
|
201
203
|
Field(
|
|
202
|
-
description="Use image with NVIDIA CUDA Compiler (NVCC) included. Mutually exclusive with `image`"
|
|
204
|
+
description="Use image with NVIDIA CUDA Compiler (NVCC) included. Mutually exclusive with `image` and `docker`"
|
|
203
205
|
),
|
|
204
206
|
] = None
|
|
205
207
|
single_branch: Annotated[
|
|
@@ -244,6 +246,12 @@ class BaseRunConfiguration(CoreModel):
|
|
|
244
246
|
volumes: Annotated[
|
|
245
247
|
List[Union[MountPoint, str]], Field(description="The volumes mount points")
|
|
246
248
|
] = []
|
|
249
|
+
docker: Annotated[
|
|
250
|
+
Optional[bool],
|
|
251
|
+
Field(
|
|
252
|
+
description="Use Docker inside the container. Mutually exclusive with `image`, `python`, and `nvcc`. Overrides `privileged`"
|
|
253
|
+
),
|
|
254
|
+
] = None
|
|
247
255
|
# deprecated since 0.18.31; task, service -- no effect; dev-environment -- executed right before `init`
|
|
248
256
|
setup: CommandsList = []
|
|
249
257
|
|
|
@@ -259,6 +267,18 @@ class BaseRunConfiguration(CoreModel):
|
|
|
259
267
|
return PythonVersion(v)
|
|
260
268
|
return v
|
|
261
269
|
|
|
270
|
+
@validator("docker", pre=True, always=True)
|
|
271
|
+
def _docker(cls, v, values) -> Optional[bool]:
|
|
272
|
+
if v is True and values.get("image"):
|
|
273
|
+
raise KeyError("`image` and `docker` are mutually exclusive fields")
|
|
274
|
+
if v is True and values.get("python"):
|
|
275
|
+
raise KeyError("`python` and `docker` are mutually exclusive fields")
|
|
276
|
+
if v is True and values.get("nvcc"):
|
|
277
|
+
raise KeyError("`nvcc` and `docker` are mutually exclusive fields")
|
|
278
|
+
# Ideally, we'd like to also prohibit privileged=False when docker=True,
|
|
279
|
+
# but it's not possible to do so without breaking backwards compatibility.
|
|
280
|
+
return v
|
|
281
|
+
|
|
262
282
|
@validator("volumes", each_item=True)
|
|
263
283
|
def convert_volumes(cls, v) -> MountPoint:
|
|
264
284
|
if isinstance(v, str):
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
from datetime import datetime
|
|
2
2
|
from enum import Enum
|
|
3
|
-
from typing import List
|
|
3
|
+
from typing import List, Optional
|
|
4
4
|
|
|
5
5
|
from dstack._internal.core.models.common import CoreModel
|
|
6
6
|
|
|
@@ -23,3 +23,4 @@ class LogEvent(CoreModel):
|
|
|
23
23
|
|
|
24
24
|
class JobSubmissionLogs(CoreModel):
|
|
25
25
|
logs: List[LogEvent]
|
|
26
|
+
next_token: Optional[str]
|
|
@@ -301,7 +301,7 @@ class JobSubmission(CoreModel):
|
|
|
301
301
|
job_provisioning_data: Optional[JobProvisioningData]
|
|
302
302
|
job_runtime_data: Optional[JobRuntimeData]
|
|
303
303
|
# TODO: make status_message and error a computed field after migrating to pydanticV2
|
|
304
|
-
status_message: Optional[str]
|
|
304
|
+
status_message: Optional[str] = None
|
|
305
305
|
error: Optional[str] = None
|
|
306
306
|
|
|
307
307
|
@property
|
|
@@ -548,11 +548,17 @@ class Run(CoreModel):
|
|
|
548
548
|
retry_on_events = (
|
|
549
549
|
jobs[0].job_spec.retry.on_events if jobs and jobs[0].job_spec.retry else []
|
|
550
550
|
)
|
|
551
|
+
job_status = (
|
|
552
|
+
jobs[0].job_submissions[-1].status
|
|
553
|
+
if len(jobs) == 1 and jobs[0].job_submissions
|
|
554
|
+
else None
|
|
555
|
+
)
|
|
551
556
|
termination_reason = Run.get_last_termination_reason(jobs[0]) if jobs else None
|
|
552
557
|
except KeyError:
|
|
553
558
|
return values
|
|
554
559
|
values["status_message"] = Run._get_status_message(
|
|
555
560
|
status=status,
|
|
561
|
+
job_status=job_status,
|
|
556
562
|
retry_on_events=retry_on_events,
|
|
557
563
|
termination_reason=termination_reason,
|
|
558
564
|
)
|
|
@@ -568,9 +574,12 @@ class Run(CoreModel):
|
|
|
568
574
|
@staticmethod
|
|
569
575
|
def _get_status_message(
|
|
570
576
|
status: RunStatus,
|
|
577
|
+
job_status: Optional[JobStatus],
|
|
571
578
|
retry_on_events: List[RetryEvent],
|
|
572
579
|
termination_reason: Optional[JobTerminationReason],
|
|
573
580
|
) -> str:
|
|
581
|
+
if job_status == JobStatus.PULLING:
|
|
582
|
+
return "pulling"
|
|
574
583
|
# Currently, `retrying` is shown only for `no-capacity` events
|
|
575
584
|
if (
|
|
576
585
|
status in [RunStatus.SUBMITTED, RunStatus.PENDING]
|