dstack 0.18.42__py3-none-any.whl → 0.18.43__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dstack/_internal/cli/commands/__init__.py +2 -1
- dstack/_internal/cli/commands/apply.py +4 -2
- dstack/_internal/cli/commands/attach.py +21 -1
- dstack/_internal/cli/commands/completion.py +20 -0
- dstack/_internal/cli/commands/delete.py +3 -1
- dstack/_internal/cli/commands/fleet.py +2 -1
- dstack/_internal/cli/commands/gateway.py +7 -2
- dstack/_internal/cli/commands/logs.py +3 -2
- dstack/_internal/cli/commands/stats.py +2 -1
- dstack/_internal/cli/commands/stop.py +2 -1
- dstack/_internal/cli/commands/volume.py +2 -1
- dstack/_internal/cli/main.py +6 -0
- dstack/_internal/cli/services/completion.py +86 -0
- dstack/_internal/cli/services/configurators/run.py +10 -17
- dstack/_internal/cli/utils/fleet.py +5 -1
- dstack/_internal/core/backends/aws/compute.py +22 -10
- dstack/_internal/core/backends/aws/resources.py +3 -3
- dstack/_internal/core/backends/azure/compute.py +14 -8
- dstack/_internal/core/backends/azure/resources.py +2 -0
- dstack/_internal/core/backends/base/compute.py +102 -2
- dstack/_internal/core/backends/base/offers.py +7 -1
- dstack/_internal/core/backends/cudo/compute.py +8 -4
- dstack/_internal/core/backends/datacrunch/compute.py +10 -4
- dstack/_internal/core/backends/gcp/auth.py +19 -13
- dstack/_internal/core/backends/gcp/compute.py +25 -19
- dstack/_internal/core/backends/gcp/resources.py +3 -10
- dstack/_internal/core/backends/kubernetes/compute.py +4 -3
- dstack/_internal/core/backends/lambdalabs/compute.py +9 -3
- dstack/_internal/core/backends/nebius/compute.py +2 -2
- dstack/_internal/core/backends/oci/compute.py +10 -4
- dstack/_internal/core/backends/runpod/compute.py +11 -4
- dstack/_internal/core/backends/tensordock/compute.py +14 -3
- dstack/_internal/core/backends/vastai/compute.py +12 -2
- dstack/_internal/core/backends/vultr/api_client.py +3 -3
- dstack/_internal/core/backends/vultr/compute.py +9 -3
- dstack/_internal/core/models/backends/aws.py +2 -0
- dstack/_internal/core/models/backends/base.py +1 -0
- dstack/_internal/core/models/configurations.py +0 -1
- dstack/_internal/core/services/__init__.py +5 -1
- dstack/_internal/core/services/configs/__init__.py +3 -0
- dstack/_internal/server/background/tasks/common.py +22 -0
- dstack/_internal/server/background/tasks/process_instances.py +11 -18
- dstack/_internal/server/background/tasks/process_running_jobs.py +9 -16
- dstack/_internal/server/background/tasks/process_terminating_jobs.py +1 -7
- dstack/_internal/server/routers/logs.py +3 -0
- dstack/_internal/server/services/backends/configurators/aws.py +31 -1
- dstack/_internal/server/services/backends/configurators/gcp.py +8 -15
- dstack/_internal/server/services/config.py +11 -1
- dstack/_internal/server/services/jobs/__init__.py +12 -9
- dstack/_internal/server/services/jobs/configurators/dev.py +1 -3
- dstack/_internal/server/services/jobs/configurators/task.py +1 -3
- dstack/_internal/server/services/logs/__init__.py +78 -0
- dstack/_internal/server/services/{logs.py → logs/aws.py} +12 -207
- dstack/_internal/server/services/logs/base.py +47 -0
- dstack/_internal/server/services/logs/filelog.py +110 -0
- dstack/_internal/server/services/logs/gcp.py +165 -0
- dstack/_internal/server/services/pools.py +16 -17
- dstack/_internal/server/services/proxy/routers/service_proxy.py +14 -7
- dstack/_internal/server/settings.py +3 -0
- dstack/_internal/server/statics/index.html +1 -1
- dstack/_internal/server/statics/{main-ad5150a441de98cd8987.css → main-7510e71dfa9749a4e70e.css} +1 -1
- dstack/_internal/server/statics/{main-2ac66bfcbd2e39830b88.js → main-fe8fd9db55df8d10e648.js} +66 -66
- dstack/_internal/server/statics/{main-2ac66bfcbd2e39830b88.js.map → main-fe8fd9db55df8d10e648.js.map} +1 -1
- dstack/_internal/server/testing/common.py +33 -8
- dstack/api/_public/runs.py +1 -1
- dstack/version.py +2 -2
- {dstack-0.18.42.dist-info → dstack-0.18.43.dist-info}/METADATA +4 -3
- {dstack-0.18.42.dist-info → dstack-0.18.43.dist-info}/RECORD +80 -71
- tests/_internal/core/backends/base/__init__.py +0 -0
- tests/_internal/core/backends/base/test_compute.py +56 -0
- tests/_internal/server/background/tasks/test_process_running_jobs.py +1 -1
- tests/_internal/server/conftest.py +4 -5
- tests/_internal/server/routers/test_backends.py +1 -0
- tests/_internal/server/routers/test_logs.py +1 -1
- tests/_internal/server/routers/test_runs.py +2 -2
- tests/_internal/server/services/test_logs.py +3 -3
- {dstack-0.18.42.dist-info → dstack-0.18.43.dist-info}/LICENSE.md +0 -0
- {dstack-0.18.42.dist-info → dstack-0.18.43.dist-info}/WHEEL +0 -0
- {dstack-0.18.42.dist-info → dstack-0.18.43.dist-info}/entry_points.txt +0 -0
- {dstack-0.18.42.dist-info → dstack-0.18.43.dist-info}/top_level.txt +0 -0
dstack/_internal/core/backends/base/compute.py:
@@ -1,5 +1,7 @@
 import os
+import random
 import re
+import string
 import threading
 from abc import ABC, abstractmethod
 from functools import lru_cache
@@ -31,6 +33,7 @@ from dstack._internal.core.models.volumes import (
     VolumeAttachmentData,
     VolumeProvisioningData,
 )
+from dstack._internal.core.services import is_valid_dstack_resource_name
 from dstack._internal.utils.logging import get_logger
 
 logger = get_logger(__name__)
@@ -209,8 +212,105 @@ class Compute(ABC):
         return self.get_offers(requirements)
 
 
-def get_instance_name(run: Run, job: Job) -> str:
-    return ...
+def get_job_instance_name(run: Run, job: Job) -> str:
+    return job.job_spec.job_name
+
+
+_DEFAULT_MAX_RESOURCE_NAME_LEN = 60
+_CLOUD_RESOURCE_SUFFIX_LEN = 8
+
+
+def generate_unique_instance_name(
+    instance_configuration: InstanceConfiguration,
+    max_length: int = _DEFAULT_MAX_RESOURCE_NAME_LEN,
+) -> str:
+    """
+    Generates a unique instance name valid across all backends.
+    """
+    return generate_unique_backend_name(
+        resource_name=instance_configuration.instance_name,
+        project_name=instance_configuration.project_name,
+        max_length=max_length,
+    )
+
+
+def generate_unique_instance_name_for_job(
+    run: Run,
+    job: Job,
+    max_length: int = _DEFAULT_MAX_RESOURCE_NAME_LEN,
+) -> str:
+    """
+    Generates a unique instance name for a job valid across all backends.
+    """
+    return generate_unique_backend_name(
+        resource_name=get_job_instance_name(run, job),
+        project_name=run.project_name,
+        max_length=max_length,
+    )
+
+
+def generate_unique_gateway_instance_name(
+    gateway_compute_configuration: GatewayComputeConfiguration,
+    max_length: int = _DEFAULT_MAX_RESOURCE_NAME_LEN,
+) -> str:
+    """
+    Generates a unique gateway instance name valid across all backends.
+    """
+    return generate_unique_backend_name(
+        resource_name=gateway_compute_configuration.instance_name,
+        project_name=gateway_compute_configuration.project_name,
+        max_length=max_length,
+    )
+
+
+def generate_unique_volume_name(
+    volume: Volume,
+    max_length: int = _DEFAULT_MAX_RESOURCE_NAME_LEN,
+) -> str:
+    """
+    Generates a unique volume name valid across all backends.
+    """
+    return generate_unique_backend_name(
+        resource_name=volume.name,
+        project_name=volume.project_name,
+        max_length=max_length,
+    )
+
+
+def generate_unique_backend_name(
+    resource_name: str,
+    project_name: Optional[str],
+    max_length: int,
+) -> str:
+    """
+    Generates a unique resource name valid across all backends.
+    Backend resource names must be unique on every provisioning so that
+    resource re-submission/re-creation doesn't lead to conflicts
+    on backends that require unique names (e.g. Azure, GCP).
+    """
+    # resource_name is guaranteed to be valid in all backends
+    prefix = f"dstack-{resource_name}"
+    if project_name is not None and is_valid_dstack_resource_name(project_name):
+        # project_name is not guaranteed to be valid in all backends,
+        # so we add it only if it passes the validation
+        prefix = f"dstack-{project_name}-{resource_name}"
+    return _generate_unique_backend_name_with_prefix(
+        prefix=prefix,
+        max_length=max_length,
+    )
+
+
+def _generate_unique_backend_name_with_prefix(
+    prefix: str,
+    max_length: int,
+) -> str:
+    prefix_len = max_length - _CLOUD_RESOURCE_SUFFIX_LEN - 1
+    prefix = prefix[:prefix_len]
+    suffix = "".join(
+        random.choice(string.ascii_lowercase + string.digits)
+        for _ in range(_CLOUD_RESOURCE_SUFFIX_LEN)
+    )
+    return f"{prefix}-{suffix}"
 
 
 def get_cloud_config(**config) -> str:
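The helpers above always reserve 9 characters (a hyphen plus an 8-character lowercase-alphanumeric suffix) and truncate the dstack-[<project>-]<name> prefix to fit max_length. A condensed standalone sketch of the same arithmetic; unique_name here is a hypothetical stand-in for _generate_unique_backend_name_with_prefix, and the sample name is illustrative:

    import random
    import string

    SUFFIX_LEN = 8  # mirrors _CLOUD_RESOURCE_SUFFIX_LEN

    def unique_name(prefix: str, max_length: int = 60) -> str:
        # Reserve SUFFIX_LEN chars for the suffix plus 1 for the hyphen,
        # then truncate the prefix so the result never exceeds max_length.
        prefix = prefix[: max_length - SUFFIX_LEN - 1]
        suffix = "".join(random.choices(string.ascii_lowercase + string.digits, k=SUFFIX_LEN))
        return f"{prefix}-{suffix}"

    # Re-submitting the same run yields a fresh suffix, so backends that
    # require globally unique names (Azure, GCP) never see a conflict.
    print(unique_name("dstack-myproj-train-0"))  # e.g. dstack-myproj-train-0-k3f9x2ab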
dstack/_internal/core/backends/base/offers.py:
@@ -14,6 +14,12 @@ from dstack._internal.core.models.instances import (
 from dstack._internal.core.models.resources import DEFAULT_DISK, Memory, Range
 from dstack._internal.core.models.runs import Requirements
 
+# Offers not supported by all dstack versions are hidden behind one or more flags.
+# This list enables the flags that are currently supported.
+SUPPORTED_GPUHUNT_FLAGS = [
+    "oci-spot",
+]
+
 
 def get_catalog_offers(
     backend: BackendType,
@@ -110,7 +116,7 @@ def offer_to_catalog_item(offer: InstanceOffer) -> gpuhunt.CatalogItem:
 
 
 def requirements_to_query_filter(req: Optional[Requirements]) -> gpuhunt.QueryFilter:
-    q = gpuhunt.QueryFilter()
+    q = gpuhunt.QueryFilter(allowed_flags=SUPPORTED_GPUHUNT_FLAGS)
     if req is None:
         return q
 
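allowed_flags opts in to offers that gpuhunt hides behind feature flags; flags absent from the list (for example, ones introduced by future gpuhunt releases) keep their offers hidden, so older dstack versions never see offers they cannot provision. A minimal sketch of that gating rule, assuming each catalog item carries a list of gating flags (a simplification, not gpuhunt's exact data model):

    from typing import List, Optional

    def offer_visible(item_flags: List[str], allowed_flags: Optional[List[str]]) -> bool:
        # An item gated behind flags is visible only if every flag is allowed.
        if allowed_flags is None:
            return not item_flags  # default: only unflagged items pass
        return all(flag in allowed_flags for flag in item_flags)

    assert offer_visible([], ["oci-spot"])                   # regular offer
    assert offer_visible(["oci-spot"], ["oci-spot"])         # newly enabled
    assert not offer_visible(["future-flag"], ["oci-spot"])  # still hidden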
dstack/_internal/core/backends/cudo/compute.py:
@@ -4,7 +4,8 @@ import requests
 
 from dstack._internal.core.backends.base import Compute
 from dstack._internal.core.backends.base.compute import (
-    get_instance_name,
+    generate_unique_instance_name,
+    get_job_instance_name,
     get_shim_commands,
 )
 from dstack._internal.core.backends.base.offers import get_catalog_offers

@@ -25,6 +26,9 @@ from dstack._internal.utils.logging import get_logger
 logger = get_logger(__name__)
 
 
+MAX_RESOURCE_NAME_LEN = 30
+
+
 class CudoCompute(Compute):
     def __init__(self, config: CudoConfig):
         super().__init__()

@@ -58,7 +62,7 @@ class CudoCompute(Compute):
     ) -> JobProvisioningData:
         instance_config = InstanceConfiguration(
             project_name=run.project_name,
-            instance_name=get_instance_name(run, job),
+            instance_name=get_job_instance_name(run, job),
             ssh_keys=[
                 SSHKey(public=project_ssh_public_key.strip()),
             ],

@@ -71,6 +75,7 @@ class CudoCompute(Compute):
         instance_offer: InstanceOfferWithAvailability,
         instance_config: InstanceConfiguration,
     ) -> JobProvisioningData:
+        vm_id = generate_unique_instance_name(instance_config, max_length=MAX_RESOURCE_NAME_LEN)
         public_keys = instance_config.get_public_keys()
         memory_size = round(instance_offer.instance.resources.memory_mib / 1024)
         disk_size = round(instance_offer.instance.resources.disk.size_mib / 1024)

@@ -81,13 +86,12 @@ class CudoCompute(Compute):
             shim_commands if gpus_no > 0 else f"{install_docker_script()} && {shim_commands}"
         )
 
-        vm_id = f"{instance_config.instance_name}-{instance_offer.region}"
         try:
             resp_data = self.api_client.create_virtual_machine(
                 project_id=self.config.project_id,
                 boot_disk_storage_class="STORAGE_CLASS_NETWORK",
                 boot_disk_size_gib=disk_size,
-                book_disk_id=f"{...}",
+                book_disk_id=f"{vm_id}_disk_id",
                 boot_disk_image_id=_get_image_id(gpus_no > 0),
                 data_center_id=instance_offer.region,
                 gpus=gpus_no,
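Worked example of Cudo's 30-character cap: the prefix keeps 30 - 8 - 1 = 21 characters, so overlong project and instance names are truncated rather than rejected (values below are illustrative):

    prefix = "dstack-myproj-train-0-0"  # 23 chars from project + job name
    prefix = prefix[: 30 - 8 - 1]       # -> "dstack-myproj-train-0" (21 chars)
    vm_id = prefix + "-" + "a1b2c3d4"   # 30 chars total, within Cudo's limit
    assert len(vm_id) == 30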
dstack/_internal/core/backends/datacrunch/compute.py:
@@ -2,6 +2,7 @@ from typing import Dict, List, Optional
 
 from dstack._internal.core.backends.base import Compute
 from dstack._internal.core.backends.base.compute import (
+    generate_unique_instance_name,
     get_shim_commands,
 )
 from dstack._internal.core.backends.base.offers import get_catalog_offers

@@ -22,6 +23,8 @@ from dstack._internal.utils.logging import get_logger
 
 logger = get_logger("datacrunch.compute")
 
+MAX_INSTANCE_NAME_LEN = 60
+
 # Ubuntu 22.04 + CUDA 12.0 + Docker
 # from API https://datacrunch.stoplight.io/docs/datacrunch-public/c46ab45dbc508-get-all-image-types
 IMAGE_ID = "2088da25-bb0d-41cc-a191-dccae45d96fd"

@@ -78,6 +81,9 @@ class DataCrunchCompute(Compute):
         instance_offer: InstanceOfferWithAvailability,
         instance_config: InstanceConfiguration,
     ) -> JobProvisioningData:
+        instance_name = generate_unique_instance_name(
+            instance_config, max_length=MAX_INSTANCE_NAME_LEN
+        )
         public_keys = instance_config.get_public_keys()
         ssh_ids = []
         for ssh_public_key in public_keys:

@@ -106,8 +112,8 @@ class DataCrunchCompute(Compute):
             instance_type=instance_offer.instance.name,
             ssh_key_ids=ssh_ids,
             startup_script_id=startup_script_ids,
-            hostname=instance_config.instance_name,
-            description=instance_config.instance_name,
+            hostname=instance_name,
+            description=instance_name,
             image=IMAGE_ID,
             disk_size=disk_size,
             location=instance_offer.region,

@@ -119,8 +125,8 @@ class DataCrunchCompute(Compute):
             "instance_type": instance_offer.instance.name,
             "ssh_key_ids": ssh_ids,
             "startup_script_id": startup_script_ids,
-            "hostname": instance_config.instance_name,
-            "description": instance_config.instance_name,
+            "hostname": instance_name,
+            "description": instance_name,
             "image": IMAGE_ID,
             "disk_size": disk_size,
             "location": instance_offer.region,
dstack/_internal/core/backends/gcp/auth.py:
@@ -1,10 +1,11 @@
 import json
 from typing import Optional, Tuple
 
+import google.api_core.exceptions
 import google.auth
+import google.cloud.compute_v1 as compute_v1
 from google.auth.credentials import Credentials
 from google.auth.exceptions import DefaultCredentialsError
-from google.cloud import storage
 from google.oauth2 import service_account
 
 from dstack._internal.core.errors import BackendAuthError

@@ -16,13 +17,16 @@ from dstack._internal.core.models.backends.gcp import (
 from dstack._internal.core.models.common import is_core_model_instance
 
 
-def authenticate(creds: AnyGCPCreds) -> Tuple[Credentials, str]:
-    ...
+def authenticate(creds: AnyGCPCreds, project_id: Optional[str] = None) -> Tuple[Credentials, str]:
+    credentials, credentials_project_id = get_credentials(creds)
+    if project_id is None:
+        # If project_id is not specified explicitly, try using credentials' project_id.
+        # Explicit project_id takes precedence because credentials' project_id may be irrelevant.
+        # For example, with Workload Identity Federation for GKE, it's cluster project_id.
+        project_id = credentials_project_id
+    if project_id is None:
+        raise BackendAuthError("Credentials require project_id to be specified")
+    validate_credentials(credentials, project_id)
     return credentials, project_id
 
 

@@ -40,17 +44,19 @@ def get_credentials(creds: AnyGCPCreds) -> Tuple[Credentials, Optional[str]]:
     try:
         default_credentials, project_id = google.auth.default()
     except DefaultCredentialsError:
-        raise BackendAuthError()
+        raise BackendAuthError("Failed to find default credentials")
 
     return default_credentials, project_id
 
 
-def validate_credentials(credentials: Credentials):
+def validate_credentials(credentials: Credentials, project_id: str):
     try:
-        ...
+        regions_client = compute_v1.RegionsClient(credentials=credentials)
+        regions_client.list(project=project_id)
+    except google.api_core.exceptions.NotFound:
+        raise BackendAuthError(f"project_id {project_id} not found")
     except Exception:
-        raise BackendAuthError()
+        raise BackendAuthError("Insufficient permissions")
 
 
 def default_creds_available() -> bool:
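The reworked authenticate lets a backend pass its configured project_id, which now takes precedence over the project attached to the credentials, and validation switched from Cloud Storage to listing Compute Engine regions. A hedged usage sketch, assuming GCPDefaultCreds is the application-default-credentials model in dstack._internal.core.models.backends.gcp (the module shown in the hunk header above):

    import dstack._internal.core.backends.gcp.auth as auth
    from dstack._internal.core.models.backends.gcp import GCPDefaultCreds

    # The explicit project_id wins over the credentials' own project (e.g. the
    # GKE cluster project under Workload Identity Federation). BackendAuthError
    # is raised if no project_id can be resolved, the project does not exist,
    # or the credentials lack permission to list regions.
    credentials, project_id = auth.authenticate(GCPDefaultCreds(), project_id="my-gcp-project")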
dstack/_internal/core/backends/gcp/compute.py:
@@ -12,8 +12,11 @@ import dstack._internal.core.backends.gcp.auth as auth
 import dstack._internal.core.backends.gcp.resources as gcp_resources
 from dstack._internal.core.backends.base.compute import (
     Compute,
+    generate_unique_gateway_instance_name,
+    generate_unique_instance_name,
+    generate_unique_volume_name,
     get_gateway_user_data,
-    get_instance_name,
+    get_job_instance_name,
     get_shim_commands,
     get_user_data,
     merge_tags,

@@ -70,7 +73,7 @@ class GCPCompute(Compute):
     def __init__(self, config: GCPConfig):
         super().__init__()
         self.config = config
-        self.credentials, _ = auth.authenticate(config.creds)
+        self.credentials, _ = auth.authenticate(config.creds, self.config.project_id)
         self.instances_client = compute_v1.InstancesClient(credentials=self.credentials)
         self.firewalls_client = compute_v1.FirewallsClient(credentials=self.credentials)
         self.regions_client = compute_v1.RegionsClient(credentials=self.credentials)

@@ -147,17 +150,10 @@ class GCPCompute(Compute):
         instance_offer: InstanceOfferWithAvailability,
         instance_config: InstanceConfiguration,
     ) -> JobProvisioningData:
-        instance_name = instance_config.instance_name
+        instance_name = generate_unique_instance_name(
+            instance_config, max_length=gcp_resources.MAX_RESOURCE_NAME_LEN
+        )
         allocate_public_ip = self.config.allocate_public_ips
-        if not gcp_resources.is_valid_resource_name(instance_name):
-            # In a rare case the instance name is invalid in GCP,
-            # we better use a random instance name than fail provisioning.
-            instance_name = gcp_resources.generate_random_resource_name()
-            logger.warning(
-                "Invalid GCP instance name: %s. A new valid name is generated: %s",
-                instance_config.instance_name,
-                instance_name,
-            )
         authorized_keys = instance_config.get_public_keys()
 
         # get_offers always fills instance_offer.availability_zones

@@ -182,6 +178,7 @@ class GCPCompute(Compute):
         labels = {
             "owner": "dstack",
             "dstack_project": instance_config.project_name.lower(),
+            "dstack_name": instance_config.instance_name,
             "dstack_user": instance_config.user.lower(),
         }
         labels = {k: v for k, v in labels.items() if gcp_resources.is_valid_label_value(v)}

@@ -192,7 +189,7 @@ class GCPCompute(Compute):
             else False
         )
         if is_tpu:
-            instance_id = ...
+            instance_id = instance_name
             startup_script = _get_tpu_startup_script(authorized_keys)
             # GCP does not allow attaching disks while a TPU is being created,
             # so we need to attach the disks on creation.

@@ -378,7 +375,7 @@ class GCPCompute(Compute):
         # TODO: run_job is the same for vm-based backends, refactor
         instance_config = InstanceConfiguration(
             project_name=run.project_name,
-            instance_name=get_instance_name(run, job),
+            instance_name=get_job_instance_name(run, job),  # TODO: generate name
             ssh_keys=[
                 SSHKey(public=project_ssh_public_key.strip()),
             ],

@@ -421,6 +418,9 @@ class GCPCompute(Compute):
         else:
             raise ComputeResourceNotFoundError()
 
+        instance_name = generate_unique_gateway_instance_name(
+            configuration, max_length=gcp_resources.MAX_RESOURCE_NAME_LEN
+        )
         # Choose any usable subnet in a VPC.
         # Configuring a specific subnet per region is not supported yet.
         subnetwork = _get_vpc_subnet(

@@ -432,6 +432,7 @@ class GCPCompute(Compute):
         labels = {
             "owner": "dstack",
             "dstack_project": configuration.project_name.lower(),
+            "dstack_name": configuration.instance_name,
         }
         labels = {k: v for k, v in labels.items() if gcp_resources.is_valid_label_value(v)}
         labels = merge_tags(tags=labels, backend_tags=self.config.tags)

@@ -449,7 +450,7 @@ class GCPCompute(Compute):
             authorized_keys=[configuration.ssh_key_pub],
             labels=labels,
             tags=[gcp_resources.DSTACK_GATEWAY_TAG],
-            instance_name=configuration.instance_name,
+            instance_name=instance_name,
             zone=zone,
             service_account=self.config.vm_service_account,
             network=self.config.vpc_resource_name,

@@ -458,10 +459,10 @@ class GCPCompute(Compute):
         operation = self.instances_client.insert(request=request)
         gcp_resources.wait_for_extended_operation(operation, "instance creation")
         instance = self.instances_client.get(
-            project=self.config.project_id, zone=zone, instance=configuration.instance_name
+            project=self.config.project_id, zone=zone, instance=instance_name
         )
         return GatewayProvisioningData(
-            instance_id=configuration.instance_name,
+            instance_id=instance_name,
             region=configuration.region,  # used for instance termination
             availability_zone=zone,
             ip_address=instance.network_interfaces[0].access_configs[0].nat_i_p,

@@ -525,16 +526,21 @@ class GCPCompute(Compute):
         )
         zone = zones[0]
 
+        disk_name = generate_unique_volume_name(
+            volume, max_length=gcp_resources.MAX_RESOURCE_NAME_LEN
+        )
+
         labels = {
             "owner": "dstack",
             "dstack_project": volume.project_name.lower(),
+            "dstack_name": volume.name,
             "dstack_user": volume.user,
         }
         labels = {k: v for k, v in labels.items() if gcp_resources.is_valid_label_value(v)}
         labels = merge_tags(tags=labels, backend_tags=self.config.tags)
 
         disk = compute_v1.Disk()
-        disk.name = ...
+        disk.name = disk_name
         disk.size_gb = volume.configuration.size_gb
         disk.type_ = f"zones/{zone}/diskTypes/pd-balanced"
         disk.labels = labels

@@ -552,7 +558,7 @@ class GCPCompute(Compute):
         created_disk = self.disk_client.get(
             project=self.config.project_id,
             zone=zone,
-            disk=...,
+            disk=disk_name,
         )
         logger.debug("Created persistent disk for volume %s", volume.name)
         return VolumeProvisioningData(
dstack/_internal/core/backends/gcp/resources.py:
@@ -1,7 +1,5 @@
 import concurrent.futures
-import random
 import re
-import string
 from typing import Dict, List, Optional
 
 import google.api_core.exceptions

@@ -64,7 +62,7 @@ def check_vpc(
             region=region,
         )
     except google.api_core.exceptions.NotFound:
-        raise ComputeError(f"Failed to find ...")
+        raise ComputeError(f"Failed to find VPC project {vpc_project_id}")
 
     if allocate_public_ip:
         return

@@ -322,12 +320,13 @@ def _is_valid_label(key: str, value: str) -> bool:
     return is_valid_resource_name(key) and is_valid_label_value(value)
 
 
+MAX_RESOURCE_NAME_LEN = 63
 NAME_PATTERN = re.compile(r"^[a-z][_\-a-z0-9]{0,62}$")
 LABEL_VALUE_PATTERN = re.compile(r"^[_\-a-z0-9]{0,63}$")
 
 
 def is_valid_resource_name(name: str) -> bool:
-    if len(name) < 1 or len(name) > 63:
+    if len(name) < 1 or len(name) > MAX_RESOURCE_NAME_LEN:
         return False
     match = re.match(NAME_PATTERN, name)
     return match is not None

@@ -338,12 +337,6 @@ def is_valid_label_value(value: str) -> bool:
     return match is not None
 
 
-def generate_random_resource_name(length: int = 40) -> str:
-    return random.choice(string.ascii_lowercase) + "".join(
-        random.choice(string.ascii_lowercase + string.digits) for _ in range(length)
-    )
-
-
 def create_tpu_node_struct(
     instance_name: str,
     startup_script: str,
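generate_random_resource_name could be dropped because every GCP name now flows through generate_unique_backend_name: the "dstack-" prefix guarantees a lowercase first character, the suffix is lowercase alphanumeric, and MAX_RESOURCE_NAME_LEN = 63 keeps results within NAME_PATTERN. A quick property check, as a standalone sketch reusing the pattern above with an illustrative project/run name:

    import random
    import re
    import string

    NAME_PATTERN = re.compile(r"^[a-z][_\-a-z0-9]{0,62}$")  # as in resources.py

    def sample_name() -> str:
        # Mirrors generate_unique_instance_name with max_length=63.
        prefix = ("dstack-" + "some-project-some-run")[: 63 - 8 - 1]
        suffix = "".join(random.choices(string.ascii_lowercase + string.digits, k=8))
        return f"{prefix}-{suffix}"

    assert all(NAME_PATTERN.match(sample_name()) for _ in range(1000))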
dstack/_internal/core/backends/kubernetes/compute.py:
@@ -9,9 +9,10 @@ from kubernetes import client
 
 from dstack._internal.core.backends.base.compute import (
     Compute,
+    generate_unique_gateway_instance_name,
+    generate_unique_instance_name_for_job,
     get_docker_commands,
     get_dstack_gateway_commands,
-    get_instance_name,
 )
 from dstack._internal.core.backends.base.offers import match_requirements
 from dstack._internal.core.backends.kubernetes.config import KubernetesConfig

@@ -99,7 +100,7 @@ class KubernetesCompute(Compute):
         project_ssh_private_key: str,
         volumes: List[Volume],
     ) -> JobProvisioningData:
-        instance_name = get_instance_name(run, job)
+        instance_name = generate_unique_instance_name_for_job(run, job)
         commands = get_docker_commands(
             [run.run_spec.ssh_key_pub.strip(), project_ssh_public_key.strip()]
         )

@@ -231,7 +232,7 @@ class KubernetesCompute(Compute):
         # TODO: By default EKS creates a Classic Load Balancer for Load Balancer services.
         # Consider deploying an NLB. It seems it requires some extra configuration on the cluster:
         # https://docs.aws.amazon.com/eks/latest/userguide/network-load-balancing.html
-        instance_name = configuration.instance_name
+        instance_name = generate_unique_gateway_instance_name(configuration)
         commands = _get_gateway_commands(authorized_keys=[configuration.ssh_key_pub])
         self.api.create_namespaced_pod(
             namespace=DEFAULT_NAMESPACE,
dstack/_internal/core/backends/lambdalabs/compute.py:
@@ -6,7 +6,8 @@ from typing import Dict, List, Optional
 
 from dstack._internal.core.backends.base.compute import (
     Compute,
-    get_instance_name,
+    generate_unique_instance_name,
+    get_job_instance_name,
     get_shim_commands,
 )
 from dstack._internal.core.backends.base.offers import get_catalog_offers

@@ -23,6 +24,8 @@ from dstack._internal.core.models.instances import (
 from dstack._internal.core.models.runs import Job, JobProvisioningData, Requirements, Run
 from dstack._internal.core.models.volumes import Volume
 
+MAX_INSTANCE_NAME_LEN = 60
+
 
 class LambdaCompute(Compute):
     def __init__(self, config: LambdaConfig):

@@ -44,6 +47,9 @@ class LambdaCompute(Compute):
     def create_instance(
         self, instance_offer: InstanceOfferWithAvailability, instance_config: InstanceConfiguration
     ) -> JobProvisioningData:
+        instance_name = generate_unique_instance_name(
+            instance_config, max_length=MAX_INSTANCE_NAME_LEN
+        )
         project_ssh_key = instance_config.ssh_keys[0]
         project_key_name = _add_project_ssh_key(
             api_client=self.api_client,

@@ -53,7 +59,7 @@ class LambdaCompute(Compute):
             region_name=instance_offer.region,
             instance_type_name=instance_offer.instance.name,
             ssh_key_names=[project_key_name],
-            name=instance_config.instance_name,
+            name=instance_name,
             quantity=1,
             file_system_names=[],
         )

@@ -107,7 +113,7 @@ class LambdaCompute(Compute):
     ) -> JobProvisioningData:
         instance_config = InstanceConfiguration(
             project_name=run.project_name,
-            instance_name=get_instance_name(run, job),
+            instance_name=get_job_instance_name(run, job),  # TODO: generate name
             ssh_keys=[
                 SSHKey(
                     public=project_ssh_public_key.strip(), private=project_ssh_private_key.strip()
dstack/_internal/core/backends/nebius/compute.py:
@@ -6,7 +6,7 @@ from typing import List, Optional
 import dstack.version as version
 from dstack._internal import settings
 from dstack._internal.core.backends.base import Compute
-from dstack._internal.core.backends.base.compute import get_instance_name, get_user_data
+from dstack._internal.core.backends.base.compute import get_job_instance_name, get_user_data
 from dstack._internal.core.backends.base.offers import get_catalog_offers
 from dstack._internal.core.backends.nebius.api_client import NebiusAPIClient
 from dstack._internal.core.backends.nebius.config import NebiusConfig

@@ -130,7 +130,7 @@ class NebiusCompute(Compute):
     ) -> JobProvisioningData:
         instance_config = InstanceConfiguration(
             project_name=run.project_name,
-            instance_name=get_instance_name(run, job),
+            instance_name=get_job_instance_name(run, job),  # TODO: generate name
             ssh_keys=[
                 SSHKey(public=project_ssh_public_key.strip()),
             ],
dstack/_internal/core/backends/oci/compute.py:
@@ -4,7 +4,12 @@ from typing import List, Optional
 
 import oci
 
-from dstack._internal.core.backends.base.compute import Compute, get_instance_name, get_user_data
+from dstack._internal.core.backends.base.compute import (
+    Compute,
+    generate_unique_instance_name,
+    get_job_instance_name,
+    get_user_data,
+)
 from dstack._internal.core.backends.base.offers import get_catalog_offers
 from dstack._internal.core.backends.oci import resources
 from dstack._internal.core.backends.oci.config import OCIConfig

@@ -98,7 +103,7 @@ class OCICompute(Compute):
     ) -> JobProvisioningData:
         instance_config = InstanceConfiguration(
             project_name=run.project_name,
-            instance_name=get_instance_name(run, job),
+            instance_name=get_job_instance_name(run, job),
             ssh_keys=[SSHKey(public=project_ssh_public_key.strip())],
             user=run.user,
         )

@@ -148,6 +153,7 @@ class OCICompute(Compute):
         ]
         cloud_init_user_data = get_user_data(instance_config.get_public_keys(), setup_commands)
 
+        display_name = generate_unique_instance_name(instance_config)
         try:
             instance = resources.launch_instance(
                 region=region,

@@ -155,7 +161,7 @@ class OCICompute(Compute):
                 compartment_id=self.config.compartment_id,
                 subnet_id=subnet.id,
                 security_group_id=security_group.id,
-                display_name=instance_config.instance_name,
+                display_name=display_name,
                 cloud_init_user_data=cloud_init_user_data,
                 shape=instance_offer.instance.name,
                 is_spot=instance_offer.instance.resources.spot,

@@ -163,7 +169,7 @@ class OCICompute(Compute):
                 image_id=package.image_id,
             )
         except oci.exceptions.ServiceError as e:
-            if e.code in ("LimitExceeded", "QuotaExceeded"):
+            if e.code in ("LimitExceeded", "QuotaExceeded") or "Out of host capacity" in e.message:
                 raise NoCapacityError(e.message)
             raise