dstack 0.18.42__py3-none-any.whl → 0.18.43__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dstack/_internal/cli/commands/__init__.py +2 -1
- dstack/_internal/cli/commands/apply.py +4 -2
- dstack/_internal/cli/commands/attach.py +21 -1
- dstack/_internal/cli/commands/completion.py +20 -0
- dstack/_internal/cli/commands/delete.py +3 -1
- dstack/_internal/cli/commands/fleet.py +2 -1
- dstack/_internal/cli/commands/gateway.py +7 -2
- dstack/_internal/cli/commands/logs.py +3 -2
- dstack/_internal/cli/commands/stats.py +2 -1
- dstack/_internal/cli/commands/stop.py +2 -1
- dstack/_internal/cli/commands/volume.py +2 -1
- dstack/_internal/cli/main.py +6 -0
- dstack/_internal/cli/services/completion.py +86 -0
- dstack/_internal/cli/services/configurators/run.py +10 -17
- dstack/_internal/cli/utils/fleet.py +5 -1
- dstack/_internal/core/backends/aws/compute.py +22 -10
- dstack/_internal/core/backends/aws/resources.py +3 -3
- dstack/_internal/core/backends/azure/compute.py +14 -8
- dstack/_internal/core/backends/azure/resources.py +2 -0
- dstack/_internal/core/backends/base/compute.py +102 -2
- dstack/_internal/core/backends/base/offers.py +7 -1
- dstack/_internal/core/backends/cudo/compute.py +8 -4
- dstack/_internal/core/backends/datacrunch/compute.py +10 -4
- dstack/_internal/core/backends/gcp/auth.py +19 -13
- dstack/_internal/core/backends/gcp/compute.py +25 -19
- dstack/_internal/core/backends/gcp/resources.py +3 -10
- dstack/_internal/core/backends/kubernetes/compute.py +4 -3
- dstack/_internal/core/backends/lambdalabs/compute.py +9 -3
- dstack/_internal/core/backends/nebius/compute.py +2 -2
- dstack/_internal/core/backends/oci/compute.py +10 -4
- dstack/_internal/core/backends/runpod/compute.py +11 -4
- dstack/_internal/core/backends/tensordock/compute.py +14 -3
- dstack/_internal/core/backends/vastai/compute.py +12 -2
- dstack/_internal/core/backends/vultr/api_client.py +3 -3
- dstack/_internal/core/backends/vultr/compute.py +9 -3
- dstack/_internal/core/models/backends/aws.py +2 -0
- dstack/_internal/core/models/backends/base.py +1 -0
- dstack/_internal/core/models/configurations.py +0 -1
- dstack/_internal/core/services/__init__.py +5 -1
- dstack/_internal/core/services/configs/__init__.py +3 -0
- dstack/_internal/server/background/tasks/common.py +22 -0
- dstack/_internal/server/background/tasks/process_instances.py +11 -18
- dstack/_internal/server/background/tasks/process_running_jobs.py +9 -16
- dstack/_internal/server/background/tasks/process_terminating_jobs.py +1 -7
- dstack/_internal/server/routers/logs.py +3 -0
- dstack/_internal/server/services/backends/configurators/aws.py +31 -1
- dstack/_internal/server/services/backends/configurators/gcp.py +8 -15
- dstack/_internal/server/services/config.py +11 -1
- dstack/_internal/server/services/jobs/__init__.py +12 -9
- dstack/_internal/server/services/jobs/configurators/dev.py +1 -3
- dstack/_internal/server/services/jobs/configurators/task.py +1 -3
- dstack/_internal/server/services/logs/__init__.py +78 -0
- dstack/_internal/server/services/{logs.py → logs/aws.py} +12 -207
- dstack/_internal/server/services/logs/base.py +47 -0
- dstack/_internal/server/services/logs/filelog.py +110 -0
- dstack/_internal/server/services/logs/gcp.py +165 -0
- dstack/_internal/server/services/pools.py +16 -17
- dstack/_internal/server/services/proxy/routers/service_proxy.py +14 -7
- dstack/_internal/server/settings.py +3 -0
- dstack/_internal/server/statics/index.html +1 -1
- dstack/_internal/server/statics/{main-ad5150a441de98cd8987.css → main-7510e71dfa9749a4e70e.css} +1 -1
- dstack/_internal/server/statics/{main-2ac66bfcbd2e39830b88.js → main-fe8fd9db55df8d10e648.js} +66 -66
- dstack/_internal/server/statics/{main-2ac66bfcbd2e39830b88.js.map → main-fe8fd9db55df8d10e648.js.map} +1 -1
- dstack/_internal/server/testing/common.py +33 -8
- dstack/api/_public/runs.py +1 -1
- dstack/version.py +2 -2
- {dstack-0.18.42.dist-info → dstack-0.18.43.dist-info}/METADATA +4 -3
- {dstack-0.18.42.dist-info → dstack-0.18.43.dist-info}/RECORD +80 -71
- tests/_internal/core/backends/base/__init__.py +0 -0
- tests/_internal/core/backends/base/test_compute.py +56 -0
- tests/_internal/server/background/tasks/test_process_running_jobs.py +1 -1
- tests/_internal/server/conftest.py +4 -5
- tests/_internal/server/routers/test_backends.py +1 -0
- tests/_internal/server/routers/test_logs.py +1 -1
- tests/_internal/server/routers/test_runs.py +2 -2
- tests/_internal/server/services/test_logs.py +3 -3
- {dstack-0.18.42.dist-info → dstack-0.18.43.dist-info}/LICENSE.md +0 -0
- {dstack-0.18.42.dist-info → dstack-0.18.43.dist-info}/WHEEL +0 -0
- {dstack-0.18.42.dist-info → dstack-0.18.43.dist-info}/entry_points.txt +0 -0
- {dstack-0.18.42.dist-info → dstack-0.18.43.dist-info}/top_level.txt +0 -0
```diff
--- a/dstack/_internal/core/backends/runpod/compute.py
+++ b/dstack/_internal/core/backends/runpod/compute.py
@@ -5,8 +5,10 @@ from typing import List, Optional
 
 from dstack._internal.core.backends.base import Compute
 from dstack._internal.core.backends.base.compute import (
+    generate_unique_instance_name,
+    generate_unique_volume_name,
     get_docker_commands,
-
+    get_job_instance_name,
 )
 from dstack._internal.core.backends.base.offers import get_catalog_offers
 from dstack._internal.core.backends.runpod.api_client import RunpodApiClient
@@ -31,6 +33,9 @@ from dstack._internal.utils.logging import get_logger
 
 logger = get_logger(__name__)
 
+# Undocumented but names of len 60 work
+MAX_RESOURCE_NAME_LEN = 60
+
 CONTAINER_REGISTRY_AUTH_CLEANUP_INTERVAL = 60 * 60 * 24  # 24 hour
 
 
@@ -69,7 +74,7 @@ class RunpodCompute(Compute):
     ) -> JobProvisioningData:
         instance_config = InstanceConfiguration(
             project_name=run.project_name,
-            instance_name=
+            instance_name=get_job_instance_name(run, job),
             ssh_keys=[
                 SSHKey(public=run.run_spec.ssh_key_pub.strip()),
                 SSHKey(public=project_ssh_public_key.strip()),
@@ -77,6 +82,7 @@ class RunpodCompute(Compute):
             user=run.user,
         )
 
+        pod_name = generate_unique_instance_name(instance_config, max_length=MAX_RESOURCE_NAME_LEN)
         authorized_keys = instance_config.get_public_keys()
         memory_size = round(instance_offer.instance.resources.memory_mib / 1024)
         disk_size = round(instance_offer.instance.resources.disk.size_mib / 1024)
@@ -98,7 +104,7 @@ class RunpodCompute(Compute):
         bid_per_gpu = instance_offer.price / gpu_count
 
         resp = self.api_client.create_pod(
-            name=
+            name=pod_name,
             image_name=job.job_spec.image_name,
             gpu_type_id=instance_offer.instance.name,
             cloud_type="SECURE",  # ["ALL", "COMMUNITY", "SECURE"]:
@@ -197,9 +203,10 @@ class RunpodCompute(Compute):
         )
 
     def create_volume(self, volume: Volume) -> VolumeProvisioningData:
+        volume_name = generate_unique_volume_name(volume, max_length=MAX_RESOURCE_NAME_LEN)
         size_gb = volume.configuration.size_gb
         volume_id = self.api_client.create_network_volume(
-            name=
+            name=volume_name,
             region=volume.configuration.region,
             size=size_gb,
         )
```
```diff
--- a/dstack/_internal/core/backends/tensordock/compute.py
+++ b/dstack/_internal/core/backends/tensordock/compute.py
@@ -4,7 +4,11 @@ from typing import List, Optional
 import requests
 
 from dstack._internal.core.backends.base import Compute
-from dstack._internal.core.backends.base.compute import
+from dstack._internal.core.backends.base.compute import (
+    generate_unique_instance_name,
+    get_job_instance_name,
+    get_shim_commands,
+)
 from dstack._internal.core.backends.base.offers import get_catalog_offers
 from dstack._internal.core.backends.tensordock.api_client import TensorDockAPIClient
 from dstack._internal.core.backends.tensordock.config import TensorDockConfig
@@ -23,6 +27,10 @@ from dstack._internal.utils.logging import get_logger
 logger = get_logger(__name__)
 
 
+# Undocumented but names of len 60 work
+MAX_INSTANCE_NAME_LEN = 60
+
+
 class TensorDockCompute(Compute):
     def __init__(self, config: TensorDockConfig):
         super().__init__()
@@ -49,10 +57,13 @@ class TensorDockCompute(Compute):
         instance_offer: InstanceOfferWithAvailability,
         instance_config: InstanceConfiguration,
     ) -> JobProvisioningData:
+        instance_name = generate_unique_instance_name(
+            instance_config, max_length=MAX_INSTANCE_NAME_LEN
+        )
         commands = get_shim_commands(authorized_keys=instance_config.get_public_keys())
         try:
             resp = self.api_client.deploy_single(
-                instance_name=
+                instance_name=instance_name,
                 instance=instance_offer.instance,
                 cloudinit={
                     "ssh_pwauth": False,  # disable password auth
@@ -113,7 +124,7 @@ class TensorDockCompute(Compute):
     ) -> JobProvisioningData:
         instance_config = InstanceConfiguration(
             project_name=run.project_name,
-            instance_name=
+            instance_name=get_job_instance_name(run, job),  # TODO: generate name
             ssh_keys=[
                 SSHKey(public=run.run_spec.ssh_key_pub.strip()),
                 SSHKey(public=project_ssh_public_key.strip()),
```
```diff
--- a/dstack/_internal/core/backends/vastai/compute.py
+++ b/dstack/_internal/core/backends/vastai/compute.py
@@ -4,7 +4,10 @@ import gpuhunt
 from gpuhunt.providers.vastai import VastAIProvider
 
 from dstack._internal.core.backends.base import Compute
-from dstack._internal.core.backends.base.compute import
+from dstack._internal.core.backends.base.compute import (
+    generate_unique_instance_name_for_job,
+    get_docker_commands,
+)
 from dstack._internal.core.backends.base.offers import get_catalog_offers
 from dstack._internal.core.backends.vastai.api_client import VastAIAPIClient
 from dstack._internal.core.backends.vastai.config import VastAIConfig
@@ -23,6 +26,10 @@ from dstack._internal.utils.logging import get_logger
 logger = get_logger(__name__)
 
 
+# Undocumented but names of len 60 work
+MAX_INSTANCE_NAME_LEN = 60
+
+
 class VastAICompute(Compute):
     def __init__(self, config: VastAIConfig):
         super().__init__()
@@ -70,11 +77,14 @@ class VastAICompute(Compute):
         project_ssh_private_key: str,
         volumes: List[Volume],
     ) -> JobProvisioningData:
+        instance_name = generate_unique_instance_name_for_job(
+            run, job, max_length=MAX_INSTANCE_NAME_LEN
+        )
         commands = get_docker_commands(
             [run.run_spec.ssh_key_pub.strip(), project_ssh_public_key.strip()]
         )
         resp = self.api_client.create_instance(
-            instance_name=
+            instance_name=instance_name,
             bundle_id=instance_offer.instance.name,
             image_name=job.job_spec.image_name,
             onstart=" && ".join(commands),
```
```diff
--- a/dstack/_internal/core/backends/vultr/api_client.py
+++ b/dstack/_internal/core/backends/vultr/api_client.py
@@ -20,7 +20,7 @@ class VultrApiClient:
                 return False
         return True
 
-    def get_instance(self, instance_id: str, plan_type: str):
+    def get_instance(self, instance_id: str, plan_type: str) -> dict:
         if plan_type == "bare-metal":
             response = self._make_request("GET", f"/bare-metals/{instance_id}")
             return response.json()["bare_metal"]
@@ -28,7 +28,7 @@ class VultrApiClient:
             response = self._make_request("GET", f"/instances/{instance_id}")
             return response.json()["instance"]
 
-    def get_vpc_for_region(self, region: str) -> Optional[
+    def get_vpc_for_region(self, region: str) -> Optional[dict]:
         response = self._make_request("GET", "/vpcs?per_page=500")
         vpcs = response.json().get("vpcs", [])
         if vpcs:
@@ -37,7 +37,7 @@ class VultrApiClient:
                     return vpc
         return None
 
-    def create_vpc(self, region: str):
+    def create_vpc(self, region: str) -> dict:
         data = {"region": region, "description": f"dstack-vpc-{region}"}
         response = self._make_request("POST", "/vpcs", data=data)
         return response.json()["vpc"]
```
```diff
--- a/dstack/_internal/core/backends/vultr/compute.py
+++ b/dstack/_internal/core/backends/vultr/compute.py
@@ -6,7 +6,8 @@ import requests
 
 from dstack._internal.core.backends.base import Compute
 from dstack._internal.core.backends.base.compute import (
-
+    generate_unique_instance_name,
+    get_job_instance_name,
     get_user_data,
 )
 from dstack._internal.core.backends.base.offers import get_catalog_offers
@@ -27,6 +28,8 @@ from dstack._internal.utils.logging import get_logger
 
 logger = get_logger(__name__)
 
+MAX_INSTANCE_NAME_LEN = 64
+
 
 class VultrCompute(Compute):
     def __init__(self, config: VultrConfig):
@@ -62,7 +65,7 @@ class VultrCompute(Compute):
     ) -> JobProvisioningData:
         instance_config = InstanceConfiguration(
             project_name=run.project_name,
-            instance_name=
+            instance_name=get_job_instance_name(run, job),
             ssh_keys=[SSHKey(public=project_ssh_public_key.strip())],
             user=run.user,
         )
@@ -71,6 +74,9 @@ class VultrCompute(Compute):
     def create_instance(
         self, instance_offer: InstanceOfferWithAvailability, instance_config: InstanceConfiguration
    ) -> JobProvisioningData:
+        instance_name = generate_unique_instance_name(
+            instance_config, max_length=MAX_INSTANCE_NAME_LEN
+        )
         # create vpc
         vpc = self.api_client.get_vpc_for_region(instance_offer.region)
         if not vpc:
@@ -85,7 +91,7 @@ class VultrCompute(Compute):
         ]
         instance_id = self.api_client.launch_instance(
             region=instance_offer.region,
-            label=
+            label=instance_name,
             plan=instance_offer.instance.name,
             user_data=get_user_data(
                 authorized_keys=instance_config.get_public_keys(),
```
```diff
--- a/dstack/_internal/core/models/backends/aws.py
+++ b/dstack/_internal/core/models/backends/aws.py
@@ -32,6 +32,7 @@ class AWSConfigInfo(CoreModel):
     vpc_ids: Optional[Dict[str, str]] = None
     default_vpcs: Optional[bool] = None
     public_ips: Optional[bool] = None
+    iam_instance_profile: Optional[str] = None
     tags: Optional[Dict[str, str]] = None
     os_images: Optional[AWSOSImageConfig] = None
 
@@ -70,6 +71,7 @@ class AWSConfigInfoWithCredsPartial(CoreModel):
     vpc_ids: Optional[Dict[str, str]]
     default_vpcs: Optional[bool]
     public_ips: Optional[bool]
+    iam_instance_profile: Optional[str]
     tags: Optional[Dict[str, str]]
     os_images: Optional["AWSOSImageConfig"]
 
```
```diff
--- a/dstack/_internal/core/models/backends/base.py
+++ b/dstack/_internal/core/models/backends/base.py
@@ -15,6 +15,7 @@ class BackendType(str, enum.Enum):
         DATACRUNCH (BackendType): DataCrunch
         KUBERNETES (BackendType): Kubernetes
         LAMBDA (BackendType): Lambda Cloud
+        OCI (BackendType): Oracle Cloud Infrastructure
         RUNPOD (BackendType): Runpod Cloud
         TENSORDOCK (BackendType): TensorDock Marketplace
         VASTAI (BackendType): Vast.ai Marketplace
```
```diff
--- a/dstack/_internal/core/services/__init__.py
+++ b/dstack/_internal/core/services/__init__.py
@@ -4,5 +4,9 @@ from dstack._internal.core.errors import ServerClientError
 
 
 def validate_dstack_resource_name(resource_name: str):
-    if not
+    if not is_valid_dstack_resource_name(resource_name):
         raise ServerClientError("Resource name should match regex '^[a-z][a-z0-9-]{1,40}$'")
+
+
+def is_valid_dstack_resource_name(resource_name: str) -> bool:
+    return re.match("^[a-z][a-z0-9-]{1,40}$", resource_name) is not None
```
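The split keeps the raising validator's behavior intact while exposing a boolean predicate that callers can use without a try/except (presumably handy for the new CLI completion helpers, which are not shown here). A quick illustrative check against the regex from the diff above:

```python
from dstack._internal.core.services import (
    is_valid_dstack_resource_name,
    validate_dstack_resource_name,
)

# Boolean form: no exception, convenient for filtering candidate names.
assert is_valid_dstack_resource_name("my-fleet-1")
assert not is_valid_dstack_resource_name("My_Fleet")  # uppercase and "_" are rejected

# Raising form: behavior unchanged, still raises ServerClientError on invalid names.
validate_dstack_resource_name("my-fleet-1")
```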
```diff
--- a/dstack/_internal/core/services/configs/__init__.py
+++ b/dstack/_internal/core/services/configs/__init__.py
@@ -65,6 +65,9 @@ class ConfigManager:
         if len(self.config.projects) == 1:
             self.config.projects[0].default = True
 
+    def list_projects(self):
+        return [project.name for project in self.config.projects]
+
     def delete_project(self, name: str):
         self.config.projects = [p for p in self.config.projects if p.name != name]
 
```
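The new `list_projects()` helper just exposes the project names already stored in the CLI config. A minimal consumption sketch, assuming `ConfigManager`'s default constructor loads the user-level config (the constructor is not part of this diff):

```python
from dstack._internal.core.services.configs import ConfigManager

# Assumption: the default constructor reads the user's dstack CLI config.
manager = ConfigManager()

# list_projects() returns plain project names, e.g. for suggesting --project
# values in the new shell-completion code (cli/services/completion.py).
for name in manager.list_projects():
    print(name)
```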
```diff
--- /dev/null
+++ b/dstack/_internal/server/background/tasks/common.py
@@ -0,0 +1,22 @@
+from datetime import timedelta
+
+from dstack._internal.core.models.backends.base import BackendType
+
+
+def get_provisioning_timeout(backend_type: BackendType, instance_type_name: str) -> timedelta:
+    """
+    This timeout is used in a few places, but roughly refers to the max time between
+    requesting instance creation and the instance becoming ready to accept jobs.
+    For container-based backends, this also includes the image pulling time.
+    """
+    if backend_type == BackendType.LAMBDA:
+        return timedelta(minutes=30)
+    if backend_type == BackendType.RUNPOD:
+        return timedelta(minutes=20)
+    if backend_type == BackendType.KUBERNETES:
+        return timedelta(minutes=20)
+    if backend_type == BackendType.OCI and instance_type_name.startswith("BM."):
+        return timedelta(minutes=20)
+    if backend_type == BackendType.VULTR and instance_type_name.startswith("vbm"):
+        return timedelta(minutes=55)
+    return timedelta(minutes=10)
```
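With the two per-module helpers removed (see process_instances.py and process_running_jobs.py below), both the instance-provisioning deadline and the runner-wait timeout now come from this single function. A small sanity check of the table above; the instance type names are only illustrative:

```python
from datetime import timedelta

from dstack._internal.core.models.backends.base import BackendType
from dstack._internal.server.background.tasks.common import get_provisioning_timeout

# Bare-metal OCI shapes (BM.*) get 20 minutes instead of the 10-minute default.
assert get_provisioning_timeout(BackendType.OCI, "BM.GPU4.8") == timedelta(minutes=20)
# Any other backend/instance combination falls through to the default.
assert get_provisioning_timeout(BackendType.AWS, "p4d.24xlarge") == timedelta(minutes=10)
```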
```diff
--- a/dstack/_internal/server/background/tasks/process_instances.py
+++ b/dstack/_internal/server/background/tasks/process_instances.py
@@ -45,7 +45,6 @@ from dstack._internal.core.models.instances import (
     InstanceOfferWithAvailability,
     InstanceRuntime,
     InstanceStatus,
-    InstanceType,
     RemoteConnectionInfo,
     SSHKey,
 )
@@ -63,6 +62,7 @@ from dstack._internal.core.models.runs import (
     Retry,
 )
 from dstack._internal.core.services.profiles import get_retry
+from dstack._internal.server.background.tasks.common import get_provisioning_timeout
 from dstack._internal.server.db import get_session_ctx
 from dstack._internal.server.models import (
     FleetModel,
@@ -695,7 +695,8 @@ async def _check_instance(instance: InstanceModel) -> None:
 
     if instance.status == InstanceStatus.PROVISIONING and instance.started_at is not None:
         provisioning_deadline = _get_provisioning_deadline(
-            instance,
+            instance=instance,
+            job_provisioning_data=job_provisioning_data,
         )
         if get_current_datetime() > provisioning_deadline:
             instance.status = InstanceStatus.TERMINATING
@@ -737,7 +738,8 @@ async def _wait_for_instance_provisioning_data(
             instance.name,
         )
         provisioning_deadline = _get_provisioning_deadline(
-            instance,
+            instance=instance,
+            job_provisioning_data=job_provisioning_data,
         )
         if get_current_datetime() > provisioning_deadline:
             logger.warning(
@@ -959,24 +961,15 @@ def _get_retry_duration_deadline(instance: InstanceModel, retry: Retry) -> datetime.datetime:
 
 
 def _get_provisioning_deadline(
-    instance: InstanceModel,
+    instance: InstanceModel,
+    job_provisioning_data: JobProvisioningData,
 ) -> datetime.datetime:
-    timeout_interval =
+    timeout_interval = get_provisioning_timeout(
+        backend_type=job_provisioning_data.get_base_backend(),
+        instance_type_name=job_provisioning_data.instance_type.name,
+    )
     return instance.started_at.replace(tzinfo=datetime.timezone.utc) + timeout_interval
 
 
-def _get_instance_timeout_interval(
-    backend_type: BackendType, instance_type_name: str
-) -> timedelta:
-    # when changing timeouts, also consider process_running_jobs._get_runner_timeout_interval
-    if backend_type == BackendType.RUNPOD:
-        return timedelta(seconds=1200)
-    if backend_type == BackendType.OCI and instance_type_name.startswith("BM."):
-        return timedelta(seconds=1200)
-    if backend_type == BackendType.VULTR and instance_type_name.startswith("vbm"):
-        return timedelta(seconds=3300)
-    return timedelta(seconds=600)
-
-
 def _ssh_keys_to_pkeys(ssh_keys: list[SSHKey]) -> list[PKey]:
     return [pkey_from_str(sk.private) for sk in ssh_keys if sk.private is not None]
```
```diff
--- a/dstack/_internal/server/background/tasks/process_running_jobs.py
+++ b/dstack/_internal/server/background/tasks/process_running_jobs.py
@@ -1,5 +1,4 @@
 import asyncio
-from datetime import timedelta
 from typing import Dict, List, Optional
 
 from sqlalchemy import select
@@ -21,6 +20,7 @@ from dstack._internal.core.models.runs import (
     ClusterInfo,
     Job,
     JobProvisioningData,
+    JobRuntimeData,
     JobSpec,
     JobStatus,
     JobTerminationReason,
@@ -28,6 +28,7 @@ from dstack._internal.core.models.runs import (
     RunSpec,
 )
 from dstack._internal.core.models.volumes import InstanceMountPoint, Volume, VolumeMountPoint
+from dstack._internal.server.background.tasks.common import get_provisioning_timeout
 from dstack._internal.server.db import get_session_ctx
 from dstack._internal.server.models import (
     InstanceModel,
@@ -148,6 +149,7 @@ async def _process_running_job(session: AsyncSession, job_model: JobModel):
             jobs=run.jobs,
             replica_num=job.job_spec.replica_num,
             job_provisioning_data=job_provisioning_data,
+            job_runtime_data=job_submission.job_runtime_data,
         )
 
         volumes = await get_job_attached_volumes(
@@ -242,7 +244,7 @@ async def _process_running_job(session: AsyncSession, job_model: JobModel):
 
         if not success:
             # check timeout
-            if job_submission.age >
+            if job_submission.age > get_provisioning_timeout(
                 backend_type=job_provisioning_data.get_base_backend(),
                 instance_type_name=job_provisioning_data.instance_type.name,
             ):
@@ -671,6 +673,7 @@ def _get_cluster_info(
     jobs: List[Job],
     replica_num: int,
     job_provisioning_data: JobProvisioningData,
+    job_runtime_data: Optional[JobRuntimeData],
 ) -> ClusterInfo:
     job_ips = []
     for job in jobs:
@@ -681,10 +684,13 @@ def _get_cluster_info(
             ).internal_ip
             or ""
         )
+    gpus_per_job = len(job_provisioning_data.instance_type.resources.gpus)
+    if job_runtime_data is not None and job_runtime_data.offer is not None:
+        gpus_per_job = len(job_runtime_data.offer.instance.resources.gpus)
     cluster_info = ClusterInfo(
         job_ips=job_ips,
         master_job_ip=job_ips[0],
-        gpus_per_job=
+        gpus_per_job=gpus_per_job,
     )
     return cluster_info
 
@@ -763,16 +769,3 @@ def _submit_job_to_runner(
     # do not log here, because the runner will send a new status
 
     return True
-
-
-def _get_runner_timeout_interval(backend_type: BackendType, instance_type_name: str) -> timedelta:
-    # when changing timeouts, also consider process_instances._get_instance_timeout_interval
-    if backend_type == BackendType.LAMBDA:
-        return timedelta(seconds=1200)
-    if backend_type == BackendType.KUBERNETES:
-        return timedelta(seconds=1200)
-    if backend_type == BackendType.OCI and instance_type_name.startswith("BM."):
-        return timedelta(seconds=1200)
-    if backend_type == BackendType.VULTR and instance_type_name.startswith("vbm"):
-        return timedelta(seconds=3300)
-    return timedelta(seconds=600)
```
```diff
--- a/dstack/_internal/server/background/tasks/process_terminating_jobs.py
+++ b/dstack/_internal/server/background/tasks/process_terminating_jobs.py
@@ -11,7 +11,6 @@ from dstack._internal.server.models import (
     JobModel,
     ProjectModel,
     VolumeAttachmentModel,
-    VolumeModel,
 )
 from dstack._internal.server.services.jobs import (
     process_terminating_job,
@@ -86,12 +85,7 @@ async def _process_job(session: AsyncSession, job_model: JobModel):
         .where(InstanceModel.id == job_model.used_instance_id)
         .options(
             joinedload(InstanceModel.project).joinedload(ProjectModel.backends),
-            joinedload(InstanceModel.volume_attachments)
-            .joinedload(VolumeAttachmentModel.volume)
-            .joinedload(VolumeModel.user),
-            joinedload(InstanceModel.volume_attachments)
-            .joinedload(VolumeAttachmentModel.volume)
-            .joinedload(VolumeModel.attachments),
+            joinedload(InstanceModel.volume_attachments).joinedload(VolumeAttachmentModel.volume),
         )
     )
     instance_model = res.unique().scalar()
```
```diff
--- a/dstack/_internal/server/routers/logs.py
+++ b/dstack/_internal/server/routers/logs.py
@@ -24,4 +24,7 @@ async def poll_logs(
     user_project: Tuple[UserModel, ProjectModel] = Depends(ProjectMember()),
 ) -> JobSubmissionLogs:
     _, project = user_project
+    # The runner guarantees logs have different timestamps if throughput < 1k logs / sec.
+    # Otherwise, some logs with duplicated timestamps may be filtered out.
+    # This limitation is imposed by cloud log services that support up to millisecond timestamp resolution.
     return await logs.poll_logs_async(project=project, request=body)
```
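The new comment documents a real edge case of timestamp-based log pagination: with only millisecond resolution, resuming "strictly after the last seen timestamp" can drop entries that share a millisecond. A toy, dstack-independent illustration of that failure mode:

```python
from datetime import datetime, timedelta

t = datetime(2025, 1, 1, 12, 0, 0, 123000)  # microsecond=123000 -> one millisecond bucket
events = [(t, "line 1"), (t, "line 2"), (t + timedelta(milliseconds=1), "line 3")]

# First page ended at "line 1"; the next poll asks for events strictly after it.
last_seen = events[0][0]
next_page = [message for ts, message in events if ts > last_seen]
print(next_page)  # ['line 3'] -- "line 2" is silently skipped
```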
```diff
--- a/dstack/_internal/server/services/backends/configurators/aws.py
+++ b/dstack/_internal/server/services/backends/configurators/aws.py
@@ -2,6 +2,7 @@ import concurrent.futures
 import json
 from typing import List
 
+import botocore.exceptions
 from boto3.session import Session
 
 from dstack._internal.core.backends.aws import AWSBackend, auth, compute, resources
@@ -35,6 +36,9 @@ from dstack._internal.server.services.backends.configurators.base import (
     Configurator,
     raise_invalid_credentials_error,
 )
+from dstack._internal.utils.logging import get_logger
+
+logger = get_logger(__name__)
 
 REGIONS = [
     ("US East, N. Virginia", "us-east-1"),
@@ -137,7 +141,8 @@ class AWSConfigurator(Configurator):
 
     def _check_config(self, session: Session, config: AWSConfigInfoWithCredsPartial):
         self._check_tags_config(config)
-        self.
+        self._check_iam_instance_profile_config(session, config)
+        self._check_vpc_config(session, config)
 
     def _check_tags_config(self, config: AWSConfigInfoWithCredsPartial):
         if not config.tags:
@@ -151,6 +156,31 @@ class AWSConfigurator(Configurator):
         except BackendError as e:
             raise ServerClientError(e.args[0])
 
+    def _check_iam_instance_profile_config(
+        self, session: Session, config: AWSConfigInfoWithCredsPartial
+    ):
+        if config.iam_instance_profile is None:
+            return
+        try:
+            iam_client = session.client("iam")
+            iam_client.get_instance_profile(InstanceProfileName=config.iam_instance_profile)
+        except botocore.exceptions.ClientError as e:
+            if e.response["Error"]["Code"] == "NoSuchEntity":
+                raise ServerClientError(
+                    f"IAM instance profile {config.iam_instance_profile} not found"
+                )
+            logger.exception(
+                "Got botocore.exceptions.ClientError when checking iam_instance_profile"
+            )
+            raise ServerClientError(
+                f"Failed to check IAM instance profile {config.iam_instance_profile}"
+            )
+        except Exception:
+            logger.exception("Got exception when checking iam_instance_profile")
+            raise ServerClientError(
+                f"Failed to check IAM instance profile {config.iam_instance_profile}"
+            )
+
     def _check_vpc_config(self, session: Session, config: AWSConfigInfoWithCredsPartial):
         allocate_public_ip = config.public_ips if config.public_ips is not None else True
         use_default_vpcs = config.default_vpcs if config.default_vpcs is not None else True
```
```diff
--- a/dstack/_internal/server/services/backends/configurators/gcp.py
+++ b/dstack/_internal/server/services/backends/configurators/gcp.py
@@ -127,10 +127,6 @@ class GCPConfigurator(Configurator):
             _, project_id = auth.authenticate(GCPDefaultCreds())
         except BackendAuthError:
             return []
-
-        if project_id is None:
-            return []
-
         return [
             GCPConfigInfoWithCreds(
                 project_id=project_id,
@@ -152,24 +148,21 @@ class GCPConfigurator(Configurator):
         ):
             raise_invalid_credentials_error(fields=[["creds"]])
         try:
-            credentials,
-        except BackendAuthError:
+            credentials, _ = auth.authenticate(creds=config.creds, project_id=config.project_id)
+        except BackendAuthError as e:
+            details = None
+            if len(e.args) > 0:
+                details = e.args[0]
             if is_core_model_instance(config.creds, GCPServiceAccountCreds):
-                raise_invalid_credentials_error(fields=[["creds", "data"]])
+                raise_invalid_credentials_error(fields=[["creds", "data"]], details=details)
             else:
-                raise_invalid_credentials_error(fields=[["creds"]])
-        if (
-            project_id is not None
-            and config.project_id is not None
-            and config.project_id != project_id
-        ):
-            raise ServerClientError(msg="Wrong project_id", fields=[["project_id"]])
-        config_values.project_id = self._get_project_id_element(selected=project_id)
+                raise_invalid_credentials_error(fields=[["creds"]], details=details)
         config_values.regions = self._get_regions_element(
             selected=config.regions or DEFAULT_REGIONS
         )
         if config.project_id is None:
             return config_values
+        config_values.project_id = self._get_project_id_element(selected=config.project_id)
         self._check_config(config=config, credentials=credentials)
         return config_values
 
```
```diff
--- a/dstack/_internal/server/services/config.py
+++ b/dstack/_internal/server/services/config.py
@@ -107,6 +107,16 @@ class AWSConfig(CoreModel):
             )
         ),
     ] = None
+    iam_instance_profile: Annotated[
+        Optional[str],
+        Field(
+            description=(
+                "The name of the IAM instance profile to associate with EC2 instances."
+                " You can also specify the IAM role name for roles created via the AWS console."
+                " AWS automatically creates an instance profile and gives it the same name as the role"
+            )
+        ),
+    ] = None
     tags: Annotated[
         Optional[Dict[str, str]],
         Field(description="The tags that will be assigned to resources created by `dstack`"),
@@ -251,7 +261,7 @@ class GCPConfig(CoreModel):
         ),
     ] = None
     vm_service_account: Annotated[
-        Optional[str], Field(description="The service account
+        Optional[str], Field(description="The service account to associate with provisioned VMs")
     ] = None
     tags: Annotated[
         Optional[Dict[str, str]],
```
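For context, the aws/compute.py changes in this release (+22/-10, not shown in this section) presumably thread the new `iam_instance_profile` setting through to instance launch. In boto3, a profile is attached by name at launch time roughly as follows; the values below are placeholders, not dstack's actual code:

```python
import boto3

ec2 = boto3.client("ec2", region_name="us-east-1")
ec2.run_instances(
    ImageId="ami-0123456789abcdef0",  # placeholder AMI
    InstanceType="t3.micro",
    MinCount=1,
    MaxCount=1,
    # The value configured as iam_instance_profile; the configurator above validates
    # it with iam.get_instance_profile() before accepting the backend config.
    IamInstanceProfile={"Name": "my-dstack-instance-profile"},
)
```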
```diff
--- a/dstack/_internal/server/services/jobs/__init__.py
+++ b/dstack/_internal/server/services/jobs/__init__.py
@@ -236,13 +236,14 @@ async def process_terminating_job(
         logger.debug("%s: stopping container", fmt(job_model))
         ssh_private_keys = get_instance_ssh_private_keys(instance_model)
         await stop_container(job_model, jpd, ssh_private_keys)
-        volume_models: list[VolumeModel]
         if jrd is not None and jrd.volume_names is not None:
-
-                session=session, project=instance_model.project, names=jrd.volume_names
-            )
+            volume_names = jrd.volume_names
         else:
-
+            # Legacy jobs before job_runtime_data/blocks were introduced
+            volume_names = [va.volume.name for va in instance_model.volume_attachments]
+        volume_models = await list_project_volume_models(
+            session=session, project=instance_model.project, names=volume_names
+        )
         if len(volume_models) > 0:
             logger.info("Detaching volumes: %s", [v.name for v in volume_models])
             all_volumes_detached = await _detach_volumes_from_job_instance(
@@ -302,11 +303,13 @@ async def process_volumes_detaching(
     jpd = get_or_error(get_job_provisioning_data(job_model))
     jrd = get_job_runtime_data(job_model)
     if jrd is not None and jrd.volume_names is not None:
-
-            session=session, project=instance_model.project, names=jrd.volume_names
-        )
+        volume_names = jrd.volume_names
     else:
-
+        # Legacy jobs before job_runtime_data/blocks were introduced
+        volume_names = [va.volume.name for va in instance_model.volume_attachments]
+    volume_models = await list_project_volume_models(
+        session=session, project=instance_model.project, names=volume_names
+    )
     logger.info("Detaching volumes: %s", [v.name for v in volume_models])
     all_volumes_detached = await _detach_volumes_from_job_instance(
         project=instance_model.project,
```
@@ -6,8 +6,6 @@ from dstack._internal.core.models.runs import RunSpec
|
|
|
6
6
|
from dstack._internal.server.services.jobs.configurators.base import JobConfigurator
|
|
7
7
|
from dstack._internal.server.services.jobs.configurators.extensions.vscode import VSCodeDesktop
|
|
8
8
|
|
|
9
|
-
DEFAULT_MAX_DURATION_SECONDS = 6 * 3600
|
|
10
|
-
|
|
11
9
|
INSTALL_IPYKERNEL = (
|
|
12
10
|
"(echo pip install ipykernel... && pip install -q --no-cache-dir ipykernel 2> /dev/null) || "
|
|
13
11
|
'echo "no pip, ipykernel was not installed"'
|
|
@@ -44,7 +42,7 @@ class DevEnvironmentJobConfigurator(JobConfigurator):
|
|
|
44
42
|
return False
|
|
45
43
|
|
|
46
44
|
def _default_max_duration(self) -> Optional[int]:
|
|
47
|
-
return
|
|
45
|
+
return None
|
|
48
46
|
|
|
49
47
|
def _spot_policy(self) -> SpotPolicy:
|
|
50
48
|
return self.run_spec.merged_profile.spot_policy or SpotPolicy.ONDEMAND
|