dstack 0.18.41__py3-none-any.whl → 0.18.43__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dstack/_internal/cli/commands/__init__.py +2 -1
- dstack/_internal/cli/commands/apply.py +4 -2
- dstack/_internal/cli/commands/attach.py +21 -1
- dstack/_internal/cli/commands/completion.py +20 -0
- dstack/_internal/cli/commands/delete.py +3 -1
- dstack/_internal/cli/commands/fleet.py +2 -1
- dstack/_internal/cli/commands/gateway.py +7 -2
- dstack/_internal/cli/commands/logs.py +3 -2
- dstack/_internal/cli/commands/stats.py +2 -1
- dstack/_internal/cli/commands/stop.py +2 -1
- dstack/_internal/cli/commands/volume.py +2 -1
- dstack/_internal/cli/main.py +6 -0
- dstack/_internal/cli/services/completion.py +86 -0
- dstack/_internal/cli/services/configurators/run.py +10 -17
- dstack/_internal/cli/utils/fleet.py +5 -1
- dstack/_internal/cli/utils/volume.py +9 -0
- dstack/_internal/core/backends/aws/compute.py +24 -11
- dstack/_internal/core/backends/aws/resources.py +3 -3
- dstack/_internal/core/backends/azure/compute.py +14 -8
- dstack/_internal/core/backends/azure/resources.py +2 -0
- dstack/_internal/core/backends/base/compute.py +102 -2
- dstack/_internal/core/backends/base/offers.py +7 -1
- dstack/_internal/core/backends/cudo/compute.py +8 -4
- dstack/_internal/core/backends/datacrunch/compute.py +10 -4
- dstack/_internal/core/backends/gcp/auth.py +19 -13
- dstack/_internal/core/backends/gcp/compute.py +27 -20
- dstack/_internal/core/backends/gcp/resources.py +3 -10
- dstack/_internal/core/backends/kubernetes/compute.py +4 -3
- dstack/_internal/core/backends/lambdalabs/compute.py +9 -3
- dstack/_internal/core/backends/nebius/compute.py +2 -2
- dstack/_internal/core/backends/oci/compute.py +10 -4
- dstack/_internal/core/backends/runpod/compute.py +11 -4
- dstack/_internal/core/backends/tensordock/compute.py +14 -3
- dstack/_internal/core/backends/vastai/compute.py +12 -2
- dstack/_internal/core/backends/vultr/api_client.py +3 -3
- dstack/_internal/core/backends/vultr/compute.py +9 -3
- dstack/_internal/core/models/backends/aws.py +2 -0
- dstack/_internal/core/models/backends/base.py +1 -0
- dstack/_internal/core/models/configurations.py +0 -1
- dstack/_internal/core/models/runs.py +3 -3
- dstack/_internal/core/models/volumes.py +23 -0
- dstack/_internal/core/services/__init__.py +5 -1
- dstack/_internal/core/services/configs/__init__.py +3 -0
- dstack/_internal/server/background/tasks/common.py +22 -0
- dstack/_internal/server/background/tasks/process_instances.py +13 -21
- dstack/_internal/server/background/tasks/process_running_jobs.py +13 -16
- dstack/_internal/server/background/tasks/process_submitted_jobs.py +12 -7
- dstack/_internal/server/background/tasks/process_terminating_jobs.py +7 -2
- dstack/_internal/server/background/tasks/process_volumes.py +11 -1
- dstack/_internal/server/migrations/versions/a751ef183f27_move_attachment_data_to_volumes_.py +34 -0
- dstack/_internal/server/models.py +17 -19
- dstack/_internal/server/routers/logs.py +3 -0
- dstack/_internal/server/services/backends/configurators/aws.py +31 -1
- dstack/_internal/server/services/backends/configurators/gcp.py +8 -15
- dstack/_internal/server/services/config.py +11 -1
- dstack/_internal/server/services/fleets.py +5 -1
- dstack/_internal/server/services/jobs/__init__.py +14 -11
- dstack/_internal/server/services/jobs/configurators/dev.py +1 -3
- dstack/_internal/server/services/jobs/configurators/task.py +1 -3
- dstack/_internal/server/services/logs/__init__.py +78 -0
- dstack/_internal/server/services/{logs.py → logs/aws.py} +12 -207
- dstack/_internal/server/services/logs/base.py +47 -0
- dstack/_internal/server/services/logs/filelog.py +110 -0
- dstack/_internal/server/services/logs/gcp.py +165 -0
- dstack/_internal/server/services/offers.py +7 -7
- dstack/_internal/server/services/pools.py +19 -20
- dstack/_internal/server/services/proxy/routers/service_proxy.py +14 -7
- dstack/_internal/server/services/runner/client.py +8 -5
- dstack/_internal/server/services/volumes.py +68 -9
- dstack/_internal/server/settings.py +3 -0
- dstack/_internal/server/statics/index.html +1 -1
- dstack/_internal/server/statics/{main-ad5150a441de98cd8987.css → main-7510e71dfa9749a4e70e.css} +1 -1
- dstack/_internal/server/statics/{main-2ac66bfcbd2e39830b88.js → main-fe8fd9db55df8d10e648.js} +66 -66
- dstack/_internal/server/statics/{main-2ac66bfcbd2e39830b88.js.map → main-fe8fd9db55df8d10e648.js.map} +1 -1
- dstack/_internal/server/testing/common.py +46 -17
- dstack/api/_public/runs.py +1 -1
- dstack/version.py +2 -2
- {dstack-0.18.41.dist-info → dstack-0.18.43.dist-info}/METADATA +4 -3
- {dstack-0.18.41.dist-info → dstack-0.18.43.dist-info}/RECORD +97 -86
- tests/_internal/core/backends/base/__init__.py +0 -0
- tests/_internal/core/backends/base/test_compute.py +56 -0
- tests/_internal/server/background/tasks/test_process_running_jobs.py +2 -1
- tests/_internal/server/background/tasks/test_process_submitted_jobs.py +5 -3
- tests/_internal/server/background/tasks/test_process_terminating_jobs.py +11 -6
- tests/_internal/server/conftest.py +4 -5
- tests/_internal/server/routers/test_backends.py +1 -0
- tests/_internal/server/routers/test_logs.py +1 -1
- tests/_internal/server/routers/test_runs.py +2 -2
- tests/_internal/server/routers/test_volumes.py +9 -2
- tests/_internal/server/services/runner/test_client.py +22 -3
- tests/_internal/server/services/test_logs.py +3 -3
- tests/_internal/server/services/test_offers.py +167 -0
- tests/_internal/server/services/test_pools.py +105 -1
- {dstack-0.18.41.dist-info → dstack-0.18.43.dist-info}/LICENSE.md +0 -0
- {dstack-0.18.41.dist-info → dstack-0.18.43.dist-info}/WHEEL +0 -0
- {dstack-0.18.41.dist-info → dstack-0.18.43.dist-info}/entry_points.txt +0 -0
- {dstack-0.18.41.dist-info → dstack-0.18.43.dist-info}/top_level.txt +0 -0

dstack/_internal/core/backends/nebius/compute.py:

@@ -6,7 +6,7 @@ from typing import List, Optional
 import dstack.version as version
 from dstack._internal import settings
 from dstack._internal.core.backends.base import Compute
-from dstack._internal.core.backends.base.compute import
+from dstack._internal.core.backends.base.compute import get_job_instance_name, get_user_data
 from dstack._internal.core.backends.base.offers import get_catalog_offers
 from dstack._internal.core.backends.nebius.api_client import NebiusAPIClient
 from dstack._internal.core.backends.nebius.config import NebiusConfig
@@ -130,7 +130,7 @@ class NebiusCompute(Compute):
     ) -> JobProvisioningData:
         instance_config = InstanceConfiguration(
             project_name=run.project_name,
-            instance_name=
+            instance_name=get_job_instance_name(run, job),  # TODO: generate name
             ssh_keys=[
                 SSHKey(public=project_ssh_public_key.strip()),
             ],
dstack/_internal/core/backends/oci/compute.py:

@@ -4,7 +4,12 @@ from typing import List, Optional
 
 import oci
 
-from dstack._internal.core.backends.base.compute import
+from dstack._internal.core.backends.base.compute import (
+    Compute,
+    generate_unique_instance_name,
+    get_job_instance_name,
+    get_user_data,
+)
 from dstack._internal.core.backends.base.offers import get_catalog_offers
 from dstack._internal.core.backends.oci import resources
 from dstack._internal.core.backends.oci.config import OCIConfig
@@ -98,7 +103,7 @@ class OCICompute(Compute):
     ) -> JobProvisioningData:
         instance_config = InstanceConfiguration(
             project_name=run.project_name,
-            instance_name=
+            instance_name=get_job_instance_name(run, job),
             ssh_keys=[SSHKey(public=project_ssh_public_key.strip())],
             user=run.user,
         )
@@ -148,6 +153,7 @@ class OCICompute(Compute):
         ]
         cloud_init_user_data = get_user_data(instance_config.get_public_keys(), setup_commands)
 
+        display_name = generate_unique_instance_name(instance_config)
         try:
             instance = resources.launch_instance(
                 region=region,
@@ -155,7 +161,7 @@ class OCICompute(Compute):
                 compartment_id=self.config.compartment_id,
                 subnet_id=subnet.id,
                 security_group_id=security_group.id,
-                display_name=
+                display_name=display_name,
                 cloud_init_user_data=cloud_init_user_data,
                 shape=instance_offer.instance.name,
                 is_spot=instance_offer.instance.resources.spot,
@@ -163,7 +169,7 @@ class OCICompute(Compute):
                 image_id=package.image_id,
             )
         except oci.exceptions.ServiceError as e:
-            if e.code in ("LimitExceeded", "QuotaExceeded"):
+            if e.code in ("LimitExceeded", "QuotaExceeded") or "Out of host capacity" in e.message:
                 raise NoCapacityError(e.message)
             raise
 
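The broadened `except` clause above matters for scheduling: raising `NoCapacityError` makes OCI's "Out of host capacity" failures count as a retryable capacity shortage rather than a hard provisioning error. Below is a minimal sketch of the same mapping with stand-in exception types (the real ones are `oci.exceptions.ServiceError`, which carries `code` and `message` attributes, and dstack's own `NoCapacityError`):

```python
# Sketch of the capacity-error mapping introduced above. ServiceError here is
# a stand-in for oci.exceptions.ServiceError; NoCapacityError is a stand-in
# for dstack's error type.
class ServiceError(Exception):
    def __init__(self, code: str, message: str) -> None:
        self.code = code
        self.message = message


class NoCapacityError(Exception):
    """Signals a retryable capacity shortage."""


def raise_mapped(e: ServiceError) -> None:
    # Quota/limit codes and the "Out of host capacity" message are all
    # treated as "no capacity"; anything else propagates unchanged.
    if e.code in ("LimitExceeded", "QuotaExceeded") or "Out of host capacity" in e.message:
        raise NoCapacityError(e.message)
    raise e
```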
dstack/_internal/core/backends/runpod/compute.py:

@@ -5,8 +5,10 @@ from typing import List, Optional
 
 from dstack._internal.core.backends.base import Compute
 from dstack._internal.core.backends.base.compute import (
+    generate_unique_instance_name,
+    generate_unique_volume_name,
     get_docker_commands,
-
+    get_job_instance_name,
 )
 from dstack._internal.core.backends.base.offers import get_catalog_offers
 from dstack._internal.core.backends.runpod.api_client import RunpodApiClient
@@ -31,6 +33,9 @@ from dstack._internal.utils.logging import get_logger
 
 logger = get_logger(__name__)
 
+# Undocumented but names of len 60 work
+MAX_RESOURCE_NAME_LEN = 60
+
 CONTAINER_REGISTRY_AUTH_CLEANUP_INTERVAL = 60 * 60 * 24  # 24 hour
 
 
@@ -69,7 +74,7 @@ class RunpodCompute(Compute):
     ) -> JobProvisioningData:
         instance_config = InstanceConfiguration(
             project_name=run.project_name,
-            instance_name=
+            instance_name=get_job_instance_name(run, job),
             ssh_keys=[
                 SSHKey(public=run.run_spec.ssh_key_pub.strip()),
                 SSHKey(public=project_ssh_public_key.strip()),
@@ -77,6 +82,7 @@ class RunpodCompute(Compute):
             user=run.user,
         )
 
+        pod_name = generate_unique_instance_name(instance_config, max_length=MAX_RESOURCE_NAME_LEN)
        authorized_keys = instance_config.get_public_keys()
        memory_size = round(instance_offer.instance.resources.memory_mib / 1024)
        disk_size = round(instance_offer.instance.resources.disk.size_mib / 1024)
@@ -98,7 +104,7 @@ class RunpodCompute(Compute):
         bid_per_gpu = instance_offer.price / gpu_count
 
         resp = self.api_client.create_pod(
-            name=
+            name=pod_name,
             image_name=job.job_spec.image_name,
             gpu_type_id=instance_offer.instance.name,
             cloud_type="SECURE",  # ["ALL", "COMMUNITY", "SECURE"]:
@@ -197,9 +203,10 @@ class RunpodCompute(Compute):
         )
 
     def create_volume(self, volume: Volume) -> VolumeProvisioningData:
+        volume_name = generate_unique_volume_name(volume, max_length=MAX_RESOURCE_NAME_LEN)
         size_gb = volume.configuration.size_gb
         volume_id = self.api_client.create_network_volume(
-            name=
+            name=volume_name,
             region=volume.configuration.region,
             size=size_gb,
         )
dstack/_internal/core/backends/tensordock/compute.py:

@@ -4,7 +4,11 @@ from typing import List, Optional
 import requests
 
 from dstack._internal.core.backends.base import Compute
-from dstack._internal.core.backends.base.compute import
+from dstack._internal.core.backends.base.compute import (
+    generate_unique_instance_name,
+    get_job_instance_name,
+    get_shim_commands,
+)
 from dstack._internal.core.backends.base.offers import get_catalog_offers
 from dstack._internal.core.backends.tensordock.api_client import TensorDockAPIClient
 from dstack._internal.core.backends.tensordock.config import TensorDockConfig
@@ -23,6 +27,10 @@ from dstack._internal.utils.logging import get_logger
 logger = get_logger(__name__)
 
 
+# Undocumented but names of len 60 work
+MAX_INSTANCE_NAME_LEN = 60
+
+
 class TensorDockCompute(Compute):
     def __init__(self, config: TensorDockConfig):
         super().__init__()
@@ -49,10 +57,13 @@ class TensorDockCompute(Compute):
         instance_offer: InstanceOfferWithAvailability,
         instance_config: InstanceConfiguration,
     ) -> JobProvisioningData:
+        instance_name = generate_unique_instance_name(
+            instance_config, max_length=MAX_INSTANCE_NAME_LEN
+        )
         commands = get_shim_commands(authorized_keys=instance_config.get_public_keys())
         try:
             resp = self.api_client.deploy_single(
-                instance_name=
+                instance_name=instance_name,
                 instance=instance_offer.instance,
                 cloudinit={
                     "ssh_pwauth": False,  # disable password auth
@@ -113,7 +124,7 @@ class TensorDockCompute(Compute):
     ) -> JobProvisioningData:
         instance_config = InstanceConfiguration(
             project_name=run.project_name,
-            instance_name=
+            instance_name=get_job_instance_name(run, job),  # TODO: generate name
             ssh_keys=[
                 SSHKey(public=run.run_spec.ssh_key_pub.strip()),
                 SSHKey(public=project_ssh_public_key.strip()),
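TensorDock, Vast.ai, and RunPod now cap provider-visible names at 60 characters ("Undocumented but names of len 60 work"), and Vultr at 64, passing `max_length` into the new `generate_unique_instance_name` helpers. The helper bodies are not shown in this diff; the sketch below is a hypothetical illustration of the usual pattern (truncate a base name, then append a random suffix so truncation cannot cause collisions). The function body and suffix scheme are assumptions, not dstack's actual implementation in `dstack/_internal/core/backends/base/compute.py`:

```python
import uuid


# Hypothetical sketch of a unique-name helper like the ones imported above;
# dstack's real generate_unique_instance_name may differ in detail.
def generate_unique_name(base_name: str, max_length: int = 60) -> str:
    suffix = uuid.uuid4().hex[:8]  # assumed 8-char random suffix
    # Truncate the base so "<head>-<suffix>" always fits within max_length.
    head = base_name[: max_length - len(suffix) - 1]
    return f"{head}-{suffix}"


print(generate_unique_name("my-dev-environment-instance", max_length=60))
# e.g. "my-dev-environment-instance-3f9c01ab"
```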
dstack/_internal/core/backends/vastai/compute.py:

@@ -4,7 +4,10 @@ import gpuhunt
 from gpuhunt.providers.vastai import VastAIProvider
 
 from dstack._internal.core.backends.base import Compute
-from dstack._internal.core.backends.base.compute import
+from dstack._internal.core.backends.base.compute import (
+    generate_unique_instance_name_for_job,
+    get_docker_commands,
+)
 from dstack._internal.core.backends.base.offers import get_catalog_offers
 from dstack._internal.core.backends.vastai.api_client import VastAIAPIClient
 from dstack._internal.core.backends.vastai.config import VastAIConfig
@@ -23,6 +26,10 @@ from dstack._internal.utils.logging import get_logger
 logger = get_logger(__name__)
 
 
+# Undocumented but names of len 60 work
+MAX_INSTANCE_NAME_LEN = 60
+
+
 class VastAICompute(Compute):
     def __init__(self, config: VastAIConfig):
         super().__init__()
@@ -70,11 +77,14 @@ class VastAICompute(Compute):
         project_ssh_private_key: str,
         volumes: List[Volume],
     ) -> JobProvisioningData:
+        instance_name = generate_unique_instance_name_for_job(
+            run, job, max_length=MAX_INSTANCE_NAME_LEN
+        )
         commands = get_docker_commands(
             [run.run_spec.ssh_key_pub.strip(), project_ssh_public_key.strip()]
         )
         resp = self.api_client.create_instance(
-            instance_name=
+            instance_name=instance_name,
             bundle_id=instance_offer.instance.name,
             image_name=job.job_spec.image_name,
             onstart=" && ".join(commands),
dstack/_internal/core/backends/vultr/api_client.py:

@@ -20,7 +20,7 @@ class VultrApiClient:
             return False
         return True
 
-    def get_instance(self, instance_id: str, plan_type: str):
+    def get_instance(self, instance_id: str, plan_type: str) -> dict:
         if plan_type == "bare-metal":
             response = self._make_request("GET", f"/bare-metals/{instance_id}")
             return response.json()["bare_metal"]
@@ -28,7 +28,7 @@ class VultrApiClient:
         response = self._make_request("GET", f"/instances/{instance_id}")
         return response.json()["instance"]
 
-    def get_vpc_for_region(self, region: str) -> Optional[
+    def get_vpc_for_region(self, region: str) -> Optional[dict]:
         response = self._make_request("GET", "/vpcs?per_page=500")
         vpcs = response.json().get("vpcs", [])
         if vpcs:
@@ -37,7 +37,7 @@ class VultrApiClient:
                     return vpc
         return None
 
-    def create_vpc(self, region: str):
+    def create_vpc(self, region: str) -> dict:
         data = {"region": region, "description": f"dstack-vpc-{region}"}
         response = self._make_request("POST", "/vpcs", data=data)
         return response.json()["vpc"]
dstack/_internal/core/backends/vultr/compute.py:

@@ -6,7 +6,8 @@ import requests
 
 from dstack._internal.core.backends.base import Compute
 from dstack._internal.core.backends.base.compute import (
-
+    generate_unique_instance_name,
+    get_job_instance_name,
     get_user_data,
 )
 from dstack._internal.core.backends.base.offers import get_catalog_offers
@@ -27,6 +28,8 @@ from dstack._internal.utils.logging import get_logger
 
 logger = get_logger(__name__)
 
+MAX_INSTANCE_NAME_LEN = 64
+
 
 class VultrCompute(Compute):
     def __init__(self, config: VultrConfig):
@@ -62,7 +65,7 @@ class VultrCompute(Compute):
     ) -> JobProvisioningData:
         instance_config = InstanceConfiguration(
             project_name=run.project_name,
-            instance_name=
+            instance_name=get_job_instance_name(run, job),
             ssh_keys=[SSHKey(public=project_ssh_public_key.strip())],
             user=run.user,
         )
@@ -71,6 +74,9 @@ class VultrCompute(Compute):
     def create_instance(
         self, instance_offer: InstanceOfferWithAvailability, instance_config: InstanceConfiguration
     ) -> JobProvisioningData:
+        instance_name = generate_unique_instance_name(
+            instance_config, max_length=MAX_INSTANCE_NAME_LEN
+        )
         # create vpc
         vpc = self.api_client.get_vpc_for_region(instance_offer.region)
         if not vpc:
@@ -85,7 +91,7 @@ class VultrCompute(Compute):
         ]
         instance_id = self.api_client.launch_instance(
             region=instance_offer.region,
-            label=
+            label=instance_name,
             plan=instance_offer.instance.name,
             user_data=get_user_data(
                 authorized_keys=instance_config.get_public_keys(),
dstack/_internal/core/models/backends/aws.py:

@@ -32,6 +32,7 @@ class AWSConfigInfo(CoreModel):
     vpc_ids: Optional[Dict[str, str]] = None
     default_vpcs: Optional[bool] = None
     public_ips: Optional[bool] = None
+    iam_instance_profile: Optional[str] = None
     tags: Optional[Dict[str, str]] = None
     os_images: Optional[AWSOSImageConfig] = None
 
@@ -70,6 +71,7 @@ class AWSConfigInfoWithCredsPartial(CoreModel):
     vpc_ids: Optional[Dict[str, str]]
     default_vpcs: Optional[bool]
     public_ips: Optional[bool]
+    iam_instance_profile: Optional[str]
     tags: Optional[Dict[str, str]]
     os_images: Optional["AWSOSImageConfig"]
 
dstack/_internal/core/models/backends/base.py:

@@ -15,6 +15,7 @@ class BackendType(str, enum.Enum):
         DATACRUNCH (BackendType): DataCrunch
         KUBERNETES (BackendType): Kubernetes
         LAMBDA (BackendType): Lambda Cloud
+        OCI (BackendType): Oracle Cloud Infrastructure
         RUNPOD (BackendType): Runpod Cloud
         TENSORDOCK (BackendType): TensorDock Marketplace
         VASTAI (BackendType): Vast.ai Marketplace
dstack/_internal/core/models/runs.py:

@@ -150,9 +150,9 @@ class JobTerminationReason(str, Enum):
 class Requirements(CoreModel):
     # TODO: Make requirements' fields required
     resources: ResourcesSpec
-    max_price: Optional[float]
-    spot: Optional[bool]
-    reservation: Optional[str]
+    max_price: Optional[float] = None
+    spot: Optional[bool] = None
+    reservation: Optional[str] = None
 
     def pretty_format(self, resources_only: bool = False):
         res = self.resources.pretty_format()
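The explicit `= None` defaults are more than cosmetic: depending on the pydantic version backing `CoreModel`, an `Optional[...]` annotation without a default can make the field required (pydantic v2 behaves this way), so constructing `Requirements(resources=...)` alone could fail validation. With explicit defaults the fields are optional either way. A standalone sketch using plain pydantic rather than dstack's `CoreModel`:

```python
from typing import Optional

from pydantic import BaseModel


class RequirementsSketch(BaseModel):
    # With explicit defaults, these fields may be omitted at construction time
    # on both pydantic v1 and v2.
    max_price: Optional[float] = None
    spot: Optional[bool] = None
    reservation: Optional[str] = None


r = RequirementsSketch()  # validates; all three fields default to None
assert r.max_price is None and r.spot is None and r.reservation is None
```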
dstack/_internal/core/models/volumes.py:

@@ -71,6 +71,18 @@ class VolumeAttachmentData(CoreModel):
     device_name: Optional[str] = None
 
 
+class VolumeInstance(CoreModel):
+    name: str
+    fleet_name: Optional[str] = None
+    instance_num: int
+    instance_id: Optional[str] = None
+
+
+class VolumeAttachment(CoreModel):
+    instance: VolumeInstance
+    attachment_data: Optional[VolumeAttachmentData] = None
+
+
 class Volume(CoreModel):
     id: uuid.UUID
     name: str
@@ -86,8 +98,19 @@ class Volume(CoreModel):
     deleted: bool
     volume_id: Optional[str] = None  # id of the volume in the cloud
     provisioning_data: Optional[VolumeProvisioningData] = None
+    attachments: Optional[List[VolumeAttachment]] = None
+    # attachment_data is deprecated in favor of attachments.
+    # It's only set for volumes that were attached before attachments.
     attachment_data: Optional[VolumeAttachmentData] = None
 
+    def get_attachment_data_for_instance(self, instance_id: str) -> Optional[VolumeAttachmentData]:
+        if self.attachments is not None:
+            for attachment in self.attachments:
+                if attachment.instance.instance_id == instance_id:
+                    return attachment.attachment_data
+        # volume was attached before attachments were introduced
+        return self.attachment_data
+
 
 class VolumePlan(CoreModel):
     project_name: str
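`get_attachment_data_for_instance` is the compatibility shim for the schema change above (and for the `a751ef183f27` migration in the file list): new-style volumes carry one `VolumeAttachment` per instance, while volumes attached before this release only have the legacy top-level `attachment_data`. A self-contained sketch of the lookup, using plain dataclasses instead of dstack's `CoreModel` so it runs on its own:

```python
from dataclasses import dataclass, field
from typing import List, Optional


@dataclass
class AttachmentData:
    device_name: Optional[str] = None


@dataclass
class Instance:
    instance_id: Optional[str] = None


@dataclass
class Attachment:
    instance: Instance
    attachment_data: Optional[AttachmentData] = None


@dataclass
class VolumeSketch:
    attachments: Optional[List[Attachment]] = None
    attachment_data: Optional[AttachmentData] = None  # legacy field

    def get_attachment_data_for_instance(self, instance_id: str) -> Optional[AttachmentData]:
        if self.attachments is not None:
            for attachment in self.attachments:
                if attachment.instance.instance_id == instance_id:
                    return attachment.attachment_data
        # Volume attached before per-instance attachments existed: legacy fallback.
        return self.attachment_data


new_style = VolumeSketch(attachments=[Attachment(Instance("i-123"), AttachmentData("/dev/sdf"))])
legacy = VolumeSketch(attachment_data=AttachmentData("/dev/xvdb"))
assert new_style.get_attachment_data_for_instance("i-123").device_name == "/dev/sdf"
assert legacy.get_attachment_data_for_instance("i-123").device_name == "/dev/xvdb"
```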
dstack/_internal/core/services/__init__.py:

@@ -4,5 +4,9 @@ from dstack._internal.core.errors import ServerClientError
 
 
 def validate_dstack_resource_name(resource_name: str):
-    if not
+    if not is_valid_dstack_resource_name(resource_name):
         raise ServerClientError("Resource name should match regex '^[a-z][a-z0-9-]{1,40}$'")
+
+
+def is_valid_dstack_resource_name(resource_name: str) -> bool:
+    return re.match("^[a-z][a-z0-9-]{1,40}$", resource_name) is not None
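Splitting out `is_valid_dstack_resource_name` gives callers a boolean check without raising `ServerClientError` (the reworked `dstack/_internal/server/services/volumes.py` in the file list is a plausible consumer, though that usage isn't shown here). The pattern `^[a-z][a-z0-9-]{1,40}$` accepts names of 2 to 41 characters that start with a lowercase letter and contain only lowercase letters, digits, and hyphens:

```python
import re

# The pattern from the diff above, with examples of what it accepts and rejects.
NAME_RE = re.compile(r"^[a-z][a-z0-9-]{1,40}$")

assert NAME_RE.match("my-volume-1")    # ok
assert NAME_RE.match("a1")             # ok: minimum two characters
assert not NAME_RE.match("a")          # too short: 1 leading letter + {1,40} more
assert not NAME_RE.match("1volume")    # must start with a lowercase letter
assert not NAME_RE.match("My-Volume")  # uppercase not allowed
assert not NAME_RE.match("x" * 42)     # too long: at most 41 characters
```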
dstack/_internal/core/services/configs/__init__.py:

@@ -65,6 +65,9 @@ class ConfigManager:
         if len(self.config.projects) == 1:
             self.config.projects[0].default = True
 
+    def list_projects(self):
+        return [project.name for project in self.config.projects]
+
     def delete_project(self, name: str):
         self.config.projects = [p for p in self.config.projects if p.name != name]
 
dstack/_internal/server/background/tasks/common.py (new file):

@@ -0,0 +1,22 @@
+from datetime import timedelta
+
+from dstack._internal.core.models.backends.base import BackendType
+
+
+def get_provisioning_timeout(backend_type: BackendType, instance_type_name: str) -> timedelta:
+    """
+    This timeout is used in a few places, but roughly refers to the max time between
+    requesting instance creation and the instance becoming ready to accept jobs.
+    For container-based backends, this also includes the image pulling time.
+    """
+    if backend_type == BackendType.LAMBDA:
+        return timedelta(minutes=30)
+    if backend_type == BackendType.RUNPOD:
+        return timedelta(minutes=20)
+    if backend_type == BackendType.KUBERNETES:
+        return timedelta(minutes=20)
+    if backend_type == BackendType.OCI and instance_type_name.startswith("BM."):
+        return timedelta(minutes=20)
+    if backend_type == BackendType.VULTR and instance_type_name.startswith("vbm"):
+        return timedelta(minutes=55)
+    return timedelta(minutes=10)
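`get_provisioning_timeout` consolidates the two near-duplicate timeout tables that the hunks below delete (`_get_instance_timeout_interval` in process_instances.py and `_get_runner_timeout_interval` in process_running_jobs.py), which previously had to be kept in sync via cross-referencing comments; the merged table also raises Lambda's allowance from 20 to 30 minutes. Its typical use, as in `_get_provisioning_deadline` below, is turning an instance's start time into a deadline. The shape name here is just an illustrative bare-metal example:

```python
from datetime import datetime, timezone

from dstack._internal.core.models.backends.base import BackendType
from dstack._internal.server.background.tasks.common import get_provisioning_timeout

started_at = datetime(2025, 1, 1, 12, 0, tzinfo=timezone.utc)  # illustrative value
deadline = started_at + get_provisioning_timeout(
    backend_type=BackendType.OCI,
    instance_type_name="BM.GPU4.8",  # "BM." shapes get the longer 20-minute timeout
)
assert deadline == datetime(2025, 1, 1, 12, 20, tzinfo=timezone.utc)
```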
dstack/_internal/server/background/tasks/process_instances.py:

@@ -45,7 +45,6 @@ from dstack._internal.core.models.instances import (
     InstanceOfferWithAvailability,
     InstanceRuntime,
     InstanceStatus,
-    InstanceType,
     RemoteConnectionInfo,
     SSHKey,
 )
@@ -63,6 +62,7 @@ from dstack._internal.core.models.runs import (
     Retry,
 )
 from dstack._internal.core.services.profiles import get_retry
+from dstack._internal.server.background.tasks.common import get_provisioning_timeout
 from dstack._internal.server.db import get_session_ctx
 from dstack._internal.server.models import (
     FleetModel,
@@ -507,9 +507,9 @@ async def _create_instance(session: AsyncSession, instance: InstanceModel) -> None:
         project=instance.project,
         profile=profile,
         requirements=requirements,
-        exclude_not_available=True,
         fleet_model=instance.fleet,
         blocks="auto" if instance.total_blocks is None else instance.total_blocks,
+        exclude_not_available=True,
     )
 
     if not offers and should_retry:
@@ -695,7 +695,8 @@ async def _check_instance(instance: InstanceModel) -> None:
 
     if instance.status == InstanceStatus.PROVISIONING and instance.started_at is not None:
         provisioning_deadline = _get_provisioning_deadline(
-            instance,
+            instance=instance,
+            job_provisioning_data=job_provisioning_data,
         )
         if get_current_datetime() > provisioning_deadline:
             instance.status = InstanceStatus.TERMINATING
@@ -737,7 +738,8 @@ async def _wait_for_instance_provisioning_data(
             instance.name,
         )
         provisioning_deadline = _get_provisioning_deadline(
-            instance,
+            instance=instance,
+            job_provisioning_data=job_provisioning_data,
         )
        if get_current_datetime() > provisioning_deadline:
            logger.warning(
@@ -915,9 +917,8 @@ def _get_instance_offer_for_instance(
         instance_offer.availability_zones = [
             z
             for z in instance_offer.availability_zones
-            if
+            if z == master_job_provisioning_data.availability_zone
         ]
-
     return instance_offer
 
 
@@ -960,24 +961,15 @@ def _get_retry_duration_deadline(instance: InstanceModel, retry: Retry) -> datetime.datetime:
 
 
 def _get_provisioning_deadline(
-    instance: InstanceModel,
+    instance: InstanceModel,
+    job_provisioning_data: JobProvisioningData,
 ) -> datetime.datetime:
-    timeout_interval =
+    timeout_interval = get_provisioning_timeout(
+        backend_type=job_provisioning_data.get_base_backend(),
+        instance_type_name=job_provisioning_data.instance_type.name,
+    )
     return instance.started_at.replace(tzinfo=datetime.timezone.utc) + timeout_interval
 
 
-def _get_instance_timeout_interval(
-    backend_type: BackendType, instance_type_name: str
-) -> timedelta:
-    # when changing timeouts, also consider process_running_jobs._get_runner_timeout_interval
-    if backend_type == BackendType.RUNPOD:
-        return timedelta(seconds=1200)
-    if backend_type == BackendType.OCI and instance_type_name.startswith("BM."):
-        return timedelta(seconds=1200)
-    if backend_type == BackendType.VULTR and instance_type_name.startswith("vbm"):
-        return timedelta(seconds=3300)
-    return timedelta(seconds=600)
-
-
 def _ssh_keys_to_pkeys(ssh_keys: list[SSHKey]) -> list[PKey]:
     return [pkey_from_str(sk.private) for sk in ssh_keys if sk.private is not None]
dstack/_internal/server/background/tasks/process_running_jobs.py:

@@ -1,5 +1,4 @@
 import asyncio
-from datetime import timedelta
 from typing import Dict, List, Optional
 
 from sqlalchemy import select
@@ -21,6 +20,7 @@ from dstack._internal.core.models.runs import (
     ClusterInfo,
     Job,
     JobProvisioningData,
+    JobRuntimeData,
     JobSpec,
     JobStatus,
     JobTerminationReason,
@@ -28,6 +28,7 @@ from dstack._internal.core.models.runs import (
     RunSpec,
 )
 from dstack._internal.core.models.volumes import InstanceMountPoint, Volume, VolumeMountPoint
+from dstack._internal.server.background.tasks.common import get_provisioning_timeout
 from dstack._internal.server.db import get_session_ctx
 from dstack._internal.server.models import (
     InstanceModel,
@@ -148,6 +149,7 @@ async def _process_running_job(session: AsyncSession, job_model: JobModel):
             jobs=run.jobs,
             replica_num=job.job_spec.replica_num,
             job_provisioning_data=job_provisioning_data,
+            job_runtime_data=job_submission.job_runtime_data,
         )
 
         volumes = await get_job_attached_volumes(
@@ -205,6 +207,7 @@ async def _process_running_job(session: AsyncSession, job_model: JobModel):
                 None,
                 run,
                 job_model,
+                job_provisioning_data,
                 volumes,
                 secrets,
                 job.job_spec.registry_auth,
@@ -241,7 +244,7 @@ async def _process_running_job(session: AsyncSession, job_model: JobModel):
 
         if not success:
             # check timeout
-            if job_submission.age >
+            if job_submission.age > get_provisioning_timeout(
                 backend_type=job_provisioning_data.get_base_backend(),
                 instance_type_name=job_provisioning_data.instance_type.name,
             ):
@@ -376,6 +379,7 @@ def _process_provisioning_with_shim(
     ports: Dict[int, int],
     run: Run,
     job_model: JobModel,
+    job_provisioning_data: JobProvisioningData,
     volumes: List[Volume],
     secrets: Dict[str, str],
     registry_auth: Optional[RegistryAuth],
@@ -459,6 +463,7 @@ def _process_provisioning_with_shim(
             host_ssh_user=ssh_user,
             host_ssh_keys=[ssh_key] if ssh_key else [],
             container_ssh_keys=public_keys,
+            instance_id=job_provisioning_data.instance_id,
         )
     else:
         submitted = shim_client.submit(
@@ -475,6 +480,7 @@ def _process_provisioning_with_shim(
             mounts=volume_mounts,
             volumes=volumes,
             instance_mounts=instance_mounts,
+            instance_id=job_provisioning_data.instance_id,
         )
         if not submitted:
             # This can happen when we lost connection to the runner (e.g., network issues), marked
@@ -667,6 +673,7 @@ def _get_cluster_info(
     jobs: List[Job],
     replica_num: int,
     job_provisioning_data: JobProvisioningData,
+    job_runtime_data: Optional[JobRuntimeData],
 ) -> ClusterInfo:
     job_ips = []
     for job in jobs:
@@ -677,10 +684,13 @@ def _get_cluster_info(
             ).internal_ip
             or ""
         )
+    gpus_per_job = len(job_provisioning_data.instance_type.resources.gpus)
+    if job_runtime_data is not None and job_runtime_data.offer is not None:
+        gpus_per_job = len(job_runtime_data.offer.instance.resources.gpus)
     cluster_info = ClusterInfo(
         job_ips=job_ips,
         master_job_ip=job_ips[0],
-        gpus_per_job=
+        gpus_per_job=gpus_per_job,
     )
     return cluster_info
 
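The new `gpus_per_job` logic prefers the job's runtime offer over the instance type, presumably because a job on a shared instance (one split into blocks, per the `total_blocks` handling in process_instances.py above) is entitled to only its slice of the GPUs, which the runtime offer records. A toy reproduction of the selection, with the attribute access flattened into plain counts:

```python
from typing import Optional


# Toy form of the gpus_per_job selection above: prefer the job's runtime
# offer (its share of a possibly shared instance); fall back to the full
# instance type when no runtime offer was recorded.
def gpus_per_job(instance_type_gpu_count: int, offer_gpu_count: Optional[int]) -> int:
    if offer_gpu_count is not None:
        return offer_gpu_count
    return instance_type_gpu_count


assert gpus_per_job(8, None) == 8  # whole instance
assert gpus_per_job(8, 2) == 2     # job's block-limited share
```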
@@ -759,16 +769,3 @@ def _submit_job_to_runner(
     # do not log here, because the runner will send a new status
 
     return True
-
-
-def _get_runner_timeout_interval(backend_type: BackendType, instance_type_name: str) -> timedelta:
-    # when changing timeouts, also consider process_instances._get_instance_timeout_interval
-    if backend_type == BackendType.LAMBDA:
-        return timedelta(seconds=1200)
-    if backend_type == BackendType.KUBERNETES:
-        return timedelta(seconds=1200)
-    if backend_type == BackendType.OCI and instance_type_name.startswith("BM."):
-        return timedelta(seconds=1200)
-    if backend_type == BackendType.VULTR and instance_type_name.startswith("vbm"):
-        return timedelta(seconds=3300)
-    return timedelta(seconds=600)