dstack 0.18.42__py3-none-any.whl → 0.18.44__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of dstack might be problematic.
- dstack/_internal/cli/commands/__init__.py +2 -1
- dstack/_internal/cli/commands/apply.py +4 -2
- dstack/_internal/cli/commands/attach.py +21 -1
- dstack/_internal/cli/commands/completion.py +20 -0
- dstack/_internal/cli/commands/delete.py +3 -1
- dstack/_internal/cli/commands/fleet.py +2 -1
- dstack/_internal/cli/commands/gateway.py +7 -2
- dstack/_internal/cli/commands/logs.py +3 -2
- dstack/_internal/cli/commands/stats.py +2 -1
- dstack/_internal/cli/commands/stop.py +2 -1
- dstack/_internal/cli/commands/volume.py +2 -1
- dstack/_internal/cli/main.py +6 -0
- dstack/_internal/cli/services/completion.py +86 -0
- dstack/_internal/cli/services/configurators/run.py +11 -17
- dstack/_internal/cli/utils/fleet.py +5 -1
- dstack/_internal/cli/utils/run.py +11 -0
- dstack/_internal/core/backends/aws/compute.py +23 -10
- dstack/_internal/core/backends/aws/resources.py +3 -3
- dstack/_internal/core/backends/azure/compute.py +15 -9
- dstack/_internal/core/backends/azure/resources.py +2 -0
- dstack/_internal/core/backends/base/compute.py +102 -2
- dstack/_internal/core/backends/base/offers.py +7 -1
- dstack/_internal/core/backends/cudo/compute.py +8 -4
- dstack/_internal/core/backends/datacrunch/compute.py +10 -4
- dstack/_internal/core/backends/gcp/auth.py +19 -13
- dstack/_internal/core/backends/gcp/compute.py +26 -20
- dstack/_internal/core/backends/gcp/resources.py +3 -10
- dstack/_internal/core/backends/kubernetes/compute.py +4 -3
- dstack/_internal/core/backends/lambdalabs/compute.py +9 -3
- dstack/_internal/core/backends/nebius/compute.py +2 -2
- dstack/_internal/core/backends/oci/compute.py +10 -4
- dstack/_internal/core/backends/runpod/compute.py +32 -7
- dstack/_internal/core/backends/runpod/config.py +8 -0
- dstack/_internal/core/backends/tensordock/compute.py +14 -3
- dstack/_internal/core/backends/vastai/compute.py +12 -2
- dstack/_internal/core/backends/vultr/api_client.py +3 -3
- dstack/_internal/core/backends/vultr/compute.py +9 -3
- dstack/_internal/core/models/backends/aws.py +2 -0
- dstack/_internal/core/models/backends/base.py +1 -0
- dstack/_internal/core/models/backends/runpod.py +2 -0
- dstack/_internal/core/models/configurations.py +2 -2
- dstack/_internal/core/models/profiles.py +46 -1
- dstack/_internal/core/models/runs.py +4 -0
- dstack/_internal/core/services/__init__.py +5 -1
- dstack/_internal/core/services/configs/__init__.py +3 -0
- dstack/_internal/server/app.py +11 -1
- dstack/_internal/server/background/__init__.py +10 -0
- dstack/_internal/server/background/tasks/common.py +22 -0
- dstack/_internal/server/background/tasks/process_instances.py +11 -18
- dstack/_internal/server/background/tasks/process_placement_groups.py +1 -0
- dstack/_internal/server/background/tasks/process_prometheus_metrics.py +135 -0
- dstack/_internal/server/background/tasks/process_running_jobs.py +74 -34
- dstack/_internal/server/background/tasks/process_runs.py +1 -0
- dstack/_internal/server/background/tasks/process_submitted_jobs.py +4 -1
- dstack/_internal/server/background/tasks/process_terminating_jobs.py +1 -7
- dstack/_internal/server/migrations/versions/60e444118b6d_add_jobprometheusmetrics.py +40 -0
- dstack/_internal/server/migrations/versions/98d1b92988bc_add_jobterminationreason_terminated_due_.py +140 -0
- dstack/_internal/server/models.py +11 -0
- dstack/_internal/server/routers/logs.py +3 -0
- dstack/_internal/server/routers/metrics.py +21 -2
- dstack/_internal/server/routers/prometheus.py +36 -0
- dstack/_internal/server/security/permissions.py +1 -1
- dstack/_internal/server/services/backends/configurators/aws.py +31 -1
- dstack/_internal/server/services/backends/configurators/gcp.py +8 -15
- dstack/_internal/server/services/backends/configurators/runpod.py +3 -33
- dstack/_internal/server/services/config.py +24 -4
- dstack/_internal/server/services/fleets.py +1 -0
- dstack/_internal/server/services/gateways/__init__.py +1 -0
- dstack/_internal/server/services/jobs/__init__.py +12 -9
- dstack/_internal/server/services/jobs/configurators/base.py +9 -1
- dstack/_internal/server/services/jobs/configurators/dev.py +1 -3
- dstack/_internal/server/services/jobs/configurators/task.py +1 -3
- dstack/_internal/server/services/logs/__init__.py +78 -0
- dstack/_internal/server/services/{logs.py → logs/aws.py} +12 -207
- dstack/_internal/server/services/logs/base.py +47 -0
- dstack/_internal/server/services/logs/filelog.py +110 -0
- dstack/_internal/server/services/logs/gcp.py +165 -0
- dstack/_internal/server/services/metrics.py +103 -70
- dstack/_internal/server/services/pools.py +16 -17
- dstack/_internal/server/services/prometheus.py +87 -0
- dstack/_internal/server/services/proxy/routers/service_proxy.py +14 -7
- dstack/_internal/server/services/runner/client.py +14 -3
- dstack/_internal/server/services/runs.py +43 -15
- dstack/_internal/server/services/volumes.py +1 -0
- dstack/_internal/server/settings.py +6 -0
- dstack/_internal/server/statics/index.html +1 -1
- dstack/_internal/server/statics/{main-2ac66bfcbd2e39830b88.js → main-4eb116b97819badd1e2c.js} +131 -78
- dstack/_internal/server/statics/{main-2ac66bfcbd2e39830b88.js.map → main-4eb116b97819badd1e2c.js.map} +1 -1
- dstack/_internal/server/statics/{main-ad5150a441de98cd8987.css → main-da9f8c06a69c20dac23e.css} +1 -1
- dstack/_internal/server/statics/static/media/entraID.d65d1f3e9486a8e56d24fc07b3230885.svg +9 -0
- dstack/_internal/server/testing/common.py +50 -8
- dstack/api/_public/runs.py +4 -1
- dstack/api/server/_fleets.py +2 -0
- dstack/api/server/_runs.py +4 -0
- dstack/api/utils.py +3 -0
- dstack/version.py +2 -2
- {dstack-0.18.42.dist-info → dstack-0.18.44.dist-info}/METADATA +13 -3
- {dstack-0.18.42.dist-info → dstack-0.18.44.dist-info}/RECORD +115 -97
- tests/_internal/core/backends/base/__init__.py +0 -0
- tests/_internal/core/backends/base/test_compute.py +56 -0
- tests/_internal/server/background/tasks/test_process_prometheus_metrics.py +189 -0
- tests/_internal/server/background/tasks/test_process_running_jobs.py +126 -1
- tests/_internal/server/conftest.py +4 -5
- tests/_internal/server/routers/test_backends.py +1 -0
- tests/_internal/server/routers/test_fleets.py +2 -0
- tests/_internal/server/routers/test_logs.py +1 -1
- tests/_internal/server/routers/test_metrics.py +15 -0
- tests/_internal/server/routers/test_prometheus.py +244 -0
- tests/_internal/server/routers/test_runs.py +81 -58
- tests/_internal/server/services/test_logs.py +3 -3
- tests/_internal/server/services/test_metrics.py +163 -0
- {dstack-0.18.42.dist-info → dstack-0.18.44.dist-info}/LICENSE.md +0 -0
- {dstack-0.18.42.dist-info → dstack-0.18.44.dist-info}/WHEEL +0 -0
- {dstack-0.18.42.dist-info → dstack-0.18.44.dist-info}/entry_points.txt +0 -0
- {dstack-0.18.42.dist-info → dstack-0.18.44.dist-info}/top_level.txt +0 -0
```diff
@@ -6,7 +6,7 @@ from typing import List, Optional
 import dstack.version as version
 from dstack._internal import settings
 from dstack._internal.core.backends.base import Compute
-from dstack._internal.core.backends.base.compute import
+from dstack._internal.core.backends.base.compute import get_job_instance_name, get_user_data
 from dstack._internal.core.backends.base.offers import get_catalog_offers
 from dstack._internal.core.backends.nebius.api_client import NebiusAPIClient
 from dstack._internal.core.backends.nebius.config import NebiusConfig
@@ -130,7 +130,7 @@ class NebiusCompute(Compute):
     ) -> JobProvisioningData:
         instance_config = InstanceConfiguration(
             project_name=run.project_name,
-            instance_name=
+            instance_name=get_job_instance_name(run, job),  # TODO: generate name
             ssh_keys=[
                 SSHKey(public=project_ssh_public_key.strip()),
             ],
```
```diff
@@ -4,7 +4,12 @@ from typing import List, Optional
 
 import oci
 
-from dstack._internal.core.backends.base.compute import
+from dstack._internal.core.backends.base.compute import (
+    Compute,
+    generate_unique_instance_name,
+    get_job_instance_name,
+    get_user_data,
+)
 from dstack._internal.core.backends.base.offers import get_catalog_offers
 from dstack._internal.core.backends.oci import resources
 from dstack._internal.core.backends.oci.config import OCIConfig
@@ -98,7 +103,7 @@ class OCICompute(Compute):
     ) -> JobProvisioningData:
         instance_config = InstanceConfiguration(
             project_name=run.project_name,
-            instance_name=
+            instance_name=get_job_instance_name(run, job),
             ssh_keys=[SSHKey(public=project_ssh_public_key.strip())],
             user=run.user,
         )
@@ -148,6 +153,7 @@ class OCICompute(Compute):
         ]
         cloud_init_user_data = get_user_data(instance_config.get_public_keys(), setup_commands)
 
+        display_name = generate_unique_instance_name(instance_config)
         try:
             instance = resources.launch_instance(
                 region=region,
@@ -155,7 +161,7 @@ class OCICompute(Compute):
                 compartment_id=self.config.compartment_id,
                 subnet_id=subnet.id,
                 security_group_id=security_group.id,
-                display_name=
+                display_name=display_name,
                 cloud_init_user_data=cloud_init_user_data,
                 shape=instance_offer.instance.name,
                 is_spot=instance_offer.instance.resources.spot,
@@ -163,7 +169,7 @@ class OCICompute(Compute):
                 image_id=package.image_id,
             )
         except oci.exceptions.ServiceError as e:
-            if e.code in ("LimitExceeded", "QuotaExceeded"):
+            if e.code in ("LimitExceeded", "QuotaExceeded") or "Out of host capacity" in e.message:
                 raise NoCapacityError(e.message)
             raise
 
```
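The last OCI hunk widens no-capacity detection: in addition to the `LimitExceeded` and `QuotaExceeded` error codes, any `ServiceError` whose message contains "Out of host capacity" is now mapped to `NoCapacityError`, so such errors are treated as a capacity issue rather than an unexpected failure. A minimal standalone sketch of that predicate (the sample codes and messages below are illustrative, not taken from OCI documentation):

```python
def is_no_capacity(code: str, message: str) -> bool:
    """Mirror of the condition added in the OCI hunk above."""
    return code in ("LimitExceeded", "QuotaExceeded") or "Out of host capacity" in message


# Illustrative inputs, not actual OCI responses:
print(is_no_capacity("LimitExceeded", "Max number of instances exceeded"))  # True
print(is_no_capacity("InternalError", "Out of host capacity."))             # True
print(is_no_capacity("NotAuthorizedOrNotFound", "Authorization failed"))    # False
```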
```diff
@@ -5,8 +5,10 @@ from typing import List, Optional
 
 from dstack._internal.core.backends.base import Compute
 from dstack._internal.core.backends.base.compute import (
+    generate_unique_instance_name,
+    generate_unique_volume_name,
     get_docker_commands,
-
+    get_job_instance_name,
 )
 from dstack._internal.core.backends.base.offers import get_catalog_offers
 from dstack._internal.core.backends.runpod.api_client import RunpodApiClient
@@ -31,6 +33,9 @@ from dstack._internal.utils.logging import get_logger
 
 logger = get_logger(__name__)
 
+# Undocumented but names of len 60 work
+MAX_RESOURCE_NAME_LEN = 60
+
 CONTAINER_REGISTRY_AUTH_CLEANUP_INTERVAL = 60 * 60 * 24  # 24 hour
 
 
@@ -47,8 +52,9 @@ class RunpodCompute(Compute):
     ) -> List[InstanceOfferWithAvailability]:
         offers = get_catalog_offers(
             backend=BackendType.RUNPOD,
-            locations=self.config.regions,
+            locations=self.config.regions or None,
             requirements=requirements,
+            extra_filter=lambda o: _is_secure_cloud(o.region) or self.config.allow_community_cloud,
         )
         offers = [
             InstanceOfferWithAvailability(
@@ -69,7 +75,7 @@ class RunpodCompute(Compute):
     ) -> JobProvisioningData:
         instance_config = InstanceConfiguration(
             project_name=run.project_name,
-            instance_name=
+            instance_name=get_job_instance_name(run, job),
             ssh_keys=[
                 SSHKey(public=run.run_spec.ssh_key_pub.strip()),
                 SSHKey(public=project_ssh_public_key.strip()),
@@ -77,6 +83,7 @@ class RunpodCompute(Compute):
             user=run.user,
         )
 
+        pod_name = generate_unique_instance_name(instance_config, max_length=MAX_RESOURCE_NAME_LEN)
         authorized_keys = instance_config.get_public_keys()
         memory_size = round(instance_offer.instance.resources.memory_mib / 1024)
         disk_size = round(instance_offer.instance.resources.disk.size_mib / 1024)
@@ -96,13 +103,22 @@ class RunpodCompute(Compute):
         bid_per_gpu = None
         if instance_offer.instance.resources.spot and gpu_count:
            bid_per_gpu = instance_offer.price / gpu_count
+        if _is_secure_cloud(instance_offer.region):
+            cloud_type = "SECURE"
+            data_center_id = instance_offer.region
+            country_code = None
+        else:
+            cloud_type = "COMMUNITY"
+            data_center_id = None
+            country_code = instance_offer.region
 
         resp = self.api_client.create_pod(
-            name=
+            name=pod_name,
             image_name=job.job_spec.image_name,
             gpu_type_id=instance_offer.instance.name,
-            cloud_type=
-            data_center_id=
+            cloud_type=cloud_type,
+            data_center_id=data_center_id,
+            country_code=country_code,
             gpu_count=gpu_count,
             container_disk_in_gb=disk_size,
             min_vcpu_count=instance_offer.instance.resources.cpus,
@@ -197,9 +213,10 @@ class RunpodCompute(Compute):
         )
 
     def create_volume(self, volume: Volume) -> VolumeProvisioningData:
+        volume_name = generate_unique_volume_name(volume, max_length=MAX_RESOURCE_NAME_LEN)
        size_gb = volume.configuration.size_gb
        volume_id = self.api_client.create_network_volume(
-            name=
+            name=volume_name,
            region=volume.configuration.region,
            size=size_gb,
        )
@@ -250,3 +267,11 @@ def _get_volume_price(size: int) -> float:
    if size < 1000:
        return 0.07 * size
    return 0.05 * size
+
+
+def _is_secure_cloud(region: str) -> str:
+    """
+    Secure cloud regions are datacenter IDs: CA-MTL-1, EU-NL-1, etc.
+    Community cloud regions are country codes: CA, NL, etc.
+    """
+    return "-" in region
```
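The new `_is_secure_cloud` helper (which effectively returns a boolean, despite the `-> str` annotation in the release) drives how `create_pod` is parameterized: secure-cloud offers carry datacenter IDs and are pinned via `data_center_id`, while community-cloud offers carry country codes and are passed as `country_code`. A standalone sketch of that routing; the `PodPlacement` container is a hypothetical stand-in, not a dstack type:

```python
from dataclasses import dataclass
from typing import Optional


def _is_secure_cloud(region: str) -> bool:
    # Secure cloud regions are datacenter IDs (CA-MTL-1, EU-NL-1);
    # community cloud regions are country codes (CA, NL).
    return "-" in region


@dataclass
class PodPlacement:  # hypothetical container, for illustration only
    cloud_type: str
    data_center_id: Optional[str]
    country_code: Optional[str]


def placement_for(region: str) -> PodPlacement:
    if _is_secure_cloud(region):
        return PodPlacement("SECURE", data_center_id=region, country_code=None)
    return PodPlacement("COMMUNITY", data_center_id=None, country_code=region)


print(placement_for("CA-MTL-1"))  # SECURE, pinned to the CA-MTL-1 datacenter
print(placement_for("NL"))        # COMMUNITY, restricted to the NL country code
```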
```diff
@@ -4,6 +4,14 @@ from dstack._internal.core.models.backends.runpod import (
     RunpodStoredConfig,
 )
 
+RUNPOD_COMMUNITY_CLOUD_DEFAULT = True
+
 
 class RunpodConfig(RunpodStoredConfig, BackendConfig):
     creds: AnyRunpodCreds
+
+    @property
+    def allow_community_cloud(self) -> bool:
+        if self.community_cloud is not None:
+            return self.community_cloud
+        return RUNPOD_COMMUNITY_CLOUD_DEFAULT
```
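`community_cloud` is an optional tri-state setting (see the `RunpodConfigInfo` hunk further below): leaving it unset means "use the default", which `allow_community_cloud` resolves to `RUNPOD_COMMUNITY_CLOUD_DEFAULT = True`. A minimal stand-in sketch of that resolution pattern, using a plain dataclass instead of the real pydantic model:

```python
from dataclasses import dataclass
from typing import Optional

RUNPOD_COMMUNITY_CLOUD_DEFAULT = True


@dataclass
class RunpodConfigSketch:  # illustrative stand-in, not the real RunpodConfig
    community_cloud: Optional[bool] = None

    @property
    def allow_community_cloud(self) -> bool:
        if self.community_cloud is not None:
            return self.community_cloud
        return RUNPOD_COMMUNITY_CLOUD_DEFAULT


print(RunpodConfigSketch().allow_community_cloud)                       # True (default)
print(RunpodConfigSketch(community_cloud=False).allow_community_cloud)  # False (opt out)
```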
```diff
@@ -4,7 +4,11 @@ from typing import List, Optional
 import requests
 
 from dstack._internal.core.backends.base import Compute
-from dstack._internal.core.backends.base.compute import
+from dstack._internal.core.backends.base.compute import (
+    generate_unique_instance_name,
+    get_job_instance_name,
+    get_shim_commands,
+)
 from dstack._internal.core.backends.base.offers import get_catalog_offers
 from dstack._internal.core.backends.tensordock.api_client import TensorDockAPIClient
 from dstack._internal.core.backends.tensordock.config import TensorDockConfig
@@ -23,6 +27,10 @@ from dstack._internal.utils.logging import get_logger
 logger = get_logger(__name__)
 
 
+# Undocumented but names of len 60 work
+MAX_INSTANCE_NAME_LEN = 60
+
+
 class TensorDockCompute(Compute):
     def __init__(self, config: TensorDockConfig):
         super().__init__()
@@ -49,10 +57,13 @@ class TensorDockCompute(Compute):
         instance_offer: InstanceOfferWithAvailability,
         instance_config: InstanceConfiguration,
     ) -> JobProvisioningData:
+        instance_name = generate_unique_instance_name(
+            instance_config, max_length=MAX_INSTANCE_NAME_LEN
+        )
         commands = get_shim_commands(authorized_keys=instance_config.get_public_keys())
         try:
             resp = self.api_client.deploy_single(
-                instance_name=
+                instance_name=instance_name,
                 instance=instance_offer.instance,
                 cloudinit={
                     "ssh_pwauth": False,  # disable password auth
@@ -113,7 +124,7 @@ class TensorDockCompute(Compute):
     ) -> JobProvisioningData:
         instance_config = InstanceConfiguration(
             project_name=run.project_name,
-            instance_name=
+            instance_name=get_job_instance_name(run, job),  # TODO: generate name
             ssh_keys=[
                 SSHKey(public=run.run_spec.ssh_key_pub.strip()),
                 SSHKey(public=project_ssh_public_key.strip()),
```
```diff
@@ -4,7 +4,10 @@ import gpuhunt
 from gpuhunt.providers.vastai import VastAIProvider
 
 from dstack._internal.core.backends.base import Compute
-from dstack._internal.core.backends.base.compute import
+from dstack._internal.core.backends.base.compute import (
+    generate_unique_instance_name_for_job,
+    get_docker_commands,
+)
 from dstack._internal.core.backends.base.offers import get_catalog_offers
 from dstack._internal.core.backends.vastai.api_client import VastAIAPIClient
 from dstack._internal.core.backends.vastai.config import VastAIConfig
@@ -23,6 +26,10 @@ from dstack._internal.utils.logging import get_logger
 logger = get_logger(__name__)
 
 
+# Undocumented but names of len 60 work
+MAX_INSTANCE_NAME_LEN = 60
+
+
 class VastAICompute(Compute):
     def __init__(self, config: VastAIConfig):
         super().__init__()
@@ -70,11 +77,14 @@ class VastAICompute(Compute):
         project_ssh_private_key: str,
         volumes: List[Volume],
     ) -> JobProvisioningData:
+        instance_name = generate_unique_instance_name_for_job(
+            run, job, max_length=MAX_INSTANCE_NAME_LEN
+        )
         commands = get_docker_commands(
             [run.run_spec.ssh_key_pub.strip(), project_ssh_public_key.strip()]
         )
         resp = self.api_client.create_instance(
-            instance_name=
+            instance_name=instance_name,
             bundle_id=instance_offer.instance.name,
             image_name=job.job_spec.image_name,
             onstart=" && ".join(commands),
```
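A recurring theme across the backend hunks is replacing ad hoc instance names with `generate_unique_instance_name` / `generate_unique_instance_name_for_job` from `base/compute.py` (whose 102-line addition is not expanded in this diff), always capped by a provider-specific `MAX_INSTANCE_NAME_LEN`. The sketch below is a hypothetical illustration of why such a helper takes a `max_length`: a readable prefix plus a random suffix must still fit the provider's limit. It is not dstack's actual implementation.

```python
import secrets

SUFFIX_LEN = 8  # assumed length of the random suffix, for illustration only


def generate_unique_name_sketch(base_name: str, max_length: int) -> str:
    """Hypothetical: truncate the readable part so prefix + '-' + suffix fits max_length."""
    suffix = secrets.token_hex(SUFFIX_LEN // 2)  # 8 hex characters
    keep = max_length - SUFFIX_LEN - 1
    return f"{base_name[:keep]}-{suffix}"


name = generate_unique_name_sketch("my-fleet-with-a-rather-long-descriptive-name" * 2, max_length=60)
print(name, len(name) <= 60)  # unique and within the provider's length limit
```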
```diff
@@ -20,7 +20,7 @@ class VultrApiClient:
                 return False
         return True
 
-    def get_instance(self, instance_id: str, plan_type: str):
+    def get_instance(self, instance_id: str, plan_type: str) -> dict:
         if plan_type == "bare-metal":
             response = self._make_request("GET", f"/bare-metals/{instance_id}")
             return response.json()["bare_metal"]
@@ -28,7 +28,7 @@ class VultrApiClient:
             response = self._make_request("GET", f"/instances/{instance_id}")
             return response.json()["instance"]
 
-    def get_vpc_for_region(self, region: str) -> Optional[
+    def get_vpc_for_region(self, region: str) -> Optional[dict]:
         response = self._make_request("GET", "/vpcs?per_page=500")
         vpcs = response.json().get("vpcs", [])
         if vpcs:
@@ -37,7 +37,7 @@ class VultrApiClient:
                 return vpc
         return None
 
-    def create_vpc(self, region: str):
+    def create_vpc(self, region: str) -> dict:
         data = {"region": region, "description": f"dstack-vpc-{region}"}
         response = self._make_request("POST", "/vpcs", data=data)
         return response.json()["vpc"]
```
```diff
@@ -6,7 +6,8 @@ import requests
 
 from dstack._internal.core.backends.base import Compute
 from dstack._internal.core.backends.base.compute import (
-
+    generate_unique_instance_name,
+    get_job_instance_name,
     get_user_data,
 )
 from dstack._internal.core.backends.base.offers import get_catalog_offers
@@ -27,6 +28,8 @@ from dstack._internal.utils.logging import get_logger
 
 logger = get_logger(__name__)
 
+MAX_INSTANCE_NAME_LEN = 64
+
 
 class VultrCompute(Compute):
     def __init__(self, config: VultrConfig):
@@ -62,7 +65,7 @@ class VultrCompute(Compute):
     ) -> JobProvisioningData:
         instance_config = InstanceConfiguration(
             project_name=run.project_name,
-            instance_name=
+            instance_name=get_job_instance_name(run, job),
             ssh_keys=[SSHKey(public=project_ssh_public_key.strip())],
             user=run.user,
         )
@@ -71,6 +74,9 @@
     def create_instance(
         self, instance_offer: InstanceOfferWithAvailability, instance_config: InstanceConfiguration
     ) -> JobProvisioningData:
+        instance_name = generate_unique_instance_name(
+            instance_config, max_length=MAX_INSTANCE_NAME_LEN
+        )
         # create vpc
         vpc = self.api_client.get_vpc_for_region(instance_offer.region)
         if not vpc:
@@ -85,7 +91,7 @@ class VultrCompute(Compute):
         ]
         instance_id = self.api_client.launch_instance(
             region=instance_offer.region,
-            label=
+            label=instance_name,
             plan=instance_offer.instance.name,
             user_data=get_user_data(
                 authorized_keys=instance_config.get_public_keys(),
```
```diff
@@ -32,6 +32,7 @@ class AWSConfigInfo(CoreModel):
     vpc_ids: Optional[Dict[str, str]] = None
     default_vpcs: Optional[bool] = None
     public_ips: Optional[bool] = None
+    iam_instance_profile: Optional[str] = None
     tags: Optional[Dict[str, str]] = None
     os_images: Optional[AWSOSImageConfig] = None
 
@@ -70,6 +71,7 @@ class AWSConfigInfoWithCredsPartial(CoreModel):
     vpc_ids: Optional[Dict[str, str]]
     default_vpcs: Optional[bool]
     public_ips: Optional[bool]
+    iam_instance_profile: Optional[str]
     tags: Optional[Dict[str, str]]
     os_images: Optional["AWSOSImageConfig"]
 
```
```diff
@@ -15,6 +15,7 @@ class BackendType(str, enum.Enum):
         DATACRUNCH (BackendType): DataCrunch
         KUBERNETES (BackendType): Kubernetes
         LAMBDA (BackendType): Lambda Cloud
+        OCI (BackendType): Oracle Cloud Infrastructure
         RUNPOD (BackendType): Runpod Cloud
         TENSORDOCK (BackendType): TensorDock Marketplace
         VASTAI (BackendType): Vast.ai Marketplace
```
```diff
@@ -10,6 +10,7 @@ from dstack._internal.core.models.common import CoreModel
 class RunpodConfigInfo(CoreModel):
     type: Literal["runpod"] = "runpod"
     regions: Optional[List[str]] = None
+    community_cloud: Optional[bool] = None
 
 
 class RunpodStoredConfig(RunpodConfigInfo):
@@ -33,6 +34,7 @@ class RunpodConfigInfoWithCredsPartial(CoreModel):
     type: Literal["runpod"] = "runpod"
     creds: Optional[AnyRunpodCreds]
     regions: Optional[List[str]]
+    community_cloud: Optional[bool]
 
 
 class RunpodConfigValues(CoreModel):
```
```diff
@@ -31,7 +31,6 @@ class RunConfigurationType(str, Enum):
 
 
 class PythonVersion(str, Enum):
-    PY38 = "3.8"  # TODO(0.19 or earlier): drop 3.8, stop building Docker images with 3.8
     PY39 = "3.9"
     PY310 = "3.10"
     PY311 = "3.11"
@@ -222,7 +221,8 @@ class DevEnvironmentConfigurationParams(CoreModel):
                 " Inactivity is defined as the absence of SSH connections to the"
                 " dev environment, including VS Code connections, `ssh <run name>`"
                 " shells, and attached `dstack apply` or `dstack attach` commands."
-                " Use `off` for unlimited duration.
+                " Use `off` for unlimited duration. Can be updated in-place."
+                " Defaults to `off`"
             )
         ),
     ]
```
```diff
@@ -1,5 +1,5 @@
 from enum import Enum
-from typing import List, Optional, Union
+from typing import List, Optional, Union, overload
 
 from pydantic import Field, root_validator, validator
 from typing_extensions import Annotated, Literal
@@ -34,6 +34,14 @@ class TerminationPolicy(str, Enum):
     DESTROY_AFTER_IDLE = "destroy-after-idle"
 
 
+@overload
+def parse_duration(v: None) -> None: ...
+
+
+@overload
+def parse_duration(v: Union[int, str]) -> int: ...
+
+
 def parse_duration(v: Optional[Union[int, str]]) -> Optional[int]:
     if v is None:
         return None
```
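The `@overload` declarations only affect type checking: they tell mypy/pyright that `parse_duration(None)` is `None` while `parse_duration(int | str)` is always `int`, so callers passing a known-non-None value no longer have to handle `Optional[int]`. A self-contained sketch of the same pattern with a simplified stand-in parser (the real dstack parser accepts more duration formats):

```python
from typing import Optional, Union, overload


@overload
def parse_duration(v: None) -> None: ...
@overload
def parse_duration(v: Union[int, str]) -> int: ...


def parse_duration(v: Optional[Union[int, str]]) -> Optional[int]:
    # Simplified stand-in: ints pass through, "<N>m"/"<N>h" become seconds.
    if v is None:
        return None
    if isinstance(v, int):
        return v
    units = {"m": 60, "h": 3600}
    return int(v[:-1]) * units[v[-1]]


window: int = parse_duration("30m")   # type checkers see int, not Optional[int]
nothing: None = parse_duration(None)  # and None stays None
print(window, nothing)
```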
```diff
@@ -112,6 +120,39 @@ class ProfileRetry(CoreModel):
         return values
 
 
+class UtilizationPolicy(CoreModel):
+    _min_time_window = "5m"
+
+    min_gpu_utilization: Annotated[
+        int,
+        Field(
+            description=(
+                "Minimum required GPU utilization, percent."
+                " If any GPU has utilization below specified value during the whole time window,"
+                " the run is terminated"
+            ),
+            ge=0,
+            le=100,
+        ),
+    ]
+    time_window: Annotated[
+        Union[int, str],
+        Field(
+            description=(
+                "The time window of metric samples taking into account to measure utilization"
+                f" (e.g., `30m`, `1h`). Minimum is `{_min_time_window}`"
+            )
+        ),
+    ]
+
+    @validator("time_window", pre=True)
+    def validate_time_window(cls, v: Union[int, str]) -> int:
+        v = parse_duration(v)
+        if v < parse_duration(cls._min_time_window):
+            raise ValueError(f"Minimum time_window is {cls._min_time_window}")
+        return v
+
+
 class ProfileParams(CoreModel):
     backends: Annotated[
         Optional[List[BackendType]],
@@ -194,6 +235,10 @@ class ProfileParams(CoreModel):
             )
         ),
     ]
+    utilization_policy: Annotated[
+        Optional[UtilizationPolicy],
+        Field(description="Run termination policy based on utilization"),
+    ]
     # Deprecated:
     termination_policy: Annotated[
         Optional[TerminationPolicy],
```
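With `utilization_policy` wired into `ProfileParams` and into `JobSpec` (see the runs.py hunks below), a run can now be terminated when every sample in the window shows a GPU below the threshold; the new `TERMINATED_DUE_TO_UTILIZATION_POLICY` termination reason maps to `JobStatus.TERMINATED`. A hedged usage sketch, assuming a dstack version with this change is installed (field semantics taken from the hunk above, not from documentation):

```python
from dstack._internal.core.models.profiles import UtilizationPolicy

# Accepted: a 30-minute window is above the "5m" minimum; the pre-validator
# normalizes the duration string to an integer via parse_duration.
policy = UtilizationPolicy(min_gpu_utilization=10, time_window="30m")
print(policy.min_gpu_utilization, policy.time_window)

# Rejected: windows shorter than "5m" fail validation (pydantic's
# ValidationError is a ValueError subclass).
try:
    UtilizationPolicy(min_gpu_utilization=10, time_window="1m")
except ValueError as err:
    print(err)
```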
```diff
@@ -23,6 +23,7 @@ from dstack._internal.core.models.profiles import (
     ProfileRetryPolicy,
     RetryEvent,
     SpotPolicy,
+    UtilizationPolicy,
 )
 from dstack._internal.core.models.repos import AnyRunRepoData
 from dstack._internal.core.models.resources import Memory, ResourcesSpec
@@ -114,6 +115,7 @@ class JobTerminationReason(str, Enum):
     ABORTED_BY_USER = "aborted_by_user"
     TERMINATED_BY_SERVER = "terminated_by_server"
     INACTIVITY_DURATION_EXCEEDED = "inactivity_duration_exceeded"
+    TERMINATED_DUE_TO_UTILIZATION_POLICY = "terminated_due_to_utilization_policy"
     # Set by the runner
     CONTAINER_EXITED_WITH_ERROR = "container_exited_with_error"
     PORTS_BINDING_FAILED = "ports_binding_failed"
@@ -135,6 +137,7 @@ class JobTerminationReason(str, Enum):
             self.ABORTED_BY_USER: JobStatus.ABORTED,
             self.TERMINATED_BY_SERVER: JobStatus.TERMINATED,
             self.INACTIVITY_DURATION_EXCEEDED: JobStatus.TERMINATED,
+            self.TERMINATED_DUE_TO_UTILIZATION_POLICY: JobStatus.TERMINATED,
             self.CONTAINER_EXITED_WITH_ERROR: JobStatus.FAILED,
             self.PORTS_BINDING_FAILED: JobStatus.FAILED,
             self.CREATING_CONTAINER_ERROR: JobStatus.FAILED,
@@ -190,6 +193,7 @@ class JobSpec(CoreModel):
     single_branch: Optional[bool] = None
     max_duration: Optional[int]
     stop_duration: Optional[int] = None
+    utilization_policy: Optional[UtilizationPolicy] = None
     registry_auth: Optional[RegistryAuth]
     requirements: Requirements
     retry: Optional[Retry]
```
```diff
@@ -4,5 +4,9 @@ from dstack._internal.core.errors import ServerClientError
 
 
 def validate_dstack_resource_name(resource_name: str):
-    if not
+    if not is_valid_dstack_resource_name(resource_name):
         raise ServerClientError("Resource name should match regex '^[a-z][a-z0-9-]{1,40}$'")
+
+
+def is_valid_dstack_resource_name(resource_name: str) -> bool:
+    return re.match("^[a-z][a-z0-9-]{1,40}$", resource_name) is not None
```
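The validation logic is now split so other services can ask the yes/no question without catching `ServerClientError`. A standalone sketch of the rule, with the regex copied from the hunk above:

```python
import re


def is_valid_dstack_resource_name(resource_name: str) -> bool:
    return re.match("^[a-z][a-z0-9-]{1,40}$", resource_name) is not None


print(is_valid_dstack_resource_name("my-volume-1"))  # True
print(is_valid_dstack_resource_name("MyVolume"))     # False: must start with a lowercase letter
print(is_valid_dstack_resource_name("x"))            # False: at least two characters required
```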
```diff
@@ -65,6 +65,9 @@ class ConfigManager:
         if len(self.config.projects) == 1:
             self.config.projects[0].default = True
 
+    def list_projects(self):
+        return [project.name for project in self.config.projects]
+
     def delete_project(self, name: str):
         self.config.projects = [p for p in self.config.projects if p.name != name]
 
```
dstack/_internal/server/app.py (CHANGED)

```diff
@@ -29,6 +29,7 @@ from dstack._internal.server.routers import (
     metrics,
     pools,
     projects,
+    prometheus,
     repos,
     runs,
     secrets,
@@ -185,6 +186,7 @@ def register_routes(app: FastAPI, ui: bool = True):
     app.include_router(model_proxy.router, prefix="/proxy/models", tags=["model-proxy"])
     app.include_router(pools.root_router)
     app.include_router(pools.router)
+    app.include_router(prometheus.router)
 
     @app.exception_handler(ForbiddenError)
     async def forbidden_error_handler(request: Request, exc: ForbiddenError):
@@ -252,7 +254,11 @@ def register_routes(app: FastAPI, ui: bool = True):
 
     @app.exception_handler(404)
     async def custom_http_exception_handler(request, exc):
-        if
+        if (
+            request.url.path.startswith("/api")
+            or _is_proxy_request(request)
+            or _is_prometheus_request(request)
+        ):
             return JSONResponse(
                 {"detail": exc.detail},
                 status_code=status.HTTP_404_NOT_FOUND,
@@ -283,6 +289,10 @@ def _is_proxy_request(request: Request) -> bool:
     ) and referrer.path.startswith("/proxy")
 
 
+def _is_prometheus_request(request: Request) -> bool:
+    return request.url.path.startswith("/metrics")
+
+
 def _print_dstack_logo():
     console.print(
         """[purple]╱╱╭╮╱╱╭╮╱╱╱╱╱╱╭╮
```
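The 404 handler now also returns JSON for Prometheus scrape paths (`/metrics/...`) instead of falling through to the single-page UI. A minimal FastAPI sketch of the same routing decision; only the path checks mirror the hunk above, while the SPA branch is reduced to a placeholder response:

```python
from fastapi import FastAPI, Request, status
from fastapi.responses import JSONResponse, PlainTextResponse

app = FastAPI()


def _is_prometheus_request(request: Request) -> bool:
    return request.url.path.startswith("/metrics")


@app.exception_handler(404)
async def custom_http_exception_handler(request: Request, exc):
    if request.url.path.startswith("/api") or _is_prometheus_request(request):
        # API and metrics clients get a machine-readable 404.
        return JSONResponse({"detail": exc.detail}, status_code=status.HTTP_404_NOT_FOUND)
    # dstack serves the UI's index.html here; a plain response stands in.
    return PlainTextResponse("Not Found", status_code=status.HTTP_404_NOT_FOUND)
```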
```diff
@@ -1,6 +1,7 @@
 from apscheduler.schedulers.asyncio import AsyncIOScheduler
 from apscheduler.triggers.interval import IntervalTrigger
 
+from dstack._internal.server import settings
 from dstack._internal.server.background.tasks.process_fleets import process_fleets
 from dstack._internal.server.background.tasks.process_gateways import (
     process_gateways_connections,
@@ -16,6 +17,10 @@ from dstack._internal.server.background.tasks.process_metrics import (
 from dstack._internal.server.background.tasks.process_placement_groups import (
     process_placement_groups,
 )
+from dstack._internal.server.background.tasks.process_prometheus_metrics import (
+    collect_prometheus_metrics,
+    delete_prometheus_metrics,
+)
 from dstack._internal.server.background.tasks.process_running_jobs import process_running_jobs
 from dstack._internal.server.background.tasks.process_runs import process_runs
 from dstack._internal.server.background.tasks.process_submitted_jobs import process_submitted_jobs
@@ -43,6 +48,11 @@ def start_background_tasks() -> AsyncIOScheduler:
     # * 150 active instances with up to 2 minutes processing latency
     _scheduler.add_job(collect_metrics, IntervalTrigger(seconds=10), max_instances=1)
     _scheduler.add_job(delete_metrics, IntervalTrigger(minutes=5), max_instances=1)
+    if settings.ENABLE_PROMETHEUS_METRICS:
+        _scheduler.add_job(
+            collect_prometheus_metrics, IntervalTrigger(seconds=10), max_instances=1
+        )
+        _scheduler.add_job(delete_prometheus_metrics, IntervalTrigger(minutes=5), max_instances=1)
     # process_submitted_jobs and process_instances max processing rate is 75 jobs(instances) per minute.
     _scheduler.add_job(
         process_submitted_jobs,
```
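Prometheus metric collection is registered only when the new `ENABLE_PROMETHEUS_METRICS` server setting is on, with `max_instances=1` so a slow collection cycle never overlaps with the next tick. A minimal runnable sketch of the same APScheduler pattern, with stand-in task functions and a module-level flag instead of dstack's settings:

```python
import asyncio

from apscheduler.schedulers.asyncio import AsyncIOScheduler
from apscheduler.triggers.interval import IntervalTrigger

ENABLE_PROMETHEUS_METRICS = True  # stand-in for the dstack server setting


async def collect_prometheus_metrics():
    print("collecting runner metrics")


async def delete_prometheus_metrics():
    print("deleting stale metrics")


def start_background_tasks() -> AsyncIOScheduler:
    scheduler = AsyncIOScheduler()
    if ENABLE_PROMETHEUS_METRICS:
        # max_instances=1 prevents overlapping executions of the same task.
        scheduler.add_job(collect_prometheus_metrics, IntervalTrigger(seconds=10), max_instances=1)
        scheduler.add_job(delete_prometheus_metrics, IntervalTrigger(minutes=5), max_instances=1)
    scheduler.start()
    return scheduler


async def main():
    start_background_tasks()
    await asyncio.sleep(30)  # let a few ticks fire


if __name__ == "__main__":
    asyncio.run(main())
```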
```diff
@@ -0,0 +1,22 @@
+from datetime import timedelta
+
+from dstack._internal.core.models.backends.base import BackendType
+
+
+def get_provisioning_timeout(backend_type: BackendType, instance_type_name: str) -> timedelta:
+    """
+    This timeout is used in a few places, but roughly refers to the max time between
+    requesting instance creation and the instance becoming ready to accept jobs.
+    For container-based backends, this also includes the image pulling time.
+    """
+    if backend_type == BackendType.LAMBDA:
+        return timedelta(minutes=30)
+    if backend_type == BackendType.RUNPOD:
+        return timedelta(minutes=20)
+    if backend_type == BackendType.KUBERNETES:
+        return timedelta(minutes=20)
+    if backend_type == BackendType.OCI and instance_type_name.startswith("BM."):
+        return timedelta(minutes=20)
+    if backend_type == BackendType.VULTR and instance_type_name.startswith("vbm"):
+        return timedelta(minutes=55)
+    return timedelta(minutes=10)
```