dstack 0.18.43__py3-none-any.whl → 0.19.0rc1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dstack/_internal/cli/commands/gateway.py +15 -3
- dstack/_internal/cli/commands/logs.py +0 -22
- dstack/_internal/cli/commands/stats.py +8 -17
- dstack/_internal/cli/main.py +1 -5
- dstack/_internal/cli/services/configurators/fleet.py +4 -39
- dstack/_internal/cli/services/configurators/run.py +22 -20
- dstack/_internal/cli/services/profile.py +34 -83
- dstack/_internal/cli/utils/gateway.py +1 -1
- dstack/_internal/cli/utils/run.py +11 -0
- dstack/_internal/core/backends/__init__.py +56 -39
- dstack/_internal/core/backends/aws/__init__.py +0 -25
- dstack/_internal/core/backends/aws/auth.py +1 -10
- dstack/_internal/core/backends/aws/backend.py +26 -0
- dstack/_internal/core/backends/aws/compute.py +21 -45
- dstack/_internal/{server/services/backends/configurators/aws.py → core/backends/aws/configurator.py} +46 -85
- dstack/_internal/core/backends/aws/models.py +135 -0
- dstack/_internal/core/backends/aws/resources.py +1 -1
- dstack/_internal/core/backends/azure/__init__.py +0 -20
- dstack/_internal/core/backends/azure/auth.py +2 -11
- dstack/_internal/core/backends/azure/backend.py +21 -0
- dstack/_internal/core/backends/azure/compute.py +14 -28
- dstack/_internal/{server/services/backends/configurators/azure.py → core/backends/azure/configurator.py} +141 -210
- dstack/_internal/core/backends/azure/models.py +89 -0
- dstack/_internal/core/backends/base/__init__.py +0 -12
- dstack/_internal/core/backends/base/backend.py +18 -0
- dstack/_internal/core/backends/base/compute.py +153 -33
- dstack/_internal/core/backends/base/configurator.py +105 -0
- dstack/_internal/core/backends/base/models.py +14 -0
- dstack/_internal/core/backends/configurators.py +138 -0
- dstack/_internal/core/backends/cudo/__init__.py +0 -15
- dstack/_internal/core/backends/cudo/backend.py +16 -0
- dstack/_internal/core/backends/cudo/compute.py +8 -26
- dstack/_internal/core/backends/cudo/configurator.py +72 -0
- dstack/_internal/core/backends/cudo/models.py +37 -0
- dstack/_internal/core/backends/datacrunch/__init__.py +0 -15
- dstack/_internal/core/backends/datacrunch/backend.py +16 -0
- dstack/_internal/core/backends/datacrunch/compute.py +8 -25
- dstack/_internal/core/backends/datacrunch/configurator.py +66 -0
- dstack/_internal/core/backends/datacrunch/models.py +38 -0
- dstack/_internal/core/{models/backends/dstack.py → backends/dstack/models.py} +7 -7
- dstack/_internal/core/backends/gcp/__init__.py +0 -16
- dstack/_internal/core/backends/gcp/auth.py +2 -11
- dstack/_internal/core/backends/gcp/backend.py +17 -0
- dstack/_internal/core/backends/gcp/compute.py +14 -44
- dstack/_internal/{server/services/backends/configurators/gcp.py → core/backends/gcp/configurator.py} +46 -103
- dstack/_internal/core/backends/gcp/models.py +125 -0
- dstack/_internal/core/backends/kubernetes/__init__.py +0 -15
- dstack/_internal/core/backends/kubernetes/backend.py +16 -0
- dstack/_internal/core/backends/kubernetes/compute.py +16 -5
- dstack/_internal/core/backends/kubernetes/configurator.py +55 -0
- dstack/_internal/core/backends/kubernetes/models.py +72 -0
- dstack/_internal/core/backends/lambdalabs/__init__.py +0 -16
- dstack/_internal/core/backends/lambdalabs/backend.py +17 -0
- dstack/_internal/core/backends/lambdalabs/compute.py +7 -28
- dstack/_internal/core/backends/lambdalabs/configurator.py +82 -0
- dstack/_internal/core/backends/lambdalabs/models.py +37 -0
- dstack/_internal/core/backends/local/__init__.py +0 -13
- dstack/_internal/core/backends/local/backend.py +14 -0
- dstack/_internal/core/backends/local/compute.py +16 -2
- dstack/_internal/core/backends/models.py +128 -0
- dstack/_internal/core/backends/oci/__init__.py +0 -15
- dstack/_internal/core/backends/oci/auth.py +1 -5
- dstack/_internal/core/backends/oci/backend.py +16 -0
- dstack/_internal/core/backends/oci/compute.py +9 -23
- dstack/_internal/{server/services/backends/configurators/oci.py → core/backends/oci/configurator.py} +40 -85
- dstack/_internal/core/{models/backends/oci.py → backends/oci/models.py} +24 -25
- dstack/_internal/core/backends/oci/region.py +1 -1
- dstack/_internal/core/backends/runpod/__init__.py +0 -15
- dstack/_internal/core/backends/runpod/backend.py +16 -0
- dstack/_internal/core/backends/runpod/compute.py +28 -6
- dstack/_internal/core/backends/runpod/configurator.py +59 -0
- dstack/_internal/core/backends/runpod/models.py +54 -0
- dstack/_internal/core/backends/template/__init__.py +0 -0
- dstack/_internal/core/backends/tensordock/__init__.py +0 -15
- dstack/_internal/core/backends/tensordock/backend.py +16 -0
- dstack/_internal/core/backends/tensordock/compute.py +8 -27
- dstack/_internal/core/backends/tensordock/configurator.py +68 -0
- dstack/_internal/core/backends/tensordock/models.py +38 -0
- dstack/_internal/core/backends/vastai/__init__.py +0 -15
- dstack/_internal/core/backends/vastai/backend.py +16 -0
- dstack/_internal/core/backends/vastai/compute.py +2 -2
- dstack/_internal/core/backends/vastai/configurator.py +66 -0
- dstack/_internal/core/backends/vastai/models.py +37 -0
- dstack/_internal/core/backends/vultr/__init__.py +0 -15
- dstack/_internal/core/backends/vultr/backend.py +16 -0
- dstack/_internal/core/backends/vultr/compute.py +10 -24
- dstack/_internal/core/backends/vultr/configurator.py +64 -0
- dstack/_internal/core/backends/vultr/models.py +34 -0
- dstack/_internal/core/models/backends/__init__.py +0 -184
- dstack/_internal/core/models/backends/base.py +0 -19
- dstack/_internal/core/models/configurations.py +22 -16
- dstack/_internal/core/models/envs.py +4 -3
- dstack/_internal/core/models/fleets.py +17 -22
- dstack/_internal/core/models/gateways.py +3 -3
- dstack/_internal/core/models/instances.py +24 -0
- dstack/_internal/core/models/profiles.py +85 -45
- dstack/_internal/core/models/projects.py +1 -1
- dstack/_internal/core/models/repos/base.py +0 -5
- dstack/_internal/core/models/repos/local.py +3 -3
- dstack/_internal/core/models/repos/remote.py +26 -12
- dstack/_internal/core/models/repos/virtual.py +1 -1
- dstack/_internal/core/models/resources.py +45 -76
- dstack/_internal/core/models/runs.py +21 -19
- dstack/_internal/core/models/volumes.py +1 -3
- dstack/_internal/core/services/profiles.py +7 -16
- dstack/_internal/core/services/repos.py +0 -4
- dstack/_internal/server/app.py +11 -4
- dstack/_internal/server/background/__init__.py +10 -0
- dstack/_internal/server/background/tasks/process_gateways.py +4 -8
- dstack/_internal/server/background/tasks/process_instances.py +14 -9
- dstack/_internal/server/background/tasks/process_metrics.py +1 -1
- dstack/_internal/server/background/tasks/process_placement_groups.py +5 -1
- dstack/_internal/server/background/tasks/process_prometheus_metrics.py +135 -0
- dstack/_internal/server/background/tasks/process_running_jobs.py +80 -24
- dstack/_internal/server/background/tasks/process_runs.py +1 -0
- dstack/_internal/server/background/tasks/process_submitted_jobs.py +20 -38
- dstack/_internal/server/background/tasks/process_volumes.py +5 -2
- dstack/_internal/server/migrations/versions/60e444118b6d_add_jobprometheusmetrics.py +40 -0
- dstack/_internal/server/migrations/versions/7bc2586e8b9e_make_instancemodel_pool_id_optional.py +36 -0
- dstack/_internal/server/migrations/versions/98d1b92988bc_add_jobterminationreason_terminated_due_.py +140 -0
- dstack/_internal/server/migrations/versions/bc8ca4a505c6_store_backendtype_as_string.py +171 -0
- dstack/_internal/server/models.py +59 -9
- dstack/_internal/server/routers/backends.py +14 -23
- dstack/_internal/server/routers/instances.py +3 -4
- dstack/_internal/server/routers/metrics.py +31 -10
- dstack/_internal/server/routers/prometheus.py +36 -0
- dstack/_internal/server/routers/repos.py +1 -2
- dstack/_internal/server/routers/runs.py +13 -59
- dstack/_internal/server/schemas/gateways.py +14 -23
- dstack/_internal/server/schemas/projects.py +7 -2
- dstack/_internal/server/schemas/repos.py +2 -38
- dstack/_internal/server/schemas/runner.py +1 -0
- dstack/_internal/server/schemas/runs.py +1 -24
- dstack/_internal/server/security/permissions.py +1 -1
- dstack/_internal/server/services/backends/__init__.py +85 -158
- dstack/_internal/server/services/config.py +53 -567
- dstack/_internal/server/services/fleets.py +9 -103
- dstack/_internal/server/services/gateways/__init__.py +13 -4
- dstack/_internal/server/services/{pools.py → instances.py} +22 -329
- dstack/_internal/server/services/jobs/__init__.py +9 -6
- dstack/_internal/server/services/jobs/configurators/base.py +25 -1
- dstack/_internal/server/services/jobs/configurators/dev.py +9 -1
- dstack/_internal/server/services/jobs/configurators/extensions/cursor.py +42 -0
- dstack/_internal/server/services/metrics.py +131 -72
- dstack/_internal/server/services/offers.py +1 -1
- dstack/_internal/server/services/projects.py +23 -14
- dstack/_internal/server/services/prometheus.py +245 -0
- dstack/_internal/server/services/runner/client.py +14 -3
- dstack/_internal/server/services/runs.py +67 -31
- dstack/_internal/server/services/volumes.py +9 -4
- dstack/_internal/server/settings.py +3 -0
- dstack/_internal/server/statics/index.html +1 -1
- dstack/_internal/server/statics/{main-fe8fd9db55df8d10e648.js → main-4fd5a4770eff59325ee3.js} +68 -15
- dstack/_internal/server/statics/{main-fe8fd9db55df8d10e648.js.map → main-4fd5a4770eff59325ee3.js.map} +1 -1
- dstack/_internal/server/statics/{main-7510e71dfa9749a4e70e.css → main-da9f8c06a69c20dac23e.css} +1 -1
- dstack/_internal/server/statics/static/media/entraID.d65d1f3e9486a8e56d24fc07b3230885.svg +9 -0
- dstack/_internal/server/testing/common.py +75 -32
- dstack/_internal/utils/json_schema.py +6 -0
- dstack/_internal/utils/ssh.py +2 -1
- dstack/api/__init__.py +4 -0
- dstack/api/_public/__init__.py +16 -20
- dstack/api/_public/backends.py +1 -1
- dstack/api/_public/repos.py +36 -36
- dstack/api/_public/runs.py +170 -83
- dstack/api/server/__init__.py +11 -13
- dstack/api/server/_backends.py +12 -16
- dstack/api/server/_fleets.py +15 -55
- dstack/api/server/_gateways.py +3 -14
- dstack/api/server/_repos.py +1 -4
- dstack/api/server/_runs.py +21 -96
- dstack/api/server/_volumes.py +10 -5
- dstack/api/utils.py +3 -0
- dstack/version.py +1 -1
- {dstack-0.18.43.dist-info → dstack-0.19.0rc1.dist-info}/METADATA +10 -1
- {dstack-0.18.43.dist-info → dstack-0.19.0rc1.dist-info}/RECORD +229 -206
- tests/_internal/cli/services/configurators/test_profile.py +6 -6
- tests/_internal/core/backends/aws/test_configurator.py +35 -0
- tests/_internal/core/backends/aws/test_resources.py +1 -1
- tests/_internal/core/backends/azure/test_configurator.py +61 -0
- tests/_internal/core/backends/cudo/__init__.py +0 -0
- tests/_internal/core/backends/cudo/test_configurator.py +37 -0
- tests/_internal/core/backends/datacrunch/__init__.py +0 -0
- tests/_internal/core/backends/datacrunch/test_configurator.py +17 -0
- tests/_internal/core/backends/gcp/test_configurator.py +42 -0
- tests/_internal/core/backends/kubernetes/test_configurator.py +43 -0
- tests/_internal/core/backends/lambdalabs/__init__.py +0 -0
- tests/_internal/core/backends/lambdalabs/test_configurator.py +38 -0
- tests/_internal/core/backends/oci/test_configurator.py +55 -0
- tests/_internal/core/backends/runpod/__init__.py +0 -0
- tests/_internal/core/backends/runpod/test_configurator.py +33 -0
- tests/_internal/core/backends/tensordock/__init__.py +0 -0
- tests/_internal/core/backends/tensordock/test_configurator.py +38 -0
- tests/_internal/core/backends/vastai/__init__.py +0 -0
- tests/_internal/core/backends/vastai/test_configurator.py +33 -0
- tests/_internal/core/backends/vultr/__init__.py +0 -0
- tests/_internal/core/backends/vultr/test_configurator.py +33 -0
- tests/_internal/server/background/tasks/test_process_gateways.py +4 -0
- tests/_internal/server/background/tasks/test_process_instances.py +49 -48
- tests/_internal/server/background/tasks/test_process_metrics.py +0 -3
- tests/_internal/server/background/tasks/test_process_placement_groups.py +2 -0
- tests/_internal/server/background/tasks/test_process_prometheus_metrics.py +186 -0
- tests/_internal/server/background/tasks/test_process_running_jobs.py +123 -19
- tests/_internal/server/background/tasks/test_process_runs.py +8 -22
- tests/_internal/server/background/tasks/test_process_submitted_jobs.py +3 -40
- tests/_internal/server/background/tasks/test_process_submitted_volumes.py +2 -0
- tests/_internal/server/background/tasks/test_process_terminating_jobs.py +10 -15
- tests/_internal/server/routers/test_backends.py +6 -764
- tests/_internal/server/routers/test_fleets.py +2 -26
- tests/_internal/server/routers/test_gateways.py +27 -3
- tests/_internal/server/routers/test_instances.py +0 -10
- tests/_internal/server/routers/test_metrics.py +42 -0
- tests/_internal/server/routers/test_projects.py +56 -0
- tests/_internal/server/routers/test_prometheus.py +333 -0
- tests/_internal/server/routers/test_repos.py +0 -15
- tests/_internal/server/routers/test_runs.py +83 -275
- tests/_internal/server/routers/test_volumes.py +2 -3
- tests/_internal/server/services/backends/__init__.py +0 -0
- tests/_internal/server/services/jobs/configurators/test_task.py +35 -0
- tests/_internal/server/services/test_config.py +7 -4
- tests/_internal/server/services/test_fleets.py +1 -4
- tests/_internal/server/services/{test_pools.py → test_instances.py} +11 -49
- tests/_internal/server/services/test_metrics.py +167 -0
- tests/_internal/server/services/test_repos.py +1 -14
- tests/_internal/server/services/test_runs.py +0 -4
- dstack/_internal/cli/commands/pool.py +0 -581
- dstack/_internal/cli/commands/run.py +0 -75
- dstack/_internal/core/backends/aws/config.py +0 -18
- dstack/_internal/core/backends/azure/config.py +0 -12
- dstack/_internal/core/backends/base/config.py +0 -5
- dstack/_internal/core/backends/cudo/config.py +0 -9
- dstack/_internal/core/backends/datacrunch/config.py +0 -9
- dstack/_internal/core/backends/gcp/config.py +0 -22
- dstack/_internal/core/backends/kubernetes/config.py +0 -6
- dstack/_internal/core/backends/lambdalabs/config.py +0 -9
- dstack/_internal/core/backends/nebius/__init__.py +0 -15
- dstack/_internal/core/backends/nebius/api_client.py +0 -319
- dstack/_internal/core/backends/nebius/compute.py +0 -220
- dstack/_internal/core/backends/nebius/config.py +0 -6
- dstack/_internal/core/backends/nebius/types.py +0 -37
- dstack/_internal/core/backends/oci/config.py +0 -6
- dstack/_internal/core/backends/runpod/config.py +0 -9
- dstack/_internal/core/backends/tensordock/config.py +0 -9
- dstack/_internal/core/backends/vastai/config.py +0 -6
- dstack/_internal/core/backends/vultr/config.py +0 -9
- dstack/_internal/core/models/backends/aws.py +0 -86
- dstack/_internal/core/models/backends/azure.py +0 -68
- dstack/_internal/core/models/backends/cudo.py +0 -43
- dstack/_internal/core/models/backends/datacrunch.py +0 -44
- dstack/_internal/core/models/backends/gcp.py +0 -67
- dstack/_internal/core/models/backends/kubernetes.py +0 -40
- dstack/_internal/core/models/backends/lambdalabs.py +0 -43
- dstack/_internal/core/models/backends/nebius.py +0 -54
- dstack/_internal/core/models/backends/runpod.py +0 -40
- dstack/_internal/core/models/backends/tensordock.py +0 -44
- dstack/_internal/core/models/backends/vastai.py +0 -43
- dstack/_internal/core/models/backends/vultr.py +0 -40
- dstack/_internal/core/models/pools.py +0 -43
- dstack/_internal/server/routers/pools.py +0 -142
- dstack/_internal/server/schemas/pools.py +0 -38
- dstack/_internal/server/services/backends/configurators/base.py +0 -72
- dstack/_internal/server/services/backends/configurators/cudo.py +0 -87
- dstack/_internal/server/services/backends/configurators/datacrunch.py +0 -79
- dstack/_internal/server/services/backends/configurators/kubernetes.py +0 -63
- dstack/_internal/server/services/backends/configurators/lambdalabs.py +0 -98
- dstack/_internal/server/services/backends/configurators/nebius.py +0 -85
- dstack/_internal/server/services/backends/configurators/runpod.py +0 -97
- dstack/_internal/server/services/backends/configurators/tensordock.py +0 -82
- dstack/_internal/server/services/backends/configurators/vastai.py +0 -80
- dstack/_internal/server/services/backends/configurators/vultr.py +0 -80
- dstack/api/_public/pools.py +0 -41
- dstack/api/_public/resources.py +0 -105
- dstack/api/server/_pools.py +0 -63
- tests/_internal/server/routers/test_pools.py +0 -612
- /dstack/_internal/{server/services/backends/configurators → core/backends/dstack}/__init__.py +0 -0
- {dstack-0.18.43.dist-info → dstack-0.19.0rc1.dist-info}/LICENSE.md +0 -0
- {dstack-0.18.43.dist-info → dstack-0.19.0rc1.dist-info}/WHEEL +0 -0
- {dstack-0.18.43.dist-info → dstack-0.19.0rc1.dist-info}/entry_points.txt +0 -0
- {dstack-0.18.43.dist-info → dstack-0.19.0rc1.dist-info}/top_level.txt +0 -0

dstack/_internal/core/models/volumes.py
CHANGED

@@ -86,9 +86,7 @@ class VolumeAttachment(CoreModel):
 class Volume(CoreModel):
     id: uuid.UUID
     name: str
-
-    # TODO: Remove in 0.19
-    user: str = ""
+    user: str
     project_name: str
     configuration: VolumeConfiguration
     external: bool

dstack/_internal/core/services/profiles.py
CHANGED

@@ -12,18 +12,7 @@ from dstack._internal.core.models.runs import Retry
 def get_retry(profile: Profile) -> Optional[Retry]:
     profile_retry = profile.retry
     if profile_retry is None:
-
-        # TODO: Remove once retry_policy no longer supported
-        profile_retry_policy = profile.retry_policy
-        if profile_retry_policy is None:
-            return None
-        if not profile_retry_policy.retry:
-            return None
-        duration = profile_retry_policy.duration or DEFAULT_RETRY_DURATION
-        return Retry(
-            on_events=[RetryEvent.NO_CAPACITY, RetryEvent.INTERRUPTION, RetryEvent.ERROR],
-            duration=duration,
-        )
+        return None
     if isinstance(profile_retry, bool):
         if profile_retry:
             return Retry(
@@ -32,6 +21,12 @@ def get_retry(profile: Profile) -> Optional[Retry]:
             )
         return None
     profile_retry = profile_retry.copy()
+    if profile_retry.on_events is None:
+        profile_retry.on_events = [
+            RetryEvent.NO_CAPACITY,
+            RetryEvent.INTERRUPTION,
+            RetryEvent.ERROR,
+        ]
     if profile_retry.duration is None:
         profile_retry.duration = DEFAULT_RETRY_DURATION
     return Retry.parse_obj(profile_retry)
@@ -42,10 +37,6 @@ def get_termination(
 ) -> Tuple[TerminationPolicy, int]:
     termination_policy = TerminationPolicy.DESTROY_AFTER_IDLE
     termination_idle_time = default_termination_idle_time
-    if profile.termination_policy is not None:
-        termination_policy = profile.termination_policy
-    if profile.termination_idle_time is not None:
-        termination_idle_time = profile.termination_idle_time
     if profile.idle_duration is not None and int(profile.idle_duration) < 0:
         termination_policy = TerminationPolicy.DONT_DESTROY
     elif profile.idle_duration is not None:
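
The two `get_retry` hunks above drop the legacy `retry_policy` fallback and fill in default `on_events` when a `retry` object omits them. Below is a self-contained sketch of the resulting semantics, using simplified stand-ins for the real pydantic `Profile`/`Retry` models (the enum string values and `DEFAULT_RETRY_DURATION` value here are assumptions for illustration, not taken from the diff):

```python
# Hypothetical stand-ins; the real models live in dstack._internal.core.models.
from dataclasses import dataclass
from enum import Enum
from typing import List, Optional, Union


class RetryEvent(str, Enum):
    NO_CAPACITY = "no-capacity"
    INTERRUPTION = "interruption"
    ERROR = "error"


DEFAULT_RETRY_DURATION = 3600  # assumed value for the sketch


@dataclass
class Retry:
    on_events: Optional[List[RetryEvent]] = None
    duration: Optional[int] = None


def get_retry(profile_retry: Union[None, bool, Retry]) -> Optional[Retry]:
    # None/False mean "no retry"; True or a Retry object enable it,
    # with unset fields defaulted as in the hunks above.
    if profile_retry is None or profile_retry is False:
        return None
    if profile_retry is True:
        profile_retry = Retry()
    on_events = profile_retry.on_events or [
        RetryEvent.NO_CAPACITY,
        RetryEvent.INTERRUPTION,
        RetryEvent.ERROR,
    ]
    duration = profile_retry.duration or DEFAULT_RETRY_DURATION
    return Retry(on_events=on_events, duration=duration)


assert get_retry(False) is None
assert get_retry(Retry(duration=60)).on_events == [
    RetryEvent.NO_CAPACITY,
    RetryEvent.INTERRUPTION,
    RetryEvent.ERROR,
]
```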

dstack/_internal/core/services/repos.py
CHANGED

@@ -10,7 +10,6 @@ from git.exc import GitCommandError
 from dstack._internal.core.errors import DstackError
 from dstack._internal.core.models.config import RepoConfig
 from dstack._internal.core.models.repos import LocalRepo, RemoteRepo, RemoteRepoCreds
-from dstack._internal.core.models.repos.base import RepoProtocol
 from dstack._internal.core.models.repos.remote import GitRepoURL
 from dstack._internal.utils.logging import get_logger
 from dstack._internal.utils.path import PathLike
@@ -41,7 +40,6 @@ def get_local_repo_credentials(
     r = requests.get(f"{url.as_https()}/info/refs?service=git-upload-pack", timeout=10)
     if r.status_code == 200:
         return RemoteRepoCreds(
-            protocol=RepoProtocol.HTTPS,
             clone_url=url.as_https(),
             private_key=None,
             oauth_token=None,
@@ -93,7 +91,6 @@ def check_remote_repo_credentials_https(url: GitRepoURL, oauth_token: str) -> RemoteRepoCreds:
             f"Can't access `{url.as_https()}` using the `{masked}` token"
         )
     return RemoteRepoCreds(
-        protocol=RepoProtocol.HTTPS,
         clone_url=url.as_https(),
         oauth_token=oauth_token,
         private_key=None,
@@ -123,7 +120,6 @@ def check_remote_repo_credentials_ssh(url: GitRepoURL, identity_file: PathLike) -> RemoteRepoCreds:
     )

     return RemoteRepoCreds(
-        protocol=RepoProtocol.SSH,
         clone_url=url.as_ssh(),
         private_key=private_key,
         oauth_token=None,
dstack/_internal/server/app.py
CHANGED

@@ -27,8 +27,8 @@ from dstack._internal.server.routers import (
     instances,
     logs,
     metrics,
-    pools,
     projects,
+    prometheus,
     repos,
     runs,
     secrets,
@@ -183,8 +183,7 @@ def register_routes(app: FastAPI, ui: bool = True):
     app.include_router(volumes.project_router)
     app.include_router(service_proxy.router, prefix="/proxy/services", tags=["service-proxy"])
     app.include_router(model_proxy.router, prefix="/proxy/models", tags=["model-proxy"])
-    app.include_router(
-    app.include_router(pools.router)
+    app.include_router(prometheus.router)

     @app.exception_handler(ForbiddenError)
     async def forbidden_error_handler(request: Request, exc: ForbiddenError):
@@ -252,7 +251,11 @@

     @app.exception_handler(404)
     async def custom_http_exception_handler(request, exc):
-        if request.url.path.startswith("/api") or _is_proxy_request(request):
+        if (
+            request.url.path.startswith("/api")
+            or _is_proxy_request(request)
+            or _is_prometheus_request(request)
+        ):
             return JSONResponse(
                 {"detail": exc.detail},
                 status_code=status.HTTP_404_NOT_FOUND,
@@ -283,6 +286,10 @@ def _is_proxy_request(request: Request) -> bool:
     ) and referrer.path.startswith("/proxy")


+def _is_prometheus_request(request: Request) -> bool:
+    return request.url.path.startswith("/metrics")
+
+
 def _print_dstack_logo():
     console.print(
         """[purple]╱╱╭╮╱╱╭╮╱╱╱╱╱╱╭╮
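
The 404-handler change above routes machine-facing paths (API, proxy, and now Prometheus `/metrics` requests) to a JSON 404 while other paths can fall through to the UI. A minimal, hypothetical FastAPI reproduction of the pattern (dstack's real proxy detection inspects the referrer; the prefix check here is a simplification):

```python
from fastapi import FastAPI, Request, status
from fastapi.responses import HTMLResponse, JSONResponse
from starlette.exceptions import HTTPException

app = FastAPI()


@app.exception_handler(404)
async def custom_404(request: Request, exc: HTTPException):
    # Machine-facing paths get a JSON 404; everything else serves the SPA,
    # letting the client-side router display its own not-found page.
    if request.url.path.startswith(("/api", "/proxy", "/metrics")):
        return JSONResponse({"detail": exc.detail}, status_code=status.HTTP_404_NOT_FOUND)
    return HTMLResponse("<html>UI index placeholder</html>")
```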

dstack/_internal/server/background/__init__.py
CHANGED

@@ -1,6 +1,7 @@
 from apscheduler.schedulers.asyncio import AsyncIOScheduler
 from apscheduler.triggers.interval import IntervalTrigger

+from dstack._internal.server import settings
 from dstack._internal.server.background.tasks.process_fleets import process_fleets
 from dstack._internal.server.background.tasks.process_gateways import (
     process_gateways_connections,
@@ -16,6 +17,10 @@ from dstack._internal.server.background.tasks.process_metrics import (
 from dstack._internal.server.background.tasks.process_placement_groups import (
     process_placement_groups,
 )
+from dstack._internal.server.background.tasks.process_prometheus_metrics import (
+    collect_prometheus_metrics,
+    delete_prometheus_metrics,
+)
 from dstack._internal.server.background.tasks.process_running_jobs import process_running_jobs
 from dstack._internal.server.background.tasks.process_runs import process_runs
 from dstack._internal.server.background.tasks.process_submitted_jobs import process_submitted_jobs
@@ -43,6 +48,11 @@ def start_background_tasks() -> AsyncIOScheduler:
     # * 150 active instances with up to 2 minutes processing latency
     _scheduler.add_job(collect_metrics, IntervalTrigger(seconds=10), max_instances=1)
     _scheduler.add_job(delete_metrics, IntervalTrigger(minutes=5), max_instances=1)
+    if settings.ENABLE_PROMETHEUS_METRICS:
+        _scheduler.add_job(
+            collect_prometheus_metrics, IntervalTrigger(seconds=10), max_instances=1
+        )
+        _scheduler.add_job(delete_prometheus_metrics, IntervalTrigger(minutes=5), max_instances=1)
     # process_submitted_jobs and process_instances max processing rate is 75 jobs(instances) per minute.
     _scheduler.add_job(
         process_submitted_jobs,
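
The scheduler hunk gates the new Prometheus tasks behind `settings.ENABLE_PROMETHEUS_METRICS`. A runnable sketch of the same APScheduler pattern (the task bodies and the module-level flag are stand-ins for dstack's settings and tasks):

```python
import asyncio

from apscheduler.schedulers.asyncio import AsyncIOScheduler
from apscheduler.triggers.interval import IntervalTrigger

ENABLE_PROMETHEUS_METRICS = True  # stand-in for dstack._internal.server.settings


async def collect_prometheus_metrics():
    print("collect tick")


async def delete_prometheus_metrics():
    print("delete tick")


async def main():
    scheduler = AsyncIOScheduler()
    if ENABLE_PROMETHEUS_METRICS:
        # max_instances=1 prevents overlapping runs when a tick outlasts its interval
        scheduler.add_job(collect_prometheus_metrics, IntervalTrigger(seconds=10), max_instances=1)
        scheduler.add_job(delete_prometheus_metrics, IntervalTrigger(minutes=5), max_instances=1)
    scheduler.start()
    await asyncio.sleep(30)  # keep the loop alive long enough to observe a few ticks


if __name__ == "__main__":
    asyncio.run(main())
```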

dstack/_internal/server/background/tasks/process_gateways.py
CHANGED

@@ -54,17 +54,13 @@ async def process_submitted_gateways():


 async def _remove_inactive_connections():
-    connections = await gateway_connections_pool.all()
-    ip_addresses = [c.ip_address for c in connections]
     async with get_session_ctx() as session:
         res = await session.execute(
-            select(GatewayComputeModel).where(
-                GatewayComputeModel.ip_address.in_(ip_addresses),
-                GatewayComputeModel.active == False,
-            )
+            select(GatewayComputeModel.ip_address).where(GatewayComputeModel.active == True)
         )
-
-
+        active_connection_ips = set(res.scalars().all())
+    for conn in await gateway_connections_pool.all():
+        if conn.ip_address not in active_connection_ips:
             await gateway_connections_pool.remove(conn.ip_address)


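
The rework inverts the old query: instead of selecting inactive gateway computes for the pooled IPs, it selects the active IPs once and evicts any pooled connection outside that set. The core is a plain set-difference, as in this toy example:

```python
# Toy data standing in for the SQL result and the connection pool.
active_ips = {"10.0.0.1", "10.0.0.2"}  # GatewayComputeModel.active == True
pooled_ips = ["10.0.0.1", "10.0.0.3"]  # gateway_connections_pool.all()

to_remove = [ip for ip in pooled_ips if ip not in active_ips]
assert to_remove == ["10.0.0.3"]  # stale connection to evict from the pool
```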

dstack/_internal/server/background/tasks/process_instances.py
CHANGED

@@ -20,6 +20,8 @@ from dstack._internal.core.backends.base.compute import (
     DSTACK_RUNNER_BINARY_PATH,
     DSTACK_SHIM_BINARY_PATH,
     DSTACK_WORKING_DIR,
+    ComputeWithCreateInstanceSupport,
+    ComputeWithPlacementGroupSupport,
     get_shim_env,
     get_shim_pre_start_commands,
 )
@@ -76,19 +78,19 @@ from dstack._internal.server.services.fleets import (
     fleet_model_to_fleet,
     get_create_instance_offers,
 )
-from dstack._internal.server.services.locking import get_locker
-from dstack._internal.server.services.offers import is_divisible_into_blocks
-from dstack._internal.server.services.placement import (
-    get_fleet_placement_groups,
-    placement_group_model_to_placement_group,
-)
-from dstack._internal.server.services.pools import (
+from dstack._internal.server.services.instances import (
     get_instance_configuration,
     get_instance_profile,
     get_instance_provisioning_data,
     get_instance_requirements,
     get_instance_ssh_private_keys,
 )
+from dstack._internal.server.services.locking import get_locker
+from dstack._internal.server.services.offers import is_divisible_into_blocks
+from dstack._internal.server.services.placement import (
+    get_fleet_placement_groups,
+    placement_group_model_to_placement_group,
+)
 from dstack._internal.server.services.runner import client as runner_client
 from dstack._internal.server.services.runner.client import HealthStatus
 from dstack._internal.server.services.runner.ssh import runner_ssh_tunnel
@@ -530,12 +532,15 @@ async def _create_instance(session: AsyncSession, instance: InstanceModel) -> None:
     for backend, instance_offer in offers:
         if instance_offer.backend not in BACKENDS_WITH_CREATE_INSTANCE_SUPPORT:
             continue
+        compute = backend.compute()
+        assert isinstance(compute, ComputeWithCreateInstanceSupport)
         instance_offer = _get_instance_offer_for_instance(instance_offer, instance)
         if (
             instance_offer.backend in BACKENDS_WITH_PLACEMENT_GROUPS_SUPPORT
             and instance.fleet
             and instance_configuration.placement_group_name
         ):
+            assert isinstance(compute, ComputeWithPlacementGroupSupport)
             placement_group_model = _create_placement_group_if_does_not_exist(
                 session=session,
                 fleet_model=instance.fleet,
@@ -546,7 +551,7 @@ async def _create_instance(session: AsyncSession, instance: InstanceModel) -> None:
             )
             if placement_group_model is not None:
                 placement_group = placement_group_model_to_placement_group(placement_group_model)
-                pgpd = await run_async(backend.compute().create_placement_group, placement_group)
+                pgpd = await run_async(compute.create_placement_group, placement_group)
                 placement_group_model.provisioning_data = pgpd.json()
                 session.add(placement_group_model)
                 placement_groups.append(placement_group)
@@ -559,7 +564,7 @@ async def _create_instance(session: AsyncSession, instance: InstanceModel) -> None:
         )
         try:
             job_provisioning_data = await run_async(
-                backend.compute().create_instance,
+                compute.create_instance,
                 instance_offer,
                 instance_configuration,
             )
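
The `assert isinstance(...)` additions reflect 0.19's new capability mixins: optional backend features such as instance creation and placement groups now live on dedicated `Compute` subclasses, and call sites narrow the type before using a feature. A sketch of the pattern (class bodies and names of the concrete class are illustrative, not copied from dstack):

```python
from abc import ABC, abstractmethod


class Compute(ABC):
    """Base interface every backend's compute implements."""


class ComputeWithCreateInstanceSupport(Compute, ABC):
    @abstractmethod
    def create_instance(self, offer, configuration): ...


class ComputeWithPlacementGroupSupport(Compute, ABC):
    @abstractmethod
    def create_placement_group(self, placement_group): ...


class DummyCompute(ComputeWithCreateInstanceSupport):  # hypothetical backend
    def create_instance(self, offer, configuration):
        return f"instance({offer}, {configuration})"


def provision(compute: Compute, offer, configuration):
    # The caller already filtered offers to capable backends; the assert
    # documents the invariant and narrows the type for static checkers.
    assert isinstance(compute, ComputeWithCreateInstanceSupport)
    return compute.create_instance(offer, configuration)


print(provision(DummyCompute(), "offer-1", "cfg-1"))
```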

dstack/_internal/server/background/tasks/process_metrics.py
CHANGED

@@ -11,8 +11,8 @@ from dstack._internal.server import settings
 from dstack._internal.server.db import get_session_ctx
 from dstack._internal.server.models import InstanceModel, JobMetricsPoint, JobModel
 from dstack._internal.server.schemas.runner import MetricsResponse
+from dstack._internal.server.services.instances import get_instance_ssh_private_keys
 from dstack._internal.server.services.jobs import get_job_provisioning_data, get_job_runtime_data
-from dstack._internal.server.services.pools import get_instance_ssh_private_keys
 from dstack._internal.server.services.runner import client
 from dstack._internal.server.services.runner.ssh import runner_ssh_tunnel
 from dstack._internal.utils.common import batched, get_current_datetime, get_or_error, run_async

dstack/_internal/server/background/tasks/process_placement_groups.py
CHANGED

@@ -5,6 +5,7 @@ from sqlalchemy import select
 from sqlalchemy.ext.asyncio import AsyncSession
 from sqlalchemy.orm import joinedload

+from dstack._internal.core.backends.base.compute import ComputeWithPlacementGroupSupport
 from dstack._internal.core.errors import PlacementGroupInUseError
 from dstack._internal.server.db import get_session_ctx
 from dstack._internal.server.models import PlacementGroupModel, ProjectModel
@@ -28,6 +29,7 @@ async def process_placement_groups():
             PlacementGroupModel.deleted == False,
             PlacementGroupModel.id.not_in(lockset),
         )
+        .order_by(PlacementGroupModel.id)  # take locks in order
         .with_for_update(skip_locked=True)
     )
     placement_group_models = res.scalars().all()
@@ -80,8 +82,10 @@ async def _delete_placement_group(placement_group_model: PlacementGroupModel):
             "Failed to delete placement group %s. Backend not available.", placement_group.name
         )
         return
+    compute = backend.compute()
+    assert isinstance(compute, ComputeWithPlacementGroupSupport)
     try:
-        await run_async(backend.compute().delete_placement_group, placement_group)
+        await run_async(compute.delete_placement_group, placement_group)
    except PlacementGroupInUseError:
         logger.info(
             "Placement group %s is still in use. Skipping deletion for now.", placement_group.name
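
The new `.order_by(PlacementGroupModel.id)` matters because the query ends in `FOR UPDATE SKIP LOCKED`: when concurrent workers acquire row locks in a consistent order, they cannot deadlock on each other's partial lock sets. A self-contained SQLAlchemy sketch of the query shape (toy model, not dstack's):

```python
from sqlalchemy import Boolean, Integer, select
from sqlalchemy.dialects import postgresql
from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column


class Base(DeclarativeBase):
    pass


class PlacementGroup(Base):  # toy stand-in for PlacementGroupModel
    __tablename__ = "placement_groups"
    id: Mapped[int] = mapped_column(Integer, primary_key=True)
    deleted: Mapped[bool] = mapped_column(Boolean, default=False)


already_locked_ids = {1, 2}  # stand-in for the in-process lockset

stmt = (
    select(PlacementGroup)
    .where(
        PlacementGroup.deleted == False,
        PlacementGroup.id.not_in(already_locked_ids),
    )
    .order_by(PlacementGroup.id)  # take locks in order
    .with_for_update(skip_locked=True)
)
# Compiles to SELECT ... FOR UPDATE SKIP LOCKED on dialects that support it:
print(stmt.compile(dialect=postgresql.dialect()))
```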

dstack/_internal/server/background/tasks/process_prometheus_metrics.py
ADDED

@@ -0,0 +1,135 @@
+import uuid
+from datetime import datetime, timedelta
+from typing import Optional
+
+import sqlalchemy.exc
+from sqlalchemy import delete, or_, select, update
+from sqlalchemy.orm import joinedload
+
+from dstack._internal.core.consts import DSTACK_SHIM_HTTP_PORT
+from dstack._internal.core.models.runs import JobStatus
+from dstack._internal.server.db import get_session_ctx
+from dstack._internal.server.models import InstanceModel, JobModel, JobPrometheusMetrics
+from dstack._internal.server.services.instances import get_instance_ssh_private_keys
+from dstack._internal.server.services.jobs import get_job_provisioning_data, get_job_runtime_data
+from dstack._internal.server.services.runner import client
+from dstack._internal.server.services.runner.ssh import runner_ssh_tunnel
+from dstack._internal.server.utils.common import gather_map_async
+from dstack._internal.utils.common import batched, get_current_datetime, get_or_error, run_async
+from dstack._internal.utils.logging import get_logger
+
+logger = get_logger(__name__)
+
+
+MAX_JOBS_FETCHED = 100
+BATCH_SIZE = 10
+MIN_COLLECT_INTERVAL_SECONDS = 9
+# 10 minutes should be more than enough to scrape metrics, and, in any case,
+# 10 minutes old metrics has little to no value
+METRICS_TTL_SECONDS = 600
+
+
+async def collect_prometheus_metrics():
+    now = get_current_datetime()
+    cutoff = now - timedelta(seconds=MIN_COLLECT_INTERVAL_SECONDS)
+    async with get_session_ctx() as session:
+        res = await session.execute(
+            select(JobModel)
+            .join(JobPrometheusMetrics, isouter=True)
+            .where(
+                JobModel.status.in_([JobStatus.RUNNING]),
+                or_(
+                    JobPrometheusMetrics.job_id.is_(None),
+                    JobPrometheusMetrics.collected_at < cutoff,
+                ),
+            )
+            .options(joinedload(JobModel.instance).joinedload(InstanceModel.project))
+            .order_by(JobModel.last_processed_at.asc())
+            .limit(MAX_JOBS_FETCHED)
+        )
+        job_models = res.unique().scalars().all()
+    for batch in batched(job_models, BATCH_SIZE):
+        await _collect_jobs_metrics(batch, now)
+
+
+async def delete_prometheus_metrics():
+    now = get_current_datetime()
+    cutoff = now - timedelta(seconds=METRICS_TTL_SECONDS)
+    async with get_session_ctx() as session:
+        await session.execute(
+            delete(JobPrometheusMetrics).where(JobPrometheusMetrics.collected_at < cutoff)
+        )
+        await session.commit()
+
+
+async def _collect_jobs_metrics(job_models: list[JobModel], collected_at: datetime):
+    results = await gather_map_async(job_models, _collect_job_metrics, return_exceptions=True)
+    async with get_session_ctx() as session:
+        for job_model, result in results:
+            if result is None:
+                continue
+            if isinstance(result, BaseException):
+                logger.error(
+                    "Failed to collect job %s Prometheus metrics: %r", job_model.job_name, result
+                )
+                continue
+            res = await session.execute(
+                update(JobPrometheusMetrics)
+                .where(JobPrometheusMetrics.job_id == job_model.id)
+                .values(
+                    collected_at=collected_at,
+                    text=result,
+                )
+                .returning(JobPrometheusMetrics)
+            )
+            metrics = res.scalar()
+            if metrics is None:
+                metrics = JobPrometheusMetrics(
+                    job_id=job_model.id,
+                    collected_at=collected_at,
+                    text=result,
+                )
+                try:
+                    async with session.begin_nested():
+                        session.add(metrics)
+                except sqlalchemy.exc.IntegrityError:
+                    # Concurrent server replica already committed, ignoring
+                    pass
+        await session.commit()
+
+
+async def _collect_job_metrics(job_model: JobModel) -> Optional[str]:
+    ssh_private_keys = get_instance_ssh_private_keys(get_or_error(job_model.instance))
+    jpd = get_job_provisioning_data(job_model)
+    jrd = get_job_runtime_data(job_model)
+    if jpd is None:
+        return None
+    try:
+        res = await run_async(
+            _pull_job_metrics,
+            ssh_private_keys,
+            jpd,
+            jrd,
+            job_model.id,
+        )
+    except Exception:
+        logger.exception("Failed to collect job %s Prometheus metrics", job_model.job_name)
+        return None
+
+    if isinstance(res, bool):
+        logger.warning(
+            "Failed to connect to job %s to collect Prometheus metrics", job_model.job_name
+        )
+        return None
+
+    if res is None:
+        # Either not supported by shim or exporter is not available
+        return None
+
+    return res
+
+
+@runner_ssh_tunnel(ports=[DSTACK_SHIM_HTTP_PORT], retries=1)
+def _pull_job_metrics(ports: dict[int, int], task_id: uuid.UUID) -> Optional[str]:
+    shim_client = client.ShimClient(port=ports[DSTACK_SHIM_HTTP_PORT])
+    return shim_client.get_task_metrics(task_id)
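
`_collect_jobs_metrics` above upserts one row per job while multiple server replicas may race on the same job: it tries `UPDATE ... RETURNING` first, and only if no row matched does it `INSERT` under a SAVEPOINT, swallowing the `IntegrityError` that a concurrent replica's committed insert would raise. A self-contained synchronous sketch of that pattern (toy model, not dstack's; assumes SQLAlchemy 2.x and SQLite ≥ 3.35 for RETURNING support):

```python
import sqlalchemy.exc
from sqlalchemy import Integer, String, create_engine, update
from sqlalchemy.orm import DeclarativeBase, Mapped, Session, mapped_column


class Base(DeclarativeBase):
    pass


class Metrics(Base):  # toy stand-in for JobPrometheusMetrics
    __tablename__ = "metrics"
    job_id: Mapped[int] = mapped_column(Integer, primary_key=True)
    text: Mapped[str] = mapped_column(String)


engine = create_engine("sqlite://")
Base.metadata.create_all(engine)

with Session(engine) as session:
    res = session.execute(
        update(Metrics).where(Metrics.job_id == 1).values(text="v2").returning(Metrics)
    )
    if res.scalar() is None:  # no existing row: insert under a savepoint
        try:
            with session.begin_nested():
                session.add(Metrics(job_id=1, text="v1"))
        except sqlalchemy.exc.IntegrityError:
            pass  # a concurrent writer won the race; keep their row
    session.commit()
```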

dstack/_internal/server/background/tasks/process_running_jobs.py
CHANGED

@@ -1,4 +1,6 @@
 import asyncio
+from collections.abc import Iterable
+from datetime import timedelta
 from typing import Dict, List, Optional

 from sqlalchemy import select
@@ -15,6 +17,7 @@ from dstack._internal.core.models.instances import (
     RemoteConnectionInfo,
     SSHConnectionParams,
 )
+from dstack._internal.core.models.metrics import Metric
 from dstack._internal.core.models.repos import RemoteRepoCreds
 from dstack._internal.core.models.runs import (
     ClusterInfo,
@@ -40,6 +43,7 @@ from dstack._internal.server.models import (
 from dstack._internal.server.schemas.runner import TaskStatus
 from dstack._internal.server.services import logs as logs_services
 from dstack._internal.server.services import services
+from dstack._internal.server.services.instances import get_instance_ssh_private_keys
 from dstack._internal.server.services.jobs import (
     find_job,
     get_job_attached_volumes,
@@ -48,7 +52,7 @@ from dstack._internal.server.services.jobs import (
 )
 from dstack._internal.server.services.locking import get_locker
 from dstack._internal.server.services.logging import fmt
-from dstack._internal.server.services.pools import get_instance_ssh_private_keys
+from dstack._internal.server.services.metrics import get_job_metrics
 from dstack._internal.server.services.repos import (
     get_code_model,
     get_repo_creds,
@@ -123,7 +127,7 @@ async def _process_running_job(session: AsyncSession, job_model: JobModel):
     run_model = res.unique().scalar_one()
     repo_model = run_model.repo
     project = run_model.project
-    run = run_model_to_run(run_model)
+    run = run_model_to_run(run_model, include_sensitive=True)
     job_submission = job_model_to_job_submission(job_model)
     job_provisioning_data = job_submission.job_provisioning_data
     if job_provisioning_data is None:
@@ -343,6 +347,9 @@ async def _process_running_job(session: AsyncSession, job_model: JobModel):
             job_model.status = JobStatus.TERMINATING
             job_model.termination_reason = JobTerminationReason.GATEWAY_ERROR

+    if job_model.status == JobStatus.RUNNING:
+        await _check_gpu_utilization(session, job_model, job)
+
     job_model.last_processed_at = common_utils.get_current_datetime()
     await session.commit()

@@ -646,27 +653,67 @@ def _terminate_if_inactivity_duration_exceeded(
     run_model: RunModel, job_model: JobModel, no_connections_secs: Optional[int]
 ) -> None:
     conf = RunSpec.__response__.parse_raw(run_model.run_spec).configuration
-    if is_core_model_instance(conf, DevEnvironmentConfiguration)
+    if not is_core_model_instance(conf, DevEnvironmentConfiguration) or not isinstance(
         conf.inactivity_duration, int
     ):
-
-        job_model.inactivity_secs =
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        # reset in case inactivity_duration was disabled via in-place update
+        job_model.inactivity_secs = None
+        return
+    logger.debug("%s: no SSH connections for %s seconds", fmt(job_model), no_connections_secs)
+    job_model.inactivity_secs = no_connections_secs
+    if no_connections_secs is None:
+        # TODO(0.19 or earlier): make no_connections_secs required
+        job_model.status = JobStatus.TERMINATING
+        job_model.termination_reason = JobTerminationReason.INTERRUPTED_BY_NO_CAPACITY
+        job_model.termination_reason_message = (
+            "The selected instance was created before dstack 0.18.41"
+            " and does not support inactivity_duration"
+        )
+    elif no_connections_secs >= conf.inactivity_duration:
+        job_model.status = JobStatus.TERMINATING
+        # TODO(0.19 or earlier): set JobTerminationReason.INACTIVITY_DURATION_EXCEEDED
+        job_model.termination_reason = JobTerminationReason.TERMINATED_BY_SERVER
+        job_model.termination_reason_message = (
+            f"The job was inactive for {no_connections_secs} seconds,"
+            f" exceeding the inactivity_duration of {conf.inactivity_duration} seconds"
+        )
+
+
+async def _check_gpu_utilization(session: AsyncSession, job_model: JobModel, job: Job) -> None:
+    policy = job.job_spec.utilization_policy
+    if policy is None:
+        return
+    after = common_utils.get_current_datetime() - timedelta(seconds=policy.time_window)
+    job_metrics = await get_job_metrics(session, job_model, after=after)
+    gpus_util_metrics: list[Metric] = []
+    for metric in job_metrics.metrics:
+        if metric.name.startswith("gpu_util_percent_gpu"):
+            gpus_util_metrics.append(metric)
+    if not gpus_util_metrics or gpus_util_metrics[0].timestamps[-1] > after + timedelta(minutes=1):
+        # Job has started recently, not enough points collected.
+        # Assuming that metrics collection interval less than 1 minute.
+        logger.debug("%s: GPU utilization check: not enough samples", fmt(job_model))
+        return
+    if _should_terminate_due_to_low_gpu_util(
+        policy.min_gpu_utilization, [m.values for m in gpus_util_metrics]
+    ):
+        logger.info("%s: GPU utilization check: terminating", fmt(job_model))
+        job_model.status = JobStatus.TERMINATING
+        # TODO(0.19 or earlier): set JobTerminationReason.TERMINATED_DUE_TO_UTILIZATION_POLICY
+        job_model.termination_reason = JobTerminationReason.TERMINATED_BY_SERVER
+        job_model.termination_reason_message = (
+            f"The job GPU utilization below {policy.min_gpu_utilization}%"
+            f" for {policy.time_window} seconds"
+        )
+    else:
+        logger.debug("%s: GPU utilization check: OK", fmt(job_model))
+
+
+def _should_terminate_due_to_low_gpu_util(min_util: int, gpus_util: Iterable[Iterable[int]]):
+    for gpu_util in gpus_util:
+        if all(util < min_util for util in gpu_util):
+            return True
+    return False


 def _get_cluster_info(
@@ -696,20 +743,29 @@


 async def _get_job_code(
-    session: AsyncSession, project: ProjectModel, repo: RepoModel, code_hash: str
+    session: AsyncSession, project: ProjectModel, repo: RepoModel, code_hash: Optional[str]
 ) -> bytes:
+    if code_hash is None:
+        return b""
     code_model = await get_code_model(session=session, repo=repo, code_hash=code_hash)
     if code_model is None:
         return b""
-    storage = get_default_storage()
-    if storage is None or code_model.blob is not None:
+    if code_model.blob is not None:
         return code_model.blob
+    storage = get_default_storage()
+    if storage is None:
+        return b""
     blob = await common_utils.run_async(
         storage.get_code,
         project.name,
         repo.name,
         code_hash,
     )
+    if blob is None:
+        logger.error(
+            "Failed to get repo code hash %s from storage for repo %s", code_hash, repo.name
+        )
+        return b""
     return blob

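
For clarity, the utilization predicate added above terminates a job only when some single GPU stayed below the threshold for every sample in the time window. Restated standalone with a quick check:

```python
from collections.abc import Iterable


def _should_terminate_due_to_low_gpu_util(min_util: int, gpus_util: Iterable[Iterable[int]]):
    # True if at least one GPU never reached min_util within the window.
    for gpu_util in gpus_util:
        if all(util < min_util for util in gpu_util):
            return True
    return False


# GPU 0 idle the whole window -> terminate, even though GPU 1 is busy.
assert _should_terminate_due_to_low_gpu_util(10, [[0, 0, 5], [90, 80, 70]]) is True
# GPU 0 spiked once -> keep the job.
assert _should_terminate_due_to_low_gpu_util(10, [[0, 50, 5], [90, 80, 70]]) is False
```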