dstack-0.18.43-py3-none-any.whl → dstack-0.18.44-py3-none-any.whl
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dstack might be problematic. Click here for more details.
- dstack/_internal/cli/services/configurators/run.py +1 -0
- dstack/_internal/cli/utils/run.py +11 -0
- dstack/_internal/core/backends/aws/compute.py +1 -0
- dstack/_internal/core/backends/azure/compute.py +1 -1
- dstack/_internal/core/backends/gcp/compute.py +1 -1
- dstack/_internal/core/backends/runpod/compute.py +21 -3
- dstack/_internal/core/backends/runpod/config.py +8 -0
- dstack/_internal/core/models/backends/runpod.py +2 -0
- dstack/_internal/core/models/configurations.py +2 -1
- dstack/_internal/core/models/profiles.py +46 -1
- dstack/_internal/core/models/runs.py +4 -0
- dstack/_internal/server/app.py +11 -1
- dstack/_internal/server/background/__init__.py +10 -0
- dstack/_internal/server/background/tasks/process_placement_groups.py +1 -0
- dstack/_internal/server/background/tasks/process_prometheus_metrics.py +135 -0
- dstack/_internal/server/background/tasks/process_running_jobs.py +66 -19
- dstack/_internal/server/background/tasks/process_runs.py +1 -0
- dstack/_internal/server/background/tasks/process_submitted_jobs.py +4 -1
- dstack/_internal/server/migrations/versions/60e444118b6d_add_jobprometheusmetrics.py +40 -0
- dstack/_internal/server/migrations/versions/98d1b92988bc_add_jobterminationreason_terminated_due_.py +140 -0
- dstack/_internal/server/models.py +11 -0
- dstack/_internal/server/routers/metrics.py +21 -2
- dstack/_internal/server/routers/prometheus.py +36 -0
- dstack/_internal/server/security/permissions.py +1 -1
- dstack/_internal/server/services/backends/configurators/runpod.py +3 -33
- dstack/_internal/server/services/config.py +13 -3
- dstack/_internal/server/services/fleets.py +1 -0
- dstack/_internal/server/services/gateways/__init__.py +1 -0
- dstack/_internal/server/services/jobs/configurators/base.py +9 -1
- dstack/_internal/server/services/metrics.py +103 -70
- dstack/_internal/server/services/prometheus.py +87 -0
- dstack/_internal/server/services/runner/client.py +14 -3
- dstack/_internal/server/services/runs.py +43 -15
- dstack/_internal/server/services/volumes.py +1 -0
- dstack/_internal/server/settings.py +3 -0
- dstack/_internal/server/statics/index.html +1 -1
- dstack/_internal/server/statics/{main-fe8fd9db55df8d10e648.js → main-4eb116b97819badd1e2c.js} +66 -13
- dstack/_internal/server/statics/{main-fe8fd9db55df8d10e648.js.map → main-4eb116b97819badd1e2c.js.map} +1 -1
- dstack/_internal/server/statics/{main-7510e71dfa9749a4e70e.css → main-da9f8c06a69c20dac23e.css} +1 -1
- dstack/_internal/server/statics/static/media/entraID.d65d1f3e9486a8e56d24fc07b3230885.svg +9 -0
- dstack/_internal/server/testing/common.py +17 -0
- dstack/api/_public/runs.py +3 -0
- dstack/api/server/_fleets.py +2 -0
- dstack/api/server/_runs.py +4 -0
- dstack/api/utils.py +3 -0
- dstack/version.py +1 -1
- {dstack-0.18.43.dist-info → dstack-0.18.44.dist-info}/METADATA +10 -1
- {dstack-0.18.43.dist-info → dstack-0.18.44.dist-info}/RECORD +59 -50
- tests/_internal/server/background/tasks/test_process_prometheus_metrics.py +189 -0
- tests/_internal/server/background/tasks/test_process_running_jobs.py +125 -0
- tests/_internal/server/routers/test_fleets.py +2 -0
- tests/_internal/server/routers/test_metrics.py +15 -0
- tests/_internal/server/routers/test_prometheus.py +244 -0
- tests/_internal/server/routers/test_runs.py +79 -56
- tests/_internal/server/services/test_metrics.py +163 -0
- {dstack-0.18.43.dist-info → dstack-0.18.44.dist-info}/LICENSE.md +0 -0
- {dstack-0.18.43.dist-info → dstack-0.18.44.dist-info}/WHEEL +0 -0
- {dstack-0.18.43.dist-info → dstack-0.18.44.dist-info}/entry_points.txt +0 -0
- {dstack-0.18.43.dist-info → dstack-0.18.44.dist-info}/top_level.txt +0 -0
|
@@ -13,7 +13,13 @@ from sqlalchemy.ext.asyncio import AsyncSession
|
|
|
13
13
|
|
|
14
14
|
from dstack._internal.core.models.backends.base import BackendType
|
|
15
15
|
from dstack._internal.core.models.common import ApplyAction
|
|
16
|
-
from dstack._internal.core.models.configurations import
|
|
16
|
+
from dstack._internal.core.models.configurations import (
|
|
17
|
+
AnyRunConfiguration,
|
|
18
|
+
DevEnvironmentConfiguration,
|
|
19
|
+
ScalingSpec,
|
|
20
|
+
ServiceConfiguration,
|
|
21
|
+
TaskConfiguration,
|
|
22
|
+
)
|
|
17
23
|
from dstack._internal.core.models.gateways import GatewayStatus
|
|
18
24
|
from dstack._internal.core.models.instances import (
|
|
19
25
|
InstanceAvailability,
|
|
@@ -126,6 +132,7 @@ def get_dev_env_run_plan_dict(
|
|
|
126
132
|
"idle_duration": None,
|
|
127
133
|
"termination_idle_time": 300,
|
|
128
134
|
"termination_policy": None,
|
|
135
|
+
"utilization_policy": None,
|
|
129
136
|
"reservation": None,
|
|
130
137
|
},
|
|
131
138
|
"configuration_path": "dstack.yaml",
|
|
@@ -148,6 +155,7 @@ def get_dev_env_run_plan_dict(
|
|
|
148
155
|
"idle_duration": None,
|
|
149
156
|
"termination_idle_time": 300,
|
|
150
157
|
"termination_policy": None,
|
|
158
|
+
"utilization_policy": None,
|
|
151
159
|
"reservation": None,
|
|
152
160
|
},
|
|
153
161
|
"repo_code_hash": None,
|
|
@@ -190,6 +198,7 @@ def get_dev_env_run_plan_dict(
|
|
|
190
198
|
"single_branch": False,
|
|
191
199
|
"max_duration": None,
|
|
192
200
|
"stop_duration": 300,
|
|
201
|
+
"utilization_policy": None,
|
|
193
202
|
"registry_auth": None,
|
|
194
203
|
"requirements": {
|
|
195
204
|
"resources": {
|
|
@@ -283,6 +292,7 @@ def get_dev_env_run_dict(
|
|
|
283
292
|
"idle_duration": None,
|
|
284
293
|
"termination_idle_time": 300,
|
|
285
294
|
"termination_policy": None,
|
|
295
|
+
"utilization_policy": None,
|
|
286
296
|
"reservation": None,
|
|
287
297
|
},
|
|
288
298
|
"configuration_path": "dstack.yaml",
|
|
@@ -305,6 +315,7 @@ def get_dev_env_run_dict(
|
|
|
305
315
|
"idle_duration": None,
|
|
306
316
|
"termination_idle_time": 300,
|
|
307
317
|
"termination_policy": None,
|
|
318
|
+
"utilization_policy": None,
|
|
308
319
|
"reservation": None,
|
|
309
320
|
},
|
|
310
321
|
"repo_code_hash": None,
|
|
@@ -347,6 +358,7 @@ def get_dev_env_run_dict(
|
|
|
347
358
|
"single_branch": False,
|
|
348
359
|
"max_duration": None,
|
|
349
360
|
"stop_duration": 300,
|
|
361
|
+
"utilization_policy": None,
|
|
350
362
|
"registry_auth": None,
|
|
351
363
|
"requirements": {
|
|
352
364
|
"resources": {
|
|
@@ -891,66 +903,77 @@ class TestGetRunPlan:
|
|
|
891
903
|
|
|
892
904
|
@pytest.mark.asyncio
|
|
893
905
|
@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True)
|
|
894
|
-
|
|
895
|
-
|
|
896
|
-
|
|
897
|
-
|
|
898
|
-
|
|
899
|
-
|
|
900
|
-
|
|
901
|
-
|
|
902
|
-
|
|
903
|
-
|
|
904
|
-
|
|
905
|
-
|
|
906
|
-
|
|
907
|
-
|
|
908
|
-
|
|
909
|
-
|
|
910
|
-
|
|
906
|
+
@pytest.mark.parametrize(
|
|
907
|
+
("old_conf", "new_conf", "action"),
|
|
908
|
+
[
|
|
909
|
+
pytest.param(
|
|
910
|
+
ServiceConfiguration(
|
|
911
|
+
commands=["one", "two"],
|
|
912
|
+
port=80,
|
|
913
|
+
replicas=1,
|
|
914
|
+
scaling=None,
|
|
915
|
+
),
|
|
916
|
+
ServiceConfiguration(
|
|
917
|
+
commands=["one", "two"],
|
|
918
|
+
port=80,
|
|
919
|
+
replicas="2..4",
|
|
920
|
+
scaling=ScalingSpec(metric="rps", target=5),
|
|
921
|
+
),
|
|
922
|
+
"update",
|
|
923
|
+
id="update-service",
|
|
911
924
|
),
|
|
912
|
-
|
|
913
|
-
|
|
914
|
-
|
|
915
|
-
|
|
916
|
-
|
|
917
|
-
|
|
918
|
-
|
|
919
|
-
|
|
920
|
-
|
|
921
|
-
|
|
922
|
-
|
|
923
|
-
|
|
924
|
-
|
|
925
|
-
|
|
926
|
-
|
|
927
|
-
|
|
928
|
-
|
|
929
|
-
|
|
930
|
-
|
|
931
|
-
|
|
932
|
-
|
|
933
|
-
|
|
934
|
-
|
|
935
|
-
|
|
936
|
-
|
|
937
|
-
|
|
925
|
+
pytest.param(
|
|
926
|
+
ServiceConfiguration(
|
|
927
|
+
commands=["one", "two"],
|
|
928
|
+
port=80,
|
|
929
|
+
replicas=1,
|
|
930
|
+
scaling=None,
|
|
931
|
+
),
|
|
932
|
+
ServiceConfiguration(
|
|
933
|
+
commands=["one", "two"],
|
|
934
|
+
port=8080, # not updatable
|
|
935
|
+
replicas="2..4",
|
|
936
|
+
scaling=ScalingSpec(metric="rps", target=5),
|
|
937
|
+
),
|
|
938
|
+
"create",
|
|
939
|
+
id="no-update-service",
|
|
940
|
+
),
|
|
941
|
+
pytest.param(
|
|
942
|
+
DevEnvironmentConfiguration(ide="vscode", inactivity_duration=False),
|
|
943
|
+
DevEnvironmentConfiguration(ide="vscode", inactivity_duration="30m"),
|
|
944
|
+
"update",
|
|
945
|
+
id="update-dev-env",
|
|
946
|
+
),
|
|
947
|
+
pytest.param(
|
|
948
|
+
TaskConfiguration(image="test-image-1"),
|
|
949
|
+
TaskConfiguration(image="test-image-2"),
|
|
950
|
+
"create",
|
|
951
|
+
id="no-update-task",
|
|
952
|
+
),
|
|
953
|
+
pytest.param(
|
|
954
|
+
DevEnvironmentConfiguration(ide="vscode", image="test-image"),
|
|
955
|
+
TaskConfiguration(image="test-image"),
|
|
956
|
+
"create",
|
|
957
|
+
id="no-update-on-type-change",
|
|
958
|
+
),
|
|
959
|
+
],
|
|
960
|
+
)
|
|
961
|
+
async def test_returns_update_or_create_action_on_conf_change(
|
|
962
|
+
self,
|
|
963
|
+
test_db,
|
|
964
|
+
session: AsyncSession,
|
|
965
|
+
client: AsyncClient,
|
|
966
|
+
old_conf: AnyRunConfiguration,
|
|
967
|
+
new_conf: AnyRunConfiguration,
|
|
968
|
+
action: str,
|
|
969
|
+
) -> None:
|
|
938
970
|
user = await create_user(session=session, global_role=GlobalRole.USER)
|
|
939
971
|
project = await create_project(session=session, owner=user)
|
|
940
972
|
await add_project_member(
|
|
941
973
|
session=session, project=project, user=user, project_role=ProjectRole.USER
|
|
942
974
|
)
|
|
943
975
|
repo = await create_repo(session=session, project_id=project.id)
|
|
944
|
-
run_spec = get_run_spec(
|
|
945
|
-
run_name="test-service",
|
|
946
|
-
repo_id=repo.name,
|
|
947
|
-
configuration=ServiceConfiguration(
|
|
948
|
-
type="service",
|
|
949
|
-
commands=["one", "two"],
|
|
950
|
-
port=80,
|
|
951
|
-
replicas=1,
|
|
952
|
-
),
|
|
953
|
-
)
|
|
976
|
+
run_spec = get_run_spec(run_name="test-run", repo_id=repo.name, configuration=old_conf)
|
|
954
977
|
run_model = await create_run(
|
|
955
978
|
session=session,
|
|
956
979
|
project=project,
|
|
@@ -960,7 +983,7 @@ class TestGetRunPlan:
|
|
|
960
983
|
run_spec=run_spec,
|
|
961
984
|
)
|
|
962
985
|
run = run_model_to_run(run_model)
|
|
963
|
-
run_spec.configuration
|
|
986
|
+
run_spec.configuration = new_conf
|
|
964
987
|
response = await client.post(
|
|
965
988
|
f"/api/project/{project.name}/runs/get_plan",
|
|
966
989
|
headers=get_auth_headers(user.token),
|
|
@@ -968,7 +991,7 @@ class TestGetRunPlan:
|
|
|
968
991
|
)
|
|
969
992
|
assert response.status_code == 200
|
|
970
993
|
response_json = response.json()
|
|
971
|
-
assert response_json["action"] ==
|
|
994
|
+
assert response_json["action"] == action
|
|
972
995
|
assert response_json["current_resource"] == json.loads(run.json())
|
|
973
996
|
|
|
974
997
|
|
|
@@ -0,0 +1,163 @@
|
|
|
1
|
+
from datetime import datetime, timedelta, timezone
|
|
2
|
+
|
|
3
|
+
import pytest
|
|
4
|
+
from sqlalchemy.ext.asyncio import AsyncSession
|
|
5
|
+
|
|
6
|
+
from dstack._internal.core.models.metrics import Metric
|
|
7
|
+
from dstack._internal.server.services.metrics import get_job_metrics
|
|
8
|
+
from dstack._internal.server.testing.common import (
|
|
9
|
+
create_job,
|
|
10
|
+
create_job_metrics_point,
|
|
11
|
+
create_project,
|
|
12
|
+
create_repo,
|
|
13
|
+
create_run,
|
|
14
|
+
create_user,
|
|
15
|
+
)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
@pytest.mark.asyncio
|
|
19
|
+
@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True)
|
|
20
|
+
@pytest.mark.usefixtures("test_db", "image_config_mock")
|
|
21
|
+
class TestGetMetrics:
|
|
22
|
+
latest_ts = datetime(2023, 1, 2, 3, 4, 25, tzinfo=timezone.utc)
|
|
23
|
+
ts: tuple[datetime, ...] = (
|
|
24
|
+
latest_ts, # 0
|
|
25
|
+
latest_ts - timedelta(seconds=10), # 1
|
|
26
|
+
latest_ts - timedelta(seconds=20), # 2
|
|
27
|
+
latest_ts - timedelta(seconds=30), # 3
|
|
28
|
+
latest_ts - timedelta(seconds=40), # 4
|
|
29
|
+
latest_ts - timedelta(seconds=50), # 5
|
|
30
|
+
)
|
|
31
|
+
# dt, cpu_usage_sec, memory_usage_bytes, memory_ws_bytes, gpu0_memory_usage_bytes, gpu0_util,
|
|
32
|
+
# gpu1_memory_usage_bytes, gpu1_util
|
|
33
|
+
points: tuple[tuple[datetime, int, int, int, int, int, int, int], ...] = (
|
|
34
|
+
(ts[0], 110, 512, 128, 768, 15, 128, 20),
|
|
35
|
+
(ts[1], 104, 1024, 512, 1024, 10, 256, 10),
|
|
36
|
+
(ts[2], 100, 1024, 512, 1024, 20, 128, 5),
|
|
37
|
+
(ts[3], 90, 512, 512, 2048, 40, 512, 20),
|
|
38
|
+
(ts[4], 90, 1024, 1024, 1024, 0, 128, 0),
|
|
39
|
+
(ts[5], 80, 512, 512, 1024, 10, 256, 0),
|
|
40
|
+
)
|
|
41
|
+
|
|
42
|
+
@pytest.mark.parametrize(
|
|
43
|
+
["params", "ts", "cpu", "mem", "mem_ws", "gpu0_mem", "gpu0_util", "gpu1_mem", "gpu1_util"],
|
|
44
|
+
[
|
|
45
|
+
pytest.param(
|
|
46
|
+
{"limit": 1},
|
|
47
|
+
[ts[0]],
|
|
48
|
+
[60],
|
|
49
|
+
[512],
|
|
50
|
+
[128],
|
|
51
|
+
[768],
|
|
52
|
+
[15],
|
|
53
|
+
[128],
|
|
54
|
+
[20],
|
|
55
|
+
id="limit-1-latest",
|
|
56
|
+
),
|
|
57
|
+
pytest.param(
|
|
58
|
+
{"limit": 3},
|
|
59
|
+
[ts[0], ts[1], ts[2]],
|
|
60
|
+
[60, 40, 100],
|
|
61
|
+
[512, 1024, 1024],
|
|
62
|
+
[128, 512, 512],
|
|
63
|
+
[768, 1024, 1024],
|
|
64
|
+
[15, 10, 20],
|
|
65
|
+
[128, 256, 128],
|
|
66
|
+
[20, 10, 5],
|
|
67
|
+
id="limit-3-latest",
|
|
68
|
+
),
|
|
69
|
+
pytest.param(
|
|
70
|
+
{},
|
|
71
|
+
[ts[0], ts[1], ts[2], ts[3], ts[4]],
|
|
72
|
+
[60, 40, 100, 0, 100],
|
|
73
|
+
[512, 1024, 1024, 512, 1024],
|
|
74
|
+
[128, 512, 512, 512, 1024],
|
|
75
|
+
[768, 1024, 1024, 2048, 1024],
|
|
76
|
+
[15, 10, 20, 40, 0],
|
|
77
|
+
[128, 256, 128, 512, 128],
|
|
78
|
+
[20, 10, 5, 20, 0],
|
|
79
|
+
id="all",
|
|
80
|
+
),
|
|
81
|
+
pytest.param(
|
|
82
|
+
{"after": ts[3]},
|
|
83
|
+
[ts[0], ts[1], ts[2]],
|
|
84
|
+
[60, 40, 100],
|
|
85
|
+
[512, 1024, 1024],
|
|
86
|
+
[128, 512, 512],
|
|
87
|
+
[768, 1024, 1024],
|
|
88
|
+
[15, 10, 20],
|
|
89
|
+
[128, 256, 128],
|
|
90
|
+
[20, 10, 5],
|
|
91
|
+
id="all-after",
|
|
92
|
+
),
|
|
93
|
+
pytest.param(
|
|
94
|
+
{"before": ts[2]},
|
|
95
|
+
[ts[3], ts[4]],
|
|
96
|
+
[0, 100],
|
|
97
|
+
[512, 1024],
|
|
98
|
+
[512, 1024],
|
|
99
|
+
[2048, 1024],
|
|
100
|
+
[40, 0],
|
|
101
|
+
[512, 128],
|
|
102
|
+
[20, 0],
|
|
103
|
+
id="all-before",
|
|
104
|
+
),
|
|
105
|
+
],
|
|
106
|
+
)
|
|
107
|
+
async def test_get_metrics(
|
|
108
|
+
self,
|
|
109
|
+
session: AsyncSession,
|
|
110
|
+
params: dict,
|
|
111
|
+
ts: list[datetime],
|
|
112
|
+
cpu: list[int],
|
|
113
|
+
mem: list[int],
|
|
114
|
+
mem_ws: list[int],
|
|
115
|
+
gpu0_mem: list[int],
|
|
116
|
+
gpu0_util: list[int],
|
|
117
|
+
gpu1_mem: list[int],
|
|
118
|
+
gpu1_util: list[int],
|
|
119
|
+
):
|
|
120
|
+
user = await create_user(session=session)
|
|
121
|
+
project = await create_project(session=session, owner=user)
|
|
122
|
+
repo = await create_repo(
|
|
123
|
+
session=session,
|
|
124
|
+
project_id=project.id,
|
|
125
|
+
)
|
|
126
|
+
run = await create_run(
|
|
127
|
+
session=session,
|
|
128
|
+
project=project,
|
|
129
|
+
repo=repo,
|
|
130
|
+
user=user,
|
|
131
|
+
)
|
|
132
|
+
job = await create_job(
|
|
133
|
+
session=session,
|
|
134
|
+
run=run,
|
|
135
|
+
)
|
|
136
|
+
for dt, _cpu, _mem, _mem_ws, _gpu0_mem, _gpu0_util, _gpu1_mem, _gpu1_util in self.points:
|
|
137
|
+
await create_job_metrics_point(
|
|
138
|
+
session=session,
|
|
139
|
+
job_model=job,
|
|
140
|
+
timestamp=dt,
|
|
141
|
+
cpu_usage_micro=_cpu * 1_000_000,
|
|
142
|
+
memory_usage_bytes=_mem,
|
|
143
|
+
memory_working_set_bytes=_mem_ws,
|
|
144
|
+
gpus_memory_usage_bytes=[_gpu0_mem, _gpu1_mem],
|
|
145
|
+
gpus_util_percent=[_gpu0_util, _gpu1_util],
|
|
146
|
+
)
|
|
147
|
+
|
|
148
|
+
metrics = await get_job_metrics(session, job, **params)
|
|
149
|
+
|
|
150
|
+
assert metrics.metrics == [
|
|
151
|
+
Metric(name="cpu_usage_percent", timestamps=ts, values=cpu),
|
|
152
|
+
Metric(name="memory_usage_bytes", timestamps=ts, values=mem),
|
|
153
|
+
Metric(name="memory_working_set_bytes", timestamps=ts, values=mem_ws),
|
|
154
|
+
Metric(
|
|
155
|
+
name="gpus_detected_num",
|
|
156
|
+
timestamps=[ts[0], ts[-1]] if len(ts) > 1 else ts,
|
|
157
|
+
values=[2, 2] if len(ts) > 1 else [2],
|
|
158
|
+
),
|
|
159
|
+
Metric(name="gpu_memory_usage_bytes_gpu0", timestamps=ts, values=gpu0_mem),
|
|
160
|
+
Metric(name="gpu_memory_usage_bytes_gpu1", timestamps=ts, values=gpu1_mem),
|
|
161
|
+
Metric(name="gpu_util_percent_gpu0", timestamps=ts, values=gpu0_util),
|
|
162
|
+
Metric(name="gpu_util_percent_gpu1", timestamps=ts, values=gpu1_util),
|
|
163
|
+
]
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|