dstack 0.18.43__py3-none-any.whl → 0.18.44__py3-none-any.whl
This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dstack might be problematic. More details were linked from the original page (the link target is not preserved in this text).
- dstack/_internal/cli/services/configurators/run.py +1 -0
- dstack/_internal/cli/utils/run.py +11 -0
- dstack/_internal/core/backends/aws/compute.py +1 -0
- dstack/_internal/core/backends/azure/compute.py +1 -1
- dstack/_internal/core/backends/gcp/compute.py +1 -1
- dstack/_internal/core/backends/runpod/compute.py +21 -3
- dstack/_internal/core/backends/runpod/config.py +8 -0
- dstack/_internal/core/models/backends/runpod.py +2 -0
- dstack/_internal/core/models/configurations.py +2 -1
- dstack/_internal/core/models/profiles.py +46 -1
- dstack/_internal/core/models/runs.py +4 -0
- dstack/_internal/server/app.py +11 -1
- dstack/_internal/server/background/__init__.py +10 -0
- dstack/_internal/server/background/tasks/process_placement_groups.py +1 -0
- dstack/_internal/server/background/tasks/process_prometheus_metrics.py +135 -0
- dstack/_internal/server/background/tasks/process_running_jobs.py +66 -19
- dstack/_internal/server/background/tasks/process_runs.py +1 -0
- dstack/_internal/server/background/tasks/process_submitted_jobs.py +4 -1
- dstack/_internal/server/migrations/versions/60e444118b6d_add_jobprometheusmetrics.py +40 -0
- dstack/_internal/server/migrations/versions/98d1b92988bc_add_jobterminationreason_terminated_due_.py +140 -0
- dstack/_internal/server/models.py +11 -0
- dstack/_internal/server/routers/metrics.py +21 -2
- dstack/_internal/server/routers/prometheus.py +36 -0
- dstack/_internal/server/security/permissions.py +1 -1
- dstack/_internal/server/services/backends/configurators/runpod.py +3 -33
- dstack/_internal/server/services/config.py +13 -3
- dstack/_internal/server/services/fleets.py +1 -0
- dstack/_internal/server/services/gateways/__init__.py +1 -0
- dstack/_internal/server/services/jobs/configurators/base.py +9 -1
- dstack/_internal/server/services/metrics.py +103 -70
- dstack/_internal/server/services/prometheus.py +87 -0
- dstack/_internal/server/services/runner/client.py +14 -3
- dstack/_internal/server/services/runs.py +43 -15
- dstack/_internal/server/services/volumes.py +1 -0
- dstack/_internal/server/settings.py +3 -0
- dstack/_internal/server/statics/index.html +1 -1
- dstack/_internal/server/statics/{main-fe8fd9db55df8d10e648.js → main-4eb116b97819badd1e2c.js} +66 -13
- dstack/_internal/server/statics/{main-fe8fd9db55df8d10e648.js.map → main-4eb116b97819badd1e2c.js.map} +1 -1
- dstack/_internal/server/statics/{main-7510e71dfa9749a4e70e.css → main-da9f8c06a69c20dac23e.css} +1 -1
- dstack/_internal/server/statics/static/media/entraID.d65d1f3e9486a8e56d24fc07b3230885.svg +9 -0
- dstack/_internal/server/testing/common.py +17 -0
- dstack/api/_public/runs.py +3 -0
- dstack/api/server/_fleets.py +2 -0
- dstack/api/server/_runs.py +4 -0
- dstack/api/utils.py +3 -0
- dstack/version.py +1 -1
- {dstack-0.18.43.dist-info → dstack-0.18.44.dist-info}/METADATA +10 -1
- {dstack-0.18.43.dist-info → dstack-0.18.44.dist-info}/RECORD +59 -50
- tests/_internal/server/background/tasks/test_process_prometheus_metrics.py +189 -0
- tests/_internal/server/background/tasks/test_process_running_jobs.py +125 -0
- tests/_internal/server/routers/test_fleets.py +2 -0
- tests/_internal/server/routers/test_metrics.py +15 -0
- tests/_internal/server/routers/test_prometheus.py +244 -0
- tests/_internal/server/routers/test_runs.py +79 -56
- tests/_internal/server/services/test_metrics.py +163 -0
- {dstack-0.18.43.dist-info → dstack-0.18.44.dist-info}/LICENSE.md +0 -0
- {dstack-0.18.43.dist-info → dstack-0.18.44.dist-info}/WHEEL +0 -0
- {dstack-0.18.43.dist-info → dstack-0.18.44.dist-info}/entry_points.txt +0 -0
- {dstack-0.18.43.dist-info → dstack-0.18.44.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,189 @@
|
|
|
1
|
+
from collections.abc import Generator
|
|
2
|
+
from datetime import datetime, timezone
|
|
3
|
+
from unittest.mock import Mock, patch
|
|
4
|
+
|
|
5
|
+
import pytest
|
|
6
|
+
import pytest_asyncio
|
|
7
|
+
from freezegun import freeze_time
|
|
8
|
+
from sqlalchemy import select
|
|
9
|
+
from sqlalchemy.ext.asyncio import AsyncSession
|
|
10
|
+
|
|
11
|
+
from dstack._internal.core.models.instances import InstanceStatus
|
|
12
|
+
from dstack._internal.core.models.runs import JobStatus
|
|
13
|
+
from dstack._internal.core.models.users import GlobalRole, ProjectRole
|
|
14
|
+
from dstack._internal.server.background.tasks.process_prometheus_metrics import (
|
|
15
|
+
collect_prometheus_metrics,
|
|
16
|
+
delete_prometheus_metrics,
|
|
17
|
+
)
|
|
18
|
+
from dstack._internal.server.models import JobModel, JobPrometheusMetrics
|
|
19
|
+
from dstack._internal.server.services.projects import add_project_member
|
|
20
|
+
from dstack._internal.server.testing.common import (
|
|
21
|
+
create_instance,
|
|
22
|
+
create_job,
|
|
23
|
+
create_job_prometheus_metrics,
|
|
24
|
+
create_pool,
|
|
25
|
+
create_project,
|
|
26
|
+
create_repo,
|
|
27
|
+
create_run,
|
|
28
|
+
create_user,
|
|
29
|
+
get_job_provisioning_data,
|
|
30
|
+
)
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
@pytest.mark.asyncio
|
|
34
|
+
@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True)
|
|
35
|
+
@pytest.mark.usefixtures("test_db", "image_config_mock")
|
|
36
|
+
class TestCollectPrometheusMetrics:
|
|
37
|
+
@pytest_asyncio.fixture
|
|
38
|
+
async def job(self, session: AsyncSession) -> JobModel:
|
|
39
|
+
user = await create_user(session=session, global_role=GlobalRole.USER)
|
|
40
|
+
project = await create_project(session=session, owner=user)
|
|
41
|
+
await add_project_member(
|
|
42
|
+
session=session, project=project, user=user, project_role=ProjectRole.USER
|
|
43
|
+
)
|
|
44
|
+
repo = await create_repo(
|
|
45
|
+
session=session,
|
|
46
|
+
project_id=project.id,
|
|
47
|
+
)
|
|
48
|
+
pool = await create_pool(session=session, project=project)
|
|
49
|
+
instance = await create_instance(
|
|
50
|
+
session=session,
|
|
51
|
+
project=project,
|
|
52
|
+
pool=pool,
|
|
53
|
+
status=InstanceStatus.BUSY,
|
|
54
|
+
)
|
|
55
|
+
run = await create_run(
|
|
56
|
+
session=session,
|
|
57
|
+
project=project,
|
|
58
|
+
repo=repo,
|
|
59
|
+
user=user,
|
|
60
|
+
)
|
|
61
|
+
job = await create_job(
|
|
62
|
+
session=session,
|
|
63
|
+
run=run,
|
|
64
|
+
status=JobStatus.RUNNING,
|
|
65
|
+
job_provisioning_data=get_job_provisioning_data(),
|
|
66
|
+
instance_assigned=True,
|
|
67
|
+
instance=instance,
|
|
68
|
+
)
|
|
69
|
+
return job
|
|
70
|
+
|
|
71
|
+
@pytest.fixture
|
|
72
|
+
def ssh_tunnel_mock(self) -> Generator[Mock, None, None]:
|
|
73
|
+
with patch("dstack._internal.server.services.runner.ssh.SSHTunnel") as SSHTunnelMock:
|
|
74
|
+
yield SSHTunnelMock
|
|
75
|
+
|
|
76
|
+
@pytest.fixture
|
|
77
|
+
def shim_client_mock(self) -> Generator[Mock, None, None]:
|
|
78
|
+
with patch("dstack._internal.server.services.runner.client.ShimClient") as ShimClientMock:
|
|
79
|
+
yield ShimClientMock.return_value
|
|
80
|
+
|
|
81
|
+
@freeze_time(datetime(2023, 1, 2, 3, 5, 20, tzinfo=timezone.utc))
|
|
82
|
+
async def test_inserts_new_record(
|
|
83
|
+
self, session: AsyncSession, job: JobModel, ssh_tunnel_mock: Mock, shim_client_mock: Mock
|
|
84
|
+
):
|
|
85
|
+
shim_client_mock.get_task_metrics.return_value = "# prom response"
|
|
86
|
+
|
|
87
|
+
await collect_prometheus_metrics()
|
|
88
|
+
|
|
89
|
+
ssh_tunnel_mock.assert_called_once()
|
|
90
|
+
shim_client_mock.get_task_metrics.assert_called_once()
|
|
91
|
+
res = await session.execute(
|
|
92
|
+
select(JobPrometheusMetrics).where(JobPrometheusMetrics.job_id == job.id)
|
|
93
|
+
)
|
|
94
|
+
metrics = res.scalar_one()
|
|
95
|
+
assert metrics.text == "# prom response"
|
|
96
|
+
assert metrics.collected_at == datetime(2023, 1, 2, 3, 5, 20)
|
|
97
|
+
|
|
98
|
+
@freeze_time(datetime(2023, 1, 2, 3, 5, 20, tzinfo=timezone.utc))
|
|
99
|
+
async def test_updates_record(
|
|
100
|
+
self, session: AsyncSession, job: JobModel, ssh_tunnel_mock: Mock, shim_client_mock: Mock
|
|
101
|
+
):
|
|
102
|
+
metrics = await create_job_prometheus_metrics(
|
|
103
|
+
session=session,
|
|
104
|
+
job=job,
|
|
105
|
+
collected_at=datetime(2023, 1, 2, 3, 5, 0),
|
|
106
|
+
text="# prom old response",
|
|
107
|
+
)
|
|
108
|
+
shim_client_mock.get_task_metrics.return_value = "# prom new response"
|
|
109
|
+
|
|
110
|
+
await collect_prometheus_metrics()
|
|
111
|
+
|
|
112
|
+
ssh_tunnel_mock.assert_called_once()
|
|
113
|
+
shim_client_mock.get_task_metrics.assert_called_once()
|
|
114
|
+
res = await session.execute(
|
|
115
|
+
select(JobPrometheusMetrics)
|
|
116
|
+
.where(JobPrometheusMetrics.job_id == job.id)
|
|
117
|
+
.execution_options(populate_existing=True)
|
|
118
|
+
)
|
|
119
|
+
metrics = res.scalar_one()
|
|
120
|
+
assert metrics.text == "# prom new response"
|
|
121
|
+
assert metrics.collected_at == datetime(2023, 1, 2, 3, 5, 20)
|
|
122
|
+
|
|
123
|
+
@freeze_time(datetime(2023, 1, 2, 3, 5, 20, tzinfo=timezone.utc))
|
|
124
|
+
async def test_skips_recently_updated(
|
|
125
|
+
self, session: AsyncSession, job: JobModel, ssh_tunnel_mock: Mock, shim_client_mock: Mock
|
|
126
|
+
):
|
|
127
|
+
metrics = await create_job_prometheus_metrics(
|
|
128
|
+
session=session,
|
|
129
|
+
job=job,
|
|
130
|
+
collected_at=datetime(2023, 1, 2, 3, 5, 15),
|
|
131
|
+
text="# prom old response",
|
|
132
|
+
)
|
|
133
|
+
shim_client_mock.get_task_metrics.return_value = "# prom new response"
|
|
134
|
+
|
|
135
|
+
await collect_prometheus_metrics()
|
|
136
|
+
|
|
137
|
+
ssh_tunnel_mock.assert_not_called()
|
|
138
|
+
shim_client_mock.get_task_metrics.assert_not_called()
|
|
139
|
+
res = await session.execute(
|
|
140
|
+
select(JobPrometheusMetrics)
|
|
141
|
+
.where(JobPrometheusMetrics.job_id == job.id)
|
|
142
|
+
.execution_options(populate_existing=True)
|
|
143
|
+
)
|
|
144
|
+
metrics = res.scalar_one()
|
|
145
|
+
assert metrics.text == "# prom old response"
|
|
146
|
+
assert metrics.collected_at == datetime(2023, 1, 2, 3, 5, 15)
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
@pytest.mark.asyncio
|
|
150
|
+
@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True)
|
|
151
|
+
@pytest.mark.usefixtures("test_db", "image_config_mock")
|
|
152
|
+
class TestDeletePrometheusMetrics:
|
|
153
|
+
@freeze_time(datetime(2023, 1, 2, 3, 5, 20, tzinfo=timezone.utc))
|
|
154
|
+
async def test_deletes_old_metrics(self, session: AsyncSession):
|
|
155
|
+
user = await create_user(session=session, global_role=GlobalRole.USER)
|
|
156
|
+
project = await create_project(session=session, owner=user)
|
|
157
|
+
await add_project_member(
|
|
158
|
+
session=session, project=project, user=user, project_role=ProjectRole.USER
|
|
159
|
+
)
|
|
160
|
+
repo = await create_repo(session=session, project_id=project.id)
|
|
161
|
+
run_1 = await create_run(
|
|
162
|
+
session=session, project=project, repo=repo, user=user, run_name="run-1"
|
|
163
|
+
)
|
|
164
|
+
job_1 = await create_job(session=session, run=run_1)
|
|
165
|
+
# old metrics
|
|
166
|
+
await create_job_prometheus_metrics(
|
|
167
|
+
session=session,
|
|
168
|
+
job=job_1,
|
|
169
|
+
collected_at=datetime(2023, 1, 2, 2, 3, 30),
|
|
170
|
+
)
|
|
171
|
+
run_2 = await create_run(
|
|
172
|
+
session=session, project=project, repo=repo, user=user, run_name="run-2"
|
|
173
|
+
)
|
|
174
|
+
job_2 = await create_job(session=session, run=run_2)
|
|
175
|
+
# recent metrics
|
|
176
|
+
metrics_2 = await create_job_prometheus_metrics(
|
|
177
|
+
session=session,
|
|
178
|
+
job=job_2,
|
|
179
|
+
collected_at=datetime(2023, 1, 2, 3, 5, 0),
|
|
180
|
+
)
|
|
181
|
+
|
|
182
|
+
await delete_prometheus_metrics()
|
|
183
|
+
|
|
184
|
+
res = await session.execute(
|
|
185
|
+
select(JobPrometheusMetrics).join(JobModel).where(JobModel.project_id == project.id)
|
|
186
|
+
)
|
|
187
|
+
all_metrics = res.scalars().all()
|
|
188
|
+
assert len(all_metrics) == 1
|
|
189
|
+
assert all_metrics[0] == metrics_2
|
|
@@ -4,6 +4,7 @@ from typing import Optional
|
|
|
4
4
|
from unittest.mock import MagicMock, Mock, patch
|
|
5
5
|
|
|
6
6
|
import pytest
|
|
7
|
+
from freezegun import freeze_time
|
|
7
8
|
from sqlalchemy.ext.asyncio import AsyncSession
|
|
8
9
|
|
|
9
10
|
from dstack._internal.core.errors import SSHError
|
|
@@ -11,6 +12,7 @@ from dstack._internal.core.models.backends.base import BackendType
|
|
|
11
12
|
from dstack._internal.core.models.common import NetworkMode
|
|
12
13
|
from dstack._internal.core.models.configurations import DevEnvironmentConfiguration
|
|
13
14
|
from dstack._internal.core.models.instances import InstanceStatus
|
|
15
|
+
from dstack._internal.core.models.profiles import UtilizationPolicy
|
|
14
16
|
from dstack._internal.core.models.runs import (
|
|
15
17
|
JobRuntimeData,
|
|
16
18
|
JobStatus,
|
|
@@ -39,6 +41,7 @@ from dstack._internal.server.services.volumes import (
|
|
|
39
41
|
from dstack._internal.server.testing.common import (
|
|
40
42
|
create_instance,
|
|
41
43
|
create_job,
|
|
44
|
+
create_job_metrics_point,
|
|
42
45
|
create_pool,
|
|
43
46
|
create_project,
|
|
44
47
|
create_repo,
|
|
@@ -688,3 +691,125 @@ class TestProcessRunningJobs:
|
|
|
688
691
|
assert job.status == expected_status
|
|
689
692
|
assert job.termination_reason == expected_termination_reason
|
|
690
693
|
assert job.inactivity_secs == expected_inactivity_secs
|
|
694
|
+
|
|
695
|
+
@pytest.mark.asyncio
|
|
696
|
+
@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True)
|
|
697
|
+
@pytest.mark.parametrize(
|
|
698
|
+
["samples", "expected_status"],
|
|
699
|
+
[
|
|
700
|
+
pytest.param(
|
|
701
|
+
[
|
|
702
|
+
(datetime(2023, 1, 1, 12, 25, 20, tzinfo=timezone.utc), 30),
|
|
703
|
+
(datetime(2023, 1, 1, 12, 25, 30, tzinfo=timezone.utc), 30),
|
|
704
|
+
(datetime(2023, 1, 1, 12, 29, 50, tzinfo=timezone.utc), 40),
|
|
705
|
+
],
|
|
706
|
+
JobStatus.RUNNING,
|
|
707
|
+
id="not-enough-points",
|
|
708
|
+
),
|
|
709
|
+
pytest.param(
|
|
710
|
+
[
|
|
711
|
+
(datetime(2023, 1, 1, 12, 20, 10, tzinfo=timezone.utc), 30),
|
|
712
|
+
(datetime(2023, 1, 1, 12, 20, 20, tzinfo=timezone.utc), 30),
|
|
713
|
+
(datetime(2023, 1, 1, 12, 29, 50, tzinfo=timezone.utc), 80),
|
|
714
|
+
],
|
|
715
|
+
JobStatus.RUNNING,
|
|
716
|
+
id="any-above-min",
|
|
717
|
+
),
|
|
718
|
+
pytest.param(
|
|
719
|
+
[
|
|
720
|
+
(datetime(2023, 1, 1, 12, 10, 10, tzinfo=timezone.utc), 80), # outside window
|
|
721
|
+
(datetime(2023, 1, 1, 12, 10, 20, tzinfo=timezone.utc), 80), # outside window
|
|
722
|
+
(datetime(2023, 1, 1, 12, 20, 10, tzinfo=timezone.utc), 30),
|
|
723
|
+
(datetime(2023, 1, 1, 12, 20, 20, tzinfo=timezone.utc), 30),
|
|
724
|
+
(datetime(2023, 1, 1, 12, 29, 50, tzinfo=timezone.utc), 40),
|
|
725
|
+
],
|
|
726
|
+
JobStatus.TERMINATING,
|
|
727
|
+
id="all-below-min",
|
|
728
|
+
),
|
|
729
|
+
],
|
|
730
|
+
)
|
|
731
|
+
@freeze_time(datetime(2023, 1, 1, 12, 30, tzinfo=timezone.utc))
|
|
732
|
+
async def test_gpu_utilization(
|
|
733
|
+
self,
|
|
734
|
+
test_db,
|
|
735
|
+
session: AsyncSession,
|
|
736
|
+
samples: list[tuple[datetime, int]],
|
|
737
|
+
expected_status: JobStatus,
|
|
738
|
+
) -> None:
|
|
739
|
+
project = await create_project(session=session)
|
|
740
|
+
user = await create_user(session=session)
|
|
741
|
+
repo = await create_repo(
|
|
742
|
+
session=session,
|
|
743
|
+
project_id=project.id,
|
|
744
|
+
)
|
|
745
|
+
run = await create_run(
|
|
746
|
+
session=session,
|
|
747
|
+
project=project,
|
|
748
|
+
repo=repo,
|
|
749
|
+
user=user,
|
|
750
|
+
status=RunStatus.RUNNING,
|
|
751
|
+
run_name="test-run",
|
|
752
|
+
run_spec=get_run_spec(
|
|
753
|
+
run_name="test-run",
|
|
754
|
+
repo_id=repo.name,
|
|
755
|
+
configuration=DevEnvironmentConfiguration(
|
|
756
|
+
name="test-run",
|
|
757
|
+
ide="vscode",
|
|
758
|
+
utilization_policy=UtilizationPolicy(
|
|
759
|
+
min_gpu_utilization=80,
|
|
760
|
+
time_window=600,
|
|
761
|
+
),
|
|
762
|
+
),
|
|
763
|
+
),
|
|
764
|
+
)
|
|
765
|
+
pool = await create_pool(session=session, project=project)
|
|
766
|
+
instance = await create_instance(
|
|
767
|
+
session=session,
|
|
768
|
+
project=project,
|
|
769
|
+
pool=pool,
|
|
770
|
+
status=InstanceStatus.BUSY,
|
|
771
|
+
)
|
|
772
|
+
job = await create_job(
|
|
773
|
+
session=session,
|
|
774
|
+
run=run,
|
|
775
|
+
status=JobStatus.RUNNING,
|
|
776
|
+
job_provisioning_data=get_job_provisioning_data(),
|
|
777
|
+
instance=instance,
|
|
778
|
+
instance_assigned=True,
|
|
779
|
+
)
|
|
780
|
+
for timestamp, gpu_util in samples:
|
|
781
|
+
# two GPUs, the second one always 100% utilized
|
|
782
|
+
await create_job_metrics_point(
|
|
783
|
+
session=session,
|
|
784
|
+
job_model=job,
|
|
785
|
+
timestamp=timestamp,
|
|
786
|
+
gpus_memory_usage_bytes=[1024, 1024],
|
|
787
|
+
gpus_util_percent=[gpu_util, 100],
|
|
788
|
+
)
|
|
789
|
+
with (
|
|
790
|
+
patch("dstack._internal.server.services.runner.ssh.SSHTunnel") as SSHTunnelMock,
|
|
791
|
+
patch(
|
|
792
|
+
"dstack._internal.server.services.runner.client.RunnerClient"
|
|
793
|
+
) as RunnerClientMock,
|
|
794
|
+
):
|
|
795
|
+
runner_client_mock = RunnerClientMock.return_value
|
|
796
|
+
runner_client_mock.pull.return_value = PullResponse(
|
|
797
|
+
job_states=[],
|
|
798
|
+
job_logs=[],
|
|
799
|
+
runner_logs=[],
|
|
800
|
+
last_updated=0,
|
|
801
|
+
no_connections_secs=0,
|
|
802
|
+
)
|
|
803
|
+
await process_running_jobs()
|
|
804
|
+
SSHTunnelMock.assert_called_once()
|
|
805
|
+
runner_client_mock.pull.assert_called_once()
|
|
806
|
+
await session.refresh(job)
|
|
807
|
+
assert job.status == expected_status
|
|
808
|
+
if expected_status == JobStatus.TERMINATING:
|
|
809
|
+
assert job.termination_reason == JobTerminationReason.TERMINATED_BY_SERVER
|
|
810
|
+
assert job.termination_reason_message == (
|
|
811
|
+
"The job GPU utilization below 80% for 600 seconds"
|
|
812
|
+
)
|
|
813
|
+
else:
|
|
814
|
+
assert job.termination_reason is None
|
|
815
|
+
assert job.termination_reason_message is None
|
|
@@ -370,6 +370,7 @@ class TestCreateFleet:
|
|
|
370
370
|
"idle_duration": None,
|
|
371
371
|
"termination_policy": None,
|
|
372
372
|
"termination_idle_time": None,
|
|
373
|
+
"utilization_policy": None,
|
|
373
374
|
"name": "",
|
|
374
375
|
"default": False,
|
|
375
376
|
"reservation": None,
|
|
@@ -495,6 +496,7 @@ class TestCreateFleet:
|
|
|
495
496
|
"idle_duration": None,
|
|
496
497
|
"termination_policy": None,
|
|
497
498
|
"termination_idle_time": None,
|
|
499
|
+
"utilization_policy": None,
|
|
498
500
|
"name": "",
|
|
499
501
|
"default": False,
|
|
500
502
|
"reservation": None,
|
|
@@ -55,11 +55,25 @@ class TestGetJobMetrics:
|
|
|
55
55
|
session=session,
|
|
56
56
|
run=run,
|
|
57
57
|
)
|
|
58
|
+
await create_job_metrics_point(
|
|
59
|
+
session=session,
|
|
60
|
+
job_model=job,
|
|
61
|
+
timestamp=datetime(2023, 1, 2, 3, 4, 5, tzinfo=timezone.utc),
|
|
62
|
+
cpu_usage_micro=2 * 1_000_000,
|
|
63
|
+
memory_usage_bytes=256,
|
|
64
|
+
memory_working_set_bytes=128,
|
|
65
|
+
gpus_memory_usage_bytes=[256],
|
|
66
|
+
gpus_util_percent=[2],
|
|
67
|
+
)
|
|
58
68
|
await create_job_metrics_point(
|
|
59
69
|
session=session,
|
|
60
70
|
job_model=job,
|
|
61
71
|
timestamp=datetime(2023, 1, 2, 3, 4, 15, tzinfo=timezone.utc),
|
|
62
72
|
cpu_usage_micro=4 * 1_000_000,
|
|
73
|
+
memory_usage_bytes=512,
|
|
74
|
+
memory_working_set_bytes=256,
|
|
75
|
+
gpus_memory_usage_bytes=[512],
|
|
76
|
+
gpus_util_percent=[6],
|
|
63
77
|
)
|
|
64
78
|
await create_job_metrics_point(
|
|
65
79
|
session=session,
|
|
@@ -76,6 +90,7 @@ class TestGetJobMetrics:
|
|
|
76
90
|
headers=get_auth_headers(user.token),
|
|
77
91
|
)
|
|
78
92
|
assert response.status_code == 200
|
|
93
|
+
# Returns one last sample by default. Filtering is tested in services/test_metrics.py
|
|
79
94
|
assert response.json() == {
|
|
80
95
|
"metrics": [
|
|
81
96
|
{
|
|
@@ -0,0 +1,244 @@
|
|
|
1
|
+
from textwrap import dedent
|
|
2
|
+
|
|
3
|
+
import pytest
|
|
4
|
+
from httpx import AsyncClient
|
|
5
|
+
from sqlalchemy.ext.asyncio import AsyncSession
|
|
6
|
+
|
|
7
|
+
from dstack._internal.core.models.runs import JobStatus
|
|
8
|
+
from dstack._internal.core.models.users import GlobalRole, ProjectRole
|
|
9
|
+
from dstack._internal.server.models import JobModel, ProjectModel, UserModel
|
|
10
|
+
from dstack._internal.server.services.projects import add_project_member
|
|
11
|
+
from dstack._internal.server.testing.common import (
|
|
12
|
+
create_job,
|
|
13
|
+
create_job_prometheus_metrics,
|
|
14
|
+
create_project,
|
|
15
|
+
create_repo,
|
|
16
|
+
create_run,
|
|
17
|
+
create_user,
|
|
18
|
+
)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
@pytest.fixture
|
|
22
|
+
def enable_metrics(monkeypatch: pytest.MonkeyPatch):
|
|
23
|
+
monkeypatch.setattr("dstack._internal.server.settings.ENABLE_PROMETHEUS_METRICS", True)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
@pytest.mark.asyncio
|
|
27
|
+
@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True)
|
|
28
|
+
@pytest.mark.usefixtures("image_config_mock", "test_db", "enable_metrics")
|
|
29
|
+
class TestGetPrometheusMetrics:
|
|
30
|
+
async def test_returns_metrics(self, session: AsyncSession, client: AsyncClient):
|
|
31
|
+
user = await create_user(session=session, global_role=GlobalRole.USER)
|
|
32
|
+
project_2 = await _create_project(session, "project-2", user)
|
|
33
|
+
job_2_1 = await _create_job(session, "run-1", project_2, user, JobStatus.RUNNING)
|
|
34
|
+
await create_job_prometheus_metrics(
|
|
35
|
+
session=session,
|
|
36
|
+
job=job_2_1,
|
|
37
|
+
text=dedent("""
|
|
38
|
+
# HELP FIELD_1 Test field 1
|
|
39
|
+
# TYPE FIELD_1 gauge
|
|
40
|
+
FIELD_1{gpu="0"} 100
|
|
41
|
+
FIELD_1{gpu="1"} 200
|
|
42
|
+
"""),
|
|
43
|
+
)
|
|
44
|
+
project_1 = await _create_project(session, "project-1", user)
|
|
45
|
+
job_1_1 = await _create_job(session, "run-1", project_1, user, JobStatus.RUNNING)
|
|
46
|
+
await create_job_prometheus_metrics(
|
|
47
|
+
session=session,
|
|
48
|
+
job=job_1_1,
|
|
49
|
+
text=dedent("""
|
|
50
|
+
# Comments should be skipped
|
|
51
|
+
|
|
52
|
+
# HELP FIELD_1 Test field 1
|
|
53
|
+
# TYPE FIELD_1 gauge
|
|
54
|
+
FIELD_1{gpu="0"} 350
|
|
55
|
+
FIELD_1{gpu="1"} 400
|
|
56
|
+
|
|
57
|
+
# HELP FIELD_2 Test field 2
|
|
58
|
+
# TYPE FIELD_2 counter
|
|
59
|
+
FIELD_2{gpu="0"} 337325 1395066363000
|
|
60
|
+
FIELD_2{gpu="1"} 987169 1395066363010
|
|
61
|
+
"""),
|
|
62
|
+
)
|
|
63
|
+
job_1_2 = await _create_job(session, "run-2", project_1, user, JobStatus.RUNNING)
|
|
64
|
+
await create_job_prometheus_metrics(
|
|
65
|
+
session=session,
|
|
66
|
+
job=job_1_2,
|
|
67
|
+
text=dedent("""
|
|
68
|
+
# HELP FIELD_1 Test field 1
|
|
69
|
+
# TYPE FIELD_1 gauge
|
|
70
|
+
FIELD_1{gpu="0"} 1200.0
|
|
71
|
+
FIELD_1{gpu="1"} 1600.0
|
|
72
|
+
FIELD_1{gpu="2"} 2400.0
|
|
73
|
+
"""),
|
|
74
|
+
)
|
|
75
|
+
# Terminated job, should not appear in the response
|
|
76
|
+
job_1_3 = await _create_job(session, "run-3", project_1, user, JobStatus.TERMINATED)
|
|
77
|
+
await create_job_prometheus_metrics(
|
|
78
|
+
session=session,
|
|
79
|
+
job=job_1_3,
|
|
80
|
+
text=dedent("""
|
|
81
|
+
# HELP FIELD_1 Test field 1
|
|
82
|
+
# TYPE FIELD_1 gauge
|
|
83
|
+
FIELD_1{gpu="0"} 10
|
|
84
|
+
FIELD_1{gpu="1"} 20
|
|
85
|
+
"""),
|
|
86
|
+
)
|
|
87
|
+
|
|
88
|
+
response = await client.get("/metrics")
|
|
89
|
+
|
|
90
|
+
assert response.status_code == 200
|
|
91
|
+
assert response.text == dedent("""\
|
|
92
|
+
# HELP FIELD_1 Test field 1
|
|
93
|
+
# TYPE FIELD_1 gauge
|
|
94
|
+
FIELD_1{gpu="0",dstack_project_name="project-1",dstack_run_name="run-1",dstack_job_name="run-1-0-0",dstack_job_num="0",dstack_replica_num="0"} 350.0
|
|
95
|
+
FIELD_1{gpu="1",dstack_project_name="project-1",dstack_run_name="run-1",dstack_job_name="run-1-0-0",dstack_job_num="0",dstack_replica_num="0"} 400.0
|
|
96
|
+
FIELD_1{gpu="0",dstack_project_name="project-1",dstack_run_name="run-2",dstack_job_name="run-2-0-0",dstack_job_num="0",dstack_replica_num="0"} 1200.0
|
|
97
|
+
FIELD_1{gpu="1",dstack_project_name="project-1",dstack_run_name="run-2",dstack_job_name="run-2-0-0",dstack_job_num="0",dstack_replica_num="0"} 1600.0
|
|
98
|
+
FIELD_1{gpu="2",dstack_project_name="project-1",dstack_run_name="run-2",dstack_job_name="run-2-0-0",dstack_job_num="0",dstack_replica_num="0"} 2400.0
|
|
99
|
+
FIELD_1{gpu="0",dstack_project_name="project-2",dstack_run_name="run-1",dstack_job_name="run-1-0-0",dstack_job_num="0",dstack_replica_num="0"} 100.0
|
|
100
|
+
FIELD_1{gpu="1",dstack_project_name="project-2",dstack_run_name="run-1",dstack_job_name="run-1-0-0",dstack_job_num="0",dstack_replica_num="0"} 200.0
|
|
101
|
+
# HELP FIELD_2 Test field 2
|
|
102
|
+
# TYPE FIELD_2 counter
|
|
103
|
+
FIELD_2{gpu="0",dstack_project_name="project-1",dstack_run_name="run-1",dstack_job_name="run-1-0-0",dstack_job_num="0",dstack_replica_num="0"} 337325.0 1395066363000
|
|
104
|
+
FIELD_2{gpu="1",dstack_project_name="project-1",dstack_run_name="run-1",dstack_job_name="run-1-0-0",dstack_job_num="0",dstack_replica_num="0"} 987169.0 1395066363010
|
|
105
|
+
""")
|
|
106
|
+
|
|
107
|
+
async def test_returns_empty_response_if_no_runs(self, client: AsyncClient):
|
|
108
|
+
response = await client.get("/metrics")
|
|
109
|
+
assert response.status_code == 200
|
|
110
|
+
assert response.text == ""
|
|
111
|
+
|
|
112
|
+
async def test_returns_404_if_not_enabled(
|
|
113
|
+
self, monkeypatch: pytest.MonkeyPatch, client: AsyncClient
|
|
114
|
+
):
|
|
115
|
+
monkeypatch.setattr("dstack._internal.server.settings.ENABLE_PROMETHEUS_METRICS", False)
|
|
116
|
+
response = await client.get("/metrics")
|
|
117
|
+
assert response.status_code == 404
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
@pytest.mark.asyncio
|
|
121
|
+
@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True)
|
|
122
|
+
@pytest.mark.usefixtures("image_config_mock", "test_db", "enable_metrics")
|
|
123
|
+
class TestGetPrometheusProjectMetrics:
|
|
124
|
+
async def test_returns_metrics(self, session: AsyncSession, client: AsyncClient):
|
|
125
|
+
user = await create_user(session=session, global_role=GlobalRole.USER)
|
|
126
|
+
project = await _create_project(session, "project-1", user)
|
|
127
|
+
job_1 = await _create_job(session, "run-1", project, user, JobStatus.RUNNING)
|
|
128
|
+
await create_job_prometheus_metrics(
|
|
129
|
+
session=session,
|
|
130
|
+
job=job_1,
|
|
131
|
+
text=dedent("""
|
|
132
|
+
# Comments should be skipped
|
|
133
|
+
|
|
134
|
+
# HELP FIELD_1 Test field 1
|
|
135
|
+
# TYPE FIELD_1 gauge
|
|
136
|
+
FIELD_1{gpu="0"} 350
|
|
137
|
+
FIELD_1{gpu="1"} 400
|
|
138
|
+
|
|
139
|
+
# HELP FIELD_2 Test field 2
|
|
140
|
+
# TYPE FIELD_2 counter
|
|
141
|
+
FIELD_2{gpu="0"} 337325 1395066363000
|
|
142
|
+
FIELD_2{gpu="1"} 987169 1395066363010
|
|
143
|
+
"""),
|
|
144
|
+
)
|
|
145
|
+
job_2 = await _create_job(session, "run-2", project, user, JobStatus.RUNNING)
|
|
146
|
+
await create_job_prometheus_metrics(
|
|
147
|
+
session=session,
|
|
148
|
+
job=job_2,
|
|
149
|
+
text=dedent("""
|
|
150
|
+
# HELP FIELD_1 Test field 1
|
|
151
|
+
# TYPE FIELD_1 gauge
|
|
152
|
+
FIELD_1{gpu="0"} 1200.0
|
|
153
|
+
FIELD_1{gpu="1"} 1600.0
|
|
154
|
+
FIELD_1{gpu="2"} 2400.0
|
|
155
|
+
"""),
|
|
156
|
+
)
|
|
157
|
+
# Terminated job, should not appear in the response
|
|
158
|
+
job_3 = await _create_job(session, "run-3", project, user, JobStatus.TERMINATED)
|
|
159
|
+
await create_job_prometheus_metrics(
|
|
160
|
+
session=session,
|
|
161
|
+
job=job_3,
|
|
162
|
+
text=dedent("""
|
|
163
|
+
# HELP FIELD_1 Test field 1
|
|
164
|
+
# TYPE FIELD_1 gauge
|
|
165
|
+
FIELD_1{gpu="0"} 10
|
|
166
|
+
FIELD_1{gpu="1"} 20
|
|
167
|
+
"""),
|
|
168
|
+
)
|
|
169
|
+
another_project = await _create_project(session, "project-2", user)
|
|
170
|
+
another_project_job = await _create_job(
|
|
171
|
+
session, "run-4", another_project, user, JobStatus.RUNNING
|
|
172
|
+
)
|
|
173
|
+
await create_job_prometheus_metrics(
|
|
174
|
+
session=session,
|
|
175
|
+
job=another_project_job,
|
|
176
|
+
text=dedent("""
|
|
177
|
+
# HELP FIELD_1 Test field 1
|
|
178
|
+
# TYPE FIELD_1 gauge
|
|
179
|
+
FIELD_1{gpu="0"} 100
|
|
180
|
+
FIELD_1{gpu="1"} 200
|
|
181
|
+
"""),
|
|
182
|
+
)
|
|
183
|
+
|
|
184
|
+
response = await client.get("/metrics/project/project-1")
|
|
185
|
+
|
|
186
|
+
assert response.status_code == 200
|
|
187
|
+
assert response.text == dedent("""\
|
|
188
|
+
# HELP FIELD_1 Test field 1
|
|
189
|
+
# TYPE FIELD_1 gauge
|
|
190
|
+
FIELD_1{gpu="0",dstack_project_name="project-1",dstack_run_name="run-1",dstack_job_name="run-1-0-0",dstack_job_num="0",dstack_replica_num="0"} 350.0
|
|
191
|
+
FIELD_1{gpu="1",dstack_project_name="project-1",dstack_run_name="run-1",dstack_job_name="run-1-0-0",dstack_job_num="0",dstack_replica_num="0"} 400.0
|
|
192
|
+
FIELD_1{gpu="0",dstack_project_name="project-1",dstack_run_name="run-2",dstack_job_name="run-2-0-0",dstack_job_num="0",dstack_replica_num="0"} 1200.0
|
|
193
|
+
FIELD_1{gpu="1",dstack_project_name="project-1",dstack_run_name="run-2",dstack_job_name="run-2-0-0",dstack_job_num="0",dstack_replica_num="0"} 1600.0
|
|
194
|
+
FIELD_1{gpu="2",dstack_project_name="project-1",dstack_run_name="run-2",dstack_job_name="run-2-0-0",dstack_job_num="0",dstack_replica_num="0"} 2400.0
|
|
195
|
+
# HELP FIELD_2 Test field 2
|
|
196
|
+
# TYPE FIELD_2 counter
|
|
197
|
+
FIELD_2{gpu="0",dstack_project_name="project-1",dstack_run_name="run-1",dstack_job_name="run-1-0-0",dstack_job_num="0",dstack_replica_num="0"} 337325.0 1395066363000
|
|
198
|
+
FIELD_2{gpu="1",dstack_project_name="project-1",dstack_run_name="run-1",dstack_job_name="run-1-0-0",dstack_job_num="0",dstack_replica_num="0"} 987169.0 1395066363010
|
|
199
|
+
""")
|
|
200
|
+
|
|
201
|
+
async def test_returns_empty_response_if_no_runs(
|
|
202
|
+
self, session: AsyncSession, client: AsyncClient
|
|
203
|
+
):
|
|
204
|
+
user = await create_user(session=session, global_role=GlobalRole.USER)
|
|
205
|
+
await create_project(session=session, owner=user, name="test-project")
|
|
206
|
+
response = await client.get("/metrics/project/test-project")
|
|
207
|
+
assert response.status_code == 200
|
|
208
|
+
assert response.text == ""
|
|
209
|
+
|
|
210
|
+
async def test_returns_404_if_project_doesnt_exist(self, client: AsyncClient):
|
|
211
|
+
response = await client.get("/metrics/project/nonexistent")
|
|
212
|
+
assert response.status_code == 404
|
|
213
|
+
|
|
214
|
+
async def test_returns_404_if_not_enabled(
|
|
215
|
+
self, monkeypatch: pytest.MonkeyPatch, session: AsyncSession, client: AsyncClient
|
|
216
|
+
):
|
|
217
|
+
monkeypatch.setattr("dstack._internal.server.settings.ENABLE_PROMETHEUS_METRICS", False)
|
|
218
|
+
user = await create_user(session=session, global_role=GlobalRole.USER)
|
|
219
|
+
await create_project(session=session, owner=user, name="test-project")
|
|
220
|
+
response = await client.get("/metrics/project/test-project")
|
|
221
|
+
assert response.status_code == 404
|
|
222
|
+
|
|
223
|
+
|
|
224
|
+
async def _create_project(session: AsyncSession, name: str, user: UserModel) -> ProjectModel:
|
|
225
|
+
project = await create_project(session=session, owner=user, name=name)
|
|
226
|
+
await add_project_member(
|
|
227
|
+
session=session, project=project, user=user, project_role=ProjectRole.USER
|
|
228
|
+
)
|
|
229
|
+
return project
|
|
230
|
+
|
|
231
|
+
|
|
232
|
+
async def _create_job(
|
|
233
|
+
session: AsyncSession, run_name: str, project: ProjectModel, user: UserModel, status: JobStatus
|
|
234
|
+
) -> JobModel:
|
|
235
|
+
repo = await create_repo(session=session, project_id=project.id, repo_name=f"{run_name}-repo")
|
|
236
|
+
run = await create_run(
|
|
237
|
+
session=session,
|
|
238
|
+
project=project,
|
|
239
|
+
repo=repo,
|
|
240
|
+
user=user,
|
|
241
|
+
run_name=run_name,
|
|
242
|
+
)
|
|
243
|
+
job = await create_job(session=session, run=run, status=status)
|
|
244
|
+
return job
|