dstack 0.19.0rc1__py3-none-any.whl → 0.19.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dstack/_internal/cli/commands/metrics.py +138 -0
- dstack/_internal/cli/commands/stats.py +5 -119
- dstack/_internal/cli/main.py +2 -0
- dstack/_internal/core/backends/base/compute.py +3 -0
- dstack/_internal/core/backends/base/models.py +7 -7
- dstack/_internal/core/backends/configurators.py +9 -0
- dstack/_internal/core/backends/models.py +8 -0
- dstack/_internal/core/backends/nebius/__init__.py +0 -0
- dstack/_internal/core/backends/nebius/backend.py +16 -0
- dstack/_internal/core/backends/nebius/compute.py +270 -0
- dstack/_internal/core/backends/nebius/configurator.py +74 -0
- dstack/_internal/core/backends/nebius/models.py +108 -0
- dstack/_internal/core/backends/nebius/resources.py +222 -0
- dstack/_internal/core/errors.py +14 -0
- dstack/_internal/core/models/backends/base.py +2 -0
- dstack/_internal/proxy/lib/schemas/model_proxy.py +3 -3
- dstack/_internal/server/background/tasks/process_instances.py +26 -12
- dstack/_internal/server/routers/prometheus.py +5 -12
- dstack/_internal/server/security/permissions.py +19 -1
- dstack/_internal/server/services/jobs/configurators/extensions/cursor.py +1 -1
- dstack/_internal/server/services/prometheus.py +175 -112
- dstack/_internal/server/statics/index.html +1 -1
- dstack/_internal/server/statics/{main-4fd5a4770eff59325ee3.js → main-bcb3228138bc8483cc0b.js} +7278 -131
- dstack/_internal/server/statics/{main-4fd5a4770eff59325ee3.js.map → main-bcb3228138bc8483cc0b.js.map} +1 -1
- dstack/_internal/server/statics/{main-da9f8c06a69c20dac23e.css → main-c0bdaac8f1ea67d499eb.css} +1 -1
- dstack/_internal/utils/event_loop.py +30 -0
- dstack/version.py +1 -1
- {dstack-0.19.0rc1.dist-info → dstack-0.19.2.dist-info}/METADATA +27 -11
- {dstack-0.19.0rc1.dist-info → dstack-0.19.2.dist-info}/RECORD +37 -28
- tests/_internal/server/background/tasks/test_process_instances.py +68 -2
- tests/_internal/server/routers/test_backends.py +116 -0
- tests/_internal/server/routers/test_prometheus.py +158 -120
- tests/_internal/utils/test_event_loop.py +18 -0
- {dstack-0.19.0rc1.dist-info → dstack-0.19.2.dist-info}/LICENSE.md +0 -0
- {dstack-0.19.0rc1.dist-info → dstack-0.19.2.dist-info}/WHEEL +0 -0
- {dstack-0.19.0rc1.dist-info → dstack-0.19.2.dist-info}/entry_points.txt +0 -0
- {dstack-0.19.0rc1.dist-info → dstack-0.19.2.dist-info}/top_level.txt +0 -0
|
@@ -9,19 +9,26 @@ from sqlalchemy.ext.asyncio import AsyncSession
|
|
|
9
9
|
|
|
10
10
|
from dstack._internal.core.models.backends.base import BackendType
|
|
11
11
|
from dstack._internal.core.models.configurations import DevEnvironmentConfiguration
|
|
12
|
-
from dstack._internal.core.models.runs import
|
|
12
|
+
from dstack._internal.core.models.runs import (
|
|
13
|
+
JobProvisioningData,
|
|
14
|
+
JobRuntimeData,
|
|
15
|
+
JobStatus,
|
|
16
|
+
RunStatus,
|
|
17
|
+
)
|
|
13
18
|
from dstack._internal.core.models.users import GlobalRole, ProjectRole
|
|
14
|
-
from dstack._internal.server.models import JobModel, ProjectModel, UserModel
|
|
19
|
+
from dstack._internal.server.models import JobModel, ProjectModel, RunModel, UserModel
|
|
15
20
|
from dstack._internal.server.services.projects import add_project_member
|
|
16
21
|
from dstack._internal.server.testing.common import (
|
|
17
22
|
create_fleet,
|
|
18
23
|
create_instance,
|
|
19
24
|
create_job,
|
|
25
|
+
create_job_metrics_point,
|
|
20
26
|
create_job_prometheus_metrics,
|
|
21
27
|
create_project,
|
|
22
28
|
create_repo,
|
|
23
29
|
create_run,
|
|
24
30
|
create_user,
|
|
31
|
+
get_auth_headers,
|
|
25
32
|
get_instance_offer_with_availability,
|
|
26
33
|
get_job_provisioning_data,
|
|
27
34
|
get_job_runtime_data,
|
|
@@ -32,6 +39,7 @@ from dstack._internal.server.testing.common import (
|
|
|
32
39
|
@pytest.fixture
|
|
33
40
|
def enable_metrics(monkeypatch: pytest.MonkeyPatch):
|
|
34
41
|
monkeypatch.setattr("dstack._internal.server.settings.ENABLE_PROMETHEUS_METRICS", True)
|
|
42
|
+
monkeypatch.setattr("dstack._internal.server.routers.prometheus._auth._token", None)
|
|
35
43
|
|
|
36
44
|
|
|
37
45
|
FAKE_NOW = datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc)
|
|
@@ -45,11 +53,21 @@ class TestGetPrometheusMetrics:
|
|
|
45
53
|
async def test_returns_metrics(self, session: AsyncSession, client: AsyncClient):
|
|
46
54
|
user = await create_user(session=session, name="test-user", global_role=GlobalRole.USER)
|
|
47
55
|
offer = get_instance_offer_with_availability(
|
|
48
|
-
instance_type="test-type",
|
|
56
|
+
instance_type="test-type",
|
|
57
|
+
cpu_count=32,
|
|
58
|
+
memory_gib=128,
|
|
59
|
+
gpu_count=2,
|
|
60
|
+
gpu_name="V4",
|
|
61
|
+
price=12,
|
|
49
62
|
)
|
|
50
63
|
project_2 = await _create_project(session, "project-2", user)
|
|
51
64
|
jpd_2_1 = get_job_provisioning_data(
|
|
52
|
-
backend=BackendType.AWS,
|
|
65
|
+
backend=BackendType.AWS,
|
|
66
|
+
cpu_count=16,
|
|
67
|
+
memory_gib=64,
|
|
68
|
+
gpu_name="T4",
|
|
69
|
+
gpu_count=2,
|
|
70
|
+
price=16,
|
|
53
71
|
)
|
|
54
72
|
job_2_1 = await _create_job(
|
|
55
73
|
session=session,
|
|
@@ -100,7 +118,41 @@ class TestGetPrometheusMetrics:
|
|
|
100
118
|
FIELD_2{gpu="1"} 987169 1395066363010
|
|
101
119
|
"""),
|
|
102
120
|
)
|
|
103
|
-
|
|
121
|
+
await create_job_metrics_point(
|
|
122
|
+
session=session,
|
|
123
|
+
job_model=job_1_1,
|
|
124
|
+
timestamp=FAKE_NOW - timedelta(seconds=30),
|
|
125
|
+
cpu_usage_micro=3_500_000,
|
|
126
|
+
memory_working_set_bytes=3_221_225_472,
|
|
127
|
+
memory_usage_bytes=4_294_967_296,
|
|
128
|
+
)
|
|
129
|
+
# Older, ignored
|
|
130
|
+
await create_job_metrics_point(
|
|
131
|
+
session=session,
|
|
132
|
+
job_model=job_1_1,
|
|
133
|
+
timestamp=FAKE_NOW - timedelta(seconds=60),
|
|
134
|
+
cpu_usage_micro=2_000_000,
|
|
135
|
+
memory_working_set_bytes=1_073_741_824,
|
|
136
|
+
memory_usage_bytes=2_147_483_648,
|
|
137
|
+
)
|
|
138
|
+
jpd_1_2 = get_job_provisioning_data(
|
|
139
|
+
backend=BackendType.AWS,
|
|
140
|
+
cpu_count=24,
|
|
141
|
+
memory_gib=224,
|
|
142
|
+
gpu_count=3,
|
|
143
|
+
gpu_name="L4",
|
|
144
|
+
price=12.5,
|
|
145
|
+
)
|
|
146
|
+
job_1_2 = await _create_job(
|
|
147
|
+
session=session,
|
|
148
|
+
run_name="run-2",
|
|
149
|
+
project=project_1,
|
|
150
|
+
user=user,
|
|
151
|
+
status=JobStatus.RUNNING,
|
|
152
|
+
job_provisioning_data=jpd_1_2,
|
|
153
|
+
submitted_at=FAKE_NOW - timedelta(seconds=150),
|
|
154
|
+
)
|
|
155
|
+
|
|
104
156
|
await create_job_prometheus_metrics(
|
|
105
157
|
session=session,
|
|
106
158
|
job=job_1_2,
|
|
@@ -124,6 +176,15 @@ class TestGetPrometheusMetrics:
|
|
|
124
176
|
FIELD_1{gpu="1"} 20
|
|
125
177
|
"""),
|
|
126
178
|
)
|
|
179
|
+
await _create_run(session, "done", project_1, user, RunStatus.DONE)
|
|
180
|
+
other_user = await create_user(
|
|
181
|
+
session=session, name="other-user", global_role=GlobalRole.USER
|
|
182
|
+
)
|
|
183
|
+
await add_project_member(
|
|
184
|
+
session=session, project=project_2, user=other_user, project_role=ProjectRole.USER
|
|
185
|
+
)
|
|
186
|
+
await _create_run(session, "failed-1", project_2, other_user, RunStatus.FAILED)
|
|
187
|
+
await _create_run(session, "failed-2", project_2, other_user, RunStatus.FAILED)
|
|
127
188
|
fleet = await create_fleet(session=session, project=project_1, name="test-fleet")
|
|
128
189
|
instance = await create_instance(
|
|
129
190
|
session=session,
|
|
@@ -149,31 +210,73 @@ class TestGetPrometheusMetrics:
|
|
|
149
210
|
# HELP dstack_instance_gpu_count Instance GPU count
|
|
150
211
|
# TYPE dstack_instance_gpu_count gauge
|
|
151
212
|
dstack_instance_gpu_count{{dstack_project_name="project-1",dstack_fleet_name="test-fleet",dstack_fleet_id="{fleet.id}",dstack_instance_name="test-instance",dstack_instance_id="{instance.id}",dstack_instance_type="test-type",dstack_backend="aws",dstack_gpu="V4"}} 2.0
|
|
213
|
+
# HELP dstack_run_count_total Total runs count
|
|
214
|
+
# TYPE dstack_run_count_total counter
|
|
215
|
+
dstack_run_count_total{{dstack_project_name="project-1",dstack_user_name="test-user"}} 4.0
|
|
216
|
+
dstack_run_count_total{{dstack_project_name="project-2",dstack_user_name="other-user"}} 2.0
|
|
217
|
+
dstack_run_count_total{{dstack_project_name="project-2",dstack_user_name="test-user"}} 1.0
|
|
218
|
+
# HELP dstack_run_count_terminated_total Terminated runs count
|
|
219
|
+
# TYPE dstack_run_count_terminated_total counter
|
|
220
|
+
dstack_run_count_terminated_total{{dstack_project_name="project-1",dstack_user_name="test-user"}} 0.0
|
|
221
|
+
dstack_run_count_terminated_total{{dstack_project_name="project-2",dstack_user_name="other-user"}} 0.0
|
|
222
|
+
dstack_run_count_terminated_total{{dstack_project_name="project-2",dstack_user_name="test-user"}} 0.0
|
|
223
|
+
# HELP dstack_run_count_failed_total Failed runs count
|
|
224
|
+
# TYPE dstack_run_count_failed_total counter
|
|
225
|
+
dstack_run_count_failed_total{{dstack_project_name="project-1",dstack_user_name="test-user"}} 0.0
|
|
226
|
+
dstack_run_count_failed_total{{dstack_project_name="project-2",dstack_user_name="other-user"}} 2.0
|
|
227
|
+
dstack_run_count_failed_total{{dstack_project_name="project-2",dstack_user_name="test-user"}} 0.0
|
|
228
|
+
# HELP dstack_run_count_done_total Done runs count
|
|
229
|
+
# TYPE dstack_run_count_done_total counter
|
|
230
|
+
dstack_run_count_done_total{{dstack_project_name="project-1",dstack_user_name="test-user"}} 1.0
|
|
231
|
+
dstack_run_count_done_total{{dstack_project_name="project-2",dstack_user_name="other-user"}} 0.0
|
|
232
|
+
dstack_run_count_done_total{{dstack_project_name="project-2",dstack_user_name="test-user"}} 0.0
|
|
152
233
|
# HELP dstack_job_duration_seconds_total Total seconds the job is running
|
|
153
234
|
# TYPE dstack_job_duration_seconds_total counter
|
|
154
235
|
dstack_job_duration_seconds_total{{dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="V4"}} 120.0
|
|
236
|
+
dstack_job_duration_seconds_total{{dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-2",dstack_run_id="{job_1_2.run_id}",dstack_job_name="run-2-0-0",dstack_job_id="{job_1_2.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="L4"}} 150.0
|
|
155
237
|
dstack_job_duration_seconds_total{{dstack_project_name="project-2",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_2_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_2_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="T4"}} 100.0
|
|
156
238
|
# HELP dstack_job_price_dollars_per_hour Job instance price, USD/hour
|
|
157
239
|
# TYPE dstack_job_price_dollars_per_hour gauge
|
|
158
240
|
dstack_job_price_dollars_per_hour{{dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="V4"}} 12.0
|
|
241
|
+
dstack_job_price_dollars_per_hour{{dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-2",dstack_run_id="{job_1_2.run_id}",dstack_job_name="run-2-0-0",dstack_job_id="{job_1_2.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="L4"}} 12.5
|
|
159
242
|
dstack_job_price_dollars_per_hour{{dstack_project_name="project-2",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_2_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_2_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="T4"}} 16.0
|
|
160
243
|
# HELP dstack_job_gpu_count Job GPU count
|
|
161
244
|
# TYPE dstack_job_gpu_count gauge
|
|
162
245
|
dstack_job_gpu_count{{dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="V4"}} 2.0
|
|
246
|
+
dstack_job_gpu_count{{dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-2",dstack_run_id="{job_1_2.run_id}",dstack_job_name="run-2-0-0",dstack_job_id="{job_1_2.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="L4"}} 3.0
|
|
163
247
|
dstack_job_gpu_count{{dstack_project_name="project-2",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_2_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_2_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="T4"}} 2.0
|
|
248
|
+
# HELP dstack_job_cpu_count Job CPU count
|
|
249
|
+
# TYPE dstack_job_cpu_count gauge
|
|
250
|
+
dstack_job_cpu_count{{dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="V4"}} 32.0
|
|
251
|
+
dstack_job_cpu_count{{dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-2",dstack_run_id="{job_1_2.run_id}",dstack_job_name="run-2-0-0",dstack_job_id="{job_1_2.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="L4"}} 24.0
|
|
252
|
+
dstack_job_cpu_count{{dstack_project_name="project-2",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_2_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_2_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="T4"}} 16.0
|
|
253
|
+
# HELP dstack_job_cpu_time_seconds_total Total CPU time consumed by the job, seconds
|
|
254
|
+
# TYPE dstack_job_cpu_time_seconds_total counter
|
|
255
|
+
dstack_job_cpu_time_seconds_total{{dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="V4"}} 3.5
|
|
256
|
+
# HELP dstack_job_memory_total_bytes Total memory allocated for the job, bytes
|
|
257
|
+
# TYPE dstack_job_memory_total_bytes gauge
|
|
258
|
+
dstack_job_memory_total_bytes{{dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="V4"}} 137438953472.0
|
|
259
|
+
dstack_job_memory_total_bytes{{dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-2",dstack_run_id="{job_1_2.run_id}",dstack_job_name="run-2-0-0",dstack_job_id="{job_1_2.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="L4"}} 240518168576.0
|
|
260
|
+
dstack_job_memory_total_bytes{{dstack_project_name="project-2",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_2_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_2_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="T4"}} 68719476736.0
|
|
261
|
+
# HELP dstack_job_memory_usage_bytes Memory used by the job (including cache), bytes
|
|
262
|
+
# TYPE dstack_job_memory_usage_bytes gauge
|
|
263
|
+
dstack_job_memory_usage_bytes{{dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="V4"}} 4294967296.0
|
|
264
|
+
# HELP dstack_job_memory_working_set_bytes Memory used by the job (not including cache), bytes
|
|
265
|
+
# TYPE dstack_job_memory_working_set_bytes gauge
|
|
266
|
+
dstack_job_memory_working_set_bytes{{dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="V4"}} 3221225472.0
|
|
164
267
|
# HELP FIELD_1 Test field 1
|
|
165
268
|
# TYPE FIELD_1 gauge
|
|
166
|
-
FIELD_1{{gpu="0",dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0"}} 350.0
|
|
167
|
-
FIELD_1{{gpu="1",dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0"}} 400.0
|
|
168
|
-
FIELD_1{{gpu="0",dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-2",dstack_run_id="{job_1_2.run_id}",dstack_job_name="run-2-0-0",dstack_job_id="{job_1_2.id}",dstack_job_num="0",dstack_replica_num="0"}} 1200.0
|
|
169
|
-
FIELD_1{{gpu="1",dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-2",dstack_run_id="{job_1_2.run_id}",dstack_job_name="run-2-0-0",dstack_job_id="{job_1_2.id}",dstack_job_num="0",dstack_replica_num="0"}} 1600.0
|
|
170
|
-
FIELD_1{{gpu="2",dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-2",dstack_run_id="{job_1_2.run_id}",dstack_job_name="run-2-0-0",dstack_job_id="{job_1_2.id}",dstack_job_num="0",dstack_replica_num="0"}} 2400.0
|
|
171
|
-
FIELD_1{{gpu="0",dstack_project_name="project-2",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_2_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_2_1.id}",dstack_job_num="0",dstack_replica_num="0"}} 100.0
|
|
172
|
-
FIELD_1{{gpu="1",dstack_project_name="project-2",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_2_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_2_1.id}",dstack_job_num="0",dstack_replica_num="0"}} 200.0
|
|
269
|
+
FIELD_1{{gpu="0",dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="V4"}} 350.0
|
|
270
|
+
FIELD_1{{gpu="1",dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="V4"}} 400.0
|
|
271
|
+
FIELD_1{{gpu="0",dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-2",dstack_run_id="{job_1_2.run_id}",dstack_job_name="run-2-0-0",dstack_job_id="{job_1_2.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="L4"}} 1200.0
|
|
272
|
+
FIELD_1{{gpu="1",dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-2",dstack_run_id="{job_1_2.run_id}",dstack_job_name="run-2-0-0",dstack_job_id="{job_1_2.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="L4"}} 1600.0
|
|
273
|
+
FIELD_1{{gpu="2",dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-2",dstack_run_id="{job_1_2.run_id}",dstack_job_name="run-2-0-0",dstack_job_id="{job_1_2.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="L4"}} 2400.0
|
|
274
|
+
FIELD_1{{gpu="0",dstack_project_name="project-2",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_2_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_2_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="T4"}} 100.0
|
|
275
|
+
FIELD_1{{gpu="1",dstack_project_name="project-2",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_2_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_2_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="T4"}} 200.0
|
|
173
276
|
# HELP FIELD_2 Test field 2
|
|
174
277
|
# TYPE FIELD_2 counter
|
|
175
|
-
FIELD_2{{gpu="0",dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0"}} 337325.0 1395066363000
|
|
176
|
-
FIELD_2{{gpu="1",dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0"}} 987169.0 1395066363010
|
|
278
|
+
FIELD_2{{gpu="0",dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="V4"}} 337325.0 1395066363000
|
|
279
|
+
FIELD_2{{gpu="1",dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="V4"}} 987169.0 1395066363010
|
|
177
280
|
""")
|
|
178
281
|
|
|
179
282
|
async def test_returns_empty_response_if_no_runs(self, client: AsyncClient):
|
|
@@ -188,109 +291,24 @@ class TestGetPrometheusMetrics:
|
|
|
188
291
|
response = await client.get("/metrics")
|
|
189
292
|
assert response.status_code == 404
|
|
190
293
|
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
@pytest.mark.usefixtures("image_config_mock", "test_db", "enable_metrics")
|
|
195
|
-
class TestGetPrometheusProjectMetrics:
|
|
196
|
-
async def test_returns_metrics(self, session: AsyncSession, client: AsyncClient):
|
|
197
|
-
user = await create_user(session=session, name="test-user", global_role=GlobalRole.USER)
|
|
198
|
-
project = await _create_project(session, "project-1", user)
|
|
199
|
-
job_1 = await _create_job(session, "run-1", project, user, JobStatus.RUNNING)
|
|
200
|
-
await create_job_prometheus_metrics(
|
|
201
|
-
session=session,
|
|
202
|
-
job=job_1,
|
|
203
|
-
text=dedent("""
|
|
204
|
-
# Comments should be skipped
|
|
205
|
-
|
|
206
|
-
# HELP FIELD_1 Test field 1
|
|
207
|
-
# TYPE FIELD_1 gauge
|
|
208
|
-
FIELD_1{gpu="0"} 350
|
|
209
|
-
FIELD_1{gpu="1"} 400
|
|
210
|
-
|
|
211
|
-
# HELP FIELD_2 Test field 2
|
|
212
|
-
# TYPE FIELD_2 counter
|
|
213
|
-
FIELD_2{gpu="0"} 337325 1395066363000
|
|
214
|
-
FIELD_2{gpu="1"} 987169 1395066363010
|
|
215
|
-
"""),
|
|
216
|
-
)
|
|
217
|
-
job_2 = await _create_job(session, "run-2", project, user, JobStatus.RUNNING)
|
|
218
|
-
await create_job_prometheus_metrics(
|
|
219
|
-
session=session,
|
|
220
|
-
job=job_2,
|
|
221
|
-
text=dedent("""
|
|
222
|
-
# HELP FIELD_1 Test field 1
|
|
223
|
-
# TYPE FIELD_1 gauge
|
|
224
|
-
FIELD_1{gpu="0"} 1200.0
|
|
225
|
-
FIELD_1{gpu="1"} 1600.0
|
|
226
|
-
FIELD_1{gpu="2"} 2400.0
|
|
227
|
-
"""),
|
|
228
|
-
)
|
|
229
|
-
# Terminated job, should not appear in the response
|
|
230
|
-
job_3 = await _create_job(session, "run-3", project, user, JobStatus.TERMINATED)
|
|
231
|
-
await create_job_prometheus_metrics(
|
|
232
|
-
session=session,
|
|
233
|
-
job=job_3,
|
|
234
|
-
text=dedent("""
|
|
235
|
-
# HELP FIELD_1 Test field 1
|
|
236
|
-
# TYPE FIELD_1 gauge
|
|
237
|
-
FIELD_1{gpu="0"} 10
|
|
238
|
-
FIELD_1{gpu="1"} 20
|
|
239
|
-
"""),
|
|
240
|
-
)
|
|
241
|
-
another_project = await _create_project(session, "project-2", user)
|
|
242
|
-
another_project_job = await _create_job(
|
|
243
|
-
session, "run-4", another_project, user, JobStatus.RUNNING
|
|
244
|
-
)
|
|
245
|
-
await create_job_prometheus_metrics(
|
|
246
|
-
session=session,
|
|
247
|
-
job=another_project_job,
|
|
248
|
-
text=dedent("""
|
|
249
|
-
# HELP FIELD_1 Test field 1
|
|
250
|
-
# TYPE FIELD_1 gauge
|
|
251
|
-
FIELD_1{gpu="0"} 100
|
|
252
|
-
FIELD_1{gpu="1"} 200
|
|
253
|
-
"""),
|
|
254
|
-
)
|
|
255
|
-
|
|
256
|
-
response = await client.get("/metrics/project/project-1")
|
|
257
|
-
|
|
258
|
-
assert response.status_code == 200
|
|
259
|
-
assert response.text == dedent(f"""\
|
|
260
|
-
# HELP FIELD_1 Test field 1
|
|
261
|
-
# TYPE FIELD_1 gauge
|
|
262
|
-
FIELD_1{{gpu="0",dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1.id}",dstack_job_num="0",dstack_replica_num="0"}} 350.0
|
|
263
|
-
FIELD_1{{gpu="1",dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1.id}",dstack_job_num="0",dstack_replica_num="0"}} 400.0
|
|
264
|
-
FIELD_1{{gpu="0",dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-2",dstack_run_id="{job_2.run_id}",dstack_job_name="run-2-0-0",dstack_job_id="{job_2.id}",dstack_job_num="0",dstack_replica_num="0"}} 1200.0
|
|
265
|
-
FIELD_1{{gpu="1",dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-2",dstack_run_id="{job_2.run_id}",dstack_job_name="run-2-0-0",dstack_job_id="{job_2.id}",dstack_job_num="0",dstack_replica_num="0"}} 1600.0
|
|
266
|
-
FIELD_1{{gpu="2",dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-2",dstack_run_id="{job_2.run_id}",dstack_job_name="run-2-0-0",dstack_job_id="{job_2.id}",dstack_job_num="0",dstack_replica_num="0"}} 2400.0
|
|
267
|
-
# HELP FIELD_2 Test field 2
|
|
268
|
-
# TYPE FIELD_2 counter
|
|
269
|
-
FIELD_2{{gpu="0",dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1.id}",dstack_job_num="0",dstack_replica_num="0"}} 337325.0 1395066363000
|
|
270
|
-
FIELD_2{{gpu="1",dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1.id}",dstack_job_num="0",dstack_replica_num="0"}} 987169.0 1395066363010
|
|
271
|
-
""")
|
|
272
|
-
|
|
273
|
-
async def test_returns_empty_response_if_no_runs(
|
|
274
|
-
self, session: AsyncSession, client: AsyncClient
|
|
294
|
+
@pytest.mark.parametrize("token", [None, "foo"])
|
|
295
|
+
async def test_returns_403_if_not_authenticated(
|
|
296
|
+
self, monkeypatch: pytest.MonkeyPatch, client: AsyncClient, token: Optional[str]
|
|
275
297
|
):
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
298
|
+
monkeypatch.setattr("dstack._internal.server.routers.prometheus._auth._token", "secret")
|
|
299
|
+
if token is not None:
|
|
300
|
+
headers = get_auth_headers(token)
|
|
301
|
+
else:
|
|
302
|
+
headers = None
|
|
303
|
+
response = await client.get("/metrics", headers=headers)
|
|
304
|
+
assert response.status_code == 403
|
|
281
305
|
|
|
282
|
-
async def
|
|
283
|
-
|
|
284
|
-
assert response.status_code == 404
|
|
285
|
-
|
|
286
|
-
async def test_returns_404_if_not_enabled(
|
|
287
|
-
self, monkeypatch: pytest.MonkeyPatch, session: AsyncSession, client: AsyncClient
|
|
306
|
+
async def test_returns_200_if_token_is_valid(
|
|
307
|
+
self, monkeypatch: pytest.MonkeyPatch, client: AsyncClient
|
|
288
308
|
):
|
|
289
|
-
monkeypatch.setattr("dstack._internal.server.
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
response = await client.get("/metrics/project/test-project")
|
|
293
|
-
assert response.status_code == 404
|
|
309
|
+
monkeypatch.setattr("dstack._internal.server.routers.prometheus._auth._token", "secret")
|
|
310
|
+
response = await client.get("/metrics", headers=get_auth_headers("secret"))
|
|
311
|
+
assert response.status_code == 200
|
|
294
312
|
|
|
295
313
|
|
|
296
314
|
async def _create_project(session: AsyncSession, name: str, user: UserModel) -> ProjectModel:
|
|
@@ -301,26 +319,46 @@ async def _create_project(session: AsyncSession, name: str, user: UserModel) ->
|
|
|
301
319
|
return project
|
|
302
320
|
|
|
303
321
|
|
|
304
|
-
async def
|
|
322
|
+
async def _create_run(
|
|
305
323
|
session: AsyncSession,
|
|
306
324
|
run_name: str,
|
|
307
325
|
project: ProjectModel,
|
|
308
326
|
user: UserModel,
|
|
309
|
-
status:
|
|
310
|
-
job_provisioning_data: Optional[JobProvisioningData] = None,
|
|
311
|
-
job_runtime_data: Optional[JobRuntimeData] = None,
|
|
327
|
+
status: RunStatus,
|
|
312
328
|
submitted_at: datetime = FAKE_NOW,
|
|
313
|
-
) ->
|
|
329
|
+
) -> RunModel:
|
|
314
330
|
repo = await create_repo(session=session, project_id=project.id, repo_name=f"{run_name}-repo")
|
|
315
331
|
configuration = DevEnvironmentConfiguration(ide="vscode")
|
|
316
332
|
run_spec = get_run_spec(run_name=run_name, repo_id=repo.name, configuration=configuration)
|
|
317
|
-
|
|
333
|
+
return await create_run(
|
|
318
334
|
session=session,
|
|
319
335
|
project=project,
|
|
320
336
|
repo=repo,
|
|
321
337
|
user=user,
|
|
322
338
|
run_name=run_name,
|
|
323
339
|
run_spec=run_spec,
|
|
340
|
+
status=status,
|
|
341
|
+
submitted_at=submitted_at,
|
|
342
|
+
)
|
|
343
|
+
|
|
344
|
+
|
|
345
|
+
async def _create_job(
|
|
346
|
+
session: AsyncSession,
|
|
347
|
+
run_name: str,
|
|
348
|
+
project: ProjectModel,
|
|
349
|
+
user: UserModel,
|
|
350
|
+
status: JobStatus,
|
|
351
|
+
job_provisioning_data: Optional[JobProvisioningData] = None,
|
|
352
|
+
job_runtime_data: Optional[JobRuntimeData] = None,
|
|
353
|
+
submitted_at: datetime = FAKE_NOW,
|
|
354
|
+
) -> JobModel:
|
|
355
|
+
run = await _create_run(
|
|
356
|
+
session=session,
|
|
357
|
+
run_name=run_name,
|
|
358
|
+
project=project,
|
|
359
|
+
user=user,
|
|
360
|
+
status=RunStatus.SUBMITTED,
|
|
361
|
+
submitted_at=submitted_at,
|
|
324
362
|
)
|
|
325
363
|
job = await create_job(
|
|
326
364
|
session=session,
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
|
|
3
|
+
from dstack._internal.utils.event_loop import DaemonEventLoop
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def test_daemon_event_loop():
|
|
7
|
+
q = asyncio.Queue()
|
|
8
|
+
|
|
9
|
+
async def worker(i):
|
|
10
|
+
await q.put(i)
|
|
11
|
+
|
|
12
|
+
async def all_workers():
|
|
13
|
+
await asyncio.gather(*[worker(i) for i in range(3)])
|
|
14
|
+
|
|
15
|
+
loop = DaemonEventLoop()
|
|
16
|
+
loop.await_(all_workers())
|
|
17
|
+
assert q.qsize() == 3
|
|
18
|
+
assert {loop.await_(q.get()) for _ in range(3)} == {0, 1, 2}
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|