dstack 0.19.0rc1__py3-none-any.whl → 0.19.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. dstack/_internal/cli/commands/metrics.py +138 -0
  2. dstack/_internal/cli/commands/stats.py +5 -119
  3. dstack/_internal/cli/main.py +2 -0
  4. dstack/_internal/core/backends/base/compute.py +3 -0
  5. dstack/_internal/core/backends/base/models.py +7 -7
  6. dstack/_internal/core/backends/configurators.py +9 -0
  7. dstack/_internal/core/backends/models.py +8 -0
  8. dstack/_internal/core/backends/nebius/__init__.py +0 -0
  9. dstack/_internal/core/backends/nebius/backend.py +16 -0
  10. dstack/_internal/core/backends/nebius/compute.py +270 -0
  11. dstack/_internal/core/backends/nebius/configurator.py +74 -0
  12. dstack/_internal/core/backends/nebius/models.py +108 -0
  13. dstack/_internal/core/backends/nebius/resources.py +222 -0
  14. dstack/_internal/core/errors.py +14 -0
  15. dstack/_internal/core/models/backends/base.py +2 -0
  16. dstack/_internal/proxy/lib/schemas/model_proxy.py +3 -3
  17. dstack/_internal/server/background/tasks/process_instances.py +26 -12
  18. dstack/_internal/server/routers/prometheus.py +5 -12
  19. dstack/_internal/server/security/permissions.py +19 -1
  20. dstack/_internal/server/services/jobs/configurators/extensions/cursor.py +1 -1
  21. dstack/_internal/server/services/prometheus.py +175 -112
  22. dstack/_internal/server/statics/index.html +1 -1
  23. dstack/_internal/server/statics/{main-4fd5a4770eff59325ee3.js → main-bcb3228138bc8483cc0b.js} +7278 -131
  24. dstack/_internal/server/statics/{main-4fd5a4770eff59325ee3.js.map → main-bcb3228138bc8483cc0b.js.map} +1 -1
  25. dstack/_internal/server/statics/{main-da9f8c06a69c20dac23e.css → main-c0bdaac8f1ea67d499eb.css} +1 -1
  26. dstack/_internal/utils/event_loop.py +30 -0
  27. dstack/version.py +1 -1
  28. {dstack-0.19.0rc1.dist-info → dstack-0.19.2.dist-info}/METADATA +27 -11
  29. {dstack-0.19.0rc1.dist-info → dstack-0.19.2.dist-info}/RECORD +37 -28
  30. tests/_internal/server/background/tasks/test_process_instances.py +68 -2
  31. tests/_internal/server/routers/test_backends.py +116 -0
  32. tests/_internal/server/routers/test_prometheus.py +158 -120
  33. tests/_internal/utils/test_event_loop.py +18 -0
  34. {dstack-0.19.0rc1.dist-info → dstack-0.19.2.dist-info}/LICENSE.md +0 -0
  35. {dstack-0.19.0rc1.dist-info → dstack-0.19.2.dist-info}/WHEEL +0 -0
  36. {dstack-0.19.0rc1.dist-info → dstack-0.19.2.dist-info}/entry_points.txt +0 -0
  37. {dstack-0.19.0rc1.dist-info → dstack-0.19.2.dist-info}/top_level.txt +0 -0
@@ -9,19 +9,26 @@ from sqlalchemy.ext.asyncio import AsyncSession
9
9
 
10
10
  from dstack._internal.core.models.backends.base import BackendType
11
11
  from dstack._internal.core.models.configurations import DevEnvironmentConfiguration
12
- from dstack._internal.core.models.runs import JobProvisioningData, JobRuntimeData, JobStatus
12
+ from dstack._internal.core.models.runs import (
13
+ JobProvisioningData,
14
+ JobRuntimeData,
15
+ JobStatus,
16
+ RunStatus,
17
+ )
13
18
  from dstack._internal.core.models.users import GlobalRole, ProjectRole
14
- from dstack._internal.server.models import JobModel, ProjectModel, UserModel
19
+ from dstack._internal.server.models import JobModel, ProjectModel, RunModel, UserModel
15
20
  from dstack._internal.server.services.projects import add_project_member
16
21
  from dstack._internal.server.testing.common import (
17
22
  create_fleet,
18
23
  create_instance,
19
24
  create_job,
25
+ create_job_metrics_point,
20
26
  create_job_prometheus_metrics,
21
27
  create_project,
22
28
  create_repo,
23
29
  create_run,
24
30
  create_user,
31
+ get_auth_headers,
25
32
  get_instance_offer_with_availability,
26
33
  get_job_provisioning_data,
27
34
  get_job_runtime_data,
@@ -32,6 +39,7 @@ from dstack._internal.server.testing.common import (
32
39
  @pytest.fixture
33
40
  def enable_metrics(monkeypatch: pytest.MonkeyPatch):
34
41
  monkeypatch.setattr("dstack._internal.server.settings.ENABLE_PROMETHEUS_METRICS", True)
42
+ monkeypatch.setattr("dstack._internal.server.routers.prometheus._auth._token", None)
35
43
 
36
44
 
37
45
  FAKE_NOW = datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc)
@@ -45,11 +53,21 @@ class TestGetPrometheusMetrics:
45
53
  async def test_returns_metrics(self, session: AsyncSession, client: AsyncClient):
46
54
  user = await create_user(session=session, name="test-user", global_role=GlobalRole.USER)
47
55
  offer = get_instance_offer_with_availability(
48
- instance_type="test-type", gpu_count=2, gpu_name="V4", price=12
56
+ instance_type="test-type",
57
+ cpu_count=32,
58
+ memory_gib=128,
59
+ gpu_count=2,
60
+ gpu_name="V4",
61
+ price=12,
49
62
  )
50
63
  project_2 = await _create_project(session, "project-2", user)
51
64
  jpd_2_1 = get_job_provisioning_data(
52
- backend=BackendType.AWS, gpu_name="T4", gpu_count=2, price=16
65
+ backend=BackendType.AWS,
66
+ cpu_count=16,
67
+ memory_gib=64,
68
+ gpu_name="T4",
69
+ gpu_count=2,
70
+ price=16,
53
71
  )
54
72
  job_2_1 = await _create_job(
55
73
  session=session,
@@ -100,7 +118,41 @@ class TestGetPrometheusMetrics:
100
118
  FIELD_2{gpu="1"} 987169 1395066363010
101
119
  """),
102
120
  )
103
- job_1_2 = await _create_job(session, "run-2", project_1, user, JobStatus.RUNNING)
121
+ await create_job_metrics_point(
122
+ session=session,
123
+ job_model=job_1_1,
124
+ timestamp=FAKE_NOW - timedelta(seconds=30),
125
+ cpu_usage_micro=3_500_000,
126
+ memory_working_set_bytes=3_221_225_472,
127
+ memory_usage_bytes=4_294_967_296,
128
+ )
129
+ # Older, ignored
130
+ await create_job_metrics_point(
131
+ session=session,
132
+ job_model=job_1_1,
133
+ timestamp=FAKE_NOW - timedelta(seconds=60),
134
+ cpu_usage_micro=2_000_000,
135
+ memory_working_set_bytes=1_073_741_824,
136
+ memory_usage_bytes=2_147_483_648,
137
+ )
138
+ jpd_1_2 = get_job_provisioning_data(
139
+ backend=BackendType.AWS,
140
+ cpu_count=24,
141
+ memory_gib=224,
142
+ gpu_count=3,
143
+ gpu_name="L4",
144
+ price=12.5,
145
+ )
146
+ job_1_2 = await _create_job(
147
+ session=session,
148
+ run_name="run-2",
149
+ project=project_1,
150
+ user=user,
151
+ status=JobStatus.RUNNING,
152
+ job_provisioning_data=jpd_1_2,
153
+ submitted_at=FAKE_NOW - timedelta(seconds=150),
154
+ )
155
+
104
156
  await create_job_prometheus_metrics(
105
157
  session=session,
106
158
  job=job_1_2,
@@ -124,6 +176,15 @@ class TestGetPrometheusMetrics:
124
176
  FIELD_1{gpu="1"} 20
125
177
  """),
126
178
  )
179
+ await _create_run(session, "done", project_1, user, RunStatus.DONE)
180
+ other_user = await create_user(
181
+ session=session, name="other-user", global_role=GlobalRole.USER
182
+ )
183
+ await add_project_member(
184
+ session=session, project=project_2, user=other_user, project_role=ProjectRole.USER
185
+ )
186
+ await _create_run(session, "failed-1", project_2, other_user, RunStatus.FAILED)
187
+ await _create_run(session, "failed-2", project_2, other_user, RunStatus.FAILED)
127
188
  fleet = await create_fleet(session=session, project=project_1, name="test-fleet")
128
189
  instance = await create_instance(
129
190
  session=session,
@@ -149,31 +210,73 @@ class TestGetPrometheusMetrics:
149
210
  # HELP dstack_instance_gpu_count Instance GPU count
150
211
  # TYPE dstack_instance_gpu_count gauge
151
212
  dstack_instance_gpu_count{{dstack_project_name="project-1",dstack_fleet_name="test-fleet",dstack_fleet_id="{fleet.id}",dstack_instance_name="test-instance",dstack_instance_id="{instance.id}",dstack_instance_type="test-type",dstack_backend="aws",dstack_gpu="V4"}} 2.0
213
+ # HELP dstack_run_count_total Total runs count
214
+ # TYPE dstack_run_count_total counter
215
+ dstack_run_count_total{{dstack_project_name="project-1",dstack_user_name="test-user"}} 4.0
216
+ dstack_run_count_total{{dstack_project_name="project-2",dstack_user_name="other-user"}} 2.0
217
+ dstack_run_count_total{{dstack_project_name="project-2",dstack_user_name="test-user"}} 1.0
218
+ # HELP dstack_run_count_terminated_total Terminated runs count
219
+ # TYPE dstack_run_count_terminated_total counter
220
+ dstack_run_count_terminated_total{{dstack_project_name="project-1",dstack_user_name="test-user"}} 0.0
221
+ dstack_run_count_terminated_total{{dstack_project_name="project-2",dstack_user_name="other-user"}} 0.0
222
+ dstack_run_count_terminated_total{{dstack_project_name="project-2",dstack_user_name="test-user"}} 0.0
223
+ # HELP dstack_run_count_failed_total Failed runs count
224
+ # TYPE dstack_run_count_failed_total counter
225
+ dstack_run_count_failed_total{{dstack_project_name="project-1",dstack_user_name="test-user"}} 0.0
226
+ dstack_run_count_failed_total{{dstack_project_name="project-2",dstack_user_name="other-user"}} 2.0
227
+ dstack_run_count_failed_total{{dstack_project_name="project-2",dstack_user_name="test-user"}} 0.0
228
+ # HELP dstack_run_count_done_total Done runs count
229
+ # TYPE dstack_run_count_done_total counter
230
+ dstack_run_count_done_total{{dstack_project_name="project-1",dstack_user_name="test-user"}} 1.0
231
+ dstack_run_count_done_total{{dstack_project_name="project-2",dstack_user_name="other-user"}} 0.0
232
+ dstack_run_count_done_total{{dstack_project_name="project-2",dstack_user_name="test-user"}} 0.0
152
233
  # HELP dstack_job_duration_seconds_total Total seconds the job is running
153
234
  # TYPE dstack_job_duration_seconds_total counter
154
235
  dstack_job_duration_seconds_total{{dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="V4"}} 120.0
236
+ dstack_job_duration_seconds_total{{dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-2",dstack_run_id="{job_1_2.run_id}",dstack_job_name="run-2-0-0",dstack_job_id="{job_1_2.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="L4"}} 150.0
155
237
  dstack_job_duration_seconds_total{{dstack_project_name="project-2",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_2_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_2_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="T4"}} 100.0
156
238
  # HELP dstack_job_price_dollars_per_hour Job instance price, USD/hour
157
239
  # TYPE dstack_job_price_dollars_per_hour gauge
158
240
  dstack_job_price_dollars_per_hour{{dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="V4"}} 12.0
241
+ dstack_job_price_dollars_per_hour{{dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-2",dstack_run_id="{job_1_2.run_id}",dstack_job_name="run-2-0-0",dstack_job_id="{job_1_2.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="L4"}} 12.5
159
242
  dstack_job_price_dollars_per_hour{{dstack_project_name="project-2",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_2_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_2_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="T4"}} 16.0
160
243
  # HELP dstack_job_gpu_count Job GPU count
161
244
  # TYPE dstack_job_gpu_count gauge
162
245
  dstack_job_gpu_count{{dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="V4"}} 2.0
246
+ dstack_job_gpu_count{{dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-2",dstack_run_id="{job_1_2.run_id}",dstack_job_name="run-2-0-0",dstack_job_id="{job_1_2.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="L4"}} 3.0
163
247
  dstack_job_gpu_count{{dstack_project_name="project-2",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_2_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_2_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="T4"}} 2.0
248
+ # HELP dstack_job_cpu_count Job CPU count
249
+ # TYPE dstack_job_cpu_count gauge
250
+ dstack_job_cpu_count{{dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="V4"}} 32.0
251
+ dstack_job_cpu_count{{dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-2",dstack_run_id="{job_1_2.run_id}",dstack_job_name="run-2-0-0",dstack_job_id="{job_1_2.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="L4"}} 24.0
252
+ dstack_job_cpu_count{{dstack_project_name="project-2",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_2_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_2_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="T4"}} 16.0
253
+ # HELP dstack_job_cpu_time_seconds_total Total CPU time consumed by the job, seconds
254
+ # TYPE dstack_job_cpu_time_seconds_total counter
255
+ dstack_job_cpu_time_seconds_total{{dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="V4"}} 3.5
256
+ # HELP dstack_job_memory_total_bytes Total memory allocated for the job, bytes
257
+ # TYPE dstack_job_memory_total_bytes gauge
258
+ dstack_job_memory_total_bytes{{dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="V4"}} 137438953472.0
259
+ dstack_job_memory_total_bytes{{dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-2",dstack_run_id="{job_1_2.run_id}",dstack_job_name="run-2-0-0",dstack_job_id="{job_1_2.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="L4"}} 240518168576.0
260
+ dstack_job_memory_total_bytes{{dstack_project_name="project-2",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_2_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_2_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="T4"}} 68719476736.0
261
+ # HELP dstack_job_memory_usage_bytes Memory used by the job (including cache), bytes
262
+ # TYPE dstack_job_memory_usage_bytes gauge
263
+ dstack_job_memory_usage_bytes{{dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="V4"}} 4294967296.0
264
+ # HELP dstack_job_memory_working_set_bytes Memory used by the job (not including cache), bytes
265
+ # TYPE dstack_job_memory_working_set_bytes gauge
266
+ dstack_job_memory_working_set_bytes{{dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="V4"}} 3221225472.0
164
267
  # HELP FIELD_1 Test field 1
165
268
  # TYPE FIELD_1 gauge
166
- FIELD_1{{gpu="0",dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0"}} 350.0
167
- FIELD_1{{gpu="1",dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0"}} 400.0
168
- FIELD_1{{gpu="0",dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-2",dstack_run_id="{job_1_2.run_id}",dstack_job_name="run-2-0-0",dstack_job_id="{job_1_2.id}",dstack_job_num="0",dstack_replica_num="0"}} 1200.0
169
- FIELD_1{{gpu="1",dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-2",dstack_run_id="{job_1_2.run_id}",dstack_job_name="run-2-0-0",dstack_job_id="{job_1_2.id}",dstack_job_num="0",dstack_replica_num="0"}} 1600.0
170
- FIELD_1{{gpu="2",dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-2",dstack_run_id="{job_1_2.run_id}",dstack_job_name="run-2-0-0",dstack_job_id="{job_1_2.id}",dstack_job_num="0",dstack_replica_num="0"}} 2400.0
171
- FIELD_1{{gpu="0",dstack_project_name="project-2",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_2_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_2_1.id}",dstack_job_num="0",dstack_replica_num="0"}} 100.0
172
- FIELD_1{{gpu="1",dstack_project_name="project-2",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_2_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_2_1.id}",dstack_job_num="0",dstack_replica_num="0"}} 200.0
269
+ FIELD_1{{gpu="0",dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="V4"}} 350.0
270
+ FIELD_1{{gpu="1",dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="V4"}} 400.0
271
+ FIELD_1{{gpu="0",dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-2",dstack_run_id="{job_1_2.run_id}",dstack_job_name="run-2-0-0",dstack_job_id="{job_1_2.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="L4"}} 1200.0
272
+ FIELD_1{{gpu="1",dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-2",dstack_run_id="{job_1_2.run_id}",dstack_job_name="run-2-0-0",dstack_job_id="{job_1_2.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="L4"}} 1600.0
273
+ FIELD_1{{gpu="2",dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-2",dstack_run_id="{job_1_2.run_id}",dstack_job_name="run-2-0-0",dstack_job_id="{job_1_2.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="L4"}} 2400.0
274
+ FIELD_1{{gpu="0",dstack_project_name="project-2",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_2_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_2_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="T4"}} 100.0
275
+ FIELD_1{{gpu="1",dstack_project_name="project-2",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_2_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_2_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="T4"}} 200.0
173
276
  # HELP FIELD_2 Test field 2
174
277
  # TYPE FIELD_2 counter
175
- FIELD_2{{gpu="0",dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0"}} 337325.0 1395066363000
176
- FIELD_2{{gpu="1",dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0"}} 987169.0 1395066363010
278
+ FIELD_2{{gpu="0",dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="V4"}} 337325.0 1395066363000
279
+ FIELD_2{{gpu="1",dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="V4"}} 987169.0 1395066363010
177
280
  """)
178
281
 
179
282
  async def test_returns_empty_response_if_no_runs(self, client: AsyncClient):
@@ -188,109 +291,24 @@ class TestGetPrometheusMetrics:
188
291
  response = await client.get("/metrics")
189
292
  assert response.status_code == 404
190
293
 
191
-
192
- @pytest.mark.asyncio
193
- @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True)
194
- @pytest.mark.usefixtures("image_config_mock", "test_db", "enable_metrics")
195
- class TestGetPrometheusProjectMetrics:
196
- async def test_returns_metrics(self, session: AsyncSession, client: AsyncClient):
197
- user = await create_user(session=session, name="test-user", global_role=GlobalRole.USER)
198
- project = await _create_project(session, "project-1", user)
199
- job_1 = await _create_job(session, "run-1", project, user, JobStatus.RUNNING)
200
- await create_job_prometheus_metrics(
201
- session=session,
202
- job=job_1,
203
- text=dedent("""
204
- # Comments should be skipped
205
-
206
- # HELP FIELD_1 Test field 1
207
- # TYPE FIELD_1 gauge
208
- FIELD_1{gpu="0"} 350
209
- FIELD_1{gpu="1"} 400
210
-
211
- # HELP FIELD_2 Test field 2
212
- # TYPE FIELD_2 counter
213
- FIELD_2{gpu="0"} 337325 1395066363000
214
- FIELD_2{gpu="1"} 987169 1395066363010
215
- """),
216
- )
217
- job_2 = await _create_job(session, "run-2", project, user, JobStatus.RUNNING)
218
- await create_job_prometheus_metrics(
219
- session=session,
220
- job=job_2,
221
- text=dedent("""
222
- # HELP FIELD_1 Test field 1
223
- # TYPE FIELD_1 gauge
224
- FIELD_1{gpu="0"} 1200.0
225
- FIELD_1{gpu="1"} 1600.0
226
- FIELD_1{gpu="2"} 2400.0
227
- """),
228
- )
229
- # Terminated job, should not appear in the response
230
- job_3 = await _create_job(session, "run-3", project, user, JobStatus.TERMINATED)
231
- await create_job_prometheus_metrics(
232
- session=session,
233
- job=job_3,
234
- text=dedent("""
235
- # HELP FIELD_1 Test field 1
236
- # TYPE FIELD_1 gauge
237
- FIELD_1{gpu="0"} 10
238
- FIELD_1{gpu="1"} 20
239
- """),
240
- )
241
- another_project = await _create_project(session, "project-2", user)
242
- another_project_job = await _create_job(
243
- session, "run-4", another_project, user, JobStatus.RUNNING
244
- )
245
- await create_job_prometheus_metrics(
246
- session=session,
247
- job=another_project_job,
248
- text=dedent("""
249
- # HELP FIELD_1 Test field 1
250
- # TYPE FIELD_1 gauge
251
- FIELD_1{gpu="0"} 100
252
- FIELD_1{gpu="1"} 200
253
- """),
254
- )
255
-
256
- response = await client.get("/metrics/project/project-1")
257
-
258
- assert response.status_code == 200
259
- assert response.text == dedent(f"""\
260
- # HELP FIELD_1 Test field 1
261
- # TYPE FIELD_1 gauge
262
- FIELD_1{{gpu="0",dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1.id}",dstack_job_num="0",dstack_replica_num="0"}} 350.0
263
- FIELD_1{{gpu="1",dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1.id}",dstack_job_num="0",dstack_replica_num="0"}} 400.0
264
- FIELD_1{{gpu="0",dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-2",dstack_run_id="{job_2.run_id}",dstack_job_name="run-2-0-0",dstack_job_id="{job_2.id}",dstack_job_num="0",dstack_replica_num="0"}} 1200.0
265
- FIELD_1{{gpu="1",dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-2",dstack_run_id="{job_2.run_id}",dstack_job_name="run-2-0-0",dstack_job_id="{job_2.id}",dstack_job_num="0",dstack_replica_num="0"}} 1600.0
266
- FIELD_1{{gpu="2",dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-2",dstack_run_id="{job_2.run_id}",dstack_job_name="run-2-0-0",dstack_job_id="{job_2.id}",dstack_job_num="0",dstack_replica_num="0"}} 2400.0
267
- # HELP FIELD_2 Test field 2
268
- # TYPE FIELD_2 counter
269
- FIELD_2{{gpu="0",dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1.id}",dstack_job_num="0",dstack_replica_num="0"}} 337325.0 1395066363000
270
- FIELD_2{{gpu="1",dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1.id}",dstack_job_num="0",dstack_replica_num="0"}} 987169.0 1395066363010
271
- """)
272
-
273
- async def test_returns_empty_response_if_no_runs(
274
- self, session: AsyncSession, client: AsyncClient
294
+ @pytest.mark.parametrize("token", [None, "foo"])
295
+ async def test_returns_403_if_not_authenticated(
296
+ self, monkeypatch: pytest.MonkeyPatch, client: AsyncClient, token: Optional[str]
275
297
  ):
276
- user = await create_user(session=session, global_role=GlobalRole.USER)
277
- await create_project(session=session, owner=user, name="test-project")
278
- response = await client.get("/metrics/project/test-project")
279
- assert response.status_code == 200
280
- assert response.text == "\n"
298
+ monkeypatch.setattr("dstack._internal.server.routers.prometheus._auth._token", "secret")
299
+ if token is not None:
300
+ headers = get_auth_headers(token)
301
+ else:
302
+ headers = None
303
+ response = await client.get("/metrics", headers=headers)
304
+ assert response.status_code == 403
281
305
 
282
- async def test_returns_404_if_project_doesnt_exist(self, client: AsyncClient):
283
- response = await client.get("/metrics/project/nonexistent")
284
- assert response.status_code == 404
285
-
286
- async def test_returns_404_if_not_enabled(
287
- self, monkeypatch: pytest.MonkeyPatch, session: AsyncSession, client: AsyncClient
306
+ async def test_returns_200_if_token_is_valid(
307
+ self, monkeypatch: pytest.MonkeyPatch, client: AsyncClient
288
308
  ):
289
- monkeypatch.setattr("dstack._internal.server.settings.ENABLE_PROMETHEUS_METRICS", False)
290
- user = await create_user(session=session, global_role=GlobalRole.USER)
291
- await create_project(session=session, owner=user, name="test-project")
292
- response = await client.get("/metrics/project/test-project")
293
- assert response.status_code == 404
309
+ monkeypatch.setattr("dstack._internal.server.routers.prometheus._auth._token", "secret")
310
+ response = await client.get("/metrics", headers=get_auth_headers("secret"))
311
+ assert response.status_code == 200
294
312
 
295
313
 
296
314
  async def _create_project(session: AsyncSession, name: str, user: UserModel) -> ProjectModel:
@@ -301,26 +319,46 @@ async def _create_project(session: AsyncSession, name: str, user: UserModel) ->
301
319
  return project
302
320
 
303
321
 
304
- async def _create_job(
322
+ async def _create_run(
305
323
  session: AsyncSession,
306
324
  run_name: str,
307
325
  project: ProjectModel,
308
326
  user: UserModel,
309
- status: JobStatus,
310
- job_provisioning_data: Optional[JobProvisioningData] = None,
311
- job_runtime_data: Optional[JobRuntimeData] = None,
327
+ status: RunStatus,
312
328
  submitted_at: datetime = FAKE_NOW,
313
- ) -> JobModel:
329
+ ) -> RunModel:
314
330
  repo = await create_repo(session=session, project_id=project.id, repo_name=f"{run_name}-repo")
315
331
  configuration = DevEnvironmentConfiguration(ide="vscode")
316
332
  run_spec = get_run_spec(run_name=run_name, repo_id=repo.name, configuration=configuration)
317
- run = await create_run(
333
+ return await create_run(
318
334
  session=session,
319
335
  project=project,
320
336
  repo=repo,
321
337
  user=user,
322
338
  run_name=run_name,
323
339
  run_spec=run_spec,
340
+ status=status,
341
+ submitted_at=submitted_at,
342
+ )
343
+
344
+
345
+ async def _create_job(
346
+ session: AsyncSession,
347
+ run_name: str,
348
+ project: ProjectModel,
349
+ user: UserModel,
350
+ status: JobStatus,
351
+ job_provisioning_data: Optional[JobProvisioningData] = None,
352
+ job_runtime_data: Optional[JobRuntimeData] = None,
353
+ submitted_at: datetime = FAKE_NOW,
354
+ ) -> JobModel:
355
+ run = await _create_run(
356
+ session=session,
357
+ run_name=run_name,
358
+ project=project,
359
+ user=user,
360
+ status=RunStatus.SUBMITTED,
361
+ submitted_at=submitted_at,
324
362
  )
325
363
  job = await create_job(
326
364
  session=session,
@@ -0,0 +1,18 @@
1
+ import asyncio
2
+
3
+ from dstack._internal.utils.event_loop import DaemonEventLoop
4
+
5
+
6
+ def test_daemon_event_loop():
7
+ q = asyncio.Queue()
8
+
9
+ async def worker(i):
10
+ await q.put(i)
11
+
12
+ async def all_workers():
13
+ await asyncio.gather(*[worker(i) for i in range(3)])
14
+
15
+ loop = DaemonEventLoop()
16
+ loop.await_(all_workers())
17
+ assert q.qsize() == 3
18
+ assert {loop.await_(q.get()) for _ in range(3)} == {0, 1, 2}