dstack 0.19.0__py3-none-any.whl → 0.19.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dstack might be problematic. Click here for more details.
- dstack/_internal/server/background/tasks/process_instances.py +14 -5
- dstack/_internal/server/routers/prometheus.py +0 -12
- dstack/_internal/server/services/jobs/configurators/extensions/cursor.py +1 -1
- dstack/_internal/server/services/prometheus.py +175 -112
- dstack/version.py +1 -1
- {dstack-0.19.0.dist-info → dstack-0.19.1.dist-info}/METADATA +1 -1
- {dstack-0.19.0.dist-info → dstack-0.19.1.dist-info}/RECORD +13 -13
- tests/_internal/server/background/tasks/test_process_instances.py +65 -1
- tests/_internal/server/routers/test_prometheus.py +141 -124
- {dstack-0.19.0.dist-info → dstack-0.19.1.dist-info}/LICENSE.md +0 -0
- {dstack-0.19.0.dist-info → dstack-0.19.1.dist-info}/WHEEL +0 -0
- {dstack-0.19.0.dist-info → dstack-0.19.1.dist-info}/entry_points.txt +0 -0
- {dstack-0.19.0.dist-info → dstack-0.19.1.dist-info}/top_level.txt +0 -0
|
@@ -64,6 +64,7 @@ from dstack._internal.core.models.runs import (
|
|
|
64
64
|
Retry,
|
|
65
65
|
)
|
|
66
66
|
from dstack._internal.core.services.profiles import get_retry
|
|
67
|
+
from dstack._internal.server import settings as server_settings
|
|
67
68
|
from dstack._internal.server.background.tasks.common import get_provisioning_timeout
|
|
68
69
|
from dstack._internal.server.db import get_session_ctx
|
|
69
70
|
from dstack._internal.server.models import (
|
|
@@ -529,7 +530,9 @@ async def _create_instance(session: AsyncSession, instance: InstanceModel) -> No
|
|
|
529
530
|
session=session, fleet_id=instance.fleet_id
|
|
530
531
|
)
|
|
531
532
|
|
|
532
|
-
|
|
533
|
+
# Limit number of offers tried to prevent long-running processing
|
|
534
|
+
# in case all offers fail.
|
|
535
|
+
for backend, instance_offer in offers[: server_settings.MAX_OFFERS_TRIED]:
|
|
533
536
|
if instance_offer.backend not in BACKENDS_WITH_CREATE_INSTANCE_SUPPORT:
|
|
534
537
|
continue
|
|
535
538
|
compute = backend.compute()
|
|
@@ -578,8 +581,13 @@ async def _create_instance(session: AsyncSession, instance: InstanceModel) -> No
|
|
|
578
581
|
extra={"instance_name": instance.name},
|
|
579
582
|
)
|
|
580
583
|
continue
|
|
581
|
-
except
|
|
582
|
-
|
|
584
|
+
except Exception:
|
|
585
|
+
logger.exception(
|
|
586
|
+
"Got exception when launching %s in %s/%s",
|
|
587
|
+
instance_offer.instance.name,
|
|
588
|
+
instance_offer.backend.value,
|
|
589
|
+
instance_offer.region,
|
|
590
|
+
)
|
|
583
591
|
continue
|
|
584
592
|
|
|
585
593
|
instance.status = InstanceStatus.PROVISIONING
|
|
@@ -607,10 +615,11 @@ async def _create_instance(session: AsyncSession, instance: InstanceModel) -> No
|
|
|
607
615
|
|
|
608
616
|
if not should_retry:
|
|
609
617
|
instance.status = InstanceStatus.TERMINATED
|
|
610
|
-
instance.termination_reason = "No offers found"
|
|
618
|
+
instance.termination_reason = "All offers failed" if offers else "No offers found"
|
|
611
619
|
logger.info(
|
|
612
|
-
"
|
|
620
|
+
"Terminated instance %s: %s",
|
|
613
621
|
instance.name,
|
|
622
|
+
instance.termination_reason,
|
|
614
623
|
extra={
|
|
615
624
|
"instance_name": instance.name,
|
|
616
625
|
"instance_status": InstanceStatus.TERMINATED.value,
|
|
@@ -6,8 +6,6 @@ from sqlalchemy.ext.asyncio import AsyncSession
|
|
|
6
6
|
|
|
7
7
|
from dstack._internal.server import settings
|
|
8
8
|
from dstack._internal.server.db import get_session
|
|
9
|
-
from dstack._internal.server.deps import Project
|
|
10
|
-
from dstack._internal.server.models import ProjectModel
|
|
11
9
|
from dstack._internal.server.services import prometheus
|
|
12
10
|
from dstack._internal.server.utils.routers import error_not_found
|
|
13
11
|
|
|
@@ -24,13 +22,3 @@ async def get_prometheus_metrics(
|
|
|
24
22
|
if not settings.ENABLE_PROMETHEUS_METRICS:
|
|
25
23
|
raise error_not_found()
|
|
26
24
|
return await prometheus.get_metrics(session=session)
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
@router.get("/metrics/project/{project_name}", deprecated=True)
|
|
30
|
-
async def get_project_prometheus_metrics(
|
|
31
|
-
session: Annotated[AsyncSession, Depends(get_session)],
|
|
32
|
-
project: Annotated[ProjectModel, Depends(Project())],
|
|
33
|
-
) -> str:
|
|
34
|
-
if not settings.ENABLE_PROMETHEUS_METRICS:
|
|
35
|
-
raise error_not_found()
|
|
36
|
-
return await prometheus.get_project_metrics(session=session, project=project)
|
|
@@ -35,7 +35,7 @@ class CursorDesktop:
|
|
|
35
35
|
|
|
36
36
|
def get_print_readme_commands(self) -> List[str]:
|
|
37
37
|
return [
|
|
38
|
-
"echo To open in
|
|
38
|
+
"echo To open in Cursor, use link below:",
|
|
39
39
|
"echo ''",
|
|
40
40
|
f"echo ' cursor://vscode-remote/ssh-remote+{self.run_name}/workflow'", # TODO use $REPO_DIR
|
|
41
41
|
"echo ''",
|
|
@@ -1,40 +1,38 @@
|
|
|
1
1
|
import itertools
|
|
2
|
+
from collections import defaultdict
|
|
2
3
|
from collections.abc import Generator, Iterable
|
|
3
4
|
from datetime import timezone
|
|
5
|
+
from typing import ClassVar
|
|
6
|
+
from uuid import UUID
|
|
4
7
|
|
|
5
8
|
from prometheus_client import Metric
|
|
6
9
|
from prometheus_client.parser import text_string_to_metric_families
|
|
7
10
|
from prometheus_client.samples import Sample
|
|
8
|
-
from sqlalchemy import select
|
|
11
|
+
from sqlalchemy import func, select
|
|
9
12
|
from sqlalchemy.ext.asyncio import AsyncSession
|
|
10
|
-
from sqlalchemy.orm import joinedload
|
|
13
|
+
from sqlalchemy.orm import aliased, joinedload
|
|
11
14
|
|
|
12
15
|
from dstack._internal.core.models.instances import InstanceStatus
|
|
13
|
-
from dstack._internal.core.models.runs import JobStatus, RunSpec
|
|
16
|
+
from dstack._internal.core.models.runs import JobStatus, RunSpec, RunStatus
|
|
14
17
|
from dstack._internal.server.models import (
|
|
15
18
|
InstanceModel,
|
|
19
|
+
JobMetricsPoint,
|
|
16
20
|
JobModel,
|
|
17
21
|
JobPrometheusMetrics,
|
|
18
22
|
ProjectModel,
|
|
19
23
|
RunModel,
|
|
24
|
+
UserModel,
|
|
20
25
|
)
|
|
21
26
|
from dstack._internal.server.services.instances import get_instance_offer
|
|
22
27
|
from dstack._internal.server.services.jobs import get_job_provisioning_data, get_job_runtime_data
|
|
23
28
|
from dstack._internal.utils.common import get_current_datetime
|
|
24
29
|
|
|
25
|
-
_INSTANCE_DURATION = "dstack_instance_duration_seconds_total"
|
|
26
|
-
_INSTANCE_PRICE = "dstack_instance_price_dollars_per_hour"
|
|
27
|
-
_INSTANCE_GPU_COUNT = "dstack_instance_gpu_count"
|
|
28
|
-
_JOB_DURATION = "dstack_job_duration_seconds_total"
|
|
29
|
-
_JOB_PRICE = "dstack_job_price_dollars_per_hour"
|
|
30
|
-
_JOB_GPU_COUNT = "dstack_job_gpu_count"
|
|
31
|
-
|
|
32
30
|
|
|
33
31
|
async def get_metrics(session: AsyncSession) -> str:
|
|
34
32
|
metrics_iter = itertools.chain(
|
|
35
33
|
await get_instance_metrics(session),
|
|
34
|
+
await get_run_metrics(session),
|
|
36
35
|
await get_job_metrics(session),
|
|
37
|
-
await get_job_gpu_metrics(session),
|
|
38
36
|
)
|
|
39
37
|
return "\n".join(_render_metrics(metrics_iter)) + "\n"
|
|
40
38
|
|
|
@@ -61,19 +59,7 @@ async def get_instance_metrics(session: AsyncSession) -> Iterable[Metric]:
|
|
|
61
59
|
)
|
|
62
60
|
)
|
|
63
61
|
instances = res.unique().scalars().all()
|
|
64
|
-
metrics
|
|
65
|
-
_INSTANCE_DURATION: Metric(
|
|
66
|
-
name=_INSTANCE_DURATION,
|
|
67
|
-
documentation="Total seconds the instance is running",
|
|
68
|
-
typ="counter",
|
|
69
|
-
),
|
|
70
|
-
_INSTANCE_PRICE: Metric(
|
|
71
|
-
name=_INSTANCE_PRICE, documentation="Instance price, USD/hour", typ="gauge"
|
|
72
|
-
),
|
|
73
|
-
_INSTANCE_GPU_COUNT: Metric(
|
|
74
|
-
name=_INSTANCE_GPU_COUNT, documentation="Instance GPU count", typ="gauge"
|
|
75
|
-
),
|
|
76
|
-
}
|
|
62
|
+
metrics = _InstanceMetrics()
|
|
77
63
|
now = get_current_datetime()
|
|
78
64
|
for instance in instances:
|
|
79
65
|
fleet = instance.fleet
|
|
@@ -94,15 +80,36 @@ async def get_instance_metrics(session: AsyncSession) -> Iterable[Metric]:
|
|
|
94
80
|
"dstack_gpu": gpu,
|
|
95
81
|
}
|
|
96
82
|
duration = (now - instance.created_at.replace(tzinfo=timezone.utc)).total_seconds()
|
|
97
|
-
metrics
|
|
98
|
-
|
|
99
|
-
)
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
)
|
|
83
|
+
metrics.add_sample(_INSTANCE_DURATION, labels, duration)
|
|
84
|
+
metrics.add_sample(_INSTANCE_PRICE, labels, instance.price or 0.0)
|
|
85
|
+
metrics.add_sample(_INSTANCE_GPU_COUNT, labels, gpu_count)
|
|
86
|
+
return metrics.values()
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
async def get_run_metrics(session: AsyncSession) -> Iterable[Metric]:
|
|
90
|
+
res = await session.execute(
|
|
91
|
+
select(ProjectModel.name, UserModel.name, RunModel.status, func.count(RunModel.id))
|
|
92
|
+
.join_from(RunModel, ProjectModel)
|
|
93
|
+
.join_from(RunModel, UserModel, RunModel.user_id == UserModel.id)
|
|
94
|
+
.group_by(ProjectModel.name, UserModel.name, RunModel.status)
|
|
95
|
+
.order_by(ProjectModel.name, UserModel.name, RunModel.status)
|
|
96
|
+
)
|
|
97
|
+
projects: dict[str, dict[str, dict[RunStatus, int]]] = defaultdict(
|
|
98
|
+
lambda: defaultdict(lambda: defaultdict(int))
|
|
99
|
+
)
|
|
100
|
+
for project_name, user_name, status, count in res.all():
|
|
101
|
+
projects[project_name][user_name][status] = count
|
|
102
|
+
metrics = _RunMetrics()
|
|
103
|
+
for project_name, users in projects.items():
|
|
104
|
+
for user_name, statuses in users.items():
|
|
105
|
+
labels: dict[str, str] = {
|
|
106
|
+
"dstack_project_name": project_name,
|
|
107
|
+
"dstack_user_name": user_name,
|
|
108
|
+
}
|
|
109
|
+
metrics.add_sample(_RUN_COUNT_TOTAL, labels, sum(statuses.values()))
|
|
110
|
+
metrics.add_sample(_RUN_COUNT_TERMINATED, labels, statuses[RunStatus.TERMINATED])
|
|
111
|
+
metrics.add_sample(_RUN_COUNT_FAILED, labels, statuses[RunStatus.FAILED])
|
|
112
|
+
metrics.add_sample(_RUN_COUNT_DONE, labels, statuses[RunStatus.DONE])
|
|
106
113
|
return metrics.values()
|
|
107
114
|
|
|
108
115
|
|
|
@@ -127,106 +134,162 @@ async def get_job_metrics(session: AsyncSession) -> Iterable[Metric]:
|
|
|
127
134
|
)
|
|
128
135
|
)
|
|
129
136
|
jobs = res.scalars().all()
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
name=_JOB_PRICE, documentation="Job instance price, USD/hour", typ="gauge"
|
|
136
|
-
),
|
|
137
|
-
_JOB_GPU_COUNT: Metric(name=_JOB_GPU_COUNT, documentation="Job GPU count", typ="gauge"),
|
|
138
|
-
}
|
|
137
|
+
job_ids = {job.id for job in jobs}
|
|
138
|
+
job_metrics_points = await _get_job_metrics_points(session, job_ids)
|
|
139
|
+
job_prometheus_metrics = await _get_job_prometheus_metrics(session, job_ids)
|
|
140
|
+
|
|
141
|
+
metrics = _JobMetrics()
|
|
139
142
|
now = get_current_datetime()
|
|
140
143
|
for job in jobs:
|
|
141
144
|
jpd = get_job_provisioning_data(job)
|
|
142
145
|
if jpd is None:
|
|
143
146
|
continue
|
|
144
147
|
jrd = get_job_runtime_data(job)
|
|
145
|
-
|
|
148
|
+
resources = jpd.instance_type.resources
|
|
146
149
|
price = jpd.price
|
|
147
150
|
if jrd is not None and jrd.offer is not None:
|
|
148
|
-
|
|
151
|
+
resources = jrd.offer.instance.resources
|
|
149
152
|
price = jrd.offer.price
|
|
153
|
+
gpus = resources.gpus
|
|
154
|
+
cpus = resources.cpus
|
|
150
155
|
run_spec = RunSpec.__response__.parse_raw(job.run.run_spec)
|
|
151
|
-
labels =
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
156
|
+
labels = {
|
|
157
|
+
"dstack_project_name": job.project.name,
|
|
158
|
+
"dstack_user_name": job.run.user.name,
|
|
159
|
+
"dstack_run_name": job.run_name,
|
|
160
|
+
"dstack_run_id": str(job.run_id),
|
|
161
|
+
"dstack_job_name": job.job_name,
|
|
162
|
+
"dstack_job_id": str(job.id),
|
|
163
|
+
"dstack_job_num": str(job.job_num),
|
|
164
|
+
"dstack_replica_num": str(job.replica_num),
|
|
165
|
+
"dstack_run_type": run_spec.configuration.type,
|
|
166
|
+
"dstack_backend": jpd.get_base_backend().value,
|
|
167
|
+
"dstack_gpu": gpus[0].name if gpus else "",
|
|
168
|
+
}
|
|
155
169
|
duration = (now - job.submitted_at.replace(tzinfo=timezone.utc)).total_seconds()
|
|
156
|
-
metrics
|
|
157
|
-
metrics
|
|
158
|
-
metrics
|
|
170
|
+
metrics.add_sample(_JOB_DURATION, labels, duration)
|
|
171
|
+
metrics.add_sample(_JOB_PRICE, labels, price)
|
|
172
|
+
metrics.add_sample(_JOB_GPU_COUNT, labels, len(gpus))
|
|
173
|
+
metrics.add_sample(_JOB_CPU_COUNT, labels, cpus)
|
|
174
|
+
metrics.add_sample(_JOB_MEMORY_TOTAL, labels, resources.memory_mib * 1024 * 1024)
|
|
175
|
+
jmp = job_metrics_points.get(job.id)
|
|
176
|
+
if jmp is not None:
|
|
177
|
+
metrics.add_sample(_JOB_CPU_TIME, labels, jmp.cpu_usage_micro / 1_000_000)
|
|
178
|
+
metrics.add_sample(_JOB_MEMORY_USAGE, labels, jmp.memory_usage_bytes)
|
|
179
|
+
metrics.add_sample(_JOB_MEMORY_WORKING_SET, labels, jmp.memory_working_set_bytes)
|
|
180
|
+
jpm = job_prometheus_metrics.get(job.id)
|
|
181
|
+
if jpm is not None:
|
|
182
|
+
for metric in text_string_to_metric_families(jpm.text):
|
|
183
|
+
metrics.add_metric(metric, labels)
|
|
159
184
|
return metrics.values()
|
|
160
185
|
|
|
161
186
|
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
select(JobPrometheusMetrics)
|
|
165
|
-
.join(JobModel)
|
|
166
|
-
.join(ProjectModel)
|
|
167
|
-
.where(JobModel.status.in_([JobStatus.RUNNING]))
|
|
168
|
-
.order_by(ProjectModel.name, JobModel.job_name)
|
|
169
|
-
.options(
|
|
170
|
-
joinedload(JobPrometheusMetrics.job).joinedload(JobModel.project),
|
|
171
|
-
joinedload(JobPrometheusMetrics.job)
|
|
172
|
-
.joinedload(JobModel.run)
|
|
173
|
-
.joinedload(RunModel.user),
|
|
174
|
-
)
|
|
175
|
-
)
|
|
176
|
-
metrics_models = res.scalars().all()
|
|
177
|
-
return _parse_and_enrich_job_gpu_metrics(metrics_models)
|
|
187
|
+
_COUNTER = "counter"
|
|
188
|
+
_GAUGE = "gauge"
|
|
178
189
|
|
|
190
|
+
_INSTANCE_DURATION = "dstack_instance_duration_seconds_total"
|
|
191
|
+
_INSTANCE_PRICE = "dstack_instance_price_dollars_per_hour"
|
|
192
|
+
_INSTANCE_GPU_COUNT = "dstack_instance_gpu_count"
|
|
193
|
+
_RUN_COUNT_TOTAL = "dstack_run_count_total"
|
|
194
|
+
_RUN_COUNT_TERMINATED = "dstack_run_count_terminated_total"
|
|
195
|
+
_RUN_COUNT_FAILED = "dstack_run_count_failed_total"
|
|
196
|
+
_RUN_COUNT_DONE = "dstack_run_count_done_total"
|
|
197
|
+
_JOB_DURATION = "dstack_job_duration_seconds_total"
|
|
198
|
+
_JOB_PRICE = "dstack_job_price_dollars_per_hour"
|
|
199
|
+
_JOB_GPU_COUNT = "dstack_job_gpu_count"
|
|
200
|
+
_JOB_CPU_COUNT = "dstack_job_cpu_count"
|
|
201
|
+
_JOB_CPU_TIME = "dstack_job_cpu_time_seconds_total"
|
|
202
|
+
_JOB_MEMORY_TOTAL = "dstack_job_memory_total_bytes"
|
|
203
|
+
_JOB_MEMORY_USAGE = "dstack_job_memory_usage_bytes"
|
|
204
|
+
_JOB_MEMORY_WORKING_SET = "dstack_job_memory_working_set_bytes"
|
|
179
205
|
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
206
|
+
|
|
207
|
+
class _Metrics(dict[str, Metric]):
|
|
208
|
+
metrics: ClassVar[list[tuple[str, str, str]]]
|
|
209
|
+
|
|
210
|
+
def __init__(self):
|
|
211
|
+
super().__init__()
|
|
212
|
+
for name, typ, documentation in self.metrics:
|
|
213
|
+
self[name] = Metric(name=name, documentation=documentation, typ=typ)
|
|
214
|
+
|
|
215
|
+
def add_sample(self, name: str, labels: dict[str, str], value: float) -> None:
|
|
216
|
+
# NOTE: Keeps reference to labels.
|
|
217
|
+
self[name].add_sample(name=name, labels=labels, value=value)
|
|
218
|
+
|
|
219
|
+
def add_metric(self, metric: Metric, labels: dict[str, str]) -> None:
|
|
220
|
+
# NOTE: Modifies and keeps reference to metric.
|
|
221
|
+
name = metric.name
|
|
222
|
+
samples = metric.samples
|
|
223
|
+
stored_metric = self.get(name)
|
|
224
|
+
if stored_metric is None:
|
|
225
|
+
stored_metric = metric
|
|
226
|
+
stored_metric.samples = []
|
|
227
|
+
self[name] = stored_metric
|
|
228
|
+
for sample in samples:
|
|
229
|
+
sample.labels.update(labels)
|
|
230
|
+
# text_string_to_metric_families "fixes" counter names appending _total,
|
|
231
|
+
# we rebuild Sample to revert this
|
|
232
|
+
stored_metric.samples.append(Sample(name, *sample[1:]))
|
|
233
|
+
|
|
234
|
+
|
|
235
|
+
class _InstanceMetrics(_Metrics):
|
|
236
|
+
metrics = [
|
|
237
|
+
(_INSTANCE_DURATION, _COUNTER, "Total seconds the instance is running"),
|
|
238
|
+
(_INSTANCE_PRICE, _GAUGE, "Instance price, USD/hour"),
|
|
239
|
+
(_INSTANCE_GPU_COUNT, _GAUGE, "Instance GPU count"),
|
|
240
|
+
]
|
|
241
|
+
|
|
242
|
+
|
|
243
|
+
class _RunMetrics(_Metrics):
|
|
244
|
+
metrics = [
|
|
245
|
+
(_RUN_COUNT_TOTAL, _COUNTER, "Total runs count"),
|
|
246
|
+
(_RUN_COUNT_TERMINATED, _COUNTER, "Terminated runs count"),
|
|
247
|
+
(_RUN_COUNT_FAILED, _COUNTER, "Failed runs count"),
|
|
248
|
+
(_RUN_COUNT_DONE, _COUNTER, "Done runs count"),
|
|
249
|
+
]
|
|
250
|
+
|
|
251
|
+
|
|
252
|
+
class _JobMetrics(_Metrics):
|
|
253
|
+
metrics = [
|
|
254
|
+
(_JOB_DURATION, _COUNTER, "Total seconds the job is running"),
|
|
255
|
+
(_JOB_PRICE, _GAUGE, "Job instance price, USD/hour"),
|
|
256
|
+
(_JOB_GPU_COUNT, _GAUGE, "Job GPU count"),
|
|
257
|
+
(_JOB_CPU_COUNT, _GAUGE, "Job CPU count"),
|
|
258
|
+
(_JOB_CPU_TIME, _COUNTER, "Total CPU time consumed by the job, seconds"),
|
|
259
|
+
(_JOB_MEMORY_TOTAL, _GAUGE, "Total memory allocated for the job, bytes"),
|
|
260
|
+
(_JOB_MEMORY_USAGE, _GAUGE, "Memory used by the job (including cache), bytes"),
|
|
261
|
+
(_JOB_MEMORY_WORKING_SET, _GAUGE, "Memory used by the job (not including cache), bytes"),
|
|
262
|
+
]
|
|
263
|
+
|
|
264
|
+
|
|
265
|
+
async def _get_job_metrics_points(
|
|
266
|
+
session: AsyncSession, job_ids: Iterable[UUID]
|
|
267
|
+
) -> dict[UUID, JobMetricsPoint]:
|
|
268
|
+
subquery = select(
|
|
269
|
+
JobMetricsPoint,
|
|
270
|
+
func.row_number()
|
|
271
|
+
.over(
|
|
272
|
+
partition_by=JobMetricsPoint.job_id,
|
|
273
|
+
order_by=JobMetricsPoint.timestamp_micro.desc(),
|
|
187
274
|
)
|
|
188
|
-
.
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
.
|
|
193
|
-
.
|
|
275
|
+
.label("row_number"),
|
|
276
|
+
).subquery()
|
|
277
|
+
res = await session.execute(
|
|
278
|
+
select(aliased(JobMetricsPoint, subquery)).where(
|
|
279
|
+
subquery.c.row_number == 1,
|
|
280
|
+
subquery.c.job_id.in_(job_ids),
|
|
194
281
|
)
|
|
195
282
|
)
|
|
196
|
-
|
|
197
|
-
return "\n".join(_render_metrics(_parse_and_enrich_job_gpu_metrics(metrics_models))) + "\n"
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
def _parse_and_enrich_job_gpu_metrics(
|
|
201
|
-
metrics_models: Iterable[JobPrometheusMetrics],
|
|
202
|
-
) -> Iterable[Metric]:
|
|
203
|
-
metrics: dict[str, Metric] = {}
|
|
204
|
-
for metrics_model in metrics_models:
|
|
205
|
-
for metric in text_string_to_metric_families(metrics_model.text):
|
|
206
|
-
samples = metric.samples
|
|
207
|
-
metric.samples = []
|
|
208
|
-
name = metric.name
|
|
209
|
-
metric = metrics.setdefault(name, metric)
|
|
210
|
-
for sample in samples:
|
|
211
|
-
labels = sample.labels
|
|
212
|
-
labels.update(_get_job_labels(metrics_model.job))
|
|
213
|
-
# text_string_to_metric_families "fixes" counter names appending _total,
|
|
214
|
-
# we rebuild Sample to revert this
|
|
215
|
-
metric.samples.append(Sample(name, labels, *sample[2:]))
|
|
216
|
-
return metrics.values()
|
|
283
|
+
return {p.job_id: p for p in res.scalars().all()}
|
|
217
284
|
|
|
218
285
|
|
|
219
|
-
def
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
"dstack_job_id": str(job.id),
|
|
227
|
-
"dstack_job_num": str(job.job_num),
|
|
228
|
-
"dstack_replica_num": str(job.replica_num),
|
|
229
|
-
}
|
|
286
|
+
async def _get_job_prometheus_metrics(
|
|
287
|
+
session: AsyncSession, job_ids: Iterable[UUID]
|
|
288
|
+
) -> dict[UUID, JobPrometheusMetrics]:
|
|
289
|
+
res = await session.execute(
|
|
290
|
+
select(JobPrometheusMetrics).where(JobPrometheusMetrics.job_id.in_(job_ids))
|
|
291
|
+
)
|
|
292
|
+
return {p.job_id: p for p in res.scalars().all()}
|
|
230
293
|
|
|
231
294
|
|
|
232
295
|
def _render_metrics(metrics: Iterable[Metric]) -> Generator[str, None, None]:
|
dstack/version.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
dstack/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
|
-
dstack/version.py,sha256=
|
|
2
|
+
dstack/version.py,sha256=ja4J6HzBpBX3wxm5CKLUUgzAwmmr8naAhq3SBch6VIw,64
|
|
3
3
|
dstack/_internal/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
4
4
|
dstack/_internal/compat.py,sha256=bF9U9fTMfL8UVhCouedoUSTYFl7UAOiU0WXrnRoByxw,40
|
|
5
5
|
dstack/_internal/settings.py,sha256=8XODoSW2joaEndvZxuHUPSFK85sGgJ7fVL976isYeJM,557
|
|
@@ -242,7 +242,7 @@ dstack/_internal/server/background/tasks/__init__.py,sha256=47DEQpj8HBSa-_TImW-5
|
|
|
242
242
|
dstack/_internal/server/background/tasks/common.py,sha256=N7xSXbf2MoBWgbJ1e3AEzYBTf1Gn-pDXYND8Zr_YCJQ,970
|
|
243
243
|
dstack/_internal/server/background/tasks/process_fleets.py,sha256=lKXUvN_b7DNjD3psHzyCt_JYsTxPFuQ86iXi8fj8GkM,3202
|
|
244
244
|
dstack/_internal/server/background/tasks/process_gateways.py,sha256=hoUI1CSqbHt_uMwnzTRAEDl-LBw0wUk_W4xobIbdvRc,7017
|
|
245
|
-
dstack/_internal/server/background/tasks/process_instances.py,sha256=
|
|
245
|
+
dstack/_internal/server/background/tasks/process_instances.py,sha256=Kc7CbWK4mFOsKwOqp-Pt0ewTsB5OZ5gkPyv9T6TNbpM,37674
|
|
246
246
|
dstack/_internal/server/background/tasks/process_metrics.py,sha256=acySfsacpYbTPV9Yivs-oU37z1S2sUdWhRHdJkfBcCA,5332
|
|
247
247
|
dstack/_internal/server/background/tasks/process_placement_groups.py,sha256=FqGfbzvfILdnPUfxjFPAM1ij2xd2mCDi8qufiBcUMI8,4107
|
|
248
248
|
dstack/_internal/server/background/tasks/process_prometheus_metrics.py,sha256=u8hCXjOOek7VLEsmLy2VnDXFmIwTNjrJwcpWG7a1zW0,5093
|
|
@@ -325,7 +325,7 @@ dstack/_internal/server/routers/instances.py,sha256=XOogTC9My2Zv0ck37_PbHKoZI-j4
|
|
|
325
325
|
dstack/_internal/server/routers/logs.py,sha256=_Euk283LbhlwHibJTKM-7YcpbeQFtWBqMfbOry3PSkU,1159
|
|
326
326
|
dstack/_internal/server/routers/metrics.py,sha256=VFgWhkOvxVFDLlRM_kXHYFylLcfCD6UjXInvcd7H4dY,2314
|
|
327
327
|
dstack/_internal/server/routers/projects.py,sha256=0R-w_6WXUbNo6fREAexFUQ3RoOJF2D_Iz35elKjym14,2717
|
|
328
|
-
dstack/_internal/server/routers/prometheus.py,sha256=
|
|
328
|
+
dstack/_internal/server/routers/prometheus.py,sha256=OuC17kgKkb2ErxDD5QZ_ZdZft5A8dMIAFlIzQ_04NEo,744
|
|
329
329
|
dstack/_internal/server/routers/repos.py,sha256=P_zLoEQderxhCeHQJwRkrIhVcc0-cpabfyde22bWVRk,3362
|
|
330
330
|
dstack/_internal/server/routers/runs.py,sha256=oPqyIRPwkMjj12M1IdMF2UitatqvljISAXnJAjfEJyQ,5352
|
|
331
331
|
dstack/_internal/server/routers/secrets.py,sha256=50_qJCTYRpnGSlLyS93gqoV17wWewOVmM65PcG1bT_Y,856
|
|
@@ -360,7 +360,7 @@ dstack/_internal/server/services/offers.py,sha256=tTld2ZcYdbhzShtMIf1YfTyIADtpN3
|
|
|
360
360
|
dstack/_internal/server/services/permissions.py,sha256=l7Ngdelmn65vjw13NcOdaC6lBYMRuSw6FbHzYwdK3nE,1005
|
|
361
361
|
dstack/_internal/server/services/placement.py,sha256=DWZ8-iAE3o0J0xaHikuJYZzpuBiq7lj41LiAP1PfoEs,1773
|
|
362
362
|
dstack/_internal/server/services/projects.py,sha256=Y4LEkSvOVUHHP-F2qlrwBR7rFu0CFFhbHmDTKrrNuXE,15071
|
|
363
|
-
dstack/_internal/server/services/prometheus.py,sha256=
|
|
363
|
+
dstack/_internal/server/services/prometheus.py,sha256=xq5G-Q2BJup9lS2F6__0wUVTs-k1Gr3dYclGzo2WoWo,12474
|
|
364
364
|
dstack/_internal/server/services/repos.py,sha256=f9ztN7jz_2gvD9hXF5sJwWDVyG2-NHRfjIdSukowPh8,9342
|
|
365
365
|
dstack/_internal/server/services/runs.py,sha256=B2jZtTOxavUHr6WqKMXqgLzB3xWsHTkWKykcvcT2lXI,37245
|
|
366
366
|
dstack/_internal/server/services/storage.py,sha256=6I0xI_3_RpJNbKZwHjDnjrEwXGdHfiaeb5li15T-M1I,1884
|
|
@@ -385,7 +385,7 @@ dstack/_internal/server/services/jobs/configurators/service.py,sha256=FOWrLE-6YF
|
|
|
385
385
|
dstack/_internal/server/services/jobs/configurators/task.py,sha256=0-B3oO-61Eq4-mmlLmqJPliFKHhvvIV0tqc12slcQuA,1436
|
|
386
386
|
dstack/_internal/server/services/jobs/configurators/extensions/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
387
387
|
dstack/_internal/server/services/jobs/configurators/extensions/base.py,sha256=xJbHxaaSJ1zjn8zuuApP1Xt2uBaedPhhc-IY0NtDDJQ,418
|
|
388
|
-
dstack/_internal/server/services/jobs/configurators/extensions/cursor.py,sha256=
|
|
388
|
+
dstack/_internal/server/services/jobs/configurators/extensions/cursor.py,sha256=00HB1kC_eMlioEW0nZI7Ly78b-RSehySVNhC9pajBP8,1636
|
|
389
389
|
dstack/_internal/server/services/jobs/configurators/extensions/vscode.py,sha256=DAj8OEVLyL1x8Jko2EXKhnAkcSnlO1sJk6o6eiiVkDI,1611
|
|
390
390
|
dstack/_internal/server/services/logs/__init__.py,sha256=NAjO1KeYvuDznN2EkfAaJt9S6Y00fo_dl3ob3WmsdGQ,3088
|
|
391
391
|
dstack/_internal/server/services/logs/aws.py,sha256=949k8t9H9v_-aedDjDWkw8yPVyhZemmsszcDDEL5Tb4,13711
|
|
@@ -639,7 +639,7 @@ tests/_internal/server/background/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeR
|
|
|
639
639
|
tests/_internal/server/background/tasks/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
640
640
|
tests/_internal/server/background/tasks/test_process_fleets.py,sha256=Dl31_TwxoCzYqkVNPWGLsYxmGL2sZfEK3rQXLFyPIz8,2701
|
|
641
641
|
tests/_internal/server/background/tasks/test_process_gateways.py,sha256=lOP4jPXDtadAgYp0aFND_fp5R_X19M58CaOlgnDAEck,5085
|
|
642
|
-
tests/_internal/server/background/tasks/test_process_instances.py,sha256=
|
|
642
|
+
tests/_internal/server/background/tasks/test_process_instances.py,sha256=WC32HvynBuSxwFtAyMTHS4eVzqCnyGufcrIUTEVoozI,27944
|
|
643
643
|
tests/_internal/server/background/tasks/test_process_metrics.py,sha256=z-u4HXJE5EMVH9kwU_POHmvp55ldAvuLpEMkaebBtsg,4976
|
|
644
644
|
tests/_internal/server/background/tasks/test_process_placement_groups.py,sha256=19LYbIMZIIeKAN0b9KOMyS-cHUx0FoOojqQuM8Oeiq4,1620
|
|
645
645
|
tests/_internal/server/background/tasks/test_process_prometheus_metrics.py,sha256=I9DgIJXVGS7UvbFgm4HFnzWiCICBpy72NjDPKU_7WII,7178
|
|
@@ -656,7 +656,7 @@ tests/_internal/server/routers/test_instances.py,sha256=78HFMU9Xel8BNZL3TqnuvrKE
|
|
|
656
656
|
tests/_internal/server/routers/test_logs.py,sha256=NZwyJlgjMOGq4XEx7-VDjTpniYPhZpsbZvB0dTawaog,3989
|
|
657
657
|
tests/_internal/server/routers/test_metrics.py,sha256=xMdDFZW73Zl06QfggjatfwTut37s0soeliJivkCgBks,7620
|
|
658
658
|
tests/_internal/server/routers/test_projects.py,sha256=Z3Ok7onAjUYS4ADvKvN-SwSxYKvlvf4MG5Y8baqQU14,25964
|
|
659
|
-
tests/_internal/server/routers/test_prometheus.py,sha256=
|
|
659
|
+
tests/_internal/server/routers/test_prometheus.py,sha256=LqJwWn5ztSLIGnvZgj-sD7BFW-JuePFt6k__ymF5Btw,22711
|
|
660
660
|
tests/_internal/server/routers/test_repos.py,sha256=G4dKuFGd_UrxAHwh_XLl1xCHK_DCsiJcXBsHODw3yJk,16682
|
|
661
661
|
tests/_internal/server/routers/test_runs.py,sha256=q02oBrUcp4JoJOL68jbxlfFxH9B8JO9Bkb7v_Qg-Aug,62984
|
|
662
662
|
tests/_internal/server/routers/test_server.py,sha256=ROkuRNNJEkMQuK8guZ3Qy3iRRfiWvPIJJJDc09BI0D4,489
|
|
@@ -701,9 +701,9 @@ tests/_internal/utils/test_path.py,sha256=rzS-1YCxsFUocBe42dghLOMFNymPruGrA7bqFZ
|
|
|
701
701
|
tests/_internal/utils/test_ssh.py,sha256=V-cBFPhD--9eM9d1uQQgpj2gnYLA3c43f4cX9uJ6E-U,1743
|
|
702
702
|
tests/api/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
703
703
|
tests/api/test_utils.py,sha256=SSSqHcNE5cZVqDq4n2sKZthRoXaZ_Bx7z1AAN5xTM9s,391
|
|
704
|
-
dstack-0.19.
|
|
705
|
-
dstack-0.19.
|
|
706
|
-
dstack-0.19.
|
|
707
|
-
dstack-0.19.
|
|
708
|
-
dstack-0.19.
|
|
709
|
-
dstack-0.19.
|
|
704
|
+
dstack-0.19.1.dist-info/LICENSE.md,sha256=qDABaRGjSKVOib1U8viw2P_96sIK7Puo426784oD9f8,15976
|
|
705
|
+
dstack-0.19.1.dist-info/METADATA,sha256=sj_wcanWBaGU9ecMn6I32zvXuYFniyN_6K6ehbcO3tA,18231
|
|
706
|
+
dstack-0.19.1.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
|
|
707
|
+
dstack-0.19.1.dist-info/entry_points.txt,sha256=GnLrMS8hx3rWAySQjA7tPNhtixV6a-brRkmal1PKoHc,58
|
|
708
|
+
dstack-0.19.1.dist-info/top_level.txt,sha256=3BrIO1zrqxT9P20ymhRM6k15meZXzbPL6ykBlDZG2_k,13
|
|
709
|
+
dstack-0.19.1.dist-info/RECORD,,
|
|
@@ -8,7 +8,7 @@ import pytest
|
|
|
8
8
|
from freezegun import freeze_time
|
|
9
9
|
from sqlalchemy.ext.asyncio import AsyncSession
|
|
10
10
|
|
|
11
|
-
from dstack._internal.core.errors import BackendError
|
|
11
|
+
from dstack._internal.core.errors import BackendError, ProvisioningError
|
|
12
12
|
from dstack._internal.core.models.backends.base import BackendType
|
|
13
13
|
from dstack._internal.core.models.instances import (
|
|
14
14
|
Gpu,
|
|
@@ -35,6 +35,8 @@ from dstack._internal.server.testing.common import (
|
|
|
35
35
|
create_repo,
|
|
36
36
|
create_run,
|
|
37
37
|
create_user,
|
|
38
|
+
get_instance_offer_with_availability,
|
|
39
|
+
get_job_provisioning_data,
|
|
38
40
|
get_remote_connection_info,
|
|
39
41
|
)
|
|
40
42
|
from dstack._internal.utils.common import get_current_datetime
|
|
@@ -557,6 +559,68 @@ class TestCreateInstance:
|
|
|
557
559
|
assert instance.total_blocks == expected_blocks
|
|
558
560
|
assert instance.busy_blocks == 0
|
|
559
561
|
|
|
562
|
+
@pytest.mark.parametrize("err", [RuntimeError("Unexpected"), ProvisioningError("Expected")])
|
|
563
|
+
async def test_tries_second_offer_if_first_fails(self, session: AsyncSession, err: Exception):
|
|
564
|
+
project = await create_project(session=session)
|
|
565
|
+
instance = await create_instance(
|
|
566
|
+
session=session, project=project, status=InstanceStatus.PENDING
|
|
567
|
+
)
|
|
568
|
+
aws_mock = Mock()
|
|
569
|
+
aws_mock.TYPE = BackendType.AWS
|
|
570
|
+
offer = get_instance_offer_with_availability(backend=BackendType.AWS, price=1.0)
|
|
571
|
+
aws_mock.compute.return_value = Mock(spec=ComputeMockSpec)
|
|
572
|
+
aws_mock.compute.return_value.get_offers_cached.return_value = [offer]
|
|
573
|
+
aws_mock.compute.return_value.create_instance.side_effect = err
|
|
574
|
+
gcp_mock = Mock()
|
|
575
|
+
gcp_mock.TYPE = BackendType.GCP
|
|
576
|
+
offer = get_instance_offer_with_availability(backend=BackendType.GCP, price=2.0)
|
|
577
|
+
gcp_mock.compute.return_value = Mock(spec=ComputeMockSpec)
|
|
578
|
+
gcp_mock.compute.return_value.get_offers_cached.return_value = [offer]
|
|
579
|
+
gcp_mock.compute.return_value.create_instance.return_value = get_job_provisioning_data(
|
|
580
|
+
backend=offer.backend, region=offer.region, price=offer.price
|
|
581
|
+
)
|
|
582
|
+
with patch("dstack._internal.server.services.backends.get_project_backends") as m:
|
|
583
|
+
m.return_value = [aws_mock, gcp_mock]
|
|
584
|
+
await process_instances()
|
|
585
|
+
|
|
586
|
+
await session.refresh(instance)
|
|
587
|
+
assert instance.status == InstanceStatus.PROVISIONING
|
|
588
|
+
aws_mock.compute.return_value.create_instance.assert_called_once()
|
|
589
|
+
assert instance.backend == BackendType.GCP
|
|
590
|
+
|
|
591
|
+
@pytest.mark.parametrize("err", [RuntimeError("Unexpected"), ProvisioningError("Expected")])
|
|
592
|
+
async def test_fails_if_all_offers_fail(self, session: AsyncSession, err: Exception):
|
|
593
|
+
project = await create_project(session=session)
|
|
594
|
+
instance = await create_instance(
|
|
595
|
+
session=session, project=project, status=InstanceStatus.PENDING
|
|
596
|
+
)
|
|
597
|
+
aws_mock = Mock()
|
|
598
|
+
aws_mock.TYPE = BackendType.AWS
|
|
599
|
+
offer = get_instance_offer_with_availability(backend=BackendType.AWS, price=1.0)
|
|
600
|
+
aws_mock.compute.return_value = Mock(spec=ComputeMockSpec)
|
|
601
|
+
aws_mock.compute.return_value.get_offers_cached.return_value = [offer]
|
|
602
|
+
aws_mock.compute.return_value.create_instance.side_effect = err
|
|
603
|
+
with patch("dstack._internal.server.services.backends.get_project_backends") as m:
|
|
604
|
+
m.return_value = [aws_mock]
|
|
605
|
+
await process_instances()
|
|
606
|
+
|
|
607
|
+
await session.refresh(instance)
|
|
608
|
+
assert instance.status == InstanceStatus.TERMINATED
|
|
609
|
+
assert instance.termination_reason == "All offers failed"
|
|
610
|
+
|
|
611
|
+
async def test_fails_if_no_offers(self, session: AsyncSession):
|
|
612
|
+
project = await create_project(session=session)
|
|
613
|
+
instance = await create_instance(
|
|
614
|
+
session=session, project=project, status=InstanceStatus.PENDING
|
|
615
|
+
)
|
|
616
|
+
with patch("dstack._internal.server.services.backends.get_project_backends") as m:
|
|
617
|
+
m.return_value = []
|
|
618
|
+
await process_instances()
|
|
619
|
+
|
|
620
|
+
await session.refresh(instance)
|
|
621
|
+
assert instance.status == InstanceStatus.TERMINATED
|
|
622
|
+
assert instance.termination_reason == "No offers found"
|
|
623
|
+
|
|
560
624
|
|
|
561
625
|
@pytest.mark.asyncio
|
|
562
626
|
@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True)
|
|
@@ -9,14 +9,20 @@ from sqlalchemy.ext.asyncio import AsyncSession
|
|
|
9
9
|
|
|
10
10
|
from dstack._internal.core.models.backends.base import BackendType
|
|
11
11
|
from dstack._internal.core.models.configurations import DevEnvironmentConfiguration
|
|
12
|
-
from dstack._internal.core.models.runs import
|
|
12
|
+
from dstack._internal.core.models.runs import (
|
|
13
|
+
JobProvisioningData,
|
|
14
|
+
JobRuntimeData,
|
|
15
|
+
JobStatus,
|
|
16
|
+
RunStatus,
|
|
17
|
+
)
|
|
13
18
|
from dstack._internal.core.models.users import GlobalRole, ProjectRole
|
|
14
|
-
from dstack._internal.server.models import JobModel, ProjectModel, UserModel
|
|
19
|
+
from dstack._internal.server.models import JobModel, ProjectModel, RunModel, UserModel
|
|
15
20
|
from dstack._internal.server.services.projects import add_project_member
|
|
16
21
|
from dstack._internal.server.testing.common import (
|
|
17
22
|
create_fleet,
|
|
18
23
|
create_instance,
|
|
19
24
|
create_job,
|
|
25
|
+
create_job_metrics_point,
|
|
20
26
|
create_job_prometheus_metrics,
|
|
21
27
|
create_project,
|
|
22
28
|
create_repo,
|
|
@@ -45,11 +51,21 @@ class TestGetPrometheusMetrics:
|
|
|
45
51
|
async def test_returns_metrics(self, session: AsyncSession, client: AsyncClient):
|
|
46
52
|
user = await create_user(session=session, name="test-user", global_role=GlobalRole.USER)
|
|
47
53
|
offer = get_instance_offer_with_availability(
|
|
48
|
-
instance_type="test-type",
|
|
54
|
+
instance_type="test-type",
|
|
55
|
+
cpu_count=32,
|
|
56
|
+
memory_gib=128,
|
|
57
|
+
gpu_count=2,
|
|
58
|
+
gpu_name="V4",
|
|
59
|
+
price=12,
|
|
49
60
|
)
|
|
50
61
|
project_2 = await _create_project(session, "project-2", user)
|
|
51
62
|
jpd_2_1 = get_job_provisioning_data(
|
|
52
|
-
backend=BackendType.AWS,
|
|
63
|
+
backend=BackendType.AWS,
|
|
64
|
+
cpu_count=16,
|
|
65
|
+
memory_gib=64,
|
|
66
|
+
gpu_name="T4",
|
|
67
|
+
gpu_count=2,
|
|
68
|
+
price=16,
|
|
53
69
|
)
|
|
54
70
|
job_2_1 = await _create_job(
|
|
55
71
|
session=session,
|
|
@@ -100,7 +116,41 @@ class TestGetPrometheusMetrics:
|
|
|
100
116
|
FIELD_2{gpu="1"} 987169 1395066363010
|
|
101
117
|
"""),
|
|
102
118
|
)
|
|
103
|
-
|
|
119
|
+
await create_job_metrics_point(
|
|
120
|
+
session=session,
|
|
121
|
+
job_model=job_1_1,
|
|
122
|
+
timestamp=FAKE_NOW - timedelta(seconds=30),
|
|
123
|
+
cpu_usage_micro=3_500_000,
|
|
124
|
+
memory_working_set_bytes=3_221_225_472,
|
|
125
|
+
memory_usage_bytes=4_294_967_296,
|
|
126
|
+
)
|
|
127
|
+
# Older, ignored
|
|
128
|
+
await create_job_metrics_point(
|
|
129
|
+
session=session,
|
|
130
|
+
job_model=job_1_1,
|
|
131
|
+
timestamp=FAKE_NOW - timedelta(seconds=60),
|
|
132
|
+
cpu_usage_micro=2_000_000,
|
|
133
|
+
memory_working_set_bytes=1_073_741_824,
|
|
134
|
+
memory_usage_bytes=2_147_483_648,
|
|
135
|
+
)
|
|
136
|
+
jpd_1_2 = get_job_provisioning_data(
|
|
137
|
+
backend=BackendType.AWS,
|
|
138
|
+
cpu_count=24,
|
|
139
|
+
memory_gib=224,
|
|
140
|
+
gpu_count=3,
|
|
141
|
+
gpu_name="L4",
|
|
142
|
+
price=12.5,
|
|
143
|
+
)
|
|
144
|
+
job_1_2 = await _create_job(
|
|
145
|
+
session=session,
|
|
146
|
+
run_name="run-2",
|
|
147
|
+
project=project_1,
|
|
148
|
+
user=user,
|
|
149
|
+
status=JobStatus.RUNNING,
|
|
150
|
+
job_provisioning_data=jpd_1_2,
|
|
151
|
+
submitted_at=FAKE_NOW - timedelta(seconds=150),
|
|
152
|
+
)
|
|
153
|
+
|
|
104
154
|
await create_job_prometheus_metrics(
|
|
105
155
|
session=session,
|
|
106
156
|
job=job_1_2,
|
|
@@ -124,6 +174,15 @@ class TestGetPrometheusMetrics:
|
|
|
124
174
|
FIELD_1{gpu="1"} 20
|
|
125
175
|
"""),
|
|
126
176
|
)
|
|
177
|
+
await _create_run(session, "done", project_1, user, RunStatus.DONE)
|
|
178
|
+
other_user = await create_user(
|
|
179
|
+
session=session, name="other-user", global_role=GlobalRole.USER
|
|
180
|
+
)
|
|
181
|
+
await add_project_member(
|
|
182
|
+
session=session, project=project_2, user=other_user, project_role=ProjectRole.USER
|
|
183
|
+
)
|
|
184
|
+
await _create_run(session, "failed-1", project_2, other_user, RunStatus.FAILED)
|
|
185
|
+
await _create_run(session, "failed-2", project_2, other_user, RunStatus.FAILED)
|
|
127
186
|
fleet = await create_fleet(session=session, project=project_1, name="test-fleet")
|
|
128
187
|
instance = await create_instance(
|
|
129
188
|
session=session,
|
|
@@ -149,31 +208,73 @@ class TestGetPrometheusMetrics:
|
|
|
149
208
|
# HELP dstack_instance_gpu_count Instance GPU count
|
|
150
209
|
# TYPE dstack_instance_gpu_count gauge
|
|
151
210
|
dstack_instance_gpu_count{{dstack_project_name="project-1",dstack_fleet_name="test-fleet",dstack_fleet_id="{fleet.id}",dstack_instance_name="test-instance",dstack_instance_id="{instance.id}",dstack_instance_type="test-type",dstack_backend="aws",dstack_gpu="V4"}} 2.0
|
|
211
|
+
# HELP dstack_run_count_total Total runs count
|
|
212
|
+
# TYPE dstack_run_count_total counter
|
|
213
|
+
dstack_run_count_total{{dstack_project_name="project-1",dstack_user_name="test-user"}} 4.0
|
|
214
|
+
dstack_run_count_total{{dstack_project_name="project-2",dstack_user_name="other-user"}} 2.0
|
|
215
|
+
dstack_run_count_total{{dstack_project_name="project-2",dstack_user_name="test-user"}} 1.0
|
|
216
|
+
# HELP dstack_run_count_terminated_total Terminated runs count
|
|
217
|
+
# TYPE dstack_run_count_terminated_total counter
|
|
218
|
+
dstack_run_count_terminated_total{{dstack_project_name="project-1",dstack_user_name="test-user"}} 0.0
|
|
219
|
+
dstack_run_count_terminated_total{{dstack_project_name="project-2",dstack_user_name="other-user"}} 0.0
|
|
220
|
+
dstack_run_count_terminated_total{{dstack_project_name="project-2",dstack_user_name="test-user"}} 0.0
|
|
221
|
+
# HELP dstack_run_count_failed_total Failed runs count
|
|
222
|
+
# TYPE dstack_run_count_failed_total counter
|
|
223
|
+
dstack_run_count_failed_total{{dstack_project_name="project-1",dstack_user_name="test-user"}} 0.0
|
|
224
|
+
dstack_run_count_failed_total{{dstack_project_name="project-2",dstack_user_name="other-user"}} 2.0
|
|
225
|
+
dstack_run_count_failed_total{{dstack_project_name="project-2",dstack_user_name="test-user"}} 0.0
|
|
226
|
+
# HELP dstack_run_count_done_total Done runs count
|
|
227
|
+
# TYPE dstack_run_count_done_total counter
|
|
228
|
+
dstack_run_count_done_total{{dstack_project_name="project-1",dstack_user_name="test-user"}} 1.0
|
|
229
|
+
dstack_run_count_done_total{{dstack_project_name="project-2",dstack_user_name="other-user"}} 0.0
|
|
230
|
+
dstack_run_count_done_total{{dstack_project_name="project-2",dstack_user_name="test-user"}} 0.0
|
|
152
231
|
# HELP dstack_job_duration_seconds_total Total seconds the job is running
|
|
153
232
|
# TYPE dstack_job_duration_seconds_total counter
|
|
154
233
|
dstack_job_duration_seconds_total{{dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="V4"}} 120.0
|
|
234
|
+
dstack_job_duration_seconds_total{{dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-2",dstack_run_id="{job_1_2.run_id}",dstack_job_name="run-2-0-0",dstack_job_id="{job_1_2.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="L4"}} 150.0
|
|
155
235
|
dstack_job_duration_seconds_total{{dstack_project_name="project-2",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_2_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_2_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="T4"}} 100.0
|
|
156
236
|
# HELP dstack_job_price_dollars_per_hour Job instance price, USD/hour
|
|
157
237
|
# TYPE dstack_job_price_dollars_per_hour gauge
|
|
158
238
|
dstack_job_price_dollars_per_hour{{dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="V4"}} 12.0
|
|
239
|
+
dstack_job_price_dollars_per_hour{{dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-2",dstack_run_id="{job_1_2.run_id}",dstack_job_name="run-2-0-0",dstack_job_id="{job_1_2.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="L4"}} 12.5
|
|
159
240
|
dstack_job_price_dollars_per_hour{{dstack_project_name="project-2",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_2_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_2_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="T4"}} 16.0
|
|
160
241
|
# HELP dstack_job_gpu_count Job GPU count
|
|
161
242
|
# TYPE dstack_job_gpu_count gauge
|
|
162
243
|
dstack_job_gpu_count{{dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="V4"}} 2.0
|
|
244
|
+
dstack_job_gpu_count{{dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-2",dstack_run_id="{job_1_2.run_id}",dstack_job_name="run-2-0-0",dstack_job_id="{job_1_2.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="L4"}} 3.0
|
|
163
245
|
dstack_job_gpu_count{{dstack_project_name="project-2",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_2_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_2_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="T4"}} 2.0
|
|
246
|
+
# HELP dstack_job_cpu_count Job CPU count
|
|
247
|
+
# TYPE dstack_job_cpu_count gauge
|
|
248
|
+
dstack_job_cpu_count{{dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="V4"}} 32.0
|
|
249
|
+
dstack_job_cpu_count{{dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-2",dstack_run_id="{job_1_2.run_id}",dstack_job_name="run-2-0-0",dstack_job_id="{job_1_2.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="L4"}} 24.0
|
|
250
|
+
dstack_job_cpu_count{{dstack_project_name="project-2",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_2_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_2_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="T4"}} 16.0
|
|
251
|
+
# HELP dstack_job_cpu_time_seconds_total Total CPU time consumed by the job, seconds
|
|
252
|
+
# TYPE dstack_job_cpu_time_seconds_total counter
|
|
253
|
+
dstack_job_cpu_time_seconds_total{{dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="V4"}} 3.5
|
|
254
|
+
# HELP dstack_job_memory_total_bytes Total memory allocated for the job, bytes
|
|
255
|
+
# TYPE dstack_job_memory_total_bytes gauge
|
|
256
|
+
dstack_job_memory_total_bytes{{dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="V4"}} 137438953472.0
|
|
257
|
+
dstack_job_memory_total_bytes{{dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-2",dstack_run_id="{job_1_2.run_id}",dstack_job_name="run-2-0-0",dstack_job_id="{job_1_2.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="L4"}} 240518168576.0
|
|
258
|
+
dstack_job_memory_total_bytes{{dstack_project_name="project-2",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_2_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_2_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="T4"}} 68719476736.0
|
|
259
|
+
# HELP dstack_job_memory_usage_bytes Memory used by the job (including cache), bytes
|
|
260
|
+
# TYPE dstack_job_memory_usage_bytes gauge
|
|
261
|
+
dstack_job_memory_usage_bytes{{dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="V4"}} 4294967296.0
|
|
262
|
+
# HELP dstack_job_memory_working_set_bytes Memory used by the job (not including cache), bytes
|
|
263
|
+
# TYPE dstack_job_memory_working_set_bytes gauge
|
|
264
|
+
dstack_job_memory_working_set_bytes{{dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="V4"}} 3221225472.0
|
|
164
265
|
# HELP FIELD_1 Test field 1
|
|
165
266
|
# TYPE FIELD_1 gauge
|
|
166
|
-
FIELD_1{{gpu="0",dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0"}} 350.0
|
|
167
|
-
FIELD_1{{gpu="1",dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0"}} 400.0
|
|
168
|
-
FIELD_1{{gpu="0",dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-2",dstack_run_id="{job_1_2.run_id}",dstack_job_name="run-2-0-0",dstack_job_id="{job_1_2.id}",dstack_job_num="0",dstack_replica_num="0"}} 1200.0
|
|
169
|
-
FIELD_1{{gpu="1",dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-2",dstack_run_id="{job_1_2.run_id}",dstack_job_name="run-2-0-0",dstack_job_id="{job_1_2.id}",dstack_job_num="0",dstack_replica_num="0"}} 1600.0
|
|
170
|
-
FIELD_1{{gpu="2",dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-2",dstack_run_id="{job_1_2.run_id}",dstack_job_name="run-2-0-0",dstack_job_id="{job_1_2.id}",dstack_job_num="0",dstack_replica_num="0"}} 2400.0
|
|
171
|
-
FIELD_1{{gpu="0",dstack_project_name="project-2",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_2_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_2_1.id}",dstack_job_num="0",dstack_replica_num="0"}} 100.0
|
|
172
|
-
FIELD_1{{gpu="1",dstack_project_name="project-2",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_2_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_2_1.id}",dstack_job_num="0",dstack_replica_num="0"}} 200.0
|
|
267
|
+
FIELD_1{{gpu="0",dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="V4"}} 350.0
|
|
268
|
+
FIELD_1{{gpu="1",dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="V4"}} 400.0
|
|
269
|
+
FIELD_1{{gpu="0",dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-2",dstack_run_id="{job_1_2.run_id}",dstack_job_name="run-2-0-0",dstack_job_id="{job_1_2.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="L4"}} 1200.0
|
|
270
|
+
FIELD_1{{gpu="1",dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-2",dstack_run_id="{job_1_2.run_id}",dstack_job_name="run-2-0-0",dstack_job_id="{job_1_2.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="L4"}} 1600.0
|
|
271
|
+
FIELD_1{{gpu="2",dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-2",dstack_run_id="{job_1_2.run_id}",dstack_job_name="run-2-0-0",dstack_job_id="{job_1_2.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="L4"}} 2400.0
|
|
272
|
+
FIELD_1{{gpu="0",dstack_project_name="project-2",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_2_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_2_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="T4"}} 100.0
|
|
273
|
+
FIELD_1{{gpu="1",dstack_project_name="project-2",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_2_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_2_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="T4"}} 200.0
|
|
173
274
|
# HELP FIELD_2 Test field 2
|
|
174
275
|
# TYPE FIELD_2 counter
|
|
175
|
-
FIELD_2{{gpu="0",dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0"}} 337325.0 1395066363000
|
|
176
|
-
FIELD_2{{gpu="1",dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0"}} 987169.0 1395066363010
|
|
276
|
+
FIELD_2{{gpu="0",dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="V4"}} 337325.0 1395066363000
|
|
277
|
+
FIELD_2{{gpu="1",dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="V4"}} 987169.0 1395066363010
|
|
177
278
|
""")
|
|
178
279
|
|
|
179
280
|
async def test_returns_empty_response_if_no_runs(self, client: AsyncClient):
|
|
@@ -189,110 +290,6 @@ class TestGetPrometheusMetrics:
|
|
|
189
290
|
assert response.status_code == 404
|
|
190
291
|
|
|
191
292
|
|
|
192
|
-
@pytest.mark.asyncio
|
|
193
|
-
@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True)
|
|
194
|
-
@pytest.mark.usefixtures("image_config_mock", "test_db", "enable_metrics")
|
|
195
|
-
class TestGetPrometheusProjectMetrics:
|
|
196
|
-
async def test_returns_metrics(self, session: AsyncSession, client: AsyncClient):
|
|
197
|
-
user = await create_user(session=session, name="test-user", global_role=GlobalRole.USER)
|
|
198
|
-
project = await _create_project(session, "project-1", user)
|
|
199
|
-
job_1 = await _create_job(session, "run-1", project, user, JobStatus.RUNNING)
|
|
200
|
-
await create_job_prometheus_metrics(
|
|
201
|
-
session=session,
|
|
202
|
-
job=job_1,
|
|
203
|
-
text=dedent("""
|
|
204
|
-
# Comments should be skipped
|
|
205
|
-
|
|
206
|
-
# HELP FIELD_1 Test field 1
|
|
207
|
-
# TYPE FIELD_1 gauge
|
|
208
|
-
FIELD_1{gpu="0"} 350
|
|
209
|
-
FIELD_1{gpu="1"} 400
|
|
210
|
-
|
|
211
|
-
# HELP FIELD_2 Test field 2
|
|
212
|
-
# TYPE FIELD_2 counter
|
|
213
|
-
FIELD_2{gpu="0"} 337325 1395066363000
|
|
214
|
-
FIELD_2{gpu="1"} 987169 1395066363010
|
|
215
|
-
"""),
|
|
216
|
-
)
|
|
217
|
-
job_2 = await _create_job(session, "run-2", project, user, JobStatus.RUNNING)
|
|
218
|
-
await create_job_prometheus_metrics(
|
|
219
|
-
session=session,
|
|
220
|
-
job=job_2,
|
|
221
|
-
text=dedent("""
|
|
222
|
-
# HELP FIELD_1 Test field 1
|
|
223
|
-
# TYPE FIELD_1 gauge
|
|
224
|
-
FIELD_1{gpu="0"} 1200.0
|
|
225
|
-
FIELD_1{gpu="1"} 1600.0
|
|
226
|
-
FIELD_1{gpu="2"} 2400.0
|
|
227
|
-
"""),
|
|
228
|
-
)
|
|
229
|
-
# Terminated job, should not appear in the response
|
|
230
|
-
job_3 = await _create_job(session, "run-3", project, user, JobStatus.TERMINATED)
|
|
231
|
-
await create_job_prometheus_metrics(
|
|
232
|
-
session=session,
|
|
233
|
-
job=job_3,
|
|
234
|
-
text=dedent("""
|
|
235
|
-
# HELP FIELD_1 Test field 1
|
|
236
|
-
# TYPE FIELD_1 gauge
|
|
237
|
-
FIELD_1{gpu="0"} 10
|
|
238
|
-
FIELD_1{gpu="1"} 20
|
|
239
|
-
"""),
|
|
240
|
-
)
|
|
241
|
-
another_project = await _create_project(session, "project-2", user)
|
|
242
|
-
another_project_job = await _create_job(
|
|
243
|
-
session, "run-4", another_project, user, JobStatus.RUNNING
|
|
244
|
-
)
|
|
245
|
-
await create_job_prometheus_metrics(
|
|
246
|
-
session=session,
|
|
247
|
-
job=another_project_job,
|
|
248
|
-
text=dedent("""
|
|
249
|
-
# HELP FIELD_1 Test field 1
|
|
250
|
-
# TYPE FIELD_1 gauge
|
|
251
|
-
FIELD_1{gpu="0"} 100
|
|
252
|
-
FIELD_1{gpu="1"} 200
|
|
253
|
-
"""),
|
|
254
|
-
)
|
|
255
|
-
|
|
256
|
-
response = await client.get("/metrics/project/project-1")
|
|
257
|
-
|
|
258
|
-
assert response.status_code == 200
|
|
259
|
-
assert response.text == dedent(f"""\
|
|
260
|
-
# HELP FIELD_1 Test field 1
|
|
261
|
-
# TYPE FIELD_1 gauge
|
|
262
|
-
FIELD_1{{gpu="0",dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1.id}",dstack_job_num="0",dstack_replica_num="0"}} 350.0
|
|
263
|
-
FIELD_1{{gpu="1",dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1.id}",dstack_job_num="0",dstack_replica_num="0"}} 400.0
|
|
264
|
-
FIELD_1{{gpu="0",dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-2",dstack_run_id="{job_2.run_id}",dstack_job_name="run-2-0-0",dstack_job_id="{job_2.id}",dstack_job_num="0",dstack_replica_num="0"}} 1200.0
|
|
265
|
-
FIELD_1{{gpu="1",dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-2",dstack_run_id="{job_2.run_id}",dstack_job_name="run-2-0-0",dstack_job_id="{job_2.id}",dstack_job_num="0",dstack_replica_num="0"}} 1600.0
|
|
266
|
-
FIELD_1{{gpu="2",dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-2",dstack_run_id="{job_2.run_id}",dstack_job_name="run-2-0-0",dstack_job_id="{job_2.id}",dstack_job_num="0",dstack_replica_num="0"}} 2400.0
|
|
267
|
-
# HELP FIELD_2 Test field 2
|
|
268
|
-
# TYPE FIELD_2 counter
|
|
269
|
-
FIELD_2{{gpu="0",dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1.id}",dstack_job_num="0",dstack_replica_num="0"}} 337325.0 1395066363000
|
|
270
|
-
FIELD_2{{gpu="1",dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1.id}",dstack_job_num="0",dstack_replica_num="0"}} 987169.0 1395066363010
|
|
271
|
-
""")
|
|
272
|
-
|
|
273
|
-
async def test_returns_empty_response_if_no_runs(
|
|
274
|
-
self, session: AsyncSession, client: AsyncClient
|
|
275
|
-
):
|
|
276
|
-
user = await create_user(session=session, global_role=GlobalRole.USER)
|
|
277
|
-
await create_project(session=session, owner=user, name="test-project")
|
|
278
|
-
response = await client.get("/metrics/project/test-project")
|
|
279
|
-
assert response.status_code == 200
|
|
280
|
-
assert response.text == "\n"
|
|
281
|
-
|
|
282
|
-
async def test_returns_404_if_project_doesnt_exist(self, client: AsyncClient):
|
|
283
|
-
response = await client.get("/metrics/project/nonexistent")
|
|
284
|
-
assert response.status_code == 404
|
|
285
|
-
|
|
286
|
-
async def test_returns_404_if_not_enabled(
|
|
287
|
-
self, monkeypatch: pytest.MonkeyPatch, session: AsyncSession, client: AsyncClient
|
|
288
|
-
):
|
|
289
|
-
monkeypatch.setattr("dstack._internal.server.settings.ENABLE_PROMETHEUS_METRICS", False)
|
|
290
|
-
user = await create_user(session=session, global_role=GlobalRole.USER)
|
|
291
|
-
await create_project(session=session, owner=user, name="test-project")
|
|
292
|
-
response = await client.get("/metrics/project/test-project")
|
|
293
|
-
assert response.status_code == 404
|
|
294
|
-
|
|
295
|
-
|
|
296
293
|
async def _create_project(session: AsyncSession, name: str, user: UserModel) -> ProjectModel:
|
|
297
294
|
project = await create_project(session=session, owner=user, name=name)
|
|
298
295
|
await add_project_member(
|
|
@@ -301,26 +298,46 @@ async def _create_project(session: AsyncSession, name: str, user: UserModel) ->
|
|
|
301
298
|
return project
|
|
302
299
|
|
|
303
300
|
|
|
304
|
-
async def
|
|
301
|
+
async def _create_run(
|
|
305
302
|
session: AsyncSession,
|
|
306
303
|
run_name: str,
|
|
307
304
|
project: ProjectModel,
|
|
308
305
|
user: UserModel,
|
|
309
|
-
status:
|
|
310
|
-
job_provisioning_data: Optional[JobProvisioningData] = None,
|
|
311
|
-
job_runtime_data: Optional[JobRuntimeData] = None,
|
|
306
|
+
status: RunStatus,
|
|
312
307
|
submitted_at: datetime = FAKE_NOW,
|
|
313
|
-
) ->
|
|
308
|
+
) -> RunModel:
|
|
314
309
|
repo = await create_repo(session=session, project_id=project.id, repo_name=f"{run_name}-repo")
|
|
315
310
|
configuration = DevEnvironmentConfiguration(ide="vscode")
|
|
316
311
|
run_spec = get_run_spec(run_name=run_name, repo_id=repo.name, configuration=configuration)
|
|
317
|
-
|
|
312
|
+
return await create_run(
|
|
318
313
|
session=session,
|
|
319
314
|
project=project,
|
|
320
315
|
repo=repo,
|
|
321
316
|
user=user,
|
|
322
317
|
run_name=run_name,
|
|
323
318
|
run_spec=run_spec,
|
|
319
|
+
status=status,
|
|
320
|
+
submitted_at=submitted_at,
|
|
321
|
+
)
|
|
322
|
+
|
|
323
|
+
|
|
324
|
+
async def _create_job(
|
|
325
|
+
session: AsyncSession,
|
|
326
|
+
run_name: str,
|
|
327
|
+
project: ProjectModel,
|
|
328
|
+
user: UserModel,
|
|
329
|
+
status: JobStatus,
|
|
330
|
+
job_provisioning_data: Optional[JobProvisioningData] = None,
|
|
331
|
+
job_runtime_data: Optional[JobRuntimeData] = None,
|
|
332
|
+
submitted_at: datetime = FAKE_NOW,
|
|
333
|
+
) -> JobModel:
|
|
334
|
+
run = await _create_run(
|
|
335
|
+
session=session,
|
|
336
|
+
run_name=run_name,
|
|
337
|
+
project=project,
|
|
338
|
+
user=user,
|
|
339
|
+
status=RunStatus.SUBMITTED,
|
|
340
|
+
submitted_at=submitted_at,
|
|
324
341
|
)
|
|
325
342
|
job = await create_job(
|
|
326
343
|
session=session,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|