dstack 0.19.0rc1__py3-none-any.whl → 0.19.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dstack/_internal/server/background/tasks/process_instances.py +14 -5
- dstack/_internal/server/routers/prometheus.py +0 -12
- dstack/_internal/server/services/jobs/configurators/extensions/cursor.py +1 -1
- dstack/_internal/server/services/prometheus.py +175 -112
- dstack/_internal/server/statics/index.html +1 -1
- dstack/_internal/server/statics/{main-4fd5a4770eff59325ee3.js → main-4a0fe83e84574654e397.js} +13 -9
- dstack/_internal/server/statics/{main-4fd5a4770eff59325ee3.js.map → main-4a0fe83e84574654e397.js.map} +1 -1
- dstack/version.py +1 -1
- {dstack-0.19.0rc1.dist-info → dstack-0.19.1.dist-info}/METADATA +1 -1
- {dstack-0.19.0rc1.dist-info → dstack-0.19.1.dist-info}/RECORD +16 -16
- tests/_internal/server/background/tasks/test_process_instances.py +65 -1
- tests/_internal/server/routers/test_prometheus.py +141 -124
- {dstack-0.19.0rc1.dist-info → dstack-0.19.1.dist-info}/LICENSE.md +0 -0
- {dstack-0.19.0rc1.dist-info → dstack-0.19.1.dist-info}/WHEEL +0 -0
- {dstack-0.19.0rc1.dist-info → dstack-0.19.1.dist-info}/entry_points.txt +0 -0
- {dstack-0.19.0rc1.dist-info → dstack-0.19.1.dist-info}/top_level.txt +0 -0
|
@@ -64,6 +64,7 @@ from dstack._internal.core.models.runs import (
|
|
|
64
64
|
Retry,
|
|
65
65
|
)
|
|
66
66
|
from dstack._internal.core.services.profiles import get_retry
|
|
67
|
+
from dstack._internal.server import settings as server_settings
|
|
67
68
|
from dstack._internal.server.background.tasks.common import get_provisioning_timeout
|
|
68
69
|
from dstack._internal.server.db import get_session_ctx
|
|
69
70
|
from dstack._internal.server.models import (
|
|
@@ -529,7 +530,9 @@ async def _create_instance(session: AsyncSession, instance: InstanceModel) -> No
|
|
|
529
530
|
session=session, fleet_id=instance.fleet_id
|
|
530
531
|
)
|
|
531
532
|
|
|
532
|
-
|
|
533
|
+
# Limit number of offers tried to prevent long-running processing
|
|
534
|
+
# in case all offers fail.
|
|
535
|
+
for backend, instance_offer in offers[: server_settings.MAX_OFFERS_TRIED]:
|
|
533
536
|
if instance_offer.backend not in BACKENDS_WITH_CREATE_INSTANCE_SUPPORT:
|
|
534
537
|
continue
|
|
535
538
|
compute = backend.compute()
|
|
@@ -578,8 +581,13 @@ async def _create_instance(session: AsyncSession, instance: InstanceModel) -> No
|
|
|
578
581
|
extra={"instance_name": instance.name},
|
|
579
582
|
)
|
|
580
583
|
continue
|
|
581
|
-
except
|
|
582
|
-
|
|
584
|
+
except Exception:
|
|
585
|
+
logger.exception(
|
|
586
|
+
"Got exception when launching %s in %s/%s",
|
|
587
|
+
instance_offer.instance.name,
|
|
588
|
+
instance_offer.backend.value,
|
|
589
|
+
instance_offer.region,
|
|
590
|
+
)
|
|
583
591
|
continue
|
|
584
592
|
|
|
585
593
|
instance.status = InstanceStatus.PROVISIONING
|
|
@@ -607,10 +615,11 @@ async def _create_instance(session: AsyncSession, instance: InstanceModel) -> No
|
|
|
607
615
|
|
|
608
616
|
if not should_retry:
|
|
609
617
|
instance.status = InstanceStatus.TERMINATED
|
|
610
|
-
instance.termination_reason = "No offers found"
|
|
618
|
+
instance.termination_reason = "All offers failed" if offers else "No offers found"
|
|
611
619
|
logger.info(
|
|
612
|
-
"
|
|
620
|
+
"Terminated instance %s: %s",
|
|
613
621
|
instance.name,
|
|
622
|
+
instance.termination_reason,
|
|
614
623
|
extra={
|
|
615
624
|
"instance_name": instance.name,
|
|
616
625
|
"instance_status": InstanceStatus.TERMINATED.value,
|
|
@@ -6,8 +6,6 @@ from sqlalchemy.ext.asyncio import AsyncSession
|
|
|
6
6
|
|
|
7
7
|
from dstack._internal.server import settings
|
|
8
8
|
from dstack._internal.server.db import get_session
|
|
9
|
-
from dstack._internal.server.deps import Project
|
|
10
|
-
from dstack._internal.server.models import ProjectModel
|
|
11
9
|
from dstack._internal.server.services import prometheus
|
|
12
10
|
from dstack._internal.server.utils.routers import error_not_found
|
|
13
11
|
|
|
@@ -24,13 +22,3 @@ async def get_prometheus_metrics(
|
|
|
24
22
|
if not settings.ENABLE_PROMETHEUS_METRICS:
|
|
25
23
|
raise error_not_found()
|
|
26
24
|
return await prometheus.get_metrics(session=session)
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
@router.get("/metrics/project/{project_name}", deprecated=True)
|
|
30
|
-
async def get_project_prometheus_metrics(
|
|
31
|
-
session: Annotated[AsyncSession, Depends(get_session)],
|
|
32
|
-
project: Annotated[ProjectModel, Depends(Project())],
|
|
33
|
-
) -> str:
|
|
34
|
-
if not settings.ENABLE_PROMETHEUS_METRICS:
|
|
35
|
-
raise error_not_found()
|
|
36
|
-
return await prometheus.get_project_metrics(session=session, project=project)
|
|
@@ -35,7 +35,7 @@ class CursorDesktop:
|
|
|
35
35
|
|
|
36
36
|
def get_print_readme_commands(self) -> List[str]:
|
|
37
37
|
return [
|
|
38
|
-
"echo To open in
|
|
38
|
+
"echo To open in Cursor, use link below:",
|
|
39
39
|
"echo ''",
|
|
40
40
|
f"echo ' cursor://vscode-remote/ssh-remote+{self.run_name}/workflow'", # TODO use $REPO_DIR
|
|
41
41
|
"echo ''",
|
|
@@ -1,40 +1,38 @@
|
|
|
1
1
|
import itertools
|
|
2
|
+
from collections import defaultdict
|
|
2
3
|
from collections.abc import Generator, Iterable
|
|
3
4
|
from datetime import timezone
|
|
5
|
+
from typing import ClassVar
|
|
6
|
+
from uuid import UUID
|
|
4
7
|
|
|
5
8
|
from prometheus_client import Metric
|
|
6
9
|
from prometheus_client.parser import text_string_to_metric_families
|
|
7
10
|
from prometheus_client.samples import Sample
|
|
8
|
-
from sqlalchemy import select
|
|
11
|
+
from sqlalchemy import func, select
|
|
9
12
|
from sqlalchemy.ext.asyncio import AsyncSession
|
|
10
|
-
from sqlalchemy.orm import joinedload
|
|
13
|
+
from sqlalchemy.orm import aliased, joinedload
|
|
11
14
|
|
|
12
15
|
from dstack._internal.core.models.instances import InstanceStatus
|
|
13
|
-
from dstack._internal.core.models.runs import JobStatus, RunSpec
|
|
16
|
+
from dstack._internal.core.models.runs import JobStatus, RunSpec, RunStatus
|
|
14
17
|
from dstack._internal.server.models import (
|
|
15
18
|
InstanceModel,
|
|
19
|
+
JobMetricsPoint,
|
|
16
20
|
JobModel,
|
|
17
21
|
JobPrometheusMetrics,
|
|
18
22
|
ProjectModel,
|
|
19
23
|
RunModel,
|
|
24
|
+
UserModel,
|
|
20
25
|
)
|
|
21
26
|
from dstack._internal.server.services.instances import get_instance_offer
|
|
22
27
|
from dstack._internal.server.services.jobs import get_job_provisioning_data, get_job_runtime_data
|
|
23
28
|
from dstack._internal.utils.common import get_current_datetime
|
|
24
29
|
|
|
25
|
-
_INSTANCE_DURATION = "dstack_instance_duration_seconds_total"
|
|
26
|
-
_INSTANCE_PRICE = "dstack_instance_price_dollars_per_hour"
|
|
27
|
-
_INSTANCE_GPU_COUNT = "dstack_instance_gpu_count"
|
|
28
|
-
_JOB_DURATION = "dstack_job_duration_seconds_total"
|
|
29
|
-
_JOB_PRICE = "dstack_job_price_dollars_per_hour"
|
|
30
|
-
_JOB_GPU_COUNT = "dstack_job_gpu_count"
|
|
31
|
-
|
|
32
30
|
|
|
33
31
|
async def get_metrics(session: AsyncSession) -> str:
|
|
34
32
|
metrics_iter = itertools.chain(
|
|
35
33
|
await get_instance_metrics(session),
|
|
34
|
+
await get_run_metrics(session),
|
|
36
35
|
await get_job_metrics(session),
|
|
37
|
-
await get_job_gpu_metrics(session),
|
|
38
36
|
)
|
|
39
37
|
return "\n".join(_render_metrics(metrics_iter)) + "\n"
|
|
40
38
|
|
|
@@ -61,19 +59,7 @@ async def get_instance_metrics(session: AsyncSession) -> Iterable[Metric]:
|
|
|
61
59
|
)
|
|
62
60
|
)
|
|
63
61
|
instances = res.unique().scalars().all()
|
|
64
|
-
metrics
|
|
65
|
-
_INSTANCE_DURATION: Metric(
|
|
66
|
-
name=_INSTANCE_DURATION,
|
|
67
|
-
documentation="Total seconds the instance is running",
|
|
68
|
-
typ="counter",
|
|
69
|
-
),
|
|
70
|
-
_INSTANCE_PRICE: Metric(
|
|
71
|
-
name=_INSTANCE_PRICE, documentation="Instance price, USD/hour", typ="gauge"
|
|
72
|
-
),
|
|
73
|
-
_INSTANCE_GPU_COUNT: Metric(
|
|
74
|
-
name=_INSTANCE_GPU_COUNT, documentation="Instance GPU count", typ="gauge"
|
|
75
|
-
),
|
|
76
|
-
}
|
|
62
|
+
metrics = _InstanceMetrics()
|
|
77
63
|
now = get_current_datetime()
|
|
78
64
|
for instance in instances:
|
|
79
65
|
fleet = instance.fleet
|
|
@@ -94,15 +80,36 @@ async def get_instance_metrics(session: AsyncSession) -> Iterable[Metric]:
|
|
|
94
80
|
"dstack_gpu": gpu,
|
|
95
81
|
}
|
|
96
82
|
duration = (now - instance.created_at.replace(tzinfo=timezone.utc)).total_seconds()
|
|
97
|
-
metrics
|
|
98
|
-
|
|
99
|
-
)
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
)
|
|
83
|
+
metrics.add_sample(_INSTANCE_DURATION, labels, duration)
|
|
84
|
+
metrics.add_sample(_INSTANCE_PRICE, labels, instance.price or 0.0)
|
|
85
|
+
metrics.add_sample(_INSTANCE_GPU_COUNT, labels, gpu_count)
|
|
86
|
+
return metrics.values()
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
async def get_run_metrics(session: AsyncSession) -> Iterable[Metric]:
|
|
90
|
+
res = await session.execute(
|
|
91
|
+
select(ProjectModel.name, UserModel.name, RunModel.status, func.count(RunModel.id))
|
|
92
|
+
.join_from(RunModel, ProjectModel)
|
|
93
|
+
.join_from(RunModel, UserModel, RunModel.user_id == UserModel.id)
|
|
94
|
+
.group_by(ProjectModel.name, UserModel.name, RunModel.status)
|
|
95
|
+
.order_by(ProjectModel.name, UserModel.name, RunModel.status)
|
|
96
|
+
)
|
|
97
|
+
projects: dict[str, dict[str, dict[RunStatus, int]]] = defaultdict(
|
|
98
|
+
lambda: defaultdict(lambda: defaultdict(int))
|
|
99
|
+
)
|
|
100
|
+
for project_name, user_name, status, count in res.all():
|
|
101
|
+
projects[project_name][user_name][status] = count
|
|
102
|
+
metrics = _RunMetrics()
|
|
103
|
+
for project_name, users in projects.items():
|
|
104
|
+
for user_name, statuses in users.items():
|
|
105
|
+
labels: dict[str, str] = {
|
|
106
|
+
"dstack_project_name": project_name,
|
|
107
|
+
"dstack_user_name": user_name,
|
|
108
|
+
}
|
|
109
|
+
metrics.add_sample(_RUN_COUNT_TOTAL, labels, sum(statuses.values()))
|
|
110
|
+
metrics.add_sample(_RUN_COUNT_TERMINATED, labels, statuses[RunStatus.TERMINATED])
|
|
111
|
+
metrics.add_sample(_RUN_COUNT_FAILED, labels, statuses[RunStatus.FAILED])
|
|
112
|
+
metrics.add_sample(_RUN_COUNT_DONE, labels, statuses[RunStatus.DONE])
|
|
106
113
|
return metrics.values()
|
|
107
114
|
|
|
108
115
|
|
|
@@ -127,106 +134,162 @@ async def get_job_metrics(session: AsyncSession) -> Iterable[Metric]:
|
|
|
127
134
|
)
|
|
128
135
|
)
|
|
129
136
|
jobs = res.scalars().all()
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
name=_JOB_PRICE, documentation="Job instance price, USD/hour", typ="gauge"
|
|
136
|
-
),
|
|
137
|
-
_JOB_GPU_COUNT: Metric(name=_JOB_GPU_COUNT, documentation="Job GPU count", typ="gauge"),
|
|
138
|
-
}
|
|
137
|
+
job_ids = {job.id for job in jobs}
|
|
138
|
+
job_metrics_points = await _get_job_metrics_points(session, job_ids)
|
|
139
|
+
job_prometheus_metrics = await _get_job_prometheus_metrics(session, job_ids)
|
|
140
|
+
|
|
141
|
+
metrics = _JobMetrics()
|
|
139
142
|
now = get_current_datetime()
|
|
140
143
|
for job in jobs:
|
|
141
144
|
jpd = get_job_provisioning_data(job)
|
|
142
145
|
if jpd is None:
|
|
143
146
|
continue
|
|
144
147
|
jrd = get_job_runtime_data(job)
|
|
145
|
-
|
|
148
|
+
resources = jpd.instance_type.resources
|
|
146
149
|
price = jpd.price
|
|
147
150
|
if jrd is not None and jrd.offer is not None:
|
|
148
|
-
|
|
151
|
+
resources = jrd.offer.instance.resources
|
|
149
152
|
price = jrd.offer.price
|
|
153
|
+
gpus = resources.gpus
|
|
154
|
+
cpus = resources.cpus
|
|
150
155
|
run_spec = RunSpec.__response__.parse_raw(job.run.run_spec)
|
|
151
|
-
labels =
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
156
|
+
labels = {
|
|
157
|
+
"dstack_project_name": job.project.name,
|
|
158
|
+
"dstack_user_name": job.run.user.name,
|
|
159
|
+
"dstack_run_name": job.run_name,
|
|
160
|
+
"dstack_run_id": str(job.run_id),
|
|
161
|
+
"dstack_job_name": job.job_name,
|
|
162
|
+
"dstack_job_id": str(job.id),
|
|
163
|
+
"dstack_job_num": str(job.job_num),
|
|
164
|
+
"dstack_replica_num": str(job.replica_num),
|
|
165
|
+
"dstack_run_type": run_spec.configuration.type,
|
|
166
|
+
"dstack_backend": jpd.get_base_backend().value,
|
|
167
|
+
"dstack_gpu": gpus[0].name if gpus else "",
|
|
168
|
+
}
|
|
155
169
|
duration = (now - job.submitted_at.replace(tzinfo=timezone.utc)).total_seconds()
|
|
156
|
-
metrics
|
|
157
|
-
metrics
|
|
158
|
-
metrics
|
|
170
|
+
metrics.add_sample(_JOB_DURATION, labels, duration)
|
|
171
|
+
metrics.add_sample(_JOB_PRICE, labels, price)
|
|
172
|
+
metrics.add_sample(_JOB_GPU_COUNT, labels, len(gpus))
|
|
173
|
+
metrics.add_sample(_JOB_CPU_COUNT, labels, cpus)
|
|
174
|
+
metrics.add_sample(_JOB_MEMORY_TOTAL, labels, resources.memory_mib * 1024 * 1024)
|
|
175
|
+
jmp = job_metrics_points.get(job.id)
|
|
176
|
+
if jmp is not None:
|
|
177
|
+
metrics.add_sample(_JOB_CPU_TIME, labels, jmp.cpu_usage_micro / 1_000_000)
|
|
178
|
+
metrics.add_sample(_JOB_MEMORY_USAGE, labels, jmp.memory_usage_bytes)
|
|
179
|
+
metrics.add_sample(_JOB_MEMORY_WORKING_SET, labels, jmp.memory_working_set_bytes)
|
|
180
|
+
jpm = job_prometheus_metrics.get(job.id)
|
|
181
|
+
if jpm is not None:
|
|
182
|
+
for metric in text_string_to_metric_families(jpm.text):
|
|
183
|
+
metrics.add_metric(metric, labels)
|
|
159
184
|
return metrics.values()
|
|
160
185
|
|
|
161
186
|
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
select(JobPrometheusMetrics)
|
|
165
|
-
.join(JobModel)
|
|
166
|
-
.join(ProjectModel)
|
|
167
|
-
.where(JobModel.status.in_([JobStatus.RUNNING]))
|
|
168
|
-
.order_by(ProjectModel.name, JobModel.job_name)
|
|
169
|
-
.options(
|
|
170
|
-
joinedload(JobPrometheusMetrics.job).joinedload(JobModel.project),
|
|
171
|
-
joinedload(JobPrometheusMetrics.job)
|
|
172
|
-
.joinedload(JobModel.run)
|
|
173
|
-
.joinedload(RunModel.user),
|
|
174
|
-
)
|
|
175
|
-
)
|
|
176
|
-
metrics_models = res.scalars().all()
|
|
177
|
-
return _parse_and_enrich_job_gpu_metrics(metrics_models)
|
|
187
|
+
_COUNTER = "counter"
|
|
188
|
+
_GAUGE = "gauge"
|
|
178
189
|
|
|
190
|
+
_INSTANCE_DURATION = "dstack_instance_duration_seconds_total"
|
|
191
|
+
_INSTANCE_PRICE = "dstack_instance_price_dollars_per_hour"
|
|
192
|
+
_INSTANCE_GPU_COUNT = "dstack_instance_gpu_count"
|
|
193
|
+
_RUN_COUNT_TOTAL = "dstack_run_count_total"
|
|
194
|
+
_RUN_COUNT_TERMINATED = "dstack_run_count_terminated_total"
|
|
195
|
+
_RUN_COUNT_FAILED = "dstack_run_count_failed_total"
|
|
196
|
+
_RUN_COUNT_DONE = "dstack_run_count_done_total"
|
|
197
|
+
_JOB_DURATION = "dstack_job_duration_seconds_total"
|
|
198
|
+
_JOB_PRICE = "dstack_job_price_dollars_per_hour"
|
|
199
|
+
_JOB_GPU_COUNT = "dstack_job_gpu_count"
|
|
200
|
+
_JOB_CPU_COUNT = "dstack_job_cpu_count"
|
|
201
|
+
_JOB_CPU_TIME = "dstack_job_cpu_time_seconds_total"
|
|
202
|
+
_JOB_MEMORY_TOTAL = "dstack_job_memory_total_bytes"
|
|
203
|
+
_JOB_MEMORY_USAGE = "dstack_job_memory_usage_bytes"
|
|
204
|
+
_JOB_MEMORY_WORKING_SET = "dstack_job_memory_working_set_bytes"
|
|
179
205
|
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
206
|
+
|
|
207
|
+
class _Metrics(dict[str, Metric]):
|
|
208
|
+
metrics: ClassVar[list[tuple[str, str, str]]]
|
|
209
|
+
|
|
210
|
+
def __init__(self):
|
|
211
|
+
super().__init__()
|
|
212
|
+
for name, typ, documentation in self.metrics:
|
|
213
|
+
self[name] = Metric(name=name, documentation=documentation, typ=typ)
|
|
214
|
+
|
|
215
|
+
def add_sample(self, name: str, labels: dict[str, str], value: float) -> None:
|
|
216
|
+
# NOTE: Keeps reference to labels.
|
|
217
|
+
self[name].add_sample(name=name, labels=labels, value=value)
|
|
218
|
+
|
|
219
|
+
def add_metric(self, metric: Metric, labels: dict[str, str]) -> None:
|
|
220
|
+
# NOTE: Modifies and keeps reference to metric.
|
|
221
|
+
name = metric.name
|
|
222
|
+
samples = metric.samples
|
|
223
|
+
stored_metric = self.get(name)
|
|
224
|
+
if stored_metric is None:
|
|
225
|
+
stored_metric = metric
|
|
226
|
+
stored_metric.samples = []
|
|
227
|
+
self[name] = stored_metric
|
|
228
|
+
for sample in samples:
|
|
229
|
+
sample.labels.update(labels)
|
|
230
|
+
# text_string_to_metric_families "fixes" counter names appending _total,
|
|
231
|
+
# we rebuild Sample to revert this
|
|
232
|
+
stored_metric.samples.append(Sample(name, *sample[1:]))
|
|
233
|
+
|
|
234
|
+
|
|
235
|
+
class _InstanceMetrics(_Metrics):
|
|
236
|
+
metrics = [
|
|
237
|
+
(_INSTANCE_DURATION, _COUNTER, "Total seconds the instance is running"),
|
|
238
|
+
(_INSTANCE_PRICE, _GAUGE, "Instance price, USD/hour"),
|
|
239
|
+
(_INSTANCE_GPU_COUNT, _GAUGE, "Instance GPU count"),
|
|
240
|
+
]
|
|
241
|
+
|
|
242
|
+
|
|
243
|
+
class _RunMetrics(_Metrics):
|
|
244
|
+
metrics = [
|
|
245
|
+
(_RUN_COUNT_TOTAL, _COUNTER, "Total runs count"),
|
|
246
|
+
(_RUN_COUNT_TERMINATED, _COUNTER, "Terminated runs count"),
|
|
247
|
+
(_RUN_COUNT_FAILED, _COUNTER, "Failed runs count"),
|
|
248
|
+
(_RUN_COUNT_DONE, _COUNTER, "Done runs count"),
|
|
249
|
+
]
|
|
250
|
+
|
|
251
|
+
|
|
252
|
+
class _JobMetrics(_Metrics):
|
|
253
|
+
metrics = [
|
|
254
|
+
(_JOB_DURATION, _COUNTER, "Total seconds the job is running"),
|
|
255
|
+
(_JOB_PRICE, _GAUGE, "Job instance price, USD/hour"),
|
|
256
|
+
(_JOB_GPU_COUNT, _GAUGE, "Job GPU count"),
|
|
257
|
+
(_JOB_CPU_COUNT, _GAUGE, "Job CPU count"),
|
|
258
|
+
(_JOB_CPU_TIME, _COUNTER, "Total CPU time consumed by the job, seconds"),
|
|
259
|
+
(_JOB_MEMORY_TOTAL, _GAUGE, "Total memory allocated for the job, bytes"),
|
|
260
|
+
(_JOB_MEMORY_USAGE, _GAUGE, "Memory used by the job (including cache), bytes"),
|
|
261
|
+
(_JOB_MEMORY_WORKING_SET, _GAUGE, "Memory used by the job (not including cache), bytes"),
|
|
262
|
+
]
|
|
263
|
+
|
|
264
|
+
|
|
265
|
+
async def _get_job_metrics_points(
|
|
266
|
+
session: AsyncSession, job_ids: Iterable[UUID]
|
|
267
|
+
) -> dict[UUID, JobMetricsPoint]:
|
|
268
|
+
subquery = select(
|
|
269
|
+
JobMetricsPoint,
|
|
270
|
+
func.row_number()
|
|
271
|
+
.over(
|
|
272
|
+
partition_by=JobMetricsPoint.job_id,
|
|
273
|
+
order_by=JobMetricsPoint.timestamp_micro.desc(),
|
|
187
274
|
)
|
|
188
|
-
.
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
.
|
|
193
|
-
.
|
|
275
|
+
.label("row_number"),
|
|
276
|
+
).subquery()
|
|
277
|
+
res = await session.execute(
|
|
278
|
+
select(aliased(JobMetricsPoint, subquery)).where(
|
|
279
|
+
subquery.c.row_number == 1,
|
|
280
|
+
subquery.c.job_id.in_(job_ids),
|
|
194
281
|
)
|
|
195
282
|
)
|
|
196
|
-
|
|
197
|
-
return "\n".join(_render_metrics(_parse_and_enrich_job_gpu_metrics(metrics_models))) + "\n"
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
def _parse_and_enrich_job_gpu_metrics(
|
|
201
|
-
metrics_models: Iterable[JobPrometheusMetrics],
|
|
202
|
-
) -> Iterable[Metric]:
|
|
203
|
-
metrics: dict[str, Metric] = {}
|
|
204
|
-
for metrics_model in metrics_models:
|
|
205
|
-
for metric in text_string_to_metric_families(metrics_model.text):
|
|
206
|
-
samples = metric.samples
|
|
207
|
-
metric.samples = []
|
|
208
|
-
name = metric.name
|
|
209
|
-
metric = metrics.setdefault(name, metric)
|
|
210
|
-
for sample in samples:
|
|
211
|
-
labels = sample.labels
|
|
212
|
-
labels.update(_get_job_labels(metrics_model.job))
|
|
213
|
-
# text_string_to_metric_families "fixes" counter names appending _total,
|
|
214
|
-
# we rebuild Sample to revert this
|
|
215
|
-
metric.samples.append(Sample(name, labels, *sample[2:]))
|
|
216
|
-
return metrics.values()
|
|
283
|
+
return {p.job_id: p for p in res.scalars().all()}
|
|
217
284
|
|
|
218
285
|
|
|
219
|
-
def
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
"dstack_job_id": str(job.id),
|
|
227
|
-
"dstack_job_num": str(job.job_num),
|
|
228
|
-
"dstack_replica_num": str(job.replica_num),
|
|
229
|
-
}
|
|
286
|
+
async def _get_job_prometheus_metrics(
|
|
287
|
+
session: AsyncSession, job_ids: Iterable[UUID]
|
|
288
|
+
) -> dict[UUID, JobPrometheusMetrics]:
|
|
289
|
+
res = await session.execute(
|
|
290
|
+
select(JobPrometheusMetrics).where(JobPrometheusMetrics.job_id.in_(job_ids))
|
|
291
|
+
)
|
|
292
|
+
return {p.job_id: p for p in res.scalars().all()}
|
|
230
293
|
|
|
231
294
|
|
|
232
295
|
def _render_metrics(metrics: Iterable[Metric]) -> Generator[str, None, None]:
|
|
@@ -1,3 +1,3 @@
|
|
|
1
1
|
<!doctype html><html lang="en"><head><meta charset="utf-8"/><meta name="viewport" content="width=device-width,initial-scale=1"/><meta name="theme-color" content="#000000"/><title>dstack</title><meta name="description" content="Get GPUs at the best prices and availability from a wide range of providers. No cloud account of your own is required.
|
|
2
2
|
"/><link rel="preconnect" href="https://fonts.googleapis.com"><link rel="preconnect" href="https://fonts.gstatic.com" crossorigin><link href="https://fonts.googleapis.com/css2?family=Roboto:ital,wght@0,100;0,300;0,400;0,500;0,700;0,900;1,100;1,300;1,400;1,500;1,700;1,900&display=swap" rel="stylesheet"><meta name="og:title" content="dstack"><meta name="og:type" content="article"><meta name="og:image" content="/splash_thumbnail.png"><meta name="og:description" content="Get GPUs at the best prices and availability from a wide range of providers. No cloud account of your own is required.
|
|
3
|
-
"><link rel="icon" type="image/x-icon" href="/assets/favicon.ico"><link rel="icon" type="image/png" sizes="16x16" href="/assets/favicon-16x16.png"><link rel="icon" type="image/png" sizes="32x32" href="/assets/favicon-32x32.png"><link rel="icon" type="image/png" sizes="48x48" href="/assets/favicon-48x48.png"><link rel="manifest" href="/assets/manifest.webmanifest"><meta name="mobile-web-app-capable" content="yes"><meta name="theme-color" content="#fff"><meta name="application-name" content="dstackai"><link rel="apple-touch-icon" sizes="57x57" href="/assets/apple-touch-icon-57x57.png"><link rel="apple-touch-icon" sizes="60x60" href="/assets/apple-touch-icon-60x60.png"><link rel="apple-touch-icon" sizes="72x72" href="/assets/apple-touch-icon-72x72.png"><link rel="apple-touch-icon" sizes="76x76" href="/assets/apple-touch-icon-76x76.png"><link rel="apple-touch-icon" sizes="114x114" href="/assets/apple-touch-icon-114x114.png"><link rel="apple-touch-icon" sizes="120x120" href="/assets/apple-touch-icon-120x120.png"><link rel="apple-touch-icon" sizes="144x144" href="/assets/apple-touch-icon-144x144.png"><link rel="apple-touch-icon" sizes="152x152" href="/assets/apple-touch-icon-152x152.png"><link rel="apple-touch-icon" sizes="167x167" href="/assets/apple-touch-icon-167x167.png"><link rel="apple-touch-icon" sizes="180x180" href="/assets/apple-touch-icon-180x180.png"><link rel="apple-touch-icon" sizes="1024x1024" href="/assets/apple-touch-icon-1024x1024.png"><meta name="apple-mobile-web-app-capable" content="yes"><meta name="apple-mobile-web-app-status-bar-style" content="black-translucent"><meta name="apple-mobile-web-app-title" content="dstackai"><link rel="apple-touch-startup-image" media="(device-width: 320px) and (device-height: 568px) and (-webkit-device-pixel-ratio: 2) and (orientation: portrait)" href="/assets/apple-touch-startup-image-640x1136.png"><link rel="apple-touch-startup-image" media="(device-width: 320px) and (device-height: 568px) and (-webkit-device-pixel-ratio: 2) and (orientation: landscape)" href="/assets/apple-touch-startup-image-1136x640.png"><link rel="apple-touch-startup-image" media="(device-width: 375px) and (device-height: 667px) and (-webkit-device-pixel-ratio: 2) and (orientation: portrait)" href="/assets/apple-touch-startup-image-750x1334.png"><link rel="apple-touch-startup-image" media="(device-width: 375px) and (device-height: 667px) and (-webkit-device-pixel-ratio: 2) and (orientation: landscape)" href="/assets/apple-touch-startup-image-1334x750.png"><link rel="apple-touch-startup-image" media="(device-width: 375px) and (device-height: 812px) and (-webkit-device-pixel-ratio: 3) and (orientation: portrait)" href="/assets/apple-touch-startup-image-1125x2436.png"><link rel="apple-touch-startup-image" media="(device-width: 375px) and (device-height: 812px) and (-webkit-device-pixel-ratio: 3) and (orientation: landscape)" href="/assets/apple-touch-startup-image-2436x1125.png"><link rel="apple-touch-startup-image" media="(device-width: 390px) and (device-height: 844px) and (-webkit-device-pixel-ratio: 3) and (orientation: portrait)" href="/assets/apple-touch-startup-image-1170x2532.png"><link rel="apple-touch-startup-image" media="(device-width: 390px) and (device-height: 844px) and (-webkit-device-pixel-ratio: 3) and (orientation: landscape)" href="/assets/apple-touch-startup-image-2532x1170.png"><link rel="apple-touch-startup-image" media="(device-width: 393px) and (device-height: 852px) and (-webkit-device-pixel-ratio: 3) and (orientation: portrait)" href="/assets/apple-touch-startup-image-1179x2556.png"><link rel="apple-touch-startup-image" media="(device-width: 393px) and (device-height: 852px) and (-webkit-device-pixel-ratio: 3) and (orientation: landscape)" href="/assets/apple-touch-startup-image-2556x1179.png"><link rel="apple-touch-startup-image" media="(device-width: 414px) and (device-height: 896px) and (-webkit-device-pixel-ratio: 2) and (orientation: portrait)" href="/assets/apple-touch-startup-image-828x1792.png"><link rel="apple-touch-startup-image" media="(device-width: 414px) and (device-height: 896px) and (-webkit-device-pixel-ratio: 2) and (orientation: landscape)" href="/assets/apple-touch-startup-image-1792x828.png"><link rel="apple-touch-startup-image" media="(device-width: 414px) and (device-height: 896px) and (-webkit-device-pixel-ratio: 3) and (orientation: portrait)" href="/assets/apple-touch-startup-image-1242x2688.png"><link rel="apple-touch-startup-image" media="(device-width: 414px) and (device-height: 896px) and (-webkit-device-pixel-ratio: 3) and (orientation: landscape)" href="/assets/apple-touch-startup-image-2688x1242.png"><link rel="apple-touch-startup-image" media="(device-width: 414px) and (device-height: 736px) and (-webkit-device-pixel-ratio: 3) and (orientation: portrait)" href="/assets/apple-touch-startup-image-1242x2208.png"><link rel="apple-touch-startup-image" media="(device-width: 414px) and (device-height: 736px) and (-webkit-device-pixel-ratio: 3) and (orientation: landscape)" href="/assets/apple-touch-startup-image-2208x1242.png"><link rel="apple-touch-startup-image" media="(device-width: 428px) and (device-height: 926px) and (-webkit-device-pixel-ratio: 3) and (orientation: portrait)" href="/assets/apple-touch-startup-image-1284x2778.png"><link rel="apple-touch-startup-image" media="(device-width: 428px) and (device-height: 926px) and (-webkit-device-pixel-ratio: 3) and (orientation: landscape)" href="/assets/apple-touch-startup-image-2778x1284.png"><link rel="apple-touch-startup-image" media="(device-width: 430px) and (device-height: 932px) and (-webkit-device-pixel-ratio: 3) and (orientation: portrait)" href="/assets/apple-touch-startup-image-1290x2796.png"><link rel="apple-touch-startup-image" media="(device-width: 430px) and (device-height: 932px) and (-webkit-device-pixel-ratio: 3) and (orientation: landscape)" href="/assets/apple-touch-startup-image-2796x1290.png"><link rel="apple-touch-startup-image" media="(device-width: 744px) and (device-height: 1133px) and (-webkit-device-pixel-ratio: 2) and (orientation: portrait)" href="/assets/apple-touch-startup-image-1488x2266.png"><link rel="apple-touch-startup-image" media="(device-width: 744px) and (device-height: 1133px) and (-webkit-device-pixel-ratio: 2) and (orientation: landscape)" href="/assets/apple-touch-startup-image-2266x1488.png"><link rel="apple-touch-startup-image" media="(device-width: 768px) and (device-height: 1024px) and (-webkit-device-pixel-ratio: 2) and (orientation: portrait)" href="/assets/apple-touch-startup-image-1536x2048.png"><link rel="apple-touch-startup-image" media="(device-width: 768px) and (device-height: 1024px) and (-webkit-device-pixel-ratio: 2) and (orientation: landscape)" href="/assets/apple-touch-startup-image-2048x1536.png"><link rel="apple-touch-startup-image" media="(device-width: 810px) and (device-height: 1080px) and (-webkit-device-pixel-ratio: 2) and (orientation: portrait)" href="/assets/apple-touch-startup-image-1620x2160.png"><link rel="apple-touch-startup-image" media="(device-width: 810px) and (device-height: 1080px) and (-webkit-device-pixel-ratio: 2) and (orientation: landscape)" href="/assets/apple-touch-startup-image-2160x1620.png"><link rel="apple-touch-startup-image" media="(device-width: 820px) and (device-height: 1080px) and (-webkit-device-pixel-ratio: 2) and (orientation: portrait)" href="/assets/apple-touch-startup-image-1640x2160.png"><link rel="apple-touch-startup-image" media="(device-width: 820px) and (device-height: 1080px) and (-webkit-device-pixel-ratio: 2) and (orientation: landscape)" href="/assets/apple-touch-startup-image-2160x1640.png"><link rel="apple-touch-startup-image" media="(device-width: 834px) and (device-height: 1194px) and (-webkit-device-pixel-ratio: 2) and (orientation: portrait)" href="/assets/apple-touch-startup-image-1668x2388.png"><link rel="apple-touch-startup-image" media="(device-width: 834px) and (device-height: 1194px) and (-webkit-device-pixel-ratio: 2) and (orientation: landscape)" href="/assets/apple-touch-startup-image-2388x1668.png"><link rel="apple-touch-startup-image" media="(device-width: 834px) and (device-height: 1112px) and (-webkit-device-pixel-ratio: 2) and (orientation: portrait)" href="/assets/apple-touch-startup-image-1668x2224.png"><link rel="apple-touch-startup-image" media="(device-width: 834px) and (device-height: 1112px) and (-webkit-device-pixel-ratio: 2) and (orientation: landscape)" href="/assets/apple-touch-startup-image-2224x1668.png"><link rel="apple-touch-startup-image" media="(device-width: 1024px) and (device-height: 1366px) and (-webkit-device-pixel-ratio: 2) and (orientation: portrait)" href="/assets/apple-touch-startup-image-2048x2732.png"><link rel="apple-touch-startup-image" media="(device-width: 1024px) and (device-height: 1366px) and (-webkit-device-pixel-ratio: 2) and (orientation: landscape)" href="/assets/apple-touch-startup-image-2732x2048.png"><meta name="msapplication-TileColor" content="#fff"><meta name="msapplication-TileImage" content="/assets/mstile-144x144.png"><meta name="msapplication-config" content="/assets/browserconfig.xml"><link rel="yandex-tableau-widget" href="/assets/yandex-browser-manifest.json"><script defer="defer" src="/main-
|
|
3
|
+
"><link rel="icon" type="image/x-icon" href="/assets/favicon.ico"><link rel="icon" type="image/png" sizes="16x16" href="/assets/favicon-16x16.png"><link rel="icon" type="image/png" sizes="32x32" href="/assets/favicon-32x32.png"><link rel="icon" type="image/png" sizes="48x48" href="/assets/favicon-48x48.png"><link rel="manifest" href="/assets/manifest.webmanifest"><meta name="mobile-web-app-capable" content="yes"><meta name="theme-color" content="#fff"><meta name="application-name" content="dstackai"><link rel="apple-touch-icon" sizes="57x57" href="/assets/apple-touch-icon-57x57.png"><link rel="apple-touch-icon" sizes="60x60" href="/assets/apple-touch-icon-60x60.png"><link rel="apple-touch-icon" sizes="72x72" href="/assets/apple-touch-icon-72x72.png"><link rel="apple-touch-icon" sizes="76x76" href="/assets/apple-touch-icon-76x76.png"><link rel="apple-touch-icon" sizes="114x114" href="/assets/apple-touch-icon-114x114.png"><link rel="apple-touch-icon" sizes="120x120" href="/assets/apple-touch-icon-120x120.png"><link rel="apple-touch-icon" sizes="144x144" href="/assets/apple-touch-icon-144x144.png"><link rel="apple-touch-icon" sizes="152x152" href="/assets/apple-touch-icon-152x152.png"><link rel="apple-touch-icon" sizes="167x167" href="/assets/apple-touch-icon-167x167.png"><link rel="apple-touch-icon" sizes="180x180" href="/assets/apple-touch-icon-180x180.png"><link rel="apple-touch-icon" sizes="1024x1024" href="/assets/apple-touch-icon-1024x1024.png"><meta name="apple-mobile-web-app-capable" content="yes"><meta name="apple-mobile-web-app-status-bar-style" content="black-translucent"><meta name="apple-mobile-web-app-title" content="dstackai"><link rel="apple-touch-startup-image" media="(device-width: 320px) and (device-height: 568px) and (-webkit-device-pixel-ratio: 2) and (orientation: portrait)" href="/assets/apple-touch-startup-image-640x1136.png"><link rel="apple-touch-startup-image" media="(device-width: 320px) and (device-height: 568px) and (-webkit-device-pixel-ratio: 2) and (orientation: landscape)" href="/assets/apple-touch-startup-image-1136x640.png"><link rel="apple-touch-startup-image" media="(device-width: 375px) and (device-height: 667px) and (-webkit-device-pixel-ratio: 2) and (orientation: portrait)" href="/assets/apple-touch-startup-image-750x1334.png"><link rel="apple-touch-startup-image" media="(device-width: 375px) and (device-height: 667px) and (-webkit-device-pixel-ratio: 2) and (orientation: landscape)" href="/assets/apple-touch-startup-image-1334x750.png"><link rel="apple-touch-startup-image" media="(device-width: 375px) and (device-height: 812px) and (-webkit-device-pixel-ratio: 3) and (orientation: portrait)" href="/assets/apple-touch-startup-image-1125x2436.png"><link rel="apple-touch-startup-image" media="(device-width: 375px) and (device-height: 812px) and (-webkit-device-pixel-ratio: 3) and (orientation: landscape)" href="/assets/apple-touch-startup-image-2436x1125.png"><link rel="apple-touch-startup-image" media="(device-width: 390px) and (device-height: 844px) and (-webkit-device-pixel-ratio: 3) and (orientation: portrait)" href="/assets/apple-touch-startup-image-1170x2532.png"><link rel="apple-touch-startup-image" media="(device-width: 390px) and (device-height: 844px) and (-webkit-device-pixel-ratio: 3) and (orientation: landscape)" href="/assets/apple-touch-startup-image-2532x1170.png"><link rel="apple-touch-startup-image" media="(device-width: 393px) and (device-height: 852px) and (-webkit-device-pixel-ratio: 3) and (orientation: portrait)" href="/assets/apple-touch-startup-image-1179x2556.png"><link rel="apple-touch-startup-image" media="(device-width: 393px) and (device-height: 852px) and (-webkit-device-pixel-ratio: 3) and (orientation: landscape)" href="/assets/apple-touch-startup-image-2556x1179.png"><link rel="apple-touch-startup-image" media="(device-width: 414px) and (device-height: 896px) and (-webkit-device-pixel-ratio: 2) and (orientation: portrait)" href="/assets/apple-touch-startup-image-828x1792.png"><link rel="apple-touch-startup-image" media="(device-width: 414px) and (device-height: 896px) and (-webkit-device-pixel-ratio: 2) and (orientation: landscape)" href="/assets/apple-touch-startup-image-1792x828.png"><link rel="apple-touch-startup-image" media="(device-width: 414px) and (device-height: 896px) and (-webkit-device-pixel-ratio: 3) and (orientation: portrait)" href="/assets/apple-touch-startup-image-1242x2688.png"><link rel="apple-touch-startup-image" media="(device-width: 414px) and (device-height: 896px) and (-webkit-device-pixel-ratio: 3) and (orientation: landscape)" href="/assets/apple-touch-startup-image-2688x1242.png"><link rel="apple-touch-startup-image" media="(device-width: 414px) and (device-height: 736px) and (-webkit-device-pixel-ratio: 3) and (orientation: portrait)" href="/assets/apple-touch-startup-image-1242x2208.png"><link rel="apple-touch-startup-image" media="(device-width: 414px) and (device-height: 736px) and (-webkit-device-pixel-ratio: 3) and (orientation: landscape)" href="/assets/apple-touch-startup-image-2208x1242.png"><link rel="apple-touch-startup-image" media="(device-width: 428px) and (device-height: 926px) and (-webkit-device-pixel-ratio: 3) and (orientation: portrait)" href="/assets/apple-touch-startup-image-1284x2778.png"><link rel="apple-touch-startup-image" media="(device-width: 428px) and (device-height: 926px) and (-webkit-device-pixel-ratio: 3) and (orientation: landscape)" href="/assets/apple-touch-startup-image-2778x1284.png"><link rel="apple-touch-startup-image" media="(device-width: 430px) and (device-height: 932px) and (-webkit-device-pixel-ratio: 3) and (orientation: portrait)" href="/assets/apple-touch-startup-image-1290x2796.png"><link rel="apple-touch-startup-image" media="(device-width: 430px) and (device-height: 932px) and (-webkit-device-pixel-ratio: 3) and (orientation: landscape)" href="/assets/apple-touch-startup-image-2796x1290.png"><link rel="apple-touch-startup-image" media="(device-width: 744px) and (device-height: 1133px) and (-webkit-device-pixel-ratio: 2) and (orientation: portrait)" href="/assets/apple-touch-startup-image-1488x2266.png"><link rel="apple-touch-startup-image" media="(device-width: 744px) and (device-height: 1133px) and (-webkit-device-pixel-ratio: 2) and (orientation: landscape)" href="/assets/apple-touch-startup-image-2266x1488.png"><link rel="apple-touch-startup-image" media="(device-width: 768px) and (device-height: 1024px) and (-webkit-device-pixel-ratio: 2) and (orientation: portrait)" href="/assets/apple-touch-startup-image-1536x2048.png"><link rel="apple-touch-startup-image" media="(device-width: 768px) and (device-height: 1024px) and (-webkit-device-pixel-ratio: 2) and (orientation: landscape)" href="/assets/apple-touch-startup-image-2048x1536.png"><link rel="apple-touch-startup-image" media="(device-width: 810px) and (device-height: 1080px) and (-webkit-device-pixel-ratio: 2) and (orientation: portrait)" href="/assets/apple-touch-startup-image-1620x2160.png"><link rel="apple-touch-startup-image" media="(device-width: 810px) and (device-height: 1080px) and (-webkit-device-pixel-ratio: 2) and (orientation: landscape)" href="/assets/apple-touch-startup-image-2160x1620.png"><link rel="apple-touch-startup-image" media="(device-width: 820px) and (device-height: 1080px) and (-webkit-device-pixel-ratio: 2) and (orientation: portrait)" href="/assets/apple-touch-startup-image-1640x2160.png"><link rel="apple-touch-startup-image" media="(device-width: 820px) and (device-height: 1080px) and (-webkit-device-pixel-ratio: 2) and (orientation: landscape)" href="/assets/apple-touch-startup-image-2160x1640.png"><link rel="apple-touch-startup-image" media="(device-width: 834px) and (device-height: 1194px) and (-webkit-device-pixel-ratio: 2) and (orientation: portrait)" href="/assets/apple-touch-startup-image-1668x2388.png"><link rel="apple-touch-startup-image" media="(device-width: 834px) and (device-height: 1194px) and (-webkit-device-pixel-ratio: 2) and (orientation: landscape)" href="/assets/apple-touch-startup-image-2388x1668.png"><link rel="apple-touch-startup-image" media="(device-width: 834px) and (device-height: 1112px) and (-webkit-device-pixel-ratio: 2) and (orientation: portrait)" href="/assets/apple-touch-startup-image-1668x2224.png"><link rel="apple-touch-startup-image" media="(device-width: 834px) and (device-height: 1112px) and (-webkit-device-pixel-ratio: 2) and (orientation: landscape)" href="/assets/apple-touch-startup-image-2224x1668.png"><link rel="apple-touch-startup-image" media="(device-width: 1024px) and (device-height: 1366px) and (-webkit-device-pixel-ratio: 2) and (orientation: portrait)" href="/assets/apple-touch-startup-image-2048x2732.png"><link rel="apple-touch-startup-image" media="(device-width: 1024px) and (device-height: 1366px) and (-webkit-device-pixel-ratio: 2) and (orientation: landscape)" href="/assets/apple-touch-startup-image-2732x2048.png"><meta name="msapplication-TileColor" content="#fff"><meta name="msapplication-TileImage" content="/assets/mstile-144x144.png"><meta name="msapplication-config" content="/assets/browserconfig.xml"><link rel="yandex-tableau-widget" href="/assets/yandex-browser-manifest.json"><script defer="defer" src="/main-4a0fe83e84574654e397.js"></script><link href="/main-da9f8c06a69c20dac23e.css" rel="stylesheet"></head><body><noscript>You need to enable JavaScript to run this app.</noscript><div class="b-page-header" id="header"></div><div id="root"></div></body></html>
|