dstack 0.19.0__py3-none-any.whl → 0.19.1__py3-none-any.whl

This diff shows the changes between publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.

Potentially problematic release.


This version of dstack might be problematic.

dstack/_internal/server/background/tasks/process_instances.py CHANGED
@@ -64,6 +64,7 @@ from dstack._internal.core.models.runs import (
  Retry,
  )
  from dstack._internal.core.services.profiles import get_retry
+ from dstack._internal.server import settings as server_settings
  from dstack._internal.server.background.tasks.common import get_provisioning_timeout
  from dstack._internal.server.db import get_session_ctx
  from dstack._internal.server.models import (
@@ -529,7 +530,9 @@ async def _create_instance(session: AsyncSession, instance: InstanceModel) -> No
  session=session, fleet_id=instance.fleet_id
  )

- for backend, instance_offer in offers:
+ # Limit number of offers tried to prevent long-running processing
+ # in case all offers fail.
+ for backend, instance_offer in offers[: server_settings.MAX_OFFERS_TRIED]:
  if instance_offer.backend not in BACKENDS_WITH_CREATE_INSTANCE_SUPPORT:
  continue
  compute = backend.compute()
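The new slice caps how many offers a single provisioning pass will attempt. A minimal sketch of the pattern, assuming a hypothetical default for MAX_OFFERS_TRIED (the diff does not show the setting's actual value):

    # Illustrative only: the real value comes from dstack._internal.server.settings.
    MAX_OFFERS_TRIED = 10

    def offers_to_try(offers: list) -> list:
        # Attempt at most MAX_OFFERS_TRIED offers per pass, so a long tail of
        # failing offers cannot keep the background task busy indefinitely.
        return offers[:MAX_OFFERS_TRIED]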
@@ -578,8 +581,13 @@ async def _create_instance(session: AsyncSession, instance: InstanceModel) -> No
  extra={"instance_name": instance.name},
  )
  continue
- except NotImplementedError:
- # skip a backend without create_instance support, continue with next backend and offer
+ except Exception:
+ logger.exception(
+ "Got exception when launching %s in %s/%s",
+ instance_offer.instance.name,
+ instance_offer.backend.value,
+ instance_offer.region,
+ )
  continue

  instance.status = InstanceStatus.PROVISIONING
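The except clause is widened from NotImplementedError to Exception, so a single failing backend no longer aborts the whole pass; the failure is logged with a traceback and the loop moves on to the next offer. A hedged sketch of the pattern (the create_instance arguments are simplified here):

    import logging

    logger = logging.getLogger(__name__)

    def try_offer(backend, instance_offer):
        try:
            # The actual call takes more arguments; simplified for illustration.
            return backend.compute().create_instance(instance_offer)
        except Exception:
            # logger.exception records the full traceback, matching the diff above.
            logger.exception(
                "Got exception when launching %s in %s/%s",
                instance_offer.instance.name,
                instance_offer.backend.value,
                instance_offer.region,
            )
            return None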
@@ -607,10 +615,11 @@ async def _create_instance(session: AsyncSession, instance: InstanceModel) -> No

  if not should_retry:
  instance.status = InstanceStatus.TERMINATED
- instance.termination_reason = "No offers found"
+ instance.termination_reason = "All offers failed" if offers else "No offers found"
  logger.info(
- "No offers found. Terminated instance %s",
+ "Terminated instance %s: %s",
  instance.name,
+ instance.termination_reason,
  extra={
  "instance_name": instance.name,
  "instance_status": InstanceStatus.TERMINATED.value,
dstack/_internal/server/routers/prometheus.py CHANGED
@@ -6,8 +6,6 @@ from sqlalchemy.ext.asyncio import AsyncSession

  from dstack._internal.server import settings
  from dstack._internal.server.db import get_session
- from dstack._internal.server.deps import Project
- from dstack._internal.server.models import ProjectModel
  from dstack._internal.server.services import prometheus
  from dstack._internal.server.utils.routers import error_not_found

@@ -24,13 +22,3 @@ async def get_prometheus_metrics(
  if not settings.ENABLE_PROMETHEUS_METRICS:
  raise error_not_found()
  return await prometheus.get_metrics(session=session)
-
-
- @router.get("/metrics/project/{project_name}", deprecated=True)
- async def get_project_prometheus_metrics(
- session: Annotated[AsyncSession, Depends(get_session)],
- project: Annotated[ProjectModel, Depends(Project())],
- ) -> str:
- if not settings.ENABLE_PROMETHEUS_METRICS:
- raise error_not_found()
- return await prometheus.get_project_metrics(session=session, project=project)
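With the deprecated /metrics/project/{project_name} route removed, per-project views come from the single /metrics endpoint via the dstack_project_name label. A rough client-side sketch (the server URL is made up; a real setup would filter in Prometheus queries instead):

    import requests

    text = requests.get("http://localhost:3000/metrics").text
    project_lines = [
        line
        for line in text.splitlines()
        if 'dstack_project_name="project-1"' in line
    ]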
dstack/_internal/server/services/jobs/configurators/extensions/cursor.py CHANGED
@@ -35,7 +35,7 @@ class CursorDesktop:

  def get_print_readme_commands(self) -> List[str]:
  return [
- "echo To open in VS Code Desktop, use link below:",
+ "echo To open in Cursor, use link below:",
  "echo ''",
  f"echo ' cursor://vscode-remote/ssh-remote+{self.run_name}/workflow'", # TODO use $REPO_DIR
  "echo ''",
dstack/_internal/server/services/prometheus.py CHANGED
@@ -1,40 +1,38 @@
  import itertools
+ from collections import defaultdict
  from collections.abc import Generator, Iterable
  from datetime import timezone
+ from typing import ClassVar
+ from uuid import UUID

  from prometheus_client import Metric
  from prometheus_client.parser import text_string_to_metric_families
  from prometheus_client.samples import Sample
- from sqlalchemy import select
+ from sqlalchemy import func, select
  from sqlalchemy.ext.asyncio import AsyncSession
- from sqlalchemy.orm import joinedload
+ from sqlalchemy.orm import aliased, joinedload

  from dstack._internal.core.models.instances import InstanceStatus
- from dstack._internal.core.models.runs import JobStatus, RunSpec
+ from dstack._internal.core.models.runs import JobStatus, RunSpec, RunStatus
  from dstack._internal.server.models import (
  InstanceModel,
+ JobMetricsPoint,
  JobModel,
  JobPrometheusMetrics,
  ProjectModel,
  RunModel,
+ UserModel,
  )
  from dstack._internal.server.services.instances import get_instance_offer
  from dstack._internal.server.services.jobs import get_job_provisioning_data, get_job_runtime_data
  from dstack._internal.utils.common import get_current_datetime

- _INSTANCE_DURATION = "dstack_instance_duration_seconds_total"
- _INSTANCE_PRICE = "dstack_instance_price_dollars_per_hour"
- _INSTANCE_GPU_COUNT = "dstack_instance_gpu_count"
- _JOB_DURATION = "dstack_job_duration_seconds_total"
- _JOB_PRICE = "dstack_job_price_dollars_per_hour"
- _JOB_GPU_COUNT = "dstack_job_gpu_count"
-

  async def get_metrics(session: AsyncSession) -> str:
  metrics_iter = itertools.chain(
  await get_instance_metrics(session),
+ await get_run_metrics(session),
  await get_job_metrics(session),
- await get_job_gpu_metrics(session),
  )
  return "\n".join(_render_metrics(metrics_iter)) + "\n"

@@ -61,19 +59,7 @@ async def get_instance_metrics(session: AsyncSession) -> Iterable[Metric]:
  )
  )
  instances = res.unique().scalars().all()
- metrics: dict[str, Metric] = {
- _INSTANCE_DURATION: Metric(
- name=_INSTANCE_DURATION,
- documentation="Total seconds the instance is running",
- typ="counter",
- ),
- _INSTANCE_PRICE: Metric(
- name=_INSTANCE_PRICE, documentation="Instance price, USD/hour", typ="gauge"
- ),
- _INSTANCE_GPU_COUNT: Metric(
- name=_INSTANCE_GPU_COUNT, documentation="Instance GPU count", typ="gauge"
- ),
- }
+ metrics = _InstanceMetrics()
  now = get_current_datetime()
  for instance in instances:
  fleet = instance.fleet
@@ -94,15 +80,36 @@ async def get_instance_metrics(session: AsyncSession) -> Iterable[Metric]:
  "dstack_gpu": gpu,
  }
  duration = (now - instance.created_at.replace(tzinfo=timezone.utc)).total_seconds()
- metrics[_INSTANCE_DURATION].add_sample(
- name=_INSTANCE_DURATION, labels=labels, value=duration
- )
- metrics[_INSTANCE_PRICE].add_sample(
- name=_INSTANCE_PRICE, labels=labels, value=instance.price or 0.0
- )
- metrics[_INSTANCE_GPU_COUNT].add_sample(
- name=_INSTANCE_GPU_COUNT, labels=labels, value=gpu_count
- )
+ metrics.add_sample(_INSTANCE_DURATION, labels, duration)
+ metrics.add_sample(_INSTANCE_PRICE, labels, instance.price or 0.0)
+ metrics.add_sample(_INSTANCE_GPU_COUNT, labels, gpu_count)
+ return metrics.values()
+
+
+ async def get_run_metrics(session: AsyncSession) -> Iterable[Metric]:
+ res = await session.execute(
+ select(ProjectModel.name, UserModel.name, RunModel.status, func.count(RunModel.id))
+ .join_from(RunModel, ProjectModel)
+ .join_from(RunModel, UserModel, RunModel.user_id == UserModel.id)
+ .group_by(ProjectModel.name, UserModel.name, RunModel.status)
+ .order_by(ProjectModel.name, UserModel.name, RunModel.status)
+ )
+ projects: dict[str, dict[str, dict[RunStatus, int]]] = defaultdict(
+ lambda: defaultdict(lambda: defaultdict(int))
+ )
+ for project_name, user_name, status, count in res.all():
+ projects[project_name][user_name][status] = count
+ metrics = _RunMetrics()
+ for project_name, users in projects.items():
+ for user_name, statuses in users.items():
+ labels: dict[str, str] = {
+ "dstack_project_name": project_name,
+ "dstack_user_name": user_name,
+ }
+ metrics.add_sample(_RUN_COUNT_TOTAL, labels, sum(statuses.values()))
+ metrics.add_sample(_RUN_COUNT_TERMINATED, labels, statuses[RunStatus.TERMINATED])
+ metrics.add_sample(_RUN_COUNT_FAILED, labels, statuses[RunStatus.FAILED])
+ metrics.add_sample(_RUN_COUNT_DONE, labels, statuses[RunStatus.DONE])
  return metrics.values()

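The new get_run_metrics() groups runs by project, user, and status in SQL, then pivots the counts into the dstack_run_count_* counters. A self-contained sketch of the pivot step with made-up rows:

    from collections import defaultdict

    # Rows mimic the grouped query result: (project, user, status, count).
    rows = [
        ("project-1", "alice", "DONE", 3),
        ("project-1", "alice", "FAILED", 1),
        ("project-2", "bob", "FAILED", 2),
    ]
    projects = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))
    for project, user, status, count in rows:
        projects[project][user][status] = count

    # Missing statuses default to 0, so every per-status counter is still emitted.
    alice = projects["project-1"]["alice"]
    total = sum(alice.values())        # -> dstack_run_count_total = 4
    failed = alice["FAILED"]           # -> dstack_run_count_failed_total = 1
    terminated = alice["TERMINATED"]   # -> dstack_run_count_terminated_total = 0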
 
@@ -127,106 +134,162 @@ async def get_job_metrics(session: AsyncSession) -> Iterable[Metric]:
127
134
  )
128
135
  )
129
136
  jobs = res.scalars().all()
130
- metrics: dict[str, Metric] = {
131
- _JOB_DURATION: Metric(
132
- name=_JOB_DURATION, documentation="Total seconds the job is running", typ="counter"
133
- ),
134
- _JOB_PRICE: Metric(
135
- name=_JOB_PRICE, documentation="Job instance price, USD/hour", typ="gauge"
136
- ),
137
- _JOB_GPU_COUNT: Metric(name=_JOB_GPU_COUNT, documentation="Job GPU count", typ="gauge"),
138
- }
137
+ job_ids = {job.id for job in jobs}
138
+ job_metrics_points = await _get_job_metrics_points(session, job_ids)
139
+ job_prometheus_metrics = await _get_job_prometheus_metrics(session, job_ids)
140
+
141
+ metrics = _JobMetrics()
139
142
  now = get_current_datetime()
140
143
  for job in jobs:
141
144
  jpd = get_job_provisioning_data(job)
142
145
  if jpd is None:
143
146
  continue
144
147
  jrd = get_job_runtime_data(job)
145
- gpus = jpd.instance_type.resources.gpus
148
+ resources = jpd.instance_type.resources
146
149
  price = jpd.price
147
150
  if jrd is not None and jrd.offer is not None:
148
- gpus = jrd.offer.instance.resources.gpus
151
+ resources = jrd.offer.instance.resources
149
152
  price = jrd.offer.price
153
+ gpus = resources.gpus
154
+ cpus = resources.cpus
150
155
  run_spec = RunSpec.__response__.parse_raw(job.run.run_spec)
151
- labels = _get_job_labels(job)
152
- labels["dstack_run_type"] = run_spec.configuration.type
153
- labels["dstack_backend"] = jpd.get_base_backend().value
154
- labels["dstack_gpu"] = gpus[0].name if gpus else ""
156
+ labels = {
157
+ "dstack_project_name": job.project.name,
158
+ "dstack_user_name": job.run.user.name,
159
+ "dstack_run_name": job.run_name,
160
+ "dstack_run_id": str(job.run_id),
161
+ "dstack_job_name": job.job_name,
162
+ "dstack_job_id": str(job.id),
163
+ "dstack_job_num": str(job.job_num),
164
+ "dstack_replica_num": str(job.replica_num),
165
+ "dstack_run_type": run_spec.configuration.type,
166
+ "dstack_backend": jpd.get_base_backend().value,
167
+ "dstack_gpu": gpus[0].name if gpus else "",
168
+ }
155
169
  duration = (now - job.submitted_at.replace(tzinfo=timezone.utc)).total_seconds()
156
- metrics[_JOB_DURATION].add_sample(name=_JOB_DURATION, labels=labels, value=duration)
157
- metrics[_JOB_PRICE].add_sample(name=_JOB_PRICE, labels=labels, value=price)
158
- metrics[_JOB_GPU_COUNT].add_sample(name=_JOB_GPU_COUNT, labels=labels, value=len(gpus))
170
+ metrics.add_sample(_JOB_DURATION, labels, duration)
171
+ metrics.add_sample(_JOB_PRICE, labels, price)
172
+ metrics.add_sample(_JOB_GPU_COUNT, labels, len(gpus))
173
+ metrics.add_sample(_JOB_CPU_COUNT, labels, cpus)
174
+ metrics.add_sample(_JOB_MEMORY_TOTAL, labels, resources.memory_mib * 1024 * 1024)
175
+ jmp = job_metrics_points.get(job.id)
176
+ if jmp is not None:
177
+ metrics.add_sample(_JOB_CPU_TIME, labels, jmp.cpu_usage_micro / 1_000_000)
178
+ metrics.add_sample(_JOB_MEMORY_USAGE, labels, jmp.memory_usage_bytes)
179
+ metrics.add_sample(_JOB_MEMORY_WORKING_SET, labels, jmp.memory_working_set_bytes)
180
+ jpm = job_prometheus_metrics.get(job.id)
181
+ if jpm is not None:
182
+ for metric in text_string_to_metric_families(jpm.text):
183
+ metrics.add_metric(metric, labels)
159
184
  return metrics.values()
160
185
 
161
186
 
162
- async def get_job_gpu_metrics(session: AsyncSession) -> Iterable[Metric]:
163
- res = await session.execute(
164
- select(JobPrometheusMetrics)
165
- .join(JobModel)
166
- .join(ProjectModel)
167
- .where(JobModel.status.in_([JobStatus.RUNNING]))
168
- .order_by(ProjectModel.name, JobModel.job_name)
169
- .options(
170
- joinedload(JobPrometheusMetrics.job).joinedload(JobModel.project),
171
- joinedload(JobPrometheusMetrics.job)
172
- .joinedload(JobModel.run)
173
- .joinedload(RunModel.user),
174
- )
175
- )
176
- metrics_models = res.scalars().all()
177
- return _parse_and_enrich_job_gpu_metrics(metrics_models)
187
+ _COUNTER = "counter"
188
+ _GAUGE = "gauge"
178
189
 
190
+ _INSTANCE_DURATION = "dstack_instance_duration_seconds_total"
191
+ _INSTANCE_PRICE = "dstack_instance_price_dollars_per_hour"
192
+ _INSTANCE_GPU_COUNT = "dstack_instance_gpu_count"
193
+ _RUN_COUNT_TOTAL = "dstack_run_count_total"
194
+ _RUN_COUNT_TERMINATED = "dstack_run_count_terminated_total"
195
+ _RUN_COUNT_FAILED = "dstack_run_count_failed_total"
196
+ _RUN_COUNT_DONE = "dstack_run_count_done_total"
197
+ _JOB_DURATION = "dstack_job_duration_seconds_total"
198
+ _JOB_PRICE = "dstack_job_price_dollars_per_hour"
199
+ _JOB_GPU_COUNT = "dstack_job_gpu_count"
200
+ _JOB_CPU_COUNT = "dstack_job_cpu_count"
201
+ _JOB_CPU_TIME = "dstack_job_cpu_time_seconds_total"
202
+ _JOB_MEMORY_TOTAL = "dstack_job_memory_total_bytes"
203
+ _JOB_MEMORY_USAGE = "dstack_job_memory_usage_bytes"
204
+ _JOB_MEMORY_WORKING_SET = "dstack_job_memory_working_set_bytes"
179
205
 
180
- async def get_project_metrics(session: AsyncSession, project: ProjectModel) -> str:
181
- res = await session.execute(
182
- select(JobPrometheusMetrics)
183
- .join(JobModel)
184
- .where(
185
- JobModel.project_id == project.id,
186
- JobModel.status.in_([JobStatus.RUNNING]),
206
+
207
+ class _Metrics(dict[str, Metric]):
208
+ metrics: ClassVar[list[tuple[str, str, str]]]
209
+
210
+ def __init__(self):
211
+ super().__init__()
212
+ for name, typ, documentation in self.metrics:
213
+ self[name] = Metric(name=name, documentation=documentation, typ=typ)
214
+
215
+ def add_sample(self, name: str, labels: dict[str, str], value: float) -> None:
216
+ # NOTE: Keeps reference to labels.
217
+ self[name].add_sample(name=name, labels=labels, value=value)
218
+
219
+ def add_metric(self, metric: Metric, labels: dict[str, str]) -> None:
220
+ # NOTE: Modifies and keeps reference to metric.
221
+ name = metric.name
222
+ samples = metric.samples
223
+ stored_metric = self.get(name)
224
+ if stored_metric is None:
225
+ stored_metric = metric
226
+ stored_metric.samples = []
227
+ self[name] = stored_metric
228
+ for sample in samples:
229
+ sample.labels.update(labels)
230
+ # text_string_to_metric_families "fixes" counter names appending _total,
231
+ # we rebuild Sample to revert this
232
+ stored_metric.samples.append(Sample(name, *sample[1:]))
233
+
234
+
235
+ class _InstanceMetrics(_Metrics):
236
+ metrics = [
237
+ (_INSTANCE_DURATION, _COUNTER, "Total seconds the instance is running"),
238
+ (_INSTANCE_PRICE, _GAUGE, "Instance price, USD/hour"),
239
+ (_INSTANCE_GPU_COUNT, _GAUGE, "Instance GPU count"),
240
+ ]
241
+
242
+
243
+ class _RunMetrics(_Metrics):
244
+ metrics = [
245
+ (_RUN_COUNT_TOTAL, _COUNTER, "Total runs count"),
246
+ (_RUN_COUNT_TERMINATED, _COUNTER, "Terminated runs count"),
247
+ (_RUN_COUNT_FAILED, _COUNTER, "Failed runs count"),
248
+ (_RUN_COUNT_DONE, _COUNTER, "Done runs count"),
249
+ ]
250
+
251
+
252
+ class _JobMetrics(_Metrics):
253
+ metrics = [
254
+ (_JOB_DURATION, _COUNTER, "Total seconds the job is running"),
255
+ (_JOB_PRICE, _GAUGE, "Job instance price, USD/hour"),
256
+ (_JOB_GPU_COUNT, _GAUGE, "Job GPU count"),
257
+ (_JOB_CPU_COUNT, _GAUGE, "Job CPU count"),
258
+ (_JOB_CPU_TIME, _COUNTER, "Total CPU time consumed by the job, seconds"),
259
+ (_JOB_MEMORY_TOTAL, _GAUGE, "Total memory allocated for the job, bytes"),
260
+ (_JOB_MEMORY_USAGE, _GAUGE, "Memory used by the job (including cache), bytes"),
261
+ (_JOB_MEMORY_WORKING_SET, _GAUGE, "Memory used by the job (not including cache), bytes"),
262
+ ]
263
+
264
+
265
+ async def _get_job_metrics_points(
266
+ session: AsyncSession, job_ids: Iterable[UUID]
267
+ ) -> dict[UUID, JobMetricsPoint]:
268
+ subquery = select(
269
+ JobMetricsPoint,
270
+ func.row_number()
271
+ .over(
272
+ partition_by=JobMetricsPoint.job_id,
273
+ order_by=JobMetricsPoint.timestamp_micro.desc(),
187
274
  )
188
- .order_by(JobModel.job_name)
189
- .options(
190
- joinedload(JobPrometheusMetrics.job).joinedload(JobModel.project),
191
- joinedload(JobPrometheusMetrics.job)
192
- .joinedload(JobModel.run)
193
- .joinedload(RunModel.user),
275
+ .label("row_number"),
276
+ ).subquery()
277
+ res = await session.execute(
278
+ select(aliased(JobMetricsPoint, subquery)).where(
279
+ subquery.c.row_number == 1,
280
+ subquery.c.job_id.in_(job_ids),
194
281
  )
195
282
  )
196
- metrics_models = res.scalars().all()
197
- return "\n".join(_render_metrics(_parse_and_enrich_job_gpu_metrics(metrics_models))) + "\n"
198
-
199
-
200
- def _parse_and_enrich_job_gpu_metrics(
201
- metrics_models: Iterable[JobPrometheusMetrics],
202
- ) -> Iterable[Metric]:
203
- metrics: dict[str, Metric] = {}
204
- for metrics_model in metrics_models:
205
- for metric in text_string_to_metric_families(metrics_model.text):
206
- samples = metric.samples
207
- metric.samples = []
208
- name = metric.name
209
- metric = metrics.setdefault(name, metric)
210
- for sample in samples:
211
- labels = sample.labels
212
- labels.update(_get_job_labels(metrics_model.job))
213
- # text_string_to_metric_families "fixes" counter names appending _total,
214
- # we rebuild Sample to revert this
215
- metric.samples.append(Sample(name, labels, *sample[2:]))
216
- return metrics.values()
283
+ return {p.job_id: p for p in res.scalars().all()}
217
284
 
218
285
 
219
- def _get_job_labels(job: JobModel) -> dict[str, str]:
220
- return {
221
- "dstack_project_name": job.project.name,
222
- "dstack_user_name": job.run.user.name,
223
- "dstack_run_name": job.run_name,
224
- "dstack_run_id": str(job.run_id),
225
- "dstack_job_name": job.job_name,
226
- "dstack_job_id": str(job.id),
227
- "dstack_job_num": str(job.job_num),
228
- "dstack_replica_num": str(job.replica_num),
229
- }
286
+ async def _get_job_prometheus_metrics(
287
+ session: AsyncSession, job_ids: Iterable[UUID]
288
+ ) -> dict[UUID, JobPrometheusMetrics]:
289
+ res = await session.execute(
290
+ select(JobPrometheusMetrics).where(JobPrometheusMetrics.job_id.in_(job_ids))
291
+ )
292
+ return {p.job_id: p for p in res.scalars().all()}
230
293
 
231
294
 
232
295
  def _render_metrics(metrics: Iterable[Metric]) -> Generator[str, None, None]:
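The hunk above replaces the repeated dict-of-Metric boilerplate with small registry classes (_InstanceMetrics, _RunMetrics, _JobMetrics) built on a shared _Metrics base. A minimal, self-contained sketch of the same pattern; the example metric name below is hypothetical, not from the package:

    from prometheus_client import Metric

    class MetricsRegistry(dict):
        # Subclasses declare (name, type, documentation) tuples.
        metrics: list = []

        def __init__(self):
            super().__init__()
            for name, typ, documentation in self.metrics:
                self[name] = Metric(name=name, documentation=documentation, typ=typ)

        def add_sample(self, name, labels, value):
            self[name].add_sample(name=name, labels=labels, value=value)

    class ExampleMetrics(MetricsRegistry):
        metrics = [("dstack_example_gauge", "gauge", "Example gauge")]

    m = ExampleMetrics()
    m.add_sample("dstack_example_gauge", {"dstack_project_name": "p1"}, 1.0)
    # m.values() now yields Metric objects ready for text rendering.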
dstack/version.py CHANGED
@@ -1,3 +1,3 @@
- __version__ = "0.19.0"
+ __version__ = "0.19.1"
  __is_release__ = True
  base_image = "0.7"
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: dstack
- Version: 0.19.0
+ Version: 0.19.1
  Summary: dstack is an open-source orchestration engine for running AI workloads on any cloud or on-premises.
  Home-page: https://dstack.ai
  Author: Andrey Cheptsov
@@ -1,5 +1,5 @@
1
1
  dstack/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
- dstack/version.py,sha256=1doazpT-su1BmUobtt--UmdBSkqrHjSTXH4z74aok9A,64
2
+ dstack/version.py,sha256=ja4J6HzBpBX3wxm5CKLUUgzAwmmr8naAhq3SBch6VIw,64
3
3
  dstack/_internal/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
4
  dstack/_internal/compat.py,sha256=bF9U9fTMfL8UVhCouedoUSTYFl7UAOiU0WXrnRoByxw,40
5
5
  dstack/_internal/settings.py,sha256=8XODoSW2joaEndvZxuHUPSFK85sGgJ7fVL976isYeJM,557
@@ -242,7 +242,7 @@ dstack/_internal/server/background/tasks/__init__.py,sha256=47DEQpj8HBSa-_TImW-5
242
242
  dstack/_internal/server/background/tasks/common.py,sha256=N7xSXbf2MoBWgbJ1e3AEzYBTf1Gn-pDXYND8Zr_YCJQ,970
243
243
  dstack/_internal/server/background/tasks/process_fleets.py,sha256=lKXUvN_b7DNjD3psHzyCt_JYsTxPFuQ86iXi8fj8GkM,3202
244
244
  dstack/_internal/server/background/tasks/process_gateways.py,sha256=hoUI1CSqbHt_uMwnzTRAEDl-LBw0wUk_W4xobIbdvRc,7017
245
- dstack/_internal/server/background/tasks/process_instances.py,sha256=TvpMnK211tXcOPAMF3qngGH9DycwKJeEchoYOtKICrg,37284
245
+ dstack/_internal/server/background/tasks/process_instances.py,sha256=Kc7CbWK4mFOsKwOqp-Pt0ewTsB5OZ5gkPyv9T6TNbpM,37674
246
246
  dstack/_internal/server/background/tasks/process_metrics.py,sha256=acySfsacpYbTPV9Yivs-oU37z1S2sUdWhRHdJkfBcCA,5332
247
247
  dstack/_internal/server/background/tasks/process_placement_groups.py,sha256=FqGfbzvfILdnPUfxjFPAM1ij2xd2mCDi8qufiBcUMI8,4107
248
248
  dstack/_internal/server/background/tasks/process_prometheus_metrics.py,sha256=u8hCXjOOek7VLEsmLy2VnDXFmIwTNjrJwcpWG7a1zW0,5093
@@ -325,7 +325,7 @@ dstack/_internal/server/routers/instances.py,sha256=XOogTC9My2Zv0ck37_PbHKoZI-j4
325
325
  dstack/_internal/server/routers/logs.py,sha256=_Euk283LbhlwHibJTKM-7YcpbeQFtWBqMfbOry3PSkU,1159
326
326
  dstack/_internal/server/routers/metrics.py,sha256=VFgWhkOvxVFDLlRM_kXHYFylLcfCD6UjXInvcd7H4dY,2314
327
327
  dstack/_internal/server/routers/projects.py,sha256=0R-w_6WXUbNo6fREAexFUQ3RoOJF2D_Iz35elKjym14,2717
328
- dstack/_internal/server/routers/prometheus.py,sha256=CIz1GoCVnSAevWhTPnbIJKUO-ntz8tvK6Q_2vicDLoo,1246
328
+ dstack/_internal/server/routers/prometheus.py,sha256=OuC17kgKkb2ErxDD5QZ_ZdZft5A8dMIAFlIzQ_04NEo,744
329
329
  dstack/_internal/server/routers/repos.py,sha256=P_zLoEQderxhCeHQJwRkrIhVcc0-cpabfyde22bWVRk,3362
330
330
  dstack/_internal/server/routers/runs.py,sha256=oPqyIRPwkMjj12M1IdMF2UitatqvljISAXnJAjfEJyQ,5352
331
331
  dstack/_internal/server/routers/secrets.py,sha256=50_qJCTYRpnGSlLyS93gqoV17wWewOVmM65PcG1bT_Y,856
@@ -360,7 +360,7 @@ dstack/_internal/server/services/offers.py,sha256=tTld2ZcYdbhzShtMIf1YfTyIADtpN3
360
360
  dstack/_internal/server/services/permissions.py,sha256=l7Ngdelmn65vjw13NcOdaC6lBYMRuSw6FbHzYwdK3nE,1005
361
361
  dstack/_internal/server/services/placement.py,sha256=DWZ8-iAE3o0J0xaHikuJYZzpuBiq7lj41LiAP1PfoEs,1773
362
362
  dstack/_internal/server/services/projects.py,sha256=Y4LEkSvOVUHHP-F2qlrwBR7rFu0CFFhbHmDTKrrNuXE,15071
363
- dstack/_internal/server/services/prometheus.py,sha256=1hXSsML-xIiQHv2DKVuJTfK0FRAlseF3GESUQUvy3NE,9625
363
+ dstack/_internal/server/services/prometheus.py,sha256=xq5G-Q2BJup9lS2F6__0wUVTs-k1Gr3dYclGzo2WoWo,12474
364
364
  dstack/_internal/server/services/repos.py,sha256=f9ztN7jz_2gvD9hXF5sJwWDVyG2-NHRfjIdSukowPh8,9342
365
365
  dstack/_internal/server/services/runs.py,sha256=B2jZtTOxavUHr6WqKMXqgLzB3xWsHTkWKykcvcT2lXI,37245
366
366
  dstack/_internal/server/services/storage.py,sha256=6I0xI_3_RpJNbKZwHjDnjrEwXGdHfiaeb5li15T-M1I,1884
@@ -385,7 +385,7 @@ dstack/_internal/server/services/jobs/configurators/service.py,sha256=FOWrLE-6YF
385
385
  dstack/_internal/server/services/jobs/configurators/task.py,sha256=0-B3oO-61Eq4-mmlLmqJPliFKHhvvIV0tqc12slcQuA,1436
386
386
  dstack/_internal/server/services/jobs/configurators/extensions/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
387
387
  dstack/_internal/server/services/jobs/configurators/extensions/base.py,sha256=xJbHxaaSJ1zjn8zuuApP1Xt2uBaedPhhc-IY0NtDDJQ,418
388
- dstack/_internal/server/services/jobs/configurators/extensions/cursor.py,sha256=r2siUu__sz86xDR_oK2xODmmzef1FzOS3veekGTS2-s,1645
388
+ dstack/_internal/server/services/jobs/configurators/extensions/cursor.py,sha256=00HB1kC_eMlioEW0nZI7Ly78b-RSehySVNhC9pajBP8,1636
389
389
  dstack/_internal/server/services/jobs/configurators/extensions/vscode.py,sha256=DAj8OEVLyL1x8Jko2EXKhnAkcSnlO1sJk6o6eiiVkDI,1611
390
390
  dstack/_internal/server/services/logs/__init__.py,sha256=NAjO1KeYvuDznN2EkfAaJt9S6Y00fo_dl3ob3WmsdGQ,3088
391
391
  dstack/_internal/server/services/logs/aws.py,sha256=949k8t9H9v_-aedDjDWkw8yPVyhZemmsszcDDEL5Tb4,13711
@@ -639,7 +639,7 @@ tests/_internal/server/background/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeR
639
639
  tests/_internal/server/background/tasks/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
640
640
  tests/_internal/server/background/tasks/test_process_fleets.py,sha256=Dl31_TwxoCzYqkVNPWGLsYxmGL2sZfEK3rQXLFyPIz8,2701
641
641
  tests/_internal/server/background/tasks/test_process_gateways.py,sha256=lOP4jPXDtadAgYp0aFND_fp5R_X19M58CaOlgnDAEck,5085
642
- tests/_internal/server/background/tasks/test_process_instances.py,sha256=b7Shke7p3FLPT-qTVDMnwuCK4KqR4kLw9UFfsr0TSVg,24549
642
+ tests/_internal/server/background/tasks/test_process_instances.py,sha256=WC32HvynBuSxwFtAyMTHS4eVzqCnyGufcrIUTEVoozI,27944
643
643
  tests/_internal/server/background/tasks/test_process_metrics.py,sha256=z-u4HXJE5EMVH9kwU_POHmvp55ldAvuLpEMkaebBtsg,4976
644
644
  tests/_internal/server/background/tasks/test_process_placement_groups.py,sha256=19LYbIMZIIeKAN0b9KOMyS-cHUx0FoOojqQuM8Oeiq4,1620
645
645
  tests/_internal/server/background/tasks/test_process_prometheus_metrics.py,sha256=I9DgIJXVGS7UvbFgm4HFnzWiCICBpy72NjDPKU_7WII,7178
@@ -656,7 +656,7 @@ tests/_internal/server/routers/test_instances.py,sha256=78HFMU9Xel8BNZL3TqnuvrKE
656
656
  tests/_internal/server/routers/test_logs.py,sha256=NZwyJlgjMOGq4XEx7-VDjTpniYPhZpsbZvB0dTawaog,3989
657
657
  tests/_internal/server/routers/test_metrics.py,sha256=xMdDFZW73Zl06QfggjatfwTut37s0soeliJivkCgBks,7620
658
658
  tests/_internal/server/routers/test_projects.py,sha256=Z3Ok7onAjUYS4ADvKvN-SwSxYKvlvf4MG5Y8baqQU14,25964
659
- tests/_internal/server/routers/test_prometheus.py,sha256=L3qxaJQzir2nJbFtRHZMC7Pw6xWBFcv4-0buQUSdek8,18933
659
+ tests/_internal/server/routers/test_prometheus.py,sha256=LqJwWn5ztSLIGnvZgj-sD7BFW-JuePFt6k__ymF5Btw,22711
660
660
  tests/_internal/server/routers/test_repos.py,sha256=G4dKuFGd_UrxAHwh_XLl1xCHK_DCsiJcXBsHODw3yJk,16682
661
661
  tests/_internal/server/routers/test_runs.py,sha256=q02oBrUcp4JoJOL68jbxlfFxH9B8JO9Bkb7v_Qg-Aug,62984
662
662
  tests/_internal/server/routers/test_server.py,sha256=ROkuRNNJEkMQuK8guZ3Qy3iRRfiWvPIJJJDc09BI0D4,489
@@ -701,9 +701,9 @@ tests/_internal/utils/test_path.py,sha256=rzS-1YCxsFUocBe42dghLOMFNymPruGrA7bqFZ
701
701
  tests/_internal/utils/test_ssh.py,sha256=V-cBFPhD--9eM9d1uQQgpj2gnYLA3c43f4cX9uJ6E-U,1743
702
702
  tests/api/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
703
703
  tests/api/test_utils.py,sha256=SSSqHcNE5cZVqDq4n2sKZthRoXaZ_Bx7z1AAN5xTM9s,391
704
- dstack-0.19.0.dist-info/LICENSE.md,sha256=qDABaRGjSKVOib1U8viw2P_96sIK7Puo426784oD9f8,15976
705
- dstack-0.19.0.dist-info/METADATA,sha256=xHGGa6FKokKvEFYnt2k7cX2iFkp0YsA9o0-I3nbqagg,18231
706
- dstack-0.19.0.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
707
- dstack-0.19.0.dist-info/entry_points.txt,sha256=GnLrMS8hx3rWAySQjA7tPNhtixV6a-brRkmal1PKoHc,58
708
- dstack-0.19.0.dist-info/top_level.txt,sha256=3BrIO1zrqxT9P20ymhRM6k15meZXzbPL6ykBlDZG2_k,13
709
- dstack-0.19.0.dist-info/RECORD,,
704
+ dstack-0.19.1.dist-info/LICENSE.md,sha256=qDABaRGjSKVOib1U8viw2P_96sIK7Puo426784oD9f8,15976
705
+ dstack-0.19.1.dist-info/METADATA,sha256=sj_wcanWBaGU9ecMn6I32zvXuYFniyN_6K6ehbcO3tA,18231
706
+ dstack-0.19.1.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
707
+ dstack-0.19.1.dist-info/entry_points.txt,sha256=GnLrMS8hx3rWAySQjA7tPNhtixV6a-brRkmal1PKoHc,58
708
+ dstack-0.19.1.dist-info/top_level.txt,sha256=3BrIO1zrqxT9P20ymhRM6k15meZXzbPL6ykBlDZG2_k,13
709
+ dstack-0.19.1.dist-info/RECORD,,
@@ -8,7 +8,7 @@ import pytest
8
8
  from freezegun import freeze_time
9
9
  from sqlalchemy.ext.asyncio import AsyncSession
10
10
 
11
- from dstack._internal.core.errors import BackendError
11
+ from dstack._internal.core.errors import BackendError, ProvisioningError
12
12
  from dstack._internal.core.models.backends.base import BackendType
13
13
  from dstack._internal.core.models.instances import (
14
14
  Gpu,
@@ -35,6 +35,8 @@ from dstack._internal.server.testing.common import (
35
35
  create_repo,
36
36
  create_run,
37
37
  create_user,
38
+ get_instance_offer_with_availability,
39
+ get_job_provisioning_data,
38
40
  get_remote_connection_info,
39
41
  )
40
42
  from dstack._internal.utils.common import get_current_datetime
@@ -557,6 +559,68 @@ class TestCreateInstance:
557
559
  assert instance.total_blocks == expected_blocks
558
560
  assert instance.busy_blocks == 0
559
561
 
562
+ @pytest.mark.parametrize("err", [RuntimeError("Unexpected"), ProvisioningError("Expected")])
563
+ async def test_tries_second_offer_if_first_fails(self, session: AsyncSession, err: Exception):
564
+ project = await create_project(session=session)
565
+ instance = await create_instance(
566
+ session=session, project=project, status=InstanceStatus.PENDING
567
+ )
568
+ aws_mock = Mock()
569
+ aws_mock.TYPE = BackendType.AWS
570
+ offer = get_instance_offer_with_availability(backend=BackendType.AWS, price=1.0)
571
+ aws_mock.compute.return_value = Mock(spec=ComputeMockSpec)
572
+ aws_mock.compute.return_value.get_offers_cached.return_value = [offer]
573
+ aws_mock.compute.return_value.create_instance.side_effect = err
574
+ gcp_mock = Mock()
575
+ gcp_mock.TYPE = BackendType.GCP
576
+ offer = get_instance_offer_with_availability(backend=BackendType.GCP, price=2.0)
577
+ gcp_mock.compute.return_value = Mock(spec=ComputeMockSpec)
578
+ gcp_mock.compute.return_value.get_offers_cached.return_value = [offer]
579
+ gcp_mock.compute.return_value.create_instance.return_value = get_job_provisioning_data(
580
+ backend=offer.backend, region=offer.region, price=offer.price
581
+ )
582
+ with patch("dstack._internal.server.services.backends.get_project_backends") as m:
583
+ m.return_value = [aws_mock, gcp_mock]
584
+ await process_instances()
585
+
586
+ await session.refresh(instance)
587
+ assert instance.status == InstanceStatus.PROVISIONING
588
+ aws_mock.compute.return_value.create_instance.assert_called_once()
589
+ assert instance.backend == BackendType.GCP
590
+
591
+ @pytest.mark.parametrize("err", [RuntimeError("Unexpected"), ProvisioningError("Expected")])
592
+ async def test_fails_if_all_offers_fail(self, session: AsyncSession, err: Exception):
593
+ project = await create_project(session=session)
594
+ instance = await create_instance(
595
+ session=session, project=project, status=InstanceStatus.PENDING
596
+ )
597
+ aws_mock = Mock()
598
+ aws_mock.TYPE = BackendType.AWS
599
+ offer = get_instance_offer_with_availability(backend=BackendType.AWS, price=1.0)
600
+ aws_mock.compute.return_value = Mock(spec=ComputeMockSpec)
601
+ aws_mock.compute.return_value.get_offers_cached.return_value = [offer]
602
+ aws_mock.compute.return_value.create_instance.side_effect = err
603
+ with patch("dstack._internal.server.services.backends.get_project_backends") as m:
604
+ m.return_value = [aws_mock]
605
+ await process_instances()
606
+
607
+ await session.refresh(instance)
608
+ assert instance.status == InstanceStatus.TERMINATED
609
+ assert instance.termination_reason == "All offers failed"
610
+
611
+ async def test_fails_if_no_offers(self, session: AsyncSession):
612
+ project = await create_project(session=session)
613
+ instance = await create_instance(
614
+ session=session, project=project, status=InstanceStatus.PENDING
615
+ )
616
+ with patch("dstack._internal.server.services.backends.get_project_backends") as m:
617
+ m.return_value = []
618
+ await process_instances()
619
+
620
+ await session.refresh(instance)
621
+ assert instance.status == InstanceStatus.TERMINATED
622
+ assert instance.termination_reason == "No offers found"
623
+
560
624
 
561
625
  @pytest.mark.asyncio
562
626
  @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True)
@@ -9,14 +9,20 @@ from sqlalchemy.ext.asyncio import AsyncSession
9
9
 
10
10
  from dstack._internal.core.models.backends.base import BackendType
11
11
  from dstack._internal.core.models.configurations import DevEnvironmentConfiguration
12
- from dstack._internal.core.models.runs import JobProvisioningData, JobRuntimeData, JobStatus
12
+ from dstack._internal.core.models.runs import (
13
+ JobProvisioningData,
14
+ JobRuntimeData,
15
+ JobStatus,
16
+ RunStatus,
17
+ )
13
18
  from dstack._internal.core.models.users import GlobalRole, ProjectRole
14
- from dstack._internal.server.models import JobModel, ProjectModel, UserModel
19
+ from dstack._internal.server.models import JobModel, ProjectModel, RunModel, UserModel
15
20
  from dstack._internal.server.services.projects import add_project_member
16
21
  from dstack._internal.server.testing.common import (
17
22
  create_fleet,
18
23
  create_instance,
19
24
  create_job,
25
+ create_job_metrics_point,
20
26
  create_job_prometheus_metrics,
21
27
  create_project,
22
28
  create_repo,
@@ -45,11 +51,21 @@ class TestGetPrometheusMetrics:
45
51
  async def test_returns_metrics(self, session: AsyncSession, client: AsyncClient):
46
52
  user = await create_user(session=session, name="test-user", global_role=GlobalRole.USER)
47
53
  offer = get_instance_offer_with_availability(
48
- instance_type="test-type", gpu_count=2, gpu_name="V4", price=12
54
+ instance_type="test-type",
55
+ cpu_count=32,
56
+ memory_gib=128,
57
+ gpu_count=2,
58
+ gpu_name="V4",
59
+ price=12,
49
60
  )
50
61
  project_2 = await _create_project(session, "project-2", user)
51
62
  jpd_2_1 = get_job_provisioning_data(
52
- backend=BackendType.AWS, gpu_name="T4", gpu_count=2, price=16
63
+ backend=BackendType.AWS,
64
+ cpu_count=16,
65
+ memory_gib=64,
66
+ gpu_name="T4",
67
+ gpu_count=2,
68
+ price=16,
53
69
  )
54
70
  job_2_1 = await _create_job(
55
71
  session=session,
@@ -100,7 +116,41 @@ class TestGetPrometheusMetrics:
100
116
  FIELD_2{gpu="1"} 987169 1395066363010
101
117
  """),
102
118
  )
103
- job_1_2 = await _create_job(session, "run-2", project_1, user, JobStatus.RUNNING)
119
+ await create_job_metrics_point(
120
+ session=session,
121
+ job_model=job_1_1,
122
+ timestamp=FAKE_NOW - timedelta(seconds=30),
123
+ cpu_usage_micro=3_500_000,
124
+ memory_working_set_bytes=3_221_225_472,
125
+ memory_usage_bytes=4_294_967_296,
126
+ )
127
+ # Older, ignored
128
+ await create_job_metrics_point(
129
+ session=session,
130
+ job_model=job_1_1,
131
+ timestamp=FAKE_NOW - timedelta(seconds=60),
132
+ cpu_usage_micro=2_000_000,
133
+ memory_working_set_bytes=1_073_741_824,
134
+ memory_usage_bytes=2_147_483_648,
135
+ )
136
+ jpd_1_2 = get_job_provisioning_data(
137
+ backend=BackendType.AWS,
138
+ cpu_count=24,
139
+ memory_gib=224,
140
+ gpu_count=3,
141
+ gpu_name="L4",
142
+ price=12.5,
143
+ )
144
+ job_1_2 = await _create_job(
145
+ session=session,
146
+ run_name="run-2",
147
+ project=project_1,
148
+ user=user,
149
+ status=JobStatus.RUNNING,
150
+ job_provisioning_data=jpd_1_2,
151
+ submitted_at=FAKE_NOW - timedelta(seconds=150),
152
+ )
153
+
104
154
  await create_job_prometheus_metrics(
105
155
  session=session,
106
156
  job=job_1_2,
@@ -124,6 +174,15 @@ class TestGetPrometheusMetrics:
124
174
  FIELD_1{gpu="1"} 20
125
175
  """),
126
176
  )
177
+ await _create_run(session, "done", project_1, user, RunStatus.DONE)
178
+ other_user = await create_user(
179
+ session=session, name="other-user", global_role=GlobalRole.USER
180
+ )
181
+ await add_project_member(
182
+ session=session, project=project_2, user=other_user, project_role=ProjectRole.USER
183
+ )
184
+ await _create_run(session, "failed-1", project_2, other_user, RunStatus.FAILED)
185
+ await _create_run(session, "failed-2", project_2, other_user, RunStatus.FAILED)
127
186
  fleet = await create_fleet(session=session, project=project_1, name="test-fleet")
128
187
  instance = await create_instance(
129
188
  session=session,
@@ -149,31 +208,73 @@ class TestGetPrometheusMetrics:
149
208
  # HELP dstack_instance_gpu_count Instance GPU count
150
209
  # TYPE dstack_instance_gpu_count gauge
151
210
  dstack_instance_gpu_count{{dstack_project_name="project-1",dstack_fleet_name="test-fleet",dstack_fleet_id="{fleet.id}",dstack_instance_name="test-instance",dstack_instance_id="{instance.id}",dstack_instance_type="test-type",dstack_backend="aws",dstack_gpu="V4"}} 2.0
211
+ # HELP dstack_run_count_total Total runs count
212
+ # TYPE dstack_run_count_total counter
213
+ dstack_run_count_total{{dstack_project_name="project-1",dstack_user_name="test-user"}} 4.0
214
+ dstack_run_count_total{{dstack_project_name="project-2",dstack_user_name="other-user"}} 2.0
215
+ dstack_run_count_total{{dstack_project_name="project-2",dstack_user_name="test-user"}} 1.0
216
+ # HELP dstack_run_count_terminated_total Terminated runs count
217
+ # TYPE dstack_run_count_terminated_total counter
218
+ dstack_run_count_terminated_total{{dstack_project_name="project-1",dstack_user_name="test-user"}} 0.0
219
+ dstack_run_count_terminated_total{{dstack_project_name="project-2",dstack_user_name="other-user"}} 0.0
220
+ dstack_run_count_terminated_total{{dstack_project_name="project-2",dstack_user_name="test-user"}} 0.0
221
+ # HELP dstack_run_count_failed_total Failed runs count
222
+ # TYPE dstack_run_count_failed_total counter
223
+ dstack_run_count_failed_total{{dstack_project_name="project-1",dstack_user_name="test-user"}} 0.0
224
+ dstack_run_count_failed_total{{dstack_project_name="project-2",dstack_user_name="other-user"}} 2.0
225
+ dstack_run_count_failed_total{{dstack_project_name="project-2",dstack_user_name="test-user"}} 0.0
226
+ # HELP dstack_run_count_done_total Done runs count
227
+ # TYPE dstack_run_count_done_total counter
228
+ dstack_run_count_done_total{{dstack_project_name="project-1",dstack_user_name="test-user"}} 1.0
229
+ dstack_run_count_done_total{{dstack_project_name="project-2",dstack_user_name="other-user"}} 0.0
230
+ dstack_run_count_done_total{{dstack_project_name="project-2",dstack_user_name="test-user"}} 0.0
152
231
  # HELP dstack_job_duration_seconds_total Total seconds the job is running
153
232
  # TYPE dstack_job_duration_seconds_total counter
154
233
  dstack_job_duration_seconds_total{{dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="V4"}} 120.0
234
+ dstack_job_duration_seconds_total{{dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-2",dstack_run_id="{job_1_2.run_id}",dstack_job_name="run-2-0-0",dstack_job_id="{job_1_2.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="L4"}} 150.0
155
235
  dstack_job_duration_seconds_total{{dstack_project_name="project-2",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_2_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_2_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="T4"}} 100.0
156
236
  # HELP dstack_job_price_dollars_per_hour Job instance price, USD/hour
157
237
  # TYPE dstack_job_price_dollars_per_hour gauge
158
238
  dstack_job_price_dollars_per_hour{{dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="V4"}} 12.0
239
+ dstack_job_price_dollars_per_hour{{dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-2",dstack_run_id="{job_1_2.run_id}",dstack_job_name="run-2-0-0",dstack_job_id="{job_1_2.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="L4"}} 12.5
159
240
  dstack_job_price_dollars_per_hour{{dstack_project_name="project-2",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_2_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_2_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="T4"}} 16.0
160
241
  # HELP dstack_job_gpu_count Job GPU count
161
242
  # TYPE dstack_job_gpu_count gauge
162
243
  dstack_job_gpu_count{{dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="V4"}} 2.0
244
+ dstack_job_gpu_count{{dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-2",dstack_run_id="{job_1_2.run_id}",dstack_job_name="run-2-0-0",dstack_job_id="{job_1_2.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="L4"}} 3.0
163
245
  dstack_job_gpu_count{{dstack_project_name="project-2",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_2_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_2_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="T4"}} 2.0
246
+ # HELP dstack_job_cpu_count Job CPU count
247
+ # TYPE dstack_job_cpu_count gauge
248
+ dstack_job_cpu_count{{dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="V4"}} 32.0
249
+ dstack_job_cpu_count{{dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-2",dstack_run_id="{job_1_2.run_id}",dstack_job_name="run-2-0-0",dstack_job_id="{job_1_2.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="L4"}} 24.0
250
+ dstack_job_cpu_count{{dstack_project_name="project-2",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_2_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_2_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="T4"}} 16.0
251
+ # HELP dstack_job_cpu_time_seconds_total Total CPU time consumed by the job, seconds
252
+ # TYPE dstack_job_cpu_time_seconds_total counter
253
+ dstack_job_cpu_time_seconds_total{{dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="V4"}} 3.5
254
+ # HELP dstack_job_memory_total_bytes Total memory allocated for the job, bytes
255
+ # TYPE dstack_job_memory_total_bytes gauge
256
+ dstack_job_memory_total_bytes{{dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="V4"}} 137438953472.0
257
+ dstack_job_memory_total_bytes{{dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-2",dstack_run_id="{job_1_2.run_id}",dstack_job_name="run-2-0-0",dstack_job_id="{job_1_2.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="L4"}} 240518168576.0
258
+ dstack_job_memory_total_bytes{{dstack_project_name="project-2",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_2_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_2_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="T4"}} 68719476736.0
259
+ # HELP dstack_job_memory_usage_bytes Memory used by the job (including cache), bytes
260
+ # TYPE dstack_job_memory_usage_bytes gauge
261
+ dstack_job_memory_usage_bytes{{dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="V4"}} 4294967296.0
262
+ # HELP dstack_job_memory_working_set_bytes Memory used by the job (not including cache), bytes
263
+ # TYPE dstack_job_memory_working_set_bytes gauge
264
+ dstack_job_memory_working_set_bytes{{dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="V4"}} 3221225472.0
164
265
  # HELP FIELD_1 Test field 1
165
266
  # TYPE FIELD_1 gauge
166
- FIELD_1{{gpu="0",dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0"}} 350.0
167
- FIELD_1{{gpu="1",dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0"}} 400.0
168
- FIELD_1{{gpu="0",dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-2",dstack_run_id="{job_1_2.run_id}",dstack_job_name="run-2-0-0",dstack_job_id="{job_1_2.id}",dstack_job_num="0",dstack_replica_num="0"}} 1200.0
169
- FIELD_1{{gpu="1",dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-2",dstack_run_id="{job_1_2.run_id}",dstack_job_name="run-2-0-0",dstack_job_id="{job_1_2.id}",dstack_job_num="0",dstack_replica_num="0"}} 1600.0
170
- FIELD_1{{gpu="2",dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-2",dstack_run_id="{job_1_2.run_id}",dstack_job_name="run-2-0-0",dstack_job_id="{job_1_2.id}",dstack_job_num="0",dstack_replica_num="0"}} 2400.0
171
- FIELD_1{{gpu="0",dstack_project_name="project-2",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_2_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_2_1.id}",dstack_job_num="0",dstack_replica_num="0"}} 100.0
172
- FIELD_1{{gpu="1",dstack_project_name="project-2",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_2_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_2_1.id}",dstack_job_num="0",dstack_replica_num="0"}} 200.0
267
+ FIELD_1{{gpu="0",dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="V4"}} 350.0
268
+ FIELD_1{{gpu="1",dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="V4"}} 400.0
269
+ FIELD_1{{gpu="0",dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-2",dstack_run_id="{job_1_2.run_id}",dstack_job_name="run-2-0-0",dstack_job_id="{job_1_2.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="L4"}} 1200.0
270
+ FIELD_1{{gpu="1",dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-2",dstack_run_id="{job_1_2.run_id}",dstack_job_name="run-2-0-0",dstack_job_id="{job_1_2.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="L4"}} 1600.0
271
+ FIELD_1{{gpu="2",dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-2",dstack_run_id="{job_1_2.run_id}",dstack_job_name="run-2-0-0",dstack_job_id="{job_1_2.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="L4"}} 2400.0
272
+ FIELD_1{{gpu="0",dstack_project_name="project-2",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_2_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_2_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="T4"}} 100.0
273
+ FIELD_1{{gpu="1",dstack_project_name="project-2",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_2_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_2_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="T4"}} 200.0
173
274
  # HELP FIELD_2 Test field 2
174
275
  # TYPE FIELD_2 counter
175
- FIELD_2{{gpu="0",dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0"}} 337325.0 1395066363000
176
- FIELD_2{{gpu="1",dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0"}} 987169.0 1395066363010
276
+ FIELD_2{{gpu="0",dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="V4"}} 337325.0 1395066363000
277
+ FIELD_2{{gpu="1",dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="V4"}} 987169.0 1395066363010
177
278
  """)
178
279
 
179
280
  async def test_returns_empty_response_if_no_runs(self, client: AsyncClient):
@@ -189,110 +290,6 @@ class TestGetPrometheusMetrics:
189
290
  assert response.status_code == 404
190
291
 
191
292
 
192
- @pytest.mark.asyncio
193
- @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True)
194
- @pytest.mark.usefixtures("image_config_mock", "test_db", "enable_metrics")
195
- class TestGetPrometheusProjectMetrics:
196
- async def test_returns_metrics(self, session: AsyncSession, client: AsyncClient):
197
- user = await create_user(session=session, name="test-user", global_role=GlobalRole.USER)
198
- project = await _create_project(session, "project-1", user)
199
- job_1 = await _create_job(session, "run-1", project, user, JobStatus.RUNNING)
200
- await create_job_prometheus_metrics(
201
- session=session,
202
- job=job_1,
203
- text=dedent("""
204
- # Comments should be skipped
205
-
206
- # HELP FIELD_1 Test field 1
207
- # TYPE FIELD_1 gauge
208
- FIELD_1{gpu="0"} 350
209
- FIELD_1{gpu="1"} 400
210
-
211
- # HELP FIELD_2 Test field 2
212
- # TYPE FIELD_2 counter
213
- FIELD_2{gpu="0"} 337325 1395066363000
214
- FIELD_2{gpu="1"} 987169 1395066363010
215
- """),
216
- )
217
- job_2 = await _create_job(session, "run-2", project, user, JobStatus.RUNNING)
218
- await create_job_prometheus_metrics(
219
- session=session,
220
- job=job_2,
221
- text=dedent("""
222
- # HELP FIELD_1 Test field 1
223
- # TYPE FIELD_1 gauge
224
- FIELD_1{gpu="0"} 1200.0
225
- FIELD_1{gpu="1"} 1600.0
226
- FIELD_1{gpu="2"} 2400.0
227
- """),
228
- )
229
- # Terminated job, should not appear in the response
230
- job_3 = await _create_job(session, "run-3", project, user, JobStatus.TERMINATED)
231
- await create_job_prometheus_metrics(
232
- session=session,
233
- job=job_3,
234
- text=dedent("""
235
- # HELP FIELD_1 Test field 1
236
- # TYPE FIELD_1 gauge
237
- FIELD_1{gpu="0"} 10
238
- FIELD_1{gpu="1"} 20
239
- """),
240
- )
241
- another_project = await _create_project(session, "project-2", user)
242
- another_project_job = await _create_job(
243
- session, "run-4", another_project, user, JobStatus.RUNNING
244
- )
245
- await create_job_prometheus_metrics(
246
- session=session,
247
- job=another_project_job,
248
- text=dedent("""
249
- # HELP FIELD_1 Test field 1
250
- # TYPE FIELD_1 gauge
251
- FIELD_1{gpu="0"} 100
252
- FIELD_1{gpu="1"} 200
253
- """),
254
- )
255
-
256
- response = await client.get("/metrics/project/project-1")
257
-
258
- assert response.status_code == 200
259
- assert response.text == dedent(f"""\
260
- # HELP FIELD_1 Test field 1
261
- # TYPE FIELD_1 gauge
262
- FIELD_1{{gpu="0",dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1.id}",dstack_job_num="0",dstack_replica_num="0"}} 350.0
263
- FIELD_1{{gpu="1",dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1.id}",dstack_job_num="0",dstack_replica_num="0"}} 400.0
264
- FIELD_1{{gpu="0",dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-2",dstack_run_id="{job_2.run_id}",dstack_job_name="run-2-0-0",dstack_job_id="{job_2.id}",dstack_job_num="0",dstack_replica_num="0"}} 1200.0
265
- FIELD_1{{gpu="1",dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-2",dstack_run_id="{job_2.run_id}",dstack_job_name="run-2-0-0",dstack_job_id="{job_2.id}",dstack_job_num="0",dstack_replica_num="0"}} 1600.0
266
- FIELD_1{{gpu="2",dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-2",dstack_run_id="{job_2.run_id}",dstack_job_name="run-2-0-0",dstack_job_id="{job_2.id}",dstack_job_num="0",dstack_replica_num="0"}} 2400.0
267
- # HELP FIELD_2 Test field 2
268
- # TYPE FIELD_2 counter
269
- FIELD_2{{gpu="0",dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1.id}",dstack_job_num="0",dstack_replica_num="0"}} 337325.0 1395066363000
270
- FIELD_2{{gpu="1",dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1.id}",dstack_job_num="0",dstack_replica_num="0"}} 987169.0 1395066363010
271
- """)
272
-
273
- async def test_returns_empty_response_if_no_runs(
274
- self, session: AsyncSession, client: AsyncClient
275
- ):
276
- user = await create_user(session=session, global_role=GlobalRole.USER)
277
- await create_project(session=session, owner=user, name="test-project")
278
- response = await client.get("/metrics/project/test-project")
279
- assert response.status_code == 200
280
- assert response.text == "\n"
281
-
282
- async def test_returns_404_if_project_doesnt_exist(self, client: AsyncClient):
283
- response = await client.get("/metrics/project/nonexistent")
284
- assert response.status_code == 404
285
-
286
- async def test_returns_404_if_not_enabled(
287
- self, monkeypatch: pytest.MonkeyPatch, session: AsyncSession, client: AsyncClient
288
- ):
289
- monkeypatch.setattr("dstack._internal.server.settings.ENABLE_PROMETHEUS_METRICS", False)
290
- user = await create_user(session=session, global_role=GlobalRole.USER)
291
- await create_project(session=session, owner=user, name="test-project")
292
- response = await client.get("/metrics/project/test-project")
293
- assert response.status_code == 404
294
-
295
-
296
293
  async def _create_project(session: AsyncSession, name: str, user: UserModel) -> ProjectModel:
297
294
  project = await create_project(session=session, owner=user, name=name)
298
295
  await add_project_member(
@@ -301,26 +298,46 @@ async def _create_project(session: AsyncSession, name: str, user: UserModel) ->
301
298
  return project
302
299
 
303
300
 
304
- async def _create_job(
301
+ async def _create_run(
305
302
  session: AsyncSession,
306
303
  run_name: str,
307
304
  project: ProjectModel,
308
305
  user: UserModel,
309
- status: JobStatus,
310
- job_provisioning_data: Optional[JobProvisioningData] = None,
311
- job_runtime_data: Optional[JobRuntimeData] = None,
306
+ status: RunStatus,
312
307
  submitted_at: datetime = FAKE_NOW,
313
- ) -> JobModel:
308
+ ) -> RunModel:
314
309
  repo = await create_repo(session=session, project_id=project.id, repo_name=f"{run_name}-repo")
315
310
  configuration = DevEnvironmentConfiguration(ide="vscode")
316
311
  run_spec = get_run_spec(run_name=run_name, repo_id=repo.name, configuration=configuration)
317
- run = await create_run(
312
+ return await create_run(
318
313
  session=session,
319
314
  project=project,
320
315
  repo=repo,
321
316
  user=user,
322
317
  run_name=run_name,
323
318
  run_spec=run_spec,
319
+ status=status,
320
+ submitted_at=submitted_at,
321
+ )
322
+
323
+
324
+ async def _create_job(
325
+ session: AsyncSession,
326
+ run_name: str,
327
+ project: ProjectModel,
328
+ user: UserModel,
329
+ status: JobStatus,
330
+ job_provisioning_data: Optional[JobProvisioningData] = None,
331
+ job_runtime_data: Optional[JobRuntimeData] = None,
332
+ submitted_at: datetime = FAKE_NOW,
333
+ ) -> JobModel:
334
+ run = await _create_run(
335
+ session=session,
336
+ run_name=run_name,
337
+ project=project,
338
+ user=user,
339
+ status=RunStatus.SUBMITTED,
340
+ submitted_at=submitted_at,
324
341
  )
325
342
  job = await create_job(
326
343
  session=session,