dstack 0.19.0rc1__py3-none-any.whl → 0.19.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dstack/_internal/server/background/tasks/process_instances.py +14 -5
- dstack/_internal/server/routers/prometheus.py +0 -12
- dstack/_internal/server/services/jobs/configurators/extensions/cursor.py +1 -1
- dstack/_internal/server/services/prometheus.py +175 -112
- dstack/_internal/server/statics/index.html +1 -1
- dstack/_internal/server/statics/{main-4fd5a4770eff59325ee3.js → main-4a0fe83e84574654e397.js} +13 -9
- dstack/_internal/server/statics/{main-4fd5a4770eff59325ee3.js.map → main-4a0fe83e84574654e397.js.map} +1 -1
- dstack/version.py +1 -1
- {dstack-0.19.0rc1.dist-info → dstack-0.19.1.dist-info}/METADATA +1 -1
- {dstack-0.19.0rc1.dist-info → dstack-0.19.1.dist-info}/RECORD +16 -16
- tests/_internal/server/background/tasks/test_process_instances.py +65 -1
- tests/_internal/server/routers/test_prometheus.py +141 -124
- {dstack-0.19.0rc1.dist-info → dstack-0.19.1.dist-info}/LICENSE.md +0 -0
- {dstack-0.19.0rc1.dist-info → dstack-0.19.1.dist-info}/WHEEL +0 -0
- {dstack-0.19.0rc1.dist-info → dstack-0.19.1.dist-info}/entry_points.txt +0 -0
- {dstack-0.19.0rc1.dist-info → dstack-0.19.1.dist-info}/top_level.txt +0 -0
dstack/version.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
dstack/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
|
-
dstack/version.py,sha256
|
|
2
|
+
dstack/version.py,sha256=ja4J6HzBpBX3wxm5CKLUUgzAwmmr8naAhq3SBch6VIw,64
|
|
3
3
|
dstack/_internal/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
4
4
|
dstack/_internal/compat.py,sha256=bF9U9fTMfL8UVhCouedoUSTYFl7UAOiU0WXrnRoByxw,40
|
|
5
5
|
dstack/_internal/settings.py,sha256=8XODoSW2joaEndvZxuHUPSFK85sGgJ7fVL976isYeJM,557
|
|
@@ -242,7 +242,7 @@ dstack/_internal/server/background/tasks/__init__.py,sha256=47DEQpj8HBSa-_TImW-5
|
|
|
242
242
|
dstack/_internal/server/background/tasks/common.py,sha256=N7xSXbf2MoBWgbJ1e3AEzYBTf1Gn-pDXYND8Zr_YCJQ,970
|
|
243
243
|
dstack/_internal/server/background/tasks/process_fleets.py,sha256=lKXUvN_b7DNjD3psHzyCt_JYsTxPFuQ86iXi8fj8GkM,3202
|
|
244
244
|
dstack/_internal/server/background/tasks/process_gateways.py,sha256=hoUI1CSqbHt_uMwnzTRAEDl-LBw0wUk_W4xobIbdvRc,7017
|
|
245
|
-
dstack/_internal/server/background/tasks/process_instances.py,sha256=
|
|
245
|
+
dstack/_internal/server/background/tasks/process_instances.py,sha256=Kc7CbWK4mFOsKwOqp-Pt0ewTsB5OZ5gkPyv9T6TNbpM,37674
|
|
246
246
|
dstack/_internal/server/background/tasks/process_metrics.py,sha256=acySfsacpYbTPV9Yivs-oU37z1S2sUdWhRHdJkfBcCA,5332
|
|
247
247
|
dstack/_internal/server/background/tasks/process_placement_groups.py,sha256=FqGfbzvfILdnPUfxjFPAM1ij2xd2mCDi8qufiBcUMI8,4107
|
|
248
248
|
dstack/_internal/server/background/tasks/process_prometheus_metrics.py,sha256=u8hCXjOOek7VLEsmLy2VnDXFmIwTNjrJwcpWG7a1zW0,5093
|
|
@@ -325,7 +325,7 @@ dstack/_internal/server/routers/instances.py,sha256=XOogTC9My2Zv0ck37_PbHKoZI-j4
|
|
|
325
325
|
dstack/_internal/server/routers/logs.py,sha256=_Euk283LbhlwHibJTKM-7YcpbeQFtWBqMfbOry3PSkU,1159
|
|
326
326
|
dstack/_internal/server/routers/metrics.py,sha256=VFgWhkOvxVFDLlRM_kXHYFylLcfCD6UjXInvcd7H4dY,2314
|
|
327
327
|
dstack/_internal/server/routers/projects.py,sha256=0R-w_6WXUbNo6fREAexFUQ3RoOJF2D_Iz35elKjym14,2717
|
|
328
|
-
dstack/_internal/server/routers/prometheus.py,sha256=
|
|
328
|
+
dstack/_internal/server/routers/prometheus.py,sha256=OuC17kgKkb2ErxDD5QZ_ZdZft5A8dMIAFlIzQ_04NEo,744
|
|
329
329
|
dstack/_internal/server/routers/repos.py,sha256=P_zLoEQderxhCeHQJwRkrIhVcc0-cpabfyde22bWVRk,3362
|
|
330
330
|
dstack/_internal/server/routers/runs.py,sha256=oPqyIRPwkMjj12M1IdMF2UitatqvljISAXnJAjfEJyQ,5352
|
|
331
331
|
dstack/_internal/server/routers/secrets.py,sha256=50_qJCTYRpnGSlLyS93gqoV17wWewOVmM65PcG1bT_Y,856
|
|
@@ -360,7 +360,7 @@ dstack/_internal/server/services/offers.py,sha256=tTld2ZcYdbhzShtMIf1YfTyIADtpN3
|
|
|
360
360
|
dstack/_internal/server/services/permissions.py,sha256=l7Ngdelmn65vjw13NcOdaC6lBYMRuSw6FbHzYwdK3nE,1005
|
|
361
361
|
dstack/_internal/server/services/placement.py,sha256=DWZ8-iAE3o0J0xaHikuJYZzpuBiq7lj41LiAP1PfoEs,1773
|
|
362
362
|
dstack/_internal/server/services/projects.py,sha256=Y4LEkSvOVUHHP-F2qlrwBR7rFu0CFFhbHmDTKrrNuXE,15071
|
|
363
|
-
dstack/_internal/server/services/prometheus.py,sha256=
|
|
363
|
+
dstack/_internal/server/services/prometheus.py,sha256=xq5G-Q2BJup9lS2F6__0wUVTs-k1Gr3dYclGzo2WoWo,12474
|
|
364
364
|
dstack/_internal/server/services/repos.py,sha256=f9ztN7jz_2gvD9hXF5sJwWDVyG2-NHRfjIdSukowPh8,9342
|
|
365
365
|
dstack/_internal/server/services/runs.py,sha256=B2jZtTOxavUHr6WqKMXqgLzB3xWsHTkWKykcvcT2lXI,37245
|
|
366
366
|
dstack/_internal/server/services/storage.py,sha256=6I0xI_3_RpJNbKZwHjDnjrEwXGdHfiaeb5li15T-M1I,1884
|
|
@@ -385,7 +385,7 @@ dstack/_internal/server/services/jobs/configurators/service.py,sha256=FOWrLE-6YF
|
|
|
385
385
|
dstack/_internal/server/services/jobs/configurators/task.py,sha256=0-B3oO-61Eq4-mmlLmqJPliFKHhvvIV0tqc12slcQuA,1436
|
|
386
386
|
dstack/_internal/server/services/jobs/configurators/extensions/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
387
387
|
dstack/_internal/server/services/jobs/configurators/extensions/base.py,sha256=xJbHxaaSJ1zjn8zuuApP1Xt2uBaedPhhc-IY0NtDDJQ,418
|
|
388
|
-
dstack/_internal/server/services/jobs/configurators/extensions/cursor.py,sha256=
|
|
388
|
+
dstack/_internal/server/services/jobs/configurators/extensions/cursor.py,sha256=00HB1kC_eMlioEW0nZI7Ly78b-RSehySVNhC9pajBP8,1636
|
|
389
389
|
dstack/_internal/server/services/jobs/configurators/extensions/vscode.py,sha256=DAj8OEVLyL1x8Jko2EXKhnAkcSnlO1sJk6o6eiiVkDI,1611
|
|
390
390
|
dstack/_internal/server/services/logs/__init__.py,sha256=NAjO1KeYvuDznN2EkfAaJt9S6Y00fo_dl3ob3WmsdGQ,3088
|
|
391
391
|
dstack/_internal/server/services/logs/aws.py,sha256=949k8t9H9v_-aedDjDWkw8yPVyhZemmsszcDDEL5Tb4,13711
|
|
@@ -431,9 +431,9 @@ dstack/_internal/server/statics/e467d7d60aae81ab198b.svg,sha256=_XHc9mfQZgGkcy4h
|
|
|
431
431
|
dstack/_internal/server/statics/eb9b344b73818fe2b71a.png,sha256=2H14eOCQRyZhFGJ1Kn2LH1j70kTF1Qop4vH-tiKqyPI,85
|
|
432
432
|
dstack/_internal/server/statics/f517dd626eb964120de0.png,sha256=4QQuNa8SqmcZ67HK6739OHCyjnAJseU1bkcn454KRQs,159
|
|
433
433
|
dstack/_internal/server/statics/f958aecddee5d8e3222c.png,sha256=8CoZkVNgRfOAe62X1dU-AZDvwh_nESKaQblEmaX2Xrs,87
|
|
434
|
-
dstack/_internal/server/statics/index.html,sha256=
|
|
435
|
-
dstack/_internal/server/statics/main-
|
|
436
|
-
dstack/_internal/server/statics/main-
|
|
434
|
+
dstack/_internal/server/statics/index.html,sha256=1RngYCyoktp3XhwHvC2jeCdolq0T-w-LDfNJHInNeI0,10468
|
|
435
|
+
dstack/_internal/server/statics/main-4a0fe83e84574654e397.js,sha256=lNEeGSQNL6XYc_nXkZwtL35FOJkqbAGUexPC38feqsQ,6204421
|
|
436
|
+
dstack/_internal/server/statics/main-4a0fe83e84574654e397.js.map,sha256=WoJHj0c5pkjUN-aGI-v8oyylaxU3y4KgJ_G8QiwtvA8,8143363
|
|
437
437
|
dstack/_internal/server/statics/main-da9f8c06a69c20dac23e.css,sha256=2ObS4Rg6yWkbk7nQqw-lnLVdYC3L8X5N7xZMKcavMWg,1296312
|
|
438
438
|
dstack/_internal/server/statics/manifest.json,sha256=430w2BoWVmYYVr14lDvUxx-ROPt3VjigzeMqfLeiSCM,340
|
|
439
439
|
dstack/_internal/server/statics/robots.txt,sha256=kNJLw79pisHhc3OVAimMzKcq3x9WT6sF9IS4xI0crdI,67
|
|
@@ -639,7 +639,7 @@ tests/_internal/server/background/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeR
|
|
|
639
639
|
tests/_internal/server/background/tasks/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
640
640
|
tests/_internal/server/background/tasks/test_process_fleets.py,sha256=Dl31_TwxoCzYqkVNPWGLsYxmGL2sZfEK3rQXLFyPIz8,2701
|
|
641
641
|
tests/_internal/server/background/tasks/test_process_gateways.py,sha256=lOP4jPXDtadAgYp0aFND_fp5R_X19M58CaOlgnDAEck,5085
|
|
642
|
-
tests/_internal/server/background/tasks/test_process_instances.py,sha256=
|
|
642
|
+
tests/_internal/server/background/tasks/test_process_instances.py,sha256=WC32HvynBuSxwFtAyMTHS4eVzqCnyGufcrIUTEVoozI,27944
|
|
643
643
|
tests/_internal/server/background/tasks/test_process_metrics.py,sha256=z-u4HXJE5EMVH9kwU_POHmvp55ldAvuLpEMkaebBtsg,4976
|
|
644
644
|
tests/_internal/server/background/tasks/test_process_placement_groups.py,sha256=19LYbIMZIIeKAN0b9KOMyS-cHUx0FoOojqQuM8Oeiq4,1620
|
|
645
645
|
tests/_internal/server/background/tasks/test_process_prometheus_metrics.py,sha256=I9DgIJXVGS7UvbFgm4HFnzWiCICBpy72NjDPKU_7WII,7178
|
|
@@ -656,7 +656,7 @@ tests/_internal/server/routers/test_instances.py,sha256=78HFMU9Xel8BNZL3TqnuvrKE
|
|
|
656
656
|
tests/_internal/server/routers/test_logs.py,sha256=NZwyJlgjMOGq4XEx7-VDjTpniYPhZpsbZvB0dTawaog,3989
|
|
657
657
|
tests/_internal/server/routers/test_metrics.py,sha256=xMdDFZW73Zl06QfggjatfwTut37s0soeliJivkCgBks,7620
|
|
658
658
|
tests/_internal/server/routers/test_projects.py,sha256=Z3Ok7onAjUYS4ADvKvN-SwSxYKvlvf4MG5Y8baqQU14,25964
|
|
659
|
-
tests/_internal/server/routers/test_prometheus.py,sha256=
|
|
659
|
+
tests/_internal/server/routers/test_prometheus.py,sha256=LqJwWn5ztSLIGnvZgj-sD7BFW-JuePFt6k__ymF5Btw,22711
|
|
660
660
|
tests/_internal/server/routers/test_repos.py,sha256=G4dKuFGd_UrxAHwh_XLl1xCHK_DCsiJcXBsHODw3yJk,16682
|
|
661
661
|
tests/_internal/server/routers/test_runs.py,sha256=q02oBrUcp4JoJOL68jbxlfFxH9B8JO9Bkb7v_Qg-Aug,62984
|
|
662
662
|
tests/_internal/server/routers/test_server.py,sha256=ROkuRNNJEkMQuK8guZ3Qy3iRRfiWvPIJJJDc09BI0D4,489
|
|
@@ -701,9 +701,9 @@ tests/_internal/utils/test_path.py,sha256=rzS-1YCxsFUocBe42dghLOMFNymPruGrA7bqFZ
|
|
|
701
701
|
tests/_internal/utils/test_ssh.py,sha256=V-cBFPhD--9eM9d1uQQgpj2gnYLA3c43f4cX9uJ6E-U,1743
|
|
702
702
|
tests/api/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
703
703
|
tests/api/test_utils.py,sha256=SSSqHcNE5cZVqDq4n2sKZthRoXaZ_Bx7z1AAN5xTM9s,391
|
|
704
|
-
dstack-0.19.
|
|
705
|
-
dstack-0.19.
|
|
706
|
-
dstack-0.19.
|
|
707
|
-
dstack-0.19.
|
|
708
|
-
dstack-0.19.
|
|
709
|
-
dstack-0.19.
|
|
704
|
+
dstack-0.19.1.dist-info/LICENSE.md,sha256=qDABaRGjSKVOib1U8viw2P_96sIK7Puo426784oD9f8,15976
|
|
705
|
+
dstack-0.19.1.dist-info/METADATA,sha256=sj_wcanWBaGU9ecMn6I32zvXuYFniyN_6K6ehbcO3tA,18231
|
|
706
|
+
dstack-0.19.1.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
|
|
707
|
+
dstack-0.19.1.dist-info/entry_points.txt,sha256=GnLrMS8hx3rWAySQjA7tPNhtixV6a-brRkmal1PKoHc,58
|
|
708
|
+
dstack-0.19.1.dist-info/top_level.txt,sha256=3BrIO1zrqxT9P20ymhRM6k15meZXzbPL6ykBlDZG2_k,13
|
|
709
|
+
dstack-0.19.1.dist-info/RECORD,,
|
|
@@ -8,7 +8,7 @@ import pytest
|
|
|
8
8
|
from freezegun import freeze_time
|
|
9
9
|
from sqlalchemy.ext.asyncio import AsyncSession
|
|
10
10
|
|
|
11
|
-
from dstack._internal.core.errors import BackendError
|
|
11
|
+
from dstack._internal.core.errors import BackendError, ProvisioningError
|
|
12
12
|
from dstack._internal.core.models.backends.base import BackendType
|
|
13
13
|
from dstack._internal.core.models.instances import (
|
|
14
14
|
Gpu,
|
|
@@ -35,6 +35,8 @@ from dstack._internal.server.testing.common import (
|
|
|
35
35
|
create_repo,
|
|
36
36
|
create_run,
|
|
37
37
|
create_user,
|
|
38
|
+
get_instance_offer_with_availability,
|
|
39
|
+
get_job_provisioning_data,
|
|
38
40
|
get_remote_connection_info,
|
|
39
41
|
)
|
|
40
42
|
from dstack._internal.utils.common import get_current_datetime
|
|
@@ -557,6 +559,68 @@ class TestCreateInstance:
|
|
|
557
559
|
assert instance.total_blocks == expected_blocks
|
|
558
560
|
assert instance.busy_blocks == 0
|
|
559
561
|
|
|
562
|
+
@pytest.mark.parametrize("err", [RuntimeError("Unexpected"), ProvisioningError("Expected")])
|
|
563
|
+
async def test_tries_second_offer_if_first_fails(self, session: AsyncSession, err: Exception):
|
|
564
|
+
project = await create_project(session=session)
|
|
565
|
+
instance = await create_instance(
|
|
566
|
+
session=session, project=project, status=InstanceStatus.PENDING
|
|
567
|
+
)
|
|
568
|
+
aws_mock = Mock()
|
|
569
|
+
aws_mock.TYPE = BackendType.AWS
|
|
570
|
+
offer = get_instance_offer_with_availability(backend=BackendType.AWS, price=1.0)
|
|
571
|
+
aws_mock.compute.return_value = Mock(spec=ComputeMockSpec)
|
|
572
|
+
aws_mock.compute.return_value.get_offers_cached.return_value = [offer]
|
|
573
|
+
aws_mock.compute.return_value.create_instance.side_effect = err
|
|
574
|
+
gcp_mock = Mock()
|
|
575
|
+
gcp_mock.TYPE = BackendType.GCP
|
|
576
|
+
offer = get_instance_offer_with_availability(backend=BackendType.GCP, price=2.0)
|
|
577
|
+
gcp_mock.compute.return_value = Mock(spec=ComputeMockSpec)
|
|
578
|
+
gcp_mock.compute.return_value.get_offers_cached.return_value = [offer]
|
|
579
|
+
gcp_mock.compute.return_value.create_instance.return_value = get_job_provisioning_data(
|
|
580
|
+
backend=offer.backend, region=offer.region, price=offer.price
|
|
581
|
+
)
|
|
582
|
+
with patch("dstack._internal.server.services.backends.get_project_backends") as m:
|
|
583
|
+
m.return_value = [aws_mock, gcp_mock]
|
|
584
|
+
await process_instances()
|
|
585
|
+
|
|
586
|
+
await session.refresh(instance)
|
|
587
|
+
assert instance.status == InstanceStatus.PROVISIONING
|
|
588
|
+
aws_mock.compute.return_value.create_instance.assert_called_once()
|
|
589
|
+
assert instance.backend == BackendType.GCP
|
|
590
|
+
|
|
591
|
+
@pytest.mark.parametrize("err", [RuntimeError("Unexpected"), ProvisioningError("Expected")])
|
|
592
|
+
async def test_fails_if_all_offers_fail(self, session: AsyncSession, err: Exception):
|
|
593
|
+
project = await create_project(session=session)
|
|
594
|
+
instance = await create_instance(
|
|
595
|
+
session=session, project=project, status=InstanceStatus.PENDING
|
|
596
|
+
)
|
|
597
|
+
aws_mock = Mock()
|
|
598
|
+
aws_mock.TYPE = BackendType.AWS
|
|
599
|
+
offer = get_instance_offer_with_availability(backend=BackendType.AWS, price=1.0)
|
|
600
|
+
aws_mock.compute.return_value = Mock(spec=ComputeMockSpec)
|
|
601
|
+
aws_mock.compute.return_value.get_offers_cached.return_value = [offer]
|
|
602
|
+
aws_mock.compute.return_value.create_instance.side_effect = err
|
|
603
|
+
with patch("dstack._internal.server.services.backends.get_project_backends") as m:
|
|
604
|
+
m.return_value = [aws_mock]
|
|
605
|
+
await process_instances()
|
|
606
|
+
|
|
607
|
+
await session.refresh(instance)
|
|
608
|
+
assert instance.status == InstanceStatus.TERMINATED
|
|
609
|
+
assert instance.termination_reason == "All offers failed"
|
|
610
|
+
|
|
611
|
+
async def test_fails_if_no_offers(self, session: AsyncSession):
|
|
612
|
+
project = await create_project(session=session)
|
|
613
|
+
instance = await create_instance(
|
|
614
|
+
session=session, project=project, status=InstanceStatus.PENDING
|
|
615
|
+
)
|
|
616
|
+
with patch("dstack._internal.server.services.backends.get_project_backends") as m:
|
|
617
|
+
m.return_value = []
|
|
618
|
+
await process_instances()
|
|
619
|
+
|
|
620
|
+
await session.refresh(instance)
|
|
621
|
+
assert instance.status == InstanceStatus.TERMINATED
|
|
622
|
+
assert instance.termination_reason == "No offers found"
|
|
623
|
+
|
|
560
624
|
|
|
561
625
|
@pytest.mark.asyncio
|
|
562
626
|
@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True)
|
|
@@ -9,14 +9,20 @@ from sqlalchemy.ext.asyncio import AsyncSession
|
|
|
9
9
|
|
|
10
10
|
from dstack._internal.core.models.backends.base import BackendType
|
|
11
11
|
from dstack._internal.core.models.configurations import DevEnvironmentConfiguration
|
|
12
|
-
from dstack._internal.core.models.runs import
|
|
12
|
+
from dstack._internal.core.models.runs import (
|
|
13
|
+
JobProvisioningData,
|
|
14
|
+
JobRuntimeData,
|
|
15
|
+
JobStatus,
|
|
16
|
+
RunStatus,
|
|
17
|
+
)
|
|
13
18
|
from dstack._internal.core.models.users import GlobalRole, ProjectRole
|
|
14
|
-
from dstack._internal.server.models import JobModel, ProjectModel, UserModel
|
|
19
|
+
from dstack._internal.server.models import JobModel, ProjectModel, RunModel, UserModel
|
|
15
20
|
from dstack._internal.server.services.projects import add_project_member
|
|
16
21
|
from dstack._internal.server.testing.common import (
|
|
17
22
|
create_fleet,
|
|
18
23
|
create_instance,
|
|
19
24
|
create_job,
|
|
25
|
+
create_job_metrics_point,
|
|
20
26
|
create_job_prometheus_metrics,
|
|
21
27
|
create_project,
|
|
22
28
|
create_repo,
|
|
@@ -45,11 +51,21 @@ class TestGetPrometheusMetrics:
|
|
|
45
51
|
async def test_returns_metrics(self, session: AsyncSession, client: AsyncClient):
|
|
46
52
|
user = await create_user(session=session, name="test-user", global_role=GlobalRole.USER)
|
|
47
53
|
offer = get_instance_offer_with_availability(
|
|
48
|
-
instance_type="test-type",
|
|
54
|
+
instance_type="test-type",
|
|
55
|
+
cpu_count=32,
|
|
56
|
+
memory_gib=128,
|
|
57
|
+
gpu_count=2,
|
|
58
|
+
gpu_name="V4",
|
|
59
|
+
price=12,
|
|
49
60
|
)
|
|
50
61
|
project_2 = await _create_project(session, "project-2", user)
|
|
51
62
|
jpd_2_1 = get_job_provisioning_data(
|
|
52
|
-
backend=BackendType.AWS,
|
|
63
|
+
backend=BackendType.AWS,
|
|
64
|
+
cpu_count=16,
|
|
65
|
+
memory_gib=64,
|
|
66
|
+
gpu_name="T4",
|
|
67
|
+
gpu_count=2,
|
|
68
|
+
price=16,
|
|
53
69
|
)
|
|
54
70
|
job_2_1 = await _create_job(
|
|
55
71
|
session=session,
|
|
@@ -100,7 +116,41 @@ class TestGetPrometheusMetrics:
|
|
|
100
116
|
FIELD_2{gpu="1"} 987169 1395066363010
|
|
101
117
|
"""),
|
|
102
118
|
)
|
|
103
|
-
|
|
119
|
+
await create_job_metrics_point(
|
|
120
|
+
session=session,
|
|
121
|
+
job_model=job_1_1,
|
|
122
|
+
timestamp=FAKE_NOW - timedelta(seconds=30),
|
|
123
|
+
cpu_usage_micro=3_500_000,
|
|
124
|
+
memory_working_set_bytes=3_221_225_472,
|
|
125
|
+
memory_usage_bytes=4_294_967_296,
|
|
126
|
+
)
|
|
127
|
+
# Older, ignored
|
|
128
|
+
await create_job_metrics_point(
|
|
129
|
+
session=session,
|
|
130
|
+
job_model=job_1_1,
|
|
131
|
+
timestamp=FAKE_NOW - timedelta(seconds=60),
|
|
132
|
+
cpu_usage_micro=2_000_000,
|
|
133
|
+
memory_working_set_bytes=1_073_741_824,
|
|
134
|
+
memory_usage_bytes=2_147_483_648,
|
|
135
|
+
)
|
|
136
|
+
jpd_1_2 = get_job_provisioning_data(
|
|
137
|
+
backend=BackendType.AWS,
|
|
138
|
+
cpu_count=24,
|
|
139
|
+
memory_gib=224,
|
|
140
|
+
gpu_count=3,
|
|
141
|
+
gpu_name="L4",
|
|
142
|
+
price=12.5,
|
|
143
|
+
)
|
|
144
|
+
job_1_2 = await _create_job(
|
|
145
|
+
session=session,
|
|
146
|
+
run_name="run-2",
|
|
147
|
+
project=project_1,
|
|
148
|
+
user=user,
|
|
149
|
+
status=JobStatus.RUNNING,
|
|
150
|
+
job_provisioning_data=jpd_1_2,
|
|
151
|
+
submitted_at=FAKE_NOW - timedelta(seconds=150),
|
|
152
|
+
)
|
|
153
|
+
|
|
104
154
|
await create_job_prometheus_metrics(
|
|
105
155
|
session=session,
|
|
106
156
|
job=job_1_2,
|
|
@@ -124,6 +174,15 @@ class TestGetPrometheusMetrics:
|
|
|
124
174
|
FIELD_1{gpu="1"} 20
|
|
125
175
|
"""),
|
|
126
176
|
)
|
|
177
|
+
await _create_run(session, "done", project_1, user, RunStatus.DONE)
|
|
178
|
+
other_user = await create_user(
|
|
179
|
+
session=session, name="other-user", global_role=GlobalRole.USER
|
|
180
|
+
)
|
|
181
|
+
await add_project_member(
|
|
182
|
+
session=session, project=project_2, user=other_user, project_role=ProjectRole.USER
|
|
183
|
+
)
|
|
184
|
+
await _create_run(session, "failed-1", project_2, other_user, RunStatus.FAILED)
|
|
185
|
+
await _create_run(session, "failed-2", project_2, other_user, RunStatus.FAILED)
|
|
127
186
|
fleet = await create_fleet(session=session, project=project_1, name="test-fleet")
|
|
128
187
|
instance = await create_instance(
|
|
129
188
|
session=session,
|
|
@@ -149,31 +208,73 @@ class TestGetPrometheusMetrics:
|
|
|
149
208
|
# HELP dstack_instance_gpu_count Instance GPU count
|
|
150
209
|
# TYPE dstack_instance_gpu_count gauge
|
|
151
210
|
dstack_instance_gpu_count{{dstack_project_name="project-1",dstack_fleet_name="test-fleet",dstack_fleet_id="{fleet.id}",dstack_instance_name="test-instance",dstack_instance_id="{instance.id}",dstack_instance_type="test-type",dstack_backend="aws",dstack_gpu="V4"}} 2.0
|
|
211
|
+
# HELP dstack_run_count_total Total runs count
|
|
212
|
+
# TYPE dstack_run_count_total counter
|
|
213
|
+
dstack_run_count_total{{dstack_project_name="project-1",dstack_user_name="test-user"}} 4.0
|
|
214
|
+
dstack_run_count_total{{dstack_project_name="project-2",dstack_user_name="other-user"}} 2.0
|
|
215
|
+
dstack_run_count_total{{dstack_project_name="project-2",dstack_user_name="test-user"}} 1.0
|
|
216
|
+
# HELP dstack_run_count_terminated_total Terminated runs count
|
|
217
|
+
# TYPE dstack_run_count_terminated_total counter
|
|
218
|
+
dstack_run_count_terminated_total{{dstack_project_name="project-1",dstack_user_name="test-user"}} 0.0
|
|
219
|
+
dstack_run_count_terminated_total{{dstack_project_name="project-2",dstack_user_name="other-user"}} 0.0
|
|
220
|
+
dstack_run_count_terminated_total{{dstack_project_name="project-2",dstack_user_name="test-user"}} 0.0
|
|
221
|
+
# HELP dstack_run_count_failed_total Failed runs count
|
|
222
|
+
# TYPE dstack_run_count_failed_total counter
|
|
223
|
+
dstack_run_count_failed_total{{dstack_project_name="project-1",dstack_user_name="test-user"}} 0.0
|
|
224
|
+
dstack_run_count_failed_total{{dstack_project_name="project-2",dstack_user_name="other-user"}} 2.0
|
|
225
|
+
dstack_run_count_failed_total{{dstack_project_name="project-2",dstack_user_name="test-user"}} 0.0
|
|
226
|
+
# HELP dstack_run_count_done_total Done runs count
|
|
227
|
+
# TYPE dstack_run_count_done_total counter
|
|
228
|
+
dstack_run_count_done_total{{dstack_project_name="project-1",dstack_user_name="test-user"}} 1.0
|
|
229
|
+
dstack_run_count_done_total{{dstack_project_name="project-2",dstack_user_name="other-user"}} 0.0
|
|
230
|
+
dstack_run_count_done_total{{dstack_project_name="project-2",dstack_user_name="test-user"}} 0.0
|
|
152
231
|
# HELP dstack_job_duration_seconds_total Total seconds the job is running
|
|
153
232
|
# TYPE dstack_job_duration_seconds_total counter
|
|
154
233
|
dstack_job_duration_seconds_total{{dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="V4"}} 120.0
|
|
234
|
+
dstack_job_duration_seconds_total{{dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-2",dstack_run_id="{job_1_2.run_id}",dstack_job_name="run-2-0-0",dstack_job_id="{job_1_2.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="L4"}} 150.0
|
|
155
235
|
dstack_job_duration_seconds_total{{dstack_project_name="project-2",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_2_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_2_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="T4"}} 100.0
|
|
156
236
|
# HELP dstack_job_price_dollars_per_hour Job instance price, USD/hour
|
|
157
237
|
# TYPE dstack_job_price_dollars_per_hour gauge
|
|
158
238
|
dstack_job_price_dollars_per_hour{{dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="V4"}} 12.0
|
|
239
|
+
dstack_job_price_dollars_per_hour{{dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-2",dstack_run_id="{job_1_2.run_id}",dstack_job_name="run-2-0-0",dstack_job_id="{job_1_2.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="L4"}} 12.5
|
|
159
240
|
dstack_job_price_dollars_per_hour{{dstack_project_name="project-2",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_2_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_2_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="T4"}} 16.0
|
|
160
241
|
# HELP dstack_job_gpu_count Job GPU count
|
|
161
242
|
# TYPE dstack_job_gpu_count gauge
|
|
162
243
|
dstack_job_gpu_count{{dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="V4"}} 2.0
|
|
244
|
+
dstack_job_gpu_count{{dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-2",dstack_run_id="{job_1_2.run_id}",dstack_job_name="run-2-0-0",dstack_job_id="{job_1_2.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="L4"}} 3.0
|
|
163
245
|
dstack_job_gpu_count{{dstack_project_name="project-2",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_2_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_2_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="T4"}} 2.0
|
|
246
|
+
# HELP dstack_job_cpu_count Job CPU count
|
|
247
|
+
# TYPE dstack_job_cpu_count gauge
|
|
248
|
+
dstack_job_cpu_count{{dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="V4"}} 32.0
|
|
249
|
+
dstack_job_cpu_count{{dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-2",dstack_run_id="{job_1_2.run_id}",dstack_job_name="run-2-0-0",dstack_job_id="{job_1_2.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="L4"}} 24.0
|
|
250
|
+
dstack_job_cpu_count{{dstack_project_name="project-2",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_2_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_2_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="T4"}} 16.0
|
|
251
|
+
# HELP dstack_job_cpu_time_seconds_total Total CPU time consumed by the job, seconds
|
|
252
|
+
# TYPE dstack_job_cpu_time_seconds_total counter
|
|
253
|
+
dstack_job_cpu_time_seconds_total{{dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="V4"}} 3.5
|
|
254
|
+
# HELP dstack_job_memory_total_bytes Total memory allocated for the job, bytes
|
|
255
|
+
# TYPE dstack_job_memory_total_bytes gauge
|
|
256
|
+
dstack_job_memory_total_bytes{{dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="V4"}} 137438953472.0
|
|
257
|
+
dstack_job_memory_total_bytes{{dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-2",dstack_run_id="{job_1_2.run_id}",dstack_job_name="run-2-0-0",dstack_job_id="{job_1_2.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="L4"}} 240518168576.0
|
|
258
|
+
dstack_job_memory_total_bytes{{dstack_project_name="project-2",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_2_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_2_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="T4"}} 68719476736.0
|
|
259
|
+
# HELP dstack_job_memory_usage_bytes Memory used by the job (including cache), bytes
|
|
260
|
+
# TYPE dstack_job_memory_usage_bytes gauge
|
|
261
|
+
dstack_job_memory_usage_bytes{{dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="V4"}} 4294967296.0
|
|
262
|
+
# HELP dstack_job_memory_working_set_bytes Memory used by the job (not including cache), bytes
|
|
263
|
+
# TYPE dstack_job_memory_working_set_bytes gauge
|
|
264
|
+
dstack_job_memory_working_set_bytes{{dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="V4"}} 3221225472.0
|
|
164
265
|
# HELP FIELD_1 Test field 1
|
|
165
266
|
# TYPE FIELD_1 gauge
|
|
166
|
-
FIELD_1{{gpu="0",dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0"}} 350.0
|
|
167
|
-
FIELD_1{{gpu="1",dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0"}} 400.0
|
|
168
|
-
FIELD_1{{gpu="0",dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-2",dstack_run_id="{job_1_2.run_id}",dstack_job_name="run-2-0-0",dstack_job_id="{job_1_2.id}",dstack_job_num="0",dstack_replica_num="0"}} 1200.0
|
|
169
|
-
FIELD_1{{gpu="1",dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-2",dstack_run_id="{job_1_2.run_id}",dstack_job_name="run-2-0-0",dstack_job_id="{job_1_2.id}",dstack_job_num="0",dstack_replica_num="0"}} 1600.0
|
|
170
|
-
FIELD_1{{gpu="2",dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-2",dstack_run_id="{job_1_2.run_id}",dstack_job_name="run-2-0-0",dstack_job_id="{job_1_2.id}",dstack_job_num="0",dstack_replica_num="0"}} 2400.0
|
|
171
|
-
FIELD_1{{gpu="0",dstack_project_name="project-2",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_2_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_2_1.id}",dstack_job_num="0",dstack_replica_num="0"}} 100.0
|
|
172
|
-
FIELD_1{{gpu="1",dstack_project_name="project-2",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_2_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_2_1.id}",dstack_job_num="0",dstack_replica_num="0"}} 200.0
|
|
267
|
+
FIELD_1{{gpu="0",dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="V4"}} 350.0
|
|
268
|
+
FIELD_1{{gpu="1",dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="V4"}} 400.0
|
|
269
|
+
FIELD_1{{gpu="0",dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-2",dstack_run_id="{job_1_2.run_id}",dstack_job_name="run-2-0-0",dstack_job_id="{job_1_2.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="L4"}} 1200.0
|
|
270
|
+
FIELD_1{{gpu="1",dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-2",dstack_run_id="{job_1_2.run_id}",dstack_job_name="run-2-0-0",dstack_job_id="{job_1_2.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="L4"}} 1600.0
|
|
271
|
+
FIELD_1{{gpu="2",dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-2",dstack_run_id="{job_1_2.run_id}",dstack_job_name="run-2-0-0",dstack_job_id="{job_1_2.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="L4"}} 2400.0
|
|
272
|
+
FIELD_1{{gpu="0",dstack_project_name="project-2",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_2_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_2_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="T4"}} 100.0
|
|
273
|
+
FIELD_1{{gpu="1",dstack_project_name="project-2",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_2_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_2_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="T4"}} 200.0
|
|
173
274
|
# HELP FIELD_2 Test field 2
|
|
174
275
|
# TYPE FIELD_2 counter
|
|
175
|
-
FIELD_2{{gpu="0",dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0"}} 337325.0 1395066363000
|
|
176
|
-
FIELD_2{{gpu="1",dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0"}} 987169.0 1395066363010
|
|
276
|
+
FIELD_2{{gpu="0",dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="V4"}} 337325.0 1395066363000
|
|
277
|
+
FIELD_2{{gpu="1",dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="V4"}} 987169.0 1395066363010
|
|
177
278
|
""")
|
|
178
279
|
|
|
179
280
|
async def test_returns_empty_response_if_no_runs(self, client: AsyncClient):
|
|
@@ -189,110 +290,6 @@ class TestGetPrometheusMetrics:
|
|
|
189
290
|
assert response.status_code == 404
|
|
190
291
|
|
|
191
292
|
|
|
192
|
-
@pytest.mark.asyncio
|
|
193
|
-
@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True)
|
|
194
|
-
@pytest.mark.usefixtures("image_config_mock", "test_db", "enable_metrics")
|
|
195
|
-
class TestGetPrometheusProjectMetrics:
|
|
196
|
-
async def test_returns_metrics(self, session: AsyncSession, client: AsyncClient):
|
|
197
|
-
user = await create_user(session=session, name="test-user", global_role=GlobalRole.USER)
|
|
198
|
-
project = await _create_project(session, "project-1", user)
|
|
199
|
-
job_1 = await _create_job(session, "run-1", project, user, JobStatus.RUNNING)
|
|
200
|
-
await create_job_prometheus_metrics(
|
|
201
|
-
session=session,
|
|
202
|
-
job=job_1,
|
|
203
|
-
text=dedent("""
|
|
204
|
-
# Comments should be skipped
|
|
205
|
-
|
|
206
|
-
# HELP FIELD_1 Test field 1
|
|
207
|
-
# TYPE FIELD_1 gauge
|
|
208
|
-
FIELD_1{gpu="0"} 350
|
|
209
|
-
FIELD_1{gpu="1"} 400
|
|
210
|
-
|
|
211
|
-
# HELP FIELD_2 Test field 2
|
|
212
|
-
# TYPE FIELD_2 counter
|
|
213
|
-
FIELD_2{gpu="0"} 337325 1395066363000
|
|
214
|
-
FIELD_2{gpu="1"} 987169 1395066363010
|
|
215
|
-
"""),
|
|
216
|
-
)
|
|
217
|
-
job_2 = await _create_job(session, "run-2", project, user, JobStatus.RUNNING)
|
|
218
|
-
await create_job_prometheus_metrics(
|
|
219
|
-
session=session,
|
|
220
|
-
job=job_2,
|
|
221
|
-
text=dedent("""
|
|
222
|
-
# HELP FIELD_1 Test field 1
|
|
223
|
-
# TYPE FIELD_1 gauge
|
|
224
|
-
FIELD_1{gpu="0"} 1200.0
|
|
225
|
-
FIELD_1{gpu="1"} 1600.0
|
|
226
|
-
FIELD_1{gpu="2"} 2400.0
|
|
227
|
-
"""),
|
|
228
|
-
)
|
|
229
|
-
# Terminated job, should not appear in the response
|
|
230
|
-
job_3 = await _create_job(session, "run-3", project, user, JobStatus.TERMINATED)
|
|
231
|
-
await create_job_prometheus_metrics(
|
|
232
|
-
session=session,
|
|
233
|
-
job=job_3,
|
|
234
|
-
text=dedent("""
|
|
235
|
-
# HELP FIELD_1 Test field 1
|
|
236
|
-
# TYPE FIELD_1 gauge
|
|
237
|
-
FIELD_1{gpu="0"} 10
|
|
238
|
-
FIELD_1{gpu="1"} 20
|
|
239
|
-
"""),
|
|
240
|
-
)
|
|
241
|
-
another_project = await _create_project(session, "project-2", user)
|
|
242
|
-
another_project_job = await _create_job(
|
|
243
|
-
session, "run-4", another_project, user, JobStatus.RUNNING
|
|
244
|
-
)
|
|
245
|
-
await create_job_prometheus_metrics(
|
|
246
|
-
session=session,
|
|
247
|
-
job=another_project_job,
|
|
248
|
-
text=dedent("""
|
|
249
|
-
# HELP FIELD_1 Test field 1
|
|
250
|
-
# TYPE FIELD_1 gauge
|
|
251
|
-
FIELD_1{gpu="0"} 100
|
|
252
|
-
FIELD_1{gpu="1"} 200
|
|
253
|
-
"""),
|
|
254
|
-
)
|
|
255
|
-
|
|
256
|
-
response = await client.get("/metrics/project/project-1")
|
|
257
|
-
|
|
258
|
-
assert response.status_code == 200
|
|
259
|
-
assert response.text == dedent(f"""\
|
|
260
|
-
# HELP FIELD_1 Test field 1
|
|
261
|
-
# TYPE FIELD_1 gauge
|
|
262
|
-
FIELD_1{{gpu="0",dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1.id}",dstack_job_num="0",dstack_replica_num="0"}} 350.0
|
|
263
|
-
FIELD_1{{gpu="1",dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1.id}",dstack_job_num="0",dstack_replica_num="0"}} 400.0
|
|
264
|
-
FIELD_1{{gpu="0",dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-2",dstack_run_id="{job_2.run_id}",dstack_job_name="run-2-0-0",dstack_job_id="{job_2.id}",dstack_job_num="0",dstack_replica_num="0"}} 1200.0
|
|
265
|
-
FIELD_1{{gpu="1",dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-2",dstack_run_id="{job_2.run_id}",dstack_job_name="run-2-0-0",dstack_job_id="{job_2.id}",dstack_job_num="0",dstack_replica_num="0"}} 1600.0
|
|
266
|
-
FIELD_1{{gpu="2",dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-2",dstack_run_id="{job_2.run_id}",dstack_job_name="run-2-0-0",dstack_job_id="{job_2.id}",dstack_job_num="0",dstack_replica_num="0"}} 2400.0
|
|
267
|
-
# HELP FIELD_2 Test field 2
|
|
268
|
-
# TYPE FIELD_2 counter
|
|
269
|
-
FIELD_2{{gpu="0",dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1.id}",dstack_job_num="0",dstack_replica_num="0"}} 337325.0 1395066363000
|
|
270
|
-
FIELD_2{{gpu="1",dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1.id}",dstack_job_num="0",dstack_replica_num="0"}} 987169.0 1395066363010
|
|
271
|
-
""")
|
|
272
|
-
|
|
273
|
-
async def test_returns_empty_response_if_no_runs(
|
|
274
|
-
self, session: AsyncSession, client: AsyncClient
|
|
275
|
-
):
|
|
276
|
-
user = await create_user(session=session, global_role=GlobalRole.USER)
|
|
277
|
-
await create_project(session=session, owner=user, name="test-project")
|
|
278
|
-
response = await client.get("/metrics/project/test-project")
|
|
279
|
-
assert response.status_code == 200
|
|
280
|
-
assert response.text == "\n"
|
|
281
|
-
|
|
282
|
-
async def test_returns_404_if_project_doesnt_exist(self, client: AsyncClient):
|
|
283
|
-
response = await client.get("/metrics/project/nonexistent")
|
|
284
|
-
assert response.status_code == 404
|
|
285
|
-
|
|
286
|
-
async def test_returns_404_if_not_enabled(
|
|
287
|
-
self, monkeypatch: pytest.MonkeyPatch, session: AsyncSession, client: AsyncClient
|
|
288
|
-
):
|
|
289
|
-
monkeypatch.setattr("dstack._internal.server.settings.ENABLE_PROMETHEUS_METRICS", False)
|
|
290
|
-
user = await create_user(session=session, global_role=GlobalRole.USER)
|
|
291
|
-
await create_project(session=session, owner=user, name="test-project")
|
|
292
|
-
response = await client.get("/metrics/project/test-project")
|
|
293
|
-
assert response.status_code == 404
|
|
294
|
-
|
|
295
|
-
|
|
296
293
|
async def _create_project(session: AsyncSession, name: str, user: UserModel) -> ProjectModel:
|
|
297
294
|
project = await create_project(session=session, owner=user, name=name)
|
|
298
295
|
await add_project_member(
|
|
@@ -301,26 +298,46 @@ async def _create_project(session: AsyncSession, name: str, user: UserModel) ->
|
|
|
301
298
|
return project
|
|
302
299
|
|
|
303
300
|
|
|
304
|
-
async def _create_job(
|
|
301
|
+
async def _create_run(
|
|
305
302
|
session: AsyncSession,
|
|
306
303
|
run_name: str,
|
|
307
304
|
project: ProjectModel,
|
|
308
305
|
user: UserModel,
|
|
309
|
-
status: JobStatus,
|
|
310
|
-
job_provisioning_data: Optional[JobProvisioningData] = None,
|
|
311
|
-
job_runtime_data: Optional[JobRuntimeData] = None,
|
|
306
|
+
status: RunStatus,
|
|
312
307
|
submitted_at: datetime = FAKE_NOW,
|
|
313
|
-
) -> JobModel:
|
|
308
|
+
) -> RunModel:
|
|
314
309
|
repo = await create_repo(session=session, project_id=project.id, repo_name=f"{run_name}-repo")
|
|
315
310
|
configuration = DevEnvironmentConfiguration(ide="vscode")
|
|
316
311
|
run_spec = get_run_spec(run_name=run_name, repo_id=repo.name, configuration=configuration)
|
|
317
|
-
run = await create_run(
|
|
312
|
+
return await create_run(
|
|
318
313
|
session=session,
|
|
319
314
|
project=project,
|
|
320
315
|
repo=repo,
|
|
321
316
|
user=user,
|
|
322
317
|
run_name=run_name,
|
|
323
318
|
run_spec=run_spec,
|
|
319
|
+
status=status,
|
|
320
|
+
submitted_at=submitted_at,
|
|
321
|
+
)
|
|
322
|
+
|
|
323
|
+
|
|
324
|
+
async def _create_job(
|
|
325
|
+
session: AsyncSession,
|
|
326
|
+
run_name: str,
|
|
327
|
+
project: ProjectModel,
|
|
328
|
+
user: UserModel,
|
|
329
|
+
status: JobStatus,
|
|
330
|
+
job_provisioning_data: Optional[JobProvisioningData] = None,
|
|
331
|
+
job_runtime_data: Optional[JobRuntimeData] = None,
|
|
332
|
+
submitted_at: datetime = FAKE_NOW,
|
|
333
|
+
) -> JobModel:
|
|
334
|
+
run = await _create_run(
|
|
335
|
+
session=session,
|
|
336
|
+
run_name=run_name,
|
|
337
|
+
project=project,
|
|
338
|
+
user=user,
|
|
339
|
+
status=RunStatus.SUBMITTED,
|
|
340
|
+
submitted_at=submitted_at,
|
|
324
341
|
)
|
|
325
342
|
job = await create_job(
|
|
326
343
|
session=session,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|