dstack 0.18.42__py3-none-any.whl → 0.18.44__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of dstack might be problematic; see the registry's advisory page for more details.

Files changed (115)
  1. dstack/_internal/cli/commands/__init__.py +2 -1
  2. dstack/_internal/cli/commands/apply.py +4 -2
  3. dstack/_internal/cli/commands/attach.py +21 -1
  4. dstack/_internal/cli/commands/completion.py +20 -0
  5. dstack/_internal/cli/commands/delete.py +3 -1
  6. dstack/_internal/cli/commands/fleet.py +2 -1
  7. dstack/_internal/cli/commands/gateway.py +7 -2
  8. dstack/_internal/cli/commands/logs.py +3 -2
  9. dstack/_internal/cli/commands/stats.py +2 -1
  10. dstack/_internal/cli/commands/stop.py +2 -1
  11. dstack/_internal/cli/commands/volume.py +2 -1
  12. dstack/_internal/cli/main.py +6 -0
  13. dstack/_internal/cli/services/completion.py +86 -0
  14. dstack/_internal/cli/services/configurators/run.py +11 -17
  15. dstack/_internal/cli/utils/fleet.py +5 -1
  16. dstack/_internal/cli/utils/run.py +11 -0
  17. dstack/_internal/core/backends/aws/compute.py +23 -10
  18. dstack/_internal/core/backends/aws/resources.py +3 -3
  19. dstack/_internal/core/backends/azure/compute.py +15 -9
  20. dstack/_internal/core/backends/azure/resources.py +2 -0
  21. dstack/_internal/core/backends/base/compute.py +102 -2
  22. dstack/_internal/core/backends/base/offers.py +7 -1
  23. dstack/_internal/core/backends/cudo/compute.py +8 -4
  24. dstack/_internal/core/backends/datacrunch/compute.py +10 -4
  25. dstack/_internal/core/backends/gcp/auth.py +19 -13
  26. dstack/_internal/core/backends/gcp/compute.py +26 -20
  27. dstack/_internal/core/backends/gcp/resources.py +3 -10
  28. dstack/_internal/core/backends/kubernetes/compute.py +4 -3
  29. dstack/_internal/core/backends/lambdalabs/compute.py +9 -3
  30. dstack/_internal/core/backends/nebius/compute.py +2 -2
  31. dstack/_internal/core/backends/oci/compute.py +10 -4
  32. dstack/_internal/core/backends/runpod/compute.py +32 -7
  33. dstack/_internal/core/backends/runpod/config.py +8 -0
  34. dstack/_internal/core/backends/tensordock/compute.py +14 -3
  35. dstack/_internal/core/backends/vastai/compute.py +12 -2
  36. dstack/_internal/core/backends/vultr/api_client.py +3 -3
  37. dstack/_internal/core/backends/vultr/compute.py +9 -3
  38. dstack/_internal/core/models/backends/aws.py +2 -0
  39. dstack/_internal/core/models/backends/base.py +1 -0
  40. dstack/_internal/core/models/backends/runpod.py +2 -0
  41. dstack/_internal/core/models/configurations.py +2 -2
  42. dstack/_internal/core/models/profiles.py +46 -1
  43. dstack/_internal/core/models/runs.py +4 -0
  44. dstack/_internal/core/services/__init__.py +5 -1
  45. dstack/_internal/core/services/configs/__init__.py +3 -0
  46. dstack/_internal/server/app.py +11 -1
  47. dstack/_internal/server/background/__init__.py +10 -0
  48. dstack/_internal/server/background/tasks/common.py +22 -0
  49. dstack/_internal/server/background/tasks/process_instances.py +11 -18
  50. dstack/_internal/server/background/tasks/process_placement_groups.py +1 -0
  51. dstack/_internal/server/background/tasks/process_prometheus_metrics.py +135 -0
  52. dstack/_internal/server/background/tasks/process_running_jobs.py +74 -34
  53. dstack/_internal/server/background/tasks/process_runs.py +1 -0
  54. dstack/_internal/server/background/tasks/process_submitted_jobs.py +4 -1
  55. dstack/_internal/server/background/tasks/process_terminating_jobs.py +1 -7
  56. dstack/_internal/server/migrations/versions/60e444118b6d_add_jobprometheusmetrics.py +40 -0
  57. dstack/_internal/server/migrations/versions/98d1b92988bc_add_jobterminationreason_terminated_due_.py +140 -0
  58. dstack/_internal/server/models.py +11 -0
  59. dstack/_internal/server/routers/logs.py +3 -0
  60. dstack/_internal/server/routers/metrics.py +21 -2
  61. dstack/_internal/server/routers/prometheus.py +36 -0
  62. dstack/_internal/server/security/permissions.py +1 -1
  63. dstack/_internal/server/services/backends/configurators/aws.py +31 -1
  64. dstack/_internal/server/services/backends/configurators/gcp.py +8 -15
  65. dstack/_internal/server/services/backends/configurators/runpod.py +3 -33
  66. dstack/_internal/server/services/config.py +24 -4
  67. dstack/_internal/server/services/fleets.py +1 -0
  68. dstack/_internal/server/services/gateways/__init__.py +1 -0
  69. dstack/_internal/server/services/jobs/__init__.py +12 -9
  70. dstack/_internal/server/services/jobs/configurators/base.py +9 -1
  71. dstack/_internal/server/services/jobs/configurators/dev.py +1 -3
  72. dstack/_internal/server/services/jobs/configurators/task.py +1 -3
  73. dstack/_internal/server/services/logs/__init__.py +78 -0
  74. dstack/_internal/server/services/{logs.py → logs/aws.py} +12 -207
  75. dstack/_internal/server/services/logs/base.py +47 -0
  76. dstack/_internal/server/services/logs/filelog.py +110 -0
  77. dstack/_internal/server/services/logs/gcp.py +165 -0
  78. dstack/_internal/server/services/metrics.py +103 -70
  79. dstack/_internal/server/services/pools.py +16 -17
  80. dstack/_internal/server/services/prometheus.py +87 -0
  81. dstack/_internal/server/services/proxy/routers/service_proxy.py +14 -7
  82. dstack/_internal/server/services/runner/client.py +14 -3
  83. dstack/_internal/server/services/runs.py +43 -15
  84. dstack/_internal/server/services/volumes.py +1 -0
  85. dstack/_internal/server/settings.py +6 -0
  86. dstack/_internal/server/statics/index.html +1 -1
  87. dstack/_internal/server/statics/{main-2ac66bfcbd2e39830b88.js → main-4eb116b97819badd1e2c.js} +131 -78
  88. dstack/_internal/server/statics/{main-2ac66bfcbd2e39830b88.js.map → main-4eb116b97819badd1e2c.js.map} +1 -1
  89. dstack/_internal/server/statics/{main-ad5150a441de98cd8987.css → main-da9f8c06a69c20dac23e.css} +1 -1
  90. dstack/_internal/server/statics/static/media/entraID.d65d1f3e9486a8e56d24fc07b3230885.svg +9 -0
  91. dstack/_internal/server/testing/common.py +50 -8
  92. dstack/api/_public/runs.py +4 -1
  93. dstack/api/server/_fleets.py +2 -0
  94. dstack/api/server/_runs.py +4 -0
  95. dstack/api/utils.py +3 -0
  96. dstack/version.py +2 -2
  97. {dstack-0.18.42.dist-info → dstack-0.18.44.dist-info}/METADATA +13 -3
  98. {dstack-0.18.42.dist-info → dstack-0.18.44.dist-info}/RECORD +115 -97
  99. tests/_internal/core/backends/base/__init__.py +0 -0
  100. tests/_internal/core/backends/base/test_compute.py +56 -0
  101. tests/_internal/server/background/tasks/test_process_prometheus_metrics.py +189 -0
  102. tests/_internal/server/background/tasks/test_process_running_jobs.py +126 -1
  103. tests/_internal/server/conftest.py +4 -5
  104. tests/_internal/server/routers/test_backends.py +1 -0
  105. tests/_internal/server/routers/test_fleets.py +2 -0
  106. tests/_internal/server/routers/test_logs.py +1 -1
  107. tests/_internal/server/routers/test_metrics.py +15 -0
  108. tests/_internal/server/routers/test_prometheus.py +244 -0
  109. tests/_internal/server/routers/test_runs.py +81 -58
  110. tests/_internal/server/services/test_logs.py +3 -3
  111. tests/_internal/server/services/test_metrics.py +163 -0
  112. {dstack-0.18.42.dist-info → dstack-0.18.44.dist-info}/LICENSE.md +0 -0
  113. {dstack-0.18.42.dist-info → dstack-0.18.44.dist-info}/WHEEL +0 -0
  114. {dstack-0.18.42.dist-info → dstack-0.18.44.dist-info}/entry_points.txt +0 -0
  115. {dstack-0.18.42.dist-info → dstack-0.18.44.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,189 @@
1
+ from collections.abc import Generator
2
+ from datetime import datetime, timezone
3
+ from unittest.mock import Mock, patch
4
+
5
+ import pytest
6
+ import pytest_asyncio
7
+ from freezegun import freeze_time
8
+ from sqlalchemy import select
9
+ from sqlalchemy.ext.asyncio import AsyncSession
10
+
11
+ from dstack._internal.core.models.instances import InstanceStatus
12
+ from dstack._internal.core.models.runs import JobStatus
13
+ from dstack._internal.core.models.users import GlobalRole, ProjectRole
14
+ from dstack._internal.server.background.tasks.process_prometheus_metrics import (
15
+ collect_prometheus_metrics,
16
+ delete_prometheus_metrics,
17
+ )
18
+ from dstack._internal.server.models import JobModel, JobPrometheusMetrics
19
+ from dstack._internal.server.services.projects import add_project_member
20
+ from dstack._internal.server.testing.common import (
21
+ create_instance,
22
+ create_job,
23
+ create_job_prometheus_metrics,
24
+ create_pool,
25
+ create_project,
26
+ create_repo,
27
+ create_run,
28
+ create_user,
29
+ get_job_provisioning_data,
30
+ )
31
+
32
+
33
+ @pytest.mark.asyncio
34
+ @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True)
35
+ @pytest.mark.usefixtures("test_db", "image_config_mock")
36
+ class TestCollectPrometheusMetrics:
37
+ @pytest_asyncio.fixture
38
+ async def job(self, session: AsyncSession) -> JobModel:
39
+ user = await create_user(session=session, global_role=GlobalRole.USER)
40
+ project = await create_project(session=session, owner=user)
41
+ await add_project_member(
42
+ session=session, project=project, user=user, project_role=ProjectRole.USER
43
+ )
44
+ repo = await create_repo(
45
+ session=session,
46
+ project_id=project.id,
47
+ )
48
+ pool = await create_pool(session=session, project=project)
49
+ instance = await create_instance(
50
+ session=session,
51
+ project=project,
52
+ pool=pool,
53
+ status=InstanceStatus.BUSY,
54
+ )
55
+ run = await create_run(
56
+ session=session,
57
+ project=project,
58
+ repo=repo,
59
+ user=user,
60
+ )
61
+ job = await create_job(
62
+ session=session,
63
+ run=run,
64
+ status=JobStatus.RUNNING,
65
+ job_provisioning_data=get_job_provisioning_data(),
66
+ instance_assigned=True,
67
+ instance=instance,
68
+ )
69
+ return job
70
+
71
+ @pytest.fixture
72
+ def ssh_tunnel_mock(self) -> Generator[Mock, None, None]:
73
+ with patch("dstack._internal.server.services.runner.ssh.SSHTunnel") as SSHTunnelMock:
74
+ yield SSHTunnelMock
75
+
76
+ @pytest.fixture
77
+ def shim_client_mock(self) -> Generator[Mock, None, None]:
78
+ with patch("dstack._internal.server.services.runner.client.ShimClient") as ShimClientMock:
79
+ yield ShimClientMock.return_value
80
+
81
+ @freeze_time(datetime(2023, 1, 2, 3, 5, 20, tzinfo=timezone.utc))
82
+ async def test_inserts_new_record(
83
+ self, session: AsyncSession, job: JobModel, ssh_tunnel_mock: Mock, shim_client_mock: Mock
84
+ ):
85
+ shim_client_mock.get_task_metrics.return_value = "# prom response"
86
+
87
+ await collect_prometheus_metrics()
88
+
89
+ ssh_tunnel_mock.assert_called_once()
90
+ shim_client_mock.get_task_metrics.assert_called_once()
91
+ res = await session.execute(
92
+ select(JobPrometheusMetrics).where(JobPrometheusMetrics.job_id == job.id)
93
+ )
94
+ metrics = res.scalar_one()
95
+ assert metrics.text == "# prom response"
96
+ assert metrics.collected_at == datetime(2023, 1, 2, 3, 5, 20)
97
+
98
+ @freeze_time(datetime(2023, 1, 2, 3, 5, 20, tzinfo=timezone.utc))
99
+ async def test_updates_record(
100
+ self, session: AsyncSession, job: JobModel, ssh_tunnel_mock: Mock, shim_client_mock: Mock
101
+ ):
102
+ metrics = await create_job_prometheus_metrics(
103
+ session=session,
104
+ job=job,
105
+ collected_at=datetime(2023, 1, 2, 3, 5, 0),
106
+ text="# prom old response",
107
+ )
108
+ shim_client_mock.get_task_metrics.return_value = "# prom new response"
109
+
110
+ await collect_prometheus_metrics()
111
+
112
+ ssh_tunnel_mock.assert_called_once()
113
+ shim_client_mock.get_task_metrics.assert_called_once()
114
+ res = await session.execute(
115
+ select(JobPrometheusMetrics)
116
+ .where(JobPrometheusMetrics.job_id == job.id)
117
+ .execution_options(populate_existing=True)
118
+ )
119
+ metrics = res.scalar_one()
120
+ assert metrics.text == "# prom new response"
121
+ assert metrics.collected_at == datetime(2023, 1, 2, 3, 5, 20)
122
+
123
+ @freeze_time(datetime(2023, 1, 2, 3, 5, 20, tzinfo=timezone.utc))
124
+ async def test_skips_recently_updated(
125
+ self, session: AsyncSession, job: JobModel, ssh_tunnel_mock: Mock, shim_client_mock: Mock
126
+ ):
127
+ metrics = await create_job_prometheus_metrics(
128
+ session=session,
129
+ job=job,
130
+ collected_at=datetime(2023, 1, 2, 3, 5, 15),
131
+ text="# prom old response",
132
+ )
133
+ shim_client_mock.get_task_metrics.return_value = "# prom new response"
134
+
135
+ await collect_prometheus_metrics()
136
+
137
+ ssh_tunnel_mock.assert_not_called()
138
+ shim_client_mock.get_task_metrics.assert_not_called()
139
+ res = await session.execute(
140
+ select(JobPrometheusMetrics)
141
+ .where(JobPrometheusMetrics.job_id == job.id)
142
+ .execution_options(populate_existing=True)
143
+ )
144
+ metrics = res.scalar_one()
145
+ assert metrics.text == "# prom old response"
146
+ assert metrics.collected_at == datetime(2023, 1, 2, 3, 5, 15)
147
+
148
+
149
+ @pytest.mark.asyncio
150
+ @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True)
151
+ @pytest.mark.usefixtures("test_db", "image_config_mock")
152
+ class TestDeletePrometheusMetrics:
153
+ @freeze_time(datetime(2023, 1, 2, 3, 5, 20, tzinfo=timezone.utc))
154
+ async def test_deletes_old_metrics(self, session: AsyncSession):
155
+ user = await create_user(session=session, global_role=GlobalRole.USER)
156
+ project = await create_project(session=session, owner=user)
157
+ await add_project_member(
158
+ session=session, project=project, user=user, project_role=ProjectRole.USER
159
+ )
160
+ repo = await create_repo(session=session, project_id=project.id)
161
+ run_1 = await create_run(
162
+ session=session, project=project, repo=repo, user=user, run_name="run-1"
163
+ )
164
+ job_1 = await create_job(session=session, run=run_1)
165
+ # old metrics
166
+ await create_job_prometheus_metrics(
167
+ session=session,
168
+ job=job_1,
169
+ collected_at=datetime(2023, 1, 2, 2, 3, 30),
170
+ )
171
+ run_2 = await create_run(
172
+ session=session, project=project, repo=repo, user=user, run_name="run-2"
173
+ )
174
+ job_2 = await create_job(session=session, run=run_2)
175
+ # recent metrics
176
+ metrics_2 = await create_job_prometheus_metrics(
177
+ session=session,
178
+ job=job_2,
179
+ collected_at=datetime(2023, 1, 2, 3, 5, 0),
180
+ )
181
+
182
+ await delete_prometheus_metrics()
183
+
184
+ res = await session.execute(
185
+ select(JobPrometheusMetrics).join(JobModel).where(JobModel.project_id == project.id)
186
+ )
187
+ all_metrics = res.scalars().all()
188
+ assert len(all_metrics) == 1
189
+ assert all_metrics[0] == metrics_2
@@ -4,6 +4,7 @@ from typing import Optional
4
4
  from unittest.mock import MagicMock, Mock, patch
5
5
 
6
6
  import pytest
7
+ from freezegun import freeze_time
7
8
  from sqlalchemy.ext.asyncio import AsyncSession
8
9
 
9
10
  from dstack._internal.core.errors import SSHError
@@ -11,6 +12,7 @@ from dstack._internal.core.models.backends.base import BackendType
11
12
  from dstack._internal.core.models.common import NetworkMode
12
13
  from dstack._internal.core.models.configurations import DevEnvironmentConfiguration
13
14
  from dstack._internal.core.models.instances import InstanceStatus
15
+ from dstack._internal.core.models.profiles import UtilizationPolicy
14
16
  from dstack._internal.core.models.runs import (
15
17
  JobRuntimeData,
16
18
  JobStatus,
@@ -39,6 +41,7 @@ from dstack._internal.server.services.volumes import (
39
41
  from dstack._internal.server.testing.common import (
40
42
  create_instance,
41
43
  create_job,
44
+ create_job_metrics_point,
42
45
  create_pool,
43
46
  create_project,
44
47
  create_repo,
@@ -335,7 +338,7 @@ class TestProcessRunningJobs:
335
338
  name="test-run-0-0",
336
339
  registry_username="",
337
340
  registry_password="",
338
- image_name="dstackai/base:py3.13-0.6-cuda-12.1",
341
+ image_name="dstackai/base:py3.13-0.7-cuda-12.1",
339
342
  container_user="root",
340
343
  privileged=privileged,
341
344
  gpu=None,
@@ -688,3 +691,125 @@ class TestProcessRunningJobs:
688
691
  assert job.status == expected_status
689
692
  assert job.termination_reason == expected_termination_reason
690
693
  assert job.inactivity_secs == expected_inactivity_secs
694
+
695
+ @pytest.mark.asyncio
696
+ @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True)
697
+ @pytest.mark.parametrize(
698
+ ["samples", "expected_status"],
699
+ [
700
+ pytest.param(
701
+ [
702
+ (datetime(2023, 1, 1, 12, 25, 20, tzinfo=timezone.utc), 30),
703
+ (datetime(2023, 1, 1, 12, 25, 30, tzinfo=timezone.utc), 30),
704
+ (datetime(2023, 1, 1, 12, 29, 50, tzinfo=timezone.utc), 40),
705
+ ],
706
+ JobStatus.RUNNING,
707
+ id="not-enough-points",
708
+ ),
709
+ pytest.param(
710
+ [
711
+ (datetime(2023, 1, 1, 12, 20, 10, tzinfo=timezone.utc), 30),
712
+ (datetime(2023, 1, 1, 12, 20, 20, tzinfo=timezone.utc), 30),
713
+ (datetime(2023, 1, 1, 12, 29, 50, tzinfo=timezone.utc), 80),
714
+ ],
715
+ JobStatus.RUNNING,
716
+ id="any-above-min",
717
+ ),
718
+ pytest.param(
719
+ [
720
+ (datetime(2023, 1, 1, 12, 10, 10, tzinfo=timezone.utc), 80), # outside window
721
+ (datetime(2023, 1, 1, 12, 10, 20, tzinfo=timezone.utc), 80), # outside window
722
+ (datetime(2023, 1, 1, 12, 20, 10, tzinfo=timezone.utc), 30),
723
+ (datetime(2023, 1, 1, 12, 20, 20, tzinfo=timezone.utc), 30),
724
+ (datetime(2023, 1, 1, 12, 29, 50, tzinfo=timezone.utc), 40),
725
+ ],
726
+ JobStatus.TERMINATING,
727
+ id="all-below-min",
728
+ ),
729
+ ],
730
+ )
731
+ @freeze_time(datetime(2023, 1, 1, 12, 30, tzinfo=timezone.utc))
732
+ async def test_gpu_utilization(
733
+ self,
734
+ test_db,
735
+ session: AsyncSession,
736
+ samples: list[tuple[datetime, int]],
737
+ expected_status: JobStatus,
738
+ ) -> None:
739
+ project = await create_project(session=session)
740
+ user = await create_user(session=session)
741
+ repo = await create_repo(
742
+ session=session,
743
+ project_id=project.id,
744
+ )
745
+ run = await create_run(
746
+ session=session,
747
+ project=project,
748
+ repo=repo,
749
+ user=user,
750
+ status=RunStatus.RUNNING,
751
+ run_name="test-run",
752
+ run_spec=get_run_spec(
753
+ run_name="test-run",
754
+ repo_id=repo.name,
755
+ configuration=DevEnvironmentConfiguration(
756
+ name="test-run",
757
+ ide="vscode",
758
+ utilization_policy=UtilizationPolicy(
759
+ min_gpu_utilization=80,
760
+ time_window=600,
761
+ ),
762
+ ),
763
+ ),
764
+ )
765
+ pool = await create_pool(session=session, project=project)
766
+ instance = await create_instance(
767
+ session=session,
768
+ project=project,
769
+ pool=pool,
770
+ status=InstanceStatus.BUSY,
771
+ )
772
+ job = await create_job(
773
+ session=session,
774
+ run=run,
775
+ status=JobStatus.RUNNING,
776
+ job_provisioning_data=get_job_provisioning_data(),
777
+ instance=instance,
778
+ instance_assigned=True,
779
+ )
780
+ for timestamp, gpu_util in samples:
781
+ # two GPUs, the second one always 100% utilized
782
+ await create_job_metrics_point(
783
+ session=session,
784
+ job_model=job,
785
+ timestamp=timestamp,
786
+ gpus_memory_usage_bytes=[1024, 1024],
787
+ gpus_util_percent=[gpu_util, 100],
788
+ )
789
+ with (
790
+ patch("dstack._internal.server.services.runner.ssh.SSHTunnel") as SSHTunnelMock,
791
+ patch(
792
+ "dstack._internal.server.services.runner.client.RunnerClient"
793
+ ) as RunnerClientMock,
794
+ ):
795
+ runner_client_mock = RunnerClientMock.return_value
796
+ runner_client_mock.pull.return_value = PullResponse(
797
+ job_states=[],
798
+ job_logs=[],
799
+ runner_logs=[],
800
+ last_updated=0,
801
+ no_connections_secs=0,
802
+ )
803
+ await process_running_jobs()
804
+ SSHTunnelMock.assert_called_once()
805
+ runner_client_mock.pull.assert_called_once()
806
+ await session.refresh(job)
807
+ assert job.status == expected_status
808
+ if expected_status == JobStatus.TERMINATING:
809
+ assert job.termination_reason == JobTerminationReason.TERMINATED_BY_SERVER
810
+ assert job.termination_reason_message == (
811
+ "The job GPU utilization below 80% for 600 seconds"
812
+ )
813
+ else:
814
+ assert job.termination_reason is None
815
+ assert job.termination_reason_message is None
@@ -8,6 +8,7 @@ from dstack._internal.server.main import app
8
8
  from dstack._internal.server.services import encryption as encryption # import for side-effect
9
9
  from dstack._internal.server.services import logs as logs_services
10
10
  from dstack._internal.server.services.docker import ImageConfig, ImageConfigObject
11
+ from dstack._internal.server.services.logs.filelog import FileLogStorage
11
12
  from dstack._internal.server.testing.conf import postgres_container, session, test_db # noqa: F401
12
13
 
13
14
 
@@ -18,13 +19,11 @@ def client(event_loop):
18
19
 
19
20
 
20
21
  @pytest.fixture
21
- def test_log_storage(
22
- tmp_path: Path, monkeypatch: pytest.MonkeyPatch
23
- ) -> logs_services.FileLogStorage:
22
+ def test_log_storage(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> FileLogStorage:
24
23
  root = tmp_path / "test_logs"
25
24
  root.mkdir()
26
- storage = logs_services.FileLogStorage(root)
27
- monkeypatch.setattr(logs_services, "_default_log_storage", storage)
25
+ storage = FileLogStorage(root)
26
+ monkeypatch.setattr(logs_services, "_log_storage", storage)
28
27
  return storage
29
28
 
30
29
 
@@ -1335,6 +1335,7 @@ class TestGetConfigInfo:
1335
1335
  "vpc_ids": None,
1336
1336
  "default_vpcs": None,
1337
1337
  "public_ips": None,
1338
+ "iam_instance_profile": None,
1338
1339
  "tags": None,
1339
1340
  "os_images": None,
1340
1341
  "creds": json.loads(backend.auth.plaintext),
@@ -370,6 +370,7 @@ class TestCreateFleet:
370
370
  "idle_duration": None,
371
371
  "termination_policy": None,
372
372
  "termination_idle_time": None,
373
+ "utilization_policy": None,
373
374
  "name": "",
374
375
  "default": False,
375
376
  "reservation": None,
@@ -495,6 +496,7 @@ class TestCreateFleet:
495
496
  "idle_duration": None,
496
497
  "termination_policy": None,
497
498
  "termination_idle_time": None,
499
+ "utilization_policy": None,
498
500
  "name": "",
499
501
  "default": False,
500
502
  "reservation": None,
@@ -3,7 +3,7 @@ from httpx import AsyncClient
3
3
  from sqlalchemy.ext.asyncio import AsyncSession
4
4
 
5
5
  from dstack._internal.core.models.users import GlobalRole, ProjectRole
6
- from dstack._internal.server.services.logs import FileLogStorage
6
+ from dstack._internal.server.services.logs.filelog import FileLogStorage
7
7
  from dstack._internal.server.services.projects import add_project_member
8
8
  from dstack._internal.server.testing.common import create_project, create_user, get_auth_headers
9
9
 
@@ -55,11 +55,25 @@ class TestGetJobMetrics:
55
55
  session=session,
56
56
  run=run,
57
57
  )
58
+ await create_job_metrics_point(
59
+ session=session,
60
+ job_model=job,
61
+ timestamp=datetime(2023, 1, 2, 3, 4, 5, tzinfo=timezone.utc),
62
+ cpu_usage_micro=2 * 1_000_000,
63
+ memory_usage_bytes=256,
64
+ memory_working_set_bytes=128,
65
+ gpus_memory_usage_bytes=[256],
66
+ gpus_util_percent=[2],
67
+ )
58
68
  await create_job_metrics_point(
59
69
  session=session,
60
70
  job_model=job,
61
71
  timestamp=datetime(2023, 1, 2, 3, 4, 15, tzinfo=timezone.utc),
62
72
  cpu_usage_micro=4 * 1_000_000,
73
+ memory_usage_bytes=512,
74
+ memory_working_set_bytes=256,
75
+ gpus_memory_usage_bytes=[512],
76
+ gpus_util_percent=[6],
63
77
  )
64
78
  await create_job_metrics_point(
65
79
  session=session,
@@ -76,6 +90,7 @@ class TestGetJobMetrics:
76
90
  headers=get_auth_headers(user.token),
77
91
  )
78
92
  assert response.status_code == 200
93
+ # Returns one last sample by default. Filtering is tested in services/test_metrics.py
79
94
  assert response.json() == {
80
95
  "metrics": [
81
96
  {
@@ -0,0 +1,244 @@
1
+ from textwrap import dedent
2
+
3
+ import pytest
4
+ from httpx import AsyncClient
5
+ from sqlalchemy.ext.asyncio import AsyncSession
6
+
7
+ from dstack._internal.core.models.runs import JobStatus
8
+ from dstack._internal.core.models.users import GlobalRole, ProjectRole
9
+ from dstack._internal.server.models import JobModel, ProjectModel, UserModel
10
+ from dstack._internal.server.services.projects import add_project_member
11
+ from dstack._internal.server.testing.common import (
12
+ create_job,
13
+ create_job_prometheus_metrics,
14
+ create_project,
15
+ create_repo,
16
+ create_run,
17
+ create_user,
18
+ )
19
+
20
+
21
+ @pytest.fixture
22
+ def enable_metrics(monkeypatch: pytest.MonkeyPatch):
23
+ monkeypatch.setattr("dstack._internal.server.settings.ENABLE_PROMETHEUS_METRICS", True)
24
+
25
+
26
+ @pytest.mark.asyncio
27
+ @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True)
28
+ @pytest.mark.usefixtures("image_config_mock", "test_db", "enable_metrics")
29
+ class TestGetPrometheusMetrics:
30
+ async def test_returns_metrics(self, session: AsyncSession, client: AsyncClient):
31
+ user = await create_user(session=session, global_role=GlobalRole.USER)
32
+ project_2 = await _create_project(session, "project-2", user)
33
+ job_2_1 = await _create_job(session, "run-1", project_2, user, JobStatus.RUNNING)
34
+ await create_job_prometheus_metrics(
35
+ session=session,
36
+ job=job_2_1,
37
+ text=dedent("""
38
+ # HELP FIELD_1 Test field 1
39
+ # TYPE FIELD_1 gauge
40
+ FIELD_1{gpu="0"} 100
41
+ FIELD_1{gpu="1"} 200
42
+ """),
43
+ )
44
+ project_1 = await _create_project(session, "project-1", user)
45
+ job_1_1 = await _create_job(session, "run-1", project_1, user, JobStatus.RUNNING)
46
+ await create_job_prometheus_metrics(
47
+ session=session,
48
+ job=job_1_1,
49
+ text=dedent("""
50
+ # Comments should be skipped
51
+
52
+ # HELP FIELD_1 Test field 1
53
+ # TYPE FIELD_1 gauge
54
+ FIELD_1{gpu="0"} 350
55
+ FIELD_1{gpu="1"} 400
56
+
57
+ # HELP FIELD_2 Test field 2
58
+ # TYPE FIELD_2 counter
59
+ FIELD_2{gpu="0"} 337325 1395066363000
60
+ FIELD_2{gpu="1"} 987169 1395066363010
61
+ """),
62
+ )
63
+ job_1_2 = await _create_job(session, "run-2", project_1, user, JobStatus.RUNNING)
64
+ await create_job_prometheus_metrics(
65
+ session=session,
66
+ job=job_1_2,
67
+ text=dedent("""
68
+ # HELP FIELD_1 Test field 1
69
+ # TYPE FIELD_1 gauge
70
+ FIELD_1{gpu="0"} 1200.0
71
+ FIELD_1{gpu="1"} 1600.0
72
+ FIELD_1{gpu="2"} 2400.0
73
+ """),
74
+ )
75
+ # Terminated job, should not appear in the response
76
+ job_1_3 = await _create_job(session, "run-3", project_1, user, JobStatus.TERMINATED)
77
+ await create_job_prometheus_metrics(
78
+ session=session,
79
+ job=job_1_3,
80
+ text=dedent("""
81
+ # HELP FIELD_1 Test field 1
82
+ # TYPE FIELD_1 gauge
83
+ FIELD_1{gpu="0"} 10
84
+ FIELD_1{gpu="1"} 20
85
+ """),
86
+ )
87
+
88
+ response = await client.get("/metrics")
89
+
90
+ assert response.status_code == 200
91
+ assert response.text == dedent("""\
92
+ # HELP FIELD_1 Test field 1
93
+ # TYPE FIELD_1 gauge
94
+ FIELD_1{gpu="0",dstack_project_name="project-1",dstack_run_name="run-1",dstack_job_name="run-1-0-0",dstack_job_num="0",dstack_replica_num="0"} 350.0
95
+ FIELD_1{gpu="1",dstack_project_name="project-1",dstack_run_name="run-1",dstack_job_name="run-1-0-0",dstack_job_num="0",dstack_replica_num="0"} 400.0
96
+ FIELD_1{gpu="0",dstack_project_name="project-1",dstack_run_name="run-2",dstack_job_name="run-2-0-0",dstack_job_num="0",dstack_replica_num="0"} 1200.0
97
+ FIELD_1{gpu="1",dstack_project_name="project-1",dstack_run_name="run-2",dstack_job_name="run-2-0-0",dstack_job_num="0",dstack_replica_num="0"} 1600.0
98
+ FIELD_1{gpu="2",dstack_project_name="project-1",dstack_run_name="run-2",dstack_job_name="run-2-0-0",dstack_job_num="0",dstack_replica_num="0"} 2400.0
99
+ FIELD_1{gpu="0",dstack_project_name="project-2",dstack_run_name="run-1",dstack_job_name="run-1-0-0",dstack_job_num="0",dstack_replica_num="0"} 100.0
100
+ FIELD_1{gpu="1",dstack_project_name="project-2",dstack_run_name="run-1",dstack_job_name="run-1-0-0",dstack_job_num="0",dstack_replica_num="0"} 200.0
101
+ # HELP FIELD_2 Test field 2
102
+ # TYPE FIELD_2 counter
103
+ FIELD_2{gpu="0",dstack_project_name="project-1",dstack_run_name="run-1",dstack_job_name="run-1-0-0",dstack_job_num="0",dstack_replica_num="0"} 337325.0 1395066363000
104
+ FIELD_2{gpu="1",dstack_project_name="project-1",dstack_run_name="run-1",dstack_job_name="run-1-0-0",dstack_job_num="0",dstack_replica_num="0"} 987169.0 1395066363010
105
+ """)
106
+
107
+ async def test_returns_empty_response_if_no_runs(self, client: AsyncClient):
108
+ response = await client.get("/metrics")
109
+ assert response.status_code == 200
110
+ assert response.text == ""
111
+
112
+ async def test_returns_404_if_not_enabled(
113
+ self, monkeypatch: pytest.MonkeyPatch, client: AsyncClient
114
+ ):
115
+ monkeypatch.setattr("dstack._internal.server.settings.ENABLE_PROMETHEUS_METRICS", False)
116
+ response = await client.get("/metrics")
117
+ assert response.status_code == 404
118
+
119
+
120
+ @pytest.mark.asyncio
121
+ @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True)
122
+ @pytest.mark.usefixtures("image_config_mock", "test_db", "enable_metrics")
123
+ class TestGetPrometheusProjectMetrics:
124
+ async def test_returns_metrics(self, session: AsyncSession, client: AsyncClient):
125
+ user = await create_user(session=session, global_role=GlobalRole.USER)
126
+ project = await _create_project(session, "project-1", user)
127
+ job_1 = await _create_job(session, "run-1", project, user, JobStatus.RUNNING)
128
+ await create_job_prometheus_metrics(
129
+ session=session,
130
+ job=job_1,
131
+ text=dedent("""
132
+ # Comments should be skipped
133
+
134
+ # HELP FIELD_1 Test field 1
135
+ # TYPE FIELD_1 gauge
136
+ FIELD_1{gpu="0"} 350
137
+ FIELD_1{gpu="1"} 400
138
+
139
+ # HELP FIELD_2 Test field 2
140
+ # TYPE FIELD_2 counter
141
+ FIELD_2{gpu="0"} 337325 1395066363000
142
+ FIELD_2{gpu="1"} 987169 1395066363010
143
+ """),
144
+ )
145
+ job_2 = await _create_job(session, "run-2", project, user, JobStatus.RUNNING)
146
+ await create_job_prometheus_metrics(
147
+ session=session,
148
+ job=job_2,
149
+ text=dedent("""
150
+ # HELP FIELD_1 Test field 1
151
+ # TYPE FIELD_1 gauge
152
+ FIELD_1{gpu="0"} 1200.0
153
+ FIELD_1{gpu="1"} 1600.0
154
+ FIELD_1{gpu="2"} 2400.0
155
+ """),
156
+ )
157
+ # Terminated job, should not appear in the response
158
+ job_3 = await _create_job(session, "run-3", project, user, JobStatus.TERMINATED)
159
+ await create_job_prometheus_metrics(
160
+ session=session,
161
+ job=job_3,
162
+ text=dedent("""
163
+ # HELP FIELD_1 Test field 1
164
+ # TYPE FIELD_1 gauge
165
+ FIELD_1{gpu="0"} 10
166
+ FIELD_1{gpu="1"} 20
167
+ """),
168
+ )
169
+ another_project = await _create_project(session, "project-2", user)
170
+ another_project_job = await _create_job(
171
+ session, "run-4", another_project, user, JobStatus.RUNNING
172
+ )
173
+ await create_job_prometheus_metrics(
174
+ session=session,
175
+ job=another_project_job,
176
+ text=dedent("""
177
+ # HELP FIELD_1 Test field 1
178
+ # TYPE FIELD_1 gauge
179
+ FIELD_1{gpu="0"} 100
180
+ FIELD_1{gpu="1"} 200
181
+ """),
182
+ )
183
+
184
+ response = await client.get("/metrics/project/project-1")
185
+
186
+ assert response.status_code == 200
187
+ assert response.text == dedent("""\
188
+ # HELP FIELD_1 Test field 1
189
+ # TYPE FIELD_1 gauge
190
+ FIELD_1{gpu="0",dstack_project_name="project-1",dstack_run_name="run-1",dstack_job_name="run-1-0-0",dstack_job_num="0",dstack_replica_num="0"} 350.0
191
+ FIELD_1{gpu="1",dstack_project_name="project-1",dstack_run_name="run-1",dstack_job_name="run-1-0-0",dstack_job_num="0",dstack_replica_num="0"} 400.0
192
+ FIELD_1{gpu="0",dstack_project_name="project-1",dstack_run_name="run-2",dstack_job_name="run-2-0-0",dstack_job_num="0",dstack_replica_num="0"} 1200.0
193
+ FIELD_1{gpu="1",dstack_project_name="project-1",dstack_run_name="run-2",dstack_job_name="run-2-0-0",dstack_job_num="0",dstack_replica_num="0"} 1600.0
194
+ FIELD_1{gpu="2",dstack_project_name="project-1",dstack_run_name="run-2",dstack_job_name="run-2-0-0",dstack_job_num="0",dstack_replica_num="0"} 2400.0
195
+ # HELP FIELD_2 Test field 2
196
+ # TYPE FIELD_2 counter
197
+ FIELD_2{gpu="0",dstack_project_name="project-1",dstack_run_name="run-1",dstack_job_name="run-1-0-0",dstack_job_num="0",dstack_replica_num="0"} 337325.0 1395066363000
198
+ FIELD_2{gpu="1",dstack_project_name="project-1",dstack_run_name="run-1",dstack_job_name="run-1-0-0",dstack_job_num="0",dstack_replica_num="0"} 987169.0 1395066363010
199
+ """)
200
+
201
+ async def test_returns_empty_response_if_no_runs(
202
+ self, session: AsyncSession, client: AsyncClient
203
+ ):
204
+ user = await create_user(session=session, global_role=GlobalRole.USER)
205
+ await create_project(session=session, owner=user, name="test-project")
206
+ response = await client.get("/metrics/project/test-project")
207
+ assert response.status_code == 200
208
+ assert response.text == ""
209
+
210
+ async def test_returns_404_if_project_doesnt_exist(self, client: AsyncClient):
211
+ response = await client.get("/metrics/project/nonexistent")
212
+ assert response.status_code == 404
213
+
214
+ async def test_returns_404_if_not_enabled(
215
+ self, monkeypatch: pytest.MonkeyPatch, session: AsyncSession, client: AsyncClient
216
+ ):
217
+ monkeypatch.setattr("dstack._internal.server.settings.ENABLE_PROMETHEUS_METRICS", False)
218
+ user = await create_user(session=session, global_role=GlobalRole.USER)
219
+ await create_project(session=session, owner=user, name="test-project")
220
+ response = await client.get("/metrics/project/test-project")
221
+ assert response.status_code == 404
222
+
223
+
224
+ async def _create_project(session: AsyncSession, name: str, user: UserModel) -> ProjectModel:
225
+ project = await create_project(session=session, owner=user, name=name)
226
+ await add_project_member(
227
+ session=session, project=project, user=user, project_role=ProjectRole.USER
228
+ )
229
+ return project
230
+
231
+
232
+ async def _create_job(
233
+ session: AsyncSession, run_name: str, project: ProjectModel, user: UserModel, status: JobStatus
234
+ ) -> JobModel:
235
+ repo = await create_repo(session=session, project_id=project.id, repo_name=f"{run_name}-repo")
236
+ run = await create_run(
237
+ session=session,
238
+ project=project,
239
+ repo=repo,
240
+ user=user,
241
+ run_name=run_name,
242
+ )
243
+ job = await create_job(session=session, run=run, status=status)
244
+ return job