dstack-0.18.43-py3-none-any.whl → dstack-0.18.44-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of dstack might be problematic. Click here for more details.

Files changed (59)
  1. dstack/_internal/cli/services/configurators/run.py +1 -0
  2. dstack/_internal/cli/utils/run.py +11 -0
  3. dstack/_internal/core/backends/aws/compute.py +1 -0
  4. dstack/_internal/core/backends/azure/compute.py +1 -1
  5. dstack/_internal/core/backends/gcp/compute.py +1 -1
  6. dstack/_internal/core/backends/runpod/compute.py +21 -3
  7. dstack/_internal/core/backends/runpod/config.py +8 -0
  8. dstack/_internal/core/models/backends/runpod.py +2 -0
  9. dstack/_internal/core/models/configurations.py +2 -1
  10. dstack/_internal/core/models/profiles.py +46 -1
  11. dstack/_internal/core/models/runs.py +4 -0
  12. dstack/_internal/server/app.py +11 -1
  13. dstack/_internal/server/background/__init__.py +10 -0
  14. dstack/_internal/server/background/tasks/process_placement_groups.py +1 -0
  15. dstack/_internal/server/background/tasks/process_prometheus_metrics.py +135 -0
  16. dstack/_internal/server/background/tasks/process_running_jobs.py +66 -19
  17. dstack/_internal/server/background/tasks/process_runs.py +1 -0
  18. dstack/_internal/server/background/tasks/process_submitted_jobs.py +4 -1
  19. dstack/_internal/server/migrations/versions/60e444118b6d_add_jobprometheusmetrics.py +40 -0
  20. dstack/_internal/server/migrations/versions/98d1b92988bc_add_jobterminationreason_terminated_due_.py +140 -0
  21. dstack/_internal/server/models.py +11 -0
  22. dstack/_internal/server/routers/metrics.py +21 -2
  23. dstack/_internal/server/routers/prometheus.py +36 -0
  24. dstack/_internal/server/security/permissions.py +1 -1
  25. dstack/_internal/server/services/backends/configurators/runpod.py +3 -33
  26. dstack/_internal/server/services/config.py +13 -3
  27. dstack/_internal/server/services/fleets.py +1 -0
  28. dstack/_internal/server/services/gateways/__init__.py +1 -0
  29. dstack/_internal/server/services/jobs/configurators/base.py +9 -1
  30. dstack/_internal/server/services/metrics.py +103 -70
  31. dstack/_internal/server/services/prometheus.py +87 -0
  32. dstack/_internal/server/services/runner/client.py +14 -3
  33. dstack/_internal/server/services/runs.py +43 -15
  34. dstack/_internal/server/services/volumes.py +1 -0
  35. dstack/_internal/server/settings.py +3 -0
  36. dstack/_internal/server/statics/index.html +1 -1
  37. dstack/_internal/server/statics/{main-fe8fd9db55df8d10e648.js → main-4eb116b97819badd1e2c.js} +66 -13
  38. dstack/_internal/server/statics/{main-fe8fd9db55df8d10e648.js.map → main-4eb116b97819badd1e2c.js.map} +1 -1
  39. dstack/_internal/server/statics/{main-7510e71dfa9749a4e70e.css → main-da9f8c06a69c20dac23e.css} +1 -1
  40. dstack/_internal/server/statics/static/media/entraID.d65d1f3e9486a8e56d24fc07b3230885.svg +9 -0
  41. dstack/_internal/server/testing/common.py +17 -0
  42. dstack/api/_public/runs.py +3 -0
  43. dstack/api/server/_fleets.py +2 -0
  44. dstack/api/server/_runs.py +4 -0
  45. dstack/api/utils.py +3 -0
  46. dstack/version.py +1 -1
  47. {dstack-0.18.43.dist-info → dstack-0.18.44.dist-info}/METADATA +10 -1
  48. {dstack-0.18.43.dist-info → dstack-0.18.44.dist-info}/RECORD +59 -50
  49. tests/_internal/server/background/tasks/test_process_prometheus_metrics.py +189 -0
  50. tests/_internal/server/background/tasks/test_process_running_jobs.py +125 -0
  51. tests/_internal/server/routers/test_fleets.py +2 -0
  52. tests/_internal/server/routers/test_metrics.py +15 -0
  53. tests/_internal/server/routers/test_prometheus.py +244 -0
  54. tests/_internal/server/routers/test_runs.py +79 -56
  55. tests/_internal/server/services/test_metrics.py +163 -0
  56. {dstack-0.18.43.dist-info → dstack-0.18.44.dist-info}/LICENSE.md +0 -0
  57. {dstack-0.18.43.dist-info → dstack-0.18.44.dist-info}/WHEEL +0 -0
  58. {dstack-0.18.43.dist-info → dstack-0.18.44.dist-info}/entry_points.txt +0 -0
  59. {dstack-0.18.43.dist-info → dstack-0.18.44.dist-info}/top_level.txt +0 -0
@@ -13,7 +13,13 @@ from sqlalchemy.ext.asyncio import AsyncSession
13
13
 
14
14
  from dstack._internal.core.models.backends.base import BackendType
15
15
  from dstack._internal.core.models.common import ApplyAction
16
- from dstack._internal.core.models.configurations import ServiceConfiguration
16
+ from dstack._internal.core.models.configurations import (
17
+ AnyRunConfiguration,
18
+ DevEnvironmentConfiguration,
19
+ ScalingSpec,
20
+ ServiceConfiguration,
21
+ TaskConfiguration,
22
+ )
17
23
  from dstack._internal.core.models.gateways import GatewayStatus
18
24
  from dstack._internal.core.models.instances import (
19
25
  InstanceAvailability,
@@ -126,6 +132,7 @@ def get_dev_env_run_plan_dict(
126
132
  "idle_duration": None,
127
133
  "termination_idle_time": 300,
128
134
  "termination_policy": None,
135
+ "utilization_policy": None,
129
136
  "reservation": None,
130
137
  },
131
138
  "configuration_path": "dstack.yaml",
@@ -148,6 +155,7 @@ def get_dev_env_run_plan_dict(
148
155
  "idle_duration": None,
149
156
  "termination_idle_time": 300,
150
157
  "termination_policy": None,
158
+ "utilization_policy": None,
151
159
  "reservation": None,
152
160
  },
153
161
  "repo_code_hash": None,
@@ -190,6 +198,7 @@ def get_dev_env_run_plan_dict(
190
198
  "single_branch": False,
191
199
  "max_duration": None,
192
200
  "stop_duration": 300,
201
+ "utilization_policy": None,
193
202
  "registry_auth": None,
194
203
  "requirements": {
195
204
  "resources": {
@@ -283,6 +292,7 @@ def get_dev_env_run_dict(
283
292
  "idle_duration": None,
284
293
  "termination_idle_time": 300,
285
294
  "termination_policy": None,
295
+ "utilization_policy": None,
286
296
  "reservation": None,
287
297
  },
288
298
  "configuration_path": "dstack.yaml",
@@ -305,6 +315,7 @@ def get_dev_env_run_dict(
305
315
  "idle_duration": None,
306
316
  "termination_idle_time": 300,
307
317
  "termination_policy": None,
318
+ "utilization_policy": None,
308
319
  "reservation": None,
309
320
  },
310
321
  "repo_code_hash": None,
@@ -347,6 +358,7 @@ def get_dev_env_run_dict(
347
358
  "single_branch": False,
348
359
  "max_duration": None,
349
360
  "stop_duration": 300,
361
+ "utilization_policy": None,
350
362
  "registry_auth": None,
351
363
  "requirements": {
352
364
  "resources": {
@@ -891,66 +903,77 @@ class TestGetRunPlan:
891
903
 
892
904
  @pytest.mark.asyncio
893
905
  @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True)
894
- async def test_returns_update_action_when_changing_updatable_fields(
895
- self, test_db, session: AsyncSession, client: AsyncClient
896
- ):
897
- user = await create_user(session=session, global_role=GlobalRole.USER)
898
- project = await create_project(session=session, owner=user)
899
- await add_project_member(
900
- session=session, project=project, user=user, project_role=ProjectRole.USER
901
- )
902
- repo = await create_repo(session=session, project_id=project.id)
903
- run_spec = get_run_spec(
904
- run_name="test-service",
905
- repo_id=repo.name,
906
- configuration=ServiceConfiguration(
907
- type="service",
908
- commands=["one", "two"],
909
- port=80,
910
- replicas=1,
906
+ @pytest.mark.parametrize(
907
+ ("old_conf", "new_conf", "action"),
908
+ [
909
+ pytest.param(
910
+ ServiceConfiguration(
911
+ commands=["one", "two"],
912
+ port=80,
913
+ replicas=1,
914
+ scaling=None,
915
+ ),
916
+ ServiceConfiguration(
917
+ commands=["one", "two"],
918
+ port=80,
919
+ replicas="2..4",
920
+ scaling=ScalingSpec(metric="rps", target=5),
921
+ ),
922
+ "update",
923
+ id="update-service",
911
924
  ),
912
- )
913
- run_model = await create_run(
914
- session=session,
915
- project=project,
916
- repo=repo,
917
- user=user,
918
- run_name=run_spec.run_name,
919
- run_spec=run_spec,
920
- )
921
- run = run_model_to_run(run_model)
922
- run_spec.configuration.replicas = 2
923
- response = await client.post(
924
- f"/api/project/{project.name}/runs/get_plan",
925
- headers=get_auth_headers(user.token),
926
- json={"run_spec": run_spec.dict()},
927
- )
928
- assert response.status_code == 200
929
- response_json = response.json()
930
- assert response_json["action"] == "update"
931
- assert response_json["current_resource"] == json.loads(run.json())
932
-
933
- @pytest.mark.asyncio
934
- @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True)
935
- async def test_returns_create_action_when_changing_non_updatable_fields(
936
- self, test_db, session: AsyncSession, client: AsyncClient
937
- ):
925
+ pytest.param(
926
+ ServiceConfiguration(
927
+ commands=["one", "two"],
928
+ port=80,
929
+ replicas=1,
930
+ scaling=None,
931
+ ),
932
+ ServiceConfiguration(
933
+ commands=["one", "two"],
934
+ port=8080, # not updatable
935
+ replicas="2..4",
936
+ scaling=ScalingSpec(metric="rps", target=5),
937
+ ),
938
+ "create",
939
+ id="no-update-service",
940
+ ),
941
+ pytest.param(
942
+ DevEnvironmentConfiguration(ide="vscode", inactivity_duration=False),
943
+ DevEnvironmentConfiguration(ide="vscode", inactivity_duration="30m"),
944
+ "update",
945
+ id="update-dev-env",
946
+ ),
947
+ pytest.param(
948
+ TaskConfiguration(image="test-image-1"),
949
+ TaskConfiguration(image="test-image-2"),
950
+ "create",
951
+ id="no-update-task",
952
+ ),
953
+ pytest.param(
954
+ DevEnvironmentConfiguration(ide="vscode", image="test-image"),
955
+ TaskConfiguration(image="test-image"),
956
+ "create",
957
+ id="no-update-on-type-change",
958
+ ),
959
+ ],
960
+ )
961
+ async def test_returns_update_or_create_action_on_conf_change(
962
+ self,
963
+ test_db,
964
+ session: AsyncSession,
965
+ client: AsyncClient,
966
+ old_conf: AnyRunConfiguration,
967
+ new_conf: AnyRunConfiguration,
968
+ action: str,
969
+ ) -> None:
938
970
  user = await create_user(session=session, global_role=GlobalRole.USER)
939
971
  project = await create_project(session=session, owner=user)
940
972
  await add_project_member(
941
973
  session=session, project=project, user=user, project_role=ProjectRole.USER
942
974
  )
943
975
  repo = await create_repo(session=session, project_id=project.id)
944
- run_spec = get_run_spec(
945
- run_name="test-service",
946
- repo_id=repo.name,
947
- configuration=ServiceConfiguration(
948
- type="service",
949
- commands=["one", "two"],
950
- port=80,
951
- replicas=1,
952
- ),
953
- )
976
+ run_spec = get_run_spec(run_name="test-run", repo_id=repo.name, configuration=old_conf)
954
977
  run_model = await create_run(
955
978
  session=session,
956
979
  project=project,
@@ -960,7 +983,7 @@ class TestGetRunPlan:
960
983
  run_spec=run_spec,
961
984
  )
962
985
  run = run_model_to_run(run_model)
963
- run_spec.configuration.port = 8080
986
+ run_spec.configuration = new_conf
964
987
  response = await client.post(
965
988
  f"/api/project/{project.name}/runs/get_plan",
966
989
  headers=get_auth_headers(user.token),
@@ -968,7 +991,7 @@ class TestGetRunPlan:
968
991
  )
969
992
  assert response.status_code == 200
970
993
  response_json = response.json()
971
- assert response_json["action"] == "create"
994
+ assert response_json["action"] == action
972
995
  assert response_json["current_resource"] == json.loads(run.json())
973
996
 
974
997
 
@@ -0,0 +1,163 @@
1
+ from datetime import datetime, timedelta, timezone
2
+
3
+ import pytest
4
+ from sqlalchemy.ext.asyncio import AsyncSession
5
+
6
+ from dstack._internal.core.models.metrics import Metric
7
+ from dstack._internal.server.services.metrics import get_job_metrics
8
+ from dstack._internal.server.testing.common import (
9
+ create_job,
10
+ create_job_metrics_point,
11
+ create_project,
12
+ create_repo,
13
+ create_run,
14
+ create_user,
15
+ )
16
+
17
+
18
+ @pytest.mark.asyncio
19
+ @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True)
20
+ @pytest.mark.usefixtures("test_db", "image_config_mock")
21
+ class TestGetMetrics:
22
+ latest_ts = datetime(2023, 1, 2, 3, 4, 25, tzinfo=timezone.utc)
23
+ ts: tuple[datetime, ...] = (
24
+ latest_ts, # 0
25
+ latest_ts - timedelta(seconds=10), # 1
26
+ latest_ts - timedelta(seconds=20), # 2
27
+ latest_ts - timedelta(seconds=30), # 3
28
+ latest_ts - timedelta(seconds=40), # 4
29
+ latest_ts - timedelta(seconds=50), # 5
30
+ )
31
+ # dt, cpu_usage_sec, memory_usage_bytes, memory_ws_bytes, gpu0_memory_usage_bytes, gpu0_util,
32
+ # gpu1_memory_usage_bytes, gpu1_util
33
+ points: tuple[tuple[datetime, int, int, int, int, int, int, int], ...] = (
34
+ (ts[0], 110, 512, 128, 768, 15, 128, 20),
35
+ (ts[1], 104, 1024, 512, 1024, 10, 256, 10),
36
+ (ts[2], 100, 1024, 512, 1024, 20, 128, 5),
37
+ (ts[3], 90, 512, 512, 2048, 40, 512, 20),
38
+ (ts[4], 90, 1024, 1024, 1024, 0, 128, 0),
39
+ (ts[5], 80, 512, 512, 1024, 10, 256, 0),
40
+ )
41
+
42
+ @pytest.mark.parametrize(
43
+ ["params", "ts", "cpu", "mem", "mem_ws", "gpu0_mem", "gpu0_util", "gpu1_mem", "gpu1_util"],
44
+ [
45
+ pytest.param(
46
+ {"limit": 1},
47
+ [ts[0]],
48
+ [60],
49
+ [512],
50
+ [128],
51
+ [768],
52
+ [15],
53
+ [128],
54
+ [20],
55
+ id="limit-1-latest",
56
+ ),
57
+ pytest.param(
58
+ {"limit": 3},
59
+ [ts[0], ts[1], ts[2]],
60
+ [60, 40, 100],
61
+ [512, 1024, 1024],
62
+ [128, 512, 512],
63
+ [768, 1024, 1024],
64
+ [15, 10, 20],
65
+ [128, 256, 128],
66
+ [20, 10, 5],
67
+ id="limit-3-latest",
68
+ ),
69
+ pytest.param(
70
+ {},
71
+ [ts[0], ts[1], ts[2], ts[3], ts[4]],
72
+ [60, 40, 100, 0, 100],
73
+ [512, 1024, 1024, 512, 1024],
74
+ [128, 512, 512, 512, 1024],
75
+ [768, 1024, 1024, 2048, 1024],
76
+ [15, 10, 20, 40, 0],
77
+ [128, 256, 128, 512, 128],
78
+ [20, 10, 5, 20, 0],
79
+ id="all",
80
+ ),
81
+ pytest.param(
82
+ {"after": ts[3]},
83
+ [ts[0], ts[1], ts[2]],
84
+ [60, 40, 100],
85
+ [512, 1024, 1024],
86
+ [128, 512, 512],
87
+ [768, 1024, 1024],
88
+ [15, 10, 20],
89
+ [128, 256, 128],
90
+ [20, 10, 5],
91
+ id="all-after",
92
+ ),
93
+ pytest.param(
94
+ {"before": ts[2]},
95
+ [ts[3], ts[4]],
96
+ [0, 100],
97
+ [512, 1024],
98
+ [512, 1024],
99
+ [2048, 1024],
100
+ [40, 0],
101
+ [512, 128],
102
+ [20, 0],
103
+ id="all-before",
104
+ ),
105
+ ],
106
+ )
107
+ async def test_get_metrics(
108
+ self,
109
+ session: AsyncSession,
110
+ params: dict,
111
+ ts: list[datetime],
112
+ cpu: list[int],
113
+ mem: list[int],
114
+ mem_ws: list[int],
115
+ gpu0_mem: list[int],
116
+ gpu0_util: list[int],
117
+ gpu1_mem: list[int],
118
+ gpu1_util: list[int],
119
+ ):
120
+ user = await create_user(session=session)
121
+ project = await create_project(session=session, owner=user)
122
+ repo = await create_repo(
123
+ session=session,
124
+ project_id=project.id,
125
+ )
126
+ run = await create_run(
127
+ session=session,
128
+ project=project,
129
+ repo=repo,
130
+ user=user,
131
+ )
132
+ job = await create_job(
133
+ session=session,
134
+ run=run,
135
+ )
136
+ for dt, _cpu, _mem, _mem_ws, _gpu0_mem, _gpu0_util, _gpu1_mem, _gpu1_util in self.points:
137
+ await create_job_metrics_point(
138
+ session=session,
139
+ job_model=job,
140
+ timestamp=dt,
141
+ cpu_usage_micro=_cpu * 1_000_000,
142
+ memory_usage_bytes=_mem,
143
+ memory_working_set_bytes=_mem_ws,
144
+ gpus_memory_usage_bytes=[_gpu0_mem, _gpu1_mem],
145
+ gpus_util_percent=[_gpu0_util, _gpu1_util],
146
+ )
147
+
148
+ metrics = await get_job_metrics(session, job, **params)
149
+
150
+ assert metrics.metrics == [
151
+ Metric(name="cpu_usage_percent", timestamps=ts, values=cpu),
152
+ Metric(name="memory_usage_bytes", timestamps=ts, values=mem),
153
+ Metric(name="memory_working_set_bytes", timestamps=ts, values=mem_ws),
154
+ Metric(
155
+ name="gpus_detected_num",
156
+ timestamps=[ts[0], ts[-1]] if len(ts) > 1 else ts,
157
+ values=[2, 2] if len(ts) > 1 else [2],
158
+ ),
159
+ Metric(name="gpu_memory_usage_bytes_gpu0", timestamps=ts, values=gpu0_mem),
160
+ Metric(name="gpu_memory_usage_bytes_gpu1", timestamps=ts, values=gpu1_mem),
161
+ Metric(name="gpu_util_percent_gpu0", timestamps=ts, values=gpu0_util),
162
+ Metric(name="gpu_util_percent_gpu1", timestamps=ts, values=gpu1_util),
163
+ ]