dstack 0.18.43__py3-none-any.whl → 0.19.0rc1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dstack/_internal/cli/commands/gateway.py +15 -3
- dstack/_internal/cli/commands/logs.py +0 -22
- dstack/_internal/cli/commands/stats.py +8 -17
- dstack/_internal/cli/main.py +1 -5
- dstack/_internal/cli/services/configurators/fleet.py +4 -39
- dstack/_internal/cli/services/configurators/run.py +22 -20
- dstack/_internal/cli/services/profile.py +34 -83
- dstack/_internal/cli/utils/gateway.py +1 -1
- dstack/_internal/cli/utils/run.py +11 -0
- dstack/_internal/core/backends/__init__.py +56 -39
- dstack/_internal/core/backends/aws/__init__.py +0 -25
- dstack/_internal/core/backends/aws/auth.py +1 -10
- dstack/_internal/core/backends/aws/backend.py +26 -0
- dstack/_internal/core/backends/aws/compute.py +21 -45
- dstack/_internal/{server/services/backends/configurators/aws.py → core/backends/aws/configurator.py} +46 -85
- dstack/_internal/core/backends/aws/models.py +135 -0
- dstack/_internal/core/backends/aws/resources.py +1 -1
- dstack/_internal/core/backends/azure/__init__.py +0 -20
- dstack/_internal/core/backends/azure/auth.py +2 -11
- dstack/_internal/core/backends/azure/backend.py +21 -0
- dstack/_internal/core/backends/azure/compute.py +14 -28
- dstack/_internal/{server/services/backends/configurators/azure.py → core/backends/azure/configurator.py} +141 -210
- dstack/_internal/core/backends/azure/models.py +89 -0
- dstack/_internal/core/backends/base/__init__.py +0 -12
- dstack/_internal/core/backends/base/backend.py +18 -0
- dstack/_internal/core/backends/base/compute.py +153 -33
- dstack/_internal/core/backends/base/configurator.py +105 -0
- dstack/_internal/core/backends/base/models.py +14 -0
- dstack/_internal/core/backends/configurators.py +138 -0
- dstack/_internal/core/backends/cudo/__init__.py +0 -15
- dstack/_internal/core/backends/cudo/backend.py +16 -0
- dstack/_internal/core/backends/cudo/compute.py +8 -26
- dstack/_internal/core/backends/cudo/configurator.py +72 -0
- dstack/_internal/core/backends/cudo/models.py +37 -0
- dstack/_internal/core/backends/datacrunch/__init__.py +0 -15
- dstack/_internal/core/backends/datacrunch/backend.py +16 -0
- dstack/_internal/core/backends/datacrunch/compute.py +8 -25
- dstack/_internal/core/backends/datacrunch/configurator.py +66 -0
- dstack/_internal/core/backends/datacrunch/models.py +38 -0
- dstack/_internal/core/{models/backends/dstack.py → backends/dstack/models.py} +7 -7
- dstack/_internal/core/backends/gcp/__init__.py +0 -16
- dstack/_internal/core/backends/gcp/auth.py +2 -11
- dstack/_internal/core/backends/gcp/backend.py +17 -0
- dstack/_internal/core/backends/gcp/compute.py +14 -44
- dstack/_internal/{server/services/backends/configurators/gcp.py → core/backends/gcp/configurator.py} +46 -103
- dstack/_internal/core/backends/gcp/models.py +125 -0
- dstack/_internal/core/backends/kubernetes/__init__.py +0 -15
- dstack/_internal/core/backends/kubernetes/backend.py +16 -0
- dstack/_internal/core/backends/kubernetes/compute.py +16 -5
- dstack/_internal/core/backends/kubernetes/configurator.py +55 -0
- dstack/_internal/core/backends/kubernetes/models.py +72 -0
- dstack/_internal/core/backends/lambdalabs/__init__.py +0 -16
- dstack/_internal/core/backends/lambdalabs/backend.py +17 -0
- dstack/_internal/core/backends/lambdalabs/compute.py +7 -28
- dstack/_internal/core/backends/lambdalabs/configurator.py +82 -0
- dstack/_internal/core/backends/lambdalabs/models.py +37 -0
- dstack/_internal/core/backends/local/__init__.py +0 -13
- dstack/_internal/core/backends/local/backend.py +14 -0
- dstack/_internal/core/backends/local/compute.py +16 -2
- dstack/_internal/core/backends/models.py +128 -0
- dstack/_internal/core/backends/oci/__init__.py +0 -15
- dstack/_internal/core/backends/oci/auth.py +1 -5
- dstack/_internal/core/backends/oci/backend.py +16 -0
- dstack/_internal/core/backends/oci/compute.py +9 -23
- dstack/_internal/{server/services/backends/configurators/oci.py → core/backends/oci/configurator.py} +40 -85
- dstack/_internal/core/{models/backends/oci.py → backends/oci/models.py} +24 -25
- dstack/_internal/core/backends/oci/region.py +1 -1
- dstack/_internal/core/backends/runpod/__init__.py +0 -15
- dstack/_internal/core/backends/runpod/backend.py +16 -0
- dstack/_internal/core/backends/runpod/compute.py +28 -6
- dstack/_internal/core/backends/runpod/configurator.py +59 -0
- dstack/_internal/core/backends/runpod/models.py +54 -0
- dstack/_internal/core/backends/template/__init__.py +0 -0
- dstack/_internal/core/backends/tensordock/__init__.py +0 -15
- dstack/_internal/core/backends/tensordock/backend.py +16 -0
- dstack/_internal/core/backends/tensordock/compute.py +8 -27
- dstack/_internal/core/backends/tensordock/configurator.py +68 -0
- dstack/_internal/core/backends/tensordock/models.py +38 -0
- dstack/_internal/core/backends/vastai/__init__.py +0 -15
- dstack/_internal/core/backends/vastai/backend.py +16 -0
- dstack/_internal/core/backends/vastai/compute.py +2 -2
- dstack/_internal/core/backends/vastai/configurator.py +66 -0
- dstack/_internal/core/backends/vastai/models.py +37 -0
- dstack/_internal/core/backends/vultr/__init__.py +0 -15
- dstack/_internal/core/backends/vultr/backend.py +16 -0
- dstack/_internal/core/backends/vultr/compute.py +10 -24
- dstack/_internal/core/backends/vultr/configurator.py +64 -0
- dstack/_internal/core/backends/vultr/models.py +34 -0
- dstack/_internal/core/models/backends/__init__.py +0 -184
- dstack/_internal/core/models/backends/base.py +0 -19
- dstack/_internal/core/models/configurations.py +22 -16
- dstack/_internal/core/models/envs.py +4 -3
- dstack/_internal/core/models/fleets.py +17 -22
- dstack/_internal/core/models/gateways.py +3 -3
- dstack/_internal/core/models/instances.py +24 -0
- dstack/_internal/core/models/profiles.py +85 -45
- dstack/_internal/core/models/projects.py +1 -1
- dstack/_internal/core/models/repos/base.py +0 -5
- dstack/_internal/core/models/repos/local.py +3 -3
- dstack/_internal/core/models/repos/remote.py +26 -12
- dstack/_internal/core/models/repos/virtual.py +1 -1
- dstack/_internal/core/models/resources.py +45 -76
- dstack/_internal/core/models/runs.py +21 -19
- dstack/_internal/core/models/volumes.py +1 -3
- dstack/_internal/core/services/profiles.py +7 -16
- dstack/_internal/core/services/repos.py +0 -4
- dstack/_internal/server/app.py +11 -4
- dstack/_internal/server/background/__init__.py +10 -0
- dstack/_internal/server/background/tasks/process_gateways.py +4 -8
- dstack/_internal/server/background/tasks/process_instances.py +14 -9
- dstack/_internal/server/background/tasks/process_metrics.py +1 -1
- dstack/_internal/server/background/tasks/process_placement_groups.py +5 -1
- dstack/_internal/server/background/tasks/process_prometheus_metrics.py +135 -0
- dstack/_internal/server/background/tasks/process_running_jobs.py +80 -24
- dstack/_internal/server/background/tasks/process_runs.py +1 -0
- dstack/_internal/server/background/tasks/process_submitted_jobs.py +20 -38
- dstack/_internal/server/background/tasks/process_volumes.py +5 -2
- dstack/_internal/server/migrations/versions/60e444118b6d_add_jobprometheusmetrics.py +40 -0
- dstack/_internal/server/migrations/versions/7bc2586e8b9e_make_instancemodel_pool_id_optional.py +36 -0
- dstack/_internal/server/migrations/versions/98d1b92988bc_add_jobterminationreason_terminated_due_.py +140 -0
- dstack/_internal/server/migrations/versions/bc8ca4a505c6_store_backendtype_as_string.py +171 -0
- dstack/_internal/server/models.py +59 -9
- dstack/_internal/server/routers/backends.py +14 -23
- dstack/_internal/server/routers/instances.py +3 -4
- dstack/_internal/server/routers/metrics.py +31 -10
- dstack/_internal/server/routers/prometheus.py +36 -0
- dstack/_internal/server/routers/repos.py +1 -2
- dstack/_internal/server/routers/runs.py +13 -59
- dstack/_internal/server/schemas/gateways.py +14 -23
- dstack/_internal/server/schemas/projects.py +7 -2
- dstack/_internal/server/schemas/repos.py +2 -38
- dstack/_internal/server/schemas/runner.py +1 -0
- dstack/_internal/server/schemas/runs.py +1 -24
- dstack/_internal/server/security/permissions.py +1 -1
- dstack/_internal/server/services/backends/__init__.py +85 -158
- dstack/_internal/server/services/config.py +53 -567
- dstack/_internal/server/services/fleets.py +9 -103
- dstack/_internal/server/services/gateways/__init__.py +13 -4
- dstack/_internal/server/services/{pools.py → instances.py} +22 -329
- dstack/_internal/server/services/jobs/__init__.py +9 -6
- dstack/_internal/server/services/jobs/configurators/base.py +25 -1
- dstack/_internal/server/services/jobs/configurators/dev.py +9 -1
- dstack/_internal/server/services/jobs/configurators/extensions/cursor.py +42 -0
- dstack/_internal/server/services/metrics.py +131 -72
- dstack/_internal/server/services/offers.py +1 -1
- dstack/_internal/server/services/projects.py +23 -14
- dstack/_internal/server/services/prometheus.py +245 -0
- dstack/_internal/server/services/runner/client.py +14 -3
- dstack/_internal/server/services/runs.py +67 -31
- dstack/_internal/server/services/volumes.py +9 -4
- dstack/_internal/server/settings.py +3 -0
- dstack/_internal/server/statics/index.html +1 -1
- dstack/_internal/server/statics/{main-fe8fd9db55df8d10e648.js → main-4fd5a4770eff59325ee3.js} +68 -15
- dstack/_internal/server/statics/{main-fe8fd9db55df8d10e648.js.map → main-4fd5a4770eff59325ee3.js.map} +1 -1
- dstack/_internal/server/statics/{main-7510e71dfa9749a4e70e.css → main-da9f8c06a69c20dac23e.css} +1 -1
- dstack/_internal/server/statics/static/media/entraID.d65d1f3e9486a8e56d24fc07b3230885.svg +9 -0
- dstack/_internal/server/testing/common.py +75 -32
- dstack/_internal/utils/json_schema.py +6 -0
- dstack/_internal/utils/ssh.py +2 -1
- dstack/api/__init__.py +4 -0
- dstack/api/_public/__init__.py +16 -20
- dstack/api/_public/backends.py +1 -1
- dstack/api/_public/repos.py +36 -36
- dstack/api/_public/runs.py +170 -83
- dstack/api/server/__init__.py +11 -13
- dstack/api/server/_backends.py +12 -16
- dstack/api/server/_fleets.py +15 -55
- dstack/api/server/_gateways.py +3 -14
- dstack/api/server/_repos.py +1 -4
- dstack/api/server/_runs.py +21 -96
- dstack/api/server/_volumes.py +10 -5
- dstack/api/utils.py +3 -0
- dstack/version.py +1 -1
- {dstack-0.18.43.dist-info → dstack-0.19.0rc1.dist-info}/METADATA +10 -1
- {dstack-0.18.43.dist-info → dstack-0.19.0rc1.dist-info}/RECORD +229 -206
- tests/_internal/cli/services/configurators/test_profile.py +6 -6
- tests/_internal/core/backends/aws/test_configurator.py +35 -0
- tests/_internal/core/backends/aws/test_resources.py +1 -1
- tests/_internal/core/backends/azure/test_configurator.py +61 -0
- tests/_internal/core/backends/cudo/__init__.py +0 -0
- tests/_internal/core/backends/cudo/test_configurator.py +37 -0
- tests/_internal/core/backends/datacrunch/__init__.py +0 -0
- tests/_internal/core/backends/datacrunch/test_configurator.py +17 -0
- tests/_internal/core/backends/gcp/test_configurator.py +42 -0
- tests/_internal/core/backends/kubernetes/test_configurator.py +43 -0
- tests/_internal/core/backends/lambdalabs/__init__.py +0 -0
- tests/_internal/core/backends/lambdalabs/test_configurator.py +38 -0
- tests/_internal/core/backends/oci/test_configurator.py +55 -0
- tests/_internal/core/backends/runpod/__init__.py +0 -0
- tests/_internal/core/backends/runpod/test_configurator.py +33 -0
- tests/_internal/core/backends/tensordock/__init__.py +0 -0
- tests/_internal/core/backends/tensordock/test_configurator.py +38 -0
- tests/_internal/core/backends/vastai/__init__.py +0 -0
- tests/_internal/core/backends/vastai/test_configurator.py +33 -0
- tests/_internal/core/backends/vultr/__init__.py +0 -0
- tests/_internal/core/backends/vultr/test_configurator.py +33 -0
- tests/_internal/server/background/tasks/test_process_gateways.py +4 -0
- tests/_internal/server/background/tasks/test_process_instances.py +49 -48
- tests/_internal/server/background/tasks/test_process_metrics.py +0 -3
- tests/_internal/server/background/tasks/test_process_placement_groups.py +2 -0
- tests/_internal/server/background/tasks/test_process_prometheus_metrics.py +186 -0
- tests/_internal/server/background/tasks/test_process_running_jobs.py +123 -19
- tests/_internal/server/background/tasks/test_process_runs.py +8 -22
- tests/_internal/server/background/tasks/test_process_submitted_jobs.py +3 -40
- tests/_internal/server/background/tasks/test_process_submitted_volumes.py +2 -0
- tests/_internal/server/background/tasks/test_process_terminating_jobs.py +10 -15
- tests/_internal/server/routers/test_backends.py +6 -764
- tests/_internal/server/routers/test_fleets.py +2 -26
- tests/_internal/server/routers/test_gateways.py +27 -3
- tests/_internal/server/routers/test_instances.py +0 -10
- tests/_internal/server/routers/test_metrics.py +42 -0
- tests/_internal/server/routers/test_projects.py +56 -0
- tests/_internal/server/routers/test_prometheus.py +333 -0
- tests/_internal/server/routers/test_repos.py +0 -15
- tests/_internal/server/routers/test_runs.py +83 -275
- tests/_internal/server/routers/test_volumes.py +2 -3
- tests/_internal/server/services/backends/__init__.py +0 -0
- tests/_internal/server/services/jobs/configurators/test_task.py +35 -0
- tests/_internal/server/services/test_config.py +7 -4
- tests/_internal/server/services/test_fleets.py +1 -4
- tests/_internal/server/services/{test_pools.py → test_instances.py} +11 -49
- tests/_internal/server/services/test_metrics.py +167 -0
- tests/_internal/server/services/test_repos.py +1 -14
- tests/_internal/server/services/test_runs.py +0 -4
- dstack/_internal/cli/commands/pool.py +0 -581
- dstack/_internal/cli/commands/run.py +0 -75
- dstack/_internal/core/backends/aws/config.py +0 -18
- dstack/_internal/core/backends/azure/config.py +0 -12
- dstack/_internal/core/backends/base/config.py +0 -5
- dstack/_internal/core/backends/cudo/config.py +0 -9
- dstack/_internal/core/backends/datacrunch/config.py +0 -9
- dstack/_internal/core/backends/gcp/config.py +0 -22
- dstack/_internal/core/backends/kubernetes/config.py +0 -6
- dstack/_internal/core/backends/lambdalabs/config.py +0 -9
- dstack/_internal/core/backends/nebius/__init__.py +0 -15
- dstack/_internal/core/backends/nebius/api_client.py +0 -319
- dstack/_internal/core/backends/nebius/compute.py +0 -220
- dstack/_internal/core/backends/nebius/config.py +0 -6
- dstack/_internal/core/backends/nebius/types.py +0 -37
- dstack/_internal/core/backends/oci/config.py +0 -6
- dstack/_internal/core/backends/runpod/config.py +0 -9
- dstack/_internal/core/backends/tensordock/config.py +0 -9
- dstack/_internal/core/backends/vastai/config.py +0 -6
- dstack/_internal/core/backends/vultr/config.py +0 -9
- dstack/_internal/core/models/backends/aws.py +0 -86
- dstack/_internal/core/models/backends/azure.py +0 -68
- dstack/_internal/core/models/backends/cudo.py +0 -43
- dstack/_internal/core/models/backends/datacrunch.py +0 -44
- dstack/_internal/core/models/backends/gcp.py +0 -67
- dstack/_internal/core/models/backends/kubernetes.py +0 -40
- dstack/_internal/core/models/backends/lambdalabs.py +0 -43
- dstack/_internal/core/models/backends/nebius.py +0 -54
- dstack/_internal/core/models/backends/runpod.py +0 -40
- dstack/_internal/core/models/backends/tensordock.py +0 -44
- dstack/_internal/core/models/backends/vastai.py +0 -43
- dstack/_internal/core/models/backends/vultr.py +0 -40
- dstack/_internal/core/models/pools.py +0 -43
- dstack/_internal/server/routers/pools.py +0 -142
- dstack/_internal/server/schemas/pools.py +0 -38
- dstack/_internal/server/services/backends/configurators/base.py +0 -72
- dstack/_internal/server/services/backends/configurators/cudo.py +0 -87
- dstack/_internal/server/services/backends/configurators/datacrunch.py +0 -79
- dstack/_internal/server/services/backends/configurators/kubernetes.py +0 -63
- dstack/_internal/server/services/backends/configurators/lambdalabs.py +0 -98
- dstack/_internal/server/services/backends/configurators/nebius.py +0 -85
- dstack/_internal/server/services/backends/configurators/runpod.py +0 -97
- dstack/_internal/server/services/backends/configurators/tensordock.py +0 -82
- dstack/_internal/server/services/backends/configurators/vastai.py +0 -80
- dstack/_internal/server/services/backends/configurators/vultr.py +0 -80
- dstack/api/_public/pools.py +0 -41
- dstack/api/_public/resources.py +0 -105
- dstack/api/server/_pools.py +0 -63
- tests/_internal/server/routers/test_pools.py +0 -612
- /dstack/_internal/{server/services/backends/configurators → core/backends/dstack}/__init__.py +0 -0
- {dstack-0.18.43.dist-info → dstack-0.19.0rc1.dist-info}/LICENSE.md +0 -0
- {dstack-0.18.43.dist-info → dstack-0.19.0rc1.dist-info}/WHEEL +0 -0
- {dstack-0.18.43.dist-info → dstack-0.19.0rc1.dist-info}/entry_points.txt +0 -0
- {dstack-0.18.43.dist-info → dstack-0.19.0rc1.dist-info}/top_level.txt +0 -0
|
@@ -28,9 +28,9 @@ from dstack._internal.server.background.tasks.process_instances import (
|
|
|
28
28
|
process_instances,
|
|
29
29
|
)
|
|
30
30
|
from dstack._internal.server.testing.common import (
|
|
31
|
+
ComputeMockSpec,
|
|
31
32
|
create_instance,
|
|
32
33
|
create_job,
|
|
33
|
-
create_pool,
|
|
34
34
|
create_project,
|
|
35
35
|
create_repo,
|
|
36
36
|
create_run,
|
|
@@ -49,10 +49,10 @@ class TestCheckShim:
|
|
|
49
49
|
self, test_db, session: AsyncSession
|
|
50
50
|
):
|
|
51
51
|
project = await create_project(session=session)
|
|
52
|
-
pool = await create_pool(session, project)
|
|
53
|
-
|
|
54
52
|
instance = await create_instance(
|
|
55
|
-
session,
|
|
53
|
+
session=session,
|
|
54
|
+
project=project,
|
|
55
|
+
status=InstanceStatus.PROVISIONING,
|
|
56
56
|
)
|
|
57
57
|
instance.termination_deadline = get_current_datetime() + dt.timedelta(days=1)
|
|
58
58
|
instance.health_status = "ssh connect problem"
|
|
@@ -78,10 +78,10 @@ class TestCheckShim:
|
|
|
78
78
|
self, test_db, session: AsyncSession
|
|
79
79
|
):
|
|
80
80
|
project = await create_project(session=session)
|
|
81
|
-
pool = await create_pool(session, project)
|
|
82
|
-
|
|
83
81
|
instance = await create_instance(
|
|
84
|
-
session,
|
|
82
|
+
session=session,
|
|
83
|
+
project=project,
|
|
84
|
+
status=InstanceStatus.PROVISIONING,
|
|
85
85
|
)
|
|
86
86
|
instance.started_at = get_current_datetime() + dt.timedelta(minutes=-20)
|
|
87
87
|
instance.health_status = "ssh connect problem"
|
|
@@ -110,7 +110,6 @@ class TestCheckShim:
|
|
|
110
110
|
):
|
|
111
111
|
user = await create_user(session=session)
|
|
112
112
|
project = await create_project(session=session, owner=user)
|
|
113
|
-
pool = await create_pool(session, project)
|
|
114
113
|
repo = await create_repo(
|
|
115
114
|
session=session,
|
|
116
115
|
project_id=project.id,
|
|
@@ -121,9 +120,10 @@ class TestCheckShim:
|
|
|
121
120
|
repo=repo,
|
|
122
121
|
user=user,
|
|
123
122
|
)
|
|
124
|
-
|
|
125
123
|
instance = await create_instance(
|
|
126
|
-
session,
|
|
124
|
+
session=session,
|
|
125
|
+
project=project,
|
|
126
|
+
status=InstanceStatus.PROVISIONING,
|
|
127
127
|
)
|
|
128
128
|
instance.termination_deadline = get_current_datetime().replace(
|
|
129
129
|
tzinfo=dt.timezone.utc
|
|
@@ -158,10 +158,11 @@ class TestCheckShim:
|
|
|
158
158
|
@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True)
|
|
159
159
|
async def test_check_shim_start_termination_deadline(self, test_db, session: AsyncSession):
|
|
160
160
|
project = await create_project(session=session)
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
161
|
+
instance = await create_instance(
|
|
162
|
+
session=session,
|
|
163
|
+
project=project,
|
|
164
|
+
status=InstanceStatus.IDLE,
|
|
165
|
+
)
|
|
165
166
|
health_status = "SSH connection fail"
|
|
166
167
|
with patch(
|
|
167
168
|
"dstack._internal.server.background.tasks.process_instances._instance_healthcheck"
|
|
@@ -183,9 +184,11 @@ class TestCheckShim:
|
|
|
183
184
|
@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True)
|
|
184
185
|
async def test_check_shim_stop_termination_deadline(self, test_db, session: AsyncSession):
|
|
185
186
|
project = await create_project(session=session)
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
187
|
+
instance = await create_instance(
|
|
188
|
+
session=session,
|
|
189
|
+
project=project,
|
|
190
|
+
status=InstanceStatus.IDLE,
|
|
191
|
+
)
|
|
189
192
|
instance.termination_deadline = get_current_datetime() + dt.timedelta(minutes=19)
|
|
190
193
|
await session.commit()
|
|
191
194
|
|
|
@@ -206,9 +209,11 @@ class TestCheckShim:
|
|
|
206
209
|
@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True)
|
|
207
210
|
async def test_check_shim_terminate_instance_by_dedaline(self, test_db, session: AsyncSession):
|
|
208
211
|
project = await create_project(session=session)
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
+
instance = await create_instance(
|
|
213
|
+
session=session,
|
|
214
|
+
project=project,
|
|
215
|
+
status=InstanceStatus.IDLE,
|
|
216
|
+
)
|
|
212
217
|
termination_deadline_time = get_current_datetime() + dt.timedelta(minutes=-19)
|
|
213
218
|
instance.termination_deadline = termination_deadline_time
|
|
214
219
|
await session.commit()
|
|
@@ -251,7 +256,6 @@ class TestCheckShim:
|
|
|
251
256
|
):
|
|
252
257
|
# see https://github.com/dstackai/dstack/issues/2041
|
|
253
258
|
project = await create_project(session=session)
|
|
254
|
-
pool = await create_pool(session, project)
|
|
255
259
|
if has_job:
|
|
256
260
|
user = await create_user(session=session)
|
|
257
261
|
repo = await create_repo(
|
|
@@ -272,9 +276,8 @@ class TestCheckShim:
|
|
|
272
276
|
else:
|
|
273
277
|
job = None
|
|
274
278
|
instance = await create_instance(
|
|
275
|
-
session,
|
|
276
|
-
project,
|
|
277
|
-
pool,
|
|
279
|
+
session=session,
|
|
280
|
+
project=project,
|
|
278
281
|
created_at=get_current_datetime(),
|
|
279
282
|
termination_policy=termination_policy,
|
|
280
283
|
status=InstanceStatus.IDLE,
|
|
@@ -302,8 +305,9 @@ class TestTerminateIdleTime:
|
|
|
302
305
|
@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True)
|
|
303
306
|
async def test_terminate_by_idle_timeout(self, test_db, session: AsyncSession):
|
|
304
307
|
project = await create_project(session=session)
|
|
305
|
-
|
|
306
|
-
|
|
308
|
+
instance = await create_instance(
|
|
309
|
+
session=session, project=project, status=InstanceStatus.IDLE
|
|
310
|
+
)
|
|
307
311
|
instance.termination_idle_time = 300
|
|
308
312
|
instance.termination_policy = TerminationPolicy.DESTROY_AFTER_IDLE
|
|
309
313
|
instance.last_job_processed_at = get_current_datetime() + dt.timedelta(minutes=-19)
|
|
@@ -320,11 +324,9 @@ class TestSSHInstanceTerminateProvisionTimeoutExpired:
|
|
|
320
324
|
@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True)
|
|
321
325
|
async def test_terminate_by_idle_timeout(self, test_db, session: AsyncSession):
|
|
322
326
|
project = await create_project(session=session)
|
|
323
|
-
pool = await create_pool(session, project)
|
|
324
327
|
instance = await create_instance(
|
|
325
|
-
session,
|
|
326
|
-
project,
|
|
327
|
-
pool,
|
|
328
|
+
session=session,
|
|
329
|
+
project=project,
|
|
328
330
|
status=InstanceStatus.PENDING,
|
|
329
331
|
created_at=get_current_datetime() - dt.timedelta(days=100),
|
|
330
332
|
)
|
|
@@ -357,10 +359,9 @@ class TestTerminate:
|
|
|
357
359
|
@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True)
|
|
358
360
|
async def test_terminate(self, test_db, session: AsyncSession):
|
|
359
361
|
project = await create_project(session=session)
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
362
|
+
instance = await create_instance(
|
|
363
|
+
session=session, project=project, status=InstanceStatus.TERMINATING
|
|
364
|
+
)
|
|
364
365
|
reason = "some reason"
|
|
365
366
|
instance.termination_reason = reason
|
|
366
367
|
instance.last_job_processed_at = get_current_datetime() + dt.timedelta(minutes=-19)
|
|
@@ -384,8 +385,9 @@ class TestTerminate:
|
|
|
384
385
|
@pytest.mark.parametrize("error", [BackendError("err"), RuntimeError("err")])
|
|
385
386
|
async def test_terminate_retry(self, test_db, session: AsyncSession, error: Exception):
|
|
386
387
|
project = await create_project(session=session)
|
|
387
|
-
|
|
388
|
-
|
|
388
|
+
instance = await create_instance(
|
|
389
|
+
session=session, project=project, status=InstanceStatus.TERMINATING
|
|
390
|
+
)
|
|
389
391
|
instance.termination_reason = "some reason"
|
|
390
392
|
initial_time = dt.datetime(2025, 1, 1, tzinfo=dt.timezone.utc)
|
|
391
393
|
instance.last_job_processed_at = initial_time
|
|
@@ -415,8 +417,9 @@ class TestTerminate:
|
|
|
415
417
|
@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True)
|
|
416
418
|
async def test_terminate_not_retries_if_too_early(self, test_db, session: AsyncSession):
|
|
417
419
|
project = await create_project(session=session)
|
|
418
|
-
|
|
419
|
-
|
|
420
|
+
instance = await create_instance(
|
|
421
|
+
session=session, project=project, status=InstanceStatus.TERMINATING
|
|
422
|
+
)
|
|
420
423
|
instance.termination_reason = "some reason"
|
|
421
424
|
initial_time = dt.datetime(2025, 1, 1, tzinfo=dt.timezone.utc)
|
|
422
425
|
instance.last_job_processed_at = initial_time
|
|
@@ -446,8 +449,9 @@ class TestTerminate:
|
|
|
446
449
|
@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True)
|
|
447
450
|
async def test_terminate_on_termination_deadline(self, test_db, session: AsyncSession):
|
|
448
451
|
project = await create_project(session=session)
|
|
449
|
-
|
|
450
|
-
|
|
452
|
+
instance = await create_instance(
|
|
453
|
+
session=session, project=project, status=InstanceStatus.TERMINATING
|
|
454
|
+
)
|
|
451
455
|
instance.termination_reason = "some reason"
|
|
452
456
|
initial_time = dt.datetime(2025, 1, 1, tzinfo=dt.timezone.utc)
|
|
453
457
|
instance.last_job_processed_at = initial_time
|
|
@@ -505,11 +509,9 @@ class TestCreateInstance:
|
|
|
505
509
|
expected_blocks: int,
|
|
506
510
|
):
|
|
507
511
|
project = await create_project(session=session)
|
|
508
|
-
pool = await create_pool(session, project)
|
|
509
512
|
instance = await create_instance(
|
|
510
|
-
session,
|
|
511
|
-
project,
|
|
512
|
-
pool,
|
|
513
|
+
session=session,
|
|
514
|
+
project=project,
|
|
513
515
|
status=InstanceStatus.PENDING,
|
|
514
516
|
total_blocks=requested_blocks,
|
|
515
517
|
busy_blocks=0,
|
|
@@ -531,6 +533,7 @@ class TestCreateInstance:
|
|
|
531
533
|
price=1.0,
|
|
532
534
|
availability=InstanceAvailability.AVAILABLE,
|
|
533
535
|
)
|
|
536
|
+
backend_mock.compute.return_value = Mock(spec=ComputeMockSpec)
|
|
534
537
|
backend_mock.compute.return_value.get_offers_cached.return_value = [offer]
|
|
535
538
|
backend_mock.compute.return_value.create_instance.return_value = JobProvisioningData(
|
|
536
539
|
backend=offer.backend,
|
|
@@ -611,11 +614,9 @@ class TestAddSSHInstance:
|
|
|
611
614
|
host_info["cpus"] = cpus
|
|
612
615
|
host_info["gpu_count"] = gpus
|
|
613
616
|
project = await create_project(session=session)
|
|
614
|
-
pool = await create_pool(session, project)
|
|
615
617
|
instance = await create_instance(
|
|
616
|
-
session,
|
|
617
|
-
project,
|
|
618
|
-
pool,
|
|
618
|
+
session=session,
|
|
619
|
+
project=project,
|
|
619
620
|
status=InstanceStatus.PENDING,
|
|
620
621
|
created_at=get_current_datetime(),
|
|
621
622
|
remote_connection_info=get_remote_connection_info(),
|
|
@@ -21,7 +21,6 @@ from dstack._internal.server.testing.common import (
|
|
|
21
21
|
create_instance,
|
|
22
22
|
create_job,
|
|
23
23
|
create_job_metrics_point,
|
|
24
|
-
create_pool,
|
|
25
24
|
create_project,
|
|
26
25
|
create_repo,
|
|
27
26
|
create_run,
|
|
@@ -45,11 +44,9 @@ class TestCollectMetrics:
|
|
|
45
44
|
session=session,
|
|
46
45
|
project_id=project.id,
|
|
47
46
|
)
|
|
48
|
-
pool = await create_pool(session=session, project=project)
|
|
49
47
|
instance = await create_instance(
|
|
50
48
|
session=session,
|
|
51
49
|
project=project,
|
|
52
|
-
pool=pool,
|
|
53
50
|
status=InstanceStatus.BUSY,
|
|
54
51
|
)
|
|
55
52
|
run = await create_run(
|
|
@@ -7,6 +7,7 @@ from dstack._internal.server.background.tasks.process_placement_groups import (
|
|
|
7
7
|
process_placement_groups,
|
|
8
8
|
)
|
|
9
9
|
from dstack._internal.server.testing.common import (
|
|
10
|
+
ComputeMockSpec,
|
|
10
11
|
create_fleet,
|
|
11
12
|
create_placement_group,
|
|
12
13
|
create_project,
|
|
@@ -34,6 +35,7 @@ class TestProcessPlacementGroups:
|
|
|
34
35
|
with patch("dstack._internal.server.services.backends.get_project_backend_by_type") as m:
|
|
35
36
|
aws_mock = Mock()
|
|
36
37
|
m.return_value = aws_mock
|
|
38
|
+
aws_mock.compute.return_value = Mock(spec=ComputeMockSpec)
|
|
37
39
|
await process_placement_groups()
|
|
38
40
|
aws_mock.compute.return_value.delete_placement_group.assert_called_once()
|
|
39
41
|
await session.refresh(placement_group1)
|
|
@@ -0,0 +1,186 @@
|
|
|
1
|
+
from collections.abc import Generator
|
|
2
|
+
from datetime import datetime, timezone
|
|
3
|
+
from unittest.mock import Mock, patch
|
|
4
|
+
|
|
5
|
+
import pytest
|
|
6
|
+
import pytest_asyncio
|
|
7
|
+
from freezegun import freeze_time
|
|
8
|
+
from sqlalchemy import select
|
|
9
|
+
from sqlalchemy.ext.asyncio import AsyncSession
|
|
10
|
+
|
|
11
|
+
from dstack._internal.core.models.instances import InstanceStatus
|
|
12
|
+
from dstack._internal.core.models.runs import JobStatus
|
|
13
|
+
from dstack._internal.core.models.users import GlobalRole, ProjectRole
|
|
14
|
+
from dstack._internal.server.background.tasks.process_prometheus_metrics import (
|
|
15
|
+
collect_prometheus_metrics,
|
|
16
|
+
delete_prometheus_metrics,
|
|
17
|
+
)
|
|
18
|
+
from dstack._internal.server.models import JobModel, JobPrometheusMetrics
|
|
19
|
+
from dstack._internal.server.services.projects import add_project_member
|
|
20
|
+
from dstack._internal.server.testing.common import (
|
|
21
|
+
create_instance,
|
|
22
|
+
create_job,
|
|
23
|
+
create_job_prometheus_metrics,
|
|
24
|
+
create_project,
|
|
25
|
+
create_repo,
|
|
26
|
+
create_run,
|
|
27
|
+
create_user,
|
|
28
|
+
get_job_provisioning_data,
|
|
29
|
+
)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
@pytest.mark.asyncio
@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True)
@pytest.mark.usefixtures("test_db", "image_config_mock")
class TestCollectPrometheusMetrics:
    """Tests for the ``collect_prometheus_metrics`` background task.

    Every test is run once per database backend listed in the ``test_db``
    parametrization. The SSH tunnel class and the shim client class are
    replaced with mocks by the fixtures defined below.
    """

    # Instant at which every test in this class freezes the clock.
    _FROZEN_NOW = datetime(2023, 1, 2, 3, 5, 20, tzinfo=timezone.utc)

    @pytest_asyncio.fixture
    async def job(self, session: AsyncSession) -> JobModel:
        """Create a RUNNING job that is assigned to a BUSY instance."""
        owner = await create_user(session=session, global_role=GlobalRole.USER)
        project = await create_project(session=session, owner=owner)
        await add_project_member(
            session=session, project=project, user=owner, project_role=ProjectRole.USER
        )
        repo = await create_repo(session=session, project_id=project.id)
        busy_instance = await create_instance(
            session=session, project=project, status=InstanceStatus.BUSY
        )
        run = await create_run(session=session, project=project, repo=repo, user=owner)
        return await create_job(
            session=session,
            run=run,
            status=JobStatus.RUNNING,
            job_provisioning_data=get_job_provisioning_data(),
            instance_assigned=True,
            instance=busy_instance,
        )

    @pytest.fixture
    def ssh_tunnel_mock(self) -> Generator[Mock, None, None]:
        """Patch ``SSHTunnel`` and yield the mock that stands in for the class."""
        tunnel_patcher = patch("dstack._internal.server.services.runner.ssh.SSHTunnel")
        with tunnel_patcher as tunnel_cls:
            yield tunnel_cls

    @pytest.fixture
    def shim_client_mock(self) -> Generator[Mock, None, None]:
        """Patch ``ShimClient`` and yield the mock returned when it is instantiated."""
        client_patcher = patch("dstack._internal.server.services.runner.client.ShimClient")
        with client_patcher as client_cls:
            yield client_cls.return_value

    @staticmethod
    async def _stored_metrics(
        session: AsyncSession, job: JobModel, *, refresh: bool = False
    ) -> JobPrometheusMetrics:
        """Return the single metrics row stored for ``job``.

        With ``refresh=True`` the query carries ``populate_existing=True`` so
        that an object already present in the session is reloaded from the
        query result instead of being returned with its cached state.
        """
        query = select(JobPrometheusMetrics).where(JobPrometheusMetrics.job_id == job.id)
        if refresh:
            query = query.execution_options(populate_existing=True)
        result = await session.execute(query)
        return result.scalar_one()

    @freeze_time(_FROZEN_NOW)
    async def test_inserts_new_record(
        self, session: AsyncSession, job: JobModel, ssh_tunnel_mock: Mock, shim_client_mock: Mock
    ):
        """With no stored metrics, a row is created from the shim response."""
        shim_client_mock.get_task_metrics.return_value = "# prom response"

        await collect_prometheus_metrics()

        ssh_tunnel_mock.assert_called_once()
        shim_client_mock.get_task_metrics.assert_called_once()
        stored = await self._stored_metrics(session, job)
        assert stored.text == "# prom response"
        assert stored.collected_at == datetime(2023, 1, 2, 3, 5, 20)

    @freeze_time(_FROZEN_NOW)
    async def test_updates_record(
        self, session: AsyncSession, job: JobModel, ssh_tunnel_mock: Mock, shim_client_mock: Mock
    ):
        """A row collected 20 seconds before the frozen clock is overwritten."""
        record = await create_job_prometheus_metrics(
            session=session,
            job=job,
            collected_at=datetime(2023, 1, 2, 3, 5, 0),
            text="# prom old response",
        )
        shim_client_mock.get_task_metrics.return_value = "# prom new response"

        await collect_prometheus_metrics()

        ssh_tunnel_mock.assert_called_once()
        shim_client_mock.get_task_metrics.assert_called_once()
        record = await self._stored_metrics(session, job, refresh=True)
        assert record.text == "# prom new response"
        assert record.collected_at == datetime(2023, 1, 2, 3, 5, 20)

    @freeze_time(_FROZEN_NOW)
    async def test_skips_recently_updated(
        self, session: AsyncSession, job: JobModel, ssh_tunnel_mock: Mock, shim_client_mock: Mock
    ):
        """A row collected 5 seconds before the frozen clock is left untouched."""
        record = await create_job_prometheus_metrics(
            session=session,
            job=job,
            collected_at=datetime(2023, 1, 2, 3, 5, 15),
            text="# prom old response",
        )
        shim_client_mock.get_task_metrics.return_value = "# prom new response"

        await collect_prometheus_metrics()

        # Neither the tunnel nor the shim client may be touched for a fresh row.
        ssh_tunnel_mock.assert_not_called()
        shim_client_mock.get_task_metrics.assert_not_called()
        record = await self._stored_metrics(session, job, refresh=True)
        assert record.text == "# prom old response"
        assert record.collected_at == datetime(2023, 1, 2, 3, 5, 15)
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
@pytest.mark.asyncio
@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True)
@pytest.mark.usefixtures("test_db", "image_config_mock")
class TestDeletePrometheusMetrics:
    """Tests for the ``delete_prometheus_metrics`` background task."""

    @freeze_time(datetime(2023, 1, 2, 3, 5, 20, tzinfo=timezone.utc))
    async def test_deletes_old_metrics(self, session: AsyncSession):
        """Only the recently collected metrics row survives the cleanup."""
        owner = await create_user(session=session, global_role=GlobalRole.USER)
        project = await create_project(session=session, owner=owner)
        await add_project_member(
            session=session, project=project, user=owner, project_role=ProjectRole.USER
        )
        repo = await create_repo(session=session, project_id=project.id)

        # First job: metrics collected at 02:03:30, over an hour before the frozen clock.
        old_run = await create_run(
            session=session, project=project, repo=repo, user=owner, run_name="run-1"
        )
        old_job = await create_job(session=session, run=old_run)
        await create_job_prometheus_metrics(
            session=session,
            job=old_job,
            collected_at=datetime(2023, 1, 2, 2, 3, 30),
        )

        # Second job: metrics collected at 03:05:00, 20 seconds before the frozen clock.
        recent_run = await create_run(
            session=session, project=project, repo=repo, user=owner, run_name="run-2"
        )
        recent_job = await create_job(session=session, run=recent_run)
        recent_metrics = await create_job_prometheus_metrics(
            session=session,
            job=recent_job,
            collected_at=datetime(2023, 1, 2, 3, 5, 0),
        )

        await delete_prometheus_metrics()

        project_metrics_query = (
            select(JobPrometheusMetrics).join(JobModel).where(JobModel.project_id == project.id)
        )
        result = await session.execute(project_metrics_query)
        remaining = result.scalars().all()
        # Exactly one row must remain, and it must be the recent one.
        assert list(remaining) == [recent_metrics]
|