dstack 0.18.40rc1__py3-none-any.whl → 0.18.42__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dstack/_internal/cli/commands/apply.py +8 -5
- dstack/_internal/cli/services/configurators/base.py +4 -2
- dstack/_internal/cli/services/configurators/fleet.py +21 -9
- dstack/_internal/cli/services/configurators/gateway.py +15 -0
- dstack/_internal/cli/services/configurators/run.py +6 -5
- dstack/_internal/cli/services/configurators/volume.py +15 -0
- dstack/_internal/cli/services/repos.py +3 -3
- dstack/_internal/cli/utils/fleet.py +44 -33
- dstack/_internal/cli/utils/run.py +27 -7
- dstack/_internal/cli/utils/volume.py +30 -9
- dstack/_internal/core/backends/aws/compute.py +94 -53
- dstack/_internal/core/backends/aws/resources.py +22 -12
- dstack/_internal/core/backends/azure/compute.py +2 -0
- dstack/_internal/core/backends/base/compute.py +20 -2
- dstack/_internal/core/backends/gcp/compute.py +32 -24
- dstack/_internal/core/backends/gcp/resources.py +0 -15
- dstack/_internal/core/backends/oci/compute.py +10 -5
- dstack/_internal/core/backends/oci/resources.py +23 -26
- dstack/_internal/core/backends/remote/provisioning.py +65 -27
- dstack/_internal/core/backends/runpod/compute.py +1 -0
- dstack/_internal/core/models/backends/azure.py +3 -1
- dstack/_internal/core/models/configurations.py +24 -1
- dstack/_internal/core/models/fleets.py +46 -0
- dstack/_internal/core/models/instances.py +5 -1
- dstack/_internal/core/models/pools.py +4 -1
- dstack/_internal/core/models/profiles.py +10 -4
- dstack/_internal/core/models/runs.py +23 -3
- dstack/_internal/core/models/volumes.py +26 -0
- dstack/_internal/core/services/ssh/attach.py +92 -53
- dstack/_internal/core/services/ssh/tunnel.py +58 -31
- dstack/_internal/proxy/gateway/routers/registry.py +2 -0
- dstack/_internal/proxy/gateway/schemas/registry.py +2 -0
- dstack/_internal/proxy/gateway/services/registry.py +4 -0
- dstack/_internal/proxy/lib/models.py +3 -0
- dstack/_internal/proxy/lib/services/service_connection.py +8 -1
- dstack/_internal/server/background/tasks/process_instances.py +73 -35
- dstack/_internal/server/background/tasks/process_metrics.py +9 -9
- dstack/_internal/server/background/tasks/process_running_jobs.py +77 -26
- dstack/_internal/server/background/tasks/process_runs.py +2 -12
- dstack/_internal/server/background/tasks/process_submitted_jobs.py +121 -49
- dstack/_internal/server/background/tasks/process_terminating_jobs.py +14 -3
- dstack/_internal/server/background/tasks/process_volumes.py +11 -1
- dstack/_internal/server/migrations/versions/1338b788b612_reverse_job_instance_relationship.py +71 -0
- dstack/_internal/server/migrations/versions/1e76fb0dde87_add_jobmodel_inactivity_secs.py +32 -0
- dstack/_internal/server/migrations/versions/51d45659d574_add_instancemodel_blocks_fields.py +43 -0
- dstack/_internal/server/migrations/versions/63c3f19cb184_add_jobterminationreason_inactivity_.py +83 -0
- dstack/_internal/server/migrations/versions/a751ef183f27_move_attachment_data_to_volumes_.py +34 -0
- dstack/_internal/server/models.py +27 -23
- dstack/_internal/server/routers/runs.py +1 -0
- dstack/_internal/server/schemas/runner.py +1 -0
- dstack/_internal/server/services/backends/configurators/azure.py +34 -8
- dstack/_internal/server/services/config.py +9 -0
- dstack/_internal/server/services/fleets.py +32 -3
- dstack/_internal/server/services/gateways/client.py +9 -1
- dstack/_internal/server/services/jobs/__init__.py +217 -45
- dstack/_internal/server/services/jobs/configurators/base.py +47 -2
- dstack/_internal/server/services/offers.py +96 -10
- dstack/_internal/server/services/pools.py +98 -14
- dstack/_internal/server/services/proxy/repo.py +17 -3
- dstack/_internal/server/services/runner/client.py +9 -6
- dstack/_internal/server/services/runner/ssh.py +33 -5
- dstack/_internal/server/services/runs.py +48 -179
- dstack/_internal/server/services/services/__init__.py +9 -1
- dstack/_internal/server/services/volumes.py +68 -9
- dstack/_internal/server/statics/index.html +1 -1
- dstack/_internal/server/statics/{main-11ec5e4a00ea6ec833e3.js → main-2ac66bfcbd2e39830b88.js} +30 -31
- dstack/_internal/server/statics/{main-11ec5e4a00ea6ec833e3.js.map → main-2ac66bfcbd2e39830b88.js.map} +1 -1
- dstack/_internal/server/statics/{main-fc56d1f4af8e57522a1c.css → main-ad5150a441de98cd8987.css} +1 -1
- dstack/_internal/server/testing/common.py +130 -61
- dstack/_internal/utils/common.py +22 -8
- dstack/_internal/utils/env.py +14 -0
- dstack/_internal/utils/ssh.py +1 -1
- dstack/api/server/_fleets.py +25 -1
- dstack/api/server/_runs.py +23 -2
- dstack/api/server/_volumes.py +12 -1
- dstack/version.py +1 -1
- {dstack-0.18.40rc1.dist-info → dstack-0.18.42.dist-info}/METADATA +1 -1
- {dstack-0.18.40rc1.dist-info → dstack-0.18.42.dist-info}/RECORD +104 -93
- tests/_internal/cli/services/configurators/test_profile.py +3 -3
- tests/_internal/core/services/ssh/test_tunnel.py +56 -4
- tests/_internal/proxy/gateway/routers/test_registry.py +30 -7
- tests/_internal/server/background/tasks/test_process_instances.py +138 -20
- tests/_internal/server/background/tasks/test_process_metrics.py +12 -0
- tests/_internal/server/background/tasks/test_process_running_jobs.py +193 -0
- tests/_internal/server/background/tasks/test_process_runs.py +27 -3
- tests/_internal/server/background/tasks/test_process_submitted_jobs.py +53 -6
- tests/_internal/server/background/tasks/test_process_terminating_jobs.py +135 -17
- tests/_internal/server/routers/test_fleets.py +15 -2
- tests/_internal/server/routers/test_pools.py +6 -0
- tests/_internal/server/routers/test_runs.py +27 -0
- tests/_internal/server/routers/test_volumes.py +9 -2
- tests/_internal/server/services/jobs/__init__.py +0 -0
- tests/_internal/server/services/jobs/configurators/__init__.py +0 -0
- tests/_internal/server/services/jobs/configurators/test_base.py +72 -0
- tests/_internal/server/services/runner/test_client.py +22 -3
- tests/_internal/server/services/test_offers.py +167 -0
- tests/_internal/server/services/test_pools.py +109 -1
- tests/_internal/server/services/test_runs.py +5 -41
- tests/_internal/utils/test_common.py +21 -0
- tests/_internal/utils/test_env.py +38 -0
- {dstack-0.18.40rc1.dist-info → dstack-0.18.42.dist-info}/LICENSE.md +0 -0
- {dstack-0.18.40rc1.dist-info → dstack-0.18.42.dist-info}/WHEEL +0 -0
- {dstack-0.18.40rc1.dist-info → dstack-0.18.42.dist-info}/entry_points.txt +0 -0
- {dstack-0.18.40rc1.dist-info → dstack-0.18.42.dist-info}/top_level.txt +0 -0
tests/_internal/core/services/ssh/test_tunnel.py

@@ -32,7 +32,9 @@ class TestSSHTunnel:
             options={"Opt1": "opt1"},
             ssh_config_path="/home/user/.ssh/config",
             port=10022,
-
+            ssh_proxies=[
+                (SSHConnectionParams(hostname="proxy", username="test", port=10022), None)
+            ],
             forwarded_sockets=[SocketPair(UnixSocket("/1"), UnixSocket("/2"))],
             reverse_forwarded_sockets=[SocketPair(UnixSocket("/1"), UnixSocket("/2"))],
         )
@@ -105,13 +107,18 @@ class TestSSHTunnel:
         )
 
     @pytest.mark.usefixtures("ssh_client_info")
-    def
+    def test_open_command_with_one_proxy(self) -> None:
         tunnel = SSHTunnel(
             destination="ubuntu@my-server",
             identity=FilePath("/home/user/.ssh/id_rsa"),
             control_sock_path="/tmp/control.sock",
             options={},
-
+            ssh_proxies=[
+                (
+                    SSHConnectionParams(hostname="proxy", username="test", port=10022),
+                    FilePath("/home/user/.ssh/proxy"),
+                )
+            ],
         )
         assert tunnel.open_command() == [
             "/usr/bin/ssh",
@@ -130,12 +137,57 @@ class TestSSHTunnel:
             "-o",
             (
                 "ProxyCommand="
-                "/usr/bin/ssh -i /home/user/.ssh/
+                "/usr/bin/ssh -i /home/user/.ssh/proxy -W %h:%p -o StrictHostKeyChecking=no"
                 " -o UserKnownHostsFile=/dev/null -p 10022 test@proxy"
             ),
             "ubuntu@my-server",
         ]
 
+    @pytest.mark.usefixtures("ssh_client_info")
+    def test_open_command_with_two_proxies(self) -> None:
+        tunnel = SSHTunnel(
+            destination="ubuntu@my-server",
+            identity=FilePath("/home/user/.ssh/id_rsa"),
+            control_sock_path="/tmp/control.sock",
+            options={},
+            ssh_proxies=[
+                (
+                    SSHConnectionParams(hostname="proxy1", username="test1", port=10022),
+                    None,
+                ),
+                (
+                    SSHConnectionParams(hostname="proxy2", username="test2", port=20022),
+                    FilePath("/home/user/.ssh/proxy2"),
+                ),
+            ],
+        )
+        assert tunnel.open_command() == [
+            "/usr/bin/ssh",
+            "-F",
+            "none",
+            "-i",
+            "/home/user/.ssh/id_rsa",
+            "-E",
+            f"{tunnel.temp_dir.name}/tunnel.log",
+            "-N",
+            "-f",
+            "-o",
+            "ControlMaster=auto",
+            "-S",
+            "/tmp/control.sock",
+            "-o",
+            (
+                "ProxyCommand="
+                "/usr/bin/ssh -i /home/user/.ssh/proxy2 -W %h:%p -o StrictHostKeyChecking=no"
+                " -o UserKnownHostsFile=/dev/null"
+                " -o 'ProxyCommand=/usr/bin/ssh -i /home/user/.ssh/id_rsa -W %%h:%%p"
+                " -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null"
+                " -p 10022 test1@proxy1'"
+                " -p 20022 test2@proxy2"
+            ),
+            "ubuntu@my-server",
+        ]
+
     @pytest.mark.usefixtures("ssh_client_info")
     def test_open_command_with_forwarding(self) -> None:
         tunnel = SSHTunnel(
tests/_internal/proxy/gateway/routers/test_registry.py

@@ -48,6 +48,24 @@ def register_replica_payload(job_id: str = "xxx-xxx") -> dict:
         "ssh_host": "host.test",
         "ssh_port": 22,
         "ssh_proxy": None,
+        "ssh_head_proxy": None,
+        "ssh_head_proxy_private_key": None,
+    }
+
+
+def register_replica_payload_with_head_proxy(job_id: str = "xxx-xxx") -> dict:
+    return {
+        "job_id": job_id,
+        "app_port": 8888,
+        "ssh_host": "host.test",
+        "ssh_port": 22,
+        "ssh_proxy": None,
+        "ssh_head_proxy": {
+            "hostname": "proxy.test",
+            "username": "debian",
+            "port": 222,
+        },
+        "ssh_head_proxy_private_key": "private-key",
     }
 
 
@@ -190,13 +208,18 @@ class TestRegisterReplica:
         conf = (tmp_path / "443-test-run.gtw.test.conf").read_text()
         assert "upstream test-run" not in conf
         # register 2 replicas
-
-
-
-
-
-
-
+        resp = await client.post(
+            "/api/registry/test-proj/services/test-run/replicas/register",
+            json=register_replica_payload(job_id="xxx-xxx"),
+        )
+        assert resp.status_code == 200
+        assert resp.json() == {"status": "ok"}
+        resp = await client.post(
+            "/api/registry/test-proj/services/test-run/replicas/register",
+            json=register_replica_payload_with_head_proxy(job_id="yyy-yyy"),
+        )
+        assert resp.status_code == 200
+        assert resp.json() == {"status": "ok"}
         conf = (tmp_path / "443-test-run.gtw.test.conf").read_text()
         assert "upstream test-run" in conf
         assert (m1 := re.search(r"server unix:/(.+)/replica.sock; # replica xxx-xxx", conf))
tests/_internal/server/background/tasks/test_process_instances.py

@@ -3,6 +3,7 @@ from contextlib import contextmanager
 from typing import Optional
 from unittest.mock import Mock, patch
 
+import gpuhunt
 import pytest
 from freezegun import freeze_time
 from sqlalchemy.ext.asyncio import AsyncSession
@@ -10,6 +11,7 @@ from sqlalchemy.ext.asyncio import AsyncSession
 from dstack._internal.core.errors import BackendError
 from dstack._internal.core.models.backends.base import BackendType
 from dstack._internal.core.models.instances import (
+    Gpu,
     InstanceAvailability,
     InstanceOfferWithAvailability,
     InstanceStatus,
@@ -33,6 +35,7 @@ from dstack._internal.server.testing.common import (
     create_repo,
     create_run,
     create_user,
+    get_remote_connection_info,
 )
 from dstack._internal.utils.common import get_current_datetime
 
@@ -118,11 +121,6 @@ class TestCheckShim:
             repo=repo,
             user=user,
         )
-        job = await create_job(
-            session=session,
-            run=run,
-            status=JobStatus.SUBMITTED,
-        )
 
         instance = await create_instance(
             session, project, pool, status=InstanceStatus.PROVISIONING
@@ -131,7 +129,13 @@ class TestCheckShim:
             tzinfo=dt.timezone.utc
         ) + dt.timedelta(days=1)
         instance.health_status = "ssh connect problem"
-
+
+        job = await create_job(
+            session=session,
+            run=run,
+            status=JobStatus.SUBMITTED,
+            instance=instance,
+        )
 
         await session.commit()
 
@@ -142,12 +146,13 @@ class TestCheckShim:
         await process_instances()
 
         await session.refresh(instance)
+        await session.refresh(job)
 
         assert instance is not None
         assert instance.status == InstanceStatus.BUSY
         assert instance.termination_deadline is None
         assert instance.health_status is None
-        assert instance
+        assert job.instance == instance
 
     @pytest.mark.asyncio
     @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True)
@@ -310,7 +315,7 @@ class TestTerminateIdleTime:
         assert instance.termination_reason == "Idle timeout"
 
 
-class
+class TestSSHInstanceTerminateProvisionTimeoutExpired:
     @pytest.mark.asyncio
     @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True)
     async def test_terminate_by_idle_timeout(self, test_db, session: AsyncSession):
@@ -469,29 +474,63 @@ class TestTerminate:
         assert instance.status == InstanceStatus.TERMINATED
 
 
+@pytest.mark.asyncio
+@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True)
+@pytest.mark.usefixtures("test_db")
 class TestCreateInstance:
-    @pytest.mark.
-
-
+    @pytest.mark.parametrize(
+        # requested_blocks = None means `auto` (as many as possible)
+        ["cpus", "gpus", "requested_blocks", "expected_blocks"],
+        [
+            # GPU instances
+            pytest.param(32, 8, 1, 1, id="gpu-instance-no-blocks"),
+            pytest.param(32, 8, 2, 2, id="gpu-instance-four-gpu-per-block"),
+            pytest.param(32, 8, 4, 4, id="gpu-instance-two-gpus-per-block"),
+            pytest.param(32, 8, None, 8, id="gpu-instance-auto-max-gpu"),
+            pytest.param(4, 8, None, 4, id="gpu-instance-auto-max-cpu"),
+            pytest.param(8, 8, None, 8, id="gpu-instance-auto-max-cpu-and-gpu"),
+            # CPU instances
+            pytest.param(32, 0, 1, 1, id="cpu-instance-no-blocks"),
+            pytest.param(32, 0, 2, 2, id="cpu-instance-four-cpu-per-block"),
+            pytest.param(32, 0, 4, 4, id="cpu-instance-two-cpus-per-block"),
+            pytest.param(32, 0, None, 32, id="gpu-instance-auto-max-cpu"),
+        ],
+    )
+    async def test_creates_instance(
+        self,
+        session: AsyncSession,
+        cpus: int,
+        gpus: int,
+        requested_blocks: Optional[int],
+        expected_blocks: int,
+    ):
         project = await create_project(session=session)
         pool = await create_pool(session, project)
-        instance = await create_instance(
-
-
-
+        instance = await create_instance(
+            session,
+            project,
+            pool,
+            status=InstanceStatus.PENDING,
+            total_blocks=requested_blocks,
+            busy_blocks=0,
+        )
+        with patch("dstack._internal.server.services.backends.get_project_backends") as m:
+            backend_mock = Mock()
+            m.return_value = [backend_mock]
+            backend_mock.TYPE = BackendType.AWS
+            gpu = Gpu(name="T4", memory_mib=16384, vendor=gpuhunt.AcceleratorVendor.NVIDIA)
             offer = InstanceOfferWithAvailability(
                 backend=BackendType.AWS,
                 instance=InstanceType(
                     name="instance",
-                    resources=Resources(
+                    resources=Resources(
+                        cpus=cpus, memory_mib=131072, spot=False, gpus=[gpu] * gpus
+                    ),
                 ),
                 region="us",
                 price=1.0,
                 availability=InstanceAvailability.AVAILABLE,
             )
-
-            backend_mock = Mock()
-            backend_mock.TYPE = BackendType.AWS
             backend_mock.compute.return_value.get_offers_cached.return_value = [offer]
             backend_mock.compute.return_value.create_instance.return_value = JobProvisioningData(
                 backend=offer.backend,
@@ -507,8 +546,87 @@ class TestCreateInstance:
                 dockerized=True,
                 backend_data=None,
             )
-
+
             await process_instances()
 
         await session.refresh(instance)
         assert instance.status == InstanceStatus.PROVISIONING
+        assert instance.total_blocks == expected_blocks
+        assert instance.busy_blocks == 0
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True)
+@pytest.mark.usefixtures("test_db", "deploy_instance_mock")
+class TestAddSSHInstance:
+    @pytest.fixture
+    def host_info(self) -> dict:
+        return {
+            "gpu_vendor": "nvidia",
+            "gpu_name": "T4",
+            "gpu_memory": 16384,
+            "gpu_count": 1,
+            "addresses": ["192.168.100.100/24"],
+            "disk_size": 260976517120,
+            "cpus": 32,
+            "memory": 33544130560,
+        }
+
+    @pytest.fixture
+    def deploy_instance_mock(self, monkeypatch: pytest.MonkeyPatch, host_info: dict):
+        mock = Mock(return_value=(HealthStatus(healthy=True, reason="OK"), host_info))
+        monkeypatch.setattr(
+            "dstack._internal.server.background.tasks.process_instances._deploy_instance", mock
+        )
+        return mock
+
+    @pytest.mark.parametrize(
+        # requested_blocks = None means `auto` (as many as possible)
+        ["cpus", "gpus", "requested_blocks", "expected_blocks"],
+        [
+            # GPU instances
+            pytest.param(32, 8, 1, 1, id="gpu-instance-no-blocks"),
+            pytest.param(32, 8, 2, 2, id="gpu-instance-four-gpu-per-block"),
+            pytest.param(32, 8, 4, 4, id="gpu-instance-two-gpus-per-block"),
+            pytest.param(32, 8, None, 8, id="gpu-instance-auto-max-gpu"),
+            pytest.param(4, 8, None, 4, id="gpu-instance-auto-max-cpu"),
+            pytest.param(8, 8, None, 8, id="gpu-instance-auto-max-cpu-and-gpu"),
+            # CPU instances
+            pytest.param(32, 0, 1, 1, id="cpu-instance-no-blocks"),
+            pytest.param(32, 0, 2, 2, id="cpu-instance-four-cpu-per-block"),
+            pytest.param(32, 0, 4, 4, id="cpu-instance-two-cpus-per-block"),
+            pytest.param(32, 0, None, 32, id="gpu-instance-auto-max-cpu"),
+        ],
+    )
+    @pytest.mark.usefixtures("deploy_instance_mock")
+    async def test_adds_ssh_instance(
+        self,
+        session: AsyncSession,
+        host_info: dict,
+        cpus: int,
+        gpus: int,
+        requested_blocks: Optional[int],
+        expected_blocks: int,
+    ):
+        host_info["cpus"] = cpus
+        host_info["gpu_count"] = gpus
+        project = await create_project(session=session)
+        pool = await create_pool(session, project)
+        instance = await create_instance(
+            session,
+            project,
+            pool,
+            status=InstanceStatus.PENDING,
+            created_at=get_current_datetime(),
+            remote_connection_info=get_remote_connection_info(),
+            total_blocks=requested_blocks,
+            busy_blocks=0,
+        )
+        await session.commit()
+
+        await process_instances()
+
+        await session.refresh(instance)
+        assert instance.status == InstanceStatus.IDLE
+        assert instance.total_blocks == expected_blocks
+        assert instance.busy_blocks == 0
tests/_internal/server/background/tasks/test_process_metrics.py

@@ -6,6 +6,7 @@ from freezegun import freeze_time
 from sqlalchemy import select
 from sqlalchemy.ext.asyncio import AsyncSession
 
+from dstack._internal.core.models.instances import InstanceStatus
 from dstack._internal.core.models.runs import JobStatus
 from dstack._internal.core.models.users import GlobalRole, ProjectRole
 from dstack._internal.server import settings
@@ -17,8 +18,10 @@ from dstack._internal.server.models import JobMetricsPoint
 from dstack._internal.server.schemas.runner import GPUMetrics, MetricsResponse
 from dstack._internal.server.services.projects import add_project_member
 from dstack._internal.server.testing.common import (
+    create_instance,
     create_job,
     create_job_metrics_point,
+    create_pool,
     create_project,
     create_repo,
     create_run,
@@ -42,6 +45,13 @@ class TestCollectMetrics:
             session=session,
             project_id=project.id,
         )
+        pool = await create_pool(session=session, project=project)
+        instance = await create_instance(
+            session=session,
+            project=project,
+            pool=pool,
+            status=InstanceStatus.BUSY,
+        )
         run = await create_run(
             session=session,
             project=project,
@@ -53,6 +63,8 @@ class TestCollectMetrics:
             run=run,
             status=JobStatus.RUNNING,
             job_provisioning_data=get_job_provisioning_data(),
+            instance_assigned=True,
+            instance=instance,
         )
         with (
             patch("dstack._internal.server.services.runner.ssh.SSHTunnel") as SSHTunnelMock,