dstack 0.18.40rc1__py3-none-any.whl → 0.18.42__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dstack/_internal/cli/commands/apply.py +8 -5
- dstack/_internal/cli/services/configurators/base.py +4 -2
- dstack/_internal/cli/services/configurators/fleet.py +21 -9
- dstack/_internal/cli/services/configurators/gateway.py +15 -0
- dstack/_internal/cli/services/configurators/run.py +6 -5
- dstack/_internal/cli/services/configurators/volume.py +15 -0
- dstack/_internal/cli/services/repos.py +3 -3
- dstack/_internal/cli/utils/fleet.py +44 -33
- dstack/_internal/cli/utils/run.py +27 -7
- dstack/_internal/cli/utils/volume.py +30 -9
- dstack/_internal/core/backends/aws/compute.py +94 -53
- dstack/_internal/core/backends/aws/resources.py +22 -12
- dstack/_internal/core/backends/azure/compute.py +2 -0
- dstack/_internal/core/backends/base/compute.py +20 -2
- dstack/_internal/core/backends/gcp/compute.py +32 -24
- dstack/_internal/core/backends/gcp/resources.py +0 -15
- dstack/_internal/core/backends/oci/compute.py +10 -5
- dstack/_internal/core/backends/oci/resources.py +23 -26
- dstack/_internal/core/backends/remote/provisioning.py +65 -27
- dstack/_internal/core/backends/runpod/compute.py +1 -0
- dstack/_internal/core/models/backends/azure.py +3 -1
- dstack/_internal/core/models/configurations.py +24 -1
- dstack/_internal/core/models/fleets.py +46 -0
- dstack/_internal/core/models/instances.py +5 -1
- dstack/_internal/core/models/pools.py +4 -1
- dstack/_internal/core/models/profiles.py +10 -4
- dstack/_internal/core/models/runs.py +23 -3
- dstack/_internal/core/models/volumes.py +26 -0
- dstack/_internal/core/services/ssh/attach.py +92 -53
- dstack/_internal/core/services/ssh/tunnel.py +58 -31
- dstack/_internal/proxy/gateway/routers/registry.py +2 -0
- dstack/_internal/proxy/gateway/schemas/registry.py +2 -0
- dstack/_internal/proxy/gateway/services/registry.py +4 -0
- dstack/_internal/proxy/lib/models.py +3 -0
- dstack/_internal/proxy/lib/services/service_connection.py +8 -1
- dstack/_internal/server/background/tasks/process_instances.py +73 -35
- dstack/_internal/server/background/tasks/process_metrics.py +9 -9
- dstack/_internal/server/background/tasks/process_running_jobs.py +77 -26
- dstack/_internal/server/background/tasks/process_runs.py +2 -12
- dstack/_internal/server/background/tasks/process_submitted_jobs.py +121 -49
- dstack/_internal/server/background/tasks/process_terminating_jobs.py +14 -3
- dstack/_internal/server/background/tasks/process_volumes.py +11 -1
- dstack/_internal/server/migrations/versions/1338b788b612_reverse_job_instance_relationship.py +71 -0
- dstack/_internal/server/migrations/versions/1e76fb0dde87_add_jobmodel_inactivity_secs.py +32 -0
- dstack/_internal/server/migrations/versions/51d45659d574_add_instancemodel_blocks_fields.py +43 -0
- dstack/_internal/server/migrations/versions/63c3f19cb184_add_jobterminationreason_inactivity_.py +83 -0
- dstack/_internal/server/migrations/versions/a751ef183f27_move_attachment_data_to_volumes_.py +34 -0
- dstack/_internal/server/models.py +27 -23
- dstack/_internal/server/routers/runs.py +1 -0
- dstack/_internal/server/schemas/runner.py +1 -0
- dstack/_internal/server/services/backends/configurators/azure.py +34 -8
- dstack/_internal/server/services/config.py +9 -0
- dstack/_internal/server/services/fleets.py +32 -3
- dstack/_internal/server/services/gateways/client.py +9 -1
- dstack/_internal/server/services/jobs/__init__.py +217 -45
- dstack/_internal/server/services/jobs/configurators/base.py +47 -2
- dstack/_internal/server/services/offers.py +96 -10
- dstack/_internal/server/services/pools.py +98 -14
- dstack/_internal/server/services/proxy/repo.py +17 -3
- dstack/_internal/server/services/runner/client.py +9 -6
- dstack/_internal/server/services/runner/ssh.py +33 -5
- dstack/_internal/server/services/runs.py +48 -179
- dstack/_internal/server/services/services/__init__.py +9 -1
- dstack/_internal/server/services/volumes.py +68 -9
- dstack/_internal/server/statics/index.html +1 -1
- dstack/_internal/server/statics/{main-11ec5e4a00ea6ec833e3.js → main-2ac66bfcbd2e39830b88.js} +30 -31
- dstack/_internal/server/statics/{main-11ec5e4a00ea6ec833e3.js.map → main-2ac66bfcbd2e39830b88.js.map} +1 -1
- dstack/_internal/server/statics/{main-fc56d1f4af8e57522a1c.css → main-ad5150a441de98cd8987.css} +1 -1
- dstack/_internal/server/testing/common.py +130 -61
- dstack/_internal/utils/common.py +22 -8
- dstack/_internal/utils/env.py +14 -0
- dstack/_internal/utils/ssh.py +1 -1
- dstack/api/server/_fleets.py +25 -1
- dstack/api/server/_runs.py +23 -2
- dstack/api/server/_volumes.py +12 -1
- dstack/version.py +1 -1
- {dstack-0.18.40rc1.dist-info → dstack-0.18.42.dist-info}/METADATA +1 -1
- {dstack-0.18.40rc1.dist-info → dstack-0.18.42.dist-info}/RECORD +104 -93
- tests/_internal/cli/services/configurators/test_profile.py +3 -3
- tests/_internal/core/services/ssh/test_tunnel.py +56 -4
- tests/_internal/proxy/gateway/routers/test_registry.py +30 -7
- tests/_internal/server/background/tasks/test_process_instances.py +138 -20
- tests/_internal/server/background/tasks/test_process_metrics.py +12 -0
- tests/_internal/server/background/tasks/test_process_running_jobs.py +193 -0
- tests/_internal/server/background/tasks/test_process_runs.py +27 -3
- tests/_internal/server/background/tasks/test_process_submitted_jobs.py +53 -6
- tests/_internal/server/background/tasks/test_process_terminating_jobs.py +135 -17
- tests/_internal/server/routers/test_fleets.py +15 -2
- tests/_internal/server/routers/test_pools.py +6 -0
- tests/_internal/server/routers/test_runs.py +27 -0
- tests/_internal/server/routers/test_volumes.py +9 -2
- tests/_internal/server/services/jobs/__init__.py +0 -0
- tests/_internal/server/services/jobs/configurators/__init__.py +0 -0
- tests/_internal/server/services/jobs/configurators/test_base.py +72 -0
- tests/_internal/server/services/runner/test_client.py +22 -3
- tests/_internal/server/services/test_offers.py +167 -0
- tests/_internal/server/services/test_pools.py +109 -1
- tests/_internal/server/services/test_runs.py +5 -41
- tests/_internal/utils/test_common.py +21 -0
- tests/_internal/utils/test_env.py +38 -0
- {dstack-0.18.40rc1.dist-info → dstack-0.18.42.dist-info}/LICENSE.md +0 -0
- {dstack-0.18.40rc1.dist-info → dstack-0.18.42.dist-info}/WHEEL +0 -0
- {dstack-0.18.40rc1.dist-info → dstack-0.18.42.dist-info}/entry_points.txt +0 -0
- {dstack-0.18.40rc1.dist-info → dstack-0.18.42.dist-info}/top_level.txt +0 -0
|
@@ -3,18 +3,19 @@ import json
|
|
|
3
3
|
from typing import Dict, List, Optional
|
|
4
4
|
|
|
5
5
|
from sqlalchemy import delete, select
|
|
6
|
-
from sqlalchemy.orm import selectinload
|
|
6
|
+
from sqlalchemy.orm import joinedload
|
|
7
7
|
|
|
8
8
|
from dstack._internal.core.consts import DSTACK_RUNNER_HTTP_PORT
|
|
9
9
|
from dstack._internal.core.models.runs import JobStatus
|
|
10
10
|
from dstack._internal.server import settings
|
|
11
11
|
from dstack._internal.server.db import get_session_ctx
|
|
12
|
-
from dstack._internal.server.models import JobMetricsPoint, JobModel
|
|
12
|
+
from dstack._internal.server.models import InstanceModel, JobMetricsPoint, JobModel
|
|
13
13
|
from dstack._internal.server.schemas.runner import MetricsResponse
|
|
14
14
|
from dstack._internal.server.services.jobs import get_job_provisioning_data, get_job_runtime_data
|
|
15
|
+
from dstack._internal.server.services.pools import get_instance_ssh_private_keys
|
|
15
16
|
from dstack._internal.server.services.runner import client
|
|
16
17
|
from dstack._internal.server.services.runner.ssh import runner_ssh_tunnel
|
|
17
|
-
from dstack._internal.utils.common import batched, get_current_datetime, run_async
|
|
18
|
+
from dstack._internal.utils.common import batched, get_current_datetime, get_or_error, run_async
|
|
18
19
|
from dstack._internal.utils.logging import get_logger
|
|
19
20
|
|
|
20
21
|
logger = get_logger(__name__)
|
|
@@ -29,14 +30,12 @@ async def collect_metrics():
|
|
|
29
30
|
async with get_session_ctx() as session:
|
|
30
31
|
res = await session.execute(
|
|
31
32
|
select(JobModel)
|
|
32
|
-
.where(
|
|
33
|
-
|
|
34
|
-
)
|
|
35
|
-
.options(selectinload(JobModel.project))
|
|
33
|
+
.where(JobModel.status.in_([JobStatus.RUNNING]))
|
|
34
|
+
.options(joinedload(JobModel.instance).joinedload(InstanceModel.project))
|
|
36
35
|
.order_by(JobModel.last_processed_at.asc())
|
|
37
36
|
.limit(MAX_JOBS_FETCHED)
|
|
38
37
|
)
|
|
39
|
-
job_models = res.scalars().all()
|
|
38
|
+
job_models = res.unique().scalars().all()
|
|
40
39
|
|
|
41
40
|
for batch in batched(job_models, BATCH_SIZE):
|
|
42
41
|
await _collect_jobs_metrics(batch)
|
|
@@ -87,6 +86,7 @@ def _get_recently_collected_metric_cutoff() -> int:
|
|
|
87
86
|
|
|
88
87
|
|
|
89
88
|
async def _collect_job_metrics(job_model: JobModel) -> Optional[JobMetricsPoint]:
|
|
89
|
+
ssh_private_keys = get_instance_ssh_private_keys(get_or_error(job_model.instance))
|
|
90
90
|
jpd = get_job_provisioning_data(job_model)
|
|
91
91
|
jrd = get_job_runtime_data(job_model)
|
|
92
92
|
if jpd is None:
|
|
@@ -94,7 +94,7 @@ async def _collect_job_metrics(job_model: JobModel) -> Optional[JobMetricsPoint]
|
|
|
94
94
|
try:
|
|
95
95
|
res = await run_async(
|
|
96
96
|
_pull_runner_metrics,
|
|
97
|
-
|
|
97
|
+
ssh_private_keys,
|
|
98
98
|
jpd,
|
|
99
99
|
jrd,
|
|
100
100
|
)
|
|
@@ -10,7 +10,12 @@ from dstack._internal.core.consts import DSTACK_RUNNER_HTTP_PORT, DSTACK_SHIM_HT
|
|
|
10
10
|
from dstack._internal.core.errors import GatewayError
|
|
11
11
|
from dstack._internal.core.models.backends.base import BackendType
|
|
12
12
|
from dstack._internal.core.models.common import NetworkMode, RegistryAuth, is_core_model_instance
|
|
13
|
-
from dstack._internal.core.models.
|
|
13
|
+
from dstack._internal.core.models.configurations import DevEnvironmentConfiguration
|
|
14
|
+
from dstack._internal.core.models.instances import (
|
|
15
|
+
InstanceStatus,
|
|
16
|
+
RemoteConnectionInfo,
|
|
17
|
+
SSHConnectionParams,
|
|
18
|
+
)
|
|
14
19
|
from dstack._internal.core.models.repos import RemoteRepoCreds
|
|
15
20
|
from dstack._internal.core.models.runs import (
|
|
16
21
|
ClusterInfo,
|
|
@@ -20,10 +25,12 @@ from dstack._internal.core.models.runs import (
|
|
|
20
25
|
JobStatus,
|
|
21
26
|
JobTerminationReason,
|
|
22
27
|
Run,
|
|
28
|
+
RunSpec,
|
|
23
29
|
)
|
|
24
30
|
from dstack._internal.core.models.volumes import InstanceMountPoint, Volume, VolumeMountPoint
|
|
25
31
|
from dstack._internal.server.db import get_session_ctx
|
|
26
32
|
from dstack._internal.server.models import (
|
|
33
|
+
InstanceModel,
|
|
27
34
|
JobModel,
|
|
28
35
|
ProjectModel,
|
|
29
36
|
RepoModel,
|
|
@@ -34,11 +41,13 @@ from dstack._internal.server.services import logs as logs_services
|
|
|
34
41
|
from dstack._internal.server.services import services
|
|
35
42
|
from dstack._internal.server.services.jobs import (
|
|
36
43
|
find_job,
|
|
44
|
+
get_job_attached_volumes,
|
|
37
45
|
get_job_runtime_data,
|
|
38
46
|
job_model_to_job_submission,
|
|
39
47
|
)
|
|
40
48
|
from dstack._internal.server.services.locking import get_locker
|
|
41
49
|
from dstack._internal.server.services.logging import fmt
|
|
50
|
+
from dstack._internal.server.services.pools import get_instance_ssh_private_keys
|
|
42
51
|
from dstack._internal.server.services.repos import (
|
|
43
52
|
get_code_model,
|
|
44
53
|
get_repo_creds,
|
|
@@ -47,7 +56,6 @@ from dstack._internal.server.services.repos import (
|
|
|
47
56
|
from dstack._internal.server.services.runner import client
|
|
48
57
|
from dstack._internal.server.services.runner.ssh import runner_ssh_tunnel
|
|
49
58
|
from dstack._internal.server.services.runs import (
|
|
50
|
-
get_job_volumes,
|
|
51
59
|
run_model_to_run,
|
|
52
60
|
)
|
|
53
61
|
from dstack._internal.server.services.storage import get_default_storage
|
|
@@ -81,7 +89,7 @@ async def _process_next_running_job():
|
|
|
81
89
|
.limit(1)
|
|
82
90
|
.with_for_update(skip_locked=True)
|
|
83
91
|
)
|
|
84
|
-
job_model = res.scalar()
|
|
92
|
+
job_model = res.unique().scalar()
|
|
85
93
|
if job_model is None:
|
|
86
94
|
return
|
|
87
95
|
lockset.add(job_model.id)
|
|
@@ -99,10 +107,10 @@ async def _process_running_job(session: AsyncSession, job_model: JobModel):
|
|
|
99
107
|
res = await session.execute(
|
|
100
108
|
select(JobModel)
|
|
101
109
|
.where(JobModel.id == job_model.id)
|
|
102
|
-
.options(joinedload(JobModel.instance))
|
|
110
|
+
.options(joinedload(JobModel.instance).joinedload(InstanceModel.project))
|
|
103
111
|
.execution_options(populate_existing=True)
|
|
104
112
|
)
|
|
105
|
-
job_model = res.scalar_one()
|
|
113
|
+
job_model = res.unique().scalar_one()
|
|
106
114
|
res = await session.execute(
|
|
107
115
|
select(RunModel)
|
|
108
116
|
.where(RunModel.id == job_model.run_id)
|
|
@@ -142,25 +150,17 @@ async def _process_running_job(session: AsyncSession, job_model: JobModel):
|
|
|
142
150
|
job_provisioning_data=job_provisioning_data,
|
|
143
151
|
)
|
|
144
152
|
|
|
145
|
-
volumes = await get_job_volumes(
|
|
153
|
+
volumes = await get_job_attached_volumes(
|
|
146
154
|
session=session,
|
|
147
155
|
project=project,
|
|
148
156
|
run_spec=run.run_spec,
|
|
157
|
+
job_num=job.job_spec.job_num,
|
|
149
158
|
job_provisioning_data=job_provisioning_data,
|
|
150
159
|
)
|
|
151
160
|
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
if (
|
|
156
|
-
job_model.instance is not None
|
|
157
|
-
and job_model.instance.remote_connection_info is not None
|
|
158
|
-
and job_provisioning_data.dockerized
|
|
159
|
-
):
|
|
160
|
-
remote_conn_info: RemoteConnectionInfo = RemoteConnectionInfo.__response__.parse_raw(
|
|
161
|
-
job_model.instance.remote_connection_info
|
|
162
|
-
)
|
|
163
|
-
server_ssh_private_key = remote_conn_info.ssh_keys[0].private
|
|
161
|
+
server_ssh_private_keys = get_instance_ssh_private_keys(
|
|
162
|
+
common_utils.get_or_error(job_model.instance)
|
|
163
|
+
)
|
|
164
164
|
|
|
165
165
|
secrets = {} # TODO secrets
|
|
166
166
|
|
|
@@ -200,11 +200,12 @@ async def _process_running_job(session: AsyncSession, job_model: JobModel):
|
|
|
200
200
|
user_ssh_key = ""
|
|
201
201
|
success = await common_utils.run_async(
|
|
202
202
|
_process_provisioning_with_shim,
|
|
203
|
-
|
|
203
|
+
server_ssh_private_keys,
|
|
204
204
|
job_provisioning_data,
|
|
205
205
|
None,
|
|
206
206
|
run,
|
|
207
207
|
job_model,
|
|
208
|
+
job_provisioning_data,
|
|
208
209
|
volumes,
|
|
209
210
|
secrets,
|
|
210
211
|
job.job_spec.registry_auth,
|
|
@@ -226,7 +227,7 @@ async def _process_running_job(session: AsyncSession, job_model: JobModel):
|
|
|
226
227
|
)
|
|
227
228
|
success = await common_utils.run_async(
|
|
228
229
|
_submit_job_to_runner,
|
|
229
|
-
|
|
230
|
+
server_ssh_private_keys,
|
|
230
231
|
job_provisioning_data,
|
|
231
232
|
None,
|
|
232
233
|
run,
|
|
@@ -269,7 +270,7 @@ async def _process_running_job(session: AsyncSession, job_model: JobModel):
|
|
|
269
270
|
)
|
|
270
271
|
success = await common_utils.run_async(
|
|
271
272
|
_process_pulling_with_shim,
|
|
272
|
-
|
|
273
|
+
server_ssh_private_keys,
|
|
273
274
|
job_provisioning_data,
|
|
274
275
|
None,
|
|
275
276
|
run,
|
|
@@ -279,14 +280,14 @@ async def _process_running_job(session: AsyncSession, job_model: JobModel):
|
|
|
279
280
|
code,
|
|
280
281
|
secrets,
|
|
281
282
|
repo_creds,
|
|
282
|
-
|
|
283
|
+
server_ssh_private_keys,
|
|
283
284
|
job_provisioning_data,
|
|
284
285
|
)
|
|
285
286
|
elif initial_status == JobStatus.RUNNING:
|
|
286
287
|
logger.debug("%s: process running job, age=%s", fmt(job_model), job_submission.age)
|
|
287
288
|
success = await common_utils.run_async(
|
|
288
289
|
_process_running,
|
|
289
|
-
|
|
290
|
+
server_ssh_private_keys,
|
|
290
291
|
job_provisioning_data,
|
|
291
292
|
job_submission.job_runtime_data,
|
|
292
293
|
run_model,
|
|
@@ -312,8 +313,24 @@ async def _process_running_job(session: AsyncSession, job_model: JobModel):
|
|
|
312
313
|
and job_model.job_num == 0 # gateway connects only to the first node
|
|
313
314
|
and run.run_spec.configuration.type == "service"
|
|
314
315
|
):
|
|
316
|
+
ssh_head_proxy: Optional[SSHConnectionParams] = None
|
|
317
|
+
ssh_head_proxy_private_key: Optional[str] = None
|
|
318
|
+
instance = common_utils.get_or_error(job_model.instance)
|
|
319
|
+
if instance.remote_connection_info is not None:
|
|
320
|
+
rci = RemoteConnectionInfo.__response__.parse_raw(instance.remote_connection_info)
|
|
321
|
+
if rci.ssh_proxy is not None:
|
|
322
|
+
ssh_head_proxy = rci.ssh_proxy
|
|
323
|
+
ssh_head_proxy_keys = common_utils.get_or_error(rci.ssh_proxy_keys)
|
|
324
|
+
ssh_head_proxy_private_key = ssh_head_proxy_keys[0].private
|
|
315
325
|
try:
|
|
316
|
-
await services.register_replica(
|
|
326
|
+
await services.register_replica(
|
|
327
|
+
session,
|
|
328
|
+
run_model.gateway_id,
|
|
329
|
+
run,
|
|
330
|
+
job_model,
|
|
331
|
+
ssh_head_proxy,
|
|
332
|
+
ssh_head_proxy_private_key,
|
|
333
|
+
)
|
|
317
334
|
except GatewayError as e:
|
|
318
335
|
logger.warning(
|
|
319
336
|
"%s: failed to register service replica: %s, age=%s",
|
|
@@ -360,6 +377,7 @@ def _process_provisioning_with_shim(
|
|
|
360
377
|
ports: Dict[int, int],
|
|
361
378
|
run: Run,
|
|
362
379
|
job_model: JobModel,
|
|
380
|
+
job_provisioning_data: JobProvisioningData,
|
|
363
381
|
volumes: List[Volume],
|
|
364
382
|
secrets: Dict[str, str],
|
|
365
383
|
registry_auth: Optional[RegistryAuth],
|
|
@@ -443,6 +461,7 @@ def _process_provisioning_with_shim(
|
|
|
443
461
|
host_ssh_user=ssh_user,
|
|
444
462
|
host_ssh_keys=[ssh_key] if ssh_key else [],
|
|
445
463
|
container_ssh_keys=public_keys,
|
|
464
|
+
instance_id=job_provisioning_data.instance_id,
|
|
446
465
|
)
|
|
447
466
|
else:
|
|
448
467
|
submitted = shim_client.submit(
|
|
@@ -459,6 +478,7 @@ def _process_provisioning_with_shim(
|
|
|
459
478
|
mounts=volume_mounts,
|
|
460
479
|
volumes=volumes,
|
|
461
480
|
instance_mounts=instance_mounts,
|
|
481
|
+
instance_id=job_provisioning_data.instance_id,
|
|
462
482
|
)
|
|
463
483
|
if not submitted:
|
|
464
484
|
# This can happen when we lost connection to the runner (e.g., network issues), marked
|
|
@@ -490,7 +510,7 @@ def _process_pulling_with_shim(
|
|
|
490
510
|
code: bytes,
|
|
491
511
|
secrets: Dict[str, str],
|
|
492
512
|
repo_credentials: Optional[RemoteRepoCreds],
|
|
493
|
-
|
|
513
|
+
server_ssh_private_keys: tuple[str, Optional[str]],
|
|
494
514
|
job_provisioning_data: JobProvisioningData,
|
|
495
515
|
) -> bool:
|
|
496
516
|
"""
|
|
@@ -555,7 +575,7 @@ def _process_pulling_with_shim(
|
|
|
555
575
|
return True
|
|
556
576
|
|
|
557
577
|
return _submit_job_to_runner(
|
|
558
|
-
|
|
578
|
+
server_ssh_private_keys,
|
|
559
579
|
job_provisioning_data,
|
|
560
580
|
job_runtime_data,
|
|
561
581
|
run=run,
|
|
@@ -597,6 +617,7 @@ def _process_running(
|
|
|
597
617
|
runner_logs=resp.runner_logs,
|
|
598
618
|
job_logs=resp.job_logs,
|
|
599
619
|
)
|
|
620
|
+
previous_status = job_model.status
|
|
600
621
|
if len(resp.job_states) > 0:
|
|
601
622
|
latest_state_event = resp.job_states[-1]
|
|
602
623
|
latest_status = latest_state_event.state
|
|
@@ -612,10 +633,40 @@ def _process_running(
|
|
|
612
633
|
)
|
|
613
634
|
if latest_state_event.termination_message:
|
|
614
635
|
job_model.termination_reason_message = latest_state_event.termination_message
|
|
636
|
+
else:
|
|
637
|
+
_terminate_if_inactivity_duration_exceeded(run_model, job_model, resp.no_connections_secs)
|
|
638
|
+
if job_model.status != previous_status:
|
|
615
639
|
logger.info("%s: now is %s", fmt(job_model), job_model.status.name)
|
|
616
640
|
return True
|
|
617
641
|
|
|
618
642
|
|
|
643
|
+
def _terminate_if_inactivity_duration_exceeded(
|
|
644
|
+
run_model: RunModel, job_model: JobModel, no_connections_secs: Optional[int]
|
|
645
|
+
) -> None:
|
|
646
|
+
conf = RunSpec.__response__.parse_raw(run_model.run_spec).configuration
|
|
647
|
+
if is_core_model_instance(conf, DevEnvironmentConfiguration) and isinstance(
|
|
648
|
+
conf.inactivity_duration, int
|
|
649
|
+
):
|
|
650
|
+
logger.debug("%s: no SSH connections for %s seconds", fmt(job_model), no_connections_secs)
|
|
651
|
+
job_model.inactivity_secs = no_connections_secs
|
|
652
|
+
if no_connections_secs is None:
|
|
653
|
+
# TODO(0.19 or earlier): make no_connections_secs required
|
|
654
|
+
job_model.status = JobStatus.TERMINATING
|
|
655
|
+
job_model.termination_reason = JobTerminationReason.INTERRUPTED_BY_NO_CAPACITY
|
|
656
|
+
job_model.termination_reason_message = (
|
|
657
|
+
"The selected instance was created before dstack 0.18.41"
|
|
658
|
+
" and does not support inactivity_duration"
|
|
659
|
+
)
|
|
660
|
+
elif no_connections_secs >= conf.inactivity_duration:
|
|
661
|
+
job_model.status = JobStatus.TERMINATING
|
|
662
|
+
# TODO(0.19 or earlier): set JobTerminationReason.INACTIVITY_DURATION_EXCEEDED
|
|
663
|
+
job_model.termination_reason = JobTerminationReason.TERMINATED_BY_SERVER
|
|
664
|
+
job_model.termination_reason_message = (
|
|
665
|
+
f"The job was inactive for {no_connections_secs} seconds,"
|
|
666
|
+
f" exceeding the inactivity_duration of {conf.inactivity_duration} seconds"
|
|
667
|
+
)
|
|
668
|
+
|
|
669
|
+
|
|
619
670
|
def _get_cluster_info(
|
|
620
671
|
jobs: List[Job],
|
|
621
672
|
replica_num: int,
|
|
@@ -230,7 +230,8 @@ async def _process_active_run(session: AsyncSession, run_model: RunModel):
|
|
|
230
230
|
# the job is submitted
|
|
231
231
|
replica_statuses.add(RunStatus.SUBMITTED)
|
|
232
232
|
elif job_model.status == JobStatus.FAILED or (
|
|
233
|
-
job_model.status == JobStatus.TERMINATING
|
|
233
|
+
job_model.status
|
|
234
|
+
in [JobStatus.TERMINATING, JobStatus.TERMINATED, JobStatus.ABORTED]
|
|
234
235
|
and job_model.termination_reason
|
|
235
236
|
not in {JobTerminationReason.DONE_BY_RUNNER, JobTerminationReason.SCALED_DOWN}
|
|
236
237
|
):
|
|
@@ -244,17 +245,6 @@ async def _process_active_run(session: AsyncSession, run_model: RunModel):
|
|
|
244
245
|
run_termination_reasons.add(RunTerminationReason.RETRY_LIMIT_EXCEEDED)
|
|
245
246
|
else:
|
|
246
247
|
replica_needs_retry = True
|
|
247
|
-
elif job_model.status in {
|
|
248
|
-
JobStatus.TERMINATING,
|
|
249
|
-
JobStatus.TERMINATED,
|
|
250
|
-
JobStatus.ABORTED,
|
|
251
|
-
}:
|
|
252
|
-
# FIXME: This code does not expect JobStatus.TERMINATED status,
|
|
253
|
-
# so if a job transitions from RUNNING to TERMINATED,
|
|
254
|
-
# the run will transition to PENDING instead of TERMINATING.
|
|
255
|
-
# This may not be observed because process_runs is invoked more frequently
|
|
256
|
-
# than process_terminating_jobs and because most jobs usually transition to FAILED.
|
|
257
|
-
pass # unexpected, but let's ignore it
|
|
258
248
|
else:
|
|
259
249
|
raise ValueError(f"Unexpected job status {job_model.status}")
|
|
260
250
|
|