dstack 0.19.34__py3-none-any.whl → 0.19.35__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dstack might be problematic.
- dstack/_internal/cli/services/configurators/run.py +1 -1
- dstack/_internal/core/backends/base/compute.py +20 -1
- dstack/_internal/core/backends/base/models.py +10 -0
- dstack/_internal/core/backends/base/offers.py +1 -0
- dstack/_internal/core/backends/features.py +5 -0
- dstack/_internal/core/backends/nebius/compute.py +28 -16
- dstack/_internal/core/backends/nebius/configurator.py +1 -1
- dstack/_internal/core/backends/nebius/models.py +4 -0
- dstack/_internal/core/backends/nebius/resources.py +41 -20
- dstack/_internal/core/backends/runpod/api_client.py +245 -59
- dstack/_internal/core/backends/runpod/compute.py +157 -13
- dstack/_internal/core/models/compute_groups.py +39 -0
- dstack/_internal/core/models/fleets.py +6 -1
- dstack/_internal/core/models/profiles.py +3 -1
- dstack/_internal/core/models/runs.py +3 -0
- dstack/_internal/server/app.py +14 -2
- dstack/_internal/server/background/__init__.py +7 -0
- dstack/_internal/server/background/tasks/process_compute_groups.py +164 -0
- dstack/_internal/server/background/tasks/process_instances.py +81 -49
- dstack/_internal/server/background/tasks/process_submitted_jobs.py +179 -84
- dstack/_internal/server/migrations/env.py +20 -2
- dstack/_internal/server/migrations/versions/7d1ec2b920ac_add_computegroupmodel.py +93 -0
- dstack/_internal/server/models.py +39 -0
- dstack/_internal/server/routers/runs.py +15 -6
- dstack/_internal/server/services/compute_groups.py +22 -0
- dstack/_internal/server/services/fleets.py +1 -0
- dstack/_internal/server/services/jobs/__init__.py +13 -0
- dstack/_internal/server/services/jobs/configurators/base.py +3 -2
- dstack/_internal/server/services/requirements/combine.py +1 -0
- dstack/_internal/server/services/runs.py +17 -3
- dstack/_internal/server/testing/common.py +51 -0
- dstack/_internal/server/utils/routers.py +18 -20
- dstack/_internal/settings.py +4 -1
- dstack/_internal/utils/version.py +22 -0
- dstack/version.py +1 -1
- {dstack-0.19.34.dist-info → dstack-0.19.35.dist-info}/METADATA +3 -3
- {dstack-0.19.34.dist-info → dstack-0.19.35.dist-info}/RECORD +40 -36
- dstack/_internal/core/backends/nebius/fabrics.py +0 -49
- {dstack-0.19.34.dist-info → dstack-0.19.35.dist-info}/WHEEL +0 -0
- {dstack-0.19.34.dist-info → dstack-0.19.35.dist-info}/entry_points.txt +0 -0
- {dstack-0.19.34.dist-info → dstack-0.19.35.dist-info}/licenses/LICENSE.md +0 -0
dstack/_internal/core/models/compute_groups.py
ADDED
@@ -0,0 +1,39 @@
+import enum
+import uuid
+from datetime import datetime
+from typing import List, Optional
+
+from dstack._internal.core.models.backends.base import BackendType
+from dstack._internal.core.models.common import CoreModel
+from dstack._internal.core.models.runs import JobProvisioningData
+
+
+class ComputeGroupStatus(str, enum.Enum):
+    RUNNING = "running"
+    TERMINATED = "terminated"
+
+
+class ComputeGroupProvisioningData(CoreModel):
+    compute_group_id: str
+    compute_group_name: str
+    backend: BackendType
+    # In case backend provisions instance in another backend,
+    # it may set that backend as base_backend.
+    base_backend: Optional[BackendType] = None
+    region: str
+    job_provisioning_datas: List[JobProvisioningData]
+    backend_data: Optional[str] = None  # backend-specific data in json
+
+
+class ComputeGroup(CoreModel):
+    """
+    Compute group is a group of instances managed as a single unit via the provider API,
+    i.e. instances are not created/deleted one-by-one but all at once.
+    """
+
+    id: uuid.UUID
+    name: str
+    project_name: str
+    created_at: datetime
+    status: ComputeGroupStatus
+    provisioning_data: ComputeGroupProvisioningData
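For orientation, here is a minimal sketch of how the new models might be constructed. It is not part of the release: the field values are invented, the backend choice is arbitrary, and CoreModel is assumed to behave like a pydantic-style model as elsewhere in dstack.

import uuid
from datetime import datetime, timezone

from dstack._internal.core.models.backends.base import BackendType
from dstack._internal.core.models.compute_groups import (
    ComputeGroup,
    ComputeGroupProvisioningData,
    ComputeGroupStatus,
)

# Purely illustrative values; a real group is created by the server, not by hand.
group = ComputeGroup(
    id=uuid.uuid4(),
    name="my-cluster",
    project_name="main",
    created_at=datetime.now(timezone.utc),
    status=ComputeGroupStatus.RUNNING,
    provisioning_data=ComputeGroupProvisioningData(
        compute_group_id="cg-123",
        compute_group_name="my-cluster",
        backend=BackendType.RUNPOD,
        region="us-east-1",
        job_provisioning_datas=[],  # one entry per instance in the group
    ),
)
print(group.json())  # CoreModel is assumed to expose pydantic-style serialization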
dstack/_internal/core/models/fleets.py
@@ -309,7 +309,12 @@ class InstanceGroupParams(CoreModel):
     idle_duration: Annotated[
         Optional[int],
         Field(
-            description=
+            description=(
+                "Time to wait before terminating idle instances."
+                " Instances are not terminated if the fleet is already at `nodes.min`."
+                " Defaults to `5m` for runs and `3d` for fleets."
+                " Use `off` for unlimited duration"
+            )
         ),
     ] = None
 
dstack/_internal/core/models/profiles.py
@@ -341,7 +341,9 @@ class ProfileParams(CoreModel):
         Field(
             description=(
                 "Time to wait before terminating idle instances."
-                "
+                " Instances are not terminated if the fleet is already at `nodes.min`."
+                " Defaults to `5m` for runs and `3d` for fleets."
+                " Use `off` for unlimited duration"
             )
         ),
     ] = None
dstack/_internal/core/models/runs.py
@@ -207,6 +207,9 @@ class Requirements(CoreModel):
     max_price: Optional[float] = None
     spot: Optional[bool] = None
     reservation: Optional[str] = None
+    # Backends can use `multinode` to filter out offers if
+    # some offers support multinode and some do not.
+    multinode: Optional[bool] = None
 
     def pretty_format(self, resources_only: bool = False):
         res = self.resources.pretty_format()
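The new `multinode` flag lets a backend drop offers that cannot participate in multi-node (cluster) runs when a mix of capable and incapable offers exists. As a rough illustration, a backend-side filter could look like the sketch below; the helper name and the capability set are hypothetical and not taken from this release.

from typing import List, Set

from dstack._internal.core.models.instances import InstanceOfferWithAvailability
from dstack._internal.core.models.runs import Requirements


def filter_offers_for_multinode(  # hypothetical helper, for illustration only
    offers: List[InstanceOfferWithAvailability],
    requirements: Requirements,
    multinode_capable_types: Set[str],
) -> List[InstanceOfferWithAvailability]:
    # If the run does not require multinode placement, keep every offer.
    if not requirements.multinode:
        return offers
    # Otherwise keep only offers whose instance type can join a cluster.
    return [o for o in offers if o.instance.name in multinode_capable_types]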
dstack/_internal/server/app.py
CHANGED
@@ -5,7 +5,7 @@ import time
 from concurrent.futures import ThreadPoolExecutor
 from contextlib import asynccontextmanager
 from pathlib import Path
-from typing import Awaitable, Callable, List
+from typing import Awaitable, Callable, List, Optional
 
 import sentry_sdk
 from fastapi import FastAPI, Request, Response, status
@@ -62,6 +62,7 @@ from dstack._internal.server.utils.routers import (
     CustomORJSONResponse,
     check_client_server_compatibility,
     error_detail,
+    get_client_version,
     get_server_client_error_details,
 )
 from dstack._internal.settings import DSTACK_VERSION
@@ -319,8 +320,19 @@ def register_routes(app: FastAPI, ui: bool = True):
             or request.url.path in _NO_API_VERSION_CHECK_ROUTES
         ):
             return await call_next(request)
+        try:
+            client_version = get_client_version(request)
+        except ValueError as e:
+            return CustomORJSONResponse(
+                status_code=status.HTTP_400_BAD_REQUEST,
+                content={"detail": [error_detail(str(e))]},
+            )
+        client_release: Optional[tuple[int, ...]] = None
+        if client_version is not None:
+            client_release = client_version.release
+        request.state.client_release = client_release
         response = check_client_server_compatibility(
-            client_version=
+            client_version=client_version,
             server_version=DSTACK_VERSION,
         )
         if response is not None:
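The middleware now parses the client's version once, returns 400 on a malformed version header, and stores the parsed release tuple on request.state.client_release for downstream use. A hypothetical handler-side check (not part of the diff) could gate version-dependent behavior on that attribute:

from typing import Optional, Tuple

from fastapi import Request


def client_supports_compute_groups(request: Request) -> bool:
    # The middleware above stores the parsed release, e.g. (0, 19, 35), or None.
    client_release: Optional[Tuple[int, ...]] = getattr(
        request.state, "client_release", None
    )
    if client_release is None:
        # No version header (e.g. browser or third-party clients): assume latest behavior.
        return True
    return client_release >= (0, 19, 35)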
dstack/_internal/server/background/__init__.py
@@ -2,6 +2,7 @@ from apscheduler.schedulers.asyncio import AsyncIOScheduler
 from apscheduler.triggers.interval import IntervalTrigger
 
 from dstack._internal.server import settings
+from dstack._internal.server.background.tasks.process_compute_groups import process_compute_groups
 from dstack._internal.server.background.tasks.process_fleets import process_fleets
 from dstack._internal.server.background.tasks.process_gateways import (
     process_gateways,
@@ -122,5 +123,11 @@ def start_background_tasks() -> AsyncIOScheduler:
         kwargs={"batch_size": 5},
         max_instances=2 if replica == 0 else 1,
     )
+    _scheduler.add_job(
+        process_compute_groups,
+        IntervalTrigger(seconds=15, jitter=2),
+        kwargs={"batch_size": 1},
+        max_instances=2 if replica == 0 else 1,
+    )
     _scheduler.start()
     return _scheduler
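The new task is scheduled with APScheduler every 15 seconds (with 2 seconds of jitter), like the other background tasks. The self-contained sketch below shows the same registration pattern with a stub coroutine standing in for the real task; it is illustrative only.

import asyncio

from apscheduler.schedulers.asyncio import AsyncIOScheduler
from apscheduler.triggers.interval import IntervalTrigger


async def process_compute_groups(batch_size: int = 1):
    # Stub standing in for the real background task.
    print(f"processing up to {batch_size} compute group(s)")


async def main():
    scheduler = AsyncIOScheduler()
    scheduler.add_job(
        process_compute_groups,
        IntervalTrigger(seconds=15, jitter=2),  # same cadence as in the diff above
        kwargs={"batch_size": 1},
        max_instances=1,  # never run overlapping executions of this job
    )
    scheduler.start()  # must be called while an event loop is running
    await asyncio.sleep(60)  # let the job fire a few times


asyncio.run(main())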
dstack/_internal/server/background/tasks/process_compute_groups.py
ADDED
@@ -0,0 +1,164 @@
+import asyncio
+import datetime
+from datetime import timedelta
+
+from sqlalchemy import select
+from sqlalchemy.ext.asyncio import AsyncSession
+from sqlalchemy.orm import joinedload, load_only
+
+from dstack._internal.core.backends.base.compute import ComputeWithGroupProvisioningSupport
+from dstack._internal.core.errors import BackendError
+from dstack._internal.core.models.compute_groups import ComputeGroupStatus
+from dstack._internal.core.models.instances import InstanceStatus
+from dstack._internal.server.db import get_db, get_session_ctx
+from dstack._internal.server.models import (
+    ComputeGroupModel,
+    ProjectModel,
+)
+from dstack._internal.server.services import backends as backends_services
+from dstack._internal.server.services.compute_groups import compute_group_model_to_compute_group
+from dstack._internal.server.services.locking import get_locker
+from dstack._internal.server.utils import sentry_utils
+from dstack._internal.utils.common import get_current_datetime, run_async
+from dstack._internal.utils.logging import get_logger
+
+logger = get_logger(__name__)
+
+
+MIN_PROCESSING_INTERVAL = timedelta(seconds=30)
+
+TERMINATION_RETRY_TIMEOUT = timedelta(seconds=60)
+TERMINATION_RETRY_MAX_DURATION = timedelta(minutes=15)
+
+
+async def process_compute_groups(batch_size: int = 1):
+    tasks = []
+    for _ in range(batch_size):
+        tasks.append(_process_next_compute_group())
+    await asyncio.gather(*tasks)
+
+
+@sentry_utils.instrument_background_task
+async def _process_next_compute_group():
+    lock, lockset = get_locker(get_db().dialect_name).get_lockset(ComputeGroupModel.__tablename__)
+    async with get_session_ctx() as session:
+        async with lock:
+            res = await session.execute(
+                select(ComputeGroupModel)
+                .where(
+                    ComputeGroupModel.deleted == False,
+                    ComputeGroupModel.id.not_in(lockset),
+                    ComputeGroupModel.last_processed_at
+                    < get_current_datetime() - MIN_PROCESSING_INTERVAL,
+                )
+                .options(load_only(ComputeGroupModel.id))
+                .order_by(ComputeGroupModel.last_processed_at.asc())
+                .limit(1)
+                .with_for_update(skip_locked=True, key_share=True)
+            )
+            compute_group_model = res.scalar()
+            if compute_group_model is None:
+                return
+            compute_group_model_id = compute_group_model.id
+            lockset.add(compute_group_model_id)
+        try:
+            await _process_compute_group(
+                session=session,
+                compute_group_model=compute_group_model,
+            )
+        finally:
+            lockset.difference_update([compute_group_model_id])
+
+
+async def _process_compute_group(session: AsyncSession, compute_group_model: ComputeGroupModel):
+    # Refetch to load related attributes.
+    res = await session.execute(
+        select(ComputeGroupModel)
+        .where(ComputeGroupModel.id == compute_group_model.id)
+        .options(
+            joinedload(ComputeGroupModel.instances),
+            joinedload(ComputeGroupModel.project).joinedload(ProjectModel.backends),
+        )
+        .execution_options(populate_existing=True)
+    )
+    compute_group_model = res.unique().scalar_one()
+    if all(i.status == InstanceStatus.TERMINATING for i in compute_group_model.instances):
+        await _terminate_compute_group(compute_group_model)
+    compute_group_model.last_processed_at = get_current_datetime()
+    await session.commit()
+
+
+async def _terminate_compute_group(compute_group_model: ComputeGroupModel) -> None:
+    if (
+        compute_group_model.last_termination_retry_at is not None
+        and _next_termination_retry_at(compute_group_model) > get_current_datetime()
+    ):
+        return
+    compute_group = compute_group_model_to_compute_group(compute_group_model)
+    cgpd = compute_group.provisioning_data
+    backend = await backends_services.get_project_backend_by_type(
+        project=compute_group_model.project,
+        backend_type=cgpd.backend,
+    )
+    if backend is None:
+        logger.error(
+            "Failed to terminate compute group %s. Backend %s not available.",
+            compute_group.name,
+            cgpd.backend,
+        )
+    else:
+        logger.debug("Terminating compute group %s", compute_group.name)
+        compute = backend.compute()
+        assert isinstance(compute, ComputeWithGroupProvisioningSupport)
+        try:
+            await run_async(
+                compute.terminate_compute_group,
+                compute_group,
+            )
+        except Exception as e:
+            if compute_group_model.first_termination_retry_at is None:
+                compute_group_model.first_termination_retry_at = get_current_datetime()
+            compute_group_model.last_termination_retry_at = get_current_datetime()
+            if _next_termination_retry_at(compute_group_model) < _get_termination_deadline(
+                compute_group_model
+            ):
+                logger.warning(
+                    "Failed to terminate compute group %s. Will retry. Error: %r",
+                    compute_group.name,
+                    e,
+                    exc_info=not isinstance(e, BackendError),
+                )
+                return
+            logger.error(
+                "Failed all attempts to terminate compute group %s."
+                " Please terminate it manually to avoid unexpected charges."
+                " Error: %r",
+                compute_group.name,
+                e,
+                exc_info=not isinstance(e, BackendError),
+            )
+
+    compute_group_model.deleted = True
+    compute_group_model.deleted_at = get_current_datetime()
+    compute_group_model.status = ComputeGroupStatus.TERMINATED
+    # Terminating instances belonging to a compute group are locked implicitly
+    # by locking the compute group.
+    for instance_model in compute_group_model.instances:
+        instance_model.deleted = True
+        instance_model.deleted_at = get_current_datetime()
+        instance_model.finished_at = get_current_datetime()
+        instance_model.status = InstanceStatus.TERMINATED
+    logger.info(
+        "Terminated compute group %s",
+        compute_group.name,
+    )
+
+
+def _next_termination_retry_at(compute_group_model: ComputeGroupModel) -> datetime.datetime:
+    assert compute_group_model.last_termination_retry_at is not None
+    return compute_group_model.last_termination_retry_at + TERMINATION_RETRY_TIMEOUT
+
+
+def _get_termination_deadline(compute_group_model: ComputeGroupModel) -> datetime.datetime:
+    assert compute_group_model.first_termination_retry_at is not None
+    return compute_group_model.first_termination_retry_at + TERMINATION_RETRY_MAX_DURATION
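Termination failures are retried roughly every TERMINATION_RETRY_TIMEOUT (60 seconds) until TERMINATION_RETRY_MAX_DURATION (15 minutes) after the first failed attempt, after which the group is dropped with an error asking the user to clean up manually. A standalone sketch of the same retry windowing, with made-up timestamps:

from datetime import datetime, timedelta, timezone

TERMINATION_RETRY_TIMEOUT = timedelta(seconds=60)
TERMINATION_RETRY_MAX_DURATION = timedelta(minutes=15)


def should_retry_termination(
    first_retry_at: datetime, last_retry_at: datetime, now: datetime
) -> bool:
    # Wait at least TERMINATION_RETRY_TIMEOUT between attempts...
    next_retry_at = last_retry_at + TERMINATION_RETRY_TIMEOUT
    # ...and stop retrying once the next attempt would fall past the deadline.
    deadline = first_retry_at + TERMINATION_RETRY_MAX_DURATION
    return now >= next_retry_at and next_retry_at < deadline


first = datetime(2025, 1, 1, 12, 0, tzinfo=timezone.utc)
print(should_retry_termination(first, first, first + timedelta(seconds=90)))  # True
print(
    should_retry_termination(
        first, first + timedelta(minutes=14, seconds=30), first + timedelta(minutes=16)
    )
)  # False: the next attempt would fall past the 15-minute deadline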
dstack/_internal/server/background/tasks/process_instances.py
@@ -8,7 +8,7 @@ import requests
 from paramiko.pkey import PKey
 from paramiko.ssh_exception import PasswordRequiredException
 from pydantic import ValidationError
-from sqlalchemy import delete, func, select
+from sqlalchemy import and_, delete, func, not_, select
 from sqlalchemy.ext.asyncio import AsyncSession
 from sqlalchemy.orm import joinedload
 
@@ -57,7 +57,6 @@ from dstack._internal.core.models.profiles import (
 )
 from dstack._internal.core.models.runs import (
     JobProvisioningData,
-    Retry,
 )
 from dstack._internal.server import settings as server_settings
 from dstack._internal.server.background.tasks.common import get_provisioning_timeout
@@ -167,6 +166,14 @@ async def _process_next_instance():
                     InstanceStatus.TERMINATING,
                 ]
             ),
+            # Terminating instances belonging to a compute group
+            # are handled by process_compute_groups.
+            not_(
+                and_(
+                    InstanceModel.status == InstanceStatus.TERMINATING,
+                    InstanceModel.compute_group_id.is_not(None),
+                )
+            ),
             InstanceModel.id.not_in(lockset),
             InstanceModel.last_processed_at
             < get_current_datetime() - MIN_PROCESSING_INTERVAL,
@@ -189,12 +196,12 @@ async def _process_next_instance():
 
 
 async def _process_instance(session: AsyncSession, instance: InstanceModel):
+    # Refetch to load related attributes.
+    # Load related attributes only for statuses that always need them.
     if instance.status in (
         InstanceStatus.PENDING,
         InstanceStatus.TERMINATING,
     ):
-        # Refetch to load related attributes.
-        # Load related attributes only for statuses that always need them.
         res = await session.execute(
             select(InstanceModel)
             .where(InstanceModel.id == instance.id)
@@ -204,6 +211,16 @@ async def _process_instance(session: AsyncSession, instance: InstanceModel):
             .execution_options(populate_existing=True)
         )
         instance = res.unique().scalar_one()
+    elif instance.status == InstanceStatus.IDLE:
+        res = await session.execute(
+            select(InstanceModel)
+            .where(InstanceModel.id == instance.id)
+            .options(joinedload(InstanceModel.project))
+            .options(joinedload(InstanceModel.jobs).load_only(JobModel.id, JobModel.status))
+            .options(joinedload(InstanceModel.fleet).joinedload(FleetModel.instances))
+            .execution_options(populate_existing=True)
+        )
+        instance = res.unique().scalar_one()
 
     if instance.status == InstanceStatus.PENDING:
         if instance.remote_connection_info is not None:
@@ -235,6 +252,14 @@ def _check_and_mark_terminating_if_idle_duration_expired(instance: InstanceModel
         and not instance.jobs
     ):
         return False
+    if instance.fleet is not None and not _can_terminate_fleet_instances_on_idle_duration(
+        instance.fleet
+    ):
+        logger.debug(
+            "Skipping instance %s termination on idle duration. Fleet is already at `nodes.min`.",
+            instance.name,
+        )
+        return False
     idle_duration = _get_instance_idle_duration(instance)
     idle_seconds = instance.termination_idle_time
     delta = datetime.timedelta(seconds=idle_seconds)
@@ -254,6 +279,20 @@ def _check_and_mark_terminating_if_idle_duration_expired(instance: InstanceModel
         return False
 
 
+def _can_terminate_fleet_instances_on_idle_duration(fleet_model: FleetModel) -> bool:
+    # Do not terminate instances on idle duration if fleet is already at `nodes.min`.
+    # This is an optimization to avoid terminate-create loop.
+    # There may be race conditions since we don't take the fleet lock.
+    # That's ok: in the worst case we go below `nodes.min`, but
+    # the fleet consolidation logic will provision new nodes.
+    fleet = fleet_model_to_fleet(fleet_model)
+    if fleet.spec.configuration.nodes is None or fleet.spec.autocreated:
+        return True
+    active_instances = [i for i in fleet_model.instances if i.status.is_active()]
+    active_instances_num = len(active_instances)
+    return active_instances_num > fleet.spec.configuration.nodes.min
+
+
 async def _add_remote(instance: InstanceModel) -> None:
     logger.info("Adding ssh instance %s...", instance.name)
     if instance.status == InstanceStatus.PENDING:
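In short, an idle instance in a fleet with an explicit nodes.min is only reclaimed while the fleet still has more active instances than the configured minimum. A simplified illustration of the same check, using hypothetical stand-in types rather than the actual dstack models:

from dataclasses import dataclass
from typing import List, Optional


@dataclass
class Instance:  # stand-in for InstanceModel
    active: bool


@dataclass
class Fleet:  # stand-in for a fleet spec with nodes.min
    nodes_min: Optional[int]
    instances: List[Instance]


def can_terminate_on_idle(fleet: Fleet) -> bool:
    if fleet.nodes_min is None:
        return True  # no minimum configured: idle termination is always allowed
    active = sum(1 for i in fleet.instances if i.active)
    return active > fleet.nodes_min


fleet = Fleet(nodes_min=2, instances=[Instance(True), Instance(True), Instance(True)])
print(can_terminate_on_idle(fleet))  # True: 3 active instances > nodes.min of 2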
@@ -918,51 +957,48 @@ async def _terminate(instance: InstanceModel) -> None:
     ):
         return
     jpd = get_instance_provisioning_data(instance)
-    if jpd is not None:
-
-
-
+    if jpd is not None and jpd.backend != BackendType.REMOTE:
+        backend = await backends_services.get_project_backend_by_type(
+            project=instance.project, backend_type=jpd.backend
+        )
+        if backend is None:
+            logger.error(
+                "Failed to terminate instance %s. Backend %s not available.",
+                instance.name,
+                jpd.backend,
             )
-
+        else:
+            logger.debug("Terminating runner instance %s", jpd.hostname)
+            try:
+                await run_async(
+                    backend.compute().terminate_instance,
+                    jpd.instance_id,
+                    jpd.region,
+                    jpd.backend_data,
+                )
+            except Exception as e:
+                if instance.first_termination_retry_at is None:
+                    instance.first_termination_retry_at = get_current_datetime()
+                instance.last_termination_retry_at = get_current_datetime()
+                if _next_termination_retry_at(instance) < _get_termination_deadline(instance):
+                    if isinstance(e, NotYetTerminated):
+                        logger.debug("Instance %s termination in progress: %s", instance.name, e)
+                    else:
+                        logger.warning(
+                            "Failed to terminate instance %s. Will retry. Error: %r",
+                            instance.name,
+                            e,
+                            exc_info=not isinstance(e, BackendError),
+                        )
+                    return
                 logger.error(
-                    "Failed to terminate instance %s.
+                    "Failed all attempts to terminate instance %s."
+                    " Please terminate the instance manually to avoid unexpected charges."
+                    " Error: %r",
                     instance.name,
-
+                    e,
+                    exc_info=not isinstance(e, BackendError),
                 )
-        else:
-            logger.debug("Terminating runner instance %s", jpd.hostname)
-            try:
-                await run_async(
-                    backend.compute().terminate_instance,
-                    jpd.instance_id,
-                    jpd.region,
-                    jpd.backend_data,
-                )
-            except Exception as e:
-                if instance.first_termination_retry_at is None:
-                    instance.first_termination_retry_at = get_current_datetime()
-                instance.last_termination_retry_at = get_current_datetime()
-                if _next_termination_retry_at(instance) < _get_termination_deadline(instance):
-                    if isinstance(e, NotYetTerminated):
-                        logger.debug(
-                            "Instance %s termination in progress: %s", instance.name, e
-                        )
-                    else:
-                        logger.warning(
-                            "Failed to terminate instance %s. Will retry. Error: %r",
-                            instance.name,
-                            e,
-                            exc_info=not isinstance(e, BackendError),
-                        )
-                    return
-            logger.error(
-                "Failed all attempts to terminate instance %s."
-                " Please terminate the instance manually to avoid unexpected charges."
-                " Error: %r",
-                instance.name,
-                e,
-                exc_info=not isinstance(e, BackendError),
-            )
 
     instance.deleted = True
     instance.deleted_at = get_current_datetime()
@@ -1126,10 +1162,6 @@ def _get_instance_idle_duration(instance: InstanceModel) -> datetime.timedelta:
     return get_current_datetime() - last_time
 
 
-def _get_retry_duration_deadline(instance: InstanceModel, retry: Retry) -> datetime.datetime:
-    return instance.created_at + timedelta(seconds=retry.duration)
-
-
 def _get_provisioning_deadline(
     instance: InstanceModel,
     job_provisioning_data: JobProvisioningData,