dstack-0.19.20-py3-none-any.whl → dstack-0.19.21-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dstack might be problematic. Click here for more details.
- dstack/_internal/core/backends/__init__.py +0 -65
- dstack/_internal/core/backends/features.py +64 -0
- dstack/_internal/core/compatibility/fleets.py +2 -0
- dstack/_internal/core/compatibility/runs.py +4 -0
- dstack/_internal/core/models/profiles.py +37 -0
- dstack/_internal/server/app.py +22 -10
- dstack/_internal/server/background/__init__.py +5 -6
- dstack/_internal/server/background/tasks/process_fleets.py +52 -38
- dstack/_internal/server/background/tasks/process_gateways.py +2 -2
- dstack/_internal/server/background/tasks/process_idle_volumes.py +5 -4
- dstack/_internal/server/background/tasks/process_instances.py +62 -48
- dstack/_internal/server/background/tasks/process_metrics.py +9 -2
- dstack/_internal/server/background/tasks/process_placement_groups.py +2 -0
- dstack/_internal/server/background/tasks/process_prometheus_metrics.py +14 -2
- dstack/_internal/server/background/tasks/process_running_jobs.py +129 -124
- dstack/_internal/server/background/tasks/process_runs.py +63 -20
- dstack/_internal/server/background/tasks/process_submitted_jobs.py +12 -10
- dstack/_internal/server/background/tasks/process_terminating_jobs.py +12 -4
- dstack/_internal/server/background/tasks/process_volumes.py +4 -1
- dstack/_internal/server/migrations/versions/50dd7ea98639_index_status_columns.py +55 -0
- dstack/_internal/server/migrations/versions/ec02a26a256c_add_runmodel_next_triggered_at.py +38 -0
- dstack/_internal/server/models.py +16 -16
- dstack/_internal/server/services/fleets.py +19 -10
- dstack/_internal/server/services/gateways/__init__.py +17 -17
- dstack/_internal/server/services/instances.py +10 -14
- dstack/_internal/server/services/jobs/__init__.py +10 -12
- dstack/_internal/server/services/offers.py +3 -3
- dstack/_internal/server/services/projects.py +35 -15
- dstack/_internal/server/services/prometheus/client_metrics.py +3 -0
- dstack/_internal/server/services/prometheus/custom_metrics.py +2 -3
- dstack/_internal/server/services/runs.py +74 -34
- dstack/_internal/server/services/services/__init__.py +4 -1
- dstack/_internal/server/services/users.py +2 -3
- dstack/_internal/server/services/volumes.py +11 -11
- dstack/_internal/server/settings.py +3 -0
- dstack/_internal/server/testing/common.py +7 -0
- dstack/_internal/server/utils/sentry_utils.py +12 -0
- dstack/_internal/utils/cron.py +5 -0
- dstack/version.py +1 -1
- {dstack-0.19.20.dist-info → dstack-0.19.21.dist-info}/METADATA +2 -11
- {dstack-0.19.20.dist-info → dstack-0.19.21.dist-info}/RECORD +44 -39
- {dstack-0.19.20.dist-info → dstack-0.19.21.dist-info}/WHEEL +0 -0
- {dstack-0.19.20.dist-info → dstack-0.19.21.dist-info}/entry_points.txt +0 -0
- {dstack-0.19.20.dist-info → dstack-0.19.21.dist-info}/licenses/LICENSE.md +0 -0
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import asyncio
|
|
2
2
|
import datetime
|
|
3
3
|
import uuid
|
|
4
|
-
from datetime import timedelta
|
|
4
|
+
from datetime import timedelta
|
|
5
5
|
from functools import partial
|
|
6
6
|
from typing import List, Optional, Sequence
|
|
7
7
|
|
|
@@ -11,16 +11,16 @@ from sqlalchemy.ext.asyncio import AsyncSession
|
|
|
11
11
|
from sqlalchemy.orm import selectinload
|
|
12
12
|
|
|
13
13
|
import dstack._internal.utils.random_names as random_names
|
|
14
|
-
from dstack._internal.core.backends import (
|
|
15
|
-
BACKENDS_WITH_GATEWAY_SUPPORT,
|
|
16
|
-
BACKENDS_WITH_PRIVATE_GATEWAY_SUPPORT,
|
|
17
|
-
)
|
|
18
14
|
from dstack._internal.core.backends.base.compute import (
|
|
19
15
|
Compute,
|
|
20
16
|
ComputeWithGatewaySupport,
|
|
21
17
|
get_dstack_gateway_wheel,
|
|
22
18
|
get_dstack_runner_version,
|
|
23
19
|
)
|
|
20
|
+
from dstack._internal.core.backends.features import (
|
|
21
|
+
BACKENDS_WITH_GATEWAY_SUPPORT,
|
|
22
|
+
BACKENDS_WITH_PRIVATE_GATEWAY_SUPPORT,
|
|
23
|
+
)
|
|
24
24
|
from dstack._internal.core.errors import (
|
|
25
25
|
GatewayError,
|
|
26
26
|
ResourceNotExistsError,
|
|
@@ -86,15 +86,6 @@ async def get_gateway_by_name(
|
|
|
86
86
|
return gateway_model_to_gateway(gateway)
|
|
87
87
|
|
|
88
88
|
|
|
89
|
-
async def get_project_default_gateway(
|
|
90
|
-
session: AsyncSession, project: ProjectModel
|
|
91
|
-
) -> Optional[Gateway]:
|
|
92
|
-
gateway: Optional[GatewayModel] = project.default_gateway
|
|
93
|
-
if gateway is None:
|
|
94
|
-
return None
|
|
95
|
-
return gateway_model_to_gateway(gateway)
|
|
96
|
-
|
|
97
|
-
|
|
98
89
|
async def create_gateway_compute(
|
|
99
90
|
project_name: str,
|
|
100
91
|
backend_compute: Compute,
|
|
@@ -181,9 +172,9 @@ async def create_gateway(
|
|
|
181
172
|
session.add(gateway)
|
|
182
173
|
await session.commit()
|
|
183
174
|
|
|
184
|
-
|
|
175
|
+
default_gateway = await get_project_default_gateway_model(session=session, project=project)
|
|
176
|
+
if default_gateway is None or configuration.default:
|
|
185
177
|
await set_default_gateway(session=session, project=project, name=configuration.name)
|
|
186
|
-
|
|
187
178
|
return gateway_model_to_gateway(gateway)
|
|
188
179
|
|
|
189
180
|
|
|
@@ -349,6 +340,15 @@ async def get_project_gateway_model_by_name(
|
|
|
349
340
|
return res.scalar()
|
|
350
341
|
|
|
351
342
|
|
|
343
|
+
async def get_project_default_gateway_model(
|
|
344
|
+
session: AsyncSession, project: ProjectModel
|
|
345
|
+
) -> Optional[GatewayModel]:
|
|
346
|
+
res = await session.execute(
|
|
347
|
+
select(GatewayModel).where(GatewayModel.id == project.default_gateway_id)
|
|
348
|
+
)
|
|
349
|
+
return res.scalar_one_or_none()
|
|
350
|
+
|
|
351
|
+
|
|
352
352
|
async def generate_gateway_name(session: AsyncSession, project: ProjectModel) -> str:
|
|
353
353
|
gateways = await list_project_gateway_models(session=session, project=project)
|
|
354
354
|
names = {g.name for g in gateways}
|
|
@@ -557,7 +557,7 @@ def gateway_model_to_gateway(gateway_model: GatewayModel) -> Gateway:
|
|
|
557
557
|
region=gateway_model.region,
|
|
558
558
|
wildcard_domain=gateway_model.wildcard_domain,
|
|
559
559
|
default=gateway_model.project.default_gateway_id == gateway_model.id,
|
|
560
|
-
created_at=gateway_model.created_at
|
|
560
|
+
created_at=gateway_model.created_at,
|
|
561
561
|
status=gateway_model.status,
|
|
562
562
|
status_message=gateway_model.status_message,
|
|
563
563
|
configuration=configuration,
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import uuid
|
|
2
2
|
from collections.abc import Container, Iterable
|
|
3
|
-
from datetime import datetime
|
|
3
|
+
from datetime import datetime
|
|
4
4
|
from typing import Dict, List, Literal, Optional, Union
|
|
5
5
|
|
|
6
6
|
import gpuhunt
|
|
@@ -8,11 +8,11 @@ from sqlalchemy import and_, or_, select
|
|
|
8
8
|
from sqlalchemy.ext.asyncio import AsyncSession
|
|
9
9
|
from sqlalchemy.orm import joinedload
|
|
10
10
|
|
|
11
|
-
from dstack._internal.core.backends import BACKENDS_WITH_MULTINODE_SUPPORT
|
|
12
11
|
from dstack._internal.core.backends.base.offers import (
|
|
13
12
|
offer_to_catalog_item,
|
|
14
13
|
requirements_to_query_filter,
|
|
15
14
|
)
|
|
15
|
+
from dstack._internal.core.backends.features import BACKENDS_WITH_MULTINODE_SUPPORT
|
|
16
16
|
from dstack._internal.core.models.backends.base import BackendType
|
|
17
17
|
from dstack._internal.core.models.envs import Env
|
|
18
18
|
from dstack._internal.core.models.instances import (
|
|
@@ -34,7 +34,6 @@ from dstack._internal.core.models.profiles import (
|
|
|
34
34
|
TerminationPolicy,
|
|
35
35
|
)
|
|
36
36
|
from dstack._internal.core.models.runs import JobProvisioningData, Requirements
|
|
37
|
-
from dstack._internal.core.models.users import GlobalRole
|
|
38
37
|
from dstack._internal.core.models.volumes import Volume
|
|
39
38
|
from dstack._internal.core.services.profiles import get_termination
|
|
40
39
|
from dstack._internal.server.models import (
|
|
@@ -44,7 +43,7 @@ from dstack._internal.server.models import (
|
|
|
44
43
|
UserModel,
|
|
45
44
|
)
|
|
46
45
|
from dstack._internal.server.services.offers import generate_shared_offer
|
|
47
|
-
from dstack._internal.server.services.projects import
|
|
46
|
+
from dstack._internal.server.services.projects import list_user_project_models
|
|
48
47
|
from dstack._internal.utils import common as common_utils
|
|
49
48
|
from dstack._internal.utils.logging import get_logger
|
|
50
49
|
|
|
@@ -62,7 +61,7 @@ def instance_model_to_instance(instance_model: InstanceModel) -> Instance:
|
|
|
62
61
|
status=instance_model.status,
|
|
63
62
|
unreachable=instance_model.unreachable,
|
|
64
63
|
termination_reason=instance_model.termination_reason,
|
|
65
|
-
created=instance_model.created_at
|
|
64
|
+
created=instance_model.created_at,
|
|
66
65
|
total_blocks=instance_model.total_blocks,
|
|
67
66
|
busy_blocks=instance_model.busy_blocks,
|
|
68
67
|
)
|
|
@@ -372,18 +371,15 @@ async def list_user_instances(
|
|
|
372
371
|
limit: int,
|
|
373
372
|
ascending: bool,
|
|
374
373
|
) -> List[Instance]:
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
return []
|
|
381
|
-
|
|
374
|
+
projects = await list_user_project_models(
|
|
375
|
+
session=session,
|
|
376
|
+
user=user,
|
|
377
|
+
only_names=True,
|
|
378
|
+
)
|
|
382
379
|
if project_names is not None:
|
|
383
|
-
projects = [
|
|
380
|
+
projects = [p for p in projects if p.name in project_names]
|
|
384
381
|
if len(projects) == 0:
|
|
385
382
|
return []
|
|
386
|
-
|
|
387
383
|
instance_models = await list_projects_instance_models(
|
|
388
384
|
session=session,
|
|
389
385
|
projects=projects,
|
|
@@ -1,13 +1,13 @@
|
|
|
1
1
|
import itertools
|
|
2
2
|
import json
|
|
3
|
-
from datetime import timedelta
|
|
3
|
+
from datetime import timedelta
|
|
4
4
|
from typing import Dict, Iterable, List, Optional, Tuple
|
|
5
5
|
from uuid import UUID
|
|
6
6
|
|
|
7
7
|
import requests
|
|
8
8
|
from sqlalchemy import select
|
|
9
9
|
from sqlalchemy.ext.asyncio import AsyncSession
|
|
10
|
-
from sqlalchemy.orm import joinedload
|
|
10
|
+
from sqlalchemy.orm import joinedload, load_only
|
|
11
11
|
|
|
12
12
|
import dstack._internal.server.services.backends as backends_services
|
|
13
13
|
from dstack._internal.core.backends.base.backend import Backend
|
|
@@ -130,7 +130,7 @@ def job_model_to_job_submission(job_model: JobModel) -> JobSubmission:
|
|
|
130
130
|
):
|
|
131
131
|
backend_data = json.loads(job_provisioning_data.backend_data)
|
|
132
132
|
job_provisioning_data.backend = backend_data["base_backend"]
|
|
133
|
-
last_processed_at = job_model.last_processed_at
|
|
133
|
+
last_processed_at = job_model.last_processed_at
|
|
134
134
|
finished_at = None
|
|
135
135
|
if job_model.status.is_finished():
|
|
136
136
|
finished_at = last_processed_at
|
|
@@ -140,7 +140,7 @@ def job_model_to_job_submission(job_model: JobModel) -> JobSubmission:
|
|
|
140
140
|
id=job_model.id,
|
|
141
141
|
submission_num=job_model.submission_num,
|
|
142
142
|
deployment_num=job_model.deployment_num,
|
|
143
|
-
submitted_at=job_model.submitted_at
|
|
143
|
+
submitted_at=job_model.submitted_at,
|
|
144
144
|
last_processed_at=last_processed_at,
|
|
145
145
|
finished_at=finished_at,
|
|
146
146
|
inactivity_secs=job_model.inactivity_secs,
|
|
@@ -231,10 +231,7 @@ async def process_terminating_job(
|
|
|
231
231
|
Graceful stop should already be done by `process_terminating_run`.
|
|
232
232
|
Caller must acquire the locks on the job and the job's instance.
|
|
233
233
|
"""
|
|
234
|
-
if (
|
|
235
|
-
job_model.remove_at is not None
|
|
236
|
-
and job_model.remove_at.replace(tzinfo=timezone.utc) > common.get_current_datetime()
|
|
237
|
-
):
|
|
234
|
+
if job_model.remove_at is not None and job_model.remove_at > common.get_current_datetime():
|
|
238
235
|
# it's too early to terminate the instance
|
|
239
236
|
return
|
|
240
237
|
|
|
@@ -550,24 +547,25 @@ def _should_force_detach_volume(job_model: JobModel, stop_duration: Optional[int
|
|
|
550
547
|
return (
|
|
551
548
|
job_model.volumes_detached_at is not None
|
|
552
549
|
and common.get_current_datetime()
|
|
553
|
-
> job_model.volumes_detached_at
|
|
550
|
+
> job_model.volumes_detached_at + MIN_FORCE_DETACH_WAIT_PERIOD
|
|
554
551
|
and (
|
|
555
552
|
job_model.termination_reason == JobTerminationReason.ABORTED_BY_USER
|
|
556
553
|
or stop_duration is not None
|
|
557
554
|
and common.get_current_datetime()
|
|
558
|
-
> job_model.volumes_detached_at
|
|
559
|
-
+ timedelta(seconds=stop_duration)
|
|
555
|
+
> job_model.volumes_detached_at + timedelta(seconds=stop_duration)
|
|
560
556
|
)
|
|
561
557
|
)
|
|
562
558
|
|
|
563
559
|
|
|
564
560
|
async def get_instances_ids_with_detaching_volumes(session: AsyncSession) -> List[UUID]:
|
|
565
561
|
res = await session.execute(
|
|
566
|
-
select(JobModel)
|
|
562
|
+
select(JobModel)
|
|
563
|
+
.where(
|
|
567
564
|
JobModel.status == JobStatus.TERMINATING,
|
|
568
565
|
JobModel.used_instance_id.is_not(None),
|
|
569
566
|
JobModel.volumes_detached_at.is_not(None),
|
|
570
567
|
)
|
|
568
|
+
.options(load_only(JobModel.used_instance_id))
|
|
571
569
|
)
|
|
572
570
|
job_models = res.scalars().all()
|
|
573
571
|
return [jm.used_instance_id for jm in job_models if jm.used_instance_id]
|
|
@@ -2,13 +2,13 @@ from typing import List, Literal, Optional, Tuple, Union
|
|
|
2
2
|
|
|
3
3
|
import gpuhunt
|
|
4
4
|
|
|
5
|
-
from dstack._internal.core.backends import
|
|
5
|
+
from dstack._internal.core.backends.base.backend import Backend
|
|
6
|
+
from dstack._internal.core.backends.base.compute import ComputeWithPlacementGroupSupport
|
|
7
|
+
from dstack._internal.core.backends.features import (
|
|
6
8
|
BACKENDS_WITH_CREATE_INSTANCE_SUPPORT,
|
|
7
9
|
BACKENDS_WITH_MULTINODE_SUPPORT,
|
|
8
10
|
BACKENDS_WITH_RESERVATION_SUPPORT,
|
|
9
11
|
)
|
|
10
|
-
from dstack._internal.core.backends.base.backend import Backend
|
|
11
|
-
from dstack._internal.core.backends.base.compute import ComputeWithPlacementGroupSupport
|
|
12
12
|
from dstack._internal.core.models.backends.base import BackendType
|
|
13
13
|
from dstack._internal.core.models.instances import (
|
|
14
14
|
InstanceOfferWithAvailability,
|
|
@@ -1,11 +1,10 @@
|
|
|
1
1
|
import uuid
|
|
2
|
-
from datetime import timezone
|
|
3
2
|
from typing import Awaitable, Callable, List, Optional, Tuple
|
|
4
3
|
|
|
5
4
|
from sqlalchemy import delete, select, update
|
|
6
5
|
from sqlalchemy import func as safunc
|
|
7
6
|
from sqlalchemy.ext.asyncio import AsyncSession
|
|
8
|
-
from sqlalchemy.orm import joinedload
|
|
7
|
+
from sqlalchemy.orm import QueryableAttribute, joinedload, load_only
|
|
9
8
|
|
|
10
9
|
from dstack._internal.core.backends.configurators import get_configurator
|
|
11
10
|
from dstack._internal.core.backends.dstack.models import (
|
|
@@ -54,13 +53,12 @@ async def list_user_projects(
|
|
|
54
53
|
user: UserModel,
|
|
55
54
|
) -> List[Project]:
|
|
56
55
|
"""
|
|
57
|
-
Returns projects where the user is a member.
|
|
56
|
+
Returns projects where the user is a member or all projects for global admins.
|
|
58
57
|
"""
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
58
|
+
projects = await list_user_project_models(
|
|
59
|
+
session=session,
|
|
60
|
+
user=user,
|
|
61
|
+
)
|
|
64
62
|
projects = sorted(projects, key=lambda p: p.created_at)
|
|
65
63
|
return [
|
|
66
64
|
project_model_to_project(p, include_backends=False, include_members=False)
|
|
@@ -80,7 +78,7 @@ async def list_user_accessible_projects(
|
|
|
80
78
|
if user.global_role == GlobalRole.ADMIN:
|
|
81
79
|
projects = await list_project_models(session=session)
|
|
82
80
|
else:
|
|
83
|
-
member_projects = await
|
|
81
|
+
member_projects = await list_member_project_models(session=session, user=user)
|
|
84
82
|
public_projects = await list_public_non_member_project_models(session=session, user=user)
|
|
85
83
|
projects = member_projects + public_projects
|
|
86
84
|
|
|
@@ -167,7 +165,7 @@ async def delete_projects(
|
|
|
167
165
|
projects_names: List[str],
|
|
168
166
|
):
|
|
169
167
|
if user.global_role != GlobalRole.ADMIN:
|
|
170
|
-
user_projects = await
|
|
168
|
+
user_projects = await list_member_project_models(
|
|
171
169
|
session=session, user=user, include_members=True
|
|
172
170
|
)
|
|
173
171
|
user_project_names = [p.name for p in user_projects]
|
|
@@ -339,9 +337,25 @@ async def clear_project_members(
|
|
|
339
337
|
|
|
340
338
|
|
|
341
339
|
async def list_user_project_models(
|
|
340
|
+
session: AsyncSession,
|
|
341
|
+
user: UserModel,
|
|
342
|
+
only_names: bool = False,
|
|
343
|
+
) -> List[ProjectModel]:
|
|
344
|
+
load_only_attrs = []
|
|
345
|
+
if only_names:
|
|
346
|
+
load_only_attrs += [ProjectModel.id, ProjectModel.name]
|
|
347
|
+
if user.global_role == GlobalRole.ADMIN:
|
|
348
|
+
return await list_project_models(session=session, load_only_attrs=load_only_attrs)
|
|
349
|
+
return await list_member_project_models(
|
|
350
|
+
session=session, user=user, load_only_attrs=load_only_attrs
|
|
351
|
+
)
|
|
352
|
+
|
|
353
|
+
|
|
354
|
+
async def list_member_project_models(
|
|
342
355
|
session: AsyncSession,
|
|
343
356
|
user: UserModel,
|
|
344
357
|
include_members: bool = False,
|
|
358
|
+
load_only_attrs: Optional[List[QueryableAttribute]] = None,
|
|
345
359
|
) -> List[ProjectModel]:
|
|
346
360
|
"""
|
|
347
361
|
List project models for a user where they are a member.
|
|
@@ -349,6 +363,8 @@ async def list_user_project_models(
|
|
|
349
363
|
options = []
|
|
350
364
|
if include_members:
|
|
351
365
|
options.append(joinedload(ProjectModel.members))
|
|
366
|
+
if load_only_attrs:
|
|
367
|
+
options.append(load_only(*load_only_attrs))
|
|
352
368
|
res = await session.execute(
|
|
353
369
|
select(ProjectModel)
|
|
354
370
|
.where(
|
|
@@ -395,13 +411,20 @@ async def list_user_owned_project_models(
|
|
|
395
411
|
|
|
396
412
|
async def list_project_models(
|
|
397
413
|
session: AsyncSession,
|
|
414
|
+
load_only_attrs: Optional[List[QueryableAttribute]] = None,
|
|
398
415
|
) -> List[ProjectModel]:
|
|
416
|
+
options = []
|
|
417
|
+
if load_only_attrs:
|
|
418
|
+
options.append(load_only(*load_only_attrs))
|
|
399
419
|
res = await session.execute(
|
|
400
|
-
select(ProjectModel).where(ProjectModel.deleted == False)
|
|
420
|
+
select(ProjectModel).where(ProjectModel.deleted == False).options(*options)
|
|
401
421
|
)
|
|
402
422
|
return list(res.scalars().all())
|
|
403
423
|
|
|
404
424
|
|
|
425
|
+
# TODO: Do not load ProjectModel.backends and ProjectModel.members by default when getting project
|
|
426
|
+
|
|
427
|
+
|
|
405
428
|
async def get_project_model_by_name(
|
|
406
429
|
session: AsyncSession, project_name: str, ignore_case: bool = True
|
|
407
430
|
) -> Optional[ProjectModel]:
|
|
@@ -415,7 +438,6 @@ async def get_project_model_by_name(
|
|
|
415
438
|
.where(*filters)
|
|
416
439
|
.options(joinedload(ProjectModel.backends))
|
|
417
440
|
.options(joinedload(ProjectModel.members))
|
|
418
|
-
.options(joinedload(ProjectModel.default_gateway))
|
|
419
441
|
)
|
|
420
442
|
return res.unique().scalar()
|
|
421
443
|
|
|
@@ -432,7 +454,6 @@ async def get_project_model_by_name_or_error(
|
|
|
432
454
|
)
|
|
433
455
|
.options(joinedload(ProjectModel.backends))
|
|
434
456
|
.options(joinedload(ProjectModel.members))
|
|
435
|
-
.options(joinedload(ProjectModel.default_gateway))
|
|
436
457
|
)
|
|
437
458
|
return res.unique().scalar_one()
|
|
438
459
|
|
|
@@ -449,7 +470,6 @@ async def get_project_model_by_id_or_error(
|
|
|
449
470
|
)
|
|
450
471
|
.options(joinedload(ProjectModel.backends))
|
|
451
472
|
.options(joinedload(ProjectModel.members))
|
|
452
|
-
.options(joinedload(ProjectModel.default_gateway))
|
|
453
473
|
)
|
|
454
474
|
return res.unique().scalar_one()
|
|
455
475
|
|
|
@@ -537,7 +557,7 @@ def project_model_to_project(
|
|
|
537
557
|
project_id=project_model.id,
|
|
538
558
|
project_name=project_model.name,
|
|
539
559
|
owner=users.user_model_to_user(project_model.owner),
|
|
540
|
-
created_at=project_model.created_at
|
|
560
|
+
created_at=project_model.created_at,
|
|
541
561
|
backends=backends,
|
|
542
562
|
members=members,
|
|
543
563
|
is_public=project_model.is_public,
|
|
@@ -5,6 +5,9 @@ class RunMetrics:
|
|
|
5
5
|
"""Wrapper class for run-related Prometheus metrics."""
|
|
6
6
|
|
|
7
7
|
def __init__(self):
|
|
8
|
+
# submit_to_provision_duration reflects real provisioning time
|
|
9
|
+
# but does not reflect how quickly provisioning processing works
|
|
10
|
+
# since it includes scheduling time, retrying, etc.
|
|
8
11
|
self._submit_to_provision_duration = Histogram(
|
|
9
12
|
"dstack_submit_to_provision_duration_seconds",
|
|
10
13
|
"Time from when a run has been submitted and first job provisioning",
|
|
@@ -2,7 +2,6 @@ import itertools
|
|
|
2
2
|
import json
|
|
3
3
|
from collections import defaultdict
|
|
4
4
|
from collections.abc import Generator, Iterable
|
|
5
|
-
from datetime import timezone
|
|
6
5
|
from typing import ClassVar
|
|
7
6
|
from uuid import UUID
|
|
8
7
|
|
|
@@ -80,7 +79,7 @@ async def get_instance_metrics(session: AsyncSession) -> Iterable[Metric]:
|
|
|
80
79
|
"dstack_backend": instance.backend.value if instance.backend is not None else "",
|
|
81
80
|
"dstack_gpu": gpu,
|
|
82
81
|
}
|
|
83
|
-
duration = (now - instance.created_at
|
|
82
|
+
duration = (now - instance.created_at).total_seconds()
|
|
84
83
|
metrics.add_sample(_INSTANCE_DURATION, labels, duration)
|
|
85
84
|
metrics.add_sample(_INSTANCE_PRICE, labels, instance.price or 0.0)
|
|
86
85
|
metrics.add_sample(_INSTANCE_GPU_COUNT, labels, gpu_count)
|
|
@@ -167,7 +166,7 @@ async def get_job_metrics(session: AsyncSession) -> Iterable[Metric]:
|
|
|
167
166
|
"dstack_backend": jpd.get_base_backend().value,
|
|
168
167
|
"dstack_gpu": gpus[0].name if gpus else "",
|
|
169
168
|
}
|
|
170
|
-
duration = (now - job.submitted_at
|
|
169
|
+
duration = (now - job.submitted_at).total_seconds()
|
|
171
170
|
metrics.add_sample(_JOB_DURATION, labels, duration)
|
|
172
171
|
metrics.add_sample(_JOB_PRICE, labels, price)
|
|
173
172
|
metrics.add_sample(_JOB_GPU_COUNT, labels, len(gpus))
|
|
@@ -5,9 +5,10 @@ from datetime import datetime, timezone
|
|
|
5
5
|
from typing import List, Optional
|
|
6
6
|
|
|
7
7
|
import pydantic
|
|
8
|
+
from apscheduler.triggers.cron import CronTrigger
|
|
8
9
|
from sqlalchemy import and_, func, or_, select, update
|
|
9
10
|
from sqlalchemy.ext.asyncio import AsyncSession
|
|
10
|
-
from sqlalchemy.orm import joinedload
|
|
11
|
+
from sqlalchemy.orm import joinedload
|
|
11
12
|
|
|
12
13
|
import dstack._internal.utils.common as common_utils
|
|
13
14
|
from dstack._internal.core.errors import (
|
|
@@ -42,7 +43,6 @@ from dstack._internal.core.models.runs import (
|
|
|
42
43
|
RunTerminationReason,
|
|
43
44
|
ServiceSpec,
|
|
44
45
|
)
|
|
45
|
-
from dstack._internal.core.models.users import GlobalRole
|
|
46
46
|
from dstack._internal.core.models.volumes import (
|
|
47
47
|
InstanceMountPoint,
|
|
48
48
|
Volume,
|
|
@@ -81,7 +81,7 @@ from dstack._internal.server.services.locking import get_locker, string_to_lock_
|
|
|
81
81
|
from dstack._internal.server.services.logging import fmt
|
|
82
82
|
from dstack._internal.server.services.offers import get_offers_by_requirements
|
|
83
83
|
from dstack._internal.server.services.plugins import apply_plugin_policies
|
|
84
|
-
from dstack._internal.server.services.projects import
|
|
84
|
+
from dstack._internal.server.services.projects import list_user_project_models
|
|
85
85
|
from dstack._internal.server.services.resources import set_resources_defaults
|
|
86
86
|
from dstack._internal.server.services.secrets import get_project_secrets_mapping
|
|
87
87
|
from dstack._internal.server.services.users import get_user_model_by_name
|
|
@@ -115,10 +115,11 @@ async def list_user_runs(
|
|
|
115
115
|
) -> List[Run]:
|
|
116
116
|
if project_name is None and repo_id is not None:
|
|
117
117
|
return []
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
118
|
+
projects = await list_user_project_models(
|
|
119
|
+
session=session,
|
|
120
|
+
user=user,
|
|
121
|
+
only_names=True,
|
|
122
|
+
)
|
|
122
123
|
runs_user = None
|
|
123
124
|
if username is not None:
|
|
124
125
|
runs_user = await get_user_model_by_name(session=session, username=username)
|
|
@@ -217,9 +218,9 @@ async def list_projects_run_models(
|
|
|
217
218
|
res = await session.execute(
|
|
218
219
|
select(RunModel)
|
|
219
220
|
.where(*filters)
|
|
221
|
+
.options(joinedload(RunModel.user).load_only(UserModel.name))
|
|
220
222
|
.order_by(*order_by)
|
|
221
223
|
.limit(limit)
|
|
222
|
-
.options(selectinload(RunModel.user))
|
|
223
224
|
)
|
|
224
225
|
run_models = list(res.scalars().all())
|
|
225
226
|
return run_models
|
|
@@ -511,6 +512,14 @@ async def submit_run(
|
|
|
511
512
|
)
|
|
512
513
|
|
|
513
514
|
submitted_at = common_utils.get_current_datetime()
|
|
515
|
+
initial_status = RunStatus.SUBMITTED
|
|
516
|
+
initial_replicas = 1
|
|
517
|
+
if run_spec.merged_profile.schedule is not None:
|
|
518
|
+
initial_status = RunStatus.PENDING
|
|
519
|
+
initial_replicas = 0
|
|
520
|
+
elif run_spec.configuration.type == "service":
|
|
521
|
+
initial_replicas = run_spec.configuration.replicas.min
|
|
522
|
+
|
|
514
523
|
run_model = RunModel(
|
|
515
524
|
id=uuid.uuid4(),
|
|
516
525
|
project_id=project.id,
|
|
@@ -519,21 +528,20 @@ async def submit_run(
|
|
|
519
528
|
user_id=user.id,
|
|
520
529
|
run_name=run_spec.run_name,
|
|
521
530
|
submitted_at=submitted_at,
|
|
522
|
-
status=
|
|
531
|
+
status=initial_status,
|
|
523
532
|
run_spec=run_spec.json(),
|
|
524
533
|
last_processed_at=submitted_at,
|
|
525
534
|
priority=run_spec.configuration.priority,
|
|
526
535
|
deployment_num=0,
|
|
527
536
|
desired_replica_count=1, # a relevant value will be set in process_runs.py
|
|
537
|
+
next_triggered_at=_get_next_triggered_at(run_spec),
|
|
528
538
|
)
|
|
529
539
|
session.add(run_model)
|
|
530
540
|
|
|
531
|
-
replicas = 1
|
|
532
541
|
if run_spec.configuration.type == "service":
|
|
533
|
-
replicas = run_spec.configuration.replicas.min
|
|
534
542
|
await services.register_service(session, run_model, run_spec)
|
|
535
543
|
|
|
536
|
-
for replica_num in range(
|
|
544
|
+
for replica_num in range(initial_replicas):
|
|
537
545
|
jobs = await get_jobs_from_run_spec(
|
|
538
546
|
run_spec=run_spec,
|
|
539
547
|
secrets=secrets,
|
|
@@ -693,8 +701,8 @@ def run_model_to_run(
|
|
|
693
701
|
id=run_model.id,
|
|
694
702
|
project_name=run_model.project.name,
|
|
695
703
|
user=run_model.user.name,
|
|
696
|
-
submitted_at=run_model.submitted_at
|
|
697
|
-
last_processed_at=run_model.last_processed_at
|
|
704
|
+
submitted_at=run_model.submitted_at,
|
|
705
|
+
last_processed_at=run_model.last_processed_at,
|
|
698
706
|
status=run_model.status,
|
|
699
707
|
status_message=status_message,
|
|
700
708
|
termination_reason=run_model.termination_reason,
|
|
@@ -972,6 +980,12 @@ def _validate_run_spec_and_set_defaults(run_spec: RunSpec):
|
|
|
972
980
|
raise ServerClientError(
|
|
973
981
|
f"Maximum utilization_policy.time_window is {settings.SERVER_METRICS_RUNNING_TTL_SECONDS}s"
|
|
974
982
|
)
|
|
983
|
+
if (
|
|
984
|
+
run_spec.merged_profile.schedule
|
|
985
|
+
and run_spec.configuration.type == "service"
|
|
986
|
+
and run_spec.configuration.replicas.min == 0
|
|
987
|
+
):
|
|
988
|
+
raise ServerClientError("Scheduled services with autoscaling to zero are not supported")
|
|
975
989
|
if run_spec.configuration.priority is None:
|
|
976
990
|
run_spec.configuration.priority = RUN_PRIORITY_DEFAULT
|
|
977
991
|
set_resources_defaults(run_spec.configuration.resources)
|
|
@@ -1059,7 +1073,7 @@ def _check_can_update_configuration(
|
|
|
1059
1073
|
)
|
|
1060
1074
|
|
|
1061
1075
|
|
|
1062
|
-
async def process_terminating_run(session: AsyncSession,
|
|
1076
|
+
async def process_terminating_run(session: AsyncSession, run_model: RunModel):
|
|
1063
1077
|
"""
|
|
1064
1078
|
Used by both `process_runs` and `stop_run` to process a TERMINATING run.
|
|
1065
1079
|
Stops the jobs gracefully and marks them as TERMINATING.
|
|
@@ -1067,44 +1081,54 @@ async def process_terminating_run(session: AsyncSession, run: RunModel):
|
|
|
1067
1081
|
When all jobs are terminated, assigns a finished status to the run.
|
|
1068
1082
|
Caller must acquire the lock on run.
|
|
1069
1083
|
"""
|
|
1070
|
-
assert
|
|
1071
|
-
|
|
1084
|
+
assert run_model.termination_reason is not None
|
|
1085
|
+
run = run_model_to_run(run_model, include_jobs=False)
|
|
1086
|
+
job_termination_reason = run_model.termination_reason.to_job_termination_reason()
|
|
1072
1087
|
|
|
1073
1088
|
unfinished_jobs_count = 0
|
|
1074
|
-
for
|
|
1075
|
-
if
|
|
1089
|
+
for job_model in run_model.jobs:
|
|
1090
|
+
if job_model.status.is_finished():
|
|
1076
1091
|
continue
|
|
1077
1092
|
unfinished_jobs_count += 1
|
|
1078
|
-
if
|
|
1093
|
+
if job_model.status == JobStatus.TERMINATING:
|
|
1079
1094
|
if job_termination_reason == JobTerminationReason.ABORTED_BY_USER:
|
|
1080
1095
|
# Override termination reason so that
|
|
1081
1096
|
# abort actions such as volume force detach are triggered
|
|
1082
|
-
|
|
1097
|
+
job_model.termination_reason = job_termination_reason
|
|
1083
1098
|
continue
|
|
1084
1099
|
|
|
1085
|
-
if
|
|
1100
|
+
if job_model.status == JobStatus.RUNNING and job_termination_reason not in {
|
|
1086
1101
|
JobTerminationReason.ABORTED_BY_USER,
|
|
1087
1102
|
JobTerminationReason.DONE_BY_RUNNER,
|
|
1088
1103
|
}:
|
|
1089
1104
|
# Send a signal to stop the job gracefully
|
|
1090
|
-
await stop_runner(session,
|
|
1091
|
-
delay_job_instance_termination(
|
|
1092
|
-
|
|
1093
|
-
|
|
1094
|
-
|
|
1105
|
+
await stop_runner(session, job_model)
|
|
1106
|
+
delay_job_instance_termination(job_model)
|
|
1107
|
+
job_model.status = JobStatus.TERMINATING
|
|
1108
|
+
job_model.termination_reason = job_termination_reason
|
|
1109
|
+
job_model.last_processed_at = common_utils.get_current_datetime()
|
|
1095
1110
|
|
|
1096
1111
|
if unfinished_jobs_count == 0:
|
|
1097
|
-
if
|
|
1112
|
+
if run_model.service_spec is not None:
|
|
1098
1113
|
try:
|
|
1099
|
-
await services.unregister_service(session,
|
|
1114
|
+
await services.unregister_service(session, run_model)
|
|
1100
1115
|
except Exception as e:
|
|
1101
|
-
logger.warning("%s: failed to unregister service: %s", fmt(
|
|
1102
|
-
|
|
1116
|
+
logger.warning("%s: failed to unregister service: %s", fmt(run_model), repr(e))
|
|
1117
|
+
if (
|
|
1118
|
+
run.run_spec.merged_profile.schedule is not None
|
|
1119
|
+
and run_model.termination_reason
|
|
1120
|
+
not in [RunTerminationReason.ABORTED_BY_USER, RunTerminationReason.STOPPED_BY_USER]
|
|
1121
|
+
):
|
|
1122
|
+
run_model.next_triggered_at = _get_next_triggered_at(run.run_spec)
|
|
1123
|
+
run_model.status = RunStatus.PENDING
|
|
1124
|
+
else:
|
|
1125
|
+
run_model.status = run_model.termination_reason.to_status()
|
|
1126
|
+
|
|
1103
1127
|
logger.info(
|
|
1104
1128
|
"%s: run status has changed TERMINATING -> %s, reason: %s",
|
|
1105
|
-
fmt(
|
|
1106
|
-
|
|
1107
|
-
|
|
1129
|
+
fmt(run_model),
|
|
1130
|
+
run_model.status.name,
|
|
1131
|
+
run_model.termination_reason.name,
|
|
1108
1132
|
)
|
|
1109
1133
|
|
|
1110
1134
|
|
|
@@ -1224,3 +1248,19 @@ async def retry_run_replica_jobs(
|
|
|
1224
1248
|
|
|
1225
1249
|
def _remove_job_spec_sensitive_info(spec: JobSpec):
|
|
1226
1250
|
spec.ssh_key = None
|
|
1251
|
+
|
|
1252
|
+
|
|
1253
|
+
def _get_next_triggered_at(run_spec: RunSpec) -> Optional[datetime]:
|
|
1254
|
+
if run_spec.merged_profile.schedule is None:
|
|
1255
|
+
return None
|
|
1256
|
+
now = common_utils.get_current_datetime()
|
|
1257
|
+
fire_times = []
|
|
1258
|
+
for cron in run_spec.merged_profile.schedule.crons:
|
|
1259
|
+
cron_trigger = CronTrigger.from_crontab(cron, timezone=timezone.utc)
|
|
1260
|
+
fire_times.append(
|
|
1261
|
+
cron_trigger.get_next_fire_time(
|
|
1262
|
+
previous_fire_time=None,
|
|
1263
|
+
now=now,
|
|
1264
|
+
)
|
|
1265
|
+
)
|
|
1266
|
+
return min(fire_times)
|
|
@@ -28,6 +28,7 @@ from dstack._internal.server.models import GatewayModel, JobModel, ProjectModel,
|
|
|
28
28
|
from dstack._internal.server.services.gateways import (
|
|
29
29
|
get_gateway_configuration,
|
|
30
30
|
get_or_add_gateway_connection,
|
|
31
|
+
get_project_default_gateway_model,
|
|
31
32
|
get_project_gateway_model_by_name,
|
|
32
33
|
)
|
|
33
34
|
from dstack._internal.server.services.logging import fmt
|
|
@@ -52,7 +53,9 @@ async def register_service(session: AsyncSession, run_model: RunModel, run_spec:
|
|
|
52
53
|
elif run_spec.configuration.gateway == False:
|
|
53
54
|
gateway = None
|
|
54
55
|
else:
|
|
55
|
-
gateway =
|
|
56
|
+
gateway = await get_project_default_gateway_model(
|
|
57
|
+
session=session, project=run_model.project
|
|
58
|
+
)
|
|
56
59
|
|
|
57
60
|
if gateway is not None:
|
|
58
61
|
service_spec = await _register_service_in_gateway(session, run_model, run_spec, gateway)
|