dstack 0.18.43__py3-none-any.whl → 0.18.44__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of dstack might be problematic.
- dstack/_internal/cli/services/configurators/run.py +1 -0
- dstack/_internal/cli/utils/run.py +11 -0
- dstack/_internal/core/backends/aws/compute.py +1 -0
- dstack/_internal/core/backends/azure/compute.py +1 -1
- dstack/_internal/core/backends/gcp/compute.py +1 -1
- dstack/_internal/core/backends/runpod/compute.py +21 -3
- dstack/_internal/core/backends/runpod/config.py +8 -0
- dstack/_internal/core/models/backends/runpod.py +2 -0
- dstack/_internal/core/models/configurations.py +2 -1
- dstack/_internal/core/models/profiles.py +46 -1
- dstack/_internal/core/models/runs.py +4 -0
- dstack/_internal/server/app.py +11 -1
- dstack/_internal/server/background/__init__.py +10 -0
- dstack/_internal/server/background/tasks/process_placement_groups.py +1 -0
- dstack/_internal/server/background/tasks/process_prometheus_metrics.py +135 -0
- dstack/_internal/server/background/tasks/process_running_jobs.py +66 -19
- dstack/_internal/server/background/tasks/process_runs.py +1 -0
- dstack/_internal/server/background/tasks/process_submitted_jobs.py +4 -1
- dstack/_internal/server/migrations/versions/60e444118b6d_add_jobprometheusmetrics.py +40 -0
- dstack/_internal/server/migrations/versions/98d1b92988bc_add_jobterminationreason_terminated_due_.py +140 -0
- dstack/_internal/server/models.py +11 -0
- dstack/_internal/server/routers/metrics.py +21 -2
- dstack/_internal/server/routers/prometheus.py +36 -0
- dstack/_internal/server/security/permissions.py +1 -1
- dstack/_internal/server/services/backends/configurators/runpod.py +3 -33
- dstack/_internal/server/services/config.py +13 -3
- dstack/_internal/server/services/fleets.py +1 -0
- dstack/_internal/server/services/gateways/__init__.py +1 -0
- dstack/_internal/server/services/jobs/configurators/base.py +9 -1
- dstack/_internal/server/services/metrics.py +103 -70
- dstack/_internal/server/services/prometheus.py +87 -0
- dstack/_internal/server/services/runner/client.py +14 -3
- dstack/_internal/server/services/runs.py +43 -15
- dstack/_internal/server/services/volumes.py +1 -0
- dstack/_internal/server/settings.py +3 -0
- dstack/_internal/server/statics/index.html +1 -1
- dstack/_internal/server/statics/{main-fe8fd9db55df8d10e648.js → main-4eb116b97819badd1e2c.js} +66 -13
- dstack/_internal/server/statics/{main-fe8fd9db55df8d10e648.js.map → main-4eb116b97819badd1e2c.js.map} +1 -1
- dstack/_internal/server/statics/{main-7510e71dfa9749a4e70e.css → main-da9f8c06a69c20dac23e.css} +1 -1
- dstack/_internal/server/statics/static/media/entraID.d65d1f3e9486a8e56d24fc07b3230885.svg +9 -0
- dstack/_internal/server/testing/common.py +17 -0
- dstack/api/_public/runs.py +3 -0
- dstack/api/server/_fleets.py +2 -0
- dstack/api/server/_runs.py +4 -0
- dstack/api/utils.py +3 -0
- dstack/version.py +1 -1
- {dstack-0.18.43.dist-info → dstack-0.18.44.dist-info}/METADATA +10 -1
- {dstack-0.18.43.dist-info → dstack-0.18.44.dist-info}/RECORD +59 -50
- tests/_internal/server/background/tasks/test_process_prometheus_metrics.py +189 -0
- tests/_internal/server/background/tasks/test_process_running_jobs.py +125 -0
- tests/_internal/server/routers/test_fleets.py +2 -0
- tests/_internal/server/routers/test_metrics.py +15 -0
- tests/_internal/server/routers/test_prometheus.py +244 -0
- tests/_internal/server/routers/test_runs.py +79 -56
- tests/_internal/server/services/test_metrics.py +163 -0
- {dstack-0.18.43.dist-info → dstack-0.18.44.dist-info}/LICENSE.md +0 -0
- {dstack-0.18.43.dist-info → dstack-0.18.44.dist-info}/WHEEL +0 -0
- {dstack-0.18.43.dist-info → dstack-0.18.44.dist-info}/entry_points.txt +0 -0
- {dstack-0.18.43.dist-info → dstack-0.18.44.dist-info}/top_level.txt +0 -0
dstack/_internal/server/migrations/versions/60e444118b6d_add_jobprometheusmetrics.py
ADDED
@@ -0,0 +1,40 @@
+"""Add JobPrometheusMetrics
+
+Revision ID: 60e444118b6d
+Revises: a751ef183f27
+Create Date: 2025-02-21 10:59:26.339353
+
+"""
+
+import sqlalchemy as sa
+import sqlalchemy_utils
+from alembic import op
+
+import dstack._internal.server.models
+
+# revision identifiers, used by Alembic.
+revision = "60e444118b6d"
+down_revision = "a751ef183f27"
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.create_table(
+        "job_prometheus_metrics",
+        sa.Column("job_id", sqlalchemy_utils.types.uuid.UUIDType(binary=False), nullable=False),
+        sa.Column("collected_at", dstack._internal.server.models.NaiveDateTime(), nullable=False),
+        sa.Column("text", sa.Text(), nullable=False),
+        sa.ForeignKeyConstraint(
+            ["job_id"], ["jobs.id"], name=op.f("fk_job_prometheus_metrics_job_id_jobs")
+        ),
+        sa.PrimaryKeyConstraint("job_id", name=op.f("pk_job_prometheus_metrics")),
+    )
+    # ### end Alembic commands ###
+
+
+def downgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.drop_table("job_prometheus_metrics")
+    # ### end Alembic commands ###
dstack/_internal/server/migrations/versions/98d1b92988bc_add_jobterminationreason_terminated_due_.py
ADDED
@@ -0,0 +1,140 @@
+"""Add JobTerminationReason.TERMINATED_DUE_TO_UTILIZATION_POLICY
+
+Revision ID: 98d1b92988bc
+Revises: 60e444118b6d
+Create Date: 2025-02-28 15:12:37.649876
+
+"""
+
+import sqlalchemy as sa
+from alembic import op
+from alembic_postgresql_enum import TableReference
+
+# revision identifiers, used by Alembic.
+revision = "98d1b92988bc"
+down_revision = "60e444118b6d"
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    # SQLite
+    with op.batch_alter_table("jobs", schema=None) as batch_op:
+        batch_op.alter_column(
+            "termination_reason",
+            existing_type=sa.VARCHAR(length=34),
+            type_=sa.Enum(
+                "FAILED_TO_START_DUE_TO_NO_CAPACITY",
+                "INTERRUPTED_BY_NO_CAPACITY",
+                "WAITING_INSTANCE_LIMIT_EXCEEDED",
+                "WAITING_RUNNER_LIMIT_EXCEEDED",
+                "TERMINATED_BY_USER",
+                "VOLUME_ERROR",
+                "GATEWAY_ERROR",
+                "SCALED_DOWN",
+                "DONE_BY_RUNNER",
+                "ABORTED_BY_USER",
+                "TERMINATED_BY_SERVER",
+                "INACTIVITY_DURATION_EXCEEDED",
+                "TERMINATED_DUE_TO_UTILIZATION_POLICY",
+                "CONTAINER_EXITED_WITH_ERROR",
+                "PORTS_BINDING_FAILED",
+                "CREATING_CONTAINER_ERROR",
+                "EXECUTOR_ERROR",
+                "MAX_DURATION_EXCEEDED",
+                name="jobterminationreason",
+            ),
+            existing_nullable=True,
+        )
+    # PostgreSQL
+    op.sync_enum_values(
+        enum_schema="public",
+        enum_name="jobterminationreason",
+        new_values=[
+            "FAILED_TO_START_DUE_TO_NO_CAPACITY",
+            "INTERRUPTED_BY_NO_CAPACITY",
+            "WAITING_INSTANCE_LIMIT_EXCEEDED",
+            "WAITING_RUNNER_LIMIT_EXCEEDED",
+            "TERMINATED_BY_USER",
+            "VOLUME_ERROR",
+            "GATEWAY_ERROR",
+            "SCALED_DOWN",
+            "DONE_BY_RUNNER",
+            "ABORTED_BY_USER",
+            "TERMINATED_BY_SERVER",
+            "INACTIVITY_DURATION_EXCEEDED",
+            "TERMINATED_DUE_TO_UTILIZATION_POLICY",
+            "CONTAINER_EXITED_WITH_ERROR",
+            "PORTS_BINDING_FAILED",
+            "CREATING_CONTAINER_ERROR",
+            "EXECUTOR_ERROR",
+            "MAX_DURATION_EXCEEDED",
+        ],
+        affected_columns=[
+            TableReference(
+                table_schema="public", table_name="jobs", column_name="termination_reason"
+            )
+        ],
+        enum_values_to_rename=[],
+    )
+
+
+def downgrade() -> None:
+    # SQLite
+    with op.batch_alter_table("jobs", schema=None) as batch_op:
+        batch_op.alter_column(
+            "termination_reason",
+            existing_type=sa.Enum(
+                "FAILED_TO_START_DUE_TO_NO_CAPACITY",
+                "INTERRUPTED_BY_NO_CAPACITY",
+                "WAITING_INSTANCE_LIMIT_EXCEEDED",
+                "WAITING_RUNNER_LIMIT_EXCEEDED",
+                "TERMINATED_BY_USER",
+                "VOLUME_ERROR",
+                "GATEWAY_ERROR",
+                "SCALED_DOWN",
+                "DONE_BY_RUNNER",
+                "ABORTED_BY_USER",
+                "TERMINATED_BY_SERVER",
+                "INACTIVITY_DURATION_EXCEEDED",
+                "TERMINATED_DUE_TO_UTILIZATION_POLICY",
+                "CONTAINER_EXITED_WITH_ERROR",
+                "PORTS_BINDING_FAILED",
+                "CREATING_CONTAINER_ERROR",
+                "EXECUTOR_ERROR",
+                "MAX_DURATION_EXCEEDED",
+                name="jobterminationreason",
+            ),
+            type_=sa.VARCHAR(length=34),
+            existing_nullable=True,
+        )
+    # PostgreSQL
+    op.sync_enum_values(
+        enum_schema="public",
+        enum_name="jobterminationreason",
+        new_values=[
+            "FAILED_TO_START_DUE_TO_NO_CAPACITY",
+            "INTERRUPTED_BY_NO_CAPACITY",
+            "WAITING_INSTANCE_LIMIT_EXCEEDED",
+            "WAITING_RUNNER_LIMIT_EXCEEDED",
+            "TERMINATED_BY_USER",
+            "VOLUME_ERROR",
+            "GATEWAY_ERROR",
+            "SCALED_DOWN",
+            "DONE_BY_RUNNER",
+            "ABORTED_BY_USER",
+            "TERMINATED_BY_SERVER",
+            "INACTIVITY_DURATION_EXCEEDED",
+            "CONTAINER_EXITED_WITH_ERROR",
+            "PORTS_BINDING_FAILED",
+            "CREATING_CONTAINER_ERROR",
+            "EXECUTOR_ERROR",
+            "MAX_DURATION_EXCEEDED",
+        ],
+        affected_columns=[
+            TableReference(
+                table_schema="public", table_name="jobs", column_name="termination_reason"
+            )
+        ],
+        enum_values_to_rename=[],
+    )
dstack/_internal/server/models.py
@@ -648,3 +648,14 @@ class JobMetricsPoint(BaseModel):
     # json-encoded lists of metric values of len(gpus) length
     gpus_memory_usage_bytes: Mapped[str] = mapped_column(Text)
     gpus_util_percent: Mapped[str] = mapped_column(Text)
+
+
+class JobPrometheusMetrics(BaseModel):
+    __tablename__ = "job_prometheus_metrics"
+
+    job_id: Mapped[uuid.UUID] = mapped_column(ForeignKey("jobs.id"), primary_key=True)
+    job: Mapped["JobModel"] = relationship()
+
+    collected_at: Mapped[datetime] = mapped_column(NaiveDateTime)
+    # Raw Prometheus text response
+    text: Mapped[str] = mapped_column(Text)
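
The new JobPrometheusMetrics table keeps one row per job holding the raw Prometheus text most recently scraped from the runner (see the new process_prometheus_metrics background task in the file list). A minimal sketch of how such a row could be written, using a hypothetical helper rather than the task's actual code:

    from datetime import datetime, timezone

    from sqlalchemy import select
    from sqlalchemy.ext.asyncio import AsyncSession

    from dstack._internal.server.models import JobModel, JobPrometheusMetrics


    async def store_job_prometheus_metrics(session: AsyncSession, job: JobModel, text: str) -> None:
        # One row per job (job_id is the primary key): update in place or insert.
        res = await session.execute(
            select(JobPrometheusMetrics).where(JobPrometheusMetrics.job_id == job.id)
        )
        row = res.scalar_one_or_none()
        if row is None:
            row = JobPrometheusMetrics(job_id=job.id)
            session.add(row)
        row.collected_at = datetime.now(timezone.utc).replace(tzinfo=None)  # naive UTC, as NaiveDateTime expects
        row.text = text
        await session.commit()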
dstack/_internal/server/routers/metrics.py
@@ -1,13 +1,16 @@
-from
+from datetime import datetime
+from typing import Optional, Tuple

 from fastapi import APIRouter, Depends
 from sqlalchemy.ext.asyncio import AsyncSession

+from dstack._internal.core.errors import ResourceNotExistsError
 from dstack._internal.core.models.metrics import JobMetrics
 from dstack._internal.server.db import get_session
 from dstack._internal.server.models import ProjectModel, UserModel
 from dstack._internal.server.security.permissions import ProjectMember
 from dstack._internal.server.services import metrics
+from dstack._internal.server.services.jobs import get_run_job_model
 from dstack._internal.server.utils.routers import get_base_api_additional_responses

 router = APIRouter(
@@ -24,6 +27,9 @@ async def get_job_metrics(
     run_name: str,
     replica_num: int = 0,
     job_num: int = 0,
+    limit: int = 1,
+    after: Optional[datetime] = None,
+    before: Optional[datetime] = None,
     session: AsyncSession = Depends(get_session),
     user_project: Tuple[UserModel, ProjectModel] = Depends(ProjectMember()),
 ) -> JobMetrics:
@@ -31,6 +37,8 @@ async def get_job_metrics(
     Returns job-level metrics such as hardware utilization
     given `run_name`, `replica_num`, and `job_num`.
     If only `run_name` is specified, returns metrics of `(replica_num=0, job_num=0)`.
+    By default, returns one latest sample. To control time window/number of samples, use
+    `limit`, `after`, `before`.

     Supported metrics: [
         "cpu_usage_percent",
@@ -42,10 +50,21 @@ async def get_job_metrics(
     ]
     """
     _, project = user_project
-
+
+    job_model = await get_run_job_model(
         session=session,
         project=project,
         run_name=run_name,
         replica_num=replica_num,
         job_num=job_num,
     )
+    if job_model is None:
+        raise ResourceNotExistsError("Found no job with given parameters")
+
+    return await metrics.get_job_metrics(
+        session=session,
+        job_model=job_model,
+        limit=limit,
+        after=after,
+        before=before,
+    )
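
The job metrics endpoint now accepts limit, after, and before, so a window of samples can be requested instead of only the latest one. A hedged usage sketch; the URL layout, HTTP method, and auth header follow dstack's usual REST conventions and are assumptions here, not taken from this diff:

    import requests

    resp = requests.get(
        "http://localhost:3000/api/project/main/metrics/job/my-run",
        headers={"Authorization": "Bearer <token>"},
        params={"replica_num": 0, "job_num": 0, "limit": 10, "after": "2025-02-28T00:00:00"},
    )
    resp.raise_for_status()
    print(resp.json())  # JobMetrics with up to 10 samples newer than `after`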
dstack/_internal/server/routers/prometheus.py
ADDED
@@ -0,0 +1,36 @@
+from typing import Annotated
+
+from fastapi import APIRouter, Depends
+from fastapi.responses import PlainTextResponse
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from dstack._internal.server import settings
+from dstack._internal.server.db import get_session
+from dstack._internal.server.deps import Project
+from dstack._internal.server.models import ProjectModel
+from dstack._internal.server.services import prometheus
+from dstack._internal.server.utils.routers import error_not_found
+
+router = APIRouter(
+    tags=["prometheus"],
+    default_response_class=PlainTextResponse,
+)
+
+
+@router.get("/metrics")
+async def get_prometheus_metrics(
+    session: Annotated[AsyncSession, Depends(get_session)],
+) -> str:
+    if not settings.ENABLE_PROMETHEUS_METRICS:
+        raise error_not_found()
+    return await prometheus.get_metrics(session=session)
+
+
+@router.get("/metrics/project/{project_name}")
+async def get_project_prometheus_metrics(
+    session: Annotated[AsyncSession, Depends(get_session)],
+    project: Annotated[ProjectModel, Depends(Project())],
+) -> str:
+    if not settings.ENABLE_PROMETHEUS_METRICS:
+        raise error_not_found()
+    return await prometheus.get_project_metrics(session=session, project=project)
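
The new router serves Prometheus text format at /metrics (all projects) and /metrics/project/{project_name}, and both endpoints return 404 unless the ENABLE_PROMETHEUS_METRICS setting is on. A minimal scrape sketch, assuming the router is mounted at the server root and the corresponding environment flag (presumably DSTACK_ENABLE_PROMETHEUS_METRICS) is set; authentication, which the Project() dependency may require for the per-project endpoint, is omitted:

    import requests

    base = "http://localhost:3000"
    print(requests.get(f"{base}/metrics").text)                # server-wide metrics
    print(requests.get(f"{base}/metrics/project/main").text)   # one project's metrics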
dstack/_internal/server/security/permissions.py
@@ -80,7 +80,7 @@ class ProjectManager:
         project = await get_project_model_by_name(session=session, project_name=project_name)
         if project is None:
             raise error_forbidden()
-        if user.global_role
+        if user.global_role == GlobalRole.ADMIN:
             return user, project
         project_role = get_user_project_role(user=user, project=project)
         if project_role in [ProjectRole.ADMIN, ProjectRole.MANAGER]:
dstack/_internal/server/services/backends/configurators/runpod.py
@@ -3,11 +3,7 @@ from typing import List

 from dstack._internal.core.backends.base import Backend
 from dstack._internal.core.backends.runpod import RunpodBackend, RunpodConfig, api_client
-from dstack._internal.core.models.backends.base import
-    BackendType,
-    ConfigElementValue,
-    ConfigMultiElement,
-)
+from dstack._internal.core.models.backends.base import BackendType, ConfigMultiElement
 from dstack._internal.core.models.backends.runpod import (
     RunpodConfigInfo,
     RunpodConfigInfoWithCreds,
@@ -22,25 +18,6 @@ from dstack._internal.server.services.backends.configurators.base import (
     raise_invalid_credentials_error,
 )

-REGIONS = [
-    "CA-MTL-1",
-    "CA-MTL-2",
-    "CA-MTL-3",
-    "EU-NL-1",
-    "EU-RO-1",
-    "EU-SE-1",
-    "EUR-IS-1",
-    "EUR-IS-2",
-    "US-CA-1",
-    "US-GA-1",
-    "US-GA-2",
-    "US-KS-2",
-    "US-OR-1",
-    "US-TX-3",
-]
-
-DEFAULT_REGION = "CA-MTL-1"
-

 class RunpodConfigurator(Configurator):
     TYPE: BackendType = BackendType.RUNPOD
@@ -50,16 +27,12 @@ class RunpodConfigurator(Configurator):
         if config.creds is None:
             return config_values
         self._validate_runpod_api_key(config.creds.api_key)
-        config_values.regions = self._get_regions_element(
-            selected=config.regions or [DEFAULT_REGION]
-        )
+        config_values.regions = self._get_regions_element(selected=config.regions or [])
         return config_values

     def create_backend(
         self, project: ProjectModel, config: RunpodConfigInfoWithCreds
     ) -> BackendModel:
-        if config.regions is None:
-            config.regions = REGIONS
         return BackendModel(
             project_id=project.id,
             type=self.TYPE.value,
@@ -80,10 +53,7 @@ class RunpodConfigurator(Configurator):
         return RunpodBackend(config=config)

     def _get_regions_element(self, selected: List[str]) -> ConfigMultiElement:
-
-        for r in REGIONS:
-            element.values.append(ConfigElementValue(value=r, label=r))
-        return element
+        return ConfigMultiElement(selected=selected)

     def _get_backend_config(self, model: BackendModel) -> RunpodConfig:
         return RunpodConfig(
dstack/_internal/server/services/config.py
@@ -6,6 +6,7 @@ from pydantic import BaseModel, Field, ValidationError, root_validator
 from sqlalchemy.ext.asyncio import AsyncSession
 from typing_extensions import Annotated

+from dstack._internal.core.backends.runpod.config import RUNPOD_COMMUNITY_CLOUD_DEFAULT
 from dstack._internal.core.errors import (
     BackendNotAvailable,
     ResourceNotExistsError,
@@ -45,7 +46,7 @@ logger = get_logger(__name__)
 # By default, PyYAML chooses the style of a collection depending on whether it has nested collections.
 # If a collection has nested collections, it will be assigned the block style. Otherwise it will have the flow style.
 #
-# We want mapping to always be
+# We want mapping to always be displayed in block-style but lists without nested objects in flow-style.
 # So we define a custom representeter


@@ -340,7 +341,7 @@ class KubernetesConfig(CoreModel):
     kubeconfig: Annotated[KubeconfigConfig, Field(description="The kubeconfig configuration")]
     networking: Annotated[
         Optional[KubernetesNetworkingConfig], Field(description="The networking configuration")
-    ]
+    ] = None


 class KubernetesAPIConfig(CoreModel):
@@ -348,7 +349,7 @@ class KubernetesAPIConfig(CoreModel):
     kubeconfig: Annotated[KubeconfigAPIConfig, Field(description="The kubeconfig configuration")]
     networking: Annotated[
         Optional[KubernetesNetworkingConfig], Field(description="The networking configuration")
-    ]
+    ] = None


 class LambdaConfig(CoreModel):
@@ -428,6 +429,15 @@ class RunpodConfig(CoreModel):
         Optional[List[str]],
         Field(description="The list of RunPod regions. Omit to use all regions"),
     ] = None
+    community_cloud: Annotated[
+        Optional[bool],
+        Field(
+            description=(
+                "Whether Community Cloud offers can be suggested in addition to Secure Cloud."
+                f" Defaults to `{str(RUNPOD_COMMUNITY_CLOUD_DEFAULT).lower()}`"
+            )
+        ),
+    ] = None
     creds: Annotated[AnyRunpodCreds, Field(description="The credentials")]

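
community_cloud is tri-state in the server config model: an explicit true/false wins, while an unset value falls back to the package-level default exported as RUNPOD_COMMUNITY_CLOUD_DEFAULT. A small sketch of that convention with a hypothetical helper, for illustration only:

    from typing import Optional

    from dstack._internal.core.backends.runpod.config import RUNPOD_COMMUNITY_CLOUD_DEFAULT


    def resolve_community_cloud(community_cloud: Optional[bool]) -> bool:
        # Explicit True/False wins; unset (None) falls back to the default.
        if community_cloud is not None:
            return community_cloud
        return RUNPOD_COMMUNITY_CLOUD_DEFAULT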
dstack/_internal/server/services/fleets.py
@@ -517,6 +517,7 @@ async def delete_fleets(
         .options(selectinload(FleetModel.instances))
         .options(selectinload(FleetModel.runs))
         .execution_options(populate_existing=True)
+        .order_by(FleetModel.id)  # take locks in order
         .with_for_update()
     )
     fleet_models = res.scalars().unique().all()
dstack/_internal/server/services/gateways/__init__.py
@@ -220,6 +220,7 @@ async def delete_gateways(
         )
         .options(selectinload(GatewayModel.gateway_compute))
         .execution_options(populate_existing=True)
+        .order_by(GatewayModel.id)  # take locks in order
         .with_for_update()
     )
     gateway_models = res.scalars().all()
dstack/_internal/server/services/jobs/configurators/base.py
@@ -13,7 +13,11 @@ from dstack._internal.core.models.configurations import (
     PythonVersion,
     RunConfigurationType,
 )
-from dstack._internal.core.models.profiles import
+from dstack._internal.core.models.profiles import (
+    DEFAULT_STOP_DURATION,
+    SpotPolicy,
+    UtilizationPolicy,
+)
 from dstack._internal.core.models.runs import (
     AppSpec,
     JobSpec,
@@ -113,6 +117,7 @@ class JobConfigurator(ABC):
             single_branch=self._single_branch(),
             max_duration=self._max_duration(),
             stop_duration=self._stop_duration(),
+            utilization_policy=self._utilization_policy(),
             registry_auth=self._registry_auth(),
             requirements=self._requirements(),
             retry=self._retry(),
@@ -201,6 +206,9 @@ class JobConfigurator(ABC):
         # pydantic validator ensures this is int
         return self.run_spec.merged_profile.stop_duration

+    def _utilization_policy(self) -> Optional[UtilizationPolicy]:
+        return self.run_spec.merged_profile.utilization_policy
+
     def _registry_auth(self) -> Optional[RegistryAuth]:
         return self.run_spec.configuration.registry_auth

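
The configurator now copies utilization_policy from the merged profile onto every JobSpec, which pairs with the new TERMINATED_DUE_TO_UTILIZATION_POLICY termination reason added by the migration above. A hedged sketch of constructing the policy; the field names (min_gpu_utilization, time_window) come from dstack's documentation for this feature rather than from this diff, so treat them as assumptions:

    from dstack._internal.core.models.profiles import UtilizationPolicy

    # Terminate the job if GPU utilization stays below 10% for 30 minutes (assumed field names).
    policy = UtilizationPolicy(min_gpu_utilization=10, time_window="30m")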