dstack 0.19.20__py3-none-any.whl → 0.19.22__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dstack might be problematic. Click here for more details.
- dstack/_internal/cli/commands/apply.py +8 -3
- dstack/_internal/cli/services/configurators/__init__.py +8 -0
- dstack/_internal/cli/services/configurators/fleet.py +1 -1
- dstack/_internal/cli/services/configurators/gateway.py +1 -1
- dstack/_internal/cli/services/configurators/run.py +11 -1
- dstack/_internal/cli/services/configurators/volume.py +1 -1
- dstack/_internal/cli/utils/common.py +48 -5
- dstack/_internal/cli/utils/fleet.py +5 -5
- dstack/_internal/cli/utils/run.py +32 -0
- dstack/_internal/core/backends/__init__.py +0 -65
- dstack/_internal/core/backends/configurators.py +9 -0
- dstack/_internal/core/backends/features.py +64 -0
- dstack/_internal/core/backends/hotaisle/__init__.py +1 -0
- dstack/_internal/core/backends/hotaisle/api_client.py +109 -0
- dstack/_internal/core/backends/hotaisle/backend.py +16 -0
- dstack/_internal/core/backends/hotaisle/compute.py +225 -0
- dstack/_internal/core/backends/hotaisle/configurator.py +60 -0
- dstack/_internal/core/backends/hotaisle/models.py +45 -0
- dstack/_internal/core/backends/lambdalabs/compute.py +2 -1
- dstack/_internal/core/backends/models.py +8 -0
- dstack/_internal/core/compatibility/fleets.py +2 -0
- dstack/_internal/core/compatibility/runs.py +12 -0
- dstack/_internal/core/models/backends/base.py +2 -0
- dstack/_internal/core/models/configurations.py +139 -1
- dstack/_internal/core/models/health.py +28 -0
- dstack/_internal/core/models/instances.py +2 -0
- dstack/_internal/core/models/logs.py +2 -1
- dstack/_internal/core/models/profiles.py +37 -0
- dstack/_internal/core/models/runs.py +21 -1
- dstack/_internal/core/services/ssh/tunnel.py +7 -0
- dstack/_internal/server/app.py +26 -10
- dstack/_internal/server/background/__init__.py +9 -6
- dstack/_internal/server/background/tasks/process_fleets.py +52 -38
- dstack/_internal/server/background/tasks/process_gateways.py +2 -2
- dstack/_internal/server/background/tasks/process_idle_volumes.py +5 -4
- dstack/_internal/server/background/tasks/process_instances.py +168 -103
- dstack/_internal/server/background/tasks/process_metrics.py +9 -2
- dstack/_internal/server/background/tasks/process_placement_groups.py +2 -0
- dstack/_internal/server/background/tasks/process_probes.py +164 -0
- dstack/_internal/server/background/tasks/process_prometheus_metrics.py +14 -2
- dstack/_internal/server/background/tasks/process_running_jobs.py +142 -124
- dstack/_internal/server/background/tasks/process_runs.py +84 -34
- dstack/_internal/server/background/tasks/process_submitted_jobs.py +12 -10
- dstack/_internal/server/background/tasks/process_terminating_jobs.py +12 -4
- dstack/_internal/server/background/tasks/process_volumes.py +4 -1
- dstack/_internal/server/migrations/versions/25479f540245_add_probes.py +43 -0
- dstack/_internal/server/migrations/versions/50dd7ea98639_index_status_columns.py +55 -0
- dstack/_internal/server/migrations/versions/728b1488b1b4_add_instance_health.py +50 -0
- dstack/_internal/server/migrations/versions/ec02a26a256c_add_runmodel_next_triggered_at.py +38 -0
- dstack/_internal/server/models.py +57 -16
- dstack/_internal/server/routers/instances.py +33 -5
- dstack/_internal/server/schemas/health/dcgm.py +56 -0
- dstack/_internal/server/schemas/instances.py +32 -0
- dstack/_internal/server/schemas/runner.py +5 -0
- dstack/_internal/server/services/fleets.py +19 -10
- dstack/_internal/server/services/gateways/__init__.py +17 -17
- dstack/_internal/server/services/instances.py +113 -15
- dstack/_internal/server/services/jobs/__init__.py +18 -13
- dstack/_internal/server/services/jobs/configurators/base.py +26 -0
- dstack/_internal/server/services/logging.py +4 -2
- dstack/_internal/server/services/logs/aws.py +13 -1
- dstack/_internal/server/services/logs/gcp.py +16 -1
- dstack/_internal/server/services/offers.py +3 -3
- dstack/_internal/server/services/probes.py +6 -0
- dstack/_internal/server/services/projects.py +51 -19
- dstack/_internal/server/services/prometheus/client_metrics.py +3 -0
- dstack/_internal/server/services/prometheus/custom_metrics.py +2 -3
- dstack/_internal/server/services/runner/client.py +52 -20
- dstack/_internal/server/services/runner/ssh.py +4 -4
- dstack/_internal/server/services/runs.py +115 -39
- dstack/_internal/server/services/services/__init__.py +4 -1
- dstack/_internal/server/services/ssh.py +66 -0
- dstack/_internal/server/services/users.py +2 -3
- dstack/_internal/server/services/volumes.py +11 -11
- dstack/_internal/server/settings.py +16 -0
- dstack/_internal/server/statics/index.html +1 -1
- dstack/_internal/server/statics/{main-8f9ee218d3eb45989682.css → main-03e818b110e1d5705378.css} +1 -1
- dstack/_internal/server/statics/{main-39a767528976f8078166.js → main-cc067b7fd1a8f33f97da.js} +26 -15
- dstack/_internal/server/statics/{main-39a767528976f8078166.js.map → main-cc067b7fd1a8f33f97da.js.map} +1 -1
- dstack/_internal/server/testing/common.py +51 -0
- dstack/_internal/{core/backends/remote → server/utils}/provisioning.py +22 -17
- dstack/_internal/server/utils/sentry_utils.py +12 -0
- dstack/_internal/settings.py +3 -0
- dstack/_internal/utils/common.py +15 -0
- dstack/_internal/utils/cron.py +5 -0
- dstack/api/server/__init__.py +1 -1
- dstack/version.py +1 -1
- {dstack-0.19.20.dist-info → dstack-0.19.22.dist-info}/METADATA +13 -22
- {dstack-0.19.20.dist-info → dstack-0.19.22.dist-info}/RECORD +93 -75
- /dstack/_internal/{core/backends/remote → server/schemas/health}/__init__.py +0 -0
- {dstack-0.19.20.dist-info → dstack-0.19.22.dist-info}/WHEEL +0 -0
- {dstack-0.19.20.dist-info → dstack-0.19.22.dist-info}/entry_points.txt +0 -0
- {dstack-0.19.20.dist-info → dstack-0.19.22.dist-info}/licenses/LICENSE.md +0 -0
|
@@ -5,7 +5,7 @@ from typing import List, Optional, Tuple
|
|
|
5
5
|
|
|
6
6
|
from sqlalchemy import select
|
|
7
7
|
from sqlalchemy.ext.asyncio import AsyncSession
|
|
8
|
-
from sqlalchemy.orm import joinedload, lazyload, selectinload
|
|
8
|
+
from sqlalchemy.orm import joinedload, load_only, selectinload
|
|
9
9
|
|
|
10
10
|
from dstack._internal.core.backends.base.backend import Backend
|
|
11
11
|
from dstack._internal.core.backends.base.compute import ComputeWithVolumeSupport
|
|
@@ -43,6 +43,7 @@ from dstack._internal.server.models import (
|
|
|
43
43
|
JobModel,
|
|
44
44
|
ProjectModel,
|
|
45
45
|
RunModel,
|
|
46
|
+
UserModel,
|
|
46
47
|
VolumeAttachmentModel,
|
|
47
48
|
VolumeModel,
|
|
48
49
|
)
|
|
@@ -74,6 +75,7 @@ from dstack._internal.server.services.runs import (
|
|
|
74
75
|
from dstack._internal.server.services.volumes import (
|
|
75
76
|
volume_model_to_volume,
|
|
76
77
|
)
|
|
78
|
+
from dstack._internal.server.utils import sentry_utils
|
|
77
79
|
from dstack._internal.utils import common as common_utils
|
|
78
80
|
from dstack._internal.utils import env as env_utils
|
|
79
81
|
from dstack._internal.utils.logging import get_logger
|
|
@@ -108,6 +110,7 @@ def _get_effective_batch_size(batch_size: int) -> int:
|
|
|
108
110
|
return batch_size
|
|
109
111
|
|
|
110
112
|
|
|
113
|
+
@sentry_utils.instrument_background_task
|
|
111
114
|
async def _process_next_submitted_job():
|
|
112
115
|
lock, lockset = get_locker(get_db().dialect_name).get_lockset(JobModel.__tablename__)
|
|
113
116
|
async with get_session_ctx() as session:
|
|
@@ -119,6 +122,7 @@ async def _process_next_submitted_job():
|
|
|
119
122
|
JobModel.status == JobStatus.SUBMITTED,
|
|
120
123
|
JobModel.id.not_in(lockset),
|
|
121
124
|
)
|
|
125
|
+
.options(load_only(JobModel.id))
|
|
122
126
|
# Jobs are processed in FIFO sorted by priority globally,
|
|
123
127
|
# thus runs from different projects can "overtake" each other by using higher priorities.
|
|
124
128
|
# That's not a big problem as long as projects do not compete for the same compute resources.
|
|
@@ -151,9 +155,7 @@ async def _process_next_submitted_job():
|
|
|
151
155
|
|
|
152
156
|
|
|
153
157
|
async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
|
|
154
|
-
logger.debug("%s: provisioning has started", fmt(job_model))
|
|
155
158
|
# Refetch to load related attributes.
|
|
156
|
-
# joinedload produces LEFT OUTER JOIN that can't be used with FOR UPDATE.
|
|
157
159
|
res = await session.execute(
|
|
158
160
|
select(JobModel).where(JobModel.id == job_model.id).options(joinedload(JobModel.instance))
|
|
159
161
|
)
|
|
@@ -162,15 +164,16 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
|
|
|
162
164
|
select(RunModel)
|
|
163
165
|
.where(RunModel.id == job_model.run_id)
|
|
164
166
|
.options(joinedload(RunModel.project).joinedload(ProjectModel.backends))
|
|
165
|
-
.options(joinedload(RunModel.user))
|
|
167
|
+
.options(joinedload(RunModel.user).load_only(UserModel.name))
|
|
166
168
|
.options(joinedload(RunModel.fleet).joinedload(FleetModel.instances))
|
|
167
169
|
)
|
|
168
170
|
run_model = res.unique().scalar_one()
|
|
169
|
-
|
|
170
|
-
run_spec = RunSpec.__response__.parse_raw(run_model.run_spec)
|
|
171
|
-
profile = run_spec.merged_profile
|
|
171
|
+
logger.debug("%s: provisioning has started", fmt(job_model))
|
|
172
172
|
|
|
173
|
+
project = run_model.project
|
|
173
174
|
run = run_model_to_run(run_model)
|
|
175
|
+
run_spec = run.run_spec
|
|
176
|
+
profile = run_spec.merged_profile
|
|
174
177
|
job = find_job(run.jobs, job_model.replica_num, job_model.job_num)
|
|
175
178
|
|
|
176
179
|
master_job = find_job(run.jobs, job_model.replica_num, 0)
|
|
@@ -228,7 +231,6 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
|
|
|
228
231
|
InstanceModel.deleted == False,
|
|
229
232
|
InstanceModel.total_blocks > InstanceModel.busy_blocks,
|
|
230
233
|
)
|
|
231
|
-
.options(lazyload(InstanceModel.jobs))
|
|
232
234
|
.order_by(InstanceModel.id) # take locks in order
|
|
233
235
|
.with_for_update(key_share=True)
|
|
234
236
|
)
|
|
@@ -357,9 +359,9 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
|
|
|
357
359
|
await session.execute(
|
|
358
360
|
select(VolumeModel)
|
|
359
361
|
.where(VolumeModel.id.in_(volumes_ids))
|
|
360
|
-
.options(
|
|
362
|
+
.options(joinedload(VolumeModel.user).load_only(UserModel.name))
|
|
361
363
|
.order_by(VolumeModel.id) # take locks in order
|
|
362
|
-
.with_for_update(key_share=True)
|
|
364
|
+
.with_for_update(key_share=True, of=VolumeModel)
|
|
363
365
|
)
|
|
364
366
|
async with get_locker(get_db().dialect_name).lock_ctx(VolumeModel.__tablename__, volumes_ids):
|
|
365
367
|
if len(volume_models) > 0:
|
|
@@ -2,7 +2,7 @@ import asyncio
|
|
|
2
2
|
|
|
3
3
|
from sqlalchemy import or_, select
|
|
4
4
|
from sqlalchemy.ext.asyncio import AsyncSession
|
|
5
|
-
from sqlalchemy.orm import joinedload, lazyload
|
|
5
|
+
from sqlalchemy.orm import joinedload
|
|
6
6
|
|
|
7
7
|
from dstack._internal.core.models.runs import JobStatus
|
|
8
8
|
from dstack._internal.server.db import get_db, get_session_ctx
|
|
@@ -18,7 +18,11 @@ from dstack._internal.server.services.jobs import (
|
|
|
18
18
|
)
|
|
19
19
|
from dstack._internal.server.services.locking import get_locker
|
|
20
20
|
from dstack._internal.server.services.logging import fmt
|
|
21
|
-
from dstack._internal.utils
|
|
21
|
+
from dstack._internal.server.utils import sentry_utils
|
|
22
|
+
from dstack._internal.utils.common import (
|
|
23
|
+
get_current_datetime,
|
|
24
|
+
get_or_error,
|
|
25
|
+
)
|
|
22
26
|
from dstack._internal.utils.logging import get_logger
|
|
23
27
|
|
|
24
28
|
logger = get_logger(__name__)
|
|
@@ -31,6 +35,7 @@ async def process_terminating_jobs(batch_size: int = 1):
|
|
|
31
35
|
await asyncio.gather(*tasks)
|
|
32
36
|
|
|
33
37
|
|
|
38
|
+
@sentry_utils.instrument_background_task
|
|
34
39
|
async def _process_next_terminating_job():
|
|
35
40
|
job_lock, job_lockset = get_locker(get_db().dialect_name).get_lockset(JobModel.__tablename__)
|
|
36
41
|
instance_lock, instance_lockset = get_locker(get_db().dialect_name).get_lockset(
|
|
@@ -43,7 +48,10 @@ async def _process_next_terminating_job():
|
|
|
43
48
|
.where(
|
|
44
49
|
JobModel.id.not_in(job_lockset),
|
|
45
50
|
JobModel.status == JobStatus.TERMINATING,
|
|
46
|
-
or_(
|
|
51
|
+
or_(
|
|
52
|
+
JobModel.remove_at.is_(None),
|
|
53
|
+
JobModel.remove_at < get_current_datetime(),
|
|
54
|
+
),
|
|
47
55
|
)
|
|
48
56
|
.order_by(JobModel.last_processed_at.asc())
|
|
49
57
|
.limit(1)
|
|
@@ -59,7 +67,6 @@ async def _process_next_terminating_job():
|
|
|
59
67
|
InstanceModel.id == job_model.used_instance_id,
|
|
60
68
|
InstanceModel.id.not_in(instance_lockset),
|
|
61
69
|
)
|
|
62
|
-
.options(lazyload(InstanceModel.jobs))
|
|
63
70
|
.with_for_update(skip_locked=True, key_share=True)
|
|
64
71
|
)
|
|
65
72
|
instance_model = res.scalar()
|
|
@@ -88,6 +95,7 @@ async def _process_job(session: AsyncSession, job_model: JobModel):
|
|
|
88
95
|
.options(
|
|
89
96
|
joinedload(InstanceModel.project).joinedload(ProjectModel.backends),
|
|
90
97
|
joinedload(InstanceModel.volume_attachments).joinedload(VolumeAttachmentModel.volume),
|
|
98
|
+
joinedload(InstanceModel.jobs).load_only(JobModel.id),
|
|
91
99
|
)
|
|
92
100
|
)
|
|
93
101
|
instance_model = res.unique().scalar()
|
|
@@ -7,6 +7,7 @@ from dstack._internal.core.errors import BackendError, BackendNotAvailable
|
|
|
7
7
|
from dstack._internal.core.models.volumes import VolumeStatus
|
|
8
8
|
from dstack._internal.server.db import get_db, get_session_ctx
|
|
9
9
|
from dstack._internal.server.models import (
|
|
10
|
+
FleetModel,
|
|
10
11
|
InstanceModel,
|
|
11
12
|
ProjectModel,
|
|
12
13
|
VolumeAttachmentModel,
|
|
@@ -15,12 +16,14 @@ from dstack._internal.server.models import (
|
|
|
15
16
|
from dstack._internal.server.services import backends as backends_services
|
|
16
17
|
from dstack._internal.server.services import volumes as volumes_services
|
|
17
18
|
from dstack._internal.server.services.locking import get_locker
|
|
19
|
+
from dstack._internal.server.utils import sentry_utils
|
|
18
20
|
from dstack._internal.utils.common import get_current_datetime, run_async
|
|
19
21
|
from dstack._internal.utils.logging import get_logger
|
|
20
22
|
|
|
21
23
|
logger = get_logger(__name__)
|
|
22
24
|
|
|
23
25
|
|
|
26
|
+
@sentry_utils.instrument_background_task
|
|
24
27
|
async def process_submitted_volumes():
|
|
25
28
|
lock, lockset = get_locker(get_db().dialect_name).get_lockset(VolumeModel.__tablename__)
|
|
26
29
|
async with get_session_ctx() as session:
|
|
@@ -49,7 +52,6 @@ async def process_submitted_volumes():
|
|
|
49
52
|
async def _process_submitted_volume(session: AsyncSession, volume_model: VolumeModel):
|
|
50
53
|
logger.info("Started submitted volume %s processing", volume_model.name)
|
|
51
54
|
# Refetch to load related attributes.
|
|
52
|
-
# joinedload produces LEFT OUTER JOIN that can't be used with FOR UPDATE.
|
|
53
55
|
res = await session.execute(
|
|
54
56
|
select(VolumeModel)
|
|
55
57
|
.where(VolumeModel.id == volume_model.id)
|
|
@@ -59,6 +61,7 @@ async def _process_submitted_volume(session: AsyncSession, volume_model: VolumeM
|
|
|
59
61
|
joinedload(VolumeModel.attachments)
|
|
60
62
|
.joinedload(VolumeAttachmentModel.instance)
|
|
61
63
|
.joinedload(InstanceModel.fleet)
|
|
64
|
+
.load_only(FleetModel.name)
|
|
62
65
|
)
|
|
63
66
|
.execution_options(populate_existing=True)
|
|
64
67
|
)
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
"""Add probes
|
|
2
|
+
|
|
3
|
+
Revision ID: 25479f540245
|
|
4
|
+
Revises: 50dd7ea98639
|
|
5
|
+
Create Date: 2025-08-03 19:51:07.722217
|
|
6
|
+
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import sqlalchemy as sa
|
|
10
|
+
import sqlalchemy_utils
|
|
11
|
+
from alembic import op
|
|
12
|
+
|
|
13
|
+
import dstack._internal.server.models
|
|
14
|
+
|
|
15
|
+
# revision identifiers, used by Alembic.
|
|
16
|
+
revision = "25479f540245"
|
|
17
|
+
down_revision = "50dd7ea98639"
|
|
18
|
+
branch_labels = None
|
|
19
|
+
depends_on = None
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def upgrade() -> None:
|
|
23
|
+
# ### commands auto generated by Alembic - please adjust! ###
|
|
24
|
+
op.create_table(
|
|
25
|
+
"probes",
|
|
26
|
+
sa.Column("id", sqlalchemy_utils.types.uuid.UUIDType(binary=False), nullable=False),
|
|
27
|
+
sa.Column("name", sa.String(length=100), nullable=False),
|
|
28
|
+
sa.Column("job_id", sqlalchemy_utils.types.uuid.UUIDType(binary=False), nullable=False),
|
|
29
|
+
sa.Column("probe_num", sa.Integer(), nullable=False),
|
|
30
|
+
sa.Column("due", dstack._internal.server.models.NaiveDateTime(), nullable=False),
|
|
31
|
+
sa.Column("success_streak", sa.BigInteger(), nullable=False),
|
|
32
|
+
sa.Column("active", sa.Boolean(), nullable=False),
|
|
33
|
+
sa.ForeignKeyConstraint(["job_id"], ["jobs.id"], name=op.f("fk_probes_job_id_jobs")),
|
|
34
|
+
sa.PrimaryKeyConstraint("id", "job_id", name=op.f("pk_probes")),
|
|
35
|
+
sa.UniqueConstraint("job_id", "probe_num", name="uq_probes_job_id_probe_num"),
|
|
36
|
+
)
|
|
37
|
+
# ### end Alembic commands ###
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def downgrade() -> None:
|
|
41
|
+
# ### commands auto generated by Alembic - please adjust! ###
|
|
42
|
+
op.drop_table("probes")
|
|
43
|
+
# ### end Alembic commands ###
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
"""Index status columns
|
|
2
|
+
|
|
3
|
+
Revision ID: 50dd7ea98639
|
|
4
|
+
Revises: ec02a26a256c
|
|
5
|
+
Create Date: 2025-07-25 10:36:25.127923
|
|
6
|
+
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from alembic import op
|
|
10
|
+
|
|
11
|
+
# revision identifiers, used by Alembic.
|
|
12
|
+
revision = "50dd7ea98639"
|
|
13
|
+
down_revision = "ec02a26a256c"
|
|
14
|
+
branch_labels = None
|
|
15
|
+
depends_on = None
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def upgrade() -> None:
|
|
19
|
+
# ### commands auto generated by Alembic - please adjust! ###
|
|
20
|
+
with op.batch_alter_table("runs", schema=None) as batch_op:
|
|
21
|
+
batch_op.create_index(batch_op.f("ix_runs_status"), ["status"], unique=False)
|
|
22
|
+
|
|
23
|
+
with op.batch_alter_table("jobs", schema=None) as batch_op:
|
|
24
|
+
batch_op.create_index(batch_op.f("ix_jobs_status"), ["status"], unique=False)
|
|
25
|
+
|
|
26
|
+
with op.batch_alter_table("fleets", schema=None) as batch_op:
|
|
27
|
+
batch_op.create_index(batch_op.f("ix_fleets_status"), ["status"], unique=False)
|
|
28
|
+
|
|
29
|
+
with op.batch_alter_table("instances", schema=None) as batch_op:
|
|
30
|
+
batch_op.create_index(batch_op.f("ix_instances_status"), ["status"], unique=False)
|
|
31
|
+
|
|
32
|
+
with op.batch_alter_table("volumes", schema=None) as batch_op:
|
|
33
|
+
batch_op.create_index(batch_op.f("ix_volumes_status"), ["status"], unique=False)
|
|
34
|
+
|
|
35
|
+
# ### end Alembic commands ###
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def downgrade() -> None:
|
|
39
|
+
# ### commands auto generated by Alembic - please adjust! ###
|
|
40
|
+
with op.batch_alter_table("runs", schema=None) as batch_op:
|
|
41
|
+
batch_op.drop_index(batch_op.f("ix_runs_status"))
|
|
42
|
+
|
|
43
|
+
with op.batch_alter_table("jobs", schema=None) as batch_op:
|
|
44
|
+
batch_op.drop_index(batch_op.f("ix_jobs_status"))
|
|
45
|
+
|
|
46
|
+
with op.batch_alter_table("fleets", schema=None) as batch_op:
|
|
47
|
+
batch_op.drop_index(batch_op.f("ix_fleets_status"))
|
|
48
|
+
|
|
49
|
+
with op.batch_alter_table("instances", schema=None) as batch_op:
|
|
50
|
+
batch_op.drop_index(batch_op.f("ix_instances_status"))
|
|
51
|
+
|
|
52
|
+
with op.batch_alter_table("volumes", schema=None) as batch_op:
|
|
53
|
+
batch_op.drop_index(batch_op.f("ix_volumes_status"))
|
|
54
|
+
|
|
55
|
+
# ### end Alembic commands ###
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
"""Add instance health
|
|
2
|
+
|
|
3
|
+
Revision ID: 728b1488b1b4
|
|
4
|
+
Revises: 25479f540245
|
|
5
|
+
Create Date: 2025-08-01 14:56:20.466990
|
|
6
|
+
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import sqlalchemy as sa
|
|
10
|
+
import sqlalchemy_utils
|
|
11
|
+
from alembic import op
|
|
12
|
+
|
|
13
|
+
import dstack._internal.server.models
|
|
14
|
+
|
|
15
|
+
# revision identifiers, used by Alembic.
|
|
16
|
+
revision = "728b1488b1b4"
|
|
17
|
+
down_revision = "25479f540245"
|
|
18
|
+
branch_labels = None
|
|
19
|
+
depends_on = None
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def upgrade() -> None:
|
|
23
|
+
op.create_table(
|
|
24
|
+
"instance_health_checks",
|
|
25
|
+
sa.Column("id", sqlalchemy_utils.types.uuid.UUIDType(binary=False), nullable=False),
|
|
26
|
+
sa.Column(
|
|
27
|
+
"instance_id", sqlalchemy_utils.types.uuid.UUIDType(binary=False), nullable=False
|
|
28
|
+
),
|
|
29
|
+
sa.Column("collected_at", dstack._internal.server.models.NaiveDateTime(), nullable=False),
|
|
30
|
+
sa.Column("status", sa.VARCHAR(length=100), nullable=False),
|
|
31
|
+
sa.Column("response", sa.Text(), nullable=False),
|
|
32
|
+
sa.ForeignKeyConstraint(
|
|
33
|
+
["instance_id"],
|
|
34
|
+
["instances.id"],
|
|
35
|
+
name=op.f("fk_instance_health_checks_instance_id_instances"),
|
|
36
|
+
),
|
|
37
|
+
sa.PrimaryKeyConstraint("id", name=op.f("pk_instance_health_checks")),
|
|
38
|
+
)
|
|
39
|
+
with op.batch_alter_table("instances", schema=None) as batch_op:
|
|
40
|
+
batch_op.add_column(sa.Column("health", sa.VARCHAR(length=100), nullable=True))
|
|
41
|
+
op.execute("UPDATE instances SET health = 'HEALTHY'")
|
|
42
|
+
with op.batch_alter_table("instances", schema=None) as batch_op:
|
|
43
|
+
batch_op.alter_column("health", existing_type=sa.VARCHAR(length=100), nullable=False)
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def downgrade() -> None:
|
|
47
|
+
with op.batch_alter_table("instances", schema=None) as batch_op:
|
|
48
|
+
batch_op.drop_column("health")
|
|
49
|
+
|
|
50
|
+
op.drop_table("instance_health_checks")
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
"""Add RunModel.next_triggered_at
|
|
2
|
+
|
|
3
|
+
Revision ID: ec02a26a256c
|
|
4
|
+
Revises: d5863798bf41
|
|
5
|
+
Create Date: 2025-07-17 15:47:00.443217
|
|
6
|
+
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import sqlalchemy as sa
|
|
10
|
+
from alembic import op
|
|
11
|
+
|
|
12
|
+
import dstack._internal.server.models
|
|
13
|
+
|
|
14
|
+
# revision identifiers, used by Alembic.
|
|
15
|
+
revision = "ec02a26a256c"
|
|
16
|
+
down_revision = "d5863798bf41"
|
|
17
|
+
branch_labels = None
|
|
18
|
+
depends_on = None
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def upgrade() -> None:
|
|
22
|
+
# ### commands auto generated by Alembic - please adjust! ###
|
|
23
|
+
with op.batch_alter_table("runs", schema=None) as batch_op:
|
|
24
|
+
batch_op.add_column(
|
|
25
|
+
sa.Column(
|
|
26
|
+
"next_triggered_at", dstack._internal.server.models.NaiveDateTime(), nullable=True
|
|
27
|
+
)
|
|
28
|
+
)
|
|
29
|
+
|
|
30
|
+
# ### end Alembic commands ###
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def downgrade() -> None:
|
|
34
|
+
# ### commands auto generated by Alembic - please adjust! ###
|
|
35
|
+
with op.batch_alter_table("runs", schema=None) as batch_op:
|
|
36
|
+
batch_op.drop_column("next_triggered_at")
|
|
37
|
+
|
|
38
|
+
# ### end Alembic commands ###
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import enum
|
|
2
2
|
import uuid
|
|
3
|
-
from datetime import datetime
|
|
3
|
+
from datetime import datetime, timezone
|
|
4
4
|
from typing import Callable, List, Optional, Union
|
|
5
5
|
|
|
6
6
|
from sqlalchemy import (
|
|
@@ -28,6 +28,7 @@ from dstack._internal.core.models.backends.base import BackendType
|
|
|
28
28
|
from dstack._internal.core.models.common import CoreModel
|
|
29
29
|
from dstack._internal.core.models.fleets import FleetStatus
|
|
30
30
|
from dstack._internal.core.models.gateways import GatewayStatus
|
|
31
|
+
from dstack._internal.core.models.health import HealthStatus
|
|
31
32
|
from dstack._internal.core.models.instances import InstanceStatus
|
|
32
33
|
from dstack._internal.core.models.profiles import (
|
|
33
34
|
DEFAULT_FLEET_TERMINATION_IDLE_TIME,
|
|
@@ -51,9 +52,10 @@ logger = get_logger(__name__)
|
|
|
51
52
|
|
|
52
53
|
class NaiveDateTime(TypeDecorator):
|
|
53
54
|
"""
|
|
54
|
-
A custom type decorator that ensures datetime objects are offset-naive when stored in the database
|
|
55
|
-
|
|
56
|
-
|
|
55
|
+
A custom type decorator that ensures datetime objects are offset-naive when stored in the database
|
|
56
|
+
and offset-aware with UTC timezone when loaded from the database.
|
|
57
|
+
This is because we use datetimes in UTC everywhere, and
|
|
58
|
+
some databases (e.g. Postgres) throw an error if the timezone is set.
|
|
57
59
|
"""
|
|
58
60
|
|
|
59
61
|
impl = DateTime
|
|
@@ -65,7 +67,9 @@ class NaiveDateTime(TypeDecorator):
|
|
|
65
67
|
return value
|
|
66
68
|
|
|
67
69
|
def process_result_value(self, value, dialect):
|
|
68
|
-
|
|
70
|
+
if value is None:
|
|
71
|
+
return None
|
|
72
|
+
return value.replace(tzinfo=timezone.utc)
|
|
69
73
|
|
|
70
74
|
|
|
71
75
|
class DecryptedString(CoreModel):
|
|
@@ -355,7 +359,8 @@ class RunModel(BaseModel):
|
|
|
355
359
|
run_name: Mapped[str] = mapped_column(String(100))
|
|
356
360
|
submitted_at: Mapped[datetime] = mapped_column(NaiveDateTime)
|
|
357
361
|
last_processed_at: Mapped[datetime] = mapped_column(NaiveDateTime)
|
|
358
|
-
|
|
362
|
+
next_triggered_at: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime)
|
|
363
|
+
status: Mapped[RunStatus] = mapped_column(Enum(RunStatus), index=True)
|
|
359
364
|
termination_reason: Mapped[Optional[RunTerminationReason]] = mapped_column(
|
|
360
365
|
Enum(RunTerminationReason)
|
|
361
366
|
)
|
|
@@ -396,7 +401,7 @@ class JobModel(BaseModel):
|
|
|
396
401
|
submission_num: Mapped[int] = mapped_column(Integer)
|
|
397
402
|
submitted_at: Mapped[datetime] = mapped_column(NaiveDateTime)
|
|
398
403
|
last_processed_at: Mapped[datetime] = mapped_column(NaiveDateTime)
|
|
399
|
-
status: Mapped[JobStatus] = mapped_column(Enum(JobStatus))
|
|
404
|
+
status: Mapped[JobStatus] = mapped_column(Enum(JobStatus), index=True)
|
|
400
405
|
termination_reason: Mapped[Optional[JobTerminationReason]] = mapped_column(
|
|
401
406
|
Enum(JobTerminationReason)
|
|
402
407
|
)
|
|
@@ -423,6 +428,9 @@ class JobModel(BaseModel):
|
|
|
423
428
|
replica_num: Mapped[int] = mapped_column(Integer)
|
|
424
429
|
deployment_num: Mapped[int] = mapped_column(Integer)
|
|
425
430
|
job_runtime_data: Mapped[Optional[str]] = mapped_column(Text)
|
|
431
|
+
probes: Mapped[list["ProbeModel"]] = relationship(
|
|
432
|
+
back_populates="job", order_by="ProbeModel.probe_num"
|
|
433
|
+
)
|
|
426
434
|
|
|
427
435
|
|
|
428
436
|
class GatewayModel(BaseModel):
|
|
@@ -524,7 +532,7 @@ class FleetModel(BaseModel):
|
|
|
524
532
|
deleted: Mapped[bool] = mapped_column(Boolean, default=False)
|
|
525
533
|
deleted_at: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime)
|
|
526
534
|
|
|
527
|
-
status: Mapped[FleetStatus] = mapped_column(Enum(FleetStatus))
|
|
535
|
+
status: Mapped[FleetStatus] = mapped_column(Enum(FleetStatus), index=True)
|
|
528
536
|
status_message: Mapped[Optional[str]] = mapped_column(Text)
|
|
529
537
|
|
|
530
538
|
spec: Mapped[str] = mapped_column(Text)
|
|
@@ -543,7 +551,6 @@ class InstanceModel(BaseModel):
|
|
|
543
551
|
|
|
544
552
|
instance_num: Mapped[int] = mapped_column(Integer, default=0)
|
|
545
553
|
|
|
546
|
-
# instance
|
|
547
554
|
created_at: Mapped[datetime] = mapped_column(NaiveDateTime, default=get_current_datetime)
|
|
548
555
|
last_processed_at: Mapped[datetime] = mapped_column(
|
|
549
556
|
NaiveDateTime, default=get_current_datetime
|
|
@@ -564,7 +571,7 @@ class InstanceModel(BaseModel):
|
|
|
564
571
|
fleet_id: Mapped[Optional[uuid.UUID]] = mapped_column(ForeignKey("fleets.id"))
|
|
565
572
|
fleet: Mapped[Optional["FleetModel"]] = relationship(back_populates="instances")
|
|
566
573
|
|
|
567
|
-
status: Mapped[InstanceStatus] = mapped_column(Enum(InstanceStatus))
|
|
574
|
+
status: Mapped[InstanceStatus] = mapped_column(Enum(InstanceStatus), index=True)
|
|
568
575
|
unreachable: Mapped[bool] = mapped_column(Boolean)
|
|
569
576
|
|
|
570
577
|
# VM
|
|
@@ -580,7 +587,6 @@ class InstanceModel(BaseModel):
|
|
|
580
587
|
requirements: Mapped[Optional[str]] = mapped_column(Text)
|
|
581
588
|
instance_configuration: Mapped[Optional[str]] = mapped_column(Text)
|
|
582
589
|
|
|
583
|
-
# temination policy
|
|
584
590
|
termination_policy: Mapped[Optional[TerminationPolicy]] = mapped_column(String(100))
|
|
585
591
|
# TODO: Suggestion: do not assign DEFAULT_FLEET_TERMINATION_IDLE_TIME as the default here
|
|
586
592
|
# (make Optional instead; also instead of -1)
|
|
@@ -594,15 +600,17 @@ class InstanceModel(BaseModel):
|
|
|
594
600
|
# instance termination handling
|
|
595
601
|
termination_deadline: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime)
|
|
596
602
|
termination_reason: Mapped[Optional[str]] = mapped_column(String(4000))
|
|
603
|
+
# Deprecated since 0.19.22, not used
|
|
597
604
|
health_status: Mapped[Optional[str]] = mapped_column(String(4000))
|
|
605
|
+
health: Mapped[HealthStatus] = mapped_column(
|
|
606
|
+
EnumAsString(HealthStatus, 100), default=HealthStatus.HEALTHY
|
|
607
|
+
)
|
|
598
608
|
first_termination_retry_at: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime)
|
|
599
609
|
last_termination_retry_at: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime)
|
|
600
610
|
|
|
601
|
-
# backend
|
|
602
611
|
backend: Mapped[Optional[BackendType]] = mapped_column(EnumAsString(BackendType, 100))
|
|
603
612
|
backend_data: Mapped[Optional[str]] = mapped_column(Text)
|
|
604
613
|
|
|
605
|
-
# offer
|
|
606
614
|
offer: Mapped[Optional[str]] = mapped_column(Text)
|
|
607
615
|
region: Mapped[Optional[str]] = mapped_column(String(2000))
|
|
608
616
|
price: Mapped[Optional[float]] = mapped_column(Float)
|
|
@@ -615,18 +623,33 @@ class InstanceModel(BaseModel):
|
|
|
615
623
|
total_blocks: Mapped[Optional[int]] = mapped_column(Integer)
|
|
616
624
|
busy_blocks: Mapped[int] = mapped_column(Integer, default=0)
|
|
617
625
|
|
|
618
|
-
jobs: Mapped[list["JobModel"]] = relationship(back_populates="instance", lazy="joined")
|
|
626
|
+
jobs: Mapped[list["JobModel"]] = relationship(back_populates="instance")
|
|
619
627
|
last_job_processed_at: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime)
|
|
620
628
|
|
|
621
629
|
volume_attachments: Mapped[List["VolumeAttachmentModel"]] = relationship(
|
|
622
630
|
back_populates="instance",
|
|
623
631
|
# Add delete-orphan option so that removing entries from volume_attachments
|
|
624
632
|
# automatically marks them for deletion.
|
|
625
|
-
#
|
|
633
|
+
# SQLAlchemy requires delete when using delete-orphan.
|
|
626
634
|
cascade="save-update, merge, delete-orphan, delete",
|
|
627
635
|
)
|
|
628
636
|
|
|
629
637
|
|
|
638
|
+
class InstanceHealthCheckModel(BaseModel):
|
|
639
|
+
__tablename__ = "instance_health_checks"
|
|
640
|
+
|
|
641
|
+
id: Mapped[uuid.UUID] = mapped_column(
|
|
642
|
+
UUIDType(binary=False), primary_key=True, default=uuid.uuid4
|
|
643
|
+
)
|
|
644
|
+
|
|
645
|
+
instance_id: Mapped[uuid.UUID] = mapped_column(ForeignKey("instances.id"))
|
|
646
|
+
instance: Mapped["InstanceModel"] = relationship()
|
|
647
|
+
|
|
648
|
+
collected_at: Mapped[datetime] = mapped_column(NaiveDateTime)
|
|
649
|
+
status: Mapped[HealthStatus] = mapped_column(EnumAsString(HealthStatus, 100))
|
|
650
|
+
response: Mapped[str] = mapped_column(Text)
|
|
651
|
+
|
|
652
|
+
|
|
630
653
|
class VolumeModel(BaseModel):
|
|
631
654
|
__tablename__ = "volumes"
|
|
632
655
|
|
|
@@ -649,7 +672,7 @@ class VolumeModel(BaseModel):
|
|
|
649
672
|
deleted: Mapped[bool] = mapped_column(Boolean, default=False)
|
|
650
673
|
deleted_at: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime)
|
|
651
674
|
|
|
652
|
-
status: Mapped[VolumeStatus] = mapped_column(Enum(VolumeStatus))
|
|
675
|
+
status: Mapped[VolumeStatus] = mapped_column(Enum(VolumeStatus), index=True)
|
|
653
676
|
status_message: Mapped[Optional[str]] = mapped_column(Text)
|
|
654
677
|
|
|
655
678
|
configuration: Mapped[str] = mapped_column(Text)
|
|
@@ -729,6 +752,24 @@ class JobPrometheusMetrics(BaseModel):
|
|
|
729
752
|
text: Mapped[str] = mapped_column(Text)
|
|
730
753
|
|
|
731
754
|
|
|
755
|
+
class ProbeModel(BaseModel):
|
|
756
|
+
__tablename__ = "probes"
|
|
757
|
+
__table_args__ = (UniqueConstraint("job_id", "probe_num", name="uq_probes_job_id_probe_num"),)
|
|
758
|
+
|
|
759
|
+
id: Mapped[uuid.UUID] = mapped_column(
|
|
760
|
+
UUIDType(binary=False), primary_key=True, default=uuid.uuid4
|
|
761
|
+
)
|
|
762
|
+
name: Mapped[str] = mapped_column(String(100))
|
|
763
|
+
|
|
764
|
+
job_id: Mapped[uuid.UUID] = mapped_column(ForeignKey("jobs.id"), primary_key=True)
|
|
765
|
+
job: Mapped["JobModel"] = relationship(back_populates="probes")
|
|
766
|
+
|
|
767
|
+
probe_num: Mapped[int] = mapped_column(Integer) # index in JobSpec.probes
|
|
768
|
+
due: Mapped[datetime] = mapped_column(NaiveDateTime)
|
|
769
|
+
success_streak: Mapped[int] = mapped_column(BigInteger)
|
|
770
|
+
active: Mapped[bool] = mapped_column(Boolean)
|
|
771
|
+
|
|
772
|
+
|
|
732
773
|
class SecretModel(BaseModel):
|
|
733
774
|
__tablename__ = "secrets"
|
|
734
775
|
__table_args__ = (UniqueConstraint("project_id", "name", name="uq_secrets_project_id_name"),)
|
|
@@ -3,12 +3,16 @@ from typing import List
|
|
|
3
3
|
from fastapi import APIRouter, Depends
|
|
4
4
|
from sqlalchemy.ext.asyncio import AsyncSession
|
|
5
5
|
|
|
6
|
-
import dstack._internal.server.services.instances as instances
|
|
6
|
+
import dstack._internal.server.services.instances as instances_services
|
|
7
7
|
from dstack._internal.core.models.instances import Instance
|
|
8
8
|
from dstack._internal.server.db import get_session
|
|
9
|
-
from dstack._internal.server.models import UserModel
|
|
10
|
-
from dstack._internal.server.schemas.instances import ListInstancesRequest
|
|
11
|
-
from dstack._internal.server.security.permissions import Authenticated
|
|
9
|
+
from dstack._internal.server.models import ProjectModel, UserModel
|
|
10
|
+
from dstack._internal.server.schemas.instances import (
|
|
11
|
+
GetInstanceHealthChecksRequest,
|
|
12
|
+
GetInstanceHealthChecksResponse,
|
|
13
|
+
ListInstancesRequest,
|
|
14
|
+
)
|
|
15
|
+
from dstack._internal.server.security.permissions import Authenticated, ProjectMember
|
|
12
16
|
from dstack._internal.server.utils.routers import (
|
|
13
17
|
CustomORJSONResponse,
|
|
14
18
|
get_base_api_additional_responses,
|
|
@@ -19,6 +23,11 @@ root_router = APIRouter(
|
|
|
19
23
|
tags=["instances"],
|
|
20
24
|
responses=get_base_api_additional_responses(),
|
|
21
25
|
)
|
|
26
|
+
project_router = APIRouter(
|
|
27
|
+
prefix="/api/project/{project_name}/instances",
|
|
28
|
+
tags=["instances"],
|
|
29
|
+
responses=get_base_api_additional_responses(),
|
|
30
|
+
)
|
|
22
31
|
|
|
23
32
|
|
|
24
33
|
@root_router.post("/list", response_model=List[Instance])
|
|
@@ -35,7 +44,7 @@ async def list_instances(
|
|
|
35
44
|
the last instance from the previous page as `prev_created_at` and `prev_id`.
|
|
36
45
|
"""
|
|
37
46
|
return CustomORJSONResponse(
|
|
38
|
-
await instances.list_user_instances(
|
|
47
|
+
await instances_services.list_user_instances(
|
|
39
48
|
session=session,
|
|
40
49
|
user=user,
|
|
41
50
|
project_names=body.project_names,
|
|
@@ -47,3 +56,22 @@ async def list_instances(
|
|
|
47
56
|
ascending=body.ascending,
|
|
48
57
|
)
|
|
49
58
|
)
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
@project_router.post("/get_instance_health_checks", response_model=GetInstanceHealthChecksResponse)
|
|
62
|
+
async def get_instance_health_checks(
|
|
63
|
+
body: GetInstanceHealthChecksRequest,
|
|
64
|
+
session: AsyncSession = Depends(get_session),
|
|
65
|
+
user_project: tuple[UserModel, ProjectModel] = Depends(ProjectMember()),
|
|
66
|
+
):
|
|
67
|
+
_, project = user_project
|
|
68
|
+
health_checks = await instances_services.get_instance_health_checks(
|
|
69
|
+
session=session,
|
|
70
|
+
project=project,
|
|
71
|
+
fleet_name=body.fleet_name,
|
|
72
|
+
instance_num=body.instance_num,
|
|
73
|
+
after=body.after,
|
|
74
|
+
before=body.before,
|
|
75
|
+
limit=body.limit,
|
|
76
|
+
)
|
|
77
|
+
return CustomORJSONResponse(GetInstanceHealthChecksResponse(health_checks=health_checks))
|