dstack 0.19.20__py3-none-any.whl → 0.19.22__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of dstack might be problematic. Click here for more details.

Files changed (93)
  1. dstack/_internal/cli/commands/apply.py +8 -3
  2. dstack/_internal/cli/services/configurators/__init__.py +8 -0
  3. dstack/_internal/cli/services/configurators/fleet.py +1 -1
  4. dstack/_internal/cli/services/configurators/gateway.py +1 -1
  5. dstack/_internal/cli/services/configurators/run.py +11 -1
  6. dstack/_internal/cli/services/configurators/volume.py +1 -1
  7. dstack/_internal/cli/utils/common.py +48 -5
  8. dstack/_internal/cli/utils/fleet.py +5 -5
  9. dstack/_internal/cli/utils/run.py +32 -0
  10. dstack/_internal/core/backends/__init__.py +0 -65
  11. dstack/_internal/core/backends/configurators.py +9 -0
  12. dstack/_internal/core/backends/features.py +64 -0
  13. dstack/_internal/core/backends/hotaisle/__init__.py +1 -0
  14. dstack/_internal/core/backends/hotaisle/api_client.py +109 -0
  15. dstack/_internal/core/backends/hotaisle/backend.py +16 -0
  16. dstack/_internal/core/backends/hotaisle/compute.py +225 -0
  17. dstack/_internal/core/backends/hotaisle/configurator.py +60 -0
  18. dstack/_internal/core/backends/hotaisle/models.py +45 -0
  19. dstack/_internal/core/backends/lambdalabs/compute.py +2 -1
  20. dstack/_internal/core/backends/models.py +8 -0
  21. dstack/_internal/core/compatibility/fleets.py +2 -0
  22. dstack/_internal/core/compatibility/runs.py +12 -0
  23. dstack/_internal/core/models/backends/base.py +2 -0
  24. dstack/_internal/core/models/configurations.py +139 -1
  25. dstack/_internal/core/models/health.py +28 -0
  26. dstack/_internal/core/models/instances.py +2 -0
  27. dstack/_internal/core/models/logs.py +2 -1
  28. dstack/_internal/core/models/profiles.py +37 -0
  29. dstack/_internal/core/models/runs.py +21 -1
  30. dstack/_internal/core/services/ssh/tunnel.py +7 -0
  31. dstack/_internal/server/app.py +26 -10
  32. dstack/_internal/server/background/__init__.py +9 -6
  33. dstack/_internal/server/background/tasks/process_fleets.py +52 -38
  34. dstack/_internal/server/background/tasks/process_gateways.py +2 -2
  35. dstack/_internal/server/background/tasks/process_idle_volumes.py +5 -4
  36. dstack/_internal/server/background/tasks/process_instances.py +168 -103
  37. dstack/_internal/server/background/tasks/process_metrics.py +9 -2
  38. dstack/_internal/server/background/tasks/process_placement_groups.py +2 -0
  39. dstack/_internal/server/background/tasks/process_probes.py +164 -0
  40. dstack/_internal/server/background/tasks/process_prometheus_metrics.py +14 -2
  41. dstack/_internal/server/background/tasks/process_running_jobs.py +142 -124
  42. dstack/_internal/server/background/tasks/process_runs.py +84 -34
  43. dstack/_internal/server/background/tasks/process_submitted_jobs.py +12 -10
  44. dstack/_internal/server/background/tasks/process_terminating_jobs.py +12 -4
  45. dstack/_internal/server/background/tasks/process_volumes.py +4 -1
  46. dstack/_internal/server/migrations/versions/25479f540245_add_probes.py +43 -0
  47. dstack/_internal/server/migrations/versions/50dd7ea98639_index_status_columns.py +55 -0
  48. dstack/_internal/server/migrations/versions/728b1488b1b4_add_instance_health.py +50 -0
  49. dstack/_internal/server/migrations/versions/ec02a26a256c_add_runmodel_next_triggered_at.py +38 -0
  50. dstack/_internal/server/models.py +57 -16
  51. dstack/_internal/server/routers/instances.py +33 -5
  52. dstack/_internal/server/schemas/health/dcgm.py +56 -0
  53. dstack/_internal/server/schemas/instances.py +32 -0
  54. dstack/_internal/server/schemas/runner.py +5 -0
  55. dstack/_internal/server/services/fleets.py +19 -10
  56. dstack/_internal/server/services/gateways/__init__.py +17 -17
  57. dstack/_internal/server/services/instances.py +113 -15
  58. dstack/_internal/server/services/jobs/__init__.py +18 -13
  59. dstack/_internal/server/services/jobs/configurators/base.py +26 -0
  60. dstack/_internal/server/services/logging.py +4 -2
  61. dstack/_internal/server/services/logs/aws.py +13 -1
  62. dstack/_internal/server/services/logs/gcp.py +16 -1
  63. dstack/_internal/server/services/offers.py +3 -3
  64. dstack/_internal/server/services/probes.py +6 -0
  65. dstack/_internal/server/services/projects.py +51 -19
  66. dstack/_internal/server/services/prometheus/client_metrics.py +3 -0
  67. dstack/_internal/server/services/prometheus/custom_metrics.py +2 -3
  68. dstack/_internal/server/services/runner/client.py +52 -20
  69. dstack/_internal/server/services/runner/ssh.py +4 -4
  70. dstack/_internal/server/services/runs.py +115 -39
  71. dstack/_internal/server/services/services/__init__.py +4 -1
  72. dstack/_internal/server/services/ssh.py +66 -0
  73. dstack/_internal/server/services/users.py +2 -3
  74. dstack/_internal/server/services/volumes.py +11 -11
  75. dstack/_internal/server/settings.py +16 -0
  76. dstack/_internal/server/statics/index.html +1 -1
  77. dstack/_internal/server/statics/{main-8f9ee218d3eb45989682.css → main-03e818b110e1d5705378.css} +1 -1
  78. dstack/_internal/server/statics/{main-39a767528976f8078166.js → main-cc067b7fd1a8f33f97da.js} +26 -15
  79. dstack/_internal/server/statics/{main-39a767528976f8078166.js.map → main-cc067b7fd1a8f33f97da.js.map} +1 -1
  80. dstack/_internal/server/testing/common.py +51 -0
  81. dstack/_internal/{core/backends/remote → server/utils}/provisioning.py +22 -17
  82. dstack/_internal/server/utils/sentry_utils.py +12 -0
  83. dstack/_internal/settings.py +3 -0
  84. dstack/_internal/utils/common.py +15 -0
  85. dstack/_internal/utils/cron.py +5 -0
  86. dstack/api/server/__init__.py +1 -1
  87. dstack/version.py +1 -1
  88. {dstack-0.19.20.dist-info → dstack-0.19.22.dist-info}/METADATA +13 -22
  89. {dstack-0.19.20.dist-info → dstack-0.19.22.dist-info}/RECORD +93 -75
  90. /dstack/_internal/{core/backends/remote → server/schemas/health}/__init__.py +0 -0
  91. {dstack-0.19.20.dist-info → dstack-0.19.22.dist-info}/WHEEL +0 -0
  92. {dstack-0.19.20.dist-info → dstack-0.19.22.dist-info}/entry_points.txt +0 -0
  93. {dstack-0.19.20.dist-info → dstack-0.19.22.dist-info}/licenses/LICENSE.md +0 -0
@@ -5,7 +5,7 @@ from typing import List, Optional, Tuple
5
5
 
6
6
  from sqlalchemy import select
7
7
  from sqlalchemy.ext.asyncio import AsyncSession
8
- from sqlalchemy.orm import joinedload, lazyload, selectinload
8
+ from sqlalchemy.orm import joinedload, load_only, selectinload
9
9
 
10
10
  from dstack._internal.core.backends.base.backend import Backend
11
11
  from dstack._internal.core.backends.base.compute import ComputeWithVolumeSupport
@@ -43,6 +43,7 @@ from dstack._internal.server.models import (
43
43
  JobModel,
44
44
  ProjectModel,
45
45
  RunModel,
46
+ UserModel,
46
47
  VolumeAttachmentModel,
47
48
  VolumeModel,
48
49
  )
@@ -74,6 +75,7 @@ from dstack._internal.server.services.runs import (
74
75
  from dstack._internal.server.services.volumes import (
75
76
  volume_model_to_volume,
76
77
  )
78
+ from dstack._internal.server.utils import sentry_utils
77
79
  from dstack._internal.utils import common as common_utils
78
80
  from dstack._internal.utils import env as env_utils
79
81
  from dstack._internal.utils.logging import get_logger
@@ -108,6 +110,7 @@ def _get_effective_batch_size(batch_size: int) -> int:
108
110
  return batch_size
109
111
 
110
112
 
113
+ @sentry_utils.instrument_background_task
111
114
  async def _process_next_submitted_job():
112
115
  lock, lockset = get_locker(get_db().dialect_name).get_lockset(JobModel.__tablename__)
113
116
  async with get_session_ctx() as session:
@@ -119,6 +122,7 @@ async def _process_next_submitted_job():
119
122
  JobModel.status == JobStatus.SUBMITTED,
120
123
  JobModel.id.not_in(lockset),
121
124
  )
125
+ .options(load_only(JobModel.id))
122
126
  # Jobs are process in FIFO sorted by priority globally,
123
127
  # thus runs from different projects can "overtake" each other by using higher priorities.
124
128
  # That's not a big problem as long as projects do not compete for the same compute resources.
@@ -151,9 +155,7 @@ async def _process_next_submitted_job():
151
155
 
152
156
 
153
157
  async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
154
- logger.debug("%s: provisioning has started", fmt(job_model))
155
158
  # Refetch to load related attributes.
156
- # joinedload produces LEFT OUTER JOIN that can't be used with FOR UPDATE.
157
159
  res = await session.execute(
158
160
  select(JobModel).where(JobModel.id == job_model.id).options(joinedload(JobModel.instance))
159
161
  )
@@ -162,15 +164,16 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
162
164
  select(RunModel)
163
165
  .where(RunModel.id == job_model.run_id)
164
166
  .options(joinedload(RunModel.project).joinedload(ProjectModel.backends))
165
- .options(joinedload(RunModel.user))
167
+ .options(joinedload(RunModel.user).load_only(UserModel.name))
166
168
  .options(joinedload(RunModel.fleet).joinedload(FleetModel.instances))
167
169
  )
168
170
  run_model = res.unique().scalar_one()
169
- project = run_model.project
170
- run_spec = RunSpec.__response__.parse_raw(run_model.run_spec)
171
- profile = run_spec.merged_profile
171
+ logger.debug("%s: provisioning has started", fmt(job_model))
172
172
 
173
+ project = run_model.project
173
174
  run = run_model_to_run(run_model)
175
+ run_spec = run.run_spec
176
+ profile = run_spec.merged_profile
174
177
  job = find_job(run.jobs, job_model.replica_num, job_model.job_num)
175
178
 
176
179
  master_job = find_job(run.jobs, job_model.replica_num, 0)
@@ -228,7 +231,6 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
228
231
  InstanceModel.deleted == False,
229
232
  InstanceModel.total_blocks > InstanceModel.busy_blocks,
230
233
  )
231
- .options(lazyload(InstanceModel.jobs))
232
234
  .order_by(InstanceModel.id) # take locks in order
233
235
  .with_for_update(key_share=True)
234
236
  )
@@ -357,9 +359,9 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
357
359
  await session.execute(
358
360
  select(VolumeModel)
359
361
  .where(VolumeModel.id.in_(volumes_ids))
360
- .options(selectinload(VolumeModel.user))
362
+ .options(joinedload(VolumeModel.user).load_only(UserModel.name))
361
363
  .order_by(VolumeModel.id) # take locks in order
362
- .with_for_update(key_share=True)
364
+ .with_for_update(key_share=True, of=VolumeModel)
363
365
  )
364
366
  async with get_locker(get_db().dialect_name).lock_ctx(VolumeModel.__tablename__, volumes_ids):
365
367
  if len(volume_models) > 0:
@@ -2,7 +2,7 @@ import asyncio
2
2
 
3
3
  from sqlalchemy import or_, select
4
4
  from sqlalchemy.ext.asyncio import AsyncSession
5
- from sqlalchemy.orm import joinedload, lazyload
5
+ from sqlalchemy.orm import joinedload
6
6
 
7
7
  from dstack._internal.core.models.runs import JobStatus
8
8
  from dstack._internal.server.db import get_db, get_session_ctx
@@ -18,7 +18,11 @@ from dstack._internal.server.services.jobs import (
18
18
  )
19
19
  from dstack._internal.server.services.locking import get_locker
20
20
  from dstack._internal.server.services.logging import fmt
21
- from dstack._internal.utils.common import get_current_datetime, get_or_error
21
+ from dstack._internal.server.utils import sentry_utils
22
+ from dstack._internal.utils.common import (
23
+ get_current_datetime,
24
+ get_or_error,
25
+ )
22
26
  from dstack._internal.utils.logging import get_logger
23
27
 
24
28
  logger = get_logger(__name__)
@@ -31,6 +35,7 @@ async def process_terminating_jobs(batch_size: int = 1):
31
35
  await asyncio.gather(*tasks)
32
36
 
33
37
 
38
+ @sentry_utils.instrument_background_task
34
39
  async def _process_next_terminating_job():
35
40
  job_lock, job_lockset = get_locker(get_db().dialect_name).get_lockset(JobModel.__tablename__)
36
41
  instance_lock, instance_lockset = get_locker(get_db().dialect_name).get_lockset(
@@ -43,7 +48,10 @@ async def _process_next_terminating_job():
43
48
  .where(
44
49
  JobModel.id.not_in(job_lockset),
45
50
  JobModel.status == JobStatus.TERMINATING,
46
- or_(JobModel.remove_at.is_(None), JobModel.remove_at < get_current_datetime()),
51
+ or_(
52
+ JobModel.remove_at.is_(None),
53
+ JobModel.remove_at < get_current_datetime(),
54
+ ),
47
55
  )
48
56
  .order_by(JobModel.last_processed_at.asc())
49
57
  .limit(1)
@@ -59,7 +67,6 @@ async def _process_next_terminating_job():
59
67
  InstanceModel.id == job_model.used_instance_id,
60
68
  InstanceModel.id.not_in(instance_lockset),
61
69
  )
62
- .options(lazyload(InstanceModel.jobs))
63
70
  .with_for_update(skip_locked=True, key_share=True)
64
71
  )
65
72
  instance_model = res.scalar()
@@ -88,6 +95,7 @@ async def _process_job(session: AsyncSession, job_model: JobModel):
88
95
  .options(
89
96
  joinedload(InstanceModel.project).joinedload(ProjectModel.backends),
90
97
  joinedload(InstanceModel.volume_attachments).joinedload(VolumeAttachmentModel.volume),
98
+ joinedload(InstanceModel.jobs).load_only(JobModel.id),
91
99
  )
92
100
  )
93
101
  instance_model = res.unique().scalar()
@@ -7,6 +7,7 @@ from dstack._internal.core.errors import BackendError, BackendNotAvailable
7
7
  from dstack._internal.core.models.volumes import VolumeStatus
8
8
  from dstack._internal.server.db import get_db, get_session_ctx
9
9
  from dstack._internal.server.models import (
10
+ FleetModel,
10
11
  InstanceModel,
11
12
  ProjectModel,
12
13
  VolumeAttachmentModel,
@@ -15,12 +16,14 @@ from dstack._internal.server.models import (
15
16
  from dstack._internal.server.services import backends as backends_services
16
17
  from dstack._internal.server.services import volumes as volumes_services
17
18
  from dstack._internal.server.services.locking import get_locker
19
+ from dstack._internal.server.utils import sentry_utils
18
20
  from dstack._internal.utils.common import get_current_datetime, run_async
19
21
  from dstack._internal.utils.logging import get_logger
20
22
 
21
23
  logger = get_logger(__name__)
22
24
 
23
25
 
26
+ @sentry_utils.instrument_background_task
24
27
  async def process_submitted_volumes():
25
28
  lock, lockset = get_locker(get_db().dialect_name).get_lockset(VolumeModel.__tablename__)
26
29
  async with get_session_ctx() as session:
@@ -49,7 +52,6 @@ async def process_submitted_volumes():
49
52
  async def _process_submitted_volume(session: AsyncSession, volume_model: VolumeModel):
50
53
  logger.info("Started submitted volume %s processing", volume_model.name)
51
54
  # Refetch to load related attributes.
52
- # joinedload produces LEFT OUTER JOIN that can't be used with FOR UPDATE.
53
55
  res = await session.execute(
54
56
  select(VolumeModel)
55
57
  .where(VolumeModel.id == volume_model.id)
@@ -59,6 +61,7 @@ async def _process_submitted_volume(session: AsyncSession, volume_model: VolumeM
59
61
  joinedload(VolumeModel.attachments)
60
62
  .joinedload(VolumeAttachmentModel.instance)
61
63
  .joinedload(InstanceModel.fleet)
64
+ .load_only(FleetModel.name)
62
65
  )
63
66
  .execution_options(populate_existing=True)
64
67
  )
@@ -0,0 +1,43 @@
1
+ """Add probes
2
+
3
+ Revision ID: 25479f540245
4
+ Revises: 50dd7ea98639
5
+ Create Date: 2025-08-03 19:51:07.722217
6
+
7
+ """
8
+
9
+ import sqlalchemy as sa
10
+ import sqlalchemy_utils
11
+ from alembic import op
12
+
13
+ import dstack._internal.server.models
14
+
15
+ # revision identifiers, used by Alembic.
16
+ revision = "25479f540245"
17
+ down_revision = "50dd7ea98639"
18
+ branch_labels = None
19
+ depends_on = None
20
+
21
+
22
+ def upgrade() -> None:
23
+ # ### commands auto generated by Alembic - please adjust! ###
24
+ op.create_table(
25
+ "probes",
26
+ sa.Column("id", sqlalchemy_utils.types.uuid.UUIDType(binary=False), nullable=False),
27
+ sa.Column("name", sa.String(length=100), nullable=False),
28
+ sa.Column("job_id", sqlalchemy_utils.types.uuid.UUIDType(binary=False), nullable=False),
29
+ sa.Column("probe_num", sa.Integer(), nullable=False),
30
+ sa.Column("due", dstack._internal.server.models.NaiveDateTime(), nullable=False),
31
+ sa.Column("success_streak", sa.BigInteger(), nullable=False),
32
+ sa.Column("active", sa.Boolean(), nullable=False),
33
+ sa.ForeignKeyConstraint(["job_id"], ["jobs.id"], name=op.f("fk_probes_job_id_jobs")),
34
+ sa.PrimaryKeyConstraint("id", "job_id", name=op.f("pk_probes")),
35
+ sa.UniqueConstraint("job_id", "probe_num", name="uq_probes_job_id_probe_num"),
36
+ )
37
+ # ### end Alembic commands ###
38
+
39
+
40
+ def downgrade() -> None:
41
+ # ### commands auto generated by Alembic - please adjust! ###
42
+ op.drop_table("probes")
43
+ # ### end Alembic commands ###
@@ -0,0 +1,55 @@
1
+ """Index status columns
2
+
3
+ Revision ID: 50dd7ea98639
4
+ Revises: ec02a26a256c
5
+ Create Date: 2025-07-25 10:36:25.127923
6
+
7
+ """
8
+
9
+ from alembic import op
10
+
11
+ # revision identifiers, used by Alembic.
12
+ revision = "50dd7ea98639"
13
+ down_revision = "ec02a26a256c"
14
+ branch_labels = None
15
+ depends_on = None
16
+
17
+
18
+ def upgrade() -> None:
19
+ # ### commands auto generated by Alembic - please adjust! ###
20
+ with op.batch_alter_table("runs", schema=None) as batch_op:
21
+ batch_op.create_index(batch_op.f("ix_runs_status"), ["status"], unique=False)
22
+
23
+ with op.batch_alter_table("jobs", schema=None) as batch_op:
24
+ batch_op.create_index(batch_op.f("ix_jobs_status"), ["status"], unique=False)
25
+
26
+ with op.batch_alter_table("fleets", schema=None) as batch_op:
27
+ batch_op.create_index(batch_op.f("ix_fleets_status"), ["status"], unique=False)
28
+
29
+ with op.batch_alter_table("instances", schema=None) as batch_op:
30
+ batch_op.create_index(batch_op.f("ix_instances_status"), ["status"], unique=False)
31
+
32
+ with op.batch_alter_table("volumes", schema=None) as batch_op:
33
+ batch_op.create_index(batch_op.f("ix_volumes_status"), ["status"], unique=False)
34
+
35
+ # ### end Alembic commands ###
36
+
37
+
38
+ def downgrade() -> None:
39
+ # ### commands auto generated by Alembic - please adjust! ###
40
+ with op.batch_alter_table("runs", schema=None) as batch_op:
41
+ batch_op.drop_index(batch_op.f("ix_runs_status"))
42
+
43
+ with op.batch_alter_table("jobs", schema=None) as batch_op:
44
+ batch_op.drop_index(batch_op.f("ix_jobs_status"))
45
+
46
+ with op.batch_alter_table("fleets", schema=None) as batch_op:
47
+ batch_op.drop_index(batch_op.f("ix_fleets_status"))
48
+
49
+ with op.batch_alter_table("instances", schema=None) as batch_op:
50
+ batch_op.drop_index(batch_op.f("ix_instances_status"))
51
+
52
+ with op.batch_alter_table("volumes", schema=None) as batch_op:
53
+ batch_op.drop_index(batch_op.f("ix_volumes_status"))
54
+
55
+ # ### end Alembic commands ###
@@ -0,0 +1,50 @@
1
+ """Add instance health
2
+
3
+ Revision ID: 728b1488b1b4
4
+ Revises: 25479f540245
5
+ Create Date: 2025-08-01 14:56:20.466990
6
+
7
+ """
8
+
9
+ import sqlalchemy as sa
10
+ import sqlalchemy_utils
11
+ from alembic import op
12
+
13
+ import dstack._internal.server.models
14
+
15
+ # revision identifiers, used by Alembic.
16
+ revision = "728b1488b1b4"
17
+ down_revision = "25479f540245"
18
+ branch_labels = None
19
+ depends_on = None
20
+
21
+
22
+ def upgrade() -> None:
23
+ op.create_table(
24
+ "instance_health_checks",
25
+ sa.Column("id", sqlalchemy_utils.types.uuid.UUIDType(binary=False), nullable=False),
26
+ sa.Column(
27
+ "instance_id", sqlalchemy_utils.types.uuid.UUIDType(binary=False), nullable=False
28
+ ),
29
+ sa.Column("collected_at", dstack._internal.server.models.NaiveDateTime(), nullable=False),
30
+ sa.Column("status", sa.VARCHAR(length=100), nullable=False),
31
+ sa.Column("response", sa.Text(), nullable=False),
32
+ sa.ForeignKeyConstraint(
33
+ ["instance_id"],
34
+ ["instances.id"],
35
+ name=op.f("fk_instance_health_checks_instance_id_instances"),
36
+ ),
37
+ sa.PrimaryKeyConstraint("id", name=op.f("pk_instance_health_checks")),
38
+ )
39
+ with op.batch_alter_table("instances", schema=None) as batch_op:
40
+ batch_op.add_column(sa.Column("health", sa.VARCHAR(length=100), nullable=True))
41
+ op.execute("UPDATE instances SET health = 'HEALTHY'")
42
+ with op.batch_alter_table("instances", schema=None) as batch_op:
43
+ batch_op.alter_column("health", existing_type=sa.VARCHAR(length=100), nullable=False)
44
+
45
+
46
+ def downgrade() -> None:
47
+ with op.batch_alter_table("instances", schema=None) as batch_op:
48
+ batch_op.drop_column("health")
49
+
50
+ op.drop_table("instance_health_checks")
@@ -0,0 +1,38 @@
1
+ """Add RunModel.next_triggered_at
2
+
3
+ Revision ID: ec02a26a256c
4
+ Revises: d5863798bf41
5
+ Create Date: 2025-07-17 15:47:00.443217
6
+
7
+ """
8
+
9
+ import sqlalchemy as sa
10
+ from alembic import op
11
+
12
+ import dstack._internal.server.models
13
+
14
+ # revision identifiers, used by Alembic.
15
+ revision = "ec02a26a256c"
16
+ down_revision = "d5863798bf41"
17
+ branch_labels = None
18
+ depends_on = None
19
+
20
+
21
+ def upgrade() -> None:
22
+ # ### commands auto generated by Alembic - please adjust! ###
23
+ with op.batch_alter_table("runs", schema=None) as batch_op:
24
+ batch_op.add_column(
25
+ sa.Column(
26
+ "next_triggered_at", dstack._internal.server.models.NaiveDateTime(), nullable=True
27
+ )
28
+ )
29
+
30
+ # ### end Alembic commands ###
31
+
32
+
33
+ def downgrade() -> None:
34
+ # ### commands auto generated by Alembic - please adjust! ###
35
+ with op.batch_alter_table("runs", schema=None) as batch_op:
36
+ batch_op.drop_column("next_triggered_at")
37
+
38
+ # ### end Alembic commands ###
@@ -1,6 +1,6 @@
1
1
  import enum
2
2
  import uuid
3
- from datetime import datetime
3
+ from datetime import datetime, timezone
4
4
  from typing import Callable, List, Optional, Union
5
5
 
6
6
  from sqlalchemy import (
@@ -28,6 +28,7 @@ from dstack._internal.core.models.backends.base import BackendType
28
28
  from dstack._internal.core.models.common import CoreModel
29
29
  from dstack._internal.core.models.fleets import FleetStatus
30
30
  from dstack._internal.core.models.gateways import GatewayStatus
31
+ from dstack._internal.core.models.health import HealthStatus
31
32
  from dstack._internal.core.models.instances import InstanceStatus
32
33
  from dstack._internal.core.models.profiles import (
33
34
  DEFAULT_FLEET_TERMINATION_IDLE_TIME,
@@ -51,9 +52,10 @@ logger = get_logger(__name__)
51
52
 
52
53
  class NaiveDateTime(TypeDecorator):
53
54
  """
54
- A custom type decorator that ensures datetime objects are offset-naive when stored in the database.
55
- This is needed because we use datetimes in UTC only and store them as offset-naive.
56
- Some databases (e.g. Postgres) throw an error if the timezone is set.
55
+ A custom type decorator that ensures datetime objects are offset-naive when stored in the database
56
+ and offset-aware with UTC timezone when loaded from the database.
57
+ This is because we use datetimes in UTC everywhere, and
58
+ some databases (e.g. Postgres) throw an error if the timezone is set.
57
59
  """
58
60
 
59
61
  impl = DateTime
@@ -65,7 +67,9 @@ class NaiveDateTime(TypeDecorator):
65
67
  return value
66
68
 
67
69
  def process_result_value(self, value, dialect):
68
- return value
70
+ if value is None:
71
+ return None
72
+ return value.replace(tzinfo=timezone.utc)
69
73
 
70
74
 
71
75
  class DecryptedString(CoreModel):
@@ -355,7 +359,8 @@ class RunModel(BaseModel):
355
359
  run_name: Mapped[str] = mapped_column(String(100))
356
360
  submitted_at: Mapped[datetime] = mapped_column(NaiveDateTime)
357
361
  last_processed_at: Mapped[datetime] = mapped_column(NaiveDateTime)
358
- status: Mapped[RunStatus] = mapped_column(Enum(RunStatus))
362
+ next_triggered_at: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime)
363
+ status: Mapped[RunStatus] = mapped_column(Enum(RunStatus), index=True)
359
364
  termination_reason: Mapped[Optional[RunTerminationReason]] = mapped_column(
360
365
  Enum(RunTerminationReason)
361
366
  )
@@ -396,7 +401,7 @@ class JobModel(BaseModel):
396
401
  submission_num: Mapped[int] = mapped_column(Integer)
397
402
  submitted_at: Mapped[datetime] = mapped_column(NaiveDateTime)
398
403
  last_processed_at: Mapped[datetime] = mapped_column(NaiveDateTime)
399
- status: Mapped[JobStatus] = mapped_column(Enum(JobStatus))
404
+ status: Mapped[JobStatus] = mapped_column(Enum(JobStatus), index=True)
400
405
  termination_reason: Mapped[Optional[JobTerminationReason]] = mapped_column(
401
406
  Enum(JobTerminationReason)
402
407
  )
@@ -423,6 +428,9 @@ class JobModel(BaseModel):
423
428
  replica_num: Mapped[int] = mapped_column(Integer)
424
429
  deployment_num: Mapped[int] = mapped_column(Integer)
425
430
  job_runtime_data: Mapped[Optional[str]] = mapped_column(Text)
431
+ probes: Mapped[list["ProbeModel"]] = relationship(
432
+ back_populates="job", order_by="ProbeModel.probe_num"
433
+ )
426
434
 
427
435
 
428
436
  class GatewayModel(BaseModel):
@@ -524,7 +532,7 @@ class FleetModel(BaseModel):
524
532
  deleted: Mapped[bool] = mapped_column(Boolean, default=False)
525
533
  deleted_at: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime)
526
534
 
527
- status: Mapped[FleetStatus] = mapped_column(Enum(FleetStatus))
535
+ status: Mapped[FleetStatus] = mapped_column(Enum(FleetStatus), index=True)
528
536
  status_message: Mapped[Optional[str]] = mapped_column(Text)
529
537
 
530
538
  spec: Mapped[str] = mapped_column(Text)
@@ -543,7 +551,6 @@ class InstanceModel(BaseModel):
543
551
 
544
552
  instance_num: Mapped[int] = mapped_column(Integer, default=0)
545
553
 
546
- # instance
547
554
  created_at: Mapped[datetime] = mapped_column(NaiveDateTime, default=get_current_datetime)
548
555
  last_processed_at: Mapped[datetime] = mapped_column(
549
556
  NaiveDateTime, default=get_current_datetime
@@ -564,7 +571,7 @@ class InstanceModel(BaseModel):
564
571
  fleet_id: Mapped[Optional[uuid.UUID]] = mapped_column(ForeignKey("fleets.id"))
565
572
  fleet: Mapped[Optional["FleetModel"]] = relationship(back_populates="instances")
566
573
 
567
- status: Mapped[InstanceStatus] = mapped_column(Enum(InstanceStatus))
574
+ status: Mapped[InstanceStatus] = mapped_column(Enum(InstanceStatus), index=True)
568
575
  unreachable: Mapped[bool] = mapped_column(Boolean)
569
576
 
570
577
  # VM
@@ -580,7 +587,6 @@ class InstanceModel(BaseModel):
580
587
  requirements: Mapped[Optional[str]] = mapped_column(Text)
581
588
  instance_configuration: Mapped[Optional[str]] = mapped_column(Text)
582
589
 
583
- # temination policy
584
590
  termination_policy: Mapped[Optional[TerminationPolicy]] = mapped_column(String(100))
585
591
  # TODO: Suggestion: do not assign DEFAULT_FLEET_TERMINATION_IDLE_TIME as the default here
586
592
  # (make Optional instead; also instead of -1)
@@ -594,15 +600,17 @@ class InstanceModel(BaseModel):
594
600
  # instance termination handling
595
601
  termination_deadline: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime)
596
602
  termination_reason: Mapped[Optional[str]] = mapped_column(String(4000))
603
+ # Deprecated since 0.19.22, not used
597
604
  health_status: Mapped[Optional[str]] = mapped_column(String(4000))
605
+ health: Mapped[HealthStatus] = mapped_column(
606
+ EnumAsString(HealthStatus, 100), default=HealthStatus.HEALTHY
607
+ )
598
608
  first_termination_retry_at: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime)
599
609
  last_termination_retry_at: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime)
600
610
 
601
- # backend
602
611
  backend: Mapped[Optional[BackendType]] = mapped_column(EnumAsString(BackendType, 100))
603
612
  backend_data: Mapped[Optional[str]] = mapped_column(Text)
604
613
 
605
- # offer
606
614
  offer: Mapped[Optional[str]] = mapped_column(Text)
607
615
  region: Mapped[Optional[str]] = mapped_column(String(2000))
608
616
  price: Mapped[Optional[float]] = mapped_column(Float)
@@ -615,18 +623,33 @@ class InstanceModel(BaseModel):
615
623
  total_blocks: Mapped[Optional[int]] = mapped_column(Integer)
616
624
  busy_blocks: Mapped[int] = mapped_column(Integer, default=0)
617
625
 
618
- jobs: Mapped[list["JobModel"]] = relationship(back_populates="instance", lazy="joined")
626
+ jobs: Mapped[list["JobModel"]] = relationship(back_populates="instance")
619
627
  last_job_processed_at: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime)
620
628
 
621
629
  volume_attachments: Mapped[List["VolumeAttachmentModel"]] = relationship(
622
630
  back_populates="instance",
623
631
  # Add delete-orphan option so that removing entries from volume_attachments
624
632
  # automatically marks them for deletion.
625
- # SQLalchemy requires delete when using delete-orphan.
633
+ # SQLAlchemy requires delete when using delete-orphan.
626
634
  cascade="save-update, merge, delete-orphan, delete",
627
635
  )
628
636
 
629
637
 
638
+ class InstanceHealthCheckModel(BaseModel):
639
+ __tablename__ = "instance_health_checks"
640
+
641
+ id: Mapped[uuid.UUID] = mapped_column(
642
+ UUIDType(binary=False), primary_key=True, default=uuid.uuid4
643
+ )
644
+
645
+ instance_id: Mapped[uuid.UUID] = mapped_column(ForeignKey("instances.id"))
646
+ instance: Mapped["InstanceModel"] = relationship()
647
+
648
+ collected_at: Mapped[datetime] = mapped_column(NaiveDateTime)
649
+ status: Mapped[HealthStatus] = mapped_column(EnumAsString(HealthStatus, 100))
650
+ response: Mapped[str] = mapped_column(Text)
651
+
652
+
630
653
  class VolumeModel(BaseModel):
631
654
  __tablename__ = "volumes"
632
655
 
@@ -649,7 +672,7 @@ class VolumeModel(BaseModel):
649
672
  deleted: Mapped[bool] = mapped_column(Boolean, default=False)
650
673
  deleted_at: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime)
651
674
 
652
- status: Mapped[VolumeStatus] = mapped_column(Enum(VolumeStatus))
675
+ status: Mapped[VolumeStatus] = mapped_column(Enum(VolumeStatus), index=True)
653
676
  status_message: Mapped[Optional[str]] = mapped_column(Text)
654
677
 
655
678
  configuration: Mapped[str] = mapped_column(Text)
@@ -729,6 +752,24 @@ class JobPrometheusMetrics(BaseModel):
729
752
  text: Mapped[str] = mapped_column(Text)
730
753
 
731
754
 
755
+ class ProbeModel(BaseModel):
756
+ __tablename__ = "probes"
757
+ __table_args__ = (UniqueConstraint("job_id", "probe_num", name="uq_probes_job_id_probe_num"),)
758
+
759
+ id: Mapped[uuid.UUID] = mapped_column(
760
+ UUIDType(binary=False), primary_key=True, default=uuid.uuid4
761
+ )
762
+ name: Mapped[str] = mapped_column(String(100))
763
+
764
+ job_id: Mapped[uuid.UUID] = mapped_column(ForeignKey("jobs.id"), primary_key=True)
765
+ job: Mapped["JobModel"] = relationship(back_populates="probes")
766
+
767
+ probe_num: Mapped[int] = mapped_column(Integer) # index in JobSpec.probes
768
+ due: Mapped[datetime] = mapped_column(NaiveDateTime)
769
+ success_streak: Mapped[int] = mapped_column(BigInteger)
770
+ active: Mapped[bool] = mapped_column(Boolean)
771
+
772
+
732
773
  class SecretModel(BaseModel):
733
774
  __tablename__ = "secrets"
734
775
  __table_args__ = (UniqueConstraint("project_id", "name", name="uq_secrets_project_id_name"),)
@@ -3,12 +3,16 @@ from typing import List
3
3
  from fastapi import APIRouter, Depends
4
4
  from sqlalchemy.ext.asyncio import AsyncSession
5
5
 
6
- import dstack._internal.server.services.instances as instances
6
+ import dstack._internal.server.services.instances as instances_services
7
7
  from dstack._internal.core.models.instances import Instance
8
8
  from dstack._internal.server.db import get_session
9
- from dstack._internal.server.models import UserModel
10
- from dstack._internal.server.schemas.instances import ListInstancesRequest
11
- from dstack._internal.server.security.permissions import Authenticated
9
+ from dstack._internal.server.models import ProjectModel, UserModel
10
+ from dstack._internal.server.schemas.instances import (
11
+ GetInstanceHealthChecksRequest,
12
+ GetInstanceHealthChecksResponse,
13
+ ListInstancesRequest,
14
+ )
15
+ from dstack._internal.server.security.permissions import Authenticated, ProjectMember
12
16
  from dstack._internal.server.utils.routers import (
13
17
  CustomORJSONResponse,
14
18
  get_base_api_additional_responses,
@@ -19,6 +23,11 @@ root_router = APIRouter(
19
23
  tags=["instances"],
20
24
  responses=get_base_api_additional_responses(),
21
25
  )
26
+ project_router = APIRouter(
27
+ prefix="/api/project/{project_name}/instances",
28
+ tags=["instances"],
29
+ responses=get_base_api_additional_responses(),
30
+ )
22
31
 
23
32
 
24
33
  @root_router.post("/list", response_model=List[Instance])
@@ -35,7 +44,7 @@ async def list_instances(
35
44
  the last instance from the previous page as `prev_created_at` and `prev_id`.
36
45
  """
37
46
  return CustomORJSONResponse(
38
- await instances.list_user_instances(
47
+ await instances_services.list_user_instances(
39
48
  session=session,
40
49
  user=user,
41
50
  project_names=body.project_names,
@@ -47,3 +56,22 @@ async def list_instances(
47
56
  ascending=body.ascending,
48
57
  )
49
58
  )
59
+
60
+
61
+ @project_router.post("/get_instance_health_checks", response_model=GetInstanceHealthChecksResponse)
62
+ async def get_instance_health_checks(
63
+ body: GetInstanceHealthChecksRequest,
64
+ session: AsyncSession = Depends(get_session),
65
+ user_project: tuple[UserModel, ProjectModel] = Depends(ProjectMember()),
66
+ ):
67
+ _, project = user_project
68
+ health_checks = await instances_services.get_instance_health_checks(
69
+ session=session,
70
+ project=project,
71
+ fleet_name=body.fleet_name,
72
+ instance_num=body.instance_num,
73
+ after=body.after,
74
+ before=body.before,
75
+ limit=body.limit,
76
+ )
77
+ return CustomORJSONResponse(GetInstanceHealthChecksResponse(health_checks=health_checks))