dstack 0.19.20__py3-none-any.whl → 0.19.21__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries, as they appear in their respective public registries, and is provided for informational purposes only.

Note: this version of dstack has been flagged as a potentially problematic release.

Files changed (44)
  1. dstack/_internal/core/backends/__init__.py +0 -65
  2. dstack/_internal/core/backends/features.py +64 -0
  3. dstack/_internal/core/compatibility/fleets.py +2 -0
  4. dstack/_internal/core/compatibility/runs.py +4 -0
  5. dstack/_internal/core/models/profiles.py +37 -0
  6. dstack/_internal/server/app.py +22 -10
  7. dstack/_internal/server/background/__init__.py +5 -6
  8. dstack/_internal/server/background/tasks/process_fleets.py +52 -38
  9. dstack/_internal/server/background/tasks/process_gateways.py +2 -2
  10. dstack/_internal/server/background/tasks/process_idle_volumes.py +5 -4
  11. dstack/_internal/server/background/tasks/process_instances.py +62 -48
  12. dstack/_internal/server/background/tasks/process_metrics.py +9 -2
  13. dstack/_internal/server/background/tasks/process_placement_groups.py +2 -0
  14. dstack/_internal/server/background/tasks/process_prometheus_metrics.py +14 -2
  15. dstack/_internal/server/background/tasks/process_running_jobs.py +129 -124
  16. dstack/_internal/server/background/tasks/process_runs.py +63 -20
  17. dstack/_internal/server/background/tasks/process_submitted_jobs.py +12 -10
  18. dstack/_internal/server/background/tasks/process_terminating_jobs.py +12 -4
  19. dstack/_internal/server/background/tasks/process_volumes.py +4 -1
  20. dstack/_internal/server/migrations/versions/50dd7ea98639_index_status_columns.py +55 -0
  21. dstack/_internal/server/migrations/versions/ec02a26a256c_add_runmodel_next_triggered_at.py +38 -0
  22. dstack/_internal/server/models.py +16 -16
  23. dstack/_internal/server/services/fleets.py +19 -10
  24. dstack/_internal/server/services/gateways/__init__.py +17 -17
  25. dstack/_internal/server/services/instances.py +10 -14
  26. dstack/_internal/server/services/jobs/__init__.py +10 -12
  27. dstack/_internal/server/services/offers.py +3 -3
  28. dstack/_internal/server/services/projects.py +35 -15
  29. dstack/_internal/server/services/prometheus/client_metrics.py +3 -0
  30. dstack/_internal/server/services/prometheus/custom_metrics.py +2 -3
  31. dstack/_internal/server/services/runs.py +74 -34
  32. dstack/_internal/server/services/services/__init__.py +4 -1
  33. dstack/_internal/server/services/users.py +2 -3
  34. dstack/_internal/server/services/volumes.py +11 -11
  35. dstack/_internal/server/settings.py +3 -0
  36. dstack/_internal/server/testing/common.py +7 -0
  37. dstack/_internal/server/utils/sentry_utils.py +12 -0
  38. dstack/_internal/utils/cron.py +5 -0
  39. dstack/version.py +1 -1
  40. {dstack-0.19.20.dist-info → dstack-0.19.21.dist-info}/METADATA +2 -11
  41. {dstack-0.19.20.dist-info → dstack-0.19.21.dist-info}/RECORD +44 -39
  42. {dstack-0.19.20.dist-info → dstack-0.19.21.dist-info}/WHEEL +0 -0
  43. {dstack-0.19.20.dist-info → dstack-0.19.21.dist-info}/entry_points.txt +0 -0
  44. {dstack-0.19.20.dist-info → dstack-0.19.21.dist-info}/licenses/LICENSE.md +0 -0

dstack/_internal/server/services/gateways/__init__.py

@@ -1,7 +1,7 @@
 import asyncio
 import datetime
 import uuid
-from datetime import timedelta, timezone
+from datetime import timedelta
 from functools import partial
 from typing import List, Optional, Sequence
 
@@ -11,16 +11,16 @@ from sqlalchemy.ext.asyncio import AsyncSession
 from sqlalchemy.orm import selectinload
 
 import dstack._internal.utils.random_names as random_names
-from dstack._internal.core.backends import (
-    BACKENDS_WITH_GATEWAY_SUPPORT,
-    BACKENDS_WITH_PRIVATE_GATEWAY_SUPPORT,
-)
 from dstack._internal.core.backends.base.compute import (
     Compute,
     ComputeWithGatewaySupport,
     get_dstack_gateway_wheel,
     get_dstack_runner_version,
 )
+from dstack._internal.core.backends.features import (
+    BACKENDS_WITH_GATEWAY_SUPPORT,
+    BACKENDS_WITH_PRIVATE_GATEWAY_SUPPORT,
+)
 from dstack._internal.core.errors import (
     GatewayError,
     ResourceNotExistsError,
@@ -86,15 +86,6 @@ async def get_gateway_by_name(
     return gateway_model_to_gateway(gateway)
 
 
-async def get_project_default_gateway(
-    session: AsyncSession, project: ProjectModel
-) -> Optional[Gateway]:
-    gateway: Optional[GatewayModel] = project.default_gateway
-    if gateway is None:
-        return None
-    return gateway_model_to_gateway(gateway)
-
-
 async def create_gateway_compute(
     project_name: str,
     backend_compute: Compute,
@@ -181,9 +172,9 @@ async def create_gateway(
     session.add(gateway)
     await session.commit()
 
-    if project.default_gateway is None or configuration.default:
+    default_gateway = await get_project_default_gateway_model(session=session, project=project)
+    if default_gateway is None or configuration.default:
         await set_default_gateway(session=session, project=project, name=configuration.name)
-
     return gateway_model_to_gateway(gateway)
 
 
@@ -349,6 +340,15 @@ async def get_project_gateway_model_by_name(
     return res.scalar()
 
 
+async def get_project_default_gateway_model(
+    session: AsyncSession, project: ProjectModel
+) -> Optional[GatewayModel]:
+    res = await session.execute(
+        select(GatewayModel).where(GatewayModel.id == project.default_gateway_id)
+    )
+    return res.scalar_one_or_none()
+
+
 async def generate_gateway_name(session: AsyncSession, project: ProjectModel) -> str:
     gateways = await list_project_gateway_models(session=session, project=project)
     names = {g.name for g in gateways}
@@ -557,7 +557,7 @@ def gateway_model_to_gateway(gateway_model: GatewayModel) -> Gateway:
         region=gateway_model.region,
         wildcard_domain=gateway_model.wildcard_domain,
         default=gateway_model.project.default_gateway_id == gateway_model.id,
-        created_at=gateway_model.created_at.replace(tzinfo=timezone.utc),
+        created_at=gateway_model.created_at,
         status=gateway_model.status,
         status_message=gateway_model.status_message,
         configuration=configuration,
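
The `.replace(tzinfo=timezone.utc)` calls dropped in this hunk (and in the instances, jobs, projects, metrics, and runs hunks below) only stay correct if the ORM already returns timezone-aware datetimes. `models.py` also changed in this release (+16 -16), which would be consistent with the datetime columns switching to a custom type along these lines. This is a hedged sketch of such a type, not dstack's actual code:

from datetime import timezone

from sqlalchemy import DateTime
from sqlalchemy.types import TypeDecorator


class UTCDateTime(TypeDecorator):  # hypothetical name, for illustration only
    """Store naive UTC datetimes, hand back timezone-aware ones."""

    impl = DateTime
    cache_ok = True

    def process_bind_param(self, value, dialect):
        # Normalize incoming values to naive UTC before storing.
        if value is not None and value.tzinfo is not None:
            value = value.astimezone(timezone.utc).replace(tzinfo=None)
        return value

    def process_result_value(self, value, dialect):
        # Attach UTC on the way out so every caller gets an aware datetime
        # and no longer needs .replace(tzinfo=timezone.utc) at the call site.
        if value is not None:
            value = value.replace(tzinfo=timezone.utc)
        return value
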

dstack/_internal/server/services/instances.py

@@ -1,6 +1,6 @@
 import uuid
 from collections.abc import Container, Iterable
-from datetime import datetime, timezone
+from datetime import datetime
 from typing import Dict, List, Literal, Optional, Union
 
 import gpuhunt
@@ -8,11 +8,11 @@ from sqlalchemy import and_, or_, select
 from sqlalchemy.ext.asyncio import AsyncSession
 from sqlalchemy.orm import joinedload
 
-from dstack._internal.core.backends import BACKENDS_WITH_MULTINODE_SUPPORT
 from dstack._internal.core.backends.base.offers import (
     offer_to_catalog_item,
     requirements_to_query_filter,
 )
+from dstack._internal.core.backends.features import BACKENDS_WITH_MULTINODE_SUPPORT
 from dstack._internal.core.models.backends.base import BackendType
 from dstack._internal.core.models.envs import Env
 from dstack._internal.core.models.instances import (
@@ -34,7 +34,6 @@ from dstack._internal.core.models.profiles import (
     TerminationPolicy,
 )
 from dstack._internal.core.models.runs import JobProvisioningData, Requirements
-from dstack._internal.core.models.users import GlobalRole
 from dstack._internal.core.models.volumes import Volume
 from dstack._internal.core.services.profiles import get_termination
 from dstack._internal.server.models import (
@@ -44,7 +43,7 @@ from dstack._internal.server.models import (
     UserModel,
 )
 from dstack._internal.server.services.offers import generate_shared_offer
-from dstack._internal.server.services.projects import list_project_models, list_user_project_models
+from dstack._internal.server.services.projects import list_user_project_models
 from dstack._internal.utils import common as common_utils
 from dstack._internal.utils.logging import get_logger
 
@@ -62,7 +61,7 @@ def instance_model_to_instance(instance_model: InstanceModel) -> Instance:
         status=instance_model.status,
         unreachable=instance_model.unreachable,
         termination_reason=instance_model.termination_reason,
-        created=instance_model.created_at.replace(tzinfo=timezone.utc),
+        created=instance_model.created_at,
         total_blocks=instance_model.total_blocks,
         busy_blocks=instance_model.busy_blocks,
     )
@@ -372,18 +371,15 @@ async def list_user_instances(
     limit: int,
     ascending: bool,
 ) -> List[Instance]:
-    if user.global_role == GlobalRole.ADMIN:
-        projects = await list_project_models(session=session)
-    else:
-        projects = await list_user_project_models(session=session, user=user)
-    if not projects:
-        return []
-
+    projects = await list_user_project_models(
+        session=session,
+        user=user,
+        only_names=True,
+    )
     if project_names is not None:
-        projects = [proj for proj in projects if proj.name in project_names]
+        projects = [p for p in projects if p.name in project_names]
     if len(projects) == 0:
         return []
-
     instance_models = await list_projects_instance_models(
         session=session,
         projects=projects,

dstack/_internal/server/services/jobs/__init__.py

@@ -1,13 +1,13 @@
 import itertools
 import json
-from datetime import timedelta, timezone
+from datetime import timedelta
 from typing import Dict, Iterable, List, Optional, Tuple
 from uuid import UUID
 
 import requests
 from sqlalchemy import select
 from sqlalchemy.ext.asyncio import AsyncSession
-from sqlalchemy.orm import joinedload
+from sqlalchemy.orm import joinedload, load_only
 
 import dstack._internal.server.services.backends as backends_services
 from dstack._internal.core.backends.base.backend import Backend
@@ -130,7 +130,7 @@ def job_model_to_job_submission(job_model: JobModel) -> JobSubmission:
     ):
         backend_data = json.loads(job_provisioning_data.backend_data)
         job_provisioning_data.backend = backend_data["base_backend"]
-    last_processed_at = job_model.last_processed_at.replace(tzinfo=timezone.utc)
+    last_processed_at = job_model.last_processed_at
     finished_at = None
     if job_model.status.is_finished():
         finished_at = last_processed_at
@@ -140,7 +140,7 @@ def job_model_to_job_submission(job_model: JobModel) -> JobSubmission:
         id=job_model.id,
         submission_num=job_model.submission_num,
         deployment_num=job_model.deployment_num,
-        submitted_at=job_model.submitted_at.replace(tzinfo=timezone.utc),
+        submitted_at=job_model.submitted_at,
         last_processed_at=last_processed_at,
         finished_at=finished_at,
         inactivity_secs=job_model.inactivity_secs,
@@ -231,10 +231,7 @@ async def process_terminating_job(
     Graceful stop should already be done by `process_terminating_run`.
     Caller must acquire the locks on the job and the job's instance.
     """
-    if (
-        job_model.remove_at is not None
-        and job_model.remove_at.replace(tzinfo=timezone.utc) > common.get_current_datetime()
-    ):
+    if job_model.remove_at is not None and job_model.remove_at > common.get_current_datetime():
         # it's too early to terminate the instance
         return
 
@@ -550,24 +547,25 @@ def _should_force_detach_volume(job_model: JobModel, stop_duration: Optional[int
     return (
         job_model.volumes_detached_at is not None
         and common.get_current_datetime()
-        > job_model.volumes_detached_at.replace(tzinfo=timezone.utc) + MIN_FORCE_DETACH_WAIT_PERIOD
+        > job_model.volumes_detached_at + MIN_FORCE_DETACH_WAIT_PERIOD
        and (
            job_model.termination_reason == JobTerminationReason.ABORTED_BY_USER
            or stop_duration is not None
            and common.get_current_datetime()
-            > job_model.volumes_detached_at.replace(tzinfo=timezone.utc)
-            + timedelta(seconds=stop_duration)
+            > job_model.volumes_detached_at + timedelta(seconds=stop_duration)
        )
    )
 
 
 async def get_instances_ids_with_detaching_volumes(session: AsyncSession) -> List[UUID]:
     res = await session.execute(
-        select(JobModel).where(
+        select(JobModel)
+        .where(
             JobModel.status == JobStatus.TERMINATING,
             JobModel.used_instance_id.is_not(None),
             JobModel.volumes_detached_at.is_not(None),
         )
+        .options(load_only(JobModel.used_instance_id))
     )
     job_models = res.scalars().all()
     return [jm.used_instance_id for jm in job_models if jm.used_instance_id]
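
`get_instances_ids_with_detaching_volumes` only ever reads `used_instance_id`, so the added `load_only(JobModel.used_instance_id)` keeps SQLAlchemy from selecting every `JobModel` column. A minimal runnable sketch of the same pattern, using a hypothetical `Job` model (and a synchronous `Session` for brevity, where dstack uses `AsyncSession`):

import uuid
from typing import Optional

from sqlalchemy import String, create_engine, select
from sqlalchemy.orm import DeclarativeBase, Mapped, Session, load_only, mapped_column


class Base(DeclarativeBase):
    pass


class Job(Base):  # hypothetical stand-in for dstack's JobModel
    __tablename__ = "jobs"
    id: Mapped[str] = mapped_column(primary_key=True, default=lambda: str(uuid.uuid4()))
    status: Mapped[str] = mapped_column(String(32))
    used_instance_id: Mapped[Optional[str]] = mapped_column(String(36))
    job_spec_data: Mapped[str] = mapped_column(String, default="")  # wide column we want to skip


engine = create_engine("sqlite://")
Base.metadata.create_all(engine)

with Session(engine) as session:
    session.add(Job(status="terminating", used_instance_id="i-123", job_spec_data="x" * 10_000))
    session.commit()
    # load_only() limits the SELECT to the primary key plus the listed
    # columns; the wide job_spec_data column is not fetched at all.
    res = session.execute(
        select(Job)
        .where(Job.status == "terminating", Job.used_instance_id.is_not(None))
        .options(load_only(Job.used_instance_id))
    )
    print([j.used_instance_id for j in res.scalars()])
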

dstack/_internal/server/services/offers.py

@@ -2,13 +2,13 @@ from typing import List, Literal, Optional, Tuple, Union
 
 import gpuhunt
 
-from dstack._internal.core.backends import (
+from dstack._internal.core.backends.base.backend import Backend
+from dstack._internal.core.backends.base.compute import ComputeWithPlacementGroupSupport
+from dstack._internal.core.backends.features import (
     BACKENDS_WITH_CREATE_INSTANCE_SUPPORT,
     BACKENDS_WITH_MULTINODE_SUPPORT,
     BACKENDS_WITH_RESERVATION_SUPPORT,
 )
-from dstack._internal.core.backends.base.backend import Backend
-from dstack._internal.core.backends.base.compute import ComputeWithPlacementGroupSupport
 from dstack._internal.core.models.backends.base import BackendType
 from dstack._internal.core.models.instances import (
     InstanceOfferWithAvailability,

dstack/_internal/server/services/projects.py

@@ -1,11 +1,10 @@
 import uuid
-from datetime import timezone
 from typing import Awaitable, Callable, List, Optional, Tuple
 
 from sqlalchemy import delete, select, update
 from sqlalchemy import func as safunc
 from sqlalchemy.ext.asyncio import AsyncSession
-from sqlalchemy.orm import joinedload
+from sqlalchemy.orm import QueryableAttribute, joinedload, load_only
 
 from dstack._internal.core.backends.configurators import get_configurator
 from dstack._internal.core.backends.dstack.models import (
@@ -54,13 +53,12 @@ async def list_user_projects(
     user: UserModel,
 ) -> List[Project]:
     """
-    Returns projects where the user is a member.
+    Returns projects where the user is a member or all projects for global admins.
     """
-    if user.global_role == GlobalRole.ADMIN:
-        projects = await list_project_models(session=session)
-    else:
-        projects = await list_user_project_models(session=session, user=user)
-
+    projects = await list_user_project_models(
+        session=session,
+        user=user,
+    )
     projects = sorted(projects, key=lambda p: p.created_at)
     return [
         project_model_to_project(p, include_backends=False, include_members=False)
@@ -80,7 +78,7 @@ async def list_user_accessible_projects(
     if user.global_role == GlobalRole.ADMIN:
         projects = await list_project_models(session=session)
     else:
-        member_projects = await list_user_project_models(session=session, user=user)
+        member_projects = await list_member_project_models(session=session, user=user)
         public_projects = await list_public_non_member_project_models(session=session, user=user)
         projects = member_projects + public_projects
 
@@ -167,7 +165,7 @@ async def delete_projects(
     projects_names: List[str],
 ):
     if user.global_role != GlobalRole.ADMIN:
-        user_projects = await list_user_project_models(
+        user_projects = await list_member_project_models(
             session=session, user=user, include_members=True
         )
         user_project_names = [p.name for p in user_projects]
@@ -339,9 +337,25 @@ async def clear_project_members(
 
 
 async def list_user_project_models(
+    session: AsyncSession,
+    user: UserModel,
+    only_names: bool = False,
+) -> List[ProjectModel]:
+    load_only_attrs = []
+    if only_names:
+        load_only_attrs += [ProjectModel.id, ProjectModel.name]
+    if user.global_role == GlobalRole.ADMIN:
+        return await list_project_models(session=session, load_only_attrs=load_only_attrs)
+    return await list_member_project_models(
+        session=session, user=user, load_only_attrs=load_only_attrs
+    )
+
+
+async def list_member_project_models(
     session: AsyncSession,
     user: UserModel,
     include_members: bool = False,
+    load_only_attrs: Optional[List[QueryableAttribute]] = None,
 ) -> List[ProjectModel]:
     """
     List project models for a user where they are a member.
@@ -349,6 +363,8 @@ async def list_user_project_models(
     options = []
     if include_members:
         options.append(joinedload(ProjectModel.members))
+    if load_only_attrs:
+        options.append(load_only(*load_only_attrs))
     res = await session.execute(
         select(ProjectModel)
         .where(
@@ -395,13 +411,20 @@ async def list_user_owned_project_models(
 
 async def list_project_models(
     session: AsyncSession,
+    load_only_attrs: Optional[List[QueryableAttribute]] = None,
 ) -> List[ProjectModel]:
+    options = []
+    if load_only_attrs:
+        options.append(load_only(*load_only_attrs))
     res = await session.execute(
-        select(ProjectModel).where(ProjectModel.deleted == False),
+        select(ProjectModel).where(ProjectModel.deleted == False).options(*options)
     )
     return list(res.scalars().all())
 
 
+# TODO: Do not load ProjectModel.backends and ProjectModel.members by default when getting project
+
+
 async def get_project_model_by_name(
     session: AsyncSession, project_name: str, ignore_case: bool = True
 ) -> Optional[ProjectModel]:
@@ -415,7 +438,6 @@ async def get_project_model_by_name(
         .where(*filters)
         .options(joinedload(ProjectModel.backends))
         .options(joinedload(ProjectModel.members))
-        .options(joinedload(ProjectModel.default_gateway))
     )
     return res.unique().scalar()
@@ -432,7 +454,6 @@ async def get_project_model_by_name_or_error(
         )
         .options(joinedload(ProjectModel.backends))
         .options(joinedload(ProjectModel.members))
-        .options(joinedload(ProjectModel.default_gateway))
     )
     return res.unique().scalar_one()
@@ -449,7 +470,6 @@ async def get_project_model_by_id_or_error(
         )
         .options(joinedload(ProjectModel.backends))
         .options(joinedload(ProjectModel.members))
-        .options(joinedload(ProjectModel.default_gateway))
     )
     return res.unique().scalar_one()
@@ -537,7 +557,7 @@ def project_model_to_project(
         project_id=project_model.id,
         project_name=project_model.name,
         owner=users.user_model_to_user(project_model.owner),
-        created_at=project_model.created_at.replace(tzinfo=timezone.utc),
+        created_at=project_model.created_at,
         backends=backends,
         members=members,
         is_public=project_model.is_public,

dstack/_internal/server/services/prometheus/client_metrics.py

@@ -5,6 +5,9 @@ class RunMetrics:
     """Wrapper class for run-related Prometheus metrics."""
 
     def __init__(self):
+        # submit_to_provision_duration reflects real provisioning time
+        # but does not reflect how quickly provisioning processing works
+        # since it includes scheduling time, retrying, etc.
         self._submit_to_provision_duration = Histogram(
             "dstack_submit_to_provision_duration_seconds",
             "Time from when a run has been submitted and first job provisioning",

dstack/_internal/server/services/prometheus/custom_metrics.py

@@ -2,7 +2,6 @@ import itertools
 import json
 from collections import defaultdict
 from collections.abc import Generator, Iterable
-from datetime import timezone
 from typing import ClassVar
 from uuid import UUID
 
@@ -80,7 +79,7 @@ async def get_instance_metrics(session: AsyncSession) -> Iterable[Metric]:
             "dstack_backend": instance.backend.value if instance.backend is not None else "",
             "dstack_gpu": gpu,
         }
-        duration = (now - instance.created_at.replace(tzinfo=timezone.utc)).total_seconds()
+        duration = (now - instance.created_at).total_seconds()
         metrics.add_sample(_INSTANCE_DURATION, labels, duration)
         metrics.add_sample(_INSTANCE_PRICE, labels, instance.price or 0.0)
         metrics.add_sample(_INSTANCE_GPU_COUNT, labels, gpu_count)
@@ -167,7 +166,7 @@ async def get_job_metrics(session: AsyncSession) -> Iterable[Metric]:
             "dstack_backend": jpd.get_base_backend().value,
             "dstack_gpu": gpus[0].name if gpus else "",
         }
-        duration = (now - job.submitted_at.replace(tzinfo=timezone.utc)).total_seconds()
+        duration = (now - job.submitted_at).total_seconds()
         metrics.add_sample(_JOB_DURATION, labels, duration)
         metrics.add_sample(_JOB_PRICE, labels, price)
         metrics.add_sample(_JOB_GPU_COUNT, labels, len(gpus))

dstack/_internal/server/services/runs.py

@@ -5,9 +5,10 @@ from datetime import datetime, timezone
 from typing import List, Optional
 
 import pydantic
+from apscheduler.triggers.cron import CronTrigger
 from sqlalchemy import and_, func, or_, select, update
 from sqlalchemy.ext.asyncio import AsyncSession
-from sqlalchemy.orm import joinedload, selectinload
+from sqlalchemy.orm import joinedload
 
 import dstack._internal.utils.common as common_utils
 from dstack._internal.core.errors import (
@@ -42,7 +43,6 @@ from dstack._internal.core.models.runs import (
     RunTerminationReason,
     ServiceSpec,
 )
-from dstack._internal.core.models.users import GlobalRole
 from dstack._internal.core.models.volumes import (
     InstanceMountPoint,
     Volume,
@@ -81,7 +81,7 @@ from dstack._internal.server.services.locking import get_locker, string_to_lock_
 from dstack._internal.server.services.logging import fmt
 from dstack._internal.server.services.offers import get_offers_by_requirements
 from dstack._internal.server.services.plugins import apply_plugin_policies
-from dstack._internal.server.services.projects import list_project_models, list_user_project_models
+from dstack._internal.server.services.projects import list_user_project_models
 from dstack._internal.server.services.resources import set_resources_defaults
 from dstack._internal.server.services.secrets import get_project_secrets_mapping
 from dstack._internal.server.services.users import get_user_model_by_name
@@ -115,10 +115,11 @@ async def list_user_runs(
 ) -> List[Run]:
     if project_name is None and repo_id is not None:
         return []
-    if user.global_role == GlobalRole.ADMIN:
-        projects = await list_project_models(session=session)
-    else:
-        projects = await list_user_project_models(session=session, user=user)
+    projects = await list_user_project_models(
+        session=session,
+        user=user,
+        only_names=True,
+    )
     runs_user = None
     if username is not None:
         runs_user = await get_user_model_by_name(session=session, username=username)
@@ -217,9 +218,9 @@ async def list_projects_run_models(
     res = await session.execute(
         select(RunModel)
         .where(*filters)
+        .options(joinedload(RunModel.user).load_only(UserModel.name))
         .order_by(*order_by)
         .limit(limit)
-        .options(selectinload(RunModel.user))
     )
     run_models = list(res.scalars().all())
     return run_models
@@ -511,6 +512,14 @@ async def submit_run(
     )
 
     submitted_at = common_utils.get_current_datetime()
+    initial_status = RunStatus.SUBMITTED
+    initial_replicas = 1
+    if run_spec.merged_profile.schedule is not None:
+        initial_status = RunStatus.PENDING
+        initial_replicas = 0
+    elif run_spec.configuration.type == "service":
+        initial_replicas = run_spec.configuration.replicas.min
+
     run_model = RunModel(
         id=uuid.uuid4(),
         project_id=project.id,
@@ -519,21 +528,20 @@ async def submit_run(
         user_id=user.id,
         run_name=run_spec.run_name,
         submitted_at=submitted_at,
-        status=RunStatus.SUBMITTED,
+        status=initial_status,
         run_spec=run_spec.json(),
         last_processed_at=submitted_at,
         priority=run_spec.configuration.priority,
         deployment_num=0,
         desired_replica_count=1,  # a relevant value will be set in process_runs.py
+        next_triggered_at=_get_next_triggered_at(run_spec),
     )
     session.add(run_model)
 
-    replicas = 1
     if run_spec.configuration.type == "service":
-        replicas = run_spec.configuration.replicas.min
         await services.register_service(session, run_model, run_spec)
 
-    for replica_num in range(replicas):
+    for replica_num in range(initial_replicas):
         jobs = await get_jobs_from_run_spec(
             run_spec=run_spec,
             secrets=secrets,
@@ -693,8 +701,8 @@ def run_model_to_run(
         id=run_model.id,
         project_name=run_model.project.name,
         user=run_model.user.name,
-        submitted_at=run_model.submitted_at.replace(tzinfo=timezone.utc),
-        last_processed_at=run_model.last_processed_at.replace(tzinfo=timezone.utc),
+        submitted_at=run_model.submitted_at,
+        last_processed_at=run_model.last_processed_at,
         status=run_model.status,
         status_message=status_message,
         termination_reason=run_model.termination_reason,
@@ -972,6 +980,12 @@ def _validate_run_spec_and_set_defaults(run_spec: RunSpec):
         raise ServerClientError(
             f"Maximum utilization_policy.time_window is {settings.SERVER_METRICS_RUNNING_TTL_SECONDS}s"
         )
+    if (
+        run_spec.merged_profile.schedule
+        and run_spec.configuration.type == "service"
+        and run_spec.configuration.replicas.min == 0
+    ):
+        raise ServerClientError("Scheduled services with autoscaling to zero are not supported")
     if run_spec.configuration.priority is None:
         run_spec.configuration.priority = RUN_PRIORITY_DEFAULT
     set_resources_defaults(run_spec.configuration.resources)
@@ -1059,7 +1073,7 @@ def _check_can_update_configuration(
     )
 
 
-async def process_terminating_run(session: AsyncSession, run: RunModel):
+async def process_terminating_run(session: AsyncSession, run_model: RunModel):
     """
     Used by both `process_runs` and `stop_run` to process a TERMINATING run.
     Stops the jobs gracefully and marks them as TERMINATING.
@@ -1067,44 +1081,54 @@ async def process_terminating_run(session: AsyncSession, run: RunModel):
     When all jobs are terminated, assigns a finished status to the run.
     Caller must acquire the lock on run.
     """
-    assert run.termination_reason is not None
-    job_termination_reason = run.termination_reason.to_job_termination_reason()
+    assert run_model.termination_reason is not None
+    run = run_model_to_run(run_model, include_jobs=False)
+    job_termination_reason = run_model.termination_reason.to_job_termination_reason()
 
     unfinished_jobs_count = 0
-    for job in run.jobs:
-        if job.status.is_finished():
+    for job_model in run_model.jobs:
+        if job_model.status.is_finished():
             continue
         unfinished_jobs_count += 1
-        if job.status == JobStatus.TERMINATING:
+        if job_model.status == JobStatus.TERMINATING:
             if job_termination_reason == JobTerminationReason.ABORTED_BY_USER:
                 # Override termination reason so that
                 # abort actions such as volume force detach are triggered
-                job.termination_reason = job_termination_reason
+                job_model.termination_reason = job_termination_reason
             continue
 
-        if job.status == JobStatus.RUNNING and job_termination_reason not in {
+        if job_model.status == JobStatus.RUNNING and job_termination_reason not in {
            JobTerminationReason.ABORTED_BY_USER,
            JobTerminationReason.DONE_BY_RUNNER,
        }:
            # Send a signal to stop the job gracefully
-            await stop_runner(session, job)
-        delay_job_instance_termination(job)
-        job.status = JobStatus.TERMINATING
-        job.termination_reason = job_termination_reason
-        job.last_processed_at = common_utils.get_current_datetime()
+            await stop_runner(session, job_model)
+        delay_job_instance_termination(job_model)
+        job_model.status = JobStatus.TERMINATING
+        job_model.termination_reason = job_termination_reason
+        job_model.last_processed_at = common_utils.get_current_datetime()
 
     if unfinished_jobs_count == 0:
-        if run.service_spec is not None:
+        if run_model.service_spec is not None:
             try:
-                await services.unregister_service(session, run)
+                await services.unregister_service(session, run_model)
             except Exception as e:
-                logger.warning("%s: failed to unregister service: %s", fmt(run), repr(e))
-        run.status = run.termination_reason.to_status()
+                logger.warning("%s: failed to unregister service: %s", fmt(run_model), repr(e))
+        if (
+            run.run_spec.merged_profile.schedule is not None
+            and run_model.termination_reason
+            not in [RunTerminationReason.ABORTED_BY_USER, RunTerminationReason.STOPPED_BY_USER]
+        ):
+            run_model.next_triggered_at = _get_next_triggered_at(run.run_spec)
+            run_model.status = RunStatus.PENDING
+        else:
+            run_model.status = run_model.termination_reason.to_status()
+
         logger.info(
             "%s: run status has changed TERMINATING -> %s, reason: %s",
-            fmt(run),
-            run.status.name,
-            run.termination_reason.name,
+            fmt(run_model),
+            run_model.status.name,
+            run_model.termination_reason.name,
         )
@@ -1224,3 +1248,19 @@ async def retry_run_replica_jobs(
 
 def _remove_job_spec_sensitive_info(spec: JobSpec):
     spec.ssh_key = None
+
+
+def _get_next_triggered_at(run_spec: RunSpec) -> Optional[datetime]:
+    if run_spec.merged_profile.schedule is None:
+        return None
+    now = common_utils.get_current_datetime()
+    fire_times = []
+    for cron in run_spec.merged_profile.schedule.crons:
+        cron_trigger = CronTrigger.from_crontab(cron, timezone=timezone.utc)
+        fire_times.append(
+            cron_trigger.get_next_fire_time(
+                previous_fire_time=None,
+                now=now,
+            )
+        )
+    return min(fire_times)
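
The new `_get_next_triggered_at` helper leans on APScheduler's `CronTrigger`: each cron expression in the schedule yields its next fire time, and the soonest one becomes the run's `next_triggered_at`. A standalone sketch of the same calculation against the APScheduler 3.x API, with example cron expressions:

from datetime import datetime, timezone

from apscheduler.triggers.cron import CronTrigger

crons = ["0 9 * * mon-fri", "30 18 * * *"]  # example schedule, two cron expressions
now = datetime.now(tz=timezone.utc)

# get_next_fire_time(None, now) returns the first time at or after `now`
# that the trigger fires; min() picks the soonest across all expressions.
next_triggered_at = min(
    CronTrigger.from_crontab(cron, timezone=timezone.utc).get_next_fire_time(
        previous_fire_time=None, now=now
    )
    for cron in crons
)
print(next_triggered_at)
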

dstack/_internal/server/services/services/__init__.py

@@ -28,6 +28,7 @@ from dstack._internal.server.models import GatewayModel, JobModel, ProjectModel,
 from dstack._internal.server.services.gateways import (
     get_gateway_configuration,
     get_or_add_gateway_connection,
+    get_project_default_gateway_model,
     get_project_gateway_model_by_name,
 )
 from dstack._internal.server.services.logging import fmt
@@ -52,7 +53,9 @@ async def register_service(session: AsyncSession, run_model: RunModel, run_spec:
     elif run_spec.configuration.gateway == False:
         gateway = None
     else:
-        gateway = run_model.project.default_gateway
+        gateway = await get_project_default_gateway_model(
+            session=session, project=run_model.project
+        )
 
     if gateway is not None:
         service_spec = await _register_service_in_gateway(session, run_model, run_spec, gateway)
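
The lazy `run_model.project.default_gateway` attribute access is gone, matching the removal of `joinedload(ProjectModel.default_gateway)` from the project queries above. A plausible motivation: once the relationship is no longer eagerly loaded, touching it under `AsyncSession` would need implicit IO, which SQLAlchemy's asyncio extension forbids. A runnable sketch of that failure mode and the explicit-query fix, using hypothetical `Project`/`Gateway` models (requires the `aiosqlite` driver):

import asyncio
from typing import Optional

from sqlalchemy import ForeignKey, String, select
from sqlalchemy.ext.asyncio import AsyncSession, create_async_engine
from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column, relationship


class Base(DeclarativeBase):
    pass


class Gateway(Base):  # hypothetical stand-ins, not dstack's models
    __tablename__ = "gateways"
    id: Mapped[int] = mapped_column(primary_key=True)
    name: Mapped[str] = mapped_column(String(64))


class Project(Base):
    __tablename__ = "projects"
    id: Mapped[int] = mapped_column(primary_key=True)
    default_gateway_id: Mapped[Optional[int]] = mapped_column(ForeignKey("gateways.id"))
    default_gateway: Mapped[Optional[Gateway]] = relationship()


async def main() -> None:
    engine = create_async_engine("sqlite+aiosqlite://")
    async with engine.begin() as conn:
        await conn.run_sync(Base.metadata.create_all)
    async with AsyncSession(engine) as session:
        session.add(Gateway(id=1, name="gw"))
        session.add(Project(id=1, default_gateway_id=1))
        await session.commit()
        project = (await session.execute(select(Project))).scalar_one()
        # project.default_gateway would raise MissingGreenlet here: lazy
        # loading needs IO, which AsyncSession cannot perform implicitly.
        gateway = (
            await session.execute(
                select(Gateway).where(Gateway.id == project.default_gateway_id)
            )
        ).scalar_one_or_none()
        print(gateway.name if gateway else None)


asyncio.run(main())
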