dstack 0.19.8__py3-none-any.whl → 0.19.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (42)
  1. dstack/_internal/cli/commands/config.py +1 -1
  2. dstack/_internal/cli/commands/project.py +161 -0
  3. dstack/_internal/cli/commands/ps.py +9 -2
  4. dstack/_internal/cli/main.py +2 -0
  5. dstack/_internal/cli/services/configurators/run.py +18 -11
  6. dstack/_internal/cli/utils/run.py +7 -2
  7. dstack/_internal/core/backends/azure/compute.py +5 -2
  8. dstack/_internal/core/backends/cudo/compute.py +1 -1
  9. dstack/_internal/core/backends/nebius/fabrics.py +1 -0
  10. dstack/_internal/core/backends/nebius/models.py +1 -1
  11. dstack/_internal/core/models/configurations.py +19 -3
  12. dstack/_internal/core/models/resources.py +1 -1
  13. dstack/_internal/core/models/runs.py +19 -7
  14. dstack/_internal/server/background/tasks/process_metrics.py +30 -11
  15. dstack/_internal/server/background/tasks/process_running_jobs.py +56 -18
  16. dstack/_internal/server/background/tasks/process_submitted_jobs.py +21 -12
  17. dstack/_internal/server/migrations/versions/20166748b60c_add_jobmodel_disconnected_at.py +100 -0
  18. dstack/_internal/server/migrations/versions/6c1a9d6530ee_add_jobmodel_exit_status.py +26 -0
  19. dstack/_internal/server/migrations/versions/bca2fdf130bf_add_runmodel_priority.py +34 -0
  20. dstack/_internal/server/models.py +6 -1
  21. dstack/_internal/server/routers/repos.py +8 -4
  22. dstack/_internal/server/schemas/runner.py +41 -8
  23. dstack/_internal/server/services/instances.py +6 -2
  24. dstack/_internal/server/services/jobs/__init__.py +1 -0
  25. dstack/_internal/server/services/jobs/configurators/base.py +3 -3
  26. dstack/_internal/server/services/runner/client.py +7 -4
  27. dstack/_internal/server/services/runs.py +33 -20
  28. dstack/_internal/server/settings.py +21 -1
  29. dstack/_internal/server/statics/index.html +1 -1
  30. dstack/_internal/server/statics/{main-b4f65323f5df007e1664.js → main-b4803049eac16aea9a49.js} +4 -4
  31. dstack/_internal/server/statics/{main-b4f65323f5df007e1664.js.map → main-b4803049eac16aea9a49.js.map} +1 -1
  32. dstack/_internal/server/testing/common.py +4 -0
  33. dstack/_internal/server/utils/routers.py +3 -6
  34. dstack/_internal/settings.py +4 -0
  35. dstack/api/_public/runs.py +6 -3
  36. dstack/api/server/_runs.py +6 -0
  37. dstack/version.py +1 -1
  38. {dstack-0.19.8.dist-info → dstack-0.19.10.dist-info}/METADATA +46 -34
  39. {dstack-0.19.8.dist-info → dstack-0.19.10.dist-info}/RECORD +42 -38
  40. {dstack-0.19.8.dist-info → dstack-0.19.10.dist-info}/WHEEL +0 -0
  41. {dstack-0.19.8.dist-info → dstack-0.19.10.dist-info}/entry_points.txt +0 -0
  42. {dstack-0.19.8.dist-info → dstack-0.19.10.dist-info}/licenses/LICENSE.md +0 -0
@@ -93,11 +93,20 @@ async def _process_next_submitted_job():
     async with lock:
         res = await session.execute(
             select(JobModel)
+            .join(JobModel.run)
             .where(
                 JobModel.status == JobStatus.SUBMITTED,
                 JobModel.id.not_in(lockset),
             )
-            .order_by(JobModel.last_processed_at.asc())
+            # Jobs are processed in FIFO order, sorted by priority globally,
+            # so runs from different projects can "overtake" each other by using higher priorities.
+            # That's not a big problem as long as projects do not compete for the same compute resources.
+            # Jobs with lower priorities from other projects will be processed without major lag
+            # as long as new higher-priority runs are not constantly submitted.
+            # TODO: Consider processing jobs from different projects fairly/round-robin.
+            # Fully fair processing can be tricky to implement via the current DB queue as
+            # there can be many projects and we are limited by the max DB connections.
+            .order_by(RunModel.priority.desc(), JobModel.last_processed_at.asc())
             .limit(1)
             .with_for_update(skip_locked=True)
         )
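The new ordering turns the jobs table into a priority queue: highest `RunModel.priority` first, FIFO by `last_processed_at` within a priority, with `SKIP LOCKED` so concurrent workers skip rows already claimed by another worker instead of blocking. A minimal sketch of the same pattern, assuming the `JobModel`/`RunModel` mappings from this diff (the `pick_next_submitted_job` helper is illustrative, not dstack's API):

    from sqlalchemy import select
    from sqlalchemy.ext.asyncio import AsyncSession


    async def pick_next_submitted_job(session: AsyncSession):
        # Claim at most one SUBMITTED job: priority DESC, then FIFO within a priority.
        res = await session.execute(
            select(JobModel)
            .join(JobModel.run)
            .where(JobModel.status == JobStatus.SUBMITTED)
            .order_by(RunModel.priority.desc(), JobModel.last_processed_at.asc())
            .limit(1)
            .with_for_update(skip_locked=True)  # concurrent workers skip locked rows
        )
        return res.scalar_one_or_none()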
@@ -360,16 +369,16 @@ async def _assign_job_to_pool_instance(
         (instance, common_utils.get_or_error(get_instance_offer(instance)))
         for instance in nonshared_instances
     ]
-    if not multinode:
-        shared_instances_with_offers = get_shared_pool_instances_with_offers(
-            pool_instances=pool_instances,
-            profile=profile,
-            requirements=job.job_spec.requirements,
-            idle_only=True,
-            fleet_model=fleet_model,
-            volumes=volumes,
-        )
-        instances_with_offers.extend(shared_instances_with_offers)
+    shared_instances_with_offers = get_shared_pool_instances_with_offers(
+        pool_instances=pool_instances,
+        profile=profile,
+        requirements=job.job_spec.requirements,
+        idle_only=True,
+        fleet_model=fleet_model,
+        multinode=multinode,
+        volumes=volumes,
+    )
+    instances_with_offers.extend(shared_instances_with_offers)
 
     if len(instances_with_offers) == 0:
         return None
@@ -572,7 +581,7 @@ def _create_instance_model_for_job(
 
 
 def _prepare_job_runtime_data(offer: InstanceOfferWithAvailability) -> JobRuntimeData:
-    if offer.total_blocks == 1:
+    if offer.blocks == offer.total_blocks:
         if env_utils.get_bool("DSTACK_FORCE_BRIDGE_NETWORK"):
             network_mode = NetworkMode.BRIDGE
         else:
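This condition change broadens host networking: previously only jobs on instances that were not split into blocks (`total_blocks == 1`) qualified; now any job whose offer occupies all blocks of its instance does, still subject to the `DSTACK_FORCE_BRIDGE_NETWORK` override. A toy illustration of the predicate (the helper name is made up for this example):

    def spans_whole_instance(blocks: int, total_blocks: int) -> bool:
        # Old check: total_blocks == 1 (only unsplit instances qualified).
        # New check: the job occupies every block, so a fully allocated
        # block-split instance can also get host networking.
        return blocks == total_blocks


    assert spans_whole_instance(blocks=1, total_blocks=1)      # unsplit instance
    assert spans_whole_instance(blocks=4, total_blocks=4)      # whole split instance
    assert not spans_whole_instance(blocks=2, total_blocks=4)  # shared slice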
@@ -0,0 +1,100 @@
+"""Add JobModel.disconnected_at
+
+Revision ID: 20166748b60c
+Revises: 6c1a9d6530ee
+Create Date: 2025-05-13 16:24:32.496578
+
+"""
+
+import sqlalchemy as sa
+from alembic import op
+from alembic_postgresql_enum import TableReference
+
+import dstack._internal.server.models
+
+# revision identifiers, used by Alembic.
+revision = "20166748b60c"
+down_revision = "6c1a9d6530ee"
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    with op.batch_alter_table("jobs", schema=None) as batch_op:
+        batch_op.add_column(
+            sa.Column(
+                "disconnected_at", dstack._internal.server.models.NaiveDateTime(), nullable=True
+            )
+        )
+
+    op.sync_enum_values(
+        enum_schema="public",
+        enum_name="jobterminationreason",
+        new_values=[
+            "FAILED_TO_START_DUE_TO_NO_CAPACITY",
+            "INTERRUPTED_BY_NO_CAPACITY",
+            "INSTANCE_UNREACHABLE",
+            "WAITING_INSTANCE_LIMIT_EXCEEDED",
+            "WAITING_RUNNER_LIMIT_EXCEEDED",
+            "TERMINATED_BY_USER",
+            "VOLUME_ERROR",
+            "GATEWAY_ERROR",
+            "SCALED_DOWN",
+            "DONE_BY_RUNNER",
+            "ABORTED_BY_USER",
+            "TERMINATED_BY_SERVER",
+            "INACTIVITY_DURATION_EXCEEDED",
+            "TERMINATED_DUE_TO_UTILIZATION_POLICY",
+            "CONTAINER_EXITED_WITH_ERROR",
+            "PORTS_BINDING_FAILED",
+            "CREATING_CONTAINER_ERROR",
+            "EXECUTOR_ERROR",
+            "MAX_DURATION_EXCEEDED",
+        ],
+        affected_columns=[
+            TableReference(
+                table_schema="public", table_name="jobs", column_name="termination_reason"
+            )
+        ],
+        enum_values_to_rename=[],
+    )
+    # ### end Alembic commands ###
+
+
+def downgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.sync_enum_values(
+        enum_schema="public",
+        enum_name="jobterminationreason",
+        new_values=[
+            "FAILED_TO_START_DUE_TO_NO_CAPACITY",
+            "INTERRUPTED_BY_NO_CAPACITY",
+            "WAITING_INSTANCE_LIMIT_EXCEEDED",
+            "WAITING_RUNNER_LIMIT_EXCEEDED",
+            "TERMINATED_BY_USER",
+            "VOLUME_ERROR",
+            "GATEWAY_ERROR",
+            "SCALED_DOWN",
+            "DONE_BY_RUNNER",
+            "ABORTED_BY_USER",
+            "TERMINATED_BY_SERVER",
+            "INACTIVITY_DURATION_EXCEEDED",
+            "TERMINATED_DUE_TO_UTILIZATION_POLICY",
+            "CONTAINER_EXITED_WITH_ERROR",
+            "PORTS_BINDING_FAILED",
+            "CREATING_CONTAINER_ERROR",
+            "EXECUTOR_ERROR",
+            "MAX_DURATION_EXCEEDED",
+        ],
+        affected_columns=[
+            TableReference(
+                table_schema="public", table_name="jobs", column_name="termination_reason"
+            )
+        ],
+        enum_values_to_rename=[],
+    )
+    with op.batch_alter_table("jobs", schema=None) as batch_op:
+        batch_op.drop_column("disconnected_at")
+
+    # ### end Alembic commands ###
@@ -0,0 +1,26 @@
+"""Add JobModel.exit_status
+
+Revision ID: 6c1a9d6530ee
+Revises: 7ba3b59d7ca6
+Create Date: 2025-05-09 10:25:19.715852
+
+"""
+
+import sqlalchemy as sa
+from alembic import op
+
+# revision identifiers, used by Alembic.
+revision = "6c1a9d6530ee"
+down_revision = "7ba3b59d7ca6"
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    with op.batch_alter_table("jobs", schema=None) as batch_op:
+        batch_op.add_column(sa.Column("exit_status", sa.Integer(), nullable=True))
+
+
+def downgrade() -> None:
+    with op.batch_alter_table("jobs", schema=None) as batch_op:
+        batch_op.drop_column("exit_status")
@@ -0,0 +1,34 @@
+"""Add RunModel.priority
+
+Revision ID: bca2fdf130bf
+Revises: 20166748b60c
+Create Date: 2025-05-14 15:24:21.269775
+
+"""
+
+import sqlalchemy as sa
+from alembic import op
+
+# revision identifiers, used by Alembic.
+revision = "bca2fdf130bf"
+down_revision = "20166748b60c"
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    with op.batch_alter_table("runs", schema=None) as batch_op:
+        batch_op.add_column(sa.Column("priority", sa.Integer(), nullable=True))
+        batch_op.execute("UPDATE runs SET priority = 0")
+    with op.batch_alter_table("runs", schema=None) as batch_op:
+        batch_op.alter_column("priority", nullable=False)
+    # ### end Alembic commands ###
+
+
+def downgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    with op.batch_alter_table("runs", schema=None) as batch_op:
+        batch_op.drop_column("priority")
+
+    # ### end Alembic commands ###
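This migration uses the usual two-step recipe for adding a NOT NULL column to an existing table: add it as nullable, backfill, then tighten the constraint in a second `batch_alter_table` block so the batch (table-rebuild) path sees the backfilled data. A generic sketch of the same steps, with placeholder table and column names (not part of this diff):

    import sqlalchemy as sa
    from alembic import op


    def upgrade() -> None:
        # 1) Add the column as nullable so existing rows remain valid.
        with op.batch_alter_table("my_table", schema=None) as batch_op:
            batch_op.add_column(sa.Column("my_col", sa.Integer(), nullable=True))
            # 2) Backfill a value for every existing row.
            batch_op.execute("UPDATE my_table SET my_col = 0")
        # 3) Tighten to NOT NULL in a separate batch operation.
        with op.batch_alter_table("my_table", schema=None) as batch_op:
            batch_op.alter_column("my_col", nullable=False)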
@@ -348,6 +348,7 @@ class RunModel(BaseModel):
     resubmission_attempt: Mapped[int] = mapped_column(Integer, default=0)
     run_spec: Mapped[str] = mapped_column(Text)
     service_spec: Mapped[Optional[str]] = mapped_column(Text)
+    priority: Mapped[int] = mapped_column(Integer, default=0)
 
     jobs: Mapped[List["JobModel"]] = relationship(
         back_populates="run", lazy="selectin", order_by="[JobModel.replica_num, JobModel.job_num]"
@@ -382,6 +383,10 @@ class JobModel(BaseModel):
         Enum(JobTerminationReason)
     )
     termination_reason_message: Mapped[Optional[str]] = mapped_column(Text)
+    # `disconnected_at` stores the time when connectivity issues with the instance were first detected.
+    # It resets every time connectivity is restored.
+    disconnected_at: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime)
+    exit_status: Mapped[Optional[int]] = mapped_column(Integer)
     job_spec_data: Mapped[str] = mapped_column(Text)
     job_provisioning_data: Mapped[Optional[str]] = mapped_column(Text)
     runner_timestamp: Mapped[Optional[int]] = mapped_column(BigInteger)
@@ -390,7 +395,7 @@ class JobModel(BaseModel):
     remove_at: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime)
     volumes_detached_at: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime)
     # `instance_assigned` means instance assignment was done.
-    # if `instance_assigned` is True and `instance` is None, no instance was assiged.
+    # if `instance_assigned` is True and `instance` is None, no instance was assigned.
     instance_assigned: Mapped[bool] = mapped_column(Boolean, default=False)
     instance_id: Mapped[Optional[uuid.UUID]] = mapped_column(
         ForeignKey("instances.id", ondelete="CASCADE")
@@ -1,6 +1,7 @@
 from typing import List, Tuple
 
 from fastapi import APIRouter, Depends, Request, UploadFile
+from humanize import naturalsize
 from sqlalchemy.ext.asyncio import AsyncSession
 
 from dstack._internal.core.errors import ResourceNotExistsError, ServerClientError
@@ -14,9 +15,10 @@ from dstack._internal.server.schemas.repos import (
 )
 from dstack._internal.server.security.permissions import ProjectMember
 from dstack._internal.server.services import repos
+from dstack._internal.server.settings import SERVER_CODE_UPLOAD_LIMIT
 from dstack._internal.server.utils.routers import (
     get_base_api_additional_responses,
-    request_size_exceeded,
+    get_request_size,
 )
 
 router = APIRouter(
@@ -94,10 +96,12 @@ async def upload_code(
     session: AsyncSession = Depends(get_session),
     user_project: Tuple[UserModel, ProjectModel] = Depends(ProjectMember()),
 ):
-    if request_size_exceeded(request, limit=2 * 2**20):
+    request_size = get_request_size(request)
+    if SERVER_CODE_UPLOAD_LIMIT > 0 and request_size > SERVER_CODE_UPLOAD_LIMIT:
         raise ServerClientError(
-            "Repo diff size exceeds the limit of 2MB. "
-            "Use .gitignore to exclude large files from the repo."
+            f"Repo diff size is {naturalsize(request_size)}, which exceeds the limit of "
+            f"{naturalsize(SERVER_CODE_UPLOAD_LIMIT)}. Use .gitignore to exclude large files from the repo. "
+            f"This limit can be modified by setting the DSTACK_SERVER_CODE_UPLOAD_LIMIT environment variable."
         )
     _, project = user_project
     await repos.upload_code(
@@ -7,7 +7,14 @@ from typing_extensions import Annotated
 
 from dstack._internal.core.models.common import CoreModel, NetworkMode
 from dstack._internal.core.models.repos.remote import RemoteRepoCreds
-from dstack._internal.core.models.runs import ClusterInfo, JobSpec, JobStatus, RunSpec
+from dstack._internal.core.models.runs import (
+    ClusterInfo,
+    JobSpec,
+    JobStatus,
+    JobSubmission,
+    Run,
+    RunSpec,
+)
 from dstack._internal.core.models.volumes import InstanceMountPoint, VolumeMountPoint
 
 
@@ -16,6 +23,7 @@ class JobStateEvent(CoreModel):
     state: JobStatus
     termination_reason: Optional[str] = None
     termination_message: Optional[str] = None
+    exit_status: Optional[int] = None
 
 
 class LogEvent(CoreModel):
@@ -38,15 +46,18 @@ class PullResponse(CoreModel):
 
 
 class SubmitBody(CoreModel):
-    run_spec: Annotated[
-        RunSpec,
+    run: Annotated[
+        Run,
         Field(
             include={
-                "run_name",
-                "repo_id",
-                "repo_data",
-                "configuration",
-                "configuration_path",
+                "id": True,
+                "run_spec": {
+                    "run_name",
+                    "repo_id",
+                    "repo_data",
+                    "configuration",
+                    "configuration_path",
+                },
             }
         ),
     ]
@@ -69,9 +80,31 @@ class SubmitBody(CoreModel):
             }
         ),
     ]
+    job_submission: Annotated[
+        JobSubmission,
+        Field(
+            include={
+                "id",
+            }
+        ),
+    ]
     cluster_info: Annotated[Optional[ClusterInfo], Field(include=True)]
     secrets: Annotated[Optional[Dict[str, str]], Field(include=True)]
     repo_credentials: Annotated[Optional[RemoteRepoCreds], Field(include=True)]
+    # run_spec is deprecated in favor of run.run_spec.
+    # TODO: Remove once we no longer support instances deployed with 0.19.8 or earlier.
+    run_spec: Annotated[
+        RunSpec,
+        Field(
+            include={
+                "run_name",
+                "repo_id",
+                "repo_data",
+                "configuration",
+                "configuration_path",
+            },
+        ),
+    ]
 
 
 class HealthcheckResponse(CoreModel):
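The `run` field replaces the deprecated top-level `run_spec`, and its nested `include` whitelists exactly what the runner needs: the run `id` plus selected `run_spec` fields. Field-level `include` controls what model exports serialize; a minimal sketch of the mechanism, assuming Pydantic v1 semantics and toy models (not dstack's):

    from pydantic import BaseModel, Field


    class ToyRunSpec(BaseModel):
        run_name: str
        ssh_key_pub: str  # must not leak to the runner


    class ToyRun(BaseModel):
        id: int
        run_spec: ToyRunSpec


    class ToySubmitBody(BaseModel):
        # Nested include: keep run.id and only whitelisted run_spec fields.
        run: ToyRun = Field(include={"id": True, "run_spec": {"run_name"}})


    body = ToySubmitBody(run=ToyRun(id=1, run_spec=ToyRunSpec(run_name="r", ssh_key_pub="k")))
    print(body.dict())  # {'run': {'id': 1, 'run_spec': {'run_name': 'r'}}}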
@@ -235,6 +235,7 @@ def get_shared_pool_instances_with_offers(
     *,
     idle_only: bool = False,
     fleet_model: Optional[FleetModel] = None,
+    multinode: bool = False,
     volumes: Optional[List[List[Volume]]] = None,
 ) -> list[tuple[InstanceModel, InstanceOfferWithAvailability]]:
     instances_with_offers: list[tuple[InstanceModel, InstanceOfferWithAvailability]] = []
@@ -243,19 +244,22 @@ def get_shared_pool_instances_with_offers(
         pool_instances=pool_instances,
         profile=profile,
         fleet_model=fleet_model,
-        multinode=False,
+        multinode=multinode,
         volumes=volumes,
         shared=True,
     )
     for instance in filtered_instances:
         if idle_only and instance.status not in [InstanceStatus.IDLE, InstanceStatus.BUSY]:
             continue
+        if multinode and instance.busy_blocks > 0:
+            continue
         offer = get_instance_offer(instance)
         if offer is None:
             continue
         total_blocks = common_utils.get_or_error(instance.total_blocks)
         idle_blocks = total_blocks - instance.busy_blocks
-        for blocks in range(1, total_blocks + 1):
+        min_blocks = total_blocks if multinode else 1
+        for blocks in range(min_blocks, total_blocks + 1):
            shared_offer = generate_shared_offer(offer, blocks, total_blocks)
            catalog_item = offer_to_catalog_item(shared_offer)
            if gpuhunt.matches(catalog_item, query_filter):
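With the new `multinode` flag, shared (block-split) instances are no longer categorically skipped for multinode jobs. Instead, only fully idle shared instances qualify (any `busy_blocks > 0` is skipped), and only the whole-instance offer is generated for them via `min_blocks`. A toy illustration of the block-count arithmetic (the helper is made up for this example):

    def candidate_block_counts(total_blocks: int, multinode: bool) -> list[int]:
        # Multinode jobs must take the whole instance, so only the
        # full-size offer is generated; otherwise any block count works.
        min_blocks = total_blocks if multinode else 1
        return list(range(min_blocks, total_blocks + 1))


    assert candidate_block_counts(4, multinode=False) == [1, 2, 3, 4]
    assert candidate_block_counts(4, multinode=True) == [4]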
@@ -135,6 +135,7 @@ def job_model_to_job_submission(job_model: JobModel) -> JobSubmission:
         status=job_model.status,
         termination_reason=job_model.termination_reason,
         termination_reason_message=job_model.termination_reason_message,
+        exit_status=job_model.exit_status,
         job_provisioning_data=job_provisioning_data,
         job_runtime_data=get_job_runtime_data(job_model),
     )
@@ -6,7 +6,7 @@ from typing import Dict, List, Optional, Union
 
 from cachetools import TTLCache, cached
 
-import dstack.version as version
+from dstack._internal import settings
 from dstack._internal.core.errors import DockerRegistryError, ServerClientError
 from dstack._internal.core.models.common import RegistryAuth
 from dstack._internal.core.models.configurations import (
@@ -53,14 +53,14 @@ def get_default_image(python_version: str, nvcc: bool = False) -> str:
     suffix = ""
     if nvcc:
         suffix = "-devel"
-    return f"dstackai/base:py{python_version}-{version.base_image}-cuda-12.1{suffix}"
+    return f"{settings.DSTACK_BASE_IMAGE}:py{python_version}-{settings.DSTACK_BASE_IMAGE_VERSION}-cuda-12.1{suffix}"
 
 
 class JobConfigurator(ABC):
     TYPE: RunConfigurationType
 
     _image_config: Optional[ImageConfig] = None
-    # JobSSHKey should be shared for all jobs in a replica for inter-node communitation.
+    # JobSSHKey should be shared for all jobs in a replica for inter-node communication.
     _job_ssh_key: Optional[JobSSHKey] = None
 
     def __init__(self, run_spec: RunSpec):
@@ -12,7 +12,7 @@ from dstack._internal.core.models.common import CoreModel, NetworkMode
 from dstack._internal.core.models.envs import Env
 from dstack._internal.core.models.repos.remote import RemoteRepoCreds
 from dstack._internal.core.models.resources import Memory
-from dstack._internal.core.models.runs import ClusterInfo, JobSpec, RunSpec
+from dstack._internal.core.models.runs import ClusterInfo, Job, Run
 from dstack._internal.core.models.volumes import InstanceMountPoint, Volume, VolumeMountPoint
 from dstack._internal.server.schemas.runner import (
     GPUDevice,
@@ -72,8 +72,8 @@ class RunnerClient:
 
     def submit_job(
         self,
-        run_spec: RunSpec,
-        job_spec: JobSpec,
+        run: Run,
+        job: Job,
         cluster_info: ClusterInfo,
         secrets: Dict[str, str],
         repo_credentials: Optional[RemoteRepoCreds],
@@ -81,6 +81,7 @@ class RunnerClient:
     ):
         # XXX: This is a quick-and-dirty hack to deliver InstanceModel-specific environment
         # variables to the runner without runner API modification.
+        job_spec = job.job_spec
         if instance_env is not None:
             if isinstance(instance_env, Env):
                 merged_env = instance_env.as_dict()
@@ -90,11 +91,13 @@ class RunnerClient:
             job_spec = job_spec.copy(deep=True)
             job_spec.env = merged_env
         body = SubmitBody(
-            run_spec=run_spec,
+            run=run,
             job_spec=job_spec,
+            job_submission=job.job_submissions[-1],
             cluster_info=cluster_info,
             secrets=secrets,
             repo_credentials=repo_credentials,
+            run_spec=run.run_spec,
         )
         resp = requests.post(
             # use .json() to encode enums
@@ -16,7 +16,7 @@ from dstack._internal.core.errors import (
     ServerClientError,
 )
 from dstack._internal.core.models.common import ApplyAction
-from dstack._internal.core.models.configurations import AnyRunConfiguration
+from dstack._internal.core.models.configurations import RUN_PRIORITY_DEFAULT, AnyRunConfiguration
 from dstack._internal.core.models.instances import (
     InstanceAvailability,
     InstanceOfferWithAvailability,
@@ -434,7 +434,12 @@ async def apply_plan(
     # FIXME: potentially long write transaction
     # Avoid getting run_model after update
     await session.execute(
-        update(RunModel).where(RunModel.id == current_resource.id).values(run_spec=run_spec.json())
+        update(RunModel)
+        .where(RunModel.id == current_resource.id)
+        .values(
+            run_spec=run_spec.json(),
+            priority=run_spec.configuration.priority,
+        )
     )
     run = await get_run_by_name(
         session=session,
@@ -495,6 +500,7 @@ async def submit_run(
         status=RunStatus.SUBMITTED,
         run_spec=run_spec.json(),
         last_processed_at=submitted_at,
+        priority=run_spec.configuration.priority,
     )
     session.add(run_model)
 
@@ -721,15 +727,15 @@ async def _get_pool_offers(
     pool_instances = [i for i in pool_instances if i.id not in detaching_instances_ids]
     multinode = job.job_spec.jobs_per_replica > 1
 
-    if not multinode:
-        shared_instances_with_offers = get_shared_pool_instances_with_offers(
-            pool_instances=pool_instances,
-            profile=run_spec.merged_profile,
-            requirements=job.job_spec.requirements,
-            volumes=volumes,
-        )
-        for _, offer in shared_instances_with_offers:
-            pool_offers.append(offer)
+    shared_instances_with_offers = get_shared_pool_instances_with_offers(
+        pool_instances=pool_instances,
+        profile=run_spec.merged_profile,
+        requirements=job.job_spec.requirements,
+        volumes=volumes,
+        multinode=multinode,
+    )
+    for _, offer in shared_instances_with_offers:
+        pool_offers.append(offer)
 
     nonshared_instances = filter_pool_instances(
         pool_instances=pool_instances,
@@ -852,6 +858,13 @@ def _get_job_submission_cost(job_submission: JobSubmission) -> float:
 
 
 def _validate_run_spec_and_set_defaults(run_spec: RunSpec):
+    # This function may set defaults for null run_spec values,
+    # although most defaults are resolved when building job_spec,
+    # so that we can keep both the original user-supplied value (null in run_spec)
+    # and the default in job_spec.
+    # If a property is stored in job_spec, resolve the default there.
+    # Server defaults are preferable to client defaults so that
+    # the defaults depend on the server version, not the client version.
     if run_spec.run_name is not None:
         validate_dstack_resource_name(run_spec.run_name)
     for mount_point in run_spec.configuration.volumes:
@@ -870,16 +883,19 @@ def _validate_run_spec_and_set_defaults(run_spec: RunSpec):
     if (
         run_spec.merged_profile.utilization_policy is not None
         and run_spec.merged_profile.utilization_policy.time_window
-        > settings.SERVER_METRICS_TTL_SECONDS
+        > settings.SERVER_METRICS_RUNNING_TTL_SECONDS
     ):
         raise ServerClientError(
-            f"Maximum utilization_policy.time_window is {settings.SERVER_METRICS_TTL_SECONDS}s"
+            f"Maximum utilization_policy.time_window is {settings.SERVER_METRICS_RUNNING_TTL_SECONDS}s"
         )
+    if run_spec.configuration.priority is None:
+        run_spec.configuration.priority = RUN_PRIORITY_DEFAULT
     set_resources_defaults(run_spec.configuration.resources)
 
 
 _UPDATABLE_SPEC_FIELDS = ["repo_code_hash", "configuration"]
-_CONF_TYPE_TO_UPDATABLE_FIELDS = {
+_CONF_UPDATABLE_FIELDS = ["priority"]
+_TYPE_SPECIFIC_CONF_UPDATABLE_FIELDS = {
     "dev-environment": ["inactivity_duration"],
     # Most service fields can be updated via replica redeployment.
     # TODO: Allow updating other fields when rolling deployment is supported.
@@ -915,12 +931,9 @@ def _check_can_update_configuration(
         raise ServerClientError(
             f"Configuration type changed from {current.type} to {new.type}, cannot update"
         )
-    updatable_fields = _CONF_TYPE_TO_UPDATABLE_FIELDS.get(new.type)
-    if updatable_fields is None:
-        raise ServerClientError(
-            f"Can only update {', '.join(_CONF_TYPE_TO_UPDATABLE_FIELDS)} configurations."
-            f" Not {new.type}"
-        )
+    updatable_fields = _CONF_UPDATABLE_FIELDS + _TYPE_SPECIFIC_CONF_UPDATABLE_FIELDS.get(
+        new.type, []
+    )
     diff = diff_models(current, new)
     changed_fields = list(diff.keys())
     for key in changed_fields:
@@ -1,4 +1,5 @@
 import os
+import warnings
 from pathlib import Path
 
 DSTACK_DIR_PATH = Path("~/.dstack/").expanduser()
@@ -45,7 +46,25 @@ SERVER_CLOUDWATCH_LOG_REGION = os.getenv("DSTACK_SERVER_CLOUDWATCH_LOG_REGION")
 
 SERVER_GCP_LOGGING_PROJECT = os.getenv("DSTACK_SERVER_GCP_LOGGING_PROJECT")
 
-SERVER_METRICS_TTL_SECONDS = int(os.getenv("DSTACK_SERVER_METRICS_TTL_SECONDS", 3600))
+SERVER_METRICS_RUNNING_TTL_SECONDS: int
+_SERVER_METRICS_RUNNING_TTL_SECONDS = os.getenv("DSTACK_SERVER_METRICS_RUNNING_TTL_SECONDS")
+if _SERVER_METRICS_RUNNING_TTL_SECONDS is None:
+    _SERVER_METRICS_RUNNING_TTL_SECONDS = os.getenv("DSTACK_SERVER_METRICS_TTL_SECONDS")
+    if _SERVER_METRICS_RUNNING_TTL_SECONDS is not None:
+        warnings.warn(
+            (
+                "DSTACK_SERVER_METRICS_TTL_SECONDS is deprecated,"
+                " use DSTACK_SERVER_METRICS_RUNNING_TTL_SECONDS instead"
+            ),
+            DeprecationWarning,
+        )
+    else:
+        _SERVER_METRICS_RUNNING_TTL_SECONDS = 3600
+SERVER_METRICS_RUNNING_TTL_SECONDS = int(_SERVER_METRICS_RUNNING_TTL_SECONDS)
+del _SERVER_METRICS_RUNNING_TTL_SECONDS
+SERVER_METRICS_FINISHED_TTL_SECONDS = int(
+    os.getenv("DSTACK_SERVER_METRICS_FINISHED_TTL_SECONDS", 7 * 24 * 3600)
+)
 
 DEFAULT_PROJECT_NAME = "main"
 
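The block above is a deprecation shim: prefer `DSTACK_SERVER_METRICS_RUNNING_TTL_SECONDS`, fall back to the old `DSTACK_SERVER_METRICS_TTL_SECONDS` with a `DeprecationWarning`, otherwise default to 3600. The same idea as a reusable sketch (the helper name is illustrative, not part of dstack):

    import os
    import warnings


    def getenv_with_deprecated_alias(name: str, deprecated: str, default: str) -> str:
        # Prefer the new variable, fall back to the deprecated alias
        # with a DeprecationWarning, otherwise use the default.
        value = os.getenv(name)
        if value is not None:
            return value
        value = os.getenv(deprecated)
        if value is not None:
            warnings.warn(f"{deprecated} is deprecated, use {name} instead", DeprecationWarning)
            return value
        return default


    ttl = int(
        getenv_with_deprecated_alias(
            "DSTACK_SERVER_METRICS_RUNNING_TTL_SECONDS",
            "DSTACK_SERVER_METRICS_TTL_SECONDS",
            "3600",
        )
    )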
@@ -66,6 +85,7 @@ DEFAULT_SERVICE_CLIENT_MAX_BODY_SIZE = int(
 USER_PROJECT_DEFAULT_QUOTA = int(os.getenv("DSTACK_USER_PROJECT_DEFAULT_QUOTA", 10))
 FORBID_SERVICES_WITHOUT_GATEWAY = os.getenv("DSTACK_FORBID_SERVICES_WITHOUT_GATEWAY") is not None
 
+SERVER_CODE_UPLOAD_LIMIT = int(os.getenv("DSTACK_SERVER_CODE_UPLOAD_LIMIT", 2 * 2**20))
 
 # Development settings
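Combined with the `upload_code` check earlier in this diff (`if SERVER_CODE_UPLOAD_LIMIT > 0 and ...`), the new setting defaults to 2 MiB and a non-positive value disables the check entirely. A minimal sketch of the resulting behavior (the helper is made up for this example):

    import os

    # Mirrors the setting above: 2 MiB default, overridable via the env var.
    SERVER_CODE_UPLOAD_LIMIT = int(os.getenv("DSTACK_SERVER_CODE_UPLOAD_LIMIT", 2 * 2**20))


    def code_upload_allowed(request_size: int) -> bool:
        # A non-positive limit disables the size check entirely.
        return SERVER_CODE_UPLOAD_LIMIT <= 0 or request_size <= SERVER_CODE_UPLOAD_LIMIT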