dstack 0.19.9__py3-none-any.whl → 0.19.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of dstack might be problematic.

Files changed (29)
  1. dstack/_internal/cli/commands/config.py +1 -1
  2. dstack/_internal/cli/commands/project.py +161 -0
  3. dstack/_internal/cli/commands/ps.py +9 -2
  4. dstack/_internal/cli/main.py +2 -0
  5. dstack/_internal/core/backends/azure/compute.py +5 -2
  6. dstack/_internal/core/models/configurations.py +19 -3
  7. dstack/_internal/server/background/tasks/process_metrics.py +23 -21
  8. dstack/_internal/server/background/tasks/process_submitted_jobs.py +21 -12
  9. dstack/_internal/server/migrations/versions/bca2fdf130bf_add_runmodel_priority.py +34 -0
  10. dstack/_internal/server/models.py +1 -0
  11. dstack/_internal/server/routers/repos.py +8 -4
  12. dstack/_internal/server/services/instances.py +6 -2
  13. dstack/_internal/server/services/jobs/configurators/base.py +3 -3
  14. dstack/_internal/server/services/runs.py +31 -18
  15. dstack/_internal/server/settings.py +1 -0
  16. dstack/_internal/server/statics/index.html +1 -1
  17. dstack/_internal/server/statics/{main-b4f65323f5df007e1664.js → main-b4803049eac16aea9a49.js} +4 -4
  18. dstack/_internal/server/statics/{main-b4f65323f5df007e1664.js.map → main-b4803049eac16aea9a49.js.map} +1 -1
  19. dstack/_internal/server/testing/common.py +2 -0
  20. dstack/_internal/server/utils/routers.py +3 -6
  21. dstack/_internal/settings.py +4 -0
  22. dstack/api/_public/runs.py +6 -3
  23. dstack/api/server/_runs.py +2 -0
  24. dstack/version.py +1 -1
  25. {dstack-0.19.9.dist-info → dstack-0.19.10.dist-info}/METADATA +11 -6
  26. {dstack-0.19.9.dist-info → dstack-0.19.10.dist-info}/RECORD +29 -27
  27. {dstack-0.19.9.dist-info → dstack-0.19.10.dist-info}/WHEEL +0 -0
  28. {dstack-0.19.9.dist-info → dstack-0.19.10.dist-info}/entry_points.txt +0 -0
  29. {dstack-0.19.9.dist-info → dstack-0.19.10.dist-info}/licenses/LICENSE.md +0 -0
dstack/_internal/cli/commands/config.py
@@ -14,7 +14,7 @@ logger = get_logger(__name__)
 
 class ConfigCommand(BaseCommand):
     NAME = "config"
-    DESCRIPTION = "Configure CLI"
+    DESCRIPTION = "Configure CLI (deprecated; use `dstack project`)"
 
     def _register(self):
         super()._register()
dstack/_internal/cli/commands/project.py (new file)
@@ -0,0 +1,161 @@
+import argparse
+
+from requests import HTTPError
+from rich.table import Table
+
+import dstack.api.server
+from dstack._internal.cli.commands import BaseCommand
+from dstack._internal.cli.utils.common import confirm_ask, console
+from dstack._internal.core.errors import ClientError, CLIError
+from dstack._internal.core.services.configs import ConfigManager
+from dstack._internal.utils.logging import get_logger
+
+logger = get_logger(__name__)
+
+
+class ProjectCommand(BaseCommand):
+    NAME = "project"
+    DESCRIPTION = "Manage projects configs"
+
+    def _register(self):
+        super()._register()
+        subparsers = self._parser.add_subparsers(dest="subcommand", help="Command to execute")
+
+        # Add subcommand
+        add_parser = subparsers.add_parser("add", help="Add or update a project config")
+        add_parser.add_argument(
+            "--name", type=str, help="The name of the project to configure", required=True
+        )
+        add_parser.add_argument("--url", type=str, help="Server url", required=True)
+        add_parser.add_argument("--token", type=str, help="User token", required=True)
+        add_parser.add_argument(
+            "-y",
+            "--yes",
+            help="Don't ask for confirmation (e.g. update the config)",
+            action="store_true",
+        )
+        add_parser.add_argument(
+            "-n",
+            "--no",
+            help="Don't ask for confirmation (e.g. do not update the config)",
+            action="store_true",
+        )
+        add_parser.set_defaults(subfunc=self._add)
+
+        # Delete subcommand
+        delete_parser = subparsers.add_parser("delete", help="Delete a project config")
+        delete_parser.add_argument(
+            "--name", type=str, help="The name of the project to delete", required=True
+        )
+        delete_parser.add_argument(
+            "-y",
+            "--yes",
+            help="Don't ask for confirmation",
+            action="store_true",
+        )
+        delete_parser.set_defaults(subfunc=self._delete)
+
+        # List subcommand
+        list_parser = subparsers.add_parser("list", help="List configured projects")
+        list_parser.set_defaults(subfunc=self._list)
+
+        # Set default subcommand
+        set_default_parser = subparsers.add_parser("set-default", help="Set default project")
+        set_default_parser.add_argument(
+            "name", type=str, help="The name of the project to set as default"
+        )
+        set_default_parser.set_defaults(subfunc=self._set_default)
+
+    def _command(self, args: argparse.Namespace):
+        if not hasattr(args, "subfunc"):
+            args.subfunc = self._list
+        args.subfunc(args)
+
+    def _add(self, args: argparse.Namespace):
+        config_manager = ConfigManager()
+        api_client = dstack.api.server.APIClient(base_url=args.url, token=args.token)
+        try:
+            api_client.projects.get(args.name)
+        except HTTPError as e:
+            if e.response.status_code == 403:
+                raise CLIError("Forbidden. Ensure the token is valid.")
+            elif e.response.status_code == 404:
+                raise CLIError(f"Project '{args.name}' not found.")
+            else:
+                raise e
+        default_project = config_manager.get_project_config()
+        if (
+            default_project is None
+            or default_project.name != args.name
+            or default_project.url != args.url
+            or default_project.token != args.token
+        ):
+            set_it_as_default = (
+                (
+                    args.yes
+                    or not default_project
+                    or confirm_ask(f"Set '{args.name}' as your default project?")
+                )
+                if not args.no
+                else False
+            )
+            config_manager.configure_project(
+                name=args.name, url=args.url, token=args.token, default=set_it_as_default
+            )
+            config_manager.save()
+            logger.info(
+                f"Configuration updated at {config_manager.config_filepath}", {"show_path": False}
+            )
+
+    def _delete(self, args: argparse.Namespace):
+        config_manager = ConfigManager()
+        if args.yes or confirm_ask(f"Are you sure you want to delete project '{args.name}'?"):
+            config_manager.delete_project(args.name)
+            config_manager.save()
+            console.print("[grey58]OK[/]")
+
+    def _list(self, args: argparse.Namespace):
+        config_manager = ConfigManager()
+        default_project = config_manager.get_project_config()
+
+        table = Table(box=None)
+        table.add_column("PROJECT", style="bold", no_wrap=True)
+        table.add_column("URL", style="grey58")
+        table.add_column("USER", style="grey58")
+        table.add_column("DEFAULT", justify="center")
+
+        for project_name in config_manager.list_projects():
+            project_config = config_manager.get_project_config(project_name)
+            is_default = project_name == default_project.name if default_project else False
+
+            # Get username from API
+            try:
+                api_client = dstack.api.server.APIClient(
+                    base_url=project_config.url, token=project_config.token
+                )
+                user_info = api_client.users.get_my_user()
+                username = user_info.username
+            except ClientError:
+                username = "(invalid token)"
+
+            table.add_row(
+                project_name,
+                project_config.url,
+                username,
+                "✓" if is_default else "",
+                style="bold" if is_default else None,
+            )
+
+        console.print(table)
+
+    def _set_default(self, args: argparse.Namespace):
+        config_manager = ConfigManager()
+        project_config = config_manager.get_project_config(args.name)
+        if project_config is None:
+            raise CLIError(f"Project '{args.name}' not found")
+
+        config_manager.configure_project(
+            name=args.name, url=project_config.url, token=project_config.token, default=True
+        )
+        config_manager.save()
+        console.print("[grey58]OK[/]")
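The new command is a thin wrapper over ConfigManager, the same config store behind the now-deprecated `dstack config`. A minimal sketch of the equivalent flow from Python, using only the ConfigManager methods visible in the hunk above; the URL and token are placeholder values:

from dstack._internal.core.services.configs import ConfigManager

config_manager = ConfigManager()
config_manager.configure_project(
    name="my-project",
    url="https://dstack.example.com",  # placeholder server URL
    token="<user-token>",              # placeholder token
    default=True,                      # same effect as `dstack project set-default`
)
config_manager.save()
print(config_manager.config_filepath)  # path of the written config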
dstack/_internal/cli/commands/ps.py
@@ -36,10 +36,17 @@ class PsCommand(APIBaseCommand):
             help="Watch statuses of runs in realtime",
             action="store_true",
         )
+        self._parser.add_argument(
+            "-n",
+            "--last",
+            help="Show only the last N runs. Implies --all",
+            type=int,
+            default=None,
+        )
 
     def _command(self, args: argparse.Namespace):
         super()._command(args)
-        runs = self.api.runs.list(all=args.all)
+        runs = self.api.runs.list(all=args.all, limit=args.last)
         if not args.watch:
             console.print(run_utils.get_runs_table(runs, verbose=args.verbose))
             return
@@ -49,6 +56,6 @@ class PsCommand(APIBaseCommand):
             while True:
                 live.update(run_utils.get_runs_table(runs, verbose=args.verbose))
                 time.sleep(LIVE_TABLE_PROVISION_INTERVAL_SECS)
-                runs = self.api.runs.list(all=args.all)
+                runs = self.api.runs.list(all=args.all, limit=args.last)
         except KeyboardInterrupt:
             pass
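The `--last` flag threads through to the client API (`dstack/api/_public/runs.py` also changes in this release, per the file list). A hedged usage sketch, assuming the public `runs.list` gained a `limit` parameter mirroring this CLI change:

from dstack.api import Client

client = Client.from_config()  # picks up the default project config
# Equivalent of `dstack ps -n 10`: the ten most recent runs across all users.
runs = client.runs.list(all=True, limit=10)
for run in runs:
    print(run.name, run.status)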
dstack/_internal/cli/main.py
@@ -15,6 +15,7 @@ from dstack._internal.cli.commands.init import InitCommand
 from dstack._internal.cli.commands.logs import LogsCommand
 from dstack._internal.cli.commands.metrics import MetricsCommand
 from dstack._internal.cli.commands.offer import OfferCommand
+from dstack._internal.cli.commands.project import ProjectCommand
 from dstack._internal.cli.commands.ps import PsCommand
 from dstack._internal.cli.commands.server import ServerCommand
 from dstack._internal.cli.commands.stats import StatsCommand
@@ -69,6 +70,7 @@ def main():
     OfferCommand.register(subparsers)
     LogsCommand.register(subparsers)
     MetricsCommand.register(subparsers)
+    ProjectCommand.register(subparsers)
     PsCommand.register(subparsers)
     ServerCommand.register(subparsers)
     StatsCommand.register(subparsers)
dstack/_internal/core/backends/azure/compute.py
@@ -391,9 +391,12 @@ class VMImageVariant(enum.Enum):
 
 
 _SUPPORTED_VM_SERIES_PATTERNS = [
-    r"D(\d+)s_v3",  # Dsv3-series
-    r"E(\d+)i?s_v4",  # Esv4-series
+    # TODO: Support newer CPU series (Dsv6, Esv6).
+    # They are NVMe-only and require marking the VM image as NVMe.
+    r"D(\d+)s_v3",  # Dsv3-series (general purpose)
+    r"E(\d+)i?s_v4",  # Esv4-series (memory optimized)
     r"E(\d+)-(\d+)s_v4",  # Esv4-series (constrained vCPU)
+    r"F(\d+)s_v2",  # Fsv2-series (compute optimized)
     r"NC(\d+)s_v3",  # NCv3-series [V100 16GB]
     r"NC(\d+)as_T4_v3",  # NCasT4_v3-series [T4]
     r"ND(\d+)rs_v2",  # NDv2-series [8xV100 32GB]
dstack/_internal/core/models/configurations.py
@@ -23,6 +23,9 @@ ValidPort = conint(gt=0, le=65536)
 MAX_INT64 = 2**63 - 1
 SERVICE_HTTPS_DEFAULT = True
 STRIP_PREFIX_DEFAULT = True
+RUN_PRIOTIRY_MIN = 0
+RUN_PRIOTIRY_MAX = 100
+RUN_PRIORITY_DEFAULT = 0
 
 
 class RunConfigurationType(str, Enum):
@@ -77,7 +80,8 @@ class ScalingSpec(CoreModel):
         Field(
             description="The target value of the metric. "
             "The number of replicas is calculated based on this number and automatically adjusts "
-            "(scales up or down) as this metric changes"
+            "(scales up or down) as this metric changes",
+            gt=0,
         ),
     ]
     scale_up_delay: Annotated[
@@ -221,14 +225,26 @@ class BaseRunConfiguration(CoreModel):
             )
         ),
     ] = None
-    # deprecated since 0.18.31; task, service -- no effect; dev-environment -- executed right before `init`
-    setup: CommandsList = []
     resources: Annotated[
         ResourcesSpec, Field(description="The resources requirements to run the configuration")
     ] = ResourcesSpec()
+    priority: Annotated[
+        Optional[int],
+        Field(
+            ge=RUN_PRIOTIRY_MIN,
+            le=RUN_PRIOTIRY_MAX,
+            description=(
+                f"The priority of the run, an integer between `{RUN_PRIOTIRY_MIN}` and `{RUN_PRIOTIRY_MAX}`."
+                " `dstack` tries to provision runs with higher priority first."
+                f" Defaults to `{RUN_PRIORITY_DEFAULT}`"
+            ),
+        ),
+    ] = None
     volumes: Annotated[
         List[Union[MountPoint, str]], Field(description="The volumes mount points")
     ] = []
+    # deprecated since 0.18.31; task, service -- no effect; dev-environment -- executed right before `init`
+    setup: CommandsList = []
 
     @validator("python", pre=True, always=True)
     def convert_python(cls, v, values) -> Optional[PythonVersion]:
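The bounds are enforced by pydantic before a run ever reaches the server queue. A standalone sketch of the same ge/le constraint, as an illustration rather than the dstack class itself:

from typing import Optional

from pydantic import BaseModel, Field, ValidationError

class Example(BaseModel):
    # Mirrors the field above: 0 <= priority <= 100, default resolved server-side.
    priority: Optional[int] = Field(None, ge=0, le=100)

Example(priority=50)    # ok
Example(priority=None)  # ok: the server later fills in the default (0)
try:
    Example(priority=150)
except ValidationError as e:
    print(e)            # rejected: above the upper bound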
dstack/_internal/server/background/tasks/process_metrics.py
@@ -2,7 +2,7 @@ import asyncio
 import json
 from typing import Dict, List, Optional
 
-from sqlalchemy import delete, select
+from sqlalchemy import Delete, delete, select
 from sqlalchemy.orm import joinedload
 
 from dstack._internal.core.consts import DSTACK_RUNNER_HTTP_PORT
@@ -49,27 +49,29 @@ async def delete_metrics():
     finished_timestamp_micro_cutoff = (
         now_timestamp_micro - settings.SERVER_METRICS_FINISHED_TTL_SECONDS * 1_000_000
     )
+    await asyncio.gather(
+        _execute_delete_statement(
+            delete(JobMetricsPoint).where(
+                JobMetricsPoint.job_id.in_(
+                    select(JobModel.id).where(JobModel.status.in_([JobStatus.RUNNING]))
+                ),
+                JobMetricsPoint.timestamp_micro < running_timestamp_micro_cutoff,
+            )
+        ),
+        _execute_delete_statement(
+            delete(JobMetricsPoint).where(
+                JobMetricsPoint.job_id.in_(
+                    select(JobModel.id).where(JobModel.status.in_(JobStatus.finished_statuses()))
+                ),
+                JobMetricsPoint.timestamp_micro < finished_timestamp_micro_cutoff,
+            )
+        ),
+    )
+
+
+async def _execute_delete_statement(stmt: Delete) -> None:
     async with get_session_ctx() as session:
-        await asyncio.gather(
-            session.execute(
-                delete(JobMetricsPoint).where(
-                    JobMetricsPoint.job_id.in_(
-                        select(JobModel.id).where(JobModel.status.in_([JobStatus.RUNNING]))
-                    ),
-                    JobMetricsPoint.timestamp_micro < running_timestamp_micro_cutoff,
-                )
-            ),
-            session.execute(
-                delete(JobMetricsPoint).where(
-                    JobMetricsPoint.job_id.in_(
-                        select(JobModel.id).where(
-                            JobModel.status.in_(JobStatus.finished_statuses())
-                        )
-                    ),
-                    JobMetricsPoint.timestamp_micro < finished_timestamp_micro_cutoff,
-                )
-            ),
-        )
+        await session.execute(stmt)
         await session.commit()
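A plausible rationale for this refactor: a single SQLAlchemy AsyncSession cannot execute statements concurrently, so gathering two session.execute() calls on one session (the old shape) does not actually run them in parallel and is unsafe; giving each DELETE its own session makes the asyncio.gather genuinely concurrent. A self-contained sketch of the pattern, with a placeholder database URL:

import asyncio

from sqlalchemy import Delete
from sqlalchemy.ext.asyncio import async_sessionmaker, create_async_engine

engine = create_async_engine("sqlite+aiosqlite:///:memory:")  # placeholder URL
session_factory = async_sessionmaker(engine)

async def execute_delete(stmt: Delete) -> None:
    # One session per statement: sessions, not the engine, are the unit of concurrency.
    async with session_factory() as session:
        await session.execute(stmt)
        await session.commit()

async def run_concurrently(stmt_a: Delete, stmt_b: Delete) -> None:
    await asyncio.gather(execute_delete(stmt_a), execute_delete(stmt_b))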
dstack/_internal/server/background/tasks/process_submitted_jobs.py
@@ -93,11 +93,20 @@ async def _process_next_submitted_job():
     async with lock:
         res = await session.execute(
             select(JobModel)
+            .join(JobModel.run)
             .where(
                 JobModel.status == JobStatus.SUBMITTED,
                 JobModel.id.not_in(lockset),
            )
-            .order_by(JobModel.last_processed_at.asc())
+            # Jobs are process in FIFO sorted by priority globally,
+            # thus runs from different project can "overtake" each other by using higher priorities.
+            # That's not a big problem as long as projects do not compete for the same compute resources.
+            # Jobs with lower priorities from other projects will be processed without major lag
+            # as long as new higher priority runs are not constantly submitted.
+            # TODO: Consider processing jobs from different projects fairly/round-robin
+            # Fully fair processing can be tricky to implement via the current DB queue as
+            # there can be many projects and we are limited by the max DB connections.
+            .order_by(RunModel.priority.desc(), JobModel.last_processed_at.asc())
             .limit(1)
             .with_for_update(skip_locked=True)
         )
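The new ordering keeps FIFO within a priority level while letting higher-priority runs jump the queue. An illustrative sort with the same key as the ORDER BY:

# Illustrative only: the queue order produced by
# ORDER BY priority DESC, last_processed_at ASC.
jobs = [
    {"id": "a", "priority": 0, "last_processed_at": 1},
    {"id": "b", "priority": 100, "last_processed_at": 5},
    {"id": "c", "priority": 0, "last_processed_at": 3},
]
queue = sorted(jobs, key=lambda j: (-j["priority"], j["last_processed_at"]))
assert [j["id"] for j in queue] == ["b", "a", "c"]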
@@ -360,16 +369,16 @@ async def _assign_job_to_pool_instance(
         (instance, common_utils.get_or_error(get_instance_offer(instance)))
         for instance in nonshared_instances
     ]
-    if not multinode:
-        shared_instances_with_offers = get_shared_pool_instances_with_offers(
-            pool_instances=pool_instances,
-            profile=profile,
-            requirements=job.job_spec.requirements,
-            idle_only=True,
-            fleet_model=fleet_model,
-            volumes=volumes,
-        )
-        instances_with_offers.extend(shared_instances_with_offers)
+    shared_instances_with_offers = get_shared_pool_instances_with_offers(
+        pool_instances=pool_instances,
+        profile=profile,
+        requirements=job.job_spec.requirements,
+        idle_only=True,
+        fleet_model=fleet_model,
+        multinode=multinode,
+        volumes=volumes,
+    )
+    instances_with_offers.extend(shared_instances_with_offers)
 
     if len(instances_with_offers) == 0:
         return None
@@ -572,7 +581,7 @@ def _create_instance_model_for_job(
 
 
 def _prepare_job_runtime_data(offer: InstanceOfferWithAvailability) -> JobRuntimeData:
-    if offer.total_blocks == 1:
+    if offer.blocks == offer.total_blocks:
         if env_utils.get_bool("DSTACK_FORCE_BRIDGE_NETWORK"):
             network_mode = NetworkMode.BRIDGE
         else:
dstack/_internal/server/migrations/versions/bca2fdf130bf_add_runmodel_priority.py (new file)
@@ -0,0 +1,34 @@
+"""Add RunModel.priority
+
+Revision ID: bca2fdf130bf
+Revises: 20166748b60c
+Create Date: 2025-05-14 15:24:21.269775
+
+"""
+
+import sqlalchemy as sa
+from alembic import op
+
+# revision identifiers, used by Alembic.
+revision = "bca2fdf130bf"
+down_revision = "20166748b60c"
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    with op.batch_alter_table("runs", schema=None) as batch_op:
+        batch_op.add_column(sa.Column("priority", sa.Integer(), nullable=True))
+        batch_op.execute("UPDATE runs SET priority = 0")
+    with op.batch_alter_table("runs", schema=None) as batch_op:
+        batch_op.alter_column("priority", nullable=False)
+    # ### end Alembic commands ###
+
+
+def downgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    with op.batch_alter_table("runs", schema=None) as batch_op:
+        batch_op.drop_column("priority")
+
+    # ### end Alembic commands ###
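The upgrade adds the column in three steps (nullable column, backfill, NOT NULL) so existing rows never violate the constraint. A common one-step alternative, shown here only for comparison and not what ships in this release, is a server_default, which lets the database backfill existing rows itself:

import sqlalchemy as sa
from alembic import op

# Hypothetical alternative to the three-step upgrade, for comparison only.
def upgrade() -> None:
    with op.batch_alter_table("runs", schema=None) as batch_op:
        batch_op.add_column(
            sa.Column("priority", sa.Integer(), nullable=False, server_default="0")
        )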
dstack/_internal/server/models.py
@@ -348,6 +348,7 @@ class RunModel(BaseModel):
     resubmission_attempt: Mapped[int] = mapped_column(Integer, default=0)
     run_spec: Mapped[str] = mapped_column(Text)
     service_spec: Mapped[Optional[str]] = mapped_column(Text)
+    priority: Mapped[int] = mapped_column(Integer, default=0)
 
     jobs: Mapped[List["JobModel"]] = relationship(
         back_populates="run", lazy="selectin", order_by="[JobModel.replica_num, JobModel.job_num]"
dstack/_internal/server/routers/repos.py
@@ -1,6 +1,7 @@
 from typing import List, Tuple
 
 from fastapi import APIRouter, Depends, Request, UploadFile
+from humanize import naturalsize
 from sqlalchemy.ext.asyncio import AsyncSession
 
 from dstack._internal.core.errors import ResourceNotExistsError, ServerClientError
@@ -14,9 +15,10 @@ from dstack._internal.server.schemas.repos import (
 )
 from dstack._internal.server.security.permissions import ProjectMember
 from dstack._internal.server.services import repos
+from dstack._internal.server.settings import SERVER_CODE_UPLOAD_LIMIT
 from dstack._internal.server.utils.routers import (
     get_base_api_additional_responses,
-    request_size_exceeded,
+    get_request_size,
 )
 
 router = APIRouter(
@@ -94,10 +96,12 @@ async def upload_code(
     session: AsyncSession = Depends(get_session),
     user_project: Tuple[UserModel, ProjectModel] = Depends(ProjectMember()),
 ):
-    if request_size_exceeded(request, limit=2 * 2**20):
+    request_size = get_request_size(request)
+    if SERVER_CODE_UPLOAD_LIMIT > 0 and request_size > SERVER_CODE_UPLOAD_LIMIT:
         raise ServerClientError(
-            "Repo diff size exceeds the limit of 2MB. "
-            "Use .gitignore to exclude large files from the repo."
+            f"Repo diff size is {naturalsize(request_size)}, which exceeds the limit of "
+            f"{naturalsize(SERVER_CODE_UPLOAD_LIMIT)}. Use .gitignore to exclude large files from the repo. This "
+            f"limit can be modified by setting the DSTACK_SERVER_CODE_UPLOAD_LIMIT_BYTES environment variable"
         )
     _, project = user_project
     await repos.upload_code(
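The get_request_size helper lives in dstack/_internal/server/utils/routers.py, whose hunk is not shown on this page (file 20 in the list, +3 -6). A plausible sketch, assuming it simply reads the request's Content-Length header:

from fastapi import Request

def get_request_size(request: Request) -> int:
    # Hypothetical sketch; the real helper is in utils/routers.py.
    return int(request.headers.get("content-length") or 0)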
dstack/_internal/server/services/instances.py
@@ -235,6 +235,7 @@ def get_shared_pool_instances_with_offers(
     *,
     idle_only: bool = False,
     fleet_model: Optional[FleetModel] = None,
+    multinode: bool = False,
     volumes: Optional[List[List[Volume]]] = None,
 ) -> list[tuple[InstanceModel, InstanceOfferWithAvailability]]:
     instances_with_offers: list[tuple[InstanceModel, InstanceOfferWithAvailability]] = []
@@ -243,19 +244,22 @@ def get_shared_pool_instances_with_offers(
         pool_instances=pool_instances,
         profile=profile,
         fleet_model=fleet_model,
-        multinode=False,
+        multinode=multinode,
         volumes=volumes,
         shared=True,
     )
     for instance in filtered_instances:
         if idle_only and instance.status not in [InstanceStatus.IDLE, InstanceStatus.BUSY]:
             continue
+        if multinode and instance.busy_blocks > 0:
+            continue
         offer = get_instance_offer(instance)
         if offer is None:
             continue
         total_blocks = common_utils.get_or_error(instance.total_blocks)
         idle_blocks = total_blocks - instance.busy_blocks
-        for blocks in range(1, total_blocks + 1):
+        min_blocks = total_blocks if multinode else 1
+        for blocks in range(min_blocks, total_blocks + 1):
             shared_offer = generate_shared_offer(offer, blocks, total_blocks)
             catalog_item = offer_to_catalog_item(shared_offer)
             if gpuhunt.matches(catalog_item, query_filter):
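Net effect of the multinode flag on shared instances: multinode jobs are only offered whole, fully idle instances, while other jobs can take any partial block count. The block-range logic reduces to:

# Illustrative: which shared-offer sizes get generated per instance.
def candidate_block_counts(total_blocks: int, multinode: bool) -> list[int]:
    min_blocks = total_blocks if multinode else 1
    return list(range(min_blocks, total_blocks + 1))

assert candidate_block_counts(4, multinode=False) == [1, 2, 3, 4]
assert candidate_block_counts(4, multinode=True) == [4]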
dstack/_internal/server/services/jobs/configurators/base.py
@@ -6,7 +6,7 @@ from typing import Dict, List, Optional, Union
 
 from cachetools import TTLCache, cached
 
-import dstack.version as version
+from dstack._internal import settings
 from dstack._internal.core.errors import DockerRegistryError, ServerClientError
 from dstack._internal.core.models.common import RegistryAuth
 from dstack._internal.core.models.configurations import (
@@ -53,14 +53,14 @@ def get_default_image(python_version: str, nvcc: bool = False) -> str:
     suffix = ""
     if nvcc:
         suffix = "-devel"
-    return f"dstackai/base:py{python_version}-{version.base_image}-cuda-12.1{suffix}"
+    return f"{settings.DSTACK_BASE_IMAGE}:py{python_version}-{settings.DSTACK_BASE_IMAGE_VERSION}-cuda-12.1{suffix}"
 
 
 class JobConfigurator(ABC):
     TYPE: RunConfigurationType
 
     _image_config: Optional[ImageConfig] = None
-    # JobSSHKey should be shared for all jobs in a replica for inter-node communitation.
+    # JobSSHKey should be shared for all jobs in a replica for inter-node communication.
     _job_ssh_key: Optional[JobSSHKey] = None
 
     def __init__(self, run_spec: RunSpec):
dstack/_internal/server/services/runs.py
@@ -16,7 +16,7 @@ from dstack._internal.core.errors import (
     ServerClientError,
 )
 from dstack._internal.core.models.common import ApplyAction
-from dstack._internal.core.models.configurations import AnyRunConfiguration
+from dstack._internal.core.models.configurations import RUN_PRIORITY_DEFAULT, AnyRunConfiguration
 from dstack._internal.core.models.instances import (
     InstanceAvailability,
     InstanceOfferWithAvailability,
@@ -434,7 +434,12 @@ async def apply_plan(
     # FIXME: potentially long write transaction
     # Avoid getting run_model after update
     await session.execute(
-        update(RunModel).where(RunModel.id == current_resource.id).values(run_spec=run_spec.json())
+        update(RunModel)
+        .where(RunModel.id == current_resource.id)
+        .values(
+            run_spec=run_spec.json(),
+            priority=run_spec.configuration.priority,
+        )
     )
     run = await get_run_by_name(
         session=session,
@@ -495,6 +500,7 @@ async def submit_run(
         status=RunStatus.SUBMITTED,
         run_spec=run_spec.json(),
         last_processed_at=submitted_at,
+        priority=run_spec.configuration.priority,
     )
     session.add(run_model)
 
@@ -721,15 +727,15 @@ async def _get_pool_offers(
     pool_instances = [i for i in pool_instances if i.id not in detaching_instances_ids]
     multinode = job.job_spec.jobs_per_replica > 1
 
-    if not multinode:
-        shared_instances_with_offers = get_shared_pool_instances_with_offers(
-            pool_instances=pool_instances,
-            profile=run_spec.merged_profile,
-            requirements=job.job_spec.requirements,
-            volumes=volumes,
-        )
-        for _, offer in shared_instances_with_offers:
-            pool_offers.append(offer)
+    shared_instances_with_offers = get_shared_pool_instances_with_offers(
+        pool_instances=pool_instances,
+        profile=run_spec.merged_profile,
+        requirements=job.job_spec.requirements,
+        volumes=volumes,
+        multinode=multinode,
+    )
+    for _, offer in shared_instances_with_offers:
+        pool_offers.append(offer)
 
     nonshared_instances = filter_pool_instances(
         pool_instances=pool_instances,
@@ -852,6 +858,13 @@ def _get_job_submission_cost(job_submission: JobSubmission) -> float:
 
 
 def _validate_run_spec_and_set_defaults(run_spec: RunSpec):
+    # This function may set defaults for null run_spec values,
+    # although most defaults are resolved when building job_spec
+    # so that we can keep both the original user-supplied value (null in run_spec)
+    # and the default in job_spec.
+    # If a property is stored in job_spec - resolve the default there.
+    # Server defaults are preferable over client defaults so that
+    # the defaults depend on the server version, not the client version.
     if run_spec.run_name is not None:
         validate_dstack_resource_name(run_spec.run_name)
     for mount_point in run_spec.configuration.volumes:
@@ -875,11 +888,14 @@ def _validate_run_spec_and_set_defaults(run_spec: RunSpec):
         raise ServerClientError(
             f"Maximum utilization_policy.time_window is {settings.SERVER_METRICS_RUNNING_TTL_SECONDS}s"
         )
+    if run_spec.configuration.priority is None:
+        run_spec.configuration.priority = RUN_PRIORITY_DEFAULT
     set_resources_defaults(run_spec.configuration.resources)
 
 
 _UPDATABLE_SPEC_FIELDS = ["repo_code_hash", "configuration"]
-_CONF_TYPE_TO_UPDATABLE_FIELDS = {
+_CONF_UPDATABLE_FIELDS = ["priority"]
+_TYPE_SPECIFIC_CONF_UPDATABLE_FIELDS = {
     "dev-environment": ["inactivity_duration"],
     # Most service fields can be updated via replica redeployment.
     # TODO: Allow updating other fields when rolling deployment is supported.
@@ -915,12 +931,9 @@ def _check_can_update_configuration(
         raise ServerClientError(
             f"Configuration type changed from {current.type} to {new.type}, cannot update"
         )
-    updatable_fields = _CONF_TYPE_TO_UPDATABLE_FIELDS.get(new.type)
-    if updatable_fields is None:
-        raise ServerClientError(
-            f"Can only update {', '.join(_CONF_TYPE_TO_UPDATABLE_FIELDS)} configurations."
-            f" Not {new.type}"
-        )
+    updatable_fields = _CONF_UPDATABLE_FIELDS + _TYPE_SPECIFIC_CONF_UPDATABLE_FIELDS.get(
+        new.type, []
+    )
     diff = diff_models(current, new)
     changed_fields = list(diff.keys())
     for key in changed_fields:
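With the updatable fields split into a shared list and a per-type map, priority becomes updatable in place for every configuration type, and types without specific entries fall back to the shared list instead of raising. The composition reduces to:

# Illustrative: how updatable fields compose after this change.
_CONF_UPDATABLE_FIELDS = ["priority"]
_TYPE_SPECIFIC_CONF_UPDATABLE_FIELDS = {"dev-environment": ["inactivity_duration"]}

def updatable_fields(conf_type: str) -> list[str]:
    return _CONF_UPDATABLE_FIELDS + _TYPE_SPECIFIC_CONF_UPDATABLE_FIELDS.get(conf_type, [])

assert updatable_fields("task") == ["priority"]
assert updatable_fields("dev-environment") == ["priority", "inactivity_duration"]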
dstack/_internal/server/settings.py
@@ -85,6 +85,7 @@ DEFAULT_SERVICE_CLIENT_MAX_BODY_SIZE = int(
 USER_PROJECT_DEFAULT_QUOTA = int(os.getenv("DSTACK_USER_PROJECT_DEFAULT_QUOTA", 10))
 FORBID_SERVICES_WITHOUT_GATEWAY = os.getenv("DSTACK_FORBID_SERVICES_WITHOUT_GATEWAY") is not None
 
+SERVER_CODE_UPLOAD_LIMIT = int(os.getenv("DSTACK_SERVER_CODE_UPLOAD_LIMIT", 2 * 2**20))
 
 # Development settings