dstack 0.19.7__py3-none-any.whl → 0.19.9__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release: this version of dstack might be problematic.

Files changed (60)
  1. dstack/_internal/cli/services/args.py +2 -2
  2. dstack/_internal/cli/services/configurators/run.py +56 -13
  3. dstack/_internal/cli/utils/run.py +10 -5
  4. dstack/_internal/core/backends/aws/compute.py +13 -1
  5. dstack/_internal/core/backends/azure/compute.py +42 -13
  6. dstack/_internal/core/backends/azure/configurator.py +21 -0
  7. dstack/_internal/core/backends/azure/models.py +9 -0
  8. dstack/_internal/core/backends/base/compute.py +101 -27
  9. dstack/_internal/core/backends/base/offers.py +13 -3
  10. dstack/_internal/core/backends/cudo/compute.py +3 -1
  11. dstack/_internal/core/backends/datacrunch/compute.py +2 -0
  12. dstack/_internal/core/backends/gcp/auth.py +1 -1
  13. dstack/_internal/core/backends/gcp/compute.py +51 -35
  14. dstack/_internal/core/backends/lambdalabs/compute.py +20 -8
  15. dstack/_internal/core/backends/local/compute.py +2 -0
  16. dstack/_internal/core/backends/nebius/compute.py +95 -1
  17. dstack/_internal/core/backends/nebius/configurator.py +11 -0
  18. dstack/_internal/core/backends/nebius/fabrics.py +48 -0
  19. dstack/_internal/core/backends/nebius/models.py +9 -1
  20. dstack/_internal/core/backends/nebius/resources.py +29 -0
  21. dstack/_internal/core/backends/oci/compute.py +2 -0
  22. dstack/_internal/core/backends/remote/provisioning.py +27 -2
  23. dstack/_internal/core/backends/template/compute.py.jinja +2 -0
  24. dstack/_internal/core/backends/tensordock/compute.py +2 -0
  25. dstack/_internal/core/backends/vultr/compute.py +5 -1
  26. dstack/_internal/core/models/instances.py +2 -1
  27. dstack/_internal/core/models/resources.py +79 -4
  28. dstack/_internal/core/models/runs.py +26 -9
  29. dstack/_internal/core/models/volumes.py +1 -1
  30. dstack/_internal/server/background/tasks/process_fleets.py +4 -13
  31. dstack/_internal/server/background/tasks/process_instances.py +176 -55
  32. dstack/_internal/server/background/tasks/process_metrics.py +26 -9
  33. dstack/_internal/server/background/tasks/process_placement_groups.py +1 -1
  34. dstack/_internal/server/background/tasks/process_prometheus_metrics.py +5 -2
  35. dstack/_internal/server/background/tasks/process_running_jobs.py +56 -18
  36. dstack/_internal/server/migrations/versions/20166748b60c_add_jobmodel_disconnected_at.py +100 -0
  37. dstack/_internal/server/migrations/versions/6c1a9d6530ee_add_jobmodel_exit_status.py +26 -0
  38. dstack/_internal/server/models.py +6 -1
  39. dstack/_internal/server/schemas/runner.py +41 -8
  40. dstack/_internal/server/services/fleets.py +9 -26
  41. dstack/_internal/server/services/instances.py +0 -2
  42. dstack/_internal/server/services/jobs/__init__.py +1 -0
  43. dstack/_internal/server/services/offers.py +15 -0
  44. dstack/_internal/server/services/placement.py +27 -6
  45. dstack/_internal/server/services/resources.py +21 -0
  46. dstack/_internal/server/services/runner/client.py +7 -4
  47. dstack/_internal/server/services/runs.py +18 -8
  48. dstack/_internal/server/settings.py +20 -1
  49. dstack/_internal/server/testing/common.py +37 -26
  50. dstack/_internal/utils/common.py +13 -1
  51. dstack/_internal/utils/json_schema.py +6 -3
  52. dstack/api/__init__.py +1 -0
  53. dstack/api/server/_fleets.py +16 -0
  54. dstack/api/server/_runs.py +48 -3
  55. dstack/version.py +1 -1
  56. {dstack-0.19.7.dist-info → dstack-0.19.9.dist-info}/METADATA +38 -29
  57. {dstack-0.19.7.dist-info → dstack-0.19.9.dist-info}/RECORD +60 -56
  58. {dstack-0.19.7.dist-info → dstack-0.19.9.dist-info}/WHEEL +0 -0
  59. {dstack-0.19.7.dist-info → dstack-0.19.9.dist-info}/entry_points.txt +0 -0
  60. {dstack-0.19.7.dist-info → dstack-0.19.9.dist-info}/licenses/LICENSE.md +0 -0
dstack/_internal/server/migrations/versions/20166748b60c_add_jobmodel_disconnected_at.py
@@ -0,0 +1,100 @@
+"""Add JobModel.disconnected_at
+
+Revision ID: 20166748b60c
+Revises: 6c1a9d6530ee
+Create Date: 2025-05-13 16:24:32.496578
+
+"""
+
+import sqlalchemy as sa
+from alembic import op
+from alembic_postgresql_enum import TableReference
+
+import dstack._internal.server.models
+
+# revision identifiers, used by Alembic.
+revision = "20166748b60c"
+down_revision = "6c1a9d6530ee"
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    with op.batch_alter_table("jobs", schema=None) as batch_op:
+        batch_op.add_column(
+            sa.Column(
+                "disconnected_at", dstack._internal.server.models.NaiveDateTime(), nullable=True
+            )
+        )
+
+    op.sync_enum_values(
+        enum_schema="public",
+        enum_name="jobterminationreason",
+        new_values=[
+            "FAILED_TO_START_DUE_TO_NO_CAPACITY",
+            "INTERRUPTED_BY_NO_CAPACITY",
+            "INSTANCE_UNREACHABLE",
+            "WAITING_INSTANCE_LIMIT_EXCEEDED",
+            "WAITING_RUNNER_LIMIT_EXCEEDED",
+            "TERMINATED_BY_USER",
+            "VOLUME_ERROR",
+            "GATEWAY_ERROR",
+            "SCALED_DOWN",
+            "DONE_BY_RUNNER",
+            "ABORTED_BY_USER",
+            "TERMINATED_BY_SERVER",
+            "INACTIVITY_DURATION_EXCEEDED",
+            "TERMINATED_DUE_TO_UTILIZATION_POLICY",
+            "CONTAINER_EXITED_WITH_ERROR",
+            "PORTS_BINDING_FAILED",
+            "CREATING_CONTAINER_ERROR",
+            "EXECUTOR_ERROR",
+            "MAX_DURATION_EXCEEDED",
+        ],
+        affected_columns=[
+            TableReference(
+                table_schema="public", table_name="jobs", column_name="termination_reason"
+            )
+        ],
+        enum_values_to_rename=[],
+    )
+    # ### end Alembic commands ###
+
+
+def downgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.sync_enum_values(
+        enum_schema="public",
+        enum_name="jobterminationreason",
+        new_values=[
+            "FAILED_TO_START_DUE_TO_NO_CAPACITY",
+            "INTERRUPTED_BY_NO_CAPACITY",
+            "WAITING_INSTANCE_LIMIT_EXCEEDED",
+            "WAITING_RUNNER_LIMIT_EXCEEDED",
+            "TERMINATED_BY_USER",
+            "VOLUME_ERROR",
+            "GATEWAY_ERROR",
+            "SCALED_DOWN",
+            "DONE_BY_RUNNER",
+            "ABORTED_BY_USER",
+            "TERMINATED_BY_SERVER",
+            "INACTIVITY_DURATION_EXCEEDED",
+            "TERMINATED_DUE_TO_UTILIZATION_POLICY",
+            "CONTAINER_EXITED_WITH_ERROR",
+            "PORTS_BINDING_FAILED",
+            "CREATING_CONTAINER_ERROR",
+            "EXECUTOR_ERROR",
+            "MAX_DURATION_EXCEEDED",
+        ],
+        affected_columns=[
+            TableReference(
+                table_schema="public", table_name="jobs", column_name="termination_reason"
+            )
+        ],
+        enum_values_to_rename=[],
+    )
+    with op.batch_alter_table("jobs", schema=None) as batch_op:
+        batch_op.drop_column("disconnected_at")
+
+    # ### end Alembic commands ###
dstack/_internal/server/migrations/versions/6c1a9d6530ee_add_jobmodel_exit_status.py
@@ -0,0 +1,26 @@
+"""Add JobModel.exit_status
+
+Revision ID: 6c1a9d6530ee
+Revises: 7ba3b59d7ca6
+Create Date: 2025-05-09 10:25:19.715852
+
+"""
+
+import sqlalchemy as sa
+from alembic import op
+
+# revision identifiers, used by Alembic.
+revision = "6c1a9d6530ee"
+down_revision = "7ba3b59d7ca6"
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    with op.batch_alter_table("jobs", schema=None) as batch_op:
+        batch_op.add_column(sa.Column("exit_status", sa.Integer(), nullable=True))
+
+
+def downgrade() -> None:
+    with op.batch_alter_table("jobs", schema=None) as batch_op:
+        batch_op.drop_column("exit_status")
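
Taken together, these two revisions add JobModel.exit_status and then JobModel.disconnected_at on top of revision 7ba3b59d7ca6, and extend the jobterminationreason enum with INSTANCE_UNREACHABLE. Below is a minimal sketch, not part of the diff, of applying them by hand with Alembic's command API; it assumes an alembic.ini configured for the server database, and in normal operation the dstack server applies pending migrations itself on startup.

from alembic import command
from alembic.config import Config

cfg = Config("alembic.ini")  # hypothetical config path
command.upgrade(cfg, "20166748b60c")  # applies 6c1a9d6530ee first, then 20166748b60c
# command.downgrade(cfg, "7ba3b59d7ca6")  # would drop disconnected_at and exit_status again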
dstack/_internal/server/models.py
@@ -382,6 +382,10 @@ class JobModel(BaseModel):
         Enum(JobTerminationReason)
     )
     termination_reason_message: Mapped[Optional[str]] = mapped_column(Text)
+    # `disconnected_at` stores the first time of connectivity issues with the instance.
+    # Resets every time connectivity is restored.
+    disconnected_at: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime)
+    exit_status: Mapped[Optional[int]] = mapped_column(Integer)
     job_spec_data: Mapped[str] = mapped_column(Text)
     job_provisioning_data: Mapped[Optional[str]] = mapped_column(Text)
     runner_timestamp: Mapped[Optional[int]] = mapped_column(BigInteger)
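
The comment on disconnected_at describes a record-first-failure, reset-on-reconnect pattern. A hedged sketch of that bookkeeping in isolation — track_connectivity is a hypothetical helper; the real logic lives in the background tasks this release touches, such as process_running_jobs.py:

from datetime import datetime, timezone

def track_connectivity(job_model, connected: bool) -> None:
    # Naive UTC timestamp, matching the NaiveDateTime column type.
    now = datetime.now(timezone.utc).replace(tzinfo=None)
    if connected:
        job_model.disconnected_at = None  # connectivity restored: reset
    elif job_model.disconnected_at is None:
        job_model.disconnected_at = now  # first observed failure: record it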
@@ -390,7 +394,7 @@ class JobModel(BaseModel):
     remove_at: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime)
     volumes_detached_at: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime)
     # `instance_assigned` means instance assignment was done.
-    # if `instance_assigned` is True and `instance` is None, no instance was assiged.
+    # if `instance_assigned` is True and `instance` is None, no instance was assigned.
     instance_assigned: Mapped[bool] = mapped_column(Boolean, default=False)
     instance_id: Mapped[Optional[uuid.UUID]] = mapped_column(
         ForeignKey("instances.id", ondelete="CASCADE")
@@ -659,6 +663,7 @@ class PlacementGroupModel(BaseModel):

     fleet_id: Mapped[uuid.UUID] = mapped_column(ForeignKey("fleets.id"))
     fleet: Mapped["FleetModel"] = relationship(foreign_keys=[fleet_id])
+    # TODO: rename `fleet_deleted` -> `to_be_deleted`
     fleet_deleted: Mapped[bool] = mapped_column(Boolean, default=False)

     created_at: Mapped[datetime] = mapped_column(NaiveDateTime, default=get_current_datetime)
dstack/_internal/server/schemas/runner.py
@@ -7,7 +7,14 @@ from typing_extensions import Annotated

 from dstack._internal.core.models.common import CoreModel, NetworkMode
 from dstack._internal.core.models.repos.remote import RemoteRepoCreds
-from dstack._internal.core.models.runs import ClusterInfo, JobSpec, JobStatus, RunSpec
+from dstack._internal.core.models.runs import (
+    ClusterInfo,
+    JobSpec,
+    JobStatus,
+    JobSubmission,
+    Run,
+    RunSpec,
+)
 from dstack._internal.core.models.volumes import InstanceMountPoint, VolumeMountPoint


@@ -16,6 +23,7 @@ class JobStateEvent(CoreModel):
     state: JobStatus
     termination_reason: Optional[str] = None
     termination_message: Optional[str] = None
+    exit_status: Optional[int] = None


 class LogEvent(CoreModel):
@@ -38,15 +46,18 @@ class PullResponse(CoreModel):


 class SubmitBody(CoreModel):
-    run_spec: Annotated[
-        RunSpec,
+    run: Annotated[
+        Run,
         Field(
             include={
-                "run_name",
-                "repo_id",
-                "repo_data",
-                "configuration",
-                "configuration_path",
+                "id": True,
+                "run_spec": {
+                    "run_name",
+                    "repo_id",
+                    "repo_data",
+                    "configuration",
+                    "configuration_path",
+                },
             }
         ),
     ]
@@ -69,9 +80,31 @@ class SubmitBody(CoreModel):
             }
         ),
     ]
+    job_submission: Annotated[
+        JobSubmission,
+        Field(
+            include={
+                "id",
+            }
+        ),
+    ]
     cluster_info: Annotated[Optional[ClusterInfo], Field(include=True)]
     secrets: Annotated[Optional[Dict[str, str]], Field(include=True)]
     repo_credentials: Annotated[Optional[RemoteRepoCreds], Field(include=True)]
+    # run_spec is deprecated in favor of run.run_spec
+    # TODO: Remove once we no longer support instances deployed with 0.19.8 or earlier.
+    run_spec: Annotated[
+        RunSpec,
+        Field(
+            include={
+                "run_name",
+                "repo_id",
+                "repo_data",
+                "configuration",
+                "configuration_path",
+            },
+        ),
+    ]


 class HealthcheckResponse(CoreModel):
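
SubmitBody relies on per-field include filters to trim what gets serialized to the runner, which is how the new run field ships only id plus a whitelisted subset of run_spec. A self-contained illustration of the mechanism, assuming pydantic v1 export semantics (which dstack's CoreModel builds on); Inner and Body are made-up names:

from pydantic import BaseModel, Field

class Inner(BaseModel):
    run_name: str
    secret_token: str

class Body(BaseModel):
    inner: Inner = Field(include={"run_name"})

body = Body(inner=Inner(run_name="demo", secret_token="hunter2"))
print(body.json())  # {"inner": {"run_name": "demo"}} -- secret_token never leaves the server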
dstack/_internal/server/services/fleets.py
@@ -1,5 +1,3 @@
-import random
-import string
 import uuid
 from datetime import datetime, timezone
 from typing import List, Literal, Optional, Tuple, Union, cast
@@ -33,6 +31,7 @@ from dstack._internal.core.models.instances import (
     SSHConnectionParams,
     SSHKey,
 )
+from dstack._internal.core.models.placement import PlacementGroup
 from dstack._internal.core.models.profiles import (
     Profile,
     SpotPolicy,
@@ -62,6 +61,7 @@ from dstack._internal.server.services.projects import (
     list_project_models,
     list_user_project_models,
 )
+from dstack._internal.server.services.resources import set_resources_defaults
 from dstack._internal.utils import random_names
 from dstack._internal.utils.logging import get_logger
 from dstack._internal.utils.ssh import pkey_from_str
@@ -243,6 +243,7 @@ async def get_plan(
         spec=effective_spec,
     )
     effective_spec = FleetSpec.parse_obj(effective_spec.dict())
+    _validate_fleet_spec_and_set_defaults(spec)
     current_fleet: Optional[Fleet] = None
     current_fleet_id: Optional[uuid.UUID] = None
     if effective_spec.configuration.name is not None:
@@ -282,6 +283,7 @@ async def get_create_instance_offers(
     project: ProjectModel,
     profile: Profile,
     requirements: Requirements,
+    placement_group: Optional[PlacementGroup] = None,
     fleet_spec: Optional[FleetSpec] = None,
     fleet_model: Optional[FleetModel] = None,
     blocks: Union[int, Literal["auto"]] = 1,
@@ -307,6 +309,7 @@ async def get_create_instance_offers(
         exclude_not_available=exclude_not_available,
         multinode=multinode,
         master_job_provisioning_data=master_job_provisioning_data,
+        placement_group=placement_group,
         blocks=blocks,
     )
     offers = [
@@ -345,7 +348,7 @@ async def create_fleet(
         spec=spec,
     )
     spec = FleetSpec.parse_obj(spec.dict())
-    _validate_fleet_spec(spec)
+    _validate_fleet_spec_and_set_defaults(spec)

     if spec.configuration.ssh_config is not None:
         _check_can_manage_ssh_fleets(user=user, project=project)
@@ -393,17 +396,12 @@ async def create_fleet(
         )
         fleet_model.instances.append(instances_model)
     else:
-        placement_group_name = _get_placement_group_name(
-            project=project,
-            fleet_spec=spec,
-        )
         for i in range(_get_fleet_nodes_to_provision(spec)):
             instance_model = await create_fleet_instance_model(
                 session=session,
                 project=project,
                 user=user,
                 spec=spec,
-                placement_group_name=placement_group_name,
                 reservation=spec.configuration.reservation,
                 instance_num=i,
             )
@@ -417,7 +415,6 @@ async def create_fleet_instance_model(
     project: ProjectModel,
     user: UserModel,
     spec: FleetSpec,
-    placement_group_name: Optional[str],
     reservation: Optional[str],
     instance_num: int,
 ) -> InstanceModel:
@@ -431,7 +428,6 @@ async def create_fleet_instance_model(
         requirements=requirements,
         instance_name=f"{spec.configuration.name}-{instance_num}",
         instance_num=instance_num,
-        placement_group_name=placement_group_name,
         reservation=reservation,
         blocks=spec.configuration.blocks,
         tags=spec.configuration.tags,
@@ -652,7 +648,7 @@ def _remove_fleet_spec_sensitive_info(spec: FleetSpec):
             host.ssh_key = None


-def _validate_fleet_spec(spec: FleetSpec):
+def _validate_fleet_spec_and_set_defaults(spec: FleetSpec):
     if spec.configuration.name is not None:
         validate_dstack_resource_name(spec.configuration.name)
     if spec.configuration.ssh_config is None and spec.configuration.nodes is None:
@@ -665,6 +661,8 @@ def _validate_fleet_spec(spec: FleetSpec):
             if isinstance(host, SSHHostParams) and host.ssh_key is not None:
                 _validate_ssh_key(host.ssh_key)
         _validate_internal_ips(spec.configuration.ssh_config)
+    if spec.configuration.resources is not None:
+        set_resources_defaults(spec.configuration.resources)


 def _validate_all_ssh_params_specified(ssh_config: SSHParams):
@@ -735,18 +733,3 @@ def _get_fleet_requirements(fleet_spec: FleetSpec) -> Requirements:
         reservation=fleet_spec.configuration.reservation,
     )
     return requirements
-
-
-def _get_placement_group_name(
-    project: ProjectModel,
-    fleet_spec: FleetSpec,
-) -> Optional[str]:
-    if fleet_spec.configuration.placement != InstanceGroupPlacement.CLUSTER:
-        return None
-    # A random suffix to avoid clashing with to-be-deleted placement groups left by old fleets
-    suffix = _generate_random_placement_group_suffix()
-    return f"{project.name}-{fleet_spec.configuration.name}-{suffix}-pg"
-
-
-def _generate_random_placement_group_suffix(length: int = 8) -> str:
-    return "".join(random.choice(string.ascii_lowercase + string.digits) for _ in range(length))
dstack/_internal/server/services/instances.py
@@ -408,7 +408,6 @@ async def create_instance_model(
     requirements: Requirements,
     instance_name: str,
     instance_num: int,
-    placement_group_name: Optional[str],
     reservation: Optional[str],
     blocks: Union[Literal["auto"], int],
     tags: Optional[Dict[str, str]],
@@ -427,7 +426,6 @@ async def create_instance_model(
         user=user.name,
         ssh_keys=[project_ssh_key],
         instance_id=str(instance_id),
-        placement_group_name=placement_group_name,
         reservation=reservation,
         tags=tags,
     )
dstack/_internal/server/services/jobs/__init__.py
@@ -135,6 +135,7 @@ def job_model_to_job_submission(job_model: JobModel) -> JobSubmission:
         status=job_model.status,
         termination_reason=job_model.termination_reason,
         termination_reason_message=job_model.termination_reason_message,
+        exit_status=job_model.exit_status,
         job_provisioning_data=job_provisioning_data,
         job_runtime_data=get_job_runtime_data(job_model),
     )
dstack/_internal/server/services/offers.py
@@ -8,12 +8,14 @@ from dstack._internal.core.backends import (
     BACKENDS_WITH_RESERVATION_SUPPORT,
 )
 from dstack._internal.core.backends.base.backend import Backend
+from dstack._internal.core.backends.base.compute import ComputeWithPlacementGroupSupport
 from dstack._internal.core.models.backends.base import BackendType
 from dstack._internal.core.models.instances import (
     InstanceOfferWithAvailability,
     InstanceType,
     Resources,
 )
+from dstack._internal.core.models.placement import PlacementGroup
 from dstack._internal.core.models.profiles import Profile
 from dstack._internal.core.models.runs import JobProvisioningData, Requirements
 from dstack._internal.core.models.volumes import Volume
@@ -31,6 +33,7 @@ async def get_offers_by_requirements(
     volumes: Optional[List[List[Volume]]] = None,
     privileged: bool = False,
     instance_mounts: bool = False,
+    placement_group: Optional[PlacementGroup] = None,
     blocks: Union[int, Literal["auto"]] = 1,
 ) -> List[Tuple[Backend, InstanceOfferWithAvailability]]:
     backends: List[Backend] = await backends_services.get_project_backends(project=project)
@@ -116,6 +119,18 @@
             new_offers.append((b, new_offer))
         offers = new_offers

+    if placement_group is not None:
+        new_offers = []
+        for b, o in offers:
+            for backend in backends:
+                compute = backend.compute()
+                if isinstance(
+                    compute, ComputeWithPlacementGroupSupport
+                ) and compute.is_suitable_placement_group(placement_group, o):
+                    new_offers.append((b, o))
+                    break
+        offers = new_offers
+
     if profile.instance_types is not None:
         instance_types = [i.lower() for i in profile.instance_types]
         offers = [(b, o) for b, o in offers if o.instance.name.lower() in instance_types]
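
The new block keeps an offer only if at least one placement-group-aware compute accepts the group for it. A hedged, partial sketch of what implementing that check might look like; the method name and parameter types come from the call site above, while the class body and the attribute paths inside it are illustrative assumptions (real backends apply provider-specific rules, and the other members of ComputeWithPlacementGroupSupport are omitted):

class ExampleCompute(ComputeWithPlacementGroupSupport):
    def is_suitable_placement_group(
        self,
        placement_group: PlacementGroup,
        offer: InstanceOfferWithAvailability,
    ) -> bool:
        # Illustrative policy only: reuse the group if it was provisioned
        # in the offer's region (assumed attributes).
        return placement_group.configuration.region == offer.region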
dstack/_internal/server/services/placement.py
@@ -1,8 +1,9 @@
+from collections.abc import Iterable
 from typing import Optional
 from uuid import UUID

 from git import List
-from sqlalchemy import select
+from sqlalchemy import and_, select, update
 from sqlalchemy.ext.asyncio import AsyncSession

 from dstack._internal.core.models.placement import (
@@ -13,15 +14,35 @@ from dstack._internal.core.models.placement import (
 from dstack._internal.server.models import PlacementGroupModel


-async def get_fleet_placement_groups(
+async def get_fleet_placement_group_models(
     session: AsyncSession,
     fleet_id: UUID,
-) -> List[PlacementGroup]:
+) -> List[PlacementGroupModel]:
     res = await session.execute(
-        select(PlacementGroupModel).where(PlacementGroupModel.fleet_id == fleet_id)
+        select(PlacementGroupModel).where(
+            and_(
+                PlacementGroupModel.fleet_id == fleet_id,
+                PlacementGroupModel.deleted == False,
+                PlacementGroupModel.fleet_deleted == False,
+            )
+        )
+    )
+    return list(res.scalars().all())
+
+
+async def schedule_fleet_placement_groups_deletion(
+    session: AsyncSession, fleet_id: UUID, except_placement_group_ids: Iterable[UUID] = ()
+) -> None:
+    await session.execute(
+        update(PlacementGroupModel)
+        .where(
+            and_(
+                PlacementGroupModel.fleet_id == fleet_id,
+                PlacementGroupModel.id.not_in(except_placement_group_ids),
+            )
+        )
+        .values(fleet_deleted=True)  # TODO: rename `fleet_deleted` -> `to_be_deleted`
     )
-    placement_groups = res.scalars().all()
-    return [placement_group_model_to_placement_group(pg) for pg in placement_groups]


 def placement_group_model_to_placement_group(
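
A hedged usage sketch of the new scheduling helper, e.g. from fleet-processing code; session, fleet, and kept_groups are hypothetical locals, and the actual cloud-side deletion is left to a background task (process_placement_groups.py also changes in this release):

# Mark every placement group of the fleet for deletion except those still in use.
await schedule_fleet_placement_groups_deletion(
    session=session,
    fleet_id=fleet.id,
    except_placement_group_ids=[pg.id for pg in kept_groups],
)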
dstack/_internal/server/services/resources.py
@@ -0,0 +1,21 @@
+import gpuhunt
+from pydantic import parse_obj_as
+
+from dstack._internal.core.models.resources import CPUSpec, ResourcesSpec
+
+
+def set_resources_defaults(resources: ResourcesSpec) -> None:
+    # TODO: Remove in 0.20. Use resources.cpu directly
+    cpu = parse_obj_as(CPUSpec, resources.cpu)
+    if cpu.arch is None:
+        gpu = resources.gpu
+        if (
+            gpu is not None
+            and gpu.vendor in [None, gpuhunt.AcceleratorVendor.NVIDIA]
+            and gpu.name
+            and any(map(gpuhunt.is_nvidia_superchip, gpu.name))
+        ):
+            cpu.arch = gpuhunt.CPUArchitecture.ARM
+        else:
+            cpu.arch = gpuhunt.CPUArchitecture.X86
+    resources.cpu = cpu
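
A usage sketch of the new helper; GH200 is used here on the assumption that gpuhunt.is_nvidia_superchip classifies it as an NVIDIA superchip:

from dstack._internal.core.models.resources import GPUSpec, ResourcesSpec
from dstack._internal.server.services.resources import set_resources_defaults

spec = ResourcesSpec(gpu=GPUSpec(name=["GH200"]))
set_resources_defaults(spec)
# cpu.arch is now filled in: ARM for NVIDIA superchips, X86 for everything else.
print(spec.cpu.arch)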
dstack/_internal/server/services/runner/client.py
@@ -12,7 +12,7 @@ from dstack._internal.core.models.common import CoreModel, NetworkMode
 from dstack._internal.core.models.envs import Env
 from dstack._internal.core.models.repos.remote import RemoteRepoCreds
 from dstack._internal.core.models.resources import Memory
-from dstack._internal.core.models.runs import ClusterInfo, JobSpec, RunSpec
+from dstack._internal.core.models.runs import ClusterInfo, Job, Run
 from dstack._internal.core.models.volumes import InstanceMountPoint, Volume, VolumeMountPoint
 from dstack._internal.server.schemas.runner import (
     GPUDevice,
@@ -72,8 +72,8 @@ class RunnerClient:

     def submit_job(
         self,
-        run_spec: RunSpec,
-        job_spec: JobSpec,
+        run: Run,
+        job: Job,
         cluster_info: ClusterInfo,
         secrets: Dict[str, str],
         repo_credentials: Optional[RemoteRepoCreds],
@@ -81,6 +81,7 @@ class RunnerClient:
     ):
         # XXX: This is a quick-and-dirty hack to deliver InstanceModel-specific environment
         # variables to the runner without runner API modification.
+        job_spec = job.job_spec
         if instance_env is not None:
             if isinstance(instance_env, Env):
                 merged_env = instance_env.as_dict()
@@ -90,11 +91,13 @@ class RunnerClient:
             job_spec = job_spec.copy(deep=True)
             job_spec.env = merged_env
         body = SubmitBody(
-            run_spec=run_spec,
+            run=run,
             job_spec=job_spec,
+            job_submission=job.job_submissions[-1],
             cluster_info=cluster_info,
             secrets=secrets,
             repo_credentials=repo_credentials,
+            run_spec=run.run_spec,
         )
         resp = requests.post(
             # use .json() to encode enums
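
A hedged sketch of the updated call site under the new signature; runner_client and the other locals are hypothetical. The client now reads run.run_spec and job.job_spec itself and sends job.job_submissions[-1] as the job_submission, so callers pass the full models:

runner_client.submit_job(
    run=run,
    job=job,
    cluster_info=cluster_info,
    secrets=secrets,
    repo_credentials=repo_credentials,
)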
dstack/_internal/server/services/runs.py
@@ -81,6 +81,7 @@ from dstack._internal.server.services.logging import fmt
 from dstack._internal.server.services.offers import get_offers_by_requirements
 from dstack._internal.server.services.plugins import apply_plugin_policies
 from dstack._internal.server.services.projects import list_project_models, list_user_project_models
+from dstack._internal.server.services.resources import set_resources_defaults
 from dstack._internal.server.services.users import get_user_model_by_name
 from dstack._internal.utils.logging import get_logger
 from dstack._internal.utils.random_names import generate_name
@@ -301,12 +302,14 @@ async def get_plan(
         project=project,
         run_name=effective_run_spec.run_name,
     )
-    if (
-        current_resource is not None
-        and not current_resource.status.is_finished()
-        and _can_update_run_spec(current_resource.run_spec, effective_run_spec)
-    ):
-        action = ApplyAction.UPDATE
+    if current_resource is not None:
+        # For backward compatibility (current_resource may have been submitted before
+        # some fields, e.g., CPUSpec.arch, were added)
+        set_resources_defaults(current_resource.run_spec.configuration.resources)
+        if not current_resource.status.is_finished() and _can_update_run_spec(
+            current_resource.run_spec, effective_run_spec
+        ):
+            action = ApplyAction.UPDATE

     jobs = await get_jobs_from_run_spec(effective_run_spec, replica_num=0)

@@ -406,6 +409,10 @@ async def apply_plan(
         project=project,
         run_spec=run_spec,
     )
+
+    # For backward compatibility (current_resource may have been submitted before
+    # some fields, e.g., CPUSpec.arch, were added)
+    set_resources_defaults(current_resource.run_spec.configuration.resources)
     try:
         _check_can_update_run_spec(current_resource.run_spec, run_spec)
     except ServerClientError:
@@ -414,6 +421,8 @@ async def apply_plan(
             raise ServerClientError("Cannot override active run. Stop the run first.")
         raise
     if not force:
+        if plan.current_resource is not None:
+            set_resources_defaults(plan.current_resource.run_spec.configuration.resources)
         if (
             plan.current_resource is None
             or plan.current_resource.id != current_resource.id
@@ -861,11 +870,12 @@ def _validate_run_spec_and_set_defaults(run_spec: RunSpec):
     if (
         run_spec.merged_profile.utilization_policy is not None
         and run_spec.merged_profile.utilization_policy.time_window
-        > settings.SERVER_METRICS_TTL_SECONDS
+        > settings.SERVER_METRICS_RUNNING_TTL_SECONDS
     ):
         raise ServerClientError(
-            f"Maximum utilization_policy.time_window is {settings.SERVER_METRICS_TTL_SECONDS}s"
+            f"Maximum utilization_policy.time_window is {settings.SERVER_METRICS_RUNNING_TTL_SECONDS}s"
         )
+    set_resources_defaults(run_spec.configuration.resources)


 _UPDATABLE_SPEC_FIELDS = ["repo_code_hash", "configuration"]
dstack/_internal/server/settings.py
@@ -1,4 +1,5 @@
 import os
+import warnings
 from pathlib import Path

 DSTACK_DIR_PATH = Path("~/.dstack/").expanduser()
@@ -45,7 +46,25 @@ SERVER_CLOUDWATCH_LOG_REGION = os.getenv("DSTACK_SERVER_CLOUDWATCH_LOG_REGION")

 SERVER_GCP_LOGGING_PROJECT = os.getenv("DSTACK_SERVER_GCP_LOGGING_PROJECT")

-SERVER_METRICS_TTL_SECONDS = int(os.getenv("DSTACK_SERVER_METRICS_TTL_SECONDS", 3600))
+SERVER_METRICS_RUNNING_TTL_SECONDS: int
+_SERVER_METRICS_RUNNING_TTL_SECONDS = os.getenv("DSTACK_SERVER_METRICS_RUNNING_TTL_SECONDS")
+if _SERVER_METRICS_RUNNING_TTL_SECONDS is None:
+    _SERVER_METRICS_RUNNING_TTL_SECONDS = os.getenv("DSTACK_SERVER_METRICS_TTL_SECONDS")
+    if _SERVER_METRICS_RUNNING_TTL_SECONDS is not None:
+        warnings.warn(
+            (
+                "DSTACK_SERVER_METRICS_TTL_SECONDS is deprecated,"
+                " use DSTACK_SERVER_METRICS_RUNNING_TTL_SECONDS instead"
+            ),
+            DeprecationWarning,
+        )
+    else:
+        _SERVER_METRICS_RUNNING_TTL_SECONDS = 3600
+SERVER_METRICS_RUNNING_TTL_SECONDS = int(_SERVER_METRICS_RUNNING_TTL_SECONDS)
+del _SERVER_METRICS_RUNNING_TTL_SECONDS
+SERVER_METRICS_FINISHED_TTL_SECONDS = int(
+    os.getenv("DSTACK_SERVER_METRICS_FINISHED_TTL_SECONDS", 7 * 24 * 3600)
+)

 DEFAULT_PROJECT_NAME = "main"
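
The resolution order implemented above, shown end to end. A sketch, assuming neither new variable is already set in the environment: the settings module reads the environment once at import time, so the variable must be set first; the deprecated name is still honored with a DeprecationWarning, and the new finished-metrics TTL defaults to one week.

import os

os.environ["DSTACK_SERVER_METRICS_TTL_SECONDS"] = "7200"  # deprecated name, still honored

import dstack._internal.server.settings as settings  # emits a DeprecationWarning here

assert settings.SERVER_METRICS_RUNNING_TTL_SECONDS == 7200
assert settings.SERVER_METRICS_FINISHED_TTL_SECONDS == 7 * 24 * 3600  # default: one week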