dstack 0.18.41__py3-none-any.whl → 0.18.43__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (97)
  1. dstack/_internal/cli/commands/__init__.py +2 -1
  2. dstack/_internal/cli/commands/apply.py +4 -2
  3. dstack/_internal/cli/commands/attach.py +21 -1
  4. dstack/_internal/cli/commands/completion.py +20 -0
  5. dstack/_internal/cli/commands/delete.py +3 -1
  6. dstack/_internal/cli/commands/fleet.py +2 -1
  7. dstack/_internal/cli/commands/gateway.py +7 -2
  8. dstack/_internal/cli/commands/logs.py +3 -2
  9. dstack/_internal/cli/commands/stats.py +2 -1
  10. dstack/_internal/cli/commands/stop.py +2 -1
  11. dstack/_internal/cli/commands/volume.py +2 -1
  12. dstack/_internal/cli/main.py +6 -0
  13. dstack/_internal/cli/services/completion.py +86 -0
  14. dstack/_internal/cli/services/configurators/run.py +10 -17
  15. dstack/_internal/cli/utils/fleet.py +5 -1
  16. dstack/_internal/cli/utils/volume.py +9 -0
  17. dstack/_internal/core/backends/aws/compute.py +24 -11
  18. dstack/_internal/core/backends/aws/resources.py +3 -3
  19. dstack/_internal/core/backends/azure/compute.py +14 -8
  20. dstack/_internal/core/backends/azure/resources.py +2 -0
  21. dstack/_internal/core/backends/base/compute.py +102 -2
  22. dstack/_internal/core/backends/base/offers.py +7 -1
  23. dstack/_internal/core/backends/cudo/compute.py +8 -4
  24. dstack/_internal/core/backends/datacrunch/compute.py +10 -4
  25. dstack/_internal/core/backends/gcp/auth.py +19 -13
  26. dstack/_internal/core/backends/gcp/compute.py +27 -20
  27. dstack/_internal/core/backends/gcp/resources.py +3 -10
  28. dstack/_internal/core/backends/kubernetes/compute.py +4 -3
  29. dstack/_internal/core/backends/lambdalabs/compute.py +9 -3
  30. dstack/_internal/core/backends/nebius/compute.py +2 -2
  31. dstack/_internal/core/backends/oci/compute.py +10 -4
  32. dstack/_internal/core/backends/runpod/compute.py +11 -4
  33. dstack/_internal/core/backends/tensordock/compute.py +14 -3
  34. dstack/_internal/core/backends/vastai/compute.py +12 -2
  35. dstack/_internal/core/backends/vultr/api_client.py +3 -3
  36. dstack/_internal/core/backends/vultr/compute.py +9 -3
  37. dstack/_internal/core/models/backends/aws.py +2 -0
  38. dstack/_internal/core/models/backends/base.py +1 -0
  39. dstack/_internal/core/models/configurations.py +0 -1
  40. dstack/_internal/core/models/runs.py +3 -3
  41. dstack/_internal/core/models/volumes.py +23 -0
  42. dstack/_internal/core/services/__init__.py +5 -1
  43. dstack/_internal/core/services/configs/__init__.py +3 -0
  44. dstack/_internal/server/background/tasks/common.py +22 -0
  45. dstack/_internal/server/background/tasks/process_instances.py +13 -21
  46. dstack/_internal/server/background/tasks/process_running_jobs.py +13 -16
  47. dstack/_internal/server/background/tasks/process_submitted_jobs.py +12 -7
  48. dstack/_internal/server/background/tasks/process_terminating_jobs.py +7 -2
  49. dstack/_internal/server/background/tasks/process_volumes.py +11 -1
  50. dstack/_internal/server/migrations/versions/a751ef183f27_move_attachment_data_to_volumes_.py +34 -0
  51. dstack/_internal/server/models.py +17 -19
  52. dstack/_internal/server/routers/logs.py +3 -0
  53. dstack/_internal/server/services/backends/configurators/aws.py +31 -1
  54. dstack/_internal/server/services/backends/configurators/gcp.py +8 -15
  55. dstack/_internal/server/services/config.py +11 -1
  56. dstack/_internal/server/services/fleets.py +5 -1
  57. dstack/_internal/server/services/jobs/__init__.py +14 -11
  58. dstack/_internal/server/services/jobs/configurators/dev.py +1 -3
  59. dstack/_internal/server/services/jobs/configurators/task.py +1 -3
  60. dstack/_internal/server/services/logs/__init__.py +78 -0
  61. dstack/_internal/server/services/{logs.py → logs/aws.py} +12 -207
  62. dstack/_internal/server/services/logs/base.py +47 -0
  63. dstack/_internal/server/services/logs/filelog.py +110 -0
  64. dstack/_internal/server/services/logs/gcp.py +165 -0
  65. dstack/_internal/server/services/offers.py +7 -7
  66. dstack/_internal/server/services/pools.py +19 -20
  67. dstack/_internal/server/services/proxy/routers/service_proxy.py +14 -7
  68. dstack/_internal/server/services/runner/client.py +8 -5
  69. dstack/_internal/server/services/volumes.py +68 -9
  70. dstack/_internal/server/settings.py +3 -0
  71. dstack/_internal/server/statics/index.html +1 -1
  72. dstack/_internal/server/statics/{main-ad5150a441de98cd8987.css → main-7510e71dfa9749a4e70e.css} +1 -1
  73. dstack/_internal/server/statics/{main-2ac66bfcbd2e39830b88.js → main-fe8fd9db55df8d10e648.js} +66 -66
  74. dstack/_internal/server/statics/{main-2ac66bfcbd2e39830b88.js.map → main-fe8fd9db55df8d10e648.js.map} +1 -1
  75. dstack/_internal/server/testing/common.py +46 -17
  76. dstack/api/_public/runs.py +1 -1
  77. dstack/version.py +2 -2
  78. {dstack-0.18.41.dist-info → dstack-0.18.43.dist-info}/METADATA +4 -3
  79. {dstack-0.18.41.dist-info → dstack-0.18.43.dist-info}/RECORD +97 -86
  80. tests/_internal/core/backends/base/__init__.py +0 -0
  81. tests/_internal/core/backends/base/test_compute.py +56 -0
  82. tests/_internal/server/background/tasks/test_process_running_jobs.py +2 -1
  83. tests/_internal/server/background/tasks/test_process_submitted_jobs.py +5 -3
  84. tests/_internal/server/background/tasks/test_process_terminating_jobs.py +11 -6
  85. tests/_internal/server/conftest.py +4 -5
  86. tests/_internal/server/routers/test_backends.py +1 -0
  87. tests/_internal/server/routers/test_logs.py +1 -1
  88. tests/_internal/server/routers/test_runs.py +2 -2
  89. tests/_internal/server/routers/test_volumes.py +9 -2
  90. tests/_internal/server/services/runner/test_client.py +22 -3
  91. tests/_internal/server/services/test_logs.py +3 -3
  92. tests/_internal/server/services/test_offers.py +167 -0
  93. tests/_internal/server/services/test_pools.py +105 -1
  94. {dstack-0.18.41.dist-info → dstack-0.18.43.dist-info}/LICENSE.md +0 -0
  95. {dstack-0.18.41.dist-info → dstack-0.18.43.dist-info}/WHEEL +0 -0
  96. {dstack-0.18.41.dist-info → dstack-0.18.43.dist-info}/entry_points.txt +0 -0
  97. {dstack-0.18.41.dist-info → dstack-0.18.43.dist-info}/top_level.txt +0 -0
dstack/_internal/server/background/tasks/process_submitted_jobs.py

@@ -43,6 +43,7 @@ from dstack._internal.server.models import (
     PoolModel,
     ProjectModel,
     RunModel,
+    VolumeAttachmentModel,
     VolumeModel,
 )
 from dstack._internal.server.services.backends import get_project_backend_by_type_or_error
@@ -236,7 +237,7 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
     res = await session.execute(
         select(InstanceModel)
         .where(InstanceModel.id == job_model.instance.id)
-        .options(selectinload(InstanceModel.volumes))
+        .options(selectinload(InstanceModel.volume_attachments))
         .execution_options(populate_existing=True)
     )
     instance = res.unique().scalar_one()
@@ -390,11 +391,11 @@ async def _assign_job_to_pool_instance(
 
     instances_with_offers.sort(key=lambda instance_with_offer: instance_with_offer[0].price or 0)
     instance, offer = instances_with_offers[0]
-    # Reload InstanceModel with volumes
+    # Reload InstanceModel with volume attachments
     res = await session.execute(
         select(InstanceModel)
         .where(InstanceModel.id == instance.id)
-        .options(joinedload(InstanceModel.volumes))
+        .options(joinedload(InstanceModel.volume_attachments))
     )
     instance = res.unique().scalar_one()
     instance.status = InstanceStatus.BUSY
@@ -580,7 +581,7 @@ def _create_instance_model_for_job(
         backend=offer.backend,
         price=offer.price,
         region=offer.region,
-        volumes=[],
+        volume_attachments=[],
         total_blocks=1,
         busy_blocks=1,
     )
@@ -696,14 +697,18 @@ async def _attach_volume(
     instance: InstanceModel,
     instance_id: str,
 ):
+    volume = volume_model_to_volume(volume_model)
+    # Refresh only to check if the volume wasn't deleted before the lock
     await session.refresh(volume_model)
     if volume_model.deleted:
         raise ServerClientError("Cannot attach a deleted volume")
-    volume = volume_model_to_volume(volume_model)
     attachment_data = await common_utils.run_async(
         backend.compute().attach_volume,
         volume=volume,
         instance_id=instance_id,
     )
-    volume_model.volume_attachment_data = attachment_data.json()
-    instance.volumes.append(volume_model)
+    volume_attachment_model = VolumeAttachmentModel(
+        volume=volume_model,
+        attachment_data=attachment_data.json(),
+    )
+    instance.volume_attachments.append(volume_attachment_model)
dstack/_internal/server/background/tasks/process_terminating_jobs.py

@@ -6,7 +6,12 @@ from sqlalchemy.orm import joinedload, lazyload
 
 from dstack._internal.core.models.runs import JobStatus
 from dstack._internal.server.db import get_session_ctx
-from dstack._internal.server.models import InstanceModel, JobModel, ProjectModel, VolumeModel
+from dstack._internal.server.models import (
+    InstanceModel,
+    JobModel,
+    ProjectModel,
+    VolumeAttachmentModel,
+)
 from dstack._internal.server.services.jobs import (
     process_terminating_job,
     process_volumes_detaching,
@@ -80,7 +85,7 @@ async def _process_job(session: AsyncSession, job_model: JobModel):
         .where(InstanceModel.id == job_model.used_instance_id)
         .options(
             joinedload(InstanceModel.project).joinedload(ProjectModel.backends),
-            joinedload(InstanceModel.volumes).joinedload(VolumeModel.user),
+            joinedload(InstanceModel.volume_attachments).joinedload(VolumeAttachmentModel.volume),
         )
     )
     instance_model = res.unique().scalar()
dstack/_internal/server/background/tasks/process_volumes.py

@@ -5,7 +5,12 @@ from sqlalchemy.orm import joinedload
 from dstack._internal.core.errors import BackendError, BackendNotAvailable
 from dstack._internal.core.models.volumes import VolumeStatus
 from dstack._internal.server.db import get_session_ctx
-from dstack._internal.server.models import ProjectModel, VolumeModel
+from dstack._internal.server.models import (
+    InstanceModel,
+    ProjectModel,
+    VolumeAttachmentModel,
+    VolumeModel,
+)
 from dstack._internal.server.services import backends as backends_services
 from dstack._internal.server.services import volumes as volumes_services
 from dstack._internal.server.services.locking import get_locker
@@ -49,6 +54,11 @@ async def _process_submitted_volume(session: AsyncSession, volume_model: VolumeModel):
         .where(VolumeModel.id == volume_model.id)
         .options(joinedload(VolumeModel.project).joinedload(ProjectModel.backends))
         .options(joinedload(VolumeModel.user))
+        .options(
+            joinedload(VolumeModel.attachments)
+            .joinedload(VolumeAttachmentModel.instance)
+            .joinedload(InstanceModel.fleet)
+        )
         .execution_options(populate_existing=True)
     )
     volume_model = res.unique().scalar_one()
dstack/_internal/server/migrations/versions/a751ef183f27_move_attachment_data_to_volumes_.py

@@ -0,0 +1,34 @@
+"""Move attachment_data to volumes_attachments
+
+Revision ID: a751ef183f27
+Revises: 1e76fb0dde87
+Create Date: 2025-02-12 13:19:57.569591
+
+"""
+
+import sqlalchemy as sa
+from alembic import op
+
+# revision identifiers, used by Alembic.
+revision = "a751ef183f27"
+down_revision = "1e76fb0dde87"
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    with op.batch_alter_table("volumes_attachments", schema=None) as batch_op:
+        batch_op.alter_column("instace_id", new_column_name="instance_id")
+        batch_op.add_column(sa.Column("attachment_data", sa.Text(), nullable=True))
+
+    # ### end Alembic commands ###
+
+
+def downgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    with op.batch_alter_table("volumes_attachments", schema=None) as batch_op:
+        batch_op.drop_column("attachment_data")
+        batch_op.alter_column("instance_id", new_column_name="instace_id")
+
+    # ### end Alembic commands ###
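The upgrade above fixes the long-standing instace_id typo in the association table and adds the attachment_data column that VolumeAttachmentModel maps below. As a hedged sketch (not dstack code; dstack normally applies migrations itself on server startup), the revision can be exercised by hand through Alembic's Python API, assuming a standard alembic.ini:

from alembic import command
from alembic.config import Config

cfg = Config("alembic.ini")  # config path is an assumption; adjust to your checkout
command.upgrade(cfg, "a751ef183f27")    # rename instace_id -> instance_id, add attachment_data
command.downgrade(cfg, "1e76fb0dde87")  # revert to the previous revision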
dstack/_internal/server/models.py

@@ -5,7 +5,6 @@ from typing import Callable, List, Optional, Union
 from sqlalchemy import (
     BigInteger,
     Boolean,
-    Column,
     DateTime,
     Enum,
     Float,
@@ -15,7 +14,6 @@ from sqlalchemy import (
     LargeBinary,
     MetaData,
     String,
-    Table,
     Text,
     TypeDecorator,
     UniqueConstraint,
@@ -554,10 +552,12 @@ class InstanceModel(BaseModel):
     jobs: Mapped[list["JobModel"]] = relationship(back_populates="instance", lazy="joined")
     last_job_processed_at: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime)
 
-    # volumes attached to the instance
-    volumes: Mapped[List["VolumeModel"]] = relationship(
-        secondary="volumes_attachments",
-        back_populates="instances",
+    volume_attachments: Mapped[List["VolumeAttachmentModel"]] = relationship(
+        back_populates="instance",
+        # Add delete-orphan option so that removing entries from volume_attachments
+        # automatically marks them for deletion.
+        # SQLAlchemy requires delete when using delete-orphan.
+        cascade="save-update, merge, delete-orphan, delete",
     )
 
 
@@ -587,23 +587,21 @@ class VolumeModel(BaseModel):
 
     configuration: Mapped[str] = mapped_column(Text)
     volume_provisioning_data: Mapped[Optional[str]] = mapped_column(Text)
-    # FIXME: volume_attachment_data should be in "volumes_attachments"
-    # to support multi-attach volumes
+
+    attachments: Mapped[List["VolumeAttachmentModel"]] = relationship(back_populates="volume")
+
+    # Deprecated in favor of VolumeAttachmentModel.attachment_data
     volume_attachment_data: Mapped[Optional[str]] = mapped_column(Text)
 
-    # instances the volume is attached to
-    instances: Mapped[List["InstanceModel"]] = relationship(
-        secondary="volumes_attachments",
-        back_populates="volumes",
-    )
 
+class VolumeAttachmentModel(BaseModel):
+    __tablename__ = "volumes_attachments"
 
-volumes_attachments_table = Table(
-    "volumes_attachments",
-    BackendModel.metadata,
-    Column("volume_id", ForeignKey("volumes.id"), primary_key=True),
-    Column("instace_id", ForeignKey("instances.id"), primary_key=True),
-)
+    volume_id: Mapped[uuid.UUID] = mapped_column(ForeignKey("volumes.id"), primary_key=True)
+    volume: Mapped[VolumeModel] = relationship(back_populates="attachments")
+    instance_id: Mapped[uuid.UUID] = mapped_column(ForeignKey("instances.id"), primary_key=True)
+    instance: Mapped[InstanceModel] = relationship(back_populates="volume_attachments")
+    attachment_data: Mapped[Optional[str]] = mapped_column(Text)
 
 
 class PlacementGroupModel(BaseModel):
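The two models.py hunks above replace a plain secondary many-to-many table with SQLAlchemy's association-object pattern: the link row becomes a mapped class so it can carry per-attachment state (attachment_data), and the delete-orphan cascade deletes link rows removed from InstanceModel.volume_attachments. A self-contained sketch of the same pattern with illustrative names (not dstack code):

from typing import List, Optional

from sqlalchemy import ForeignKey, Text, create_engine
from sqlalchemy.orm import DeclarativeBase, Mapped, Session, mapped_column, relationship


class Base(DeclarativeBase):
    pass


class Instance(Base):
    __tablename__ = "instances"
    id: Mapped[int] = mapped_column(primary_key=True)
    attachments: Mapped[List["Attachment"]] = relationship(
        back_populates="instance",
        cascade="save-update, merge, delete-orphan, delete",
    )


class Volume(Base):
    __tablename__ = "volumes"
    id: Mapped[int] = mapped_column(primary_key=True)
    attachments: Mapped[List["Attachment"]] = relationship(back_populates="volume")


class Attachment(Base):
    __tablename__ = "attachments"
    instance_id: Mapped[int] = mapped_column(ForeignKey("instances.id"), primary_key=True)
    volume_id: Mapped[int] = mapped_column(ForeignKey("volumes.id"), primary_key=True)
    attachment_data: Mapped[Optional[str]] = mapped_column(Text)
    instance: Mapped[Instance] = relationship(back_populates="attachments")
    volume: Mapped[Volume] = relationship(back_populates="attachments")


engine = create_engine("sqlite://")
Base.metadata.create_all(engine)
with Session(engine) as session:
    instance, volume = Instance(id=1), Volume(id=1)
    session.add_all([instance, volume])
    instance.attachments.append(Attachment(volume=volume, attachment_data="{}"))
    session.commit()
    instance.attachments.clear()  # delete-orphan marks the link row for deletion
    session.commit()
    assert session.query(Attachment).count() == 0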
dstack/_internal/server/routers/logs.py

@@ -24,4 +24,7 @@ async def poll_logs(
     user_project: Tuple[UserModel, ProjectModel] = Depends(ProjectMember()),
 ) -> JobSubmissionLogs:
     _, project = user_project
+    # The runner guarantees logs have different timestamps if throughput < 1k logs / sec.
+    # Otherwise, some logs with duplicated timestamps may be filtered out.
+    # This limitation is imposed by cloud log services that support up to millisecond timestamp resolution.
     return await logs.poll_logs_async(project=project, request=body)
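To illustrate the comment above (illustrative snippet, not dstack code): once timestamps are truncated to the millisecond resolution of a cloud log store, two events produced within the same millisecond collapse onto one timestamp key, so deduplication can silently drop one of them.

events = [
    (1_700_000_000.000123, "line a"),
    (1_700_000_000.000789, "line b"),  # same millisecond as "line a"
]
unique_ms = {int(ts * 1000) for ts, _ in events}
assert len(unique_ms) == 1  # both events land in the same millisecond bucket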
dstack/_internal/server/services/backends/configurators/aws.py

@@ -2,6 +2,7 @@ import concurrent.futures
 import json
 from typing import List
 
+import botocore.exceptions
 from boto3.session import Session
 
 from dstack._internal.core.backends.aws import AWSBackend, auth, compute, resources
@@ -35,6 +36,9 @@ from dstack._internal.server.services.backends.configurators.base import (
     Configurator,
     raise_invalid_credentials_error,
 )
+from dstack._internal.utils.logging import get_logger
+
+logger = get_logger(__name__)
 
 REGIONS = [
     ("US East, N. Virginia", "us-east-1"),
@@ -137,7 +141,8 @@ class AWSConfigurator(Configurator):
 
     def _check_config(self, session: Session, config: AWSConfigInfoWithCredsPartial):
         self._check_tags_config(config)
-        self._check_vpc_config(session=session, config=config)
+        self._check_iam_instance_profile_config(session, config)
+        self._check_vpc_config(session, config)
 
     def _check_tags_config(self, config: AWSConfigInfoWithCredsPartial):
         if not config.tags:
@@ -151,6 +156,31 @@ class AWSConfigurator(Configurator):
         except BackendError as e:
             raise ServerClientError(e.args[0])
 
+    def _check_iam_instance_profile_config(
+        self, session: Session, config: AWSConfigInfoWithCredsPartial
+    ):
+        if config.iam_instance_profile is None:
+            return
+        try:
+            iam_client = session.client("iam")
+            iam_client.get_instance_profile(InstanceProfileName=config.iam_instance_profile)
+        except botocore.exceptions.ClientError as e:
+            if e.response["Error"]["Code"] == "NoSuchEntity":
+                raise ServerClientError(
+                    f"IAM instance profile {config.iam_instance_profile} not found"
+                )
+            logger.exception(
+                "Got botocore.exceptions.ClientError when checking iam_instance_profile"
+            )
+            raise ServerClientError(
+                f"Failed to check IAM instance profile {config.iam_instance_profile}"
+            )
+        except Exception:
+            logger.exception("Got exception when checking iam_instance_profile")
+            raise ServerClientError(
+                f"Failed to check IAM instance profile {config.iam_instance_profile}"
+            )
+
     def _check_vpc_config(self, session: Session, config: AWSConfigInfoWithCredsPartial):
         allocate_public_ip = config.public_ips if config.public_ips is not None else True
         use_default_vpcs = config.default_vpcs if config.default_vpcs is not None else True
dstack/_internal/server/services/backends/configurators/gcp.py

@@ -127,10 +127,6 @@ class GCPConfigurator(Configurator):
             _, project_id = auth.authenticate(GCPDefaultCreds())
         except BackendAuthError:
             return []
-
-        if project_id is None:
-            return []
-
         return [
             GCPConfigInfoWithCreds(
                 project_id=project_id,
@@ -152,24 +148,21 @@ class GCPConfigurator(Configurator):
         ):
             raise_invalid_credentials_error(fields=[["creds"]])
         try:
-            credentials, project_id = auth.authenticate(creds=config.creds)
-        except BackendAuthError:
+            credentials, _ = auth.authenticate(creds=config.creds, project_id=config.project_id)
+        except BackendAuthError as e:
+            details = None
+            if len(e.args) > 0:
+                details = e.args[0]
             if is_core_model_instance(config.creds, GCPServiceAccountCreds):
-                raise_invalid_credentials_error(fields=[["creds", "data"]])
+                raise_invalid_credentials_error(fields=[["creds", "data"]], details=details)
             else:
-                raise_invalid_credentials_error(fields=[["creds"]])
-        if (
-            project_id is not None
-            and config.project_id is not None
-            and config.project_id != project_id
-        ):
-            raise ServerClientError(msg="Wrong project_id", fields=[["project_id"]])
-        config_values.project_id = self._get_project_id_element(selected=project_id)
+                raise_invalid_credentials_error(fields=[["creds"]], details=details)
         config_values.regions = self._get_regions_element(
             selected=config.regions or DEFAULT_REGIONS
         )
         if config.project_id is None:
             return config_values
+        config_values.project_id = self._get_project_id_element(selected=config.project_id)
         self._check_config(config=config, credentials=credentials)
         return config_values
 
dstack/_internal/server/services/config.py

@@ -107,6 +107,16 @@ class AWSConfig(CoreModel):
             )
         ),
     ] = None
+    iam_instance_profile: Annotated[
+        Optional[str],
+        Field(
+            description=(
+                "The name of the IAM instance profile to associate with EC2 instances."
+                " You can also specify the IAM role name for roles created via the AWS console."
+                " AWS automatically creates an instance profile and gives it the same name as the role"
+            )
+        ),
+    ] = None
     tags: Annotated[
         Optional[Dict[str, str]],
         Field(description="The tags that will be assigned to resources created by `dstack`"),
@@ -251,7 +261,7 @@ class GCPConfig(CoreModel):
         ),
     ] = None
     vm_service_account: Annotated[
-        Optional[str], Field(description="The service account associated with provisioned VMs")
+        Optional[str], Field(description="The service account to associate with provisioned VMs")
     ] = None
     tags: Annotated[
         Optional[Dict[str, str]],
dstack/_internal/server/services/fleets.py

@@ -257,6 +257,7 @@ async def get_plan(
         project=project,
         profile=spec.merged_profile,
         requirements=_get_fleet_requirements(spec),
+        fleet_spec=spec,
         blocks=spec.configuration.blocks,
     )
     offers = [offer for _, offer in offers_with_backends]
@@ -277,12 +278,15 @@ async def get_create_instance_offers(
     project: ProjectModel,
     profile: Profile,
     requirements: Requirements,
-    exclude_not_available=False,
+    fleet_spec: Optional[FleetSpec] = None,
     fleet_model: Optional[FleetModel] = None,
     blocks: Union[int, Literal["auto"]] = 1,
+    exclude_not_available: bool = False,
 ) -> List[Tuple[Backend, InstanceOfferWithAvailability]]:
     multinode = False
     master_job_provisioning_data = None
+    if fleet_spec is not None:
+        multinode = fleet_spec.configuration.placement == InstanceGroupPlacement.CLUSTER
     if fleet_model is not None:
         fleet = fleet_model_to_fleet(fleet_model)
         multinode = fleet.spec.configuration.placement == InstanceGroupPlacement.CLUSTER
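Note that exclude_not_available moved behind the new fleet_spec parameter, so a caller passing it positionally against the old signature would now bind it to fleet_spec. A hypothetical call site (names and the import path are assumptions inferred from this diff) that is robust to the reorder because it uses keywords:

from dstack._internal.server.services.fleets import get_create_instance_offers  # path assumed


async def plan_offers(project, profile, requirements, spec):
    # Keyword arguments are unaffected by the 0.18.41 -> 0.18.43 parameter reorder.
    return await get_create_instance_offers(
        project=project,
        profile=profile,
        requirements=requirements,
        fleet_spec=spec,  # new in 0.18.43
        exclude_not_available=True,
    )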
dstack/_internal/server/services/jobs/__init__.py

@@ -236,13 +236,14 @@ async def process_terminating_job(
         logger.debug("%s: stopping container", fmt(job_model))
         ssh_private_keys = get_instance_ssh_private_keys(instance_model)
         await stop_container(job_model, jpd, ssh_private_keys)
-    volume_models: list[VolumeModel]
     if jrd is not None and jrd.volume_names is not None:
-        volume_models = await list_project_volume_models(
-            session=session, project=instance_model.project, names=jrd.volume_names
-        )
+        volume_names = jrd.volume_names
     else:
-        volume_models = instance_model.volumes
+        # Legacy jobs before job_runtime_data/blocks were introduced
+        volume_names = [va.volume.name for va in instance_model.volume_attachments]
+    volume_models = await list_project_volume_models(
+        session=session, project=instance_model.project, names=volume_names
+    )
     if len(volume_models) > 0:
         logger.info("Detaching volumes: %s", [v.name for v in volume_models])
         all_volumes_detached = await _detach_volumes_from_job_instance(
@@ -302,11 +303,13 @@ async def process_volumes_detaching(
     jpd = get_or_error(get_job_provisioning_data(job_model))
     jrd = get_job_runtime_data(job_model)
     if jrd is not None and jrd.volume_names is not None:
-        volume_models = await list_project_volume_models(
-            session=session, project=instance_model.project, names=jrd.volume_names
-        )
+        volume_names = jrd.volume_names
     else:
-        volume_models = instance_model.volumes
+        # Legacy jobs before job_runtime_data/blocks were introduced
+        volume_names = [va.volume.name for va in instance_model.volume_attachments]
+    volume_models = await list_project_volume_models(
+        session=session, project=instance_model.project, names=volume_names
+    )
     logger.info("Detaching volumes: %s", [v.name for v in volume_models])
     all_volumes_detached = await _detach_volumes_from_job_instance(
         project=instance_model.project,
@@ -439,8 +442,8 @@ async def _detach_volumes_from_job_instance(
     if job_model.volumes_detached_at is None:
         job_model.volumes_detached_at = common.get_current_datetime()
     detached_volumes_ids = {v.id for v in detached_volumes}
-    instance_model.volumes = [
-        v for v in instance_model.volumes if v.id not in detached_volumes_ids
+    instance_model.volume_attachments = [
+        va for va in instance_model.volume_attachments if va.volume_id not in detached_volumes_ids
     ]
     return all_detached
 
dstack/_internal/server/services/jobs/configurators/dev.py

@@ -6,8 +6,6 @@ from dstack._internal.core.models.runs import RunSpec
 from dstack._internal.server.services.jobs.configurators.base import JobConfigurator
 from dstack._internal.server.services.jobs.configurators.extensions.vscode import VSCodeDesktop
 
-DEFAULT_MAX_DURATION_SECONDS = 6 * 3600
-
 INSTALL_IPYKERNEL = (
     "(echo pip install ipykernel... && pip install -q --no-cache-dir ipykernel 2> /dev/null) || "
     'echo "no pip, ipykernel was not installed"'
@@ -44,7 +42,7 @@ class DevEnvironmentJobConfigurator(JobConfigurator):
         return False
 
     def _default_max_duration(self) -> Optional[int]:
-        return DEFAULT_MAX_DURATION_SECONDS
+        return None
 
     def _spot_policy(self) -> SpotPolicy:
         return self.run_spec.merged_profile.spot_policy or SpotPolicy.ONDEMAND
dstack/_internal/server/services/jobs/configurators/task.py

@@ -5,8 +5,6 @@ from dstack._internal.core.models.profiles import SpotPolicy
 from dstack._internal.core.models.runs import JobSpec
 from dstack._internal.server.services.jobs.configurators.base import JobConfigurator
 
-DEFAULT_MAX_DURATION_SECONDS = 72 * 3600
-
 
 class TaskJobConfigurator(JobConfigurator):
     TYPE: RunConfigurationType = RunConfigurationType.TASK
@@ -29,7 +27,7 @@ class TaskJobConfigurator(JobConfigurator):
         return True
 
     def _default_max_duration(self) -> Optional[int]:
-        return DEFAULT_MAX_DURATION_SECONDS
+        return None
 
     def _spot_policy(self) -> SpotPolicy:
         return self.run_spec.merged_profile.spot_policy or SpotPolicy.ONDEMAND
dstack/_internal/server/services/logs/__init__.py

@@ -0,0 +1,78 @@
+import atexit
+from typing import List, Optional
+from uuid import UUID
+
+from dstack._internal.core.models.logs import JobSubmissionLogs
+from dstack._internal.server import settings
+from dstack._internal.server.models import ProjectModel
+from dstack._internal.server.schemas.logs import PollLogsRequest
+from dstack._internal.server.schemas.runner import LogEvent as RunnerLogEvent
+from dstack._internal.server.services.logs.aws import BOTO_AVAILABLE, CloudWatchLogStorage
+from dstack._internal.server.services.logs.base import LogStorage, LogStorageError
+from dstack._internal.server.services.logs.filelog import FileLogStorage
+from dstack._internal.server.services.logs.gcp import GCP_LOGGING_AVAILABLE, GCPLogStorage
+from dstack._internal.utils.common import run_async
+from dstack._internal.utils.logging import get_logger
+
+logger = get_logger(__name__)
+
+
+_log_storage: Optional[LogStorage] = None
+
+
+def get_log_storage() -> LogStorage:
+    global _log_storage
+    if _log_storage is not None:
+        return _log_storage
+    if settings.SERVER_CLOUDWATCH_LOG_GROUP:
+        if BOTO_AVAILABLE:
+            try:
+                _log_storage = CloudWatchLogStorage(
+                    group=settings.SERVER_CLOUDWATCH_LOG_GROUP,
+                    region=settings.SERVER_CLOUDWATCH_LOG_REGION,
+                )
+            except LogStorageError as e:
+                logger.error("Failed to initialize CloudWatch Logs storage: %s", e)
+            except Exception:
+                logger.exception("Got exception when initializing CloudWatch Logs storage")
+            else:
+                logger.debug("Using CloudWatch Logs storage")
+        else:
+            logger.error("Cannot use CloudWatch Logs storage: boto3 is not installed")
+    elif settings.SERVER_GCP_LOGGING_PROJECT:
+        if GCP_LOGGING_AVAILABLE:
+            try:
+                _log_storage = GCPLogStorage(project_id=settings.SERVER_GCP_LOGGING_PROJECT)
+            except LogStorageError as e:
+                logger.error("Failed to initialize GCP Logs storage: %s", e)
+            except Exception:
+                logger.exception("Got exception when initializing GCP Logs storage")
+            else:
+                logger.debug("Using GCP Logs storage")
+        else:
+            logger.error("Cannot use GCP Logs storage: GCP deps are not installed")
+    if _log_storage is None:
+        _log_storage = FileLogStorage()
+        logger.debug("Using file-based storage")
+    atexit.register(_log_storage.close)
+    return _log_storage
+
+
+def write_logs(
+    project: ProjectModel,
+    run_name: str,
+    job_submission_id: UUID,
+    runner_logs: List[RunnerLogEvent],
+    job_logs: List[RunnerLogEvent],
+) -> None:
+    return get_log_storage().write_logs(
+        project=project,
+        run_name=run_name,
+        job_submission_id=job_submission_id,
+        runner_logs=runner_logs,
+        job_logs=job_logs,
+    )
+
+
+async def poll_logs_async(project: ProjectModel, request: PollLogsRequest) -> JobSubmissionLogs:
+    return await run_async(get_log_storage().poll_logs, project=project, request=request)
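The new logs package selects CloudWatch, GCP Logging, or file-based storage once per process and falls back to FileLogStorage if the configured backend cannot be initialized. A hedged sketch of a custom backend, assuming LogStorage requires only the three methods the factory above relies on (write_logs, poll_logs, close); InMemoryLogStorage and its internals are hypothetical and not part of dstack:

from collections import defaultdict

from dstack._internal.server.services.logs.base import LogStorage


class InMemoryLogStorage(LogStorage):
    """Keeps logs in process memory; suitable only for tests."""

    def __init__(self):
        self._logs = defaultdict(list)

    def write_logs(self, project, run_name, job_submission_id, runner_logs, job_logs):
        key = (project.name, str(job_submission_id))
        self._logs[key].extend(runner_logs + job_logs)

    def poll_logs(self, project, request):
        raise NotImplementedError  # pagination and filtering omitted from this sketch

    def close(self):
        self._logs.clear()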