dstack 0.19.26__py3-none-any.whl → 0.19.28__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dstack might be problematic. Click here for more details.
- dstack/_internal/cli/commands/__init__.py +11 -8
- dstack/_internal/cli/commands/apply.py +6 -3
- dstack/_internal/cli/commands/completion.py +3 -1
- dstack/_internal/cli/commands/config.py +1 -0
- dstack/_internal/cli/commands/init.py +4 -4
- dstack/_internal/cli/commands/offer.py +1 -1
- dstack/_internal/cli/commands/project.py +1 -0
- dstack/_internal/cli/commands/server.py +2 -2
- dstack/_internal/cli/main.py +1 -1
- dstack/_internal/cli/services/configurators/base.py +2 -4
- dstack/_internal/cli/services/configurators/fleet.py +4 -5
- dstack/_internal/cli/services/configurators/gateway.py +3 -5
- dstack/_internal/cli/services/configurators/run.py +165 -43
- dstack/_internal/cli/services/configurators/volume.py +3 -5
- dstack/_internal/cli/services/repos.py +1 -18
- dstack/_internal/core/backends/amddevcloud/__init__.py +1 -0
- dstack/_internal/core/backends/amddevcloud/backend.py +16 -0
- dstack/_internal/core/backends/amddevcloud/compute.py +5 -0
- dstack/_internal/core/backends/amddevcloud/configurator.py +29 -0
- dstack/_internal/core/backends/aws/compute.py +6 -1
- dstack/_internal/core/backends/base/compute.py +33 -5
- dstack/_internal/core/backends/base/offers.py +2 -0
- dstack/_internal/core/backends/configurators.py +15 -0
- dstack/_internal/core/backends/digitalocean/__init__.py +1 -0
- dstack/_internal/core/backends/digitalocean/backend.py +16 -0
- dstack/_internal/core/backends/digitalocean/compute.py +5 -0
- dstack/_internal/core/backends/digitalocean/configurator.py +31 -0
- dstack/_internal/core/backends/digitalocean_base/__init__.py +1 -0
- dstack/_internal/core/backends/digitalocean_base/api_client.py +104 -0
- dstack/_internal/core/backends/digitalocean_base/backend.py +5 -0
- dstack/_internal/core/backends/digitalocean_base/compute.py +173 -0
- dstack/_internal/core/backends/digitalocean_base/configurator.py +57 -0
- dstack/_internal/core/backends/digitalocean_base/models.py +43 -0
- dstack/_internal/core/backends/gcp/compute.py +32 -8
- dstack/_internal/core/backends/hotaisle/api_client.py +25 -33
- dstack/_internal/core/backends/hotaisle/compute.py +1 -6
- dstack/_internal/core/backends/models.py +7 -0
- dstack/_internal/core/backends/nebius/compute.py +0 -7
- dstack/_internal/core/backends/oci/compute.py +4 -5
- dstack/_internal/core/backends/vultr/compute.py +1 -5
- dstack/_internal/core/compatibility/fleets.py +5 -0
- dstack/_internal/core/compatibility/runs.py +10 -1
- dstack/_internal/core/models/backends/base.py +5 -1
- dstack/_internal/core/models/common.py +67 -43
- dstack/_internal/core/models/configurations.py +109 -69
- dstack/_internal/core/models/files.py +1 -1
- dstack/_internal/core/models/fleets.py +115 -25
- dstack/_internal/core/models/instances.py +5 -5
- dstack/_internal/core/models/profiles.py +66 -47
- dstack/_internal/core/models/repos/remote.py +21 -16
- dstack/_internal/core/models/resources.py +69 -65
- dstack/_internal/core/models/runs.py +41 -14
- dstack/_internal/core/services/repos.py +85 -80
- dstack/_internal/server/app.py +5 -0
- dstack/_internal/server/background/tasks/process_fleets.py +117 -13
- dstack/_internal/server/background/tasks/process_instances.py +12 -71
- dstack/_internal/server/background/tasks/process_running_jobs.py +2 -0
- dstack/_internal/server/background/tasks/process_runs.py +2 -0
- dstack/_internal/server/background/tasks/process_submitted_jobs.py +48 -16
- dstack/_internal/server/migrations/versions/2498ab323443_add_fleetmodel_consolidation_attempt_.py +44 -0
- dstack/_internal/server/models.py +11 -7
- dstack/_internal/server/schemas/gateways.py +10 -9
- dstack/_internal/server/schemas/runner.py +1 -0
- dstack/_internal/server/services/backends/handlers.py +2 -0
- dstack/_internal/server/services/docker.py +8 -7
- dstack/_internal/server/services/fleets.py +23 -25
- dstack/_internal/server/services/instances.py +3 -3
- dstack/_internal/server/services/jobs/configurators/base.py +46 -6
- dstack/_internal/server/services/jobs/configurators/dev.py +4 -4
- dstack/_internal/server/services/jobs/configurators/extensions/cursor.py +3 -5
- dstack/_internal/server/services/jobs/configurators/extensions/vscode.py +4 -6
- dstack/_internal/server/services/jobs/configurators/service.py +0 -3
- dstack/_internal/server/services/jobs/configurators/task.py +0 -3
- dstack/_internal/server/services/projects.py +52 -1
- dstack/_internal/server/services/runs.py +16 -0
- dstack/_internal/server/settings.py +46 -0
- dstack/_internal/server/statics/index.html +1 -1
- dstack/_internal/server/statics/{main-aec4762350e34d6fbff9.css → main-5e0d56245c4bd241ec27.css} +1 -1
- dstack/_internal/server/statics/{main-d151b300fcac3933213d.js → main-a2a16772fbf11a14d191.js} +1215 -998
- dstack/_internal/server/statics/{main-d151b300fcac3933213d.js.map → main-a2a16772fbf11a14d191.js.map} +1 -1
- dstack/_internal/server/testing/common.py +6 -3
- dstack/_internal/utils/env.py +85 -11
- dstack/_internal/utils/path.py +8 -1
- dstack/_internal/utils/ssh.py +7 -0
- dstack/api/_public/repos.py +41 -6
- dstack/api/_public/runs.py +14 -1
- dstack/version.py +1 -1
- {dstack-0.19.26.dist-info → dstack-0.19.28.dist-info}/METADATA +2 -2
- {dstack-0.19.26.dist-info → dstack-0.19.28.dist-info}/RECORD +92 -78
- dstack/_internal/server/statics/static/media/github.1f7102513534c83a9d8d735d2b8c12a2.svg +0 -3
- {dstack-0.19.26.dist-info → dstack-0.19.28.dist-info}/WHEEL +0 -0
- {dstack-0.19.26.dist-info → dstack-0.19.28.dist-info}/entry_points.txt +0 -0
- {dstack-0.19.26.dist-info → dstack-0.19.28.dist-info}/licenses/LICENSE.md +0 -0
|
@@ -16,6 +16,7 @@ from dstack._internal.core.models.common import NetworkMode
|
|
|
16
16
|
from dstack._internal.core.models.fleets import (
|
|
17
17
|
Fleet,
|
|
18
18
|
FleetConfiguration,
|
|
19
|
+
FleetNodesSpec,
|
|
19
20
|
FleetSpec,
|
|
20
21
|
FleetStatus,
|
|
21
22
|
InstanceGroupPlacement,
|
|
@@ -26,7 +27,7 @@ from dstack._internal.core.models.profiles import (
|
|
|
26
27
|
CreationPolicy,
|
|
27
28
|
TerminationPolicy,
|
|
28
29
|
)
|
|
29
|
-
from dstack._internal.core.models.resources import Memory
|
|
30
|
+
from dstack._internal.core.models.resources import Memory
|
|
30
31
|
from dstack._internal.core.models.runs import (
|
|
31
32
|
Job,
|
|
32
33
|
JobProvisioningData,
|
|
@@ -53,7 +54,9 @@ from dstack._internal.server.models import (
|
|
|
53
54
|
from dstack._internal.server.services.backends import get_project_backend_by_type_or_error
|
|
54
55
|
from dstack._internal.server.services.fleets import (
|
|
55
56
|
fleet_model_to_fleet,
|
|
57
|
+
generate_fleet_name,
|
|
56
58
|
get_fleet_requirements,
|
|
59
|
+
get_next_instance_num,
|
|
57
60
|
)
|
|
58
61
|
from dstack._internal.server.services.instances import (
|
|
59
62
|
filter_pool_instances,
|
|
@@ -69,7 +72,7 @@ from dstack._internal.server.services.jobs import (
|
|
|
69
72
|
get_job_configured_volumes,
|
|
70
73
|
get_job_runtime_data,
|
|
71
74
|
)
|
|
72
|
-
from dstack._internal.server.services.locking import get_locker
|
|
75
|
+
from dstack._internal.server.services.locking import get_locker, string_to_lock_id
|
|
73
76
|
from dstack._internal.server.services.logging import fmt
|
|
74
77
|
from dstack._internal.server.services.offers import get_offers_by_requirements
|
|
75
78
|
from dstack._internal.server.services.requirements.combine import (
|
|
@@ -85,7 +88,6 @@ from dstack._internal.server.services.volumes import (
|
|
|
85
88
|
)
|
|
86
89
|
from dstack._internal.server.utils import sentry_utils
|
|
87
90
|
from dstack._internal.utils import common as common_utils
|
|
88
|
-
from dstack._internal.utils import env as env_utils
|
|
89
91
|
from dstack._internal.utils.logging import get_logger
|
|
90
92
|
|
|
91
93
|
logger = get_logger(__name__)
|
|
@@ -186,6 +188,7 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
|
|
|
186
188
|
run_spec = run.run_spec
|
|
187
189
|
profile = run_spec.merged_profile
|
|
188
190
|
job = find_job(run.jobs, job_model.replica_num, job_model.job_num)
|
|
191
|
+
multinode = job.job_spec.jobs_per_replica > 1
|
|
189
192
|
|
|
190
193
|
# Master job chooses fleet for the run.
|
|
191
194
|
# Due to two-step processing, it's saved to job_model.fleet.
|
|
@@ -308,6 +311,7 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
|
|
|
308
311
|
session=session,
|
|
309
312
|
instances_with_offers=fleet_instances_with_offers,
|
|
310
313
|
job_model=job_model,
|
|
314
|
+
multinode=multinode,
|
|
311
315
|
)
|
|
312
316
|
job_model.fleet = fleet_model
|
|
313
317
|
job_model.instance_assigned = True
|
|
@@ -361,7 +365,8 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
|
|
|
361
365
|
job_model.job_provisioning_data = job_provisioning_data.json()
|
|
362
366
|
job_model.status = JobStatus.PROVISIONING
|
|
363
367
|
if fleet_model is None:
|
|
364
|
-
fleet_model = _create_fleet_model_for_job(
|
|
368
|
+
fleet_model = await _create_fleet_model_for_job(
|
|
369
|
+
session=session,
|
|
365
370
|
project=project,
|
|
366
371
|
run=run,
|
|
367
372
|
)
|
|
@@ -383,7 +388,9 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
|
|
|
383
388
|
offer=offer,
|
|
384
389
|
instance_num=instance_num,
|
|
385
390
|
)
|
|
386
|
-
job_model.job_runtime_data = _prepare_job_runtime_data(offer).json()
|
|
391
|
+
job_model.job_runtime_data = _prepare_job_runtime_data(offer, multinode).json()
|
|
392
|
+
# Both this task and process_fleets can add instances to fleets.
|
|
393
|
+
# TODO: Ensure this does not violate nodes.max when it's enforced.
|
|
387
394
|
instance.fleet_id = fleet_model.id
|
|
388
395
|
logger.info(
|
|
389
396
|
"The job %s created the new instance %s",
|
|
@@ -610,6 +617,7 @@ async def _assign_job_to_fleet_instance(
|
|
|
610
617
|
session: AsyncSession,
|
|
611
618
|
instances_with_offers: list[tuple[InstanceModel, InstanceOfferWithAvailability]],
|
|
612
619
|
job_model: JobModel,
|
|
620
|
+
multinode: bool,
|
|
613
621
|
) -> Optional[InstanceModel]:
|
|
614
622
|
if len(instances_with_offers) == 0:
|
|
615
623
|
return None
|
|
@@ -639,7 +647,7 @@ async def _assign_job_to_fleet_instance(
|
|
|
639
647
|
job_model.instance = instance
|
|
640
648
|
job_model.used_instance_id = instance.id
|
|
641
649
|
job_model.job_provisioning_data = instance.job_provisioning_data
|
|
642
|
-
job_model.job_runtime_data = _prepare_job_runtime_data(offer).json()
|
|
650
|
+
job_model.job_runtime_data = _prepare_job_runtime_data(offer, multinode).json()
|
|
643
651
|
return instance
|
|
644
652
|
|
|
645
653
|
|
|
@@ -748,26 +756,42 @@ def _check_can_create_new_instance_in_fleet(fleet: Fleet) -> bool:
|
|
|
748
756
|
return True
|
|
749
757
|
|
|
750
758
|
|
|
751
|
-
def _create_fleet_model_for_job(
|
|
759
|
+
async def _create_fleet_model_for_job(
|
|
760
|
+
session: AsyncSession,
|
|
752
761
|
project: ProjectModel,
|
|
753
762
|
run: Run,
|
|
754
763
|
) -> FleetModel:
|
|
755
764
|
placement = InstanceGroupPlacement.ANY
|
|
756
765
|
if run.run_spec.configuration.type == "task" and run.run_spec.configuration.nodes > 1:
|
|
757
766
|
placement = InstanceGroupPlacement.CLUSTER
|
|
767
|
+
nodes = _get_nodes_required_num_for_run(run.run_spec)
|
|
768
|
+
|
|
769
|
+
lock_namespace = f"fleet_names_{project.name}"
|
|
770
|
+
# TODO: Lock fleet names on SQLite.
|
|
771
|
+
# Needs some refactoring so that the lock is released after commit.
|
|
772
|
+
if get_db().dialect_name == "postgresql":
|
|
773
|
+
await session.execute(
|
|
774
|
+
select(func.pg_advisory_xact_lock(string_to_lock_id(lock_namespace)))
|
|
775
|
+
)
|
|
776
|
+
fleet_name = await generate_fleet_name(session=session, project=project)
|
|
777
|
+
|
|
758
778
|
spec = FleetSpec(
|
|
759
779
|
configuration=FleetConfiguration(
|
|
760
|
-
name=
|
|
780
|
+
name=fleet_name,
|
|
761
781
|
placement=placement,
|
|
762
782
|
reservation=run.run_spec.configuration.reservation,
|
|
763
|
-
nodes=
|
|
783
|
+
nodes=FleetNodesSpec(
|
|
784
|
+
min=nodes,
|
|
785
|
+
target=nodes,
|
|
786
|
+
max=None,
|
|
787
|
+
),
|
|
764
788
|
),
|
|
765
789
|
profile=run.run_spec.merged_profile,
|
|
766
790
|
autocreated=True,
|
|
767
791
|
)
|
|
768
792
|
fleet_model = FleetModel(
|
|
769
793
|
id=uuid.uuid4(),
|
|
770
|
-
name=
|
|
794
|
+
name=fleet_name,
|
|
771
795
|
project=project,
|
|
772
796
|
status=FleetStatus.ACTIVE,
|
|
773
797
|
spec=spec.json(),
|
|
@@ -778,10 +802,13 @@ def _create_fleet_model_for_job(
|
|
|
778
802
|
|
|
779
803
|
async def _get_next_instance_num(session: AsyncSession, fleet_model: FleetModel) -> int:
|
|
780
804
|
res = await session.execute(
|
|
781
|
-
select(
|
|
805
|
+
select(InstanceModel.instance_num).where(
|
|
806
|
+
InstanceModel.fleet_id == fleet_model.id,
|
|
807
|
+
InstanceModel.deleted.is_(False),
|
|
808
|
+
)
|
|
782
809
|
)
|
|
783
|
-
|
|
784
|
-
return
|
|
810
|
+
taken_instance_nums = set(res.scalars().all())
|
|
811
|
+
return get_next_instance_num(taken_instance_nums)
|
|
785
812
|
|
|
786
813
|
|
|
787
814
|
def _create_instance_model_for_job(
|
|
@@ -827,12 +854,17 @@ def _create_instance_model_for_job(
|
|
|
827
854
|
return instance
|
|
828
855
|
|
|
829
856
|
|
|
830
|
-
def _prepare_job_runtime_data(
|
|
857
|
+
def _prepare_job_runtime_data(
|
|
858
|
+
offer: InstanceOfferWithAvailability, multinode: bool
|
|
859
|
+
) -> JobRuntimeData:
|
|
831
860
|
if offer.blocks == offer.total_blocks:
|
|
832
|
-
if
|
|
861
|
+
if settings.JOB_NETWORK_MODE == settings.JobNetworkMode.FORCED_BRIDGE:
|
|
833
862
|
network_mode = NetworkMode.BRIDGE
|
|
834
|
-
|
|
863
|
+
elif settings.JOB_NETWORK_MODE == settings.JobNetworkMode.HOST_WHEN_POSSIBLE:
|
|
835
864
|
network_mode = NetworkMode.HOST
|
|
865
|
+
else:
|
|
866
|
+
assert settings.JOB_NETWORK_MODE == settings.JobNetworkMode.HOST_FOR_MULTINODE_ONLY
|
|
867
|
+
network_mode = NetworkMode.HOST if multinode else NetworkMode.BRIDGE
|
|
836
868
|
return JobRuntimeData(
|
|
837
869
|
network_mode=network_mode,
|
|
838
870
|
offer=offer,
|
dstack/_internal/server/migrations/versions/2498ab323443_add_fleetmodel_consolidation_attempt_.py
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
"""Add FleetModel.consolidation_attempt and FleetModel.last_consolidated_at
|
|
2
|
+
|
|
3
|
+
Revision ID: 2498ab323443
|
|
4
|
+
Revises: e2d08cd1b8d9
|
|
5
|
+
Create Date: 2025-08-29 16:08:48.686595
|
|
6
|
+
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import sqlalchemy as sa
|
|
10
|
+
from alembic import op
|
|
11
|
+
|
|
12
|
+
import dstack._internal.server.models
|
|
13
|
+
|
|
14
|
+
# revision identifiers, used by Alembic.
|
|
15
|
+
revision = "2498ab323443"
|
|
16
|
+
down_revision = "e2d08cd1b8d9"
|
|
17
|
+
branch_labels = None
|
|
18
|
+
depends_on = None
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def upgrade() -> None:
|
|
22
|
+
# ### commands auto generated by Alembic - please adjust! ###
|
|
23
|
+
with op.batch_alter_table("fleets", schema=None) as batch_op:
|
|
24
|
+
batch_op.add_column(
|
|
25
|
+
sa.Column("consolidation_attempt", sa.Integer(), server_default="0", nullable=False)
|
|
26
|
+
)
|
|
27
|
+
batch_op.add_column(
|
|
28
|
+
sa.Column(
|
|
29
|
+
"last_consolidated_at",
|
|
30
|
+
dstack._internal.server.models.NaiveDateTime(),
|
|
31
|
+
nullable=True,
|
|
32
|
+
)
|
|
33
|
+
)
|
|
34
|
+
|
|
35
|
+
# ### end Alembic commands ###
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def downgrade() -> None:
|
|
39
|
+
# ### commands auto generated by Alembic - please adjust! ###
|
|
40
|
+
with op.batch_alter_table("fleets", schema=None) as batch_op:
|
|
41
|
+
batch_op.drop_column("last_consolidated_at")
|
|
42
|
+
batch_op.drop_column("consolidation_attempt")
|
|
43
|
+
|
|
44
|
+
# ### end Alembic commands ###
|
|
@@ -24,7 +24,7 @@ from sqlalchemy_utils import UUIDType
|
|
|
24
24
|
|
|
25
25
|
from dstack._internal.core.errors import DstackError
|
|
26
26
|
from dstack._internal.core.models.backends.base import BackendType
|
|
27
|
-
from dstack._internal.core.models.common import
|
|
27
|
+
from dstack._internal.core.models.common import CoreConfig, generate_dual_core_model
|
|
28
28
|
from dstack._internal.core.models.fleets import FleetStatus
|
|
29
29
|
from dstack._internal.core.models.gateways import GatewayStatus
|
|
30
30
|
from dstack._internal.core.models.health import HealthStatus
|
|
@@ -71,7 +71,11 @@ class NaiveDateTime(TypeDecorator):
|
|
|
71
71
|
return value.replace(tzinfo=timezone.utc)
|
|
72
72
|
|
|
73
73
|
|
|
74
|
-
class
|
|
74
|
+
class DecryptedStringConfig(CoreConfig):
|
|
75
|
+
arbitrary_types_allowed = True
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
class DecryptedString(generate_dual_core_model(DecryptedStringConfig)):
|
|
75
79
|
"""
|
|
76
80
|
A type for representing plaintext strings encrypted with `EncryptedString`.
|
|
77
81
|
Besides the string, stores information if the decryption was successful.
|
|
@@ -84,9 +88,6 @@ class DecryptedString(CoreModel):
|
|
|
84
88
|
decrypted: bool = True
|
|
85
89
|
exc: Optional[Exception] = None
|
|
86
90
|
|
|
87
|
-
class Config(CoreModel.Config):
|
|
88
|
-
arbitrary_types_allowed = True
|
|
89
|
-
|
|
90
91
|
def get_plaintext_or_error(self) -> str:
|
|
91
92
|
if self.decrypted and self.plaintext is not None:
|
|
92
93
|
return self.plaintext
|
|
@@ -551,6 +552,9 @@ class FleetModel(BaseModel):
|
|
|
551
552
|
jobs: Mapped[List["JobModel"]] = relationship(back_populates="fleet")
|
|
552
553
|
instances: Mapped[List["InstanceModel"]] = relationship(back_populates="fleet")
|
|
553
554
|
|
|
555
|
+
consolidation_attempt: Mapped[int] = mapped_column(Integer, server_default="0")
|
|
556
|
+
last_consolidated_at: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime)
|
|
557
|
+
|
|
554
558
|
|
|
555
559
|
class InstanceModel(BaseModel):
|
|
556
560
|
__tablename__ = "instances"
|
|
@@ -605,8 +609,8 @@ class InstanceModel(BaseModel):
|
|
|
605
609
|
Integer, default=DEFAULT_FLEET_TERMINATION_IDLE_TIME
|
|
606
610
|
)
|
|
607
611
|
|
|
608
|
-
#
|
|
609
|
-
last_retry_at: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime)
|
|
612
|
+
# Deprecated
|
|
613
|
+
last_retry_at: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime, deferred=True)
|
|
610
614
|
|
|
611
615
|
# instance termination handling
|
|
612
616
|
termination_deadline: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime)
|
|
@@ -3,24 +3,25 @@ from typing import Annotated, Any, Dict, List, Optional
|
|
|
3
3
|
from pydantic import Field
|
|
4
4
|
|
|
5
5
|
from dstack._internal.core.models.backends.base import BackendType
|
|
6
|
-
from dstack._internal.core.models.common import CoreModel
|
|
6
|
+
from dstack._internal.core.models.common import CoreConfig, CoreModel, generate_dual_core_model
|
|
7
7
|
from dstack._internal.core.models.gateways import GatewayConfiguration
|
|
8
8
|
|
|
9
9
|
|
|
10
|
-
class
|
|
10
|
+
class CreateGatewayRequestConfig(CoreConfig):
|
|
11
|
+
@staticmethod
|
|
12
|
+
def schema_extra(schema: Dict[str, Any]):
|
|
13
|
+
del schema["properties"]["name"]
|
|
14
|
+
del schema["properties"]["backend_type"]
|
|
15
|
+
del schema["properties"]["region"]
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class CreateGatewayRequest(generate_dual_core_model(CreateGatewayRequestConfig)):
|
|
11
19
|
configuration: GatewayConfiguration
|
|
12
20
|
# Deprecated and unused. Left for compatibility with 0.18 clients.
|
|
13
21
|
name: Annotated[Optional[str], Field(exclude=True)] = None
|
|
14
22
|
backend_type: Annotated[Optional[BackendType], Field(exclude=True)] = None
|
|
15
23
|
region: Annotated[Optional[str], Field(exclude=True)] = None
|
|
16
24
|
|
|
17
|
-
class Config(CoreModel.Config):
|
|
18
|
-
@staticmethod
|
|
19
|
-
def schema_extra(schema: Dict[str, Any]) -> None:
|
|
20
|
-
del schema["properties"]["name"]
|
|
21
|
-
del schema["properties"]["backend_type"]
|
|
22
|
-
del schema["properties"]["region"]
|
|
23
|
-
|
|
24
25
|
|
|
25
26
|
class GetGatewayRequest(CoreModel):
|
|
26
27
|
name: str
|
|
@@ -9,7 +9,11 @@ from pydantic import Field, ValidationError, validator
|
|
|
9
9
|
from typing_extensions import Annotated
|
|
10
10
|
|
|
11
11
|
from dstack._internal.core.errors import DockerRegistryError
|
|
12
|
-
from dstack._internal.core.models.common import
|
|
12
|
+
from dstack._internal.core.models.common import (
|
|
13
|
+
CoreModel,
|
|
14
|
+
FrozenCoreModel,
|
|
15
|
+
RegistryAuth,
|
|
16
|
+
)
|
|
13
17
|
from dstack._internal.server.utils.common import join_byte_stream_checked
|
|
14
18
|
from dstack._internal.utils.dxf import PatchedDXF
|
|
15
19
|
|
|
@@ -31,15 +35,12 @@ class DXFAuthAdapter:
|
|
|
31
35
|
)
|
|
32
36
|
|
|
33
37
|
|
|
34
|
-
class DockerImage(
|
|
38
|
+
class DockerImage(FrozenCoreModel):
|
|
35
39
|
image: str
|
|
36
|
-
registry: Optional[str]
|
|
40
|
+
registry: Optional[str] = None
|
|
37
41
|
repo: str
|
|
38
42
|
tag: str
|
|
39
|
-
digest: Optional[str]
|
|
40
|
-
|
|
41
|
-
class Config(CoreModel.Config):
|
|
42
|
-
frozen = True
|
|
43
|
+
digest: Optional[str] = None
|
|
43
44
|
|
|
44
45
|
|
|
45
46
|
class ImageConfig(CoreModel):
|
|
@@ -449,25 +449,24 @@ async def create_fleet(
|
|
|
449
449
|
return await _create_fleet(session=session, project=project, user=user, spec=spec)
|
|
450
450
|
|
|
451
451
|
|
|
452
|
-
|
|
452
|
+
def create_fleet_instance_model(
|
|
453
453
|
session: AsyncSession,
|
|
454
454
|
project: ProjectModel,
|
|
455
|
-
|
|
455
|
+
username: str,
|
|
456
456
|
spec: FleetSpec,
|
|
457
|
-
reservation: Optional[str],
|
|
458
457
|
instance_num: int,
|
|
459
458
|
) -> InstanceModel:
|
|
460
459
|
profile = spec.merged_profile
|
|
461
460
|
requirements = get_fleet_requirements(spec)
|
|
462
|
-
instance_model =
|
|
461
|
+
instance_model = instances_services.create_instance_model(
|
|
463
462
|
session=session,
|
|
464
463
|
project=project,
|
|
465
|
-
|
|
464
|
+
username=username,
|
|
466
465
|
profile=profile,
|
|
467
466
|
requirements=requirements,
|
|
468
467
|
instance_name=f"{spec.configuration.name}-{instance_num}",
|
|
469
468
|
instance_num=instance_num,
|
|
470
|
-
reservation=reservation,
|
|
469
|
+
reservation=spec.merged_profile.reservation,
|
|
471
470
|
blocks=spec.configuration.blocks,
|
|
472
471
|
tags=spec.configuration.tags,
|
|
473
472
|
)
|
|
@@ -655,6 +654,19 @@ def get_fleet_requirements(fleet_spec: FleetSpec) -> Requirements:
|
|
|
655
654
|
return requirements
|
|
656
655
|
|
|
657
656
|
|
|
657
|
+
def get_next_instance_num(taken_instance_nums: set[int]) -> int:
|
|
658
|
+
if not taken_instance_nums:
|
|
659
|
+
return 0
|
|
660
|
+
min_instance_num = min(taken_instance_nums)
|
|
661
|
+
if min_instance_num > 0:
|
|
662
|
+
return 0
|
|
663
|
+
instance_num = min_instance_num + 1
|
|
664
|
+
while True:
|
|
665
|
+
if instance_num not in taken_instance_nums:
|
|
666
|
+
return instance_num
|
|
667
|
+
instance_num += 1
|
|
668
|
+
|
|
669
|
+
|
|
658
670
|
async def _create_fleet(
|
|
659
671
|
session: AsyncSession,
|
|
660
672
|
project: ProjectModel,
|
|
@@ -705,12 +717,11 @@ async def _create_fleet(
|
|
|
705
717
|
fleet_model.instances.append(instances_model)
|
|
706
718
|
else:
|
|
707
719
|
for i in range(_get_fleet_nodes_to_provision(spec)):
|
|
708
|
-
instance_model =
|
|
720
|
+
instance_model = create_fleet_instance_model(
|
|
709
721
|
session=session,
|
|
710
722
|
project=project,
|
|
711
|
-
|
|
723
|
+
username=user.name,
|
|
712
724
|
spec=spec,
|
|
713
|
-
reservation=spec.configuration.reservation,
|
|
714
725
|
instance_num=i,
|
|
715
726
|
)
|
|
716
727
|
fleet_model.instances.append(instance_model)
|
|
@@ -778,7 +789,7 @@ async def _update_fleet(
|
|
|
778
789
|
if added_hosts:
|
|
779
790
|
await _check_ssh_hosts_not_yet_added(session, spec, fleet.id)
|
|
780
791
|
for host in added_hosts.values():
|
|
781
|
-
instance_num =
|
|
792
|
+
instance_num = get_next_instance_num(active_instance_nums)
|
|
782
793
|
instance_model = await create_fleet_ssh_instance_model(
|
|
783
794
|
project=project,
|
|
784
795
|
spec=spec,
|
|
@@ -994,9 +1005,9 @@ def _validate_internal_ips(ssh_config: SSHParams):
|
|
|
994
1005
|
|
|
995
1006
|
|
|
996
1007
|
def _get_fleet_nodes_to_provision(spec: FleetSpec) -> int:
|
|
997
|
-
if spec.configuration.nodes is None
|
|
1008
|
+
if spec.configuration.nodes is None:
|
|
998
1009
|
return 0
|
|
999
|
-
return spec.configuration.nodes.
|
|
1010
|
+
return spec.configuration.nodes.target
|
|
1000
1011
|
|
|
1001
1012
|
|
|
1002
1013
|
def _terminate_fleet_instances(fleet_model: FleetModel, instance_nums: Optional[List[int]]):
|
|
@@ -1013,16 +1024,3 @@ def _terminate_fleet_instances(fleet_model: FleetModel, instance_nums: Optional[
|
|
|
1013
1024
|
instance.deleted = True
|
|
1014
1025
|
else:
|
|
1015
1026
|
instance.status = InstanceStatus.TERMINATING
|
|
1016
|
-
|
|
1017
|
-
|
|
1018
|
-
def _get_next_instance_num(instance_nums: set[int]) -> int:
|
|
1019
|
-
if not instance_nums:
|
|
1020
|
-
return 0
|
|
1021
|
-
min_instance_num = min(instance_nums)
|
|
1022
|
-
if min_instance_num > 0:
|
|
1023
|
-
return 0
|
|
1024
|
-
instance_num = min_instance_num + 1
|
|
1025
|
-
while True:
|
|
1026
|
-
if instance_num not in instance_nums:
|
|
1027
|
-
return instance_num
|
|
1028
|
-
instance_num += 1
|
|
@@ -513,10 +513,10 @@ async def list_active_remote_instances(
|
|
|
513
513
|
return instance_models
|
|
514
514
|
|
|
515
515
|
|
|
516
|
-
|
|
516
|
+
def create_instance_model(
|
|
517
517
|
session: AsyncSession,
|
|
518
518
|
project: ProjectModel,
|
|
519
|
-
|
|
519
|
+
username: str,
|
|
520
520
|
profile: Profile,
|
|
521
521
|
requirements: Requirements,
|
|
522
522
|
instance_name: str,
|
|
@@ -536,7 +536,7 @@ async def create_instance_model(
|
|
|
536
536
|
instance_config = InstanceConfiguration(
|
|
537
537
|
project_name=project.name,
|
|
538
538
|
instance_name=instance_name,
|
|
539
|
-
user=
|
|
539
|
+
user=username,
|
|
540
540
|
ssh_keys=[project_ssh_key],
|
|
541
541
|
instance_id=str(instance_id),
|
|
542
542
|
reservation=reservation,
|
|
@@ -16,7 +16,7 @@ from dstack._internal.core.models.configurations import (
|
|
|
16
16
|
DEFAULT_PROBE_READY_AFTER,
|
|
17
17
|
DEFAULT_PROBE_TIMEOUT,
|
|
18
18
|
DEFAULT_PROBE_URL,
|
|
19
|
-
|
|
19
|
+
LEGACY_REPO_DIR,
|
|
20
20
|
PortMapping,
|
|
21
21
|
ProbeConfig,
|
|
22
22
|
PythonVersion,
|
|
@@ -45,6 +45,14 @@ from dstack._internal.server.services.docker import ImageConfig, get_image_confi
|
|
|
45
45
|
from dstack._internal.utils import crypto
|
|
46
46
|
from dstack._internal.utils.common import run_async
|
|
47
47
|
from dstack._internal.utils.interpolator import InterpolatorError, VariablesInterpolator
|
|
48
|
+
from dstack._internal.utils.logging import get_logger
|
|
49
|
+
from dstack._internal.utils.path import is_absolute_posix_path
|
|
50
|
+
|
|
51
|
+
logger = get_logger(__name__)
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
DSTACK_DIR = "/dstack"
|
|
55
|
+
DSTACK_PROFILE_PATH = f"{DSTACK_DIR}/profile"
|
|
48
56
|
|
|
49
57
|
|
|
50
58
|
def get_default_python_verison() -> str:
|
|
@@ -160,6 +168,7 @@ class JobConfigurator(ABC):
|
|
|
160
168
|
ssh_key=self._ssh_key(jobs_per_replica),
|
|
161
169
|
repo_data=self.run_spec.repo_data,
|
|
162
170
|
repo_code_hash=self.run_spec.repo_code_hash,
|
|
171
|
+
repo_dir=self._repo_dir(),
|
|
163
172
|
file_archives=self.run_spec.file_archives,
|
|
164
173
|
service_port=self._service_port(),
|
|
165
174
|
probes=self._probes(),
|
|
@@ -209,9 +218,17 @@ class JobConfigurator(ABC):
|
|
|
209
218
|
):
|
|
210
219
|
return []
|
|
211
220
|
return [
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
221
|
+
# `uv` may emit:
|
|
222
|
+
# > warning: `VIRTUAL_ENV=/dstack/venv` does not match the project environment path
|
|
223
|
+
# > `.venv` and will be ignored; use `--active` to target the active environment
|
|
224
|
+
# > instead
|
|
225
|
+
# Safe to ignore, reusing dstack's venv for `uv` is discouraged (it should only be
|
|
226
|
+
# used for legacy `pip`-based configurations). `--no-active` suppresses the warning.
|
|
227
|
+
# Alternatively, the user can call `deactivate` once before using `uv`.
|
|
228
|
+
# If the user really wants to reuse dstack's venv, they must specify `--active`.
|
|
229
|
+
f"uv venv -q --prompt dstack -p {self._python()} --seed {DSTACK_DIR}/venv",
|
|
230
|
+
f"echo '. {DSTACK_DIR}/venv/bin/activate' >> {DSTACK_PROFILE_PATH}",
|
|
231
|
+
f". {DSTACK_DIR}/venv/bin/activate",
|
|
215
232
|
]
|
|
216
233
|
|
|
217
234
|
def _app_specs(self) -> List[AppSpec]:
|
|
@@ -290,11 +307,34 @@ class JobConfigurator(ABC):
|
|
|
290
307
|
def _retry(self) -> Optional[Retry]:
|
|
291
308
|
return get_retry(self.run_spec.merged_profile)
|
|
292
309
|
|
|
310
|
+
def _repo_dir(self) -> str:
|
|
311
|
+
"""
|
|
312
|
+
Returns absolute or relative path
|
|
313
|
+
"""
|
|
314
|
+
repo_dir = self.run_spec.repo_dir
|
|
315
|
+
if repo_dir is None:
|
|
316
|
+
return LEGACY_REPO_DIR
|
|
317
|
+
return repo_dir
|
|
318
|
+
|
|
293
319
|
def _working_dir(self) -> Optional[str]:
|
|
294
320
|
"""
|
|
295
|
-
|
|
321
|
+
Returns path or None
|
|
322
|
+
|
|
323
|
+
None means the default working directory taken from the image
|
|
324
|
+
|
|
325
|
+
Currently, for compatibility with pre-0.19.27 runners, the path may be relative.
|
|
326
|
+
Future versions should return only absolute paths
|
|
296
327
|
"""
|
|
297
|
-
|
|
328
|
+
working_dir = self.run_spec.configuration.working_dir
|
|
329
|
+
if working_dir is None:
|
|
330
|
+
return working_dir
|
|
331
|
+
# Return a relative path if possible
|
|
332
|
+
if is_absolute_posix_path(working_dir):
|
|
333
|
+
try:
|
|
334
|
+
return str(PurePosixPath(working_dir).relative_to(LEGACY_REPO_DIR))
|
|
335
|
+
except ValueError:
|
|
336
|
+
pass
|
|
337
|
+
return working_dir
|
|
298
338
|
|
|
299
339
|
def _python(self) -> str:
|
|
300
340
|
if self.run_spec.configuration.python is not None:
|
|
@@ -9,8 +9,8 @@ from dstack._internal.server.services.jobs.configurators.extensions.cursor impor
|
|
|
9
9
|
from dstack._internal.server.services.jobs.configurators.extensions.vscode import VSCodeDesktop
|
|
10
10
|
|
|
11
11
|
INSTALL_IPYKERNEL = (
|
|
12
|
-
"(echo pip install ipykernel... && pip install -q --no-cache-dir ipykernel 2> /dev/null) || "
|
|
13
|
-
|
|
12
|
+
"(echo 'pip install ipykernel...' && pip install -q --no-cache-dir ipykernel 2> /dev/null) || "
|
|
13
|
+
"echo 'no pip, ipykernel was not installed'"
|
|
14
14
|
)
|
|
15
15
|
|
|
16
16
|
|
|
@@ -39,12 +39,12 @@ class DevEnvironmentJobConfigurator(JobConfigurator):
|
|
|
39
39
|
commands = self.ide.get_install_commands()
|
|
40
40
|
commands.append(INSTALL_IPYKERNEL)
|
|
41
41
|
commands += self.run_spec.configuration.setup
|
|
42
|
-
commands.append("echo
|
|
42
|
+
commands.append("echo")
|
|
43
43
|
commands += self.run_spec.configuration.init
|
|
44
44
|
commands += self.ide.get_print_readme_commands()
|
|
45
45
|
commands += [
|
|
46
46
|
f"echo 'To connect via SSH, use: `ssh {self.run_spec.run_name}`'",
|
|
47
|
-
"echo
|
|
47
|
+
"echo",
|
|
48
48
|
"echo -n 'To exit, press Ctrl+C.'",
|
|
49
49
|
]
|
|
50
50
|
commands += ["tail -f /dev/null"] # idle
|
|
@@ -1,7 +1,5 @@
|
|
|
1
1
|
from typing import List, Optional
|
|
2
2
|
|
|
3
|
-
from dstack._internal.core.models.configurations import DEFAULT_REPO_DIR
|
|
4
|
-
|
|
5
3
|
|
|
6
4
|
class CursorDesktop:
|
|
7
5
|
def __init__(
|
|
@@ -38,7 +36,7 @@ class CursorDesktop:
|
|
|
38
36
|
def get_print_readme_commands(self) -> List[str]:
|
|
39
37
|
return [
|
|
40
38
|
"echo To open in Cursor, use link below:",
|
|
41
|
-
"echo
|
|
42
|
-
f
|
|
43
|
-
"echo
|
|
39
|
+
"echo",
|
|
40
|
+
f'echo " cursor://vscode-remote/ssh-remote+{self.run_name}$DSTACK_REPO_DIR"',
|
|
41
|
+
"echo",
|
|
44
42
|
]
|
|
@@ -1,7 +1,5 @@
|
|
|
1
1
|
from typing import List, Optional
|
|
2
2
|
|
|
3
|
-
from dstack._internal.core.models.configurations import DEFAULT_REPO_DIR
|
|
4
|
-
|
|
5
3
|
|
|
6
4
|
class VSCodeDesktop:
|
|
7
5
|
def __init__(
|
|
@@ -37,8 +35,8 @@ class VSCodeDesktop:
|
|
|
37
35
|
|
|
38
36
|
def get_print_readme_commands(self) -> List[str]:
|
|
39
37
|
return [
|
|
40
|
-
"echo To open in VS Code Desktop, use link below:",
|
|
41
|
-
"echo
|
|
42
|
-
f
|
|
43
|
-
"echo
|
|
38
|
+
"echo 'To open in VS Code Desktop, use link below:'",
|
|
39
|
+
"echo",
|
|
40
|
+
f'echo " vscode://vscode-remote/ssh-remote+{self.run_name}$DSTACK_REPO_DIR"',
|
|
41
|
+
"echo",
|
|
44
42
|
]
|
|
@@ -37,6 +37,3 @@ class TaskJobConfigurator(JobConfigurator):
|
|
|
37
37
|
def _ports(self) -> List[PortMapping]:
|
|
38
38
|
assert self.run_spec.configuration.type == "task"
|
|
39
39
|
return self.run_spec.configuration.ports
|
|
40
|
-
|
|
41
|
-
def _working_dir(self) -> Optional[str]:
|
|
42
|
-
return None if not self._shell_commands() else super()._working_dir()
|