dstack 0.19.26__py3-none-any.whl → 0.19.27__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dstack might be problematic.
- dstack/_internal/cli/commands/init.py +2 -2
- dstack/_internal/cli/services/configurators/run.py +114 -16
- dstack/_internal/cli/services/repos.py +1 -18
- dstack/_internal/core/backends/amddevcloud/__init__.py +1 -0
- dstack/_internal/core/backends/amddevcloud/backend.py +16 -0
- dstack/_internal/core/backends/amddevcloud/compute.py +5 -0
- dstack/_internal/core/backends/amddevcloud/configurator.py +29 -0
- dstack/_internal/core/backends/aws/compute.py +6 -1
- dstack/_internal/core/backends/base/compute.py +33 -5
- dstack/_internal/core/backends/base/offers.py +2 -0
- dstack/_internal/core/backends/configurators.py +15 -0
- dstack/_internal/core/backends/digitalocean/__init__.py +1 -0
- dstack/_internal/core/backends/digitalocean/backend.py +16 -0
- dstack/_internal/core/backends/digitalocean/compute.py +5 -0
- dstack/_internal/core/backends/digitalocean/configurator.py +31 -0
- dstack/_internal/core/backends/digitalocean_base/__init__.py +1 -0
- dstack/_internal/core/backends/digitalocean_base/api_client.py +104 -0
- dstack/_internal/core/backends/digitalocean_base/backend.py +5 -0
- dstack/_internal/core/backends/digitalocean_base/compute.py +173 -0
- dstack/_internal/core/backends/digitalocean_base/configurator.py +57 -0
- dstack/_internal/core/backends/digitalocean_base/models.py +43 -0
- dstack/_internal/core/backends/gcp/compute.py +32 -8
- dstack/_internal/core/backends/hotaisle/api_client.py +25 -33
- dstack/_internal/core/backends/hotaisle/compute.py +1 -6
- dstack/_internal/core/backends/models.py +7 -0
- dstack/_internal/core/backends/nebius/compute.py +0 -7
- dstack/_internal/core/backends/oci/compute.py +4 -5
- dstack/_internal/core/backends/vultr/compute.py +1 -5
- dstack/_internal/core/compatibility/fleets.py +5 -0
- dstack/_internal/core/compatibility/runs.py +8 -1
- dstack/_internal/core/models/backends/base.py +5 -1
- dstack/_internal/core/models/configurations.py +21 -7
- dstack/_internal/core/models/files.py +1 -1
- dstack/_internal/core/models/fleets.py +75 -2
- dstack/_internal/core/models/runs.py +24 -5
- dstack/_internal/core/services/repos.py +85 -80
- dstack/_internal/server/background/tasks/process_fleets.py +109 -13
- dstack/_internal/server/background/tasks/process_instances.py +12 -71
- dstack/_internal/server/background/tasks/process_running_jobs.py +2 -0
- dstack/_internal/server/background/tasks/process_runs.py +2 -0
- dstack/_internal/server/background/tasks/process_submitted_jobs.py +18 -6
- dstack/_internal/server/migrations/versions/2498ab323443_add_fleetmodel_consolidation_attempt_.py +44 -0
- dstack/_internal/server/models.py +5 -2
- dstack/_internal/server/schemas/runner.py +1 -0
- dstack/_internal/server/services/fleets.py +23 -25
- dstack/_internal/server/services/instances.py +3 -3
- dstack/_internal/server/services/jobs/configurators/base.py +46 -6
- dstack/_internal/server/services/jobs/configurators/dev.py +4 -4
- dstack/_internal/server/services/jobs/configurators/extensions/cursor.py +3 -5
- dstack/_internal/server/services/jobs/configurators/extensions/vscode.py +4 -6
- dstack/_internal/server/services/jobs/configurators/service.py +0 -3
- dstack/_internal/server/services/jobs/configurators/task.py +0 -3
- dstack/_internal/server/services/runs.py +16 -0
- dstack/_internal/server/statics/index.html +1 -1
- dstack/_internal/server/statics/{main-d151b300fcac3933213d.js → main-4eecc75fbe64067eb1bc.js} +1146 -899
- dstack/_internal/server/statics/{main-d151b300fcac3933213d.js.map → main-4eecc75fbe64067eb1bc.js.map} +1 -1
- dstack/_internal/server/statics/{main-aec4762350e34d6fbff9.css → main-56191c63d516fd0041c4.css} +1 -1
- dstack/_internal/server/testing/common.py +6 -3
- dstack/_internal/utils/path.py +8 -1
- dstack/_internal/utils/ssh.py +7 -0
- dstack/api/_public/repos.py +41 -6
- dstack/api/_public/runs.py +14 -1
- dstack/version.py +1 -1
- {dstack-0.19.26.dist-info → dstack-0.19.27.dist-info}/METADATA +2 -2
- {dstack-0.19.26.dist-info → dstack-0.19.27.dist-info}/RECORD +68 -53
- {dstack-0.19.26.dist-info → dstack-0.19.27.dist-info}/WHEEL +0 -0
- {dstack-0.19.26.dist-info → dstack-0.19.27.dist-info}/entry_points.txt +0 -0
- {dstack-0.19.26.dist-info → dstack-0.19.27.dist-info}/licenses/LICENSE.md +0 -0
@@ -53,14 +53,12 @@ from dstack._internal.core.models.placement import (
     PlacementStrategy,
 )
 from dstack._internal.core.models.profiles import (
-    RetryEvent,
     TerminationPolicy,
 )
 from dstack._internal.core.models.runs import (
     JobProvisioningData,
     Retry,
 )
-from dstack._internal.core.services.profiles import get_retry
 from dstack._internal.server import settings as server_settings
 from dstack._internal.server.background.tasks.common import get_provisioning_timeout
 from dstack._internal.server.db import get_db, get_session_ctx
@@ -327,7 +325,6 @@ async def _add_remote(instance: InstanceModel) -> None:
             e,
         )
         instance.status = InstanceStatus.PENDING
-        instance.last_retry_at = get_current_datetime()
         return

     instance_type = host_info_to_instance_type(host_info, cpu_arch)
@@ -426,7 +423,6 @@ async def _add_remote(instance: InstanceModel) -> None:
     instance.offer = instance_offer.json()
     instance.job_provisioning_data = jpd.json()
     instance.started_at = get_current_datetime()
-    instance.last_retry_at = get_current_datetime()


 def _deploy_instance(
@@ -493,29 +489,6 @@ def _deploy_instance(


 async def _create_instance(session: AsyncSession, instance: InstanceModel) -> None:
-    if instance.last_retry_at is not None:
-        last_retry = instance.last_retry_at
-        if get_current_datetime() < last_retry + timedelta(minutes=1):
-            return
-
-    if (
-        instance.profile is None
-        or instance.requirements is None
-        or instance.instance_configuration is None
-    ):
-        instance.status = InstanceStatus.TERMINATED
-        instance.termination_reason = "Empty profile, requirements or instance_configuration"
-        instance.last_retry_at = get_current_datetime()
-        logger.warning(
-            "Empty profile, requirements or instance_configuration. Terminate instance: %s",
-            instance.name,
-            extra={
-                "instance_name": instance.name,
-                "instance_status": InstanceStatus.TERMINATED.value,
-            },
-        )
-        return
-
     if _need_to_wait_fleet_provisioning(instance):
         logger.debug("Waiting for the first instance in the fleet to be provisioned")
         return
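The deleted guard was a simple one-minute throttle between per-instance provisioning attempts, apparently dropped in favor of fleet-level scheduling (see `process_fleets` and the new `FleetModel.consolidation_attempt` column below). For reference, the dropped check amounted to this standalone predicate (extracted here for illustration, not part of the release):

```python
# The throttle removed above, as a standalone predicate.
from datetime import datetime, timedelta
from typing import Optional

def should_skip_attempt(last_retry_at: Optional[datetime], now: datetime) -> bool:
    """True if less than a minute has passed since the previous attempt."""
    return last_retry_at is not None and now < last_retry_at + timedelta(minutes=1)
```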
@@ -529,7 +502,6 @@ async def _create_instance(session: AsyncSession, instance: InstanceModel) -> None:
         instance.termination_reason = (
             f"Error to parse profile, requirements or instance_configuration: {e}"
         )
-        instance.last_retry_at = get_current_datetime()
         logger.warning(
             "Error to parse profile, requirements or instance_configuration. Terminate instance: %s",
             instance.name,
@@ -540,24 +512,6 @@ async def _create_instance(session: AsyncSession, instance: InstanceModel) -> None:
         )
         return

-    retry = get_retry(profile)
-    should_retry = retry is not None and RetryEvent.NO_CAPACITY in retry.on_events
-
-    if retry is not None:
-        retry_duration_deadline = _get_retry_duration_deadline(instance, retry)
-        if get_current_datetime() > retry_duration_deadline:
-            instance.status = InstanceStatus.TERMINATED
-            instance.termination_reason = "Retry duration expired"
-            logger.warning(
-                "Retry duration expired. Terminating instance %s",
-                instance.name,
-                extra={
-                    "instance_name": instance.name,
-                    "instance_status": InstanceStatus.TERMINATED.value,
-                },
-            )
-            return
-
     placement_group_models = []
     placement_group_model = None
     if instance.fleet_id:
@@ -595,15 +549,6 @@ async def _create_instance(session: AsyncSession, instance: InstanceModel) -> None:
         exclude_not_available=True,
     )

-    if not offers and should_retry:
-        instance.last_retry_at = get_current_datetime()
-        logger.debug(
-            "No offers for instance %s. Next retry",
-            instance.name,
-            extra={"instance_name": instance.name},
-        )
-        return
-
     # Limit number of offers tried to prevent long-running processing
     # in case all offers fail.
     for backend, instance_offer in offers[: server_settings.MAX_OFFERS_TRIED]:
@@ -681,7 +626,6 @@ async def _create_instance(session: AsyncSession, instance: InstanceModel) -> None:
             instance.offer = instance_offer.json()
             instance.total_blocks = instance_offer.total_blocks
             instance.started_at = get_current_datetime()
-            instance.last_retry_at = get_current_datetime()

             logger.info(
                 "Created instance %s",
@@ -702,21 +646,18 @@ async def _create_instance(session: AsyncSession, instance: InstanceModel) -> None:
             )
             return

-    instance…
-    …
-        if sibling_instance.id == instance.id:
-            continue
-        _mark_terminated(sibling_instance, "Master instance failed to start")
+    _mark_terminated(instance, "All offers failed" if offers else "No offers found")
+    if (
+        instance.fleet
+        and _is_fleet_master_instance(instance)
+        and _is_cloud_cluster(instance.fleet)
+    ):
+        # Do not attempt to deploy other instances, as they won't determine the correct cluster
+        # backend, region, and placement group without a successfully deployed master instance
+        for sibling_instance in instance.fleet.instances:
+            if sibling_instance.id == instance.id:
+                continue
+            _mark_terminated(sibling_instance, "Master instance failed to start")


 def _mark_terminated(instance: InstanceModel, termination_reason: str) -> None:
@@ -41,6 +41,7 @@ from dstack._internal.core.models.volumes import InstanceMountPoint, Volume, Vol…
 from dstack._internal.server.background.tasks.common import get_provisioning_timeout
 from dstack._internal.server.db import get_db, get_session_ctx
 from dstack._internal.server.models import (
+    FleetModel,
     InstanceModel,
     JobModel,
     ProbeModel,
@@ -151,6 +152,7 @@ async def _process_running_job(session: AsyncSession, job_model: JobModel):
         .options(joinedload(RunModel.project))
         .options(joinedload(RunModel.user))
         .options(joinedload(RunModel.repo))
+        .options(joinedload(RunModel.fleet).load_only(FleetModel.id, FleetModel.name))
         .options(joinedload(RunModel.jobs))
     )
     run_model = res.unique().scalar_one()
@@ -21,6 +21,7 @@ from dstack._internal.core.models.runs import (
 )
 from dstack._internal.server.db import get_db, get_session_ctx
 from dstack._internal.server.models import (
+    FleetModel,
     InstanceModel,
     JobModel,
     ProjectModel,
@@ -145,6 +146,7 @@ async def _process_run(session: AsyncSession, run_model: RunModel):
         .execution_options(populate_existing=True)
         .options(joinedload(RunModel.project).load_only(ProjectModel.id, ProjectModel.name))
         .options(joinedload(RunModel.user).load_only(UserModel.name))
+        .options(joinedload(RunModel.fleet).load_only(FleetModel.id, FleetModel.name))
         .options(
             selectinload(RunModel.jobs)
             .joinedload(JobModel.instance)
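Both background tasks now eager-load the run's fleet restricted to its `id` and `name` columns. A minimal, self-contained sketch of that SQLAlchemy 2.0 pattern with toy models (not dstack's real ones):

```python
# Toy models illustrating joinedload(...).load_only(...): the related row is
# fetched in the same SELECT via a LEFT OUTER JOIN, but only the listed
# columns of the joined entity are loaded.
from sqlalchemy import ForeignKey, create_engine, select
from sqlalchemy.orm import (
    DeclarativeBase,
    Mapped,
    Session,
    joinedload,
    mapped_column,
    relationship,
)

class Base(DeclarativeBase):
    pass

class Fleet(Base):
    __tablename__ = "fleets"
    id: Mapped[int] = mapped_column(primary_key=True)
    name: Mapped[str]
    spec: Mapped[str]  # a wide column we don't want to fetch

class Run(Base):
    __tablename__ = "runs"
    id: Mapped[int] = mapped_column(primary_key=True)
    fleet_id: Mapped[int] = mapped_column(ForeignKey("fleets.id"))
    fleet: Mapped[Fleet] = relationship()

engine = create_engine("sqlite://")
Base.metadata.create_all(engine)
with Session(engine) as session:
    session.add(Run(id=1, fleet=Fleet(id=1, name="f1", spec="...")))
    session.commit()
    # One SELECT; only fleets.id and fleets.name are loaded eagerly,
    # fleets.spec stays deferred.
    run = session.execute(
        select(Run).options(joinedload(Run.fleet).load_only(Fleet.id, Fleet.name))
    ).scalar_one()
    print(run.fleet.name)  # "f1", no extra query
```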
@@ -5,7 +5,7 @@ import uuid
 from datetime import datetime, timedelta
 from typing import List, Optional, Tuple

-from sqlalchemy import and_, …
+from sqlalchemy import and_, not_, or_, select
 from sqlalchemy.ext.asyncio import AsyncSession
 from sqlalchemy.orm import contains_eager, joinedload, load_only, noload, selectinload
@@ -16,6 +16,7 @@ from dstack._internal.core.models.common import NetworkMode
 from dstack._internal.core.models.fleets import (
     Fleet,
     FleetConfiguration,
+    FleetNodesSpec,
     FleetSpec,
     FleetStatus,
     InstanceGroupPlacement,
@@ -26,7 +27,7 @@ from dstack._internal.core.models.profiles import (
     CreationPolicy,
     TerminationPolicy,
 )
-from dstack._internal.core.models.resources import Memory…
+from dstack._internal.core.models.resources import Memory
 from dstack._internal.core.models.runs import (
     Job,
     JobProvisioningData,
@@ -54,6 +55,7 @@ from dstack._internal.server.services.backends import get_project_backend_by_type
 from dstack._internal.server.services.fleets import (
     fleet_model_to_fleet,
     get_fleet_requirements,
+    get_next_instance_num,
 )
 from dstack._internal.server.services.instances import (
     filter_pool_instances,
@@ -384,6 +386,8 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
             instance_num=instance_num,
         )
         job_model.job_runtime_data = _prepare_job_runtime_data(offer).json()
+        # Both this task and process_fleets can add instances to fleets.
+        # TODO: Ensure this does not violate nodes.max when it's enforced.
         instance.fleet_id = fleet_model.id
         logger.info(
             "The job %s created the new instance %s",
@@ -755,12 +759,17 @@ def _create_fleet_model_for_job(
     placement = InstanceGroupPlacement.ANY
     if run.run_spec.configuration.type == "task" and run.run_spec.configuration.nodes > 1:
         placement = InstanceGroupPlacement.CLUSTER
+    nodes = _get_nodes_required_num_for_run(run.run_spec)
     spec = FleetSpec(
         configuration=FleetConfiguration(
             name=run.run_spec.run_name,
             placement=placement,
             reservation=run.run_spec.configuration.reservation,
-            nodes=…
+            nodes=FleetNodesSpec(
+                min=nodes,
+                target=nodes,
+                max=None,
+            ),
         ),
         profile=run.run_spec.merged_profile,
         autocreated=True,
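Run-created fleets now carry an explicit nodes spec instead of a bare count. A hedged sketch of the shape implied by the hunk above (the field names `min`/`target`/`max` are taken from the diff; the stand-in model below is an assumption, not dstack's actual `FleetNodesSpec`):

```python
# Hypothetical stand-in for dstack._internal.core.models.fleets.FleetNodesSpec,
# mirroring only the three fields visible in the diff above.
from typing import Optional
from pydantic import BaseModel

class FleetNodesSpec(BaseModel):
    min: int
    target: int
    max: Optional[int] = None

# A run that needs 4 nodes pins min == target == 4 and leaves max open,
# matching the FleetNodesSpec(min=nodes, target=nodes, max=None) call above.
spec = FleetNodesSpec(min=4, target=4, max=None)
assert spec.target == 4 and spec.max is None
```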
@@ -778,10 +787,13 @@ def _create_fleet_model_for_job(

 async def _get_next_instance_num(session: AsyncSession, fleet_model: FleetModel) -> int:
     res = await session.execute(
-        select(…
+        select(InstanceModel.instance_num).where(
+            InstanceModel.fleet_id == fleet_model.id,
+            InstanceModel.deleted.is_(False),
+        )
     )
-    …
-    return …
+    taken_instance_nums = set(res.scalars().all())
+    return get_next_instance_num(taken_instance_nums)


 def _create_instance_model_for_job(
dstack/_internal/server/migrations/versions/2498ab323443_add_fleetmodel_consolidation_attempt_.py ADDED
@@ -0,0 +1,44 @@
+"""Add FleetModel.consolidation_attempt and FleetModel.last_consolidated_at
+
+Revision ID: 2498ab323443
+Revises: e2d08cd1b8d9
+Create Date: 2025-08-29 16:08:48.686595
+
+"""
+
+import sqlalchemy as sa
+from alembic import op
+
+import dstack._internal.server.models
+
+# revision identifiers, used by Alembic.
+revision = "2498ab323443"
+down_revision = "e2d08cd1b8d9"
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    with op.batch_alter_table("fleets", schema=None) as batch_op:
+        batch_op.add_column(
+            sa.Column("consolidation_attempt", sa.Integer(), server_default="0", nullable=False)
+        )
+        batch_op.add_column(
+            sa.Column(
+                "last_consolidated_at",
+                dstack._internal.server.models.NaiveDateTime(),
+                nullable=True,
+            )
+        )
+
+    # ### end Alembic commands ###
+
+
+def downgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    with op.batch_alter_table("fleets", schema=None) as batch_op:
+        batch_op.drop_column("last_consolidated_at")
+        batch_op.drop_column("consolidation_attempt")
+
+    # ### end Alembic commands ###
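The migration uses `op.batch_alter_table`, which rewrites the table on backends with limited ALTER TABLE support (notably SQLite) and degrades to plain ALTERs elsewhere. A sketch of applying it programmatically (the `alembic.ini` path is an assumption for illustration; dstack normally runs its migrations on server startup):

```python
# Hypothetical invocation; the config path is an assumption.
from alembic import command
from alembic.config import Config

cfg = Config("alembic.ini")           # points at the migrations environment
command.upgrade(cfg, "2498ab323443")  # apply up to this revision
# command.downgrade(cfg, "e2d08cd1b8d9") would drop the two columns again
```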
@@ -551,6 +551,9 @@ class FleetModel(BaseModel):
     jobs: Mapped[List["JobModel"]] = relationship(back_populates="fleet")
     instances: Mapped[List["InstanceModel"]] = relationship(back_populates="fleet")

+    consolidation_attempt: Mapped[int] = mapped_column(Integer, server_default="0")
+    last_consolidated_at: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime)
+

 class InstanceModel(BaseModel):
     __tablename__ = "instances"
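The new columns pair with the reworked `process_fleets` task (+109 −13 in the file list): `consolidation_attempt` can drive a backoff between consolidation passes, and `last_consolidated_at` records when the fleet last matched its spec. A hedged sketch of that pattern (the delay schedule and helper below are assumptions, not dstack's actual logic):

```python
# Illustrative backoff gate, assuming exponential delays between consolidation
# attempts; dstack's real schedule in process_fleets may differ.
from datetime import datetime, timedelta, timezone
from typing import Optional

def should_attempt_consolidation(
    consolidation_attempt: int,
    last_consolidated_at: Optional[datetime],
    now: datetime,
) -> bool:
    if last_consolidated_at is None or consolidation_attempt == 0:
        return True  # never consolidated or just reset - try immediately
    delay = timedelta(minutes=min(2 ** (consolidation_attempt - 1), 60))
    return now >= last_consolidated_at + delay

now = datetime.now(timezone.utc)
assert should_attempt_consolidation(0, None, now)
assert not should_attempt_consolidation(3, now, now)
```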
@@ -605,8 +608,8 @@ class InstanceModel(BaseModel):
         Integer, default=DEFAULT_FLEET_TERMINATION_IDLE_TIME
     )

-    # …
-    last_retry_at: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime)
+    # Deprecated
+    last_retry_at: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime, deferred=True)

     # instance termination handling
     termination_deadline: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime)
@@ -449,25 +449,24 @@ async def create_fleet(
     return await _create_fleet(session=session, project=project, user=user, spec=spec)


-…
+def create_fleet_instance_model(
     session: AsyncSession,
     project: ProjectModel,
-    …
+    username: str,
     spec: FleetSpec,
-    reservation: Optional[str],
     instance_num: int,
 ) -> InstanceModel:
     profile = spec.merged_profile
     requirements = get_fleet_requirements(spec)
-    instance_model = …
+    instance_model = instances_services.create_instance_model(
         session=session,
         project=project,
-        …
+        username=username,
         profile=profile,
         requirements=requirements,
         instance_name=f"{spec.configuration.name}-{instance_num}",
         instance_num=instance_num,
-        reservation=reservation,
+        reservation=spec.merged_profile.reservation,
         blocks=spec.configuration.blocks,
         tags=spec.configuration.tags,
     )
@@ -655,6 +654,19 @@ def get_fleet_requirements(fleet_spec: FleetSpec) -> Requirements:
     return requirements


+def get_next_instance_num(taken_instance_nums: set[int]) -> int:
+    if not taken_instance_nums:
+        return 0
+    min_instance_num = min(taken_instance_nums)
+    if min_instance_num > 0:
+        return 0
+    instance_num = min_instance_num + 1
+    while True:
+        if instance_num not in taken_instance_nums:
+            return instance_num
+        instance_num += 1
+
+
 async def _create_fleet(
     session: AsyncSession,
     project: ProjectModel,
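`get_next_instance_num` is the former private `_get_next_instance_num` helper (removed further down), now exported so `process_submitted_jobs` can reuse it. It fills gaps left by deleted instances rather than always appending past the maximum. A quick check of its behavior, copied against the definition above:

```python
# Mirrors the definition above; the asserts document the gap-filling behavior.
def get_next_instance_num(taken_instance_nums: set[int]) -> int:
    if not taken_instance_nums:
        return 0
    min_instance_num = min(taken_instance_nums)
    if min_instance_num > 0:
        return 0
    instance_num = min_instance_num + 1
    while True:
        if instance_num not in taken_instance_nums:
            return instance_num
        instance_num += 1

assert get_next_instance_num(set()) == 0      # empty fleet starts at 0
assert get_next_instance_num({1, 2}) == 0     # 0 is free, reuse it
assert get_next_instance_num({0, 2}) == 1     # fill the first gap
assert get_next_instance_num({0, 1, 2}) == 3  # no gaps, append
```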
@@ -705,12 +717,11 @@ async def _create_fleet(
             fleet_model.instances.append(instances_model)
     else:
         for i in range(_get_fleet_nodes_to_provision(spec)):
-            instance_model = …
+            instance_model = create_fleet_instance_model(
                 session=session,
                 project=project,
-                …
+                username=user.name,
                 spec=spec,
-                reservation=spec.configuration.reservation,
                 instance_num=i,
             )
             fleet_model.instances.append(instance_model)
@@ -778,7 +789,7 @@ async def _update_fleet(
     if added_hosts:
         await _check_ssh_hosts_not_yet_added(session, spec, fleet.id)
         for host in added_hosts.values():
-            instance_num = …
+            instance_num = get_next_instance_num(active_instance_nums)
             instance_model = await create_fleet_ssh_instance_model(
                 project=project,
                 spec=spec,
@@ -994,9 +1005,9 @@ def _validate_internal_ips(ssh_config: SSHParams):


 def _get_fleet_nodes_to_provision(spec: FleetSpec) -> int:
-    if spec.configuration.nodes is None…
+    if spec.configuration.nodes is None:
         return 0
-    return spec.configuration.nodes.…
+    return spec.configuration.nodes.target


 def _terminate_fleet_instances(fleet_model: FleetModel, instance_nums: Optional[List[int]]):
@@ -1013,16 +1024,3 @@ def _terminate_fleet_instances(fleet_model: FleetModel, instance_nums: Optional[List[int]]):
             instance.deleted = True
         else:
             instance.status = InstanceStatus.TERMINATING
-
-
-def _get_next_instance_num(instance_nums: set[int]) -> int:
-    if not instance_nums:
-        return 0
-    min_instance_num = min(instance_nums)
-    if min_instance_num > 0:
-        return 0
-    instance_num = min_instance_num + 1
-    while True:
-        if instance_num not in instance_nums:
-            return instance_num
-        instance_num += 1
@@ -513,10 +513,10 @@ async def list_active_remote_instances(
     return instance_models


-…
+def create_instance_model(
     session: AsyncSession,
     project: ProjectModel,
-    …
+    username: str,
     profile: Profile,
     requirements: Requirements,
     instance_name: str,
@@ -536,7 +536,7 @@ async def create_instance_model(
     instance_config = InstanceConfiguration(
         project_name=project.name,
         instance_name=instance_name,
-        user=…
+        user=username,
         ssh_keys=[project_ssh_key],
         instance_id=str(instance_id),
         reservation=reservation,
@@ -16,7 +16,7 @@ from dstack._internal.core.models.configurations import (
     DEFAULT_PROBE_READY_AFTER,
     DEFAULT_PROBE_TIMEOUT,
     DEFAULT_PROBE_URL,
-    DEFAULT_REPO_DIR,
+    LEGACY_REPO_DIR,
     PortMapping,
     ProbeConfig,
     PythonVersion,
@@ -45,6 +45,14 @@ from dstack._internal.server.services.docker import ImageConfig, get_image_config
 from dstack._internal.utils import crypto
 from dstack._internal.utils.common import run_async
 from dstack._internal.utils.interpolator import InterpolatorError, VariablesInterpolator
+from dstack._internal.utils.logging import get_logger
+from dstack._internal.utils.path import is_absolute_posix_path
+
+logger = get_logger(__name__)
+
+
+DSTACK_DIR = "/dstack"
+DSTACK_PROFILE_PATH = f"{DSTACK_DIR}/profile"


 def get_default_python_verison() -> str:
@@ -160,6 +168,7 @@ class JobConfigurator(ABC):
             ssh_key=self._ssh_key(jobs_per_replica),
             repo_data=self.run_spec.repo_data,
             repo_code_hash=self.run_spec.repo_code_hash,
+            repo_dir=self._repo_dir(),
             file_archives=self.run_spec.file_archives,
             service_port=self._service_port(),
             probes=self._probes(),
@@ -209,9 +218,17 @@ class JobConfigurator(ABC):
         ):
             return []
         return [
-            …
-            …
-            …
+            # `uv` may emit:
+            # > warning: `VIRTUAL_ENV=/dstack/venv` does not match the project environment path
+            # > `.venv` and will be ignored; use `--active` to target the active environment
+            # > instead
+            # Safe to ignore, reusing dstack's venv for `uv` is discouraged (it should only be
+            # used for legacy `pip`-based configurations). `--no-active` suppresses the warning.
+            # Alternatively, the user can call `deactivate` once before using `uv`.
+            # If the user really wants to reuse dstack's venv, they must spefify `--active`.
+            f"uv venv -q --prompt dstack -p {self._python()} --seed {DSTACK_DIR}/venv",
+            f"echo '. {DSTACK_DIR}/venv/bin/activate' >> {DSTACK_PROFILE_PATH}",
+            f". {DSTACK_DIR}/venv/bin/activate",
         ]

     def _app_specs(self) -> List[AppSpec]:
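The generated setup now bootstraps the job environment with `uv` instead of the previous commands (lost to truncation above). A minimal reproduction of the same bootstrap outside dstack; the Python version and `/dstack` prefix are hardcoded stand-ins for `self._python()` and `DSTACK_DIR`:

```python
# Hypothetical helper mirroring the command list above with stand-in values.
def uv_bootstrap_commands(python_version: str = "3.12") -> list[str]:
    dstack_dir = "/dstack"
    profile_path = f"{dstack_dir}/profile"
    return [
        # create a seeded venv (pip preinstalled) owned by dstack
        f"uv venv -q --prompt dstack -p {python_version} --seed {dstack_dir}/venv",
        # make login shells activate it via the profile file
        f"echo '. {dstack_dir}/venv/bin/activate' >> {profile_path}",
        # and activate it for the current setup shell too
        f". {dstack_dir}/venv/bin/activate",
    ]

print("\n".join(uv_bootstrap_commands()))
```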
@@ -290,11 +307,34 @@ class JobConfigurator(ABC):
     def _retry(self) -> Optional[Retry]:
         return get_retry(self.run_spec.merged_profile)

+    def _repo_dir(self) -> str:
+        """
+        Returns absolute or relative path
+        """
+        repo_dir = self.run_spec.repo_dir
+        if repo_dir is None:
+            return LEGACY_REPO_DIR
+        return repo_dir
+
     def _working_dir(self) -> Optional[str]:
         """
-        …
+        Returns path or None
+
+        None means the default working directory taken from the image
+
+        Currently, for compatibility with pre-0.19.27 runners, the path may be relative.
+        Future versions should return only absolute paths
         """
-        …
+        working_dir = self.run_spec.configuration.working_dir
+        if working_dir is None:
+            return working_dir
+        # Return a relative path if possible
+        if is_absolute_posix_path(working_dir):
+            try:
+                return str(PurePosixPath(working_dir).relative_to(LEGACY_REPO_DIR))
+            except ValueError:
+                pass
+        return working_dir

     def _python(self) -> str:
         if self.run_spec.configuration.python is not None:
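The relative-path fallback relies on `PurePosixPath.relative_to`, which raises `ValueError` when the working directory is not under the legacy repo dir, so `_working_dir` keeps the absolute path in that case. A quick illustration, assuming `LEGACY_REPO_DIR` is an absolute path such as "/workflow" (its real value lives in `dstack._internal.core.models.configurations`):

```python
from pathlib import PurePosixPath

LEGACY_REPO_DIR = "/workflow"  # assumed value for illustration

# Inside the legacy repo dir: converted to a runner-compatible relative path.
assert str(PurePosixPath("/workflow/src").relative_to(LEGACY_REPO_DIR)) == "src"

# Outside it: relative_to raises, so the absolute path is returned unchanged.
try:
    PurePosixPath("/data/out").relative_to(LEGACY_REPO_DIR)
except ValueError:
    print("not under the repo dir - kept absolute")
```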
@@ -9,8 +9,8 @@ from dstack._internal.server.services.jobs.configurators.extensions.cursor import CursorDesktop
 from dstack._internal.server.services.jobs.configurators.extensions.vscode import VSCodeDesktop

 INSTALL_IPYKERNEL = (
-    "(echo pip install ipykernel... && pip install -q --no-cache-dir ipykernel 2> /dev/null) || "
-    …
+    "(echo 'pip install ipykernel...' && pip install -q --no-cache-dir ipykernel 2> /dev/null) || "
+    "echo 'no pip, ipykernel was not installed'"
 )
@@ -39,12 +39,12 @@ class DevEnvironmentJobConfigurator(JobConfigurator):
         commands = self.ide.get_install_commands()
         commands.append(INSTALL_IPYKERNEL)
         commands += self.run_spec.configuration.setup
-        commands.append("echo…
+        commands.append("echo")
         commands += self.run_spec.configuration.init
         commands += self.ide.get_print_readme_commands()
         commands += [
             f"echo 'To connect via SSH, use: `ssh {self.run_spec.run_name}`'",
-            "echo…
+            "echo",
             "echo -n 'To exit, press Ctrl+C.'",
         ]
         commands += ["tail -f /dev/null"]  # idle
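Assembled in order, the dev-environment setup now emits a bare `echo` as a blank separator line and single-quotes its messages. A sketch of the final command list for a run named `my-dev` (the bracketed placeholders stand in for the IDE and user commands, which come from `VSCodeDesktop`/`CursorDesktop` and the run configuration):

```python
# Illustrative assembly mirroring the method above with stand-in values.
run_name = "my-dev"
commands = ["<ide install commands>"]
commands.append(
    "(echo 'pip install ipykernel...' && pip install -q --no-cache-dir ipykernel 2> /dev/null) || "
    "echo 'no pip, ipykernel was not installed'"
)
commands += ["<user setup>"]
commands.append("echo")  # blank separator line in the run logs
commands += ["<user init>"]
commands += ["<ide readme commands>"]
commands += [
    f"echo 'To connect via SSH, use: `ssh {run_name}`'",
    "echo",
    "echo -n 'To exit, press Ctrl+C.'",
]
commands += ["tail -f /dev/null"]  # keep the job alive
```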
@@ -1,7 +1,5 @@
 from typing import List, Optional

-from dstack._internal.core.models.configurations import DEFAULT_REPO_DIR
-

 class CursorDesktop:
     def __init__(
@@ -38,7 +36,7 @@ class CursorDesktop:
     def get_print_readme_commands(self) -> List[str]:
         return [
             "echo To open in Cursor, use link below:",
-            "echo…
-            f…
-            "echo…
+            "echo",
+            f'echo " cursor://vscode-remote/ssh-remote+{self.run_name}$DSTACK_REPO_DIR"',
+            "echo",
         ]
@@ -1,7 +1,5 @@
 from typing import List, Optional

-from dstack._internal.core.models.configurations import DEFAULT_REPO_DIR
-

 class VSCodeDesktop:
     def __init__(
@@ -37,8 +35,8 @@ class VSCodeDesktop:

     def get_print_readme_commands(self) -> List[str]:
         return [
-            "echo To open in VS Code Desktop, use link below:",
-            "echo…
-            f…
-            "echo…
+            "echo 'To open in VS Code Desktop, use link below:'",
+            "echo",
+            f'echo " vscode://vscode-remote/ssh-remote+{self.run_name}$DSTACK_REPO_DIR"',
+            "echo",
         ]
@@ -37,6 +37,3 @@ class TaskJobConfigurator(JobConfigurator):
     def _ports(self) -> List[PortMapping]:
         assert self.run_spec.configuration.type == "task"
         return self.run_spec.configuration.ports
-
-    def _working_dir(self) -> Optional[str]:
-        return None if not self._shell_commands() else super()._working_dir()