dstack 0.19.30rc1__py3-none-any.whl → 0.19.32__py3-none-any.whl
This diff shows the changes between these two publicly released package versions as they appear in their public registry. It is provided for informational purposes only.
- dstack/_internal/cli/commands/__init__.py +8 -0
- dstack/_internal/cli/commands/project.py +27 -20
- dstack/_internal/cli/commands/server.py +5 -0
- dstack/_internal/cli/services/configurators/fleet.py +20 -6
- dstack/_internal/cli/utils/gpu.py +2 -2
- dstack/_internal/core/backends/aws/compute.py +13 -5
- dstack/_internal/core/backends/aws/resources.py +11 -6
- dstack/_internal/core/backends/azure/compute.py +17 -6
- dstack/_internal/core/backends/base/compute.py +57 -9
- dstack/_internal/core/backends/base/offers.py +1 -0
- dstack/_internal/core/backends/cloudrift/compute.py +2 -0
- dstack/_internal/core/backends/cudo/compute.py +2 -0
- dstack/_internal/core/backends/datacrunch/compute.py +2 -0
- dstack/_internal/core/backends/digitalocean_base/compute.py +2 -0
- dstack/_internal/core/backends/features.py +5 -0
- dstack/_internal/core/backends/gcp/compute.py +87 -38
- dstack/_internal/core/backends/gcp/configurator.py +1 -1
- dstack/_internal/core/backends/gcp/models.py +14 -1
- dstack/_internal/core/backends/gcp/resources.py +35 -12
- dstack/_internal/core/backends/hotaisle/compute.py +22 -0
- dstack/_internal/core/backends/kubernetes/compute.py +531 -215
- dstack/_internal/core/backends/kubernetes/models.py +13 -16
- dstack/_internal/core/backends/kubernetes/utils.py +145 -8
- dstack/_internal/core/backends/lambdalabs/compute.py +2 -0
- dstack/_internal/core/backends/local/compute.py +2 -0
- dstack/_internal/core/backends/nebius/compute.py +17 -0
- dstack/_internal/core/backends/nebius/configurator.py +15 -0
- dstack/_internal/core/backends/nebius/models.py +57 -5
- dstack/_internal/core/backends/nebius/resources.py +45 -2
- dstack/_internal/core/backends/oci/compute.py +7 -1
- dstack/_internal/core/backends/oci/resources.py +8 -3
- dstack/_internal/core/backends/template/compute.py.jinja +2 -0
- dstack/_internal/core/backends/tensordock/compute.py +2 -0
- dstack/_internal/core/backends/vultr/compute.py +2 -0
- dstack/_internal/core/compatibility/runs.py +8 -0
- dstack/_internal/core/consts.py +2 -0
- dstack/_internal/core/models/profiles.py +11 -4
- dstack/_internal/core/services/repos.py +101 -11
- dstack/_internal/server/background/tasks/common.py +2 -0
- dstack/_internal/server/background/tasks/process_fleets.py +75 -17
- dstack/_internal/server/background/tasks/process_instances.py +3 -5
- dstack/_internal/server/background/tasks/process_running_jobs.py +1 -1
- dstack/_internal/server/background/tasks/process_runs.py +27 -23
- dstack/_internal/server/background/tasks/process_submitted_jobs.py +107 -54
- dstack/_internal/server/services/offers.py +7 -1
- dstack/_internal/server/testing/common.py +2 -0
- dstack/_internal/server/utils/provisioning.py +3 -10
- dstack/_internal/utils/ssh.py +22 -2
- dstack/version.py +2 -2
- {dstack-0.19.30rc1.dist-info → dstack-0.19.32.dist-info}/METADATA +20 -18
- {dstack-0.19.30rc1.dist-info → dstack-0.19.32.dist-info}/RECORD +54 -54
- {dstack-0.19.30rc1.dist-info → dstack-0.19.32.dist-info}/WHEEL +0 -0
- {dstack-0.19.30rc1.dist-info → dstack-0.19.32.dist-info}/entry_points.txt +0 -0
- {dstack-0.19.30rc1.dist-info → dstack-0.19.32.dist-info}/licenses/LICENSE.md +0 -0
dstack/_internal/server/background/tasks/process_submitted_jobs.py
CHANGED

@@ -3,7 +3,7 @@ import itertools
 import math
 import uuid
 from datetime import datetime, timedelta
-from typing import List, Optional
+from typing import List, Optional

 from sqlalchemy import and_, func, not_, or_, select
 from sqlalchemy.ext.asyncio import AsyncSession

@@ -25,6 +25,7 @@ from dstack._internal.core.models.instances import InstanceOfferWithAvailability
 from dstack._internal.core.models.profiles import (
     DEFAULT_RUN_TERMINATION_IDLE_TIME,
     CreationPolicy,
+    Profile,
     TerminationPolicy,
 )
 from dstack._internal.core.models.resources import Memory

@@ -34,6 +35,7 @@ from dstack._internal.core.models.runs import (
     JobRuntimeData,
     JobStatus,
     JobTerminationReason,
+    Requirements,
     Run,
     RunSpec,
 )

@@ -186,7 +188,7 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
     project = run_model.project
     run = run_model_to_run(run_model)
     run_spec = run.run_spec
-
+    run_profile = run_spec.merged_profile
     job = find_job(run.jobs, job_model.replica_num, job_model.job_num)
     multinode = job.job_spec.jobs_per_replica > 1

@@ -258,7 +260,6 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):

     instance_filters = [
         InstanceModel.deleted == False,
-        InstanceModel.total_blocks > InstanceModel.busy_blocks,
         InstanceModel.id.not_in(detaching_instances_ids),
     ]

@@ -333,7 +334,7 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
         job_model.status = JobStatus.PROVISIONING
     else:
         # Assigned no instance, create a new one
-        if
+        if run_profile.creation_policy == CreationPolicy.REUSE:
             logger.debug("%s: reuse instance failed", fmt(job_model))
             job_model.status = JobStatus.TERMINATING
             job_model.termination_reason = JobTerminationReason.FAILED_TO_START_DUE_TO_NO_CAPACITY

@@ -362,7 +363,7 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
         return

     logger.info("%s: now is provisioning a new instance", fmt(job_model))
-    job_provisioning_data, offer = run_job_result
+    job_provisioning_data, offer, effective_profile, _ = run_job_result
     job_model.job_provisioning_data = job_provisioning_data.json()
     job_model.status = JobStatus.PROVISIONING
     if fleet_model is None:
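Reviewer note: `_run_job_on_new_instance` now returns the effective (fleet-merged) profile and requirements alongside the provisioning data and offer, so the caller unpacks a 4-tuple. A minimal sketch of the new contract, using stand-in types (the `ProvisionResult` class below is illustrative, not part of dstack):

```python
from typing import NamedTuple, Optional


class ProvisionResult(NamedTuple):
    job_provisioning_data: dict  # stands in for JobProvisioningData
    offer: dict                  # stands in for InstanceOfferWithAvailability
    profile: dict                # effective (fleet-merged) Profile
    requirements: dict           # effective (fleet-merged) Requirements


def handle(run_job_result: Optional[ProvisionResult]) -> None:
    if run_job_result is None:
        # Provisioning failed in every backend; the caller terminates the job.
        return
    # The effective profile is kept (it is later stored on the instance);
    # the requirements element is discarded at this call site.
    job_provisioning_data, offer, effective_profile, _ = run_job_result
    print(job_provisioning_data, offer, effective_profile)
```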
@@ -382,12 +383,11 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
         instance = _create_instance_model_for_job(
             project=project,
             fleet_model=fleet_model,
-            run_spec=run_spec,
             job_model=job_model,
-            job=job,
             job_provisioning_data=job_provisioning_data,
             offer=offer,
             instance_num=instance_num,
+            profile=effective_profile,
         )
         job_model.job_runtime_data = _prepare_job_runtime_data(offer, multinode).json()
         # Both this task and process_fleets can add instances to fleets.

@@ -513,9 +513,6 @@ async def _find_optimal_fleet_with_offers(
         )
         return run_model.fleet, fleet_instances_with_pool_offers

-    if len(fleet_models) == 0:
-        return None, []
-
     nodes_required_num = _get_nodes_required_num_for_run(run_spec)
     # The current strategy is first to consider fleets that can accommodate
     # the run without additional provisioning and choose the one with the cheapest pool offer.

@@ -533,6 +530,7 @@ async def _find_optimal_fleet_with_offers(
         ]
     ] = []
     for candidate_fleet_model in fleet_models:
+        candidate_fleet = fleet_model_to_fleet(candidate_fleet_model)
         fleet_instances_with_pool_offers = _get_fleet_instances_with_pool_offers(
             fleet_model=candidate_fleet_model,
             run_spec=run_spec,
@@ -540,29 +538,25 @@ async def _find_optimal_fleet_with_offers(
             master_job_provisioning_data=master_job_provisioning_data,
             volumes=volumes,
         )
-
+        fleet_has_pool_capacity = nodes_required_num <= len(fleet_instances_with_pool_offers)
         fleet_cheapest_pool_offer = math.inf
         if len(fleet_instances_with_pool_offers) > 0:
            fleet_cheapest_pool_offer = fleet_instances_with_pool_offers[0][1].price

-
-        profile = combine_fleet_and_run_profiles(
-            candidate_fleet.spec.merged_profile, run_spec.merged_profile
-        )
-        fleet_requirements = get_fleet_requirements(candidate_fleet.spec)
-        requirements = combine_fleet_and_run_requirements(
-            fleet_requirements, job.job_spec.requirements
-        )
-        multinode = (
-            candidate_fleet.spec.configuration.placement == InstanceGroupPlacement.CLUSTER
-            or job.job_spec.jobs_per_replica > 1
-        )
-        fleet_backend_offers = []
-        if (
+        try:
             _check_can_create_new_instance_in_fleet(candidate_fleet)
-
-
-
+            profile, requirements = _get_run_profile_and_requirements_in_fleet(
+                job=job,
+                run_spec=run_spec,
+                fleet=candidate_fleet,
+            )
+        except ValueError:
+            fleet_backend_offers = []
+        else:
+            multinode = (
+                candidate_fleet.spec.configuration.placement == InstanceGroupPlacement.CLUSTER
+                or job.job_spec.jobs_per_replica > 1
+            )
             fleet_backend_offers = await get_offers_by_requirements(
                 project=project,
                 profile=profile,

@@ -579,8 +573,12 @@ async def _find_optimal_fleet_with_offers(
         if len(fleet_backend_offers) > 0:
             fleet_cheapest_backend_offer = fleet_backend_offers[0][1].price

+        if not _run_can_fit_into_fleet(run_spec, candidate_fleet):
+            logger.debug("Skipping fleet %s from consideration: run cannot fit into fleet")
+            continue
+
         fleet_priority = (
-            not
+            not fleet_has_pool_capacity,
             fleet_cheapest_pool_offer,
             fleet_cheapest_backend_offer,
         )

@@ -593,10 +591,13 @@ async def _find_optimal_fleet_with_offers(
             fleet_priority,
         )
     )
+    if len(candidate_fleets_with_offers) == 0:
+        return None, []
     if run_spec.merged_profile.fleets is None and all(
         t[2] == 0 and t[3] == 0 for t in candidate_fleets_with_offers
     ):
-        # If fleets are not specified and no fleets have available pool
+        # If fleets are not specified and no fleets have available pool
+        # or backend offers, create a new fleet.
         # This is for compatibility with non-fleet-first UX when runs created new fleets
         # if there are no instances to reuse.
         return None, []
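The `fleet_priority` tuple relies on Python's lexicographic tuple ordering: `False` sorts before `True`, so fleets that still have pool capacity win first, with the cheapest pool offer and then the cheapest backend offer as tie-breakers. A small self-contained illustration (not dstack code; fleet names and prices are made up):

```python
import math

candidates = [
    # (not fleet_has_pool_capacity, cheapest_pool_offer, cheapest_backend_offer)
    ("fleet-a", (True, math.inf, 2.10)),   # no pool capacity, backend offer at $2.10
    ("fleet-b", (False, 1.50, math.inf)),  # pool capacity at $1.50
    ("fleet-c", (False, 0.90, 1.20)),      # pool capacity at $0.90
]
# Tuples compare element by element, so both fleets with pool capacity
# beat fleet-a, and fleet-c wins on the cheaper pool offer.
best = min(candidates, key=lambda c: c[1])
assert best[0] == "fleet-c"
```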
@@ -616,6 +617,39 @@ def _get_nodes_required_num_for_run(run_spec: RunSpec) -> int:
     return nodes_required_num


+def _run_can_fit_into_fleet(run_spec: RunSpec, fleet: Fleet) -> bool:
+    """
+    Returns `False` if the run cannot fit into fleet for sure.
+    This is helpful heuristic to avoid even considering fleets too small for a run.
+    A run may not fit even if this function returns `True`.
+    This will lead to some jobs failing due to exceeding `nodes.max`
+    or more than `nodes.max` instances being provisioned
+    and eventually removed by the fleet consolidation logic.
+    """
+    # No check for cloud fleets with blocks > 1 since we don't know
+    # how many jobs such fleets can accommodate.
+    nodes_required_num = _get_nodes_required_num_for_run(run_spec)
+    if (
+        fleet.spec.configuration.nodes is not None
+        and fleet.spec.configuration.blocks == 1
+        and fleet.spec.configuration.nodes.max is not None
+    ):
+        busy_instances = [i for i in fleet.instances if i.busy_blocks > 0]
+        fleet_available_capacity = fleet.spec.configuration.nodes.max - len(busy_instances)
+        if fleet_available_capacity < nodes_required_num:
+            return False
+    elif fleet.spec.configuration.ssh_config is not None:
+        # Currently assume that each idle block can run a job.
+        # TODO: Take resources / eligible offers into account.
+        total_idle_blocks = 0
+        for instance in fleet.instances:
+            total_blocks = instance.total_blocks or 1
+            total_idle_blocks += total_blocks - instance.busy_blocks
+        if total_idle_blocks < nodes_required_num:
+            return False
+    return True
+
+
 def _get_fleet_instances_with_pool_offers(
     fleet_model: FleetModel,
     run_spec: RunSpec,
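A toy illustration of the `nodes.max` branch of this heuristic, using ad-hoc stand-ins for the fleet and instance models (the real `Fleet`/`Instance` types carry many more fields):

```python
from dataclasses import dataclass, field
from typing import List, Optional


@dataclass
class FakeInstance:
    total_blocks: Optional[int] = 1
    busy_blocks: int = 0


@dataclass
class FakeFleet:
    nodes_max: int = 4
    instances: List[FakeInstance] = field(default_factory=list)


# 3 of the 4 allowed nodes are busy, so only 1 node of headroom remains.
fleet = FakeFleet(nodes_max=4, instances=[FakeInstance(busy_blocks=1) for _ in range(3)])
busy = [i for i in fleet.instances if i.busy_blocks > 0]
available = fleet.nodes_max - len(busy)
assert available == 1  # a 2-node run would be skipped by the heuristic
```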
@@ -704,7 +738,7 @@ async def _run_job_on_new_instance(
     master_job_provisioning_data: Optional[JobProvisioningData] = None,
     volumes: Optional[List[List[Volume]]] = None,
     fleet_model: Optional[FleetModel] = None,
-) -> Optional[
+) -> Optional[tuple[JobProvisioningData, InstanceOfferWithAvailability, Profile, Requirements]]:
     if volumes is None:
         volumes = []
     profile = run.run_spec.merged_profile

@@ -712,21 +746,15 @@ async def _run_job_on_new_instance(
     fleet = None
     if fleet_model is not None:
         fleet = fleet_model_to_fleet(fleet_model)
-
-
-
-
-
-
-        if profile is None:
-            logger.debug("%s: cannot combine fleet %s profile", fmt(job_model), fleet_model.name)
-            return None
-        fleet_requirements = get_fleet_requirements(fleet.spec)
-        requirements = combine_fleet_and_run_requirements(fleet_requirements, requirements)
-        if requirements is None:
-            logger.debug(
-                "%s: cannot combine fleet %s requirements", fmt(job_model), fleet_model.name
+        try:
+            _check_can_create_new_instance_in_fleet(fleet)
+            profile, requirements = _get_run_profile_and_requirements_in_fleet(
+                job=job,
+                run_spec=run.run_spec,
+                fleet=fleet,
             )
+        except ValueError as e:
+            logger.debug("%s: %s", fmt(job_model), e.args[0])
             return None
     # TODO: Respect fleet provisioning properties such as tags

@@ -766,7 +794,7 @@ async def _run_job_on_new_instance(
                 project_ssh_private_key,
                 offer_volumes,
             )
-            return job_provisioning_data, offer
+            return job_provisioning_data, offer, profile, requirements
         except BackendError as e:
             logger.warning(
                 "%s: %s launch in %s/%s failed: %s",

@@ -789,13 +817,40 @@ async def _run_job_on_new_instance(
     return None


-def
+def _get_run_profile_and_requirements_in_fleet(
+    job: Job,
+    run_spec: RunSpec,
+    fleet: Fleet,
+) -> tuple[Profile, Requirements]:
+    profile = combine_fleet_and_run_profiles(fleet.spec.merged_profile, run_spec.merged_profile)
+    if profile is None:
+        raise ValueError("Cannot combine fleet profile")
+    fleet_requirements = get_fleet_requirements(fleet.spec)
+    requirements = combine_fleet_and_run_requirements(
+        fleet_requirements, job.job_spec.requirements
+    )
+    if requirements is None:
+        raise ValueError("Cannot combine fleet requirements")
+    return profile, requirements
+
+
+def _check_can_create_new_instance_in_fleet(fleet: Fleet):
+    if not _can_create_new_instance_in_fleet(fleet):
+        raise ValueError("Cannot fit new instance into fleet")
+
+
+def _can_create_new_instance_in_fleet(fleet: Fleet) -> bool:
     if fleet.spec.configuration.ssh_config is not None:
         return False
-
-    #
-    #
-
+    active_instances = [i for i in fleet.instances if i.status.is_active()]
+    # nodes.max is a soft limit that can be exceeded when provisioning concurrently.
+    # The fleet consolidation logic will remove redundant nodes eventually.
+    if (
+        fleet.spec.configuration.nodes is not None
+        and fleet.spec.configuration.nodes.max is not None
+        and len(active_instances) >= fleet.spec.configuration.nodes.max
+    ):
+        return False
     return True

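Both call sites now share the same raise-and-catch flow: the helpers raise `ValueError` with a human-readable message, and the caller either skips the candidate fleet or logs and returns. A stripped-down sketch of the pattern (names here are illustrative, not dstack's):

```python
def check_capacity(active: int, nodes_max: int) -> None:
    # Validation helper: raises instead of returning a bool, so callers
    # can surface the message in their own logging.
    if active >= nodes_max:
        raise ValueError("Cannot fit new instance into fleet")


def try_fleet(active: int, nodes_max: int) -> bool:
    try:
        check_capacity(active, nodes_max)
    except ValueError as e:
        print(f"skipping fleet: {e.args[0]}")
        return False
    return True


assert try_fleet(active=1, nodes_max=2) is True
assert try_fleet(active=2, nodes_max=2) is False
```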
@@ -857,14 +912,12 @@ async def _get_next_instance_num(session: AsyncSession, fleet_model: FleetModel)
 def _create_instance_model_for_job(
     project: ProjectModel,
     fleet_model: FleetModel,
-    run_spec: RunSpec,
     job_model: JobModel,
-    job: Job,
     job_provisioning_data: JobProvisioningData,
     offer: InstanceOfferWithAvailability,
     instance_num: int,
+    profile: Profile,
 ) -> InstanceModel:
-    profile = run_spec.merged_profile
     if not job_provisioning_data.dockerized:
         # terminate vastai/k8s instances immediately
         termination_policy = TerminationPolicy.DESTROY_AFTER_IDLE
dstack/_internal/server/services/offers.py
CHANGED

@@ -7,6 +7,7 @@ from dstack._internal.core.backends.base.compute import ComputeWithPlacementGrou
 from dstack._internal.core.backends.features import (
     BACKENDS_WITH_CREATE_INSTANCE_SUPPORT,
     BACKENDS_WITH_MULTINODE_SUPPORT,
+    BACKENDS_WITH_PRIVILEGED_SUPPORT,
     BACKENDS_WITH_RESERVATION_SUPPORT,
 )
 from dstack._internal.core.models.backends.base import BackendType

@@ -67,7 +68,12 @@ async def get_offers_by_requirements(
             backend_types = BACKENDS_WITH_MULTINODE_SUPPORT
         backend_types = [b for b in backend_types if b in BACKENDS_WITH_MULTINODE_SUPPORT]

-    if privileged
+    if privileged:
+        if backend_types is None:
+            backend_types = BACKENDS_WITH_PRIVILEGED_SUPPORT
+        backend_types = [b for b in backend_types if b in BACKENDS_WITH_PRIVILEGED_SUPPORT]
+
+    if instance_mounts:
         if backend_types is None:
             backend_types = BACKENDS_WITH_CREATE_INSTANCE_SUPPORT
         backend_types = [b for b in backend_types if b in BACKENDS_WITH_CREATE_INSTANCE_SUPPORT]
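The `privileged` handling follows the same narrowing pattern as the other feature flags: `None` means "no constraint yet", the first constraint seeds the list, and every later constraint intersects it. A self-contained illustration with hypothetical backend names (the real constants live in `dstack._internal.core.backends.features`):

```python
from typing import List, Optional

BACKENDS_WITH_PRIVILEGED_SUPPORT = ["aws", "gcp", "azure"]        # hypothetical values
BACKENDS_WITH_CREATE_INSTANCE_SUPPORT = ["aws", "gcp", "runpod"]  # hypothetical values


def narrow(backend_types: Optional[List[str]], supported: List[str]) -> List[str]:
    if backend_types is None:  # None means "no constraint applied yet"
        backend_types = supported
    # Intersect with the backends that support this feature.
    return [b for b in backend_types if b in supported]


backend_types = None
backend_types = narrow(backend_types, BACKENDS_WITH_PRIVILEGED_SUPPORT)
backend_types = narrow(backend_types, BACKENDS_WITH_CREATE_INSTANCE_SUPPORT)
assert backend_types == ["aws", "gcp"]
```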
dstack/_internal/server/testing/common.py
CHANGED

@@ -16,6 +16,7 @@ from dstack._internal.core.backends.base.compute import (
     ComputeWithMultinodeSupport,
     ComputeWithPlacementGroupSupport,
     ComputeWithPrivateGatewaySupport,
+    ComputeWithPrivilegedSupport,
     ComputeWithReservationSupport,
     ComputeWithVolumeSupport,
 )

@@ -1131,6 +1132,7 @@ class AsyncContextManager:
 class ComputeMockSpec(
     Compute,
     ComputeWithCreateInstanceSupport,
+    ComputeWithPrivilegedSupport,
     ComputeWithMultinodeSupport,
     ComputeWithReservationSupport,
     ComputeWithPlacementGroupSupport,
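`ComputeWithPrivilegedSupport` appears to be one of dstack's capability marker mixins: a backend's `Compute` class opts into a feature by inheriting the mixin, and callers test for it with `isinstance`. A hedged sketch of the pattern (the classes below are stand-ins, not the actual dstack definitions):

```python
class Compute:
    pass


class ComputeWithPrivilegedSupport:
    """Marker mixin: the backend can run privileged containers."""


class ExampleCompute(Compute, ComputeWithPrivilegedSupport):
    # A hypothetical backend that declares privileged support.
    pass


def supports_privileged(compute: Compute) -> bool:
    return isinstance(compute, ComputeWithPrivilegedSupport)


assert supports_privileged(ExampleCompute())
assert not supports_privileged(Compute())
```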
dstack/_internal/server/utils/provisioning.py
CHANGED

@@ -6,7 +6,7 @@ from textwrap import dedent
 from typing import Any, Dict, Generator, List, Optional

 import paramiko
-from gpuhunt import AcceleratorVendor,
+from gpuhunt import AcceleratorVendor, correct_gpu_memory_gib

 from dstack._internal.core.backends.base.compute import GoArchType, normalize_arch
 from dstack._internal.core.consts import DSTACK_SHIM_HTTP_PORT

@@ -248,14 +248,7 @@ def _get_shim_healthcheck(client: paramiko.SSHClient) -> Optional[str]:
     return out


-def host_info_to_instance_type(host_info: Dict[str, Any],
-    _cpu_arch: CPUArchitecture
-    if cpu_arch == "amd64":
-        _cpu_arch = CPUArchitecture.X86
-    elif cpu_arch == "arm64":
-        _cpu_arch = CPUArchitecture.ARM
-    else:
-        raise ValueError(f"Unexpected cpu_arch: {cpu_arch}")
+def host_info_to_instance_type(host_info: Dict[str, Any], arch: GoArchType) -> InstanceType:
     gpu_count = host_info.get("gpu_count", 0)
     if gpu_count > 0:
         gpu_vendor = AcceleratorVendor.cast(host_info.get("gpu_vendor", "nvidia"))

@@ -280,7 +273,7 @@ def host_info_to_instance_type(host_info: Dict[str, Any], cpu_arch: GoArchType)
     instance_type = InstanceType(
         name="instance",
         resources=Resources(
-            cpu_arch=
+            cpu_arch=arch.to_cpu_architecture(),
             cpus=host_info["cpus"],
             memory_mib=host_info["memory"] / 1024 / 1024,
             spot=False,
dstack/_internal/utils/ssh.py
CHANGED

@@ -50,8 +50,28 @@ def make_ssh_command_for_git(identity_file: PathLike) -> str:
     )


-def make_git_env(
-
+def make_git_env(
+    *,
+    disable_prompt: bool = True,
+    disable_askpass: bool = False,
+    disable_config: bool = False,
+    identity_file: Optional[PathLike] = None,
+) -> dict[str, str]:
+    env: dict[str, str] = {}
+    if disable_prompt:
+        # Fail with error instead of prompting on the terminal (e.g., when asking for
+        # HTTP authentication)
+        env["GIT_TERMINAL_PROMPT"] = "0"
+    if disable_askpass:
+        env["GIT_ASKPASS"] = ""
+        env["SSH_ASKPASS"] = ""
+    if disable_config:
+        # Disable system-wide config (usually /etc/gitconfig)
+        env["GIT_CONFIG_SYSTEM"] = os.devnull
+        # Disable user (aka "global") config ($XDG_CONFIG_HOME/git/config or ~/.git/config)
+        env["GIT_CONFIG_GLOBAL"] = os.devnull
+        # Disable repo (aka "local") config (./.git/config)
+        env["GIT_DIR"] = os.devnull
     if identity_file is not None:
         env["GIT_SSH_COMMAND"] = make_ssh_command_for_git(identity_file)
     return env
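`make_git_env` and its keyword-only parameters come from the diff above; a possible usage sketch (the surrounding subprocess call is illustrative, not from dstack):

```python
import os
import subprocess

from dstack._internal.utils.ssh import make_git_env

# Merge the git overrides into the current environment: prompts are
# suppressed and all gitconfig files are disabled, so the command behaves
# the same regardless of the user's local git configuration.
env = {**os.environ, **make_git_env(disable_askpass=True, disable_config=True)}
subprocess.run(
    ["git", "ls-remote", "https://github.com/dstackai/dstack"],
    env=env,
    check=True,
)
```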
dstack/version.py
CHANGED

{dstack-0.19.30rc1.dist-info → dstack-0.19.32.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dstack
-Version: 0.19.30rc1
+Version: 0.19.32
 Summary: dstack is an open-source orchestration engine for running AI workloads on any cloud or on-premises.
 Project-URL: Homepage, https://dstack.ai
 Project-URL: Source, https://github.com/dstackai/dstack

@@ -73,7 +73,7 @@ Requires-Dist: grpcio>=1.50; extra == 'all'
 Requires-Dist: httpx; extra == 'all'
 Requires-Dist: jinja2; extra == 'all'
 Requires-Dist: kubernetes; extra == 'all'
-Requires-Dist: nebius
+Requires-Dist: nebius<=0.2.72,>=0.2.40; (python_version >= '3.10') and extra == 'all'
 Requires-Dist: oci>=2.150.0; extra == 'all'
 Requires-Dist: prometheus-client; extra == 'all'
 Requires-Dist: pyopenssl>=23.2.0; extra == 'all'

@@ -259,7 +259,7 @@ Requires-Dist: fastapi; extra == 'nebius'
 Requires-Dist: grpcio>=1.50; extra == 'nebius'
 Requires-Dist: httpx; extra == 'nebius'
 Requires-Dist: jinja2; extra == 'nebius'
-Requires-Dist: nebius
+Requires-Dist: nebius<=0.2.72,>=0.2.40; (python_version >= '3.10') and extra == 'nebius'
 Requires-Dist: prometheus-client; extra == 'nebius'
 Requires-Dist: python-dxf==12.1.0; extra == 'nebius'
 Requires-Dist: python-json-logger>=3.1.0; extra == 'nebius'

@@ -331,24 +331,26 @@ Description-Content-Type: text/markdown

 </div>

-`dstack` is
+`dstack` is a unified control plane for GPU provisioning and orchestration that works with any GPU cloud, Kubernetes, or on-prem clusters.

-
+It streamlines development, training, and inference, and is compatible with any hardware, open-source tools, and frameworks.
+
+#### Hardware

 `dstack` supports `NVIDIA`, `AMD`, `Google TPU`, `Intel Gaudi`, and `Tenstorrent` accelerators out of the box.

 ## Latest news ✨
+- [2025/10] [dstack 0.19.31: Kubernetes, GCP A4 spot](https://github.com/dstackai/dstack/releases/tag/0.19.31)
+- [2025/08] [dstack 0.19.26: Repos](https://github.com/dstackai/dstack/releases/tag/0.19.26)
+- [2025/08] [dstack 0.19.22: Service probes, GPU health-checks, Tenstorrent Galaxy](https://github.com/dstackai/dstack/releases/tag/0.19.22)
+- [2025/07] [dstack 0.19.21: Scheduled tasks](https://github.com/dstackai/dstack/releases/tag/0.19.21)
 - [2025/07] [dstack 0.19.17: Secrets, Files, Rolling deployment](https://github.com/dstackai/dstack/releases/tag/0.19.17)
-- [2025/06] [dstack 0.19.16: Docker in Docker
-- [2025/06] [dstack 0.19.13:
-- [2025/06] [dstack 0.19.12: Simplified use of MPI](https://github.com/dstackai/dstack/releases/tag/0.19.12)
-- [2025/05] [dstack 0.19.10: Priorities](https://github.com/dstackai/dstack/releases/tag/0.19.10)
-- [2025/05] [dstack 0.19.8: Nebius clusters, GH200 on Lambda](https://github.com/dstackai/dstack/releases/tag/0.19.8)
-- [2025/04] [dstack 0.19.6: Tenstorrent, Plugins](https://github.com/dstackai/dstack/releases/tag/0.19.6)
+- [2025/06] [dstack 0.19.16: Docker in Docker](https://github.com/dstackai/dstack/releases/tag/0.19.16)
+- [2025/06] [dstack 0.19.13: Default images with InfiniBand support](https://github.com/dstackai/dstack/releases/tag/0.19.13)

 ## How does it work?

-<img src="https://dstack.ai/static-assets/static-assets/images/dstack-architecture-diagram-
+<img src="https://dstack.ai/static-assets/static-assets/images/dstack-architecture-diagram-v11.svg" width="750" />

 ### Installation

@@ -356,15 +358,15 @@ Description-Content-Type: text/markdown

 #### Set up the server

-#####
+##### Configure backends
+
+To orchestrate compute across cloud providers or existing Kubernetes clusters, you need to configure backends.

-
-via the `~/.dstack/server/config.yml` file.
+Backends can be set up in `~/.dstack/server/config.yml` or through the [project settings page](https://dstack.ai/docs/concepts/projects#backends) in the UI.

-For more details
+For more details, see [Backends](https://dstack.ai/docs/concepts/backends).

->
-> once the server is up.
+> When using `dstack` with on-prem servers, backend configuration isn’t required. Simply create [SSH fleets](https://dstack.ai/docs/concepts/fleets#ssh) once the server is up.

 ##### Start the server
