dstack 0.19.30rc1__py3-none-any.whl → 0.19.32__py3-none-any.whl

This diff shows the content changes between these two publicly released package versions, as published to their respective public registries. It is provided for informational purposes only.

Potentially problematic release: this version of dstack might be problematic.
Files changed (54)
  1. dstack/_internal/cli/commands/__init__.py +8 -0
  2. dstack/_internal/cli/commands/project.py +27 -20
  3. dstack/_internal/cli/commands/server.py +5 -0
  4. dstack/_internal/cli/services/configurators/fleet.py +20 -6
  5. dstack/_internal/cli/utils/gpu.py +2 -2
  6. dstack/_internal/core/backends/aws/compute.py +13 -5
  7. dstack/_internal/core/backends/aws/resources.py +11 -6
  8. dstack/_internal/core/backends/azure/compute.py +17 -6
  9. dstack/_internal/core/backends/base/compute.py +57 -9
  10. dstack/_internal/core/backends/base/offers.py +1 -0
  11. dstack/_internal/core/backends/cloudrift/compute.py +2 -0
  12. dstack/_internal/core/backends/cudo/compute.py +2 -0
  13. dstack/_internal/core/backends/datacrunch/compute.py +2 -0
  14. dstack/_internal/core/backends/digitalocean_base/compute.py +2 -0
  15. dstack/_internal/core/backends/features.py +5 -0
  16. dstack/_internal/core/backends/gcp/compute.py +87 -38
  17. dstack/_internal/core/backends/gcp/configurator.py +1 -1
  18. dstack/_internal/core/backends/gcp/models.py +14 -1
  19. dstack/_internal/core/backends/gcp/resources.py +35 -12
  20. dstack/_internal/core/backends/hotaisle/compute.py +22 -0
  21. dstack/_internal/core/backends/kubernetes/compute.py +531 -215
  22. dstack/_internal/core/backends/kubernetes/models.py +13 -16
  23. dstack/_internal/core/backends/kubernetes/utils.py +145 -8
  24. dstack/_internal/core/backends/lambdalabs/compute.py +2 -0
  25. dstack/_internal/core/backends/local/compute.py +2 -0
  26. dstack/_internal/core/backends/nebius/compute.py +17 -0
  27. dstack/_internal/core/backends/nebius/configurator.py +15 -0
  28. dstack/_internal/core/backends/nebius/models.py +57 -5
  29. dstack/_internal/core/backends/nebius/resources.py +45 -2
  30. dstack/_internal/core/backends/oci/compute.py +7 -1
  31. dstack/_internal/core/backends/oci/resources.py +8 -3
  32. dstack/_internal/core/backends/template/compute.py.jinja +2 -0
  33. dstack/_internal/core/backends/tensordock/compute.py +2 -0
  34. dstack/_internal/core/backends/vultr/compute.py +2 -0
  35. dstack/_internal/core/compatibility/runs.py +8 -0
  36. dstack/_internal/core/consts.py +2 -0
  37. dstack/_internal/core/models/profiles.py +11 -4
  38. dstack/_internal/core/services/repos.py +101 -11
  39. dstack/_internal/server/background/tasks/common.py +2 -0
  40. dstack/_internal/server/background/tasks/process_fleets.py +75 -17
  41. dstack/_internal/server/background/tasks/process_instances.py +3 -5
  42. dstack/_internal/server/background/tasks/process_running_jobs.py +1 -1
  43. dstack/_internal/server/background/tasks/process_runs.py +27 -23
  44. dstack/_internal/server/background/tasks/process_submitted_jobs.py +107 -54
  45. dstack/_internal/server/services/offers.py +7 -1
  46. dstack/_internal/server/testing/common.py +2 -0
  47. dstack/_internal/server/utils/provisioning.py +3 -10
  48. dstack/_internal/utils/ssh.py +22 -2
  49. dstack/version.py +2 -2
  50. {dstack-0.19.30rc1.dist-info → dstack-0.19.32.dist-info}/METADATA +20 -18
  51. {dstack-0.19.30rc1.dist-info → dstack-0.19.32.dist-info}/RECORD +54 -54
  52. {dstack-0.19.30rc1.dist-info → dstack-0.19.32.dist-info}/WHEEL +0 -0
  53. {dstack-0.19.30rc1.dist-info → dstack-0.19.32.dist-info}/entry_points.txt +0 -0
  54. {dstack-0.19.30rc1.dist-info → dstack-0.19.32.dist-info}/licenses/LICENSE.md +0 -0
dstack/_internal/server/background/tasks/process_submitted_jobs.py CHANGED
@@ -3,7 +3,7 @@ import itertools
 import math
 import uuid
 from datetime import datetime, timedelta
-from typing import List, Optional, Tuple
+from typing import List, Optional
 
 from sqlalchemy import and_, func, not_, or_, select
 from sqlalchemy.ext.asyncio import AsyncSession
@@ -25,6 +25,7 @@ from dstack._internal.core.models.instances import InstanceOfferWithAvailability
 from dstack._internal.core.models.profiles import (
     DEFAULT_RUN_TERMINATION_IDLE_TIME,
     CreationPolicy,
+    Profile,
     TerminationPolicy,
 )
 from dstack._internal.core.models.resources import Memory
@@ -34,6 +35,7 @@ from dstack._internal.core.models.runs import (
     JobRuntimeData,
     JobStatus,
     JobTerminationReason,
+    Requirements,
     Run,
     RunSpec,
 )
@@ -186,7 +188,7 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
     project = run_model.project
     run = run_model_to_run(run_model)
     run_spec = run.run_spec
-    profile = run_spec.merged_profile
+    run_profile = run_spec.merged_profile
     job = find_job(run.jobs, job_model.replica_num, job_model.job_num)
     multinode = job.job_spec.jobs_per_replica > 1
 
@@ -258,7 +260,6 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
 
     instance_filters = [
         InstanceModel.deleted == False,
-        InstanceModel.total_blocks > InstanceModel.busy_blocks,
         InstanceModel.id.not_in(detaching_instances_ids),
     ]
 
@@ -333,7 +334,7 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
         job_model.status = JobStatus.PROVISIONING
     else:
         # Assigned no instance, create a new one
-        if profile.creation_policy == CreationPolicy.REUSE:
+        if run_profile.creation_policy == CreationPolicy.REUSE:
            logger.debug("%s: reuse instance failed", fmt(job_model))
            job_model.status = JobStatus.TERMINATING
            job_model.termination_reason = JobTerminationReason.FAILED_TO_START_DUE_TO_NO_CAPACITY
@@ -362,7 +363,7 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
            return
 
        logger.info("%s: now is provisioning a new instance", fmt(job_model))
-       job_provisioning_data, offer = run_job_result
+       job_provisioning_data, offer, effective_profile, _ = run_job_result
        job_model.job_provisioning_data = job_provisioning_data.json()
        job_model.status = JobStatus.PROVISIONING
        if fleet_model is None:
@@ -382,12 +383,11 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
        instance = _create_instance_model_for_job(
            project=project,
            fleet_model=fleet_model,
-           run_spec=run_spec,
            job_model=job_model,
-           job=job,
            job_provisioning_data=job_provisioning_data,
            offer=offer,
            instance_num=instance_num,
+           profile=effective_profile,
        )
        job_model.job_runtime_data = _prepare_job_runtime_data(offer, multinode).json()
        # Both this task and process_fleets can add instances to fleets.
@@ -513,9 +513,6 @@ async def _find_optimal_fleet_with_offers(
        )
        return run_model.fleet, fleet_instances_with_pool_offers
 
-   if len(fleet_models) == 0:
-       return None, []
-
    nodes_required_num = _get_nodes_required_num_for_run(run_spec)
    # The current strategy is first to consider fleets that can accommodate
    # the run without additional provisioning and choose the one with the cheapest pool offer.
@@ -533,6 +530,7 @@ async def _find_optimal_fleet_with_offers(
        ]
    ] = []
    for candidate_fleet_model in fleet_models:
+       candidate_fleet = fleet_model_to_fleet(candidate_fleet_model)
        fleet_instances_with_pool_offers = _get_fleet_instances_with_pool_offers(
            fleet_model=candidate_fleet_model,
            run_spec=run_spec,
@@ -540,29 +538,25 @@ async def _find_optimal_fleet_with_offers(
            master_job_provisioning_data=master_job_provisioning_data,
            volumes=volumes,
        )
-       fleet_has_available_capacity = nodes_required_num <= len(fleet_instances_with_pool_offers)
+       fleet_has_pool_capacity = nodes_required_num <= len(fleet_instances_with_pool_offers)
        fleet_cheapest_pool_offer = math.inf
        if len(fleet_instances_with_pool_offers) > 0:
            fleet_cheapest_pool_offer = fleet_instances_with_pool_offers[0][1].price
 
-       candidate_fleet = fleet_model_to_fleet(candidate_fleet_model)
-       profile = combine_fleet_and_run_profiles(
-           candidate_fleet.spec.merged_profile, run_spec.merged_profile
-       )
-       fleet_requirements = get_fleet_requirements(candidate_fleet.spec)
-       requirements = combine_fleet_and_run_requirements(
-           fleet_requirements, job.job_spec.requirements
-       )
-       multinode = (
-           candidate_fleet.spec.configuration.placement == InstanceGroupPlacement.CLUSTER
-           or job.job_spec.jobs_per_replica > 1
-       )
-       fleet_backend_offers = []
-       if (
+       try:
            _check_can_create_new_instance_in_fleet(candidate_fleet)
-           and profile is not None
-           and requirements is not None
-       ):
+           profile, requirements = _get_run_profile_and_requirements_in_fleet(
+               job=job,
+               run_spec=run_spec,
+               fleet=candidate_fleet,
+           )
+       except ValueError:
+           fleet_backend_offers = []
+       else:
+           multinode = (
+               candidate_fleet.spec.configuration.placement == InstanceGroupPlacement.CLUSTER
+               or job.job_spec.jobs_per_replica > 1
+           )
            fleet_backend_offers = await get_offers_by_requirements(
                project=project,
                profile=profile,
@@ -579,8 +573,12 @@ async def _find_optimal_fleet_with_offers(
        if len(fleet_backend_offers) > 0:
            fleet_cheapest_backend_offer = fleet_backend_offers[0][1].price
 
+       if not _run_can_fit_into_fleet(run_spec, candidate_fleet):
+           logger.debug("Skipping fleet %s from consideration: run cannot fit into fleet")
+           continue
+
        fleet_priority = (
-           not fleet_has_available_capacity,
+           not fleet_has_pool_capacity,
            fleet_cheapest_pool_offer,
            fleet_cheapest_backend_offer,
        )
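
The `fleet_priority` tuples above are compared lexicographically when picking the best candidate fleet, so fleets that already have pool capacity sort first (`False` < `True`), with offer prices as tie-breakers. A minimal sketch of that ordering, using hypothetical fleet names and prices:

```python
import math

# Hypothetical (name, fleet_priority) pairs mirroring the tuple built above.
candidates = [
    ("fleet-a", (False, 0.50, math.inf)),  # has pool capacity, $0.50 pool offer
    ("fleet-b", (True, math.inf, 0.30)),   # needs provisioning, $0.30 backend offer
]
# Python compares tuples element by element, so pool capacity wins before price.
best = min(candidates, key=lambda c: c[1])
assert best[0] == "fleet-a"
```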
@@ -593,10 +591,13 @@ async def _find_optimal_fleet_with_offers(
                fleet_priority,
            )
        )
+   if len(candidate_fleets_with_offers) == 0:
+       return None, []
    if run_spec.merged_profile.fleets is None and all(
        t[2] == 0 and t[3] == 0 for t in candidate_fleets_with_offers
    ):
-       # If fleets are not specified and no fleets have available pool or backend offers, create a new fleet.
+       # If fleets are not specified and no fleets have available pool
+       # or backend offers, create a new fleet.
        # This is for compatibility with non-fleet-first UX when runs created new fleets
        # if there are no instances to reuse.
        return None, []
@@ -616,6 +617,39 @@ def _get_nodes_required_num_for_run(run_spec: RunSpec) -> int:
    return nodes_required_num
 
 
+def _run_can_fit_into_fleet(run_spec: RunSpec, fleet: Fleet) -> bool:
+    """
+    Returns `False` if the run cannot fit into fleet for sure.
+    This is helpful heuristic to avoid even considering fleets too small for a run.
+    A run may not fit even if this function returns `True`.
+    This will lead to some jobs failing due to exceeding `nodes.max`
+    or more than `nodes.max` instances being provisioned
+    and eventually removed by the fleet consolidation logic.
+    """
+    # No check for cloud fleets with blocks > 1 since we don't know
+    # how many jobs such fleets can accommodate.
+    nodes_required_num = _get_nodes_required_num_for_run(run_spec)
+    if (
+        fleet.spec.configuration.nodes is not None
+        and fleet.spec.configuration.blocks == 1
+        and fleet.spec.configuration.nodes.max is not None
+    ):
+        busy_instances = [i for i in fleet.instances if i.busy_blocks > 0]
+        fleet_available_capacity = fleet.spec.configuration.nodes.max - len(busy_instances)
+        if fleet_available_capacity < nodes_required_num:
+            return False
+    elif fleet.spec.configuration.ssh_config is not None:
+        # Currently assume that each idle block can run a job.
+        # TODO: Take resources / eligible offers into account.
+        total_idle_blocks = 0
+        for instance in fleet.instances:
+            total_blocks = instance.total_blocks or 1
+            total_idle_blocks += total_blocks - instance.busy_blocks
+        if total_idle_blocks < nodes_required_num:
+            return False
+    return True
+
+
 def _get_fleet_instances_with_pool_offers(
     fleet_model: FleetModel,
     run_spec: RunSpec,
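
For SSH fleets, the `_run_can_fit_into_fleet` heuristic above treats every idle block as capacity for one job. A worked example of the idle-block count, with hypothetical `total_blocks`/`busy_blocks` values standing in for the instance fields used in this diff:

```python
# Two hypothetical instances with 4 blocks each and 3 busy blocks in total.
instances = [
    {"total_blocks": 4, "busy_blocks": 1},  # 3 idle blocks
    {"total_blocks": 4, "busy_blocks": 2},  # 2 idle blocks
]
total_idle_blocks = sum((i["total_blocks"] or 1) - i["busy_blocks"] for i in instances)
assert total_idle_blocks == 5  # a run requiring 6 nodes would be skipped
```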
@@ -704,7 +738,7 @@ async def _run_job_on_new_instance(
    master_job_provisioning_data: Optional[JobProvisioningData] = None,
    volumes: Optional[List[List[Volume]]] = None,
    fleet_model: Optional[FleetModel] = None,
-) -> Optional[Tuple[JobProvisioningData, InstanceOfferWithAvailability]]:
+) -> Optional[tuple[JobProvisioningData, InstanceOfferWithAvailability, Profile, Requirements]]:
    if volumes is None:
        volumes = []
    profile = run.run_spec.merged_profile
@@ -712,21 +746,15 @@ async def _run_job_on_new_instance(
    fleet = None
    if fleet_model is not None:
        fleet = fleet_model_to_fleet(fleet_model)
-       if not _check_can_create_new_instance_in_fleet(fleet):
-           logger.debug(
-               "%s: cannot fit new instance into fleet %s", fmt(job_model), fleet_model.name
-           )
-           return None
-       profile = combine_fleet_and_run_profiles(fleet.spec.merged_profile, profile)
-       if profile is None:
-           logger.debug("%s: cannot combine fleet %s profile", fmt(job_model), fleet_model.name)
-           return None
-       fleet_requirements = get_fleet_requirements(fleet.spec)
-       requirements = combine_fleet_and_run_requirements(fleet_requirements, requirements)
-       if requirements is None:
-           logger.debug(
-               "%s: cannot combine fleet %s requirements", fmt(job_model), fleet_model.name
+       try:
+           _check_can_create_new_instance_in_fleet(fleet)
+           profile, requirements = _get_run_profile_and_requirements_in_fleet(
+               job=job,
+               run_spec=run.run_spec,
+               fleet=fleet,
            )
+       except ValueError as e:
+           logger.debug("%s: %s", fmt(job_model), e.args[0])
            return None
    # TODO: Respect fleet provisioning properties such as tags
 
@@ -766,7 +794,7 @@ async def _run_job_on_new_instance(
                project_ssh_private_key,
                offer_volumes,
            )
-           return job_provisioning_data, offer
+           return job_provisioning_data, offer, profile, requirements
        except BackendError as e:
            logger.warning(
                "%s: %s launch in %s/%s failed: %s",
@@ -789,13 +817,40 @@ async def _run_job_on_new_instance(
    return None
 
 
-def _check_can_create_new_instance_in_fleet(fleet: Fleet) -> bool:
+def _get_run_profile_and_requirements_in_fleet(
+    job: Job,
+    run_spec: RunSpec,
+    fleet: Fleet,
+) -> tuple[Profile, Requirements]:
+    profile = combine_fleet_and_run_profiles(fleet.spec.merged_profile, run_spec.merged_profile)
+    if profile is None:
+        raise ValueError("Cannot combine fleet profile")
+    fleet_requirements = get_fleet_requirements(fleet.spec)
+    requirements = combine_fleet_and_run_requirements(
+        fleet_requirements, job.job_spec.requirements
+    )
+    if requirements is None:
+        raise ValueError("Cannot combine fleet requirements")
+    return profile, requirements
+
+
+def _check_can_create_new_instance_in_fleet(fleet: Fleet):
+    if not _can_create_new_instance_in_fleet(fleet):
+        raise ValueError("Cannot fit new instance into fleet")
+
+
+def _can_create_new_instance_in_fleet(fleet: Fleet) -> bool:
    if fleet.spec.configuration.ssh_config is not None:
        return False
-   # TODO: Respect nodes.max
-   # Ensure concurrent provisioning does not violate nodes.max
-   # E.g. lock fleet and split instance model creation
-   # and instance provisioning into separate transactions.
+   active_instances = [i for i in fleet.instances if i.status.is_active()]
+   # nodes.max is a soft limit that can be exceeded when provisioning concurrently.
+   # The fleet consolidation logic will remove redundant nodes eventually.
+   if (
+       fleet.spec.configuration.nodes is not None
+       and fleet.spec.configuration.nodes.max is not None
+       and len(active_instances) >= fleet.spec.configuration.nodes.max
+   ):
+       return False
    return True
 
 
@@ -857,14 +912,12 @@ async def _get_next_instance_num(session: AsyncSession, fleet_model: FleetModel)
 def _create_instance_model_for_job(
     project: ProjectModel,
     fleet_model: FleetModel,
-    run_spec: RunSpec,
     job_model: JobModel,
-    job: Job,
     job_provisioning_data: JobProvisioningData,
     offer: InstanceOfferWithAvailability,
     instance_num: int,
+    profile: Profile,
 ) -> InstanceModel:
-    profile = run_spec.merged_profile
    if not job_provisioning_data.dockerized:
        # terminate vastai/k8s instances immediately
        termination_policy = TerminationPolicy.DESTROY_AFTER_IDLE
dstack/_internal/server/services/offers.py CHANGED
@@ -7,6 +7,7 @@ from dstack._internal.core.backends.base.compute import ComputeWithPlacementGrou
 from dstack._internal.core.backends.features import (
     BACKENDS_WITH_CREATE_INSTANCE_SUPPORT,
     BACKENDS_WITH_MULTINODE_SUPPORT,
+    BACKENDS_WITH_PRIVILEGED_SUPPORT,
     BACKENDS_WITH_RESERVATION_SUPPORT,
 )
 from dstack._internal.core.models.backends.base import BackendType
@@ -67,7 +68,12 @@ async def get_offers_by_requirements(
            backend_types = BACKENDS_WITH_MULTINODE_SUPPORT
        backend_types = [b for b in backend_types if b in BACKENDS_WITH_MULTINODE_SUPPORT]
 
-   if privileged or instance_mounts:
+   if privileged:
+       if backend_types is None:
+           backend_types = BACKENDS_WITH_PRIVILEGED_SUPPORT
+       backend_types = [b for b in backend_types if b in BACKENDS_WITH_PRIVILEGED_SUPPORT]
+
+   if instance_mounts:
        if backend_types is None:
            backend_types = BACKENDS_WITH_CREATE_INSTANCE_SUPPORT
        backend_types = [b for b in backend_types if b in BACKENDS_WITH_CREATE_INSTANCE_SUPPORT]
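
The change above splits the old combined `privileged or instance_mounts` check so that each capability filters `backend_types` against its own support list. A minimal sketch of the seed-then-intersect pattern (illustrative backend names, not the real contents of `BACKENDS_WITH_PRIVILEGED_SUPPORT`):

```python
from typing import Optional

# Illustrative values; the real list lives in dstack/_internal/core/backends/features.py.
BACKENDS_WITH_PRIVILEGED_SUPPORT = ["aws", "azure", "gcp"]

def narrow_backends(backend_types: Optional[list], privileged: bool) -> Optional[list]:
    if privileged:
        # Seed with the full support list if nothing was selected yet, then intersect.
        if backend_types is None:
            backend_types = BACKENDS_WITH_PRIVILEGED_SUPPORT
        backend_types = [b for b in backend_types if b in BACKENDS_WITH_PRIVILEGED_SUPPORT]
    return backend_types

assert narrow_backends(["aws", "runpod"], privileged=True) == ["aws"]
```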
dstack/_internal/server/testing/common.py CHANGED
@@ -16,6 +16,7 @@ from dstack._internal.core.backends.base.compute import (
     ComputeWithMultinodeSupport,
     ComputeWithPlacementGroupSupport,
     ComputeWithPrivateGatewaySupport,
+    ComputeWithPrivilegedSupport,
     ComputeWithReservationSupport,
     ComputeWithVolumeSupport,
 )
@@ -1131,6 +1132,7 @@ class AsyncContextManager:
 class ComputeMockSpec(
     Compute,
     ComputeWithCreateInstanceSupport,
+    ComputeWithPrivilegedSupport,
     ComputeWithMultinodeSupport,
     ComputeWithReservationSupport,
     ComputeWithPlacementGroupSupport,
dstack/_internal/server/utils/provisioning.py CHANGED
@@ -6,7 +6,7 @@ from textwrap import dedent
 from typing import Any, Dict, Generator, List, Optional
 
 import paramiko
-from gpuhunt import AcceleratorVendor, CPUArchitecture, correct_gpu_memory_gib
+from gpuhunt import AcceleratorVendor, correct_gpu_memory_gib
 
 from dstack._internal.core.backends.base.compute import GoArchType, normalize_arch
 from dstack._internal.core.consts import DSTACK_SHIM_HTTP_PORT
@@ -248,14 +248,7 @@ def _get_shim_healthcheck(client: paramiko.SSHClient) -> Optional[str]:
    return out
 
 
-def host_info_to_instance_type(host_info: Dict[str, Any], cpu_arch: GoArchType) -> InstanceType:
-    _cpu_arch: CPUArchitecture
-    if cpu_arch == "amd64":
-        _cpu_arch = CPUArchitecture.X86
-    elif cpu_arch == "arm64":
-        _cpu_arch = CPUArchitecture.ARM
-    else:
-        raise ValueError(f"Unexpected cpu_arch: {cpu_arch}")
+def host_info_to_instance_type(host_info: Dict[str, Any], arch: GoArchType) -> InstanceType:
    gpu_count = host_info.get("gpu_count", 0)
    if gpu_count > 0:
        gpu_vendor = AcceleratorVendor.cast(host_info.get("gpu_vendor", "nvidia"))
@@ -280,7 +273,7 @@ def host_info_to_instance_type(host_info: Dict[str, Any], cpu_arch: GoArchType)
    instance_type = InstanceType(
        name="instance",
        resources=Resources(
-           cpu_arch=_cpu_arch,
+           cpu_arch=arch.to_cpu_architecture(),
            cpus=host_info["cpus"],
            memory_mib=host_info["memory"] / 1024 / 1024,
            spot=False,
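
As a side note on the unchanged context above, `memory_mib` converts the shim-reported memory from bytes to MiB with two divisions by 1024; for a hypothetical 32 GiB host:

```python
# Hypothetical shim report: `memory` is in bytes.
memory_bytes = 32 * 1024**3
memory_mib = memory_bytes / 1024 / 1024
assert memory_mib == 32768.0
```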
dstack/_internal/utils/ssh.py CHANGED
@@ -50,8 +50,28 @@ def make_ssh_command_for_git(identity_file: PathLike) -> str:
    )
 
 
-def make_git_env(*, identity_file: Optional[PathLike] = None) -> dict[str, str]:
-    env: dict[str, str] = {"GIT_TERMINAL_PROMPT": "0"}
+def make_git_env(
+    *,
+    disable_prompt: bool = True,
+    disable_askpass: bool = False,
+    disable_config: bool = False,
+    identity_file: Optional[PathLike] = None,
+) -> dict[str, str]:
+    env: dict[str, str] = {}
+    if disable_prompt:
+        # Fail with error instead of prompting on the terminal (e.g., when asking for
+        # HTTP authentication)
+        env["GIT_TERMINAL_PROMPT"] = "0"
+    if disable_askpass:
+        env["GIT_ASKPASS"] = ""
+        env["SSH_ASKPASS"] = ""
+    if disable_config:
+        # Disable system-wide config (usually /etc/gitconfig)
+        env["GIT_CONFIG_SYSTEM"] = os.devnull
+        # Disable user (aka "global") config ($XDG_CONFIG_HOME/git/config or ~/.git/config)
+        env["GIT_CONFIG_GLOBAL"] = os.devnull
+        # Disable repo (aka "local") config (./.git/config)
+        env["GIT_DIR"] = os.devnull
    if identity_file is not None:
        env["GIT_SSH_COMMAND"] = make_ssh_command_for_git(identity_file)
    return env
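
A minimal usage sketch of the extended `make_git_env` (module path assumed from this diff; the returned dict should be merged into the parent environment rather than replace it):

```python
import os
import subprocess

from dstack._internal.utils.ssh import make_git_env  # path assumed from this diff

# Build a non-interactive git environment: no terminal or askpass prompts,
# and system/user/repo configs disabled.
git_env = make_git_env(disable_askpass=True, disable_config=True)
subprocess.run(
    ["git", "ls-remote", "https://example.com/repo.git"],  # hypothetical repo URL
    env={**os.environ, **git_env},
    check=True,
)
```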
dstack/version.py CHANGED
@@ -1,4 +1,4 @@
-__version__ = "0.19.30rc1"
+__version__ = "0.19.32"
 __is_release__ = True
-base_image = "0.10"
+base_image = "0.11rc2"
 base_image_ubuntu_version = "22.04"
{dstack-0.19.30rc1.dist-info → dstack-0.19.32.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dstack
-Version: 0.19.30rc1
+Version: 0.19.32
 Summary: dstack is an open-source orchestration engine for running AI workloads on any cloud or on-premises.
 Project-URL: Homepage, https://dstack.ai
 Project-URL: Source, https://github.com/dstackai/dstack
@@ -73,7 +73,7 @@ Requires-Dist: grpcio>=1.50; extra == 'all'
 Requires-Dist: httpx; extra == 'all'
 Requires-Dist: jinja2; extra == 'all'
 Requires-Dist: kubernetes; extra == 'all'
-Requires-Dist: nebius<0.3,>=0.2.40; (python_version >= '3.10') and extra == 'all'
+Requires-Dist: nebius<=0.2.72,>=0.2.40; (python_version >= '3.10') and extra == 'all'
 Requires-Dist: oci>=2.150.0; extra == 'all'
 Requires-Dist: prometheus-client; extra == 'all'
 Requires-Dist: pyopenssl>=23.2.0; extra == 'all'
@@ -259,7 +259,7 @@ Requires-Dist: fastapi; extra == 'nebius'
 Requires-Dist: grpcio>=1.50; extra == 'nebius'
 Requires-Dist: httpx; extra == 'nebius'
 Requires-Dist: jinja2; extra == 'nebius'
-Requires-Dist: nebius<0.3,>=0.2.40; (python_version >= '3.10') and extra == 'nebius'
+Requires-Dist: nebius<=0.2.72,>=0.2.40; (python_version >= '3.10') and extra == 'nebius'
 Requires-Dist: prometheus-client; extra == 'nebius'
 Requires-Dist: python-dxf==12.1.0; extra == 'nebius'
 Requires-Dist: python-json-logger>=3.1.0; extra == 'nebius'
@@ -331,24 +331,26 @@ Description-Content-Type: text/markdown
 
 </div>
 
-`dstack` is an open-source container orchestrator that simplifies workload orchestration and drives GPU utilization for ML teams. It works with any GPU cloud, on-prem cluster, or accelerated hardware.
+`dstack` is a unified control plane for GPU provisioning and orchestration that works with any GPU cloud, Kubernetes, or on-prem clusters.
 
-#### Accelerators
+It streamlines development, training, and inference, and is compatible with any hardware, open-source tools, and frameworks.
+
+#### Hardware
 
 `dstack` supports `NVIDIA`, `AMD`, `Google TPU`, `Intel Gaudi`, and `Tenstorrent` accelerators out of the box.
 
 ## Latest news ✨
+- [2025/10] [dstack 0.19.31: Kubernetes, GCP A4 spot](https://github.com/dstackai/dstack/releases/tag/0.19.31)
+- [2025/08] [dstack 0.19.26: Repos](https://github.com/dstackai/dstack/releases/tag/0.19.26)
+- [2025/08] [dstack 0.19.22: Service probes, GPU health-checks, Tenstorrent Galaxy](https://github.com/dstackai/dstack/releases/tag/0.19.22)
+- [2025/07] [dstack 0.19.21: Scheduled tasks](https://github.com/dstackai/dstack/releases/tag/0.19.21)
 - [2025/07] [dstack 0.19.17: Secrets, Files, Rolling deployment](https://github.com/dstackai/dstack/releases/tag/0.19.17)
-- [2025/06] [dstack 0.19.16: Docker in Docker, CloudRift](https://github.com/dstackai/dstack/releases/tag/0.19.16)
-- [2025/06] [dstack 0.19.13: InfiniBand support in default images](https://github.com/dstackai/dstack/releases/tag/0.19.13)
-- [2025/06] [dstack 0.19.12: Simplified use of MPI](https://github.com/dstackai/dstack/releases/tag/0.19.12)
-- [2025/05] [dstack 0.19.10: Priorities](https://github.com/dstackai/dstack/releases/tag/0.19.10)
-- [2025/05] [dstack 0.19.8: Nebius clusters, GH200 on Lambda](https://github.com/dstackai/dstack/releases/tag/0.19.8)
-- [2025/04] [dstack 0.19.6: Tenstorrent, Plugins](https://github.com/dstackai/dstack/releases/tag/0.19.6)
+- [2025/06] [dstack 0.19.16: Docker in Docker](https://github.com/dstackai/dstack/releases/tag/0.19.16)
+- [2025/06] [dstack 0.19.13: Default images with InfiniBand support](https://github.com/dstackai/dstack/releases/tag/0.19.13)
 
 ## How does it work?
 
-<img src="https://dstack.ai/static-assets/static-assets/images/dstack-architecture-diagram-v10.svg" width="750" />
+<img src="https://dstack.ai/static-assets/static-assets/images/dstack-architecture-diagram-v11.svg" width="750" />
 
 ### Installation
 
@@ -356,15 +358,15 @@
 
 #### Set up the server
 
-##### (Optional) Configure backends
+##### Configure backends
+
+To orchestrate compute across cloud providers or existing Kubernetes clusters, you need to configure backends.
 
-To use `dstack` with cloud providers, configure backends
-via the `~/.dstack/server/config.yml` file.
+Backends can be set up in `~/.dstack/server/config.yml` or through the [project settings page](https://dstack.ai/docs/concepts/projects#backends) in the UI.
 
-For more details on how to configure backends, check [Backends](https://dstack.ai/docs/concepts/backends).
+For more details, see [Backends](https://dstack.ai/docs/concepts/backends).
 
-> For using `dstack` with on-prem servers, create [SSH fleets](https://dstack.ai/docs/concepts/fleets#ssh)
-> once the server is up.
+> When using `dstack` with on-prem servers, backend configuration isn’t required. Simply create [SSH fleets](https://dstack.ai/docs/concepts/fleets#ssh) once the server is up.
 
 ##### Start the server