dstack 0.19.30rc1__py3-none-any.whl → 0.19.31__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of dstack might be problematic; see the package registry's advisory page for more details.

Files changed (47)
  1. dstack/_internal/cli/commands/__init__.py +8 -0
  2. dstack/_internal/cli/commands/project.py +27 -20
  3. dstack/_internal/cli/commands/server.py +5 -0
  4. dstack/_internal/cli/services/configurators/fleet.py +20 -6
  5. dstack/_internal/cli/utils/gpu.py +2 -2
  6. dstack/_internal/core/backends/aws/compute.py +13 -5
  7. dstack/_internal/core/backends/aws/resources.py +11 -6
  8. dstack/_internal/core/backends/azure/compute.py +17 -6
  9. dstack/_internal/core/backends/base/compute.py +57 -9
  10. dstack/_internal/core/backends/base/offers.py +1 -0
  11. dstack/_internal/core/backends/cloudrift/compute.py +2 -0
  12. dstack/_internal/core/backends/cudo/compute.py +2 -0
  13. dstack/_internal/core/backends/datacrunch/compute.py +2 -0
  14. dstack/_internal/core/backends/digitalocean_base/compute.py +2 -0
  15. dstack/_internal/core/backends/features.py +5 -0
  16. dstack/_internal/core/backends/gcp/compute.py +87 -38
  17. dstack/_internal/core/backends/gcp/configurator.py +1 -1
  18. dstack/_internal/core/backends/gcp/models.py +14 -1
  19. dstack/_internal/core/backends/gcp/resources.py +35 -12
  20. dstack/_internal/core/backends/hotaisle/compute.py +2 -0
  21. dstack/_internal/core/backends/kubernetes/compute.py +466 -213
  22. dstack/_internal/core/backends/kubernetes/models.py +13 -16
  23. dstack/_internal/core/backends/kubernetes/utils.py +145 -8
  24. dstack/_internal/core/backends/lambdalabs/compute.py +2 -0
  25. dstack/_internal/core/backends/local/compute.py +2 -0
  26. dstack/_internal/core/backends/nebius/compute.py +2 -0
  27. dstack/_internal/core/backends/oci/compute.py +7 -1
  28. dstack/_internal/core/backends/oci/resources.py +8 -3
  29. dstack/_internal/core/backends/template/compute.py.jinja +2 -0
  30. dstack/_internal/core/backends/tensordock/compute.py +2 -0
  31. dstack/_internal/core/backends/vultr/compute.py +2 -0
  32. dstack/_internal/core/consts.py +2 -0
  33. dstack/_internal/core/services/repos.py +101 -11
  34. dstack/_internal/server/background/tasks/common.py +2 -0
  35. dstack/_internal/server/background/tasks/process_instances.py +2 -2
  36. dstack/_internal/server/background/tasks/process_running_jobs.py +1 -1
  37. dstack/_internal/server/background/tasks/process_submitted_jobs.py +51 -41
  38. dstack/_internal/server/services/offers.py +7 -1
  39. dstack/_internal/server/testing/common.py +2 -0
  40. dstack/_internal/server/utils/provisioning.py +3 -10
  41. dstack/_internal/utils/ssh.py +22 -2
  42. dstack/version.py +2 -2
  43. {dstack-0.19.30rc1.dist-info → dstack-0.19.31.dist-info}/METADATA +17 -13
  44. {dstack-0.19.30rc1.dist-info → dstack-0.19.31.dist-info}/RECORD +47 -47
  45. {dstack-0.19.30rc1.dist-info → dstack-0.19.31.dist-info}/WHEEL +0 -0
  46. {dstack-0.19.30rc1.dist-info → dstack-0.19.31.dist-info}/entry_points.txt +0 -0
  47. {dstack-0.19.30rc1.dist-info → dstack-0.19.31.dist-info}/licenses/LICENSE.md +0 -0
@@ -3,7 +3,7 @@ import itertools
3
3
  import math
4
4
  import uuid
5
5
  from datetime import datetime, timedelta
6
- from typing import List, Optional, Tuple
6
+ from typing import List, Optional
7
7
 
8
8
  from sqlalchemy import and_, func, not_, or_, select
9
9
  from sqlalchemy.ext.asyncio import AsyncSession
@@ -25,6 +25,7 @@ from dstack._internal.core.models.instances import InstanceOfferWithAvailability
25
25
  from dstack._internal.core.models.profiles import (
26
26
  DEFAULT_RUN_TERMINATION_IDLE_TIME,
27
27
  CreationPolicy,
28
+ Profile,
28
29
  TerminationPolicy,
29
30
  )
30
31
  from dstack._internal.core.models.resources import Memory
@@ -34,6 +35,7 @@ from dstack._internal.core.models.runs import (
34
35
  JobRuntimeData,
35
36
  JobStatus,
36
37
  JobTerminationReason,
38
+ Requirements,
37
39
  Run,
38
40
  RunSpec,
39
41
  )
@@ -186,7 +188,7 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
186
188
  project = run_model.project
187
189
  run = run_model_to_run(run_model)
188
190
  run_spec = run.run_spec
189
- profile = run_spec.merged_profile
191
+ run_profile = run_spec.merged_profile
190
192
  job = find_job(run.jobs, job_model.replica_num, job_model.job_num)
191
193
  multinode = job.job_spec.jobs_per_replica > 1
192
194
 
@@ -333,7 +335,7 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
333
335
  job_model.status = JobStatus.PROVISIONING
334
336
  else:
335
337
  # Assigned no instance, create a new one
336
- if profile.creation_policy == CreationPolicy.REUSE:
338
+ if run_profile.creation_policy == CreationPolicy.REUSE:
337
339
  logger.debug("%s: reuse instance failed", fmt(job_model))
338
340
  job_model.status = JobStatus.TERMINATING
339
341
  job_model.termination_reason = JobTerminationReason.FAILED_TO_START_DUE_TO_NO_CAPACITY
@@ -362,7 +364,7 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
362
364
  return
363
365
 
364
366
  logger.info("%s: now is provisioning a new instance", fmt(job_model))
365
- job_provisioning_data, offer = run_job_result
367
+ job_provisioning_data, offer, effective_profile, _ = run_job_result
366
368
  job_model.job_provisioning_data = job_provisioning_data.json()
367
369
  job_model.status = JobStatus.PROVISIONING
368
370
  if fleet_model is None:
@@ -382,12 +384,11 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
382
384
  instance = _create_instance_model_for_job(
383
385
  project=project,
384
386
  fleet_model=fleet_model,
385
- run_spec=run_spec,
386
387
  job_model=job_model,
387
- job=job,
388
388
  job_provisioning_data=job_provisioning_data,
389
389
  offer=offer,
390
390
  instance_num=instance_num,
391
+ profile=effective_profile,
391
392
  )
392
393
  job_model.job_runtime_data = _prepare_job_runtime_data(offer, multinode).json()
393
394
  # Both this task and process_fleets can add instances to fleets.
@@ -546,23 +547,22 @@ async def _find_optimal_fleet_with_offers(
546
547
  fleet_cheapest_pool_offer = fleet_instances_with_pool_offers[0][1].price
547
548
 
548
549
  candidate_fleet = fleet_model_to_fleet(candidate_fleet_model)
549
- profile = combine_fleet_and_run_profiles(
550
- candidate_fleet.spec.merged_profile, run_spec.merged_profile
551
- )
552
- fleet_requirements = get_fleet_requirements(candidate_fleet.spec)
553
- requirements = combine_fleet_and_run_requirements(
554
- fleet_requirements, job.job_spec.requirements
555
- )
556
- multinode = (
557
- candidate_fleet.spec.configuration.placement == InstanceGroupPlacement.CLUSTER
558
- or job.job_spec.jobs_per_replica > 1
559
- )
550
+ profile = None
551
+ requirements = None
552
+ try:
553
+ profile, requirements = _get_run_profile_and_requirements_in_fleet(
554
+ job=job,
555
+ run_spec=run_spec,
556
+ fleet=candidate_fleet,
557
+ )
558
+ except ValueError:
559
+ pass
560
560
  fleet_backend_offers = []
561
- if (
562
- _check_can_create_new_instance_in_fleet(candidate_fleet)
563
- and profile is not None
564
- and requirements is not None
565
- ):
561
+ if profile is not None and requirements is not None:
562
+ multinode = (
563
+ candidate_fleet.spec.configuration.placement == InstanceGroupPlacement.CLUSTER
564
+ or job.job_spec.jobs_per_replica > 1
565
+ )
566
566
  fleet_backend_offers = await get_offers_by_requirements(
567
567
  project=project,
568
568
  profile=profile,
@@ -704,7 +704,7 @@ async def _run_job_on_new_instance(
704
704
  master_job_provisioning_data: Optional[JobProvisioningData] = None,
705
705
  volumes: Optional[List[List[Volume]]] = None,
706
706
  fleet_model: Optional[FleetModel] = None,
707
- ) -> Optional[Tuple[JobProvisioningData, InstanceOfferWithAvailability]]:
707
+ ) -> Optional[tuple[JobProvisioningData, InstanceOfferWithAvailability, Profile, Requirements]]:
708
708
  if volumes is None:
709
709
  volumes = []
710
710
  profile = run.run_spec.merged_profile
@@ -712,21 +712,14 @@ async def _run_job_on_new_instance(
712
712
  fleet = None
713
713
  if fleet_model is not None:
714
714
  fleet = fleet_model_to_fleet(fleet_model)
715
- if not _check_can_create_new_instance_in_fleet(fleet):
716
- logger.debug(
717
- "%s: cannot fit new instance into fleet %s", fmt(job_model), fleet_model.name
718
- )
719
- return None
720
- profile = combine_fleet_and_run_profiles(fleet.spec.merged_profile, profile)
721
- if profile is None:
722
- logger.debug("%s: cannot combine fleet %s profile", fmt(job_model), fleet_model.name)
723
- return None
724
- fleet_requirements = get_fleet_requirements(fleet.spec)
725
- requirements = combine_fleet_and_run_requirements(fleet_requirements, requirements)
726
- if requirements is None:
727
- logger.debug(
728
- "%s: cannot combine fleet %s requirements", fmt(job_model), fleet_model.name
715
+ try:
716
+ profile, requirements = _get_run_profile_and_requirements_in_fleet(
717
+ job=job,
718
+ run_spec=run.run_spec,
719
+ fleet=fleet,
729
720
  )
721
+ except ValueError as e:
722
+ logger.debug("%s: %s", fmt(job_model), e.args[0])
730
723
  return None
731
724
  # TODO: Respect fleet provisioning properties such as tags
732
725
 
@@ -766,7 +759,7 @@ async def _run_job_on_new_instance(
766
759
  project_ssh_private_key,
767
760
  offer_volumes,
768
761
  )
769
- return job_provisioning_data, offer
762
+ return job_provisioning_data, offer, profile, requirements
770
763
  except BackendError as e:
771
764
  logger.warning(
772
765
  "%s: %s launch in %s/%s failed: %s",
@@ -789,6 +782,25 @@ async def _run_job_on_new_instance(
789
782
  return None
790
783
 
791
784
 
785
+ def _get_run_profile_and_requirements_in_fleet(
786
+ job: Job,
787
+ run_spec: RunSpec,
788
+ fleet: Fleet,
789
+ ) -> tuple[Profile, Requirements]:
790
+ if not _check_can_create_new_instance_in_fleet(fleet):
791
+ raise ValueError("Cannot fit new instance into fleet")
792
+ profile = combine_fleet_and_run_profiles(fleet.spec.merged_profile, run_spec.merged_profile)
793
+ if profile is None:
794
+ raise ValueError("Cannot combine fleet profile")
795
+ fleet_requirements = get_fleet_requirements(fleet.spec)
796
+ requirements = combine_fleet_and_run_requirements(
797
+ fleet_requirements, job.job_spec.requirements
798
+ )
799
+ if requirements is None:
800
+ raise ValueError("Cannot combine fleet requirements")
801
+ return profile, requirements
802
+
803
+
792
804
  def _check_can_create_new_instance_in_fleet(fleet: Fleet) -> bool:
793
805
  if fleet.spec.configuration.ssh_config is not None:
794
806
  return False
@@ -857,14 +869,12 @@ async def _get_next_instance_num(session: AsyncSession, fleet_model: FleetModel)
857
869
  def _create_instance_model_for_job(
858
870
  project: ProjectModel,
859
871
  fleet_model: FleetModel,
860
- run_spec: RunSpec,
861
872
  job_model: JobModel,
862
- job: Job,
863
873
  job_provisioning_data: JobProvisioningData,
864
874
  offer: InstanceOfferWithAvailability,
865
875
  instance_num: int,
876
+ profile: Profile,
866
877
  ) -> InstanceModel:
867
- profile = run_spec.merged_profile
868
878
  if not job_provisioning_data.dockerized:
869
879
  # terminate vastai/k8s instances immediately
870
880
  termination_policy = TerminationPolicy.DESTROY_AFTER_IDLE
@@ -7,6 +7,7 @@ from dstack._internal.core.backends.base.compute import ComputeWithPlacementGrou
7
7
  from dstack._internal.core.backends.features import (
8
8
  BACKENDS_WITH_CREATE_INSTANCE_SUPPORT,
9
9
  BACKENDS_WITH_MULTINODE_SUPPORT,
10
+ BACKENDS_WITH_PRIVILEGED_SUPPORT,
10
11
  BACKENDS_WITH_RESERVATION_SUPPORT,
11
12
  )
12
13
  from dstack._internal.core.models.backends.base import BackendType
@@ -67,7 +68,12 @@ async def get_offers_by_requirements(
67
68
  backend_types = BACKENDS_WITH_MULTINODE_SUPPORT
68
69
  backend_types = [b for b in backend_types if b in BACKENDS_WITH_MULTINODE_SUPPORT]
69
70
 
70
- if privileged or instance_mounts:
71
+ if privileged:
72
+ if backend_types is None:
73
+ backend_types = BACKENDS_WITH_PRIVILEGED_SUPPORT
74
+ backend_types = [b for b in backend_types if b in BACKENDS_WITH_PRIVILEGED_SUPPORT]
75
+
76
+ if instance_mounts:
71
77
  if backend_types is None:
72
78
  backend_types = BACKENDS_WITH_CREATE_INSTANCE_SUPPORT
73
79
  backend_types = [b for b in backend_types if b in BACKENDS_WITH_CREATE_INSTANCE_SUPPORT]
@@ -16,6 +16,7 @@ from dstack._internal.core.backends.base.compute import (
16
16
  ComputeWithMultinodeSupport,
17
17
  ComputeWithPlacementGroupSupport,
18
18
  ComputeWithPrivateGatewaySupport,
19
+ ComputeWithPrivilegedSupport,
19
20
  ComputeWithReservationSupport,
20
21
  ComputeWithVolumeSupport,
21
22
  )
@@ -1131,6 +1132,7 @@ class AsyncContextManager:
1131
1132
  class ComputeMockSpec(
1132
1133
  Compute,
1133
1134
  ComputeWithCreateInstanceSupport,
1135
+ ComputeWithPrivilegedSupport,
1134
1136
  ComputeWithMultinodeSupport,
1135
1137
  ComputeWithReservationSupport,
1136
1138
  ComputeWithPlacementGroupSupport,
@@ -6,7 +6,7 @@ from textwrap import dedent
6
6
  from typing import Any, Dict, Generator, List, Optional
7
7
 
8
8
  import paramiko
9
- from gpuhunt import AcceleratorVendor, CPUArchitecture, correct_gpu_memory_gib
9
+ from gpuhunt import AcceleratorVendor, correct_gpu_memory_gib
10
10
 
11
11
  from dstack._internal.core.backends.base.compute import GoArchType, normalize_arch
12
12
  from dstack._internal.core.consts import DSTACK_SHIM_HTTP_PORT
@@ -248,14 +248,7 @@ def _get_shim_healthcheck(client: paramiko.SSHClient) -> Optional[str]:
248
248
  return out
249
249
 
250
250
 
251
- def host_info_to_instance_type(host_info: Dict[str, Any], cpu_arch: GoArchType) -> InstanceType:
252
- _cpu_arch: CPUArchitecture
253
- if cpu_arch == "amd64":
254
- _cpu_arch = CPUArchitecture.X86
255
- elif cpu_arch == "arm64":
256
- _cpu_arch = CPUArchitecture.ARM
257
- else:
258
- raise ValueError(f"Unexpected cpu_arch: {cpu_arch}")
251
+ def host_info_to_instance_type(host_info: Dict[str, Any], arch: GoArchType) -> InstanceType:
259
252
  gpu_count = host_info.get("gpu_count", 0)
260
253
  if gpu_count > 0:
261
254
  gpu_vendor = AcceleratorVendor.cast(host_info.get("gpu_vendor", "nvidia"))
@@ -280,7 +273,7 @@ def host_info_to_instance_type(host_info: Dict[str, Any], cpu_arch: GoArchType)
280
273
  instance_type = InstanceType(
281
274
  name="instance",
282
275
  resources=Resources(
283
- cpu_arch=_cpu_arch,
276
+ cpu_arch=arch.to_cpu_architecture(),
284
277
  cpus=host_info["cpus"],
285
278
  memory_mib=host_info["memory"] / 1024 / 1024,
286
279
  spot=False,
@@ -50,8 +50,28 @@ def make_ssh_command_for_git(identity_file: PathLike) -> str:
50
50
  )
51
51
 
52
52
 
53
- def make_git_env(*, identity_file: Optional[PathLike] = None) -> dict[str, str]:
54
- env: dict[str, str] = {"GIT_TERMINAL_PROMPT": "0"}
53
+ def make_git_env(
54
+ *,
55
+ disable_prompt: bool = True,
56
+ disable_askpass: bool = False,
57
+ disable_config: bool = False,
58
+ identity_file: Optional[PathLike] = None,
59
+ ) -> dict[str, str]:
60
+ env: dict[str, str] = {}
61
+ if disable_prompt:
62
+ # Fail with error instead of prompting on the terminal (e.g., when asking for
63
+ # HTTP authentication)
64
+ env["GIT_TERMINAL_PROMPT"] = "0"
65
+ if disable_askpass:
66
+ env["GIT_ASKPASS"] = ""
67
+ env["SSH_ASKPASS"] = ""
68
+ if disable_config:
69
+ # Disable system-wide config (usually /etc/gitconfig)
70
+ env["GIT_CONFIG_SYSTEM"] = os.devnull
71
+ # Disable user (aka "global") config ($XDG_CONFIG_HOME/git/config or ~/.git/config)
72
+ env["GIT_CONFIG_GLOBAL"] = os.devnull
73
+ # Disable repo (aka "local") config (./.git/config)
74
+ env["GIT_DIR"] = os.devnull
55
75
  if identity_file is not None:
56
76
  env["GIT_SSH_COMMAND"] = make_ssh_command_for_git(identity_file)
57
77
  return env
dstack/version.py CHANGED
@@ -1,4 +1,4 @@
1
- __version__ = "0.19.30rc1"
1
+ __version__ = "0.19.31"
2
2
  __is_release__ = True
3
- base_image = "0.10"
3
+ base_image = "0.11rc2"
4
4
  base_image_ubuntu_version = "22.04"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dstack
3
- Version: 0.19.30rc1
3
+ Version: 0.19.31
4
4
  Summary: dstack is an open-source orchestration engine for running AI workloads on any cloud or on-premises.
5
5
  Project-URL: Homepage, https://dstack.ai
6
6
  Project-URL: Source, https://github.com/dstackai/dstack
@@ -331,24 +331,28 @@ Description-Content-Type: text/markdown
331
331
 
332
332
  </div>
333
333
 
334
- `dstack` is an open-source container orchestrator that simplifies workload orchestration and drives GPU utilization for ML teams. It works with any GPU cloud, on-prem cluster, or accelerated hardware.
334
+ `dstack` is a unified control plane for GPU provisioning and orchestration that works with any GPU cloud, Kubernetes, or on-prem clusters.
335
335
 
336
- #### Accelerators
336
+ It streamlines development, training, and inference, and is compatible with any hardware, open-source tools, and frameworks.
337
+
338
+ #### Hardware
337
339
 
338
340
  `dstack` supports `NVIDIA`, `AMD`, `Google TPU`, `Intel Gaudi`, and `Tenstorrent` accelerators out of the box.
339
341
 
340
342
  ## Latest news ✨
343
+ - [2025/09] [dstack 0.19.27: Offers UI, Digital Ocean and AMD Developer Cloud](https://github.com/dstackai/dstack/releases/tag/0.19.27)
344
+ - [2025/08] [dstack 0.19.26: Repos – explicit repo configuration via YAML](https://github.com/dstackai/dstack/releases/tag/0.19.26)
345
+ - [2025/08] [dstack 0.19.25: `dstack offer` CLI command](https://github.com/dstackai/dstack/releases/tag/0.19.25)
346
+ - [2025/08] [dstack 0.19.22: Service probes, GPU health-checks, Tenstorrent Galaxy, Secrets UI](https://github.com/dstackai/dstack/releases/tag/0.19.22)
347
+ - [2025/07] [dstack 0.19.21: Scheduled tasks](https://github.com/dstackai/dstack/releases/tag/0.19.21)
341
348
  - [2025/07] [dstack 0.19.17: Secrets, Files, Rolling deployment](https://github.com/dstackai/dstack/releases/tag/0.19.17)
342
349
  - [2025/06] [dstack 0.19.16: Docker in Docker, CloudRift](https://github.com/dstackai/dstack/releases/tag/0.19.16)
343
350
  - [2025/06] [dstack 0.19.13: InfiniBand support in default images](https://github.com/dstackai/dstack/releases/tag/0.19.13)
344
351
  - [2025/06] [dstack 0.19.12: Simplified use of MPI](https://github.com/dstackai/dstack/releases/tag/0.19.12)
345
- - [2025/05] [dstack 0.19.10: Priorities](https://github.com/dstackai/dstack/releases/tag/0.19.10)
346
- - [2025/05] [dstack 0.19.8: Nebius clusters, GH200 on Lambda](https://github.com/dstackai/dstack/releases/tag/0.19.8)
347
- - [2025/04] [dstack 0.19.6: Tenstorrent, Plugins](https://github.com/dstackai/dstack/releases/tag/0.19.6)
348
352
 
349
353
  ## How does it work?
350
354
 
351
- <img src="https://dstack.ai/static-assets/static-assets/images/dstack-architecture-diagram-v10.svg" width="750" />
355
+ <img src="https://dstack.ai/static-assets/static-assets/images/dstack-architecture-diagram-v11.svg" width="750" />
352
356
 
353
357
  ### Installation
354
358
 
@@ -356,15 +360,15 @@ Description-Content-Type: text/markdown
356
360
 
357
361
  #### Set up the server
358
362
 
359
- ##### (Optional) Configure backends
363
+ ##### Configure backends
364
+
365
+ To orchestrate compute across cloud providers or existing Kubernetes clusters, you need to configure backends.
360
366
 
361
- To use `dstack` with cloud providers, configure backends
362
- via the `~/.dstack/server/config.yml` file.
367
+ Backends can be set up in `~/.dstack/server/config.yml` or through the [project settings page](../concepts/projects.md#backends) in the UI.
363
368
 
364
- For more details on how to configure backends, check [Backends](https://dstack.ai/docs/concepts/backends).
369
+ For more details, see [Backends](../concepts/backends.md).
365
370
 
366
- > For using `dstack` with on-prem servers, create [SSH fleets](https://dstack.ai/docs/concepts/fleets#ssh)
367
- > once the server is up.
371
+ > When using `dstack` with on-prem servers, backend configuration isn’t required. Simply create [SSH fleets](../concepts/fleets.md#ssh) once the server is up.
368
372
 
369
373
  ##### Start the server
370
374