dstack 0.19.7__py3-none-any.whl → 0.19.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dstack might be problematic. Click here for more details.
- dstack/_internal/cli/services/args.py +2 -2
- dstack/_internal/cli/services/configurators/run.py +56 -13
- dstack/_internal/cli/utils/run.py +10 -5
- dstack/_internal/core/backends/aws/compute.py +13 -1
- dstack/_internal/core/backends/azure/compute.py +42 -13
- dstack/_internal/core/backends/azure/configurator.py +21 -0
- dstack/_internal/core/backends/azure/models.py +9 -0
- dstack/_internal/core/backends/base/compute.py +101 -27
- dstack/_internal/core/backends/base/offers.py +13 -3
- dstack/_internal/core/backends/cudo/compute.py +3 -1
- dstack/_internal/core/backends/datacrunch/compute.py +2 -0
- dstack/_internal/core/backends/gcp/auth.py +1 -1
- dstack/_internal/core/backends/gcp/compute.py +51 -35
- dstack/_internal/core/backends/lambdalabs/compute.py +20 -8
- dstack/_internal/core/backends/local/compute.py +2 -0
- dstack/_internal/core/backends/nebius/compute.py +95 -1
- dstack/_internal/core/backends/nebius/configurator.py +11 -0
- dstack/_internal/core/backends/nebius/fabrics.py +48 -0
- dstack/_internal/core/backends/nebius/models.py +9 -1
- dstack/_internal/core/backends/nebius/resources.py +29 -0
- dstack/_internal/core/backends/oci/compute.py +2 -0
- dstack/_internal/core/backends/remote/provisioning.py +27 -2
- dstack/_internal/core/backends/template/compute.py.jinja +2 -0
- dstack/_internal/core/backends/tensordock/compute.py +2 -0
- dstack/_internal/core/backends/vultr/compute.py +5 -1
- dstack/_internal/core/models/instances.py +2 -1
- dstack/_internal/core/models/resources.py +79 -4
- dstack/_internal/core/models/runs.py +26 -9
- dstack/_internal/core/models/volumes.py +1 -1
- dstack/_internal/server/background/tasks/process_fleets.py +4 -13
- dstack/_internal/server/background/tasks/process_instances.py +176 -55
- dstack/_internal/server/background/tasks/process_metrics.py +26 -9
- dstack/_internal/server/background/tasks/process_placement_groups.py +1 -1
- dstack/_internal/server/background/tasks/process_prometheus_metrics.py +5 -2
- dstack/_internal/server/background/tasks/process_running_jobs.py +56 -18
- dstack/_internal/server/migrations/versions/20166748b60c_add_jobmodel_disconnected_at.py +100 -0
- dstack/_internal/server/migrations/versions/6c1a9d6530ee_add_jobmodel_exit_status.py +26 -0
- dstack/_internal/server/models.py +6 -1
- dstack/_internal/server/schemas/runner.py +41 -8
- dstack/_internal/server/services/fleets.py +9 -26
- dstack/_internal/server/services/instances.py +0 -2
- dstack/_internal/server/services/jobs/__init__.py +1 -0
- dstack/_internal/server/services/offers.py +15 -0
- dstack/_internal/server/services/placement.py +27 -6
- dstack/_internal/server/services/resources.py +21 -0
- dstack/_internal/server/services/runner/client.py +7 -4
- dstack/_internal/server/services/runs.py +18 -8
- dstack/_internal/server/settings.py +20 -1
- dstack/_internal/server/testing/common.py +37 -26
- dstack/_internal/utils/common.py +13 -1
- dstack/_internal/utils/json_schema.py +6 -3
- dstack/api/__init__.py +1 -0
- dstack/api/server/_fleets.py +16 -0
- dstack/api/server/_runs.py +48 -3
- dstack/version.py +1 -1
- {dstack-0.19.7.dist-info → dstack-0.19.9.dist-info}/METADATA +38 -29
- {dstack-0.19.7.dist-info → dstack-0.19.9.dist-info}/RECORD +60 -56
- {dstack-0.19.7.dist-info → dstack-0.19.9.dist-info}/WHEEL +0 -0
- {dstack-0.19.7.dist-info → dstack-0.19.9.dist-info}/entry_points.txt +0 -0
- {dstack-0.19.7.dist-info → dstack-0.19.9.dist-info}/licenses/LICENSE.md +0 -0
|
dstack/_internal/server/testing/common.py
CHANGED
|
@@ -2,7 +2,7 @@ import json
|
|
|
2
2
|
import uuid
|
|
3
3
|
from contextlib import contextmanager
|
|
4
4
|
from datetime import datetime, timezone
|
|
5
|
-
from typing import Dict, List, Optional, Union
|
|
5
|
+
from typing import Dict, List, Literal, Optional, Union
|
|
6
6
|
from uuid import UUID
|
|
7
7
|
|
|
8
8
|
import gpuhunt
|
|
@@ -25,7 +25,12 @@ from dstack._internal.core.models.configurations import (
|
|
|
25
25
|
DevEnvironmentConfiguration,
|
|
26
26
|
)
|
|
27
27
|
from dstack._internal.core.models.envs import Env
|
|
28
|
-
from dstack._internal.core.models.fleets import
|
|
28
|
+
from dstack._internal.core.models.fleets import (
|
|
29
|
+
FleetConfiguration,
|
|
30
|
+
FleetSpec,
|
|
31
|
+
FleetStatus,
|
|
32
|
+
InstanceGroupPlacement,
|
|
33
|
+
)
|
|
29
34
|
from dstack._internal.core.models.gateways import GatewayComputeConfiguration, GatewayStatus
|
|
30
35
|
from dstack._internal.core.models.instances import (
|
|
31
36
|
Disk,
|
|
@@ -51,7 +56,7 @@ from dstack._internal.core.models.profiles import (
|
|
|
51
56
|
)
|
|
52
57
|
from dstack._internal.core.models.repos.base import RepoType
|
|
53
58
|
from dstack._internal.core.models.repos.local import LocalRunRepoData
|
|
54
|
-
from dstack._internal.core.models.resources import Memory, Range, ResourcesSpec
|
|
59
|
+
from dstack._internal.core.models.resources import CPUSpec, Memory, Range, ResourcesSpec
|
|
55
60
|
from dstack._internal.core.models.runs import (
|
|
56
61
|
JobProvisioningData,
|
|
57
62
|
JobRuntimeData,
|
|
@@ -297,6 +302,7 @@ async def create_job(
|
|
|
297
302
|
job_num: int = 0,
|
|
298
303
|
replica_num: int = 0,
|
|
299
304
|
instance_assigned: bool = False,
|
|
305
|
+
disconnected_at: Optional[datetime] = None,
|
|
300
306
|
) -> JobModel:
|
|
301
307
|
run_spec = RunSpec.parse_raw(run.run_spec)
|
|
302
308
|
job_spec = (await get_job_specs_from_run_spec(run_spec, replica_num=replica_num))[0]
|
|
@@ -318,6 +324,7 @@ async def create_job(
|
|
|
318
324
|
instance=instance,
|
|
319
325
|
instance_assigned=instance_assigned,
|
|
320
326
|
used_instance_id=instance.id if instance is not None else None,
|
|
327
|
+
disconnected_at=disconnected_at,
|
|
321
328
|
)
|
|
322
329
|
session.add(job)
|
|
323
330
|
await session.commit()
|
|
@@ -497,10 +504,12 @@ def get_fleet_spec(conf: Optional[FleetConfiguration] = None) -> FleetSpec:
|
|
|
497
504
|
def get_fleet_configuration(
|
|
498
505
|
name: str = "test-fleet",
|
|
499
506
|
nodes: Range[int] = Range(min=1, max=1),
|
|
507
|
+
placement: Optional[InstanceGroupPlacement] = None,
|
|
500
508
|
) -> FleetConfiguration:
|
|
501
509
|
return FleetConfiguration(
|
|
502
510
|
name=name,
|
|
503
511
|
nodes=nodes,
|
|
512
|
+
placement=placement,
|
|
504
513
|
)
|
|
505
514
|
|
|
506
515
|
|
|
@@ -519,13 +528,13 @@ async def create_instance(
|
|
|
519
528
|
instance_id: Optional[UUID] = None,
|
|
520
529
|
job: Optional[JobModel] = None,
|
|
521
530
|
instance_num: int = 0,
|
|
522
|
-
backend: BackendType = BackendType.DATACRUNCH,
|
|
531
|
+
backend: Optional[BackendType] = BackendType.DATACRUNCH,
|
|
523
532
|
termination_policy: Optional[TerminationPolicy] = None,
|
|
524
533
|
termination_idle_time: int = DEFAULT_FLEET_TERMINATION_IDLE_TIME,
|
|
525
|
-
region: str = "eu-west",
|
|
534
|
+
region: Optional[str] = "eu-west",
|
|
526
535
|
remote_connection_info: Optional[RemoteConnectionInfo] = None,
|
|
527
|
-
offer: Optional[InstanceOfferWithAvailability] = None,
|
|
528
|
-
job_provisioning_data: Optional[JobProvisioningData] = None,
|
|
536
|
+
offer: Optional[Union[InstanceOfferWithAvailability, Literal["auto"]]] = "auto",
|
|
537
|
+
job_provisioning_data: Optional[Union[JobProvisioningData, Literal["auto"]]] = "auto",
|
|
529
538
|
total_blocks: Optional[int] = 1,
|
|
530
539
|
busy_blocks: int = 0,
|
|
531
540
|
name: str = "test_instance",
|
|
@@ -534,7 +543,7 @@ async def create_instance(
|
|
|
534
543
|
) -> InstanceModel:
|
|
535
544
|
if instance_id is None:
|
|
536
545
|
instance_id = uuid.uuid4()
|
|
537
|
-
if job_provisioning_data is None:
|
|
546
|
+
if job_provisioning_data == "auto":
|
|
538
547
|
job_provisioning_data = get_job_provisioning_data(
|
|
539
548
|
dockerized=True,
|
|
540
549
|
backend=backend,
|
|
@@ -543,13 +552,13 @@ async def create_instance(
|
|
|
543
552
|
hostname="running_instance.ip",
|
|
544
553
|
internal_ip=None,
|
|
545
554
|
)
|
|
546
|
-
if offer is None:
|
|
555
|
+
if offer == "auto":
|
|
547
556
|
offer = get_instance_offer_with_availability(backend=backend, region=region, spot=spot)
|
|
548
557
|
if profile is None:
|
|
549
558
|
profile = Profile(name="test_name")
|
|
550
559
|
|
|
551
560
|
if requirements is None:
|
|
552
|
-
requirements = Requirements(resources=ResourcesSpec(cpu=1))
|
|
561
|
+
requirements = Requirements(resources=ResourcesSpec(cpu=CPUSpec.parse("1")))
|
|
553
562
|
|
|
554
563
|
if instance_configuration is None:
|
|
555
564
|
instance_configuration = get_instance_configuration()
|
|
@@ -571,8 +580,8 @@ async def create_instance(
|
|
|
571
580
|
created_at=created_at,
|
|
572
581
|
started_at=created_at,
|
|
573
582
|
finished_at=finished_at,
|
|
574
|
-
job_provisioning_data=job_provisioning_data.json(),
|
|
575
|
-
offer=offer.json(),
|
|
583
|
+
job_provisioning_data=job_provisioning_data.json() if job_provisioning_data else None,
|
|
584
|
+
offer=offer.json() if offer else None,
|
|
576
585
|
price=price,
|
|
577
586
|
region=region,
|
|
578
587
|
backend=backend,
|
|
@@ -659,20 +668,7 @@ def get_remote_connection_info(
|
|
|
659
668
|
env: Optional[Union[Env, dict]] = None,
|
|
660
669
|
):
|
|
661
670
|
if ssh_keys is None:
|
|
662
|
-
ssh_keys = [
|
|
663
|
-
SSHKey(
|
|
664
|
-
public="ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIO6mJxVbNtm0zXgMLvByrhXJCmJRveSrJxLB5/OzcyCk",
|
|
665
|
-
private="""
|
|
666
|
-
-----BEGIN OPENSSH PRIVATE KEY-----
|
|
667
|
-
b3BlbnNzaC1rZXktdjEAAAAABG5vbmUAAAAEbm9uZQAAAAAAAAABAAAAMwAAAAtzc2gtZW
|
|
668
|
-
QyNTUxOQAAACDupicVWzbZtM14DC7wcq4VyQpiUb3kqycSwefzs3MgpAAAAJCiWa5Volmu
|
|
669
|
-
VQAAAAtzc2gtZWQyNTUxOQAAACDupicVWzbZtM14DC7wcq4VyQpiUb3kqycSwefzs3MgpA
|
|
670
|
-
AAAEAncHi4AhS6XdMp5Gzd+IMse/4ekyQ54UngByf0Sp0uH+6mJxVbNtm0zXgMLvByrhXJ
|
|
671
|
-
CmJRveSrJxLB5/OzcyCkAAAACWRlZkBkZWZwYwECAwQ=
|
|
672
|
-
-----END OPENSSH PRIVATE KEY-----
|
|
673
|
-
""",
|
|
674
|
-
)
|
|
675
|
-
]
|
|
671
|
+
ssh_keys = [get_ssh_key()]
|
|
676
672
|
if env is None:
|
|
677
673
|
env = Env()
|
|
678
674
|
elif isinstance(env, dict):
|
|
@@ -686,6 +682,21 @@ def get_remote_connection_info(
|
|
|
686
682
|
)
|
|
687
683
|
|
|
688
684
|
|
|
685
|
+
def get_ssh_key() -> SSHKey:
|
|
686
|
+
return SSHKey(
|
|
687
|
+
public="ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIO6mJxVbNtm0zXgMLvByrhXJCmJRveSrJxLB5/OzcyCk",
|
|
688
|
+
private="""
|
|
689
|
+
-----BEGIN OPENSSH PRIVATE KEY-----
|
|
690
|
+
b3BlbnNzaC1rZXktdjEAAAAABG5vbmUAAAAEbm9uZQAAAAAAAAABAAAAMwAAAAtzc2gtZW
|
|
691
|
+
QyNTUxOQAAACDupicVWzbZtM14DC7wcq4VyQpiUb3kqycSwefzs3MgpAAAAJCiWa5Volmu
|
|
692
|
+
VQAAAAtzc2gtZWQyNTUxOQAAACDupicVWzbZtM14DC7wcq4VyQpiUb3kqycSwefzs3MgpA
|
|
693
|
+
AAAEAncHi4AhS6XdMp5Gzd+IMse/4ekyQ54UngByf0Sp0uH+6mJxVbNtm0zXgMLvByrhXJ
|
|
694
|
+
CmJRveSrJxLB5/OzcyCkAAAACWRlZkBkZWZwYwECAwQ=
|
|
695
|
+
-----END OPENSSH PRIVATE KEY-----
|
|
696
|
+
""",
|
|
697
|
+
)
|
|
698
|
+
|
|
699
|
+
|
|
689
700
|
async def create_volume(
|
|
690
701
|
session: AsyncSession,
|
|
691
702
|
project: ProjectModel,
|
dstack/_internal/utils/common.py
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import asyncio
|
|
2
|
+
import enum
|
|
2
3
|
import itertools
|
|
3
4
|
import re
|
|
4
5
|
import time
|
|
@@ -83,6 +84,8 @@ def pretty_date(time: datetime) -> str:
|
|
|
83
84
|
|
|
84
85
|
|
|
85
86
|
def pretty_resources(
|
|
87
|
+
*,
|
|
88
|
+
cpu_arch: Optional[Any] = None,
|
|
86
89
|
cpus: Optional[Any] = None,
|
|
87
90
|
memory: Optional[Any] = None,
|
|
88
91
|
gpu_count: Optional[Any] = None,
|
|
@@ -110,7 +113,16 @@ def pretty_resources(
|
|
|
110
113
|
"""
|
|
111
114
|
parts = []
|
|
112
115
|
if cpus is not None:
|
|
113
|
-
|
|
116
|
+
cpu_arch_lower: Optional[str] = None
|
|
117
|
+
if isinstance(cpu_arch, enum.Enum):
|
|
118
|
+
cpu_arch_lower = str(cpu_arch.value).lower()
|
|
119
|
+
elif isinstance(cpu_arch, str):
|
|
120
|
+
cpu_arch_lower = cpu_arch.lower()
|
|
121
|
+
if cpu_arch_lower == "arm":
|
|
122
|
+
cpu_arch_prefix = "arm:"
|
|
123
|
+
else:
|
|
124
|
+
cpu_arch_prefix = ""
|
|
125
|
+
parts.append(f"cpu={cpu_arch_prefix}{cpus}")
|
|
114
126
|
if memory is not None:
|
|
115
127
|
parts.append(f"mem={memory}")
|
|
116
128
|
if disk_size:
|
|
dstack/_internal/utils/json_schema.py
CHANGED
|
@@ -1,6 +1,9 @@
|
|
|
1
1
|
def add_extra_schema_types(schema_property: dict, extra_types: list[dict]):
|
|
2
2
|
if "allOf" in schema_property:
|
|
3
|
-
|
|
3
|
+
refs = [schema_property.pop("allOf")[0]]
|
|
4
|
+
elif "anyOf" in schema_property:
|
|
5
|
+
refs = schema_property.pop("anyOf")
|
|
4
6
|
else:
|
|
5
|
-
|
|
6
|
-
|
|
7
|
+
refs = [{"type": schema_property.pop("type")}]
|
|
8
|
+
refs.extend(extra_types)
|
|
9
|
+
schema_property["anyOf"] = refs
|
dstack/api/__init__.py
CHANGED
|
@@ -14,6 +14,7 @@ from dstack._internal.core.models.repos.local import LocalRepo
|
|
|
14
14
|
from dstack._internal.core.models.repos.remote import RemoteRepo
|
|
15
15
|
from dstack._internal.core.models.repos.virtual import VirtualRepo
|
|
16
16
|
from dstack._internal.core.models.resources import ComputeCapability, Memory, Range
|
|
17
|
+
from dstack._internal.core.models.resources import CPUSpec as CPU
|
|
17
18
|
from dstack._internal.core.models.resources import DiskSpec as Disk
|
|
18
19
|
from dstack._internal.core.models.resources import GPUSpec as GPU
|
|
19
20
|
from dstack._internal.core.models.resources import ResourcesSpec as Resources
|
dstack/api/server/_fleets.py
CHANGED
|
@@ -3,6 +3,7 @@ from typing import Any, Dict, List, Optional, Union
|
|
|
3
3
|
from pydantic import parse_obj_as
|
|
4
4
|
|
|
5
5
|
from dstack._internal.core.models.fleets import ApplyFleetPlanInput, Fleet, FleetPlan, FleetSpec
|
|
6
|
+
from dstack._internal.core.models.instances import Instance
|
|
6
7
|
from dstack._internal.server.schemas.fleets import (
|
|
7
8
|
ApplyFleetPlanRequest,
|
|
8
9
|
CreateFleetRequest,
|
|
@@ -83,9 +84,24 @@ def _get_apply_plan_excludes(plan_input: ApplyFleetPlanInput) -> Dict:
|
|
|
83
84
|
spec_excludes = _get_fleet_spec_excludes(plan_input.spec)
|
|
84
85
|
if spec_excludes:
|
|
85
86
|
apply_plan_excludes["spec"] = apply_plan_excludes
|
|
87
|
+
current_resource = plan_input.current_resource
|
|
88
|
+
if current_resource is not None:
|
|
89
|
+
current_resource_excludes = {}
|
|
90
|
+
apply_plan_excludes["current_resource"] = current_resource_excludes
|
|
91
|
+
if all(map(_should_exclude_instance_cpu_arch, current_resource.instances)):
|
|
92
|
+
current_resource_excludes["instances"] = {
|
|
93
|
+
"__all__": {"instance_type": {"resources": {"cpu_arch"}}}
|
|
94
|
+
}
|
|
86
95
|
return {"plan": apply_plan_excludes}
|
|
87
96
|
|
|
88
97
|
|
|
98
|
+
def _should_exclude_instance_cpu_arch(instance: Instance) -> bool:
|
|
99
|
+
try:
|
|
100
|
+
return instance.instance_type.resources.cpu_arch is None
|
|
101
|
+
except AttributeError:
|
|
102
|
+
return True
|
|
103
|
+
|
|
104
|
+
|
|
89
105
|
def _get_create_fleet_excludes(fleet_spec: FleetSpec) -> Dict:
|
|
90
106
|
create_fleet_excludes = {}
|
|
91
107
|
spec_excludes = _get_fleet_spec_excludes(fleet_spec)
|
dstack/api/server/_runs.py
CHANGED
|
@@ -7,6 +7,7 @@ from pydantic import parse_obj_as
|
|
|
7
7
|
from dstack._internal.core.models.configurations import ServiceConfiguration
|
|
8
8
|
from dstack._internal.core.models.runs import (
|
|
9
9
|
ApplyRunPlanInput,
|
|
10
|
+
JobSubmission,
|
|
10
11
|
Run,
|
|
11
12
|
RunPlan,
|
|
12
13
|
RunSpec,
|
|
@@ -96,13 +97,57 @@ def _get_apply_plan_excludes(plan: ApplyRunPlanInput) -> Optional[Dict]:
|
|
|
96
97
|
run_spec_excludes = _get_run_spec_excludes(plan.run_spec)
|
|
97
98
|
if run_spec_excludes is not None:
|
|
98
99
|
apply_plan_excludes["run_spec"] = run_spec_excludes
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
100
|
+
current_resource = plan.current_resource
|
|
101
|
+
if current_resource is not None:
|
|
102
|
+
current_resource_excludes = {}
|
|
103
|
+
apply_plan_excludes["current_resource"] = current_resource_excludes
|
|
104
|
+
current_resource_excludes["run_spec"] = _get_run_spec_excludes(current_resource.run_spec)
|
|
105
|
+
job_submissions_excludes = {}
|
|
106
|
+
current_resource_excludes["jobs"] = {
|
|
107
|
+
"__all__": {"job_submissions": {"__all__": job_submissions_excludes}}
|
|
102
108
|
}
|
|
109
|
+
job_submissions = [js for j in current_resource.jobs for js in j.job_submissions]
|
|
110
|
+
if all(map(_should_exclude_job_submission_jpd_cpu_arch, job_submissions)):
|
|
111
|
+
job_submissions_excludes["job_provisioning_data"] = {
|
|
112
|
+
"instance_type": {"resources": {"cpu_arch"}}
|
|
113
|
+
}
|
|
114
|
+
if all(map(_should_exclude_job_submission_jrd_cpu_arch, job_submissions)):
|
|
115
|
+
job_submissions_excludes["job_runtime_data"] = {
|
|
116
|
+
"offer": {"instance": {"resources": {"cpu_arch"}}}
|
|
117
|
+
}
|
|
118
|
+
if all(js.exit_status is None for js in job_submissions):
|
|
119
|
+
job_submissions_excludes["exit_status"] = True
|
|
120
|
+
latest_job_submission = current_resource.latest_job_submission
|
|
121
|
+
if latest_job_submission is not None:
|
|
122
|
+
latest_job_submission_excludes = {}
|
|
123
|
+
current_resource_excludes["latest_job_submission"] = latest_job_submission_excludes
|
|
124
|
+
if _should_exclude_job_submission_jpd_cpu_arch(latest_job_submission):
|
|
125
|
+
latest_job_submission_excludes["job_provisioning_data"] = {
|
|
126
|
+
"instance_type": {"resources": {"cpu_arch"}}
|
|
127
|
+
}
|
|
128
|
+
if _should_exclude_job_submission_jrd_cpu_arch(latest_job_submission):
|
|
129
|
+
latest_job_submission_excludes["job_runtime_data"] = {
|
|
130
|
+
"offer": {"instance": {"resources": {"cpu_arch"}}}
|
|
131
|
+
}
|
|
132
|
+
if latest_job_submission.exit_status is None:
|
|
133
|
+
latest_job_submission_excludes["exit_status"] = True
|
|
103
134
|
return {"plan": apply_plan_excludes}
|
|
104
135
|
|
|
105
136
|
|
|
137
|
+
def _should_exclude_job_submission_jpd_cpu_arch(job_submission: JobSubmission) -> bool:
|
|
138
|
+
try:
|
|
139
|
+
return job_submission.job_provisioning_data.instance_type.resources.cpu_arch is None
|
|
140
|
+
except AttributeError:
|
|
141
|
+
return True
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
def _should_exclude_job_submission_jrd_cpu_arch(job_submission: JobSubmission) -> bool:
|
|
145
|
+
try:
|
|
146
|
+
return job_submission.job_runtime_data.offer.instance.resources.cpu_arch is None
|
|
147
|
+
except AttributeError:
|
|
148
|
+
return True
|
|
149
|
+
|
|
150
|
+
|
|
106
151
|
def _get_get_plan_excludes(request: GetRunPlanRequest) -> Optional[Dict]:
|
|
107
152
|
"""
|
|
108
153
|
Excludes new fields when they are not set to keep
|
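dstack/version.py
CHANGED
|
[The dstack/version.py hunk (+1 -1) is missing from this capture; the hunks below belong to METADATA.]
|
{dstack-0.19.7.dist-info → dstack-0.19.9.dist-info}/METADATA
CHANGED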
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: dstack
|
|
3
|
-
Version: 0.19.7
|
|
3
|
+
Version: 0.19.9
|
|
4
4
|
Summary: dstack is an open-source orchestration engine for running AI workloads on any cloud or on-premises.
|
|
5
5
|
Project-URL: Homepage, https://dstack.ai
|
|
6
6
|
Project-URL: Source, https://github.com/dstackai/dstack
|
|
@@ -49,11 +49,12 @@ Requires-Dist: asyncpg; extra == 'all'
|
|
|
49
49
|
Requires-Dist: azure-identity>=1.12.0; extra == 'all'
|
|
50
50
|
Requires-Dist: azure-mgmt-authorization>=3.0.0; extra == 'all'
|
|
51
51
|
Requires-Dist: azure-mgmt-compute>=29.1.0; extra == 'all'
|
|
52
|
+
Requires-Dist: azure-mgmt-msi>=7.0.0; extra == 'all'
|
|
52
53
|
Requires-Dist: azure-mgmt-network<28.0.0,>=23.0.0; extra == 'all'
|
|
53
54
|
Requires-Dist: azure-mgmt-resource>=22.0.0; extra == 'all'
|
|
54
55
|
Requires-Dist: azure-mgmt-subscription>=3.1.1; extra == 'all'
|
|
55
56
|
Requires-Dist: backports-entry-points-selectable; extra == 'all'
|
|
56
|
-
Requires-Dist: boto3; extra == 'all'
|
|
57
|
+
Requires-Dist: boto3>=1.38.13; extra == 'all'
|
|
57
58
|
Requires-Dist: botocore; extra == 'all'
|
|
58
59
|
Requires-Dist: datacrunch; extra == 'all'
|
|
59
60
|
Requires-Dist: docker>=6.0.0; extra == 'all'
|
|
@@ -89,7 +90,7 @@ Requires-Dist: alembic>=1.10.2; extra == 'aws'
|
|
|
89
90
|
Requires-Dist: apscheduler<4; extra == 'aws'
|
|
90
91
|
Requires-Dist: asyncpg; extra == 'aws'
|
|
91
92
|
Requires-Dist: backports-entry-points-selectable; extra == 'aws'
|
|
92
|
-
Requires-Dist: boto3; extra == 'aws'
|
|
93
|
+
Requires-Dist: boto3>=1.38.13; extra == 'aws'
|
|
93
94
|
Requires-Dist: botocore; extra == 'aws'
|
|
94
95
|
Requires-Dist: docker>=6.0.0; extra == 'aws'
|
|
95
96
|
Requires-Dist: fastapi; extra == 'aws'
|
|
@@ -116,6 +117,7 @@ Requires-Dist: asyncpg; extra == 'azure'
|
|
|
116
117
|
Requires-Dist: azure-identity>=1.12.0; extra == 'azure'
|
|
117
118
|
Requires-Dist: azure-mgmt-authorization>=3.0.0; extra == 'azure'
|
|
118
119
|
Requires-Dist: azure-mgmt-compute>=29.1.0; extra == 'azure'
|
|
120
|
+
Requires-Dist: azure-mgmt-msi>=7.0.0; extra == 'azure'
|
|
119
121
|
Requires-Dist: azure-mgmt-network<28.0.0,>=23.0.0; extra == 'azure'
|
|
120
122
|
Requires-Dist: azure-mgmt-resource>=22.0.0; extra == 'azure'
|
|
121
123
|
Requires-Dist: azure-mgmt-subscription>=3.1.1; extra == 'azure'
|
|
@@ -229,7 +231,7 @@ Requires-Dist: alembic>=1.10.2; extra == 'lambda'
|
|
|
229
231
|
Requires-Dist: apscheduler<4; extra == 'lambda'
|
|
230
232
|
Requires-Dist: asyncpg; extra == 'lambda'
|
|
231
233
|
Requires-Dist: backports-entry-points-selectable; extra == 'lambda'
|
|
232
|
-
Requires-Dist: boto3; extra == 'lambda'
|
|
234
|
+
Requires-Dist: boto3>=1.38.13; extra == 'lambda'
|
|
233
235
|
Requires-Dist: botocore; extra == 'lambda'
|
|
234
236
|
Requires-Dist: docker>=6.0.0; extra == 'lambda'
|
|
235
237
|
Requires-Dist: fastapi; extra == 'lambda'
|
|
@@ -336,24 +338,27 @@ orchestration for ML teams across top clouds and on-prem clusters.
|
|
|
336
338
|
|
|
337
339
|
#### Accelerators
|
|
338
340
|
|
|
339
|
-
`dstack` supports `NVIDIA`, `AMD`, `Google TPU`,
|
|
341
|
+
`dstack` supports `NVIDIA`, `AMD`, `Google TPU`, `Intel Gaudi`, and `Tenstorrent` accelerators out of the box.
|
|
340
342
|
|
|
341
|
-
##
|
|
343
|
+
## Latest news ✨
|
|
342
344
|
|
|
343
|
-
- [2025/
|
|
344
|
-
- [2025/
|
|
345
|
-
- [2025/
|
|
346
|
-
- [
|
|
347
|
-
- [
|
|
348
|
-
- [2024/10] [dstack 0.18.18: Hardware metrics monitoring](https://github.com/dstackai/dstack/releases/tag/0.18.18)
|
|
345
|
+
- [2025/05] [dstack 0.19.8: Nebius clusters, GH200 on Lambda](https://github.com/dstackai/dstack/releases/tag/0.19.8)
|
|
346
|
+
- [2025/04] [dstack 0.19.6: Tenstorrent, Plugins](https://github.com/dstackai/dstack/releases/tag/0.19.6)
|
|
347
|
+
- [2025/04] [dstack 0.19.5: GCP A3 High clusters](https://github.com/dstackai/dstack/releases/tag/0.19.5)
|
|
348
|
+
- [2025/04] [dstack 0.19.3: GCP A3 Mega clusters](https://github.com/dstackai/dstack/releases/tag/0.19.3)
|
|
349
|
+
- [2025/03] [dstack 0.19.0: Prometheus](https://github.com/dstackai/dstack/releases/tag/0.19.0)
|
|
349
350
|
|
|
350
|
-
##
|
|
351
|
+
## How does it work?
|
|
352
|
+
|
|
353
|
+
<img src="https://dstack.ai/static-assets/static-assets/images/dstack-architecture-diagram-v8.svg" width="750" />
|
|
354
|
+
|
|
355
|
+
### Installation
|
|
351
356
|
|
|
352
357
|
> Before using `dstack` through CLI or API, set up a `dstack` server. If you already have a running `dstack` server, you only need to [set up the CLI](#set-up-the-cli).
|
|
353
358
|
|
|
354
|
-
|
|
359
|
+
#### Set up the server
|
|
355
360
|
|
|
356
|
-
|
|
361
|
+
##### (Optional) Configure backends
|
|
357
362
|
|
|
358
363
|
To use `dstack` with cloud providers, configure backends
|
|
359
364
|
via the `~/.dstack/server/config.yml` file.
|
|
@@ -363,21 +368,21 @@ For more details on how to configure backends, check [Backends](https://dstack.a
|
|
|
363
368
|
> For using `dstack` with on-prem servers, create [SSH fleets](https://dstack.ai/docs/concepts/fleets#ssh)
|
|
364
369
|
> once the server is up.
|
|
365
370
|
|
|
366
|
-
|
|
371
|
+
##### Start the server
|
|
367
372
|
|
|
368
373
|
You can install the server on Linux, macOS, and Windows (via WSL 2). It requires Git and
|
|
369
374
|
OpenSSH.
|
|
370
375
|
|
|
371
|
-
#####
|
|
376
|
+
##### uv
|
|
372
377
|
|
|
373
378
|
```shell
|
|
374
|
-
$
|
|
379
|
+
$ uv tool install "dstack[all]" -U
|
|
375
380
|
```
|
|
376
381
|
|
|
377
|
-
#####
|
|
382
|
+
##### pip
|
|
378
383
|
|
|
379
384
|
```shell
|
|
380
|
-
$
|
|
385
|
+
$ pip install "dstack[all]" -U
|
|
381
386
|
```
|
|
382
387
|
|
|
383
388
|
Once it's installed, go ahead and start the server.
|
|
@@ -390,25 +395,28 @@ The admin token is "bbae0f28-d3dd-4820-bf61-8f4bb40815da"
|
|
|
390
395
|
The server is running at http://127.0.0.1:3000/
|
|
391
396
|
```
|
|
392
397
|
|
|
393
|
-
For more details on server configuration options, see the
|
|
398
|
+
> For more details on server configuration options, see the
|
|
394
399
|
[Server deployment](https://dstack.ai/docs/guides/server-deployment) guide.
|
|
395
400
|
|
|
396
|
-
|
|
401
|
+
|
|
402
|
+
<details><summary>Set up the CLI</summary>
|
|
403
|
+
|
|
404
|
+
#### Set up the CLI
|
|
397
405
|
|
|
398
406
|
Once the server is up, you can access it via the `dstack` CLI.
|
|
399
407
|
|
|
400
408
|
The CLI can be installed on Linux, macOS, and Windows. It requires Git and OpenSSH.
|
|
401
409
|
|
|
402
|
-
#####
|
|
410
|
+
##### uv
|
|
403
411
|
|
|
404
412
|
```shell
|
|
405
|
-
$
|
|
413
|
+
$ uv tool install dstack -U
|
|
406
414
|
```
|
|
407
415
|
|
|
408
|
-
#####
|
|
416
|
+
##### pip
|
|
409
417
|
|
|
410
418
|
```shell
|
|
411
|
-
$
|
|
419
|
+
$ pip install dstack -U
|
|
412
420
|
```
|
|
413
421
|
|
|
414
422
|
To point the CLI to the `dstack` server, configure it
|
|
@@ -423,9 +431,9 @@ $ dstack config \
|
|
|
423
431
|
Configuration is updated at ~/.dstack/config.yml
|
|
424
432
|
```
|
|
425
433
|
|
|
426
|
-
|
|
434
|
+
</details>
|
|
427
435
|
|
|
428
|
-
###
|
|
436
|
+
### Define configurations
|
|
429
437
|
|
|
430
438
|
`dstack` supports the following configurations:
|
|
431
439
|
|
|
@@ -438,7 +446,7 @@ Configuration is updated at ~/.dstack/config.yml
|
|
|
438
446
|
|
|
439
447
|
Configuration can be defined as YAML files within your repo.
|
|
440
448
|
|
|
441
|
-
###
|
|
449
|
+
### Apply configurations
|
|
442
450
|
|
|
443
451
|
Apply the configuration either via the `dstack apply` CLI command or through a programmatic API.
|
|
444
452
|
|
|
@@ -450,6 +458,7 @@ out-of-capacity errors, port-forwarding, and more — across clouds and on-p
|
|
|
450
458
|
For additional information, see the following links:
|
|
451
459
|
|
|
452
460
|
* [Docs](https://dstack.ai/docs)
|
|
461
|
+
* [Examples](https://dstack.ai/examples)
|
|
453
462
|
* [Discord](https://discord.gg/u8SmfwPpMd)
|
|
454
463
|
|
|
455
464
|
## Contributing
|