dstack 0.19.7__py3-none-any.whl → 0.19.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dstack might be problematic. Click here for more details.
- dstack/_internal/cli/services/args.py +2 -2
- dstack/_internal/cli/services/configurators/run.py +38 -2
- dstack/_internal/cli/utils/run.py +3 -3
- dstack/_internal/core/backends/aws/compute.py +13 -1
- dstack/_internal/core/backends/azure/compute.py +42 -13
- dstack/_internal/core/backends/azure/configurator.py +21 -0
- dstack/_internal/core/backends/azure/models.py +9 -0
- dstack/_internal/core/backends/base/compute.py +101 -27
- dstack/_internal/core/backends/base/offers.py +13 -3
- dstack/_internal/core/backends/cudo/compute.py +2 -0
- dstack/_internal/core/backends/datacrunch/compute.py +2 -0
- dstack/_internal/core/backends/gcp/auth.py +1 -1
- dstack/_internal/core/backends/gcp/compute.py +51 -35
- dstack/_internal/core/backends/lambdalabs/compute.py +20 -8
- dstack/_internal/core/backends/local/compute.py +2 -0
- dstack/_internal/core/backends/nebius/compute.py +95 -1
- dstack/_internal/core/backends/nebius/configurator.py +11 -0
- dstack/_internal/core/backends/nebius/fabrics.py +47 -0
- dstack/_internal/core/backends/nebius/models.py +8 -0
- dstack/_internal/core/backends/nebius/resources.py +29 -0
- dstack/_internal/core/backends/oci/compute.py +2 -0
- dstack/_internal/core/backends/remote/provisioning.py +27 -2
- dstack/_internal/core/backends/template/compute.py.jinja +2 -0
- dstack/_internal/core/backends/tensordock/compute.py +2 -0
- dstack/_internal/core/backends/vultr/compute.py +5 -1
- dstack/_internal/core/models/instances.py +2 -1
- dstack/_internal/core/models/resources.py +78 -3
- dstack/_internal/core/models/runs.py +7 -2
- dstack/_internal/core/models/volumes.py +1 -1
- dstack/_internal/server/background/tasks/process_fleets.py +4 -13
- dstack/_internal/server/background/tasks/process_instances.py +176 -55
- dstack/_internal/server/background/tasks/process_placement_groups.py +1 -1
- dstack/_internal/server/background/tasks/process_prometheus_metrics.py +5 -2
- dstack/_internal/server/models.py +1 -0
- dstack/_internal/server/services/fleets.py +9 -26
- dstack/_internal/server/services/instances.py +0 -2
- dstack/_internal/server/services/offers.py +15 -0
- dstack/_internal/server/services/placement.py +27 -6
- dstack/_internal/server/services/resources.py +21 -0
- dstack/_internal/server/services/runs.py +16 -6
- dstack/_internal/server/testing/common.py +35 -26
- dstack/_internal/utils/common.py +13 -1
- dstack/_internal/utils/json_schema.py +6 -3
- dstack/api/__init__.py +1 -0
- dstack/api/server/_fleets.py +16 -0
- dstack/api/server/_runs.py +44 -3
- dstack/version.py +1 -1
- {dstack-0.19.7.dist-info → dstack-0.19.8.dist-info}/METADATA +3 -1
- {dstack-0.19.7.dist-info → dstack-0.19.8.dist-info}/RECORD +52 -50
- {dstack-0.19.7.dist-info → dstack-0.19.8.dist-info}/WHEEL +0 -0
- {dstack-0.19.7.dist-info → dstack-0.19.8.dist-info}/entry_points.txt +0 -0
- {dstack-0.19.7.dist-info → dstack-0.19.8.dist-info}/licenses/LICENSE.md +0 -0
|
@@ -1,5 +1,3 @@
|
|
|
1
|
-
import random
|
|
2
|
-
import string
|
|
3
1
|
import uuid
|
|
4
2
|
from datetime import datetime, timezone
|
|
5
3
|
from typing import List, Literal, Optional, Tuple, Union, cast
|
|
@@ -33,6 +31,7 @@ from dstack._internal.core.models.instances import (
|
|
|
33
31
|
SSHConnectionParams,
|
|
34
32
|
SSHKey,
|
|
35
33
|
)
|
|
34
|
+
from dstack._internal.core.models.placement import PlacementGroup
|
|
36
35
|
from dstack._internal.core.models.profiles import (
|
|
37
36
|
Profile,
|
|
38
37
|
SpotPolicy,
|
|
@@ -62,6 +61,7 @@ from dstack._internal.server.services.projects import (
|
|
|
62
61
|
list_project_models,
|
|
63
62
|
list_user_project_models,
|
|
64
63
|
)
|
|
64
|
+
from dstack._internal.server.services.resources import set_resources_defaults
|
|
65
65
|
from dstack._internal.utils import random_names
|
|
66
66
|
from dstack._internal.utils.logging import get_logger
|
|
67
67
|
from dstack._internal.utils.ssh import pkey_from_str
|
|
@@ -243,6 +243,7 @@ async def get_plan(
|
|
|
243
243
|
spec=effective_spec,
|
|
244
244
|
)
|
|
245
245
|
effective_spec = FleetSpec.parse_obj(effective_spec.dict())
|
|
246
|
+
_validate_fleet_spec_and_set_defaults(spec)
|
|
246
247
|
current_fleet: Optional[Fleet] = None
|
|
247
248
|
current_fleet_id: Optional[uuid.UUID] = None
|
|
248
249
|
if effective_spec.configuration.name is not None:
|
|
@@ -282,6 +283,7 @@ async def get_create_instance_offers(
|
|
|
282
283
|
project: ProjectModel,
|
|
283
284
|
profile: Profile,
|
|
284
285
|
requirements: Requirements,
|
|
286
|
+
placement_group: Optional[PlacementGroup] = None,
|
|
285
287
|
fleet_spec: Optional[FleetSpec] = None,
|
|
286
288
|
fleet_model: Optional[FleetModel] = None,
|
|
287
289
|
blocks: Union[int, Literal["auto"]] = 1,
|
|
@@ -307,6 +309,7 @@ async def get_create_instance_offers(
|
|
|
307
309
|
exclude_not_available=exclude_not_available,
|
|
308
310
|
multinode=multinode,
|
|
309
311
|
master_job_provisioning_data=master_job_provisioning_data,
|
|
312
|
+
placement_group=placement_group,
|
|
310
313
|
blocks=blocks,
|
|
311
314
|
)
|
|
312
315
|
offers = [
|
|
@@ -345,7 +348,7 @@ async def create_fleet(
|
|
|
345
348
|
spec=spec,
|
|
346
349
|
)
|
|
347
350
|
spec = FleetSpec.parse_obj(spec.dict())
|
|
348
|
-
|
|
351
|
+
_validate_fleet_spec_and_set_defaults(spec)
|
|
349
352
|
|
|
350
353
|
if spec.configuration.ssh_config is not None:
|
|
351
354
|
_check_can_manage_ssh_fleets(user=user, project=project)
|
|
@@ -393,17 +396,12 @@ async def create_fleet(
|
|
|
393
396
|
)
|
|
394
397
|
fleet_model.instances.append(instances_model)
|
|
395
398
|
else:
|
|
396
|
-
placement_group_name = _get_placement_group_name(
|
|
397
|
-
project=project,
|
|
398
|
-
fleet_spec=spec,
|
|
399
|
-
)
|
|
400
399
|
for i in range(_get_fleet_nodes_to_provision(spec)):
|
|
401
400
|
instance_model = await create_fleet_instance_model(
|
|
402
401
|
session=session,
|
|
403
402
|
project=project,
|
|
404
403
|
user=user,
|
|
405
404
|
spec=spec,
|
|
406
|
-
placement_group_name=placement_group_name,
|
|
407
405
|
reservation=spec.configuration.reservation,
|
|
408
406
|
instance_num=i,
|
|
409
407
|
)
|
|
@@ -417,7 +415,6 @@ async def create_fleet_instance_model(
|
|
|
417
415
|
project: ProjectModel,
|
|
418
416
|
user: UserModel,
|
|
419
417
|
spec: FleetSpec,
|
|
420
|
-
placement_group_name: Optional[str],
|
|
421
418
|
reservation: Optional[str],
|
|
422
419
|
instance_num: int,
|
|
423
420
|
) -> InstanceModel:
|
|
@@ -431,7 +428,6 @@ async def create_fleet_instance_model(
|
|
|
431
428
|
requirements=requirements,
|
|
432
429
|
instance_name=f"{spec.configuration.name}-{instance_num}",
|
|
433
430
|
instance_num=instance_num,
|
|
434
|
-
placement_group_name=placement_group_name,
|
|
435
431
|
reservation=reservation,
|
|
436
432
|
blocks=spec.configuration.blocks,
|
|
437
433
|
tags=spec.configuration.tags,
|
|
@@ -652,7 +648,7 @@ def _remove_fleet_spec_sensitive_info(spec: FleetSpec):
|
|
|
652
648
|
host.ssh_key = None
|
|
653
649
|
|
|
654
650
|
|
|
655
|
-
def _validate_fleet_spec(spec: FleetSpec):
|
|
651
|
+
def _validate_fleet_spec_and_set_defaults(spec: FleetSpec):
|
|
656
652
|
if spec.configuration.name is not None:
|
|
657
653
|
validate_dstack_resource_name(spec.configuration.name)
|
|
658
654
|
if spec.configuration.ssh_config is None and spec.configuration.nodes is None:
|
|
@@ -665,6 +661,8 @@ def _validate_fleet_spec(spec: FleetSpec):
|
|
|
665
661
|
if isinstance(host, SSHHostParams) and host.ssh_key is not None:
|
|
666
662
|
_validate_ssh_key(host.ssh_key)
|
|
667
663
|
_validate_internal_ips(spec.configuration.ssh_config)
|
|
664
|
+
if spec.configuration.resources is not None:
|
|
665
|
+
set_resources_defaults(spec.configuration.resources)
|
|
668
666
|
|
|
669
667
|
|
|
670
668
|
def _validate_all_ssh_params_specified(ssh_config: SSHParams):
|
|
@@ -735,18 +733,3 @@ def _get_fleet_requirements(fleet_spec: FleetSpec) -> Requirements:
|
|
|
735
733
|
reservation=fleet_spec.configuration.reservation,
|
|
736
734
|
)
|
|
737
735
|
return requirements
|
|
738
|
-
|
|
739
|
-
|
|
740
|
-
def _get_placement_group_name(
|
|
741
|
-
project: ProjectModel,
|
|
742
|
-
fleet_spec: FleetSpec,
|
|
743
|
-
) -> Optional[str]:
|
|
744
|
-
if fleet_spec.configuration.placement != InstanceGroupPlacement.CLUSTER:
|
|
745
|
-
return None
|
|
746
|
-
# A random suffix to avoid clashing with to-be-deleted placement groups left by old fleets
|
|
747
|
-
suffix = _generate_random_placement_group_suffix()
|
|
748
|
-
return f"{project.name}-{fleet_spec.configuration.name}-{suffix}-pg"
|
|
749
|
-
|
|
750
|
-
|
|
751
|
-
def _generate_random_placement_group_suffix(length: int = 8) -> str:
|
|
752
|
-
return "".join(random.choice(string.ascii_lowercase + string.digits) for _ in range(length))
|
|
@@ -408,7 +408,6 @@ async def create_instance_model(
|
|
|
408
408
|
requirements: Requirements,
|
|
409
409
|
instance_name: str,
|
|
410
410
|
instance_num: int,
|
|
411
|
-
placement_group_name: Optional[str],
|
|
412
411
|
reservation: Optional[str],
|
|
413
412
|
blocks: Union[Literal["auto"], int],
|
|
414
413
|
tags: Optional[Dict[str, str]],
|
|
@@ -427,7 +426,6 @@ async def create_instance_model(
|
|
|
427
426
|
user=user.name,
|
|
428
427
|
ssh_keys=[project_ssh_key],
|
|
429
428
|
instance_id=str(instance_id),
|
|
430
|
-
placement_group_name=placement_group_name,
|
|
431
429
|
reservation=reservation,
|
|
432
430
|
tags=tags,
|
|
433
431
|
)
|
|
@@ -8,12 +8,14 @@ from dstack._internal.core.backends import (
|
|
|
8
8
|
BACKENDS_WITH_RESERVATION_SUPPORT,
|
|
9
9
|
)
|
|
10
10
|
from dstack._internal.core.backends.base.backend import Backend
|
|
11
|
+
from dstack._internal.core.backends.base.compute import ComputeWithPlacementGroupSupport
|
|
11
12
|
from dstack._internal.core.models.backends.base import BackendType
|
|
12
13
|
from dstack._internal.core.models.instances import (
|
|
13
14
|
InstanceOfferWithAvailability,
|
|
14
15
|
InstanceType,
|
|
15
16
|
Resources,
|
|
16
17
|
)
|
|
18
|
+
from dstack._internal.core.models.placement import PlacementGroup
|
|
17
19
|
from dstack._internal.core.models.profiles import Profile
|
|
18
20
|
from dstack._internal.core.models.runs import JobProvisioningData, Requirements
|
|
19
21
|
from dstack._internal.core.models.volumes import Volume
|
|
@@ -31,6 +33,7 @@ async def get_offers_by_requirements(
|
|
|
31
33
|
volumes: Optional[List[List[Volume]]] = None,
|
|
32
34
|
privileged: bool = False,
|
|
33
35
|
instance_mounts: bool = False,
|
|
36
|
+
placement_group: Optional[PlacementGroup] = None,
|
|
34
37
|
blocks: Union[int, Literal["auto"]] = 1,
|
|
35
38
|
) -> List[Tuple[Backend, InstanceOfferWithAvailability]]:
|
|
36
39
|
backends: List[Backend] = await backends_services.get_project_backends(project=project)
|
|
@@ -116,6 +119,18 @@ async def get_offers_by_requirements(
|
|
|
116
119
|
new_offers.append((b, new_offer))
|
|
117
120
|
offers = new_offers
|
|
118
121
|
|
|
122
|
+
if placement_group is not None:
|
|
123
|
+
new_offers = []
|
|
124
|
+
for b, o in offers:
|
|
125
|
+
for backend in backends:
|
|
126
|
+
compute = backend.compute()
|
|
127
|
+
if isinstance(
|
|
128
|
+
compute, ComputeWithPlacementGroupSupport
|
|
129
|
+
) and compute.is_suitable_placement_group(placement_group, o):
|
|
130
|
+
new_offers.append((b, o))
|
|
131
|
+
break
|
|
132
|
+
offers = new_offers
|
|
133
|
+
|
|
119
134
|
if profile.instance_types is not None:
|
|
120
135
|
instance_types = [i.lower() for i in profile.instance_types]
|
|
121
136
|
offers = [(b, o) for b, o in offers if o.instance.name.lower() in instance_types]
|
|
@@ -1,8 +1,9 @@
|
|
|
1
|
+
from collections.abc import Iterable
|
|
1
2
|
from typing import Optional
|
|
2
3
|
from uuid import UUID
|
|
3
4
|
|
|
4
5
|
from git import List
|
|
5
|
-
from sqlalchemy import select
|
|
6
|
+
from sqlalchemy import and_, select, update
|
|
6
7
|
from sqlalchemy.ext.asyncio import AsyncSession
|
|
7
8
|
|
|
8
9
|
from dstack._internal.core.models.placement import (
|
|
@@ -13,15 +14,35 @@ from dstack._internal.core.models.placement import (
|
|
|
13
14
|
from dstack._internal.server.models import PlacementGroupModel
|
|
14
15
|
|
|
15
16
|
|
|
16
|
-
async def
|
|
17
|
+
async def get_fleet_placement_group_models(
|
|
17
18
|
session: AsyncSession,
|
|
18
19
|
fleet_id: UUID,
|
|
19
|
-
) -> List[PlacementGroup]:
|
|
20
|
+
) -> List[PlacementGroupModel]:
|
|
20
21
|
res = await session.execute(
|
|
21
|
-
select(PlacementGroupModel).where(PlacementGroupModel.fleet_id == fleet_id)
|
|
22
|
+
select(PlacementGroupModel).where(
|
|
23
|
+
and_(
|
|
24
|
+
PlacementGroupModel.fleet_id == fleet_id,
|
|
25
|
+
PlacementGroupModel.deleted == False,
|
|
26
|
+
PlacementGroupModel.fleet_deleted == False,
|
|
27
|
+
)
|
|
28
|
+
)
|
|
29
|
+
)
|
|
30
|
+
return list(res.scalars().all())
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
async def schedule_fleet_placement_groups_deletion(
|
|
34
|
+
session: AsyncSession, fleet_id: UUID, except_placement_group_ids: Iterable[UUID] = ()
|
|
35
|
+
) -> None:
|
|
36
|
+
await session.execute(
|
|
37
|
+
update(PlacementGroupModel)
|
|
38
|
+
.where(
|
|
39
|
+
and_(
|
|
40
|
+
PlacementGroupModel.fleet_id == fleet_id,
|
|
41
|
+
PlacementGroupModel.id.not_in(except_placement_group_ids),
|
|
42
|
+
)
|
|
43
|
+
)
|
|
44
|
+
.values(fleet_deleted=True) # TODO: rename `fleet_deleted` -> `to_be_deleted`
|
|
22
45
|
)
|
|
23
|
-
placement_groups = res.scalars().all()
|
|
24
|
-
return [placement_group_model_to_placement_group(pg) for pg in placement_groups]
|
|
25
46
|
|
|
26
47
|
|
|
27
48
|
def placement_group_model_to_placement_group(
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
import gpuhunt
|
|
2
|
+
from pydantic import parse_obj_as
|
|
3
|
+
|
|
4
|
+
from dstack._internal.core.models.resources import CPUSpec, ResourcesSpec
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def set_resources_defaults(resources: ResourcesSpec) -> None:
|
|
8
|
+
# TODO: Remove in 0.20. Use resources.cpu directly
|
|
9
|
+
cpu = parse_obj_as(CPUSpec, resources.cpu)
|
|
10
|
+
if cpu.arch is None:
|
|
11
|
+
gpu = resources.gpu
|
|
12
|
+
if (
|
|
13
|
+
gpu is not None
|
|
14
|
+
and gpu.vendor in [None, gpuhunt.AcceleratorVendor.NVIDIA]
|
|
15
|
+
and gpu.name
|
|
16
|
+
and any(map(gpuhunt.is_nvidia_superchip, gpu.name))
|
|
17
|
+
):
|
|
18
|
+
cpu.arch = gpuhunt.CPUArchitecture.ARM
|
|
19
|
+
else:
|
|
20
|
+
cpu.arch = gpuhunt.CPUArchitecture.X86
|
|
21
|
+
resources.cpu = cpu
|
|
@@ -81,6 +81,7 @@ from dstack._internal.server.services.logging import fmt
|
|
|
81
81
|
from dstack._internal.server.services.offers import get_offers_by_requirements
|
|
82
82
|
from dstack._internal.server.services.plugins import apply_plugin_policies
|
|
83
83
|
from dstack._internal.server.services.projects import list_project_models, list_user_project_models
|
|
84
|
+
from dstack._internal.server.services.resources import set_resources_defaults
|
|
84
85
|
from dstack._internal.server.services.users import get_user_model_by_name
|
|
85
86
|
from dstack._internal.utils.logging import get_logger
|
|
86
87
|
from dstack._internal.utils.random_names import generate_name
|
|
@@ -301,12 +302,14 @@ async def get_plan(
|
|
|
301
302
|
project=project,
|
|
302
303
|
run_name=effective_run_spec.run_name,
|
|
303
304
|
)
|
|
304
|
-
if
|
|
305
|
-
current_resource
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
305
|
+
if current_resource is not None:
|
|
306
|
+
# For backward compatibility (current_resource may has been submitted before
|
|
307
|
+
# some fields, e.g., CPUSpec.arch, were added)
|
|
308
|
+
set_resources_defaults(current_resource.run_spec.configuration.resources)
|
|
309
|
+
if not current_resource.status.is_finished() and _can_update_run_spec(
|
|
310
|
+
current_resource.run_spec, effective_run_spec
|
|
311
|
+
):
|
|
312
|
+
action = ApplyAction.UPDATE
|
|
310
313
|
|
|
311
314
|
jobs = await get_jobs_from_run_spec(effective_run_spec, replica_num=0)
|
|
312
315
|
|
|
@@ -406,6 +409,10 @@ async def apply_plan(
|
|
|
406
409
|
project=project,
|
|
407
410
|
run_spec=run_spec,
|
|
408
411
|
)
|
|
412
|
+
|
|
413
|
+
# For backward compatibility (current_resource may has been submitted before
|
|
414
|
+
# some fields, e.g., CPUSpec.arch, were added)
|
|
415
|
+
set_resources_defaults(current_resource.run_spec.configuration.resources)
|
|
409
416
|
try:
|
|
410
417
|
_check_can_update_run_spec(current_resource.run_spec, run_spec)
|
|
411
418
|
except ServerClientError:
|
|
@@ -414,6 +421,8 @@ async def apply_plan(
|
|
|
414
421
|
raise ServerClientError("Cannot override active run. Stop the run first.")
|
|
415
422
|
raise
|
|
416
423
|
if not force:
|
|
424
|
+
if plan.current_resource is not None:
|
|
425
|
+
set_resources_defaults(plan.current_resource.run_spec.configuration.resources)
|
|
417
426
|
if (
|
|
418
427
|
plan.current_resource is None
|
|
419
428
|
or plan.current_resource.id != current_resource.id
|
|
@@ -866,6 +875,7 @@ def _validate_run_spec_and_set_defaults(run_spec: RunSpec):
|
|
|
866
875
|
raise ServerClientError(
|
|
867
876
|
f"Maximum utilization_policy.time_window is {settings.SERVER_METRICS_TTL_SECONDS}s"
|
|
868
877
|
)
|
|
878
|
+
set_resources_defaults(run_spec.configuration.resources)
|
|
869
879
|
|
|
870
880
|
|
|
871
881
|
_UPDATABLE_SPEC_FIELDS = ["repo_code_hash", "configuration"]
|
|
@@ -2,7 +2,7 @@ import json
|
|
|
2
2
|
import uuid
|
|
3
3
|
from contextlib import contextmanager
|
|
4
4
|
from datetime import datetime, timezone
|
|
5
|
-
from typing import Dict, List, Optional, Union
|
|
5
|
+
from typing import Dict, List, Literal, Optional, Union
|
|
6
6
|
from uuid import UUID
|
|
7
7
|
|
|
8
8
|
import gpuhunt
|
|
@@ -25,7 +25,12 @@ from dstack._internal.core.models.configurations import (
|
|
|
25
25
|
DevEnvironmentConfiguration,
|
|
26
26
|
)
|
|
27
27
|
from dstack._internal.core.models.envs import Env
|
|
28
|
-
from dstack._internal.core.models.fleets import FleetConfiguration, FleetSpec, FleetStatus
|
|
28
|
+
from dstack._internal.core.models.fleets import (
|
|
29
|
+
FleetConfiguration,
|
|
30
|
+
FleetSpec,
|
|
31
|
+
FleetStatus,
|
|
32
|
+
InstanceGroupPlacement,
|
|
33
|
+
)
|
|
29
34
|
from dstack._internal.core.models.gateways import GatewayComputeConfiguration, GatewayStatus
|
|
30
35
|
from dstack._internal.core.models.instances import (
|
|
31
36
|
Disk,
|
|
@@ -51,7 +56,7 @@ from dstack._internal.core.models.profiles import (
|
|
|
51
56
|
)
|
|
52
57
|
from dstack._internal.core.models.repos.base import RepoType
|
|
53
58
|
from dstack._internal.core.models.repos.local import LocalRunRepoData
|
|
54
|
-
from dstack._internal.core.models.resources import Memory, Range, ResourcesSpec
|
|
59
|
+
from dstack._internal.core.models.resources import CPUSpec, Memory, Range, ResourcesSpec
|
|
55
60
|
from dstack._internal.core.models.runs import (
|
|
56
61
|
JobProvisioningData,
|
|
57
62
|
JobRuntimeData,
|
|
@@ -497,10 +502,12 @@ def get_fleet_spec(conf: Optional[FleetConfiguration] = None) -> FleetSpec:
|
|
|
497
502
|
def get_fleet_configuration(
|
|
498
503
|
name: str = "test-fleet",
|
|
499
504
|
nodes: Range[int] = Range(min=1, max=1),
|
|
505
|
+
placement: Optional[InstanceGroupPlacement] = None,
|
|
500
506
|
) -> FleetConfiguration:
|
|
501
507
|
return FleetConfiguration(
|
|
502
508
|
name=name,
|
|
503
509
|
nodes=nodes,
|
|
510
|
+
placement=placement,
|
|
504
511
|
)
|
|
505
512
|
|
|
506
513
|
|
|
@@ -519,13 +526,13 @@ async def create_instance(
|
|
|
519
526
|
instance_id: Optional[UUID] = None,
|
|
520
527
|
job: Optional[JobModel] = None,
|
|
521
528
|
instance_num: int = 0,
|
|
522
|
-
backend: BackendType = BackendType.DATACRUNCH,
|
|
529
|
+
backend: Optional[BackendType] = BackendType.DATACRUNCH,
|
|
523
530
|
termination_policy: Optional[TerminationPolicy] = None,
|
|
524
531
|
termination_idle_time: int = DEFAULT_FLEET_TERMINATION_IDLE_TIME,
|
|
525
|
-
region: str = "eu-west",
|
|
532
|
+
region: Optional[str] = "eu-west",
|
|
526
533
|
remote_connection_info: Optional[RemoteConnectionInfo] = None,
|
|
527
|
-
offer: Optional[InstanceOfferWithAvailability] = None,
|
|
528
|
-
job_provisioning_data: Optional[JobProvisioningData] = None,
|
|
534
|
+
offer: Optional[Union[InstanceOfferWithAvailability, Literal["auto"]]] = "auto",
|
|
535
|
+
job_provisioning_data: Optional[Union[JobProvisioningData, Literal["auto"]]] = "auto",
|
|
529
536
|
total_blocks: Optional[int] = 1,
|
|
530
537
|
busy_blocks: int = 0,
|
|
531
538
|
name: str = "test_instance",
|
|
@@ -534,7 +541,7 @@ async def create_instance(
|
|
|
534
541
|
) -> InstanceModel:
|
|
535
542
|
if instance_id is None:
|
|
536
543
|
instance_id = uuid.uuid4()
|
|
537
|
-
if job_provisioning_data is None:
|
|
544
|
+
if job_provisioning_data == "auto":
|
|
538
545
|
job_provisioning_data = get_job_provisioning_data(
|
|
539
546
|
dockerized=True,
|
|
540
547
|
backend=backend,
|
|
@@ -543,13 +550,13 @@ async def create_instance(
|
|
|
543
550
|
hostname="running_instance.ip",
|
|
544
551
|
internal_ip=None,
|
|
545
552
|
)
|
|
546
|
-
if offer is None:
|
|
553
|
+
if offer == "auto":
|
|
547
554
|
offer = get_instance_offer_with_availability(backend=backend, region=region, spot=spot)
|
|
548
555
|
if profile is None:
|
|
549
556
|
profile = Profile(name="test_name")
|
|
550
557
|
|
|
551
558
|
if requirements is None:
|
|
552
|
-
requirements = Requirements(resources=ResourcesSpec(cpu=1))
|
|
559
|
+
requirements = Requirements(resources=ResourcesSpec(cpu=CPUSpec.parse("1")))
|
|
553
560
|
|
|
554
561
|
if instance_configuration is None:
|
|
555
562
|
instance_configuration = get_instance_configuration()
|
|
@@ -571,8 +578,8 @@ async def create_instance(
|
|
|
571
578
|
created_at=created_at,
|
|
572
579
|
started_at=created_at,
|
|
573
580
|
finished_at=finished_at,
|
|
574
|
-
job_provisioning_data=job_provisioning_data.json(),
|
|
575
|
-
offer=offer.json(),
|
|
581
|
+
job_provisioning_data=job_provisioning_data.json() if job_provisioning_data else None,
|
|
582
|
+
offer=offer.json() if offer else None,
|
|
576
583
|
price=price,
|
|
577
584
|
region=region,
|
|
578
585
|
backend=backend,
|
|
@@ -659,20 +666,7 @@ def get_remote_connection_info(
|
|
|
659
666
|
env: Optional[Union[Env, dict]] = None,
|
|
660
667
|
):
|
|
661
668
|
if ssh_keys is None:
|
|
662
|
-
ssh_keys = [
|
|
663
|
-
SSHKey(
|
|
664
|
-
public="ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIO6mJxVbNtm0zXgMLvByrhXJCmJRveSrJxLB5/OzcyCk",
|
|
665
|
-
private="""
|
|
666
|
-
-----BEGIN OPENSSH PRIVATE KEY-----
|
|
667
|
-
b3BlbnNzaC1rZXktdjEAAAAABG5vbmUAAAAEbm9uZQAAAAAAAAABAAAAMwAAAAtzc2gtZW
|
|
668
|
-
QyNTUxOQAAACDupicVWzbZtM14DC7wcq4VyQpiUb3kqycSwefzs3MgpAAAAJCiWa5Volmu
|
|
669
|
-
VQAAAAtzc2gtZWQyNTUxOQAAACDupicVWzbZtM14DC7wcq4VyQpiUb3kqycSwefzs3MgpA
|
|
670
|
-
AAAEAncHi4AhS6XdMp5Gzd+IMse/4ekyQ54UngByf0Sp0uH+6mJxVbNtm0zXgMLvByrhXJ
|
|
671
|
-
CmJRveSrJxLB5/OzcyCkAAAACWRlZkBkZWZwYwECAwQ=
|
|
672
|
-
-----END OPENSSH PRIVATE KEY-----
|
|
673
|
-
""",
|
|
674
|
-
)
|
|
675
|
-
]
|
|
669
|
+
ssh_keys = [get_ssh_key()]
|
|
676
670
|
if env is None:
|
|
677
671
|
env = Env()
|
|
678
672
|
elif isinstance(env, dict):
|
|
@@ -686,6 +680,21 @@ def get_remote_connection_info(
|
|
|
686
680
|
)
|
|
687
681
|
|
|
688
682
|
|
|
683
|
+
def get_ssh_key() -> SSHKey:
|
|
684
|
+
return SSHKey(
|
|
685
|
+
public="ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIO6mJxVbNtm0zXgMLvByrhXJCmJRveSrJxLB5/OzcyCk",
|
|
686
|
+
private="""
|
|
687
|
+
-----BEGIN OPENSSH PRIVATE KEY-----
|
|
688
|
+
b3BlbnNzaC1rZXktdjEAAAAABG5vbmUAAAAEbm9uZQAAAAAAAAABAAAAMwAAAAtzc2gtZW
|
|
689
|
+
QyNTUxOQAAACDupicVWzbZtM14DC7wcq4VyQpiUb3kqycSwefzs3MgpAAAAJCiWa5Volmu
|
|
690
|
+
VQAAAAtzc2gtZWQyNTUxOQAAACDupicVWzbZtM14DC7wcq4VyQpiUb3kqycSwefzs3MgpA
|
|
691
|
+
AAAEAncHi4AhS6XdMp5Gzd+IMse/4ekyQ54UngByf0Sp0uH+6mJxVbNtm0zXgMLvByrhXJ
|
|
692
|
+
CmJRveSrJxLB5/OzcyCkAAAACWRlZkBkZWZwYwECAwQ=
|
|
693
|
+
-----END OPENSSH PRIVATE KEY-----
|
|
694
|
+
""",
|
|
695
|
+
)
|
|
696
|
+
|
|
697
|
+
|
|
689
698
|
async def create_volume(
|
|
690
699
|
session: AsyncSession,
|
|
691
700
|
project: ProjectModel,
|
dstack/_internal/utils/common.py
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import asyncio
|
|
2
|
+
import enum
|
|
2
3
|
import itertools
|
|
3
4
|
import re
|
|
4
5
|
import time
|
|
@@ -83,6 +84,8 @@ def pretty_date(time: datetime) -> str:
|
|
|
83
84
|
|
|
84
85
|
|
|
85
86
|
def pretty_resources(
|
|
87
|
+
*,
|
|
88
|
+
cpu_arch: Optional[Any] = None,
|
|
86
89
|
cpus: Optional[Any] = None,
|
|
87
90
|
memory: Optional[Any] = None,
|
|
88
91
|
gpu_count: Optional[Any] = None,
|
|
@@ -110,7 +113,16 @@ def pretty_resources(
|
|
|
110
113
|
"""
|
|
111
114
|
parts = []
|
|
112
115
|
if cpus is not None:
|
|
113
|
-
|
|
116
|
+
cpu_arch_lower: Optional[str] = None
|
|
117
|
+
if isinstance(cpu_arch, enum.Enum):
|
|
118
|
+
cpu_arch_lower = str(cpu_arch.value).lower()
|
|
119
|
+
elif isinstance(cpu_arch, str):
|
|
120
|
+
cpu_arch_lower = cpu_arch.lower()
|
|
121
|
+
if cpu_arch_lower == "arm":
|
|
122
|
+
cpu_arch_prefix = "arm:"
|
|
123
|
+
else:
|
|
124
|
+
cpu_arch_prefix = ""
|
|
125
|
+
parts.append(f"cpu={cpu_arch_prefix}{cpus}")
|
|
114
126
|
if memory is not None:
|
|
115
127
|
parts.append(f"mem={memory}")
|
|
116
128
|
if disk_size:
|
|
@@ -1,6 +1,9 @@
|
|
|
1
1
|
def add_extra_schema_types(schema_property: dict, extra_types: list[dict]):
|
|
2
2
|
if "allOf" in schema_property:
|
|
3
|
-
|
|
3
|
+
refs = [schema_property.pop("allOf")[0]]
|
|
4
|
+
elif "anyOf" in schema_property:
|
|
5
|
+
refs = schema_property.pop("anyOf")
|
|
4
6
|
else:
|
|
5
|
-
|
|
6
|
-
|
|
7
|
+
refs = [{"type": schema_property.pop("type")}]
|
|
8
|
+
refs.extend(extra_types)
|
|
9
|
+
schema_property["anyOf"] = refs
|
dstack/api/__init__.py
CHANGED
|
@@ -14,6 +14,7 @@ from dstack._internal.core.models.repos.local import LocalRepo
|
|
|
14
14
|
from dstack._internal.core.models.repos.remote import RemoteRepo
|
|
15
15
|
from dstack._internal.core.models.repos.virtual import VirtualRepo
|
|
16
16
|
from dstack._internal.core.models.resources import ComputeCapability, Memory, Range
|
|
17
|
+
from dstack._internal.core.models.resources import CPUSpec as CPU
|
|
17
18
|
from dstack._internal.core.models.resources import DiskSpec as Disk
|
|
18
19
|
from dstack._internal.core.models.resources import GPUSpec as GPU
|
|
19
20
|
from dstack._internal.core.models.resources import ResourcesSpec as Resources
|
dstack/api/server/_fleets.py
CHANGED
|
@@ -3,6 +3,7 @@ from typing import Any, Dict, List, Optional, Union
|
|
|
3
3
|
from pydantic import parse_obj_as
|
|
4
4
|
|
|
5
5
|
from dstack._internal.core.models.fleets import ApplyFleetPlanInput, Fleet, FleetPlan, FleetSpec
|
|
6
|
+
from dstack._internal.core.models.instances import Instance
|
|
6
7
|
from dstack._internal.server.schemas.fleets import (
|
|
7
8
|
ApplyFleetPlanRequest,
|
|
8
9
|
CreateFleetRequest,
|
|
@@ -83,9 +84,24 @@ def _get_apply_plan_excludes(plan_input: ApplyFleetPlanInput) -> Dict:
|
|
|
83
84
|
spec_excludes = _get_fleet_spec_excludes(plan_input.spec)
|
|
84
85
|
if spec_excludes:
|
|
85
86
|
apply_plan_excludes["spec"] = apply_plan_excludes
|
|
87
|
+
current_resource = plan_input.current_resource
|
|
88
|
+
if current_resource is not None:
|
|
89
|
+
current_resource_excludes = {}
|
|
90
|
+
apply_plan_excludes["current_resource"] = current_resource_excludes
|
|
91
|
+
if all(map(_should_exclude_instance_cpu_arch, current_resource.instances)):
|
|
92
|
+
current_resource_excludes["instances"] = {
|
|
93
|
+
"__all__": {"instance_type": {"resources": {"cpu_arch"}}}
|
|
94
|
+
}
|
|
86
95
|
return {"plan": apply_plan_excludes}
|
|
87
96
|
|
|
88
97
|
|
|
98
|
+
def _should_exclude_instance_cpu_arch(instance: Instance) -> bool:
|
|
99
|
+
try:
|
|
100
|
+
return instance.instance_type.resources.cpu_arch is None
|
|
101
|
+
except AttributeError:
|
|
102
|
+
return True
|
|
103
|
+
|
|
104
|
+
|
|
89
105
|
def _get_create_fleet_excludes(fleet_spec: FleetSpec) -> Dict:
|
|
90
106
|
create_fleet_excludes = {}
|
|
91
107
|
spec_excludes = _get_fleet_spec_excludes(fleet_spec)
|
dstack/api/server/_runs.py
CHANGED
|
@@ -7,6 +7,7 @@ from pydantic import parse_obj_as
|
|
|
7
7
|
from dstack._internal.core.models.configurations import ServiceConfiguration
|
|
8
8
|
from dstack._internal.core.models.runs import (
|
|
9
9
|
ApplyRunPlanInput,
|
|
10
|
+
JobSubmission,
|
|
10
11
|
Run,
|
|
11
12
|
RunPlan,
|
|
12
13
|
RunSpec,
|
|
@@ -96,13 +97,53 @@ def _get_apply_plan_excludes(plan: ApplyRunPlanInput) -> Optional[Dict]:
|
|
|
96
97
|
run_spec_excludes = _get_run_spec_excludes(plan.run_spec)
|
|
97
98
|
if run_spec_excludes is not None:
|
|
98
99
|
apply_plan_excludes["run_spec"] = run_spec_excludes
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
100
|
+
current_resource = plan.current_resource
|
|
101
|
+
if current_resource is not None:
|
|
102
|
+
current_resource_excludes = {}
|
|
103
|
+
apply_plan_excludes["current_resource"] = current_resource_excludes
|
|
104
|
+
current_resource_excludes["run_spec"] = _get_run_spec_excludes(current_resource.run_spec)
|
|
105
|
+
job_submissions_excludes = {}
|
|
106
|
+
current_resource_excludes["jobs"] = {
|
|
107
|
+
"__all__": {"job_submissions": {"__all__": job_submissions_excludes}}
|
|
102
108
|
}
|
|
109
|
+
job_submissions = [js for j in current_resource.jobs for js in j.job_submissions]
|
|
110
|
+
if all(map(_should_exclude_job_submission_jpd_cpu_arch, job_submissions)):
|
|
111
|
+
job_submissions_excludes["job_provisioning_data"] = {
|
|
112
|
+
"instance_type": {"resources": {"cpu_arch"}}
|
|
113
|
+
}
|
|
114
|
+
if all(map(_should_exclude_job_submission_jrd_cpu_arch, job_submissions)):
|
|
115
|
+
job_submissions_excludes["job_runtime_data"] = {
|
|
116
|
+
"offer": {"instance": {"resources": {"cpu_arch"}}}
|
|
117
|
+
}
|
|
118
|
+
latest_job_submission = current_resource.latest_job_submission
|
|
119
|
+
if latest_job_submission is not None:
|
|
120
|
+
latest_job_submission_excludes = {}
|
|
121
|
+
current_resource_excludes["latest_job_submission"] = latest_job_submission_excludes
|
|
122
|
+
if _should_exclude_job_submission_jpd_cpu_arch(latest_job_submission):
|
|
123
|
+
latest_job_submission_excludes["job_provisioning_data"] = {
|
|
124
|
+
"instance_type": {"resources": {"cpu_arch"}}
|
|
125
|
+
}
|
|
126
|
+
if _should_exclude_job_submission_jrd_cpu_arch(latest_job_submission):
|
|
127
|
+
latest_job_submission_excludes["job_runtime_data"] = {
|
|
128
|
+
"offer": {"instance": {"resources": {"cpu_arch"}}}
|
|
129
|
+
}
|
|
103
130
|
return {"plan": apply_plan_excludes}
|
|
104
131
|
|
|
105
132
|
|
|
133
|
+
def _should_exclude_job_submission_jpd_cpu_arch(job_submission: JobSubmission) -> bool:
|
|
134
|
+
try:
|
|
135
|
+
return job_submission.job_provisioning_data.instance_type.resources.cpu_arch is None
|
|
136
|
+
except AttributeError:
|
|
137
|
+
return True
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
def _should_exclude_job_submission_jrd_cpu_arch(job_submission: JobSubmission) -> bool:
|
|
141
|
+
try:
|
|
142
|
+
return job_submission.job_runtime_data.offer.instance.resources.cpu_arch is None
|
|
143
|
+
except AttributeError:
|
|
144
|
+
return True
|
|
145
|
+
|
|
146
|
+
|
|
106
147
|
def _get_get_plan_excludes(request: GetRunPlanRequest) -> Optional[Dict]:
|
|
107
148
|
"""
|
|
108
149
|
Excludes new fields when they are not set to keep
|
dstack/version.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: dstack
|
|
3
|
-
Version: 0.19.7
|
|
3
|
+
Version: 0.19.8
|
|
4
4
|
Summary: dstack is an open-source orchestration engine for running AI workloads on any cloud or on-premises.
|
|
5
5
|
Project-URL: Homepage, https://dstack.ai
|
|
6
6
|
Project-URL: Source, https://github.com/dstackai/dstack
|
|
@@ -49,6 +49,7 @@ Requires-Dist: asyncpg; extra == 'all'
|
|
|
49
49
|
Requires-Dist: azure-identity>=1.12.0; extra == 'all'
|
|
50
50
|
Requires-Dist: azure-mgmt-authorization>=3.0.0; extra == 'all'
|
|
51
51
|
Requires-Dist: azure-mgmt-compute>=29.1.0; extra == 'all'
|
|
52
|
+
Requires-Dist: azure-mgmt-msi>=7.0.0; extra == 'all'
|
|
52
53
|
Requires-Dist: azure-mgmt-network<28.0.0,>=23.0.0; extra == 'all'
|
|
53
54
|
Requires-Dist: azure-mgmt-resource>=22.0.0; extra == 'all'
|
|
54
55
|
Requires-Dist: azure-mgmt-subscription>=3.1.1; extra == 'all'
|
|
@@ -116,6 +117,7 @@ Requires-Dist: asyncpg; extra == 'azure'
|
|
|
116
117
|
Requires-Dist: azure-identity>=1.12.0; extra == 'azure'
|
|
117
118
|
Requires-Dist: azure-mgmt-authorization>=3.0.0; extra == 'azure'
|
|
118
119
|
Requires-Dist: azure-mgmt-compute>=29.1.0; extra == 'azure'
|
|
120
|
+
Requires-Dist: azure-mgmt-msi>=7.0.0; extra == 'azure'
|
|
119
121
|
Requires-Dist: azure-mgmt-network<28.0.0,>=23.0.0; extra == 'azure'
|
|
120
122
|
Requires-Dist: azure-mgmt-resource>=22.0.0; extra == 'azure'
|
|
121
123
|
Requires-Dist: azure-mgmt-subscription>=3.1.1; extra == 'azure'
|