dstack 0.19.6rc1__py3-none-any.whl → 0.19.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Note: this release has been flagged as potentially problematic.
- dstack/_internal/cli/services/args.py +2 -2
- dstack/_internal/cli/services/configurators/fleet.py +3 -2
- dstack/_internal/cli/services/configurators/run.py +50 -4
- dstack/_internal/cli/utils/fleet.py +3 -1
- dstack/_internal/cli/utils/run.py +25 -28
- dstack/_internal/core/backends/aws/compute.py +13 -1
- dstack/_internal/core/backends/azure/compute.py +42 -13
- dstack/_internal/core/backends/azure/configurator.py +21 -0
- dstack/_internal/core/backends/azure/models.py +9 -0
- dstack/_internal/core/backends/base/compute.py +101 -27
- dstack/_internal/core/backends/base/offers.py +13 -3
- dstack/_internal/core/backends/cudo/compute.py +2 -0
- dstack/_internal/core/backends/datacrunch/compute.py +2 -0
- dstack/_internal/core/backends/gcp/auth.py +1 -1
- dstack/_internal/core/backends/gcp/compute.py +51 -35
- dstack/_internal/core/backends/gcp/resources.py +6 -1
- dstack/_internal/core/backends/lambdalabs/compute.py +20 -8
- dstack/_internal/core/backends/local/compute.py +2 -0
- dstack/_internal/core/backends/nebius/compute.py +95 -1
- dstack/_internal/core/backends/nebius/configurator.py +11 -0
- dstack/_internal/core/backends/nebius/fabrics.py +47 -0
- dstack/_internal/core/backends/nebius/models.py +8 -0
- dstack/_internal/core/backends/nebius/resources.py +29 -0
- dstack/_internal/core/backends/oci/compute.py +2 -0
- dstack/_internal/core/backends/remote/provisioning.py +27 -2
- dstack/_internal/core/backends/template/compute.py.jinja +2 -0
- dstack/_internal/core/backends/tensordock/compute.py +2 -0
- dstack/_internal/core/backends/vastai/compute.py +2 -1
- dstack/_internal/core/backends/vultr/compute.py +5 -1
- dstack/_internal/core/errors.py +4 -0
- dstack/_internal/core/models/fleets.py +2 -0
- dstack/_internal/core/models/instances.py +4 -3
- dstack/_internal/core/models/resources.py +80 -3
- dstack/_internal/core/models/runs.py +10 -3
- dstack/_internal/core/models/volumes.py +1 -1
- dstack/_internal/server/background/tasks/process_fleets.py +4 -13
- dstack/_internal/server/background/tasks/process_instances.py +176 -55
- dstack/_internal/server/background/tasks/process_placement_groups.py +1 -1
- dstack/_internal/server/background/tasks/process_prometheus_metrics.py +5 -2
- dstack/_internal/server/background/tasks/process_submitted_jobs.py +1 -1
- dstack/_internal/server/models.py +1 -0
- dstack/_internal/server/routers/gateways.py +2 -1
- dstack/_internal/server/services/config.py +7 -2
- dstack/_internal/server/services/fleets.py +24 -26
- dstack/_internal/server/services/gateways/__init__.py +17 -2
- dstack/_internal/server/services/instances.py +0 -2
- dstack/_internal/server/services/offers.py +15 -0
- dstack/_internal/server/services/placement.py +27 -6
- dstack/_internal/server/services/plugins.py +77 -0
- dstack/_internal/server/services/resources.py +21 -0
- dstack/_internal/server/services/runs.py +41 -17
- dstack/_internal/server/services/volumes.py +10 -1
- dstack/_internal/server/testing/common.py +35 -26
- dstack/_internal/utils/common.py +22 -9
- dstack/_internal/utils/json_schema.py +6 -3
- dstack/api/__init__.py +1 -0
- dstack/api/server/__init__.py +8 -1
- dstack/api/server/_fleets.py +16 -0
- dstack/api/server/_runs.py +44 -3
- dstack/plugins/__init__.py +8 -0
- dstack/plugins/_base.py +72 -0
- dstack/plugins/_models.py +8 -0
- dstack/plugins/_utils.py +19 -0
- dstack/version.py +1 -1
- {dstack-0.19.6rc1.dist-info → dstack-0.19.8.dist-info}/METADATA +14 -2
- {dstack-0.19.6rc1.dist-info → dstack-0.19.8.dist-info}/RECORD +69 -62
- {dstack-0.19.6rc1.dist-info → dstack-0.19.8.dist-info}/WHEEL +0 -0
- {dstack-0.19.6rc1.dist-info → dstack-0.19.8.dist-info}/entry_points.txt +0 -0
- {dstack-0.19.6rc1.dist-info → dstack-0.19.8.dist-info}/licenses/LICENSE.md +0 -0
dstack/_internal/server/background/tasks/process_fleets.py

@@ -1,15 +1,16 @@
-from sqlalchemy import select, update
+from sqlalchemy import select
 from sqlalchemy.ext.asyncio import AsyncSession
 from sqlalchemy.orm import joinedload

 from dstack._internal.core.models.fleets import FleetStatus
 from dstack._internal.server.db import get_session_ctx
-from dstack._internal.server.models import FleetModel, PlacementGroupModel
+from dstack._internal.server.models import FleetModel
 from dstack._internal.server.services.fleets import (
     is_fleet_empty,
     is_fleet_in_use,
 )
 from dstack._internal.server.services.locking import get_locker
+from dstack._internal.server.services.placement import schedule_fleet_placement_groups_deletion
 from dstack._internal.utils.common import get_current_datetime
 from dstack._internal.utils.logging import get_logger
@@ -68,16 +69,6 @@ async def _autodelete_fleet(session: AsyncSession, fleet_model: FleetModel):
     fleet_model.status = FleetStatus.TERMINATED
     fleet_model.deleted = True
     fleet_model.last_processed_at = get_current_datetime()
-    await _mark_placement_groups_as_ready_for_deletion(session=session, fleet_model=fleet_model)
+    await schedule_fleet_placement_groups_deletion(session=session, fleet_id=fleet_model.id)
     await session.commit()
     logger.info("Fleet %s deleted", fleet_model.name)
-
-
-async def _mark_placement_groups_as_ready_for_deletion(
-    session: AsyncSession, fleet_model: FleetModel
-):
-    await session.execute(
-        update(PlacementGroupModel)
-        .where(PlacementGroupModel.fleet_id == fleet_model.id)
-        .values(fleet_deleted=True)
-    )
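For orientation, here is a minimal sketch of what `schedule_fleet_placement_groups_deletion` plausibly does, inferred from the helper it replaces above and from the `except_placement_group_ids` call site in process_instances.py below. The actual implementation lives in dstack/_internal/server/services/placement.py and its signature may differ.

    # Sketch only: signature and defaults are assumptions inferred from this diff.
    from typing import Optional
    from uuid import UUID

    from sqlalchemy import update
    from sqlalchemy.ext.asyncio import AsyncSession

    from dstack._internal.server.models import PlacementGroupModel


    async def schedule_fleet_placement_groups_deletion(
        session: AsyncSession,
        fleet_id: UUID,
        except_placement_group_ids: Optional[list[UUID]] = None,
    ) -> None:
        # Mark the fleet's placement groups so that the process_placement_groups
        # background task deletes them later (`fleet_deleted` acts as the
        # "to be deleted" flag; see the rename TODO in models.py below).
        stmt = (
            update(PlacementGroupModel)
            .where(PlacementGroupModel.fleet_id == fleet_id)
            .values(fleet_deleted=True)
        )
        if except_placement_group_ids:
            stmt = stmt.where(PlacementGroupModel.id.not_in(except_placement_group_ids))
        await session.execute(stmt)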
dstack/_internal/server/background/tasks/process_instances.py

@@ -19,6 +19,8 @@ from dstack._internal.core.backends import (
 from dstack._internal.core.backends.base.compute import (
     ComputeWithCreateInstanceSupport,
     ComputeWithPlacementGroupSupport,
+    GoArchType,
+    generate_unique_placement_group_name,
     get_dstack_runner_binary_path,
     get_dstack_shim_binary_path,
     get_dstack_working_dir,
@@ -26,6 +28,7 @@ from dstack._internal.core.backends.base.compute import (
     get_shim_pre_start_commands,
 )
 from dstack._internal.core.backends.remote.provisioning import (
+    detect_cpu_arch,
     get_host_info,
     get_paramiko_connection,
     get_shim_healthcheck,
@@ -39,11 +42,16 @@ from dstack._internal.core.backends.remote.provisioning import (
 from dstack._internal.core.consts import DSTACK_SHIM_HTTP_PORT

 # FIXME: ProvisioningError is a subclass of ComputeError and should not be used outside of Compute
-from dstack._internal.core.errors import BackendError, ProvisioningError
+from dstack._internal.core.errors import (
+    BackendError,
+    NotYetTerminated,
+    ProvisioningError,
+)
 from dstack._internal.core.models.backends.base import BackendType
 from dstack._internal.core.models.fleets import InstanceGroupPlacement
 from dstack._internal.core.models.instances import (
     InstanceAvailability,
+    InstanceOffer,
     InstanceOfferWithAvailability,
     InstanceRuntime,
     InstanceStatus,
@@ -51,7 +59,6 @@ from dstack._internal.core.models.instances import (
     SSHKey,
 )
 from dstack._internal.core.models.placement import (
-    PlacementGroup,
     PlacementGroupConfiguration,
     PlacementStrategy,
 )
@@ -89,8 +96,9 @@ from dstack._internal.server.services.instances import (
 from dstack._internal.server.services.locking import get_locker
 from dstack._internal.server.services.offers import is_divisible_into_blocks
 from dstack._internal.server.services.placement import (
-    get_fleet_placement_groups,
+    get_fleet_placement_group_models,
     placement_group_model_to_placement_group,
+    schedule_fleet_placement_groups_deletion,
 )
 from dstack._internal.server.services.runner import client as runner_client
 from dstack._internal.server.services.runner.client import HealthStatus
@@ -264,7 +272,7 @@ async def _add_remote(instance: InstanceModel) -> None:
         )
         deploy_timeout = 20 * 60  # 20 minutes
         result = await asyncio.wait_for(future, timeout=deploy_timeout)
-        health, host_info = result
+        health, host_info, cpu_arch = result
     except (asyncio.TimeoutError, TimeoutError) as e:
         raise ProvisioningError(f"Deploy timeout: {e}") from e
     except Exception as e:
@@ -285,7 +293,7 @@ async def _add_remote(instance: InstanceModel) -> None:
        instance.last_retry_at = get_current_datetime()
        return

-    instance_type = host_info_to_instance_type(host_info)
+    instance_type = host_info_to_instance_type(host_info, cpu_arch)
     instance_network = None
     internal_ip = None
     try:
@@ -388,7 +396,7 @@ def _deploy_instance(
     pkeys: List[PKey],
     ssh_proxy_pkeys: Optional[list[PKey]],
     authorized_keys: List[str],
-) -> Tuple[HealthStatus, Dict[str, Any]]:
+) -> Tuple[HealthStatus, Dict[str, Any], GoArchType]:
     with get_paramiko_connection(
         remote_details.ssh_user,
         remote_details.host,
@@ -399,13 +407,16 @@ def _deploy_instance(
     ) as client:
         logger.info(f"Connected to {remote_details.ssh_user} {remote_details.host}")

+        arch = detect_cpu_arch(client)
+        logger.info("%s: CPU arch is %s", remote_details.host, arch)
+
         # Execute pre start commands
-        shim_pre_start_commands = get_shim_pre_start_commands()
+        shim_pre_start_commands = get_shim_pre_start_commands(arch=arch)
         run_pre_start_commands(client, shim_pre_start_commands, authorized_keys)
         logger.debug("The script for installing dstack has been executed")

         # Upload envs
-        shim_envs = get_shim_env(authorized_keys)
+        shim_envs = get_shim_env(authorized_keys, arch=arch)
         try:
             fleet_configuration_envs = remote_details.env.as_dict()
         except ValueError as e:
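The arch plumbing above suggests `detect_cpu_arch` runs a command over the already-open paramiko connection and normalizes the result to a Go-style arch string (`GoArchType`) used to pick the matching dstack-shim/runner binaries. A plausible sketch, assuming `uname -m` is the detection mechanism; the real helper is in dstack/_internal/core/backends/remote/provisioning.py and may differ.

    # Sketch only: the command and the mapping are assumptions.
    import paramiko


    def detect_cpu_arch(client: paramiko.SSHClient) -> str:
        _, stdout, _ = client.exec_command("uname -m")
        machine = stdout.read().decode().strip()
        if machine in ("x86_64", "amd64"):
            return "amd64"  # Go-style arch name
        if machine in ("aarch64", "arm64"):
            return "arm64"
        raise ValueError(f"Unsupported CPU architecture: {machine}")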
@@ -440,7 +451,7 @@ def _deploy_instance(
             raise ProvisioningError("Cannot read HealthcheckResponse") from e
         health = runner_client.health_response_to_health_status(health_response)

-        return health, host_info
+        return health, host_info, arch


 async def _create_instance(session: AsyncSession, instance: InstanceModel) -> None:
@@ -509,11 +520,39 @@ async def _create_instance(session: AsyncSession, instance: InstanceModel) -> None:
         )
         return

+    placement_group_models = []
+    placement_group_model = None
+    if instance.fleet_id:
+        placement_group_models = await get_fleet_placement_group_models(
+            session=session,
+            fleet_id=instance.fleet_id,
+        )
+        # The placement group is determined when provisioning the master instance
+        # and used for all other instances in the fleet.
+        if not _is_fleet_master_instance(instance):
+            if placement_group_models:
+                placement_group_model = placement_group_models[0]
+                if len(placement_group_models) > 1:
+                    logger.error(
+                        (
+                            "Expected 0 or 1 placement groups associated with fleet %s, found %s."
+                            " An incorrect placement group might have been selected for instance %s"
+                        ),
+                        instance.fleet_id,
+                        len(placement_group_models),
+                        instance.name,
+                    )
+
     offers = await get_create_instance_offers(
         project=instance.project,
         profile=profile,
         requirements=requirements,
         fleet_model=instance.fleet,
+        placement_group=(
+            placement_group_model_to_placement_group(placement_group_model)
+            if placement_group_model
+            else None
+        ),
         blocks="auto" if instance.total_blocks is None else instance.total_blocks,
         exclude_not_available=True,
     )
@@ -527,12 +566,6 @@ async def _create_instance(session: AsyncSession, instance: InstanceModel) -> None:
         )
         return

-    placement_groups = []
-    if instance.fleet_id:
-        placement_groups = await get_fleet_placement_groups(
-            session=session, fleet_id=instance.fleet_id
-        )
-
     # Limit number of offers tried to prevent long-running processing
     # in case all offers fail.
     for backend, instance_offer in offers[: server_settings.MAX_OFFERS_TRIED]:
@@ -542,25 +575,28 @@ async def _create_instance(session: AsyncSession, instance: InstanceModel) -> None:
         assert isinstance(compute, ComputeWithCreateInstanceSupport)
         instance_offer = _get_instance_offer_for_instance(instance_offer, instance)
         if (
-            instance_offer.backend in BACKENDS_WITH_PLACEMENT_GROUPS_SUPPORT
+            _is_fleet_master_instance(instance)
+            and instance_offer.backend in BACKENDS_WITH_PLACEMENT_GROUPS_SUPPORT
             and instance.fleet
-            and
+            and _is_cloud_cluster(instance.fleet)
         ):
             assert isinstance(compute, ComputeWithPlacementGroupSupport)
-            placement_group_model =
-
-
-
-                name=instance_configuration.placement_group_name,
-                backend=instance_offer.backend,
-                region=instance_offer.region,
+            placement_group_model = _find_suitable_placement_group(
+                placement_groups=placement_group_models,
+                instance_offer=instance_offer,
+                compute=compute,
             )
-            if placement_group_model is
-
-
-
+            if placement_group_model is None:
+                placement_group_model = await _create_placement_group(
+                    fleet_model=instance.fleet,
+                    master_instance_offer=instance_offer,
+                    compute=compute,
+                )
+                if placement_group_model is None:  # error occurred
+                    continue
             session.add(placement_group_model)
-
+            await session.flush()
+            placement_group_models.append(placement_group_model)
         logger.debug(
             "Trying %s in %s/%s for $%0.4f per hour",
             instance_offer.instance.name,
@@ -573,6 +609,11 @@ async def _create_instance(session: AsyncSession, instance: InstanceModel) -> None:
                 compute.create_instance,
                 instance_offer,
                 instance_configuration,
+                (
+                    placement_group_model_to_placement_group(placement_group_model)
+                    if placement_group_model
+                    else None
+                ),
             )
         except BackendError as e:
             logger.warning(
@@ -612,22 +653,46 @@ async def _create_instance(session: AsyncSession, instance: InstanceModel) -> None:
                     "instance_status": InstanceStatus.PROVISIONING.value,
                 },
             )
+            if instance.fleet_id and _is_fleet_master_instance(instance):
+                # Clean up placement groups that did not end up being used
+                await schedule_fleet_placement_groups_deletion(
+                    session=session,
+                    fleet_id=instance.fleet_id,
+                    except_placement_group_ids=(
+                        [placement_group_model.id] if placement_group_model is not None else []
+                    ),
+                )
             return

     instance.last_retry_at = get_current_datetime()

     if not should_retry:
-        instance
-
-
-
-        instance.
-
-
-
-
-
-
+        _mark_terminated(instance, "All offers failed" if offers else "No offers found")
+        if (
+            instance.fleet
+            and _is_fleet_master_instance(instance)
+            and _is_cloud_cluster(instance.fleet)
+        ):
+            # Do not attempt to deploy other instances, as they won't determine the correct cluster
+            # backend, region, and placement group without a successfully deployed master instance
+            for sibling_instance in instance.fleet.instances:
+                if sibling_instance.id == instance.id:
+                    continue
+                _mark_terminated(sibling_instance, "Master instance failed to start")
+
+
+def _mark_terminated(instance: InstanceModel, termination_reason: str) -> None:
+    instance.status = InstanceStatus.TERMINATED
+    instance.termination_reason = termination_reason
+    logger.info(
+        "Terminated instance %s: %s",
+        instance.name,
+        instance.termination_reason,
+        extra={
+            "instance_name": instance.name,
+            "instance_status": InstanceStatus.TERMINATED.value,
+        },
+    )


 async def _check_instance(instance: InstanceModel) -> None:
@@ -906,12 +971,20 @@ def _need_to_wait_fleet_provisioning(instance: InstanceModel) -> bool:
     if instance.fleet is None:
         return False
     if (
-        instance.id == instance.fleet.instances[0].id
+        _is_fleet_master_instance(instance)
         or instance.fleet.instances[0].job_provisioning_data is not None
         or instance.fleet.instances[0].status == InstanceStatus.TERMINATED
     ):
         return False
-    fleet = fleet_model_to_fleet(instance.fleet)
+    return _is_cloud_cluster(instance.fleet)
+
+
+def _is_fleet_master_instance(instance: InstanceModel) -> bool:
+    return instance.fleet is not None and instance.id == instance.fleet.instances[0].id
+
+
+def _is_cloud_cluster(fleet_model: FleetModel) -> bool:
+    fleet = fleet_model_to_fleet(fleet_model)
     return (
         fleet.spec.configuration.placement == InstanceGroupPlacement.CLUSTER
         and fleet.spec.configuration.ssh_config is None
@@ -944,28 +1017,76 @@ def _get_instance_offer_for_instance(
     return instance_offer


-def
-
-
-
-    name: str,
-    backend: BackendType,
-    region: str,
+def _find_suitable_placement_group(
+    placement_groups: List[PlacementGroupModel],
+    instance_offer: InstanceOffer,
+    compute: ComputeWithPlacementGroupSupport,
 ) -> Optional[PlacementGroupModel]:
     for pg in placement_groups:
-        if
-
+        if compute.is_suitable_placement_group(
+            placement_group_model_to_placement_group(pg), instance_offer
+        ):
+            return pg
+    return None
+
+
+async def _create_placement_group(
+    fleet_model: FleetModel,
+    master_instance_offer: InstanceOffer,
+    compute: ComputeWithPlacementGroupSupport,
+) -> Optional[PlacementGroupModel]:
     placement_group_model = PlacementGroupModel(
-        name=name,
+        # TODO: generate the name in Compute.create_placement_group to allow
+        # backend-specific name length limits
+        name=generate_unique_placement_group_name(
+            project_name=fleet_model.project.name,
+            fleet_name=fleet_model.name,
+        ),
         project=fleet_model.project,
         fleet=fleet_model,
         configuration=PlacementGroupConfiguration(
-            backend=backend,
-            region=region,
+            backend=master_instance_offer.backend,
+            region=master_instance_offer.region,
             placement_strategy=PlacementStrategy.CLUSTER,
         ).json(),
     )
-
+    placement_group = placement_group_model_to_placement_group(placement_group_model)
+    logger.debug(
+        "Creating placement group %s in %s/%s",
+        placement_group.name,
+        placement_group.configuration.backend.value,
+        placement_group.configuration.region,
+    )
+    try:
+        pgpd = await run_async(
+            compute.create_placement_group,
+            placement_group_model_to_placement_group(placement_group_model),
+            master_instance_offer,
+        )
+    except BackendError as e:
+        logger.warning(
+            "Failed to create placement group %s in %s/%s: %r",
+            placement_group.name,
+            placement_group.configuration.backend.value,
+            placement_group.configuration.region,
+            e,
+        )
+        return None
+    except Exception:
+        logger.exception(
+            "Got exception when creating placement group %s in %s/%s",
+            placement_group.name,
+            placement_group.configuration.backend.value,
+            placement_group.configuration.region,
+        )
+        return None
+    logger.info(
+        "Created placement group %s in %s/%s",
+        placement_group.name,
+        placement_group.configuration.backend.value,
+        placement_group.configuration.region,
+    )
+    placement_group_model.provisioning_data = pgpd.json()
     return placement_group_model
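`generate_unique_placement_group_name` (imported above from dstack/_internal/core/backends/base/compute.py) presumably preserves the naming scheme of the `_get_placement_group_name` helper this diff removes from services/fleets.py at the end. A plausible sketch under that assumption:

    # Sketch only: mirrors the removed fleets.py helper; the real function may differ.
    import random
    import string


    def generate_unique_placement_group_name(project_name: str, fleet_name: str) -> str:
        # A random suffix to avoid clashing with to-be-deleted placement groups
        # left by old fleets (rationale carried over from the removed helper).
        suffix = "".join(random.choices(string.ascii_lowercase + string.digits, k=8))
        return f"{project_name}-{fleet_name}-{suffix}-pg"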
dstack/_internal/server/background/tasks/process_placement_groups.py

@@ -66,7 +66,7 @@ async def _delete_placement_groups(


 async def _delete_placement_group(placement_group_model: PlacementGroupModel):
-    logger.
+    logger.debug("Deleting placement group %s", placement_group_model.name)
     placement_group = placement_group_model_to_placement_group(placement_group_model)
     if placement_group.provisioning_data is None:
         logger.error(
dstack/_internal/server/background/tasks/process_prometheus_metrics.py

@@ -99,11 +99,14 @@ async def _collect_jobs_metrics(job_models: list[JobModel], collected_at: datetime):


 async def _collect_job_metrics(job_model: JobModel) -> Optional[str]:
-    ssh_private_keys = get_instance_ssh_private_keys(get_or_error(job_model.instance))
     jpd = get_job_provisioning_data(job_model)
-    jrd = get_job_runtime_data(job_model)
     if jpd is None:
         return None
+    if not jpd.dockerized:
+        # Container-based backend, no shim
+        return None
+    ssh_private_keys = get_instance_ssh_private_keys(get_or_error(job_model.instance))
+    jrd = get_job_runtime_data(job_model)
     try:
         res = await run_async(
             _pull_job_metrics,
dstack/_internal/server/background/tasks/process_submitted_jobs.py

@@ -197,7 +197,7 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
     pool_instances = list(res.unique().scalars().all())
     instances_ids = sorted([i.id for i in pool_instances])
     if get_db().dialect_name == "sqlite":
-        # Start new transaction to see
+        # Start new transaction to see committed changes after lock
         await session.commit()
     async with get_locker().lock_ctx(InstanceModel.__tablename__, instances_ids):
         # If another job freed the instance but is still trying to detach volumes,
dstack/_internal/server/models.py

@@ -659,6 +659,7 @@ class PlacementGroupModel(BaseModel):

     fleet_id: Mapped[uuid.UUID] = mapped_column(ForeignKey("fleets.id"))
     fleet: Mapped["FleetModel"] = relationship(foreign_keys=[fleet_id])
+    # TODO: rename `fleet_deleted` -> `to_be_deleted`
     fleet_deleted: Mapped[bool] = mapped_column(Boolean, default=False)

     created_at: Mapped[datetime] = mapped_column(NaiveDateTime, default=get_current_datetime)
dstack/_internal/server/routers/gateways.py

@@ -47,9 +47,10 @@ async def create_gateway(
     session: AsyncSession = Depends(get_session),
     user_project: Tuple[UserModel, ProjectModel] = Depends(ProjectAdmin()),
 ) -> models.Gateway:
-    _, project = user_project
+    user, project = user_project
     return await gateways.create_gateway(
         session=session,
+        user=user,
         project=project,
         configuration=body.configuration,
     )
dstack/_internal/server/services/config.py

@@ -29,6 +29,7 @@ from dstack._internal.server.services.permissions import (
     DefaultPermissions,
     set_default_permissions,
 )
+from dstack._internal.server.services.plugins import load_plugins
 from dstack._internal.utils.logging import get_logger

 logger = get_logger(__name__)
@@ -38,7 +39,7 @@ logger = get_logger(__name__)
 # If a collection has nested collections, it will be assigned the block style. Otherwise it will have the flow style.
 #
 # We want mapping to always be displayed in block-style but lists without nested objects in flow-style.
-# So we define a custom
+# So we define a custom representer.


 def seq_representer(dumper, sequence):
@@ -75,7 +76,10 @@ class ServerConfig(CoreModel):
     ] = None
     default_permissions: Annotated[
         Optional[DefaultPermissions], Field(description="The default user permissions")
-    ]
+    ] = None
+    plugins: Annotated[
+        Optional[List[str]], Field(description="The server-side plugins to enable")
+    ] = None


 class ServerConfigManager:
@@ -112,6 +116,7 @@ class ServerConfigManager:
            await self._apply_project_config(
                session=session, owner=owner, project_config=project_config
            )
+        load_plugins(enabled_plugins=self.config.plugins or [])

    async def _apply_project_config(
        self,
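The new `plugins` config field and `load_plugins()` call wire up the server-side plugin system added in this release (see the new dstack/plugins package in the file list). A minimal sketch of a plugin, assuming a `Plugin`/`ApplyPolicy` interface exposed by dstack/plugins/_base.py and the `apply_plugin_policies(user=..., project=..., spec=...)` call shape used in services/fleets.py below; class and hook names are assumptions, so check the package for the actual interface.

    # Sketch only: names and hooks are assumptions based on this diff.
    from dstack._internal.core.models.fleets import FleetSpec
    from dstack.plugins import ApplyPolicy, Plugin


    class ExamplePolicy(ApplyPolicy):
        # Receives a spec before validation and returns a (possibly modified) spec,
        # matching how apply_plugin_policies() is used in services/fleets.py.
        def on_fleet_apply(self, user: str, project: str, spec: FleetSpec) -> FleetSpec:
            # E.g. force a tag on every fleet (hypothetical policy).
            if spec.configuration.tags is None:
                spec.configuration.tags = {}
            spec.configuration.tags.setdefault("team", "default")
            return spec


    class ExamplePlugin(Plugin):
        def get_policies(self):
            return [ExamplePolicy()]

The `plugins` list in the server config then presumably names entry points that `load_plugins()` resolves to `Plugin` instances.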
dstack/_internal/server/services/fleets.py

@@ -1,5 +1,3 @@
-import random
-import string
 import uuid
 from datetime import datetime, timezone
 from typing import List, Literal, Optional, Tuple, Union, cast
@@ -33,6 +31,7 @@ from dstack._internal.core.models.instances import (
     SSHConnectionParams,
     SSHKey,
 )
+from dstack._internal.core.models.placement import PlacementGroup
 from dstack._internal.core.models.profiles import (
     Profile,
     SpotPolicy,
@@ -55,12 +54,14 @@ from dstack._internal.server.services.locking import (
     get_locker,
     string_to_lock_id,
 )
+from dstack._internal.server.services.plugins import apply_plugin_policies
 from dstack._internal.server.services.projects import (
     get_member,
     get_member_permissions,
     list_project_models,
     list_user_project_models,
 )
+from dstack._internal.server.services.resources import set_resources_defaults
 from dstack._internal.utils import random_names
 from dstack._internal.utils.logging import get_logger
 from dstack._internal.utils.ssh import pkey_from_str
@@ -234,7 +235,15 @@ async def get_plan(
     user: UserModel,
     spec: FleetSpec,
 ) -> FleetPlan:
+    # Spec must be copied by parsing to calculate merged_profile
     effective_spec = FleetSpec.parse_obj(spec.dict())
+    effective_spec = apply_plugin_policies(
+        user=user.name,
+        project=project.name,
+        spec=effective_spec,
+    )
+    effective_spec = FleetSpec.parse_obj(effective_spec.dict())
+    _validate_fleet_spec_and_set_defaults(spec)
     current_fleet: Optional[Fleet] = None
     current_fleet_id: Optional[uuid.UUID] = None
     if effective_spec.configuration.name is not None:
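The "copied by parsing" comment above reflects a pydantic v1 detail: derived fields such as `merged_profile` are computed by validators, which run on `parse_obj()` but not on plain attribute assignment, so the spec is round-tripped through `dict()` after plugin policies mutate it. A toy illustration (field names and the derivation are hypothetical):

    # Sketch only: demonstrates the re-parse pattern, not dstack's actual models.
    from pydantic import BaseModel, root_validator  # pydantic v1, as used by dstack


    class Spec(BaseModel):
        profile_name: str
        merged_profile: str = ""

        @root_validator
        def _derive(cls, values):
            # Toy stand-in for the real merged_profile computation
            values["merged_profile"] = values["profile_name"].lower()
            return values


    spec = Spec(profile_name="Team-A")
    spec.profile_name = "Team-B"             # plugin policies mutate attributes like this
    assert spec.merged_profile == "team-a"   # stale: validators don't re-run on assignment
    spec = Spec.parse_obj(spec.dict())       # round-trip re-runs validators
    assert spec.merged_profile == "team-b"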
@@ -274,6 +283,7 @@ async def get_create_instance_offers(
     project: ProjectModel,
     profile: Profile,
     requirements: Requirements,
+    placement_group: Optional[PlacementGroup] = None,
     fleet_spec: Optional[FleetSpec] = None,
     fleet_model: Optional[FleetModel] = None,
     blocks: Union[int, Literal["auto"]] = 1,
@@ -299,6 +309,7 @@ async def get_create_instance_offers(
         exclude_not_available=exclude_not_available,
         multinode=multinode,
         master_job_provisioning_data=master_job_provisioning_data,
+        placement_group=placement_group,
         blocks=blocks,
     )
     offers = [
@@ -330,7 +341,14 @@ async def create_fleet(
     user: UserModel,
     spec: FleetSpec,
 ) -> Fleet:
-    _validate_fleet_spec(spec)
+    # Spec must be copied by parsing to calculate merged_profile
+    spec = apply_plugin_policies(
+        user=user.name,
+        project=project.name,
+        spec=spec,
+    )
+    spec = FleetSpec.parse_obj(spec.dict())
+    _validate_fleet_spec_and_set_defaults(spec)

     if spec.configuration.ssh_config is not None:
         _check_can_manage_ssh_fleets(user=user, project=project)
@@ -378,17 +396,12 @@ async def create_fleet(
         )
         fleet_model.instances.append(instances_model)
     else:
-        placement_group_name = _get_placement_group_name(
-            project=project,
-            fleet_spec=spec,
-        )
         for i in range(_get_fleet_nodes_to_provision(spec)):
             instance_model = await create_fleet_instance_model(
                 session=session,
                 project=project,
                 user=user,
                 spec=spec,
-                placement_group_name=placement_group_name,
                 reservation=spec.configuration.reservation,
                 instance_num=i,
             )
@@ -402,7 +415,6 @@ async def create_fleet_instance_model(
     project: ProjectModel,
     user: UserModel,
     spec: FleetSpec,
-    placement_group_name: Optional[str],
     reservation: Optional[str],
     instance_num: int,
 ) -> InstanceModel:
@@ -416,7 +428,6 @@ async def create_fleet_instance_model(
         requirements=requirements,
         instance_name=f"{spec.configuration.name}-{instance_num}",
         instance_num=instance_num,
-        placement_group_name=placement_group_name,
         reservation=reservation,
         blocks=spec.configuration.blocks,
         tags=spec.configuration.tags,
@@ -637,7 +648,7 @@ def _remove_fleet_spec_sensitive_info(spec: FleetSpec):
         host.ssh_key = None


-def _validate_fleet_spec(spec: FleetSpec):
+def _validate_fleet_spec_and_set_defaults(spec: FleetSpec):
     if spec.configuration.name is not None:
         validate_dstack_resource_name(spec.configuration.name)
     if spec.configuration.ssh_config is None and spec.configuration.nodes is None:
@@ -650,6 +661,8 @@ def _validate_fleet_spec(spec: FleetSpec):
         if isinstance(host, SSHHostParams) and host.ssh_key is not None:
             _validate_ssh_key(host.ssh_key)
         _validate_internal_ips(spec.configuration.ssh_config)
+    if spec.configuration.resources is not None:
+        set_resources_defaults(spec.configuration.resources)


 def _validate_all_ssh_params_specified(ssh_config: SSHParams):
@@ -720,18 +733,3 @@ def _get_fleet_requirements(fleet_spec: FleetSpec) -> Requirements:
         reservation=fleet_spec.configuration.reservation,
     )
     return requirements
-
-
-def _get_placement_group_name(
-    project: ProjectModel,
-    fleet_spec: FleetSpec,
-) -> Optional[str]:
-    if fleet_spec.configuration.placement != InstanceGroupPlacement.CLUSTER:
-        return None
-    # A random suffix to avoid clashing with to-be-deleted placement groups left by old fleets
-    suffix = _generate_random_placement_group_suffix()
-    return f"{project.name}-{fleet_spec.configuration.name}-{suffix}-pg"
-
-
-def _generate_random_placement_group_suffix(length: int = 8) -> str:
-    return "".join(random.choice(string.ascii_lowercase + string.digits) for _ in range(length))