dstack 0.19.7__py3-none-any.whl → 0.19.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dstack/_internal/cli/services/args.py +2 -2
- dstack/_internal/cli/services/configurators/run.py +56 -13
- dstack/_internal/cli/utils/run.py +10 -5
- dstack/_internal/core/backends/aws/compute.py +13 -1
- dstack/_internal/core/backends/azure/compute.py +42 -13
- dstack/_internal/core/backends/azure/configurator.py +21 -0
- dstack/_internal/core/backends/azure/models.py +9 -0
- dstack/_internal/core/backends/base/compute.py +101 -27
- dstack/_internal/core/backends/base/offers.py +13 -3
- dstack/_internal/core/backends/cudo/compute.py +3 -1
- dstack/_internal/core/backends/datacrunch/compute.py +2 -0
- dstack/_internal/core/backends/gcp/auth.py +1 -1
- dstack/_internal/core/backends/gcp/compute.py +51 -35
- dstack/_internal/core/backends/lambdalabs/compute.py +20 -8
- dstack/_internal/core/backends/local/compute.py +2 -0
- dstack/_internal/core/backends/nebius/compute.py +95 -1
- dstack/_internal/core/backends/nebius/configurator.py +11 -0
- dstack/_internal/core/backends/nebius/fabrics.py +48 -0
- dstack/_internal/core/backends/nebius/models.py +9 -1
- dstack/_internal/core/backends/nebius/resources.py +29 -0
- dstack/_internal/core/backends/oci/compute.py +2 -0
- dstack/_internal/core/backends/remote/provisioning.py +27 -2
- dstack/_internal/core/backends/template/compute.py.jinja +2 -0
- dstack/_internal/core/backends/tensordock/compute.py +2 -0
- dstack/_internal/core/backends/vultr/compute.py +5 -1
- dstack/_internal/core/models/instances.py +2 -1
- dstack/_internal/core/models/resources.py +79 -4
- dstack/_internal/core/models/runs.py +26 -9
- dstack/_internal/core/models/volumes.py +1 -1
- dstack/_internal/server/background/tasks/process_fleets.py +4 -13
- dstack/_internal/server/background/tasks/process_instances.py +176 -55
- dstack/_internal/server/background/tasks/process_metrics.py +26 -9
- dstack/_internal/server/background/tasks/process_placement_groups.py +1 -1
- dstack/_internal/server/background/tasks/process_prometheus_metrics.py +5 -2
- dstack/_internal/server/background/tasks/process_running_jobs.py +56 -18
- dstack/_internal/server/migrations/versions/20166748b60c_add_jobmodel_disconnected_at.py +100 -0
- dstack/_internal/server/migrations/versions/6c1a9d6530ee_add_jobmodel_exit_status.py +26 -0
- dstack/_internal/server/models.py +6 -1
- dstack/_internal/server/schemas/runner.py +41 -8
- dstack/_internal/server/services/fleets.py +9 -26
- dstack/_internal/server/services/instances.py +0 -2
- dstack/_internal/server/services/jobs/__init__.py +1 -0
- dstack/_internal/server/services/offers.py +15 -0
- dstack/_internal/server/services/placement.py +27 -6
- dstack/_internal/server/services/resources.py +21 -0
- dstack/_internal/server/services/runner/client.py +7 -4
- dstack/_internal/server/services/runs.py +18 -8
- dstack/_internal/server/settings.py +20 -1
- dstack/_internal/server/testing/common.py +37 -26
- dstack/_internal/utils/common.py +13 -1
- dstack/_internal/utils/json_schema.py +6 -3
- dstack/api/__init__.py +1 -0
- dstack/api/server/_fleets.py +16 -0
- dstack/api/server/_runs.py +48 -3
- dstack/version.py +1 -1
- {dstack-0.19.7.dist-info → dstack-0.19.9.dist-info}/METADATA +38 -29
- {dstack-0.19.7.dist-info → dstack-0.19.9.dist-info}/RECORD +60 -56
- {dstack-0.19.7.dist-info → dstack-0.19.9.dist-info}/WHEEL +0 -0
- {dstack-0.19.7.dist-info → dstack-0.19.9.dist-info}/entry_points.txt +0 -0
- {dstack-0.19.7.dist-info → dstack-0.19.9.dist-info}/licenses/LICENSE.md +0 -0

dstack/_internal/server/background/tasks/process_instances.py

@@ -19,6 +19,8 @@ from dstack._internal.core.backends import (
 from dstack._internal.core.backends.base.compute import (
     ComputeWithCreateInstanceSupport,
     ComputeWithPlacementGroupSupport,
+    GoArchType,
+    generate_unique_placement_group_name,
     get_dstack_runner_binary_path,
     get_dstack_shim_binary_path,
     get_dstack_working_dir,
@@ -26,6 +28,7 @@ from dstack._internal.core.backends.base.compute import (
     get_shim_pre_start_commands,
 )
 from dstack._internal.core.backends.remote.provisioning import (
+    detect_cpu_arch,
     get_host_info,
     get_paramiko_connection,
     get_shim_healthcheck,
@@ -39,11 +42,16 @@ from dstack._internal.core.backends.remote.provisioning import (
 from dstack._internal.core.consts import DSTACK_SHIM_HTTP_PORT

 # FIXME: ProvisioningError is a subclass of ComputeError and should not be used outside of Compute
-from dstack._internal.core.errors import BackendError, ProvisioningError
+from dstack._internal.core.errors import (
+    BackendError,
+    NotYetTerminated,
+    ProvisioningError,
+)
 from dstack._internal.core.models.backends.base import BackendType
 from dstack._internal.core.models.fleets import InstanceGroupPlacement
 from dstack._internal.core.models.instances import (
     InstanceAvailability,
+    InstanceOffer,
     InstanceOfferWithAvailability,
     InstanceRuntime,
     InstanceStatus,
@@ -51,7 +59,6 @@ from dstack._internal.core.models.instances import (
     SSHKey,
 )
 from dstack._internal.core.models.placement import (
-    PlacementGroup,
     PlacementGroupConfiguration,
     PlacementStrategy,
 )
@@ -89,8 +96,9 @@ from dstack._internal.server.services.instances import (
 from dstack._internal.server.services.locking import get_locker
 from dstack._internal.server.services.offers import is_divisible_into_blocks
 from dstack._internal.server.services.placement import (
-    get_fleet_placement_groups,
+    get_fleet_placement_group_models,
     placement_group_model_to_placement_group,
+    schedule_fleet_placement_groups_deletion,
 )
 from dstack._internal.server.services.runner import client as runner_client
 from dstack._internal.server.services.runner.client import HealthStatus
@@ -264,7 +272,7 @@ async def _add_remote(instance: InstanceModel) -> None:
         )
         deploy_timeout = 20 * 60  # 20 minutes
         result = await asyncio.wait_for(future, timeout=deploy_timeout)
-        health, host_info = result
+        health, host_info, cpu_arch = result
     except (asyncio.TimeoutError, TimeoutError) as e:
         raise ProvisioningError(f"Deploy timeout: {e}") from e
     except Exception as e:
@@ -285,7 +293,7 @@ async def _add_remote(instance: InstanceModel) -> None:
         instance.last_retry_at = get_current_datetime()
         return

-    instance_type = host_info_to_instance_type(host_info)
+    instance_type = host_info_to_instance_type(host_info, cpu_arch)
     instance_network = None
     internal_ip = None
     try:
@@ -388,7 +396,7 @@ def _deploy_instance(
     pkeys: List[PKey],
     ssh_proxy_pkeys: Optional[list[PKey]],
     authorized_keys: List[str],
-) -> Tuple[HealthStatus, Dict[str, Any]]:
+) -> Tuple[HealthStatus, Dict[str, Any], GoArchType]:
     with get_paramiko_connection(
         remote_details.ssh_user,
         remote_details.host,
@@ -399,13 +407,16 @@ def _deploy_instance(
     ) as client:
         logger.info(f"Connected to {remote_details.ssh_user} {remote_details.host}")

+        arch = detect_cpu_arch(client)
+        logger.info("%s: CPU arch is %s", remote_details.host, arch)
+
         # Execute pre start commands
-        shim_pre_start_commands = get_shim_pre_start_commands()
+        shim_pre_start_commands = get_shim_pre_start_commands(arch=arch)
         run_pre_start_commands(client, shim_pre_start_commands, authorized_keys)
         logger.debug("The script for installing dstack has been executed")

         # Upload envs
-        shim_envs = get_shim_env(authorized_keys)
+        shim_envs = get_shim_env(authorized_keys, arch=arch)
         try:
             fleet_configuration_envs = remote_details.env.as_dict()
         except ValueError as e:
@@ -440,7 +451,7 @@ def _deploy_instance(
             raise ProvisioningError("Cannot read HealthcheckResponse") from e
         health = runner_client.health_response_to_health_status(health_response)

-        return health, host_info
+        return health, host_info, arch


 async def _create_instance(session: AsyncSession, instance: InstanceModel) -> None:
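
Note: the hunks above thread a detected CPU architecture through SSH-fleet provisioning, which appears intended to let the shim bootstrap pick architecture-appropriate binaries on both x86_64 and ARM hosts. The sketch below only illustrates the kind of mapping such a helper performs; the name `detect_cpu_arch_sketch` and the returned strings are our assumptions for illustration, since the real `detect_cpu_arch` in `dstack/_internal/core/backends/remote/provisioning.py` is not shown in these hunks.

# Illustrative sketch only: map `uname -m` output to a Go-style arch string.
# dstack's real detect_cpu_arch() runs over an SSH connection and may use
# different names and error handling.
def detect_cpu_arch_sketch(uname_machine: str) -> str:
    machine = uname_machine.strip().lower()
    if machine in ("x86_64", "amd64"):
        return "amd64"
    if machine in ("aarch64", "arm64"):
        return "arm64"
    raise ValueError(f"unsupported CPU architecture: {machine}")


assert detect_cpu_arch_sketch("x86_64") == "amd64"
assert detect_cpu_arch_sketch("aarch64\n") == "arm64"
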
@@ -509,11 +520,39 @@ async def _create_instance(session: AsyncSession, instance: InstanceModel) -> None:
         )
         return

+    placement_group_models = []
+    placement_group_model = None
+    if instance.fleet_id:
+        placement_group_models = await get_fleet_placement_group_models(
+            session=session,
+            fleet_id=instance.fleet_id,
+        )
+        # The placement group is determined when provisioning the master instance
+        # and used for all other instances in the fleet.
+        if not _is_fleet_master_instance(instance):
+            if placement_group_models:
+                placement_group_model = placement_group_models[0]
+                if len(placement_group_models) > 1:
+                    logger.error(
+                        (
+                            "Expected 0 or 1 placement groups associated with fleet %s, found %s."
+                            " An incorrect placement group might have been selected for instance %s"
+                        ),
+                        instance.fleet_id,
+                        len(placement_group_models),
+                        instance.name,
+                    )
+
     offers = await get_create_instance_offers(
         project=instance.project,
         profile=profile,
         requirements=requirements,
         fleet_model=instance.fleet,
+        placement_group=(
+            placement_group_model_to_placement_group(placement_group_model)
+            if placement_group_model
+            else None
+        ),
         blocks="auto" if instance.total_blocks is None else instance.total_blocks,
         exclude_not_available=True,
     )
@@ -527,12 +566,6 @@ async def _create_instance(session: AsyncSession, instance: InstanceModel) -> None:
         )
         return

-    placement_groups = []
-    if instance.fleet_id:
-        placement_groups = await get_fleet_placement_groups(
-            session=session, fleet_id=instance.fleet_id
-        )
-
     # Limit number of offers tried to prevent long-running processing
     # in case all offers fail.
     for backend, instance_offer in offers[: server_settings.MAX_OFFERS_TRIED]:
@@ -542,25 +575,28 @@ async def _create_instance(session: AsyncSession, instance: InstanceModel) -> None:
         assert isinstance(compute, ComputeWithCreateInstanceSupport)
         instance_offer = _get_instance_offer_for_instance(instance_offer, instance)
         if (
-
+            _is_fleet_master_instance(instance)
+            and instance_offer.backend in BACKENDS_WITH_PLACEMENT_GROUPS_SUPPORT
             and instance.fleet
-            and
+            and _is_cloud_cluster(instance.fleet)
         ):
             assert isinstance(compute, ComputeWithPlacementGroupSupport)
-            placement_group_model =
-
-
-
-                name=instance_configuration.placement_group_name,
-                backend=instance_offer.backend,
-                region=instance_offer.region,
+            placement_group_model = _find_suitable_placement_group(
+                placement_groups=placement_group_models,
+                instance_offer=instance_offer,
+                compute=compute,
             )
-            if placement_group_model is
-
-
-
+            if placement_group_model is None:
+                placement_group_model = await _create_placement_group(
+                    fleet_model=instance.fleet,
+                    master_instance_offer=instance_offer,
+                    compute=compute,
+                )
+                if placement_group_model is None:  # error occurred
+                    continue
             session.add(placement_group_model)
-
+            await session.flush()
+            placement_group_models.append(placement_group_model)
         logger.debug(
             "Trying %s in %s/%s for $%0.4f per hour",
             instance_offer.instance.name,
|
|
|
573
609
|
compute.create_instance,
|
|
574
610
|
instance_offer,
|
|
575
611
|
instance_configuration,
|
|
612
|
+
(
|
|
613
|
+
placement_group_model_to_placement_group(placement_group_model)
|
|
614
|
+
if placement_group_model
|
|
615
|
+
else None
|
|
616
|
+
),
|
|
576
617
|
)
|
|
577
618
|
except BackendError as e:
|
|
578
619
|
logger.warning(
|
|
@@ -612,22 +653,46 @@ async def _create_instance(session: AsyncSession, instance: InstanceModel) -> None:
                 "instance_status": InstanceStatus.PROVISIONING.value,
             },
         )
+        if instance.fleet_id and _is_fleet_master_instance(instance):
+            # Clean up placement groups that did not end up being used
+            await schedule_fleet_placement_groups_deletion(
+                session=session,
+                fleet_id=instance.fleet_id,
+                except_placement_group_ids=(
+                    [placement_group_model.id] if placement_group_model is not None else []
+                ),
+            )
         return

     instance.last_retry_at = get_current_datetime()

     if not should_retry:
-        instance
-
-
-
-        instance.
-
-
-
-
-
-
+        _mark_terminated(instance, "All offers failed" if offers else "No offers found")
+        if (
+            instance.fleet
+            and _is_fleet_master_instance(instance)
+            and _is_cloud_cluster(instance.fleet)
+        ):
+            # Do not attempt to deploy other instances, as they won't determine the correct cluster
+            # backend, region, and placement group without a successfully deployed master instance
+            for sibling_instance in instance.fleet.instances:
+                if sibling_instance.id == instance.id:
+                    continue
+                _mark_terminated(sibling_instance, "Master instance failed to start")
+
+
+def _mark_terminated(instance: InstanceModel, termination_reason: str) -> None:
+    instance.status = InstanceStatus.TERMINATED
+    instance.termination_reason = termination_reason
+    logger.info(
+        "Terminated instance %s: %s",
+        instance.name,
+        instance.termination_reason,
+        extra={
+            "instance_name": instance.name,
+            "instance_status": InstanceStatus.TERMINATED.value,
+        },
+    )


 async def _check_instance(instance: InstanceModel) -> None:
|
|
|
906
971
|
if instance.fleet is None:
|
|
907
972
|
return False
|
|
908
973
|
if (
|
|
909
|
-
instance
|
|
974
|
+
_is_fleet_master_instance(instance)
|
|
910
975
|
or instance.fleet.instances[0].job_provisioning_data is not None
|
|
911
976
|
or instance.fleet.instances[0].status == InstanceStatus.TERMINATED
|
|
912
977
|
):
|
|
913
978
|
return False
|
|
914
|
-
|
|
979
|
+
return _is_cloud_cluster(instance.fleet)
|
|
980
|
+
|
|
981
|
+
|
|
982
|
+
def _is_fleet_master_instance(instance: InstanceModel) -> bool:
|
|
983
|
+
return instance.fleet is not None and instance.id == instance.fleet.instances[0].id
|
|
984
|
+
|
|
985
|
+
|
|
986
|
+
def _is_cloud_cluster(fleet_model: FleetModel) -> bool:
|
|
987
|
+
fleet = fleet_model_to_fleet(fleet_model)
|
|
915
988
|
return (
|
|
916
989
|
fleet.spec.configuration.placement == InstanceGroupPlacement.CLUSTER
|
|
917
990
|
and fleet.spec.configuration.ssh_config is None
|
|
@@ -944,28 +1017,76 @@ def _get_instance_offer_for_instance(
|
|
|
944
1017
|
return instance_offer
|
|
945
1018
|
|
|
946
1019
|
|
|
947
|
-
def
|
|
948
|
-
|
|
949
|
-
|
|
950
|
-
|
|
951
|
-
name: str,
|
|
952
|
-
backend: BackendType,
|
|
953
|
-
region: str,
|
|
1020
|
+
def _find_suitable_placement_group(
|
|
1021
|
+
placement_groups: List[PlacementGroupModel],
|
|
1022
|
+
instance_offer: InstanceOffer,
|
|
1023
|
+
compute: ComputeWithPlacementGroupSupport,
|
|
954
1024
|
) -> Optional[PlacementGroupModel]:
|
|
955
1025
|
for pg in placement_groups:
|
|
956
|
-
if
|
|
957
|
-
|
|
1026
|
+
if compute.is_suitable_placement_group(
|
|
1027
|
+
placement_group_model_to_placement_group(pg), instance_offer
|
|
1028
|
+
):
|
|
1029
|
+
return pg
|
|
1030
|
+
return None
|
|
1031
|
+
|
|
1032
|
+
|
|
1033
|
+
async def _create_placement_group(
|
|
1034
|
+
fleet_model: FleetModel,
|
|
1035
|
+
master_instance_offer: InstanceOffer,
|
|
1036
|
+
compute: ComputeWithPlacementGroupSupport,
|
|
1037
|
+
) -> Optional[PlacementGroupModel]:
|
|
958
1038
|
placement_group_model = PlacementGroupModel(
|
|
959
|
-
name
|
|
1039
|
+
# TODO: generate the name in Compute.create_placement_group to allow
|
|
1040
|
+
# backend-specific name length limits
|
|
1041
|
+
name=generate_unique_placement_group_name(
|
|
1042
|
+
project_name=fleet_model.project.name,
|
|
1043
|
+
fleet_name=fleet_model.name,
|
|
1044
|
+
),
|
|
960
1045
|
project=fleet_model.project,
|
|
961
1046
|
fleet=fleet_model,
|
|
962
1047
|
configuration=PlacementGroupConfiguration(
|
|
963
|
-
backend=backend,
|
|
964
|
-
region=region,
|
|
1048
|
+
backend=master_instance_offer.backend,
|
|
1049
|
+
region=master_instance_offer.region,
|
|
965
1050
|
placement_strategy=PlacementStrategy.CLUSTER,
|
|
966
1051
|
).json(),
|
|
967
1052
|
)
|
|
968
|
-
|
|
1053
|
+
placement_group = placement_group_model_to_placement_group(placement_group_model)
|
|
1054
|
+
logger.debug(
|
|
1055
|
+
"Creating placement group %s in %s/%s",
|
|
1056
|
+
placement_group.name,
|
|
1057
|
+
placement_group.configuration.backend.value,
|
|
1058
|
+
placement_group.configuration.region,
|
|
1059
|
+
)
|
|
1060
|
+
try:
|
|
1061
|
+
pgpd = await run_async(
|
|
1062
|
+
compute.create_placement_group,
|
|
1063
|
+
placement_group_model_to_placement_group(placement_group_model),
|
|
1064
|
+
master_instance_offer,
|
|
1065
|
+
)
|
|
1066
|
+
except BackendError as e:
|
|
1067
|
+
logger.warning(
|
|
1068
|
+
"Failed to create placement group %s in %s/%s: %r",
|
|
1069
|
+
placement_group.name,
|
|
1070
|
+
placement_group.configuration.backend.value,
|
|
1071
|
+
placement_group.configuration.region,
|
|
1072
|
+
e,
|
|
1073
|
+
)
|
|
1074
|
+
return None
|
|
1075
|
+
except Exception:
|
|
1076
|
+
logger.exception(
|
|
1077
|
+
"Got exception when creating placement group %s in %s/%s",
|
|
1078
|
+
placement_group.name,
|
|
1079
|
+
placement_group.configuration.backend.value,
|
|
1080
|
+
placement_group.configuration.region,
|
|
1081
|
+
)
|
|
1082
|
+
return None
|
|
1083
|
+
logger.info(
|
|
1084
|
+
"Created placement group %s in %s/%s",
|
|
1085
|
+
placement_group.name,
|
|
1086
|
+
placement_group.configuration.backend.value,
|
|
1087
|
+
placement_group.configuration.region,
|
|
1088
|
+
)
|
|
1089
|
+
placement_group_model.provisioning_data = pgpd.json()
|
|
969
1090
|
return placement_group_model
|
|
970
1091
|
|
|
971
1092
|
|
|
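
Taken together, the `_create_instance` changes above make the fleet's master instance responsible for the placement group: reuse an existing group if the backend deems it suitable for the chosen offer, otherwise create one, and skip the offer if creation fails. A simplified, self-contained sketch of that find-or-create flow (the types and helpers below are illustrative stand-ins, not dstack's actual models):

from dataclasses import dataclass
from typing import Callable, List, Optional


@dataclass
class PG:  # stand-in for PlacementGroupModel
    name: str
    backend: str
    region: str


def find_or_create(
    groups: List[PG],
    is_suitable: Callable[[PG], bool],
    create: Callable[[], Optional[PG]],
) -> Optional[PG]:
    # Reuse the first group the backend considers suitable for this offer.
    for pg in groups:
        if is_suitable(pg):
            return pg
    # Otherwise try to create one; None signals a backend error,
    # in which case the caller moves on to the next offer.
    new_pg = create()
    if new_pg is not None:
        groups.append(new_pg)
    return new_pg


groups: List[PG] = []
pg = find_or_create(
    groups,
    is_suitable=lambda g: g.backend == "aws" and g.region == "us-east-1",
    create=lambda: PG("fleet-pg-1", "aws", "us-east-1"),
)
assert pg is not None and groups == [pg]
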

dstack/_internal/server/background/tasks/process_metrics.py

@@ -42,10 +42,33 @@ async def collect_metrics():


 async def delete_metrics():
-
+    now_timestamp_micro = int(get_current_datetime().timestamp() * 1_000_000)
+    running_timestamp_micro_cutoff = (
+        now_timestamp_micro - settings.SERVER_METRICS_RUNNING_TTL_SECONDS * 1_000_000
+    )
+    finished_timestamp_micro_cutoff = (
+        now_timestamp_micro - settings.SERVER_METRICS_FINISHED_TTL_SECONDS * 1_000_000
+    )
     async with get_session_ctx() as session:
-        await
-
+        await asyncio.gather(
+            session.execute(
+                delete(JobMetricsPoint).where(
+                    JobMetricsPoint.job_id.in_(
+                        select(JobModel.id).where(JobModel.status.in_([JobStatus.RUNNING]))
+                    ),
+                    JobMetricsPoint.timestamp_micro < running_timestamp_micro_cutoff,
+                )
+            ),
+            session.execute(
+                delete(JobMetricsPoint).where(
+                    JobMetricsPoint.job_id.in_(
+                        select(JobModel.id).where(
+                            JobModel.status.in_(JobStatus.finished_statuses())
+                        )
+                    ),
+                    JobMetricsPoint.timestamp_micro < finished_timestamp_micro_cutoff,
+                )
+            ),
         )
         await session.commit()

@@ -134,9 +157,3 @@ def _pull_runner_metrics(
 ) -> Optional[MetricsResponse]:
     runner_client = client.RunnerClient(port=ports[DSTACK_RUNNER_HTTP_PORT])
     return runner_client.get_metrics()
-
-
-def _get_delete_metrics_cutoff() -> int:
-    now = int(get_current_datetime().timestamp() * 1_000_000)
-    cutoff = now - (settings.SERVER_METRICS_TTL_SECONDS * 1_000_000)
-    return cutoff
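
The two hunks above replace the single `SERVER_METRICS_TTL_SECONDS` cutoff with separate TTLs for running and finished jobs, both expressed as microsecond cutoffs ("now minus TTL"). A small worked example of the arithmetic (the TTL values below are hypothetical; the actual defaults live in `dstack/_internal/server/settings.py` and are not shown in this diff):

from datetime import datetime, timezone

# Hypothetical TTL values for illustration only; dstack's real defaults may differ.
RUNNING_TTL_SECONDS = 3600              # keep running-job metric points for 1 hour
FINISHED_TTL_SECONDS = 7 * 24 * 3600    # keep finished-job metric points for 7 days

now_micro = int(datetime.now(timezone.utc).timestamp() * 1_000_000)
running_cutoff = now_micro - RUNNING_TTL_SECONDS * 1_000_000
finished_cutoff = now_micro - FINISHED_TTL_SECONDS * 1_000_000

# Points with timestamp_micro below the relevant cutoff are deleted.
assert running_cutoff > finished_cutoff
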

dstack/_internal/server/background/tasks/process_placement_groups.py

@@ -66,7 +66,7 @@ async def _delete_placement_groups(


 async def _delete_placement_group(placement_group_model: PlacementGroupModel):
-    logger.
+    logger.debug("Deleting placement group %s", placement_group_model.name)
     placement_group = placement_group_model_to_placement_group(placement_group_model)
     if placement_group.provisioning_data is None:
         logger.error(

dstack/_internal/server/background/tasks/process_prometheus_metrics.py

@@ -99,11 +99,14 @@ async def _collect_jobs_metrics(job_models: list[JobModel], collected_at: datetime):


 async def _collect_job_metrics(job_model: JobModel) -> Optional[str]:
-    ssh_private_keys = get_instance_ssh_private_keys(get_or_error(job_model.instance))
     jpd = get_job_provisioning_data(job_model)
-    jrd = get_job_runtime_data(job_model)
     if jpd is None:
         return None
+    if not jpd.dockerized:
+        # Container-based backend, no shim
+        return None
+    ssh_private_keys = get_instance_ssh_private_keys(get_or_error(job_model.instance))
+    jrd = get_job_runtime_data(job_model)
     try:
         res = await run_async(
             _pull_job_metrics,

dstack/_internal/server/background/tasks/process_running_jobs.py

@@ -1,6 +1,6 @@
 import asyncio
 from collections.abc import Iterable
-from datetime import timedelta
+from datetime import timedelta, timezone
 from typing import Dict, List, Optional

 from sqlalchemy import select
@@ -71,6 +71,12 @@ from dstack._internal.utils.logging import get_logger
 logger = get_logger(__name__)


+# Minimum time before terminating active job in case of connectivity issues.
+# Should be sufficient to survive most problems caused by
+# the server network flickering and providers' glitches.
+JOB_DISCONNECTED_RETRY_TIMEOUT = timedelta(minutes=2)
+
+
 async def process_running_jobs(batch_size: int = 1):
     tasks = []
     for _ in range(batch_size):
@@ -202,7 +208,7 @@ async def _process_running_job(session: AsyncSession, job_model: JobModel):
         user_ssh_key = run.run_spec.ssh_key_pub.strip()
         public_keys = [project.ssh_public_key.strip(), user_ssh_key]
         if job_provisioning_data.backend == BackendType.LOCAL:
-            # No need to update ~/.ssh/authorized_keys when running shim
+            # No need to update ~/.ssh/authorized_keys when running shim locally
             user_ssh_key = ""
         success = await common_utils.run_async(
             _process_provisioning_with_shim,
@@ -299,19 +305,38 @@ async def _process_running_job(session: AsyncSession, job_model: JobModel):
             run_model,
             job_model,
         )
-    if not success:
-        job_model.termination_reason = JobTerminationReason.INTERRUPTED_BY_NO_CAPACITY

-    if
-
-
-
-
-
-
-
-
-
+    if success:
+        job_model.disconnected_at = None
+    else:
+        if job_model.termination_reason:
+            logger.warning(
+                "%s: failed because shim/runner returned an error, age=%s",
+                fmt(job_model),
+                job_submission.age,
+            )
+            job_model.status = JobStatus.TERMINATING
+            # job will be terminated and instance will be emptied by process_terminating_jobs
+        else:
+            # No job_model.termination_reason set means ssh connection failed
+            if job_model.disconnected_at is None:
+                job_model.disconnected_at = common_utils.get_current_datetime()
+            if _should_terminate_job_due_to_disconnect(job_model):
+                logger.warning(
+                    "%s: failed because instance is unreachable, age=%s",
+                    fmt(job_model),
+                    job_submission.age,
+                )
+                # TODO: Replace with JobTerminationReason.INSTANCE_UNREACHABLE in 0.20 or
+                # when CLI <= 0.19.8 is no longer supported
+                job_model.termination_reason = JobTerminationReason.INTERRUPTED_BY_NO_CAPACITY
+                job_model.status = JobStatus.TERMINATING
+            else:
+                logger.warning(
+                    "%s: is unreachable, waiting for the instance to become reachable again, age=%s",
+                    fmt(job_model),
+                    job_submission.age,
+                )

     if (
         initial_status != job_model.status
@@ -543,7 +568,7 @@ def _process_pulling_with_shim(
     if shim_client.is_api_v2_supported():  # raises error if shim is down, causes retry
         task = shim_client.get_task(job_model.id)

-        # If task goes to terminated before the job is submitted to runner, then an error
+        # If task goes to terminated before the job is submitted to runner, then an error occurred
         if task.status == TaskStatus.TERMINATED:
             logger.warning(
                 "shim failed to execute job %s: %s (%s)",
@@ -572,7 +597,7 @@ def _process_pulling_with_shim(
     else:
         shim_status = shim_client.pull()  # raises error if shim is down, causes retry

-        # If shim goes to pending before the job is submitted to runner, then an error
+        # If shim goes to pending before the job is submitted to runner, then an error occurred
         if (
             shim_status.state == "pending"
             and shim_status.result is not None
@@ -651,6 +676,10 @@ def _process_running(
         )
         if latest_state_event.termination_message:
             job_model.termination_reason_message = latest_state_event.termination_message
+        if (exit_status := latest_state_event.exit_status) is not None:
+            job_model.exit_status = exit_status
+            if exit_status != 0:
+                logger.info("%s: non-zero exit status %s", fmt(job_model), exit_status)
     else:
         _terminate_if_inactivity_duration_exceeded(run_model, job_model, resp.no_connections_secs)
     if job_model.status != previous_status:
@@ -688,6 +717,15 @@ def _terminate_if_inactivity_duration_exceeded(
     )


+def _should_terminate_job_due_to_disconnect(job_model: JobModel) -> bool:
+    if job_model.disconnected_at is None:
+        return False
+    return (
+        common_utils.get_current_datetime()
+        > job_model.disconnected_at.replace(tzinfo=timezone.utc) + JOB_DISCONNECTED_RETRY_TIMEOUT
+    )
+
+
 async def _check_gpu_utilization(session: AsyncSession, job_model: JobModel, job: Job) -> None:
     policy = job.job_spec.utilization_policy
     if policy is None:
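
With the new `disconnected_at` column and `JOB_DISCONNECTED_RETRY_TIMEOUT`, a job is no longer terminated on the first failed SSH poll: the first failure stamps `disconnected_at`, a later successful poll clears it, and only after the instance has stayed unreachable past the timeout is the job moved to TERMINATING. A minimal standalone sketch of that check (using an explicit `now` argument and timezone-aware datetimes instead of dstack's helpers):

from datetime import datetime, timedelta, timezone
from typing import Optional

JOB_DISCONNECTED_RETRY_TIMEOUT = timedelta(minutes=2)


def should_terminate(disconnected_at: Optional[datetime], now: datetime) -> bool:
    # Mirrors _should_terminate_job_due_to_disconnect: no timestamp means the job
    # is (still) reachable; otherwise terminate only after the grace period expires.
    if disconnected_at is None:
        return False
    return now > disconnected_at + JOB_DISCONNECTED_RETRY_TIMEOUT


t0 = datetime(2025, 1, 1, tzinfo=timezone.utc)
assert not should_terminate(None, t0)
assert not should_terminate(t0, t0 + timedelta(minutes=1))  # still within the grace period
assert should_terminate(t0, t0 + timedelta(minutes=3))      # unreachable for too long
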
@@ -818,8 +856,8 @@ def _submit_job_to_runner(
         return success_if_not_available

     runner_client.submit_job(
-
-
+        run=run,
+        job=job,
         cluster_info=cluster_info,
         secrets=secrets,
         repo_credentials=repo_credentials,