dstack 0.19.7__py3-none-any.whl → 0.19.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (60)
  1. dstack/_internal/cli/services/args.py +2 -2
  2. dstack/_internal/cli/services/configurators/run.py +56 -13
  3. dstack/_internal/cli/utils/run.py +10 -5
  4. dstack/_internal/core/backends/aws/compute.py +13 -1
  5. dstack/_internal/core/backends/azure/compute.py +42 -13
  6. dstack/_internal/core/backends/azure/configurator.py +21 -0
  7. dstack/_internal/core/backends/azure/models.py +9 -0
  8. dstack/_internal/core/backends/base/compute.py +101 -27
  9. dstack/_internal/core/backends/base/offers.py +13 -3
  10. dstack/_internal/core/backends/cudo/compute.py +3 -1
  11. dstack/_internal/core/backends/datacrunch/compute.py +2 -0
  12. dstack/_internal/core/backends/gcp/auth.py +1 -1
  13. dstack/_internal/core/backends/gcp/compute.py +51 -35
  14. dstack/_internal/core/backends/lambdalabs/compute.py +20 -8
  15. dstack/_internal/core/backends/local/compute.py +2 -0
  16. dstack/_internal/core/backends/nebius/compute.py +95 -1
  17. dstack/_internal/core/backends/nebius/configurator.py +11 -0
  18. dstack/_internal/core/backends/nebius/fabrics.py +48 -0
  19. dstack/_internal/core/backends/nebius/models.py +9 -1
  20. dstack/_internal/core/backends/nebius/resources.py +29 -0
  21. dstack/_internal/core/backends/oci/compute.py +2 -0
  22. dstack/_internal/core/backends/remote/provisioning.py +27 -2
  23. dstack/_internal/core/backends/template/compute.py.jinja +2 -0
  24. dstack/_internal/core/backends/tensordock/compute.py +2 -0
  25. dstack/_internal/core/backends/vultr/compute.py +5 -1
  26. dstack/_internal/core/models/instances.py +2 -1
  27. dstack/_internal/core/models/resources.py +79 -4
  28. dstack/_internal/core/models/runs.py +26 -9
  29. dstack/_internal/core/models/volumes.py +1 -1
  30. dstack/_internal/server/background/tasks/process_fleets.py +4 -13
  31. dstack/_internal/server/background/tasks/process_instances.py +176 -55
  32. dstack/_internal/server/background/tasks/process_metrics.py +26 -9
  33. dstack/_internal/server/background/tasks/process_placement_groups.py +1 -1
  34. dstack/_internal/server/background/tasks/process_prometheus_metrics.py +5 -2
  35. dstack/_internal/server/background/tasks/process_running_jobs.py +56 -18
  36. dstack/_internal/server/migrations/versions/20166748b60c_add_jobmodel_disconnected_at.py +100 -0
  37. dstack/_internal/server/migrations/versions/6c1a9d6530ee_add_jobmodel_exit_status.py +26 -0
  38. dstack/_internal/server/models.py +6 -1
  39. dstack/_internal/server/schemas/runner.py +41 -8
  40. dstack/_internal/server/services/fleets.py +9 -26
  41. dstack/_internal/server/services/instances.py +0 -2
  42. dstack/_internal/server/services/jobs/__init__.py +1 -0
  43. dstack/_internal/server/services/offers.py +15 -0
  44. dstack/_internal/server/services/placement.py +27 -6
  45. dstack/_internal/server/services/resources.py +21 -0
  46. dstack/_internal/server/services/runner/client.py +7 -4
  47. dstack/_internal/server/services/runs.py +18 -8
  48. dstack/_internal/server/settings.py +20 -1
  49. dstack/_internal/server/testing/common.py +37 -26
  50. dstack/_internal/utils/common.py +13 -1
  51. dstack/_internal/utils/json_schema.py +6 -3
  52. dstack/api/__init__.py +1 -0
  53. dstack/api/server/_fleets.py +16 -0
  54. dstack/api/server/_runs.py +48 -3
  55. dstack/version.py +1 -1
  56. {dstack-0.19.7.dist-info → dstack-0.19.9.dist-info}/METADATA +38 -29
  57. {dstack-0.19.7.dist-info → dstack-0.19.9.dist-info}/RECORD +60 -56
  58. {dstack-0.19.7.dist-info → dstack-0.19.9.dist-info}/WHEEL +0 -0
  59. {dstack-0.19.7.dist-info → dstack-0.19.9.dist-info}/entry_points.txt +0 -0
  60. {dstack-0.19.7.dist-info → dstack-0.19.9.dist-info}/licenses/LICENSE.md +0 -0
dstack/_internal/server/background/tasks/process_instances.py

@@ -19,6 +19,8 @@ from dstack._internal.core.backends import (
 from dstack._internal.core.backends.base.compute import (
     ComputeWithCreateInstanceSupport,
     ComputeWithPlacementGroupSupport,
+    GoArchType,
+    generate_unique_placement_group_name,
     get_dstack_runner_binary_path,
     get_dstack_shim_binary_path,
     get_dstack_working_dir,
@@ -26,6 +28,7 @@ from dstack._internal.core.backends.base.compute import (
     get_shim_pre_start_commands,
 )
 from dstack._internal.core.backends.remote.provisioning import (
+    detect_cpu_arch,
     get_host_info,
     get_paramiko_connection,
     get_shim_healthcheck,
@@ -39,11 +42,16 @@ from dstack._internal.core.backends.remote.provisioning import (
 from dstack._internal.core.consts import DSTACK_SHIM_HTTP_PORT
 
 # FIXME: ProvisioningError is a subclass of ComputeError and should not be used outside of Compute
-from dstack._internal.core.errors import BackendError, NotYetTerminated, ProvisioningError
+from dstack._internal.core.errors import (
+    BackendError,
+    NotYetTerminated,
+    ProvisioningError,
+)
 from dstack._internal.core.models.backends.base import BackendType
 from dstack._internal.core.models.fleets import InstanceGroupPlacement
 from dstack._internal.core.models.instances import (
     InstanceAvailability,
+    InstanceOffer,
     InstanceOfferWithAvailability,
     InstanceRuntime,
     InstanceStatus,
@@ -51,7 +59,6 @@ from dstack._internal.core.models.instances import (
     SSHKey,
 )
 from dstack._internal.core.models.placement import (
-    PlacementGroup,
     PlacementGroupConfiguration,
     PlacementStrategy,
 )
@@ -89,8 +96,9 @@ from dstack._internal.server.services.instances import (
 from dstack._internal.server.services.locking import get_locker
 from dstack._internal.server.services.offers import is_divisible_into_blocks
 from dstack._internal.server.services.placement import (
-    get_fleet_placement_groups,
+    get_fleet_placement_group_models,
     placement_group_model_to_placement_group,
+    schedule_fleet_placement_groups_deletion,
 )
 from dstack._internal.server.services.runner import client as runner_client
 from dstack._internal.server.services.runner.client import HealthStatus
@@ -264,7 +272,7 @@ async def _add_remote(instance: InstanceModel) -> None:
         )
         deploy_timeout = 20 * 60  # 20 minutes
         result = await asyncio.wait_for(future, timeout=deploy_timeout)
-        health, host_info = result
+        health, host_info, cpu_arch = result
     except (asyncio.TimeoutError, TimeoutError) as e:
         raise ProvisioningError(f"Deploy timeout: {e}") from e
     except Exception as e:
@@ -285,7 +293,7 @@ async def _add_remote(instance: InstanceModel) -> None:
         instance.last_retry_at = get_current_datetime()
         return
 
-    instance_type = host_info_to_instance_type(host_info)
+    instance_type = host_info_to_instance_type(host_info, cpu_arch)
     instance_network = None
     internal_ip = None
     try:
@@ -388,7 +396,7 @@ def _deploy_instance(
     pkeys: List[PKey],
     ssh_proxy_pkeys: Optional[list[PKey]],
     authorized_keys: List[str],
-) -> Tuple[HealthStatus, Dict[str, Any]]:
+) -> Tuple[HealthStatus, Dict[str, Any], GoArchType]:
     with get_paramiko_connection(
         remote_details.ssh_user,
         remote_details.host,
@@ -399,13 +407,16 @@ def _deploy_instance(
     ) as client:
         logger.info(f"Connected to {remote_details.ssh_user} {remote_details.host}")
 
+        arch = detect_cpu_arch(client)
+        logger.info("%s: CPU arch is %s", remote_details.host, arch)
+
         # Execute pre start commands
-        shim_pre_start_commands = get_shim_pre_start_commands()
+        shim_pre_start_commands = get_shim_pre_start_commands(arch=arch)
         run_pre_start_commands(client, shim_pre_start_commands, authorized_keys)
         logger.debug("The script for installing dstack has been executed")
 
         # Upload envs
-        shim_envs = get_shim_env(authorized_keys)
+        shim_envs = get_shim_env(authorized_keys, arch=arch)
         try:
             fleet_configuration_envs = remote_details.env.as_dict()
         except ValueError as e:
@@ -440,7 +451,7 @@ def _deploy_instance(
             raise ProvisioningError("Cannot read HealthcheckResponse") from e
         health = runner_client.health_response_to_health_status(health_response)
 
-        return health, host_info
+        return health, host_info, arch
 
 
 async def _create_instance(session: AsyncSession, instance: InstanceModel) -> None:
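
The _deploy_instance changes above detect the remote host's CPU architecture over SSH and thread it through shim setup, so architecture-specific binaries can be installed on both x86_64 and ARM hosts. A minimal sketch of the idea, with a hypothetical uname-to-Go-arch mapping and a made-up download URL (the real detect_cpu_arch / get_shim_pre_start_commands helpers and binary locations may differ):

# Sketch only: map `uname -m` output to a Go-style arch string ("amd64"/"arm64")
# and pick an architecture-specific binary URL. Illustrative, not dstack's API.
def uname_to_go_arch(uname_machine: str) -> str:
    if uname_machine in ("x86_64", "amd64"):
        return "amd64"
    if uname_machine in ("aarch64", "arm64"):
        return "arm64"
    raise ValueError(f"unsupported CPU architecture: {uname_machine}")


def shim_binary_url(version: str, arch: str) -> str:
    # Hypothetical URL layout, for illustration only.
    return f"https://example.com/dstack-shim/{version}/dstack-shim-linux-{arch}"


print(shim_binary_url("0.19.9", uname_to_go_arch("aarch64")))
# https://example.com/dstack-shim/0.19.9/dstack-shim-linux-arm64
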
@@ -509,11 +520,39 @@ async def _create_instance(session: AsyncSession, instance: InstanceModel) -> None:
         )
         return
 
+    placement_group_models = []
+    placement_group_model = None
+    if instance.fleet_id:
+        placement_group_models = await get_fleet_placement_group_models(
+            session=session,
+            fleet_id=instance.fleet_id,
+        )
+        # The placement group is determined when provisioning the master instance
+        # and used for all other instances in the fleet.
+        if not _is_fleet_master_instance(instance):
+            if placement_group_models:
+                placement_group_model = placement_group_models[0]
+            if len(placement_group_models) > 1:
+                logger.error(
+                    (
+                        "Expected 0 or 1 placement groups associated with fleet %s, found %s."
+                        " An incorrect placement group might have been selected for instance %s"
+                    ),
+                    instance.fleet_id,
+                    len(placement_group_models),
+                    instance.name,
+                )
+
     offers = await get_create_instance_offers(
         project=instance.project,
         profile=profile,
         requirements=requirements,
         fleet_model=instance.fleet,
+        placement_group=(
+            placement_group_model_to_placement_group(placement_group_model)
+            if placement_group_model
+            else None
+        ),
         blocks="auto" if instance.total_blocks is None else instance.total_blocks,
         exclude_not_available=True,
     )
@@ -527,12 +566,6 @@ async def _create_instance(session: AsyncSession, instance: InstanceModel) -> None:
         )
         return
 
-    placement_groups = []
-    if instance.fleet_id:
-        placement_groups = await get_fleet_placement_groups(
-            session=session, fleet_id=instance.fleet_id
-        )
-
     # Limit number of offers tried to prevent long-running processing
     # in case all offers fail.
     for backend, instance_offer in offers[: server_settings.MAX_OFFERS_TRIED]:
@@ -542,25 +575,28 @@ async def _create_instance(session: AsyncSession, instance: InstanceModel) -> None:
         assert isinstance(compute, ComputeWithCreateInstanceSupport)
         instance_offer = _get_instance_offer_for_instance(instance_offer, instance)
         if (
-            instance_offer.backend in BACKENDS_WITH_PLACEMENT_GROUPS_SUPPORT
+            _is_fleet_master_instance(instance)
+            and instance_offer.backend in BACKENDS_WITH_PLACEMENT_GROUPS_SUPPORT
             and instance.fleet
-            and instance_configuration.placement_group_name
+            and _is_cloud_cluster(instance.fleet)
         ):
             assert isinstance(compute, ComputeWithPlacementGroupSupport)
-            placement_group_model = _create_placement_group_if_does_not_exist(
-                session=session,
-                fleet_model=instance.fleet,
-                placement_groups=placement_groups,
-                name=instance_configuration.placement_group_name,
-                backend=instance_offer.backend,
-                region=instance_offer.region,
+            placement_group_model = _find_suitable_placement_group(
+                placement_groups=placement_group_models,
+                instance_offer=instance_offer,
+                compute=compute,
             )
-            if placement_group_model is not None:
-                placement_group = placement_group_model_to_placement_group(placement_group_model)
-                pgpd = await run_async(compute.create_placement_group, placement_group)
-                placement_group_model.provisioning_data = pgpd.json()
+            if placement_group_model is None:
+                placement_group_model = await _create_placement_group(
+                    fleet_model=instance.fleet,
+                    master_instance_offer=instance_offer,
+                    compute=compute,
+                )
+                if placement_group_model is None:  # error occurred
+                    continue
                 session.add(placement_group_model)
-                placement_groups.append(placement_group)
+                await session.flush()
+                placement_group_models.append(placement_group_model)
         logger.debug(
             "Trying %s in %s/%s for $%0.4f per hour",
             instance_offer.instance.name,
@@ -573,6 +609,11 @@ async def _create_instance(session: AsyncSession, instance: InstanceModel) -> None:
                 compute.create_instance,
                 instance_offer,
                 instance_configuration,
+                (
+                    placement_group_model_to_placement_group(placement_group_model)
+                    if placement_group_model
+                    else None
+                ),
             )
         except BackendError as e:
             logger.warning(
@@ -612,22 +653,46 @@ async def _create_instance(session: AsyncSession, instance: InstanceModel) -> None:
                 "instance_status": InstanceStatus.PROVISIONING.value,
             },
         )
+        if instance.fleet_id and _is_fleet_master_instance(instance):
+            # Clean up placement groups that did not end up being used
+            await schedule_fleet_placement_groups_deletion(
+                session=session,
+                fleet_id=instance.fleet_id,
+                except_placement_group_ids=(
+                    [placement_group_model.id] if placement_group_model is not None else []
+                ),
+            )
         return
 
     instance.last_retry_at = get_current_datetime()
 
     if not should_retry:
-        instance.status = InstanceStatus.TERMINATED
-        instance.termination_reason = "All offers failed" if offers else "No offers found"
-        logger.info(
-            "Terminated instance %s: %s",
-            instance.name,
-            instance.termination_reason,
-            extra={
-                "instance_name": instance.name,
-                "instance_status": InstanceStatus.TERMINATED.value,
-            },
-        )
+        _mark_terminated(instance, "All offers failed" if offers else "No offers found")
+        if (
+            instance.fleet
+            and _is_fleet_master_instance(instance)
+            and _is_cloud_cluster(instance.fleet)
+        ):
+            # Do not attempt to deploy other instances, as they won't determine the correct cluster
+            # backend, region, and placement group without a successfully deployed master instance
+            for sibling_instance in instance.fleet.instances:
+                if sibling_instance.id == instance.id:
+                    continue
+                _mark_terminated(sibling_instance, "Master instance failed to start")
+
+
+def _mark_terminated(instance: InstanceModel, termination_reason: str) -> None:
+    instance.status = InstanceStatus.TERMINATED
+    instance.termination_reason = termination_reason
+    logger.info(
+        "Terminated instance %s: %s",
+        instance.name,
+        instance.termination_reason,
+        extra={
+            "instance_name": instance.name,
+            "instance_status": InstanceStatus.TERMINATED.value,
+        },
    )
 
 
 async def _check_instance(instance: InstanceModel) -> None:
@@ -906,12 +971,20 @@ def _need_to_wait_fleet_provisioning(instance: InstanceModel) -> bool:
     if instance.fleet is None:
         return False
     if (
-        instance.id == instance.fleet.instances[0].id
+        _is_fleet_master_instance(instance)
         or instance.fleet.instances[0].job_provisioning_data is not None
         or instance.fleet.instances[0].status == InstanceStatus.TERMINATED
     ):
         return False
-    fleet = fleet_model_to_fleet(instance.fleet)
+    return _is_cloud_cluster(instance.fleet)
+
+
+def _is_fleet_master_instance(instance: InstanceModel) -> bool:
+    return instance.fleet is not None and instance.id == instance.fleet.instances[0].id
+
+
+def _is_cloud_cluster(fleet_model: FleetModel) -> bool:
+    fleet = fleet_model_to_fleet(fleet_model)
     return (
         fleet.spec.configuration.placement == InstanceGroupPlacement.CLUSTER
         and fleet.spec.configuration.ssh_config is None
@@ -944,28 +1017,76 @@ def _get_instance_offer_for_instance(
     return instance_offer
 
 
-def _create_placement_group_if_does_not_exist(
-    session: AsyncSession,
-    fleet_model: FleetModel,
-    placement_groups: List[PlacementGroup],
-    name: str,
-    backend: BackendType,
-    region: str,
+def _find_suitable_placement_group(
+    placement_groups: List[PlacementGroupModel],
+    instance_offer: InstanceOffer,
+    compute: ComputeWithPlacementGroupSupport,
 ) -> Optional[PlacementGroupModel]:
     for pg in placement_groups:
-        if pg.configuration.backend == backend and pg.configuration.region == region:
-            return None
+        if compute.is_suitable_placement_group(
+            placement_group_model_to_placement_group(pg), instance_offer
+        ):
+            return pg
+    return None
+
+
+async def _create_placement_group(
+    fleet_model: FleetModel,
+    master_instance_offer: InstanceOffer,
+    compute: ComputeWithPlacementGroupSupport,
+) -> Optional[PlacementGroupModel]:
     placement_group_model = PlacementGroupModel(
-        name=name,
+        # TODO: generate the name in Compute.create_placement_group to allow
+        # backend-specific name length limits
+        name=generate_unique_placement_group_name(
+            project_name=fleet_model.project.name,
+            fleet_name=fleet_model.name,
+        ),
         project=fleet_model.project,
         fleet=fleet_model,
         configuration=PlacementGroupConfiguration(
-            backend=backend,
-            region=region,
+            backend=master_instance_offer.backend,
+            region=master_instance_offer.region,
             placement_strategy=PlacementStrategy.CLUSTER,
         ).json(),
     )
-    session.add(placement_group_model)
+    placement_group = placement_group_model_to_placement_group(placement_group_model)
+    logger.debug(
+        "Creating placement group %s in %s/%s",
+        placement_group.name,
+        placement_group.configuration.backend.value,
+        placement_group.configuration.region,
+    )
+    try:
+        pgpd = await run_async(
+            compute.create_placement_group,
+            placement_group_model_to_placement_group(placement_group_model),
+            master_instance_offer,
+        )
+    except BackendError as e:
+        logger.warning(
+            "Failed to create placement group %s in %s/%s: %r",
+            placement_group.name,
+            placement_group.configuration.backend.value,
+            placement_group.configuration.region,
+            e,
+        )
+        return None
+    except Exception:
+        logger.exception(
+            "Got exception when creating placement group %s in %s/%s",
+            placement_group.name,
+            placement_group.configuration.backend.value,
+            placement_group.configuration.region,
+        )
+        return None
+    logger.info(
+        "Created placement group %s in %s/%s",
+        placement_group.name,
+        placement_group.configuration.backend.value,
+        placement_group.configuration.region,
+    )
+    placement_group_model.provisioning_data = pgpd.json()
     return placement_group_model
 

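Taken together, the _create_instance and helper changes above replace the old backend/region bookkeeping with a find-or-create flow: the master instance looks for an existing placement group that compute.is_suitable_placement_group() accepts for the offer, creates one if none matches, and skips the offer if creation fails; groups that end up unused are scheduled for deletion once provisioning succeeds. A self-contained sketch of what a backend's suitability check might look like, assuming (as the pre-0.19.9 logic did) that compatibility is a backend-and-region match; real backends may add provider-specific constraints:

from dataclasses import dataclass


@dataclass
class PlacementGroupConfig:
    backend: str
    region: str


@dataclass
class Offer:
    backend: str
    region: str


def is_suitable_placement_group(config: PlacementGroupConfig, offer: Offer) -> bool:
    # Hypothetical check mirroring the old logic: a group is reusable only for
    # offers in the same backend and region it was created in.
    return config.backend == offer.backend and config.region == offer.region


print(is_suitable_placement_group(
    PlacementGroupConfig(backend="aws", region="us-east-1"),
    Offer(backend="aws", region="us-east-1"),
))  # True
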
dstack/_internal/server/background/tasks/process_metrics.py

@@ -42,10 +42,33 @@ async def collect_metrics():
 
 
 async def delete_metrics():
-    cutoff = _get_delete_metrics_cutoff()
+    now_timestamp_micro = int(get_current_datetime().timestamp() * 1_000_000)
+    running_timestamp_micro_cutoff = (
+        now_timestamp_micro - settings.SERVER_METRICS_RUNNING_TTL_SECONDS * 1_000_000
+    )
+    finished_timestamp_micro_cutoff = (
+        now_timestamp_micro - settings.SERVER_METRICS_FINISHED_TTL_SECONDS * 1_000_000
+    )
     async with get_session_ctx() as session:
-        await session.execute(
-            delete(JobMetricsPoint).where(JobMetricsPoint.timestamp_micro < cutoff)
+        await asyncio.gather(
+            session.execute(
+                delete(JobMetricsPoint).where(
+                    JobMetricsPoint.job_id.in_(
+                        select(JobModel.id).where(JobModel.status.in_([JobStatus.RUNNING]))
+                    ),
+                    JobMetricsPoint.timestamp_micro < running_timestamp_micro_cutoff,
+                )
+            ),
+            session.execute(
+                delete(JobMetricsPoint).where(
+                    JobMetricsPoint.job_id.in_(
+                        select(JobModel.id).where(
+                            JobModel.status.in_(JobStatus.finished_statuses())
+                        )
+                    ),
+                    JobMetricsPoint.timestamp_micro < finished_timestamp_micro_cutoff,
+                )
+            ),
         )
         await session.commit()
 
@@ -134,9 +157,3 @@ def _pull_runner_metrics(
 ) -> Optional[MetricsResponse]:
     runner_client = client.RunnerClient(port=ports[DSTACK_RUNNER_HTTP_PORT])
     return runner_client.get_metrics()
-
-
-def _get_delete_metrics_cutoff() -> int:
-    now = int(get_current_datetime().timestamp() * 1_000_000)
-    cutoff = now - (settings.SERVER_METRICS_TTL_SECONDS * 1_000_000)
-    return cutoff
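
delete_metrics() now prunes metrics with two separate cutoffs, one for running jobs (SERVER_METRICS_RUNNING_TTL_SECONDS) and one for finished jobs (SERVER_METRICS_FINISHED_TTL_SECONDS), instead of the single SERVER_METRICS_TTL_SECONDS cutoff removed above. The cutoff arithmetic, shown standalone with illustrative TTL values (the server reads the real values from its settings):

from datetime import datetime, timezone

# Illustrative TTLs; dstack reads the real values from server settings.
RUNNING_TTL_SECONDS = 3600    # e.g. keep metrics of running jobs for 1 hour
FINISHED_TTL_SECONDS = 300    # e.g. keep metrics of finished jobs for 5 minutes

now_micro = int(datetime.now(timezone.utc).timestamp() * 1_000_000)
running_cutoff = now_micro - RUNNING_TTL_SECONDS * 1_000_000
finished_cutoff = now_micro - FINISHED_TTL_SECONDS * 1_000_000

# A metrics point is deleted when its timestamp_micro is older than the cutoff
# that matches its job's status, so each class of jobs gets its own retention.
print(running_cutoff, finished_cutoff)
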
dstack/_internal/server/background/tasks/process_placement_groups.py

@@ -66,7 +66,7 @@ async def _delete_placement_groups(
 
 
 async def _delete_placement_group(placement_group_model: PlacementGroupModel):
-    logger.info("Deleting placement group %s", placement_group_model.name)
+    logger.debug("Deleting placement group %s", placement_group_model.name)
     placement_group = placement_group_model_to_placement_group(placement_group_model)
     if placement_group.provisioning_data is None:
         logger.error(
dstack/_internal/server/background/tasks/process_prometheus_metrics.py

@@ -99,11 +99,14 @@ async def _collect_jobs_metrics(job_models: list[JobModel], collected_at: datetime
 
 
 async def _collect_job_metrics(job_model: JobModel) -> Optional[str]:
-    ssh_private_keys = get_instance_ssh_private_keys(get_or_error(job_model.instance))
     jpd = get_job_provisioning_data(job_model)
-    jrd = get_job_runtime_data(job_model)
     if jpd is None:
         return None
+    if not jpd.dockerized:
+        # Container-based backend, no shim
+        return None
+    ssh_private_keys = get_instance_ssh_private_keys(get_or_error(job_model.instance))
+    jrd = get_job_runtime_data(job_model)
     try:
         res = await run_async(
             _pull_job_metrics,
dstack/_internal/server/background/tasks/process_running_jobs.py

@@ -1,6 +1,6 @@
 import asyncio
 from collections.abc import Iterable
-from datetime import timedelta
+from datetime import timedelta, timezone
 from typing import Dict, List, Optional
 
 from sqlalchemy import select
@@ -71,6 +71,12 @@ from dstack._internal.utils.logging import get_logger
 logger = get_logger(__name__)
 
 
+# Minimum time before terminating active job in case of connectivity issues.
+# Should be sufficient to survive most problems caused by
+# the server network flickering and providers' glitches.
+JOB_DISCONNECTED_RETRY_TIMEOUT = timedelta(minutes=2)
+
+
 async def process_running_jobs(batch_size: int = 1):
     tasks = []
     for _ in range(batch_size):
@@ -202,7 +208,7 @@ async def _process_running_job(session: AsyncSession, job_model: JobModel):
         user_ssh_key = run.run_spec.ssh_key_pub.strip()
         public_keys = [project.ssh_public_key.strip(), user_ssh_key]
         if job_provisioning_data.backend == BackendType.LOCAL:
-            # No need to update ~/.ssh/authorized_keys when running shim localy
+            # No need to update ~/.ssh/authorized_keys when running shim locally
             user_ssh_key = ""
         success = await common_utils.run_async(
             _process_provisioning_with_shim,
@@ -299,19 +305,38 @@ async def _process_running_job(session: AsyncSession, job_model: JobModel):
             run_model,
             job_model,
         )
-        if not success:
-            job_model.termination_reason = JobTerminationReason.INTERRUPTED_BY_NO_CAPACITY
 
-        if not success:  # kill the job
-            logger.warning(
-                "%s: failed because runner is not available or return an error, age=%s",
-                fmt(job_model),
-                job_submission.age,
-            )
-            job_model.status = JobStatus.TERMINATING
-            if not job_model.termination_reason:
-                job_model.termination_reason = JobTerminationReason.INTERRUPTED_BY_NO_CAPACITY
-            # job will be terminated and instance will be emptied by process_terminating_jobs
+        if success:
+            job_model.disconnected_at = None
+        else:
+            if job_model.termination_reason:
+                logger.warning(
+                    "%s: failed because shim/runner returned an error, age=%s",
+                    fmt(job_model),
+                    job_submission.age,
+                )
+                job_model.status = JobStatus.TERMINATING
+                # job will be terminated and instance will be emptied by process_terminating_jobs
+            else:
+                # No job_model.termination_reason set means ssh connection failed
+                if job_model.disconnected_at is None:
+                    job_model.disconnected_at = common_utils.get_current_datetime()
+                if _should_terminate_job_due_to_disconnect(job_model):
+                    logger.warning(
+                        "%s: failed because instance is unreachable, age=%s",
+                        fmt(job_model),
+                        job_submission.age,
+                    )
+                    # TODO: Replace with JobTerminationReason.INSTANCE_UNREACHABLE in 0.20 or
+                    # when CLI <= 0.19.8 is no longer supported
+                    job_model.termination_reason = JobTerminationReason.INTERRUPTED_BY_NO_CAPACITY
+                    job_model.status = JobStatus.TERMINATING
+                else:
+                    logger.warning(
+                        "%s: is unreachable, waiting for the instance to become reachable again, age=%s",
+                        fmt(job_model),
+                        job_submission.age,
+                    )
 
     if (
         initial_status != job_model.status
@@ -543,7 +568,7 @@ def _process_pulling_with_shim(
     if shim_client.is_api_v2_supported():  # raises error if shim is down, causes retry
         task = shim_client.get_task(job_model.id)
 
-        # If task goes to terminated before the job is submitted to runner, then an error occured
+        # If task goes to terminated before the job is submitted to runner, then an error occurred
         if task.status == TaskStatus.TERMINATED:
             logger.warning(
                 "shim failed to execute job %s: %s (%s)",
@@ -572,7 +597,7 @@ def _process_pulling_with_shim(
     else:
         shim_status = shim_client.pull()  # raises error if shim is down, causes retry
 
-        # If shim goes to pending before the job is submitted to runner, then an error occured
+        # If shim goes to pending before the job is submitted to runner, then an error occurred
         if (
             shim_status.state == "pending"
             and shim_status.result is not None
@@ -651,6 +676,10 @@ def _process_running(
         )
         if latest_state_event.termination_message:
             job_model.termination_reason_message = latest_state_event.termination_message
+        if (exit_status := latest_state_event.exit_status) is not None:
+            job_model.exit_status = exit_status
+            if exit_status != 0:
+                logger.info("%s: non-zero exit status %s", fmt(job_model), exit_status)
     else:
         _terminate_if_inactivity_duration_exceeded(run_model, job_model, resp.no_connections_secs)
     if job_model.status != previous_status:
@@ -688,6 +717,15 @@ def _terminate_if_inactivity_duration_exceeded(
     )
 
 
+def _should_terminate_job_due_to_disconnect(job_model: JobModel) -> bool:
+    if job_model.disconnected_at is None:
+        return False
+    return (
+        common_utils.get_current_datetime()
+        > job_model.disconnected_at.replace(tzinfo=timezone.utc) + JOB_DISCONNECTED_RETRY_TIMEOUT
+    )
+
+
 async def _check_gpu_utilization(session: AsyncSession, job_model: JobModel, job: Job) -> None:
     policy = job.job_spec.utilization_policy
     if policy is None:
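
The disconnect handling above records disconnected_at on the first failed poll, clears it on success, and only terminates the job once JOB_DISCONNECTED_RETRY_TIMEOUT (2 minutes) has elapsed, so brief network flickers don't kill running jobs. A standalone sketch of that grace-period check (helper name and setup are illustrative, not dstack's API):

from datetime import datetime, timedelta, timezone
from typing import Optional

JOB_DISCONNECTED_RETRY_TIMEOUT = timedelta(minutes=2)


def should_terminate_due_to_disconnect(disconnected_at: Optional[datetime]) -> bool:
    # No recorded disconnect means the last poll succeeded.
    if disconnected_at is None:
        return False
    # Naive timestamps (as stored in the DB) are interpreted as UTC, matching the diff above.
    if disconnected_at.tzinfo is None:
        disconnected_at = disconnected_at.replace(tzinfo=timezone.utc)
    return datetime.now(timezone.utc) > disconnected_at + JOB_DISCONNECTED_RETRY_TIMEOUT


# A job unreachable for 3 minutes exceeds the 2-minute grace period and is terminated.
print(should_terminate_due_to_disconnect(datetime.now(timezone.utc) - timedelta(minutes=3)))  # True
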
@@ -818,8 +856,8 @@ def _submit_job_to_runner(
         return success_if_not_available
 
     runner_client.submit_job(
-        run_spec=run.run_spec,
-        job_spec=job.job_spec,
+        run=run,
+        job=job,
         cluster_info=cluster_info,
         secrets=secrets,
         repo_credentials=repo_credentials,