dstack 0.18.42__py3-none-any.whl → 0.18.43__py3-none-any.whl

This diff shows the changes between two publicly released versions of this package, as published to one of the supported registries. It is provided for informational purposes only.
Files changed (80)
  1. dstack/_internal/cli/commands/__init__.py +2 -1
  2. dstack/_internal/cli/commands/apply.py +4 -2
  3. dstack/_internal/cli/commands/attach.py +21 -1
  4. dstack/_internal/cli/commands/completion.py +20 -0
  5. dstack/_internal/cli/commands/delete.py +3 -1
  6. dstack/_internal/cli/commands/fleet.py +2 -1
  7. dstack/_internal/cli/commands/gateway.py +7 -2
  8. dstack/_internal/cli/commands/logs.py +3 -2
  9. dstack/_internal/cli/commands/stats.py +2 -1
  10. dstack/_internal/cli/commands/stop.py +2 -1
  11. dstack/_internal/cli/commands/volume.py +2 -1
  12. dstack/_internal/cli/main.py +6 -0
  13. dstack/_internal/cli/services/completion.py +86 -0
  14. dstack/_internal/cli/services/configurators/run.py +10 -17
  15. dstack/_internal/cli/utils/fleet.py +5 -1
  16. dstack/_internal/core/backends/aws/compute.py +22 -10
  17. dstack/_internal/core/backends/aws/resources.py +3 -3
  18. dstack/_internal/core/backends/azure/compute.py +14 -8
  19. dstack/_internal/core/backends/azure/resources.py +2 -0
  20. dstack/_internal/core/backends/base/compute.py +102 -2
  21. dstack/_internal/core/backends/base/offers.py +7 -1
  22. dstack/_internal/core/backends/cudo/compute.py +8 -4
  23. dstack/_internal/core/backends/datacrunch/compute.py +10 -4
  24. dstack/_internal/core/backends/gcp/auth.py +19 -13
  25. dstack/_internal/core/backends/gcp/compute.py +25 -19
  26. dstack/_internal/core/backends/gcp/resources.py +3 -10
  27. dstack/_internal/core/backends/kubernetes/compute.py +4 -3
  28. dstack/_internal/core/backends/lambdalabs/compute.py +9 -3
  29. dstack/_internal/core/backends/nebius/compute.py +2 -2
  30. dstack/_internal/core/backends/oci/compute.py +10 -4
  31. dstack/_internal/core/backends/runpod/compute.py +11 -4
  32. dstack/_internal/core/backends/tensordock/compute.py +14 -3
  33. dstack/_internal/core/backends/vastai/compute.py +12 -2
  34. dstack/_internal/core/backends/vultr/api_client.py +3 -3
  35. dstack/_internal/core/backends/vultr/compute.py +9 -3
  36. dstack/_internal/core/models/backends/aws.py +2 -0
  37. dstack/_internal/core/models/backends/base.py +1 -0
  38. dstack/_internal/core/models/configurations.py +0 -1
  39. dstack/_internal/core/services/__init__.py +5 -1
  40. dstack/_internal/core/services/configs/__init__.py +3 -0
  41. dstack/_internal/server/background/tasks/common.py +22 -0
  42. dstack/_internal/server/background/tasks/process_instances.py +11 -18
  43. dstack/_internal/server/background/tasks/process_running_jobs.py +9 -16
  44. dstack/_internal/server/background/tasks/process_terminating_jobs.py +1 -7
  45. dstack/_internal/server/routers/logs.py +3 -0
  46. dstack/_internal/server/services/backends/configurators/aws.py +31 -1
  47. dstack/_internal/server/services/backends/configurators/gcp.py +8 -15
  48. dstack/_internal/server/services/config.py +11 -1
  49. dstack/_internal/server/services/jobs/__init__.py +12 -9
  50. dstack/_internal/server/services/jobs/configurators/dev.py +1 -3
  51. dstack/_internal/server/services/jobs/configurators/task.py +1 -3
  52. dstack/_internal/server/services/logs/__init__.py +78 -0
  53. dstack/_internal/server/services/{logs.py → logs/aws.py} +12 -207
  54. dstack/_internal/server/services/logs/base.py +47 -0
  55. dstack/_internal/server/services/logs/filelog.py +110 -0
  56. dstack/_internal/server/services/logs/gcp.py +165 -0
  57. dstack/_internal/server/services/pools.py +16 -17
  58. dstack/_internal/server/services/proxy/routers/service_proxy.py +14 -7
  59. dstack/_internal/server/settings.py +3 -0
  60. dstack/_internal/server/statics/index.html +1 -1
  61. dstack/_internal/server/statics/{main-ad5150a441de98cd8987.css → main-7510e71dfa9749a4e70e.css} +1 -1
  62. dstack/_internal/server/statics/{main-2ac66bfcbd2e39830b88.js → main-fe8fd9db55df8d10e648.js} +66 -66
  63. dstack/_internal/server/statics/{main-2ac66bfcbd2e39830b88.js.map → main-fe8fd9db55df8d10e648.js.map} +1 -1
  64. dstack/_internal/server/testing/common.py +33 -8
  65. dstack/api/_public/runs.py +1 -1
  66. dstack/version.py +2 -2
  67. {dstack-0.18.42.dist-info → dstack-0.18.43.dist-info}/METADATA +4 -3
  68. {dstack-0.18.42.dist-info → dstack-0.18.43.dist-info}/RECORD +80 -71
  69. tests/_internal/core/backends/base/__init__.py +0 -0
  70. tests/_internal/core/backends/base/test_compute.py +56 -0
  71. tests/_internal/server/background/tasks/test_process_running_jobs.py +1 -1
  72. tests/_internal/server/conftest.py +4 -5
  73. tests/_internal/server/routers/test_backends.py +1 -0
  74. tests/_internal/server/routers/test_logs.py +1 -1
  75. tests/_internal/server/routers/test_runs.py +2 -2
  76. tests/_internal/server/services/test_logs.py +3 -3
  77. {dstack-0.18.42.dist-info → dstack-0.18.43.dist-info}/LICENSE.md +0 -0
  78. {dstack-0.18.42.dist-info → dstack-0.18.43.dist-info}/WHEEL +0 -0
  79. {dstack-0.18.42.dist-info → dstack-0.18.43.dist-info}/entry_points.txt +0 -0
  80. {dstack-0.18.42.dist-info → dstack-0.18.43.dist-info}/top_level.txt +0 -0
dstack/_internal/core/backends/runpod/compute.py

@@ -5,8 +5,10 @@ from typing import List, Optional
 
 from dstack._internal.core.backends.base import Compute
 from dstack._internal.core.backends.base.compute import (
+    generate_unique_instance_name,
+    generate_unique_volume_name,
     get_docker_commands,
-    get_instance_name,
+    get_job_instance_name,
 )
 from dstack._internal.core.backends.base.offers import get_catalog_offers
 from dstack._internal.core.backends.runpod.api_client import RunpodApiClient
@@ -31,6 +33,9 @@ from dstack._internal.utils.logging import get_logger
 
 logger = get_logger(__name__)
 
+# Undocumented but names of len 60 work
+MAX_RESOURCE_NAME_LEN = 60
+
 CONTAINER_REGISTRY_AUTH_CLEANUP_INTERVAL = 60 * 60 * 24  # 24 hour
 
 
@@ -69,7 +74,7 @@ class RunpodCompute(Compute):
     ) -> JobProvisioningData:
         instance_config = InstanceConfiguration(
             project_name=run.project_name,
-            instance_name=get_instance_name(run, job),
+            instance_name=get_job_instance_name(run, job),
             ssh_keys=[
                 SSHKey(public=run.run_spec.ssh_key_pub.strip()),
                 SSHKey(public=project_ssh_public_key.strip()),
@@ -77,6 +82,7 @@ class RunpodCompute(Compute):
             user=run.user,
         )
 
+        pod_name = generate_unique_instance_name(instance_config, max_length=MAX_RESOURCE_NAME_LEN)
         authorized_keys = instance_config.get_public_keys()
         memory_size = round(instance_offer.instance.resources.memory_mib / 1024)
         disk_size = round(instance_offer.instance.resources.disk.size_mib / 1024)
@@ -98,7 +104,7 @@ class RunpodCompute(Compute):
         bid_per_gpu = instance_offer.price / gpu_count
 
         resp = self.api_client.create_pod(
-            name=instance_config.instance_name,
+            name=pod_name,
             image_name=job.job_spec.image_name,
             gpu_type_id=instance_offer.instance.name,
             cloud_type="SECURE",  # ["ALL", "COMMUNITY", "SECURE"]:
@@ -197,9 +203,10 @@ class RunpodCompute(Compute):
         )
 
     def create_volume(self, volume: Volume) -> VolumeProvisioningData:
+        volume_name = generate_unique_volume_name(volume, max_length=MAX_RESOURCE_NAME_LEN)
        size_gb = volume.configuration.size_gb
        volume_id = self.api_client.create_network_volume(
-            name=volume.name,
+            name=volume_name,
            region=volume.configuration.region,
            size=size_gb,
        )
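Note: the `generate_unique_*` helpers are new in `dstack/_internal/core/backends/base/compute.py` (file 20 above, +102 -2), whose body is not shown in this diff. A minimal sketch of the contract the call sites rely on — a readable base name plus a random suffix, capped at `max_length`; the suffix scheme here is an assumption:

import uuid

def generate_unique_name(base: str, max_length: int) -> str:
    suffix = uuid.uuid4().hex[:8]  # hypothetical 8-char random suffix
    # Reserve room for the suffix and a dash, then truncate the base name.
    return f"{base[: max_length - len(suffix) - 1]}-{suffix}"

assert len(generate_unique_name("x" * 100, max_length=60)) <= 60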
dstack/_internal/core/backends/tensordock/compute.py

@@ -4,7 +4,11 @@ from typing import List, Optional
 import requests
 
 from dstack._internal.core.backends.base import Compute
-from dstack._internal.core.backends.base.compute import get_instance_name, get_shim_commands
+from dstack._internal.core.backends.base.compute import (
+    generate_unique_instance_name,
+    get_job_instance_name,
+    get_shim_commands,
+)
 from dstack._internal.core.backends.base.offers import get_catalog_offers
 from dstack._internal.core.backends.tensordock.api_client import TensorDockAPIClient
 from dstack._internal.core.backends.tensordock.config import TensorDockConfig
@@ -23,6 +27,10 @@ from dstack._internal.utils.logging import get_logger
 logger = get_logger(__name__)
 
 
+# Undocumented but names of len 60 work
+MAX_INSTANCE_NAME_LEN = 60
+
+
 class TensorDockCompute(Compute):
     def __init__(self, config: TensorDockConfig):
         super().__init__()
@@ -49,10 +57,13 @@ class TensorDockCompute(Compute):
         instance_offer: InstanceOfferWithAvailability,
         instance_config: InstanceConfiguration,
     ) -> JobProvisioningData:
+        instance_name = generate_unique_instance_name(
+            instance_config, max_length=MAX_INSTANCE_NAME_LEN
+        )
         commands = get_shim_commands(authorized_keys=instance_config.get_public_keys())
         try:
             resp = self.api_client.deploy_single(
-                instance_name=instance_config.instance_name,
+                instance_name=instance_name,
                 instance=instance_offer.instance,
                 cloudinit={
                     "ssh_pwauth": False,  # disable password auth
@@ -113,7 +124,7 @@ class TensorDockCompute(Compute):
     ) -> JobProvisioningData:
         instance_config = InstanceConfiguration(
             project_name=run.project_name,
-            instance_name=get_instance_name(run, job),  # TODO: generate name
+            instance_name=get_job_instance_name(run, job),  # TODO: generate name
             ssh_keys=[
                 SSHKey(public=run.run_spec.ssh_key_pub.strip()),
                 SSHKey(public=project_ssh_public_key.strip()),
dstack/_internal/core/backends/vastai/compute.py

@@ -4,7 +4,10 @@ import gpuhunt
 from gpuhunt.providers.vastai import VastAIProvider
 
 from dstack._internal.core.backends.base import Compute
-from dstack._internal.core.backends.base.compute import get_docker_commands, get_instance_name
+from dstack._internal.core.backends.base.compute import (
+    generate_unique_instance_name_for_job,
+    get_docker_commands,
+)
 from dstack._internal.core.backends.base.offers import get_catalog_offers
 from dstack._internal.core.backends.vastai.api_client import VastAIAPIClient
 from dstack._internal.core.backends.vastai.config import VastAIConfig
@@ -23,6 +26,10 @@ from dstack._internal.utils.logging import get_logger
 logger = get_logger(__name__)
 
 
+# Undocumented but names of len 60 work
+MAX_INSTANCE_NAME_LEN = 60
+
+
 class VastAICompute(Compute):
     def __init__(self, config: VastAIConfig):
         super().__init__()
@@ -70,11 +77,14 @@ class VastAICompute(Compute):
         project_ssh_private_key: str,
         volumes: List[Volume],
     ) -> JobProvisioningData:
+        instance_name = generate_unique_instance_name_for_job(
+            run, job, max_length=MAX_INSTANCE_NAME_LEN
+        )
         commands = get_docker_commands(
             [run.run_spec.ssh_key_pub.strip(), project_ssh_public_key.strip()]
         )
         resp = self.api_client.create_instance(
-            instance_name=get_instance_name(run, job),
+            instance_name=instance_name,
             bundle_id=instance_offer.instance.name,
             image_name=job.job_spec.image_name,
             onstart=" && ".join(commands),
dstack/_internal/core/backends/vultr/api_client.py

@@ -20,7 +20,7 @@ class VultrApiClient:
                 return False
         return True
 
-    def get_instance(self, instance_id: str, plan_type: str):
+    def get_instance(self, instance_id: str, plan_type: str) -> dict:
         if plan_type == "bare-metal":
             response = self._make_request("GET", f"/bare-metals/{instance_id}")
             return response.json()["bare_metal"]
@@ -28,7 +28,7 @@ class VultrApiClient:
         response = self._make_request("GET", f"/instances/{instance_id}")
         return response.json()["instance"]
 
-    def get_vpc_for_region(self, region: str) -> Optional[str]:
+    def get_vpc_for_region(self, region: str) -> Optional[dict]:
         response = self._make_request("GET", "/vpcs?per_page=500")
         vpcs = response.json().get("vpcs", [])
         if vpcs:
@@ -37,7 +37,7 @@ class VultrApiClient:
                     return vpc
         return None
 
-    def create_vpc(self, region: str):
+    def create_vpc(self, region: str) -> dict:
         data = {"region": region, "description": f"dstack-vpc-{region}"}
         response = self._make_request("POST", "/vpcs", data=data)
         return response.json()["vpc"]
dstack/_internal/core/backends/vultr/compute.py

@@ -6,7 +6,8 @@ import requests
 
 from dstack._internal.core.backends.base import Compute
 from dstack._internal.core.backends.base.compute import (
-    get_instance_name,
+    generate_unique_instance_name,
+    get_job_instance_name,
     get_user_data,
 )
 from dstack._internal.core.backends.base.offers import get_catalog_offers
@@ -27,6 +28,8 @@ from dstack._internal.utils.logging import get_logger
 
 logger = get_logger(__name__)
 
+MAX_INSTANCE_NAME_LEN = 64
+
 
 class VultrCompute(Compute):
     def __init__(self, config: VultrConfig):
@@ -62,7 +65,7 @@ class VultrCompute(Compute):
     ) -> JobProvisioningData:
         instance_config = InstanceConfiguration(
             project_name=run.project_name,
-            instance_name=get_instance_name(run, job),
+            instance_name=get_job_instance_name(run, job),
             ssh_keys=[SSHKey(public=project_ssh_public_key.strip())],
             user=run.user,
         )
@@ -71,6 +74,9 @@ class VultrCompute(Compute):
     def create_instance(
         self, instance_offer: InstanceOfferWithAvailability, instance_config: InstanceConfiguration
     ) -> JobProvisioningData:
+        instance_name = generate_unique_instance_name(
+            instance_config, max_length=MAX_INSTANCE_NAME_LEN
+        )
         # create vpc
         vpc = self.api_client.get_vpc_for_region(instance_offer.region)
         if not vpc:
@@ -85,7 +91,7 @@ class VultrCompute(Compute):
         ]
         instance_id = self.api_client.launch_instance(
             region=instance_offer.region,
-            label=instance_config.instance_name,
+            label=instance_name,
             plan=instance_offer.instance.name,
             user_data=get_user_data(
                 authorized_keys=instance_config.get_public_keys(),
dstack/_internal/core/models/backends/aws.py

@@ -32,6 +32,7 @@ class AWSConfigInfo(CoreModel):
     vpc_ids: Optional[Dict[str, str]] = None
     default_vpcs: Optional[bool] = None
     public_ips: Optional[bool] = None
+    iam_instance_profile: Optional[str] = None
     tags: Optional[Dict[str, str]] = None
     os_images: Optional[AWSOSImageConfig] = None
 
@@ -70,6 +71,7 @@ class AWSConfigInfoWithCredsPartial(CoreModel):
     vpc_ids: Optional[Dict[str, str]]
     default_vpcs: Optional[bool]
     public_ips: Optional[bool]
+    iam_instance_profile: Optional[str]
     tags: Optional[Dict[str, str]]
     os_images: Optional["AWSOSImageConfig"]
 
dstack/_internal/core/models/backends/base.py

@@ -15,6 +15,7 @@ class BackendType(str, enum.Enum):
         DATACRUNCH (BackendType): DataCrunch
         KUBERNETES (BackendType): Kubernetes
         LAMBDA (BackendType): Lambda Cloud
+        OCI (BackendType): Oracle Cloud Infrastructure
         RUNPOD (BackendType): Runpod Cloud
         TENSORDOCK (BackendType): TensorDock Marketplace
         VASTAI (BackendType): Vast.ai Marketplace
dstack/_internal/core/models/configurations.py

@@ -31,7 +31,6 @@ class RunConfigurationType(str, Enum):
 
 
 class PythonVersion(str, Enum):
-    PY38 = "3.8"  # TODO(0.19 or earlier): drop 3.8, stop building Docker images with 3.8
     PY39 = "3.9"
     PY310 = "3.10"
     PY311 = "3.11"
dstack/_internal/core/services/__init__.py

@@ -4,5 +4,9 @@ from dstack._internal.core.errors import ServerClientError
 
 
 def validate_dstack_resource_name(resource_name: str):
-    if not re.match("^[a-z][a-z0-9-]{1,40}$", resource_name):
+    if not is_valid_dstack_resource_name(resource_name):
         raise ServerClientError("Resource name should match regex '^[a-z][a-z0-9-]{1,40}$'")
+
+
+def is_valid_dstack_resource_name(resource_name: str) -> bool:
+    return re.match("^[a-z][a-z0-9-]{1,40}$", resource_name) is not None
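Note: the new predicate form lets callers that only need a yes/no answer test a name without catching `ServerClientError`. For illustration:

from dstack._internal.core.services import is_valid_dstack_resource_name

# Names must start with a lowercase letter, use only [a-z0-9-],
# and be 2 to 41 characters long per the regex above.
assert is_valid_dstack_resource_name("my-volume-1")
assert not is_valid_dstack_resource_name("My_Volume")  # uppercase and "_" rejected
assert not is_valid_dstack_resource_name("a")  # too short: at least 2 characters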
dstack/_internal/core/services/configs/__init__.py

@@ -65,6 +65,9 @@ class ConfigManager:
         if len(self.config.projects) == 1:
             self.config.projects[0].default = True
 
+    def list_projects(self):
+        return [project.name for project in self.config.projects]
+
     def delete_project(self, name: str):
         self.config.projects = [p for p in self.config.projects if p.name != name]
 
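Note: `list_projects` presumably feeds the new CLI shell completion added in this release (`dstack/_internal/cli/services/completion.py` in the file list above). A hedged usage sketch; the default config location is an assumption:

from dstack._internal.core.services.configs import ConfigManager

# Reads the local client config (by default under ~/.dstack/ - an assumption
# here) and returns project names only, without exposing credentials.
manager = ConfigManager()
print(manager.list_projects())  # e.g. ["main", "staging"]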
dstack/_internal/server/background/tasks/common.py (new file)

@@ -0,0 +1,22 @@
+from datetime import timedelta
+
+from dstack._internal.core.models.backends.base import BackendType
+
+
+def get_provisioning_timeout(backend_type: BackendType, instance_type_name: str) -> timedelta:
+    """
+    This timeout is used in a few places, but roughly refers to the max time between
+    requesting instance creation and the instance becoming ready to accept jobs.
+    For container-based backends, this also includes the image pulling time.
+    """
+    if backend_type == BackendType.LAMBDA:
+        return timedelta(minutes=30)
+    if backend_type == BackendType.RUNPOD:
+        return timedelta(minutes=20)
+    if backend_type == BackendType.KUBERNETES:
+        return timedelta(minutes=20)
+    if backend_type == BackendType.OCI and instance_type_name.startswith("BM."):
+        return timedelta(minutes=20)
+    if backend_type == BackendType.VULTR and instance_type_name.startswith("vbm"):
+        return timedelta(minutes=55)
+    return timedelta(minutes=10)
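Note: this new module replaces the two near-duplicate timeout tables previously kept in sync by hand in `process_instances.py` and `process_running_jobs.py` (both removed below), and raises the Lambda timeout from 20 to 30 minutes. Usage sketch; the instance type names are hypothetical:

from datetime import timedelta

from dstack._internal.core.models.backends.base import BackendType
from dstack._internal.server.background.tasks.common import get_provisioning_timeout

# Vultr bare-metal plans (names starting with "vbm") get the longest window.
assert get_provisioning_timeout(BackendType.VULTR, "vbm-64c-2048gb") == timedelta(minutes=55)
# Anything without a special case falls through to the 10-minute default.
assert get_provisioning_timeout(BackendType.AWS, "t3.large") == timedelta(minutes=10)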
dstack/_internal/server/background/tasks/process_instances.py

@@ -45,7 +45,6 @@ from dstack._internal.core.models.instances import (
     InstanceOfferWithAvailability,
     InstanceRuntime,
     InstanceStatus,
-    InstanceType,
     RemoteConnectionInfo,
     SSHKey,
 )
@@ -63,6 +62,7 @@ from dstack._internal.core.models.runs import (
     Retry,
 )
 from dstack._internal.core.services.profiles import get_retry
+from dstack._internal.server.background.tasks.common import get_provisioning_timeout
 from dstack._internal.server.db import get_session_ctx
 from dstack._internal.server.models import (
     FleetModel,
@@ -695,7 +695,8 @@ async def _check_instance(instance: InstanceModel) -> None:
 
     if instance.status == InstanceStatus.PROVISIONING and instance.started_at is not None:
         provisioning_deadline = _get_provisioning_deadline(
-            instance, job_provisioning_data.instance_type
+            instance=instance,
+            job_provisioning_data=job_provisioning_data,
         )
         if get_current_datetime() > provisioning_deadline:
             instance.status = InstanceStatus.TERMINATING
@@ -737,7 +738,8 @@ async def _wait_for_instance_provisioning_data(
             instance.name,
         )
         provisioning_deadline = _get_provisioning_deadline(
-            instance, job_provisioning_data.instance_type
+            instance=instance,
+            job_provisioning_data=job_provisioning_data,
         )
         if get_current_datetime() > provisioning_deadline:
             logger.warning(
@@ -959,24 +961,15 @@ def _get_retry_duration_deadline(instance: InstanceModel, retry: Retry) -> datet
 
 
 def _get_provisioning_deadline(
-    instance: InstanceModel, instance_type: InstanceType
+    instance: InstanceModel,
+    job_provisioning_data: JobProvisioningData,
 ) -> datetime.datetime:
-    timeout_interval = _get_instance_timeout_interval(instance.backend, instance_type.name)
+    timeout_interval = get_provisioning_timeout(
+        backend_type=job_provisioning_data.get_base_backend(),
+        instance_type_name=job_provisioning_data.instance_type.name,
+    )
     return instance.started_at.replace(tzinfo=datetime.timezone.utc) + timeout_interval
 
 
-def _get_instance_timeout_interval(
-    backend_type: BackendType, instance_type_name: str
-) -> timedelta:
-    # when changing timeouts, also consider process_running_jobs._get_runner_timeout_interval
-    if backend_type == BackendType.RUNPOD:
-        return timedelta(seconds=1200)
-    if backend_type == BackendType.OCI and instance_type_name.startswith("BM."):
-        return timedelta(seconds=1200)
-    if backend_type == BackendType.VULTR and instance_type_name.startswith("vbm"):
-        return timedelta(seconds=3300)
-    return timedelta(seconds=600)
-
-
 def _ssh_keys_to_pkeys(ssh_keys: list[SSHKey]) -> list[PKey]:
     return [pkey_from_str(sk.private) for sk in ssh_keys if sk.private is not None]
dstack/_internal/server/background/tasks/process_running_jobs.py

@@ -1,5 +1,4 @@
 import asyncio
-from datetime import timedelta
 from typing import Dict, List, Optional
 
 from sqlalchemy import select
@@ -21,6 +20,7 @@ from dstack._internal.core.models.runs import (
     ClusterInfo,
     Job,
     JobProvisioningData,
+    JobRuntimeData,
     JobSpec,
     JobStatus,
     JobTerminationReason,
@@ -28,6 +28,7 @@ from dstack._internal.core.models.runs import (
     RunSpec,
 )
 from dstack._internal.core.models.volumes import InstanceMountPoint, Volume, VolumeMountPoint
+from dstack._internal.server.background.tasks.common import get_provisioning_timeout
 from dstack._internal.server.db import get_session_ctx
 from dstack._internal.server.models import (
     InstanceModel,
@@ -148,6 +149,7 @@ async def _process_running_job(session: AsyncSession, job_model: JobModel):
             jobs=run.jobs,
             replica_num=job.job_spec.replica_num,
             job_provisioning_data=job_provisioning_data,
+            job_runtime_data=job_submission.job_runtime_data,
         )
 
         volumes = await get_job_attached_volumes(
@@ -242,7 +244,7 @@ async def _process_running_job(session: AsyncSession, job_model: JobModel):
 
         if not success:
             # check timeout
-            if job_submission.age > _get_runner_timeout_interval(
+            if job_submission.age > get_provisioning_timeout(
                 backend_type=job_provisioning_data.get_base_backend(),
                 instance_type_name=job_provisioning_data.instance_type.name,
             ):
@@ -671,6 +673,7 @@ def _get_cluster_info(
     jobs: List[Job],
     replica_num: int,
     job_provisioning_data: JobProvisioningData,
+    job_runtime_data: Optional[JobRuntimeData],
 ) -> ClusterInfo:
     job_ips = []
     for job in jobs:
@@ -681,10 +684,13 @@ def _get_cluster_info(
             ).internal_ip
             or ""
         )
+    gpus_per_job = len(job_provisioning_data.instance_type.resources.gpus)
+    if job_runtime_data is not None and job_runtime_data.offer is not None:
+        gpus_per_job = len(job_runtime_data.offer.instance.resources.gpus)
     cluster_info = ClusterInfo(
         job_ips=job_ips,
         master_job_ip=job_ips[0],
-        gpus_per_job=len(job_provisioning_data.instance_type.resources.gpus),
+        gpus_per_job=gpus_per_job,
     )
     return cluster_info
 
@@ -763,16 +769,3 @@ def _submit_job_to_runner(
     # do not log here, because the runner will send a new status
 
     return True
-
-
-def _get_runner_timeout_interval(backend_type: BackendType, instance_type_name: str) -> timedelta:
-    # when changing timeouts, also consider process_instances._get_instance_timeout_interval
-    if backend_type == BackendType.LAMBDA:
-        return timedelta(seconds=1200)
-    if backend_type == BackendType.KUBERNETES:
-        return timedelta(seconds=1200)
-    if backend_type == BackendType.OCI and instance_type_name.startswith("BM."):
-        return timedelta(seconds=1200)
-    if backend_type == BackendType.VULTR and instance_type_name.startswith("vbm"):
-        return timedelta(seconds=3300)
-    return timedelta(seconds=600)
dstack/_internal/server/background/tasks/process_terminating_jobs.py

@@ -11,7 +11,6 @@ from dstack._internal.server.models import (
     JobModel,
     ProjectModel,
     VolumeAttachmentModel,
-    VolumeModel,
 )
 from dstack._internal.server.services.jobs import (
     process_terminating_job,
@@ -86,12 +85,7 @@ async def _process_job(session: AsyncSession, job_model: JobModel):
         .where(InstanceModel.id == job_model.used_instance_id)
         .options(
             joinedload(InstanceModel.project).joinedload(ProjectModel.backends),
-            joinedload(InstanceModel.volume_attachments)
-            .joinedload(VolumeAttachmentModel.volume)
-            .joinedload(VolumeModel.user),
-            joinedload(InstanceModel.volume_attachments)
-            .joinedload(VolumeAttachmentModel.volume)
-            .joinedload(VolumeModel.attachments),
+            joinedload(InstanceModel.volume_attachments).joinedload(VolumeAttachmentModel.volume),
         )
     )
     instance_model = res.unique().scalar()
dstack/_internal/server/routers/logs.py

@@ -24,4 +24,7 @@ async def poll_logs(
     user_project: Tuple[UserModel, ProjectModel] = Depends(ProjectMember()),
 ) -> JobSubmissionLogs:
     _, project = user_project
+    # The runner guarantees logs have different timestamps if throughput < 1k logs / sec.
+    # Otherwise, some logs with duplicated timestamps may be filtered out.
+    # This limitation is imposed by cloud log services that support up to millisecond timestamp resolution.
     return await logs.poll_logs_async(project=project, request=body)
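Note: the 1k logs/sec figure follows from the pigeonhole principle — above 1,000 lines per second, at least two lines must share a timestamp once truncated to milliseconds. A standalone illustration (not dstack code):

from datetime import datetime, timezone

def to_millis(t: datetime) -> datetime:
    return t.replace(microsecond=t.microsecond // 1000 * 1000)

# Two log lines 800 microseconds apart...
t1 = datetime(2024, 1, 1, 12, 0, 0, 500_100, tzinfo=timezone.utc)
t2 = datetime(2024, 1, 1, 12, 0, 0, 500_900, tzinfo=timezone.utc)

# ...collapse to the same millisecond, so a poller resuming from
# "timestamp > last_seen" can silently drop one of them.
assert to_millis(t1) == to_millis(t2)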
dstack/_internal/server/services/backends/configurators/aws.py

@@ -2,6 +2,7 @@ import concurrent.futures
 import json
 from typing import List
 
+import botocore.exceptions
 from boto3.session import Session
 
 from dstack._internal.core.backends.aws import AWSBackend, auth, compute, resources
@@ -35,6 +36,9 @@ from dstack._internal.server.services.backends.configurators.base import (
     Configurator,
     raise_invalid_credentials_error,
 )
+from dstack._internal.utils.logging import get_logger
+
+logger = get_logger(__name__)
 
 REGIONS = [
     ("US East, N. Virginia", "us-east-1"),
@@ -137,7 +141,8 @@ class AWSConfigurator(Configurator):
 
     def _check_config(self, session: Session, config: AWSConfigInfoWithCredsPartial):
         self._check_tags_config(config)
-        self._check_vpc_config(session=session, config=config)
+        self._check_iam_instance_profile_config(session, config)
+        self._check_vpc_config(session, config)
 
     def _check_tags_config(self, config: AWSConfigInfoWithCredsPartial):
         if not config.tags:
@@ -151,6 +156,31 @@ class AWSConfigurator(Configurator):
         except BackendError as e:
             raise ServerClientError(e.args[0])
 
+    def _check_iam_instance_profile_config(
+        self, session: Session, config: AWSConfigInfoWithCredsPartial
+    ):
+        if config.iam_instance_profile is None:
+            return
+        try:
+            iam_client = session.client("iam")
+            iam_client.get_instance_profile(InstanceProfileName=config.iam_instance_profile)
+        except botocore.exceptions.ClientError as e:
+            if e.response["Error"]["Code"] == "NoSuchEntity":
+                raise ServerClientError(
+                    f"IAM instance profile {config.iam_instance_profile} not found"
+                )
+            logger.exception(
+                "Got botocore.exceptions.ClientError when checking iam_instance_profile"
+            )
+            raise ServerClientError(
+                f"Failed to check IAM instance profile {config.iam_instance_profile}"
+            )
+        except Exception:
+            logger.exception("Got exception when checking iam_instance_profile")
+            raise ServerClientError(
+                f"Failed to check IAM instance profile {config.iam_instance_profile}"
+            )
+
     def _check_vpc_config(self, session: Session, config: AWSConfigInfoWithCredsPartial):
         allocate_public_ip = config.public_ips if config.public_ips is not None else True
         use_default_vpcs = config.default_vpcs if config.default_vpcs is not None else True
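Note: `get_instance_profile` is the standard IAM read call, so the new check can be reproduced standalone with boto3; the profile name below is hypothetical and AWS credentials are assumed to be configured:

import boto3
import botocore.exceptions

iam = boto3.Session().client("iam")
try:
    iam.get_instance_profile(InstanceProfileName="dstack-ec2-profile")  # hypothetical name
except botocore.exceptions.ClientError as e:
    if e.response["Error"]["Code"] == "NoSuchEntity":
        print("profile not found")  # dstack surfaces this as a ServerClientError
    else:
        raise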
dstack/_internal/server/services/backends/configurators/gcp.py

@@ -127,10 +127,6 @@ class GCPConfigurator(Configurator):
             _, project_id = auth.authenticate(GCPDefaultCreds())
         except BackendAuthError:
             return []
-
-        if project_id is None:
-            return []
-
         return [
             GCPConfigInfoWithCreds(
                 project_id=project_id,
@@ -152,24 +148,21 @@ class GCPConfigurator(Configurator):
         ):
             raise_invalid_credentials_error(fields=[["creds"]])
         try:
-            credentials, project_id = auth.authenticate(creds=config.creds)
-        except BackendAuthError:
+            credentials, _ = auth.authenticate(creds=config.creds, project_id=config.project_id)
+        except BackendAuthError as e:
+            details = None
+            if len(e.args) > 0:
+                details = e.args[0]
             if is_core_model_instance(config.creds, GCPServiceAccountCreds):
-                raise_invalid_credentials_error(fields=[["creds", "data"]])
+                raise_invalid_credentials_error(fields=[["creds", "data"]], details=details)
             else:
-                raise_invalid_credentials_error(fields=[["creds"]])
-        if (
-            project_id is not None
-            and config.project_id is not None
-            and config.project_id != project_id
-        ):
-            raise ServerClientError(msg="Wrong project_id", fields=[["project_id"]])
-        config_values.project_id = self._get_project_id_element(selected=project_id)
+                raise_invalid_credentials_error(fields=[["creds"]], details=details)
         config_values.regions = self._get_regions_element(
             selected=config.regions or DEFAULT_REGIONS
         )
         if config.project_id is None:
             return config_values
+        config_values.project_id = self._get_project_id_element(selected=config.project_id)
         self._check_config(config=config, credentials=credentials)
         return config_values
 
dstack/_internal/server/services/config.py

@@ -107,6 +107,16 @@ class AWSConfig(CoreModel):
             )
         ),
     ] = None
+    iam_instance_profile: Annotated[
+        Optional[str],
+        Field(
+            description=(
+                "The name of the IAM instance profile to associate with EC2 instances."
+                " You can also specify the IAM role name for roles created via the AWS console."
+                " AWS automatically creates an instance profile and gives it the same name as the role"
+            )
+        ),
+    ] = None
     tags: Annotated[
         Optional[Dict[str, str]],
         Field(description="The tags that will be assigned to resources created by `dstack`"),
@@ -251,7 +261,7 @@ class GCPConfig(CoreModel):
         ),
     ] = None
     vm_service_account: Annotated[
-        Optional[str], Field(description="The service account associated with provisioned VMs")
+        Optional[str], Field(description="The service account to associate with provisioned VMs")
     ] = None
     tags: Annotated[
         Optional[Dict[str, str]],
dstack/_internal/server/services/jobs/__init__.py

@@ -236,13 +236,14 @@ async def process_terminating_job(
         logger.debug("%s: stopping container", fmt(job_model))
         ssh_private_keys = get_instance_ssh_private_keys(instance_model)
         await stop_container(job_model, jpd, ssh_private_keys)
-    volume_models: list[VolumeModel]
     if jrd is not None and jrd.volume_names is not None:
-        volume_models = await list_project_volume_models(
-            session=session, project=instance_model.project, names=jrd.volume_names
-        )
+        volume_names = jrd.volume_names
     else:
-        volume_models = [va.volume for va in instance_model.volume_attachments]
+        # Legacy jobs before job_runtime_data/blocks were introduced
+        volume_names = [va.volume.name for va in instance_model.volume_attachments]
+    volume_models = await list_project_volume_models(
+        session=session, project=instance_model.project, names=volume_names
+    )
     if len(volume_models) > 0:
         logger.info("Detaching volumes: %s", [v.name for v in volume_models])
         all_volumes_detached = await _detach_volumes_from_job_instance(
@@ -302,11 +303,13 @@ async def process_volumes_detaching(
     jpd = get_or_error(get_job_provisioning_data(job_model))
     jrd = get_job_runtime_data(job_model)
     if jrd is not None and jrd.volume_names is not None:
-        volume_models = await list_project_volume_models(
-            session=session, project=instance_model.project, names=jrd.volume_names
-        )
+        volume_names = jrd.volume_names
     else:
-        volume_models = [va.volume for va in instance_model.volume_attachments]
+        # Legacy jobs before job_runtime_data/blocks were introduced
+        volume_names = [va.volume.name for va in instance_model.volume_attachments]
+    volume_models = await list_project_volume_models(
+        session=session, project=instance_model.project, names=volume_names
+    )
     logger.info("Detaching volumes: %s", [v.name for v in volume_models])
     all_volumes_detached = await _detach_volumes_from_job_instance(
         project=instance_model.project,
dstack/_internal/server/services/jobs/configurators/dev.py

@@ -6,8 +6,6 @@ from dstack._internal.core.models.runs import RunSpec
 from dstack._internal.server.services.jobs.configurators.base import JobConfigurator
 from dstack._internal.server.services.jobs.configurators.extensions.vscode import VSCodeDesktop
 
-DEFAULT_MAX_DURATION_SECONDS = 6 * 3600
-
 INSTALL_IPYKERNEL = (
     "(echo pip install ipykernel... && pip install -q --no-cache-dir ipykernel 2> /dev/null) || "
     'echo "no pip, ipykernel was not installed"'
@@ -44,7 +42,7 @@ class DevEnvironmentJobConfigurator(JobConfigurator):
         return False
 
     def _default_max_duration(self) -> Optional[int]:
-        return DEFAULT_MAX_DURATION_SECONDS
+        return None
 
     def _spot_policy(self) -> SpotPolicy:
         return self.run_spec.merged_profile.spot_policy or SpotPolicy.ONDEMAND