dstack 0.18.41__py3-none-any.whl → 0.18.43__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only.
Files changed (97)
  1. dstack/_internal/cli/commands/__init__.py +2 -1
  2. dstack/_internal/cli/commands/apply.py +4 -2
  3. dstack/_internal/cli/commands/attach.py +21 -1
  4. dstack/_internal/cli/commands/completion.py +20 -0
  5. dstack/_internal/cli/commands/delete.py +3 -1
  6. dstack/_internal/cli/commands/fleet.py +2 -1
  7. dstack/_internal/cli/commands/gateway.py +7 -2
  8. dstack/_internal/cli/commands/logs.py +3 -2
  9. dstack/_internal/cli/commands/stats.py +2 -1
  10. dstack/_internal/cli/commands/stop.py +2 -1
  11. dstack/_internal/cli/commands/volume.py +2 -1
  12. dstack/_internal/cli/main.py +6 -0
  13. dstack/_internal/cli/services/completion.py +86 -0
  14. dstack/_internal/cli/services/configurators/run.py +10 -17
  15. dstack/_internal/cli/utils/fleet.py +5 -1
  16. dstack/_internal/cli/utils/volume.py +9 -0
  17. dstack/_internal/core/backends/aws/compute.py +24 -11
  18. dstack/_internal/core/backends/aws/resources.py +3 -3
  19. dstack/_internal/core/backends/azure/compute.py +14 -8
  20. dstack/_internal/core/backends/azure/resources.py +2 -0
  21. dstack/_internal/core/backends/base/compute.py +102 -2
  22. dstack/_internal/core/backends/base/offers.py +7 -1
  23. dstack/_internal/core/backends/cudo/compute.py +8 -4
  24. dstack/_internal/core/backends/datacrunch/compute.py +10 -4
  25. dstack/_internal/core/backends/gcp/auth.py +19 -13
  26. dstack/_internal/core/backends/gcp/compute.py +27 -20
  27. dstack/_internal/core/backends/gcp/resources.py +3 -10
  28. dstack/_internal/core/backends/kubernetes/compute.py +4 -3
  29. dstack/_internal/core/backends/lambdalabs/compute.py +9 -3
  30. dstack/_internal/core/backends/nebius/compute.py +2 -2
  31. dstack/_internal/core/backends/oci/compute.py +10 -4
  32. dstack/_internal/core/backends/runpod/compute.py +11 -4
  33. dstack/_internal/core/backends/tensordock/compute.py +14 -3
  34. dstack/_internal/core/backends/vastai/compute.py +12 -2
  35. dstack/_internal/core/backends/vultr/api_client.py +3 -3
  36. dstack/_internal/core/backends/vultr/compute.py +9 -3
  37. dstack/_internal/core/models/backends/aws.py +2 -0
  38. dstack/_internal/core/models/backends/base.py +1 -0
  39. dstack/_internal/core/models/configurations.py +0 -1
  40. dstack/_internal/core/models/runs.py +3 -3
  41. dstack/_internal/core/models/volumes.py +23 -0
  42. dstack/_internal/core/services/__init__.py +5 -1
  43. dstack/_internal/core/services/configs/__init__.py +3 -0
  44. dstack/_internal/server/background/tasks/common.py +22 -0
  45. dstack/_internal/server/background/tasks/process_instances.py +13 -21
  46. dstack/_internal/server/background/tasks/process_running_jobs.py +13 -16
  47. dstack/_internal/server/background/tasks/process_submitted_jobs.py +12 -7
  48. dstack/_internal/server/background/tasks/process_terminating_jobs.py +7 -2
  49. dstack/_internal/server/background/tasks/process_volumes.py +11 -1
  50. dstack/_internal/server/migrations/versions/a751ef183f27_move_attachment_data_to_volumes_.py +34 -0
  51. dstack/_internal/server/models.py +17 -19
  52. dstack/_internal/server/routers/logs.py +3 -0
  53. dstack/_internal/server/services/backends/configurators/aws.py +31 -1
  54. dstack/_internal/server/services/backends/configurators/gcp.py +8 -15
  55. dstack/_internal/server/services/config.py +11 -1
  56. dstack/_internal/server/services/fleets.py +5 -1
  57. dstack/_internal/server/services/jobs/__init__.py +14 -11
  58. dstack/_internal/server/services/jobs/configurators/dev.py +1 -3
  59. dstack/_internal/server/services/jobs/configurators/task.py +1 -3
  60. dstack/_internal/server/services/logs/__init__.py +78 -0
  61. dstack/_internal/server/services/{logs.py → logs/aws.py} +12 -207
  62. dstack/_internal/server/services/logs/base.py +47 -0
  63. dstack/_internal/server/services/logs/filelog.py +110 -0
  64. dstack/_internal/server/services/logs/gcp.py +165 -0
  65. dstack/_internal/server/services/offers.py +7 -7
  66. dstack/_internal/server/services/pools.py +19 -20
  67. dstack/_internal/server/services/proxy/routers/service_proxy.py +14 -7
  68. dstack/_internal/server/services/runner/client.py +8 -5
  69. dstack/_internal/server/services/volumes.py +68 -9
  70. dstack/_internal/server/settings.py +3 -0
  71. dstack/_internal/server/statics/index.html +1 -1
  72. dstack/_internal/server/statics/{main-ad5150a441de98cd8987.css → main-7510e71dfa9749a4e70e.css} +1 -1
  73. dstack/_internal/server/statics/{main-2ac66bfcbd2e39830b88.js → main-fe8fd9db55df8d10e648.js} +66 -66
  74. dstack/_internal/server/statics/{main-2ac66bfcbd2e39830b88.js.map → main-fe8fd9db55df8d10e648.js.map} +1 -1
  75. dstack/_internal/server/testing/common.py +46 -17
  76. dstack/api/_public/runs.py +1 -1
  77. dstack/version.py +2 -2
  78. {dstack-0.18.41.dist-info → dstack-0.18.43.dist-info}/METADATA +4 -3
  79. {dstack-0.18.41.dist-info → dstack-0.18.43.dist-info}/RECORD +97 -86
  80. tests/_internal/core/backends/base/__init__.py +0 -0
  81. tests/_internal/core/backends/base/test_compute.py +56 -0
  82. tests/_internal/server/background/tasks/test_process_running_jobs.py +2 -1
  83. tests/_internal/server/background/tasks/test_process_submitted_jobs.py +5 -3
  84. tests/_internal/server/background/tasks/test_process_terminating_jobs.py +11 -6
  85. tests/_internal/server/conftest.py +4 -5
  86. tests/_internal/server/routers/test_backends.py +1 -0
  87. tests/_internal/server/routers/test_logs.py +1 -1
  88. tests/_internal/server/routers/test_runs.py +2 -2
  89. tests/_internal/server/routers/test_volumes.py +9 -2
  90. tests/_internal/server/services/runner/test_client.py +22 -3
  91. tests/_internal/server/services/test_logs.py +3 -3
  92. tests/_internal/server/services/test_offers.py +167 -0
  93. tests/_internal/server/services/test_pools.py +105 -1
  94. {dstack-0.18.41.dist-info → dstack-0.18.43.dist-info}/LICENSE.md +0 -0
  95. {dstack-0.18.41.dist-info → dstack-0.18.43.dist-info}/WHEEL +0 -0
  96. {dstack-0.18.41.dist-info → dstack-0.18.43.dist-info}/entry_points.txt +0 -0
  97. {dstack-0.18.41.dist-info → dstack-0.18.43.dist-info}/top_level.txt +0 -0
dstack/_internal/core/backends/nebius/compute.py

@@ -6,7 +6,7 @@ from typing import List, Optional
 import dstack.version as version
 from dstack._internal import settings
 from dstack._internal.core.backends.base import Compute
-from dstack._internal.core.backends.base.compute import get_instance_name, get_user_data
+from dstack._internal.core.backends.base.compute import get_job_instance_name, get_user_data
 from dstack._internal.core.backends.base.offers import get_catalog_offers
 from dstack._internal.core.backends.nebius.api_client import NebiusAPIClient
 from dstack._internal.core.backends.nebius.config import NebiusConfig
@@ -130,7 +130,7 @@ class NebiusCompute(Compute):
     ) -> JobProvisioningData:
         instance_config = InstanceConfiguration(
             project_name=run.project_name,
-            instance_name=get_instance_name(run, job),  # TODO: generate name
+            instance_name=get_job_instance_name(run, job),  # TODO: generate name
             ssh_keys=[
                 SSHKey(public=project_ssh_public_key.strip()),
             ],
dstack/_internal/core/backends/oci/compute.py

@@ -4,7 +4,12 @@ from typing import List, Optional
 
 import oci
 
-from dstack._internal.core.backends.base.compute import Compute, get_instance_name, get_user_data
+from dstack._internal.core.backends.base.compute import (
+    Compute,
+    generate_unique_instance_name,
+    get_job_instance_name,
+    get_user_data,
+)
 from dstack._internal.core.backends.base.offers import get_catalog_offers
 from dstack._internal.core.backends.oci import resources
 from dstack._internal.core.backends.oci.config import OCIConfig
@@ -98,7 +103,7 @@ class OCICompute(Compute):
     ) -> JobProvisioningData:
         instance_config = InstanceConfiguration(
             project_name=run.project_name,
-            instance_name=get_instance_name(run, job),
+            instance_name=get_job_instance_name(run, job),
             ssh_keys=[SSHKey(public=project_ssh_public_key.strip())],
             user=run.user,
         )
@@ -148,6 +153,7 @@ class OCICompute(Compute):
         ]
         cloud_init_user_data = get_user_data(instance_config.get_public_keys(), setup_commands)
 
+        display_name = generate_unique_instance_name(instance_config)
         try:
             instance = resources.launch_instance(
                 region=region,
@@ -155,7 +161,7 @@ class OCICompute(Compute):
                 compartment_id=self.config.compartment_id,
                 subnet_id=subnet.id,
                 security_group_id=security_group.id,
-                display_name=instance_config.instance_name,
+                display_name=display_name,
                 cloud_init_user_data=cloud_init_user_data,
                 shape=instance_offer.instance.name,
                 is_spot=instance_offer.instance.resources.spot,
@@ -163,7 +169,7 @@ class OCICompute(Compute):
                 image_id=package.image_id,
             )
         except oci.exceptions.ServiceError as e:
-            if e.code in ("LimitExceeded", "QuotaExceeded"):
+            if e.code in ("LimitExceeded", "QuotaExceeded") or "Out of host capacity" in e.message:
                 raise NoCapacityError(e.message)
             raise
 
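The hunks above and below reference naming helpers that are new in dstack/_internal/core/backends/base/compute.py (+102 -2 in the file list), whose implementation is not included in this diff: `get_job_instance_name` replaces `get_instance_name`, while `generate_unique_instance_name`, `generate_unique_volume_name`, and `generate_unique_instance_name_for_job` (used by Vast.ai below) produce per-resource names that fit a backend's length limit. A minimal sketch of the idea, assuming the helpers append a random suffix and truncate the base name to fit; the suffix length and separator here are illustrative, not the actual implementation:

```python
import uuid


def _generate_unique_name(base: str, max_length: int) -> str:
    """Illustrative only: derive a unique, length-bounded resource name."""
    suffix = uuid.uuid4().hex[:8]  # short random suffix to make collisions unlikely
    # Reserve room for the separator and the suffix before truncating the base name.
    return f"{base[: max_length - len(suffix) - 1]}-{suffix}"


# e.g. _generate_unique_name("dstack-run-my-task-0", max_length=60)
# could yield "dstack-run-my-task-0-3f9c1a2b"
```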
dstack/_internal/core/backends/runpod/compute.py

@@ -5,8 +5,10 @@ from typing import List, Optional
 
 from dstack._internal.core.backends.base import Compute
 from dstack._internal.core.backends.base.compute import (
+    generate_unique_instance_name,
+    generate_unique_volume_name,
     get_docker_commands,
-    get_instance_name,
+    get_job_instance_name,
 )
 from dstack._internal.core.backends.base.offers import get_catalog_offers
 from dstack._internal.core.backends.runpod.api_client import RunpodApiClient
@@ -31,6 +33,9 @@ from dstack._internal.utils.logging import get_logger
 
 logger = get_logger(__name__)
 
+# Undocumented but names of len 60 work
+MAX_RESOURCE_NAME_LEN = 60
+
 CONTAINER_REGISTRY_AUTH_CLEANUP_INTERVAL = 60 * 60 * 24  # 24 hour
 
 
@@ -69,7 +74,7 @@ class RunpodCompute(Compute):
     ) -> JobProvisioningData:
         instance_config = InstanceConfiguration(
             project_name=run.project_name,
-            instance_name=get_instance_name(run, job),
+            instance_name=get_job_instance_name(run, job),
             ssh_keys=[
                 SSHKey(public=run.run_spec.ssh_key_pub.strip()),
                 SSHKey(public=project_ssh_public_key.strip()),
@@ -77,6 +82,7 @@ class RunpodCompute(Compute):
             user=run.user,
         )
 
+        pod_name = generate_unique_instance_name(instance_config, max_length=MAX_RESOURCE_NAME_LEN)
         authorized_keys = instance_config.get_public_keys()
         memory_size = round(instance_offer.instance.resources.memory_mib / 1024)
         disk_size = round(instance_offer.instance.resources.disk.size_mib / 1024)
@@ -98,7 +104,7 @@ class RunpodCompute(Compute):
         bid_per_gpu = instance_offer.price / gpu_count
 
         resp = self.api_client.create_pod(
-            name=instance_config.instance_name,
+            name=pod_name,
             image_name=job.job_spec.image_name,
             gpu_type_id=instance_offer.instance.name,
             cloud_type="SECURE",  # ["ALL", "COMMUNITY", "SECURE"]:
@@ -197,9 +203,10 @@ class RunpodCompute(Compute):
         )
 
     def create_volume(self, volume: Volume) -> VolumeProvisioningData:
+        volume_name = generate_unique_volume_name(volume, max_length=MAX_RESOURCE_NAME_LEN)
        size_gb = volume.configuration.size_gb
        volume_id = self.api_client.create_network_volume(
-            name=volume.name,
+            name=volume_name,
            region=volume.configuration.region,
            size=size_gb,
        )
dstack/_internal/core/backends/tensordock/compute.py

@@ -4,7 +4,11 @@ from typing import List, Optional
 import requests
 
 from dstack._internal.core.backends.base import Compute
-from dstack._internal.core.backends.base.compute import get_instance_name, get_shim_commands
+from dstack._internal.core.backends.base.compute import (
+    generate_unique_instance_name,
+    get_job_instance_name,
+    get_shim_commands,
+)
 from dstack._internal.core.backends.base.offers import get_catalog_offers
 from dstack._internal.core.backends.tensordock.api_client import TensorDockAPIClient
 from dstack._internal.core.backends.tensordock.config import TensorDockConfig
@@ -23,6 +27,10 @@ from dstack._internal.utils.logging import get_logger
 logger = get_logger(__name__)
 
 
+# Undocumented but names of len 60 work
+MAX_INSTANCE_NAME_LEN = 60
+
+
 class TensorDockCompute(Compute):
     def __init__(self, config: TensorDockConfig):
         super().__init__()
@@ -49,10 +57,13 @@ class TensorDockCompute(Compute):
         instance_offer: InstanceOfferWithAvailability,
         instance_config: InstanceConfiguration,
     ) -> JobProvisioningData:
+        instance_name = generate_unique_instance_name(
+            instance_config, max_length=MAX_INSTANCE_NAME_LEN
+        )
         commands = get_shim_commands(authorized_keys=instance_config.get_public_keys())
         try:
             resp = self.api_client.deploy_single(
-                instance_name=instance_config.instance_name,
+                instance_name=instance_name,
                 instance=instance_offer.instance,
                 cloudinit={
                     "ssh_pwauth": False,  # disable password auth
@@ -113,7 +124,7 @@ class TensorDockCompute(Compute):
     ) -> JobProvisioningData:
         instance_config = InstanceConfiguration(
             project_name=run.project_name,
-            instance_name=get_instance_name(run, job),  # TODO: generate name
+            instance_name=get_job_instance_name(run, job),  # TODO: generate name
             ssh_keys=[
                 SSHKey(public=run.run_spec.ssh_key_pub.strip()),
                 SSHKey(public=project_ssh_public_key.strip()),
dstack/_internal/core/backends/vastai/compute.py

@@ -4,7 +4,10 @@ import gpuhunt
 from gpuhunt.providers.vastai import VastAIProvider
 
 from dstack._internal.core.backends.base import Compute
-from dstack._internal.core.backends.base.compute import get_docker_commands, get_instance_name
+from dstack._internal.core.backends.base.compute import (
+    generate_unique_instance_name_for_job,
+    get_docker_commands,
+)
 from dstack._internal.core.backends.base.offers import get_catalog_offers
 from dstack._internal.core.backends.vastai.api_client import VastAIAPIClient
 from dstack._internal.core.backends.vastai.config import VastAIConfig
@@ -23,6 +26,10 @@ from dstack._internal.utils.logging import get_logger
 logger = get_logger(__name__)
 
 
+# Undocumented but names of len 60 work
+MAX_INSTANCE_NAME_LEN = 60
+
+
 class VastAICompute(Compute):
     def __init__(self, config: VastAIConfig):
         super().__init__()
@@ -70,11 +77,14 @@ class VastAICompute(Compute):
         project_ssh_private_key: str,
         volumes: List[Volume],
     ) -> JobProvisioningData:
+        instance_name = generate_unique_instance_name_for_job(
+            run, job, max_length=MAX_INSTANCE_NAME_LEN
+        )
         commands = get_docker_commands(
             [run.run_spec.ssh_key_pub.strip(), project_ssh_public_key.strip()]
         )
         resp = self.api_client.create_instance(
-            instance_name=get_instance_name(run, job),
+            instance_name=instance_name,
             bundle_id=instance_offer.instance.name,
             image_name=job.job_spec.image_name,
             onstart=" && ".join(commands),
dstack/_internal/core/backends/vultr/api_client.py

@@ -20,7 +20,7 @@ class VultrApiClient:
                 return False
         return True
 
-    def get_instance(self, instance_id: str, plan_type: str):
+    def get_instance(self, instance_id: str, plan_type: str) -> dict:
         if plan_type == "bare-metal":
             response = self._make_request("GET", f"/bare-metals/{instance_id}")
             return response.json()["bare_metal"]
@@ -28,7 +28,7 @@ class VultrApiClient:
         response = self._make_request("GET", f"/instances/{instance_id}")
         return response.json()["instance"]
 
-    def get_vpc_for_region(self, region: str) -> Optional[str]:
+    def get_vpc_for_region(self, region: str) -> Optional[dict]:
         response = self._make_request("GET", "/vpcs?per_page=500")
         vpcs = response.json().get("vpcs", [])
         if vpcs:
@@ -37,7 +37,7 @@ class VultrApiClient:
                     return vpc
         return None
 
-    def create_vpc(self, region: str):
+    def create_vpc(self, region: str) -> dict:
         data = {"region": region, "description": f"dstack-vpc-{region}"}
         response = self._make_request("POST", "/vpcs", data=data)
         return response.json()["vpc"]
dstack/_internal/core/backends/vultr/compute.py

@@ -6,7 +6,8 @@ import requests
 
 from dstack._internal.core.backends.base import Compute
 from dstack._internal.core.backends.base.compute import (
-    get_instance_name,
+    generate_unique_instance_name,
+    get_job_instance_name,
     get_user_data,
 )
 from dstack._internal.core.backends.base.offers import get_catalog_offers
@@ -27,6 +28,8 @@ from dstack._internal.utils.logging import get_logger
 
 logger = get_logger(__name__)
 
+MAX_INSTANCE_NAME_LEN = 64
+
 
 class VultrCompute(Compute):
     def __init__(self, config: VultrConfig):
@@ -62,7 +65,7 @@ class VultrCompute(Compute):
     ) -> JobProvisioningData:
         instance_config = InstanceConfiguration(
             project_name=run.project_name,
-            instance_name=get_instance_name(run, job),
+            instance_name=get_job_instance_name(run, job),
             ssh_keys=[SSHKey(public=project_ssh_public_key.strip())],
             user=run.user,
         )
@@ -71,6 +74,9 @@ class VultrCompute(Compute):
     def create_instance(
         self, instance_offer: InstanceOfferWithAvailability, instance_config: InstanceConfiguration
    ) -> JobProvisioningData:
+        instance_name = generate_unique_instance_name(
+            instance_config, max_length=MAX_INSTANCE_NAME_LEN
+        )
         # create vpc
         vpc = self.api_client.get_vpc_for_region(instance_offer.region)
         if not vpc:
@@ -85,7 +91,7 @@ class VultrCompute(Compute):
         ]
         instance_id = self.api_client.launch_instance(
             region=instance_offer.region,
-            label=instance_config.instance_name,
+            label=instance_name,
             plan=instance_offer.instance.name,
             user_data=get_user_data(
                 authorized_keys=instance_config.get_public_keys(),
dstack/_internal/core/models/backends/aws.py

@@ -32,6 +32,7 @@ class AWSConfigInfo(CoreModel):
     vpc_ids: Optional[Dict[str, str]] = None
     default_vpcs: Optional[bool] = None
     public_ips: Optional[bool] = None
+    iam_instance_profile: Optional[str] = None
     tags: Optional[Dict[str, str]] = None
     os_images: Optional[AWSOSImageConfig] = None
 
@@ -70,6 +71,7 @@ class AWSConfigInfoWithCredsPartial(CoreModel):
     vpc_ids: Optional[Dict[str, str]]
     default_vpcs: Optional[bool]
     public_ips: Optional[bool]
+    iam_instance_profile: Optional[str]
     tags: Optional[Dict[str, str]]
     os_images: Optional["AWSOSImageConfig"]
 
dstack/_internal/core/models/backends/base.py

@@ -15,6 +15,7 @@ class BackendType(str, enum.Enum):
         DATACRUNCH (BackendType): DataCrunch
         KUBERNETES (BackendType): Kubernetes
         LAMBDA (BackendType): Lambda Cloud
+        OCI (BackendType): Oracle Cloud Infrastructure
         RUNPOD (BackendType): Runpod Cloud
         TENSORDOCK (BackendType): TensorDock Marketplace
         VASTAI (BackendType): Vast.ai Marketplace
dstack/_internal/core/models/configurations.py

@@ -31,7 +31,6 @@ class RunConfigurationType(str, Enum):
 
 
 class PythonVersion(str, Enum):
-    PY38 = "3.8"  # TODO(0.19 or earlier): drop 3.8, stop building Docker images with 3.8
     PY39 = "3.9"
     PY310 = "3.10"
     PY311 = "3.11"
dstack/_internal/core/models/runs.py

@@ -150,9 +150,9 @@ class JobTerminationReason(str, Enum):
 class Requirements(CoreModel):
     # TODO: Make requirements' fields required
     resources: ResourcesSpec
-    max_price: Optional[float]
-    spot: Optional[bool]
-    reservation: Optional[str]
+    max_price: Optional[float] = None
+    spot: Optional[bool] = None
+    reservation: Optional[str] = None
 
     def pretty_format(self, resources_only: bool = False):
         res = self.resources.pretty_format()
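The explicit `= None` defaults matter because pydantic v1 treats a bare `Optional[...]` annotation as implicitly defaulting to `None`, while pydantic v2 considers such a field required. Spelling the default out keeps `Requirements` constructible without these fields under either version. A minimal illustration, using plain `pydantic.BaseModel` as a stand-in for dstack's `CoreModel` and omitting the required `resources` field:

```python
from typing import Optional

from pydantic import BaseModel


class Requirements(BaseModel):
    max_price: Optional[float] = None  # explicit default: field may be omitted
    spot: Optional[bool] = None
    reservation: Optional[str] = None


# Under pydantic v2, omitting these fields would raise a validation error
# without the explicit "= None"; with it, construction succeeds either way.
req = Requirements()
assert req.spot is None
```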
dstack/_internal/core/models/volumes.py

@@ -71,6 +71,18 @@ class VolumeAttachmentData(CoreModel):
     device_name: Optional[str] = None
 
 
+class VolumeInstance(CoreModel):
+    name: str
+    fleet_name: Optional[str] = None
+    instance_num: int
+    instance_id: Optional[str] = None
+
+
+class VolumeAttachment(CoreModel):
+    instance: VolumeInstance
+    attachment_data: Optional[VolumeAttachmentData] = None
+
+
 class Volume(CoreModel):
     id: uuid.UUID
     name: str
@@ -86,8 +98,19 @@ class Volume(CoreModel):
     deleted: bool
     volume_id: Optional[str] = None  # id of the volume in the cloud
     provisioning_data: Optional[VolumeProvisioningData] = None
+    attachments: Optional[List[VolumeAttachment]] = None
+    # attachment_data is deprecated in favor of attachments.
+    # It's only set for volumes that were attached before attachments.
     attachment_data: Optional[VolumeAttachmentData] = None
 
+    def get_attachment_data_for_instance(self, instance_id: str) -> Optional[VolumeAttachmentData]:
+        if self.attachments is not None:
+            for attachment in self.attachments:
+                if attachment.instance.instance_id == instance_id:
+                    return attachment.attachment_data
+        # volume was attached before attachments were introduced
+        return self.attachment_data
+
 
 class VolumePlan(CoreModel):
     project_name: str
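With the new `attachments` list, a volume can record one attachment per instance, and callers resolve attachment data per instance instead of reading the single deprecated `attachment_data` field; the method above falls back to the old field for volumes attached before `attachments` existed. A hypothetical lookup (the instance ID and device name are illustrative):

```python
# volume: a Volume model obtained from the dstack API
attachment_data = volume.get_attachment_data_for_instance("i-0abc123def456")
if attachment_data is not None and attachment_data.device_name is not None:
    print(f"attached as {attachment_data.device_name}")  # e.g. /dev/xvdf on AWS
else:
    print("volume is not attached to this instance")
```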
dstack/_internal/core/services/__init__.py

@@ -4,5 +4,9 @@ from dstack._internal.core.errors import ServerClientError
 
 
 def validate_dstack_resource_name(resource_name: str):
-    if not re.match("^[a-z][a-z0-9-]{1,40}$", resource_name):
+    if not is_valid_dstack_resource_name(resource_name):
         raise ServerClientError("Resource name should match regex '^[a-z][a-z0-9-]{1,40}$'")
+
+
+def is_valid_dstack_resource_name(resource_name: str) -> bool:
+    return re.match("^[a-z][a-z0-9-]{1,40}$", resource_name) is not None
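Extracting the predicate lets callers that only need a boolean check a name without catching `ServerClientError`. The regex requires a lowercase letter first, then 1-40 characters from `[a-z0-9-]`, so valid names are 2-41 characters long:

```python
from dstack._internal.core.services import is_valid_dstack_resource_name

assert is_valid_dstack_resource_name("my-volume-1")
assert not is_valid_dstack_resource_name("MyVolume")   # uppercase is rejected
assert not is_valid_dstack_resource_name("1-volume")   # must start with a letter
assert not is_valid_dstack_resource_name("x")          # too short: minimum 2 chars
```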
dstack/_internal/core/services/configs/__init__.py

@@ -65,6 +65,9 @@ class ConfigManager:
         if len(self.config.projects) == 1:
             self.config.projects[0].default = True
 
+    def list_projects(self):
+        return [project.name for project in self.config.projects]
+
     def delete_project(self, name: str):
         self.config.projects = [p for p in self.config.projects if p.name != name]
 
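`list_projects` exposes the configured project names; plausibly it backs the new shell completion added in dstack/_internal/cli/services/completion.py (+86), though that wiring is not shown in this section. A hypothetical usage sketch (constructor arguments omitted; default config location assumed):

```python
from dstack._internal.core.services.configs import ConfigManager

manager = ConfigManager()  # assumed defaults; constructor details are not shown in this diff
for name in manager.list_projects():
    print(name)
```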
dstack/_internal/server/background/tasks/common.py (new file)

@@ -0,0 +1,22 @@
+from datetime import timedelta
+
+from dstack._internal.core.models.backends.base import BackendType
+
+
+def get_provisioning_timeout(backend_type: BackendType, instance_type_name: str) -> timedelta:
+    """
+    This timeout is used in a few places, but roughly refers to the max time between
+    requesting instance creation and the instance becoming ready to accept jobs.
+    For container-based backends, this also includes the image pulling time.
+    """
+    if backend_type == BackendType.LAMBDA:
+        return timedelta(minutes=30)
+    if backend_type == BackendType.RUNPOD:
+        return timedelta(minutes=20)
+    if backend_type == BackendType.KUBERNETES:
+        return timedelta(minutes=20)
+    if backend_type == BackendType.OCI and instance_type_name.startswith("BM."):
+        return timedelta(minutes=20)
+    if backend_type == BackendType.VULTR and instance_type_name.startswith("vbm"):
+        return timedelta(minutes=55)
+    return timedelta(minutes=10)
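This new module consolidates the two near-duplicate timeout tables that this diff removes from process_instances.py and process_running_jobs.py (both below), and it raises the Lambda budget from 20 to 30 minutes. Usage mirrors the call sites; the instance type names here are illustrative:

```python
from datetime import timedelta

from dstack._internal.core.models.backends.base import BackendType
from dstack._internal.server.background.tasks.common import get_provisioning_timeout

# Vultr bare-metal plans (names starting with "vbm") get the longest budget:
assert get_provisioning_timeout(BackendType.VULTR, "vbm-24c-256gb-amd") == timedelta(minutes=55)
# Backends without a special case fall through to the 10-minute default:
assert get_provisioning_timeout(BackendType.AWS, "m5.large") == timedelta(minutes=10)
```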
dstack/_internal/server/background/tasks/process_instances.py

@@ -45,7 +45,6 @@ from dstack._internal.core.models.instances import (
     InstanceOfferWithAvailability,
     InstanceRuntime,
     InstanceStatus,
-    InstanceType,
     RemoteConnectionInfo,
     SSHKey,
 )
@@ -63,6 +62,7 @@ from dstack._internal.core.models.runs import (
     Retry,
 )
 from dstack._internal.core.services.profiles import get_retry
+from dstack._internal.server.background.tasks.common import get_provisioning_timeout
 from dstack._internal.server.db import get_session_ctx
 from dstack._internal.server.models import (
     FleetModel,
@@ -507,9 +507,9 @@ async def _create_instance(session: AsyncSession, instance: InstanceModel) -> No
         project=instance.project,
         profile=profile,
         requirements=requirements,
-        exclude_not_available=True,
         fleet_model=instance.fleet,
         blocks="auto" if instance.total_blocks is None else instance.total_blocks,
+        exclude_not_available=True,
     )
 
     if not offers and should_retry:
@@ -695,7 +695,8 @@ async def _check_instance(instance: InstanceModel) -> None:
 
     if instance.status == InstanceStatus.PROVISIONING and instance.started_at is not None:
         provisioning_deadline = _get_provisioning_deadline(
-            instance, job_provisioning_data.instance_type
+            instance=instance,
+            job_provisioning_data=job_provisioning_data,
         )
         if get_current_datetime() > provisioning_deadline:
             instance.status = InstanceStatus.TERMINATING
@@ -737,7 +738,8 @@ async def _wait_for_instance_provisioning_data(
                 instance.name,
             )
         provisioning_deadline = _get_provisioning_deadline(
-            instance, job_provisioning_data.instance_type
+            instance=instance,
+            job_provisioning_data=job_provisioning_data,
        )
         if get_current_datetime() > provisioning_deadline:
             logger.warning(
@@ -915,9 +917,8 @@ def _get_instance_offer_for_instance(
         instance_offer.availability_zones = [
             z
             for z in instance_offer.availability_zones
-            if instance_offer.availability_zones == master_job_provisioning_data.availability_zone
+            if z == master_job_provisioning_data.availability_zone
         ]
-
     return instance_offer
 
 
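The condition change in the last hunk is a genuine bug fix: the old comprehension compared the entire `availability_zones` list to a single zone string, which is never true, so the filter always yielded an empty list. Comparing each element `z` keeps only the master job's zone, as intended:

```python
availability_zones = ["us-east-1a", "us-east-1b"]  # illustrative values
master_zone = "us-east-1b"

# Old condition: a list never equals a string, so nothing survives the filter.
assert [z for z in availability_zones if availability_zones == master_zone] == []

# Fixed condition: only the master job's zone remains.
assert [z for z in availability_zones if z == master_zone] == ["us-east-1b"]
```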
@@ -960,24 +961,15 @@ def _get_retry_duration_deadline(instance: InstanceModel, retry: Retry) -> datet
 
 
 def _get_provisioning_deadline(
-    instance: InstanceModel, instance_type: InstanceType
+    instance: InstanceModel,
+    job_provisioning_data: JobProvisioningData,
 ) -> datetime.datetime:
-    timeout_interval = _get_instance_timeout_interval(instance.backend, instance_type.name)
+    timeout_interval = get_provisioning_timeout(
+        backend_type=job_provisioning_data.get_base_backend(),
+        instance_type_name=job_provisioning_data.instance_type.name,
+    )
     return instance.started_at.replace(tzinfo=datetime.timezone.utc) + timeout_interval
 
 
-def _get_instance_timeout_interval(
-    backend_type: BackendType, instance_type_name: str
-) -> timedelta:
-    # when changing timeouts, also consider process_running_jobs._get_runner_timeout_interval
-    if backend_type == BackendType.RUNPOD:
-        return timedelta(seconds=1200)
-    if backend_type == BackendType.OCI and instance_type_name.startswith("BM."):
-        return timedelta(seconds=1200)
-    if backend_type == BackendType.VULTR and instance_type_name.startswith("vbm"):
-        return timedelta(seconds=3300)
-    return timedelta(seconds=600)
-
-
 def _ssh_keys_to_pkeys(ssh_keys: list[SSHKey]) -> list[PKey]:
     return [pkey_from_str(sk.private) for sk in ssh_keys if sk.private is not None]
dstack/_internal/server/background/tasks/process_running_jobs.py

@@ -1,5 +1,4 @@
 import asyncio
-from datetime import timedelta
 from typing import Dict, List, Optional
 
 from sqlalchemy import select
@@ -21,6 +20,7 @@ from dstack._internal.core.models.runs import (
     ClusterInfo,
     Job,
     JobProvisioningData,
+    JobRuntimeData,
     JobSpec,
     JobStatus,
     JobTerminationReason,
@@ -28,6 +28,7 @@ from dstack._internal.core.models.runs import (
     RunSpec,
 )
 from dstack._internal.core.models.volumes import InstanceMountPoint, Volume, VolumeMountPoint
+from dstack._internal.server.background.tasks.common import get_provisioning_timeout
 from dstack._internal.server.db import get_session_ctx
 from dstack._internal.server.models import (
     InstanceModel,
@@ -148,6 +149,7 @@ async def _process_running_job(session: AsyncSession, job_model: JobModel):
             jobs=run.jobs,
             replica_num=job.job_spec.replica_num,
             job_provisioning_data=job_provisioning_data,
+            job_runtime_data=job_submission.job_runtime_data,
         )
 
         volumes = await get_job_attached_volumes(
@@ -205,6 +207,7 @@ async def _process_running_job(session: AsyncSession, job_model: JobModel):
                 None,
                 run,
                 job_model,
+                job_provisioning_data,
                 volumes,
                 secrets,
                 job.job_spec.registry_auth,
@@ -241,7 +244,7 @@ async def _process_running_job(session: AsyncSession, job_model: JobModel):
 
         if not success:
             # check timeout
-            if job_submission.age > _get_runner_timeout_interval(
+            if job_submission.age > get_provisioning_timeout(
                 backend_type=job_provisioning_data.get_base_backend(),
                 instance_type_name=job_provisioning_data.instance_type.name,
             ):
@@ -376,6 +379,7 @@ def _process_provisioning_with_shim(
     ports: Dict[int, int],
     run: Run,
     job_model: JobModel,
+    job_provisioning_data: JobProvisioningData,
     volumes: List[Volume],
     secrets: Dict[str, str],
     registry_auth: Optional[RegistryAuth],
@@ -459,6 +463,7 @@ def _process_provisioning_with_shim(
             host_ssh_user=ssh_user,
             host_ssh_keys=[ssh_key] if ssh_key else [],
             container_ssh_keys=public_keys,
+            instance_id=job_provisioning_data.instance_id,
         )
     else:
         submitted = shim_client.submit(
@@ -475,6 +480,7 @@ def _process_provisioning_with_shim(
             mounts=volume_mounts,
             volumes=volumes,
             instance_mounts=instance_mounts,
+            instance_id=job_provisioning_data.instance_id,
         )
         if not submitted:
             # This can happen when we lost connection to the runner (e.g., network issues), marked
@@ -667,6 +673,7 @@ def _get_cluster_info(
     jobs: List[Job],
     replica_num: int,
     job_provisioning_data: JobProvisioningData,
+    job_runtime_data: Optional[JobRuntimeData],
 ) -> ClusterInfo:
     job_ips = []
     for job in jobs:
@@ -677,10 +684,13 @@ def _get_cluster_info(
             ).internal_ip
             or ""
         )
+    gpus_per_job = len(job_provisioning_data.instance_type.resources.gpus)
+    if job_runtime_data is not None and job_runtime_data.offer is not None:
+        gpus_per_job = len(job_runtime_data.offer.instance.resources.gpus)
     cluster_info = ClusterInfo(
         job_ips=job_ips,
         master_job_ip=job_ips[0],
-        gpus_per_job=len(job_provisioning_data.instance_type.resources.gpus),
+        gpus_per_job=gpus_per_job,
     )
     return cluster_info
 
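`gpus_per_job` previously always counted the whole instance's GPUs. With blocks (see the `blocks=` offer argument in process_instances.py above), a job may own only a slice of an instance, and that slice is described by the offer carried in `JobRuntimeData`, so the per-job offer now takes precedence when present. Illustrative stand-in models showing the effect:

```python
from dataclasses import dataclass
from typing import List, Optional


@dataclass
class Resources:  # stand-in for dstack's resources model
    gpus: List[str]


instance_gpus = Resources(gpus=["H100"] * 8)  # the whole instance
offer_gpus: Optional[Resources] = Resources(gpus=["H100"] * 4)  # this job's block

gpus_per_job = len(instance_gpus.gpus)
if offer_gpus is not None:
    gpus_per_job = len(offer_gpus.gpus)
assert gpus_per_job == 4  # a half-instance block reports 4 GPUs, not 8
```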
@@ -759,16 +769,3 @@ def _submit_job_to_runner(
     # do not log here, because the runner will send a new status
 
     return True
-
-
-def _get_runner_timeout_interval(backend_type: BackendType, instance_type_name: str) -> timedelta:
-    # when changing timeouts, also consider process_instances._get_instance_timeout_interval
-    if backend_type == BackendType.LAMBDA:
-        return timedelta(seconds=1200)
-    if backend_type == BackendType.KUBERNETES:
-        return timedelta(seconds=1200)
-    if backend_type == BackendType.OCI and instance_type_name.startswith("BM."):
-        return timedelta(seconds=1200)
-    if backend_type == BackendType.VULTR and instance_type_name.startswith("vbm"):
-        return timedelta(seconds=3300)
-    return timedelta(seconds=600)