dstack 0.18.42__py3-none-any.whl → 0.18.43__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (80)
  1. dstack/_internal/cli/commands/__init__.py +2 -1
  2. dstack/_internal/cli/commands/apply.py +4 -2
  3. dstack/_internal/cli/commands/attach.py +21 -1
  4. dstack/_internal/cli/commands/completion.py +20 -0
  5. dstack/_internal/cli/commands/delete.py +3 -1
  6. dstack/_internal/cli/commands/fleet.py +2 -1
  7. dstack/_internal/cli/commands/gateway.py +7 -2
  8. dstack/_internal/cli/commands/logs.py +3 -2
  9. dstack/_internal/cli/commands/stats.py +2 -1
  10. dstack/_internal/cli/commands/stop.py +2 -1
  11. dstack/_internal/cli/commands/volume.py +2 -1
  12. dstack/_internal/cli/main.py +6 -0
  13. dstack/_internal/cli/services/completion.py +86 -0
  14. dstack/_internal/cli/services/configurators/run.py +10 -17
  15. dstack/_internal/cli/utils/fleet.py +5 -1
  16. dstack/_internal/core/backends/aws/compute.py +22 -10
  17. dstack/_internal/core/backends/aws/resources.py +3 -3
  18. dstack/_internal/core/backends/azure/compute.py +14 -8
  19. dstack/_internal/core/backends/azure/resources.py +2 -0
  20. dstack/_internal/core/backends/base/compute.py +102 -2
  21. dstack/_internal/core/backends/base/offers.py +7 -1
  22. dstack/_internal/core/backends/cudo/compute.py +8 -4
  23. dstack/_internal/core/backends/datacrunch/compute.py +10 -4
  24. dstack/_internal/core/backends/gcp/auth.py +19 -13
  25. dstack/_internal/core/backends/gcp/compute.py +25 -19
  26. dstack/_internal/core/backends/gcp/resources.py +3 -10
  27. dstack/_internal/core/backends/kubernetes/compute.py +4 -3
  28. dstack/_internal/core/backends/lambdalabs/compute.py +9 -3
  29. dstack/_internal/core/backends/nebius/compute.py +2 -2
  30. dstack/_internal/core/backends/oci/compute.py +10 -4
  31. dstack/_internal/core/backends/runpod/compute.py +11 -4
  32. dstack/_internal/core/backends/tensordock/compute.py +14 -3
  33. dstack/_internal/core/backends/vastai/compute.py +12 -2
  34. dstack/_internal/core/backends/vultr/api_client.py +3 -3
  35. dstack/_internal/core/backends/vultr/compute.py +9 -3
  36. dstack/_internal/core/models/backends/aws.py +2 -0
  37. dstack/_internal/core/models/backends/base.py +1 -0
  38. dstack/_internal/core/models/configurations.py +0 -1
  39. dstack/_internal/core/services/__init__.py +5 -1
  40. dstack/_internal/core/services/configs/__init__.py +3 -0
  41. dstack/_internal/server/background/tasks/common.py +22 -0
  42. dstack/_internal/server/background/tasks/process_instances.py +11 -18
  43. dstack/_internal/server/background/tasks/process_running_jobs.py +9 -16
  44. dstack/_internal/server/background/tasks/process_terminating_jobs.py +1 -7
  45. dstack/_internal/server/routers/logs.py +3 -0
  46. dstack/_internal/server/services/backends/configurators/aws.py +31 -1
  47. dstack/_internal/server/services/backends/configurators/gcp.py +8 -15
  48. dstack/_internal/server/services/config.py +11 -1
  49. dstack/_internal/server/services/jobs/__init__.py +12 -9
  50. dstack/_internal/server/services/jobs/configurators/dev.py +1 -3
  51. dstack/_internal/server/services/jobs/configurators/task.py +1 -3
  52. dstack/_internal/server/services/logs/__init__.py +78 -0
  53. dstack/_internal/server/services/{logs.py → logs/aws.py} +12 -207
  54. dstack/_internal/server/services/logs/base.py +47 -0
  55. dstack/_internal/server/services/logs/filelog.py +110 -0
  56. dstack/_internal/server/services/logs/gcp.py +165 -0
  57. dstack/_internal/server/services/pools.py +16 -17
  58. dstack/_internal/server/services/proxy/routers/service_proxy.py +14 -7
  59. dstack/_internal/server/settings.py +3 -0
  60. dstack/_internal/server/statics/index.html +1 -1
  61. dstack/_internal/server/statics/{main-ad5150a441de98cd8987.css → main-7510e71dfa9749a4e70e.css} +1 -1
  62. dstack/_internal/server/statics/{main-2ac66bfcbd2e39830b88.js → main-fe8fd9db55df8d10e648.js} +66 -66
  63. dstack/_internal/server/statics/{main-2ac66bfcbd2e39830b88.js.map → main-fe8fd9db55df8d10e648.js.map} +1 -1
  64. dstack/_internal/server/testing/common.py +33 -8
  65. dstack/api/_public/runs.py +1 -1
  66. dstack/version.py +2 -2
  67. {dstack-0.18.42.dist-info → dstack-0.18.43.dist-info}/METADATA +4 -3
  68. {dstack-0.18.42.dist-info → dstack-0.18.43.dist-info}/RECORD +80 -71
  69. tests/_internal/core/backends/base/__init__.py +0 -0
  70. tests/_internal/core/backends/base/test_compute.py +56 -0
  71. tests/_internal/server/background/tasks/test_process_running_jobs.py +1 -1
  72. tests/_internal/server/conftest.py +4 -5
  73. tests/_internal/server/routers/test_backends.py +1 -0
  74. tests/_internal/server/routers/test_logs.py +1 -1
  75. tests/_internal/server/routers/test_runs.py +2 -2
  76. tests/_internal/server/services/test_logs.py +3 -3
  77. {dstack-0.18.42.dist-info → dstack-0.18.43.dist-info}/LICENSE.md +0 -0
  78. {dstack-0.18.42.dist-info → dstack-0.18.43.dist-info}/WHEEL +0 -0
  79. {dstack-0.18.42.dist-info → dstack-0.18.43.dist-info}/entry_points.txt +0 -0
  80. {dstack-0.18.42.dist-info → dstack-0.18.43.dist-info}/top_level.txt +0 -0
@@ -6,6 +6,8 @@ from azure.mgmt.network.models import Subnet
6
6
 
7
7
  from dstack._internal.core.errors import BackendError
8
8
 
9
+ MAX_RESOURCE_NAME_LEN = 64
10
+
9
11
 
10
12
  def get_network_subnets(
11
13
  network_client: network_mgmt.NetworkManagementClient,
@@ -1,5 +1,7 @@
1
1
  import os
2
+ import random
2
3
  import re
4
+ import string
3
5
  import threading
4
6
  from abc import ABC, abstractmethod
5
7
  from functools import lru_cache
@@ -31,6 +33,7 @@ from dstack._internal.core.models.volumes import (
31
33
  VolumeAttachmentData,
32
34
  VolumeProvisioningData,
33
35
  )
36
+ from dstack._internal.core.services import is_valid_dstack_resource_name
34
37
  from dstack._internal.utils.logging import get_logger
35
38
 
36
39
  logger = get_logger(__name__)
@@ -209,8 +212,105 @@ class Compute(ABC):
209
212
  return self.get_offers(requirements)
210
213
 
211
214
 
212
- def get_instance_name(run: Run, job: Job) -> str:
213
- return f"{run.project_name.lower()}-{job.job_spec.job_name}"
215
+ def get_job_instance_name(run: Run, job: Job) -> str:
216
+ return job.job_spec.job_name
217
+
218
+
219
+ _DEFAULT_MAX_RESOURCE_NAME_LEN = 60
220
+ _CLOUD_RESOURCE_SUFFIX_LEN = 8
221
+
222
+
223
+ def generate_unique_instance_name(
224
+ instance_configuration: InstanceConfiguration,
225
+ max_length: int = _DEFAULT_MAX_RESOURCE_NAME_LEN,
226
+ ) -> str:
227
+ """
228
+ Generates a unique instance name valid across all backends.
229
+ """
230
+ return generate_unique_backend_name(
231
+ resource_name=instance_configuration.instance_name,
232
+ project_name=instance_configuration.project_name,
233
+ max_length=max_length,
234
+ )
235
+
236
+
237
+ def generate_unique_instance_name_for_job(
238
+ run: Run,
239
+ job: Job,
240
+ max_length: int = _DEFAULT_MAX_RESOURCE_NAME_LEN,
241
+ ) -> str:
242
+ """
243
+ Generates a unique instance name for a job valid across all backends.
244
+ """
245
+ return generate_unique_backend_name(
246
+ resource_name=get_job_instance_name(run, job),
247
+ project_name=run.project_name,
248
+ max_length=max_length,
249
+ )
250
+
251
+
252
+ def generate_unique_gateway_instance_name(
253
+ gateway_compute_configuration: GatewayComputeConfiguration,
254
+ max_length: int = _DEFAULT_MAX_RESOURCE_NAME_LEN,
255
+ ) -> str:
256
+ """
257
+ Generates a unique gateway instance name valid across all backends.
258
+ """
259
+ return generate_unique_backend_name(
260
+ resource_name=gateway_compute_configuration.instance_name,
261
+ project_name=gateway_compute_configuration.project_name,
262
+ max_length=max_length,
263
+ )
264
+
265
+
266
+ def generate_unique_volume_name(
267
+ volume: Volume,
268
+ max_length: int = _DEFAULT_MAX_RESOURCE_NAME_LEN,
269
+ ) -> str:
270
+ """
271
+ Generates a unique volume name valid across all backends.
272
+ """
273
+ return generate_unique_backend_name(
274
+ resource_name=volume.name,
275
+ project_name=volume.project_name,
276
+ max_length=max_length,
277
+ )
278
+
279
+
280
+ def generate_unique_backend_name(
281
+ resource_name: str,
282
+ project_name: Optional[str],
283
+ max_length: int,
284
+ ) -> str:
285
+ """
286
+ Generates a unique resource name valid across all backends.
287
+ Backend resource names must be unique on every provisioning so that
288
+ resource re-submission/re-creation doesn't lead to conflicts
289
+ on backends that require unique names (e.g. Azure, GCP).
290
+ """
291
+ # resource_name is guaranteed to be valid in all backends
292
+ prefix = f"dstack-{resource_name}"
293
+ if project_name is not None and is_valid_dstack_resource_name(project_name):
294
+ # project_name is not guaranteed to be valid in all backends,
295
+ # so we add it only if it passes the validation
296
+ prefix = f"dstack-{project_name}-{resource_name}"
297
+ return _generate_unique_backend_name_with_prefix(
298
+ prefix=prefix,
299
+ max_length=max_length,
300
+ )
301
+
302
+
303
+ def _generate_unique_backend_name_with_prefix(
304
+ prefix: str,
305
+ max_length: int,
306
+ ) -> str:
307
+ prefix_len = max_length - _CLOUD_RESOURCE_SUFFIX_LEN - 1
308
+ prefix = prefix[:prefix_len]
309
+ suffix = "".join(
310
+ random.choice(string.ascii_lowercase + string.digits)
311
+ for _ in range(_CLOUD_RESOURCE_SUFFIX_LEN)
312
+ )
313
+ return f"{prefix}-{suffix}"
214
314
 
215
315
 
216
316
  def get_cloud_config(**config) -> str:
@@ -14,6 +14,12 @@ from dstack._internal.core.models.instances import (
14
14
  from dstack._internal.core.models.resources import DEFAULT_DISK, Memory, Range
15
15
  from dstack._internal.core.models.runs import Requirements
16
16
 
17
+ # Offers not supported by all dstack versions are hidden behind one or more flags.
18
+ # This list enables the flags that are currently supported.
19
+ SUPPORTED_GPUHUNT_FLAGS = [
20
+ "oci-spot",
21
+ ]
22
+
17
23
 
18
24
  def get_catalog_offers(
19
25
  backend: BackendType,
@@ -110,7 +116,7 @@ def offer_to_catalog_item(offer: InstanceOffer) -> gpuhunt.CatalogItem:
110
116
 
111
117
 
112
118
  def requirements_to_query_filter(req: Optional[Requirements]) -> gpuhunt.QueryFilter:
113
- q = gpuhunt.QueryFilter()
119
+ q = gpuhunt.QueryFilter(allowed_flags=SUPPORTED_GPUHUNT_FLAGS)
114
120
  if req is None:
115
121
  return q
116
122
 
@@ -4,7 +4,8 @@ import requests
4
4
 
5
5
  from dstack._internal.core.backends.base import Compute
6
6
  from dstack._internal.core.backends.base.compute import (
7
- get_instance_name,
7
+ generate_unique_instance_name,
8
+ get_job_instance_name,
8
9
  get_shim_commands,
9
10
  )
10
11
  from dstack._internal.core.backends.base.offers import get_catalog_offers
@@ -25,6 +26,9 @@ from dstack._internal.utils.logging import get_logger
25
26
  logger = get_logger(__name__)
26
27
 
27
28
 
29
+ MAX_RESOURCE_NAME_LEN = 30
30
+
31
+
28
32
  class CudoCompute(Compute):
29
33
  def __init__(self, config: CudoConfig):
30
34
  super().__init__()
@@ -58,7 +62,7 @@ class CudoCompute(Compute):
58
62
  ) -> JobProvisioningData:
59
63
  instance_config = InstanceConfiguration(
60
64
  project_name=run.project_name,
61
- instance_name=get_instance_name(run, job),
65
+ instance_name=get_job_instance_name(run, job),
62
66
  ssh_keys=[
63
67
  SSHKey(public=project_ssh_public_key.strip()),
64
68
  ],
@@ -71,6 +75,7 @@ class CudoCompute(Compute):
71
75
  instance_offer: InstanceOfferWithAvailability,
72
76
  instance_config: InstanceConfiguration,
73
77
  ) -> JobProvisioningData:
78
+ vm_id = generate_unique_instance_name(instance_config, max_length=MAX_RESOURCE_NAME_LEN)
74
79
  public_keys = instance_config.get_public_keys()
75
80
  memory_size = round(instance_offer.instance.resources.memory_mib / 1024)
76
81
  disk_size = round(instance_offer.instance.resources.disk.size_mib / 1024)
@@ -81,13 +86,12 @@ class CudoCompute(Compute):
81
86
  shim_commands if gpus_no > 0 else f"{install_docker_script()} && {shim_commands}"
82
87
  )
83
88
 
84
- vm_id = f"{instance_config.instance_name}-{instance_offer.region}"
85
89
  try:
86
90
  resp_data = self.api_client.create_virtual_machine(
87
91
  project_id=self.config.project_id,
88
92
  boot_disk_storage_class="STORAGE_CLASS_NETWORK",
89
93
  boot_disk_size_gib=disk_size,
90
- book_disk_id=f"{instance_config.instance_name}_{instance_offer.region}_disk_id",
94
+ book_disk_id=f"{vm_id}_disk_id",
91
95
  boot_disk_image_id=_get_image_id(gpus_no > 0),
92
96
  data_center_id=instance_offer.region,
93
97
  gpus=gpus_no,
@@ -2,6 +2,7 @@ from typing import Dict, List, Optional
2
2
 
3
3
  from dstack._internal.core.backends.base import Compute
4
4
  from dstack._internal.core.backends.base.compute import (
5
+ generate_unique_instance_name,
5
6
  get_shim_commands,
6
7
  )
7
8
  from dstack._internal.core.backends.base.offers import get_catalog_offers
@@ -22,6 +23,8 @@ from dstack._internal.utils.logging import get_logger
22
23
 
23
24
  logger = get_logger("datacrunch.compute")
24
25
 
26
+ MAX_INSTANCE_NAME_LEN = 60
27
+
25
28
  # Ubuntu 22.04 + CUDA 12.0 + Docker
26
29
  # from API https://datacrunch.stoplight.io/docs/datacrunch-public/c46ab45dbc508-get-all-image-types
27
30
  IMAGE_ID = "2088da25-bb0d-41cc-a191-dccae45d96fd"
@@ -78,6 +81,9 @@ class DataCrunchCompute(Compute):
78
81
  instance_offer: InstanceOfferWithAvailability,
79
82
  instance_config: InstanceConfiguration,
80
83
  ) -> JobProvisioningData:
84
+ instance_name = generate_unique_instance_name(
85
+ instance_config, max_length=MAX_INSTANCE_NAME_LEN
86
+ )
81
87
  public_keys = instance_config.get_public_keys()
82
88
  ssh_ids = []
83
89
  for ssh_public_key in public_keys:
@@ -106,8 +112,8 @@ class DataCrunchCompute(Compute):
106
112
  instance_type=instance_offer.instance.name,
107
113
  ssh_key_ids=ssh_ids,
108
114
  startup_script_id=startup_script_ids,
109
- hostname=instance_config.instance_name,
110
- description=instance_config.instance_name,
115
+ hostname=instance_name,
116
+ description=instance_name,
111
117
  image=IMAGE_ID,
112
118
  disk_size=disk_size,
113
119
  location=instance_offer.region,
@@ -119,8 +125,8 @@ class DataCrunchCompute(Compute):
119
125
  "instance_type": instance_offer.instance.name,
120
126
  "ssh_key_ids": ssh_ids,
121
127
  "startup_script_id": startup_script_ids,
122
- "hostname": instance_config.instance_name,
123
- "description": instance_config.instance_name,
128
+ "hostname": instance_name,
129
+ "description": instance_name,
124
130
  "image": IMAGE_ID,
125
131
  "disk_size": disk_size,
126
132
  "location": instance_offer.region,
@@ -1,10 +1,11 @@
1
1
  import json
2
2
  from typing import Optional, Tuple
3
3
 
4
+ import google.api_core.exceptions
4
5
  import google.auth
6
+ import google.cloud.compute_v1 as compute_v1
5
7
  from google.auth.credentials import Credentials
6
8
  from google.auth.exceptions import DefaultCredentialsError
7
- from google.cloud import storage
8
9
  from google.oauth2 import service_account
9
10
 
10
11
  from dstack._internal.core.errors import BackendAuthError
@@ -16,13 +17,16 @@ from dstack._internal.core.models.backends.gcp import (
16
17
  from dstack._internal.core.models.common import is_core_model_instance
17
18
 
18
19
 
19
- def authenticate(creds: AnyGCPCreds) -> Tuple[Credentials, Optional[str]]:
20
- """
21
- :raises BackendAuthError:
22
- :return: GCP credentials and project_id
23
- """
24
- credentials, project_id = get_credentials(creds)
25
- validate_credentials(credentials)
20
+ def authenticate(creds: AnyGCPCreds, project_id: Optional[str] = None) -> Tuple[Credentials, str]:
21
+ credentials, credentials_project_id = get_credentials(creds)
22
+ if project_id is None:
23
+ # If project_id is not specified explicitly, try using credentials' project_id.
24
+ # Explicit project_id takes precedence bacause credentials' project_id may be irrelevant.
25
+ # For example, with Workload Identity Federation for GKE, it's cluster project_id.
26
+ project_id = credentials_project_id
27
+ if project_id is None:
28
+ raise BackendAuthError("Credentials require project_id to be specified")
29
+ validate_credentials(credentials, project_id)
26
30
  return credentials, project_id
27
31
 
28
32
 
@@ -40,17 +44,19 @@ def get_credentials(creds: AnyGCPCreds) -> Tuple[Credentials, Optional[str]]:
40
44
  try:
41
45
  default_credentials, project_id = google.auth.default()
42
46
  except DefaultCredentialsError:
43
- raise BackendAuthError()
47
+ raise BackendAuthError("Failed to find default credentials")
44
48
 
45
49
  return default_credentials, project_id
46
50
 
47
51
 
48
- def validate_credentials(credentials: Credentials):
52
+ def validate_credentials(credentials: Credentials, project_id: str):
49
53
  try:
50
- storage_client = storage.Client(credentials=credentials)
51
- storage_client.list_buckets(max_results=1)
54
+ regions_client = compute_v1.RegionsClient(credentials=credentials)
55
+ regions_client.list(project=project_id)
56
+ except google.api_core.exceptions.NotFound:
57
+ raise BackendAuthError(f"project_id {project_id} not found")
52
58
  except Exception:
53
- raise BackendAuthError()
59
+ raise BackendAuthError("Insufficient permissions")
54
60
 
55
61
 
56
62
  def default_creds_available() -> bool:
@@ -12,8 +12,11 @@ import dstack._internal.core.backends.gcp.auth as auth
12
12
  import dstack._internal.core.backends.gcp.resources as gcp_resources
13
13
  from dstack._internal.core.backends.base.compute import (
14
14
  Compute,
15
+ generate_unique_gateway_instance_name,
16
+ generate_unique_instance_name,
17
+ generate_unique_volume_name,
15
18
  get_gateway_user_data,
16
- get_instance_name,
19
+ get_job_instance_name,
17
20
  get_shim_commands,
18
21
  get_user_data,
19
22
  merge_tags,
@@ -70,7 +73,7 @@ class GCPCompute(Compute):
70
73
  def __init__(self, config: GCPConfig):
71
74
  super().__init__()
72
75
  self.config = config
73
- self.credentials, self.project_id = auth.authenticate(config.creds)
76
+ self.credentials, _ = auth.authenticate(config.creds, self.config.project_id)
74
77
  self.instances_client = compute_v1.InstancesClient(credentials=self.credentials)
75
78
  self.firewalls_client = compute_v1.FirewallsClient(credentials=self.credentials)
76
79
  self.regions_client = compute_v1.RegionsClient(credentials=self.credentials)
@@ -147,17 +150,10 @@ class GCPCompute(Compute):
147
150
  instance_offer: InstanceOfferWithAvailability,
148
151
  instance_config: InstanceConfiguration,
149
152
  ) -> JobProvisioningData:
150
- instance_name = instance_config.instance_name
153
+ instance_name = generate_unique_instance_name(
154
+ instance_config, max_length=gcp_resources.MAX_RESOURCE_NAME_LEN
155
+ )
151
156
  allocate_public_ip = self.config.allocate_public_ips
152
- if not gcp_resources.is_valid_resource_name(instance_name):
153
- # In a rare case the instance name is invalid in GCP,
154
- # we better use a random instance name than fail provisioning.
155
- instance_name = gcp_resources.generate_random_resource_name()
156
- logger.warning(
157
- "Invalid GCP instance name: %s. A new valid name is generated: %s",
158
- instance_config.instance_name,
159
- instance_name,
160
- )
161
157
  authorized_keys = instance_config.get_public_keys()
162
158
 
163
159
  # get_offers always fills instance_offer.availability_zones
@@ -182,6 +178,7 @@ class GCPCompute(Compute):
182
178
  labels = {
183
179
  "owner": "dstack",
184
180
  "dstack_project": instance_config.project_name.lower(),
181
+ "dstack_name": instance_config.instance_name,
185
182
  "dstack_user": instance_config.user.lower(),
186
183
  }
187
184
  labels = {k: v for k, v in labels.items() if gcp_resources.is_valid_label_value(v)}
@@ -192,7 +189,7 @@ class GCPCompute(Compute):
192
189
  else False
193
190
  )
194
191
  if is_tpu:
195
- instance_id = f"tpu-{instance_config.instance_name}"
192
+ instance_id = instance_name
196
193
  startup_script = _get_tpu_startup_script(authorized_keys)
197
194
  # GCP does not allow attaching disks while a TPU is being created,
198
195
  # so we need to attach the disks on creation.
@@ -378,7 +375,7 @@ class GCPCompute(Compute):
378
375
  # TODO: run_job is the same for vm-based backends, refactor
379
376
  instance_config = InstanceConfiguration(
380
377
  project_name=run.project_name,
381
- instance_name=get_instance_name(run, job), # TODO: generate name
378
+ instance_name=get_job_instance_name(run, job), # TODO: generate name
382
379
  ssh_keys=[
383
380
  SSHKey(public=project_ssh_public_key.strip()),
384
381
  ],
@@ -421,6 +418,9 @@ class GCPCompute(Compute):
421
418
  else:
422
419
  raise ComputeResourceNotFoundError()
423
420
 
421
+ instance_name = generate_unique_gateway_instance_name(
422
+ configuration, max_length=gcp_resources.MAX_RESOURCE_NAME_LEN
423
+ )
424
424
  # Choose any usable subnet in a VPC.
425
425
  # Configuring a specific subnet per region is not supported yet.
426
426
  subnetwork = _get_vpc_subnet(
@@ -432,6 +432,7 @@ class GCPCompute(Compute):
432
432
  labels = {
433
433
  "owner": "dstack",
434
434
  "dstack_project": configuration.project_name.lower(),
435
+ "dstack_name": configuration.instance_name,
435
436
  }
436
437
  labels = {k: v for k, v in labels.items() if gcp_resources.is_valid_label_value(v)}
437
438
  labels = merge_tags(tags=labels, backend_tags=self.config.tags)
@@ -449,7 +450,7 @@ class GCPCompute(Compute):
449
450
  authorized_keys=[configuration.ssh_key_pub],
450
451
  labels=labels,
451
452
  tags=[gcp_resources.DSTACK_GATEWAY_TAG],
452
- instance_name=configuration.instance_name,
453
+ instance_name=instance_name,
453
454
  zone=zone,
454
455
  service_account=self.config.vm_service_account,
455
456
  network=self.config.vpc_resource_name,
@@ -458,10 +459,10 @@ class GCPCompute(Compute):
458
459
  operation = self.instances_client.insert(request=request)
459
460
  gcp_resources.wait_for_extended_operation(operation, "instance creation")
460
461
  instance = self.instances_client.get(
461
- project=self.config.project_id, zone=zone, instance=configuration.instance_name
462
+ project=self.config.project_id, zone=zone, instance=instance_name
462
463
  )
463
464
  return GatewayProvisioningData(
464
- instance_id=configuration.instance_name,
465
+ instance_id=instance_name,
465
466
  region=configuration.region, # used for instance termination
466
467
  availability_zone=zone,
467
468
  ip_address=instance.network_interfaces[0].access_configs[0].nat_i_p,
@@ -525,16 +526,21 @@ class GCPCompute(Compute):
525
526
  )
526
527
  zone = zones[0]
527
528
 
529
+ disk_name = generate_unique_volume_name(
530
+ volume, max_length=gcp_resources.MAX_RESOURCE_NAME_LEN
531
+ )
532
+
528
533
  labels = {
529
534
  "owner": "dstack",
530
535
  "dstack_project": volume.project_name.lower(),
536
+ "dstack_name": volume.name,
531
537
  "dstack_user": volume.user,
532
538
  }
533
539
  labels = {k: v for k, v in labels.items() if gcp_resources.is_valid_label_value(v)}
534
540
  labels = merge_tags(tags=labels, backend_tags=self.config.tags)
535
541
 
536
542
  disk = compute_v1.Disk()
537
- disk.name = volume.name
543
+ disk.name = disk_name
538
544
  disk.size_gb = volume.configuration.size_gb
539
545
  disk.type_ = f"zones/{zone}/diskTypes/pd-balanced"
540
546
  disk.labels = labels
@@ -552,7 +558,7 @@ class GCPCompute(Compute):
552
558
  created_disk = self.disk_client.get(
553
559
  project=self.config.project_id,
554
560
  zone=zone,
555
- disk=volume.name,
561
+ disk=disk_name,
556
562
  )
557
563
  logger.debug("Created persistent disk for volume %s", volume.name)
558
564
  return VolumeProvisioningData(
@@ -1,7 +1,5 @@
1
1
  import concurrent.futures
2
- import random
3
2
  import re
4
- import string
5
3
  from typing import Dict, List, Optional
6
4
 
7
5
  import google.api_core.exceptions
@@ -64,7 +62,7 @@ def check_vpc(
64
62
  region=region,
65
63
  )
66
64
  except google.api_core.exceptions.NotFound:
67
- raise ComputeError(f"Failed to find Shared VPC project {vpc_project_id}")
65
+ raise ComputeError(f"Failed to find VPC project {vpc_project_id}")
68
66
 
69
67
  if allocate_public_ip:
70
68
  return
@@ -322,12 +320,13 @@ def _is_valid_label(key: str, value: str) -> bool:
322
320
  return is_valid_resource_name(key) and is_valid_label_value(value)
323
321
 
324
322
 
323
+ MAX_RESOURCE_NAME_LEN = 63
325
324
  NAME_PATTERN = re.compile(r"^[a-z][_\-a-z0-9]{0,62}$")
326
325
  LABEL_VALUE_PATTERN = re.compile(r"^[_\-a-z0-9]{0,63}$")
327
326
 
328
327
 
329
328
  def is_valid_resource_name(name: str) -> bool:
330
- if len(name) < 1 or len(name) > 63:
329
+ if len(name) < 1 or len(name) > MAX_RESOURCE_NAME_LEN:
331
330
  return False
332
331
  match = re.match(NAME_PATTERN, name)
333
332
  return match is not None
@@ -338,12 +337,6 @@ def is_valid_label_value(value: str) -> bool:
338
337
  return match is not None
339
338
 
340
339
 
341
- def generate_random_resource_name(length: int = 40) -> str:
342
- return random.choice(string.ascii_lowercase) + "".join(
343
- random.choice(string.ascii_lowercase + string.digits) for _ in range(length)
344
- )
345
-
346
-
347
340
  def create_tpu_node_struct(
348
341
  instance_name: str,
349
342
  startup_script: str,
@@ -9,9 +9,10 @@ from kubernetes import client
9
9
 
10
10
  from dstack._internal.core.backends.base.compute import (
11
11
  Compute,
12
+ generate_unique_gateway_instance_name,
13
+ generate_unique_instance_name_for_job,
12
14
  get_docker_commands,
13
15
  get_dstack_gateway_commands,
14
- get_instance_name,
15
16
  )
16
17
  from dstack._internal.core.backends.base.offers import match_requirements
17
18
  from dstack._internal.core.backends.kubernetes.config import KubernetesConfig
@@ -99,7 +100,7 @@ class KubernetesCompute(Compute):
99
100
  project_ssh_private_key: str,
100
101
  volumes: List[Volume],
101
102
  ) -> JobProvisioningData:
102
- instance_name = get_instance_name(run, job)
103
+ instance_name = generate_unique_instance_name_for_job(run, job)
103
104
  commands = get_docker_commands(
104
105
  [run.run_spec.ssh_key_pub.strip(), project_ssh_public_key.strip()]
105
106
  )
@@ -231,7 +232,7 @@ class KubernetesCompute(Compute):
231
232
  # TODO: By default EKS creates a Classic Load Balancer for Load Balancer services.
232
233
  # Consider deploying an NLB. It seems it requires some extra configuration on the cluster:
233
234
  # https://docs.aws.amazon.com/eks/latest/userguide/network-load-balancing.html
234
- instance_name = configuration.instance_name
235
+ instance_name = generate_unique_gateway_instance_name(configuration)
235
236
  commands = _get_gateway_commands(authorized_keys=[configuration.ssh_key_pub])
236
237
  self.api.create_namespaced_pod(
237
238
  namespace=DEFAULT_NAMESPACE,
@@ -6,7 +6,8 @@ from typing import Dict, List, Optional
6
6
 
7
7
  from dstack._internal.core.backends.base.compute import (
8
8
  Compute,
9
- get_instance_name,
9
+ generate_unique_instance_name,
10
+ get_job_instance_name,
10
11
  get_shim_commands,
11
12
  )
12
13
  from dstack._internal.core.backends.base.offers import get_catalog_offers
@@ -23,6 +24,8 @@ from dstack._internal.core.models.instances import (
23
24
  from dstack._internal.core.models.runs import Job, JobProvisioningData, Requirements, Run
24
25
  from dstack._internal.core.models.volumes import Volume
25
26
 
27
+ MAX_INSTANCE_NAME_LEN = 60
28
+
26
29
 
27
30
  class LambdaCompute(Compute):
28
31
  def __init__(self, config: LambdaConfig):
@@ -44,6 +47,9 @@ class LambdaCompute(Compute):
44
47
  def create_instance(
45
48
  self, instance_offer: InstanceOfferWithAvailability, instance_config: InstanceConfiguration
46
49
  ) -> JobProvisioningData:
50
+ instance_name = generate_unique_instance_name(
51
+ instance_config, max_length=MAX_INSTANCE_NAME_LEN
52
+ )
47
53
  project_ssh_key = instance_config.ssh_keys[0]
48
54
  project_key_name = _add_project_ssh_key(
49
55
  api_client=self.api_client,
@@ -53,7 +59,7 @@ class LambdaCompute(Compute):
53
59
  region_name=instance_offer.region,
54
60
  instance_type_name=instance_offer.instance.name,
55
61
  ssh_key_names=[project_key_name],
56
- name=instance_config.instance_name,
62
+ name=instance_name,
57
63
  quantity=1,
58
64
  file_system_names=[],
59
65
  )
@@ -107,7 +113,7 @@ class LambdaCompute(Compute):
107
113
  ) -> JobProvisioningData:
108
114
  instance_config = InstanceConfiguration(
109
115
  project_name=run.project_name,
110
- instance_name=get_instance_name(run, job), # TODO: generate name
116
+ instance_name=get_job_instance_name(run, job), # TODO: generate name
111
117
  ssh_keys=[
112
118
  SSHKey(
113
119
  public=project_ssh_public_key.strip(), private=project_ssh_private_key.strip()
@@ -6,7 +6,7 @@ from typing import List, Optional
6
6
  import dstack.version as version
7
7
  from dstack._internal import settings
8
8
  from dstack._internal.core.backends.base import Compute
9
- from dstack._internal.core.backends.base.compute import get_instance_name, get_user_data
9
+ from dstack._internal.core.backends.base.compute import get_job_instance_name, get_user_data
10
10
  from dstack._internal.core.backends.base.offers import get_catalog_offers
11
11
  from dstack._internal.core.backends.nebius.api_client import NebiusAPIClient
12
12
  from dstack._internal.core.backends.nebius.config import NebiusConfig
@@ -130,7 +130,7 @@ class NebiusCompute(Compute):
130
130
  ) -> JobProvisioningData:
131
131
  instance_config = InstanceConfiguration(
132
132
  project_name=run.project_name,
133
- instance_name=get_instance_name(run, job), # TODO: generate name
133
+ instance_name=get_job_instance_name(run, job), # TODO: generate name
134
134
  ssh_keys=[
135
135
  SSHKey(public=project_ssh_public_key.strip()),
136
136
  ],
@@ -4,7 +4,12 @@ from typing import List, Optional
4
4
 
5
5
  import oci
6
6
 
7
- from dstack._internal.core.backends.base.compute import Compute, get_instance_name, get_user_data
7
+ from dstack._internal.core.backends.base.compute import (
8
+ Compute,
9
+ generate_unique_instance_name,
10
+ get_job_instance_name,
11
+ get_user_data,
12
+ )
8
13
  from dstack._internal.core.backends.base.offers import get_catalog_offers
9
14
  from dstack._internal.core.backends.oci import resources
10
15
  from dstack._internal.core.backends.oci.config import OCIConfig
@@ -98,7 +103,7 @@ class OCICompute(Compute):
98
103
  ) -> JobProvisioningData:
99
104
  instance_config = InstanceConfiguration(
100
105
  project_name=run.project_name,
101
- instance_name=get_instance_name(run, job),
106
+ instance_name=get_job_instance_name(run, job),
102
107
  ssh_keys=[SSHKey(public=project_ssh_public_key.strip())],
103
108
  user=run.user,
104
109
  )
@@ -148,6 +153,7 @@ class OCICompute(Compute):
148
153
  ]
149
154
  cloud_init_user_data = get_user_data(instance_config.get_public_keys(), setup_commands)
150
155
 
156
+ display_name = generate_unique_instance_name(instance_config)
151
157
  try:
152
158
  instance = resources.launch_instance(
153
159
  region=region,
@@ -155,7 +161,7 @@ class OCICompute(Compute):
155
161
  compartment_id=self.config.compartment_id,
156
162
  subnet_id=subnet.id,
157
163
  security_group_id=security_group.id,
158
- display_name=instance_config.instance_name,
164
+ display_name=display_name,
159
165
  cloud_init_user_data=cloud_init_user_data,
160
166
  shape=instance_offer.instance.name,
161
167
  is_spot=instance_offer.instance.resources.spot,
@@ -163,7 +169,7 @@ class OCICompute(Compute):
163
169
  image_id=package.image_id,
164
170
  )
165
171
  except oci.exceptions.ServiceError as e:
166
- if e.code in ("LimitExceeded", "QuotaExceeded"):
172
+ if e.code in ("LimitExceeded", "QuotaExceeded") or "Out of host capacity" in e.message:
167
173
  raise NoCapacityError(e.message)
168
174
  raise
169
175