dstack 0.19.4rc3__py3-none-any.whl → 0.19.6rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of dstack might be problematic. Click here for more details.

Files changed (183)
  1. dstack/_internal/cli/commands/attach.py +22 -20
  2. dstack/_internal/cli/commands/offer.py +116 -0
  3. dstack/_internal/cli/main.py +2 -0
  4. dstack/_internal/cli/services/configurators/base.py +1 -2
  5. dstack/_internal/cli/services/configurators/fleet.py +43 -20
  6. dstack/_internal/cli/services/configurators/run.py +3 -3
  7. dstack/_internal/cli/utils/run.py +43 -38
  8. dstack/_internal/core/backends/aws/auth.py +1 -2
  9. dstack/_internal/core/backends/aws/compute.py +24 -9
  10. dstack/_internal/core/backends/aws/configurator.py +2 -3
  11. dstack/_internal/core/backends/aws/resources.py +10 -0
  12. dstack/_internal/core/backends/azure/auth.py +1 -2
  13. dstack/_internal/core/backends/azure/compute.py +15 -5
  14. dstack/_internal/core/backends/azure/configurator.py +4 -5
  15. dstack/_internal/core/backends/azure/resources.py +14 -0
  16. dstack/_internal/core/backends/base/compute.py +99 -31
  17. dstack/_internal/core/backends/gcp/auth.py +1 -2
  18. dstack/_internal/core/backends/gcp/compute.py +58 -14
  19. dstack/_internal/core/backends/gcp/configurator.py +2 -3
  20. dstack/_internal/core/backends/gcp/features/tcpx.py +31 -0
  21. dstack/_internal/core/backends/gcp/resources.py +10 -0
  22. dstack/_internal/core/backends/nebius/compute.py +6 -2
  23. dstack/_internal/core/backends/nebius/configurator.py +4 -10
  24. dstack/_internal/core/backends/nebius/models.py +14 -1
  25. dstack/_internal/core/backends/nebius/resources.py +91 -10
  26. dstack/_internal/core/backends/oci/auth.py +1 -2
  27. dstack/_internal/core/backends/oci/configurator.py +1 -2
  28. dstack/_internal/core/backends/runpod/compute.py +1 -1
  29. dstack/_internal/core/errors.py +4 -0
  30. dstack/_internal/core/models/common.py +2 -14
  31. dstack/_internal/core/models/configurations.py +24 -2
  32. dstack/_internal/core/models/envs.py +2 -2
  33. dstack/_internal/core/models/fleets.py +34 -3
  34. dstack/_internal/core/models/gateways.py +18 -4
  35. dstack/_internal/core/models/instances.py +2 -1
  36. dstack/_internal/core/models/profiles.py +12 -0
  37. dstack/_internal/core/models/runs.py +6 -0
  38. dstack/_internal/core/models/secrets.py +1 -1
  39. dstack/_internal/core/models/volumes.py +17 -1
  40. dstack/_internal/proxy/gateway/resources/nginx/service.jinja2 +3 -3
  41. dstack/_internal/proxy/gateway/services/nginx.py +0 -1
  42. dstack/_internal/proxy/gateway/services/registry.py +0 -1
  43. dstack/_internal/server/background/tasks/process_instances.py +12 -9
  44. dstack/_internal/server/background/tasks/process_running_jobs.py +66 -15
  45. dstack/_internal/server/routers/fleets.py +22 -0
  46. dstack/_internal/server/routers/runs.py +1 -0
  47. dstack/_internal/server/schemas/fleets.py +12 -2
  48. dstack/_internal/server/schemas/runner.py +6 -0
  49. dstack/_internal/server/schemas/runs.py +3 -0
  50. dstack/_internal/server/services/docker.py +1 -2
  51. dstack/_internal/server/services/fleets.py +30 -12
  52. dstack/_internal/server/services/gateways/__init__.py +1 -0
  53. dstack/_internal/server/services/instances.py +3 -1
  54. dstack/_internal/server/services/jobs/__init__.py +1 -2
  55. dstack/_internal/server/services/jobs/configurators/base.py +17 -8
  56. dstack/_internal/server/services/locking.py +16 -1
  57. dstack/_internal/server/services/projects.py +1 -2
  58. dstack/_internal/server/services/proxy/repo.py +1 -2
  59. dstack/_internal/server/services/runner/client.py +3 -0
  60. dstack/_internal/server/services/runs.py +19 -16
  61. dstack/_internal/server/services/services/__init__.py +1 -2
  62. dstack/_internal/server/services/volumes.py +29 -2
  63. dstack/_internal/server/statics/00a6e1fb461ed2929fb9.png +0 -0
  64. dstack/_internal/server/statics/0cae4d9f0a36034984a7.png +0 -0
  65. dstack/_internal/server/statics/391de232cc0e30cae513.png +0 -0
  66. dstack/_internal/server/statics/4e0eead8c1a73689ef9d.svg +1 -0
  67. dstack/_internal/server/statics/544afa2f63428c2235b0.png +0 -0
  68. dstack/_internal/server/statics/54a4f50f74c6b9381530.svg +7 -0
  69. dstack/_internal/server/statics/68dd1360a7d2611e0132.svg +4 -0
  70. dstack/_internal/server/statics/69544b4c81973b54a66f.png +0 -0
  71. dstack/_internal/server/statics/77a8b02b17af19e39266.png +0 -0
  72. dstack/_internal/server/statics/83a93a8871c219104367.svg +9 -0
  73. dstack/_internal/server/statics/8f28bb8e9999e5e6a48b.svg +4 -0
  74. dstack/_internal/server/statics/9124086961ab8c366bc4.svg +9 -0
  75. dstack/_internal/server/statics/9a9ebaeb54b025dbac0a.svg +5 -0
  76. dstack/_internal/server/statics/a3428392dc534f3b15c4.svg +7 -0
  77. dstack/_internal/server/statics/ae22625574d69361f72c.png +0 -0
  78. dstack/_internal/server/statics/assets/android-chrome-144x144.png +0 -0
  79. dstack/_internal/server/statics/assets/android-chrome-192x192.png +0 -0
  80. dstack/_internal/server/statics/assets/android-chrome-256x256.png +0 -0
  81. dstack/_internal/server/statics/assets/android-chrome-36x36.png +0 -0
  82. dstack/_internal/server/statics/assets/android-chrome-384x384.png +0 -0
  83. dstack/_internal/server/statics/assets/android-chrome-48x48.png +0 -0
  84. dstack/_internal/server/statics/assets/android-chrome-512x512.png +0 -0
  85. dstack/_internal/server/statics/assets/android-chrome-72x72.png +0 -0
  86. dstack/_internal/server/statics/assets/android-chrome-96x96.png +0 -0
  87. dstack/_internal/server/statics/assets/apple-touch-icon-1024x1024.png +0 -0
  88. dstack/_internal/server/statics/assets/apple-touch-icon-114x114.png +0 -0
  89. dstack/_internal/server/statics/assets/apple-touch-icon-120x120.png +0 -0
  90. dstack/_internal/server/statics/assets/apple-touch-icon-144x144.png +0 -0
  91. dstack/_internal/server/statics/assets/apple-touch-icon-152x152.png +0 -0
  92. dstack/_internal/server/statics/assets/apple-touch-icon-167x167.png +0 -0
  93. dstack/_internal/server/statics/assets/apple-touch-icon-180x180.png +0 -0
  94. dstack/_internal/server/statics/assets/apple-touch-icon-57x57.png +0 -0
  95. dstack/_internal/server/statics/assets/apple-touch-icon-60x60.png +0 -0
  96. dstack/_internal/server/statics/assets/apple-touch-icon-72x72.png +0 -0
  97. dstack/_internal/server/statics/assets/apple-touch-icon-76x76.png +0 -0
  98. dstack/_internal/server/statics/assets/apple-touch-icon-precomposed.png +0 -0
  99. dstack/_internal/server/statics/assets/apple-touch-icon.png +0 -0
  100. dstack/_internal/server/statics/assets/apple-touch-startup-image-1125x2436.png +0 -0
  101. dstack/_internal/server/statics/assets/apple-touch-startup-image-1136x640.png +0 -0
  102. dstack/_internal/server/statics/assets/apple-touch-startup-image-1170x2532.png +0 -0
  103. dstack/_internal/server/statics/assets/apple-touch-startup-image-1179x2556.png +0 -0
  104. dstack/_internal/server/statics/assets/apple-touch-startup-image-1242x2208.png +0 -0
  105. dstack/_internal/server/statics/assets/apple-touch-startup-image-1242x2688.png +0 -0
  106. dstack/_internal/server/statics/assets/apple-touch-startup-image-1284x2778.png +0 -0
  107. dstack/_internal/server/statics/assets/apple-touch-startup-image-1290x2796.png +0 -0
  108. dstack/_internal/server/statics/assets/apple-touch-startup-image-1334x750.png +0 -0
  109. dstack/_internal/server/statics/assets/apple-touch-startup-image-1488x2266.png +0 -0
  110. dstack/_internal/server/statics/assets/apple-touch-startup-image-1536x2048.png +0 -0
  111. dstack/_internal/server/statics/assets/apple-touch-startup-image-1620x2160.png +0 -0
  112. dstack/_internal/server/statics/assets/apple-touch-startup-image-1640x2160.png +0 -0
  113. dstack/_internal/server/statics/assets/apple-touch-startup-image-1668x2224.png +0 -0
  114. dstack/_internal/server/statics/assets/apple-touch-startup-image-1668x2388.png +0 -0
  115. dstack/_internal/server/statics/assets/apple-touch-startup-image-1792x828.png +0 -0
  116. dstack/_internal/server/statics/assets/apple-touch-startup-image-2048x1536.png +0 -0
  117. dstack/_internal/server/statics/assets/apple-touch-startup-image-2048x2732.png +0 -0
  118. dstack/_internal/server/statics/assets/apple-touch-startup-image-2160x1620.png +0 -0
  119. dstack/_internal/server/statics/assets/apple-touch-startup-image-2160x1640.png +0 -0
  120. dstack/_internal/server/statics/assets/apple-touch-startup-image-2208x1242.png +0 -0
  121. dstack/_internal/server/statics/assets/apple-touch-startup-image-2224x1668.png +0 -0
  122. dstack/_internal/server/statics/assets/apple-touch-startup-image-2266x1488.png +0 -0
  123. dstack/_internal/server/statics/assets/apple-touch-startup-image-2388x1668.png +0 -0
  124. dstack/_internal/server/statics/assets/apple-touch-startup-image-2436x1125.png +0 -0
  125. dstack/_internal/server/statics/assets/apple-touch-startup-image-2532x1170.png +0 -0
  126. dstack/_internal/server/statics/assets/apple-touch-startup-image-2556x1179.png +0 -0
  127. dstack/_internal/server/statics/assets/apple-touch-startup-image-2688x1242.png +0 -0
  128. dstack/_internal/server/statics/assets/apple-touch-startup-image-2732x2048.png +0 -0
  129. dstack/_internal/server/statics/assets/apple-touch-startup-image-2778x1284.png +0 -0
  130. dstack/_internal/server/statics/assets/apple-touch-startup-image-2796x1290.png +0 -0
  131. dstack/_internal/server/statics/assets/apple-touch-startup-image-640x1136.png +0 -0
  132. dstack/_internal/server/statics/assets/apple-touch-startup-image-750x1334.png +0 -0
  133. dstack/_internal/server/statics/assets/apple-touch-startup-image-828x1792.png +0 -0
  134. dstack/_internal/server/statics/assets/browserconfig.xml +12 -0
  135. dstack/_internal/server/statics/assets/favicon-16x16.png +0 -0
  136. dstack/_internal/server/statics/assets/favicon-32x32.png +0 -0
  137. dstack/_internal/server/statics/assets/favicon-48x48.png +0 -0
  138. dstack/_internal/server/statics/assets/favicon.ico +0 -0
  139. dstack/_internal/server/statics/assets/manifest.webmanifest +67 -0
  140. dstack/_internal/server/statics/assets/mstile-144x144.png +0 -0
  141. dstack/_internal/server/statics/assets/mstile-150x150.png +0 -0
  142. dstack/_internal/server/statics/assets/mstile-310x150.png +0 -0
  143. dstack/_internal/server/statics/assets/mstile-310x310.png +0 -0
  144. dstack/_internal/server/statics/assets/mstile-70x70.png +0 -0
  145. dstack/_internal/server/statics/assets/yandex-browser-50x50.png +0 -0
  146. dstack/_internal/server/statics/assets/yandex-browser-manifest.json +9 -0
  147. dstack/_internal/server/statics/b7ae68f44193474fc578.png +0 -0
  148. dstack/_internal/server/statics/d2f008c75b2b5b191f3f.png +0 -0
  149. dstack/_internal/server/statics/d44c33e1b92e05c379fd.png +0 -0
  150. dstack/_internal/server/statics/dd43ff0552815179d7ab.png +0 -0
  151. dstack/_internal/server/statics/dd4e7166c0b9aac197d7.png +0 -0
  152. dstack/_internal/server/statics/e30b27916930d43d2271.png +0 -0
  153. dstack/_internal/server/statics/e467d7d60aae81ab198b.svg +6 -0
  154. dstack/_internal/server/statics/eb9b344b73818fe2b71a.png +0 -0
  155. dstack/_internal/server/statics/f517dd626eb964120de0.png +0 -0
  156. dstack/_internal/server/statics/f958aecddee5d8e3222c.png +0 -0
  157. dstack/_internal/server/statics/index.html +3 -0
  158. dstack/_internal/server/statics/main-8f9c66f404e9c7e7e020.css +3 -0
  159. dstack/_internal/server/statics/main-b4f65323f5df007e1664.js +136480 -0
  160. dstack/_internal/server/statics/main-b4f65323f5df007e1664.js.map +1 -0
  161. dstack/_internal/server/statics/manifest.json +16 -0
  162. dstack/_internal/server/statics/robots.txt +3 -0
  163. dstack/_internal/server/statics/static/media/entraID.d65d1f3e9486a8e56d24fc07b3230885.svg +9 -0
  164. dstack/_internal/server/statics/static/media/github.1f7102513534c83a9d8d735d2b8c12a2.svg +3 -0
  165. dstack/_internal/server/statics/static/media/logo.f602feeb138844eda97c8cb641461448.svg +124 -0
  166. dstack/_internal/server/statics/static/media/okta.12f178e6873a1100965f2a4dbd18fcec.svg +2 -0
  167. dstack/_internal/server/statics/static/media/theme.3994c817bb7dda191c1c9640dee0bf42.svg +3 -0
  168. dstack/_internal/server/testing/common.py +10 -0
  169. dstack/_internal/utils/tags.py +42 -0
  170. dstack/api/server/__init__.py +3 -1
  171. dstack/api/server/_fleets.py +52 -9
  172. dstack/api/server/_gateways.py +17 -2
  173. dstack/api/server/_runs.py +34 -11
  174. dstack/api/server/_volumes.py +2 -3
  175. dstack/version.py +1 -1
  176. {dstack-0.19.4rc3.dist-info → dstack-0.19.6rc1.dist-info}/METADATA +2 -2
  177. {dstack-0.19.4rc3.dist-info → dstack-0.19.6rc1.dist-info}/RECORD +180 -76
  178. dstack-0.19.4rc3.data/data/dstack/_internal/proxy/gateway/resources/nginx/00-log-format.conf +0 -1
  179. dstack-0.19.4rc3.data/data/dstack/_internal/proxy/gateway/resources/nginx/entrypoint.jinja2 +0 -27
  180. dstack-0.19.4rc3.data/data/dstack/_internal/proxy/gateway/resources/nginx/service.jinja2 +0 -88
  181. {dstack-0.19.4rc3.dist-info → dstack-0.19.6rc1.dist-info}/WHEEL +0 -0
  182. {dstack-0.19.4rc3.dist-info → dstack-0.19.6rc1.dist-info}/entry_points.txt +0 -0
  183. {dstack-0.19.4rc3.dist-info → dstack-0.19.6rc1.dist-info}/licenses/LICENSE.md +0 -0
@@ -136,13 +136,18 @@ class AzureCompute(
136
136
  location=location,
137
137
  )
138
138
 
139
- tags = {
139
+ base_tags = {
140
140
  "owner": "dstack",
141
141
  "dstack_project": instance_config.project_name,
142
142
  "dstack_name": instance_config.instance_name,
143
143
  "dstack_user": instance_config.user,
144
144
  }
145
- tags = merge_tags(tags=tags, backend_tags=self.config.tags)
145
+ tags = merge_tags(
146
+ base_tags=base_tags,
147
+ backend_tags=self.config.tags,
148
+ resource_tags=instance_config.tags,
149
+ )
150
+ tags = azure_resources.filter_invalid_tags(tags)
146
151
 
147
152
  # TODO: Support custom availability_zones.
148
153
  # Currently, VMs are regional, which means they don't have zone info.
@@ -228,14 +233,19 @@ class AzureCompute(
228
233
  location=configuration.region,
229
234
  )
230
235
 
231
- tags = {
236
+ base_tags = {
232
237
  "owner": "dstack",
233
238
  "dstack_project": configuration.project_name,
234
239
  "dstack_name": configuration.instance_name,
235
240
  }
236
241
  if settings.DSTACK_VERSION is not None:
237
- tags["dstack_version"] = settings.DSTACK_VERSION
238
- tags = merge_tags(tags=tags, backend_tags=self.config.tags)
242
+ base_tags["dstack_version"] = settings.DSTACK_VERSION
243
+ tags = merge_tags(
244
+ base_tags=base_tags,
245
+ backend_tags=self.config.tags,
246
+ resource_tags=configuration.tags,
247
+ )
248
+ tags = azure_resources.filter_invalid_tags(tags)
239
249
 
240
250
  vm = _launch_instance(
241
251
  compute_client=self._compute_client,
@@ -46,7 +46,6 @@ from dstack._internal.core.errors import (
46
46
  from dstack._internal.core.models.backends.base import (
47
47
  BackendType,
48
48
  )
49
- from dstack._internal.core.models.common import is_core_model_instance
50
49
 
51
50
  LOCATIONS = [
52
51
  ("(US) Central US", "centralus"),
@@ -76,14 +75,14 @@ class AzureConfigurator(Configurator):
76
75
  BACKEND_CLASS = AzureBackend
77
76
 
78
77
  def validate_config(self, config: AzureBackendConfigWithCreds, default_creds_enabled: bool):
79
- if is_core_model_instance(config.creds, AzureDefaultCreds) and not default_creds_enabled:
78
+ if isinstance(config.creds, AzureDefaultCreds) and not default_creds_enabled:
80
79
  raise_invalid_credentials_error(fields=[["creds"]])
81
- if is_core_model_instance(config.creds, AzureClientCreds):
80
+ if isinstance(config.creds, AzureClientCreds):
82
81
  self._set_client_creds_tenant_id(config.creds, config.tenant_id)
83
82
  try:
84
83
  credential, _ = auth.authenticate(config.creds)
85
84
  except BackendAuthError:
86
- if is_core_model_instance(config.creds, AzureClientCreds):
85
+ if isinstance(config.creds, AzureClientCreds):
87
86
  raise_invalid_credentials_error(
88
87
  fields=[
89
88
  ["creds", "tenant_id"],
@@ -105,7 +104,7 @@ class AzureConfigurator(Configurator):
105
104
  ) -> BackendRecord:
106
105
  if config.regions is None:
107
106
  config.regions = DEFAULT_LOCATIONS
108
- if is_core_model_instance(config.creds, AzureClientCreds):
107
+ if isinstance(config.creds, AzureClientCreds):
109
108
  self._set_client_creds_tenant_id(config.creds, config.tenant_id)
110
109
  credential, _ = auth.authenticate(config.creds)
111
110
  if config.resource_group is None:
@@ -5,6 +5,10 @@ from azure.mgmt import network as network_mgmt
5
5
  from azure.mgmt.network.models import Subnet
6
6
 
7
7
  from dstack._internal.core.errors import BackendError
8
+ from dstack._internal.utils.logging import get_logger
9
+
10
+ logger = get_logger(__name__)
11
+
8
12
 
9
13
  MAX_RESOURCE_NAME_LEN = 64
10
14
 
@@ -77,6 +81,16 @@ def _is_eligible_private_subnet(
77
81
  return False
78
82
 
79
83
 
84
+ def filter_invalid_tags(tags: Dict[str, str]) -> Dict[str, str]:
85
+ filtered_tags = {}
86
+ for k, v in tags.items():
87
+ if not _is_valid_tag(k, v):
88
+ logger.warning("Skipping invalid tag '%s: %s'", k, v)
89
+ continue
90
+ filtered_tags[k] = v
91
+ return filtered_tags
92
+
93
+
80
94
  def validate_tags(tags: Dict[str, str]):
81
95
  for k, v in tags.items():
82
96
  if not _is_valid_tag(k, v):
@@ -5,6 +5,7 @@ import string
5
5
  import threading
6
6
  from abc import ABC, abstractmethod
7
7
  from functools import lru_cache
8
+ from pathlib import Path
8
9
  from typing import Dict, List, Optional
9
10
 
10
11
  import git
@@ -36,14 +37,12 @@ from dstack._internal.core.models.volumes import (
36
37
  )
37
38
  from dstack._internal.core.services import is_valid_dstack_resource_name
38
39
  from dstack._internal.utils.logging import get_logger
40
+ from dstack._internal.utils.path import PathLike
39
41
 
40
42
  logger = get_logger(__name__)
41
43
 
42
- DSTACK_WORKING_DIR = "/root/.dstack"
43
44
  DSTACK_SHIM_BINARY_NAME = "dstack-shim"
44
- DSTACK_SHIM_BINARY_PATH = f"/usr/local/bin/{DSTACK_SHIM_BINARY_NAME}"
45
45
  DSTACK_RUNNER_BINARY_NAME = "dstack-runner"
46
- DSTACK_RUNNER_BINARY_PATH = f"/usr/local/bin/{DSTACK_RUNNER_BINARY_NAME}"
47
46
 
48
47
 
49
48
  class Compute(ABC):
@@ -173,6 +172,7 @@ class ComputeWithCreateInstanceSupport(ABC):
173
172
  ssh_keys=[SSHKey(public=project_ssh_public_key.strip())],
174
173
  volumes=volumes,
175
174
  reservation=run.run_spec.configuration.reservation,
175
+ tags=run.run_spec.merged_profile.tags,
176
176
  )
177
177
  instance_offer = instance_offer.copy()
178
178
  self._restrict_instance_offer_az_to_volumes_az(instance_offer, volumes)
@@ -335,6 +335,24 @@ class ComputeWithVolumeSupport(ABC):
335
335
  return True
336
336
 
337
337
 
338
+ def get_dstack_working_dir(base_path: Optional[PathLike] = None) -> str:
339
+ if base_path is None:
340
+ base_path = "/root"
341
+ return str(Path(base_path, ".dstack"))
342
+
343
+
344
+ def get_dstack_shim_binary_path(bin_path: Optional[PathLike] = None) -> str:
345
+ if bin_path is None:
346
+ bin_path = "/usr/local/bin"
347
+ return str(Path(bin_path, DSTACK_SHIM_BINARY_NAME))
348
+
349
+
350
+ def get_dstack_runner_binary_path(bin_path: Optional[PathLike] = None) -> str:
351
+ if bin_path is None:
352
+ bin_path = "/usr/local/bin"
353
+ return str(Path(bin_path, DSTACK_RUNNER_BINARY_NAME))
354
+
355
+
338
356
  def get_job_instance_name(run: Run, job: Job) -> str:
339
357
  return job.job_spec.job_name
340
358
 
@@ -441,9 +459,18 @@ def get_cloud_config(**config) -> str:
441
459
 
442
460
 
443
461
  def get_user_data(
444
- authorized_keys: List[str], backend_specific_commands: Optional[List[str]] = None
462
+ authorized_keys: List[str],
463
+ backend_specific_commands: Optional[List[str]] = None,
464
+ base_path: Optional[PathLike] = None,
465
+ bin_path: Optional[PathLike] = None,
466
+ backend_shim_env: Optional[Dict[str, str]] = None,
445
467
  ) -> str:
446
- shim_commands = get_shim_commands(authorized_keys)
468
+ shim_commands = get_shim_commands(
469
+ authorized_keys=authorized_keys,
470
+ base_path=base_path,
471
+ bin_path=bin_path,
472
+ backend_shim_env=backend_shim_env,
473
+ )
447
474
  commands = (backend_specific_commands or []) + shim_commands
448
475
  return get_cloud_config(
449
476
  runcmd=[["sh", "-c", " && ".join(commands)]],
@@ -451,29 +478,55 @@ def get_user_data(
451
478
  )
452
479
 
453
480
 
454
- def get_shim_env(authorized_keys: List[str]) -> Dict[str, str]:
481
+ def get_shim_env(
482
+ authorized_keys: List[str],
483
+ base_path: Optional[PathLike] = None,
484
+ bin_path: Optional[PathLike] = None,
485
+ backend_shim_env: Optional[Dict[str, str]] = None,
486
+ ) -> Dict[str, str]:
455
487
  log_level = "6" # Trace
456
488
  envs = {
457
- "DSTACK_SHIM_HOME": DSTACK_WORKING_DIR,
489
+ "DSTACK_SHIM_HOME": get_dstack_working_dir(base_path),
458
490
  "DSTACK_SHIM_HTTP_PORT": str(DSTACK_SHIM_HTTP_PORT),
459
491
  "DSTACK_SHIM_LOG_LEVEL": log_level,
460
492
  "DSTACK_RUNNER_DOWNLOAD_URL": get_dstack_runner_download_url(),
461
- "DSTACK_RUNNER_BINARY_PATH": DSTACK_RUNNER_BINARY_PATH,
493
+ "DSTACK_RUNNER_BINARY_PATH": get_dstack_runner_binary_path(bin_path),
462
494
  "DSTACK_RUNNER_HTTP_PORT": str(DSTACK_RUNNER_HTTP_PORT),
463
495
  "DSTACK_RUNNER_SSH_PORT": str(DSTACK_RUNNER_SSH_PORT),
464
496
  "DSTACK_RUNNER_LOG_LEVEL": log_level,
465
497
  "DSTACK_PUBLIC_SSH_KEY": "\n".join(authorized_keys),
466
498
  }
499
+ if backend_shim_env is not None:
500
+ envs |= backend_shim_env
467
501
  return envs
468
502
 
469
503
 
470
504
  def get_shim_commands(
471
- authorized_keys: List[str], *, is_privileged: bool = False, pjrt_device: Optional[str] = None
505
+ authorized_keys: List[str],
506
+ *,
507
+ is_privileged: bool = False,
508
+ pjrt_device: Optional[str] = None,
509
+ base_path: Optional[PathLike] = None,
510
+ bin_path: Optional[PathLike] = None,
511
+ backend_shim_env: Optional[Dict[str, str]] = None,
472
512
  ) -> List[str]:
473
- commands = get_shim_pre_start_commands()
474
- for k, v in get_shim_env(authorized_keys).items():
513
+ commands = get_shim_pre_start_commands(
514
+ base_path=base_path,
515
+ bin_path=bin_path,
516
+ )
517
+ shim_env = get_shim_env(
518
+ authorized_keys=authorized_keys,
519
+ base_path=base_path,
520
+ bin_path=bin_path,
521
+ backend_shim_env=backend_shim_env,
522
+ )
523
+ for k, v in shim_env.items():
475
524
  commands += [f'export "{k}={v}"']
476
- commands += get_run_shim_script(is_privileged, pjrt_device)
525
+ commands += get_run_shim_script(
526
+ is_privileged=is_privileged,
527
+ pjrt_device=pjrt_device,
528
+ bin_path=bin_path,
529
+ )
477
530
  return commands
478
531
 
479
532
 
@@ -510,25 +563,33 @@ def get_dstack_shim_download_url() -> str:
510
563
  return f"https://{bucket}.s3.eu-west-1.amazonaws.com/{build}/binaries/dstack-shim-linux-amd64"
511
564
 
512
565
 
513
- def get_shim_pre_start_commands() -> List[str]:
566
+ def get_shim_pre_start_commands(
567
+ base_path: Optional[PathLike] = None,
568
+ bin_path: Optional[PathLike] = None,
569
+ ) -> List[str]:
514
570
  url = get_dstack_shim_download_url()
515
-
571
+ dstack_shim_binary_path = get_dstack_shim_binary_path(bin_path)
572
+ dstack_working_dir = get_dstack_working_dir(base_path)
516
573
  return [
517
574
  f"dlpath=$(sudo mktemp -t {DSTACK_SHIM_BINARY_NAME}.XXXXXXXXXX)",
518
575
  # -sS -- disable progress meter and warnings, but still show errors (unlike bare -s)
519
576
  f'sudo curl -sS --compressed --connect-timeout 60 --max-time 240 --retry 1 --output "$dlpath" "{url}"',
520
- f'sudo mv "$dlpath" {DSTACK_SHIM_BINARY_PATH}',
521
- f"sudo chmod +x {DSTACK_SHIM_BINARY_PATH}",
522
- f"sudo mkdir {DSTACK_WORKING_DIR} -p",
577
+ f'sudo mv "$dlpath" {dstack_shim_binary_path}',
578
+ f"sudo chmod +x {dstack_shim_binary_path}",
579
+ f"sudo mkdir {dstack_working_dir} -p",
523
580
  ]
524
581
 
525
582
 
526
- def get_run_shim_script(is_privileged: bool, pjrt_device: Optional[str]) -> List[str]:
583
+ def get_run_shim_script(
584
+ is_privileged: bool,
585
+ pjrt_device: Optional[str],
586
+ bin_path: Optional[PathLike] = None,
587
+ ) -> List[str]:
588
+ dstack_shim_binary_path = get_dstack_shim_binary_path(bin_path)
527
589
  privileged_flag = "--privileged" if is_privileged else ""
528
590
  pjrt_device_env = f"--pjrt-device={pjrt_device}" if pjrt_device else ""
529
-
530
591
  return [
531
- f"nohup {DSTACK_SHIM_BINARY_PATH} {privileged_flag} {pjrt_device_env} &",
592
+ f"nohup {dstack_shim_binary_path} {privileged_flag} {pjrt_device_env} &",
532
593
  ]
533
594
 
534
595
 
@@ -555,8 +616,10 @@ def get_gateway_user_data(authorized_key: str) -> str:
555
616
 
556
617
 
557
618
  def get_docker_commands(
558
- authorized_keys: List[str], fix_path_in_dot_profile: bool = True
559
- ) -> List[str]:
619
+ authorized_keys: list[str],
620
+ bin_path: Optional[PathLike] = None,
621
+ ) -> list[str]:
622
+ dstack_runner_binary_path = get_dstack_runner_binary_path(bin_path)
560
623
  authorized_keys_content = "\n".join(authorized_keys).strip()
561
624
  commands = [
562
625
  # save and unset ld.so variables
@@ -580,9 +643,6 @@ def get_docker_commands(
580
643
  "chmod 700 ~/.ssh",
581
644
  f"echo '{authorized_keys_content}' > ~/.ssh/authorized_keys",
582
645
  "chmod 600 ~/.ssh/authorized_keys",
583
- r"""if [ -f ~/.profile ]; then sed -ie '1s@^@export PATH="'"$PATH"':$PATH"\n\n@' ~/.profile; fi"""
584
- if fix_path_in_dot_profile
585
- else ":",
586
646
  # regenerate host keys
587
647
  "rm -rf /etc/ssh/ssh_host_*",
588
648
  "ssh-keygen -A > /dev/null",
@@ -600,7 +660,6 @@ def get_docker_commands(
600
660
  " -o PidFile=none"
601
661
  " -o PasswordAuthentication=no"
602
662
  " -o AllowTcpForwarding=yes"
603
- " -o PermitUserEnvironment=yes"
604
663
  " -o ClientAliveInterval=30"
605
664
  " -o ClientAliveCountMax=4"
606
665
  ),
@@ -611,10 +670,10 @@ def get_docker_commands(
611
670
 
612
671
  url = get_dstack_runner_download_url()
613
672
  commands += [
614
- f"curl --connect-timeout 60 --max-time 240 --retry 1 --output {DSTACK_RUNNER_BINARY_PATH} {url}",
615
- f"chmod +x {DSTACK_RUNNER_BINARY_PATH}",
673
+ f"curl --connect-timeout 60 --max-time 240 --retry 1 --output {dstack_runner_binary_path} {url}",
674
+ f"chmod +x {dstack_runner_binary_path}",
616
675
  (
617
- f"{DSTACK_RUNNER_BINARY_PATH}"
676
+ f"{dstack_runner_binary_path}"
618
677
  " --log-level 6"
619
678
  " start"
620
679
  f" --http-port {DSTACK_RUNNER_HTTP_PORT}"
@@ -692,9 +751,18 @@ def get_dstack_gateway_commands() -> List[str]:
692
751
  ]
693
752
 
694
753
 
695
- def merge_tags(tags: Dict[str, str], backend_tags: Optional[Dict[str, str]]) -> Dict[str, str]:
696
- res = tags.copy()
754
+ def merge_tags(
755
+ base_tags: Dict[str, str],
756
+ backend_tags: Optional[Dict[str, str]] = None,
757
+ resource_tags: Optional[Dict[str, str]] = None,
758
+ ) -> Dict[str, str]:
759
+ res = base_tags.copy()
760
+ # backend_tags have priority over resource_tags
761
+ # so that regular users do not override the tags set by admins
697
762
  if backend_tags is not None:
698
763
  for k, v in backend_tags.items():
699
764
  res.setdefault(k, v)
765
+ if resource_tags is not None:
766
+ for k, v in resource_tags.items():
767
+ res.setdefault(k, v)
700
768
  return res
@@ -13,7 +13,6 @@ from dstack._internal.core.backends.gcp.models import (
13
13
  GCPServiceAccountCreds,
14
14
  )
15
15
  from dstack._internal.core.errors import BackendAuthError
16
- from dstack._internal.core.models.common import is_core_model_instance
17
16
 
18
17
 
19
18
  def authenticate(creds: AnyGCPCreds, project_id: Optional[str] = None) -> Tuple[Credentials, str]:
@@ -30,7 +29,7 @@ def authenticate(creds: AnyGCPCreds, project_id: Optional[str] = None) -> Tuple[
30
29
 
31
30
 
32
31
  def get_credentials(creds: AnyGCPCreds) -> Tuple[Credentials, Optional[str]]:
33
- if is_core_model_instance(creds, GCPServiceAccountCreds):
32
+ if isinstance(creds, GCPServiceAccountCreds):
34
33
  try:
35
34
  service_account_info = json.loads(creds.data)
36
35
  credentials = service_account.Credentials.from_service_account_info(
@@ -211,8 +211,12 @@ class GCPCompute(
211
211
  "dstack_name": instance_config.instance_name,
212
212
  "dstack_user": instance_config.user.lower(),
213
213
  }
214
- labels = {k: v for k, v in labels.items() if gcp_resources.is_valid_label_value(v)}
215
- labels = merge_tags(tags=labels, backend_tags=self.config.tags)
214
+ labels = merge_tags(
215
+ base_tags=labels,
216
+ backend_tags=self.config.tags,
217
+ resource_tags=instance_config.tags,
218
+ )
219
+ labels = gcp_resources.filter_invalid_labels(labels)
216
220
  is_tpu = (
217
221
  _is_tpu(instance_offer.instance.resources.gpus[0].name)
218
222
  if instance_offer.instance.resources.gpus
@@ -292,11 +296,9 @@ class GCPCompute(
292
296
  gpus=instance_offer.instance.resources.gpus,
293
297
  ),
294
298
  spot=instance_offer.instance.resources.spot,
295
- user_data=get_user_data(
296
- authorized_keys,
297
- backend_specific_commands=_get_backend_specific_commands(
298
- instance_offer.instance.name
299
- ),
299
+ user_data=_get_user_data(
300
+ authorized_keys=authorized_keys,
301
+ instance_type_name=instance_offer.instance.name,
300
302
  ),
301
303
  authorized_keys=authorized_keys,
302
304
  labels=labels,
@@ -471,8 +473,12 @@ class GCPCompute(
471
473
  "dstack_project": configuration.project_name.lower(),
472
474
  "dstack_name": configuration.instance_name,
473
475
  }
474
- labels = {k: v for k, v in labels.items() if gcp_resources.is_valid_label_value(v)}
475
- labels = merge_tags(tags=labels, backend_tags=self.config.tags)
476
+ labels = merge_tags(
477
+ base_tags=labels,
478
+ backend_tags=self.config.tags,
479
+ resource_tags=configuration.tags,
480
+ )
481
+ labels = gcp_resources.filter_invalid_labels(labels)
476
482
 
477
483
  request = compute_v1.InsertInstanceRequest()
478
484
  request.zone = zone
@@ -573,8 +579,12 @@ class GCPCompute(
573
579
  "dstack_name": volume.name,
574
580
  "dstack_user": volume.user,
575
581
  }
576
- labels = {k: v for k, v in labels.items() if gcp_resources.is_valid_label_value(v)}
577
- labels = merge_tags(tags=labels, backend_tags=self.config.tags)
582
+ labels = merge_tags(
583
+ base_tags=labels,
584
+ backend_tags=self.config.tags,
585
+ resource_tags=volume.configuration.tags,
586
+ )
587
+ labels = gcp_resources.filter_invalid_labels(labels)
578
588
 
579
589
  disk = compute_v1.Disk()
580
590
  disk.name = disk_name
@@ -829,10 +839,14 @@ def _get_extra_subnets(
829
839
  ) -> List[Tuple[str, str]]:
830
840
  if config.extra_vpcs is None:
831
841
  return []
832
- if instance_type_name != "a3-megagpu-8g":
842
+ if instance_type_name == "a3-megagpu-8g":
843
+ subnets_num = 8
844
+ elif instance_type_name in ["a3-edgegpu-8g", "a3-highgpu-8g"]:
845
+ subnets_num = 4
846
+ else:
833
847
  return []
834
848
  extra_subnets = []
835
- for vpc_name in config.extra_vpcs:
849
+ for vpc_name in config.extra_vpcs[:subnets_num]:
836
850
  subnet = gcp_resources.get_vpc_subnet_or_error(
837
851
  subnetworks_client=subnetworks_client,
838
852
  vpc_project_id=config.vpc_project_id or config.project_id,
@@ -844,12 +858,14 @@ def _get_extra_subnets(
844
858
  vpc_name=vpc_name,
845
859
  )
846
860
  extra_subnets.append((vpc_resource_name, subnet))
847
- return extra_subnets[:8]
861
+ return extra_subnets
848
862
 
849
863
 
850
864
  def _get_image_id(instance_type_name: str, cuda: bool) -> str:
851
865
  if instance_type_name == "a3-megagpu-8g":
852
866
  image_name = "dstack-a3mega-5"
867
+ elif instance_type_name in ["a3-edgegpu-8g", "a3-highgpu-8g"]:
868
+ return "projects/cos-cloud/global/images/cos-105-17412-535-78"
853
869
  elif cuda:
854
870
  image_name = f"dstack-cuda-{version.base_image}"
855
871
  else:
@@ -862,9 +878,37 @@ def _get_gateway_image_id() -> str:
862
878
  return "projects/ubuntu-os-cloud/global/images/ubuntu-2204-jammy-v20230714"
863
879
 
864
880
 
881
+ def _get_user_data(authorized_keys: List[str], instance_type_name: str) -> str:
882
+ base_path = None
883
+ bin_path = None
884
+ backend_shim_env = None
885
+ if instance_type_name in ["a3-edgegpu-8g", "a3-highgpu-8g"]:
886
+ # In the COS image the / file system is not writable.
887
+ # /home and /var are writable but not executable.
888
+ # Only /etc is both writable and executable, so use it for shim/runner binaries.
889
+ # See: https://cloud.google.com/container-optimized-os/docs/concepts/disks-and-filesystem
890
+ base_path = bin_path = "/etc"
891
+ backend_shim_env = {
892
+ # In COS nvidia binaries are not installed on PATH by default.
893
+ # Set so that shim can run nvidia-smi.
894
+ "PATH": "/var/lib/nvidia/bin:$PATH",
895
+ }
896
+ return get_user_data(
897
+ authorized_keys=authorized_keys,
898
+ backend_specific_commands=_get_backend_specific_commands(
899
+ instance_type_name=instance_type_name,
900
+ ),
901
+ base_path=base_path,
902
+ bin_path=bin_path,
903
+ backend_shim_env=backend_shim_env,
904
+ )
905
+
906
+
865
907
  def _get_backend_specific_commands(instance_type_name: str) -> List[str]:
866
908
  if instance_type_name == "a3-megagpu-8g":
867
909
  return tcpx_features.get_backend_specific_commands_tcpxo()
910
+ if instance_type_name in ["a3-edgegpu-8g", "a3-highgpu-8g"]:
911
+ return tcpx_features.get_backend_specific_commands_tcpx()
868
912
  return []
869
913
 
870
914
 
@@ -24,7 +24,6 @@ from dstack._internal.core.errors import BackendAuthError, BackendError, ServerC
24
24
  from dstack._internal.core.models.backends.base import (
25
25
  BackendType,
26
26
  )
27
- from dstack._internal.core.models.common import is_core_model_instance
28
27
 
29
28
  LOCATIONS = [
30
29
  {
@@ -115,7 +114,7 @@ class GCPConfigurator(Configurator):
115
114
  BACKEND_CLASS = GCPBackend
116
115
 
117
116
  def validate_config(self, config: GCPBackendConfigWithCreds, default_creds_enabled: bool):
118
- if is_core_model_instance(config.creds, GCPDefaultCreds) and not default_creds_enabled:
117
+ if isinstance(config.creds, GCPDefaultCreds) and not default_creds_enabled:
119
118
  raise_invalid_credentials_error(fields=[["creds"]])
120
119
  try:
121
120
  credentials, _ = auth.authenticate(creds=config.creds, project_id=config.project_id)
@@ -123,7 +122,7 @@ class GCPConfigurator(Configurator):
123
122
  details = None
124
123
  if len(e.args) > 0:
125
124
  details = e.args[0]
126
- if is_core_model_instance(config.creds, GCPServiceAccountCreds):
125
+ if isinstance(config.creds, GCPServiceAccountCreds):
127
126
  raise_invalid_credentials_error(fields=[["creds", "data"]], details=details)
128
127
  else:
129
128
  raise_invalid_credentials_error(fields=[["creds"]], details=details)
@@ -32,3 +32,34 @@ def get_backend_specific_commands_tcpxo() -> List[str]:
32
32
  "--num_hops=2 --num_nics=8 --uid= --alsologtostderr"
33
33
  ),
34
34
  ]
35
+
36
+
37
+ def get_backend_specific_commands_tcpx() -> List[str]:
38
+ return [
39
+ "cos-extensions install gpu -- --version=latest",
40
+ "sudo mount --bind /var/lib/nvidia /var/lib/nvidia",
41
+ "sudo mount -o remount,exec /var/lib/nvidia",
42
+ (
43
+ "docker run "
44
+ "--detach "
45
+ "--pull=always "
46
+ "--name receive-datapath-manager "
47
+ "--privileged "
48
+ "--cap-add=NET_ADMIN --network=host "
49
+ "--volume /var/lib/nvidia/lib64:/usr/local/nvidia/lib64 "
50
+ "--device /dev/nvidia0:/dev/nvidia0 --device /dev/nvidia1:/dev/nvidia1 "
51
+ "--device /dev/nvidia2:/dev/nvidia2 --device /dev/nvidia3:/dev/nvidia3 "
52
+ "--device /dev/nvidia4:/dev/nvidia4 --device /dev/nvidia5:/dev/nvidia5 "
53
+ "--device /dev/nvidia6:/dev/nvidia6 --device /dev/nvidia7:/dev/nvidia7 "
54
+ "--device /dev/nvidia-uvm:/dev/nvidia-uvm --device /dev/nvidiactl:/dev/nvidiactl "
55
+ "--env LD_LIBRARY_PATH=/usr/local/nvidia/lib64 "
56
+ "--volume /run/tcpx:/run/tcpx "
57
+ "--entrypoint /tcpgpudmarxd/build/app/tcpgpudmarxd "
58
+ "us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpx/tcpgpudmarxd "
59
+ '--gpu_nic_preset a3vm --gpu_shmem_type fd --uds_path "/run/tcpx" --setup_param "--verbose 128 2 0"'
60
+ ),
61
+ "sudo iptables -I INPUT -p tcp -m tcp -j ACCEPT",
62
+ "docker run --rm -v /var/lib:/var/lib us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpx/nccl-plugin-gpudirecttcpx install --install-nccl",
63
+ "sudo mount --bind /var/lib/tcpx /var/lib/tcpx",
64
+ "sudo mount -o remount,exec /var/lib/tcpx",
65
+ ]
@@ -332,6 +332,16 @@ def get_accelerators(
332
332
  return [accelerator_config]
333
333
 
334
334
 
335
+ def filter_invalid_labels(labels: Dict[str, str]) -> Dict[str, str]:
336
+ filtered_labels = {}
337
+ for k, v in labels.items():
338
+ if not _is_valid_label(k, v):
339
+ logger.warning("Skipping invalid label '%s: %s'", k, v)
340
+ continue
341
+ filtered_labels[k] = v
342
+ return filtered_labels
343
+
344
+
335
345
  def validate_labels(labels: Dict[str, str]):
336
346
  for k, v in labels.items():
337
347
  if not _is_valid_label(k, v):
@@ -86,7 +86,11 @@ class NebiusCompute(
86
86
 
87
87
  @cached_property
88
88
  def _region_to_project_id(self) -> dict[str, str]:
89
- return resources.get_region_to_project_id_map(self._sdk)
89
+ return resources.get_region_to_project_id_map(
90
+ self._sdk,
91
+ configured_regions=self.config.regions,
92
+ configured_project_ids=self.config.projects,
93
+ )
90
94
 
91
95
  def _get_subnet_id(self, region: str) -> str:
92
96
  if region not in self._subnet_id_cache:
@@ -100,7 +104,7 @@ class NebiusCompute(
100
104
  ) -> List[InstanceOfferWithAvailability]:
101
105
  offers = get_catalog_offers(
102
106
  backend=BackendType.NEBIUS,
103
- locations=self.config.regions or list(self._region_to_project_id),
107
+ locations=list(self._region_to_project_id),
104
108
  requirements=requirements,
105
109
  extra_filter=_supported_instances,
106
110
  configurable_disk_size=CONFIGURABLE_DISK_SIZE,
@@ -29,21 +29,15 @@ class NebiusConfigurator(Configurator):
29
29
  assert isinstance(config.creds, NebiusServiceAccountCreds)
30
30
  try:
31
31
  sdk = resources.make_sdk(config.creds)
32
- available_regions = set(resources.get_region_to_project_id_map(sdk))
32
+ # check that it's possible to build the projects map with configured settings
33
+ resources.get_region_to_project_id_map(
34
+ sdk, configured_regions=config.regions, configured_project_ids=config.projects
35
+ )
33
36
  except (ValueError, RequestError) as e:
34
37
  raise_invalid_credentials_error(
35
38
  fields=[["creds"]],
36
39
  details=str(e),
37
40
  )
38
- if invalid_regions := set(config.regions or []) - available_regions:
39
- raise_invalid_credentials_error(
40
- fields=[["regions"]],
41
- details=(
42
- f"Configured regions {invalid_regions} do not exist in this Nebius tenancy."
43
- " Omit `regions` to use all regions or select some of the available regions:"
44
- f" {available_regions}"
45
- ),
46
- )
47
41
 
48
42
  def create_backend(
49
43
  self, project_name: str, config: NebiusBackendConfigWithCreds
@@ -5,6 +5,8 @@ from pydantic import Field, root_validator
5
5
  from dstack._internal.core.backends.base.models import fill_data
6
6
  from dstack._internal.core.models.common import CoreModel
7
7
 
8
+ DEFAULT_PROJECT_NAME_PREFIX = "default-project"
9
+
8
10
 
9
11
  class NebiusServiceAccountCreds(CoreModel):
10
12
  type: Annotated[Literal["service_account"], Field(description="The type of credentials")] = (
@@ -70,9 +72,20 @@ class NebiusBackendConfig(CoreModel):
70
72
  Literal["nebius"],
71
73
  Field(description="The type of backend"),
72
74
  ] = "nebius"
75
+ projects: Annotated[
76
+ Optional[list[str]],
77
+ Field(
78
+ description=(
79
+ "The list of allowed Nebius project IDs."
80
+ " Omit to use the default project in each region."
81
+ " The project is considered default if it is the only project in the region"
82
+ f" or if its name starts with `{DEFAULT_PROJECT_NAME_PREFIX}`"
83
+ )
84
+ ),
85
+ ] = None
73
86
  regions: Annotated[
74
87
  Optional[list[str]],
75
- Field(description="The list of Nebius regions. Omit to use all regions"),
88
+ Field(description="The list of allowed Nebius regions. Omit to allow all regions"),
76
89
  ] = None
77
90
 
78
91